diff --git "a/checkpoint-51604/trainer_state.json" "b/checkpoint-51604/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-51604/trainer_state.json" @@ -0,0 +1,361402 @@ +{ + "best_global_step": 44232, + "best_metric": 25.152142349633934, + "best_model_checkpoint": "./whisper-medium-bfloat16-sada-v1/./whisper-medium-bfloat16-sada-v1/whisper-medium-bfloat16-sada-v1/checkpoints/checkpoint-44232", + "epoch": 3.0999186109603905, + "eval_steps": 3686, + "global_step": 51604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.356483993488877e-05, + "grad_norm": 26.447826385498047, + "learning_rate": 0.0, + "loss": 3.3405, + "step": 1 + }, + { + "epoch": 2.712967986977754e-05, + "grad_norm": 29.817049026489258, + "learning_rate": 1.3333333333333334e-08, + "loss": 3.1412, + "step": 2 + }, + { + "epoch": 4.0694519804666304e-05, + "grad_norm": 24.67610740661621, + "learning_rate": 2.6666666666666667e-08, + "loss": 2.9893, + "step": 3 + }, + { + "epoch": 5.425935973955508e-05, + "grad_norm": 16.327356338500977, + "learning_rate": 4e-08, + "loss": 2.2168, + "step": 4 + }, + { + "epoch": 6.782419967444384e-05, + "grad_norm": 19.585880279541016, + "learning_rate": 5.3333333333333334e-08, + "loss": 2.3774, + "step": 5 + }, + { + "epoch": 8.138903960933261e-05, + "grad_norm": 808.9290771484375, + "learning_rate": 6.666666666666668e-08, + "loss": 2.579, + "step": 6 + }, + { + "epoch": 9.495387954422137e-05, + "grad_norm": 24.42464828491211, + "learning_rate": 8e-08, + "loss": 2.9131, + "step": 7 + }, + { + "epoch": 0.00010851871947911015, + "grad_norm": 21.510984420776367, + "learning_rate": 9.333333333333335e-08, + "loss": 2.6682, + "step": 8 + }, + { + "epoch": 0.00012208355941399892, + "grad_norm": 29.152917861938477, + "learning_rate": 1.0666666666666667e-07, + "loss": 3.1137, + "step": 9 + }, + { + "epoch": 0.00013564839934888768, + "grad_norm": 22.31827163696289, + "learning_rate": 1.2000000000000002e-07, + "loss": 2.8541, + "step": 10 + }, + { + "epoch": 0.00014921323928377645, + "grad_norm": 24.468996047973633, + "learning_rate": 1.3333333333333336e-07, + "loss": 3.0269, + "step": 11 + }, + { + "epoch": 0.00016277807921866522, + "grad_norm": 19.033323287963867, + "learning_rate": 1.4666666666666668e-07, + "loss": 2.3867, + "step": 12 + }, + { + "epoch": 0.00017634291915355398, + "grad_norm": 12.42765998840332, + "learning_rate": 1.6e-07, + "loss": 1.6059, + "step": 13 + }, + { + "epoch": 0.00018990775908844275, + "grad_norm": 29.568445205688477, + "learning_rate": 1.7333333333333335e-07, + "loss": 3.4309, + "step": 14 + }, + { + "epoch": 0.00020347259902333151, + "grad_norm": 26.244138717651367, + "learning_rate": 1.866666666666667e-07, + "loss": 3.0973, + "step": 15 + }, + { + "epoch": 0.0002170374389582203, + "grad_norm": 20.88216781616211, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.1823, + "step": 16 + }, + { + "epoch": 0.00023060227889310907, + "grad_norm": 34.993282318115234, + "learning_rate": 2.1333333333333334e-07, + "loss": 4.1343, + "step": 17 + }, + { + "epoch": 0.00024416711882799784, + "grad_norm": 23.788572311401367, + "learning_rate": 2.266666666666667e-07, + "loss": 2.8553, + "step": 18 + }, + { + "epoch": 0.0002577319587628866, + "grad_norm": 29.318506240844727, + "learning_rate": 2.4000000000000003e-07, + "loss": 3.1856, + "step": 19 + }, + { + "epoch": 0.00027129679869777537, + "grad_norm": 19.160127639770508, + "learning_rate": 2.533333333333333e-07, + "loss": 2.4151, + "step": 20 + }, + { + "epoch": 0.00028486163863266414, + "grad_norm": 19.39948081970215, + "learning_rate": 2.666666666666667e-07, + "loss": 2.4019, + "step": 21 + }, + { + "epoch": 0.0002984264785675529, + "grad_norm": 36.512088775634766, + "learning_rate": 2.8e-07, + "loss": 2.331, + "step": 22 + }, + { + "epoch": 0.00031199131850244167, + "grad_norm": 24.091764450073242, + "learning_rate": 2.9333333333333337e-07, + "loss": 2.7035, + "step": 23 + }, + { + "epoch": 0.00032555615843733043, + "grad_norm": 20.815898895263672, + "learning_rate": 3.0666666666666666e-07, + "loss": 2.797, + "step": 24 + }, + { + "epoch": 0.0003391209983722192, + "grad_norm": 20.771465301513672, + "learning_rate": 3.2e-07, + "loss": 2.5728, + "step": 25 + }, + { + "epoch": 0.00035268583830710796, + "grad_norm": 17.95117950439453, + "learning_rate": 3.3333333333333335e-07, + "loss": 2.3432, + "step": 26 + }, + { + "epoch": 0.00036625067824199673, + "grad_norm": 20.376859664916992, + "learning_rate": 3.466666666666667e-07, + "loss": 2.1108, + "step": 27 + }, + { + "epoch": 0.0003798155181768855, + "grad_norm": 26.862537384033203, + "learning_rate": 3.6e-07, + "loss": 3.5235, + "step": 28 + }, + { + "epoch": 0.00039338035811177426, + "grad_norm": 25.942392349243164, + "learning_rate": 3.733333333333334e-07, + "loss": 3.2127, + "step": 29 + }, + { + "epoch": 0.00040694519804666303, + "grad_norm": 26.156614303588867, + "learning_rate": 3.8666666666666674e-07, + "loss": 2.7616, + "step": 30 + }, + { + "epoch": 0.0004205100379815518, + "grad_norm": 19.650487899780273, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.2438, + "step": 31 + }, + { + "epoch": 0.0004340748779164406, + "grad_norm": 20.556169509887695, + "learning_rate": 4.133333333333334e-07, + "loss": 2.9699, + "step": 32 + }, + { + "epoch": 0.0004476397178513294, + "grad_norm": 13.62496280670166, + "learning_rate": 4.266666666666667e-07, + "loss": 1.8121, + "step": 33 + }, + { + "epoch": 0.00046120455778621814, + "grad_norm": 21.830184936523438, + "learning_rate": 4.4e-07, + "loss": 2.0999, + "step": 34 + }, + { + "epoch": 0.0004747693977211069, + "grad_norm": 16.979507446289062, + "learning_rate": 4.533333333333334e-07, + "loss": 2.1961, + "step": 35 + }, + { + "epoch": 0.0004883342376559957, + "grad_norm": 14.923199653625488, + "learning_rate": 4.666666666666667e-07, + "loss": 1.9703, + "step": 36 + }, + { + "epoch": 0.0005018990775908844, + "grad_norm": 20.191953659057617, + "learning_rate": 4.800000000000001e-07, + "loss": 2.5811, + "step": 37 + }, + { + "epoch": 0.0005154639175257732, + "grad_norm": 18.76801872253418, + "learning_rate": 4.933333333333334e-07, + "loss": 2.0224, + "step": 38 + }, + { + "epoch": 0.000529028757460662, + "grad_norm": 23.03506851196289, + "learning_rate": 5.066666666666667e-07, + "loss": 2.9088, + "step": 39 + }, + { + "epoch": 0.0005425935973955507, + "grad_norm": 14.706356048583984, + "learning_rate": 5.2e-07, + "loss": 2.2827, + "step": 40 + }, + { + "epoch": 0.0005561584373304395, + "grad_norm": 22.994956970214844, + "learning_rate": 5.333333333333335e-07, + "loss": 2.9146, + "step": 41 + }, + { + "epoch": 0.0005697232772653283, + "grad_norm": 13.028777122497559, + "learning_rate": 5.466666666666667e-07, + "loss": 1.8232, + "step": 42 + }, + { + "epoch": 0.000583288117200217, + "grad_norm": 25.49199676513672, + "learning_rate": 5.6e-07, + "loss": 2.931, + "step": 43 + }, + { + "epoch": 0.0005968529571351058, + "grad_norm": 12.979634284973145, + "learning_rate": 5.733333333333334e-07, + "loss": 2.0686, + "step": 44 + }, + { + "epoch": 0.0006104177970699946, + "grad_norm": 25.617799758911133, + "learning_rate": 5.866666666666667e-07, + "loss": 2.8559, + "step": 45 + }, + { + "epoch": 0.0006239826370048833, + "grad_norm": 46.10478591918945, + "learning_rate": 6.000000000000001e-07, + "loss": 2.8377, + "step": 46 + }, + { + "epoch": 0.0006375474769397721, + "grad_norm": 24.66064453125, + "learning_rate": 6.133333333333333e-07, + "loss": 2.9532, + "step": 47 + }, + { + "epoch": 0.0006511123168746609, + "grad_norm": 16.62615203857422, + "learning_rate": 6.266666666666667e-07, + "loss": 2.3588, + "step": 48 + }, + { + "epoch": 0.0006646771568095496, + "grad_norm": 16.995166778564453, + "learning_rate": 6.4e-07, + "loss": 2.1482, + "step": 49 + }, + { + "epoch": 0.0006782419967444384, + "grad_norm": 16.163898468017578, + "learning_rate": 6.533333333333334e-07, + "loss": 2.2309, + "step": 50 + }, + { + "epoch": 0.0006918068366793272, + "grad_norm": 16.308940887451172, + "learning_rate": 6.666666666666667e-07, + "loss": 2.1009, + "step": 51 + }, + { + "epoch": 0.0007053716766142159, + "grad_norm": 19.37238121032715, + "learning_rate": 6.800000000000001e-07, + "loss": 2.7178, + "step": 52 + }, + { + "epoch": 0.0007189365165491047, + "grad_norm": 13.650136947631836, + "learning_rate": 6.933333333333334e-07, + "loss": 1.8563, + "step": 53 + }, + { + "epoch": 0.0007325013564839935, + "grad_norm": 16.90062713623047, + "learning_rate": 7.066666666666667e-07, + "loss": 2.2656, + "step": 54 + }, + { + "epoch": 0.0007460661964188822, + "grad_norm": 13.157358169555664, + "learning_rate": 7.2e-07, + "loss": 1.9051, + "step": 55 + }, + { + "epoch": 0.000759631036353771, + "grad_norm": 23.601917266845703, + "learning_rate": 7.333333333333334e-07, + "loss": 3.0102, + "step": 56 + }, + { + "epoch": 0.0007731958762886598, + "grad_norm": 17.35457992553711, + "learning_rate": 7.466666666666668e-07, + "loss": 2.308, + "step": 57 + }, + { + "epoch": 0.0007867607162235485, + "grad_norm": 14.354334831237793, + "learning_rate": 7.6e-07, + "loss": 1.7395, + "step": 58 + }, + { + "epoch": 0.0008003255561584373, + "grad_norm": 9.795291900634766, + "learning_rate": 7.733333333333335e-07, + "loss": 1.587, + "step": 59 + }, + { + "epoch": 0.0008138903960933261, + "grad_norm": 24.102375030517578, + "learning_rate": 7.866666666666667e-07, + "loss": 3.0848, + "step": 60 + }, + { + "epoch": 0.0008274552360282148, + "grad_norm": 20.026756286621094, + "learning_rate": 8.000000000000001e-07, + "loss": 2.2748, + "step": 61 + }, + { + "epoch": 0.0008410200759631036, + "grad_norm": 16.940120697021484, + "learning_rate": 8.133333333333333e-07, + "loss": 2.306, + "step": 62 + }, + { + "epoch": 0.0008545849158979924, + "grad_norm": 21.10868263244629, + "learning_rate": 8.266666666666668e-07, + "loss": 2.5987, + "step": 63 + }, + { + "epoch": 0.0008681497558328812, + "grad_norm": 19.671573638916016, + "learning_rate": 8.400000000000001e-07, + "loss": 2.4477, + "step": 64 + }, + { + "epoch": 0.00088171459576777, + "grad_norm": 21.2540225982666, + "learning_rate": 8.533333333333334e-07, + "loss": 2.8252, + "step": 65 + }, + { + "epoch": 0.0008952794357026588, + "grad_norm": 17.3441162109375, + "learning_rate": 8.666666666666668e-07, + "loss": 2.3916, + "step": 66 + }, + { + "epoch": 0.0009088442756375475, + "grad_norm": 19.243276596069336, + "learning_rate": 8.8e-07, + "loss": 2.4754, + "step": 67 + }, + { + "epoch": 0.0009224091155724363, + "grad_norm": 14.628678321838379, + "learning_rate": 8.933333333333334e-07, + "loss": 2.1844, + "step": 68 + }, + { + "epoch": 0.0009359739555073251, + "grad_norm": 14.639700889587402, + "learning_rate": 9.066666666666668e-07, + "loss": 1.9865, + "step": 69 + }, + { + "epoch": 0.0009495387954422138, + "grad_norm": 14.306645393371582, + "learning_rate": 9.200000000000001e-07, + "loss": 1.9575, + "step": 70 + }, + { + "epoch": 0.0009631036353771026, + "grad_norm": 16.0073299407959, + "learning_rate": 9.333333333333334e-07, + "loss": 2.2496, + "step": 71 + }, + { + "epoch": 0.0009766684753119914, + "grad_norm": 13.942242622375488, + "learning_rate": 9.466666666666667e-07, + "loss": 1.9978, + "step": 72 + }, + { + "epoch": 0.0009902333152468801, + "grad_norm": 14.026994705200195, + "learning_rate": 9.600000000000001e-07, + "loss": 1.9904, + "step": 73 + }, + { + "epoch": 0.0010037981551817689, + "grad_norm": 23.08137321472168, + "learning_rate": 9.733333333333333e-07, + "loss": 2.7297, + "step": 74 + }, + { + "epoch": 0.0010173629951166576, + "grad_norm": 13.018153190612793, + "learning_rate": 9.866666666666668e-07, + "loss": 1.8039, + "step": 75 + }, + { + "epoch": 0.0010309278350515464, + "grad_norm": 19.76833152770996, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.4292, + "step": 76 + }, + { + "epoch": 0.0010444926749864352, + "grad_norm": 17.59891128540039, + "learning_rate": 1.0133333333333333e-06, + "loss": 2.4173, + "step": 77 + }, + { + "epoch": 0.001058057514921324, + "grad_norm": 14.695662498474121, + "learning_rate": 1.0266666666666669e-06, + "loss": 2.0049, + "step": 78 + }, + { + "epoch": 0.0010716223548562127, + "grad_norm": 23.167015075683594, + "learning_rate": 1.04e-06, + "loss": 2.3444, + "step": 79 + }, + { + "epoch": 0.0010851871947911015, + "grad_norm": 13.042938232421875, + "learning_rate": 1.0533333333333333e-06, + "loss": 1.7056, + "step": 80 + }, + { + "epoch": 0.0010987520347259902, + "grad_norm": 16.37158203125, + "learning_rate": 1.066666666666667e-06, + "loss": 2.267, + "step": 81 + }, + { + "epoch": 0.001112316874660879, + "grad_norm": 12.750184059143066, + "learning_rate": 1.08e-06, + "loss": 1.8805, + "step": 82 + }, + { + "epoch": 0.0011258817145957678, + "grad_norm": 14.313203811645508, + "learning_rate": 1.0933333333333334e-06, + "loss": 1.8923, + "step": 83 + }, + { + "epoch": 0.0011394465545306565, + "grad_norm": 13.31512451171875, + "learning_rate": 1.1066666666666667e-06, + "loss": 1.9395, + "step": 84 + }, + { + "epoch": 0.0011530113944655453, + "grad_norm": 10.505036354064941, + "learning_rate": 1.12e-06, + "loss": 1.5185, + "step": 85 + }, + { + "epoch": 0.001166576234400434, + "grad_norm": 13.456559181213379, + "learning_rate": 1.1333333333333334e-06, + "loss": 1.7799, + "step": 86 + }, + { + "epoch": 0.0011801410743353228, + "grad_norm": 10.710407257080078, + "learning_rate": 1.1466666666666668e-06, + "loss": 1.4297, + "step": 87 + }, + { + "epoch": 0.0011937059142702116, + "grad_norm": 13.048550605773926, + "learning_rate": 1.1600000000000001e-06, + "loss": 1.6393, + "step": 88 + }, + { + "epoch": 0.0012072707542051004, + "grad_norm": 17.649770736694336, + "learning_rate": 1.1733333333333335e-06, + "loss": 2.2664, + "step": 89 + }, + { + "epoch": 0.0012208355941399891, + "grad_norm": 13.191062927246094, + "learning_rate": 1.1866666666666668e-06, + "loss": 1.6924, + "step": 90 + }, + { + "epoch": 0.001234400434074878, + "grad_norm": 15.854312896728516, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.1351, + "step": 91 + }, + { + "epoch": 0.0012479652740097667, + "grad_norm": 13.888899803161621, + "learning_rate": 1.2133333333333335e-06, + "loss": 1.9023, + "step": 92 + }, + { + "epoch": 0.0012615301139446554, + "grad_norm": 14.812742233276367, + "learning_rate": 1.2266666666666666e-06, + "loss": 1.8975, + "step": 93 + }, + { + "epoch": 0.0012750949538795442, + "grad_norm": 13.623923301696777, + "learning_rate": 1.2400000000000002e-06, + "loss": 1.9463, + "step": 94 + }, + { + "epoch": 0.001288659793814433, + "grad_norm": 16.09325408935547, + "learning_rate": 1.2533333333333333e-06, + "loss": 1.7501, + "step": 95 + }, + { + "epoch": 0.0013022246337493217, + "grad_norm": 12.794758796691895, + "learning_rate": 1.2666666666666669e-06, + "loss": 1.6783, + "step": 96 + }, + { + "epoch": 0.0013157894736842105, + "grad_norm": 15.762393951416016, + "learning_rate": 1.28e-06, + "loss": 2.0049, + "step": 97 + }, + { + "epoch": 0.0013293543136190993, + "grad_norm": 11.44944953918457, + "learning_rate": 1.2933333333333334e-06, + "loss": 1.5896, + "step": 98 + }, + { + "epoch": 0.001342919153553988, + "grad_norm": 13.99467945098877, + "learning_rate": 1.3066666666666667e-06, + "loss": 1.7135, + "step": 99 + }, + { + "epoch": 0.0013564839934888768, + "grad_norm": 17.947084426879883, + "learning_rate": 1.32e-06, + "loss": 2.1404, + "step": 100 + }, + { + "epoch": 0.0013700488334237656, + "grad_norm": 14.20557975769043, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.875, + "step": 101 + }, + { + "epoch": 0.0013836136733586543, + "grad_norm": 12.921945571899414, + "learning_rate": 1.3466666666666668e-06, + "loss": 1.5374, + "step": 102 + }, + { + "epoch": 0.001397178513293543, + "grad_norm": 13.990961074829102, + "learning_rate": 1.3600000000000001e-06, + "loss": 1.8231, + "step": 103 + }, + { + "epoch": 0.0014107433532284319, + "grad_norm": 10.778789520263672, + "learning_rate": 1.3733333333333335e-06, + "loss": 1.4406, + "step": 104 + }, + { + "epoch": 0.0014243081931633206, + "grad_norm": 11.711599349975586, + "learning_rate": 1.3866666666666668e-06, + "loss": 1.4991, + "step": 105 + }, + { + "epoch": 0.0014378730330982094, + "grad_norm": 14.135971069335938, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.7823, + "step": 106 + }, + { + "epoch": 0.0014514378730330982, + "grad_norm": 15.783368110656738, + "learning_rate": 1.4133333333333335e-06, + "loss": 1.9834, + "step": 107 + }, + { + "epoch": 0.001465002712967987, + "grad_norm": 11.921911239624023, + "learning_rate": 1.4266666666666668e-06, + "loss": 1.5803, + "step": 108 + }, + { + "epoch": 0.0014785675529028757, + "grad_norm": 9.134888648986816, + "learning_rate": 1.44e-06, + "loss": 1.1773, + "step": 109 + }, + { + "epoch": 0.0014921323928377645, + "grad_norm": 8.798376083374023, + "learning_rate": 1.4533333333333335e-06, + "loss": 1.1805, + "step": 110 + }, + { + "epoch": 0.0015056972327726532, + "grad_norm": 11.650533676147461, + "learning_rate": 1.4666666666666669e-06, + "loss": 1.5366, + "step": 111 + }, + { + "epoch": 0.001519262072707542, + "grad_norm": 11.262343406677246, + "learning_rate": 1.48e-06, + "loss": 1.5633, + "step": 112 + }, + { + "epoch": 0.0015328269126424307, + "grad_norm": 12.15826416015625, + "learning_rate": 1.4933333333333336e-06, + "loss": 1.5355, + "step": 113 + }, + { + "epoch": 0.0015463917525773195, + "grad_norm": 10.461808204650879, + "learning_rate": 1.506666666666667e-06, + "loss": 1.2396, + "step": 114 + }, + { + "epoch": 0.0015599565925122083, + "grad_norm": 13.251955032348633, + "learning_rate": 1.52e-06, + "loss": 1.8574, + "step": 115 + }, + { + "epoch": 0.001573521432447097, + "grad_norm": 10.978288650512695, + "learning_rate": 1.5333333333333334e-06, + "loss": 1.5202, + "step": 116 + }, + { + "epoch": 0.0015870862723819858, + "grad_norm": 14.320022583007812, + "learning_rate": 1.546666666666667e-06, + "loss": 1.7818, + "step": 117 + }, + { + "epoch": 0.0016006511123168746, + "grad_norm": 12.169622421264648, + "learning_rate": 1.56e-06, + "loss": 1.5435, + "step": 118 + }, + { + "epoch": 0.0016142159522517633, + "grad_norm": 14.61320686340332, + "learning_rate": 1.5733333333333334e-06, + "loss": 1.8054, + "step": 119 + }, + { + "epoch": 0.0016277807921866521, + "grad_norm": 16.577495574951172, + "learning_rate": 1.586666666666667e-06, + "loss": 1.9222, + "step": 120 + }, + { + "epoch": 0.0016413456321215409, + "grad_norm": 15.620399475097656, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.7704, + "step": 121 + }, + { + "epoch": 0.0016549104720564296, + "grad_norm": 12.883560180664062, + "learning_rate": 1.6133333333333335e-06, + "loss": 1.6625, + "step": 122 + }, + { + "epoch": 0.0016684753119913184, + "grad_norm": 11.239139556884766, + "learning_rate": 1.6266666666666666e-06, + "loss": 1.4829, + "step": 123 + }, + { + "epoch": 0.0016820401519262072, + "grad_norm": 10.646906852722168, + "learning_rate": 1.6400000000000002e-06, + "loss": 1.2867, + "step": 124 + }, + { + "epoch": 0.001695604991861096, + "grad_norm": 16.21167755126953, + "learning_rate": 1.6533333333333335e-06, + "loss": 2.0305, + "step": 125 + }, + { + "epoch": 0.0017091698317959847, + "grad_norm": 14.391514778137207, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.705, + "step": 126 + }, + { + "epoch": 0.0017227346717308735, + "grad_norm": 14.01416301727295, + "learning_rate": 1.6800000000000002e-06, + "loss": 1.8161, + "step": 127 + }, + { + "epoch": 0.0017362995116657625, + "grad_norm": 10.499018669128418, + "learning_rate": 1.6933333333333336e-06, + "loss": 1.3386, + "step": 128 + }, + { + "epoch": 0.0017498643516006512, + "grad_norm": 11.33741283416748, + "learning_rate": 1.7066666666666667e-06, + "loss": 1.5248, + "step": 129 + }, + { + "epoch": 0.00176342919153554, + "grad_norm": 10.196919441223145, + "learning_rate": 1.72e-06, + "loss": 1.4132, + "step": 130 + }, + { + "epoch": 0.0017769940314704287, + "grad_norm": 12.670902252197266, + "learning_rate": 1.7333333333333336e-06, + "loss": 1.7213, + "step": 131 + }, + { + "epoch": 0.0017905588714053175, + "grad_norm": 10.080798149108887, + "learning_rate": 1.7466666666666667e-06, + "loss": 1.2467, + "step": 132 + }, + { + "epoch": 0.0018041237113402063, + "grad_norm": 12.752422332763672, + "learning_rate": 1.76e-06, + "loss": 1.6255, + "step": 133 + }, + { + "epoch": 0.001817688551275095, + "grad_norm": 13.260488510131836, + "learning_rate": 1.7733333333333336e-06, + "loss": 1.57, + "step": 134 + }, + { + "epoch": 0.0018312533912099838, + "grad_norm": 12.308530807495117, + "learning_rate": 1.7866666666666668e-06, + "loss": 1.5528, + "step": 135 + }, + { + "epoch": 0.0018448182311448726, + "grad_norm": 12.984086990356445, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.7313, + "step": 136 + }, + { + "epoch": 0.0018583830710797613, + "grad_norm": 11.399560928344727, + "learning_rate": 1.8133333333333337e-06, + "loss": 1.4549, + "step": 137 + }, + { + "epoch": 0.0018719479110146501, + "grad_norm": 12.361128807067871, + "learning_rate": 1.8266666666666668e-06, + "loss": 1.4399, + "step": 138 + }, + { + "epoch": 0.0018855127509495389, + "grad_norm": 11.601750373840332, + "learning_rate": 1.8400000000000002e-06, + "loss": 1.4653, + "step": 139 + }, + { + "epoch": 0.0018990775908844276, + "grad_norm": 10.362783432006836, + "learning_rate": 1.8533333333333333e-06, + "loss": 1.3435, + "step": 140 + }, + { + "epoch": 0.0019126424308193164, + "grad_norm": 18.056936264038086, + "learning_rate": 1.8666666666666669e-06, + "loss": 2.1454, + "step": 141 + }, + { + "epoch": 0.0019262072707542052, + "grad_norm": 10.150920867919922, + "learning_rate": 1.8800000000000002e-06, + "loss": 1.3057, + "step": 142 + }, + { + "epoch": 0.001939772110689094, + "grad_norm": 13.356412887573242, + "learning_rate": 1.8933333333333333e-06, + "loss": 1.6168, + "step": 143 + }, + { + "epoch": 0.0019533369506239827, + "grad_norm": 12.407734870910645, + "learning_rate": 1.906666666666667e-06, + "loss": 1.2952, + "step": 144 + }, + { + "epoch": 0.0019669017905588715, + "grad_norm": 15.623086929321289, + "learning_rate": 1.9200000000000003e-06, + "loss": 1.6895, + "step": 145 + }, + { + "epoch": 0.0019804666304937602, + "grad_norm": 15.247904777526855, + "learning_rate": 1.9333333333333336e-06, + "loss": 1.7699, + "step": 146 + }, + { + "epoch": 0.001994031470428649, + "grad_norm": 15.589674949645996, + "learning_rate": 1.9466666666666665e-06, + "loss": 1.8888, + "step": 147 + }, + { + "epoch": 0.0020075963103635378, + "grad_norm": 16.04449462890625, + "learning_rate": 1.9600000000000003e-06, + "loss": 1.77, + "step": 148 + }, + { + "epoch": 0.0020211611502984265, + "grad_norm": 14.19361686706543, + "learning_rate": 1.9733333333333336e-06, + "loss": 1.5778, + "step": 149 + }, + { + "epoch": 0.0020347259902333153, + "grad_norm": 12.958545684814453, + "learning_rate": 1.9866666666666666e-06, + "loss": 1.5186, + "step": 150 + }, + { + "epoch": 0.002048290830168204, + "grad_norm": 15.93224811553955, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.6708, + "step": 151 + }, + { + "epoch": 0.002061855670103093, + "grad_norm": 13.619611740112305, + "learning_rate": 2.0133333333333337e-06, + "loss": 1.7241, + "step": 152 + }, + { + "epoch": 0.0020754205100379816, + "grad_norm": 13.133296012878418, + "learning_rate": 2.0266666666666666e-06, + "loss": 1.525, + "step": 153 + }, + { + "epoch": 0.0020889853499728704, + "grad_norm": 17.680557250976562, + "learning_rate": 2.04e-06, + "loss": 1.9817, + "step": 154 + }, + { + "epoch": 0.002102550189907759, + "grad_norm": 13.087940216064453, + "learning_rate": 2.0533333333333337e-06, + "loss": 1.6399, + "step": 155 + }, + { + "epoch": 0.002116115029842648, + "grad_norm": 15.433588027954102, + "learning_rate": 2.0666666666666666e-06, + "loss": 2.1132, + "step": 156 + }, + { + "epoch": 0.0021296798697775367, + "grad_norm": 14.060660362243652, + "learning_rate": 2.08e-06, + "loss": 1.4681, + "step": 157 + }, + { + "epoch": 0.0021432447097124254, + "grad_norm": 18.083341598510742, + "learning_rate": 2.0933333333333338e-06, + "loss": 1.9215, + "step": 158 + }, + { + "epoch": 0.002156809549647314, + "grad_norm": 9.197656631469727, + "learning_rate": 2.1066666666666667e-06, + "loss": 1.1907, + "step": 159 + }, + { + "epoch": 0.002170374389582203, + "grad_norm": 13.31218433380127, + "learning_rate": 2.12e-06, + "loss": 1.4817, + "step": 160 + }, + { + "epoch": 0.0021839392295170917, + "grad_norm": 13.25493049621582, + "learning_rate": 2.133333333333334e-06, + "loss": 1.608, + "step": 161 + }, + { + "epoch": 0.0021975040694519805, + "grad_norm": 10.19467830657959, + "learning_rate": 2.1466666666666667e-06, + "loss": 1.0766, + "step": 162 + }, + { + "epoch": 0.0022110689093868693, + "grad_norm": 11.920318603515625, + "learning_rate": 2.16e-06, + "loss": 1.2979, + "step": 163 + }, + { + "epoch": 0.002224633749321758, + "grad_norm": 14.280396461486816, + "learning_rate": 2.1733333333333334e-06, + "loss": 1.495, + "step": 164 + }, + { + "epoch": 0.002238198589256647, + "grad_norm": 15.358216285705566, + "learning_rate": 2.1866666666666668e-06, + "loss": 1.8681, + "step": 165 + }, + { + "epoch": 0.0022517634291915356, + "grad_norm": 14.859749794006348, + "learning_rate": 2.2e-06, + "loss": 1.8189, + "step": 166 + }, + { + "epoch": 0.0022653282691264243, + "grad_norm": 9.976550102233887, + "learning_rate": 2.2133333333333335e-06, + "loss": 1.2283, + "step": 167 + }, + { + "epoch": 0.002278893109061313, + "grad_norm": 12.821454048156738, + "learning_rate": 2.226666666666667e-06, + "loss": 1.2445, + "step": 168 + }, + { + "epoch": 0.002292457948996202, + "grad_norm": 16.468740463256836, + "learning_rate": 2.24e-06, + "loss": 1.5841, + "step": 169 + }, + { + "epoch": 0.0023060227889310906, + "grad_norm": 10.024906158447266, + "learning_rate": 2.2533333333333335e-06, + "loss": 1.0382, + "step": 170 + }, + { + "epoch": 0.0023195876288659794, + "grad_norm": 15.16279411315918, + "learning_rate": 2.266666666666667e-06, + "loss": 1.5624, + "step": 171 + }, + { + "epoch": 0.002333152468800868, + "grad_norm": 15.173832893371582, + "learning_rate": 2.28e-06, + "loss": 1.606, + "step": 172 + }, + { + "epoch": 0.002346717308735757, + "grad_norm": 10.974044799804688, + "learning_rate": 2.2933333333333335e-06, + "loss": 1.1723, + "step": 173 + }, + { + "epoch": 0.0023602821486706457, + "grad_norm": 11.49937915802002, + "learning_rate": 2.306666666666667e-06, + "loss": 1.3288, + "step": 174 + }, + { + "epoch": 0.0023738469886055344, + "grad_norm": 11.522683143615723, + "learning_rate": 2.3200000000000002e-06, + "loss": 1.311, + "step": 175 + }, + { + "epoch": 0.002387411828540423, + "grad_norm": 10.506412506103516, + "learning_rate": 2.3333333333333336e-06, + "loss": 1.1, + "step": 176 + }, + { + "epoch": 0.002400976668475312, + "grad_norm": 10.82577896118164, + "learning_rate": 2.346666666666667e-06, + "loss": 1.0959, + "step": 177 + }, + { + "epoch": 0.0024145415084102007, + "grad_norm": 15.245636940002441, + "learning_rate": 2.3600000000000003e-06, + "loss": 1.5882, + "step": 178 + }, + { + "epoch": 0.0024281063483450895, + "grad_norm": 15.767924308776855, + "learning_rate": 2.3733333333333336e-06, + "loss": 1.4674, + "step": 179 + }, + { + "epoch": 0.0024416711882799783, + "grad_norm": 13.767078399658203, + "learning_rate": 2.386666666666667e-06, + "loss": 1.4656, + "step": 180 + }, + { + "epoch": 0.002455236028214867, + "grad_norm": 13.210156440734863, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.3215, + "step": 181 + }, + { + "epoch": 0.002468800868149756, + "grad_norm": 12.657727241516113, + "learning_rate": 2.4133333333333337e-06, + "loss": 1.3513, + "step": 182 + }, + { + "epoch": 0.0024823657080846446, + "grad_norm": 10.08989429473877, + "learning_rate": 2.426666666666667e-06, + "loss": 1.053, + "step": 183 + }, + { + "epoch": 0.0024959305480195333, + "grad_norm": 15.046239852905273, + "learning_rate": 2.4400000000000004e-06, + "loss": 1.3805, + "step": 184 + }, + { + "epoch": 0.002509495387954422, + "grad_norm": 9.276785850524902, + "learning_rate": 2.4533333333333333e-06, + "loss": 0.9951, + "step": 185 + }, + { + "epoch": 0.002523060227889311, + "grad_norm": 12.126082420349121, + "learning_rate": 2.466666666666667e-06, + "loss": 1.2206, + "step": 186 + }, + { + "epoch": 0.0025366250678241996, + "grad_norm": 11.439530372619629, + "learning_rate": 2.4800000000000004e-06, + "loss": 1.1019, + "step": 187 + }, + { + "epoch": 0.0025501899077590884, + "grad_norm": 12.447957038879395, + "learning_rate": 2.4933333333333333e-06, + "loss": 1.1439, + "step": 188 + }, + { + "epoch": 0.002563754747693977, + "grad_norm": 9.041702270507812, + "learning_rate": 2.5066666666666667e-06, + "loss": 0.956, + "step": 189 + }, + { + "epoch": 0.002577319587628866, + "grad_norm": 12.3440580368042, + "learning_rate": 2.52e-06, + "loss": 1.1236, + "step": 190 + }, + { + "epoch": 0.0025908844275637547, + "grad_norm": 9.752237319946289, + "learning_rate": 2.5333333333333338e-06, + "loss": 0.9312, + "step": 191 + }, + { + "epoch": 0.0026044492674986435, + "grad_norm": 9.290597915649414, + "learning_rate": 2.5466666666666667e-06, + "loss": 0.9042, + "step": 192 + }, + { + "epoch": 0.0026180141074335322, + "grad_norm": 10.698563575744629, + "learning_rate": 2.56e-06, + "loss": 1.05, + "step": 193 + }, + { + "epoch": 0.002631578947368421, + "grad_norm": 12.696000099182129, + "learning_rate": 2.573333333333334e-06, + "loss": 1.1998, + "step": 194 + }, + { + "epoch": 0.0026451437873033098, + "grad_norm": 12.105278015136719, + "learning_rate": 2.5866666666666667e-06, + "loss": 1.1567, + "step": 195 + }, + { + "epoch": 0.0026587086272381985, + "grad_norm": 14.100083351135254, + "learning_rate": 2.6e-06, + "loss": 1.0665, + "step": 196 + }, + { + "epoch": 0.0026722734671730873, + "grad_norm": 14.057904243469238, + "learning_rate": 2.6133333333333334e-06, + "loss": 1.2221, + "step": 197 + }, + { + "epoch": 0.002685838307107976, + "grad_norm": 10.322908401489258, + "learning_rate": 2.6266666666666668e-06, + "loss": 0.8747, + "step": 198 + }, + { + "epoch": 0.002699403147042865, + "grad_norm": 11.847285270690918, + "learning_rate": 2.64e-06, + "loss": 1.1773, + "step": 199 + }, + { + "epoch": 0.0027129679869777536, + "grad_norm": 9.219653129577637, + "learning_rate": 2.6533333333333335e-06, + "loss": 0.8239, + "step": 200 + }, + { + "epoch": 0.0027265328269126424, + "grad_norm": 11.0589017868042, + "learning_rate": 2.666666666666667e-06, + "loss": 0.8677, + "step": 201 + }, + { + "epoch": 0.002740097666847531, + "grad_norm": 14.54589557647705, + "learning_rate": 2.68e-06, + "loss": 1.0811, + "step": 202 + }, + { + "epoch": 0.00275366250678242, + "grad_norm": 10.602853775024414, + "learning_rate": 2.6933333333333335e-06, + "loss": 0.7882, + "step": 203 + }, + { + "epoch": 0.0027672273467173087, + "grad_norm": 14.062017440795898, + "learning_rate": 2.706666666666667e-06, + "loss": 1.0386, + "step": 204 + }, + { + "epoch": 0.0027807921866521974, + "grad_norm": 10.682347297668457, + "learning_rate": 2.7200000000000002e-06, + "loss": 0.8171, + "step": 205 + }, + { + "epoch": 0.002794357026587086, + "grad_norm": 11.652628898620605, + "learning_rate": 2.7333333333333336e-06, + "loss": 0.8443, + "step": 206 + }, + { + "epoch": 0.002807921866521975, + "grad_norm": 18.055089950561523, + "learning_rate": 2.746666666666667e-06, + "loss": 1.2054, + "step": 207 + }, + { + "epoch": 0.0028214867064568637, + "grad_norm": 10.893925666809082, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.8325, + "step": 208 + }, + { + "epoch": 0.0028350515463917525, + "grad_norm": 10.540617942810059, + "learning_rate": 2.7733333333333336e-06, + "loss": 0.7659, + "step": 209 + }, + { + "epoch": 0.0028486163863266412, + "grad_norm": 419.3698425292969, + "learning_rate": 2.786666666666667e-06, + "loss": 1.5027, + "step": 210 + }, + { + "epoch": 0.00286218122626153, + "grad_norm": 126.77201843261719, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.1524, + "step": 211 + }, + { + "epoch": 0.0028757460661964188, + "grad_norm": 15.473610877990723, + "learning_rate": 2.8133333333333336e-06, + "loss": 1.0535, + "step": 212 + }, + { + "epoch": 0.0028893109061313075, + "grad_norm": 14.767268180847168, + "learning_rate": 2.826666666666667e-06, + "loss": 0.9184, + "step": 213 + }, + { + "epoch": 0.0029028757460661963, + "grad_norm": 13.185673713684082, + "learning_rate": 2.84e-06, + "loss": 0.8815, + "step": 214 + }, + { + "epoch": 0.002916440586001085, + "grad_norm": 7.568323612213135, + "learning_rate": 2.8533333333333337e-06, + "loss": 0.4113, + "step": 215 + }, + { + "epoch": 0.002930005425935974, + "grad_norm": 12.743754386901855, + "learning_rate": 2.866666666666667e-06, + "loss": 0.6079, + "step": 216 + }, + { + "epoch": 0.0029435702658708626, + "grad_norm": 10.583744049072266, + "learning_rate": 2.88e-06, + "loss": 0.7232, + "step": 217 + }, + { + "epoch": 0.0029571351058057514, + "grad_norm": 13.746326446533203, + "learning_rate": 2.8933333333333337e-06, + "loss": 0.5805, + "step": 218 + }, + { + "epoch": 0.00297069994574064, + "grad_norm": 11.463306427001953, + "learning_rate": 2.906666666666667e-06, + "loss": 0.6862, + "step": 219 + }, + { + "epoch": 0.002984264785675529, + "grad_norm": 11.36857795715332, + "learning_rate": 2.92e-06, + "loss": 0.7255, + "step": 220 + }, + { + "epoch": 0.0029978296256104177, + "grad_norm": 13.769044876098633, + "learning_rate": 2.9333333333333338e-06, + "loss": 0.7867, + "step": 221 + }, + { + "epoch": 0.0030113944655453064, + "grad_norm": 13.091547012329102, + "learning_rate": 2.946666666666667e-06, + "loss": 0.7954, + "step": 222 + }, + { + "epoch": 0.003024959305480195, + "grad_norm": 11.349958419799805, + "learning_rate": 2.96e-06, + "loss": 0.5764, + "step": 223 + }, + { + "epoch": 0.003038524145415084, + "grad_norm": 9.373363494873047, + "learning_rate": 2.973333333333334e-06, + "loss": 0.4343, + "step": 224 + }, + { + "epoch": 0.0030520889853499727, + "grad_norm": 8.077041625976562, + "learning_rate": 2.986666666666667e-06, + "loss": 0.4292, + "step": 225 + }, + { + "epoch": 0.0030656538252848615, + "grad_norm": 15.880824089050293, + "learning_rate": 3e-06, + "loss": 0.9626, + "step": 226 + }, + { + "epoch": 0.0030792186652197503, + "grad_norm": 10.466856956481934, + "learning_rate": 3.013333333333334e-06, + "loss": 0.667, + "step": 227 + }, + { + "epoch": 0.003092783505154639, + "grad_norm": 7.603901386260986, + "learning_rate": 3.0266666666666668e-06, + "loss": 0.4072, + "step": 228 + }, + { + "epoch": 0.003106348345089528, + "grad_norm": 9.839252471923828, + "learning_rate": 3.04e-06, + "loss": 0.6784, + "step": 229 + }, + { + "epoch": 0.0031199131850244166, + "grad_norm": 11.480759620666504, + "learning_rate": 3.053333333333334e-06, + "loss": 0.7479, + "step": 230 + }, + { + "epoch": 0.0031334780249593053, + "grad_norm": 10.236591339111328, + "learning_rate": 3.066666666666667e-06, + "loss": 0.6685, + "step": 231 + }, + { + "epoch": 0.003147042864894194, + "grad_norm": 9.190509796142578, + "learning_rate": 3.08e-06, + "loss": 0.4982, + "step": 232 + }, + { + "epoch": 0.003160607704829083, + "grad_norm": 12.890752792358398, + "learning_rate": 3.093333333333334e-06, + "loss": 0.6991, + "step": 233 + }, + { + "epoch": 0.0031741725447639716, + "grad_norm": 13.122543334960938, + "learning_rate": 3.106666666666667e-06, + "loss": 0.7542, + "step": 234 + }, + { + "epoch": 0.0031877373846988604, + "grad_norm": 9.657559394836426, + "learning_rate": 3.12e-06, + "loss": 0.6773, + "step": 235 + }, + { + "epoch": 0.003201302224633749, + "grad_norm": 9.531902313232422, + "learning_rate": 3.133333333333334e-06, + "loss": 0.6669, + "step": 236 + }, + { + "epoch": 0.003214867064568638, + "grad_norm": 10.034440994262695, + "learning_rate": 3.146666666666667e-06, + "loss": 0.5664, + "step": 237 + }, + { + "epoch": 0.0032284319045035267, + "grad_norm": 14.028892517089844, + "learning_rate": 3.1600000000000002e-06, + "loss": 0.7877, + "step": 238 + }, + { + "epoch": 0.0032419967444384155, + "grad_norm": 10.97425651550293, + "learning_rate": 3.173333333333334e-06, + "loss": 0.6554, + "step": 239 + }, + { + "epoch": 0.0032555615843733042, + "grad_norm": 9.296300888061523, + "learning_rate": 3.186666666666667e-06, + "loss": 0.4334, + "step": 240 + }, + { + "epoch": 0.003269126424308193, + "grad_norm": 10.90347671508789, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.4589, + "step": 241 + }, + { + "epoch": 0.0032826912642430818, + "grad_norm": 11.931292533874512, + "learning_rate": 3.213333333333334e-06, + "loss": 0.8794, + "step": 242 + }, + { + "epoch": 0.0032962561041779705, + "grad_norm": 11.814854621887207, + "learning_rate": 3.226666666666667e-06, + "loss": 0.6056, + "step": 243 + }, + { + "epoch": 0.0033098209441128593, + "grad_norm": 9.933923721313477, + "learning_rate": 3.2400000000000003e-06, + "loss": 0.5539, + "step": 244 + }, + { + "epoch": 0.003323385784047748, + "grad_norm": 11.281537055969238, + "learning_rate": 3.2533333333333332e-06, + "loss": 0.7736, + "step": 245 + }, + { + "epoch": 0.003336950623982637, + "grad_norm": 9.407797813415527, + "learning_rate": 3.266666666666667e-06, + "loss": 0.5364, + "step": 246 + }, + { + "epoch": 0.0033505154639175256, + "grad_norm": 11.30167007446289, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.6017, + "step": 247 + }, + { + "epoch": 0.0033640803038524143, + "grad_norm": 13.587013244628906, + "learning_rate": 3.2933333333333333e-06, + "loss": 0.7426, + "step": 248 + }, + { + "epoch": 0.003377645143787303, + "grad_norm": 9.342884063720703, + "learning_rate": 3.306666666666667e-06, + "loss": 0.5546, + "step": 249 + }, + { + "epoch": 0.003391209983722192, + "grad_norm": 11.549297332763672, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.7641, + "step": 250 + }, + { + "epoch": 0.0034047748236570806, + "grad_norm": 27.99814224243164, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.6979, + "step": 251 + }, + { + "epoch": 0.0034183396635919694, + "grad_norm": 10.31883716583252, + "learning_rate": 3.346666666666667e-06, + "loss": 0.3926, + "step": 252 + }, + { + "epoch": 0.003431904503526858, + "grad_norm": 9.961878776550293, + "learning_rate": 3.3600000000000004e-06, + "loss": 0.7379, + "step": 253 + }, + { + "epoch": 0.003445469343461747, + "grad_norm": 10.87812328338623, + "learning_rate": 3.3733333333333334e-06, + "loss": 0.4949, + "step": 254 + }, + { + "epoch": 0.0034590341833966357, + "grad_norm": 12.415057182312012, + "learning_rate": 3.386666666666667e-06, + "loss": 0.7875, + "step": 255 + }, + { + "epoch": 0.003472599023331525, + "grad_norm": 11.507883071899414, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.8186, + "step": 256 + }, + { + "epoch": 0.0034861638632664137, + "grad_norm": 11.974535942077637, + "learning_rate": 3.4133333333333334e-06, + "loss": 0.6245, + "step": 257 + }, + { + "epoch": 0.0034997287032013024, + "grad_norm": 11.632085800170898, + "learning_rate": 3.426666666666667e-06, + "loss": 0.4938, + "step": 258 + }, + { + "epoch": 0.003513293543136191, + "grad_norm": 17.386579513549805, + "learning_rate": 3.44e-06, + "loss": 0.8032, + "step": 259 + }, + { + "epoch": 0.00352685838307108, + "grad_norm": 9.325297355651855, + "learning_rate": 3.4533333333333334e-06, + "loss": 0.5036, + "step": 260 + }, + { + "epoch": 0.0035404232230059687, + "grad_norm": 14.709108352661133, + "learning_rate": 3.4666666666666672e-06, + "loss": 0.7261, + "step": 261 + }, + { + "epoch": 0.0035539880629408575, + "grad_norm": 12.798392295837402, + "learning_rate": 3.48e-06, + "loss": 0.5811, + "step": 262 + }, + { + "epoch": 0.0035675529028757463, + "grad_norm": 11.104814529418945, + "learning_rate": 3.4933333333333335e-06, + "loss": 0.7191, + "step": 263 + }, + { + "epoch": 0.003581117742810635, + "grad_norm": 11.98509693145752, + "learning_rate": 3.5066666666666673e-06, + "loss": 0.6126, + "step": 264 + }, + { + "epoch": 0.003594682582745524, + "grad_norm": 11.815077781677246, + "learning_rate": 3.52e-06, + "loss": 0.5851, + "step": 265 + }, + { + "epoch": 0.0036082474226804126, + "grad_norm": 10.482637405395508, + "learning_rate": 3.5333333333333335e-06, + "loss": 0.6046, + "step": 266 + }, + { + "epoch": 0.0036218122626153013, + "grad_norm": 11.331931114196777, + "learning_rate": 3.5466666666666673e-06, + "loss": 0.8701, + "step": 267 + }, + { + "epoch": 0.00363537710255019, + "grad_norm": 8.687963485717773, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.4519, + "step": 268 + }, + { + "epoch": 0.003648941942485079, + "grad_norm": 11.50002384185791, + "learning_rate": 3.5733333333333336e-06, + "loss": 0.7617, + "step": 269 + }, + { + "epoch": 0.0036625067824199676, + "grad_norm": 10.013592720031738, + "learning_rate": 3.5866666666666673e-06, + "loss": 0.6309, + "step": 270 + }, + { + "epoch": 0.0036760716223548564, + "grad_norm": 9.661239624023438, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.8054, + "step": 271 + }, + { + "epoch": 0.003689636462289745, + "grad_norm": 7.398721218109131, + "learning_rate": 3.6133333333333336e-06, + "loss": 0.4317, + "step": 272 + }, + { + "epoch": 0.003703201302224634, + "grad_norm": 13.290780067443848, + "learning_rate": 3.6266666666666674e-06, + "loss": 0.8972, + "step": 273 + }, + { + "epoch": 0.0037167661421595227, + "grad_norm": 10.806015014648438, + "learning_rate": 3.6400000000000003e-06, + "loss": 0.6503, + "step": 274 + }, + { + "epoch": 0.0037303309820944115, + "grad_norm": 12.646716117858887, + "learning_rate": 3.6533333333333336e-06, + "loss": 0.6457, + "step": 275 + }, + { + "epoch": 0.0037438958220293002, + "grad_norm": 9.958603858947754, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.6338, + "step": 276 + }, + { + "epoch": 0.003757460661964189, + "grad_norm": 11.09229850769043, + "learning_rate": 3.6800000000000003e-06, + "loss": 0.9058, + "step": 277 + }, + { + "epoch": 0.0037710255018990778, + "grad_norm": 10.864788055419922, + "learning_rate": 3.6933333333333337e-06, + "loss": 0.619, + "step": 278 + }, + { + "epoch": 0.0037845903418339665, + "grad_norm": 11.129522323608398, + "learning_rate": 3.7066666666666666e-06, + "loss": 0.6795, + "step": 279 + }, + { + "epoch": 0.0037981551817688553, + "grad_norm": 7.458362102508545, + "learning_rate": 3.7200000000000004e-06, + "loss": 0.5266, + "step": 280 + }, + { + "epoch": 0.003811720021703744, + "grad_norm": 10.560969352722168, + "learning_rate": 3.7333333333333337e-06, + "loss": 0.6546, + "step": 281 + }, + { + "epoch": 0.003825284861638633, + "grad_norm": 9.30290699005127, + "learning_rate": 3.7466666666666667e-06, + "loss": 0.4492, + "step": 282 + }, + { + "epoch": 0.0038388497015735216, + "grad_norm": 9.240416526794434, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.5835, + "step": 283 + }, + { + "epoch": 0.0038524145415084103, + "grad_norm": 7.9771647453308105, + "learning_rate": 3.7733333333333338e-06, + "loss": 0.4386, + "step": 284 + }, + { + "epoch": 0.003865979381443299, + "grad_norm": 12.546287536621094, + "learning_rate": 3.7866666666666667e-06, + "loss": 1.1463, + "step": 285 + }, + { + "epoch": 0.003879544221378188, + "grad_norm": 11.461222648620605, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7051, + "step": 286 + }, + { + "epoch": 0.0038931090613130766, + "grad_norm": 7.314365386962891, + "learning_rate": 3.813333333333334e-06, + "loss": 0.4085, + "step": 287 + }, + { + "epoch": 0.003906673901247965, + "grad_norm": 8.694536209106445, + "learning_rate": 3.826666666666667e-06, + "loss": 0.5046, + "step": 288 + }, + { + "epoch": 0.003920238741182854, + "grad_norm": 7.469860076904297, + "learning_rate": 3.8400000000000005e-06, + "loss": 0.4161, + "step": 289 + }, + { + "epoch": 0.003933803581117743, + "grad_norm": 9.14952278137207, + "learning_rate": 3.853333333333334e-06, + "loss": 0.5604, + "step": 290 + }, + { + "epoch": 0.003947368421052632, + "grad_norm": 10.33279800415039, + "learning_rate": 3.866666666666667e-06, + "loss": 0.6897, + "step": 291 + }, + { + "epoch": 0.0039609332609875205, + "grad_norm": 11.198920249938965, + "learning_rate": 3.88e-06, + "loss": 0.6818, + "step": 292 + }, + { + "epoch": 0.003974498100922409, + "grad_norm": 12.492998123168945, + "learning_rate": 3.893333333333333e-06, + "loss": 0.6517, + "step": 293 + }, + { + "epoch": 0.003988062940857298, + "grad_norm": 8.046018600463867, + "learning_rate": 3.906666666666667e-06, + "loss": 0.5482, + "step": 294 + }, + { + "epoch": 0.004001627780792187, + "grad_norm": 10.134740829467773, + "learning_rate": 3.920000000000001e-06, + "loss": 0.6773, + "step": 295 + }, + { + "epoch": 0.0040151926207270755, + "grad_norm": 9.901281356811523, + "learning_rate": 3.9333333333333335e-06, + "loss": 0.7489, + "step": 296 + }, + { + "epoch": 0.004028757460661964, + "grad_norm": 9.50572681427002, + "learning_rate": 3.946666666666667e-06, + "loss": 0.4926, + "step": 297 + }, + { + "epoch": 0.004042322300596853, + "grad_norm": 11.014986038208008, + "learning_rate": 3.96e-06, + "loss": 0.7032, + "step": 298 + }, + { + "epoch": 0.004055887140531742, + "grad_norm": 7.614107608795166, + "learning_rate": 3.973333333333333e-06, + "loss": 0.4033, + "step": 299 + }, + { + "epoch": 0.004069451980466631, + "grad_norm": 12.066905975341797, + "learning_rate": 3.986666666666667e-06, + "loss": 0.8303, + "step": 300 + }, + { + "epoch": 0.004083016820401519, + "grad_norm": 8.151849746704102, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5789, + "step": 301 + }, + { + "epoch": 0.004096581660336408, + "grad_norm": 9.274304389953613, + "learning_rate": 4.013333333333334e-06, + "loss": 0.8466, + "step": 302 + }, + { + "epoch": 0.004110146500271297, + "grad_norm": 12.942228317260742, + "learning_rate": 4.026666666666667e-06, + "loss": 0.8857, + "step": 303 + }, + { + "epoch": 0.004123711340206186, + "grad_norm": 12.72079086303711, + "learning_rate": 4.04e-06, + "loss": 0.7505, + "step": 304 + }, + { + "epoch": 0.004137276180141074, + "grad_norm": 9.15962028503418, + "learning_rate": 4.053333333333333e-06, + "loss": 0.7775, + "step": 305 + }, + { + "epoch": 0.004150841020075963, + "grad_norm": 10.048176765441895, + "learning_rate": 4.066666666666667e-06, + "loss": 0.8055, + "step": 306 + }, + { + "epoch": 0.004164405860010852, + "grad_norm": 8.6920166015625, + "learning_rate": 4.08e-06, + "loss": 0.4936, + "step": 307 + }, + { + "epoch": 0.004177970699945741, + "grad_norm": 13.407120704650879, + "learning_rate": 4.093333333333334e-06, + "loss": 0.7181, + "step": 308 + }, + { + "epoch": 0.0041915355398806295, + "grad_norm": 8.847993850708008, + "learning_rate": 4.1066666666666674e-06, + "loss": 0.7031, + "step": 309 + }, + { + "epoch": 0.004205100379815518, + "grad_norm": 11.165973663330078, + "learning_rate": 4.12e-06, + "loss": 0.8473, + "step": 310 + }, + { + "epoch": 0.004218665219750407, + "grad_norm": 8.264348030090332, + "learning_rate": 4.133333333333333e-06, + "loss": 0.5387, + "step": 311 + }, + { + "epoch": 0.004232230059685296, + "grad_norm": 11.181669235229492, + "learning_rate": 4.146666666666667e-06, + "loss": 0.6484, + "step": 312 + }, + { + "epoch": 0.0042457948996201846, + "grad_norm": 10.285588264465332, + "learning_rate": 4.16e-06, + "loss": 0.6329, + "step": 313 + }, + { + "epoch": 0.004259359739555073, + "grad_norm": 12.395646095275879, + "learning_rate": 4.173333333333334e-06, + "loss": 0.7357, + "step": 314 + }, + { + "epoch": 0.004272924579489962, + "grad_norm": 9.816717147827148, + "learning_rate": 4.1866666666666675e-06, + "loss": 0.5694, + "step": 315 + }, + { + "epoch": 0.004286489419424851, + "grad_norm": 13.561861991882324, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.7623, + "step": 316 + }, + { + "epoch": 0.00430005425935974, + "grad_norm": 10.933150291442871, + "learning_rate": 4.213333333333333e-06, + "loss": 0.5693, + "step": 317 + }, + { + "epoch": 0.004313619099294628, + "grad_norm": 13.823385238647461, + "learning_rate": 4.226666666666667e-06, + "loss": 0.9241, + "step": 318 + }, + { + "epoch": 0.004327183939229517, + "grad_norm": 15.58350944519043, + "learning_rate": 4.24e-06, + "loss": 0.8872, + "step": 319 + }, + { + "epoch": 0.004340748779164406, + "grad_norm": 13.24020004272461, + "learning_rate": 4.253333333333334e-06, + "loss": 0.7223, + "step": 320 + }, + { + "epoch": 0.004354313619099295, + "grad_norm": 12.51429271697998, + "learning_rate": 4.266666666666668e-06, + "loss": 0.7562, + "step": 321 + }, + { + "epoch": 0.0043678784590341834, + "grad_norm": 11.015605926513672, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.5869, + "step": 322 + }, + { + "epoch": 0.004381443298969072, + "grad_norm": 10.140324592590332, + "learning_rate": 4.2933333333333334e-06, + "loss": 0.6232, + "step": 323 + }, + { + "epoch": 0.004395008138903961, + "grad_norm": 10.032035827636719, + "learning_rate": 4.306666666666666e-06, + "loss": 0.7135, + "step": 324 + }, + { + "epoch": 0.00440857297883885, + "grad_norm": 13.127291679382324, + "learning_rate": 4.32e-06, + "loss": 0.7985, + "step": 325 + }, + { + "epoch": 0.0044221378187737385, + "grad_norm": 14.273109436035156, + "learning_rate": 4.333333333333334e-06, + "loss": 0.7619, + "step": 326 + }, + { + "epoch": 0.004435702658708627, + "grad_norm": 10.589089393615723, + "learning_rate": 4.346666666666667e-06, + "loss": 0.6189, + "step": 327 + }, + { + "epoch": 0.004449267498643516, + "grad_norm": 8.84294319152832, + "learning_rate": 4.360000000000001e-06, + "loss": 0.4853, + "step": 328 + }, + { + "epoch": 0.004462832338578405, + "grad_norm": 10.716634750366211, + "learning_rate": 4.3733333333333335e-06, + "loss": 0.695, + "step": 329 + }, + { + "epoch": 0.004476397178513294, + "grad_norm": 14.340614318847656, + "learning_rate": 4.3866666666666665e-06, + "loss": 0.6939, + "step": 330 + }, + { + "epoch": 0.004489962018448182, + "grad_norm": 15.23061466217041, + "learning_rate": 4.4e-06, + "loss": 0.8588, + "step": 331 + }, + { + "epoch": 0.004503526858383071, + "grad_norm": 10.43969440460205, + "learning_rate": 4.413333333333334e-06, + "loss": 0.6159, + "step": 332 + }, + { + "epoch": 0.00451709169831796, + "grad_norm": 12.997859001159668, + "learning_rate": 4.426666666666667e-06, + "loss": 0.7953, + "step": 333 + }, + { + "epoch": 0.004530656538252849, + "grad_norm": 12.536703109741211, + "learning_rate": 4.440000000000001e-06, + "loss": 0.7339, + "step": 334 + }, + { + "epoch": 0.004544221378187737, + "grad_norm": 8.99543571472168, + "learning_rate": 4.453333333333334e-06, + "loss": 0.5021, + "step": 335 + }, + { + "epoch": 0.004557786218122626, + "grad_norm": 12.806574821472168, + "learning_rate": 4.4666666666666665e-06, + "loss": 0.6863, + "step": 336 + }, + { + "epoch": 0.004571351058057515, + "grad_norm": 12.707160949707031, + "learning_rate": 4.48e-06, + "loss": 1.1108, + "step": 337 + }, + { + "epoch": 0.004584915897992404, + "grad_norm": 12.763883590698242, + "learning_rate": 4.493333333333333e-06, + "loss": 0.795, + "step": 338 + }, + { + "epoch": 0.0045984807379272925, + "grad_norm": 12.135883331298828, + "learning_rate": 4.506666666666667e-06, + "loss": 0.6794, + "step": 339 + }, + { + "epoch": 0.004612045577862181, + "grad_norm": 12.238641738891602, + "learning_rate": 4.520000000000001e-06, + "loss": 0.7768, + "step": 340 + }, + { + "epoch": 0.00462561041779707, + "grad_norm": 13.793907165527344, + "learning_rate": 4.533333333333334e-06, + "loss": 0.9811, + "step": 341 + }, + { + "epoch": 0.004639175257731959, + "grad_norm": 12.732327461242676, + "learning_rate": 4.546666666666667e-06, + "loss": 0.8724, + "step": 342 + }, + { + "epoch": 0.0046527400976668475, + "grad_norm": 14.087691307067871, + "learning_rate": 4.56e-06, + "loss": 0.8568, + "step": 343 + }, + { + "epoch": 0.004666304937601736, + "grad_norm": 15.788073539733887, + "learning_rate": 4.573333333333333e-06, + "loss": 1.0979, + "step": 344 + }, + { + "epoch": 0.004679869777536625, + "grad_norm": 11.585338592529297, + "learning_rate": 4.586666666666667e-06, + "loss": 0.5895, + "step": 345 + }, + { + "epoch": 0.004693434617471514, + "grad_norm": 12.906942367553711, + "learning_rate": 4.600000000000001e-06, + "loss": 0.585, + "step": 346 + }, + { + "epoch": 0.004706999457406403, + "grad_norm": 8.528708457946777, + "learning_rate": 4.613333333333334e-06, + "loss": 0.5236, + "step": 347 + }, + { + "epoch": 0.004720564297341291, + "grad_norm": 11.7548246383667, + "learning_rate": 4.626666666666667e-06, + "loss": 0.5892, + "step": 348 + }, + { + "epoch": 0.00473412913727618, + "grad_norm": 10.338576316833496, + "learning_rate": 4.6400000000000005e-06, + "loss": 0.6664, + "step": 349 + }, + { + "epoch": 0.004747693977211069, + "grad_norm": 12.24909496307373, + "learning_rate": 4.653333333333333e-06, + "loss": 0.563, + "step": 350 + }, + { + "epoch": 0.004761258817145958, + "grad_norm": 13.444903373718262, + "learning_rate": 4.666666666666667e-06, + "loss": 0.8506, + "step": 351 + }, + { + "epoch": 0.004774823657080846, + "grad_norm": 12.478318214416504, + "learning_rate": 4.680000000000001e-06, + "loss": 0.6912, + "step": 352 + }, + { + "epoch": 0.004788388497015735, + "grad_norm": 10.708884239196777, + "learning_rate": 4.693333333333334e-06, + "loss": 0.6012, + "step": 353 + }, + { + "epoch": 0.004801953336950624, + "grad_norm": 10.743097305297852, + "learning_rate": 4.706666666666667e-06, + "loss": 0.7484, + "step": 354 + }, + { + "epoch": 0.004815518176885513, + "grad_norm": 12.8298921585083, + "learning_rate": 4.7200000000000005e-06, + "loss": 0.7067, + "step": 355 + }, + { + "epoch": 0.0048290830168204015, + "grad_norm": 15.00815200805664, + "learning_rate": 4.7333333333333335e-06, + "loss": 0.6472, + "step": 356 + }, + { + "epoch": 0.00484264785675529, + "grad_norm": 12.650643348693848, + "learning_rate": 4.746666666666667e-06, + "loss": 0.7894, + "step": 357 + }, + { + "epoch": 0.004856212696690179, + "grad_norm": 12.84103012084961, + "learning_rate": 4.76e-06, + "loss": 0.8762, + "step": 358 + }, + { + "epoch": 0.004869777536625068, + "grad_norm": 10.693416595458984, + "learning_rate": 4.773333333333334e-06, + "loss": 0.6576, + "step": 359 + }, + { + "epoch": 0.0048833423765599565, + "grad_norm": 15.560778617858887, + "learning_rate": 4.786666666666667e-06, + "loss": 0.8949, + "step": 360 + }, + { + "epoch": 0.004896907216494845, + "grad_norm": 8.915786743164062, + "learning_rate": 4.800000000000001e-06, + "loss": 0.5811, + "step": 361 + }, + { + "epoch": 0.004910472056429734, + "grad_norm": 7.993461608886719, + "learning_rate": 4.8133333333333336e-06, + "loss": 0.4028, + "step": 362 + }, + { + "epoch": 0.004924036896364623, + "grad_norm": 10.388050079345703, + "learning_rate": 4.826666666666667e-06, + "loss": 0.8997, + "step": 363 + }, + { + "epoch": 0.004937601736299512, + "grad_norm": 10.94035816192627, + "learning_rate": 4.84e-06, + "loss": 0.6503, + "step": 364 + }, + { + "epoch": 0.0049511665762344, + "grad_norm": 10.65006160736084, + "learning_rate": 4.853333333333334e-06, + "loss": 0.5838, + "step": 365 + }, + { + "epoch": 0.004964731416169289, + "grad_norm": 8.271740913391113, + "learning_rate": 4.866666666666667e-06, + "loss": 0.5896, + "step": 366 + }, + { + "epoch": 0.004978296256104178, + "grad_norm": 8.88534927368164, + "learning_rate": 4.880000000000001e-06, + "loss": 0.5852, + "step": 367 + }, + { + "epoch": 0.004991861096039067, + "grad_norm": 9.711320877075195, + "learning_rate": 4.893333333333334e-06, + "loss": 0.6128, + "step": 368 + }, + { + "epoch": 0.0050054259359739554, + "grad_norm": 11.093830108642578, + "learning_rate": 4.9066666666666666e-06, + "loss": 0.6288, + "step": 369 + }, + { + "epoch": 0.005018990775908844, + "grad_norm": 11.355315208435059, + "learning_rate": 4.92e-06, + "loss": 0.8694, + "step": 370 + }, + { + "epoch": 0.005032555615843733, + "grad_norm": 14.030156135559082, + "learning_rate": 4.933333333333334e-06, + "loss": 0.6426, + "step": 371 + }, + { + "epoch": 0.005046120455778622, + "grad_norm": 10.229035377502441, + "learning_rate": 4.946666666666667e-06, + "loss": 0.6517, + "step": 372 + }, + { + "epoch": 0.0050596852957135105, + "grad_norm": 11.232610702514648, + "learning_rate": 4.960000000000001e-06, + "loss": 0.6078, + "step": 373 + }, + { + "epoch": 0.005073250135648399, + "grad_norm": 10.226860046386719, + "learning_rate": 4.973333333333334e-06, + "loss": 0.4591, + "step": 374 + }, + { + "epoch": 0.005086814975583288, + "grad_norm": 12.749695777893066, + "learning_rate": 4.986666666666667e-06, + "loss": 0.8302, + "step": 375 + }, + { + "epoch": 0.005100379815518177, + "grad_norm": 14.826332092285156, + "learning_rate": 5e-06, + "loss": 1.0341, + "step": 376 + }, + { + "epoch": 0.0051139446554530656, + "grad_norm": 10.337685585021973, + "learning_rate": 5.013333333333333e-06, + "loss": 0.7354, + "step": 377 + }, + { + "epoch": 0.005127509495387954, + "grad_norm": 13.365252494812012, + "learning_rate": 5.026666666666667e-06, + "loss": 0.6354, + "step": 378 + }, + { + "epoch": 0.005141074335322843, + "grad_norm": 13.6273775100708, + "learning_rate": 5.04e-06, + "loss": 0.6383, + "step": 379 + }, + { + "epoch": 0.005154639175257732, + "grad_norm": 13.137993812561035, + "learning_rate": 5.053333333333334e-06, + "loss": 0.8173, + "step": 380 + }, + { + "epoch": 0.005168204015192621, + "grad_norm": 15.604735374450684, + "learning_rate": 5.0666666666666676e-06, + "loss": 0.8937, + "step": 381 + }, + { + "epoch": 0.005181768855127509, + "grad_norm": 8.28209400177002, + "learning_rate": 5.0800000000000005e-06, + "loss": 0.4823, + "step": 382 + }, + { + "epoch": 0.005195333695062398, + "grad_norm": 11.052202224731445, + "learning_rate": 5.093333333333333e-06, + "loss": 0.7496, + "step": 383 + }, + { + "epoch": 0.005208898534997287, + "grad_norm": 13.050772666931152, + "learning_rate": 5.106666666666667e-06, + "loss": 0.7292, + "step": 384 + }, + { + "epoch": 0.005222463374932176, + "grad_norm": 11.485687255859375, + "learning_rate": 5.12e-06, + "loss": 0.6266, + "step": 385 + }, + { + "epoch": 0.0052360282148670645, + "grad_norm": 11.743054389953613, + "learning_rate": 5.133333333333334e-06, + "loss": 0.779, + "step": 386 + }, + { + "epoch": 0.005249593054801953, + "grad_norm": 9.54704475402832, + "learning_rate": 5.146666666666668e-06, + "loss": 0.7069, + "step": 387 + }, + { + "epoch": 0.005263157894736842, + "grad_norm": 10.612946510314941, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.729, + "step": 388 + }, + { + "epoch": 0.005276722734671731, + "grad_norm": 12.638702392578125, + "learning_rate": 5.1733333333333335e-06, + "loss": 0.7848, + "step": 389 + }, + { + "epoch": 0.0052902875746066195, + "grad_norm": 10.506863594055176, + "learning_rate": 5.186666666666667e-06, + "loss": 0.6553, + "step": 390 + }, + { + "epoch": 0.005303852414541508, + "grad_norm": 9.970185279846191, + "learning_rate": 5.2e-06, + "loss": 0.5511, + "step": 391 + }, + { + "epoch": 0.005317417254476397, + "grad_norm": 12.800992012023926, + "learning_rate": 5.213333333333334e-06, + "loss": 0.6602, + "step": 392 + }, + { + "epoch": 0.005330982094411286, + "grad_norm": 12.4931640625, + "learning_rate": 5.226666666666667e-06, + "loss": 0.8448, + "step": 393 + }, + { + "epoch": 0.005344546934346175, + "grad_norm": 10.761161804199219, + "learning_rate": 5.240000000000001e-06, + "loss": 0.7215, + "step": 394 + }, + { + "epoch": 0.005358111774281063, + "grad_norm": 14.286507606506348, + "learning_rate": 5.2533333333333336e-06, + "loss": 1.0743, + "step": 395 + }, + { + "epoch": 0.005371676614215952, + "grad_norm": 8.349054336547852, + "learning_rate": 5.2666666666666665e-06, + "loss": 0.8267, + "step": 396 + }, + { + "epoch": 0.005385241454150841, + "grad_norm": 9.830233573913574, + "learning_rate": 5.28e-06, + "loss": 0.6112, + "step": 397 + }, + { + "epoch": 0.00539880629408573, + "grad_norm": 14.200364112854004, + "learning_rate": 5.293333333333334e-06, + "loss": 0.9539, + "step": 398 + }, + { + "epoch": 0.005412371134020618, + "grad_norm": 11.197713851928711, + "learning_rate": 5.306666666666667e-06, + "loss": 0.7668, + "step": 399 + }, + { + "epoch": 0.005425935973955507, + "grad_norm": 14.517122268676758, + "learning_rate": 5.320000000000001e-06, + "loss": 0.7894, + "step": 400 + }, + { + "epoch": 0.005439500813890396, + "grad_norm": 10.121296882629395, + "learning_rate": 5.333333333333334e-06, + "loss": 0.798, + "step": 401 + }, + { + "epoch": 0.005453065653825285, + "grad_norm": 11.617986679077148, + "learning_rate": 5.346666666666667e-06, + "loss": 0.6884, + "step": 402 + }, + { + "epoch": 0.0054666304937601735, + "grad_norm": 11.052796363830566, + "learning_rate": 5.36e-06, + "loss": 0.7041, + "step": 403 + }, + { + "epoch": 0.005480195333695062, + "grad_norm": 15.379217147827148, + "learning_rate": 5.373333333333334e-06, + "loss": 0.8555, + "step": 404 + }, + { + "epoch": 0.005493760173629951, + "grad_norm": 10.121376991271973, + "learning_rate": 5.386666666666667e-06, + "loss": 0.6716, + "step": 405 + }, + { + "epoch": 0.00550732501356484, + "grad_norm": 12.791173934936523, + "learning_rate": 5.400000000000001e-06, + "loss": 0.8835, + "step": 406 + }, + { + "epoch": 0.0055208898534997285, + "grad_norm": 11.828323364257812, + "learning_rate": 5.413333333333334e-06, + "loss": 0.5167, + "step": 407 + }, + { + "epoch": 0.005534454693434617, + "grad_norm": 13.594310760498047, + "learning_rate": 5.426666666666667e-06, + "loss": 0.6795, + "step": 408 + }, + { + "epoch": 0.005548019533369506, + "grad_norm": 11.322700500488281, + "learning_rate": 5.4400000000000004e-06, + "loss": 0.5424, + "step": 409 + }, + { + "epoch": 0.005561584373304395, + "grad_norm": 10.033510208129883, + "learning_rate": 5.453333333333334e-06, + "loss": 0.7761, + "step": 410 + }, + { + "epoch": 0.005575149213239284, + "grad_norm": 13.239856719970703, + "learning_rate": 5.466666666666667e-06, + "loss": 0.8716, + "step": 411 + }, + { + "epoch": 0.005588714053174172, + "grad_norm": 12.024213790893555, + "learning_rate": 5.480000000000001e-06, + "loss": 0.7086, + "step": 412 + }, + { + "epoch": 0.005602278893109061, + "grad_norm": 11.27552604675293, + "learning_rate": 5.493333333333334e-06, + "loss": 0.6538, + "step": 413 + }, + { + "epoch": 0.00561584373304395, + "grad_norm": 10.207372665405273, + "learning_rate": 5.506666666666667e-06, + "loss": 0.5706, + "step": 414 + }, + { + "epoch": 0.005629408572978839, + "grad_norm": 13.918736457824707, + "learning_rate": 5.5200000000000005e-06, + "loss": 0.8139, + "step": 415 + }, + { + "epoch": 0.005642973412913727, + "grad_norm": 13.400528907775879, + "learning_rate": 5.533333333333334e-06, + "loss": 0.6078, + "step": 416 + }, + { + "epoch": 0.005656538252848616, + "grad_norm": 12.10545825958252, + "learning_rate": 5.546666666666667e-06, + "loss": 0.7696, + "step": 417 + }, + { + "epoch": 0.005670103092783505, + "grad_norm": 11.999250411987305, + "learning_rate": 5.560000000000001e-06, + "loss": 0.737, + "step": 418 + }, + { + "epoch": 0.005683667932718394, + "grad_norm": 11.821759223937988, + "learning_rate": 5.573333333333334e-06, + "loss": 0.6133, + "step": 419 + }, + { + "epoch": 0.0056972327726532825, + "grad_norm": 15.171578407287598, + "learning_rate": 5.586666666666667e-06, + "loss": 0.8087, + "step": 420 + }, + { + "epoch": 0.005710797612588171, + "grad_norm": 12.336888313293457, + "learning_rate": 5.600000000000001e-06, + "loss": 0.6628, + "step": 421 + }, + { + "epoch": 0.00572436245252306, + "grad_norm": 14.06629467010498, + "learning_rate": 5.613333333333334e-06, + "loss": 0.7217, + "step": 422 + }, + { + "epoch": 0.005737927292457949, + "grad_norm": 14.956746101379395, + "learning_rate": 5.626666666666667e-06, + "loss": 0.7837, + "step": 423 + }, + { + "epoch": 0.0057514921323928376, + "grad_norm": 12.636491775512695, + "learning_rate": 5.64e-06, + "loss": 0.72, + "step": 424 + }, + { + "epoch": 0.005765056972327726, + "grad_norm": 11.995162963867188, + "learning_rate": 5.653333333333334e-06, + "loss": 0.6142, + "step": 425 + }, + { + "epoch": 0.005778621812262615, + "grad_norm": 10.749984741210938, + "learning_rate": 5.666666666666667e-06, + "loss": 0.5413, + "step": 426 + }, + { + "epoch": 0.005792186652197504, + "grad_norm": 15.259883880615234, + "learning_rate": 5.68e-06, + "loss": 0.9497, + "step": 427 + }, + { + "epoch": 0.005805751492132393, + "grad_norm": 12.213555335998535, + "learning_rate": 5.6933333333333344e-06, + "loss": 0.7058, + "step": 428 + }, + { + "epoch": 0.005819316332067281, + "grad_norm": 12.37903881072998, + "learning_rate": 5.706666666666667e-06, + "loss": 0.6457, + "step": 429 + }, + { + "epoch": 0.00583288117200217, + "grad_norm": 11.66431999206543, + "learning_rate": 5.72e-06, + "loss": 0.6813, + "step": 430 + }, + { + "epoch": 0.005846446011937059, + "grad_norm": 11.809267044067383, + "learning_rate": 5.733333333333334e-06, + "loss": 0.6695, + "step": 431 + }, + { + "epoch": 0.005860010851871948, + "grad_norm": 14.370896339416504, + "learning_rate": 5.746666666666667e-06, + "loss": 0.8479, + "step": 432 + }, + { + "epoch": 0.0058735756918068364, + "grad_norm": 9.834295272827148, + "learning_rate": 5.76e-06, + "loss": 0.5117, + "step": 433 + }, + { + "epoch": 0.005887140531741725, + "grad_norm": 12.521942138671875, + "learning_rate": 5.7733333333333345e-06, + "loss": 0.8482, + "step": 434 + }, + { + "epoch": 0.005900705371676614, + "grad_norm": 9.7120943069458, + "learning_rate": 5.7866666666666674e-06, + "loss": 0.6847, + "step": 435 + }, + { + "epoch": 0.005914270211611503, + "grad_norm": 11.147890090942383, + "learning_rate": 5.8e-06, + "loss": 0.7596, + "step": 436 + }, + { + "epoch": 0.0059278350515463915, + "grad_norm": 13.68081283569336, + "learning_rate": 5.813333333333334e-06, + "loss": 0.8189, + "step": 437 + }, + { + "epoch": 0.00594139989148128, + "grad_norm": 16.347137451171875, + "learning_rate": 5.826666666666667e-06, + "loss": 1.066, + "step": 438 + }, + { + "epoch": 0.005954964731416169, + "grad_norm": 11.247100830078125, + "learning_rate": 5.84e-06, + "loss": 0.7022, + "step": 439 + }, + { + "epoch": 0.005968529571351058, + "grad_norm": 11.115147590637207, + "learning_rate": 5.853333333333335e-06, + "loss": 0.7464, + "step": 440 + }, + { + "epoch": 0.005982094411285947, + "grad_norm": 10.570687294006348, + "learning_rate": 5.8666666666666675e-06, + "loss": 0.8147, + "step": 441 + }, + { + "epoch": 0.005995659251220835, + "grad_norm": 10.71081829071045, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.6254, + "step": 442 + }, + { + "epoch": 0.006009224091155724, + "grad_norm": 13.59709644317627, + "learning_rate": 5.893333333333334e-06, + "loss": 0.8346, + "step": 443 + }, + { + "epoch": 0.006022788931090613, + "grad_norm": 11.129714012145996, + "learning_rate": 5.906666666666667e-06, + "loss": 0.5993, + "step": 444 + }, + { + "epoch": 0.006036353771025502, + "grad_norm": 11.353099822998047, + "learning_rate": 5.92e-06, + "loss": 0.5759, + "step": 445 + }, + { + "epoch": 0.00604991861096039, + "grad_norm": 9.343982696533203, + "learning_rate": 5.933333333333335e-06, + "loss": 0.6831, + "step": 446 + }, + { + "epoch": 0.006063483450895279, + "grad_norm": 9.980473518371582, + "learning_rate": 5.946666666666668e-06, + "loss": 0.6248, + "step": 447 + }, + { + "epoch": 0.006077048290830168, + "grad_norm": 8.210841178894043, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.5697, + "step": 448 + }, + { + "epoch": 0.006090613130765057, + "grad_norm": 8.942750930786133, + "learning_rate": 5.973333333333334e-06, + "loss": 0.6616, + "step": 449 + }, + { + "epoch": 0.0061041779706999455, + "grad_norm": 8.67333984375, + "learning_rate": 5.986666666666667e-06, + "loss": 0.6738, + "step": 450 + }, + { + "epoch": 0.006117742810634834, + "grad_norm": 11.01435375213623, + "learning_rate": 6e-06, + "loss": 0.6131, + "step": 451 + }, + { + "epoch": 0.006131307650569723, + "grad_norm": 9.878174781799316, + "learning_rate": 6.013333333333335e-06, + "loss": 0.6119, + "step": 452 + }, + { + "epoch": 0.006144872490504612, + "grad_norm": 12.358595848083496, + "learning_rate": 6.026666666666668e-06, + "loss": 0.7188, + "step": 453 + }, + { + "epoch": 0.0061584373304395005, + "grad_norm": 9.687262535095215, + "learning_rate": 6.040000000000001e-06, + "loss": 0.6678, + "step": 454 + }, + { + "epoch": 0.006172002170374389, + "grad_norm": 9.084615707397461, + "learning_rate": 6.0533333333333335e-06, + "loss": 0.6362, + "step": 455 + }, + { + "epoch": 0.006185567010309278, + "grad_norm": 11.352370262145996, + "learning_rate": 6.066666666666667e-06, + "loss": 0.6485, + "step": 456 + }, + { + "epoch": 0.006199131850244167, + "grad_norm": 13.280981063842773, + "learning_rate": 6.08e-06, + "loss": 0.7483, + "step": 457 + }, + { + "epoch": 0.006212696690179056, + "grad_norm": 8.714627265930176, + "learning_rate": 6.093333333333333e-06, + "loss": 0.5491, + "step": 458 + }, + { + "epoch": 0.006226261530113944, + "grad_norm": 11.420221328735352, + "learning_rate": 6.106666666666668e-06, + "loss": 0.7013, + "step": 459 + }, + { + "epoch": 0.006239826370048833, + "grad_norm": 13.571288108825684, + "learning_rate": 6.120000000000001e-06, + "loss": 0.7147, + "step": 460 + }, + { + "epoch": 0.006253391209983722, + "grad_norm": 8.70838451385498, + "learning_rate": 6.133333333333334e-06, + "loss": 0.5267, + "step": 461 + }, + { + "epoch": 0.006266956049918611, + "grad_norm": 12.226025581359863, + "learning_rate": 6.146666666666667e-06, + "loss": 0.7116, + "step": 462 + }, + { + "epoch": 0.006280520889853499, + "grad_norm": 10.31334400177002, + "learning_rate": 6.16e-06, + "loss": 0.5524, + "step": 463 + }, + { + "epoch": 0.006294085729788388, + "grad_norm": 10.619245529174805, + "learning_rate": 6.173333333333333e-06, + "loss": 0.7472, + "step": 464 + }, + { + "epoch": 0.006307650569723277, + "grad_norm": 13.151286125183105, + "learning_rate": 6.186666666666668e-06, + "loss": 0.7082, + "step": 465 + }, + { + "epoch": 0.006321215409658166, + "grad_norm": 11.661477088928223, + "learning_rate": 6.200000000000001e-06, + "loss": 0.6874, + "step": 466 + }, + { + "epoch": 0.0063347802495930545, + "grad_norm": 11.141332626342773, + "learning_rate": 6.213333333333334e-06, + "loss": 0.8777, + "step": 467 + }, + { + "epoch": 0.006348345089527943, + "grad_norm": 10.642997741699219, + "learning_rate": 6.2266666666666675e-06, + "loss": 0.4727, + "step": 468 + }, + { + "epoch": 0.006361909929462832, + "grad_norm": 11.543622970581055, + "learning_rate": 6.24e-06, + "loss": 0.7775, + "step": 469 + }, + { + "epoch": 0.006375474769397721, + "grad_norm": 11.863630294799805, + "learning_rate": 6.253333333333333e-06, + "loss": 0.6577, + "step": 470 + }, + { + "epoch": 0.0063890396093326095, + "grad_norm": 9.684564590454102, + "learning_rate": 6.266666666666668e-06, + "loss": 0.7033, + "step": 471 + }, + { + "epoch": 0.006402604449267498, + "grad_norm": 12.603571891784668, + "learning_rate": 6.280000000000001e-06, + "loss": 0.796, + "step": 472 + }, + { + "epoch": 0.006416169289202387, + "grad_norm": 11.21391773223877, + "learning_rate": 6.293333333333334e-06, + "loss": 0.5704, + "step": 473 + }, + { + "epoch": 0.006429734129137276, + "grad_norm": 10.67782211303711, + "learning_rate": 6.3066666666666676e-06, + "loss": 0.5791, + "step": 474 + }, + { + "epoch": 0.006443298969072165, + "grad_norm": 13.348045349121094, + "learning_rate": 6.3200000000000005e-06, + "loss": 0.7384, + "step": 475 + }, + { + "epoch": 0.006456863809007053, + "grad_norm": 12.428071975708008, + "learning_rate": 6.333333333333333e-06, + "loss": 0.8675, + "step": 476 + }, + { + "epoch": 0.006470428648941942, + "grad_norm": 7.990952968597412, + "learning_rate": 6.346666666666668e-06, + "loss": 0.4322, + "step": 477 + }, + { + "epoch": 0.006483993488876831, + "grad_norm": 12.624375343322754, + "learning_rate": 6.360000000000001e-06, + "loss": 0.6021, + "step": 478 + }, + { + "epoch": 0.00649755832881172, + "grad_norm": 10.066609382629395, + "learning_rate": 6.373333333333334e-06, + "loss": 0.5912, + "step": 479 + }, + { + "epoch": 0.0065111231687466084, + "grad_norm": 9.639202117919922, + "learning_rate": 6.386666666666668e-06, + "loss": 0.5619, + "step": 480 + }, + { + "epoch": 0.006524688008681497, + "grad_norm": 7.523640155792236, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.442, + "step": 481 + }, + { + "epoch": 0.006538252848616386, + "grad_norm": 11.151276588439941, + "learning_rate": 6.4133333333333335e-06, + "loss": 0.648, + "step": 482 + }, + { + "epoch": 0.006551817688551275, + "grad_norm": 9.548447608947754, + "learning_rate": 6.426666666666668e-06, + "loss": 0.5231, + "step": 483 + }, + { + "epoch": 0.0065653825284861635, + "grad_norm": 10.72411823272705, + "learning_rate": 6.440000000000001e-06, + "loss": 0.6701, + "step": 484 + }, + { + "epoch": 0.006578947368421052, + "grad_norm": 9.582047462463379, + "learning_rate": 6.453333333333334e-06, + "loss": 0.4985, + "step": 485 + }, + { + "epoch": 0.006592512208355941, + "grad_norm": 9.478193283081055, + "learning_rate": 6.466666666666667e-06, + "loss": 0.4983, + "step": 486 + }, + { + "epoch": 0.00660607704829083, + "grad_norm": 11.791730880737305, + "learning_rate": 6.480000000000001e-06, + "loss": 0.609, + "step": 487 + }, + { + "epoch": 0.0066196418882257186, + "grad_norm": 10.534850120544434, + "learning_rate": 6.4933333333333336e-06, + "loss": 0.6347, + "step": 488 + }, + { + "epoch": 0.006633206728160607, + "grad_norm": 8.334867477416992, + "learning_rate": 6.5066666666666665e-06, + "loss": 0.4212, + "step": 489 + }, + { + "epoch": 0.006646771568095496, + "grad_norm": 11.54477310180664, + "learning_rate": 6.520000000000001e-06, + "loss": 0.8759, + "step": 490 + }, + { + "epoch": 0.006660336408030385, + "grad_norm": 9.76768684387207, + "learning_rate": 6.533333333333334e-06, + "loss": 0.6062, + "step": 491 + }, + { + "epoch": 0.006673901247965274, + "grad_norm": 12.092190742492676, + "learning_rate": 6.546666666666667e-06, + "loss": 0.6504, + "step": 492 + }, + { + "epoch": 0.006687466087900162, + "grad_norm": 8.18982982635498, + "learning_rate": 6.560000000000001e-06, + "loss": 0.45, + "step": 493 + }, + { + "epoch": 0.006701030927835051, + "grad_norm": 11.028427124023438, + "learning_rate": 6.573333333333334e-06, + "loss": 0.612, + "step": 494 + }, + { + "epoch": 0.00671459576776994, + "grad_norm": 9.08171558380127, + "learning_rate": 6.5866666666666666e-06, + "loss": 0.5197, + "step": 495 + }, + { + "epoch": 0.006728160607704829, + "grad_norm": 6.607532501220703, + "learning_rate": 6.600000000000001e-06, + "loss": 0.4512, + "step": 496 + }, + { + "epoch": 0.0067417254476397175, + "grad_norm": 9.948437690734863, + "learning_rate": 6.613333333333334e-06, + "loss": 0.502, + "step": 497 + }, + { + "epoch": 0.006755290287574606, + "grad_norm": 15.523694038391113, + "learning_rate": 6.626666666666667e-06, + "loss": 0.7193, + "step": 498 + }, + { + "epoch": 0.006768855127509495, + "grad_norm": 9.694650650024414, + "learning_rate": 6.640000000000001e-06, + "loss": 0.52, + "step": 499 + }, + { + "epoch": 0.006782419967444384, + "grad_norm": 8.258275032043457, + "learning_rate": 6.653333333333334e-06, + "loss": 0.5782, + "step": 500 + }, + { + "epoch": 0.0067959848073792725, + "grad_norm": 9.65252685546875, + "learning_rate": 6.666666666666667e-06, + "loss": 0.6886, + "step": 501 + }, + { + "epoch": 0.006809549647314161, + "grad_norm": 8.29489517211914, + "learning_rate": 6.680000000000001e-06, + "loss": 0.7268, + "step": 502 + }, + { + "epoch": 0.00682311448724905, + "grad_norm": 10.298418998718262, + "learning_rate": 6.693333333333334e-06, + "loss": 0.6744, + "step": 503 + }, + { + "epoch": 0.006836679327183939, + "grad_norm": 7.791603088378906, + "learning_rate": 6.706666666666667e-06, + "loss": 0.4729, + "step": 504 + }, + { + "epoch": 0.006850244167118828, + "grad_norm": 9.78139591217041, + "learning_rate": 6.720000000000001e-06, + "loss": 0.5335, + "step": 505 + }, + { + "epoch": 0.006863809007053716, + "grad_norm": 7.670722007751465, + "learning_rate": 6.733333333333334e-06, + "loss": 0.5309, + "step": 506 + }, + { + "epoch": 0.006877373846988605, + "grad_norm": 7.464871406555176, + "learning_rate": 6.746666666666667e-06, + "loss": 0.7266, + "step": 507 + }, + { + "epoch": 0.006890938686923494, + "grad_norm": 9.976213455200195, + "learning_rate": 6.760000000000001e-06, + "loss": 0.5739, + "step": 508 + }, + { + "epoch": 0.006904503526858383, + "grad_norm": 10.99843978881836, + "learning_rate": 6.773333333333334e-06, + "loss": 0.6217, + "step": 509 + }, + { + "epoch": 0.006918068366793271, + "grad_norm": 10.222423553466797, + "learning_rate": 6.786666666666667e-06, + "loss": 0.6911, + "step": 510 + }, + { + "epoch": 0.00693163320672816, + "grad_norm": 7.624888896942139, + "learning_rate": 6.800000000000001e-06, + "loss": 0.3468, + "step": 511 + }, + { + "epoch": 0.00694519804666305, + "grad_norm": 11.027809143066406, + "learning_rate": 6.813333333333334e-06, + "loss": 0.6974, + "step": 512 + }, + { + "epoch": 0.006958762886597939, + "grad_norm": 8.164054870605469, + "learning_rate": 6.826666666666667e-06, + "loss": 0.5708, + "step": 513 + }, + { + "epoch": 0.006972327726532827, + "grad_norm": 10.84119987487793, + "learning_rate": 6.8400000000000014e-06, + "loss": 0.6467, + "step": 514 + }, + { + "epoch": 0.006985892566467716, + "grad_norm": 12.688886642456055, + "learning_rate": 6.853333333333334e-06, + "loss": 0.7432, + "step": 515 + }, + { + "epoch": 0.006999457406402605, + "grad_norm": 7.462399005889893, + "learning_rate": 6.866666666666667e-06, + "loss": 0.5901, + "step": 516 + }, + { + "epoch": 0.007013022246337494, + "grad_norm": 11.623201370239258, + "learning_rate": 6.88e-06, + "loss": 0.8071, + "step": 517 + }, + { + "epoch": 0.007026587086272382, + "grad_norm": 10.131540298461914, + "learning_rate": 6.893333333333334e-06, + "loss": 0.553, + "step": 518 + }, + { + "epoch": 0.007040151926207271, + "grad_norm": 8.785533905029297, + "learning_rate": 6.906666666666667e-06, + "loss": 0.4544, + "step": 519 + }, + { + "epoch": 0.00705371676614216, + "grad_norm": 11.520009994506836, + "learning_rate": 6.92e-06, + "loss": 0.5378, + "step": 520 + }, + { + "epoch": 0.007067281606077049, + "grad_norm": 8.474822998046875, + "learning_rate": 6.9333333333333344e-06, + "loss": 0.5293, + "step": 521 + }, + { + "epoch": 0.0070808464460119375, + "grad_norm": 8.971126556396484, + "learning_rate": 6.946666666666667e-06, + "loss": 0.5318, + "step": 522 + }, + { + "epoch": 0.007094411285946826, + "grad_norm": 11.110747337341309, + "learning_rate": 6.96e-06, + "loss": 0.6494, + "step": 523 + }, + { + "epoch": 0.007107976125881715, + "grad_norm": 10.621713638305664, + "learning_rate": 6.973333333333334e-06, + "loss": 0.7393, + "step": 524 + }, + { + "epoch": 0.007121540965816604, + "grad_norm": 10.841354370117188, + "learning_rate": 6.986666666666667e-06, + "loss": 0.8147, + "step": 525 + }, + { + "epoch": 0.0071351058057514925, + "grad_norm": 11.297996520996094, + "learning_rate": 7e-06, + "loss": 0.8052, + "step": 526 + }, + { + "epoch": 0.007148670645686381, + "grad_norm": 10.42567253112793, + "learning_rate": 7.0133333333333345e-06, + "loss": 0.6171, + "step": 527 + }, + { + "epoch": 0.00716223548562127, + "grad_norm": 10.648910522460938, + "learning_rate": 7.0266666666666674e-06, + "loss": 0.769, + "step": 528 + }, + { + "epoch": 0.007175800325556159, + "grad_norm": 9.870469093322754, + "learning_rate": 7.04e-06, + "loss": 0.62, + "step": 529 + }, + { + "epoch": 0.007189365165491048, + "grad_norm": 11.896495819091797, + "learning_rate": 7.053333333333334e-06, + "loss": 0.7978, + "step": 530 + }, + { + "epoch": 0.007202930005425936, + "grad_norm": 7.937852382659912, + "learning_rate": 7.066666666666667e-06, + "loss": 0.6155, + "step": 531 + }, + { + "epoch": 0.007216494845360825, + "grad_norm": 10.417679786682129, + "learning_rate": 7.08e-06, + "loss": 0.6908, + "step": 532 + }, + { + "epoch": 0.007230059685295714, + "grad_norm": 7.592164993286133, + "learning_rate": 7.093333333333335e-06, + "loss": 0.6202, + "step": 533 + }, + { + "epoch": 0.007243624525230603, + "grad_norm": 8.694833755493164, + "learning_rate": 7.1066666666666675e-06, + "loss": 0.5173, + "step": 534 + }, + { + "epoch": 0.007257189365165491, + "grad_norm": 8.44509220123291, + "learning_rate": 7.1200000000000004e-06, + "loss": 0.5991, + "step": 535 + }, + { + "epoch": 0.00727075420510038, + "grad_norm": 12.5964937210083, + "learning_rate": 7.133333333333334e-06, + "loss": 0.7781, + "step": 536 + }, + { + "epoch": 0.007284319045035269, + "grad_norm": 9.2064847946167, + "learning_rate": 7.146666666666667e-06, + "loss": 0.5625, + "step": 537 + }, + { + "epoch": 0.007297883884970158, + "grad_norm": 10.25497817993164, + "learning_rate": 7.16e-06, + "loss": 0.5394, + "step": 538 + }, + { + "epoch": 0.0073114487249050465, + "grad_norm": 11.217278480529785, + "learning_rate": 7.173333333333335e-06, + "loss": 0.5819, + "step": 539 + }, + { + "epoch": 0.007325013564839935, + "grad_norm": 11.870972633361816, + "learning_rate": 7.186666666666668e-06, + "loss": 0.6063, + "step": 540 + }, + { + "epoch": 0.007338578404774824, + "grad_norm": 8.924481391906738, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.5024, + "step": 541 + }, + { + "epoch": 0.007352143244709713, + "grad_norm": 11.409536361694336, + "learning_rate": 7.213333333333334e-06, + "loss": 0.8066, + "step": 542 + }, + { + "epoch": 0.0073657080846446015, + "grad_norm": 9.51655101776123, + "learning_rate": 7.226666666666667e-06, + "loss": 0.5438, + "step": 543 + }, + { + "epoch": 0.00737927292457949, + "grad_norm": 10.18490219116211, + "learning_rate": 7.24e-06, + "loss": 0.6093, + "step": 544 + }, + { + "epoch": 0.007392837764514379, + "grad_norm": 10.605599403381348, + "learning_rate": 7.253333333333335e-06, + "loss": 0.6458, + "step": 545 + }, + { + "epoch": 0.007406402604449268, + "grad_norm": 10.196928977966309, + "learning_rate": 7.266666666666668e-06, + "loss": 0.7029, + "step": 546 + }, + { + "epoch": 0.007419967444384157, + "grad_norm": 13.161282539367676, + "learning_rate": 7.280000000000001e-06, + "loss": 0.7401, + "step": 547 + }, + { + "epoch": 0.007433532284319045, + "grad_norm": 13.102421760559082, + "learning_rate": 7.2933333333333335e-06, + "loss": 0.6367, + "step": 548 + }, + { + "epoch": 0.007447097124253934, + "grad_norm": 9.36795711517334, + "learning_rate": 7.306666666666667e-06, + "loss": 0.6751, + "step": 549 + }, + { + "epoch": 0.007460661964188823, + "grad_norm": 13.018122673034668, + "learning_rate": 7.32e-06, + "loss": 0.517, + "step": 550 + }, + { + "epoch": 0.007474226804123712, + "grad_norm": 10.020010948181152, + "learning_rate": 7.333333333333333e-06, + "loss": 0.5724, + "step": 551 + }, + { + "epoch": 0.0074877916440586004, + "grad_norm": 15.177641868591309, + "learning_rate": 7.346666666666668e-06, + "loss": 0.7644, + "step": 552 + }, + { + "epoch": 0.007501356483993489, + "grad_norm": 10.720725059509277, + "learning_rate": 7.360000000000001e-06, + "loss": 0.7464, + "step": 553 + }, + { + "epoch": 0.007514921323928378, + "grad_norm": 14.4567289352417, + "learning_rate": 7.373333333333334e-06, + "loss": 0.8332, + "step": 554 + }, + { + "epoch": 0.007528486163863267, + "grad_norm": 10.411738395690918, + "learning_rate": 7.386666666666667e-06, + "loss": 0.7165, + "step": 555 + }, + { + "epoch": 0.0075420510037981555, + "grad_norm": 12.50633716583252, + "learning_rate": 7.4e-06, + "loss": 0.7115, + "step": 556 + }, + { + "epoch": 0.007555615843733044, + "grad_norm": 9.959630012512207, + "learning_rate": 7.413333333333333e-06, + "loss": 0.5673, + "step": 557 + }, + { + "epoch": 0.007569180683667933, + "grad_norm": 10.122492790222168, + "learning_rate": 7.426666666666668e-06, + "loss": 0.5963, + "step": 558 + }, + { + "epoch": 0.007582745523602822, + "grad_norm": 12.113180160522461, + "learning_rate": 7.440000000000001e-06, + "loss": 0.674, + "step": 559 + }, + { + "epoch": 0.007596310363537711, + "grad_norm": 12.241214752197266, + "learning_rate": 7.453333333333334e-06, + "loss": 0.6173, + "step": 560 + }, + { + "epoch": 0.007609875203472599, + "grad_norm": 12.514921188354492, + "learning_rate": 7.4666666666666675e-06, + "loss": 0.7682, + "step": 561 + }, + { + "epoch": 0.007623440043407488, + "grad_norm": 8.347978591918945, + "learning_rate": 7.48e-06, + "loss": 0.4651, + "step": 562 + }, + { + "epoch": 0.007637004883342377, + "grad_norm": 8.208953857421875, + "learning_rate": 7.493333333333333e-06, + "loss": 0.4366, + "step": 563 + }, + { + "epoch": 0.007650569723277266, + "grad_norm": 9.992890357971191, + "learning_rate": 7.506666666666668e-06, + "loss": 0.4946, + "step": 564 + }, + { + "epoch": 0.007664134563212154, + "grad_norm": 7.846614837646484, + "learning_rate": 7.520000000000001e-06, + "loss": 0.6052, + "step": 565 + }, + { + "epoch": 0.007677699403147043, + "grad_norm": 11.421736717224121, + "learning_rate": 7.533333333333334e-06, + "loss": 0.5692, + "step": 566 + }, + { + "epoch": 0.007691264243081932, + "grad_norm": 9.9599027633667, + "learning_rate": 7.5466666666666675e-06, + "loss": 0.6661, + "step": 567 + }, + { + "epoch": 0.007704829083016821, + "grad_norm": 8.372870445251465, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.6075, + "step": 568 + }, + { + "epoch": 0.0077183939229517095, + "grad_norm": 8.329596519470215, + "learning_rate": 7.573333333333333e-06, + "loss": 0.5444, + "step": 569 + }, + { + "epoch": 0.007731958762886598, + "grad_norm": 12.319808006286621, + "learning_rate": 7.586666666666668e-06, + "loss": 0.7719, + "step": 570 + }, + { + "epoch": 0.007745523602821487, + "grad_norm": 11.747087478637695, + "learning_rate": 7.600000000000001e-06, + "loss": 0.6858, + "step": 571 + }, + { + "epoch": 0.007759088442756376, + "grad_norm": 10.699450492858887, + "learning_rate": 7.613333333333334e-06, + "loss": 0.5836, + "step": 572 + }, + { + "epoch": 0.0077726532826912645, + "grad_norm": 11.650968551635742, + "learning_rate": 7.626666666666668e-06, + "loss": 0.5728, + "step": 573 + }, + { + "epoch": 0.007786218122626153, + "grad_norm": 11.597538948059082, + "learning_rate": 7.640000000000001e-06, + "loss": 0.6785, + "step": 574 + }, + { + "epoch": 0.007799782962561042, + "grad_norm": 11.702094078063965, + "learning_rate": 7.653333333333333e-06, + "loss": 0.7217, + "step": 575 + }, + { + "epoch": 0.00781334780249593, + "grad_norm": 10.184535026550293, + "learning_rate": 7.666666666666667e-06, + "loss": 0.6237, + "step": 576 + }, + { + "epoch": 0.007826912642430819, + "grad_norm": 9.319794654846191, + "learning_rate": 7.680000000000001e-06, + "loss": 0.5526, + "step": 577 + }, + { + "epoch": 0.007840477482365708, + "grad_norm": 11.095110893249512, + "learning_rate": 7.693333333333333e-06, + "loss": 0.5857, + "step": 578 + }, + { + "epoch": 0.007854042322300596, + "grad_norm": 11.33773136138916, + "learning_rate": 7.706666666666669e-06, + "loss": 0.6285, + "step": 579 + }, + { + "epoch": 0.007867607162235486, + "grad_norm": 6.315740585327148, + "learning_rate": 7.72e-06, + "loss": 0.3724, + "step": 580 + }, + { + "epoch": 0.007881172002170374, + "grad_norm": 8.319339752197266, + "learning_rate": 7.733333333333334e-06, + "loss": 0.5526, + "step": 581 + }, + { + "epoch": 0.007894736842105263, + "grad_norm": 7.641243934631348, + "learning_rate": 7.746666666666666e-06, + "loss": 0.5683, + "step": 582 + }, + { + "epoch": 0.007908301682040151, + "grad_norm": 10.190960884094238, + "learning_rate": 7.76e-06, + "loss": 0.7027, + "step": 583 + }, + { + "epoch": 0.007921866521975041, + "grad_norm": 8.367647171020508, + "learning_rate": 7.773333333333334e-06, + "loss": 0.4738, + "step": 584 + }, + { + "epoch": 0.007935431361909929, + "grad_norm": 11.889660835266113, + "learning_rate": 7.786666666666666e-06, + "loss": 0.7248, + "step": 585 + }, + { + "epoch": 0.007948996201844818, + "grad_norm": 11.771404266357422, + "learning_rate": 7.800000000000002e-06, + "loss": 0.9454, + "step": 586 + }, + { + "epoch": 0.007962561041779706, + "grad_norm": 9.058320045471191, + "learning_rate": 7.813333333333334e-06, + "loss": 0.61, + "step": 587 + }, + { + "epoch": 0.007976125881714596, + "grad_norm": 10.473182678222656, + "learning_rate": 7.826666666666667e-06, + "loss": 0.7075, + "step": 588 + }, + { + "epoch": 0.007989690721649484, + "grad_norm": 10.911951065063477, + "learning_rate": 7.840000000000001e-06, + "loss": 0.5541, + "step": 589 + }, + { + "epoch": 0.008003255561584374, + "grad_norm": 11.517815589904785, + "learning_rate": 7.853333333333333e-06, + "loss": 0.6272, + "step": 590 + }, + { + "epoch": 0.008016820401519261, + "grad_norm": 9.43856143951416, + "learning_rate": 7.866666666666667e-06, + "loss": 0.6236, + "step": 591 + }, + { + "epoch": 0.008030385241454151, + "grad_norm": 9.947659492492676, + "learning_rate": 7.88e-06, + "loss": 0.6531, + "step": 592 + }, + { + "epoch": 0.008043950081389039, + "grad_norm": 9.901813507080078, + "learning_rate": 7.893333333333335e-06, + "loss": 0.5937, + "step": 593 + }, + { + "epoch": 0.008057514921323929, + "grad_norm": 13.160162925720215, + "learning_rate": 7.906666666666667e-06, + "loss": 0.6632, + "step": 594 + }, + { + "epoch": 0.008071079761258817, + "grad_norm": 10.932674407958984, + "learning_rate": 7.92e-06, + "loss": 0.6538, + "step": 595 + }, + { + "epoch": 0.008084644601193706, + "grad_norm": 8.634129524230957, + "learning_rate": 7.933333333333334e-06, + "loss": 0.5409, + "step": 596 + }, + { + "epoch": 0.008098209441128594, + "grad_norm": 9.318254470825195, + "learning_rate": 7.946666666666666e-06, + "loss": 0.762, + "step": 597 + }, + { + "epoch": 0.008111774281063484, + "grad_norm": 9.000068664550781, + "learning_rate": 7.960000000000002e-06, + "loss": 0.5916, + "step": 598 + }, + { + "epoch": 0.008125339120998372, + "grad_norm": 11.350335121154785, + "learning_rate": 7.973333333333334e-06, + "loss": 0.7255, + "step": 599 + }, + { + "epoch": 0.008138903960933261, + "grad_norm": 10.49691104888916, + "learning_rate": 7.986666666666668e-06, + "loss": 0.5856, + "step": 600 + }, + { + "epoch": 0.008152468800868149, + "grad_norm": 9.354360580444336, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5959, + "step": 601 + }, + { + "epoch": 0.008166033640803039, + "grad_norm": 9.217838287353516, + "learning_rate": 8.013333333333333e-06, + "loss": 0.6277, + "step": 602 + }, + { + "epoch": 0.008179598480737927, + "grad_norm": 10.236881256103516, + "learning_rate": 8.026666666666667e-06, + "loss": 0.6185, + "step": 603 + }, + { + "epoch": 0.008193163320672816, + "grad_norm": 12.117815017700195, + "learning_rate": 8.040000000000001e-06, + "loss": 0.9136, + "step": 604 + }, + { + "epoch": 0.008206728160607704, + "grad_norm": 9.767302513122559, + "learning_rate": 8.053333333333335e-06, + "loss": 0.5675, + "step": 605 + }, + { + "epoch": 0.008220293000542594, + "grad_norm": 8.892134666442871, + "learning_rate": 8.066666666666667e-06, + "loss": 0.5169, + "step": 606 + }, + { + "epoch": 0.008233857840477482, + "grad_norm": 15.49610424041748, + "learning_rate": 8.08e-06, + "loss": 1.007, + "step": 607 + }, + { + "epoch": 0.008247422680412371, + "grad_norm": 8.730998039245605, + "learning_rate": 8.093333333333334e-06, + "loss": 0.7117, + "step": 608 + }, + { + "epoch": 0.00826098752034726, + "grad_norm": 8.31118392944336, + "learning_rate": 8.106666666666666e-06, + "loss": 0.4896, + "step": 609 + }, + { + "epoch": 0.008274552360282149, + "grad_norm": 10.785163879394531, + "learning_rate": 8.120000000000002e-06, + "loss": 0.6767, + "step": 610 + }, + { + "epoch": 0.008288117200217037, + "grad_norm": 11.21778392791748, + "learning_rate": 8.133333333333334e-06, + "loss": 0.7566, + "step": 611 + }, + { + "epoch": 0.008301682040151926, + "grad_norm": 9.55349349975586, + "learning_rate": 8.146666666666668e-06, + "loss": 0.5732, + "step": 612 + }, + { + "epoch": 0.008315246880086814, + "grad_norm": 9.15683364868164, + "learning_rate": 8.16e-06, + "loss": 0.6867, + "step": 613 + }, + { + "epoch": 0.008328811720021704, + "grad_norm": 10.455933570861816, + "learning_rate": 8.173333333333334e-06, + "loss": 0.609, + "step": 614 + }, + { + "epoch": 0.008342376559956592, + "grad_norm": 10.929725646972656, + "learning_rate": 8.186666666666667e-06, + "loss": 0.8329, + "step": 615 + }, + { + "epoch": 0.008355941399891481, + "grad_norm": 9.305832862854004, + "learning_rate": 8.2e-06, + "loss": 0.6523, + "step": 616 + }, + { + "epoch": 0.00836950623982637, + "grad_norm": 9.64157485961914, + "learning_rate": 8.213333333333335e-06, + "loss": 0.6706, + "step": 617 + }, + { + "epoch": 0.008383071079761259, + "grad_norm": 10.93943977355957, + "learning_rate": 8.226666666666667e-06, + "loss": 0.6378, + "step": 618 + }, + { + "epoch": 0.008396635919696147, + "grad_norm": 9.724624633789062, + "learning_rate": 8.24e-06, + "loss": 0.4588, + "step": 619 + }, + { + "epoch": 0.008410200759631037, + "grad_norm": 7.080091953277588, + "learning_rate": 8.253333333333334e-06, + "loss": 0.4993, + "step": 620 + }, + { + "epoch": 0.008423765599565924, + "grad_norm": 9.397831916809082, + "learning_rate": 8.266666666666667e-06, + "loss": 0.5137, + "step": 621 + }, + { + "epoch": 0.008437330439500814, + "grad_norm": 7.688535690307617, + "learning_rate": 8.28e-06, + "loss": 0.4444, + "step": 622 + }, + { + "epoch": 0.008450895279435702, + "grad_norm": 7.801669120788574, + "learning_rate": 8.293333333333334e-06, + "loss": 0.3718, + "step": 623 + }, + { + "epoch": 0.008464460119370592, + "grad_norm": 9.547070503234863, + "learning_rate": 8.306666666666668e-06, + "loss": 0.476, + "step": 624 + }, + { + "epoch": 0.00847802495930548, + "grad_norm": 12.3087797164917, + "learning_rate": 8.32e-06, + "loss": 0.8642, + "step": 625 + }, + { + "epoch": 0.008491589799240369, + "grad_norm": 9.523991584777832, + "learning_rate": 8.333333333333334e-06, + "loss": 0.7142, + "step": 626 + }, + { + "epoch": 0.008505154639175257, + "grad_norm": 9.142528533935547, + "learning_rate": 8.346666666666668e-06, + "loss": 0.6267, + "step": 627 + }, + { + "epoch": 0.008518719479110147, + "grad_norm": 7.940373420715332, + "learning_rate": 8.36e-06, + "loss": 0.5682, + "step": 628 + }, + { + "epoch": 0.008532284319045035, + "grad_norm": 9.801634788513184, + "learning_rate": 8.373333333333335e-06, + "loss": 0.7428, + "step": 629 + }, + { + "epoch": 0.008545849158979924, + "grad_norm": 6.95482063293457, + "learning_rate": 8.386666666666667e-06, + "loss": 0.5648, + "step": 630 + }, + { + "epoch": 0.008559413998914812, + "grad_norm": 11.808462142944336, + "learning_rate": 8.400000000000001e-06, + "loss": 0.5153, + "step": 631 + }, + { + "epoch": 0.008572978838849702, + "grad_norm": 13.762616157531738, + "learning_rate": 8.413333333333335e-06, + "loss": 0.7114, + "step": 632 + }, + { + "epoch": 0.00858654367878459, + "grad_norm": 10.122282028198242, + "learning_rate": 8.426666666666667e-06, + "loss": 0.6514, + "step": 633 + }, + { + "epoch": 0.00860010851871948, + "grad_norm": 7.477573394775391, + "learning_rate": 8.44e-06, + "loss": 0.3302, + "step": 634 + }, + { + "epoch": 0.008613673358654367, + "grad_norm": 9.370348930358887, + "learning_rate": 8.453333333333334e-06, + "loss": 0.6143, + "step": 635 + }, + { + "epoch": 0.008627238198589257, + "grad_norm": 8.723119735717773, + "learning_rate": 8.466666666666668e-06, + "loss": 0.5394, + "step": 636 + }, + { + "epoch": 0.008640803038524145, + "grad_norm": 8.746380805969238, + "learning_rate": 8.48e-06, + "loss": 0.5191, + "step": 637 + }, + { + "epoch": 0.008654367878459034, + "grad_norm": 10.508950233459473, + "learning_rate": 8.493333333333334e-06, + "loss": 0.7593, + "step": 638 + }, + { + "epoch": 0.008667932718393922, + "grad_norm": 8.828998565673828, + "learning_rate": 8.506666666666668e-06, + "loss": 0.5975, + "step": 639 + }, + { + "epoch": 0.008681497558328812, + "grad_norm": 9.420825004577637, + "learning_rate": 8.52e-06, + "loss": 0.7127, + "step": 640 + }, + { + "epoch": 0.0086950623982637, + "grad_norm": 8.654979705810547, + "learning_rate": 8.533333333333335e-06, + "loss": 0.4494, + "step": 641 + }, + { + "epoch": 0.00870862723819859, + "grad_norm": 8.76844596862793, + "learning_rate": 8.546666666666667e-06, + "loss": 0.5596, + "step": 642 + }, + { + "epoch": 0.008722192078133477, + "grad_norm": 11.083855628967285, + "learning_rate": 8.560000000000001e-06, + "loss": 0.6336, + "step": 643 + }, + { + "epoch": 0.008735756918068367, + "grad_norm": 7.925288200378418, + "learning_rate": 8.573333333333333e-06, + "loss": 0.5497, + "step": 644 + }, + { + "epoch": 0.008749321758003255, + "grad_norm": 9.544169425964355, + "learning_rate": 8.586666666666667e-06, + "loss": 0.5923, + "step": 645 + }, + { + "epoch": 0.008762886597938144, + "grad_norm": 10.407221794128418, + "learning_rate": 8.6e-06, + "loss": 0.7394, + "step": 646 + }, + { + "epoch": 0.008776451437873032, + "grad_norm": 9.7598295211792, + "learning_rate": 8.613333333333333e-06, + "loss": 0.5784, + "step": 647 + }, + { + "epoch": 0.008790016277807922, + "grad_norm": 9.06484317779541, + "learning_rate": 8.626666666666668e-06, + "loss": 0.6591, + "step": 648 + }, + { + "epoch": 0.00880358111774281, + "grad_norm": 8.386137008666992, + "learning_rate": 8.64e-06, + "loss": 0.5355, + "step": 649 + }, + { + "epoch": 0.0088171459576777, + "grad_norm": 8.436918258666992, + "learning_rate": 8.653333333333334e-06, + "loss": 0.6724, + "step": 650 + }, + { + "epoch": 0.008830710797612587, + "grad_norm": 12.046189308166504, + "learning_rate": 8.666666666666668e-06, + "loss": 0.6261, + "step": 651 + }, + { + "epoch": 0.008844275637547477, + "grad_norm": 7.6548566818237305, + "learning_rate": 8.68e-06, + "loss": 0.5628, + "step": 652 + }, + { + "epoch": 0.008857840477482365, + "grad_norm": 9.82762622833252, + "learning_rate": 8.693333333333334e-06, + "loss": 0.45, + "step": 653 + }, + { + "epoch": 0.008871405317417255, + "grad_norm": 9.447340965270996, + "learning_rate": 8.706666666666667e-06, + "loss": 0.4576, + "step": 654 + }, + { + "epoch": 0.008884970157352142, + "grad_norm": 12.754944801330566, + "learning_rate": 8.720000000000001e-06, + "loss": 0.6145, + "step": 655 + }, + { + "epoch": 0.008898534997287032, + "grad_norm": 6.5470123291015625, + "learning_rate": 8.733333333333333e-06, + "loss": 0.4946, + "step": 656 + }, + { + "epoch": 0.00891209983722192, + "grad_norm": 9.364029884338379, + "learning_rate": 8.746666666666667e-06, + "loss": 0.6012, + "step": 657 + }, + { + "epoch": 0.00892566467715681, + "grad_norm": 8.924337387084961, + "learning_rate": 8.76e-06, + "loss": 0.5053, + "step": 658 + }, + { + "epoch": 0.008939229517091698, + "grad_norm": 8.547151565551758, + "learning_rate": 8.773333333333333e-06, + "loss": 0.4407, + "step": 659 + }, + { + "epoch": 0.008952794357026587, + "grad_norm": 10.902571678161621, + "learning_rate": 8.786666666666668e-06, + "loss": 0.5922, + "step": 660 + }, + { + "epoch": 0.008966359196961475, + "grad_norm": 8.33765697479248, + "learning_rate": 8.8e-06, + "loss": 0.5322, + "step": 661 + }, + { + "epoch": 0.008979924036896365, + "grad_norm": 11.391901969909668, + "learning_rate": 8.813333333333334e-06, + "loss": 0.8411, + "step": 662 + }, + { + "epoch": 0.008993488876831253, + "grad_norm": 9.707891464233398, + "learning_rate": 8.826666666666668e-06, + "loss": 0.6167, + "step": 663 + }, + { + "epoch": 0.009007053716766142, + "grad_norm": 8.401199340820312, + "learning_rate": 8.84e-06, + "loss": 0.5928, + "step": 664 + }, + { + "epoch": 0.00902061855670103, + "grad_norm": 8.405213356018066, + "learning_rate": 8.853333333333334e-06, + "loss": 0.5469, + "step": 665 + }, + { + "epoch": 0.00903418339663592, + "grad_norm": 8.465655326843262, + "learning_rate": 8.866666666666668e-06, + "loss": 0.5155, + "step": 666 + }, + { + "epoch": 0.009047748236570808, + "grad_norm": 10.398386001586914, + "learning_rate": 8.880000000000001e-06, + "loss": 0.6042, + "step": 667 + }, + { + "epoch": 0.009061313076505697, + "grad_norm": 9.066248893737793, + "learning_rate": 8.893333333333333e-06, + "loss": 0.5676, + "step": 668 + }, + { + "epoch": 0.009074877916440585, + "grad_norm": 9.53197956085205, + "learning_rate": 8.906666666666667e-06, + "loss": 0.4447, + "step": 669 + }, + { + "epoch": 0.009088442756375475, + "grad_norm": 10.705399513244629, + "learning_rate": 8.920000000000001e-06, + "loss": 0.5066, + "step": 670 + }, + { + "epoch": 0.009102007596310363, + "grad_norm": 12.355603218078613, + "learning_rate": 8.933333333333333e-06, + "loss": 0.6869, + "step": 671 + }, + { + "epoch": 0.009115572436245252, + "grad_norm": 7.856970310211182, + "learning_rate": 8.946666666666669e-06, + "loss": 0.493, + "step": 672 + }, + { + "epoch": 0.00912913727618014, + "grad_norm": 6.55540657043457, + "learning_rate": 8.96e-06, + "loss": 0.4011, + "step": 673 + }, + { + "epoch": 0.00914270211611503, + "grad_norm": 8.944196701049805, + "learning_rate": 8.973333333333334e-06, + "loss": 0.4917, + "step": 674 + }, + { + "epoch": 0.009156266956049918, + "grad_norm": 7.377128601074219, + "learning_rate": 8.986666666666666e-06, + "loss": 0.4674, + "step": 675 + }, + { + "epoch": 0.009169831795984807, + "grad_norm": 9.11181640625, + "learning_rate": 9e-06, + "loss": 0.5211, + "step": 676 + }, + { + "epoch": 0.009183396635919695, + "grad_norm": 7.632635116577148, + "learning_rate": 9.013333333333334e-06, + "loss": 0.5071, + "step": 677 + }, + { + "epoch": 0.009196961475854585, + "grad_norm": 14.62473201751709, + "learning_rate": 9.026666666666666e-06, + "loss": 0.6203, + "step": 678 + }, + { + "epoch": 0.009210526315789473, + "grad_norm": 11.759766578674316, + "learning_rate": 9.040000000000002e-06, + "loss": 0.7579, + "step": 679 + }, + { + "epoch": 0.009224091155724362, + "grad_norm": 8.65388298034668, + "learning_rate": 9.053333333333334e-06, + "loss": 0.5647, + "step": 680 + }, + { + "epoch": 0.00923765599565925, + "grad_norm": 7.002622604370117, + "learning_rate": 9.066666666666667e-06, + "loss": 0.4434, + "step": 681 + }, + { + "epoch": 0.00925122083559414, + "grad_norm": 9.494488716125488, + "learning_rate": 9.080000000000001e-06, + "loss": 0.588, + "step": 682 + }, + { + "epoch": 0.00926478567552903, + "grad_norm": 8.501388549804688, + "learning_rate": 9.093333333333333e-06, + "loss": 0.4625, + "step": 683 + }, + { + "epoch": 0.009278350515463918, + "grad_norm": 7.959138870239258, + "learning_rate": 9.106666666666667e-06, + "loss": 0.5093, + "step": 684 + }, + { + "epoch": 0.009291915355398807, + "grad_norm": 11.173908233642578, + "learning_rate": 9.12e-06, + "loss": 0.6851, + "step": 685 + }, + { + "epoch": 0.009305480195333695, + "grad_norm": 12.43563175201416, + "learning_rate": 9.133333333333335e-06, + "loss": 0.6863, + "step": 686 + }, + { + "epoch": 0.009319045035268585, + "grad_norm": 6.413601398468018, + "learning_rate": 9.146666666666667e-06, + "loss": 0.514, + "step": 687 + }, + { + "epoch": 0.009332609875203473, + "grad_norm": 10.597804069519043, + "learning_rate": 9.16e-06, + "loss": 0.7568, + "step": 688 + }, + { + "epoch": 0.009346174715138362, + "grad_norm": 8.798778533935547, + "learning_rate": 9.173333333333334e-06, + "loss": 0.5934, + "step": 689 + }, + { + "epoch": 0.00935973955507325, + "grad_norm": 9.251697540283203, + "learning_rate": 9.186666666666666e-06, + "loss": 0.6531, + "step": 690 + }, + { + "epoch": 0.00937330439500814, + "grad_norm": 10.191509246826172, + "learning_rate": 9.200000000000002e-06, + "loss": 0.6993, + "step": 691 + }, + { + "epoch": 0.009386869234943028, + "grad_norm": 11.665064811706543, + "learning_rate": 9.213333333333334e-06, + "loss": 0.6526, + "step": 692 + }, + { + "epoch": 0.009400434074877917, + "grad_norm": 7.7081756591796875, + "learning_rate": 9.226666666666668e-06, + "loss": 0.5387, + "step": 693 + }, + { + "epoch": 0.009413998914812805, + "grad_norm": 11.203886985778809, + "learning_rate": 9.240000000000001e-06, + "loss": 0.7149, + "step": 694 + }, + { + "epoch": 0.009427563754747695, + "grad_norm": 9.002643585205078, + "learning_rate": 9.253333333333333e-06, + "loss": 0.6989, + "step": 695 + }, + { + "epoch": 0.009441128594682583, + "grad_norm": 9.278099060058594, + "learning_rate": 9.266666666666667e-06, + "loss": 0.5792, + "step": 696 + }, + { + "epoch": 0.009454693434617472, + "grad_norm": 11.201467514038086, + "learning_rate": 9.280000000000001e-06, + "loss": 0.5509, + "step": 697 + }, + { + "epoch": 0.00946825827455236, + "grad_norm": 7.979068756103516, + "learning_rate": 9.293333333333335e-06, + "loss": 0.5305, + "step": 698 + }, + { + "epoch": 0.00948182311448725, + "grad_norm": 9.39223861694336, + "learning_rate": 9.306666666666667e-06, + "loss": 0.6056, + "step": 699 + }, + { + "epoch": 0.009495387954422138, + "grad_norm": 7.26355504989624, + "learning_rate": 9.32e-06, + "loss": 0.4383, + "step": 700 + }, + { + "epoch": 0.009508952794357027, + "grad_norm": 10.581366539001465, + "learning_rate": 9.333333333333334e-06, + "loss": 0.6815, + "step": 701 + }, + { + "epoch": 0.009522517634291915, + "grad_norm": 12.119690895080566, + "learning_rate": 9.346666666666666e-06, + "loss": 0.8623, + "step": 702 + }, + { + "epoch": 0.009536082474226805, + "grad_norm": 10.759542465209961, + "learning_rate": 9.360000000000002e-06, + "loss": 0.6334, + "step": 703 + }, + { + "epoch": 0.009549647314161693, + "grad_norm": 8.246220588684082, + "learning_rate": 9.373333333333334e-06, + "loss": 0.6312, + "step": 704 + }, + { + "epoch": 0.009563212154096582, + "grad_norm": 8.293639183044434, + "learning_rate": 9.386666666666668e-06, + "loss": 0.3775, + "step": 705 + }, + { + "epoch": 0.00957677699403147, + "grad_norm": 8.565048217773438, + "learning_rate": 9.4e-06, + "loss": 0.5917, + "step": 706 + }, + { + "epoch": 0.00959034183396636, + "grad_norm": 8.329639434814453, + "learning_rate": 9.413333333333334e-06, + "loss": 0.5447, + "step": 707 + }, + { + "epoch": 0.009603906673901248, + "grad_norm": 11.000186920166016, + "learning_rate": 9.426666666666667e-06, + "loss": 0.5511, + "step": 708 + }, + { + "epoch": 0.009617471513836138, + "grad_norm": 7.278872966766357, + "learning_rate": 9.440000000000001e-06, + "loss": 0.4574, + "step": 709 + }, + { + "epoch": 0.009631036353771025, + "grad_norm": 10.6577730178833, + "learning_rate": 9.453333333333335e-06, + "loss": 0.5333, + "step": 710 + }, + { + "epoch": 0.009644601193705915, + "grad_norm": 7.382857799530029, + "learning_rate": 9.466666666666667e-06, + "loss": 0.6323, + "step": 711 + }, + { + "epoch": 0.009658166033640803, + "grad_norm": 7.401951789855957, + "learning_rate": 9.48e-06, + "loss": 0.5206, + "step": 712 + }, + { + "epoch": 0.009671730873575693, + "grad_norm": 10.068018913269043, + "learning_rate": 9.493333333333334e-06, + "loss": 0.6942, + "step": 713 + }, + { + "epoch": 0.00968529571351058, + "grad_norm": 9.770787239074707, + "learning_rate": 9.506666666666667e-06, + "loss": 0.6133, + "step": 714 + }, + { + "epoch": 0.00969886055344547, + "grad_norm": 7.985218048095703, + "learning_rate": 9.52e-06, + "loss": 0.7463, + "step": 715 + }, + { + "epoch": 0.009712425393380358, + "grad_norm": 12.026476860046387, + "learning_rate": 9.533333333333334e-06, + "loss": 0.8637, + "step": 716 + }, + { + "epoch": 0.009725990233315248, + "grad_norm": 8.539562225341797, + "learning_rate": 9.546666666666668e-06, + "loss": 0.6806, + "step": 717 + }, + { + "epoch": 0.009739555073250136, + "grad_norm": 11.429290771484375, + "learning_rate": 9.56e-06, + "loss": 0.7451, + "step": 718 + }, + { + "epoch": 0.009753119913185025, + "grad_norm": 10.24174976348877, + "learning_rate": 9.573333333333334e-06, + "loss": 0.5564, + "step": 719 + }, + { + "epoch": 0.009766684753119913, + "grad_norm": 9.592765808105469, + "learning_rate": 9.586666666666667e-06, + "loss": 0.8731, + "step": 720 + }, + { + "epoch": 0.009780249593054803, + "grad_norm": 9.592291831970215, + "learning_rate": 9.600000000000001e-06, + "loss": 0.4948, + "step": 721 + }, + { + "epoch": 0.00979381443298969, + "grad_norm": 8.600624084472656, + "learning_rate": 9.613333333333335e-06, + "loss": 0.6263, + "step": 722 + }, + { + "epoch": 0.00980737927292458, + "grad_norm": 9.881817817687988, + "learning_rate": 9.626666666666667e-06, + "loss": 0.5483, + "step": 723 + }, + { + "epoch": 0.009820944112859468, + "grad_norm": 10.187382698059082, + "learning_rate": 9.640000000000001e-06, + "loss": 0.5825, + "step": 724 + }, + { + "epoch": 0.009834508952794358, + "grad_norm": 7.736274242401123, + "learning_rate": 9.653333333333335e-06, + "loss": 0.4625, + "step": 725 + }, + { + "epoch": 0.009848073792729246, + "grad_norm": 7.640440940856934, + "learning_rate": 9.666666666666667e-06, + "loss": 0.509, + "step": 726 + }, + { + "epoch": 0.009861638632664135, + "grad_norm": 10.502938270568848, + "learning_rate": 9.68e-06, + "loss": 0.76, + "step": 727 + }, + { + "epoch": 0.009875203472599023, + "grad_norm": 7.112846374511719, + "learning_rate": 9.693333333333334e-06, + "loss": 0.4725, + "step": 728 + }, + { + "epoch": 0.009888768312533913, + "grad_norm": 12.757698059082031, + "learning_rate": 9.706666666666668e-06, + "loss": 0.7601, + "step": 729 + }, + { + "epoch": 0.0099023331524688, + "grad_norm": 9.661845207214355, + "learning_rate": 9.72e-06, + "loss": 0.7257, + "step": 730 + }, + { + "epoch": 0.00991589799240369, + "grad_norm": 9.214828491210938, + "learning_rate": 9.733333333333334e-06, + "loss": 0.5488, + "step": 731 + }, + { + "epoch": 0.009929462832338578, + "grad_norm": 8.286022186279297, + "learning_rate": 9.746666666666668e-06, + "loss": 0.4313, + "step": 732 + }, + { + "epoch": 0.009943027672273468, + "grad_norm": 7.599858283996582, + "learning_rate": 9.760000000000001e-06, + "loss": 0.5103, + "step": 733 + }, + { + "epoch": 0.009956592512208356, + "grad_norm": 9.918793678283691, + "learning_rate": 9.773333333333335e-06, + "loss": 0.5329, + "step": 734 + }, + { + "epoch": 0.009970157352143245, + "grad_norm": 8.888296127319336, + "learning_rate": 9.786666666666667e-06, + "loss": 0.5, + "step": 735 + }, + { + "epoch": 0.009983722192078133, + "grad_norm": 6.787847995758057, + "learning_rate": 9.800000000000001e-06, + "loss": 0.558, + "step": 736 + }, + { + "epoch": 0.009997287032013023, + "grad_norm": 8.136613845825195, + "learning_rate": 9.813333333333333e-06, + "loss": 0.5842, + "step": 737 + }, + { + "epoch": 0.010010851871947911, + "grad_norm": 9.39494800567627, + "learning_rate": 9.826666666666667e-06, + "loss": 0.4338, + "step": 738 + }, + { + "epoch": 0.0100244167118828, + "grad_norm": 9.85543441772461, + "learning_rate": 9.84e-06, + "loss": 0.6418, + "step": 739 + }, + { + "epoch": 0.010037981551817688, + "grad_norm": 8.228201866149902, + "learning_rate": 9.853333333333334e-06, + "loss": 0.6766, + "step": 740 + }, + { + "epoch": 0.010051546391752578, + "grad_norm": 8.459665298461914, + "learning_rate": 9.866666666666668e-06, + "loss": 0.6363, + "step": 741 + }, + { + "epoch": 0.010065111231687466, + "grad_norm": 8.91916275024414, + "learning_rate": 9.88e-06, + "loss": 0.6203, + "step": 742 + }, + { + "epoch": 0.010078676071622356, + "grad_norm": 9.458357810974121, + "learning_rate": 9.893333333333334e-06, + "loss": 0.4676, + "step": 743 + }, + { + "epoch": 0.010092240911557243, + "grad_norm": 8.575897216796875, + "learning_rate": 9.906666666666668e-06, + "loss": 0.5664, + "step": 744 + }, + { + "epoch": 0.010105805751492133, + "grad_norm": 9.874571800231934, + "learning_rate": 9.920000000000002e-06, + "loss": 0.48, + "step": 745 + }, + { + "epoch": 0.010119370591427021, + "grad_norm": 8.027627944946289, + "learning_rate": 9.933333333333334e-06, + "loss": 0.5742, + "step": 746 + }, + { + "epoch": 0.01013293543136191, + "grad_norm": 7.692586898803711, + "learning_rate": 9.946666666666667e-06, + "loss": 0.5971, + "step": 747 + }, + { + "epoch": 0.010146500271296799, + "grad_norm": 10.219922065734863, + "learning_rate": 9.960000000000001e-06, + "loss": 0.5889, + "step": 748 + }, + { + "epoch": 0.010160065111231688, + "grad_norm": 10.142074584960938, + "learning_rate": 9.973333333333333e-06, + "loss": 0.5882, + "step": 749 + }, + { + "epoch": 0.010173629951166576, + "grad_norm": 11.676383018493652, + "learning_rate": 9.986666666666667e-06, + "loss": 0.7412, + "step": 750 + }, + { + "epoch": 0.010187194791101466, + "grad_norm": 7.640814304351807, + "learning_rate": 1e-05, + "loss": 0.5704, + "step": 751 + }, + { + "epoch": 0.010200759631036354, + "grad_norm": 8.325698852539062, + "learning_rate": 9.999862957379746e-06, + "loss": 0.5205, + "step": 752 + }, + { + "epoch": 0.010214324470971243, + "grad_norm": 8.080214500427246, + "learning_rate": 9.999725914759491e-06, + "loss": 0.4838, + "step": 753 + }, + { + "epoch": 0.010227889310906131, + "grad_norm": 10.419060707092285, + "learning_rate": 9.999588872139236e-06, + "loss": 0.7159, + "step": 754 + }, + { + "epoch": 0.01024145415084102, + "grad_norm": 9.908836364746094, + "learning_rate": 9.999451829518982e-06, + "loss": 0.4997, + "step": 755 + }, + { + "epoch": 0.010255018990775909, + "grad_norm": 9.106840133666992, + "learning_rate": 9.999314786898727e-06, + "loss": 0.4769, + "step": 756 + }, + { + "epoch": 0.010268583830710798, + "grad_norm": 9.603809356689453, + "learning_rate": 9.99917774427847e-06, + "loss": 0.6578, + "step": 757 + }, + { + "epoch": 0.010282148670645686, + "grad_norm": 8.265758514404297, + "learning_rate": 9.999040701658217e-06, + "loss": 0.5826, + "step": 758 + }, + { + "epoch": 0.010295713510580576, + "grad_norm": 9.370816230773926, + "learning_rate": 9.998903659037962e-06, + "loss": 0.6632, + "step": 759 + }, + { + "epoch": 0.010309278350515464, + "grad_norm": 8.173223495483398, + "learning_rate": 9.998766616417707e-06, + "loss": 0.5654, + "step": 760 + }, + { + "epoch": 0.010322843190450353, + "grad_norm": 10.313325881958008, + "learning_rate": 9.998629573797451e-06, + "loss": 0.679, + "step": 761 + }, + { + "epoch": 0.010336408030385241, + "grad_norm": 9.628342628479004, + "learning_rate": 9.998492531177198e-06, + "loss": 0.5273, + "step": 762 + }, + { + "epoch": 0.010349972870320131, + "grad_norm": 11.083467483520508, + "learning_rate": 9.998355488556943e-06, + "loss": 0.6539, + "step": 763 + }, + { + "epoch": 0.010363537710255019, + "grad_norm": 9.434314727783203, + "learning_rate": 9.998218445936686e-06, + "loss": 0.5304, + "step": 764 + }, + { + "epoch": 0.010377102550189908, + "grad_norm": 9.450518608093262, + "learning_rate": 9.998081403316432e-06, + "loss": 0.6446, + "step": 765 + }, + { + "epoch": 0.010390667390124796, + "grad_norm": 7.8318867683410645, + "learning_rate": 9.997944360696177e-06, + "loss": 0.4649, + "step": 766 + }, + { + "epoch": 0.010404232230059686, + "grad_norm": 7.722296714782715, + "learning_rate": 9.997807318075922e-06, + "loss": 0.5874, + "step": 767 + }, + { + "epoch": 0.010417797069994574, + "grad_norm": 8.501108169555664, + "learning_rate": 9.997670275455667e-06, + "loss": 0.4284, + "step": 768 + }, + { + "epoch": 0.010431361909929463, + "grad_norm": 9.251705169677734, + "learning_rate": 9.997533232835412e-06, + "loss": 0.4788, + "step": 769 + }, + { + "epoch": 0.010444926749864351, + "grad_norm": 9.010856628417969, + "learning_rate": 9.997396190215158e-06, + "loss": 0.5663, + "step": 770 + }, + { + "epoch": 0.010458491589799241, + "grad_norm": 7.236252784729004, + "learning_rate": 9.997259147594903e-06, + "loss": 0.5842, + "step": 771 + }, + { + "epoch": 0.010472056429734129, + "grad_norm": 8.94779109954834, + "learning_rate": 9.997122104974648e-06, + "loss": 0.502, + "step": 772 + }, + { + "epoch": 0.010485621269669019, + "grad_norm": 7.476988315582275, + "learning_rate": 9.996985062354393e-06, + "loss": 0.496, + "step": 773 + }, + { + "epoch": 0.010499186109603906, + "grad_norm": 7.5336222648620605, + "learning_rate": 9.996848019734138e-06, + "loss": 0.4522, + "step": 774 + }, + { + "epoch": 0.010512750949538796, + "grad_norm": 10.418326377868652, + "learning_rate": 9.996710977113883e-06, + "loss": 0.562, + "step": 775 + }, + { + "epoch": 0.010526315789473684, + "grad_norm": 10.105327606201172, + "learning_rate": 9.996573934493629e-06, + "loss": 0.4644, + "step": 776 + }, + { + "epoch": 0.010539880629408574, + "grad_norm": 7.883210182189941, + "learning_rate": 9.996436891873374e-06, + "loss": 0.5371, + "step": 777 + }, + { + "epoch": 0.010553445469343462, + "grad_norm": 9.350541114807129, + "learning_rate": 9.996299849253119e-06, + "loss": 0.6279, + "step": 778 + }, + { + "epoch": 0.010567010309278351, + "grad_norm": 6.793468475341797, + "learning_rate": 9.996162806632862e-06, + "loss": 0.4615, + "step": 779 + }, + { + "epoch": 0.010580575149213239, + "grad_norm": 7.714431285858154, + "learning_rate": 9.99602576401261e-06, + "loss": 0.4314, + "step": 780 + }, + { + "epoch": 0.010594139989148129, + "grad_norm": 5.912341117858887, + "learning_rate": 9.995888721392354e-06, + "loss": 0.5217, + "step": 781 + }, + { + "epoch": 0.010607704829083017, + "grad_norm": 6.972966194152832, + "learning_rate": 9.995751678772098e-06, + "loss": 0.4698, + "step": 782 + }, + { + "epoch": 0.010621269669017906, + "grad_norm": 10.320362091064453, + "learning_rate": 9.995614636151843e-06, + "loss": 0.6607, + "step": 783 + }, + { + "epoch": 0.010634834508952794, + "grad_norm": 7.825293064117432, + "learning_rate": 9.99547759353159e-06, + "loss": 0.4635, + "step": 784 + }, + { + "epoch": 0.010648399348887684, + "grad_norm": 7.536478519439697, + "learning_rate": 9.995340550911335e-06, + "loss": 0.5775, + "step": 785 + }, + { + "epoch": 0.010661964188822572, + "grad_norm": 8.17536449432373, + "learning_rate": 9.995203508291079e-06, + "loss": 0.5814, + "step": 786 + }, + { + "epoch": 0.010675529028757461, + "grad_norm": 8.918109893798828, + "learning_rate": 9.995066465670824e-06, + "loss": 0.5947, + "step": 787 + }, + { + "epoch": 0.01068909386869235, + "grad_norm": 8.51899528503418, + "learning_rate": 9.99492942305057e-06, + "loss": 0.5125, + "step": 788 + }, + { + "epoch": 0.010702658708627239, + "grad_norm": 9.437289237976074, + "learning_rate": 9.994792380430314e-06, + "loss": 0.6616, + "step": 789 + }, + { + "epoch": 0.010716223548562127, + "grad_norm": 9.305871963500977, + "learning_rate": 9.99465533781006e-06, + "loss": 0.6909, + "step": 790 + }, + { + "epoch": 0.010729788388497016, + "grad_norm": 7.655498027801514, + "learning_rate": 9.994518295189805e-06, + "loss": 0.5515, + "step": 791 + }, + { + "epoch": 0.010743353228431904, + "grad_norm": 9.298210144042969, + "learning_rate": 9.99438125256955e-06, + "loss": 0.4415, + "step": 792 + }, + { + "epoch": 0.010756918068366794, + "grad_norm": 10.402194023132324, + "learning_rate": 9.994244209949295e-06, + "loss": 0.74, + "step": 793 + }, + { + "epoch": 0.010770482908301682, + "grad_norm": 8.370060920715332, + "learning_rate": 9.99410716732904e-06, + "loss": 0.633, + "step": 794 + }, + { + "epoch": 0.010784047748236571, + "grad_norm": 6.4967169761657715, + "learning_rate": 9.993970124708785e-06, + "loss": 0.5302, + "step": 795 + }, + { + "epoch": 0.01079761258817146, + "grad_norm": 9.441495895385742, + "learning_rate": 9.99383308208853e-06, + "loss": 0.7072, + "step": 796 + }, + { + "epoch": 0.010811177428106349, + "grad_norm": 8.881160736083984, + "learning_rate": 9.993696039468276e-06, + "loss": 0.4733, + "step": 797 + }, + { + "epoch": 0.010824742268041237, + "grad_norm": 7.201458930969238, + "learning_rate": 9.99355899684802e-06, + "loss": 0.5835, + "step": 798 + }, + { + "epoch": 0.010838307107976126, + "grad_norm": 8.723085403442383, + "learning_rate": 9.993421954227766e-06, + "loss": 0.4824, + "step": 799 + }, + { + "epoch": 0.010851871947911014, + "grad_norm": 6.840959548950195, + "learning_rate": 9.993284911607511e-06, + "loss": 0.601, + "step": 800 + }, + { + "epoch": 0.010865436787845904, + "grad_norm": 7.874425888061523, + "learning_rate": 9.993147868987256e-06, + "loss": 0.5512, + "step": 801 + }, + { + "epoch": 0.010879001627780792, + "grad_norm": 7.892416000366211, + "learning_rate": 9.993010826367002e-06, + "loss": 0.6114, + "step": 802 + }, + { + "epoch": 0.010892566467715682, + "grad_norm": 8.659323692321777, + "learning_rate": 9.992873783746747e-06, + "loss": 0.5748, + "step": 803 + }, + { + "epoch": 0.01090613130765057, + "grad_norm": 11.159412384033203, + "learning_rate": 9.99273674112649e-06, + "loss": 0.7321, + "step": 804 + }, + { + "epoch": 0.010919696147585459, + "grad_norm": 7.786670684814453, + "learning_rate": 9.992599698506237e-06, + "loss": 0.5842, + "step": 805 + }, + { + "epoch": 0.010933260987520347, + "grad_norm": 9.56280517578125, + "learning_rate": 9.992462655885982e-06, + "loss": 0.5924, + "step": 806 + }, + { + "epoch": 0.010946825827455237, + "grad_norm": 6.8414483070373535, + "learning_rate": 9.992325613265726e-06, + "loss": 0.4962, + "step": 807 + }, + { + "epoch": 0.010960390667390124, + "grad_norm": 13.378902435302734, + "learning_rate": 9.992188570645471e-06, + "loss": 0.7284, + "step": 808 + }, + { + "epoch": 0.010973955507325014, + "grad_norm": 7.02095365524292, + "learning_rate": 9.992051528025216e-06, + "loss": 0.5459, + "step": 809 + }, + { + "epoch": 0.010987520347259902, + "grad_norm": 9.11378002166748, + "learning_rate": 9.991914485404961e-06, + "loss": 0.5669, + "step": 810 + }, + { + "epoch": 0.011001085187194792, + "grad_norm": 4.932730674743652, + "learning_rate": 9.991777442784706e-06, + "loss": 0.312, + "step": 811 + }, + { + "epoch": 0.01101465002712968, + "grad_norm": 8.58910083770752, + "learning_rate": 9.991640400164452e-06, + "loss": 0.5777, + "step": 812 + }, + { + "epoch": 0.01102821486706457, + "grad_norm": 10.448040962219238, + "learning_rate": 9.991503357544197e-06, + "loss": 0.7755, + "step": 813 + }, + { + "epoch": 0.011041779706999457, + "grad_norm": 8.185651779174805, + "learning_rate": 9.991366314923942e-06, + "loss": 0.5835, + "step": 814 + }, + { + "epoch": 0.011055344546934347, + "grad_norm": 9.149280548095703, + "learning_rate": 9.991229272303687e-06, + "loss": 0.6074, + "step": 815 + }, + { + "epoch": 0.011068909386869235, + "grad_norm": 10.284550666809082, + "learning_rate": 9.991092229683432e-06, + "loss": 0.6977, + "step": 816 + }, + { + "epoch": 0.011082474226804124, + "grad_norm": 7.216450214385986, + "learning_rate": 9.990955187063178e-06, + "loss": 0.6064, + "step": 817 + }, + { + "epoch": 0.011096039066739012, + "grad_norm": 7.338936805725098, + "learning_rate": 9.990818144442923e-06, + "loss": 0.6059, + "step": 818 + }, + { + "epoch": 0.011109603906673902, + "grad_norm": 11.070013046264648, + "learning_rate": 9.990681101822668e-06, + "loss": 0.7973, + "step": 819 + }, + { + "epoch": 0.01112316874660879, + "grad_norm": 8.94912052154541, + "learning_rate": 9.990544059202413e-06, + "loss": 0.6547, + "step": 820 + }, + { + "epoch": 0.01113673358654368, + "grad_norm": 9.470596313476562, + "learning_rate": 9.990407016582158e-06, + "loss": 0.4486, + "step": 821 + }, + { + "epoch": 0.011150298426478567, + "grad_norm": 7.785496711730957, + "learning_rate": 9.990269973961902e-06, + "loss": 0.6019, + "step": 822 + }, + { + "epoch": 0.011163863266413457, + "grad_norm": 7.718247413635254, + "learning_rate": 9.990132931341649e-06, + "loss": 0.5839, + "step": 823 + }, + { + "epoch": 0.011177428106348345, + "grad_norm": 9.178659439086914, + "learning_rate": 9.989995888721394e-06, + "loss": 0.4272, + "step": 824 + }, + { + "epoch": 0.011190992946283234, + "grad_norm": 8.833006858825684, + "learning_rate": 9.989858846101137e-06, + "loss": 0.568, + "step": 825 + }, + { + "epoch": 0.011204557786218122, + "grad_norm": 11.957263946533203, + "learning_rate": 9.989721803480882e-06, + "loss": 0.5813, + "step": 826 + }, + { + "epoch": 0.011218122626153012, + "grad_norm": 10.253851890563965, + "learning_rate": 9.98958476086063e-06, + "loss": 0.4993, + "step": 827 + }, + { + "epoch": 0.0112316874660879, + "grad_norm": 9.811375617980957, + "learning_rate": 9.989447718240375e-06, + "loss": 0.4681, + "step": 828 + }, + { + "epoch": 0.01124525230602279, + "grad_norm": 10.285816192626953, + "learning_rate": 9.989310675620118e-06, + "loss": 0.4826, + "step": 829 + }, + { + "epoch": 0.011258817145957677, + "grad_norm": 9.74959945678711, + "learning_rate": 9.989173632999863e-06, + "loss": 0.5396, + "step": 830 + }, + { + "epoch": 0.011272381985892567, + "grad_norm": 8.767083168029785, + "learning_rate": 9.98903659037961e-06, + "loss": 0.5882, + "step": 831 + }, + { + "epoch": 0.011285946825827455, + "grad_norm": 7.351760387420654, + "learning_rate": 9.988899547759354e-06, + "loss": 0.3962, + "step": 832 + }, + { + "epoch": 0.011299511665762344, + "grad_norm": 9.570991516113281, + "learning_rate": 9.988762505139099e-06, + "loss": 0.3962, + "step": 833 + }, + { + "epoch": 0.011313076505697232, + "grad_norm": 7.937053203582764, + "learning_rate": 9.988625462518844e-06, + "loss": 0.4937, + "step": 834 + }, + { + "epoch": 0.011326641345632122, + "grad_norm": 9.582193374633789, + "learning_rate": 9.988488419898589e-06, + "loss": 0.5359, + "step": 835 + }, + { + "epoch": 0.01134020618556701, + "grad_norm": 9.850019454956055, + "learning_rate": 9.988351377278334e-06, + "loss": 0.5519, + "step": 836 + }, + { + "epoch": 0.0113537710255019, + "grad_norm": 10.238424301147461, + "learning_rate": 9.98821433465808e-06, + "loss": 0.5757, + "step": 837 + }, + { + "epoch": 0.011367335865436787, + "grad_norm": 8.211912155151367, + "learning_rate": 9.988077292037825e-06, + "loss": 0.4293, + "step": 838 + }, + { + "epoch": 0.011380900705371677, + "grad_norm": 10.141050338745117, + "learning_rate": 9.98794024941757e-06, + "loss": 0.6541, + "step": 839 + }, + { + "epoch": 0.011394465545306565, + "grad_norm": 8.305087089538574, + "learning_rate": 9.987803206797315e-06, + "loss": 0.4837, + "step": 840 + }, + { + "epoch": 0.011408030385241455, + "grad_norm": 9.628129005432129, + "learning_rate": 9.98766616417706e-06, + "loss": 0.5608, + "step": 841 + }, + { + "epoch": 0.011421595225176343, + "grad_norm": 9.54753303527832, + "learning_rate": 9.987529121556805e-06, + "loss": 0.4388, + "step": 842 + }, + { + "epoch": 0.011435160065111232, + "grad_norm": 9.266098022460938, + "learning_rate": 9.98739207893655e-06, + "loss": 0.4504, + "step": 843 + }, + { + "epoch": 0.01144872490504612, + "grad_norm": 8.243130683898926, + "learning_rate": 9.987255036316296e-06, + "loss": 0.4232, + "step": 844 + }, + { + "epoch": 0.01146228974498101, + "grad_norm": 13.149251937866211, + "learning_rate": 9.987117993696041e-06, + "loss": 0.886, + "step": 845 + }, + { + "epoch": 0.011475854584915898, + "grad_norm": 11.138825416564941, + "learning_rate": 9.986980951075786e-06, + "loss": 0.6746, + "step": 846 + }, + { + "epoch": 0.011489419424850787, + "grad_norm": 11.281259536743164, + "learning_rate": 9.98684390845553e-06, + "loss": 0.9725, + "step": 847 + }, + { + "epoch": 0.011502984264785675, + "grad_norm": 9.976859092712402, + "learning_rate": 9.986706865835275e-06, + "loss": 0.511, + "step": 848 + }, + { + "epoch": 0.011516549104720565, + "grad_norm": 8.621488571166992, + "learning_rate": 9.986569823215022e-06, + "loss": 0.4175, + "step": 849 + }, + { + "epoch": 0.011530113944655453, + "grad_norm": 10.509485244750977, + "learning_rate": 9.986432780594765e-06, + "loss": 0.6582, + "step": 850 + }, + { + "epoch": 0.011543678784590342, + "grad_norm": 11.276257514953613, + "learning_rate": 9.98629573797451e-06, + "loss": 0.5792, + "step": 851 + }, + { + "epoch": 0.01155724362452523, + "grad_norm": 9.814424514770508, + "learning_rate": 9.986158695354255e-06, + "loss": 0.5274, + "step": 852 + }, + { + "epoch": 0.01157080846446012, + "grad_norm": 9.708789825439453, + "learning_rate": 9.986021652734002e-06, + "loss": 0.5654, + "step": 853 + }, + { + "epoch": 0.011584373304395008, + "grad_norm": 9.912487983703613, + "learning_rate": 9.985884610113746e-06, + "loss": 0.6742, + "step": 854 + }, + { + "epoch": 0.011597938144329897, + "grad_norm": 10.270674705505371, + "learning_rate": 9.985747567493491e-06, + "loss": 0.6983, + "step": 855 + }, + { + "epoch": 0.011611502984264785, + "grad_norm": 10.778637886047363, + "learning_rate": 9.985610524873236e-06, + "loss": 0.5671, + "step": 856 + }, + { + "epoch": 0.011625067824199675, + "grad_norm": 10.614242553710938, + "learning_rate": 9.985473482252981e-06, + "loss": 0.7601, + "step": 857 + }, + { + "epoch": 0.011638632664134563, + "grad_norm": 9.367609977722168, + "learning_rate": 9.985336439632726e-06, + "loss": 0.5383, + "step": 858 + }, + { + "epoch": 0.011652197504069452, + "grad_norm": 9.777548789978027, + "learning_rate": 9.985199397012472e-06, + "loss": 0.7425, + "step": 859 + }, + { + "epoch": 0.01166576234400434, + "grad_norm": 8.003934860229492, + "learning_rate": 9.985062354392217e-06, + "loss": 0.5166, + "step": 860 + }, + { + "epoch": 0.01167932718393923, + "grad_norm": 10.420960426330566, + "learning_rate": 9.984925311771962e-06, + "loss": 0.6579, + "step": 861 + }, + { + "epoch": 0.011692892023874118, + "grad_norm": 10.28174877166748, + "learning_rate": 9.984788269151707e-06, + "loss": 0.5522, + "step": 862 + }, + { + "epoch": 0.011706456863809007, + "grad_norm": 9.662100791931152, + "learning_rate": 9.984651226531452e-06, + "loss": 0.4942, + "step": 863 + }, + { + "epoch": 0.011720021703743895, + "grad_norm": 11.054118156433105, + "learning_rate": 9.984514183911198e-06, + "loss": 0.6863, + "step": 864 + }, + { + "epoch": 0.011733586543678785, + "grad_norm": 6.330477237701416, + "learning_rate": 9.984377141290941e-06, + "loss": 0.3532, + "step": 865 + }, + { + "epoch": 0.011747151383613673, + "grad_norm": 6.28380012512207, + "learning_rate": 9.984240098670688e-06, + "loss": 0.4329, + "step": 866 + }, + { + "epoch": 0.011760716223548563, + "grad_norm": 10.519152641296387, + "learning_rate": 9.984103056050433e-06, + "loss": 0.5382, + "step": 867 + }, + { + "epoch": 0.01177428106348345, + "grad_norm": 9.227995872497559, + "learning_rate": 9.983966013430178e-06, + "loss": 0.7551, + "step": 868 + }, + { + "epoch": 0.01178784590341834, + "grad_norm": 8.157322883605957, + "learning_rate": 9.983828970809922e-06, + "loss": 0.4454, + "step": 869 + }, + { + "epoch": 0.011801410743353228, + "grad_norm": 9.75037956237793, + "learning_rate": 9.983691928189669e-06, + "loss": 0.4336, + "step": 870 + }, + { + "epoch": 0.011814975583288118, + "grad_norm": 7.8065338134765625, + "learning_rate": 9.983554885569414e-06, + "loss": 0.396, + "step": 871 + }, + { + "epoch": 0.011828540423223005, + "grad_norm": 9.036543846130371, + "learning_rate": 9.983417842949157e-06, + "loss": 0.7407, + "step": 872 + }, + { + "epoch": 0.011842105263157895, + "grad_norm": 8.74907398223877, + "learning_rate": 9.983280800328902e-06, + "loss": 0.4501, + "step": 873 + }, + { + "epoch": 0.011855670103092783, + "grad_norm": 12.187756538391113, + "learning_rate": 9.98314375770865e-06, + "loss": 0.5641, + "step": 874 + }, + { + "epoch": 0.011869234943027673, + "grad_norm": 8.392354965209961, + "learning_rate": 9.983006715088393e-06, + "loss": 0.5986, + "step": 875 + }, + { + "epoch": 0.01188279978296256, + "grad_norm": 12.247690200805664, + "learning_rate": 9.982869672468138e-06, + "loss": 0.5963, + "step": 876 + }, + { + "epoch": 0.01189636462289745, + "grad_norm": 8.356425285339355, + "learning_rate": 9.982732629847883e-06, + "loss": 0.5343, + "step": 877 + }, + { + "epoch": 0.011909929462832338, + "grad_norm": 8.241572380065918, + "learning_rate": 9.982595587227628e-06, + "loss": 0.4851, + "step": 878 + }, + { + "epoch": 0.011923494302767228, + "grad_norm": 8.037667274475098, + "learning_rate": 9.982458544607374e-06, + "loss": 0.6527, + "step": 879 + }, + { + "epoch": 0.011937059142702116, + "grad_norm": 11.586698532104492, + "learning_rate": 9.982321501987119e-06, + "loss": 0.5354, + "step": 880 + }, + { + "epoch": 0.011950623982637005, + "grad_norm": 9.790533065795898, + "learning_rate": 9.982184459366864e-06, + "loss": 0.5209, + "step": 881 + }, + { + "epoch": 0.011964188822571893, + "grad_norm": 12.119715690612793, + "learning_rate": 9.982047416746609e-06, + "loss": 0.6748, + "step": 882 + }, + { + "epoch": 0.011977753662506783, + "grad_norm": 6.530577659606934, + "learning_rate": 9.981910374126354e-06, + "loss": 0.4553, + "step": 883 + }, + { + "epoch": 0.01199131850244167, + "grad_norm": 12.758757591247559, + "learning_rate": 9.9817733315061e-06, + "loss": 0.7928, + "step": 884 + }, + { + "epoch": 0.01200488334237656, + "grad_norm": 9.270631790161133, + "learning_rate": 9.981636288885845e-06, + "loss": 0.5514, + "step": 885 + }, + { + "epoch": 0.012018448182311448, + "grad_norm": 8.621623039245605, + "learning_rate": 9.98149924626559e-06, + "loss": 0.4155, + "step": 886 + }, + { + "epoch": 0.012032013022246338, + "grad_norm": 10.026289939880371, + "learning_rate": 9.981362203645335e-06, + "loss": 0.5665, + "step": 887 + }, + { + "epoch": 0.012045577862181226, + "grad_norm": 9.698200225830078, + "learning_rate": 9.98122516102508e-06, + "loss": 0.5144, + "step": 888 + }, + { + "epoch": 0.012059142702116115, + "grad_norm": 9.331954002380371, + "learning_rate": 9.981088118404825e-06, + "loss": 0.4738, + "step": 889 + }, + { + "epoch": 0.012072707542051003, + "grad_norm": 9.945034980773926, + "learning_rate": 9.980951075784569e-06, + "loss": 0.646, + "step": 890 + }, + { + "epoch": 0.012086272381985893, + "grad_norm": 9.13231372833252, + "learning_rate": 9.980814033164314e-06, + "loss": 0.4517, + "step": 891 + }, + { + "epoch": 0.01209983722192078, + "grad_norm": 6.877588748931885, + "learning_rate": 9.980676990544061e-06, + "loss": 0.4917, + "step": 892 + }, + { + "epoch": 0.01211340206185567, + "grad_norm": 9.05912971496582, + "learning_rate": 9.980539947923804e-06, + "loss": 0.475, + "step": 893 + }, + { + "epoch": 0.012126966901790558, + "grad_norm": 8.123485565185547, + "learning_rate": 9.98040290530355e-06, + "loss": 0.5569, + "step": 894 + }, + { + "epoch": 0.012140531741725448, + "grad_norm": 9.792627334594727, + "learning_rate": 9.980265862683295e-06, + "loss": 0.5595, + "step": 895 + }, + { + "epoch": 0.012154096581660336, + "grad_norm": 7.942371368408203, + "learning_rate": 9.980128820063042e-06, + "loss": 0.3933, + "step": 896 + }, + { + "epoch": 0.012167661421595225, + "grad_norm": 7.5064239501953125, + "learning_rate": 9.979991777442785e-06, + "loss": 0.4467, + "step": 897 + }, + { + "epoch": 0.012181226261530113, + "grad_norm": 10.825020790100098, + "learning_rate": 9.97985473482253e-06, + "loss": 0.7325, + "step": 898 + }, + { + "epoch": 0.012194791101465003, + "grad_norm": 8.603020668029785, + "learning_rate": 9.979717692202275e-06, + "loss": 0.4764, + "step": 899 + }, + { + "epoch": 0.012208355941399891, + "grad_norm": 7.446374416351318, + "learning_rate": 9.97958064958202e-06, + "loss": 0.4843, + "step": 900 + }, + { + "epoch": 0.01222192078133478, + "grad_norm": 8.170618057250977, + "learning_rate": 9.979443606961766e-06, + "loss": 0.4869, + "step": 901 + }, + { + "epoch": 0.012235485621269668, + "grad_norm": 7.765012741088867, + "learning_rate": 9.979306564341511e-06, + "loss": 0.4327, + "step": 902 + }, + { + "epoch": 0.012249050461204558, + "grad_norm": 7.382414817810059, + "learning_rate": 9.979169521721256e-06, + "loss": 0.3692, + "step": 903 + }, + { + "epoch": 0.012262615301139446, + "grad_norm": 7.898096084594727, + "learning_rate": 9.979032479101001e-06, + "loss": 0.5072, + "step": 904 + }, + { + "epoch": 0.012276180141074336, + "grad_norm": 7.438011646270752, + "learning_rate": 9.978895436480747e-06, + "loss": 0.5407, + "step": 905 + }, + { + "epoch": 0.012289744981009224, + "grad_norm": 9.071755409240723, + "learning_rate": 9.978758393860492e-06, + "loss": 0.4232, + "step": 906 + }, + { + "epoch": 0.012303309820944113, + "grad_norm": 7.857212543487549, + "learning_rate": 9.978621351240237e-06, + "loss": 0.4706, + "step": 907 + }, + { + "epoch": 0.012316874660879001, + "grad_norm": 6.998917102813721, + "learning_rate": 9.97848430861998e-06, + "loss": 0.3282, + "step": 908 + }, + { + "epoch": 0.01233043950081389, + "grad_norm": 7.889724254608154, + "learning_rate": 9.978347265999727e-06, + "loss": 0.5261, + "step": 909 + }, + { + "epoch": 0.012344004340748779, + "grad_norm": 7.27243185043335, + "learning_rate": 9.978210223379472e-06, + "loss": 0.5769, + "step": 910 + }, + { + "epoch": 0.012357569180683668, + "grad_norm": 12.816777229309082, + "learning_rate": 9.978073180759218e-06, + "loss": 0.6092, + "step": 911 + }, + { + "epoch": 0.012371134020618556, + "grad_norm": 7.0225043296813965, + "learning_rate": 9.977936138138961e-06, + "loss": 0.495, + "step": 912 + }, + { + "epoch": 0.012384698860553446, + "grad_norm": 6.259124279022217, + "learning_rate": 9.977799095518708e-06, + "loss": 0.4686, + "step": 913 + }, + { + "epoch": 0.012398263700488334, + "grad_norm": 7.863890647888184, + "learning_rate": 9.977662052898453e-06, + "loss": 0.4138, + "step": 914 + }, + { + "epoch": 0.012411828540423223, + "grad_norm": 8.88696002960205, + "learning_rate": 9.977525010278197e-06, + "loss": 0.55, + "step": 915 + }, + { + "epoch": 0.012425393380358111, + "grad_norm": 9.889415740966797, + "learning_rate": 9.977387967657942e-06, + "loss": 0.7516, + "step": 916 + }, + { + "epoch": 0.012438958220293, + "grad_norm": 9.25155258178711, + "learning_rate": 9.977250925037687e-06, + "loss": 0.6267, + "step": 917 + }, + { + "epoch": 0.012452523060227889, + "grad_norm": 9.628547668457031, + "learning_rate": 9.977113882417432e-06, + "loss": 0.6658, + "step": 918 + }, + { + "epoch": 0.012466087900162778, + "grad_norm": 8.663455963134766, + "learning_rate": 9.976976839797177e-06, + "loss": 0.5878, + "step": 919 + }, + { + "epoch": 0.012479652740097666, + "grad_norm": 9.847900390625, + "learning_rate": 9.976839797176922e-06, + "loss": 0.7301, + "step": 920 + }, + { + "epoch": 0.012493217580032556, + "grad_norm": 7.132843971252441, + "learning_rate": 9.976702754556668e-06, + "loss": 0.4239, + "step": 921 + }, + { + "epoch": 0.012506782419967444, + "grad_norm": 9.100663185119629, + "learning_rate": 9.976565711936413e-06, + "loss": 0.4651, + "step": 922 + }, + { + "epoch": 0.012520347259902333, + "grad_norm": 7.614242076873779, + "learning_rate": 9.976428669316158e-06, + "loss": 0.4575, + "step": 923 + }, + { + "epoch": 0.012533912099837221, + "grad_norm": 5.995017051696777, + "learning_rate": 9.976291626695903e-06, + "loss": 0.4657, + "step": 924 + }, + { + "epoch": 0.012547476939772111, + "grad_norm": 9.48989486694336, + "learning_rate": 9.976154584075648e-06, + "loss": 0.5837, + "step": 925 + }, + { + "epoch": 0.012561041779706999, + "grad_norm": 7.790136337280273, + "learning_rate": 9.976017541455394e-06, + "loss": 0.5658, + "step": 926 + }, + { + "epoch": 0.012574606619641888, + "grad_norm": 10.277789115905762, + "learning_rate": 9.975880498835139e-06, + "loss": 0.747, + "step": 927 + }, + { + "epoch": 0.012588171459576776, + "grad_norm": 8.04004192352295, + "learning_rate": 9.975743456214884e-06, + "loss": 0.5218, + "step": 928 + }, + { + "epoch": 0.012601736299511666, + "grad_norm": 10.6765775680542, + "learning_rate": 9.975606413594629e-06, + "loss": 0.5812, + "step": 929 + }, + { + "epoch": 0.012615301139446554, + "grad_norm": 9.016050338745117, + "learning_rate": 9.975469370974374e-06, + "loss": 0.4729, + "step": 930 + }, + { + "epoch": 0.012628865979381444, + "grad_norm": 10.398818016052246, + "learning_rate": 9.97533232835412e-06, + "loss": 0.798, + "step": 931 + }, + { + "epoch": 0.012642430819316331, + "grad_norm": 6.36821174621582, + "learning_rate": 9.975195285733865e-06, + "loss": 0.3726, + "step": 932 + }, + { + "epoch": 0.012655995659251221, + "grad_norm": 7.994998931884766, + "learning_rate": 9.975058243113608e-06, + "loss": 0.5201, + "step": 933 + }, + { + "epoch": 0.012669560499186109, + "grad_norm": 9.668534278869629, + "learning_rate": 9.974921200493353e-06, + "loss": 0.4835, + "step": 934 + }, + { + "epoch": 0.012683125339120999, + "grad_norm": 12.153990745544434, + "learning_rate": 9.9747841578731e-06, + "loss": 0.574, + "step": 935 + }, + { + "epoch": 0.012696690179055886, + "grad_norm": 8.458842277526855, + "learning_rate": 9.974647115252845e-06, + "loss": 0.5865, + "step": 936 + }, + { + "epoch": 0.012710255018990776, + "grad_norm": 7.827704429626465, + "learning_rate": 9.974510072632589e-06, + "loss": 0.508, + "step": 937 + }, + { + "epoch": 0.012723819858925664, + "grad_norm": 9.112056732177734, + "learning_rate": 9.974373030012334e-06, + "loss": 0.7707, + "step": 938 + }, + { + "epoch": 0.012737384698860554, + "grad_norm": 9.200647354125977, + "learning_rate": 9.974235987392081e-06, + "loss": 0.4883, + "step": 939 + }, + { + "epoch": 0.012750949538795442, + "grad_norm": 9.613358497619629, + "learning_rate": 9.974098944771824e-06, + "loss": 0.5282, + "step": 940 + }, + { + "epoch": 0.012764514378730331, + "grad_norm": 10.657673835754395, + "learning_rate": 9.97396190215157e-06, + "loss": 0.5559, + "step": 941 + }, + { + "epoch": 0.012778079218665219, + "grad_norm": 7.233135223388672, + "learning_rate": 9.973824859531315e-06, + "loss": 0.4735, + "step": 942 + }, + { + "epoch": 0.012791644058600109, + "grad_norm": 6.885103702545166, + "learning_rate": 9.97368781691106e-06, + "loss": 0.5417, + "step": 943 + }, + { + "epoch": 0.012805208898534997, + "grad_norm": 9.678051948547363, + "learning_rate": 9.973550774290805e-06, + "loss": 0.6489, + "step": 944 + }, + { + "epoch": 0.012818773738469886, + "grad_norm": 10.06314754486084, + "learning_rate": 9.97341373167055e-06, + "loss": 0.6242, + "step": 945 + }, + { + "epoch": 0.012832338578404774, + "grad_norm": 11.277862548828125, + "learning_rate": 9.973276689050295e-06, + "loss": 0.7474, + "step": 946 + }, + { + "epoch": 0.012845903418339664, + "grad_norm": 10.614295959472656, + "learning_rate": 9.97313964643004e-06, + "loss": 0.62, + "step": 947 + }, + { + "epoch": 0.012859468258274552, + "grad_norm": 7.130256175994873, + "learning_rate": 9.973002603809786e-06, + "loss": 0.5361, + "step": 948 + }, + { + "epoch": 0.012873033098209441, + "grad_norm": 9.842878341674805, + "learning_rate": 9.972865561189531e-06, + "loss": 0.4971, + "step": 949 + }, + { + "epoch": 0.01288659793814433, + "grad_norm": 9.02463150024414, + "learning_rate": 9.972728518569276e-06, + "loss": 0.526, + "step": 950 + }, + { + "epoch": 0.012900162778079219, + "grad_norm": 9.178728103637695, + "learning_rate": 9.972591475949021e-06, + "loss": 0.6511, + "step": 951 + }, + { + "epoch": 0.012913727618014107, + "grad_norm": 10.073135375976562, + "learning_rate": 9.972454433328767e-06, + "loss": 0.7334, + "step": 952 + }, + { + "epoch": 0.012927292457948996, + "grad_norm": 9.105377197265625, + "learning_rate": 9.972317390708512e-06, + "loss": 0.5276, + "step": 953 + }, + { + "epoch": 0.012940857297883884, + "grad_norm": 9.04026985168457, + "learning_rate": 9.972180348088257e-06, + "loss": 0.5165, + "step": 954 + }, + { + "epoch": 0.012954422137818774, + "grad_norm": 6.8886919021606445, + "learning_rate": 9.972043305468e-06, + "loss": 0.4681, + "step": 955 + }, + { + "epoch": 0.012967986977753662, + "grad_norm": 14.896998405456543, + "learning_rate": 9.971906262847747e-06, + "loss": 0.5799, + "step": 956 + }, + { + "epoch": 0.012981551817688551, + "grad_norm": 14.806241989135742, + "learning_rate": 9.971769220227492e-06, + "loss": 0.8129, + "step": 957 + }, + { + "epoch": 0.01299511665762344, + "grad_norm": 9.378487586975098, + "learning_rate": 9.971632177607236e-06, + "loss": 0.6923, + "step": 958 + }, + { + "epoch": 0.013008681497558329, + "grad_norm": 9.259393692016602, + "learning_rate": 9.971495134986981e-06, + "loss": 0.5494, + "step": 959 + }, + { + "epoch": 0.013022246337493217, + "grad_norm": 6.301532745361328, + "learning_rate": 9.971358092366726e-06, + "loss": 0.3855, + "step": 960 + }, + { + "epoch": 0.013035811177428107, + "grad_norm": 9.353737831115723, + "learning_rate": 9.971221049746473e-06, + "loss": 0.6189, + "step": 961 + }, + { + "epoch": 0.013049376017362994, + "grad_norm": 6.933452129364014, + "learning_rate": 9.971084007126217e-06, + "loss": 0.3879, + "step": 962 + }, + { + "epoch": 0.013062940857297884, + "grad_norm": 8.293742179870605, + "learning_rate": 9.970946964505962e-06, + "loss": 0.5416, + "step": 963 + }, + { + "epoch": 0.013076505697232772, + "grad_norm": 10.877161979675293, + "learning_rate": 9.970809921885707e-06, + "loss": 0.5346, + "step": 964 + }, + { + "epoch": 0.013090070537167662, + "grad_norm": 7.054727077484131, + "learning_rate": 9.970672879265452e-06, + "loss": 0.4926, + "step": 965 + }, + { + "epoch": 0.01310363537710255, + "grad_norm": 13.273823738098145, + "learning_rate": 9.970535836645197e-06, + "loss": 0.5376, + "step": 966 + }, + { + "epoch": 0.013117200217037439, + "grad_norm": 10.357125282287598, + "learning_rate": 9.970398794024943e-06, + "loss": 0.5829, + "step": 967 + }, + { + "epoch": 0.013130765056972327, + "grad_norm": 8.83633804321289, + "learning_rate": 9.970261751404688e-06, + "loss": 0.5177, + "step": 968 + }, + { + "epoch": 0.013144329896907217, + "grad_norm": 10.718758583068848, + "learning_rate": 9.970124708784433e-06, + "loss": 0.6158, + "step": 969 + }, + { + "epoch": 0.013157894736842105, + "grad_norm": 6.956214904785156, + "learning_rate": 9.969987666164178e-06, + "loss": 0.4355, + "step": 970 + }, + { + "epoch": 0.013171459576776994, + "grad_norm": 10.695993423461914, + "learning_rate": 9.969850623543923e-06, + "loss": 0.8561, + "step": 971 + }, + { + "epoch": 0.013185024416711882, + "grad_norm": 8.881560325622559, + "learning_rate": 9.969713580923668e-06, + "loss": 0.4543, + "step": 972 + }, + { + "epoch": 0.013198589256646772, + "grad_norm": 10.895882606506348, + "learning_rate": 9.969576538303412e-06, + "loss": 0.6385, + "step": 973 + }, + { + "epoch": 0.01321215409658166, + "grad_norm": 6.776622295379639, + "learning_rate": 9.969439495683159e-06, + "loss": 0.478, + "step": 974 + }, + { + "epoch": 0.01322571893651655, + "grad_norm": 7.410883903503418, + "learning_rate": 9.969302453062904e-06, + "loss": 0.5611, + "step": 975 + }, + { + "epoch": 0.013239283776451437, + "grad_norm": 8.522477149963379, + "learning_rate": 9.969165410442649e-06, + "loss": 0.6674, + "step": 976 + }, + { + "epoch": 0.013252848616386327, + "grad_norm": 7.759984016418457, + "learning_rate": 9.969028367822393e-06, + "loss": 0.5682, + "step": 977 + }, + { + "epoch": 0.013266413456321215, + "grad_norm": 9.301970481872559, + "learning_rate": 9.96889132520214e-06, + "loss": 0.633, + "step": 978 + }, + { + "epoch": 0.013279978296256104, + "grad_norm": 10.236651420593262, + "learning_rate": 9.968754282581885e-06, + "loss": 0.5233, + "step": 979 + }, + { + "epoch": 0.013293543136190992, + "grad_norm": 9.956327438354492, + "learning_rate": 9.968617239961628e-06, + "loss": 0.7368, + "step": 980 + }, + { + "epoch": 0.013307107976125882, + "grad_norm": 9.681632041931152, + "learning_rate": 9.968480197341373e-06, + "loss": 0.6186, + "step": 981 + }, + { + "epoch": 0.01332067281606077, + "grad_norm": 8.104639053344727, + "learning_rate": 9.96834315472112e-06, + "loss": 0.5579, + "step": 982 + }, + { + "epoch": 0.01333423765599566, + "grad_norm": 10.446602821350098, + "learning_rate": 9.968206112100864e-06, + "loss": 0.7617, + "step": 983 + }, + { + "epoch": 0.013347802495930547, + "grad_norm": 10.916620254516602, + "learning_rate": 9.968069069480609e-06, + "loss": 0.7979, + "step": 984 + }, + { + "epoch": 0.013361367335865437, + "grad_norm": 8.940699577331543, + "learning_rate": 9.967932026860354e-06, + "loss": 0.5431, + "step": 985 + }, + { + "epoch": 0.013374932175800325, + "grad_norm": 8.518926620483398, + "learning_rate": 9.9677949842401e-06, + "loss": 0.4945, + "step": 986 + }, + { + "epoch": 0.013388497015735214, + "grad_norm": 6.727519512176514, + "learning_rate": 9.967657941619844e-06, + "loss": 0.4559, + "step": 987 + }, + { + "epoch": 0.013402061855670102, + "grad_norm": 8.454776763916016, + "learning_rate": 9.96752089899959e-06, + "loss": 0.6239, + "step": 988 + }, + { + "epoch": 0.013415626695604992, + "grad_norm": 7.890241622924805, + "learning_rate": 9.967383856379335e-06, + "loss": 0.6384, + "step": 989 + }, + { + "epoch": 0.01342919153553988, + "grad_norm": 8.478594779968262, + "learning_rate": 9.96724681375908e-06, + "loss": 0.6142, + "step": 990 + }, + { + "epoch": 0.01344275637547477, + "grad_norm": 9.471097946166992, + "learning_rate": 9.967109771138825e-06, + "loss": 0.6417, + "step": 991 + }, + { + "epoch": 0.013456321215409657, + "grad_norm": 9.048232078552246, + "learning_rate": 9.96697272851857e-06, + "loss": 0.5978, + "step": 992 + }, + { + "epoch": 0.013469886055344547, + "grad_norm": 9.924092292785645, + "learning_rate": 9.966835685898315e-06, + "loss": 0.6537, + "step": 993 + }, + { + "epoch": 0.013483450895279435, + "grad_norm": 7.424149990081787, + "learning_rate": 9.96669864327806e-06, + "loss": 0.373, + "step": 994 + }, + { + "epoch": 0.013497015735214325, + "grad_norm": 6.347033500671387, + "learning_rate": 9.966561600657806e-06, + "loss": 0.4798, + "step": 995 + }, + { + "epoch": 0.013510580575149212, + "grad_norm": 6.46332311630249, + "learning_rate": 9.966424558037551e-06, + "loss": 0.4031, + "step": 996 + }, + { + "epoch": 0.013524145415084102, + "grad_norm": 8.49478530883789, + "learning_rate": 9.966287515417296e-06, + "loss": 0.582, + "step": 997 + }, + { + "epoch": 0.01353771025501899, + "grad_norm": 10.428866386413574, + "learning_rate": 9.96615047279704e-06, + "loss": 0.5487, + "step": 998 + }, + { + "epoch": 0.01355127509495388, + "grad_norm": 10.874754905700684, + "learning_rate": 9.966013430176785e-06, + "loss": 0.5315, + "step": 999 + }, + { + "epoch": 0.013564839934888768, + "grad_norm": 9.243977546691895, + "learning_rate": 9.965876387556532e-06, + "loss": 0.5613, + "step": 1000 + }, + { + "epoch": 0.013578404774823657, + "grad_norm": 8.14124584197998, + "learning_rate": 9.965739344936275e-06, + "loss": 0.7554, + "step": 1001 + }, + { + "epoch": 0.013591969614758545, + "grad_norm": 10.059796333312988, + "learning_rate": 9.96560230231602e-06, + "loss": 0.5237, + "step": 1002 + }, + { + "epoch": 0.013605534454693435, + "grad_norm": 7.798550128936768, + "learning_rate": 9.965465259695766e-06, + "loss": 0.5021, + "step": 1003 + }, + { + "epoch": 0.013619099294628323, + "grad_norm": 10.147866249084473, + "learning_rate": 9.965328217075512e-06, + "loss": 0.6757, + "step": 1004 + }, + { + "epoch": 0.013632664134563212, + "grad_norm": 8.648179054260254, + "learning_rate": 9.965191174455256e-06, + "loss": 0.4266, + "step": 1005 + }, + { + "epoch": 0.0136462289744981, + "grad_norm": 11.828720092773438, + "learning_rate": 9.965054131835001e-06, + "loss": 0.702, + "step": 1006 + }, + { + "epoch": 0.01365979381443299, + "grad_norm": 10.831243515014648, + "learning_rate": 9.964917089214746e-06, + "loss": 0.6299, + "step": 1007 + }, + { + "epoch": 0.013673358654367878, + "grad_norm": 8.749032974243164, + "learning_rate": 9.964780046594491e-06, + "loss": 0.6774, + "step": 1008 + }, + { + "epoch": 0.013686923494302767, + "grad_norm": 8.994146347045898, + "learning_rate": 9.964643003974237e-06, + "loss": 0.6774, + "step": 1009 + }, + { + "epoch": 0.013700488334237655, + "grad_norm": 6.151799201965332, + "learning_rate": 9.964505961353982e-06, + "loss": 0.5173, + "step": 1010 + }, + { + "epoch": 0.013714053174172545, + "grad_norm": 9.421857833862305, + "learning_rate": 9.964368918733727e-06, + "loss": 0.4999, + "step": 1011 + }, + { + "epoch": 0.013727618014107433, + "grad_norm": 9.397136688232422, + "learning_rate": 9.964231876113472e-06, + "loss": 0.7757, + "step": 1012 + }, + { + "epoch": 0.013741182854042322, + "grad_norm": 10.461359024047852, + "learning_rate": 9.964094833493217e-06, + "loss": 0.466, + "step": 1013 + }, + { + "epoch": 0.01375474769397721, + "grad_norm": 7.260541915893555, + "learning_rate": 9.963957790872963e-06, + "loss": 0.51, + "step": 1014 + }, + { + "epoch": 0.0137683125339121, + "grad_norm": 8.0062837600708, + "learning_rate": 9.963820748252708e-06, + "loss": 0.6039, + "step": 1015 + }, + { + "epoch": 0.013781877373846988, + "grad_norm": 8.20714282989502, + "learning_rate": 9.963683705632451e-06, + "loss": 0.6827, + "step": 1016 + }, + { + "epoch": 0.013795442213781877, + "grad_norm": 8.585071563720703, + "learning_rate": 9.963546663012198e-06, + "loss": 0.477, + "step": 1017 + }, + { + "epoch": 0.013809007053716765, + "grad_norm": 8.72464370727539, + "learning_rate": 9.963409620391943e-06, + "loss": 0.5906, + "step": 1018 + }, + { + "epoch": 0.013822571893651655, + "grad_norm": 9.473942756652832, + "learning_rate": 9.963272577771688e-06, + "loss": 0.5515, + "step": 1019 + }, + { + "epoch": 0.013836136733586543, + "grad_norm": 12.988280296325684, + "learning_rate": 9.963135535151432e-06, + "loss": 0.7214, + "step": 1020 + }, + { + "epoch": 0.013849701573521432, + "grad_norm": 7.654475212097168, + "learning_rate": 9.962998492531179e-06, + "loss": 0.505, + "step": 1021 + }, + { + "epoch": 0.01386326641345632, + "grad_norm": 10.334722518920898, + "learning_rate": 9.962861449910924e-06, + "loss": 0.739, + "step": 1022 + }, + { + "epoch": 0.01387683125339121, + "grad_norm": 8.86070728302002, + "learning_rate": 9.962724407290667e-06, + "loss": 0.6027, + "step": 1023 + }, + { + "epoch": 0.0138903960933261, + "grad_norm": 8.125641822814941, + "learning_rate": 9.962587364670413e-06, + "loss": 0.6749, + "step": 1024 + }, + { + "epoch": 0.013903960933260988, + "grad_norm": 9.208808898925781, + "learning_rate": 9.96245032205016e-06, + "loss": 0.6022, + "step": 1025 + }, + { + "epoch": 0.013917525773195877, + "grad_norm": 10.276266098022461, + "learning_rate": 9.962313279429903e-06, + "loss": 0.6553, + "step": 1026 + }, + { + "epoch": 0.013931090613130765, + "grad_norm": 9.875320434570312, + "learning_rate": 9.962176236809648e-06, + "loss": 0.5355, + "step": 1027 + }, + { + "epoch": 0.013944655453065655, + "grad_norm": 9.178681373596191, + "learning_rate": 9.962039194189393e-06, + "loss": 0.568, + "step": 1028 + }, + { + "epoch": 0.013958220293000543, + "grad_norm": 10.08647346496582, + "learning_rate": 9.961902151569139e-06, + "loss": 0.6897, + "step": 1029 + }, + { + "epoch": 0.013971785132935432, + "grad_norm": 11.017881393432617, + "learning_rate": 9.961765108948884e-06, + "loss": 0.7302, + "step": 1030 + }, + { + "epoch": 0.01398534997287032, + "grad_norm": 10.513826370239258, + "learning_rate": 9.961628066328629e-06, + "loss": 0.5124, + "step": 1031 + }, + { + "epoch": 0.01399891481280521, + "grad_norm": 12.566719055175781, + "learning_rate": 9.961491023708374e-06, + "loss": 1.041, + "step": 1032 + }, + { + "epoch": 0.014012479652740098, + "grad_norm": 7.526614189147949, + "learning_rate": 9.96135398108812e-06, + "loss": 0.4681, + "step": 1033 + }, + { + "epoch": 0.014026044492674987, + "grad_norm": 10.297669410705566, + "learning_rate": 9.961216938467864e-06, + "loss": 0.6358, + "step": 1034 + }, + { + "epoch": 0.014039609332609875, + "grad_norm": 11.372967720031738, + "learning_rate": 9.96107989584761e-06, + "loss": 0.8378, + "step": 1035 + }, + { + "epoch": 0.014053174172544765, + "grad_norm": 10.149482727050781, + "learning_rate": 9.960942853227355e-06, + "loss": 0.7223, + "step": 1036 + }, + { + "epoch": 0.014066739012479653, + "grad_norm": 8.97303295135498, + "learning_rate": 9.9608058106071e-06, + "loss": 0.551, + "step": 1037 + }, + { + "epoch": 0.014080303852414542, + "grad_norm": 9.059535026550293, + "learning_rate": 9.960668767986845e-06, + "loss": 0.8421, + "step": 1038 + }, + { + "epoch": 0.01409386869234943, + "grad_norm": 12.327607154846191, + "learning_rate": 9.96053172536659e-06, + "loss": 0.6677, + "step": 1039 + }, + { + "epoch": 0.01410743353228432, + "grad_norm": 9.6196928024292, + "learning_rate": 9.960394682746335e-06, + "loss": 0.7298, + "step": 1040 + }, + { + "epoch": 0.014120998372219208, + "grad_norm": 11.276548385620117, + "learning_rate": 9.960257640126079e-06, + "loss": 0.7954, + "step": 1041 + }, + { + "epoch": 0.014134563212154097, + "grad_norm": 9.141057968139648, + "learning_rate": 9.960120597505824e-06, + "loss": 0.6987, + "step": 1042 + }, + { + "epoch": 0.014148128052088985, + "grad_norm": 10.178176879882812, + "learning_rate": 9.959983554885571e-06, + "loss": 0.7049, + "step": 1043 + }, + { + "epoch": 0.014161692892023875, + "grad_norm": 9.974624633789062, + "learning_rate": 9.959846512265316e-06, + "loss": 0.5851, + "step": 1044 + }, + { + "epoch": 0.014175257731958763, + "grad_norm": 9.622721672058105, + "learning_rate": 9.95970946964506e-06, + "loss": 0.6724, + "step": 1045 + }, + { + "epoch": 0.014188822571893652, + "grad_norm": 11.1470947265625, + "learning_rate": 9.959572427024805e-06, + "loss": 0.7835, + "step": 1046 + }, + { + "epoch": 0.01420238741182854, + "grad_norm": 11.35677433013916, + "learning_rate": 9.959435384404552e-06, + "loss": 0.6106, + "step": 1047 + }, + { + "epoch": 0.01421595225176343, + "grad_norm": 10.307941436767578, + "learning_rate": 9.959298341784295e-06, + "loss": 0.655, + "step": 1048 + }, + { + "epoch": 0.014229517091698318, + "grad_norm": 9.735111236572266, + "learning_rate": 9.95916129916404e-06, + "loss": 0.6653, + "step": 1049 + }, + { + "epoch": 0.014243081931633208, + "grad_norm": 12.6301908493042, + "learning_rate": 9.959024256543786e-06, + "loss": 0.8185, + "step": 1050 + }, + { + "epoch": 0.014256646771568095, + "grad_norm": 10.265268325805664, + "learning_rate": 9.95888721392353e-06, + "loss": 0.6743, + "step": 1051 + }, + { + "epoch": 0.014270211611502985, + "grad_norm": 10.745820999145508, + "learning_rate": 9.958750171303276e-06, + "loss": 0.8219, + "step": 1052 + }, + { + "epoch": 0.014283776451437873, + "grad_norm": 9.880329132080078, + "learning_rate": 9.958613128683021e-06, + "loss": 0.6922, + "step": 1053 + }, + { + "epoch": 0.014297341291372763, + "grad_norm": 12.3280029296875, + "learning_rate": 9.958476086062766e-06, + "loss": 0.8872, + "step": 1054 + }, + { + "epoch": 0.01431090613130765, + "grad_norm": 10.505936622619629, + "learning_rate": 9.958339043442511e-06, + "loss": 0.7959, + "step": 1055 + }, + { + "epoch": 0.01432447097124254, + "grad_norm": 9.552633285522461, + "learning_rate": 9.958202000822257e-06, + "loss": 0.6603, + "step": 1056 + }, + { + "epoch": 0.014338035811177428, + "grad_norm": 10.962300300598145, + "learning_rate": 9.958064958202002e-06, + "loss": 0.5793, + "step": 1057 + }, + { + "epoch": 0.014351600651112318, + "grad_norm": 9.987236022949219, + "learning_rate": 9.957927915581747e-06, + "loss": 0.7885, + "step": 1058 + }, + { + "epoch": 0.014365165491047206, + "grad_norm": 8.222881317138672, + "learning_rate": 9.957790872961492e-06, + "loss": 0.5749, + "step": 1059 + }, + { + "epoch": 0.014378730330982095, + "grad_norm": 6.462827682495117, + "learning_rate": 9.957653830341237e-06, + "loss": 0.5504, + "step": 1060 + }, + { + "epoch": 0.014392295170916983, + "grad_norm": 7.626131534576416, + "learning_rate": 9.957516787720983e-06, + "loss": 0.5872, + "step": 1061 + }, + { + "epoch": 0.014405860010851873, + "grad_norm": 10.603384971618652, + "learning_rate": 9.957379745100728e-06, + "loss": 0.7752, + "step": 1062 + }, + { + "epoch": 0.01441942485078676, + "grad_norm": 10.756549835205078, + "learning_rate": 9.957242702480471e-06, + "loss": 0.572, + "step": 1063 + }, + { + "epoch": 0.01443298969072165, + "grad_norm": 10.816752433776855, + "learning_rate": 9.957105659860218e-06, + "loss": 1.0295, + "step": 1064 + }, + { + "epoch": 0.014446554530656538, + "grad_norm": 8.973722457885742, + "learning_rate": 9.956968617239963e-06, + "loss": 0.5985, + "step": 1065 + }, + { + "epoch": 0.014460119370591428, + "grad_norm": 12.367846488952637, + "learning_rate": 9.956831574619707e-06, + "loss": 0.5772, + "step": 1066 + }, + { + "epoch": 0.014473684210526316, + "grad_norm": 9.173750877380371, + "learning_rate": 9.956694531999452e-06, + "loss": 0.7093, + "step": 1067 + }, + { + "epoch": 0.014487249050461205, + "grad_norm": 16.803552627563477, + "learning_rate": 9.956557489379197e-06, + "loss": 0.7741, + "step": 1068 + }, + { + "epoch": 0.014500813890396093, + "grad_norm": 7.7213592529296875, + "learning_rate": 9.956420446758944e-06, + "loss": 0.3714, + "step": 1069 + }, + { + "epoch": 0.014514378730330983, + "grad_norm": 10.083648681640625, + "learning_rate": 9.956283404138687e-06, + "loss": 0.7139, + "step": 1070 + }, + { + "epoch": 0.01452794357026587, + "grad_norm": 9.696493148803711, + "learning_rate": 9.956146361518433e-06, + "loss": 0.7354, + "step": 1071 + }, + { + "epoch": 0.01454150841020076, + "grad_norm": 9.729479789733887, + "learning_rate": 9.956009318898178e-06, + "loss": 0.5718, + "step": 1072 + }, + { + "epoch": 0.014555073250135648, + "grad_norm": 11.502948760986328, + "learning_rate": 9.955872276277923e-06, + "loss": 0.677, + "step": 1073 + }, + { + "epoch": 0.014568638090070538, + "grad_norm": 13.9326171875, + "learning_rate": 9.955735233657668e-06, + "loss": 0.7747, + "step": 1074 + }, + { + "epoch": 0.014582202930005426, + "grad_norm": 9.616620063781738, + "learning_rate": 9.955598191037413e-06, + "loss": 0.6275, + "step": 1075 + }, + { + "epoch": 0.014595767769940315, + "grad_norm": 9.455039024353027, + "learning_rate": 9.955461148417159e-06, + "loss": 0.4911, + "step": 1076 + }, + { + "epoch": 0.014609332609875203, + "grad_norm": 8.987862586975098, + "learning_rate": 9.955324105796904e-06, + "loss": 0.5281, + "step": 1077 + }, + { + "epoch": 0.014622897449810093, + "grad_norm": 11.113691329956055, + "learning_rate": 9.955187063176649e-06, + "loss": 0.6335, + "step": 1078 + }, + { + "epoch": 0.014636462289744981, + "grad_norm": 8.426775932312012, + "learning_rate": 9.955050020556394e-06, + "loss": 0.5143, + "step": 1079 + }, + { + "epoch": 0.01465002712967987, + "grad_norm": 7.055727005004883, + "learning_rate": 9.95491297793614e-06, + "loss": 0.5236, + "step": 1080 + }, + { + "epoch": 0.014663591969614758, + "grad_norm": 9.0564546585083, + "learning_rate": 9.954775935315884e-06, + "loss": 0.6738, + "step": 1081 + }, + { + "epoch": 0.014677156809549648, + "grad_norm": 10.364808082580566, + "learning_rate": 9.95463889269563e-06, + "loss": 0.5723, + "step": 1082 + }, + { + "epoch": 0.014690721649484536, + "grad_norm": 10.174819946289062, + "learning_rate": 9.954501850075375e-06, + "loss": 0.562, + "step": 1083 + }, + { + "epoch": 0.014704286489419426, + "grad_norm": 7.726447582244873, + "learning_rate": 9.954364807455118e-06, + "loss": 0.534, + "step": 1084 + }, + { + "epoch": 0.014717851329354313, + "grad_norm": 9.849736213684082, + "learning_rate": 9.954227764834863e-06, + "loss": 0.5573, + "step": 1085 + }, + { + "epoch": 0.014731416169289203, + "grad_norm": 12.000679016113281, + "learning_rate": 9.95409072221461e-06, + "loss": 0.6672, + "step": 1086 + }, + { + "epoch": 0.014744981009224091, + "grad_norm": 7.9338202476501465, + "learning_rate": 9.953953679594356e-06, + "loss": 0.5134, + "step": 1087 + }, + { + "epoch": 0.01475854584915898, + "grad_norm": 10.13784122467041, + "learning_rate": 9.953816636974099e-06, + "loss": 0.6686, + "step": 1088 + }, + { + "epoch": 0.014772110689093869, + "grad_norm": 9.697139739990234, + "learning_rate": 9.953679594353844e-06, + "loss": 0.4897, + "step": 1089 + }, + { + "epoch": 0.014785675529028758, + "grad_norm": 8.590134620666504, + "learning_rate": 9.953542551733591e-06, + "loss": 0.4271, + "step": 1090 + }, + { + "epoch": 0.014799240368963646, + "grad_norm": 8.787530899047852, + "learning_rate": 9.953405509113335e-06, + "loss": 0.4766, + "step": 1091 + }, + { + "epoch": 0.014812805208898536, + "grad_norm": 9.36292552947998, + "learning_rate": 9.95326846649308e-06, + "loss": 0.4775, + "step": 1092 + }, + { + "epoch": 0.014826370048833424, + "grad_norm": 12.158354759216309, + "learning_rate": 9.953131423872825e-06, + "loss": 0.6472, + "step": 1093 + }, + { + "epoch": 0.014839934888768313, + "grad_norm": 9.429686546325684, + "learning_rate": 9.95299438125257e-06, + "loss": 0.4675, + "step": 1094 + }, + { + "epoch": 0.014853499728703201, + "grad_norm": 11.262648582458496, + "learning_rate": 9.952857338632315e-06, + "loss": 0.5601, + "step": 1095 + }, + { + "epoch": 0.01486706456863809, + "grad_norm": 10.407562255859375, + "learning_rate": 9.95272029601206e-06, + "loss": 0.5163, + "step": 1096 + }, + { + "epoch": 0.014880629408572979, + "grad_norm": 8.313879013061523, + "learning_rate": 9.952583253391806e-06, + "loss": 0.4918, + "step": 1097 + }, + { + "epoch": 0.014894194248507868, + "grad_norm": 7.4962053298950195, + "learning_rate": 9.95244621077155e-06, + "loss": 0.4466, + "step": 1098 + }, + { + "epoch": 0.014907759088442756, + "grad_norm": 8.413519859313965, + "learning_rate": 9.952309168151296e-06, + "loss": 0.5217, + "step": 1099 + }, + { + "epoch": 0.014921323928377646, + "grad_norm": 11.49526596069336, + "learning_rate": 9.952172125531041e-06, + "loss": 0.6421, + "step": 1100 + }, + { + "epoch": 0.014934888768312534, + "grad_norm": 9.416669845581055, + "learning_rate": 9.952035082910786e-06, + "loss": 0.5589, + "step": 1101 + }, + { + "epoch": 0.014948453608247423, + "grad_norm": 7.66764497756958, + "learning_rate": 9.951898040290532e-06, + "loss": 0.6101, + "step": 1102 + }, + { + "epoch": 0.014962018448182311, + "grad_norm": 10.674516677856445, + "learning_rate": 9.951760997670277e-06, + "loss": 0.492, + "step": 1103 + }, + { + "epoch": 0.014975583288117201, + "grad_norm": 9.671248435974121, + "learning_rate": 9.951623955050022e-06, + "loss": 0.6083, + "step": 1104 + }, + { + "epoch": 0.014989148128052089, + "grad_norm": 8.396306991577148, + "learning_rate": 9.951486912429767e-06, + "loss": 0.4111, + "step": 1105 + }, + { + "epoch": 0.015002712967986978, + "grad_norm": 9.04227066040039, + "learning_rate": 9.95134986980951e-06, + "loss": 0.4549, + "step": 1106 + }, + { + "epoch": 0.015016277807921866, + "grad_norm": 11.430936813354492, + "learning_rate": 9.951212827189257e-06, + "loss": 0.6365, + "step": 1107 + }, + { + "epoch": 0.015029842647856756, + "grad_norm": 8.095125198364258, + "learning_rate": 9.951075784569003e-06, + "loss": 0.4207, + "step": 1108 + }, + { + "epoch": 0.015043407487791644, + "grad_norm": 10.55787467956543, + "learning_rate": 9.950938741948746e-06, + "loss": 0.5246, + "step": 1109 + }, + { + "epoch": 0.015056972327726533, + "grad_norm": 8.506059646606445, + "learning_rate": 9.950801699328491e-06, + "loss": 0.5966, + "step": 1110 + }, + { + "epoch": 0.015070537167661421, + "grad_norm": 9.615809440612793, + "learning_rate": 9.950664656708236e-06, + "loss": 0.6869, + "step": 1111 + }, + { + "epoch": 0.015084102007596311, + "grad_norm": 13.257489204406738, + "learning_rate": 9.950527614087983e-06, + "loss": 0.6102, + "step": 1112 + }, + { + "epoch": 0.015097666847531199, + "grad_norm": 10.756219863891602, + "learning_rate": 9.950390571467727e-06, + "loss": 0.5035, + "step": 1113 + }, + { + "epoch": 0.015111231687466089, + "grad_norm": 8.15820598602295, + "learning_rate": 9.950253528847472e-06, + "loss": 0.4859, + "step": 1114 + }, + { + "epoch": 0.015124796527400976, + "grad_norm": 9.289429664611816, + "learning_rate": 9.950116486227217e-06, + "loss": 0.5707, + "step": 1115 + }, + { + "epoch": 0.015138361367335866, + "grad_norm": 8.470824241638184, + "learning_rate": 9.949979443606962e-06, + "loss": 0.3971, + "step": 1116 + }, + { + "epoch": 0.015151926207270754, + "grad_norm": 10.33607006072998, + "learning_rate": 9.949842400986707e-06, + "loss": 0.6046, + "step": 1117 + }, + { + "epoch": 0.015165491047205644, + "grad_norm": 8.18203067779541, + "learning_rate": 9.949705358366453e-06, + "loss": 0.651, + "step": 1118 + }, + { + "epoch": 0.015179055887140532, + "grad_norm": 9.416572570800781, + "learning_rate": 9.949568315746198e-06, + "loss": 0.5913, + "step": 1119 + }, + { + "epoch": 0.015192620727075421, + "grad_norm": 8.580997467041016, + "learning_rate": 9.949431273125943e-06, + "loss": 0.742, + "step": 1120 + }, + { + "epoch": 0.015206185567010309, + "grad_norm": 8.224396705627441, + "learning_rate": 9.949294230505688e-06, + "loss": 0.7175, + "step": 1121 + }, + { + "epoch": 0.015219750406945199, + "grad_norm": 12.354378700256348, + "learning_rate": 9.949157187885433e-06, + "loss": 0.6702, + "step": 1122 + }, + { + "epoch": 0.015233315246880087, + "grad_norm": 9.650470733642578, + "learning_rate": 9.949020145265179e-06, + "loss": 0.5192, + "step": 1123 + }, + { + "epoch": 0.015246880086814976, + "grad_norm": 9.042299270629883, + "learning_rate": 9.948883102644922e-06, + "loss": 0.6337, + "step": 1124 + }, + { + "epoch": 0.015260444926749864, + "grad_norm": 12.153543472290039, + "learning_rate": 9.948746060024669e-06, + "loss": 0.6501, + "step": 1125 + }, + { + "epoch": 0.015274009766684754, + "grad_norm": 9.70035171508789, + "learning_rate": 9.948609017404414e-06, + "loss": 0.6482, + "step": 1126 + }, + { + "epoch": 0.015287574606619642, + "grad_norm": 8.97224235534668, + "learning_rate": 9.94847197478416e-06, + "loss": 0.5474, + "step": 1127 + }, + { + "epoch": 0.015301139446554531, + "grad_norm": 12.410890579223633, + "learning_rate": 9.948334932163903e-06, + "loss": 0.5759, + "step": 1128 + }, + { + "epoch": 0.01531470428648942, + "grad_norm": 11.02632999420166, + "learning_rate": 9.94819788954365e-06, + "loss": 0.6132, + "step": 1129 + }, + { + "epoch": 0.015328269126424309, + "grad_norm": 11.687317848205566, + "learning_rate": 9.948060846923395e-06, + "loss": 0.5043, + "step": 1130 + }, + { + "epoch": 0.015341833966359197, + "grad_norm": 9.231924057006836, + "learning_rate": 9.947923804303138e-06, + "loss": 0.4866, + "step": 1131 + }, + { + "epoch": 0.015355398806294086, + "grad_norm": 9.528881072998047, + "learning_rate": 9.947786761682883e-06, + "loss": 0.5355, + "step": 1132 + }, + { + "epoch": 0.015368963646228974, + "grad_norm": 10.962512969970703, + "learning_rate": 9.94764971906263e-06, + "loss": 0.4515, + "step": 1133 + }, + { + "epoch": 0.015382528486163864, + "grad_norm": 11.26285457611084, + "learning_rate": 9.947512676442374e-06, + "loss": 0.5934, + "step": 1134 + }, + { + "epoch": 0.015396093326098752, + "grad_norm": 11.362238883972168, + "learning_rate": 9.947375633822119e-06, + "loss": 0.5984, + "step": 1135 + }, + { + "epoch": 0.015409658166033641, + "grad_norm": 8.267203330993652, + "learning_rate": 9.947238591201864e-06, + "loss": 0.5483, + "step": 1136 + }, + { + "epoch": 0.01542322300596853, + "grad_norm": 7.78193473815918, + "learning_rate": 9.94710154858161e-06, + "loss": 0.4103, + "step": 1137 + }, + { + "epoch": 0.015436787845903419, + "grad_norm": 11.156635284423828, + "learning_rate": 9.946964505961355e-06, + "loss": 0.6075, + "step": 1138 + }, + { + "epoch": 0.015450352685838307, + "grad_norm": 9.064448356628418, + "learning_rate": 9.9468274633411e-06, + "loss": 0.5393, + "step": 1139 + }, + { + "epoch": 0.015463917525773196, + "grad_norm": 8.925360679626465, + "learning_rate": 9.946690420720845e-06, + "loss": 0.4331, + "step": 1140 + }, + { + "epoch": 0.015477482365708084, + "grad_norm": 11.54565143585205, + "learning_rate": 9.94655337810059e-06, + "loss": 0.6371, + "step": 1141 + }, + { + "epoch": 0.015491047205642974, + "grad_norm": 11.26870059967041, + "learning_rate": 9.946416335480335e-06, + "loss": 0.6514, + "step": 1142 + }, + { + "epoch": 0.015504612045577862, + "grad_norm": 7.8006157875061035, + "learning_rate": 9.94627929286008e-06, + "loss": 0.5568, + "step": 1143 + }, + { + "epoch": 0.015518176885512752, + "grad_norm": 8.671391487121582, + "learning_rate": 9.946142250239826e-06, + "loss": 0.5401, + "step": 1144 + }, + { + "epoch": 0.01553174172544764, + "grad_norm": 8.995376586914062, + "learning_rate": 9.94600520761957e-06, + "loss": 0.5045, + "step": 1145 + }, + { + "epoch": 0.015545306565382529, + "grad_norm": 10.155593872070312, + "learning_rate": 9.945868164999316e-06, + "loss": 0.603, + "step": 1146 + }, + { + "epoch": 0.015558871405317417, + "grad_norm": 7.274258136749268, + "learning_rate": 9.945731122379061e-06, + "loss": 0.4741, + "step": 1147 + }, + { + "epoch": 0.015572436245252307, + "grad_norm": 14.548399925231934, + "learning_rate": 9.945594079758806e-06, + "loss": 0.5413, + "step": 1148 + }, + { + "epoch": 0.015586001085187194, + "grad_norm": 10.607792854309082, + "learning_rate": 9.94545703713855e-06, + "loss": 0.4624, + "step": 1149 + }, + { + "epoch": 0.015599565925122084, + "grad_norm": 8.892013549804688, + "learning_rate": 9.945319994518297e-06, + "loss": 0.563, + "step": 1150 + }, + { + "epoch": 0.015613130765056972, + "grad_norm": 8.671589851379395, + "learning_rate": 9.945182951898042e-06, + "loss": 0.443, + "step": 1151 + }, + { + "epoch": 0.01562669560499186, + "grad_norm": 9.75113296508789, + "learning_rate": 9.945045909277787e-06, + "loss": 0.5217, + "step": 1152 + }, + { + "epoch": 0.01564026044492675, + "grad_norm": 8.981816291809082, + "learning_rate": 9.94490886665753e-06, + "loss": 0.5599, + "step": 1153 + }, + { + "epoch": 0.015653825284861637, + "grad_norm": 11.755138397216797, + "learning_rate": 9.944771824037276e-06, + "loss": 0.6923, + "step": 1154 + }, + { + "epoch": 0.015667390124796527, + "grad_norm": 9.876401901245117, + "learning_rate": 9.944634781417023e-06, + "loss": 0.6527, + "step": 1155 + }, + { + "epoch": 0.015680954964731417, + "grad_norm": 7.154882431030273, + "learning_rate": 9.944497738796766e-06, + "loss": 0.5116, + "step": 1156 + }, + { + "epoch": 0.015694519804666306, + "grad_norm": 9.054726600646973, + "learning_rate": 9.944360696176511e-06, + "loss": 0.4875, + "step": 1157 + }, + { + "epoch": 0.015708084644601192, + "grad_norm": 10.171956062316895, + "learning_rate": 9.944223653556256e-06, + "loss": 0.514, + "step": 1158 + }, + { + "epoch": 0.015721649484536082, + "grad_norm": 10.182366371154785, + "learning_rate": 9.944086610936002e-06, + "loss": 0.6025, + "step": 1159 + }, + { + "epoch": 0.015735214324470972, + "grad_norm": 8.330997467041016, + "learning_rate": 9.943949568315747e-06, + "loss": 0.5326, + "step": 1160 + }, + { + "epoch": 0.01574877916440586, + "grad_norm": 6.465672969818115, + "learning_rate": 9.943812525695492e-06, + "loss": 0.4194, + "step": 1161 + }, + { + "epoch": 0.015762344004340748, + "grad_norm": 6.516511917114258, + "learning_rate": 9.943675483075237e-06, + "loss": 0.4014, + "step": 1162 + }, + { + "epoch": 0.015775908844275637, + "grad_norm": 9.575370788574219, + "learning_rate": 9.943538440454982e-06, + "loss": 0.4957, + "step": 1163 + }, + { + "epoch": 0.015789473684210527, + "grad_norm": 9.82047176361084, + "learning_rate": 9.943401397834728e-06, + "loss": 0.5135, + "step": 1164 + }, + { + "epoch": 0.015803038524145416, + "grad_norm": 6.945639133453369, + "learning_rate": 9.943264355214473e-06, + "loss": 0.437, + "step": 1165 + }, + { + "epoch": 0.015816603364080303, + "grad_norm": 8.417868614196777, + "learning_rate": 9.943127312594218e-06, + "loss": 0.4486, + "step": 1166 + }, + { + "epoch": 0.015830168204015192, + "grad_norm": 7.835817813873291, + "learning_rate": 9.942990269973963e-06, + "loss": 0.4957, + "step": 1167 + }, + { + "epoch": 0.015843733043950082, + "grad_norm": 10.404903411865234, + "learning_rate": 9.942853227353708e-06, + "loss": 0.4902, + "step": 1168 + }, + { + "epoch": 0.01585729788388497, + "grad_norm": 7.886514186859131, + "learning_rate": 9.942716184733453e-06, + "loss": 0.3628, + "step": 1169 + }, + { + "epoch": 0.015870862723819858, + "grad_norm": 6.834493637084961, + "learning_rate": 9.942579142113199e-06, + "loss": 0.351, + "step": 1170 + }, + { + "epoch": 0.015884427563754747, + "grad_norm": 7.969992637634277, + "learning_rate": 9.942442099492942e-06, + "loss": 0.376, + "step": 1171 + }, + { + "epoch": 0.015897992403689637, + "grad_norm": 5.997768402099609, + "learning_rate": 9.942305056872689e-06, + "loss": 0.4865, + "step": 1172 + }, + { + "epoch": 0.015911557243624527, + "grad_norm": 6.358470439910889, + "learning_rate": 9.942168014252434e-06, + "loss": 0.4046, + "step": 1173 + }, + { + "epoch": 0.015925122083559413, + "grad_norm": 6.955049514770508, + "learning_rate": 9.942030971632178e-06, + "loss": 0.3483, + "step": 1174 + }, + { + "epoch": 0.015938686923494302, + "grad_norm": 7.369102478027344, + "learning_rate": 9.941893929011923e-06, + "loss": 0.3663, + "step": 1175 + }, + { + "epoch": 0.015952251763429192, + "grad_norm": 10.159394264221191, + "learning_rate": 9.94175688639167e-06, + "loss": 0.5998, + "step": 1176 + }, + { + "epoch": 0.01596581660336408, + "grad_norm": 7.168752670288086, + "learning_rate": 9.941619843771413e-06, + "loss": 0.3528, + "step": 1177 + }, + { + "epoch": 0.015979381443298968, + "grad_norm": 8.111251831054688, + "learning_rate": 9.941482801151158e-06, + "loss": 0.6514, + "step": 1178 + }, + { + "epoch": 0.015992946283233857, + "grad_norm": 5.76934289932251, + "learning_rate": 9.941345758530903e-06, + "loss": 0.2399, + "step": 1179 + }, + { + "epoch": 0.016006511123168747, + "grad_norm": 6.775606155395508, + "learning_rate": 9.941208715910649e-06, + "loss": 0.3399, + "step": 1180 + }, + { + "epoch": 0.016020075963103637, + "grad_norm": 8.829363822937012, + "learning_rate": 9.941071673290394e-06, + "loss": 0.4531, + "step": 1181 + }, + { + "epoch": 0.016033640803038523, + "grad_norm": 8.818168640136719, + "learning_rate": 9.940934630670139e-06, + "loss": 0.6485, + "step": 1182 + }, + { + "epoch": 0.016047205642973413, + "grad_norm": 8.073450088500977, + "learning_rate": 9.940797588049884e-06, + "loss": 0.5317, + "step": 1183 + }, + { + "epoch": 0.016060770482908302, + "grad_norm": 6.831442832946777, + "learning_rate": 9.94066054542963e-06, + "loss": 0.3936, + "step": 1184 + }, + { + "epoch": 0.016074335322843192, + "grad_norm": 7.418529987335205, + "learning_rate": 9.940523502809375e-06, + "loss": 0.3116, + "step": 1185 + }, + { + "epoch": 0.016087900162778078, + "grad_norm": 7.644543647766113, + "learning_rate": 9.94038646018912e-06, + "loss": 0.5441, + "step": 1186 + }, + { + "epoch": 0.016101465002712968, + "grad_norm": 9.367979049682617, + "learning_rate": 9.940249417568865e-06, + "loss": 0.4866, + "step": 1187 + }, + { + "epoch": 0.016115029842647857, + "grad_norm": 5.3822550773620605, + "learning_rate": 9.94011237494861e-06, + "loss": 0.4112, + "step": 1188 + }, + { + "epoch": 0.016128594682582747, + "grad_norm": 8.408387184143066, + "learning_rate": 9.939975332328355e-06, + "loss": 0.487, + "step": 1189 + }, + { + "epoch": 0.016142159522517633, + "grad_norm": 7.607691764831543, + "learning_rate": 9.9398382897081e-06, + "loss": 0.4014, + "step": 1190 + }, + { + "epoch": 0.016155724362452523, + "grad_norm": 6.106590270996094, + "learning_rate": 9.939701247087846e-06, + "loss": 0.3532, + "step": 1191 + }, + { + "epoch": 0.016169289202387412, + "grad_norm": 5.690080165863037, + "learning_rate": 9.939564204467589e-06, + "loss": 0.4121, + "step": 1192 + }, + { + "epoch": 0.016182854042322302, + "grad_norm": 11.751052856445312, + "learning_rate": 9.939427161847334e-06, + "loss": 0.5127, + "step": 1193 + }, + { + "epoch": 0.016196418882257188, + "grad_norm": 7.280313014984131, + "learning_rate": 9.939290119227081e-06, + "loss": 0.3257, + "step": 1194 + }, + { + "epoch": 0.016209983722192078, + "grad_norm": 8.429166793823242, + "learning_rate": 9.939153076606826e-06, + "loss": 0.4468, + "step": 1195 + }, + { + "epoch": 0.016223548562126967, + "grad_norm": 8.94950008392334, + "learning_rate": 9.93901603398657e-06, + "loss": 0.471, + "step": 1196 + }, + { + "epoch": 0.016237113402061857, + "grad_norm": 7.250218868255615, + "learning_rate": 9.938878991366315e-06, + "loss": 0.3599, + "step": 1197 + }, + { + "epoch": 0.016250678241996743, + "grad_norm": 9.06472396850586, + "learning_rate": 9.938741948746062e-06, + "loss": 0.6125, + "step": 1198 + }, + { + "epoch": 0.016264243081931633, + "grad_norm": 6.513791561126709, + "learning_rate": 9.938604906125805e-06, + "loss": 0.3759, + "step": 1199 + }, + { + "epoch": 0.016277807921866522, + "grad_norm": 8.395824432373047, + "learning_rate": 9.93846786350555e-06, + "loss": 0.64, + "step": 1200 + }, + { + "epoch": 0.016291372761801412, + "grad_norm": 8.430846214294434, + "learning_rate": 9.938330820885296e-06, + "loss": 0.4663, + "step": 1201 + }, + { + "epoch": 0.016304937601736298, + "grad_norm": 12.009727478027344, + "learning_rate": 9.938193778265041e-06, + "loss": 0.7146, + "step": 1202 + }, + { + "epoch": 0.016318502441671188, + "grad_norm": 7.626994609832764, + "learning_rate": 9.938056735644786e-06, + "loss": 0.4532, + "step": 1203 + }, + { + "epoch": 0.016332067281606077, + "grad_norm": 8.316390991210938, + "learning_rate": 9.937919693024531e-06, + "loss": 0.4454, + "step": 1204 + }, + { + "epoch": 0.016345632121540967, + "grad_norm": 6.990311622619629, + "learning_rate": 9.937782650404276e-06, + "loss": 0.4616, + "step": 1205 + }, + { + "epoch": 0.016359196961475853, + "grad_norm": 9.801228523254395, + "learning_rate": 9.937645607784022e-06, + "loss": 0.5937, + "step": 1206 + }, + { + "epoch": 0.016372761801410743, + "grad_norm": 8.944941520690918, + "learning_rate": 9.937508565163767e-06, + "loss": 0.5495, + "step": 1207 + }, + { + "epoch": 0.016386326641345633, + "grad_norm": 7.5177507400512695, + "learning_rate": 9.937371522543512e-06, + "loss": 0.5782, + "step": 1208 + }, + { + "epoch": 0.016399891481280522, + "grad_norm": 4.9404826164245605, + "learning_rate": 9.937234479923257e-06, + "loss": 0.3443, + "step": 1209 + }, + { + "epoch": 0.01641345632121541, + "grad_norm": 9.287525177001953, + "learning_rate": 9.937097437303002e-06, + "loss": 0.559, + "step": 1210 + }, + { + "epoch": 0.016427021161150298, + "grad_norm": 9.66299057006836, + "learning_rate": 9.936960394682748e-06, + "loss": 0.5492, + "step": 1211 + }, + { + "epoch": 0.016440586001085188, + "grad_norm": 7.443748474121094, + "learning_rate": 9.936823352062493e-06, + "loss": 0.4845, + "step": 1212 + }, + { + "epoch": 0.016454150841020077, + "grad_norm": 8.870261192321777, + "learning_rate": 9.936686309442238e-06, + "loss": 0.4303, + "step": 1213 + }, + { + "epoch": 0.016467715680954963, + "grad_norm": 7.891788005828857, + "learning_rate": 9.936549266821981e-06, + "loss": 0.4946, + "step": 1214 + }, + { + "epoch": 0.016481280520889853, + "grad_norm": 8.870428085327148, + "learning_rate": 9.936412224201728e-06, + "loss": 0.5432, + "step": 1215 + }, + { + "epoch": 0.016494845360824743, + "grad_norm": 7.593276023864746, + "learning_rate": 9.936275181581473e-06, + "loss": 0.4883, + "step": 1216 + }, + { + "epoch": 0.016508410200759632, + "grad_norm": 7.547611236572266, + "learning_rate": 9.936138138961217e-06, + "loss": 0.4839, + "step": 1217 + }, + { + "epoch": 0.01652197504069452, + "grad_norm": 9.54443359375, + "learning_rate": 9.936001096340962e-06, + "loss": 0.5909, + "step": 1218 + }, + { + "epoch": 0.016535539880629408, + "grad_norm": 9.170373916625977, + "learning_rate": 9.935864053720709e-06, + "loss": 0.4482, + "step": 1219 + }, + { + "epoch": 0.016549104720564298, + "grad_norm": 11.31208324432373, + "learning_rate": 9.935727011100454e-06, + "loss": 0.5691, + "step": 1220 + }, + { + "epoch": 0.016562669560499187, + "grad_norm": 9.052638053894043, + "learning_rate": 9.935589968480198e-06, + "loss": 0.4134, + "step": 1221 + }, + { + "epoch": 0.016576234400434074, + "grad_norm": 7.448655605316162, + "learning_rate": 9.935452925859943e-06, + "loss": 0.4636, + "step": 1222 + }, + { + "epoch": 0.016589799240368963, + "grad_norm": 8.177346229553223, + "learning_rate": 9.935315883239688e-06, + "loss": 0.4974, + "step": 1223 + }, + { + "epoch": 0.016603364080303853, + "grad_norm": 7.965119361877441, + "learning_rate": 9.935178840619433e-06, + "loss": 0.5963, + "step": 1224 + }, + { + "epoch": 0.016616928920238742, + "grad_norm": 5.672322750091553, + "learning_rate": 9.935041797999178e-06, + "loss": 0.3769, + "step": 1225 + }, + { + "epoch": 0.01663049376017363, + "grad_norm": 7.59871768951416, + "learning_rate": 9.934904755378924e-06, + "loss": 0.4713, + "step": 1226 + }, + { + "epoch": 0.016644058600108518, + "grad_norm": 7.745074272155762, + "learning_rate": 9.934767712758669e-06, + "loss": 0.5435, + "step": 1227 + }, + { + "epoch": 0.016657623440043408, + "grad_norm": 5.864858150482178, + "learning_rate": 9.934630670138414e-06, + "loss": 0.3163, + "step": 1228 + }, + { + "epoch": 0.016671188279978297, + "grad_norm": 6.974841117858887, + "learning_rate": 9.934493627518159e-06, + "loss": 0.4048, + "step": 1229 + }, + { + "epoch": 0.016684753119913184, + "grad_norm": 7.228716850280762, + "learning_rate": 9.934356584897904e-06, + "loss": 0.3618, + "step": 1230 + }, + { + "epoch": 0.016698317959848073, + "grad_norm": 7.441357135772705, + "learning_rate": 9.93421954227765e-06, + "loss": 0.501, + "step": 1231 + }, + { + "epoch": 0.016711882799782963, + "grad_norm": 6.647946834564209, + "learning_rate": 9.934082499657395e-06, + "loss": 0.3837, + "step": 1232 + }, + { + "epoch": 0.016725447639717853, + "grad_norm": 7.531267166137695, + "learning_rate": 9.93394545703714e-06, + "loss": 0.5742, + "step": 1233 + }, + { + "epoch": 0.01673901247965274, + "grad_norm": 8.947808265686035, + "learning_rate": 9.933808414416885e-06, + "loss": 0.4868, + "step": 1234 + }, + { + "epoch": 0.01675257731958763, + "grad_norm": 9.852073669433594, + "learning_rate": 9.93367137179663e-06, + "loss": 0.6792, + "step": 1235 + }, + { + "epoch": 0.016766142159522518, + "grad_norm": 6.282433986663818, + "learning_rate": 9.933534329176374e-06, + "loss": 0.3951, + "step": 1236 + }, + { + "epoch": 0.016779706999457408, + "grad_norm": 7.688558578491211, + "learning_rate": 9.93339728655612e-06, + "loss": 0.3875, + "step": 1237 + }, + { + "epoch": 0.016793271839392294, + "grad_norm": 7.362512111663818, + "learning_rate": 9.933260243935866e-06, + "loss": 0.3627, + "step": 1238 + }, + { + "epoch": 0.016806836679327183, + "grad_norm": 7.692780494689941, + "learning_rate": 9.933123201315609e-06, + "loss": 0.4068, + "step": 1239 + }, + { + "epoch": 0.016820401519262073, + "grad_norm": 7.557626724243164, + "learning_rate": 9.932986158695354e-06, + "loss": 0.4242, + "step": 1240 + }, + { + "epoch": 0.016833966359196963, + "grad_norm": 7.211966037750244, + "learning_rate": 9.932849116075101e-06, + "loss": 0.4827, + "step": 1241 + }, + { + "epoch": 0.01684753119913185, + "grad_norm": 7.751267910003662, + "learning_rate": 9.932712073454845e-06, + "loss": 0.4888, + "step": 1242 + }, + { + "epoch": 0.01686109603906674, + "grad_norm": 9.916178703308105, + "learning_rate": 9.93257503083459e-06, + "loss": 0.4649, + "step": 1243 + }, + { + "epoch": 0.016874660879001628, + "grad_norm": 9.745281219482422, + "learning_rate": 9.932437988214335e-06, + "loss": 0.6296, + "step": 1244 + }, + { + "epoch": 0.016888225718936518, + "grad_norm": 8.180948257446289, + "learning_rate": 9.932300945594082e-06, + "loss": 0.4875, + "step": 1245 + }, + { + "epoch": 0.016901790558871404, + "grad_norm": 6.903131484985352, + "learning_rate": 9.932163902973825e-06, + "loss": 0.44, + "step": 1246 + }, + { + "epoch": 0.016915355398806294, + "grad_norm": 7.563157558441162, + "learning_rate": 9.93202686035357e-06, + "loss": 0.3704, + "step": 1247 + }, + { + "epoch": 0.016928920238741183, + "grad_norm": 9.214814186096191, + "learning_rate": 9.931889817733316e-06, + "loss": 0.428, + "step": 1248 + }, + { + "epoch": 0.016942485078676073, + "grad_norm": 8.57235336303711, + "learning_rate": 9.931752775113061e-06, + "loss": 0.5055, + "step": 1249 + }, + { + "epoch": 0.01695604991861096, + "grad_norm": 11.63420581817627, + "learning_rate": 9.931615732492806e-06, + "loss": 0.5493, + "step": 1250 + }, + { + "epoch": 0.01696961475854585, + "grad_norm": 7.707411289215088, + "learning_rate": 9.931478689872551e-06, + "loss": 0.3918, + "step": 1251 + }, + { + "epoch": 0.016983179598480738, + "grad_norm": 7.267452239990234, + "learning_rate": 9.931341647252296e-06, + "loss": 0.4665, + "step": 1252 + }, + { + "epoch": 0.016996744438415628, + "grad_norm": 7.416873455047607, + "learning_rate": 9.931204604632042e-06, + "loss": 0.4121, + "step": 1253 + }, + { + "epoch": 0.017010309278350514, + "grad_norm": 7.779008388519287, + "learning_rate": 9.931067562011787e-06, + "loss": 0.3935, + "step": 1254 + }, + { + "epoch": 0.017023874118285404, + "grad_norm": 7.955751895904541, + "learning_rate": 9.930930519391532e-06, + "loss": 0.4653, + "step": 1255 + }, + { + "epoch": 0.017037438958220293, + "grad_norm": 9.33484935760498, + "learning_rate": 9.930793476771277e-06, + "loss": 0.5492, + "step": 1256 + }, + { + "epoch": 0.017051003798155183, + "grad_norm": 7.345424175262451, + "learning_rate": 9.93065643415102e-06, + "loss": 0.5052, + "step": 1257 + }, + { + "epoch": 0.01706456863809007, + "grad_norm": 8.924280166625977, + "learning_rate": 9.930519391530768e-06, + "loss": 0.4649, + "step": 1258 + }, + { + "epoch": 0.01707813347802496, + "grad_norm": 9.90567684173584, + "learning_rate": 9.930382348910513e-06, + "loss": 0.4372, + "step": 1259 + }, + { + "epoch": 0.01709169831795985, + "grad_norm": 8.058335304260254, + "learning_rate": 9.930245306290258e-06, + "loss": 0.4078, + "step": 1260 + }, + { + "epoch": 0.017105263157894738, + "grad_norm": 6.7840752601623535, + "learning_rate": 9.930108263670001e-06, + "loss": 0.4555, + "step": 1261 + }, + { + "epoch": 0.017118827997829624, + "grad_norm": 7.814676284790039, + "learning_rate": 9.929971221049747e-06, + "loss": 0.451, + "step": 1262 + }, + { + "epoch": 0.017132392837764514, + "grad_norm": 7.4007720947265625, + "learning_rate": 9.929834178429493e-06, + "loss": 0.3659, + "step": 1263 + }, + { + "epoch": 0.017145957677699403, + "grad_norm": 8.101563453674316, + "learning_rate": 9.929697135809237e-06, + "loss": 0.456, + "step": 1264 + }, + { + "epoch": 0.017159522517634293, + "grad_norm": 8.019292831420898, + "learning_rate": 9.929560093188982e-06, + "loss": 0.375, + "step": 1265 + }, + { + "epoch": 0.01717308735756918, + "grad_norm": 8.656462669372559, + "learning_rate": 9.929423050568727e-06, + "loss": 0.5807, + "step": 1266 + }, + { + "epoch": 0.01718665219750407, + "grad_norm": 6.4881134033203125, + "learning_rate": 9.929286007948472e-06, + "loss": 0.458, + "step": 1267 + }, + { + "epoch": 0.01720021703743896, + "grad_norm": 11.633872985839844, + "learning_rate": 9.929148965328218e-06, + "loss": 0.4073, + "step": 1268 + }, + { + "epoch": 0.017213781877373848, + "grad_norm": 7.690878868103027, + "learning_rate": 9.929011922707963e-06, + "loss": 0.3645, + "step": 1269 + }, + { + "epoch": 0.017227346717308734, + "grad_norm": 7.8148298263549805, + "learning_rate": 9.928874880087708e-06, + "loss": 0.3928, + "step": 1270 + }, + { + "epoch": 0.017240911557243624, + "grad_norm": 7.007535457611084, + "learning_rate": 9.928737837467453e-06, + "loss": 0.3912, + "step": 1271 + }, + { + "epoch": 0.017254476397178514, + "grad_norm": 7.009039878845215, + "learning_rate": 9.928600794847198e-06, + "loss": 0.4528, + "step": 1272 + }, + { + "epoch": 0.017268041237113403, + "grad_norm": 7.008871555328369, + "learning_rate": 9.928463752226944e-06, + "loss": 0.423, + "step": 1273 + }, + { + "epoch": 0.01728160607704829, + "grad_norm": 7.8382039070129395, + "learning_rate": 9.928326709606689e-06, + "loss": 0.3297, + "step": 1274 + }, + { + "epoch": 0.01729517091698318, + "grad_norm": 10.246737480163574, + "learning_rate": 9.928189666986434e-06, + "loss": 0.5127, + "step": 1275 + }, + { + "epoch": 0.01730873575691807, + "grad_norm": 6.562344074249268, + "learning_rate": 9.928052624366179e-06, + "loss": 0.3645, + "step": 1276 + }, + { + "epoch": 0.017322300596852958, + "grad_norm": 8.67978286743164, + "learning_rate": 9.927915581745924e-06, + "loss": 0.3908, + "step": 1277 + }, + { + "epoch": 0.017335865436787844, + "grad_norm": 8.602482795715332, + "learning_rate": 9.92777853912567e-06, + "loss": 0.4914, + "step": 1278 + }, + { + "epoch": 0.017349430276722734, + "grad_norm": 12.13869857788086, + "learning_rate": 9.927641496505413e-06, + "loss": 0.6272, + "step": 1279 + }, + { + "epoch": 0.017362995116657624, + "grad_norm": 8.1736478805542, + "learning_rate": 9.92750445388516e-06, + "loss": 0.4498, + "step": 1280 + }, + { + "epoch": 0.017376559956592513, + "grad_norm": 8.115334510803223, + "learning_rate": 9.927367411264905e-06, + "loss": 0.6326, + "step": 1281 + }, + { + "epoch": 0.0173901247965274, + "grad_norm": 7.3223652839660645, + "learning_rate": 9.927230368644648e-06, + "loss": 0.3429, + "step": 1282 + }, + { + "epoch": 0.01740368963646229, + "grad_norm": 8.461081504821777, + "learning_rate": 9.927093326024394e-06, + "loss": 0.5246, + "step": 1283 + }, + { + "epoch": 0.01741725447639718, + "grad_norm": 10.13033676147461, + "learning_rate": 9.92695628340414e-06, + "loss": 0.4745, + "step": 1284 + }, + { + "epoch": 0.01743081931633207, + "grad_norm": 7.797920227050781, + "learning_rate": 9.926819240783884e-06, + "loss": 0.3978, + "step": 1285 + }, + { + "epoch": 0.017444384156266955, + "grad_norm": 5.9230499267578125, + "learning_rate": 9.92668219816363e-06, + "loss": 0.2708, + "step": 1286 + }, + { + "epoch": 0.017457948996201844, + "grad_norm": 9.4562406539917, + "learning_rate": 9.926545155543374e-06, + "loss": 0.5193, + "step": 1287 + }, + { + "epoch": 0.017471513836136734, + "grad_norm": 6.172946929931641, + "learning_rate": 9.926408112923121e-06, + "loss": 0.3526, + "step": 1288 + }, + { + "epoch": 0.017485078676071623, + "grad_norm": 7.303511619567871, + "learning_rate": 9.926271070302865e-06, + "loss": 0.3829, + "step": 1289 + }, + { + "epoch": 0.01749864351600651, + "grad_norm": 10.855528831481934, + "learning_rate": 9.92613402768261e-06, + "loss": 0.6382, + "step": 1290 + }, + { + "epoch": 0.0175122083559414, + "grad_norm": 7.607392311096191, + "learning_rate": 9.925996985062355e-06, + "loss": 0.4072, + "step": 1291 + }, + { + "epoch": 0.01752577319587629, + "grad_norm": 7.54636287689209, + "learning_rate": 9.9258599424421e-06, + "loss": 0.3862, + "step": 1292 + }, + { + "epoch": 0.01753933803581118, + "grad_norm": 6.631396293640137, + "learning_rate": 9.925722899821845e-06, + "loss": 0.3918, + "step": 1293 + }, + { + "epoch": 0.017552902875746065, + "grad_norm": 7.821700572967529, + "learning_rate": 9.92558585720159e-06, + "loss": 0.41, + "step": 1294 + }, + { + "epoch": 0.017566467715680954, + "grad_norm": 10.430885314941406, + "learning_rate": 9.925448814581336e-06, + "loss": 0.6, + "step": 1295 + }, + { + "epoch": 0.017580032555615844, + "grad_norm": 7.773311614990234, + "learning_rate": 9.925311771961081e-06, + "loss": 0.4693, + "step": 1296 + }, + { + "epoch": 0.017593597395550734, + "grad_norm": 10.369359016418457, + "learning_rate": 9.925174729340826e-06, + "loss": 0.5497, + "step": 1297 + }, + { + "epoch": 0.01760716223548562, + "grad_norm": 11.12063980102539, + "learning_rate": 9.925037686720571e-06, + "loss": 0.5362, + "step": 1298 + }, + { + "epoch": 0.01762072707542051, + "grad_norm": 11.054794311523438, + "learning_rate": 9.924900644100316e-06, + "loss": 0.4868, + "step": 1299 + }, + { + "epoch": 0.0176342919153554, + "grad_norm": 10.415785789489746, + "learning_rate": 9.92476360148006e-06, + "loss": 0.5476, + "step": 1300 + }, + { + "epoch": 0.01764785675529029, + "grad_norm": 7.080949783325195, + "learning_rate": 9.924626558859807e-06, + "loss": 0.4495, + "step": 1301 + }, + { + "epoch": 0.017661421595225175, + "grad_norm": 7.0351738929748535, + "learning_rate": 9.924489516239552e-06, + "loss": 0.379, + "step": 1302 + }, + { + "epoch": 0.017674986435160064, + "grad_norm": 8.77264404296875, + "learning_rate": 9.924352473619297e-06, + "loss": 0.4487, + "step": 1303 + }, + { + "epoch": 0.017688551275094954, + "grad_norm": 8.07907772064209, + "learning_rate": 9.92421543099904e-06, + "loss": 0.4204, + "step": 1304 + }, + { + "epoch": 0.017702116115029844, + "grad_norm": 9.212056159973145, + "learning_rate": 9.924078388378786e-06, + "loss": 0.3904, + "step": 1305 + }, + { + "epoch": 0.01771568095496473, + "grad_norm": 7.853927135467529, + "learning_rate": 9.923941345758533e-06, + "loss": 0.398, + "step": 1306 + }, + { + "epoch": 0.01772924579489962, + "grad_norm": 9.621291160583496, + "learning_rate": 9.923804303138276e-06, + "loss": 0.5393, + "step": 1307 + }, + { + "epoch": 0.01774281063483451, + "grad_norm": 9.081037521362305, + "learning_rate": 9.923667260518021e-06, + "loss": 0.5393, + "step": 1308 + }, + { + "epoch": 0.0177563754747694, + "grad_norm": 7.244717597961426, + "learning_rate": 9.923530217897767e-06, + "loss": 0.3953, + "step": 1309 + }, + { + "epoch": 0.017769940314704285, + "grad_norm": 9.250866889953613, + "learning_rate": 9.923393175277512e-06, + "loss": 0.4133, + "step": 1310 + }, + { + "epoch": 0.017783505154639175, + "grad_norm": 10.438037872314453, + "learning_rate": 9.923256132657257e-06, + "loss": 0.6171, + "step": 1311 + }, + { + "epoch": 0.017797069994574064, + "grad_norm": 7.493050575256348, + "learning_rate": 9.923119090037002e-06, + "loss": 0.3847, + "step": 1312 + }, + { + "epoch": 0.017810634834508954, + "grad_norm": 9.735554695129395, + "learning_rate": 9.922982047416747e-06, + "loss": 0.4911, + "step": 1313 + }, + { + "epoch": 0.01782419967444384, + "grad_norm": 8.777170181274414, + "learning_rate": 9.922845004796492e-06, + "loss": 0.5314, + "step": 1314 + }, + { + "epoch": 0.01783776451437873, + "grad_norm": 14.987051963806152, + "learning_rate": 9.922707962176238e-06, + "loss": 0.9789, + "step": 1315 + }, + { + "epoch": 0.01785132935431362, + "grad_norm": 10.136876106262207, + "learning_rate": 9.922570919555983e-06, + "loss": 0.4572, + "step": 1316 + }, + { + "epoch": 0.01786489419424851, + "grad_norm": 10.369184494018555, + "learning_rate": 9.922433876935728e-06, + "loss": 0.6679, + "step": 1317 + }, + { + "epoch": 0.017878459034183395, + "grad_norm": 10.972477912902832, + "learning_rate": 9.922296834315473e-06, + "loss": 0.6319, + "step": 1318 + }, + { + "epoch": 0.017892023874118285, + "grad_norm": 5.561295509338379, + "learning_rate": 9.922159791695218e-06, + "loss": 0.3676, + "step": 1319 + }, + { + "epoch": 0.017905588714053174, + "grad_norm": 8.989480018615723, + "learning_rate": 9.922022749074964e-06, + "loss": 0.4487, + "step": 1320 + }, + { + "epoch": 0.017919153553988064, + "grad_norm": 12.18389892578125, + "learning_rate": 9.921885706454709e-06, + "loss": 0.5966, + "step": 1321 + }, + { + "epoch": 0.01793271839392295, + "grad_norm": 12.116415977478027, + "learning_rate": 9.921748663834452e-06, + "loss": 0.5144, + "step": 1322 + }, + { + "epoch": 0.01794628323385784, + "grad_norm": 10.763056755065918, + "learning_rate": 9.921611621214199e-06, + "loss": 0.475, + "step": 1323 + }, + { + "epoch": 0.01795984807379273, + "grad_norm": 11.977811813354492, + "learning_rate": 9.921474578593944e-06, + "loss": 0.5767, + "step": 1324 + }, + { + "epoch": 0.01797341291372762, + "grad_norm": 11.171073913574219, + "learning_rate": 9.921337535973688e-06, + "loss": 0.5664, + "step": 1325 + }, + { + "epoch": 0.017986977753662505, + "grad_norm": 7.794250011444092, + "learning_rate": 9.921200493353433e-06, + "loss": 0.3794, + "step": 1326 + }, + { + "epoch": 0.018000542593597395, + "grad_norm": 11.47274112701416, + "learning_rate": 9.92106345073318e-06, + "loss": 0.4382, + "step": 1327 + }, + { + "epoch": 0.018014107433532284, + "grad_norm": 8.128451347351074, + "learning_rate": 9.920926408112925e-06, + "loss": 0.3861, + "step": 1328 + }, + { + "epoch": 0.018027672273467174, + "grad_norm": 10.650931358337402, + "learning_rate": 9.920789365492668e-06, + "loss": 0.5415, + "step": 1329 + }, + { + "epoch": 0.01804123711340206, + "grad_norm": 10.945813179016113, + "learning_rate": 9.920652322872414e-06, + "loss": 0.5758, + "step": 1330 + }, + { + "epoch": 0.01805480195333695, + "grad_norm": 7.864620685577393, + "learning_rate": 9.920515280252159e-06, + "loss": 0.4921, + "step": 1331 + }, + { + "epoch": 0.01806836679327184, + "grad_norm": 11.037473678588867, + "learning_rate": 9.920378237631904e-06, + "loss": 0.6177, + "step": 1332 + }, + { + "epoch": 0.01808193163320673, + "grad_norm": 8.378573417663574, + "learning_rate": 9.92024119501165e-06, + "loss": 0.5739, + "step": 1333 + }, + { + "epoch": 0.018095496473141615, + "grad_norm": 10.051932334899902, + "learning_rate": 9.920104152391394e-06, + "loss": 0.4741, + "step": 1334 + }, + { + "epoch": 0.018109061313076505, + "grad_norm": 8.168304443359375, + "learning_rate": 9.91996710977114e-06, + "loss": 0.4891, + "step": 1335 + }, + { + "epoch": 0.018122626153011395, + "grad_norm": 8.354684829711914, + "learning_rate": 9.919830067150885e-06, + "loss": 0.4372, + "step": 1336 + }, + { + "epoch": 0.018136190992946284, + "grad_norm": 9.77490520477295, + "learning_rate": 9.91969302453063e-06, + "loss": 0.5998, + "step": 1337 + }, + { + "epoch": 0.01814975583288117, + "grad_norm": 14.385671615600586, + "learning_rate": 9.919555981910375e-06, + "loss": 0.6648, + "step": 1338 + }, + { + "epoch": 0.01816332067281606, + "grad_norm": 11.147967338562012, + "learning_rate": 9.91941893929012e-06, + "loss": 0.6632, + "step": 1339 + }, + { + "epoch": 0.01817688551275095, + "grad_norm": 13.484060287475586, + "learning_rate": 9.919281896669865e-06, + "loss": 0.8357, + "step": 1340 + }, + { + "epoch": 0.01819045035268584, + "grad_norm": 9.055566787719727, + "learning_rate": 9.91914485404961e-06, + "loss": 0.4871, + "step": 1341 + }, + { + "epoch": 0.018204015192620725, + "grad_norm": 11.211419105529785, + "learning_rate": 9.919007811429356e-06, + "loss": 0.666, + "step": 1342 + }, + { + "epoch": 0.018217580032555615, + "grad_norm": 8.611656188964844, + "learning_rate": 9.918870768809101e-06, + "loss": 0.4888, + "step": 1343 + }, + { + "epoch": 0.018231144872490505, + "grad_norm": 8.744932174682617, + "learning_rate": 9.918733726188844e-06, + "loss": 0.4902, + "step": 1344 + }, + { + "epoch": 0.018244709712425394, + "grad_norm": 14.03650951385498, + "learning_rate": 9.918596683568591e-06, + "loss": 0.7726, + "step": 1345 + }, + { + "epoch": 0.01825827455236028, + "grad_norm": 7.358846187591553, + "learning_rate": 9.918459640948337e-06, + "loss": 0.3805, + "step": 1346 + }, + { + "epoch": 0.01827183939229517, + "grad_norm": 10.868315696716309, + "learning_rate": 9.91832259832808e-06, + "loss": 0.7644, + "step": 1347 + }, + { + "epoch": 0.01828540423223006, + "grad_norm": 9.55472183227539, + "learning_rate": 9.918185555707825e-06, + "loss": 0.68, + "step": 1348 + }, + { + "epoch": 0.01829896907216495, + "grad_norm": 12.45634937286377, + "learning_rate": 9.918048513087572e-06, + "loss": 0.8103, + "step": 1349 + }, + { + "epoch": 0.018312533912099836, + "grad_norm": 6.8960185050964355, + "learning_rate": 9.917911470467316e-06, + "loss": 0.4549, + "step": 1350 + }, + { + "epoch": 0.018326098752034725, + "grad_norm": 9.113168716430664, + "learning_rate": 9.91777442784706e-06, + "loss": 0.5029, + "step": 1351 + }, + { + "epoch": 0.018339663591969615, + "grad_norm": 9.485050201416016, + "learning_rate": 9.917637385226806e-06, + "loss": 0.5184, + "step": 1352 + }, + { + "epoch": 0.018353228431904504, + "grad_norm": 7.892512321472168, + "learning_rate": 9.917500342606553e-06, + "loss": 0.456, + "step": 1353 + }, + { + "epoch": 0.01836679327183939, + "grad_norm": 9.844609260559082, + "learning_rate": 9.917363299986296e-06, + "loss": 0.494, + "step": 1354 + }, + { + "epoch": 0.01838035811177428, + "grad_norm": 10.845649719238281, + "learning_rate": 9.917226257366041e-06, + "loss": 0.7133, + "step": 1355 + }, + { + "epoch": 0.01839392295170917, + "grad_norm": 10.350767135620117, + "learning_rate": 9.917089214745787e-06, + "loss": 0.6283, + "step": 1356 + }, + { + "epoch": 0.01840748779164406, + "grad_norm": 8.38359260559082, + "learning_rate": 9.916952172125532e-06, + "loss": 0.5112, + "step": 1357 + }, + { + "epoch": 0.018421052631578946, + "grad_norm": 7.515103816986084, + "learning_rate": 9.916815129505277e-06, + "loss": 0.5295, + "step": 1358 + }, + { + "epoch": 0.018434617471513835, + "grad_norm": 10.114617347717285, + "learning_rate": 9.916678086885022e-06, + "loss": 0.6111, + "step": 1359 + }, + { + "epoch": 0.018448182311448725, + "grad_norm": 7.841200351715088, + "learning_rate": 9.916541044264767e-06, + "loss": 0.5065, + "step": 1360 + }, + { + "epoch": 0.018461747151383615, + "grad_norm": 14.176812171936035, + "learning_rate": 9.916404001644513e-06, + "loss": 0.6529, + "step": 1361 + }, + { + "epoch": 0.0184753119913185, + "grad_norm": 8.587362289428711, + "learning_rate": 9.916266959024258e-06, + "loss": 0.5456, + "step": 1362 + }, + { + "epoch": 0.01848887683125339, + "grad_norm": 10.534039497375488, + "learning_rate": 9.916129916404003e-06, + "loss": 0.5069, + "step": 1363 + }, + { + "epoch": 0.01850244167118828, + "grad_norm": 7.789964199066162, + "learning_rate": 9.915992873783748e-06, + "loss": 0.4764, + "step": 1364 + }, + { + "epoch": 0.01851600651112317, + "grad_norm": 10.04820728302002, + "learning_rate": 9.915855831163492e-06, + "loss": 0.6671, + "step": 1365 + }, + { + "epoch": 0.01852957135105806, + "grad_norm": 9.536649703979492, + "learning_rate": 9.915718788543238e-06, + "loss": 0.5025, + "step": 1366 + }, + { + "epoch": 0.018543136190992945, + "grad_norm": 8.573095321655273, + "learning_rate": 9.915581745922984e-06, + "loss": 0.5327, + "step": 1367 + }, + { + "epoch": 0.018556701030927835, + "grad_norm": 11.048916816711426, + "learning_rate": 9.915444703302727e-06, + "loss": 0.6948, + "step": 1368 + }, + { + "epoch": 0.018570265870862725, + "grad_norm": 11.098137855529785, + "learning_rate": 9.915307660682472e-06, + "loss": 0.4529, + "step": 1369 + }, + { + "epoch": 0.018583830710797614, + "grad_norm": 9.225614547729492, + "learning_rate": 9.915170618062219e-06, + "loss": 0.4199, + "step": 1370 + }, + { + "epoch": 0.0185973955507325, + "grad_norm": 8.796092987060547, + "learning_rate": 9.915033575441964e-06, + "loss": 0.5358, + "step": 1371 + }, + { + "epoch": 0.01861096039066739, + "grad_norm": 10.177997589111328, + "learning_rate": 9.914896532821708e-06, + "loss": 0.6011, + "step": 1372 + }, + { + "epoch": 0.01862452523060228, + "grad_norm": 11.551985740661621, + "learning_rate": 9.914759490201453e-06, + "loss": 0.6587, + "step": 1373 + }, + { + "epoch": 0.01863809007053717, + "grad_norm": 8.512029647827148, + "learning_rate": 9.914622447581198e-06, + "loss": 0.5407, + "step": 1374 + }, + { + "epoch": 0.018651654910472056, + "grad_norm": 12.762154579162598, + "learning_rate": 9.914485404960943e-06, + "loss": 0.5407, + "step": 1375 + }, + { + "epoch": 0.018665219750406945, + "grad_norm": 11.62285327911377, + "learning_rate": 9.914348362340688e-06, + "loss": 0.613, + "step": 1376 + }, + { + "epoch": 0.018678784590341835, + "grad_norm": 9.503084182739258, + "learning_rate": 9.914211319720434e-06, + "loss": 0.4501, + "step": 1377 + }, + { + "epoch": 0.018692349430276724, + "grad_norm": 8.48514461517334, + "learning_rate": 9.914074277100179e-06, + "loss": 0.6255, + "step": 1378 + }, + { + "epoch": 0.01870591427021161, + "grad_norm": 10.331761360168457, + "learning_rate": 9.913937234479924e-06, + "loss": 0.5836, + "step": 1379 + }, + { + "epoch": 0.0187194791101465, + "grad_norm": 9.3951997756958, + "learning_rate": 9.91380019185967e-06, + "loss": 0.4938, + "step": 1380 + }, + { + "epoch": 0.01873304395008139, + "grad_norm": 12.574335098266602, + "learning_rate": 9.913663149239414e-06, + "loss": 0.8711, + "step": 1381 + }, + { + "epoch": 0.01874660879001628, + "grad_norm": 9.837108612060547, + "learning_rate": 9.91352610661916e-06, + "loss": 0.7786, + "step": 1382 + }, + { + "epoch": 0.018760173629951166, + "grad_norm": 9.244203567504883, + "learning_rate": 9.913389063998905e-06, + "loss": 0.5481, + "step": 1383 + }, + { + "epoch": 0.018773738469886055, + "grad_norm": 9.47103500366211, + "learning_rate": 9.91325202137865e-06, + "loss": 0.4482, + "step": 1384 + }, + { + "epoch": 0.018787303309820945, + "grad_norm": 8.284035682678223, + "learning_rate": 9.913114978758395e-06, + "loss": 0.4484, + "step": 1385 + }, + { + "epoch": 0.018800868149755835, + "grad_norm": 5.2767744064331055, + "learning_rate": 9.91297793613814e-06, + "loss": 0.3506, + "step": 1386 + }, + { + "epoch": 0.01881443298969072, + "grad_norm": 9.62224006652832, + "learning_rate": 9.912840893517884e-06, + "loss": 0.6029, + "step": 1387 + }, + { + "epoch": 0.01882799782962561, + "grad_norm": 9.716911315917969, + "learning_rate": 9.91270385089763e-06, + "loss": 0.5847, + "step": 1388 + }, + { + "epoch": 0.0188415626695605, + "grad_norm": 10.050057411193848, + "learning_rate": 9.912566808277376e-06, + "loss": 0.6974, + "step": 1389 + }, + { + "epoch": 0.01885512750949539, + "grad_norm": 11.846632957458496, + "learning_rate": 9.91242976565712e-06, + "loss": 0.7102, + "step": 1390 + }, + { + "epoch": 0.018868692349430276, + "grad_norm": 8.277639389038086, + "learning_rate": 9.912292723036864e-06, + "loss": 0.5205, + "step": 1391 + }, + { + "epoch": 0.018882257189365165, + "grad_norm": 8.733463287353516, + "learning_rate": 9.912155680416611e-06, + "loss": 0.5623, + "step": 1392 + }, + { + "epoch": 0.018895822029300055, + "grad_norm": 7.457735061645508, + "learning_rate": 9.912018637796355e-06, + "loss": 0.4823, + "step": 1393 + }, + { + "epoch": 0.018909386869234945, + "grad_norm": 9.394886016845703, + "learning_rate": 9.9118815951761e-06, + "loss": 0.6196, + "step": 1394 + }, + { + "epoch": 0.01892295170916983, + "grad_norm": 11.796133995056152, + "learning_rate": 9.911744552555845e-06, + "loss": 0.6305, + "step": 1395 + }, + { + "epoch": 0.01893651654910472, + "grad_norm": 14.040828704833984, + "learning_rate": 9.911607509935592e-06, + "loss": 0.6857, + "step": 1396 + }, + { + "epoch": 0.01895008138903961, + "grad_norm": 10.615224838256836, + "learning_rate": 9.911470467315336e-06, + "loss": 0.6561, + "step": 1397 + }, + { + "epoch": 0.0189636462289745, + "grad_norm": 8.675980567932129, + "learning_rate": 9.91133342469508e-06, + "loss": 0.5333, + "step": 1398 + }, + { + "epoch": 0.018977211068909386, + "grad_norm": 9.351827621459961, + "learning_rate": 9.911196382074826e-06, + "loss": 0.5872, + "step": 1399 + }, + { + "epoch": 0.018990775908844276, + "grad_norm": 7.693256855010986, + "learning_rate": 9.911059339454571e-06, + "loss": 0.4061, + "step": 1400 + }, + { + "epoch": 0.019004340748779165, + "grad_norm": 8.351990699768066, + "learning_rate": 9.910922296834316e-06, + "loss": 0.3499, + "step": 1401 + }, + { + "epoch": 0.019017905588714055, + "grad_norm": 11.395855903625488, + "learning_rate": 9.910785254214061e-06, + "loss": 0.6039, + "step": 1402 + }, + { + "epoch": 0.01903147042864894, + "grad_norm": 12.576519012451172, + "learning_rate": 9.910648211593807e-06, + "loss": 0.7695, + "step": 1403 + }, + { + "epoch": 0.01904503526858383, + "grad_norm": 9.88300895690918, + "learning_rate": 9.910511168973552e-06, + "loss": 0.5424, + "step": 1404 + }, + { + "epoch": 0.01905860010851872, + "grad_norm": 15.659361839294434, + "learning_rate": 9.910374126353297e-06, + "loss": 0.7825, + "step": 1405 + }, + { + "epoch": 0.01907216494845361, + "grad_norm": 14.127354621887207, + "learning_rate": 9.910237083733042e-06, + "loss": 0.5815, + "step": 1406 + }, + { + "epoch": 0.019085729788388496, + "grad_norm": 12.741280555725098, + "learning_rate": 9.910100041112787e-06, + "loss": 0.7161, + "step": 1407 + }, + { + "epoch": 0.019099294628323386, + "grad_norm": 8.508400917053223, + "learning_rate": 9.90996299849253e-06, + "loss": 0.51, + "step": 1408 + }, + { + "epoch": 0.019112859468258275, + "grad_norm": 9.543746948242188, + "learning_rate": 9.909825955872278e-06, + "loss": 0.5076, + "step": 1409 + }, + { + "epoch": 0.019126424308193165, + "grad_norm": 7.5257248878479, + "learning_rate": 9.909688913252023e-06, + "loss": 0.5517, + "step": 1410 + }, + { + "epoch": 0.01913998914812805, + "grad_norm": 10.631339073181152, + "learning_rate": 9.909551870631768e-06, + "loss": 0.4707, + "step": 1411 + }, + { + "epoch": 0.01915355398806294, + "grad_norm": 10.853009223937988, + "learning_rate": 9.909414828011512e-06, + "loss": 0.6248, + "step": 1412 + }, + { + "epoch": 0.01916711882799783, + "grad_norm": 9.104799270629883, + "learning_rate": 9.909277785391257e-06, + "loss": 0.5229, + "step": 1413 + }, + { + "epoch": 0.01918068366793272, + "grad_norm": 10.746611595153809, + "learning_rate": 9.909140742771004e-06, + "loss": 0.6158, + "step": 1414 + }, + { + "epoch": 0.019194248507867606, + "grad_norm": 12.193516731262207, + "learning_rate": 9.909003700150747e-06, + "loss": 0.7328, + "step": 1415 + }, + { + "epoch": 0.019207813347802496, + "grad_norm": 12.200885772705078, + "learning_rate": 9.908866657530492e-06, + "loss": 0.676, + "step": 1416 + }, + { + "epoch": 0.019221378187737385, + "grad_norm": 8.976455688476562, + "learning_rate": 9.908729614910237e-06, + "loss": 0.4279, + "step": 1417 + }, + { + "epoch": 0.019234943027672275, + "grad_norm": 15.59479808807373, + "learning_rate": 9.908592572289983e-06, + "loss": 0.6255, + "step": 1418 + }, + { + "epoch": 0.01924850786760716, + "grad_norm": 12.440356254577637, + "learning_rate": 9.908455529669728e-06, + "loss": 0.7587, + "step": 1419 + }, + { + "epoch": 0.01926207270754205, + "grad_norm": 10.544697761535645, + "learning_rate": 9.908318487049473e-06, + "loss": 0.5479, + "step": 1420 + }, + { + "epoch": 0.01927563754747694, + "grad_norm": 8.9188814163208, + "learning_rate": 9.908181444429218e-06, + "loss": 0.5158, + "step": 1421 + }, + { + "epoch": 0.01928920238741183, + "grad_norm": 11.99893569946289, + "learning_rate": 9.908044401808963e-06, + "loss": 0.7535, + "step": 1422 + }, + { + "epoch": 0.019302767227346716, + "grad_norm": 10.324481964111328, + "learning_rate": 9.907907359188709e-06, + "loss": 0.5565, + "step": 1423 + }, + { + "epoch": 0.019316332067281606, + "grad_norm": 9.512728691101074, + "learning_rate": 9.907770316568454e-06, + "loss": 0.8355, + "step": 1424 + }, + { + "epoch": 0.019329896907216496, + "grad_norm": 9.267844200134277, + "learning_rate": 9.907633273948199e-06, + "loss": 0.5815, + "step": 1425 + }, + { + "epoch": 0.019343461747151385, + "grad_norm": 8.9126615524292, + "learning_rate": 9.907496231327944e-06, + "loss": 0.6043, + "step": 1426 + }, + { + "epoch": 0.01935702658708627, + "grad_norm": 8.16275691986084, + "learning_rate": 9.90735918870769e-06, + "loss": 0.5935, + "step": 1427 + }, + { + "epoch": 0.01937059142702116, + "grad_norm": 10.281341552734375, + "learning_rate": 9.907222146087434e-06, + "loss": 0.5952, + "step": 1428 + }, + { + "epoch": 0.01938415626695605, + "grad_norm": 9.193373680114746, + "learning_rate": 9.90708510346718e-06, + "loss": 0.551, + "step": 1429 + }, + { + "epoch": 0.01939772110689094, + "grad_norm": 11.814682006835938, + "learning_rate": 9.906948060846923e-06, + "loss": 0.6423, + "step": 1430 + }, + { + "epoch": 0.019411285946825826, + "grad_norm": 9.726921081542969, + "learning_rate": 9.90681101822667e-06, + "loss": 0.664, + "step": 1431 + }, + { + "epoch": 0.019424850786760716, + "grad_norm": 12.540231704711914, + "learning_rate": 9.906673975606415e-06, + "loss": 0.7081, + "step": 1432 + }, + { + "epoch": 0.019438415626695606, + "grad_norm": 9.4815092086792, + "learning_rate": 9.906536932986159e-06, + "loss": 0.498, + "step": 1433 + }, + { + "epoch": 0.019451980466630495, + "grad_norm": 8.652584075927734, + "learning_rate": 9.906399890365904e-06, + "loss": 0.4886, + "step": 1434 + }, + { + "epoch": 0.01946554530656538, + "grad_norm": 11.34803295135498, + "learning_rate": 9.90626284774565e-06, + "loss": 0.6013, + "step": 1435 + }, + { + "epoch": 0.01947911014650027, + "grad_norm": 8.909708023071289, + "learning_rate": 9.906125805125396e-06, + "loss": 0.5329, + "step": 1436 + }, + { + "epoch": 0.01949267498643516, + "grad_norm": 10.569543838500977, + "learning_rate": 9.90598876250514e-06, + "loss": 0.5471, + "step": 1437 + }, + { + "epoch": 0.01950623982637005, + "grad_norm": 9.413229942321777, + "learning_rate": 9.905851719884885e-06, + "loss": 0.4995, + "step": 1438 + }, + { + "epoch": 0.019519804666304937, + "grad_norm": 10.609535217285156, + "learning_rate": 9.905714677264631e-06, + "loss": 0.7724, + "step": 1439 + }, + { + "epoch": 0.019533369506239826, + "grad_norm": 11.432806015014648, + "learning_rate": 9.905577634644375e-06, + "loss": 0.7327, + "step": 1440 + }, + { + "epoch": 0.019546934346174716, + "grad_norm": 10.667911529541016, + "learning_rate": 9.90544059202412e-06, + "loss": 0.5868, + "step": 1441 + }, + { + "epoch": 0.019560499186109605, + "grad_norm": 8.583703994750977, + "learning_rate": 9.905303549403865e-06, + "loss": 0.5778, + "step": 1442 + }, + { + "epoch": 0.01957406402604449, + "grad_norm": 10.466143608093262, + "learning_rate": 9.90516650678361e-06, + "loss": 0.6351, + "step": 1443 + }, + { + "epoch": 0.01958762886597938, + "grad_norm": 11.188680648803711, + "learning_rate": 9.905029464163356e-06, + "loss": 0.5692, + "step": 1444 + }, + { + "epoch": 0.01960119370591427, + "grad_norm": 10.853035926818848, + "learning_rate": 9.9048924215431e-06, + "loss": 0.675, + "step": 1445 + }, + { + "epoch": 0.01961475854584916, + "grad_norm": 8.221631050109863, + "learning_rate": 9.904755378922846e-06, + "loss": 0.7414, + "step": 1446 + }, + { + "epoch": 0.019628323385784047, + "grad_norm": 7.8576483726501465, + "learning_rate": 9.904618336302591e-06, + "loss": 0.5152, + "step": 1447 + }, + { + "epoch": 0.019641888225718936, + "grad_norm": 10.68525218963623, + "learning_rate": 9.904481293682336e-06, + "loss": 0.6551, + "step": 1448 + }, + { + "epoch": 0.019655453065653826, + "grad_norm": 8.449806213378906, + "learning_rate": 9.904344251062081e-06, + "loss": 0.5075, + "step": 1449 + }, + { + "epoch": 0.019669017905588716, + "grad_norm": 6.754986763000488, + "learning_rate": 9.904207208441827e-06, + "loss": 0.3941, + "step": 1450 + }, + { + "epoch": 0.019682582745523602, + "grad_norm": 10.868943214416504, + "learning_rate": 9.904070165821572e-06, + "loss": 0.6048, + "step": 1451 + }, + { + "epoch": 0.01969614758545849, + "grad_norm": 9.51048469543457, + "learning_rate": 9.903933123201317e-06, + "loss": 0.4294, + "step": 1452 + }, + { + "epoch": 0.01970971242539338, + "grad_norm": 7.478344917297363, + "learning_rate": 9.903796080581062e-06, + "loss": 0.4568, + "step": 1453 + }, + { + "epoch": 0.01972327726532827, + "grad_norm": 9.578357696533203, + "learning_rate": 9.903659037960807e-06, + "loss": 0.4779, + "step": 1454 + }, + { + "epoch": 0.019736842105263157, + "grad_norm": 7.627854824066162, + "learning_rate": 9.903521995340551e-06, + "loss": 0.493, + "step": 1455 + }, + { + "epoch": 0.019750406945198046, + "grad_norm": 8.487030029296875, + "learning_rate": 9.903384952720296e-06, + "loss": 0.4559, + "step": 1456 + }, + { + "epoch": 0.019763971785132936, + "grad_norm": 9.19349479675293, + "learning_rate": 9.903247910100043e-06, + "loss": 0.6155, + "step": 1457 + }, + { + "epoch": 0.019777536625067826, + "grad_norm": 9.953857421875, + "learning_rate": 9.903110867479786e-06, + "loss": 0.4479, + "step": 1458 + }, + { + "epoch": 0.019791101465002712, + "grad_norm": 7.651879787445068, + "learning_rate": 9.902973824859532e-06, + "loss": 0.5432, + "step": 1459 + }, + { + "epoch": 0.0198046663049376, + "grad_norm": 9.48837947845459, + "learning_rate": 9.902836782239277e-06, + "loss": 0.592, + "step": 1460 + }, + { + "epoch": 0.01981823114487249, + "grad_norm": 9.862255096435547, + "learning_rate": 9.902699739619022e-06, + "loss": 0.481, + "step": 1461 + }, + { + "epoch": 0.01983179598480738, + "grad_norm": 10.303238868713379, + "learning_rate": 9.902562696998767e-06, + "loss": 0.525, + "step": 1462 + }, + { + "epoch": 0.019845360824742267, + "grad_norm": 9.108551025390625, + "learning_rate": 9.902425654378512e-06, + "loss": 0.4839, + "step": 1463 + }, + { + "epoch": 0.019858925664677157, + "grad_norm": 10.508820533752441, + "learning_rate": 9.902288611758257e-06, + "loss": 0.5794, + "step": 1464 + }, + { + "epoch": 0.019872490504612046, + "grad_norm": 7.103573322296143, + "learning_rate": 9.902151569138003e-06, + "loss": 0.3892, + "step": 1465 + }, + { + "epoch": 0.019886055344546936, + "grad_norm": 7.71785831451416, + "learning_rate": 9.902014526517748e-06, + "loss": 0.4966, + "step": 1466 + }, + { + "epoch": 0.019899620184481822, + "grad_norm": 10.681404113769531, + "learning_rate": 9.901877483897493e-06, + "loss": 0.5445, + "step": 1467 + }, + { + "epoch": 0.01991318502441671, + "grad_norm": 8.622056007385254, + "learning_rate": 9.901740441277238e-06, + "loss": 0.6554, + "step": 1468 + }, + { + "epoch": 0.0199267498643516, + "grad_norm": 10.423203468322754, + "learning_rate": 9.901603398656983e-06, + "loss": 0.5505, + "step": 1469 + }, + { + "epoch": 0.01994031470428649, + "grad_norm": 6.383313179016113, + "learning_rate": 9.901466356036729e-06, + "loss": 0.4474, + "step": 1470 + }, + { + "epoch": 0.019953879544221377, + "grad_norm": 8.957576751708984, + "learning_rate": 9.901329313416474e-06, + "loss": 0.4855, + "step": 1471 + }, + { + "epoch": 0.019967444384156267, + "grad_norm": 8.899796485900879, + "learning_rate": 9.901192270796219e-06, + "loss": 0.4382, + "step": 1472 + }, + { + "epoch": 0.019981009224091156, + "grad_norm": 6.560460090637207, + "learning_rate": 9.901055228175962e-06, + "loss": 0.3409, + "step": 1473 + }, + { + "epoch": 0.019994574064026046, + "grad_norm": 7.418312072753906, + "learning_rate": 9.90091818555571e-06, + "loss": 0.5153, + "step": 1474 + }, + { + "epoch": 0.020008138903960932, + "grad_norm": 6.319566249847412, + "learning_rate": 9.900781142935454e-06, + "loss": 0.2534, + "step": 1475 + }, + { + "epoch": 0.020021703743895822, + "grad_norm": 8.334843635559082, + "learning_rate": 9.900644100315198e-06, + "loss": 0.5104, + "step": 1476 + }, + { + "epoch": 0.02003526858383071, + "grad_norm": 11.526978492736816, + "learning_rate": 9.900507057694943e-06, + "loss": 0.6299, + "step": 1477 + }, + { + "epoch": 0.0200488334237656, + "grad_norm": 8.292689323425293, + "learning_rate": 9.90037001507469e-06, + "loss": 0.4384, + "step": 1478 + }, + { + "epoch": 0.020062398263700487, + "grad_norm": 8.635262489318848, + "learning_rate": 9.900232972454435e-06, + "loss": 0.7269, + "step": 1479 + }, + { + "epoch": 0.020075963103635377, + "grad_norm": 8.290156364440918, + "learning_rate": 9.900095929834179e-06, + "loss": 0.4978, + "step": 1480 + }, + { + "epoch": 0.020089527943570266, + "grad_norm": 9.950539588928223, + "learning_rate": 9.899958887213924e-06, + "loss": 0.5407, + "step": 1481 + }, + { + "epoch": 0.020103092783505156, + "grad_norm": 8.686315536499023, + "learning_rate": 9.899821844593669e-06, + "loss": 0.5834, + "step": 1482 + }, + { + "epoch": 0.020116657623440042, + "grad_norm": 9.254131317138672, + "learning_rate": 9.899684801973414e-06, + "loss": 0.4792, + "step": 1483 + }, + { + "epoch": 0.020130222463374932, + "grad_norm": 11.05291748046875, + "learning_rate": 9.89954775935316e-06, + "loss": 0.5539, + "step": 1484 + }, + { + "epoch": 0.02014378730330982, + "grad_norm": 7.554414749145508, + "learning_rate": 9.899410716732905e-06, + "loss": 0.4654, + "step": 1485 + }, + { + "epoch": 0.02015735214324471, + "grad_norm": 6.7753496170043945, + "learning_rate": 9.89927367411265e-06, + "loss": 0.4215, + "step": 1486 + }, + { + "epoch": 0.020170916983179597, + "grad_norm": 7.892093181610107, + "learning_rate": 9.899136631492395e-06, + "loss": 0.3992, + "step": 1487 + }, + { + "epoch": 0.020184481823114487, + "grad_norm": 7.468218803405762, + "learning_rate": 9.89899958887214e-06, + "loss": 0.4711, + "step": 1488 + }, + { + "epoch": 0.020198046663049377, + "grad_norm": 11.288993835449219, + "learning_rate": 9.898862546251885e-06, + "loss": 0.6988, + "step": 1489 + }, + { + "epoch": 0.020211611502984266, + "grad_norm": 8.754590034484863, + "learning_rate": 9.89872550363163e-06, + "loss": 0.4878, + "step": 1490 + }, + { + "epoch": 0.020225176342919152, + "grad_norm": 8.392120361328125, + "learning_rate": 9.898588461011376e-06, + "loss": 0.4407, + "step": 1491 + }, + { + "epoch": 0.020238741182854042, + "grad_norm": 7.065241813659668, + "learning_rate": 9.89845141839112e-06, + "loss": 0.4507, + "step": 1492 + }, + { + "epoch": 0.02025230602278893, + "grad_norm": 8.016024589538574, + "learning_rate": 9.898314375770866e-06, + "loss": 0.4094, + "step": 1493 + }, + { + "epoch": 0.02026587086272382, + "grad_norm": 10.180988311767578, + "learning_rate": 9.898177333150611e-06, + "loss": 0.5932, + "step": 1494 + }, + { + "epoch": 0.020279435702658707, + "grad_norm": 10.149967193603516, + "learning_rate": 9.898040290530355e-06, + "loss": 0.5695, + "step": 1495 + }, + { + "epoch": 0.020293000542593597, + "grad_norm": 7.135706424713135, + "learning_rate": 9.897903247910101e-06, + "loss": 0.5985, + "step": 1496 + }, + { + "epoch": 0.020306565382528487, + "grad_norm": 9.994475364685059, + "learning_rate": 9.897766205289847e-06, + "loss": 0.7443, + "step": 1497 + }, + { + "epoch": 0.020320130222463376, + "grad_norm": 7.014841556549072, + "learning_rate": 9.89762916266959e-06, + "loss": 0.4455, + "step": 1498 + }, + { + "epoch": 0.020333695062398262, + "grad_norm": 8.133296012878418, + "learning_rate": 9.897492120049335e-06, + "loss": 0.6037, + "step": 1499 + }, + { + "epoch": 0.020347259902333152, + "grad_norm": 7.709084510803223, + "learning_rate": 9.897355077429082e-06, + "loss": 0.5047, + "step": 1500 + }, + { + "epoch": 0.020360824742268042, + "grad_norm": 8.012962341308594, + "learning_rate": 9.897218034808826e-06, + "loss": 0.5213, + "step": 1501 + }, + { + "epoch": 0.02037438958220293, + "grad_norm": 9.919090270996094, + "learning_rate": 9.897080992188571e-06, + "loss": 0.4996, + "step": 1502 + }, + { + "epoch": 0.020387954422137818, + "grad_norm": 8.63750171661377, + "learning_rate": 9.896943949568316e-06, + "loss": 0.6228, + "step": 1503 + }, + { + "epoch": 0.020401519262072707, + "grad_norm": 11.592391967773438, + "learning_rate": 9.896806906948063e-06, + "loss": 0.5698, + "step": 1504 + }, + { + "epoch": 0.020415084102007597, + "grad_norm": 13.14859676361084, + "learning_rate": 9.896669864327806e-06, + "loss": 0.6534, + "step": 1505 + }, + { + "epoch": 0.020428648941942486, + "grad_norm": 10.187736511230469, + "learning_rate": 9.896532821707552e-06, + "loss": 0.6579, + "step": 1506 + }, + { + "epoch": 0.020442213781877373, + "grad_norm": 9.742892265319824, + "learning_rate": 9.896395779087297e-06, + "loss": 0.6211, + "step": 1507 + }, + { + "epoch": 0.020455778621812262, + "grad_norm": 6.923844814300537, + "learning_rate": 9.896258736467042e-06, + "loss": 0.4352, + "step": 1508 + }, + { + "epoch": 0.020469343461747152, + "grad_norm": 7.654608726501465, + "learning_rate": 9.896121693846787e-06, + "loss": 0.4969, + "step": 1509 + }, + { + "epoch": 0.02048290830168204, + "grad_norm": 7.904272079467773, + "learning_rate": 9.895984651226532e-06, + "loss": 0.344, + "step": 1510 + }, + { + "epoch": 0.020496473141616928, + "grad_norm": 7.742966175079346, + "learning_rate": 9.895847608606277e-06, + "loss": 0.5381, + "step": 1511 + }, + { + "epoch": 0.020510037981551817, + "grad_norm": 7.471368312835693, + "learning_rate": 9.895710565986023e-06, + "loss": 0.5984, + "step": 1512 + }, + { + "epoch": 0.020523602821486707, + "grad_norm": 7.382142543792725, + "learning_rate": 9.895573523365768e-06, + "loss": 0.5284, + "step": 1513 + }, + { + "epoch": 0.020537167661421597, + "grad_norm": 9.8018159866333, + "learning_rate": 9.895436480745513e-06, + "loss": 0.5983, + "step": 1514 + }, + { + "epoch": 0.020550732501356483, + "grad_norm": 9.663667678833008, + "learning_rate": 9.895299438125258e-06, + "loss": 0.6159, + "step": 1515 + }, + { + "epoch": 0.020564297341291372, + "grad_norm": 7.4841132164001465, + "learning_rate": 9.895162395505002e-06, + "loss": 0.5217, + "step": 1516 + }, + { + "epoch": 0.020577862181226262, + "grad_norm": 11.48484992980957, + "learning_rate": 9.895025352884749e-06, + "loss": 0.7976, + "step": 1517 + }, + { + "epoch": 0.02059142702116115, + "grad_norm": 8.228372573852539, + "learning_rate": 9.894888310264494e-06, + "loss": 0.4532, + "step": 1518 + }, + { + "epoch": 0.020604991861096038, + "grad_norm": 8.417984962463379, + "learning_rate": 9.894751267644239e-06, + "loss": 0.4683, + "step": 1519 + }, + { + "epoch": 0.020618556701030927, + "grad_norm": 7.508738040924072, + "learning_rate": 9.894614225023982e-06, + "loss": 0.536, + "step": 1520 + }, + { + "epoch": 0.020632121540965817, + "grad_norm": 7.559261322021484, + "learning_rate": 9.89447718240373e-06, + "loss": 0.4799, + "step": 1521 + }, + { + "epoch": 0.020645686380900707, + "grad_norm": 9.143153190612793, + "learning_rate": 9.894340139783474e-06, + "loss": 0.4372, + "step": 1522 + }, + { + "epoch": 0.020659251220835593, + "grad_norm": 11.554931640625, + "learning_rate": 9.894203097163218e-06, + "loss": 0.7439, + "step": 1523 + }, + { + "epoch": 0.020672816060770483, + "grad_norm": 9.780343055725098, + "learning_rate": 9.894066054542963e-06, + "loss": 0.7261, + "step": 1524 + }, + { + "epoch": 0.020686380900705372, + "grad_norm": 10.395544052124023, + "learning_rate": 9.893929011922708e-06, + "loss": 0.7486, + "step": 1525 + }, + { + "epoch": 0.020699945740640262, + "grad_norm": 7.96922492980957, + "learning_rate": 9.893791969302453e-06, + "loss": 0.5898, + "step": 1526 + }, + { + "epoch": 0.020713510580575148, + "grad_norm": 10.448512077331543, + "learning_rate": 9.893654926682199e-06, + "loss": 0.6247, + "step": 1527 + }, + { + "epoch": 0.020727075420510038, + "grad_norm": 9.791532516479492, + "learning_rate": 9.893517884061944e-06, + "loss": 0.6038, + "step": 1528 + }, + { + "epoch": 0.020740640260444927, + "grad_norm": 10.08679485321045, + "learning_rate": 9.893380841441689e-06, + "loss": 0.6682, + "step": 1529 + }, + { + "epoch": 0.020754205100379817, + "grad_norm": 10.283010482788086, + "learning_rate": 9.893243798821434e-06, + "loss": 0.7288, + "step": 1530 + }, + { + "epoch": 0.020767769940314703, + "grad_norm": 7.729386806488037, + "learning_rate": 9.89310675620118e-06, + "loss": 0.5112, + "step": 1531 + }, + { + "epoch": 0.020781334780249593, + "grad_norm": 8.185969352722168, + "learning_rate": 9.892969713580925e-06, + "loss": 0.3796, + "step": 1532 + }, + { + "epoch": 0.020794899620184482, + "grad_norm": 6.503942966461182, + "learning_rate": 9.89283267096067e-06, + "loss": 0.417, + "step": 1533 + }, + { + "epoch": 0.020808464460119372, + "grad_norm": 7.126759052276611, + "learning_rate": 9.892695628340415e-06, + "loss": 0.6023, + "step": 1534 + }, + { + "epoch": 0.020822029300054258, + "grad_norm": 7.82424259185791, + "learning_rate": 9.89255858572016e-06, + "loss": 0.5313, + "step": 1535 + }, + { + "epoch": 0.020835594139989148, + "grad_norm": 8.11983585357666, + "learning_rate": 9.892421543099905e-06, + "loss": 0.5019, + "step": 1536 + }, + { + "epoch": 0.020849158979924037, + "grad_norm": 8.873526573181152, + "learning_rate": 9.89228450047965e-06, + "loss": 0.5015, + "step": 1537 + }, + { + "epoch": 0.020862723819858927, + "grad_norm": 7.774203777313232, + "learning_rate": 9.892147457859394e-06, + "loss": 0.5047, + "step": 1538 + }, + { + "epoch": 0.020876288659793813, + "grad_norm": 6.072090148925781, + "learning_rate": 9.89201041523914e-06, + "loss": 0.391, + "step": 1539 + }, + { + "epoch": 0.020889853499728703, + "grad_norm": 8.668652534484863, + "learning_rate": 9.891873372618886e-06, + "loss": 0.564, + "step": 1540 + }, + { + "epoch": 0.020903418339663592, + "grad_norm": 7.186413764953613, + "learning_rate": 9.89173632999863e-06, + "loss": 0.4955, + "step": 1541 + }, + { + "epoch": 0.020916983179598482, + "grad_norm": 10.218061447143555, + "learning_rate": 9.891599287378375e-06, + "loss": 0.5539, + "step": 1542 + }, + { + "epoch": 0.020930548019533368, + "grad_norm": 8.001330375671387, + "learning_rate": 9.891462244758122e-06, + "loss": 0.5128, + "step": 1543 + }, + { + "epoch": 0.020944112859468258, + "grad_norm": 11.733591079711914, + "learning_rate": 9.891325202137867e-06, + "loss": 0.8777, + "step": 1544 + }, + { + "epoch": 0.020957677699403147, + "grad_norm": 6.905190944671631, + "learning_rate": 9.89118815951761e-06, + "loss": 0.6893, + "step": 1545 + }, + { + "epoch": 0.020971242539338037, + "grad_norm": 11.484460830688477, + "learning_rate": 9.891051116897355e-06, + "loss": 0.6865, + "step": 1546 + }, + { + "epoch": 0.020984807379272923, + "grad_norm": 6.848311424255371, + "learning_rate": 9.890914074277102e-06, + "loss": 0.4589, + "step": 1547 + }, + { + "epoch": 0.020998372219207813, + "grad_norm": 9.122644424438477, + "learning_rate": 9.890777031656846e-06, + "loss": 0.6664, + "step": 1548 + }, + { + "epoch": 0.021011937059142703, + "grad_norm": 9.846575736999512, + "learning_rate": 9.890639989036591e-06, + "loss": 0.8818, + "step": 1549 + }, + { + "epoch": 0.021025501899077592, + "grad_norm": 8.29969596862793, + "learning_rate": 9.890502946416336e-06, + "loss": 0.7295, + "step": 1550 + }, + { + "epoch": 0.02103906673901248, + "grad_norm": 7.323514938354492, + "learning_rate": 9.890365903796081e-06, + "loss": 0.5551, + "step": 1551 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 6.75976037979126, + "learning_rate": 9.890228861175826e-06, + "loss": 0.3497, + "step": 1552 + }, + { + "epoch": 0.021066196418882258, + "grad_norm": 8.923369407653809, + "learning_rate": 9.890091818555572e-06, + "loss": 0.5572, + "step": 1553 + }, + { + "epoch": 0.021079761258817147, + "grad_norm": 8.972877502441406, + "learning_rate": 9.889954775935317e-06, + "loss": 0.5428, + "step": 1554 + }, + { + "epoch": 0.021093326098752033, + "grad_norm": 7.902328014373779, + "learning_rate": 9.889817733315062e-06, + "loss": 0.3918, + "step": 1555 + }, + { + "epoch": 0.021106890938686923, + "grad_norm": 6.842109203338623, + "learning_rate": 9.889680690694807e-06, + "loss": 0.4786, + "step": 1556 + }, + { + "epoch": 0.021120455778621813, + "grad_norm": 9.268389701843262, + "learning_rate": 9.889543648074552e-06, + "loss": 0.5532, + "step": 1557 + }, + { + "epoch": 0.021134020618556702, + "grad_norm": 8.372587203979492, + "learning_rate": 9.889406605454297e-06, + "loss": 0.5423, + "step": 1558 + }, + { + "epoch": 0.02114758545849159, + "grad_norm": 9.80628776550293, + "learning_rate": 9.889269562834041e-06, + "loss": 0.6618, + "step": 1559 + }, + { + "epoch": 0.021161150298426478, + "grad_norm": 8.411478042602539, + "learning_rate": 9.889132520213788e-06, + "loss": 0.6078, + "step": 1560 + }, + { + "epoch": 0.021174715138361368, + "grad_norm": 7.159600734710693, + "learning_rate": 9.888995477593533e-06, + "loss": 0.4421, + "step": 1561 + }, + { + "epoch": 0.021188279978296257, + "grad_norm": 7.710400104522705, + "learning_rate": 9.888858434973278e-06, + "loss": 0.5572, + "step": 1562 + }, + { + "epoch": 0.021201844818231144, + "grad_norm": 9.96301555633545, + "learning_rate": 9.888721392353022e-06, + "loss": 0.6988, + "step": 1563 + }, + { + "epoch": 0.021215409658166033, + "grad_norm": 4.745579719543457, + "learning_rate": 9.888584349732767e-06, + "loss": 0.3555, + "step": 1564 + }, + { + "epoch": 0.021228974498100923, + "grad_norm": 7.724310398101807, + "learning_rate": 9.888447307112514e-06, + "loss": 0.5751, + "step": 1565 + }, + { + "epoch": 0.021242539338035812, + "grad_norm": 8.619111061096191, + "learning_rate": 9.888310264492257e-06, + "loss": 0.5162, + "step": 1566 + }, + { + "epoch": 0.0212561041779707, + "grad_norm": 11.318082809448242, + "learning_rate": 9.888173221872002e-06, + "loss": 0.5747, + "step": 1567 + }, + { + "epoch": 0.021269669017905588, + "grad_norm": 8.530882835388184, + "learning_rate": 9.888036179251748e-06, + "loss": 0.4313, + "step": 1568 + }, + { + "epoch": 0.021283233857840478, + "grad_norm": 7.733654499053955, + "learning_rate": 9.887899136631493e-06, + "loss": 0.5405, + "step": 1569 + }, + { + "epoch": 0.021296798697775367, + "grad_norm": 7.186324596405029, + "learning_rate": 9.887762094011238e-06, + "loss": 0.3582, + "step": 1570 + }, + { + "epoch": 0.021310363537710254, + "grad_norm": 8.344192504882812, + "learning_rate": 9.887625051390983e-06, + "loss": 0.6054, + "step": 1571 + }, + { + "epoch": 0.021323928377645143, + "grad_norm": 8.83353328704834, + "learning_rate": 9.887488008770728e-06, + "loss": 0.6268, + "step": 1572 + }, + { + "epoch": 0.021337493217580033, + "grad_norm": 8.825469017028809, + "learning_rate": 9.887350966150473e-06, + "loss": 0.4609, + "step": 1573 + }, + { + "epoch": 0.021351058057514923, + "grad_norm": 6.755374908447266, + "learning_rate": 9.887213923530219e-06, + "loss": 0.428, + "step": 1574 + }, + { + "epoch": 0.02136462289744981, + "grad_norm": 7.300180435180664, + "learning_rate": 9.887076880909964e-06, + "loss": 0.6107, + "step": 1575 + }, + { + "epoch": 0.0213781877373847, + "grad_norm": 7.595290660858154, + "learning_rate": 9.886939838289709e-06, + "loss": 0.5725, + "step": 1576 + }, + { + "epoch": 0.021391752577319588, + "grad_norm": 8.693061828613281, + "learning_rate": 9.886802795669454e-06, + "loss": 0.4038, + "step": 1577 + }, + { + "epoch": 0.021405317417254478, + "grad_norm": 5.423738479614258, + "learning_rate": 9.8866657530492e-06, + "loss": 0.4415, + "step": 1578 + }, + { + "epoch": 0.021418882257189364, + "grad_norm": 7.295645236968994, + "learning_rate": 9.886528710428945e-06, + "loss": 0.4642, + "step": 1579 + }, + { + "epoch": 0.021432447097124253, + "grad_norm": 6.425195693969727, + "learning_rate": 9.88639166780869e-06, + "loss": 0.4448, + "step": 1580 + }, + { + "epoch": 0.021446011937059143, + "grad_norm": 7.320648193359375, + "learning_rate": 9.886254625188433e-06, + "loss": 0.3699, + "step": 1581 + }, + { + "epoch": 0.021459576776994033, + "grad_norm": 8.849261283874512, + "learning_rate": 9.88611758256818e-06, + "loss": 0.4052, + "step": 1582 + }, + { + "epoch": 0.02147314161692892, + "grad_norm": 7.2577080726623535, + "learning_rate": 9.885980539947925e-06, + "loss": 0.4595, + "step": 1583 + }, + { + "epoch": 0.02148670645686381, + "grad_norm": 10.072991371154785, + "learning_rate": 9.885843497327669e-06, + "loss": 0.6376, + "step": 1584 + }, + { + "epoch": 0.021500271296798698, + "grad_norm": 7.561297416687012, + "learning_rate": 9.885706454707414e-06, + "loss": 0.4906, + "step": 1585 + }, + { + "epoch": 0.021513836136733588, + "grad_norm": 8.303655624389648, + "learning_rate": 9.88556941208716e-06, + "loss": 0.5833, + "step": 1586 + }, + { + "epoch": 0.021527400976668474, + "grad_norm": 6.3671064376831055, + "learning_rate": 9.885432369466906e-06, + "loss": 0.3583, + "step": 1587 + }, + { + "epoch": 0.021540965816603364, + "grad_norm": 7.863743782043457, + "learning_rate": 9.88529532684665e-06, + "loss": 0.4624, + "step": 1588 + }, + { + "epoch": 0.021554530656538253, + "grad_norm": 13.374011039733887, + "learning_rate": 9.885158284226395e-06, + "loss": 0.5748, + "step": 1589 + }, + { + "epoch": 0.021568095496473143, + "grad_norm": 7.014529705047607, + "learning_rate": 9.885021241606142e-06, + "loss": 0.4637, + "step": 1590 + }, + { + "epoch": 0.02158166033640803, + "grad_norm": 7.851729393005371, + "learning_rate": 9.884884198985885e-06, + "loss": 0.4414, + "step": 1591 + }, + { + "epoch": 0.02159522517634292, + "grad_norm": 8.913524627685547, + "learning_rate": 9.88474715636563e-06, + "loss": 0.48, + "step": 1592 + }, + { + "epoch": 0.021608790016277808, + "grad_norm": 5.454252243041992, + "learning_rate": 9.884610113745375e-06, + "loss": 0.4581, + "step": 1593 + }, + { + "epoch": 0.021622354856212698, + "grad_norm": 7.313579559326172, + "learning_rate": 9.88447307112512e-06, + "loss": 0.5275, + "step": 1594 + }, + { + "epoch": 0.021635919696147584, + "grad_norm": 6.9123334884643555, + "learning_rate": 9.884336028504866e-06, + "loss": 0.3787, + "step": 1595 + }, + { + "epoch": 0.021649484536082474, + "grad_norm": 6.8960065841674805, + "learning_rate": 9.884198985884611e-06, + "loss": 0.4861, + "step": 1596 + }, + { + "epoch": 0.021663049376017363, + "grad_norm": 7.385406970977783, + "learning_rate": 9.884061943264356e-06, + "loss": 0.3662, + "step": 1597 + }, + { + "epoch": 0.021676614215952253, + "grad_norm": 8.003942489624023, + "learning_rate": 9.883924900644101e-06, + "loss": 0.5845, + "step": 1598 + }, + { + "epoch": 0.02169017905588714, + "grad_norm": 9.606952667236328, + "learning_rate": 9.883787858023846e-06, + "loss": 0.7209, + "step": 1599 + }, + { + "epoch": 0.02170374389582203, + "grad_norm": 6.4962158203125, + "learning_rate": 9.883650815403592e-06, + "loss": 0.3625, + "step": 1600 + }, + { + "epoch": 0.02171730873575692, + "grad_norm": 8.42620849609375, + "learning_rate": 9.883513772783337e-06, + "loss": 0.3607, + "step": 1601 + }, + { + "epoch": 0.021730873575691808, + "grad_norm": 7.531658172607422, + "learning_rate": 9.883376730163082e-06, + "loss": 0.3686, + "step": 1602 + }, + { + "epoch": 0.021744438415626694, + "grad_norm": 6.723302841186523, + "learning_rate": 9.883239687542827e-06, + "loss": 0.3689, + "step": 1603 + }, + { + "epoch": 0.021758003255561584, + "grad_norm": 7.366732120513916, + "learning_rate": 9.883102644922572e-06, + "loss": 0.3646, + "step": 1604 + }, + { + "epoch": 0.021771568095496473, + "grad_norm": 8.453022003173828, + "learning_rate": 9.882965602302318e-06, + "loss": 0.5229, + "step": 1605 + }, + { + "epoch": 0.021785132935431363, + "grad_norm": 6.444901466369629, + "learning_rate": 9.882828559682061e-06, + "loss": 0.2732, + "step": 1606 + }, + { + "epoch": 0.02179869777536625, + "grad_norm": 6.398029804229736, + "learning_rate": 9.882691517061806e-06, + "loss": 0.2971, + "step": 1607 + }, + { + "epoch": 0.02181226261530114, + "grad_norm": 7.864733695983887, + "learning_rate": 9.882554474441553e-06, + "loss": 0.3659, + "step": 1608 + }, + { + "epoch": 0.02182582745523603, + "grad_norm": 7.267504692077637, + "learning_rate": 9.882417431821297e-06, + "loss": 0.3762, + "step": 1609 + }, + { + "epoch": 0.021839392295170918, + "grad_norm": 6.556393623352051, + "learning_rate": 9.882280389201042e-06, + "loss": 0.3189, + "step": 1610 + }, + { + "epoch": 0.021852957135105804, + "grad_norm": 5.86617374420166, + "learning_rate": 9.882143346580787e-06, + "loss": 0.3805, + "step": 1611 + }, + { + "epoch": 0.021866521975040694, + "grad_norm": 6.718401908874512, + "learning_rate": 9.882006303960534e-06, + "loss": 0.3179, + "step": 1612 + }, + { + "epoch": 0.021880086814975584, + "grad_norm": 6.230490207672119, + "learning_rate": 9.881869261340277e-06, + "loss": 0.3355, + "step": 1613 + }, + { + "epoch": 0.021893651654910473, + "grad_norm": 6.70961856842041, + "learning_rate": 9.881732218720022e-06, + "loss": 0.4008, + "step": 1614 + }, + { + "epoch": 0.02190721649484536, + "grad_norm": 7.182063579559326, + "learning_rate": 9.881595176099768e-06, + "loss": 0.462, + "step": 1615 + }, + { + "epoch": 0.02192078133478025, + "grad_norm": 6.234583854675293, + "learning_rate": 9.881458133479513e-06, + "loss": 0.3682, + "step": 1616 + }, + { + "epoch": 0.02193434617471514, + "grad_norm": 7.814955711364746, + "learning_rate": 9.881321090859258e-06, + "loss": 0.342, + "step": 1617 + }, + { + "epoch": 0.021947911014650028, + "grad_norm": 7.25799560546875, + "learning_rate": 9.881184048239003e-06, + "loss": 0.3851, + "step": 1618 + }, + { + "epoch": 0.021961475854584914, + "grad_norm": 10.37250804901123, + "learning_rate": 9.881047005618748e-06, + "loss": 0.4951, + "step": 1619 + }, + { + "epoch": 0.021975040694519804, + "grad_norm": 6.718634605407715, + "learning_rate": 9.880909962998494e-06, + "loss": 0.3581, + "step": 1620 + }, + { + "epoch": 0.021988605534454694, + "grad_norm": 6.898249626159668, + "learning_rate": 9.880772920378239e-06, + "loss": 0.2678, + "step": 1621 + }, + { + "epoch": 0.022002170374389583, + "grad_norm": 9.463942527770996, + "learning_rate": 9.880635877757984e-06, + "loss": 0.4892, + "step": 1622 + }, + { + "epoch": 0.02201573521432447, + "grad_norm": 8.322531700134277, + "learning_rate": 9.880498835137729e-06, + "loss": 0.3967, + "step": 1623 + }, + { + "epoch": 0.02202930005425936, + "grad_norm": 7.672345161437988, + "learning_rate": 9.880361792517473e-06, + "loss": 0.4155, + "step": 1624 + }, + { + "epoch": 0.02204286489419425, + "grad_norm": 8.024246215820312, + "learning_rate": 9.88022474989722e-06, + "loss": 0.5171, + "step": 1625 + }, + { + "epoch": 0.02205642973412914, + "grad_norm": 8.013132095336914, + "learning_rate": 9.880087707276965e-06, + "loss": 0.4512, + "step": 1626 + }, + { + "epoch": 0.022069994574064025, + "grad_norm": 6.885051250457764, + "learning_rate": 9.87995066465671e-06, + "loss": 0.4934, + "step": 1627 + }, + { + "epoch": 0.022083559413998914, + "grad_norm": 7.661969184875488, + "learning_rate": 9.879813622036453e-06, + "loss": 0.3087, + "step": 1628 + }, + { + "epoch": 0.022097124253933804, + "grad_norm": 10.104140281677246, + "learning_rate": 9.8796765794162e-06, + "loss": 0.6461, + "step": 1629 + }, + { + "epoch": 0.022110689093868693, + "grad_norm": 8.896557807922363, + "learning_rate": 9.879539536795945e-06, + "loss": 0.4799, + "step": 1630 + }, + { + "epoch": 0.02212425393380358, + "grad_norm": 8.90781021118164, + "learning_rate": 9.879402494175689e-06, + "loss": 0.5756, + "step": 1631 + }, + { + "epoch": 0.02213781877373847, + "grad_norm": 6.391355991363525, + "learning_rate": 9.879265451555434e-06, + "loss": 0.3844, + "step": 1632 + }, + { + "epoch": 0.02215138361367336, + "grad_norm": 9.388507843017578, + "learning_rate": 9.879128408935179e-06, + "loss": 0.5452, + "step": 1633 + }, + { + "epoch": 0.02216494845360825, + "grad_norm": 7.579108238220215, + "learning_rate": 9.878991366314924e-06, + "loss": 0.4166, + "step": 1634 + }, + { + "epoch": 0.022178513293543135, + "grad_norm": 8.704465866088867, + "learning_rate": 9.87885432369467e-06, + "loss": 0.3818, + "step": 1635 + }, + { + "epoch": 0.022192078133478024, + "grad_norm": 9.078969955444336, + "learning_rate": 9.878717281074415e-06, + "loss": 0.603, + "step": 1636 + }, + { + "epoch": 0.022205642973412914, + "grad_norm": 6.965075969696045, + "learning_rate": 9.87858023845416e-06, + "loss": 0.3996, + "step": 1637 + }, + { + "epoch": 0.022219207813347804, + "grad_norm": 8.824549674987793, + "learning_rate": 9.878443195833905e-06, + "loss": 0.4398, + "step": 1638 + }, + { + "epoch": 0.02223277265328269, + "grad_norm": 7.689955711364746, + "learning_rate": 9.87830615321365e-06, + "loss": 0.5854, + "step": 1639 + }, + { + "epoch": 0.02224633749321758, + "grad_norm": 7.147670269012451, + "learning_rate": 9.878169110593395e-06, + "loss": 0.4303, + "step": 1640 + }, + { + "epoch": 0.02225990233315247, + "grad_norm": 9.028977394104004, + "learning_rate": 9.87803206797314e-06, + "loss": 0.4257, + "step": 1641 + }, + { + "epoch": 0.02227346717308736, + "grad_norm": 7.3207621574401855, + "learning_rate": 9.877895025352886e-06, + "loss": 0.5107, + "step": 1642 + }, + { + "epoch": 0.022287032013022245, + "grad_norm": 7.111095905303955, + "learning_rate": 9.877757982732631e-06, + "loss": 0.4061, + "step": 1643 + }, + { + "epoch": 0.022300596852957134, + "grad_norm": 6.389806747436523, + "learning_rate": 9.877620940112376e-06, + "loss": 0.3208, + "step": 1644 + }, + { + "epoch": 0.022314161692892024, + "grad_norm": 10.551167488098145, + "learning_rate": 9.877483897492121e-06, + "loss": 0.4442, + "step": 1645 + }, + { + "epoch": 0.022327726532826914, + "grad_norm": 7.994392395019531, + "learning_rate": 9.877346854871866e-06, + "loss": 0.6103, + "step": 1646 + }, + { + "epoch": 0.0223412913727618, + "grad_norm": 7.056299686431885, + "learning_rate": 9.877209812251612e-06, + "loss": 0.3838, + "step": 1647 + }, + { + "epoch": 0.02235485621269669, + "grad_norm": 8.609375953674316, + "learning_rate": 9.877072769631357e-06, + "loss": 0.4534, + "step": 1648 + }, + { + "epoch": 0.02236842105263158, + "grad_norm": 9.271498680114746, + "learning_rate": 9.8769357270111e-06, + "loss": 0.5017, + "step": 1649 + }, + { + "epoch": 0.02238198589256647, + "grad_norm": 9.700071334838867, + "learning_rate": 9.876798684390845e-06, + "loss": 0.5659, + "step": 1650 + }, + { + "epoch": 0.022395550732501355, + "grad_norm": 8.552181243896484, + "learning_rate": 9.876661641770592e-06, + "loss": 0.4607, + "step": 1651 + }, + { + "epoch": 0.022409115572436245, + "grad_norm": 8.963132858276367, + "learning_rate": 9.876524599150336e-06, + "loss": 0.3872, + "step": 1652 + }, + { + "epoch": 0.022422680412371134, + "grad_norm": 6.521531105041504, + "learning_rate": 9.876387556530081e-06, + "loss": 0.2622, + "step": 1653 + }, + { + "epoch": 0.022436245252306024, + "grad_norm": 8.64516830444336, + "learning_rate": 9.876250513909826e-06, + "loss": 0.5327, + "step": 1654 + }, + { + "epoch": 0.02244981009224091, + "grad_norm": 8.970751762390137, + "learning_rate": 9.876113471289573e-06, + "loss": 0.4696, + "step": 1655 + }, + { + "epoch": 0.0224633749321758, + "grad_norm": 10.191628456115723, + "learning_rate": 9.875976428669317e-06, + "loss": 0.4137, + "step": 1656 + }, + { + "epoch": 0.02247693977211069, + "grad_norm": 9.928622245788574, + "learning_rate": 9.875839386049062e-06, + "loss": 0.5412, + "step": 1657 + }, + { + "epoch": 0.02249050461204558, + "grad_norm": 8.106282234191895, + "learning_rate": 9.875702343428807e-06, + "loss": 0.3695, + "step": 1658 + }, + { + "epoch": 0.022504069451980465, + "grad_norm": 10.571444511413574, + "learning_rate": 9.875565300808552e-06, + "loss": 0.6405, + "step": 1659 + }, + { + "epoch": 0.022517634291915355, + "grad_norm": 9.9916353225708, + "learning_rate": 9.875428258188297e-06, + "loss": 0.5684, + "step": 1660 + }, + { + "epoch": 0.022531199131850244, + "grad_norm": 9.80500602722168, + "learning_rate": 9.875291215568042e-06, + "loss": 0.4772, + "step": 1661 + }, + { + "epoch": 0.022544763971785134, + "grad_norm": 8.274618148803711, + "learning_rate": 9.875154172947788e-06, + "loss": 0.2912, + "step": 1662 + }, + { + "epoch": 0.02255832881172002, + "grad_norm": 9.45077133178711, + "learning_rate": 9.875017130327533e-06, + "loss": 0.4303, + "step": 1663 + }, + { + "epoch": 0.02257189365165491, + "grad_norm": 7.721555233001709, + "learning_rate": 9.874880087707278e-06, + "loss": 0.3461, + "step": 1664 + }, + { + "epoch": 0.0225854584915898, + "grad_norm": 8.594109535217285, + "learning_rate": 9.874743045087023e-06, + "loss": 0.6, + "step": 1665 + }, + { + "epoch": 0.02259902333152469, + "grad_norm": 11.237866401672363, + "learning_rate": 9.874606002466768e-06, + "loss": 0.5087, + "step": 1666 + }, + { + "epoch": 0.022612588171459575, + "grad_norm": 6.919260501861572, + "learning_rate": 9.874468959846512e-06, + "loss": 0.4338, + "step": 1667 + }, + { + "epoch": 0.022626153011394465, + "grad_norm": 8.03521728515625, + "learning_rate": 9.874331917226259e-06, + "loss": 0.3715, + "step": 1668 + }, + { + "epoch": 0.022639717851329354, + "grad_norm": 8.449225425720215, + "learning_rate": 9.874194874606004e-06, + "loss": 0.3755, + "step": 1669 + }, + { + "epoch": 0.022653282691264244, + "grad_norm": 8.461599349975586, + "learning_rate": 9.874057831985749e-06, + "loss": 0.3801, + "step": 1670 + }, + { + "epoch": 0.02266684753119913, + "grad_norm": 9.117579460144043, + "learning_rate": 9.873920789365493e-06, + "loss": 0.398, + "step": 1671 + }, + { + "epoch": 0.02268041237113402, + "grad_norm": 11.025849342346191, + "learning_rate": 9.87378374674524e-06, + "loss": 0.6174, + "step": 1672 + }, + { + "epoch": 0.02269397721106891, + "grad_norm": 7.494744777679443, + "learning_rate": 9.873646704124985e-06, + "loss": 0.2603, + "step": 1673 + }, + { + "epoch": 0.0227075420510038, + "grad_norm": 5.179318904876709, + "learning_rate": 9.873509661504728e-06, + "loss": 0.2656, + "step": 1674 + }, + { + "epoch": 0.022721106890938685, + "grad_norm": 7.63469934463501, + "learning_rate": 9.873372618884473e-06, + "loss": 0.3796, + "step": 1675 + }, + { + "epoch": 0.022734671730873575, + "grad_norm": 7.811468601226807, + "learning_rate": 9.873235576264218e-06, + "loss": 0.4328, + "step": 1676 + }, + { + "epoch": 0.022748236570808465, + "grad_norm": 10.301502227783203, + "learning_rate": 9.873098533643964e-06, + "loss": 0.4231, + "step": 1677 + }, + { + "epoch": 0.022761801410743354, + "grad_norm": 9.740961074829102, + "learning_rate": 9.872961491023709e-06, + "loss": 0.4002, + "step": 1678 + }, + { + "epoch": 0.02277536625067824, + "grad_norm": 7.321346759796143, + "learning_rate": 9.872824448403454e-06, + "loss": 0.4012, + "step": 1679 + }, + { + "epoch": 0.02278893109061313, + "grad_norm": 9.273963928222656, + "learning_rate": 9.8726874057832e-06, + "loss": 0.5043, + "step": 1680 + }, + { + "epoch": 0.02280249593054802, + "grad_norm": 9.448346138000488, + "learning_rate": 9.872550363162944e-06, + "loss": 0.4419, + "step": 1681 + }, + { + "epoch": 0.02281606077048291, + "grad_norm": 8.498557090759277, + "learning_rate": 9.87241332054269e-06, + "loss": 0.3631, + "step": 1682 + }, + { + "epoch": 0.022829625610417795, + "grad_norm": 7.859702110290527, + "learning_rate": 9.872276277922435e-06, + "loss": 0.4714, + "step": 1683 + }, + { + "epoch": 0.022843190450352685, + "grad_norm": 8.565177917480469, + "learning_rate": 9.87213923530218e-06, + "loss": 0.4227, + "step": 1684 + }, + { + "epoch": 0.022856755290287575, + "grad_norm": 6.805613040924072, + "learning_rate": 9.872002192681925e-06, + "loss": 0.2548, + "step": 1685 + }, + { + "epoch": 0.022870320130222464, + "grad_norm": 7.8966498374938965, + "learning_rate": 9.87186515006167e-06, + "loss": 0.4295, + "step": 1686 + }, + { + "epoch": 0.02288388497015735, + "grad_norm": 8.844444274902344, + "learning_rate": 9.871728107441415e-06, + "loss": 0.5642, + "step": 1687 + }, + { + "epoch": 0.02289744981009224, + "grad_norm": 8.895710945129395, + "learning_rate": 9.87159106482116e-06, + "loss": 0.6226, + "step": 1688 + }, + { + "epoch": 0.02291101465002713, + "grad_norm": 6.94694185256958, + "learning_rate": 9.871454022200904e-06, + "loss": 0.4478, + "step": 1689 + }, + { + "epoch": 0.02292457948996202, + "grad_norm": 6.563884258270264, + "learning_rate": 9.871316979580651e-06, + "loss": 0.4144, + "step": 1690 + }, + { + "epoch": 0.022938144329896906, + "grad_norm": 7.604683876037598, + "learning_rate": 9.871179936960396e-06, + "loss": 0.3887, + "step": 1691 + }, + { + "epoch": 0.022951709169831795, + "grad_norm": 7.035541534423828, + "learning_rate": 9.87104289434014e-06, + "loss": 0.4423, + "step": 1692 + }, + { + "epoch": 0.022965274009766685, + "grad_norm": 6.9633684158325195, + "learning_rate": 9.870905851719885e-06, + "loss": 0.403, + "step": 1693 + }, + { + "epoch": 0.022978838849701574, + "grad_norm": 6.652359962463379, + "learning_rate": 9.870768809099632e-06, + "loss": 0.3587, + "step": 1694 + }, + { + "epoch": 0.02299240368963646, + "grad_norm": 5.487961769104004, + "learning_rate": 9.870631766479377e-06, + "loss": 0.3562, + "step": 1695 + }, + { + "epoch": 0.02300596852957135, + "grad_norm": 7.946249961853027, + "learning_rate": 9.87049472385912e-06, + "loss": 0.3259, + "step": 1696 + }, + { + "epoch": 0.02301953336950624, + "grad_norm": 7.146481990814209, + "learning_rate": 9.870357681238866e-06, + "loss": 0.3761, + "step": 1697 + }, + { + "epoch": 0.02303309820944113, + "grad_norm": 7.062791347503662, + "learning_rate": 9.870220638618612e-06, + "loss": 0.4946, + "step": 1698 + }, + { + "epoch": 0.023046663049376016, + "grad_norm": 11.19158935546875, + "learning_rate": 9.870083595998356e-06, + "loss": 0.6061, + "step": 1699 + }, + { + "epoch": 0.023060227889310905, + "grad_norm": 6.435602188110352, + "learning_rate": 9.869946553378101e-06, + "loss": 0.3891, + "step": 1700 + }, + { + "epoch": 0.023073792729245795, + "grad_norm": 6.646374702453613, + "learning_rate": 9.869809510757846e-06, + "loss": 0.3387, + "step": 1701 + }, + { + "epoch": 0.023087357569180685, + "grad_norm": 9.965534210205078, + "learning_rate": 9.869672468137591e-06, + "loss": 0.656, + "step": 1702 + }, + { + "epoch": 0.02310092240911557, + "grad_norm": 8.111031532287598, + "learning_rate": 9.869535425517337e-06, + "loss": 0.4995, + "step": 1703 + }, + { + "epoch": 0.02311448724905046, + "grad_norm": 8.167957305908203, + "learning_rate": 9.869398382897082e-06, + "loss": 0.4555, + "step": 1704 + }, + { + "epoch": 0.02312805208898535, + "grad_norm": 10.261533737182617, + "learning_rate": 9.869261340276827e-06, + "loss": 0.5317, + "step": 1705 + }, + { + "epoch": 0.02314161692892024, + "grad_norm": 5.374886989593506, + "learning_rate": 9.869124297656572e-06, + "loss": 0.4071, + "step": 1706 + }, + { + "epoch": 0.02315518176885513, + "grad_norm": 6.966472148895264, + "learning_rate": 9.868987255036317e-06, + "loss": 0.516, + "step": 1707 + }, + { + "epoch": 0.023168746608790015, + "grad_norm": 8.379196166992188, + "learning_rate": 9.868850212416062e-06, + "loss": 0.5193, + "step": 1708 + }, + { + "epoch": 0.023182311448724905, + "grad_norm": 7.361875057220459, + "learning_rate": 9.868713169795808e-06, + "loss": 0.4741, + "step": 1709 + }, + { + "epoch": 0.023195876288659795, + "grad_norm": 7.268063068389893, + "learning_rate": 9.868576127175553e-06, + "loss": 0.5816, + "step": 1710 + }, + { + "epoch": 0.023209441128594684, + "grad_norm": 9.089261054992676, + "learning_rate": 9.868439084555298e-06, + "loss": 0.6629, + "step": 1711 + }, + { + "epoch": 0.02322300596852957, + "grad_norm": 5.615897178649902, + "learning_rate": 9.868302041935043e-06, + "loss": 0.4058, + "step": 1712 + }, + { + "epoch": 0.02323657080846446, + "grad_norm": 5.725692272186279, + "learning_rate": 9.868164999314788e-06, + "loss": 0.411, + "step": 1713 + }, + { + "epoch": 0.02325013564839935, + "grad_norm": 6.945995807647705, + "learning_rate": 9.868027956694532e-06, + "loss": 0.63, + "step": 1714 + }, + { + "epoch": 0.02326370048833424, + "grad_norm": 9.085399627685547, + "learning_rate": 9.867890914074279e-06, + "loss": 0.6327, + "step": 1715 + }, + { + "epoch": 0.023277265328269126, + "grad_norm": 8.57089900970459, + "learning_rate": 9.867753871454024e-06, + "loss": 0.5787, + "step": 1716 + }, + { + "epoch": 0.023290830168204015, + "grad_norm": 7.833351135253906, + "learning_rate": 9.867616828833767e-06, + "loss": 0.5808, + "step": 1717 + }, + { + "epoch": 0.023304395008138905, + "grad_norm": 8.263495445251465, + "learning_rate": 9.867479786213513e-06, + "loss": 0.5269, + "step": 1718 + }, + { + "epoch": 0.023317959848073794, + "grad_norm": 7.39059591293335, + "learning_rate": 9.867342743593258e-06, + "loss": 0.5283, + "step": 1719 + }, + { + "epoch": 0.02333152468800868, + "grad_norm": 6.8416876792907715, + "learning_rate": 9.867205700973005e-06, + "loss": 0.5246, + "step": 1720 + }, + { + "epoch": 0.02334508952794357, + "grad_norm": 8.047124862670898, + "learning_rate": 9.867068658352748e-06, + "loss": 0.6529, + "step": 1721 + }, + { + "epoch": 0.02335865436787846, + "grad_norm": 6.819330215454102, + "learning_rate": 9.866931615732493e-06, + "loss": 0.4388, + "step": 1722 + }, + { + "epoch": 0.02337221920781335, + "grad_norm": 10.002996444702148, + "learning_rate": 9.866794573112238e-06, + "loss": 0.5767, + "step": 1723 + }, + { + "epoch": 0.023385784047748236, + "grad_norm": 8.51870059967041, + "learning_rate": 9.866657530491984e-06, + "loss": 0.6739, + "step": 1724 + }, + { + "epoch": 0.023399348887683125, + "grad_norm": 6.350142955780029, + "learning_rate": 9.866520487871729e-06, + "loss": 0.4684, + "step": 1725 + }, + { + "epoch": 0.023412913727618015, + "grad_norm": 7.514225959777832, + "learning_rate": 9.866383445251474e-06, + "loss": 0.4337, + "step": 1726 + }, + { + "epoch": 0.023426478567552905, + "grad_norm": 7.521295547485352, + "learning_rate": 9.86624640263122e-06, + "loss": 0.5059, + "step": 1727 + }, + { + "epoch": 0.02344004340748779, + "grad_norm": 9.17750358581543, + "learning_rate": 9.866109360010964e-06, + "loss": 0.5332, + "step": 1728 + }, + { + "epoch": 0.02345360824742268, + "grad_norm": 8.220575332641602, + "learning_rate": 9.86597231739071e-06, + "loss": 0.6515, + "step": 1729 + }, + { + "epoch": 0.02346717308735757, + "grad_norm": 7.091249465942383, + "learning_rate": 9.865835274770455e-06, + "loss": 0.5003, + "step": 1730 + }, + { + "epoch": 0.02348073792729246, + "grad_norm": 5.732761383056641, + "learning_rate": 9.8656982321502e-06, + "loss": 0.4773, + "step": 1731 + }, + { + "epoch": 0.023494302767227346, + "grad_norm": 7.218102931976318, + "learning_rate": 9.865561189529943e-06, + "loss": 0.4538, + "step": 1732 + }, + { + "epoch": 0.023507867607162235, + "grad_norm": 9.40397834777832, + "learning_rate": 9.86542414690969e-06, + "loss": 0.6836, + "step": 1733 + }, + { + "epoch": 0.023521432447097125, + "grad_norm": 9.732361793518066, + "learning_rate": 9.865287104289435e-06, + "loss": 0.784, + "step": 1734 + }, + { + "epoch": 0.023534997287032015, + "grad_norm": 7.291615009307861, + "learning_rate": 9.86515006166918e-06, + "loss": 0.5769, + "step": 1735 + }, + { + "epoch": 0.0235485621269669, + "grad_norm": 6.334166049957275, + "learning_rate": 9.865013019048924e-06, + "loss": 0.4295, + "step": 1736 + }, + { + "epoch": 0.02356212696690179, + "grad_norm": 5.990656852722168, + "learning_rate": 9.864875976428671e-06, + "loss": 0.5448, + "step": 1737 + }, + { + "epoch": 0.02357569180683668, + "grad_norm": 7.743168354034424, + "learning_rate": 9.864738933808416e-06, + "loss": 0.4557, + "step": 1738 + }, + { + "epoch": 0.02358925664677157, + "grad_norm": 6.194370746612549, + "learning_rate": 9.86460189118816e-06, + "loss": 0.5178, + "step": 1739 + }, + { + "epoch": 0.023602821486706456, + "grad_norm": 7.523451328277588, + "learning_rate": 9.864464848567905e-06, + "loss": 0.6329, + "step": 1740 + }, + { + "epoch": 0.023616386326641346, + "grad_norm": 7.996037483215332, + "learning_rate": 9.864327805947652e-06, + "loss": 0.7334, + "step": 1741 + }, + { + "epoch": 0.023629951166576235, + "grad_norm": 8.073250770568848, + "learning_rate": 9.864190763327395e-06, + "loss": 0.5141, + "step": 1742 + }, + { + "epoch": 0.023643516006511125, + "grad_norm": 7.416553020477295, + "learning_rate": 9.86405372070714e-06, + "loss": 0.6049, + "step": 1743 + }, + { + "epoch": 0.02365708084644601, + "grad_norm": 9.596695899963379, + "learning_rate": 9.863916678086886e-06, + "loss": 0.6348, + "step": 1744 + }, + { + "epoch": 0.0236706456863809, + "grad_norm": 6.918674945831299, + "learning_rate": 9.86377963546663e-06, + "loss": 0.5653, + "step": 1745 + }, + { + "epoch": 0.02368421052631579, + "grad_norm": 7.482196807861328, + "learning_rate": 9.863642592846376e-06, + "loss": 0.5629, + "step": 1746 + }, + { + "epoch": 0.02369777536625068, + "grad_norm": 7.338730812072754, + "learning_rate": 9.863505550226121e-06, + "loss": 0.4988, + "step": 1747 + }, + { + "epoch": 0.023711340206185566, + "grad_norm": 6.572617530822754, + "learning_rate": 9.863368507605866e-06, + "loss": 0.3389, + "step": 1748 + }, + { + "epoch": 0.023724905046120456, + "grad_norm": 9.038126945495605, + "learning_rate": 9.863231464985611e-06, + "loss": 0.6637, + "step": 1749 + }, + { + "epoch": 0.023738469886055345, + "grad_norm": 11.335290908813477, + "learning_rate": 9.863094422365357e-06, + "loss": 0.7186, + "step": 1750 + }, + { + "epoch": 0.023752034725990235, + "grad_norm": 7.852713584899902, + "learning_rate": 9.862957379745102e-06, + "loss": 0.6083, + "step": 1751 + }, + { + "epoch": 0.02376559956592512, + "grad_norm": 9.250895500183105, + "learning_rate": 9.862820337124847e-06, + "loss": 0.5726, + "step": 1752 + }, + { + "epoch": 0.02377916440586001, + "grad_norm": 8.261543273925781, + "learning_rate": 9.862683294504592e-06, + "loss": 0.5049, + "step": 1753 + }, + { + "epoch": 0.0237927292457949, + "grad_norm": 8.019092559814453, + "learning_rate": 9.862546251884337e-06, + "loss": 0.4875, + "step": 1754 + }, + { + "epoch": 0.02380629408572979, + "grad_norm": 10.993224143981934, + "learning_rate": 9.862409209264082e-06, + "loss": 0.6015, + "step": 1755 + }, + { + "epoch": 0.023819858925664676, + "grad_norm": 7.780472755432129, + "learning_rate": 9.862272166643828e-06, + "loss": 0.4142, + "step": 1756 + }, + { + "epoch": 0.023833423765599566, + "grad_norm": 6.921276569366455, + "learning_rate": 9.862135124023571e-06, + "loss": 0.3451, + "step": 1757 + }, + { + "epoch": 0.023846988605534455, + "grad_norm": 9.880694389343262, + "learning_rate": 9.861998081403316e-06, + "loss": 0.5284, + "step": 1758 + }, + { + "epoch": 0.023860553445469345, + "grad_norm": 8.017871856689453, + "learning_rate": 9.861861038783063e-06, + "loss": 0.6068, + "step": 1759 + }, + { + "epoch": 0.02387411828540423, + "grad_norm": 8.772236824035645, + "learning_rate": 9.861723996162807e-06, + "loss": 0.4839, + "step": 1760 + }, + { + "epoch": 0.02388768312533912, + "grad_norm": 6.419288635253906, + "learning_rate": 9.861586953542552e-06, + "loss": 0.495, + "step": 1761 + }, + { + "epoch": 0.02390124796527401, + "grad_norm": 6.735507965087891, + "learning_rate": 9.861449910922297e-06, + "loss": 0.4861, + "step": 1762 + }, + { + "epoch": 0.0239148128052089, + "grad_norm": 7.9199981689453125, + "learning_rate": 9.861312868302044e-06, + "loss": 0.4156, + "step": 1763 + }, + { + "epoch": 0.023928377645143786, + "grad_norm": 6.752867698669434, + "learning_rate": 9.861175825681787e-06, + "loss": 0.4148, + "step": 1764 + }, + { + "epoch": 0.023941942485078676, + "grad_norm": 6.811943054199219, + "learning_rate": 9.861038783061533e-06, + "loss": 0.3654, + "step": 1765 + }, + { + "epoch": 0.023955507325013566, + "grad_norm": 6.4847412109375, + "learning_rate": 9.860901740441278e-06, + "loss": 0.5301, + "step": 1766 + }, + { + "epoch": 0.023969072164948455, + "grad_norm": 8.286717414855957, + "learning_rate": 9.860764697821023e-06, + "loss": 0.3895, + "step": 1767 + }, + { + "epoch": 0.02398263700488334, + "grad_norm": 7.089239120483398, + "learning_rate": 9.860627655200768e-06, + "loss": 0.4066, + "step": 1768 + }, + { + "epoch": 0.02399620184481823, + "grad_norm": 9.032129287719727, + "learning_rate": 9.860490612580513e-06, + "loss": 0.4692, + "step": 1769 + }, + { + "epoch": 0.02400976668475312, + "grad_norm": 6.0481109619140625, + "learning_rate": 9.860353569960258e-06, + "loss": 0.4172, + "step": 1770 + }, + { + "epoch": 0.02402333152468801, + "grad_norm": 8.551301956176758, + "learning_rate": 9.860216527340004e-06, + "loss": 0.5098, + "step": 1771 + }, + { + "epoch": 0.024036896364622896, + "grad_norm": 8.467119216918945, + "learning_rate": 9.860079484719749e-06, + "loss": 0.4581, + "step": 1772 + }, + { + "epoch": 0.024050461204557786, + "grad_norm": 10.099827766418457, + "learning_rate": 9.859942442099494e-06, + "loss": 0.5755, + "step": 1773 + }, + { + "epoch": 0.024064026044492676, + "grad_norm": 8.21038818359375, + "learning_rate": 9.85980539947924e-06, + "loss": 0.505, + "step": 1774 + }, + { + "epoch": 0.024077590884427565, + "grad_norm": 6.601445198059082, + "learning_rate": 9.859668356858983e-06, + "loss": 0.3997, + "step": 1775 + }, + { + "epoch": 0.02409115572436245, + "grad_norm": 8.146272659301758, + "learning_rate": 9.85953131423873e-06, + "loss": 0.597, + "step": 1776 + }, + { + "epoch": 0.02410472056429734, + "grad_norm": 8.276408195495605, + "learning_rate": 9.859394271618475e-06, + "loss": 0.5451, + "step": 1777 + }, + { + "epoch": 0.02411828540423223, + "grad_norm": 4.720536231994629, + "learning_rate": 9.85925722899822e-06, + "loss": 0.376, + "step": 1778 + }, + { + "epoch": 0.02413185024416712, + "grad_norm": 11.003462791442871, + "learning_rate": 9.859120186377963e-06, + "loss": 0.682, + "step": 1779 + }, + { + "epoch": 0.024145415084102007, + "grad_norm": 9.315871238708496, + "learning_rate": 9.85898314375771e-06, + "loss": 0.4915, + "step": 1780 + }, + { + "epoch": 0.024158979924036896, + "grad_norm": 8.125537872314453, + "learning_rate": 9.858846101137455e-06, + "loss": 0.5417, + "step": 1781 + }, + { + "epoch": 0.024172544763971786, + "grad_norm": 7.264868259429932, + "learning_rate": 9.858709058517199e-06, + "loss": 0.3721, + "step": 1782 + }, + { + "epoch": 0.024186109603906675, + "grad_norm": 8.051470756530762, + "learning_rate": 9.858572015896944e-06, + "loss": 0.6475, + "step": 1783 + }, + { + "epoch": 0.02419967444384156, + "grad_norm": 7.326466083526611, + "learning_rate": 9.85843497327669e-06, + "loss": 0.4801, + "step": 1784 + }, + { + "epoch": 0.02421323928377645, + "grad_norm": 7.266959190368652, + "learning_rate": 9.858297930656434e-06, + "loss": 0.4527, + "step": 1785 + }, + { + "epoch": 0.02422680412371134, + "grad_norm": 9.613940238952637, + "learning_rate": 9.85816088803618e-06, + "loss": 0.4761, + "step": 1786 + }, + { + "epoch": 0.02424036896364623, + "grad_norm": 5.938412189483643, + "learning_rate": 9.858023845415925e-06, + "loss": 0.3423, + "step": 1787 + }, + { + "epoch": 0.024253933803581117, + "grad_norm": 7.765452861785889, + "learning_rate": 9.85788680279567e-06, + "loss": 0.467, + "step": 1788 + }, + { + "epoch": 0.024267498643516006, + "grad_norm": 8.994489669799805, + "learning_rate": 9.857749760175415e-06, + "loss": 0.5166, + "step": 1789 + }, + { + "epoch": 0.024281063483450896, + "grad_norm": 10.041584968566895, + "learning_rate": 9.85761271755516e-06, + "loss": 0.6532, + "step": 1790 + }, + { + "epoch": 0.024294628323385786, + "grad_norm": 8.166485786437988, + "learning_rate": 9.857475674934906e-06, + "loss": 0.5194, + "step": 1791 + }, + { + "epoch": 0.02430819316332067, + "grad_norm": 8.70956802368164, + "learning_rate": 9.85733863231465e-06, + "loss": 0.4713, + "step": 1792 + }, + { + "epoch": 0.02432175800325556, + "grad_norm": 8.65820598602295, + "learning_rate": 9.857201589694396e-06, + "loss": 0.4795, + "step": 1793 + }, + { + "epoch": 0.02433532284319045, + "grad_norm": 7.115111351013184, + "learning_rate": 9.857064547074141e-06, + "loss": 0.4754, + "step": 1794 + }, + { + "epoch": 0.02434888768312534, + "grad_norm": 7.436770439147949, + "learning_rate": 9.856927504453886e-06, + "loss": 0.4603, + "step": 1795 + }, + { + "epoch": 0.024362452523060227, + "grad_norm": 6.943231582641602, + "learning_rate": 9.856790461833631e-06, + "loss": 0.5584, + "step": 1796 + }, + { + "epoch": 0.024376017362995116, + "grad_norm": 7.555288314819336, + "learning_rate": 9.856653419213377e-06, + "loss": 0.4414, + "step": 1797 + }, + { + "epoch": 0.024389582202930006, + "grad_norm": 7.2909417152404785, + "learning_rate": 9.856516376593122e-06, + "loss": 0.4062, + "step": 1798 + }, + { + "epoch": 0.024403147042864896, + "grad_norm": 7.031020641326904, + "learning_rate": 9.856379333972867e-06, + "loss": 0.4749, + "step": 1799 + }, + { + "epoch": 0.024416711882799782, + "grad_norm": 9.029290199279785, + "learning_rate": 9.85624229135261e-06, + "loss": 0.5624, + "step": 1800 + }, + { + "epoch": 0.02443027672273467, + "grad_norm": 6.29926872253418, + "learning_rate": 9.856105248732356e-06, + "loss": 0.3876, + "step": 1801 + }, + { + "epoch": 0.02444384156266956, + "grad_norm": 5.793352127075195, + "learning_rate": 9.855968206112103e-06, + "loss": 0.3683, + "step": 1802 + }, + { + "epoch": 0.02445740640260445, + "grad_norm": 7.6763811111450195, + "learning_rate": 9.855831163491848e-06, + "loss": 0.3993, + "step": 1803 + }, + { + "epoch": 0.024470971242539337, + "grad_norm": 8.698297500610352, + "learning_rate": 9.855694120871591e-06, + "loss": 0.4737, + "step": 1804 + }, + { + "epoch": 0.024484536082474227, + "grad_norm": 5.96561336517334, + "learning_rate": 9.855557078251336e-06, + "loss": 0.3879, + "step": 1805 + }, + { + "epoch": 0.024498100922409116, + "grad_norm": 6.342390537261963, + "learning_rate": 9.855420035631083e-06, + "loss": 0.4059, + "step": 1806 + }, + { + "epoch": 0.024511665762344006, + "grad_norm": 5.760236740112305, + "learning_rate": 9.855282993010827e-06, + "loss": 0.2666, + "step": 1807 + }, + { + "epoch": 0.024525230602278892, + "grad_norm": 9.226096153259277, + "learning_rate": 9.855145950390572e-06, + "loss": 0.3834, + "step": 1808 + }, + { + "epoch": 0.02453879544221378, + "grad_norm": 9.241247177124023, + "learning_rate": 9.855008907770317e-06, + "loss": 0.5269, + "step": 1809 + }, + { + "epoch": 0.02455236028214867, + "grad_norm": 6.75938606262207, + "learning_rate": 9.854871865150062e-06, + "loss": 0.4144, + "step": 1810 + }, + { + "epoch": 0.02456592512208356, + "grad_norm": 10.164109230041504, + "learning_rate": 9.854734822529807e-06, + "loss": 0.5015, + "step": 1811 + }, + { + "epoch": 0.024579489962018447, + "grad_norm": 7.9153523445129395, + "learning_rate": 9.854597779909553e-06, + "loss": 0.3684, + "step": 1812 + }, + { + "epoch": 0.024593054801953337, + "grad_norm": 8.633950233459473, + "learning_rate": 9.854460737289298e-06, + "loss": 0.4694, + "step": 1813 + }, + { + "epoch": 0.024606619641888226, + "grad_norm": 9.800445556640625, + "learning_rate": 9.854323694669043e-06, + "loss": 0.399, + "step": 1814 + }, + { + "epoch": 0.024620184481823116, + "grad_norm": 8.029269218444824, + "learning_rate": 9.854186652048788e-06, + "loss": 0.4464, + "step": 1815 + }, + { + "epoch": 0.024633749321758002, + "grad_norm": 7.147019386291504, + "learning_rate": 9.854049609428533e-06, + "loss": 0.4093, + "step": 1816 + }, + { + "epoch": 0.024647314161692892, + "grad_norm": 6.889081001281738, + "learning_rate": 9.853912566808278e-06, + "loss": 0.3121, + "step": 1817 + }, + { + "epoch": 0.02466087900162778, + "grad_norm": 10.110492706298828, + "learning_rate": 9.853775524188024e-06, + "loss": 0.5249, + "step": 1818 + }, + { + "epoch": 0.02467444384156267, + "grad_norm": 13.570658683776855, + "learning_rate": 9.853638481567769e-06, + "loss": 0.6633, + "step": 1819 + }, + { + "epoch": 0.024688008681497557, + "grad_norm": 10.488743782043457, + "learning_rate": 9.853501438947514e-06, + "loss": 0.482, + "step": 1820 + }, + { + "epoch": 0.024701573521432447, + "grad_norm": 8.375627517700195, + "learning_rate": 9.85336439632726e-06, + "loss": 0.4559, + "step": 1821 + }, + { + "epoch": 0.024715138361367336, + "grad_norm": 7.735020160675049, + "learning_rate": 9.853227353707003e-06, + "loss": 0.58, + "step": 1822 + }, + { + "epoch": 0.024728703201302226, + "grad_norm": 9.131516456604004, + "learning_rate": 9.85309031108675e-06, + "loss": 0.4046, + "step": 1823 + }, + { + "epoch": 0.024742268041237112, + "grad_norm": 7.9446797370910645, + "learning_rate": 9.852953268466495e-06, + "loss": 0.5913, + "step": 1824 + }, + { + "epoch": 0.024755832881172002, + "grad_norm": 6.752584934234619, + "learning_rate": 9.852816225846238e-06, + "loss": 0.4719, + "step": 1825 + }, + { + "epoch": 0.02476939772110689, + "grad_norm": 6.234941482543945, + "learning_rate": 9.852679183225983e-06, + "loss": 0.4259, + "step": 1826 + }, + { + "epoch": 0.02478296256104178, + "grad_norm": 6.754735946655273, + "learning_rate": 9.852542140605729e-06, + "loss": 0.3167, + "step": 1827 + }, + { + "epoch": 0.024796527400976667, + "grad_norm": 8.50892448425293, + "learning_rate": 9.852405097985475e-06, + "loss": 0.3706, + "step": 1828 + }, + { + "epoch": 0.024810092240911557, + "grad_norm": 7.270271301269531, + "learning_rate": 9.852268055365219e-06, + "loss": 0.3154, + "step": 1829 + }, + { + "epoch": 0.024823657080846447, + "grad_norm": 9.009188652038574, + "learning_rate": 9.852131012744964e-06, + "loss": 0.4421, + "step": 1830 + }, + { + "epoch": 0.024837221920781336, + "grad_norm": 8.18327808380127, + "learning_rate": 9.85199397012471e-06, + "loss": 0.3707, + "step": 1831 + }, + { + "epoch": 0.024850786760716222, + "grad_norm": 8.851530075073242, + "learning_rate": 9.851856927504454e-06, + "loss": 0.5161, + "step": 1832 + }, + { + "epoch": 0.024864351600651112, + "grad_norm": 9.75921630859375, + "learning_rate": 9.8517198848842e-06, + "loss": 0.3975, + "step": 1833 + }, + { + "epoch": 0.024877916440586, + "grad_norm": 5.857739448547363, + "learning_rate": 9.851582842263945e-06, + "loss": 0.3638, + "step": 1834 + }, + { + "epoch": 0.02489148128052089, + "grad_norm": 5.9203362464904785, + "learning_rate": 9.85144579964369e-06, + "loss": 0.3065, + "step": 1835 + }, + { + "epoch": 0.024905046120455777, + "grad_norm": 8.831404685974121, + "learning_rate": 9.851308757023435e-06, + "loss": 0.4332, + "step": 1836 + }, + { + "epoch": 0.024918610960390667, + "grad_norm": 7.072160720825195, + "learning_rate": 9.85117171440318e-06, + "loss": 0.2695, + "step": 1837 + }, + { + "epoch": 0.024932175800325557, + "grad_norm": 9.586686134338379, + "learning_rate": 9.851034671782926e-06, + "loss": 0.4816, + "step": 1838 + }, + { + "epoch": 0.024945740640260446, + "grad_norm": 9.074950218200684, + "learning_rate": 9.85089762916267e-06, + "loss": 0.4727, + "step": 1839 + }, + { + "epoch": 0.024959305480195332, + "grad_norm": 8.090068817138672, + "learning_rate": 9.850760586542414e-06, + "loss": 0.3357, + "step": 1840 + }, + { + "epoch": 0.024972870320130222, + "grad_norm": 10.133352279663086, + "learning_rate": 9.850623543922161e-06, + "loss": 0.4867, + "step": 1841 + }, + { + "epoch": 0.024986435160065112, + "grad_norm": 11.123655319213867, + "learning_rate": 9.850486501301906e-06, + "loss": 0.5628, + "step": 1842 + }, + { + "epoch": 0.025, + "grad_norm": 8.120037078857422, + "learning_rate": 9.85034945868165e-06, + "loss": 0.4724, + "step": 1843 + }, + { + "epoch": 0.025013564839934888, + "grad_norm": 8.312864303588867, + "learning_rate": 9.850212416061395e-06, + "loss": 0.5362, + "step": 1844 + }, + { + "epoch": 0.025027129679869777, + "grad_norm": 7.924835205078125, + "learning_rate": 9.850075373441142e-06, + "loss": 0.4809, + "step": 1845 + }, + { + "epoch": 0.025040694519804667, + "grad_norm": 6.584348678588867, + "learning_rate": 9.849938330820887e-06, + "loss": 0.3959, + "step": 1846 + }, + { + "epoch": 0.025054259359739556, + "grad_norm": 8.447508811950684, + "learning_rate": 9.84980128820063e-06, + "loss": 0.4351, + "step": 1847 + }, + { + "epoch": 0.025067824199674443, + "grad_norm": 7.174731731414795, + "learning_rate": 9.849664245580376e-06, + "loss": 0.6281, + "step": 1848 + }, + { + "epoch": 0.025081389039609332, + "grad_norm": 8.77473258972168, + "learning_rate": 9.849527202960123e-06, + "loss": 0.752, + "step": 1849 + }, + { + "epoch": 0.025094953879544222, + "grad_norm": 8.695685386657715, + "learning_rate": 9.849390160339866e-06, + "loss": 0.5067, + "step": 1850 + }, + { + "epoch": 0.02510851871947911, + "grad_norm": 8.629096031188965, + "learning_rate": 9.849253117719611e-06, + "loss": 0.4775, + "step": 1851 + }, + { + "epoch": 0.025122083559413998, + "grad_norm": 7.605898857116699, + "learning_rate": 9.849116075099356e-06, + "loss": 0.4973, + "step": 1852 + }, + { + "epoch": 0.025135648399348887, + "grad_norm": 7.0234694480896, + "learning_rate": 9.848979032479102e-06, + "loss": 0.4908, + "step": 1853 + }, + { + "epoch": 0.025149213239283777, + "grad_norm": 7.903439521789551, + "learning_rate": 9.848841989858847e-06, + "loss": 0.5914, + "step": 1854 + }, + { + "epoch": 0.025162778079218667, + "grad_norm": 7.152914524078369, + "learning_rate": 9.848704947238592e-06, + "loss": 0.408, + "step": 1855 + }, + { + "epoch": 0.025176342919153553, + "grad_norm": 8.838229179382324, + "learning_rate": 9.848567904618337e-06, + "loss": 0.4066, + "step": 1856 + }, + { + "epoch": 0.025189907759088442, + "grad_norm": 5.628579139709473, + "learning_rate": 9.848430861998082e-06, + "loss": 0.4252, + "step": 1857 + }, + { + "epoch": 0.025203472599023332, + "grad_norm": 10.38661003112793, + "learning_rate": 9.848293819377827e-06, + "loss": 0.8377, + "step": 1858 + }, + { + "epoch": 0.02521703743895822, + "grad_norm": 8.017573356628418, + "learning_rate": 9.848156776757573e-06, + "loss": 0.5453, + "step": 1859 + }, + { + "epoch": 0.025230602278893108, + "grad_norm": 7.748189449310303, + "learning_rate": 9.848019734137318e-06, + "loss": 0.3617, + "step": 1860 + }, + { + "epoch": 0.025244167118827997, + "grad_norm": 8.591148376464844, + "learning_rate": 9.847882691517063e-06, + "loss": 0.5322, + "step": 1861 + }, + { + "epoch": 0.025257731958762887, + "grad_norm": 9.662376403808594, + "learning_rate": 9.847745648896808e-06, + "loss": 0.6029, + "step": 1862 + }, + { + "epoch": 0.025271296798697777, + "grad_norm": 8.426177978515625, + "learning_rate": 9.847608606276553e-06, + "loss": 0.5146, + "step": 1863 + }, + { + "epoch": 0.025284861638632663, + "grad_norm": 8.046341896057129, + "learning_rate": 9.847471563656299e-06, + "loss": 0.4083, + "step": 1864 + }, + { + "epoch": 0.025298426478567553, + "grad_norm": 10.659558296203613, + "learning_rate": 9.847334521036042e-06, + "loss": 0.758, + "step": 1865 + }, + { + "epoch": 0.025311991318502442, + "grad_norm": 9.492382049560547, + "learning_rate": 9.847197478415789e-06, + "loss": 0.4734, + "step": 1866 + }, + { + "epoch": 0.025325556158437332, + "grad_norm": 8.557272911071777, + "learning_rate": 9.847060435795534e-06, + "loss": 0.3765, + "step": 1867 + }, + { + "epoch": 0.025339120998372218, + "grad_norm": 7.747757434844971, + "learning_rate": 9.846923393175278e-06, + "loss": 0.4412, + "step": 1868 + }, + { + "epoch": 0.025352685838307108, + "grad_norm": 9.693589210510254, + "learning_rate": 9.846786350555023e-06, + "loss": 0.7016, + "step": 1869 + }, + { + "epoch": 0.025366250678241997, + "grad_norm": 8.812190055847168, + "learning_rate": 9.846649307934768e-06, + "loss": 0.5197, + "step": 1870 + }, + { + "epoch": 0.025379815518176887, + "grad_norm": 7.7039361000061035, + "learning_rate": 9.846512265314515e-06, + "loss": 0.5175, + "step": 1871 + }, + { + "epoch": 0.025393380358111773, + "grad_norm": 7.5168328285217285, + "learning_rate": 9.846375222694258e-06, + "loss": 0.4441, + "step": 1872 + }, + { + "epoch": 0.025406945198046663, + "grad_norm": 8.235729217529297, + "learning_rate": 9.846238180074003e-06, + "loss": 0.4571, + "step": 1873 + }, + { + "epoch": 0.025420510037981552, + "grad_norm": 9.100536346435547, + "learning_rate": 9.846101137453749e-06, + "loss": 0.6317, + "step": 1874 + }, + { + "epoch": 0.025434074877916442, + "grad_norm": 10.118378639221191, + "learning_rate": 9.845964094833494e-06, + "loss": 0.475, + "step": 1875 + }, + { + "epoch": 0.025447639717851328, + "grad_norm": 7.509827136993408, + "learning_rate": 9.845827052213239e-06, + "loss": 0.6773, + "step": 1876 + }, + { + "epoch": 0.025461204557786218, + "grad_norm": 10.755773544311523, + "learning_rate": 9.845690009592984e-06, + "loss": 0.7909, + "step": 1877 + }, + { + "epoch": 0.025474769397721107, + "grad_norm": 10.421545028686523, + "learning_rate": 9.84555296697273e-06, + "loss": 0.5864, + "step": 1878 + }, + { + "epoch": 0.025488334237655997, + "grad_norm": 8.642182350158691, + "learning_rate": 9.845415924352475e-06, + "loss": 0.3965, + "step": 1879 + }, + { + "epoch": 0.025501899077590883, + "grad_norm": 11.319798469543457, + "learning_rate": 9.84527888173222e-06, + "loss": 0.4759, + "step": 1880 + }, + { + "epoch": 0.025515463917525773, + "grad_norm": 9.45016860961914, + "learning_rate": 9.845141839111965e-06, + "loss": 0.5936, + "step": 1881 + }, + { + "epoch": 0.025529028757460662, + "grad_norm": 7.529163360595703, + "learning_rate": 9.84500479649171e-06, + "loss": 0.4488, + "step": 1882 + }, + { + "epoch": 0.025542593597395552, + "grad_norm": 9.053173065185547, + "learning_rate": 9.844867753871454e-06, + "loss": 0.4901, + "step": 1883 + }, + { + "epoch": 0.025556158437330438, + "grad_norm": 9.73664665222168, + "learning_rate": 9.8447307112512e-06, + "loss": 0.5648, + "step": 1884 + }, + { + "epoch": 0.025569723277265328, + "grad_norm": 7.14373254776001, + "learning_rate": 9.844593668630946e-06, + "loss": 0.5184, + "step": 1885 + }, + { + "epoch": 0.025583288117200217, + "grad_norm": 11.394335746765137, + "learning_rate": 9.84445662601069e-06, + "loss": 0.4967, + "step": 1886 + }, + { + "epoch": 0.025596852957135107, + "grad_norm": 7.531684398651123, + "learning_rate": 9.844319583390434e-06, + "loss": 0.3809, + "step": 1887 + }, + { + "epoch": 0.025610417797069993, + "grad_norm": 9.1718111038208, + "learning_rate": 9.844182540770181e-06, + "loss": 0.5149, + "step": 1888 + }, + { + "epoch": 0.025623982637004883, + "grad_norm": 6.893195152282715, + "learning_rate": 9.844045498149926e-06, + "loss": 0.5121, + "step": 1889 + }, + { + "epoch": 0.025637547476939773, + "grad_norm": 8.601831436157227, + "learning_rate": 9.84390845552967e-06, + "loss": 0.5938, + "step": 1890 + }, + { + "epoch": 0.025651112316874662, + "grad_norm": 9.512919425964355, + "learning_rate": 9.843771412909415e-06, + "loss": 0.6849, + "step": 1891 + }, + { + "epoch": 0.02566467715680955, + "grad_norm": 8.861076354980469, + "learning_rate": 9.843634370289162e-06, + "loss": 0.5621, + "step": 1892 + }, + { + "epoch": 0.025678241996744438, + "grad_norm": 6.988520622253418, + "learning_rate": 9.843497327668905e-06, + "loss": 0.5258, + "step": 1893 + }, + { + "epoch": 0.025691806836679328, + "grad_norm": 7.529130458831787, + "learning_rate": 9.84336028504865e-06, + "loss": 0.3526, + "step": 1894 + }, + { + "epoch": 0.025705371676614217, + "grad_norm": 12.82313060760498, + "learning_rate": 9.843223242428396e-06, + "loss": 0.7818, + "step": 1895 + }, + { + "epoch": 0.025718936516549103, + "grad_norm": 5.046250820159912, + "learning_rate": 9.843086199808141e-06, + "loss": 0.4002, + "step": 1896 + }, + { + "epoch": 0.025732501356483993, + "grad_norm": 7.32014799118042, + "learning_rate": 9.842949157187886e-06, + "loss": 0.3978, + "step": 1897 + }, + { + "epoch": 0.025746066196418883, + "grad_norm": 7.691884517669678, + "learning_rate": 9.842812114567631e-06, + "loss": 0.5844, + "step": 1898 + }, + { + "epoch": 0.025759631036353772, + "grad_norm": 12.208202362060547, + "learning_rate": 9.842675071947376e-06, + "loss": 0.7532, + "step": 1899 + }, + { + "epoch": 0.02577319587628866, + "grad_norm": 11.312212944030762, + "learning_rate": 9.842538029327122e-06, + "loss": 0.7064, + "step": 1900 + }, + { + "epoch": 0.025786760716223548, + "grad_norm": 9.51562786102295, + "learning_rate": 9.842400986706867e-06, + "loss": 0.4815, + "step": 1901 + }, + { + "epoch": 0.025800325556158438, + "grad_norm": 10.73644733428955, + "learning_rate": 9.842263944086612e-06, + "loss": 0.546, + "step": 1902 + }, + { + "epoch": 0.025813890396093327, + "grad_norm": 7.965908527374268, + "learning_rate": 9.842126901466357e-06, + "loss": 0.3768, + "step": 1903 + }, + { + "epoch": 0.025827455236028213, + "grad_norm": 10.28653621673584, + "learning_rate": 9.841989858846102e-06, + "loss": 0.637, + "step": 1904 + }, + { + "epoch": 0.025841020075963103, + "grad_norm": 7.386546611785889, + "learning_rate": 9.841852816225847e-06, + "loss": 0.4725, + "step": 1905 + }, + { + "epoch": 0.025854584915897993, + "grad_norm": 8.436369895935059, + "learning_rate": 9.841715773605593e-06, + "loss": 0.4039, + "step": 1906 + }, + { + "epoch": 0.025868149755832882, + "grad_norm": 6.536130428314209, + "learning_rate": 9.841578730985338e-06, + "loss": 0.4383, + "step": 1907 + }, + { + "epoch": 0.02588171459576777, + "grad_norm": 7.055587291717529, + "learning_rate": 9.841441688365081e-06, + "loss": 0.4751, + "step": 1908 + }, + { + "epoch": 0.025895279435702658, + "grad_norm": 10.255319595336914, + "learning_rate": 9.841304645744826e-06, + "loss": 0.4465, + "step": 1909 + }, + { + "epoch": 0.025908844275637548, + "grad_norm": 10.052679061889648, + "learning_rate": 9.841167603124573e-06, + "loss": 0.5233, + "step": 1910 + }, + { + "epoch": 0.025922409115572437, + "grad_norm": 9.571836471557617, + "learning_rate": 9.841030560504319e-06, + "loss": 0.4431, + "step": 1911 + }, + { + "epoch": 0.025935973955507324, + "grad_norm": 6.9510908126831055, + "learning_rate": 9.840893517884062e-06, + "loss": 0.447, + "step": 1912 + }, + { + "epoch": 0.025949538795442213, + "grad_norm": 9.331391334533691, + "learning_rate": 9.840756475263807e-06, + "loss": 0.5862, + "step": 1913 + }, + { + "epoch": 0.025963103635377103, + "grad_norm": 7.1205549240112305, + "learning_rate": 9.840619432643554e-06, + "loss": 0.3083, + "step": 1914 + }, + { + "epoch": 0.025976668475311993, + "grad_norm": 8.160279273986816, + "learning_rate": 9.840482390023298e-06, + "loss": 0.5072, + "step": 1915 + }, + { + "epoch": 0.02599023331524688, + "grad_norm": 8.037257194519043, + "learning_rate": 9.840345347403043e-06, + "loss": 0.4444, + "step": 1916 + }, + { + "epoch": 0.02600379815518177, + "grad_norm": 8.296711921691895, + "learning_rate": 9.840208304782788e-06, + "loss": 0.5377, + "step": 1917 + }, + { + "epoch": 0.026017362995116658, + "grad_norm": 8.820931434631348, + "learning_rate": 9.840071262162533e-06, + "loss": 0.4984, + "step": 1918 + }, + { + "epoch": 0.026030927835051548, + "grad_norm": 8.066828727722168, + "learning_rate": 9.839934219542278e-06, + "loss": 0.4759, + "step": 1919 + }, + { + "epoch": 0.026044492674986434, + "grad_norm": 6.920343399047852, + "learning_rate": 9.839797176922023e-06, + "loss": 0.369, + "step": 1920 + }, + { + "epoch": 0.026058057514921323, + "grad_norm": 6.383228302001953, + "learning_rate": 9.839660134301769e-06, + "loss": 0.3973, + "step": 1921 + }, + { + "epoch": 0.026071622354856213, + "grad_norm": 8.800698280334473, + "learning_rate": 9.839523091681514e-06, + "loss": 0.5097, + "step": 1922 + }, + { + "epoch": 0.026085187194791103, + "grad_norm": 8.012166976928711, + "learning_rate": 9.839386049061259e-06, + "loss": 0.4742, + "step": 1923 + }, + { + "epoch": 0.02609875203472599, + "grad_norm": 9.289155960083008, + "learning_rate": 9.839249006441004e-06, + "loss": 0.6062, + "step": 1924 + }, + { + "epoch": 0.02611231687466088, + "grad_norm": 7.874020576477051, + "learning_rate": 9.83911196382075e-06, + "loss": 0.3556, + "step": 1925 + }, + { + "epoch": 0.026125881714595768, + "grad_norm": 7.704112529754639, + "learning_rate": 9.838974921200495e-06, + "loss": 0.3911, + "step": 1926 + }, + { + "epoch": 0.026139446554530658, + "grad_norm": 5.316188335418701, + "learning_rate": 9.83883787858024e-06, + "loss": 0.3214, + "step": 1927 + }, + { + "epoch": 0.026153011394465544, + "grad_norm": 9.345569610595703, + "learning_rate": 9.838700835959985e-06, + "loss": 0.4175, + "step": 1928 + }, + { + "epoch": 0.026166576234400434, + "grad_norm": 6.218289375305176, + "learning_rate": 9.83856379333973e-06, + "loss": 0.4523, + "step": 1929 + }, + { + "epoch": 0.026180141074335323, + "grad_norm": 6.88295841217041, + "learning_rate": 9.838426750719474e-06, + "loss": 0.3263, + "step": 1930 + }, + { + "epoch": 0.026193705914270213, + "grad_norm": 10.315751075744629, + "learning_rate": 9.83828970809922e-06, + "loss": 0.6166, + "step": 1931 + }, + { + "epoch": 0.0262072707542051, + "grad_norm": 7.963667392730713, + "learning_rate": 9.838152665478966e-06, + "loss": 0.5017, + "step": 1932 + }, + { + "epoch": 0.02622083559413999, + "grad_norm": 4.459794521331787, + "learning_rate": 9.838015622858709e-06, + "loss": 0.224, + "step": 1933 + }, + { + "epoch": 0.026234400434074878, + "grad_norm": 5.795046806335449, + "learning_rate": 9.837878580238454e-06, + "loss": 0.3572, + "step": 1934 + }, + { + "epoch": 0.026247965274009768, + "grad_norm": 7.893136501312256, + "learning_rate": 9.837741537618201e-06, + "loss": 0.4787, + "step": 1935 + }, + { + "epoch": 0.026261530113944654, + "grad_norm": 8.1357421875, + "learning_rate": 9.837604494997945e-06, + "loss": 0.3569, + "step": 1936 + }, + { + "epoch": 0.026275094953879544, + "grad_norm": 8.341879844665527, + "learning_rate": 9.83746745237769e-06, + "loss": 0.3853, + "step": 1937 + }, + { + "epoch": 0.026288659793814433, + "grad_norm": 7.780252456665039, + "learning_rate": 9.837330409757435e-06, + "loss": 0.4921, + "step": 1938 + }, + { + "epoch": 0.026302224633749323, + "grad_norm": 7.302733421325684, + "learning_rate": 9.83719336713718e-06, + "loss": 0.4405, + "step": 1939 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 7.216597080230713, + "learning_rate": 9.837056324516925e-06, + "loss": 0.4886, + "step": 1940 + }, + { + "epoch": 0.0263293543136191, + "grad_norm": 6.767823696136475, + "learning_rate": 9.83691928189667e-06, + "loss": 0.408, + "step": 1941 + }, + { + "epoch": 0.02634291915355399, + "grad_norm": 8.951476097106934, + "learning_rate": 9.836782239276416e-06, + "loss": 0.558, + "step": 1942 + }, + { + "epoch": 0.026356483993488878, + "grad_norm": 7.785768032073975, + "learning_rate": 9.836645196656161e-06, + "loss": 0.3452, + "step": 1943 + }, + { + "epoch": 0.026370048833423764, + "grad_norm": 7.565659999847412, + "learning_rate": 9.836508154035906e-06, + "loss": 0.4212, + "step": 1944 + }, + { + "epoch": 0.026383613673358654, + "grad_norm": 12.119254112243652, + "learning_rate": 9.836371111415651e-06, + "loss": 0.5099, + "step": 1945 + }, + { + "epoch": 0.026397178513293543, + "grad_norm": 8.560036659240723, + "learning_rate": 9.836234068795396e-06, + "loss": 0.516, + "step": 1946 + }, + { + "epoch": 0.026410743353228433, + "grad_norm": 9.19582748413086, + "learning_rate": 9.836097026175142e-06, + "loss": 0.5311, + "step": 1947 + }, + { + "epoch": 0.02642430819316332, + "grad_norm": 7.417659282684326, + "learning_rate": 9.835959983554887e-06, + "loss": 0.4231, + "step": 1948 + }, + { + "epoch": 0.02643787303309821, + "grad_norm": 10.7174072265625, + "learning_rate": 9.835822940934632e-06, + "loss": 0.4025, + "step": 1949 + }, + { + "epoch": 0.0264514378730331, + "grad_norm": 7.692173957824707, + "learning_rate": 9.835685898314377e-06, + "loss": 0.4284, + "step": 1950 + }, + { + "epoch": 0.026465002712967988, + "grad_norm": 8.582910537719727, + "learning_rate": 9.83554885569412e-06, + "loss": 0.4331, + "step": 1951 + }, + { + "epoch": 0.026478567552902874, + "grad_norm": 7.210662364959717, + "learning_rate": 9.835411813073866e-06, + "loss": 0.4255, + "step": 1952 + }, + { + "epoch": 0.026492132392837764, + "grad_norm": 8.901037216186523, + "learning_rate": 9.835274770453613e-06, + "loss": 0.5842, + "step": 1953 + }, + { + "epoch": 0.026505697232772654, + "grad_norm": 9.485045433044434, + "learning_rate": 9.835137727833358e-06, + "loss": 0.4985, + "step": 1954 + }, + { + "epoch": 0.026519262072707543, + "grad_norm": 10.74976634979248, + "learning_rate": 9.835000685213101e-06, + "loss": 0.5271, + "step": 1955 + }, + { + "epoch": 0.02653282691264243, + "grad_norm": 6.724417686462402, + "learning_rate": 9.834863642592847e-06, + "loss": 0.431, + "step": 1956 + }, + { + "epoch": 0.02654639175257732, + "grad_norm": 9.180333137512207, + "learning_rate": 9.834726599972593e-06, + "loss": 0.4647, + "step": 1957 + }, + { + "epoch": 0.02655995659251221, + "grad_norm": 11.721661567687988, + "learning_rate": 9.834589557352337e-06, + "loss": 0.7399, + "step": 1958 + }, + { + "epoch": 0.026573521432447098, + "grad_norm": 7.142617225646973, + "learning_rate": 9.834452514732082e-06, + "loss": 0.5153, + "step": 1959 + }, + { + "epoch": 0.026587086272381984, + "grad_norm": 6.350276470184326, + "learning_rate": 9.834315472111827e-06, + "loss": 0.321, + "step": 1960 + }, + { + "epoch": 0.026600651112316874, + "grad_norm": 7.797658920288086, + "learning_rate": 9.834178429491572e-06, + "loss": 0.3243, + "step": 1961 + }, + { + "epoch": 0.026614215952251764, + "grad_norm": 10.1917724609375, + "learning_rate": 9.834041386871318e-06, + "loss": 0.438, + "step": 1962 + }, + { + "epoch": 0.026627780792186653, + "grad_norm": 7.755753040313721, + "learning_rate": 9.833904344251063e-06, + "loss": 0.4598, + "step": 1963 + }, + { + "epoch": 0.02664134563212154, + "grad_norm": 7.441950798034668, + "learning_rate": 9.833767301630808e-06, + "loss": 0.4423, + "step": 1964 + }, + { + "epoch": 0.02665491047205643, + "grad_norm": 7.679107666015625, + "learning_rate": 9.833630259010553e-06, + "loss": 0.4369, + "step": 1965 + }, + { + "epoch": 0.02666847531199132, + "grad_norm": 6.938713073730469, + "learning_rate": 9.833493216390298e-06, + "loss": 0.4324, + "step": 1966 + }, + { + "epoch": 0.02668204015192621, + "grad_norm": 6.919548034667969, + "learning_rate": 9.833356173770043e-06, + "loss": 0.4445, + "step": 1967 + }, + { + "epoch": 0.026695604991861095, + "grad_norm": 7.489215850830078, + "learning_rate": 9.833219131149789e-06, + "loss": 0.415, + "step": 1968 + }, + { + "epoch": 0.026709169831795984, + "grad_norm": 10.017423629760742, + "learning_rate": 9.833082088529534e-06, + "loss": 0.6345, + "step": 1969 + }, + { + "epoch": 0.026722734671730874, + "grad_norm": 7.480905532836914, + "learning_rate": 9.832945045909279e-06, + "loss": 0.4934, + "step": 1970 + }, + { + "epoch": 0.026736299511665763, + "grad_norm": 7.733413219451904, + "learning_rate": 9.832808003289024e-06, + "loss": 0.4933, + "step": 1971 + }, + { + "epoch": 0.02674986435160065, + "grad_norm": 6.711556911468506, + "learning_rate": 9.83267096066877e-06, + "loss": 0.389, + "step": 1972 + }, + { + "epoch": 0.02676342919153554, + "grad_norm": 7.406049728393555, + "learning_rate": 9.832533918048513e-06, + "loss": 0.3691, + "step": 1973 + }, + { + "epoch": 0.02677699403147043, + "grad_norm": 7.939180850982666, + "learning_rate": 9.83239687542826e-06, + "loss": 0.5205, + "step": 1974 + }, + { + "epoch": 0.02679055887140532, + "grad_norm": 9.426094055175781, + "learning_rate": 9.832259832808005e-06, + "loss": 0.4548, + "step": 1975 + }, + { + "epoch": 0.026804123711340205, + "grad_norm": 7.082668304443359, + "learning_rate": 9.832122790187748e-06, + "loss": 0.47, + "step": 1976 + }, + { + "epoch": 0.026817688551275094, + "grad_norm": 9.439332008361816, + "learning_rate": 9.831985747567494e-06, + "loss": 0.5378, + "step": 1977 + }, + { + "epoch": 0.026831253391209984, + "grad_norm": 10.642860412597656, + "learning_rate": 9.831848704947239e-06, + "loss": 0.6679, + "step": 1978 + }, + { + "epoch": 0.026844818231144874, + "grad_norm": 8.541391372680664, + "learning_rate": 9.831711662326986e-06, + "loss": 0.5449, + "step": 1979 + }, + { + "epoch": 0.02685838307107976, + "grad_norm": 10.421525955200195, + "learning_rate": 9.831574619706729e-06, + "loss": 0.528, + "step": 1980 + }, + { + "epoch": 0.02687194791101465, + "grad_norm": 8.833459854125977, + "learning_rate": 9.831437577086474e-06, + "loss": 0.4835, + "step": 1981 + }, + { + "epoch": 0.02688551275094954, + "grad_norm": 9.601861953735352, + "learning_rate": 9.83130053446622e-06, + "loss": 0.6005, + "step": 1982 + }, + { + "epoch": 0.02689907759088443, + "grad_norm": 8.740257263183594, + "learning_rate": 9.831163491845965e-06, + "loss": 0.5996, + "step": 1983 + }, + { + "epoch": 0.026912642430819315, + "grad_norm": 13.184198379516602, + "learning_rate": 9.83102644922571e-06, + "loss": 0.8019, + "step": 1984 + }, + { + "epoch": 0.026926207270754204, + "grad_norm": 9.501500129699707, + "learning_rate": 9.830889406605455e-06, + "loss": 0.6943, + "step": 1985 + }, + { + "epoch": 0.026939772110689094, + "grad_norm": 10.029322624206543, + "learning_rate": 9.8307523639852e-06, + "loss": 0.5851, + "step": 1986 + }, + { + "epoch": 0.026953336950623984, + "grad_norm": 10.042070388793945, + "learning_rate": 9.830615321364945e-06, + "loss": 0.5627, + "step": 1987 + }, + { + "epoch": 0.02696690179055887, + "grad_norm": 9.21120548248291, + "learning_rate": 9.83047827874469e-06, + "loss": 0.594, + "step": 1988 + }, + { + "epoch": 0.02698046663049376, + "grad_norm": 8.58281135559082, + "learning_rate": 9.830341236124436e-06, + "loss": 0.5862, + "step": 1989 + }, + { + "epoch": 0.02699403147042865, + "grad_norm": 8.158546447753906, + "learning_rate": 9.830204193504181e-06, + "loss": 0.5519, + "step": 1990 + }, + { + "epoch": 0.02700759631036354, + "grad_norm": 7.515542030334473, + "learning_rate": 9.830067150883924e-06, + "loss": 0.4271, + "step": 1991 + }, + { + "epoch": 0.027021161150298425, + "grad_norm": 8.197101593017578, + "learning_rate": 9.829930108263671e-06, + "loss": 0.4717, + "step": 1992 + }, + { + "epoch": 0.027034725990233315, + "grad_norm": 7.137418746948242, + "learning_rate": 9.829793065643416e-06, + "loss": 0.4618, + "step": 1993 + }, + { + "epoch": 0.027048290830168204, + "grad_norm": 8.27993106842041, + "learning_rate": 9.829656023023162e-06, + "loss": 0.5781, + "step": 1994 + }, + { + "epoch": 0.027061855670103094, + "grad_norm": 7.296784400939941, + "learning_rate": 9.829518980402905e-06, + "loss": 0.4858, + "step": 1995 + }, + { + "epoch": 0.02707542051003798, + "grad_norm": 10.619623184204102, + "learning_rate": 9.829381937782652e-06, + "loss": 0.6289, + "step": 1996 + }, + { + "epoch": 0.02708898534997287, + "grad_norm": 9.294803619384766, + "learning_rate": 9.829244895162397e-06, + "loss": 0.4783, + "step": 1997 + }, + { + "epoch": 0.02710255018990776, + "grad_norm": 11.273898124694824, + "learning_rate": 9.82910785254214e-06, + "loss": 0.5999, + "step": 1998 + }, + { + "epoch": 0.02711611502984265, + "grad_norm": 8.930468559265137, + "learning_rate": 9.828970809921886e-06, + "loss": 0.5444, + "step": 1999 + }, + { + "epoch": 0.027129679869777535, + "grad_norm": 7.483041763305664, + "learning_rate": 9.828833767301633e-06, + "loss": 0.4451, + "step": 2000 + }, + { + "epoch": 0.027143244709712425, + "grad_norm": 7.063776969909668, + "learning_rate": 9.828696724681376e-06, + "loss": 0.4535, + "step": 2001 + }, + { + "epoch": 0.027156809549647314, + "grad_norm": 7.864805698394775, + "learning_rate": 9.828559682061121e-06, + "loss": 0.4967, + "step": 2002 + }, + { + "epoch": 0.027170374389582204, + "grad_norm": 7.695542812347412, + "learning_rate": 9.828422639440867e-06, + "loss": 0.4794, + "step": 2003 + }, + { + "epoch": 0.02718393922951709, + "grad_norm": 8.28078842163086, + "learning_rate": 9.828285596820613e-06, + "loss": 0.4757, + "step": 2004 + }, + { + "epoch": 0.02719750406945198, + "grad_norm": 7.822886943817139, + "learning_rate": 9.828148554200357e-06, + "loss": 0.4799, + "step": 2005 + }, + { + "epoch": 0.02721106890938687, + "grad_norm": 6.359421253204346, + "learning_rate": 9.828011511580102e-06, + "loss": 0.4083, + "step": 2006 + }, + { + "epoch": 0.02722463374932176, + "grad_norm": 7.711943626403809, + "learning_rate": 9.827874468959847e-06, + "loss": 0.4785, + "step": 2007 + }, + { + "epoch": 0.027238198589256645, + "grad_norm": 6.911449909210205, + "learning_rate": 9.827737426339592e-06, + "loss": 0.2867, + "step": 2008 + }, + { + "epoch": 0.027251763429191535, + "grad_norm": 5.352276802062988, + "learning_rate": 9.827600383719338e-06, + "loss": 0.2958, + "step": 2009 + }, + { + "epoch": 0.027265328269126424, + "grad_norm": 8.874073028564453, + "learning_rate": 9.827463341099083e-06, + "loss": 0.539, + "step": 2010 + }, + { + "epoch": 0.027278893109061314, + "grad_norm": 8.81011962890625, + "learning_rate": 9.827326298478828e-06, + "loss": 0.4567, + "step": 2011 + }, + { + "epoch": 0.0272924579489962, + "grad_norm": 6.895270347595215, + "learning_rate": 9.827189255858573e-06, + "loss": 0.4397, + "step": 2012 + }, + { + "epoch": 0.02730602278893109, + "grad_norm": 7.135530471801758, + "learning_rate": 9.827052213238318e-06, + "loss": 0.4802, + "step": 2013 + }, + { + "epoch": 0.02731958762886598, + "grad_norm": 7.0626444816589355, + "learning_rate": 9.826915170618063e-06, + "loss": 0.4471, + "step": 2014 + }, + { + "epoch": 0.02733315246880087, + "grad_norm": 5.956888198852539, + "learning_rate": 9.826778127997809e-06, + "loss": 0.4046, + "step": 2015 + }, + { + "epoch": 0.027346717308735755, + "grad_norm": 6.187826156616211, + "learning_rate": 9.826641085377552e-06, + "loss": 0.4077, + "step": 2016 + }, + { + "epoch": 0.027360282148670645, + "grad_norm": 7.30269718170166, + "learning_rate": 9.826504042757299e-06, + "loss": 0.295, + "step": 2017 + }, + { + "epoch": 0.027373846988605535, + "grad_norm": 10.587748527526855, + "learning_rate": 9.826367000137044e-06, + "loss": 0.4924, + "step": 2018 + }, + { + "epoch": 0.027387411828540424, + "grad_norm": 8.8760347366333, + "learning_rate": 9.82622995751679e-06, + "loss": 0.4724, + "step": 2019 + }, + { + "epoch": 0.02740097666847531, + "grad_norm": 8.505038261413574, + "learning_rate": 9.826092914896533e-06, + "loss": 0.5911, + "step": 2020 + }, + { + "epoch": 0.0274145415084102, + "grad_norm": 9.843891143798828, + "learning_rate": 9.825955872276278e-06, + "loss": 0.5681, + "step": 2021 + }, + { + "epoch": 0.02742810634834509, + "grad_norm": 8.450545310974121, + "learning_rate": 9.825818829656025e-06, + "loss": 0.6173, + "step": 2022 + }, + { + "epoch": 0.02744167118827998, + "grad_norm": 7.691187381744385, + "learning_rate": 9.825681787035768e-06, + "loss": 0.3564, + "step": 2023 + }, + { + "epoch": 0.027455236028214865, + "grad_norm": 8.051480293273926, + "learning_rate": 9.825544744415514e-06, + "loss": 0.5775, + "step": 2024 + }, + { + "epoch": 0.027468800868149755, + "grad_norm": 8.318070411682129, + "learning_rate": 9.825407701795259e-06, + "loss": 0.5662, + "step": 2025 + }, + { + "epoch": 0.027482365708084645, + "grad_norm": 7.521555423736572, + "learning_rate": 9.825270659175004e-06, + "loss": 0.5947, + "step": 2026 + }, + { + "epoch": 0.027495930548019534, + "grad_norm": 8.4747896194458, + "learning_rate": 9.825133616554749e-06, + "loss": 0.3616, + "step": 2027 + }, + { + "epoch": 0.02750949538795442, + "grad_norm": 10.147942543029785, + "learning_rate": 9.824996573934494e-06, + "loss": 0.9183, + "step": 2028 + }, + { + "epoch": 0.02752306022788931, + "grad_norm": 8.592843055725098, + "learning_rate": 9.82485953131424e-06, + "loss": 0.5405, + "step": 2029 + }, + { + "epoch": 0.0275366250678242, + "grad_norm": 11.708574295043945, + "learning_rate": 9.824722488693985e-06, + "loss": 0.5954, + "step": 2030 + }, + { + "epoch": 0.02755018990775909, + "grad_norm": 6.89867639541626, + "learning_rate": 9.82458544607373e-06, + "loss": 0.3709, + "step": 2031 + }, + { + "epoch": 0.027563754747693976, + "grad_norm": 5.627826690673828, + "learning_rate": 9.824448403453475e-06, + "loss": 0.3938, + "step": 2032 + }, + { + "epoch": 0.027577319587628865, + "grad_norm": 11.456304550170898, + "learning_rate": 9.82431136083322e-06, + "loss": 0.6755, + "step": 2033 + }, + { + "epoch": 0.027590884427563755, + "grad_norm": 10.685464859008789, + "learning_rate": 9.824174318212964e-06, + "loss": 0.6406, + "step": 2034 + }, + { + "epoch": 0.027604449267498644, + "grad_norm": 7.185917377471924, + "learning_rate": 9.82403727559271e-06, + "loss": 0.5264, + "step": 2035 + }, + { + "epoch": 0.02761801410743353, + "grad_norm": 10.949860572814941, + "learning_rate": 9.823900232972456e-06, + "loss": 0.7405, + "step": 2036 + }, + { + "epoch": 0.02763157894736842, + "grad_norm": 10.623370170593262, + "learning_rate": 9.823763190352201e-06, + "loss": 0.5194, + "step": 2037 + }, + { + "epoch": 0.02764514378730331, + "grad_norm": 7.9095563888549805, + "learning_rate": 9.823626147731944e-06, + "loss": 0.5054, + "step": 2038 + }, + { + "epoch": 0.0276587086272382, + "grad_norm": 11.023244857788086, + "learning_rate": 9.823489105111691e-06, + "loss": 0.9024, + "step": 2039 + }, + { + "epoch": 0.027672273467173086, + "grad_norm": 7.786960124969482, + "learning_rate": 9.823352062491436e-06, + "loss": 0.5226, + "step": 2040 + }, + { + "epoch": 0.027685838307107975, + "grad_norm": 5.96430778503418, + "learning_rate": 9.82321501987118e-06, + "loss": 0.4031, + "step": 2041 + }, + { + "epoch": 0.027699403147042865, + "grad_norm": 8.92218017578125, + "learning_rate": 9.823077977250925e-06, + "loss": 0.7372, + "step": 2042 + }, + { + "epoch": 0.027712967986977755, + "grad_norm": 8.09753131866455, + "learning_rate": 9.822940934630672e-06, + "loss": 0.6224, + "step": 2043 + }, + { + "epoch": 0.02772653282691264, + "grad_norm": 7.157499313354492, + "learning_rate": 9.822803892010415e-06, + "loss": 0.5118, + "step": 2044 + }, + { + "epoch": 0.02774009766684753, + "grad_norm": 9.212157249450684, + "learning_rate": 9.82266684939016e-06, + "loss": 0.4974, + "step": 2045 + }, + { + "epoch": 0.02775366250678242, + "grad_norm": 8.72615909576416, + "learning_rate": 9.822529806769906e-06, + "loss": 0.6206, + "step": 2046 + }, + { + "epoch": 0.02776722734671731, + "grad_norm": 6.013213157653809, + "learning_rate": 9.822392764149651e-06, + "loss": 0.4869, + "step": 2047 + }, + { + "epoch": 0.0277807921866522, + "grad_norm": 6.461771011352539, + "learning_rate": 9.822255721529396e-06, + "loss": 0.4087, + "step": 2048 + }, + { + "epoch": 0.027794357026587085, + "grad_norm": 6.504959583282471, + "learning_rate": 9.822118678909141e-06, + "loss": 0.4269, + "step": 2049 + }, + { + "epoch": 0.027807921866521975, + "grad_norm": 7.922749996185303, + "learning_rate": 9.821981636288887e-06, + "loss": 0.4927, + "step": 2050 + }, + { + "epoch": 0.027821486706456865, + "grad_norm": 8.659578323364258, + "learning_rate": 9.821844593668632e-06, + "loss": 0.4579, + "step": 2051 + }, + { + "epoch": 0.027835051546391754, + "grad_norm": 5.8601508140563965, + "learning_rate": 9.821707551048377e-06, + "loss": 0.3317, + "step": 2052 + }, + { + "epoch": 0.02784861638632664, + "grad_norm": 7.136466979980469, + "learning_rate": 9.821570508428122e-06, + "loss": 0.47, + "step": 2053 + }, + { + "epoch": 0.02786218122626153, + "grad_norm": 11.214995384216309, + "learning_rate": 9.821433465807867e-06, + "loss": 0.8746, + "step": 2054 + }, + { + "epoch": 0.02787574606619642, + "grad_norm": 7.112154960632324, + "learning_rate": 9.821296423187612e-06, + "loss": 0.4076, + "step": 2055 + }, + { + "epoch": 0.02788931090613131, + "grad_norm": 8.567448616027832, + "learning_rate": 9.821159380567358e-06, + "loss": 0.4782, + "step": 2056 + }, + { + "epoch": 0.027902875746066196, + "grad_norm": 9.46505069732666, + "learning_rate": 9.821022337947103e-06, + "loss": 0.3999, + "step": 2057 + }, + { + "epoch": 0.027916440586001085, + "grad_norm": 6.942192554473877, + "learning_rate": 9.820885295326848e-06, + "loss": 0.3914, + "step": 2058 + }, + { + "epoch": 0.027930005425935975, + "grad_norm": 8.371646881103516, + "learning_rate": 9.820748252706591e-06, + "loss": 0.388, + "step": 2059 + }, + { + "epoch": 0.027943570265870864, + "grad_norm": 13.543763160705566, + "learning_rate": 9.820611210086337e-06, + "loss": 0.6458, + "step": 2060 + }, + { + "epoch": 0.02795713510580575, + "grad_norm": 8.756617546081543, + "learning_rate": 9.820474167466084e-06, + "loss": 0.4227, + "step": 2061 + }, + { + "epoch": 0.02797069994574064, + "grad_norm": 6.959203243255615, + "learning_rate": 9.820337124845829e-06, + "loss": 0.4654, + "step": 2062 + }, + { + "epoch": 0.02798426478567553, + "grad_norm": 6.287944316864014, + "learning_rate": 9.820200082225572e-06, + "loss": 0.3993, + "step": 2063 + }, + { + "epoch": 0.02799782962561042, + "grad_norm": 9.025793075561523, + "learning_rate": 9.820063039605317e-06, + "loss": 0.5199, + "step": 2064 + }, + { + "epoch": 0.028011394465545306, + "grad_norm": 7.848881721496582, + "learning_rate": 9.819925996985064e-06, + "loss": 0.3572, + "step": 2065 + }, + { + "epoch": 0.028024959305480195, + "grad_norm": 6.605024814605713, + "learning_rate": 9.819788954364808e-06, + "loss": 0.4549, + "step": 2066 + }, + { + "epoch": 0.028038524145415085, + "grad_norm": 8.04798412322998, + "learning_rate": 9.819651911744553e-06, + "loss": 0.4469, + "step": 2067 + }, + { + "epoch": 0.028052088985349975, + "grad_norm": 7.6047468185424805, + "learning_rate": 9.819514869124298e-06, + "loss": 0.5313, + "step": 2068 + }, + { + "epoch": 0.02806565382528486, + "grad_norm": 8.411491394042969, + "learning_rate": 9.819377826504043e-06, + "loss": 0.4322, + "step": 2069 + }, + { + "epoch": 0.02807921866521975, + "grad_norm": 6.674587249755859, + "learning_rate": 9.819240783883788e-06, + "loss": 0.4656, + "step": 2070 + }, + { + "epoch": 0.02809278350515464, + "grad_norm": 6.000911712646484, + "learning_rate": 9.819103741263534e-06, + "loss": 0.3249, + "step": 2071 + }, + { + "epoch": 0.02810634834508953, + "grad_norm": 6.733808994293213, + "learning_rate": 9.818966698643279e-06, + "loss": 0.5412, + "step": 2072 + }, + { + "epoch": 0.028119913185024416, + "grad_norm": 6.848334312438965, + "learning_rate": 9.818829656023024e-06, + "loss": 0.404, + "step": 2073 + }, + { + "epoch": 0.028133478024959305, + "grad_norm": 6.876999855041504, + "learning_rate": 9.818692613402769e-06, + "loss": 0.3341, + "step": 2074 + }, + { + "epoch": 0.028147042864894195, + "grad_norm": 6.94787073135376, + "learning_rate": 9.818555570782514e-06, + "loss": 0.3818, + "step": 2075 + }, + { + "epoch": 0.028160607704829085, + "grad_norm": 6.569721221923828, + "learning_rate": 9.81841852816226e-06, + "loss": 0.3089, + "step": 2076 + }, + { + "epoch": 0.02817417254476397, + "grad_norm": 8.712794303894043, + "learning_rate": 9.818281485542005e-06, + "loss": 0.4491, + "step": 2077 + }, + { + "epoch": 0.02818773738469886, + "grad_norm": 9.00017261505127, + "learning_rate": 9.81814444292175e-06, + "loss": 0.5822, + "step": 2078 + }, + { + "epoch": 0.02820130222463375, + "grad_norm": 4.432340145111084, + "learning_rate": 9.818007400301495e-06, + "loss": 0.2901, + "step": 2079 + }, + { + "epoch": 0.02821486706456864, + "grad_norm": 7.9513092041015625, + "learning_rate": 9.81787035768124e-06, + "loss": 0.4445, + "step": 2080 + }, + { + "epoch": 0.028228431904503526, + "grad_norm": 7.732169151306152, + "learning_rate": 9.817733315060984e-06, + "loss": 0.5368, + "step": 2081 + }, + { + "epoch": 0.028241996744438416, + "grad_norm": 8.893549919128418, + "learning_rate": 9.81759627244073e-06, + "loss": 0.4249, + "step": 2082 + }, + { + "epoch": 0.028255561584373305, + "grad_norm": 8.477787017822266, + "learning_rate": 9.817459229820476e-06, + "loss": 0.5286, + "step": 2083 + }, + { + "epoch": 0.028269126424308195, + "grad_norm": 10.00739574432373, + "learning_rate": 9.81732218720022e-06, + "loss": 0.5222, + "step": 2084 + }, + { + "epoch": 0.02828269126424308, + "grad_norm": 7.721795558929443, + "learning_rate": 9.817185144579964e-06, + "loss": 0.5228, + "step": 2085 + }, + { + "epoch": 0.02829625610417797, + "grad_norm": 7.207523345947266, + "learning_rate": 9.817048101959711e-06, + "loss": 0.4691, + "step": 2086 + }, + { + "epoch": 0.02830982094411286, + "grad_norm": 7.267091274261475, + "learning_rate": 9.816911059339456e-06, + "loss": 0.3596, + "step": 2087 + }, + { + "epoch": 0.02832338578404775, + "grad_norm": 6.506495952606201, + "learning_rate": 9.8167740167192e-06, + "loss": 0.3933, + "step": 2088 + }, + { + "epoch": 0.028336950623982636, + "grad_norm": 8.37509536743164, + "learning_rate": 9.816636974098945e-06, + "loss": 0.4217, + "step": 2089 + }, + { + "epoch": 0.028350515463917526, + "grad_norm": 8.782181739807129, + "learning_rate": 9.81649993147869e-06, + "loss": 0.4387, + "step": 2090 + }, + { + "epoch": 0.028364080303852415, + "grad_norm": 7.55718469619751, + "learning_rate": 9.816362888858435e-06, + "loss": 0.3816, + "step": 2091 + }, + { + "epoch": 0.028377645143787305, + "grad_norm": 6.185057640075684, + "learning_rate": 9.81622584623818e-06, + "loss": 0.3428, + "step": 2092 + }, + { + "epoch": 0.02839120998372219, + "grad_norm": 7.616037845611572, + "learning_rate": 9.816088803617926e-06, + "loss": 0.4978, + "step": 2093 + }, + { + "epoch": 0.02840477482365708, + "grad_norm": 9.556238174438477, + "learning_rate": 9.815951760997671e-06, + "loss": 0.5639, + "step": 2094 + }, + { + "epoch": 0.02841833966359197, + "grad_norm": 9.778520584106445, + "learning_rate": 9.815814718377416e-06, + "loss": 0.4915, + "step": 2095 + }, + { + "epoch": 0.02843190450352686, + "grad_norm": 7.069403648376465, + "learning_rate": 9.815677675757161e-06, + "loss": 0.4157, + "step": 2096 + }, + { + "epoch": 0.028445469343461746, + "grad_norm": 8.993762016296387, + "learning_rate": 9.815540633136907e-06, + "loss": 0.549, + "step": 2097 + }, + { + "epoch": 0.028459034183396636, + "grad_norm": 9.234196662902832, + "learning_rate": 9.815403590516652e-06, + "loss": 0.4775, + "step": 2098 + }, + { + "epoch": 0.028472599023331525, + "grad_norm": 8.55200481414795, + "learning_rate": 9.815266547896397e-06, + "loss": 0.5154, + "step": 2099 + }, + { + "epoch": 0.028486163863266415, + "grad_norm": 8.749760627746582, + "learning_rate": 9.815129505276142e-06, + "loss": 0.2848, + "step": 2100 + }, + { + "epoch": 0.0284997287032013, + "grad_norm": 10.50694751739502, + "learning_rate": 9.814992462655887e-06, + "loss": 0.53, + "step": 2101 + }, + { + "epoch": 0.02851329354313619, + "grad_norm": 9.22954273223877, + "learning_rate": 9.814855420035632e-06, + "loss": 0.392, + "step": 2102 + }, + { + "epoch": 0.02852685838307108, + "grad_norm": 8.607038497924805, + "learning_rate": 9.814718377415376e-06, + "loss": 0.5229, + "step": 2103 + }, + { + "epoch": 0.02854042322300597, + "grad_norm": 7.133881568908691, + "learning_rate": 9.814581334795123e-06, + "loss": 0.3698, + "step": 2104 + }, + { + "epoch": 0.028553988062940856, + "grad_norm": 8.807280540466309, + "learning_rate": 9.814444292174868e-06, + "loss": 0.5807, + "step": 2105 + }, + { + "epoch": 0.028567552902875746, + "grad_norm": 7.420010089874268, + "learning_rate": 9.814307249554611e-06, + "loss": 0.4282, + "step": 2106 + }, + { + "epoch": 0.028581117742810636, + "grad_norm": 8.718269348144531, + "learning_rate": 9.814170206934357e-06, + "loss": 0.5092, + "step": 2107 + }, + { + "epoch": 0.028594682582745525, + "grad_norm": 8.490839004516602, + "learning_rate": 9.814033164314104e-06, + "loss": 0.5448, + "step": 2108 + }, + { + "epoch": 0.02860824742268041, + "grad_norm": 8.689910888671875, + "learning_rate": 9.813896121693847e-06, + "loss": 0.389, + "step": 2109 + }, + { + "epoch": 0.0286218122626153, + "grad_norm": 11.769254684448242, + "learning_rate": 9.813759079073592e-06, + "loss": 0.6751, + "step": 2110 + }, + { + "epoch": 0.02863537710255019, + "grad_norm": 9.539691925048828, + "learning_rate": 9.813622036453337e-06, + "loss": 0.3824, + "step": 2111 + }, + { + "epoch": 0.02864894194248508, + "grad_norm": 9.793550491333008, + "learning_rate": 9.813484993833083e-06, + "loss": 0.5034, + "step": 2112 + }, + { + "epoch": 0.028662506782419966, + "grad_norm": 8.832734107971191, + "learning_rate": 9.813347951212828e-06, + "loss": 0.3995, + "step": 2113 + }, + { + "epoch": 0.028676071622354856, + "grad_norm": 9.005946159362793, + "learning_rate": 9.813210908592573e-06, + "loss": 0.5074, + "step": 2114 + }, + { + "epoch": 0.028689636462289746, + "grad_norm": 10.138792037963867, + "learning_rate": 9.813073865972318e-06, + "loss": 0.5778, + "step": 2115 + }, + { + "epoch": 0.028703201302224635, + "grad_norm": 9.510467529296875, + "learning_rate": 9.812936823352063e-06, + "loss": 0.5267, + "step": 2116 + }, + { + "epoch": 0.02871676614215952, + "grad_norm": 11.294746398925781, + "learning_rate": 9.812799780731808e-06, + "loss": 0.7965, + "step": 2117 + }, + { + "epoch": 0.02873033098209441, + "grad_norm": 8.62086296081543, + "learning_rate": 9.812662738111554e-06, + "loss": 0.4194, + "step": 2118 + }, + { + "epoch": 0.0287438958220293, + "grad_norm": 10.78759765625, + "learning_rate": 9.812525695491299e-06, + "loss": 0.604, + "step": 2119 + }, + { + "epoch": 0.02875746066196419, + "grad_norm": 25.59532356262207, + "learning_rate": 9.812388652871044e-06, + "loss": 0.3516, + "step": 2120 + }, + { + "epoch": 0.028771025501899077, + "grad_norm": 9.999187469482422, + "learning_rate": 9.81225161025079e-06, + "loss": 0.6214, + "step": 2121 + }, + { + "epoch": 0.028784590341833966, + "grad_norm": 7.349400043487549, + "learning_rate": 9.812114567630534e-06, + "loss": 0.341, + "step": 2122 + }, + { + "epoch": 0.028798155181768856, + "grad_norm": 8.366490364074707, + "learning_rate": 9.81197752501028e-06, + "loss": 0.4919, + "step": 2123 + }, + { + "epoch": 0.028811720021703745, + "grad_norm": 9.633296012878418, + "learning_rate": 9.811840482390023e-06, + "loss": 0.6729, + "step": 2124 + }, + { + "epoch": 0.02882528486163863, + "grad_norm": 9.189534187316895, + "learning_rate": 9.81170343976977e-06, + "loss": 0.5591, + "step": 2125 + }, + { + "epoch": 0.02883884970157352, + "grad_norm": 9.76693344116211, + "learning_rate": 9.811566397149515e-06, + "loss": 0.5726, + "step": 2126 + }, + { + "epoch": 0.02885241454150841, + "grad_norm": 9.259537696838379, + "learning_rate": 9.811429354529259e-06, + "loss": 0.5837, + "step": 2127 + }, + { + "epoch": 0.0288659793814433, + "grad_norm": 9.750483512878418, + "learning_rate": 9.811292311909004e-06, + "loss": 0.7082, + "step": 2128 + }, + { + "epoch": 0.028879544221378187, + "grad_norm": 6.666791915893555, + "learning_rate": 9.811155269288749e-06, + "loss": 0.3975, + "step": 2129 + }, + { + "epoch": 0.028893109061313076, + "grad_norm": 8.596198081970215, + "learning_rate": 9.811018226668496e-06, + "loss": 0.4494, + "step": 2130 + }, + { + "epoch": 0.028906673901247966, + "grad_norm": 7.922978401184082, + "learning_rate": 9.81088118404824e-06, + "loss": 0.5085, + "step": 2131 + }, + { + "epoch": 0.028920238741182856, + "grad_norm": 8.323443412780762, + "learning_rate": 9.810744141427984e-06, + "loss": 0.4305, + "step": 2132 + }, + { + "epoch": 0.02893380358111774, + "grad_norm": 9.101943016052246, + "learning_rate": 9.81060709880773e-06, + "loss": 0.3483, + "step": 2133 + }, + { + "epoch": 0.02894736842105263, + "grad_norm": 6.445491313934326, + "learning_rate": 9.810470056187475e-06, + "loss": 0.3408, + "step": 2134 + }, + { + "epoch": 0.02896093326098752, + "grad_norm": 6.489782810211182, + "learning_rate": 9.81033301356722e-06, + "loss": 0.4466, + "step": 2135 + }, + { + "epoch": 0.02897449810092241, + "grad_norm": 10.018720626831055, + "learning_rate": 9.810195970946965e-06, + "loss": 0.4835, + "step": 2136 + }, + { + "epoch": 0.028988062940857297, + "grad_norm": 8.293522834777832, + "learning_rate": 9.81005892832671e-06, + "loss": 0.5543, + "step": 2137 + }, + { + "epoch": 0.029001627780792186, + "grad_norm": 11.041016578674316, + "learning_rate": 9.809921885706456e-06, + "loss": 0.5851, + "step": 2138 + }, + { + "epoch": 0.029015192620727076, + "grad_norm": 7.387391090393066, + "learning_rate": 9.8097848430862e-06, + "loss": 0.4656, + "step": 2139 + }, + { + "epoch": 0.029028757460661966, + "grad_norm": 6.481193542480469, + "learning_rate": 9.809647800465946e-06, + "loss": 0.4566, + "step": 2140 + }, + { + "epoch": 0.029042322300596852, + "grad_norm": 8.885249137878418, + "learning_rate": 9.809510757845691e-06, + "loss": 0.4445, + "step": 2141 + }, + { + "epoch": 0.02905588714053174, + "grad_norm": 7.59909200668335, + "learning_rate": 9.809373715225436e-06, + "loss": 0.385, + "step": 2142 + }, + { + "epoch": 0.02906945198046663, + "grad_norm": 7.446737289428711, + "learning_rate": 9.809236672605181e-06, + "loss": 0.458, + "step": 2143 + }, + { + "epoch": 0.02908301682040152, + "grad_norm": 5.675897598266602, + "learning_rate": 9.809099629984927e-06, + "loss": 0.441, + "step": 2144 + }, + { + "epoch": 0.029096581660336407, + "grad_norm": 6.516537189483643, + "learning_rate": 9.808962587364672e-06, + "loss": 0.3946, + "step": 2145 + }, + { + "epoch": 0.029110146500271297, + "grad_norm": 12.794913291931152, + "learning_rate": 9.808825544744415e-06, + "loss": 0.4665, + "step": 2146 + }, + { + "epoch": 0.029123711340206186, + "grad_norm": 7.063620090484619, + "learning_rate": 9.808688502124162e-06, + "loss": 0.3895, + "step": 2147 + }, + { + "epoch": 0.029137276180141076, + "grad_norm": 6.9005231857299805, + "learning_rate": 9.808551459503907e-06, + "loss": 0.3933, + "step": 2148 + }, + { + "epoch": 0.029150841020075962, + "grad_norm": 5.196944236755371, + "learning_rate": 9.80841441688365e-06, + "loss": 0.2877, + "step": 2149 + }, + { + "epoch": 0.02916440586001085, + "grad_norm": 6.374208927154541, + "learning_rate": 9.808277374263396e-06, + "loss": 0.2925, + "step": 2150 + }, + { + "epoch": 0.02917797069994574, + "grad_norm": 7.556048393249512, + "learning_rate": 9.808140331643143e-06, + "loss": 0.4883, + "step": 2151 + }, + { + "epoch": 0.02919153553988063, + "grad_norm": 8.038227081298828, + "learning_rate": 9.808003289022886e-06, + "loss": 0.3594, + "step": 2152 + }, + { + "epoch": 0.029205100379815517, + "grad_norm": 7.616580486297607, + "learning_rate": 9.807866246402631e-06, + "loss": 0.4047, + "step": 2153 + }, + { + "epoch": 0.029218665219750407, + "grad_norm": 7.61851167678833, + "learning_rate": 9.807729203782377e-06, + "loss": 0.3985, + "step": 2154 + }, + { + "epoch": 0.029232230059685296, + "grad_norm": 9.345929145812988, + "learning_rate": 9.807592161162124e-06, + "loss": 0.3825, + "step": 2155 + }, + { + "epoch": 0.029245794899620186, + "grad_norm": 11.137937545776367, + "learning_rate": 9.807455118541867e-06, + "loss": 0.5986, + "step": 2156 + }, + { + "epoch": 0.029259359739555072, + "grad_norm": 7.348034381866455, + "learning_rate": 9.807318075921612e-06, + "loss": 0.4781, + "step": 2157 + }, + { + "epoch": 0.029272924579489962, + "grad_norm": 8.785333633422852, + "learning_rate": 9.807181033301357e-06, + "loss": 0.4606, + "step": 2158 + }, + { + "epoch": 0.02928648941942485, + "grad_norm": 7.535184860229492, + "learning_rate": 9.807043990681103e-06, + "loss": 0.4667, + "step": 2159 + }, + { + "epoch": 0.02930005425935974, + "grad_norm": 7.102223873138428, + "learning_rate": 9.806906948060848e-06, + "loss": 0.4271, + "step": 2160 + }, + { + "epoch": 0.029313619099294627, + "grad_norm": 7.109066963195801, + "learning_rate": 9.806769905440593e-06, + "loss": 0.4194, + "step": 2161 + }, + { + "epoch": 0.029327183939229517, + "grad_norm": 7.917111873626709, + "learning_rate": 9.806632862820338e-06, + "loss": 0.4362, + "step": 2162 + }, + { + "epoch": 0.029340748779164406, + "grad_norm": 8.662622451782227, + "learning_rate": 9.806495820200083e-06, + "loss": 0.4151, + "step": 2163 + }, + { + "epoch": 0.029354313619099296, + "grad_norm": 6.46608829498291, + "learning_rate": 9.806358777579828e-06, + "loss": 0.3857, + "step": 2164 + }, + { + "epoch": 0.029367878459034182, + "grad_norm": 5.8811845779418945, + "learning_rate": 9.806221734959574e-06, + "loss": 0.2996, + "step": 2165 + }, + { + "epoch": 0.029381443298969072, + "grad_norm": 10.392623901367188, + "learning_rate": 9.806084692339319e-06, + "loss": 0.6999, + "step": 2166 + }, + { + "epoch": 0.02939500813890396, + "grad_norm": 6.370218753814697, + "learning_rate": 9.805947649719062e-06, + "loss": 0.4297, + "step": 2167 + }, + { + "epoch": 0.02940857297883885, + "grad_norm": 8.179716110229492, + "learning_rate": 9.80581060709881e-06, + "loss": 0.4217, + "step": 2168 + }, + { + "epoch": 0.029422137818773737, + "grad_norm": 11.685245513916016, + "learning_rate": 9.805673564478554e-06, + "loss": 0.4623, + "step": 2169 + }, + { + "epoch": 0.029435702658708627, + "grad_norm": 8.7775239944458, + "learning_rate": 9.8055365218583e-06, + "loss": 0.4665, + "step": 2170 + }, + { + "epoch": 0.029449267498643517, + "grad_norm": 10.319657325744629, + "learning_rate": 9.805399479238043e-06, + "loss": 0.6292, + "step": 2171 + }, + { + "epoch": 0.029462832338578406, + "grad_norm": 8.907188415527344, + "learning_rate": 9.805262436617788e-06, + "loss": 0.464, + "step": 2172 + }, + { + "epoch": 0.029476397178513292, + "grad_norm": 10.170321464538574, + "learning_rate": 9.805125393997535e-06, + "loss": 0.5148, + "step": 2173 + }, + { + "epoch": 0.029489962018448182, + "grad_norm": 8.316375732421875, + "learning_rate": 9.804988351377279e-06, + "loss": 0.6379, + "step": 2174 + }, + { + "epoch": 0.02950352685838307, + "grad_norm": 7.918049335479736, + "learning_rate": 9.804851308757024e-06, + "loss": 0.4739, + "step": 2175 + }, + { + "epoch": 0.02951709169831796, + "grad_norm": 7.706417560577393, + "learning_rate": 9.804714266136769e-06, + "loss": 0.6031, + "step": 2176 + }, + { + "epoch": 0.029530656538252847, + "grad_norm": 8.314271926879883, + "learning_rate": 9.804577223516514e-06, + "loss": 0.4417, + "step": 2177 + }, + { + "epoch": 0.029544221378187737, + "grad_norm": 8.460247993469238, + "learning_rate": 9.80444018089626e-06, + "loss": 0.4503, + "step": 2178 + }, + { + "epoch": 0.029557786218122627, + "grad_norm": 8.907812118530273, + "learning_rate": 9.804303138276004e-06, + "loss": 0.4985, + "step": 2179 + }, + { + "epoch": 0.029571351058057516, + "grad_norm": 7.647563934326172, + "learning_rate": 9.80416609565575e-06, + "loss": 0.4053, + "step": 2180 + }, + { + "epoch": 0.029584915897992402, + "grad_norm": 5.597618579864502, + "learning_rate": 9.804029053035495e-06, + "loss": 0.3109, + "step": 2181 + }, + { + "epoch": 0.029598480737927292, + "grad_norm": 7.514339923858643, + "learning_rate": 9.80389201041524e-06, + "loss": 0.3499, + "step": 2182 + }, + { + "epoch": 0.029612045577862182, + "grad_norm": 7.710766315460205, + "learning_rate": 9.803754967794985e-06, + "loss": 0.5095, + "step": 2183 + }, + { + "epoch": 0.02962561041779707, + "grad_norm": 6.322748184204102, + "learning_rate": 9.80361792517473e-06, + "loss": 0.3844, + "step": 2184 + }, + { + "epoch": 0.029639175257731958, + "grad_norm": 10.536938667297363, + "learning_rate": 9.803480882554476e-06, + "loss": 0.579, + "step": 2185 + }, + { + "epoch": 0.029652740097666847, + "grad_norm": 10.0978364944458, + "learning_rate": 9.80334383993422e-06, + "loss": 0.6163, + "step": 2186 + }, + { + "epoch": 0.029666304937601737, + "grad_norm": 9.807483673095703, + "learning_rate": 9.803206797313966e-06, + "loss": 0.5858, + "step": 2187 + }, + { + "epoch": 0.029679869777536626, + "grad_norm": 8.207659721374512, + "learning_rate": 9.803069754693711e-06, + "loss": 0.4932, + "step": 2188 + }, + { + "epoch": 0.029693434617471513, + "grad_norm": 8.372005462646484, + "learning_rate": 9.802932712073455e-06, + "loss": 0.5703, + "step": 2189 + }, + { + "epoch": 0.029706999457406402, + "grad_norm": 8.212625503540039, + "learning_rate": 9.802795669453201e-06, + "loss": 0.4878, + "step": 2190 + }, + { + "epoch": 0.029720564297341292, + "grad_norm": 7.103729248046875, + "learning_rate": 9.802658626832947e-06, + "loss": 0.3972, + "step": 2191 + }, + { + "epoch": 0.02973412913727618, + "grad_norm": 7.14306640625, + "learning_rate": 9.80252158421269e-06, + "loss": 0.4028, + "step": 2192 + }, + { + "epoch": 0.029747693977211068, + "grad_norm": 7.102175235748291, + "learning_rate": 9.802384541592435e-06, + "loss": 0.4872, + "step": 2193 + }, + { + "epoch": 0.029761258817145957, + "grad_norm": 6.5758161544799805, + "learning_rate": 9.802247498972182e-06, + "loss": 0.3543, + "step": 2194 + }, + { + "epoch": 0.029774823657080847, + "grad_norm": 5.351612567901611, + "learning_rate": 9.802110456351927e-06, + "loss": 0.3671, + "step": 2195 + }, + { + "epoch": 0.029788388497015737, + "grad_norm": 8.146379470825195, + "learning_rate": 9.80197341373167e-06, + "loss": 0.5065, + "step": 2196 + }, + { + "epoch": 0.029801953336950623, + "grad_norm": 6.802483558654785, + "learning_rate": 9.801836371111416e-06, + "loss": 0.4025, + "step": 2197 + }, + { + "epoch": 0.029815518176885512, + "grad_norm": 5.814709186553955, + "learning_rate": 9.801699328491161e-06, + "loss": 0.3556, + "step": 2198 + }, + { + "epoch": 0.029829083016820402, + "grad_norm": 7.450861930847168, + "learning_rate": 9.801562285870906e-06, + "loss": 0.4719, + "step": 2199 + }, + { + "epoch": 0.02984264785675529, + "grad_norm": 7.961710453033447, + "learning_rate": 9.801425243250652e-06, + "loss": 0.3911, + "step": 2200 + }, + { + "epoch": 0.029856212696690178, + "grad_norm": 8.192008018493652, + "learning_rate": 9.801288200630397e-06, + "loss": 0.5058, + "step": 2201 + }, + { + "epoch": 0.029869777536625067, + "grad_norm": 11.230451583862305, + "learning_rate": 9.801151158010142e-06, + "loss": 0.6366, + "step": 2202 + }, + { + "epoch": 0.029883342376559957, + "grad_norm": 8.100739479064941, + "learning_rate": 9.801014115389887e-06, + "loss": 0.4144, + "step": 2203 + }, + { + "epoch": 0.029896907216494847, + "grad_norm": 10.978452682495117, + "learning_rate": 9.800877072769632e-06, + "loss": 0.4183, + "step": 2204 + }, + { + "epoch": 0.029910472056429733, + "grad_norm": 8.023785591125488, + "learning_rate": 9.800740030149377e-06, + "loss": 0.4916, + "step": 2205 + }, + { + "epoch": 0.029924036896364622, + "grad_norm": 6.237468242645264, + "learning_rate": 9.800602987529123e-06, + "loss": 0.3374, + "step": 2206 + }, + { + "epoch": 0.029937601736299512, + "grad_norm": 7.017930507659912, + "learning_rate": 9.800465944908868e-06, + "loss": 0.4195, + "step": 2207 + }, + { + "epoch": 0.029951166576234402, + "grad_norm": 7.397285461425781, + "learning_rate": 9.800328902288613e-06, + "loss": 0.4862, + "step": 2208 + }, + { + "epoch": 0.029964731416169288, + "grad_norm": 6.56115198135376, + "learning_rate": 9.800191859668358e-06, + "loss": 0.4301, + "step": 2209 + }, + { + "epoch": 0.029978296256104178, + "grad_norm": 7.520558834075928, + "learning_rate": 9.800054817048103e-06, + "loss": 0.5281, + "step": 2210 + }, + { + "epoch": 0.029991861096039067, + "grad_norm": 8.068503379821777, + "learning_rate": 9.799917774427847e-06, + "loss": 0.4874, + "step": 2211 + }, + { + "epoch": 0.030005425935973957, + "grad_norm": 5.639297962188721, + "learning_rate": 9.799780731807594e-06, + "loss": 0.3452, + "step": 2212 + }, + { + "epoch": 0.030018990775908843, + "grad_norm": 8.899269104003906, + "learning_rate": 9.799643689187339e-06, + "loss": 0.4678, + "step": 2213 + }, + { + "epoch": 0.030032555615843733, + "grad_norm": 9.969809532165527, + "learning_rate": 9.799506646567082e-06, + "loss": 0.7233, + "step": 2214 + }, + { + "epoch": 0.030046120455778622, + "grad_norm": 6.89511251449585, + "learning_rate": 9.799369603946828e-06, + "loss": 0.3712, + "step": 2215 + }, + { + "epoch": 0.030059685295713512, + "grad_norm": 5.7441086769104, + "learning_rate": 9.799232561326574e-06, + "loss": 0.4397, + "step": 2216 + }, + { + "epoch": 0.030073250135648398, + "grad_norm": 5.628518581390381, + "learning_rate": 9.799095518706318e-06, + "loss": 0.3225, + "step": 2217 + }, + { + "epoch": 0.030086814975583288, + "grad_norm": 4.850487232208252, + "learning_rate": 9.798958476086063e-06, + "loss": 0.3199, + "step": 2218 + }, + { + "epoch": 0.030100379815518177, + "grad_norm": 7.821817398071289, + "learning_rate": 9.798821433465808e-06, + "loss": 0.4984, + "step": 2219 + }, + { + "epoch": 0.030113944655453067, + "grad_norm": 7.856429576873779, + "learning_rate": 9.798684390845553e-06, + "loss": 0.3947, + "step": 2220 + }, + { + "epoch": 0.030127509495387953, + "grad_norm": 8.045611381530762, + "learning_rate": 9.798547348225299e-06, + "loss": 0.4638, + "step": 2221 + }, + { + "epoch": 0.030141074335322843, + "grad_norm": 6.6241455078125, + "learning_rate": 9.798410305605044e-06, + "loss": 0.5669, + "step": 2222 + }, + { + "epoch": 0.030154639175257732, + "grad_norm": 8.82223129272461, + "learning_rate": 9.798273262984789e-06, + "loss": 0.453, + "step": 2223 + }, + { + "epoch": 0.030168204015192622, + "grad_norm": 7.5670485496521, + "learning_rate": 9.798136220364534e-06, + "loss": 0.4064, + "step": 2224 + }, + { + "epoch": 0.030181768855127508, + "grad_norm": 8.467988967895508, + "learning_rate": 9.79799917774428e-06, + "loss": 0.5346, + "step": 2225 + }, + { + "epoch": 0.030195333695062398, + "grad_norm": 8.231279373168945, + "learning_rate": 9.797862135124024e-06, + "loss": 0.5522, + "step": 2226 + }, + { + "epoch": 0.030208898534997287, + "grad_norm": 8.510737419128418, + "learning_rate": 9.79772509250377e-06, + "loss": 0.5652, + "step": 2227 + }, + { + "epoch": 0.030222463374932177, + "grad_norm": 9.737093925476074, + "learning_rate": 9.797588049883515e-06, + "loss": 0.6927, + "step": 2228 + }, + { + "epoch": 0.030236028214867063, + "grad_norm": 8.630640029907227, + "learning_rate": 9.79745100726326e-06, + "loss": 0.5007, + "step": 2229 + }, + { + "epoch": 0.030249593054801953, + "grad_norm": 10.390735626220703, + "learning_rate": 9.797313964643005e-06, + "loss": 0.4877, + "step": 2230 + }, + { + "epoch": 0.030263157894736843, + "grad_norm": 8.383161544799805, + "learning_rate": 9.79717692202275e-06, + "loss": 0.547, + "step": 2231 + }, + { + "epoch": 0.030276722734671732, + "grad_norm": 8.676321029663086, + "learning_rate": 9.797039879402494e-06, + "loss": 0.5753, + "step": 2232 + }, + { + "epoch": 0.03029028757460662, + "grad_norm": 7.566898345947266, + "learning_rate": 9.79690283678224e-06, + "loss": 0.6662, + "step": 2233 + }, + { + "epoch": 0.030303852414541508, + "grad_norm": 6.034102916717529, + "learning_rate": 9.796765794161986e-06, + "loss": 0.363, + "step": 2234 + }, + { + "epoch": 0.030317417254476398, + "grad_norm": 5.795101165771484, + "learning_rate": 9.79662875154173e-06, + "loss": 0.4323, + "step": 2235 + }, + { + "epoch": 0.030330982094411287, + "grad_norm": 6.2028985023498535, + "learning_rate": 9.796491708921475e-06, + "loss": 0.4921, + "step": 2236 + }, + { + "epoch": 0.030344546934346173, + "grad_norm": 8.064584732055664, + "learning_rate": 9.796354666301221e-06, + "loss": 0.4963, + "step": 2237 + }, + { + "epoch": 0.030358111774281063, + "grad_norm": 7.8897624015808105, + "learning_rate": 9.796217623680967e-06, + "loss": 0.4469, + "step": 2238 + }, + { + "epoch": 0.030371676614215953, + "grad_norm": 8.353468894958496, + "learning_rate": 9.79608058106071e-06, + "loss": 0.5563, + "step": 2239 + }, + { + "epoch": 0.030385241454150842, + "grad_norm": 6.22549295425415, + "learning_rate": 9.795943538440455e-06, + "loss": 0.413, + "step": 2240 + }, + { + "epoch": 0.03039880629408573, + "grad_norm": 9.328463554382324, + "learning_rate": 9.7958064958202e-06, + "loss": 0.5494, + "step": 2241 + }, + { + "epoch": 0.030412371134020618, + "grad_norm": 6.968032360076904, + "learning_rate": 9.795669453199946e-06, + "loss": 0.3497, + "step": 2242 + }, + { + "epoch": 0.030425935973955508, + "grad_norm": 6.105712413787842, + "learning_rate": 9.79553241057969e-06, + "loss": 0.3656, + "step": 2243 + }, + { + "epoch": 0.030439500813890397, + "grad_norm": 7.358572006225586, + "learning_rate": 9.795395367959436e-06, + "loss": 0.4569, + "step": 2244 + }, + { + "epoch": 0.030453065653825283, + "grad_norm": 8.833179473876953, + "learning_rate": 9.795258325339181e-06, + "loss": 0.4599, + "step": 2245 + }, + { + "epoch": 0.030466630493760173, + "grad_norm": 7.525769233703613, + "learning_rate": 9.795121282718926e-06, + "loss": 0.3748, + "step": 2246 + }, + { + "epoch": 0.030480195333695063, + "grad_norm": 8.511932373046875, + "learning_rate": 9.794984240098672e-06, + "loss": 0.5218, + "step": 2247 + }, + { + "epoch": 0.030493760173629952, + "grad_norm": 7.897761821746826, + "learning_rate": 9.794847197478417e-06, + "loss": 0.4545, + "step": 2248 + }, + { + "epoch": 0.03050732501356484, + "grad_norm": 6.1534881591796875, + "learning_rate": 9.794710154858162e-06, + "loss": 0.4804, + "step": 2249 + }, + { + "epoch": 0.030520889853499728, + "grad_norm": 8.433366775512695, + "learning_rate": 9.794573112237907e-06, + "loss": 0.5304, + "step": 2250 + }, + { + "epoch": 0.030534454693434618, + "grad_norm": 8.351217269897461, + "learning_rate": 9.794436069617652e-06, + "loss": 0.4634, + "step": 2251 + }, + { + "epoch": 0.030548019533369507, + "grad_norm": 7.2773895263671875, + "learning_rate": 9.794299026997397e-06, + "loss": 0.4746, + "step": 2252 + }, + { + "epoch": 0.030561584373304394, + "grad_norm": 9.009844779968262, + "learning_rate": 9.794161984377143e-06, + "loss": 0.661, + "step": 2253 + }, + { + "epoch": 0.030575149213239283, + "grad_norm": 6.779384136199951, + "learning_rate": 9.794024941756886e-06, + "loss": 0.5563, + "step": 2254 + }, + { + "epoch": 0.030588714053174173, + "grad_norm": 6.640360355377197, + "learning_rate": 9.793887899136633e-06, + "loss": 0.3067, + "step": 2255 + }, + { + "epoch": 0.030602278893109063, + "grad_norm": 8.146496772766113, + "learning_rate": 9.793750856516378e-06, + "loss": 0.391, + "step": 2256 + }, + { + "epoch": 0.03061584373304395, + "grad_norm": 8.719143867492676, + "learning_rate": 9.793613813896122e-06, + "loss": 0.4495, + "step": 2257 + }, + { + "epoch": 0.03062940857297884, + "grad_norm": 6.584810733795166, + "learning_rate": 9.793476771275867e-06, + "loss": 0.4196, + "step": 2258 + }, + { + "epoch": 0.030642973412913728, + "grad_norm": 6.976583957672119, + "learning_rate": 9.793339728655614e-06, + "loss": 0.4883, + "step": 2259 + }, + { + "epoch": 0.030656538252848618, + "grad_norm": 7.167845249176025, + "learning_rate": 9.793202686035357e-06, + "loss": 0.4835, + "step": 2260 + }, + { + "epoch": 0.030670103092783504, + "grad_norm": 7.525091648101807, + "learning_rate": 9.793065643415102e-06, + "loss": 0.4438, + "step": 2261 + }, + { + "epoch": 0.030683667932718393, + "grad_norm": 5.918426513671875, + "learning_rate": 9.792928600794848e-06, + "loss": 0.3747, + "step": 2262 + }, + { + "epoch": 0.030697232772653283, + "grad_norm": 6.939919948577881, + "learning_rate": 9.792791558174594e-06, + "loss": 0.4023, + "step": 2263 + }, + { + "epoch": 0.030710797612588173, + "grad_norm": 6.728094577789307, + "learning_rate": 9.792654515554338e-06, + "loss": 0.3559, + "step": 2264 + }, + { + "epoch": 0.03072436245252306, + "grad_norm": 9.488022804260254, + "learning_rate": 9.792517472934083e-06, + "loss": 0.5879, + "step": 2265 + }, + { + "epoch": 0.03073792729245795, + "grad_norm": 6.902707099914551, + "learning_rate": 9.792380430313828e-06, + "loss": 0.4363, + "step": 2266 + }, + { + "epoch": 0.030751492132392838, + "grad_norm": 12.310553550720215, + "learning_rate": 9.792243387693573e-06, + "loss": 0.5226, + "step": 2267 + }, + { + "epoch": 0.030765056972327728, + "grad_norm": 6.96220588684082, + "learning_rate": 9.792106345073319e-06, + "loss": 0.5304, + "step": 2268 + }, + { + "epoch": 0.030778621812262614, + "grad_norm": 6.4979777336120605, + "learning_rate": 9.791969302453064e-06, + "loss": 0.3763, + "step": 2269 + }, + { + "epoch": 0.030792186652197504, + "grad_norm": 9.991649627685547, + "learning_rate": 9.791832259832809e-06, + "loss": 0.4865, + "step": 2270 + }, + { + "epoch": 0.030805751492132393, + "grad_norm": 7.642621994018555, + "learning_rate": 9.791695217212554e-06, + "loss": 0.502, + "step": 2271 + }, + { + "epoch": 0.030819316332067283, + "grad_norm": 8.6034574508667, + "learning_rate": 9.7915581745923e-06, + "loss": 0.451, + "step": 2272 + }, + { + "epoch": 0.03083288117200217, + "grad_norm": 7.678318023681641, + "learning_rate": 9.791421131972044e-06, + "loss": 0.4539, + "step": 2273 + }, + { + "epoch": 0.03084644601193706, + "grad_norm": 8.544610023498535, + "learning_rate": 9.79128408935179e-06, + "loss": 0.5188, + "step": 2274 + }, + { + "epoch": 0.030860010851871948, + "grad_norm": 8.7626371383667, + "learning_rate": 9.791147046731533e-06, + "loss": 0.6641, + "step": 2275 + }, + { + "epoch": 0.030873575691806838, + "grad_norm": 8.098876953125, + "learning_rate": 9.79101000411128e-06, + "loss": 0.4229, + "step": 2276 + }, + { + "epoch": 0.030887140531741724, + "grad_norm": 9.475790023803711, + "learning_rate": 9.790872961491025e-06, + "loss": 0.6296, + "step": 2277 + }, + { + "epoch": 0.030900705371676614, + "grad_norm": 7.962979793548584, + "learning_rate": 9.79073591887077e-06, + "loss": 0.4772, + "step": 2278 + }, + { + "epoch": 0.030914270211611503, + "grad_norm": 6.497735977172852, + "learning_rate": 9.790598876250514e-06, + "loss": 0.3981, + "step": 2279 + }, + { + "epoch": 0.030927835051546393, + "grad_norm": 8.891101837158203, + "learning_rate": 9.790461833630259e-06, + "loss": 0.4326, + "step": 2280 + }, + { + "epoch": 0.03094139989148128, + "grad_norm": 7.971639633178711, + "learning_rate": 9.790324791010006e-06, + "loss": 0.4792, + "step": 2281 + }, + { + "epoch": 0.03095496473141617, + "grad_norm": 8.719447135925293, + "learning_rate": 9.79018774838975e-06, + "loss": 0.6022, + "step": 2282 + }, + { + "epoch": 0.03096852957135106, + "grad_norm": 9.954371452331543, + "learning_rate": 9.790050705769495e-06, + "loss": 0.6451, + "step": 2283 + }, + { + "epoch": 0.030982094411285948, + "grad_norm": 8.70532512664795, + "learning_rate": 9.78991366314924e-06, + "loss": 0.5648, + "step": 2284 + }, + { + "epoch": 0.030995659251220834, + "grad_norm": 9.778973579406738, + "learning_rate": 9.789776620528985e-06, + "loss": 0.6953, + "step": 2285 + }, + { + "epoch": 0.031009224091155724, + "grad_norm": 8.385767936706543, + "learning_rate": 9.78963957790873e-06, + "loss": 0.4319, + "step": 2286 + }, + { + "epoch": 0.031022788931090613, + "grad_norm": 8.359691619873047, + "learning_rate": 9.789502535288475e-06, + "loss": 0.5077, + "step": 2287 + }, + { + "epoch": 0.031036353771025503, + "grad_norm": 7.561957359313965, + "learning_rate": 9.78936549266822e-06, + "loss": 0.432, + "step": 2288 + }, + { + "epoch": 0.03104991861096039, + "grad_norm": 7.838884353637695, + "learning_rate": 9.789228450047966e-06, + "loss": 0.4496, + "step": 2289 + }, + { + "epoch": 0.03106348345089528, + "grad_norm": 9.930845260620117, + "learning_rate": 9.78909140742771e-06, + "loss": 0.4506, + "step": 2290 + }, + { + "epoch": 0.03107704829083017, + "grad_norm": 5.573061466217041, + "learning_rate": 9.788954364807456e-06, + "loss": 0.3629, + "step": 2291 + }, + { + "epoch": 0.031090613130765058, + "grad_norm": 7.117616176605225, + "learning_rate": 9.788817322187201e-06, + "loss": 0.5884, + "step": 2292 + }, + { + "epoch": 0.031104177970699944, + "grad_norm": 7.11517333984375, + "learning_rate": 9.788680279566946e-06, + "loss": 0.3436, + "step": 2293 + }, + { + "epoch": 0.031117742810634834, + "grad_norm": 8.035987854003906, + "learning_rate": 9.788543236946692e-06, + "loss": 0.534, + "step": 2294 + }, + { + "epoch": 0.031131307650569724, + "grad_norm": 9.97258472442627, + "learning_rate": 9.788406194326437e-06, + "loss": 0.6481, + "step": 2295 + }, + { + "epoch": 0.031144872490504613, + "grad_norm": 8.678667068481445, + "learning_rate": 9.788269151706182e-06, + "loss": 0.4671, + "step": 2296 + }, + { + "epoch": 0.0311584373304395, + "grad_norm": 6.579753875732422, + "learning_rate": 9.788132109085925e-06, + "loss": 0.3592, + "step": 2297 + }, + { + "epoch": 0.03117200217037439, + "grad_norm": 8.960090637207031, + "learning_rate": 9.787995066465672e-06, + "loss": 0.5226, + "step": 2298 + }, + { + "epoch": 0.03118556701030928, + "grad_norm": 8.613097190856934, + "learning_rate": 9.787858023845417e-06, + "loss": 0.6004, + "step": 2299 + }, + { + "epoch": 0.031199131850244168, + "grad_norm": 5.849684715270996, + "learning_rate": 9.787720981225161e-06, + "loss": 0.2737, + "step": 2300 + }, + { + "epoch": 0.031212696690179054, + "grad_norm": 6.570250988006592, + "learning_rate": 9.787583938604906e-06, + "loss": 0.4872, + "step": 2301 + }, + { + "epoch": 0.031226261530113944, + "grad_norm": 6.186343669891357, + "learning_rate": 9.787446895984653e-06, + "loss": 0.4117, + "step": 2302 + }, + { + "epoch": 0.031239826370048834, + "grad_norm": 7.311142921447754, + "learning_rate": 9.787309853364396e-06, + "loss": 0.3393, + "step": 2303 + }, + { + "epoch": 0.03125339120998372, + "grad_norm": 9.665497779846191, + "learning_rate": 9.787172810744142e-06, + "loss": 0.6831, + "step": 2304 + }, + { + "epoch": 0.03126695604991861, + "grad_norm": 6.325101375579834, + "learning_rate": 9.787035768123887e-06, + "loss": 0.48, + "step": 2305 + }, + { + "epoch": 0.0312805208898535, + "grad_norm": 6.751148700714111, + "learning_rate": 9.786898725503634e-06, + "loss": 0.4764, + "step": 2306 + }, + { + "epoch": 0.03129408572978839, + "grad_norm": 7.456347942352295, + "learning_rate": 9.786761682883377e-06, + "loss": 0.4793, + "step": 2307 + }, + { + "epoch": 0.031307650569723275, + "grad_norm": 6.461772441864014, + "learning_rate": 9.786624640263122e-06, + "loss": 0.5333, + "step": 2308 + }, + { + "epoch": 0.03132121540965817, + "grad_norm": 5.796838283538818, + "learning_rate": 9.786487597642868e-06, + "loss": 0.346, + "step": 2309 + }, + { + "epoch": 0.031334780249593054, + "grad_norm": 6.815439224243164, + "learning_rate": 9.786350555022613e-06, + "loss": 0.4939, + "step": 2310 + }, + { + "epoch": 0.03134834508952794, + "grad_norm": 9.490578651428223, + "learning_rate": 9.786213512402358e-06, + "loss": 0.4882, + "step": 2311 + }, + { + "epoch": 0.03136190992946283, + "grad_norm": 8.127197265625, + "learning_rate": 9.786076469782103e-06, + "loss": 0.4171, + "step": 2312 + }, + { + "epoch": 0.03137547476939772, + "grad_norm": 6.784739017486572, + "learning_rate": 9.785939427161848e-06, + "loss": 0.4659, + "step": 2313 + }, + { + "epoch": 0.03138903960933261, + "grad_norm": 6.403912544250488, + "learning_rate": 9.785802384541593e-06, + "loss": 0.3503, + "step": 2314 + }, + { + "epoch": 0.0314026044492675, + "grad_norm": 8.782697677612305, + "learning_rate": 9.785665341921339e-06, + "loss": 0.536, + "step": 2315 + }, + { + "epoch": 0.031416169289202385, + "grad_norm": 7.012528419494629, + "learning_rate": 9.785528299301084e-06, + "loss": 0.4936, + "step": 2316 + }, + { + "epoch": 0.03142973412913728, + "grad_norm": 6.008264064788818, + "learning_rate": 9.785391256680829e-06, + "loss": 0.4579, + "step": 2317 + }, + { + "epoch": 0.031443298969072164, + "grad_norm": 7.621353626251221, + "learning_rate": 9.785254214060572e-06, + "loss": 0.5388, + "step": 2318 + }, + { + "epoch": 0.03145686380900705, + "grad_norm": 7.242863655090332, + "learning_rate": 9.78511717144032e-06, + "loss": 0.5758, + "step": 2319 + }, + { + "epoch": 0.031470428648941944, + "grad_norm": 9.15571117401123, + "learning_rate": 9.784980128820065e-06, + "loss": 0.7023, + "step": 2320 + }, + { + "epoch": 0.03148399348887683, + "grad_norm": 7.427203178405762, + "learning_rate": 9.78484308619981e-06, + "loss": 0.5188, + "step": 2321 + }, + { + "epoch": 0.03149755832881172, + "grad_norm": 7.006646633148193, + "learning_rate": 9.784706043579553e-06, + "loss": 0.4611, + "step": 2322 + }, + { + "epoch": 0.03151112316874661, + "grad_norm": 5.736209392547607, + "learning_rate": 9.784569000959298e-06, + "loss": 0.4489, + "step": 2323 + }, + { + "epoch": 0.031524688008681495, + "grad_norm": 10.523894309997559, + "learning_rate": 9.784431958339045e-06, + "loss": 0.3433, + "step": 2324 + }, + { + "epoch": 0.03153825284861639, + "grad_norm": 9.484429359436035, + "learning_rate": 9.784294915718789e-06, + "loss": 0.6091, + "step": 2325 + }, + { + "epoch": 0.031551817688551274, + "grad_norm": 9.225445747375488, + "learning_rate": 9.784157873098534e-06, + "loss": 0.5688, + "step": 2326 + }, + { + "epoch": 0.03156538252848616, + "grad_norm": 5.675683975219727, + "learning_rate": 9.784020830478279e-06, + "loss": 0.2961, + "step": 2327 + }, + { + "epoch": 0.031578947368421054, + "grad_norm": 7.166435718536377, + "learning_rate": 9.783883787858024e-06, + "loss": 0.5981, + "step": 2328 + }, + { + "epoch": 0.03159251220835594, + "grad_norm": 7.330021858215332, + "learning_rate": 9.78374674523777e-06, + "loss": 0.506, + "step": 2329 + }, + { + "epoch": 0.03160607704829083, + "grad_norm": 8.08161735534668, + "learning_rate": 9.783609702617515e-06, + "loss": 0.5974, + "step": 2330 + }, + { + "epoch": 0.03161964188822572, + "grad_norm": 7.20418643951416, + "learning_rate": 9.78347265999726e-06, + "loss": 0.4709, + "step": 2331 + }, + { + "epoch": 0.031633206728160605, + "grad_norm": 8.695576667785645, + "learning_rate": 9.783335617377005e-06, + "loss": 0.7041, + "step": 2332 + }, + { + "epoch": 0.0316467715680955, + "grad_norm": 10.445798873901367, + "learning_rate": 9.78319857475675e-06, + "loss": 0.5275, + "step": 2333 + }, + { + "epoch": 0.031660336408030385, + "grad_norm": 11.649385452270508, + "learning_rate": 9.783061532136495e-06, + "loss": 0.7355, + "step": 2334 + }, + { + "epoch": 0.03167390124796527, + "grad_norm": 10.855868339538574, + "learning_rate": 9.78292448951624e-06, + "loss": 0.6875, + "step": 2335 + }, + { + "epoch": 0.031687466087900164, + "grad_norm": 8.194964408874512, + "learning_rate": 9.782787446895986e-06, + "loss": 0.4483, + "step": 2336 + }, + { + "epoch": 0.03170103092783505, + "grad_norm": 11.294795036315918, + "learning_rate": 9.782650404275731e-06, + "loss": 0.8189, + "step": 2337 + }, + { + "epoch": 0.03171459576776994, + "grad_norm": 8.887742042541504, + "learning_rate": 9.782513361655476e-06, + "loss": 0.5172, + "step": 2338 + }, + { + "epoch": 0.03172816060770483, + "grad_norm": 10.717185974121094, + "learning_rate": 9.782376319035221e-06, + "loss": 0.8297, + "step": 2339 + }, + { + "epoch": 0.031741725447639715, + "grad_norm": 7.967704772949219, + "learning_rate": 9.782239276414965e-06, + "loss": 0.5237, + "step": 2340 + }, + { + "epoch": 0.03175529028757461, + "grad_norm": 9.60954475402832, + "learning_rate": 9.782102233794712e-06, + "loss": 0.705, + "step": 2341 + }, + { + "epoch": 0.031768855127509495, + "grad_norm": 10.327293395996094, + "learning_rate": 9.781965191174457e-06, + "loss": 0.5558, + "step": 2342 + }, + { + "epoch": 0.03178241996744438, + "grad_norm": 7.521337032318115, + "learning_rate": 9.7818281485542e-06, + "loss": 0.5223, + "step": 2343 + }, + { + "epoch": 0.031795984807379274, + "grad_norm": 7.00119161605835, + "learning_rate": 9.781691105933945e-06, + "loss": 0.427, + "step": 2344 + }, + { + "epoch": 0.03180954964731416, + "grad_norm": 9.523577690124512, + "learning_rate": 9.781554063313692e-06, + "loss": 0.5472, + "step": 2345 + }, + { + "epoch": 0.03182311448724905, + "grad_norm": 11.243821144104004, + "learning_rate": 9.781417020693437e-06, + "loss": 0.8266, + "step": 2346 + }, + { + "epoch": 0.03183667932718394, + "grad_norm": 8.912627220153809, + "learning_rate": 9.781279978073181e-06, + "loss": 0.5904, + "step": 2347 + }, + { + "epoch": 0.031850244167118826, + "grad_norm": 7.393073558807373, + "learning_rate": 9.781142935452926e-06, + "loss": 0.4059, + "step": 2348 + }, + { + "epoch": 0.03186380900705372, + "grad_norm": 7.713298320770264, + "learning_rate": 9.781005892832671e-06, + "loss": 0.3448, + "step": 2349 + }, + { + "epoch": 0.031877373846988605, + "grad_norm": 8.562539100646973, + "learning_rate": 9.780868850212416e-06, + "loss": 0.5605, + "step": 2350 + }, + { + "epoch": 0.03189093868692349, + "grad_norm": 8.060558319091797, + "learning_rate": 9.780731807592162e-06, + "loss": 0.515, + "step": 2351 + }, + { + "epoch": 0.031904503526858384, + "grad_norm": 7.068313121795654, + "learning_rate": 9.780594764971907e-06, + "loss": 0.4106, + "step": 2352 + }, + { + "epoch": 0.03191806836679327, + "grad_norm": 7.237218379974365, + "learning_rate": 9.780457722351652e-06, + "loss": 0.4465, + "step": 2353 + }, + { + "epoch": 0.03193163320672816, + "grad_norm": 8.877727508544922, + "learning_rate": 9.780320679731397e-06, + "loss": 0.6306, + "step": 2354 + }, + { + "epoch": 0.03194519804666305, + "grad_norm": 9.270525932312012, + "learning_rate": 9.780183637111142e-06, + "loss": 0.5204, + "step": 2355 + }, + { + "epoch": 0.031958762886597936, + "grad_norm": 9.77447509765625, + "learning_rate": 9.780046594490888e-06, + "loss": 0.4592, + "step": 2356 + }, + { + "epoch": 0.03197232772653283, + "grad_norm": 8.07950496673584, + "learning_rate": 9.779909551870633e-06, + "loss": 0.5913, + "step": 2357 + }, + { + "epoch": 0.031985892566467715, + "grad_norm": 8.10224723815918, + "learning_rate": 9.779772509250378e-06, + "loss": 0.5171, + "step": 2358 + }, + { + "epoch": 0.0319994574064026, + "grad_norm": 10.335223197937012, + "learning_rate": 9.779635466630123e-06, + "loss": 0.4647, + "step": 2359 + }, + { + "epoch": 0.032013022246337494, + "grad_norm": 9.502020835876465, + "learning_rate": 9.779498424009868e-06, + "loss": 0.5912, + "step": 2360 + }, + { + "epoch": 0.03202658708627238, + "grad_norm": 11.283576965332031, + "learning_rate": 9.779361381389613e-06, + "loss": 0.6365, + "step": 2361 + }, + { + "epoch": 0.03204015192620727, + "grad_norm": 9.328064918518066, + "learning_rate": 9.779224338769359e-06, + "loss": 0.5424, + "step": 2362 + }, + { + "epoch": 0.03205371676614216, + "grad_norm": 7.530999183654785, + "learning_rate": 9.779087296149104e-06, + "loss": 0.4663, + "step": 2363 + }, + { + "epoch": 0.032067281606077046, + "grad_norm": 9.910932540893555, + "learning_rate": 9.778950253528849e-06, + "loss": 0.6107, + "step": 2364 + }, + { + "epoch": 0.03208084644601194, + "grad_norm": 9.952437400817871, + "learning_rate": 9.778813210908592e-06, + "loss": 0.6552, + "step": 2365 + }, + { + "epoch": 0.032094411285946825, + "grad_norm": 16.14950942993164, + "learning_rate": 9.778676168288338e-06, + "loss": 0.8422, + "step": 2366 + }, + { + "epoch": 0.03210797612588171, + "grad_norm": 10.584371566772461, + "learning_rate": 9.778539125668085e-06, + "loss": 0.5551, + "step": 2367 + }, + { + "epoch": 0.032121540965816604, + "grad_norm": 7.612026691436768, + "learning_rate": 9.778402083047828e-06, + "loss": 0.4136, + "step": 2368 + }, + { + "epoch": 0.03213510580575149, + "grad_norm": 7.9320502281188965, + "learning_rate": 9.778265040427573e-06, + "loss": 0.3803, + "step": 2369 + }, + { + "epoch": 0.032148670645686384, + "grad_norm": 9.881982803344727, + "learning_rate": 9.778127997807318e-06, + "loss": 0.6045, + "step": 2370 + }, + { + "epoch": 0.03216223548562127, + "grad_norm": 11.63511848449707, + "learning_rate": 9.777990955187065e-06, + "loss": 0.5567, + "step": 2371 + }, + { + "epoch": 0.032175800325556156, + "grad_norm": 6.592850685119629, + "learning_rate": 9.777853912566809e-06, + "loss": 0.4316, + "step": 2372 + }, + { + "epoch": 0.03218936516549105, + "grad_norm": 7.494507312774658, + "learning_rate": 9.777716869946554e-06, + "loss": 0.4246, + "step": 2373 + }, + { + "epoch": 0.032202930005425935, + "grad_norm": 11.754484176635742, + "learning_rate": 9.777579827326299e-06, + "loss": 0.8605, + "step": 2374 + }, + { + "epoch": 0.03221649484536082, + "grad_norm": 8.185733795166016, + "learning_rate": 9.777442784706044e-06, + "loss": 0.5222, + "step": 2375 + }, + { + "epoch": 0.032230059685295714, + "grad_norm": 7.143501281738281, + "learning_rate": 9.77730574208579e-06, + "loss": 0.487, + "step": 2376 + }, + { + "epoch": 0.0322436245252306, + "grad_norm": 9.401711463928223, + "learning_rate": 9.777168699465535e-06, + "loss": 0.524, + "step": 2377 + }, + { + "epoch": 0.032257189365165494, + "grad_norm": 7.226908206939697, + "learning_rate": 9.77703165684528e-06, + "loss": 0.4596, + "step": 2378 + }, + { + "epoch": 0.03227075420510038, + "grad_norm": 9.148351669311523, + "learning_rate": 9.776894614225025e-06, + "loss": 0.5954, + "step": 2379 + }, + { + "epoch": 0.032284319045035266, + "grad_norm": 9.038419723510742, + "learning_rate": 9.77675757160477e-06, + "loss": 0.4128, + "step": 2380 + }, + { + "epoch": 0.03229788388497016, + "grad_norm": 9.454216957092285, + "learning_rate": 9.776620528984515e-06, + "loss": 0.4678, + "step": 2381 + }, + { + "epoch": 0.032311448724905045, + "grad_norm": 6.976375579833984, + "learning_rate": 9.77648348636426e-06, + "loss": 0.3507, + "step": 2382 + }, + { + "epoch": 0.03232501356483993, + "grad_norm": 7.032500743865967, + "learning_rate": 9.776346443744004e-06, + "loss": 0.368, + "step": 2383 + }, + { + "epoch": 0.032338578404774825, + "grad_norm": 8.405557632446289, + "learning_rate": 9.776209401123751e-06, + "loss": 0.4945, + "step": 2384 + }, + { + "epoch": 0.03235214324470971, + "grad_norm": 9.153585433959961, + "learning_rate": 9.776072358503496e-06, + "loss": 0.5377, + "step": 2385 + }, + { + "epoch": 0.032365708084644604, + "grad_norm": 5.970622539520264, + "learning_rate": 9.775935315883241e-06, + "loss": 0.4277, + "step": 2386 + }, + { + "epoch": 0.03237927292457949, + "grad_norm": 8.095532417297363, + "learning_rate": 9.775798273262985e-06, + "loss": 0.3762, + "step": 2387 + }, + { + "epoch": 0.032392837764514376, + "grad_norm": 5.655706882476807, + "learning_rate": 9.775661230642732e-06, + "loss": 0.3687, + "step": 2388 + }, + { + "epoch": 0.03240640260444927, + "grad_norm": 6.286671161651611, + "learning_rate": 9.775524188022477e-06, + "loss": 0.3654, + "step": 2389 + }, + { + "epoch": 0.032419967444384155, + "grad_norm": 8.802115440368652, + "learning_rate": 9.77538714540222e-06, + "loss": 0.4843, + "step": 2390 + }, + { + "epoch": 0.03243353228431905, + "grad_norm": 6.936021327972412, + "learning_rate": 9.775250102781965e-06, + "loss": 0.3918, + "step": 2391 + }, + { + "epoch": 0.032447097124253935, + "grad_norm": 7.44364595413208, + "learning_rate": 9.77511306016171e-06, + "loss": 0.4782, + "step": 2392 + }, + { + "epoch": 0.03246066196418882, + "grad_norm": 7.128329277038574, + "learning_rate": 9.774976017541456e-06, + "loss": 0.4374, + "step": 2393 + }, + { + "epoch": 0.032474226804123714, + "grad_norm": 10.424921989440918, + "learning_rate": 9.774838974921201e-06, + "loss": 0.7193, + "step": 2394 + }, + { + "epoch": 0.0324877916440586, + "grad_norm": 7.752346515655518, + "learning_rate": 9.774701932300946e-06, + "loss": 0.4007, + "step": 2395 + }, + { + "epoch": 0.032501356483993486, + "grad_norm": 6.869448184967041, + "learning_rate": 9.774564889680691e-06, + "loss": 0.6022, + "step": 2396 + }, + { + "epoch": 0.03251492132392838, + "grad_norm": 6.665087699890137, + "learning_rate": 9.774427847060437e-06, + "loss": 0.4079, + "step": 2397 + }, + { + "epoch": 0.032528486163863266, + "grad_norm": 6.493564128875732, + "learning_rate": 9.774290804440182e-06, + "loss": 0.3501, + "step": 2398 + }, + { + "epoch": 0.03254205100379816, + "grad_norm": 8.1746826171875, + "learning_rate": 9.774153761819927e-06, + "loss": 0.4657, + "step": 2399 + }, + { + "epoch": 0.032555615843733045, + "grad_norm": 8.611894607543945, + "learning_rate": 9.774016719199672e-06, + "loss": 0.431, + "step": 2400 + }, + { + "epoch": 0.03256918068366793, + "grad_norm": 9.883655548095703, + "learning_rate": 9.773879676579417e-06, + "loss": 0.5439, + "step": 2401 + }, + { + "epoch": 0.032582745523602824, + "grad_norm": 8.296216011047363, + "learning_rate": 9.773742633959162e-06, + "loss": 0.4247, + "step": 2402 + }, + { + "epoch": 0.03259631036353771, + "grad_norm": 5.799163341522217, + "learning_rate": 9.773605591338908e-06, + "loss": 0.3812, + "step": 2403 + }, + { + "epoch": 0.032609875203472596, + "grad_norm": 8.742427825927734, + "learning_rate": 9.773468548718653e-06, + "loss": 0.5724, + "step": 2404 + }, + { + "epoch": 0.03262344004340749, + "grad_norm": 14.182036399841309, + "learning_rate": 9.773331506098396e-06, + "loss": 0.6296, + "step": 2405 + }, + { + "epoch": 0.032637004883342376, + "grad_norm": 8.35710620880127, + "learning_rate": 9.773194463478143e-06, + "loss": 0.4689, + "step": 2406 + }, + { + "epoch": 0.03265056972327727, + "grad_norm": 9.080789566040039, + "learning_rate": 9.773057420857888e-06, + "loss": 0.5956, + "step": 2407 + }, + { + "epoch": 0.032664134563212155, + "grad_norm": 8.605250358581543, + "learning_rate": 9.772920378237632e-06, + "loss": 0.542, + "step": 2408 + }, + { + "epoch": 0.03267769940314704, + "grad_norm": 8.098877906799316, + "learning_rate": 9.772783335617377e-06, + "loss": 0.5269, + "step": 2409 + }, + { + "epoch": 0.032691264243081934, + "grad_norm": 7.956370830535889, + "learning_rate": 9.772646292997124e-06, + "loss": 0.4327, + "step": 2410 + }, + { + "epoch": 0.03270482908301682, + "grad_norm": 11.289016723632812, + "learning_rate": 9.772509250376867e-06, + "loss": 0.657, + "step": 2411 + }, + { + "epoch": 0.032718393922951707, + "grad_norm": 12.028441429138184, + "learning_rate": 9.772372207756612e-06, + "loss": 0.6589, + "step": 2412 + }, + { + "epoch": 0.0327319587628866, + "grad_norm": 9.614410400390625, + "learning_rate": 9.772235165136358e-06, + "loss": 0.5971, + "step": 2413 + }, + { + "epoch": 0.032745523602821486, + "grad_norm": 9.182235717773438, + "learning_rate": 9.772098122516105e-06, + "loss": 0.5174, + "step": 2414 + }, + { + "epoch": 0.03275908844275638, + "grad_norm": 7.726785182952881, + "learning_rate": 9.771961079895848e-06, + "loss": 0.5691, + "step": 2415 + }, + { + "epoch": 0.032772653282691265, + "grad_norm": 9.053804397583008, + "learning_rate": 9.771824037275593e-06, + "loss": 0.5847, + "step": 2416 + }, + { + "epoch": 0.03278621812262615, + "grad_norm": 7.800378799438477, + "learning_rate": 9.771686994655338e-06, + "loss": 0.5259, + "step": 2417 + }, + { + "epoch": 0.032799782962561044, + "grad_norm": 6.198645114898682, + "learning_rate": 9.771549952035084e-06, + "loss": 0.4116, + "step": 2418 + }, + { + "epoch": 0.03281334780249593, + "grad_norm": 8.499496459960938, + "learning_rate": 9.771412909414829e-06, + "loss": 0.3744, + "step": 2419 + }, + { + "epoch": 0.03282691264243082, + "grad_norm": 11.547374725341797, + "learning_rate": 9.771275866794574e-06, + "loss": 0.6241, + "step": 2420 + }, + { + "epoch": 0.03284047748236571, + "grad_norm": 10.73283576965332, + "learning_rate": 9.771138824174319e-06, + "loss": 0.5213, + "step": 2421 + }, + { + "epoch": 0.032854042322300596, + "grad_norm": 10.194573402404785, + "learning_rate": 9.771001781554064e-06, + "loss": 0.5759, + "step": 2422 + }, + { + "epoch": 0.03286760716223549, + "grad_norm": 8.34716510772705, + "learning_rate": 9.77086473893381e-06, + "loss": 0.5005, + "step": 2423 + }, + { + "epoch": 0.032881172002170375, + "grad_norm": 7.640896320343018, + "learning_rate": 9.770727696313555e-06, + "loss": 0.5716, + "step": 2424 + }, + { + "epoch": 0.03289473684210526, + "grad_norm": 9.112255096435547, + "learning_rate": 9.7705906536933e-06, + "loss": 0.5085, + "step": 2425 + }, + { + "epoch": 0.032908301682040154, + "grad_norm": 8.637978553771973, + "learning_rate": 9.770453611073043e-06, + "loss": 0.5314, + "step": 2426 + }, + { + "epoch": 0.03292186652197504, + "grad_norm": 6.902368545532227, + "learning_rate": 9.77031656845279e-06, + "loss": 0.5588, + "step": 2427 + }, + { + "epoch": 0.03293543136190993, + "grad_norm": 9.568315505981445, + "learning_rate": 9.770179525832535e-06, + "loss": 0.513, + "step": 2428 + }, + { + "epoch": 0.03294899620184482, + "grad_norm": 8.336440086364746, + "learning_rate": 9.77004248321228e-06, + "loss": 0.4562, + "step": 2429 + }, + { + "epoch": 0.032962561041779706, + "grad_norm": 10.230088233947754, + "learning_rate": 9.769905440592024e-06, + "loss": 0.5673, + "step": 2430 + }, + { + "epoch": 0.0329761258817146, + "grad_norm": 10.522976875305176, + "learning_rate": 9.769768397971771e-06, + "loss": 0.5139, + "step": 2431 + }, + { + "epoch": 0.032989690721649485, + "grad_norm": 6.3357672691345215, + "learning_rate": 9.769631355351516e-06, + "loss": 0.3964, + "step": 2432 + }, + { + "epoch": 0.03300325556158437, + "grad_norm": 7.903251647949219, + "learning_rate": 9.76949431273126e-06, + "loss": 0.3596, + "step": 2433 + }, + { + "epoch": 0.033016820401519265, + "grad_norm": 10.887288093566895, + "learning_rate": 9.769357270111005e-06, + "loss": 0.7458, + "step": 2434 + }, + { + "epoch": 0.03303038524145415, + "grad_norm": 10.064069747924805, + "learning_rate": 9.76922022749075e-06, + "loss": 0.6323, + "step": 2435 + }, + { + "epoch": 0.03304395008138904, + "grad_norm": 7.667444705963135, + "learning_rate": 9.769083184870495e-06, + "loss": 0.6708, + "step": 2436 + }, + { + "epoch": 0.03305751492132393, + "grad_norm": 9.126471519470215, + "learning_rate": 9.76894614225024e-06, + "loss": 0.6835, + "step": 2437 + }, + { + "epoch": 0.033071079761258816, + "grad_norm": 10.632474899291992, + "learning_rate": 9.768809099629985e-06, + "loss": 0.6129, + "step": 2438 + }, + { + "epoch": 0.03308464460119371, + "grad_norm": 8.896145820617676, + "learning_rate": 9.76867205700973e-06, + "loss": 0.542, + "step": 2439 + }, + { + "epoch": 0.033098209441128595, + "grad_norm": 7.0670905113220215, + "learning_rate": 9.768535014389476e-06, + "loss": 0.4013, + "step": 2440 + }, + { + "epoch": 0.03311177428106348, + "grad_norm": 8.080574989318848, + "learning_rate": 9.768397971769221e-06, + "loss": 0.5969, + "step": 2441 + }, + { + "epoch": 0.033125339120998375, + "grad_norm": 6.285196781158447, + "learning_rate": 9.768260929148966e-06, + "loss": 0.3977, + "step": 2442 + }, + { + "epoch": 0.03313890396093326, + "grad_norm": 10.065744400024414, + "learning_rate": 9.768123886528711e-06, + "loss": 0.5623, + "step": 2443 + }, + { + "epoch": 0.03315246880086815, + "grad_norm": 8.418161392211914, + "learning_rate": 9.767986843908457e-06, + "loss": 0.6055, + "step": 2444 + }, + { + "epoch": 0.03316603364080304, + "grad_norm": 9.566819190979004, + "learning_rate": 9.767849801288202e-06, + "loss": 0.669, + "step": 2445 + }, + { + "epoch": 0.033179598480737926, + "grad_norm": 11.408814430236816, + "learning_rate": 9.767712758667947e-06, + "loss": 0.6988, + "step": 2446 + }, + { + "epoch": 0.03319316332067282, + "grad_norm": 8.180804252624512, + "learning_rate": 9.767575716047692e-06, + "loss": 0.4847, + "step": 2447 + }, + { + "epoch": 0.033206728160607706, + "grad_norm": 9.676761627197266, + "learning_rate": 9.767438673427436e-06, + "loss": 0.5613, + "step": 2448 + }, + { + "epoch": 0.03322029300054259, + "grad_norm": 8.572661399841309, + "learning_rate": 9.767301630807182e-06, + "loss": 0.4633, + "step": 2449 + }, + { + "epoch": 0.033233857840477485, + "grad_norm": 7.770209312438965, + "learning_rate": 9.767164588186928e-06, + "loss": 0.4185, + "step": 2450 + }, + { + "epoch": 0.03324742268041237, + "grad_norm": 8.84084415435791, + "learning_rate": 9.767027545566671e-06, + "loss": 0.4005, + "step": 2451 + }, + { + "epoch": 0.03326098752034726, + "grad_norm": 9.046789169311523, + "learning_rate": 9.766890502946416e-06, + "loss": 0.6354, + "step": 2452 + }, + { + "epoch": 0.03327455236028215, + "grad_norm": 6.774611949920654, + "learning_rate": 9.766753460326163e-06, + "loss": 0.4, + "step": 2453 + }, + { + "epoch": 0.033288117200217036, + "grad_norm": 7.675518035888672, + "learning_rate": 9.766616417705908e-06, + "loss": 0.4439, + "step": 2454 + }, + { + "epoch": 0.03330168204015193, + "grad_norm": 6.657246112823486, + "learning_rate": 9.766479375085652e-06, + "loss": 0.2784, + "step": 2455 + }, + { + "epoch": 0.033315246880086816, + "grad_norm": 9.03258991241455, + "learning_rate": 9.766342332465397e-06, + "loss": 0.4827, + "step": 2456 + }, + { + "epoch": 0.0333288117200217, + "grad_norm": 5.894471168518066, + "learning_rate": 9.766205289845144e-06, + "loss": 0.4128, + "step": 2457 + }, + { + "epoch": 0.033342376559956595, + "grad_norm": 7.539176940917969, + "learning_rate": 9.766068247224887e-06, + "loss": 0.5928, + "step": 2458 + }, + { + "epoch": 0.03335594139989148, + "grad_norm": 8.155841827392578, + "learning_rate": 9.765931204604633e-06, + "loss": 0.6345, + "step": 2459 + }, + { + "epoch": 0.03336950623982637, + "grad_norm": 10.992836952209473, + "learning_rate": 9.765794161984378e-06, + "loss": 0.711, + "step": 2460 + }, + { + "epoch": 0.03338307107976126, + "grad_norm": 6.127984046936035, + "learning_rate": 9.765657119364123e-06, + "loss": 0.4092, + "step": 2461 + }, + { + "epoch": 0.03339663591969615, + "grad_norm": 7.339347839355469, + "learning_rate": 9.765520076743868e-06, + "loss": 0.3954, + "step": 2462 + }, + { + "epoch": 0.03341020075963104, + "grad_norm": 10.095440864562988, + "learning_rate": 9.765383034123613e-06, + "loss": 0.6748, + "step": 2463 + }, + { + "epoch": 0.033423765599565926, + "grad_norm": 6.563452243804932, + "learning_rate": 9.765245991503358e-06, + "loss": 0.4238, + "step": 2464 + }, + { + "epoch": 0.03343733043950081, + "grad_norm": 8.337088584899902, + "learning_rate": 9.765108948883104e-06, + "loss": 0.4924, + "step": 2465 + }, + { + "epoch": 0.033450895279435705, + "grad_norm": 7.528881072998047, + "learning_rate": 9.764971906262849e-06, + "loss": 0.3576, + "step": 2466 + }, + { + "epoch": 0.03346446011937059, + "grad_norm": 6.459578037261963, + "learning_rate": 9.764834863642594e-06, + "loss": 0.4604, + "step": 2467 + }, + { + "epoch": 0.03347802495930548, + "grad_norm": 8.550874710083008, + "learning_rate": 9.764697821022339e-06, + "loss": 0.3617, + "step": 2468 + }, + { + "epoch": 0.03349158979924037, + "grad_norm": 8.99507999420166, + "learning_rate": 9.764560778402084e-06, + "loss": 0.4661, + "step": 2469 + }, + { + "epoch": 0.03350515463917526, + "grad_norm": 7.081814289093018, + "learning_rate": 9.76442373578183e-06, + "loss": 0.3885, + "step": 2470 + }, + { + "epoch": 0.03351871947911015, + "grad_norm": 6.856464862823486, + "learning_rate": 9.764286693161575e-06, + "loss": 0.3359, + "step": 2471 + }, + { + "epoch": 0.033532284319045036, + "grad_norm": 7.111790657043457, + "learning_rate": 9.76414965054132e-06, + "loss": 0.3655, + "step": 2472 + }, + { + "epoch": 0.03354584915897992, + "grad_norm": 5.905405044555664, + "learning_rate": 9.764012607921063e-06, + "loss": 0.3725, + "step": 2473 + }, + { + "epoch": 0.033559413998914815, + "grad_norm": 7.881541728973389, + "learning_rate": 9.763875565300809e-06, + "loss": 0.3772, + "step": 2474 + }, + { + "epoch": 0.0335729788388497, + "grad_norm": 5.058144569396973, + "learning_rate": 9.763738522680555e-06, + "loss": 0.4154, + "step": 2475 + }, + { + "epoch": 0.03358654367878459, + "grad_norm": 6.230430603027344, + "learning_rate": 9.763601480060299e-06, + "loss": 0.4082, + "step": 2476 + }, + { + "epoch": 0.03360010851871948, + "grad_norm": 6.842184066772461, + "learning_rate": 9.763464437440044e-06, + "loss": 0.4016, + "step": 2477 + }, + { + "epoch": 0.03361367335865437, + "grad_norm": 7.634958744049072, + "learning_rate": 9.76332739481979e-06, + "loss": 0.5273, + "step": 2478 + }, + { + "epoch": 0.03362723819858926, + "grad_norm": 7.516995429992676, + "learning_rate": 9.763190352199536e-06, + "loss": 0.4233, + "step": 2479 + }, + { + "epoch": 0.033640803038524146, + "grad_norm": 8.077923774719238, + "learning_rate": 9.76305330957928e-06, + "loss": 0.3795, + "step": 2480 + }, + { + "epoch": 0.03365436787845903, + "grad_norm": 8.28658390045166, + "learning_rate": 9.762916266959025e-06, + "loss": 0.4593, + "step": 2481 + }, + { + "epoch": 0.033667932718393925, + "grad_norm": 6.624457359313965, + "learning_rate": 9.76277922433877e-06, + "loss": 0.3079, + "step": 2482 + }, + { + "epoch": 0.03368149755832881, + "grad_norm": 7.0356974601745605, + "learning_rate": 9.762642181718515e-06, + "loss": 0.517, + "step": 2483 + }, + { + "epoch": 0.0336950623982637, + "grad_norm": 6.732090473175049, + "learning_rate": 9.76250513909826e-06, + "loss": 0.4139, + "step": 2484 + }, + { + "epoch": 0.03370862723819859, + "grad_norm": 7.495790481567383, + "learning_rate": 9.762368096478005e-06, + "loss": 0.3672, + "step": 2485 + }, + { + "epoch": 0.03372219207813348, + "grad_norm": 5.692791938781738, + "learning_rate": 9.76223105385775e-06, + "loss": 0.3505, + "step": 2486 + }, + { + "epoch": 0.03373575691806837, + "grad_norm": 6.171116352081299, + "learning_rate": 9.762094011237496e-06, + "loss": 0.3512, + "step": 2487 + }, + { + "epoch": 0.033749321758003256, + "grad_norm": 7.305147647857666, + "learning_rate": 9.761956968617241e-06, + "loss": 0.4651, + "step": 2488 + }, + { + "epoch": 0.03376288659793814, + "grad_norm": 6.791421890258789, + "learning_rate": 9.761819925996986e-06, + "loss": 0.358, + "step": 2489 + }, + { + "epoch": 0.033776451437873035, + "grad_norm": 7.483231544494629, + "learning_rate": 9.761682883376731e-06, + "loss": 0.5161, + "step": 2490 + }, + { + "epoch": 0.03379001627780792, + "grad_norm": 7.795148849487305, + "learning_rate": 9.761545840756475e-06, + "loss": 0.5138, + "step": 2491 + }, + { + "epoch": 0.03380358111774281, + "grad_norm": 9.480402946472168, + "learning_rate": 9.761408798136222e-06, + "loss": 0.4957, + "step": 2492 + }, + { + "epoch": 0.0338171459576777, + "grad_norm": 6.821193218231201, + "learning_rate": 9.761271755515967e-06, + "loss": 0.4319, + "step": 2493 + }, + { + "epoch": 0.03383071079761259, + "grad_norm": 6.973264694213867, + "learning_rate": 9.761134712895712e-06, + "loss": 0.4318, + "step": 2494 + }, + { + "epoch": 0.03384427563754748, + "grad_norm": 7.34822416305542, + "learning_rate": 9.760997670275456e-06, + "loss": 0.3602, + "step": 2495 + }, + { + "epoch": 0.033857840477482366, + "grad_norm": 8.204038619995117, + "learning_rate": 9.760860627655202e-06, + "loss": 0.57, + "step": 2496 + }, + { + "epoch": 0.03387140531741725, + "grad_norm": 6.061381816864014, + "learning_rate": 9.760723585034948e-06, + "loss": 0.3255, + "step": 2497 + }, + { + "epoch": 0.033884970157352146, + "grad_norm": 6.292692184448242, + "learning_rate": 9.760586542414691e-06, + "loss": 0.2811, + "step": 2498 + }, + { + "epoch": 0.03389853499728703, + "grad_norm": 6.235581398010254, + "learning_rate": 9.760449499794436e-06, + "loss": 0.5158, + "step": 2499 + }, + { + "epoch": 0.03391209983722192, + "grad_norm": 4.760245323181152, + "learning_rate": 9.760312457174183e-06, + "loss": 0.263, + "step": 2500 + }, + { + "epoch": 0.03392566467715681, + "grad_norm": 8.374466896057129, + "learning_rate": 9.760175414553927e-06, + "loss": 0.6415, + "step": 2501 + }, + { + "epoch": 0.0339392295170917, + "grad_norm": 5.515857219696045, + "learning_rate": 9.760038371933672e-06, + "loss": 0.3617, + "step": 2502 + }, + { + "epoch": 0.03395279435702659, + "grad_norm": 6.727289199829102, + "learning_rate": 9.759901329313417e-06, + "loss": 0.2757, + "step": 2503 + }, + { + "epoch": 0.033966359196961476, + "grad_norm": 8.423382759094238, + "learning_rate": 9.759764286693162e-06, + "loss": 0.4386, + "step": 2504 + }, + { + "epoch": 0.03397992403689636, + "grad_norm": 6.146212100982666, + "learning_rate": 9.759627244072907e-06, + "loss": 0.3204, + "step": 2505 + }, + { + "epoch": 0.033993488876831256, + "grad_norm": 5.876919269561768, + "learning_rate": 9.759490201452653e-06, + "loss": 0.374, + "step": 2506 + }, + { + "epoch": 0.03400705371676614, + "grad_norm": 5.089951515197754, + "learning_rate": 9.759353158832398e-06, + "loss": 0.3859, + "step": 2507 + }, + { + "epoch": 0.03402061855670103, + "grad_norm": 4.904308319091797, + "learning_rate": 9.759216116212143e-06, + "loss": 0.2447, + "step": 2508 + }, + { + "epoch": 0.03403418339663592, + "grad_norm": 5.4948577880859375, + "learning_rate": 9.759079073591888e-06, + "loss": 0.3394, + "step": 2509 + }, + { + "epoch": 0.03404774823657081, + "grad_norm": 7.540782928466797, + "learning_rate": 9.758942030971633e-06, + "loss": 0.4227, + "step": 2510 + }, + { + "epoch": 0.0340613130765057, + "grad_norm": 8.216975212097168, + "learning_rate": 9.758804988351378e-06, + "loss": 0.435, + "step": 2511 + }, + { + "epoch": 0.03407487791644059, + "grad_norm": 7.674594879150391, + "learning_rate": 9.758667945731124e-06, + "loss": 0.4102, + "step": 2512 + }, + { + "epoch": 0.03408844275637547, + "grad_norm": 6.146790504455566, + "learning_rate": 9.758530903110869e-06, + "loss": 0.3509, + "step": 2513 + }, + { + "epoch": 0.034102007596310366, + "grad_norm": 6.216618061065674, + "learning_rate": 9.758393860490614e-06, + "loss": 0.3734, + "step": 2514 + }, + { + "epoch": 0.03411557243624525, + "grad_norm": 6.018468856811523, + "learning_rate": 9.758256817870359e-06, + "loss": 0.5009, + "step": 2515 + }, + { + "epoch": 0.03412913727618014, + "grad_norm": 6.432427406311035, + "learning_rate": 9.758119775250103e-06, + "loss": 0.3304, + "step": 2516 + }, + { + "epoch": 0.03414270211611503, + "grad_norm": 8.054125785827637, + "learning_rate": 9.757982732629848e-06, + "loss": 0.5894, + "step": 2517 + }, + { + "epoch": 0.03415626695604992, + "grad_norm": 8.152438163757324, + "learning_rate": 9.757845690009595e-06, + "loss": 0.5078, + "step": 2518 + }, + { + "epoch": 0.03416983179598481, + "grad_norm": 9.834712982177734, + "learning_rate": 9.757708647389338e-06, + "loss": 0.4497, + "step": 2519 + }, + { + "epoch": 0.0341833966359197, + "grad_norm": 6.796841621398926, + "learning_rate": 9.757571604769083e-06, + "loss": 0.388, + "step": 2520 + }, + { + "epoch": 0.03419696147585458, + "grad_norm": 8.281469345092773, + "learning_rate": 9.757434562148829e-06, + "loss": 0.3962, + "step": 2521 + }, + { + "epoch": 0.034210526315789476, + "grad_norm": 5.8268208503723145, + "learning_rate": 9.757297519528575e-06, + "loss": 0.3651, + "step": 2522 + }, + { + "epoch": 0.03422409115572436, + "grad_norm": 7.512585163116455, + "learning_rate": 9.757160476908319e-06, + "loss": 0.5323, + "step": 2523 + }, + { + "epoch": 0.03423765599565925, + "grad_norm": 4.740962028503418, + "learning_rate": 9.757023434288064e-06, + "loss": 0.2255, + "step": 2524 + }, + { + "epoch": 0.03425122083559414, + "grad_norm": 5.78472375869751, + "learning_rate": 9.75688639166781e-06, + "loss": 0.3711, + "step": 2525 + }, + { + "epoch": 0.03426478567552903, + "grad_norm": 6.871275901794434, + "learning_rate": 9.756749349047554e-06, + "loss": 0.4481, + "step": 2526 + }, + { + "epoch": 0.03427835051546392, + "grad_norm": 8.590721130371094, + "learning_rate": 9.7566123064273e-06, + "loss": 0.4368, + "step": 2527 + }, + { + "epoch": 0.03429191535539881, + "grad_norm": 6.370484828948975, + "learning_rate": 9.756475263807045e-06, + "loss": 0.4257, + "step": 2528 + }, + { + "epoch": 0.03430548019533369, + "grad_norm": 7.910223484039307, + "learning_rate": 9.75633822118679e-06, + "loss": 0.4623, + "step": 2529 + }, + { + "epoch": 0.034319045035268586, + "grad_norm": 8.094839096069336, + "learning_rate": 9.756201178566535e-06, + "loss": 0.4073, + "step": 2530 + }, + { + "epoch": 0.03433260987520347, + "grad_norm": 5.5130791664123535, + "learning_rate": 9.75606413594628e-06, + "loss": 0.2861, + "step": 2531 + }, + { + "epoch": 0.03434617471513836, + "grad_norm": 10.25592041015625, + "learning_rate": 9.755927093326025e-06, + "loss": 0.4268, + "step": 2532 + }, + { + "epoch": 0.03435973955507325, + "grad_norm": 7.585336685180664, + "learning_rate": 9.75579005070577e-06, + "loss": 0.4435, + "step": 2533 + }, + { + "epoch": 0.03437330439500814, + "grad_norm": 6.732353687286377, + "learning_rate": 9.755653008085514e-06, + "loss": 0.3267, + "step": 2534 + }, + { + "epoch": 0.03438686923494303, + "grad_norm": 8.24579906463623, + "learning_rate": 9.755515965465261e-06, + "loss": 0.4373, + "step": 2535 + }, + { + "epoch": 0.03440043407487792, + "grad_norm": 6.458991050720215, + "learning_rate": 9.755378922845006e-06, + "loss": 0.4026, + "step": 2536 + }, + { + "epoch": 0.0344139989148128, + "grad_norm": 6.871464729309082, + "learning_rate": 9.755241880224751e-06, + "loss": 0.3722, + "step": 2537 + }, + { + "epoch": 0.034427563754747696, + "grad_norm": 6.191283226013184, + "learning_rate": 9.755104837604495e-06, + "loss": 0.442, + "step": 2538 + }, + { + "epoch": 0.03444112859468258, + "grad_norm": 6.4015727043151855, + "learning_rate": 9.754967794984242e-06, + "loss": 0.3593, + "step": 2539 + }, + { + "epoch": 0.03445469343461747, + "grad_norm": 6.351172924041748, + "learning_rate": 9.754830752363987e-06, + "loss": 0.2838, + "step": 2540 + }, + { + "epoch": 0.03446825827455236, + "grad_norm": 6.004393577575684, + "learning_rate": 9.75469370974373e-06, + "loss": 0.3804, + "step": 2541 + }, + { + "epoch": 0.03448182311448725, + "grad_norm": 6.067015647888184, + "learning_rate": 9.754556667123476e-06, + "loss": 0.4143, + "step": 2542 + }, + { + "epoch": 0.03449538795442214, + "grad_norm": 10.404399871826172, + "learning_rate": 9.75441962450322e-06, + "loss": 0.5338, + "step": 2543 + }, + { + "epoch": 0.03450895279435703, + "grad_norm": 8.718807220458984, + "learning_rate": 9.754282581882966e-06, + "loss": 0.3367, + "step": 2544 + }, + { + "epoch": 0.03452251763429191, + "grad_norm": 7.528173923492432, + "learning_rate": 9.754145539262711e-06, + "loss": 0.4584, + "step": 2545 + }, + { + "epoch": 0.034536082474226806, + "grad_norm": 7.738710403442383, + "learning_rate": 9.754008496642456e-06, + "loss": 0.4498, + "step": 2546 + }, + { + "epoch": 0.03454964731416169, + "grad_norm": 9.590245246887207, + "learning_rate": 9.753871454022201e-06, + "loss": 0.584, + "step": 2547 + }, + { + "epoch": 0.03456321215409658, + "grad_norm": 8.712860107421875, + "learning_rate": 9.753734411401947e-06, + "loss": 0.4979, + "step": 2548 + }, + { + "epoch": 0.03457677699403147, + "grad_norm": 8.65670108795166, + "learning_rate": 9.753597368781692e-06, + "loss": 0.4113, + "step": 2549 + }, + { + "epoch": 0.03459034183396636, + "grad_norm": 5.19409704208374, + "learning_rate": 9.753460326161437e-06, + "loss": 0.2783, + "step": 2550 + }, + { + "epoch": 0.03460390667390125, + "grad_norm": 6.452942371368408, + "learning_rate": 9.753323283541182e-06, + "loss": 0.2933, + "step": 2551 + }, + { + "epoch": 0.03461747151383614, + "grad_norm": 7.753796577453613, + "learning_rate": 9.753186240920927e-06, + "loss": 0.4493, + "step": 2552 + }, + { + "epoch": 0.03463103635377102, + "grad_norm": 9.679384231567383, + "learning_rate": 9.753049198300673e-06, + "loss": 0.5649, + "step": 2553 + }, + { + "epoch": 0.034644601193705916, + "grad_norm": 5.654171943664551, + "learning_rate": 9.752912155680418e-06, + "loss": 0.2584, + "step": 2554 + }, + { + "epoch": 0.0346581660336408, + "grad_norm": 6.856190204620361, + "learning_rate": 9.752775113060163e-06, + "loss": 0.4647, + "step": 2555 + }, + { + "epoch": 0.03467173087357569, + "grad_norm": 7.491551876068115, + "learning_rate": 9.752638070439906e-06, + "loss": 0.3394, + "step": 2556 + }, + { + "epoch": 0.03468529571351058, + "grad_norm": 10.0944185256958, + "learning_rate": 9.752501027819653e-06, + "loss": 0.4571, + "step": 2557 + }, + { + "epoch": 0.03469886055344547, + "grad_norm": 7.2645673751831055, + "learning_rate": 9.752363985199398e-06, + "loss": 0.3254, + "step": 2558 + }, + { + "epoch": 0.03471242539338036, + "grad_norm": 8.104140281677246, + "learning_rate": 9.752226942579142e-06, + "loss": 0.3799, + "step": 2559 + }, + { + "epoch": 0.03472599023331525, + "grad_norm": 7.508754730224609, + "learning_rate": 9.752089899958887e-06, + "loss": 0.357, + "step": 2560 + }, + { + "epoch": 0.034739555073250133, + "grad_norm": 9.014158248901367, + "learning_rate": 9.751952857338634e-06, + "loss": 0.5151, + "step": 2561 + }, + { + "epoch": 0.03475311991318503, + "grad_norm": 7.183679580688477, + "learning_rate": 9.75181581471838e-06, + "loss": 0.476, + "step": 2562 + }, + { + "epoch": 0.03476668475311991, + "grad_norm": 8.749469757080078, + "learning_rate": 9.751678772098123e-06, + "loss": 0.4366, + "step": 2563 + }, + { + "epoch": 0.0347802495930548, + "grad_norm": 6.909982204437256, + "learning_rate": 9.751541729477868e-06, + "loss": 0.4122, + "step": 2564 + }, + { + "epoch": 0.03479381443298969, + "grad_norm": 10.063055038452148, + "learning_rate": 9.751404686857615e-06, + "loss": 0.6269, + "step": 2565 + }, + { + "epoch": 0.03480737927292458, + "grad_norm": 7.6741485595703125, + "learning_rate": 9.751267644237358e-06, + "loss": 0.375, + "step": 2566 + }, + { + "epoch": 0.03482094411285947, + "grad_norm": 7.383215427398682, + "learning_rate": 9.751130601617103e-06, + "loss": 0.4652, + "step": 2567 + }, + { + "epoch": 0.03483450895279436, + "grad_norm": 8.722061157226562, + "learning_rate": 9.750993558996849e-06, + "loss": 0.5215, + "step": 2568 + }, + { + "epoch": 0.034848073792729244, + "grad_norm": 8.523194313049316, + "learning_rate": 9.750856516376594e-06, + "loss": 0.448, + "step": 2569 + }, + { + "epoch": 0.03486163863266414, + "grad_norm": 7.315666675567627, + "learning_rate": 9.750719473756339e-06, + "loss": 0.385, + "step": 2570 + }, + { + "epoch": 0.03487520347259902, + "grad_norm": 7.608819007873535, + "learning_rate": 9.750582431136084e-06, + "loss": 0.3911, + "step": 2571 + }, + { + "epoch": 0.03488876831253391, + "grad_norm": 8.650077819824219, + "learning_rate": 9.75044538851583e-06, + "loss": 0.449, + "step": 2572 + }, + { + "epoch": 0.0349023331524688, + "grad_norm": 9.50430679321289, + "learning_rate": 9.750308345895574e-06, + "loss": 0.6055, + "step": 2573 + }, + { + "epoch": 0.03491589799240369, + "grad_norm": 7.039056301116943, + "learning_rate": 9.75017130327532e-06, + "loss": 0.3776, + "step": 2574 + }, + { + "epoch": 0.03492946283233858, + "grad_norm": 8.501744270324707, + "learning_rate": 9.750034260655065e-06, + "loss": 0.4734, + "step": 2575 + }, + { + "epoch": 0.03494302767227347, + "grad_norm": 7.450278282165527, + "learning_rate": 9.74989721803481e-06, + "loss": 0.4372, + "step": 2576 + }, + { + "epoch": 0.034956592512208354, + "grad_norm": 6.7744951248168945, + "learning_rate": 9.749760175414555e-06, + "loss": 0.4991, + "step": 2577 + }, + { + "epoch": 0.03497015735214325, + "grad_norm": 6.929047584533691, + "learning_rate": 9.7496231327943e-06, + "loss": 0.4269, + "step": 2578 + }, + { + "epoch": 0.03498372219207813, + "grad_norm": 8.704987525939941, + "learning_rate": 9.749486090174046e-06, + "loss": 0.4666, + "step": 2579 + }, + { + "epoch": 0.03499728703201302, + "grad_norm": 6.014132499694824, + "learning_rate": 9.74934904755379e-06, + "loss": 0.414, + "step": 2580 + }, + { + "epoch": 0.03501085187194791, + "grad_norm": 9.077361106872559, + "learning_rate": 9.749212004933534e-06, + "loss": 0.4564, + "step": 2581 + }, + { + "epoch": 0.0350244167118828, + "grad_norm": 10.2825288772583, + "learning_rate": 9.749074962313281e-06, + "loss": 0.4913, + "step": 2582 + }, + { + "epoch": 0.03503798155181769, + "grad_norm": 7.14277458190918, + "learning_rate": 9.748937919693026e-06, + "loss": 0.4667, + "step": 2583 + }, + { + "epoch": 0.03505154639175258, + "grad_norm": 7.833453178405762, + "learning_rate": 9.74880087707277e-06, + "loss": 0.4836, + "step": 2584 + }, + { + "epoch": 0.035065111231687464, + "grad_norm": 9.26613712310791, + "learning_rate": 9.748663834452515e-06, + "loss": 0.5534, + "step": 2585 + }, + { + "epoch": 0.03507867607162236, + "grad_norm": 6.707428455352783, + "learning_rate": 9.74852679183226e-06, + "loss": 0.3305, + "step": 2586 + }, + { + "epoch": 0.03509224091155724, + "grad_norm": 6.163125038146973, + "learning_rate": 9.748389749212005e-06, + "loss": 0.2991, + "step": 2587 + }, + { + "epoch": 0.03510580575149213, + "grad_norm": 7.936273574829102, + "learning_rate": 9.74825270659175e-06, + "loss": 0.6276, + "step": 2588 + }, + { + "epoch": 0.03511937059142702, + "grad_norm": 5.5440826416015625, + "learning_rate": 9.748115663971496e-06, + "loss": 0.2897, + "step": 2589 + }, + { + "epoch": 0.03513293543136191, + "grad_norm": 8.129725456237793, + "learning_rate": 9.74797862135124e-06, + "loss": 0.4854, + "step": 2590 + }, + { + "epoch": 0.0351465002712968, + "grad_norm": 7.965428829193115, + "learning_rate": 9.747841578730986e-06, + "loss": 0.4221, + "step": 2591 + }, + { + "epoch": 0.03516006511123169, + "grad_norm": 8.29982852935791, + "learning_rate": 9.747704536110731e-06, + "loss": 0.4947, + "step": 2592 + }, + { + "epoch": 0.035173629951166574, + "grad_norm": 4.266873836517334, + "learning_rate": 9.747567493490476e-06, + "loss": 0.2234, + "step": 2593 + }, + { + "epoch": 0.03518719479110147, + "grad_norm": 7.3198018074035645, + "learning_rate": 9.747430450870222e-06, + "loss": 0.495, + "step": 2594 + }, + { + "epoch": 0.03520075963103635, + "grad_norm": 6.688138961791992, + "learning_rate": 9.747293408249967e-06, + "loss": 0.3579, + "step": 2595 + }, + { + "epoch": 0.03521432447097124, + "grad_norm": 7.3272271156311035, + "learning_rate": 9.747156365629712e-06, + "loss": 0.3372, + "step": 2596 + }, + { + "epoch": 0.03522788931090613, + "grad_norm": 7.216460227966309, + "learning_rate": 9.747019323009457e-06, + "loss": 0.4325, + "step": 2597 + }, + { + "epoch": 0.03524145415084102, + "grad_norm": 6.791269779205322, + "learning_rate": 9.746882280389202e-06, + "loss": 0.4486, + "step": 2598 + }, + { + "epoch": 0.03525501899077591, + "grad_norm": 7.031225204467773, + "learning_rate": 9.746745237768946e-06, + "loss": 0.4497, + "step": 2599 + }, + { + "epoch": 0.0352685838307108, + "grad_norm": 6.494858264923096, + "learning_rate": 9.746608195148693e-06, + "loss": 0.3793, + "step": 2600 + }, + { + "epoch": 0.035282148670645684, + "grad_norm": 5.733344078063965, + "learning_rate": 9.746471152528438e-06, + "loss": 0.3661, + "step": 2601 + }, + { + "epoch": 0.03529571351058058, + "grad_norm": 6.570181369781494, + "learning_rate": 9.746334109908181e-06, + "loss": 0.2795, + "step": 2602 + }, + { + "epoch": 0.03530927835051546, + "grad_norm": 7.355105400085449, + "learning_rate": 9.746197067287926e-06, + "loss": 0.384, + "step": 2603 + }, + { + "epoch": 0.03532284319045035, + "grad_norm": 6.931838512420654, + "learning_rate": 9.746060024667673e-06, + "loss": 0.3559, + "step": 2604 + }, + { + "epoch": 0.03533640803038524, + "grad_norm": 5.6576642990112305, + "learning_rate": 9.745922982047418e-06, + "loss": 0.3835, + "step": 2605 + }, + { + "epoch": 0.03534997287032013, + "grad_norm": 7.809131622314453, + "learning_rate": 9.745785939427162e-06, + "loss": 0.5005, + "step": 2606 + }, + { + "epoch": 0.03536353771025502, + "grad_norm": 7.919498443603516, + "learning_rate": 9.745648896806907e-06, + "loss": 0.4594, + "step": 2607 + }, + { + "epoch": 0.03537710255018991, + "grad_norm": 9.234175682067871, + "learning_rate": 9.745511854186654e-06, + "loss": 0.4565, + "step": 2608 + }, + { + "epoch": 0.035390667390124794, + "grad_norm": 9.688628196716309, + "learning_rate": 9.745374811566397e-06, + "loss": 0.4886, + "step": 2609 + }, + { + "epoch": 0.03540423223005969, + "grad_norm": 8.577376365661621, + "learning_rate": 9.745237768946143e-06, + "loss": 0.6877, + "step": 2610 + }, + { + "epoch": 0.035417797069994574, + "grad_norm": 6.017542839050293, + "learning_rate": 9.745100726325888e-06, + "loss": 0.394, + "step": 2611 + }, + { + "epoch": 0.03543136190992946, + "grad_norm": 6.752650737762451, + "learning_rate": 9.744963683705633e-06, + "loss": 0.3141, + "step": 2612 + }, + { + "epoch": 0.03544492674986435, + "grad_norm": 6.304338455200195, + "learning_rate": 9.744826641085378e-06, + "loss": 0.4181, + "step": 2613 + }, + { + "epoch": 0.03545849158979924, + "grad_norm": 6.569177150726318, + "learning_rate": 9.744689598465123e-06, + "loss": 0.435, + "step": 2614 + }, + { + "epoch": 0.03547205642973413, + "grad_norm": 8.880146026611328, + "learning_rate": 9.744552555844869e-06, + "loss": 0.5495, + "step": 2615 + }, + { + "epoch": 0.03548562126966902, + "grad_norm": 8.08362102508545, + "learning_rate": 9.744415513224614e-06, + "loss": 0.3752, + "step": 2616 + }, + { + "epoch": 0.035499186109603904, + "grad_norm": 10.411994934082031, + "learning_rate": 9.744278470604359e-06, + "loss": 0.5443, + "step": 2617 + }, + { + "epoch": 0.0355127509495388, + "grad_norm": 9.414462089538574, + "learning_rate": 9.744141427984104e-06, + "loss": 0.4406, + "step": 2618 + }, + { + "epoch": 0.035526315789473684, + "grad_norm": 6.886204719543457, + "learning_rate": 9.74400438536385e-06, + "loss": 0.4423, + "step": 2619 + }, + { + "epoch": 0.03553988062940857, + "grad_norm": 7.966657638549805, + "learning_rate": 9.743867342743594e-06, + "loss": 0.4447, + "step": 2620 + }, + { + "epoch": 0.03555344546934346, + "grad_norm": 8.524323463439941, + "learning_rate": 9.74373030012334e-06, + "loss": 0.5478, + "step": 2621 + }, + { + "epoch": 0.03556701030927835, + "grad_norm": 9.147424697875977, + "learning_rate": 9.743593257503085e-06, + "loss": 0.4921, + "step": 2622 + }, + { + "epoch": 0.03558057514921324, + "grad_norm": 5.625079154968262, + "learning_rate": 9.74345621488283e-06, + "loss": 0.3049, + "step": 2623 + }, + { + "epoch": 0.03559413998914813, + "grad_norm": 5.825267791748047, + "learning_rate": 9.743319172262573e-06, + "loss": 0.3975, + "step": 2624 + }, + { + "epoch": 0.035607704829083014, + "grad_norm": 7.468721389770508, + "learning_rate": 9.743182129642319e-06, + "loss": 0.3996, + "step": 2625 + }, + { + "epoch": 0.03562126966901791, + "grad_norm": 6.476320743560791, + "learning_rate": 9.743045087022066e-06, + "loss": 0.4274, + "step": 2626 + }, + { + "epoch": 0.035634834508952794, + "grad_norm": 9.250251770019531, + "learning_rate": 9.742908044401809e-06, + "loss": 0.5437, + "step": 2627 + }, + { + "epoch": 0.03564839934888768, + "grad_norm": 5.550039291381836, + "learning_rate": 9.742771001781554e-06, + "loss": 0.385, + "step": 2628 + }, + { + "epoch": 0.03566196418882257, + "grad_norm": 5.22631311416626, + "learning_rate": 9.7426339591613e-06, + "loss": 0.3875, + "step": 2629 + }, + { + "epoch": 0.03567552902875746, + "grad_norm": 7.69720983505249, + "learning_rate": 9.742496916541046e-06, + "loss": 0.4775, + "step": 2630 + }, + { + "epoch": 0.03568909386869235, + "grad_norm": 7.378520965576172, + "learning_rate": 9.74235987392079e-06, + "loss": 0.4857, + "step": 2631 + }, + { + "epoch": 0.03570265870862724, + "grad_norm": 8.331825256347656, + "learning_rate": 9.742222831300535e-06, + "loss": 0.4132, + "step": 2632 + }, + { + "epoch": 0.035716223548562125, + "grad_norm": 9.630934715270996, + "learning_rate": 9.74208578868028e-06, + "loss": 0.469, + "step": 2633 + }, + { + "epoch": 0.03572978838849702, + "grad_norm": 8.260393142700195, + "learning_rate": 9.741948746060025e-06, + "loss": 0.515, + "step": 2634 + }, + { + "epoch": 0.035743353228431904, + "grad_norm": 7.827507972717285, + "learning_rate": 9.74181170343977e-06, + "loss": 0.4213, + "step": 2635 + }, + { + "epoch": 0.03575691806836679, + "grad_norm": 8.054658889770508, + "learning_rate": 9.741674660819516e-06, + "loss": 0.5804, + "step": 2636 + }, + { + "epoch": 0.03577048290830168, + "grad_norm": 7.067508697509766, + "learning_rate": 9.74153761819926e-06, + "loss": 0.6042, + "step": 2637 + }, + { + "epoch": 0.03578404774823657, + "grad_norm": 8.007244110107422, + "learning_rate": 9.741400575579006e-06, + "loss": 0.7672, + "step": 2638 + }, + { + "epoch": 0.03579761258817146, + "grad_norm": 7.818734169006348, + "learning_rate": 9.741263532958751e-06, + "loss": 0.519, + "step": 2639 + }, + { + "epoch": 0.03581117742810635, + "grad_norm": 5.578385829925537, + "learning_rate": 9.741126490338496e-06, + "loss": 0.4225, + "step": 2640 + }, + { + "epoch": 0.035824742268041235, + "grad_norm": 10.852927207946777, + "learning_rate": 9.740989447718242e-06, + "loss": 0.6556, + "step": 2641 + }, + { + "epoch": 0.03583830710797613, + "grad_norm": 7.257878303527832, + "learning_rate": 9.740852405097985e-06, + "loss": 0.4673, + "step": 2642 + }, + { + "epoch": 0.035851871947911014, + "grad_norm": 8.41199779510498, + "learning_rate": 9.740715362477732e-06, + "loss": 0.379, + "step": 2643 + }, + { + "epoch": 0.0358654367878459, + "grad_norm": 9.266315460205078, + "learning_rate": 9.740578319857477e-06, + "loss": 0.6402, + "step": 2644 + }, + { + "epoch": 0.03587900162778079, + "grad_norm": 6.534460544586182, + "learning_rate": 9.740441277237222e-06, + "loss": 0.3828, + "step": 2645 + }, + { + "epoch": 0.03589256646771568, + "grad_norm": 6.0957746505737305, + "learning_rate": 9.740304234616966e-06, + "loss": 0.3658, + "step": 2646 + }, + { + "epoch": 0.03590613130765057, + "grad_norm": 5.940039157867432, + "learning_rate": 9.740167191996713e-06, + "loss": 0.3807, + "step": 2647 + }, + { + "epoch": 0.03591969614758546, + "grad_norm": 11.143567085266113, + "learning_rate": 9.740030149376458e-06, + "loss": 0.6816, + "step": 2648 + }, + { + "epoch": 0.035933260987520345, + "grad_norm": 5.481468200683594, + "learning_rate": 9.739893106756201e-06, + "loss": 0.4398, + "step": 2649 + }, + { + "epoch": 0.03594682582745524, + "grad_norm": 7.272068977355957, + "learning_rate": 9.739756064135946e-06, + "loss": 0.5468, + "step": 2650 + }, + { + "epoch": 0.035960390667390124, + "grad_norm": 7.569608211517334, + "learning_rate": 9.739619021515693e-06, + "loss": 0.4523, + "step": 2651 + }, + { + "epoch": 0.03597395550732501, + "grad_norm": 7.545230388641357, + "learning_rate": 9.739481978895437e-06, + "loss": 0.461, + "step": 2652 + }, + { + "epoch": 0.0359875203472599, + "grad_norm": 6.203386306762695, + "learning_rate": 9.739344936275182e-06, + "loss": 0.3729, + "step": 2653 + }, + { + "epoch": 0.03600108518719479, + "grad_norm": 6.412045478820801, + "learning_rate": 9.739207893654927e-06, + "loss": 0.5581, + "step": 2654 + }, + { + "epoch": 0.03601465002712968, + "grad_norm": 6.679462909698486, + "learning_rate": 9.739070851034672e-06, + "loss": 0.4062, + "step": 2655 + }, + { + "epoch": 0.03602821486706457, + "grad_norm": 8.94129753112793, + "learning_rate": 9.738933808414418e-06, + "loss": 0.553, + "step": 2656 + }, + { + "epoch": 0.036041779706999455, + "grad_norm": 11.580521583557129, + "learning_rate": 9.738796765794163e-06, + "loss": 0.6685, + "step": 2657 + }, + { + "epoch": 0.03605534454693435, + "grad_norm": 8.204806327819824, + "learning_rate": 9.738659723173908e-06, + "loss": 0.4776, + "step": 2658 + }, + { + "epoch": 0.036068909386869234, + "grad_norm": 10.111762046813965, + "learning_rate": 9.738522680553653e-06, + "loss": 0.6169, + "step": 2659 + }, + { + "epoch": 0.03608247422680412, + "grad_norm": 4.0607500076293945, + "learning_rate": 9.738385637933398e-06, + "loss": 0.3025, + "step": 2660 + }, + { + "epoch": 0.036096039066739014, + "grad_norm": 5.6517133712768555, + "learning_rate": 9.738248595313143e-06, + "loss": 0.2594, + "step": 2661 + }, + { + "epoch": 0.0361096039066739, + "grad_norm": 6.237009525299072, + "learning_rate": 9.738111552692889e-06, + "loss": 0.3978, + "step": 2662 + }, + { + "epoch": 0.03612316874660879, + "grad_norm": 7.200744152069092, + "learning_rate": 9.737974510072634e-06, + "loss": 0.3561, + "step": 2663 + }, + { + "epoch": 0.03613673358654368, + "grad_norm": 6.347243785858154, + "learning_rate": 9.737837467452379e-06, + "loss": 0.3548, + "step": 2664 + }, + { + "epoch": 0.036150298426478565, + "grad_norm": 7.60599946975708, + "learning_rate": 9.737700424832124e-06, + "loss": 0.6212, + "step": 2665 + }, + { + "epoch": 0.03616386326641346, + "grad_norm": 8.653366088867188, + "learning_rate": 9.73756338221187e-06, + "loss": 0.566, + "step": 2666 + }, + { + "epoch": 0.036177428106348344, + "grad_norm": 6.64951229095459, + "learning_rate": 9.737426339591613e-06, + "loss": 0.46, + "step": 2667 + }, + { + "epoch": 0.03619099294628323, + "grad_norm": 6.783572196960449, + "learning_rate": 9.737289296971358e-06, + "loss": 0.3423, + "step": 2668 + }, + { + "epoch": 0.036204557786218124, + "grad_norm": 7.704829216003418, + "learning_rate": 9.737152254351105e-06, + "loss": 0.5727, + "step": 2669 + }, + { + "epoch": 0.03621812262615301, + "grad_norm": 6.415317058563232, + "learning_rate": 9.73701521173085e-06, + "loss": 0.4301, + "step": 2670 + }, + { + "epoch": 0.0362316874660879, + "grad_norm": 8.026145935058594, + "learning_rate": 9.736878169110594e-06, + "loss": 0.5356, + "step": 2671 + }, + { + "epoch": 0.03624525230602279, + "grad_norm": 8.373662948608398, + "learning_rate": 9.736741126490339e-06, + "loss": 0.4125, + "step": 2672 + }, + { + "epoch": 0.036258817145957675, + "grad_norm": 7.531220436096191, + "learning_rate": 9.736604083870086e-06, + "loss": 0.5208, + "step": 2673 + }, + { + "epoch": 0.03627238198589257, + "grad_norm": 10.731002807617188, + "learning_rate": 9.736467041249829e-06, + "loss": 0.7654, + "step": 2674 + }, + { + "epoch": 0.036285946825827455, + "grad_norm": 9.148218154907227, + "learning_rate": 9.736329998629574e-06, + "loss": 0.5439, + "step": 2675 + }, + { + "epoch": 0.03629951166576234, + "grad_norm": 8.247156143188477, + "learning_rate": 9.73619295600932e-06, + "loss": 0.5039, + "step": 2676 + }, + { + "epoch": 0.036313076505697234, + "grad_norm": 7.714818000793457, + "learning_rate": 9.736055913389065e-06, + "loss": 0.5947, + "step": 2677 + }, + { + "epoch": 0.03632664134563212, + "grad_norm": 7.41107702255249, + "learning_rate": 9.73591887076881e-06, + "loss": 0.5236, + "step": 2678 + }, + { + "epoch": 0.03634020618556701, + "grad_norm": 8.908085823059082, + "learning_rate": 9.735781828148555e-06, + "loss": 0.6532, + "step": 2679 + }, + { + "epoch": 0.0363537710255019, + "grad_norm": 8.076541900634766, + "learning_rate": 9.7356447855283e-06, + "loss": 0.6206, + "step": 2680 + }, + { + "epoch": 0.036367335865436785, + "grad_norm": 8.110062599182129, + "learning_rate": 9.735507742908045e-06, + "loss": 0.578, + "step": 2681 + }, + { + "epoch": 0.03638090070537168, + "grad_norm": 7.127892971038818, + "learning_rate": 9.73537070028779e-06, + "loss": 0.4408, + "step": 2682 + }, + { + "epoch": 0.036394465545306565, + "grad_norm": 6.599932670593262, + "learning_rate": 9.735233657667536e-06, + "loss": 0.3832, + "step": 2683 + }, + { + "epoch": 0.03640803038524145, + "grad_norm": 8.253381729125977, + "learning_rate": 9.73509661504728e-06, + "loss": 0.5479, + "step": 2684 + }, + { + "epoch": 0.036421595225176344, + "grad_norm": 8.901662826538086, + "learning_rate": 9.734959572427026e-06, + "loss": 0.5227, + "step": 2685 + }, + { + "epoch": 0.03643516006511123, + "grad_norm": 8.148942947387695, + "learning_rate": 9.734822529806771e-06, + "loss": 0.449, + "step": 2686 + }, + { + "epoch": 0.03644872490504612, + "grad_norm": 8.427522659301758, + "learning_rate": 9.734685487186516e-06, + "loss": 0.4839, + "step": 2687 + }, + { + "epoch": 0.03646228974498101, + "grad_norm": 6.2093825340271, + "learning_rate": 9.734548444566262e-06, + "loss": 0.4307, + "step": 2688 + }, + { + "epoch": 0.036475854584915895, + "grad_norm": 8.204337120056152, + "learning_rate": 9.734411401946005e-06, + "loss": 0.5393, + "step": 2689 + }, + { + "epoch": 0.03648941942485079, + "grad_norm": 6.3967671394348145, + "learning_rate": 9.734274359325752e-06, + "loss": 0.5082, + "step": 2690 + }, + { + "epoch": 0.036502984264785675, + "grad_norm": 6.267707347869873, + "learning_rate": 9.734137316705497e-06, + "loss": 0.4603, + "step": 2691 + }, + { + "epoch": 0.03651654910472056, + "grad_norm": 5.1933135986328125, + "learning_rate": 9.73400027408524e-06, + "loss": 0.2962, + "step": 2692 + }, + { + "epoch": 0.036530113944655454, + "grad_norm": 9.431097984313965, + "learning_rate": 9.733863231464986e-06, + "loss": 0.6592, + "step": 2693 + }, + { + "epoch": 0.03654367878459034, + "grad_norm": 6.868070125579834, + "learning_rate": 9.733726188844731e-06, + "loss": 0.4874, + "step": 2694 + }, + { + "epoch": 0.03655724362452523, + "grad_norm": 6.782998085021973, + "learning_rate": 9.733589146224476e-06, + "loss": 0.355, + "step": 2695 + }, + { + "epoch": 0.03657080846446012, + "grad_norm": 9.54871654510498, + "learning_rate": 9.733452103604221e-06, + "loss": 0.5378, + "step": 2696 + }, + { + "epoch": 0.036584373304395006, + "grad_norm": 6.79172420501709, + "learning_rate": 9.733315060983966e-06, + "loss": 0.4307, + "step": 2697 + }, + { + "epoch": 0.0365979381443299, + "grad_norm": 7.040103435516357, + "learning_rate": 9.733178018363712e-06, + "loss": 0.4087, + "step": 2698 + }, + { + "epoch": 0.036611502984264785, + "grad_norm": 7.5968017578125, + "learning_rate": 9.733040975743457e-06, + "loss": 0.3787, + "step": 2699 + }, + { + "epoch": 0.03662506782419967, + "grad_norm": 7.653820514678955, + "learning_rate": 9.732903933123202e-06, + "loss": 0.4907, + "step": 2700 + }, + { + "epoch": 0.036638632664134564, + "grad_norm": 6.231919765472412, + "learning_rate": 9.732766890502947e-06, + "loss": 0.4119, + "step": 2701 + }, + { + "epoch": 0.03665219750406945, + "grad_norm": 6.035948753356934, + "learning_rate": 9.732629847882692e-06, + "loss": 0.4184, + "step": 2702 + }, + { + "epoch": 0.03666576234400434, + "grad_norm": 7.675937175750732, + "learning_rate": 9.732492805262438e-06, + "loss": 0.4241, + "step": 2703 + }, + { + "epoch": 0.03667932718393923, + "grad_norm": 7.196500301361084, + "learning_rate": 9.732355762642183e-06, + "loss": 0.4696, + "step": 2704 + }, + { + "epoch": 0.036692892023874116, + "grad_norm": 5.7461018562316895, + "learning_rate": 9.732218720021928e-06, + "loss": 0.3911, + "step": 2705 + }, + { + "epoch": 0.03670645686380901, + "grad_norm": 7.213900089263916, + "learning_rate": 9.732081677401673e-06, + "loss": 0.4229, + "step": 2706 + }, + { + "epoch": 0.036720021703743895, + "grad_norm": 7.082585334777832, + "learning_rate": 9.731944634781417e-06, + "loss": 0.434, + "step": 2707 + }, + { + "epoch": 0.03673358654367878, + "grad_norm": 7.268657684326172, + "learning_rate": 9.731807592161163e-06, + "loss": 0.4035, + "step": 2708 + }, + { + "epoch": 0.036747151383613674, + "grad_norm": 9.529043197631836, + "learning_rate": 9.731670549540909e-06, + "loss": 0.4599, + "step": 2709 + }, + { + "epoch": 0.03676071622354856, + "grad_norm": 6.769025802612305, + "learning_rate": 9.731533506920652e-06, + "loss": 0.5103, + "step": 2710 + }, + { + "epoch": 0.036774281063483454, + "grad_norm": 7.708554744720459, + "learning_rate": 9.731396464300397e-06, + "loss": 0.3334, + "step": 2711 + }, + { + "epoch": 0.03678784590341834, + "grad_norm": 8.21073055267334, + "learning_rate": 9.731259421680144e-06, + "loss": 0.3842, + "step": 2712 + }, + { + "epoch": 0.036801410743353226, + "grad_norm": 7.093080520629883, + "learning_rate": 9.73112237905989e-06, + "loss": 0.496, + "step": 2713 + }, + { + "epoch": 0.03681497558328812, + "grad_norm": 8.315792083740234, + "learning_rate": 9.730985336439633e-06, + "loss": 0.432, + "step": 2714 + }, + { + "epoch": 0.036828540423223005, + "grad_norm": 9.862016677856445, + "learning_rate": 9.730848293819378e-06, + "loss": 0.4546, + "step": 2715 + }, + { + "epoch": 0.03684210526315789, + "grad_norm": 8.36974811553955, + "learning_rate": 9.730711251199125e-06, + "loss": 0.5049, + "step": 2716 + }, + { + "epoch": 0.036855670103092784, + "grad_norm": 5.6952972412109375, + "learning_rate": 9.730574208578868e-06, + "loss": 0.297, + "step": 2717 + }, + { + "epoch": 0.03686923494302767, + "grad_norm": 9.639914512634277, + "learning_rate": 9.730437165958614e-06, + "loss": 0.6413, + "step": 2718 + }, + { + "epoch": 0.036882799782962564, + "grad_norm": 9.90926456451416, + "learning_rate": 9.730300123338359e-06, + "loss": 0.5357, + "step": 2719 + }, + { + "epoch": 0.03689636462289745, + "grad_norm": 6.0857439041137695, + "learning_rate": 9.730163080718104e-06, + "loss": 0.3586, + "step": 2720 + }, + { + "epoch": 0.036909929462832336, + "grad_norm": 9.166800498962402, + "learning_rate": 9.730026038097849e-06, + "loss": 0.5881, + "step": 2721 + }, + { + "epoch": 0.03692349430276723, + "grad_norm": 8.03790283203125, + "learning_rate": 9.729888995477594e-06, + "loss": 0.5501, + "step": 2722 + }, + { + "epoch": 0.036937059142702115, + "grad_norm": 6.297428607940674, + "learning_rate": 9.72975195285734e-06, + "loss": 0.3029, + "step": 2723 + }, + { + "epoch": 0.036950623982637, + "grad_norm": 9.2966947555542, + "learning_rate": 9.729614910237085e-06, + "loss": 0.5791, + "step": 2724 + }, + { + "epoch": 0.036964188822571895, + "grad_norm": 7.687098503112793, + "learning_rate": 9.72947786761683e-06, + "loss": 0.4312, + "step": 2725 + }, + { + "epoch": 0.03697775366250678, + "grad_norm": 6.086301803588867, + "learning_rate": 9.729340824996575e-06, + "loss": 0.4283, + "step": 2726 + }, + { + "epoch": 0.036991318502441674, + "grad_norm": 8.64795970916748, + "learning_rate": 9.72920378237632e-06, + "loss": 0.4904, + "step": 2727 + }, + { + "epoch": 0.03700488334237656, + "grad_norm": 9.664411544799805, + "learning_rate": 9.729066739756065e-06, + "loss": 0.4404, + "step": 2728 + }, + { + "epoch": 0.037018448182311446, + "grad_norm": 7.007634162902832, + "learning_rate": 9.72892969713581e-06, + "loss": 0.5053, + "step": 2729 + }, + { + "epoch": 0.03703201302224634, + "grad_norm": 6.635313987731934, + "learning_rate": 9.728792654515556e-06, + "loss": 0.4171, + "step": 2730 + }, + { + "epoch": 0.037045577862181225, + "grad_norm": 8.393570899963379, + "learning_rate": 9.7286556118953e-06, + "loss": 0.4311, + "step": 2731 + }, + { + "epoch": 0.03705914270211612, + "grad_norm": 7.3830718994140625, + "learning_rate": 9.728518569275044e-06, + "loss": 0.3354, + "step": 2732 + }, + { + "epoch": 0.037072707542051005, + "grad_norm": 8.924093246459961, + "learning_rate": 9.728381526654791e-06, + "loss": 0.7512, + "step": 2733 + }, + { + "epoch": 0.03708627238198589, + "grad_norm": 5.993508338928223, + "learning_rate": 9.728244484034536e-06, + "loss": 0.322, + "step": 2734 + }, + { + "epoch": 0.037099837221920784, + "grad_norm": 6.442256450653076, + "learning_rate": 9.72810744141428e-06, + "loss": 0.4612, + "step": 2735 + }, + { + "epoch": 0.03711340206185567, + "grad_norm": 9.762947082519531, + "learning_rate": 9.727970398794025e-06, + "loss": 0.4276, + "step": 2736 + }, + { + "epoch": 0.037126966901790556, + "grad_norm": 11.821605682373047, + "learning_rate": 9.72783335617377e-06, + "loss": 0.7335, + "step": 2737 + }, + { + "epoch": 0.03714053174172545, + "grad_norm": 10.783345222473145, + "learning_rate": 9.727696313553517e-06, + "loss": 0.6055, + "step": 2738 + }, + { + "epoch": 0.037154096581660336, + "grad_norm": 6.828760623931885, + "learning_rate": 9.72755927093326e-06, + "loss": 0.3506, + "step": 2739 + }, + { + "epoch": 0.03716766142159523, + "grad_norm": 6.240414142608643, + "learning_rate": 9.727422228313006e-06, + "loss": 0.4231, + "step": 2740 + }, + { + "epoch": 0.037181226261530115, + "grad_norm": 8.00472354888916, + "learning_rate": 9.727285185692751e-06, + "loss": 0.4975, + "step": 2741 + }, + { + "epoch": 0.037194791101465, + "grad_norm": 6.387625694274902, + "learning_rate": 9.727148143072496e-06, + "loss": 0.4371, + "step": 2742 + }, + { + "epoch": 0.037208355941399894, + "grad_norm": 6.524559020996094, + "learning_rate": 9.727011100452241e-06, + "loss": 0.4472, + "step": 2743 + }, + { + "epoch": 0.03722192078133478, + "grad_norm": 7.937688827514648, + "learning_rate": 9.726874057831986e-06, + "loss": 0.5675, + "step": 2744 + }, + { + "epoch": 0.037235485621269666, + "grad_norm": 9.318580627441406, + "learning_rate": 9.726737015211732e-06, + "loss": 0.5417, + "step": 2745 + }, + { + "epoch": 0.03724905046120456, + "grad_norm": 7.432001113891602, + "learning_rate": 9.726599972591477e-06, + "loss": 0.4219, + "step": 2746 + }, + { + "epoch": 0.037262615301139446, + "grad_norm": 7.556669235229492, + "learning_rate": 9.726462929971222e-06, + "loss": 0.4649, + "step": 2747 + }, + { + "epoch": 0.03727618014107434, + "grad_norm": 7.757282733917236, + "learning_rate": 9.726325887350967e-06, + "loss": 0.3448, + "step": 2748 + }, + { + "epoch": 0.037289744981009225, + "grad_norm": 9.440956115722656, + "learning_rate": 9.726188844730712e-06, + "loss": 0.5709, + "step": 2749 + }, + { + "epoch": 0.03730330982094411, + "grad_norm": 6.542489528656006, + "learning_rate": 9.726051802110456e-06, + "loss": 0.3874, + "step": 2750 + }, + { + "epoch": 0.037316874660879004, + "grad_norm": 9.58883285522461, + "learning_rate": 9.725914759490203e-06, + "loss": 0.5911, + "step": 2751 + }, + { + "epoch": 0.03733043950081389, + "grad_norm": 7.646727561950684, + "learning_rate": 9.725777716869948e-06, + "loss": 0.3651, + "step": 2752 + }, + { + "epoch": 0.037344004340748777, + "grad_norm": 5.984445571899414, + "learning_rate": 9.725640674249693e-06, + "loss": 0.4136, + "step": 2753 + }, + { + "epoch": 0.03735756918068367, + "grad_norm": 5.753891468048096, + "learning_rate": 9.725503631629437e-06, + "loss": 0.4375, + "step": 2754 + }, + { + "epoch": 0.037371134020618556, + "grad_norm": 7.766136169433594, + "learning_rate": 9.725366589009183e-06, + "loss": 0.5288, + "step": 2755 + }, + { + "epoch": 0.03738469886055345, + "grad_norm": 10.080536842346191, + "learning_rate": 9.725229546388929e-06, + "loss": 0.6619, + "step": 2756 + }, + { + "epoch": 0.037398263700488335, + "grad_norm": 6.148612976074219, + "learning_rate": 9.725092503768672e-06, + "loss": 0.3518, + "step": 2757 + }, + { + "epoch": 0.03741182854042322, + "grad_norm": 7.9684977531433105, + "learning_rate": 9.724955461148417e-06, + "loss": 0.4223, + "step": 2758 + }, + { + "epoch": 0.037425393380358114, + "grad_norm": 7.821132659912109, + "learning_rate": 9.724818418528164e-06, + "loss": 0.4633, + "step": 2759 + }, + { + "epoch": 0.037438958220293, + "grad_norm": 9.539966583251953, + "learning_rate": 9.724681375907908e-06, + "loss": 0.5539, + "step": 2760 + }, + { + "epoch": 0.03745252306022789, + "grad_norm": 9.9724760055542, + "learning_rate": 9.724544333287653e-06, + "loss": 0.6003, + "step": 2761 + }, + { + "epoch": 0.03746608790016278, + "grad_norm": 11.4746732711792, + "learning_rate": 9.724407290667398e-06, + "loss": 0.7462, + "step": 2762 + }, + { + "epoch": 0.037479652740097666, + "grad_norm": 8.705740928649902, + "learning_rate": 9.724270248047143e-06, + "loss": 0.5649, + "step": 2763 + }, + { + "epoch": 0.03749321758003256, + "grad_norm": 10.44293212890625, + "learning_rate": 9.724133205426888e-06, + "loss": 0.696, + "step": 2764 + }, + { + "epoch": 0.037506782419967445, + "grad_norm": 6.808481216430664, + "learning_rate": 9.723996162806634e-06, + "loss": 0.4113, + "step": 2765 + }, + { + "epoch": 0.03752034725990233, + "grad_norm": 9.949199676513672, + "learning_rate": 9.723859120186379e-06, + "loss": 0.5159, + "step": 2766 + }, + { + "epoch": 0.037533912099837224, + "grad_norm": 9.318078994750977, + "learning_rate": 9.723722077566124e-06, + "loss": 0.5844, + "step": 2767 + }, + { + "epoch": 0.03754747693977211, + "grad_norm": 9.274469375610352, + "learning_rate": 9.723585034945869e-06, + "loss": 0.5705, + "step": 2768 + }, + { + "epoch": 0.037561041779707, + "grad_norm": 10.949190139770508, + "learning_rate": 9.723447992325614e-06, + "loss": 0.6227, + "step": 2769 + }, + { + "epoch": 0.03757460661964189, + "grad_norm": 6.025665283203125, + "learning_rate": 9.72331094970536e-06, + "loss": 0.4795, + "step": 2770 + }, + { + "epoch": 0.037588171459576776, + "grad_norm": 6.621372699737549, + "learning_rate": 9.723173907085105e-06, + "loss": 0.4146, + "step": 2771 + }, + { + "epoch": 0.03760173629951167, + "grad_norm": 9.159516334533691, + "learning_rate": 9.72303686446485e-06, + "loss": 0.5834, + "step": 2772 + }, + { + "epoch": 0.037615301139446555, + "grad_norm": 5.821058750152588, + "learning_rate": 9.722899821844595e-06, + "loss": 0.3674, + "step": 2773 + }, + { + "epoch": 0.03762886597938144, + "grad_norm": 8.908919334411621, + "learning_rate": 9.72276277922434e-06, + "loss": 0.5253, + "step": 2774 + }, + { + "epoch": 0.037642430819316335, + "grad_norm": 7.447694778442383, + "learning_rate": 9.722625736604084e-06, + "loss": 0.3111, + "step": 2775 + }, + { + "epoch": 0.03765599565925122, + "grad_norm": 9.749129295349121, + "learning_rate": 9.722488693983829e-06, + "loss": 0.586, + "step": 2776 + }, + { + "epoch": 0.03766956049918611, + "grad_norm": 7.002997875213623, + "learning_rate": 9.722351651363576e-06, + "loss": 0.3694, + "step": 2777 + }, + { + "epoch": 0.037683125339121, + "grad_norm": 7.248873710632324, + "learning_rate": 9.72221460874332e-06, + "loss": 0.4867, + "step": 2778 + }, + { + "epoch": 0.037696690179055886, + "grad_norm": 7.910364627838135, + "learning_rate": 9.722077566123064e-06, + "loss": 0.4359, + "step": 2779 + }, + { + "epoch": 0.03771025501899078, + "grad_norm": 7.70396614074707, + "learning_rate": 9.72194052350281e-06, + "loss": 0.4249, + "step": 2780 + }, + { + "epoch": 0.037723819858925665, + "grad_norm": 7.208517551422119, + "learning_rate": 9.721803480882556e-06, + "loss": 0.4349, + "step": 2781 + }, + { + "epoch": 0.03773738469886055, + "grad_norm": 10.791430473327637, + "learning_rate": 9.7216664382623e-06, + "loss": 0.7824, + "step": 2782 + }, + { + "epoch": 0.037750949538795445, + "grad_norm": 9.57097339630127, + "learning_rate": 9.721529395642045e-06, + "loss": 0.5542, + "step": 2783 + }, + { + "epoch": 0.03776451437873033, + "grad_norm": 8.057608604431152, + "learning_rate": 9.72139235302179e-06, + "loss": 0.5714, + "step": 2784 + }, + { + "epoch": 0.03777807921866522, + "grad_norm": 5.941097259521484, + "learning_rate": 9.721255310401535e-06, + "loss": 0.3112, + "step": 2785 + }, + { + "epoch": 0.03779164405860011, + "grad_norm": 7.721672058105469, + "learning_rate": 9.72111826778128e-06, + "loss": 0.3505, + "step": 2786 + }, + { + "epoch": 0.037805208898534996, + "grad_norm": 8.79228401184082, + "learning_rate": 9.720981225161026e-06, + "loss": 0.4403, + "step": 2787 + }, + { + "epoch": 0.03781877373846989, + "grad_norm": 7.517916679382324, + "learning_rate": 9.720844182540771e-06, + "loss": 0.4939, + "step": 2788 + }, + { + "epoch": 0.037832338578404776, + "grad_norm": 6.552666187286377, + "learning_rate": 9.720707139920516e-06, + "loss": 0.3087, + "step": 2789 + }, + { + "epoch": 0.03784590341833966, + "grad_norm": 7.0498456954956055, + "learning_rate": 9.720570097300261e-06, + "loss": 0.3716, + "step": 2790 + }, + { + "epoch": 0.037859468258274555, + "grad_norm": 5.807155132293701, + "learning_rate": 9.720433054680006e-06, + "loss": 0.4509, + "step": 2791 + }, + { + "epoch": 0.03787303309820944, + "grad_norm": 8.969401359558105, + "learning_rate": 9.720296012059752e-06, + "loss": 0.4777, + "step": 2792 + }, + { + "epoch": 0.03788659793814433, + "grad_norm": 6.970736026763916, + "learning_rate": 9.720158969439495e-06, + "loss": 0.3264, + "step": 2793 + }, + { + "epoch": 0.03790016277807922, + "grad_norm": 7.435569763183594, + "learning_rate": 9.720021926819242e-06, + "loss": 0.3839, + "step": 2794 + }, + { + "epoch": 0.037913727618014106, + "grad_norm": 5.675735950469971, + "learning_rate": 9.719884884198987e-06, + "loss": 0.328, + "step": 2795 + }, + { + "epoch": 0.037927292457949, + "grad_norm": 8.411723136901855, + "learning_rate": 9.719747841578732e-06, + "loss": 0.4523, + "step": 2796 + }, + { + "epoch": 0.037940857297883886, + "grad_norm": 5.8838934898376465, + "learning_rate": 9.719610798958476e-06, + "loss": 0.3517, + "step": 2797 + }, + { + "epoch": 0.03795442213781877, + "grad_norm": 8.028676986694336, + "learning_rate": 9.719473756338223e-06, + "loss": 0.6245, + "step": 2798 + }, + { + "epoch": 0.037967986977753665, + "grad_norm": 8.238019943237305, + "learning_rate": 9.719336713717968e-06, + "loss": 0.5008, + "step": 2799 + }, + { + "epoch": 0.03798155181768855, + "grad_norm": 5.822169780731201, + "learning_rate": 9.719199671097711e-06, + "loss": 0.3363, + "step": 2800 + }, + { + "epoch": 0.03799511665762344, + "grad_norm": 6.442393779754639, + "learning_rate": 9.719062628477457e-06, + "loss": 0.423, + "step": 2801 + }, + { + "epoch": 0.03800868149755833, + "grad_norm": 7.843218803405762, + "learning_rate": 9.718925585857203e-06, + "loss": 0.4735, + "step": 2802 + }, + { + "epoch": 0.038022246337493217, + "grad_norm": 7.7069478034973145, + "learning_rate": 9.718788543236947e-06, + "loss": 0.5453, + "step": 2803 + }, + { + "epoch": 0.03803581117742811, + "grad_norm": 5.836462497711182, + "learning_rate": 9.718651500616692e-06, + "loss": 0.3531, + "step": 2804 + }, + { + "epoch": 0.038049376017362996, + "grad_norm": 8.670351028442383, + "learning_rate": 9.718514457996437e-06, + "loss": 0.4763, + "step": 2805 + }, + { + "epoch": 0.03806294085729788, + "grad_norm": 6.254966735839844, + "learning_rate": 9.718377415376182e-06, + "loss": 0.3503, + "step": 2806 + }, + { + "epoch": 0.038076505697232775, + "grad_norm": 6.296969413757324, + "learning_rate": 9.718240372755928e-06, + "loss": 0.4307, + "step": 2807 + }, + { + "epoch": 0.03809007053716766, + "grad_norm": 7.744492530822754, + "learning_rate": 9.718103330135673e-06, + "loss": 0.5408, + "step": 2808 + }, + { + "epoch": 0.03810363537710255, + "grad_norm": 6.265018939971924, + "learning_rate": 9.717966287515418e-06, + "loss": 0.471, + "step": 2809 + }, + { + "epoch": 0.03811720021703744, + "grad_norm": 6.445056438446045, + "learning_rate": 9.717829244895163e-06, + "loss": 0.539, + "step": 2810 + }, + { + "epoch": 0.03813076505697233, + "grad_norm": 6.336724758148193, + "learning_rate": 9.717692202274908e-06, + "loss": 0.4175, + "step": 2811 + }, + { + "epoch": 0.03814432989690722, + "grad_norm": 7.001894474029541, + "learning_rate": 9.717555159654654e-06, + "loss": 0.5386, + "step": 2812 + }, + { + "epoch": 0.038157894736842106, + "grad_norm": 7.220555782318115, + "learning_rate": 9.717418117034399e-06, + "loss": 0.4167, + "step": 2813 + }, + { + "epoch": 0.03817145957677699, + "grad_norm": 7.223845958709717, + "learning_rate": 9.717281074414144e-06, + "loss": 0.3927, + "step": 2814 + }, + { + "epoch": 0.038185024416711885, + "grad_norm": 5.120954513549805, + "learning_rate": 9.717144031793889e-06, + "loss": 0.3547, + "step": 2815 + }, + { + "epoch": 0.03819858925664677, + "grad_norm": 7.686395168304443, + "learning_rate": 9.717006989173634e-06, + "loss": 0.4681, + "step": 2816 + }, + { + "epoch": 0.03821215409658166, + "grad_norm": 8.038447380065918, + "learning_rate": 9.71686994655338e-06, + "loss": 0.4877, + "step": 2817 + }, + { + "epoch": 0.03822571893651655, + "grad_norm": 5.978682518005371, + "learning_rate": 9.716732903933123e-06, + "loss": 0.3321, + "step": 2818 + }, + { + "epoch": 0.03823928377645144, + "grad_norm": 5.974671840667725, + "learning_rate": 9.716595861312868e-06, + "loss": 0.3856, + "step": 2819 + }, + { + "epoch": 0.03825284861638633, + "grad_norm": 6.285922527313232, + "learning_rate": 9.716458818692615e-06, + "loss": 0.2546, + "step": 2820 + }, + { + "epoch": 0.038266413456321216, + "grad_norm": 7.416745185852051, + "learning_rate": 9.71632177607236e-06, + "loss": 0.5546, + "step": 2821 + }, + { + "epoch": 0.0382799782962561, + "grad_norm": 8.314702033996582, + "learning_rate": 9.716184733452104e-06, + "loss": 0.4771, + "step": 2822 + }, + { + "epoch": 0.038293543136190995, + "grad_norm": 5.68748140335083, + "learning_rate": 9.716047690831849e-06, + "loss": 0.3511, + "step": 2823 + }, + { + "epoch": 0.03830710797612588, + "grad_norm": 8.851029396057129, + "learning_rate": 9.715910648211596e-06, + "loss": 0.5407, + "step": 2824 + }, + { + "epoch": 0.03832067281606077, + "grad_norm": 7.432862281799316, + "learning_rate": 9.71577360559134e-06, + "loss": 0.5115, + "step": 2825 + }, + { + "epoch": 0.03833423765599566, + "grad_norm": 7.20439338684082, + "learning_rate": 9.715636562971084e-06, + "loss": 0.4643, + "step": 2826 + }, + { + "epoch": 0.03834780249593055, + "grad_norm": 5.6297149658203125, + "learning_rate": 9.71549952035083e-06, + "loss": 0.3245, + "step": 2827 + }, + { + "epoch": 0.03836136733586544, + "grad_norm": 8.476028442382812, + "learning_rate": 9.715362477730575e-06, + "loss": 0.44, + "step": 2828 + }, + { + "epoch": 0.038374932175800326, + "grad_norm": 7.14154577255249, + "learning_rate": 9.71522543511032e-06, + "loss": 0.3519, + "step": 2829 + }, + { + "epoch": 0.03838849701573521, + "grad_norm": 10.632536888122559, + "learning_rate": 9.715088392490065e-06, + "loss": 0.5263, + "step": 2830 + }, + { + "epoch": 0.038402061855670105, + "grad_norm": 6.201033592224121, + "learning_rate": 9.71495134986981e-06, + "loss": 0.3201, + "step": 2831 + }, + { + "epoch": 0.03841562669560499, + "grad_norm": 56.82540512084961, + "learning_rate": 9.714814307249555e-06, + "loss": 0.6569, + "step": 2832 + }, + { + "epoch": 0.03842919153553988, + "grad_norm": 6.143104553222656, + "learning_rate": 9.7146772646293e-06, + "loss": 0.511, + "step": 2833 + }, + { + "epoch": 0.03844275637547477, + "grad_norm": 10.042500495910645, + "learning_rate": 9.714540222009046e-06, + "loss": 0.6142, + "step": 2834 + }, + { + "epoch": 0.03845632121540966, + "grad_norm": 7.9702911376953125, + "learning_rate": 9.714403179388791e-06, + "loss": 0.5572, + "step": 2835 + }, + { + "epoch": 0.03846988605534455, + "grad_norm": 5.792943000793457, + "learning_rate": 9.714266136768536e-06, + "loss": 0.3807, + "step": 2836 + }, + { + "epoch": 0.038483450895279436, + "grad_norm": 7.481235027313232, + "learning_rate": 9.714129094148281e-06, + "loss": 0.5319, + "step": 2837 + }, + { + "epoch": 0.03849701573521432, + "grad_norm": 6.493351459503174, + "learning_rate": 9.713992051528027e-06, + "loss": 0.4964, + "step": 2838 + }, + { + "epoch": 0.038510580575149216, + "grad_norm": 7.802685260772705, + "learning_rate": 9.713855008907772e-06, + "loss": 0.3775, + "step": 2839 + }, + { + "epoch": 0.0385241454150841, + "grad_norm": 9.089062690734863, + "learning_rate": 9.713717966287515e-06, + "loss": 0.6455, + "step": 2840 + }, + { + "epoch": 0.03853771025501899, + "grad_norm": 16.09809684753418, + "learning_rate": 9.713580923667262e-06, + "loss": 0.537, + "step": 2841 + }, + { + "epoch": 0.03855127509495388, + "grad_norm": 6.924168109893799, + "learning_rate": 9.713443881047007e-06, + "loss": 0.3979, + "step": 2842 + }, + { + "epoch": 0.03856483993488877, + "grad_norm": 6.805057048797607, + "learning_rate": 9.71330683842675e-06, + "loss": 0.398, + "step": 2843 + }, + { + "epoch": 0.03857840477482366, + "grad_norm": 4.829563140869141, + "learning_rate": 9.713169795806496e-06, + "loss": 0.2746, + "step": 2844 + }, + { + "epoch": 0.038591969614758546, + "grad_norm": 7.207157611846924, + "learning_rate": 9.713032753186241e-06, + "loss": 0.4196, + "step": 2845 + }, + { + "epoch": 0.03860553445469343, + "grad_norm": 6.532803535461426, + "learning_rate": 9.712895710565988e-06, + "loss": 0.4179, + "step": 2846 + }, + { + "epoch": 0.038619099294628326, + "grad_norm": 7.656647682189941, + "learning_rate": 9.712758667945731e-06, + "loss": 0.3339, + "step": 2847 + }, + { + "epoch": 0.03863266413456321, + "grad_norm": 6.748168468475342, + "learning_rate": 9.712621625325477e-06, + "loss": 0.3727, + "step": 2848 + }, + { + "epoch": 0.0386462289744981, + "grad_norm": 6.372035980224609, + "learning_rate": 9.712484582705222e-06, + "loss": 0.4391, + "step": 2849 + }, + { + "epoch": 0.03865979381443299, + "grad_norm": 6.9339470863342285, + "learning_rate": 9.712347540084967e-06, + "loss": 0.3268, + "step": 2850 + }, + { + "epoch": 0.03867335865436788, + "grad_norm": 4.469231605529785, + "learning_rate": 9.712210497464712e-06, + "loss": 0.3478, + "step": 2851 + }, + { + "epoch": 0.03868692349430277, + "grad_norm": 5.770340919494629, + "learning_rate": 9.712073454844457e-06, + "loss": 0.2825, + "step": 2852 + }, + { + "epoch": 0.03870048833423766, + "grad_norm": 6.929162979125977, + "learning_rate": 9.711936412224203e-06, + "loss": 0.4475, + "step": 2853 + }, + { + "epoch": 0.03871405317417254, + "grad_norm": 8.737295150756836, + "learning_rate": 9.711799369603948e-06, + "loss": 0.3974, + "step": 2854 + }, + { + "epoch": 0.038727618014107436, + "grad_norm": 6.126337051391602, + "learning_rate": 9.711662326983693e-06, + "loss": 0.371, + "step": 2855 + }, + { + "epoch": 0.03874118285404232, + "grad_norm": 5.784970760345459, + "learning_rate": 9.711525284363438e-06, + "loss": 0.3776, + "step": 2856 + }, + { + "epoch": 0.03875474769397721, + "grad_norm": 5.736753940582275, + "learning_rate": 9.711388241743183e-06, + "loss": 0.3251, + "step": 2857 + }, + { + "epoch": 0.0387683125339121, + "grad_norm": 6.651718616485596, + "learning_rate": 9.711251199122928e-06, + "loss": 0.5339, + "step": 2858 + }, + { + "epoch": 0.03878187737384699, + "grad_norm": 5.592373371124268, + "learning_rate": 9.711114156502674e-06, + "loss": 0.3599, + "step": 2859 + }, + { + "epoch": 0.03879544221378188, + "grad_norm": 5.4346160888671875, + "learning_rate": 9.710977113882419e-06, + "loss": 0.4233, + "step": 2860 + }, + { + "epoch": 0.03880900705371677, + "grad_norm": 5.203075408935547, + "learning_rate": 9.710840071262164e-06, + "loss": 0.2664, + "step": 2861 + }, + { + "epoch": 0.03882257189365165, + "grad_norm": 4.877770900726318, + "learning_rate": 9.710703028641907e-06, + "loss": 0.2852, + "step": 2862 + }, + { + "epoch": 0.038836136733586546, + "grad_norm": 9.12610149383545, + "learning_rate": 9.710565986021654e-06, + "loss": 0.4071, + "step": 2863 + }, + { + "epoch": 0.03884970157352143, + "grad_norm": 7.2080488204956055, + "learning_rate": 9.7104289434014e-06, + "loss": 0.3461, + "step": 2864 + }, + { + "epoch": 0.03886326641345632, + "grad_norm": 7.272829055786133, + "learning_rate": 9.710291900781143e-06, + "loss": 0.3935, + "step": 2865 + }, + { + "epoch": 0.03887683125339121, + "grad_norm": 4.775618076324463, + "learning_rate": 9.710154858160888e-06, + "loss": 0.3434, + "step": 2866 + }, + { + "epoch": 0.0388903960933261, + "grad_norm": 7.19453239440918, + "learning_rate": 9.710017815540635e-06, + "loss": 0.444, + "step": 2867 + }, + { + "epoch": 0.03890396093326099, + "grad_norm": 6.332118511199951, + "learning_rate": 9.709880772920378e-06, + "loss": 0.3496, + "step": 2868 + }, + { + "epoch": 0.03891752577319588, + "grad_norm": 4.616466999053955, + "learning_rate": 9.709743730300124e-06, + "loss": 0.334, + "step": 2869 + }, + { + "epoch": 0.03893109061313076, + "grad_norm": 7.8869452476501465, + "learning_rate": 9.709606687679869e-06, + "loss": 0.3923, + "step": 2870 + }, + { + "epoch": 0.038944655453065656, + "grad_norm": 9.069602012634277, + "learning_rate": 9.709469645059614e-06, + "loss": 0.6302, + "step": 2871 + }, + { + "epoch": 0.03895822029300054, + "grad_norm": 6.296929836273193, + "learning_rate": 9.70933260243936e-06, + "loss": 0.3305, + "step": 2872 + }, + { + "epoch": 0.03897178513293543, + "grad_norm": 5.463098049163818, + "learning_rate": 9.709195559819104e-06, + "loss": 0.3081, + "step": 2873 + }, + { + "epoch": 0.03898534997287032, + "grad_norm": 4.904461860656738, + "learning_rate": 9.70905851719885e-06, + "loss": 0.3297, + "step": 2874 + }, + { + "epoch": 0.03899891481280521, + "grad_norm": 4.22423791885376, + "learning_rate": 9.708921474578595e-06, + "loss": 0.2236, + "step": 2875 + }, + { + "epoch": 0.0390124796527401, + "grad_norm": 6.417382717132568, + "learning_rate": 9.70878443195834e-06, + "loss": 0.3739, + "step": 2876 + }, + { + "epoch": 0.03902604449267499, + "grad_norm": 10.0525484085083, + "learning_rate": 9.708647389338085e-06, + "loss": 0.5515, + "step": 2877 + }, + { + "epoch": 0.03903960933260987, + "grad_norm": 10.286410331726074, + "learning_rate": 9.70851034671783e-06, + "loss": 0.5338, + "step": 2878 + }, + { + "epoch": 0.039053174172544766, + "grad_norm": 7.860719680786133, + "learning_rate": 9.708373304097575e-06, + "loss": 0.5726, + "step": 2879 + }, + { + "epoch": 0.03906673901247965, + "grad_norm": 7.501655578613281, + "learning_rate": 9.70823626147732e-06, + "loss": 0.319, + "step": 2880 + }, + { + "epoch": 0.03908030385241454, + "grad_norm": 7.575356483459473, + "learning_rate": 9.708099218857066e-06, + "loss": 0.4626, + "step": 2881 + }, + { + "epoch": 0.03909386869234943, + "grad_norm": 6.70231819152832, + "learning_rate": 9.707962176236811e-06, + "loss": 0.4091, + "step": 2882 + }, + { + "epoch": 0.03910743353228432, + "grad_norm": 6.1598429679870605, + "learning_rate": 9.707825133616554e-06, + "loss": 0.4387, + "step": 2883 + }, + { + "epoch": 0.03912099837221921, + "grad_norm": 6.087899208068848, + "learning_rate": 9.707688090996301e-06, + "loss": 0.3658, + "step": 2884 + }, + { + "epoch": 0.0391345632121541, + "grad_norm": 6.283573627471924, + "learning_rate": 9.707551048376047e-06, + "loss": 0.394, + "step": 2885 + }, + { + "epoch": 0.03914812805208898, + "grad_norm": 5.307256698608398, + "learning_rate": 9.70741400575579e-06, + "loss": 0.369, + "step": 2886 + }, + { + "epoch": 0.039161692892023876, + "grad_norm": 8.048737525939941, + "learning_rate": 9.707276963135535e-06, + "loss": 0.4478, + "step": 2887 + }, + { + "epoch": 0.03917525773195876, + "grad_norm": 7.899933338165283, + "learning_rate": 9.70713992051528e-06, + "loss": 0.4719, + "step": 2888 + }, + { + "epoch": 0.03918882257189365, + "grad_norm": 4.839743614196777, + "learning_rate": 9.707002877895027e-06, + "loss": 0.3092, + "step": 2889 + }, + { + "epoch": 0.03920238741182854, + "grad_norm": 7.013505935668945, + "learning_rate": 9.70686583527477e-06, + "loss": 0.5501, + "step": 2890 + }, + { + "epoch": 0.03921595225176343, + "grad_norm": 4.528678894042969, + "learning_rate": 9.706728792654516e-06, + "loss": 0.2743, + "step": 2891 + }, + { + "epoch": 0.03922951709169832, + "grad_norm": 5.33062744140625, + "learning_rate": 9.706591750034261e-06, + "loss": 0.2941, + "step": 2892 + }, + { + "epoch": 0.03924308193163321, + "grad_norm": 5.704960346221924, + "learning_rate": 9.706454707414006e-06, + "loss": 0.4226, + "step": 2893 + }, + { + "epoch": 0.03925664677156809, + "grad_norm": 5.998531341552734, + "learning_rate": 9.706317664793751e-06, + "loss": 0.3971, + "step": 2894 + }, + { + "epoch": 0.039270211611502986, + "grad_norm": 6.456258296966553, + "learning_rate": 9.706180622173497e-06, + "loss": 0.4848, + "step": 2895 + }, + { + "epoch": 0.03928377645143787, + "grad_norm": 7.41178035736084, + "learning_rate": 9.706043579553242e-06, + "loss": 0.4481, + "step": 2896 + }, + { + "epoch": 0.03929734129137276, + "grad_norm": 4.458285331726074, + "learning_rate": 9.705906536932987e-06, + "loss": 0.4386, + "step": 2897 + }, + { + "epoch": 0.03931090613130765, + "grad_norm": 9.061515808105469, + "learning_rate": 9.705769494312732e-06, + "loss": 0.6017, + "step": 2898 + }, + { + "epoch": 0.03932447097124254, + "grad_norm": 5.386446952819824, + "learning_rate": 9.705632451692477e-06, + "loss": 0.2854, + "step": 2899 + }, + { + "epoch": 0.03933803581117743, + "grad_norm": 8.063615798950195, + "learning_rate": 9.705495409072223e-06, + "loss": 0.584, + "step": 2900 + }, + { + "epoch": 0.03935160065111232, + "grad_norm": 6.775556564331055, + "learning_rate": 9.705358366451966e-06, + "loss": 0.4385, + "step": 2901 + }, + { + "epoch": 0.039365165491047203, + "grad_norm": 7.160057067871094, + "learning_rate": 9.705221323831713e-06, + "loss": 0.5041, + "step": 2902 + }, + { + "epoch": 0.0393787303309821, + "grad_norm": 8.000185012817383, + "learning_rate": 9.705084281211458e-06, + "loss": 0.5394, + "step": 2903 + }, + { + "epoch": 0.03939229517091698, + "grad_norm": 5.927018165588379, + "learning_rate": 9.704947238591203e-06, + "loss": 0.4488, + "step": 2904 + }, + { + "epoch": 0.03940586001085187, + "grad_norm": 7.7362542152404785, + "learning_rate": 9.704810195970947e-06, + "loss": 0.6043, + "step": 2905 + }, + { + "epoch": 0.03941942485078676, + "grad_norm": 7.398004055023193, + "learning_rate": 9.704673153350694e-06, + "loss": 0.4385, + "step": 2906 + }, + { + "epoch": 0.03943298969072165, + "grad_norm": 6.309388160705566, + "learning_rate": 9.704536110730439e-06, + "loss": 0.438, + "step": 2907 + }, + { + "epoch": 0.03944655453065654, + "grad_norm": 6.576418876647949, + "learning_rate": 9.704399068110182e-06, + "loss": 0.4171, + "step": 2908 + }, + { + "epoch": 0.03946011937059143, + "grad_norm": 5.353044509887695, + "learning_rate": 9.704262025489927e-06, + "loss": 0.3259, + "step": 2909 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 8.141084671020508, + "learning_rate": 9.704124982869674e-06, + "loss": 0.5627, + "step": 2910 + }, + { + "epoch": 0.03948724905046121, + "grad_norm": 6.396823406219482, + "learning_rate": 9.703987940249418e-06, + "loss": 0.4094, + "step": 2911 + }, + { + "epoch": 0.03950081389039609, + "grad_norm": 5.612620830535889, + "learning_rate": 9.703850897629163e-06, + "loss": 0.2663, + "step": 2912 + }, + { + "epoch": 0.03951437873033098, + "grad_norm": 7.430731296539307, + "learning_rate": 9.703713855008908e-06, + "loss": 0.5789, + "step": 2913 + }, + { + "epoch": 0.03952794357026587, + "grad_norm": 9.546046257019043, + "learning_rate": 9.703576812388653e-06, + "loss": 0.5375, + "step": 2914 + }, + { + "epoch": 0.03954150841020076, + "grad_norm": 7.683754920959473, + "learning_rate": 9.703439769768399e-06, + "loss": 0.5557, + "step": 2915 + }, + { + "epoch": 0.03955507325013565, + "grad_norm": 7.001779556274414, + "learning_rate": 9.703302727148144e-06, + "loss": 0.3616, + "step": 2916 + }, + { + "epoch": 0.03956863809007054, + "grad_norm": 7.392351150512695, + "learning_rate": 9.703165684527889e-06, + "loss": 0.5401, + "step": 2917 + }, + { + "epoch": 0.039582202930005424, + "grad_norm": 8.924478530883789, + "learning_rate": 9.703028641907634e-06, + "loss": 0.4819, + "step": 2918 + }, + { + "epoch": 0.03959576776994032, + "grad_norm": 9.197254180908203, + "learning_rate": 9.70289159928738e-06, + "loss": 0.5319, + "step": 2919 + }, + { + "epoch": 0.0396093326098752, + "grad_norm": 6.675315856933594, + "learning_rate": 9.702754556667124e-06, + "loss": 0.4781, + "step": 2920 + }, + { + "epoch": 0.03962289744981009, + "grad_norm": 5.045902252197266, + "learning_rate": 9.70261751404687e-06, + "loss": 0.3193, + "step": 2921 + }, + { + "epoch": 0.03963646228974498, + "grad_norm": 9.415921211242676, + "learning_rate": 9.702480471426615e-06, + "loss": 0.4935, + "step": 2922 + }, + { + "epoch": 0.03965002712967987, + "grad_norm": 8.239355087280273, + "learning_rate": 9.70234342880636e-06, + "loss": 0.4206, + "step": 2923 + }, + { + "epoch": 0.03966359196961476, + "grad_norm": 7.5647430419921875, + "learning_rate": 9.702206386186105e-06, + "loss": 0.432, + "step": 2924 + }, + { + "epoch": 0.03967715680954965, + "grad_norm": 7.646641254425049, + "learning_rate": 9.70206934356585e-06, + "loss": 0.414, + "step": 2925 + }, + { + "epoch": 0.039690721649484534, + "grad_norm": 6.1486735343933105, + "learning_rate": 9.701932300945594e-06, + "loss": 0.4531, + "step": 2926 + }, + { + "epoch": 0.03970428648941943, + "grad_norm": 7.208745002746582, + "learning_rate": 9.70179525832534e-06, + "loss": 0.5187, + "step": 2927 + }, + { + "epoch": 0.03971785132935431, + "grad_norm": 5.661594390869141, + "learning_rate": 9.701658215705086e-06, + "loss": 0.4574, + "step": 2928 + }, + { + "epoch": 0.0397314161692892, + "grad_norm": 8.094099044799805, + "learning_rate": 9.701521173084831e-06, + "loss": 0.5689, + "step": 2929 + }, + { + "epoch": 0.03974498100922409, + "grad_norm": 7.783156394958496, + "learning_rate": 9.701384130464575e-06, + "loss": 0.3938, + "step": 2930 + }, + { + "epoch": 0.03975854584915898, + "grad_norm": 7.18491792678833, + "learning_rate": 9.70124708784432e-06, + "loss": 0.4413, + "step": 2931 + }, + { + "epoch": 0.03977211068909387, + "grad_norm": 7.709397315979004, + "learning_rate": 9.701110045224067e-06, + "loss": 0.6711, + "step": 2932 + }, + { + "epoch": 0.03978567552902876, + "grad_norm": 7.084731101989746, + "learning_rate": 9.70097300260381e-06, + "loss": 0.575, + "step": 2933 + }, + { + "epoch": 0.039799240368963644, + "grad_norm": 7.886052131652832, + "learning_rate": 9.700835959983555e-06, + "loss": 0.5085, + "step": 2934 + }, + { + "epoch": 0.03981280520889854, + "grad_norm": 7.513060092926025, + "learning_rate": 9.7006989173633e-06, + "loss": 0.4145, + "step": 2935 + }, + { + "epoch": 0.03982637004883342, + "grad_norm": 7.711245536804199, + "learning_rate": 9.700561874743046e-06, + "loss": 0.5585, + "step": 2936 + }, + { + "epoch": 0.03983993488876831, + "grad_norm": 7.901716709136963, + "learning_rate": 9.70042483212279e-06, + "loss": 0.4788, + "step": 2937 + }, + { + "epoch": 0.0398534997287032, + "grad_norm": 7.607813835144043, + "learning_rate": 9.700287789502536e-06, + "loss": 0.4626, + "step": 2938 + }, + { + "epoch": 0.03986706456863809, + "grad_norm": 7.198288917541504, + "learning_rate": 9.700150746882281e-06, + "loss": 0.6385, + "step": 2939 + }, + { + "epoch": 0.03988062940857298, + "grad_norm": 5.390675067901611, + "learning_rate": 9.700013704262026e-06, + "loss": 0.3954, + "step": 2940 + }, + { + "epoch": 0.03989419424850787, + "grad_norm": 5.881957054138184, + "learning_rate": 9.699876661641771e-06, + "loss": 0.2802, + "step": 2941 + }, + { + "epoch": 0.039907759088442754, + "grad_norm": 5.026820659637451, + "learning_rate": 9.699739619021517e-06, + "loss": 0.4205, + "step": 2942 + }, + { + "epoch": 0.03992132392837765, + "grad_norm": 8.150197982788086, + "learning_rate": 9.699602576401262e-06, + "loss": 0.3988, + "step": 2943 + }, + { + "epoch": 0.03993488876831253, + "grad_norm": 6.054897308349609, + "learning_rate": 9.699465533781007e-06, + "loss": 0.4282, + "step": 2944 + }, + { + "epoch": 0.03994845360824742, + "grad_norm": 6.490987777709961, + "learning_rate": 9.699328491160752e-06, + "loss": 0.5551, + "step": 2945 + }, + { + "epoch": 0.03996201844818231, + "grad_norm": 7.613975524902344, + "learning_rate": 9.699191448540497e-06, + "loss": 0.4904, + "step": 2946 + }, + { + "epoch": 0.0399755832881172, + "grad_norm": 6.581173419952393, + "learning_rate": 9.699054405920243e-06, + "loss": 0.4138, + "step": 2947 + }, + { + "epoch": 0.03998914812805209, + "grad_norm": 5.803431510925293, + "learning_rate": 9.698917363299986e-06, + "loss": 0.2149, + "step": 2948 + }, + { + "epoch": 0.04000271296798698, + "grad_norm": 6.24812126159668, + "learning_rate": 9.698780320679733e-06, + "loss": 0.4705, + "step": 2949 + }, + { + "epoch": 0.040016277807921864, + "grad_norm": 5.525198936462402, + "learning_rate": 9.698643278059478e-06, + "loss": 0.4222, + "step": 2950 + }, + { + "epoch": 0.04002984264785676, + "grad_norm": 5.797364234924316, + "learning_rate": 9.698506235439222e-06, + "loss": 0.3359, + "step": 2951 + }, + { + "epoch": 0.040043407487791643, + "grad_norm": 5.882540225982666, + "learning_rate": 9.698369192818967e-06, + "loss": 0.4155, + "step": 2952 + }, + { + "epoch": 0.04005697232772653, + "grad_norm": 6.511532783508301, + "learning_rate": 9.698232150198714e-06, + "loss": 0.4248, + "step": 2953 + }, + { + "epoch": 0.04007053716766142, + "grad_norm": 5.9107866287231445, + "learning_rate": 9.698095107578459e-06, + "loss": 0.5851, + "step": 2954 + }, + { + "epoch": 0.04008410200759631, + "grad_norm": 5.039246082305908, + "learning_rate": 9.697958064958202e-06, + "loss": 0.3691, + "step": 2955 + }, + { + "epoch": 0.0400976668475312, + "grad_norm": 7.974409103393555, + "learning_rate": 9.697821022337947e-06, + "loss": 0.6727, + "step": 2956 + }, + { + "epoch": 0.04011123168746609, + "grad_norm": 6.754145622253418, + "learning_rate": 9.697683979717693e-06, + "loss": 0.5457, + "step": 2957 + }, + { + "epoch": 0.040124796527400974, + "grad_norm": 8.055140495300293, + "learning_rate": 9.697546937097438e-06, + "loss": 0.5953, + "step": 2958 + }, + { + "epoch": 0.04013836136733587, + "grad_norm": 7.095843315124512, + "learning_rate": 9.697409894477183e-06, + "loss": 0.4443, + "step": 2959 + }, + { + "epoch": 0.040151926207270754, + "grad_norm": 6.550314426422119, + "learning_rate": 9.697272851856928e-06, + "loss": 0.4016, + "step": 2960 + }, + { + "epoch": 0.04016549104720564, + "grad_norm": 7.918128490447998, + "learning_rate": 9.697135809236673e-06, + "loss": 0.5353, + "step": 2961 + }, + { + "epoch": 0.04017905588714053, + "grad_norm": 5.845370292663574, + "learning_rate": 9.696998766616419e-06, + "loss": 0.4572, + "step": 2962 + }, + { + "epoch": 0.04019262072707542, + "grad_norm": 11.062891960144043, + "learning_rate": 9.696861723996164e-06, + "loss": 0.557, + "step": 2963 + }, + { + "epoch": 0.04020618556701031, + "grad_norm": 7.772587776184082, + "learning_rate": 9.696724681375909e-06, + "loss": 0.5397, + "step": 2964 + }, + { + "epoch": 0.0402197504069452, + "grad_norm": 6.420286178588867, + "learning_rate": 9.696587638755654e-06, + "loss": 0.4739, + "step": 2965 + }, + { + "epoch": 0.040233315246880084, + "grad_norm": 8.348602294921875, + "learning_rate": 9.6964505961354e-06, + "loss": 0.5441, + "step": 2966 + }, + { + "epoch": 0.04024688008681498, + "grad_norm": 8.737804412841797, + "learning_rate": 9.696313553515144e-06, + "loss": 0.5685, + "step": 2967 + }, + { + "epoch": 0.040260444926749864, + "grad_norm": 6.57547664642334, + "learning_rate": 9.69617651089489e-06, + "loss": 0.458, + "step": 2968 + }, + { + "epoch": 0.04027400976668475, + "grad_norm": 7.382069110870361, + "learning_rate": 9.696039468274633e-06, + "loss": 0.3719, + "step": 2969 + }, + { + "epoch": 0.04028757460661964, + "grad_norm": 7.325904846191406, + "learning_rate": 9.695902425654378e-06, + "loss": 0.5263, + "step": 2970 + }, + { + "epoch": 0.04030113944655453, + "grad_norm": 5.9461493492126465, + "learning_rate": 9.695765383034125e-06, + "loss": 0.3528, + "step": 2971 + }, + { + "epoch": 0.04031470428648942, + "grad_norm": 6.645874500274658, + "learning_rate": 9.69562834041387e-06, + "loss": 0.3837, + "step": 2972 + }, + { + "epoch": 0.04032826912642431, + "grad_norm": 6.20402193069458, + "learning_rate": 9.695491297793614e-06, + "loss": 0.5182, + "step": 2973 + }, + { + "epoch": 0.040341833966359195, + "grad_norm": 8.897896766662598, + "learning_rate": 9.695354255173359e-06, + "loss": 0.6084, + "step": 2974 + }, + { + "epoch": 0.04035539880629409, + "grad_norm": 13.328929901123047, + "learning_rate": 9.695217212553106e-06, + "loss": 0.4812, + "step": 2975 + }, + { + "epoch": 0.040368963646228974, + "grad_norm": 6.218155384063721, + "learning_rate": 9.69508016993285e-06, + "loss": 0.4027, + "step": 2976 + }, + { + "epoch": 0.04038252848616386, + "grad_norm": 7.169706344604492, + "learning_rate": 9.694943127312595e-06, + "loss": 0.5696, + "step": 2977 + }, + { + "epoch": 0.04039609332609875, + "grad_norm": 7.479259490966797, + "learning_rate": 9.69480608469234e-06, + "loss": 0.5052, + "step": 2978 + }, + { + "epoch": 0.04040965816603364, + "grad_norm": 6.198448657989502, + "learning_rate": 9.694669042072085e-06, + "loss": 0.3307, + "step": 2979 + }, + { + "epoch": 0.04042322300596853, + "grad_norm": 6.624592304229736, + "learning_rate": 9.69453199945183e-06, + "loss": 0.3751, + "step": 2980 + }, + { + "epoch": 0.04043678784590342, + "grad_norm": 7.683203220367432, + "learning_rate": 9.694394956831575e-06, + "loss": 0.479, + "step": 2981 + }, + { + "epoch": 0.040450352685838305, + "grad_norm": 8.389575958251953, + "learning_rate": 9.69425791421132e-06, + "loss": 0.6028, + "step": 2982 + }, + { + "epoch": 0.0404639175257732, + "grad_norm": 4.756418228149414, + "learning_rate": 9.694120871591066e-06, + "loss": 0.46, + "step": 2983 + }, + { + "epoch": 0.040477482365708084, + "grad_norm": 5.07175350189209, + "learning_rate": 9.69398382897081e-06, + "loss": 0.3011, + "step": 2984 + }, + { + "epoch": 0.04049104720564297, + "grad_norm": 5.877863883972168, + "learning_rate": 9.693846786350556e-06, + "loss": 0.3775, + "step": 2985 + }, + { + "epoch": 0.04050461204557786, + "grad_norm": 6.512579917907715, + "learning_rate": 9.693709743730301e-06, + "loss": 0.449, + "step": 2986 + }, + { + "epoch": 0.04051817688551275, + "grad_norm": 6.056828022003174, + "learning_rate": 9.693572701110046e-06, + "loss": 0.5032, + "step": 2987 + }, + { + "epoch": 0.04053174172544764, + "grad_norm": 7.948339462280273, + "learning_rate": 9.693435658489791e-06, + "loss": 0.5914, + "step": 2988 + }, + { + "epoch": 0.04054530656538253, + "grad_norm": 9.433341979980469, + "learning_rate": 9.693298615869537e-06, + "loss": 0.4392, + "step": 2989 + }, + { + "epoch": 0.040558871405317415, + "grad_norm": 6.124423027038574, + "learning_rate": 9.693161573249282e-06, + "loss": 0.4717, + "step": 2990 + }, + { + "epoch": 0.04057243624525231, + "grad_norm": 7.256816387176514, + "learning_rate": 9.693024530629025e-06, + "loss": 0.4348, + "step": 2991 + }, + { + "epoch": 0.040586001085187194, + "grad_norm": 6.957850456237793, + "learning_rate": 9.692887488008772e-06, + "loss": 0.4246, + "step": 2992 + }, + { + "epoch": 0.04059956592512208, + "grad_norm": 6.565103530883789, + "learning_rate": 9.692750445388517e-06, + "loss": 0.3983, + "step": 2993 + }, + { + "epoch": 0.04061313076505697, + "grad_norm": 6.849097728729248, + "learning_rate": 9.692613402768261e-06, + "loss": 0.4498, + "step": 2994 + }, + { + "epoch": 0.04062669560499186, + "grad_norm": 6.879293441772461, + "learning_rate": 9.692476360148006e-06, + "loss": 0.4052, + "step": 2995 + }, + { + "epoch": 0.04064026044492675, + "grad_norm": 6.714948654174805, + "learning_rate": 9.692339317527753e-06, + "loss": 0.3661, + "step": 2996 + }, + { + "epoch": 0.04065382528486164, + "grad_norm": 8.359753608703613, + "learning_rate": 9.692202274907498e-06, + "loss": 0.4365, + "step": 2997 + }, + { + "epoch": 0.040667390124796525, + "grad_norm": 5.814131736755371, + "learning_rate": 9.692065232287242e-06, + "loss": 0.3528, + "step": 2998 + }, + { + "epoch": 0.04068095496473142, + "grad_norm": 7.428710460662842, + "learning_rate": 9.691928189666987e-06, + "loss": 0.453, + "step": 2999 + }, + { + "epoch": 0.040694519804666304, + "grad_norm": 7.192951202392578, + "learning_rate": 9.691791147046732e-06, + "loss": 0.455, + "step": 3000 + }, + { + "epoch": 0.04070808464460119, + "grad_norm": 14.610760688781738, + "learning_rate": 9.691654104426477e-06, + "loss": 0.4565, + "step": 3001 + }, + { + "epoch": 0.040721649484536084, + "grad_norm": 8.035699844360352, + "learning_rate": 9.691517061806222e-06, + "loss": 0.6312, + "step": 3002 + }, + { + "epoch": 0.04073521432447097, + "grad_norm": 8.7708158493042, + "learning_rate": 9.691380019185967e-06, + "loss": 0.3761, + "step": 3003 + }, + { + "epoch": 0.04074877916440586, + "grad_norm": 7.123685359954834, + "learning_rate": 9.691242976565713e-06, + "loss": 0.4506, + "step": 3004 + }, + { + "epoch": 0.04076234400434075, + "grad_norm": 7.157916069030762, + "learning_rate": 9.691105933945458e-06, + "loss": 0.5899, + "step": 3005 + }, + { + "epoch": 0.040775908844275635, + "grad_norm": 7.610550403594971, + "learning_rate": 9.690968891325203e-06, + "loss": 0.3998, + "step": 3006 + }, + { + "epoch": 0.04078947368421053, + "grad_norm": 6.429940223693848, + "learning_rate": 9.690831848704948e-06, + "loss": 0.3606, + "step": 3007 + }, + { + "epoch": 0.040803038524145414, + "grad_norm": 10.229692459106445, + "learning_rate": 9.690694806084693e-06, + "loss": 0.5872, + "step": 3008 + }, + { + "epoch": 0.0408166033640803, + "grad_norm": 8.889226913452148, + "learning_rate": 9.690557763464439e-06, + "loss": 0.6072, + "step": 3009 + }, + { + "epoch": 0.040830168204015194, + "grad_norm": 11.25901985168457, + "learning_rate": 9.690420720844184e-06, + "loss": 0.5817, + "step": 3010 + }, + { + "epoch": 0.04084373304395008, + "grad_norm": 7.994484901428223, + "learning_rate": 9.690283678223929e-06, + "loss": 0.4718, + "step": 3011 + }, + { + "epoch": 0.04085729788388497, + "grad_norm": 5.7085137367248535, + "learning_rate": 9.690146635603674e-06, + "loss": 0.3379, + "step": 3012 + }, + { + "epoch": 0.04087086272381986, + "grad_norm": 10.04809856414795, + "learning_rate": 9.690009592983418e-06, + "loss": 0.4773, + "step": 3013 + }, + { + "epoch": 0.040884427563754745, + "grad_norm": 6.125181674957275, + "learning_rate": 9.689872550363164e-06, + "loss": 0.3975, + "step": 3014 + }, + { + "epoch": 0.04089799240368964, + "grad_norm": 6.541392803192139, + "learning_rate": 9.68973550774291e-06, + "loss": 0.3605, + "step": 3015 + }, + { + "epoch": 0.040911557243624525, + "grad_norm": 9.999876976013184, + "learning_rate": 9.689598465122653e-06, + "loss": 0.4228, + "step": 3016 + }, + { + "epoch": 0.04092512208355941, + "grad_norm": 7.250632286071777, + "learning_rate": 9.689461422502398e-06, + "loss": 0.4717, + "step": 3017 + }, + { + "epoch": 0.040938686923494304, + "grad_norm": 8.337886810302734, + "learning_rate": 9.689324379882145e-06, + "loss": 0.5489, + "step": 3018 + }, + { + "epoch": 0.04095225176342919, + "grad_norm": 6.490145206451416, + "learning_rate": 9.689187337261889e-06, + "loss": 0.4707, + "step": 3019 + }, + { + "epoch": 0.04096581660336408, + "grad_norm": 6.154169082641602, + "learning_rate": 9.689050294641634e-06, + "loss": 0.3853, + "step": 3020 + }, + { + "epoch": 0.04097938144329897, + "grad_norm": 5.623080253601074, + "learning_rate": 9.688913252021379e-06, + "loss": 0.3446, + "step": 3021 + }, + { + "epoch": 0.040992946283233855, + "grad_norm": 7.021266460418701, + "learning_rate": 9.688776209401126e-06, + "loss": 0.4735, + "step": 3022 + }, + { + "epoch": 0.04100651112316875, + "grad_norm": 8.471443176269531, + "learning_rate": 9.68863916678087e-06, + "loss": 0.5733, + "step": 3023 + }, + { + "epoch": 0.041020075963103635, + "grad_norm": 7.100752353668213, + "learning_rate": 9.688502124160615e-06, + "loss": 0.4746, + "step": 3024 + }, + { + "epoch": 0.04103364080303852, + "grad_norm": 6.678092956542969, + "learning_rate": 9.68836508154036e-06, + "loss": 0.4677, + "step": 3025 + }, + { + "epoch": 0.041047205642973414, + "grad_norm": 9.318276405334473, + "learning_rate": 9.688228038920105e-06, + "loss": 0.6937, + "step": 3026 + }, + { + "epoch": 0.0410607704829083, + "grad_norm": 9.181702613830566, + "learning_rate": 9.68809099629985e-06, + "loss": 0.5157, + "step": 3027 + }, + { + "epoch": 0.04107433532284319, + "grad_norm": 8.174454689025879, + "learning_rate": 9.687953953679595e-06, + "loss": 0.4883, + "step": 3028 + }, + { + "epoch": 0.04108790016277808, + "grad_norm": 6.978145599365234, + "learning_rate": 9.68781691105934e-06, + "loss": 0.4693, + "step": 3029 + }, + { + "epoch": 0.041101465002712965, + "grad_norm": 7.924473762512207, + "learning_rate": 9.687679868439086e-06, + "loss": 0.5957, + "step": 3030 + }, + { + "epoch": 0.04111502984264786, + "grad_norm": 9.04180908203125, + "learning_rate": 9.68754282581883e-06, + "loss": 0.4519, + "step": 3031 + }, + { + "epoch": 0.041128594682582745, + "grad_norm": 10.793227195739746, + "learning_rate": 9.687405783198576e-06, + "loss": 0.5329, + "step": 3032 + }, + { + "epoch": 0.04114215952251763, + "grad_norm": 8.460827827453613, + "learning_rate": 9.687268740578321e-06, + "loss": 0.5493, + "step": 3033 + }, + { + "epoch": 0.041155724362452524, + "grad_norm": 7.148990154266357, + "learning_rate": 9.687131697958065e-06, + "loss": 0.3718, + "step": 3034 + }, + { + "epoch": 0.04116928920238741, + "grad_norm": 8.77853012084961, + "learning_rate": 9.686994655337812e-06, + "loss": 0.6843, + "step": 3035 + }, + { + "epoch": 0.0411828540423223, + "grad_norm": 8.596467018127441, + "learning_rate": 9.686857612717557e-06, + "loss": 0.6503, + "step": 3036 + }, + { + "epoch": 0.04119641888225719, + "grad_norm": 7.798518180847168, + "learning_rate": 9.686720570097302e-06, + "loss": 0.3945, + "step": 3037 + }, + { + "epoch": 0.041209983722192076, + "grad_norm": 7.4871368408203125, + "learning_rate": 9.686583527477045e-06, + "loss": 0.4327, + "step": 3038 + }, + { + "epoch": 0.04122354856212697, + "grad_norm": 11.2709379196167, + "learning_rate": 9.68644648485679e-06, + "loss": 0.5301, + "step": 3039 + }, + { + "epoch": 0.041237113402061855, + "grad_norm": 9.608566284179688, + "learning_rate": 9.686309442236537e-06, + "loss": 0.4869, + "step": 3040 + }, + { + "epoch": 0.04125067824199674, + "grad_norm": 8.80724811553955, + "learning_rate": 9.686172399616281e-06, + "loss": 0.3463, + "step": 3041 + }, + { + "epoch": 0.041264243081931634, + "grad_norm": 7.138818740844727, + "learning_rate": 9.686035356996026e-06, + "loss": 0.5766, + "step": 3042 + }, + { + "epoch": 0.04127780792186652, + "grad_norm": 7.750698566436768, + "learning_rate": 9.685898314375771e-06, + "loss": 0.4288, + "step": 3043 + }, + { + "epoch": 0.04129137276180141, + "grad_norm": 7.257572174072266, + "learning_rate": 9.685761271755516e-06, + "loss": 0.5657, + "step": 3044 + }, + { + "epoch": 0.0413049376017363, + "grad_norm": 6.999053478240967, + "learning_rate": 9.685624229135262e-06, + "loss": 0.4767, + "step": 3045 + }, + { + "epoch": 0.041318502441671186, + "grad_norm": 7.520554065704346, + "learning_rate": 9.685487186515007e-06, + "loss": 0.6193, + "step": 3046 + }, + { + "epoch": 0.04133206728160608, + "grad_norm": 6.847055435180664, + "learning_rate": 9.685350143894752e-06, + "loss": 0.5077, + "step": 3047 + }, + { + "epoch": 0.041345632121540965, + "grad_norm": 8.895929336547852, + "learning_rate": 9.685213101274497e-06, + "loss": 0.6093, + "step": 3048 + }, + { + "epoch": 0.04135919696147585, + "grad_norm": 7.614526748657227, + "learning_rate": 9.685076058654242e-06, + "loss": 0.4347, + "step": 3049 + }, + { + "epoch": 0.041372761801410744, + "grad_norm": 7.294904708862305, + "learning_rate": 9.684939016033987e-06, + "loss": 0.3822, + "step": 3050 + }, + { + "epoch": 0.04138632664134563, + "grad_norm": 7.45383358001709, + "learning_rate": 9.684801973413733e-06, + "loss": 0.468, + "step": 3051 + }, + { + "epoch": 0.041399891481280524, + "grad_norm": 5.2556023597717285, + "learning_rate": 9.684664930793478e-06, + "loss": 0.3647, + "step": 3052 + }, + { + "epoch": 0.04141345632121541, + "grad_norm": 6.5348405838012695, + "learning_rate": 9.684527888173223e-06, + "loss": 0.3567, + "step": 3053 + }, + { + "epoch": 0.041427021161150296, + "grad_norm": 7.64487361907959, + "learning_rate": 9.684390845552968e-06, + "loss": 0.4585, + "step": 3054 + }, + { + "epoch": 0.04144058600108519, + "grad_norm": 8.928524017333984, + "learning_rate": 9.684253802932713e-06, + "loss": 0.52, + "step": 3055 + }, + { + "epoch": 0.041454150841020075, + "grad_norm": 8.130188941955566, + "learning_rate": 9.684116760312457e-06, + "loss": 0.483, + "step": 3056 + }, + { + "epoch": 0.04146771568095496, + "grad_norm": 7.875490188598633, + "learning_rate": 9.683979717692204e-06, + "loss": 0.5101, + "step": 3057 + }, + { + "epoch": 0.041481280520889854, + "grad_norm": 7.798539638519287, + "learning_rate": 9.683842675071949e-06, + "loss": 0.5256, + "step": 3058 + }, + { + "epoch": 0.04149484536082474, + "grad_norm": 9.634237289428711, + "learning_rate": 9.683705632451692e-06, + "loss": 0.5423, + "step": 3059 + }, + { + "epoch": 0.041508410200759634, + "grad_norm": 8.94647216796875, + "learning_rate": 9.683568589831438e-06, + "loss": 0.5583, + "step": 3060 + }, + { + "epoch": 0.04152197504069452, + "grad_norm": 7.550703525543213, + "learning_rate": 9.683431547211184e-06, + "loss": 0.498, + "step": 3061 + }, + { + "epoch": 0.041535539880629406, + "grad_norm": 7.451467990875244, + "learning_rate": 9.683294504590928e-06, + "loss": 0.5766, + "step": 3062 + }, + { + "epoch": 0.0415491047205643, + "grad_norm": 5.609135150909424, + "learning_rate": 9.683157461970673e-06, + "loss": 0.413, + "step": 3063 + }, + { + "epoch": 0.041562669560499185, + "grad_norm": 7.568779468536377, + "learning_rate": 9.683020419350418e-06, + "loss": 0.377, + "step": 3064 + }, + { + "epoch": 0.04157623440043407, + "grad_norm": 6.324046611785889, + "learning_rate": 9.682883376730163e-06, + "loss": 0.462, + "step": 3065 + }, + { + "epoch": 0.041589799240368965, + "grad_norm": 5.042466163635254, + "learning_rate": 9.682746334109909e-06, + "loss": 0.2908, + "step": 3066 + }, + { + "epoch": 0.04160336408030385, + "grad_norm": 10.64928913116455, + "learning_rate": 9.682609291489654e-06, + "loss": 0.5528, + "step": 3067 + }, + { + "epoch": 0.041616928920238744, + "grad_norm": 9.240706443786621, + "learning_rate": 9.682472248869399e-06, + "loss": 0.6876, + "step": 3068 + }, + { + "epoch": 0.04163049376017363, + "grad_norm": 8.309649467468262, + "learning_rate": 9.682335206249144e-06, + "loss": 0.597, + "step": 3069 + }, + { + "epoch": 0.041644058600108516, + "grad_norm": 7.59240198135376, + "learning_rate": 9.68219816362889e-06, + "loss": 0.5662, + "step": 3070 + }, + { + "epoch": 0.04165762344004341, + "grad_norm": 7.805194854736328, + "learning_rate": 9.682061121008635e-06, + "loss": 0.5349, + "step": 3071 + }, + { + "epoch": 0.041671188279978295, + "grad_norm": 8.981900215148926, + "learning_rate": 9.68192407838838e-06, + "loss": 0.4877, + "step": 3072 + }, + { + "epoch": 0.04168475311991319, + "grad_norm": 6.781878471374512, + "learning_rate": 9.681787035768125e-06, + "loss": 0.4304, + "step": 3073 + }, + { + "epoch": 0.041698317959848075, + "grad_norm": 8.323932647705078, + "learning_rate": 9.68164999314787e-06, + "loss": 0.5851, + "step": 3074 + }, + { + "epoch": 0.04171188279978296, + "grad_norm": 6.258712291717529, + "learning_rate": 9.681512950527615e-06, + "loss": 0.4394, + "step": 3075 + }, + { + "epoch": 0.041725447639717854, + "grad_norm": 6.333269119262695, + "learning_rate": 9.68137590790736e-06, + "loss": 0.4918, + "step": 3076 + }, + { + "epoch": 0.04173901247965274, + "grad_norm": 6.979219913482666, + "learning_rate": 9.681238865287104e-06, + "loss": 0.4354, + "step": 3077 + }, + { + "epoch": 0.041752577319587626, + "grad_norm": 6.203427791595459, + "learning_rate": 9.68110182266685e-06, + "loss": 0.4732, + "step": 3078 + }, + { + "epoch": 0.04176614215952252, + "grad_norm": 8.141164779663086, + "learning_rate": 9.680964780046596e-06, + "loss": 0.481, + "step": 3079 + }, + { + "epoch": 0.041779706999457406, + "grad_norm": 6.590114593505859, + "learning_rate": 9.680827737426341e-06, + "loss": 0.4996, + "step": 3080 + }, + { + "epoch": 0.0417932718393923, + "grad_norm": 5.761651992797852, + "learning_rate": 9.680690694806085e-06, + "loss": 0.5205, + "step": 3081 + }, + { + "epoch": 0.041806836679327185, + "grad_norm": 8.878241539001465, + "learning_rate": 9.68055365218583e-06, + "loss": 0.7292, + "step": 3082 + }, + { + "epoch": 0.04182040151926207, + "grad_norm": 10.053977966308594, + "learning_rate": 9.680416609565577e-06, + "loss": 0.5867, + "step": 3083 + }, + { + "epoch": 0.041833966359196964, + "grad_norm": 9.301924705505371, + "learning_rate": 9.68027956694532e-06, + "loss": 0.5521, + "step": 3084 + }, + { + "epoch": 0.04184753119913185, + "grad_norm": 7.205127716064453, + "learning_rate": 9.680142524325065e-06, + "loss": 0.6065, + "step": 3085 + }, + { + "epoch": 0.041861096039066736, + "grad_norm": 7.625129699707031, + "learning_rate": 9.68000548170481e-06, + "loss": 0.5578, + "step": 3086 + }, + { + "epoch": 0.04187466087900163, + "grad_norm": 6.9422125816345215, + "learning_rate": 9.679868439084556e-06, + "loss": 0.6039, + "step": 3087 + }, + { + "epoch": 0.041888225718936516, + "grad_norm": 6.612598419189453, + "learning_rate": 9.679731396464301e-06, + "loss": 0.2846, + "step": 3088 + }, + { + "epoch": 0.04190179055887141, + "grad_norm": 7.027040958404541, + "learning_rate": 9.679594353844046e-06, + "loss": 0.4813, + "step": 3089 + }, + { + "epoch": 0.041915355398806295, + "grad_norm": 6.376227378845215, + "learning_rate": 9.679457311223791e-06, + "loss": 0.4678, + "step": 3090 + }, + { + "epoch": 0.04192892023874118, + "grad_norm": 6.4526214599609375, + "learning_rate": 9.679320268603536e-06, + "loss": 0.5864, + "step": 3091 + }, + { + "epoch": 0.041942485078676074, + "grad_norm": 6.297903060913086, + "learning_rate": 9.679183225983282e-06, + "loss": 0.4228, + "step": 3092 + }, + { + "epoch": 0.04195604991861096, + "grad_norm": 5.46200704574585, + "learning_rate": 9.679046183363027e-06, + "loss": 0.3784, + "step": 3093 + }, + { + "epoch": 0.041969614758545847, + "grad_norm": 6.270568370819092, + "learning_rate": 9.678909140742772e-06, + "loss": 0.3957, + "step": 3094 + }, + { + "epoch": 0.04198317959848074, + "grad_norm": 5.915499210357666, + "learning_rate": 9.678772098122517e-06, + "loss": 0.4285, + "step": 3095 + }, + { + "epoch": 0.041996744438415626, + "grad_norm": 8.470203399658203, + "learning_rate": 9.678635055502262e-06, + "loss": 0.6764, + "step": 3096 + }, + { + "epoch": 0.04201030927835052, + "grad_norm": 6.470853805541992, + "learning_rate": 9.678498012882008e-06, + "loss": 0.2731, + "step": 3097 + }, + { + "epoch": 0.042023874118285405, + "grad_norm": 5.71237325668335, + "learning_rate": 9.678360970261753e-06, + "loss": 0.4201, + "step": 3098 + }, + { + "epoch": 0.04203743895822029, + "grad_norm": 7.107183456420898, + "learning_rate": 9.678223927641496e-06, + "loss": 0.6665, + "step": 3099 + }, + { + "epoch": 0.042051003798155184, + "grad_norm": 8.281554222106934, + "learning_rate": 9.678086885021243e-06, + "loss": 0.5128, + "step": 3100 + }, + { + "epoch": 0.04206456863809007, + "grad_norm": 5.879458904266357, + "learning_rate": 9.677949842400988e-06, + "loss": 0.432, + "step": 3101 + }, + { + "epoch": 0.04207813347802496, + "grad_norm": 7.215609073638916, + "learning_rate": 9.677812799780732e-06, + "loss": 0.4874, + "step": 3102 + }, + { + "epoch": 0.04209169831795985, + "grad_norm": 5.162352085113525, + "learning_rate": 9.677675757160477e-06, + "loss": 0.349, + "step": 3103 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 9.371909141540527, + "learning_rate": 9.677538714540224e-06, + "loss": 0.5949, + "step": 3104 + }, + { + "epoch": 0.04211882799782963, + "grad_norm": 7.499497890472412, + "learning_rate": 9.677401671919969e-06, + "loss": 0.4044, + "step": 3105 + }, + { + "epoch": 0.042132392837764515, + "grad_norm": 4.503114223480225, + "learning_rate": 9.677264629299712e-06, + "loss": 0.3429, + "step": 3106 + }, + { + "epoch": 0.0421459576776994, + "grad_norm": 4.798037528991699, + "learning_rate": 9.677127586679458e-06, + "loss": 0.3175, + "step": 3107 + }, + { + "epoch": 0.042159522517634294, + "grad_norm": 5.130072116851807, + "learning_rate": 9.676990544059203e-06, + "loss": 0.3272, + "step": 3108 + }, + { + "epoch": 0.04217308735756918, + "grad_norm": 7.333490371704102, + "learning_rate": 9.676853501438948e-06, + "loss": 0.3792, + "step": 3109 + }, + { + "epoch": 0.04218665219750407, + "grad_norm": 7.826060771942139, + "learning_rate": 9.676716458818693e-06, + "loss": 0.335, + "step": 3110 + }, + { + "epoch": 0.04220021703743896, + "grad_norm": 5.098236083984375, + "learning_rate": 9.676579416198438e-06, + "loss": 0.367, + "step": 3111 + }, + { + "epoch": 0.042213781877373846, + "grad_norm": 8.051276206970215, + "learning_rate": 9.676442373578184e-06, + "loss": 0.4228, + "step": 3112 + }, + { + "epoch": 0.04222734671730874, + "grad_norm": 5.903053283691406, + "learning_rate": 9.676305330957929e-06, + "loss": 0.4366, + "step": 3113 + }, + { + "epoch": 0.042240911557243625, + "grad_norm": 5.991075038909912, + "learning_rate": 9.676168288337674e-06, + "loss": 0.3609, + "step": 3114 + }, + { + "epoch": 0.04225447639717851, + "grad_norm": 6.211704730987549, + "learning_rate": 9.676031245717419e-06, + "loss": 0.4494, + "step": 3115 + }, + { + "epoch": 0.042268041237113405, + "grad_norm": 7.415727615356445, + "learning_rate": 9.675894203097164e-06, + "loss": 0.5796, + "step": 3116 + }, + { + "epoch": 0.04228160607704829, + "grad_norm": 6.359202861785889, + "learning_rate": 9.67575716047691e-06, + "loss": 0.4023, + "step": 3117 + }, + { + "epoch": 0.04229517091698318, + "grad_norm": 6.670890808105469, + "learning_rate": 9.675620117856655e-06, + "loss": 0.5192, + "step": 3118 + }, + { + "epoch": 0.04230873575691807, + "grad_norm": 6.054116249084473, + "learning_rate": 9.6754830752364e-06, + "loss": 0.4529, + "step": 3119 + }, + { + "epoch": 0.042322300596852956, + "grad_norm": 6.422795295715332, + "learning_rate": 9.675346032616145e-06, + "loss": 0.4398, + "step": 3120 + }, + { + "epoch": 0.04233586543678785, + "grad_norm": 7.176780700683594, + "learning_rate": 9.675208989995888e-06, + "loss": 0.5194, + "step": 3121 + }, + { + "epoch": 0.042349430276722735, + "grad_norm": 6.122620582580566, + "learning_rate": 9.675071947375635e-06, + "loss": 0.4866, + "step": 3122 + }, + { + "epoch": 0.04236299511665762, + "grad_norm": 9.031460762023926, + "learning_rate": 9.67493490475538e-06, + "loss": 0.6913, + "step": 3123 + }, + { + "epoch": 0.042376559956592515, + "grad_norm": 6.635653018951416, + "learning_rate": 9.674797862135124e-06, + "loss": 0.3019, + "step": 3124 + }, + { + "epoch": 0.0423901247965274, + "grad_norm": 9.46362018585205, + "learning_rate": 9.674660819514869e-06, + "loss": 0.4578, + "step": 3125 + }, + { + "epoch": 0.04240368963646229, + "grad_norm": 6.927749156951904, + "learning_rate": 9.674523776894616e-06, + "loss": 0.3316, + "step": 3126 + }, + { + "epoch": 0.04241725447639718, + "grad_norm": 7.503211975097656, + "learning_rate": 9.67438673427436e-06, + "loss": 0.4944, + "step": 3127 + }, + { + "epoch": 0.042430819316332066, + "grad_norm": 6.699067115783691, + "learning_rate": 9.674249691654105e-06, + "loss": 0.4844, + "step": 3128 + }, + { + "epoch": 0.04244438415626696, + "grad_norm": 8.163484573364258, + "learning_rate": 9.67411264903385e-06, + "loss": 0.3978, + "step": 3129 + }, + { + "epoch": 0.042457948996201846, + "grad_norm": 6.068266868591309, + "learning_rate": 9.673975606413597e-06, + "loss": 0.4323, + "step": 3130 + }, + { + "epoch": 0.04247151383613673, + "grad_norm": 8.564496994018555, + "learning_rate": 9.67383856379334e-06, + "loss": 0.4723, + "step": 3131 + }, + { + "epoch": 0.042485078676071625, + "grad_norm": 6.598597049713135, + "learning_rate": 9.673701521173085e-06, + "loss": 0.4161, + "step": 3132 + }, + { + "epoch": 0.04249864351600651, + "grad_norm": 8.628398895263672, + "learning_rate": 9.67356447855283e-06, + "loss": 0.5247, + "step": 3133 + }, + { + "epoch": 0.0425122083559414, + "grad_norm": 6.369931697845459, + "learning_rate": 9.673427435932576e-06, + "loss": 0.3933, + "step": 3134 + }, + { + "epoch": 0.04252577319587629, + "grad_norm": 8.585042953491211, + "learning_rate": 9.673290393312321e-06, + "loss": 0.4359, + "step": 3135 + }, + { + "epoch": 0.042539338035811176, + "grad_norm": 10.62325382232666, + "learning_rate": 9.673153350692066e-06, + "loss": 0.5651, + "step": 3136 + }, + { + "epoch": 0.04255290287574607, + "grad_norm": 7.984013080596924, + "learning_rate": 9.673016308071811e-06, + "loss": 0.5355, + "step": 3137 + }, + { + "epoch": 0.042566467715680956, + "grad_norm": 7.375458717346191, + "learning_rate": 9.672879265451556e-06, + "loss": 0.4132, + "step": 3138 + }, + { + "epoch": 0.04258003255561584, + "grad_norm": 7.199085712432861, + "learning_rate": 9.672742222831302e-06, + "loss": 0.4516, + "step": 3139 + }, + { + "epoch": 0.042593597395550735, + "grad_norm": 8.7071533203125, + "learning_rate": 9.672605180211047e-06, + "loss": 0.4128, + "step": 3140 + }, + { + "epoch": 0.04260716223548562, + "grad_norm": 8.497895240783691, + "learning_rate": 9.672468137590792e-06, + "loss": 0.5441, + "step": 3141 + }, + { + "epoch": 0.04262072707542051, + "grad_norm": 7.061649322509766, + "learning_rate": 9.672331094970535e-06, + "loss": 0.526, + "step": 3142 + }, + { + "epoch": 0.0426342919153554, + "grad_norm": 7.643640995025635, + "learning_rate": 9.672194052350282e-06, + "loss": 0.3823, + "step": 3143 + }, + { + "epoch": 0.042647856755290287, + "grad_norm": 9.279908180236816, + "learning_rate": 9.672057009730028e-06, + "loss": 0.4434, + "step": 3144 + }, + { + "epoch": 0.04266142159522518, + "grad_norm": 6.611797332763672, + "learning_rate": 9.671919967109773e-06, + "loss": 0.3524, + "step": 3145 + }, + { + "epoch": 0.042674986435160066, + "grad_norm": 7.3694586753845215, + "learning_rate": 9.671782924489516e-06, + "loss": 0.5485, + "step": 3146 + }, + { + "epoch": 0.04268855127509495, + "grad_norm": 9.123685836791992, + "learning_rate": 9.671645881869263e-06, + "loss": 0.459, + "step": 3147 + }, + { + "epoch": 0.042702116115029845, + "grad_norm": 6.421139717102051, + "learning_rate": 9.671508839249008e-06, + "loss": 0.3423, + "step": 3148 + }, + { + "epoch": 0.04271568095496473, + "grad_norm": 7.555044174194336, + "learning_rate": 9.671371796628752e-06, + "loss": 0.4307, + "step": 3149 + }, + { + "epoch": 0.04272924579489962, + "grad_norm": 6.719505310058594, + "learning_rate": 9.671234754008497e-06, + "loss": 0.3654, + "step": 3150 + }, + { + "epoch": 0.04274281063483451, + "grad_norm": 10.331208229064941, + "learning_rate": 9.671097711388242e-06, + "loss": 0.6518, + "step": 3151 + }, + { + "epoch": 0.0427563754747694, + "grad_norm": 8.116741180419922, + "learning_rate": 9.670960668767987e-06, + "loss": 0.5126, + "step": 3152 + }, + { + "epoch": 0.04276994031470429, + "grad_norm": 8.701408386230469, + "learning_rate": 9.670823626147732e-06, + "loss": 0.3366, + "step": 3153 + }, + { + "epoch": 0.042783505154639176, + "grad_norm": 9.380898475646973, + "learning_rate": 9.670686583527478e-06, + "loss": 0.4624, + "step": 3154 + }, + { + "epoch": 0.04279706999457406, + "grad_norm": 7.93930721282959, + "learning_rate": 9.670549540907223e-06, + "loss": 0.3686, + "step": 3155 + }, + { + "epoch": 0.042810634834508955, + "grad_norm": 9.560455322265625, + "learning_rate": 9.670412498286968e-06, + "loss": 0.4251, + "step": 3156 + }, + { + "epoch": 0.04282419967444384, + "grad_norm": 9.375251770019531, + "learning_rate": 9.670275455666713e-06, + "loss": 0.6046, + "step": 3157 + }, + { + "epoch": 0.04283776451437873, + "grad_norm": 6.619970798492432, + "learning_rate": 9.670138413046458e-06, + "loss": 0.4829, + "step": 3158 + }, + { + "epoch": 0.04285132935431362, + "grad_norm": 5.41005277633667, + "learning_rate": 9.670001370426204e-06, + "loss": 0.4223, + "step": 3159 + }, + { + "epoch": 0.04286489419424851, + "grad_norm": 9.378849983215332, + "learning_rate": 9.669864327805949e-06, + "loss": 0.5602, + "step": 3160 + }, + { + "epoch": 0.0428784590341834, + "grad_norm": 6.600007057189941, + "learning_rate": 9.669727285185694e-06, + "loss": 0.3363, + "step": 3161 + }, + { + "epoch": 0.042892023874118286, + "grad_norm": 6.645505428314209, + "learning_rate": 9.669590242565439e-06, + "loss": 0.3276, + "step": 3162 + }, + { + "epoch": 0.04290558871405317, + "grad_norm": 7.901943683624268, + "learning_rate": 9.669453199945184e-06, + "loss": 0.3838, + "step": 3163 + }, + { + "epoch": 0.042919153553988065, + "grad_norm": 7.463070392608643, + "learning_rate": 9.669316157324928e-06, + "loss": 0.4293, + "step": 3164 + }, + { + "epoch": 0.04293271839392295, + "grad_norm": 8.90321159362793, + "learning_rate": 9.669179114704675e-06, + "loss": 0.5039, + "step": 3165 + }, + { + "epoch": 0.04294628323385784, + "grad_norm": 8.155440330505371, + "learning_rate": 9.66904207208442e-06, + "loss": 0.4005, + "step": 3166 + }, + { + "epoch": 0.04295984807379273, + "grad_norm": 7.399409294128418, + "learning_rate": 9.668905029464163e-06, + "loss": 0.4321, + "step": 3167 + }, + { + "epoch": 0.04297341291372762, + "grad_norm": 12.316484451293945, + "learning_rate": 9.668767986843908e-06, + "loss": 0.4917, + "step": 3168 + }, + { + "epoch": 0.04298697775366251, + "grad_norm": 7.607637405395508, + "learning_rate": 9.668630944223655e-06, + "loss": 0.4496, + "step": 3169 + }, + { + "epoch": 0.043000542593597396, + "grad_norm": 6.701259136199951, + "learning_rate": 9.668493901603399e-06, + "loss": 0.4955, + "step": 3170 + }, + { + "epoch": 0.04301410743353228, + "grad_norm": 8.808289527893066, + "learning_rate": 9.668356858983144e-06, + "loss": 0.4422, + "step": 3171 + }, + { + "epoch": 0.043027672273467175, + "grad_norm": 6.065435409545898, + "learning_rate": 9.66821981636289e-06, + "loss": 0.437, + "step": 3172 + }, + { + "epoch": 0.04304123711340206, + "grad_norm": 8.855140686035156, + "learning_rate": 9.668082773742636e-06, + "loss": 0.5782, + "step": 3173 + }, + { + "epoch": 0.04305480195333695, + "grad_norm": 6.805079936981201, + "learning_rate": 9.66794573112238e-06, + "loss": 0.4246, + "step": 3174 + }, + { + "epoch": 0.04306836679327184, + "grad_norm": 9.954919815063477, + "learning_rate": 9.667808688502125e-06, + "loss": 0.4627, + "step": 3175 + }, + { + "epoch": 0.04308193163320673, + "grad_norm": 6.02945613861084, + "learning_rate": 9.66767164588187e-06, + "loss": 0.3602, + "step": 3176 + }, + { + "epoch": 0.04309549647314162, + "grad_norm": 8.890292167663574, + "learning_rate": 9.667534603261615e-06, + "loss": 0.5855, + "step": 3177 + }, + { + "epoch": 0.043109061313076506, + "grad_norm": 8.309826850891113, + "learning_rate": 9.66739756064136e-06, + "loss": 0.4543, + "step": 3178 + }, + { + "epoch": 0.04312262615301139, + "grad_norm": 7.819921970367432, + "learning_rate": 9.667260518021105e-06, + "loss": 0.4933, + "step": 3179 + }, + { + "epoch": 0.043136190992946286, + "grad_norm": 7.914549827575684, + "learning_rate": 9.66712347540085e-06, + "loss": 0.5286, + "step": 3180 + }, + { + "epoch": 0.04314975583288117, + "grad_norm": 10.635930061340332, + "learning_rate": 9.666986432780596e-06, + "loss": 0.6149, + "step": 3181 + }, + { + "epoch": 0.04316332067281606, + "grad_norm": 7.439103603363037, + "learning_rate": 9.666849390160341e-06, + "loss": 0.436, + "step": 3182 + }, + { + "epoch": 0.04317688551275095, + "grad_norm": 9.376361846923828, + "learning_rate": 9.666712347540086e-06, + "loss": 0.5325, + "step": 3183 + }, + { + "epoch": 0.04319045035268584, + "grad_norm": 8.954760551452637, + "learning_rate": 9.666575304919831e-06, + "loss": 0.5721, + "step": 3184 + }, + { + "epoch": 0.04320401519262073, + "grad_norm": 6.133548736572266, + "learning_rate": 9.666438262299575e-06, + "loss": 0.3927, + "step": 3185 + }, + { + "epoch": 0.043217580032555616, + "grad_norm": 10.823594093322754, + "learning_rate": 9.666301219679322e-06, + "loss": 0.4759, + "step": 3186 + }, + { + "epoch": 0.0432311448724905, + "grad_norm": 7.342807769775391, + "learning_rate": 9.666164177059067e-06, + "loss": 0.329, + "step": 3187 + }, + { + "epoch": 0.043244709712425396, + "grad_norm": 7.363797187805176, + "learning_rate": 9.666027134438812e-06, + "loss": 0.3638, + "step": 3188 + }, + { + "epoch": 0.04325827455236028, + "grad_norm": 6.551280975341797, + "learning_rate": 9.665890091818556e-06, + "loss": 0.3959, + "step": 3189 + }, + { + "epoch": 0.04327183939229517, + "grad_norm": 8.887022018432617, + "learning_rate": 9.6657530491983e-06, + "loss": 0.6023, + "step": 3190 + }, + { + "epoch": 0.04328540423223006, + "grad_norm": 13.123089790344238, + "learning_rate": 9.665616006578048e-06, + "loss": 0.5458, + "step": 3191 + }, + { + "epoch": 0.04329896907216495, + "grad_norm": 8.466137886047363, + "learning_rate": 9.665478963957791e-06, + "loss": 0.4729, + "step": 3192 + }, + { + "epoch": 0.04331253391209984, + "grad_norm": 8.315871238708496, + "learning_rate": 9.665341921337536e-06, + "loss": 0.4217, + "step": 3193 + }, + { + "epoch": 0.04332609875203473, + "grad_norm": 5.450854301452637, + "learning_rate": 9.665204878717281e-06, + "loss": 0.3635, + "step": 3194 + }, + { + "epoch": 0.04333966359196961, + "grad_norm": 8.070435523986816, + "learning_rate": 9.665067836097027e-06, + "loss": 0.7225, + "step": 3195 + }, + { + "epoch": 0.043353228431904506, + "grad_norm": 7.8917083740234375, + "learning_rate": 9.664930793476772e-06, + "loss": 0.5058, + "step": 3196 + }, + { + "epoch": 0.04336679327183939, + "grad_norm": 7.943650722503662, + "learning_rate": 9.664793750856517e-06, + "loss": 0.5263, + "step": 3197 + }, + { + "epoch": 0.04338035811177428, + "grad_norm": 10.358316421508789, + "learning_rate": 9.664656708236262e-06, + "loss": 0.5996, + "step": 3198 + }, + { + "epoch": 0.04339392295170917, + "grad_norm": 7.26759147644043, + "learning_rate": 9.664519665616007e-06, + "loss": 0.461, + "step": 3199 + }, + { + "epoch": 0.04340748779164406, + "grad_norm": 9.367905616760254, + "learning_rate": 9.664382622995752e-06, + "loss": 0.5831, + "step": 3200 + }, + { + "epoch": 0.04342105263157895, + "grad_norm": 5.629997253417969, + "learning_rate": 9.664245580375498e-06, + "loss": 0.355, + "step": 3201 + }, + { + "epoch": 0.04343461747151384, + "grad_norm": 8.152567863464355, + "learning_rate": 9.664108537755243e-06, + "loss": 0.4118, + "step": 3202 + }, + { + "epoch": 0.04344818231144872, + "grad_norm": 8.233901023864746, + "learning_rate": 9.663971495134988e-06, + "loss": 0.4944, + "step": 3203 + }, + { + "epoch": 0.043461747151383616, + "grad_norm": 7.7873687744140625, + "learning_rate": 9.663834452514733e-06, + "loss": 0.5503, + "step": 3204 + }, + { + "epoch": 0.0434753119913185, + "grad_norm": 6.081096172332764, + "learning_rate": 9.663697409894478e-06, + "loss": 0.4471, + "step": 3205 + }, + { + "epoch": 0.04348887683125339, + "grad_norm": 7.973161220550537, + "learning_rate": 9.663560367274224e-06, + "loss": 0.5002, + "step": 3206 + }, + { + "epoch": 0.04350244167118828, + "grad_norm": 7.322437286376953, + "learning_rate": 9.663423324653967e-06, + "loss": 0.4962, + "step": 3207 + }, + { + "epoch": 0.04351600651112317, + "grad_norm": 8.957536697387695, + "learning_rate": 9.663286282033714e-06, + "loss": 0.5458, + "step": 3208 + }, + { + "epoch": 0.04352957135105806, + "grad_norm": 6.37569522857666, + "learning_rate": 9.663149239413459e-06, + "loss": 0.4478, + "step": 3209 + }, + { + "epoch": 0.04354313619099295, + "grad_norm": 7.474524974822998, + "learning_rate": 9.663012196793203e-06, + "loss": 0.4487, + "step": 3210 + }, + { + "epoch": 0.04355670103092783, + "grad_norm": 10.576992988586426, + "learning_rate": 9.662875154172948e-06, + "loss": 0.6256, + "step": 3211 + }, + { + "epoch": 0.043570265870862726, + "grad_norm": 7.28717565536499, + "learning_rate": 9.662738111552695e-06, + "loss": 0.3999, + "step": 3212 + }, + { + "epoch": 0.04358383071079761, + "grad_norm": 9.66346549987793, + "learning_rate": 9.66260106893244e-06, + "loss": 0.456, + "step": 3213 + }, + { + "epoch": 0.0435973955507325, + "grad_norm": 8.429545402526855, + "learning_rate": 9.662464026312183e-06, + "loss": 0.3777, + "step": 3214 + }, + { + "epoch": 0.04361096039066739, + "grad_norm": 9.386711120605469, + "learning_rate": 9.662326983691928e-06, + "loss": 0.5042, + "step": 3215 + }, + { + "epoch": 0.04362452523060228, + "grad_norm": 6.620604991912842, + "learning_rate": 9.662189941071675e-06, + "loss": 0.3716, + "step": 3216 + }, + { + "epoch": 0.04363809007053717, + "grad_norm": 9.277477264404297, + "learning_rate": 9.662052898451419e-06, + "loss": 0.4826, + "step": 3217 + }, + { + "epoch": 0.04365165491047206, + "grad_norm": 8.308362007141113, + "learning_rate": 9.661915855831164e-06, + "loss": 0.5378, + "step": 3218 + }, + { + "epoch": 0.04366521975040694, + "grad_norm": 6.624105930328369, + "learning_rate": 9.66177881321091e-06, + "loss": 0.4558, + "step": 3219 + }, + { + "epoch": 0.043678784590341836, + "grad_norm": 8.860259056091309, + "learning_rate": 9.661641770590654e-06, + "loss": 0.45, + "step": 3220 + }, + { + "epoch": 0.04369234943027672, + "grad_norm": 10.318958282470703, + "learning_rate": 9.6615047279704e-06, + "loss": 0.5107, + "step": 3221 + }, + { + "epoch": 0.04370591427021161, + "grad_norm": 6.211362361907959, + "learning_rate": 9.661367685350145e-06, + "loss": 0.4365, + "step": 3222 + }, + { + "epoch": 0.0437194791101465, + "grad_norm": 7.084677219390869, + "learning_rate": 9.66123064272989e-06, + "loss": 0.474, + "step": 3223 + }, + { + "epoch": 0.04373304395008139, + "grad_norm": 11.044449806213379, + "learning_rate": 9.661093600109635e-06, + "loss": 0.5917, + "step": 3224 + }, + { + "epoch": 0.04374660879001628, + "grad_norm": 7.987118721008301, + "learning_rate": 9.66095655748938e-06, + "loss": 0.4483, + "step": 3225 + }, + { + "epoch": 0.04376017362995117, + "grad_norm": 5.813055515289307, + "learning_rate": 9.660819514869125e-06, + "loss": 0.431, + "step": 3226 + }, + { + "epoch": 0.04377373846988605, + "grad_norm": 8.15625286102295, + "learning_rate": 9.66068247224887e-06, + "loss": 0.511, + "step": 3227 + }, + { + "epoch": 0.043787303309820946, + "grad_norm": 10.902030944824219, + "learning_rate": 9.660545429628616e-06, + "loss": 0.5694, + "step": 3228 + }, + { + "epoch": 0.04380086814975583, + "grad_norm": 7.064408779144287, + "learning_rate": 9.660408387008361e-06, + "loss": 0.4279, + "step": 3229 + }, + { + "epoch": 0.04381443298969072, + "grad_norm": 5.383235931396484, + "learning_rate": 9.660271344388106e-06, + "loss": 0.3575, + "step": 3230 + }, + { + "epoch": 0.04382799782962561, + "grad_norm": 6.555880546569824, + "learning_rate": 9.660134301767851e-06, + "loss": 0.3319, + "step": 3231 + }, + { + "epoch": 0.0438415626695605, + "grad_norm": 7.2897257804870605, + "learning_rate": 9.659997259147595e-06, + "loss": 0.4501, + "step": 3232 + }, + { + "epoch": 0.04385512750949539, + "grad_norm": 6.164061069488525, + "learning_rate": 9.65986021652734e-06, + "loss": 0.2992, + "step": 3233 + }, + { + "epoch": 0.04386869234943028, + "grad_norm": 7.634287357330322, + "learning_rate": 9.659723173907087e-06, + "loss": 0.2799, + "step": 3234 + }, + { + "epoch": 0.04388225718936516, + "grad_norm": 8.856139183044434, + "learning_rate": 9.65958613128683e-06, + "loss": 0.4722, + "step": 3235 + }, + { + "epoch": 0.043895822029300056, + "grad_norm": 8.83277416229248, + "learning_rate": 9.659449088666576e-06, + "loss": 0.5704, + "step": 3236 + }, + { + "epoch": 0.04390938686923494, + "grad_norm": 9.685917854309082, + "learning_rate": 9.65931204604632e-06, + "loss": 0.444, + "step": 3237 + }, + { + "epoch": 0.04392295170916983, + "grad_norm": 11.792719841003418, + "learning_rate": 9.659175003426068e-06, + "loss": 0.6696, + "step": 3238 + }, + { + "epoch": 0.04393651654910472, + "grad_norm": 9.18247127532959, + "learning_rate": 9.659037960805811e-06, + "loss": 0.4721, + "step": 3239 + }, + { + "epoch": 0.04395008138903961, + "grad_norm": 8.511734008789062, + "learning_rate": 9.658900918185556e-06, + "loss": 0.4038, + "step": 3240 + }, + { + "epoch": 0.0439636462289745, + "grad_norm": 7.665555477142334, + "learning_rate": 9.658763875565301e-06, + "loss": 0.3988, + "step": 3241 + }, + { + "epoch": 0.04397721106890939, + "grad_norm": 9.174798011779785, + "learning_rate": 9.658626832945047e-06, + "loss": 0.4002, + "step": 3242 + }, + { + "epoch": 0.04399077590884427, + "grad_norm": 9.688506126403809, + "learning_rate": 9.658489790324792e-06, + "loss": 0.5421, + "step": 3243 + }, + { + "epoch": 0.04400434074877917, + "grad_norm": 7.3458943367004395, + "learning_rate": 9.658352747704537e-06, + "loss": 0.3167, + "step": 3244 + }, + { + "epoch": 0.04401790558871405, + "grad_norm": 7.715225696563721, + "learning_rate": 9.658215705084282e-06, + "loss": 0.3859, + "step": 3245 + }, + { + "epoch": 0.04403147042864894, + "grad_norm": 6.722654342651367, + "learning_rate": 9.658078662464027e-06, + "loss": 0.4149, + "step": 3246 + }, + { + "epoch": 0.04404503526858383, + "grad_norm": 8.988910675048828, + "learning_rate": 9.657941619843772e-06, + "loss": 0.3963, + "step": 3247 + }, + { + "epoch": 0.04405860010851872, + "grad_norm": 10.600930213928223, + "learning_rate": 9.657804577223518e-06, + "loss": 0.5406, + "step": 3248 + }, + { + "epoch": 0.04407216494845361, + "grad_norm": 7.58202600479126, + "learning_rate": 9.657667534603263e-06, + "loss": 0.4309, + "step": 3249 + }, + { + "epoch": 0.0440857297883885, + "grad_norm": 7.898948669433594, + "learning_rate": 9.657530491983006e-06, + "loss": 0.4165, + "step": 3250 + }, + { + "epoch": 0.044099294628323384, + "grad_norm": 9.094417572021484, + "learning_rate": 9.657393449362753e-06, + "loss": 0.3166, + "step": 3251 + }, + { + "epoch": 0.04411285946825828, + "grad_norm": 9.570877075195312, + "learning_rate": 9.657256406742498e-06, + "loss": 0.5142, + "step": 3252 + }, + { + "epoch": 0.04412642430819316, + "grad_norm": 8.398895263671875, + "learning_rate": 9.657119364122242e-06, + "loss": 0.4448, + "step": 3253 + }, + { + "epoch": 0.04413998914812805, + "grad_norm": 8.620144844055176, + "learning_rate": 9.656982321501987e-06, + "loss": 0.5167, + "step": 3254 + }, + { + "epoch": 0.04415355398806294, + "grad_norm": 9.68919563293457, + "learning_rate": 9.656845278881734e-06, + "loss": 0.534, + "step": 3255 + }, + { + "epoch": 0.04416711882799783, + "grad_norm": 10.442928314208984, + "learning_rate": 9.656708236261479e-06, + "loss": 0.5042, + "step": 3256 + }, + { + "epoch": 0.04418068366793272, + "grad_norm": 6.957244396209717, + "learning_rate": 9.656571193641223e-06, + "loss": 0.3311, + "step": 3257 + }, + { + "epoch": 0.04419424850786761, + "grad_norm": 7.733405590057373, + "learning_rate": 9.656434151020968e-06, + "loss": 0.3523, + "step": 3258 + }, + { + "epoch": 0.044207813347802494, + "grad_norm": 9.627359390258789, + "learning_rate": 9.656297108400713e-06, + "loss": 0.3939, + "step": 3259 + }, + { + "epoch": 0.04422137818773739, + "grad_norm": 8.837615013122559, + "learning_rate": 9.656160065780458e-06, + "loss": 0.5678, + "step": 3260 + }, + { + "epoch": 0.04423494302767227, + "grad_norm": 8.363544464111328, + "learning_rate": 9.656023023160203e-06, + "loss": 0.3741, + "step": 3261 + }, + { + "epoch": 0.04424850786760716, + "grad_norm": 8.912406921386719, + "learning_rate": 9.655885980539948e-06, + "loss": 0.3945, + "step": 3262 + }, + { + "epoch": 0.04426207270754205, + "grad_norm": 6.905295372009277, + "learning_rate": 9.655748937919694e-06, + "loss": 0.4877, + "step": 3263 + }, + { + "epoch": 0.04427563754747694, + "grad_norm": 9.507477760314941, + "learning_rate": 9.655611895299439e-06, + "loss": 0.6051, + "step": 3264 + }, + { + "epoch": 0.04428920238741183, + "grad_norm": 7.033831596374512, + "learning_rate": 9.655474852679184e-06, + "loss": 0.4537, + "step": 3265 + }, + { + "epoch": 0.04430276722734672, + "grad_norm": 8.348673820495605, + "learning_rate": 9.65533781005893e-06, + "loss": 0.525, + "step": 3266 + }, + { + "epoch": 0.044316332067281604, + "grad_norm": 10.708873748779297, + "learning_rate": 9.655200767438674e-06, + "loss": 0.6661, + "step": 3267 + }, + { + "epoch": 0.0443298969072165, + "grad_norm": 10.89995288848877, + "learning_rate": 9.65506372481842e-06, + "loss": 0.7121, + "step": 3268 + }, + { + "epoch": 0.04434346174715138, + "grad_norm": 7.474300384521484, + "learning_rate": 9.654926682198165e-06, + "loss": 0.4611, + "step": 3269 + }, + { + "epoch": 0.04435702658708627, + "grad_norm": 8.304181098937988, + "learning_rate": 9.65478963957791e-06, + "loss": 0.4805, + "step": 3270 + }, + { + "epoch": 0.04437059142702116, + "grad_norm": 8.002850532531738, + "learning_rate": 9.654652596957655e-06, + "loss": 0.538, + "step": 3271 + }, + { + "epoch": 0.04438415626695605, + "grad_norm": 7.4916486740112305, + "learning_rate": 9.654515554337399e-06, + "loss": 0.491, + "step": 3272 + }, + { + "epoch": 0.04439772110689094, + "grad_norm": 6.760610580444336, + "learning_rate": 9.654378511717145e-06, + "loss": 0.495, + "step": 3273 + }, + { + "epoch": 0.04441128594682583, + "grad_norm": 9.63538932800293, + "learning_rate": 9.65424146909689e-06, + "loss": 0.5188, + "step": 3274 + }, + { + "epoch": 0.044424850786760714, + "grad_norm": 6.907187461853027, + "learning_rate": 9.654104426476634e-06, + "loss": 0.3668, + "step": 3275 + }, + { + "epoch": 0.04443841562669561, + "grad_norm": 8.252105712890625, + "learning_rate": 9.65396738385638e-06, + "loss": 0.4168, + "step": 3276 + }, + { + "epoch": 0.04445198046663049, + "grad_norm": 8.260433197021484, + "learning_rate": 9.653830341236126e-06, + "loss": 0.5697, + "step": 3277 + }, + { + "epoch": 0.04446554530656538, + "grad_norm": 6.413388729095459, + "learning_rate": 9.65369329861587e-06, + "loss": 0.3564, + "step": 3278 + }, + { + "epoch": 0.04447911014650027, + "grad_norm": 6.636796474456787, + "learning_rate": 9.653556255995615e-06, + "loss": 0.4, + "step": 3279 + }, + { + "epoch": 0.04449267498643516, + "grad_norm": 7.07858943939209, + "learning_rate": 9.65341921337536e-06, + "loss": 0.3965, + "step": 3280 + }, + { + "epoch": 0.04450623982637005, + "grad_norm": 8.388147354125977, + "learning_rate": 9.653282170755107e-06, + "loss": 0.4213, + "step": 3281 + }, + { + "epoch": 0.04451980466630494, + "grad_norm": 7.466416358947754, + "learning_rate": 9.65314512813485e-06, + "loss": 0.4824, + "step": 3282 + }, + { + "epoch": 0.044533369506239824, + "grad_norm": 6.53095817565918, + "learning_rate": 9.653008085514596e-06, + "loss": 0.4915, + "step": 3283 + }, + { + "epoch": 0.04454693434617472, + "grad_norm": 8.382366180419922, + "learning_rate": 9.65287104289434e-06, + "loss": 0.4591, + "step": 3284 + }, + { + "epoch": 0.0445604991861096, + "grad_norm": 6.588686943054199, + "learning_rate": 9.652734000274086e-06, + "loss": 0.424, + "step": 3285 + }, + { + "epoch": 0.04457406402604449, + "grad_norm": 8.212359428405762, + "learning_rate": 9.652596957653831e-06, + "loss": 0.544, + "step": 3286 + }, + { + "epoch": 0.04458762886597938, + "grad_norm": 9.117085456848145, + "learning_rate": 9.652459915033576e-06, + "loss": 0.537, + "step": 3287 + }, + { + "epoch": 0.04460119370591427, + "grad_norm": 9.341843605041504, + "learning_rate": 9.652322872413321e-06, + "loss": 0.5324, + "step": 3288 + }, + { + "epoch": 0.04461475854584916, + "grad_norm": 8.315926551818848, + "learning_rate": 9.652185829793067e-06, + "loss": 0.5965, + "step": 3289 + }, + { + "epoch": 0.04462832338578405, + "grad_norm": 6.828519821166992, + "learning_rate": 9.652048787172812e-06, + "loss": 0.4762, + "step": 3290 + }, + { + "epoch": 0.044641888225718934, + "grad_norm": 9.261672019958496, + "learning_rate": 9.651911744552557e-06, + "loss": 0.4738, + "step": 3291 + }, + { + "epoch": 0.04465545306565383, + "grad_norm": 11.332183837890625, + "learning_rate": 9.651774701932302e-06, + "loss": 0.4482, + "step": 3292 + }, + { + "epoch": 0.044669017905588713, + "grad_norm": 9.391237258911133, + "learning_rate": 9.651637659312046e-06, + "loss": 0.5526, + "step": 3293 + }, + { + "epoch": 0.0446825827455236, + "grad_norm": 8.419623374938965, + "learning_rate": 9.651500616691793e-06, + "loss": 0.4836, + "step": 3294 + }, + { + "epoch": 0.04469614758545849, + "grad_norm": 7.068924427032471, + "learning_rate": 9.651363574071538e-06, + "loss": 0.5153, + "step": 3295 + }, + { + "epoch": 0.04470971242539338, + "grad_norm": 8.136829376220703, + "learning_rate": 9.651226531451283e-06, + "loss": 0.4878, + "step": 3296 + }, + { + "epoch": 0.04472327726532827, + "grad_norm": 7.575884819030762, + "learning_rate": 9.651089488831026e-06, + "loss": 0.4215, + "step": 3297 + }, + { + "epoch": 0.04473684210526316, + "grad_norm": 10.795706748962402, + "learning_rate": 9.650952446210773e-06, + "loss": 0.7085, + "step": 3298 + }, + { + "epoch": 0.044750406945198044, + "grad_norm": 7.274168491363525, + "learning_rate": 9.650815403590518e-06, + "loss": 0.4627, + "step": 3299 + }, + { + "epoch": 0.04476397178513294, + "grad_norm": 6.689751625061035, + "learning_rate": 9.650678360970262e-06, + "loss": 0.4332, + "step": 3300 + }, + { + "epoch": 0.044777536625067824, + "grad_norm": 9.348709106445312, + "learning_rate": 9.650541318350007e-06, + "loss": 0.4559, + "step": 3301 + }, + { + "epoch": 0.04479110146500271, + "grad_norm": 7.406954765319824, + "learning_rate": 9.650404275729752e-06, + "loss": 0.453, + "step": 3302 + }, + { + "epoch": 0.0448046663049376, + "grad_norm": 10.245407104492188, + "learning_rate": 9.650267233109497e-06, + "loss": 0.6751, + "step": 3303 + }, + { + "epoch": 0.04481823114487249, + "grad_norm": 7.891170978546143, + "learning_rate": 9.650130190489243e-06, + "loss": 0.4504, + "step": 3304 + }, + { + "epoch": 0.04483179598480738, + "grad_norm": 8.200861930847168, + "learning_rate": 9.649993147868988e-06, + "loss": 0.4661, + "step": 3305 + }, + { + "epoch": 0.04484536082474227, + "grad_norm": 6.509058475494385, + "learning_rate": 9.649856105248733e-06, + "loss": 0.3452, + "step": 3306 + }, + { + "epoch": 0.044858925664677154, + "grad_norm": 7.417505741119385, + "learning_rate": 9.649719062628478e-06, + "loss": 0.4858, + "step": 3307 + }, + { + "epoch": 0.04487249050461205, + "grad_norm": 7.186552047729492, + "learning_rate": 9.649582020008223e-06, + "loss": 0.3883, + "step": 3308 + }, + { + "epoch": 0.044886055344546934, + "grad_norm": 8.686360359191895, + "learning_rate": 9.649444977387969e-06, + "loss": 0.3974, + "step": 3309 + }, + { + "epoch": 0.04489962018448182, + "grad_norm": 9.125204086303711, + "learning_rate": 9.649307934767714e-06, + "loss": 0.5173, + "step": 3310 + }, + { + "epoch": 0.04491318502441671, + "grad_norm": 7.6305131912231445, + "learning_rate": 9.649170892147459e-06, + "loss": 0.3277, + "step": 3311 + }, + { + "epoch": 0.0449267498643516, + "grad_norm": 5.706944942474365, + "learning_rate": 9.649033849527204e-06, + "loss": 0.2312, + "step": 3312 + }, + { + "epoch": 0.04494031470428649, + "grad_norm": 6.752208709716797, + "learning_rate": 9.64889680690695e-06, + "loss": 0.364, + "step": 3313 + }, + { + "epoch": 0.04495387954422138, + "grad_norm": 5.917250633239746, + "learning_rate": 9.648759764286694e-06, + "loss": 0.2897, + "step": 3314 + }, + { + "epoch": 0.044967444384156265, + "grad_norm": 7.62347936630249, + "learning_rate": 9.648622721666438e-06, + "loss": 0.4201, + "step": 3315 + }, + { + "epoch": 0.04498100922409116, + "grad_norm": 5.278494358062744, + "learning_rate": 9.648485679046185e-06, + "loss": 0.3581, + "step": 3316 + }, + { + "epoch": 0.044994574064026044, + "grad_norm": 7.693493366241455, + "learning_rate": 9.64834863642593e-06, + "loss": 0.4134, + "step": 3317 + }, + { + "epoch": 0.04500813890396093, + "grad_norm": 12.517108917236328, + "learning_rate": 9.648211593805673e-06, + "loss": 0.5074, + "step": 3318 + }, + { + "epoch": 0.04502170374389582, + "grad_norm": 7.8936309814453125, + "learning_rate": 9.648074551185419e-06, + "loss": 0.5188, + "step": 3319 + }, + { + "epoch": 0.04503526858383071, + "grad_norm": 7.878422737121582, + "learning_rate": 9.647937508565165e-06, + "loss": 0.3353, + "step": 3320 + }, + { + "epoch": 0.0450488334237656, + "grad_norm": 5.747630596160889, + "learning_rate": 9.64780046594491e-06, + "loss": 0.3944, + "step": 3321 + }, + { + "epoch": 0.04506239826370049, + "grad_norm": 6.803525924682617, + "learning_rate": 9.647663423324654e-06, + "loss": 0.4395, + "step": 3322 + }, + { + "epoch": 0.045075963103635375, + "grad_norm": 7.961391448974609, + "learning_rate": 9.6475263807044e-06, + "loss": 0.3521, + "step": 3323 + }, + { + "epoch": 0.04508952794357027, + "grad_norm": 6.996563911437988, + "learning_rate": 9.647389338084146e-06, + "loss": 0.4103, + "step": 3324 + }, + { + "epoch": 0.045103092783505154, + "grad_norm": 7.9197258949279785, + "learning_rate": 9.64725229546389e-06, + "loss": 0.3678, + "step": 3325 + }, + { + "epoch": 0.04511665762344004, + "grad_norm": 7.107825756072998, + "learning_rate": 9.647115252843635e-06, + "loss": 0.3858, + "step": 3326 + }, + { + "epoch": 0.04513022246337493, + "grad_norm": 7.974654674530029, + "learning_rate": 9.64697821022338e-06, + "loss": 0.635, + "step": 3327 + }, + { + "epoch": 0.04514378730330982, + "grad_norm": 7.543798446655273, + "learning_rate": 9.646841167603125e-06, + "loss": 0.3936, + "step": 3328 + }, + { + "epoch": 0.04515735214324471, + "grad_norm": 7.565561771392822, + "learning_rate": 9.64670412498287e-06, + "loss": 0.3904, + "step": 3329 + }, + { + "epoch": 0.0451709169831796, + "grad_norm": 8.22011947631836, + "learning_rate": 9.646567082362616e-06, + "loss": 0.5087, + "step": 3330 + }, + { + "epoch": 0.045184481823114485, + "grad_norm": 7.889253616333008, + "learning_rate": 9.64643003974236e-06, + "loss": 0.4567, + "step": 3331 + }, + { + "epoch": 0.04519804666304938, + "grad_norm": 5.017051696777344, + "learning_rate": 9.646292997122106e-06, + "loss": 0.3415, + "step": 3332 + }, + { + "epoch": 0.045211611502984264, + "grad_norm": 6.998501300811768, + "learning_rate": 9.646155954501851e-06, + "loss": 0.3188, + "step": 3333 + }, + { + "epoch": 0.04522517634291915, + "grad_norm": 8.0191068649292, + "learning_rate": 9.646018911881596e-06, + "loss": 0.4117, + "step": 3334 + }, + { + "epoch": 0.04523874118285404, + "grad_norm": 6.355419635772705, + "learning_rate": 9.645881869261341e-06, + "loss": 0.322, + "step": 3335 + }, + { + "epoch": 0.04525230602278893, + "grad_norm": 8.993732452392578, + "learning_rate": 9.645744826641087e-06, + "loss": 0.4386, + "step": 3336 + }, + { + "epoch": 0.04526587086272382, + "grad_norm": 6.1515302658081055, + "learning_rate": 9.645607784020832e-06, + "loss": 0.3712, + "step": 3337 + }, + { + "epoch": 0.04527943570265871, + "grad_norm": 5.725368499755859, + "learning_rate": 9.645470741400577e-06, + "loss": 0.3441, + "step": 3338 + }, + { + "epoch": 0.045293000542593595, + "grad_norm": 5.5202956199646, + "learning_rate": 9.645333698780322e-06, + "loss": 0.3458, + "step": 3339 + }, + { + "epoch": 0.04530656538252849, + "grad_norm": 5.638996124267578, + "learning_rate": 9.645196656160066e-06, + "loss": 0.2793, + "step": 3340 + }, + { + "epoch": 0.045320130222463374, + "grad_norm": 8.306941032409668, + "learning_rate": 9.64505961353981e-06, + "loss": 0.4708, + "step": 3341 + }, + { + "epoch": 0.04533369506239826, + "grad_norm": 8.367158889770508, + "learning_rate": 9.644922570919558e-06, + "loss": 0.6054, + "step": 3342 + }, + { + "epoch": 0.045347259902333154, + "grad_norm": 7.742980480194092, + "learning_rate": 9.644785528299301e-06, + "loss": 0.3914, + "step": 3343 + }, + { + "epoch": 0.04536082474226804, + "grad_norm": 5.673480033874512, + "learning_rate": 9.644648485679046e-06, + "loss": 0.3357, + "step": 3344 + }, + { + "epoch": 0.04537438958220293, + "grad_norm": 7.242426872253418, + "learning_rate": 9.644511443058792e-06, + "loss": 0.4676, + "step": 3345 + }, + { + "epoch": 0.04538795442213782, + "grad_norm": 6.189298629760742, + "learning_rate": 9.644374400438537e-06, + "loss": 0.4104, + "step": 3346 + }, + { + "epoch": 0.045401519262072705, + "grad_norm": 9.38441276550293, + "learning_rate": 9.644237357818282e-06, + "loss": 0.47, + "step": 3347 + }, + { + "epoch": 0.0454150841020076, + "grad_norm": 6.6407856941223145, + "learning_rate": 9.644100315198027e-06, + "loss": 0.259, + "step": 3348 + }, + { + "epoch": 0.045428648941942484, + "grad_norm": 8.47431468963623, + "learning_rate": 9.643963272577772e-06, + "loss": 0.4683, + "step": 3349 + }, + { + "epoch": 0.04544221378187737, + "grad_norm": 7.654392719268799, + "learning_rate": 9.643826229957517e-06, + "loss": 0.4077, + "step": 3350 + }, + { + "epoch": 0.045455778621812264, + "grad_norm": 7.47833251953125, + "learning_rate": 9.643689187337263e-06, + "loss": 0.4227, + "step": 3351 + }, + { + "epoch": 0.04546934346174715, + "grad_norm": 8.581074714660645, + "learning_rate": 9.643552144717008e-06, + "loss": 0.4527, + "step": 3352 + }, + { + "epoch": 0.04548290830168204, + "grad_norm": 6.268130302429199, + "learning_rate": 9.643415102096753e-06, + "loss": 0.3723, + "step": 3353 + }, + { + "epoch": 0.04549647314161693, + "grad_norm": 8.606585502624512, + "learning_rate": 9.643278059476498e-06, + "loss": 0.4769, + "step": 3354 + }, + { + "epoch": 0.045510037981551815, + "grad_norm": 7.278327465057373, + "learning_rate": 9.643141016856243e-06, + "loss": 0.5091, + "step": 3355 + }, + { + "epoch": 0.04552360282148671, + "grad_norm": 9.956579208374023, + "learning_rate": 9.643003974235989e-06, + "loss": 0.6335, + "step": 3356 + }, + { + "epoch": 0.045537167661421595, + "grad_norm": 7.598023891448975, + "learning_rate": 9.642866931615734e-06, + "loss": 0.5078, + "step": 3357 + }, + { + "epoch": 0.04555073250135648, + "grad_norm": 10.426727294921875, + "learning_rate": 9.642729888995477e-06, + "loss": 0.5021, + "step": 3358 + }, + { + "epoch": 0.045564297341291374, + "grad_norm": 8.202362060546875, + "learning_rate": 9.642592846375224e-06, + "loss": 0.5565, + "step": 3359 + }, + { + "epoch": 0.04557786218122626, + "grad_norm": 9.374622344970703, + "learning_rate": 9.64245580375497e-06, + "loss": 0.6038, + "step": 3360 + }, + { + "epoch": 0.04559142702116115, + "grad_norm": 7.4525065422058105, + "learning_rate": 9.642318761134713e-06, + "loss": 0.5776, + "step": 3361 + }, + { + "epoch": 0.04560499186109604, + "grad_norm": 6.526636123657227, + "learning_rate": 9.642181718514458e-06, + "loss": 0.4252, + "step": 3362 + }, + { + "epoch": 0.045618556701030925, + "grad_norm": 7.638107776641846, + "learning_rate": 9.642044675894205e-06, + "loss": 0.3973, + "step": 3363 + }, + { + "epoch": 0.04563212154096582, + "grad_norm": 7.286427974700928, + "learning_rate": 9.64190763327395e-06, + "loss": 0.4869, + "step": 3364 + }, + { + "epoch": 0.045645686380900705, + "grad_norm": 7.495853424072266, + "learning_rate": 9.641770590653693e-06, + "loss": 0.4342, + "step": 3365 + }, + { + "epoch": 0.04565925122083559, + "grad_norm": 8.357126235961914, + "learning_rate": 9.641633548033439e-06, + "loss": 0.4915, + "step": 3366 + }, + { + "epoch": 0.045672816060770484, + "grad_norm": 6.045694828033447, + "learning_rate": 9.641496505413185e-06, + "loss": 0.3776, + "step": 3367 + }, + { + "epoch": 0.04568638090070537, + "grad_norm": 9.476092338562012, + "learning_rate": 9.641359462792929e-06, + "loss": 0.5883, + "step": 3368 + }, + { + "epoch": 0.04569994574064026, + "grad_norm": 6.358335971832275, + "learning_rate": 9.641222420172674e-06, + "loss": 0.4454, + "step": 3369 + }, + { + "epoch": 0.04571351058057515, + "grad_norm": 8.088183403015137, + "learning_rate": 9.64108537755242e-06, + "loss": 0.5951, + "step": 3370 + }, + { + "epoch": 0.045727075420510035, + "grad_norm": 8.361310005187988, + "learning_rate": 9.640948334932165e-06, + "loss": 0.4649, + "step": 3371 + }, + { + "epoch": 0.04574064026044493, + "grad_norm": 8.782533645629883, + "learning_rate": 9.64081129231191e-06, + "loss": 0.4966, + "step": 3372 + }, + { + "epoch": 0.045754205100379815, + "grad_norm": 8.315211296081543, + "learning_rate": 9.640674249691655e-06, + "loss": 0.5468, + "step": 3373 + }, + { + "epoch": 0.0457677699403147, + "grad_norm": 6.390068054199219, + "learning_rate": 9.6405372070714e-06, + "loss": 0.5631, + "step": 3374 + }, + { + "epoch": 0.045781334780249594, + "grad_norm": 5.683626174926758, + "learning_rate": 9.640400164451145e-06, + "loss": 0.4143, + "step": 3375 + }, + { + "epoch": 0.04579489962018448, + "grad_norm": 5.89096212387085, + "learning_rate": 9.64026312183089e-06, + "loss": 0.4513, + "step": 3376 + }, + { + "epoch": 0.04580846446011937, + "grad_norm": 6.431015968322754, + "learning_rate": 9.640126079210636e-06, + "loss": 0.3856, + "step": 3377 + }, + { + "epoch": 0.04582202930005426, + "grad_norm": 6.129432678222656, + "learning_rate": 9.63998903659038e-06, + "loss": 0.309, + "step": 3378 + }, + { + "epoch": 0.045835594139989146, + "grad_norm": 7.6831135749816895, + "learning_rate": 9.639851993970126e-06, + "loss": 0.7117, + "step": 3379 + }, + { + "epoch": 0.04584915897992404, + "grad_norm": 7.799375057220459, + "learning_rate": 9.639714951349871e-06, + "loss": 0.5477, + "step": 3380 + }, + { + "epoch": 0.045862723819858925, + "grad_norm": 4.793200969696045, + "learning_rate": 9.639577908729616e-06, + "loss": 0.3721, + "step": 3381 + }, + { + "epoch": 0.04587628865979381, + "grad_norm": 7.741333961486816, + "learning_rate": 9.639440866109361e-06, + "loss": 0.5382, + "step": 3382 + }, + { + "epoch": 0.045889853499728704, + "grad_norm": 6.939579963684082, + "learning_rate": 9.639303823489105e-06, + "loss": 0.3954, + "step": 3383 + }, + { + "epoch": 0.04590341833966359, + "grad_norm": 7.812911033630371, + "learning_rate": 9.63916678086885e-06, + "loss": 0.6168, + "step": 3384 + }, + { + "epoch": 0.04591698317959848, + "grad_norm": 6.218721866607666, + "learning_rate": 9.639029738248597e-06, + "loss": 0.5268, + "step": 3385 + }, + { + "epoch": 0.04593054801953337, + "grad_norm": 7.13449764251709, + "learning_rate": 9.63889269562834e-06, + "loss": 0.5016, + "step": 3386 + }, + { + "epoch": 0.045944112859468256, + "grad_norm": 8.776198387145996, + "learning_rate": 9.638755653008086e-06, + "loss": 0.5704, + "step": 3387 + }, + { + "epoch": 0.04595767769940315, + "grad_norm": 8.627269744873047, + "learning_rate": 9.638618610387831e-06, + "loss": 0.6605, + "step": 3388 + }, + { + "epoch": 0.045971242539338035, + "grad_norm": 6.09889554977417, + "learning_rate": 9.638481567767578e-06, + "loss": 0.4876, + "step": 3389 + }, + { + "epoch": 0.04598480737927292, + "grad_norm": 6.051026344299316, + "learning_rate": 9.638344525147321e-06, + "loss": 0.4521, + "step": 3390 + }, + { + "epoch": 0.045998372219207814, + "grad_norm": 8.240869522094727, + "learning_rate": 9.638207482527066e-06, + "loss": 0.474, + "step": 3391 + }, + { + "epoch": 0.0460119370591427, + "grad_norm": 5.21328067779541, + "learning_rate": 9.638070439906812e-06, + "loss": 0.3212, + "step": 3392 + }, + { + "epoch": 0.046025501899077594, + "grad_norm": 6.510119915008545, + "learning_rate": 9.637933397286557e-06, + "loss": 0.4689, + "step": 3393 + }, + { + "epoch": 0.04603906673901248, + "grad_norm": 10.998880386352539, + "learning_rate": 9.637796354666302e-06, + "loss": 0.4932, + "step": 3394 + }, + { + "epoch": 0.046052631578947366, + "grad_norm": 5.134862422943115, + "learning_rate": 9.637659312046047e-06, + "loss": 0.3376, + "step": 3395 + }, + { + "epoch": 0.04606619641888226, + "grad_norm": 6.871024131774902, + "learning_rate": 9.637522269425792e-06, + "loss": 0.3903, + "step": 3396 + }, + { + "epoch": 0.046079761258817145, + "grad_norm": 7.864443778991699, + "learning_rate": 9.637385226805537e-06, + "loss": 0.6089, + "step": 3397 + }, + { + "epoch": 0.04609332609875203, + "grad_norm": 8.049637794494629, + "learning_rate": 9.637248184185283e-06, + "loss": 0.4661, + "step": 3398 + }, + { + "epoch": 0.046106890938686924, + "grad_norm": 6.924012184143066, + "learning_rate": 9.637111141565028e-06, + "loss": 0.4634, + "step": 3399 + }, + { + "epoch": 0.04612045577862181, + "grad_norm": 8.246943473815918, + "learning_rate": 9.636974098944773e-06, + "loss": 0.5229, + "step": 3400 + }, + { + "epoch": 0.046134020618556704, + "grad_norm": 7.552234172821045, + "learning_rate": 9.636837056324516e-06, + "loss": 0.6092, + "step": 3401 + }, + { + "epoch": 0.04614758545849159, + "grad_norm": 7.977777004241943, + "learning_rate": 9.636700013704263e-06, + "loss": 0.4688, + "step": 3402 + }, + { + "epoch": 0.046161150298426476, + "grad_norm": 6.720064640045166, + "learning_rate": 9.636562971084009e-06, + "loss": 0.4417, + "step": 3403 + }, + { + "epoch": 0.04617471513836137, + "grad_norm": 8.497145652770996, + "learning_rate": 9.636425928463754e-06, + "loss": 0.5269, + "step": 3404 + }, + { + "epoch": 0.046188279978296255, + "grad_norm": 7.278395652770996, + "learning_rate": 9.636288885843497e-06, + "loss": 0.4086, + "step": 3405 + }, + { + "epoch": 0.04620184481823114, + "grad_norm": 10.979458808898926, + "learning_rate": 9.636151843223244e-06, + "loss": 0.8015, + "step": 3406 + }, + { + "epoch": 0.046215409658166035, + "grad_norm": 8.874433517456055, + "learning_rate": 9.63601480060299e-06, + "loss": 0.4018, + "step": 3407 + }, + { + "epoch": 0.04622897449810092, + "grad_norm": 6.515244960784912, + "learning_rate": 9.635877757982733e-06, + "loss": 0.5211, + "step": 3408 + }, + { + "epoch": 0.046242539338035814, + "grad_norm": 8.529191970825195, + "learning_rate": 9.635740715362478e-06, + "loss": 0.5331, + "step": 3409 + }, + { + "epoch": 0.0462561041779707, + "grad_norm": 14.629415512084961, + "learning_rate": 9.635603672742223e-06, + "loss": 0.4161, + "step": 3410 + }, + { + "epoch": 0.046269669017905586, + "grad_norm": 9.915886878967285, + "learning_rate": 9.635466630121968e-06, + "loss": 0.5352, + "step": 3411 + }, + { + "epoch": 0.04628323385784048, + "grad_norm": 5.934715747833252, + "learning_rate": 9.635329587501713e-06, + "loss": 0.3417, + "step": 3412 + }, + { + "epoch": 0.046296798697775365, + "grad_norm": 8.283234596252441, + "learning_rate": 9.635192544881459e-06, + "loss": 0.4851, + "step": 3413 + }, + { + "epoch": 0.04631036353771026, + "grad_norm": 7.709534645080566, + "learning_rate": 9.635055502261204e-06, + "loss": 0.4614, + "step": 3414 + }, + { + "epoch": 0.046323928377645145, + "grad_norm": 6.474123954772949, + "learning_rate": 9.634918459640949e-06, + "loss": 0.3908, + "step": 3415 + }, + { + "epoch": 0.04633749321758003, + "grad_norm": 7.290962219238281, + "learning_rate": 9.634781417020694e-06, + "loss": 0.5534, + "step": 3416 + }, + { + "epoch": 0.046351058057514924, + "grad_norm": 6.401426315307617, + "learning_rate": 9.63464437440044e-06, + "loss": 0.3715, + "step": 3417 + }, + { + "epoch": 0.04636462289744981, + "grad_norm": 11.296883583068848, + "learning_rate": 9.634507331780185e-06, + "loss": 0.8566, + "step": 3418 + }, + { + "epoch": 0.046378187737384696, + "grad_norm": 8.75800895690918, + "learning_rate": 9.63437028915993e-06, + "loss": 0.3609, + "step": 3419 + }, + { + "epoch": 0.04639175257731959, + "grad_norm": 6.790275573730469, + "learning_rate": 9.634233246539675e-06, + "loss": 0.4047, + "step": 3420 + }, + { + "epoch": 0.046405317417254476, + "grad_norm": 9.620308876037598, + "learning_rate": 9.63409620391942e-06, + "loss": 0.4652, + "step": 3421 + }, + { + "epoch": 0.04641888225718937, + "grad_norm": 8.478974342346191, + "learning_rate": 9.633959161299165e-06, + "loss": 0.4853, + "step": 3422 + }, + { + "epoch": 0.046432447097124255, + "grad_norm": 7.051397323608398, + "learning_rate": 9.633822118678909e-06, + "loss": 0.5693, + "step": 3423 + }, + { + "epoch": 0.04644601193705914, + "grad_norm": 9.045589447021484, + "learning_rate": 9.633685076058656e-06, + "loss": 0.6992, + "step": 3424 + }, + { + "epoch": 0.046459576776994034, + "grad_norm": 7.413017749786377, + "learning_rate": 9.6335480334384e-06, + "loss": 0.4914, + "step": 3425 + }, + { + "epoch": 0.04647314161692892, + "grad_norm": 8.911940574645996, + "learning_rate": 9.633410990818144e-06, + "loss": 0.7324, + "step": 3426 + }, + { + "epoch": 0.046486706456863806, + "grad_norm": 6.82614803314209, + "learning_rate": 9.63327394819789e-06, + "loss": 0.4615, + "step": 3427 + }, + { + "epoch": 0.0465002712967987, + "grad_norm": 8.758377075195312, + "learning_rate": 9.633136905577636e-06, + "loss": 0.4939, + "step": 3428 + }, + { + "epoch": 0.046513836136733586, + "grad_norm": 8.380070686340332, + "learning_rate": 9.632999862957381e-06, + "loss": 0.5429, + "step": 3429 + }, + { + "epoch": 0.04652740097666848, + "grad_norm": 8.967114448547363, + "learning_rate": 9.632862820337125e-06, + "loss": 0.4541, + "step": 3430 + }, + { + "epoch": 0.046540965816603365, + "grad_norm": 7.549538612365723, + "learning_rate": 9.63272577771687e-06, + "loss": 0.3941, + "step": 3431 + }, + { + "epoch": 0.04655453065653825, + "grad_norm": 5.098023891448975, + "learning_rate": 9.632588735096617e-06, + "loss": 0.3153, + "step": 3432 + }, + { + "epoch": 0.046568095496473144, + "grad_norm": 8.851947784423828, + "learning_rate": 9.63245169247636e-06, + "loss": 0.4906, + "step": 3433 + }, + { + "epoch": 0.04658166033640803, + "grad_norm": 9.41195011138916, + "learning_rate": 9.632314649856106e-06, + "loss": 0.5462, + "step": 3434 + }, + { + "epoch": 0.046595225176342916, + "grad_norm": 7.415404319763184, + "learning_rate": 9.632177607235851e-06, + "loss": 0.4823, + "step": 3435 + }, + { + "epoch": 0.04660879001627781, + "grad_norm": 7.036407947540283, + "learning_rate": 9.632040564615596e-06, + "loss": 0.406, + "step": 3436 + }, + { + "epoch": 0.046622354856212696, + "grad_norm": 8.706899642944336, + "learning_rate": 9.631903521995341e-06, + "loss": 0.5701, + "step": 3437 + }, + { + "epoch": 0.04663591969614759, + "grad_norm": 10.695112228393555, + "learning_rate": 9.631766479375086e-06, + "loss": 0.6838, + "step": 3438 + }, + { + "epoch": 0.046649484536082475, + "grad_norm": 7.57672643661499, + "learning_rate": 9.631629436754832e-06, + "loss": 0.3878, + "step": 3439 + }, + { + "epoch": 0.04666304937601736, + "grad_norm": 6.346618175506592, + "learning_rate": 9.631492394134577e-06, + "loss": 0.4182, + "step": 3440 + }, + { + "epoch": 0.046676614215952254, + "grad_norm": 10.615984916687012, + "learning_rate": 9.631355351514322e-06, + "loss": 0.5873, + "step": 3441 + }, + { + "epoch": 0.04669017905588714, + "grad_norm": 7.282536029815674, + "learning_rate": 9.631218308894067e-06, + "loss": 0.4341, + "step": 3442 + }, + { + "epoch": 0.04670374389582203, + "grad_norm": 6.944650173187256, + "learning_rate": 9.631081266273812e-06, + "loss": 0.4369, + "step": 3443 + }, + { + "epoch": 0.04671730873575692, + "grad_norm": 6.138614177703857, + "learning_rate": 9.630944223653556e-06, + "loss": 0.4169, + "step": 3444 + }, + { + "epoch": 0.046730873575691806, + "grad_norm": 9.108784675598145, + "learning_rate": 9.630807181033303e-06, + "loss": 0.7502, + "step": 3445 + }, + { + "epoch": 0.0467444384156267, + "grad_norm": 7.774845600128174, + "learning_rate": 9.630670138413048e-06, + "loss": 0.5226, + "step": 3446 + }, + { + "epoch": 0.046758003255561585, + "grad_norm": 7.604952812194824, + "learning_rate": 9.630533095792793e-06, + "loss": 0.5076, + "step": 3447 + }, + { + "epoch": 0.04677156809549647, + "grad_norm": 6.560511589050293, + "learning_rate": 9.630396053172537e-06, + "loss": 0.533, + "step": 3448 + }, + { + "epoch": 0.046785132935431364, + "grad_norm": 7.469658374786377, + "learning_rate": 9.630259010552283e-06, + "loss": 0.4622, + "step": 3449 + }, + { + "epoch": 0.04679869777536625, + "grad_norm": 8.4647855758667, + "learning_rate": 9.630121967932029e-06, + "loss": 0.4361, + "step": 3450 + }, + { + "epoch": 0.04681226261530114, + "grad_norm": 6.811697959899902, + "learning_rate": 9.629984925311772e-06, + "loss": 0.4411, + "step": 3451 + }, + { + "epoch": 0.04682582745523603, + "grad_norm": 7.474151611328125, + "learning_rate": 9.629847882691517e-06, + "loss": 0.3762, + "step": 3452 + }, + { + "epoch": 0.046839392295170916, + "grad_norm": 4.840150833129883, + "learning_rate": 9.629710840071262e-06, + "loss": 0.3499, + "step": 3453 + }, + { + "epoch": 0.04685295713510581, + "grad_norm": 6.6185455322265625, + "learning_rate": 9.629573797451008e-06, + "loss": 0.3985, + "step": 3454 + }, + { + "epoch": 0.046866521975040695, + "grad_norm": 6.996201038360596, + "learning_rate": 9.629436754830753e-06, + "loss": 0.5175, + "step": 3455 + }, + { + "epoch": 0.04688008681497558, + "grad_norm": 6.940147399902344, + "learning_rate": 9.629299712210498e-06, + "loss": 0.3709, + "step": 3456 + }, + { + "epoch": 0.046893651654910475, + "grad_norm": 5.500239849090576, + "learning_rate": 9.629162669590243e-06, + "loss": 0.3625, + "step": 3457 + }, + { + "epoch": 0.04690721649484536, + "grad_norm": 5.262004375457764, + "learning_rate": 9.629025626969988e-06, + "loss": 0.3266, + "step": 3458 + }, + { + "epoch": 0.04692078133478025, + "grad_norm": 4.183565139770508, + "learning_rate": 9.628888584349733e-06, + "loss": 0.2382, + "step": 3459 + }, + { + "epoch": 0.04693434617471514, + "grad_norm": 5.720786094665527, + "learning_rate": 9.628751541729479e-06, + "loss": 0.2779, + "step": 3460 + }, + { + "epoch": 0.046947911014650026, + "grad_norm": 8.205238342285156, + "learning_rate": 9.628614499109224e-06, + "loss": 0.4447, + "step": 3461 + }, + { + "epoch": 0.04696147585458492, + "grad_norm": 4.154979705810547, + "learning_rate": 9.628477456488969e-06, + "loss": 0.3197, + "step": 3462 + }, + { + "epoch": 0.046975040694519805, + "grad_norm": 5.651834487915039, + "learning_rate": 9.628340413868714e-06, + "loss": 0.3043, + "step": 3463 + }, + { + "epoch": 0.04698860553445469, + "grad_norm": 5.145767688751221, + "learning_rate": 9.62820337124846e-06, + "loss": 0.345, + "step": 3464 + }, + { + "epoch": 0.047002170374389585, + "grad_norm": 6.6309309005737305, + "learning_rate": 9.628066328628205e-06, + "loss": 0.4225, + "step": 3465 + }, + { + "epoch": 0.04701573521432447, + "grad_norm": 5.8219895362854, + "learning_rate": 9.627929286007948e-06, + "loss": 0.3887, + "step": 3466 + }, + { + "epoch": 0.04702930005425936, + "grad_norm": 5.48001766204834, + "learning_rate": 9.627792243387695e-06, + "loss": 0.4157, + "step": 3467 + }, + { + "epoch": 0.04704286489419425, + "grad_norm": 6.554646968841553, + "learning_rate": 9.62765520076744e-06, + "loss": 0.5496, + "step": 3468 + }, + { + "epoch": 0.047056429734129136, + "grad_norm": 7.223396301269531, + "learning_rate": 9.627518158147184e-06, + "loss": 0.3031, + "step": 3469 + }, + { + "epoch": 0.04706999457406403, + "grad_norm": 7.426342010498047, + "learning_rate": 9.627381115526929e-06, + "loss": 0.3683, + "step": 3470 + }, + { + "epoch": 0.047083559413998916, + "grad_norm": 5.465481758117676, + "learning_rate": 9.627244072906676e-06, + "loss": 0.232, + "step": 3471 + }, + { + "epoch": 0.0470971242539338, + "grad_norm": 6.825517654418945, + "learning_rate": 9.62710703028642e-06, + "loss": 0.4621, + "step": 3472 + }, + { + "epoch": 0.047110689093868695, + "grad_norm": 8.098699569702148, + "learning_rate": 9.626969987666164e-06, + "loss": 0.3715, + "step": 3473 + }, + { + "epoch": 0.04712425393380358, + "grad_norm": 5.700925827026367, + "learning_rate": 9.62683294504591e-06, + "loss": 0.4474, + "step": 3474 + }, + { + "epoch": 0.04713781877373847, + "grad_norm": 5.5523529052734375, + "learning_rate": 9.626695902425656e-06, + "loss": 0.4348, + "step": 3475 + }, + { + "epoch": 0.04715138361367336, + "grad_norm": 7.015794277191162, + "learning_rate": 9.6265588598054e-06, + "loss": 0.4338, + "step": 3476 + }, + { + "epoch": 0.047164948453608246, + "grad_norm": 8.41616439819336, + "learning_rate": 9.626421817185145e-06, + "loss": 0.488, + "step": 3477 + }, + { + "epoch": 0.04717851329354314, + "grad_norm": 6.382577419281006, + "learning_rate": 9.62628477456489e-06, + "loss": 0.3336, + "step": 3478 + }, + { + "epoch": 0.047192078133478026, + "grad_norm": 7.8426690101623535, + "learning_rate": 9.626147731944635e-06, + "loss": 0.4633, + "step": 3479 + }, + { + "epoch": 0.04720564297341291, + "grad_norm": 7.4122538566589355, + "learning_rate": 9.62601068932438e-06, + "loss": 0.5336, + "step": 3480 + }, + { + "epoch": 0.047219207813347805, + "grad_norm": 7.29258394241333, + "learning_rate": 9.625873646704126e-06, + "loss": 0.4743, + "step": 3481 + }, + { + "epoch": 0.04723277265328269, + "grad_norm": 7.250544548034668, + "learning_rate": 9.625736604083871e-06, + "loss": 0.4708, + "step": 3482 + }, + { + "epoch": 0.04724633749321758, + "grad_norm": 6.942728042602539, + "learning_rate": 9.625599561463616e-06, + "loss": 0.3628, + "step": 3483 + }, + { + "epoch": 0.04725990233315247, + "grad_norm": 7.221889495849609, + "learning_rate": 9.625462518843361e-06, + "loss": 0.437, + "step": 3484 + }, + { + "epoch": 0.047273467173087357, + "grad_norm": 6.583343029022217, + "learning_rate": 9.625325476223106e-06, + "loss": 0.3785, + "step": 3485 + }, + { + "epoch": 0.04728703201302225, + "grad_norm": 7.46058988571167, + "learning_rate": 9.625188433602852e-06, + "loss": 0.3579, + "step": 3486 + }, + { + "epoch": 0.047300596852957136, + "grad_norm": 7.267675399780273, + "learning_rate": 9.625051390982597e-06, + "loss": 0.5517, + "step": 3487 + }, + { + "epoch": 0.04731416169289202, + "grad_norm": 6.67761754989624, + "learning_rate": 9.624914348362342e-06, + "loss": 0.3887, + "step": 3488 + }, + { + "epoch": 0.047327726532826915, + "grad_norm": 6.640791416168213, + "learning_rate": 9.624777305742087e-06, + "loss": 0.3597, + "step": 3489 + }, + { + "epoch": 0.0473412913727618, + "grad_norm": 7.292204856872559, + "learning_rate": 9.624640263121832e-06, + "loss": 0.3848, + "step": 3490 + }, + { + "epoch": 0.04735485621269669, + "grad_norm": 11.257608413696289, + "learning_rate": 9.624503220501576e-06, + "loss": 0.6656, + "step": 3491 + }, + { + "epoch": 0.04736842105263158, + "grad_norm": 7.637263298034668, + "learning_rate": 9.624366177881321e-06, + "loss": 0.5056, + "step": 3492 + }, + { + "epoch": 0.04738198589256647, + "grad_norm": 7.3712005615234375, + "learning_rate": 9.624229135261068e-06, + "loss": 0.4726, + "step": 3493 + }, + { + "epoch": 0.04739555073250136, + "grad_norm": 9.142512321472168, + "learning_rate": 9.624092092640811e-06, + "loss": 0.4701, + "step": 3494 + }, + { + "epoch": 0.047409115572436246, + "grad_norm": 8.717741012573242, + "learning_rate": 9.623955050020557e-06, + "loss": 0.4987, + "step": 3495 + }, + { + "epoch": 0.04742268041237113, + "grad_norm": 7.591573715209961, + "learning_rate": 9.623818007400302e-06, + "loss": 0.4128, + "step": 3496 + }, + { + "epoch": 0.047436245252306025, + "grad_norm": 9.524142265319824, + "learning_rate": 9.623680964780049e-06, + "loss": 0.5954, + "step": 3497 + }, + { + "epoch": 0.04744981009224091, + "grad_norm": 6.582391738891602, + "learning_rate": 9.623543922159792e-06, + "loss": 0.3984, + "step": 3498 + }, + { + "epoch": 0.0474633749321758, + "grad_norm": 8.064630508422852, + "learning_rate": 9.623406879539537e-06, + "loss": 0.5132, + "step": 3499 + }, + { + "epoch": 0.04747693977211069, + "grad_norm": 9.81946849822998, + "learning_rate": 9.623269836919282e-06, + "loss": 0.5378, + "step": 3500 + }, + { + "epoch": 0.04749050461204558, + "grad_norm": 6.602692604064941, + "learning_rate": 9.623132794299028e-06, + "loss": 0.3572, + "step": 3501 + }, + { + "epoch": 0.04750406945198047, + "grad_norm": 8.874153137207031, + "learning_rate": 9.622995751678773e-06, + "loss": 0.3976, + "step": 3502 + }, + { + "epoch": 0.047517634291915356, + "grad_norm": 8.241398811340332, + "learning_rate": 9.622858709058518e-06, + "loss": 0.5193, + "step": 3503 + }, + { + "epoch": 0.04753119913185024, + "grad_norm": 6.902553081512451, + "learning_rate": 9.622721666438263e-06, + "loss": 0.5278, + "step": 3504 + }, + { + "epoch": 0.047544763971785135, + "grad_norm": 7.298617839813232, + "learning_rate": 9.622584623818008e-06, + "loss": 0.4108, + "step": 3505 + }, + { + "epoch": 0.04755832881172002, + "grad_norm": 7.684778690338135, + "learning_rate": 9.622447581197753e-06, + "loss": 0.3685, + "step": 3506 + }, + { + "epoch": 0.04757189365165491, + "grad_norm": 8.358283996582031, + "learning_rate": 9.622310538577499e-06, + "loss": 0.3931, + "step": 3507 + }, + { + "epoch": 0.0475854584915898, + "grad_norm": 6.818420886993408, + "learning_rate": 9.622173495957244e-06, + "loss": 0.3026, + "step": 3508 + }, + { + "epoch": 0.04759902333152469, + "grad_norm": 5.5857157707214355, + "learning_rate": 9.622036453336987e-06, + "loss": 0.3427, + "step": 3509 + }, + { + "epoch": 0.04761258817145958, + "grad_norm": 9.893330574035645, + "learning_rate": 9.621899410716734e-06, + "loss": 0.5014, + "step": 3510 + }, + { + "epoch": 0.047626153011394466, + "grad_norm": 7.793977737426758, + "learning_rate": 9.62176236809648e-06, + "loss": 0.4417, + "step": 3511 + }, + { + "epoch": 0.04763971785132935, + "grad_norm": 6.659328937530518, + "learning_rate": 9.621625325476225e-06, + "loss": 0.4144, + "step": 3512 + }, + { + "epoch": 0.047653282691264245, + "grad_norm": 8.679649353027344, + "learning_rate": 9.621488282855968e-06, + "loss": 0.3923, + "step": 3513 + }, + { + "epoch": 0.04766684753119913, + "grad_norm": 6.4089436531066895, + "learning_rate": 9.621351240235715e-06, + "loss": 0.3324, + "step": 3514 + }, + { + "epoch": 0.04768041237113402, + "grad_norm": 8.555294036865234, + "learning_rate": 9.62121419761546e-06, + "loss": 0.4475, + "step": 3515 + }, + { + "epoch": 0.04769397721106891, + "grad_norm": 8.848621368408203, + "learning_rate": 9.621077154995204e-06, + "loss": 0.5785, + "step": 3516 + }, + { + "epoch": 0.0477075420510038, + "grad_norm": 7.2898969650268555, + "learning_rate": 9.620940112374949e-06, + "loss": 0.4367, + "step": 3517 + }, + { + "epoch": 0.04772110689093869, + "grad_norm": 7.436378479003906, + "learning_rate": 9.620803069754696e-06, + "loss": 0.4638, + "step": 3518 + }, + { + "epoch": 0.047734671730873576, + "grad_norm": 6.9417572021484375, + "learning_rate": 9.620666027134439e-06, + "loss": 0.4024, + "step": 3519 + }, + { + "epoch": 0.04774823657080846, + "grad_norm": 5.817279815673828, + "learning_rate": 9.620528984514184e-06, + "loss": 0.2581, + "step": 3520 + }, + { + "epoch": 0.047761801410743356, + "grad_norm": 7.701472282409668, + "learning_rate": 9.62039194189393e-06, + "loss": 0.3917, + "step": 3521 + }, + { + "epoch": 0.04777536625067824, + "grad_norm": 9.857784271240234, + "learning_rate": 9.620254899273675e-06, + "loss": 0.5222, + "step": 3522 + }, + { + "epoch": 0.04778893109061313, + "grad_norm": 5.758129119873047, + "learning_rate": 9.62011785665342e-06, + "loss": 0.3188, + "step": 3523 + }, + { + "epoch": 0.04780249593054802, + "grad_norm": 8.073766708374023, + "learning_rate": 9.619980814033165e-06, + "loss": 0.4716, + "step": 3524 + }, + { + "epoch": 0.04781606077048291, + "grad_norm": 7.061603546142578, + "learning_rate": 9.61984377141291e-06, + "loss": 0.4262, + "step": 3525 + }, + { + "epoch": 0.0478296256104178, + "grad_norm": 8.30761432647705, + "learning_rate": 9.619706728792655e-06, + "loss": 0.5658, + "step": 3526 + }, + { + "epoch": 0.047843190450352686, + "grad_norm": 6.51729154586792, + "learning_rate": 9.6195696861724e-06, + "loss": 0.2978, + "step": 3527 + }, + { + "epoch": 0.04785675529028757, + "grad_norm": 6.144100189208984, + "learning_rate": 9.619432643552146e-06, + "loss": 0.3251, + "step": 3528 + }, + { + "epoch": 0.047870320130222466, + "grad_norm": 5.3046064376831055, + "learning_rate": 9.619295600931891e-06, + "loss": 0.3701, + "step": 3529 + }, + { + "epoch": 0.04788388497015735, + "grad_norm": 6.503147125244141, + "learning_rate": 9.619158558311636e-06, + "loss": 0.3537, + "step": 3530 + }, + { + "epoch": 0.04789744981009224, + "grad_norm": 6.1173176765441895, + "learning_rate": 9.619021515691381e-06, + "loss": 0.3475, + "step": 3531 + }, + { + "epoch": 0.04791101465002713, + "grad_norm": 8.000227928161621, + "learning_rate": 9.618884473071126e-06, + "loss": 0.4106, + "step": 3532 + }, + { + "epoch": 0.04792457948996202, + "grad_norm": 6.4995622634887695, + "learning_rate": 9.618747430450872e-06, + "loss": 0.49, + "step": 3533 + }, + { + "epoch": 0.04793814432989691, + "grad_norm": 5.868189811706543, + "learning_rate": 9.618610387830615e-06, + "loss": 0.3733, + "step": 3534 + }, + { + "epoch": 0.0479517091698318, + "grad_norm": 8.012077331542969, + "learning_rate": 9.61847334521036e-06, + "loss": 0.5749, + "step": 3535 + }, + { + "epoch": 0.04796527400976668, + "grad_norm": 8.922175407409668, + "learning_rate": 9.618336302590107e-06, + "loss": 0.4679, + "step": 3536 + }, + { + "epoch": 0.047978838849701576, + "grad_norm": 7.27028226852417, + "learning_rate": 9.61819925996985e-06, + "loss": 0.3945, + "step": 3537 + }, + { + "epoch": 0.04799240368963646, + "grad_norm": 7.525894641876221, + "learning_rate": 9.618062217349596e-06, + "loss": 0.4702, + "step": 3538 + }, + { + "epoch": 0.04800596852957135, + "grad_norm": 6.440993309020996, + "learning_rate": 9.617925174729341e-06, + "loss": 0.4584, + "step": 3539 + }, + { + "epoch": 0.04801953336950624, + "grad_norm": 7.947900772094727, + "learning_rate": 9.617788132109088e-06, + "loss": 0.6253, + "step": 3540 + }, + { + "epoch": 0.04803309820944113, + "grad_norm": 7.3829216957092285, + "learning_rate": 9.617651089488831e-06, + "loss": 0.4524, + "step": 3541 + }, + { + "epoch": 0.04804666304937602, + "grad_norm": 6.343807220458984, + "learning_rate": 9.617514046868577e-06, + "loss": 0.4988, + "step": 3542 + }, + { + "epoch": 0.04806022788931091, + "grad_norm": 6.1715521812438965, + "learning_rate": 9.617377004248322e-06, + "loss": 0.3523, + "step": 3543 + }, + { + "epoch": 0.04807379272924579, + "grad_norm": 5.150954246520996, + "learning_rate": 9.617239961628067e-06, + "loss": 0.3579, + "step": 3544 + }, + { + "epoch": 0.048087357569180686, + "grad_norm": 5.671821594238281, + "learning_rate": 9.617102919007812e-06, + "loss": 0.2751, + "step": 3545 + }, + { + "epoch": 0.04810092240911557, + "grad_norm": 7.864580154418945, + "learning_rate": 9.616965876387557e-06, + "loss": 0.3646, + "step": 3546 + }, + { + "epoch": 0.04811448724905046, + "grad_norm": 7.008161544799805, + "learning_rate": 9.616828833767302e-06, + "loss": 0.4074, + "step": 3547 + }, + { + "epoch": 0.04812805208898535, + "grad_norm": 6.608465671539307, + "learning_rate": 9.616691791147048e-06, + "loss": 0.504, + "step": 3548 + }, + { + "epoch": 0.04814161692892024, + "grad_norm": 6.499599933624268, + "learning_rate": 9.616554748526793e-06, + "loss": 0.3558, + "step": 3549 + }, + { + "epoch": 0.04815518176885513, + "grad_norm": 7.817872524261475, + "learning_rate": 9.616417705906538e-06, + "loss": 0.4332, + "step": 3550 + }, + { + "epoch": 0.04816874660879002, + "grad_norm": 5.539080619812012, + "learning_rate": 9.616280663286283e-06, + "loss": 0.4154, + "step": 3551 + }, + { + "epoch": 0.0481823114487249, + "grad_norm": 7.684335231781006, + "learning_rate": 9.616143620666027e-06, + "loss": 0.3175, + "step": 3552 + }, + { + "epoch": 0.048195876288659796, + "grad_norm": 5.962637424468994, + "learning_rate": 9.616006578045774e-06, + "loss": 0.3298, + "step": 3553 + }, + { + "epoch": 0.04820944112859468, + "grad_norm": 12.836592674255371, + "learning_rate": 9.615869535425519e-06, + "loss": 0.481, + "step": 3554 + }, + { + "epoch": 0.04822300596852957, + "grad_norm": 7.59215784072876, + "learning_rate": 9.615732492805264e-06, + "loss": 0.4039, + "step": 3555 + }, + { + "epoch": 0.04823657080846446, + "grad_norm": 8.197778701782227, + "learning_rate": 9.615595450185007e-06, + "loss": 0.4606, + "step": 3556 + }, + { + "epoch": 0.04825013564839935, + "grad_norm": 7.668256759643555, + "learning_rate": 9.615458407564754e-06, + "loss": 0.334, + "step": 3557 + }, + { + "epoch": 0.04826370048833424, + "grad_norm": 9.083210945129395, + "learning_rate": 9.6153213649445e-06, + "loss": 0.4652, + "step": 3558 + }, + { + "epoch": 0.04827726532826913, + "grad_norm": 6.373831748962402, + "learning_rate": 9.615184322324243e-06, + "loss": 0.3049, + "step": 3559 + }, + { + "epoch": 0.04829083016820401, + "grad_norm": 7.783139705657959, + "learning_rate": 9.615047279703988e-06, + "loss": 0.4005, + "step": 3560 + }, + { + "epoch": 0.048304395008138906, + "grad_norm": 7.902537822723389, + "learning_rate": 9.614910237083733e-06, + "loss": 0.4502, + "step": 3561 + }, + { + "epoch": 0.04831795984807379, + "grad_norm": 6.235898971557617, + "learning_rate": 9.614773194463478e-06, + "loss": 0.4242, + "step": 3562 + }, + { + "epoch": 0.04833152468800868, + "grad_norm": 8.770342826843262, + "learning_rate": 9.614636151843224e-06, + "loss": 0.4503, + "step": 3563 + }, + { + "epoch": 0.04834508952794357, + "grad_norm": 5.863247394561768, + "learning_rate": 9.614499109222969e-06, + "loss": 0.3367, + "step": 3564 + }, + { + "epoch": 0.04835865436787846, + "grad_norm": 8.953009605407715, + "learning_rate": 9.614362066602714e-06, + "loss": 0.5698, + "step": 3565 + }, + { + "epoch": 0.04837221920781335, + "grad_norm": 8.925472259521484, + "learning_rate": 9.614225023982459e-06, + "loss": 0.4812, + "step": 3566 + }, + { + "epoch": 0.04838578404774824, + "grad_norm": 8.734583854675293, + "learning_rate": 9.614087981362204e-06, + "loss": 0.5386, + "step": 3567 + }, + { + "epoch": 0.04839934888768312, + "grad_norm": 9.682311058044434, + "learning_rate": 9.61395093874195e-06, + "loss": 0.5371, + "step": 3568 + }, + { + "epoch": 0.048412913727618016, + "grad_norm": 5.587878704071045, + "learning_rate": 9.613813896121695e-06, + "loss": 0.2681, + "step": 3569 + }, + { + "epoch": 0.0484264785675529, + "grad_norm": 13.070895195007324, + "learning_rate": 9.61367685350144e-06, + "loss": 0.6073, + "step": 3570 + }, + { + "epoch": 0.04844004340748779, + "grad_norm": 6.551370620727539, + "learning_rate": 9.613539810881185e-06, + "loss": 0.4243, + "step": 3571 + }, + { + "epoch": 0.04845360824742268, + "grad_norm": 9.988333702087402, + "learning_rate": 9.61340276826093e-06, + "loss": 0.6081, + "step": 3572 + }, + { + "epoch": 0.04846717308735757, + "grad_norm": 9.502095222473145, + "learning_rate": 9.613265725640675e-06, + "loss": 0.4635, + "step": 3573 + }, + { + "epoch": 0.04848073792729246, + "grad_norm": 6.666900634765625, + "learning_rate": 9.61312868302042e-06, + "loss": 0.4655, + "step": 3574 + }, + { + "epoch": 0.04849430276722735, + "grad_norm": 9.736672401428223, + "learning_rate": 9.612991640400166e-06, + "loss": 0.5056, + "step": 3575 + }, + { + "epoch": 0.04850786760716223, + "grad_norm": 10.371814727783203, + "learning_rate": 9.612854597779911e-06, + "loss": 0.6686, + "step": 3576 + }, + { + "epoch": 0.048521432447097126, + "grad_norm": 9.954181671142578, + "learning_rate": 9.612717555159654e-06, + "loss": 0.6036, + "step": 3577 + }, + { + "epoch": 0.04853499728703201, + "grad_norm": 9.214134216308594, + "learning_rate": 9.6125805125394e-06, + "loss": 0.5464, + "step": 3578 + }, + { + "epoch": 0.0485485621269669, + "grad_norm": 8.04288101196289, + "learning_rate": 9.612443469919146e-06, + "loss": 0.4741, + "step": 3579 + }, + { + "epoch": 0.04856212696690179, + "grad_norm": 6.793047904968262, + "learning_rate": 9.612306427298892e-06, + "loss": 0.3518, + "step": 3580 + }, + { + "epoch": 0.04857569180683668, + "grad_norm": 7.17483377456665, + "learning_rate": 9.612169384678635e-06, + "loss": 0.5367, + "step": 3581 + }, + { + "epoch": 0.04858925664677157, + "grad_norm": 7.5287957191467285, + "learning_rate": 9.61203234205838e-06, + "loss": 0.5138, + "step": 3582 + }, + { + "epoch": 0.04860282148670646, + "grad_norm": 7.893366813659668, + "learning_rate": 9.611895299438127e-06, + "loss": 0.6222, + "step": 3583 + }, + { + "epoch": 0.04861638632664134, + "grad_norm": 7.582451820373535, + "learning_rate": 9.61175825681787e-06, + "loss": 0.3718, + "step": 3584 + }, + { + "epoch": 0.04862995116657624, + "grad_norm": 6.75073766708374, + "learning_rate": 9.611621214197616e-06, + "loss": 0.4157, + "step": 3585 + }, + { + "epoch": 0.04864351600651112, + "grad_norm": 7.466477870941162, + "learning_rate": 9.611484171577361e-06, + "loss": 0.5135, + "step": 3586 + }, + { + "epoch": 0.04865708084644601, + "grad_norm": 6.721928119659424, + "learning_rate": 9.611347128957106e-06, + "loss": 0.4023, + "step": 3587 + }, + { + "epoch": 0.0486706456863809, + "grad_norm": 8.389557838439941, + "learning_rate": 9.611210086336851e-06, + "loss": 0.562, + "step": 3588 + }, + { + "epoch": 0.04868421052631579, + "grad_norm": 8.539018630981445, + "learning_rate": 9.611073043716597e-06, + "loss": 0.3572, + "step": 3589 + }, + { + "epoch": 0.04869777536625068, + "grad_norm": 8.666969299316406, + "learning_rate": 9.610936001096342e-06, + "loss": 0.6087, + "step": 3590 + }, + { + "epoch": 0.04871134020618557, + "grad_norm": 7.997934341430664, + "learning_rate": 9.610798958476087e-06, + "loss": 0.6246, + "step": 3591 + }, + { + "epoch": 0.048724905046120454, + "grad_norm": 5.703499794006348, + "learning_rate": 9.610661915855832e-06, + "loss": 0.465, + "step": 3592 + }, + { + "epoch": 0.04873846988605535, + "grad_norm": 7.104333877563477, + "learning_rate": 9.610524873235577e-06, + "loss": 0.4973, + "step": 3593 + }, + { + "epoch": 0.04875203472599023, + "grad_norm": 10.022735595703125, + "learning_rate": 9.610387830615322e-06, + "loss": 0.5613, + "step": 3594 + }, + { + "epoch": 0.04876559956592512, + "grad_norm": 9.057912826538086, + "learning_rate": 9.610250787995068e-06, + "loss": 0.4507, + "step": 3595 + }, + { + "epoch": 0.04877916440586001, + "grad_norm": 7.461339473724365, + "learning_rate": 9.610113745374813e-06, + "loss": 0.4029, + "step": 3596 + }, + { + "epoch": 0.0487927292457949, + "grad_norm": 8.977487564086914, + "learning_rate": 9.609976702754558e-06, + "loss": 0.6412, + "step": 3597 + }, + { + "epoch": 0.04880629408572979, + "grad_norm": 8.880142211914062, + "learning_rate": 9.609839660134303e-06, + "loss": 0.4544, + "step": 3598 + }, + { + "epoch": 0.04881985892566468, + "grad_norm": 8.0486478805542, + "learning_rate": 9.609702617514047e-06, + "loss": 0.432, + "step": 3599 + }, + { + "epoch": 0.048833423765599564, + "grad_norm": 8.573427200317383, + "learning_rate": 9.609565574893794e-06, + "loss": 0.5216, + "step": 3600 + }, + { + "epoch": 0.04884698860553446, + "grad_norm": 9.192455291748047, + "learning_rate": 9.609428532273539e-06, + "loss": 0.706, + "step": 3601 + }, + { + "epoch": 0.04886055344546934, + "grad_norm": 8.612578392028809, + "learning_rate": 9.609291489653282e-06, + "loss": 0.6375, + "step": 3602 + }, + { + "epoch": 0.04887411828540423, + "grad_norm": 10.316327095031738, + "learning_rate": 9.609154447033027e-06, + "loss": 0.6691, + "step": 3603 + }, + { + "epoch": 0.04888768312533912, + "grad_norm": 7.199194431304932, + "learning_rate": 9.609017404412773e-06, + "loss": 0.5024, + "step": 3604 + }, + { + "epoch": 0.04890124796527401, + "grad_norm": 7.803562164306641, + "learning_rate": 9.60888036179252e-06, + "loss": 0.4973, + "step": 3605 + }, + { + "epoch": 0.0489148128052089, + "grad_norm": 7.304722785949707, + "learning_rate": 9.608743319172263e-06, + "loss": 0.462, + "step": 3606 + }, + { + "epoch": 0.04892837764514379, + "grad_norm": 6.573477745056152, + "learning_rate": 9.608606276552008e-06, + "loss": 0.4262, + "step": 3607 + }, + { + "epoch": 0.048941942485078674, + "grad_norm": 5.6171112060546875, + "learning_rate": 9.608469233931753e-06, + "loss": 0.4362, + "step": 3608 + }, + { + "epoch": 0.04895550732501357, + "grad_norm": 5.924472808837891, + "learning_rate": 9.608332191311498e-06, + "loss": 0.3588, + "step": 3609 + }, + { + "epoch": 0.04896907216494845, + "grad_norm": 7.795853614807129, + "learning_rate": 9.608195148691244e-06, + "loss": 0.5041, + "step": 3610 + }, + { + "epoch": 0.04898263700488334, + "grad_norm": 7.356218338012695, + "learning_rate": 9.608058106070989e-06, + "loss": 0.4601, + "step": 3611 + }, + { + "epoch": 0.04899620184481823, + "grad_norm": 8.125935554504395, + "learning_rate": 9.607921063450734e-06, + "loss": 0.6058, + "step": 3612 + }, + { + "epoch": 0.04900976668475312, + "grad_norm": 6.391265392303467, + "learning_rate": 9.60778402083048e-06, + "loss": 0.4136, + "step": 3613 + }, + { + "epoch": 0.04902333152468801, + "grad_norm": 6.567355155944824, + "learning_rate": 9.607646978210224e-06, + "loss": 0.419, + "step": 3614 + }, + { + "epoch": 0.0490368963646229, + "grad_norm": 6.011651039123535, + "learning_rate": 9.60750993558997e-06, + "loss": 0.4984, + "step": 3615 + }, + { + "epoch": 0.049050461204557784, + "grad_norm": 7.84192419052124, + "learning_rate": 9.607372892969715e-06, + "loss": 0.4528, + "step": 3616 + }, + { + "epoch": 0.04906402604449268, + "grad_norm": 6.289186954498291, + "learning_rate": 9.607235850349458e-06, + "loss": 0.4001, + "step": 3617 + }, + { + "epoch": 0.04907759088442756, + "grad_norm": 8.025135040283203, + "learning_rate": 9.607098807729205e-06, + "loss": 0.481, + "step": 3618 + }, + { + "epoch": 0.04909115572436245, + "grad_norm": 7.104740619659424, + "learning_rate": 9.60696176510895e-06, + "loss": 0.4878, + "step": 3619 + }, + { + "epoch": 0.04910472056429734, + "grad_norm": 4.802778720855713, + "learning_rate": 9.606824722488695e-06, + "loss": 0.2958, + "step": 3620 + }, + { + "epoch": 0.04911828540423223, + "grad_norm": 10.057589530944824, + "learning_rate": 9.606687679868439e-06, + "loss": 0.5762, + "step": 3621 + }, + { + "epoch": 0.04913185024416712, + "grad_norm": 6.556323051452637, + "learning_rate": 9.606550637248186e-06, + "loss": 0.5157, + "step": 3622 + }, + { + "epoch": 0.04914541508410201, + "grad_norm": 8.838424682617188, + "learning_rate": 9.606413594627931e-06, + "loss": 0.531, + "step": 3623 + }, + { + "epoch": 0.049158979924036894, + "grad_norm": 5.235262870788574, + "learning_rate": 9.606276552007674e-06, + "loss": 0.3372, + "step": 3624 + }, + { + "epoch": 0.04917254476397179, + "grad_norm": 7.009055137634277, + "learning_rate": 9.60613950938742e-06, + "loss": 0.4278, + "step": 3625 + }, + { + "epoch": 0.04918610960390667, + "grad_norm": 7.5201287269592285, + "learning_rate": 9.606002466767166e-06, + "loss": 0.5017, + "step": 3626 + }, + { + "epoch": 0.04919967444384156, + "grad_norm": 7.897444725036621, + "learning_rate": 9.60586542414691e-06, + "loss": 0.3544, + "step": 3627 + }, + { + "epoch": 0.04921323928377645, + "grad_norm": 5.746769905090332, + "learning_rate": 9.605728381526655e-06, + "loss": 0.4308, + "step": 3628 + }, + { + "epoch": 0.04922680412371134, + "grad_norm": 8.231696128845215, + "learning_rate": 9.6055913389064e-06, + "loss": 0.4541, + "step": 3629 + }, + { + "epoch": 0.04924036896364623, + "grad_norm": 6.551959991455078, + "learning_rate": 9.605454296286146e-06, + "loss": 0.4135, + "step": 3630 + }, + { + "epoch": 0.04925393380358112, + "grad_norm": 8.27342700958252, + "learning_rate": 9.60531725366589e-06, + "loss": 0.4599, + "step": 3631 + }, + { + "epoch": 0.049267498643516004, + "grad_norm": 6.334101676940918, + "learning_rate": 9.605180211045636e-06, + "loss": 0.4089, + "step": 3632 + }, + { + "epoch": 0.0492810634834509, + "grad_norm": 4.415005683898926, + "learning_rate": 9.605043168425381e-06, + "loss": 0.2597, + "step": 3633 + }, + { + "epoch": 0.049294628323385783, + "grad_norm": 7.679765224456787, + "learning_rate": 9.604906125805126e-06, + "loss": 0.4174, + "step": 3634 + }, + { + "epoch": 0.04930819316332067, + "grad_norm": 8.780059814453125, + "learning_rate": 9.604769083184871e-06, + "loss": 0.4556, + "step": 3635 + }, + { + "epoch": 0.04932175800325556, + "grad_norm": 8.54918384552002, + "learning_rate": 9.604632040564617e-06, + "loss": 0.4278, + "step": 3636 + }, + { + "epoch": 0.04933532284319045, + "grad_norm": 7.927409648895264, + "learning_rate": 9.604494997944362e-06, + "loss": 0.5148, + "step": 3637 + }, + { + "epoch": 0.04934888768312534, + "grad_norm": 8.835894584655762, + "learning_rate": 9.604357955324107e-06, + "loss": 0.5296, + "step": 3638 + }, + { + "epoch": 0.04936245252306023, + "grad_norm": 10.383392333984375, + "learning_rate": 9.604220912703852e-06, + "loss": 0.504, + "step": 3639 + }, + { + "epoch": 0.049376017362995114, + "grad_norm": 7.870391845703125, + "learning_rate": 9.604083870083597e-06, + "loss": 0.3981, + "step": 3640 + }, + { + "epoch": 0.04938958220293001, + "grad_norm": 7.373791217803955, + "learning_rate": 9.603946827463342e-06, + "loss": 0.4299, + "step": 3641 + }, + { + "epoch": 0.049403147042864894, + "grad_norm": 9.007353782653809, + "learning_rate": 9.603809784843086e-06, + "loss": 0.5603, + "step": 3642 + }, + { + "epoch": 0.04941671188279978, + "grad_norm": 8.766777992248535, + "learning_rate": 9.603672742222833e-06, + "loss": 0.5023, + "step": 3643 + }, + { + "epoch": 0.04943027672273467, + "grad_norm": 9.241238594055176, + "learning_rate": 9.603535699602578e-06, + "loss": 0.6085, + "step": 3644 + }, + { + "epoch": 0.04944384156266956, + "grad_norm": 7.486536502838135, + "learning_rate": 9.603398656982322e-06, + "loss": 0.495, + "step": 3645 + }, + { + "epoch": 0.04945740640260445, + "grad_norm": 6.900058746337891, + "learning_rate": 9.603261614362067e-06, + "loss": 0.4825, + "step": 3646 + }, + { + "epoch": 0.04947097124253934, + "grad_norm": 5.202308177947998, + "learning_rate": 9.603124571741812e-06, + "loss": 0.4335, + "step": 3647 + }, + { + "epoch": 0.049484536082474224, + "grad_norm": 8.208542823791504, + "learning_rate": 9.602987529121559e-06, + "loss": 0.5325, + "step": 3648 + }, + { + "epoch": 0.04949810092240912, + "grad_norm": 11.572811126708984, + "learning_rate": 9.602850486501302e-06, + "loss": 0.8227, + "step": 3649 + }, + { + "epoch": 0.049511665762344004, + "grad_norm": 6.1820969581604, + "learning_rate": 9.602713443881047e-06, + "loss": 0.481, + "step": 3650 + }, + { + "epoch": 0.04952523060227889, + "grad_norm": 7.024624347686768, + "learning_rate": 9.602576401260793e-06, + "loss": 0.4764, + "step": 3651 + }, + { + "epoch": 0.04953879544221378, + "grad_norm": 5.415053367614746, + "learning_rate": 9.602439358640538e-06, + "loss": 0.4297, + "step": 3652 + }, + { + "epoch": 0.04955236028214867, + "grad_norm": 6.229431629180908, + "learning_rate": 9.602302316020283e-06, + "loss": 0.3813, + "step": 3653 + }, + { + "epoch": 0.04956592512208356, + "grad_norm": 6.590900897979736, + "learning_rate": 9.602165273400028e-06, + "loss": 0.6163, + "step": 3654 + }, + { + "epoch": 0.04957948996201845, + "grad_norm": 7.224055767059326, + "learning_rate": 9.602028230779773e-06, + "loss": 0.396, + "step": 3655 + }, + { + "epoch": 0.049593054801953335, + "grad_norm": 5.9420485496521, + "learning_rate": 9.601891188159518e-06, + "loss": 0.3427, + "step": 3656 + }, + { + "epoch": 0.04960661964188823, + "grad_norm": 6.9201250076293945, + "learning_rate": 9.601754145539264e-06, + "loss": 0.3949, + "step": 3657 + }, + { + "epoch": 0.049620184481823114, + "grad_norm": 7.129007816314697, + "learning_rate": 9.601617102919009e-06, + "loss": 0.563, + "step": 3658 + }, + { + "epoch": 0.049633749321758, + "grad_norm": 7.4146013259887695, + "learning_rate": 9.601480060298754e-06, + "loss": 0.483, + "step": 3659 + }, + { + "epoch": 0.04964731416169289, + "grad_norm": 6.436783313751221, + "learning_rate": 9.601343017678497e-06, + "loss": 0.3849, + "step": 3660 + }, + { + "epoch": 0.04966087900162778, + "grad_norm": 7.591388702392578, + "learning_rate": 9.601205975058244e-06, + "loss": 0.4444, + "step": 3661 + }, + { + "epoch": 0.04967444384156267, + "grad_norm": 7.300603866577148, + "learning_rate": 9.60106893243799e-06, + "loss": 0.5952, + "step": 3662 + }, + { + "epoch": 0.04968800868149756, + "grad_norm": 8.756918907165527, + "learning_rate": 9.600931889817735e-06, + "loss": 0.5291, + "step": 3663 + }, + { + "epoch": 0.049701573521432445, + "grad_norm": 6.562898635864258, + "learning_rate": 9.600794847197478e-06, + "loss": 0.4047, + "step": 3664 + }, + { + "epoch": 0.04971513836136734, + "grad_norm": 7.406138896942139, + "learning_rate": 9.600657804577225e-06, + "loss": 0.5202, + "step": 3665 + }, + { + "epoch": 0.049728703201302224, + "grad_norm": 8.242363929748535, + "learning_rate": 9.60052076195697e-06, + "loss": 0.6051, + "step": 3666 + }, + { + "epoch": 0.04974226804123711, + "grad_norm": 8.48302936553955, + "learning_rate": 9.600383719336714e-06, + "loss": 0.5696, + "step": 3667 + }, + { + "epoch": 0.049755832881172, + "grad_norm": 7.773468017578125, + "learning_rate": 9.600246676716459e-06, + "loss": 0.4201, + "step": 3668 + }, + { + "epoch": 0.04976939772110689, + "grad_norm": 6.196767330169678, + "learning_rate": 9.600109634096206e-06, + "loss": 0.5095, + "step": 3669 + }, + { + "epoch": 0.04978296256104178, + "grad_norm": 8.761529922485352, + "learning_rate": 9.59997259147595e-06, + "loss": 0.5945, + "step": 3670 + }, + { + "epoch": 0.04979652740097667, + "grad_norm": 6.655881881713867, + "learning_rate": 9.599835548855694e-06, + "loss": 0.5686, + "step": 3671 + }, + { + "epoch": 0.049810092240911555, + "grad_norm": 11.023362159729004, + "learning_rate": 9.59969850623544e-06, + "loss": 0.6108, + "step": 3672 + }, + { + "epoch": 0.04982365708084645, + "grad_norm": 5.632102012634277, + "learning_rate": 9.599561463615185e-06, + "loss": 0.403, + "step": 3673 + }, + { + "epoch": 0.049837221920781334, + "grad_norm": 5.826066493988037, + "learning_rate": 9.59942442099493e-06, + "loss": 0.6524, + "step": 3674 + }, + { + "epoch": 0.04985078676071622, + "grad_norm": 6.96673583984375, + "learning_rate": 9.599287378374675e-06, + "loss": 0.4092, + "step": 3675 + }, + { + "epoch": 0.04986435160065111, + "grad_norm": 6.401939868927002, + "learning_rate": 9.59915033575442e-06, + "loss": 0.4968, + "step": 3676 + }, + { + "epoch": 0.049877916440586, + "grad_norm": 7.192183494567871, + "learning_rate": 9.599013293134166e-06, + "loss": 0.5048, + "step": 3677 + }, + { + "epoch": 0.04989148128052089, + "grad_norm": 8.71895980834961, + "learning_rate": 9.59887625051391e-06, + "loss": 0.5323, + "step": 3678 + }, + { + "epoch": 0.04990504612045578, + "grad_norm": 5.36781644821167, + "learning_rate": 9.598739207893656e-06, + "loss": 0.3895, + "step": 3679 + }, + { + "epoch": 0.049918610960390665, + "grad_norm": 9.089791297912598, + "learning_rate": 9.598602165273401e-06, + "loss": 0.6048, + "step": 3680 + }, + { + "epoch": 0.04993217580032556, + "grad_norm": 8.7039794921875, + "learning_rate": 9.598465122653146e-06, + "loss": 0.5402, + "step": 3681 + }, + { + "epoch": 0.049945740640260444, + "grad_norm": 7.043251037597656, + "learning_rate": 9.598328080032891e-06, + "loss": 0.4585, + "step": 3682 + }, + { + "epoch": 0.04995930548019533, + "grad_norm": 7.504213809967041, + "learning_rate": 9.598191037412637e-06, + "loss": 0.4944, + "step": 3683 + }, + { + "epoch": 0.049972870320130224, + "grad_norm": 6.896026611328125, + "learning_rate": 9.598053994792382e-06, + "loss": 0.5107, + "step": 3684 + }, + { + "epoch": 0.04998643516006511, + "grad_norm": 8.686966896057129, + "learning_rate": 9.597916952172125e-06, + "loss": 0.4529, + "step": 3685 + }, + { + "epoch": 0.05, + "grad_norm": 9.331398010253906, + "learning_rate": 9.59777990955187e-06, + "loss": 0.5426, + "step": 3686 + }, + { + "epoch": 0.05, + "eval_loss": 0.43753519654273987, + "eval_noise_accuracy": NaN, + "eval_runtime": 4756.5475, + "eval_samples_per_second": 1.056, + "eval_steps_per_second": 0.066, + "eval_wer": 41.67773624540484, + "step": 3686 + }, + { + "epoch": 0.05001356483993489, + "grad_norm": 6.7122392654418945, + "learning_rate": 9.597642866931617e-06, + "loss": 0.5069, + "step": 3687 + }, + { + "epoch": 0.050027129679869775, + "grad_norm": 7.5660929679870605, + "learning_rate": 9.597505824311362e-06, + "loss": 0.4616, + "step": 3688 + }, + { + "epoch": 0.05004069451980467, + "grad_norm": 7.717869281768799, + "learning_rate": 9.597368781691106e-06, + "loss": 0.6139, + "step": 3689 + }, + { + "epoch": 0.050054259359739554, + "grad_norm": 7.52753210067749, + "learning_rate": 9.597231739070851e-06, + "loss": 0.5217, + "step": 3690 + }, + { + "epoch": 0.05006782419967444, + "grad_norm": 8.310872077941895, + "learning_rate": 9.597094696450598e-06, + "loss": 0.5142, + "step": 3691 + }, + { + "epoch": 0.050081389039609334, + "grad_norm": 8.297338485717773, + "learning_rate": 9.596957653830342e-06, + "loss": 0.5885, + "step": 3692 + }, + { + "epoch": 0.05009495387954422, + "grad_norm": 9.738335609436035, + "learning_rate": 9.596820611210087e-06, + "loss": 0.5931, + "step": 3693 + }, + { + "epoch": 0.05010851871947911, + "grad_norm": 5.190647125244141, + "learning_rate": 9.596683568589832e-06, + "loss": 0.3563, + "step": 3694 + }, + { + "epoch": 0.050122083559414, + "grad_norm": 7.0557942390441895, + "learning_rate": 9.596546525969577e-06, + "loss": 0.463, + "step": 3695 + }, + { + "epoch": 0.050135648399348885, + "grad_norm": 8.027969360351562, + "learning_rate": 9.596409483349322e-06, + "loss": 0.4809, + "step": 3696 + }, + { + "epoch": 0.05014921323928378, + "grad_norm": 7.027260780334473, + "learning_rate": 9.596272440729067e-06, + "loss": 0.6049, + "step": 3697 + }, + { + "epoch": 0.050162778079218664, + "grad_norm": 7.350149631500244, + "learning_rate": 9.596135398108813e-06, + "loss": 0.4826, + "step": 3698 + }, + { + "epoch": 0.05017634291915355, + "grad_norm": 8.177252769470215, + "learning_rate": 9.595998355488558e-06, + "loss": 0.4714, + "step": 3699 + }, + { + "epoch": 0.050189907759088444, + "grad_norm": 6.820711135864258, + "learning_rate": 9.595861312868303e-06, + "loss": 0.5061, + "step": 3700 + }, + { + "epoch": 0.05020347259902333, + "grad_norm": 8.725363731384277, + "learning_rate": 9.595724270248048e-06, + "loss": 0.3873, + "step": 3701 + }, + { + "epoch": 0.05021703743895822, + "grad_norm": 7.0893120765686035, + "learning_rate": 9.595587227627793e-06, + "loss": 0.3996, + "step": 3702 + }, + { + "epoch": 0.05023060227889311, + "grad_norm": 7.231411933898926, + "learning_rate": 9.595450185007538e-06, + "loss": 0.5074, + "step": 3703 + }, + { + "epoch": 0.050244167118827995, + "grad_norm": 8.912978172302246, + "learning_rate": 9.595313142387284e-06, + "loss": 0.5896, + "step": 3704 + }, + { + "epoch": 0.05025773195876289, + "grad_norm": 8.96824836730957, + "learning_rate": 9.595176099767029e-06, + "loss": 0.4685, + "step": 3705 + }, + { + "epoch": 0.050271296798697775, + "grad_norm": 6.143014430999756, + "learning_rate": 9.595039057146774e-06, + "loss": 0.4502, + "step": 3706 + }, + { + "epoch": 0.05028486163863266, + "grad_norm": 7.472655773162842, + "learning_rate": 9.594902014526518e-06, + "loss": 0.5547, + "step": 3707 + }, + { + "epoch": 0.050298426478567554, + "grad_norm": 8.069610595703125, + "learning_rate": 9.594764971906264e-06, + "loss": 0.4845, + "step": 3708 + }, + { + "epoch": 0.05031199131850244, + "grad_norm": 9.561776161193848, + "learning_rate": 9.59462792928601e-06, + "loss": 0.5282, + "step": 3709 + }, + { + "epoch": 0.05032555615843733, + "grad_norm": 8.152968406677246, + "learning_rate": 9.594490886665753e-06, + "loss": 0.5347, + "step": 3710 + }, + { + "epoch": 0.05033912099837222, + "grad_norm": 8.85181999206543, + "learning_rate": 9.594353844045498e-06, + "loss": 0.5666, + "step": 3711 + }, + { + "epoch": 0.050352685838307105, + "grad_norm": 6.785340309143066, + "learning_rate": 9.594216801425245e-06, + "loss": 0.4137, + "step": 3712 + }, + { + "epoch": 0.050366250678242, + "grad_norm": 8.958577156066895, + "learning_rate": 9.594079758804989e-06, + "loss": 0.4982, + "step": 3713 + }, + { + "epoch": 0.050379815518176885, + "grad_norm": 8.481755256652832, + "learning_rate": 9.593942716184734e-06, + "loss": 0.4625, + "step": 3714 + }, + { + "epoch": 0.05039338035811177, + "grad_norm": 7.3545002937316895, + "learning_rate": 9.593805673564479e-06, + "loss": 0.4737, + "step": 3715 + }, + { + "epoch": 0.050406945198046664, + "grad_norm": 6.873860836029053, + "learning_rate": 9.593668630944224e-06, + "loss": 0.4953, + "step": 3716 + }, + { + "epoch": 0.05042051003798155, + "grad_norm": 10.820284843444824, + "learning_rate": 9.59353158832397e-06, + "loss": 0.7145, + "step": 3717 + }, + { + "epoch": 0.05043407487791644, + "grad_norm": 6.231873035430908, + "learning_rate": 9.593394545703714e-06, + "loss": 0.3729, + "step": 3718 + }, + { + "epoch": 0.05044763971785133, + "grad_norm": 6.385499477386475, + "learning_rate": 9.59325750308346e-06, + "loss": 0.5353, + "step": 3719 + }, + { + "epoch": 0.050461204557786216, + "grad_norm": 8.888864517211914, + "learning_rate": 9.593120460463205e-06, + "loss": 0.5297, + "step": 3720 + }, + { + "epoch": 0.05047476939772111, + "grad_norm": 8.265332221984863, + "learning_rate": 9.59298341784295e-06, + "loss": 0.3921, + "step": 3721 + }, + { + "epoch": 0.050488334237655995, + "grad_norm": 7.922281265258789, + "learning_rate": 9.592846375222695e-06, + "loss": 0.4898, + "step": 3722 + }, + { + "epoch": 0.05050189907759088, + "grad_norm": 8.701610565185547, + "learning_rate": 9.59270933260244e-06, + "loss": 0.652, + "step": 3723 + }, + { + "epoch": 0.050515463917525774, + "grad_norm": 6.701283931732178, + "learning_rate": 9.592572289982186e-06, + "loss": 0.4075, + "step": 3724 + }, + { + "epoch": 0.05052902875746066, + "grad_norm": 7.879199504852295, + "learning_rate": 9.59243524736193e-06, + "loss": 0.3801, + "step": 3725 + }, + { + "epoch": 0.05054259359739555, + "grad_norm": 9.325870513916016, + "learning_rate": 9.592298204741676e-06, + "loss": 0.4424, + "step": 3726 + }, + { + "epoch": 0.05055615843733044, + "grad_norm": 6.643549919128418, + "learning_rate": 9.592161162121421e-06, + "loss": 0.4227, + "step": 3727 + }, + { + "epoch": 0.050569723277265326, + "grad_norm": 6.620349884033203, + "learning_rate": 9.592024119501165e-06, + "loss": 0.391, + "step": 3728 + }, + { + "epoch": 0.05058328811720022, + "grad_norm": 8.210638046264648, + "learning_rate": 9.59188707688091e-06, + "loss": 0.5309, + "step": 3729 + }, + { + "epoch": 0.050596852957135105, + "grad_norm": 6.82163667678833, + "learning_rate": 9.591750034260657e-06, + "loss": 0.3929, + "step": 3730 + }, + { + "epoch": 0.05061041779706999, + "grad_norm": 8.124215126037598, + "learning_rate": 9.591612991640402e-06, + "loss": 0.5498, + "step": 3731 + }, + { + "epoch": 0.050623982637004884, + "grad_norm": 8.239236831665039, + "learning_rate": 9.591475949020145e-06, + "loss": 0.4836, + "step": 3732 + }, + { + "epoch": 0.05063754747693977, + "grad_norm": 7.730465412139893, + "learning_rate": 9.59133890639989e-06, + "loss": 0.3917, + "step": 3733 + }, + { + "epoch": 0.050651112316874664, + "grad_norm": 6.638607978820801, + "learning_rate": 9.591201863779637e-06, + "loss": 0.414, + "step": 3734 + }, + { + "epoch": 0.05066467715680955, + "grad_norm": 6.625389575958252, + "learning_rate": 9.59106482115938e-06, + "loss": 0.4849, + "step": 3735 + }, + { + "epoch": 0.050678241996744436, + "grad_norm": 6.315176010131836, + "learning_rate": 9.590927778539126e-06, + "loss": 0.2873, + "step": 3736 + }, + { + "epoch": 0.05069180683667933, + "grad_norm": 7.367959976196289, + "learning_rate": 9.590790735918871e-06, + "loss": 0.4603, + "step": 3737 + }, + { + "epoch": 0.050705371676614215, + "grad_norm": 7.775909900665283, + "learning_rate": 9.590653693298616e-06, + "loss": 0.4793, + "step": 3738 + }, + { + "epoch": 0.0507189365165491, + "grad_norm": 7.127101421356201, + "learning_rate": 9.590516650678362e-06, + "loss": 0.4313, + "step": 3739 + }, + { + "epoch": 0.050732501356483994, + "grad_norm": 6.5652313232421875, + "learning_rate": 9.590379608058107e-06, + "loss": 0.497, + "step": 3740 + }, + { + "epoch": 0.05074606619641888, + "grad_norm": 7.324666500091553, + "learning_rate": 9.590242565437852e-06, + "loss": 0.5262, + "step": 3741 + }, + { + "epoch": 0.050759631036353774, + "grad_norm": 7.609711647033691, + "learning_rate": 9.590105522817597e-06, + "loss": 0.4107, + "step": 3742 + }, + { + "epoch": 0.05077319587628866, + "grad_norm": 7.5244293212890625, + "learning_rate": 9.589968480197342e-06, + "loss": 0.5101, + "step": 3743 + }, + { + "epoch": 0.050786760716223546, + "grad_norm": 8.531950950622559, + "learning_rate": 9.589831437577087e-06, + "loss": 0.5998, + "step": 3744 + }, + { + "epoch": 0.05080032555615844, + "grad_norm": 8.188640594482422, + "learning_rate": 9.589694394956833e-06, + "loss": 0.3713, + "step": 3745 + }, + { + "epoch": 0.050813890396093325, + "grad_norm": 10.112589836120605, + "learning_rate": 9.589557352336578e-06, + "loss": 0.6659, + "step": 3746 + }, + { + "epoch": 0.05082745523602821, + "grad_norm": 7.8087334632873535, + "learning_rate": 9.589420309716323e-06, + "loss": 0.6595, + "step": 3747 + }, + { + "epoch": 0.050841020075963105, + "grad_norm": 5.226222038269043, + "learning_rate": 9.589283267096068e-06, + "loss": 0.3452, + "step": 3748 + }, + { + "epoch": 0.05085458491589799, + "grad_norm": 6.403195381164551, + "learning_rate": 9.589146224475813e-06, + "loss": 0.3837, + "step": 3749 + }, + { + "epoch": 0.050868149755832884, + "grad_norm": 7.171356201171875, + "learning_rate": 9.589009181855557e-06, + "loss": 0.3794, + "step": 3750 + }, + { + "epoch": 0.05088171459576777, + "grad_norm": 7.964108943939209, + "learning_rate": 9.588872139235304e-06, + "loss": 0.4694, + "step": 3751 + }, + { + "epoch": 0.050895279435702656, + "grad_norm": 8.096543312072754, + "learning_rate": 9.588735096615049e-06, + "loss": 0.4054, + "step": 3752 + }, + { + "epoch": 0.05090884427563755, + "grad_norm": 7.584107398986816, + "learning_rate": 9.588598053994792e-06, + "loss": 0.4859, + "step": 3753 + }, + { + "epoch": 0.050922409115572435, + "grad_norm": 8.752081871032715, + "learning_rate": 9.588461011374538e-06, + "loss": 0.5418, + "step": 3754 + }, + { + "epoch": 0.05093597395550733, + "grad_norm": 7.704759120941162, + "learning_rate": 9.588323968754283e-06, + "loss": 0.6092, + "step": 3755 + }, + { + "epoch": 0.050949538795442215, + "grad_norm": 8.972350120544434, + "learning_rate": 9.58818692613403e-06, + "loss": 0.5615, + "step": 3756 + }, + { + "epoch": 0.0509631036353771, + "grad_norm": 6.727298259735107, + "learning_rate": 9.588049883513773e-06, + "loss": 0.3994, + "step": 3757 + }, + { + "epoch": 0.050976668475311994, + "grad_norm": 8.107674598693848, + "learning_rate": 9.587912840893518e-06, + "loss": 0.4999, + "step": 3758 + }, + { + "epoch": 0.05099023331524688, + "grad_norm": 9.259480476379395, + "learning_rate": 9.587775798273263e-06, + "loss": 0.4867, + "step": 3759 + }, + { + "epoch": 0.051003798155181766, + "grad_norm": 4.971227169036865, + "learning_rate": 9.587638755653009e-06, + "loss": 0.3711, + "step": 3760 + }, + { + "epoch": 0.05101736299511666, + "grad_norm": 7.973395347595215, + "learning_rate": 9.587501713032754e-06, + "loss": 0.5522, + "step": 3761 + }, + { + "epoch": 0.051030927835051546, + "grad_norm": 12.826807022094727, + "learning_rate": 9.587364670412499e-06, + "loss": 0.7217, + "step": 3762 + }, + { + "epoch": 0.05104449267498644, + "grad_norm": 7.929468631744385, + "learning_rate": 9.587227627792244e-06, + "loss": 0.4582, + "step": 3763 + }, + { + "epoch": 0.051058057514921325, + "grad_norm": 8.002440452575684, + "learning_rate": 9.58709058517199e-06, + "loss": 0.531, + "step": 3764 + }, + { + "epoch": 0.05107162235485621, + "grad_norm": 6.641905784606934, + "learning_rate": 9.586953542551734e-06, + "loss": 0.4144, + "step": 3765 + }, + { + "epoch": 0.051085187194791104, + "grad_norm": 7.030709743499756, + "learning_rate": 9.58681649993148e-06, + "loss": 0.6072, + "step": 3766 + }, + { + "epoch": 0.05109875203472599, + "grad_norm": 7.124312400817871, + "learning_rate": 9.586679457311225e-06, + "loss": 0.4305, + "step": 3767 + }, + { + "epoch": 0.051112316874660876, + "grad_norm": 8.378652572631836, + "learning_rate": 9.586542414690968e-06, + "loss": 0.603, + "step": 3768 + }, + { + "epoch": 0.05112588171459577, + "grad_norm": 6.848226547241211, + "learning_rate": 9.586405372070715e-06, + "loss": 0.4342, + "step": 3769 + }, + { + "epoch": 0.051139446554530656, + "grad_norm": 6.987203598022461, + "learning_rate": 9.58626832945046e-06, + "loss": 0.4606, + "step": 3770 + }, + { + "epoch": 0.05115301139446555, + "grad_norm": 8.837273597717285, + "learning_rate": 9.586131286830206e-06, + "loss": 0.4935, + "step": 3771 + }, + { + "epoch": 0.051166576234400435, + "grad_norm": 9.074352264404297, + "learning_rate": 9.585994244209949e-06, + "loss": 0.6035, + "step": 3772 + }, + { + "epoch": 0.05118014107433532, + "grad_norm": 7.028131008148193, + "learning_rate": 9.585857201589696e-06, + "loss": 0.4161, + "step": 3773 + }, + { + "epoch": 0.051193705914270214, + "grad_norm": 7.087396144866943, + "learning_rate": 9.585720158969441e-06, + "loss": 0.5152, + "step": 3774 + }, + { + "epoch": 0.0512072707542051, + "grad_norm": 10.531502723693848, + "learning_rate": 9.585583116349185e-06, + "loss": 0.6753, + "step": 3775 + }, + { + "epoch": 0.051220835594139986, + "grad_norm": 7.704401969909668, + "learning_rate": 9.58544607372893e-06, + "loss": 0.4096, + "step": 3776 + }, + { + "epoch": 0.05123440043407488, + "grad_norm": 8.0361909866333, + "learning_rate": 9.585309031108677e-06, + "loss": 0.5907, + "step": 3777 + }, + { + "epoch": 0.051247965274009766, + "grad_norm": 10.513652801513672, + "learning_rate": 9.58517198848842e-06, + "loss": 0.6091, + "step": 3778 + }, + { + "epoch": 0.05126153011394466, + "grad_norm": 7.5839924812316895, + "learning_rate": 9.585034945868165e-06, + "loss": 0.4407, + "step": 3779 + }, + { + "epoch": 0.051275094953879545, + "grad_norm": 6.831429958343506, + "learning_rate": 9.58489790324791e-06, + "loss": 0.4066, + "step": 3780 + }, + { + "epoch": 0.05128865979381443, + "grad_norm": 7.035339832305908, + "learning_rate": 9.584760860627657e-06, + "loss": 0.5911, + "step": 3781 + }, + { + "epoch": 0.051302224633749324, + "grad_norm": 7.46907377243042, + "learning_rate": 9.5846238180074e-06, + "loss": 0.6558, + "step": 3782 + }, + { + "epoch": 0.05131578947368421, + "grad_norm": 7.756247043609619, + "learning_rate": 9.584486775387146e-06, + "loss": 0.593, + "step": 3783 + }, + { + "epoch": 0.0513293543136191, + "grad_norm": 10.033884048461914, + "learning_rate": 9.584349732766891e-06, + "loss": 0.4934, + "step": 3784 + }, + { + "epoch": 0.05134291915355399, + "grad_norm": 10.944955825805664, + "learning_rate": 9.584212690146636e-06, + "loss": 0.5255, + "step": 3785 + }, + { + "epoch": 0.051356483993488876, + "grad_norm": 9.13084602355957, + "learning_rate": 9.584075647526382e-06, + "loss": 0.5628, + "step": 3786 + }, + { + "epoch": 0.05137004883342377, + "grad_norm": 9.45106315612793, + "learning_rate": 9.583938604906127e-06, + "loss": 0.5175, + "step": 3787 + }, + { + "epoch": 0.051383613673358655, + "grad_norm": 8.054713249206543, + "learning_rate": 9.583801562285872e-06, + "loss": 0.4418, + "step": 3788 + }, + { + "epoch": 0.05139717851329354, + "grad_norm": 8.625262260437012, + "learning_rate": 9.583664519665617e-06, + "loss": 0.4193, + "step": 3789 + }, + { + "epoch": 0.051410743353228434, + "grad_norm": 8.937756538391113, + "learning_rate": 9.583527477045362e-06, + "loss": 0.4202, + "step": 3790 + }, + { + "epoch": 0.05142430819316332, + "grad_norm": 6.586557388305664, + "learning_rate": 9.583390434425107e-06, + "loss": 0.4883, + "step": 3791 + }, + { + "epoch": 0.05143787303309821, + "grad_norm": 8.319032669067383, + "learning_rate": 9.583253391804853e-06, + "loss": 0.4562, + "step": 3792 + }, + { + "epoch": 0.0514514378730331, + "grad_norm": 6.256007671356201, + "learning_rate": 9.583116349184596e-06, + "loss": 0.3478, + "step": 3793 + }, + { + "epoch": 0.051465002712967986, + "grad_norm": 6.820131778717041, + "learning_rate": 9.582979306564343e-06, + "loss": 0.4388, + "step": 3794 + }, + { + "epoch": 0.05147856755290288, + "grad_norm": 8.185118675231934, + "learning_rate": 9.582842263944088e-06, + "loss": 0.4496, + "step": 3795 + }, + { + "epoch": 0.051492132392837765, + "grad_norm": 8.640547752380371, + "learning_rate": 9.582705221323833e-06, + "loss": 0.5803, + "step": 3796 + }, + { + "epoch": 0.05150569723277265, + "grad_norm": 9.417855262756348, + "learning_rate": 9.582568178703577e-06, + "loss": 0.7674, + "step": 3797 + }, + { + "epoch": 0.051519262072707545, + "grad_norm": 8.23964786529541, + "learning_rate": 9.582431136083322e-06, + "loss": 0.6034, + "step": 3798 + }, + { + "epoch": 0.05153282691264243, + "grad_norm": 9.109663009643555, + "learning_rate": 9.582294093463069e-06, + "loss": 0.4485, + "step": 3799 + }, + { + "epoch": 0.05154639175257732, + "grad_norm": 6.63129997253418, + "learning_rate": 9.582157050842812e-06, + "loss": 0.4129, + "step": 3800 + }, + { + "epoch": 0.05155995659251221, + "grad_norm": 7.460015773773193, + "learning_rate": 9.582020008222558e-06, + "loss": 0.4844, + "step": 3801 + }, + { + "epoch": 0.051573521432447096, + "grad_norm": 6.602466106414795, + "learning_rate": 9.581882965602303e-06, + "loss": 0.3473, + "step": 3802 + }, + { + "epoch": 0.05158708627238199, + "grad_norm": 8.519508361816406, + "learning_rate": 9.581745922982048e-06, + "loss": 0.5535, + "step": 3803 + }, + { + "epoch": 0.051600651112316875, + "grad_norm": 8.118582725524902, + "learning_rate": 9.581608880361793e-06, + "loss": 0.4916, + "step": 3804 + }, + { + "epoch": 0.05161421595225176, + "grad_norm": 8.1503324508667, + "learning_rate": 9.581471837741538e-06, + "loss": 0.631, + "step": 3805 + }, + { + "epoch": 0.051627780792186655, + "grad_norm": 8.25820255279541, + "learning_rate": 9.581334795121283e-06, + "loss": 0.4733, + "step": 3806 + }, + { + "epoch": 0.05164134563212154, + "grad_norm": 9.07652759552002, + "learning_rate": 9.581197752501029e-06, + "loss": 0.4951, + "step": 3807 + }, + { + "epoch": 0.05165491047205643, + "grad_norm": 8.352213859558105, + "learning_rate": 9.581060709880774e-06, + "loss": 0.453, + "step": 3808 + }, + { + "epoch": 0.05166847531199132, + "grad_norm": 6.899398326873779, + "learning_rate": 9.580923667260519e-06, + "loss": 0.4773, + "step": 3809 + }, + { + "epoch": 0.051682040151926206, + "grad_norm": 9.061166763305664, + "learning_rate": 9.580786624640264e-06, + "loss": 0.5618, + "step": 3810 + }, + { + "epoch": 0.0516956049918611, + "grad_norm": 8.538708686828613, + "learning_rate": 9.58064958202001e-06, + "loss": 0.3961, + "step": 3811 + }, + { + "epoch": 0.051709169831795986, + "grad_norm": 6.286700248718262, + "learning_rate": 9.580512539399755e-06, + "loss": 0.3929, + "step": 3812 + }, + { + "epoch": 0.05172273467173087, + "grad_norm": 7.9972920417785645, + "learning_rate": 9.5803754967795e-06, + "loss": 0.5293, + "step": 3813 + }, + { + "epoch": 0.051736299511665765, + "grad_norm": 8.077980995178223, + "learning_rate": 9.580238454159245e-06, + "loss": 0.583, + "step": 3814 + }, + { + "epoch": 0.05174986435160065, + "grad_norm": 7.699104309082031, + "learning_rate": 9.580101411538988e-06, + "loss": 0.457, + "step": 3815 + }, + { + "epoch": 0.05176342919153554, + "grad_norm": 6.289866924285889, + "learning_rate": 9.579964368918735e-06, + "loss": 0.4121, + "step": 3816 + }, + { + "epoch": 0.05177699403147043, + "grad_norm": 8.228445053100586, + "learning_rate": 9.57982732629848e-06, + "loss": 0.4232, + "step": 3817 + }, + { + "epoch": 0.051790558871405316, + "grad_norm": 8.511027336120605, + "learning_rate": 9.579690283678224e-06, + "loss": 0.4626, + "step": 3818 + }, + { + "epoch": 0.05180412371134021, + "grad_norm": 7.104719161987305, + "learning_rate": 9.579553241057969e-06, + "loss": 0.4399, + "step": 3819 + }, + { + "epoch": 0.051817688551275096, + "grad_norm": 5.88472843170166, + "learning_rate": 9.579416198437716e-06, + "loss": 0.3181, + "step": 3820 + }, + { + "epoch": 0.05183125339120998, + "grad_norm": 7.503424167633057, + "learning_rate": 9.57927915581746e-06, + "loss": 0.3587, + "step": 3821 + }, + { + "epoch": 0.051844818231144875, + "grad_norm": 9.12888240814209, + "learning_rate": 9.579142113197205e-06, + "loss": 0.4486, + "step": 3822 + }, + { + "epoch": 0.05185838307107976, + "grad_norm": 6.266063213348389, + "learning_rate": 9.57900507057695e-06, + "loss": 0.446, + "step": 3823 + }, + { + "epoch": 0.05187194791101465, + "grad_norm": 7.437134265899658, + "learning_rate": 9.578868027956695e-06, + "loss": 0.3388, + "step": 3824 + }, + { + "epoch": 0.05188551275094954, + "grad_norm": 6.743585109710693, + "learning_rate": 9.57873098533644e-06, + "loss": 0.372, + "step": 3825 + }, + { + "epoch": 0.051899077590884427, + "grad_norm": 7.227856159210205, + "learning_rate": 9.578593942716185e-06, + "loss": 0.4172, + "step": 3826 + }, + { + "epoch": 0.05191264243081932, + "grad_norm": 5.692536354064941, + "learning_rate": 9.57845690009593e-06, + "loss": 0.3447, + "step": 3827 + }, + { + "epoch": 0.051926207270754206, + "grad_norm": 9.85918140411377, + "learning_rate": 9.578319857475676e-06, + "loss": 0.5169, + "step": 3828 + }, + { + "epoch": 0.05193977211068909, + "grad_norm": 7.414974212646484, + "learning_rate": 9.578182814855421e-06, + "loss": 0.3753, + "step": 3829 + }, + { + "epoch": 0.051953336950623985, + "grad_norm": 9.893929481506348, + "learning_rate": 9.578045772235166e-06, + "loss": 0.6662, + "step": 3830 + }, + { + "epoch": 0.05196690179055887, + "grad_norm": 6.766547203063965, + "learning_rate": 9.577908729614911e-06, + "loss": 0.3478, + "step": 3831 + }, + { + "epoch": 0.05198046663049376, + "grad_norm": 9.39252758026123, + "learning_rate": 9.577771686994656e-06, + "loss": 0.515, + "step": 3832 + }, + { + "epoch": 0.05199403147042865, + "grad_norm": 6.366465091705322, + "learning_rate": 9.577634644374402e-06, + "loss": 0.4645, + "step": 3833 + }, + { + "epoch": 0.05200759631036354, + "grad_norm": 6.590621471405029, + "learning_rate": 9.577497601754147e-06, + "loss": 0.3883, + "step": 3834 + }, + { + "epoch": 0.05202116115029843, + "grad_norm": 8.485236167907715, + "learning_rate": 9.577360559133892e-06, + "loss": 0.3604, + "step": 3835 + }, + { + "epoch": 0.052034725990233316, + "grad_norm": 7.809135913848877, + "learning_rate": 9.577223516513635e-06, + "loss": 0.5289, + "step": 3836 + }, + { + "epoch": 0.0520482908301682, + "grad_norm": 7.558167934417725, + "learning_rate": 9.57708647389338e-06, + "loss": 0.3767, + "step": 3837 + }, + { + "epoch": 0.052061855670103095, + "grad_norm": 7.706725120544434, + "learning_rate": 9.576949431273127e-06, + "loss": 0.4066, + "step": 3838 + }, + { + "epoch": 0.05207542051003798, + "grad_norm": 6.907974720001221, + "learning_rate": 9.576812388652873e-06, + "loss": 0.4079, + "step": 3839 + }, + { + "epoch": 0.05208898534997287, + "grad_norm": 8.75535774230957, + "learning_rate": 9.576675346032616e-06, + "loss": 0.4198, + "step": 3840 + }, + { + "epoch": 0.05210255018990776, + "grad_norm": 7.31072473526001, + "learning_rate": 9.576538303412361e-06, + "loss": 0.519, + "step": 3841 + }, + { + "epoch": 0.05211611502984265, + "grad_norm": 7.72085428237915, + "learning_rate": 9.576401260792108e-06, + "loss": 0.3595, + "step": 3842 + }, + { + "epoch": 0.05212967986977754, + "grad_norm": 6.243660926818848, + "learning_rate": 9.576264218171852e-06, + "loss": 0.3162, + "step": 3843 + }, + { + "epoch": 0.052143244709712426, + "grad_norm": 7.725009918212891, + "learning_rate": 9.576127175551597e-06, + "loss": 0.4577, + "step": 3844 + }, + { + "epoch": 0.05215680954964731, + "grad_norm": 9.114264488220215, + "learning_rate": 9.575990132931342e-06, + "loss": 0.4628, + "step": 3845 + }, + { + "epoch": 0.052170374389582205, + "grad_norm": 7.071927070617676, + "learning_rate": 9.575853090311087e-06, + "loss": 0.4564, + "step": 3846 + }, + { + "epoch": 0.05218393922951709, + "grad_norm": 7.515510082244873, + "learning_rate": 9.575716047690832e-06, + "loss": 0.5247, + "step": 3847 + }, + { + "epoch": 0.05219750406945198, + "grad_norm": 6.874004364013672, + "learning_rate": 9.575579005070578e-06, + "loss": 0.4607, + "step": 3848 + }, + { + "epoch": 0.05221106890938687, + "grad_norm": 9.744914054870605, + "learning_rate": 9.575441962450323e-06, + "loss": 0.4323, + "step": 3849 + }, + { + "epoch": 0.05222463374932176, + "grad_norm": 7.419957637786865, + "learning_rate": 9.575304919830068e-06, + "loss": 0.4978, + "step": 3850 + }, + { + "epoch": 0.05223819858925665, + "grad_norm": 7.166132926940918, + "learning_rate": 9.575167877209813e-06, + "loss": 0.3292, + "step": 3851 + }, + { + "epoch": 0.052251763429191536, + "grad_norm": 7.0660881996154785, + "learning_rate": 9.575030834589558e-06, + "loss": 0.315, + "step": 3852 + }, + { + "epoch": 0.05226532826912642, + "grad_norm": 8.108467102050781, + "learning_rate": 9.574893791969303e-06, + "loss": 0.4017, + "step": 3853 + }, + { + "epoch": 0.052278893109061315, + "grad_norm": 6.043247222900391, + "learning_rate": 9.574756749349049e-06, + "loss": 0.3677, + "step": 3854 + }, + { + "epoch": 0.0522924579489962, + "grad_norm": 9.070390701293945, + "learning_rate": 9.574619706728794e-06, + "loss": 0.4496, + "step": 3855 + }, + { + "epoch": 0.05230602278893109, + "grad_norm": 6.461059093475342, + "learning_rate": 9.574482664108539e-06, + "loss": 0.3591, + "step": 3856 + }, + { + "epoch": 0.05231958762886598, + "grad_norm": 9.728107452392578, + "learning_rate": 9.574345621488284e-06, + "loss": 0.556, + "step": 3857 + }, + { + "epoch": 0.05233315246880087, + "grad_norm": 6.390880107879639, + "learning_rate": 9.574208578868028e-06, + "loss": 0.3917, + "step": 3858 + }, + { + "epoch": 0.05234671730873576, + "grad_norm": 8.299357414245605, + "learning_rate": 9.574071536247775e-06, + "loss": 0.4039, + "step": 3859 + }, + { + "epoch": 0.052360282148670646, + "grad_norm": 7.426069736480713, + "learning_rate": 9.57393449362752e-06, + "loss": 0.5034, + "step": 3860 + }, + { + "epoch": 0.05237384698860553, + "grad_norm": 7.946636199951172, + "learning_rate": 9.573797451007263e-06, + "loss": 0.4228, + "step": 3861 + }, + { + "epoch": 0.052387411828540426, + "grad_norm": 4.704870700836182, + "learning_rate": 9.573660408387008e-06, + "loss": 0.2634, + "step": 3862 + }, + { + "epoch": 0.05240097666847531, + "grad_norm": 6.935413837432861, + "learning_rate": 9.573523365766755e-06, + "loss": 0.3394, + "step": 3863 + }, + { + "epoch": 0.0524145415084102, + "grad_norm": 7.590182781219482, + "learning_rate": 9.5733863231465e-06, + "loss": 0.4889, + "step": 3864 + }, + { + "epoch": 0.05242810634834509, + "grad_norm": 8.211445808410645, + "learning_rate": 9.573249280526244e-06, + "loss": 0.3645, + "step": 3865 + }, + { + "epoch": 0.05244167118827998, + "grad_norm": 7.387567043304443, + "learning_rate": 9.573112237905989e-06, + "loss": 0.3645, + "step": 3866 + }, + { + "epoch": 0.05245523602821487, + "grad_norm": 8.245132446289062, + "learning_rate": 9.572975195285734e-06, + "loss": 0.5184, + "step": 3867 + }, + { + "epoch": 0.052468800868149756, + "grad_norm": 7.972691059112549, + "learning_rate": 9.57283815266548e-06, + "loss": 0.4451, + "step": 3868 + }, + { + "epoch": 0.05248236570808464, + "grad_norm": 6.828716278076172, + "learning_rate": 9.572701110045225e-06, + "loss": 0.3347, + "step": 3869 + }, + { + "epoch": 0.052495930548019536, + "grad_norm": 6.800928592681885, + "learning_rate": 9.57256406742497e-06, + "loss": 0.4737, + "step": 3870 + }, + { + "epoch": 0.05250949538795442, + "grad_norm": 8.231739044189453, + "learning_rate": 9.572427024804715e-06, + "loss": 0.5012, + "step": 3871 + }, + { + "epoch": 0.05252306022788931, + "grad_norm": 6.667128086090088, + "learning_rate": 9.57228998218446e-06, + "loss": 0.34, + "step": 3872 + }, + { + "epoch": 0.0525366250678242, + "grad_norm": 6.889225006103516, + "learning_rate": 9.572152939564205e-06, + "loss": 0.2995, + "step": 3873 + }, + { + "epoch": 0.05255018990775909, + "grad_norm": 7.582406044006348, + "learning_rate": 9.57201589694395e-06, + "loss": 0.3003, + "step": 3874 + }, + { + "epoch": 0.05256375474769398, + "grad_norm": 6.676727771759033, + "learning_rate": 9.571878854323696e-06, + "loss": 0.4458, + "step": 3875 + }, + { + "epoch": 0.05257731958762887, + "grad_norm": 8.827576637268066, + "learning_rate": 9.571741811703441e-06, + "loss": 0.4636, + "step": 3876 + }, + { + "epoch": 0.05259088442756375, + "grad_norm": 8.49599838256836, + "learning_rate": 9.571604769083186e-06, + "loss": 0.5965, + "step": 3877 + }, + { + "epoch": 0.052604449267498646, + "grad_norm": 8.450634956359863, + "learning_rate": 9.571467726462931e-06, + "loss": 0.4008, + "step": 3878 + }, + { + "epoch": 0.05261801410743353, + "grad_norm": 7.036947727203369, + "learning_rate": 9.571330683842676e-06, + "loss": 0.5132, + "step": 3879 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 10.051068305969238, + "learning_rate": 9.57119364122242e-06, + "loss": 0.6715, + "step": 3880 + }, + { + "epoch": 0.05264514378730331, + "grad_norm": 7.0647687911987305, + "learning_rate": 9.571056598602167e-06, + "loss": 0.4592, + "step": 3881 + }, + { + "epoch": 0.0526587086272382, + "grad_norm": 9.587308883666992, + "learning_rate": 9.570919555981912e-06, + "loss": 0.4469, + "step": 3882 + }, + { + "epoch": 0.05267227346717309, + "grad_norm": 7.415927886962891, + "learning_rate": 9.570782513361655e-06, + "loss": 0.4213, + "step": 3883 + }, + { + "epoch": 0.05268583830710798, + "grad_norm": 9.913458824157715, + "learning_rate": 9.5706454707414e-06, + "loss": 0.5517, + "step": 3884 + }, + { + "epoch": 0.05269940314704286, + "grad_norm": 8.070684432983398, + "learning_rate": 9.570508428121147e-06, + "loss": 0.4115, + "step": 3885 + }, + { + "epoch": 0.052712967986977756, + "grad_norm": 6.7835693359375, + "learning_rate": 9.570371385500891e-06, + "loss": 0.3466, + "step": 3886 + }, + { + "epoch": 0.05272653282691264, + "grad_norm": 6.753779411315918, + "learning_rate": 9.570234342880636e-06, + "loss": 0.4023, + "step": 3887 + }, + { + "epoch": 0.05274009766684753, + "grad_norm": 7.101868629455566, + "learning_rate": 9.570097300260381e-06, + "loss": 0.3548, + "step": 3888 + }, + { + "epoch": 0.05275366250678242, + "grad_norm": 7.162428855895996, + "learning_rate": 9.569960257640128e-06, + "loss": 0.3778, + "step": 3889 + }, + { + "epoch": 0.05276722734671731, + "grad_norm": 7.628719329833984, + "learning_rate": 9.569823215019872e-06, + "loss": 0.5685, + "step": 3890 + }, + { + "epoch": 0.0527807921866522, + "grad_norm": 9.619678497314453, + "learning_rate": 9.569686172399617e-06, + "loss": 0.5864, + "step": 3891 + }, + { + "epoch": 0.05279435702658709, + "grad_norm": 7.692152500152588, + "learning_rate": 9.569549129779362e-06, + "loss": 0.4289, + "step": 3892 + }, + { + "epoch": 0.05280792186652197, + "grad_norm": 7.0619096755981445, + "learning_rate": 9.569412087159107e-06, + "loss": 0.4505, + "step": 3893 + }, + { + "epoch": 0.052821486706456866, + "grad_norm": 7.603433132171631, + "learning_rate": 9.569275044538852e-06, + "loss": 0.4641, + "step": 3894 + }, + { + "epoch": 0.05283505154639175, + "grad_norm": 7.238194942474365, + "learning_rate": 9.569138001918598e-06, + "loss": 0.4216, + "step": 3895 + }, + { + "epoch": 0.05284861638632664, + "grad_norm": 7.274506568908691, + "learning_rate": 9.569000959298343e-06, + "loss": 0.4643, + "step": 3896 + }, + { + "epoch": 0.05286218122626153, + "grad_norm": 5.5340256690979, + "learning_rate": 9.568863916678088e-06, + "loss": 0.5625, + "step": 3897 + }, + { + "epoch": 0.05287574606619642, + "grad_norm": 6.174060821533203, + "learning_rate": 9.568726874057833e-06, + "loss": 0.4025, + "step": 3898 + }, + { + "epoch": 0.05288931090613131, + "grad_norm": 7.5895490646362305, + "learning_rate": 9.568589831437578e-06, + "loss": 0.3985, + "step": 3899 + }, + { + "epoch": 0.0529028757460662, + "grad_norm": 7.633170127868652, + "learning_rate": 9.568452788817323e-06, + "loss": 0.3209, + "step": 3900 + }, + { + "epoch": 0.05291644058600108, + "grad_norm": 7.603246212005615, + "learning_rate": 9.568315746197067e-06, + "loss": 0.4712, + "step": 3901 + }, + { + "epoch": 0.052930005425935976, + "grad_norm": 8.203063011169434, + "learning_rate": 9.568178703576814e-06, + "loss": 0.463, + "step": 3902 + }, + { + "epoch": 0.05294357026587086, + "grad_norm": 7.114842891693115, + "learning_rate": 9.568041660956559e-06, + "loss": 0.3827, + "step": 3903 + }, + { + "epoch": 0.05295713510580575, + "grad_norm": 9.283394813537598, + "learning_rate": 9.567904618336304e-06, + "loss": 0.4306, + "step": 3904 + }, + { + "epoch": 0.05297069994574064, + "grad_norm": 6.699897289276123, + "learning_rate": 9.567767575716048e-06, + "loss": 0.3516, + "step": 3905 + }, + { + "epoch": 0.05298426478567553, + "grad_norm": 8.356257438659668, + "learning_rate": 9.567630533095793e-06, + "loss": 0.4896, + "step": 3906 + }, + { + "epoch": 0.05299782962561042, + "grad_norm": 10.476578712463379, + "learning_rate": 9.56749349047554e-06, + "loss": 0.7085, + "step": 3907 + }, + { + "epoch": 0.05301139446554531, + "grad_norm": 7.656018257141113, + "learning_rate": 9.567356447855283e-06, + "loss": 0.3647, + "step": 3908 + }, + { + "epoch": 0.05302495930548019, + "grad_norm": 7.851179599761963, + "learning_rate": 9.567219405235028e-06, + "loss": 0.2917, + "step": 3909 + }, + { + "epoch": 0.053038524145415086, + "grad_norm": 5.665163993835449, + "learning_rate": 9.567082362614774e-06, + "loss": 0.2757, + "step": 3910 + }, + { + "epoch": 0.05305208898534997, + "grad_norm": 7.387456893920898, + "learning_rate": 9.566945319994519e-06, + "loss": 0.4187, + "step": 3911 + }, + { + "epoch": 0.05306565382528486, + "grad_norm": 5.716736316680908, + "learning_rate": 9.566808277374264e-06, + "loss": 0.2802, + "step": 3912 + }, + { + "epoch": 0.05307921866521975, + "grad_norm": 6.717907428741455, + "learning_rate": 9.566671234754009e-06, + "loss": 0.4207, + "step": 3913 + }, + { + "epoch": 0.05309278350515464, + "grad_norm": 7.9929986000061035, + "learning_rate": 9.566534192133754e-06, + "loss": 0.5113, + "step": 3914 + }, + { + "epoch": 0.05310634834508953, + "grad_norm": 9.861250877380371, + "learning_rate": 9.5663971495135e-06, + "loss": 0.5624, + "step": 3915 + }, + { + "epoch": 0.05311991318502442, + "grad_norm": 6.751310348510742, + "learning_rate": 9.566260106893245e-06, + "loss": 0.3325, + "step": 3916 + }, + { + "epoch": 0.0531334780249593, + "grad_norm": 8.147723197937012, + "learning_rate": 9.56612306427299e-06, + "loss": 0.5963, + "step": 3917 + }, + { + "epoch": 0.053147042864894196, + "grad_norm": 6.6191816329956055, + "learning_rate": 9.565986021652735e-06, + "loss": 0.538, + "step": 3918 + }, + { + "epoch": 0.05316060770482908, + "grad_norm": 7.991862773895264, + "learning_rate": 9.565848979032478e-06, + "loss": 0.5118, + "step": 3919 + }, + { + "epoch": 0.05317417254476397, + "grad_norm": 7.633426189422607, + "learning_rate": 9.565711936412225e-06, + "loss": 0.4052, + "step": 3920 + }, + { + "epoch": 0.05318773738469886, + "grad_norm": 7.419528007507324, + "learning_rate": 9.56557489379197e-06, + "loss": 0.3557, + "step": 3921 + }, + { + "epoch": 0.05320130222463375, + "grad_norm": 7.176448822021484, + "learning_rate": 9.565437851171716e-06, + "loss": 0.5445, + "step": 3922 + }, + { + "epoch": 0.05321486706456864, + "grad_norm": 7.414610385894775, + "learning_rate": 9.56530080855146e-06, + "loss": 0.4467, + "step": 3923 + }, + { + "epoch": 0.05322843190450353, + "grad_norm": 6.820290565490723, + "learning_rate": 9.565163765931206e-06, + "loss": 0.4278, + "step": 3924 + }, + { + "epoch": 0.05324199674443841, + "grad_norm": 7.862636089324951, + "learning_rate": 9.565026723310951e-06, + "loss": 0.6389, + "step": 3925 + }, + { + "epoch": 0.05325556158437331, + "grad_norm": 6.555603981018066, + "learning_rate": 9.564889680690695e-06, + "loss": 0.5418, + "step": 3926 + }, + { + "epoch": 0.05326912642430819, + "grad_norm": 7.470492839813232, + "learning_rate": 9.56475263807044e-06, + "loss": 0.4397, + "step": 3927 + }, + { + "epoch": 0.05328269126424308, + "grad_norm": 6.572758674621582, + "learning_rate": 9.564615595450187e-06, + "loss": 0.3383, + "step": 3928 + }, + { + "epoch": 0.05329625610417797, + "grad_norm": 7.897619247436523, + "learning_rate": 9.56447855282993e-06, + "loss": 0.362, + "step": 3929 + }, + { + "epoch": 0.05330982094411286, + "grad_norm": 8.578210830688477, + "learning_rate": 9.564341510209675e-06, + "loss": 0.4427, + "step": 3930 + }, + { + "epoch": 0.05332338578404775, + "grad_norm": 7.26832389831543, + "learning_rate": 9.56420446758942e-06, + "loss": 0.5204, + "step": 3931 + }, + { + "epoch": 0.05333695062398264, + "grad_norm": 6.73231840133667, + "learning_rate": 9.564067424969168e-06, + "loss": 0.4776, + "step": 3932 + }, + { + "epoch": 0.053350515463917524, + "grad_norm": 8.999349594116211, + "learning_rate": 9.563930382348911e-06, + "loss": 0.6015, + "step": 3933 + }, + { + "epoch": 0.05336408030385242, + "grad_norm": 5.895009517669678, + "learning_rate": 9.563793339728656e-06, + "loss": 0.4741, + "step": 3934 + }, + { + "epoch": 0.0533776451437873, + "grad_norm": 8.154631614685059, + "learning_rate": 9.563656297108401e-06, + "loss": 0.5597, + "step": 3935 + }, + { + "epoch": 0.05339120998372219, + "grad_norm": 8.496976852416992, + "learning_rate": 9.563519254488147e-06, + "loss": 0.3741, + "step": 3936 + }, + { + "epoch": 0.05340477482365708, + "grad_norm": 7.624798774719238, + "learning_rate": 9.563382211867892e-06, + "loss": 0.456, + "step": 3937 + }, + { + "epoch": 0.05341833966359197, + "grad_norm": 7.0006818771362305, + "learning_rate": 9.563245169247637e-06, + "loss": 0.4544, + "step": 3938 + }, + { + "epoch": 0.05343190450352686, + "grad_norm": 5.66712760925293, + "learning_rate": 9.563108126627382e-06, + "loss": 0.3363, + "step": 3939 + }, + { + "epoch": 0.05344546934346175, + "grad_norm": 49.34752655029297, + "learning_rate": 9.562971084007127e-06, + "loss": 0.5411, + "step": 3940 + }, + { + "epoch": 0.053459034183396634, + "grad_norm": 4.571079254150391, + "learning_rate": 9.562834041386872e-06, + "loss": 0.3278, + "step": 3941 + }, + { + "epoch": 0.05347259902333153, + "grad_norm": 5.432718276977539, + "learning_rate": 9.562696998766618e-06, + "loss": 0.4818, + "step": 3942 + }, + { + "epoch": 0.05348616386326641, + "grad_norm": 8.875125885009766, + "learning_rate": 9.562559956146363e-06, + "loss": 0.6641, + "step": 3943 + }, + { + "epoch": 0.0534997287032013, + "grad_norm": 6.986327171325684, + "learning_rate": 9.562422913526106e-06, + "loss": 0.4826, + "step": 3944 + }, + { + "epoch": 0.05351329354313619, + "grad_norm": 7.772430896759033, + "learning_rate": 9.562285870905853e-06, + "loss": 0.3818, + "step": 3945 + }, + { + "epoch": 0.05352685838307108, + "grad_norm": 9.246045112609863, + "learning_rate": 9.562148828285598e-06, + "loss": 0.583, + "step": 3946 + }, + { + "epoch": 0.05354042322300597, + "grad_norm": 6.541958332061768, + "learning_rate": 9.562011785665344e-06, + "loss": 0.4191, + "step": 3947 + }, + { + "epoch": 0.05355398806294086, + "grad_norm": 7.479446887969971, + "learning_rate": 9.561874743045087e-06, + "loss": 0.4822, + "step": 3948 + }, + { + "epoch": 0.053567552902875744, + "grad_norm": 6.333560466766357, + "learning_rate": 9.561737700424832e-06, + "loss": 0.3822, + "step": 3949 + }, + { + "epoch": 0.05358111774281064, + "grad_norm": 7.41251802444458, + "learning_rate": 9.561600657804579e-06, + "loss": 0.4563, + "step": 3950 + }, + { + "epoch": 0.05359468258274552, + "grad_norm": 5.815603733062744, + "learning_rate": 9.561463615184323e-06, + "loss": 0.4245, + "step": 3951 + }, + { + "epoch": 0.05360824742268041, + "grad_norm": 8.160974502563477, + "learning_rate": 9.561326572564068e-06, + "loss": 0.3927, + "step": 3952 + }, + { + "epoch": 0.0536218122626153, + "grad_norm": 6.308971881866455, + "learning_rate": 9.561189529943813e-06, + "loss": 0.3795, + "step": 3953 + }, + { + "epoch": 0.05363537710255019, + "grad_norm": 5.154721260070801, + "learning_rate": 9.561052487323558e-06, + "loss": 0.3315, + "step": 3954 + }, + { + "epoch": 0.05364894194248508, + "grad_norm": 6.584885120391846, + "learning_rate": 9.560915444703303e-06, + "loss": 0.4198, + "step": 3955 + }, + { + "epoch": 0.05366250678241997, + "grad_norm": 6.235857963562012, + "learning_rate": 9.560778402083048e-06, + "loss": 0.3357, + "step": 3956 + }, + { + "epoch": 0.053676071622354854, + "grad_norm": 7.801506996154785, + "learning_rate": 9.560641359462794e-06, + "loss": 0.457, + "step": 3957 + }, + { + "epoch": 0.05368963646228975, + "grad_norm": 7.598228931427002, + "learning_rate": 9.560504316842539e-06, + "loss": 0.3478, + "step": 3958 + }, + { + "epoch": 0.05370320130222463, + "grad_norm": 7.4002485275268555, + "learning_rate": 9.560367274222284e-06, + "loss": 0.3257, + "step": 3959 + }, + { + "epoch": 0.05371676614215952, + "grad_norm": 6.289808750152588, + "learning_rate": 9.560230231602029e-06, + "loss": 0.396, + "step": 3960 + }, + { + "epoch": 0.05373033098209441, + "grad_norm": 9.242682456970215, + "learning_rate": 9.560093188981774e-06, + "loss": 0.4616, + "step": 3961 + }, + { + "epoch": 0.0537438958220293, + "grad_norm": 4.504077434539795, + "learning_rate": 9.55995614636152e-06, + "loss": 0.3558, + "step": 3962 + }, + { + "epoch": 0.05375746066196419, + "grad_norm": 6.323431968688965, + "learning_rate": 9.559819103741265e-06, + "loss": 0.3433, + "step": 3963 + }, + { + "epoch": 0.05377102550189908, + "grad_norm": 10.167109489440918, + "learning_rate": 9.55968206112101e-06, + "loss": 0.5097, + "step": 3964 + }, + { + "epoch": 0.053784590341833964, + "grad_norm": 5.399759292602539, + "learning_rate": 9.559545018500755e-06, + "loss": 0.4243, + "step": 3965 + }, + { + "epoch": 0.05379815518176886, + "grad_norm": 6.156068325042725, + "learning_rate": 9.559407975880499e-06, + "loss": 0.4332, + "step": 3966 + }, + { + "epoch": 0.05381172002170374, + "grad_norm": 7.859178066253662, + "learning_rate": 9.559270933260245e-06, + "loss": 0.5583, + "step": 3967 + }, + { + "epoch": 0.05382528486163863, + "grad_norm": 11.053753852844238, + "learning_rate": 9.55913389063999e-06, + "loss": 0.7071, + "step": 3968 + }, + { + "epoch": 0.05383884970157352, + "grad_norm": 6.526443004608154, + "learning_rate": 9.558996848019734e-06, + "loss": 0.4475, + "step": 3969 + }, + { + "epoch": 0.05385241454150841, + "grad_norm": 11.263965606689453, + "learning_rate": 9.55885980539948e-06, + "loss": 0.5345, + "step": 3970 + }, + { + "epoch": 0.0538659793814433, + "grad_norm": 6.6727213859558105, + "learning_rate": 9.558722762779226e-06, + "loss": 0.5038, + "step": 3971 + }, + { + "epoch": 0.05387954422137819, + "grad_norm": 6.891099452972412, + "learning_rate": 9.558585720158971e-06, + "loss": 0.6306, + "step": 3972 + }, + { + "epoch": 0.053893109061313074, + "grad_norm": 6.981039047241211, + "learning_rate": 9.558448677538715e-06, + "loss": 0.3528, + "step": 3973 + }, + { + "epoch": 0.05390667390124797, + "grad_norm": 6.446906566619873, + "learning_rate": 9.55831163491846e-06, + "loss": 0.3754, + "step": 3974 + }, + { + "epoch": 0.053920238741182853, + "grad_norm": 7.622903347015381, + "learning_rate": 9.558174592298205e-06, + "loss": 0.4237, + "step": 3975 + }, + { + "epoch": 0.05393380358111774, + "grad_norm": 7.177746772766113, + "learning_rate": 9.55803754967795e-06, + "loss": 0.2871, + "step": 3976 + }, + { + "epoch": 0.05394736842105263, + "grad_norm": 8.312528610229492, + "learning_rate": 9.557900507057695e-06, + "loss": 0.3976, + "step": 3977 + }, + { + "epoch": 0.05396093326098752, + "grad_norm": 7.196474075317383, + "learning_rate": 9.55776346443744e-06, + "loss": 0.4878, + "step": 3978 + }, + { + "epoch": 0.05397449810092241, + "grad_norm": 6.899267196655273, + "learning_rate": 9.557626421817186e-06, + "loss": 0.4334, + "step": 3979 + }, + { + "epoch": 0.0539880629408573, + "grad_norm": 6.307896614074707, + "learning_rate": 9.557489379196931e-06, + "loss": 0.346, + "step": 3980 + }, + { + "epoch": 0.054001627780792184, + "grad_norm": 6.784140586853027, + "learning_rate": 9.557352336576676e-06, + "loss": 0.4361, + "step": 3981 + }, + { + "epoch": 0.05401519262072708, + "grad_norm": 6.717491626739502, + "learning_rate": 9.557215293956421e-06, + "loss": 0.4461, + "step": 3982 + }, + { + "epoch": 0.054028757460661964, + "grad_norm": 8.60682487487793, + "learning_rate": 9.557078251336167e-06, + "loss": 0.5577, + "step": 3983 + }, + { + "epoch": 0.05404232230059685, + "grad_norm": 6.582596778869629, + "learning_rate": 9.556941208715912e-06, + "loss": 0.5022, + "step": 3984 + }, + { + "epoch": 0.05405588714053174, + "grad_norm": 7.2680559158325195, + "learning_rate": 9.556804166095657e-06, + "loss": 0.4245, + "step": 3985 + }, + { + "epoch": 0.05406945198046663, + "grad_norm": 7.730275630950928, + "learning_rate": 9.556667123475402e-06, + "loss": 0.4852, + "step": 3986 + }, + { + "epoch": 0.05408301682040152, + "grad_norm": 7.522032737731934, + "learning_rate": 9.556530080855147e-06, + "loss": 0.5604, + "step": 3987 + }, + { + "epoch": 0.05409658166033641, + "grad_norm": 6.156473159790039, + "learning_rate": 9.55639303823489e-06, + "loss": 0.4018, + "step": 3988 + }, + { + "epoch": 0.054110146500271294, + "grad_norm": 8.741002082824707, + "learning_rate": 9.556255995614638e-06, + "loss": 0.4961, + "step": 3989 + }, + { + "epoch": 0.05412371134020619, + "grad_norm": 12.489190101623535, + "learning_rate": 9.556118952994383e-06, + "loss": 0.483, + "step": 3990 + }, + { + "epoch": 0.054137276180141074, + "grad_norm": 6.951948642730713, + "learning_rate": 9.555981910374126e-06, + "loss": 0.5054, + "step": 3991 + }, + { + "epoch": 0.05415084102007596, + "grad_norm": 7.510997295379639, + "learning_rate": 9.555844867753871e-06, + "loss": 0.5555, + "step": 3992 + }, + { + "epoch": 0.05416440586001085, + "grad_norm": 7.80217981338501, + "learning_rate": 9.555707825133618e-06, + "loss": 0.6129, + "step": 3993 + }, + { + "epoch": 0.05417797069994574, + "grad_norm": 4.908098220825195, + "learning_rate": 9.555570782513362e-06, + "loss": 0.2775, + "step": 3994 + }, + { + "epoch": 0.05419153553988063, + "grad_norm": 5.269564151763916, + "learning_rate": 9.555433739893107e-06, + "loss": 0.3229, + "step": 3995 + }, + { + "epoch": 0.05420510037981552, + "grad_norm": 5.7877583503723145, + "learning_rate": 9.555296697272852e-06, + "loss": 0.4414, + "step": 3996 + }, + { + "epoch": 0.054218665219750405, + "grad_norm": 8.23887825012207, + "learning_rate": 9.555159654652597e-06, + "loss": 0.523, + "step": 3997 + }, + { + "epoch": 0.0542322300596853, + "grad_norm": 11.467705726623535, + "learning_rate": 9.555022612032343e-06, + "loss": 0.472, + "step": 3998 + }, + { + "epoch": 0.054245794899620184, + "grad_norm": 7.483906269073486, + "learning_rate": 9.554885569412088e-06, + "loss": 0.2986, + "step": 3999 + }, + { + "epoch": 0.05425935973955507, + "grad_norm": 6.731009006500244, + "learning_rate": 9.554748526791833e-06, + "loss": 0.3843, + "step": 4000 + }, + { + "epoch": 0.05427292457948996, + "grad_norm": 6.902194976806641, + "learning_rate": 9.554611484171578e-06, + "loss": 0.494, + "step": 4001 + }, + { + "epoch": 0.05428648941942485, + "grad_norm": 12.021515846252441, + "learning_rate": 9.554474441551323e-06, + "loss": 0.619, + "step": 4002 + }, + { + "epoch": 0.05430005425935974, + "grad_norm": 6.440462589263916, + "learning_rate": 9.554337398931068e-06, + "loss": 0.3956, + "step": 4003 + }, + { + "epoch": 0.05431361909929463, + "grad_norm": 4.968790531158447, + "learning_rate": 9.554200356310814e-06, + "loss": 0.2662, + "step": 4004 + }, + { + "epoch": 0.054327183939229515, + "grad_norm": 6.084278583526611, + "learning_rate": 9.554063313690559e-06, + "loss": 0.335, + "step": 4005 + }, + { + "epoch": 0.05434074877916441, + "grad_norm": 6.48943567276001, + "learning_rate": 9.553926271070304e-06, + "loss": 0.4112, + "step": 4006 + }, + { + "epoch": 0.054354313619099294, + "grad_norm": 6.419584274291992, + "learning_rate": 9.553789228450049e-06, + "loss": 0.379, + "step": 4007 + }, + { + "epoch": 0.05436787845903418, + "grad_norm": 7.043003559112549, + "learning_rate": 9.553652185829794e-06, + "loss": 0.3422, + "step": 4008 + }, + { + "epoch": 0.05438144329896907, + "grad_norm": 7.820255279541016, + "learning_rate": 9.553515143209538e-06, + "loss": 0.4442, + "step": 4009 + }, + { + "epoch": 0.05439500813890396, + "grad_norm": 8.210351943969727, + "learning_rate": 9.553378100589285e-06, + "loss": 0.4472, + "step": 4010 + }, + { + "epoch": 0.05440857297883885, + "grad_norm": 5.902451992034912, + "learning_rate": 9.55324105796903e-06, + "loss": 0.2701, + "step": 4011 + }, + { + "epoch": 0.05442213781877374, + "grad_norm": 8.586173057556152, + "learning_rate": 9.553104015348773e-06, + "loss": 0.3923, + "step": 4012 + }, + { + "epoch": 0.054435702658708625, + "grad_norm": 8.411036491394043, + "learning_rate": 9.552966972728519e-06, + "loss": 0.4476, + "step": 4013 + }, + { + "epoch": 0.05444926749864352, + "grad_norm": 5.854982852935791, + "learning_rate": 9.552829930108265e-06, + "loss": 0.3095, + "step": 4014 + }, + { + "epoch": 0.054462832338578404, + "grad_norm": 5.674253463745117, + "learning_rate": 9.55269288748801e-06, + "loss": 0.4152, + "step": 4015 + }, + { + "epoch": 0.05447639717851329, + "grad_norm": 6.298418998718262, + "learning_rate": 9.552555844867754e-06, + "loss": 0.3349, + "step": 4016 + }, + { + "epoch": 0.05448996201844818, + "grad_norm": 7.791353702545166, + "learning_rate": 9.5524188022475e-06, + "loss": 0.3447, + "step": 4017 + }, + { + "epoch": 0.05450352685838307, + "grad_norm": 5.899839878082275, + "learning_rate": 9.552281759627244e-06, + "loss": 0.3091, + "step": 4018 + }, + { + "epoch": 0.05451709169831796, + "grad_norm": 6.358221530914307, + "learning_rate": 9.55214471700699e-06, + "loss": 0.4, + "step": 4019 + }, + { + "epoch": 0.05453065653825285, + "grad_norm": 7.347357749938965, + "learning_rate": 9.552007674386735e-06, + "loss": 0.3882, + "step": 4020 + }, + { + "epoch": 0.054544221378187735, + "grad_norm": 5.827625751495361, + "learning_rate": 9.55187063176648e-06, + "loss": 0.3086, + "step": 4021 + }, + { + "epoch": 0.05455778621812263, + "grad_norm": 6.325033187866211, + "learning_rate": 9.551733589146225e-06, + "loss": 0.4498, + "step": 4022 + }, + { + "epoch": 0.054571351058057514, + "grad_norm": 10.29806137084961, + "learning_rate": 9.55159654652597e-06, + "loss": 0.6449, + "step": 4023 + }, + { + "epoch": 0.0545849158979924, + "grad_norm": 11.314762115478516, + "learning_rate": 9.551459503905715e-06, + "loss": 0.4036, + "step": 4024 + }, + { + "epoch": 0.054598480737927294, + "grad_norm": 9.842964172363281, + "learning_rate": 9.55132246128546e-06, + "loss": 0.4914, + "step": 4025 + }, + { + "epoch": 0.05461204557786218, + "grad_norm": 6.789485454559326, + "learning_rate": 9.551185418665206e-06, + "loss": 0.3203, + "step": 4026 + }, + { + "epoch": 0.05462561041779707, + "grad_norm": 6.546070575714111, + "learning_rate": 9.551048376044951e-06, + "loss": 0.3166, + "step": 4027 + }, + { + "epoch": 0.05463917525773196, + "grad_norm": 6.985321998596191, + "learning_rate": 9.550911333424696e-06, + "loss": 0.3869, + "step": 4028 + }, + { + "epoch": 0.054652740097666845, + "grad_norm": 14.176462173461914, + "learning_rate": 9.550774290804441e-06, + "loss": 0.5265, + "step": 4029 + }, + { + "epoch": 0.05466630493760174, + "grad_norm": 5.843517780303955, + "learning_rate": 9.550637248184187e-06, + "loss": 0.2101, + "step": 4030 + }, + { + "epoch": 0.054679869777536624, + "grad_norm": 7.350358963012695, + "learning_rate": 9.55050020556393e-06, + "loss": 0.3603, + "step": 4031 + }, + { + "epoch": 0.05469343461747151, + "grad_norm": 7.841879844665527, + "learning_rate": 9.550363162943677e-06, + "loss": 0.5001, + "step": 4032 + }, + { + "epoch": 0.054706999457406404, + "grad_norm": 7.113982200622559, + "learning_rate": 9.550226120323422e-06, + "loss": 0.3808, + "step": 4033 + }, + { + "epoch": 0.05472056429734129, + "grad_norm": 7.686769485473633, + "learning_rate": 9.550089077703166e-06, + "loss": 0.4179, + "step": 4034 + }, + { + "epoch": 0.05473412913727618, + "grad_norm": 7.7569193840026855, + "learning_rate": 9.54995203508291e-06, + "loss": 0.3941, + "step": 4035 + }, + { + "epoch": 0.05474769397721107, + "grad_norm": 11.526226043701172, + "learning_rate": 9.549814992462658e-06, + "loss": 0.5027, + "step": 4036 + }, + { + "epoch": 0.054761258817145955, + "grad_norm": 5.402256011962891, + "learning_rate": 9.549677949842401e-06, + "loss": 0.2569, + "step": 4037 + }, + { + "epoch": 0.05477482365708085, + "grad_norm": 5.559080123901367, + "learning_rate": 9.549540907222146e-06, + "loss": 0.2127, + "step": 4038 + }, + { + "epoch": 0.054788388497015734, + "grad_norm": 7.641923427581787, + "learning_rate": 9.549403864601891e-06, + "loss": 0.3802, + "step": 4039 + }, + { + "epoch": 0.05480195333695062, + "grad_norm": 5.685701370239258, + "learning_rate": 9.549266821981638e-06, + "loss": 0.269, + "step": 4040 + }, + { + "epoch": 0.054815518176885514, + "grad_norm": 6.068294525146484, + "learning_rate": 9.549129779361382e-06, + "loss": 0.4793, + "step": 4041 + }, + { + "epoch": 0.0548290830168204, + "grad_norm": 7.054356098175049, + "learning_rate": 9.548992736741127e-06, + "loss": 0.3806, + "step": 4042 + }, + { + "epoch": 0.05484264785675529, + "grad_norm": 6.411149024963379, + "learning_rate": 9.548855694120872e-06, + "loss": 0.4036, + "step": 4043 + }, + { + "epoch": 0.05485621269669018, + "grad_norm": 8.090110778808594, + "learning_rate": 9.548718651500617e-06, + "loss": 0.4234, + "step": 4044 + }, + { + "epoch": 0.054869777536625065, + "grad_norm": 9.015939712524414, + "learning_rate": 9.548581608880363e-06, + "loss": 0.4021, + "step": 4045 + }, + { + "epoch": 0.05488334237655996, + "grad_norm": 6.575706958770752, + "learning_rate": 9.548444566260108e-06, + "loss": 0.5629, + "step": 4046 + }, + { + "epoch": 0.054896907216494845, + "grad_norm": 5.999131679534912, + "learning_rate": 9.548307523639853e-06, + "loss": 0.4258, + "step": 4047 + }, + { + "epoch": 0.05491047205642973, + "grad_norm": 4.372718811035156, + "learning_rate": 9.548170481019598e-06, + "loss": 0.2472, + "step": 4048 + }, + { + "epoch": 0.054924036896364624, + "grad_norm": 6.035204887390137, + "learning_rate": 9.548033438399343e-06, + "loss": 0.3475, + "step": 4049 + }, + { + "epoch": 0.05493760173629951, + "grad_norm": 6.892722129821777, + "learning_rate": 9.547896395779088e-06, + "loss": 0.5158, + "step": 4050 + }, + { + "epoch": 0.0549511665762344, + "grad_norm": 6.692891597747803, + "learning_rate": 9.547759353158834e-06, + "loss": 0.3995, + "step": 4051 + }, + { + "epoch": 0.05496473141616929, + "grad_norm": 5.360965251922607, + "learning_rate": 9.547622310538577e-06, + "loss": 0.2439, + "step": 4052 + }, + { + "epoch": 0.054978296256104175, + "grad_norm": 6.548325061798096, + "learning_rate": 9.547485267918324e-06, + "loss": 0.5031, + "step": 4053 + }, + { + "epoch": 0.05499186109603907, + "grad_norm": 6.599005699157715, + "learning_rate": 9.54734822529807e-06, + "loss": 0.4628, + "step": 4054 + }, + { + "epoch": 0.055005425935973955, + "grad_norm": 5.94180154800415, + "learning_rate": 9.547211182677814e-06, + "loss": 0.3124, + "step": 4055 + }, + { + "epoch": 0.05501899077590884, + "grad_norm": 6.575500011444092, + "learning_rate": 9.547074140057558e-06, + "loss": 0.4558, + "step": 4056 + }, + { + "epoch": 0.055032555615843734, + "grad_norm": 6.501430511474609, + "learning_rate": 9.546937097437303e-06, + "loss": 0.274, + "step": 4057 + }, + { + "epoch": 0.05504612045577862, + "grad_norm": 8.792854309082031, + "learning_rate": 9.54680005481705e-06, + "loss": 0.4864, + "step": 4058 + }, + { + "epoch": 0.05505968529571351, + "grad_norm": 7.760814189910889, + "learning_rate": 9.546663012196793e-06, + "loss": 0.4147, + "step": 4059 + }, + { + "epoch": 0.0550732501356484, + "grad_norm": 5.555280685424805, + "learning_rate": 9.546525969576539e-06, + "loss": 0.2299, + "step": 4060 + }, + { + "epoch": 0.055086814975583286, + "grad_norm": 10.014328956604004, + "learning_rate": 9.546388926956284e-06, + "loss": 0.4662, + "step": 4061 + }, + { + "epoch": 0.05510037981551818, + "grad_norm": 5.987311840057373, + "learning_rate": 9.546251884336029e-06, + "loss": 0.296, + "step": 4062 + }, + { + "epoch": 0.055113944655453065, + "grad_norm": 9.52847671508789, + "learning_rate": 9.546114841715774e-06, + "loss": 0.4629, + "step": 4063 + }, + { + "epoch": 0.05512750949538795, + "grad_norm": 7.157771587371826, + "learning_rate": 9.54597779909552e-06, + "loss": 0.4614, + "step": 4064 + }, + { + "epoch": 0.055141074335322844, + "grad_norm": 4.649740695953369, + "learning_rate": 9.545840756475264e-06, + "loss": 0.3361, + "step": 4065 + }, + { + "epoch": 0.05515463917525773, + "grad_norm": 8.085515975952148, + "learning_rate": 9.54570371385501e-06, + "loss": 0.5112, + "step": 4066 + }, + { + "epoch": 0.05516820401519262, + "grad_norm": 5.546602725982666, + "learning_rate": 9.545566671234755e-06, + "loss": 0.2755, + "step": 4067 + }, + { + "epoch": 0.05518176885512751, + "grad_norm": 6.3014302253723145, + "learning_rate": 9.5454296286145e-06, + "loss": 0.2099, + "step": 4068 + }, + { + "epoch": 0.055195333695062396, + "grad_norm": 4.440918922424316, + "learning_rate": 9.545292585994245e-06, + "loss": 0.2421, + "step": 4069 + }, + { + "epoch": 0.05520889853499729, + "grad_norm": 6.25147819519043, + "learning_rate": 9.54515554337399e-06, + "loss": 0.3587, + "step": 4070 + }, + { + "epoch": 0.055222463374932175, + "grad_norm": 8.149832725524902, + "learning_rate": 9.545018500753736e-06, + "loss": 0.341, + "step": 4071 + }, + { + "epoch": 0.05523602821486706, + "grad_norm": 7.198922157287598, + "learning_rate": 9.54488145813348e-06, + "loss": 0.2231, + "step": 4072 + }, + { + "epoch": 0.055249593054801954, + "grad_norm": 7.848220348358154, + "learning_rate": 9.544744415513226e-06, + "loss": 0.5554, + "step": 4073 + }, + { + "epoch": 0.05526315789473684, + "grad_norm": 8.43503475189209, + "learning_rate": 9.54460737289297e-06, + "loss": 0.4425, + "step": 4074 + }, + { + "epoch": 0.055276722734671734, + "grad_norm": 8.200838088989258, + "learning_rate": 9.544470330272716e-06, + "loss": 0.3187, + "step": 4075 + }, + { + "epoch": 0.05529028757460662, + "grad_norm": 8.78979778289795, + "learning_rate": 9.544333287652461e-06, + "loss": 0.4962, + "step": 4076 + }, + { + "epoch": 0.055303852414541506, + "grad_norm": 6.847740650177002, + "learning_rate": 9.544196245032205e-06, + "loss": 0.2763, + "step": 4077 + }, + { + "epoch": 0.0553174172544764, + "grad_norm": 6.278258323669434, + "learning_rate": 9.54405920241195e-06, + "loss": 0.2788, + "step": 4078 + }, + { + "epoch": 0.055330982094411285, + "grad_norm": 7.2404279708862305, + "learning_rate": 9.543922159791697e-06, + "loss": 0.3794, + "step": 4079 + }, + { + "epoch": 0.05534454693434617, + "grad_norm": 5.886291980743408, + "learning_rate": 9.543785117171442e-06, + "loss": 0.3809, + "step": 4080 + }, + { + "epoch": 0.055358111774281064, + "grad_norm": 7.01732873916626, + "learning_rate": 9.543648074551186e-06, + "loss": 0.4696, + "step": 4081 + }, + { + "epoch": 0.05537167661421595, + "grad_norm": 7.693508148193359, + "learning_rate": 9.54351103193093e-06, + "loss": 0.2842, + "step": 4082 + }, + { + "epoch": 0.055385241454150844, + "grad_norm": 4.892736434936523, + "learning_rate": 9.543373989310678e-06, + "loss": 0.3533, + "step": 4083 + }, + { + "epoch": 0.05539880629408573, + "grad_norm": 6.771376609802246, + "learning_rate": 9.543236946690421e-06, + "loss": 0.2376, + "step": 4084 + }, + { + "epoch": 0.055412371134020616, + "grad_norm": 6.43611478805542, + "learning_rate": 9.543099904070166e-06, + "loss": 0.3821, + "step": 4085 + }, + { + "epoch": 0.05542593597395551, + "grad_norm": 9.0786771774292, + "learning_rate": 9.542962861449912e-06, + "loss": 0.3953, + "step": 4086 + }, + { + "epoch": 0.055439500813890395, + "grad_norm": 8.896108627319336, + "learning_rate": 9.542825818829657e-06, + "loss": 0.4782, + "step": 4087 + }, + { + "epoch": 0.05545306565382528, + "grad_norm": 7.404421806335449, + "learning_rate": 9.542688776209402e-06, + "loss": 0.4536, + "step": 4088 + }, + { + "epoch": 0.055466630493760175, + "grad_norm": 7.541442394256592, + "learning_rate": 9.542551733589147e-06, + "loss": 0.5236, + "step": 4089 + }, + { + "epoch": 0.05548019533369506, + "grad_norm": 8.85558795928955, + "learning_rate": 9.542414690968892e-06, + "loss": 0.4242, + "step": 4090 + }, + { + "epoch": 0.055493760173629954, + "grad_norm": 8.213829040527344, + "learning_rate": 9.542277648348637e-06, + "loss": 0.47, + "step": 4091 + }, + { + "epoch": 0.05550732501356484, + "grad_norm": 6.225459098815918, + "learning_rate": 9.542140605728383e-06, + "loss": 0.2765, + "step": 4092 + }, + { + "epoch": 0.055520889853499726, + "grad_norm": 6.636776924133301, + "learning_rate": 9.542003563108128e-06, + "loss": 0.3156, + "step": 4093 + }, + { + "epoch": 0.05553445469343462, + "grad_norm": 5.534536361694336, + "learning_rate": 9.541866520487873e-06, + "loss": 0.3332, + "step": 4094 + }, + { + "epoch": 0.055548019533369505, + "grad_norm": 7.985246181488037, + "learning_rate": 9.541729477867618e-06, + "loss": 0.3596, + "step": 4095 + }, + { + "epoch": 0.0555615843733044, + "grad_norm": 8.358070373535156, + "learning_rate": 9.541592435247363e-06, + "loss": 0.4092, + "step": 4096 + }, + { + "epoch": 0.055575149213239285, + "grad_norm": 7.256079196929932, + "learning_rate": 9.541455392627108e-06, + "loss": 0.4237, + "step": 4097 + }, + { + "epoch": 0.05558871405317417, + "grad_norm": 7.3702898025512695, + "learning_rate": 9.541318350006854e-06, + "loss": 0.3429, + "step": 4098 + }, + { + "epoch": 0.055602278893109064, + "grad_norm": 5.6260271072387695, + "learning_rate": 9.541181307386597e-06, + "loss": 0.3909, + "step": 4099 + }, + { + "epoch": 0.05561584373304395, + "grad_norm": 7.551865577697754, + "learning_rate": 9.541044264766342e-06, + "loss": 0.4046, + "step": 4100 + }, + { + "epoch": 0.055629408572978836, + "grad_norm": 8.550874710083008, + "learning_rate": 9.54090722214609e-06, + "loss": 0.5098, + "step": 4101 + }, + { + "epoch": 0.05564297341291373, + "grad_norm": 5.702729225158691, + "learning_rate": 9.540770179525833e-06, + "loss": 0.4438, + "step": 4102 + }, + { + "epoch": 0.055656538252848616, + "grad_norm": 7.241182804107666, + "learning_rate": 9.540633136905578e-06, + "loss": 0.4792, + "step": 4103 + }, + { + "epoch": 0.05567010309278351, + "grad_norm": 5.457747936248779, + "learning_rate": 9.540496094285323e-06, + "loss": 0.3142, + "step": 4104 + }, + { + "epoch": 0.055683667932718395, + "grad_norm": 7.065516948699951, + "learning_rate": 9.540359051665068e-06, + "loss": 0.4914, + "step": 4105 + }, + { + "epoch": 0.05569723277265328, + "grad_norm": 9.57126522064209, + "learning_rate": 9.540222009044813e-06, + "loss": 0.6012, + "step": 4106 + }, + { + "epoch": 0.055710797612588174, + "grad_norm": 7.25575590133667, + "learning_rate": 9.540084966424559e-06, + "loss": 0.3132, + "step": 4107 + }, + { + "epoch": 0.05572436245252306, + "grad_norm": 6.859780788421631, + "learning_rate": 9.539947923804304e-06, + "loss": 0.4588, + "step": 4108 + }, + { + "epoch": 0.055737927292457946, + "grad_norm": 6.783545970916748, + "learning_rate": 9.539810881184049e-06, + "loss": 0.5298, + "step": 4109 + }, + { + "epoch": 0.05575149213239284, + "grad_norm": 6.923549175262451, + "learning_rate": 9.539673838563794e-06, + "loss": 0.4994, + "step": 4110 + }, + { + "epoch": 0.055765056972327726, + "grad_norm": 6.035670280456543, + "learning_rate": 9.53953679594354e-06, + "loss": 0.404, + "step": 4111 + }, + { + "epoch": 0.05577862181226262, + "grad_norm": 7.203039646148682, + "learning_rate": 9.539399753323284e-06, + "loss": 0.608, + "step": 4112 + }, + { + "epoch": 0.055792186652197505, + "grad_norm": 6.048893928527832, + "learning_rate": 9.53926271070303e-06, + "loss": 0.3728, + "step": 4113 + }, + { + "epoch": 0.05580575149213239, + "grad_norm": 7.501040935516357, + "learning_rate": 9.539125668082775e-06, + "loss": 0.4612, + "step": 4114 + }, + { + "epoch": 0.055819316332067284, + "grad_norm": 8.08423900604248, + "learning_rate": 9.53898862546252e-06, + "loss": 0.5972, + "step": 4115 + }, + { + "epoch": 0.05583288117200217, + "grad_norm": 7.387109756469727, + "learning_rate": 9.538851582842265e-06, + "loss": 0.5159, + "step": 4116 + }, + { + "epoch": 0.055846446011937056, + "grad_norm": 9.41187858581543, + "learning_rate": 9.538714540222009e-06, + "loss": 0.5985, + "step": 4117 + }, + { + "epoch": 0.05586001085187195, + "grad_norm": 7.880347728729248, + "learning_rate": 9.538577497601756e-06, + "loss": 0.3691, + "step": 4118 + }, + { + "epoch": 0.055873575691806836, + "grad_norm": 7.511735916137695, + "learning_rate": 9.5384404549815e-06, + "loss": 0.394, + "step": 4119 + }, + { + "epoch": 0.05588714053174173, + "grad_norm": 8.542572021484375, + "learning_rate": 9.538303412361244e-06, + "loss": 0.4999, + "step": 4120 + }, + { + "epoch": 0.055900705371676615, + "grad_norm": 7.3577165603637695, + "learning_rate": 9.53816636974099e-06, + "loss": 0.5576, + "step": 4121 + }, + { + "epoch": 0.0559142702116115, + "grad_norm": 4.93677282333374, + "learning_rate": 9.538029327120736e-06, + "loss": 0.3806, + "step": 4122 + }, + { + "epoch": 0.055927835051546394, + "grad_norm": 7.693540096282959, + "learning_rate": 9.537892284500481e-06, + "loss": 0.3926, + "step": 4123 + }, + { + "epoch": 0.05594139989148128, + "grad_norm": 6.346588134765625, + "learning_rate": 9.537755241880225e-06, + "loss": 0.4056, + "step": 4124 + }, + { + "epoch": 0.05595496473141617, + "grad_norm": 7.617770195007324, + "learning_rate": 9.53761819925997e-06, + "loss": 0.4286, + "step": 4125 + }, + { + "epoch": 0.05596852957135106, + "grad_norm": 9.148374557495117, + "learning_rate": 9.537481156639715e-06, + "loss": 0.5601, + "step": 4126 + }, + { + "epoch": 0.055982094411285946, + "grad_norm": 6.198970794677734, + "learning_rate": 9.53734411401946e-06, + "loss": 0.3854, + "step": 4127 + }, + { + "epoch": 0.05599565925122084, + "grad_norm": 6.690965175628662, + "learning_rate": 9.537207071399206e-06, + "loss": 0.3628, + "step": 4128 + }, + { + "epoch": 0.056009224091155725, + "grad_norm": 6.643925189971924, + "learning_rate": 9.53707002877895e-06, + "loss": 0.3415, + "step": 4129 + }, + { + "epoch": 0.05602278893109061, + "grad_norm": 9.467790603637695, + "learning_rate": 9.536932986158696e-06, + "loss": 0.6822, + "step": 4130 + }, + { + "epoch": 0.056036353771025504, + "grad_norm": 5.68523645401001, + "learning_rate": 9.536795943538441e-06, + "loss": 0.3317, + "step": 4131 + }, + { + "epoch": 0.05604991861096039, + "grad_norm": 7.5742058753967285, + "learning_rate": 9.536658900918186e-06, + "loss": 0.371, + "step": 4132 + }, + { + "epoch": 0.05606348345089528, + "grad_norm": 6.090840816497803, + "learning_rate": 9.536521858297932e-06, + "loss": 0.4172, + "step": 4133 + }, + { + "epoch": 0.05607704829083017, + "grad_norm": 4.493138313293457, + "learning_rate": 9.536384815677677e-06, + "loss": 0.2886, + "step": 4134 + }, + { + "epoch": 0.056090613130765056, + "grad_norm": 7.419286251068115, + "learning_rate": 9.536247773057422e-06, + "loss": 0.3414, + "step": 4135 + }, + { + "epoch": 0.05610417797069995, + "grad_norm": 6.215856552124023, + "learning_rate": 9.536110730437167e-06, + "loss": 0.3276, + "step": 4136 + }, + { + "epoch": 0.056117742810634835, + "grad_norm": 8.837665557861328, + "learning_rate": 9.535973687816912e-06, + "loss": 0.5422, + "step": 4137 + }, + { + "epoch": 0.05613130765056972, + "grad_norm": 7.743021488189697, + "learning_rate": 9.535836645196657e-06, + "loss": 0.3943, + "step": 4138 + }, + { + "epoch": 0.056144872490504615, + "grad_norm": 7.951735019683838, + "learning_rate": 9.535699602576403e-06, + "loss": 0.4472, + "step": 4139 + }, + { + "epoch": 0.0561584373304395, + "grad_norm": 7.74142599105835, + "learning_rate": 9.535562559956148e-06, + "loss": 0.5964, + "step": 4140 + }, + { + "epoch": 0.05617200217037439, + "grad_norm": 7.857154846191406, + "learning_rate": 9.535425517335893e-06, + "loss": 0.3689, + "step": 4141 + }, + { + "epoch": 0.05618556701030928, + "grad_norm": 7.625271797180176, + "learning_rate": 9.535288474715636e-06, + "loss": 0.5103, + "step": 4142 + }, + { + "epoch": 0.056199131850244166, + "grad_norm": 10.438984870910645, + "learning_rate": 9.535151432095382e-06, + "loss": 0.604, + "step": 4143 + }, + { + "epoch": 0.05621269669017906, + "grad_norm": 7.54406213760376, + "learning_rate": 9.535014389475128e-06, + "loss": 0.517, + "step": 4144 + }, + { + "epoch": 0.056226261530113945, + "grad_norm": 6.489705562591553, + "learning_rate": 9.534877346854872e-06, + "loss": 0.4343, + "step": 4145 + }, + { + "epoch": 0.05623982637004883, + "grad_norm": 5.963029384613037, + "learning_rate": 9.534740304234617e-06, + "loss": 0.3866, + "step": 4146 + }, + { + "epoch": 0.056253391209983725, + "grad_norm": 4.999396324157715, + "learning_rate": 9.534603261614362e-06, + "loss": 0.4132, + "step": 4147 + }, + { + "epoch": 0.05626695604991861, + "grad_norm": 5.719071388244629, + "learning_rate": 9.53446621899411e-06, + "loss": 0.4128, + "step": 4148 + }, + { + "epoch": 0.0562805208898535, + "grad_norm": 6.929529666900635, + "learning_rate": 9.534329176373853e-06, + "loss": 0.4214, + "step": 4149 + }, + { + "epoch": 0.05629408572978839, + "grad_norm": 6.40824556350708, + "learning_rate": 9.534192133753598e-06, + "loss": 0.4259, + "step": 4150 + }, + { + "epoch": 0.056307650569723276, + "grad_norm": 6.955355644226074, + "learning_rate": 9.534055091133343e-06, + "loss": 0.4956, + "step": 4151 + }, + { + "epoch": 0.05632121540965817, + "grad_norm": 7.936775207519531, + "learning_rate": 9.533918048513088e-06, + "loss": 0.4496, + "step": 4152 + }, + { + "epoch": 0.056334780249593056, + "grad_norm": 7.043481349945068, + "learning_rate": 9.533781005892833e-06, + "loss": 0.5254, + "step": 4153 + }, + { + "epoch": 0.05634834508952794, + "grad_norm": 6.358124732971191, + "learning_rate": 9.533643963272579e-06, + "loss": 0.4035, + "step": 4154 + }, + { + "epoch": 0.056361909929462835, + "grad_norm": 7.419431686401367, + "learning_rate": 9.533506920652324e-06, + "loss": 0.5604, + "step": 4155 + }, + { + "epoch": 0.05637547476939772, + "grad_norm": 7.382408618927002, + "learning_rate": 9.533369878032069e-06, + "loss": 0.4295, + "step": 4156 + }, + { + "epoch": 0.05638903960933261, + "grad_norm": 7.990595817565918, + "learning_rate": 9.533232835411814e-06, + "loss": 0.639, + "step": 4157 + }, + { + "epoch": 0.0564026044492675, + "grad_norm": 7.487813472747803, + "learning_rate": 9.53309579279156e-06, + "loss": 0.5266, + "step": 4158 + }, + { + "epoch": 0.056416169289202386, + "grad_norm": 6.609699249267578, + "learning_rate": 9.532958750171304e-06, + "loss": 0.3316, + "step": 4159 + }, + { + "epoch": 0.05642973412913728, + "grad_norm": 9.211618423461914, + "learning_rate": 9.532821707551048e-06, + "loss": 0.5462, + "step": 4160 + }, + { + "epoch": 0.056443298969072166, + "grad_norm": 7.354701995849609, + "learning_rate": 9.532684664930795e-06, + "loss": 0.4858, + "step": 4161 + }, + { + "epoch": 0.05645686380900705, + "grad_norm": 6.16267204284668, + "learning_rate": 9.53254762231054e-06, + "loss": 0.4856, + "step": 4162 + }, + { + "epoch": 0.056470428648941945, + "grad_norm": 7.946013450622559, + "learning_rate": 9.532410579690285e-06, + "loss": 0.5195, + "step": 4163 + }, + { + "epoch": 0.05648399348887683, + "grad_norm": 5.928383827209473, + "learning_rate": 9.532273537070029e-06, + "loss": 0.4278, + "step": 4164 + }, + { + "epoch": 0.05649755832881172, + "grad_norm": 6.334818363189697, + "learning_rate": 9.532136494449776e-06, + "loss": 0.4378, + "step": 4165 + }, + { + "epoch": 0.05651112316874661, + "grad_norm": 5.571518421173096, + "learning_rate": 9.53199945182952e-06, + "loss": 0.4053, + "step": 4166 + }, + { + "epoch": 0.056524688008681497, + "grad_norm": 9.457839965820312, + "learning_rate": 9.531862409209264e-06, + "loss": 0.7996, + "step": 4167 + }, + { + "epoch": 0.05653825284861639, + "grad_norm": 7.353999137878418, + "learning_rate": 9.53172536658901e-06, + "loss": 0.5518, + "step": 4168 + }, + { + "epoch": 0.056551817688551276, + "grad_norm": 6.596639633178711, + "learning_rate": 9.531588323968755e-06, + "loss": 0.4009, + "step": 4169 + }, + { + "epoch": 0.05656538252848616, + "grad_norm": 6.9712419509887695, + "learning_rate": 9.5314512813485e-06, + "loss": 0.4528, + "step": 4170 + }, + { + "epoch": 0.056578947368421055, + "grad_norm": 6.697897434234619, + "learning_rate": 9.531314238728245e-06, + "loss": 0.5304, + "step": 4171 + }, + { + "epoch": 0.05659251220835594, + "grad_norm": 7.912126541137695, + "learning_rate": 9.53117719610799e-06, + "loss": 0.4848, + "step": 4172 + }, + { + "epoch": 0.05660607704829083, + "grad_norm": 6.481448173522949, + "learning_rate": 9.531040153487735e-06, + "loss": 0.4329, + "step": 4173 + }, + { + "epoch": 0.05661964188822572, + "grad_norm": 6.28922700881958, + "learning_rate": 9.53090311086748e-06, + "loss": 0.4685, + "step": 4174 + }, + { + "epoch": 0.05663320672816061, + "grad_norm": 6.670666217803955, + "learning_rate": 9.530766068247226e-06, + "loss": 0.4953, + "step": 4175 + }, + { + "epoch": 0.0566467715680955, + "grad_norm": 6.222055435180664, + "learning_rate": 9.53062902562697e-06, + "loss": 0.3747, + "step": 4176 + }, + { + "epoch": 0.056660336408030386, + "grad_norm": 7.121455669403076, + "learning_rate": 9.530491983006716e-06, + "loss": 0.5157, + "step": 4177 + }, + { + "epoch": 0.05667390124796527, + "grad_norm": 7.921200752258301, + "learning_rate": 9.530354940386461e-06, + "loss": 0.5507, + "step": 4178 + }, + { + "epoch": 0.056687466087900165, + "grad_norm": 7.775735855102539, + "learning_rate": 9.530217897766206e-06, + "loss": 0.4857, + "step": 4179 + }, + { + "epoch": 0.05670103092783505, + "grad_norm": 4.79545783996582, + "learning_rate": 9.530080855145952e-06, + "loss": 0.3811, + "step": 4180 + }, + { + "epoch": 0.05671459576776994, + "grad_norm": 8.942028045654297, + "learning_rate": 9.529943812525697e-06, + "loss": 0.7145, + "step": 4181 + }, + { + "epoch": 0.05672816060770483, + "grad_norm": 7.455601692199707, + "learning_rate": 9.52980676990544e-06, + "loss": 0.5705, + "step": 4182 + }, + { + "epoch": 0.05674172544763972, + "grad_norm": 8.078659057617188, + "learning_rate": 9.529669727285187e-06, + "loss": 0.3962, + "step": 4183 + }, + { + "epoch": 0.05675529028757461, + "grad_norm": 6.185054302215576, + "learning_rate": 9.529532684664932e-06, + "loss": 0.5567, + "step": 4184 + }, + { + "epoch": 0.056768855127509496, + "grad_norm": 6.9535908699035645, + "learning_rate": 9.529395642044676e-06, + "loss": 0.5616, + "step": 4185 + }, + { + "epoch": 0.05678241996744438, + "grad_norm": 7.241412162780762, + "learning_rate": 9.529258599424421e-06, + "loss": 0.3479, + "step": 4186 + }, + { + "epoch": 0.056795984807379275, + "grad_norm": 4.792078018188477, + "learning_rate": 9.529121556804168e-06, + "loss": 0.358, + "step": 4187 + }, + { + "epoch": 0.05680954964731416, + "grad_norm": 7.351562023162842, + "learning_rate": 9.528984514183911e-06, + "loss": 0.5947, + "step": 4188 + }, + { + "epoch": 0.05682311448724905, + "grad_norm": 6.312559127807617, + "learning_rate": 9.528847471563656e-06, + "loss": 0.3994, + "step": 4189 + }, + { + "epoch": 0.05683667932718394, + "grad_norm": 7.006595134735107, + "learning_rate": 9.528710428943402e-06, + "loss": 0.4609, + "step": 4190 + }, + { + "epoch": 0.05685024416711883, + "grad_norm": 9.111223220825195, + "learning_rate": 9.528573386323149e-06, + "loss": 0.5611, + "step": 4191 + }, + { + "epoch": 0.05686380900705372, + "grad_norm": 6.596911430358887, + "learning_rate": 9.528436343702892e-06, + "loss": 0.3153, + "step": 4192 + }, + { + "epoch": 0.056877373846988606, + "grad_norm": 7.428248405456543, + "learning_rate": 9.528299301082637e-06, + "loss": 0.5126, + "step": 4193 + }, + { + "epoch": 0.05689093868692349, + "grad_norm": 7.173488140106201, + "learning_rate": 9.528162258462382e-06, + "loss": 0.5381, + "step": 4194 + }, + { + "epoch": 0.056904503526858385, + "grad_norm": 6.860627174377441, + "learning_rate": 9.528025215842128e-06, + "loss": 0.3369, + "step": 4195 + }, + { + "epoch": 0.05691806836679327, + "grad_norm": 8.329655647277832, + "learning_rate": 9.527888173221873e-06, + "loss": 0.5714, + "step": 4196 + }, + { + "epoch": 0.05693163320672816, + "grad_norm": 8.160859107971191, + "learning_rate": 9.527751130601618e-06, + "loss": 0.634, + "step": 4197 + }, + { + "epoch": 0.05694519804666305, + "grad_norm": 5.657465934753418, + "learning_rate": 9.527614087981363e-06, + "loss": 0.2929, + "step": 4198 + }, + { + "epoch": 0.05695876288659794, + "grad_norm": 6.559403419494629, + "learning_rate": 9.527477045361108e-06, + "loss": 0.3336, + "step": 4199 + }, + { + "epoch": 0.05697232772653283, + "grad_norm": 5.446915149688721, + "learning_rate": 9.527340002740853e-06, + "loss": 0.4362, + "step": 4200 + }, + { + "epoch": 0.056985892566467716, + "grad_norm": 7.837641716003418, + "learning_rate": 9.527202960120599e-06, + "loss": 0.4415, + "step": 4201 + }, + { + "epoch": 0.0569994574064026, + "grad_norm": 7.387453556060791, + "learning_rate": 9.527065917500344e-06, + "loss": 0.4282, + "step": 4202 + }, + { + "epoch": 0.057013022246337496, + "grad_norm": 6.099148750305176, + "learning_rate": 9.526928874880087e-06, + "loss": 0.3843, + "step": 4203 + }, + { + "epoch": 0.05702658708627238, + "grad_norm": 6.662630558013916, + "learning_rate": 9.526791832259834e-06, + "loss": 0.4489, + "step": 4204 + }, + { + "epoch": 0.05704015192620727, + "grad_norm": 6.814840316772461, + "learning_rate": 9.52665478963958e-06, + "loss": 0.3799, + "step": 4205 + }, + { + "epoch": 0.05705371676614216, + "grad_norm": 8.855238914489746, + "learning_rate": 9.526517747019325e-06, + "loss": 0.4769, + "step": 4206 + }, + { + "epoch": 0.05706728160607705, + "grad_norm": 6.054271697998047, + "learning_rate": 9.526380704399068e-06, + "loss": 0.4861, + "step": 4207 + }, + { + "epoch": 0.05708084644601194, + "grad_norm": 7.2832417488098145, + "learning_rate": 9.526243661778815e-06, + "loss": 0.3592, + "step": 4208 + }, + { + "epoch": 0.057094411285946826, + "grad_norm": 9.797404289245605, + "learning_rate": 9.52610661915856e-06, + "loss": 0.4768, + "step": 4209 + }, + { + "epoch": 0.05710797612588171, + "grad_norm": 6.120962142944336, + "learning_rate": 9.525969576538304e-06, + "loss": 0.3361, + "step": 4210 + }, + { + "epoch": 0.057121540965816606, + "grad_norm": 7.648814678192139, + "learning_rate": 9.525832533918049e-06, + "loss": 0.585, + "step": 4211 + }, + { + "epoch": 0.05713510580575149, + "grad_norm": 10.157726287841797, + "learning_rate": 9.525695491297794e-06, + "loss": 0.6277, + "step": 4212 + }, + { + "epoch": 0.05714867064568638, + "grad_norm": 7.538335800170898, + "learning_rate": 9.525558448677539e-06, + "loss": 0.4469, + "step": 4213 + }, + { + "epoch": 0.05716223548562127, + "grad_norm": 6.649049282073975, + "learning_rate": 9.525421406057284e-06, + "loss": 0.5059, + "step": 4214 + }, + { + "epoch": 0.05717580032555616, + "grad_norm": 5.081871509552002, + "learning_rate": 9.52528436343703e-06, + "loss": 0.2593, + "step": 4215 + }, + { + "epoch": 0.05718936516549105, + "grad_norm": 8.208040237426758, + "learning_rate": 9.525147320816775e-06, + "loss": 0.447, + "step": 4216 + }, + { + "epoch": 0.05720293000542594, + "grad_norm": 6.849217414855957, + "learning_rate": 9.52501027819652e-06, + "loss": 0.5458, + "step": 4217 + }, + { + "epoch": 0.05721649484536082, + "grad_norm": 8.361808776855469, + "learning_rate": 9.524873235576265e-06, + "loss": 0.4853, + "step": 4218 + }, + { + "epoch": 0.057230059685295716, + "grad_norm": 8.893573760986328, + "learning_rate": 9.52473619295601e-06, + "loss": 0.5399, + "step": 4219 + }, + { + "epoch": 0.0572436245252306, + "grad_norm": 6.600849628448486, + "learning_rate": 9.524599150335755e-06, + "loss": 0.4934, + "step": 4220 + }, + { + "epoch": 0.05725718936516549, + "grad_norm": 7.443354606628418, + "learning_rate": 9.5244621077155e-06, + "loss": 0.3255, + "step": 4221 + }, + { + "epoch": 0.05727075420510038, + "grad_norm": 8.348747253417969, + "learning_rate": 9.524325065095246e-06, + "loss": 0.5823, + "step": 4222 + }, + { + "epoch": 0.05728431904503527, + "grad_norm": 8.935240745544434, + "learning_rate": 9.524188022474991e-06, + "loss": 0.3789, + "step": 4223 + }, + { + "epoch": 0.05729788388497016, + "grad_norm": 7.362415790557861, + "learning_rate": 9.524050979854736e-06, + "loss": 0.5556, + "step": 4224 + }, + { + "epoch": 0.05731144872490505, + "grad_norm": 7.286780834197998, + "learning_rate": 9.52391393723448e-06, + "loss": 0.4845, + "step": 4225 + }, + { + "epoch": 0.05732501356483993, + "grad_norm": 7.429903507232666, + "learning_rate": 9.523776894614226e-06, + "loss": 0.4061, + "step": 4226 + }, + { + "epoch": 0.057338578404774826, + "grad_norm": 7.770839691162109, + "learning_rate": 9.523639851993972e-06, + "loss": 0.4504, + "step": 4227 + }, + { + "epoch": 0.05735214324470971, + "grad_norm": 9.097225189208984, + "learning_rate": 9.523502809373715e-06, + "loss": 0.3617, + "step": 4228 + }, + { + "epoch": 0.0573657080846446, + "grad_norm": 6.412201881408691, + "learning_rate": 9.52336576675346e-06, + "loss": 0.4511, + "step": 4229 + }, + { + "epoch": 0.05737927292457949, + "grad_norm": 6.815554141998291, + "learning_rate": 9.523228724133207e-06, + "loss": 0.4417, + "step": 4230 + }, + { + "epoch": 0.05739283776451438, + "grad_norm": 8.386974334716797, + "learning_rate": 9.523091681512952e-06, + "loss": 0.5956, + "step": 4231 + }, + { + "epoch": 0.05740640260444927, + "grad_norm": 6.767075061798096, + "learning_rate": 9.522954638892696e-06, + "loss": 0.3813, + "step": 4232 + }, + { + "epoch": 0.05741996744438416, + "grad_norm": 5.200197219848633, + "learning_rate": 9.522817596272441e-06, + "loss": 0.2562, + "step": 4233 + }, + { + "epoch": 0.05743353228431904, + "grad_norm": 6.6348466873168945, + "learning_rate": 9.522680553652188e-06, + "loss": 0.3738, + "step": 4234 + }, + { + "epoch": 0.057447097124253936, + "grad_norm": 10.053658485412598, + "learning_rate": 9.522543511031931e-06, + "loss": 0.3722, + "step": 4235 + }, + { + "epoch": 0.05746066196418882, + "grad_norm": 7.908453464508057, + "learning_rate": 9.522406468411676e-06, + "loss": 0.4524, + "step": 4236 + }, + { + "epoch": 0.05747422680412371, + "grad_norm": 5.793609619140625, + "learning_rate": 9.522269425791422e-06, + "loss": 0.4422, + "step": 4237 + }, + { + "epoch": 0.0574877916440586, + "grad_norm": 7.619974136352539, + "learning_rate": 9.522132383171167e-06, + "loss": 0.4348, + "step": 4238 + }, + { + "epoch": 0.05750135648399349, + "grad_norm": 7.1135711669921875, + "learning_rate": 9.521995340550912e-06, + "loss": 0.4894, + "step": 4239 + }, + { + "epoch": 0.05751492132392838, + "grad_norm": 7.235686779022217, + "learning_rate": 9.521858297930657e-06, + "loss": 0.6516, + "step": 4240 + }, + { + "epoch": 0.05752848616386327, + "grad_norm": 9.146217346191406, + "learning_rate": 9.521721255310402e-06, + "loss": 0.4208, + "step": 4241 + }, + { + "epoch": 0.05754205100379815, + "grad_norm": 6.3008317947387695, + "learning_rate": 9.521584212690148e-06, + "loss": 0.3142, + "step": 4242 + }, + { + "epoch": 0.057555615843733046, + "grad_norm": 9.288795471191406, + "learning_rate": 9.521447170069893e-06, + "loss": 0.4521, + "step": 4243 + }, + { + "epoch": 0.05756918068366793, + "grad_norm": 6.542370319366455, + "learning_rate": 9.521310127449638e-06, + "loss": 0.3074, + "step": 4244 + }, + { + "epoch": 0.05758274552360282, + "grad_norm": 8.490067481994629, + "learning_rate": 9.521173084829383e-06, + "loss": 0.3931, + "step": 4245 + }, + { + "epoch": 0.05759631036353771, + "grad_norm": 6.6238627433776855, + "learning_rate": 9.521036042209128e-06, + "loss": 0.2859, + "step": 4246 + }, + { + "epoch": 0.0576098752034726, + "grad_norm": 4.920619487762451, + "learning_rate": 9.520898999588873e-06, + "loss": 0.3434, + "step": 4247 + }, + { + "epoch": 0.05762344004340749, + "grad_norm": 5.122717380523682, + "learning_rate": 9.520761956968619e-06, + "loss": 0.2908, + "step": 4248 + }, + { + "epoch": 0.05763700488334238, + "grad_norm": 7.792344570159912, + "learning_rate": 9.520624914348364e-06, + "loss": 0.4655, + "step": 4249 + }, + { + "epoch": 0.05765056972327726, + "grad_norm": 7.183361530303955, + "learning_rate": 9.520487871728107e-06, + "loss": 0.428, + "step": 4250 + }, + { + "epoch": 0.057664134563212156, + "grad_norm": 7.294766902923584, + "learning_rate": 9.520350829107852e-06, + "loss": 0.5177, + "step": 4251 + }, + { + "epoch": 0.05767769940314704, + "grad_norm": 8.472505569458008, + "learning_rate": 9.5202137864876e-06, + "loss": 0.6232, + "step": 4252 + }, + { + "epoch": 0.05769126424308193, + "grad_norm": 7.00140380859375, + "learning_rate": 9.520076743867343e-06, + "loss": 0.5833, + "step": 4253 + }, + { + "epoch": 0.05770482908301682, + "grad_norm": 6.475766658782959, + "learning_rate": 9.519939701247088e-06, + "loss": 0.2779, + "step": 4254 + }, + { + "epoch": 0.05771839392295171, + "grad_norm": 6.892730236053467, + "learning_rate": 9.519802658626833e-06, + "loss": 0.3747, + "step": 4255 + }, + { + "epoch": 0.0577319587628866, + "grad_norm": 6.7099690437316895, + "learning_rate": 9.51966561600658e-06, + "loss": 0.3761, + "step": 4256 + }, + { + "epoch": 0.05774552360282149, + "grad_norm": 7.462748050689697, + "learning_rate": 9.519528573386324e-06, + "loss": 0.5113, + "step": 4257 + }, + { + "epoch": 0.05775908844275637, + "grad_norm": 7.009143829345703, + "learning_rate": 9.519391530766069e-06, + "loss": 0.4922, + "step": 4258 + }, + { + "epoch": 0.057772653282691266, + "grad_norm": 5.744630336761475, + "learning_rate": 9.519254488145814e-06, + "loss": 0.3321, + "step": 4259 + }, + { + "epoch": 0.05778621812262615, + "grad_norm": 6.243941307067871, + "learning_rate": 9.519117445525559e-06, + "loss": 0.4864, + "step": 4260 + }, + { + "epoch": 0.05779978296256104, + "grad_norm": 6.928425312042236, + "learning_rate": 9.518980402905304e-06, + "loss": 0.4178, + "step": 4261 + }, + { + "epoch": 0.05781334780249593, + "grad_norm": 6.417448997497559, + "learning_rate": 9.51884336028505e-06, + "loss": 0.3074, + "step": 4262 + }, + { + "epoch": 0.05782691264243082, + "grad_norm": 6.333227157592773, + "learning_rate": 9.518706317664795e-06, + "loss": 0.4782, + "step": 4263 + }, + { + "epoch": 0.05784047748236571, + "grad_norm": 6.916779041290283, + "learning_rate": 9.51856927504454e-06, + "loss": 0.4559, + "step": 4264 + }, + { + "epoch": 0.0578540423223006, + "grad_norm": 5.938711166381836, + "learning_rate": 9.518432232424285e-06, + "loss": 0.3962, + "step": 4265 + }, + { + "epoch": 0.05786760716223548, + "grad_norm": 9.80137825012207, + "learning_rate": 9.51829518980403e-06, + "loss": 0.5186, + "step": 4266 + }, + { + "epoch": 0.05788117200217038, + "grad_norm": 5.259732246398926, + "learning_rate": 9.518158147183775e-06, + "loss": 0.3308, + "step": 4267 + }, + { + "epoch": 0.05789473684210526, + "grad_norm": 5.509551525115967, + "learning_rate": 9.518021104563519e-06, + "loss": 0.3196, + "step": 4268 + }, + { + "epoch": 0.05790830168204015, + "grad_norm": 6.878870487213135, + "learning_rate": 9.517884061943266e-06, + "loss": 0.4473, + "step": 4269 + }, + { + "epoch": 0.05792186652197504, + "grad_norm": 6.913940906524658, + "learning_rate": 9.517747019323011e-06, + "loss": 0.5117, + "step": 4270 + }, + { + "epoch": 0.05793543136190993, + "grad_norm": 5.619302749633789, + "learning_rate": 9.517609976702756e-06, + "loss": 0.4227, + "step": 4271 + }, + { + "epoch": 0.05794899620184482, + "grad_norm": 5.867551326751709, + "learning_rate": 9.5174729340825e-06, + "loss": 0.5084, + "step": 4272 + }, + { + "epoch": 0.05796256104177971, + "grad_norm": 5.095597743988037, + "learning_rate": 9.517335891462246e-06, + "loss": 0.3529, + "step": 4273 + }, + { + "epoch": 0.057976125881714594, + "grad_norm": 6.372938632965088, + "learning_rate": 9.517198848841992e-06, + "loss": 0.4535, + "step": 4274 + }, + { + "epoch": 0.05798969072164949, + "grad_norm": 7.448460578918457, + "learning_rate": 9.517061806221735e-06, + "loss": 0.4963, + "step": 4275 + }, + { + "epoch": 0.05800325556158437, + "grad_norm": 6.471002101898193, + "learning_rate": 9.51692476360148e-06, + "loss": 0.4024, + "step": 4276 + }, + { + "epoch": 0.05801682040151926, + "grad_norm": 5.59093713760376, + "learning_rate": 9.516787720981225e-06, + "loss": 0.4188, + "step": 4277 + }, + { + "epoch": 0.05803038524145415, + "grad_norm": 5.795616149902344, + "learning_rate": 9.51665067836097e-06, + "loss": 0.4786, + "step": 4278 + }, + { + "epoch": 0.05804395008138904, + "grad_norm": 6.690035820007324, + "learning_rate": 9.516513635740716e-06, + "loss": 0.3173, + "step": 4279 + }, + { + "epoch": 0.05805751492132393, + "grad_norm": 5.195732116699219, + "learning_rate": 9.516376593120461e-06, + "loss": 0.4226, + "step": 4280 + }, + { + "epoch": 0.05807107976125882, + "grad_norm": 6.298871040344238, + "learning_rate": 9.516239550500206e-06, + "loss": 0.3943, + "step": 4281 + }, + { + "epoch": 0.058084644601193704, + "grad_norm": 6.748794078826904, + "learning_rate": 9.516102507879951e-06, + "loss": 0.4029, + "step": 4282 + }, + { + "epoch": 0.0580982094411286, + "grad_norm": 7.067800998687744, + "learning_rate": 9.515965465259697e-06, + "loss": 0.5859, + "step": 4283 + }, + { + "epoch": 0.05811177428106348, + "grad_norm": 6.708624839782715, + "learning_rate": 9.515828422639442e-06, + "loss": 0.5783, + "step": 4284 + }, + { + "epoch": 0.05812533912099837, + "grad_norm": 8.659140586853027, + "learning_rate": 9.515691380019187e-06, + "loss": 0.435, + "step": 4285 + }, + { + "epoch": 0.05813890396093326, + "grad_norm": 9.124504089355469, + "learning_rate": 9.515554337398932e-06, + "loss": 0.4566, + "step": 4286 + }, + { + "epoch": 0.05815246880086815, + "grad_norm": 7.382999897003174, + "learning_rate": 9.515417294778677e-06, + "loss": 0.4127, + "step": 4287 + }, + { + "epoch": 0.05816603364080304, + "grad_norm": 6.037960052490234, + "learning_rate": 9.515280252158422e-06, + "loss": 0.2892, + "step": 4288 + }, + { + "epoch": 0.05817959848073793, + "grad_norm": 7.261973857879639, + "learning_rate": 9.515143209538168e-06, + "loss": 0.3745, + "step": 4289 + }, + { + "epoch": 0.058193163320672814, + "grad_norm": 5.359521389007568, + "learning_rate": 9.515006166917913e-06, + "loss": 0.4116, + "step": 4290 + }, + { + "epoch": 0.05820672816060771, + "grad_norm": 5.907482624053955, + "learning_rate": 9.514869124297658e-06, + "loss": 0.3783, + "step": 4291 + }, + { + "epoch": 0.05822029300054259, + "grad_norm": 5.990573883056641, + "learning_rate": 9.514732081677403e-06, + "loss": 0.2774, + "step": 4292 + }, + { + "epoch": 0.05823385784047748, + "grad_norm": 7.419851779937744, + "learning_rate": 9.514595039057147e-06, + "loss": 0.535, + "step": 4293 + }, + { + "epoch": 0.05824742268041237, + "grad_norm": 6.0232768058776855, + "learning_rate": 9.514457996436892e-06, + "loss": 0.4913, + "step": 4294 + }, + { + "epoch": 0.05826098752034726, + "grad_norm": 6.996912002563477, + "learning_rate": 9.514320953816639e-06, + "loss": 0.4338, + "step": 4295 + }, + { + "epoch": 0.05827455236028215, + "grad_norm": 7.029957294464111, + "learning_rate": 9.514183911196382e-06, + "loss": 0.4416, + "step": 4296 + }, + { + "epoch": 0.05828811720021704, + "grad_norm": 7.405010223388672, + "learning_rate": 9.514046868576127e-06, + "loss": 0.3396, + "step": 4297 + }, + { + "epoch": 0.058301682040151924, + "grad_norm": 7.603142261505127, + "learning_rate": 9.513909825955872e-06, + "loss": 0.3794, + "step": 4298 + }, + { + "epoch": 0.05831524688008682, + "grad_norm": 5.736361503601074, + "learning_rate": 9.51377278333562e-06, + "loss": 0.3516, + "step": 4299 + }, + { + "epoch": 0.0583288117200217, + "grad_norm": 4.840360164642334, + "learning_rate": 9.513635740715363e-06, + "loss": 0.2519, + "step": 4300 + }, + { + "epoch": 0.05834237655995659, + "grad_norm": 6.485153675079346, + "learning_rate": 9.513498698095108e-06, + "loss": 0.4126, + "step": 4301 + }, + { + "epoch": 0.05835594139989148, + "grad_norm": 8.045442581176758, + "learning_rate": 9.513361655474853e-06, + "loss": 0.6331, + "step": 4302 + }, + { + "epoch": 0.05836950623982637, + "grad_norm": 7.732957363128662, + "learning_rate": 9.513224612854598e-06, + "loss": 0.5064, + "step": 4303 + }, + { + "epoch": 0.05838307107976126, + "grad_norm": 7.127298831939697, + "learning_rate": 9.513087570234344e-06, + "loss": 0.4423, + "step": 4304 + }, + { + "epoch": 0.05839663591969615, + "grad_norm": 6.900515079498291, + "learning_rate": 9.512950527614089e-06, + "loss": 0.4049, + "step": 4305 + }, + { + "epoch": 0.058410200759631034, + "grad_norm": 7.895124435424805, + "learning_rate": 9.512813484993834e-06, + "loss": 0.464, + "step": 4306 + }, + { + "epoch": 0.05842376559956593, + "grad_norm": 9.255189895629883, + "learning_rate": 9.512676442373579e-06, + "loss": 0.5981, + "step": 4307 + }, + { + "epoch": 0.05843733043950081, + "grad_norm": 6.914351940155029, + "learning_rate": 9.512539399753324e-06, + "loss": 0.3439, + "step": 4308 + }, + { + "epoch": 0.0584508952794357, + "grad_norm": 5.998909950256348, + "learning_rate": 9.51240235713307e-06, + "loss": 0.406, + "step": 4309 + }, + { + "epoch": 0.05846446011937059, + "grad_norm": 7.504735469818115, + "learning_rate": 9.512265314512815e-06, + "loss": 0.5795, + "step": 4310 + }, + { + "epoch": 0.05847802495930548, + "grad_norm": 8.014142990112305, + "learning_rate": 9.512128271892558e-06, + "loss": 0.3317, + "step": 4311 + }, + { + "epoch": 0.05849158979924037, + "grad_norm": 7.547921180725098, + "learning_rate": 9.511991229272305e-06, + "loss": 0.4017, + "step": 4312 + }, + { + "epoch": 0.05850515463917526, + "grad_norm": 6.525592803955078, + "learning_rate": 9.51185418665205e-06, + "loss": 0.4516, + "step": 4313 + }, + { + "epoch": 0.058518719479110144, + "grad_norm": 7.8848395347595215, + "learning_rate": 9.511717144031795e-06, + "loss": 0.3885, + "step": 4314 + }, + { + "epoch": 0.05853228431904504, + "grad_norm": 6.738670825958252, + "learning_rate": 9.511580101411539e-06, + "loss": 0.3536, + "step": 4315 + }, + { + "epoch": 0.058545849158979923, + "grad_norm": 5.859437465667725, + "learning_rate": 9.511443058791286e-06, + "loss": 0.3172, + "step": 4316 + }, + { + "epoch": 0.05855941399891481, + "grad_norm": 9.622657775878906, + "learning_rate": 9.511306016171031e-06, + "loss": 0.5437, + "step": 4317 + }, + { + "epoch": 0.0585729788388497, + "grad_norm": 7.662573337554932, + "learning_rate": 9.511168973550774e-06, + "loss": 0.6112, + "step": 4318 + }, + { + "epoch": 0.05858654367878459, + "grad_norm": 4.684839725494385, + "learning_rate": 9.51103193093052e-06, + "loss": 0.2514, + "step": 4319 + }, + { + "epoch": 0.05860010851871948, + "grad_norm": 8.644983291625977, + "learning_rate": 9.510894888310265e-06, + "loss": 0.4111, + "step": 4320 + }, + { + "epoch": 0.05861367335865437, + "grad_norm": 5.771159648895264, + "learning_rate": 9.51075784569001e-06, + "loss": 0.3172, + "step": 4321 + }, + { + "epoch": 0.058627238198589254, + "grad_norm": 9.972764015197754, + "learning_rate": 9.510620803069755e-06, + "loss": 0.6354, + "step": 4322 + }, + { + "epoch": 0.05864080303852415, + "grad_norm": 6.624433517456055, + "learning_rate": 9.5104837604495e-06, + "loss": 0.3547, + "step": 4323 + }, + { + "epoch": 0.058654367878459034, + "grad_norm": 8.031301498413086, + "learning_rate": 9.510346717829245e-06, + "loss": 0.3289, + "step": 4324 + }, + { + "epoch": 0.05866793271839392, + "grad_norm": 6.652493953704834, + "learning_rate": 9.51020967520899e-06, + "loss": 0.4026, + "step": 4325 + }, + { + "epoch": 0.05868149755832881, + "grad_norm": 7.266416549682617, + "learning_rate": 9.510072632588736e-06, + "loss": 0.4049, + "step": 4326 + }, + { + "epoch": 0.0586950623982637, + "grad_norm": 9.784910202026367, + "learning_rate": 9.509935589968481e-06, + "loss": 0.5365, + "step": 4327 + }, + { + "epoch": 0.05870862723819859, + "grad_norm": 6.81508207321167, + "learning_rate": 9.509798547348226e-06, + "loss": 0.4131, + "step": 4328 + }, + { + "epoch": 0.05872219207813348, + "grad_norm": 5.760438919067383, + "learning_rate": 9.509661504727971e-06, + "loss": 0.3698, + "step": 4329 + }, + { + "epoch": 0.058735756918068364, + "grad_norm": 7.417153835296631, + "learning_rate": 9.509524462107717e-06, + "loss": 0.4916, + "step": 4330 + }, + { + "epoch": 0.05874932175800326, + "grad_norm": 5.4909586906433105, + "learning_rate": 9.509387419487462e-06, + "loss": 0.4877, + "step": 4331 + }, + { + "epoch": 0.058762886597938144, + "grad_norm": 9.252147674560547, + "learning_rate": 9.509250376867207e-06, + "loss": 0.3967, + "step": 4332 + }, + { + "epoch": 0.05877645143787303, + "grad_norm": 7.575932502746582, + "learning_rate": 9.50911333424695e-06, + "loss": 0.3575, + "step": 4333 + }, + { + "epoch": 0.05879001627780792, + "grad_norm": 8.144309043884277, + "learning_rate": 9.508976291626697e-06, + "loss": 0.4448, + "step": 4334 + }, + { + "epoch": 0.05880358111774281, + "grad_norm": 6.577676296234131, + "learning_rate": 9.508839249006442e-06, + "loss": 0.3055, + "step": 4335 + }, + { + "epoch": 0.0588171459576777, + "grad_norm": 7.094230651855469, + "learning_rate": 9.508702206386186e-06, + "loss": 0.4132, + "step": 4336 + }, + { + "epoch": 0.05883071079761259, + "grad_norm": 7.453175067901611, + "learning_rate": 9.508565163765931e-06, + "loss": 0.4001, + "step": 4337 + }, + { + "epoch": 0.058844275637547475, + "grad_norm": 5.96773099899292, + "learning_rate": 9.508428121145678e-06, + "loss": 0.3087, + "step": 4338 + }, + { + "epoch": 0.05885784047748237, + "grad_norm": 7.850919246673584, + "learning_rate": 9.508291078525423e-06, + "loss": 0.4591, + "step": 4339 + }, + { + "epoch": 0.058871405317417254, + "grad_norm": 9.217055320739746, + "learning_rate": 9.508154035905167e-06, + "loss": 0.5901, + "step": 4340 + }, + { + "epoch": 0.05888497015735214, + "grad_norm": 6.161579608917236, + "learning_rate": 9.508016993284912e-06, + "loss": 0.3372, + "step": 4341 + }, + { + "epoch": 0.05889853499728703, + "grad_norm": 5.297241687774658, + "learning_rate": 9.507879950664659e-06, + "loss": 0.2816, + "step": 4342 + }, + { + "epoch": 0.05891209983722192, + "grad_norm": 6.89314603805542, + "learning_rate": 9.507742908044402e-06, + "loss": 0.38, + "step": 4343 + }, + { + "epoch": 0.05892566467715681, + "grad_norm": 6.966701984405518, + "learning_rate": 9.507605865424147e-06, + "loss": 0.4003, + "step": 4344 + }, + { + "epoch": 0.0589392295170917, + "grad_norm": 4.778575420379639, + "learning_rate": 9.507468822803893e-06, + "loss": 0.2902, + "step": 4345 + }, + { + "epoch": 0.058952794357026585, + "grad_norm": 6.545941352844238, + "learning_rate": 9.507331780183638e-06, + "loss": 0.3775, + "step": 4346 + }, + { + "epoch": 0.05896635919696148, + "grad_norm": 7.347668170928955, + "learning_rate": 9.507194737563383e-06, + "loss": 0.3566, + "step": 4347 + }, + { + "epoch": 0.058979924036896364, + "grad_norm": 5.372671127319336, + "learning_rate": 9.507057694943128e-06, + "loss": 0.2942, + "step": 4348 + }, + { + "epoch": 0.05899348887683125, + "grad_norm": 6.250411510467529, + "learning_rate": 9.506920652322873e-06, + "loss": 0.393, + "step": 4349 + }, + { + "epoch": 0.05900705371676614, + "grad_norm": 8.113032341003418, + "learning_rate": 9.506783609702618e-06, + "loss": 0.4071, + "step": 4350 + }, + { + "epoch": 0.05902061855670103, + "grad_norm": 7.120643615722656, + "learning_rate": 9.506646567082364e-06, + "loss": 0.3971, + "step": 4351 + }, + { + "epoch": 0.05903418339663592, + "grad_norm": 8.595908164978027, + "learning_rate": 9.506509524462109e-06, + "loss": 0.5667, + "step": 4352 + }, + { + "epoch": 0.05904774823657081, + "grad_norm": 8.357598304748535, + "learning_rate": 9.506372481841854e-06, + "loss": 0.5068, + "step": 4353 + }, + { + "epoch": 0.059061313076505695, + "grad_norm": 9.967005729675293, + "learning_rate": 9.506235439221599e-06, + "loss": 0.3969, + "step": 4354 + }, + { + "epoch": 0.05907487791644059, + "grad_norm": 5.871894836425781, + "learning_rate": 9.506098396601344e-06, + "loss": 0.4236, + "step": 4355 + }, + { + "epoch": 0.059088442756375474, + "grad_norm": 9.404584884643555, + "learning_rate": 9.50596135398109e-06, + "loss": 0.6124, + "step": 4356 + }, + { + "epoch": 0.05910200759631036, + "grad_norm": 6.064000606536865, + "learning_rate": 9.505824311360835e-06, + "loss": 0.3257, + "step": 4357 + }, + { + "epoch": 0.05911557243624525, + "grad_norm": 7.557901382446289, + "learning_rate": 9.505687268740578e-06, + "loss": 0.4041, + "step": 4358 + }, + { + "epoch": 0.05912913727618014, + "grad_norm": 8.263550758361816, + "learning_rate": 9.505550226120325e-06, + "loss": 0.4124, + "step": 4359 + }, + { + "epoch": 0.05914270211611503, + "grad_norm": 6.1635847091674805, + "learning_rate": 9.50541318350007e-06, + "loss": 0.4196, + "step": 4360 + }, + { + "epoch": 0.05915626695604992, + "grad_norm": 7.64167594909668, + "learning_rate": 9.505276140879814e-06, + "loss": 0.3112, + "step": 4361 + }, + { + "epoch": 0.059169831795984805, + "grad_norm": 5.661856651306152, + "learning_rate": 9.505139098259559e-06, + "loss": 0.3511, + "step": 4362 + }, + { + "epoch": 0.0591833966359197, + "grad_norm": 6.983053207397461, + "learning_rate": 9.505002055639304e-06, + "loss": 0.3452, + "step": 4363 + }, + { + "epoch": 0.059196961475854584, + "grad_norm": 6.154072284698486, + "learning_rate": 9.504865013019051e-06, + "loss": 0.3518, + "step": 4364 + }, + { + "epoch": 0.05921052631578947, + "grad_norm": 6.7975239753723145, + "learning_rate": 9.504727970398794e-06, + "loss": 0.3541, + "step": 4365 + }, + { + "epoch": 0.059224091155724363, + "grad_norm": 7.467531681060791, + "learning_rate": 9.50459092777854e-06, + "loss": 0.427, + "step": 4366 + }, + { + "epoch": 0.05923765599565925, + "grad_norm": 9.969931602478027, + "learning_rate": 9.504453885158285e-06, + "loss": 0.5275, + "step": 4367 + }, + { + "epoch": 0.05925122083559414, + "grad_norm": 7.039157390594482, + "learning_rate": 9.50431684253803e-06, + "loss": 0.3867, + "step": 4368 + }, + { + "epoch": 0.05926478567552903, + "grad_norm": 8.324054718017578, + "learning_rate": 9.504179799917775e-06, + "loss": 0.612, + "step": 4369 + }, + { + "epoch": 0.059278350515463915, + "grad_norm": 4.953640937805176, + "learning_rate": 9.50404275729752e-06, + "loss": 0.3555, + "step": 4370 + }, + { + "epoch": 0.05929191535539881, + "grad_norm": 7.4825005531311035, + "learning_rate": 9.503905714677265e-06, + "loss": 0.392, + "step": 4371 + }, + { + "epoch": 0.059305480195333694, + "grad_norm": 6.686773777008057, + "learning_rate": 9.50376867205701e-06, + "loss": 0.4562, + "step": 4372 + }, + { + "epoch": 0.05931904503526858, + "grad_norm": 9.705451011657715, + "learning_rate": 9.503631629436756e-06, + "loss": 0.3732, + "step": 4373 + }, + { + "epoch": 0.059332609875203474, + "grad_norm": 6.056288719177246, + "learning_rate": 9.503494586816501e-06, + "loss": 0.3327, + "step": 4374 + }, + { + "epoch": 0.05934617471513836, + "grad_norm": 8.217073440551758, + "learning_rate": 9.503357544196246e-06, + "loss": 0.4405, + "step": 4375 + }, + { + "epoch": 0.05935973955507325, + "grad_norm": 10.602301597595215, + "learning_rate": 9.50322050157599e-06, + "loss": 0.6816, + "step": 4376 + }, + { + "epoch": 0.05937330439500814, + "grad_norm": 5.682609558105469, + "learning_rate": 9.503083458955737e-06, + "loss": 0.3077, + "step": 4377 + }, + { + "epoch": 0.059386869234943025, + "grad_norm": 6.670687198638916, + "learning_rate": 9.502946416335482e-06, + "loss": 0.3069, + "step": 4378 + }, + { + "epoch": 0.05940043407487792, + "grad_norm": 6.515244007110596, + "learning_rate": 9.502809373715225e-06, + "loss": 0.4505, + "step": 4379 + }, + { + "epoch": 0.059413998914812804, + "grad_norm": 7.868045806884766, + "learning_rate": 9.50267233109497e-06, + "loss": 0.4925, + "step": 4380 + }, + { + "epoch": 0.05942756375474769, + "grad_norm": 8.21262264251709, + "learning_rate": 9.502535288474717e-06, + "loss": 0.5121, + "step": 4381 + }, + { + "epoch": 0.059441128594682584, + "grad_norm": 7.173988342285156, + "learning_rate": 9.502398245854462e-06, + "loss": 0.4495, + "step": 4382 + }, + { + "epoch": 0.05945469343461747, + "grad_norm": 7.583625316619873, + "learning_rate": 9.502261203234206e-06, + "loss": 0.3327, + "step": 4383 + }, + { + "epoch": 0.05946825827455236, + "grad_norm": 7.773822784423828, + "learning_rate": 9.502124160613951e-06, + "loss": 0.2705, + "step": 4384 + }, + { + "epoch": 0.05948182311448725, + "grad_norm": 7.596400737762451, + "learning_rate": 9.501987117993698e-06, + "loss": 0.5565, + "step": 4385 + }, + { + "epoch": 0.059495387954422135, + "grad_norm": 8.142678260803223, + "learning_rate": 9.501850075373441e-06, + "loss": 0.4813, + "step": 4386 + }, + { + "epoch": 0.05950895279435703, + "grad_norm": 8.730583190917969, + "learning_rate": 9.501713032753187e-06, + "loss": 0.6143, + "step": 4387 + }, + { + "epoch": 0.059522517634291915, + "grad_norm": 8.182642936706543, + "learning_rate": 9.501575990132932e-06, + "loss": 0.3412, + "step": 4388 + }, + { + "epoch": 0.0595360824742268, + "grad_norm": 10.084797859191895, + "learning_rate": 9.501438947512677e-06, + "loss": 0.6663, + "step": 4389 + }, + { + "epoch": 0.059549647314161694, + "grad_norm": 6.39261531829834, + "learning_rate": 9.501301904892422e-06, + "loss": 0.3276, + "step": 4390 + }, + { + "epoch": 0.05956321215409658, + "grad_norm": 8.511316299438477, + "learning_rate": 9.501164862272167e-06, + "loss": 0.6514, + "step": 4391 + }, + { + "epoch": 0.05957677699403147, + "grad_norm": 6.509215831756592, + "learning_rate": 9.501027819651913e-06, + "loss": 0.3918, + "step": 4392 + }, + { + "epoch": 0.05959034183396636, + "grad_norm": 9.425021171569824, + "learning_rate": 9.500890777031658e-06, + "loss": 0.4553, + "step": 4393 + }, + { + "epoch": 0.059603906673901245, + "grad_norm": 8.005511283874512, + "learning_rate": 9.500753734411403e-06, + "loss": 0.4595, + "step": 4394 + }, + { + "epoch": 0.05961747151383614, + "grad_norm": 7.280409812927246, + "learning_rate": 9.500616691791148e-06, + "loss": 0.4166, + "step": 4395 + }, + { + "epoch": 0.059631036353771025, + "grad_norm": 7.719388484954834, + "learning_rate": 9.500479649170893e-06, + "loss": 0.4315, + "step": 4396 + }, + { + "epoch": 0.05964460119370591, + "grad_norm": 8.214010238647461, + "learning_rate": 9.500342606550638e-06, + "loss": 0.4435, + "step": 4397 + }, + { + "epoch": 0.059658166033640804, + "grad_norm": 5.552159786224365, + "learning_rate": 9.500205563930384e-06, + "loss": 0.3293, + "step": 4398 + }, + { + "epoch": 0.05967173087357569, + "grad_norm": 9.130636215209961, + "learning_rate": 9.500068521310129e-06, + "loss": 0.4935, + "step": 4399 + }, + { + "epoch": 0.05968529571351058, + "grad_norm": 6.945065975189209, + "learning_rate": 9.499931478689874e-06, + "loss": 0.3051, + "step": 4400 + }, + { + "epoch": 0.05969886055344547, + "grad_norm": 6.370684623718262, + "learning_rate": 9.499794436069617e-06, + "loss": 0.4329, + "step": 4401 + }, + { + "epoch": 0.059712425393380356, + "grad_norm": 7.494338035583496, + "learning_rate": 9.499657393449363e-06, + "loss": 0.4598, + "step": 4402 + }, + { + "epoch": 0.05972599023331525, + "grad_norm": 6.6067328453063965, + "learning_rate": 9.49952035082911e-06, + "loss": 0.3619, + "step": 4403 + }, + { + "epoch": 0.059739555073250135, + "grad_norm": 5.874236583709717, + "learning_rate": 9.499383308208853e-06, + "loss": 0.385, + "step": 4404 + }, + { + "epoch": 0.05975311991318502, + "grad_norm": 6.101635456085205, + "learning_rate": 9.499246265588598e-06, + "loss": 0.3602, + "step": 4405 + }, + { + "epoch": 0.059766684753119914, + "grad_norm": 7.294631004333496, + "learning_rate": 9.499109222968343e-06, + "loss": 0.4314, + "step": 4406 + }, + { + "epoch": 0.0597802495930548, + "grad_norm": 5.815804481506348, + "learning_rate": 9.49897218034809e-06, + "loss": 0.509, + "step": 4407 + }, + { + "epoch": 0.05979381443298969, + "grad_norm": 6.2766194343566895, + "learning_rate": 9.498835137727834e-06, + "loss": 0.4033, + "step": 4408 + }, + { + "epoch": 0.05980737927292458, + "grad_norm": 6.785043716430664, + "learning_rate": 9.498698095107579e-06, + "loss": 0.4169, + "step": 4409 + }, + { + "epoch": 0.059820944112859466, + "grad_norm": 5.83514928817749, + "learning_rate": 9.498561052487324e-06, + "loss": 0.436, + "step": 4410 + }, + { + "epoch": 0.05983450895279436, + "grad_norm": 7.854640483856201, + "learning_rate": 9.49842400986707e-06, + "loss": 0.4659, + "step": 4411 + }, + { + "epoch": 0.059848073792729245, + "grad_norm": 5.841958045959473, + "learning_rate": 9.498286967246814e-06, + "loss": 0.3175, + "step": 4412 + }, + { + "epoch": 0.05986163863266413, + "grad_norm": 6.4418253898620605, + "learning_rate": 9.49814992462656e-06, + "loss": 0.4343, + "step": 4413 + }, + { + "epoch": 0.059875203472599024, + "grad_norm": 8.383974075317383, + "learning_rate": 9.498012882006305e-06, + "loss": 0.6172, + "step": 4414 + }, + { + "epoch": 0.05988876831253391, + "grad_norm": 9.355863571166992, + "learning_rate": 9.49787583938605e-06, + "loss": 0.496, + "step": 4415 + }, + { + "epoch": 0.059902333152468804, + "grad_norm": 6.346243381500244, + "learning_rate": 9.497738796765795e-06, + "loss": 0.4188, + "step": 4416 + }, + { + "epoch": 0.05991589799240369, + "grad_norm": 7.187465190887451, + "learning_rate": 9.49760175414554e-06, + "loss": 0.3622, + "step": 4417 + }, + { + "epoch": 0.059929462832338576, + "grad_norm": 6.729835033416748, + "learning_rate": 9.497464711525285e-06, + "loss": 0.5092, + "step": 4418 + }, + { + "epoch": 0.05994302767227347, + "grad_norm": 6.690181255340576, + "learning_rate": 9.497327668905029e-06, + "loss": 0.426, + "step": 4419 + }, + { + "epoch": 0.059956592512208355, + "grad_norm": 6.71598482131958, + "learning_rate": 9.497190626284776e-06, + "loss": 0.4286, + "step": 4420 + }, + { + "epoch": 0.05997015735214324, + "grad_norm": 5.737694263458252, + "learning_rate": 9.497053583664521e-06, + "loss": 0.3882, + "step": 4421 + }, + { + "epoch": 0.059983722192078134, + "grad_norm": 6.90806770324707, + "learning_rate": 9.496916541044266e-06, + "loss": 0.5185, + "step": 4422 + }, + { + "epoch": 0.05999728703201302, + "grad_norm": 7.9870991706848145, + "learning_rate": 9.49677949842401e-06, + "loss": 0.4179, + "step": 4423 + }, + { + "epoch": 0.060010851871947914, + "grad_norm": 6.754912853240967, + "learning_rate": 9.496642455803757e-06, + "loss": 0.4117, + "step": 4424 + }, + { + "epoch": 0.0600244167118828, + "grad_norm": 8.031686782836914, + "learning_rate": 9.496505413183502e-06, + "loss": 0.3122, + "step": 4425 + }, + { + "epoch": 0.060037981551817686, + "grad_norm": 7.5551276206970215, + "learning_rate": 9.496368370563245e-06, + "loss": 0.3979, + "step": 4426 + }, + { + "epoch": 0.06005154639175258, + "grad_norm": 6.69401741027832, + "learning_rate": 9.49623132794299e-06, + "loss": 0.4028, + "step": 4427 + }, + { + "epoch": 0.060065111231687465, + "grad_norm": 9.470146179199219, + "learning_rate": 9.496094285322737e-06, + "loss": 0.5205, + "step": 4428 + }, + { + "epoch": 0.06007867607162235, + "grad_norm": 8.343303680419922, + "learning_rate": 9.49595724270248e-06, + "loss": 0.5274, + "step": 4429 + }, + { + "epoch": 0.060092240911557245, + "grad_norm": 7.197397708892822, + "learning_rate": 9.495820200082226e-06, + "loss": 0.3156, + "step": 4430 + }, + { + "epoch": 0.06010580575149213, + "grad_norm": 7.303407192230225, + "learning_rate": 9.495683157461971e-06, + "loss": 0.4687, + "step": 4431 + }, + { + "epoch": 0.060119370591427024, + "grad_norm": 9.379711151123047, + "learning_rate": 9.495546114841716e-06, + "loss": 0.4082, + "step": 4432 + }, + { + "epoch": 0.06013293543136191, + "grad_norm": 8.009048461914062, + "learning_rate": 9.495409072221461e-06, + "loss": 0.4236, + "step": 4433 + }, + { + "epoch": 0.060146500271296796, + "grad_norm": 6.281955242156982, + "learning_rate": 9.495272029601207e-06, + "loss": 0.3222, + "step": 4434 + }, + { + "epoch": 0.06016006511123169, + "grad_norm": 7.751125335693359, + "learning_rate": 9.495134986980952e-06, + "loss": 0.5397, + "step": 4435 + }, + { + "epoch": 0.060173629951166575, + "grad_norm": 9.042591094970703, + "learning_rate": 9.494997944360697e-06, + "loss": 0.3726, + "step": 4436 + }, + { + "epoch": 0.06018719479110147, + "grad_norm": 8.509843826293945, + "learning_rate": 9.494860901740442e-06, + "loss": 0.4211, + "step": 4437 + }, + { + "epoch": 0.060200759631036355, + "grad_norm": 10.473103523254395, + "learning_rate": 9.494723859120187e-06, + "loss": 0.6333, + "step": 4438 + }, + { + "epoch": 0.06021432447097124, + "grad_norm": 8.214759826660156, + "learning_rate": 9.494586816499933e-06, + "loss": 0.4896, + "step": 4439 + }, + { + "epoch": 0.060227889310906134, + "grad_norm": 6.723951816558838, + "learning_rate": 9.494449773879678e-06, + "loss": 0.3592, + "step": 4440 + }, + { + "epoch": 0.06024145415084102, + "grad_norm": 7.125319957733154, + "learning_rate": 9.494312731259423e-06, + "loss": 0.4141, + "step": 4441 + }, + { + "epoch": 0.060255018990775906, + "grad_norm": 6.816410541534424, + "learning_rate": 9.494175688639168e-06, + "loss": 0.5237, + "step": 4442 + }, + { + "epoch": 0.0602685838307108, + "grad_norm": 7.859525203704834, + "learning_rate": 9.494038646018913e-06, + "loss": 0.3622, + "step": 4443 + }, + { + "epoch": 0.060282148670645685, + "grad_norm": 8.563793182373047, + "learning_rate": 9.493901603398657e-06, + "loss": 0.3936, + "step": 4444 + }, + { + "epoch": 0.06029571351058058, + "grad_norm": 7.507215976715088, + "learning_rate": 9.493764560778402e-06, + "loss": 0.5026, + "step": 4445 + }, + { + "epoch": 0.060309278350515465, + "grad_norm": 10.416828155517578, + "learning_rate": 9.493627518158149e-06, + "loss": 0.5836, + "step": 4446 + }, + { + "epoch": 0.06032284319045035, + "grad_norm": 8.964219093322754, + "learning_rate": 9.493490475537894e-06, + "loss": 0.4993, + "step": 4447 + }, + { + "epoch": 0.060336408030385244, + "grad_norm": 8.253134727478027, + "learning_rate": 9.493353432917637e-06, + "loss": 0.3943, + "step": 4448 + }, + { + "epoch": 0.06034997287032013, + "grad_norm": 5.660225868225098, + "learning_rate": 9.493216390297383e-06, + "loss": 0.3597, + "step": 4449 + }, + { + "epoch": 0.060363537710255016, + "grad_norm": 6.006372928619385, + "learning_rate": 9.49307934767713e-06, + "loss": 0.4077, + "step": 4450 + }, + { + "epoch": 0.06037710255018991, + "grad_norm": 6.936911106109619, + "learning_rate": 9.492942305056873e-06, + "loss": 0.4965, + "step": 4451 + }, + { + "epoch": 0.060390667390124796, + "grad_norm": 6.896441459655762, + "learning_rate": 9.492805262436618e-06, + "loss": 0.4793, + "step": 4452 + }, + { + "epoch": 0.06040423223005969, + "grad_norm": 6.964752674102783, + "learning_rate": 9.492668219816363e-06, + "loss": 0.4159, + "step": 4453 + }, + { + "epoch": 0.060417797069994575, + "grad_norm": 8.903522491455078, + "learning_rate": 9.492531177196109e-06, + "loss": 0.6766, + "step": 4454 + }, + { + "epoch": 0.06043136190992946, + "grad_norm": 8.512850761413574, + "learning_rate": 9.492394134575854e-06, + "loss": 0.5505, + "step": 4455 + }, + { + "epoch": 0.060444926749864354, + "grad_norm": 8.30809497833252, + "learning_rate": 9.492257091955599e-06, + "loss": 0.5234, + "step": 4456 + }, + { + "epoch": 0.06045849158979924, + "grad_norm": 9.836653709411621, + "learning_rate": 9.492120049335344e-06, + "loss": 0.7575, + "step": 4457 + }, + { + "epoch": 0.060472056429734126, + "grad_norm": 7.873668193817139, + "learning_rate": 9.49198300671509e-06, + "loss": 0.6294, + "step": 4458 + }, + { + "epoch": 0.06048562126966902, + "grad_norm": 7.412785530090332, + "learning_rate": 9.491845964094834e-06, + "loss": 0.5539, + "step": 4459 + }, + { + "epoch": 0.060499186109603906, + "grad_norm": 9.399016380310059, + "learning_rate": 9.49170892147458e-06, + "loss": 0.8516, + "step": 4460 + }, + { + "epoch": 0.0605127509495388, + "grad_norm": 9.500046730041504, + "learning_rate": 9.491571878854325e-06, + "loss": 0.6147, + "step": 4461 + }, + { + "epoch": 0.060526315789473685, + "grad_norm": 7.092861652374268, + "learning_rate": 9.49143483623407e-06, + "loss": 0.4421, + "step": 4462 + }, + { + "epoch": 0.06053988062940857, + "grad_norm": 9.228564262390137, + "learning_rate": 9.491297793613815e-06, + "loss": 0.4491, + "step": 4463 + }, + { + "epoch": 0.060553445469343464, + "grad_norm": 7.104276657104492, + "learning_rate": 9.49116075099356e-06, + "loss": 0.4314, + "step": 4464 + }, + { + "epoch": 0.06056701030927835, + "grad_norm": 5.724406719207764, + "learning_rate": 9.491023708373306e-06, + "loss": 0.3033, + "step": 4465 + }, + { + "epoch": 0.06058057514921324, + "grad_norm": 6.314778804779053, + "learning_rate": 9.490886665753049e-06, + "loss": 0.3754, + "step": 4466 + }, + { + "epoch": 0.06059413998914813, + "grad_norm": 7.489557266235352, + "learning_rate": 9.490749623132796e-06, + "loss": 0.5794, + "step": 4467 + }, + { + "epoch": 0.060607704829083016, + "grad_norm": 6.66653299331665, + "learning_rate": 9.490612580512541e-06, + "loss": 0.519, + "step": 4468 + }, + { + "epoch": 0.06062126966901791, + "grad_norm": 7.7411789894104, + "learning_rate": 9.490475537892285e-06, + "loss": 0.6299, + "step": 4469 + }, + { + "epoch": 0.060634834508952795, + "grad_norm": 6.423578262329102, + "learning_rate": 9.49033849527203e-06, + "loss": 0.4394, + "step": 4470 + }, + { + "epoch": 0.06064839934888768, + "grad_norm": 8.187426567077637, + "learning_rate": 9.490201452651775e-06, + "loss": 0.5408, + "step": 4471 + }, + { + "epoch": 0.060661964188822574, + "grad_norm": 7.673774719238281, + "learning_rate": 9.49006441003152e-06, + "loss": 0.7135, + "step": 4472 + }, + { + "epoch": 0.06067552902875746, + "grad_norm": 6.538902282714844, + "learning_rate": 9.489927367411265e-06, + "loss": 0.6024, + "step": 4473 + }, + { + "epoch": 0.06068909386869235, + "grad_norm": 5.969961166381836, + "learning_rate": 9.48979032479101e-06, + "loss": 0.5093, + "step": 4474 + }, + { + "epoch": 0.06070265870862724, + "grad_norm": 9.37873649597168, + "learning_rate": 9.489653282170756e-06, + "loss": 0.6399, + "step": 4475 + }, + { + "epoch": 0.060716223548562126, + "grad_norm": 7.255432605743408, + "learning_rate": 9.4895162395505e-06, + "loss": 0.5435, + "step": 4476 + }, + { + "epoch": 0.06072978838849702, + "grad_norm": 10.318537712097168, + "learning_rate": 9.489379196930246e-06, + "loss": 0.7516, + "step": 4477 + }, + { + "epoch": 0.060743353228431905, + "grad_norm": 5.879139423370361, + "learning_rate": 9.489242154309991e-06, + "loss": 0.6016, + "step": 4478 + }, + { + "epoch": 0.06075691806836679, + "grad_norm": 6.523935317993164, + "learning_rate": 9.489105111689736e-06, + "loss": 0.457, + "step": 4479 + }, + { + "epoch": 0.060770482908301685, + "grad_norm": 9.083196640014648, + "learning_rate": 9.488968069069481e-06, + "loss": 0.7203, + "step": 4480 + }, + { + "epoch": 0.06078404774823657, + "grad_norm": 6.447391510009766, + "learning_rate": 9.488831026449227e-06, + "loss": 0.5844, + "step": 4481 + }, + { + "epoch": 0.06079761258817146, + "grad_norm": 5.437337875366211, + "learning_rate": 9.488693983828972e-06, + "loss": 0.5329, + "step": 4482 + }, + { + "epoch": 0.06081117742810635, + "grad_norm": 7.313846111297607, + "learning_rate": 9.488556941208717e-06, + "loss": 0.5657, + "step": 4483 + }, + { + "epoch": 0.060824742268041236, + "grad_norm": 8.371441841125488, + "learning_rate": 9.48841989858846e-06, + "loss": 0.5053, + "step": 4484 + }, + { + "epoch": 0.06083830710797613, + "grad_norm": 10.538721084594727, + "learning_rate": 9.488282855968207e-06, + "loss": 0.7031, + "step": 4485 + }, + { + "epoch": 0.060851871947911015, + "grad_norm": 7.72501277923584, + "learning_rate": 9.488145813347953e-06, + "loss": 0.5843, + "step": 4486 + }, + { + "epoch": 0.0608654367878459, + "grad_norm": 7.8343658447265625, + "learning_rate": 9.488008770727696e-06, + "loss": 0.528, + "step": 4487 + }, + { + "epoch": 0.060879001627780795, + "grad_norm": 6.0506086349487305, + "learning_rate": 9.487871728107441e-06, + "loss": 0.3049, + "step": 4488 + }, + { + "epoch": 0.06089256646771568, + "grad_norm": 6.051991939544678, + "learning_rate": 9.487734685487188e-06, + "loss": 0.3489, + "step": 4489 + }, + { + "epoch": 0.06090613130765057, + "grad_norm": 7.702589988708496, + "learning_rate": 9.487597642866933e-06, + "loss": 0.4608, + "step": 4490 + }, + { + "epoch": 0.06091969614758546, + "grad_norm": 8.665907859802246, + "learning_rate": 9.487460600246677e-06, + "loss": 0.6644, + "step": 4491 + }, + { + "epoch": 0.060933260987520346, + "grad_norm": 11.13047981262207, + "learning_rate": 9.487323557626422e-06, + "loss": 0.7117, + "step": 4492 + }, + { + "epoch": 0.06094682582745524, + "grad_norm": 8.9998140335083, + "learning_rate": 9.487186515006169e-06, + "loss": 0.5154, + "step": 4493 + }, + { + "epoch": 0.060960390667390126, + "grad_norm": 7.81365966796875, + "learning_rate": 9.487049472385912e-06, + "loss": 0.4949, + "step": 4494 + }, + { + "epoch": 0.06097395550732501, + "grad_norm": 5.325096607208252, + "learning_rate": 9.486912429765657e-06, + "loss": 0.3743, + "step": 4495 + }, + { + "epoch": 0.060987520347259905, + "grad_norm": 6.310175895690918, + "learning_rate": 9.486775387145403e-06, + "loss": 0.3963, + "step": 4496 + }, + { + "epoch": 0.06100108518719479, + "grad_norm": 5.642719268798828, + "learning_rate": 9.486638344525148e-06, + "loss": 0.4196, + "step": 4497 + }, + { + "epoch": 0.06101465002712968, + "grad_norm": 8.767940521240234, + "learning_rate": 9.486501301904893e-06, + "loss": 0.8928, + "step": 4498 + }, + { + "epoch": 0.06102821486706457, + "grad_norm": 5.511926174163818, + "learning_rate": 9.486364259284638e-06, + "loss": 0.589, + "step": 4499 + }, + { + "epoch": 0.061041779706999456, + "grad_norm": 6.016169548034668, + "learning_rate": 9.486227216664383e-06, + "loss": 0.4195, + "step": 4500 + }, + { + "epoch": 0.06105534454693435, + "grad_norm": 5.542724609375, + "learning_rate": 9.486090174044129e-06, + "loss": 0.2902, + "step": 4501 + }, + { + "epoch": 0.061068909386869236, + "grad_norm": 6.492000579833984, + "learning_rate": 9.485953131423874e-06, + "loss": 0.3672, + "step": 4502 + }, + { + "epoch": 0.06108247422680412, + "grad_norm": 4.9431681632995605, + "learning_rate": 9.485816088803619e-06, + "loss": 0.3597, + "step": 4503 + }, + { + "epoch": 0.061096039066739015, + "grad_norm": 5.572147369384766, + "learning_rate": 9.485679046183364e-06, + "loss": 0.3536, + "step": 4504 + }, + { + "epoch": 0.0611096039066739, + "grad_norm": 5.571905136108398, + "learning_rate": 9.48554200356311e-06, + "loss": 0.3518, + "step": 4505 + }, + { + "epoch": 0.06112316874660879, + "grad_norm": 6.227077007293701, + "learning_rate": 9.485404960942854e-06, + "loss": 0.4521, + "step": 4506 + }, + { + "epoch": 0.06113673358654368, + "grad_norm": 7.696115493774414, + "learning_rate": 9.4852679183226e-06, + "loss": 0.4449, + "step": 4507 + }, + { + "epoch": 0.061150298426478567, + "grad_norm": 8.8063383102417, + "learning_rate": 9.485130875702345e-06, + "loss": 0.538, + "step": 4508 + }, + { + "epoch": 0.06116386326641346, + "grad_norm": 7.627622604370117, + "learning_rate": 9.484993833082088e-06, + "loss": 0.5001, + "step": 4509 + }, + { + "epoch": 0.061177428106348346, + "grad_norm": 7.5650634765625, + "learning_rate": 9.484856790461835e-06, + "loss": 0.4029, + "step": 4510 + }, + { + "epoch": 0.06119099294628323, + "grad_norm": 6.479619979858398, + "learning_rate": 9.48471974784158e-06, + "loss": 0.4588, + "step": 4511 + }, + { + "epoch": 0.061204557786218125, + "grad_norm": 9.150843620300293, + "learning_rate": 9.484582705221324e-06, + "loss": 0.5002, + "step": 4512 + }, + { + "epoch": 0.06121812262615301, + "grad_norm": 7.878711223602295, + "learning_rate": 9.484445662601069e-06, + "loss": 0.6057, + "step": 4513 + }, + { + "epoch": 0.0612316874660879, + "grad_norm": 6.153012752532959, + "learning_rate": 9.484308619980814e-06, + "loss": 0.46, + "step": 4514 + }, + { + "epoch": 0.06124525230602279, + "grad_norm": 6.4063310623168945, + "learning_rate": 9.484171577360561e-06, + "loss": 0.3734, + "step": 4515 + }, + { + "epoch": 0.06125881714595768, + "grad_norm": 6.408999919891357, + "learning_rate": 9.484034534740305e-06, + "loss": 0.4115, + "step": 4516 + }, + { + "epoch": 0.06127238198589257, + "grad_norm": 7.0720696449279785, + "learning_rate": 9.48389749212005e-06, + "loss": 0.5206, + "step": 4517 + }, + { + "epoch": 0.061285946825827456, + "grad_norm": 5.911882400512695, + "learning_rate": 9.483760449499795e-06, + "loss": 0.3498, + "step": 4518 + }, + { + "epoch": 0.06129951166576234, + "grad_norm": 7.035487651824951, + "learning_rate": 9.48362340687954e-06, + "loss": 0.4366, + "step": 4519 + }, + { + "epoch": 0.061313076505697235, + "grad_norm": 7.496477127075195, + "learning_rate": 9.483486364259285e-06, + "loss": 0.6292, + "step": 4520 + }, + { + "epoch": 0.06132664134563212, + "grad_norm": 7.4465813636779785, + "learning_rate": 9.48334932163903e-06, + "loss": 0.7226, + "step": 4521 + }, + { + "epoch": 0.06134020618556701, + "grad_norm": 8.036091804504395, + "learning_rate": 9.483212279018776e-06, + "loss": 0.5393, + "step": 4522 + }, + { + "epoch": 0.0613537710255019, + "grad_norm": 7.74165678024292, + "learning_rate": 9.48307523639852e-06, + "loss": 0.5553, + "step": 4523 + }, + { + "epoch": 0.06136733586543679, + "grad_norm": 9.381231307983398, + "learning_rate": 9.482938193778266e-06, + "loss": 0.6002, + "step": 4524 + }, + { + "epoch": 0.06138090070537168, + "grad_norm": 4.555928707122803, + "learning_rate": 9.482801151158011e-06, + "loss": 0.4052, + "step": 4525 + }, + { + "epoch": 0.061394465545306566, + "grad_norm": 6.310595989227295, + "learning_rate": 9.482664108537756e-06, + "loss": 0.623, + "step": 4526 + }, + { + "epoch": 0.06140803038524145, + "grad_norm": 8.35554027557373, + "learning_rate": 9.4825270659175e-06, + "loss": 0.5584, + "step": 4527 + }, + { + "epoch": 0.061421595225176345, + "grad_norm": 7.265346527099609, + "learning_rate": 9.482390023297247e-06, + "loss": 0.4451, + "step": 4528 + }, + { + "epoch": 0.06143516006511123, + "grad_norm": 6.765527248382568, + "learning_rate": 9.482252980676992e-06, + "loss": 0.4661, + "step": 4529 + }, + { + "epoch": 0.06144872490504612, + "grad_norm": 9.572149276733398, + "learning_rate": 9.482115938056737e-06, + "loss": 0.5986, + "step": 4530 + }, + { + "epoch": 0.06146228974498101, + "grad_norm": 9.109292030334473, + "learning_rate": 9.48197889543648e-06, + "loss": 0.5392, + "step": 4531 + }, + { + "epoch": 0.0614758545849159, + "grad_norm": 7.486854076385498, + "learning_rate": 9.481841852816227e-06, + "loss": 0.6541, + "step": 4532 + }, + { + "epoch": 0.06148941942485079, + "grad_norm": 6.924365520477295, + "learning_rate": 9.481704810195973e-06, + "loss": 0.4384, + "step": 4533 + }, + { + "epoch": 0.061502984264785676, + "grad_norm": 6.249569892883301, + "learning_rate": 9.481567767575716e-06, + "loss": 0.3378, + "step": 4534 + }, + { + "epoch": 0.06151654910472056, + "grad_norm": 8.095330238342285, + "learning_rate": 9.481430724955461e-06, + "loss": 0.5036, + "step": 4535 + }, + { + "epoch": 0.061530113944655455, + "grad_norm": 9.356074333190918, + "learning_rate": 9.481293682335208e-06, + "loss": 0.552, + "step": 4536 + }, + { + "epoch": 0.06154367878459034, + "grad_norm": 8.476466178894043, + "learning_rate": 9.481156639714952e-06, + "loss": 0.6791, + "step": 4537 + }, + { + "epoch": 0.06155724362452523, + "grad_norm": 7.154664993286133, + "learning_rate": 9.481019597094697e-06, + "loss": 0.4817, + "step": 4538 + }, + { + "epoch": 0.06157080846446012, + "grad_norm": 7.894448757171631, + "learning_rate": 9.480882554474442e-06, + "loss": 0.5176, + "step": 4539 + }, + { + "epoch": 0.06158437330439501, + "grad_norm": 8.374079704284668, + "learning_rate": 9.480745511854187e-06, + "loss": 0.4834, + "step": 4540 + }, + { + "epoch": 0.0615979381443299, + "grad_norm": 6.333402156829834, + "learning_rate": 9.480608469233932e-06, + "loss": 0.4673, + "step": 4541 + }, + { + "epoch": 0.061611502984264786, + "grad_norm": 8.98908805847168, + "learning_rate": 9.480471426613678e-06, + "loss": 0.6533, + "step": 4542 + }, + { + "epoch": 0.06162506782419967, + "grad_norm": 5.612579345703125, + "learning_rate": 9.480334383993423e-06, + "loss": 0.2821, + "step": 4543 + }, + { + "epoch": 0.061638632664134566, + "grad_norm": 6.133084774017334, + "learning_rate": 9.480197341373168e-06, + "loss": 0.4413, + "step": 4544 + }, + { + "epoch": 0.06165219750406945, + "grad_norm": 9.591360092163086, + "learning_rate": 9.480060298752913e-06, + "loss": 0.5363, + "step": 4545 + }, + { + "epoch": 0.06166576234400434, + "grad_norm": 6.806924343109131, + "learning_rate": 9.479923256132658e-06, + "loss": 0.3883, + "step": 4546 + }, + { + "epoch": 0.06167932718393923, + "grad_norm": 7.1730546951293945, + "learning_rate": 9.479786213512403e-06, + "loss": 0.3916, + "step": 4547 + }, + { + "epoch": 0.06169289202387412, + "grad_norm": 6.8247199058532715, + "learning_rate": 9.479649170892149e-06, + "loss": 0.5563, + "step": 4548 + }, + { + "epoch": 0.06170645686380901, + "grad_norm": 7.367185592651367, + "learning_rate": 9.479512128271894e-06, + "loss": 0.4814, + "step": 4549 + }, + { + "epoch": 0.061720021703743896, + "grad_norm": 7.490506172180176, + "learning_rate": 9.479375085651639e-06, + "loss": 0.3571, + "step": 4550 + }, + { + "epoch": 0.06173358654367878, + "grad_norm": 5.624199867248535, + "learning_rate": 9.479238043031384e-06, + "loss": 0.3813, + "step": 4551 + }, + { + "epoch": 0.061747151383613676, + "grad_norm": 10.508989334106445, + "learning_rate": 9.479101000411128e-06, + "loss": 0.6798, + "step": 4552 + }, + { + "epoch": 0.06176071622354856, + "grad_norm": 7.556203842163086, + "learning_rate": 9.478963957790873e-06, + "loss": 0.3443, + "step": 4553 + }, + { + "epoch": 0.06177428106348345, + "grad_norm": 5.367369651794434, + "learning_rate": 9.47882691517062e-06, + "loss": 0.3843, + "step": 4554 + }, + { + "epoch": 0.06178784590341834, + "grad_norm": 6.122195243835449, + "learning_rate": 9.478689872550365e-06, + "loss": 0.4674, + "step": 4555 + }, + { + "epoch": 0.06180141074335323, + "grad_norm": 6.965220928192139, + "learning_rate": 9.478552829930108e-06, + "loss": 0.5736, + "step": 4556 + }, + { + "epoch": 0.06181497558328812, + "grad_norm": 7.387735843658447, + "learning_rate": 9.478415787309853e-06, + "loss": 0.426, + "step": 4557 + }, + { + "epoch": 0.061828540423223007, + "grad_norm": 5.978336334228516, + "learning_rate": 9.4782787446896e-06, + "loss": 0.3466, + "step": 4558 + }, + { + "epoch": 0.06184210526315789, + "grad_norm": 8.358417510986328, + "learning_rate": 9.478141702069344e-06, + "loss": 0.5913, + "step": 4559 + }, + { + "epoch": 0.061855670103092786, + "grad_norm": 7.294012069702148, + "learning_rate": 9.478004659449089e-06, + "loss": 0.3471, + "step": 4560 + }, + { + "epoch": 0.06186923494302767, + "grad_norm": 8.477906227111816, + "learning_rate": 9.477867616828834e-06, + "loss": 0.613, + "step": 4561 + }, + { + "epoch": 0.06188279978296256, + "grad_norm": 7.650157928466797, + "learning_rate": 9.47773057420858e-06, + "loss": 0.421, + "step": 4562 + }, + { + "epoch": 0.06189636462289745, + "grad_norm": 6.88037633895874, + "learning_rate": 9.477593531588325e-06, + "loss": 0.5007, + "step": 4563 + }, + { + "epoch": 0.06190992946283234, + "grad_norm": 5.458062171936035, + "learning_rate": 9.47745648896807e-06, + "loss": 0.3362, + "step": 4564 + }, + { + "epoch": 0.06192349430276723, + "grad_norm": 8.52696418762207, + "learning_rate": 9.477319446347815e-06, + "loss": 0.5521, + "step": 4565 + }, + { + "epoch": 0.06193705914270212, + "grad_norm": 6.564294338226318, + "learning_rate": 9.47718240372756e-06, + "loss": 0.4709, + "step": 4566 + }, + { + "epoch": 0.061950623982637, + "grad_norm": 7.3524580001831055, + "learning_rate": 9.477045361107305e-06, + "loss": 0.3965, + "step": 4567 + }, + { + "epoch": 0.061964188822571896, + "grad_norm": 5.915963172912598, + "learning_rate": 9.47690831848705e-06, + "loss": 0.4321, + "step": 4568 + }, + { + "epoch": 0.06197775366250678, + "grad_norm": 7.4592204093933105, + "learning_rate": 9.476771275866796e-06, + "loss": 0.3789, + "step": 4569 + }, + { + "epoch": 0.06199131850244167, + "grad_norm": 5.3918232917785645, + "learning_rate": 9.47663423324654e-06, + "loss": 0.4845, + "step": 4570 + }, + { + "epoch": 0.06200488334237656, + "grad_norm": 7.52297306060791, + "learning_rate": 9.476497190626286e-06, + "loss": 0.5267, + "step": 4571 + }, + { + "epoch": 0.06201844818231145, + "grad_norm": 6.863762855529785, + "learning_rate": 9.476360148006031e-06, + "loss": 0.4432, + "step": 4572 + }, + { + "epoch": 0.06203201302224634, + "grad_norm": 8.048553466796875, + "learning_rate": 9.476223105385776e-06, + "loss": 0.5164, + "step": 4573 + }, + { + "epoch": 0.06204557786218123, + "grad_norm": 6.54671049118042, + "learning_rate": 9.47608606276552e-06, + "loss": 0.4121, + "step": 4574 + }, + { + "epoch": 0.06205914270211611, + "grad_norm": 8.6657133102417, + "learning_rate": 9.475949020145267e-06, + "loss": 0.4646, + "step": 4575 + }, + { + "epoch": 0.062072707542051006, + "grad_norm": 5.57415771484375, + "learning_rate": 9.475811977525012e-06, + "loss": 0.4301, + "step": 4576 + }, + { + "epoch": 0.06208627238198589, + "grad_norm": 6.392549514770508, + "learning_rate": 9.475674934904755e-06, + "loss": 0.6108, + "step": 4577 + }, + { + "epoch": 0.06209983722192078, + "grad_norm": 6.8861308097839355, + "learning_rate": 9.4755378922845e-06, + "loss": 0.447, + "step": 4578 + }, + { + "epoch": 0.06211340206185567, + "grad_norm": 5.661059379577637, + "learning_rate": 9.475400849664247e-06, + "loss": 0.4064, + "step": 4579 + }, + { + "epoch": 0.06212696690179056, + "grad_norm": 7.556037425994873, + "learning_rate": 9.475263807043991e-06, + "loss": 0.6353, + "step": 4580 + }, + { + "epoch": 0.06214053174172545, + "grad_norm": 8.727033615112305, + "learning_rate": 9.475126764423736e-06, + "loss": 0.5662, + "step": 4581 + }, + { + "epoch": 0.06215409658166034, + "grad_norm": 6.477692604064941, + "learning_rate": 9.474989721803481e-06, + "loss": 0.5505, + "step": 4582 + }, + { + "epoch": 0.06216766142159522, + "grad_norm": 10.0453462600708, + "learning_rate": 9.474852679183226e-06, + "loss": 0.6422, + "step": 4583 + }, + { + "epoch": 0.062181226261530116, + "grad_norm": 5.594611167907715, + "learning_rate": 9.474715636562972e-06, + "loss": 0.3492, + "step": 4584 + }, + { + "epoch": 0.062194791101465, + "grad_norm": 6.117213249206543, + "learning_rate": 9.474578593942717e-06, + "loss": 0.4665, + "step": 4585 + }, + { + "epoch": 0.06220835594139989, + "grad_norm": 5.296921253204346, + "learning_rate": 9.474441551322462e-06, + "loss": 0.3757, + "step": 4586 + }, + { + "epoch": 0.06222192078133478, + "grad_norm": 7.115097999572754, + "learning_rate": 9.474304508702207e-06, + "loss": 0.4111, + "step": 4587 + }, + { + "epoch": 0.06223548562126967, + "grad_norm": 6.4802398681640625, + "learning_rate": 9.474167466081952e-06, + "loss": 0.5567, + "step": 4588 + }, + { + "epoch": 0.06224905046120456, + "grad_norm": 6.630932807922363, + "learning_rate": 9.474030423461698e-06, + "loss": 0.6257, + "step": 4589 + }, + { + "epoch": 0.06226261530113945, + "grad_norm": 7.218266010284424, + "learning_rate": 9.473893380841443e-06, + "loss": 0.5485, + "step": 4590 + }, + { + "epoch": 0.06227618014107433, + "grad_norm": 8.605446815490723, + "learning_rate": 9.473756338221188e-06, + "loss": 0.5088, + "step": 4591 + }, + { + "epoch": 0.062289744981009226, + "grad_norm": 6.044310092926025, + "learning_rate": 9.473619295600933e-06, + "loss": 0.4447, + "step": 4592 + }, + { + "epoch": 0.06230330982094411, + "grad_norm": 5.488541603088379, + "learning_rate": 9.473482252980678e-06, + "loss": 0.4453, + "step": 4593 + }, + { + "epoch": 0.062316874660879, + "grad_norm": 5.860615253448486, + "learning_rate": 9.473345210360423e-06, + "loss": 0.3459, + "step": 4594 + }, + { + "epoch": 0.06233043950081389, + "grad_norm": 6.220470905303955, + "learning_rate": 9.473208167740167e-06, + "loss": 0.4925, + "step": 4595 + }, + { + "epoch": 0.06234400434074878, + "grad_norm": 7.89722204208374, + "learning_rate": 9.473071125119912e-06, + "loss": 0.4341, + "step": 4596 + }, + { + "epoch": 0.06235756918068367, + "grad_norm": 7.388474464416504, + "learning_rate": 9.472934082499659e-06, + "loss": 0.5312, + "step": 4597 + }, + { + "epoch": 0.06237113402061856, + "grad_norm": 6.69619607925415, + "learning_rate": 9.472797039879404e-06, + "loss": 0.4506, + "step": 4598 + }, + { + "epoch": 0.06238469886055344, + "grad_norm": 8.746556282043457, + "learning_rate": 9.472659997259148e-06, + "loss": 0.4725, + "step": 4599 + }, + { + "epoch": 0.062398263700488336, + "grad_norm": 7.7122673988342285, + "learning_rate": 9.472522954638893e-06, + "loss": 0.5254, + "step": 4600 + }, + { + "epoch": 0.06241182854042322, + "grad_norm": 7.498342037200928, + "learning_rate": 9.47238591201864e-06, + "loss": 0.4438, + "step": 4601 + }, + { + "epoch": 0.06242539338035811, + "grad_norm": 6.808043956756592, + "learning_rate": 9.472248869398383e-06, + "loss": 0.4356, + "step": 4602 + }, + { + "epoch": 0.062438958220293, + "grad_norm": 6.485644340515137, + "learning_rate": 9.472111826778128e-06, + "loss": 0.5665, + "step": 4603 + }, + { + "epoch": 0.06245252306022789, + "grad_norm": 8.604154586791992, + "learning_rate": 9.471974784157874e-06, + "loss": 0.6638, + "step": 4604 + }, + { + "epoch": 0.06246608790016278, + "grad_norm": 7.15622615814209, + "learning_rate": 9.471837741537619e-06, + "loss": 0.4494, + "step": 4605 + }, + { + "epoch": 0.06247965274009767, + "grad_norm": 5.583507061004639, + "learning_rate": 9.471700698917364e-06, + "loss": 0.4418, + "step": 4606 + }, + { + "epoch": 0.06249321758003255, + "grad_norm": 6.925309658050537, + "learning_rate": 9.471563656297109e-06, + "loss": 0.4999, + "step": 4607 + }, + { + "epoch": 0.06250678241996745, + "grad_norm": 6.870467662811279, + "learning_rate": 9.471426613676854e-06, + "loss": 0.6686, + "step": 4608 + }, + { + "epoch": 0.06252034725990234, + "grad_norm": 8.48794174194336, + "learning_rate": 9.4712895710566e-06, + "loss": 0.6795, + "step": 4609 + }, + { + "epoch": 0.06253391209983722, + "grad_norm": 4.166110038757324, + "learning_rate": 9.471152528436345e-06, + "loss": 0.3632, + "step": 4610 + }, + { + "epoch": 0.06254747693977211, + "grad_norm": 7.131482124328613, + "learning_rate": 9.47101548581609e-06, + "loss": 0.6155, + "step": 4611 + }, + { + "epoch": 0.062561041779707, + "grad_norm": 4.919163227081299, + "learning_rate": 9.470878443195835e-06, + "loss": 0.3775, + "step": 4612 + }, + { + "epoch": 0.06257460661964188, + "grad_norm": 5.341225624084473, + "learning_rate": 9.47074140057558e-06, + "loss": 0.3566, + "step": 4613 + }, + { + "epoch": 0.06258817145957678, + "grad_norm": 7.775325775146484, + "learning_rate": 9.470604357955325e-06, + "loss": 0.6012, + "step": 4614 + }, + { + "epoch": 0.06260173629951167, + "grad_norm": 5.725704669952393, + "learning_rate": 9.47046731533507e-06, + "loss": 0.3809, + "step": 4615 + }, + { + "epoch": 0.06261530113944655, + "grad_norm": 6.180842876434326, + "learning_rate": 9.470330272714816e-06, + "loss": 0.3368, + "step": 4616 + }, + { + "epoch": 0.06262886597938144, + "grad_norm": 5.734785556793213, + "learning_rate": 9.470193230094559e-06, + "loss": 0.4764, + "step": 4617 + }, + { + "epoch": 0.06264243081931634, + "grad_norm": 6.0747528076171875, + "learning_rate": 9.470056187474306e-06, + "loss": 0.5158, + "step": 4618 + }, + { + "epoch": 0.06265599565925122, + "grad_norm": 6.747281551361084, + "learning_rate": 9.469919144854051e-06, + "loss": 0.543, + "step": 4619 + }, + { + "epoch": 0.06266956049918611, + "grad_norm": 7.180576324462891, + "learning_rate": 9.469782102233795e-06, + "loss": 0.4211, + "step": 4620 + }, + { + "epoch": 0.062683125339121, + "grad_norm": 7.709544658660889, + "learning_rate": 9.46964505961354e-06, + "loss": 0.588, + "step": 4621 + }, + { + "epoch": 0.06269669017905588, + "grad_norm": 6.750279903411865, + "learning_rate": 9.469508016993285e-06, + "loss": 0.6244, + "step": 4622 + }, + { + "epoch": 0.06271025501899077, + "grad_norm": 7.5551581382751465, + "learning_rate": 9.469370974373032e-06, + "loss": 0.4485, + "step": 4623 + }, + { + "epoch": 0.06272381985892567, + "grad_norm": 5.993066310882568, + "learning_rate": 9.469233931752775e-06, + "loss": 0.359, + "step": 4624 + }, + { + "epoch": 0.06273738469886056, + "grad_norm": 5.305106163024902, + "learning_rate": 9.46909688913252e-06, + "loss": 0.3307, + "step": 4625 + }, + { + "epoch": 0.06275094953879544, + "grad_norm": 6.406003475189209, + "learning_rate": 9.468959846512266e-06, + "loss": 0.5194, + "step": 4626 + }, + { + "epoch": 0.06276451437873033, + "grad_norm": 5.816180229187012, + "learning_rate": 9.468822803892011e-06, + "loss": 0.4725, + "step": 4627 + }, + { + "epoch": 0.06277807921866523, + "grad_norm": 5.844058513641357, + "learning_rate": 9.468685761271756e-06, + "loss": 0.4568, + "step": 4628 + }, + { + "epoch": 0.0627916440586001, + "grad_norm": 8.83485221862793, + "learning_rate": 9.468548718651501e-06, + "loss": 0.5565, + "step": 4629 + }, + { + "epoch": 0.062805208898535, + "grad_norm": 6.531160831451416, + "learning_rate": 9.468411676031246e-06, + "loss": 0.5795, + "step": 4630 + }, + { + "epoch": 0.06281877373846989, + "grad_norm": 7.367312431335449, + "learning_rate": 9.468274633410992e-06, + "loss": 0.5913, + "step": 4631 + }, + { + "epoch": 0.06283233857840477, + "grad_norm": 6.570948123931885, + "learning_rate": 9.468137590790737e-06, + "loss": 0.5492, + "step": 4632 + }, + { + "epoch": 0.06284590341833966, + "grad_norm": 7.2987751960754395, + "learning_rate": 9.468000548170482e-06, + "loss": 0.5241, + "step": 4633 + }, + { + "epoch": 0.06285946825827456, + "grad_norm": 5.830830097198486, + "learning_rate": 9.467863505550227e-06, + "loss": 0.5137, + "step": 4634 + }, + { + "epoch": 0.06287303309820944, + "grad_norm": 6.69249963760376, + "learning_rate": 9.467726462929972e-06, + "loss": 0.4821, + "step": 4635 + }, + { + "epoch": 0.06288659793814433, + "grad_norm": 4.964227199554443, + "learning_rate": 9.467589420309718e-06, + "loss": 0.3167, + "step": 4636 + }, + { + "epoch": 0.06290016277807922, + "grad_norm": 8.6300687789917, + "learning_rate": 9.467452377689463e-06, + "loss": 0.5386, + "step": 4637 + }, + { + "epoch": 0.0629137276180141, + "grad_norm": 7.205904006958008, + "learning_rate": 9.467315335069208e-06, + "loss": 0.4606, + "step": 4638 + }, + { + "epoch": 0.062927292457949, + "grad_norm": 6.563708782196045, + "learning_rate": 9.467178292448951e-06, + "loss": 0.5257, + "step": 4639 + }, + { + "epoch": 0.06294085729788389, + "grad_norm": 7.573925495147705, + "learning_rate": 9.467041249828698e-06, + "loss": 0.6363, + "step": 4640 + }, + { + "epoch": 0.06295442213781878, + "grad_norm": 7.4121413230896, + "learning_rate": 9.466904207208443e-06, + "loss": 0.5615, + "step": 4641 + }, + { + "epoch": 0.06296798697775366, + "grad_norm": 7.835549831390381, + "learning_rate": 9.466767164588187e-06, + "loss": 0.4246, + "step": 4642 + }, + { + "epoch": 0.06298155181768855, + "grad_norm": 7.209582328796387, + "learning_rate": 9.466630121967932e-06, + "loss": 0.4784, + "step": 4643 + }, + { + "epoch": 0.06299511665762345, + "grad_norm": 5.896394729614258, + "learning_rate": 9.466493079347679e-06, + "loss": 0.587, + "step": 4644 + }, + { + "epoch": 0.06300868149755832, + "grad_norm": 5.918501853942871, + "learning_rate": 9.466356036727422e-06, + "loss": 0.3964, + "step": 4645 + }, + { + "epoch": 0.06302224633749322, + "grad_norm": 5.401607990264893, + "learning_rate": 9.466218994107168e-06, + "loss": 0.28, + "step": 4646 + }, + { + "epoch": 0.06303581117742811, + "grad_norm": 6.370698928833008, + "learning_rate": 9.466081951486913e-06, + "loss": 0.5226, + "step": 4647 + }, + { + "epoch": 0.06304937601736299, + "grad_norm": 6.7199931144714355, + "learning_rate": 9.46594490886666e-06, + "loss": 0.365, + "step": 4648 + }, + { + "epoch": 0.06306294085729788, + "grad_norm": 6.854910373687744, + "learning_rate": 9.465807866246403e-06, + "loss": 0.4036, + "step": 4649 + }, + { + "epoch": 0.06307650569723278, + "grad_norm": 9.008176803588867, + "learning_rate": 9.465670823626148e-06, + "loss": 0.7243, + "step": 4650 + }, + { + "epoch": 0.06309007053716766, + "grad_norm": 8.869474411010742, + "learning_rate": 9.465533781005894e-06, + "loss": 0.5949, + "step": 4651 + }, + { + "epoch": 0.06310363537710255, + "grad_norm": 6.263670921325684, + "learning_rate": 9.465396738385639e-06, + "loss": 0.4491, + "step": 4652 + }, + { + "epoch": 0.06311720021703744, + "grad_norm": 6.57522439956665, + "learning_rate": 9.465259695765384e-06, + "loss": 0.4014, + "step": 4653 + }, + { + "epoch": 0.06313076505697232, + "grad_norm": 7.466639518737793, + "learning_rate": 9.465122653145129e-06, + "loss": 0.5467, + "step": 4654 + }, + { + "epoch": 0.06314432989690721, + "grad_norm": 6.350895881652832, + "learning_rate": 9.464985610524874e-06, + "loss": 0.4841, + "step": 4655 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 6.808922290802002, + "learning_rate": 9.46484856790462e-06, + "loss": 0.5007, + "step": 4656 + }, + { + "epoch": 0.063171459576777, + "grad_norm": 6.720089912414551, + "learning_rate": 9.464711525284365e-06, + "loss": 0.3917, + "step": 4657 + }, + { + "epoch": 0.06318502441671188, + "grad_norm": 8.004690170288086, + "learning_rate": 9.46457448266411e-06, + "loss": 0.4318, + "step": 4658 + }, + { + "epoch": 0.06319858925664677, + "grad_norm": 5.404880523681641, + "learning_rate": 9.464437440043855e-06, + "loss": 0.3531, + "step": 4659 + }, + { + "epoch": 0.06321215409658167, + "grad_norm": 9.199190139770508, + "learning_rate": 9.464300397423598e-06, + "loss": 0.4634, + "step": 4660 + }, + { + "epoch": 0.06322571893651655, + "grad_norm": 6.673431396484375, + "learning_rate": 9.464163354803345e-06, + "loss": 0.4084, + "step": 4661 + }, + { + "epoch": 0.06323928377645144, + "grad_norm": 8.704068183898926, + "learning_rate": 9.46402631218309e-06, + "loss": 0.5797, + "step": 4662 + }, + { + "epoch": 0.06325284861638633, + "grad_norm": 6.691535472869873, + "learning_rate": 9.463889269562834e-06, + "loss": 0.3809, + "step": 4663 + }, + { + "epoch": 0.06326641345632121, + "grad_norm": 7.6262335777282715, + "learning_rate": 9.46375222694258e-06, + "loss": 0.5018, + "step": 4664 + }, + { + "epoch": 0.0632799782962561, + "grad_norm": 6.056190013885498, + "learning_rate": 9.463615184322324e-06, + "loss": 0.3975, + "step": 4665 + }, + { + "epoch": 0.063293543136191, + "grad_norm": 7.264991760253906, + "learning_rate": 9.463478141702071e-06, + "loss": 0.4687, + "step": 4666 + }, + { + "epoch": 0.06330710797612588, + "grad_norm": 8.868270874023438, + "learning_rate": 9.463341099081815e-06, + "loss": 0.5407, + "step": 4667 + }, + { + "epoch": 0.06332067281606077, + "grad_norm": 7.77863883972168, + "learning_rate": 9.46320405646156e-06, + "loss": 0.4224, + "step": 4668 + }, + { + "epoch": 0.06333423765599566, + "grad_norm": 7.1932783126831055, + "learning_rate": 9.463067013841305e-06, + "loss": 0.4011, + "step": 4669 + }, + { + "epoch": 0.06334780249593054, + "grad_norm": 9.048702239990234, + "learning_rate": 9.46292997122105e-06, + "loss": 0.6386, + "step": 4670 + }, + { + "epoch": 0.06336136733586543, + "grad_norm": 7.2237868309021, + "learning_rate": 9.462792928600795e-06, + "loss": 0.4257, + "step": 4671 + }, + { + "epoch": 0.06337493217580033, + "grad_norm": 7.436929225921631, + "learning_rate": 9.46265588598054e-06, + "loss": 0.3499, + "step": 4672 + }, + { + "epoch": 0.06338849701573522, + "grad_norm": 7.9615478515625, + "learning_rate": 9.462518843360286e-06, + "loss": 0.4147, + "step": 4673 + }, + { + "epoch": 0.0634020618556701, + "grad_norm": 8.890054702758789, + "learning_rate": 9.462381800740031e-06, + "loss": 0.5706, + "step": 4674 + }, + { + "epoch": 0.06341562669560499, + "grad_norm": 6.638696193695068, + "learning_rate": 9.462244758119776e-06, + "loss": 0.3208, + "step": 4675 + }, + { + "epoch": 0.06342919153553989, + "grad_norm": 7.171175479888916, + "learning_rate": 9.462107715499521e-06, + "loss": 0.4476, + "step": 4676 + }, + { + "epoch": 0.06344275637547477, + "grad_norm": 6.956133842468262, + "learning_rate": 9.461970672879266e-06, + "loss": 0.5798, + "step": 4677 + }, + { + "epoch": 0.06345632121540966, + "grad_norm": 5.848232746124268, + "learning_rate": 9.46183363025901e-06, + "loss": 0.3643, + "step": 4678 + }, + { + "epoch": 0.06346988605534455, + "grad_norm": 7.391368865966797, + "learning_rate": 9.461696587638757e-06, + "loss": 0.4551, + "step": 4679 + }, + { + "epoch": 0.06348345089527943, + "grad_norm": 7.05242395401001, + "learning_rate": 9.461559545018502e-06, + "loss": 0.4796, + "step": 4680 + }, + { + "epoch": 0.06349701573521432, + "grad_norm": 6.345870018005371, + "learning_rate": 9.461422502398247e-06, + "loss": 0.384, + "step": 4681 + }, + { + "epoch": 0.06351058057514922, + "grad_norm": 6.502596855163574, + "learning_rate": 9.46128545977799e-06, + "loss": 0.3929, + "step": 4682 + }, + { + "epoch": 0.0635241454150841, + "grad_norm": 6.665817737579346, + "learning_rate": 9.461148417157738e-06, + "loss": 0.4351, + "step": 4683 + }, + { + "epoch": 0.06353771025501899, + "grad_norm": 8.588196754455566, + "learning_rate": 9.461011374537483e-06, + "loss": 0.5462, + "step": 4684 + }, + { + "epoch": 0.06355127509495388, + "grad_norm": 6.009618759155273, + "learning_rate": 9.460874331917226e-06, + "loss": 0.3188, + "step": 4685 + }, + { + "epoch": 0.06356483993488876, + "grad_norm": 5.337634563446045, + "learning_rate": 9.460737289296971e-06, + "loss": 0.2744, + "step": 4686 + }, + { + "epoch": 0.06357840477482365, + "grad_norm": 5.24216365814209, + "learning_rate": 9.460600246676718e-06, + "loss": 0.2372, + "step": 4687 + }, + { + "epoch": 0.06359196961475855, + "grad_norm": 8.106593132019043, + "learning_rate": 9.460463204056462e-06, + "loss": 0.5592, + "step": 4688 + }, + { + "epoch": 0.06360553445469344, + "grad_norm": 8.232451438903809, + "learning_rate": 9.460326161436207e-06, + "loss": 0.43, + "step": 4689 + }, + { + "epoch": 0.06361909929462832, + "grad_norm": 7.767101764678955, + "learning_rate": 9.460189118815952e-06, + "loss": 0.5367, + "step": 4690 + }, + { + "epoch": 0.06363266413456321, + "grad_norm": 7.222119331359863, + "learning_rate": 9.460052076195697e-06, + "loss": 0.4096, + "step": 4691 + }, + { + "epoch": 0.0636462289744981, + "grad_norm": 7.16351842880249, + "learning_rate": 9.459915033575442e-06, + "loss": 0.3906, + "step": 4692 + }, + { + "epoch": 0.06365979381443299, + "grad_norm": 5.8841047286987305, + "learning_rate": 9.459777990955188e-06, + "loss": 0.3535, + "step": 4693 + }, + { + "epoch": 0.06367335865436788, + "grad_norm": 7.966698169708252, + "learning_rate": 9.459640948334933e-06, + "loss": 0.473, + "step": 4694 + }, + { + "epoch": 0.06368692349430277, + "grad_norm": 7.866281986236572, + "learning_rate": 9.459503905714678e-06, + "loss": 0.4423, + "step": 4695 + }, + { + "epoch": 0.06370048833423765, + "grad_norm": 6.305347442626953, + "learning_rate": 9.459366863094423e-06, + "loss": 0.4759, + "step": 4696 + }, + { + "epoch": 0.06371405317417254, + "grad_norm": 5.501872539520264, + "learning_rate": 9.459229820474168e-06, + "loss": 0.3627, + "step": 4697 + }, + { + "epoch": 0.06372761801410744, + "grad_norm": 9.103218078613281, + "learning_rate": 9.459092777853914e-06, + "loss": 0.6029, + "step": 4698 + }, + { + "epoch": 0.06374118285404232, + "grad_norm": 6.176473140716553, + "learning_rate": 9.458955735233659e-06, + "loss": 0.2666, + "step": 4699 + }, + { + "epoch": 0.06375474769397721, + "grad_norm": 8.847685813903809, + "learning_rate": 9.458818692613404e-06, + "loss": 0.5445, + "step": 4700 + }, + { + "epoch": 0.0637683125339121, + "grad_norm": 5.573643684387207, + "learning_rate": 9.458681649993149e-06, + "loss": 0.4111, + "step": 4701 + }, + { + "epoch": 0.06378187737384698, + "grad_norm": 8.54345417022705, + "learning_rate": 9.458544607372894e-06, + "loss": 0.4901, + "step": 4702 + }, + { + "epoch": 0.06379544221378187, + "grad_norm": 5.571096897125244, + "learning_rate": 9.458407564752638e-06, + "loss": 0.4008, + "step": 4703 + }, + { + "epoch": 0.06380900705371677, + "grad_norm": 7.14530611038208, + "learning_rate": 9.458270522132383e-06, + "loss": 0.4648, + "step": 4704 + }, + { + "epoch": 0.06382257189365166, + "grad_norm": 8.083465576171875, + "learning_rate": 9.45813347951213e-06, + "loss": 0.4574, + "step": 4705 + }, + { + "epoch": 0.06383613673358654, + "grad_norm": 6.699623107910156, + "learning_rate": 9.457996436891875e-06, + "loss": 0.5082, + "step": 4706 + }, + { + "epoch": 0.06384970157352143, + "grad_norm": 9.337979316711426, + "learning_rate": 9.457859394271618e-06, + "loss": 0.5812, + "step": 4707 + }, + { + "epoch": 0.06386326641345633, + "grad_norm": 8.64738941192627, + "learning_rate": 9.457722351651364e-06, + "loss": 0.4205, + "step": 4708 + }, + { + "epoch": 0.0638768312533912, + "grad_norm": 8.003625869750977, + "learning_rate": 9.45758530903111e-06, + "loss": 0.4685, + "step": 4709 + }, + { + "epoch": 0.0638903960933261, + "grad_norm": 5.80776309967041, + "learning_rate": 9.457448266410854e-06, + "loss": 0.4705, + "step": 4710 + }, + { + "epoch": 0.06390396093326099, + "grad_norm": 8.076574325561523, + "learning_rate": 9.4573112237906e-06, + "loss": 0.4055, + "step": 4711 + }, + { + "epoch": 0.06391752577319587, + "grad_norm": 7.984683513641357, + "learning_rate": 9.457174181170344e-06, + "loss": 0.5085, + "step": 4712 + }, + { + "epoch": 0.06393109061313076, + "grad_norm": 6.753066539764404, + "learning_rate": 9.45703713855009e-06, + "loss": 0.4832, + "step": 4713 + }, + { + "epoch": 0.06394465545306566, + "grad_norm": 9.354840278625488, + "learning_rate": 9.456900095929835e-06, + "loss": 0.4766, + "step": 4714 + }, + { + "epoch": 0.06395822029300054, + "grad_norm": 9.065055847167969, + "learning_rate": 9.45676305330958e-06, + "loss": 0.5931, + "step": 4715 + }, + { + "epoch": 0.06397178513293543, + "grad_norm": 7.048978328704834, + "learning_rate": 9.456626010689325e-06, + "loss": 0.4356, + "step": 4716 + }, + { + "epoch": 0.06398534997287032, + "grad_norm": 6.451005458831787, + "learning_rate": 9.45648896806907e-06, + "loss": 0.4071, + "step": 4717 + }, + { + "epoch": 0.0639989148128052, + "grad_norm": 7.717185974121094, + "learning_rate": 9.456351925448815e-06, + "loss": 0.3656, + "step": 4718 + }, + { + "epoch": 0.0640124796527401, + "grad_norm": 6.763774871826172, + "learning_rate": 9.45621488282856e-06, + "loss": 0.3954, + "step": 4719 + }, + { + "epoch": 0.06402604449267499, + "grad_norm": 8.674193382263184, + "learning_rate": 9.456077840208306e-06, + "loss": 0.3773, + "step": 4720 + }, + { + "epoch": 0.06403960933260988, + "grad_norm": 6.556682109832764, + "learning_rate": 9.455940797588051e-06, + "loss": 0.4592, + "step": 4721 + }, + { + "epoch": 0.06405317417254476, + "grad_norm": 7.341022968292236, + "learning_rate": 9.455803754967796e-06, + "loss": 0.5672, + "step": 4722 + }, + { + "epoch": 0.06406673901247965, + "grad_norm": 8.828081130981445, + "learning_rate": 9.455666712347541e-06, + "loss": 0.5354, + "step": 4723 + }, + { + "epoch": 0.06408030385241455, + "grad_norm": 9.060254096984863, + "learning_rate": 9.455529669727287e-06, + "loss": 0.671, + "step": 4724 + }, + { + "epoch": 0.06409386869234943, + "grad_norm": 9.913342475891113, + "learning_rate": 9.45539262710703e-06, + "loss": 0.501, + "step": 4725 + }, + { + "epoch": 0.06410743353228432, + "grad_norm": 8.063142776489258, + "learning_rate": 9.455255584486777e-06, + "loss": 0.4801, + "step": 4726 + }, + { + "epoch": 0.06412099837221921, + "grad_norm": 9.013092041015625, + "learning_rate": 9.455118541866522e-06, + "loss": 0.4666, + "step": 4727 + }, + { + "epoch": 0.06413456321215409, + "grad_norm": 5.220376014709473, + "learning_rate": 9.454981499246266e-06, + "loss": 0.2331, + "step": 4728 + }, + { + "epoch": 0.06414812805208898, + "grad_norm": 7.095572471618652, + "learning_rate": 9.45484445662601e-06, + "loss": 0.4843, + "step": 4729 + }, + { + "epoch": 0.06416169289202388, + "grad_norm": 7.105030059814453, + "learning_rate": 9.454707414005758e-06, + "loss": 0.4728, + "step": 4730 + }, + { + "epoch": 0.06417525773195876, + "grad_norm": 7.12166166305542, + "learning_rate": 9.454570371385503e-06, + "loss": 0.6313, + "step": 4731 + }, + { + "epoch": 0.06418882257189365, + "grad_norm": 5.229072570800781, + "learning_rate": 9.454433328765246e-06, + "loss": 0.4387, + "step": 4732 + }, + { + "epoch": 0.06420238741182854, + "grad_norm": 5.990358829498291, + "learning_rate": 9.454296286144991e-06, + "loss": 0.2896, + "step": 4733 + }, + { + "epoch": 0.06421595225176342, + "grad_norm": 6.717698097229004, + "learning_rate": 9.454159243524737e-06, + "loss": 0.3484, + "step": 4734 + }, + { + "epoch": 0.06422951709169832, + "grad_norm": 5.361735820770264, + "learning_rate": 9.454022200904482e-06, + "loss": 0.3294, + "step": 4735 + }, + { + "epoch": 0.06424308193163321, + "grad_norm": 8.421164512634277, + "learning_rate": 9.453885158284227e-06, + "loss": 0.3868, + "step": 4736 + }, + { + "epoch": 0.0642566467715681, + "grad_norm": 7.435184001922607, + "learning_rate": 9.453748115663972e-06, + "loss": 0.3906, + "step": 4737 + }, + { + "epoch": 0.06427021161150298, + "grad_norm": 5.671615123748779, + "learning_rate": 9.453611073043717e-06, + "loss": 0.4859, + "step": 4738 + }, + { + "epoch": 0.06428377645143787, + "grad_norm": 8.462576866149902, + "learning_rate": 9.453474030423462e-06, + "loss": 0.6204, + "step": 4739 + }, + { + "epoch": 0.06429734129137277, + "grad_norm": 8.133086204528809, + "learning_rate": 9.453336987803208e-06, + "loss": 0.5126, + "step": 4740 + }, + { + "epoch": 0.06431090613130765, + "grad_norm": 7.058675289154053, + "learning_rate": 9.453199945182953e-06, + "loss": 0.4143, + "step": 4741 + }, + { + "epoch": 0.06432447097124254, + "grad_norm": 5.724821090698242, + "learning_rate": 9.453062902562698e-06, + "loss": 0.4776, + "step": 4742 + }, + { + "epoch": 0.06433803581117743, + "grad_norm": 6.448230266571045, + "learning_rate": 9.452925859942443e-06, + "loss": 0.4158, + "step": 4743 + }, + { + "epoch": 0.06435160065111231, + "grad_norm": 7.204996109008789, + "learning_rate": 9.452788817322188e-06, + "loss": 0.4936, + "step": 4744 + }, + { + "epoch": 0.0643651654910472, + "grad_norm": 6.6610107421875, + "learning_rate": 9.452651774701934e-06, + "loss": 0.2478, + "step": 4745 + }, + { + "epoch": 0.0643787303309821, + "grad_norm": 6.450158596038818, + "learning_rate": 9.452514732081679e-06, + "loss": 0.2828, + "step": 4746 + }, + { + "epoch": 0.06439229517091698, + "grad_norm": 6.0308637619018555, + "learning_rate": 9.452377689461422e-06, + "loss": 0.5849, + "step": 4747 + }, + { + "epoch": 0.06440586001085187, + "grad_norm": 6.551565647125244, + "learning_rate": 9.452240646841169e-06, + "loss": 0.3851, + "step": 4748 + }, + { + "epoch": 0.06441942485078676, + "grad_norm": 7.418384552001953, + "learning_rate": 9.452103604220914e-06, + "loss": 0.356, + "step": 4749 + }, + { + "epoch": 0.06443298969072164, + "grad_norm": 5.9942731857299805, + "learning_rate": 9.451966561600658e-06, + "loss": 0.3414, + "step": 4750 + }, + { + "epoch": 0.06444655453065654, + "grad_norm": 8.315816879272461, + "learning_rate": 9.451829518980403e-06, + "loss": 0.4853, + "step": 4751 + }, + { + "epoch": 0.06446011937059143, + "grad_norm": 9.122920989990234, + "learning_rate": 9.45169247636015e-06, + "loss": 0.5857, + "step": 4752 + }, + { + "epoch": 0.06447368421052632, + "grad_norm": 5.1771931648254395, + "learning_rate": 9.451555433739893e-06, + "loss": 0.3645, + "step": 4753 + }, + { + "epoch": 0.0644872490504612, + "grad_norm": 8.339383125305176, + "learning_rate": 9.451418391119638e-06, + "loss": 0.4586, + "step": 4754 + }, + { + "epoch": 0.0645008138903961, + "grad_norm": 9.403985977172852, + "learning_rate": 9.451281348499384e-06, + "loss": 0.4846, + "step": 4755 + }, + { + "epoch": 0.06451437873033099, + "grad_norm": 5.8434929847717285, + "learning_rate": 9.451144305879129e-06, + "loss": 0.3319, + "step": 4756 + }, + { + "epoch": 0.06452794357026587, + "grad_norm": 7.527695655822754, + "learning_rate": 9.451007263258874e-06, + "loss": 0.4286, + "step": 4757 + }, + { + "epoch": 0.06454150841020076, + "grad_norm": 7.172952175140381, + "learning_rate": 9.45087022063862e-06, + "loss": 0.3179, + "step": 4758 + }, + { + "epoch": 0.06455507325013565, + "grad_norm": 9.334198951721191, + "learning_rate": 9.450733178018364e-06, + "loss": 0.4336, + "step": 4759 + }, + { + "epoch": 0.06456863809007053, + "grad_norm": 8.618959426879883, + "learning_rate": 9.45059613539811e-06, + "loss": 0.2989, + "step": 4760 + }, + { + "epoch": 0.06458220293000543, + "grad_norm": 6.577069282531738, + "learning_rate": 9.450459092777855e-06, + "loss": 0.4176, + "step": 4761 + }, + { + "epoch": 0.06459576776994032, + "grad_norm": 7.702256202697754, + "learning_rate": 9.4503220501576e-06, + "loss": 0.4844, + "step": 4762 + }, + { + "epoch": 0.0646093326098752, + "grad_norm": 7.22300386428833, + "learning_rate": 9.450185007537345e-06, + "loss": 0.4551, + "step": 4763 + }, + { + "epoch": 0.06462289744981009, + "grad_norm": 8.156120300292969, + "learning_rate": 9.45004796491709e-06, + "loss": 0.3866, + "step": 4764 + }, + { + "epoch": 0.06463646228974498, + "grad_norm": 7.702184677124023, + "learning_rate": 9.449910922296835e-06, + "loss": 0.4655, + "step": 4765 + }, + { + "epoch": 0.06465002712967986, + "grad_norm": 8.538884162902832, + "learning_rate": 9.44977387967658e-06, + "loss": 0.4492, + "step": 4766 + }, + { + "epoch": 0.06466359196961476, + "grad_norm": 8.461748123168945, + "learning_rate": 9.449636837056326e-06, + "loss": 0.5118, + "step": 4767 + }, + { + "epoch": 0.06467715680954965, + "grad_norm": 6.9465651512146, + "learning_rate": 9.44949979443607e-06, + "loss": 0.5761, + "step": 4768 + }, + { + "epoch": 0.06469072164948454, + "grad_norm": 7.778523921966553, + "learning_rate": 9.449362751815816e-06, + "loss": 0.6427, + "step": 4769 + }, + { + "epoch": 0.06470428648941942, + "grad_norm": 7.221389293670654, + "learning_rate": 9.449225709195561e-06, + "loss": 0.5079, + "step": 4770 + }, + { + "epoch": 0.06471785132935431, + "grad_norm": 10.108973503112793, + "learning_rate": 9.449088666575305e-06, + "loss": 0.5418, + "step": 4771 + }, + { + "epoch": 0.06473141616928921, + "grad_norm": 7.1080851554870605, + "learning_rate": 9.44895162395505e-06, + "loss": 0.493, + "step": 4772 + }, + { + "epoch": 0.06474498100922409, + "grad_norm": 7.114293098449707, + "learning_rate": 9.448814581334795e-06, + "loss": 0.429, + "step": 4773 + }, + { + "epoch": 0.06475854584915898, + "grad_norm": 10.917875289916992, + "learning_rate": 9.448677538714542e-06, + "loss": 0.6752, + "step": 4774 + }, + { + "epoch": 0.06477211068909387, + "grad_norm": 8.484623908996582, + "learning_rate": 9.448540496094286e-06, + "loss": 0.5194, + "step": 4775 + }, + { + "epoch": 0.06478567552902875, + "grad_norm": 7.934675216674805, + "learning_rate": 9.44840345347403e-06, + "loss": 0.6114, + "step": 4776 + }, + { + "epoch": 0.06479924036896365, + "grad_norm": 7.957132816314697, + "learning_rate": 9.448266410853776e-06, + "loss": 0.5954, + "step": 4777 + }, + { + "epoch": 0.06481280520889854, + "grad_norm": 8.76875114440918, + "learning_rate": 9.448129368233521e-06, + "loss": 0.6049, + "step": 4778 + }, + { + "epoch": 0.06482637004883342, + "grad_norm": 6.457373142242432, + "learning_rate": 9.447992325613266e-06, + "loss": 0.4724, + "step": 4779 + }, + { + "epoch": 0.06483993488876831, + "grad_norm": 10.0708589553833, + "learning_rate": 9.447855282993011e-06, + "loss": 0.5079, + "step": 4780 + }, + { + "epoch": 0.0648534997287032, + "grad_norm": 8.850757598876953, + "learning_rate": 9.447718240372757e-06, + "loss": 0.8194, + "step": 4781 + }, + { + "epoch": 0.0648670645686381, + "grad_norm": 7.76820707321167, + "learning_rate": 9.447581197752502e-06, + "loss": 0.4589, + "step": 4782 + }, + { + "epoch": 0.06488062940857298, + "grad_norm": 8.910704612731934, + "learning_rate": 9.447444155132247e-06, + "loss": 0.5658, + "step": 4783 + }, + { + "epoch": 0.06489419424850787, + "grad_norm": 6.255280017852783, + "learning_rate": 9.447307112511992e-06, + "loss": 0.3485, + "step": 4784 + }, + { + "epoch": 0.06490775908844276, + "grad_norm": 6.20657205581665, + "learning_rate": 9.447170069891737e-06, + "loss": 0.548, + "step": 4785 + }, + { + "epoch": 0.06492132392837764, + "grad_norm": 7.950140953063965, + "learning_rate": 9.447033027271483e-06, + "loss": 0.4335, + "step": 4786 + }, + { + "epoch": 0.06493488876831253, + "grad_norm": 9.454170227050781, + "learning_rate": 9.446895984651228e-06, + "loss": 0.6545, + "step": 4787 + }, + { + "epoch": 0.06494845360824743, + "grad_norm": 5.893679618835449, + "learning_rate": 9.446758942030973e-06, + "loss": 0.3779, + "step": 4788 + }, + { + "epoch": 0.06496201844818231, + "grad_norm": 7.649264812469482, + "learning_rate": 9.446621899410718e-06, + "loss": 0.5083, + "step": 4789 + }, + { + "epoch": 0.0649755832881172, + "grad_norm": 7.816325664520264, + "learning_rate": 9.446484856790462e-06, + "loss": 0.3926, + "step": 4790 + }, + { + "epoch": 0.0649891481280521, + "grad_norm": 8.491294860839844, + "learning_rate": 9.446347814170208e-06, + "loss": 0.5092, + "step": 4791 + }, + { + "epoch": 0.06500271296798697, + "grad_norm": 10.387739181518555, + "learning_rate": 9.446210771549954e-06, + "loss": 0.5933, + "step": 4792 + }, + { + "epoch": 0.06501627780792187, + "grad_norm": 6.905694007873535, + "learning_rate": 9.446073728929697e-06, + "loss": 0.548, + "step": 4793 + }, + { + "epoch": 0.06502984264785676, + "grad_norm": 7.793629169464111, + "learning_rate": 9.445936686309442e-06, + "loss": 0.6157, + "step": 4794 + }, + { + "epoch": 0.06504340748779164, + "grad_norm": 6.952235221862793, + "learning_rate": 9.445799643689189e-06, + "loss": 0.4436, + "step": 4795 + }, + { + "epoch": 0.06505697232772653, + "grad_norm": 8.044235229492188, + "learning_rate": 9.445662601068933e-06, + "loss": 0.4418, + "step": 4796 + }, + { + "epoch": 0.06507053716766142, + "grad_norm": 9.396857261657715, + "learning_rate": 9.445525558448678e-06, + "loss": 0.5409, + "step": 4797 + }, + { + "epoch": 0.06508410200759632, + "grad_norm": 9.31339168548584, + "learning_rate": 9.445388515828423e-06, + "loss": 0.622, + "step": 4798 + }, + { + "epoch": 0.0650976668475312, + "grad_norm": 9.813643455505371, + "learning_rate": 9.44525147320817e-06, + "loss": 0.471, + "step": 4799 + }, + { + "epoch": 0.06511123168746609, + "grad_norm": 8.241580963134766, + "learning_rate": 9.445114430587913e-06, + "loss": 0.7288, + "step": 4800 + }, + { + "epoch": 0.06512479652740098, + "grad_norm": 8.84692668914795, + "learning_rate": 9.444977387967659e-06, + "loss": 0.5046, + "step": 4801 + }, + { + "epoch": 0.06513836136733586, + "grad_norm": 6.492965221405029, + "learning_rate": 9.444840345347404e-06, + "loss": 0.4083, + "step": 4802 + }, + { + "epoch": 0.06515192620727076, + "grad_norm": 8.568784713745117, + "learning_rate": 9.444703302727149e-06, + "loss": 0.5715, + "step": 4803 + }, + { + "epoch": 0.06516549104720565, + "grad_norm": 8.227787017822266, + "learning_rate": 9.444566260106894e-06, + "loss": 0.5159, + "step": 4804 + }, + { + "epoch": 0.06517905588714053, + "grad_norm": 7.11642599105835, + "learning_rate": 9.44442921748664e-06, + "loss": 0.3694, + "step": 4805 + }, + { + "epoch": 0.06519262072707542, + "grad_norm": 7.047884941101074, + "learning_rate": 9.444292174866384e-06, + "loss": 0.5365, + "step": 4806 + }, + { + "epoch": 0.06520618556701031, + "grad_norm": 7.607959270477295, + "learning_rate": 9.44415513224613e-06, + "loss": 0.5075, + "step": 4807 + }, + { + "epoch": 0.06521975040694519, + "grad_norm": 6.377920627593994, + "learning_rate": 9.444018089625875e-06, + "loss": 0.5726, + "step": 4808 + }, + { + "epoch": 0.06523331524688009, + "grad_norm": 6.7274274826049805, + "learning_rate": 9.44388104700562e-06, + "loss": 0.3109, + "step": 4809 + }, + { + "epoch": 0.06524688008681498, + "grad_norm": 8.57203197479248, + "learning_rate": 9.443744004385365e-06, + "loss": 0.4642, + "step": 4810 + }, + { + "epoch": 0.06526044492674986, + "grad_norm": 9.141521453857422, + "learning_rate": 9.443606961765109e-06, + "loss": 0.5892, + "step": 4811 + }, + { + "epoch": 0.06527400976668475, + "grad_norm": 8.446496963500977, + "learning_rate": 9.443469919144855e-06, + "loss": 0.6025, + "step": 4812 + }, + { + "epoch": 0.06528757460661964, + "grad_norm": 6.895211219787598, + "learning_rate": 9.4433328765246e-06, + "loss": 0.4696, + "step": 4813 + }, + { + "epoch": 0.06530113944655454, + "grad_norm": 7.129767894744873, + "learning_rate": 9.443195833904346e-06, + "loss": 0.3288, + "step": 4814 + }, + { + "epoch": 0.06531470428648942, + "grad_norm": 6.581816673278809, + "learning_rate": 9.44305879128409e-06, + "loss": 0.5206, + "step": 4815 + }, + { + "epoch": 0.06532826912642431, + "grad_norm": 8.013421058654785, + "learning_rate": 9.442921748663834e-06, + "loss": 0.6237, + "step": 4816 + }, + { + "epoch": 0.0653418339663592, + "grad_norm": 7.560216426849365, + "learning_rate": 9.442784706043581e-06, + "loss": 0.4682, + "step": 4817 + }, + { + "epoch": 0.06535539880629408, + "grad_norm": 8.105766296386719, + "learning_rate": 9.442647663423325e-06, + "loss": 0.4836, + "step": 4818 + }, + { + "epoch": 0.06536896364622898, + "grad_norm": 7.518866539001465, + "learning_rate": 9.44251062080307e-06, + "loss": 0.4595, + "step": 4819 + }, + { + "epoch": 0.06538252848616387, + "grad_norm": 7.130196571350098, + "learning_rate": 9.442373578182815e-06, + "loss": 0.5789, + "step": 4820 + }, + { + "epoch": 0.06539609332609875, + "grad_norm": 7.420502662658691, + "learning_rate": 9.44223653556256e-06, + "loss": 0.5115, + "step": 4821 + }, + { + "epoch": 0.06540965816603364, + "grad_norm": 9.361381530761719, + "learning_rate": 9.442099492942306e-06, + "loss": 0.549, + "step": 4822 + }, + { + "epoch": 0.06542322300596853, + "grad_norm": 7.627239227294922, + "learning_rate": 9.44196245032205e-06, + "loss": 0.4832, + "step": 4823 + }, + { + "epoch": 0.06543678784590341, + "grad_norm": 5.974226474761963, + "learning_rate": 9.441825407701796e-06, + "loss": 0.4339, + "step": 4824 + }, + { + "epoch": 0.0654503526858383, + "grad_norm": 6.015488624572754, + "learning_rate": 9.441688365081541e-06, + "loss": 0.4868, + "step": 4825 + }, + { + "epoch": 0.0654639175257732, + "grad_norm": 7.831210613250732, + "learning_rate": 9.441551322461286e-06, + "loss": 0.445, + "step": 4826 + }, + { + "epoch": 0.06547748236570808, + "grad_norm": 5.982283115386963, + "learning_rate": 9.441414279841031e-06, + "loss": 0.3417, + "step": 4827 + }, + { + "epoch": 0.06549104720564297, + "grad_norm": 8.636922836303711, + "learning_rate": 9.441277237220777e-06, + "loss": 0.387, + "step": 4828 + }, + { + "epoch": 0.06550461204557786, + "grad_norm": 6.525849342346191, + "learning_rate": 9.441140194600522e-06, + "loss": 0.4077, + "step": 4829 + }, + { + "epoch": 0.06551817688551276, + "grad_norm": 5.605798244476318, + "learning_rate": 9.441003151980267e-06, + "loss": 0.3299, + "step": 4830 + }, + { + "epoch": 0.06553174172544764, + "grad_norm": 5.315690994262695, + "learning_rate": 9.440866109360012e-06, + "loss": 0.3236, + "step": 4831 + }, + { + "epoch": 0.06554530656538253, + "grad_norm": 6.616147041320801, + "learning_rate": 9.440729066739757e-06, + "loss": 0.396, + "step": 4832 + }, + { + "epoch": 0.06555887140531742, + "grad_norm": 5.798742771148682, + "learning_rate": 9.4405920241195e-06, + "loss": 0.366, + "step": 4833 + }, + { + "epoch": 0.0655724362452523, + "grad_norm": 7.371942043304443, + "learning_rate": 9.440454981499248e-06, + "loss": 0.5927, + "step": 4834 + }, + { + "epoch": 0.0655860010851872, + "grad_norm": 6.428966522216797, + "learning_rate": 9.440317938878993e-06, + "loss": 0.37, + "step": 4835 + }, + { + "epoch": 0.06559956592512209, + "grad_norm": 7.213183403015137, + "learning_rate": 9.440180896258736e-06, + "loss": 0.4711, + "step": 4836 + }, + { + "epoch": 0.06561313076505697, + "grad_norm": 8.833056449890137, + "learning_rate": 9.440043853638482e-06, + "loss": 0.5364, + "step": 4837 + }, + { + "epoch": 0.06562669560499186, + "grad_norm": 5.978240489959717, + "learning_rate": 9.439906811018228e-06, + "loss": 0.4622, + "step": 4838 + }, + { + "epoch": 0.06564026044492675, + "grad_norm": 6.74068021774292, + "learning_rate": 9.439769768397974e-06, + "loss": 0.4547, + "step": 4839 + }, + { + "epoch": 0.06565382528486163, + "grad_norm": 5.627583980560303, + "learning_rate": 9.439632725777717e-06, + "loss": 0.2917, + "step": 4840 + }, + { + "epoch": 0.06566739012479653, + "grad_norm": 7.947125434875488, + "learning_rate": 9.439495683157462e-06, + "loss": 0.4176, + "step": 4841 + }, + { + "epoch": 0.06568095496473142, + "grad_norm": 7.634304046630859, + "learning_rate": 9.439358640537207e-06, + "loss": 0.5279, + "step": 4842 + }, + { + "epoch": 0.0656945198046663, + "grad_norm": 7.283626556396484, + "learning_rate": 9.439221597916953e-06, + "loss": 0.4384, + "step": 4843 + }, + { + "epoch": 0.06570808464460119, + "grad_norm": 8.669291496276855, + "learning_rate": 9.439084555296698e-06, + "loss": 0.4045, + "step": 4844 + }, + { + "epoch": 0.06572164948453608, + "grad_norm": 5.512225151062012, + "learning_rate": 9.438947512676443e-06, + "loss": 0.3748, + "step": 4845 + }, + { + "epoch": 0.06573521432447098, + "grad_norm": 5.992414951324463, + "learning_rate": 9.438810470056188e-06, + "loss": 0.2993, + "step": 4846 + }, + { + "epoch": 0.06574877916440586, + "grad_norm": 6.625701904296875, + "learning_rate": 9.438673427435933e-06, + "loss": 0.4097, + "step": 4847 + }, + { + "epoch": 0.06576234400434075, + "grad_norm": 9.003067970275879, + "learning_rate": 9.438536384815679e-06, + "loss": 0.4554, + "step": 4848 + }, + { + "epoch": 0.06577590884427564, + "grad_norm": 6.783991813659668, + "learning_rate": 9.438399342195424e-06, + "loss": 0.3955, + "step": 4849 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 8.093332290649414, + "learning_rate": 9.438262299575169e-06, + "loss": 0.5238, + "step": 4850 + }, + { + "epoch": 0.06580303852414542, + "grad_norm": 5.793014049530029, + "learning_rate": 9.438125256954914e-06, + "loss": 0.4743, + "step": 4851 + }, + { + "epoch": 0.06581660336408031, + "grad_norm": 6.596256256103516, + "learning_rate": 9.43798821433466e-06, + "loss": 0.3246, + "step": 4852 + }, + { + "epoch": 0.06583016820401519, + "grad_norm": 5.9357404708862305, + "learning_rate": 9.437851171714404e-06, + "loss": 0.4216, + "step": 4853 + }, + { + "epoch": 0.06584373304395008, + "grad_norm": 6.847752571105957, + "learning_rate": 9.437714129094148e-06, + "loss": 0.36, + "step": 4854 + }, + { + "epoch": 0.06585729788388497, + "grad_norm": 6.785961151123047, + "learning_rate": 9.437577086473895e-06, + "loss": 0.4088, + "step": 4855 + }, + { + "epoch": 0.06587086272381985, + "grad_norm": 9.481924057006836, + "learning_rate": 9.43744004385364e-06, + "loss": 0.5493, + "step": 4856 + }, + { + "epoch": 0.06588442756375475, + "grad_norm": 6.061048984527588, + "learning_rate": 9.437303001233385e-06, + "loss": 0.3236, + "step": 4857 + }, + { + "epoch": 0.06589799240368964, + "grad_norm": 5.480085372924805, + "learning_rate": 9.437165958613129e-06, + "loss": 0.2746, + "step": 4858 + }, + { + "epoch": 0.06591155724362452, + "grad_norm": 6.784924030303955, + "learning_rate": 9.437028915992874e-06, + "loss": 0.3731, + "step": 4859 + }, + { + "epoch": 0.06592512208355941, + "grad_norm": 8.786995887756348, + "learning_rate": 9.43689187337262e-06, + "loss": 0.5125, + "step": 4860 + }, + { + "epoch": 0.0659386869234943, + "grad_norm": 4.699539661407471, + "learning_rate": 9.436754830752364e-06, + "loss": 0.2828, + "step": 4861 + }, + { + "epoch": 0.0659522517634292, + "grad_norm": 5.520371437072754, + "learning_rate": 9.43661778813211e-06, + "loss": 0.3742, + "step": 4862 + }, + { + "epoch": 0.06596581660336408, + "grad_norm": 9.217061996459961, + "learning_rate": 9.436480745511855e-06, + "loss": 0.4443, + "step": 4863 + }, + { + "epoch": 0.06597938144329897, + "grad_norm": 8.988061904907227, + "learning_rate": 9.4363437028916e-06, + "loss": 0.4489, + "step": 4864 + }, + { + "epoch": 0.06599294628323386, + "grad_norm": 5.393219947814941, + "learning_rate": 9.436206660271345e-06, + "loss": 0.311, + "step": 4865 + }, + { + "epoch": 0.06600651112316874, + "grad_norm": 5.600555419921875, + "learning_rate": 9.43606961765109e-06, + "loss": 0.3463, + "step": 4866 + }, + { + "epoch": 0.06602007596310364, + "grad_norm": 6.206772804260254, + "learning_rate": 9.435932575030835e-06, + "loss": 0.358, + "step": 4867 + }, + { + "epoch": 0.06603364080303853, + "grad_norm": 6.5567402839660645, + "learning_rate": 9.43579553241058e-06, + "loss": 0.3859, + "step": 4868 + }, + { + "epoch": 0.06604720564297341, + "grad_norm": 5.7430100440979, + "learning_rate": 9.435658489790326e-06, + "loss": 0.295, + "step": 4869 + }, + { + "epoch": 0.0660607704829083, + "grad_norm": 5.990951061248779, + "learning_rate": 9.43552144717007e-06, + "loss": 0.4127, + "step": 4870 + }, + { + "epoch": 0.0660743353228432, + "grad_norm": 5.039957046508789, + "learning_rate": 9.435384404549816e-06, + "loss": 0.3167, + "step": 4871 + }, + { + "epoch": 0.06608790016277807, + "grad_norm": 6.171939373016357, + "learning_rate": 9.435247361929561e-06, + "loss": 0.393, + "step": 4872 + }, + { + "epoch": 0.06610146500271297, + "grad_norm": 7.143960475921631, + "learning_rate": 9.435110319309306e-06, + "loss": 0.3258, + "step": 4873 + }, + { + "epoch": 0.06611502984264786, + "grad_norm": 5.923823833465576, + "learning_rate": 9.434973276689051e-06, + "loss": 0.3891, + "step": 4874 + }, + { + "epoch": 0.06612859468258274, + "grad_norm": 6.159071922302246, + "learning_rate": 9.434836234068797e-06, + "loss": 0.3959, + "step": 4875 + }, + { + "epoch": 0.06614215952251763, + "grad_norm": 8.630858421325684, + "learning_rate": 9.43469919144854e-06, + "loss": 0.3459, + "step": 4876 + }, + { + "epoch": 0.06615572436245253, + "grad_norm": 7.361391067504883, + "learning_rate": 9.434562148828287e-06, + "loss": 0.4333, + "step": 4877 + }, + { + "epoch": 0.06616928920238742, + "grad_norm": 7.241481304168701, + "learning_rate": 9.434425106208032e-06, + "loss": 0.4162, + "step": 4878 + }, + { + "epoch": 0.0661828540423223, + "grad_norm": 6.5161452293396, + "learning_rate": 9.434288063587776e-06, + "loss": 0.3936, + "step": 4879 + }, + { + "epoch": 0.06619641888225719, + "grad_norm": 7.855652809143066, + "learning_rate": 9.434151020967521e-06, + "loss": 0.4708, + "step": 4880 + }, + { + "epoch": 0.06620998372219208, + "grad_norm": 7.3528218269348145, + "learning_rate": 9.434013978347268e-06, + "loss": 0.267, + "step": 4881 + }, + { + "epoch": 0.06622354856212696, + "grad_norm": 6.682910919189453, + "learning_rate": 9.433876935727013e-06, + "loss": 0.3721, + "step": 4882 + }, + { + "epoch": 0.06623711340206186, + "grad_norm": 5.93548059463501, + "learning_rate": 9.433739893106756e-06, + "loss": 0.447, + "step": 4883 + }, + { + "epoch": 0.06625067824199675, + "grad_norm": 6.8738203048706055, + "learning_rate": 9.433602850486502e-06, + "loss": 0.3783, + "step": 4884 + }, + { + "epoch": 0.06626424308193163, + "grad_norm": 6.762606620788574, + "learning_rate": 9.433465807866247e-06, + "loss": 0.3671, + "step": 4885 + }, + { + "epoch": 0.06627780792186652, + "grad_norm": 7.711604118347168, + "learning_rate": 9.433328765245992e-06, + "loss": 0.5878, + "step": 4886 + }, + { + "epoch": 0.06629137276180141, + "grad_norm": 6.442255020141602, + "learning_rate": 9.433191722625737e-06, + "loss": 0.5005, + "step": 4887 + }, + { + "epoch": 0.0663049376017363, + "grad_norm": 6.653860092163086, + "learning_rate": 9.433054680005482e-06, + "loss": 0.3524, + "step": 4888 + }, + { + "epoch": 0.06631850244167119, + "grad_norm": 7.469925880432129, + "learning_rate": 9.432917637385227e-06, + "loss": 0.5691, + "step": 4889 + }, + { + "epoch": 0.06633206728160608, + "grad_norm": 6.625485897064209, + "learning_rate": 9.432780594764973e-06, + "loss": 0.4386, + "step": 4890 + }, + { + "epoch": 0.06634563212154096, + "grad_norm": 6.054630756378174, + "learning_rate": 9.432643552144718e-06, + "loss": 0.4345, + "step": 4891 + }, + { + "epoch": 0.06635919696147585, + "grad_norm": 6.097280025482178, + "learning_rate": 9.432506509524463e-06, + "loss": 0.3969, + "step": 4892 + }, + { + "epoch": 0.06637276180141075, + "grad_norm": 11.635088920593262, + "learning_rate": 9.432369466904208e-06, + "loss": 0.4464, + "step": 4893 + }, + { + "epoch": 0.06638632664134564, + "grad_norm": 5.808595657348633, + "learning_rate": 9.432232424283953e-06, + "loss": 0.3452, + "step": 4894 + }, + { + "epoch": 0.06639989148128052, + "grad_norm": 6.6349287033081055, + "learning_rate": 9.432095381663699e-06, + "loss": 0.325, + "step": 4895 + }, + { + "epoch": 0.06641345632121541, + "grad_norm": 6.089495658874512, + "learning_rate": 9.431958339043444e-06, + "loss": 0.38, + "step": 4896 + }, + { + "epoch": 0.0664270211611503, + "grad_norm": 6.571595191955566, + "learning_rate": 9.431821296423189e-06, + "loss": 0.3439, + "step": 4897 + }, + { + "epoch": 0.06644058600108518, + "grad_norm": 5.862457752227783, + "learning_rate": 9.431684253802932e-06, + "loss": 0.3136, + "step": 4898 + }, + { + "epoch": 0.06645415084102008, + "grad_norm": 5.299026966094971, + "learning_rate": 9.43154721118268e-06, + "loss": 0.3589, + "step": 4899 + }, + { + "epoch": 0.06646771568095497, + "grad_norm": 4.557490348815918, + "learning_rate": 9.431410168562424e-06, + "loss": 0.2308, + "step": 4900 + }, + { + "epoch": 0.06648128052088985, + "grad_norm": 6.595589637756348, + "learning_rate": 9.431273125942168e-06, + "loss": 0.4131, + "step": 4901 + }, + { + "epoch": 0.06649484536082474, + "grad_norm": 4.352086544036865, + "learning_rate": 9.431136083321913e-06, + "loss": 0.2528, + "step": 4902 + }, + { + "epoch": 0.06650841020075964, + "grad_norm": 7.835720539093018, + "learning_rate": 9.43099904070166e-06, + "loss": 0.4314, + "step": 4903 + }, + { + "epoch": 0.06652197504069451, + "grad_norm": 5.7456769943237305, + "learning_rate": 9.430861998081403e-06, + "loss": 0.3791, + "step": 4904 + }, + { + "epoch": 0.06653553988062941, + "grad_norm": 6.0827765464782715, + "learning_rate": 9.430724955461149e-06, + "loss": 0.4732, + "step": 4905 + }, + { + "epoch": 0.0665491047205643, + "grad_norm": 5.715950012207031, + "learning_rate": 9.430587912840894e-06, + "loss": 0.3575, + "step": 4906 + }, + { + "epoch": 0.06656266956049918, + "grad_norm": 6.261026382446289, + "learning_rate": 9.43045087022064e-06, + "loss": 0.4487, + "step": 4907 + }, + { + "epoch": 0.06657623440043407, + "grad_norm": 6.343835830688477, + "learning_rate": 9.430313827600384e-06, + "loss": 0.3776, + "step": 4908 + }, + { + "epoch": 0.06658979924036897, + "grad_norm": 5.318652153015137, + "learning_rate": 9.43017678498013e-06, + "loss": 0.2873, + "step": 4909 + }, + { + "epoch": 0.06660336408030386, + "grad_norm": 6.48696231842041, + "learning_rate": 9.430039742359875e-06, + "loss": 0.4887, + "step": 4910 + }, + { + "epoch": 0.06661692892023874, + "grad_norm": 8.959114074707031, + "learning_rate": 9.42990269973962e-06, + "loss": 0.4549, + "step": 4911 + }, + { + "epoch": 0.06663049376017363, + "grad_norm": 6.317409515380859, + "learning_rate": 9.429765657119365e-06, + "loss": 0.3447, + "step": 4912 + }, + { + "epoch": 0.06664405860010852, + "grad_norm": 4.799598217010498, + "learning_rate": 9.42962861449911e-06, + "loss": 0.4132, + "step": 4913 + }, + { + "epoch": 0.0666576234400434, + "grad_norm": 5.436349868774414, + "learning_rate": 9.429491571878855e-06, + "loss": 0.3748, + "step": 4914 + }, + { + "epoch": 0.0666711882799783, + "grad_norm": 10.706592559814453, + "learning_rate": 9.4293545292586e-06, + "loss": 0.6593, + "step": 4915 + }, + { + "epoch": 0.06668475311991319, + "grad_norm": 5.608344554901123, + "learning_rate": 9.429217486638346e-06, + "loss": 0.3833, + "step": 4916 + }, + { + "epoch": 0.06669831795984807, + "grad_norm": 3.8971006870269775, + "learning_rate": 9.42908044401809e-06, + "loss": 0.27, + "step": 4917 + }, + { + "epoch": 0.06671188279978296, + "grad_norm": 5.245388984680176, + "learning_rate": 9.428943401397836e-06, + "loss": 0.2754, + "step": 4918 + }, + { + "epoch": 0.06672544763971786, + "grad_norm": 7.9463791847229, + "learning_rate": 9.42880635877758e-06, + "loss": 0.276, + "step": 4919 + }, + { + "epoch": 0.06673901247965273, + "grad_norm": 5.941864490509033, + "learning_rate": 9.428669316157326e-06, + "loss": 0.4042, + "step": 4920 + }, + { + "epoch": 0.06675257731958763, + "grad_norm": 6.566913604736328, + "learning_rate": 9.428532273537072e-06, + "loss": 0.3926, + "step": 4921 + }, + { + "epoch": 0.06676614215952252, + "grad_norm": 6.591503620147705, + "learning_rate": 9.428395230916817e-06, + "loss": 0.5675, + "step": 4922 + }, + { + "epoch": 0.0667797069994574, + "grad_norm": 4.995874404907227, + "learning_rate": 9.42825818829656e-06, + "loss": 0.346, + "step": 4923 + }, + { + "epoch": 0.0667932718393923, + "grad_norm": 9.64616584777832, + "learning_rate": 9.428121145676307e-06, + "loss": 0.5001, + "step": 4924 + }, + { + "epoch": 0.06680683667932719, + "grad_norm": 6.752375602722168, + "learning_rate": 9.427984103056052e-06, + "loss": 0.3467, + "step": 4925 + }, + { + "epoch": 0.06682040151926208, + "grad_norm": 6.289321422576904, + "learning_rate": 9.427847060435796e-06, + "loss": 0.3841, + "step": 4926 + }, + { + "epoch": 0.06683396635919696, + "grad_norm": 4.8542280197143555, + "learning_rate": 9.427710017815541e-06, + "loss": 0.3548, + "step": 4927 + }, + { + "epoch": 0.06684753119913185, + "grad_norm": 5.764210224151611, + "learning_rate": 9.427572975195286e-06, + "loss": 0.459, + "step": 4928 + }, + { + "epoch": 0.06686109603906674, + "grad_norm": 9.977548599243164, + "learning_rate": 9.427435932575031e-06, + "loss": 0.6145, + "step": 4929 + }, + { + "epoch": 0.06687466087900162, + "grad_norm": 7.611691951751709, + "learning_rate": 9.427298889954776e-06, + "loss": 0.4636, + "step": 4930 + }, + { + "epoch": 0.06688822571893652, + "grad_norm": 7.71516227722168, + "learning_rate": 9.427161847334522e-06, + "loss": 0.4561, + "step": 4931 + }, + { + "epoch": 0.06690179055887141, + "grad_norm": 8.868772506713867, + "learning_rate": 9.427024804714267e-06, + "loss": 0.6501, + "step": 4932 + }, + { + "epoch": 0.06691535539880629, + "grad_norm": 6.279958724975586, + "learning_rate": 9.426887762094012e-06, + "loss": 0.3491, + "step": 4933 + }, + { + "epoch": 0.06692892023874118, + "grad_norm": 5.125179290771484, + "learning_rate": 9.426750719473757e-06, + "loss": 0.496, + "step": 4934 + }, + { + "epoch": 0.06694248507867608, + "grad_norm": 8.76303768157959, + "learning_rate": 9.426613676853502e-06, + "loss": 0.5068, + "step": 4935 + }, + { + "epoch": 0.06695604991861095, + "grad_norm": 8.154548645019531, + "learning_rate": 9.426476634233247e-06, + "loss": 0.5096, + "step": 4936 + }, + { + "epoch": 0.06696961475854585, + "grad_norm": 8.462663650512695, + "learning_rate": 9.426339591612993e-06, + "loss": 0.487, + "step": 4937 + }, + { + "epoch": 0.06698317959848074, + "grad_norm": 6.920506954193115, + "learning_rate": 9.426202548992738e-06, + "loss": 0.4396, + "step": 4938 + }, + { + "epoch": 0.06699674443841562, + "grad_norm": 6.6265549659729, + "learning_rate": 9.426065506372483e-06, + "loss": 0.3223, + "step": 4939 + }, + { + "epoch": 0.06701030927835051, + "grad_norm": 9.767786026000977, + "learning_rate": 9.425928463752228e-06, + "loss": 0.5835, + "step": 4940 + }, + { + "epoch": 0.0670238741182854, + "grad_norm": 6.35804557800293, + "learning_rate": 9.425791421131972e-06, + "loss": 0.3549, + "step": 4941 + }, + { + "epoch": 0.0670374389582203, + "grad_norm": 7.147523403167725, + "learning_rate": 9.425654378511719e-06, + "loss": 0.419, + "step": 4942 + }, + { + "epoch": 0.06705100379815518, + "grad_norm": 5.84549617767334, + "learning_rate": 9.425517335891464e-06, + "loss": 0.3965, + "step": 4943 + }, + { + "epoch": 0.06706456863809007, + "grad_norm": 5.475427150726318, + "learning_rate": 9.425380293271207e-06, + "loss": 0.4004, + "step": 4944 + }, + { + "epoch": 0.06707813347802496, + "grad_norm": 6.930068016052246, + "learning_rate": 9.425243250650952e-06, + "loss": 0.452, + "step": 4945 + }, + { + "epoch": 0.06709169831795984, + "grad_norm": 6.776133060455322, + "learning_rate": 9.4251062080307e-06, + "loss": 0.3954, + "step": 4946 + }, + { + "epoch": 0.06710526315789474, + "grad_norm": 5.339784622192383, + "learning_rate": 9.424969165410443e-06, + "loss": 0.3281, + "step": 4947 + }, + { + "epoch": 0.06711882799782963, + "grad_norm": 7.366971969604492, + "learning_rate": 9.424832122790188e-06, + "loss": 0.3948, + "step": 4948 + }, + { + "epoch": 0.06713239283776451, + "grad_norm": 7.1565375328063965, + "learning_rate": 9.424695080169933e-06, + "loss": 0.4643, + "step": 4949 + }, + { + "epoch": 0.0671459576776994, + "grad_norm": 8.737405776977539, + "learning_rate": 9.42455803754968e-06, + "loss": 0.5358, + "step": 4950 + }, + { + "epoch": 0.0671595225176343, + "grad_norm": 6.143312931060791, + "learning_rate": 9.424420994929423e-06, + "loss": 0.5365, + "step": 4951 + }, + { + "epoch": 0.06717308735756918, + "grad_norm": 8.270867347717285, + "learning_rate": 9.424283952309169e-06, + "loss": 0.4848, + "step": 4952 + }, + { + "epoch": 0.06718665219750407, + "grad_norm": 8.47242259979248, + "learning_rate": 9.424146909688914e-06, + "loss": 0.579, + "step": 4953 + }, + { + "epoch": 0.06720021703743896, + "grad_norm": 5.976377010345459, + "learning_rate": 9.424009867068659e-06, + "loss": 0.3659, + "step": 4954 + }, + { + "epoch": 0.06721378187737384, + "grad_norm": 6.550477504730225, + "learning_rate": 9.423872824448404e-06, + "loss": 0.3128, + "step": 4955 + }, + { + "epoch": 0.06722734671730873, + "grad_norm": 8.698010444641113, + "learning_rate": 9.42373578182815e-06, + "loss": 0.4957, + "step": 4956 + }, + { + "epoch": 0.06724091155724363, + "grad_norm": 8.071186065673828, + "learning_rate": 9.423598739207895e-06, + "loss": 0.3966, + "step": 4957 + }, + { + "epoch": 0.06725447639717852, + "grad_norm": 7.382232666015625, + "learning_rate": 9.42346169658764e-06, + "loss": 0.4303, + "step": 4958 + }, + { + "epoch": 0.0672680412371134, + "grad_norm": 7.51535701751709, + "learning_rate": 9.423324653967385e-06, + "loss": 0.5529, + "step": 4959 + }, + { + "epoch": 0.06728160607704829, + "grad_norm": 6.58992862701416, + "learning_rate": 9.42318761134713e-06, + "loss": 0.3999, + "step": 4960 + }, + { + "epoch": 0.06729517091698319, + "grad_norm": 9.268430709838867, + "learning_rate": 9.423050568726875e-06, + "loss": 0.5334, + "step": 4961 + }, + { + "epoch": 0.06730873575691806, + "grad_norm": 7.629427909851074, + "learning_rate": 9.422913526106619e-06, + "loss": 0.5028, + "step": 4962 + }, + { + "epoch": 0.06732230059685296, + "grad_norm": 8.662489891052246, + "learning_rate": 9.422776483486366e-06, + "loss": 0.4264, + "step": 4963 + }, + { + "epoch": 0.06733586543678785, + "grad_norm": 9.464815139770508, + "learning_rate": 9.42263944086611e-06, + "loss": 0.6314, + "step": 4964 + }, + { + "epoch": 0.06734943027672273, + "grad_norm": 7.697113513946533, + "learning_rate": 9.422502398245856e-06, + "loss": 0.4013, + "step": 4965 + }, + { + "epoch": 0.06736299511665762, + "grad_norm": 6.986160755157471, + "learning_rate": 9.4223653556256e-06, + "loss": 0.3854, + "step": 4966 + }, + { + "epoch": 0.06737655995659252, + "grad_norm": 9.744577407836914, + "learning_rate": 9.422228313005345e-06, + "loss": 0.6543, + "step": 4967 + }, + { + "epoch": 0.0673901247965274, + "grad_norm": 5.312530040740967, + "learning_rate": 9.422091270385092e-06, + "loss": 0.3182, + "step": 4968 + }, + { + "epoch": 0.06740368963646229, + "grad_norm": 5.782984256744385, + "learning_rate": 9.421954227764835e-06, + "loss": 0.3181, + "step": 4969 + }, + { + "epoch": 0.06741725447639718, + "grad_norm": 7.159239768981934, + "learning_rate": 9.42181718514458e-06, + "loss": 0.5223, + "step": 4970 + }, + { + "epoch": 0.06743081931633206, + "grad_norm": 7.7484002113342285, + "learning_rate": 9.421680142524325e-06, + "loss": 0.5702, + "step": 4971 + }, + { + "epoch": 0.06744438415626695, + "grad_norm": 8.703696250915527, + "learning_rate": 9.42154309990407e-06, + "loss": 0.4756, + "step": 4972 + }, + { + "epoch": 0.06745794899620185, + "grad_norm": 9.471614837646484, + "learning_rate": 9.421406057283816e-06, + "loss": 0.5448, + "step": 4973 + }, + { + "epoch": 0.06747151383613674, + "grad_norm": 8.256734848022461, + "learning_rate": 9.421269014663561e-06, + "loss": 0.506, + "step": 4974 + }, + { + "epoch": 0.06748507867607162, + "grad_norm": 7.828314781188965, + "learning_rate": 9.421131972043306e-06, + "loss": 0.4023, + "step": 4975 + }, + { + "epoch": 0.06749864351600651, + "grad_norm": 5.341451168060303, + "learning_rate": 9.420994929423051e-06, + "loss": 0.3189, + "step": 4976 + }, + { + "epoch": 0.0675122083559414, + "grad_norm": 11.51528549194336, + "learning_rate": 9.420857886802796e-06, + "loss": 0.769, + "step": 4977 + }, + { + "epoch": 0.06752577319587628, + "grad_norm": 8.201934814453125, + "learning_rate": 9.420720844182542e-06, + "loss": 0.628, + "step": 4978 + }, + { + "epoch": 0.06753933803581118, + "grad_norm": 10.997139930725098, + "learning_rate": 9.420583801562287e-06, + "loss": 0.7566, + "step": 4979 + }, + { + "epoch": 0.06755290287574607, + "grad_norm": 10.251500129699707, + "learning_rate": 9.420446758942032e-06, + "loss": 0.6783, + "step": 4980 + }, + { + "epoch": 0.06756646771568095, + "grad_norm": 7.284598350524902, + "learning_rate": 9.420309716321777e-06, + "loss": 0.4425, + "step": 4981 + }, + { + "epoch": 0.06758003255561584, + "grad_norm": 9.881160736083984, + "learning_rate": 9.420172673701522e-06, + "loss": 0.7197, + "step": 4982 + }, + { + "epoch": 0.06759359739555074, + "grad_norm": 8.86194896697998, + "learning_rate": 9.420035631081268e-06, + "loss": 0.6168, + "step": 4983 + }, + { + "epoch": 0.06760716223548562, + "grad_norm": 4.710881233215332, + "learning_rate": 9.419898588461011e-06, + "loss": 0.3946, + "step": 4984 + }, + { + "epoch": 0.06762072707542051, + "grad_norm": 8.574663162231445, + "learning_rate": 9.419761545840758e-06, + "loss": 0.4273, + "step": 4985 + }, + { + "epoch": 0.0676342919153554, + "grad_norm": 7.641943454742432, + "learning_rate": 9.419624503220503e-06, + "loss": 0.5422, + "step": 4986 + }, + { + "epoch": 0.06764785675529028, + "grad_norm": 5.894495487213135, + "learning_rate": 9.419487460600247e-06, + "loss": 0.4517, + "step": 4987 + }, + { + "epoch": 0.06766142159522517, + "grad_norm": 8.351489067077637, + "learning_rate": 9.419350417979992e-06, + "loss": 0.4357, + "step": 4988 + }, + { + "epoch": 0.06767498643516007, + "grad_norm": 6.786898612976074, + "learning_rate": 9.419213375359739e-06, + "loss": 0.463, + "step": 4989 + }, + { + "epoch": 0.06768855127509496, + "grad_norm": 7.30463981628418, + "learning_rate": 9.419076332739484e-06, + "loss": 0.3529, + "step": 4990 + }, + { + "epoch": 0.06770211611502984, + "grad_norm": 12.051342964172363, + "learning_rate": 9.418939290119227e-06, + "loss": 0.582, + "step": 4991 + }, + { + "epoch": 0.06771568095496473, + "grad_norm": 5.823024749755859, + "learning_rate": 9.418802247498972e-06, + "loss": 0.4335, + "step": 4992 + }, + { + "epoch": 0.06772924579489963, + "grad_norm": 6.894001483917236, + "learning_rate": 9.41866520487872e-06, + "loss": 0.4983, + "step": 4993 + }, + { + "epoch": 0.0677428106348345, + "grad_norm": 5.969340801239014, + "learning_rate": 9.418528162258463e-06, + "loss": 0.3779, + "step": 4994 + }, + { + "epoch": 0.0677563754747694, + "grad_norm": 8.557317733764648, + "learning_rate": 9.418391119638208e-06, + "loss": 0.4169, + "step": 4995 + }, + { + "epoch": 0.06776994031470429, + "grad_norm": 10.391790390014648, + "learning_rate": 9.418254077017953e-06, + "loss": 0.514, + "step": 4996 + }, + { + "epoch": 0.06778350515463917, + "grad_norm": 9.368481636047363, + "learning_rate": 9.418117034397698e-06, + "loss": 0.6133, + "step": 4997 + }, + { + "epoch": 0.06779706999457406, + "grad_norm": 8.122113227844238, + "learning_rate": 9.417979991777443e-06, + "loss": 0.4726, + "step": 4998 + }, + { + "epoch": 0.06781063483450896, + "grad_norm": 9.932873725891113, + "learning_rate": 9.417842949157189e-06, + "loss": 0.4459, + "step": 4999 + }, + { + "epoch": 0.06782419967444384, + "grad_norm": 9.063887596130371, + "learning_rate": 9.417705906536934e-06, + "loss": 0.5304, + "step": 5000 + }, + { + "epoch": 0.06783776451437873, + "grad_norm": 7.094681262969971, + "learning_rate": 9.417568863916679e-06, + "loss": 0.6142, + "step": 5001 + }, + { + "epoch": 0.06785132935431362, + "grad_norm": 6.463903903961182, + "learning_rate": 9.417431821296424e-06, + "loss": 0.4272, + "step": 5002 + }, + { + "epoch": 0.0678648941942485, + "grad_norm": 6.643352031707764, + "learning_rate": 9.41729477867617e-06, + "loss": 0.5139, + "step": 5003 + }, + { + "epoch": 0.0678784590341834, + "grad_norm": 7.88246488571167, + "learning_rate": 9.417157736055915e-06, + "loss": 0.6767, + "step": 5004 + }, + { + "epoch": 0.06789202387411829, + "grad_norm": 6.121093273162842, + "learning_rate": 9.41702069343566e-06, + "loss": 0.3709, + "step": 5005 + }, + { + "epoch": 0.06790558871405318, + "grad_norm": 5.578795433044434, + "learning_rate": 9.416883650815405e-06, + "loss": 0.35, + "step": 5006 + }, + { + "epoch": 0.06791915355398806, + "grad_norm": 7.232629299163818, + "learning_rate": 9.41674660819515e-06, + "loss": 0.3598, + "step": 5007 + }, + { + "epoch": 0.06793271839392295, + "grad_norm": 5.526748180389404, + "learning_rate": 9.416609565574895e-06, + "loss": 0.2788, + "step": 5008 + }, + { + "epoch": 0.06794628323385785, + "grad_norm": 9.73589038848877, + "learning_rate": 9.416472522954639e-06, + "loss": 0.5858, + "step": 5009 + }, + { + "epoch": 0.06795984807379273, + "grad_norm": 6.359070777893066, + "learning_rate": 9.416335480334384e-06, + "loss": 0.4101, + "step": 5010 + }, + { + "epoch": 0.06797341291372762, + "grad_norm": 9.844298362731934, + "learning_rate": 9.41619843771413e-06, + "loss": 0.5371, + "step": 5011 + }, + { + "epoch": 0.06798697775366251, + "grad_norm": 6.679139614105225, + "learning_rate": 9.416061395093874e-06, + "loss": 0.3883, + "step": 5012 + }, + { + "epoch": 0.06800054259359739, + "grad_norm": 7.699390411376953, + "learning_rate": 9.41592435247362e-06, + "loss": 0.3523, + "step": 5013 + }, + { + "epoch": 0.06801410743353228, + "grad_norm": 6.380679607391357, + "learning_rate": 9.415787309853365e-06, + "loss": 0.4075, + "step": 5014 + }, + { + "epoch": 0.06802767227346718, + "grad_norm": 6.638383865356445, + "learning_rate": 9.415650267233112e-06, + "loss": 0.2731, + "step": 5015 + }, + { + "epoch": 0.06804123711340206, + "grad_norm": 8.535331726074219, + "learning_rate": 9.415513224612855e-06, + "loss": 0.54, + "step": 5016 + }, + { + "epoch": 0.06805480195333695, + "grad_norm": 6.8147783279418945, + "learning_rate": 9.4153761819926e-06, + "loss": 0.412, + "step": 5017 + }, + { + "epoch": 0.06806836679327184, + "grad_norm": 10.138755798339844, + "learning_rate": 9.415239139372345e-06, + "loss": 0.667, + "step": 5018 + }, + { + "epoch": 0.06808193163320672, + "grad_norm": 9.323145866394043, + "learning_rate": 9.41510209675209e-06, + "loss": 0.403, + "step": 5019 + }, + { + "epoch": 0.06809549647314161, + "grad_norm": 9.060757637023926, + "learning_rate": 9.414965054131836e-06, + "loss": 0.4802, + "step": 5020 + }, + { + "epoch": 0.06810906131307651, + "grad_norm": 10.200079917907715, + "learning_rate": 9.414828011511581e-06, + "loss": 0.5761, + "step": 5021 + }, + { + "epoch": 0.0681226261530114, + "grad_norm": 7.227720260620117, + "learning_rate": 9.414690968891326e-06, + "loss": 0.4729, + "step": 5022 + }, + { + "epoch": 0.06813619099294628, + "grad_norm": 7.446687698364258, + "learning_rate": 9.414553926271071e-06, + "loss": 0.4184, + "step": 5023 + }, + { + "epoch": 0.06814975583288117, + "grad_norm": 6.290007591247559, + "learning_rate": 9.414416883650816e-06, + "loss": 0.4019, + "step": 5024 + }, + { + "epoch": 0.06816332067281607, + "grad_norm": 8.095898628234863, + "learning_rate": 9.414279841030562e-06, + "loss": 0.5395, + "step": 5025 + }, + { + "epoch": 0.06817688551275095, + "grad_norm": 7.600440502166748, + "learning_rate": 9.414142798410307e-06, + "loss": 0.4258, + "step": 5026 + }, + { + "epoch": 0.06819045035268584, + "grad_norm": 5.6747355461120605, + "learning_rate": 9.41400575579005e-06, + "loss": 0.2181, + "step": 5027 + }, + { + "epoch": 0.06820401519262073, + "grad_norm": 6.703325271606445, + "learning_rate": 9.413868713169797e-06, + "loss": 0.3508, + "step": 5028 + }, + { + "epoch": 0.06821758003255561, + "grad_norm": 10.230548858642578, + "learning_rate": 9.413731670549542e-06, + "loss": 0.7047, + "step": 5029 + }, + { + "epoch": 0.0682311448724905, + "grad_norm": 8.716702461242676, + "learning_rate": 9.413594627929288e-06, + "loss": 0.4245, + "step": 5030 + }, + { + "epoch": 0.0682447097124254, + "grad_norm": 7.822292327880859, + "learning_rate": 9.413457585309031e-06, + "loss": 0.4581, + "step": 5031 + }, + { + "epoch": 0.06825827455236028, + "grad_norm": 7.296429634094238, + "learning_rate": 9.413320542688778e-06, + "loss": 0.3666, + "step": 5032 + }, + { + "epoch": 0.06827183939229517, + "grad_norm": 11.149980545043945, + "learning_rate": 9.413183500068523e-06, + "loss": 0.6431, + "step": 5033 + }, + { + "epoch": 0.06828540423223006, + "grad_norm": 7.966527938842773, + "learning_rate": 9.413046457448267e-06, + "loss": 0.4807, + "step": 5034 + }, + { + "epoch": 0.06829896907216494, + "grad_norm": 9.72920036315918, + "learning_rate": 9.412909414828012e-06, + "loss": 0.5539, + "step": 5035 + }, + { + "epoch": 0.06831253391209983, + "grad_norm": 8.266480445861816, + "learning_rate": 9.412772372207757e-06, + "loss": 0.6183, + "step": 5036 + }, + { + "epoch": 0.06832609875203473, + "grad_norm": 6.185795307159424, + "learning_rate": 9.412635329587502e-06, + "loss": 0.4851, + "step": 5037 + }, + { + "epoch": 0.06833966359196962, + "grad_norm": 8.950263023376465, + "learning_rate": 9.412498286967247e-06, + "loss": 0.5685, + "step": 5038 + }, + { + "epoch": 0.0683532284319045, + "grad_norm": 8.620671272277832, + "learning_rate": 9.412361244346992e-06, + "loss": 0.6541, + "step": 5039 + }, + { + "epoch": 0.0683667932718394, + "grad_norm": 8.568148612976074, + "learning_rate": 9.412224201726738e-06, + "loss": 0.4882, + "step": 5040 + }, + { + "epoch": 0.06838035811177429, + "grad_norm": 9.855090141296387, + "learning_rate": 9.412087159106483e-06, + "loss": 0.4948, + "step": 5041 + }, + { + "epoch": 0.06839392295170917, + "grad_norm": 6.893803119659424, + "learning_rate": 9.411950116486228e-06, + "loss": 0.4697, + "step": 5042 + }, + { + "epoch": 0.06840748779164406, + "grad_norm": 10.63759994506836, + "learning_rate": 9.411813073865973e-06, + "loss": 0.7493, + "step": 5043 + }, + { + "epoch": 0.06842105263157895, + "grad_norm": 7.328856945037842, + "learning_rate": 9.411676031245718e-06, + "loss": 0.4835, + "step": 5044 + }, + { + "epoch": 0.06843461747151383, + "grad_norm": 8.985770225524902, + "learning_rate": 9.411538988625464e-06, + "loss": 0.4683, + "step": 5045 + }, + { + "epoch": 0.06844818231144872, + "grad_norm": 7.5191121101379395, + "learning_rate": 9.411401946005209e-06, + "loss": 0.5416, + "step": 5046 + }, + { + "epoch": 0.06846174715138362, + "grad_norm": 8.515933990478516, + "learning_rate": 9.411264903384954e-06, + "loss": 0.5944, + "step": 5047 + }, + { + "epoch": 0.0684753119913185, + "grad_norm": 9.054481506347656, + "learning_rate": 9.411127860764699e-06, + "loss": 0.6365, + "step": 5048 + }, + { + "epoch": 0.06848887683125339, + "grad_norm": 10.22449016571045, + "learning_rate": 9.410990818144443e-06, + "loss": 0.6094, + "step": 5049 + }, + { + "epoch": 0.06850244167118828, + "grad_norm": 8.933062553405762, + "learning_rate": 9.41085377552419e-06, + "loss": 0.4908, + "step": 5050 + }, + { + "epoch": 0.06851600651112316, + "grad_norm": 10.458733558654785, + "learning_rate": 9.410716732903935e-06, + "loss": 0.6772, + "step": 5051 + }, + { + "epoch": 0.06852957135105806, + "grad_norm": 8.375419616699219, + "learning_rate": 9.410579690283678e-06, + "loss": 0.5184, + "step": 5052 + }, + { + "epoch": 0.06854313619099295, + "grad_norm": 10.835395812988281, + "learning_rate": 9.410442647663423e-06, + "loss": 0.6069, + "step": 5053 + }, + { + "epoch": 0.06855670103092784, + "grad_norm": 7.7150492668151855, + "learning_rate": 9.41030560504317e-06, + "loss": 0.556, + "step": 5054 + }, + { + "epoch": 0.06857026587086272, + "grad_norm": 6.771885871887207, + "learning_rate": 9.410168562422914e-06, + "loss": 0.4191, + "step": 5055 + }, + { + "epoch": 0.06858383071079761, + "grad_norm": 7.518861770629883, + "learning_rate": 9.410031519802659e-06, + "loss": 0.5255, + "step": 5056 + }, + { + "epoch": 0.0685973955507325, + "grad_norm": 8.11260986328125, + "learning_rate": 9.409894477182404e-06, + "loss": 0.5483, + "step": 5057 + }, + { + "epoch": 0.06861096039066739, + "grad_norm": 7.214895248413086, + "learning_rate": 9.40975743456215e-06, + "loss": 0.4182, + "step": 5058 + }, + { + "epoch": 0.06862452523060228, + "grad_norm": 6.130888938903809, + "learning_rate": 9.409620391941894e-06, + "loss": 0.3748, + "step": 5059 + }, + { + "epoch": 0.06863809007053717, + "grad_norm": 8.563584327697754, + "learning_rate": 9.40948334932164e-06, + "loss": 0.5763, + "step": 5060 + }, + { + "epoch": 0.06865165491047205, + "grad_norm": 7.360068321228027, + "learning_rate": 9.409346306701385e-06, + "loss": 0.4332, + "step": 5061 + }, + { + "epoch": 0.06866521975040694, + "grad_norm": 7.93214750289917, + "learning_rate": 9.40920926408113e-06, + "loss": 0.3729, + "step": 5062 + }, + { + "epoch": 0.06867878459034184, + "grad_norm": 7.390862464904785, + "learning_rate": 9.409072221460875e-06, + "loss": 0.373, + "step": 5063 + }, + { + "epoch": 0.06869234943027672, + "grad_norm": 7.377297401428223, + "learning_rate": 9.40893517884062e-06, + "loss": 0.5678, + "step": 5064 + }, + { + "epoch": 0.06870591427021161, + "grad_norm": 8.29587173461914, + "learning_rate": 9.408798136220365e-06, + "loss": 0.6239, + "step": 5065 + }, + { + "epoch": 0.0687194791101465, + "grad_norm": 8.606420516967773, + "learning_rate": 9.40866109360011e-06, + "loss": 0.7011, + "step": 5066 + }, + { + "epoch": 0.06873304395008138, + "grad_norm": 7.273592948913574, + "learning_rate": 9.408524050979856e-06, + "loss": 0.4323, + "step": 5067 + }, + { + "epoch": 0.06874660879001628, + "grad_norm": 8.389978408813477, + "learning_rate": 9.408387008359601e-06, + "loss": 0.5183, + "step": 5068 + }, + { + "epoch": 0.06876017362995117, + "grad_norm": 11.509500503540039, + "learning_rate": 9.408249965739346e-06, + "loss": 0.7078, + "step": 5069 + }, + { + "epoch": 0.06877373846988606, + "grad_norm": 8.271659851074219, + "learning_rate": 9.40811292311909e-06, + "loss": 0.6416, + "step": 5070 + }, + { + "epoch": 0.06878730330982094, + "grad_norm": 9.533174514770508, + "learning_rate": 9.407975880498836e-06, + "loss": 0.5527, + "step": 5071 + }, + { + "epoch": 0.06880086814975583, + "grad_norm": 6.675490856170654, + "learning_rate": 9.407838837878582e-06, + "loss": 0.446, + "step": 5072 + }, + { + "epoch": 0.06881443298969073, + "grad_norm": 7.60020637512207, + "learning_rate": 9.407701795258327e-06, + "loss": 0.5201, + "step": 5073 + }, + { + "epoch": 0.0688279978296256, + "grad_norm": 6.385376453399658, + "learning_rate": 9.40756475263807e-06, + "loss": 0.405, + "step": 5074 + }, + { + "epoch": 0.0688415626695605, + "grad_norm": 7.193553447723389, + "learning_rate": 9.407427710017817e-06, + "loss": 0.4284, + "step": 5075 + }, + { + "epoch": 0.06885512750949539, + "grad_norm": 8.537270545959473, + "learning_rate": 9.407290667397562e-06, + "loss": 0.4469, + "step": 5076 + }, + { + "epoch": 0.06886869234943027, + "grad_norm": 7.109686374664307, + "learning_rate": 9.407153624777306e-06, + "loss": 0.5171, + "step": 5077 + }, + { + "epoch": 0.06888225718936516, + "grad_norm": 5.4427337646484375, + "learning_rate": 9.407016582157051e-06, + "loss": 0.4479, + "step": 5078 + }, + { + "epoch": 0.06889582202930006, + "grad_norm": 11.03667163848877, + "learning_rate": 9.406879539536796e-06, + "loss": 0.491, + "step": 5079 + }, + { + "epoch": 0.06890938686923494, + "grad_norm": 8.841215133666992, + "learning_rate": 9.406742496916541e-06, + "loss": 0.5196, + "step": 5080 + }, + { + "epoch": 0.06892295170916983, + "grad_norm": 8.019497871398926, + "learning_rate": 9.406605454296287e-06, + "loss": 0.4231, + "step": 5081 + }, + { + "epoch": 0.06893651654910472, + "grad_norm": 7.974830627441406, + "learning_rate": 9.406468411676032e-06, + "loss": 0.5112, + "step": 5082 + }, + { + "epoch": 0.0689500813890396, + "grad_norm": 8.05400276184082, + "learning_rate": 9.406331369055777e-06, + "loss": 0.4375, + "step": 5083 + }, + { + "epoch": 0.0689636462289745, + "grad_norm": 6.917042255401611, + "learning_rate": 9.406194326435522e-06, + "loss": 0.4733, + "step": 5084 + }, + { + "epoch": 0.06897721106890939, + "grad_norm": 7.152988910675049, + "learning_rate": 9.406057283815267e-06, + "loss": 0.41, + "step": 5085 + }, + { + "epoch": 0.06899077590884428, + "grad_norm": 7.602834701538086, + "learning_rate": 9.405920241195012e-06, + "loss": 0.4497, + "step": 5086 + }, + { + "epoch": 0.06900434074877916, + "grad_norm": 7.95254373550415, + "learning_rate": 9.405783198574758e-06, + "loss": 0.4025, + "step": 5087 + }, + { + "epoch": 0.06901790558871405, + "grad_norm": 8.849531173706055, + "learning_rate": 9.405646155954503e-06, + "loss": 0.4981, + "step": 5088 + }, + { + "epoch": 0.06903147042864895, + "grad_norm": 9.972321510314941, + "learning_rate": 9.405509113334248e-06, + "loss": 0.5178, + "step": 5089 + }, + { + "epoch": 0.06904503526858383, + "grad_norm": 6.3245697021484375, + "learning_rate": 9.405372070713993e-06, + "loss": 0.3287, + "step": 5090 + }, + { + "epoch": 0.06905860010851872, + "grad_norm": 8.835018157958984, + "learning_rate": 9.405235028093738e-06, + "loss": 0.546, + "step": 5091 + }, + { + "epoch": 0.06907216494845361, + "grad_norm": 8.821615219116211, + "learning_rate": 9.405097985473482e-06, + "loss": 0.4812, + "step": 5092 + }, + { + "epoch": 0.06908572978838849, + "grad_norm": 8.464183807373047, + "learning_rate": 9.404960942853229e-06, + "loss": 0.4705, + "step": 5093 + }, + { + "epoch": 0.06909929462832338, + "grad_norm": 9.141314506530762, + "learning_rate": 9.404823900232974e-06, + "loss": 0.5972, + "step": 5094 + }, + { + "epoch": 0.06911285946825828, + "grad_norm": 7.414323329925537, + "learning_rate": 9.404686857612717e-06, + "loss": 0.3369, + "step": 5095 + }, + { + "epoch": 0.06912642430819316, + "grad_norm": 9.834813117980957, + "learning_rate": 9.404549814992463e-06, + "loss": 0.6474, + "step": 5096 + }, + { + "epoch": 0.06913998914812805, + "grad_norm": 5.757814407348633, + "learning_rate": 9.40441277237221e-06, + "loss": 0.2272, + "step": 5097 + }, + { + "epoch": 0.06915355398806294, + "grad_norm": 9.126583099365234, + "learning_rate": 9.404275729751955e-06, + "loss": 0.5902, + "step": 5098 + }, + { + "epoch": 0.06916711882799782, + "grad_norm": 7.914758682250977, + "learning_rate": 9.404138687131698e-06, + "loss": 0.5755, + "step": 5099 + }, + { + "epoch": 0.06918068366793272, + "grad_norm": 8.502999305725098, + "learning_rate": 9.404001644511443e-06, + "loss": 0.4324, + "step": 5100 + }, + { + "epoch": 0.06919424850786761, + "grad_norm": 6.514773845672607, + "learning_rate": 9.40386460189119e-06, + "loss": 0.4698, + "step": 5101 + }, + { + "epoch": 0.0692078133478025, + "grad_norm": 8.639098167419434, + "learning_rate": 9.403727559270934e-06, + "loss": 0.4559, + "step": 5102 + }, + { + "epoch": 0.06922137818773738, + "grad_norm": 8.98137092590332, + "learning_rate": 9.403590516650679e-06, + "loss": 0.6087, + "step": 5103 + }, + { + "epoch": 0.06923494302767227, + "grad_norm": 8.563377380371094, + "learning_rate": 9.403453474030424e-06, + "loss": 0.391, + "step": 5104 + }, + { + "epoch": 0.06924850786760717, + "grad_norm": 6.598205089569092, + "learning_rate": 9.40331643141017e-06, + "loss": 0.2986, + "step": 5105 + }, + { + "epoch": 0.06926207270754205, + "grad_norm": 7.011862277984619, + "learning_rate": 9.403179388789914e-06, + "loss": 0.4723, + "step": 5106 + }, + { + "epoch": 0.06927563754747694, + "grad_norm": 7.4535908699035645, + "learning_rate": 9.40304234616966e-06, + "loss": 0.477, + "step": 5107 + }, + { + "epoch": 0.06928920238741183, + "grad_norm": 7.099415302276611, + "learning_rate": 9.402905303549405e-06, + "loss": 0.4368, + "step": 5108 + }, + { + "epoch": 0.06930276722734671, + "grad_norm": 6.611301422119141, + "learning_rate": 9.40276826092915e-06, + "loss": 0.3861, + "step": 5109 + }, + { + "epoch": 0.0693163320672816, + "grad_norm": 9.021076202392578, + "learning_rate": 9.402631218308895e-06, + "loss": 0.6379, + "step": 5110 + }, + { + "epoch": 0.0693298969072165, + "grad_norm": 8.319586753845215, + "learning_rate": 9.40249417568864e-06, + "loss": 0.3924, + "step": 5111 + }, + { + "epoch": 0.06934346174715138, + "grad_norm": 8.36997127532959, + "learning_rate": 9.402357133068385e-06, + "loss": 0.5681, + "step": 5112 + }, + { + "epoch": 0.06935702658708627, + "grad_norm": 8.029764175415039, + "learning_rate": 9.40222009044813e-06, + "loss": 0.3658, + "step": 5113 + }, + { + "epoch": 0.06937059142702116, + "grad_norm": 8.845941543579102, + "learning_rate": 9.402083047827876e-06, + "loss": 0.4214, + "step": 5114 + }, + { + "epoch": 0.06938415626695604, + "grad_norm": 4.856622219085693, + "learning_rate": 9.401946005207621e-06, + "loss": 0.4072, + "step": 5115 + }, + { + "epoch": 0.06939772110689094, + "grad_norm": 6.86339807510376, + "learning_rate": 9.401808962587366e-06, + "loss": 0.4897, + "step": 5116 + }, + { + "epoch": 0.06941128594682583, + "grad_norm": 6.278512001037598, + "learning_rate": 9.40167191996711e-06, + "loss": 0.3312, + "step": 5117 + }, + { + "epoch": 0.06942485078676072, + "grad_norm": 7.302290439605713, + "learning_rate": 9.401534877346855e-06, + "loss": 0.4026, + "step": 5118 + }, + { + "epoch": 0.0694384156266956, + "grad_norm": 7.433185577392578, + "learning_rate": 9.401397834726602e-06, + "loss": 0.5592, + "step": 5119 + }, + { + "epoch": 0.0694519804666305, + "grad_norm": 6.701449871063232, + "learning_rate": 9.401260792106345e-06, + "loss": 0.4742, + "step": 5120 + }, + { + "epoch": 0.06946554530656539, + "grad_norm": 7.696980953216553, + "learning_rate": 9.40112374948609e-06, + "loss": 0.4658, + "step": 5121 + }, + { + "epoch": 0.06947911014650027, + "grad_norm": 5.8968987464904785, + "learning_rate": 9.400986706865836e-06, + "loss": 0.3282, + "step": 5122 + }, + { + "epoch": 0.06949267498643516, + "grad_norm": 6.267894744873047, + "learning_rate": 9.400849664245582e-06, + "loss": 0.4685, + "step": 5123 + }, + { + "epoch": 0.06950623982637005, + "grad_norm": 6.821516513824463, + "learning_rate": 9.400712621625326e-06, + "loss": 0.3784, + "step": 5124 + }, + { + "epoch": 0.06951980466630493, + "grad_norm": 8.572927474975586, + "learning_rate": 9.400575579005071e-06, + "loss": 0.4916, + "step": 5125 + }, + { + "epoch": 0.06953336950623983, + "grad_norm": 9.67734146118164, + "learning_rate": 9.400438536384816e-06, + "loss": 0.3831, + "step": 5126 + }, + { + "epoch": 0.06954693434617472, + "grad_norm": 5.525504112243652, + "learning_rate": 9.400301493764561e-06, + "loss": 0.4055, + "step": 5127 + }, + { + "epoch": 0.0695604991861096, + "grad_norm": 9.964433670043945, + "learning_rate": 9.400164451144307e-06, + "loss": 0.607, + "step": 5128 + }, + { + "epoch": 0.06957406402604449, + "grad_norm": 6.102547645568848, + "learning_rate": 9.400027408524052e-06, + "loss": 0.3065, + "step": 5129 + }, + { + "epoch": 0.06958762886597938, + "grad_norm": 7.528356075286865, + "learning_rate": 9.399890365903797e-06, + "loss": 0.5209, + "step": 5130 + }, + { + "epoch": 0.06960119370591428, + "grad_norm": 9.126158714294434, + "learning_rate": 9.399753323283542e-06, + "loss": 0.5031, + "step": 5131 + }, + { + "epoch": 0.06961475854584916, + "grad_norm": 7.302217960357666, + "learning_rate": 9.399616280663287e-06, + "loss": 0.4296, + "step": 5132 + }, + { + "epoch": 0.06962832338578405, + "grad_norm": 5.531710624694824, + "learning_rate": 9.399479238043032e-06, + "loss": 0.2839, + "step": 5133 + }, + { + "epoch": 0.06964188822571894, + "grad_norm": 5.714344024658203, + "learning_rate": 9.399342195422778e-06, + "loss": 0.3392, + "step": 5134 + }, + { + "epoch": 0.06965545306565382, + "grad_norm": 7.495631217956543, + "learning_rate": 9.399205152802521e-06, + "loss": 0.4281, + "step": 5135 + }, + { + "epoch": 0.06966901790558871, + "grad_norm": 6.344272613525391, + "learning_rate": 9.399068110182268e-06, + "loss": 0.4277, + "step": 5136 + }, + { + "epoch": 0.06968258274552361, + "grad_norm": 7.967459678649902, + "learning_rate": 9.398931067562013e-06, + "loss": 0.3662, + "step": 5137 + }, + { + "epoch": 0.06969614758545849, + "grad_norm": 6.817914962768555, + "learning_rate": 9.398794024941757e-06, + "loss": 0.3609, + "step": 5138 + }, + { + "epoch": 0.06970971242539338, + "grad_norm": 4.636778831481934, + "learning_rate": 9.398656982321502e-06, + "loss": 0.2218, + "step": 5139 + }, + { + "epoch": 0.06972327726532827, + "grad_norm": 5.903570175170898, + "learning_rate": 9.398519939701249e-06, + "loss": 0.3154, + "step": 5140 + }, + { + "epoch": 0.06973684210526315, + "grad_norm": 8.081594467163086, + "learning_rate": 9.398382897080994e-06, + "loss": 0.5746, + "step": 5141 + }, + { + "epoch": 0.06975040694519805, + "grad_norm": 7.02551794052124, + "learning_rate": 9.398245854460737e-06, + "loss": 0.3461, + "step": 5142 + }, + { + "epoch": 0.06976397178513294, + "grad_norm": 9.577471733093262, + "learning_rate": 9.398108811840483e-06, + "loss": 0.5571, + "step": 5143 + }, + { + "epoch": 0.06977753662506782, + "grad_norm": 5.7707085609436035, + "learning_rate": 9.39797176922023e-06, + "loss": 0.3019, + "step": 5144 + }, + { + "epoch": 0.06979110146500271, + "grad_norm": 6.380885124206543, + "learning_rate": 9.397834726599973e-06, + "loss": 0.3243, + "step": 5145 + }, + { + "epoch": 0.0698046663049376, + "grad_norm": 8.219667434692383, + "learning_rate": 9.397697683979718e-06, + "loss": 0.4681, + "step": 5146 + }, + { + "epoch": 0.0698182311448725, + "grad_norm": 7.742149353027344, + "learning_rate": 9.397560641359463e-06, + "loss": 0.4521, + "step": 5147 + }, + { + "epoch": 0.06983179598480738, + "grad_norm": 7.737330436706543, + "learning_rate": 9.397423598739208e-06, + "loss": 0.4577, + "step": 5148 + }, + { + "epoch": 0.06984536082474227, + "grad_norm": 7.283128261566162, + "learning_rate": 9.397286556118954e-06, + "loss": 0.3793, + "step": 5149 + }, + { + "epoch": 0.06985892566467716, + "grad_norm": 7.497505187988281, + "learning_rate": 9.397149513498699e-06, + "loss": 0.3595, + "step": 5150 + }, + { + "epoch": 0.06987249050461204, + "grad_norm": 6.325166702270508, + "learning_rate": 9.397012470878444e-06, + "loss": 0.3296, + "step": 5151 + }, + { + "epoch": 0.06988605534454694, + "grad_norm": 7.656963348388672, + "learning_rate": 9.39687542825819e-06, + "loss": 0.4112, + "step": 5152 + }, + { + "epoch": 0.06989962018448183, + "grad_norm": 7.1762285232543945, + "learning_rate": 9.396738385637934e-06, + "loss": 0.3323, + "step": 5153 + }, + { + "epoch": 0.06991318502441671, + "grad_norm": 8.731677055358887, + "learning_rate": 9.39660134301768e-06, + "loss": 0.4093, + "step": 5154 + }, + { + "epoch": 0.0699267498643516, + "grad_norm": 8.793913841247559, + "learning_rate": 9.396464300397425e-06, + "loss": 0.4834, + "step": 5155 + }, + { + "epoch": 0.0699403147042865, + "grad_norm": 8.244073867797852, + "learning_rate": 9.39632725777717e-06, + "loss": 0.3265, + "step": 5156 + }, + { + "epoch": 0.06995387954422137, + "grad_norm": 8.269536972045898, + "learning_rate": 9.396190215156915e-06, + "loss": 0.6059, + "step": 5157 + }, + { + "epoch": 0.06996744438415627, + "grad_norm": 4.943982124328613, + "learning_rate": 9.39605317253666e-06, + "loss": 0.3196, + "step": 5158 + }, + { + "epoch": 0.06998100922409116, + "grad_norm": 6.985373497009277, + "learning_rate": 9.395916129916405e-06, + "loss": 0.3207, + "step": 5159 + }, + { + "epoch": 0.06999457406402604, + "grad_norm": 5.450019359588623, + "learning_rate": 9.395779087296149e-06, + "loss": 0.3491, + "step": 5160 + }, + { + "epoch": 0.07000813890396093, + "grad_norm": 5.4914374351501465, + "learning_rate": 9.395642044675894e-06, + "loss": 0.2804, + "step": 5161 + }, + { + "epoch": 0.07002170374389582, + "grad_norm": 5.401994705200195, + "learning_rate": 9.395505002055641e-06, + "loss": 0.2794, + "step": 5162 + }, + { + "epoch": 0.07003526858383072, + "grad_norm": 6.646243095397949, + "learning_rate": 9.395367959435384e-06, + "loss": 0.3773, + "step": 5163 + }, + { + "epoch": 0.0700488334237656, + "grad_norm": 4.721066951751709, + "learning_rate": 9.39523091681513e-06, + "loss": 0.2703, + "step": 5164 + }, + { + "epoch": 0.07006239826370049, + "grad_norm": 6.9393534660339355, + "learning_rate": 9.395093874194875e-06, + "loss": 0.3115, + "step": 5165 + }, + { + "epoch": 0.07007596310363538, + "grad_norm": 5.734604835510254, + "learning_rate": 9.394956831574622e-06, + "loss": 0.3139, + "step": 5166 + }, + { + "epoch": 0.07008952794357026, + "grad_norm": 6.415687561035156, + "learning_rate": 9.394819788954365e-06, + "loss": 0.4212, + "step": 5167 + }, + { + "epoch": 0.07010309278350516, + "grad_norm": 5.528767108917236, + "learning_rate": 9.39468274633411e-06, + "loss": 0.3037, + "step": 5168 + }, + { + "epoch": 0.07011665762344005, + "grad_norm": 5.984506130218506, + "learning_rate": 9.394545703713856e-06, + "loss": 0.4051, + "step": 5169 + }, + { + "epoch": 0.07013022246337493, + "grad_norm": 6.122251033782959, + "learning_rate": 9.3944086610936e-06, + "loss": 0.2947, + "step": 5170 + }, + { + "epoch": 0.07014378730330982, + "grad_norm": 7.0520710945129395, + "learning_rate": 9.394271618473346e-06, + "loss": 0.4042, + "step": 5171 + }, + { + "epoch": 0.07015735214324471, + "grad_norm": 5.972031116485596, + "learning_rate": 9.394134575853091e-06, + "loss": 0.3343, + "step": 5172 + }, + { + "epoch": 0.0701709169831796, + "grad_norm": 6.22879695892334, + "learning_rate": 9.393997533232836e-06, + "loss": 0.3419, + "step": 5173 + }, + { + "epoch": 0.07018448182311449, + "grad_norm": 7.0622029304504395, + "learning_rate": 9.393860490612581e-06, + "loss": 0.3182, + "step": 5174 + }, + { + "epoch": 0.07019804666304938, + "grad_norm": 6.145981788635254, + "learning_rate": 9.393723447992327e-06, + "loss": 0.378, + "step": 5175 + }, + { + "epoch": 0.07021161150298426, + "grad_norm": 4.783372402191162, + "learning_rate": 9.393586405372072e-06, + "loss": 0.2287, + "step": 5176 + }, + { + "epoch": 0.07022517634291915, + "grad_norm": 6.824057579040527, + "learning_rate": 9.393449362751817e-06, + "loss": 0.3302, + "step": 5177 + }, + { + "epoch": 0.07023874118285404, + "grad_norm": 6.43087911605835, + "learning_rate": 9.39331232013156e-06, + "loss": 0.4253, + "step": 5178 + }, + { + "epoch": 0.07025230602278894, + "grad_norm": 6.022433280944824, + "learning_rate": 9.393175277511307e-06, + "loss": 0.4733, + "step": 5179 + }, + { + "epoch": 0.07026587086272382, + "grad_norm": 7.763913154602051, + "learning_rate": 9.393038234891053e-06, + "loss": 0.397, + "step": 5180 + }, + { + "epoch": 0.07027943570265871, + "grad_norm": 6.655165672302246, + "learning_rate": 9.392901192270798e-06, + "loss": 0.3656, + "step": 5181 + }, + { + "epoch": 0.0702930005425936, + "grad_norm": 5.019929885864258, + "learning_rate": 9.392764149650541e-06, + "loss": 0.2936, + "step": 5182 + }, + { + "epoch": 0.07030656538252848, + "grad_norm": 5.902124881744385, + "learning_rate": 9.392627107030288e-06, + "loss": 0.3893, + "step": 5183 + }, + { + "epoch": 0.07032013022246338, + "grad_norm": 5.768206596374512, + "learning_rate": 9.392490064410033e-06, + "loss": 0.3572, + "step": 5184 + }, + { + "epoch": 0.07033369506239827, + "grad_norm": 6.612945556640625, + "learning_rate": 9.392353021789777e-06, + "loss": 0.329, + "step": 5185 + }, + { + "epoch": 0.07034725990233315, + "grad_norm": 6.474693775177002, + "learning_rate": 9.392215979169522e-06, + "loss": 0.2139, + "step": 5186 + }, + { + "epoch": 0.07036082474226804, + "grad_norm": 7.3352274894714355, + "learning_rate": 9.392078936549267e-06, + "loss": 0.3781, + "step": 5187 + }, + { + "epoch": 0.07037438958220293, + "grad_norm": 4.789456367492676, + "learning_rate": 9.391941893929012e-06, + "loss": 0.2598, + "step": 5188 + }, + { + "epoch": 0.07038795442213781, + "grad_norm": 8.843998908996582, + "learning_rate": 9.391804851308757e-06, + "loss": 0.3813, + "step": 5189 + }, + { + "epoch": 0.0704015192620727, + "grad_norm": 7.1607465744018555, + "learning_rate": 9.391667808688503e-06, + "loss": 0.4614, + "step": 5190 + }, + { + "epoch": 0.0704150841020076, + "grad_norm": 10.029275894165039, + "learning_rate": 9.391530766068248e-06, + "loss": 0.4747, + "step": 5191 + }, + { + "epoch": 0.07042864894194248, + "grad_norm": 6.838831901550293, + "learning_rate": 9.391393723447993e-06, + "loss": 0.3394, + "step": 5192 + }, + { + "epoch": 0.07044221378187737, + "grad_norm": 4.612356662750244, + "learning_rate": 9.391256680827738e-06, + "loss": 0.2104, + "step": 5193 + }, + { + "epoch": 0.07045577862181227, + "grad_norm": 6.393779754638672, + "learning_rate": 9.391119638207483e-06, + "loss": 0.4247, + "step": 5194 + }, + { + "epoch": 0.07046934346174716, + "grad_norm": 6.012467384338379, + "learning_rate": 9.390982595587228e-06, + "loss": 0.33, + "step": 5195 + }, + { + "epoch": 0.07048290830168204, + "grad_norm": 6.7509765625, + "learning_rate": 9.390845552966974e-06, + "loss": 0.4115, + "step": 5196 + }, + { + "epoch": 0.07049647314161693, + "grad_norm": 9.200396537780762, + "learning_rate": 9.390708510346719e-06, + "loss": 0.3352, + "step": 5197 + }, + { + "epoch": 0.07051003798155182, + "grad_norm": 6.655502796173096, + "learning_rate": 9.390571467726464e-06, + "loss": 0.3351, + "step": 5198 + }, + { + "epoch": 0.0705236028214867, + "grad_norm": 8.207301139831543, + "learning_rate": 9.39043442510621e-06, + "loss": 0.4971, + "step": 5199 + }, + { + "epoch": 0.0705371676614216, + "grad_norm": 6.259959697723389, + "learning_rate": 9.390297382485953e-06, + "loss": 0.4291, + "step": 5200 + }, + { + "epoch": 0.07055073250135649, + "grad_norm": 6.283236980438232, + "learning_rate": 9.3901603398657e-06, + "loss": 0.3745, + "step": 5201 + }, + { + "epoch": 0.07056429734129137, + "grad_norm": 6.054280757904053, + "learning_rate": 9.390023297245445e-06, + "loss": 0.418, + "step": 5202 + }, + { + "epoch": 0.07057786218122626, + "grad_norm": 9.356266975402832, + "learning_rate": 9.389886254625188e-06, + "loss": 0.4143, + "step": 5203 + }, + { + "epoch": 0.07059142702116115, + "grad_norm": 4.9835591316223145, + "learning_rate": 9.389749212004933e-06, + "loss": 0.2752, + "step": 5204 + }, + { + "epoch": 0.07060499186109603, + "grad_norm": 4.581337928771973, + "learning_rate": 9.38961216938468e-06, + "loss": 0.2555, + "step": 5205 + }, + { + "epoch": 0.07061855670103093, + "grad_norm": 7.305146217346191, + "learning_rate": 9.389475126764425e-06, + "loss": 0.412, + "step": 5206 + }, + { + "epoch": 0.07063212154096582, + "grad_norm": 7.874765872955322, + "learning_rate": 9.389338084144169e-06, + "loss": 0.4276, + "step": 5207 + }, + { + "epoch": 0.0706456863809007, + "grad_norm": 6.463367462158203, + "learning_rate": 9.389201041523914e-06, + "loss": 0.3235, + "step": 5208 + }, + { + "epoch": 0.07065925122083559, + "grad_norm": 5.8976850509643555, + "learning_rate": 9.389063998903661e-06, + "loss": 0.3237, + "step": 5209 + }, + { + "epoch": 0.07067281606077049, + "grad_norm": 5.706245422363281, + "learning_rate": 9.388926956283404e-06, + "loss": 0.3389, + "step": 5210 + }, + { + "epoch": 0.07068638090070538, + "grad_norm": 6.08206844329834, + "learning_rate": 9.38878991366315e-06, + "loss": 0.3611, + "step": 5211 + }, + { + "epoch": 0.07069994574064026, + "grad_norm": 4.383358478546143, + "learning_rate": 9.388652871042895e-06, + "loss": 0.2436, + "step": 5212 + }, + { + "epoch": 0.07071351058057515, + "grad_norm": 6.253892421722412, + "learning_rate": 9.38851582842264e-06, + "loss": 0.3504, + "step": 5213 + }, + { + "epoch": 0.07072707542051004, + "grad_norm": 7.141596794128418, + "learning_rate": 9.388378785802385e-06, + "loss": 0.3206, + "step": 5214 + }, + { + "epoch": 0.07074064026044492, + "grad_norm": 5.164010047912598, + "learning_rate": 9.38824174318213e-06, + "loss": 0.2827, + "step": 5215 + }, + { + "epoch": 0.07075420510037982, + "grad_norm": 7.599830627441406, + "learning_rate": 9.388104700561876e-06, + "loss": 0.5032, + "step": 5216 + }, + { + "epoch": 0.07076776994031471, + "grad_norm": 5.282959938049316, + "learning_rate": 9.38796765794162e-06, + "loss": 0.2369, + "step": 5217 + }, + { + "epoch": 0.07078133478024959, + "grad_norm": 7.227855682373047, + "learning_rate": 9.387830615321366e-06, + "loss": 0.3755, + "step": 5218 + }, + { + "epoch": 0.07079489962018448, + "grad_norm": 5.423591613769531, + "learning_rate": 9.387693572701111e-06, + "loss": 0.3344, + "step": 5219 + }, + { + "epoch": 0.07080846446011937, + "grad_norm": 5.718276500701904, + "learning_rate": 9.387556530080856e-06, + "loss": 0.2742, + "step": 5220 + }, + { + "epoch": 0.07082202930005425, + "grad_norm": 7.092529296875, + "learning_rate": 9.387419487460601e-06, + "loss": 0.256, + "step": 5221 + }, + { + "epoch": 0.07083559413998915, + "grad_norm": 6.189513206481934, + "learning_rate": 9.387282444840347e-06, + "loss": 0.3178, + "step": 5222 + }, + { + "epoch": 0.07084915897992404, + "grad_norm": 5.499050617218018, + "learning_rate": 9.387145402220092e-06, + "loss": 0.3627, + "step": 5223 + }, + { + "epoch": 0.07086272381985892, + "grad_norm": 5.241664886474609, + "learning_rate": 9.387008359599837e-06, + "loss": 0.2664, + "step": 5224 + }, + { + "epoch": 0.07087628865979381, + "grad_norm": 5.4664626121521, + "learning_rate": 9.38687131697958e-06, + "loss": 0.2282, + "step": 5225 + }, + { + "epoch": 0.0708898534997287, + "grad_norm": 4.729018688201904, + "learning_rate": 9.386734274359327e-06, + "loss": 0.233, + "step": 5226 + }, + { + "epoch": 0.0709034183396636, + "grad_norm": 5.957033157348633, + "learning_rate": 9.386597231739073e-06, + "loss": 0.3356, + "step": 5227 + }, + { + "epoch": 0.07091698317959848, + "grad_norm": 4.692104339599609, + "learning_rate": 9.386460189118816e-06, + "loss": 0.232, + "step": 5228 + }, + { + "epoch": 0.07093054801953337, + "grad_norm": 5.774326324462891, + "learning_rate": 9.386323146498561e-06, + "loss": 0.275, + "step": 5229 + }, + { + "epoch": 0.07094411285946826, + "grad_norm": 5.401110649108887, + "learning_rate": 9.386186103878306e-06, + "loss": 0.2761, + "step": 5230 + }, + { + "epoch": 0.07095767769940314, + "grad_norm": 3.7481627464294434, + "learning_rate": 9.386049061258052e-06, + "loss": 0.1311, + "step": 5231 + }, + { + "epoch": 0.07097124253933804, + "grad_norm": 4.833856105804443, + "learning_rate": 9.385912018637797e-06, + "loss": 0.1642, + "step": 5232 + }, + { + "epoch": 0.07098480737927293, + "grad_norm": 5.788684368133545, + "learning_rate": 9.385774976017542e-06, + "loss": 0.233, + "step": 5233 + }, + { + "epoch": 0.07099837221920781, + "grad_norm": 5.272830009460449, + "learning_rate": 9.385637933397287e-06, + "loss": 0.2, + "step": 5234 + }, + { + "epoch": 0.0710119370591427, + "grad_norm": 5.915139675140381, + "learning_rate": 9.385500890777032e-06, + "loss": 0.2906, + "step": 5235 + }, + { + "epoch": 0.0710255018990776, + "grad_norm": 5.113737106323242, + "learning_rate": 9.385363848156777e-06, + "loss": 0.2174, + "step": 5236 + }, + { + "epoch": 0.07103906673901247, + "grad_norm": 6.7812113761901855, + "learning_rate": 9.385226805536523e-06, + "loss": 0.2514, + "step": 5237 + }, + { + "epoch": 0.07105263157894737, + "grad_norm": 6.516827583312988, + "learning_rate": 9.385089762916268e-06, + "loss": 0.3051, + "step": 5238 + }, + { + "epoch": 0.07106619641888226, + "grad_norm": 6.326895236968994, + "learning_rate": 9.384952720296013e-06, + "loss": 0.3426, + "step": 5239 + }, + { + "epoch": 0.07107976125881714, + "grad_norm": 6.284855842590332, + "learning_rate": 9.384815677675758e-06, + "loss": 0.2702, + "step": 5240 + }, + { + "epoch": 0.07109332609875203, + "grad_norm": 7.56547737121582, + "learning_rate": 9.384678635055503e-06, + "loss": 0.3758, + "step": 5241 + }, + { + "epoch": 0.07110689093868693, + "grad_norm": 5.672407627105713, + "learning_rate": 9.384541592435249e-06, + "loss": 0.236, + "step": 5242 + }, + { + "epoch": 0.07112045577862182, + "grad_norm": 4.76330041885376, + "learning_rate": 9.384404549814992e-06, + "loss": 0.2304, + "step": 5243 + }, + { + "epoch": 0.0711340206185567, + "grad_norm": 5.516509532928467, + "learning_rate": 9.384267507194739e-06, + "loss": 0.1921, + "step": 5244 + }, + { + "epoch": 0.07114758545849159, + "grad_norm": 7.881631851196289, + "learning_rate": 9.384130464574484e-06, + "loss": 0.3665, + "step": 5245 + }, + { + "epoch": 0.07116115029842648, + "grad_norm": 6.179152965545654, + "learning_rate": 9.383993421954228e-06, + "loss": 0.2991, + "step": 5246 + }, + { + "epoch": 0.07117471513836136, + "grad_norm": 5.143016815185547, + "learning_rate": 9.383856379333973e-06, + "loss": 0.2172, + "step": 5247 + }, + { + "epoch": 0.07118827997829626, + "grad_norm": 7.122356414794922, + "learning_rate": 9.38371933671372e-06, + "loss": 0.4823, + "step": 5248 + }, + { + "epoch": 0.07120184481823115, + "grad_norm": 7.425034046173096, + "learning_rate": 9.383582294093465e-06, + "loss": 0.271, + "step": 5249 + }, + { + "epoch": 0.07121540965816603, + "grad_norm": 5.2270073890686035, + "learning_rate": 9.383445251473208e-06, + "loss": 0.3723, + "step": 5250 + }, + { + "epoch": 0.07122897449810092, + "grad_norm": 5.495741367340088, + "learning_rate": 9.383308208852953e-06, + "loss": 0.3244, + "step": 5251 + }, + { + "epoch": 0.07124253933803582, + "grad_norm": 5.709318161010742, + "learning_rate": 9.3831711662327e-06, + "loss": 0.3136, + "step": 5252 + }, + { + "epoch": 0.0712561041779707, + "grad_norm": 6.695450305938721, + "learning_rate": 9.383034123612444e-06, + "loss": 0.3016, + "step": 5253 + }, + { + "epoch": 0.07126966901790559, + "grad_norm": 5.388178825378418, + "learning_rate": 9.382897080992189e-06, + "loss": 0.3409, + "step": 5254 + }, + { + "epoch": 0.07128323385784048, + "grad_norm": 4.733616352081299, + "learning_rate": 9.382760038371934e-06, + "loss": 0.2426, + "step": 5255 + }, + { + "epoch": 0.07129679869777536, + "grad_norm": 7.572624683380127, + "learning_rate": 9.38262299575168e-06, + "loss": 0.289, + "step": 5256 + }, + { + "epoch": 0.07131036353771025, + "grad_norm": 6.182260513305664, + "learning_rate": 9.382485953131424e-06, + "loss": 0.3264, + "step": 5257 + }, + { + "epoch": 0.07132392837764515, + "grad_norm": 6.957624912261963, + "learning_rate": 9.38234891051117e-06, + "loss": 0.3908, + "step": 5258 + }, + { + "epoch": 0.07133749321758004, + "grad_norm": 7.81468391418457, + "learning_rate": 9.382211867890915e-06, + "loss": 0.3832, + "step": 5259 + }, + { + "epoch": 0.07135105805751492, + "grad_norm": 5.3972625732421875, + "learning_rate": 9.38207482527066e-06, + "loss": 0.2363, + "step": 5260 + }, + { + "epoch": 0.07136462289744981, + "grad_norm": 5.949840545654297, + "learning_rate": 9.381937782650405e-06, + "loss": 0.2898, + "step": 5261 + }, + { + "epoch": 0.0713781877373847, + "grad_norm": 7.365085124969482, + "learning_rate": 9.38180074003015e-06, + "loss": 0.2789, + "step": 5262 + }, + { + "epoch": 0.07139175257731958, + "grad_norm": 5.6489410400390625, + "learning_rate": 9.381663697409896e-06, + "loss": 0.2931, + "step": 5263 + }, + { + "epoch": 0.07140531741725448, + "grad_norm": 5.614826202392578, + "learning_rate": 9.38152665478964e-06, + "loss": 0.3929, + "step": 5264 + }, + { + "epoch": 0.07141888225718937, + "grad_norm": 4.9233808517456055, + "learning_rate": 9.381389612169386e-06, + "loss": 0.2442, + "step": 5265 + }, + { + "epoch": 0.07143244709712425, + "grad_norm": 7.177333354949951, + "learning_rate": 9.381252569549131e-06, + "loss": 0.3299, + "step": 5266 + }, + { + "epoch": 0.07144601193705914, + "grad_norm": 6.521148681640625, + "learning_rate": 9.381115526928876e-06, + "loss": 0.3279, + "step": 5267 + }, + { + "epoch": 0.07145957677699404, + "grad_norm": 7.484494209289551, + "learning_rate": 9.38097848430862e-06, + "loss": 0.5624, + "step": 5268 + }, + { + "epoch": 0.07147314161692891, + "grad_norm": 5.805451393127441, + "learning_rate": 9.380841441688365e-06, + "loss": 0.4284, + "step": 5269 + }, + { + "epoch": 0.07148670645686381, + "grad_norm": 7.016268730163574, + "learning_rate": 9.380704399068112e-06, + "loss": 0.339, + "step": 5270 + }, + { + "epoch": 0.0715002712967987, + "grad_norm": 8.196112632751465, + "learning_rate": 9.380567356447855e-06, + "loss": 0.4557, + "step": 5271 + }, + { + "epoch": 0.07151383613673358, + "grad_norm": 4.898420810699463, + "learning_rate": 9.3804303138276e-06, + "loss": 0.3555, + "step": 5272 + }, + { + "epoch": 0.07152740097666847, + "grad_norm": 5.513956546783447, + "learning_rate": 9.380293271207346e-06, + "loss": 0.3358, + "step": 5273 + }, + { + "epoch": 0.07154096581660337, + "grad_norm": 7.390636444091797, + "learning_rate": 9.380156228587093e-06, + "loss": 0.3868, + "step": 5274 + }, + { + "epoch": 0.07155453065653826, + "grad_norm": 5.589601993560791, + "learning_rate": 9.380019185966836e-06, + "loss": 0.2986, + "step": 5275 + }, + { + "epoch": 0.07156809549647314, + "grad_norm": 5.32568359375, + "learning_rate": 9.379882143346581e-06, + "loss": 0.2723, + "step": 5276 + }, + { + "epoch": 0.07158166033640803, + "grad_norm": 4.831085205078125, + "learning_rate": 9.379745100726326e-06, + "loss": 0.2794, + "step": 5277 + }, + { + "epoch": 0.07159522517634292, + "grad_norm": 7.0651984214782715, + "learning_rate": 9.379608058106072e-06, + "loss": 0.3486, + "step": 5278 + }, + { + "epoch": 0.0716087900162778, + "grad_norm": 6.308075904846191, + "learning_rate": 9.379471015485817e-06, + "loss": 0.3143, + "step": 5279 + }, + { + "epoch": 0.0716223548562127, + "grad_norm": 7.2279744148254395, + "learning_rate": 9.379333972865562e-06, + "loss": 0.4998, + "step": 5280 + }, + { + "epoch": 0.07163591969614759, + "grad_norm": 8.429794311523438, + "learning_rate": 9.379196930245307e-06, + "loss": 0.5358, + "step": 5281 + }, + { + "epoch": 0.07164948453608247, + "grad_norm": 6.577387809753418, + "learning_rate": 9.379059887625052e-06, + "loss": 0.3656, + "step": 5282 + }, + { + "epoch": 0.07166304937601736, + "grad_norm": 7.520437240600586, + "learning_rate": 9.378922845004797e-06, + "loss": 0.4559, + "step": 5283 + }, + { + "epoch": 0.07167661421595226, + "grad_norm": 9.928016662597656, + "learning_rate": 9.378785802384543e-06, + "loss": 0.5067, + "step": 5284 + }, + { + "epoch": 0.07169017905588713, + "grad_norm": 7.207648277282715, + "learning_rate": 9.378648759764288e-06, + "loss": 0.5026, + "step": 5285 + }, + { + "epoch": 0.07170374389582203, + "grad_norm": 6.76861047744751, + "learning_rate": 9.378511717144031e-06, + "loss": 0.3645, + "step": 5286 + }, + { + "epoch": 0.07171730873575692, + "grad_norm": 9.00410270690918, + "learning_rate": 9.378374674523778e-06, + "loss": 0.6319, + "step": 5287 + }, + { + "epoch": 0.0717308735756918, + "grad_norm": 7.980982303619385, + "learning_rate": 9.378237631903523e-06, + "loss": 0.4445, + "step": 5288 + }, + { + "epoch": 0.0717444384156267, + "grad_norm": 8.090721130371094, + "learning_rate": 9.378100589283269e-06, + "loss": 0.4886, + "step": 5289 + }, + { + "epoch": 0.07175800325556159, + "grad_norm": 6.054101467132568, + "learning_rate": 9.377963546663012e-06, + "loss": 0.3586, + "step": 5290 + }, + { + "epoch": 0.07177156809549648, + "grad_norm": 8.061224937438965, + "learning_rate": 9.377826504042759e-06, + "loss": 0.5872, + "step": 5291 + }, + { + "epoch": 0.07178513293543136, + "grad_norm": 7.444918632507324, + "learning_rate": 9.377689461422504e-06, + "loss": 0.4041, + "step": 5292 + }, + { + "epoch": 0.07179869777536625, + "grad_norm": 7.369090557098389, + "learning_rate": 9.377552418802248e-06, + "loss": 0.4254, + "step": 5293 + }, + { + "epoch": 0.07181226261530115, + "grad_norm": 8.288656234741211, + "learning_rate": 9.377415376181993e-06, + "loss": 0.4967, + "step": 5294 + }, + { + "epoch": 0.07182582745523602, + "grad_norm": 9.068353652954102, + "learning_rate": 9.37727833356174e-06, + "loss": 0.4257, + "step": 5295 + }, + { + "epoch": 0.07183939229517092, + "grad_norm": 6.7025532722473145, + "learning_rate": 9.377141290941483e-06, + "loss": 0.4438, + "step": 5296 + }, + { + "epoch": 0.07185295713510581, + "grad_norm": 9.162360191345215, + "learning_rate": 9.377004248321228e-06, + "loss": 0.4681, + "step": 5297 + }, + { + "epoch": 0.07186652197504069, + "grad_norm": 6.984382629394531, + "learning_rate": 9.376867205700973e-06, + "loss": 0.4061, + "step": 5298 + }, + { + "epoch": 0.07188008681497558, + "grad_norm": 7.901956081390381, + "learning_rate": 9.376730163080719e-06, + "loss": 0.4063, + "step": 5299 + }, + { + "epoch": 0.07189365165491048, + "grad_norm": 6.5478596687316895, + "learning_rate": 9.376593120460464e-06, + "loss": 0.3991, + "step": 5300 + }, + { + "epoch": 0.07190721649484536, + "grad_norm": 6.169185638427734, + "learning_rate": 9.376456077840209e-06, + "loss": 0.2838, + "step": 5301 + }, + { + "epoch": 0.07192078133478025, + "grad_norm": 7.297759532928467, + "learning_rate": 9.376319035219954e-06, + "loss": 0.3969, + "step": 5302 + }, + { + "epoch": 0.07193434617471514, + "grad_norm": 7.422572612762451, + "learning_rate": 9.3761819925997e-06, + "loss": 0.4276, + "step": 5303 + }, + { + "epoch": 0.07194791101465002, + "grad_norm": 8.516631126403809, + "learning_rate": 9.376044949979445e-06, + "loss": 0.5725, + "step": 5304 + }, + { + "epoch": 0.07196147585458491, + "grad_norm": 11.711851119995117, + "learning_rate": 9.37590790735919e-06, + "loss": 0.4515, + "step": 5305 + }, + { + "epoch": 0.0719750406945198, + "grad_norm": 9.005440711975098, + "learning_rate": 9.375770864738935e-06, + "loss": 0.5241, + "step": 5306 + }, + { + "epoch": 0.0719886055344547, + "grad_norm": 8.646596908569336, + "learning_rate": 9.37563382211868e-06, + "loss": 0.3819, + "step": 5307 + }, + { + "epoch": 0.07200217037438958, + "grad_norm": 6.618277072906494, + "learning_rate": 9.375496779498425e-06, + "loss": 0.4307, + "step": 5308 + }, + { + "epoch": 0.07201573521432447, + "grad_norm": 7.724084377288818, + "learning_rate": 9.37535973687817e-06, + "loss": 0.4109, + "step": 5309 + }, + { + "epoch": 0.07202930005425937, + "grad_norm": 6.707508563995361, + "learning_rate": 9.375222694257916e-06, + "loss": 0.3566, + "step": 5310 + }, + { + "epoch": 0.07204286489419424, + "grad_norm": 6.038589954376221, + "learning_rate": 9.375085651637659e-06, + "loss": 0.4153, + "step": 5311 + }, + { + "epoch": 0.07205642973412914, + "grad_norm": 6.395274639129639, + "learning_rate": 9.374948609017404e-06, + "loss": 0.4648, + "step": 5312 + }, + { + "epoch": 0.07206999457406403, + "grad_norm": 7.895576000213623, + "learning_rate": 9.374811566397151e-06, + "loss": 0.497, + "step": 5313 + }, + { + "epoch": 0.07208355941399891, + "grad_norm": 7.8157877922058105, + "learning_rate": 9.374674523776896e-06, + "loss": 0.555, + "step": 5314 + }, + { + "epoch": 0.0720971242539338, + "grad_norm": 7.841760635375977, + "learning_rate": 9.37453748115664e-06, + "loss": 0.4063, + "step": 5315 + }, + { + "epoch": 0.0721106890938687, + "grad_norm": 6.896250247955322, + "learning_rate": 9.374400438536385e-06, + "loss": 0.4133, + "step": 5316 + }, + { + "epoch": 0.07212425393380358, + "grad_norm": 9.150373458862305, + "learning_rate": 9.374263395916132e-06, + "loss": 0.4356, + "step": 5317 + }, + { + "epoch": 0.07213781877373847, + "grad_norm": 7.397340774536133, + "learning_rate": 9.374126353295875e-06, + "loss": 0.3469, + "step": 5318 + }, + { + "epoch": 0.07215138361367336, + "grad_norm": 7.447941780090332, + "learning_rate": 9.37398931067562e-06, + "loss": 0.5431, + "step": 5319 + }, + { + "epoch": 0.07216494845360824, + "grad_norm": 7.631589889526367, + "learning_rate": 9.373852268055366e-06, + "loss": 0.5301, + "step": 5320 + }, + { + "epoch": 0.07217851329354313, + "grad_norm": 7.500068664550781, + "learning_rate": 9.373715225435111e-06, + "loss": 0.3972, + "step": 5321 + }, + { + "epoch": 0.07219207813347803, + "grad_norm": 9.788630485534668, + "learning_rate": 9.373578182814856e-06, + "loss": 0.6585, + "step": 5322 + }, + { + "epoch": 0.07220564297341292, + "grad_norm": 9.38573169708252, + "learning_rate": 9.373441140194601e-06, + "loss": 0.5269, + "step": 5323 + }, + { + "epoch": 0.0722192078133478, + "grad_norm": 6.643951416015625, + "learning_rate": 9.373304097574346e-06, + "loss": 0.3452, + "step": 5324 + }, + { + "epoch": 0.07223277265328269, + "grad_norm": 7.065097332000732, + "learning_rate": 9.373167054954092e-06, + "loss": 0.4682, + "step": 5325 + }, + { + "epoch": 0.07224633749321759, + "grad_norm": 8.642986297607422, + "learning_rate": 9.373030012333837e-06, + "loss": 0.4537, + "step": 5326 + }, + { + "epoch": 0.07225990233315246, + "grad_norm": 8.121572494506836, + "learning_rate": 9.372892969713582e-06, + "loss": 0.4594, + "step": 5327 + }, + { + "epoch": 0.07227346717308736, + "grad_norm": 10.17613697052002, + "learning_rate": 9.372755927093327e-06, + "loss": 0.5545, + "step": 5328 + }, + { + "epoch": 0.07228703201302225, + "grad_norm": 8.82511043548584, + "learning_rate": 9.37261888447307e-06, + "loss": 0.5543, + "step": 5329 + }, + { + "epoch": 0.07230059685295713, + "grad_norm": 9.076382637023926, + "learning_rate": 9.372481841852817e-06, + "loss": 0.422, + "step": 5330 + }, + { + "epoch": 0.07231416169289202, + "grad_norm": 10.201601028442383, + "learning_rate": 9.372344799232563e-06, + "loss": 0.4684, + "step": 5331 + }, + { + "epoch": 0.07232772653282692, + "grad_norm": 6.451058864593506, + "learning_rate": 9.372207756612308e-06, + "loss": 0.4423, + "step": 5332 + }, + { + "epoch": 0.0723412913727618, + "grad_norm": 7.346825122833252, + "learning_rate": 9.372070713992051e-06, + "loss": 0.3453, + "step": 5333 + }, + { + "epoch": 0.07235485621269669, + "grad_norm": 6.640686988830566, + "learning_rate": 9.371933671371798e-06, + "loss": 0.4205, + "step": 5334 + }, + { + "epoch": 0.07236842105263158, + "grad_norm": 8.72310733795166, + "learning_rate": 9.371796628751543e-06, + "loss": 0.4777, + "step": 5335 + }, + { + "epoch": 0.07238198589256646, + "grad_norm": 10.079517364501953, + "learning_rate": 9.371659586131287e-06, + "loss": 0.584, + "step": 5336 + }, + { + "epoch": 0.07239555073250135, + "grad_norm": 8.20347785949707, + "learning_rate": 9.371522543511032e-06, + "loss": 0.4839, + "step": 5337 + }, + { + "epoch": 0.07240911557243625, + "grad_norm": 6.220577716827393, + "learning_rate": 9.371385500890777e-06, + "loss": 0.2966, + "step": 5338 + }, + { + "epoch": 0.07242268041237114, + "grad_norm": 6.444993019104004, + "learning_rate": 9.371248458270522e-06, + "loss": 0.2874, + "step": 5339 + }, + { + "epoch": 0.07243624525230602, + "grad_norm": 5.895143985748291, + "learning_rate": 9.371111415650268e-06, + "loss": 0.3539, + "step": 5340 + }, + { + "epoch": 0.07244981009224091, + "grad_norm": 8.099685668945312, + "learning_rate": 9.370974373030013e-06, + "loss": 0.5032, + "step": 5341 + }, + { + "epoch": 0.0724633749321758, + "grad_norm": 8.524382591247559, + "learning_rate": 9.370837330409758e-06, + "loss": 0.47, + "step": 5342 + }, + { + "epoch": 0.07247693977211069, + "grad_norm": 7.325585842132568, + "learning_rate": 9.370700287789503e-06, + "loss": 0.2909, + "step": 5343 + }, + { + "epoch": 0.07249050461204558, + "grad_norm": 6.666383743286133, + "learning_rate": 9.370563245169248e-06, + "loss": 0.474, + "step": 5344 + }, + { + "epoch": 0.07250406945198047, + "grad_norm": 9.216879844665527, + "learning_rate": 9.370426202548993e-06, + "loss": 0.4534, + "step": 5345 + }, + { + "epoch": 0.07251763429191535, + "grad_norm": 6.909764766693115, + "learning_rate": 9.370289159928739e-06, + "loss": 0.3154, + "step": 5346 + }, + { + "epoch": 0.07253119913185024, + "grad_norm": 6.703704357147217, + "learning_rate": 9.370152117308484e-06, + "loss": 0.3015, + "step": 5347 + }, + { + "epoch": 0.07254476397178514, + "grad_norm": 7.5935869216918945, + "learning_rate": 9.370015074688229e-06, + "loss": 0.3869, + "step": 5348 + }, + { + "epoch": 0.07255832881172002, + "grad_norm": 5.741928577423096, + "learning_rate": 9.369878032067974e-06, + "loss": 0.2667, + "step": 5349 + }, + { + "epoch": 0.07257189365165491, + "grad_norm": 8.294472694396973, + "learning_rate": 9.36974098944772e-06, + "loss": 0.4624, + "step": 5350 + }, + { + "epoch": 0.0725854584915898, + "grad_norm": 7.555749416351318, + "learning_rate": 9.369603946827465e-06, + "loss": 0.4799, + "step": 5351 + }, + { + "epoch": 0.07259902333152468, + "grad_norm": 5.475797653198242, + "learning_rate": 9.36946690420721e-06, + "loss": 0.2694, + "step": 5352 + }, + { + "epoch": 0.07261258817145957, + "grad_norm": 8.923569679260254, + "learning_rate": 9.369329861586955e-06, + "loss": 0.3958, + "step": 5353 + }, + { + "epoch": 0.07262615301139447, + "grad_norm": 10.241917610168457, + "learning_rate": 9.369192818966698e-06, + "loss": 0.4631, + "step": 5354 + }, + { + "epoch": 0.07263971785132936, + "grad_norm": 6.8803815841674805, + "learning_rate": 9.369055776346444e-06, + "loss": 0.4758, + "step": 5355 + }, + { + "epoch": 0.07265328269126424, + "grad_norm": 5.803779125213623, + "learning_rate": 9.36891873372619e-06, + "loss": 0.4021, + "step": 5356 + }, + { + "epoch": 0.07266684753119913, + "grad_norm": 6.119816780090332, + "learning_rate": 9.368781691105936e-06, + "loss": 0.4006, + "step": 5357 + }, + { + "epoch": 0.07268041237113403, + "grad_norm": 6.700388431549072, + "learning_rate": 9.368644648485679e-06, + "loss": 0.4179, + "step": 5358 + }, + { + "epoch": 0.0726939772110689, + "grad_norm": 6.851258754730225, + "learning_rate": 9.368507605865424e-06, + "loss": 0.5325, + "step": 5359 + }, + { + "epoch": 0.0727075420510038, + "grad_norm": 5.266502857208252, + "learning_rate": 9.368370563245171e-06, + "loss": 0.3019, + "step": 5360 + }, + { + "epoch": 0.07272110689093869, + "grad_norm": 6.201563835144043, + "learning_rate": 9.368233520624915e-06, + "loss": 0.2388, + "step": 5361 + }, + { + "epoch": 0.07273467173087357, + "grad_norm": 7.033480167388916, + "learning_rate": 9.36809647800466e-06, + "loss": 0.2963, + "step": 5362 + }, + { + "epoch": 0.07274823657080846, + "grad_norm": 5.756860256195068, + "learning_rate": 9.367959435384405e-06, + "loss": 0.3519, + "step": 5363 + }, + { + "epoch": 0.07276180141074336, + "grad_norm": 6.793461322784424, + "learning_rate": 9.36782239276415e-06, + "loss": 0.46, + "step": 5364 + }, + { + "epoch": 0.07277536625067824, + "grad_norm": 4.883537292480469, + "learning_rate": 9.367685350143895e-06, + "loss": 0.2155, + "step": 5365 + }, + { + "epoch": 0.07278893109061313, + "grad_norm": 6.818307399749756, + "learning_rate": 9.36754830752364e-06, + "loss": 0.3737, + "step": 5366 + }, + { + "epoch": 0.07280249593054802, + "grad_norm": 6.820581912994385, + "learning_rate": 9.367411264903386e-06, + "loss": 0.3493, + "step": 5367 + }, + { + "epoch": 0.0728160607704829, + "grad_norm": 7.6188578605651855, + "learning_rate": 9.367274222283131e-06, + "loss": 0.3441, + "step": 5368 + }, + { + "epoch": 0.0728296256104178, + "grad_norm": 5.150660037994385, + "learning_rate": 9.367137179662876e-06, + "loss": 0.1959, + "step": 5369 + }, + { + "epoch": 0.07284319045035269, + "grad_norm": 4.716190814971924, + "learning_rate": 9.367000137042621e-06, + "loss": 0.2877, + "step": 5370 + }, + { + "epoch": 0.07285675529028758, + "grad_norm": 7.761122226715088, + "learning_rate": 9.366863094422366e-06, + "loss": 0.4387, + "step": 5371 + }, + { + "epoch": 0.07287032013022246, + "grad_norm": 6.260744094848633, + "learning_rate": 9.366726051802112e-06, + "loss": 0.3426, + "step": 5372 + }, + { + "epoch": 0.07288388497015735, + "grad_norm": 8.073596000671387, + "learning_rate": 9.366589009181857e-06, + "loss": 0.3297, + "step": 5373 + }, + { + "epoch": 0.07289744981009225, + "grad_norm": 8.466248512268066, + "learning_rate": 9.366451966561602e-06, + "loss": 0.3877, + "step": 5374 + }, + { + "epoch": 0.07291101465002713, + "grad_norm": 5.98144006729126, + "learning_rate": 9.366314923941347e-06, + "loss": 0.3286, + "step": 5375 + }, + { + "epoch": 0.07292457948996202, + "grad_norm": 6.250979900360107, + "learning_rate": 9.36617788132109e-06, + "loss": 0.261, + "step": 5376 + }, + { + "epoch": 0.07293814432989691, + "grad_norm": 5.452325344085693, + "learning_rate": 9.366040838700837e-06, + "loss": 0.3406, + "step": 5377 + }, + { + "epoch": 0.07295170916983179, + "grad_norm": 9.418730735778809, + "learning_rate": 9.365903796080583e-06, + "loss": 0.4882, + "step": 5378 + }, + { + "epoch": 0.07296527400976668, + "grad_norm": 6.281920433044434, + "learning_rate": 9.365766753460326e-06, + "loss": 0.2958, + "step": 5379 + }, + { + "epoch": 0.07297883884970158, + "grad_norm": 5.5127153396606445, + "learning_rate": 9.365629710840071e-06, + "loss": 0.3114, + "step": 5380 + }, + { + "epoch": 0.07299240368963646, + "grad_norm": 5.883706092834473, + "learning_rate": 9.365492668219817e-06, + "loss": 0.4695, + "step": 5381 + }, + { + "epoch": 0.07300596852957135, + "grad_norm": 7.855056285858154, + "learning_rate": 9.365355625599563e-06, + "loss": 0.4353, + "step": 5382 + }, + { + "epoch": 0.07301953336950624, + "grad_norm": 5.66411018371582, + "learning_rate": 9.365218582979307e-06, + "loss": 0.2555, + "step": 5383 + }, + { + "epoch": 0.07303309820944112, + "grad_norm": 9.48581600189209, + "learning_rate": 9.365081540359052e-06, + "loss": 0.5845, + "step": 5384 + }, + { + "epoch": 0.07304666304937601, + "grad_norm": 5.951670169830322, + "learning_rate": 9.364944497738797e-06, + "loss": 0.3774, + "step": 5385 + }, + { + "epoch": 0.07306022788931091, + "grad_norm": 6.2371439933776855, + "learning_rate": 9.364807455118542e-06, + "loss": 0.3961, + "step": 5386 + }, + { + "epoch": 0.0730737927292458, + "grad_norm": 7.722793102264404, + "learning_rate": 9.364670412498288e-06, + "loss": 0.4713, + "step": 5387 + }, + { + "epoch": 0.07308735756918068, + "grad_norm": 5.821434497833252, + "learning_rate": 9.364533369878033e-06, + "loss": 0.3013, + "step": 5388 + }, + { + "epoch": 0.07310092240911557, + "grad_norm": 7.257089138031006, + "learning_rate": 9.364396327257778e-06, + "loss": 0.4486, + "step": 5389 + }, + { + "epoch": 0.07311448724905047, + "grad_norm": 5.2516021728515625, + "learning_rate": 9.364259284637523e-06, + "loss": 0.1984, + "step": 5390 + }, + { + "epoch": 0.07312805208898535, + "grad_norm": 9.265874862670898, + "learning_rate": 9.364122242017268e-06, + "loss": 0.5056, + "step": 5391 + }, + { + "epoch": 0.07314161692892024, + "grad_norm": 8.859667778015137, + "learning_rate": 9.363985199397013e-06, + "loss": 0.426, + "step": 5392 + }, + { + "epoch": 0.07315518176885513, + "grad_norm": 5.799472808837891, + "learning_rate": 9.363848156776759e-06, + "loss": 0.3313, + "step": 5393 + }, + { + "epoch": 0.07316874660879001, + "grad_norm": 6.193823337554932, + "learning_rate": 9.363711114156502e-06, + "loss": 0.3371, + "step": 5394 + }, + { + "epoch": 0.0731823114487249, + "grad_norm": 5.171792507171631, + "learning_rate": 9.363574071536249e-06, + "loss": 0.3157, + "step": 5395 + }, + { + "epoch": 0.0731958762886598, + "grad_norm": 6.616139888763428, + "learning_rate": 9.363437028915994e-06, + "loss": 0.2987, + "step": 5396 + }, + { + "epoch": 0.07320944112859468, + "grad_norm": 4.620837211608887, + "learning_rate": 9.36329998629574e-06, + "loss": 0.2024, + "step": 5397 + }, + { + "epoch": 0.07322300596852957, + "grad_norm": 7.700159072875977, + "learning_rate": 9.363162943675483e-06, + "loss": 0.3118, + "step": 5398 + }, + { + "epoch": 0.07323657080846446, + "grad_norm": 8.707534790039062, + "learning_rate": 9.36302590105523e-06, + "loss": 0.4907, + "step": 5399 + }, + { + "epoch": 0.07325013564839934, + "grad_norm": 4.090849876403809, + "learning_rate": 9.362888858434975e-06, + "loss": 0.1439, + "step": 5400 + }, + { + "epoch": 0.07326370048833424, + "grad_norm": 6.129255294799805, + "learning_rate": 9.362751815814718e-06, + "loss": 0.2421, + "step": 5401 + }, + { + "epoch": 0.07327726532826913, + "grad_norm": 4.047583103179932, + "learning_rate": 9.362614773194464e-06, + "loss": 0.206, + "step": 5402 + }, + { + "epoch": 0.07329083016820402, + "grad_norm": 5.2296271324157715, + "learning_rate": 9.36247773057421e-06, + "loss": 0.1171, + "step": 5403 + }, + { + "epoch": 0.0733043950081389, + "grad_norm": 5.380308628082275, + "learning_rate": 9.362340687953954e-06, + "loss": 0.2716, + "step": 5404 + }, + { + "epoch": 0.0733179598480738, + "grad_norm": 5.286651134490967, + "learning_rate": 9.362203645333699e-06, + "loss": 0.2644, + "step": 5405 + }, + { + "epoch": 0.07333152468800869, + "grad_norm": 5.276982307434082, + "learning_rate": 9.362066602713444e-06, + "loss": 0.2055, + "step": 5406 + }, + { + "epoch": 0.07334508952794357, + "grad_norm": 5.366265296936035, + "learning_rate": 9.36192956009319e-06, + "loss": 0.2549, + "step": 5407 + }, + { + "epoch": 0.07335865436787846, + "grad_norm": 7.817133903503418, + "learning_rate": 9.361792517472935e-06, + "loss": 0.3399, + "step": 5408 + }, + { + "epoch": 0.07337221920781335, + "grad_norm": 9.243293762207031, + "learning_rate": 9.36165547485268e-06, + "loss": 0.4259, + "step": 5409 + }, + { + "epoch": 0.07338578404774823, + "grad_norm": 4.293515682220459, + "learning_rate": 9.361518432232425e-06, + "loss": 0.1685, + "step": 5410 + }, + { + "epoch": 0.07339934888768312, + "grad_norm": 6.070296764373779, + "learning_rate": 9.36138138961217e-06, + "loss": 0.3748, + "step": 5411 + }, + { + "epoch": 0.07341291372761802, + "grad_norm": 5.630780220031738, + "learning_rate": 9.361244346991915e-06, + "loss": 0.2693, + "step": 5412 + }, + { + "epoch": 0.0734264785675529, + "grad_norm": 4.271348476409912, + "learning_rate": 9.36110730437166e-06, + "loss": 0.2081, + "step": 5413 + }, + { + "epoch": 0.07344004340748779, + "grad_norm": 6.7612833976745605, + "learning_rate": 9.360970261751406e-06, + "loss": 0.2978, + "step": 5414 + }, + { + "epoch": 0.07345360824742268, + "grad_norm": 6.366386890411377, + "learning_rate": 9.360833219131151e-06, + "loss": 0.364, + "step": 5415 + }, + { + "epoch": 0.07346717308735756, + "grad_norm": 7.247935771942139, + "learning_rate": 9.360696176510896e-06, + "loss": 0.5569, + "step": 5416 + }, + { + "epoch": 0.07348073792729246, + "grad_norm": 6.718783378601074, + "learning_rate": 9.360559133890641e-06, + "loss": 0.3075, + "step": 5417 + }, + { + "epoch": 0.07349430276722735, + "grad_norm": 4.51918888092041, + "learning_rate": 9.360422091270386e-06, + "loss": 0.1943, + "step": 5418 + }, + { + "epoch": 0.07350786760716224, + "grad_norm": 6.580668926239014, + "learning_rate": 9.36028504865013e-06, + "loss": 0.3861, + "step": 5419 + }, + { + "epoch": 0.07352143244709712, + "grad_norm": 7.153140544891357, + "learning_rate": 9.360148006029877e-06, + "loss": 0.3612, + "step": 5420 + }, + { + "epoch": 0.07353499728703201, + "grad_norm": 8.867959022521973, + "learning_rate": 9.360010963409622e-06, + "loss": 0.6152, + "step": 5421 + }, + { + "epoch": 0.07354856212696691, + "grad_norm": 6.527795314788818, + "learning_rate": 9.359873920789365e-06, + "loss": 0.2806, + "step": 5422 + }, + { + "epoch": 0.07356212696690179, + "grad_norm": 7.5667338371276855, + "learning_rate": 9.35973687816911e-06, + "loss": 0.3407, + "step": 5423 + }, + { + "epoch": 0.07357569180683668, + "grad_norm": 8.353006362915039, + "learning_rate": 9.359599835548856e-06, + "loss": 0.3026, + "step": 5424 + }, + { + "epoch": 0.07358925664677157, + "grad_norm": 6.689835548400879, + "learning_rate": 9.359462792928603e-06, + "loss": 0.561, + "step": 5425 + }, + { + "epoch": 0.07360282148670645, + "grad_norm": 10.540879249572754, + "learning_rate": 9.359325750308346e-06, + "loss": 0.4475, + "step": 5426 + }, + { + "epoch": 0.07361638632664134, + "grad_norm": 6.956131935119629, + "learning_rate": 9.359188707688091e-06, + "loss": 0.3313, + "step": 5427 + }, + { + "epoch": 0.07362995116657624, + "grad_norm": 5.805016994476318, + "learning_rate": 9.359051665067837e-06, + "loss": 0.5181, + "step": 5428 + }, + { + "epoch": 0.07364351600651112, + "grad_norm": 6.97525691986084, + "learning_rate": 9.358914622447582e-06, + "loss": 0.3232, + "step": 5429 + }, + { + "epoch": 0.07365708084644601, + "grad_norm": 6.597316265106201, + "learning_rate": 9.358777579827327e-06, + "loss": 0.3352, + "step": 5430 + }, + { + "epoch": 0.0736706456863809, + "grad_norm": 5.008620262145996, + "learning_rate": 9.358640537207072e-06, + "loss": 0.3217, + "step": 5431 + }, + { + "epoch": 0.07368421052631578, + "grad_norm": 10.748154640197754, + "learning_rate": 9.358503494586817e-06, + "loss": 0.4978, + "step": 5432 + }, + { + "epoch": 0.07369777536625068, + "grad_norm": 5.487894535064697, + "learning_rate": 9.358366451966562e-06, + "loss": 0.3388, + "step": 5433 + }, + { + "epoch": 0.07371134020618557, + "grad_norm": 9.345671653747559, + "learning_rate": 9.358229409346308e-06, + "loss": 0.5368, + "step": 5434 + }, + { + "epoch": 0.07372490504612046, + "grad_norm": 8.245410919189453, + "learning_rate": 9.358092366726053e-06, + "loss": 0.3715, + "step": 5435 + }, + { + "epoch": 0.07373846988605534, + "grad_norm": 7.479406833648682, + "learning_rate": 9.357955324105798e-06, + "loss": 0.5563, + "step": 5436 + }, + { + "epoch": 0.07375203472599023, + "grad_norm": 8.960100173950195, + "learning_rate": 9.357818281485541e-06, + "loss": 0.4684, + "step": 5437 + }, + { + "epoch": 0.07376559956592513, + "grad_norm": 8.520807266235352, + "learning_rate": 9.357681238865288e-06, + "loss": 0.6097, + "step": 5438 + }, + { + "epoch": 0.07377916440586, + "grad_norm": 6.47210693359375, + "learning_rate": 9.357544196245034e-06, + "loss": 0.4703, + "step": 5439 + }, + { + "epoch": 0.0737927292457949, + "grad_norm": 8.90979290008545, + "learning_rate": 9.357407153624779e-06, + "loss": 0.7181, + "step": 5440 + }, + { + "epoch": 0.07380629408572979, + "grad_norm": 6.5563459396362305, + "learning_rate": 9.357270111004522e-06, + "loss": 0.4353, + "step": 5441 + }, + { + "epoch": 0.07381985892566467, + "grad_norm": 6.515951156616211, + "learning_rate": 9.357133068384269e-06, + "loss": 0.3192, + "step": 5442 + }, + { + "epoch": 0.07383342376559957, + "grad_norm": 6.885808944702148, + "learning_rate": 9.356996025764014e-06, + "loss": 0.3716, + "step": 5443 + }, + { + "epoch": 0.07384698860553446, + "grad_norm": 7.7976789474487305, + "learning_rate": 9.356858983143758e-06, + "loss": 0.3877, + "step": 5444 + }, + { + "epoch": 0.07386055344546934, + "grad_norm": 8.38793659210205, + "learning_rate": 9.356721940523503e-06, + "loss": 0.6341, + "step": 5445 + }, + { + "epoch": 0.07387411828540423, + "grad_norm": 7.2431960105896, + "learning_rate": 9.35658489790325e-06, + "loss": 0.452, + "step": 5446 + }, + { + "epoch": 0.07388768312533912, + "grad_norm": 7.349884986877441, + "learning_rate": 9.356447855282993e-06, + "loss": 0.3659, + "step": 5447 + }, + { + "epoch": 0.073901247965274, + "grad_norm": 7.766773700714111, + "learning_rate": 9.356310812662738e-06, + "loss": 0.4251, + "step": 5448 + }, + { + "epoch": 0.0739148128052089, + "grad_norm": 6.838222980499268, + "learning_rate": 9.356173770042484e-06, + "loss": 0.4604, + "step": 5449 + }, + { + "epoch": 0.07392837764514379, + "grad_norm": 7.798664093017578, + "learning_rate": 9.356036727422229e-06, + "loss": 0.4924, + "step": 5450 + }, + { + "epoch": 0.07394194248507868, + "grad_norm": 7.425014495849609, + "learning_rate": 9.355899684801974e-06, + "loss": 0.4984, + "step": 5451 + }, + { + "epoch": 0.07395550732501356, + "grad_norm": 8.773723602294922, + "learning_rate": 9.355762642181719e-06, + "loss": 0.524, + "step": 5452 + }, + { + "epoch": 0.07396907216494845, + "grad_norm": 7.775076866149902, + "learning_rate": 9.355625599561464e-06, + "loss": 0.5249, + "step": 5453 + }, + { + "epoch": 0.07398263700488335, + "grad_norm": 7.595400810241699, + "learning_rate": 9.35548855694121e-06, + "loss": 0.4619, + "step": 5454 + }, + { + "epoch": 0.07399620184481823, + "grad_norm": 5.921899318695068, + "learning_rate": 9.355351514320955e-06, + "loss": 0.4386, + "step": 5455 + }, + { + "epoch": 0.07400976668475312, + "grad_norm": 8.846597671508789, + "learning_rate": 9.3552144717007e-06, + "loss": 0.6024, + "step": 5456 + }, + { + "epoch": 0.07402333152468801, + "grad_norm": 7.952986717224121, + "learning_rate": 9.355077429080445e-06, + "loss": 0.5807, + "step": 5457 + }, + { + "epoch": 0.07403689636462289, + "grad_norm": 7.352505683898926, + "learning_rate": 9.35494038646019e-06, + "loss": 0.5195, + "step": 5458 + }, + { + "epoch": 0.07405046120455779, + "grad_norm": 7.050240516662598, + "learning_rate": 9.354803343839935e-06, + "loss": 0.3373, + "step": 5459 + }, + { + "epoch": 0.07406402604449268, + "grad_norm": 5.936134338378906, + "learning_rate": 9.35466630121968e-06, + "loss": 0.4923, + "step": 5460 + }, + { + "epoch": 0.07407759088442756, + "grad_norm": 7.05235481262207, + "learning_rate": 9.354529258599426e-06, + "loss": 0.4507, + "step": 5461 + }, + { + "epoch": 0.07409115572436245, + "grad_norm": 6.731334686279297, + "learning_rate": 9.35439221597917e-06, + "loss": 0.403, + "step": 5462 + }, + { + "epoch": 0.07410472056429734, + "grad_norm": 7.407378196716309, + "learning_rate": 9.354255173358914e-06, + "loss": 0.4771, + "step": 5463 + }, + { + "epoch": 0.07411828540423224, + "grad_norm": 9.32363510131836, + "learning_rate": 9.354118130738661e-06, + "loss": 0.6151, + "step": 5464 + }, + { + "epoch": 0.07413185024416712, + "grad_norm": 6.003567695617676, + "learning_rate": 9.353981088118406e-06, + "loss": 0.3539, + "step": 5465 + }, + { + "epoch": 0.07414541508410201, + "grad_norm": 5.596520900726318, + "learning_rate": 9.35384404549815e-06, + "loss": 0.3631, + "step": 5466 + }, + { + "epoch": 0.0741589799240369, + "grad_norm": 8.424854278564453, + "learning_rate": 9.353707002877895e-06, + "loss": 0.4294, + "step": 5467 + }, + { + "epoch": 0.07417254476397178, + "grad_norm": 6.019887924194336, + "learning_rate": 9.353569960257642e-06, + "loss": 0.3802, + "step": 5468 + }, + { + "epoch": 0.07418610960390667, + "grad_norm": 6.174359321594238, + "learning_rate": 9.353432917637385e-06, + "loss": 0.3467, + "step": 5469 + }, + { + "epoch": 0.07419967444384157, + "grad_norm": 7.855301856994629, + "learning_rate": 9.35329587501713e-06, + "loss": 0.4817, + "step": 5470 + }, + { + "epoch": 0.07421323928377645, + "grad_norm": 7.81881046295166, + "learning_rate": 9.353158832396876e-06, + "loss": 0.407, + "step": 5471 + }, + { + "epoch": 0.07422680412371134, + "grad_norm": 5.211912631988525, + "learning_rate": 9.353021789776621e-06, + "loss": 0.3673, + "step": 5472 + }, + { + "epoch": 0.07424036896364623, + "grad_norm": 7.935372352600098, + "learning_rate": 9.352884747156366e-06, + "loss": 0.4364, + "step": 5473 + }, + { + "epoch": 0.07425393380358111, + "grad_norm": 10.784521102905273, + "learning_rate": 9.352747704536111e-06, + "loss": 0.7188, + "step": 5474 + }, + { + "epoch": 0.074267498643516, + "grad_norm": 6.48038911819458, + "learning_rate": 9.352610661915857e-06, + "loss": 0.3318, + "step": 5475 + }, + { + "epoch": 0.0742810634834509, + "grad_norm": 4.265872478485107, + "learning_rate": 9.352473619295602e-06, + "loss": 0.2757, + "step": 5476 + }, + { + "epoch": 0.07429462832338578, + "grad_norm": 6.2655110359191895, + "learning_rate": 9.352336576675347e-06, + "loss": 0.4365, + "step": 5477 + }, + { + "epoch": 0.07430819316332067, + "grad_norm": 6.877660751342773, + "learning_rate": 9.352199534055092e-06, + "loss": 0.3831, + "step": 5478 + }, + { + "epoch": 0.07432175800325556, + "grad_norm": 5.738787651062012, + "learning_rate": 9.352062491434837e-06, + "loss": 0.3471, + "step": 5479 + }, + { + "epoch": 0.07433532284319046, + "grad_norm": 6.336819171905518, + "learning_rate": 9.351925448814582e-06, + "loss": 0.3498, + "step": 5480 + }, + { + "epoch": 0.07434888768312534, + "grad_norm": 8.17829418182373, + "learning_rate": 9.351788406194328e-06, + "loss": 0.4495, + "step": 5481 + }, + { + "epoch": 0.07436245252306023, + "grad_norm": 5.826980113983154, + "learning_rate": 9.351651363574073e-06, + "loss": 0.3067, + "step": 5482 + }, + { + "epoch": 0.07437601736299512, + "grad_norm": 7.134814262390137, + "learning_rate": 9.351514320953818e-06, + "loss": 0.3902, + "step": 5483 + }, + { + "epoch": 0.07438958220293, + "grad_norm": 6.391220569610596, + "learning_rate": 9.351377278333561e-06, + "loss": 0.3442, + "step": 5484 + }, + { + "epoch": 0.0744031470428649, + "grad_norm": 9.389643669128418, + "learning_rate": 9.351240235713308e-06, + "loss": 0.5295, + "step": 5485 + }, + { + "epoch": 0.07441671188279979, + "grad_norm": 6.2892985343933105, + "learning_rate": 9.351103193093054e-06, + "loss": 0.3376, + "step": 5486 + }, + { + "epoch": 0.07443027672273467, + "grad_norm": 9.812297821044922, + "learning_rate": 9.350966150472797e-06, + "loss": 0.4123, + "step": 5487 + }, + { + "epoch": 0.07444384156266956, + "grad_norm": 7.744887351989746, + "learning_rate": 9.350829107852542e-06, + "loss": 0.3946, + "step": 5488 + }, + { + "epoch": 0.07445740640260445, + "grad_norm": 5.885280132293701, + "learning_rate": 9.350692065232287e-06, + "loss": 0.268, + "step": 5489 + }, + { + "epoch": 0.07447097124253933, + "grad_norm": 8.085082054138184, + "learning_rate": 9.350555022612034e-06, + "loss": 0.5069, + "step": 5490 + }, + { + "epoch": 0.07448453608247423, + "grad_norm": 8.379477500915527, + "learning_rate": 9.350417979991778e-06, + "loss": 0.4574, + "step": 5491 + }, + { + "epoch": 0.07449810092240912, + "grad_norm": 5.472533702850342, + "learning_rate": 9.350280937371523e-06, + "loss": 0.3279, + "step": 5492 + }, + { + "epoch": 0.074511665762344, + "grad_norm": 4.462819576263428, + "learning_rate": 9.350143894751268e-06, + "loss": 0.3856, + "step": 5493 + }, + { + "epoch": 0.07452523060227889, + "grad_norm": 5.359703063964844, + "learning_rate": 9.350006852131013e-06, + "loss": 0.2326, + "step": 5494 + }, + { + "epoch": 0.07453879544221378, + "grad_norm": 6.527775764465332, + "learning_rate": 9.349869809510758e-06, + "loss": 0.3509, + "step": 5495 + }, + { + "epoch": 0.07455236028214868, + "grad_norm": 6.681503772735596, + "learning_rate": 9.349732766890504e-06, + "loss": 0.4265, + "step": 5496 + }, + { + "epoch": 0.07456592512208356, + "grad_norm": 6.34835958480835, + "learning_rate": 9.349595724270249e-06, + "loss": 0.4069, + "step": 5497 + }, + { + "epoch": 0.07457948996201845, + "grad_norm": 8.188531875610352, + "learning_rate": 9.349458681649994e-06, + "loss": 0.481, + "step": 5498 + }, + { + "epoch": 0.07459305480195334, + "grad_norm": 6.506580829620361, + "learning_rate": 9.34932163902974e-06, + "loss": 0.3781, + "step": 5499 + }, + { + "epoch": 0.07460661964188822, + "grad_norm": 8.049922943115234, + "learning_rate": 9.349184596409484e-06, + "loss": 0.4127, + "step": 5500 + }, + { + "epoch": 0.07462018448182312, + "grad_norm": 6.573831081390381, + "learning_rate": 9.34904755378923e-06, + "loss": 0.4403, + "step": 5501 + }, + { + "epoch": 0.07463374932175801, + "grad_norm": 9.937923431396484, + "learning_rate": 9.348910511168975e-06, + "loss": 0.5396, + "step": 5502 + }, + { + "epoch": 0.07464731416169289, + "grad_norm": 7.379303932189941, + "learning_rate": 9.34877346854872e-06, + "loss": 0.5902, + "step": 5503 + }, + { + "epoch": 0.07466087900162778, + "grad_norm": 5.726948261260986, + "learning_rate": 9.348636425928465e-06, + "loss": 0.2322, + "step": 5504 + }, + { + "epoch": 0.07467444384156267, + "grad_norm": 6.571889400482178, + "learning_rate": 9.34849938330821e-06, + "loss": 0.438, + "step": 5505 + }, + { + "epoch": 0.07468800868149755, + "grad_norm": 7.849034309387207, + "learning_rate": 9.348362340687954e-06, + "loss": 0.5921, + "step": 5506 + }, + { + "epoch": 0.07470157352143245, + "grad_norm": 8.672052383422852, + "learning_rate": 9.3482252980677e-06, + "loss": 0.6247, + "step": 5507 + }, + { + "epoch": 0.07471513836136734, + "grad_norm": 9.253410339355469, + "learning_rate": 9.348088255447446e-06, + "loss": 0.4984, + "step": 5508 + }, + { + "epoch": 0.07472870320130222, + "grad_norm": 6.790268898010254, + "learning_rate": 9.34795121282719e-06, + "loss": 0.3814, + "step": 5509 + }, + { + "epoch": 0.07474226804123711, + "grad_norm": 6.44460391998291, + "learning_rate": 9.347814170206934e-06, + "loss": 0.3098, + "step": 5510 + }, + { + "epoch": 0.074755832881172, + "grad_norm": 6.2560834884643555, + "learning_rate": 9.347677127586681e-06, + "loss": 0.3591, + "step": 5511 + }, + { + "epoch": 0.0747693977211069, + "grad_norm": 6.049785614013672, + "learning_rate": 9.347540084966425e-06, + "loss": 0.3241, + "step": 5512 + }, + { + "epoch": 0.07478296256104178, + "grad_norm": 7.192840576171875, + "learning_rate": 9.34740304234617e-06, + "loss": 0.4805, + "step": 5513 + }, + { + "epoch": 0.07479652740097667, + "grad_norm": 7.1252665519714355, + "learning_rate": 9.347265999725915e-06, + "loss": 0.3886, + "step": 5514 + }, + { + "epoch": 0.07481009224091156, + "grad_norm": 5.647397041320801, + "learning_rate": 9.34712895710566e-06, + "loss": 0.3371, + "step": 5515 + }, + { + "epoch": 0.07482365708084644, + "grad_norm": 6.288815021514893, + "learning_rate": 9.346991914485406e-06, + "loss": 0.3722, + "step": 5516 + }, + { + "epoch": 0.07483722192078134, + "grad_norm": 8.36303997039795, + "learning_rate": 9.34685487186515e-06, + "loss": 0.4094, + "step": 5517 + }, + { + "epoch": 0.07485078676071623, + "grad_norm": 7.300492763519287, + "learning_rate": 9.346717829244896e-06, + "loss": 0.3987, + "step": 5518 + }, + { + "epoch": 0.07486435160065111, + "grad_norm": 9.994107246398926, + "learning_rate": 9.346580786624641e-06, + "loss": 0.4823, + "step": 5519 + }, + { + "epoch": 0.074877916440586, + "grad_norm": 8.15617561340332, + "learning_rate": 9.346443744004386e-06, + "loss": 0.6181, + "step": 5520 + }, + { + "epoch": 0.0748914812805209, + "grad_norm": 7.02573299407959, + "learning_rate": 9.346306701384131e-06, + "loss": 0.4387, + "step": 5521 + }, + { + "epoch": 0.07490504612045577, + "grad_norm": 5.594031810760498, + "learning_rate": 9.346169658763877e-06, + "loss": 0.4854, + "step": 5522 + }, + { + "epoch": 0.07491861096039067, + "grad_norm": 6.2929158210754395, + "learning_rate": 9.346032616143622e-06, + "loss": 0.3818, + "step": 5523 + }, + { + "epoch": 0.07493217580032556, + "grad_norm": 7.547408103942871, + "learning_rate": 9.345895573523367e-06, + "loss": 0.4721, + "step": 5524 + }, + { + "epoch": 0.07494574064026044, + "grad_norm": 9.502342224121094, + "learning_rate": 9.345758530903112e-06, + "loss": 0.6484, + "step": 5525 + }, + { + "epoch": 0.07495930548019533, + "grad_norm": 6.876062393188477, + "learning_rate": 9.345621488282857e-06, + "loss": 0.3898, + "step": 5526 + }, + { + "epoch": 0.07497287032013022, + "grad_norm": 6.073515892028809, + "learning_rate": 9.3454844456626e-06, + "loss": 0.4339, + "step": 5527 + }, + { + "epoch": 0.07498643516006512, + "grad_norm": 8.393948554992676, + "learning_rate": 9.345347403042348e-06, + "loss": 0.4187, + "step": 5528 + }, + { + "epoch": 0.075, + "grad_norm": 6.996683597564697, + "learning_rate": 9.345210360422093e-06, + "loss": 0.4117, + "step": 5529 + }, + { + "epoch": 0.07501356483993489, + "grad_norm": 5.959164619445801, + "learning_rate": 9.345073317801836e-06, + "loss": 0.4125, + "step": 5530 + }, + { + "epoch": 0.07502712967986978, + "grad_norm": 8.840587615966797, + "learning_rate": 9.344936275181581e-06, + "loss": 0.4425, + "step": 5531 + }, + { + "epoch": 0.07504069451980466, + "grad_norm": 8.592796325683594, + "learning_rate": 9.344799232561327e-06, + "loss": 0.759, + "step": 5532 + }, + { + "epoch": 0.07505425935973956, + "grad_norm": 9.06377124786377, + "learning_rate": 9.344662189941074e-06, + "loss": 0.5656, + "step": 5533 + }, + { + "epoch": 0.07506782419967445, + "grad_norm": 9.402191162109375, + "learning_rate": 9.344525147320817e-06, + "loss": 0.5917, + "step": 5534 + }, + { + "epoch": 0.07508138903960933, + "grad_norm": 7.394493103027344, + "learning_rate": 9.344388104700562e-06, + "loss": 0.6701, + "step": 5535 + }, + { + "epoch": 0.07509495387954422, + "grad_norm": 8.980380058288574, + "learning_rate": 9.344251062080307e-06, + "loss": 0.5656, + "step": 5536 + }, + { + "epoch": 0.07510851871947911, + "grad_norm": 7.93637228012085, + "learning_rate": 9.344114019460053e-06, + "loss": 0.7571, + "step": 5537 + }, + { + "epoch": 0.075122083559414, + "grad_norm": 7.088751316070557, + "learning_rate": 9.343976976839798e-06, + "loss": 0.3777, + "step": 5538 + }, + { + "epoch": 0.07513564839934889, + "grad_norm": 7.463032245635986, + "learning_rate": 9.343839934219543e-06, + "loss": 0.5838, + "step": 5539 + }, + { + "epoch": 0.07514921323928378, + "grad_norm": 6.51787805557251, + "learning_rate": 9.343702891599288e-06, + "loss": 0.4199, + "step": 5540 + }, + { + "epoch": 0.07516277807921866, + "grad_norm": 7.082813262939453, + "learning_rate": 9.343565848979033e-06, + "loss": 0.339, + "step": 5541 + }, + { + "epoch": 0.07517634291915355, + "grad_norm": 6.463953495025635, + "learning_rate": 9.343428806358778e-06, + "loss": 0.435, + "step": 5542 + }, + { + "epoch": 0.07518990775908845, + "grad_norm": 6.448293685913086, + "learning_rate": 9.343291763738524e-06, + "loss": 0.5372, + "step": 5543 + }, + { + "epoch": 0.07520347259902334, + "grad_norm": 6.879991054534912, + "learning_rate": 9.343154721118269e-06, + "loss": 0.4719, + "step": 5544 + }, + { + "epoch": 0.07521703743895822, + "grad_norm": 6.628254413604736, + "learning_rate": 9.343017678498012e-06, + "loss": 0.3955, + "step": 5545 + }, + { + "epoch": 0.07523060227889311, + "grad_norm": 6.453292369842529, + "learning_rate": 9.34288063587776e-06, + "loss": 0.4081, + "step": 5546 + }, + { + "epoch": 0.075244167118828, + "grad_norm": 7.125642776489258, + "learning_rate": 9.342743593257504e-06, + "loss": 0.5052, + "step": 5547 + }, + { + "epoch": 0.07525773195876288, + "grad_norm": 5.661561489105225, + "learning_rate": 9.34260655063725e-06, + "loss": 0.4128, + "step": 5548 + }, + { + "epoch": 0.07527129679869778, + "grad_norm": 5.218578338623047, + "learning_rate": 9.342469508016993e-06, + "loss": 0.3503, + "step": 5549 + }, + { + "epoch": 0.07528486163863267, + "grad_norm": 9.136665344238281, + "learning_rate": 9.34233246539674e-06, + "loss": 0.5219, + "step": 5550 + }, + { + "epoch": 0.07529842647856755, + "grad_norm": 6.458992958068848, + "learning_rate": 9.342195422776485e-06, + "loss": 0.386, + "step": 5551 + }, + { + "epoch": 0.07531199131850244, + "grad_norm": 6.7415547370910645, + "learning_rate": 9.342058380156229e-06, + "loss": 0.4233, + "step": 5552 + }, + { + "epoch": 0.07532555615843733, + "grad_norm": 6.582118034362793, + "learning_rate": 9.341921337535974e-06, + "loss": 0.4681, + "step": 5553 + }, + { + "epoch": 0.07533912099837221, + "grad_norm": 8.609909057617188, + "learning_rate": 9.34178429491572e-06, + "loss": 0.3503, + "step": 5554 + }, + { + "epoch": 0.0753526858383071, + "grad_norm": 7.300134658813477, + "learning_rate": 9.341647252295464e-06, + "loss": 0.5748, + "step": 5555 + }, + { + "epoch": 0.075366250678242, + "grad_norm": 7.964286804199219, + "learning_rate": 9.34151020967521e-06, + "loss": 0.5295, + "step": 5556 + }, + { + "epoch": 0.07537981551817688, + "grad_norm": 8.639538764953613, + "learning_rate": 9.341373167054954e-06, + "loss": 0.6091, + "step": 5557 + }, + { + "epoch": 0.07539338035811177, + "grad_norm": 7.990708827972412, + "learning_rate": 9.3412361244347e-06, + "loss": 0.5819, + "step": 5558 + }, + { + "epoch": 0.07540694519804667, + "grad_norm": 7.007641315460205, + "learning_rate": 9.341099081814445e-06, + "loss": 0.6349, + "step": 5559 + }, + { + "epoch": 0.07542051003798156, + "grad_norm": 8.992569923400879, + "learning_rate": 9.34096203919419e-06, + "loss": 0.4201, + "step": 5560 + }, + { + "epoch": 0.07543407487791644, + "grad_norm": 8.493398666381836, + "learning_rate": 9.340824996573935e-06, + "loss": 0.4958, + "step": 5561 + }, + { + "epoch": 0.07544763971785133, + "grad_norm": 9.45741081237793, + "learning_rate": 9.34068795395368e-06, + "loss": 0.6258, + "step": 5562 + }, + { + "epoch": 0.07546120455778622, + "grad_norm": 6.528674125671387, + "learning_rate": 9.340550911333426e-06, + "loss": 0.4465, + "step": 5563 + }, + { + "epoch": 0.0754747693977211, + "grad_norm": 8.292900085449219, + "learning_rate": 9.34041386871317e-06, + "loss": 0.3797, + "step": 5564 + }, + { + "epoch": 0.075488334237656, + "grad_norm": 8.294775009155273, + "learning_rate": 9.340276826092916e-06, + "loss": 0.656, + "step": 5565 + }, + { + "epoch": 0.07550189907759089, + "grad_norm": 9.46193790435791, + "learning_rate": 9.340139783472661e-06, + "loss": 0.629, + "step": 5566 + }, + { + "epoch": 0.07551546391752577, + "grad_norm": 9.266633033752441, + "learning_rate": 9.340002740852406e-06, + "loss": 0.5229, + "step": 5567 + }, + { + "epoch": 0.07552902875746066, + "grad_norm": 9.787421226501465, + "learning_rate": 9.339865698232151e-06, + "loss": 0.5492, + "step": 5568 + }, + { + "epoch": 0.07554259359739555, + "grad_norm": 8.293182373046875, + "learning_rate": 9.339728655611897e-06, + "loss": 0.4787, + "step": 5569 + }, + { + "epoch": 0.07555615843733043, + "grad_norm": 7.99152135848999, + "learning_rate": 9.33959161299164e-06, + "loss": 0.5246, + "step": 5570 + }, + { + "epoch": 0.07556972327726533, + "grad_norm": 9.458121299743652, + "learning_rate": 9.339454570371387e-06, + "loss": 0.6619, + "step": 5571 + }, + { + "epoch": 0.07558328811720022, + "grad_norm": 7.905209541320801, + "learning_rate": 9.339317527751132e-06, + "loss": 0.5596, + "step": 5572 + }, + { + "epoch": 0.0755968529571351, + "grad_norm": 6.927152633666992, + "learning_rate": 9.339180485130877e-06, + "loss": 0.5253, + "step": 5573 + }, + { + "epoch": 0.07561041779706999, + "grad_norm": 7.462214469909668, + "learning_rate": 9.33904344251062e-06, + "loss": 0.5062, + "step": 5574 + }, + { + "epoch": 0.07562398263700489, + "grad_norm": 10.976558685302734, + "learning_rate": 9.338906399890366e-06, + "loss": 0.541, + "step": 5575 + }, + { + "epoch": 0.07563754747693978, + "grad_norm": 10.918133735656738, + "learning_rate": 9.338769357270113e-06, + "loss": 0.8168, + "step": 5576 + }, + { + "epoch": 0.07565111231687466, + "grad_norm": 6.584840297698975, + "learning_rate": 9.338632314649856e-06, + "loss": 0.4285, + "step": 5577 + }, + { + "epoch": 0.07566467715680955, + "grad_norm": 6.664927959442139, + "learning_rate": 9.338495272029602e-06, + "loss": 0.5322, + "step": 5578 + }, + { + "epoch": 0.07567824199674444, + "grad_norm": 8.408089637756348, + "learning_rate": 9.338358229409347e-06, + "loss": 0.4547, + "step": 5579 + }, + { + "epoch": 0.07569180683667932, + "grad_norm": 7.419883728027344, + "learning_rate": 9.338221186789092e-06, + "loss": 0.4491, + "step": 5580 + }, + { + "epoch": 0.07570537167661422, + "grad_norm": 8.039356231689453, + "learning_rate": 9.338084144168837e-06, + "loss": 0.5867, + "step": 5581 + }, + { + "epoch": 0.07571893651654911, + "grad_norm": 7.624300479888916, + "learning_rate": 9.337947101548582e-06, + "loss": 0.4764, + "step": 5582 + }, + { + "epoch": 0.07573250135648399, + "grad_norm": 5.7241363525390625, + "learning_rate": 9.337810058928327e-06, + "loss": 0.4624, + "step": 5583 + }, + { + "epoch": 0.07574606619641888, + "grad_norm": 7.553041458129883, + "learning_rate": 9.337673016308073e-06, + "loss": 0.5222, + "step": 5584 + }, + { + "epoch": 0.07575963103635378, + "grad_norm": 9.25890064239502, + "learning_rate": 9.337535973687818e-06, + "loss": 0.568, + "step": 5585 + }, + { + "epoch": 0.07577319587628865, + "grad_norm": 6.928241729736328, + "learning_rate": 9.337398931067563e-06, + "loss": 0.4168, + "step": 5586 + }, + { + "epoch": 0.07578676071622355, + "grad_norm": 7.996704578399658, + "learning_rate": 9.337261888447308e-06, + "loss": 0.4225, + "step": 5587 + }, + { + "epoch": 0.07580032555615844, + "grad_norm": 7.030759811401367, + "learning_rate": 9.337124845827053e-06, + "loss": 0.3397, + "step": 5588 + }, + { + "epoch": 0.07581389039609332, + "grad_norm": 6.895059585571289, + "learning_rate": 9.336987803206798e-06, + "loss": 0.4964, + "step": 5589 + }, + { + "epoch": 0.07582745523602821, + "grad_norm": 7.463877201080322, + "learning_rate": 9.336850760586544e-06, + "loss": 0.3781, + "step": 5590 + }, + { + "epoch": 0.0758410200759631, + "grad_norm": 5.155336380004883, + "learning_rate": 9.336713717966289e-06, + "loss": 0.4071, + "step": 5591 + }, + { + "epoch": 0.075854584915898, + "grad_norm": 8.240578651428223, + "learning_rate": 9.336576675346032e-06, + "loss": 0.5854, + "step": 5592 + }, + { + "epoch": 0.07586814975583288, + "grad_norm": 10.199996948242188, + "learning_rate": 9.33643963272578e-06, + "loss": 0.7208, + "step": 5593 + }, + { + "epoch": 0.07588171459576777, + "grad_norm": 7.2977142333984375, + "learning_rate": 9.336302590105524e-06, + "loss": 0.4726, + "step": 5594 + }, + { + "epoch": 0.07589527943570266, + "grad_norm": 6.186351299285889, + "learning_rate": 9.336165547485268e-06, + "loss": 0.4504, + "step": 5595 + }, + { + "epoch": 0.07590884427563754, + "grad_norm": 6.712085247039795, + "learning_rate": 9.336028504865013e-06, + "loss": 0.4487, + "step": 5596 + }, + { + "epoch": 0.07592240911557244, + "grad_norm": 8.517945289611816, + "learning_rate": 9.33589146224476e-06, + "loss": 0.43, + "step": 5597 + }, + { + "epoch": 0.07593597395550733, + "grad_norm": 6.679684162139893, + "learning_rate": 9.335754419624503e-06, + "loss": 0.3753, + "step": 5598 + }, + { + "epoch": 0.07594953879544221, + "grad_norm": 5.777098655700684, + "learning_rate": 9.335617377004249e-06, + "loss": 0.3154, + "step": 5599 + }, + { + "epoch": 0.0759631036353771, + "grad_norm": 7.810524940490723, + "learning_rate": 9.335480334383994e-06, + "loss": 0.4677, + "step": 5600 + }, + { + "epoch": 0.075976668475312, + "grad_norm": 5.986093044281006, + "learning_rate": 9.335343291763739e-06, + "loss": 0.3448, + "step": 5601 + }, + { + "epoch": 0.07599023331524687, + "grad_norm": 5.423357963562012, + "learning_rate": 9.335206249143484e-06, + "loss": 0.3255, + "step": 5602 + }, + { + "epoch": 0.07600379815518177, + "grad_norm": 7.192202568054199, + "learning_rate": 9.33506920652323e-06, + "loss": 0.3748, + "step": 5603 + }, + { + "epoch": 0.07601736299511666, + "grad_norm": 7.766848087310791, + "learning_rate": 9.334932163902974e-06, + "loss": 0.4521, + "step": 5604 + }, + { + "epoch": 0.07603092783505154, + "grad_norm": 8.132335662841797, + "learning_rate": 9.33479512128272e-06, + "loss": 0.5108, + "step": 5605 + }, + { + "epoch": 0.07604449267498643, + "grad_norm": 9.449626922607422, + "learning_rate": 9.334658078662465e-06, + "loss": 0.5962, + "step": 5606 + }, + { + "epoch": 0.07605805751492133, + "grad_norm": 7.415759563446045, + "learning_rate": 9.33452103604221e-06, + "loss": 0.5091, + "step": 5607 + }, + { + "epoch": 0.07607162235485622, + "grad_norm": 10.916349411010742, + "learning_rate": 9.334383993421955e-06, + "loss": 0.7554, + "step": 5608 + }, + { + "epoch": 0.0760851871947911, + "grad_norm": 10.949301719665527, + "learning_rate": 9.3342469508017e-06, + "loss": 0.6482, + "step": 5609 + }, + { + "epoch": 0.07609875203472599, + "grad_norm": 6.82996940612793, + "learning_rate": 9.334109908181446e-06, + "loss": 0.5572, + "step": 5610 + }, + { + "epoch": 0.07611231687466088, + "grad_norm": 5.560540199279785, + "learning_rate": 9.33397286556119e-06, + "loss": 0.321, + "step": 5611 + }, + { + "epoch": 0.07612588171459576, + "grad_norm": 8.450811386108398, + "learning_rate": 9.333835822940936e-06, + "loss": 0.4809, + "step": 5612 + }, + { + "epoch": 0.07613944655453066, + "grad_norm": 6.785044193267822, + "learning_rate": 9.33369878032068e-06, + "loss": 0.3873, + "step": 5613 + }, + { + "epoch": 0.07615301139446555, + "grad_norm": 6.853118419647217, + "learning_rate": 9.333561737700425e-06, + "loss": 0.4388, + "step": 5614 + }, + { + "epoch": 0.07616657623440043, + "grad_norm": 6.279328346252441, + "learning_rate": 9.333424695080171e-06, + "loss": 0.4931, + "step": 5615 + }, + { + "epoch": 0.07618014107433532, + "grad_norm": 7.503894805908203, + "learning_rate": 9.333287652459917e-06, + "loss": 0.5661, + "step": 5616 + }, + { + "epoch": 0.07619370591427022, + "grad_norm": 6.612764358520508, + "learning_rate": 9.33315060983966e-06, + "loss": 0.4336, + "step": 5617 + }, + { + "epoch": 0.0762072707542051, + "grad_norm": 10.332059860229492, + "learning_rate": 9.333013567219405e-06, + "loss": 0.9445, + "step": 5618 + }, + { + "epoch": 0.07622083559413999, + "grad_norm": 6.674472332000732, + "learning_rate": 9.332876524599152e-06, + "loss": 0.4822, + "step": 5619 + }, + { + "epoch": 0.07623440043407488, + "grad_norm": 8.014720916748047, + "learning_rate": 9.332739481978896e-06, + "loss": 0.6184, + "step": 5620 + }, + { + "epoch": 0.07624796527400976, + "grad_norm": 8.172510147094727, + "learning_rate": 9.33260243935864e-06, + "loss": 0.5272, + "step": 5621 + }, + { + "epoch": 0.07626153011394465, + "grad_norm": 6.460472106933594, + "learning_rate": 9.332465396738386e-06, + "loss": 0.381, + "step": 5622 + }, + { + "epoch": 0.07627509495387955, + "grad_norm": 9.465635299682617, + "learning_rate": 9.332328354118131e-06, + "loss": 0.7602, + "step": 5623 + }, + { + "epoch": 0.07628865979381444, + "grad_norm": 7.5900115966796875, + "learning_rate": 9.332191311497876e-06, + "loss": 0.5347, + "step": 5624 + }, + { + "epoch": 0.07630222463374932, + "grad_norm": 9.143546104431152, + "learning_rate": 9.332054268877622e-06, + "loss": 0.6366, + "step": 5625 + }, + { + "epoch": 0.07631578947368421, + "grad_norm": 6.836942672729492, + "learning_rate": 9.331917226257367e-06, + "loss": 0.2953, + "step": 5626 + }, + { + "epoch": 0.0763293543136191, + "grad_norm": 6.379964828491211, + "learning_rate": 9.331780183637112e-06, + "loss": 0.4679, + "step": 5627 + }, + { + "epoch": 0.07634291915355398, + "grad_norm": 6.510719299316406, + "learning_rate": 9.331643141016857e-06, + "loss": 0.3129, + "step": 5628 + }, + { + "epoch": 0.07635648399348888, + "grad_norm": 6.054704666137695, + "learning_rate": 9.331506098396602e-06, + "loss": 0.3492, + "step": 5629 + }, + { + "epoch": 0.07637004883342377, + "grad_norm": 6.944437503814697, + "learning_rate": 9.331369055776347e-06, + "loss": 0.4512, + "step": 5630 + }, + { + "epoch": 0.07638361367335865, + "grad_norm": 6.5511369705200195, + "learning_rate": 9.331232013156093e-06, + "loss": 0.5384, + "step": 5631 + }, + { + "epoch": 0.07639717851329354, + "grad_norm": 6.254457473754883, + "learning_rate": 9.331094970535838e-06, + "loss": 0.4517, + "step": 5632 + }, + { + "epoch": 0.07641074335322844, + "grad_norm": 6.634145259857178, + "learning_rate": 9.330957927915583e-06, + "loss": 0.3853, + "step": 5633 + }, + { + "epoch": 0.07642430819316332, + "grad_norm": 5.751414775848389, + "learning_rate": 9.330820885295328e-06, + "loss": 0.332, + "step": 5634 + }, + { + "epoch": 0.07643787303309821, + "grad_norm": 7.399209022521973, + "learning_rate": 9.330683842675072e-06, + "loss": 0.4944, + "step": 5635 + }, + { + "epoch": 0.0764514378730331, + "grad_norm": 6.720069885253906, + "learning_rate": 9.330546800054818e-06, + "loss": 0.4179, + "step": 5636 + }, + { + "epoch": 0.07646500271296798, + "grad_norm": 9.419465065002441, + "learning_rate": 9.330409757434564e-06, + "loss": 0.7127, + "step": 5637 + }, + { + "epoch": 0.07647856755290287, + "grad_norm": 7.941328525543213, + "learning_rate": 9.330272714814307e-06, + "loss": 0.5744, + "step": 5638 + }, + { + "epoch": 0.07649213239283777, + "grad_norm": 6.255306243896484, + "learning_rate": 9.330135672194052e-06, + "loss": 0.4448, + "step": 5639 + }, + { + "epoch": 0.07650569723277266, + "grad_norm": 5.894078731536865, + "learning_rate": 9.3299986295738e-06, + "loss": 0.4335, + "step": 5640 + }, + { + "epoch": 0.07651926207270754, + "grad_norm": 6.748673915863037, + "learning_rate": 9.329861586953544e-06, + "loss": 0.4579, + "step": 5641 + }, + { + "epoch": 0.07653282691264243, + "grad_norm": 8.168205261230469, + "learning_rate": 9.329724544333288e-06, + "loss": 0.5004, + "step": 5642 + }, + { + "epoch": 0.07654639175257733, + "grad_norm": 8.05321216583252, + "learning_rate": 9.329587501713033e-06, + "loss": 0.672, + "step": 5643 + }, + { + "epoch": 0.0765599565925122, + "grad_norm": 8.847975730895996, + "learning_rate": 9.329450459092778e-06, + "loss": 0.6043, + "step": 5644 + }, + { + "epoch": 0.0765735214324471, + "grad_norm": 7.880130767822266, + "learning_rate": 9.329313416472523e-06, + "loss": 0.5611, + "step": 5645 + }, + { + "epoch": 0.07658708627238199, + "grad_norm": 9.465781211853027, + "learning_rate": 9.329176373852269e-06, + "loss": 0.6926, + "step": 5646 + }, + { + "epoch": 0.07660065111231687, + "grad_norm": 7.936887741088867, + "learning_rate": 9.329039331232014e-06, + "loss": 0.5892, + "step": 5647 + }, + { + "epoch": 0.07661421595225176, + "grad_norm": 5.909241199493408, + "learning_rate": 9.328902288611759e-06, + "loss": 0.3783, + "step": 5648 + }, + { + "epoch": 0.07662778079218666, + "grad_norm": 5.635401248931885, + "learning_rate": 9.328765245991504e-06, + "loss": 0.4435, + "step": 5649 + }, + { + "epoch": 0.07664134563212154, + "grad_norm": 5.983108043670654, + "learning_rate": 9.32862820337125e-06, + "loss": 0.4104, + "step": 5650 + }, + { + "epoch": 0.07665491047205643, + "grad_norm": 8.282482147216797, + "learning_rate": 9.328491160750994e-06, + "loss": 0.5664, + "step": 5651 + }, + { + "epoch": 0.07666847531199132, + "grad_norm": 7.418560981750488, + "learning_rate": 9.32835411813074e-06, + "loss": 0.4928, + "step": 5652 + }, + { + "epoch": 0.0766820401519262, + "grad_norm": 5.894373416900635, + "learning_rate": 9.328217075510485e-06, + "loss": 0.4474, + "step": 5653 + }, + { + "epoch": 0.0766956049918611, + "grad_norm": 7.399281978607178, + "learning_rate": 9.32808003289023e-06, + "loss": 0.4258, + "step": 5654 + }, + { + "epoch": 0.07670916983179599, + "grad_norm": 5.136994361877441, + "learning_rate": 9.327942990269975e-06, + "loss": 0.3508, + "step": 5655 + }, + { + "epoch": 0.07672273467173088, + "grad_norm": 7.719845771789551, + "learning_rate": 9.32780594764972e-06, + "loss": 0.4642, + "step": 5656 + }, + { + "epoch": 0.07673629951166576, + "grad_norm": 6.12233829498291, + "learning_rate": 9.327668905029464e-06, + "loss": 0.4481, + "step": 5657 + }, + { + "epoch": 0.07674986435160065, + "grad_norm": 5.537606716156006, + "learning_rate": 9.32753186240921e-06, + "loss": 0.2965, + "step": 5658 + }, + { + "epoch": 0.07676342919153555, + "grad_norm": 8.27414321899414, + "learning_rate": 9.327394819788956e-06, + "loss": 0.5603, + "step": 5659 + }, + { + "epoch": 0.07677699403147042, + "grad_norm": 6.896991729736328, + "learning_rate": 9.3272577771687e-06, + "loss": 0.4677, + "step": 5660 + }, + { + "epoch": 0.07679055887140532, + "grad_norm": 8.679915428161621, + "learning_rate": 9.327120734548445e-06, + "loss": 0.4414, + "step": 5661 + }, + { + "epoch": 0.07680412371134021, + "grad_norm": 6.204893589019775, + "learning_rate": 9.326983691928191e-06, + "loss": 0.2856, + "step": 5662 + }, + { + "epoch": 0.07681768855127509, + "grad_norm": 5.730870246887207, + "learning_rate": 9.326846649307935e-06, + "loss": 0.3057, + "step": 5663 + }, + { + "epoch": 0.07683125339120998, + "grad_norm": 6.931346893310547, + "learning_rate": 9.32670960668768e-06, + "loss": 0.3556, + "step": 5664 + }, + { + "epoch": 0.07684481823114488, + "grad_norm": 7.997222423553467, + "learning_rate": 9.326572564067425e-06, + "loss": 0.5052, + "step": 5665 + }, + { + "epoch": 0.07685838307107976, + "grad_norm": 6.066256046295166, + "learning_rate": 9.326435521447172e-06, + "loss": 0.215, + "step": 5666 + }, + { + "epoch": 0.07687194791101465, + "grad_norm": 4.267405033111572, + "learning_rate": 9.326298478826916e-06, + "loss": 0.2927, + "step": 5667 + }, + { + "epoch": 0.07688551275094954, + "grad_norm": 8.280074119567871, + "learning_rate": 9.32616143620666e-06, + "loss": 0.5911, + "step": 5668 + }, + { + "epoch": 0.07689907759088442, + "grad_norm": 8.329846382141113, + "learning_rate": 9.326024393586406e-06, + "loss": 0.4982, + "step": 5669 + }, + { + "epoch": 0.07691264243081931, + "grad_norm": 9.168498992919922, + "learning_rate": 9.325887350966151e-06, + "loss": 0.5727, + "step": 5670 + }, + { + "epoch": 0.07692620727075421, + "grad_norm": 8.74516773223877, + "learning_rate": 9.325750308345896e-06, + "loss": 0.6798, + "step": 5671 + }, + { + "epoch": 0.0769397721106891, + "grad_norm": 5.375270366668701, + "learning_rate": 9.325613265725642e-06, + "loss": 0.258, + "step": 5672 + }, + { + "epoch": 0.07695333695062398, + "grad_norm": 5.242641925811768, + "learning_rate": 9.325476223105387e-06, + "loss": 0.3904, + "step": 5673 + }, + { + "epoch": 0.07696690179055887, + "grad_norm": 6.92979097366333, + "learning_rate": 9.325339180485132e-06, + "loss": 0.5907, + "step": 5674 + }, + { + "epoch": 0.07698046663049377, + "grad_norm": 7.370643138885498, + "learning_rate": 9.325202137864877e-06, + "loss": 0.2967, + "step": 5675 + }, + { + "epoch": 0.07699403147042864, + "grad_norm": 9.519560813903809, + "learning_rate": 9.325065095244622e-06, + "loss": 0.4929, + "step": 5676 + }, + { + "epoch": 0.07700759631036354, + "grad_norm": 5.444814682006836, + "learning_rate": 9.324928052624367e-06, + "loss": 0.2506, + "step": 5677 + }, + { + "epoch": 0.07702116115029843, + "grad_norm": 8.465434074401855, + "learning_rate": 9.324791010004111e-06, + "loss": 0.8218, + "step": 5678 + }, + { + "epoch": 0.07703472599023331, + "grad_norm": 6.251682758331299, + "learning_rate": 9.324653967383858e-06, + "loss": 0.3531, + "step": 5679 + }, + { + "epoch": 0.0770482908301682, + "grad_norm": 6.853287220001221, + "learning_rate": 9.324516924763603e-06, + "loss": 0.3736, + "step": 5680 + }, + { + "epoch": 0.0770618556701031, + "grad_norm": 4.857256889343262, + "learning_rate": 9.324379882143348e-06, + "loss": 0.2983, + "step": 5681 + }, + { + "epoch": 0.07707542051003798, + "grad_norm": 5.270391941070557, + "learning_rate": 9.324242839523092e-06, + "loss": 0.472, + "step": 5682 + }, + { + "epoch": 0.07708898534997287, + "grad_norm": 5.217006683349609, + "learning_rate": 9.324105796902837e-06, + "loss": 0.4913, + "step": 5683 + }, + { + "epoch": 0.07710255018990776, + "grad_norm": 7.181042671203613, + "learning_rate": 9.323968754282584e-06, + "loss": 0.3715, + "step": 5684 + }, + { + "epoch": 0.07711611502984264, + "grad_norm": 6.133905410766602, + "learning_rate": 9.323831711662327e-06, + "loss": 0.4545, + "step": 5685 + }, + { + "epoch": 0.07712967986977753, + "grad_norm": 4.267120361328125, + "learning_rate": 9.323694669042072e-06, + "loss": 0.4204, + "step": 5686 + }, + { + "epoch": 0.07714324470971243, + "grad_norm": 4.616540431976318, + "learning_rate": 9.323557626421818e-06, + "loss": 0.3054, + "step": 5687 + }, + { + "epoch": 0.07715680954964732, + "grad_norm": 8.330711364746094, + "learning_rate": 9.323420583801563e-06, + "loss": 0.5382, + "step": 5688 + }, + { + "epoch": 0.0771703743895822, + "grad_norm": 4.753369331359863, + "learning_rate": 9.323283541181308e-06, + "loss": 0.3366, + "step": 5689 + }, + { + "epoch": 0.07718393922951709, + "grad_norm": 6.189577579498291, + "learning_rate": 9.323146498561053e-06, + "loss": 0.3072, + "step": 5690 + }, + { + "epoch": 0.07719750406945199, + "grad_norm": 6.692112445831299, + "learning_rate": 9.323009455940798e-06, + "loss": 0.3387, + "step": 5691 + }, + { + "epoch": 0.07721106890938687, + "grad_norm": 6.594633102416992, + "learning_rate": 9.322872413320543e-06, + "loss": 0.3818, + "step": 5692 + }, + { + "epoch": 0.07722463374932176, + "grad_norm": 7.216969966888428, + "learning_rate": 9.322735370700289e-06, + "loss": 0.3669, + "step": 5693 + }, + { + "epoch": 0.07723819858925665, + "grad_norm": 5.913942337036133, + "learning_rate": 9.322598328080034e-06, + "loss": 0.3698, + "step": 5694 + }, + { + "epoch": 0.07725176342919153, + "grad_norm": 6.672762393951416, + "learning_rate": 9.322461285459779e-06, + "loss": 0.5071, + "step": 5695 + }, + { + "epoch": 0.07726532826912642, + "grad_norm": 9.726150512695312, + "learning_rate": 9.322324242839524e-06, + "loss": 0.4541, + "step": 5696 + }, + { + "epoch": 0.07727889310906132, + "grad_norm": 4.301455020904541, + "learning_rate": 9.32218720021927e-06, + "loss": 0.3089, + "step": 5697 + }, + { + "epoch": 0.0772924579489962, + "grad_norm": 5.7112908363342285, + "learning_rate": 9.322050157599015e-06, + "loss": 0.3713, + "step": 5698 + }, + { + "epoch": 0.07730602278893109, + "grad_norm": 5.084668159484863, + "learning_rate": 9.32191311497876e-06, + "loss": 0.2896, + "step": 5699 + }, + { + "epoch": 0.07731958762886598, + "grad_norm": 5.623495101928711, + "learning_rate": 9.321776072358503e-06, + "loss": 0.3481, + "step": 5700 + }, + { + "epoch": 0.07733315246880086, + "grad_norm": 6.3728108406066895, + "learning_rate": 9.32163902973825e-06, + "loss": 0.4295, + "step": 5701 + }, + { + "epoch": 0.07734671730873575, + "grad_norm": 5.3552374839782715, + "learning_rate": 9.321501987117995e-06, + "loss": 0.3053, + "step": 5702 + }, + { + "epoch": 0.07736028214867065, + "grad_norm": 7.502532005310059, + "learning_rate": 9.321364944497739e-06, + "loss": 0.5884, + "step": 5703 + }, + { + "epoch": 0.07737384698860554, + "grad_norm": 7.784605026245117, + "learning_rate": 9.321227901877484e-06, + "loss": 0.5498, + "step": 5704 + }, + { + "epoch": 0.07738741182854042, + "grad_norm": 7.458524703979492, + "learning_rate": 9.32109085925723e-06, + "loss": 0.4791, + "step": 5705 + }, + { + "epoch": 0.07740097666847531, + "grad_norm": 5.3284149169921875, + "learning_rate": 9.320953816636974e-06, + "loss": 0.4647, + "step": 5706 + }, + { + "epoch": 0.0774145415084102, + "grad_norm": 8.255356788635254, + "learning_rate": 9.32081677401672e-06, + "loss": 0.4594, + "step": 5707 + }, + { + "epoch": 0.07742810634834509, + "grad_norm": 4.589677333831787, + "learning_rate": 9.320679731396465e-06, + "loss": 0.3016, + "step": 5708 + }, + { + "epoch": 0.07744167118827998, + "grad_norm": 5.580909729003906, + "learning_rate": 9.320542688776211e-06, + "loss": 0.383, + "step": 5709 + }, + { + "epoch": 0.07745523602821487, + "grad_norm": 5.646219253540039, + "learning_rate": 9.320405646155955e-06, + "loss": 0.3397, + "step": 5710 + }, + { + "epoch": 0.07746880086814975, + "grad_norm": 6.87138557434082, + "learning_rate": 9.3202686035357e-06, + "loss": 0.4719, + "step": 5711 + }, + { + "epoch": 0.07748236570808464, + "grad_norm": 6.300363540649414, + "learning_rate": 9.320131560915445e-06, + "loss": 0.4539, + "step": 5712 + }, + { + "epoch": 0.07749593054801954, + "grad_norm": 5.478719234466553, + "learning_rate": 9.31999451829519e-06, + "loss": 0.2794, + "step": 5713 + }, + { + "epoch": 0.07750949538795442, + "grad_norm": 6.467944622039795, + "learning_rate": 9.319857475674936e-06, + "loss": 0.3637, + "step": 5714 + }, + { + "epoch": 0.07752306022788931, + "grad_norm": 5.155062675476074, + "learning_rate": 9.319720433054681e-06, + "loss": 0.3722, + "step": 5715 + }, + { + "epoch": 0.0775366250678242, + "grad_norm": 5.321759223937988, + "learning_rate": 9.319583390434426e-06, + "loss": 0.4636, + "step": 5716 + }, + { + "epoch": 0.07755018990775908, + "grad_norm": 5.5425333976745605, + "learning_rate": 9.319446347814171e-06, + "loss": 0.3092, + "step": 5717 + }, + { + "epoch": 0.07756375474769397, + "grad_norm": 6.155988693237305, + "learning_rate": 9.319309305193916e-06, + "loss": 0.4286, + "step": 5718 + }, + { + "epoch": 0.07757731958762887, + "grad_norm": 5.862438201904297, + "learning_rate": 9.319172262573662e-06, + "loss": 0.3632, + "step": 5719 + }, + { + "epoch": 0.07759088442756376, + "grad_norm": 6.416346549987793, + "learning_rate": 9.319035219953407e-06, + "loss": 0.4667, + "step": 5720 + }, + { + "epoch": 0.07760444926749864, + "grad_norm": 6.346175670623779, + "learning_rate": 9.31889817733315e-06, + "loss": 0.4058, + "step": 5721 + }, + { + "epoch": 0.07761801410743353, + "grad_norm": 5.597818374633789, + "learning_rate": 9.318761134712897e-06, + "loss": 0.4279, + "step": 5722 + }, + { + "epoch": 0.07763157894736843, + "grad_norm": 6.210233211517334, + "learning_rate": 9.318624092092642e-06, + "loss": 0.3866, + "step": 5723 + }, + { + "epoch": 0.0776451437873033, + "grad_norm": 7.871306896209717, + "learning_rate": 9.318487049472387e-06, + "loss": 0.4418, + "step": 5724 + }, + { + "epoch": 0.0776587086272382, + "grad_norm": 6.181976795196533, + "learning_rate": 9.318350006852131e-06, + "loss": 0.5683, + "step": 5725 + }, + { + "epoch": 0.07767227346717309, + "grad_norm": 7.252651691436768, + "learning_rate": 9.318212964231876e-06, + "loss": 0.3099, + "step": 5726 + }, + { + "epoch": 0.07768583830710797, + "grad_norm": 7.75531530380249, + "learning_rate": 9.318075921611623e-06, + "loss": 0.5871, + "step": 5727 + }, + { + "epoch": 0.07769940314704286, + "grad_norm": 7.096785068511963, + "learning_rate": 9.317938878991366e-06, + "loss": 0.5902, + "step": 5728 + }, + { + "epoch": 0.07771296798697776, + "grad_norm": 6.727564811706543, + "learning_rate": 9.317801836371112e-06, + "loss": 0.5454, + "step": 5729 + }, + { + "epoch": 0.07772653282691264, + "grad_norm": 9.612530708312988, + "learning_rate": 9.317664793750857e-06, + "loss": 0.4501, + "step": 5730 + }, + { + "epoch": 0.07774009766684753, + "grad_norm": 9.64505672454834, + "learning_rate": 9.317527751130602e-06, + "loss": 0.7554, + "step": 5731 + }, + { + "epoch": 0.07775366250678242, + "grad_norm": 7.458630561828613, + "learning_rate": 9.317390708510347e-06, + "loss": 0.5514, + "step": 5732 + }, + { + "epoch": 0.0777672273467173, + "grad_norm": 6.933701038360596, + "learning_rate": 9.317253665890092e-06, + "loss": 0.3921, + "step": 5733 + }, + { + "epoch": 0.0777807921866522, + "grad_norm": 6.751209735870361, + "learning_rate": 9.317116623269838e-06, + "loss": 0.4591, + "step": 5734 + }, + { + "epoch": 0.07779435702658709, + "grad_norm": 5.653160572052002, + "learning_rate": 9.316979580649583e-06, + "loss": 0.2992, + "step": 5735 + }, + { + "epoch": 0.07780792186652198, + "grad_norm": 7.52304744720459, + "learning_rate": 9.316842538029328e-06, + "loss": 0.4307, + "step": 5736 + }, + { + "epoch": 0.07782148670645686, + "grad_norm": 7.306817054748535, + "learning_rate": 9.316705495409073e-06, + "loss": 0.4694, + "step": 5737 + }, + { + "epoch": 0.07783505154639175, + "grad_norm": 6.652110576629639, + "learning_rate": 9.316568452788818e-06, + "loss": 0.4859, + "step": 5738 + }, + { + "epoch": 0.07784861638632665, + "grad_norm": 6.3302507400512695, + "learning_rate": 9.316431410168563e-06, + "loss": 0.4153, + "step": 5739 + }, + { + "epoch": 0.07786218122626153, + "grad_norm": 11.367889404296875, + "learning_rate": 9.316294367548309e-06, + "loss": 0.4689, + "step": 5740 + }, + { + "epoch": 0.07787574606619642, + "grad_norm": 5.250146865844727, + "learning_rate": 9.316157324928054e-06, + "loss": 0.3163, + "step": 5741 + }, + { + "epoch": 0.07788931090613131, + "grad_norm": 5.662080764770508, + "learning_rate": 9.316020282307799e-06, + "loss": 0.3199, + "step": 5742 + }, + { + "epoch": 0.07790287574606619, + "grad_norm": 4.832230091094971, + "learning_rate": 9.315883239687542e-06, + "loss": 0.3062, + "step": 5743 + }, + { + "epoch": 0.07791644058600108, + "grad_norm": 7.172216415405273, + "learning_rate": 9.31574619706729e-06, + "loss": 0.3448, + "step": 5744 + }, + { + "epoch": 0.07793000542593598, + "grad_norm": 5.626976013183594, + "learning_rate": 9.315609154447035e-06, + "loss": 0.3872, + "step": 5745 + }, + { + "epoch": 0.07794357026587086, + "grad_norm": 5.600011348724365, + "learning_rate": 9.315472111826778e-06, + "loss": 0.4118, + "step": 5746 + }, + { + "epoch": 0.07795713510580575, + "grad_norm": 6.383627891540527, + "learning_rate": 9.315335069206523e-06, + "loss": 0.4199, + "step": 5747 + }, + { + "epoch": 0.07797069994574064, + "grad_norm": 6.112245082855225, + "learning_rate": 9.31519802658627e-06, + "loss": 0.2488, + "step": 5748 + }, + { + "epoch": 0.07798426478567552, + "grad_norm": 9.794602394104004, + "learning_rate": 9.315060983966015e-06, + "loss": 0.5158, + "step": 5749 + }, + { + "epoch": 0.07799782962561042, + "grad_norm": 6.087313652038574, + "learning_rate": 9.314923941345759e-06, + "loss": 0.348, + "step": 5750 + }, + { + "epoch": 0.07801139446554531, + "grad_norm": 6.725555419921875, + "learning_rate": 9.314786898725504e-06, + "loss": 0.3212, + "step": 5751 + }, + { + "epoch": 0.0780249593054802, + "grad_norm": 7.051589012145996, + "learning_rate": 9.314649856105249e-06, + "loss": 0.331, + "step": 5752 + }, + { + "epoch": 0.07803852414541508, + "grad_norm": 6.443873405456543, + "learning_rate": 9.314512813484994e-06, + "loss": 0.3138, + "step": 5753 + }, + { + "epoch": 0.07805208898534997, + "grad_norm": 7.028903484344482, + "learning_rate": 9.31437577086474e-06, + "loss": 0.3694, + "step": 5754 + }, + { + "epoch": 0.07806565382528487, + "grad_norm": 8.241239547729492, + "learning_rate": 9.314238728244485e-06, + "loss": 0.575, + "step": 5755 + }, + { + "epoch": 0.07807921866521975, + "grad_norm": 6.073769569396973, + "learning_rate": 9.31410168562423e-06, + "loss": 0.324, + "step": 5756 + }, + { + "epoch": 0.07809278350515464, + "grad_norm": 6.878951549530029, + "learning_rate": 9.313964643003975e-06, + "loss": 0.3843, + "step": 5757 + }, + { + "epoch": 0.07810634834508953, + "grad_norm": 7.164097785949707, + "learning_rate": 9.31382760038372e-06, + "loss": 0.4187, + "step": 5758 + }, + { + "epoch": 0.07811991318502441, + "grad_norm": 8.029729843139648, + "learning_rate": 9.313690557763465e-06, + "loss": 0.4543, + "step": 5759 + }, + { + "epoch": 0.0781334780249593, + "grad_norm": 5.422436714172363, + "learning_rate": 9.31355351514321e-06, + "loss": 0.3994, + "step": 5760 + }, + { + "epoch": 0.0781470428648942, + "grad_norm": 8.481468200683594, + "learning_rate": 9.313416472522956e-06, + "loss": 0.4349, + "step": 5761 + }, + { + "epoch": 0.07816060770482908, + "grad_norm": 10.871066093444824, + "learning_rate": 9.313279429902701e-06, + "loss": 0.6141, + "step": 5762 + }, + { + "epoch": 0.07817417254476397, + "grad_norm": 5.691043853759766, + "learning_rate": 9.313142387282446e-06, + "loss": 0.4024, + "step": 5763 + }, + { + "epoch": 0.07818773738469886, + "grad_norm": 6.06660795211792, + "learning_rate": 9.313005344662191e-06, + "loss": 0.3526, + "step": 5764 + }, + { + "epoch": 0.07820130222463374, + "grad_norm": 6.044121265411377, + "learning_rate": 9.312868302041935e-06, + "loss": 0.4961, + "step": 5765 + }, + { + "epoch": 0.07821486706456864, + "grad_norm": 7.829460144042969, + "learning_rate": 9.312731259421682e-06, + "loss": 0.5327, + "step": 5766 + }, + { + "epoch": 0.07822843190450353, + "grad_norm": 6.582695484161377, + "learning_rate": 9.312594216801427e-06, + "loss": 0.4799, + "step": 5767 + }, + { + "epoch": 0.07824199674443842, + "grad_norm": 7.461744785308838, + "learning_rate": 9.31245717418117e-06, + "loss": 0.6009, + "step": 5768 + }, + { + "epoch": 0.0782555615843733, + "grad_norm": 5.9144463539123535, + "learning_rate": 9.312320131560915e-06, + "loss": 0.4502, + "step": 5769 + }, + { + "epoch": 0.0782691264243082, + "grad_norm": 5.842122554779053, + "learning_rate": 9.312183088940662e-06, + "loss": 0.5135, + "step": 5770 + }, + { + "epoch": 0.07828269126424309, + "grad_norm": 6.616742134094238, + "learning_rate": 9.312046046320406e-06, + "loss": 0.3463, + "step": 5771 + }, + { + "epoch": 0.07829625610417797, + "grad_norm": 8.289814949035645, + "learning_rate": 9.311909003700151e-06, + "loss": 0.4491, + "step": 5772 + }, + { + "epoch": 0.07830982094411286, + "grad_norm": 7.775735378265381, + "learning_rate": 9.311771961079896e-06, + "loss": 0.5066, + "step": 5773 + }, + { + "epoch": 0.07832338578404775, + "grad_norm": 8.03044319152832, + "learning_rate": 9.311634918459643e-06, + "loss": 0.5491, + "step": 5774 + }, + { + "epoch": 0.07833695062398263, + "grad_norm": 7.205140113830566, + "learning_rate": 9.311497875839387e-06, + "loss": 0.4925, + "step": 5775 + }, + { + "epoch": 0.07835051546391752, + "grad_norm": 6.0792036056518555, + "learning_rate": 9.311360833219132e-06, + "loss": 0.398, + "step": 5776 + }, + { + "epoch": 0.07836408030385242, + "grad_norm": 8.689724922180176, + "learning_rate": 9.311223790598877e-06, + "loss": 0.5289, + "step": 5777 + }, + { + "epoch": 0.0783776451437873, + "grad_norm": 5.530374050140381, + "learning_rate": 9.311086747978622e-06, + "loss": 0.2988, + "step": 5778 + }, + { + "epoch": 0.07839120998372219, + "grad_norm": 6.380061626434326, + "learning_rate": 9.310949705358367e-06, + "loss": 0.3975, + "step": 5779 + }, + { + "epoch": 0.07840477482365708, + "grad_norm": 6.461102485656738, + "learning_rate": 9.310812662738112e-06, + "loss": 0.4611, + "step": 5780 + }, + { + "epoch": 0.07841833966359196, + "grad_norm": 6.837863922119141, + "learning_rate": 9.310675620117858e-06, + "loss": 0.4362, + "step": 5781 + }, + { + "epoch": 0.07843190450352686, + "grad_norm": 7.854555606842041, + "learning_rate": 9.310538577497603e-06, + "loss": 0.4701, + "step": 5782 + }, + { + "epoch": 0.07844546934346175, + "grad_norm": 8.626256942749023, + "learning_rate": 9.310401534877348e-06, + "loss": 0.5245, + "step": 5783 + }, + { + "epoch": 0.07845903418339664, + "grad_norm": 7.626916885375977, + "learning_rate": 9.310264492257093e-06, + "loss": 0.4941, + "step": 5784 + }, + { + "epoch": 0.07847259902333152, + "grad_norm": 6.258981704711914, + "learning_rate": 9.310127449636838e-06, + "loss": 0.4235, + "step": 5785 + }, + { + "epoch": 0.07848616386326641, + "grad_norm": 7.809725761413574, + "learning_rate": 9.309990407016582e-06, + "loss": 0.4901, + "step": 5786 + }, + { + "epoch": 0.07849972870320131, + "grad_norm": 7.651247024536133, + "learning_rate": 9.309853364396329e-06, + "loss": 0.6151, + "step": 5787 + }, + { + "epoch": 0.07851329354313619, + "grad_norm": 10.644505500793457, + "learning_rate": 9.309716321776074e-06, + "loss": 0.6824, + "step": 5788 + }, + { + "epoch": 0.07852685838307108, + "grad_norm": 5.4550957679748535, + "learning_rate": 9.309579279155819e-06, + "loss": 0.3797, + "step": 5789 + }, + { + "epoch": 0.07854042322300597, + "grad_norm": 5.412901401519775, + "learning_rate": 9.309442236535562e-06, + "loss": 0.4274, + "step": 5790 + }, + { + "epoch": 0.07855398806294085, + "grad_norm": 7.172584056854248, + "learning_rate": 9.30930519391531e-06, + "loss": 0.4625, + "step": 5791 + }, + { + "epoch": 0.07856755290287575, + "grad_norm": 5.476522445678711, + "learning_rate": 9.309168151295055e-06, + "loss": 0.359, + "step": 5792 + }, + { + "epoch": 0.07858111774281064, + "grad_norm": 7.727237224578857, + "learning_rate": 9.309031108674798e-06, + "loss": 0.531, + "step": 5793 + }, + { + "epoch": 0.07859468258274552, + "grad_norm": 7.868569850921631, + "learning_rate": 9.308894066054543e-06, + "loss": 0.4653, + "step": 5794 + }, + { + "epoch": 0.07860824742268041, + "grad_norm": 5.158108711242676, + "learning_rate": 9.308757023434288e-06, + "loss": 0.3064, + "step": 5795 + }, + { + "epoch": 0.0786218122626153, + "grad_norm": 7.59096097946167, + "learning_rate": 9.308619980814034e-06, + "loss": 0.5217, + "step": 5796 + }, + { + "epoch": 0.07863537710255018, + "grad_norm": 5.487745761871338, + "learning_rate": 9.308482938193779e-06, + "loss": 0.4115, + "step": 5797 + }, + { + "epoch": 0.07864894194248508, + "grad_norm": 6.163300514221191, + "learning_rate": 9.308345895573524e-06, + "loss": 0.3926, + "step": 5798 + }, + { + "epoch": 0.07866250678241997, + "grad_norm": 7.577374458312988, + "learning_rate": 9.308208852953269e-06, + "loss": 0.4139, + "step": 5799 + }, + { + "epoch": 0.07867607162235486, + "grad_norm": 5.636563777923584, + "learning_rate": 9.308071810333014e-06, + "loss": 0.4434, + "step": 5800 + }, + { + "epoch": 0.07868963646228974, + "grad_norm": 6.777174472808838, + "learning_rate": 9.30793476771276e-06, + "loss": 0.5933, + "step": 5801 + }, + { + "epoch": 0.07870320130222463, + "grad_norm": 7.4136834144592285, + "learning_rate": 9.307797725092505e-06, + "loss": 0.4929, + "step": 5802 + }, + { + "epoch": 0.07871676614215953, + "grad_norm": 6.381161212921143, + "learning_rate": 9.30766068247225e-06, + "loss": 0.4622, + "step": 5803 + }, + { + "epoch": 0.07873033098209441, + "grad_norm": 8.842148780822754, + "learning_rate": 9.307523639851995e-06, + "loss": 0.6304, + "step": 5804 + }, + { + "epoch": 0.0787438958220293, + "grad_norm": 7.961173057556152, + "learning_rate": 9.30738659723174e-06, + "loss": 0.4819, + "step": 5805 + }, + { + "epoch": 0.0787574606619642, + "grad_norm": 7.602076530456543, + "learning_rate": 9.307249554611485e-06, + "loss": 0.6002, + "step": 5806 + }, + { + "epoch": 0.07877102550189907, + "grad_norm": 6.379063129425049, + "learning_rate": 9.30711251199123e-06, + "loss": 0.3821, + "step": 5807 + }, + { + "epoch": 0.07878459034183397, + "grad_norm": 7.215455055236816, + "learning_rate": 9.306975469370974e-06, + "loss": 0.3239, + "step": 5808 + }, + { + "epoch": 0.07879815518176886, + "grad_norm": 5.471070289611816, + "learning_rate": 9.306838426750721e-06, + "loss": 0.3893, + "step": 5809 + }, + { + "epoch": 0.07881172002170374, + "grad_norm": 6.14961051940918, + "learning_rate": 9.306701384130466e-06, + "loss": 0.4363, + "step": 5810 + }, + { + "epoch": 0.07882528486163863, + "grad_norm": 9.33968448638916, + "learning_rate": 9.30656434151021e-06, + "loss": 0.6062, + "step": 5811 + }, + { + "epoch": 0.07883884970157352, + "grad_norm": 5.688039302825928, + "learning_rate": 9.306427298889955e-06, + "loss": 0.4319, + "step": 5812 + }, + { + "epoch": 0.07885241454150842, + "grad_norm": 6.09004020690918, + "learning_rate": 9.306290256269702e-06, + "loss": 0.279, + "step": 5813 + }, + { + "epoch": 0.0788659793814433, + "grad_norm": 6.6904296875, + "learning_rate": 9.306153213649445e-06, + "loss": 0.415, + "step": 5814 + }, + { + "epoch": 0.07887954422137819, + "grad_norm": 5.589902400970459, + "learning_rate": 9.30601617102919e-06, + "loss": 0.3899, + "step": 5815 + }, + { + "epoch": 0.07889310906131308, + "grad_norm": 6.3662800788879395, + "learning_rate": 9.305879128408935e-06, + "loss": 0.4761, + "step": 5816 + }, + { + "epoch": 0.07890667390124796, + "grad_norm": 5.125332355499268, + "learning_rate": 9.305742085788682e-06, + "loss": 0.2931, + "step": 5817 + }, + { + "epoch": 0.07892023874118285, + "grad_norm": 5.513177871704102, + "learning_rate": 9.305605043168426e-06, + "loss": 0.3567, + "step": 5818 + }, + { + "epoch": 0.07893380358111775, + "grad_norm": 6.107165813446045, + "learning_rate": 9.305468000548171e-06, + "loss": 0.3611, + "step": 5819 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 7.082075119018555, + "learning_rate": 9.305330957927916e-06, + "loss": 0.4489, + "step": 5820 + }, + { + "epoch": 0.07896093326098752, + "grad_norm": 4.847632884979248, + "learning_rate": 9.305193915307661e-06, + "loss": 0.3693, + "step": 5821 + }, + { + "epoch": 0.07897449810092241, + "grad_norm": 5.7258195877075195, + "learning_rate": 9.305056872687407e-06, + "loss": 0.2895, + "step": 5822 + }, + { + "epoch": 0.07898806294085729, + "grad_norm": 7.688549041748047, + "learning_rate": 9.304919830067152e-06, + "loss": 0.5933, + "step": 5823 + }, + { + "epoch": 0.07900162778079219, + "grad_norm": 7.567252159118652, + "learning_rate": 9.304782787446897e-06, + "loss": 0.3919, + "step": 5824 + }, + { + "epoch": 0.07901519262072708, + "grad_norm": 5.830328941345215, + "learning_rate": 9.304645744826642e-06, + "loss": 0.3946, + "step": 5825 + }, + { + "epoch": 0.07902875746066196, + "grad_norm": 7.1246657371521, + "learning_rate": 9.304508702206387e-06, + "loss": 0.3642, + "step": 5826 + }, + { + "epoch": 0.07904232230059685, + "grad_norm": 6.394053936004639, + "learning_rate": 9.304371659586132e-06, + "loss": 0.3562, + "step": 5827 + }, + { + "epoch": 0.07905588714053174, + "grad_norm": 8.098448753356934, + "learning_rate": 9.304234616965878e-06, + "loss": 0.3711, + "step": 5828 + }, + { + "epoch": 0.07906945198046664, + "grad_norm": 6.150120735168457, + "learning_rate": 9.304097574345621e-06, + "loss": 0.3713, + "step": 5829 + }, + { + "epoch": 0.07908301682040152, + "grad_norm": 6.533813953399658, + "learning_rate": 9.303960531725368e-06, + "loss": 0.346, + "step": 5830 + }, + { + "epoch": 0.07909658166033641, + "grad_norm": 5.402994632720947, + "learning_rate": 9.303823489105113e-06, + "loss": 0.3022, + "step": 5831 + }, + { + "epoch": 0.0791101465002713, + "grad_norm": 5.838409900665283, + "learning_rate": 9.303686446484858e-06, + "loss": 0.3552, + "step": 5832 + }, + { + "epoch": 0.07912371134020618, + "grad_norm": 7.16883659362793, + "learning_rate": 9.303549403864602e-06, + "loss": 0.3491, + "step": 5833 + }, + { + "epoch": 0.07913727618014108, + "grad_norm": 5.773194789886475, + "learning_rate": 9.303412361244347e-06, + "loss": 0.4527, + "step": 5834 + }, + { + "epoch": 0.07915084102007597, + "grad_norm": 5.87509822845459, + "learning_rate": 9.303275318624094e-06, + "loss": 0.3477, + "step": 5835 + }, + { + "epoch": 0.07916440586001085, + "grad_norm": 6.574087619781494, + "learning_rate": 9.303138276003837e-06, + "loss": 0.422, + "step": 5836 + }, + { + "epoch": 0.07917797069994574, + "grad_norm": 6.1598591804504395, + "learning_rate": 9.303001233383583e-06, + "loss": 0.3144, + "step": 5837 + }, + { + "epoch": 0.07919153553988063, + "grad_norm": 8.534311294555664, + "learning_rate": 9.302864190763328e-06, + "loss": 0.4805, + "step": 5838 + }, + { + "epoch": 0.07920510037981551, + "grad_norm": 6.305806636810303, + "learning_rate": 9.302727148143073e-06, + "loss": 0.298, + "step": 5839 + }, + { + "epoch": 0.0792186652197504, + "grad_norm": 7.1886210441589355, + "learning_rate": 9.302590105522818e-06, + "loss": 0.4923, + "step": 5840 + }, + { + "epoch": 0.0792322300596853, + "grad_norm": 5.123056411743164, + "learning_rate": 9.302453062902563e-06, + "loss": 0.2947, + "step": 5841 + }, + { + "epoch": 0.07924579489962018, + "grad_norm": 5.131861686706543, + "learning_rate": 9.302316020282308e-06, + "loss": 0.259, + "step": 5842 + }, + { + "epoch": 0.07925935973955507, + "grad_norm": 7.8960676193237305, + "learning_rate": 9.302178977662054e-06, + "loss": 0.5609, + "step": 5843 + }, + { + "epoch": 0.07927292457948996, + "grad_norm": 6.249050617218018, + "learning_rate": 9.302041935041799e-06, + "loss": 0.3695, + "step": 5844 + }, + { + "epoch": 0.07928648941942486, + "grad_norm": 6.410242557525635, + "learning_rate": 9.301904892421544e-06, + "loss": 0.403, + "step": 5845 + }, + { + "epoch": 0.07930005425935974, + "grad_norm": 5.9863786697387695, + "learning_rate": 9.301767849801289e-06, + "loss": 0.3158, + "step": 5846 + }, + { + "epoch": 0.07931361909929463, + "grad_norm": 7.570390224456787, + "learning_rate": 9.301630807181034e-06, + "loss": 0.3927, + "step": 5847 + }, + { + "epoch": 0.07932718393922952, + "grad_norm": 6.073945045471191, + "learning_rate": 9.30149376456078e-06, + "loss": 0.3534, + "step": 5848 + }, + { + "epoch": 0.0793407487791644, + "grad_norm": 6.447844982147217, + "learning_rate": 9.301356721940525e-06, + "loss": 0.385, + "step": 5849 + }, + { + "epoch": 0.0793543136190993, + "grad_norm": 6.498836517333984, + "learning_rate": 9.30121967932027e-06, + "loss": 0.4174, + "step": 5850 + }, + { + "epoch": 0.07936787845903419, + "grad_norm": 5.99136209487915, + "learning_rate": 9.301082636700013e-06, + "loss": 0.3926, + "step": 5851 + }, + { + "epoch": 0.07938144329896907, + "grad_norm": 6.905040740966797, + "learning_rate": 9.30094559407976e-06, + "loss": 0.4497, + "step": 5852 + }, + { + "epoch": 0.07939500813890396, + "grad_norm": 5.003359794616699, + "learning_rate": 9.300808551459505e-06, + "loss": 0.2691, + "step": 5853 + }, + { + "epoch": 0.07940857297883885, + "grad_norm": 5.160627841949463, + "learning_rate": 9.300671508839249e-06, + "loss": 0.2694, + "step": 5854 + }, + { + "epoch": 0.07942213781877373, + "grad_norm": 8.097390174865723, + "learning_rate": 9.300534466218994e-06, + "loss": 0.3491, + "step": 5855 + }, + { + "epoch": 0.07943570265870863, + "grad_norm": 5.463443279266357, + "learning_rate": 9.300397423598741e-06, + "loss": 0.2629, + "step": 5856 + }, + { + "epoch": 0.07944926749864352, + "grad_norm": 6.406078338623047, + "learning_rate": 9.300260380978486e-06, + "loss": 0.4555, + "step": 5857 + }, + { + "epoch": 0.0794628323385784, + "grad_norm": 5.0304646492004395, + "learning_rate": 9.30012333835823e-06, + "loss": 0.3392, + "step": 5858 + }, + { + "epoch": 0.07947639717851329, + "grad_norm": 7.990777969360352, + "learning_rate": 9.299986295737975e-06, + "loss": 0.3921, + "step": 5859 + }, + { + "epoch": 0.07948996201844818, + "grad_norm": 5.737520217895508, + "learning_rate": 9.299849253117722e-06, + "loss": 0.4398, + "step": 5860 + }, + { + "epoch": 0.07950352685838308, + "grad_norm": 6.27367639541626, + "learning_rate": 9.299712210497465e-06, + "loss": 0.3552, + "step": 5861 + }, + { + "epoch": 0.07951709169831796, + "grad_norm": 6.777464866638184, + "learning_rate": 9.29957516787721e-06, + "loss": 0.3864, + "step": 5862 + }, + { + "epoch": 0.07953065653825285, + "grad_norm": 4.802700519561768, + "learning_rate": 9.299438125256955e-06, + "loss": 0.2226, + "step": 5863 + }, + { + "epoch": 0.07954422137818774, + "grad_norm": 5.395394325256348, + "learning_rate": 9.2993010826367e-06, + "loss": 0.2713, + "step": 5864 + }, + { + "epoch": 0.07955778621812262, + "grad_norm": 5.742776870727539, + "learning_rate": 9.299164040016446e-06, + "loss": 0.3854, + "step": 5865 + }, + { + "epoch": 0.07957135105805752, + "grad_norm": 5.93611478805542, + "learning_rate": 9.299026997396191e-06, + "loss": 0.3229, + "step": 5866 + }, + { + "epoch": 0.07958491589799241, + "grad_norm": 9.107505798339844, + "learning_rate": 9.298889954775936e-06, + "loss": 0.5433, + "step": 5867 + }, + { + "epoch": 0.07959848073792729, + "grad_norm": 5.589808940887451, + "learning_rate": 9.298752912155681e-06, + "loss": 0.3513, + "step": 5868 + }, + { + "epoch": 0.07961204557786218, + "grad_norm": 5.592137336730957, + "learning_rate": 9.298615869535427e-06, + "loss": 0.378, + "step": 5869 + }, + { + "epoch": 0.07962561041779707, + "grad_norm": 9.454977989196777, + "learning_rate": 9.298478826915172e-06, + "loss": 0.4719, + "step": 5870 + }, + { + "epoch": 0.07963917525773195, + "grad_norm": 5.227596759796143, + "learning_rate": 9.298341784294917e-06, + "loss": 0.3069, + "step": 5871 + }, + { + "epoch": 0.07965274009766685, + "grad_norm": 10.01084041595459, + "learning_rate": 9.298204741674662e-06, + "loss": 0.4922, + "step": 5872 + }, + { + "epoch": 0.07966630493760174, + "grad_norm": 8.69778823852539, + "learning_rate": 9.298067699054407e-06, + "loss": 0.4907, + "step": 5873 + }, + { + "epoch": 0.07967986977753662, + "grad_norm": 6.665412902832031, + "learning_rate": 9.297930656434152e-06, + "loss": 0.4568, + "step": 5874 + }, + { + "epoch": 0.07969343461747151, + "grad_norm": 7.773486614227295, + "learning_rate": 9.297793613813898e-06, + "loss": 0.3365, + "step": 5875 + }, + { + "epoch": 0.0797069994574064, + "grad_norm": 6.9045233726501465, + "learning_rate": 9.297656571193641e-06, + "loss": 0.4403, + "step": 5876 + }, + { + "epoch": 0.0797205642973413, + "grad_norm": 8.937496185302734, + "learning_rate": 9.297519528573386e-06, + "loss": 0.5166, + "step": 5877 + }, + { + "epoch": 0.07973412913727618, + "grad_norm": 6.471545219421387, + "learning_rate": 9.297382485953133e-06, + "loss": 0.3928, + "step": 5878 + }, + { + "epoch": 0.07974769397721107, + "grad_norm": 6.477344512939453, + "learning_rate": 9.297245443332877e-06, + "loss": 0.3265, + "step": 5879 + }, + { + "epoch": 0.07976125881714596, + "grad_norm": 6.22018575668335, + "learning_rate": 9.297108400712622e-06, + "loss": 0.3617, + "step": 5880 + }, + { + "epoch": 0.07977482365708084, + "grad_norm": 6.94932222366333, + "learning_rate": 9.296971358092367e-06, + "loss": 0.3849, + "step": 5881 + }, + { + "epoch": 0.07978838849701574, + "grad_norm": 6.974071025848389, + "learning_rate": 9.296834315472112e-06, + "loss": 0.4391, + "step": 5882 + }, + { + "epoch": 0.07980195333695063, + "grad_norm": 9.682135581970215, + "learning_rate": 9.296697272851857e-06, + "loss": 0.3802, + "step": 5883 + }, + { + "epoch": 0.07981551817688551, + "grad_norm": 5.655999660491943, + "learning_rate": 9.296560230231603e-06, + "loss": 0.3144, + "step": 5884 + }, + { + "epoch": 0.0798290830168204, + "grad_norm": 7.247800350189209, + "learning_rate": 9.296423187611348e-06, + "loss": 0.4173, + "step": 5885 + }, + { + "epoch": 0.0798426478567553, + "grad_norm": 7.115022659301758, + "learning_rate": 9.296286144991093e-06, + "loss": 0.439, + "step": 5886 + }, + { + "epoch": 0.07985621269669017, + "grad_norm": 9.679914474487305, + "learning_rate": 9.296149102370838e-06, + "loss": 0.576, + "step": 5887 + }, + { + "epoch": 0.07986977753662507, + "grad_norm": 7.323347568511963, + "learning_rate": 9.296012059750583e-06, + "loss": 0.4238, + "step": 5888 + }, + { + "epoch": 0.07988334237655996, + "grad_norm": 8.901277542114258, + "learning_rate": 9.295875017130328e-06, + "loss": 0.3392, + "step": 5889 + }, + { + "epoch": 0.07989690721649484, + "grad_norm": 5.552343368530273, + "learning_rate": 9.295737974510074e-06, + "loss": 0.3932, + "step": 5890 + }, + { + "epoch": 0.07991047205642973, + "grad_norm": 7.533151149749756, + "learning_rate": 9.295600931889819e-06, + "loss": 0.4461, + "step": 5891 + }, + { + "epoch": 0.07992403689636463, + "grad_norm": 7.560209274291992, + "learning_rate": 9.295463889269564e-06, + "loss": 0.4556, + "step": 5892 + }, + { + "epoch": 0.07993760173629952, + "grad_norm": 6.850104808807373, + "learning_rate": 9.295326846649309e-06, + "loss": 0.4201, + "step": 5893 + }, + { + "epoch": 0.0799511665762344, + "grad_norm": 7.063788414001465, + "learning_rate": 9.295189804029053e-06, + "loss": 0.4076, + "step": 5894 + }, + { + "epoch": 0.07996473141616929, + "grad_norm": 8.999375343322754, + "learning_rate": 9.2950527614088e-06, + "loss": 0.8643, + "step": 5895 + }, + { + "epoch": 0.07997829625610418, + "grad_norm": 8.158772468566895, + "learning_rate": 9.294915718788545e-06, + "loss": 0.4004, + "step": 5896 + }, + { + "epoch": 0.07999186109603906, + "grad_norm": 7.329830646514893, + "learning_rate": 9.294778676168288e-06, + "loss": 0.3515, + "step": 5897 + }, + { + "epoch": 0.08000542593597396, + "grad_norm": 6.9633307456970215, + "learning_rate": 9.294641633548033e-06, + "loss": 0.3896, + "step": 5898 + }, + { + "epoch": 0.08001899077590885, + "grad_norm": 10.144608497619629, + "learning_rate": 9.29450459092778e-06, + "loss": 0.4581, + "step": 5899 + }, + { + "epoch": 0.08003255561584373, + "grad_norm": 8.1076078414917, + "learning_rate": 9.294367548307525e-06, + "loss": 0.5964, + "step": 5900 + }, + { + "epoch": 0.08004612045577862, + "grad_norm": 7.228064060211182, + "learning_rate": 9.294230505687269e-06, + "loss": 0.3553, + "step": 5901 + }, + { + "epoch": 0.08005968529571351, + "grad_norm": 7.47061014175415, + "learning_rate": 9.294093463067014e-06, + "loss": 0.4266, + "step": 5902 + }, + { + "epoch": 0.0800732501356484, + "grad_norm": 5.846071243286133, + "learning_rate": 9.29395642044676e-06, + "loss": 0.3742, + "step": 5903 + }, + { + "epoch": 0.08008681497558329, + "grad_norm": 8.855623245239258, + "learning_rate": 9.293819377826504e-06, + "loss": 0.5149, + "step": 5904 + }, + { + "epoch": 0.08010037981551818, + "grad_norm": 7.206357479095459, + "learning_rate": 9.29368233520625e-06, + "loss": 0.5357, + "step": 5905 + }, + { + "epoch": 0.08011394465545306, + "grad_norm": 5.053602695465088, + "learning_rate": 9.293545292585995e-06, + "loss": 0.3443, + "step": 5906 + }, + { + "epoch": 0.08012750949538795, + "grad_norm": 7.7107133865356445, + "learning_rate": 9.29340824996574e-06, + "loss": 0.5076, + "step": 5907 + }, + { + "epoch": 0.08014107433532285, + "grad_norm": 5.697502136230469, + "learning_rate": 9.293271207345485e-06, + "loss": 0.3691, + "step": 5908 + }, + { + "epoch": 0.08015463917525774, + "grad_norm": 6.672901630401611, + "learning_rate": 9.29313416472523e-06, + "loss": 0.3647, + "step": 5909 + }, + { + "epoch": 0.08016820401519262, + "grad_norm": 8.371508598327637, + "learning_rate": 9.292997122104975e-06, + "loss": 0.4835, + "step": 5910 + }, + { + "epoch": 0.08018176885512751, + "grad_norm": 7.232750415802002, + "learning_rate": 9.29286007948472e-06, + "loss": 0.396, + "step": 5911 + }, + { + "epoch": 0.0801953336950624, + "grad_norm": 7.772430419921875, + "learning_rate": 9.292723036864466e-06, + "loss": 0.4429, + "step": 5912 + }, + { + "epoch": 0.08020889853499728, + "grad_norm": 10.908804893493652, + "learning_rate": 9.292585994244211e-06, + "loss": 0.5592, + "step": 5913 + }, + { + "epoch": 0.08022246337493218, + "grad_norm": 6.88921594619751, + "learning_rate": 9.292448951623956e-06, + "loss": 0.4152, + "step": 5914 + }, + { + "epoch": 0.08023602821486707, + "grad_norm": 6.53413200378418, + "learning_rate": 9.292311909003701e-06, + "loss": 0.4636, + "step": 5915 + }, + { + "epoch": 0.08024959305480195, + "grad_norm": 6.98728609085083, + "learning_rate": 9.292174866383445e-06, + "loss": 0.5834, + "step": 5916 + }, + { + "epoch": 0.08026315789473684, + "grad_norm": 6.972382545471191, + "learning_rate": 9.292037823763192e-06, + "loss": 0.4915, + "step": 5917 + }, + { + "epoch": 0.08027672273467173, + "grad_norm": 6.109014511108398, + "learning_rate": 9.291900781142937e-06, + "loss": 0.3519, + "step": 5918 + }, + { + "epoch": 0.08029028757460661, + "grad_norm": 6.935121059417725, + "learning_rate": 9.29176373852268e-06, + "loss": 0.5954, + "step": 5919 + }, + { + "epoch": 0.08030385241454151, + "grad_norm": 4.856385231018066, + "learning_rate": 9.291626695902426e-06, + "loss": 0.324, + "step": 5920 + }, + { + "epoch": 0.0803174172544764, + "grad_norm": 6.718207359313965, + "learning_rate": 9.291489653282172e-06, + "loss": 0.4243, + "step": 5921 + }, + { + "epoch": 0.08033098209441128, + "grad_norm": 7.454409599304199, + "learning_rate": 9.291352610661916e-06, + "loss": 0.3508, + "step": 5922 + }, + { + "epoch": 0.08034454693434617, + "grad_norm": 6.359622955322266, + "learning_rate": 9.291215568041661e-06, + "loss": 0.3864, + "step": 5923 + }, + { + "epoch": 0.08035811177428107, + "grad_norm": 6.2808380126953125, + "learning_rate": 9.291078525421406e-06, + "loss": 0.3505, + "step": 5924 + }, + { + "epoch": 0.08037167661421596, + "grad_norm": 6.294378757476807, + "learning_rate": 9.290941482801153e-06, + "loss": 0.3719, + "step": 5925 + }, + { + "epoch": 0.08038524145415084, + "grad_norm": 6.147368907928467, + "learning_rate": 9.290804440180897e-06, + "loss": 0.4217, + "step": 5926 + }, + { + "epoch": 0.08039880629408573, + "grad_norm": 7.251278877258301, + "learning_rate": 9.290667397560642e-06, + "loss": 0.5946, + "step": 5927 + }, + { + "epoch": 0.08041237113402062, + "grad_norm": 6.4567670822143555, + "learning_rate": 9.290530354940387e-06, + "loss": 0.5966, + "step": 5928 + }, + { + "epoch": 0.0804259359739555, + "grad_norm": 7.372433185577393, + "learning_rate": 9.290393312320132e-06, + "loss": 0.3691, + "step": 5929 + }, + { + "epoch": 0.0804395008138904, + "grad_norm": 6.451076030731201, + "learning_rate": 9.290256269699877e-06, + "loss": 0.413, + "step": 5930 + }, + { + "epoch": 0.08045306565382529, + "grad_norm": 6.687310695648193, + "learning_rate": 9.290119227079623e-06, + "loss": 0.6526, + "step": 5931 + }, + { + "epoch": 0.08046663049376017, + "grad_norm": 5.984297275543213, + "learning_rate": 9.289982184459368e-06, + "loss": 0.3313, + "step": 5932 + }, + { + "epoch": 0.08048019533369506, + "grad_norm": 8.584175109863281, + "learning_rate": 9.289845141839113e-06, + "loss": 0.636, + "step": 5933 + }, + { + "epoch": 0.08049376017362996, + "grad_norm": 4.6923828125, + "learning_rate": 9.289708099218858e-06, + "loss": 0.3024, + "step": 5934 + }, + { + "epoch": 0.08050732501356483, + "grad_norm": 5.2096734046936035, + "learning_rate": 9.289571056598603e-06, + "loss": 0.4632, + "step": 5935 + }, + { + "epoch": 0.08052088985349973, + "grad_norm": 8.152766227722168, + "learning_rate": 9.289434013978348e-06, + "loss": 0.4172, + "step": 5936 + }, + { + "epoch": 0.08053445469343462, + "grad_norm": 4.33236837387085, + "learning_rate": 9.289296971358092e-06, + "loss": 0.2142, + "step": 5937 + }, + { + "epoch": 0.0805480195333695, + "grad_norm": 5.85233736038208, + "learning_rate": 9.289159928737839e-06, + "loss": 0.3797, + "step": 5938 + }, + { + "epoch": 0.08056158437330439, + "grad_norm": 8.55955696105957, + "learning_rate": 9.289022886117584e-06, + "loss": 0.4977, + "step": 5939 + }, + { + "epoch": 0.08057514921323929, + "grad_norm": 6.945651054382324, + "learning_rate": 9.28888584349733e-06, + "loss": 0.4655, + "step": 5940 + }, + { + "epoch": 0.08058871405317418, + "grad_norm": 6.646910190582275, + "learning_rate": 9.288748800877073e-06, + "loss": 0.2605, + "step": 5941 + }, + { + "epoch": 0.08060227889310906, + "grad_norm": 6.876661777496338, + "learning_rate": 9.28861175825682e-06, + "loss": 0.3291, + "step": 5942 + }, + { + "epoch": 0.08061584373304395, + "grad_norm": 7.182325839996338, + "learning_rate": 9.288474715636565e-06, + "loss": 0.4077, + "step": 5943 + }, + { + "epoch": 0.08062940857297884, + "grad_norm": 7.485424995422363, + "learning_rate": 9.288337673016308e-06, + "loss": 0.4214, + "step": 5944 + }, + { + "epoch": 0.08064297341291372, + "grad_norm": 4.918610095977783, + "learning_rate": 9.288200630396053e-06, + "loss": 0.3169, + "step": 5945 + }, + { + "epoch": 0.08065653825284862, + "grad_norm": 10.007776260375977, + "learning_rate": 9.288063587775799e-06, + "loss": 0.4256, + "step": 5946 + }, + { + "epoch": 0.08067010309278351, + "grad_norm": 7.39008092880249, + "learning_rate": 9.287926545155544e-06, + "loss": 0.4945, + "step": 5947 + }, + { + "epoch": 0.08068366793271839, + "grad_norm": 6.747727394104004, + "learning_rate": 9.287789502535289e-06, + "loss": 0.3933, + "step": 5948 + }, + { + "epoch": 0.08069723277265328, + "grad_norm": 6.0362138748168945, + "learning_rate": 9.287652459915034e-06, + "loss": 0.4724, + "step": 5949 + }, + { + "epoch": 0.08071079761258818, + "grad_norm": 7.549953460693359, + "learning_rate": 9.28751541729478e-06, + "loss": 0.4874, + "step": 5950 + }, + { + "epoch": 0.08072436245252305, + "grad_norm": 4.614605903625488, + "learning_rate": 9.287378374674524e-06, + "loss": 0.2949, + "step": 5951 + }, + { + "epoch": 0.08073792729245795, + "grad_norm": 5.358865737915039, + "learning_rate": 9.28724133205427e-06, + "loss": 0.3405, + "step": 5952 + }, + { + "epoch": 0.08075149213239284, + "grad_norm": 6.549921035766602, + "learning_rate": 9.287104289434015e-06, + "loss": 0.4289, + "step": 5953 + }, + { + "epoch": 0.08076505697232772, + "grad_norm": 6.666123867034912, + "learning_rate": 9.28696724681376e-06, + "loss": 0.3671, + "step": 5954 + }, + { + "epoch": 0.08077862181226261, + "grad_norm": 4.828982353210449, + "learning_rate": 9.286830204193505e-06, + "loss": 0.275, + "step": 5955 + }, + { + "epoch": 0.0807921866521975, + "grad_norm": 5.587350368499756, + "learning_rate": 9.28669316157325e-06, + "loss": 0.2844, + "step": 5956 + }, + { + "epoch": 0.0808057514921324, + "grad_norm": 6.715699195861816, + "learning_rate": 9.286556118952996e-06, + "loss": 0.3879, + "step": 5957 + }, + { + "epoch": 0.08081931633206728, + "grad_norm": 5.179941654205322, + "learning_rate": 9.28641907633274e-06, + "loss": 0.3127, + "step": 5958 + }, + { + "epoch": 0.08083288117200217, + "grad_norm": 6.41291618347168, + "learning_rate": 9.286282033712484e-06, + "loss": 0.3665, + "step": 5959 + }, + { + "epoch": 0.08084644601193706, + "grad_norm": 4.751774311065674, + "learning_rate": 9.286144991092231e-06, + "loss": 0.271, + "step": 5960 + }, + { + "epoch": 0.08086001085187194, + "grad_norm": 8.641523361206055, + "learning_rate": 9.286007948471976e-06, + "loss": 0.485, + "step": 5961 + }, + { + "epoch": 0.08087357569180684, + "grad_norm": 7.247644901275635, + "learning_rate": 9.28587090585172e-06, + "loss": 0.4093, + "step": 5962 + }, + { + "epoch": 0.08088714053174173, + "grad_norm": 7.019279479980469, + "learning_rate": 9.285733863231465e-06, + "loss": 0.3565, + "step": 5963 + }, + { + "epoch": 0.08090070537167661, + "grad_norm": 7.704058647155762, + "learning_rate": 9.285596820611212e-06, + "loss": 0.4695, + "step": 5964 + }, + { + "epoch": 0.0809142702116115, + "grad_norm": 5.942479610443115, + "learning_rate": 9.285459777990957e-06, + "loss": 0.5045, + "step": 5965 + }, + { + "epoch": 0.0809278350515464, + "grad_norm": 5.7898640632629395, + "learning_rate": 9.2853227353707e-06, + "loss": 0.3607, + "step": 5966 + }, + { + "epoch": 0.08094139989148127, + "grad_norm": 4.923793315887451, + "learning_rate": 9.285185692750446e-06, + "loss": 0.3329, + "step": 5967 + }, + { + "epoch": 0.08095496473141617, + "grad_norm": 9.048243522644043, + "learning_rate": 9.285048650130192e-06, + "loss": 0.5604, + "step": 5968 + }, + { + "epoch": 0.08096852957135106, + "grad_norm": 5.594561576843262, + "learning_rate": 9.284911607509936e-06, + "loss": 0.5441, + "step": 5969 + }, + { + "epoch": 0.08098209441128594, + "grad_norm": 5.752919673919678, + "learning_rate": 9.284774564889681e-06, + "loss": 0.452, + "step": 5970 + }, + { + "epoch": 0.08099565925122083, + "grad_norm": 8.716838836669922, + "learning_rate": 9.284637522269426e-06, + "loss": 0.5196, + "step": 5971 + }, + { + "epoch": 0.08100922409115573, + "grad_norm": 5.756099700927734, + "learning_rate": 9.284500479649171e-06, + "loss": 0.3725, + "step": 5972 + }, + { + "epoch": 0.08102278893109062, + "grad_norm": 4.061513423919678, + "learning_rate": 9.284363437028917e-06, + "loss": 0.3267, + "step": 5973 + }, + { + "epoch": 0.0810363537710255, + "grad_norm": 8.029759407043457, + "learning_rate": 9.284226394408662e-06, + "loss": 0.57, + "step": 5974 + }, + { + "epoch": 0.08104991861096039, + "grad_norm": 6.886002063751221, + "learning_rate": 9.284089351788407e-06, + "loss": 0.5673, + "step": 5975 + }, + { + "epoch": 0.08106348345089529, + "grad_norm": 7.145142078399658, + "learning_rate": 9.283952309168152e-06, + "loss": 0.3973, + "step": 5976 + }, + { + "epoch": 0.08107704829083016, + "grad_norm": 5.073121070861816, + "learning_rate": 9.283815266547897e-06, + "loss": 0.3745, + "step": 5977 + }, + { + "epoch": 0.08109061313076506, + "grad_norm": 4.851253986358643, + "learning_rate": 9.283678223927643e-06, + "loss": 0.3238, + "step": 5978 + }, + { + "epoch": 0.08110417797069995, + "grad_norm": 6.248661994934082, + "learning_rate": 9.283541181307388e-06, + "loss": 0.33, + "step": 5979 + }, + { + "epoch": 0.08111774281063483, + "grad_norm": 9.505630493164062, + "learning_rate": 9.283404138687133e-06, + "loss": 0.5249, + "step": 5980 + }, + { + "epoch": 0.08113130765056972, + "grad_norm": 6.676036834716797, + "learning_rate": 9.283267096066878e-06, + "loss": 0.3988, + "step": 5981 + }, + { + "epoch": 0.08114487249050462, + "grad_norm": 6.027435779571533, + "learning_rate": 9.283130053446623e-06, + "loss": 0.3132, + "step": 5982 + }, + { + "epoch": 0.0811584373304395, + "grad_norm": 5.747859954833984, + "learning_rate": 9.282993010826368e-06, + "loss": 0.3554, + "step": 5983 + }, + { + "epoch": 0.08117200217037439, + "grad_norm": 7.979687213897705, + "learning_rate": 9.282855968206112e-06, + "loss": 0.4749, + "step": 5984 + }, + { + "epoch": 0.08118556701030928, + "grad_norm": 6.928042888641357, + "learning_rate": 9.282718925585857e-06, + "loss": 0.3338, + "step": 5985 + }, + { + "epoch": 0.08119913185024416, + "grad_norm": 7.120676517486572, + "learning_rate": 9.282581882965604e-06, + "loss": 0.3754, + "step": 5986 + }, + { + "epoch": 0.08121269669017905, + "grad_norm": 6.75504207611084, + "learning_rate": 9.282444840345347e-06, + "loss": 0.3654, + "step": 5987 + }, + { + "epoch": 0.08122626153011395, + "grad_norm": 7.501988410949707, + "learning_rate": 9.282307797725093e-06, + "loss": 0.51, + "step": 5988 + }, + { + "epoch": 0.08123982637004884, + "grad_norm": 5.094061851501465, + "learning_rate": 9.282170755104838e-06, + "loss": 0.4243, + "step": 5989 + }, + { + "epoch": 0.08125339120998372, + "grad_norm": 7.7070536613464355, + "learning_rate": 9.282033712484583e-06, + "loss": 0.3208, + "step": 5990 + }, + { + "epoch": 0.08126695604991861, + "grad_norm": 7.08338737487793, + "learning_rate": 9.281896669864328e-06, + "loss": 0.5609, + "step": 5991 + }, + { + "epoch": 0.0812805208898535, + "grad_norm": 7.908126354217529, + "learning_rate": 9.281759627244073e-06, + "loss": 0.3916, + "step": 5992 + }, + { + "epoch": 0.08129408572978838, + "grad_norm": 6.6103925704956055, + "learning_rate": 9.281622584623819e-06, + "loss": 0.3167, + "step": 5993 + }, + { + "epoch": 0.08130765056972328, + "grad_norm": 7.257528305053711, + "learning_rate": 9.281485542003564e-06, + "loss": 0.5432, + "step": 5994 + }, + { + "epoch": 0.08132121540965817, + "grad_norm": 5.412655353546143, + "learning_rate": 9.281348499383309e-06, + "loss": 0.3902, + "step": 5995 + }, + { + "epoch": 0.08133478024959305, + "grad_norm": 9.151079177856445, + "learning_rate": 9.281211456763054e-06, + "loss": 0.5151, + "step": 5996 + }, + { + "epoch": 0.08134834508952794, + "grad_norm": 9.346158027648926, + "learning_rate": 9.2810744141428e-06, + "loss": 0.5357, + "step": 5997 + }, + { + "epoch": 0.08136190992946284, + "grad_norm": 7.081592082977295, + "learning_rate": 9.280937371522544e-06, + "loss": 0.4624, + "step": 5998 + }, + { + "epoch": 0.08137547476939772, + "grad_norm": 5.856240749359131, + "learning_rate": 9.28080032890229e-06, + "loss": 0.4626, + "step": 5999 + }, + { + "epoch": 0.08138903960933261, + "grad_norm": 5.2702107429504395, + "learning_rate": 9.280663286282035e-06, + "loss": 0.4062, + "step": 6000 + }, + { + "epoch": 0.0814026044492675, + "grad_norm": 6.4666643142700195, + "learning_rate": 9.28052624366178e-06, + "loss": 0.3661, + "step": 6001 + }, + { + "epoch": 0.08141616928920238, + "grad_norm": 8.835675239562988, + "learning_rate": 9.280389201041523e-06, + "loss": 0.5676, + "step": 6002 + }, + { + "epoch": 0.08142973412913727, + "grad_norm": 5.4928364753723145, + "learning_rate": 9.28025215842127e-06, + "loss": 0.4143, + "step": 6003 + }, + { + "epoch": 0.08144329896907217, + "grad_norm": 7.848579406738281, + "learning_rate": 9.280115115801016e-06, + "loss": 0.5458, + "step": 6004 + }, + { + "epoch": 0.08145686380900706, + "grad_norm": 5.464904308319092, + "learning_rate": 9.279978073180759e-06, + "loss": 0.4176, + "step": 6005 + }, + { + "epoch": 0.08147042864894194, + "grad_norm": 7.962911605834961, + "learning_rate": 9.279841030560504e-06, + "loss": 0.5249, + "step": 6006 + }, + { + "epoch": 0.08148399348887683, + "grad_norm": 6.671018600463867, + "learning_rate": 9.279703987940251e-06, + "loss": 0.4121, + "step": 6007 + }, + { + "epoch": 0.08149755832881173, + "grad_norm": 5.402932643890381, + "learning_rate": 9.279566945319996e-06, + "loss": 0.385, + "step": 6008 + }, + { + "epoch": 0.0815111231687466, + "grad_norm": 5.74050760269165, + "learning_rate": 9.27942990269974e-06, + "loss": 0.4242, + "step": 6009 + }, + { + "epoch": 0.0815246880086815, + "grad_norm": 7.29135274887085, + "learning_rate": 9.279292860079485e-06, + "loss": 0.4772, + "step": 6010 + }, + { + "epoch": 0.08153825284861639, + "grad_norm": 5.914697170257568, + "learning_rate": 9.279155817459232e-06, + "loss": 0.3416, + "step": 6011 + }, + { + "epoch": 0.08155181768855127, + "grad_norm": 6.153561592102051, + "learning_rate": 9.279018774838975e-06, + "loss": 0.4026, + "step": 6012 + }, + { + "epoch": 0.08156538252848616, + "grad_norm": 5.489953994750977, + "learning_rate": 9.27888173221872e-06, + "loss": 0.4484, + "step": 6013 + }, + { + "epoch": 0.08157894736842106, + "grad_norm": 7.335266590118408, + "learning_rate": 9.278744689598466e-06, + "loss": 0.4477, + "step": 6014 + }, + { + "epoch": 0.08159251220835594, + "grad_norm": 6.282310962677002, + "learning_rate": 9.27860764697821e-06, + "loss": 0.4318, + "step": 6015 + }, + { + "epoch": 0.08160607704829083, + "grad_norm": 5.783750057220459, + "learning_rate": 9.278470604357956e-06, + "loss": 0.3962, + "step": 6016 + }, + { + "epoch": 0.08161964188822572, + "grad_norm": 7.278753280639648, + "learning_rate": 9.278333561737701e-06, + "loss": 0.4476, + "step": 6017 + }, + { + "epoch": 0.0816332067281606, + "grad_norm": 6.151552677154541, + "learning_rate": 9.278196519117446e-06, + "loss": 0.3628, + "step": 6018 + }, + { + "epoch": 0.0816467715680955, + "grad_norm": 6.142226696014404, + "learning_rate": 9.278059476497192e-06, + "loss": 0.5141, + "step": 6019 + }, + { + "epoch": 0.08166033640803039, + "grad_norm": 7.560679912567139, + "learning_rate": 9.277922433876937e-06, + "loss": 0.4242, + "step": 6020 + }, + { + "epoch": 0.08167390124796528, + "grad_norm": 8.84732437133789, + "learning_rate": 9.277785391256682e-06, + "loss": 0.6015, + "step": 6021 + }, + { + "epoch": 0.08168746608790016, + "grad_norm": 6.05810022354126, + "learning_rate": 9.277648348636427e-06, + "loss": 0.4975, + "step": 6022 + }, + { + "epoch": 0.08170103092783505, + "grad_norm": 6.393009662628174, + "learning_rate": 9.277511306016172e-06, + "loss": 0.5438, + "step": 6023 + }, + { + "epoch": 0.08171459576776995, + "grad_norm": 7.1799116134643555, + "learning_rate": 9.277374263395917e-06, + "loss": 0.4365, + "step": 6024 + }, + { + "epoch": 0.08172816060770483, + "grad_norm": 8.492961883544922, + "learning_rate": 9.277237220775663e-06, + "loss": 0.424, + "step": 6025 + }, + { + "epoch": 0.08174172544763972, + "grad_norm": 5.348793029785156, + "learning_rate": 9.277100178155408e-06, + "loss": 0.3081, + "step": 6026 + }, + { + "epoch": 0.08175529028757461, + "grad_norm": 6.386971473693848, + "learning_rate": 9.276963135535151e-06, + "loss": 0.4382, + "step": 6027 + }, + { + "epoch": 0.08176885512750949, + "grad_norm": 4.901118278503418, + "learning_rate": 9.276826092914896e-06, + "loss": 0.3863, + "step": 6028 + }, + { + "epoch": 0.08178241996744438, + "grad_norm": 6.14877462387085, + "learning_rate": 9.276689050294643e-06, + "loss": 0.3668, + "step": 6029 + }, + { + "epoch": 0.08179598480737928, + "grad_norm": 5.844783782958984, + "learning_rate": 9.276552007674387e-06, + "loss": 0.3808, + "step": 6030 + }, + { + "epoch": 0.08180954964731416, + "grad_norm": 7.566299915313721, + "learning_rate": 9.276414965054132e-06, + "loss": 0.4585, + "step": 6031 + }, + { + "epoch": 0.08182311448724905, + "grad_norm": 4.990257263183594, + "learning_rate": 9.276277922433877e-06, + "loss": 0.3673, + "step": 6032 + }, + { + "epoch": 0.08183667932718394, + "grad_norm": 8.108113288879395, + "learning_rate": 9.276140879813624e-06, + "loss": 0.4479, + "step": 6033 + }, + { + "epoch": 0.08185024416711882, + "grad_norm": 4.93897819519043, + "learning_rate": 9.276003837193368e-06, + "loss": 0.2829, + "step": 6034 + }, + { + "epoch": 0.08186380900705371, + "grad_norm": 6.688863277435303, + "learning_rate": 9.275866794573113e-06, + "loss": 0.4297, + "step": 6035 + }, + { + "epoch": 0.08187737384698861, + "grad_norm": 7.75978422164917, + "learning_rate": 9.275729751952858e-06, + "loss": 0.3495, + "step": 6036 + }, + { + "epoch": 0.0818909386869235, + "grad_norm": 4.973468780517578, + "learning_rate": 9.275592709332603e-06, + "loss": 0.3655, + "step": 6037 + }, + { + "epoch": 0.08190450352685838, + "grad_norm": 5.5856428146362305, + "learning_rate": 9.275455666712348e-06, + "loss": 0.288, + "step": 6038 + }, + { + "epoch": 0.08191806836679327, + "grad_norm": 6.92543888092041, + "learning_rate": 9.275318624092093e-06, + "loss": 0.5344, + "step": 6039 + }, + { + "epoch": 0.08193163320672817, + "grad_norm": 5.716470241546631, + "learning_rate": 9.275181581471839e-06, + "loss": 0.4275, + "step": 6040 + }, + { + "epoch": 0.08194519804666305, + "grad_norm": 5.778858184814453, + "learning_rate": 9.275044538851584e-06, + "loss": 0.3676, + "step": 6041 + }, + { + "epoch": 0.08195876288659794, + "grad_norm": 5.5647292137146, + "learning_rate": 9.274907496231329e-06, + "loss": 0.3778, + "step": 6042 + }, + { + "epoch": 0.08197232772653283, + "grad_norm": 5.90523624420166, + "learning_rate": 9.274770453611074e-06, + "loss": 0.5009, + "step": 6043 + }, + { + "epoch": 0.08198589256646771, + "grad_norm": 6.744014263153076, + "learning_rate": 9.27463341099082e-06, + "loss": 0.4212, + "step": 6044 + }, + { + "epoch": 0.0819994574064026, + "grad_norm": 4.15255069732666, + "learning_rate": 9.274496368370563e-06, + "loss": 0.2897, + "step": 6045 + }, + { + "epoch": 0.0820130222463375, + "grad_norm": 5.653173923492432, + "learning_rate": 9.27435932575031e-06, + "loss": 0.4537, + "step": 6046 + }, + { + "epoch": 0.08202658708627238, + "grad_norm": 5.964437484741211, + "learning_rate": 9.274222283130055e-06, + "loss": 0.3601, + "step": 6047 + }, + { + "epoch": 0.08204015192620727, + "grad_norm": 7.288752555847168, + "learning_rate": 9.2740852405098e-06, + "loss": 0.5236, + "step": 6048 + }, + { + "epoch": 0.08205371676614216, + "grad_norm": 8.482267379760742, + "learning_rate": 9.273948197889543e-06, + "loss": 0.6123, + "step": 6049 + }, + { + "epoch": 0.08206728160607704, + "grad_norm": 7.243941783905029, + "learning_rate": 9.27381115526929e-06, + "loss": 0.3969, + "step": 6050 + }, + { + "epoch": 0.08208084644601193, + "grad_norm": 7.8118696212768555, + "learning_rate": 9.273674112649036e-06, + "loss": 0.5281, + "step": 6051 + }, + { + "epoch": 0.08209441128594683, + "grad_norm": 8.606066703796387, + "learning_rate": 9.273537070028779e-06, + "loss": 0.5148, + "step": 6052 + }, + { + "epoch": 0.08210797612588172, + "grad_norm": 6.728391647338867, + "learning_rate": 9.273400027408524e-06, + "loss": 0.5484, + "step": 6053 + }, + { + "epoch": 0.0821215409658166, + "grad_norm": 6.6387248039245605, + "learning_rate": 9.27326298478827e-06, + "loss": 0.4209, + "step": 6054 + }, + { + "epoch": 0.0821351058057515, + "grad_norm": 4.597111225128174, + "learning_rate": 9.273125942168015e-06, + "loss": 0.4245, + "step": 6055 + }, + { + "epoch": 0.08214867064568639, + "grad_norm": 6.687410831451416, + "learning_rate": 9.27298889954776e-06, + "loss": 0.4103, + "step": 6056 + }, + { + "epoch": 0.08216223548562127, + "grad_norm": 6.309221267700195, + "learning_rate": 9.272851856927505e-06, + "loss": 0.4609, + "step": 6057 + }, + { + "epoch": 0.08217580032555616, + "grad_norm": 5.11376428604126, + "learning_rate": 9.27271481430725e-06, + "loss": 0.298, + "step": 6058 + }, + { + "epoch": 0.08218936516549105, + "grad_norm": 4.943740367889404, + "learning_rate": 9.272577771686995e-06, + "loss": 0.2839, + "step": 6059 + }, + { + "epoch": 0.08220293000542593, + "grad_norm": 5.043617248535156, + "learning_rate": 9.27244072906674e-06, + "loss": 0.3029, + "step": 6060 + }, + { + "epoch": 0.08221649484536082, + "grad_norm": 6.9984965324401855, + "learning_rate": 9.272303686446486e-06, + "loss": 0.2999, + "step": 6061 + }, + { + "epoch": 0.08223005968529572, + "grad_norm": 6.77140474319458, + "learning_rate": 9.27216664382623e-06, + "loss": 0.4098, + "step": 6062 + }, + { + "epoch": 0.0822436245252306, + "grad_norm": 4.928862571716309, + "learning_rate": 9.272029601205976e-06, + "loss": 0.3471, + "step": 6063 + }, + { + "epoch": 0.08225718936516549, + "grad_norm": 5.810544490814209, + "learning_rate": 9.271892558585721e-06, + "loss": 0.3879, + "step": 6064 + }, + { + "epoch": 0.08227075420510038, + "grad_norm": 9.328893661499023, + "learning_rate": 9.271755515965466e-06, + "loss": 0.4518, + "step": 6065 + }, + { + "epoch": 0.08228431904503526, + "grad_norm": 5.197170734405518, + "learning_rate": 9.271618473345212e-06, + "loss": 0.4163, + "step": 6066 + }, + { + "epoch": 0.08229788388497015, + "grad_norm": 5.323058128356934, + "learning_rate": 9.271481430724957e-06, + "loss": 0.4512, + "step": 6067 + }, + { + "epoch": 0.08231144872490505, + "grad_norm": 5.501274585723877, + "learning_rate": 9.271344388104702e-06, + "loss": 0.444, + "step": 6068 + }, + { + "epoch": 0.08232501356483994, + "grad_norm": 7.02902889251709, + "learning_rate": 9.271207345484447e-06, + "loss": 0.511, + "step": 6069 + }, + { + "epoch": 0.08233857840477482, + "grad_norm": 5.899502754211426, + "learning_rate": 9.27107030286419e-06, + "loss": 0.5227, + "step": 6070 + }, + { + "epoch": 0.08235214324470971, + "grad_norm": 6.052436351776123, + "learning_rate": 9.270933260243936e-06, + "loss": 0.3861, + "step": 6071 + }, + { + "epoch": 0.0823657080846446, + "grad_norm": 4.824016094207764, + "learning_rate": 9.270796217623683e-06, + "loss": 0.3264, + "step": 6072 + }, + { + "epoch": 0.08237927292457949, + "grad_norm": 4.96175479888916, + "learning_rate": 9.270659175003426e-06, + "loss": 0.3741, + "step": 6073 + }, + { + "epoch": 0.08239283776451438, + "grad_norm": 6.15988826751709, + "learning_rate": 9.270522132383171e-06, + "loss": 0.4791, + "step": 6074 + }, + { + "epoch": 0.08240640260444927, + "grad_norm": 6.109663963317871, + "learning_rate": 9.270385089762916e-06, + "loss": 0.397, + "step": 6075 + }, + { + "epoch": 0.08241996744438415, + "grad_norm": 9.035543441772461, + "learning_rate": 9.270248047142663e-06, + "loss": 0.3698, + "step": 6076 + }, + { + "epoch": 0.08243353228431904, + "grad_norm": 5.44735050201416, + "learning_rate": 9.270111004522407e-06, + "loss": 0.46, + "step": 6077 + }, + { + "epoch": 0.08244709712425394, + "grad_norm": 6.264368057250977, + "learning_rate": 9.269973961902152e-06, + "loss": 0.4601, + "step": 6078 + }, + { + "epoch": 0.08246066196418882, + "grad_norm": 5.175755500793457, + "learning_rate": 9.269836919281897e-06, + "loss": 0.3602, + "step": 6079 + }, + { + "epoch": 0.08247422680412371, + "grad_norm": 6.975111961364746, + "learning_rate": 9.269699876661642e-06, + "loss": 0.4892, + "step": 6080 + }, + { + "epoch": 0.0824877916440586, + "grad_norm": 6.453178882598877, + "learning_rate": 9.269562834041388e-06, + "loss": 0.3779, + "step": 6081 + }, + { + "epoch": 0.08250135648399348, + "grad_norm": 7.046132564544678, + "learning_rate": 9.269425791421133e-06, + "loss": 0.3137, + "step": 6082 + }, + { + "epoch": 0.08251492132392838, + "grad_norm": 6.725012302398682, + "learning_rate": 9.269288748800878e-06, + "loss": 0.4065, + "step": 6083 + }, + { + "epoch": 0.08252848616386327, + "grad_norm": 5.781598091125488, + "learning_rate": 9.269151706180623e-06, + "loss": 0.4472, + "step": 6084 + }, + { + "epoch": 0.08254205100379816, + "grad_norm": 6.438840866088867, + "learning_rate": 9.269014663560368e-06, + "loss": 0.4877, + "step": 6085 + }, + { + "epoch": 0.08255561584373304, + "grad_norm": 8.401772499084473, + "learning_rate": 9.268877620940113e-06, + "loss": 0.522, + "step": 6086 + }, + { + "epoch": 0.08256918068366793, + "grad_norm": 6.165676593780518, + "learning_rate": 9.268740578319859e-06, + "loss": 0.4356, + "step": 6087 + }, + { + "epoch": 0.08258274552360283, + "grad_norm": 6.382116794586182, + "learning_rate": 9.268603535699602e-06, + "loss": 0.4939, + "step": 6088 + }, + { + "epoch": 0.0825963103635377, + "grad_norm": 9.624469757080078, + "learning_rate": 9.268466493079349e-06, + "loss": 0.3758, + "step": 6089 + }, + { + "epoch": 0.0826098752034726, + "grad_norm": 8.024885177612305, + "learning_rate": 9.268329450459094e-06, + "loss": 0.5816, + "step": 6090 + }, + { + "epoch": 0.08262344004340749, + "grad_norm": 8.202901840209961, + "learning_rate": 9.26819240783884e-06, + "loss": 0.3934, + "step": 6091 + }, + { + "epoch": 0.08263700488334237, + "grad_norm": 6.803320407867432, + "learning_rate": 9.268055365218583e-06, + "loss": 0.4661, + "step": 6092 + }, + { + "epoch": 0.08265056972327726, + "grad_norm": 7.299487590789795, + "learning_rate": 9.26791832259833e-06, + "loss": 0.4511, + "step": 6093 + }, + { + "epoch": 0.08266413456321216, + "grad_norm": 6.050512790679932, + "learning_rate": 9.267781279978075e-06, + "loss": 0.3052, + "step": 6094 + }, + { + "epoch": 0.08267769940314704, + "grad_norm": 4.454148769378662, + "learning_rate": 9.267644237357818e-06, + "loss": 0.3126, + "step": 6095 + }, + { + "epoch": 0.08269126424308193, + "grad_norm": 7.456578731536865, + "learning_rate": 9.267507194737564e-06, + "loss": 0.4647, + "step": 6096 + }, + { + "epoch": 0.08270482908301682, + "grad_norm": 5.903580665588379, + "learning_rate": 9.267370152117309e-06, + "loss": 0.4502, + "step": 6097 + }, + { + "epoch": 0.0827183939229517, + "grad_norm": 7.805593490600586, + "learning_rate": 9.267233109497054e-06, + "loss": 0.5879, + "step": 6098 + }, + { + "epoch": 0.0827319587628866, + "grad_norm": 5.560103893280029, + "learning_rate": 9.267096066876799e-06, + "loss": 0.4223, + "step": 6099 + }, + { + "epoch": 0.08274552360282149, + "grad_norm": 5.442193984985352, + "learning_rate": 9.266959024256544e-06, + "loss": 0.3997, + "step": 6100 + }, + { + "epoch": 0.08275908844275638, + "grad_norm": 7.0261945724487305, + "learning_rate": 9.26682198163629e-06, + "loss": 0.4829, + "step": 6101 + }, + { + "epoch": 0.08277265328269126, + "grad_norm": 6.8044538497924805, + "learning_rate": 9.266684939016035e-06, + "loss": 0.4589, + "step": 6102 + }, + { + "epoch": 0.08278621812262615, + "grad_norm": 4.605037689208984, + "learning_rate": 9.26654789639578e-06, + "loss": 0.3152, + "step": 6103 + }, + { + "epoch": 0.08279978296256105, + "grad_norm": 5.061014652252197, + "learning_rate": 9.266410853775525e-06, + "loss": 0.3283, + "step": 6104 + }, + { + "epoch": 0.08281334780249593, + "grad_norm": 6.143594741821289, + "learning_rate": 9.26627381115527e-06, + "loss": 0.3629, + "step": 6105 + }, + { + "epoch": 0.08282691264243082, + "grad_norm": 5.662109375, + "learning_rate": 9.266136768535015e-06, + "loss": 0.3322, + "step": 6106 + }, + { + "epoch": 0.08284047748236571, + "grad_norm": 4.459405899047852, + "learning_rate": 9.26599972591476e-06, + "loss": 0.3544, + "step": 6107 + }, + { + "epoch": 0.08285404232230059, + "grad_norm": 4.6084747314453125, + "learning_rate": 9.265862683294506e-06, + "loss": 0.3129, + "step": 6108 + }, + { + "epoch": 0.08286760716223548, + "grad_norm": 3.320282459259033, + "learning_rate": 9.26572564067425e-06, + "loss": 0.233, + "step": 6109 + }, + { + "epoch": 0.08288117200217038, + "grad_norm": 5.841864585876465, + "learning_rate": 9.265588598053994e-06, + "loss": 0.4372, + "step": 6110 + }, + { + "epoch": 0.08289473684210526, + "grad_norm": 6.00521993637085, + "learning_rate": 9.265451555433741e-06, + "loss": 0.5053, + "step": 6111 + }, + { + "epoch": 0.08290830168204015, + "grad_norm": 6.163877964019775, + "learning_rate": 9.265314512813486e-06, + "loss": 0.369, + "step": 6112 + }, + { + "epoch": 0.08292186652197504, + "grad_norm": 6.270778656005859, + "learning_rate": 9.26517747019323e-06, + "loss": 0.4653, + "step": 6113 + }, + { + "epoch": 0.08293543136190992, + "grad_norm": 5.3609113693237305, + "learning_rate": 9.265040427572975e-06, + "loss": 0.29, + "step": 6114 + }, + { + "epoch": 0.08294899620184482, + "grad_norm": 4.431620121002197, + "learning_rate": 9.264903384952722e-06, + "loss": 0.2678, + "step": 6115 + }, + { + "epoch": 0.08296256104177971, + "grad_norm": 4.594092845916748, + "learning_rate": 9.264766342332467e-06, + "loss": 0.2984, + "step": 6116 + }, + { + "epoch": 0.0829761258817146, + "grad_norm": 5.207859992980957, + "learning_rate": 9.26462929971221e-06, + "loss": 0.3554, + "step": 6117 + }, + { + "epoch": 0.08298969072164948, + "grad_norm": 6.843330383300781, + "learning_rate": 9.264492257091956e-06, + "loss": 0.3355, + "step": 6118 + }, + { + "epoch": 0.08300325556158437, + "grad_norm": 4.6749186515808105, + "learning_rate": 9.264355214471703e-06, + "loss": 0.3575, + "step": 6119 + }, + { + "epoch": 0.08301682040151927, + "grad_norm": 6.703713893890381, + "learning_rate": 9.264218171851446e-06, + "loss": 0.4154, + "step": 6120 + }, + { + "epoch": 0.08303038524145415, + "grad_norm": 4.594128131866455, + "learning_rate": 9.264081129231191e-06, + "loss": 0.3246, + "step": 6121 + }, + { + "epoch": 0.08304395008138904, + "grad_norm": 4.606286525726318, + "learning_rate": 9.263944086610936e-06, + "loss": 0.3219, + "step": 6122 + }, + { + "epoch": 0.08305751492132393, + "grad_norm": 5.715394496917725, + "learning_rate": 9.263807043990682e-06, + "loss": 0.6073, + "step": 6123 + }, + { + "epoch": 0.08307107976125881, + "grad_norm": 6.055173873901367, + "learning_rate": 9.263670001370427e-06, + "loss": 0.5179, + "step": 6124 + }, + { + "epoch": 0.0830846446011937, + "grad_norm": 8.363194465637207, + "learning_rate": 9.263532958750172e-06, + "loss": 0.4634, + "step": 6125 + }, + { + "epoch": 0.0830982094411286, + "grad_norm": 4.808021545410156, + "learning_rate": 9.263395916129917e-06, + "loss": 0.2605, + "step": 6126 + }, + { + "epoch": 0.08311177428106348, + "grad_norm": 7.058577060699463, + "learning_rate": 9.263258873509662e-06, + "loss": 0.3811, + "step": 6127 + }, + { + "epoch": 0.08312533912099837, + "grad_norm": 8.350895881652832, + "learning_rate": 9.263121830889408e-06, + "loss": 0.4838, + "step": 6128 + }, + { + "epoch": 0.08313890396093326, + "grad_norm": 5.916833400726318, + "learning_rate": 9.262984788269153e-06, + "loss": 0.2989, + "step": 6129 + }, + { + "epoch": 0.08315246880086814, + "grad_norm": 6.115562438964844, + "learning_rate": 9.262847745648898e-06, + "loss": 0.366, + "step": 6130 + }, + { + "epoch": 0.08316603364080304, + "grad_norm": 6.114861965179443, + "learning_rate": 9.262710703028643e-06, + "loss": 0.2954, + "step": 6131 + }, + { + "epoch": 0.08317959848073793, + "grad_norm": 6.159295558929443, + "learning_rate": 9.262573660408388e-06, + "loss": 0.4539, + "step": 6132 + }, + { + "epoch": 0.08319316332067282, + "grad_norm": 5.4912800788879395, + "learning_rate": 9.262436617788133e-06, + "loss": 0.4427, + "step": 6133 + }, + { + "epoch": 0.0832067281606077, + "grad_norm": 5.942880630493164, + "learning_rate": 9.262299575167879e-06, + "loss": 0.3683, + "step": 6134 + }, + { + "epoch": 0.0832202930005426, + "grad_norm": 6.855103015899658, + "learning_rate": 9.262162532547622e-06, + "loss": 0.5219, + "step": 6135 + }, + { + "epoch": 0.08323385784047749, + "grad_norm": 5.067371845245361, + "learning_rate": 9.262025489927369e-06, + "loss": 0.3724, + "step": 6136 + }, + { + "epoch": 0.08324742268041237, + "grad_norm": 6.008987903594971, + "learning_rate": 9.261888447307114e-06, + "loss": 0.4397, + "step": 6137 + }, + { + "epoch": 0.08326098752034726, + "grad_norm": 6.413108825683594, + "learning_rate": 9.261751404686858e-06, + "loss": 0.4324, + "step": 6138 + }, + { + "epoch": 0.08327455236028215, + "grad_norm": 4.374899864196777, + "learning_rate": 9.261614362066603e-06, + "loss": 0.2595, + "step": 6139 + }, + { + "epoch": 0.08328811720021703, + "grad_norm": 5.355887413024902, + "learning_rate": 9.261477319446348e-06, + "loss": 0.3288, + "step": 6140 + }, + { + "epoch": 0.08330168204015193, + "grad_norm": 5.309695243835449, + "learning_rate": 9.261340276826095e-06, + "loss": 0.4869, + "step": 6141 + }, + { + "epoch": 0.08331524688008682, + "grad_norm": 6.58297872543335, + "learning_rate": 9.261203234205838e-06, + "loss": 0.4329, + "step": 6142 + }, + { + "epoch": 0.0833288117200217, + "grad_norm": 6.158263206481934, + "learning_rate": 9.261066191585584e-06, + "loss": 0.5467, + "step": 6143 + }, + { + "epoch": 0.08334237655995659, + "grad_norm": 5.8046112060546875, + "learning_rate": 9.260929148965329e-06, + "loss": 0.5025, + "step": 6144 + }, + { + "epoch": 0.08335594139989148, + "grad_norm": 6.5008225440979, + "learning_rate": 9.260792106345074e-06, + "loss": 0.4938, + "step": 6145 + }, + { + "epoch": 0.08336950623982638, + "grad_norm": 6.469913482666016, + "learning_rate": 9.260655063724819e-06, + "loss": 0.5344, + "step": 6146 + }, + { + "epoch": 0.08338307107976126, + "grad_norm": 6.364598274230957, + "learning_rate": 9.260518021104564e-06, + "loss": 0.3467, + "step": 6147 + }, + { + "epoch": 0.08339663591969615, + "grad_norm": 5.912088871002197, + "learning_rate": 9.26038097848431e-06, + "loss": 0.4113, + "step": 6148 + }, + { + "epoch": 0.08341020075963104, + "grad_norm": 8.526857376098633, + "learning_rate": 9.260243935864055e-06, + "loss": 0.5673, + "step": 6149 + }, + { + "epoch": 0.08342376559956592, + "grad_norm": 4.442380905151367, + "learning_rate": 9.2601068932438e-06, + "loss": 0.3781, + "step": 6150 + }, + { + "epoch": 0.08343733043950081, + "grad_norm": 5.2166314125061035, + "learning_rate": 9.259969850623545e-06, + "loss": 0.3193, + "step": 6151 + }, + { + "epoch": 0.08345089527943571, + "grad_norm": 5.271501064300537, + "learning_rate": 9.25983280800329e-06, + "loss": 0.2821, + "step": 6152 + }, + { + "epoch": 0.08346446011937059, + "grad_norm": 8.49416732788086, + "learning_rate": 9.259695765383034e-06, + "loss": 0.4299, + "step": 6153 + }, + { + "epoch": 0.08347802495930548, + "grad_norm": 6.588915824890137, + "learning_rate": 9.25955872276278e-06, + "loss": 0.4567, + "step": 6154 + }, + { + "epoch": 0.08349158979924037, + "grad_norm": 5.801991939544678, + "learning_rate": 9.259421680142526e-06, + "loss": 0.3607, + "step": 6155 + }, + { + "epoch": 0.08350515463917525, + "grad_norm": 8.316015243530273, + "learning_rate": 9.259284637522271e-06, + "loss": 0.3842, + "step": 6156 + }, + { + "epoch": 0.08351871947911015, + "grad_norm": 4.851253032684326, + "learning_rate": 9.259147594902014e-06, + "loss": 0.3649, + "step": 6157 + }, + { + "epoch": 0.08353228431904504, + "grad_norm": 6.375956058502197, + "learning_rate": 9.259010552281761e-06, + "loss": 0.3603, + "step": 6158 + }, + { + "epoch": 0.08354584915897992, + "grad_norm": 6.797677516937256, + "learning_rate": 9.258873509661506e-06, + "loss": 0.5804, + "step": 6159 + }, + { + "epoch": 0.08355941399891481, + "grad_norm": 5.30154275894165, + "learning_rate": 9.25873646704125e-06, + "loss": 0.3559, + "step": 6160 + }, + { + "epoch": 0.0835729788388497, + "grad_norm": 5.04212760925293, + "learning_rate": 9.258599424420995e-06, + "loss": 0.2718, + "step": 6161 + }, + { + "epoch": 0.0835865436787846, + "grad_norm": 6.430037021636963, + "learning_rate": 9.258462381800742e-06, + "loss": 0.3792, + "step": 6162 + }, + { + "epoch": 0.08360010851871948, + "grad_norm": 5.991696357727051, + "learning_rate": 9.258325339180485e-06, + "loss": 0.4112, + "step": 6163 + }, + { + "epoch": 0.08361367335865437, + "grad_norm": 7.360875129699707, + "learning_rate": 9.25818829656023e-06, + "loss": 0.469, + "step": 6164 + }, + { + "epoch": 0.08362723819858926, + "grad_norm": 7.886593341827393, + "learning_rate": 9.258051253939976e-06, + "loss": 0.5089, + "step": 6165 + }, + { + "epoch": 0.08364080303852414, + "grad_norm": 6.575571060180664, + "learning_rate": 9.257914211319721e-06, + "loss": 0.4877, + "step": 6166 + }, + { + "epoch": 0.08365436787845903, + "grad_norm": 7.016147136688232, + "learning_rate": 9.257777168699466e-06, + "loss": 0.3852, + "step": 6167 + }, + { + "epoch": 0.08366793271839393, + "grad_norm": 5.225693702697754, + "learning_rate": 9.257640126079211e-06, + "loss": 0.2618, + "step": 6168 + }, + { + "epoch": 0.08368149755832881, + "grad_norm": 7.453216552734375, + "learning_rate": 9.257503083458956e-06, + "loss": 0.5469, + "step": 6169 + }, + { + "epoch": 0.0836950623982637, + "grad_norm": 6.120350360870361, + "learning_rate": 9.257366040838702e-06, + "loss": 0.3747, + "step": 6170 + }, + { + "epoch": 0.0837086272381986, + "grad_norm": 7.395015716552734, + "learning_rate": 9.257228998218447e-06, + "loss": 0.3358, + "step": 6171 + }, + { + "epoch": 0.08372219207813347, + "grad_norm": 5.067675590515137, + "learning_rate": 9.257091955598192e-06, + "loss": 0.369, + "step": 6172 + }, + { + "epoch": 0.08373575691806837, + "grad_norm": 4.8170552253723145, + "learning_rate": 9.256954912977937e-06, + "loss": 0.3461, + "step": 6173 + }, + { + "epoch": 0.08374932175800326, + "grad_norm": 5.1357269287109375, + "learning_rate": 9.256817870357682e-06, + "loss": 0.4229, + "step": 6174 + }, + { + "epoch": 0.08376288659793814, + "grad_norm": 5.130326747894287, + "learning_rate": 9.256680827737428e-06, + "loss": 0.3354, + "step": 6175 + }, + { + "epoch": 0.08377645143787303, + "grad_norm": 6.367510795593262, + "learning_rate": 9.256543785117173e-06, + "loss": 0.4579, + "step": 6176 + }, + { + "epoch": 0.08379001627780792, + "grad_norm": 8.241216659545898, + "learning_rate": 9.256406742496918e-06, + "loss": 0.5399, + "step": 6177 + }, + { + "epoch": 0.08380358111774282, + "grad_norm": 7.018741607666016, + "learning_rate": 9.256269699876661e-06, + "loss": 0.4328, + "step": 6178 + }, + { + "epoch": 0.0838171459576777, + "grad_norm": 5.490870952606201, + "learning_rate": 9.256132657256407e-06, + "loss": 0.3, + "step": 6179 + }, + { + "epoch": 0.08383071079761259, + "grad_norm": 5.584865570068359, + "learning_rate": 9.255995614636153e-06, + "loss": 0.3769, + "step": 6180 + }, + { + "epoch": 0.08384427563754748, + "grad_norm": 6.098424911499023, + "learning_rate": 9.255858572015897e-06, + "loss": 0.3675, + "step": 6181 + }, + { + "epoch": 0.08385784047748236, + "grad_norm": 5.313694000244141, + "learning_rate": 9.255721529395642e-06, + "loss": 0.4086, + "step": 6182 + }, + { + "epoch": 0.08387140531741726, + "grad_norm": 5.852326393127441, + "learning_rate": 9.255584486775387e-06, + "loss": 0.2892, + "step": 6183 + }, + { + "epoch": 0.08388497015735215, + "grad_norm": 5.976511001586914, + "learning_rate": 9.255447444155134e-06, + "loss": 0.3364, + "step": 6184 + }, + { + "epoch": 0.08389853499728703, + "grad_norm": 6.835552215576172, + "learning_rate": 9.255310401534878e-06, + "loss": 0.5348, + "step": 6185 + }, + { + "epoch": 0.08391209983722192, + "grad_norm": 6.199057102203369, + "learning_rate": 9.255173358914623e-06, + "loss": 0.3701, + "step": 6186 + }, + { + "epoch": 0.08392566467715681, + "grad_norm": 7.4969096183776855, + "learning_rate": 9.255036316294368e-06, + "loss": 0.482, + "step": 6187 + }, + { + "epoch": 0.08393922951709169, + "grad_norm": 4.38250207901001, + "learning_rate": 9.254899273674113e-06, + "loss": 0.2737, + "step": 6188 + }, + { + "epoch": 0.08395279435702659, + "grad_norm": 6.170988082885742, + "learning_rate": 9.254762231053858e-06, + "loss": 0.4227, + "step": 6189 + }, + { + "epoch": 0.08396635919696148, + "grad_norm": 5.965455532073975, + "learning_rate": 9.254625188433604e-06, + "loss": 0.3835, + "step": 6190 + }, + { + "epoch": 0.08397992403689636, + "grad_norm": 8.786913871765137, + "learning_rate": 9.254488145813349e-06, + "loss": 0.6447, + "step": 6191 + }, + { + "epoch": 0.08399348887683125, + "grad_norm": 6.22054386138916, + "learning_rate": 9.254351103193094e-06, + "loss": 0.3812, + "step": 6192 + }, + { + "epoch": 0.08400705371676614, + "grad_norm": 6.958227157592773, + "learning_rate": 9.254214060572839e-06, + "loss": 0.3831, + "step": 6193 + }, + { + "epoch": 0.08402061855670104, + "grad_norm": 7.628433704376221, + "learning_rate": 9.254077017952584e-06, + "loss": 0.5618, + "step": 6194 + }, + { + "epoch": 0.08403418339663592, + "grad_norm": 5.02267599105835, + "learning_rate": 9.25393997533233e-06, + "loss": 0.3522, + "step": 6195 + }, + { + "epoch": 0.08404774823657081, + "grad_norm": 7.887308120727539, + "learning_rate": 9.253802932712073e-06, + "loss": 0.3614, + "step": 6196 + }, + { + "epoch": 0.0840613130765057, + "grad_norm": 5.1663689613342285, + "learning_rate": 9.25366589009182e-06, + "loss": 0.2748, + "step": 6197 + }, + { + "epoch": 0.08407487791644058, + "grad_norm": 8.479543685913086, + "learning_rate": 9.253528847471565e-06, + "loss": 0.5238, + "step": 6198 + }, + { + "epoch": 0.08408844275637548, + "grad_norm": 6.8001909255981445, + "learning_rate": 9.25339180485131e-06, + "loss": 0.4514, + "step": 6199 + }, + { + "epoch": 0.08410200759631037, + "grad_norm": 6.958734035491943, + "learning_rate": 9.253254762231054e-06, + "loss": 0.3261, + "step": 6200 + }, + { + "epoch": 0.08411557243624525, + "grad_norm": 5.868778228759766, + "learning_rate": 9.2531177196108e-06, + "loss": 0.3915, + "step": 6201 + }, + { + "epoch": 0.08412913727618014, + "grad_norm": 7.7952165603637695, + "learning_rate": 9.252980676990546e-06, + "loss": 0.4618, + "step": 6202 + }, + { + "epoch": 0.08414270211611503, + "grad_norm": 6.064932823181152, + "learning_rate": 9.25284363437029e-06, + "loss": 0.3448, + "step": 6203 + }, + { + "epoch": 0.08415626695604991, + "grad_norm": 6.894956111907959, + "learning_rate": 9.252706591750034e-06, + "loss": 0.5152, + "step": 6204 + }, + { + "epoch": 0.0841698317959848, + "grad_norm": 7.348334789276123, + "learning_rate": 9.252569549129781e-06, + "loss": 0.3158, + "step": 6205 + }, + { + "epoch": 0.0841833966359197, + "grad_norm": 7.321911811828613, + "learning_rate": 9.252432506509525e-06, + "loss": 0.4195, + "step": 6206 + }, + { + "epoch": 0.08419696147585458, + "grad_norm": 5.501030921936035, + "learning_rate": 9.25229546388927e-06, + "loss": 0.2905, + "step": 6207 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 7.6345906257629395, + "learning_rate": 9.252158421269015e-06, + "loss": 0.5114, + "step": 6208 + }, + { + "epoch": 0.08422409115572436, + "grad_norm": 4.891456604003906, + "learning_rate": 9.25202137864876e-06, + "loss": 0.3189, + "step": 6209 + }, + { + "epoch": 0.08423765599565926, + "grad_norm": 7.4617228507995605, + "learning_rate": 9.251884336028505e-06, + "loss": 0.4084, + "step": 6210 + }, + { + "epoch": 0.08425122083559414, + "grad_norm": 6.778748035430908, + "learning_rate": 9.25174729340825e-06, + "loss": 0.3258, + "step": 6211 + }, + { + "epoch": 0.08426478567552903, + "grad_norm": 7.752731800079346, + "learning_rate": 9.251610250787996e-06, + "loss": 0.5006, + "step": 6212 + }, + { + "epoch": 0.08427835051546392, + "grad_norm": 7.47579288482666, + "learning_rate": 9.251473208167741e-06, + "loss": 0.3998, + "step": 6213 + }, + { + "epoch": 0.0842919153553988, + "grad_norm": 5.587324142456055, + "learning_rate": 9.251336165547486e-06, + "loss": 0.3899, + "step": 6214 + }, + { + "epoch": 0.0843054801953337, + "grad_norm": 7.02070426940918, + "learning_rate": 9.251199122927231e-06, + "loss": 0.4737, + "step": 6215 + }, + { + "epoch": 0.08431904503526859, + "grad_norm": 9.948742866516113, + "learning_rate": 9.251062080306977e-06, + "loss": 0.616, + "step": 6216 + }, + { + "epoch": 0.08433260987520347, + "grad_norm": 8.78884506225586, + "learning_rate": 9.250925037686722e-06, + "loss": 0.546, + "step": 6217 + }, + { + "epoch": 0.08434617471513836, + "grad_norm": 7.952242851257324, + "learning_rate": 9.250787995066467e-06, + "loss": 0.9089, + "step": 6218 + }, + { + "epoch": 0.08435973955507325, + "grad_norm": 7.267687797546387, + "learning_rate": 9.250650952446212e-06, + "loss": 0.4925, + "step": 6219 + }, + { + "epoch": 0.08437330439500813, + "grad_norm": 7.006697177886963, + "learning_rate": 9.250513909825957e-06, + "loss": 0.4553, + "step": 6220 + }, + { + "epoch": 0.08438686923494303, + "grad_norm": 6.917703628540039, + "learning_rate": 9.2503768672057e-06, + "loss": 0.3953, + "step": 6221 + }, + { + "epoch": 0.08440043407487792, + "grad_norm": 7.373261451721191, + "learning_rate": 9.250239824585446e-06, + "loss": 0.4828, + "step": 6222 + }, + { + "epoch": 0.0844139989148128, + "grad_norm": 5.668578624725342, + "learning_rate": 9.250102781965193e-06, + "loss": 0.4645, + "step": 6223 + }, + { + "epoch": 0.08442756375474769, + "grad_norm": 5.452922344207764, + "learning_rate": 9.249965739344938e-06, + "loss": 0.4198, + "step": 6224 + }, + { + "epoch": 0.08444112859468259, + "grad_norm": 7.4179792404174805, + "learning_rate": 9.249828696724681e-06, + "loss": 0.4162, + "step": 6225 + }, + { + "epoch": 0.08445469343461748, + "grad_norm": 5.705442905426025, + "learning_rate": 9.249691654104427e-06, + "loss": 0.4993, + "step": 6226 + }, + { + "epoch": 0.08446825827455236, + "grad_norm": 5.681817054748535, + "learning_rate": 9.249554611484173e-06, + "loss": 0.4157, + "step": 6227 + }, + { + "epoch": 0.08448182311448725, + "grad_norm": 8.947497367858887, + "learning_rate": 9.249417568863917e-06, + "loss": 0.4567, + "step": 6228 + }, + { + "epoch": 0.08449538795442214, + "grad_norm": 5.73247766494751, + "learning_rate": 9.249280526243662e-06, + "loss": 0.3684, + "step": 6229 + }, + { + "epoch": 0.08450895279435702, + "grad_norm": 4.705754280090332, + "learning_rate": 9.249143483623407e-06, + "loss": 0.4006, + "step": 6230 + }, + { + "epoch": 0.08452251763429192, + "grad_norm": 6.930330753326416, + "learning_rate": 9.249006441003152e-06, + "loss": 0.6937, + "step": 6231 + }, + { + "epoch": 0.08453608247422681, + "grad_norm": 5.042030334472656, + "learning_rate": 9.248869398382898e-06, + "loss": 0.3317, + "step": 6232 + }, + { + "epoch": 0.08454964731416169, + "grad_norm": 5.851181983947754, + "learning_rate": 9.248732355762643e-06, + "loss": 0.403, + "step": 6233 + }, + { + "epoch": 0.08456321215409658, + "grad_norm": 8.207205772399902, + "learning_rate": 9.248595313142388e-06, + "loss": 0.4522, + "step": 6234 + }, + { + "epoch": 0.08457677699403147, + "grad_norm": 6.045027732849121, + "learning_rate": 9.248458270522133e-06, + "loss": 0.4747, + "step": 6235 + }, + { + "epoch": 0.08459034183396635, + "grad_norm": 7.893372058868408, + "learning_rate": 9.248321227901878e-06, + "loss": 0.5558, + "step": 6236 + }, + { + "epoch": 0.08460390667390125, + "grad_norm": 6.259425163269043, + "learning_rate": 9.248184185281624e-06, + "loss": 0.4348, + "step": 6237 + }, + { + "epoch": 0.08461747151383614, + "grad_norm": 7.237265586853027, + "learning_rate": 9.248047142661369e-06, + "loss": 0.6094, + "step": 6238 + }, + { + "epoch": 0.08463103635377102, + "grad_norm": 6.5295634269714355, + "learning_rate": 9.247910100041114e-06, + "loss": 0.5318, + "step": 6239 + }, + { + "epoch": 0.08464460119370591, + "grad_norm": 6.626628875732422, + "learning_rate": 9.247773057420859e-06, + "loss": 0.2692, + "step": 6240 + }, + { + "epoch": 0.0846581660336408, + "grad_norm": 6.997053623199463, + "learning_rate": 9.247636014800604e-06, + "loss": 0.4068, + "step": 6241 + }, + { + "epoch": 0.0846717308735757, + "grad_norm": 7.042579650878906, + "learning_rate": 9.24749897218035e-06, + "loss": 0.4715, + "step": 6242 + }, + { + "epoch": 0.08468529571351058, + "grad_norm": 8.416293144226074, + "learning_rate": 9.247361929560093e-06, + "loss": 0.607, + "step": 6243 + }, + { + "epoch": 0.08469886055344547, + "grad_norm": 6.7655534744262695, + "learning_rate": 9.24722488693984e-06, + "loss": 0.6028, + "step": 6244 + }, + { + "epoch": 0.08471242539338036, + "grad_norm": 5.684371471405029, + "learning_rate": 9.247087844319585e-06, + "loss": 0.3826, + "step": 6245 + }, + { + "epoch": 0.08472599023331524, + "grad_norm": 7.929623126983643, + "learning_rate": 9.246950801699328e-06, + "loss": 0.6989, + "step": 6246 + }, + { + "epoch": 0.08473955507325014, + "grad_norm": 6.5828070640563965, + "learning_rate": 9.246813759079074e-06, + "loss": 0.3525, + "step": 6247 + }, + { + "epoch": 0.08475311991318503, + "grad_norm": 5.718719959259033, + "learning_rate": 9.246676716458819e-06, + "loss": 0.4485, + "step": 6248 + }, + { + "epoch": 0.08476668475311991, + "grad_norm": 5.7355055809021, + "learning_rate": 9.246539673838566e-06, + "loss": 0.5389, + "step": 6249 + }, + { + "epoch": 0.0847802495930548, + "grad_norm": 5.56732702255249, + "learning_rate": 9.24640263121831e-06, + "loss": 0.3964, + "step": 6250 + }, + { + "epoch": 0.0847938144329897, + "grad_norm": 4.488797187805176, + "learning_rate": 9.246265588598054e-06, + "loss": 0.3242, + "step": 6251 + }, + { + "epoch": 0.08480737927292457, + "grad_norm": 5.86072301864624, + "learning_rate": 9.2461285459778e-06, + "loss": 0.5397, + "step": 6252 + }, + { + "epoch": 0.08482094411285947, + "grad_norm": 11.046340942382812, + "learning_rate": 9.245991503357545e-06, + "loss": 0.9077, + "step": 6253 + }, + { + "epoch": 0.08483450895279436, + "grad_norm": 6.092581748962402, + "learning_rate": 9.24585446073729e-06, + "loss": 0.3411, + "step": 6254 + }, + { + "epoch": 0.08484807379272924, + "grad_norm": 6.326519012451172, + "learning_rate": 9.245717418117035e-06, + "loss": 0.4826, + "step": 6255 + }, + { + "epoch": 0.08486163863266413, + "grad_norm": 6.792282581329346, + "learning_rate": 9.24558037549678e-06, + "loss": 0.3874, + "step": 6256 + }, + { + "epoch": 0.08487520347259903, + "grad_norm": 4.465770721435547, + "learning_rate": 9.245443332876525e-06, + "loss": 0.3047, + "step": 6257 + }, + { + "epoch": 0.08488876831253392, + "grad_norm": 7.95377254486084, + "learning_rate": 9.24530629025627e-06, + "loss": 0.3899, + "step": 6258 + }, + { + "epoch": 0.0849023331524688, + "grad_norm": 7.150856018066406, + "learning_rate": 9.245169247636016e-06, + "loss": 0.4142, + "step": 6259 + }, + { + "epoch": 0.08491589799240369, + "grad_norm": 5.664371967315674, + "learning_rate": 9.245032205015761e-06, + "loss": 0.4416, + "step": 6260 + }, + { + "epoch": 0.08492946283233858, + "grad_norm": 7.216487884521484, + "learning_rate": 9.244895162395504e-06, + "loss": 0.4966, + "step": 6261 + }, + { + "epoch": 0.08494302767227346, + "grad_norm": 5.744422435760498, + "learning_rate": 9.244758119775251e-06, + "loss": 0.3922, + "step": 6262 + }, + { + "epoch": 0.08495659251220836, + "grad_norm": 6.563296794891357, + "learning_rate": 9.244621077154997e-06, + "loss": 0.5386, + "step": 6263 + }, + { + "epoch": 0.08497015735214325, + "grad_norm": 7.1435546875, + "learning_rate": 9.24448403453474e-06, + "loss": 0.5075, + "step": 6264 + }, + { + "epoch": 0.08498372219207813, + "grad_norm": 6.283145427703857, + "learning_rate": 9.244346991914485e-06, + "loss": 0.5252, + "step": 6265 + }, + { + "epoch": 0.08499728703201302, + "grad_norm": 6.215878486633301, + "learning_rate": 9.244209949294232e-06, + "loss": 0.3542, + "step": 6266 + }, + { + "epoch": 0.08501085187194792, + "grad_norm": 7.3920392990112305, + "learning_rate": 9.244072906673977e-06, + "loss": 0.4438, + "step": 6267 + }, + { + "epoch": 0.0850244167118828, + "grad_norm": 4.808956623077393, + "learning_rate": 9.24393586405372e-06, + "loss": 0.2748, + "step": 6268 + }, + { + "epoch": 0.08503798155181769, + "grad_norm": 7.137545108795166, + "learning_rate": 9.243798821433466e-06, + "loss": 0.5199, + "step": 6269 + }, + { + "epoch": 0.08505154639175258, + "grad_norm": 9.734882354736328, + "learning_rate": 9.243661778813213e-06, + "loss": 0.5319, + "step": 6270 + }, + { + "epoch": 0.08506511123168746, + "grad_norm": 6.775323390960693, + "learning_rate": 9.243524736192956e-06, + "loss": 0.3715, + "step": 6271 + }, + { + "epoch": 0.08507867607162235, + "grad_norm": 6.13752555847168, + "learning_rate": 9.243387693572701e-06, + "loss": 0.4219, + "step": 6272 + }, + { + "epoch": 0.08509224091155725, + "grad_norm": 6.74680233001709, + "learning_rate": 9.243250650952447e-06, + "loss": 0.4537, + "step": 6273 + }, + { + "epoch": 0.08510580575149214, + "grad_norm": 8.559282302856445, + "learning_rate": 9.243113608332192e-06, + "loss": 0.5263, + "step": 6274 + }, + { + "epoch": 0.08511937059142702, + "grad_norm": 7.6185479164123535, + "learning_rate": 9.242976565711937e-06, + "loss": 0.3799, + "step": 6275 + }, + { + "epoch": 0.08513293543136191, + "grad_norm": 9.964327812194824, + "learning_rate": 9.242839523091682e-06, + "loss": 0.6025, + "step": 6276 + }, + { + "epoch": 0.0851465002712968, + "grad_norm": 8.188994407653809, + "learning_rate": 9.242702480471427e-06, + "loss": 0.3356, + "step": 6277 + }, + { + "epoch": 0.08516006511123168, + "grad_norm": 7.647279262542725, + "learning_rate": 9.242565437851173e-06, + "loss": 0.4297, + "step": 6278 + }, + { + "epoch": 0.08517362995116658, + "grad_norm": 5.888915538787842, + "learning_rate": 9.242428395230918e-06, + "loss": 0.2316, + "step": 6279 + }, + { + "epoch": 0.08518719479110147, + "grad_norm": 6.352856636047363, + "learning_rate": 9.242291352610663e-06, + "loss": 0.4577, + "step": 6280 + }, + { + "epoch": 0.08520075963103635, + "grad_norm": 10.456108093261719, + "learning_rate": 9.242154309990408e-06, + "loss": 0.751, + "step": 6281 + }, + { + "epoch": 0.08521432447097124, + "grad_norm": 6.264122009277344, + "learning_rate": 9.242017267370153e-06, + "loss": 0.4611, + "step": 6282 + }, + { + "epoch": 0.08522788931090614, + "grad_norm": 6.631239891052246, + "learning_rate": 9.241880224749898e-06, + "loss": 0.3939, + "step": 6283 + }, + { + "epoch": 0.08524145415084101, + "grad_norm": 5.765284538269043, + "learning_rate": 9.241743182129644e-06, + "loss": 0.2895, + "step": 6284 + }, + { + "epoch": 0.08525501899077591, + "grad_norm": 8.23818302154541, + "learning_rate": 9.241606139509389e-06, + "loss": 0.4958, + "step": 6285 + }, + { + "epoch": 0.0852685838307108, + "grad_norm": 11.10578727722168, + "learning_rate": 9.241469096889132e-06, + "loss": 0.4493, + "step": 6286 + }, + { + "epoch": 0.08528214867064568, + "grad_norm": 9.462574005126953, + "learning_rate": 9.241332054268879e-06, + "loss": 0.5745, + "step": 6287 + }, + { + "epoch": 0.08529571351058057, + "grad_norm": 10.102703094482422, + "learning_rate": 9.241195011648624e-06, + "loss": 0.6459, + "step": 6288 + }, + { + "epoch": 0.08530927835051547, + "grad_norm": 10.026189804077148, + "learning_rate": 9.241057969028368e-06, + "loss": 0.707, + "step": 6289 + }, + { + "epoch": 0.08532284319045036, + "grad_norm": 8.160287857055664, + "learning_rate": 9.240920926408113e-06, + "loss": 0.519, + "step": 6290 + }, + { + "epoch": 0.08533640803038524, + "grad_norm": 8.388943672180176, + "learning_rate": 9.240783883787858e-06, + "loss": 0.471, + "step": 6291 + }, + { + "epoch": 0.08534997287032013, + "grad_norm": 7.2902655601501465, + "learning_rate": 9.240646841167605e-06, + "loss": 0.4998, + "step": 6292 + }, + { + "epoch": 0.08536353771025502, + "grad_norm": 6.871696472167969, + "learning_rate": 9.240509798547349e-06, + "loss": 0.4068, + "step": 6293 + }, + { + "epoch": 0.0853771025501899, + "grad_norm": 5.979068279266357, + "learning_rate": 9.240372755927094e-06, + "loss": 0.3112, + "step": 6294 + }, + { + "epoch": 0.0853906673901248, + "grad_norm": 5.659512042999268, + "learning_rate": 9.240235713306839e-06, + "loss": 0.3854, + "step": 6295 + }, + { + "epoch": 0.08540423223005969, + "grad_norm": 7.03498649597168, + "learning_rate": 9.240098670686584e-06, + "loss": 0.4089, + "step": 6296 + }, + { + "epoch": 0.08541779706999457, + "grad_norm": 6.9286909103393555, + "learning_rate": 9.23996162806633e-06, + "loss": 0.4649, + "step": 6297 + }, + { + "epoch": 0.08543136190992946, + "grad_norm": 7.608215808868408, + "learning_rate": 9.239824585446074e-06, + "loss": 0.6787, + "step": 6298 + }, + { + "epoch": 0.08544492674986436, + "grad_norm": 6.875194072723389, + "learning_rate": 9.23968754282582e-06, + "loss": 0.4737, + "step": 6299 + }, + { + "epoch": 0.08545849158979923, + "grad_norm": 7.758100509643555, + "learning_rate": 9.239550500205565e-06, + "loss": 0.4179, + "step": 6300 + }, + { + "epoch": 0.08547205642973413, + "grad_norm": 11.229008674621582, + "learning_rate": 9.23941345758531e-06, + "loss": 0.5832, + "step": 6301 + }, + { + "epoch": 0.08548562126966902, + "grad_norm": 7.749334812164307, + "learning_rate": 9.239276414965055e-06, + "loss": 0.5146, + "step": 6302 + }, + { + "epoch": 0.0854991861096039, + "grad_norm": 7.582353591918945, + "learning_rate": 9.2391393723448e-06, + "loss": 0.4081, + "step": 6303 + }, + { + "epoch": 0.0855127509495388, + "grad_norm": 10.016199111938477, + "learning_rate": 9.239002329724544e-06, + "loss": 0.6398, + "step": 6304 + }, + { + "epoch": 0.08552631578947369, + "grad_norm": 9.337458610534668, + "learning_rate": 9.23886528710429e-06, + "loss": 0.6392, + "step": 6305 + }, + { + "epoch": 0.08553988062940858, + "grad_norm": 7.2592854499816895, + "learning_rate": 9.238728244484036e-06, + "loss": 0.6067, + "step": 6306 + }, + { + "epoch": 0.08555344546934346, + "grad_norm": 8.191620826721191, + "learning_rate": 9.238591201863781e-06, + "loss": 0.5301, + "step": 6307 + }, + { + "epoch": 0.08556701030927835, + "grad_norm": 6.510681629180908, + "learning_rate": 9.238454159243524e-06, + "loss": 0.4273, + "step": 6308 + }, + { + "epoch": 0.08558057514921324, + "grad_norm": 7.1623358726501465, + "learning_rate": 9.238317116623271e-06, + "loss": 0.4393, + "step": 6309 + }, + { + "epoch": 0.08559413998914812, + "grad_norm": 7.298551082611084, + "learning_rate": 9.238180074003017e-06, + "loss": 0.3856, + "step": 6310 + }, + { + "epoch": 0.08560770482908302, + "grad_norm": 8.159375190734863, + "learning_rate": 9.23804303138276e-06, + "loss": 0.4196, + "step": 6311 + }, + { + "epoch": 0.08562126966901791, + "grad_norm": 6.074801921844482, + "learning_rate": 9.237905988762505e-06, + "loss": 0.4268, + "step": 6312 + }, + { + "epoch": 0.08563483450895279, + "grad_norm": 6.758069038391113, + "learning_rate": 9.237768946142252e-06, + "loss": 0.4356, + "step": 6313 + }, + { + "epoch": 0.08564839934888768, + "grad_norm": 7.081048488616943, + "learning_rate": 9.237631903521996e-06, + "loss": 0.368, + "step": 6314 + }, + { + "epoch": 0.08566196418882258, + "grad_norm": 5.807863235473633, + "learning_rate": 9.23749486090174e-06, + "loss": 0.4269, + "step": 6315 + }, + { + "epoch": 0.08567552902875746, + "grad_norm": 8.657190322875977, + "learning_rate": 9.237357818281486e-06, + "loss": 0.4001, + "step": 6316 + }, + { + "epoch": 0.08568909386869235, + "grad_norm": 6.301314353942871, + "learning_rate": 9.237220775661231e-06, + "loss": 0.378, + "step": 6317 + }, + { + "epoch": 0.08570265870862724, + "grad_norm": 11.92894172668457, + "learning_rate": 9.237083733040976e-06, + "loss": 0.5176, + "step": 6318 + }, + { + "epoch": 0.08571622354856212, + "grad_norm": 11.085450172424316, + "learning_rate": 9.236946690420721e-06, + "loss": 0.5153, + "step": 6319 + }, + { + "epoch": 0.08572978838849701, + "grad_norm": 7.214740753173828, + "learning_rate": 9.236809647800467e-06, + "loss": 0.451, + "step": 6320 + }, + { + "epoch": 0.0857433532284319, + "grad_norm": 8.861140251159668, + "learning_rate": 9.236672605180212e-06, + "loss": 0.4407, + "step": 6321 + }, + { + "epoch": 0.0857569180683668, + "grad_norm": 6.006041526794434, + "learning_rate": 9.236535562559957e-06, + "loss": 0.2567, + "step": 6322 + }, + { + "epoch": 0.08577048290830168, + "grad_norm": 6.5395684242248535, + "learning_rate": 9.236398519939702e-06, + "loss": 0.5111, + "step": 6323 + }, + { + "epoch": 0.08578404774823657, + "grad_norm": 7.585844039916992, + "learning_rate": 9.236261477319447e-06, + "loss": 0.4073, + "step": 6324 + }, + { + "epoch": 0.08579761258817147, + "grad_norm": 7.671923637390137, + "learning_rate": 9.236124434699193e-06, + "loss": 0.506, + "step": 6325 + }, + { + "epoch": 0.08581117742810634, + "grad_norm": 9.094991683959961, + "learning_rate": 9.235987392078938e-06, + "loss": 0.6269, + "step": 6326 + }, + { + "epoch": 0.08582474226804124, + "grad_norm": 10.287700653076172, + "learning_rate": 9.235850349458683e-06, + "loss": 0.6421, + "step": 6327 + }, + { + "epoch": 0.08583830710797613, + "grad_norm": 8.045698165893555, + "learning_rate": 9.235713306838428e-06, + "loss": 0.4394, + "step": 6328 + }, + { + "epoch": 0.08585187194791101, + "grad_norm": 6.606595516204834, + "learning_rate": 9.235576264218172e-06, + "loss": 0.5037, + "step": 6329 + }, + { + "epoch": 0.0858654367878459, + "grad_norm": 6.157471656799316, + "learning_rate": 9.235439221597917e-06, + "loss": 0.4601, + "step": 6330 + }, + { + "epoch": 0.0858790016277808, + "grad_norm": 7.08103609085083, + "learning_rate": 9.235302178977664e-06, + "loss": 0.5729, + "step": 6331 + }, + { + "epoch": 0.08589256646771568, + "grad_norm": 8.259641647338867, + "learning_rate": 9.235165136357409e-06, + "loss": 0.4249, + "step": 6332 + }, + { + "epoch": 0.08590613130765057, + "grad_norm": 8.122651100158691, + "learning_rate": 9.235028093737152e-06, + "loss": 0.454, + "step": 6333 + }, + { + "epoch": 0.08591969614758546, + "grad_norm": 6.241043567657471, + "learning_rate": 9.234891051116897e-06, + "loss": 0.49, + "step": 6334 + }, + { + "epoch": 0.08593326098752034, + "grad_norm": 7.80369758605957, + "learning_rate": 9.234754008496644e-06, + "loss": 0.5705, + "step": 6335 + }, + { + "epoch": 0.08594682582745523, + "grad_norm": 9.117842674255371, + "learning_rate": 9.234616965876388e-06, + "loss": 0.4067, + "step": 6336 + }, + { + "epoch": 0.08596039066739013, + "grad_norm": 7.732356548309326, + "learning_rate": 9.234479923256133e-06, + "loss": 0.6352, + "step": 6337 + }, + { + "epoch": 0.08597395550732502, + "grad_norm": 8.73536205291748, + "learning_rate": 9.234342880635878e-06, + "loss": 0.5876, + "step": 6338 + }, + { + "epoch": 0.0859875203472599, + "grad_norm": 9.341818809509277, + "learning_rate": 9.234205838015623e-06, + "loss": 0.4342, + "step": 6339 + }, + { + "epoch": 0.08600108518719479, + "grad_norm": 8.977137565612793, + "learning_rate": 9.234068795395369e-06, + "loss": 0.6642, + "step": 6340 + }, + { + "epoch": 0.08601465002712969, + "grad_norm": 8.28685474395752, + "learning_rate": 9.233931752775114e-06, + "loss": 0.4608, + "step": 6341 + }, + { + "epoch": 0.08602821486706456, + "grad_norm": 7.3103346824646, + "learning_rate": 9.233794710154859e-06, + "loss": 0.5839, + "step": 6342 + }, + { + "epoch": 0.08604177970699946, + "grad_norm": 8.62594223022461, + "learning_rate": 9.233657667534604e-06, + "loss": 0.4688, + "step": 6343 + }, + { + "epoch": 0.08605534454693435, + "grad_norm": 5.9284539222717285, + "learning_rate": 9.23352062491435e-06, + "loss": 0.3716, + "step": 6344 + }, + { + "epoch": 0.08606890938686923, + "grad_norm": 7.447811603546143, + "learning_rate": 9.233383582294094e-06, + "loss": 0.4374, + "step": 6345 + }, + { + "epoch": 0.08608247422680412, + "grad_norm": 11.33387279510498, + "learning_rate": 9.23324653967384e-06, + "loss": 0.8568, + "step": 6346 + }, + { + "epoch": 0.08609603906673902, + "grad_norm": 8.06668472290039, + "learning_rate": 9.233109497053585e-06, + "loss": 0.3983, + "step": 6347 + }, + { + "epoch": 0.0861096039066739, + "grad_norm": 8.146337509155273, + "learning_rate": 9.23297245443333e-06, + "loss": 0.4966, + "step": 6348 + }, + { + "epoch": 0.08612316874660879, + "grad_norm": 7.563227653503418, + "learning_rate": 9.232835411813075e-06, + "loss": 0.6509, + "step": 6349 + }, + { + "epoch": 0.08613673358654368, + "grad_norm": 7.623045921325684, + "learning_rate": 9.23269836919282e-06, + "loss": 0.4637, + "step": 6350 + }, + { + "epoch": 0.08615029842647856, + "grad_norm": 7.928376197814941, + "learning_rate": 9.232561326572564e-06, + "loss": 0.6139, + "step": 6351 + }, + { + "epoch": 0.08616386326641345, + "grad_norm": 9.493436813354492, + "learning_rate": 9.23242428395231e-06, + "loss": 0.6181, + "step": 6352 + }, + { + "epoch": 0.08617742810634835, + "grad_norm": 9.19041919708252, + "learning_rate": 9.232287241332056e-06, + "loss": 0.5277, + "step": 6353 + }, + { + "epoch": 0.08619099294628324, + "grad_norm": 6.41585111618042, + "learning_rate": 9.2321501987118e-06, + "loss": 0.4179, + "step": 6354 + }, + { + "epoch": 0.08620455778621812, + "grad_norm": 8.007993698120117, + "learning_rate": 9.232013156091545e-06, + "loss": 0.413, + "step": 6355 + }, + { + "epoch": 0.08621812262615301, + "grad_norm": 8.298035621643066, + "learning_rate": 9.231876113471291e-06, + "loss": 0.3638, + "step": 6356 + }, + { + "epoch": 0.0862316874660879, + "grad_norm": 8.869283676147461, + "learning_rate": 9.231739070851035e-06, + "loss": 0.6305, + "step": 6357 + }, + { + "epoch": 0.08624525230602278, + "grad_norm": 6.250580310821533, + "learning_rate": 9.23160202823078e-06, + "loss": 0.4675, + "step": 6358 + }, + { + "epoch": 0.08625881714595768, + "grad_norm": 7.592681407928467, + "learning_rate": 9.231464985610525e-06, + "loss": 0.4611, + "step": 6359 + }, + { + "epoch": 0.08627238198589257, + "grad_norm": 9.520365715026855, + "learning_rate": 9.23132794299027e-06, + "loss": 0.6464, + "step": 6360 + }, + { + "epoch": 0.08628594682582745, + "grad_norm": 7.537420749664307, + "learning_rate": 9.231190900370016e-06, + "loss": 0.3702, + "step": 6361 + }, + { + "epoch": 0.08629951166576234, + "grad_norm": 8.972041130065918, + "learning_rate": 9.23105385774976e-06, + "loss": 0.5766, + "step": 6362 + }, + { + "epoch": 0.08631307650569724, + "grad_norm": 8.997112274169922, + "learning_rate": 9.230916815129506e-06, + "loss": 0.564, + "step": 6363 + }, + { + "epoch": 0.08632664134563212, + "grad_norm": 7.076391220092773, + "learning_rate": 9.230779772509251e-06, + "loss": 0.4917, + "step": 6364 + }, + { + "epoch": 0.08634020618556701, + "grad_norm": 8.948736190795898, + "learning_rate": 9.230642729888996e-06, + "loss": 0.599, + "step": 6365 + }, + { + "epoch": 0.0863537710255019, + "grad_norm": 6.879556655883789, + "learning_rate": 9.230505687268741e-06, + "loss": 0.4799, + "step": 6366 + }, + { + "epoch": 0.08636733586543678, + "grad_norm": 6.51353120803833, + "learning_rate": 9.230368644648487e-06, + "loss": 0.5405, + "step": 6367 + }, + { + "epoch": 0.08638090070537167, + "grad_norm": 7.011819839477539, + "learning_rate": 9.230231602028232e-06, + "loss": 0.4663, + "step": 6368 + }, + { + "epoch": 0.08639446554530657, + "grad_norm": 4.813879013061523, + "learning_rate": 9.230094559407977e-06, + "loss": 0.3376, + "step": 6369 + }, + { + "epoch": 0.08640803038524146, + "grad_norm": 10.691216468811035, + "learning_rate": 9.229957516787722e-06, + "loss": 0.7513, + "step": 6370 + }, + { + "epoch": 0.08642159522517634, + "grad_norm": 7.793283462524414, + "learning_rate": 9.229820474167467e-06, + "loss": 0.7388, + "step": 6371 + }, + { + "epoch": 0.08643516006511123, + "grad_norm": 8.81389331817627, + "learning_rate": 9.229683431547211e-06, + "loss": 0.6973, + "step": 6372 + }, + { + "epoch": 0.08644872490504613, + "grad_norm": 7.500302791595459, + "learning_rate": 9.229546388926956e-06, + "loss": 0.4801, + "step": 6373 + }, + { + "epoch": 0.086462289744981, + "grad_norm": 5.932229518890381, + "learning_rate": 9.229409346306703e-06, + "loss": 0.468, + "step": 6374 + }, + { + "epoch": 0.0864758545849159, + "grad_norm": 6.592504978179932, + "learning_rate": 9.229272303686448e-06, + "loss": 0.4442, + "step": 6375 + }, + { + "epoch": 0.08648941942485079, + "grad_norm": 6.387150287628174, + "learning_rate": 9.229135261066192e-06, + "loss": 0.3898, + "step": 6376 + }, + { + "epoch": 0.08650298426478567, + "grad_norm": 8.018961906433105, + "learning_rate": 9.228998218445937e-06, + "loss": 0.6349, + "step": 6377 + }, + { + "epoch": 0.08651654910472056, + "grad_norm": 5.6123738288879395, + "learning_rate": 9.228861175825684e-06, + "loss": 0.4371, + "step": 6378 + }, + { + "epoch": 0.08653011394465546, + "grad_norm": 5.061948776245117, + "learning_rate": 9.228724133205427e-06, + "loss": 0.2684, + "step": 6379 + }, + { + "epoch": 0.08654367878459034, + "grad_norm": 6.513360500335693, + "learning_rate": 9.228587090585172e-06, + "loss": 0.4292, + "step": 6380 + }, + { + "epoch": 0.08655724362452523, + "grad_norm": 6.384948253631592, + "learning_rate": 9.228450047964917e-06, + "loss": 0.4246, + "step": 6381 + }, + { + "epoch": 0.08657080846446012, + "grad_norm": 8.595000267028809, + "learning_rate": 9.228313005344663e-06, + "loss": 0.5576, + "step": 6382 + }, + { + "epoch": 0.086584373304395, + "grad_norm": 6.564844608306885, + "learning_rate": 9.228175962724408e-06, + "loss": 0.4573, + "step": 6383 + }, + { + "epoch": 0.0865979381443299, + "grad_norm": 7.283684253692627, + "learning_rate": 9.228038920104153e-06, + "loss": 0.4442, + "step": 6384 + }, + { + "epoch": 0.08661150298426479, + "grad_norm": 6.786616802215576, + "learning_rate": 9.227901877483898e-06, + "loss": 0.4003, + "step": 6385 + }, + { + "epoch": 0.08662506782419968, + "grad_norm": 5.902645587921143, + "learning_rate": 9.227764834863643e-06, + "loss": 0.4526, + "step": 6386 + }, + { + "epoch": 0.08663863266413456, + "grad_norm": 7.0834527015686035, + "learning_rate": 9.227627792243389e-06, + "loss": 0.4122, + "step": 6387 + }, + { + "epoch": 0.08665219750406945, + "grad_norm": 6.042565822601318, + "learning_rate": 9.227490749623134e-06, + "loss": 0.4246, + "step": 6388 + }, + { + "epoch": 0.08666576234400435, + "grad_norm": 8.80554485321045, + "learning_rate": 9.227353707002879e-06, + "loss": 0.511, + "step": 6389 + }, + { + "epoch": 0.08667932718393923, + "grad_norm": 5.961683750152588, + "learning_rate": 9.227216664382624e-06, + "loss": 0.4253, + "step": 6390 + }, + { + "epoch": 0.08669289202387412, + "grad_norm": 8.175402641296387, + "learning_rate": 9.22707962176237e-06, + "loss": 0.4712, + "step": 6391 + }, + { + "epoch": 0.08670645686380901, + "grad_norm": 7.986851215362549, + "learning_rate": 9.226942579142114e-06, + "loss": 0.5352, + "step": 6392 + }, + { + "epoch": 0.08672002170374389, + "grad_norm": 6.359777927398682, + "learning_rate": 9.22680553652186e-06, + "loss": 0.5483, + "step": 6393 + }, + { + "epoch": 0.08673358654367878, + "grad_norm": 5.708643436431885, + "learning_rate": 9.226668493901603e-06, + "loss": 0.5776, + "step": 6394 + }, + { + "epoch": 0.08674715138361368, + "grad_norm": 6.167367458343506, + "learning_rate": 9.22653145128135e-06, + "loss": 0.3316, + "step": 6395 + }, + { + "epoch": 0.08676071622354856, + "grad_norm": 6.413482666015625, + "learning_rate": 9.226394408661095e-06, + "loss": 0.4938, + "step": 6396 + }, + { + "epoch": 0.08677428106348345, + "grad_norm": 4.977008819580078, + "learning_rate": 9.226257366040839e-06, + "loss": 0.3449, + "step": 6397 + }, + { + "epoch": 0.08678784590341834, + "grad_norm": 5.88831901550293, + "learning_rate": 9.226120323420584e-06, + "loss": 0.3421, + "step": 6398 + }, + { + "epoch": 0.08680141074335322, + "grad_norm": 5.654181480407715, + "learning_rate": 9.225983280800329e-06, + "loss": 0.3852, + "step": 6399 + }, + { + "epoch": 0.08681497558328811, + "grad_norm": 5.2696733474731445, + "learning_rate": 9.225846238180076e-06, + "loss": 0.3692, + "step": 6400 + }, + { + "epoch": 0.08682854042322301, + "grad_norm": 4.9575042724609375, + "learning_rate": 9.22570919555982e-06, + "loss": 0.3205, + "step": 6401 + }, + { + "epoch": 0.0868421052631579, + "grad_norm": 7.896849155426025, + "learning_rate": 9.225572152939565e-06, + "loss": 0.381, + "step": 6402 + }, + { + "epoch": 0.08685567010309278, + "grad_norm": 6.848274230957031, + "learning_rate": 9.22543511031931e-06, + "loss": 0.4501, + "step": 6403 + }, + { + "epoch": 0.08686923494302767, + "grad_norm": 7.6852641105651855, + "learning_rate": 9.225298067699055e-06, + "loss": 0.3739, + "step": 6404 + }, + { + "epoch": 0.08688279978296257, + "grad_norm": 4.626884460449219, + "learning_rate": 9.2251610250788e-06, + "loss": 0.3638, + "step": 6405 + }, + { + "epoch": 0.08689636462289745, + "grad_norm": 8.742130279541016, + "learning_rate": 9.225023982458545e-06, + "loss": 0.4795, + "step": 6406 + }, + { + "epoch": 0.08690992946283234, + "grad_norm": 6.517886161804199, + "learning_rate": 9.22488693983829e-06, + "loss": 0.286, + "step": 6407 + }, + { + "epoch": 0.08692349430276723, + "grad_norm": 6.757209777832031, + "learning_rate": 9.224749897218036e-06, + "loss": 0.4466, + "step": 6408 + }, + { + "epoch": 0.08693705914270211, + "grad_norm": 7.421329498291016, + "learning_rate": 9.22461285459778e-06, + "loss": 0.5565, + "step": 6409 + }, + { + "epoch": 0.086950623982637, + "grad_norm": 7.419090747833252, + "learning_rate": 9.224475811977526e-06, + "loss": 0.5828, + "step": 6410 + }, + { + "epoch": 0.0869641888225719, + "grad_norm": 7.010570526123047, + "learning_rate": 9.224338769357271e-06, + "loss": 0.6597, + "step": 6411 + }, + { + "epoch": 0.08697775366250678, + "grad_norm": 5.421902179718018, + "learning_rate": 9.224201726737015e-06, + "loss": 0.4681, + "step": 6412 + }, + { + "epoch": 0.08699131850244167, + "grad_norm": 8.847437858581543, + "learning_rate": 9.224064684116762e-06, + "loss": 0.5132, + "step": 6413 + }, + { + "epoch": 0.08700488334237656, + "grad_norm": 9.380134582519531, + "learning_rate": 9.223927641496507e-06, + "loss": 0.7398, + "step": 6414 + }, + { + "epoch": 0.08701844818231144, + "grad_norm": 8.125078201293945, + "learning_rate": 9.223790598876252e-06, + "loss": 0.615, + "step": 6415 + }, + { + "epoch": 0.08703201302224634, + "grad_norm": 8.73622989654541, + "learning_rate": 9.223653556255995e-06, + "loss": 0.7287, + "step": 6416 + }, + { + "epoch": 0.08704557786218123, + "grad_norm": 7.888835430145264, + "learning_rate": 9.223516513635742e-06, + "loss": 0.5439, + "step": 6417 + }, + { + "epoch": 0.08705914270211612, + "grad_norm": 4.901173114776611, + "learning_rate": 9.223379471015487e-06, + "loss": 0.2425, + "step": 6418 + }, + { + "epoch": 0.087072707542051, + "grad_norm": 8.069513320922852, + "learning_rate": 9.223242428395231e-06, + "loss": 0.4599, + "step": 6419 + }, + { + "epoch": 0.0870862723819859, + "grad_norm": 7.172439098358154, + "learning_rate": 9.223105385774976e-06, + "loss": 0.5567, + "step": 6420 + }, + { + "epoch": 0.08709983722192079, + "grad_norm": 7.379103660583496, + "learning_rate": 9.222968343154723e-06, + "loss": 0.476, + "step": 6421 + }, + { + "epoch": 0.08711340206185567, + "grad_norm": 6.470625400543213, + "learning_rate": 9.222831300534466e-06, + "loss": 0.4417, + "step": 6422 + }, + { + "epoch": 0.08712696690179056, + "grad_norm": 9.533918380737305, + "learning_rate": 9.222694257914212e-06, + "loss": 0.6035, + "step": 6423 + }, + { + "epoch": 0.08714053174172545, + "grad_norm": 8.095354080200195, + "learning_rate": 9.222557215293957e-06, + "loss": 0.5118, + "step": 6424 + }, + { + "epoch": 0.08715409658166033, + "grad_norm": 8.12088680267334, + "learning_rate": 9.222420172673704e-06, + "loss": 0.4921, + "step": 6425 + }, + { + "epoch": 0.08716766142159522, + "grad_norm": 8.549284934997559, + "learning_rate": 9.222283130053447e-06, + "loss": 0.5674, + "step": 6426 + }, + { + "epoch": 0.08718122626153012, + "grad_norm": 5.377631664276123, + "learning_rate": 9.222146087433192e-06, + "loss": 0.3444, + "step": 6427 + }, + { + "epoch": 0.087194791101465, + "grad_norm": 8.940871238708496, + "learning_rate": 9.222009044812937e-06, + "loss": 0.5085, + "step": 6428 + }, + { + "epoch": 0.08720835594139989, + "grad_norm": 5.863015174865723, + "learning_rate": 9.221872002192683e-06, + "loss": 0.4594, + "step": 6429 + }, + { + "epoch": 0.08722192078133478, + "grad_norm": 7.547945022583008, + "learning_rate": 9.221734959572428e-06, + "loss": 0.4173, + "step": 6430 + }, + { + "epoch": 0.08723548562126966, + "grad_norm": 6.09746789932251, + "learning_rate": 9.221597916952173e-06, + "loss": 0.3942, + "step": 6431 + }, + { + "epoch": 0.08724905046120456, + "grad_norm": 7.346202373504639, + "learning_rate": 9.221460874331918e-06, + "loss": 0.4133, + "step": 6432 + }, + { + "epoch": 0.08726261530113945, + "grad_norm": 6.749691009521484, + "learning_rate": 9.221323831711663e-06, + "loss": 0.5349, + "step": 6433 + }, + { + "epoch": 0.08727618014107434, + "grad_norm": 9.491789817810059, + "learning_rate": 9.221186789091409e-06, + "loss": 0.5181, + "step": 6434 + }, + { + "epoch": 0.08728974498100922, + "grad_norm": 9.204354286193848, + "learning_rate": 9.221049746471154e-06, + "loss": 0.5226, + "step": 6435 + }, + { + "epoch": 0.08730330982094411, + "grad_norm": 4.617055892944336, + "learning_rate": 9.220912703850899e-06, + "loss": 0.404, + "step": 6436 + }, + { + "epoch": 0.08731687466087901, + "grad_norm": 7.702177047729492, + "learning_rate": 9.220775661230642e-06, + "loss": 0.533, + "step": 6437 + }, + { + "epoch": 0.08733043950081389, + "grad_norm": 5.250677585601807, + "learning_rate": 9.22063861861039e-06, + "loss": 0.5105, + "step": 6438 + }, + { + "epoch": 0.08734400434074878, + "grad_norm": 6.100697040557861, + "learning_rate": 9.220501575990134e-06, + "loss": 0.4576, + "step": 6439 + }, + { + "epoch": 0.08735756918068367, + "grad_norm": 6.340228080749512, + "learning_rate": 9.22036453336988e-06, + "loss": 0.4695, + "step": 6440 + }, + { + "epoch": 0.08737113402061855, + "grad_norm": 6.555745601654053, + "learning_rate": 9.220227490749623e-06, + "loss": 0.4301, + "step": 6441 + }, + { + "epoch": 0.08738469886055344, + "grad_norm": 6.206214427947998, + "learning_rate": 9.220090448129368e-06, + "loss": 0.3966, + "step": 6442 + }, + { + "epoch": 0.08739826370048834, + "grad_norm": 6.907894611358643, + "learning_rate": 9.219953405509115e-06, + "loss": 0.5217, + "step": 6443 + }, + { + "epoch": 0.08741182854042322, + "grad_norm": 7.8739728927612305, + "learning_rate": 9.219816362888859e-06, + "loss": 0.4451, + "step": 6444 + }, + { + "epoch": 0.08742539338035811, + "grad_norm": 7.432534694671631, + "learning_rate": 9.219679320268604e-06, + "loss": 0.4527, + "step": 6445 + }, + { + "epoch": 0.087438958220293, + "grad_norm": 6.9938130378723145, + "learning_rate": 9.219542277648349e-06, + "loss": 0.4272, + "step": 6446 + }, + { + "epoch": 0.08745252306022788, + "grad_norm": 6.260620594024658, + "learning_rate": 9.219405235028094e-06, + "loss": 0.4272, + "step": 6447 + }, + { + "epoch": 0.08746608790016278, + "grad_norm": 5.718122482299805, + "learning_rate": 9.21926819240784e-06, + "loss": 0.3104, + "step": 6448 + }, + { + "epoch": 0.08747965274009767, + "grad_norm": 7.861425876617432, + "learning_rate": 9.219131149787585e-06, + "loss": 0.4875, + "step": 6449 + }, + { + "epoch": 0.08749321758003256, + "grad_norm": 7.106180667877197, + "learning_rate": 9.21899410716733e-06, + "loss": 0.5575, + "step": 6450 + }, + { + "epoch": 0.08750678241996744, + "grad_norm": 6.646599292755127, + "learning_rate": 9.218857064547075e-06, + "loss": 0.3591, + "step": 6451 + }, + { + "epoch": 0.08752034725990233, + "grad_norm": 5.428907871246338, + "learning_rate": 9.21872002192682e-06, + "loss": 0.4279, + "step": 6452 + }, + { + "epoch": 0.08753391209983723, + "grad_norm": 6.082606792449951, + "learning_rate": 9.218582979306565e-06, + "loss": 0.4266, + "step": 6453 + }, + { + "epoch": 0.0875474769397721, + "grad_norm": 7.175093650817871, + "learning_rate": 9.21844593668631e-06, + "loss": 0.4241, + "step": 6454 + }, + { + "epoch": 0.087561041779707, + "grad_norm": 6.1567816734313965, + "learning_rate": 9.218308894066054e-06, + "loss": 0.3532, + "step": 6455 + }, + { + "epoch": 0.08757460661964189, + "grad_norm": 6.442914962768555, + "learning_rate": 9.2181718514458e-06, + "loss": 0.5036, + "step": 6456 + }, + { + "epoch": 0.08758817145957677, + "grad_norm": 8.398568153381348, + "learning_rate": 9.218034808825546e-06, + "loss": 0.4629, + "step": 6457 + }, + { + "epoch": 0.08760173629951166, + "grad_norm": 5.158389091491699, + "learning_rate": 9.217897766205291e-06, + "loss": 0.3372, + "step": 6458 + }, + { + "epoch": 0.08761530113944656, + "grad_norm": 5.478622913360596, + "learning_rate": 9.217760723585035e-06, + "loss": 0.4199, + "step": 6459 + }, + { + "epoch": 0.08762886597938144, + "grad_norm": 7.531490325927734, + "learning_rate": 9.217623680964782e-06, + "loss": 0.4555, + "step": 6460 + }, + { + "epoch": 0.08764243081931633, + "grad_norm": 9.306991577148438, + "learning_rate": 9.217486638344527e-06, + "loss": 0.465, + "step": 6461 + }, + { + "epoch": 0.08765599565925122, + "grad_norm": 6.083556652069092, + "learning_rate": 9.21734959572427e-06, + "loss": 0.3366, + "step": 6462 + }, + { + "epoch": 0.0876695604991861, + "grad_norm": 6.515013217926025, + "learning_rate": 9.217212553104015e-06, + "loss": 0.5317, + "step": 6463 + }, + { + "epoch": 0.087683125339121, + "grad_norm": 5.015127658843994, + "learning_rate": 9.217075510483762e-06, + "loss": 0.4141, + "step": 6464 + }, + { + "epoch": 0.08769669017905589, + "grad_norm": 5.3730669021606445, + "learning_rate": 9.216938467863506e-06, + "loss": 0.3362, + "step": 6465 + }, + { + "epoch": 0.08771025501899078, + "grad_norm": 4.873570442199707, + "learning_rate": 9.216801425243251e-06, + "loss": 0.3097, + "step": 6466 + }, + { + "epoch": 0.08772381985892566, + "grad_norm": 8.039539337158203, + "learning_rate": 9.216664382622996e-06, + "loss": 0.5777, + "step": 6467 + }, + { + "epoch": 0.08773738469886055, + "grad_norm": 4.592296123504639, + "learning_rate": 9.216527340002741e-06, + "loss": 0.2098, + "step": 6468 + }, + { + "epoch": 0.08775094953879545, + "grad_norm": 6.273338794708252, + "learning_rate": 9.216390297382486e-06, + "loss": 0.4543, + "step": 6469 + }, + { + "epoch": 0.08776451437873033, + "grad_norm": 7.278250217437744, + "learning_rate": 9.216253254762232e-06, + "loss": 0.5586, + "step": 6470 + }, + { + "epoch": 0.08777807921866522, + "grad_norm": 5.162135601043701, + "learning_rate": 9.216116212141977e-06, + "loss": 0.2329, + "step": 6471 + }, + { + "epoch": 0.08779164405860011, + "grad_norm": 8.303410530090332, + "learning_rate": 9.215979169521722e-06, + "loss": 0.8452, + "step": 6472 + }, + { + "epoch": 0.08780520889853499, + "grad_norm": 5.954988479614258, + "learning_rate": 9.215842126901467e-06, + "loss": 0.3772, + "step": 6473 + }, + { + "epoch": 0.08781877373846989, + "grad_norm": 5.228967666625977, + "learning_rate": 9.215705084281212e-06, + "loss": 0.3282, + "step": 6474 + }, + { + "epoch": 0.08783233857840478, + "grad_norm": 6.5911865234375, + "learning_rate": 9.215568041660958e-06, + "loss": 0.3739, + "step": 6475 + }, + { + "epoch": 0.08784590341833966, + "grad_norm": 6.716029644012451, + "learning_rate": 9.215430999040703e-06, + "loss": 0.3875, + "step": 6476 + }, + { + "epoch": 0.08785946825827455, + "grad_norm": 7.907783031463623, + "learning_rate": 9.215293956420448e-06, + "loss": 0.4738, + "step": 6477 + }, + { + "epoch": 0.08787303309820944, + "grad_norm": 4.8655924797058105, + "learning_rate": 9.215156913800193e-06, + "loss": 0.2228, + "step": 6478 + }, + { + "epoch": 0.08788659793814432, + "grad_norm": 5.766753196716309, + "learning_rate": 9.215019871179938e-06, + "loss": 0.3593, + "step": 6479 + }, + { + "epoch": 0.08790016277807922, + "grad_norm": 4.789899826049805, + "learning_rate": 9.214882828559682e-06, + "loss": 0.3952, + "step": 6480 + }, + { + "epoch": 0.08791372761801411, + "grad_norm": 6.8211212158203125, + "learning_rate": 9.214745785939427e-06, + "loss": 0.3622, + "step": 6481 + }, + { + "epoch": 0.087927292457949, + "grad_norm": 6.701781272888184, + "learning_rate": 9.214608743319174e-06, + "loss": 0.5065, + "step": 6482 + }, + { + "epoch": 0.08794085729788388, + "grad_norm": 5.952450275421143, + "learning_rate": 9.214471700698919e-06, + "loss": 0.3233, + "step": 6483 + }, + { + "epoch": 0.08795442213781877, + "grad_norm": 5.296866416931152, + "learning_rate": 9.214334658078662e-06, + "loss": 0.3589, + "step": 6484 + }, + { + "epoch": 0.08796798697775367, + "grad_norm": 5.844508647918701, + "learning_rate": 9.214197615458408e-06, + "loss": 0.4141, + "step": 6485 + }, + { + "epoch": 0.08798155181768855, + "grad_norm": 7.612835884094238, + "learning_rate": 9.214060572838154e-06, + "loss": 0.4418, + "step": 6486 + }, + { + "epoch": 0.08799511665762344, + "grad_norm": 6.849738597869873, + "learning_rate": 9.213923530217898e-06, + "loss": 0.3187, + "step": 6487 + }, + { + "epoch": 0.08800868149755833, + "grad_norm": 4.437312602996826, + "learning_rate": 9.213786487597643e-06, + "loss": 0.2419, + "step": 6488 + }, + { + "epoch": 0.08802224633749321, + "grad_norm": 6.726041793823242, + "learning_rate": 9.213649444977388e-06, + "loss": 0.631, + "step": 6489 + }, + { + "epoch": 0.0880358111774281, + "grad_norm": 5.427779197692871, + "learning_rate": 9.213512402357134e-06, + "loss": 0.2834, + "step": 6490 + }, + { + "epoch": 0.088049376017363, + "grad_norm": 7.267885208129883, + "learning_rate": 9.213375359736879e-06, + "loss": 0.4951, + "step": 6491 + }, + { + "epoch": 0.08806294085729788, + "grad_norm": 6.37389612197876, + "learning_rate": 9.213238317116624e-06, + "loss": 0.3367, + "step": 6492 + }, + { + "epoch": 0.08807650569723277, + "grad_norm": 9.895709991455078, + "learning_rate": 9.213101274496369e-06, + "loss": 0.4596, + "step": 6493 + }, + { + "epoch": 0.08809007053716766, + "grad_norm": 6.484055042266846, + "learning_rate": 9.212964231876114e-06, + "loss": 0.3538, + "step": 6494 + }, + { + "epoch": 0.08810363537710256, + "grad_norm": 8.263452529907227, + "learning_rate": 9.21282718925586e-06, + "loss": 0.4548, + "step": 6495 + }, + { + "epoch": 0.08811720021703744, + "grad_norm": 4.401450157165527, + "learning_rate": 9.212690146635605e-06, + "loss": 0.3052, + "step": 6496 + }, + { + "epoch": 0.08813076505697233, + "grad_norm": 5.53735876083374, + "learning_rate": 9.21255310401535e-06, + "loss": 0.3622, + "step": 6497 + }, + { + "epoch": 0.08814432989690722, + "grad_norm": 5.593347072601318, + "learning_rate": 9.212416061395095e-06, + "loss": 0.4221, + "step": 6498 + }, + { + "epoch": 0.0881578947368421, + "grad_norm": 5.996750831604004, + "learning_rate": 9.21227901877484e-06, + "loss": 0.274, + "step": 6499 + }, + { + "epoch": 0.088171459576777, + "grad_norm": 8.340768814086914, + "learning_rate": 9.212141976154585e-06, + "loss": 0.6575, + "step": 6500 + }, + { + "epoch": 0.08818502441671189, + "grad_norm": 5.323676109313965, + "learning_rate": 9.21200493353433e-06, + "loss": 0.3042, + "step": 6501 + }, + { + "epoch": 0.08819858925664677, + "grad_norm": 6.757925033569336, + "learning_rate": 9.211867890914074e-06, + "loss": 0.3681, + "step": 6502 + }, + { + "epoch": 0.08821215409658166, + "grad_norm": 5.778159141540527, + "learning_rate": 9.21173084829382e-06, + "loss": 0.4663, + "step": 6503 + }, + { + "epoch": 0.08822571893651655, + "grad_norm": 7.100856781005859, + "learning_rate": 9.211593805673566e-06, + "loss": 0.5936, + "step": 6504 + }, + { + "epoch": 0.08823928377645143, + "grad_norm": 5.355169773101807, + "learning_rate": 9.21145676305331e-06, + "loss": 0.3724, + "step": 6505 + }, + { + "epoch": 0.08825284861638633, + "grad_norm": 6.093937873840332, + "learning_rate": 9.211319720433055e-06, + "loss": 0.4111, + "step": 6506 + }, + { + "epoch": 0.08826641345632122, + "grad_norm": 4.494277477264404, + "learning_rate": 9.211182677812802e-06, + "loss": 0.3176, + "step": 6507 + }, + { + "epoch": 0.0882799782962561, + "grad_norm": 5.083437919616699, + "learning_rate": 9.211045635192547e-06, + "loss": 0.3647, + "step": 6508 + }, + { + "epoch": 0.08829354313619099, + "grad_norm": 5.275075435638428, + "learning_rate": 9.21090859257229e-06, + "loss": 0.3428, + "step": 6509 + }, + { + "epoch": 0.08830710797612588, + "grad_norm": 5.915680408477783, + "learning_rate": 9.210771549952035e-06, + "loss": 0.382, + "step": 6510 + }, + { + "epoch": 0.08832067281606078, + "grad_norm": 5.4041218757629395, + "learning_rate": 9.21063450733178e-06, + "loss": 0.462, + "step": 6511 + }, + { + "epoch": 0.08833423765599566, + "grad_norm": 7.102237224578857, + "learning_rate": 9.210497464711526e-06, + "loss": 0.357, + "step": 6512 + }, + { + "epoch": 0.08834780249593055, + "grad_norm": 6.530831336975098, + "learning_rate": 9.210360422091271e-06, + "loss": 0.3426, + "step": 6513 + }, + { + "epoch": 0.08836136733586544, + "grad_norm": 7.33669376373291, + "learning_rate": 9.210223379471016e-06, + "loss": 0.3969, + "step": 6514 + }, + { + "epoch": 0.08837493217580032, + "grad_norm": 6.789733409881592, + "learning_rate": 9.210086336850761e-06, + "loss": 0.4017, + "step": 6515 + }, + { + "epoch": 0.08838849701573522, + "grad_norm": 8.025328636169434, + "learning_rate": 9.209949294230506e-06, + "loss": 0.507, + "step": 6516 + }, + { + "epoch": 0.08840206185567011, + "grad_norm": 7.598030090332031, + "learning_rate": 9.209812251610252e-06, + "loss": 0.4151, + "step": 6517 + }, + { + "epoch": 0.08841562669560499, + "grad_norm": 7.247796058654785, + "learning_rate": 9.209675208989997e-06, + "loss": 0.4292, + "step": 6518 + }, + { + "epoch": 0.08842919153553988, + "grad_norm": 7.228331565856934, + "learning_rate": 9.209538166369742e-06, + "loss": 0.4642, + "step": 6519 + }, + { + "epoch": 0.08844275637547477, + "grad_norm": 6.910279273986816, + "learning_rate": 9.209401123749487e-06, + "loss": 0.514, + "step": 6520 + }, + { + "epoch": 0.08845632121540965, + "grad_norm": 5.5607805252075195, + "learning_rate": 9.209264081129232e-06, + "loss": 0.3356, + "step": 6521 + }, + { + "epoch": 0.08846988605534455, + "grad_norm": 5.88557243347168, + "learning_rate": 9.209127038508978e-06, + "loss": 0.3158, + "step": 6522 + }, + { + "epoch": 0.08848345089527944, + "grad_norm": 5.932881832122803, + "learning_rate": 9.208989995888723e-06, + "loss": 0.3651, + "step": 6523 + }, + { + "epoch": 0.08849701573521432, + "grad_norm": 8.689351081848145, + "learning_rate": 9.208852953268466e-06, + "loss": 0.5049, + "step": 6524 + }, + { + "epoch": 0.08851058057514921, + "grad_norm": 6.923794746398926, + "learning_rate": 9.208715910648213e-06, + "loss": 0.423, + "step": 6525 + }, + { + "epoch": 0.0885241454150841, + "grad_norm": 7.7035231590271, + "learning_rate": 9.208578868027958e-06, + "loss": 0.4628, + "step": 6526 + }, + { + "epoch": 0.088537710255019, + "grad_norm": 6.414546012878418, + "learning_rate": 9.208441825407702e-06, + "loss": 0.4753, + "step": 6527 + }, + { + "epoch": 0.08855127509495388, + "grad_norm": 5.327549934387207, + "learning_rate": 9.208304782787447e-06, + "loss": 0.3464, + "step": 6528 + }, + { + "epoch": 0.08856483993488877, + "grad_norm": 6.6741623878479, + "learning_rate": 9.208167740167194e-06, + "loss": 0.3185, + "step": 6529 + }, + { + "epoch": 0.08857840477482366, + "grad_norm": 6.519922733306885, + "learning_rate": 9.208030697546937e-06, + "loss": 0.4095, + "step": 6530 + }, + { + "epoch": 0.08859196961475854, + "grad_norm": 7.753058433532715, + "learning_rate": 9.207893654926682e-06, + "loss": 0.4882, + "step": 6531 + }, + { + "epoch": 0.08860553445469344, + "grad_norm": 6.2137956619262695, + "learning_rate": 9.207756612306428e-06, + "loss": 0.3841, + "step": 6532 + }, + { + "epoch": 0.08861909929462833, + "grad_norm": 8.579879760742188, + "learning_rate": 9.207619569686174e-06, + "loss": 0.5528, + "step": 6533 + }, + { + "epoch": 0.08863266413456321, + "grad_norm": 7.121298789978027, + "learning_rate": 9.207482527065918e-06, + "loss": 0.5663, + "step": 6534 + }, + { + "epoch": 0.0886462289744981, + "grad_norm": 9.190324783325195, + "learning_rate": 9.207345484445663e-06, + "loss": 0.4467, + "step": 6535 + }, + { + "epoch": 0.088659793814433, + "grad_norm": 7.344692230224609, + "learning_rate": 9.207208441825408e-06, + "loss": 0.4507, + "step": 6536 + }, + { + "epoch": 0.08867335865436787, + "grad_norm": 6.926096439361572, + "learning_rate": 9.207071399205154e-06, + "loss": 0.3618, + "step": 6537 + }, + { + "epoch": 0.08868692349430277, + "grad_norm": 6.046044826507568, + "learning_rate": 9.206934356584899e-06, + "loss": 0.4779, + "step": 6538 + }, + { + "epoch": 0.08870048833423766, + "grad_norm": 5.877082824707031, + "learning_rate": 9.206797313964644e-06, + "loss": 0.3238, + "step": 6539 + }, + { + "epoch": 0.08871405317417254, + "grad_norm": 5.651632308959961, + "learning_rate": 9.206660271344389e-06, + "loss": 0.3545, + "step": 6540 + }, + { + "epoch": 0.08872761801410743, + "grad_norm": 6.967787265777588, + "learning_rate": 9.206523228724134e-06, + "loss": 0.5903, + "step": 6541 + }, + { + "epoch": 0.08874118285404232, + "grad_norm": 5.524755477905273, + "learning_rate": 9.20638618610388e-06, + "loss": 0.354, + "step": 6542 + }, + { + "epoch": 0.08875474769397722, + "grad_norm": 7.515805721282959, + "learning_rate": 9.206249143483625e-06, + "loss": 0.3619, + "step": 6543 + }, + { + "epoch": 0.0887683125339121, + "grad_norm": 7.457285404205322, + "learning_rate": 9.20611210086337e-06, + "loss": 0.3085, + "step": 6544 + }, + { + "epoch": 0.08878187737384699, + "grad_norm": 8.61219310760498, + "learning_rate": 9.205975058243113e-06, + "loss": 0.5318, + "step": 6545 + }, + { + "epoch": 0.08879544221378188, + "grad_norm": 7.855340957641602, + "learning_rate": 9.20583801562286e-06, + "loss": 0.4218, + "step": 6546 + }, + { + "epoch": 0.08880900705371676, + "grad_norm": 6.994015216827393, + "learning_rate": 9.205700973002605e-06, + "loss": 0.4593, + "step": 6547 + }, + { + "epoch": 0.08882257189365166, + "grad_norm": 6.453470706939697, + "learning_rate": 9.205563930382349e-06, + "loss": 0.4961, + "step": 6548 + }, + { + "epoch": 0.08883613673358655, + "grad_norm": 9.41592025756836, + "learning_rate": 9.205426887762094e-06, + "loss": 0.6156, + "step": 6549 + }, + { + "epoch": 0.08884970157352143, + "grad_norm": 8.2189302444458, + "learning_rate": 9.205289845141839e-06, + "loss": 0.6887, + "step": 6550 + }, + { + "epoch": 0.08886326641345632, + "grad_norm": 10.462276458740234, + "learning_rate": 9.205152802521586e-06, + "loss": 0.5992, + "step": 6551 + }, + { + "epoch": 0.08887683125339121, + "grad_norm": 6.97613000869751, + "learning_rate": 9.20501575990133e-06, + "loss": 0.5114, + "step": 6552 + }, + { + "epoch": 0.0888903960933261, + "grad_norm": 4.763612747192383, + "learning_rate": 9.204878717281075e-06, + "loss": 0.401, + "step": 6553 + }, + { + "epoch": 0.08890396093326099, + "grad_norm": 7.282792091369629, + "learning_rate": 9.20474167466082e-06, + "loss": 0.3885, + "step": 6554 + }, + { + "epoch": 0.08891752577319588, + "grad_norm": 7.352873802185059, + "learning_rate": 9.204604632040565e-06, + "loss": 0.4877, + "step": 6555 + }, + { + "epoch": 0.08893109061313076, + "grad_norm": 6.9966044425964355, + "learning_rate": 9.20446758942031e-06, + "loss": 0.3762, + "step": 6556 + }, + { + "epoch": 0.08894465545306565, + "grad_norm": 6.908959865570068, + "learning_rate": 9.204330546800055e-06, + "loss": 0.4607, + "step": 6557 + }, + { + "epoch": 0.08895822029300055, + "grad_norm": 8.30161190032959, + "learning_rate": 9.2041935041798e-06, + "loss": 0.471, + "step": 6558 + }, + { + "epoch": 0.08897178513293544, + "grad_norm": 8.631653785705566, + "learning_rate": 9.204056461559546e-06, + "loss": 0.4635, + "step": 6559 + }, + { + "epoch": 0.08898534997287032, + "grad_norm": 5.068355083465576, + "learning_rate": 9.203919418939291e-06, + "loss": 0.2714, + "step": 6560 + }, + { + "epoch": 0.08899891481280521, + "grad_norm": 8.348318099975586, + "learning_rate": 9.203782376319036e-06, + "loss": 0.7646, + "step": 6561 + }, + { + "epoch": 0.0890124796527401, + "grad_norm": 5.754628658294678, + "learning_rate": 9.203645333698781e-06, + "loss": 0.2895, + "step": 6562 + }, + { + "epoch": 0.08902604449267498, + "grad_norm": 6.730493068695068, + "learning_rate": 9.203508291078526e-06, + "loss": 0.4034, + "step": 6563 + }, + { + "epoch": 0.08903960933260988, + "grad_norm": 9.065537452697754, + "learning_rate": 9.203371248458272e-06, + "loss": 0.6, + "step": 6564 + }, + { + "epoch": 0.08905317417254477, + "grad_norm": 8.296812057495117, + "learning_rate": 9.203234205838017e-06, + "loss": 0.5082, + "step": 6565 + }, + { + "epoch": 0.08906673901247965, + "grad_norm": 7.59661865234375, + "learning_rate": 9.203097163217762e-06, + "loss": 0.376, + "step": 6566 + }, + { + "epoch": 0.08908030385241454, + "grad_norm": 5.497774124145508, + "learning_rate": 9.202960120597505e-06, + "loss": 0.3297, + "step": 6567 + }, + { + "epoch": 0.08909386869234943, + "grad_norm": 6.900774002075195, + "learning_rate": 9.202823077977252e-06, + "loss": 0.4136, + "step": 6568 + }, + { + "epoch": 0.08910743353228431, + "grad_norm": 7.670721530914307, + "learning_rate": 9.202686035356998e-06, + "loss": 0.3946, + "step": 6569 + }, + { + "epoch": 0.0891209983722192, + "grad_norm": 7.814865589141846, + "learning_rate": 9.202548992736741e-06, + "loss": 0.4598, + "step": 6570 + }, + { + "epoch": 0.0891345632121541, + "grad_norm": 4.466545581817627, + "learning_rate": 9.202411950116486e-06, + "loss": 0.2133, + "step": 6571 + }, + { + "epoch": 0.08914812805208898, + "grad_norm": 5.646350383758545, + "learning_rate": 9.202274907496233e-06, + "loss": 0.3237, + "step": 6572 + }, + { + "epoch": 0.08916169289202387, + "grad_norm": 5.6966705322265625, + "learning_rate": 9.202137864875977e-06, + "loss": 0.2858, + "step": 6573 + }, + { + "epoch": 0.08917525773195877, + "grad_norm": 8.430232048034668, + "learning_rate": 9.202000822255722e-06, + "loss": 0.5661, + "step": 6574 + }, + { + "epoch": 0.08918882257189366, + "grad_norm": 5.816620826721191, + "learning_rate": 9.201863779635467e-06, + "loss": 0.3595, + "step": 6575 + }, + { + "epoch": 0.08920238741182854, + "grad_norm": 7.53162145614624, + "learning_rate": 9.201726737015214e-06, + "loss": 0.5084, + "step": 6576 + }, + { + "epoch": 0.08921595225176343, + "grad_norm": 7.486446380615234, + "learning_rate": 9.201589694394957e-06, + "loss": 0.5061, + "step": 6577 + }, + { + "epoch": 0.08922951709169832, + "grad_norm": 7.160358905792236, + "learning_rate": 9.201452651774702e-06, + "loss": 0.519, + "step": 6578 + }, + { + "epoch": 0.0892430819316332, + "grad_norm": 5.539538860321045, + "learning_rate": 9.201315609154448e-06, + "loss": 0.355, + "step": 6579 + }, + { + "epoch": 0.0892566467715681, + "grad_norm": 6.7116241455078125, + "learning_rate": 9.201178566534193e-06, + "loss": 0.3735, + "step": 6580 + }, + { + "epoch": 0.08927021161150299, + "grad_norm": 6.646668434143066, + "learning_rate": 9.201041523913938e-06, + "loss": 0.4439, + "step": 6581 + }, + { + "epoch": 0.08928377645143787, + "grad_norm": 4.44443416595459, + "learning_rate": 9.200904481293683e-06, + "loss": 0.3334, + "step": 6582 + }, + { + "epoch": 0.08929734129137276, + "grad_norm": 8.177210807800293, + "learning_rate": 9.200767438673428e-06, + "loss": 0.3606, + "step": 6583 + }, + { + "epoch": 0.08931090613130765, + "grad_norm": 4.393777370452881, + "learning_rate": 9.200630396053174e-06, + "loss": 0.3086, + "step": 6584 + }, + { + "epoch": 0.08932447097124253, + "grad_norm": 6.845628261566162, + "learning_rate": 9.200493353432919e-06, + "loss": 0.4952, + "step": 6585 + }, + { + "epoch": 0.08933803581117743, + "grad_norm": 5.229225158691406, + "learning_rate": 9.200356310812664e-06, + "loss": 0.4145, + "step": 6586 + }, + { + "epoch": 0.08935160065111232, + "grad_norm": 5.789185047149658, + "learning_rate": 9.200219268192409e-06, + "loss": 0.4383, + "step": 6587 + }, + { + "epoch": 0.0893651654910472, + "grad_norm": 5.644672870635986, + "learning_rate": 9.200082225572153e-06, + "loss": 0.3293, + "step": 6588 + }, + { + "epoch": 0.08937873033098209, + "grad_norm": 7.748231887817383, + "learning_rate": 9.1999451829519e-06, + "loss": 0.5423, + "step": 6589 + }, + { + "epoch": 0.08939229517091699, + "grad_norm": 5.357306003570557, + "learning_rate": 9.199808140331645e-06, + "loss": 0.4117, + "step": 6590 + }, + { + "epoch": 0.08940586001085188, + "grad_norm": 5.659414291381836, + "learning_rate": 9.19967109771139e-06, + "loss": 0.2913, + "step": 6591 + }, + { + "epoch": 0.08941942485078676, + "grad_norm": 6.218937873840332, + "learning_rate": 9.199534055091133e-06, + "loss": 0.3102, + "step": 6592 + }, + { + "epoch": 0.08943298969072165, + "grad_norm": 4.61331844329834, + "learning_rate": 9.199397012470878e-06, + "loss": 0.2681, + "step": 6593 + }, + { + "epoch": 0.08944655453065654, + "grad_norm": 7.949239253997803, + "learning_rate": 9.199259969850625e-06, + "loss": 0.5616, + "step": 6594 + }, + { + "epoch": 0.08946011937059142, + "grad_norm": 7.079935550689697, + "learning_rate": 9.199122927230369e-06, + "loss": 0.4084, + "step": 6595 + }, + { + "epoch": 0.08947368421052632, + "grad_norm": 6.0219244956970215, + "learning_rate": 9.198985884610114e-06, + "loss": 0.3452, + "step": 6596 + }, + { + "epoch": 0.08948724905046121, + "grad_norm": 7.130395889282227, + "learning_rate": 9.19884884198986e-06, + "loss": 0.4286, + "step": 6597 + }, + { + "epoch": 0.08950081389039609, + "grad_norm": 5.951810359954834, + "learning_rate": 9.198711799369604e-06, + "loss": 0.4855, + "step": 6598 + }, + { + "epoch": 0.08951437873033098, + "grad_norm": 6.121279716491699, + "learning_rate": 9.19857475674935e-06, + "loss": 0.3149, + "step": 6599 + }, + { + "epoch": 0.08952794357026587, + "grad_norm": 6.0540361404418945, + "learning_rate": 9.198437714129095e-06, + "loss": 0.4229, + "step": 6600 + }, + { + "epoch": 0.08954150841020075, + "grad_norm": 7.179762840270996, + "learning_rate": 9.19830067150884e-06, + "loss": 0.4191, + "step": 6601 + }, + { + "epoch": 0.08955507325013565, + "grad_norm": 6.577062606811523, + "learning_rate": 9.198163628888585e-06, + "loss": 0.3497, + "step": 6602 + }, + { + "epoch": 0.08956863809007054, + "grad_norm": 5.288985252380371, + "learning_rate": 9.19802658626833e-06, + "loss": 0.3853, + "step": 6603 + }, + { + "epoch": 0.08958220293000542, + "grad_norm": 8.225791931152344, + "learning_rate": 9.197889543648075e-06, + "loss": 0.3338, + "step": 6604 + }, + { + "epoch": 0.08959576776994031, + "grad_norm": 5.489688873291016, + "learning_rate": 9.19775250102782e-06, + "loss": 0.3542, + "step": 6605 + }, + { + "epoch": 0.0896093326098752, + "grad_norm": 10.086688041687012, + "learning_rate": 9.197615458407566e-06, + "loss": 0.5375, + "step": 6606 + }, + { + "epoch": 0.0896228974498101, + "grad_norm": 5.479747295379639, + "learning_rate": 9.197478415787311e-06, + "loss": 0.3836, + "step": 6607 + }, + { + "epoch": 0.08963646228974498, + "grad_norm": 7.05626106262207, + "learning_rate": 9.197341373167056e-06, + "loss": 0.4384, + "step": 6608 + }, + { + "epoch": 0.08965002712967987, + "grad_norm": 7.809081077575684, + "learning_rate": 9.197204330546801e-06, + "loss": 0.5174, + "step": 6609 + }, + { + "epoch": 0.08966359196961476, + "grad_norm": 7.775928974151611, + "learning_rate": 9.197067287926545e-06, + "loss": 0.3485, + "step": 6610 + }, + { + "epoch": 0.08967715680954964, + "grad_norm": 6.513156414031982, + "learning_rate": 9.196930245306292e-06, + "loss": 0.313, + "step": 6611 + }, + { + "epoch": 0.08969072164948454, + "grad_norm": 7.277476787567139, + "learning_rate": 9.196793202686037e-06, + "loss": 0.3529, + "step": 6612 + }, + { + "epoch": 0.08970428648941943, + "grad_norm": 6.68552303314209, + "learning_rate": 9.19665616006578e-06, + "loss": 0.3194, + "step": 6613 + }, + { + "epoch": 0.08971785132935431, + "grad_norm": 5.553966522216797, + "learning_rate": 9.196519117445526e-06, + "loss": 0.2447, + "step": 6614 + }, + { + "epoch": 0.0897314161692892, + "grad_norm": 8.969708442687988, + "learning_rate": 9.196382074825272e-06, + "loss": 0.3341, + "step": 6615 + }, + { + "epoch": 0.0897449810092241, + "grad_norm": 5.921197414398193, + "learning_rate": 9.196245032205018e-06, + "loss": 0.335, + "step": 6616 + }, + { + "epoch": 0.08975854584915897, + "grad_norm": 6.700681686401367, + "learning_rate": 9.196107989584761e-06, + "loss": 0.3861, + "step": 6617 + }, + { + "epoch": 0.08977211068909387, + "grad_norm": 6.821083068847656, + "learning_rate": 9.195970946964506e-06, + "loss": 0.3251, + "step": 6618 + }, + { + "epoch": 0.08978567552902876, + "grad_norm": 9.051922798156738, + "learning_rate": 9.195833904344251e-06, + "loss": 0.3718, + "step": 6619 + }, + { + "epoch": 0.08979924036896364, + "grad_norm": 7.490516185760498, + "learning_rate": 9.195696861723997e-06, + "loss": 0.3836, + "step": 6620 + }, + { + "epoch": 0.08981280520889853, + "grad_norm": 5.616079807281494, + "learning_rate": 9.195559819103742e-06, + "loss": 0.2593, + "step": 6621 + }, + { + "epoch": 0.08982637004883343, + "grad_norm": 5.4909443855285645, + "learning_rate": 9.195422776483487e-06, + "loss": 0.3281, + "step": 6622 + }, + { + "epoch": 0.08983993488876832, + "grad_norm": 6.207174301147461, + "learning_rate": 9.195285733863232e-06, + "loss": 0.4947, + "step": 6623 + }, + { + "epoch": 0.0898534997287032, + "grad_norm": 5.874509334564209, + "learning_rate": 9.195148691242977e-06, + "loss": 0.3157, + "step": 6624 + }, + { + "epoch": 0.08986706456863809, + "grad_norm": 5.43967866897583, + "learning_rate": 9.195011648622722e-06, + "loss": 0.3673, + "step": 6625 + }, + { + "epoch": 0.08988062940857298, + "grad_norm": 6.953378677368164, + "learning_rate": 9.194874606002468e-06, + "loss": 0.2845, + "step": 6626 + }, + { + "epoch": 0.08989419424850786, + "grad_norm": 5.368503093719482, + "learning_rate": 9.194737563382213e-06, + "loss": 0.4531, + "step": 6627 + }, + { + "epoch": 0.08990775908844276, + "grad_norm": 5.437262535095215, + "learning_rate": 9.194600520761958e-06, + "loss": 0.2641, + "step": 6628 + }, + { + "epoch": 0.08992132392837765, + "grad_norm": 7.543627738952637, + "learning_rate": 9.194463478141703e-06, + "loss": 0.51, + "step": 6629 + }, + { + "epoch": 0.08993488876831253, + "grad_norm": 7.489743232727051, + "learning_rate": 9.194326435521448e-06, + "loss": 0.3921, + "step": 6630 + }, + { + "epoch": 0.08994845360824742, + "grad_norm": 6.219020843505859, + "learning_rate": 9.194189392901194e-06, + "loss": 0.212, + "step": 6631 + }, + { + "epoch": 0.08996201844818232, + "grad_norm": 4.287890911102295, + "learning_rate": 9.194052350280939e-06, + "loss": 0.2232, + "step": 6632 + }, + { + "epoch": 0.0899755832881172, + "grad_norm": 6.739640712738037, + "learning_rate": 9.193915307660684e-06, + "loss": 0.3775, + "step": 6633 + }, + { + "epoch": 0.08998914812805209, + "grad_norm": 4.782443046569824, + "learning_rate": 9.193778265040429e-06, + "loss": 0.2666, + "step": 6634 + }, + { + "epoch": 0.09000271296798698, + "grad_norm": 4.729876518249512, + "learning_rate": 9.193641222420173e-06, + "loss": 0.3394, + "step": 6635 + }, + { + "epoch": 0.09001627780792186, + "grad_norm": 6.182608604431152, + "learning_rate": 9.193504179799918e-06, + "loss": 0.3363, + "step": 6636 + }, + { + "epoch": 0.09002984264785675, + "grad_norm": 5.1976118087768555, + "learning_rate": 9.193367137179665e-06, + "loss": 0.2893, + "step": 6637 + }, + { + "epoch": 0.09004340748779165, + "grad_norm": 6.49461030960083, + "learning_rate": 9.193230094559408e-06, + "loss": 0.413, + "step": 6638 + }, + { + "epoch": 0.09005697232772654, + "grad_norm": 5.422562599182129, + "learning_rate": 9.193093051939153e-06, + "loss": 0.4638, + "step": 6639 + }, + { + "epoch": 0.09007053716766142, + "grad_norm": 4.14987325668335, + "learning_rate": 9.192956009318898e-06, + "loss": 0.1818, + "step": 6640 + }, + { + "epoch": 0.09008410200759631, + "grad_norm": 5.72471809387207, + "learning_rate": 9.192818966698644e-06, + "loss": 0.3415, + "step": 6641 + }, + { + "epoch": 0.0900976668475312, + "grad_norm": 5.782142162322998, + "learning_rate": 9.192681924078389e-06, + "loss": 0.2388, + "step": 6642 + }, + { + "epoch": 0.09011123168746608, + "grad_norm": 4.609967231750488, + "learning_rate": 9.192544881458134e-06, + "loss": 0.2723, + "step": 6643 + }, + { + "epoch": 0.09012479652740098, + "grad_norm": 5.305135726928711, + "learning_rate": 9.19240783883788e-06, + "loss": 0.2486, + "step": 6644 + }, + { + "epoch": 0.09013836136733587, + "grad_norm": 5.36251974105835, + "learning_rate": 9.192270796217624e-06, + "loss": 0.2489, + "step": 6645 + }, + { + "epoch": 0.09015192620727075, + "grad_norm": 3.889577627182007, + "learning_rate": 9.19213375359737e-06, + "loss": 0.2636, + "step": 6646 + }, + { + "epoch": 0.09016549104720564, + "grad_norm": 5.825926780700684, + "learning_rate": 9.191996710977115e-06, + "loss": 0.3873, + "step": 6647 + }, + { + "epoch": 0.09017905588714054, + "grad_norm": 4.97946834564209, + "learning_rate": 9.19185966835686e-06, + "loss": 0.2492, + "step": 6648 + }, + { + "epoch": 0.09019262072707541, + "grad_norm": 3.4398303031921387, + "learning_rate": 9.191722625736605e-06, + "loss": 0.1319, + "step": 6649 + }, + { + "epoch": 0.09020618556701031, + "grad_norm": 6.11957311630249, + "learning_rate": 9.19158558311635e-06, + "loss": 0.2832, + "step": 6650 + }, + { + "epoch": 0.0902197504069452, + "grad_norm": 3.9382517337799072, + "learning_rate": 9.191448540496095e-06, + "loss": 0.2195, + "step": 6651 + }, + { + "epoch": 0.09023331524688008, + "grad_norm": 5.414021968841553, + "learning_rate": 9.19131149787584e-06, + "loss": 0.2337, + "step": 6652 + }, + { + "epoch": 0.09024688008681497, + "grad_norm": 6.332797050476074, + "learning_rate": 9.191174455255584e-06, + "loss": 0.4152, + "step": 6653 + }, + { + "epoch": 0.09026044492674987, + "grad_norm": 5.486518859863281, + "learning_rate": 9.191037412635331e-06, + "loss": 0.1879, + "step": 6654 + }, + { + "epoch": 0.09027400976668476, + "grad_norm": 4.760217189788818, + "learning_rate": 9.190900370015076e-06, + "loss": 0.2908, + "step": 6655 + }, + { + "epoch": 0.09028757460661964, + "grad_norm": 4.51900053024292, + "learning_rate": 9.19076332739482e-06, + "loss": 0.2656, + "step": 6656 + }, + { + "epoch": 0.09030113944655453, + "grad_norm": 4.256173610687256, + "learning_rate": 9.190626284774565e-06, + "loss": 0.264, + "step": 6657 + }, + { + "epoch": 0.09031470428648943, + "grad_norm": 6.5140275955200195, + "learning_rate": 9.190489242154312e-06, + "loss": 0.2888, + "step": 6658 + }, + { + "epoch": 0.0903282691264243, + "grad_norm": 4.295742034912109, + "learning_rate": 9.190352199534057e-06, + "loss": 0.2276, + "step": 6659 + }, + { + "epoch": 0.0903418339663592, + "grad_norm": 3.2832164764404297, + "learning_rate": 9.1902151569138e-06, + "loss": 0.1792, + "step": 6660 + }, + { + "epoch": 0.09035539880629409, + "grad_norm": 4.9751787185668945, + "learning_rate": 9.190078114293546e-06, + "loss": 0.3077, + "step": 6661 + }, + { + "epoch": 0.09036896364622897, + "grad_norm": 4.195662498474121, + "learning_rate": 9.18994107167329e-06, + "loss": 0.2252, + "step": 6662 + }, + { + "epoch": 0.09038252848616386, + "grad_norm": 4.497807502746582, + "learning_rate": 9.189804029053036e-06, + "loss": 0.2317, + "step": 6663 + }, + { + "epoch": 0.09039609332609876, + "grad_norm": 4.431402206420898, + "learning_rate": 9.189666986432781e-06, + "loss": 0.2145, + "step": 6664 + }, + { + "epoch": 0.09040965816603364, + "grad_norm": 3.9221410751342773, + "learning_rate": 9.189529943812526e-06, + "loss": 0.1979, + "step": 6665 + }, + { + "epoch": 0.09042322300596853, + "grad_norm": 4.940886497497559, + "learning_rate": 9.189392901192271e-06, + "loss": 0.2511, + "step": 6666 + }, + { + "epoch": 0.09043678784590342, + "grad_norm": 5.241168975830078, + "learning_rate": 9.189255858572017e-06, + "loss": 0.3224, + "step": 6667 + }, + { + "epoch": 0.0904503526858383, + "grad_norm": 4.011295318603516, + "learning_rate": 9.189118815951762e-06, + "loss": 0.2289, + "step": 6668 + }, + { + "epoch": 0.0904639175257732, + "grad_norm": 4.568144798278809, + "learning_rate": 9.188981773331507e-06, + "loss": 0.1967, + "step": 6669 + }, + { + "epoch": 0.09047748236570809, + "grad_norm": 3.4881293773651123, + "learning_rate": 9.188844730711252e-06, + "loss": 0.1831, + "step": 6670 + }, + { + "epoch": 0.09049104720564298, + "grad_norm": 4.606497287750244, + "learning_rate": 9.188707688090997e-06, + "loss": 0.196, + "step": 6671 + }, + { + "epoch": 0.09050461204557786, + "grad_norm": 4.333430767059326, + "learning_rate": 9.188570645470743e-06, + "loss": 0.3076, + "step": 6672 + }, + { + "epoch": 0.09051817688551275, + "grad_norm": 4.806063652038574, + "learning_rate": 9.188433602850488e-06, + "loss": 0.2858, + "step": 6673 + }, + { + "epoch": 0.09053174172544765, + "grad_norm": 4.858423233032227, + "learning_rate": 9.188296560230233e-06, + "loss": 0.321, + "step": 6674 + }, + { + "epoch": 0.09054530656538252, + "grad_norm": 4.475897312164307, + "learning_rate": 9.188159517609976e-06, + "loss": 0.2666, + "step": 6675 + }, + { + "epoch": 0.09055887140531742, + "grad_norm": 4.082827091217041, + "learning_rate": 9.188022474989723e-06, + "loss": 0.2375, + "step": 6676 + }, + { + "epoch": 0.09057243624525231, + "grad_norm": 5.9085235595703125, + "learning_rate": 9.187885432369468e-06, + "loss": 0.2426, + "step": 6677 + }, + { + "epoch": 0.09058600108518719, + "grad_norm": 4.550703048706055, + "learning_rate": 9.187748389749212e-06, + "loss": 0.2743, + "step": 6678 + }, + { + "epoch": 0.09059956592512208, + "grad_norm": 5.434970855712891, + "learning_rate": 9.187611347128957e-06, + "loss": 0.2744, + "step": 6679 + }, + { + "epoch": 0.09061313076505698, + "grad_norm": 6.112697601318359, + "learning_rate": 9.187474304508704e-06, + "loss": 0.3314, + "step": 6680 + }, + { + "epoch": 0.09062669560499186, + "grad_norm": 4.8938093185424805, + "learning_rate": 9.187337261888447e-06, + "loss": 0.3043, + "step": 6681 + }, + { + "epoch": 0.09064026044492675, + "grad_norm": 4.715324878692627, + "learning_rate": 9.187200219268193e-06, + "loss": 0.3253, + "step": 6682 + }, + { + "epoch": 0.09065382528486164, + "grad_norm": 4.104722023010254, + "learning_rate": 9.187063176647938e-06, + "loss": 0.1884, + "step": 6683 + }, + { + "epoch": 0.09066739012479652, + "grad_norm": 5.138367176055908, + "learning_rate": 9.186926134027685e-06, + "loss": 0.2534, + "step": 6684 + }, + { + "epoch": 0.09068095496473141, + "grad_norm": 4.478262424468994, + "learning_rate": 9.186789091407428e-06, + "loss": 0.233, + "step": 6685 + }, + { + "epoch": 0.09069451980466631, + "grad_norm": 5.584125995635986, + "learning_rate": 9.186652048787173e-06, + "loss": 0.3811, + "step": 6686 + }, + { + "epoch": 0.0907080846446012, + "grad_norm": 4.680305480957031, + "learning_rate": 9.186515006166918e-06, + "loss": 0.2844, + "step": 6687 + }, + { + "epoch": 0.09072164948453608, + "grad_norm": 4.270350933074951, + "learning_rate": 9.186377963546664e-06, + "loss": 0.2473, + "step": 6688 + }, + { + "epoch": 0.09073521432447097, + "grad_norm": 4.412230968475342, + "learning_rate": 9.186240920926409e-06, + "loss": 0.2525, + "step": 6689 + }, + { + "epoch": 0.09074877916440587, + "grad_norm": 8.440362930297852, + "learning_rate": 9.186103878306154e-06, + "loss": 0.4961, + "step": 6690 + }, + { + "epoch": 0.09076234400434074, + "grad_norm": 6.139681816101074, + "learning_rate": 9.1859668356859e-06, + "loss": 0.3885, + "step": 6691 + }, + { + "epoch": 0.09077590884427564, + "grad_norm": 5.739259719848633, + "learning_rate": 9.185829793065644e-06, + "loss": 0.388, + "step": 6692 + }, + { + "epoch": 0.09078947368421053, + "grad_norm": 4.771003723144531, + "learning_rate": 9.18569275044539e-06, + "loss": 0.2196, + "step": 6693 + }, + { + "epoch": 0.09080303852414541, + "grad_norm": 5.155994415283203, + "learning_rate": 9.185555707825135e-06, + "loss": 0.3052, + "step": 6694 + }, + { + "epoch": 0.0908166033640803, + "grad_norm": 7.388957977294922, + "learning_rate": 9.18541866520488e-06, + "loss": 0.5453, + "step": 6695 + }, + { + "epoch": 0.0908301682040152, + "grad_norm": 6.9395928382873535, + "learning_rate": 9.185281622584623e-06, + "loss": 0.3891, + "step": 6696 + }, + { + "epoch": 0.09084373304395008, + "grad_norm": 4.493240833282471, + "learning_rate": 9.18514457996437e-06, + "loss": 0.2625, + "step": 6697 + }, + { + "epoch": 0.09085729788388497, + "grad_norm": 4.7347331047058105, + "learning_rate": 9.185007537344115e-06, + "loss": 0.3222, + "step": 6698 + }, + { + "epoch": 0.09087086272381986, + "grad_norm": 6.198455810546875, + "learning_rate": 9.18487049472386e-06, + "loss": 0.269, + "step": 6699 + }, + { + "epoch": 0.09088442756375474, + "grad_norm": 6.103002548217773, + "learning_rate": 9.184733452103604e-06, + "loss": 0.3221, + "step": 6700 + }, + { + "epoch": 0.09089799240368963, + "grad_norm": 5.563783168792725, + "learning_rate": 9.18459640948335e-06, + "loss": 0.3616, + "step": 6701 + }, + { + "epoch": 0.09091155724362453, + "grad_norm": 8.453128814697266, + "learning_rate": 9.184459366863096e-06, + "loss": 0.548, + "step": 6702 + }, + { + "epoch": 0.09092512208355942, + "grad_norm": 6.275750160217285, + "learning_rate": 9.18432232424284e-06, + "loss": 0.4372, + "step": 6703 + }, + { + "epoch": 0.0909386869234943, + "grad_norm": 5.62317419052124, + "learning_rate": 9.184185281622585e-06, + "loss": 0.2734, + "step": 6704 + }, + { + "epoch": 0.09095225176342919, + "grad_norm": 6.137239933013916, + "learning_rate": 9.18404823900233e-06, + "loss": 0.4953, + "step": 6705 + }, + { + "epoch": 0.09096581660336409, + "grad_norm": 4.584872722625732, + "learning_rate": 9.183911196382075e-06, + "loss": 0.3386, + "step": 6706 + }, + { + "epoch": 0.09097938144329897, + "grad_norm": 3.472832441329956, + "learning_rate": 9.18377415376182e-06, + "loss": 0.1988, + "step": 6707 + }, + { + "epoch": 0.09099294628323386, + "grad_norm": 5.534965991973877, + "learning_rate": 9.183637111141566e-06, + "loss": 0.3871, + "step": 6708 + }, + { + "epoch": 0.09100651112316875, + "grad_norm": 5.001688480377197, + "learning_rate": 9.18350006852131e-06, + "loss": 0.336, + "step": 6709 + }, + { + "epoch": 0.09102007596310363, + "grad_norm": 4.765499114990234, + "learning_rate": 9.183363025901056e-06, + "loss": 0.2713, + "step": 6710 + }, + { + "epoch": 0.09103364080303852, + "grad_norm": 4.206322193145752, + "learning_rate": 9.183225983280801e-06, + "loss": 0.3585, + "step": 6711 + }, + { + "epoch": 0.09104720564297342, + "grad_norm": 5.591857433319092, + "learning_rate": 9.183088940660546e-06, + "loss": 0.3106, + "step": 6712 + }, + { + "epoch": 0.0910607704829083, + "grad_norm": 5.0871429443359375, + "learning_rate": 9.182951898040291e-06, + "loss": 0.3781, + "step": 6713 + }, + { + "epoch": 0.09107433532284319, + "grad_norm": 3.6977598667144775, + "learning_rate": 9.182814855420037e-06, + "loss": 0.2994, + "step": 6714 + }, + { + "epoch": 0.09108790016277808, + "grad_norm": 7.845678806304932, + "learning_rate": 9.182677812799782e-06, + "loss": 0.3482, + "step": 6715 + }, + { + "epoch": 0.09110146500271296, + "grad_norm": 4.26901912689209, + "learning_rate": 9.182540770179527e-06, + "loss": 0.241, + "step": 6716 + }, + { + "epoch": 0.09111502984264785, + "grad_norm": 6.197447776794434, + "learning_rate": 9.182403727559272e-06, + "loss": 0.2939, + "step": 6717 + }, + { + "epoch": 0.09112859468258275, + "grad_norm": 4.808244705200195, + "learning_rate": 9.182266684939016e-06, + "loss": 0.259, + "step": 6718 + }, + { + "epoch": 0.09114215952251764, + "grad_norm": 4.827118396759033, + "learning_rate": 9.182129642318763e-06, + "loss": 0.2705, + "step": 6719 + }, + { + "epoch": 0.09115572436245252, + "grad_norm": 4.969388008117676, + "learning_rate": 9.181992599698508e-06, + "loss": 0.3089, + "step": 6720 + }, + { + "epoch": 0.09116928920238741, + "grad_norm": 7.503845691680908, + "learning_rate": 9.181855557078251e-06, + "loss": 0.4641, + "step": 6721 + }, + { + "epoch": 0.0911828540423223, + "grad_norm": 8.125750541687012, + "learning_rate": 9.181718514457996e-06, + "loss": 0.3738, + "step": 6722 + }, + { + "epoch": 0.09119641888225719, + "grad_norm": 4.031009674072266, + "learning_rate": 9.181581471837743e-06, + "loss": 0.1797, + "step": 6723 + }, + { + "epoch": 0.09120998372219208, + "grad_norm": 4.105594635009766, + "learning_rate": 9.181444429217488e-06, + "loss": 0.219, + "step": 6724 + }, + { + "epoch": 0.09122354856212697, + "grad_norm": 5.3985276222229, + "learning_rate": 9.181307386597232e-06, + "loss": 0.2058, + "step": 6725 + }, + { + "epoch": 0.09123711340206185, + "grad_norm": 3.796344518661499, + "learning_rate": 9.181170343976977e-06, + "loss": 0.133, + "step": 6726 + }, + { + "epoch": 0.09125067824199674, + "grad_norm": 4.979898452758789, + "learning_rate": 9.181033301356724e-06, + "loss": 0.3765, + "step": 6727 + }, + { + "epoch": 0.09126424308193164, + "grad_norm": 8.432291984558105, + "learning_rate": 9.180896258736467e-06, + "loss": 0.3611, + "step": 6728 + }, + { + "epoch": 0.09127780792186652, + "grad_norm": 6.239559650421143, + "learning_rate": 9.180759216116213e-06, + "loss": 0.2648, + "step": 6729 + }, + { + "epoch": 0.09129137276180141, + "grad_norm": 5.098591327667236, + "learning_rate": 9.180622173495958e-06, + "loss": 0.2765, + "step": 6730 + }, + { + "epoch": 0.0913049376017363, + "grad_norm": 5.5804877281188965, + "learning_rate": 9.180485130875703e-06, + "loss": 0.3488, + "step": 6731 + }, + { + "epoch": 0.09131850244167118, + "grad_norm": 3.0934929847717285, + "learning_rate": 9.180348088255448e-06, + "loss": 0.1701, + "step": 6732 + }, + { + "epoch": 0.09133206728160607, + "grad_norm": 5.025936603546143, + "learning_rate": 9.180211045635193e-06, + "loss": 0.3475, + "step": 6733 + }, + { + "epoch": 0.09134563212154097, + "grad_norm": 4.68676233291626, + "learning_rate": 9.180074003014939e-06, + "loss": 0.2897, + "step": 6734 + }, + { + "epoch": 0.09135919696147586, + "grad_norm": 5.646849632263184, + "learning_rate": 9.179936960394684e-06, + "loss": 0.357, + "step": 6735 + }, + { + "epoch": 0.09137276180141074, + "grad_norm": 5.1642560958862305, + "learning_rate": 9.179799917774429e-06, + "loss": 0.3382, + "step": 6736 + }, + { + "epoch": 0.09138632664134563, + "grad_norm": 6.8607177734375, + "learning_rate": 9.179662875154174e-06, + "loss": 0.3976, + "step": 6737 + }, + { + "epoch": 0.09139989148128053, + "grad_norm": 6.149094581604004, + "learning_rate": 9.17952583253392e-06, + "loss": 0.365, + "step": 6738 + }, + { + "epoch": 0.0914134563212154, + "grad_norm": 8.027924537658691, + "learning_rate": 9.179388789913663e-06, + "loss": 0.4587, + "step": 6739 + }, + { + "epoch": 0.0914270211611503, + "grad_norm": 5.100048542022705, + "learning_rate": 9.17925174729341e-06, + "loss": 0.3744, + "step": 6740 + }, + { + "epoch": 0.09144058600108519, + "grad_norm": 4.882765293121338, + "learning_rate": 9.179114704673155e-06, + "loss": 0.3674, + "step": 6741 + }, + { + "epoch": 0.09145415084102007, + "grad_norm": 7.045836448669434, + "learning_rate": 9.1789776620529e-06, + "loss": 0.4231, + "step": 6742 + }, + { + "epoch": 0.09146771568095496, + "grad_norm": 4.628352165222168, + "learning_rate": 9.178840619432643e-06, + "loss": 0.3449, + "step": 6743 + }, + { + "epoch": 0.09148128052088986, + "grad_norm": 5.411994934082031, + "learning_rate": 9.178703576812389e-06, + "loss": 0.4776, + "step": 6744 + }, + { + "epoch": 0.09149484536082474, + "grad_norm": 4.550971031188965, + "learning_rate": 9.178566534192135e-06, + "loss": 0.1952, + "step": 6745 + }, + { + "epoch": 0.09150841020075963, + "grad_norm": 4.175539016723633, + "learning_rate": 9.178429491571879e-06, + "loss": 0.1955, + "step": 6746 + }, + { + "epoch": 0.09152197504069452, + "grad_norm": 6.706912517547607, + "learning_rate": 9.178292448951624e-06, + "loss": 0.3604, + "step": 6747 + }, + { + "epoch": 0.0915355398806294, + "grad_norm": 4.210489273071289, + "learning_rate": 9.17815540633137e-06, + "loss": 0.2213, + "step": 6748 + }, + { + "epoch": 0.0915491047205643, + "grad_norm": 5.2217583656311035, + "learning_rate": 9.178018363711115e-06, + "loss": 0.3848, + "step": 6749 + }, + { + "epoch": 0.09156266956049919, + "grad_norm": 4.583828449249268, + "learning_rate": 9.17788132109086e-06, + "loss": 0.3193, + "step": 6750 + }, + { + "epoch": 0.09157623440043408, + "grad_norm": 4.482635021209717, + "learning_rate": 9.177744278470605e-06, + "loss": 0.3961, + "step": 6751 + }, + { + "epoch": 0.09158979924036896, + "grad_norm": 6.525916576385498, + "learning_rate": 9.17760723585035e-06, + "loss": 0.4937, + "step": 6752 + }, + { + "epoch": 0.09160336408030385, + "grad_norm": 6.397602558135986, + "learning_rate": 9.177470193230095e-06, + "loss": 0.352, + "step": 6753 + }, + { + "epoch": 0.09161692892023875, + "grad_norm": 4.912678241729736, + "learning_rate": 9.17733315060984e-06, + "loss": 0.311, + "step": 6754 + }, + { + "epoch": 0.09163049376017363, + "grad_norm": 5.119250774383545, + "learning_rate": 9.177196107989586e-06, + "loss": 0.392, + "step": 6755 + }, + { + "epoch": 0.09164405860010852, + "grad_norm": 5.633330821990967, + "learning_rate": 9.17705906536933e-06, + "loss": 0.2266, + "step": 6756 + }, + { + "epoch": 0.09165762344004341, + "grad_norm": 6.078437328338623, + "learning_rate": 9.176922022749076e-06, + "loss": 0.4179, + "step": 6757 + }, + { + "epoch": 0.09167118827997829, + "grad_norm": 7.53797721862793, + "learning_rate": 9.176784980128821e-06, + "loss": 0.5831, + "step": 6758 + }, + { + "epoch": 0.09168475311991318, + "grad_norm": 5.71765661239624, + "learning_rate": 9.176647937508566e-06, + "loss": 0.3012, + "step": 6759 + }, + { + "epoch": 0.09169831795984808, + "grad_norm": 4.7395830154418945, + "learning_rate": 9.176510894888311e-06, + "loss": 0.3091, + "step": 6760 + }, + { + "epoch": 0.09171188279978296, + "grad_norm": 6.571170330047607, + "learning_rate": 9.176373852268055e-06, + "loss": 0.3784, + "step": 6761 + }, + { + "epoch": 0.09172544763971785, + "grad_norm": 9.585031509399414, + "learning_rate": 9.176236809647802e-06, + "loss": 0.5182, + "step": 6762 + }, + { + "epoch": 0.09173901247965274, + "grad_norm": 4.436825752258301, + "learning_rate": 9.176099767027547e-06, + "loss": 0.2362, + "step": 6763 + }, + { + "epoch": 0.09175257731958762, + "grad_norm": 5.411114692687988, + "learning_rate": 9.17596272440729e-06, + "loss": 0.3209, + "step": 6764 + }, + { + "epoch": 0.09176614215952252, + "grad_norm": 7.007269859313965, + "learning_rate": 9.175825681787036e-06, + "loss": 0.5216, + "step": 6765 + }, + { + "epoch": 0.09177970699945741, + "grad_norm": 5.4791154861450195, + "learning_rate": 9.175688639166783e-06, + "loss": 0.3457, + "step": 6766 + }, + { + "epoch": 0.0917932718393923, + "grad_norm": 6.231178283691406, + "learning_rate": 9.175551596546528e-06, + "loss": 0.3318, + "step": 6767 + }, + { + "epoch": 0.09180683667932718, + "grad_norm": 7.253838062286377, + "learning_rate": 9.175414553926271e-06, + "loss": 0.4161, + "step": 6768 + }, + { + "epoch": 0.09182040151926207, + "grad_norm": 6.551013469696045, + "learning_rate": 9.175277511306016e-06, + "loss": 0.3841, + "step": 6769 + }, + { + "epoch": 0.09183396635919697, + "grad_norm": 8.448932647705078, + "learning_rate": 9.175140468685762e-06, + "loss": 0.3752, + "step": 6770 + }, + { + "epoch": 0.09184753119913185, + "grad_norm": 8.33633804321289, + "learning_rate": 9.175003426065507e-06, + "loss": 0.452, + "step": 6771 + }, + { + "epoch": 0.09186109603906674, + "grad_norm": 7.129875183105469, + "learning_rate": 9.174866383445252e-06, + "loss": 0.4676, + "step": 6772 + }, + { + "epoch": 0.09187466087900163, + "grad_norm": 7.703853607177734, + "learning_rate": 9.174729340824997e-06, + "loss": 0.5264, + "step": 6773 + }, + { + "epoch": 0.09188822571893651, + "grad_norm": 7.406772613525391, + "learning_rate": 9.174592298204742e-06, + "loss": 0.4702, + "step": 6774 + }, + { + "epoch": 0.0919017905588714, + "grad_norm": 6.876798152923584, + "learning_rate": 9.174455255584487e-06, + "loss": 0.4158, + "step": 6775 + }, + { + "epoch": 0.0919153553988063, + "grad_norm": 6.603085041046143, + "learning_rate": 9.174318212964233e-06, + "loss": 0.3491, + "step": 6776 + }, + { + "epoch": 0.09192892023874118, + "grad_norm": 7.642090320587158, + "learning_rate": 9.174181170343978e-06, + "loss": 0.5037, + "step": 6777 + }, + { + "epoch": 0.09194248507867607, + "grad_norm": 8.186841011047363, + "learning_rate": 9.174044127723723e-06, + "loss": 0.5821, + "step": 6778 + }, + { + "epoch": 0.09195604991861096, + "grad_norm": 8.356229782104492, + "learning_rate": 9.173907085103468e-06, + "loss": 0.4634, + "step": 6779 + }, + { + "epoch": 0.09196961475854584, + "grad_norm": 8.084138870239258, + "learning_rate": 9.173770042483213e-06, + "loss": 0.4217, + "step": 6780 + }, + { + "epoch": 0.09198317959848074, + "grad_norm": 6.783618927001953, + "learning_rate": 9.173632999862959e-06, + "loss": 0.3985, + "step": 6781 + }, + { + "epoch": 0.09199674443841563, + "grad_norm": 6.674602031707764, + "learning_rate": 9.173495957242704e-06, + "loss": 0.3558, + "step": 6782 + }, + { + "epoch": 0.09201030927835052, + "grad_norm": 6.116631507873535, + "learning_rate": 9.173358914622449e-06, + "loss": 0.4686, + "step": 6783 + }, + { + "epoch": 0.0920238741182854, + "grad_norm": 6.704730987548828, + "learning_rate": 9.173221872002194e-06, + "loss": 0.3502, + "step": 6784 + }, + { + "epoch": 0.0920374389582203, + "grad_norm": 7.092480182647705, + "learning_rate": 9.17308482938194e-06, + "loss": 0.4984, + "step": 6785 + }, + { + "epoch": 0.09205100379815519, + "grad_norm": 4.4792914390563965, + "learning_rate": 9.172947786761683e-06, + "loss": 0.2743, + "step": 6786 + }, + { + "epoch": 0.09206456863809007, + "grad_norm": 8.805089950561523, + "learning_rate": 9.172810744141428e-06, + "loss": 0.5396, + "step": 6787 + }, + { + "epoch": 0.09207813347802496, + "grad_norm": 8.390358924865723, + "learning_rate": 9.172673701521175e-06, + "loss": 0.372, + "step": 6788 + }, + { + "epoch": 0.09209169831795985, + "grad_norm": 5.992771148681641, + "learning_rate": 9.172536658900918e-06, + "loss": 0.3, + "step": 6789 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 8.038378715515137, + "learning_rate": 9.172399616280663e-06, + "loss": 0.517, + "step": 6790 + }, + { + "epoch": 0.09211882799782962, + "grad_norm": 6.522902011871338, + "learning_rate": 9.172262573660409e-06, + "loss": 0.3688, + "step": 6791 + }, + { + "epoch": 0.09213239283776452, + "grad_norm": 9.657166481018066, + "learning_rate": 9.172125531040156e-06, + "loss": 0.6019, + "step": 6792 + }, + { + "epoch": 0.0921459576776994, + "grad_norm": 6.100863933563232, + "learning_rate": 9.171988488419899e-06, + "loss": 0.4397, + "step": 6793 + }, + { + "epoch": 0.09215952251763429, + "grad_norm": 6.244131565093994, + "learning_rate": 9.171851445799644e-06, + "loss": 0.3777, + "step": 6794 + }, + { + "epoch": 0.09217308735756918, + "grad_norm": 5.4330034255981445, + "learning_rate": 9.17171440317939e-06, + "loss": 0.3156, + "step": 6795 + }, + { + "epoch": 0.09218665219750406, + "grad_norm": 7.502748966217041, + "learning_rate": 9.171577360559135e-06, + "loss": 0.544, + "step": 6796 + }, + { + "epoch": 0.09220021703743896, + "grad_norm": 7.627804279327393, + "learning_rate": 9.17144031793888e-06, + "loss": 0.5548, + "step": 6797 + }, + { + "epoch": 0.09221378187737385, + "grad_norm": 5.768417835235596, + "learning_rate": 9.171303275318625e-06, + "loss": 0.3931, + "step": 6798 + }, + { + "epoch": 0.09222734671730874, + "grad_norm": 7.4378509521484375, + "learning_rate": 9.17116623269837e-06, + "loss": 0.475, + "step": 6799 + }, + { + "epoch": 0.09224091155724362, + "grad_norm": 6.224193572998047, + "learning_rate": 9.171029190078115e-06, + "loss": 0.3991, + "step": 6800 + }, + { + "epoch": 0.09225447639717851, + "grad_norm": 7.441176891326904, + "learning_rate": 9.17089214745786e-06, + "loss": 0.3735, + "step": 6801 + }, + { + "epoch": 0.09226804123711341, + "grad_norm": 8.220707893371582, + "learning_rate": 9.170755104837606e-06, + "loss": 0.4693, + "step": 6802 + }, + { + "epoch": 0.09228160607704829, + "grad_norm": 4.987671375274658, + "learning_rate": 9.17061806221735e-06, + "loss": 0.4609, + "step": 6803 + }, + { + "epoch": 0.09229517091698318, + "grad_norm": 7.476708889007568, + "learning_rate": 9.170481019597094e-06, + "loss": 0.4838, + "step": 6804 + }, + { + "epoch": 0.09230873575691807, + "grad_norm": 8.56132984161377, + "learning_rate": 9.170343976976841e-06, + "loss": 0.5688, + "step": 6805 + }, + { + "epoch": 0.09232230059685295, + "grad_norm": 9.521162033081055, + "learning_rate": 9.170206934356586e-06, + "loss": 0.6955, + "step": 6806 + }, + { + "epoch": 0.09233586543678785, + "grad_norm": 7.0093464851379395, + "learning_rate": 9.170069891736331e-06, + "loss": 0.4071, + "step": 6807 + }, + { + "epoch": 0.09234943027672274, + "grad_norm": 7.346763610839844, + "learning_rate": 9.169932849116075e-06, + "loss": 0.4915, + "step": 6808 + }, + { + "epoch": 0.09236299511665762, + "grad_norm": 6.831222057342529, + "learning_rate": 9.169795806495822e-06, + "loss": 0.4148, + "step": 6809 + }, + { + "epoch": 0.09237655995659251, + "grad_norm": 7.637134552001953, + "learning_rate": 9.169658763875567e-06, + "loss": 0.5015, + "step": 6810 + }, + { + "epoch": 0.0923901247965274, + "grad_norm": 8.187804222106934, + "learning_rate": 9.16952172125531e-06, + "loss": 0.3932, + "step": 6811 + }, + { + "epoch": 0.09240368963646228, + "grad_norm": 5.186999320983887, + "learning_rate": 9.169384678635056e-06, + "loss": 0.527, + "step": 6812 + }, + { + "epoch": 0.09241725447639718, + "grad_norm": 6.0595011711120605, + "learning_rate": 9.169247636014801e-06, + "loss": 0.511, + "step": 6813 + }, + { + "epoch": 0.09243081931633207, + "grad_norm": 5.7800164222717285, + "learning_rate": 9.169110593394546e-06, + "loss": 0.4759, + "step": 6814 + }, + { + "epoch": 0.09244438415626696, + "grad_norm": 6.1715593338012695, + "learning_rate": 9.168973550774291e-06, + "loss": 0.3191, + "step": 6815 + }, + { + "epoch": 0.09245794899620184, + "grad_norm": 6.853826999664307, + "learning_rate": 9.168836508154036e-06, + "loss": 0.3587, + "step": 6816 + }, + { + "epoch": 0.09247151383613673, + "grad_norm": 6.926836013793945, + "learning_rate": 9.168699465533782e-06, + "loss": 0.3914, + "step": 6817 + }, + { + "epoch": 0.09248507867607163, + "grad_norm": 10.043594360351562, + "learning_rate": 9.168562422913527e-06, + "loss": 0.5131, + "step": 6818 + }, + { + "epoch": 0.0924986435160065, + "grad_norm": 5.947290420532227, + "learning_rate": 9.168425380293272e-06, + "loss": 0.3998, + "step": 6819 + }, + { + "epoch": 0.0925122083559414, + "grad_norm": 6.905599594116211, + "learning_rate": 9.168288337673017e-06, + "loss": 0.4772, + "step": 6820 + }, + { + "epoch": 0.09252577319587629, + "grad_norm": 8.49853515625, + "learning_rate": 9.168151295052762e-06, + "loss": 0.5941, + "step": 6821 + }, + { + "epoch": 0.09253933803581117, + "grad_norm": 7.838303565979004, + "learning_rate": 9.168014252432507e-06, + "loss": 0.4662, + "step": 6822 + }, + { + "epoch": 0.09255290287574607, + "grad_norm": 7.24881649017334, + "learning_rate": 9.167877209812253e-06, + "loss": 0.4947, + "step": 6823 + }, + { + "epoch": 0.09256646771568096, + "grad_norm": 6.450374603271484, + "learning_rate": 9.167740167191998e-06, + "loss": 0.4611, + "step": 6824 + }, + { + "epoch": 0.09258003255561584, + "grad_norm": 7.9566755294799805, + "learning_rate": 9.167603124571743e-06, + "loss": 0.5387, + "step": 6825 + }, + { + "epoch": 0.09259359739555073, + "grad_norm": 6.587372303009033, + "learning_rate": 9.167466081951487e-06, + "loss": 0.411, + "step": 6826 + }, + { + "epoch": 0.09260716223548562, + "grad_norm": 5.616988658905029, + "learning_rate": 9.167329039331233e-06, + "loss": 0.4282, + "step": 6827 + }, + { + "epoch": 0.09262072707542052, + "grad_norm": 7.151764392852783, + "learning_rate": 9.167191996710979e-06, + "loss": 0.4796, + "step": 6828 + }, + { + "epoch": 0.0926342919153554, + "grad_norm": 5.597073554992676, + "learning_rate": 9.167054954090722e-06, + "loss": 0.3801, + "step": 6829 + }, + { + "epoch": 0.09264785675529029, + "grad_norm": 7.279880046844482, + "learning_rate": 9.166917911470467e-06, + "loss": 0.4849, + "step": 6830 + }, + { + "epoch": 0.09266142159522518, + "grad_norm": 8.467265129089355, + "learning_rate": 9.166780868850214e-06, + "loss": 0.5506, + "step": 6831 + }, + { + "epoch": 0.09267498643516006, + "grad_norm": 5.5722222328186035, + "learning_rate": 9.166643826229958e-06, + "loss": 0.3834, + "step": 6832 + }, + { + "epoch": 0.09268855127509495, + "grad_norm": 6.748363494873047, + "learning_rate": 9.166506783609703e-06, + "loss": 0.5296, + "step": 6833 + }, + { + "epoch": 0.09270211611502985, + "grad_norm": 5.948415756225586, + "learning_rate": 9.166369740989448e-06, + "loss": 0.4345, + "step": 6834 + }, + { + "epoch": 0.09271568095496473, + "grad_norm": 8.075774192810059, + "learning_rate": 9.166232698369195e-06, + "loss": 0.4677, + "step": 6835 + }, + { + "epoch": 0.09272924579489962, + "grad_norm": 7.083966255187988, + "learning_rate": 9.166095655748938e-06, + "loss": 0.3969, + "step": 6836 + }, + { + "epoch": 0.09274281063483451, + "grad_norm": 12.493443489074707, + "learning_rate": 9.165958613128683e-06, + "loss": 0.4868, + "step": 6837 + }, + { + "epoch": 0.09275637547476939, + "grad_norm": 8.54082202911377, + "learning_rate": 9.165821570508429e-06, + "loss": 0.5394, + "step": 6838 + }, + { + "epoch": 0.09276994031470429, + "grad_norm": 9.167651176452637, + "learning_rate": 9.165684527888174e-06, + "loss": 0.6204, + "step": 6839 + }, + { + "epoch": 0.09278350515463918, + "grad_norm": 7.0425238609313965, + "learning_rate": 9.165547485267919e-06, + "loss": 0.665, + "step": 6840 + }, + { + "epoch": 0.09279706999457406, + "grad_norm": 7.840307712554932, + "learning_rate": 9.165410442647664e-06, + "loss": 0.4022, + "step": 6841 + }, + { + "epoch": 0.09281063483450895, + "grad_norm": 7.373048305511475, + "learning_rate": 9.16527340002741e-06, + "loss": 0.4509, + "step": 6842 + }, + { + "epoch": 0.09282419967444384, + "grad_norm": 8.078914642333984, + "learning_rate": 9.165136357407155e-06, + "loss": 0.5234, + "step": 6843 + }, + { + "epoch": 0.09283776451437874, + "grad_norm": 6.912423610687256, + "learning_rate": 9.1649993147869e-06, + "loss": 0.4675, + "step": 6844 + }, + { + "epoch": 0.09285132935431362, + "grad_norm": 7.53848934173584, + "learning_rate": 9.164862272166645e-06, + "loss": 0.648, + "step": 6845 + }, + { + "epoch": 0.09286489419424851, + "grad_norm": 8.397604942321777, + "learning_rate": 9.16472522954639e-06, + "loss": 0.3082, + "step": 6846 + }, + { + "epoch": 0.0928784590341834, + "grad_norm": 8.647037506103516, + "learning_rate": 9.164588186926134e-06, + "loss": 0.5764, + "step": 6847 + }, + { + "epoch": 0.09289202387411828, + "grad_norm": 9.911561012268066, + "learning_rate": 9.16445114430588e-06, + "loss": 0.6051, + "step": 6848 + }, + { + "epoch": 0.09290558871405317, + "grad_norm": 5.514062881469727, + "learning_rate": 9.164314101685626e-06, + "loss": 0.4949, + "step": 6849 + }, + { + "epoch": 0.09291915355398807, + "grad_norm": 6.952356815338135, + "learning_rate": 9.16417705906537e-06, + "loss": 0.3764, + "step": 6850 + }, + { + "epoch": 0.09293271839392295, + "grad_norm": 5.612140655517578, + "learning_rate": 9.164040016445114e-06, + "loss": 0.362, + "step": 6851 + }, + { + "epoch": 0.09294628323385784, + "grad_norm": 8.391210556030273, + "learning_rate": 9.163902973824861e-06, + "loss": 0.5587, + "step": 6852 + }, + { + "epoch": 0.09295984807379273, + "grad_norm": 6.7696428298950195, + "learning_rate": 9.163765931204606e-06, + "loss": 0.549, + "step": 6853 + }, + { + "epoch": 0.09297341291372761, + "grad_norm": 5.548990249633789, + "learning_rate": 9.16362888858435e-06, + "loss": 0.2932, + "step": 6854 + }, + { + "epoch": 0.0929869777536625, + "grad_norm": 7.338057518005371, + "learning_rate": 9.163491845964095e-06, + "loss": 0.5116, + "step": 6855 + }, + { + "epoch": 0.0930005425935974, + "grad_norm": 7.7000861167907715, + "learning_rate": 9.16335480334384e-06, + "loss": 0.4699, + "step": 6856 + }, + { + "epoch": 0.09301410743353228, + "grad_norm": 7.001523494720459, + "learning_rate": 9.163217760723585e-06, + "loss": 0.4966, + "step": 6857 + }, + { + "epoch": 0.09302767227346717, + "grad_norm": 8.93632698059082, + "learning_rate": 9.16308071810333e-06, + "loss": 0.5414, + "step": 6858 + }, + { + "epoch": 0.09304123711340206, + "grad_norm": 6.925965785980225, + "learning_rate": 9.162943675483076e-06, + "loss": 0.4734, + "step": 6859 + }, + { + "epoch": 0.09305480195333696, + "grad_norm": 5.772336483001709, + "learning_rate": 9.162806632862821e-06, + "loss": 0.3616, + "step": 6860 + }, + { + "epoch": 0.09306836679327184, + "grad_norm": 6.603212356567383, + "learning_rate": 9.162669590242566e-06, + "loss": 0.5761, + "step": 6861 + }, + { + "epoch": 0.09308193163320673, + "grad_norm": 7.785205364227295, + "learning_rate": 9.162532547622311e-06, + "loss": 0.3954, + "step": 6862 + }, + { + "epoch": 0.09309549647314162, + "grad_norm": 7.772599220275879, + "learning_rate": 9.162395505002056e-06, + "loss": 0.4787, + "step": 6863 + }, + { + "epoch": 0.0931090613130765, + "grad_norm": 7.426291465759277, + "learning_rate": 9.162258462381802e-06, + "loss": 0.462, + "step": 6864 + }, + { + "epoch": 0.0931226261530114, + "grad_norm": 6.77297306060791, + "learning_rate": 9.162121419761547e-06, + "loss": 0.52, + "step": 6865 + }, + { + "epoch": 0.09313619099294629, + "grad_norm": 7.185304164886475, + "learning_rate": 9.161984377141292e-06, + "loss": 0.5018, + "step": 6866 + }, + { + "epoch": 0.09314975583288117, + "grad_norm": 7.237640380859375, + "learning_rate": 9.161847334521037e-06, + "loss": 0.4529, + "step": 6867 + }, + { + "epoch": 0.09316332067281606, + "grad_norm": 6.044459819793701, + "learning_rate": 9.161710291900782e-06, + "loss": 0.539, + "step": 6868 + }, + { + "epoch": 0.09317688551275095, + "grad_norm": 6.312249183654785, + "learning_rate": 9.161573249280526e-06, + "loss": 0.4158, + "step": 6869 + }, + { + "epoch": 0.09319045035268583, + "grad_norm": 6.423272132873535, + "learning_rate": 9.161436206660273e-06, + "loss": 0.4223, + "step": 6870 + }, + { + "epoch": 0.09320401519262073, + "grad_norm": 7.4629621505737305, + "learning_rate": 9.161299164040018e-06, + "loss": 0.5395, + "step": 6871 + }, + { + "epoch": 0.09321758003255562, + "grad_norm": 6.635207653045654, + "learning_rate": 9.161162121419761e-06, + "loss": 0.4035, + "step": 6872 + }, + { + "epoch": 0.0932311448724905, + "grad_norm": 7.116694450378418, + "learning_rate": 9.161025078799507e-06, + "loss": 0.5936, + "step": 6873 + }, + { + "epoch": 0.09324470971242539, + "grad_norm": 7.056005477905273, + "learning_rate": 9.160888036179253e-06, + "loss": 0.6174, + "step": 6874 + }, + { + "epoch": 0.09325827455236028, + "grad_norm": 7.0088019371032715, + "learning_rate": 9.160750993558999e-06, + "loss": 0.5478, + "step": 6875 + }, + { + "epoch": 0.09327183939229518, + "grad_norm": 7.407569885253906, + "learning_rate": 9.160613950938742e-06, + "loss": 0.4403, + "step": 6876 + }, + { + "epoch": 0.09328540423223006, + "grad_norm": 8.24545669555664, + "learning_rate": 9.160476908318487e-06, + "loss": 0.6383, + "step": 6877 + }, + { + "epoch": 0.09329896907216495, + "grad_norm": 7.88601541519165, + "learning_rate": 9.160339865698234e-06, + "loss": 0.6529, + "step": 6878 + }, + { + "epoch": 0.09331253391209984, + "grad_norm": 6.825550079345703, + "learning_rate": 9.160202823077978e-06, + "loss": 0.4358, + "step": 6879 + }, + { + "epoch": 0.09332609875203472, + "grad_norm": 6.37179708480835, + "learning_rate": 9.160065780457723e-06, + "loss": 0.3803, + "step": 6880 + }, + { + "epoch": 0.09333966359196962, + "grad_norm": 6.23637056350708, + "learning_rate": 9.159928737837468e-06, + "loss": 0.4684, + "step": 6881 + }, + { + "epoch": 0.09335322843190451, + "grad_norm": 6.185037612915039, + "learning_rate": 9.159791695217213e-06, + "loss": 0.4371, + "step": 6882 + }, + { + "epoch": 0.09336679327183939, + "grad_norm": 7.396061897277832, + "learning_rate": 9.159654652596958e-06, + "loss": 0.6601, + "step": 6883 + }, + { + "epoch": 0.09338035811177428, + "grad_norm": 5.5555219650268555, + "learning_rate": 9.159517609976703e-06, + "loss": 0.2768, + "step": 6884 + }, + { + "epoch": 0.09339392295170917, + "grad_norm": 6.4903130531311035, + "learning_rate": 9.159380567356449e-06, + "loss": 0.3953, + "step": 6885 + }, + { + "epoch": 0.09340748779164405, + "grad_norm": 6.5438151359558105, + "learning_rate": 9.159243524736194e-06, + "loss": 0.3824, + "step": 6886 + }, + { + "epoch": 0.09342105263157895, + "grad_norm": 7.481869220733643, + "learning_rate": 9.159106482115939e-06, + "loss": 0.4258, + "step": 6887 + }, + { + "epoch": 0.09343461747151384, + "grad_norm": 6.800469875335693, + "learning_rate": 9.158969439495684e-06, + "loss": 0.4607, + "step": 6888 + }, + { + "epoch": 0.09344818231144872, + "grad_norm": 10.425286293029785, + "learning_rate": 9.15883239687543e-06, + "loss": 0.5213, + "step": 6889 + }, + { + "epoch": 0.09346174715138361, + "grad_norm": 7.132293701171875, + "learning_rate": 9.158695354255175e-06, + "loss": 0.3004, + "step": 6890 + }, + { + "epoch": 0.0934753119913185, + "grad_norm": 6.82808780670166, + "learning_rate": 9.15855831163492e-06, + "loss": 0.378, + "step": 6891 + }, + { + "epoch": 0.0934888768312534, + "grad_norm": 6.82515287399292, + "learning_rate": 9.158421269014665e-06, + "loss": 0.4281, + "step": 6892 + }, + { + "epoch": 0.09350244167118828, + "grad_norm": 6.516178131103516, + "learning_rate": 9.15828422639441e-06, + "loss": 0.4204, + "step": 6893 + }, + { + "epoch": 0.09351600651112317, + "grad_norm": 6.38896369934082, + "learning_rate": 9.158147183774154e-06, + "loss": 0.4639, + "step": 6894 + }, + { + "epoch": 0.09352957135105806, + "grad_norm": 7.565627098083496, + "learning_rate": 9.158010141153899e-06, + "loss": 0.5292, + "step": 6895 + }, + { + "epoch": 0.09354313619099294, + "grad_norm": 5.586428642272949, + "learning_rate": 9.157873098533646e-06, + "loss": 0.2986, + "step": 6896 + }, + { + "epoch": 0.09355670103092784, + "grad_norm": 8.367637634277344, + "learning_rate": 9.157736055913389e-06, + "loss": 0.5305, + "step": 6897 + }, + { + "epoch": 0.09357026587086273, + "grad_norm": 8.927364349365234, + "learning_rate": 9.157599013293134e-06, + "loss": 0.4551, + "step": 6898 + }, + { + "epoch": 0.09358383071079761, + "grad_norm": 7.430498123168945, + "learning_rate": 9.15746197067288e-06, + "loss": 0.3489, + "step": 6899 + }, + { + "epoch": 0.0935973955507325, + "grad_norm": 5.662916660308838, + "learning_rate": 9.157324928052626e-06, + "loss": 0.2974, + "step": 6900 + }, + { + "epoch": 0.0936109603906674, + "grad_norm": 6.833930492401123, + "learning_rate": 9.15718788543237e-06, + "loss": 0.4327, + "step": 6901 + }, + { + "epoch": 0.09362452523060227, + "grad_norm": 6.872268199920654, + "learning_rate": 9.157050842812115e-06, + "loss": 0.3694, + "step": 6902 + }, + { + "epoch": 0.09363809007053717, + "grad_norm": 9.673556327819824, + "learning_rate": 9.15691380019186e-06, + "loss": 0.5118, + "step": 6903 + }, + { + "epoch": 0.09365165491047206, + "grad_norm": 9.500045776367188, + "learning_rate": 9.156776757571605e-06, + "loss": 0.5251, + "step": 6904 + }, + { + "epoch": 0.09366521975040694, + "grad_norm": 8.722054481506348, + "learning_rate": 9.15663971495135e-06, + "loss": 0.4133, + "step": 6905 + }, + { + "epoch": 0.09367878459034183, + "grad_norm": 6.192148208618164, + "learning_rate": 9.156502672331096e-06, + "loss": 0.4453, + "step": 6906 + }, + { + "epoch": 0.09369234943027673, + "grad_norm": 7.919445514678955, + "learning_rate": 9.156365629710841e-06, + "loss": 0.3294, + "step": 6907 + }, + { + "epoch": 0.09370591427021162, + "grad_norm": 7.2042670249938965, + "learning_rate": 9.156228587090586e-06, + "loss": 0.4265, + "step": 6908 + }, + { + "epoch": 0.0937194791101465, + "grad_norm": 8.136058807373047, + "learning_rate": 9.156091544470331e-06, + "loss": 0.4295, + "step": 6909 + }, + { + "epoch": 0.09373304395008139, + "grad_norm": 6.617058753967285, + "learning_rate": 9.155954501850076e-06, + "loss": 0.4153, + "step": 6910 + }, + { + "epoch": 0.09374660879001628, + "grad_norm": 4.56997537612915, + "learning_rate": 9.155817459229822e-06, + "loss": 0.2433, + "step": 6911 + }, + { + "epoch": 0.09376017362995116, + "grad_norm": 6.943722724914551, + "learning_rate": 9.155680416609565e-06, + "loss": 0.2948, + "step": 6912 + }, + { + "epoch": 0.09377373846988606, + "grad_norm": 8.942042350769043, + "learning_rate": 9.155543373989312e-06, + "loss": 0.3906, + "step": 6913 + }, + { + "epoch": 0.09378730330982095, + "grad_norm": 8.975082397460938, + "learning_rate": 9.155406331369057e-06, + "loss": 0.5051, + "step": 6914 + }, + { + "epoch": 0.09380086814975583, + "grad_norm": 8.131871223449707, + "learning_rate": 9.155269288748802e-06, + "loss": 0.4293, + "step": 6915 + }, + { + "epoch": 0.09381443298969072, + "grad_norm": 5.864385604858398, + "learning_rate": 9.155132246128546e-06, + "loss": 0.3223, + "step": 6916 + }, + { + "epoch": 0.09382799782962561, + "grad_norm": 8.068056106567383, + "learning_rate": 9.154995203508293e-06, + "loss": 0.4364, + "step": 6917 + }, + { + "epoch": 0.0938415626695605, + "grad_norm": 8.770727157592773, + "learning_rate": 9.154858160888038e-06, + "loss": 0.6053, + "step": 6918 + }, + { + "epoch": 0.09385512750949539, + "grad_norm": 8.11565113067627, + "learning_rate": 9.154721118267781e-06, + "loss": 0.5041, + "step": 6919 + }, + { + "epoch": 0.09386869234943028, + "grad_norm": 10.469564437866211, + "learning_rate": 9.154584075647527e-06, + "loss": 0.4626, + "step": 6920 + }, + { + "epoch": 0.09388225718936516, + "grad_norm": 7.190886497497559, + "learning_rate": 9.154447033027273e-06, + "loss": 0.3804, + "step": 6921 + }, + { + "epoch": 0.09389582202930005, + "grad_norm": 5.395630836486816, + "learning_rate": 9.154309990407017e-06, + "loss": 0.3411, + "step": 6922 + }, + { + "epoch": 0.09390938686923495, + "grad_norm": 7.981282711029053, + "learning_rate": 9.154172947786762e-06, + "loss": 0.4986, + "step": 6923 + }, + { + "epoch": 0.09392295170916984, + "grad_norm": 6.672587871551514, + "learning_rate": 9.154035905166507e-06, + "loss": 0.4883, + "step": 6924 + }, + { + "epoch": 0.09393651654910472, + "grad_norm": 7.278805732727051, + "learning_rate": 9.153898862546252e-06, + "loss": 0.2809, + "step": 6925 + }, + { + "epoch": 0.09395008138903961, + "grad_norm": 5.017495155334473, + "learning_rate": 9.153761819925998e-06, + "loss": 0.3758, + "step": 6926 + }, + { + "epoch": 0.0939636462289745, + "grad_norm": 6.90028190612793, + "learning_rate": 9.153624777305743e-06, + "loss": 0.393, + "step": 6927 + }, + { + "epoch": 0.09397721106890938, + "grad_norm": 4.378829002380371, + "learning_rate": 9.153487734685488e-06, + "loss": 0.2107, + "step": 6928 + }, + { + "epoch": 0.09399077590884428, + "grad_norm": 9.268119812011719, + "learning_rate": 9.153350692065233e-06, + "loss": 0.5547, + "step": 6929 + }, + { + "epoch": 0.09400434074877917, + "grad_norm": 5.17721700668335, + "learning_rate": 9.153213649444978e-06, + "loss": 0.2911, + "step": 6930 + }, + { + "epoch": 0.09401790558871405, + "grad_norm": 6.380768299102783, + "learning_rate": 9.153076606824724e-06, + "loss": 0.3722, + "step": 6931 + }, + { + "epoch": 0.09403147042864894, + "grad_norm": 6.213378429412842, + "learning_rate": 9.152939564204469e-06, + "loss": 0.3076, + "step": 6932 + }, + { + "epoch": 0.09404503526858383, + "grad_norm": 8.14702320098877, + "learning_rate": 9.152802521584214e-06, + "loss": 0.5337, + "step": 6933 + }, + { + "epoch": 0.09405860010851871, + "grad_norm": 7.698368549346924, + "learning_rate": 9.152665478963959e-06, + "loss": 0.41, + "step": 6934 + }, + { + "epoch": 0.09407216494845361, + "grad_norm": 7.380148410797119, + "learning_rate": 9.152528436343704e-06, + "loss": 0.4404, + "step": 6935 + }, + { + "epoch": 0.0940857297883885, + "grad_norm": 8.6791353225708, + "learning_rate": 9.15239139372345e-06, + "loss": 0.3637, + "step": 6936 + }, + { + "epoch": 0.09409929462832338, + "grad_norm": 6.81826639175415, + "learning_rate": 9.152254351103193e-06, + "loss": 0.4461, + "step": 6937 + }, + { + "epoch": 0.09411285946825827, + "grad_norm": 7.351174354553223, + "learning_rate": 9.152117308482938e-06, + "loss": 0.4269, + "step": 6938 + }, + { + "epoch": 0.09412642430819317, + "grad_norm": 10.895195960998535, + "learning_rate": 9.151980265862685e-06, + "loss": 0.8085, + "step": 6939 + }, + { + "epoch": 0.09413998914812806, + "grad_norm": 6.603496074676514, + "learning_rate": 9.151843223242428e-06, + "loss": 0.3417, + "step": 6940 + }, + { + "epoch": 0.09415355398806294, + "grad_norm": 8.624687194824219, + "learning_rate": 9.151706180622174e-06, + "loss": 0.3992, + "step": 6941 + }, + { + "epoch": 0.09416711882799783, + "grad_norm": 5.392604827880859, + "learning_rate": 9.151569138001919e-06, + "loss": 0.3594, + "step": 6942 + }, + { + "epoch": 0.09418068366793272, + "grad_norm": 8.396567344665527, + "learning_rate": 9.151432095381666e-06, + "loss": 0.5646, + "step": 6943 + }, + { + "epoch": 0.0941942485078676, + "grad_norm": 7.314423084259033, + "learning_rate": 9.151295052761409e-06, + "loss": 0.4176, + "step": 6944 + }, + { + "epoch": 0.0942078133478025, + "grad_norm": 9.423177719116211, + "learning_rate": 9.151158010141154e-06, + "loss": 0.5065, + "step": 6945 + }, + { + "epoch": 0.09422137818773739, + "grad_norm": 7.29128360748291, + "learning_rate": 9.1510209675209e-06, + "loss": 0.495, + "step": 6946 + }, + { + "epoch": 0.09423494302767227, + "grad_norm": 7.795336723327637, + "learning_rate": 9.150883924900645e-06, + "loss": 0.3448, + "step": 6947 + }, + { + "epoch": 0.09424850786760716, + "grad_norm": 7.985692977905273, + "learning_rate": 9.15074688228039e-06, + "loss": 0.4502, + "step": 6948 + }, + { + "epoch": 0.09426207270754206, + "grad_norm": 6.575237274169922, + "learning_rate": 9.150609839660135e-06, + "loss": 0.458, + "step": 6949 + }, + { + "epoch": 0.09427563754747693, + "grad_norm": 9.132733345031738, + "learning_rate": 9.15047279703988e-06, + "loss": 0.5559, + "step": 6950 + }, + { + "epoch": 0.09428920238741183, + "grad_norm": 6.025794982910156, + "learning_rate": 9.150335754419625e-06, + "loss": 0.2811, + "step": 6951 + }, + { + "epoch": 0.09430276722734672, + "grad_norm": 6.1030073165893555, + "learning_rate": 9.15019871179937e-06, + "loss": 0.3675, + "step": 6952 + }, + { + "epoch": 0.0943163320672816, + "grad_norm": 7.3172760009765625, + "learning_rate": 9.150061669179116e-06, + "loss": 0.3649, + "step": 6953 + }, + { + "epoch": 0.09432989690721649, + "grad_norm": 9.824678421020508, + "learning_rate": 9.149924626558861e-06, + "loss": 0.4039, + "step": 6954 + }, + { + "epoch": 0.09434346174715139, + "grad_norm": 6.882275104522705, + "learning_rate": 9.149787583938604e-06, + "loss": 0.355, + "step": 6955 + }, + { + "epoch": 0.09435702658708628, + "grad_norm": 9.420613288879395, + "learning_rate": 9.149650541318351e-06, + "loss": 0.5818, + "step": 6956 + }, + { + "epoch": 0.09437059142702116, + "grad_norm": 9.922722816467285, + "learning_rate": 9.149513498698096e-06, + "loss": 0.5429, + "step": 6957 + }, + { + "epoch": 0.09438415626695605, + "grad_norm": 6.600496292114258, + "learning_rate": 9.149376456077842e-06, + "loss": 0.3833, + "step": 6958 + }, + { + "epoch": 0.09439772110689094, + "grad_norm": 7.533786773681641, + "learning_rate": 9.149239413457585e-06, + "loss": 0.4185, + "step": 6959 + }, + { + "epoch": 0.09441128594682582, + "grad_norm": 7.56702184677124, + "learning_rate": 9.149102370837332e-06, + "loss": 0.4251, + "step": 6960 + }, + { + "epoch": 0.09442485078676072, + "grad_norm": 6.285019874572754, + "learning_rate": 9.148965328217077e-06, + "loss": 0.2504, + "step": 6961 + }, + { + "epoch": 0.09443841562669561, + "grad_norm": 5.7258758544921875, + "learning_rate": 9.14882828559682e-06, + "loss": 0.3013, + "step": 6962 + }, + { + "epoch": 0.09445198046663049, + "grad_norm": 6.009842872619629, + "learning_rate": 9.148691242976566e-06, + "loss": 0.2476, + "step": 6963 + }, + { + "epoch": 0.09446554530656538, + "grad_norm": 6.348672389984131, + "learning_rate": 9.148554200356311e-06, + "loss": 0.3463, + "step": 6964 + }, + { + "epoch": 0.09447911014650028, + "grad_norm": 7.1751508712768555, + "learning_rate": 9.148417157736056e-06, + "loss": 0.3392, + "step": 6965 + }, + { + "epoch": 0.09449267498643515, + "grad_norm": 6.150791645050049, + "learning_rate": 9.148280115115801e-06, + "loss": 0.2857, + "step": 6966 + }, + { + "epoch": 0.09450623982637005, + "grad_norm": 8.548272132873535, + "learning_rate": 9.148143072495547e-06, + "loss": 0.4895, + "step": 6967 + }, + { + "epoch": 0.09451980466630494, + "grad_norm": 10.009669303894043, + "learning_rate": 9.148006029875292e-06, + "loss": 0.5655, + "step": 6968 + }, + { + "epoch": 0.09453336950623982, + "grad_norm": 7.893733024597168, + "learning_rate": 9.147868987255037e-06, + "loss": 0.4036, + "step": 6969 + }, + { + "epoch": 0.09454693434617471, + "grad_norm": 5.754116058349609, + "learning_rate": 9.147731944634782e-06, + "loss": 0.3524, + "step": 6970 + }, + { + "epoch": 0.0945604991861096, + "grad_norm": 6.7207159996032715, + "learning_rate": 9.147594902014527e-06, + "loss": 0.4066, + "step": 6971 + }, + { + "epoch": 0.0945740640260445, + "grad_norm": 7.22791051864624, + "learning_rate": 9.147457859394272e-06, + "loss": 0.3332, + "step": 6972 + }, + { + "epoch": 0.09458762886597938, + "grad_norm": 9.238178253173828, + "learning_rate": 9.147320816774018e-06, + "loss": 0.4415, + "step": 6973 + }, + { + "epoch": 0.09460119370591427, + "grad_norm": 9.813645362854004, + "learning_rate": 9.147183774153763e-06, + "loss": 0.364, + "step": 6974 + }, + { + "epoch": 0.09461475854584916, + "grad_norm": 7.853779315948486, + "learning_rate": 9.147046731533508e-06, + "loss": 0.408, + "step": 6975 + }, + { + "epoch": 0.09462832338578404, + "grad_norm": 6.33134126663208, + "learning_rate": 9.146909688913253e-06, + "loss": 0.3985, + "step": 6976 + }, + { + "epoch": 0.09464188822571894, + "grad_norm": 6.850912094116211, + "learning_rate": 9.146772646292997e-06, + "loss": 0.4343, + "step": 6977 + }, + { + "epoch": 0.09465545306565383, + "grad_norm": 7.210448265075684, + "learning_rate": 9.146635603672744e-06, + "loss": 0.4459, + "step": 6978 + }, + { + "epoch": 0.09466901790558871, + "grad_norm": 7.738489151000977, + "learning_rate": 9.146498561052489e-06, + "loss": 0.5907, + "step": 6979 + }, + { + "epoch": 0.0946825827455236, + "grad_norm": 8.45873737335205, + "learning_rate": 9.146361518432232e-06, + "loss": 0.5945, + "step": 6980 + }, + { + "epoch": 0.0946961475854585, + "grad_norm": 8.274968147277832, + "learning_rate": 9.146224475811977e-06, + "loss": 0.4231, + "step": 6981 + }, + { + "epoch": 0.09470971242539337, + "grad_norm": 6.938342571258545, + "learning_rate": 9.146087433191724e-06, + "loss": 0.4254, + "step": 6982 + }, + { + "epoch": 0.09472327726532827, + "grad_norm": 8.165298461914062, + "learning_rate": 9.14595039057147e-06, + "loss": 0.5646, + "step": 6983 + }, + { + "epoch": 0.09473684210526316, + "grad_norm": 6.4615044593811035, + "learning_rate": 9.145813347951213e-06, + "loss": 0.4573, + "step": 6984 + }, + { + "epoch": 0.09475040694519804, + "grad_norm": 7.587967872619629, + "learning_rate": 9.145676305330958e-06, + "loss": 0.3065, + "step": 6985 + }, + { + "epoch": 0.09476397178513293, + "grad_norm": 8.13144302368164, + "learning_rate": 9.145539262710705e-06, + "loss": 0.4407, + "step": 6986 + }, + { + "epoch": 0.09477753662506783, + "grad_norm": 7.422403812408447, + "learning_rate": 9.145402220090448e-06, + "loss": 0.6371, + "step": 6987 + }, + { + "epoch": 0.09479110146500272, + "grad_norm": 6.686308860778809, + "learning_rate": 9.145265177470194e-06, + "loss": 0.5264, + "step": 6988 + }, + { + "epoch": 0.0948046663049376, + "grad_norm": 7.2885589599609375, + "learning_rate": 9.145128134849939e-06, + "loss": 0.4342, + "step": 6989 + }, + { + "epoch": 0.09481823114487249, + "grad_norm": 11.023112297058105, + "learning_rate": 9.144991092229684e-06, + "loss": 0.5057, + "step": 6990 + }, + { + "epoch": 0.09483179598480738, + "grad_norm": 7.915450572967529, + "learning_rate": 9.14485404960943e-06, + "loss": 0.4871, + "step": 6991 + }, + { + "epoch": 0.09484536082474226, + "grad_norm": 6.9886298179626465, + "learning_rate": 9.144717006989174e-06, + "loss": 0.4452, + "step": 6992 + }, + { + "epoch": 0.09485892566467716, + "grad_norm": 7.179165840148926, + "learning_rate": 9.14457996436892e-06, + "loss": 0.4437, + "step": 6993 + }, + { + "epoch": 0.09487249050461205, + "grad_norm": 9.265881538391113, + "learning_rate": 9.144442921748665e-06, + "loss": 0.5318, + "step": 6994 + }, + { + "epoch": 0.09488605534454693, + "grad_norm": 8.351490020751953, + "learning_rate": 9.14430587912841e-06, + "loss": 0.617, + "step": 6995 + }, + { + "epoch": 0.09489962018448182, + "grad_norm": 6.215064525604248, + "learning_rate": 9.144168836508155e-06, + "loss": 0.4392, + "step": 6996 + }, + { + "epoch": 0.09491318502441672, + "grad_norm": 7.292091369628906, + "learning_rate": 9.1440317938879e-06, + "loss": 0.4227, + "step": 6997 + }, + { + "epoch": 0.0949267498643516, + "grad_norm": 9.775028228759766, + "learning_rate": 9.143894751267645e-06, + "loss": 0.5115, + "step": 6998 + }, + { + "epoch": 0.09494031470428649, + "grad_norm": 5.892023086547852, + "learning_rate": 9.14375770864739e-06, + "loss": 0.3761, + "step": 6999 + }, + { + "epoch": 0.09495387954422138, + "grad_norm": 8.380631446838379, + "learning_rate": 9.143620666027136e-06, + "loss": 0.4432, + "step": 7000 + }, + { + "epoch": 0.09496744438415626, + "grad_norm": 6.107229709625244, + "learning_rate": 9.143483623406881e-06, + "loss": 0.4137, + "step": 7001 + }, + { + "epoch": 0.09498100922409115, + "grad_norm": 9.720860481262207, + "learning_rate": 9.143346580786624e-06, + "loss": 0.4973, + "step": 7002 + }, + { + "epoch": 0.09499457406402605, + "grad_norm": 5.571103572845459, + "learning_rate": 9.143209538166371e-06, + "loss": 0.4739, + "step": 7003 + }, + { + "epoch": 0.09500813890396094, + "grad_norm": 7.1101202964782715, + "learning_rate": 9.143072495546116e-06, + "loss": 0.4633, + "step": 7004 + }, + { + "epoch": 0.09502170374389582, + "grad_norm": 6.261571884155273, + "learning_rate": 9.14293545292586e-06, + "loss": 0.3394, + "step": 7005 + }, + { + "epoch": 0.09503526858383071, + "grad_norm": 7.878683090209961, + "learning_rate": 9.142798410305605e-06, + "loss": 0.5295, + "step": 7006 + }, + { + "epoch": 0.0950488334237656, + "grad_norm": 7.509459018707275, + "learning_rate": 9.14266136768535e-06, + "loss": 0.4171, + "step": 7007 + }, + { + "epoch": 0.09506239826370048, + "grad_norm": 7.283979892730713, + "learning_rate": 9.142524325065096e-06, + "loss": 0.4161, + "step": 7008 + }, + { + "epoch": 0.09507596310363538, + "grad_norm": 7.295881748199463, + "learning_rate": 9.14238728244484e-06, + "loss": 0.4315, + "step": 7009 + }, + { + "epoch": 0.09508952794357027, + "grad_norm": 8.373663902282715, + "learning_rate": 9.142250239824586e-06, + "loss": 0.6348, + "step": 7010 + }, + { + "epoch": 0.09510309278350515, + "grad_norm": 7.637971878051758, + "learning_rate": 9.142113197204331e-06, + "loss": 0.3149, + "step": 7011 + }, + { + "epoch": 0.09511665762344004, + "grad_norm": 7.5877203941345215, + "learning_rate": 9.141976154584076e-06, + "loss": 0.61, + "step": 7012 + }, + { + "epoch": 0.09513022246337494, + "grad_norm": 7.664604187011719, + "learning_rate": 9.141839111963821e-06, + "loss": 0.4724, + "step": 7013 + }, + { + "epoch": 0.09514378730330982, + "grad_norm": 7.562532901763916, + "learning_rate": 9.141702069343567e-06, + "loss": 0.493, + "step": 7014 + }, + { + "epoch": 0.09515735214324471, + "grad_norm": 8.182361602783203, + "learning_rate": 9.141565026723312e-06, + "loss": 0.5925, + "step": 7015 + }, + { + "epoch": 0.0951709169831796, + "grad_norm": 8.965066909790039, + "learning_rate": 9.141427984103057e-06, + "loss": 0.4931, + "step": 7016 + }, + { + "epoch": 0.09518448182311448, + "grad_norm": 9.71517562866211, + "learning_rate": 9.141290941482802e-06, + "loss": 0.6534, + "step": 7017 + }, + { + "epoch": 0.09519804666304937, + "grad_norm": 10.30614185333252, + "learning_rate": 9.141153898862547e-06, + "loss": 0.5243, + "step": 7018 + }, + { + "epoch": 0.09521161150298427, + "grad_norm": 6.390085220336914, + "learning_rate": 9.141016856242292e-06, + "loss": 0.434, + "step": 7019 + }, + { + "epoch": 0.09522517634291916, + "grad_norm": 7.809947967529297, + "learning_rate": 9.140879813622036e-06, + "loss": 0.4586, + "step": 7020 + }, + { + "epoch": 0.09523874118285404, + "grad_norm": 5.772617340087891, + "learning_rate": 9.140742771001783e-06, + "loss": 0.5151, + "step": 7021 + }, + { + "epoch": 0.09525230602278893, + "grad_norm": 7.328722953796387, + "learning_rate": 9.140605728381528e-06, + "loss": 0.5251, + "step": 7022 + }, + { + "epoch": 0.09526587086272383, + "grad_norm": 5.074647426605225, + "learning_rate": 9.140468685761271e-06, + "loss": 0.4965, + "step": 7023 + }, + { + "epoch": 0.0952794357026587, + "grad_norm": 6.808316230773926, + "learning_rate": 9.140331643141017e-06, + "loss": 0.6065, + "step": 7024 + }, + { + "epoch": 0.0952930005425936, + "grad_norm": 8.225008964538574, + "learning_rate": 9.140194600520764e-06, + "loss": 0.5348, + "step": 7025 + }, + { + "epoch": 0.09530656538252849, + "grad_norm": 7.2517313957214355, + "learning_rate": 9.140057557900509e-06, + "loss": 0.5803, + "step": 7026 + }, + { + "epoch": 0.09532013022246337, + "grad_norm": 7.177356719970703, + "learning_rate": 9.139920515280252e-06, + "loss": 0.3807, + "step": 7027 + }, + { + "epoch": 0.09533369506239826, + "grad_norm": 6.905899524688721, + "learning_rate": 9.139783472659997e-06, + "loss": 0.461, + "step": 7028 + }, + { + "epoch": 0.09534725990233316, + "grad_norm": 7.33081579208374, + "learning_rate": 9.139646430039744e-06, + "loss": 0.4488, + "step": 7029 + }, + { + "epoch": 0.09536082474226804, + "grad_norm": 6.747955799102783, + "learning_rate": 9.139509387419488e-06, + "loss": 0.4062, + "step": 7030 + }, + { + "epoch": 0.09537438958220293, + "grad_norm": 7.692569255828857, + "learning_rate": 9.139372344799233e-06, + "loss": 0.4782, + "step": 7031 + }, + { + "epoch": 0.09538795442213782, + "grad_norm": 8.025800704956055, + "learning_rate": 9.139235302178978e-06, + "loss": 0.5027, + "step": 7032 + }, + { + "epoch": 0.0954015192620727, + "grad_norm": 6.519133567810059, + "learning_rate": 9.139098259558723e-06, + "loss": 0.6774, + "step": 7033 + }, + { + "epoch": 0.0954150841020076, + "grad_norm": 8.19769287109375, + "learning_rate": 9.138961216938468e-06, + "loss": 0.6202, + "step": 7034 + }, + { + "epoch": 0.09542864894194249, + "grad_norm": 6.383480548858643, + "learning_rate": 9.138824174318214e-06, + "loss": 0.367, + "step": 7035 + }, + { + "epoch": 0.09544221378187738, + "grad_norm": 5.666415214538574, + "learning_rate": 9.138687131697959e-06, + "loss": 0.4575, + "step": 7036 + }, + { + "epoch": 0.09545577862181226, + "grad_norm": 6.4181647300720215, + "learning_rate": 9.138550089077704e-06, + "loss": 0.6902, + "step": 7037 + }, + { + "epoch": 0.09546934346174715, + "grad_norm": 6.897795677185059, + "learning_rate": 9.13841304645745e-06, + "loss": 0.4364, + "step": 7038 + }, + { + "epoch": 0.09548290830168205, + "grad_norm": 7.850866317749023, + "learning_rate": 9.138276003837194e-06, + "loss": 0.5663, + "step": 7039 + }, + { + "epoch": 0.09549647314161692, + "grad_norm": 8.181022644042969, + "learning_rate": 9.13813896121694e-06, + "loss": 0.4992, + "step": 7040 + }, + { + "epoch": 0.09551003798155182, + "grad_norm": 5.118661880493164, + "learning_rate": 9.138001918596685e-06, + "loss": 0.246, + "step": 7041 + }, + { + "epoch": 0.09552360282148671, + "grad_norm": 5.896828651428223, + "learning_rate": 9.13786487597643e-06, + "loss": 0.5334, + "step": 7042 + }, + { + "epoch": 0.09553716766142159, + "grad_norm": 8.302257537841797, + "learning_rate": 9.137727833356175e-06, + "loss": 0.5703, + "step": 7043 + }, + { + "epoch": 0.09555073250135648, + "grad_norm": 8.462867736816406, + "learning_rate": 9.13759079073592e-06, + "loss": 0.672, + "step": 7044 + }, + { + "epoch": 0.09556429734129138, + "grad_norm": 7.171424865722656, + "learning_rate": 9.137453748115664e-06, + "loss": 0.3969, + "step": 7045 + }, + { + "epoch": 0.09557786218122626, + "grad_norm": 8.439470291137695, + "learning_rate": 9.137316705495409e-06, + "loss": 0.6465, + "step": 7046 + }, + { + "epoch": 0.09559142702116115, + "grad_norm": 5.925795555114746, + "learning_rate": 9.137179662875156e-06, + "loss": 0.3869, + "step": 7047 + }, + { + "epoch": 0.09560499186109604, + "grad_norm": 5.8956522941589355, + "learning_rate": 9.1370426202549e-06, + "loss": 0.3619, + "step": 7048 + }, + { + "epoch": 0.09561855670103092, + "grad_norm": 6.720495223999023, + "learning_rate": 9.136905577634644e-06, + "loss": 0.3547, + "step": 7049 + }, + { + "epoch": 0.09563212154096581, + "grad_norm": 6.848424911499023, + "learning_rate": 9.13676853501439e-06, + "loss": 0.4357, + "step": 7050 + }, + { + "epoch": 0.09564568638090071, + "grad_norm": 6.376928806304932, + "learning_rate": 9.136631492394137e-06, + "loss": 0.4398, + "step": 7051 + }, + { + "epoch": 0.0956592512208356, + "grad_norm": 6.086352825164795, + "learning_rate": 9.13649444977388e-06, + "loss": 0.4137, + "step": 7052 + }, + { + "epoch": 0.09567281606077048, + "grad_norm": 8.74813175201416, + "learning_rate": 9.136357407153625e-06, + "loss": 0.5437, + "step": 7053 + }, + { + "epoch": 0.09568638090070537, + "grad_norm": 6.683625221252441, + "learning_rate": 9.13622036453337e-06, + "loss": 0.3634, + "step": 7054 + }, + { + "epoch": 0.09569994574064027, + "grad_norm": 7.161520481109619, + "learning_rate": 9.136083321913116e-06, + "loss": 0.5002, + "step": 7055 + }, + { + "epoch": 0.09571351058057515, + "grad_norm": 8.604036331176758, + "learning_rate": 9.13594627929286e-06, + "loss": 0.538, + "step": 7056 + }, + { + "epoch": 0.09572707542051004, + "grad_norm": 5.650012493133545, + "learning_rate": 9.135809236672606e-06, + "loss": 0.4935, + "step": 7057 + }, + { + "epoch": 0.09574064026044493, + "grad_norm": 6.167545795440674, + "learning_rate": 9.135672194052351e-06, + "loss": 0.4082, + "step": 7058 + }, + { + "epoch": 0.09575420510037981, + "grad_norm": 5.432552337646484, + "learning_rate": 9.135535151432096e-06, + "loss": 0.4674, + "step": 7059 + }, + { + "epoch": 0.0957677699403147, + "grad_norm": 7.347333908081055, + "learning_rate": 9.135398108811841e-06, + "loss": 0.5142, + "step": 7060 + }, + { + "epoch": 0.0957813347802496, + "grad_norm": 6.510279655456543, + "learning_rate": 9.135261066191587e-06, + "loss": 0.3722, + "step": 7061 + }, + { + "epoch": 0.09579489962018448, + "grad_norm": 6.750470161437988, + "learning_rate": 9.135124023571332e-06, + "loss": 0.4415, + "step": 7062 + }, + { + "epoch": 0.09580846446011937, + "grad_norm": 8.847661018371582, + "learning_rate": 9.134986980951075e-06, + "loss": 0.4735, + "step": 7063 + }, + { + "epoch": 0.09582202930005426, + "grad_norm": 6.788100719451904, + "learning_rate": 9.134849938330822e-06, + "loss": 0.5972, + "step": 7064 + }, + { + "epoch": 0.09583559413998914, + "grad_norm": 7.168949127197266, + "learning_rate": 9.134712895710567e-06, + "loss": 0.5564, + "step": 7065 + }, + { + "epoch": 0.09584915897992403, + "grad_norm": 6.668752670288086, + "learning_rate": 9.134575853090312e-06, + "loss": 0.4576, + "step": 7066 + }, + { + "epoch": 0.09586272381985893, + "grad_norm": 7.422762870788574, + "learning_rate": 9.134438810470056e-06, + "loss": 0.5045, + "step": 7067 + }, + { + "epoch": 0.09587628865979382, + "grad_norm": 9.04239273071289, + "learning_rate": 9.134301767849803e-06, + "loss": 0.5752, + "step": 7068 + }, + { + "epoch": 0.0958898534997287, + "grad_norm": 6.335741996765137, + "learning_rate": 9.134164725229548e-06, + "loss": 0.4115, + "step": 7069 + }, + { + "epoch": 0.0959034183396636, + "grad_norm": 6.852840423583984, + "learning_rate": 9.134027682609292e-06, + "loss": 0.4842, + "step": 7070 + }, + { + "epoch": 0.09591698317959849, + "grad_norm": 7.742671489715576, + "learning_rate": 9.133890639989037e-06, + "loss": 0.4178, + "step": 7071 + }, + { + "epoch": 0.09593054801953337, + "grad_norm": 6.211202621459961, + "learning_rate": 9.133753597368784e-06, + "loss": 0.6173, + "step": 7072 + }, + { + "epoch": 0.09594411285946826, + "grad_norm": 6.599330902099609, + "learning_rate": 9.133616554748527e-06, + "loss": 0.384, + "step": 7073 + }, + { + "epoch": 0.09595767769940315, + "grad_norm": 8.556187629699707, + "learning_rate": 9.133479512128272e-06, + "loss": 0.4461, + "step": 7074 + }, + { + "epoch": 0.09597124253933803, + "grad_norm": 6.338963508605957, + "learning_rate": 9.133342469508017e-06, + "loss": 0.4094, + "step": 7075 + }, + { + "epoch": 0.09598480737927292, + "grad_norm": 6.808475017547607, + "learning_rate": 9.133205426887763e-06, + "loss": 0.4967, + "step": 7076 + }, + { + "epoch": 0.09599837221920782, + "grad_norm": 7.2639241218566895, + "learning_rate": 9.133068384267508e-06, + "loss": 0.6361, + "step": 7077 + }, + { + "epoch": 0.0960119370591427, + "grad_norm": 8.805831909179688, + "learning_rate": 9.132931341647253e-06, + "loss": 0.5163, + "step": 7078 + }, + { + "epoch": 0.09602550189907759, + "grad_norm": 7.434836387634277, + "learning_rate": 9.132794299026998e-06, + "loss": 0.5853, + "step": 7079 + }, + { + "epoch": 0.09603906673901248, + "grad_norm": 5.763955593109131, + "learning_rate": 9.132657256406743e-06, + "loss": 0.3556, + "step": 7080 + }, + { + "epoch": 0.09605263157894736, + "grad_norm": 6.502776622772217, + "learning_rate": 9.132520213786488e-06, + "loss": 0.5389, + "step": 7081 + }, + { + "epoch": 0.09606619641888225, + "grad_norm": 7.626429080963135, + "learning_rate": 9.132383171166234e-06, + "loss": 0.5509, + "step": 7082 + }, + { + "epoch": 0.09607976125881715, + "grad_norm": 6.369384288787842, + "learning_rate": 9.132246128545979e-06, + "loss": 0.5716, + "step": 7083 + }, + { + "epoch": 0.09609332609875204, + "grad_norm": 6.6283955574035645, + "learning_rate": 9.132109085925724e-06, + "loss": 0.4178, + "step": 7084 + }, + { + "epoch": 0.09610689093868692, + "grad_norm": 8.60680866241455, + "learning_rate": 9.13197204330547e-06, + "loss": 0.5332, + "step": 7085 + }, + { + "epoch": 0.09612045577862181, + "grad_norm": 7.128846645355225, + "learning_rate": 9.131835000685214e-06, + "loss": 0.4403, + "step": 7086 + }, + { + "epoch": 0.0961340206185567, + "grad_norm": 7.467006206512451, + "learning_rate": 9.13169795806496e-06, + "loss": 0.6024, + "step": 7087 + }, + { + "epoch": 0.09614758545849159, + "grad_norm": 7.103201389312744, + "learning_rate": 9.131560915444703e-06, + "loss": 0.4527, + "step": 7088 + }, + { + "epoch": 0.09616115029842648, + "grad_norm": 6.952934265136719, + "learning_rate": 9.131423872824448e-06, + "loss": 0.5075, + "step": 7089 + }, + { + "epoch": 0.09617471513836137, + "grad_norm": 6.212283611297607, + "learning_rate": 9.131286830204195e-06, + "loss": 0.439, + "step": 7090 + }, + { + "epoch": 0.09618827997829625, + "grad_norm": 6.01862096786499, + "learning_rate": 9.13114978758394e-06, + "loss": 0.4759, + "step": 7091 + }, + { + "epoch": 0.09620184481823114, + "grad_norm": 5.237362861633301, + "learning_rate": 9.131012744963684e-06, + "loss": 0.4819, + "step": 7092 + }, + { + "epoch": 0.09621540965816604, + "grad_norm": 6.305123805999756, + "learning_rate": 9.130875702343429e-06, + "loss": 0.4295, + "step": 7093 + }, + { + "epoch": 0.09622897449810092, + "grad_norm": 6.4239959716796875, + "learning_rate": 9.130738659723176e-06, + "loss": 0.44, + "step": 7094 + }, + { + "epoch": 0.09624253933803581, + "grad_norm": 5.151004314422607, + "learning_rate": 9.13060161710292e-06, + "loss": 0.3439, + "step": 7095 + }, + { + "epoch": 0.0962561041779707, + "grad_norm": 7.108129024505615, + "learning_rate": 9.130464574482664e-06, + "loss": 0.4779, + "step": 7096 + }, + { + "epoch": 0.09626966901790558, + "grad_norm": 6.3904876708984375, + "learning_rate": 9.13032753186241e-06, + "loss": 0.4915, + "step": 7097 + }, + { + "epoch": 0.09628323385784048, + "grad_norm": 5.843562126159668, + "learning_rate": 9.130190489242155e-06, + "loss": 0.5286, + "step": 7098 + }, + { + "epoch": 0.09629679869777537, + "grad_norm": 6.911268711090088, + "learning_rate": 9.1300534466219e-06, + "loss": 0.5027, + "step": 7099 + }, + { + "epoch": 0.09631036353771026, + "grad_norm": 5.505631446838379, + "learning_rate": 9.129916404001645e-06, + "loss": 0.5438, + "step": 7100 + }, + { + "epoch": 0.09632392837764514, + "grad_norm": 6.39338493347168, + "learning_rate": 9.12977936138139e-06, + "loss": 0.4923, + "step": 7101 + }, + { + "epoch": 0.09633749321758003, + "grad_norm": 6.976517677307129, + "learning_rate": 9.129642318761136e-06, + "loss": 0.4571, + "step": 7102 + }, + { + "epoch": 0.09635105805751493, + "grad_norm": 7.448004245758057, + "learning_rate": 9.12950527614088e-06, + "loss": 0.6145, + "step": 7103 + }, + { + "epoch": 0.0963646228974498, + "grad_norm": 8.211416244506836, + "learning_rate": 9.129368233520626e-06, + "loss": 0.465, + "step": 7104 + }, + { + "epoch": 0.0963781877373847, + "grad_norm": 7.556941986083984, + "learning_rate": 9.129231190900371e-06, + "loss": 0.5443, + "step": 7105 + }, + { + "epoch": 0.09639175257731959, + "grad_norm": 7.5758233070373535, + "learning_rate": 9.129094148280116e-06, + "loss": 0.4589, + "step": 7106 + }, + { + "epoch": 0.09640531741725447, + "grad_norm": 6.187138557434082, + "learning_rate": 9.128957105659861e-06, + "loss": 0.4274, + "step": 7107 + }, + { + "epoch": 0.09641888225718936, + "grad_norm": 7.159659385681152, + "learning_rate": 9.128820063039607e-06, + "loss": 0.4513, + "step": 7108 + }, + { + "epoch": 0.09643244709712426, + "grad_norm": 7.0199079513549805, + "learning_rate": 9.128683020419352e-06, + "loss": 0.4961, + "step": 7109 + }, + { + "epoch": 0.09644601193705914, + "grad_norm": 7.702294826507568, + "learning_rate": 9.128545977799095e-06, + "loss": 0.4544, + "step": 7110 + }, + { + "epoch": 0.09645957677699403, + "grad_norm": 8.269706726074219, + "learning_rate": 9.128408935178842e-06, + "loss": 0.4596, + "step": 7111 + }, + { + "epoch": 0.09647314161692892, + "grad_norm": 8.729523658752441, + "learning_rate": 9.128271892558587e-06, + "loss": 0.479, + "step": 7112 + }, + { + "epoch": 0.0964867064568638, + "grad_norm": 5.550798416137695, + "learning_rate": 9.12813484993833e-06, + "loss": 0.3824, + "step": 7113 + }, + { + "epoch": 0.0965002712967987, + "grad_norm": 7.445060729980469, + "learning_rate": 9.127997807318076e-06, + "loss": 0.4643, + "step": 7114 + }, + { + "epoch": 0.09651383613673359, + "grad_norm": 7.734904766082764, + "learning_rate": 9.127860764697821e-06, + "loss": 0.4962, + "step": 7115 + }, + { + "epoch": 0.09652740097666848, + "grad_norm": 5.94295597076416, + "learning_rate": 9.127723722077566e-06, + "loss": 0.3341, + "step": 7116 + }, + { + "epoch": 0.09654096581660336, + "grad_norm": 6.617339134216309, + "learning_rate": 9.127586679457312e-06, + "loss": 0.4918, + "step": 7117 + }, + { + "epoch": 0.09655453065653825, + "grad_norm": 5.663693428039551, + "learning_rate": 9.127449636837057e-06, + "loss": 0.4027, + "step": 7118 + }, + { + "epoch": 0.09656809549647315, + "grad_norm": 10.338403701782227, + "learning_rate": 9.127312594216802e-06, + "loss": 0.5991, + "step": 7119 + }, + { + "epoch": 0.09658166033640803, + "grad_norm": 5.519593238830566, + "learning_rate": 9.127175551596547e-06, + "loss": 0.3773, + "step": 7120 + }, + { + "epoch": 0.09659522517634292, + "grad_norm": 6.9581708908081055, + "learning_rate": 9.127038508976292e-06, + "loss": 0.4163, + "step": 7121 + }, + { + "epoch": 0.09660879001627781, + "grad_norm": 4.920627593994141, + "learning_rate": 9.126901466356037e-06, + "loss": 0.4861, + "step": 7122 + }, + { + "epoch": 0.09662235485621269, + "grad_norm": 7.610362529754639, + "learning_rate": 9.126764423735783e-06, + "loss": 0.5401, + "step": 7123 + }, + { + "epoch": 0.09663591969614758, + "grad_norm": 7.601241111755371, + "learning_rate": 9.126627381115528e-06, + "loss": 0.6369, + "step": 7124 + }, + { + "epoch": 0.09664948453608248, + "grad_norm": 6.8656768798828125, + "learning_rate": 9.126490338495273e-06, + "loss": 0.582, + "step": 7125 + }, + { + "epoch": 0.09666304937601736, + "grad_norm": 6.884796142578125, + "learning_rate": 9.126353295875018e-06, + "loss": 0.4423, + "step": 7126 + }, + { + "epoch": 0.09667661421595225, + "grad_norm": 6.260648250579834, + "learning_rate": 9.126216253254763e-06, + "loss": 0.4073, + "step": 7127 + }, + { + "epoch": 0.09669017905588714, + "grad_norm": 5.977949142456055, + "learning_rate": 9.126079210634507e-06, + "loss": 0.4497, + "step": 7128 + }, + { + "epoch": 0.09670374389582202, + "grad_norm": 6.013444900512695, + "learning_rate": 9.125942168014254e-06, + "loss": 0.3189, + "step": 7129 + }, + { + "epoch": 0.09671730873575692, + "grad_norm": 7.216954231262207, + "learning_rate": 9.125805125393999e-06, + "loss": 0.5192, + "step": 7130 + }, + { + "epoch": 0.09673087357569181, + "grad_norm": 5.777342319488525, + "learning_rate": 9.125668082773742e-06, + "loss": 0.3058, + "step": 7131 + }, + { + "epoch": 0.0967444384156267, + "grad_norm": 7.162139415740967, + "learning_rate": 9.125531040153488e-06, + "loss": 0.3498, + "step": 7132 + }, + { + "epoch": 0.09675800325556158, + "grad_norm": 5.8563551902771, + "learning_rate": 9.125393997533234e-06, + "loss": 0.4007, + "step": 7133 + }, + { + "epoch": 0.09677156809549647, + "grad_norm": 7.047511100769043, + "learning_rate": 9.12525695491298e-06, + "loss": 0.5255, + "step": 7134 + }, + { + "epoch": 0.09678513293543137, + "grad_norm": 5.659986972808838, + "learning_rate": 9.125119912292723e-06, + "loss": 0.407, + "step": 7135 + }, + { + "epoch": 0.09679869777536625, + "grad_norm": 5.891483306884766, + "learning_rate": 9.124982869672468e-06, + "loss": 0.4282, + "step": 7136 + }, + { + "epoch": 0.09681226261530114, + "grad_norm": 6.3045735359191895, + "learning_rate": 9.124845827052215e-06, + "loss": 0.5345, + "step": 7137 + }, + { + "epoch": 0.09682582745523603, + "grad_norm": 8.736778259277344, + "learning_rate": 9.124708784431959e-06, + "loss": 0.6055, + "step": 7138 + }, + { + "epoch": 0.09683939229517091, + "grad_norm": 6.7169718742370605, + "learning_rate": 9.124571741811704e-06, + "loss": 0.4739, + "step": 7139 + }, + { + "epoch": 0.0968529571351058, + "grad_norm": 6.924840927124023, + "learning_rate": 9.124434699191449e-06, + "loss": 0.4695, + "step": 7140 + }, + { + "epoch": 0.0968665219750407, + "grad_norm": 6.980868339538574, + "learning_rate": 9.124297656571194e-06, + "loss": 0.4464, + "step": 7141 + }, + { + "epoch": 0.09688008681497558, + "grad_norm": 6.771517753601074, + "learning_rate": 9.12416061395094e-06, + "loss": 0.3814, + "step": 7142 + }, + { + "epoch": 0.09689365165491047, + "grad_norm": 6.324697017669678, + "learning_rate": 9.124023571330684e-06, + "loss": 0.3017, + "step": 7143 + }, + { + "epoch": 0.09690721649484536, + "grad_norm": 7.514735221862793, + "learning_rate": 9.12388652871043e-06, + "loss": 0.5845, + "step": 7144 + }, + { + "epoch": 0.09692078133478024, + "grad_norm": 6.689491271972656, + "learning_rate": 9.123749486090175e-06, + "loss": 0.4598, + "step": 7145 + }, + { + "epoch": 0.09693434617471514, + "grad_norm": 5.228218078613281, + "learning_rate": 9.12361244346992e-06, + "loss": 0.3677, + "step": 7146 + }, + { + "epoch": 0.09694791101465003, + "grad_norm": 7.894123554229736, + "learning_rate": 9.123475400849665e-06, + "loss": 0.6278, + "step": 7147 + }, + { + "epoch": 0.09696147585458492, + "grad_norm": 8.622876167297363, + "learning_rate": 9.12333835822941e-06, + "loss": 0.533, + "step": 7148 + }, + { + "epoch": 0.0969750406945198, + "grad_norm": 6.318272113800049, + "learning_rate": 9.123201315609156e-06, + "loss": 0.3323, + "step": 7149 + }, + { + "epoch": 0.0969886055344547, + "grad_norm": 10.21066665649414, + "learning_rate": 9.1230642729889e-06, + "loss": 0.8499, + "step": 7150 + }, + { + "epoch": 0.09700217037438959, + "grad_norm": 6.787251949310303, + "learning_rate": 9.122927230368646e-06, + "loss": 0.4278, + "step": 7151 + }, + { + "epoch": 0.09701573521432447, + "grad_norm": 8.636932373046875, + "learning_rate": 9.122790187748391e-06, + "loss": 0.4272, + "step": 7152 + }, + { + "epoch": 0.09702930005425936, + "grad_norm": 8.046987533569336, + "learning_rate": 9.122653145128135e-06, + "loss": 0.5688, + "step": 7153 + }, + { + "epoch": 0.09704286489419425, + "grad_norm": 6.8686652183532715, + "learning_rate": 9.122516102507881e-06, + "loss": 0.7537, + "step": 7154 + }, + { + "epoch": 0.09705642973412913, + "grad_norm": 6.411286354064941, + "learning_rate": 9.122379059887627e-06, + "loss": 0.3266, + "step": 7155 + }, + { + "epoch": 0.09706999457406403, + "grad_norm": 6.423822402954102, + "learning_rate": 9.12224201726737e-06, + "loss": 0.4228, + "step": 7156 + }, + { + "epoch": 0.09708355941399892, + "grad_norm": 6.1799421310424805, + "learning_rate": 9.122104974647115e-06, + "loss": 0.6046, + "step": 7157 + }, + { + "epoch": 0.0970971242539338, + "grad_norm": 7.252535343170166, + "learning_rate": 9.12196793202686e-06, + "loss": 0.5171, + "step": 7158 + }, + { + "epoch": 0.09711068909386869, + "grad_norm": 6.692342281341553, + "learning_rate": 9.121830889406607e-06, + "loss": 0.3652, + "step": 7159 + }, + { + "epoch": 0.09712425393380358, + "grad_norm": 8.808453559875488, + "learning_rate": 9.12169384678635e-06, + "loss": 0.6765, + "step": 7160 + }, + { + "epoch": 0.09713781877373846, + "grad_norm": 9.023077011108398, + "learning_rate": 9.121556804166096e-06, + "loss": 0.3525, + "step": 7161 + }, + { + "epoch": 0.09715138361367336, + "grad_norm": 6.565252304077148, + "learning_rate": 9.121419761545841e-06, + "loss": 0.5343, + "step": 7162 + }, + { + "epoch": 0.09716494845360825, + "grad_norm": 4.769837379455566, + "learning_rate": 9.121282718925586e-06, + "loss": 0.3659, + "step": 7163 + }, + { + "epoch": 0.09717851329354314, + "grad_norm": 6.648383140563965, + "learning_rate": 9.121145676305332e-06, + "loss": 0.4547, + "step": 7164 + }, + { + "epoch": 0.09719207813347802, + "grad_norm": 4.7084455490112305, + "learning_rate": 9.121008633685077e-06, + "loss": 0.3623, + "step": 7165 + }, + { + "epoch": 0.09720564297341291, + "grad_norm": 7.064411640167236, + "learning_rate": 9.120871591064822e-06, + "loss": 0.4873, + "step": 7166 + }, + { + "epoch": 0.09721920781334781, + "grad_norm": 5.559317111968994, + "learning_rate": 9.120734548444567e-06, + "loss": 0.3591, + "step": 7167 + }, + { + "epoch": 0.09723277265328269, + "grad_norm": 8.26363754272461, + "learning_rate": 9.120597505824312e-06, + "loss": 0.5327, + "step": 7168 + }, + { + "epoch": 0.09724633749321758, + "grad_norm": 6.065924644470215, + "learning_rate": 9.120460463204057e-06, + "loss": 0.4092, + "step": 7169 + }, + { + "epoch": 0.09725990233315247, + "grad_norm": 5.650042533874512, + "learning_rate": 9.120323420583803e-06, + "loss": 0.4295, + "step": 7170 + }, + { + "epoch": 0.09727346717308735, + "grad_norm": 5.956927299499512, + "learning_rate": 9.120186377963546e-06, + "loss": 0.447, + "step": 7171 + }, + { + "epoch": 0.09728703201302225, + "grad_norm": 6.778727054595947, + "learning_rate": 9.120049335343293e-06, + "loss": 0.3623, + "step": 7172 + }, + { + "epoch": 0.09730059685295714, + "grad_norm": 6.85474967956543, + "learning_rate": 9.119912292723038e-06, + "loss": 0.5758, + "step": 7173 + }, + { + "epoch": 0.09731416169289202, + "grad_norm": 6.1297149658203125, + "learning_rate": 9.119775250102783e-06, + "loss": 0.3933, + "step": 7174 + }, + { + "epoch": 0.09732772653282691, + "grad_norm": 5.334348678588867, + "learning_rate": 9.119638207482527e-06, + "loss": 0.364, + "step": 7175 + }, + { + "epoch": 0.0973412913727618, + "grad_norm": 6.137834072113037, + "learning_rate": 9.119501164862274e-06, + "loss": 0.4638, + "step": 7176 + }, + { + "epoch": 0.0973548562126967, + "grad_norm": 5.541005611419678, + "learning_rate": 9.119364122242019e-06, + "loss": 0.4322, + "step": 7177 + }, + { + "epoch": 0.09736842105263158, + "grad_norm": 5.325681209564209, + "learning_rate": 9.119227079621762e-06, + "loss": 0.4758, + "step": 7178 + }, + { + "epoch": 0.09738198589256647, + "grad_norm": 6.82753324508667, + "learning_rate": 9.119090037001508e-06, + "loss": 0.5744, + "step": 7179 + }, + { + "epoch": 0.09739555073250136, + "grad_norm": 7.117947101593018, + "learning_rate": 9.118952994381254e-06, + "loss": 0.4686, + "step": 7180 + }, + { + "epoch": 0.09740911557243624, + "grad_norm": 6.0606231689453125, + "learning_rate": 9.118815951760998e-06, + "loss": 0.5273, + "step": 7181 + }, + { + "epoch": 0.09742268041237113, + "grad_norm": 6.981362819671631, + "learning_rate": 9.118678909140743e-06, + "loss": 0.4218, + "step": 7182 + }, + { + "epoch": 0.09743624525230603, + "grad_norm": 5.574527740478516, + "learning_rate": 9.118541866520488e-06, + "loss": 0.4042, + "step": 7183 + }, + { + "epoch": 0.09744981009224091, + "grad_norm": 8.429253578186035, + "learning_rate": 9.118404823900233e-06, + "loss": 0.5175, + "step": 7184 + }, + { + "epoch": 0.0974633749321758, + "grad_norm": 8.173959732055664, + "learning_rate": 9.118267781279979e-06, + "loss": 0.552, + "step": 7185 + }, + { + "epoch": 0.0974769397721107, + "grad_norm": 7.255666732788086, + "learning_rate": 9.118130738659724e-06, + "loss": 0.4516, + "step": 7186 + }, + { + "epoch": 0.09749050461204557, + "grad_norm": 5.435947895050049, + "learning_rate": 9.117993696039469e-06, + "loss": 0.4025, + "step": 7187 + }, + { + "epoch": 0.09750406945198047, + "grad_norm": 9.084199905395508, + "learning_rate": 9.117856653419214e-06, + "loss": 0.5038, + "step": 7188 + }, + { + "epoch": 0.09751763429191536, + "grad_norm": 5.0492987632751465, + "learning_rate": 9.11771961079896e-06, + "loss": 0.2786, + "step": 7189 + }, + { + "epoch": 0.09753119913185024, + "grad_norm": 4.866666316986084, + "learning_rate": 9.117582568178705e-06, + "loss": 0.286, + "step": 7190 + }, + { + "epoch": 0.09754476397178513, + "grad_norm": 6.44254207611084, + "learning_rate": 9.11744552555845e-06, + "loss": 0.4357, + "step": 7191 + }, + { + "epoch": 0.09755832881172002, + "grad_norm": 5.407470703125, + "learning_rate": 9.117308482938195e-06, + "loss": 0.3536, + "step": 7192 + }, + { + "epoch": 0.09757189365165492, + "grad_norm": 5.073469638824463, + "learning_rate": 9.11717144031794e-06, + "loss": 0.2831, + "step": 7193 + }, + { + "epoch": 0.0975854584915898, + "grad_norm": 12.102794647216797, + "learning_rate": 9.117034397697685e-06, + "loss": 0.3974, + "step": 7194 + }, + { + "epoch": 0.09759902333152469, + "grad_norm": 5.545167446136475, + "learning_rate": 9.11689735507743e-06, + "loss": 0.2923, + "step": 7195 + }, + { + "epoch": 0.09761258817145958, + "grad_norm": 6.516904354095459, + "learning_rate": 9.116760312457174e-06, + "loss": 0.451, + "step": 7196 + }, + { + "epoch": 0.09762615301139446, + "grad_norm": 5.952854633331299, + "learning_rate": 9.116623269836919e-06, + "loss": 0.4437, + "step": 7197 + }, + { + "epoch": 0.09763971785132936, + "grad_norm": 6.381673336029053, + "learning_rate": 9.116486227216666e-06, + "loss": 0.489, + "step": 7198 + }, + { + "epoch": 0.09765328269126425, + "grad_norm": 8.40587043762207, + "learning_rate": 9.116349184596411e-06, + "loss": 0.3864, + "step": 7199 + }, + { + "epoch": 0.09766684753119913, + "grad_norm": 9.426403999328613, + "learning_rate": 9.116212141976155e-06, + "loss": 0.4568, + "step": 7200 + }, + { + "epoch": 0.09768041237113402, + "grad_norm": 5.3055853843688965, + "learning_rate": 9.1160750993559e-06, + "loss": 0.427, + "step": 7201 + }, + { + "epoch": 0.09769397721106891, + "grad_norm": 6.662203311920166, + "learning_rate": 9.115938056735647e-06, + "loss": 0.4934, + "step": 7202 + }, + { + "epoch": 0.09770754205100379, + "grad_norm": 5.398073196411133, + "learning_rate": 9.11580101411539e-06, + "loss": 0.2807, + "step": 7203 + }, + { + "epoch": 0.09772110689093869, + "grad_norm": 6.620965003967285, + "learning_rate": 9.115663971495135e-06, + "loss": 0.4409, + "step": 7204 + }, + { + "epoch": 0.09773467173087358, + "grad_norm": 5.027764320373535, + "learning_rate": 9.11552692887488e-06, + "loss": 0.3889, + "step": 7205 + }, + { + "epoch": 0.09774823657080846, + "grad_norm": 4.814938545227051, + "learning_rate": 9.115389886254626e-06, + "loss": 0.4189, + "step": 7206 + }, + { + "epoch": 0.09776180141074335, + "grad_norm": 6.069299697875977, + "learning_rate": 9.115252843634371e-06, + "loss": 0.3884, + "step": 7207 + }, + { + "epoch": 0.09777536625067824, + "grad_norm": 5.239330768585205, + "learning_rate": 9.115115801014116e-06, + "loss": 0.3105, + "step": 7208 + }, + { + "epoch": 0.09778893109061314, + "grad_norm": 4.271071434020996, + "learning_rate": 9.114978758393861e-06, + "loss": 0.191, + "step": 7209 + }, + { + "epoch": 0.09780249593054802, + "grad_norm": 4.778337001800537, + "learning_rate": 9.114841715773606e-06, + "loss": 0.3168, + "step": 7210 + }, + { + "epoch": 0.09781606077048291, + "grad_norm": 4.575054168701172, + "learning_rate": 9.114704673153352e-06, + "loss": 0.213, + "step": 7211 + }, + { + "epoch": 0.0978296256104178, + "grad_norm": 5.952762126922607, + "learning_rate": 9.114567630533097e-06, + "loss": 0.4069, + "step": 7212 + }, + { + "epoch": 0.09784319045035268, + "grad_norm": 5.148465633392334, + "learning_rate": 9.114430587912842e-06, + "loss": 0.3725, + "step": 7213 + }, + { + "epoch": 0.09785675529028758, + "grad_norm": 6.084940433502197, + "learning_rate": 9.114293545292585e-06, + "loss": 0.3756, + "step": 7214 + }, + { + "epoch": 0.09787032013022247, + "grad_norm": 6.896650791168213, + "learning_rate": 9.114156502672332e-06, + "loss": 0.3692, + "step": 7215 + }, + { + "epoch": 0.09788388497015735, + "grad_norm": 5.580122947692871, + "learning_rate": 9.114019460052077e-06, + "loss": 0.3511, + "step": 7216 + }, + { + "epoch": 0.09789744981009224, + "grad_norm": 7.152846813201904, + "learning_rate": 9.113882417431823e-06, + "loss": 0.4377, + "step": 7217 + }, + { + "epoch": 0.09791101465002713, + "grad_norm": 8.045719146728516, + "learning_rate": 9.113745374811566e-06, + "loss": 0.5189, + "step": 7218 + }, + { + "epoch": 0.09792457948996201, + "grad_norm": 7.320886135101318, + "learning_rate": 9.113608332191313e-06, + "loss": 0.4035, + "step": 7219 + }, + { + "epoch": 0.0979381443298969, + "grad_norm": 6.724581241607666, + "learning_rate": 9.113471289571058e-06, + "loss": 0.4957, + "step": 7220 + }, + { + "epoch": 0.0979517091698318, + "grad_norm": 7.842456817626953, + "learning_rate": 9.113334246950802e-06, + "loss": 0.4702, + "step": 7221 + }, + { + "epoch": 0.09796527400976668, + "grad_norm": 7.226176738739014, + "learning_rate": 9.113197204330547e-06, + "loss": 0.3917, + "step": 7222 + }, + { + "epoch": 0.09797883884970157, + "grad_norm": 9.21474838256836, + "learning_rate": 9.113060161710294e-06, + "loss": 0.6722, + "step": 7223 + }, + { + "epoch": 0.09799240368963646, + "grad_norm": 7.727296829223633, + "learning_rate": 9.112923119090037e-06, + "loss": 0.5683, + "step": 7224 + }, + { + "epoch": 0.09800596852957136, + "grad_norm": 6.549561023712158, + "learning_rate": 9.112786076469782e-06, + "loss": 0.3205, + "step": 7225 + }, + { + "epoch": 0.09801953336950624, + "grad_norm": 6.534356594085693, + "learning_rate": 9.112649033849528e-06, + "loss": 0.4209, + "step": 7226 + }, + { + "epoch": 0.09803309820944113, + "grad_norm": 5.915219783782959, + "learning_rate": 9.112511991229273e-06, + "loss": 0.3897, + "step": 7227 + }, + { + "epoch": 0.09804666304937602, + "grad_norm": 9.079439163208008, + "learning_rate": 9.112374948609018e-06, + "loss": 0.695, + "step": 7228 + }, + { + "epoch": 0.0980602278893109, + "grad_norm": 5.777479648590088, + "learning_rate": 9.112237905988763e-06, + "loss": 0.4403, + "step": 7229 + }, + { + "epoch": 0.0980737927292458, + "grad_norm": 5.627225875854492, + "learning_rate": 9.112100863368508e-06, + "loss": 0.3139, + "step": 7230 + }, + { + "epoch": 0.09808735756918069, + "grad_norm": 8.26496410369873, + "learning_rate": 9.111963820748253e-06, + "loss": 0.4764, + "step": 7231 + }, + { + "epoch": 0.09810092240911557, + "grad_norm": 10.168281555175781, + "learning_rate": 9.111826778127999e-06, + "loss": 0.5761, + "step": 7232 + }, + { + "epoch": 0.09811448724905046, + "grad_norm": 6.46494722366333, + "learning_rate": 9.111689735507744e-06, + "loss": 0.3766, + "step": 7233 + }, + { + "epoch": 0.09812805208898535, + "grad_norm": 7.639989852905273, + "learning_rate": 9.111552692887489e-06, + "loss": 0.5479, + "step": 7234 + }, + { + "epoch": 0.09814161692892023, + "grad_norm": 6.320513725280762, + "learning_rate": 9.111415650267234e-06, + "loss": 0.44, + "step": 7235 + }, + { + "epoch": 0.09815518176885513, + "grad_norm": 5.874512195587158, + "learning_rate": 9.11127860764698e-06, + "loss": 0.3496, + "step": 7236 + }, + { + "epoch": 0.09816874660879002, + "grad_norm": 6.659133434295654, + "learning_rate": 9.111141565026725e-06, + "loss": 0.3722, + "step": 7237 + }, + { + "epoch": 0.0981823114487249, + "grad_norm": 7.148982524871826, + "learning_rate": 9.11100452240647e-06, + "loss": 0.5152, + "step": 7238 + }, + { + "epoch": 0.09819587628865979, + "grad_norm": 4.926681995391846, + "learning_rate": 9.110867479786213e-06, + "loss": 0.2027, + "step": 7239 + }, + { + "epoch": 0.09820944112859468, + "grad_norm": 6.684436321258545, + "learning_rate": 9.110730437165958e-06, + "loss": 0.4632, + "step": 7240 + }, + { + "epoch": 0.09822300596852958, + "grad_norm": 6.250188827514648, + "learning_rate": 9.110593394545705e-06, + "loss": 0.4838, + "step": 7241 + }, + { + "epoch": 0.09823657080846446, + "grad_norm": 8.260445594787598, + "learning_rate": 9.11045635192545e-06, + "loss": 0.3673, + "step": 7242 + }, + { + "epoch": 0.09825013564839935, + "grad_norm": 6.159450054168701, + "learning_rate": 9.110319309305194e-06, + "loss": 0.3632, + "step": 7243 + }, + { + "epoch": 0.09826370048833424, + "grad_norm": 4.976537704467773, + "learning_rate": 9.110182266684939e-06, + "loss": 0.3911, + "step": 7244 + }, + { + "epoch": 0.09827726532826912, + "grad_norm": 5.7486371994018555, + "learning_rate": 9.110045224064686e-06, + "loss": 0.2679, + "step": 7245 + }, + { + "epoch": 0.09829083016820402, + "grad_norm": 6.275547981262207, + "learning_rate": 9.10990818144443e-06, + "loss": 0.4296, + "step": 7246 + }, + { + "epoch": 0.09830439500813891, + "grad_norm": 6.213329792022705, + "learning_rate": 9.109771138824175e-06, + "loss": 0.3549, + "step": 7247 + }, + { + "epoch": 0.09831795984807379, + "grad_norm": 7.052527904510498, + "learning_rate": 9.10963409620392e-06, + "loss": 0.3361, + "step": 7248 + }, + { + "epoch": 0.09833152468800868, + "grad_norm": 7.082192420959473, + "learning_rate": 9.109497053583665e-06, + "loss": 0.3309, + "step": 7249 + }, + { + "epoch": 0.09834508952794357, + "grad_norm": 9.278812408447266, + "learning_rate": 9.10936001096341e-06, + "loss": 0.6846, + "step": 7250 + }, + { + "epoch": 0.09835865436787845, + "grad_norm": 7.420627117156982, + "learning_rate": 9.109222968343155e-06, + "loss": 0.4703, + "step": 7251 + }, + { + "epoch": 0.09837221920781335, + "grad_norm": 6.077786922454834, + "learning_rate": 9.1090859257229e-06, + "loss": 0.4106, + "step": 7252 + }, + { + "epoch": 0.09838578404774824, + "grad_norm": 6.341593265533447, + "learning_rate": 9.108948883102646e-06, + "loss": 0.3938, + "step": 7253 + }, + { + "epoch": 0.09839934888768312, + "grad_norm": 8.59618854522705, + "learning_rate": 9.108811840482391e-06, + "loss": 0.4818, + "step": 7254 + }, + { + "epoch": 0.09841291372761801, + "grad_norm": 4.835746765136719, + "learning_rate": 9.108674797862136e-06, + "loss": 0.3209, + "step": 7255 + }, + { + "epoch": 0.0984264785675529, + "grad_norm": 7.091804027557373, + "learning_rate": 9.108537755241881e-06, + "loss": 0.355, + "step": 7256 + }, + { + "epoch": 0.0984400434074878, + "grad_norm": 4.816137313842773, + "learning_rate": 9.108400712621626e-06, + "loss": 0.3266, + "step": 7257 + }, + { + "epoch": 0.09845360824742268, + "grad_norm": 6.731452941894531, + "learning_rate": 9.108263670001372e-06, + "loss": 0.4254, + "step": 7258 + }, + { + "epoch": 0.09846717308735757, + "grad_norm": 6.294147491455078, + "learning_rate": 9.108126627381117e-06, + "loss": 0.345, + "step": 7259 + }, + { + "epoch": 0.09848073792729246, + "grad_norm": 7.3199238777160645, + "learning_rate": 9.107989584760862e-06, + "loss": 0.5939, + "step": 7260 + }, + { + "epoch": 0.09849430276722734, + "grad_norm": 6.344910621643066, + "learning_rate": 9.107852542140605e-06, + "loss": 0.4357, + "step": 7261 + }, + { + "epoch": 0.09850786760716224, + "grad_norm": 5.816447734832764, + "learning_rate": 9.107715499520352e-06, + "loss": 0.2947, + "step": 7262 + }, + { + "epoch": 0.09852143244709713, + "grad_norm": 9.905000686645508, + "learning_rate": 9.107578456900097e-06, + "loss": 0.3196, + "step": 7263 + }, + { + "epoch": 0.09853499728703201, + "grad_norm": 8.064990043640137, + "learning_rate": 9.107441414279841e-06, + "loss": 0.4541, + "step": 7264 + }, + { + "epoch": 0.0985485621269669, + "grad_norm": 8.229348182678223, + "learning_rate": 9.107304371659586e-06, + "loss": 0.5354, + "step": 7265 + }, + { + "epoch": 0.0985621269669018, + "grad_norm": 5.580869197845459, + "learning_rate": 9.107167329039331e-06, + "loss": 0.3896, + "step": 7266 + }, + { + "epoch": 0.09857569180683667, + "grad_norm": 6.056223392486572, + "learning_rate": 9.107030286419078e-06, + "loss": 0.3635, + "step": 7267 + }, + { + "epoch": 0.09858925664677157, + "grad_norm": 8.950169563293457, + "learning_rate": 9.106893243798822e-06, + "loss": 0.485, + "step": 7268 + }, + { + "epoch": 0.09860282148670646, + "grad_norm": 6.488556385040283, + "learning_rate": 9.106756201178567e-06, + "loss": 0.315, + "step": 7269 + }, + { + "epoch": 0.09861638632664134, + "grad_norm": 6.596402645111084, + "learning_rate": 9.106619158558312e-06, + "loss": 0.316, + "step": 7270 + }, + { + "epoch": 0.09862995116657623, + "grad_norm": 6.636503219604492, + "learning_rate": 9.106482115938057e-06, + "loss": 0.5088, + "step": 7271 + }, + { + "epoch": 0.09864351600651113, + "grad_norm": 7.149051189422607, + "learning_rate": 9.106345073317802e-06, + "loss": 0.3957, + "step": 7272 + }, + { + "epoch": 0.09865708084644602, + "grad_norm": 6.950559616088867, + "learning_rate": 9.106208030697548e-06, + "loss": 0.4686, + "step": 7273 + }, + { + "epoch": 0.0986706456863809, + "grad_norm": 5.984715938568115, + "learning_rate": 9.106070988077293e-06, + "loss": 0.4346, + "step": 7274 + }, + { + "epoch": 0.09868421052631579, + "grad_norm": 8.615549087524414, + "learning_rate": 9.105933945457038e-06, + "loss": 0.4999, + "step": 7275 + }, + { + "epoch": 0.09869777536625068, + "grad_norm": 4.9211506843566895, + "learning_rate": 9.105796902836783e-06, + "loss": 0.3002, + "step": 7276 + }, + { + "epoch": 0.09871134020618556, + "grad_norm": 5.210302352905273, + "learning_rate": 9.105659860216528e-06, + "loss": 0.3259, + "step": 7277 + }, + { + "epoch": 0.09872490504612046, + "grad_norm": 6.269593238830566, + "learning_rate": 9.105522817596273e-06, + "loss": 0.3799, + "step": 7278 + }, + { + "epoch": 0.09873846988605535, + "grad_norm": 8.003639221191406, + "learning_rate": 9.105385774976019e-06, + "loss": 0.4215, + "step": 7279 + }, + { + "epoch": 0.09875203472599023, + "grad_norm": 6.339828014373779, + "learning_rate": 9.105248732355764e-06, + "loss": 0.4172, + "step": 7280 + }, + { + "epoch": 0.09876559956592512, + "grad_norm": 9.16840934753418, + "learning_rate": 9.105111689735509e-06, + "loss": 0.4642, + "step": 7281 + }, + { + "epoch": 0.09877916440586001, + "grad_norm": 7.813592433929443, + "learning_rate": 9.104974647115254e-06, + "loss": 0.3545, + "step": 7282 + }, + { + "epoch": 0.0987927292457949, + "grad_norm": 11.686284065246582, + "learning_rate": 9.104837604494998e-06, + "loss": 0.6819, + "step": 7283 + }, + { + "epoch": 0.09880629408572979, + "grad_norm": 9.48971939086914, + "learning_rate": 9.104700561874745e-06, + "loss": 0.4393, + "step": 7284 + }, + { + "epoch": 0.09881985892566468, + "grad_norm": 7.214562892913818, + "learning_rate": 9.10456351925449e-06, + "loss": 0.4649, + "step": 7285 + }, + { + "epoch": 0.09883342376559956, + "grad_norm": 6.164071083068848, + "learning_rate": 9.104426476634233e-06, + "loss": 0.2912, + "step": 7286 + }, + { + "epoch": 0.09884698860553445, + "grad_norm": 4.967360019683838, + "learning_rate": 9.104289434013978e-06, + "loss": 0.2963, + "step": 7287 + }, + { + "epoch": 0.09886055344546935, + "grad_norm": 6.117178916931152, + "learning_rate": 9.104152391393725e-06, + "loss": 0.2573, + "step": 7288 + }, + { + "epoch": 0.09887411828540424, + "grad_norm": 7.606668949127197, + "learning_rate": 9.104015348773469e-06, + "loss": 0.3384, + "step": 7289 + }, + { + "epoch": 0.09888768312533912, + "grad_norm": 6.7847065925598145, + "learning_rate": 9.103878306153214e-06, + "loss": 0.32, + "step": 7290 + }, + { + "epoch": 0.09890124796527401, + "grad_norm": 8.05947208404541, + "learning_rate": 9.103741263532959e-06, + "loss": 0.4911, + "step": 7291 + }, + { + "epoch": 0.0989148128052089, + "grad_norm": 5.386173248291016, + "learning_rate": 9.103604220912704e-06, + "loss": 0.3491, + "step": 7292 + }, + { + "epoch": 0.09892837764514378, + "grad_norm": 7.1301188468933105, + "learning_rate": 9.10346717829245e-06, + "loss": 0.3613, + "step": 7293 + }, + { + "epoch": 0.09894194248507868, + "grad_norm": 7.2487874031066895, + "learning_rate": 9.103330135672195e-06, + "loss": 0.4347, + "step": 7294 + }, + { + "epoch": 0.09895550732501357, + "grad_norm": 7.278442859649658, + "learning_rate": 9.10319309305194e-06, + "loss": 0.3224, + "step": 7295 + }, + { + "epoch": 0.09896907216494845, + "grad_norm": 7.302004337310791, + "learning_rate": 9.103056050431685e-06, + "loss": 0.3513, + "step": 7296 + }, + { + "epoch": 0.09898263700488334, + "grad_norm": 7.292433261871338, + "learning_rate": 9.10291900781143e-06, + "loss": 0.4253, + "step": 7297 + }, + { + "epoch": 0.09899620184481824, + "grad_norm": 6.0885701179504395, + "learning_rate": 9.102781965191175e-06, + "loss": 0.366, + "step": 7298 + }, + { + "epoch": 0.09900976668475311, + "grad_norm": 5.983548164367676, + "learning_rate": 9.10264492257092e-06, + "loss": 0.3136, + "step": 7299 + }, + { + "epoch": 0.09902333152468801, + "grad_norm": 6.487707614898682, + "learning_rate": 9.102507879950666e-06, + "loss": 0.4097, + "step": 7300 + }, + { + "epoch": 0.0990368963646229, + "grad_norm": 4.865837574005127, + "learning_rate": 9.102370837330411e-06, + "loss": 0.2854, + "step": 7301 + }, + { + "epoch": 0.09905046120455778, + "grad_norm": 6.078958034515381, + "learning_rate": 9.102233794710156e-06, + "loss": 0.2813, + "step": 7302 + }, + { + "epoch": 0.09906402604449267, + "grad_norm": 5.06894588470459, + "learning_rate": 9.102096752089901e-06, + "loss": 0.2626, + "step": 7303 + }, + { + "epoch": 0.09907759088442757, + "grad_norm": 7.074959754943848, + "learning_rate": 9.101959709469645e-06, + "loss": 0.3968, + "step": 7304 + }, + { + "epoch": 0.09909115572436246, + "grad_norm": 7.83662223815918, + "learning_rate": 9.101822666849392e-06, + "loss": 0.5285, + "step": 7305 + }, + { + "epoch": 0.09910472056429734, + "grad_norm": 7.409583568572998, + "learning_rate": 9.101685624229137e-06, + "loss": 0.322, + "step": 7306 + }, + { + "epoch": 0.09911828540423223, + "grad_norm": 5.104046821594238, + "learning_rate": 9.10154858160888e-06, + "loss": 0.3672, + "step": 7307 + }, + { + "epoch": 0.09913185024416712, + "grad_norm": 6.365405082702637, + "learning_rate": 9.101411538988625e-06, + "loss": 0.3589, + "step": 7308 + }, + { + "epoch": 0.099145415084102, + "grad_norm": 7.122282028198242, + "learning_rate": 9.10127449636837e-06, + "loss": 0.5278, + "step": 7309 + }, + { + "epoch": 0.0991589799240369, + "grad_norm": 6.264898777008057, + "learning_rate": 9.101137453748118e-06, + "loss": 0.3907, + "step": 7310 + }, + { + "epoch": 0.09917254476397179, + "grad_norm": 6.473680019378662, + "learning_rate": 9.101000411127861e-06, + "loss": 0.441, + "step": 7311 + }, + { + "epoch": 0.09918610960390667, + "grad_norm": 7.2776970863342285, + "learning_rate": 9.100863368507606e-06, + "loss": 0.394, + "step": 7312 + }, + { + "epoch": 0.09919967444384156, + "grad_norm": 5.946937561035156, + "learning_rate": 9.100726325887351e-06, + "loss": 0.375, + "step": 7313 + }, + { + "epoch": 0.09921323928377646, + "grad_norm": 9.565195083618164, + "learning_rate": 9.100589283267097e-06, + "loss": 0.5356, + "step": 7314 + }, + { + "epoch": 0.09922680412371133, + "grad_norm": 6.794239521026611, + "learning_rate": 9.100452240646842e-06, + "loss": 0.4053, + "step": 7315 + }, + { + "epoch": 0.09924036896364623, + "grad_norm": 5.872915744781494, + "learning_rate": 9.100315198026587e-06, + "loss": 0.3963, + "step": 7316 + }, + { + "epoch": 0.09925393380358112, + "grad_norm": 6.350036144256592, + "learning_rate": 9.100178155406332e-06, + "loss": 0.4255, + "step": 7317 + }, + { + "epoch": 0.099267498643516, + "grad_norm": 9.601551055908203, + "learning_rate": 9.100041112786077e-06, + "loss": 0.631, + "step": 7318 + }, + { + "epoch": 0.0992810634834509, + "grad_norm": 7.454033851623535, + "learning_rate": 9.099904070165822e-06, + "loss": 0.4775, + "step": 7319 + }, + { + "epoch": 0.09929462832338579, + "grad_norm": 6.047488689422607, + "learning_rate": 9.099767027545568e-06, + "loss": 0.368, + "step": 7320 + }, + { + "epoch": 0.09930819316332068, + "grad_norm": 7.6970133781433105, + "learning_rate": 9.099629984925313e-06, + "loss": 0.4664, + "step": 7321 + }, + { + "epoch": 0.09932175800325556, + "grad_norm": 9.7297945022583, + "learning_rate": 9.099492942305056e-06, + "loss": 0.5844, + "step": 7322 + }, + { + "epoch": 0.09933532284319045, + "grad_norm": 7.494564056396484, + "learning_rate": 9.099355899684803e-06, + "loss": 0.3959, + "step": 7323 + }, + { + "epoch": 0.09934888768312534, + "grad_norm": 7.014837741851807, + "learning_rate": 9.099218857064548e-06, + "loss": 0.3692, + "step": 7324 + }, + { + "epoch": 0.09936245252306022, + "grad_norm": 9.733050346374512, + "learning_rate": 9.099081814444293e-06, + "loss": 0.5223, + "step": 7325 + }, + { + "epoch": 0.09937601736299512, + "grad_norm": 6.533509254455566, + "learning_rate": 9.098944771824037e-06, + "loss": 0.3632, + "step": 7326 + }, + { + "epoch": 0.09938958220293001, + "grad_norm": 6.939817905426025, + "learning_rate": 9.098807729203784e-06, + "loss": 0.514, + "step": 7327 + }, + { + "epoch": 0.09940314704286489, + "grad_norm": 6.845818519592285, + "learning_rate": 9.098670686583529e-06, + "loss": 0.416, + "step": 7328 + }, + { + "epoch": 0.09941671188279978, + "grad_norm": 7.14758825302124, + "learning_rate": 9.098533643963273e-06, + "loss": 0.485, + "step": 7329 + }, + { + "epoch": 0.09943027672273468, + "grad_norm": 6.510430812835693, + "learning_rate": 9.098396601343018e-06, + "loss": 0.3381, + "step": 7330 + }, + { + "epoch": 0.09944384156266955, + "grad_norm": 7.220770835876465, + "learning_rate": 9.098259558722765e-06, + "loss": 0.3874, + "step": 7331 + }, + { + "epoch": 0.09945740640260445, + "grad_norm": 6.48464298248291, + "learning_rate": 9.098122516102508e-06, + "loss": 0.4366, + "step": 7332 + }, + { + "epoch": 0.09947097124253934, + "grad_norm": 8.122598648071289, + "learning_rate": 9.097985473482253e-06, + "loss": 0.5164, + "step": 7333 + }, + { + "epoch": 0.09948453608247422, + "grad_norm": 9.648215293884277, + "learning_rate": 9.097848430861998e-06, + "loss": 0.8057, + "step": 7334 + }, + { + "epoch": 0.09949810092240911, + "grad_norm": 7.903954029083252, + "learning_rate": 9.097711388241744e-06, + "loss": 0.4771, + "step": 7335 + }, + { + "epoch": 0.099511665762344, + "grad_norm": 7.833309173583984, + "learning_rate": 9.097574345621489e-06, + "loss": 0.4075, + "step": 7336 + }, + { + "epoch": 0.0995252306022789, + "grad_norm": 7.238899230957031, + "learning_rate": 9.097437303001234e-06, + "loss": 0.3949, + "step": 7337 + }, + { + "epoch": 0.09953879544221378, + "grad_norm": 9.062214851379395, + "learning_rate": 9.097300260380979e-06, + "loss": 0.5311, + "step": 7338 + }, + { + "epoch": 0.09955236028214867, + "grad_norm": 6.9983229637146, + "learning_rate": 9.097163217760724e-06, + "loss": 0.5344, + "step": 7339 + }, + { + "epoch": 0.09956592512208357, + "grad_norm": 8.85147762298584, + "learning_rate": 9.09702617514047e-06, + "loss": 0.566, + "step": 7340 + }, + { + "epoch": 0.09957948996201844, + "grad_norm": 8.783763885498047, + "learning_rate": 9.096889132520215e-06, + "loss": 0.5696, + "step": 7341 + }, + { + "epoch": 0.09959305480195334, + "grad_norm": 6.904355049133301, + "learning_rate": 9.09675208989996e-06, + "loss": 0.4652, + "step": 7342 + }, + { + "epoch": 0.09960661964188823, + "grad_norm": 5.64204216003418, + "learning_rate": 9.096615047279705e-06, + "loss": 0.3298, + "step": 7343 + }, + { + "epoch": 0.09962018448182311, + "grad_norm": 7.444753170013428, + "learning_rate": 9.09647800465945e-06, + "loss": 0.579, + "step": 7344 + }, + { + "epoch": 0.099633749321758, + "grad_norm": 7.616522789001465, + "learning_rate": 9.096340962039195e-06, + "loss": 0.4494, + "step": 7345 + }, + { + "epoch": 0.0996473141616929, + "grad_norm": 7.49794340133667, + "learning_rate": 9.09620391941894e-06, + "loss": 0.3801, + "step": 7346 + }, + { + "epoch": 0.09966087900162778, + "grad_norm": 4.459370136260986, + "learning_rate": 9.096066876798684e-06, + "loss": 0.2429, + "step": 7347 + }, + { + "epoch": 0.09967444384156267, + "grad_norm": 6.7406816482543945, + "learning_rate": 9.095929834178431e-06, + "loss": 0.5729, + "step": 7348 + }, + { + "epoch": 0.09968800868149756, + "grad_norm": 5.975917339324951, + "learning_rate": 9.095792791558176e-06, + "loss": 0.46, + "step": 7349 + }, + { + "epoch": 0.09970157352143244, + "grad_norm": 6.288971424102783, + "learning_rate": 9.095655748937921e-06, + "loss": 0.3083, + "step": 7350 + }, + { + "epoch": 0.09971513836136733, + "grad_norm": 5.603311538696289, + "learning_rate": 9.095518706317665e-06, + "loss": 0.3029, + "step": 7351 + }, + { + "epoch": 0.09972870320130223, + "grad_norm": 5.537247180938721, + "learning_rate": 9.09538166369741e-06, + "loss": 0.2807, + "step": 7352 + }, + { + "epoch": 0.09974226804123712, + "grad_norm": 6.449461460113525, + "learning_rate": 9.095244621077157e-06, + "loss": 0.492, + "step": 7353 + }, + { + "epoch": 0.099755832881172, + "grad_norm": 6.406412124633789, + "learning_rate": 9.0951075784569e-06, + "loss": 0.4938, + "step": 7354 + }, + { + "epoch": 0.09976939772110689, + "grad_norm": 6.67724609375, + "learning_rate": 9.094970535836645e-06, + "loss": 0.3277, + "step": 7355 + }, + { + "epoch": 0.09978296256104179, + "grad_norm": 6.947128772735596, + "learning_rate": 9.09483349321639e-06, + "loss": 0.4318, + "step": 7356 + }, + { + "epoch": 0.09979652740097666, + "grad_norm": 9.0242280960083, + "learning_rate": 9.094696450596136e-06, + "loss": 0.471, + "step": 7357 + }, + { + "epoch": 0.09981009224091156, + "grad_norm": 4.7377166748046875, + "learning_rate": 9.094559407975881e-06, + "loss": 0.1808, + "step": 7358 + }, + { + "epoch": 0.09982365708084645, + "grad_norm": 7.46419095993042, + "learning_rate": 9.094422365355626e-06, + "loss": 0.5418, + "step": 7359 + }, + { + "epoch": 0.09983722192078133, + "grad_norm": 7.505744934082031, + "learning_rate": 9.094285322735371e-06, + "loss": 0.4775, + "step": 7360 + }, + { + "epoch": 0.09985078676071622, + "grad_norm": 5.750063896179199, + "learning_rate": 9.094148280115117e-06, + "loss": 0.4013, + "step": 7361 + }, + { + "epoch": 0.09986435160065112, + "grad_norm": 6.930904865264893, + "learning_rate": 9.094011237494862e-06, + "loss": 0.4072, + "step": 7362 + }, + { + "epoch": 0.099877916440586, + "grad_norm": 6.749834060668945, + "learning_rate": 9.093874194874607e-06, + "loss": 0.3633, + "step": 7363 + }, + { + "epoch": 0.09989148128052089, + "grad_norm": 6.0135273933410645, + "learning_rate": 9.093737152254352e-06, + "loss": 0.2702, + "step": 7364 + }, + { + "epoch": 0.09990504612045578, + "grad_norm": 6.587977886199951, + "learning_rate": 9.093600109634097e-06, + "loss": 0.3181, + "step": 7365 + }, + { + "epoch": 0.09991861096039066, + "grad_norm": 6.775217533111572, + "learning_rate": 9.093463067013842e-06, + "loss": 0.4065, + "step": 7366 + }, + { + "epoch": 0.09993217580032555, + "grad_norm": 4.8080153465271, + "learning_rate": 9.093326024393588e-06, + "loss": 0.2511, + "step": 7367 + }, + { + "epoch": 0.09994574064026045, + "grad_norm": 5.388307094573975, + "learning_rate": 9.093188981773333e-06, + "loss": 0.3094, + "step": 7368 + }, + { + "epoch": 0.09995930548019534, + "grad_norm": 5.888590335845947, + "learning_rate": 9.093051939153076e-06, + "loss": 0.324, + "step": 7369 + }, + { + "epoch": 0.09997287032013022, + "grad_norm": 8.33454704284668, + "learning_rate": 9.092914896532823e-06, + "loss": 0.5782, + "step": 7370 + }, + { + "epoch": 0.09998643516006511, + "grad_norm": 5.551248073577881, + "learning_rate": 9.092777853912568e-06, + "loss": 0.2658, + "step": 7371 + }, + { + "epoch": 0.1, + "grad_norm": 6.8736186027526855, + "learning_rate": 9.092640811292312e-06, + "loss": 0.4793, + "step": 7372 + }, + { + "epoch": 0.1, + "eval_loss": 0.40063920617103577, + "eval_noise_accuracy": NaN, + "eval_runtime": 4722.0665, + "eval_samples_per_second": 1.064, + "eval_steps_per_second": 0.066, + "eval_wer": 38.42482468876464, + "step": 7372 + }, + { + "epoch": 0.10001356483993488, + "grad_norm": 7.920837879180908, + "learning_rate": 9.092503768672057e-06, + "loss": 0.3321, + "step": 7373 + }, + { + "epoch": 0.10002712967986978, + "grad_norm": 6.356741428375244, + "learning_rate": 9.092366726051804e-06, + "loss": 0.4058, + "step": 7374 + }, + { + "epoch": 0.10004069451980467, + "grad_norm": 6.426058292388916, + "learning_rate": 9.092229683431549e-06, + "loss": 0.3876, + "step": 7375 + }, + { + "epoch": 0.10005425935973955, + "grad_norm": 8.075918197631836, + "learning_rate": 9.092092640811293e-06, + "loss": 0.3014, + "step": 7376 + }, + { + "epoch": 0.10006782419967444, + "grad_norm": 6.020146369934082, + "learning_rate": 9.091955598191038e-06, + "loss": 0.2436, + "step": 7377 + }, + { + "epoch": 0.10008138903960934, + "grad_norm": 6.327799320220947, + "learning_rate": 9.091818555570783e-06, + "loss": 0.5119, + "step": 7378 + }, + { + "epoch": 0.10009495387954422, + "grad_norm": 4.620138168334961, + "learning_rate": 9.091681512950528e-06, + "loss": 0.2328, + "step": 7379 + }, + { + "epoch": 0.10010851871947911, + "grad_norm": 5.348583698272705, + "learning_rate": 9.091544470330273e-06, + "loss": 0.308, + "step": 7380 + }, + { + "epoch": 0.100122083559414, + "grad_norm": 6.210312366485596, + "learning_rate": 9.091407427710018e-06, + "loss": 0.3159, + "step": 7381 + }, + { + "epoch": 0.10013564839934888, + "grad_norm": 6.302791118621826, + "learning_rate": 9.091270385089764e-06, + "loss": 0.3545, + "step": 7382 + }, + { + "epoch": 0.10014921323928377, + "grad_norm": 6.998288154602051, + "learning_rate": 9.091133342469509e-06, + "loss": 0.3434, + "step": 7383 + }, + { + "epoch": 0.10016277807921867, + "grad_norm": 6.296985149383545, + "learning_rate": 9.090996299849254e-06, + "loss": 0.351, + "step": 7384 + }, + { + "epoch": 0.10017634291915356, + "grad_norm": 7.314630031585693, + "learning_rate": 9.090859257228999e-06, + "loss": 0.4196, + "step": 7385 + }, + { + "epoch": 0.10018990775908844, + "grad_norm": 6.170121669769287, + "learning_rate": 9.090722214608744e-06, + "loss": 0.2835, + "step": 7386 + }, + { + "epoch": 0.10020347259902333, + "grad_norm": 7.3160858154296875, + "learning_rate": 9.09058517198849e-06, + "loss": 0.4609, + "step": 7387 + }, + { + "epoch": 0.10021703743895823, + "grad_norm": 9.504778861999512, + "learning_rate": 9.090448129368235e-06, + "loss": 0.4877, + "step": 7388 + }, + { + "epoch": 0.1002306022788931, + "grad_norm": 5.893861293792725, + "learning_rate": 9.09031108674798e-06, + "loss": 0.3854, + "step": 7389 + }, + { + "epoch": 0.100244167118828, + "grad_norm": 5.9888763427734375, + "learning_rate": 9.090174044127725e-06, + "loss": 0.2018, + "step": 7390 + }, + { + "epoch": 0.10025773195876289, + "grad_norm": 5.4891557693481445, + "learning_rate": 9.090037001507469e-06, + "loss": 0.2832, + "step": 7391 + }, + { + "epoch": 0.10027129679869777, + "grad_norm": 5.3760762214660645, + "learning_rate": 9.089899958887215e-06, + "loss": 0.2774, + "step": 7392 + }, + { + "epoch": 0.10028486163863266, + "grad_norm": 4.814632415771484, + "learning_rate": 9.08976291626696e-06, + "loss": 0.2921, + "step": 7393 + }, + { + "epoch": 0.10029842647856756, + "grad_norm": 6.711721420288086, + "learning_rate": 9.089625873646704e-06, + "loss": 0.397, + "step": 7394 + }, + { + "epoch": 0.10031199131850244, + "grad_norm": 5.53347110748291, + "learning_rate": 9.08948883102645e-06, + "loss": 0.4002, + "step": 7395 + }, + { + "epoch": 0.10032555615843733, + "grad_norm": 7.630945682525635, + "learning_rate": 9.089351788406196e-06, + "loss": 0.3336, + "step": 7396 + }, + { + "epoch": 0.10033912099837222, + "grad_norm": 7.74668550491333, + "learning_rate": 9.08921474578594e-06, + "loss": 0.5478, + "step": 7397 + }, + { + "epoch": 0.1003526858383071, + "grad_norm": 6.943678379058838, + "learning_rate": 9.089077703165685e-06, + "loss": 0.3445, + "step": 7398 + }, + { + "epoch": 0.100366250678242, + "grad_norm": 5.4303059577941895, + "learning_rate": 9.08894066054543e-06, + "loss": 0.3338, + "step": 7399 + }, + { + "epoch": 0.10037981551817689, + "grad_norm": 6.194942474365234, + "learning_rate": 9.088803617925175e-06, + "loss": 0.331, + "step": 7400 + }, + { + "epoch": 0.10039338035811178, + "grad_norm": 8.344117164611816, + "learning_rate": 9.08866657530492e-06, + "loss": 0.4813, + "step": 7401 + }, + { + "epoch": 0.10040694519804666, + "grad_norm": 8.322062492370605, + "learning_rate": 9.088529532684665e-06, + "loss": 0.5301, + "step": 7402 + }, + { + "epoch": 0.10042051003798155, + "grad_norm": 6.835194110870361, + "learning_rate": 9.08839249006441e-06, + "loss": 0.4229, + "step": 7403 + }, + { + "epoch": 0.10043407487791645, + "grad_norm": 6.083949089050293, + "learning_rate": 9.088255447444156e-06, + "loss": 0.2832, + "step": 7404 + }, + { + "epoch": 0.10044763971785133, + "grad_norm": 7.021444797515869, + "learning_rate": 9.088118404823901e-06, + "loss": 0.3431, + "step": 7405 + }, + { + "epoch": 0.10046120455778622, + "grad_norm": 6.372574806213379, + "learning_rate": 9.087981362203646e-06, + "loss": 0.4182, + "step": 7406 + }, + { + "epoch": 0.10047476939772111, + "grad_norm": 6.034136772155762, + "learning_rate": 9.087844319583391e-06, + "loss": 0.396, + "step": 7407 + }, + { + "epoch": 0.10048833423765599, + "grad_norm": 7.240843296051025, + "learning_rate": 9.087707276963137e-06, + "loss": 0.4835, + "step": 7408 + }, + { + "epoch": 0.10050189907759088, + "grad_norm": 7.165036678314209, + "learning_rate": 9.087570234342882e-06, + "loss": 0.3083, + "step": 7409 + }, + { + "epoch": 0.10051546391752578, + "grad_norm": 5.662109851837158, + "learning_rate": 9.087433191722627e-06, + "loss": 0.3616, + "step": 7410 + }, + { + "epoch": 0.10052902875746066, + "grad_norm": 6.025138854980469, + "learning_rate": 9.087296149102372e-06, + "loss": 0.375, + "step": 7411 + }, + { + "epoch": 0.10054259359739555, + "grad_norm": 8.572532653808594, + "learning_rate": 9.087159106482116e-06, + "loss": 0.5646, + "step": 7412 + }, + { + "epoch": 0.10055615843733044, + "grad_norm": 7.93411922454834, + "learning_rate": 9.087022063861862e-06, + "loss": 0.4659, + "step": 7413 + }, + { + "epoch": 0.10056972327726532, + "grad_norm": 7.082610607147217, + "learning_rate": 9.086885021241608e-06, + "loss": 0.4657, + "step": 7414 + }, + { + "epoch": 0.10058328811720021, + "grad_norm": 6.573038578033447, + "learning_rate": 9.086747978621351e-06, + "loss": 0.4511, + "step": 7415 + }, + { + "epoch": 0.10059685295713511, + "grad_norm": 6.642657279968262, + "learning_rate": 9.086610936001096e-06, + "loss": 0.36, + "step": 7416 + }, + { + "epoch": 0.10061041779707, + "grad_norm": 10.133790969848633, + "learning_rate": 9.086473893380843e-06, + "loss": 0.531, + "step": 7417 + }, + { + "epoch": 0.10062398263700488, + "grad_norm": 9.318631172180176, + "learning_rate": 9.086336850760588e-06, + "loss": 0.5223, + "step": 7418 + }, + { + "epoch": 0.10063754747693977, + "grad_norm": 6.531031608581543, + "learning_rate": 9.086199808140332e-06, + "loss": 0.4072, + "step": 7419 + }, + { + "epoch": 0.10065111231687467, + "grad_norm": 7.8613972663879395, + "learning_rate": 9.086062765520077e-06, + "loss": 0.4878, + "step": 7420 + }, + { + "epoch": 0.10066467715680955, + "grad_norm": 8.64606761932373, + "learning_rate": 9.085925722899822e-06, + "loss": 0.4445, + "step": 7421 + }, + { + "epoch": 0.10067824199674444, + "grad_norm": 6.558692455291748, + "learning_rate": 9.085788680279567e-06, + "loss": 0.42, + "step": 7422 + }, + { + "epoch": 0.10069180683667933, + "grad_norm": 6.575648784637451, + "learning_rate": 9.085651637659313e-06, + "loss": 0.2997, + "step": 7423 + }, + { + "epoch": 0.10070537167661421, + "grad_norm": 8.514862060546875, + "learning_rate": 9.085514595039058e-06, + "loss": 0.4951, + "step": 7424 + }, + { + "epoch": 0.1007189365165491, + "grad_norm": 8.240334510803223, + "learning_rate": 9.085377552418803e-06, + "loss": 0.3708, + "step": 7425 + }, + { + "epoch": 0.100732501356484, + "grad_norm": 6.824660301208496, + "learning_rate": 9.085240509798548e-06, + "loss": 0.3471, + "step": 7426 + }, + { + "epoch": 0.10074606619641888, + "grad_norm": 6.1557841300964355, + "learning_rate": 9.085103467178293e-06, + "loss": 0.3489, + "step": 7427 + }, + { + "epoch": 0.10075963103635377, + "grad_norm": 7.019870281219482, + "learning_rate": 9.084966424558038e-06, + "loss": 0.4981, + "step": 7428 + }, + { + "epoch": 0.10077319587628866, + "grad_norm": 7.181682109832764, + "learning_rate": 9.084829381937784e-06, + "loss": 0.4419, + "step": 7429 + }, + { + "epoch": 0.10078676071622354, + "grad_norm": 7.64786958694458, + "learning_rate": 9.084692339317529e-06, + "loss": 0.4479, + "step": 7430 + }, + { + "epoch": 0.10080032555615843, + "grad_norm": 8.473851203918457, + "learning_rate": 9.084555296697274e-06, + "loss": 0.4887, + "step": 7431 + }, + { + "epoch": 0.10081389039609333, + "grad_norm": 7.824499607086182, + "learning_rate": 9.08441825407702e-06, + "loss": 0.3849, + "step": 7432 + }, + { + "epoch": 0.10082745523602822, + "grad_norm": 6.84727668762207, + "learning_rate": 9.084281211456764e-06, + "loss": 0.3483, + "step": 7433 + }, + { + "epoch": 0.1008410200759631, + "grad_norm": 6.267733573913574, + "learning_rate": 9.084144168836508e-06, + "loss": 0.3763, + "step": 7434 + }, + { + "epoch": 0.100854584915898, + "grad_norm": 6.7895283699035645, + "learning_rate": 9.084007126216255e-06, + "loss": 0.4468, + "step": 7435 + }, + { + "epoch": 0.10086814975583289, + "grad_norm": 7.091675758361816, + "learning_rate": 9.083870083596e-06, + "loss": 0.4402, + "step": 7436 + }, + { + "epoch": 0.10088171459576777, + "grad_norm": 6.415123462677002, + "learning_rate": 9.083733040975743e-06, + "loss": 0.2684, + "step": 7437 + }, + { + "epoch": 0.10089527943570266, + "grad_norm": 9.031288146972656, + "learning_rate": 9.083595998355489e-06, + "loss": 0.4844, + "step": 7438 + }, + { + "epoch": 0.10090884427563755, + "grad_norm": 6.327072620391846, + "learning_rate": 9.083458955735235e-06, + "loss": 0.5002, + "step": 7439 + }, + { + "epoch": 0.10092240911557243, + "grad_norm": 7.460447311401367, + "learning_rate": 9.083321913114979e-06, + "loss": 0.4787, + "step": 7440 + }, + { + "epoch": 0.10093597395550732, + "grad_norm": 8.749053001403809, + "learning_rate": 9.083184870494724e-06, + "loss": 0.4513, + "step": 7441 + }, + { + "epoch": 0.10094953879544222, + "grad_norm": 7.02896785736084, + "learning_rate": 9.08304782787447e-06, + "loss": 0.435, + "step": 7442 + }, + { + "epoch": 0.1009631036353771, + "grad_norm": 6.666215896606445, + "learning_rate": 9.082910785254216e-06, + "loss": 0.3513, + "step": 7443 + }, + { + "epoch": 0.10097666847531199, + "grad_norm": 7.783139228820801, + "learning_rate": 9.08277374263396e-06, + "loss": 0.4154, + "step": 7444 + }, + { + "epoch": 0.10099023331524688, + "grad_norm": 6.401657581329346, + "learning_rate": 9.082636700013705e-06, + "loss": 0.5883, + "step": 7445 + }, + { + "epoch": 0.10100379815518176, + "grad_norm": 6.940399169921875, + "learning_rate": 9.08249965739345e-06, + "loss": 0.5427, + "step": 7446 + }, + { + "epoch": 0.10101736299511666, + "grad_norm": 6.089081287384033, + "learning_rate": 9.082362614773195e-06, + "loss": 0.4268, + "step": 7447 + }, + { + "epoch": 0.10103092783505155, + "grad_norm": 6.070066928863525, + "learning_rate": 9.08222557215294e-06, + "loss": 0.41, + "step": 7448 + }, + { + "epoch": 0.10104449267498644, + "grad_norm": 7.00205659866333, + "learning_rate": 9.082088529532686e-06, + "loss": 0.4729, + "step": 7449 + }, + { + "epoch": 0.10105805751492132, + "grad_norm": 6.726981163024902, + "learning_rate": 9.08195148691243e-06, + "loss": 0.4559, + "step": 7450 + }, + { + "epoch": 0.10107162235485621, + "grad_norm": 6.511752605438232, + "learning_rate": 9.081814444292176e-06, + "loss": 0.4722, + "step": 7451 + }, + { + "epoch": 0.1010851871947911, + "grad_norm": 6.6025471687316895, + "learning_rate": 9.081677401671921e-06, + "loss": 0.3068, + "step": 7452 + }, + { + "epoch": 0.10109875203472599, + "grad_norm": 8.296150207519531, + "learning_rate": 9.081540359051666e-06, + "loss": 0.4257, + "step": 7453 + }, + { + "epoch": 0.10111231687466088, + "grad_norm": 5.896810054779053, + "learning_rate": 9.081403316431411e-06, + "loss": 0.4586, + "step": 7454 + }, + { + "epoch": 0.10112588171459577, + "grad_norm": 5.3415446281433105, + "learning_rate": 9.081266273811155e-06, + "loss": 0.2803, + "step": 7455 + }, + { + "epoch": 0.10113944655453065, + "grad_norm": 6.669988632202148, + "learning_rate": 9.081129231190902e-06, + "loss": 0.4133, + "step": 7456 + }, + { + "epoch": 0.10115301139446554, + "grad_norm": 6.040971755981445, + "learning_rate": 9.080992188570647e-06, + "loss": 0.346, + "step": 7457 + }, + { + "epoch": 0.10116657623440044, + "grad_norm": 5.147767066955566, + "learning_rate": 9.080855145950392e-06, + "loss": 0.3541, + "step": 7458 + }, + { + "epoch": 0.10118014107433532, + "grad_norm": 8.809163093566895, + "learning_rate": 9.080718103330136e-06, + "loss": 0.3874, + "step": 7459 + }, + { + "epoch": 0.10119370591427021, + "grad_norm": 7.490042209625244, + "learning_rate": 9.08058106070988e-06, + "loss": 0.4111, + "step": 7460 + }, + { + "epoch": 0.1012072707542051, + "grad_norm": 7.06512975692749, + "learning_rate": 9.080444018089628e-06, + "loss": 0.5666, + "step": 7461 + }, + { + "epoch": 0.10122083559413998, + "grad_norm": 6.0997138023376465, + "learning_rate": 9.080306975469371e-06, + "loss": 0.3504, + "step": 7462 + }, + { + "epoch": 0.10123440043407488, + "grad_norm": 7.22841739654541, + "learning_rate": 9.080169932849116e-06, + "loss": 0.4824, + "step": 7463 + }, + { + "epoch": 0.10124796527400977, + "grad_norm": 6.629993438720703, + "learning_rate": 9.080032890228862e-06, + "loss": 0.4299, + "step": 7464 + }, + { + "epoch": 0.10126153011394466, + "grad_norm": 7.890544414520264, + "learning_rate": 9.079895847608607e-06, + "loss": 0.3292, + "step": 7465 + }, + { + "epoch": 0.10127509495387954, + "grad_norm": 6.033910274505615, + "learning_rate": 9.079758804988352e-06, + "loss": 0.4504, + "step": 7466 + }, + { + "epoch": 0.10128865979381443, + "grad_norm": 6.782378196716309, + "learning_rate": 9.079621762368097e-06, + "loss": 0.3836, + "step": 7467 + }, + { + "epoch": 0.10130222463374933, + "grad_norm": 7.364162921905518, + "learning_rate": 9.079484719747842e-06, + "loss": 0.37, + "step": 7468 + }, + { + "epoch": 0.1013157894736842, + "grad_norm": 6.604950904846191, + "learning_rate": 9.079347677127587e-06, + "loss": 0.3131, + "step": 7469 + }, + { + "epoch": 0.1013293543136191, + "grad_norm": 6.456201553344727, + "learning_rate": 9.079210634507333e-06, + "loss": 0.4283, + "step": 7470 + }, + { + "epoch": 0.10134291915355399, + "grad_norm": 6.778445243835449, + "learning_rate": 9.079073591887078e-06, + "loss": 0.4354, + "step": 7471 + }, + { + "epoch": 0.10135648399348887, + "grad_norm": 7.6478590965271, + "learning_rate": 9.078936549266823e-06, + "loss": 0.359, + "step": 7472 + }, + { + "epoch": 0.10137004883342376, + "grad_norm": 7.1067891120910645, + "learning_rate": 9.078799506646568e-06, + "loss": 0.4484, + "step": 7473 + }, + { + "epoch": 0.10138361367335866, + "grad_norm": 7.187189102172852, + "learning_rate": 9.078662464026313e-06, + "loss": 0.5009, + "step": 7474 + }, + { + "epoch": 0.10139717851329354, + "grad_norm": 7.074535369873047, + "learning_rate": 9.078525421406058e-06, + "loss": 0.4242, + "step": 7475 + }, + { + "epoch": 0.10141074335322843, + "grad_norm": 4.902979373931885, + "learning_rate": 9.078388378785804e-06, + "loss": 0.3391, + "step": 7476 + }, + { + "epoch": 0.10142430819316332, + "grad_norm": 4.236255168914795, + "learning_rate": 9.078251336165547e-06, + "loss": 0.3525, + "step": 7477 + }, + { + "epoch": 0.1014378730330982, + "grad_norm": 6.1331377029418945, + "learning_rate": 9.078114293545294e-06, + "loss": 0.3778, + "step": 7478 + }, + { + "epoch": 0.1014514378730331, + "grad_norm": 5.554660320281982, + "learning_rate": 9.07797725092504e-06, + "loss": 0.3922, + "step": 7479 + }, + { + "epoch": 0.10146500271296799, + "grad_norm": 6.142160892486572, + "learning_rate": 9.077840208304783e-06, + "loss": 0.4035, + "step": 7480 + }, + { + "epoch": 0.10147856755290288, + "grad_norm": 7.984951019287109, + "learning_rate": 9.077703165684528e-06, + "loss": 0.4437, + "step": 7481 + }, + { + "epoch": 0.10149213239283776, + "grad_norm": 6.412516117095947, + "learning_rate": 9.077566123064275e-06, + "loss": 0.3514, + "step": 7482 + }, + { + "epoch": 0.10150569723277265, + "grad_norm": 5.0790205001831055, + "learning_rate": 9.077429080444018e-06, + "loss": 0.2906, + "step": 7483 + }, + { + "epoch": 0.10151926207270755, + "grad_norm": 5.052819728851318, + "learning_rate": 9.077292037823763e-06, + "loss": 0.334, + "step": 7484 + }, + { + "epoch": 0.10153282691264243, + "grad_norm": 5.799262046813965, + "learning_rate": 9.077154995203509e-06, + "loss": 0.3717, + "step": 7485 + }, + { + "epoch": 0.10154639175257732, + "grad_norm": 4.592785835266113, + "learning_rate": 9.077017952583255e-06, + "loss": 0.2758, + "step": 7486 + }, + { + "epoch": 0.10155995659251221, + "grad_norm": 5.1168928146362305, + "learning_rate": 9.076880909962999e-06, + "loss": 0.2698, + "step": 7487 + }, + { + "epoch": 0.10157352143244709, + "grad_norm": 6.366843223571777, + "learning_rate": 9.076743867342744e-06, + "loss": 0.3302, + "step": 7488 + }, + { + "epoch": 0.10158708627238199, + "grad_norm": 7.576469421386719, + "learning_rate": 9.07660682472249e-06, + "loss": 0.5423, + "step": 7489 + }, + { + "epoch": 0.10160065111231688, + "grad_norm": 5.702389240264893, + "learning_rate": 9.076469782102234e-06, + "loss": 0.3139, + "step": 7490 + }, + { + "epoch": 0.10161421595225176, + "grad_norm": 5.949655532836914, + "learning_rate": 9.07633273948198e-06, + "loss": 0.4385, + "step": 7491 + }, + { + "epoch": 0.10162778079218665, + "grad_norm": 6.4940996170043945, + "learning_rate": 9.076195696861725e-06, + "loss": 0.3879, + "step": 7492 + }, + { + "epoch": 0.10164134563212154, + "grad_norm": 6.424781322479248, + "learning_rate": 9.07605865424147e-06, + "loss": 0.4757, + "step": 7493 + }, + { + "epoch": 0.10165491047205642, + "grad_norm": 6.6584296226501465, + "learning_rate": 9.075921611621215e-06, + "loss": 0.3638, + "step": 7494 + }, + { + "epoch": 0.10166847531199132, + "grad_norm": 5.775683403015137, + "learning_rate": 9.07578456900096e-06, + "loss": 0.2799, + "step": 7495 + }, + { + "epoch": 0.10168204015192621, + "grad_norm": 8.302852630615234, + "learning_rate": 9.075647526380706e-06, + "loss": 0.6475, + "step": 7496 + }, + { + "epoch": 0.1016956049918611, + "grad_norm": 8.120766639709473, + "learning_rate": 9.07551048376045e-06, + "loss": 0.4988, + "step": 7497 + }, + { + "epoch": 0.10170916983179598, + "grad_norm": 5.85768461227417, + "learning_rate": 9.075373441140194e-06, + "loss": 0.3416, + "step": 7498 + }, + { + "epoch": 0.10172273467173087, + "grad_norm": 5.978977203369141, + "learning_rate": 9.075236398519941e-06, + "loss": 0.3175, + "step": 7499 + }, + { + "epoch": 0.10173629951166577, + "grad_norm": 6.410449028015137, + "learning_rate": 9.075099355899686e-06, + "loss": 0.3668, + "step": 7500 + }, + { + "epoch": 0.10174986435160065, + "grad_norm": 6.8049821853637695, + "learning_rate": 9.074962313279431e-06, + "loss": 0.2847, + "step": 7501 + }, + { + "epoch": 0.10176342919153554, + "grad_norm": 6.282680511474609, + "learning_rate": 9.074825270659175e-06, + "loss": 0.381, + "step": 7502 + }, + { + "epoch": 0.10177699403147043, + "grad_norm": 6.261760711669922, + "learning_rate": 9.07468822803892e-06, + "loss": 0.4279, + "step": 7503 + }, + { + "epoch": 0.10179055887140531, + "grad_norm": 6.22855806350708, + "learning_rate": 9.074551185418667e-06, + "loss": 0.4138, + "step": 7504 + }, + { + "epoch": 0.1018041237113402, + "grad_norm": 6.344828128814697, + "learning_rate": 9.07441414279841e-06, + "loss": 0.371, + "step": 7505 + }, + { + "epoch": 0.1018176885512751, + "grad_norm": 5.947012424468994, + "learning_rate": 9.074277100178156e-06, + "loss": 0.3358, + "step": 7506 + }, + { + "epoch": 0.10183125339120998, + "grad_norm": 6.401487350463867, + "learning_rate": 9.0741400575579e-06, + "loss": 0.405, + "step": 7507 + }, + { + "epoch": 0.10184481823114487, + "grad_norm": 5.00032901763916, + "learning_rate": 9.074003014937646e-06, + "loss": 0.3694, + "step": 7508 + }, + { + "epoch": 0.10185838307107976, + "grad_norm": 4.818246364593506, + "learning_rate": 9.073865972317391e-06, + "loss": 0.2698, + "step": 7509 + }, + { + "epoch": 0.10187194791101466, + "grad_norm": 7.910471439361572, + "learning_rate": 9.073728929697136e-06, + "loss": 0.486, + "step": 7510 + }, + { + "epoch": 0.10188551275094954, + "grad_norm": 6.008670330047607, + "learning_rate": 9.073591887076882e-06, + "loss": 0.3397, + "step": 7511 + }, + { + "epoch": 0.10189907759088443, + "grad_norm": 7.565354347229004, + "learning_rate": 9.073454844456627e-06, + "loss": 0.3477, + "step": 7512 + }, + { + "epoch": 0.10191264243081932, + "grad_norm": 8.420792579650879, + "learning_rate": 9.073317801836372e-06, + "loss": 0.3352, + "step": 7513 + }, + { + "epoch": 0.1019262072707542, + "grad_norm": 8.789387702941895, + "learning_rate": 9.073180759216117e-06, + "loss": 0.4817, + "step": 7514 + }, + { + "epoch": 0.1019397721106891, + "grad_norm": 8.043279647827148, + "learning_rate": 9.073043716595862e-06, + "loss": 0.3964, + "step": 7515 + }, + { + "epoch": 0.10195333695062399, + "grad_norm": 8.392952919006348, + "learning_rate": 9.072906673975607e-06, + "loss": 0.4948, + "step": 7516 + }, + { + "epoch": 0.10196690179055887, + "grad_norm": 5.2443461418151855, + "learning_rate": 9.072769631355353e-06, + "loss": 0.3307, + "step": 7517 + }, + { + "epoch": 0.10198046663049376, + "grad_norm": 8.217634201049805, + "learning_rate": 9.072632588735098e-06, + "loss": 0.4909, + "step": 7518 + }, + { + "epoch": 0.10199403147042865, + "grad_norm": 7.069033622741699, + "learning_rate": 9.072495546114843e-06, + "loss": 0.3185, + "step": 7519 + }, + { + "epoch": 0.10200759631036353, + "grad_norm": 7.23103666305542, + "learning_rate": 9.072358503494586e-06, + "loss": 0.3573, + "step": 7520 + }, + { + "epoch": 0.10202116115029843, + "grad_norm": 5.622198581695557, + "learning_rate": 9.072221460874333e-06, + "loss": 0.365, + "step": 7521 + }, + { + "epoch": 0.10203472599023332, + "grad_norm": 7.315235614776611, + "learning_rate": 9.072084418254078e-06, + "loss": 0.4812, + "step": 7522 + }, + { + "epoch": 0.1020482908301682, + "grad_norm": 5.597794532775879, + "learning_rate": 9.071947375633822e-06, + "loss": 0.3305, + "step": 7523 + }, + { + "epoch": 0.10206185567010309, + "grad_norm": 7.054171562194824, + "learning_rate": 9.071810333013567e-06, + "loss": 0.5118, + "step": 7524 + }, + { + "epoch": 0.10207542051003798, + "grad_norm": 6.768459320068359, + "learning_rate": 9.071673290393314e-06, + "loss": 0.4258, + "step": 7525 + }, + { + "epoch": 0.10208898534997288, + "grad_norm": 6.371766567230225, + "learning_rate": 9.07153624777306e-06, + "loss": 0.3995, + "step": 7526 + }, + { + "epoch": 0.10210255018990776, + "grad_norm": 8.660905838012695, + "learning_rate": 9.071399205152803e-06, + "loss": 0.4137, + "step": 7527 + }, + { + "epoch": 0.10211611502984265, + "grad_norm": 8.25964069366455, + "learning_rate": 9.071262162532548e-06, + "loss": 0.416, + "step": 7528 + }, + { + "epoch": 0.10212967986977754, + "grad_norm": 6.260815620422363, + "learning_rate": 9.071125119912293e-06, + "loss": 0.6207, + "step": 7529 + }, + { + "epoch": 0.10214324470971242, + "grad_norm": 7.146427154541016, + "learning_rate": 9.070988077292038e-06, + "loss": 0.462, + "step": 7530 + }, + { + "epoch": 0.10215680954964731, + "grad_norm": 7.178138256072998, + "learning_rate": 9.070851034671783e-06, + "loss": 0.3533, + "step": 7531 + }, + { + "epoch": 0.10217037438958221, + "grad_norm": 5.299271583557129, + "learning_rate": 9.070713992051529e-06, + "loss": 0.2712, + "step": 7532 + }, + { + "epoch": 0.10218393922951709, + "grad_norm": 5.644639015197754, + "learning_rate": 9.070576949431274e-06, + "loss": 0.3668, + "step": 7533 + }, + { + "epoch": 0.10219750406945198, + "grad_norm": 5.308669567108154, + "learning_rate": 9.070439906811019e-06, + "loss": 0.3449, + "step": 7534 + }, + { + "epoch": 0.10221106890938687, + "grad_norm": 6.1634368896484375, + "learning_rate": 9.070302864190764e-06, + "loss": 0.3255, + "step": 7535 + }, + { + "epoch": 0.10222463374932175, + "grad_norm": 6.5173115730285645, + "learning_rate": 9.07016582157051e-06, + "loss": 0.3878, + "step": 7536 + }, + { + "epoch": 0.10223819858925665, + "grad_norm": 5.29926061630249, + "learning_rate": 9.070028778950254e-06, + "loss": 0.4313, + "step": 7537 + }, + { + "epoch": 0.10225176342919154, + "grad_norm": 6.556780815124512, + "learning_rate": 9.06989173633e-06, + "loss": 0.3526, + "step": 7538 + }, + { + "epoch": 0.10226532826912642, + "grad_norm": 6.016886234283447, + "learning_rate": 9.069754693709745e-06, + "loss": 0.3085, + "step": 7539 + }, + { + "epoch": 0.10227889310906131, + "grad_norm": 7.689854621887207, + "learning_rate": 9.06961765108949e-06, + "loss": 0.5585, + "step": 7540 + }, + { + "epoch": 0.1022924579489962, + "grad_norm": 5.868163108825684, + "learning_rate": 9.069480608469235e-06, + "loss": 0.3449, + "step": 7541 + }, + { + "epoch": 0.1023060227889311, + "grad_norm": 6.6830596923828125, + "learning_rate": 9.069343565848979e-06, + "loss": 0.2852, + "step": 7542 + }, + { + "epoch": 0.10231958762886598, + "grad_norm": 4.990560054779053, + "learning_rate": 9.069206523228726e-06, + "loss": 0.4104, + "step": 7543 + }, + { + "epoch": 0.10233315246880087, + "grad_norm": 7.2338948249816895, + "learning_rate": 9.06906948060847e-06, + "loss": 0.5202, + "step": 7544 + }, + { + "epoch": 0.10234671730873576, + "grad_norm": 5.067284107208252, + "learning_rate": 9.068932437988214e-06, + "loss": 0.3128, + "step": 7545 + }, + { + "epoch": 0.10236028214867064, + "grad_norm": 6.326758861541748, + "learning_rate": 9.06879539536796e-06, + "loss": 0.3959, + "step": 7546 + }, + { + "epoch": 0.10237384698860554, + "grad_norm": 6.544321537017822, + "learning_rate": 9.068658352747706e-06, + "loss": 0.4257, + "step": 7547 + }, + { + "epoch": 0.10238741182854043, + "grad_norm": 6.375626087188721, + "learning_rate": 9.06852131012745e-06, + "loss": 0.3364, + "step": 7548 + }, + { + "epoch": 0.10240097666847531, + "grad_norm": 8.056066513061523, + "learning_rate": 9.068384267507195e-06, + "loss": 0.5919, + "step": 7549 + }, + { + "epoch": 0.1024145415084102, + "grad_norm": 7.180271148681641, + "learning_rate": 9.06824722488694e-06, + "loss": 0.3403, + "step": 7550 + }, + { + "epoch": 0.1024281063483451, + "grad_norm": 5.0490641593933105, + "learning_rate": 9.068110182266687e-06, + "loss": 0.2743, + "step": 7551 + }, + { + "epoch": 0.10244167118827997, + "grad_norm": 4.436944007873535, + "learning_rate": 9.06797313964643e-06, + "loss": 0.289, + "step": 7552 + }, + { + "epoch": 0.10245523602821487, + "grad_norm": 6.194231033325195, + "learning_rate": 9.067836097026176e-06, + "loss": 0.2942, + "step": 7553 + }, + { + "epoch": 0.10246880086814976, + "grad_norm": 10.341565132141113, + "learning_rate": 9.06769905440592e-06, + "loss": 0.3936, + "step": 7554 + }, + { + "epoch": 0.10248236570808464, + "grad_norm": 6.005364418029785, + "learning_rate": 9.067562011785666e-06, + "loss": 0.2923, + "step": 7555 + }, + { + "epoch": 0.10249593054801953, + "grad_norm": 7.04837703704834, + "learning_rate": 9.067424969165411e-06, + "loss": 0.3438, + "step": 7556 + }, + { + "epoch": 0.10250949538795442, + "grad_norm": 5.870441436767578, + "learning_rate": 9.067287926545156e-06, + "loss": 0.4401, + "step": 7557 + }, + { + "epoch": 0.10252306022788932, + "grad_norm": 8.867186546325684, + "learning_rate": 9.067150883924902e-06, + "loss": 0.5574, + "step": 7558 + }, + { + "epoch": 0.1025366250678242, + "grad_norm": 5.154067516326904, + "learning_rate": 9.067013841304647e-06, + "loss": 0.2851, + "step": 7559 + }, + { + "epoch": 0.10255018990775909, + "grad_norm": 8.348559379577637, + "learning_rate": 9.066876798684392e-06, + "loss": 0.4274, + "step": 7560 + }, + { + "epoch": 0.10256375474769398, + "grad_norm": 5.679464817047119, + "learning_rate": 9.066739756064137e-06, + "loss": 0.5126, + "step": 7561 + }, + { + "epoch": 0.10257731958762886, + "grad_norm": 7.854975700378418, + "learning_rate": 9.066602713443882e-06, + "loss": 0.6144, + "step": 7562 + }, + { + "epoch": 0.10259088442756376, + "grad_norm": 5.350686073303223, + "learning_rate": 9.066465670823626e-06, + "loss": 0.4051, + "step": 7563 + }, + { + "epoch": 0.10260444926749865, + "grad_norm": 7.841783046722412, + "learning_rate": 9.066328628203373e-06, + "loss": 0.5639, + "step": 7564 + }, + { + "epoch": 0.10261801410743353, + "grad_norm": 5.702239036560059, + "learning_rate": 9.066191585583118e-06, + "loss": 0.4122, + "step": 7565 + }, + { + "epoch": 0.10263157894736842, + "grad_norm": 6.680942535400391, + "learning_rate": 9.066054542962863e-06, + "loss": 0.4716, + "step": 7566 + }, + { + "epoch": 0.10264514378730331, + "grad_norm": 7.104112148284912, + "learning_rate": 9.065917500342606e-06, + "loss": 0.3788, + "step": 7567 + }, + { + "epoch": 0.1026587086272382, + "grad_norm": 4.818698883056641, + "learning_rate": 9.065780457722353e-06, + "loss": 0.4281, + "step": 7568 + }, + { + "epoch": 0.10267227346717309, + "grad_norm": 5.675079345703125, + "learning_rate": 9.065643415102099e-06, + "loss": 0.3648, + "step": 7569 + }, + { + "epoch": 0.10268583830710798, + "grad_norm": 6.761694431304932, + "learning_rate": 9.065506372481842e-06, + "loss": 0.363, + "step": 7570 + }, + { + "epoch": 0.10269940314704286, + "grad_norm": 3.90364408493042, + "learning_rate": 9.065369329861587e-06, + "loss": 0.2256, + "step": 7571 + }, + { + "epoch": 0.10271296798697775, + "grad_norm": 7.645195484161377, + "learning_rate": 9.065232287241332e-06, + "loss": 0.4049, + "step": 7572 + }, + { + "epoch": 0.10272653282691264, + "grad_norm": 8.146413803100586, + "learning_rate": 9.065095244621078e-06, + "loss": 0.5601, + "step": 7573 + }, + { + "epoch": 0.10274009766684754, + "grad_norm": 5.143378734588623, + "learning_rate": 9.064958202000823e-06, + "loss": 0.3389, + "step": 7574 + }, + { + "epoch": 0.10275366250678242, + "grad_norm": 5.46747350692749, + "learning_rate": 9.064821159380568e-06, + "loss": 0.2774, + "step": 7575 + }, + { + "epoch": 0.10276722734671731, + "grad_norm": 8.580754280090332, + "learning_rate": 9.064684116760313e-06, + "loss": 0.5529, + "step": 7576 + }, + { + "epoch": 0.1027807921866522, + "grad_norm": 6.984450817108154, + "learning_rate": 9.064547074140058e-06, + "loss": 0.4834, + "step": 7577 + }, + { + "epoch": 0.10279435702658708, + "grad_norm": 4.345678329467773, + "learning_rate": 9.064410031519803e-06, + "loss": 0.2756, + "step": 7578 + }, + { + "epoch": 0.10280792186652198, + "grad_norm": 6.558277130126953, + "learning_rate": 9.064272988899549e-06, + "loss": 0.3277, + "step": 7579 + }, + { + "epoch": 0.10282148670645687, + "grad_norm": 5.508846282958984, + "learning_rate": 9.064135946279294e-06, + "loss": 0.2874, + "step": 7580 + }, + { + "epoch": 0.10283505154639175, + "grad_norm": 6.829710006713867, + "learning_rate": 9.063998903659039e-06, + "loss": 0.5277, + "step": 7581 + }, + { + "epoch": 0.10284861638632664, + "grad_norm": 5.9345784187316895, + "learning_rate": 9.063861861038784e-06, + "loss": 0.3964, + "step": 7582 + }, + { + "epoch": 0.10286218122626153, + "grad_norm": 6.449326992034912, + "learning_rate": 9.06372481841853e-06, + "loss": 0.342, + "step": 7583 + }, + { + "epoch": 0.10287574606619641, + "grad_norm": 7.250338554382324, + "learning_rate": 9.063587775798274e-06, + "loss": 0.4045, + "step": 7584 + }, + { + "epoch": 0.1028893109061313, + "grad_norm": 5.375519275665283, + "learning_rate": 9.063450733178018e-06, + "loss": 0.3806, + "step": 7585 + }, + { + "epoch": 0.1029028757460662, + "grad_norm": 6.157287120819092, + "learning_rate": 9.063313690557765e-06, + "loss": 0.3616, + "step": 7586 + }, + { + "epoch": 0.10291644058600108, + "grad_norm": 7.398186683654785, + "learning_rate": 9.06317664793751e-06, + "loss": 0.6204, + "step": 7587 + }, + { + "epoch": 0.10293000542593597, + "grad_norm": 6.537342548370361, + "learning_rate": 9.063039605317254e-06, + "loss": 0.344, + "step": 7588 + }, + { + "epoch": 0.10294357026587087, + "grad_norm": 5.935485363006592, + "learning_rate": 9.062902562696999e-06, + "loss": 0.412, + "step": 7589 + }, + { + "epoch": 0.10295713510580576, + "grad_norm": 6.052064895629883, + "learning_rate": 9.062765520076746e-06, + "loss": 0.4301, + "step": 7590 + }, + { + "epoch": 0.10297069994574064, + "grad_norm": 6.611637592315674, + "learning_rate": 9.062628477456489e-06, + "loss": 0.4773, + "step": 7591 + }, + { + "epoch": 0.10298426478567553, + "grad_norm": 5.583574295043945, + "learning_rate": 9.062491434836234e-06, + "loss": 0.4517, + "step": 7592 + }, + { + "epoch": 0.10299782962561042, + "grad_norm": 7.296842575073242, + "learning_rate": 9.06235439221598e-06, + "loss": 0.3468, + "step": 7593 + }, + { + "epoch": 0.1030113944655453, + "grad_norm": 7.763660430908203, + "learning_rate": 9.062217349595726e-06, + "loss": 0.4453, + "step": 7594 + }, + { + "epoch": 0.1030249593054802, + "grad_norm": 6.6137375831604, + "learning_rate": 9.06208030697547e-06, + "loss": 0.4982, + "step": 7595 + }, + { + "epoch": 0.10303852414541509, + "grad_norm": 5.976877212524414, + "learning_rate": 9.061943264355215e-06, + "loss": 0.3241, + "step": 7596 + }, + { + "epoch": 0.10305208898534997, + "grad_norm": 6.5366902351379395, + "learning_rate": 9.06180622173496e-06, + "loss": 0.32, + "step": 7597 + }, + { + "epoch": 0.10306565382528486, + "grad_norm": 6.658583164215088, + "learning_rate": 9.061669179114705e-06, + "loss": 0.32, + "step": 7598 + }, + { + "epoch": 0.10307921866521975, + "grad_norm": 7.066991329193115, + "learning_rate": 9.06153213649445e-06, + "loss": 0.4551, + "step": 7599 + }, + { + "epoch": 0.10309278350515463, + "grad_norm": 6.009147644042969, + "learning_rate": 9.061395093874196e-06, + "loss": 0.239, + "step": 7600 + }, + { + "epoch": 0.10310634834508953, + "grad_norm": 7.120341777801514, + "learning_rate": 9.06125805125394e-06, + "loss": 0.3323, + "step": 7601 + }, + { + "epoch": 0.10311991318502442, + "grad_norm": 4.431980133056641, + "learning_rate": 9.061121008633686e-06, + "loss": 0.2647, + "step": 7602 + }, + { + "epoch": 0.1031334780249593, + "grad_norm": 4.943305015563965, + "learning_rate": 9.060983966013431e-06, + "loss": 0.2748, + "step": 7603 + }, + { + "epoch": 0.10314704286489419, + "grad_norm": 8.5430326461792, + "learning_rate": 9.060846923393176e-06, + "loss": 0.332, + "step": 7604 + }, + { + "epoch": 0.10316060770482909, + "grad_norm": 6.468421459197998, + "learning_rate": 9.060709880772922e-06, + "loss": 0.335, + "step": 7605 + }, + { + "epoch": 0.10317417254476398, + "grad_norm": 8.738129615783691, + "learning_rate": 9.060572838152665e-06, + "loss": 0.6343, + "step": 7606 + }, + { + "epoch": 0.10318773738469886, + "grad_norm": 6.822569847106934, + "learning_rate": 9.060435795532412e-06, + "loss": 0.3702, + "step": 7607 + }, + { + "epoch": 0.10320130222463375, + "grad_norm": 4.97053337097168, + "learning_rate": 9.060298752912157e-06, + "loss": 0.2011, + "step": 7608 + }, + { + "epoch": 0.10321486706456864, + "grad_norm": 6.353147029876709, + "learning_rate": 9.060161710291902e-06, + "loss": 0.3346, + "step": 7609 + }, + { + "epoch": 0.10322843190450352, + "grad_norm": 5.7039570808410645, + "learning_rate": 9.060024667671646e-06, + "loss": 0.3617, + "step": 7610 + }, + { + "epoch": 0.10324199674443842, + "grad_norm": 7.808954238891602, + "learning_rate": 9.059887625051391e-06, + "loss": 0.3327, + "step": 7611 + }, + { + "epoch": 0.10325556158437331, + "grad_norm": 8.466326713562012, + "learning_rate": 9.059750582431138e-06, + "loss": 0.6112, + "step": 7612 + }, + { + "epoch": 0.10326912642430819, + "grad_norm": 8.750862121582031, + "learning_rate": 9.059613539810881e-06, + "loss": 0.4486, + "step": 7613 + }, + { + "epoch": 0.10328269126424308, + "grad_norm": 6.696638584136963, + "learning_rate": 9.059476497190626e-06, + "loss": 0.3324, + "step": 7614 + }, + { + "epoch": 0.10329625610417797, + "grad_norm": 7.166508197784424, + "learning_rate": 9.059339454570372e-06, + "loss": 0.5116, + "step": 7615 + }, + { + "epoch": 0.10330982094411285, + "grad_norm": 6.538759708404541, + "learning_rate": 9.059202411950117e-06, + "loss": 0.2418, + "step": 7616 + }, + { + "epoch": 0.10332338578404775, + "grad_norm": 5.720024108886719, + "learning_rate": 9.059065369329862e-06, + "loss": 0.2844, + "step": 7617 + }, + { + "epoch": 0.10333695062398264, + "grad_norm": 7.694665431976318, + "learning_rate": 9.058928326709607e-06, + "loss": 0.4286, + "step": 7618 + }, + { + "epoch": 0.10335051546391752, + "grad_norm": 6.554808139801025, + "learning_rate": 9.058791284089352e-06, + "loss": 0.3963, + "step": 7619 + }, + { + "epoch": 0.10336408030385241, + "grad_norm": 5.333920955657959, + "learning_rate": 9.058654241469098e-06, + "loss": 0.2525, + "step": 7620 + }, + { + "epoch": 0.1033776451437873, + "grad_norm": 10.91849422454834, + "learning_rate": 9.058517198848843e-06, + "loss": 0.3025, + "step": 7621 + }, + { + "epoch": 0.1033912099837222, + "grad_norm": 5.048868656158447, + "learning_rate": 9.058380156228588e-06, + "loss": 0.3264, + "step": 7622 + }, + { + "epoch": 0.10340477482365708, + "grad_norm": 6.952068328857422, + "learning_rate": 9.058243113608333e-06, + "loss": 0.3501, + "step": 7623 + }, + { + "epoch": 0.10341833966359197, + "grad_norm": 7.309422969818115, + "learning_rate": 9.058106070988078e-06, + "loss": 0.5112, + "step": 7624 + }, + { + "epoch": 0.10343190450352686, + "grad_norm": 7.631326675415039, + "learning_rate": 9.057969028367823e-06, + "loss": 0.3806, + "step": 7625 + }, + { + "epoch": 0.10344546934346174, + "grad_norm": 4.360424518585205, + "learning_rate": 9.057831985747569e-06, + "loss": 0.2496, + "step": 7626 + }, + { + "epoch": 0.10345903418339664, + "grad_norm": 5.295195579528809, + "learning_rate": 9.057694943127314e-06, + "loss": 0.3325, + "step": 7627 + }, + { + "epoch": 0.10347259902333153, + "grad_norm": 5.96619987487793, + "learning_rate": 9.057557900507057e-06, + "loss": 0.3414, + "step": 7628 + }, + { + "epoch": 0.10348616386326641, + "grad_norm": 6.825657844543457, + "learning_rate": 9.057420857886804e-06, + "loss": 0.3795, + "step": 7629 + }, + { + "epoch": 0.1034997287032013, + "grad_norm": 8.411014556884766, + "learning_rate": 9.05728381526655e-06, + "loss": 0.3794, + "step": 7630 + }, + { + "epoch": 0.1035132935431362, + "grad_norm": 5.908494472503662, + "learning_rate": 9.057146772646293e-06, + "loss": 0.3492, + "step": 7631 + }, + { + "epoch": 0.10352685838307107, + "grad_norm": 7.131641864776611, + "learning_rate": 9.057009730026038e-06, + "loss": 0.3824, + "step": 7632 + }, + { + "epoch": 0.10354042322300597, + "grad_norm": 5.4454569816589355, + "learning_rate": 9.056872687405785e-06, + "loss": 0.3779, + "step": 7633 + }, + { + "epoch": 0.10355398806294086, + "grad_norm": 8.698836326599121, + "learning_rate": 9.05673564478553e-06, + "loss": 0.5796, + "step": 7634 + }, + { + "epoch": 0.10356755290287574, + "grad_norm": 6.61281156539917, + "learning_rate": 9.056598602165274e-06, + "loss": 0.4096, + "step": 7635 + }, + { + "epoch": 0.10358111774281063, + "grad_norm": 6.629016399383545, + "learning_rate": 9.056461559545019e-06, + "loss": 0.4605, + "step": 7636 + }, + { + "epoch": 0.10359468258274553, + "grad_norm": 7.329859733581543, + "learning_rate": 9.056324516924766e-06, + "loss": 0.4867, + "step": 7637 + }, + { + "epoch": 0.10360824742268042, + "grad_norm": 8.995292663574219, + "learning_rate": 9.056187474304509e-06, + "loss": 0.5501, + "step": 7638 + }, + { + "epoch": 0.1036218122626153, + "grad_norm": 8.081396102905273, + "learning_rate": 9.056050431684254e-06, + "loss": 0.5536, + "step": 7639 + }, + { + "epoch": 0.10363537710255019, + "grad_norm": 6.768117904663086, + "learning_rate": 9.055913389064e-06, + "loss": 0.3652, + "step": 7640 + }, + { + "epoch": 0.10364894194248508, + "grad_norm": 7.371965408325195, + "learning_rate": 9.055776346443745e-06, + "loss": 0.5295, + "step": 7641 + }, + { + "epoch": 0.10366250678241996, + "grad_norm": 6.341915130615234, + "learning_rate": 9.05563930382349e-06, + "loss": 0.4028, + "step": 7642 + }, + { + "epoch": 0.10367607162235486, + "grad_norm": 7.910122394561768, + "learning_rate": 9.055502261203235e-06, + "loss": 0.3858, + "step": 7643 + }, + { + "epoch": 0.10368963646228975, + "grad_norm": 6.287776470184326, + "learning_rate": 9.05536521858298e-06, + "loss": 0.4455, + "step": 7644 + }, + { + "epoch": 0.10370320130222463, + "grad_norm": 6.188860893249512, + "learning_rate": 9.055228175962725e-06, + "loss": 0.361, + "step": 7645 + }, + { + "epoch": 0.10371676614215952, + "grad_norm": 8.657563209533691, + "learning_rate": 9.05509113334247e-06, + "loss": 0.592, + "step": 7646 + }, + { + "epoch": 0.10373033098209442, + "grad_norm": 7.325645923614502, + "learning_rate": 9.054954090722216e-06, + "loss": 0.4473, + "step": 7647 + }, + { + "epoch": 0.1037438958220293, + "grad_norm": 5.263277053833008, + "learning_rate": 9.054817048101961e-06, + "loss": 0.2245, + "step": 7648 + }, + { + "epoch": 0.10375746066196419, + "grad_norm": 8.852340698242188, + "learning_rate": 9.054680005481706e-06, + "loss": 0.6355, + "step": 7649 + }, + { + "epoch": 0.10377102550189908, + "grad_norm": 8.401407241821289, + "learning_rate": 9.054542962861451e-06, + "loss": 0.5284, + "step": 7650 + }, + { + "epoch": 0.10378459034183396, + "grad_norm": 7.178257465362549, + "learning_rate": 9.054405920241196e-06, + "loss": 0.4167, + "step": 7651 + }, + { + "epoch": 0.10379815518176885, + "grad_norm": 8.382111549377441, + "learning_rate": 9.054268877620942e-06, + "loss": 0.4454, + "step": 7652 + }, + { + "epoch": 0.10381172002170375, + "grad_norm": 5.9562602043151855, + "learning_rate": 9.054131835000685e-06, + "loss": 0.4169, + "step": 7653 + }, + { + "epoch": 0.10382528486163864, + "grad_norm": 6.490503311157227, + "learning_rate": 9.05399479238043e-06, + "loss": 0.3807, + "step": 7654 + }, + { + "epoch": 0.10383884970157352, + "grad_norm": 8.71948528289795, + "learning_rate": 9.053857749760177e-06, + "loss": 0.3926, + "step": 7655 + }, + { + "epoch": 0.10385241454150841, + "grad_norm": 6.596456527709961, + "learning_rate": 9.05372070713992e-06, + "loss": 0.3481, + "step": 7656 + }, + { + "epoch": 0.1038659793814433, + "grad_norm": 6.401381492614746, + "learning_rate": 9.053583664519666e-06, + "loss": 0.47, + "step": 7657 + }, + { + "epoch": 0.10387954422137818, + "grad_norm": 6.123128890991211, + "learning_rate": 9.053446621899411e-06, + "loss": 0.335, + "step": 7658 + }, + { + "epoch": 0.10389310906131308, + "grad_norm": 8.19356918334961, + "learning_rate": 9.053309579279158e-06, + "loss": 0.4369, + "step": 7659 + }, + { + "epoch": 0.10390667390124797, + "grad_norm": 5.016690254211426, + "learning_rate": 9.053172536658901e-06, + "loss": 0.2751, + "step": 7660 + }, + { + "epoch": 0.10392023874118285, + "grad_norm": 7.613667011260986, + "learning_rate": 9.053035494038646e-06, + "loss": 0.4002, + "step": 7661 + }, + { + "epoch": 0.10393380358111774, + "grad_norm": 7.103896617889404, + "learning_rate": 9.052898451418392e-06, + "loss": 0.5427, + "step": 7662 + }, + { + "epoch": 0.10394736842105264, + "grad_norm": 5.6750102043151855, + "learning_rate": 9.052761408798137e-06, + "loss": 0.3372, + "step": 7663 + }, + { + "epoch": 0.10396093326098751, + "grad_norm": 8.308810234069824, + "learning_rate": 9.052624366177882e-06, + "loss": 0.5423, + "step": 7664 + }, + { + "epoch": 0.10397449810092241, + "grad_norm": 8.888754844665527, + "learning_rate": 9.052487323557627e-06, + "loss": 0.44, + "step": 7665 + }, + { + "epoch": 0.1039880629408573, + "grad_norm": 8.487558364868164, + "learning_rate": 9.052350280937372e-06, + "loss": 0.5737, + "step": 7666 + }, + { + "epoch": 0.10400162778079218, + "grad_norm": 7.112550258636475, + "learning_rate": 9.052213238317118e-06, + "loss": 0.3866, + "step": 7667 + }, + { + "epoch": 0.10401519262072707, + "grad_norm": 7.264403343200684, + "learning_rate": 9.052076195696863e-06, + "loss": 0.4178, + "step": 7668 + }, + { + "epoch": 0.10402875746066197, + "grad_norm": 6.8076066970825195, + "learning_rate": 9.051939153076608e-06, + "loss": 0.4306, + "step": 7669 + }, + { + "epoch": 0.10404232230059686, + "grad_norm": 8.324333190917969, + "learning_rate": 9.051802110456353e-06, + "loss": 0.5273, + "step": 7670 + }, + { + "epoch": 0.10405588714053174, + "grad_norm": 6.3189568519592285, + "learning_rate": 9.051665067836097e-06, + "loss": 0.5972, + "step": 7671 + }, + { + "epoch": 0.10406945198046663, + "grad_norm": 4.976541996002197, + "learning_rate": 9.051528025215843e-06, + "loss": 0.3731, + "step": 7672 + }, + { + "epoch": 0.10408301682040152, + "grad_norm": 5.368786334991455, + "learning_rate": 9.051390982595589e-06, + "loss": 0.3068, + "step": 7673 + }, + { + "epoch": 0.1040965816603364, + "grad_norm": 6.0019850730896, + "learning_rate": 9.051253939975332e-06, + "loss": 0.2713, + "step": 7674 + }, + { + "epoch": 0.1041101465002713, + "grad_norm": 8.812705039978027, + "learning_rate": 9.051116897355077e-06, + "loss": 0.5425, + "step": 7675 + }, + { + "epoch": 0.10412371134020619, + "grad_norm": 6.608579158782959, + "learning_rate": 9.050979854734824e-06, + "loss": 0.3979, + "step": 7676 + }, + { + "epoch": 0.10413727618014107, + "grad_norm": 6.451629161834717, + "learning_rate": 9.05084281211457e-06, + "loss": 0.3395, + "step": 7677 + }, + { + "epoch": 0.10415084102007596, + "grad_norm": 7.787583351135254, + "learning_rate": 9.050705769494313e-06, + "loss": 0.4003, + "step": 7678 + }, + { + "epoch": 0.10416440586001086, + "grad_norm": 8.09449577331543, + "learning_rate": 9.050568726874058e-06, + "loss": 0.5573, + "step": 7679 + }, + { + "epoch": 0.10417797069994574, + "grad_norm": 5.930427074432373, + "learning_rate": 9.050431684253803e-06, + "loss": 0.3676, + "step": 7680 + }, + { + "epoch": 0.10419153553988063, + "grad_norm": 10.65787410736084, + "learning_rate": 9.050294641633548e-06, + "loss": 0.6163, + "step": 7681 + }, + { + "epoch": 0.10420510037981552, + "grad_norm": 8.752617835998535, + "learning_rate": 9.050157599013294e-06, + "loss": 0.4294, + "step": 7682 + }, + { + "epoch": 0.1042186652197504, + "grad_norm": 5.893950939178467, + "learning_rate": 9.050020556393039e-06, + "loss": 0.2878, + "step": 7683 + }, + { + "epoch": 0.1042322300596853, + "grad_norm": 8.79547119140625, + "learning_rate": 9.049883513772784e-06, + "loss": 0.4097, + "step": 7684 + }, + { + "epoch": 0.10424579489962019, + "grad_norm": 6.729366779327393, + "learning_rate": 9.049746471152529e-06, + "loss": 0.4113, + "step": 7685 + }, + { + "epoch": 0.10425935973955508, + "grad_norm": 7.2673821449279785, + "learning_rate": 9.049609428532274e-06, + "loss": 0.5792, + "step": 7686 + }, + { + "epoch": 0.10427292457948996, + "grad_norm": 8.193077087402344, + "learning_rate": 9.04947238591202e-06, + "loss": 0.5582, + "step": 7687 + }, + { + "epoch": 0.10428648941942485, + "grad_norm": 6.009085655212402, + "learning_rate": 9.049335343291765e-06, + "loss": 0.3136, + "step": 7688 + }, + { + "epoch": 0.10430005425935975, + "grad_norm": 6.634942531585693, + "learning_rate": 9.04919830067151e-06, + "loss": 0.3748, + "step": 7689 + }, + { + "epoch": 0.10431361909929462, + "grad_norm": 6.526018142700195, + "learning_rate": 9.049061258051255e-06, + "loss": 0.5018, + "step": 7690 + }, + { + "epoch": 0.10432718393922952, + "grad_norm": 7.521857738494873, + "learning_rate": 9.048924215431e-06, + "loss": 0.5902, + "step": 7691 + }, + { + "epoch": 0.10434074877916441, + "grad_norm": 7.218600273132324, + "learning_rate": 9.048787172810745e-06, + "loss": 0.533, + "step": 7692 + }, + { + "epoch": 0.10435431361909929, + "grad_norm": 6.537035942077637, + "learning_rate": 9.048650130190489e-06, + "loss": 0.3138, + "step": 7693 + }, + { + "epoch": 0.10436787845903418, + "grad_norm": 7.7859063148498535, + "learning_rate": 9.048513087570236e-06, + "loss": 0.3141, + "step": 7694 + }, + { + "epoch": 0.10438144329896908, + "grad_norm": 5.909757137298584, + "learning_rate": 9.048376044949981e-06, + "loss": 0.3243, + "step": 7695 + }, + { + "epoch": 0.10439500813890396, + "grad_norm": 8.516439437866211, + "learning_rate": 9.048239002329724e-06, + "loss": 0.5211, + "step": 7696 + }, + { + "epoch": 0.10440857297883885, + "grad_norm": 6.303534984588623, + "learning_rate": 9.04810195970947e-06, + "loss": 0.4008, + "step": 7697 + }, + { + "epoch": 0.10442213781877374, + "grad_norm": 5.944772243499756, + "learning_rate": 9.047964917089216e-06, + "loss": 0.4639, + "step": 7698 + }, + { + "epoch": 0.10443570265870862, + "grad_norm": 6.100111961364746, + "learning_rate": 9.04782787446896e-06, + "loss": 0.4217, + "step": 7699 + }, + { + "epoch": 0.10444926749864351, + "grad_norm": 5.208960056304932, + "learning_rate": 9.047690831848705e-06, + "loss": 0.2786, + "step": 7700 + }, + { + "epoch": 0.1044628323385784, + "grad_norm": 4.745123863220215, + "learning_rate": 9.04755378922845e-06, + "loss": 0.3147, + "step": 7701 + }, + { + "epoch": 0.1044763971785133, + "grad_norm": 6.225934982299805, + "learning_rate": 9.047416746608197e-06, + "loss": 0.5697, + "step": 7702 + }, + { + "epoch": 0.10448996201844818, + "grad_norm": 5.887088298797607, + "learning_rate": 9.04727970398794e-06, + "loss": 0.4126, + "step": 7703 + }, + { + "epoch": 0.10450352685838307, + "grad_norm": 6.801510810852051, + "learning_rate": 9.047142661367686e-06, + "loss": 0.4432, + "step": 7704 + }, + { + "epoch": 0.10451709169831797, + "grad_norm": 6.1935296058654785, + "learning_rate": 9.047005618747431e-06, + "loss": 0.3607, + "step": 7705 + }, + { + "epoch": 0.10453065653825284, + "grad_norm": 5.509835720062256, + "learning_rate": 9.046868576127176e-06, + "loss": 0.3956, + "step": 7706 + }, + { + "epoch": 0.10454422137818774, + "grad_norm": 6.765081882476807, + "learning_rate": 9.046731533506921e-06, + "loss": 0.3743, + "step": 7707 + }, + { + "epoch": 0.10455778621812263, + "grad_norm": 5.617398262023926, + "learning_rate": 9.046594490886667e-06, + "loss": 0.3836, + "step": 7708 + }, + { + "epoch": 0.10457135105805751, + "grad_norm": 7.277531623840332, + "learning_rate": 9.046457448266412e-06, + "loss": 0.3759, + "step": 7709 + }, + { + "epoch": 0.1045849158979924, + "grad_norm": 9.614521980285645, + "learning_rate": 9.046320405646157e-06, + "loss": 0.621, + "step": 7710 + }, + { + "epoch": 0.1045984807379273, + "grad_norm": 3.8015425205230713, + "learning_rate": 9.046183363025902e-06, + "loss": 0.2282, + "step": 7711 + }, + { + "epoch": 0.10461204557786218, + "grad_norm": 5.435414791107178, + "learning_rate": 9.046046320405647e-06, + "loss": 0.3101, + "step": 7712 + }, + { + "epoch": 0.10462561041779707, + "grad_norm": 4.895857334136963, + "learning_rate": 9.045909277785392e-06, + "loss": 0.4026, + "step": 7713 + }, + { + "epoch": 0.10463917525773196, + "grad_norm": 5.516895771026611, + "learning_rate": 9.045772235165136e-06, + "loss": 0.3717, + "step": 7714 + }, + { + "epoch": 0.10465274009766684, + "grad_norm": 5.07800817489624, + "learning_rate": 9.045635192544883e-06, + "loss": 0.3757, + "step": 7715 + }, + { + "epoch": 0.10466630493760173, + "grad_norm": 6.86346960067749, + "learning_rate": 9.045498149924628e-06, + "loss": 0.423, + "step": 7716 + }, + { + "epoch": 0.10467986977753663, + "grad_norm": 5.127167224884033, + "learning_rate": 9.045361107304373e-06, + "loss": 0.277, + "step": 7717 + }, + { + "epoch": 0.10469343461747152, + "grad_norm": 5.835663795471191, + "learning_rate": 9.045224064684117e-06, + "loss": 0.4046, + "step": 7718 + }, + { + "epoch": 0.1047069994574064, + "grad_norm": 5.137684345245361, + "learning_rate": 9.045087022063863e-06, + "loss": 0.3185, + "step": 7719 + }, + { + "epoch": 0.10472056429734129, + "grad_norm": 8.725249290466309, + "learning_rate": 9.044949979443609e-06, + "loss": 0.4613, + "step": 7720 + }, + { + "epoch": 0.10473412913727619, + "grad_norm": 5.248940944671631, + "learning_rate": 9.044812936823352e-06, + "loss": 0.2176, + "step": 7721 + }, + { + "epoch": 0.10474769397721106, + "grad_norm": 5.124264240264893, + "learning_rate": 9.044675894203097e-06, + "loss": 0.3233, + "step": 7722 + }, + { + "epoch": 0.10476125881714596, + "grad_norm": 6.276740074157715, + "learning_rate": 9.044538851582843e-06, + "loss": 0.3773, + "step": 7723 + }, + { + "epoch": 0.10477482365708085, + "grad_norm": 4.381329536437988, + "learning_rate": 9.044401808962588e-06, + "loss": 0.1987, + "step": 7724 + }, + { + "epoch": 0.10478838849701573, + "grad_norm": 5.319451332092285, + "learning_rate": 9.044264766342333e-06, + "loss": 0.3297, + "step": 7725 + }, + { + "epoch": 0.10480195333695062, + "grad_norm": 4.579176902770996, + "learning_rate": 9.044127723722078e-06, + "loss": 0.2657, + "step": 7726 + }, + { + "epoch": 0.10481551817688552, + "grad_norm": 4.622448444366455, + "learning_rate": 9.043990681101823e-06, + "loss": 0.2468, + "step": 7727 + }, + { + "epoch": 0.1048290830168204, + "grad_norm": 5.193904876708984, + "learning_rate": 9.043853638481568e-06, + "loss": 0.3484, + "step": 7728 + }, + { + "epoch": 0.10484264785675529, + "grad_norm": 3.5402605533599854, + "learning_rate": 9.043716595861314e-06, + "loss": 0.237, + "step": 7729 + }, + { + "epoch": 0.10485621269669018, + "grad_norm": 4.029506206512451, + "learning_rate": 9.043579553241059e-06, + "loss": 0.3065, + "step": 7730 + }, + { + "epoch": 0.10486977753662506, + "grad_norm": 3.999756097793579, + "learning_rate": 9.043442510620804e-06, + "loss": 0.2237, + "step": 7731 + }, + { + "epoch": 0.10488334237655995, + "grad_norm": 5.484902858734131, + "learning_rate": 9.043305468000549e-06, + "loss": 0.2827, + "step": 7732 + }, + { + "epoch": 0.10489690721649485, + "grad_norm": 5.483381271362305, + "learning_rate": 9.043168425380294e-06, + "loss": 0.3273, + "step": 7733 + }, + { + "epoch": 0.10491047205642974, + "grad_norm": 5.028644561767578, + "learning_rate": 9.04303138276004e-06, + "loss": 0.3247, + "step": 7734 + }, + { + "epoch": 0.10492403689636462, + "grad_norm": 4.479592323303223, + "learning_rate": 9.042894340139785e-06, + "loss": 0.2242, + "step": 7735 + }, + { + "epoch": 0.10493760173629951, + "grad_norm": 4.496436595916748, + "learning_rate": 9.042757297519528e-06, + "loss": 0.2906, + "step": 7736 + }, + { + "epoch": 0.1049511665762344, + "grad_norm": 4.715846061706543, + "learning_rate": 9.042620254899275e-06, + "loss": 0.3378, + "step": 7737 + }, + { + "epoch": 0.10496473141616929, + "grad_norm": 5.131274700164795, + "learning_rate": 9.04248321227902e-06, + "loss": 0.285, + "step": 7738 + }, + { + "epoch": 0.10497829625610418, + "grad_norm": 5.004446029663086, + "learning_rate": 9.042346169658764e-06, + "loss": 0.2417, + "step": 7739 + }, + { + "epoch": 0.10499186109603907, + "grad_norm": 4.499562740325928, + "learning_rate": 9.042209127038509e-06, + "loss": 0.2534, + "step": 7740 + }, + { + "epoch": 0.10500542593597395, + "grad_norm": 5.0885186195373535, + "learning_rate": 9.042072084418256e-06, + "loss": 0.2836, + "step": 7741 + }, + { + "epoch": 0.10501899077590884, + "grad_norm": 5.015206813812256, + "learning_rate": 9.041935041798001e-06, + "loss": 0.3386, + "step": 7742 + }, + { + "epoch": 0.10503255561584374, + "grad_norm": 4.584141254425049, + "learning_rate": 9.041797999177744e-06, + "loss": 0.2396, + "step": 7743 + }, + { + "epoch": 0.10504612045577862, + "grad_norm": 7.612951755523682, + "learning_rate": 9.04166095655749e-06, + "loss": 0.3386, + "step": 7744 + }, + { + "epoch": 0.10505968529571351, + "grad_norm": 5.743917942047119, + "learning_rate": 9.041523913937236e-06, + "loss": 0.2594, + "step": 7745 + }, + { + "epoch": 0.1050732501356484, + "grad_norm": 4.590353012084961, + "learning_rate": 9.04138687131698e-06, + "loss": 0.1969, + "step": 7746 + }, + { + "epoch": 0.10508681497558328, + "grad_norm": 3.4997076988220215, + "learning_rate": 9.041249828696725e-06, + "loss": 0.2088, + "step": 7747 + }, + { + "epoch": 0.10510037981551817, + "grad_norm": 4.085315704345703, + "learning_rate": 9.04111278607647e-06, + "loss": 0.3315, + "step": 7748 + }, + { + "epoch": 0.10511394465545307, + "grad_norm": 5.727183818817139, + "learning_rate": 9.040975743456215e-06, + "loss": 0.2492, + "step": 7749 + }, + { + "epoch": 0.10512750949538796, + "grad_norm": 4.207727432250977, + "learning_rate": 9.04083870083596e-06, + "loss": 0.2776, + "step": 7750 + }, + { + "epoch": 0.10514107433532284, + "grad_norm": 4.011544704437256, + "learning_rate": 9.040701658215706e-06, + "loss": 0.2525, + "step": 7751 + }, + { + "epoch": 0.10515463917525773, + "grad_norm": 4.217907905578613, + "learning_rate": 9.040564615595451e-06, + "loss": 0.2159, + "step": 7752 + }, + { + "epoch": 0.10516820401519263, + "grad_norm": 5.214946746826172, + "learning_rate": 9.040427572975196e-06, + "loss": 0.3362, + "step": 7753 + }, + { + "epoch": 0.1051817688551275, + "grad_norm": 4.34122896194458, + "learning_rate": 9.040290530354941e-06, + "loss": 0.2004, + "step": 7754 + }, + { + "epoch": 0.1051953336950624, + "grad_norm": 4.078724384307861, + "learning_rate": 9.040153487734687e-06, + "loss": 0.3229, + "step": 7755 + }, + { + "epoch": 0.10520889853499729, + "grad_norm": 4.102790832519531, + "learning_rate": 9.040016445114432e-06, + "loss": 0.2667, + "step": 7756 + }, + { + "epoch": 0.10522246337493217, + "grad_norm": 6.194908618927002, + "learning_rate": 9.039879402494177e-06, + "loss": 0.3501, + "step": 7757 + }, + { + "epoch": 0.10523602821486706, + "grad_norm": 4.615872383117676, + "learning_rate": 9.039742359873922e-06, + "loss": 0.1786, + "step": 7758 + }, + { + "epoch": 0.10524959305480196, + "grad_norm": 5.454499244689941, + "learning_rate": 9.039605317253667e-06, + "loss": 0.2936, + "step": 7759 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 4.294172763824463, + "learning_rate": 9.039468274633412e-06, + "loss": 0.2592, + "step": 7760 + }, + { + "epoch": 0.10527672273467173, + "grad_norm": 6.7349534034729, + "learning_rate": 9.039331232013156e-06, + "loss": 0.4178, + "step": 7761 + }, + { + "epoch": 0.10529028757460662, + "grad_norm": 3.4837865829467773, + "learning_rate": 9.039194189392901e-06, + "loss": 0.1591, + "step": 7762 + }, + { + "epoch": 0.1053038524145415, + "grad_norm": 5.425116062164307, + "learning_rate": 9.039057146772648e-06, + "loss": 0.2188, + "step": 7763 + }, + { + "epoch": 0.1053174172544764, + "grad_norm": 4.677393913269043, + "learning_rate": 9.038920104152391e-06, + "loss": 0.3113, + "step": 7764 + }, + { + "epoch": 0.10533098209441129, + "grad_norm": 6.330855369567871, + "learning_rate": 9.038783061532137e-06, + "loss": 0.418, + "step": 7765 + }, + { + "epoch": 0.10534454693434618, + "grad_norm": 5.176547050476074, + "learning_rate": 9.038646018911882e-06, + "loss": 0.2976, + "step": 7766 + }, + { + "epoch": 0.10535811177428106, + "grad_norm": 10.163989067077637, + "learning_rate": 9.038508976291627e-06, + "loss": 0.5595, + "step": 7767 + }, + { + "epoch": 0.10537167661421595, + "grad_norm": 6.411031723022461, + "learning_rate": 9.038371933671372e-06, + "loss": 0.2858, + "step": 7768 + }, + { + "epoch": 0.10538524145415085, + "grad_norm": 4.185857772827148, + "learning_rate": 9.038234891051117e-06, + "loss": 0.184, + "step": 7769 + }, + { + "epoch": 0.10539880629408573, + "grad_norm": 5.364264965057373, + "learning_rate": 9.038097848430863e-06, + "loss": 0.3124, + "step": 7770 + }, + { + "epoch": 0.10541237113402062, + "grad_norm": 5.96777868270874, + "learning_rate": 9.037960805810608e-06, + "loss": 0.2419, + "step": 7771 + }, + { + "epoch": 0.10542593597395551, + "grad_norm": 4.642699241638184, + "learning_rate": 9.037823763190353e-06, + "loss": 0.3169, + "step": 7772 + }, + { + "epoch": 0.10543950081389039, + "grad_norm": 4.479135513305664, + "learning_rate": 9.037686720570098e-06, + "loss": 0.2805, + "step": 7773 + }, + { + "epoch": 0.10545306565382528, + "grad_norm": 5.4274797439575195, + "learning_rate": 9.037549677949843e-06, + "loss": 0.3222, + "step": 7774 + }, + { + "epoch": 0.10546663049376018, + "grad_norm": 7.211741924285889, + "learning_rate": 9.037412635329588e-06, + "loss": 0.42, + "step": 7775 + }, + { + "epoch": 0.10548019533369506, + "grad_norm": 8.449470520019531, + "learning_rate": 9.037275592709334e-06, + "loss": 0.6064, + "step": 7776 + }, + { + "epoch": 0.10549376017362995, + "grad_norm": 6.336565971374512, + "learning_rate": 9.037138550089079e-06, + "loss": 0.3977, + "step": 7777 + }, + { + "epoch": 0.10550732501356484, + "grad_norm": 7.496357440948486, + "learning_rate": 9.037001507468824e-06, + "loss": 0.5426, + "step": 7778 + }, + { + "epoch": 0.10552088985349972, + "grad_norm": 4.601032733917236, + "learning_rate": 9.036864464848567e-06, + "loss": 0.2797, + "step": 7779 + }, + { + "epoch": 0.10553445469343462, + "grad_norm": 5.7666192054748535, + "learning_rate": 9.036727422228314e-06, + "loss": 0.3793, + "step": 7780 + }, + { + "epoch": 0.10554801953336951, + "grad_norm": 5.608741760253906, + "learning_rate": 9.03659037960806e-06, + "loss": 0.324, + "step": 7781 + }, + { + "epoch": 0.1055615843733044, + "grad_norm": 8.527259826660156, + "learning_rate": 9.036453336987803e-06, + "loss": 0.6432, + "step": 7782 + }, + { + "epoch": 0.10557514921323928, + "grad_norm": 7.267186641693115, + "learning_rate": 9.036316294367548e-06, + "loss": 0.6264, + "step": 7783 + }, + { + "epoch": 0.10558871405317417, + "grad_norm": 7.522224426269531, + "learning_rate": 9.036179251747295e-06, + "loss": 0.6605, + "step": 7784 + }, + { + "epoch": 0.10560227889310907, + "grad_norm": 7.065597057342529, + "learning_rate": 9.03604220912704e-06, + "loss": 0.4369, + "step": 7785 + }, + { + "epoch": 0.10561584373304395, + "grad_norm": 5.436134338378906, + "learning_rate": 9.035905166506784e-06, + "loss": 0.425, + "step": 7786 + }, + { + "epoch": 0.10562940857297884, + "grad_norm": 4.210676670074463, + "learning_rate": 9.035768123886529e-06, + "loss": 0.2724, + "step": 7787 + }, + { + "epoch": 0.10564297341291373, + "grad_norm": 5.643392562866211, + "learning_rate": 9.035631081266276e-06, + "loss": 0.3676, + "step": 7788 + }, + { + "epoch": 0.10565653825284861, + "grad_norm": 7.724427223205566, + "learning_rate": 9.03549403864602e-06, + "loss": 0.6004, + "step": 7789 + }, + { + "epoch": 0.1056701030927835, + "grad_norm": 7.485254287719727, + "learning_rate": 9.035356996025764e-06, + "loss": 0.5138, + "step": 7790 + }, + { + "epoch": 0.1056836679327184, + "grad_norm": 6.088940143585205, + "learning_rate": 9.03521995340551e-06, + "loss": 0.3956, + "step": 7791 + }, + { + "epoch": 0.10569723277265328, + "grad_norm": 5.035068511962891, + "learning_rate": 9.035082910785255e-06, + "loss": 0.3601, + "step": 7792 + }, + { + "epoch": 0.10571079761258817, + "grad_norm": 4.646206855773926, + "learning_rate": 9.034945868165e-06, + "loss": 0.3374, + "step": 7793 + }, + { + "epoch": 0.10572436245252306, + "grad_norm": 7.079455852508545, + "learning_rate": 9.034808825544745e-06, + "loss": 0.4465, + "step": 7794 + }, + { + "epoch": 0.10573792729245794, + "grad_norm": 6.685770511627197, + "learning_rate": 9.03467178292449e-06, + "loss": 0.3748, + "step": 7795 + }, + { + "epoch": 0.10575149213239284, + "grad_norm": 6.807404041290283, + "learning_rate": 9.034534740304235e-06, + "loss": 0.479, + "step": 7796 + }, + { + "epoch": 0.10576505697232773, + "grad_norm": 5.163528919219971, + "learning_rate": 9.03439769768398e-06, + "loss": 0.3166, + "step": 7797 + }, + { + "epoch": 0.10577862181226262, + "grad_norm": 4.741008281707764, + "learning_rate": 9.034260655063726e-06, + "loss": 0.3344, + "step": 7798 + }, + { + "epoch": 0.1057921866521975, + "grad_norm": 5.442455768585205, + "learning_rate": 9.034123612443471e-06, + "loss": 0.5008, + "step": 7799 + }, + { + "epoch": 0.1058057514921324, + "grad_norm": 5.555477619171143, + "learning_rate": 9.033986569823216e-06, + "loss": 0.4383, + "step": 7800 + }, + { + "epoch": 0.10581931633206729, + "grad_norm": 4.724064826965332, + "learning_rate": 9.033849527202961e-06, + "loss": 0.2442, + "step": 7801 + }, + { + "epoch": 0.10583288117200217, + "grad_norm": 6.570292949676514, + "learning_rate": 9.033712484582707e-06, + "loss": 0.2917, + "step": 7802 + }, + { + "epoch": 0.10584644601193706, + "grad_norm": 5.492405414581299, + "learning_rate": 9.033575441962452e-06, + "loss": 0.3128, + "step": 7803 + }, + { + "epoch": 0.10586001085187195, + "grad_norm": 8.05829906463623, + "learning_rate": 9.033438399342195e-06, + "loss": 0.4565, + "step": 7804 + }, + { + "epoch": 0.10587357569180683, + "grad_norm": 5.33694314956665, + "learning_rate": 9.03330135672194e-06, + "loss": 0.4555, + "step": 7805 + }, + { + "epoch": 0.10588714053174172, + "grad_norm": 5.460448265075684, + "learning_rate": 9.033164314101687e-06, + "loss": 0.3241, + "step": 7806 + }, + { + "epoch": 0.10590070537167662, + "grad_norm": 7.047703266143799, + "learning_rate": 9.03302727148143e-06, + "loss": 0.4929, + "step": 7807 + }, + { + "epoch": 0.1059142702116115, + "grad_norm": 6.215105056762695, + "learning_rate": 9.032890228861176e-06, + "loss": 0.3524, + "step": 7808 + }, + { + "epoch": 0.10592783505154639, + "grad_norm": 7.404070854187012, + "learning_rate": 9.032753186240921e-06, + "loss": 0.3198, + "step": 7809 + }, + { + "epoch": 0.10594139989148128, + "grad_norm": 5.7098307609558105, + "learning_rate": 9.032616143620668e-06, + "loss": 0.3796, + "step": 7810 + }, + { + "epoch": 0.10595496473141616, + "grad_norm": 4.817533493041992, + "learning_rate": 9.032479101000411e-06, + "loss": 0.3265, + "step": 7811 + }, + { + "epoch": 0.10596852957135106, + "grad_norm": 5.6045308113098145, + "learning_rate": 9.032342058380157e-06, + "loss": 0.2832, + "step": 7812 + }, + { + "epoch": 0.10598209441128595, + "grad_norm": 6.32909631729126, + "learning_rate": 9.032205015759902e-06, + "loss": 0.3385, + "step": 7813 + }, + { + "epoch": 0.10599565925122084, + "grad_norm": 7.143283843994141, + "learning_rate": 9.032067973139647e-06, + "loss": 0.389, + "step": 7814 + }, + { + "epoch": 0.10600922409115572, + "grad_norm": 4.337874889373779, + "learning_rate": 9.031930930519392e-06, + "loss": 0.2671, + "step": 7815 + }, + { + "epoch": 0.10602278893109061, + "grad_norm": 4.63224983215332, + "learning_rate": 9.031793887899137e-06, + "loss": 0.2362, + "step": 7816 + }, + { + "epoch": 0.10603635377102551, + "grad_norm": 5.368533134460449, + "learning_rate": 9.031656845278883e-06, + "loss": 0.2275, + "step": 7817 + }, + { + "epoch": 0.10604991861096039, + "grad_norm": 6.078197956085205, + "learning_rate": 9.031519802658628e-06, + "loss": 0.2543, + "step": 7818 + }, + { + "epoch": 0.10606348345089528, + "grad_norm": 5.173651218414307, + "learning_rate": 9.031382760038373e-06, + "loss": 0.3232, + "step": 7819 + }, + { + "epoch": 0.10607704829083017, + "grad_norm": 5.255737781524658, + "learning_rate": 9.031245717418118e-06, + "loss": 0.3361, + "step": 7820 + }, + { + "epoch": 0.10609061313076505, + "grad_norm": 5.645081520080566, + "learning_rate": 9.031108674797863e-06, + "loss": 0.4237, + "step": 7821 + }, + { + "epoch": 0.10610417797069994, + "grad_norm": 5.730955123901367, + "learning_rate": 9.030971632177607e-06, + "loss": 0.4237, + "step": 7822 + }, + { + "epoch": 0.10611774281063484, + "grad_norm": 7.920175075531006, + "learning_rate": 9.030834589557354e-06, + "loss": 0.5715, + "step": 7823 + }, + { + "epoch": 0.10613130765056972, + "grad_norm": 8.14824104309082, + "learning_rate": 9.030697546937099e-06, + "loss": 0.5307, + "step": 7824 + }, + { + "epoch": 0.10614487249050461, + "grad_norm": 8.063098907470703, + "learning_rate": 9.030560504316844e-06, + "loss": 0.4834, + "step": 7825 + }, + { + "epoch": 0.1061584373304395, + "grad_norm": 5.377615928649902, + "learning_rate": 9.030423461696587e-06, + "loss": 0.3654, + "step": 7826 + }, + { + "epoch": 0.10617200217037438, + "grad_norm": 6.780363082885742, + "learning_rate": 9.030286419076334e-06, + "loss": 0.4732, + "step": 7827 + }, + { + "epoch": 0.10618556701030928, + "grad_norm": 4.737782001495361, + "learning_rate": 9.03014937645608e-06, + "loss": 0.3491, + "step": 7828 + }, + { + "epoch": 0.10619913185024417, + "grad_norm": 4.932500839233398, + "learning_rate": 9.030012333835823e-06, + "loss": 0.3414, + "step": 7829 + }, + { + "epoch": 0.10621269669017906, + "grad_norm": 5.908719539642334, + "learning_rate": 9.029875291215568e-06, + "loss": 0.3506, + "step": 7830 + }, + { + "epoch": 0.10622626153011394, + "grad_norm": 7.003829002380371, + "learning_rate": 9.029738248595313e-06, + "loss": 0.3708, + "step": 7831 + }, + { + "epoch": 0.10623982637004883, + "grad_norm": 5.477092266082764, + "learning_rate": 9.029601205975059e-06, + "loss": 0.2746, + "step": 7832 + }, + { + "epoch": 0.10625339120998373, + "grad_norm": 7.5356059074401855, + "learning_rate": 9.029464163354804e-06, + "loss": 0.4721, + "step": 7833 + }, + { + "epoch": 0.1062669560499186, + "grad_norm": 5.497729778289795, + "learning_rate": 9.029327120734549e-06, + "loss": 0.2657, + "step": 7834 + }, + { + "epoch": 0.1062805208898535, + "grad_norm": 7.161645889282227, + "learning_rate": 9.029190078114294e-06, + "loss": 0.3221, + "step": 7835 + }, + { + "epoch": 0.10629408572978839, + "grad_norm": 5.553386688232422, + "learning_rate": 9.02905303549404e-06, + "loss": 0.3358, + "step": 7836 + }, + { + "epoch": 0.10630765056972327, + "grad_norm": 5.928520202636719, + "learning_rate": 9.028915992873784e-06, + "loss": 0.4901, + "step": 7837 + }, + { + "epoch": 0.10632121540965817, + "grad_norm": 6.184658050537109, + "learning_rate": 9.02877895025353e-06, + "loss": 0.35, + "step": 7838 + }, + { + "epoch": 0.10633478024959306, + "grad_norm": 5.575666904449463, + "learning_rate": 9.028641907633275e-06, + "loss": 0.3209, + "step": 7839 + }, + { + "epoch": 0.10634834508952794, + "grad_norm": 5.495612621307373, + "learning_rate": 9.02850486501302e-06, + "loss": 0.3449, + "step": 7840 + }, + { + "epoch": 0.10636190992946283, + "grad_norm": 6.520381450653076, + "learning_rate": 9.028367822392765e-06, + "loss": 0.3726, + "step": 7841 + }, + { + "epoch": 0.10637547476939772, + "grad_norm": 4.826801300048828, + "learning_rate": 9.02823077977251e-06, + "loss": 0.1888, + "step": 7842 + }, + { + "epoch": 0.1063890396093326, + "grad_norm": 5.568803787231445, + "learning_rate": 9.028093737152255e-06, + "loss": 0.3653, + "step": 7843 + }, + { + "epoch": 0.1064026044492675, + "grad_norm": 6.048960208892822, + "learning_rate": 9.027956694532e-06, + "loss": 0.3131, + "step": 7844 + }, + { + "epoch": 0.10641616928920239, + "grad_norm": 4.980364799499512, + "learning_rate": 9.027819651911746e-06, + "loss": 0.2311, + "step": 7845 + }, + { + "epoch": 0.10642973412913728, + "grad_norm": 6.659889221191406, + "learning_rate": 9.027682609291491e-06, + "loss": 0.2917, + "step": 7846 + }, + { + "epoch": 0.10644329896907216, + "grad_norm": 5.295872211456299, + "learning_rate": 9.027545566671235e-06, + "loss": 0.3012, + "step": 7847 + }, + { + "epoch": 0.10645686380900705, + "grad_norm": 4.086415767669678, + "learning_rate": 9.02740852405098e-06, + "loss": 0.2088, + "step": 7848 + }, + { + "epoch": 0.10647042864894195, + "grad_norm": 4.2984771728515625, + "learning_rate": 9.027271481430727e-06, + "loss": 0.2547, + "step": 7849 + }, + { + "epoch": 0.10648399348887683, + "grad_norm": 7.481341361999512, + "learning_rate": 9.027134438810472e-06, + "loss": 0.3345, + "step": 7850 + }, + { + "epoch": 0.10649755832881172, + "grad_norm": 5.3753252029418945, + "learning_rate": 9.026997396190215e-06, + "loss": 0.2723, + "step": 7851 + }, + { + "epoch": 0.10651112316874661, + "grad_norm": 5.673910140991211, + "learning_rate": 9.02686035356996e-06, + "loss": 0.3708, + "step": 7852 + }, + { + "epoch": 0.10652468800868149, + "grad_norm": 6.441898345947266, + "learning_rate": 9.026723310949707e-06, + "loss": 0.2326, + "step": 7853 + }, + { + "epoch": 0.10653825284861639, + "grad_norm": 7.597372055053711, + "learning_rate": 9.02658626832945e-06, + "loss": 0.484, + "step": 7854 + }, + { + "epoch": 0.10655181768855128, + "grad_norm": 7.246400833129883, + "learning_rate": 9.026449225709196e-06, + "loss": 0.3322, + "step": 7855 + }, + { + "epoch": 0.10656538252848616, + "grad_norm": 3.9777450561523438, + "learning_rate": 9.026312183088941e-06, + "loss": 0.2023, + "step": 7856 + }, + { + "epoch": 0.10657894736842105, + "grad_norm": 5.981658935546875, + "learning_rate": 9.026175140468686e-06, + "loss": 0.3749, + "step": 7857 + }, + { + "epoch": 0.10659251220835594, + "grad_norm": 5.799531936645508, + "learning_rate": 9.026038097848431e-06, + "loss": 0.276, + "step": 7858 + }, + { + "epoch": 0.10660607704829084, + "grad_norm": 8.627510070800781, + "learning_rate": 9.025901055228177e-06, + "loss": 0.4202, + "step": 7859 + }, + { + "epoch": 0.10661964188822572, + "grad_norm": 4.4065728187561035, + "learning_rate": 9.025764012607922e-06, + "loss": 0.269, + "step": 7860 + }, + { + "epoch": 0.10663320672816061, + "grad_norm": 5.778543949127197, + "learning_rate": 9.025626969987667e-06, + "loss": 0.2748, + "step": 7861 + }, + { + "epoch": 0.1066467715680955, + "grad_norm": 4.462474822998047, + "learning_rate": 9.025489927367412e-06, + "loss": 0.223, + "step": 7862 + }, + { + "epoch": 0.10666033640803038, + "grad_norm": 6.182333469390869, + "learning_rate": 9.025352884747157e-06, + "loss": 0.3571, + "step": 7863 + }, + { + "epoch": 0.10667390124796527, + "grad_norm": 4.5009918212890625, + "learning_rate": 9.025215842126903e-06, + "loss": 0.2063, + "step": 7864 + }, + { + "epoch": 0.10668746608790017, + "grad_norm": 5.044652462005615, + "learning_rate": 9.025078799506648e-06, + "loss": 0.3209, + "step": 7865 + }, + { + "epoch": 0.10670103092783505, + "grad_norm": 8.750929832458496, + "learning_rate": 9.024941756886393e-06, + "loss": 0.42, + "step": 7866 + }, + { + "epoch": 0.10671459576776994, + "grad_norm": 8.00412654876709, + "learning_rate": 9.024804714266138e-06, + "loss": 0.5002, + "step": 7867 + }, + { + "epoch": 0.10672816060770483, + "grad_norm": 5.243423938751221, + "learning_rate": 9.024667671645883e-06, + "loss": 0.301, + "step": 7868 + }, + { + "epoch": 0.10674172544763971, + "grad_norm": 5.2352447509765625, + "learning_rate": 9.024530629025627e-06, + "loss": 0.3551, + "step": 7869 + }, + { + "epoch": 0.1067552902875746, + "grad_norm": 6.203370571136475, + "learning_rate": 9.024393586405374e-06, + "loss": 0.3515, + "step": 7870 + }, + { + "epoch": 0.1067688551275095, + "grad_norm": 6.774307727813721, + "learning_rate": 9.024256543785119e-06, + "loss": 0.3556, + "step": 7871 + }, + { + "epoch": 0.10678241996744438, + "grad_norm": 6.160084247589111, + "learning_rate": 9.024119501164862e-06, + "loss": 0.2764, + "step": 7872 + }, + { + "epoch": 0.10679598480737927, + "grad_norm": 7.01254415512085, + "learning_rate": 9.023982458544607e-06, + "loss": 0.2595, + "step": 7873 + }, + { + "epoch": 0.10680954964731416, + "grad_norm": 5.357696533203125, + "learning_rate": 9.023845415924353e-06, + "loss": 0.354, + "step": 7874 + }, + { + "epoch": 0.10682311448724906, + "grad_norm": 7.583346366882324, + "learning_rate": 9.023708373304098e-06, + "loss": 0.5022, + "step": 7875 + }, + { + "epoch": 0.10683667932718394, + "grad_norm": 7.33200740814209, + "learning_rate": 9.023571330683843e-06, + "loss": 0.3473, + "step": 7876 + }, + { + "epoch": 0.10685024416711883, + "grad_norm": 5.276454448699951, + "learning_rate": 9.023434288063588e-06, + "loss": 0.333, + "step": 7877 + }, + { + "epoch": 0.10686380900705372, + "grad_norm": 6.32854700088501, + "learning_rate": 9.023297245443333e-06, + "loss": 0.302, + "step": 7878 + }, + { + "epoch": 0.1068773738469886, + "grad_norm": 8.315030097961426, + "learning_rate": 9.023160202823079e-06, + "loss": 0.423, + "step": 7879 + }, + { + "epoch": 0.1068909386869235, + "grad_norm": 6.148957252502441, + "learning_rate": 9.023023160202824e-06, + "loss": 0.3314, + "step": 7880 + }, + { + "epoch": 0.10690450352685839, + "grad_norm": 5.475643157958984, + "learning_rate": 9.022886117582569e-06, + "loss": 0.2484, + "step": 7881 + }, + { + "epoch": 0.10691806836679327, + "grad_norm": 7.700174808502197, + "learning_rate": 9.022749074962314e-06, + "loss": 0.4882, + "step": 7882 + }, + { + "epoch": 0.10693163320672816, + "grad_norm": 6.275773048400879, + "learning_rate": 9.02261203234206e-06, + "loss": 0.364, + "step": 7883 + }, + { + "epoch": 0.10694519804666305, + "grad_norm": 4.942721843719482, + "learning_rate": 9.022474989721804e-06, + "loss": 0.2736, + "step": 7884 + }, + { + "epoch": 0.10695876288659793, + "grad_norm": 7.186543941497803, + "learning_rate": 9.02233794710155e-06, + "loss": 0.3679, + "step": 7885 + }, + { + "epoch": 0.10697232772653283, + "grad_norm": 5.890157699584961, + "learning_rate": 9.022200904481295e-06, + "loss": 0.4632, + "step": 7886 + }, + { + "epoch": 0.10698589256646772, + "grad_norm": 4.785666465759277, + "learning_rate": 9.022063861861038e-06, + "loss": 0.2462, + "step": 7887 + }, + { + "epoch": 0.1069994574064026, + "grad_norm": 6.629101753234863, + "learning_rate": 9.021926819240785e-06, + "loss": 0.369, + "step": 7888 + }, + { + "epoch": 0.10701302224633749, + "grad_norm": 5.00523042678833, + "learning_rate": 9.02178977662053e-06, + "loss": 0.2692, + "step": 7889 + }, + { + "epoch": 0.10702658708627238, + "grad_norm": 6.541769981384277, + "learning_rate": 9.021652734000274e-06, + "loss": 0.373, + "step": 7890 + }, + { + "epoch": 0.10704015192620728, + "grad_norm": 7.332154273986816, + "learning_rate": 9.021515691380019e-06, + "loss": 0.3307, + "step": 7891 + }, + { + "epoch": 0.10705371676614216, + "grad_norm": 5.106062889099121, + "learning_rate": 9.021378648759766e-06, + "loss": 0.3599, + "step": 7892 + }, + { + "epoch": 0.10706728160607705, + "grad_norm": 5.429165363311768, + "learning_rate": 9.021241606139511e-06, + "loss": 0.295, + "step": 7893 + }, + { + "epoch": 0.10708084644601194, + "grad_norm": 5.772588729858398, + "learning_rate": 9.021104563519255e-06, + "loss": 0.2947, + "step": 7894 + }, + { + "epoch": 0.10709441128594682, + "grad_norm": 6.410425186157227, + "learning_rate": 9.020967520899e-06, + "loss": 0.3394, + "step": 7895 + }, + { + "epoch": 0.10710797612588172, + "grad_norm": 6.282507419586182, + "learning_rate": 9.020830478278747e-06, + "loss": 0.4696, + "step": 7896 + }, + { + "epoch": 0.10712154096581661, + "grad_norm": 5.396022796630859, + "learning_rate": 9.02069343565849e-06, + "loss": 0.3914, + "step": 7897 + }, + { + "epoch": 0.10713510580575149, + "grad_norm": 6.264139175415039, + "learning_rate": 9.020556393038235e-06, + "loss": 0.3744, + "step": 7898 + }, + { + "epoch": 0.10714867064568638, + "grad_norm": 6.039106845855713, + "learning_rate": 9.02041935041798e-06, + "loss": 0.3661, + "step": 7899 + }, + { + "epoch": 0.10716223548562127, + "grad_norm": 6.333813190460205, + "learning_rate": 9.020282307797726e-06, + "loss": 0.4109, + "step": 7900 + }, + { + "epoch": 0.10717580032555615, + "grad_norm": 6.539295196533203, + "learning_rate": 9.02014526517747e-06, + "loss": 0.3662, + "step": 7901 + }, + { + "epoch": 0.10718936516549105, + "grad_norm": 5.055360794067383, + "learning_rate": 9.020008222557216e-06, + "loss": 0.2899, + "step": 7902 + }, + { + "epoch": 0.10720293000542594, + "grad_norm": 7.38791561126709, + "learning_rate": 9.019871179936961e-06, + "loss": 0.4426, + "step": 7903 + }, + { + "epoch": 0.10721649484536082, + "grad_norm": 7.985604286193848, + "learning_rate": 9.019734137316706e-06, + "loss": 0.4565, + "step": 7904 + }, + { + "epoch": 0.10723005968529571, + "grad_norm": 7.373958110809326, + "learning_rate": 9.019597094696452e-06, + "loss": 0.546, + "step": 7905 + }, + { + "epoch": 0.1072436245252306, + "grad_norm": 7.108785629272461, + "learning_rate": 9.019460052076197e-06, + "loss": 0.4315, + "step": 7906 + }, + { + "epoch": 0.1072571893651655, + "grad_norm": 5.99947452545166, + "learning_rate": 9.019323009455942e-06, + "loss": 0.3129, + "step": 7907 + }, + { + "epoch": 0.10727075420510038, + "grad_norm": 8.304336547851562, + "learning_rate": 9.019185966835687e-06, + "loss": 0.5307, + "step": 7908 + }, + { + "epoch": 0.10728431904503527, + "grad_norm": 7.875834941864014, + "learning_rate": 9.019048924215432e-06, + "loss": 0.4748, + "step": 7909 + }, + { + "epoch": 0.10729788388497016, + "grad_norm": 7.1730546951293945, + "learning_rate": 9.018911881595177e-06, + "loss": 0.4478, + "step": 7910 + }, + { + "epoch": 0.10731144872490504, + "grad_norm": 6.1942338943481445, + "learning_rate": 9.018774838974923e-06, + "loss": 0.4474, + "step": 7911 + }, + { + "epoch": 0.10732501356483994, + "grad_norm": 6.3139729499816895, + "learning_rate": 9.018637796354666e-06, + "loss": 0.3733, + "step": 7912 + }, + { + "epoch": 0.10733857840477483, + "grad_norm": 5.5820631980896, + "learning_rate": 9.018500753734413e-06, + "loss": 0.3032, + "step": 7913 + }, + { + "epoch": 0.10735214324470971, + "grad_norm": 7.296183109283447, + "learning_rate": 9.018363711114158e-06, + "loss": 0.4529, + "step": 7914 + }, + { + "epoch": 0.1073657080846446, + "grad_norm": 6.538076877593994, + "learning_rate": 9.018226668493902e-06, + "loss": 0.3656, + "step": 7915 + }, + { + "epoch": 0.1073792729245795, + "grad_norm": 6.249586582183838, + "learning_rate": 9.018089625873647e-06, + "loss": 0.4378, + "step": 7916 + }, + { + "epoch": 0.10739283776451437, + "grad_norm": 5.948966979980469, + "learning_rate": 9.017952583253392e-06, + "loss": 0.4799, + "step": 7917 + }, + { + "epoch": 0.10740640260444927, + "grad_norm": 8.065183639526367, + "learning_rate": 9.017815540633139e-06, + "loss": 0.5837, + "step": 7918 + }, + { + "epoch": 0.10741996744438416, + "grad_norm": 6.967296123504639, + "learning_rate": 9.017678498012882e-06, + "loss": 0.3927, + "step": 7919 + }, + { + "epoch": 0.10743353228431904, + "grad_norm": 5.9258599281311035, + "learning_rate": 9.017541455392627e-06, + "loss": 0.3654, + "step": 7920 + }, + { + "epoch": 0.10744709712425393, + "grad_norm": 5.858676910400391, + "learning_rate": 9.017404412772373e-06, + "loss": 0.3809, + "step": 7921 + }, + { + "epoch": 0.10746066196418882, + "grad_norm": 7.871949195861816, + "learning_rate": 9.017267370152118e-06, + "loss": 0.6005, + "step": 7922 + }, + { + "epoch": 0.10747422680412372, + "grad_norm": 5.41815710067749, + "learning_rate": 9.017130327531863e-06, + "loss": 0.2385, + "step": 7923 + }, + { + "epoch": 0.1074877916440586, + "grad_norm": 6.798672199249268, + "learning_rate": 9.016993284911608e-06, + "loss": 0.4465, + "step": 7924 + }, + { + "epoch": 0.10750135648399349, + "grad_norm": 8.18445873260498, + "learning_rate": 9.016856242291353e-06, + "loss": 0.5255, + "step": 7925 + }, + { + "epoch": 0.10751492132392838, + "grad_norm": 6.567366600036621, + "learning_rate": 9.016719199671099e-06, + "loss": 0.4573, + "step": 7926 + }, + { + "epoch": 0.10752848616386326, + "grad_norm": 6.401097297668457, + "learning_rate": 9.016582157050844e-06, + "loss": 0.371, + "step": 7927 + }, + { + "epoch": 0.10754205100379816, + "grad_norm": 7.027868270874023, + "learning_rate": 9.016445114430589e-06, + "loss": 0.3406, + "step": 7928 + }, + { + "epoch": 0.10755561584373305, + "grad_norm": 6.746181488037109, + "learning_rate": 9.016308071810334e-06, + "loss": 0.3442, + "step": 7929 + }, + { + "epoch": 0.10756918068366793, + "grad_norm": 5.828417778015137, + "learning_rate": 9.016171029190078e-06, + "loss": 0.302, + "step": 7930 + }, + { + "epoch": 0.10758274552360282, + "grad_norm": 8.034521102905273, + "learning_rate": 9.016033986569824e-06, + "loss": 0.4156, + "step": 7931 + }, + { + "epoch": 0.10759631036353771, + "grad_norm": 7.754459381103516, + "learning_rate": 9.01589694394957e-06, + "loss": 0.3714, + "step": 7932 + }, + { + "epoch": 0.1076098752034726, + "grad_norm": 7.801807403564453, + "learning_rate": 9.015759901329315e-06, + "loss": 0.4146, + "step": 7933 + }, + { + "epoch": 0.10762344004340749, + "grad_norm": 5.7779974937438965, + "learning_rate": 9.015622858709058e-06, + "loss": 0.4584, + "step": 7934 + }, + { + "epoch": 0.10763700488334238, + "grad_norm": 6.521368026733398, + "learning_rate": 9.015485816088805e-06, + "loss": 0.4408, + "step": 7935 + }, + { + "epoch": 0.10765056972327726, + "grad_norm": 5.52677059173584, + "learning_rate": 9.01534877346855e-06, + "loss": 0.4148, + "step": 7936 + }, + { + "epoch": 0.10766413456321215, + "grad_norm": 6.3258957862854, + "learning_rate": 9.015211730848294e-06, + "loss": 0.2581, + "step": 7937 + }, + { + "epoch": 0.10767769940314705, + "grad_norm": 6.307536602020264, + "learning_rate": 9.015074688228039e-06, + "loss": 0.3562, + "step": 7938 + }, + { + "epoch": 0.10769126424308194, + "grad_norm": 5.849946022033691, + "learning_rate": 9.014937645607786e-06, + "loss": 0.363, + "step": 7939 + }, + { + "epoch": 0.10770482908301682, + "grad_norm": 6.734907150268555, + "learning_rate": 9.01480060298753e-06, + "loss": 0.3097, + "step": 7940 + }, + { + "epoch": 0.10771839392295171, + "grad_norm": 6.866668224334717, + "learning_rate": 9.014663560367275e-06, + "loss": 0.4212, + "step": 7941 + }, + { + "epoch": 0.1077319587628866, + "grad_norm": 5.562880992889404, + "learning_rate": 9.01452651774702e-06, + "loss": 0.5509, + "step": 7942 + }, + { + "epoch": 0.10774552360282148, + "grad_norm": 6.921916484832764, + "learning_rate": 9.014389475126765e-06, + "loss": 0.4372, + "step": 7943 + }, + { + "epoch": 0.10775908844275638, + "grad_norm": 7.4979448318481445, + "learning_rate": 9.01425243250651e-06, + "loss": 0.4565, + "step": 7944 + }, + { + "epoch": 0.10777265328269127, + "grad_norm": 6.218523025512695, + "learning_rate": 9.014115389886255e-06, + "loss": 0.4455, + "step": 7945 + }, + { + "epoch": 0.10778621812262615, + "grad_norm": 7.628395080566406, + "learning_rate": 9.013978347266e-06, + "loss": 0.5168, + "step": 7946 + }, + { + "epoch": 0.10779978296256104, + "grad_norm": 5.8308258056640625, + "learning_rate": 9.013841304645746e-06, + "loss": 0.3577, + "step": 7947 + }, + { + "epoch": 0.10781334780249593, + "grad_norm": 5.447487831115723, + "learning_rate": 9.01370426202549e-06, + "loss": 0.3153, + "step": 7948 + }, + { + "epoch": 0.10782691264243081, + "grad_norm": 5.508866786956787, + "learning_rate": 9.013567219405236e-06, + "loss": 0.338, + "step": 7949 + }, + { + "epoch": 0.10784047748236571, + "grad_norm": 6.564570426940918, + "learning_rate": 9.013430176784981e-06, + "loss": 0.4166, + "step": 7950 + }, + { + "epoch": 0.1078540423223006, + "grad_norm": 5.234955787658691, + "learning_rate": 9.013293134164726e-06, + "loss": 0.2743, + "step": 7951 + }, + { + "epoch": 0.10786760716223548, + "grad_norm": 5.55792236328125, + "learning_rate": 9.013156091544472e-06, + "loss": 0.3962, + "step": 7952 + }, + { + "epoch": 0.10788117200217037, + "grad_norm": 5.79143762588501, + "learning_rate": 9.013019048924217e-06, + "loss": 0.4126, + "step": 7953 + }, + { + "epoch": 0.10789473684210527, + "grad_norm": 6.015189170837402, + "learning_rate": 9.012882006303962e-06, + "loss": 0.3583, + "step": 7954 + }, + { + "epoch": 0.10790830168204016, + "grad_norm": 7.802090167999268, + "learning_rate": 9.012744963683705e-06, + "loss": 0.4504, + "step": 7955 + }, + { + "epoch": 0.10792186652197504, + "grad_norm": 6.204956531524658, + "learning_rate": 9.01260792106345e-06, + "loss": 0.4034, + "step": 7956 + }, + { + "epoch": 0.10793543136190993, + "grad_norm": 6.640707969665527, + "learning_rate": 9.012470878443197e-06, + "loss": 0.3896, + "step": 7957 + }, + { + "epoch": 0.10794899620184482, + "grad_norm": 5.470214366912842, + "learning_rate": 9.012333835822941e-06, + "loss": 0.4207, + "step": 7958 + }, + { + "epoch": 0.1079625610417797, + "grad_norm": 5.0052056312561035, + "learning_rate": 9.012196793202686e-06, + "loss": 0.2907, + "step": 7959 + }, + { + "epoch": 0.1079761258817146, + "grad_norm": 5.753905773162842, + "learning_rate": 9.012059750582431e-06, + "loss": 0.3415, + "step": 7960 + }, + { + "epoch": 0.10798969072164949, + "grad_norm": 5.227937698364258, + "learning_rate": 9.011922707962178e-06, + "loss": 0.3564, + "step": 7961 + }, + { + "epoch": 0.10800325556158437, + "grad_norm": 5.762558460235596, + "learning_rate": 9.011785665341922e-06, + "loss": 0.343, + "step": 7962 + }, + { + "epoch": 0.10801682040151926, + "grad_norm": 7.346169948577881, + "learning_rate": 9.011648622721667e-06, + "loss": 0.3964, + "step": 7963 + }, + { + "epoch": 0.10803038524145415, + "grad_norm": 6.318621635437012, + "learning_rate": 9.011511580101412e-06, + "loss": 0.4973, + "step": 7964 + }, + { + "epoch": 0.10804395008138903, + "grad_norm": 4.018206596374512, + "learning_rate": 9.011374537481157e-06, + "loss": 0.2901, + "step": 7965 + }, + { + "epoch": 0.10805751492132393, + "grad_norm": 6.441381931304932, + "learning_rate": 9.011237494860902e-06, + "loss": 0.4748, + "step": 7966 + }, + { + "epoch": 0.10807107976125882, + "grad_norm": 5.622974395751953, + "learning_rate": 9.011100452240648e-06, + "loss": 0.2454, + "step": 7967 + }, + { + "epoch": 0.1080846446011937, + "grad_norm": 8.905617713928223, + "learning_rate": 9.010963409620393e-06, + "loss": 0.4727, + "step": 7968 + }, + { + "epoch": 0.10809820944112859, + "grad_norm": 6.8055548667907715, + "learning_rate": 9.010826367000138e-06, + "loss": 0.4134, + "step": 7969 + }, + { + "epoch": 0.10811177428106349, + "grad_norm": 5.826209545135498, + "learning_rate": 9.010689324379883e-06, + "loss": 0.4132, + "step": 7970 + }, + { + "epoch": 0.10812533912099838, + "grad_norm": 5.738723278045654, + "learning_rate": 9.010552281759628e-06, + "loss": 0.4284, + "step": 7971 + }, + { + "epoch": 0.10813890396093326, + "grad_norm": 8.088188171386719, + "learning_rate": 9.010415239139373e-06, + "loss": 0.42, + "step": 7972 + }, + { + "epoch": 0.10815246880086815, + "grad_norm": 6.521784782409668, + "learning_rate": 9.010278196519117e-06, + "loss": 0.5007, + "step": 7973 + }, + { + "epoch": 0.10816603364080304, + "grad_norm": 5.762839317321777, + "learning_rate": 9.010141153898864e-06, + "loss": 0.387, + "step": 7974 + }, + { + "epoch": 0.10817959848073792, + "grad_norm": 7.161439895629883, + "learning_rate": 9.010004111278609e-06, + "loss": 0.3563, + "step": 7975 + }, + { + "epoch": 0.10819316332067282, + "grad_norm": 7.769431114196777, + "learning_rate": 9.009867068658354e-06, + "loss": 0.4547, + "step": 7976 + }, + { + "epoch": 0.10820672816060771, + "grad_norm": 5.171465873718262, + "learning_rate": 9.009730026038098e-06, + "loss": 0.2926, + "step": 7977 + }, + { + "epoch": 0.10822029300054259, + "grad_norm": 6.568623065948486, + "learning_rate": 9.009592983417844e-06, + "loss": 0.3683, + "step": 7978 + }, + { + "epoch": 0.10823385784047748, + "grad_norm": 7.462506294250488, + "learning_rate": 9.00945594079759e-06, + "loss": 0.469, + "step": 7979 + }, + { + "epoch": 0.10824742268041238, + "grad_norm": 8.623321533203125, + "learning_rate": 9.009318898177333e-06, + "loss": 0.5449, + "step": 7980 + }, + { + "epoch": 0.10826098752034725, + "grad_norm": 5.958180904388428, + "learning_rate": 9.009181855557078e-06, + "loss": 0.3889, + "step": 7981 + }, + { + "epoch": 0.10827455236028215, + "grad_norm": 5.110682964324951, + "learning_rate": 9.009044812936824e-06, + "loss": 0.3488, + "step": 7982 + }, + { + "epoch": 0.10828811720021704, + "grad_norm": 6.612009525299072, + "learning_rate": 9.008907770316569e-06, + "loss": 0.4693, + "step": 7983 + }, + { + "epoch": 0.10830168204015192, + "grad_norm": 7.81866455078125, + "learning_rate": 9.008770727696314e-06, + "loss": 0.4485, + "step": 7984 + }, + { + "epoch": 0.10831524688008681, + "grad_norm": 5.3969621658325195, + "learning_rate": 9.008633685076059e-06, + "loss": 0.2723, + "step": 7985 + }, + { + "epoch": 0.1083288117200217, + "grad_norm": 6.036742687225342, + "learning_rate": 9.008496642455804e-06, + "loss": 0.4296, + "step": 7986 + }, + { + "epoch": 0.1083423765599566, + "grad_norm": 5.990718364715576, + "learning_rate": 9.00835959983555e-06, + "loss": 0.2744, + "step": 7987 + }, + { + "epoch": 0.10835594139989148, + "grad_norm": 5.859616279602051, + "learning_rate": 9.008222557215295e-06, + "loss": 0.5528, + "step": 7988 + }, + { + "epoch": 0.10836950623982637, + "grad_norm": 5.424892902374268, + "learning_rate": 9.00808551459504e-06, + "loss": 0.4584, + "step": 7989 + }, + { + "epoch": 0.10838307107976126, + "grad_norm": 5.628893852233887, + "learning_rate": 9.007948471974785e-06, + "loss": 0.4375, + "step": 7990 + }, + { + "epoch": 0.10839663591969614, + "grad_norm": 8.331860542297363, + "learning_rate": 9.00781142935453e-06, + "loss": 0.5223, + "step": 7991 + }, + { + "epoch": 0.10841020075963104, + "grad_norm": 7.251453876495361, + "learning_rate": 9.007674386734275e-06, + "loss": 0.3785, + "step": 7992 + }, + { + "epoch": 0.10842376559956593, + "grad_norm": 4.916853427886963, + "learning_rate": 9.00753734411402e-06, + "loss": 0.4, + "step": 7993 + }, + { + "epoch": 0.10843733043950081, + "grad_norm": 5.355219841003418, + "learning_rate": 9.007400301493766e-06, + "loss": 0.3144, + "step": 7994 + }, + { + "epoch": 0.1084508952794357, + "grad_norm": 7.462691307067871, + "learning_rate": 9.00726325887351e-06, + "loss": 0.4066, + "step": 7995 + }, + { + "epoch": 0.1084644601193706, + "grad_norm": 7.864752292633057, + "learning_rate": 9.007126216253256e-06, + "loss": 0.5326, + "step": 7996 + }, + { + "epoch": 0.10847802495930547, + "grad_norm": 9.031742095947266, + "learning_rate": 9.006989173633001e-06, + "loss": 0.495, + "step": 7997 + }, + { + "epoch": 0.10849158979924037, + "grad_norm": 4.508965969085693, + "learning_rate": 9.006852131012745e-06, + "loss": 0.3401, + "step": 7998 + }, + { + "epoch": 0.10850515463917526, + "grad_norm": 7.050581455230713, + "learning_rate": 9.00671508839249e-06, + "loss": 0.3866, + "step": 7999 + }, + { + "epoch": 0.10851871947911014, + "grad_norm": 6.798739910125732, + "learning_rate": 9.006578045772237e-06, + "loss": 0.5182, + "step": 8000 + }, + { + "epoch": 0.10853228431904503, + "grad_norm": 8.067197799682617, + "learning_rate": 9.006441003151982e-06, + "loss": 0.4609, + "step": 8001 + }, + { + "epoch": 0.10854584915897993, + "grad_norm": 5.153645038604736, + "learning_rate": 9.006303960531725e-06, + "loss": 0.3743, + "step": 8002 + }, + { + "epoch": 0.10855941399891482, + "grad_norm": 5.337229251861572, + "learning_rate": 9.00616691791147e-06, + "loss": 0.2978, + "step": 8003 + }, + { + "epoch": 0.1085729788388497, + "grad_norm": 5.494024276733398, + "learning_rate": 9.006029875291217e-06, + "loss": 0.3665, + "step": 8004 + }, + { + "epoch": 0.10858654367878459, + "grad_norm": 7.02168083190918, + "learning_rate": 9.005892832670961e-06, + "loss": 0.4357, + "step": 8005 + }, + { + "epoch": 0.10860010851871948, + "grad_norm": 7.263876438140869, + "learning_rate": 9.005755790050706e-06, + "loss": 0.4948, + "step": 8006 + }, + { + "epoch": 0.10861367335865436, + "grad_norm": 6.286440849304199, + "learning_rate": 9.005618747430451e-06, + "loss": 0.3874, + "step": 8007 + }, + { + "epoch": 0.10862723819858926, + "grad_norm": 7.506501197814941, + "learning_rate": 9.005481704810196e-06, + "loss": 0.4415, + "step": 8008 + }, + { + "epoch": 0.10864080303852415, + "grad_norm": 4.941030979156494, + "learning_rate": 9.005344662189942e-06, + "loss": 0.2998, + "step": 8009 + }, + { + "epoch": 0.10865436787845903, + "grad_norm": 7.913052558898926, + "learning_rate": 9.005207619569687e-06, + "loss": 0.4619, + "step": 8010 + }, + { + "epoch": 0.10866793271839392, + "grad_norm": 7.360294818878174, + "learning_rate": 9.005070576949432e-06, + "loss": 0.3186, + "step": 8011 + }, + { + "epoch": 0.10868149755832882, + "grad_norm": 6.373230457305908, + "learning_rate": 9.004933534329177e-06, + "loss": 0.3251, + "step": 8012 + }, + { + "epoch": 0.1086950623982637, + "grad_norm": 6.729010105133057, + "learning_rate": 9.004796491708922e-06, + "loss": 0.3687, + "step": 8013 + }, + { + "epoch": 0.10870862723819859, + "grad_norm": 6.747348785400391, + "learning_rate": 9.004659449088668e-06, + "loss": 0.4702, + "step": 8014 + }, + { + "epoch": 0.10872219207813348, + "grad_norm": 7.560379505157471, + "learning_rate": 9.004522406468413e-06, + "loss": 0.5647, + "step": 8015 + }, + { + "epoch": 0.10873575691806836, + "grad_norm": 7.307379245758057, + "learning_rate": 9.004385363848158e-06, + "loss": 0.3971, + "step": 8016 + }, + { + "epoch": 0.10874932175800325, + "grad_norm": 5.274499416351318, + "learning_rate": 9.004248321227903e-06, + "loss": 0.4066, + "step": 8017 + }, + { + "epoch": 0.10876288659793815, + "grad_norm": 6.0423407554626465, + "learning_rate": 9.004111278607648e-06, + "loss": 0.4081, + "step": 8018 + }, + { + "epoch": 0.10877645143787304, + "grad_norm": 6.70522928237915, + "learning_rate": 9.003974235987393e-06, + "loss": 0.3442, + "step": 8019 + }, + { + "epoch": 0.10879001627780792, + "grad_norm": 5.1024298667907715, + "learning_rate": 9.003837193367137e-06, + "loss": 0.3137, + "step": 8020 + }, + { + "epoch": 0.10880358111774281, + "grad_norm": 6.944864749908447, + "learning_rate": 9.003700150746884e-06, + "loss": 0.4837, + "step": 8021 + }, + { + "epoch": 0.1088171459576777, + "grad_norm": 5.7003679275512695, + "learning_rate": 9.003563108126629e-06, + "loss": 0.3844, + "step": 8022 + }, + { + "epoch": 0.10883071079761258, + "grad_norm": 4.994449138641357, + "learning_rate": 9.003426065506372e-06, + "loss": 0.3771, + "step": 8023 + }, + { + "epoch": 0.10884427563754748, + "grad_norm": 5.80933141708374, + "learning_rate": 9.003289022886118e-06, + "loss": 0.3579, + "step": 8024 + }, + { + "epoch": 0.10885784047748237, + "grad_norm": 6.582284927368164, + "learning_rate": 9.003151980265863e-06, + "loss": 0.4292, + "step": 8025 + }, + { + "epoch": 0.10887140531741725, + "grad_norm": 8.22785472869873, + "learning_rate": 9.00301493764561e-06, + "loss": 0.4953, + "step": 8026 + }, + { + "epoch": 0.10888497015735214, + "grad_norm": 7.228267192840576, + "learning_rate": 9.002877895025353e-06, + "loss": 0.5655, + "step": 8027 + }, + { + "epoch": 0.10889853499728704, + "grad_norm": 6.403097629547119, + "learning_rate": 9.002740852405098e-06, + "loss": 0.3735, + "step": 8028 + }, + { + "epoch": 0.10891209983722192, + "grad_norm": 6.976003170013428, + "learning_rate": 9.002603809784844e-06, + "loss": 0.3845, + "step": 8029 + }, + { + "epoch": 0.10892566467715681, + "grad_norm": 5.976942539215088, + "learning_rate": 9.002466767164589e-06, + "loss": 0.4009, + "step": 8030 + }, + { + "epoch": 0.1089392295170917, + "grad_norm": 6.292357921600342, + "learning_rate": 9.002329724544334e-06, + "loss": 0.4074, + "step": 8031 + }, + { + "epoch": 0.10895279435702658, + "grad_norm": 6.266970157623291, + "learning_rate": 9.002192681924079e-06, + "loss": 0.504, + "step": 8032 + }, + { + "epoch": 0.10896635919696147, + "grad_norm": 5.8759965896606445, + "learning_rate": 9.002055639303824e-06, + "loss": 0.2886, + "step": 8033 + }, + { + "epoch": 0.10897992403689637, + "grad_norm": 5.348785400390625, + "learning_rate": 9.00191859668357e-06, + "loss": 0.3285, + "step": 8034 + }, + { + "epoch": 0.10899348887683126, + "grad_norm": 6.855266094207764, + "learning_rate": 9.001781554063315e-06, + "loss": 0.5703, + "step": 8035 + }, + { + "epoch": 0.10900705371676614, + "grad_norm": 4.8924689292907715, + "learning_rate": 9.00164451144306e-06, + "loss": 0.3624, + "step": 8036 + }, + { + "epoch": 0.10902061855670103, + "grad_norm": 5.442574501037598, + "learning_rate": 9.001507468822805e-06, + "loss": 0.3838, + "step": 8037 + }, + { + "epoch": 0.10903418339663593, + "grad_norm": 6.248218059539795, + "learning_rate": 9.001370426202548e-06, + "loss": 0.2921, + "step": 8038 + }, + { + "epoch": 0.1090477482365708, + "grad_norm": 7.346706867218018, + "learning_rate": 9.001233383582295e-06, + "loss": 0.4101, + "step": 8039 + }, + { + "epoch": 0.1090613130765057, + "grad_norm": 11.730439186096191, + "learning_rate": 9.00109634096204e-06, + "loss": 0.6363, + "step": 8040 + }, + { + "epoch": 0.10907487791644059, + "grad_norm": 5.425919532775879, + "learning_rate": 9.000959298341786e-06, + "loss": 0.4337, + "step": 8041 + }, + { + "epoch": 0.10908844275637547, + "grad_norm": 5.481659889221191, + "learning_rate": 9.00082225572153e-06, + "loss": 0.4357, + "step": 8042 + }, + { + "epoch": 0.10910200759631036, + "grad_norm": 6.346214294433594, + "learning_rate": 9.000685213101276e-06, + "loss": 0.3268, + "step": 8043 + }, + { + "epoch": 0.10911557243624526, + "grad_norm": 6.189964294433594, + "learning_rate": 9.000548170481021e-06, + "loss": 0.3496, + "step": 8044 + }, + { + "epoch": 0.10912913727618014, + "grad_norm": 5.65019416809082, + "learning_rate": 9.000411127860765e-06, + "loss": 0.3316, + "step": 8045 + }, + { + "epoch": 0.10914270211611503, + "grad_norm": 8.970365524291992, + "learning_rate": 9.00027408524051e-06, + "loss": 0.668, + "step": 8046 + }, + { + "epoch": 0.10915626695604992, + "grad_norm": 6.946298122406006, + "learning_rate": 9.000137042620257e-06, + "loss": 0.5381, + "step": 8047 + }, + { + "epoch": 0.1091698317959848, + "grad_norm": 5.780150890350342, + "learning_rate": 9e-06, + "loss": 0.4451, + "step": 8048 + }, + { + "epoch": 0.1091833966359197, + "grad_norm": 5.251784801483154, + "learning_rate": 8.999862957379745e-06, + "loss": 0.2382, + "step": 8049 + }, + { + "epoch": 0.10919696147585459, + "grad_norm": 7.304770469665527, + "learning_rate": 8.99972591475949e-06, + "loss": 0.4282, + "step": 8050 + }, + { + "epoch": 0.10921052631578948, + "grad_norm": 6.507119178771973, + "learning_rate": 8.999588872139236e-06, + "loss": 0.3639, + "step": 8051 + }, + { + "epoch": 0.10922409115572436, + "grad_norm": 5.356017589569092, + "learning_rate": 8.999451829518981e-06, + "loss": 0.3561, + "step": 8052 + }, + { + "epoch": 0.10923765599565925, + "grad_norm": 6.379340171813965, + "learning_rate": 8.999314786898726e-06, + "loss": 0.6616, + "step": 8053 + }, + { + "epoch": 0.10925122083559415, + "grad_norm": 5.022793769836426, + "learning_rate": 8.999177744278471e-06, + "loss": 0.2731, + "step": 8054 + }, + { + "epoch": 0.10926478567552902, + "grad_norm": 6.404078006744385, + "learning_rate": 8.999040701658216e-06, + "loss": 0.361, + "step": 8055 + }, + { + "epoch": 0.10927835051546392, + "grad_norm": 8.345273971557617, + "learning_rate": 8.998903659037962e-06, + "loss": 0.4017, + "step": 8056 + }, + { + "epoch": 0.10929191535539881, + "grad_norm": 5.637814044952393, + "learning_rate": 8.998766616417707e-06, + "loss": 0.3219, + "step": 8057 + }, + { + "epoch": 0.10930548019533369, + "grad_norm": 5.074803829193115, + "learning_rate": 8.998629573797452e-06, + "loss": 0.272, + "step": 8058 + }, + { + "epoch": 0.10931904503526858, + "grad_norm": 6.718563556671143, + "learning_rate": 8.998492531177197e-06, + "loss": 0.517, + "step": 8059 + }, + { + "epoch": 0.10933260987520348, + "grad_norm": 5.535350799560547, + "learning_rate": 8.998355488556942e-06, + "loss": 0.2421, + "step": 8060 + }, + { + "epoch": 0.10934617471513836, + "grad_norm": 5.2518815994262695, + "learning_rate": 8.998218445936688e-06, + "loss": 0.3377, + "step": 8061 + }, + { + "epoch": 0.10935973955507325, + "grad_norm": 7.318580627441406, + "learning_rate": 8.998081403316433e-06, + "loss": 0.4297, + "step": 8062 + }, + { + "epoch": 0.10937330439500814, + "grad_norm": 5.513238430023193, + "learning_rate": 8.997944360696176e-06, + "loss": 0.2885, + "step": 8063 + }, + { + "epoch": 0.10938686923494302, + "grad_norm": 5.679646015167236, + "learning_rate": 8.997807318075923e-06, + "loss": 0.2974, + "step": 8064 + }, + { + "epoch": 0.10940043407487791, + "grad_norm": 7.7280778884887695, + "learning_rate": 8.997670275455668e-06, + "loss": 0.3965, + "step": 8065 + }, + { + "epoch": 0.10941399891481281, + "grad_norm": 6.0371246337890625, + "learning_rate": 8.997533232835412e-06, + "loss": 0.3722, + "step": 8066 + }, + { + "epoch": 0.1094275637547477, + "grad_norm": 4.833867073059082, + "learning_rate": 8.997396190215157e-06, + "loss": 0.4026, + "step": 8067 + }, + { + "epoch": 0.10944112859468258, + "grad_norm": 6.109691143035889, + "learning_rate": 8.997259147594902e-06, + "loss": 0.4587, + "step": 8068 + }, + { + "epoch": 0.10945469343461747, + "grad_norm": 6.868312358856201, + "learning_rate": 8.997122104974649e-06, + "loss": 0.4803, + "step": 8069 + }, + { + "epoch": 0.10946825827455237, + "grad_norm": 6.885209560394287, + "learning_rate": 8.996985062354392e-06, + "loss": 0.4001, + "step": 8070 + }, + { + "epoch": 0.10948182311448725, + "grad_norm": 5.021782398223877, + "learning_rate": 8.996848019734138e-06, + "loss": 0.2912, + "step": 8071 + }, + { + "epoch": 0.10949538795442214, + "grad_norm": 6.198353290557861, + "learning_rate": 8.996710977113883e-06, + "loss": 0.3823, + "step": 8072 + }, + { + "epoch": 0.10950895279435703, + "grad_norm": 4.271971225738525, + "learning_rate": 8.996573934493628e-06, + "loss": 0.318, + "step": 8073 + }, + { + "epoch": 0.10952251763429191, + "grad_norm": 5.717055320739746, + "learning_rate": 8.996436891873373e-06, + "loss": 0.2725, + "step": 8074 + }, + { + "epoch": 0.1095360824742268, + "grad_norm": 7.004342079162598, + "learning_rate": 8.996299849253118e-06, + "loss": 0.3012, + "step": 8075 + }, + { + "epoch": 0.1095496473141617, + "grad_norm": 8.217470169067383, + "learning_rate": 8.996162806632864e-06, + "loss": 0.5334, + "step": 8076 + }, + { + "epoch": 0.10956321215409658, + "grad_norm": 5.4812331199646, + "learning_rate": 8.996025764012609e-06, + "loss": 0.3522, + "step": 8077 + }, + { + "epoch": 0.10957677699403147, + "grad_norm": 7.788267612457275, + "learning_rate": 8.995888721392354e-06, + "loss": 0.5658, + "step": 8078 + }, + { + "epoch": 0.10959034183396636, + "grad_norm": 7.480042934417725, + "learning_rate": 8.995751678772099e-06, + "loss": 0.296, + "step": 8079 + }, + { + "epoch": 0.10960390667390124, + "grad_norm": 5.165693759918213, + "learning_rate": 8.995614636151844e-06, + "loss": 0.3376, + "step": 8080 + }, + { + "epoch": 0.10961747151383613, + "grad_norm": 6.612802028656006, + "learning_rate": 8.995477593531588e-06, + "loss": 0.3522, + "step": 8081 + }, + { + "epoch": 0.10963103635377103, + "grad_norm": 6.244966983795166, + "learning_rate": 8.995340550911335e-06, + "loss": 0.5173, + "step": 8082 + }, + { + "epoch": 0.10964460119370592, + "grad_norm": 8.00383186340332, + "learning_rate": 8.99520350829108e-06, + "loss": 0.587, + "step": 8083 + }, + { + "epoch": 0.1096581660336408, + "grad_norm": 6.0822553634643555, + "learning_rate": 8.995066465670825e-06, + "loss": 0.3596, + "step": 8084 + }, + { + "epoch": 0.10967173087357569, + "grad_norm": 6.646093368530273, + "learning_rate": 8.994929423050568e-06, + "loss": 0.5646, + "step": 8085 + }, + { + "epoch": 0.10968529571351059, + "grad_norm": 6.077112674713135, + "learning_rate": 8.994792380430315e-06, + "loss": 0.3491, + "step": 8086 + }, + { + "epoch": 0.10969886055344547, + "grad_norm": 7.046772003173828, + "learning_rate": 8.99465533781006e-06, + "loss": 0.4584, + "step": 8087 + }, + { + "epoch": 0.10971242539338036, + "grad_norm": 7.219210147857666, + "learning_rate": 8.994518295189804e-06, + "loss": 0.4261, + "step": 8088 + }, + { + "epoch": 0.10972599023331525, + "grad_norm": 7.7383131980896, + "learning_rate": 8.99438125256955e-06, + "loss": 0.5359, + "step": 8089 + }, + { + "epoch": 0.10973955507325013, + "grad_norm": 6.165370464324951, + "learning_rate": 8.994244209949296e-06, + "loss": 0.4658, + "step": 8090 + }, + { + "epoch": 0.10975311991318502, + "grad_norm": 7.441448211669922, + "learning_rate": 8.99410716732904e-06, + "loss": 0.4156, + "step": 8091 + }, + { + "epoch": 0.10976668475311992, + "grad_norm": 5.151210784912109, + "learning_rate": 8.993970124708785e-06, + "loss": 0.2865, + "step": 8092 + }, + { + "epoch": 0.1097802495930548, + "grad_norm": 7.134624004364014, + "learning_rate": 8.99383308208853e-06, + "loss": 0.4078, + "step": 8093 + }, + { + "epoch": 0.10979381443298969, + "grad_norm": 7.001804828643799, + "learning_rate": 8.993696039468275e-06, + "loss": 0.416, + "step": 8094 + }, + { + "epoch": 0.10980737927292458, + "grad_norm": 6.804263591766357, + "learning_rate": 8.99355899684802e-06, + "loss": 0.3637, + "step": 8095 + }, + { + "epoch": 0.10982094411285946, + "grad_norm": 6.641819000244141, + "learning_rate": 8.993421954227765e-06, + "loss": 0.4043, + "step": 8096 + }, + { + "epoch": 0.10983450895279435, + "grad_norm": 7.2608489990234375, + "learning_rate": 8.99328491160751e-06, + "loss": 0.5608, + "step": 8097 + }, + { + "epoch": 0.10984807379272925, + "grad_norm": 8.451376914978027, + "learning_rate": 8.993147868987256e-06, + "loss": 0.582, + "step": 8098 + }, + { + "epoch": 0.10986163863266414, + "grad_norm": 5.784000396728516, + "learning_rate": 8.993010826367001e-06, + "loss": 0.2489, + "step": 8099 + }, + { + "epoch": 0.10987520347259902, + "grad_norm": 4.993226051330566, + "learning_rate": 8.992873783746746e-06, + "loss": 0.3522, + "step": 8100 + }, + { + "epoch": 0.10988876831253391, + "grad_norm": 8.610145568847656, + "learning_rate": 8.992736741126491e-06, + "loss": 0.4367, + "step": 8101 + }, + { + "epoch": 0.1099023331524688, + "grad_norm": 8.418461799621582, + "learning_rate": 8.992599698506236e-06, + "loss": 0.5476, + "step": 8102 + }, + { + "epoch": 0.10991589799240369, + "grad_norm": 6.486578941345215, + "learning_rate": 8.992462655885982e-06, + "loss": 0.3355, + "step": 8103 + }, + { + "epoch": 0.10992946283233858, + "grad_norm": 7.250716209411621, + "learning_rate": 8.992325613265727e-06, + "loss": 0.4165, + "step": 8104 + }, + { + "epoch": 0.10994302767227347, + "grad_norm": 6.882135391235352, + "learning_rate": 8.992188570645472e-06, + "loss": 0.3501, + "step": 8105 + }, + { + "epoch": 0.10995659251220835, + "grad_norm": 6.74175500869751, + "learning_rate": 8.992051528025216e-06, + "loss": 0.4644, + "step": 8106 + }, + { + "epoch": 0.10997015735214324, + "grad_norm": 8.485845565795898, + "learning_rate": 8.99191448540496e-06, + "loss": 0.4986, + "step": 8107 + }, + { + "epoch": 0.10998372219207814, + "grad_norm": 8.368968963623047, + "learning_rate": 8.991777442784708e-06, + "loss": 0.4423, + "step": 8108 + }, + { + "epoch": 0.10999728703201302, + "grad_norm": 5.877171993255615, + "learning_rate": 8.991640400164453e-06, + "loss": 0.3683, + "step": 8109 + }, + { + "epoch": 0.11001085187194791, + "grad_norm": 6.4308247566223145, + "learning_rate": 8.991503357544196e-06, + "loss": 0.4337, + "step": 8110 + }, + { + "epoch": 0.1100244167118828, + "grad_norm": 7.429412841796875, + "learning_rate": 8.991366314923941e-06, + "loss": 0.3765, + "step": 8111 + }, + { + "epoch": 0.11003798155181768, + "grad_norm": 9.570294380187988, + "learning_rate": 8.991229272303688e-06, + "loss": 0.6415, + "step": 8112 + }, + { + "epoch": 0.11005154639175257, + "grad_norm": 5.709016799926758, + "learning_rate": 8.991092229683432e-06, + "loss": 0.4225, + "step": 8113 + }, + { + "epoch": 0.11006511123168747, + "grad_norm": 9.141570091247559, + "learning_rate": 8.990955187063177e-06, + "loss": 0.5392, + "step": 8114 + }, + { + "epoch": 0.11007867607162236, + "grad_norm": 6.730615615844727, + "learning_rate": 8.990818144442922e-06, + "loss": 0.4971, + "step": 8115 + }, + { + "epoch": 0.11009224091155724, + "grad_norm": 5.680934429168701, + "learning_rate": 8.990681101822667e-06, + "loss": 0.3641, + "step": 8116 + }, + { + "epoch": 0.11010580575149213, + "grad_norm": 6.4823503494262695, + "learning_rate": 8.990544059202412e-06, + "loss": 0.4476, + "step": 8117 + }, + { + "epoch": 0.11011937059142703, + "grad_norm": 7.183440685272217, + "learning_rate": 8.990407016582158e-06, + "loss": 0.5004, + "step": 8118 + }, + { + "epoch": 0.1101329354313619, + "grad_norm": 5.791065216064453, + "learning_rate": 8.990269973961903e-06, + "loss": 0.3167, + "step": 8119 + }, + { + "epoch": 0.1101465002712968, + "grad_norm": 6.458497047424316, + "learning_rate": 8.990132931341648e-06, + "loss": 0.2997, + "step": 8120 + }, + { + "epoch": 0.11016006511123169, + "grad_norm": 5.587310791015625, + "learning_rate": 8.989995888721393e-06, + "loss": 0.4728, + "step": 8121 + }, + { + "epoch": 0.11017362995116657, + "grad_norm": 8.5230073928833, + "learning_rate": 8.989858846101138e-06, + "loss": 0.5908, + "step": 8122 + }, + { + "epoch": 0.11018719479110146, + "grad_norm": 5.075007915496826, + "learning_rate": 8.989721803480884e-06, + "loss": 0.2898, + "step": 8123 + }, + { + "epoch": 0.11020075963103636, + "grad_norm": 6.269895553588867, + "learning_rate": 8.989584760860629e-06, + "loss": 0.4526, + "step": 8124 + }, + { + "epoch": 0.11021432447097124, + "grad_norm": 5.351110935211182, + "learning_rate": 8.989447718240374e-06, + "loss": 0.2794, + "step": 8125 + }, + { + "epoch": 0.11022788931090613, + "grad_norm": 7.5826520919799805, + "learning_rate": 8.989310675620119e-06, + "loss": 0.4004, + "step": 8126 + }, + { + "epoch": 0.11024145415084102, + "grad_norm": 9.087541580200195, + "learning_rate": 8.989173632999864e-06, + "loss": 0.5204, + "step": 8127 + }, + { + "epoch": 0.1102550189907759, + "grad_norm": 5.370659351348877, + "learning_rate": 8.989036590379608e-06, + "loss": 0.4288, + "step": 8128 + }, + { + "epoch": 0.1102685838307108, + "grad_norm": 7.3217854499816895, + "learning_rate": 8.988899547759355e-06, + "loss": 0.4257, + "step": 8129 + }, + { + "epoch": 0.11028214867064569, + "grad_norm": 6.792525291442871, + "learning_rate": 8.9887625051391e-06, + "loss": 0.4492, + "step": 8130 + }, + { + "epoch": 0.11029571351058058, + "grad_norm": 6.912603378295898, + "learning_rate": 8.988625462518843e-06, + "loss": 0.4292, + "step": 8131 + }, + { + "epoch": 0.11030927835051546, + "grad_norm": 5.782046794891357, + "learning_rate": 8.988488419898588e-06, + "loss": 0.2835, + "step": 8132 + }, + { + "epoch": 0.11032284319045035, + "grad_norm": 7.477275848388672, + "learning_rate": 8.988351377278335e-06, + "loss": 0.4341, + "step": 8133 + }, + { + "epoch": 0.11033640803038525, + "grad_norm": 6.080702781677246, + "learning_rate": 8.98821433465808e-06, + "loss": 0.3573, + "step": 8134 + }, + { + "epoch": 0.11034997287032013, + "grad_norm": 6.686983585357666, + "learning_rate": 8.988077292037824e-06, + "loss": 0.4439, + "step": 8135 + }, + { + "epoch": 0.11036353771025502, + "grad_norm": 7.691760540008545, + "learning_rate": 8.98794024941757e-06, + "loss": 0.5601, + "step": 8136 + }, + { + "epoch": 0.11037710255018991, + "grad_norm": 6.750251293182373, + "learning_rate": 8.987803206797314e-06, + "loss": 0.4148, + "step": 8137 + }, + { + "epoch": 0.11039066739012479, + "grad_norm": 6.1616034507751465, + "learning_rate": 8.98766616417706e-06, + "loss": 0.3266, + "step": 8138 + }, + { + "epoch": 0.11040423223005968, + "grad_norm": 5.9207072257995605, + "learning_rate": 8.987529121556805e-06, + "loss": 0.3685, + "step": 8139 + }, + { + "epoch": 0.11041779706999458, + "grad_norm": 6.5431227684021, + "learning_rate": 8.98739207893655e-06, + "loss": 0.6006, + "step": 8140 + }, + { + "epoch": 0.11043136190992946, + "grad_norm": 7.308724880218506, + "learning_rate": 8.987255036316295e-06, + "loss": 0.4441, + "step": 8141 + }, + { + "epoch": 0.11044492674986435, + "grad_norm": 5.561672687530518, + "learning_rate": 8.98711799369604e-06, + "loss": 0.5001, + "step": 8142 + }, + { + "epoch": 0.11045849158979924, + "grad_norm": 7.62369966506958, + "learning_rate": 8.986980951075785e-06, + "loss": 0.6517, + "step": 8143 + }, + { + "epoch": 0.11047205642973412, + "grad_norm": 8.689558982849121, + "learning_rate": 8.98684390845553e-06, + "loss": 0.685, + "step": 8144 + }, + { + "epoch": 0.11048562126966902, + "grad_norm": 6.912662506103516, + "learning_rate": 8.986706865835276e-06, + "loss": 0.4283, + "step": 8145 + }, + { + "epoch": 0.11049918610960391, + "grad_norm": 5.87105131149292, + "learning_rate": 8.986569823215021e-06, + "loss": 0.4582, + "step": 8146 + }, + { + "epoch": 0.1105127509495388, + "grad_norm": 6.885440826416016, + "learning_rate": 8.986432780594766e-06, + "loss": 0.3726, + "step": 8147 + }, + { + "epoch": 0.11052631578947368, + "grad_norm": 6.723699569702148, + "learning_rate": 8.986295737974511e-06, + "loss": 0.3787, + "step": 8148 + }, + { + "epoch": 0.11053988062940857, + "grad_norm": 6.275148391723633, + "learning_rate": 8.986158695354255e-06, + "loss": 0.4381, + "step": 8149 + }, + { + "epoch": 0.11055344546934347, + "grad_norm": 7.25288724899292, + "learning_rate": 8.986021652734e-06, + "loss": 0.5349, + "step": 8150 + }, + { + "epoch": 0.11056701030927835, + "grad_norm": 7.072878837585449, + "learning_rate": 8.985884610113747e-06, + "loss": 0.3795, + "step": 8151 + }, + { + "epoch": 0.11058057514921324, + "grad_norm": 6.639392852783203, + "learning_rate": 8.985747567493492e-06, + "loss": 0.4184, + "step": 8152 + }, + { + "epoch": 0.11059413998914813, + "grad_norm": 8.746598243713379, + "learning_rate": 8.985610524873236e-06, + "loss": 0.5474, + "step": 8153 + }, + { + "epoch": 0.11060770482908301, + "grad_norm": 6.400727272033691, + "learning_rate": 8.98547348225298e-06, + "loss": 0.469, + "step": 8154 + }, + { + "epoch": 0.1106212696690179, + "grad_norm": 6.057180881500244, + "learning_rate": 8.985336439632728e-06, + "loss": 0.2702, + "step": 8155 + }, + { + "epoch": 0.1106348345089528, + "grad_norm": 8.26634407043457, + "learning_rate": 8.985199397012471e-06, + "loss": 0.4194, + "step": 8156 + }, + { + "epoch": 0.11064839934888768, + "grad_norm": 8.678850173950195, + "learning_rate": 8.985062354392216e-06, + "loss": 0.5281, + "step": 8157 + }, + { + "epoch": 0.11066196418882257, + "grad_norm": 9.13522720336914, + "learning_rate": 8.984925311771961e-06, + "loss": 0.5879, + "step": 8158 + }, + { + "epoch": 0.11067552902875746, + "grad_norm": 7.730494022369385, + "learning_rate": 8.984788269151707e-06, + "loss": 0.5084, + "step": 8159 + }, + { + "epoch": 0.11068909386869234, + "grad_norm": 6.134915351867676, + "learning_rate": 8.984651226531452e-06, + "loss": 0.3894, + "step": 8160 + }, + { + "epoch": 0.11070265870862724, + "grad_norm": 8.060019493103027, + "learning_rate": 8.984514183911197e-06, + "loss": 0.6005, + "step": 8161 + }, + { + "epoch": 0.11071622354856213, + "grad_norm": 8.517770767211914, + "learning_rate": 8.984377141290942e-06, + "loss": 0.7226, + "step": 8162 + }, + { + "epoch": 0.11072978838849702, + "grad_norm": 7.629807472229004, + "learning_rate": 8.984240098670687e-06, + "loss": 0.4774, + "step": 8163 + }, + { + "epoch": 0.1107433532284319, + "grad_norm": 5.178627967834473, + "learning_rate": 8.984103056050433e-06, + "loss": 0.3135, + "step": 8164 + }, + { + "epoch": 0.1107569180683668, + "grad_norm": 7.207585334777832, + "learning_rate": 8.983966013430178e-06, + "loss": 0.4463, + "step": 8165 + }, + { + "epoch": 0.11077048290830169, + "grad_norm": 7.037546157836914, + "learning_rate": 8.983828970809923e-06, + "loss": 0.5043, + "step": 8166 + }, + { + "epoch": 0.11078404774823657, + "grad_norm": 9.877249717712402, + "learning_rate": 8.983691928189668e-06, + "loss": 0.4669, + "step": 8167 + }, + { + "epoch": 0.11079761258817146, + "grad_norm": 9.708126068115234, + "learning_rate": 8.983554885569413e-06, + "loss": 0.4067, + "step": 8168 + }, + { + "epoch": 0.11081117742810635, + "grad_norm": 8.873007774353027, + "learning_rate": 8.983417842949158e-06, + "loss": 0.4924, + "step": 8169 + }, + { + "epoch": 0.11082474226804123, + "grad_norm": 7.495248794555664, + "learning_rate": 8.983280800328904e-06, + "loss": 0.5915, + "step": 8170 + }, + { + "epoch": 0.11083830710797613, + "grad_norm": 7.138182640075684, + "learning_rate": 8.983143757708647e-06, + "loss": 0.502, + "step": 8171 + }, + { + "epoch": 0.11085187194791102, + "grad_norm": 10.35352897644043, + "learning_rate": 8.983006715088394e-06, + "loss": 0.7009, + "step": 8172 + }, + { + "epoch": 0.1108654367878459, + "grad_norm": 7.827433109283447, + "learning_rate": 8.982869672468139e-06, + "loss": 0.4784, + "step": 8173 + }, + { + "epoch": 0.11087900162778079, + "grad_norm": 6.2345805168151855, + "learning_rate": 8.982732629847883e-06, + "loss": 0.3918, + "step": 8174 + }, + { + "epoch": 0.11089256646771568, + "grad_norm": 8.143150329589844, + "learning_rate": 8.982595587227628e-06, + "loss": 0.3812, + "step": 8175 + }, + { + "epoch": 0.11090613130765056, + "grad_norm": 8.251053810119629, + "learning_rate": 8.982458544607373e-06, + "loss": 0.5158, + "step": 8176 + }, + { + "epoch": 0.11091969614758546, + "grad_norm": 9.841389656066895, + "learning_rate": 8.98232150198712e-06, + "loss": 0.6059, + "step": 8177 + }, + { + "epoch": 0.11093326098752035, + "grad_norm": 6.02990198135376, + "learning_rate": 8.982184459366863e-06, + "loss": 0.3568, + "step": 8178 + }, + { + "epoch": 0.11094682582745524, + "grad_norm": 9.152480125427246, + "learning_rate": 8.982047416746608e-06, + "loss": 0.6823, + "step": 8179 + }, + { + "epoch": 0.11096039066739012, + "grad_norm": 7.567519187927246, + "learning_rate": 8.981910374126354e-06, + "loss": 0.4606, + "step": 8180 + }, + { + "epoch": 0.11097395550732501, + "grad_norm": 8.613661766052246, + "learning_rate": 8.981773331506099e-06, + "loss": 0.5394, + "step": 8181 + }, + { + "epoch": 0.11098752034725991, + "grad_norm": 9.88564395904541, + "learning_rate": 8.981636288885844e-06, + "loss": 0.6657, + "step": 8182 + }, + { + "epoch": 0.11100108518719479, + "grad_norm": 6.481333255767822, + "learning_rate": 8.98149924626559e-06, + "loss": 0.4085, + "step": 8183 + }, + { + "epoch": 0.11101465002712968, + "grad_norm": 7.755165100097656, + "learning_rate": 8.981362203645334e-06, + "loss": 0.5385, + "step": 8184 + }, + { + "epoch": 0.11102821486706457, + "grad_norm": 7.4163103103637695, + "learning_rate": 8.98122516102508e-06, + "loss": 0.4321, + "step": 8185 + }, + { + "epoch": 0.11104177970699945, + "grad_norm": 6.633236885070801, + "learning_rate": 8.981088118404825e-06, + "loss": 0.5195, + "step": 8186 + }, + { + "epoch": 0.11105534454693435, + "grad_norm": 7.787785053253174, + "learning_rate": 8.98095107578457e-06, + "loss": 0.496, + "step": 8187 + }, + { + "epoch": 0.11106890938686924, + "grad_norm": 8.917125701904297, + "learning_rate": 8.980814033164315e-06, + "loss": 0.5431, + "step": 8188 + }, + { + "epoch": 0.11108247422680412, + "grad_norm": 8.055899620056152, + "learning_rate": 8.980676990544059e-06, + "loss": 0.5956, + "step": 8189 + }, + { + "epoch": 0.11109603906673901, + "grad_norm": 7.057995319366455, + "learning_rate": 8.980539947923805e-06, + "loss": 0.4864, + "step": 8190 + }, + { + "epoch": 0.1111096039066739, + "grad_norm": 7.074292182922363, + "learning_rate": 8.98040290530355e-06, + "loss": 0.5058, + "step": 8191 + }, + { + "epoch": 0.1111231687466088, + "grad_norm": 5.992422580718994, + "learning_rate": 8.980265862683296e-06, + "loss": 0.6034, + "step": 8192 + }, + { + "epoch": 0.11113673358654368, + "grad_norm": 7.536715507507324, + "learning_rate": 8.98012882006304e-06, + "loss": 0.4364, + "step": 8193 + }, + { + "epoch": 0.11115029842647857, + "grad_norm": 7.123959064483643, + "learning_rate": 8.979991777442786e-06, + "loss": 0.5493, + "step": 8194 + }, + { + "epoch": 0.11116386326641346, + "grad_norm": 8.313048362731934, + "learning_rate": 8.979854734822531e-06, + "loss": 0.6458, + "step": 8195 + }, + { + "epoch": 0.11117742810634834, + "grad_norm": 8.026615142822266, + "learning_rate": 8.979717692202275e-06, + "loss": 0.4076, + "step": 8196 + }, + { + "epoch": 0.11119099294628323, + "grad_norm": 9.584028244018555, + "learning_rate": 8.97958064958202e-06, + "loss": 0.6605, + "step": 8197 + }, + { + "epoch": 0.11120455778621813, + "grad_norm": 7.844478130340576, + "learning_rate": 8.979443606961767e-06, + "loss": 0.5097, + "step": 8198 + }, + { + "epoch": 0.11121812262615301, + "grad_norm": 7.177884578704834, + "learning_rate": 8.97930656434151e-06, + "loss": 0.6406, + "step": 8199 + }, + { + "epoch": 0.1112316874660879, + "grad_norm": 6.159613609313965, + "learning_rate": 8.979169521721256e-06, + "loss": 0.3833, + "step": 8200 + }, + { + "epoch": 0.1112452523060228, + "grad_norm": 10.567780494689941, + "learning_rate": 8.979032479101e-06, + "loss": 0.556, + "step": 8201 + }, + { + "epoch": 0.11125881714595767, + "grad_norm": 7.091883182525635, + "learning_rate": 8.978895436480748e-06, + "loss": 0.4037, + "step": 8202 + }, + { + "epoch": 0.11127238198589257, + "grad_norm": 4.729300498962402, + "learning_rate": 8.978758393860491e-06, + "loss": 0.3745, + "step": 8203 + }, + { + "epoch": 0.11128594682582746, + "grad_norm": 6.985494136810303, + "learning_rate": 8.978621351240236e-06, + "loss": 0.4645, + "step": 8204 + }, + { + "epoch": 0.11129951166576234, + "grad_norm": 9.256119728088379, + "learning_rate": 8.978484308619981e-06, + "loss": 0.7567, + "step": 8205 + }, + { + "epoch": 0.11131307650569723, + "grad_norm": 9.502653121948242, + "learning_rate": 8.978347265999727e-06, + "loss": 0.5219, + "step": 8206 + }, + { + "epoch": 0.11132664134563212, + "grad_norm": 7.369841575622559, + "learning_rate": 8.978210223379472e-06, + "loss": 0.4629, + "step": 8207 + }, + { + "epoch": 0.11134020618556702, + "grad_norm": 8.080727577209473, + "learning_rate": 8.978073180759217e-06, + "loss": 0.7299, + "step": 8208 + }, + { + "epoch": 0.1113537710255019, + "grad_norm": 6.13051700592041, + "learning_rate": 8.977936138138962e-06, + "loss": 0.4482, + "step": 8209 + }, + { + "epoch": 0.11136733586543679, + "grad_norm": 7.078986644744873, + "learning_rate": 8.977799095518707e-06, + "loss": 0.5384, + "step": 8210 + }, + { + "epoch": 0.11138090070537168, + "grad_norm": 9.305482864379883, + "learning_rate": 8.977662052898453e-06, + "loss": 0.8137, + "step": 8211 + }, + { + "epoch": 0.11139446554530656, + "grad_norm": 7.055825710296631, + "learning_rate": 8.977525010278198e-06, + "loss": 0.4351, + "step": 8212 + }, + { + "epoch": 0.11140803038524145, + "grad_norm": 7.0651397705078125, + "learning_rate": 8.977387967657943e-06, + "loss": 0.568, + "step": 8213 + }, + { + "epoch": 0.11142159522517635, + "grad_norm": 7.613426208496094, + "learning_rate": 8.977250925037686e-06, + "loss": 0.5091, + "step": 8214 + }, + { + "epoch": 0.11143516006511123, + "grad_norm": 6.897419452667236, + "learning_rate": 8.977113882417433e-06, + "loss": 0.4008, + "step": 8215 + }, + { + "epoch": 0.11144872490504612, + "grad_norm": 6.627753257751465, + "learning_rate": 8.976976839797178e-06, + "loss": 0.3849, + "step": 8216 + }, + { + "epoch": 0.11146228974498101, + "grad_norm": 8.115882873535156, + "learning_rate": 8.976839797176924e-06, + "loss": 0.4478, + "step": 8217 + }, + { + "epoch": 0.11147585458491589, + "grad_norm": 9.665942192077637, + "learning_rate": 8.976702754556667e-06, + "loss": 0.5848, + "step": 8218 + }, + { + "epoch": 0.11148941942485079, + "grad_norm": 7.184301853179932, + "learning_rate": 8.976565711936412e-06, + "loss": 0.5183, + "step": 8219 + }, + { + "epoch": 0.11150298426478568, + "grad_norm": 6.688892364501953, + "learning_rate": 8.976428669316159e-06, + "loss": 0.4483, + "step": 8220 + }, + { + "epoch": 0.11151654910472056, + "grad_norm": 6.974501132965088, + "learning_rate": 8.976291626695903e-06, + "loss": 0.3754, + "step": 8221 + }, + { + "epoch": 0.11153011394465545, + "grad_norm": 5.824441909790039, + "learning_rate": 8.976154584075648e-06, + "loss": 0.3574, + "step": 8222 + }, + { + "epoch": 0.11154367878459034, + "grad_norm": 7.335148811340332, + "learning_rate": 8.976017541455393e-06, + "loss": 0.5356, + "step": 8223 + }, + { + "epoch": 0.11155724362452524, + "grad_norm": 7.658132553100586, + "learning_rate": 8.975880498835138e-06, + "loss": 0.4987, + "step": 8224 + }, + { + "epoch": 0.11157080846446012, + "grad_norm": 8.850707054138184, + "learning_rate": 8.975743456214883e-06, + "loss": 0.5621, + "step": 8225 + }, + { + "epoch": 0.11158437330439501, + "grad_norm": 6.548914432525635, + "learning_rate": 8.975606413594629e-06, + "loss": 0.4314, + "step": 8226 + }, + { + "epoch": 0.1115979381443299, + "grad_norm": 6.917397499084473, + "learning_rate": 8.975469370974374e-06, + "loss": 0.5412, + "step": 8227 + }, + { + "epoch": 0.11161150298426478, + "grad_norm": 7.306714057922363, + "learning_rate": 8.975332328354119e-06, + "loss": 0.5591, + "step": 8228 + }, + { + "epoch": 0.11162506782419968, + "grad_norm": 6.210080146789551, + "learning_rate": 8.975195285733864e-06, + "loss": 0.423, + "step": 8229 + }, + { + "epoch": 0.11163863266413457, + "grad_norm": 8.907835006713867, + "learning_rate": 8.97505824311361e-06, + "loss": 0.491, + "step": 8230 + }, + { + "epoch": 0.11165219750406945, + "grad_norm": 7.591681003570557, + "learning_rate": 8.974921200493354e-06, + "loss": 0.3927, + "step": 8231 + }, + { + "epoch": 0.11166576234400434, + "grad_norm": 6.334350109100342, + "learning_rate": 8.9747841578731e-06, + "loss": 0.3726, + "step": 8232 + }, + { + "epoch": 0.11167932718393923, + "grad_norm": 7.2389092445373535, + "learning_rate": 8.974647115252845e-06, + "loss": 0.3878, + "step": 8233 + }, + { + "epoch": 0.11169289202387411, + "grad_norm": 7.342161655426025, + "learning_rate": 8.97451007263259e-06, + "loss": 0.5676, + "step": 8234 + }, + { + "epoch": 0.111706456863809, + "grad_norm": 6.798770427703857, + "learning_rate": 8.974373030012335e-06, + "loss": 0.4458, + "step": 8235 + }, + { + "epoch": 0.1117200217037439, + "grad_norm": 7.904015064239502, + "learning_rate": 8.974235987392079e-06, + "loss": 0.722, + "step": 8236 + }, + { + "epoch": 0.11173358654367878, + "grad_norm": 7.646467208862305, + "learning_rate": 8.974098944771825e-06, + "loss": 0.4519, + "step": 8237 + }, + { + "epoch": 0.11174715138361367, + "grad_norm": 6.54522180557251, + "learning_rate": 8.97396190215157e-06, + "loss": 0.4838, + "step": 8238 + }, + { + "epoch": 0.11176071622354856, + "grad_norm": 5.519606113433838, + "learning_rate": 8.973824859531314e-06, + "loss": 0.3386, + "step": 8239 + }, + { + "epoch": 0.11177428106348346, + "grad_norm": 6.7880377769470215, + "learning_rate": 8.97368781691106e-06, + "loss": 0.5005, + "step": 8240 + }, + { + "epoch": 0.11178784590341834, + "grad_norm": 5.924287796020508, + "learning_rate": 8.973550774290806e-06, + "loss": 0.381, + "step": 8241 + }, + { + "epoch": 0.11180141074335323, + "grad_norm": 8.382894515991211, + "learning_rate": 8.97341373167055e-06, + "loss": 0.6, + "step": 8242 + }, + { + "epoch": 0.11181497558328812, + "grad_norm": 6.758570194244385, + "learning_rate": 8.973276689050295e-06, + "loss": 0.3501, + "step": 8243 + }, + { + "epoch": 0.111828540423223, + "grad_norm": 10.115397453308105, + "learning_rate": 8.97313964643004e-06, + "loss": 0.7729, + "step": 8244 + }, + { + "epoch": 0.1118421052631579, + "grad_norm": 7.742098808288574, + "learning_rate": 8.973002603809785e-06, + "loss": 0.5702, + "step": 8245 + }, + { + "epoch": 0.11185567010309279, + "grad_norm": 6.910059452056885, + "learning_rate": 8.97286556118953e-06, + "loss": 0.5774, + "step": 8246 + }, + { + "epoch": 0.11186923494302767, + "grad_norm": 6.298202991485596, + "learning_rate": 8.972728518569276e-06, + "loss": 0.3811, + "step": 8247 + }, + { + "epoch": 0.11188279978296256, + "grad_norm": 6.376705646514893, + "learning_rate": 8.97259147594902e-06, + "loss": 0.455, + "step": 8248 + }, + { + "epoch": 0.11189636462289745, + "grad_norm": 6.498262882232666, + "learning_rate": 8.972454433328766e-06, + "loss": 0.3439, + "step": 8249 + }, + { + "epoch": 0.11190992946283233, + "grad_norm": 5.871890068054199, + "learning_rate": 8.972317390708511e-06, + "loss": 0.513, + "step": 8250 + }, + { + "epoch": 0.11192349430276723, + "grad_norm": 7.096700668334961, + "learning_rate": 8.972180348088256e-06, + "loss": 0.3916, + "step": 8251 + }, + { + "epoch": 0.11193705914270212, + "grad_norm": 6.89772891998291, + "learning_rate": 8.972043305468001e-06, + "loss": 0.4277, + "step": 8252 + }, + { + "epoch": 0.111950623982637, + "grad_norm": 8.329580307006836, + "learning_rate": 8.971906262847747e-06, + "loss": 0.6152, + "step": 8253 + }, + { + "epoch": 0.11196418882257189, + "grad_norm": 5.914628982543945, + "learning_rate": 8.971769220227492e-06, + "loss": 0.5394, + "step": 8254 + }, + { + "epoch": 0.11197775366250678, + "grad_norm": 6.128572463989258, + "learning_rate": 8.971632177607237e-06, + "loss": 0.3583, + "step": 8255 + }, + { + "epoch": 0.11199131850244168, + "grad_norm": 6.694571495056152, + "learning_rate": 8.971495134986982e-06, + "loss": 0.4883, + "step": 8256 + }, + { + "epoch": 0.11200488334237656, + "grad_norm": 6.843101978302002, + "learning_rate": 8.971358092366726e-06, + "loss": 0.4817, + "step": 8257 + }, + { + "epoch": 0.11201844818231145, + "grad_norm": 7.291236877441406, + "learning_rate": 8.971221049746471e-06, + "loss": 0.496, + "step": 8258 + }, + { + "epoch": 0.11203201302224634, + "grad_norm": 6.2700324058532715, + "learning_rate": 8.971084007126218e-06, + "loss": 0.3647, + "step": 8259 + }, + { + "epoch": 0.11204557786218122, + "grad_norm": 7.0316548347473145, + "learning_rate": 8.970946964505963e-06, + "loss": 0.4732, + "step": 8260 + }, + { + "epoch": 0.11205914270211612, + "grad_norm": 5.483545780181885, + "learning_rate": 8.970809921885706e-06, + "loss": 0.3692, + "step": 8261 + }, + { + "epoch": 0.11207270754205101, + "grad_norm": 4.794990539550781, + "learning_rate": 8.970672879265452e-06, + "loss": 0.3539, + "step": 8262 + }, + { + "epoch": 0.11208627238198589, + "grad_norm": 8.039072036743164, + "learning_rate": 8.970535836645198e-06, + "loss": 0.4962, + "step": 8263 + }, + { + "epoch": 0.11209983722192078, + "grad_norm": 5.283228874206543, + "learning_rate": 8.970398794024942e-06, + "loss": 0.3839, + "step": 8264 + }, + { + "epoch": 0.11211340206185567, + "grad_norm": 8.835135459899902, + "learning_rate": 8.970261751404687e-06, + "loss": 0.6153, + "step": 8265 + }, + { + "epoch": 0.11212696690179055, + "grad_norm": 5.2080254554748535, + "learning_rate": 8.970124708784432e-06, + "loss": 0.3055, + "step": 8266 + }, + { + "epoch": 0.11214053174172545, + "grad_norm": 5.243231773376465, + "learning_rate": 8.969987666164177e-06, + "loss": 0.4371, + "step": 8267 + }, + { + "epoch": 0.11215409658166034, + "grad_norm": 5.315313816070557, + "learning_rate": 8.969850623543923e-06, + "loss": 0.4904, + "step": 8268 + }, + { + "epoch": 0.11216766142159522, + "grad_norm": 6.489245891571045, + "learning_rate": 8.969713580923668e-06, + "loss": 0.4523, + "step": 8269 + }, + { + "epoch": 0.11218122626153011, + "grad_norm": 6.258513927459717, + "learning_rate": 8.969576538303413e-06, + "loss": 0.5355, + "step": 8270 + }, + { + "epoch": 0.112194791101465, + "grad_norm": 10.499337196350098, + "learning_rate": 8.969439495683158e-06, + "loss": 0.6127, + "step": 8271 + }, + { + "epoch": 0.1122083559413999, + "grad_norm": 8.055474281311035, + "learning_rate": 8.969302453062903e-06, + "loss": 0.5224, + "step": 8272 + }, + { + "epoch": 0.11222192078133478, + "grad_norm": 7.14866304397583, + "learning_rate": 8.969165410442649e-06, + "loss": 0.3166, + "step": 8273 + }, + { + "epoch": 0.11223548562126967, + "grad_norm": 6.142755031585693, + "learning_rate": 8.969028367822394e-06, + "loss": 0.3691, + "step": 8274 + }, + { + "epoch": 0.11224905046120456, + "grad_norm": 5.231932163238525, + "learning_rate": 8.968891325202139e-06, + "loss": 0.3258, + "step": 8275 + }, + { + "epoch": 0.11226261530113944, + "grad_norm": 4.743508815765381, + "learning_rate": 8.968754282581884e-06, + "loss": 0.3107, + "step": 8276 + }, + { + "epoch": 0.11227618014107434, + "grad_norm": 7.781587600708008, + "learning_rate": 8.96861723996163e-06, + "loss": 0.3915, + "step": 8277 + }, + { + "epoch": 0.11228974498100923, + "grad_norm": 6.297904014587402, + "learning_rate": 8.968480197341374e-06, + "loss": 0.4232, + "step": 8278 + }, + { + "epoch": 0.11230330982094411, + "grad_norm": 5.519118309020996, + "learning_rate": 8.968343154721118e-06, + "loss": 0.4016, + "step": 8279 + }, + { + "epoch": 0.112316874660879, + "grad_norm": 6.020432949066162, + "learning_rate": 8.968206112100865e-06, + "loss": 0.4287, + "step": 8280 + }, + { + "epoch": 0.1123304395008139, + "grad_norm": 5.463769912719727, + "learning_rate": 8.96806906948061e-06, + "loss": 0.4079, + "step": 8281 + }, + { + "epoch": 0.11234400434074877, + "grad_norm": 4.924180030822754, + "learning_rate": 8.967932026860353e-06, + "loss": 0.3063, + "step": 8282 + }, + { + "epoch": 0.11235756918068367, + "grad_norm": 5.264253616333008, + "learning_rate": 8.967794984240099e-06, + "loss": 0.3795, + "step": 8283 + }, + { + "epoch": 0.11237113402061856, + "grad_norm": 4.820040225982666, + "learning_rate": 8.967657941619846e-06, + "loss": 0.3404, + "step": 8284 + }, + { + "epoch": 0.11238469886055344, + "grad_norm": 6.68788480758667, + "learning_rate": 8.96752089899959e-06, + "loss": 0.4868, + "step": 8285 + }, + { + "epoch": 0.11239826370048833, + "grad_norm": 5.885106086730957, + "learning_rate": 8.967383856379334e-06, + "loss": 0.4401, + "step": 8286 + }, + { + "epoch": 0.11241182854042323, + "grad_norm": 4.297672748565674, + "learning_rate": 8.96724681375908e-06, + "loss": 0.2285, + "step": 8287 + }, + { + "epoch": 0.11242539338035812, + "grad_norm": 5.189555644989014, + "learning_rate": 8.967109771138825e-06, + "loss": 0.3196, + "step": 8288 + }, + { + "epoch": 0.112438958220293, + "grad_norm": 6.216172695159912, + "learning_rate": 8.96697272851857e-06, + "loss": 0.4026, + "step": 8289 + }, + { + "epoch": 0.11245252306022789, + "grad_norm": 6.3088579177856445, + "learning_rate": 8.966835685898315e-06, + "loss": 0.3636, + "step": 8290 + }, + { + "epoch": 0.11246608790016278, + "grad_norm": 5.753146648406982, + "learning_rate": 8.96669864327806e-06, + "loss": 0.4288, + "step": 8291 + }, + { + "epoch": 0.11247965274009766, + "grad_norm": 5.191280841827393, + "learning_rate": 8.966561600657805e-06, + "loss": 0.2299, + "step": 8292 + }, + { + "epoch": 0.11249321758003256, + "grad_norm": 4.080352783203125, + "learning_rate": 8.96642455803755e-06, + "loss": 0.2267, + "step": 8293 + }, + { + "epoch": 0.11250678241996745, + "grad_norm": 8.134138107299805, + "learning_rate": 8.966287515417296e-06, + "loss": 0.4069, + "step": 8294 + }, + { + "epoch": 0.11252034725990233, + "grad_norm": 7.030117988586426, + "learning_rate": 8.96615047279704e-06, + "loss": 0.4652, + "step": 8295 + }, + { + "epoch": 0.11253391209983722, + "grad_norm": 5.407596588134766, + "learning_rate": 8.966013430176786e-06, + "loss": 0.3945, + "step": 8296 + }, + { + "epoch": 0.11254747693977211, + "grad_norm": 6.650293827056885, + "learning_rate": 8.965876387556531e-06, + "loss": 0.506, + "step": 8297 + }, + { + "epoch": 0.112561041779707, + "grad_norm": 4.9975996017456055, + "learning_rate": 8.965739344936276e-06, + "loss": 0.3287, + "step": 8298 + }, + { + "epoch": 0.11257460661964189, + "grad_norm": 4.384162902832031, + "learning_rate": 8.965602302316021e-06, + "loss": 0.256, + "step": 8299 + }, + { + "epoch": 0.11258817145957678, + "grad_norm": 6.628147602081299, + "learning_rate": 8.965465259695767e-06, + "loss": 0.3572, + "step": 8300 + }, + { + "epoch": 0.11260173629951166, + "grad_norm": 8.887904167175293, + "learning_rate": 8.96532821707551e-06, + "loss": 0.5277, + "step": 8301 + }, + { + "epoch": 0.11261530113944655, + "grad_norm": 6.017417907714844, + "learning_rate": 8.965191174455257e-06, + "loss": 0.407, + "step": 8302 + }, + { + "epoch": 0.11262886597938145, + "grad_norm": 7.602184772491455, + "learning_rate": 8.965054131835002e-06, + "loss": 0.4128, + "step": 8303 + }, + { + "epoch": 0.11264243081931634, + "grad_norm": 4.909132957458496, + "learning_rate": 8.964917089214746e-06, + "loss": 0.4055, + "step": 8304 + }, + { + "epoch": 0.11265599565925122, + "grad_norm": 5.3864336013793945, + "learning_rate": 8.964780046594491e-06, + "loss": 0.3994, + "step": 8305 + }, + { + "epoch": 0.11266956049918611, + "grad_norm": 6.183436393737793, + "learning_rate": 8.964643003974238e-06, + "loss": 0.4099, + "step": 8306 + }, + { + "epoch": 0.112683125339121, + "grad_norm": 4.554820537567139, + "learning_rate": 8.964505961353981e-06, + "loss": 0.3785, + "step": 8307 + }, + { + "epoch": 0.11269669017905588, + "grad_norm": 5.347585201263428, + "learning_rate": 8.964368918733726e-06, + "loss": 0.2802, + "step": 8308 + }, + { + "epoch": 0.11271025501899078, + "grad_norm": 5.787963390350342, + "learning_rate": 8.964231876113472e-06, + "loss": 0.3772, + "step": 8309 + }, + { + "epoch": 0.11272381985892567, + "grad_norm": 5.89744234085083, + "learning_rate": 8.964094833493218e-06, + "loss": 0.2626, + "step": 8310 + }, + { + "epoch": 0.11273738469886055, + "grad_norm": 8.740042686462402, + "learning_rate": 8.963957790872962e-06, + "loss": 0.4226, + "step": 8311 + }, + { + "epoch": 0.11275094953879544, + "grad_norm": 6.954565048217773, + "learning_rate": 8.963820748252707e-06, + "loss": 0.405, + "step": 8312 + }, + { + "epoch": 0.11276451437873034, + "grad_norm": 5.863924503326416, + "learning_rate": 8.963683705632452e-06, + "loss": 0.3782, + "step": 8313 + }, + { + "epoch": 0.11277807921866521, + "grad_norm": 6.321284294128418, + "learning_rate": 8.963546663012197e-06, + "loss": 0.4133, + "step": 8314 + }, + { + "epoch": 0.11279164405860011, + "grad_norm": 6.586554050445557, + "learning_rate": 8.963409620391943e-06, + "loss": 0.355, + "step": 8315 + }, + { + "epoch": 0.112805208898535, + "grad_norm": 4.738223075866699, + "learning_rate": 8.963272577771688e-06, + "loss": 0.3581, + "step": 8316 + }, + { + "epoch": 0.11281877373846988, + "grad_norm": 6.275232315063477, + "learning_rate": 8.963135535151433e-06, + "loss": 0.3728, + "step": 8317 + }, + { + "epoch": 0.11283233857840477, + "grad_norm": 5.881433010101318, + "learning_rate": 8.962998492531178e-06, + "loss": 0.4443, + "step": 8318 + }, + { + "epoch": 0.11284590341833967, + "grad_norm": 6.449366569519043, + "learning_rate": 8.962861449910923e-06, + "loss": 0.5196, + "step": 8319 + }, + { + "epoch": 0.11285946825827456, + "grad_norm": 9.943666458129883, + "learning_rate": 8.962724407290669e-06, + "loss": 0.5446, + "step": 8320 + }, + { + "epoch": 0.11287303309820944, + "grad_norm": 5.7048211097717285, + "learning_rate": 8.962587364670414e-06, + "loss": 0.3537, + "step": 8321 + }, + { + "epoch": 0.11288659793814433, + "grad_norm": 7.913763523101807, + "learning_rate": 8.962450322050157e-06, + "loss": 0.6023, + "step": 8322 + }, + { + "epoch": 0.11290016277807922, + "grad_norm": 8.065559387207031, + "learning_rate": 8.962313279429904e-06, + "loss": 0.4135, + "step": 8323 + }, + { + "epoch": 0.1129137276180141, + "grad_norm": 6.109066486358643, + "learning_rate": 8.96217623680965e-06, + "loss": 0.3281, + "step": 8324 + }, + { + "epoch": 0.112927292457949, + "grad_norm": 6.949713706970215, + "learning_rate": 8.962039194189394e-06, + "loss": 0.4166, + "step": 8325 + }, + { + "epoch": 0.11294085729788389, + "grad_norm": 6.88997745513916, + "learning_rate": 8.961902151569138e-06, + "loss": 0.3435, + "step": 8326 + }, + { + "epoch": 0.11295442213781877, + "grad_norm": 6.275869369506836, + "learning_rate": 8.961765108948883e-06, + "loss": 0.3766, + "step": 8327 + }, + { + "epoch": 0.11296798697775366, + "grad_norm": 6.755871295928955, + "learning_rate": 8.96162806632863e-06, + "loss": 0.4717, + "step": 8328 + }, + { + "epoch": 0.11298155181768856, + "grad_norm": 4.438249111175537, + "learning_rate": 8.961491023708373e-06, + "loss": 0.3402, + "step": 8329 + }, + { + "epoch": 0.11299511665762343, + "grad_norm": 6.413817405700684, + "learning_rate": 8.961353981088119e-06, + "loss": 0.3636, + "step": 8330 + }, + { + "epoch": 0.11300868149755833, + "grad_norm": 6.51201057434082, + "learning_rate": 8.961216938467864e-06, + "loss": 0.4217, + "step": 8331 + }, + { + "epoch": 0.11302224633749322, + "grad_norm": 6.248153209686279, + "learning_rate": 8.961079895847609e-06, + "loss": 0.4101, + "step": 8332 + }, + { + "epoch": 0.1130358111774281, + "grad_norm": 5.246577262878418, + "learning_rate": 8.960942853227354e-06, + "loss": 0.4865, + "step": 8333 + }, + { + "epoch": 0.11304937601736299, + "grad_norm": 6.23301362991333, + "learning_rate": 8.9608058106071e-06, + "loss": 0.4322, + "step": 8334 + }, + { + "epoch": 0.11306294085729789, + "grad_norm": 6.637114524841309, + "learning_rate": 8.960668767986845e-06, + "loss": 0.5327, + "step": 8335 + }, + { + "epoch": 0.11307650569723278, + "grad_norm": 6.208375930786133, + "learning_rate": 8.96053172536659e-06, + "loss": 0.357, + "step": 8336 + }, + { + "epoch": 0.11309007053716766, + "grad_norm": 8.504342079162598, + "learning_rate": 8.960394682746335e-06, + "loss": 0.5028, + "step": 8337 + }, + { + "epoch": 0.11310363537710255, + "grad_norm": 7.170156478881836, + "learning_rate": 8.96025764012608e-06, + "loss": 0.5765, + "step": 8338 + }, + { + "epoch": 0.11311720021703744, + "grad_norm": 5.024808406829834, + "learning_rate": 8.960120597505825e-06, + "loss": 0.367, + "step": 8339 + }, + { + "epoch": 0.11313076505697232, + "grad_norm": 6.265191078186035, + "learning_rate": 8.959983554885569e-06, + "loss": 0.3077, + "step": 8340 + }, + { + "epoch": 0.11314432989690722, + "grad_norm": 5.695991516113281, + "learning_rate": 8.959846512265316e-06, + "loss": 0.3778, + "step": 8341 + }, + { + "epoch": 0.11315789473684211, + "grad_norm": 6.507431507110596, + "learning_rate": 8.95970946964506e-06, + "loss": 0.3417, + "step": 8342 + }, + { + "epoch": 0.11317145957677699, + "grad_norm": 4.566043376922607, + "learning_rate": 8.959572427024806e-06, + "loss": 0.2781, + "step": 8343 + }, + { + "epoch": 0.11318502441671188, + "grad_norm": 7.40027379989624, + "learning_rate": 8.95943538440455e-06, + "loss": 0.3299, + "step": 8344 + }, + { + "epoch": 0.11319858925664678, + "grad_norm": 6.33690881729126, + "learning_rate": 8.959298341784296e-06, + "loss": 0.496, + "step": 8345 + }, + { + "epoch": 0.11321215409658165, + "grad_norm": 6.977497577667236, + "learning_rate": 8.959161299164042e-06, + "loss": 0.4556, + "step": 8346 + }, + { + "epoch": 0.11322571893651655, + "grad_norm": 8.431841850280762, + "learning_rate": 8.959024256543785e-06, + "loss": 0.4289, + "step": 8347 + }, + { + "epoch": 0.11323928377645144, + "grad_norm": 7.561805725097656, + "learning_rate": 8.95888721392353e-06, + "loss": 0.6006, + "step": 8348 + }, + { + "epoch": 0.11325284861638632, + "grad_norm": 6.46866512298584, + "learning_rate": 8.958750171303277e-06, + "loss": 0.3741, + "step": 8349 + }, + { + "epoch": 0.11326641345632121, + "grad_norm": 6.815622329711914, + "learning_rate": 8.95861312868302e-06, + "loss": 0.4469, + "step": 8350 + }, + { + "epoch": 0.1132799782962561, + "grad_norm": 5.201397895812988, + "learning_rate": 8.958476086062766e-06, + "loss": 0.3749, + "step": 8351 + }, + { + "epoch": 0.113293543136191, + "grad_norm": 7.811657905578613, + "learning_rate": 8.958339043442511e-06, + "loss": 0.3179, + "step": 8352 + }, + { + "epoch": 0.11330710797612588, + "grad_norm": 8.911375999450684, + "learning_rate": 8.958202000822258e-06, + "loss": 0.5212, + "step": 8353 + }, + { + "epoch": 0.11332067281606077, + "grad_norm": 5.220572471618652, + "learning_rate": 8.958064958202001e-06, + "loss": 0.3915, + "step": 8354 + }, + { + "epoch": 0.11333423765599566, + "grad_norm": 5.690229415893555, + "learning_rate": 8.957927915581746e-06, + "loss": 0.3845, + "step": 8355 + }, + { + "epoch": 0.11334780249593054, + "grad_norm": 7.775482654571533, + "learning_rate": 8.957790872961492e-06, + "loss": 0.481, + "step": 8356 + }, + { + "epoch": 0.11336136733586544, + "grad_norm": 7.252620697021484, + "learning_rate": 8.957653830341237e-06, + "loss": 0.6003, + "step": 8357 + }, + { + "epoch": 0.11337493217580033, + "grad_norm": 6.260082244873047, + "learning_rate": 8.957516787720982e-06, + "loss": 0.4395, + "step": 8358 + }, + { + "epoch": 0.11338849701573521, + "grad_norm": 6.594254493713379, + "learning_rate": 8.957379745100727e-06, + "loss": 0.4659, + "step": 8359 + }, + { + "epoch": 0.1134020618556701, + "grad_norm": 6.842567443847656, + "learning_rate": 8.957242702480472e-06, + "loss": 0.4113, + "step": 8360 + }, + { + "epoch": 0.113415626695605, + "grad_norm": 6.782833576202393, + "learning_rate": 8.957105659860218e-06, + "loss": 0.3718, + "step": 8361 + }, + { + "epoch": 0.11342919153553987, + "grad_norm": 7.524899959564209, + "learning_rate": 8.956968617239963e-06, + "loss": 0.7135, + "step": 8362 + }, + { + "epoch": 0.11344275637547477, + "grad_norm": 9.660250663757324, + "learning_rate": 8.956831574619708e-06, + "loss": 0.5347, + "step": 8363 + }, + { + "epoch": 0.11345632121540966, + "grad_norm": 7.660803318023682, + "learning_rate": 8.956694531999453e-06, + "loss": 0.6332, + "step": 8364 + }, + { + "epoch": 0.11346988605534454, + "grad_norm": 4.697131633758545, + "learning_rate": 8.956557489379197e-06, + "loss": 0.3507, + "step": 8365 + }, + { + "epoch": 0.11348345089527943, + "grad_norm": 5.4033708572387695, + "learning_rate": 8.956420446758943e-06, + "loss": 0.3379, + "step": 8366 + }, + { + "epoch": 0.11349701573521433, + "grad_norm": 6.928624629974365, + "learning_rate": 8.956283404138689e-06, + "loss": 0.357, + "step": 8367 + }, + { + "epoch": 0.11351058057514922, + "grad_norm": 6.802042484283447, + "learning_rate": 8.956146361518434e-06, + "loss": 0.518, + "step": 8368 + }, + { + "epoch": 0.1135241454150841, + "grad_norm": 5.62752628326416, + "learning_rate": 8.956009318898177e-06, + "loss": 0.3082, + "step": 8369 + }, + { + "epoch": 0.11353771025501899, + "grad_norm": 5.4324727058410645, + "learning_rate": 8.955872276277922e-06, + "loss": 0.3692, + "step": 8370 + }, + { + "epoch": 0.11355127509495389, + "grad_norm": 7.140956401824951, + "learning_rate": 8.95573523365767e-06, + "loss": 0.4298, + "step": 8371 + }, + { + "epoch": 0.11356483993488876, + "grad_norm": 7.320451736450195, + "learning_rate": 8.955598191037413e-06, + "loss": 0.3834, + "step": 8372 + }, + { + "epoch": 0.11357840477482366, + "grad_norm": 7.000586986541748, + "learning_rate": 8.955461148417158e-06, + "loss": 0.5261, + "step": 8373 + }, + { + "epoch": 0.11359196961475855, + "grad_norm": 5.44891881942749, + "learning_rate": 8.955324105796903e-06, + "loss": 0.3053, + "step": 8374 + }, + { + "epoch": 0.11360553445469343, + "grad_norm": 9.36451244354248, + "learning_rate": 8.955187063176648e-06, + "loss": 0.8099, + "step": 8375 + }, + { + "epoch": 0.11361909929462832, + "grad_norm": 7.311240196228027, + "learning_rate": 8.955050020556393e-06, + "loss": 0.527, + "step": 8376 + }, + { + "epoch": 0.11363266413456322, + "grad_norm": 6.030411720275879, + "learning_rate": 8.954912977936139e-06, + "loss": 0.4306, + "step": 8377 + }, + { + "epoch": 0.1136462289744981, + "grad_norm": 6.5988969802856445, + "learning_rate": 8.954775935315884e-06, + "loss": 0.5582, + "step": 8378 + }, + { + "epoch": 0.11365979381443299, + "grad_norm": 5.595539569854736, + "learning_rate": 8.954638892695629e-06, + "loss": 0.3593, + "step": 8379 + }, + { + "epoch": 0.11367335865436788, + "grad_norm": 6.358658790588379, + "learning_rate": 8.954501850075374e-06, + "loss": 0.3838, + "step": 8380 + }, + { + "epoch": 0.11368692349430276, + "grad_norm": 6.563107013702393, + "learning_rate": 8.95436480745512e-06, + "loss": 0.504, + "step": 8381 + }, + { + "epoch": 0.11370048833423765, + "grad_norm": 5.397589683532715, + "learning_rate": 8.954227764834865e-06, + "loss": 0.4531, + "step": 8382 + }, + { + "epoch": 0.11371405317417255, + "grad_norm": 5.717152118682861, + "learning_rate": 8.95409072221461e-06, + "loss": 0.5114, + "step": 8383 + }, + { + "epoch": 0.11372761801410744, + "grad_norm": 7.90476655960083, + "learning_rate": 8.953953679594355e-06, + "loss": 0.4526, + "step": 8384 + }, + { + "epoch": 0.11374118285404232, + "grad_norm": 4.234797954559326, + "learning_rate": 8.9538166369741e-06, + "loss": 0.2427, + "step": 8385 + }, + { + "epoch": 0.11375474769397721, + "grad_norm": 4.709390163421631, + "learning_rate": 8.953679594353845e-06, + "loss": 0.3211, + "step": 8386 + }, + { + "epoch": 0.1137683125339121, + "grad_norm": 6.4040021896362305, + "learning_rate": 8.953542551733589e-06, + "loss": 0.4683, + "step": 8387 + }, + { + "epoch": 0.11378187737384698, + "grad_norm": 6.014031410217285, + "learning_rate": 8.953405509113336e-06, + "loss": 0.266, + "step": 8388 + }, + { + "epoch": 0.11379544221378188, + "grad_norm": 5.170441150665283, + "learning_rate": 8.95326846649308e-06, + "loss": 0.4082, + "step": 8389 + }, + { + "epoch": 0.11380900705371677, + "grad_norm": 6.247134685516357, + "learning_rate": 8.953131423872824e-06, + "loss": 0.3954, + "step": 8390 + }, + { + "epoch": 0.11382257189365165, + "grad_norm": 5.740721225738525, + "learning_rate": 8.95299438125257e-06, + "loss": 0.4005, + "step": 8391 + }, + { + "epoch": 0.11383613673358654, + "grad_norm": 6.257715702056885, + "learning_rate": 8.952857338632316e-06, + "loss": 0.4731, + "step": 8392 + }, + { + "epoch": 0.11384970157352144, + "grad_norm": 6.638266563415527, + "learning_rate": 8.952720296012062e-06, + "loss": 0.3611, + "step": 8393 + }, + { + "epoch": 0.11386326641345632, + "grad_norm": 4.042613983154297, + "learning_rate": 8.952583253391805e-06, + "loss": 0.3554, + "step": 8394 + }, + { + "epoch": 0.11387683125339121, + "grad_norm": 6.821038246154785, + "learning_rate": 8.95244621077155e-06, + "loss": 0.4554, + "step": 8395 + }, + { + "epoch": 0.1138903960933261, + "grad_norm": 7.462324142456055, + "learning_rate": 8.952309168151295e-06, + "loss": 0.4849, + "step": 8396 + }, + { + "epoch": 0.11390396093326098, + "grad_norm": 5.516175746917725, + "learning_rate": 8.95217212553104e-06, + "loss": 0.3698, + "step": 8397 + }, + { + "epoch": 0.11391752577319587, + "grad_norm": 4.305587291717529, + "learning_rate": 8.952035082910786e-06, + "loss": 0.3145, + "step": 8398 + }, + { + "epoch": 0.11393109061313077, + "grad_norm": 6.322098731994629, + "learning_rate": 8.951898040290531e-06, + "loss": 0.3137, + "step": 8399 + }, + { + "epoch": 0.11394465545306566, + "grad_norm": 5.291042804718018, + "learning_rate": 8.951760997670276e-06, + "loss": 0.2541, + "step": 8400 + }, + { + "epoch": 0.11395822029300054, + "grad_norm": 4.4607319831848145, + "learning_rate": 8.951623955050021e-06, + "loss": 0.264, + "step": 8401 + }, + { + "epoch": 0.11397178513293543, + "grad_norm": 5.201287746429443, + "learning_rate": 8.951486912429766e-06, + "loss": 0.3969, + "step": 8402 + }, + { + "epoch": 0.11398534997287033, + "grad_norm": 7.368963241577148, + "learning_rate": 8.951349869809512e-06, + "loss": 0.5266, + "step": 8403 + }, + { + "epoch": 0.1139989148128052, + "grad_norm": 5.112030506134033, + "learning_rate": 8.951212827189257e-06, + "loss": 0.3223, + "step": 8404 + }, + { + "epoch": 0.1140124796527401, + "grad_norm": 7.074389457702637, + "learning_rate": 8.951075784569002e-06, + "loss": 0.4766, + "step": 8405 + }, + { + "epoch": 0.11402604449267499, + "grad_norm": 9.353997230529785, + "learning_rate": 8.950938741948747e-06, + "loss": 0.401, + "step": 8406 + }, + { + "epoch": 0.11403960933260987, + "grad_norm": 5.88461446762085, + "learning_rate": 8.950801699328492e-06, + "loss": 0.3743, + "step": 8407 + }, + { + "epoch": 0.11405317417254476, + "grad_norm": 6.06886625289917, + "learning_rate": 8.950664656708238e-06, + "loss": 0.4524, + "step": 8408 + }, + { + "epoch": 0.11406673901247966, + "grad_norm": 5.890143394470215, + "learning_rate": 8.950527614087981e-06, + "loss": 0.3616, + "step": 8409 + }, + { + "epoch": 0.11408030385241454, + "grad_norm": 5.265296936035156, + "learning_rate": 8.950390571467728e-06, + "loss": 0.2861, + "step": 8410 + }, + { + "epoch": 0.11409386869234943, + "grad_norm": 5.528092384338379, + "learning_rate": 8.950253528847473e-06, + "loss": 0.4482, + "step": 8411 + }, + { + "epoch": 0.11410743353228432, + "grad_norm": 7.383978843688965, + "learning_rate": 8.950116486227217e-06, + "loss": 0.4293, + "step": 8412 + }, + { + "epoch": 0.1141209983722192, + "grad_norm": 10.20229434967041, + "learning_rate": 8.949979443606962e-06, + "loss": 0.7809, + "step": 8413 + }, + { + "epoch": 0.1141345632121541, + "grad_norm": 9.065457344055176, + "learning_rate": 8.949842400986709e-06, + "loss": 0.4205, + "step": 8414 + }, + { + "epoch": 0.11414812805208899, + "grad_norm": 13.264727592468262, + "learning_rate": 8.949705358366452e-06, + "loss": 0.6723, + "step": 8415 + }, + { + "epoch": 0.11416169289202388, + "grad_norm": 5.87554407119751, + "learning_rate": 8.949568315746197e-06, + "loss": 0.4861, + "step": 8416 + }, + { + "epoch": 0.11417525773195876, + "grad_norm": 5.819100379943848, + "learning_rate": 8.949431273125942e-06, + "loss": 0.4617, + "step": 8417 + }, + { + "epoch": 0.11418882257189365, + "grad_norm": 8.671067237854004, + "learning_rate": 8.949294230505688e-06, + "loss": 0.5866, + "step": 8418 + }, + { + "epoch": 0.11420238741182855, + "grad_norm": 9.370046615600586, + "learning_rate": 8.949157187885433e-06, + "loss": 0.6137, + "step": 8419 + }, + { + "epoch": 0.11421595225176343, + "grad_norm": 7.724165916442871, + "learning_rate": 8.949020145265178e-06, + "loss": 0.494, + "step": 8420 + }, + { + "epoch": 0.11422951709169832, + "grad_norm": 7.872800350189209, + "learning_rate": 8.948883102644923e-06, + "loss": 0.5746, + "step": 8421 + }, + { + "epoch": 0.11424308193163321, + "grad_norm": 9.385541915893555, + "learning_rate": 8.948746060024668e-06, + "loss": 0.6116, + "step": 8422 + }, + { + "epoch": 0.11425664677156809, + "grad_norm": 7.526670932769775, + "learning_rate": 8.948609017404414e-06, + "loss": 0.4799, + "step": 8423 + }, + { + "epoch": 0.11427021161150298, + "grad_norm": 7.574343204498291, + "learning_rate": 8.948471974784159e-06, + "loss": 0.7338, + "step": 8424 + }, + { + "epoch": 0.11428377645143788, + "grad_norm": 7.896305084228516, + "learning_rate": 8.948334932163904e-06, + "loss": 0.5175, + "step": 8425 + }, + { + "epoch": 0.11429734129137276, + "grad_norm": 7.910699367523193, + "learning_rate": 8.948197889543649e-06, + "loss": 0.6635, + "step": 8426 + }, + { + "epoch": 0.11431090613130765, + "grad_norm": 6.910665988922119, + "learning_rate": 8.948060846923394e-06, + "loss": 0.5212, + "step": 8427 + }, + { + "epoch": 0.11432447097124254, + "grad_norm": 6.694674491882324, + "learning_rate": 8.94792380430314e-06, + "loss": 0.4972, + "step": 8428 + }, + { + "epoch": 0.11433803581117742, + "grad_norm": 7.508378028869629, + "learning_rate": 8.947786761682885e-06, + "loss": 0.5029, + "step": 8429 + }, + { + "epoch": 0.11435160065111231, + "grad_norm": 9.271232604980469, + "learning_rate": 8.947649719062628e-06, + "loss": 0.4277, + "step": 8430 + }, + { + "epoch": 0.11436516549104721, + "grad_norm": 7.531868934631348, + "learning_rate": 8.947512676442375e-06, + "loss": 0.3604, + "step": 8431 + }, + { + "epoch": 0.1143787303309821, + "grad_norm": 7.6865057945251465, + "learning_rate": 8.94737563382212e-06, + "loss": 0.4915, + "step": 8432 + }, + { + "epoch": 0.11439229517091698, + "grad_norm": 8.653081893920898, + "learning_rate": 8.947238591201864e-06, + "loss": 0.4014, + "step": 8433 + }, + { + "epoch": 0.11440586001085187, + "grad_norm": 7.704036712646484, + "learning_rate": 8.947101548581609e-06, + "loss": 0.5637, + "step": 8434 + }, + { + "epoch": 0.11441942485078677, + "grad_norm": 6.89192533493042, + "learning_rate": 8.946964505961356e-06, + "loss": 0.3207, + "step": 8435 + }, + { + "epoch": 0.11443298969072165, + "grad_norm": 5.896446704864502, + "learning_rate": 8.9468274633411e-06, + "loss": 0.3823, + "step": 8436 + }, + { + "epoch": 0.11444655453065654, + "grad_norm": 6.155602931976318, + "learning_rate": 8.946690420720844e-06, + "loss": 0.4558, + "step": 8437 + }, + { + "epoch": 0.11446011937059143, + "grad_norm": 9.652669906616211, + "learning_rate": 8.94655337810059e-06, + "loss": 0.6133, + "step": 8438 + }, + { + "epoch": 0.11447368421052631, + "grad_norm": 5.416245937347412, + "learning_rate": 8.946416335480335e-06, + "loss": 0.3073, + "step": 8439 + }, + { + "epoch": 0.1144872490504612, + "grad_norm": 7.159056186676025, + "learning_rate": 8.94627929286008e-06, + "loss": 0.4297, + "step": 8440 + }, + { + "epoch": 0.1145008138903961, + "grad_norm": 6.073346138000488, + "learning_rate": 8.946142250239825e-06, + "loss": 0.3455, + "step": 8441 + }, + { + "epoch": 0.11451437873033098, + "grad_norm": 8.82791805267334, + "learning_rate": 8.94600520761957e-06, + "loss": 0.5866, + "step": 8442 + }, + { + "epoch": 0.11452794357026587, + "grad_norm": 8.02499771118164, + "learning_rate": 8.945868164999315e-06, + "loss": 0.5775, + "step": 8443 + }, + { + "epoch": 0.11454150841020076, + "grad_norm": 6.701679229736328, + "learning_rate": 8.94573112237906e-06, + "loss": 0.5045, + "step": 8444 + }, + { + "epoch": 0.11455507325013564, + "grad_norm": 6.130403995513916, + "learning_rate": 8.945594079758806e-06, + "loss": 0.3851, + "step": 8445 + }, + { + "epoch": 0.11456863809007053, + "grad_norm": 6.936233043670654, + "learning_rate": 8.945457037138551e-06, + "loss": 0.3942, + "step": 8446 + }, + { + "epoch": 0.11458220293000543, + "grad_norm": 8.8202486038208, + "learning_rate": 8.945319994518296e-06, + "loss": 0.4709, + "step": 8447 + }, + { + "epoch": 0.11459576776994032, + "grad_norm": 8.516562461853027, + "learning_rate": 8.945182951898041e-06, + "loss": 0.4454, + "step": 8448 + }, + { + "epoch": 0.1146093326098752, + "grad_norm": 6.624803066253662, + "learning_rate": 8.945045909277786e-06, + "loss": 0.3642, + "step": 8449 + }, + { + "epoch": 0.1146228974498101, + "grad_norm": 6.546815395355225, + "learning_rate": 8.944908866657532e-06, + "loss": 0.3305, + "step": 8450 + }, + { + "epoch": 0.11463646228974499, + "grad_norm": 5.26152229309082, + "learning_rate": 8.944771824037277e-06, + "loss": 0.2514, + "step": 8451 + }, + { + "epoch": 0.11465002712967987, + "grad_norm": 5.911448955535889, + "learning_rate": 8.94463478141702e-06, + "loss": 0.3693, + "step": 8452 + }, + { + "epoch": 0.11466359196961476, + "grad_norm": 5.046181678771973, + "learning_rate": 8.944497738796767e-06, + "loss": 0.283, + "step": 8453 + }, + { + "epoch": 0.11467715680954965, + "grad_norm": 8.00461196899414, + "learning_rate": 8.944360696176512e-06, + "loss": 0.5655, + "step": 8454 + }, + { + "epoch": 0.11469072164948453, + "grad_norm": 6.189986705780029, + "learning_rate": 8.944223653556256e-06, + "loss": 0.3755, + "step": 8455 + }, + { + "epoch": 0.11470428648941942, + "grad_norm": 6.2791266441345215, + "learning_rate": 8.944086610936001e-06, + "loss": 0.3802, + "step": 8456 + }, + { + "epoch": 0.11471785132935432, + "grad_norm": 6.826683044433594, + "learning_rate": 8.943949568315748e-06, + "loss": 0.3289, + "step": 8457 + }, + { + "epoch": 0.1147314161692892, + "grad_norm": 6.9005818367004395, + "learning_rate": 8.943812525695491e-06, + "loss": 0.4531, + "step": 8458 + }, + { + "epoch": 0.11474498100922409, + "grad_norm": 7.6744537353515625, + "learning_rate": 8.943675483075237e-06, + "loss": 0.625, + "step": 8459 + }, + { + "epoch": 0.11475854584915898, + "grad_norm": 6.9535746574401855, + "learning_rate": 8.943538440454982e-06, + "loss": 0.375, + "step": 8460 + }, + { + "epoch": 0.11477211068909386, + "grad_norm": 6.196899890899658, + "learning_rate": 8.943401397834729e-06, + "loss": 0.4092, + "step": 8461 + }, + { + "epoch": 0.11478567552902876, + "grad_norm": 7.166965961456299, + "learning_rate": 8.943264355214472e-06, + "loss": 0.4388, + "step": 8462 + }, + { + "epoch": 0.11479924036896365, + "grad_norm": 6.544147968292236, + "learning_rate": 8.943127312594217e-06, + "loss": 0.4473, + "step": 8463 + }, + { + "epoch": 0.11481280520889854, + "grad_norm": 7.249989032745361, + "learning_rate": 8.942990269973962e-06, + "loss": 0.4632, + "step": 8464 + }, + { + "epoch": 0.11482637004883342, + "grad_norm": 5.988005638122559, + "learning_rate": 8.942853227353708e-06, + "loss": 0.3717, + "step": 8465 + }, + { + "epoch": 0.11483993488876831, + "grad_norm": 6.520822048187256, + "learning_rate": 8.942716184733453e-06, + "loss": 0.3245, + "step": 8466 + }, + { + "epoch": 0.1148534997287032, + "grad_norm": 7.602880001068115, + "learning_rate": 8.942579142113198e-06, + "loss": 0.3753, + "step": 8467 + }, + { + "epoch": 0.11486706456863809, + "grad_norm": 6.565923690795898, + "learning_rate": 8.942442099492943e-06, + "loss": 0.3032, + "step": 8468 + }, + { + "epoch": 0.11488062940857298, + "grad_norm": 6.976824760437012, + "learning_rate": 8.942305056872688e-06, + "loss": 0.3891, + "step": 8469 + }, + { + "epoch": 0.11489419424850787, + "grad_norm": 7.642041206359863, + "learning_rate": 8.942168014252434e-06, + "loss": 0.3304, + "step": 8470 + }, + { + "epoch": 0.11490775908844275, + "grad_norm": 5.320949554443359, + "learning_rate": 8.942030971632179e-06, + "loss": 0.3325, + "step": 8471 + }, + { + "epoch": 0.11492132392837764, + "grad_norm": 8.369524955749512, + "learning_rate": 8.941893929011924e-06, + "loss": 0.463, + "step": 8472 + }, + { + "epoch": 0.11493488876831254, + "grad_norm": 7.027295112609863, + "learning_rate": 8.941756886391667e-06, + "loss": 0.4524, + "step": 8473 + }, + { + "epoch": 0.11494845360824742, + "grad_norm": 7.133763790130615, + "learning_rate": 8.941619843771414e-06, + "loss": 0.5739, + "step": 8474 + }, + { + "epoch": 0.11496201844818231, + "grad_norm": 8.166173934936523, + "learning_rate": 8.94148280115116e-06, + "loss": 0.4271, + "step": 8475 + }, + { + "epoch": 0.1149755832881172, + "grad_norm": 7.305729389190674, + "learning_rate": 8.941345758530905e-06, + "loss": 0.5475, + "step": 8476 + }, + { + "epoch": 0.11498914812805208, + "grad_norm": 6.706455707550049, + "learning_rate": 8.941208715910648e-06, + "loss": 0.3975, + "step": 8477 + }, + { + "epoch": 0.11500271296798698, + "grad_norm": 6.744833469390869, + "learning_rate": 8.941071673290393e-06, + "loss": 0.2786, + "step": 8478 + }, + { + "epoch": 0.11501627780792187, + "grad_norm": 5.566780090332031, + "learning_rate": 8.94093463067014e-06, + "loss": 0.4428, + "step": 8479 + }, + { + "epoch": 0.11502984264785676, + "grad_norm": 9.577759742736816, + "learning_rate": 8.940797588049884e-06, + "loss": 0.546, + "step": 8480 + }, + { + "epoch": 0.11504340748779164, + "grad_norm": 8.199071884155273, + "learning_rate": 8.940660545429629e-06, + "loss": 0.481, + "step": 8481 + }, + { + "epoch": 0.11505697232772653, + "grad_norm": 8.727372169494629, + "learning_rate": 8.940523502809374e-06, + "loss": 0.4539, + "step": 8482 + }, + { + "epoch": 0.11507053716766143, + "grad_norm": 6.768134593963623, + "learning_rate": 8.94038646018912e-06, + "loss": 0.3502, + "step": 8483 + }, + { + "epoch": 0.1150841020075963, + "grad_norm": 6.717687129974365, + "learning_rate": 8.940249417568864e-06, + "loss": 0.4279, + "step": 8484 + }, + { + "epoch": 0.1150976668475312, + "grad_norm": 5.613630294799805, + "learning_rate": 8.94011237494861e-06, + "loss": 0.2794, + "step": 8485 + }, + { + "epoch": 0.11511123168746609, + "grad_norm": 6.158512592315674, + "learning_rate": 8.939975332328355e-06, + "loss": 0.2583, + "step": 8486 + }, + { + "epoch": 0.11512479652740097, + "grad_norm": 6.762033462524414, + "learning_rate": 8.9398382897081e-06, + "loss": 0.3327, + "step": 8487 + }, + { + "epoch": 0.11513836136733586, + "grad_norm": 6.869115829467773, + "learning_rate": 8.939701247087845e-06, + "loss": 0.4211, + "step": 8488 + }, + { + "epoch": 0.11515192620727076, + "grad_norm": 5.681107044219971, + "learning_rate": 8.93956420446759e-06, + "loss": 0.4867, + "step": 8489 + }, + { + "epoch": 0.11516549104720564, + "grad_norm": 6.810515403747559, + "learning_rate": 8.939427161847335e-06, + "loss": 0.5105, + "step": 8490 + }, + { + "epoch": 0.11517905588714053, + "grad_norm": 6.938438415527344, + "learning_rate": 8.93929011922708e-06, + "loss": 0.4056, + "step": 8491 + }, + { + "epoch": 0.11519262072707542, + "grad_norm": 6.570985794067383, + "learning_rate": 8.939153076606826e-06, + "loss": 0.4649, + "step": 8492 + }, + { + "epoch": 0.1152061855670103, + "grad_norm": 5.86760139465332, + "learning_rate": 8.939016033986571e-06, + "loss": 0.3926, + "step": 8493 + }, + { + "epoch": 0.1152197504069452, + "grad_norm": 5.058091163635254, + "learning_rate": 8.938878991366316e-06, + "loss": 0.3044, + "step": 8494 + }, + { + "epoch": 0.11523331524688009, + "grad_norm": 4.3347487449646, + "learning_rate": 8.93874194874606e-06, + "loss": 0.2831, + "step": 8495 + }, + { + "epoch": 0.11524688008681498, + "grad_norm": 6.269077301025391, + "learning_rate": 8.938604906125806e-06, + "loss": 0.4536, + "step": 8496 + }, + { + "epoch": 0.11526044492674986, + "grad_norm": 7.742640495300293, + "learning_rate": 8.938467863505552e-06, + "loss": 0.324, + "step": 8497 + }, + { + "epoch": 0.11527400976668475, + "grad_norm": 5.370919704437256, + "learning_rate": 8.938330820885295e-06, + "loss": 0.3523, + "step": 8498 + }, + { + "epoch": 0.11528757460661965, + "grad_norm": 4.687473297119141, + "learning_rate": 8.93819377826504e-06, + "loss": 0.2756, + "step": 8499 + }, + { + "epoch": 0.11530113944655453, + "grad_norm": 5.896614074707031, + "learning_rate": 8.938056735644787e-06, + "loss": 0.333, + "step": 8500 + }, + { + "epoch": 0.11531470428648942, + "grad_norm": 6.391670227050781, + "learning_rate": 8.937919693024532e-06, + "loss": 0.3908, + "step": 8501 + }, + { + "epoch": 0.11532826912642431, + "grad_norm": 5.86431884765625, + "learning_rate": 8.937782650404276e-06, + "loss": 0.4057, + "step": 8502 + }, + { + "epoch": 0.11534183396635919, + "grad_norm": 6.092579364776611, + "learning_rate": 8.937645607784021e-06, + "loss": 0.4889, + "step": 8503 + }, + { + "epoch": 0.11535539880629408, + "grad_norm": 6.827162742614746, + "learning_rate": 8.937508565163768e-06, + "loss": 0.3921, + "step": 8504 + }, + { + "epoch": 0.11536896364622898, + "grad_norm": 4.707108497619629, + "learning_rate": 8.937371522543511e-06, + "loss": 0.304, + "step": 8505 + }, + { + "epoch": 0.11538252848616386, + "grad_norm": 5.268826961517334, + "learning_rate": 8.937234479923257e-06, + "loss": 0.427, + "step": 8506 + }, + { + "epoch": 0.11539609332609875, + "grad_norm": 6.544800758361816, + "learning_rate": 8.937097437303002e-06, + "loss": 0.3552, + "step": 8507 + }, + { + "epoch": 0.11540965816603364, + "grad_norm": 6.3495097160339355, + "learning_rate": 8.936960394682747e-06, + "loss": 0.3342, + "step": 8508 + }, + { + "epoch": 0.11542322300596852, + "grad_norm": 6.625906467437744, + "learning_rate": 8.936823352062492e-06, + "loss": 0.3527, + "step": 8509 + }, + { + "epoch": 0.11543678784590342, + "grad_norm": 5.523701190948486, + "learning_rate": 8.936686309442237e-06, + "loss": 0.3308, + "step": 8510 + }, + { + "epoch": 0.11545035268583831, + "grad_norm": 5.012180328369141, + "learning_rate": 8.936549266821982e-06, + "loss": 0.3532, + "step": 8511 + }, + { + "epoch": 0.1154639175257732, + "grad_norm": 5.734764575958252, + "learning_rate": 8.936412224201728e-06, + "loss": 0.3206, + "step": 8512 + }, + { + "epoch": 0.11547748236570808, + "grad_norm": 5.883255481719971, + "learning_rate": 8.936275181581473e-06, + "loss": 0.41, + "step": 8513 + }, + { + "epoch": 0.11549104720564297, + "grad_norm": 4.892006874084473, + "learning_rate": 8.936138138961218e-06, + "loss": 0.2848, + "step": 8514 + }, + { + "epoch": 0.11550461204557787, + "grad_norm": 6.763063430786133, + "learning_rate": 8.936001096340963e-06, + "loss": 0.4122, + "step": 8515 + }, + { + "epoch": 0.11551817688551275, + "grad_norm": 5.671562671661377, + "learning_rate": 8.935864053720708e-06, + "loss": 0.321, + "step": 8516 + }, + { + "epoch": 0.11553174172544764, + "grad_norm": 4.950028896331787, + "learning_rate": 8.935727011100454e-06, + "loss": 0.3103, + "step": 8517 + }, + { + "epoch": 0.11554530656538253, + "grad_norm": 5.142571926116943, + "learning_rate": 8.935589968480199e-06, + "loss": 0.3403, + "step": 8518 + }, + { + "epoch": 0.11555887140531741, + "grad_norm": 6.840557098388672, + "learning_rate": 8.935452925859944e-06, + "loss": 0.3295, + "step": 8519 + }, + { + "epoch": 0.1155724362452523, + "grad_norm": 5.858449459075928, + "learning_rate": 8.935315883239687e-06, + "loss": 0.3775, + "step": 8520 + }, + { + "epoch": 0.1155860010851872, + "grad_norm": 9.765681266784668, + "learning_rate": 8.935178840619433e-06, + "loss": 0.4715, + "step": 8521 + }, + { + "epoch": 0.11559956592512208, + "grad_norm": 4.5871262550354, + "learning_rate": 8.93504179799918e-06, + "loss": 0.3114, + "step": 8522 + }, + { + "epoch": 0.11561313076505697, + "grad_norm": 5.991059303283691, + "learning_rate": 8.934904755378923e-06, + "loss": 0.314, + "step": 8523 + }, + { + "epoch": 0.11562669560499186, + "grad_norm": 6.8167314529418945, + "learning_rate": 8.934767712758668e-06, + "loss": 0.3792, + "step": 8524 + }, + { + "epoch": 0.11564026044492674, + "grad_norm": 5.975065231323242, + "learning_rate": 8.934630670138413e-06, + "loss": 0.3955, + "step": 8525 + }, + { + "epoch": 0.11565382528486164, + "grad_norm": 6.0635223388671875, + "learning_rate": 8.934493627518158e-06, + "loss": 0.2928, + "step": 8526 + }, + { + "epoch": 0.11566739012479653, + "grad_norm": 6.711539268493652, + "learning_rate": 8.934356584897904e-06, + "loss": 0.4101, + "step": 8527 + }, + { + "epoch": 0.11568095496473142, + "grad_norm": 7.138375759124756, + "learning_rate": 8.934219542277649e-06, + "loss": 0.4016, + "step": 8528 + }, + { + "epoch": 0.1156945198046663, + "grad_norm": 5.383942127227783, + "learning_rate": 8.934082499657394e-06, + "loss": 0.3301, + "step": 8529 + }, + { + "epoch": 0.1157080846446012, + "grad_norm": 4.986185550689697, + "learning_rate": 8.93394545703714e-06, + "loss": 0.2874, + "step": 8530 + }, + { + "epoch": 0.11572164948453609, + "grad_norm": 7.179394721984863, + "learning_rate": 8.933808414416884e-06, + "loss": 0.4547, + "step": 8531 + }, + { + "epoch": 0.11573521432447097, + "grad_norm": 8.567145347595215, + "learning_rate": 8.93367137179663e-06, + "loss": 0.4977, + "step": 8532 + }, + { + "epoch": 0.11574877916440586, + "grad_norm": 8.377401351928711, + "learning_rate": 8.933534329176375e-06, + "loss": 0.3677, + "step": 8533 + }, + { + "epoch": 0.11576234400434075, + "grad_norm": 4.49258279800415, + "learning_rate": 8.93339728655612e-06, + "loss": 0.2092, + "step": 8534 + }, + { + "epoch": 0.11577590884427563, + "grad_norm": 7.527127742767334, + "learning_rate": 8.933260243935865e-06, + "loss": 0.5539, + "step": 8535 + }, + { + "epoch": 0.11578947368421053, + "grad_norm": 7.6519246101379395, + "learning_rate": 8.93312320131561e-06, + "loss": 0.4513, + "step": 8536 + }, + { + "epoch": 0.11580303852414542, + "grad_norm": 7.789889335632324, + "learning_rate": 8.932986158695355e-06, + "loss": 0.5315, + "step": 8537 + }, + { + "epoch": 0.1158166033640803, + "grad_norm": 5.576120376586914, + "learning_rate": 8.932849116075099e-06, + "loss": 0.3848, + "step": 8538 + }, + { + "epoch": 0.11583016820401519, + "grad_norm": 5.9518513679504395, + "learning_rate": 8.932712073454846e-06, + "loss": 0.34, + "step": 8539 + }, + { + "epoch": 0.11584373304395008, + "grad_norm": 5.986740589141846, + "learning_rate": 8.932575030834591e-06, + "loss": 0.3947, + "step": 8540 + }, + { + "epoch": 0.11585729788388498, + "grad_norm": 6.269256114959717, + "learning_rate": 8.932437988214334e-06, + "loss": 0.4576, + "step": 8541 + }, + { + "epoch": 0.11587086272381986, + "grad_norm": 6.644055366516113, + "learning_rate": 8.93230094559408e-06, + "loss": 0.5877, + "step": 8542 + }, + { + "epoch": 0.11588442756375475, + "grad_norm": 6.24739408493042, + "learning_rate": 8.932163902973827e-06, + "loss": 0.435, + "step": 8543 + }, + { + "epoch": 0.11589799240368964, + "grad_norm": 5.474649429321289, + "learning_rate": 8.932026860353572e-06, + "loss": 0.3883, + "step": 8544 + }, + { + "epoch": 0.11591155724362452, + "grad_norm": 6.5046820640563965, + "learning_rate": 8.931889817733315e-06, + "loss": 0.3502, + "step": 8545 + }, + { + "epoch": 0.11592512208355941, + "grad_norm": 5.845861911773682, + "learning_rate": 8.93175277511306e-06, + "loss": 0.3094, + "step": 8546 + }, + { + "epoch": 0.11593868692349431, + "grad_norm": 5.967545032501221, + "learning_rate": 8.931615732492806e-06, + "loss": 0.4013, + "step": 8547 + }, + { + "epoch": 0.11595225176342919, + "grad_norm": 8.15146541595459, + "learning_rate": 8.93147868987255e-06, + "loss": 0.5326, + "step": 8548 + }, + { + "epoch": 0.11596581660336408, + "grad_norm": 7.425852298736572, + "learning_rate": 8.931341647252296e-06, + "loss": 0.4563, + "step": 8549 + }, + { + "epoch": 0.11597938144329897, + "grad_norm": 7.298816204071045, + "learning_rate": 8.931204604632041e-06, + "loss": 0.4243, + "step": 8550 + }, + { + "epoch": 0.11599294628323385, + "grad_norm": 5.8955464363098145, + "learning_rate": 8.931067562011786e-06, + "loss": 0.3513, + "step": 8551 + }, + { + "epoch": 0.11600651112316875, + "grad_norm": 6.430814743041992, + "learning_rate": 8.930930519391531e-06, + "loss": 0.4493, + "step": 8552 + }, + { + "epoch": 0.11602007596310364, + "grad_norm": 5.85889196395874, + "learning_rate": 8.930793476771277e-06, + "loss": 0.4607, + "step": 8553 + }, + { + "epoch": 0.11603364080303852, + "grad_norm": 4.916161060333252, + "learning_rate": 8.930656434151022e-06, + "loss": 0.2382, + "step": 8554 + }, + { + "epoch": 0.11604720564297341, + "grad_norm": 6.116504192352295, + "learning_rate": 8.930519391530767e-06, + "loss": 0.3769, + "step": 8555 + }, + { + "epoch": 0.1160607704829083, + "grad_norm": 6.084619045257568, + "learning_rate": 8.930382348910512e-06, + "loss": 0.4468, + "step": 8556 + }, + { + "epoch": 0.1160743353228432, + "grad_norm": 8.329657554626465, + "learning_rate": 8.930245306290257e-06, + "loss": 0.4298, + "step": 8557 + }, + { + "epoch": 0.11608790016277808, + "grad_norm": 5.963587760925293, + "learning_rate": 8.930108263670002e-06, + "loss": 0.4773, + "step": 8558 + }, + { + "epoch": 0.11610146500271297, + "grad_norm": 4.934365749359131, + "learning_rate": 8.929971221049748e-06, + "loss": 0.2854, + "step": 8559 + }, + { + "epoch": 0.11611502984264786, + "grad_norm": 6.747567653656006, + "learning_rate": 8.929834178429493e-06, + "loss": 0.4217, + "step": 8560 + }, + { + "epoch": 0.11612859468258274, + "grad_norm": 5.677446365356445, + "learning_rate": 8.929697135809238e-06, + "loss": 0.3809, + "step": 8561 + }, + { + "epoch": 0.11614215952251764, + "grad_norm": 8.166500091552734, + "learning_rate": 8.929560093188983e-06, + "loss": 0.4356, + "step": 8562 + }, + { + "epoch": 0.11615572436245253, + "grad_norm": 7.6310343742370605, + "learning_rate": 8.929423050568727e-06, + "loss": 0.4887, + "step": 8563 + }, + { + "epoch": 0.11616928920238741, + "grad_norm": 8.61746597290039, + "learning_rate": 8.929286007948472e-06, + "loss": 0.5717, + "step": 8564 + }, + { + "epoch": 0.1161828540423223, + "grad_norm": 5.273629188537598, + "learning_rate": 8.929148965328219e-06, + "loss": 0.3516, + "step": 8565 + }, + { + "epoch": 0.1161964188822572, + "grad_norm": 7.014576435089111, + "learning_rate": 8.929011922707962e-06, + "loss": 0.3905, + "step": 8566 + }, + { + "epoch": 0.11620998372219207, + "grad_norm": 6.553751468658447, + "learning_rate": 8.928874880087707e-06, + "loss": 0.3067, + "step": 8567 + }, + { + "epoch": 0.11622354856212697, + "grad_norm": 7.300220489501953, + "learning_rate": 8.928737837467453e-06, + "loss": 0.5209, + "step": 8568 + }, + { + "epoch": 0.11623711340206186, + "grad_norm": 6.204878807067871, + "learning_rate": 8.9286007948472e-06, + "loss": 0.5483, + "step": 8569 + }, + { + "epoch": 0.11625067824199674, + "grad_norm": 6.2912397384643555, + "learning_rate": 8.928463752226943e-06, + "loss": 0.5093, + "step": 8570 + }, + { + "epoch": 0.11626424308193163, + "grad_norm": 6.3425726890563965, + "learning_rate": 8.928326709606688e-06, + "loss": 0.3225, + "step": 8571 + }, + { + "epoch": 0.11627780792186652, + "grad_norm": 5.768918514251709, + "learning_rate": 8.928189666986433e-06, + "loss": 0.3213, + "step": 8572 + }, + { + "epoch": 0.11629137276180142, + "grad_norm": 7.668368816375732, + "learning_rate": 8.928052624366178e-06, + "loss": 0.4191, + "step": 8573 + }, + { + "epoch": 0.1163049376017363, + "grad_norm": 6.228855133056641, + "learning_rate": 8.927915581745924e-06, + "loss": 0.3892, + "step": 8574 + }, + { + "epoch": 0.11631850244167119, + "grad_norm": 7.226455211639404, + "learning_rate": 8.927778539125669e-06, + "loss": 0.4218, + "step": 8575 + }, + { + "epoch": 0.11633206728160608, + "grad_norm": 4.599217891693115, + "learning_rate": 8.927641496505414e-06, + "loss": 0.2281, + "step": 8576 + }, + { + "epoch": 0.11634563212154096, + "grad_norm": 4.633452892303467, + "learning_rate": 8.92750445388516e-06, + "loss": 0.2858, + "step": 8577 + }, + { + "epoch": 0.11635919696147586, + "grad_norm": 5.3547234535217285, + "learning_rate": 8.927367411264904e-06, + "loss": 0.3051, + "step": 8578 + }, + { + "epoch": 0.11637276180141075, + "grad_norm": 7.454463958740234, + "learning_rate": 8.92723036864465e-06, + "loss": 0.5927, + "step": 8579 + }, + { + "epoch": 0.11638632664134563, + "grad_norm": 6.863370895385742, + "learning_rate": 8.927093326024395e-06, + "loss": 0.3766, + "step": 8580 + }, + { + "epoch": 0.11639989148128052, + "grad_norm": 5.2751851081848145, + "learning_rate": 8.926956283404138e-06, + "loss": 0.3781, + "step": 8581 + }, + { + "epoch": 0.11641345632121541, + "grad_norm": 5.876204490661621, + "learning_rate": 8.926819240783885e-06, + "loss": 0.3538, + "step": 8582 + }, + { + "epoch": 0.1164270211611503, + "grad_norm": 5.996270656585693, + "learning_rate": 8.92668219816363e-06, + "loss": 0.5253, + "step": 8583 + }, + { + "epoch": 0.11644058600108519, + "grad_norm": 5.040263652801514, + "learning_rate": 8.926545155543375e-06, + "loss": 0.3009, + "step": 8584 + }, + { + "epoch": 0.11645415084102008, + "grad_norm": 4.986870765686035, + "learning_rate": 8.926408112923119e-06, + "loss": 0.3051, + "step": 8585 + }, + { + "epoch": 0.11646771568095496, + "grad_norm": 7.2736945152282715, + "learning_rate": 8.926271070302866e-06, + "loss": 0.4328, + "step": 8586 + }, + { + "epoch": 0.11648128052088985, + "grad_norm": 6.167227268218994, + "learning_rate": 8.926134027682611e-06, + "loss": 0.4339, + "step": 8587 + }, + { + "epoch": 0.11649484536082474, + "grad_norm": 5.549490928649902, + "learning_rate": 8.925996985062354e-06, + "loss": 0.4644, + "step": 8588 + }, + { + "epoch": 0.11650841020075964, + "grad_norm": 4.778753280639648, + "learning_rate": 8.9258599424421e-06, + "loss": 0.309, + "step": 8589 + }, + { + "epoch": 0.11652197504069452, + "grad_norm": 6.949597358703613, + "learning_rate": 8.925722899821845e-06, + "loss": 0.4446, + "step": 8590 + }, + { + "epoch": 0.11653553988062941, + "grad_norm": 4.778460502624512, + "learning_rate": 8.92558585720159e-06, + "loss": 0.3722, + "step": 8591 + }, + { + "epoch": 0.1165491047205643, + "grad_norm": 6.433259010314941, + "learning_rate": 8.925448814581335e-06, + "loss": 0.4663, + "step": 8592 + }, + { + "epoch": 0.11656266956049918, + "grad_norm": 4.7566938400268555, + "learning_rate": 8.92531177196108e-06, + "loss": 0.4358, + "step": 8593 + }, + { + "epoch": 0.11657623440043408, + "grad_norm": 5.848714351654053, + "learning_rate": 8.925174729340826e-06, + "loss": 0.4231, + "step": 8594 + }, + { + "epoch": 0.11658979924036897, + "grad_norm": 5.631707668304443, + "learning_rate": 8.92503768672057e-06, + "loss": 0.4071, + "step": 8595 + }, + { + "epoch": 0.11660336408030385, + "grad_norm": 7.485988616943359, + "learning_rate": 8.924900644100316e-06, + "loss": 0.5176, + "step": 8596 + }, + { + "epoch": 0.11661692892023874, + "grad_norm": 4.076051235198975, + "learning_rate": 8.924763601480061e-06, + "loss": 0.2642, + "step": 8597 + }, + { + "epoch": 0.11663049376017363, + "grad_norm": 6.663028240203857, + "learning_rate": 8.924626558859806e-06, + "loss": 0.5366, + "step": 8598 + }, + { + "epoch": 0.11664405860010851, + "grad_norm": 6.809154510498047, + "learning_rate": 8.924489516239551e-06, + "loss": 0.4727, + "step": 8599 + }, + { + "epoch": 0.1166576234400434, + "grad_norm": 4.702296733856201, + "learning_rate": 8.924352473619297e-06, + "loss": 0.3143, + "step": 8600 + }, + { + "epoch": 0.1166711882799783, + "grad_norm": 7.061583995819092, + "learning_rate": 8.924215430999042e-06, + "loss": 0.4505, + "step": 8601 + }, + { + "epoch": 0.11668475311991318, + "grad_norm": 4.908576488494873, + "learning_rate": 8.924078388378787e-06, + "loss": 0.2978, + "step": 8602 + }, + { + "epoch": 0.11669831795984807, + "grad_norm": 7.7437543869018555, + "learning_rate": 8.92394134575853e-06, + "loss": 0.622, + "step": 8603 + }, + { + "epoch": 0.11671188279978296, + "grad_norm": 5.4855804443359375, + "learning_rate": 8.923804303138277e-06, + "loss": 0.3476, + "step": 8604 + }, + { + "epoch": 0.11672544763971786, + "grad_norm": 7.190316200256348, + "learning_rate": 8.923667260518023e-06, + "loss": 0.4985, + "step": 8605 + }, + { + "epoch": 0.11673901247965274, + "grad_norm": 7.9240851402282715, + "learning_rate": 8.923530217897766e-06, + "loss": 0.4687, + "step": 8606 + }, + { + "epoch": 0.11675257731958763, + "grad_norm": 5.600784778594971, + "learning_rate": 8.923393175277511e-06, + "loss": 0.3267, + "step": 8607 + }, + { + "epoch": 0.11676614215952252, + "grad_norm": 7.667015075683594, + "learning_rate": 8.923256132657258e-06, + "loss": 0.6106, + "step": 8608 + }, + { + "epoch": 0.1167797069994574, + "grad_norm": 7.757414817810059, + "learning_rate": 8.923119090037003e-06, + "loss": 0.4374, + "step": 8609 + }, + { + "epoch": 0.1167932718393923, + "grad_norm": 4.504366397857666, + "learning_rate": 8.922982047416747e-06, + "loss": 0.2501, + "step": 8610 + }, + { + "epoch": 0.11680683667932719, + "grad_norm": 5.9781813621521, + "learning_rate": 8.922845004796492e-06, + "loss": 0.428, + "step": 8611 + }, + { + "epoch": 0.11682040151926207, + "grad_norm": 7.968356609344482, + "learning_rate": 8.922707962176239e-06, + "loss": 0.4569, + "step": 8612 + }, + { + "epoch": 0.11683396635919696, + "grad_norm": 6.36826753616333, + "learning_rate": 8.922570919555982e-06, + "loss": 0.368, + "step": 8613 + }, + { + "epoch": 0.11684753119913185, + "grad_norm": 6.537027835845947, + "learning_rate": 8.922433876935727e-06, + "loss": 0.4054, + "step": 8614 + }, + { + "epoch": 0.11686109603906673, + "grad_norm": 5.7770185470581055, + "learning_rate": 8.922296834315473e-06, + "loss": 0.3578, + "step": 8615 + }, + { + "epoch": 0.11687466087900163, + "grad_norm": 5.482479095458984, + "learning_rate": 8.922159791695218e-06, + "loss": 0.4998, + "step": 8616 + }, + { + "epoch": 0.11688822571893652, + "grad_norm": 5.315190315246582, + "learning_rate": 8.922022749074963e-06, + "loss": 0.4116, + "step": 8617 + }, + { + "epoch": 0.1169017905588714, + "grad_norm": 7.676490783691406, + "learning_rate": 8.921885706454708e-06, + "loss": 0.4418, + "step": 8618 + }, + { + "epoch": 0.11691535539880629, + "grad_norm": 6.253461837768555, + "learning_rate": 8.921748663834453e-06, + "loss": 0.5353, + "step": 8619 + }, + { + "epoch": 0.11692892023874119, + "grad_norm": 7.636244297027588, + "learning_rate": 8.921611621214199e-06, + "loss": 0.5093, + "step": 8620 + }, + { + "epoch": 0.11694248507867608, + "grad_norm": 6.898770809173584, + "learning_rate": 8.921474578593944e-06, + "loss": 0.5169, + "step": 8621 + }, + { + "epoch": 0.11695604991861096, + "grad_norm": 5.621486663818359, + "learning_rate": 8.921337535973689e-06, + "loss": 0.3106, + "step": 8622 + }, + { + "epoch": 0.11696961475854585, + "grad_norm": 5.616532325744629, + "learning_rate": 8.921200493353434e-06, + "loss": 0.3466, + "step": 8623 + }, + { + "epoch": 0.11698317959848074, + "grad_norm": 7.21028470993042, + "learning_rate": 8.921063450733178e-06, + "loss": 0.5017, + "step": 8624 + }, + { + "epoch": 0.11699674443841562, + "grad_norm": 6.285334587097168, + "learning_rate": 8.920926408112924e-06, + "loss": 0.4761, + "step": 8625 + }, + { + "epoch": 0.11701030927835052, + "grad_norm": 7.654102325439453, + "learning_rate": 8.92078936549267e-06, + "loss": 0.4214, + "step": 8626 + }, + { + "epoch": 0.11702387411828541, + "grad_norm": 9.089051246643066, + "learning_rate": 8.920652322872415e-06, + "loss": 0.6158, + "step": 8627 + }, + { + "epoch": 0.11703743895822029, + "grad_norm": 8.684235572814941, + "learning_rate": 8.920515280252158e-06, + "loss": 0.538, + "step": 8628 + }, + { + "epoch": 0.11705100379815518, + "grad_norm": 7.802737236022949, + "learning_rate": 8.920378237631905e-06, + "loss": 0.5493, + "step": 8629 + }, + { + "epoch": 0.11706456863809007, + "grad_norm": 8.506026268005371, + "learning_rate": 8.92024119501165e-06, + "loss": 0.5459, + "step": 8630 + }, + { + "epoch": 0.11707813347802495, + "grad_norm": 6.507460117340088, + "learning_rate": 8.920104152391394e-06, + "loss": 0.504, + "step": 8631 + }, + { + "epoch": 0.11709169831795985, + "grad_norm": 5.7759881019592285, + "learning_rate": 8.919967109771139e-06, + "loss": 0.3918, + "step": 8632 + }, + { + "epoch": 0.11710526315789474, + "grad_norm": 5.845280170440674, + "learning_rate": 8.919830067150884e-06, + "loss": 0.3118, + "step": 8633 + }, + { + "epoch": 0.11711882799782962, + "grad_norm": 4.979619026184082, + "learning_rate": 8.91969302453063e-06, + "loss": 0.3709, + "step": 8634 + }, + { + "epoch": 0.11713239283776451, + "grad_norm": 5.795769691467285, + "learning_rate": 8.919555981910374e-06, + "loss": 0.3147, + "step": 8635 + }, + { + "epoch": 0.1171459576776994, + "grad_norm": 7.429089069366455, + "learning_rate": 8.91941893929012e-06, + "loss": 0.394, + "step": 8636 + }, + { + "epoch": 0.1171595225176343, + "grad_norm": 7.90415096282959, + "learning_rate": 8.919281896669865e-06, + "loss": 0.39, + "step": 8637 + }, + { + "epoch": 0.11717308735756918, + "grad_norm": 5.948373794555664, + "learning_rate": 8.91914485404961e-06, + "loss": 0.3954, + "step": 8638 + }, + { + "epoch": 0.11718665219750407, + "grad_norm": 6.230757713317871, + "learning_rate": 8.919007811429355e-06, + "loss": 0.3587, + "step": 8639 + }, + { + "epoch": 0.11720021703743896, + "grad_norm": 8.873334884643555, + "learning_rate": 8.9188707688091e-06, + "loss": 0.4246, + "step": 8640 + }, + { + "epoch": 0.11721378187737384, + "grad_norm": 4.675666809082031, + "learning_rate": 8.918733726188846e-06, + "loss": 0.3511, + "step": 8641 + }, + { + "epoch": 0.11722734671730874, + "grad_norm": 6.41331672668457, + "learning_rate": 8.91859668356859e-06, + "loss": 0.3948, + "step": 8642 + }, + { + "epoch": 0.11724091155724363, + "grad_norm": 8.636368751525879, + "learning_rate": 8.918459640948336e-06, + "loss": 0.5156, + "step": 8643 + }, + { + "epoch": 0.11725447639717851, + "grad_norm": 4.357823371887207, + "learning_rate": 8.918322598328081e-06, + "loss": 0.2876, + "step": 8644 + }, + { + "epoch": 0.1172680412371134, + "grad_norm": 5.520859241485596, + "learning_rate": 8.918185555707826e-06, + "loss": 0.3274, + "step": 8645 + }, + { + "epoch": 0.1172816060770483, + "grad_norm": 5.6457695960998535, + "learning_rate": 8.91804851308757e-06, + "loss": 0.4328, + "step": 8646 + }, + { + "epoch": 0.11729517091698317, + "grad_norm": 5.205391883850098, + "learning_rate": 8.917911470467317e-06, + "loss": 0.3475, + "step": 8647 + }, + { + "epoch": 0.11730873575691807, + "grad_norm": 7.350625514984131, + "learning_rate": 8.917774427847062e-06, + "loss": 0.6363, + "step": 8648 + }, + { + "epoch": 0.11732230059685296, + "grad_norm": 7.186862468719482, + "learning_rate": 8.917637385226805e-06, + "loss": 0.5344, + "step": 8649 + }, + { + "epoch": 0.11733586543678784, + "grad_norm": 6.233736038208008, + "learning_rate": 8.91750034260655e-06, + "loss": 0.4215, + "step": 8650 + }, + { + "epoch": 0.11734943027672273, + "grad_norm": 7.031248569488525, + "learning_rate": 8.917363299986297e-06, + "loss": 0.3789, + "step": 8651 + }, + { + "epoch": 0.11736299511665763, + "grad_norm": 6.082882404327393, + "learning_rate": 8.917226257366043e-06, + "loss": 0.3447, + "step": 8652 + }, + { + "epoch": 0.11737655995659252, + "grad_norm": 5.349556922912598, + "learning_rate": 8.917089214745786e-06, + "loss": 0.3558, + "step": 8653 + }, + { + "epoch": 0.1173901247965274, + "grad_norm": 5.494316101074219, + "learning_rate": 8.916952172125531e-06, + "loss": 0.3327, + "step": 8654 + }, + { + "epoch": 0.11740368963646229, + "grad_norm": 5.88250207901001, + "learning_rate": 8.916815129505278e-06, + "loss": 0.405, + "step": 8655 + }, + { + "epoch": 0.11741725447639718, + "grad_norm": 5.0993332862854, + "learning_rate": 8.916678086885022e-06, + "loss": 0.331, + "step": 8656 + }, + { + "epoch": 0.11743081931633206, + "grad_norm": 6.3230462074279785, + "learning_rate": 8.916541044264767e-06, + "loss": 0.317, + "step": 8657 + }, + { + "epoch": 0.11744438415626696, + "grad_norm": 5.73950719833374, + "learning_rate": 8.916404001644512e-06, + "loss": 0.4318, + "step": 8658 + }, + { + "epoch": 0.11745794899620185, + "grad_norm": 5.685035705566406, + "learning_rate": 8.916266959024257e-06, + "loss": 0.4194, + "step": 8659 + }, + { + "epoch": 0.11747151383613673, + "grad_norm": 8.757404327392578, + "learning_rate": 8.916129916404002e-06, + "loss": 0.5594, + "step": 8660 + }, + { + "epoch": 0.11748507867607162, + "grad_norm": 4.2500505447387695, + "learning_rate": 8.915992873783747e-06, + "loss": 0.2261, + "step": 8661 + }, + { + "epoch": 0.11749864351600652, + "grad_norm": 5.846367835998535, + "learning_rate": 8.915855831163493e-06, + "loss": 0.3974, + "step": 8662 + }, + { + "epoch": 0.1175122083559414, + "grad_norm": 7.831251621246338, + "learning_rate": 8.915718788543238e-06, + "loss": 0.406, + "step": 8663 + }, + { + "epoch": 0.11752577319587629, + "grad_norm": 4.684423923492432, + "learning_rate": 8.915581745922983e-06, + "loss": 0.277, + "step": 8664 + }, + { + "epoch": 0.11753933803581118, + "grad_norm": 6.916868686676025, + "learning_rate": 8.915444703302728e-06, + "loss": 0.5125, + "step": 8665 + }, + { + "epoch": 0.11755290287574606, + "grad_norm": 7.832724571228027, + "learning_rate": 8.915307660682473e-06, + "loss": 0.395, + "step": 8666 + }, + { + "epoch": 0.11756646771568095, + "grad_norm": 4.924279689788818, + "learning_rate": 8.915170618062219e-06, + "loss": 0.3728, + "step": 8667 + }, + { + "epoch": 0.11758003255561585, + "grad_norm": 5.629134178161621, + "learning_rate": 8.915033575441964e-06, + "loss": 0.3922, + "step": 8668 + }, + { + "epoch": 0.11759359739555074, + "grad_norm": 5.886200428009033, + "learning_rate": 8.914896532821709e-06, + "loss": 0.2681, + "step": 8669 + }, + { + "epoch": 0.11760716223548562, + "grad_norm": 7.385434627532959, + "learning_rate": 8.914759490201454e-06, + "loss": 0.4226, + "step": 8670 + }, + { + "epoch": 0.11762072707542051, + "grad_norm": 6.801045894622803, + "learning_rate": 8.914622447581198e-06, + "loss": 0.3115, + "step": 8671 + }, + { + "epoch": 0.1176342919153554, + "grad_norm": 6.820047378540039, + "learning_rate": 8.914485404960943e-06, + "loss": 0.4265, + "step": 8672 + }, + { + "epoch": 0.11764785675529028, + "grad_norm": 5.943828582763672, + "learning_rate": 8.91434836234069e-06, + "loss": 0.4057, + "step": 8673 + }, + { + "epoch": 0.11766142159522518, + "grad_norm": 5.837649345397949, + "learning_rate": 8.914211319720433e-06, + "loss": 0.4379, + "step": 8674 + }, + { + "epoch": 0.11767498643516007, + "grad_norm": 6.017063617706299, + "learning_rate": 8.914074277100178e-06, + "loss": 0.4254, + "step": 8675 + }, + { + "epoch": 0.11768855127509495, + "grad_norm": 8.015361785888672, + "learning_rate": 8.913937234479923e-06, + "loss": 0.4845, + "step": 8676 + }, + { + "epoch": 0.11770211611502984, + "grad_norm": 6.960083484649658, + "learning_rate": 8.91380019185967e-06, + "loss": 0.4979, + "step": 8677 + }, + { + "epoch": 0.11771568095496474, + "grad_norm": 6.864342212677002, + "learning_rate": 8.913663149239414e-06, + "loss": 0.3019, + "step": 8678 + }, + { + "epoch": 0.11772924579489961, + "grad_norm": 4.052264213562012, + "learning_rate": 8.913526106619159e-06, + "loss": 0.1943, + "step": 8679 + }, + { + "epoch": 0.11774281063483451, + "grad_norm": 5.639876365661621, + "learning_rate": 8.913389063998904e-06, + "loss": 0.4254, + "step": 8680 + }, + { + "epoch": 0.1177563754747694, + "grad_norm": 4.476650714874268, + "learning_rate": 8.91325202137865e-06, + "loss": 0.2788, + "step": 8681 + }, + { + "epoch": 0.11776994031470428, + "grad_norm": 5.530372619628906, + "learning_rate": 8.913114978758395e-06, + "loss": 0.2955, + "step": 8682 + }, + { + "epoch": 0.11778350515463917, + "grad_norm": 5.871151924133301, + "learning_rate": 8.91297793613814e-06, + "loss": 0.3082, + "step": 8683 + }, + { + "epoch": 0.11779706999457407, + "grad_norm": 8.608987808227539, + "learning_rate": 8.912840893517885e-06, + "loss": 0.3411, + "step": 8684 + }, + { + "epoch": 0.11781063483450896, + "grad_norm": 6.449301719665527, + "learning_rate": 8.91270385089763e-06, + "loss": 0.4697, + "step": 8685 + }, + { + "epoch": 0.11782419967444384, + "grad_norm": 4.713799953460693, + "learning_rate": 8.912566808277375e-06, + "loss": 0.3192, + "step": 8686 + }, + { + "epoch": 0.11783776451437873, + "grad_norm": 6.767887592315674, + "learning_rate": 8.91242976565712e-06, + "loss": 0.2981, + "step": 8687 + }, + { + "epoch": 0.11785132935431362, + "grad_norm": 5.390842437744141, + "learning_rate": 8.912292723036866e-06, + "loss": 0.319, + "step": 8688 + }, + { + "epoch": 0.1178648941942485, + "grad_norm": 6.104610443115234, + "learning_rate": 8.912155680416609e-06, + "loss": 0.2843, + "step": 8689 + }, + { + "epoch": 0.1178784590341834, + "grad_norm": 5.918211936950684, + "learning_rate": 8.912018637796356e-06, + "loss": 0.3109, + "step": 8690 + }, + { + "epoch": 0.11789202387411829, + "grad_norm": 4.825708866119385, + "learning_rate": 8.911881595176101e-06, + "loss": 0.2434, + "step": 8691 + }, + { + "epoch": 0.11790558871405317, + "grad_norm": 6.614751815795898, + "learning_rate": 8.911744552555846e-06, + "loss": 0.4084, + "step": 8692 + }, + { + "epoch": 0.11791915355398806, + "grad_norm": 5.5621418952941895, + "learning_rate": 8.91160750993559e-06, + "loss": 0.2659, + "step": 8693 + }, + { + "epoch": 0.11793271839392296, + "grad_norm": 7.816161155700684, + "learning_rate": 8.911470467315337e-06, + "loss": 0.4222, + "step": 8694 + }, + { + "epoch": 0.11794628323385783, + "grad_norm": 5.4435577392578125, + "learning_rate": 8.911333424695082e-06, + "loss": 0.2992, + "step": 8695 + }, + { + "epoch": 0.11795984807379273, + "grad_norm": 6.946145534515381, + "learning_rate": 8.911196382074825e-06, + "loss": 0.3741, + "step": 8696 + }, + { + "epoch": 0.11797341291372762, + "grad_norm": 5.89337682723999, + "learning_rate": 8.91105933945457e-06, + "loss": 0.2827, + "step": 8697 + }, + { + "epoch": 0.1179869777536625, + "grad_norm": 4.863218307495117, + "learning_rate": 8.910922296834317e-06, + "loss": 0.2464, + "step": 8698 + }, + { + "epoch": 0.1180005425935974, + "grad_norm": 5.901236057281494, + "learning_rate": 8.910785254214061e-06, + "loss": 0.403, + "step": 8699 + }, + { + "epoch": 0.11801410743353229, + "grad_norm": 7.468112468719482, + "learning_rate": 8.910648211593806e-06, + "loss": 0.3294, + "step": 8700 + }, + { + "epoch": 0.11802767227346718, + "grad_norm": 7.0212788581848145, + "learning_rate": 8.910511168973551e-06, + "loss": 0.3739, + "step": 8701 + }, + { + "epoch": 0.11804123711340206, + "grad_norm": 5.818696022033691, + "learning_rate": 8.910374126353296e-06, + "loss": 0.3292, + "step": 8702 + }, + { + "epoch": 0.11805480195333695, + "grad_norm": 5.028084754943848, + "learning_rate": 8.910237083733042e-06, + "loss": 0.3398, + "step": 8703 + }, + { + "epoch": 0.11806836679327185, + "grad_norm": 5.656143665313721, + "learning_rate": 8.910100041112787e-06, + "loss": 0.3112, + "step": 8704 + }, + { + "epoch": 0.11808193163320672, + "grad_norm": 5.130002975463867, + "learning_rate": 8.909962998492532e-06, + "loss": 0.2252, + "step": 8705 + }, + { + "epoch": 0.11809549647314162, + "grad_norm": 6.511909008026123, + "learning_rate": 8.909825955872277e-06, + "loss": 0.4936, + "step": 8706 + }, + { + "epoch": 0.11810906131307651, + "grad_norm": 6.394510746002197, + "learning_rate": 8.909688913252022e-06, + "loss": 0.3642, + "step": 8707 + }, + { + "epoch": 0.11812262615301139, + "grad_norm": 9.243715286254883, + "learning_rate": 8.909551870631767e-06, + "loss": 0.3462, + "step": 8708 + }, + { + "epoch": 0.11813619099294628, + "grad_norm": 4.902815341949463, + "learning_rate": 8.909414828011513e-06, + "loss": 0.2249, + "step": 8709 + }, + { + "epoch": 0.11814975583288118, + "grad_norm": 6.814828872680664, + "learning_rate": 8.909277785391258e-06, + "loss": 0.4949, + "step": 8710 + }, + { + "epoch": 0.11816332067281606, + "grad_norm": 5.949851036071777, + "learning_rate": 8.909140742771003e-06, + "loss": 0.3242, + "step": 8711 + }, + { + "epoch": 0.11817688551275095, + "grad_norm": 5.5174479484558105, + "learning_rate": 8.909003700150748e-06, + "loss": 0.3635, + "step": 8712 + }, + { + "epoch": 0.11819045035268584, + "grad_norm": 6.13067626953125, + "learning_rate": 8.908866657530493e-06, + "loss": 0.4554, + "step": 8713 + }, + { + "epoch": 0.11820401519262072, + "grad_norm": 5.8825602531433105, + "learning_rate": 8.908729614910237e-06, + "loss": 0.2675, + "step": 8714 + }, + { + "epoch": 0.11821758003255561, + "grad_norm": 5.885464191436768, + "learning_rate": 8.908592572289982e-06, + "loss": 0.4082, + "step": 8715 + }, + { + "epoch": 0.1182311448724905, + "grad_norm": 5.775771617889404, + "learning_rate": 8.908455529669729e-06, + "loss": 0.4117, + "step": 8716 + }, + { + "epoch": 0.1182447097124254, + "grad_norm": 5.780278205871582, + "learning_rate": 8.908318487049472e-06, + "loss": 0.3568, + "step": 8717 + }, + { + "epoch": 0.11825827455236028, + "grad_norm": 5.013524055480957, + "learning_rate": 8.908181444429218e-06, + "loss": 0.228, + "step": 8718 + }, + { + "epoch": 0.11827183939229517, + "grad_norm": 5.997282028198242, + "learning_rate": 8.908044401808963e-06, + "loss": 0.423, + "step": 8719 + }, + { + "epoch": 0.11828540423223007, + "grad_norm": 5.713600158691406, + "learning_rate": 8.90790735918871e-06, + "loss": 0.2678, + "step": 8720 + }, + { + "epoch": 0.11829896907216494, + "grad_norm": 4.844542980194092, + "learning_rate": 8.907770316568453e-06, + "loss": 0.391, + "step": 8721 + }, + { + "epoch": 0.11831253391209984, + "grad_norm": 5.493208885192871, + "learning_rate": 8.907633273948198e-06, + "loss": 0.4826, + "step": 8722 + }, + { + "epoch": 0.11832609875203473, + "grad_norm": 7.067254066467285, + "learning_rate": 8.907496231327943e-06, + "loss": 0.4233, + "step": 8723 + }, + { + "epoch": 0.11833966359196961, + "grad_norm": 4.657031536102295, + "learning_rate": 8.907359188707689e-06, + "loss": 0.3033, + "step": 8724 + }, + { + "epoch": 0.1183532284319045, + "grad_norm": 4.645669937133789, + "learning_rate": 8.907222146087434e-06, + "loss": 0.287, + "step": 8725 + }, + { + "epoch": 0.1183667932718394, + "grad_norm": 6.676882743835449, + "learning_rate": 8.907085103467179e-06, + "loss": 0.3964, + "step": 8726 + }, + { + "epoch": 0.11838035811177428, + "grad_norm": 6.381921768188477, + "learning_rate": 8.906948060846924e-06, + "loss": 0.4745, + "step": 8727 + }, + { + "epoch": 0.11839392295170917, + "grad_norm": 4.913093566894531, + "learning_rate": 8.90681101822667e-06, + "loss": 0.4609, + "step": 8728 + }, + { + "epoch": 0.11840748779164406, + "grad_norm": 5.4509196281433105, + "learning_rate": 8.906673975606415e-06, + "loss": 0.3717, + "step": 8729 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 4.535305976867676, + "learning_rate": 8.90653693298616e-06, + "loss": 0.2202, + "step": 8730 + }, + { + "epoch": 0.11843461747151383, + "grad_norm": 5.420087814331055, + "learning_rate": 8.906399890365905e-06, + "loss": 0.4317, + "step": 8731 + }, + { + "epoch": 0.11844818231144873, + "grad_norm": 5.489703178405762, + "learning_rate": 8.906262847745648e-06, + "loss": 0.3349, + "step": 8732 + }, + { + "epoch": 0.11846174715138362, + "grad_norm": 4.907985687255859, + "learning_rate": 8.906125805125395e-06, + "loss": 0.3774, + "step": 8733 + }, + { + "epoch": 0.1184753119913185, + "grad_norm": 5.183343887329102, + "learning_rate": 8.90598876250514e-06, + "loss": 0.3207, + "step": 8734 + }, + { + "epoch": 0.11848887683125339, + "grad_norm": 6.210276126861572, + "learning_rate": 8.905851719884886e-06, + "loss": 0.3997, + "step": 8735 + }, + { + "epoch": 0.11850244167118829, + "grad_norm": 6.589163303375244, + "learning_rate": 8.905714677264629e-06, + "loss": 0.4182, + "step": 8736 + }, + { + "epoch": 0.11851600651112316, + "grad_norm": 7.630585670471191, + "learning_rate": 8.905577634644376e-06, + "loss": 0.4428, + "step": 8737 + }, + { + "epoch": 0.11852957135105806, + "grad_norm": 3.6015408039093018, + "learning_rate": 8.905440592024121e-06, + "loss": 0.3002, + "step": 8738 + }, + { + "epoch": 0.11854313619099295, + "grad_norm": 6.909524917602539, + "learning_rate": 8.905303549403865e-06, + "loss": 0.5407, + "step": 8739 + }, + { + "epoch": 0.11855670103092783, + "grad_norm": 4.912966728210449, + "learning_rate": 8.90516650678361e-06, + "loss": 0.3014, + "step": 8740 + }, + { + "epoch": 0.11857026587086272, + "grad_norm": 7.054931163787842, + "learning_rate": 8.905029464163355e-06, + "loss": 0.5146, + "step": 8741 + }, + { + "epoch": 0.11858383071079762, + "grad_norm": 5.285982608795166, + "learning_rate": 8.9048924215431e-06, + "loss": 0.3855, + "step": 8742 + }, + { + "epoch": 0.1185973955507325, + "grad_norm": 5.740163803100586, + "learning_rate": 8.904755378922845e-06, + "loss": 0.3954, + "step": 8743 + }, + { + "epoch": 0.11861096039066739, + "grad_norm": 6.190319538116455, + "learning_rate": 8.90461833630259e-06, + "loss": 0.3595, + "step": 8744 + }, + { + "epoch": 0.11862452523060228, + "grad_norm": 5.102706432342529, + "learning_rate": 8.904481293682336e-06, + "loss": 0.4847, + "step": 8745 + }, + { + "epoch": 0.11863809007053716, + "grad_norm": 5.089355945587158, + "learning_rate": 8.904344251062081e-06, + "loss": 0.2541, + "step": 8746 + }, + { + "epoch": 0.11865165491047205, + "grad_norm": 4.462001800537109, + "learning_rate": 8.904207208441826e-06, + "loss": 0.349, + "step": 8747 + }, + { + "epoch": 0.11866521975040695, + "grad_norm": 5.57717752456665, + "learning_rate": 8.904070165821571e-06, + "loss": 0.4612, + "step": 8748 + }, + { + "epoch": 0.11867878459034184, + "grad_norm": 6.351544380187988, + "learning_rate": 8.903933123201316e-06, + "loss": 0.4819, + "step": 8749 + }, + { + "epoch": 0.11869234943027672, + "grad_norm": 9.027302742004395, + "learning_rate": 8.903796080581062e-06, + "loss": 0.5791, + "step": 8750 + }, + { + "epoch": 0.11870591427021161, + "grad_norm": 5.457905292510986, + "learning_rate": 8.903659037960807e-06, + "loss": 0.4991, + "step": 8751 + }, + { + "epoch": 0.1187194791101465, + "grad_norm": 6.43155574798584, + "learning_rate": 8.903521995340552e-06, + "loss": 0.5015, + "step": 8752 + }, + { + "epoch": 0.11873304395008139, + "grad_norm": 7.3397111892700195, + "learning_rate": 8.903384952720297e-06, + "loss": 0.5094, + "step": 8753 + }, + { + "epoch": 0.11874660879001628, + "grad_norm": 4.512802600860596, + "learning_rate": 8.90324791010004e-06, + "loss": 0.3044, + "step": 8754 + }, + { + "epoch": 0.11876017362995117, + "grad_norm": 4.797166347503662, + "learning_rate": 8.903110867479787e-06, + "loss": 0.3867, + "step": 8755 + }, + { + "epoch": 0.11877373846988605, + "grad_norm": 5.451875686645508, + "learning_rate": 8.902973824859533e-06, + "loss": 0.3413, + "step": 8756 + }, + { + "epoch": 0.11878730330982094, + "grad_norm": 6.547930717468262, + "learning_rate": 8.902836782239276e-06, + "loss": 0.4287, + "step": 8757 + }, + { + "epoch": 0.11880086814975584, + "grad_norm": 6.215541839599609, + "learning_rate": 8.902699739619021e-06, + "loss": 0.3091, + "step": 8758 + }, + { + "epoch": 0.11881443298969072, + "grad_norm": 4.707695484161377, + "learning_rate": 8.902562696998768e-06, + "loss": 0.2823, + "step": 8759 + }, + { + "epoch": 0.11882799782962561, + "grad_norm": 6.856766700744629, + "learning_rate": 8.902425654378513e-06, + "loss": 0.4124, + "step": 8760 + }, + { + "epoch": 0.1188415626695605, + "grad_norm": 6.834644794464111, + "learning_rate": 8.902288611758257e-06, + "loss": 0.3247, + "step": 8761 + }, + { + "epoch": 0.11885512750949538, + "grad_norm": 5.981697082519531, + "learning_rate": 8.902151569138002e-06, + "loss": 0.2401, + "step": 8762 + }, + { + "epoch": 0.11886869234943027, + "grad_norm": 6.847561359405518, + "learning_rate": 8.902014526517749e-06, + "loss": 0.4761, + "step": 8763 + }, + { + "epoch": 0.11888225718936517, + "grad_norm": 6.1097307205200195, + "learning_rate": 8.901877483897492e-06, + "loss": 0.3229, + "step": 8764 + }, + { + "epoch": 0.11889582202930006, + "grad_norm": 5.037081718444824, + "learning_rate": 8.901740441277238e-06, + "loss": 0.3105, + "step": 8765 + }, + { + "epoch": 0.11890938686923494, + "grad_norm": 7.938305377960205, + "learning_rate": 8.901603398656983e-06, + "loss": 0.3432, + "step": 8766 + }, + { + "epoch": 0.11892295170916983, + "grad_norm": 5.101083755493164, + "learning_rate": 8.901466356036728e-06, + "loss": 0.4207, + "step": 8767 + }, + { + "epoch": 0.11893651654910473, + "grad_norm": 5.133700847625732, + "learning_rate": 8.901329313416473e-06, + "loss": 0.2505, + "step": 8768 + }, + { + "epoch": 0.1189500813890396, + "grad_norm": 6.517621040344238, + "learning_rate": 8.901192270796218e-06, + "loss": 0.3113, + "step": 8769 + }, + { + "epoch": 0.1189636462289745, + "grad_norm": 5.311714172363281, + "learning_rate": 8.901055228175963e-06, + "loss": 0.3419, + "step": 8770 + }, + { + "epoch": 0.11897721106890939, + "grad_norm": 6.59169864654541, + "learning_rate": 8.900918185555709e-06, + "loss": 0.3213, + "step": 8771 + }, + { + "epoch": 0.11899077590884427, + "grad_norm": 5.7038445472717285, + "learning_rate": 8.900781142935454e-06, + "loss": 0.321, + "step": 8772 + }, + { + "epoch": 0.11900434074877916, + "grad_norm": 6.484609603881836, + "learning_rate": 8.900644100315199e-06, + "loss": 0.3432, + "step": 8773 + }, + { + "epoch": 0.11901790558871406, + "grad_norm": 5.342787265777588, + "learning_rate": 8.900507057694944e-06, + "loss": 0.3348, + "step": 8774 + }, + { + "epoch": 0.11903147042864894, + "grad_norm": 5.5178751945495605, + "learning_rate": 8.90037001507469e-06, + "loss": 0.2978, + "step": 8775 + }, + { + "epoch": 0.11904503526858383, + "grad_norm": 8.931994438171387, + "learning_rate": 8.900232972454435e-06, + "loss": 0.5807, + "step": 8776 + }, + { + "epoch": 0.11905860010851872, + "grad_norm": 5.014712810516357, + "learning_rate": 8.90009592983418e-06, + "loss": 0.3402, + "step": 8777 + }, + { + "epoch": 0.1190721649484536, + "grad_norm": 6.701290130615234, + "learning_rate": 8.899958887213925e-06, + "loss": 0.4393, + "step": 8778 + }, + { + "epoch": 0.1190857297883885, + "grad_norm": 8.216998100280762, + "learning_rate": 8.899821844593668e-06, + "loss": 0.4222, + "step": 8779 + }, + { + "epoch": 0.11909929462832339, + "grad_norm": 6.692327976226807, + "learning_rate": 8.899684801973415e-06, + "loss": 0.3498, + "step": 8780 + }, + { + "epoch": 0.11911285946825828, + "grad_norm": 5.495484828948975, + "learning_rate": 8.89954775935316e-06, + "loss": 0.2939, + "step": 8781 + }, + { + "epoch": 0.11912642430819316, + "grad_norm": 5.81107759475708, + "learning_rate": 8.899410716732904e-06, + "loss": 0.3433, + "step": 8782 + }, + { + "epoch": 0.11913998914812805, + "grad_norm": 5.835959434509277, + "learning_rate": 8.899273674112649e-06, + "loss": 0.3926, + "step": 8783 + }, + { + "epoch": 0.11915355398806295, + "grad_norm": 5.609464645385742, + "learning_rate": 8.899136631492394e-06, + "loss": 0.3843, + "step": 8784 + }, + { + "epoch": 0.11916711882799783, + "grad_norm": 5.6295647621154785, + "learning_rate": 8.898999588872141e-06, + "loss": 0.4784, + "step": 8785 + }, + { + "epoch": 0.11918068366793272, + "grad_norm": 5.3232855796813965, + "learning_rate": 8.898862546251885e-06, + "loss": 0.2652, + "step": 8786 + }, + { + "epoch": 0.11919424850786761, + "grad_norm": 5.056864261627197, + "learning_rate": 8.89872550363163e-06, + "loss": 0.3633, + "step": 8787 + }, + { + "epoch": 0.11920781334780249, + "grad_norm": 5.957204341888428, + "learning_rate": 8.898588461011375e-06, + "loss": 0.3739, + "step": 8788 + }, + { + "epoch": 0.11922137818773738, + "grad_norm": 6.640449047088623, + "learning_rate": 8.89845141839112e-06, + "loss": 0.3426, + "step": 8789 + }, + { + "epoch": 0.11923494302767228, + "grad_norm": 5.514519214630127, + "learning_rate": 8.898314375770865e-06, + "loss": 0.4474, + "step": 8790 + }, + { + "epoch": 0.11924850786760716, + "grad_norm": 5.348637580871582, + "learning_rate": 8.89817733315061e-06, + "loss": 0.4132, + "step": 8791 + }, + { + "epoch": 0.11926207270754205, + "grad_norm": 5.748797416687012, + "learning_rate": 8.898040290530356e-06, + "loss": 0.4239, + "step": 8792 + }, + { + "epoch": 0.11927563754747694, + "grad_norm": 7.144001007080078, + "learning_rate": 8.897903247910101e-06, + "loss": 0.4128, + "step": 8793 + }, + { + "epoch": 0.11928920238741182, + "grad_norm": 6.3691277503967285, + "learning_rate": 8.897766205289846e-06, + "loss": 0.4453, + "step": 8794 + }, + { + "epoch": 0.11930276722734671, + "grad_norm": 4.250207424163818, + "learning_rate": 8.897629162669591e-06, + "loss": 0.3401, + "step": 8795 + }, + { + "epoch": 0.11931633206728161, + "grad_norm": 8.125743865966797, + "learning_rate": 8.897492120049336e-06, + "loss": 0.4393, + "step": 8796 + }, + { + "epoch": 0.1193298969072165, + "grad_norm": 5.2263312339782715, + "learning_rate": 8.89735507742908e-06, + "loss": 0.4079, + "step": 8797 + }, + { + "epoch": 0.11934346174715138, + "grad_norm": 6.858083724975586, + "learning_rate": 8.897218034808827e-06, + "loss": 0.5302, + "step": 8798 + }, + { + "epoch": 0.11935702658708627, + "grad_norm": 6.760918140411377, + "learning_rate": 8.897080992188572e-06, + "loss": 0.4562, + "step": 8799 + }, + { + "epoch": 0.11937059142702117, + "grad_norm": 5.127370357513428, + "learning_rate": 8.896943949568317e-06, + "loss": 0.4014, + "step": 8800 + }, + { + "epoch": 0.11938415626695605, + "grad_norm": 6.21633243560791, + "learning_rate": 8.89680690694806e-06, + "loss": 0.3482, + "step": 8801 + }, + { + "epoch": 0.11939772110689094, + "grad_norm": 6.595790386199951, + "learning_rate": 8.896669864327808e-06, + "loss": 0.5033, + "step": 8802 + }, + { + "epoch": 0.11941128594682583, + "grad_norm": 6.032820701599121, + "learning_rate": 8.896532821707553e-06, + "loss": 0.3379, + "step": 8803 + }, + { + "epoch": 0.11942485078676071, + "grad_norm": 5.909610271453857, + "learning_rate": 8.896395779087296e-06, + "loss": 0.3747, + "step": 8804 + }, + { + "epoch": 0.1194384156266956, + "grad_norm": 7.993870735168457, + "learning_rate": 8.896258736467041e-06, + "loss": 0.5432, + "step": 8805 + }, + { + "epoch": 0.1194519804666305, + "grad_norm": 7.652669906616211, + "learning_rate": 8.896121693846788e-06, + "loss": 0.6301, + "step": 8806 + }, + { + "epoch": 0.11946554530656538, + "grad_norm": 6.890285491943359, + "learning_rate": 8.895984651226532e-06, + "loss": 0.375, + "step": 8807 + }, + { + "epoch": 0.11947911014650027, + "grad_norm": 8.647211074829102, + "learning_rate": 8.895847608606277e-06, + "loss": 0.2981, + "step": 8808 + }, + { + "epoch": 0.11949267498643516, + "grad_norm": 5.619905471801758, + "learning_rate": 8.895710565986022e-06, + "loss": 0.4296, + "step": 8809 + }, + { + "epoch": 0.11950623982637004, + "grad_norm": 6.67237663269043, + "learning_rate": 8.895573523365767e-06, + "loss": 0.4411, + "step": 8810 + }, + { + "epoch": 0.11951980466630494, + "grad_norm": 5.588397026062012, + "learning_rate": 8.895436480745512e-06, + "loss": 0.3212, + "step": 8811 + }, + { + "epoch": 0.11953336950623983, + "grad_norm": 6.000301837921143, + "learning_rate": 8.895299438125258e-06, + "loss": 0.4306, + "step": 8812 + }, + { + "epoch": 0.11954693434617472, + "grad_norm": 5.253604412078857, + "learning_rate": 8.895162395505003e-06, + "loss": 0.358, + "step": 8813 + }, + { + "epoch": 0.1195604991861096, + "grad_norm": 7.425019264221191, + "learning_rate": 8.895025352884748e-06, + "loss": 0.5072, + "step": 8814 + }, + { + "epoch": 0.1195740640260445, + "grad_norm": 7.212739944458008, + "learning_rate": 8.894888310264493e-06, + "loss": 0.5211, + "step": 8815 + }, + { + "epoch": 0.11958762886597939, + "grad_norm": 5.9600419998168945, + "learning_rate": 8.894751267644238e-06, + "loss": 0.3272, + "step": 8816 + }, + { + "epoch": 0.11960119370591427, + "grad_norm": 6.486701965332031, + "learning_rate": 8.894614225023983e-06, + "loss": 0.4232, + "step": 8817 + }, + { + "epoch": 0.11961475854584916, + "grad_norm": 6.888497829437256, + "learning_rate": 8.894477182403729e-06, + "loss": 0.5909, + "step": 8818 + }, + { + "epoch": 0.11962832338578405, + "grad_norm": 6.279491424560547, + "learning_rate": 8.894340139783474e-06, + "loss": 0.365, + "step": 8819 + }, + { + "epoch": 0.11964188822571893, + "grad_norm": 8.11490535736084, + "learning_rate": 8.894203097163219e-06, + "loss": 0.4542, + "step": 8820 + }, + { + "epoch": 0.11965545306565382, + "grad_norm": 4.613699913024902, + "learning_rate": 8.894066054542964e-06, + "loss": 0.2962, + "step": 8821 + }, + { + "epoch": 0.11966901790558872, + "grad_norm": 5.960657119750977, + "learning_rate": 8.893929011922708e-06, + "loss": 0.3889, + "step": 8822 + }, + { + "epoch": 0.1196825827455236, + "grad_norm": 6.161025047302246, + "learning_rate": 8.893791969302453e-06, + "loss": 0.4671, + "step": 8823 + }, + { + "epoch": 0.11969614758545849, + "grad_norm": 5.934390544891357, + "learning_rate": 8.8936549266822e-06, + "loss": 0.4146, + "step": 8824 + }, + { + "epoch": 0.11970971242539338, + "grad_norm": 6.650820732116699, + "learning_rate": 8.893517884061943e-06, + "loss": 0.4303, + "step": 8825 + }, + { + "epoch": 0.11972327726532826, + "grad_norm": 8.183843612670898, + "learning_rate": 8.893380841441688e-06, + "loss": 0.3508, + "step": 8826 + }, + { + "epoch": 0.11973684210526316, + "grad_norm": 4.338287830352783, + "learning_rate": 8.893243798821434e-06, + "loss": 0.2491, + "step": 8827 + }, + { + "epoch": 0.11975040694519805, + "grad_norm": 6.535916328430176, + "learning_rate": 8.89310675620118e-06, + "loss": 0.4456, + "step": 8828 + }, + { + "epoch": 0.11976397178513294, + "grad_norm": 4.285501003265381, + "learning_rate": 8.892969713580924e-06, + "loss": 0.2545, + "step": 8829 + }, + { + "epoch": 0.11977753662506782, + "grad_norm": 6.525630950927734, + "learning_rate": 8.892832670960669e-06, + "loss": 0.3846, + "step": 8830 + }, + { + "epoch": 0.11979110146500271, + "grad_norm": 5.863311290740967, + "learning_rate": 8.892695628340414e-06, + "loss": 0.2828, + "step": 8831 + }, + { + "epoch": 0.11980466630493761, + "grad_norm": 7.0193095207214355, + "learning_rate": 8.89255858572016e-06, + "loss": 0.2619, + "step": 8832 + }, + { + "epoch": 0.11981823114487249, + "grad_norm": 6.430945873260498, + "learning_rate": 8.892421543099905e-06, + "loss": 0.3488, + "step": 8833 + }, + { + "epoch": 0.11983179598480738, + "grad_norm": 6.76633882522583, + "learning_rate": 8.89228450047965e-06, + "loss": 0.4671, + "step": 8834 + }, + { + "epoch": 0.11984536082474227, + "grad_norm": 7.113648891448975, + "learning_rate": 8.892147457859395e-06, + "loss": 0.4749, + "step": 8835 + }, + { + "epoch": 0.11985892566467715, + "grad_norm": 8.167976379394531, + "learning_rate": 8.89201041523914e-06, + "loss": 0.4114, + "step": 8836 + }, + { + "epoch": 0.11987249050461204, + "grad_norm": 6.406024932861328, + "learning_rate": 8.891873372618885e-06, + "loss": 0.3753, + "step": 8837 + }, + { + "epoch": 0.11988605534454694, + "grad_norm": 5.606475353240967, + "learning_rate": 8.89173632999863e-06, + "loss": 0.3966, + "step": 8838 + }, + { + "epoch": 0.11989962018448182, + "grad_norm": 8.404739379882812, + "learning_rate": 8.891599287378376e-06, + "loss": 0.4841, + "step": 8839 + }, + { + "epoch": 0.11991318502441671, + "grad_norm": 5.080089092254639, + "learning_rate": 8.89146224475812e-06, + "loss": 0.2323, + "step": 8840 + }, + { + "epoch": 0.1199267498643516, + "grad_norm": 6.521341800689697, + "learning_rate": 8.891325202137866e-06, + "loss": 0.2559, + "step": 8841 + }, + { + "epoch": 0.11994031470428648, + "grad_norm": 6.305020809173584, + "learning_rate": 8.891188159517611e-06, + "loss": 0.3328, + "step": 8842 + }, + { + "epoch": 0.11995387954422138, + "grad_norm": 6.987460136413574, + "learning_rate": 8.891051116897356e-06, + "loss": 0.3205, + "step": 8843 + }, + { + "epoch": 0.11996744438415627, + "grad_norm": 6.851687431335449, + "learning_rate": 8.8909140742771e-06, + "loss": 0.5221, + "step": 8844 + }, + { + "epoch": 0.11998100922409116, + "grad_norm": 5.141279697418213, + "learning_rate": 8.890777031656847e-06, + "loss": 0.3283, + "step": 8845 + }, + { + "epoch": 0.11999457406402604, + "grad_norm": 4.73252534866333, + "learning_rate": 8.890639989036592e-06, + "loss": 0.2327, + "step": 8846 + }, + { + "epoch": 0.12000813890396093, + "grad_norm": 4.8737335205078125, + "learning_rate": 8.890502946416335e-06, + "loss": 0.2834, + "step": 8847 + }, + { + "epoch": 0.12002170374389583, + "grad_norm": 6.2668585777282715, + "learning_rate": 8.89036590379608e-06, + "loss": 0.2975, + "step": 8848 + }, + { + "epoch": 0.1200352685838307, + "grad_norm": 5.536289691925049, + "learning_rate": 8.890228861175828e-06, + "loss": 0.3659, + "step": 8849 + }, + { + "epoch": 0.1200488334237656, + "grad_norm": 6.494061470031738, + "learning_rate": 8.890091818555571e-06, + "loss": 0.5043, + "step": 8850 + }, + { + "epoch": 0.12006239826370049, + "grad_norm": 7.570091247558594, + "learning_rate": 8.889954775935316e-06, + "loss": 0.4507, + "step": 8851 + }, + { + "epoch": 0.12007596310363537, + "grad_norm": 4.734752655029297, + "learning_rate": 8.889817733315061e-06, + "loss": 0.3555, + "step": 8852 + }, + { + "epoch": 0.12008952794357027, + "grad_norm": 5.768855571746826, + "learning_rate": 8.889680690694807e-06, + "loss": 0.2926, + "step": 8853 + }, + { + "epoch": 0.12010309278350516, + "grad_norm": 6.157398223876953, + "learning_rate": 8.889543648074552e-06, + "loss": 0.2724, + "step": 8854 + }, + { + "epoch": 0.12011665762344004, + "grad_norm": 4.96786642074585, + "learning_rate": 8.889406605454297e-06, + "loss": 0.3136, + "step": 8855 + }, + { + "epoch": 0.12013022246337493, + "grad_norm": 6.657893180847168, + "learning_rate": 8.889269562834042e-06, + "loss": 0.461, + "step": 8856 + }, + { + "epoch": 0.12014378730330982, + "grad_norm": 4.225944995880127, + "learning_rate": 8.889132520213787e-06, + "loss": 0.2599, + "step": 8857 + }, + { + "epoch": 0.1201573521432447, + "grad_norm": 6.043015480041504, + "learning_rate": 8.888995477593532e-06, + "loss": 0.369, + "step": 8858 + }, + { + "epoch": 0.1201709169831796, + "grad_norm": 6.2497406005859375, + "learning_rate": 8.888858434973278e-06, + "loss": 0.3979, + "step": 8859 + }, + { + "epoch": 0.12018448182311449, + "grad_norm": 4.952688217163086, + "learning_rate": 8.888721392353023e-06, + "loss": 0.2956, + "step": 8860 + }, + { + "epoch": 0.12019804666304938, + "grad_norm": 10.126988410949707, + "learning_rate": 8.888584349732768e-06, + "loss": 0.6778, + "step": 8861 + }, + { + "epoch": 0.12021161150298426, + "grad_norm": 6.707614421844482, + "learning_rate": 8.888447307112513e-06, + "loss": 0.3775, + "step": 8862 + }, + { + "epoch": 0.12022517634291915, + "grad_norm": 8.868232727050781, + "learning_rate": 8.888310264492258e-06, + "loss": 0.4835, + "step": 8863 + }, + { + "epoch": 0.12023874118285405, + "grad_norm": 4.85640811920166, + "learning_rate": 8.888173221872004e-06, + "loss": 0.3906, + "step": 8864 + }, + { + "epoch": 0.12025230602278893, + "grad_norm": 6.776279926300049, + "learning_rate": 8.888036179251747e-06, + "loss": 0.4601, + "step": 8865 + }, + { + "epoch": 0.12026587086272382, + "grad_norm": 5.490754127502441, + "learning_rate": 8.887899136631492e-06, + "loss": 0.4903, + "step": 8866 + }, + { + "epoch": 0.12027943570265871, + "grad_norm": 6.6817626953125, + "learning_rate": 8.887762094011239e-06, + "loss": 0.4198, + "step": 8867 + }, + { + "epoch": 0.12029300054259359, + "grad_norm": 6.825406551361084, + "learning_rate": 8.887625051390984e-06, + "loss": 0.2965, + "step": 8868 + }, + { + "epoch": 0.12030656538252849, + "grad_norm": 7.271286964416504, + "learning_rate": 8.887488008770728e-06, + "loss": 0.436, + "step": 8869 + }, + { + "epoch": 0.12032013022246338, + "grad_norm": 6.353732585906982, + "learning_rate": 8.887350966150473e-06, + "loss": 0.4983, + "step": 8870 + }, + { + "epoch": 0.12033369506239826, + "grad_norm": 7.468130588531494, + "learning_rate": 8.88721392353022e-06, + "loss": 0.467, + "step": 8871 + }, + { + "epoch": 0.12034725990233315, + "grad_norm": 6.932852268218994, + "learning_rate": 8.887076880909963e-06, + "loss": 0.3871, + "step": 8872 + }, + { + "epoch": 0.12036082474226804, + "grad_norm": 5.543426036834717, + "learning_rate": 8.886939838289708e-06, + "loss": 0.3719, + "step": 8873 + }, + { + "epoch": 0.12037438958220294, + "grad_norm": 5.6385016441345215, + "learning_rate": 8.886802795669454e-06, + "loss": 0.3684, + "step": 8874 + }, + { + "epoch": 0.12038795442213782, + "grad_norm": 6.8638834953308105, + "learning_rate": 8.886665753049199e-06, + "loss": 0.3243, + "step": 8875 + }, + { + "epoch": 0.12040151926207271, + "grad_norm": 5.775953769683838, + "learning_rate": 8.886528710428944e-06, + "loss": 0.3888, + "step": 8876 + }, + { + "epoch": 0.1204150841020076, + "grad_norm": 7.490694046020508, + "learning_rate": 8.886391667808689e-06, + "loss": 0.4846, + "step": 8877 + }, + { + "epoch": 0.12042864894194248, + "grad_norm": 8.884758949279785, + "learning_rate": 8.886254625188434e-06, + "loss": 0.5752, + "step": 8878 + }, + { + "epoch": 0.12044221378187737, + "grad_norm": 6.543981552124023, + "learning_rate": 8.88611758256818e-06, + "loss": 0.4064, + "step": 8879 + }, + { + "epoch": 0.12045577862181227, + "grad_norm": 6.208365440368652, + "learning_rate": 8.885980539947925e-06, + "loss": 0.4236, + "step": 8880 + }, + { + "epoch": 0.12046934346174715, + "grad_norm": 4.321103096008301, + "learning_rate": 8.88584349732767e-06, + "loss": 0.3409, + "step": 8881 + }, + { + "epoch": 0.12048290830168204, + "grad_norm": 5.429330348968506, + "learning_rate": 8.885706454707415e-06, + "loss": 0.3233, + "step": 8882 + }, + { + "epoch": 0.12049647314161693, + "grad_norm": 4.810860633850098, + "learning_rate": 8.88556941208716e-06, + "loss": 0.4434, + "step": 8883 + }, + { + "epoch": 0.12051003798155181, + "grad_norm": 6.53423547744751, + "learning_rate": 8.885432369466905e-06, + "loss": 0.3768, + "step": 8884 + }, + { + "epoch": 0.1205236028214867, + "grad_norm": 5.375133037567139, + "learning_rate": 8.88529532684665e-06, + "loss": 0.4434, + "step": 8885 + }, + { + "epoch": 0.1205371676614216, + "grad_norm": 4.776054382324219, + "learning_rate": 8.885158284226396e-06, + "loss": 0.2839, + "step": 8886 + }, + { + "epoch": 0.12055073250135648, + "grad_norm": 6.459007740020752, + "learning_rate": 8.88502124160614e-06, + "loss": 0.3369, + "step": 8887 + }, + { + "epoch": 0.12056429734129137, + "grad_norm": 5.251793384552002, + "learning_rate": 8.884884198985886e-06, + "loss": 0.3195, + "step": 8888 + }, + { + "epoch": 0.12057786218122626, + "grad_norm": 5.693985939025879, + "learning_rate": 8.884747156365631e-06, + "loss": 0.3219, + "step": 8889 + }, + { + "epoch": 0.12059142702116116, + "grad_norm": 6.1166276931762695, + "learning_rate": 8.884610113745375e-06, + "loss": 0.408, + "step": 8890 + }, + { + "epoch": 0.12060499186109604, + "grad_norm": 5.642457008361816, + "learning_rate": 8.88447307112512e-06, + "loss": 0.4554, + "step": 8891 + }, + { + "epoch": 0.12061855670103093, + "grad_norm": 5.580058574676514, + "learning_rate": 8.884336028504865e-06, + "loss": 0.3554, + "step": 8892 + }, + { + "epoch": 0.12063212154096582, + "grad_norm": 5.26015567779541, + "learning_rate": 8.88419898588461e-06, + "loss": 0.2979, + "step": 8893 + }, + { + "epoch": 0.1206456863809007, + "grad_norm": 6.276143550872803, + "learning_rate": 8.884061943264355e-06, + "loss": 0.2867, + "step": 8894 + }, + { + "epoch": 0.1206592512208356, + "grad_norm": 5.394872665405273, + "learning_rate": 8.8839249006441e-06, + "loss": 0.3516, + "step": 8895 + }, + { + "epoch": 0.12067281606077049, + "grad_norm": 5.29269552230835, + "learning_rate": 8.883787858023846e-06, + "loss": 0.379, + "step": 8896 + }, + { + "epoch": 0.12068638090070537, + "grad_norm": 5.004711151123047, + "learning_rate": 8.883650815403591e-06, + "loss": 0.2192, + "step": 8897 + }, + { + "epoch": 0.12069994574064026, + "grad_norm": 6.229086875915527, + "learning_rate": 8.883513772783336e-06, + "loss": 0.3727, + "step": 8898 + }, + { + "epoch": 0.12071351058057515, + "grad_norm": 7.039980888366699, + "learning_rate": 8.883376730163081e-06, + "loss": 0.4654, + "step": 8899 + }, + { + "epoch": 0.12072707542051003, + "grad_norm": 4.4577436447143555, + "learning_rate": 8.883239687542827e-06, + "loss": 0.2593, + "step": 8900 + }, + { + "epoch": 0.12074064026044493, + "grad_norm": 6.404839992523193, + "learning_rate": 8.883102644922572e-06, + "loss": 0.4518, + "step": 8901 + }, + { + "epoch": 0.12075420510037982, + "grad_norm": 5.0589118003845215, + "learning_rate": 8.882965602302317e-06, + "loss": 0.3132, + "step": 8902 + }, + { + "epoch": 0.1207677699403147, + "grad_norm": 6.009579181671143, + "learning_rate": 8.882828559682062e-06, + "loss": 0.3899, + "step": 8903 + }, + { + "epoch": 0.12078133478024959, + "grad_norm": 5.366706371307373, + "learning_rate": 8.882691517061807e-06, + "loss": 0.2523, + "step": 8904 + }, + { + "epoch": 0.12079489962018448, + "grad_norm": 4.657440662384033, + "learning_rate": 8.88255447444155e-06, + "loss": 0.1995, + "step": 8905 + }, + { + "epoch": 0.12080846446011938, + "grad_norm": 5.873816013336182, + "learning_rate": 8.882417431821298e-06, + "loss": 0.3323, + "step": 8906 + }, + { + "epoch": 0.12082202930005426, + "grad_norm": 6.368682861328125, + "learning_rate": 8.882280389201043e-06, + "loss": 0.3369, + "step": 8907 + }, + { + "epoch": 0.12083559413998915, + "grad_norm": 7.914632797241211, + "learning_rate": 8.882143346580786e-06, + "loss": 0.4662, + "step": 8908 + }, + { + "epoch": 0.12084915897992404, + "grad_norm": 6.338555335998535, + "learning_rate": 8.882006303960531e-06, + "loss": 0.418, + "step": 8909 + }, + { + "epoch": 0.12086272381985892, + "grad_norm": 7.314977169036865, + "learning_rate": 8.881869261340278e-06, + "loss": 0.3806, + "step": 8910 + }, + { + "epoch": 0.12087628865979382, + "grad_norm": 6.259881019592285, + "learning_rate": 8.881732218720024e-06, + "loss": 0.5224, + "step": 8911 + }, + { + "epoch": 0.12088985349972871, + "grad_norm": 5.5211100578308105, + "learning_rate": 8.881595176099767e-06, + "loss": 0.4241, + "step": 8912 + }, + { + "epoch": 0.12090341833966359, + "grad_norm": 5.377979278564453, + "learning_rate": 8.881458133479512e-06, + "loss": 0.2689, + "step": 8913 + }, + { + "epoch": 0.12091698317959848, + "grad_norm": 9.622282028198242, + "learning_rate": 8.881321090859259e-06, + "loss": 0.7268, + "step": 8914 + }, + { + "epoch": 0.12093054801953337, + "grad_norm": 7.1740851402282715, + "learning_rate": 8.881184048239003e-06, + "loss": 0.4064, + "step": 8915 + }, + { + "epoch": 0.12094411285946825, + "grad_norm": 5.090599536895752, + "learning_rate": 8.881047005618748e-06, + "loss": 0.2776, + "step": 8916 + }, + { + "epoch": 0.12095767769940315, + "grad_norm": 7.61503791809082, + "learning_rate": 8.880909962998493e-06, + "loss": 0.4634, + "step": 8917 + }, + { + "epoch": 0.12097124253933804, + "grad_norm": 6.363489627838135, + "learning_rate": 8.880772920378238e-06, + "loss": 0.3618, + "step": 8918 + }, + { + "epoch": 0.12098480737927292, + "grad_norm": 7.990340709686279, + "learning_rate": 8.880635877757983e-06, + "loss": 0.4844, + "step": 8919 + }, + { + "epoch": 0.12099837221920781, + "grad_norm": 6.761416912078857, + "learning_rate": 8.880498835137728e-06, + "loss": 0.2939, + "step": 8920 + }, + { + "epoch": 0.1210119370591427, + "grad_norm": 7.226147174835205, + "learning_rate": 8.880361792517474e-06, + "loss": 0.4321, + "step": 8921 + }, + { + "epoch": 0.1210255018990776, + "grad_norm": 6.197293758392334, + "learning_rate": 8.880224749897219e-06, + "loss": 0.3854, + "step": 8922 + }, + { + "epoch": 0.12103906673901248, + "grad_norm": 7.244717121124268, + "learning_rate": 8.880087707276964e-06, + "loss": 0.5415, + "step": 8923 + }, + { + "epoch": 0.12105263157894737, + "grad_norm": 7.191476345062256, + "learning_rate": 8.87995066465671e-06, + "loss": 0.3971, + "step": 8924 + }, + { + "epoch": 0.12106619641888226, + "grad_norm": 6.18109130859375, + "learning_rate": 8.879813622036454e-06, + "loss": 0.2555, + "step": 8925 + }, + { + "epoch": 0.12107976125881714, + "grad_norm": 7.593710899353027, + "learning_rate": 8.8796765794162e-06, + "loss": 0.3384, + "step": 8926 + }, + { + "epoch": 0.12109332609875204, + "grad_norm": 7.910206317901611, + "learning_rate": 8.879539536795945e-06, + "loss": 0.5664, + "step": 8927 + }, + { + "epoch": 0.12110689093868693, + "grad_norm": 5.355668544769287, + "learning_rate": 8.87940249417569e-06, + "loss": 0.3171, + "step": 8928 + }, + { + "epoch": 0.12112045577862181, + "grad_norm": 11.984055519104004, + "learning_rate": 8.879265451555435e-06, + "loss": 0.5582, + "step": 8929 + }, + { + "epoch": 0.1211340206185567, + "grad_norm": 7.809619903564453, + "learning_rate": 8.879128408935179e-06, + "loss": 0.4411, + "step": 8930 + }, + { + "epoch": 0.1211475854584916, + "grad_norm": 8.758798599243164, + "learning_rate": 8.878991366314925e-06, + "loss": 0.5519, + "step": 8931 + }, + { + "epoch": 0.12116115029842647, + "grad_norm": 5.942320823669434, + "learning_rate": 8.87885432369467e-06, + "loss": 0.3598, + "step": 8932 + }, + { + "epoch": 0.12117471513836137, + "grad_norm": 7.488801956176758, + "learning_rate": 8.878717281074414e-06, + "loss": 0.4909, + "step": 8933 + }, + { + "epoch": 0.12118827997829626, + "grad_norm": 6.506533145904541, + "learning_rate": 8.87858023845416e-06, + "loss": 0.4194, + "step": 8934 + }, + { + "epoch": 0.12120184481823114, + "grad_norm": 8.240914344787598, + "learning_rate": 8.878443195833904e-06, + "loss": 0.4649, + "step": 8935 + }, + { + "epoch": 0.12121540965816603, + "grad_norm": 5.6200642585754395, + "learning_rate": 8.878306153213651e-06, + "loss": 0.3561, + "step": 8936 + }, + { + "epoch": 0.12122897449810092, + "grad_norm": 8.768728256225586, + "learning_rate": 8.878169110593395e-06, + "loss": 0.6145, + "step": 8937 + }, + { + "epoch": 0.12124253933803582, + "grad_norm": 6.123758316040039, + "learning_rate": 8.87803206797314e-06, + "loss": 0.3805, + "step": 8938 + }, + { + "epoch": 0.1212561041779707, + "grad_norm": 7.587152481079102, + "learning_rate": 8.877895025352885e-06, + "loss": 0.5076, + "step": 8939 + }, + { + "epoch": 0.12126966901790559, + "grad_norm": 6.141079425811768, + "learning_rate": 8.87775798273263e-06, + "loss": 0.3797, + "step": 8940 + }, + { + "epoch": 0.12128323385784048, + "grad_norm": 6.49141263961792, + "learning_rate": 8.877620940112376e-06, + "loss": 0.5207, + "step": 8941 + }, + { + "epoch": 0.12129679869777536, + "grad_norm": 7.1045451164245605, + "learning_rate": 8.87748389749212e-06, + "loss": 0.4443, + "step": 8942 + }, + { + "epoch": 0.12131036353771026, + "grad_norm": 9.281726837158203, + "learning_rate": 8.877346854871866e-06, + "loss": 0.4902, + "step": 8943 + }, + { + "epoch": 0.12132392837764515, + "grad_norm": 6.873898506164551, + "learning_rate": 8.877209812251611e-06, + "loss": 0.4277, + "step": 8944 + }, + { + "epoch": 0.12133749321758003, + "grad_norm": 8.532552719116211, + "learning_rate": 8.877072769631356e-06, + "loss": 0.5045, + "step": 8945 + }, + { + "epoch": 0.12135105805751492, + "grad_norm": 7.631991386413574, + "learning_rate": 8.876935727011101e-06, + "loss": 0.5288, + "step": 8946 + }, + { + "epoch": 0.12136462289744981, + "grad_norm": 8.201752662658691, + "learning_rate": 8.876798684390847e-06, + "loss": 0.4411, + "step": 8947 + }, + { + "epoch": 0.1213781877373847, + "grad_norm": 8.924395561218262, + "learning_rate": 8.87666164177059e-06, + "loss": 0.6139, + "step": 8948 + }, + { + "epoch": 0.12139175257731959, + "grad_norm": 7.075215816497803, + "learning_rate": 8.876524599150337e-06, + "loss": 0.3857, + "step": 8949 + }, + { + "epoch": 0.12140531741725448, + "grad_norm": 8.316845893859863, + "learning_rate": 8.876387556530082e-06, + "loss": 0.5236, + "step": 8950 + }, + { + "epoch": 0.12141888225718936, + "grad_norm": 6.827566146850586, + "learning_rate": 8.876250513909827e-06, + "loss": 0.4125, + "step": 8951 + }, + { + "epoch": 0.12143244709712425, + "grad_norm": 8.406014442443848, + "learning_rate": 8.87611347128957e-06, + "loss": 0.5018, + "step": 8952 + }, + { + "epoch": 0.12144601193705915, + "grad_norm": 7.559534072875977, + "learning_rate": 8.875976428669318e-06, + "loss": 0.4495, + "step": 8953 + }, + { + "epoch": 0.12145957677699404, + "grad_norm": 7.216536521911621, + "learning_rate": 8.875839386049063e-06, + "loss": 0.4059, + "step": 8954 + }, + { + "epoch": 0.12147314161692892, + "grad_norm": 6.0153021812438965, + "learning_rate": 8.875702343428806e-06, + "loss": 0.3444, + "step": 8955 + }, + { + "epoch": 0.12148670645686381, + "grad_norm": 7.098453998565674, + "learning_rate": 8.875565300808552e-06, + "loss": 0.3645, + "step": 8956 + }, + { + "epoch": 0.1215002712967987, + "grad_norm": 10.133014678955078, + "learning_rate": 8.875428258188298e-06, + "loss": 0.6622, + "step": 8957 + }, + { + "epoch": 0.12151383613673358, + "grad_norm": 5.959512710571289, + "learning_rate": 8.875291215568042e-06, + "loss": 0.3265, + "step": 8958 + }, + { + "epoch": 0.12152740097666848, + "grad_norm": 7.134252548217773, + "learning_rate": 8.875154172947787e-06, + "loss": 0.4317, + "step": 8959 + }, + { + "epoch": 0.12154096581660337, + "grad_norm": 5.513468265533447, + "learning_rate": 8.875017130327532e-06, + "loss": 0.3288, + "step": 8960 + }, + { + "epoch": 0.12155453065653825, + "grad_norm": 6.831462860107422, + "learning_rate": 8.874880087707277e-06, + "loss": 0.2927, + "step": 8961 + }, + { + "epoch": 0.12156809549647314, + "grad_norm": 7.170071601867676, + "learning_rate": 8.874743045087023e-06, + "loss": 0.3521, + "step": 8962 + }, + { + "epoch": 0.12158166033640803, + "grad_norm": 5.614818096160889, + "learning_rate": 8.874606002466768e-06, + "loss": 0.3094, + "step": 8963 + }, + { + "epoch": 0.12159522517634291, + "grad_norm": 8.15521240234375, + "learning_rate": 8.874468959846513e-06, + "loss": 0.5252, + "step": 8964 + }, + { + "epoch": 0.1216087900162778, + "grad_norm": 7.79371452331543, + "learning_rate": 8.874331917226258e-06, + "loss": 0.4044, + "step": 8965 + }, + { + "epoch": 0.1216223548562127, + "grad_norm": 7.979064464569092, + "learning_rate": 8.874194874606003e-06, + "loss": 0.4673, + "step": 8966 + }, + { + "epoch": 0.12163591969614758, + "grad_norm": 5.6802873611450195, + "learning_rate": 8.874057831985748e-06, + "loss": 0.2628, + "step": 8967 + }, + { + "epoch": 0.12164948453608247, + "grad_norm": 4.979583740234375, + "learning_rate": 8.873920789365494e-06, + "loss": 0.2664, + "step": 8968 + }, + { + "epoch": 0.12166304937601737, + "grad_norm": 7.900975704193115, + "learning_rate": 8.873783746745239e-06, + "loss": 0.3687, + "step": 8969 + }, + { + "epoch": 0.12167661421595226, + "grad_norm": 5.710114002227783, + "learning_rate": 8.873646704124984e-06, + "loss": 0.2958, + "step": 8970 + }, + { + "epoch": 0.12169017905588714, + "grad_norm": 8.994698524475098, + "learning_rate": 8.87350966150473e-06, + "loss": 0.417, + "step": 8971 + }, + { + "epoch": 0.12170374389582203, + "grad_norm": 5.781028747558594, + "learning_rate": 8.873372618884474e-06, + "loss": 0.2652, + "step": 8972 + }, + { + "epoch": 0.12171730873575692, + "grad_norm": 7.393462181091309, + "learning_rate": 8.873235576264218e-06, + "loss": 0.4026, + "step": 8973 + }, + { + "epoch": 0.1217308735756918, + "grad_norm": 4.051027774810791, + "learning_rate": 8.873098533643963e-06, + "loss": 0.2699, + "step": 8974 + }, + { + "epoch": 0.1217444384156267, + "grad_norm": 6.279590129852295, + "learning_rate": 8.87296149102371e-06, + "loss": 0.4096, + "step": 8975 + }, + { + "epoch": 0.12175800325556159, + "grad_norm": 6.457613945007324, + "learning_rate": 8.872824448403455e-06, + "loss": 0.362, + "step": 8976 + }, + { + "epoch": 0.12177156809549647, + "grad_norm": 8.22589111328125, + "learning_rate": 8.872687405783199e-06, + "loss": 0.3367, + "step": 8977 + }, + { + "epoch": 0.12178513293543136, + "grad_norm": 6.8357625007629395, + "learning_rate": 8.872550363162944e-06, + "loss": 0.3844, + "step": 8978 + }, + { + "epoch": 0.12179869777536625, + "grad_norm": 6.326453685760498, + "learning_rate": 8.87241332054269e-06, + "loss": 0.3043, + "step": 8979 + }, + { + "epoch": 0.12181226261530113, + "grad_norm": 4.288938999176025, + "learning_rate": 8.872276277922434e-06, + "loss": 0.2436, + "step": 8980 + }, + { + "epoch": 0.12182582745523603, + "grad_norm": 8.382831573486328, + "learning_rate": 8.87213923530218e-06, + "loss": 0.5516, + "step": 8981 + }, + { + "epoch": 0.12183939229517092, + "grad_norm": 6.088759899139404, + "learning_rate": 8.872002192681924e-06, + "loss": 0.2741, + "step": 8982 + }, + { + "epoch": 0.1218529571351058, + "grad_norm": 9.156352996826172, + "learning_rate": 8.87186515006167e-06, + "loss": 0.5369, + "step": 8983 + }, + { + "epoch": 0.12186652197504069, + "grad_norm": 7.713760852813721, + "learning_rate": 8.871728107441415e-06, + "loss": 0.4179, + "step": 8984 + }, + { + "epoch": 0.12188008681497559, + "grad_norm": 7.203419208526611, + "learning_rate": 8.87159106482116e-06, + "loss": 0.387, + "step": 8985 + }, + { + "epoch": 0.12189365165491048, + "grad_norm": 7.724815368652344, + "learning_rate": 8.871454022200905e-06, + "loss": 0.3765, + "step": 8986 + }, + { + "epoch": 0.12190721649484536, + "grad_norm": 6.901484489440918, + "learning_rate": 8.87131697958065e-06, + "loss": 0.531, + "step": 8987 + }, + { + "epoch": 0.12192078133478025, + "grad_norm": 6.257023811340332, + "learning_rate": 8.871179936960396e-06, + "loss": 0.2903, + "step": 8988 + }, + { + "epoch": 0.12193434617471514, + "grad_norm": 5.7303056716918945, + "learning_rate": 8.87104289434014e-06, + "loss": 0.2536, + "step": 8989 + }, + { + "epoch": 0.12194791101465002, + "grad_norm": 8.241092681884766, + "learning_rate": 8.870905851719886e-06, + "loss": 0.3824, + "step": 8990 + }, + { + "epoch": 0.12196147585458492, + "grad_norm": 5.408999443054199, + "learning_rate": 8.870768809099631e-06, + "loss": 0.263, + "step": 8991 + }, + { + "epoch": 0.12197504069451981, + "grad_norm": 7.0745720863342285, + "learning_rate": 8.870631766479376e-06, + "loss": 0.545, + "step": 8992 + }, + { + "epoch": 0.12198860553445469, + "grad_norm": 7.896223068237305, + "learning_rate": 8.870494723859121e-06, + "loss": 0.4847, + "step": 8993 + }, + { + "epoch": 0.12200217037438958, + "grad_norm": 6.735701560974121, + "learning_rate": 8.870357681238867e-06, + "loss": 0.3635, + "step": 8994 + }, + { + "epoch": 0.12201573521432447, + "grad_norm": 7.802068710327148, + "learning_rate": 8.87022063861861e-06, + "loss": 0.4343, + "step": 8995 + }, + { + "epoch": 0.12202930005425935, + "grad_norm": 7.566933631896973, + "learning_rate": 8.870083595998357e-06, + "loss": 0.3765, + "step": 8996 + }, + { + "epoch": 0.12204286489419425, + "grad_norm": 6.2755303382873535, + "learning_rate": 8.869946553378102e-06, + "loss": 0.4261, + "step": 8997 + }, + { + "epoch": 0.12205642973412914, + "grad_norm": 7.495999813079834, + "learning_rate": 8.869809510757846e-06, + "loss": 0.5575, + "step": 8998 + }, + { + "epoch": 0.12206999457406402, + "grad_norm": 6.404247760772705, + "learning_rate": 8.86967246813759e-06, + "loss": 0.2867, + "step": 8999 + }, + { + "epoch": 0.12208355941399891, + "grad_norm": 6.473306179046631, + "learning_rate": 8.869535425517338e-06, + "loss": 0.3773, + "step": 9000 + }, + { + "epoch": 0.1220971242539338, + "grad_norm": 5.86737060546875, + "learning_rate": 8.869398382897081e-06, + "loss": 0.4001, + "step": 9001 + }, + { + "epoch": 0.1221106890938687, + "grad_norm": 8.885390281677246, + "learning_rate": 8.869261340276826e-06, + "loss": 0.4023, + "step": 9002 + }, + { + "epoch": 0.12212425393380358, + "grad_norm": 7.796073913574219, + "learning_rate": 8.869124297656572e-06, + "loss": 0.4131, + "step": 9003 + }, + { + "epoch": 0.12213781877373847, + "grad_norm": 9.267407417297363, + "learning_rate": 8.868987255036317e-06, + "loss": 0.5593, + "step": 9004 + }, + { + "epoch": 0.12215138361367336, + "grad_norm": 9.252786636352539, + "learning_rate": 8.868850212416062e-06, + "loss": 0.4424, + "step": 9005 + }, + { + "epoch": 0.12216494845360824, + "grad_norm": 7.835489749908447, + "learning_rate": 8.868713169795807e-06, + "loss": 0.3846, + "step": 9006 + }, + { + "epoch": 0.12217851329354314, + "grad_norm": 8.8193998336792, + "learning_rate": 8.868576127175552e-06, + "loss": 0.5156, + "step": 9007 + }, + { + "epoch": 0.12219207813347803, + "grad_norm": 4.891170978546143, + "learning_rate": 8.868439084555297e-06, + "loss": 0.298, + "step": 9008 + }, + { + "epoch": 0.12220564297341291, + "grad_norm": 8.024394989013672, + "learning_rate": 8.868302041935043e-06, + "loss": 0.3598, + "step": 9009 + }, + { + "epoch": 0.1222192078133478, + "grad_norm": 5.259089946746826, + "learning_rate": 8.868164999314788e-06, + "loss": 0.4014, + "step": 9010 + }, + { + "epoch": 0.1222327726532827, + "grad_norm": 7.207768440246582, + "learning_rate": 8.868027956694533e-06, + "loss": 0.3387, + "step": 9011 + }, + { + "epoch": 0.12224633749321757, + "grad_norm": 5.384468078613281, + "learning_rate": 8.867890914074278e-06, + "loss": 0.2665, + "step": 9012 + }, + { + "epoch": 0.12225990233315247, + "grad_norm": 4.976338863372803, + "learning_rate": 8.867753871454023e-06, + "loss": 0.299, + "step": 9013 + }, + { + "epoch": 0.12227346717308736, + "grad_norm": 5.482418537139893, + "learning_rate": 8.867616828833768e-06, + "loss": 0.3313, + "step": 9014 + }, + { + "epoch": 0.12228703201302224, + "grad_norm": 7.398382663726807, + "learning_rate": 8.867479786213514e-06, + "loss": 0.3819, + "step": 9015 + }, + { + "epoch": 0.12230059685295713, + "grad_norm": 6.758072853088379, + "learning_rate": 8.867342743593257e-06, + "loss": 0.3362, + "step": 9016 + }, + { + "epoch": 0.12231416169289203, + "grad_norm": 8.144402503967285, + "learning_rate": 8.867205700973002e-06, + "loss": 0.3044, + "step": 9017 + }, + { + "epoch": 0.12232772653282692, + "grad_norm": 6.351895332336426, + "learning_rate": 8.86706865835275e-06, + "loss": 0.413, + "step": 9018 + }, + { + "epoch": 0.1223412913727618, + "grad_norm": 5.174502849578857, + "learning_rate": 8.866931615732494e-06, + "loss": 0.2405, + "step": 9019 + }, + { + "epoch": 0.12235485621269669, + "grad_norm": 6.177247047424316, + "learning_rate": 8.866794573112238e-06, + "loss": 0.3746, + "step": 9020 + }, + { + "epoch": 0.12236842105263158, + "grad_norm": 8.805863380432129, + "learning_rate": 8.866657530491983e-06, + "loss": 0.5147, + "step": 9021 + }, + { + "epoch": 0.12238198589256646, + "grad_norm": 6.3325653076171875, + "learning_rate": 8.86652048787173e-06, + "loss": 0.3835, + "step": 9022 + }, + { + "epoch": 0.12239555073250136, + "grad_norm": 5.191446304321289, + "learning_rate": 8.866383445251473e-06, + "loss": 0.332, + "step": 9023 + }, + { + "epoch": 0.12240911557243625, + "grad_norm": 8.86601734161377, + "learning_rate": 8.866246402631219e-06, + "loss": 0.3951, + "step": 9024 + }, + { + "epoch": 0.12242268041237113, + "grad_norm": 6.253443241119385, + "learning_rate": 8.866109360010964e-06, + "loss": 0.4532, + "step": 9025 + }, + { + "epoch": 0.12243624525230602, + "grad_norm": 6.619153022766113, + "learning_rate": 8.865972317390709e-06, + "loss": 0.4895, + "step": 9026 + }, + { + "epoch": 0.12244981009224092, + "grad_norm": 6.78643798828125, + "learning_rate": 8.865835274770454e-06, + "loss": 0.4271, + "step": 9027 + }, + { + "epoch": 0.1224633749321758, + "grad_norm": 5.241053581237793, + "learning_rate": 8.8656982321502e-06, + "loss": 0.2711, + "step": 9028 + }, + { + "epoch": 0.12247693977211069, + "grad_norm": 5.32192325592041, + "learning_rate": 8.865561189529944e-06, + "loss": 0.3306, + "step": 9029 + }, + { + "epoch": 0.12249050461204558, + "grad_norm": 6.992579460144043, + "learning_rate": 8.86542414690969e-06, + "loss": 0.4116, + "step": 9030 + }, + { + "epoch": 0.12250406945198046, + "grad_norm": 4.885073661804199, + "learning_rate": 8.865287104289435e-06, + "loss": 0.3057, + "step": 9031 + }, + { + "epoch": 0.12251763429191535, + "grad_norm": 6.843509674072266, + "learning_rate": 8.86515006166918e-06, + "loss": 0.4757, + "step": 9032 + }, + { + "epoch": 0.12253119913185025, + "grad_norm": 4.94041109085083, + "learning_rate": 8.865013019048925e-06, + "loss": 0.334, + "step": 9033 + }, + { + "epoch": 0.12254476397178514, + "grad_norm": 5.497880458831787, + "learning_rate": 8.86487597642867e-06, + "loss": 0.346, + "step": 9034 + }, + { + "epoch": 0.12255832881172002, + "grad_norm": 6.6136298179626465, + "learning_rate": 8.864738933808416e-06, + "loss": 0.3568, + "step": 9035 + }, + { + "epoch": 0.12257189365165491, + "grad_norm": 8.015053749084473, + "learning_rate": 8.86460189118816e-06, + "loss": 0.6683, + "step": 9036 + }, + { + "epoch": 0.1225854584915898, + "grad_norm": 7.854660987854004, + "learning_rate": 8.864464848567906e-06, + "loss": 0.4379, + "step": 9037 + }, + { + "epoch": 0.12259902333152468, + "grad_norm": 7.311586856842041, + "learning_rate": 8.86432780594765e-06, + "loss": 0.556, + "step": 9038 + }, + { + "epoch": 0.12261258817145958, + "grad_norm": 7.905665397644043, + "learning_rate": 8.864190763327396e-06, + "loss": 0.4315, + "step": 9039 + }, + { + "epoch": 0.12262615301139447, + "grad_norm": 9.305538177490234, + "learning_rate": 8.864053720707141e-06, + "loss": 0.4463, + "step": 9040 + }, + { + "epoch": 0.12263971785132935, + "grad_norm": 8.330611228942871, + "learning_rate": 8.863916678086885e-06, + "loss": 0.474, + "step": 9041 + }, + { + "epoch": 0.12265328269126424, + "grad_norm": 6.2622971534729, + "learning_rate": 8.86377963546663e-06, + "loss": 0.514, + "step": 9042 + }, + { + "epoch": 0.12266684753119914, + "grad_norm": 6.760908126831055, + "learning_rate": 8.863642592846375e-06, + "loss": 0.3345, + "step": 9043 + }, + { + "epoch": 0.12268041237113401, + "grad_norm": 5.4044060707092285, + "learning_rate": 8.863505550226122e-06, + "loss": 0.4379, + "step": 9044 + }, + { + "epoch": 0.12269397721106891, + "grad_norm": 6.558187961578369, + "learning_rate": 8.863368507605866e-06, + "loss": 0.325, + "step": 9045 + }, + { + "epoch": 0.1227075420510038, + "grad_norm": 4.971224784851074, + "learning_rate": 8.86323146498561e-06, + "loss": 0.3454, + "step": 9046 + }, + { + "epoch": 0.12272110689093868, + "grad_norm": 7.026587009429932, + "learning_rate": 8.863094422365356e-06, + "loss": 0.3882, + "step": 9047 + }, + { + "epoch": 0.12273467173087357, + "grad_norm": 6.289752960205078, + "learning_rate": 8.862957379745101e-06, + "loss": 0.3456, + "step": 9048 + }, + { + "epoch": 0.12274823657080847, + "grad_norm": 6.927143573760986, + "learning_rate": 8.862820337124846e-06, + "loss": 0.4005, + "step": 9049 + }, + { + "epoch": 0.12276180141074336, + "grad_norm": 5.899670124053955, + "learning_rate": 8.862683294504592e-06, + "loss": 0.35, + "step": 9050 + }, + { + "epoch": 0.12277536625067824, + "grad_norm": 8.341054916381836, + "learning_rate": 8.862546251884337e-06, + "loss": 0.4421, + "step": 9051 + }, + { + "epoch": 0.12278893109061313, + "grad_norm": 9.53610610961914, + "learning_rate": 8.862409209264082e-06, + "loss": 0.5758, + "step": 9052 + }, + { + "epoch": 0.12280249593054803, + "grad_norm": 6.224305629730225, + "learning_rate": 8.862272166643827e-06, + "loss": 0.3574, + "step": 9053 + }, + { + "epoch": 0.1228160607704829, + "grad_norm": 9.136075019836426, + "learning_rate": 8.862135124023572e-06, + "loss": 0.4521, + "step": 9054 + }, + { + "epoch": 0.1228296256104178, + "grad_norm": 7.8247761726379395, + "learning_rate": 8.861998081403317e-06, + "loss": 0.4358, + "step": 9055 + }, + { + "epoch": 0.12284319045035269, + "grad_norm": 6.318701267242432, + "learning_rate": 8.861861038783063e-06, + "loss": 0.2671, + "step": 9056 + }, + { + "epoch": 0.12285675529028757, + "grad_norm": 7.339986801147461, + "learning_rate": 8.861723996162808e-06, + "loss": 0.371, + "step": 9057 + }, + { + "epoch": 0.12287032013022246, + "grad_norm": 6.6726460456848145, + "learning_rate": 8.861586953542553e-06, + "loss": 0.5293, + "step": 9058 + }, + { + "epoch": 0.12288388497015736, + "grad_norm": 8.420669555664062, + "learning_rate": 8.861449910922298e-06, + "loss": 0.4039, + "step": 9059 + }, + { + "epoch": 0.12289744981009224, + "grad_norm": 8.37661075592041, + "learning_rate": 8.861312868302042e-06, + "loss": 0.4564, + "step": 9060 + }, + { + "epoch": 0.12291101465002713, + "grad_norm": 8.317960739135742, + "learning_rate": 8.861175825681789e-06, + "loss": 0.5857, + "step": 9061 + }, + { + "epoch": 0.12292457948996202, + "grad_norm": 5.3735551834106445, + "learning_rate": 8.861038783061534e-06, + "loss": 0.3722, + "step": 9062 + }, + { + "epoch": 0.1229381443298969, + "grad_norm": 8.382852554321289, + "learning_rate": 8.860901740441277e-06, + "loss": 0.4341, + "step": 9063 + }, + { + "epoch": 0.1229517091698318, + "grad_norm": 6.025783538818359, + "learning_rate": 8.860764697821022e-06, + "loss": 0.4085, + "step": 9064 + }, + { + "epoch": 0.12296527400976669, + "grad_norm": 6.140847206115723, + "learning_rate": 8.86062765520077e-06, + "loss": 0.3566, + "step": 9065 + }, + { + "epoch": 0.12297883884970158, + "grad_norm": 8.108762741088867, + "learning_rate": 8.860490612580513e-06, + "loss": 0.3989, + "step": 9066 + }, + { + "epoch": 0.12299240368963646, + "grad_norm": 7.8863205909729, + "learning_rate": 8.860353569960258e-06, + "loss": 0.5424, + "step": 9067 + }, + { + "epoch": 0.12300596852957135, + "grad_norm": 6.769186973571777, + "learning_rate": 8.860216527340003e-06, + "loss": 0.4024, + "step": 9068 + }, + { + "epoch": 0.12301953336950625, + "grad_norm": 7.340047359466553, + "learning_rate": 8.86007948471975e-06, + "loss": 0.4385, + "step": 9069 + }, + { + "epoch": 0.12303309820944112, + "grad_norm": 7.910252094268799, + "learning_rate": 8.859942442099493e-06, + "loss": 0.4162, + "step": 9070 + }, + { + "epoch": 0.12304666304937602, + "grad_norm": 8.465500831604004, + "learning_rate": 8.859805399479239e-06, + "loss": 0.4006, + "step": 9071 + }, + { + "epoch": 0.12306022788931091, + "grad_norm": 6.125222206115723, + "learning_rate": 8.859668356858984e-06, + "loss": 0.4218, + "step": 9072 + }, + { + "epoch": 0.12307379272924579, + "grad_norm": 7.654595375061035, + "learning_rate": 8.859531314238729e-06, + "loss": 0.54, + "step": 9073 + }, + { + "epoch": 0.12308735756918068, + "grad_norm": 6.749236106872559, + "learning_rate": 8.859394271618474e-06, + "loss": 0.3985, + "step": 9074 + }, + { + "epoch": 0.12310092240911558, + "grad_norm": 7.5187835693359375, + "learning_rate": 8.85925722899822e-06, + "loss": 0.3744, + "step": 9075 + }, + { + "epoch": 0.12311448724905046, + "grad_norm": 7.036488056182861, + "learning_rate": 8.859120186377964e-06, + "loss": 0.4609, + "step": 9076 + }, + { + "epoch": 0.12312805208898535, + "grad_norm": 6.052907466888428, + "learning_rate": 8.85898314375771e-06, + "loss": 0.368, + "step": 9077 + }, + { + "epoch": 0.12314161692892024, + "grad_norm": 7.258096694946289, + "learning_rate": 8.858846101137455e-06, + "loss": 0.4296, + "step": 9078 + }, + { + "epoch": 0.12315518176885512, + "grad_norm": 6.9567975997924805, + "learning_rate": 8.8587090585172e-06, + "loss": 0.4346, + "step": 9079 + }, + { + "epoch": 0.12316874660879001, + "grad_norm": 7.460152626037598, + "learning_rate": 8.858572015896945e-06, + "loss": 0.4544, + "step": 9080 + }, + { + "epoch": 0.12318231144872491, + "grad_norm": 6.6859283447265625, + "learning_rate": 8.858434973276689e-06, + "loss": 0.5148, + "step": 9081 + }, + { + "epoch": 0.1231958762886598, + "grad_norm": 7.9823455810546875, + "learning_rate": 8.858297930656436e-06, + "loss": 0.3386, + "step": 9082 + }, + { + "epoch": 0.12320944112859468, + "grad_norm": 7.9896559715271, + "learning_rate": 8.85816088803618e-06, + "loss": 0.4575, + "step": 9083 + }, + { + "epoch": 0.12322300596852957, + "grad_norm": 6.621874809265137, + "learning_rate": 8.858023845415924e-06, + "loss": 0.4037, + "step": 9084 + }, + { + "epoch": 0.12323657080846447, + "grad_norm": 8.450287818908691, + "learning_rate": 8.85788680279567e-06, + "loss": 0.5786, + "step": 9085 + }, + { + "epoch": 0.12325013564839934, + "grad_norm": 6.578836917877197, + "learning_rate": 8.857749760175415e-06, + "loss": 0.318, + "step": 9086 + }, + { + "epoch": 0.12326370048833424, + "grad_norm": 6.603781700134277, + "learning_rate": 8.857612717555161e-06, + "loss": 0.371, + "step": 9087 + }, + { + "epoch": 0.12327726532826913, + "grad_norm": 8.82005500793457, + "learning_rate": 8.857475674934905e-06, + "loss": 0.3703, + "step": 9088 + }, + { + "epoch": 0.12329083016820401, + "grad_norm": 7.259471893310547, + "learning_rate": 8.85733863231465e-06, + "loss": 0.361, + "step": 9089 + }, + { + "epoch": 0.1233043950081389, + "grad_norm": 7.662289142608643, + "learning_rate": 8.857201589694395e-06, + "loss": 0.4494, + "step": 9090 + }, + { + "epoch": 0.1233179598480738, + "grad_norm": 6.764641284942627, + "learning_rate": 8.85706454707414e-06, + "loss": 0.3866, + "step": 9091 + }, + { + "epoch": 0.12333152468800868, + "grad_norm": 7.163659572601318, + "learning_rate": 8.856927504453886e-06, + "loss": 0.5636, + "step": 9092 + }, + { + "epoch": 0.12334508952794357, + "grad_norm": 9.096393585205078, + "learning_rate": 8.85679046183363e-06, + "loss": 0.6099, + "step": 9093 + }, + { + "epoch": 0.12335865436787846, + "grad_norm": 7.353128910064697, + "learning_rate": 8.856653419213376e-06, + "loss": 0.5538, + "step": 9094 + }, + { + "epoch": 0.12337221920781334, + "grad_norm": 7.448648929595947, + "learning_rate": 8.856516376593121e-06, + "loss": 0.4864, + "step": 9095 + }, + { + "epoch": 0.12338578404774823, + "grad_norm": 5.596398830413818, + "learning_rate": 8.856379333972866e-06, + "loss": 0.2952, + "step": 9096 + }, + { + "epoch": 0.12339934888768313, + "grad_norm": 6.426354885101318, + "learning_rate": 8.856242291352612e-06, + "loss": 0.3988, + "step": 9097 + }, + { + "epoch": 0.12341291372761802, + "grad_norm": 6.099701404571533, + "learning_rate": 8.856105248732357e-06, + "loss": 0.3177, + "step": 9098 + }, + { + "epoch": 0.1234264785675529, + "grad_norm": 7.899878978729248, + "learning_rate": 8.8559682061121e-06, + "loss": 0.6812, + "step": 9099 + }, + { + "epoch": 0.12344004340748779, + "grad_norm": 5.207597255706787, + "learning_rate": 8.855831163491847e-06, + "loss": 0.3516, + "step": 9100 + }, + { + "epoch": 0.12345360824742269, + "grad_norm": 7.2196550369262695, + "learning_rate": 8.855694120871592e-06, + "loss": 0.433, + "step": 9101 + }, + { + "epoch": 0.12346717308735757, + "grad_norm": 7.895763397216797, + "learning_rate": 8.855557078251337e-06, + "loss": 0.3492, + "step": 9102 + }, + { + "epoch": 0.12348073792729246, + "grad_norm": 5.530777931213379, + "learning_rate": 8.855420035631081e-06, + "loss": 0.4386, + "step": 9103 + }, + { + "epoch": 0.12349430276722735, + "grad_norm": 5.396377086639404, + "learning_rate": 8.855282993010828e-06, + "loss": 0.434, + "step": 9104 + }, + { + "epoch": 0.12350786760716223, + "grad_norm": 7.874265670776367, + "learning_rate": 8.855145950390573e-06, + "loss": 0.3418, + "step": 9105 + }, + { + "epoch": 0.12352143244709712, + "grad_norm": 5.522342681884766, + "learning_rate": 8.855008907770316e-06, + "loss": 0.33, + "step": 9106 + }, + { + "epoch": 0.12353499728703202, + "grad_norm": 6.041234016418457, + "learning_rate": 8.854871865150062e-06, + "loss": 0.3576, + "step": 9107 + }, + { + "epoch": 0.1235485621269669, + "grad_norm": 6.199678897857666, + "learning_rate": 8.854734822529809e-06, + "loss": 0.4309, + "step": 9108 + }, + { + "epoch": 0.12356212696690179, + "grad_norm": 6.554842472076416, + "learning_rate": 8.854597779909552e-06, + "loss": 0.4581, + "step": 9109 + }, + { + "epoch": 0.12357569180683668, + "grad_norm": 6.786654472351074, + "learning_rate": 8.854460737289297e-06, + "loss": 0.3757, + "step": 9110 + }, + { + "epoch": 0.12358925664677156, + "grad_norm": 5.463475227355957, + "learning_rate": 8.854323694669042e-06, + "loss": 0.3426, + "step": 9111 + }, + { + "epoch": 0.12360282148670645, + "grad_norm": 7.126456260681152, + "learning_rate": 8.854186652048788e-06, + "loss": 0.3837, + "step": 9112 + }, + { + "epoch": 0.12361638632664135, + "grad_norm": 5.688267230987549, + "learning_rate": 8.854049609428533e-06, + "loss": 0.2635, + "step": 9113 + }, + { + "epoch": 0.12362995116657624, + "grad_norm": 7.196133136749268, + "learning_rate": 8.853912566808278e-06, + "loss": 0.3584, + "step": 9114 + }, + { + "epoch": 0.12364351600651112, + "grad_norm": 5.275041103363037, + "learning_rate": 8.853775524188023e-06, + "loss": 0.3848, + "step": 9115 + }, + { + "epoch": 0.12365708084644601, + "grad_norm": 7.063828945159912, + "learning_rate": 8.853638481567768e-06, + "loss": 0.3712, + "step": 9116 + }, + { + "epoch": 0.1236706456863809, + "grad_norm": 6.610826015472412, + "learning_rate": 8.853501438947513e-06, + "loss": 0.3114, + "step": 9117 + }, + { + "epoch": 0.12368421052631579, + "grad_norm": 5.843132019042969, + "learning_rate": 8.853364396327259e-06, + "loss": 0.2995, + "step": 9118 + }, + { + "epoch": 0.12369777536625068, + "grad_norm": 6.580255031585693, + "learning_rate": 8.853227353707004e-06, + "loss": 0.4001, + "step": 9119 + }, + { + "epoch": 0.12371134020618557, + "grad_norm": 5.662643909454346, + "learning_rate": 8.853090311086749e-06, + "loss": 0.2935, + "step": 9120 + }, + { + "epoch": 0.12372490504612045, + "grad_norm": 5.364117622375488, + "learning_rate": 8.852953268466494e-06, + "loss": 0.2076, + "step": 9121 + }, + { + "epoch": 0.12373846988605534, + "grad_norm": 6.283024787902832, + "learning_rate": 8.85281622584624e-06, + "loss": 0.2706, + "step": 9122 + }, + { + "epoch": 0.12375203472599024, + "grad_norm": 7.232073783874512, + "learning_rate": 8.852679183225985e-06, + "loss": 0.3639, + "step": 9123 + }, + { + "epoch": 0.12376559956592512, + "grad_norm": 6.287027835845947, + "learning_rate": 8.852542140605728e-06, + "loss": 0.3419, + "step": 9124 + }, + { + "epoch": 0.12377916440586001, + "grad_norm": 4.151856422424316, + "learning_rate": 8.852405097985475e-06, + "loss": 0.2583, + "step": 9125 + }, + { + "epoch": 0.1237927292457949, + "grad_norm": 5.33691930770874, + "learning_rate": 8.85226805536522e-06, + "loss": 0.306, + "step": 9126 + }, + { + "epoch": 0.12380629408572978, + "grad_norm": 6.553770542144775, + "learning_rate": 8.852131012744965e-06, + "loss": 0.4156, + "step": 9127 + }, + { + "epoch": 0.12381985892566467, + "grad_norm": 6.8148651123046875, + "learning_rate": 8.851993970124709e-06, + "loss": 0.4202, + "step": 9128 + }, + { + "epoch": 0.12383342376559957, + "grad_norm": 5.445761680603027, + "learning_rate": 8.851856927504454e-06, + "loss": 0.345, + "step": 9129 + }, + { + "epoch": 0.12384698860553446, + "grad_norm": 4.460249900817871, + "learning_rate": 8.8517198848842e-06, + "loss": 0.3197, + "step": 9130 + }, + { + "epoch": 0.12386055344546934, + "grad_norm": 4.840601921081543, + "learning_rate": 8.851582842263944e-06, + "loss": 0.3431, + "step": 9131 + }, + { + "epoch": 0.12387411828540423, + "grad_norm": 5.37526273727417, + "learning_rate": 8.85144579964369e-06, + "loss": 0.2976, + "step": 9132 + }, + { + "epoch": 0.12388768312533913, + "grad_norm": 5.441303253173828, + "learning_rate": 8.851308757023435e-06, + "loss": 0.3607, + "step": 9133 + }, + { + "epoch": 0.123901247965274, + "grad_norm": 6.6216301918029785, + "learning_rate": 8.85117171440318e-06, + "loss": 0.3508, + "step": 9134 + }, + { + "epoch": 0.1239148128052089, + "grad_norm": 6.539887428283691, + "learning_rate": 8.851034671782925e-06, + "loss": 0.5543, + "step": 9135 + }, + { + "epoch": 0.12392837764514379, + "grad_norm": 8.177507400512695, + "learning_rate": 8.85089762916267e-06, + "loss": 0.5658, + "step": 9136 + }, + { + "epoch": 0.12394194248507867, + "grad_norm": 6.296236515045166, + "learning_rate": 8.850760586542415e-06, + "loss": 0.5181, + "step": 9137 + }, + { + "epoch": 0.12395550732501356, + "grad_norm": 4.588314533233643, + "learning_rate": 8.85062354392216e-06, + "loss": 0.2806, + "step": 9138 + }, + { + "epoch": 0.12396907216494846, + "grad_norm": 6.38804817199707, + "learning_rate": 8.850486501301906e-06, + "loss": 0.4805, + "step": 9139 + }, + { + "epoch": 0.12398263700488334, + "grad_norm": 7.593857288360596, + "learning_rate": 8.850349458681651e-06, + "loss": 0.411, + "step": 9140 + }, + { + "epoch": 0.12399620184481823, + "grad_norm": 7.149096488952637, + "learning_rate": 8.850212416061396e-06, + "loss": 0.4393, + "step": 9141 + }, + { + "epoch": 0.12400976668475312, + "grad_norm": 5.459890842437744, + "learning_rate": 8.850075373441141e-06, + "loss": 0.4035, + "step": 9142 + }, + { + "epoch": 0.124023331524688, + "grad_norm": 4.965338230133057, + "learning_rate": 8.849938330820886e-06, + "loss": 0.3152, + "step": 9143 + }, + { + "epoch": 0.1240368963646229, + "grad_norm": 9.791523933410645, + "learning_rate": 8.849801288200632e-06, + "loss": 0.4828, + "step": 9144 + }, + { + "epoch": 0.12405046120455779, + "grad_norm": 7.333685398101807, + "learning_rate": 8.849664245580377e-06, + "loss": 0.6252, + "step": 9145 + }, + { + "epoch": 0.12406402604449268, + "grad_norm": 5.748721599578857, + "learning_rate": 8.84952720296012e-06, + "loss": 0.3224, + "step": 9146 + }, + { + "epoch": 0.12407759088442756, + "grad_norm": 6.2932233810424805, + "learning_rate": 8.849390160339867e-06, + "loss": 0.4253, + "step": 9147 + }, + { + "epoch": 0.12409115572436245, + "grad_norm": 9.77763557434082, + "learning_rate": 8.849253117719612e-06, + "loss": 0.5558, + "step": 9148 + }, + { + "epoch": 0.12410472056429735, + "grad_norm": 8.240165710449219, + "learning_rate": 8.849116075099356e-06, + "loss": 0.4641, + "step": 9149 + }, + { + "epoch": 0.12411828540423223, + "grad_norm": 6.591037273406982, + "learning_rate": 8.848979032479101e-06, + "loss": 0.3914, + "step": 9150 + }, + { + "epoch": 0.12413185024416712, + "grad_norm": 7.083926677703857, + "learning_rate": 8.848841989858848e-06, + "loss": 0.4665, + "step": 9151 + }, + { + "epoch": 0.12414541508410201, + "grad_norm": 6.300109386444092, + "learning_rate": 8.848704947238593e-06, + "loss": 0.4426, + "step": 9152 + }, + { + "epoch": 0.12415897992403689, + "grad_norm": 6.684072017669678, + "learning_rate": 8.848567904618336e-06, + "loss": 0.419, + "step": 9153 + }, + { + "epoch": 0.12417254476397178, + "grad_norm": 7.39601993560791, + "learning_rate": 8.848430861998082e-06, + "loss": 0.4649, + "step": 9154 + }, + { + "epoch": 0.12418610960390668, + "grad_norm": 5.754361152648926, + "learning_rate": 8.848293819377827e-06, + "loss": 0.3774, + "step": 9155 + }, + { + "epoch": 0.12419967444384156, + "grad_norm": 5.896346092224121, + "learning_rate": 8.848156776757572e-06, + "loss": 0.3373, + "step": 9156 + }, + { + "epoch": 0.12421323928377645, + "grad_norm": 6.062142372131348, + "learning_rate": 8.848019734137317e-06, + "loss": 0.3933, + "step": 9157 + }, + { + "epoch": 0.12422680412371134, + "grad_norm": 5.823869705200195, + "learning_rate": 8.847882691517062e-06, + "loss": 0.3635, + "step": 9158 + }, + { + "epoch": 0.12424036896364622, + "grad_norm": 5.589511394500732, + "learning_rate": 8.847745648896808e-06, + "loss": 0.3286, + "step": 9159 + }, + { + "epoch": 0.12425393380358112, + "grad_norm": 9.828831672668457, + "learning_rate": 8.847608606276553e-06, + "loss": 0.615, + "step": 9160 + }, + { + "epoch": 0.12426749864351601, + "grad_norm": 6.7783637046813965, + "learning_rate": 8.847471563656298e-06, + "loss": 0.3241, + "step": 9161 + }, + { + "epoch": 0.1242810634834509, + "grad_norm": 6.60084867477417, + "learning_rate": 8.847334521036043e-06, + "loss": 0.3855, + "step": 9162 + }, + { + "epoch": 0.12429462832338578, + "grad_norm": 5.828875541687012, + "learning_rate": 8.847197478415788e-06, + "loss": 0.4035, + "step": 9163 + }, + { + "epoch": 0.12430819316332067, + "grad_norm": 6.1722731590271, + "learning_rate": 8.847060435795533e-06, + "loss": 0.3084, + "step": 9164 + }, + { + "epoch": 0.12432175800325557, + "grad_norm": 4.683402061462402, + "learning_rate": 8.846923393175279e-06, + "loss": 0.3762, + "step": 9165 + }, + { + "epoch": 0.12433532284319045, + "grad_norm": 7.257013320922852, + "learning_rate": 8.846786350555024e-06, + "loss": 0.3237, + "step": 9166 + }, + { + "epoch": 0.12434888768312534, + "grad_norm": 6.222968101501465, + "learning_rate": 8.846649307934769e-06, + "loss": 0.2674, + "step": 9167 + }, + { + "epoch": 0.12436245252306023, + "grad_norm": 4.50567626953125, + "learning_rate": 8.846512265314512e-06, + "loss": 0.1999, + "step": 9168 + }, + { + "epoch": 0.12437601736299511, + "grad_norm": 5.30322790145874, + "learning_rate": 8.84637522269426e-06, + "loss": 0.2918, + "step": 9169 + }, + { + "epoch": 0.12438958220293, + "grad_norm": 4.7391839027404785, + "learning_rate": 8.846238180074005e-06, + "loss": 0.3151, + "step": 9170 + }, + { + "epoch": 0.1244031470428649, + "grad_norm": 4.406193733215332, + "learning_rate": 8.846101137453748e-06, + "loss": 0.2131, + "step": 9171 + }, + { + "epoch": 0.12441671188279978, + "grad_norm": 4.986643314361572, + "learning_rate": 8.845964094833493e-06, + "loss": 0.318, + "step": 9172 + }, + { + "epoch": 0.12443027672273467, + "grad_norm": 5.005754470825195, + "learning_rate": 8.84582705221324e-06, + "loss": 0.4388, + "step": 9173 + }, + { + "epoch": 0.12444384156266956, + "grad_norm": 4.7966389656066895, + "learning_rate": 8.845690009592984e-06, + "loss": 0.1879, + "step": 9174 + }, + { + "epoch": 0.12445740640260444, + "grad_norm": 5.669036388397217, + "learning_rate": 8.845552966972729e-06, + "loss": 0.313, + "step": 9175 + }, + { + "epoch": 0.12447097124253934, + "grad_norm": 4.827810287475586, + "learning_rate": 8.845415924352474e-06, + "loss": 0.2636, + "step": 9176 + }, + { + "epoch": 0.12448453608247423, + "grad_norm": 5.209800720214844, + "learning_rate": 8.845278881732219e-06, + "loss": 0.1888, + "step": 9177 + }, + { + "epoch": 0.12449810092240912, + "grad_norm": 5.564769744873047, + "learning_rate": 8.845141839111964e-06, + "loss": 0.2371, + "step": 9178 + }, + { + "epoch": 0.124511665762344, + "grad_norm": 6.228020668029785, + "learning_rate": 8.84500479649171e-06, + "loss": 0.3072, + "step": 9179 + }, + { + "epoch": 0.1245252306022789, + "grad_norm": 5.935786724090576, + "learning_rate": 8.844867753871455e-06, + "loss": 0.3475, + "step": 9180 + }, + { + "epoch": 0.12453879544221379, + "grad_norm": 5.935708045959473, + "learning_rate": 8.8447307112512e-06, + "loss": 0.4272, + "step": 9181 + }, + { + "epoch": 0.12455236028214867, + "grad_norm": 4.2827582359313965, + "learning_rate": 8.844593668630945e-06, + "loss": 0.2809, + "step": 9182 + }, + { + "epoch": 0.12456592512208356, + "grad_norm": 5.1710944175720215, + "learning_rate": 8.84445662601069e-06, + "loss": 0.2682, + "step": 9183 + }, + { + "epoch": 0.12457948996201845, + "grad_norm": 6.2149152755737305, + "learning_rate": 8.844319583390435e-06, + "loss": 0.4724, + "step": 9184 + }, + { + "epoch": 0.12459305480195333, + "grad_norm": 4.502788066864014, + "learning_rate": 8.84418254077018e-06, + "loss": 0.2608, + "step": 9185 + }, + { + "epoch": 0.12460661964188822, + "grad_norm": 4.851719379425049, + "learning_rate": 8.844045498149926e-06, + "loss": 0.2651, + "step": 9186 + }, + { + "epoch": 0.12462018448182312, + "grad_norm": 17.406539916992188, + "learning_rate": 8.843908455529671e-06, + "loss": 0.4937, + "step": 9187 + }, + { + "epoch": 0.124633749321758, + "grad_norm": 5.009912490844727, + "learning_rate": 8.843771412909416e-06, + "loss": 0.3472, + "step": 9188 + }, + { + "epoch": 0.12464731416169289, + "grad_norm": 4.977118015289307, + "learning_rate": 8.84363437028916e-06, + "loss": 0.2874, + "step": 9189 + }, + { + "epoch": 0.12466087900162778, + "grad_norm": 5.124826431274414, + "learning_rate": 8.843497327668906e-06, + "loss": 0.35, + "step": 9190 + }, + { + "epoch": 0.12467444384156266, + "grad_norm": 5.593939781188965, + "learning_rate": 8.843360285048652e-06, + "loss": 0.2771, + "step": 9191 + }, + { + "epoch": 0.12468800868149756, + "grad_norm": 6.231496334075928, + "learning_rate": 8.843223242428395e-06, + "loss": 0.365, + "step": 9192 + }, + { + "epoch": 0.12470157352143245, + "grad_norm": 4.984182357788086, + "learning_rate": 8.84308619980814e-06, + "loss": 0.2343, + "step": 9193 + }, + { + "epoch": 0.12471513836136734, + "grad_norm": 4.678084850311279, + "learning_rate": 8.842949157187885e-06, + "loss": 0.2855, + "step": 9194 + }, + { + "epoch": 0.12472870320130222, + "grad_norm": 5.703056335449219, + "learning_rate": 8.842812114567632e-06, + "loss": 0.2837, + "step": 9195 + }, + { + "epoch": 0.12474226804123711, + "grad_norm": 4.84023904800415, + "learning_rate": 8.842675071947376e-06, + "loss": 0.2402, + "step": 9196 + }, + { + "epoch": 0.12475583288117201, + "grad_norm": 5.024317264556885, + "learning_rate": 8.842538029327121e-06, + "loss": 0.2354, + "step": 9197 + }, + { + "epoch": 0.12476939772110689, + "grad_norm": 4.591366767883301, + "learning_rate": 8.842400986706866e-06, + "loss": 0.2158, + "step": 9198 + }, + { + "epoch": 0.12478296256104178, + "grad_norm": 7.405924320220947, + "learning_rate": 8.842263944086611e-06, + "loss": 0.4312, + "step": 9199 + }, + { + "epoch": 0.12479652740097667, + "grad_norm": 5.29573917388916, + "learning_rate": 8.842126901466357e-06, + "loss": 0.311, + "step": 9200 + }, + { + "epoch": 0.12481009224091155, + "grad_norm": 5.004824161529541, + "learning_rate": 8.841989858846102e-06, + "loss": 0.2874, + "step": 9201 + }, + { + "epoch": 0.12482365708084645, + "grad_norm": 5.805532455444336, + "learning_rate": 8.841852816225847e-06, + "loss": 0.3594, + "step": 9202 + }, + { + "epoch": 0.12483722192078134, + "grad_norm": 4.611751079559326, + "learning_rate": 8.841715773605592e-06, + "loss": 0.2584, + "step": 9203 + }, + { + "epoch": 0.12485078676071622, + "grad_norm": 4.679304122924805, + "learning_rate": 8.841578730985337e-06, + "loss": 0.2889, + "step": 9204 + }, + { + "epoch": 0.12486435160065111, + "grad_norm": 5.590457439422607, + "learning_rate": 8.841441688365082e-06, + "loss": 0.2739, + "step": 9205 + }, + { + "epoch": 0.124877916440586, + "grad_norm": 5.361841201782227, + "learning_rate": 8.841304645744828e-06, + "loss": 0.2882, + "step": 9206 + }, + { + "epoch": 0.12489148128052088, + "grad_norm": 4.97106409072876, + "learning_rate": 8.841167603124573e-06, + "loss": 0.2692, + "step": 9207 + }, + { + "epoch": 0.12490504612045578, + "grad_norm": 8.392301559448242, + "learning_rate": 8.841030560504318e-06, + "loss": 0.3792, + "step": 9208 + }, + { + "epoch": 0.12491861096039067, + "grad_norm": 9.642327308654785, + "learning_rate": 8.840893517884063e-06, + "loss": 0.4828, + "step": 9209 + }, + { + "epoch": 0.12493217580032556, + "grad_norm": 5.745659351348877, + "learning_rate": 8.840756475263808e-06, + "loss": 0.3439, + "step": 9210 + }, + { + "epoch": 0.12494574064026044, + "grad_norm": 5.605535507202148, + "learning_rate": 8.840619432643552e-06, + "loss": 0.3281, + "step": 9211 + }, + { + "epoch": 0.12495930548019533, + "grad_norm": 4.368957996368408, + "learning_rate": 8.840482390023299e-06, + "loss": 0.3782, + "step": 9212 + }, + { + "epoch": 0.12497287032013023, + "grad_norm": 5.225927352905273, + "learning_rate": 8.840345347403044e-06, + "loss": 0.2577, + "step": 9213 + }, + { + "epoch": 0.1249864351600651, + "grad_norm": 6.89884090423584, + "learning_rate": 8.840208304782787e-06, + "loss": 0.3793, + "step": 9214 + }, + { + "epoch": 0.125, + "grad_norm": 6.6077165603637695, + "learning_rate": 8.840071262162533e-06, + "loss": 0.3503, + "step": 9215 + }, + { + "epoch": 0.1250135648399349, + "grad_norm": 4.647985935211182, + "learning_rate": 8.83993421954228e-06, + "loss": 0.2864, + "step": 9216 + }, + { + "epoch": 0.1250271296798698, + "grad_norm": 4.817343711853027, + "learning_rate": 8.839797176922023e-06, + "loss": 0.3558, + "step": 9217 + }, + { + "epoch": 0.12504069451980468, + "grad_norm": 4.626983642578125, + "learning_rate": 8.839660134301768e-06, + "loss": 0.2187, + "step": 9218 + }, + { + "epoch": 0.12505425935973954, + "grad_norm": 7.510318279266357, + "learning_rate": 8.839523091681513e-06, + "loss": 0.4552, + "step": 9219 + }, + { + "epoch": 0.12506782419967444, + "grad_norm": 6.2717695236206055, + "learning_rate": 8.83938604906126e-06, + "loss": 0.4451, + "step": 9220 + }, + { + "epoch": 0.12508138903960933, + "grad_norm": 4.390897750854492, + "learning_rate": 8.839249006441004e-06, + "loss": 0.2398, + "step": 9221 + }, + { + "epoch": 0.12509495387954422, + "grad_norm": 4.604058265686035, + "learning_rate": 8.839111963820749e-06, + "loss": 0.3178, + "step": 9222 + }, + { + "epoch": 0.12510851871947912, + "grad_norm": 6.668381214141846, + "learning_rate": 8.838974921200494e-06, + "loss": 0.3311, + "step": 9223 + }, + { + "epoch": 0.125122083559414, + "grad_norm": 6.030898094177246, + "learning_rate": 8.838837878580239e-06, + "loss": 0.4095, + "step": 9224 + }, + { + "epoch": 0.12513564839934888, + "grad_norm": 4.5103888511657715, + "learning_rate": 8.838700835959984e-06, + "loss": 0.3738, + "step": 9225 + }, + { + "epoch": 0.12514921323928377, + "grad_norm": 6.137810230255127, + "learning_rate": 8.83856379333973e-06, + "loss": 0.4193, + "step": 9226 + }, + { + "epoch": 0.12516277807921866, + "grad_norm": 6.30726432800293, + "learning_rate": 8.838426750719475e-06, + "loss": 0.2924, + "step": 9227 + }, + { + "epoch": 0.12517634291915355, + "grad_norm": 4.995174407958984, + "learning_rate": 8.83828970809922e-06, + "loss": 0.2214, + "step": 9228 + }, + { + "epoch": 0.12518990775908845, + "grad_norm": 5.176880359649658, + "learning_rate": 8.838152665478965e-06, + "loss": 0.3297, + "step": 9229 + }, + { + "epoch": 0.12520347259902334, + "grad_norm": 5.396122932434082, + "learning_rate": 8.83801562285871e-06, + "loss": 0.2921, + "step": 9230 + }, + { + "epoch": 0.12521703743895823, + "grad_norm": 5.324489116668701, + "learning_rate": 8.837878580238455e-06, + "loss": 0.2938, + "step": 9231 + }, + { + "epoch": 0.1252306022788931, + "grad_norm": 5.036745071411133, + "learning_rate": 8.837741537618199e-06, + "loss": 0.3885, + "step": 9232 + }, + { + "epoch": 0.125244167118828, + "grad_norm": 8.22474479675293, + "learning_rate": 8.837604494997946e-06, + "loss": 0.435, + "step": 9233 + }, + { + "epoch": 0.12525773195876289, + "grad_norm": 4.304083824157715, + "learning_rate": 8.837467452377691e-06, + "loss": 0.3789, + "step": 9234 + }, + { + "epoch": 0.12527129679869778, + "grad_norm": 6.15880012512207, + "learning_rate": 8.837330409757436e-06, + "loss": 0.4561, + "step": 9235 + }, + { + "epoch": 0.12528486163863267, + "grad_norm": 7.8522629737854, + "learning_rate": 8.83719336713718e-06, + "loss": 0.5869, + "step": 9236 + }, + { + "epoch": 0.12529842647856756, + "grad_norm": 5.238290309906006, + "learning_rate": 8.837056324516925e-06, + "loss": 0.3327, + "step": 9237 + }, + { + "epoch": 0.12531199131850243, + "grad_norm": 4.8665056228637695, + "learning_rate": 8.836919281896672e-06, + "loss": 0.2644, + "step": 9238 + }, + { + "epoch": 0.12532555615843732, + "grad_norm": 5.526402950286865, + "learning_rate": 8.836782239276415e-06, + "loss": 0.3795, + "step": 9239 + }, + { + "epoch": 0.12533912099837222, + "grad_norm": 5.553713321685791, + "learning_rate": 8.83664519665616e-06, + "loss": 0.2651, + "step": 9240 + }, + { + "epoch": 0.1253526858383071, + "grad_norm": 4.921950340270996, + "learning_rate": 8.836508154035905e-06, + "loss": 0.2509, + "step": 9241 + }, + { + "epoch": 0.125366250678242, + "grad_norm": 4.910889148712158, + "learning_rate": 8.83637111141565e-06, + "loss": 0.4094, + "step": 9242 + }, + { + "epoch": 0.1253798155181769, + "grad_norm": 6.570157527923584, + "learning_rate": 8.836234068795396e-06, + "loss": 0.4809, + "step": 9243 + }, + { + "epoch": 0.12539338035811176, + "grad_norm": 4.262679100036621, + "learning_rate": 8.836097026175141e-06, + "loss": 0.2098, + "step": 9244 + }, + { + "epoch": 0.12540694519804665, + "grad_norm": 6.565627098083496, + "learning_rate": 8.835959983554886e-06, + "loss": 0.3794, + "step": 9245 + }, + { + "epoch": 0.12542051003798155, + "grad_norm": 7.040469646453857, + "learning_rate": 8.835822940934631e-06, + "loss": 0.3902, + "step": 9246 + }, + { + "epoch": 0.12543407487791644, + "grad_norm": 6.057468891143799, + "learning_rate": 8.835685898314377e-06, + "loss": 0.296, + "step": 9247 + }, + { + "epoch": 0.12544763971785133, + "grad_norm": 6.80775260925293, + "learning_rate": 8.835548855694122e-06, + "loss": 0.3851, + "step": 9248 + }, + { + "epoch": 0.12546120455778623, + "grad_norm": 4.744982719421387, + "learning_rate": 8.835411813073867e-06, + "loss": 0.2469, + "step": 9249 + }, + { + "epoch": 0.12547476939772112, + "grad_norm": 7.163046836853027, + "learning_rate": 8.835274770453612e-06, + "loss": 0.3547, + "step": 9250 + }, + { + "epoch": 0.12548833423765599, + "grad_norm": 3.9816856384277344, + "learning_rate": 8.835137727833357e-06, + "loss": 0.2251, + "step": 9251 + }, + { + "epoch": 0.12550189907759088, + "grad_norm": 5.97204065322876, + "learning_rate": 8.835000685213102e-06, + "loss": 0.3893, + "step": 9252 + }, + { + "epoch": 0.12551546391752577, + "grad_norm": 6.780297756195068, + "learning_rate": 8.834863642592848e-06, + "loss": 0.5383, + "step": 9253 + }, + { + "epoch": 0.12552902875746066, + "grad_norm": 9.242212295532227, + "learning_rate": 8.834726599972591e-06, + "loss": 0.56, + "step": 9254 + }, + { + "epoch": 0.12554259359739556, + "grad_norm": 6.395657539367676, + "learning_rate": 8.834589557352338e-06, + "loss": 0.3167, + "step": 9255 + }, + { + "epoch": 0.12555615843733045, + "grad_norm": 6.785602569580078, + "learning_rate": 8.834452514732083e-06, + "loss": 0.3738, + "step": 9256 + }, + { + "epoch": 0.12556972327726532, + "grad_norm": 8.126018524169922, + "learning_rate": 8.834315472111827e-06, + "loss": 0.4817, + "step": 9257 + }, + { + "epoch": 0.1255832881172002, + "grad_norm": 6.297081470489502, + "learning_rate": 8.834178429491572e-06, + "loss": 0.3509, + "step": 9258 + }, + { + "epoch": 0.1255968529571351, + "grad_norm": 5.203668594360352, + "learning_rate": 8.834041386871319e-06, + "loss": 0.2997, + "step": 9259 + }, + { + "epoch": 0.12561041779707, + "grad_norm": 7.472095012664795, + "learning_rate": 8.833904344251064e-06, + "loss": 0.7271, + "step": 9260 + }, + { + "epoch": 0.1256239826370049, + "grad_norm": 5.302348613739014, + "learning_rate": 8.833767301630807e-06, + "loss": 0.3015, + "step": 9261 + }, + { + "epoch": 0.12563754747693978, + "grad_norm": 4.061131000518799, + "learning_rate": 8.833630259010553e-06, + "loss": 0.2007, + "step": 9262 + }, + { + "epoch": 0.12565111231687467, + "grad_norm": 7.353886127471924, + "learning_rate": 8.833493216390298e-06, + "loss": 0.4466, + "step": 9263 + }, + { + "epoch": 0.12566467715680954, + "grad_norm": 7.458902359008789, + "learning_rate": 8.833356173770043e-06, + "loss": 0.435, + "step": 9264 + }, + { + "epoch": 0.12567824199674443, + "grad_norm": 6.560181140899658, + "learning_rate": 8.833219131149788e-06, + "loss": 0.4431, + "step": 9265 + }, + { + "epoch": 0.12569180683667933, + "grad_norm": 6.126369476318359, + "learning_rate": 8.833082088529533e-06, + "loss": 0.5262, + "step": 9266 + }, + { + "epoch": 0.12570537167661422, + "grad_norm": 5.153446674346924, + "learning_rate": 8.832945045909278e-06, + "loss": 0.3878, + "step": 9267 + }, + { + "epoch": 0.1257189365165491, + "grad_norm": 4.822569370269775, + "learning_rate": 8.832808003289024e-06, + "loss": 0.3006, + "step": 9268 + }, + { + "epoch": 0.125732501356484, + "grad_norm": 5.1498122215271, + "learning_rate": 8.832670960668769e-06, + "loss": 0.3625, + "step": 9269 + }, + { + "epoch": 0.12574606619641887, + "grad_norm": 5.112273693084717, + "learning_rate": 8.832533918048514e-06, + "loss": 0.3, + "step": 9270 + }, + { + "epoch": 0.12575963103635376, + "grad_norm": 6.49813985824585, + "learning_rate": 8.832396875428259e-06, + "loss": 0.2991, + "step": 9271 + }, + { + "epoch": 0.12577319587628866, + "grad_norm": 4.4471354484558105, + "learning_rate": 8.832259832808004e-06, + "loss": 0.3396, + "step": 9272 + }, + { + "epoch": 0.12578676071622355, + "grad_norm": 6.118435382843018, + "learning_rate": 8.83212279018775e-06, + "loss": 0.4997, + "step": 9273 + }, + { + "epoch": 0.12580032555615844, + "grad_norm": 5.259538173675537, + "learning_rate": 8.831985747567495e-06, + "loss": 0.3232, + "step": 9274 + }, + { + "epoch": 0.12581389039609334, + "grad_norm": 7.263064384460449, + "learning_rate": 8.83184870494724e-06, + "loss": 0.5467, + "step": 9275 + }, + { + "epoch": 0.1258274552360282, + "grad_norm": 5.602488994598389, + "learning_rate": 8.831711662326985e-06, + "loss": 0.3366, + "step": 9276 + }, + { + "epoch": 0.1258410200759631, + "grad_norm": 5.831470489501953, + "learning_rate": 8.83157461970673e-06, + "loss": 0.3025, + "step": 9277 + }, + { + "epoch": 0.125854584915898, + "grad_norm": 5.122173309326172, + "learning_rate": 8.831437577086475e-06, + "loss": 0.3382, + "step": 9278 + }, + { + "epoch": 0.12586814975583288, + "grad_norm": 6.195060729980469, + "learning_rate": 8.831300534466219e-06, + "loss": 0.4591, + "step": 9279 + }, + { + "epoch": 0.12588171459576777, + "grad_norm": 6.763493537902832, + "learning_rate": 8.831163491845964e-06, + "loss": 0.4759, + "step": 9280 + }, + { + "epoch": 0.12589527943570267, + "grad_norm": 4.220452308654785, + "learning_rate": 8.831026449225711e-06, + "loss": 0.3805, + "step": 9281 + }, + { + "epoch": 0.12590884427563756, + "grad_norm": 6.6227216720581055, + "learning_rate": 8.830889406605454e-06, + "loss": 0.5137, + "step": 9282 + }, + { + "epoch": 0.12592240911557243, + "grad_norm": 6.230311393737793, + "learning_rate": 8.8307523639852e-06, + "loss": 0.4034, + "step": 9283 + }, + { + "epoch": 0.12593597395550732, + "grad_norm": 6.974530220031738, + "learning_rate": 8.830615321364945e-06, + "loss": 0.521, + "step": 9284 + }, + { + "epoch": 0.1259495387954422, + "grad_norm": 7.5833740234375, + "learning_rate": 8.83047827874469e-06, + "loss": 0.416, + "step": 9285 + }, + { + "epoch": 0.1259631036353771, + "grad_norm": 5.9638848304748535, + "learning_rate": 8.830341236124435e-06, + "loss": 0.2267, + "step": 9286 + }, + { + "epoch": 0.125976668475312, + "grad_norm": 6.692464351654053, + "learning_rate": 8.83020419350418e-06, + "loss": 0.425, + "step": 9287 + }, + { + "epoch": 0.1259902333152469, + "grad_norm": 7.034811973571777, + "learning_rate": 8.830067150883925e-06, + "loss": 0.6341, + "step": 9288 + }, + { + "epoch": 0.12600379815518176, + "grad_norm": 6.189993858337402, + "learning_rate": 8.82993010826367e-06, + "loss": 0.3566, + "step": 9289 + }, + { + "epoch": 0.12601736299511665, + "grad_norm": 5.053646087646484, + "learning_rate": 8.829793065643416e-06, + "loss": 0.3944, + "step": 9290 + }, + { + "epoch": 0.12603092783505154, + "grad_norm": 5.66740083694458, + "learning_rate": 8.829656023023161e-06, + "loss": 0.3948, + "step": 9291 + }, + { + "epoch": 0.12604449267498644, + "grad_norm": 6.799401760101318, + "learning_rate": 8.829518980402906e-06, + "loss": 0.6148, + "step": 9292 + }, + { + "epoch": 0.12605805751492133, + "grad_norm": 7.0509748458862305, + "learning_rate": 8.829381937782651e-06, + "loss": 0.6532, + "step": 9293 + }, + { + "epoch": 0.12607162235485622, + "grad_norm": 8.7008056640625, + "learning_rate": 8.829244895162397e-06, + "loss": 0.533, + "step": 9294 + }, + { + "epoch": 0.12608518719479112, + "grad_norm": 8.131333351135254, + "learning_rate": 8.829107852542142e-06, + "loss": 0.587, + "step": 9295 + }, + { + "epoch": 0.12609875203472598, + "grad_norm": 6.2456769943237305, + "learning_rate": 8.828970809921887e-06, + "loss": 0.4565, + "step": 9296 + }, + { + "epoch": 0.12611231687466087, + "grad_norm": 5.479619026184082, + "learning_rate": 8.82883376730163e-06, + "loss": 0.3835, + "step": 9297 + }, + { + "epoch": 0.12612588171459577, + "grad_norm": 5.468957424163818, + "learning_rate": 8.828696724681377e-06, + "loss": 0.4047, + "step": 9298 + }, + { + "epoch": 0.12613944655453066, + "grad_norm": 5.082484245300293, + "learning_rate": 8.828559682061122e-06, + "loss": 0.4881, + "step": 9299 + }, + { + "epoch": 0.12615301139446555, + "grad_norm": 6.587285995483398, + "learning_rate": 8.828422639440866e-06, + "loss": 0.3129, + "step": 9300 + }, + { + "epoch": 0.12616657623440045, + "grad_norm": 5.907428741455078, + "learning_rate": 8.828285596820611e-06, + "loss": 0.4374, + "step": 9301 + }, + { + "epoch": 0.1261801410743353, + "grad_norm": 5.96776819229126, + "learning_rate": 8.828148554200358e-06, + "loss": 0.4638, + "step": 9302 + }, + { + "epoch": 0.1261937059142702, + "grad_norm": 4.909683704376221, + "learning_rate": 8.828011511580103e-06, + "loss": 0.3214, + "step": 9303 + }, + { + "epoch": 0.1262072707542051, + "grad_norm": 6.210382461547852, + "learning_rate": 8.827874468959847e-06, + "loss": 0.4486, + "step": 9304 + }, + { + "epoch": 0.12622083559414, + "grad_norm": 6.29841947555542, + "learning_rate": 8.827737426339592e-06, + "loss": 0.5134, + "step": 9305 + }, + { + "epoch": 0.12623440043407488, + "grad_norm": 5.0956597328186035, + "learning_rate": 8.827600383719337e-06, + "loss": 0.4155, + "step": 9306 + }, + { + "epoch": 0.12624796527400978, + "grad_norm": 7.529736518859863, + "learning_rate": 8.827463341099082e-06, + "loss": 0.5208, + "step": 9307 + }, + { + "epoch": 0.12626153011394464, + "grad_norm": 6.153759002685547, + "learning_rate": 8.827326298478827e-06, + "loss": 0.4046, + "step": 9308 + }, + { + "epoch": 0.12627509495387954, + "grad_norm": 5.900186061859131, + "learning_rate": 8.827189255858573e-06, + "loss": 0.3207, + "step": 9309 + }, + { + "epoch": 0.12628865979381443, + "grad_norm": 5.983642101287842, + "learning_rate": 8.827052213238318e-06, + "loss": 0.4448, + "step": 9310 + }, + { + "epoch": 0.12630222463374932, + "grad_norm": 5.97969913482666, + "learning_rate": 8.826915170618063e-06, + "loss": 0.456, + "step": 9311 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 4.556216716766357, + "learning_rate": 8.826778127997808e-06, + "loss": 0.4162, + "step": 9312 + }, + { + "epoch": 0.1263293543136191, + "grad_norm": 8.102581024169922, + "learning_rate": 8.826641085377553e-06, + "loss": 0.3589, + "step": 9313 + }, + { + "epoch": 0.126342919153554, + "grad_norm": 5.666214466094971, + "learning_rate": 8.826504042757298e-06, + "loss": 0.5411, + "step": 9314 + }, + { + "epoch": 0.12635648399348887, + "grad_norm": 7.599396228790283, + "learning_rate": 8.826367000137044e-06, + "loss": 0.558, + "step": 9315 + }, + { + "epoch": 0.12637004883342376, + "grad_norm": 6.5936360359191895, + "learning_rate": 8.826229957516789e-06, + "loss": 0.4373, + "step": 9316 + }, + { + "epoch": 0.12638361367335865, + "grad_norm": 7.428076267242432, + "learning_rate": 8.826092914896534e-06, + "loss": 0.5657, + "step": 9317 + }, + { + "epoch": 0.12639717851329355, + "grad_norm": 6.571823596954346, + "learning_rate": 8.82595587227628e-06, + "loss": 0.5244, + "step": 9318 + }, + { + "epoch": 0.12641074335322844, + "grad_norm": 5.48664665222168, + "learning_rate": 8.825818829656023e-06, + "loss": 0.427, + "step": 9319 + }, + { + "epoch": 0.12642430819316333, + "grad_norm": 3.8210299015045166, + "learning_rate": 8.82568178703577e-06, + "loss": 0.2517, + "step": 9320 + }, + { + "epoch": 0.1264378730330982, + "grad_norm": 5.630343914031982, + "learning_rate": 8.825544744415515e-06, + "loss": 0.311, + "step": 9321 + }, + { + "epoch": 0.1264514378730331, + "grad_norm": 4.057847023010254, + "learning_rate": 8.825407701795258e-06, + "loss": 0.2942, + "step": 9322 + }, + { + "epoch": 0.12646500271296798, + "grad_norm": 7.671283721923828, + "learning_rate": 8.825270659175003e-06, + "loss": 0.4061, + "step": 9323 + }, + { + "epoch": 0.12647856755290288, + "grad_norm": 4.4753570556640625, + "learning_rate": 8.82513361655475e-06, + "loss": 0.2965, + "step": 9324 + }, + { + "epoch": 0.12649213239283777, + "grad_norm": 6.036142349243164, + "learning_rate": 8.824996573934494e-06, + "loss": 0.4193, + "step": 9325 + }, + { + "epoch": 0.12650569723277266, + "grad_norm": 6.566324710845947, + "learning_rate": 8.824859531314239e-06, + "loss": 0.3507, + "step": 9326 + }, + { + "epoch": 0.12651926207270756, + "grad_norm": 6.900656223297119, + "learning_rate": 8.824722488693984e-06, + "loss": 0.3373, + "step": 9327 + }, + { + "epoch": 0.12653282691264242, + "grad_norm": 5.317404747009277, + "learning_rate": 8.824585446073731e-06, + "loss": 0.318, + "step": 9328 + }, + { + "epoch": 0.12654639175257731, + "grad_norm": 5.62778902053833, + "learning_rate": 8.824448403453474e-06, + "loss": 0.416, + "step": 9329 + }, + { + "epoch": 0.1265599565925122, + "grad_norm": 6.157272815704346, + "learning_rate": 8.82431136083322e-06, + "loss": 0.3354, + "step": 9330 + }, + { + "epoch": 0.1265735214324471, + "grad_norm": 7.372447490692139, + "learning_rate": 8.824174318212965e-06, + "loss": 0.5728, + "step": 9331 + }, + { + "epoch": 0.126587086272382, + "grad_norm": 5.604654312133789, + "learning_rate": 8.82403727559271e-06, + "loss": 0.3198, + "step": 9332 + }, + { + "epoch": 0.1266006511123169, + "grad_norm": 5.158915996551514, + "learning_rate": 8.823900232972455e-06, + "loss": 0.3034, + "step": 9333 + }, + { + "epoch": 0.12661421595225175, + "grad_norm": 7.212167739868164, + "learning_rate": 8.8237631903522e-06, + "loss": 0.4255, + "step": 9334 + }, + { + "epoch": 0.12662778079218664, + "grad_norm": 8.350520133972168, + "learning_rate": 8.823626147731946e-06, + "loss": 0.722, + "step": 9335 + }, + { + "epoch": 0.12664134563212154, + "grad_norm": 5.930769920349121, + "learning_rate": 8.82348910511169e-06, + "loss": 0.3708, + "step": 9336 + }, + { + "epoch": 0.12665491047205643, + "grad_norm": 8.291120529174805, + "learning_rate": 8.823352062491436e-06, + "loss": 0.5319, + "step": 9337 + }, + { + "epoch": 0.12666847531199132, + "grad_norm": 7.489705562591553, + "learning_rate": 8.823215019871181e-06, + "loss": 0.448, + "step": 9338 + }, + { + "epoch": 0.12668204015192622, + "grad_norm": 6.86050271987915, + "learning_rate": 8.823077977250926e-06, + "loss": 0.4542, + "step": 9339 + }, + { + "epoch": 0.12669560499186108, + "grad_norm": 4.97554349899292, + "learning_rate": 8.82294093463067e-06, + "loss": 0.3649, + "step": 9340 + }, + { + "epoch": 0.12670916983179598, + "grad_norm": 4.901390552520752, + "learning_rate": 8.822803892010417e-06, + "loss": 0.3033, + "step": 9341 + }, + { + "epoch": 0.12672273467173087, + "grad_norm": 6.954668045043945, + "learning_rate": 8.822666849390162e-06, + "loss": 0.3972, + "step": 9342 + }, + { + "epoch": 0.12673629951166576, + "grad_norm": 7.857632637023926, + "learning_rate": 8.822529806769907e-06, + "loss": 0.404, + "step": 9343 + }, + { + "epoch": 0.12674986435160066, + "grad_norm": 5.445584774017334, + "learning_rate": 8.82239276414965e-06, + "loss": 0.3741, + "step": 9344 + }, + { + "epoch": 0.12676342919153555, + "grad_norm": 7.282781600952148, + "learning_rate": 8.822255721529397e-06, + "loss": 0.6576, + "step": 9345 + }, + { + "epoch": 0.12677699403147044, + "grad_norm": 6.466987133026123, + "learning_rate": 8.822118678909142e-06, + "loss": 0.3684, + "step": 9346 + }, + { + "epoch": 0.1267905588714053, + "grad_norm": 6.377092361450195, + "learning_rate": 8.821981636288886e-06, + "loss": 0.4727, + "step": 9347 + }, + { + "epoch": 0.1268041237113402, + "grad_norm": 5.328265190124512, + "learning_rate": 8.821844593668631e-06, + "loss": 0.4439, + "step": 9348 + }, + { + "epoch": 0.1268176885512751, + "grad_norm": 6.653322219848633, + "learning_rate": 8.821707551048376e-06, + "loss": 0.5662, + "step": 9349 + }, + { + "epoch": 0.12683125339120999, + "grad_norm": 7.698816299438477, + "learning_rate": 8.821570508428121e-06, + "loss": 0.5572, + "step": 9350 + }, + { + "epoch": 0.12684481823114488, + "grad_norm": 6.062867641448975, + "learning_rate": 8.821433465807867e-06, + "loss": 0.3371, + "step": 9351 + }, + { + "epoch": 0.12685838307107977, + "grad_norm": 5.479949474334717, + "learning_rate": 8.821296423187612e-06, + "loss": 0.3099, + "step": 9352 + }, + { + "epoch": 0.12687194791101464, + "grad_norm": 6.534171104431152, + "learning_rate": 8.821159380567357e-06, + "loss": 0.6097, + "step": 9353 + }, + { + "epoch": 0.12688551275094953, + "grad_norm": 5.515438079833984, + "learning_rate": 8.821022337947102e-06, + "loss": 0.4474, + "step": 9354 + }, + { + "epoch": 0.12689907759088442, + "grad_norm": 7.6503190994262695, + "learning_rate": 8.820885295326847e-06, + "loss": 0.4072, + "step": 9355 + }, + { + "epoch": 0.12691264243081932, + "grad_norm": 5.590672492980957, + "learning_rate": 8.820748252706593e-06, + "loss": 0.4898, + "step": 9356 + }, + { + "epoch": 0.1269262072707542, + "grad_norm": 4.5936689376831055, + "learning_rate": 8.820611210086338e-06, + "loss": 0.2938, + "step": 9357 + }, + { + "epoch": 0.1269397721106891, + "grad_norm": 6.345268726348877, + "learning_rate": 8.820474167466083e-06, + "loss": 0.2659, + "step": 9358 + }, + { + "epoch": 0.126953336950624, + "grad_norm": 4.725047588348389, + "learning_rate": 8.820337124845828e-06, + "loss": 0.2604, + "step": 9359 + }, + { + "epoch": 0.12696690179055886, + "grad_norm": 6.975269317626953, + "learning_rate": 8.820200082225573e-06, + "loss": 0.5634, + "step": 9360 + }, + { + "epoch": 0.12698046663049375, + "grad_norm": 8.861628532409668, + "learning_rate": 8.820063039605318e-06, + "loss": 0.5182, + "step": 9361 + }, + { + "epoch": 0.12699403147042865, + "grad_norm": 7.023647308349609, + "learning_rate": 8.819925996985062e-06, + "loss": 0.3837, + "step": 9362 + }, + { + "epoch": 0.12700759631036354, + "grad_norm": 8.66125202178955, + "learning_rate": 8.819788954364809e-06, + "loss": 0.5894, + "step": 9363 + }, + { + "epoch": 0.12702116115029843, + "grad_norm": 5.81305456161499, + "learning_rate": 8.819651911744554e-06, + "loss": 0.4321, + "step": 9364 + }, + { + "epoch": 0.12703472599023333, + "grad_norm": 8.77253246307373, + "learning_rate": 8.819514869124297e-06, + "loss": 0.523, + "step": 9365 + }, + { + "epoch": 0.1270482908301682, + "grad_norm": 5.061330318450928, + "learning_rate": 8.819377826504043e-06, + "loss": 0.3109, + "step": 9366 + }, + { + "epoch": 0.12706185567010309, + "grad_norm": 7.228027820587158, + "learning_rate": 8.81924078388379e-06, + "loss": 0.5055, + "step": 9367 + }, + { + "epoch": 0.12707542051003798, + "grad_norm": 5.968118190765381, + "learning_rate": 8.819103741263533e-06, + "loss": 0.3665, + "step": 9368 + }, + { + "epoch": 0.12708898534997287, + "grad_norm": 6.439276218414307, + "learning_rate": 8.818966698643278e-06, + "loss": 0.4819, + "step": 9369 + }, + { + "epoch": 0.12710255018990776, + "grad_norm": 7.18994140625, + "learning_rate": 8.818829656023023e-06, + "loss": 0.5261, + "step": 9370 + }, + { + "epoch": 0.12711611502984266, + "grad_norm": 5.989414215087891, + "learning_rate": 8.81869261340277e-06, + "loss": 0.3479, + "step": 9371 + }, + { + "epoch": 0.12712967986977752, + "grad_norm": 7.132907867431641, + "learning_rate": 8.818555570782514e-06, + "loss": 0.3131, + "step": 9372 + }, + { + "epoch": 0.12714324470971242, + "grad_norm": 6.114035129547119, + "learning_rate": 8.818418528162259e-06, + "loss": 0.4139, + "step": 9373 + }, + { + "epoch": 0.1271568095496473, + "grad_norm": 5.613781929016113, + "learning_rate": 8.818281485542004e-06, + "loss": 0.3695, + "step": 9374 + }, + { + "epoch": 0.1271703743895822, + "grad_norm": 6.31889009475708, + "learning_rate": 8.81814444292175e-06, + "loss": 0.585, + "step": 9375 + }, + { + "epoch": 0.1271839392295171, + "grad_norm": 6.894179821014404, + "learning_rate": 8.818007400301494e-06, + "loss": 0.405, + "step": 9376 + }, + { + "epoch": 0.127197504069452, + "grad_norm": 8.5472412109375, + "learning_rate": 8.81787035768124e-06, + "loss": 0.6192, + "step": 9377 + }, + { + "epoch": 0.12721106890938688, + "grad_norm": 6.004326343536377, + "learning_rate": 8.817733315060985e-06, + "loss": 0.4964, + "step": 9378 + }, + { + "epoch": 0.12722463374932175, + "grad_norm": 7.230207920074463, + "learning_rate": 8.81759627244073e-06, + "loss": 0.5536, + "step": 9379 + }, + { + "epoch": 0.12723819858925664, + "grad_norm": 6.884519100189209, + "learning_rate": 8.817459229820475e-06, + "loss": 0.4793, + "step": 9380 + }, + { + "epoch": 0.12725176342919153, + "grad_norm": 9.041193008422852, + "learning_rate": 8.81732218720022e-06, + "loss": 0.5966, + "step": 9381 + }, + { + "epoch": 0.12726532826912643, + "grad_norm": 6.9641194343566895, + "learning_rate": 8.817185144579966e-06, + "loss": 0.4986, + "step": 9382 + }, + { + "epoch": 0.12727889310906132, + "grad_norm": 4.754835605621338, + "learning_rate": 8.817048101959709e-06, + "loss": 0.3319, + "step": 9383 + }, + { + "epoch": 0.1272924579489962, + "grad_norm": 7.902969837188721, + "learning_rate": 8.816911059339456e-06, + "loss": 0.3398, + "step": 9384 + }, + { + "epoch": 0.12730602278893108, + "grad_norm": 6.064583778381348, + "learning_rate": 8.816774016719201e-06, + "loss": 0.3442, + "step": 9385 + }, + { + "epoch": 0.12731958762886597, + "grad_norm": 6.714383125305176, + "learning_rate": 8.816636974098946e-06, + "loss": 0.4549, + "step": 9386 + }, + { + "epoch": 0.12733315246880086, + "grad_norm": 5.490059852600098, + "learning_rate": 8.81649993147869e-06, + "loss": 0.356, + "step": 9387 + }, + { + "epoch": 0.12734671730873576, + "grad_norm": 5.77320671081543, + "learning_rate": 8.816362888858435e-06, + "loss": 0.3696, + "step": 9388 + }, + { + "epoch": 0.12736028214867065, + "grad_norm": 9.517786979675293, + "learning_rate": 8.816225846238182e-06, + "loss": 0.5112, + "step": 9389 + }, + { + "epoch": 0.12737384698860554, + "grad_norm": 8.97791862487793, + "learning_rate": 8.816088803617925e-06, + "loss": 0.5195, + "step": 9390 + }, + { + "epoch": 0.12738741182854044, + "grad_norm": 4.941748142242432, + "learning_rate": 8.81595176099767e-06, + "loss": 0.3124, + "step": 9391 + }, + { + "epoch": 0.1274009766684753, + "grad_norm": 7.790533542633057, + "learning_rate": 8.815814718377416e-06, + "loss": 0.3706, + "step": 9392 + }, + { + "epoch": 0.1274145415084102, + "grad_norm": 8.520459175109863, + "learning_rate": 8.81567767575716e-06, + "loss": 0.5418, + "step": 9393 + }, + { + "epoch": 0.1274281063483451, + "grad_norm": 6.430764198303223, + "learning_rate": 8.815540633136906e-06, + "loss": 0.3692, + "step": 9394 + }, + { + "epoch": 0.12744167118827998, + "grad_norm": 8.152447700500488, + "learning_rate": 8.815403590516651e-06, + "loss": 0.4874, + "step": 9395 + }, + { + "epoch": 0.12745523602821487, + "grad_norm": 7.969241142272949, + "learning_rate": 8.815266547896396e-06, + "loss": 0.5542, + "step": 9396 + }, + { + "epoch": 0.12746880086814977, + "grad_norm": 5.649544715881348, + "learning_rate": 8.815129505276142e-06, + "loss": 0.3272, + "step": 9397 + }, + { + "epoch": 0.12748236570808463, + "grad_norm": 9.518413543701172, + "learning_rate": 8.814992462655887e-06, + "loss": 0.5107, + "step": 9398 + }, + { + "epoch": 0.12749593054801953, + "grad_norm": 7.647980213165283, + "learning_rate": 8.814855420035632e-06, + "loss": 0.4684, + "step": 9399 + }, + { + "epoch": 0.12750949538795442, + "grad_norm": 5.50203275680542, + "learning_rate": 8.814718377415377e-06, + "loss": 0.3489, + "step": 9400 + }, + { + "epoch": 0.1275230602278893, + "grad_norm": 6.625609397888184, + "learning_rate": 8.814581334795122e-06, + "loss": 0.333, + "step": 9401 + }, + { + "epoch": 0.1275366250678242, + "grad_norm": 4.534736156463623, + "learning_rate": 8.814444292174867e-06, + "loss": 0.2319, + "step": 9402 + }, + { + "epoch": 0.1275501899077591, + "grad_norm": 6.307202339172363, + "learning_rate": 8.814307249554613e-06, + "loss": 0.3773, + "step": 9403 + }, + { + "epoch": 0.12756375474769396, + "grad_norm": 4.505527496337891, + "learning_rate": 8.814170206934358e-06, + "loss": 0.2646, + "step": 9404 + }, + { + "epoch": 0.12757731958762886, + "grad_norm": 5.187893867492676, + "learning_rate": 8.814033164314101e-06, + "loss": 0.2483, + "step": 9405 + }, + { + "epoch": 0.12759088442756375, + "grad_norm": 5.642287254333496, + "learning_rate": 8.813896121693848e-06, + "loss": 0.2713, + "step": 9406 + }, + { + "epoch": 0.12760444926749864, + "grad_norm": 4.95665168762207, + "learning_rate": 8.813759079073593e-06, + "loss": 0.3482, + "step": 9407 + }, + { + "epoch": 0.12761801410743354, + "grad_norm": 6.152754306793213, + "learning_rate": 8.813622036453337e-06, + "loss": 0.384, + "step": 9408 + }, + { + "epoch": 0.12763157894736843, + "grad_norm": 6.814677715301514, + "learning_rate": 8.813484993833082e-06, + "loss": 0.427, + "step": 9409 + }, + { + "epoch": 0.12764514378730332, + "grad_norm": 5.258660793304443, + "learning_rate": 8.813347951212829e-06, + "loss": 0.2911, + "step": 9410 + }, + { + "epoch": 0.1276587086272382, + "grad_norm": 6.964994430541992, + "learning_rate": 8.813210908592574e-06, + "loss": 0.4385, + "step": 9411 + }, + { + "epoch": 0.12767227346717308, + "grad_norm": 5.975459575653076, + "learning_rate": 8.813073865972317e-06, + "loss": 0.4466, + "step": 9412 + }, + { + "epoch": 0.12768583830710797, + "grad_norm": 7.01723575592041, + "learning_rate": 8.812936823352063e-06, + "loss": 0.3753, + "step": 9413 + }, + { + "epoch": 0.12769940314704287, + "grad_norm": 7.5549139976501465, + "learning_rate": 8.81279978073181e-06, + "loss": 0.4748, + "step": 9414 + }, + { + "epoch": 0.12771296798697776, + "grad_norm": 6.264267921447754, + "learning_rate": 8.812662738111553e-06, + "loss": 0.3477, + "step": 9415 + }, + { + "epoch": 0.12772653282691265, + "grad_norm": 6.189760208129883, + "learning_rate": 8.812525695491298e-06, + "loss": 0.4156, + "step": 9416 + }, + { + "epoch": 0.12774009766684752, + "grad_norm": 6.486565589904785, + "learning_rate": 8.812388652871043e-06, + "loss": 0.3718, + "step": 9417 + }, + { + "epoch": 0.1277536625067824, + "grad_norm": 5.472061634063721, + "learning_rate": 8.812251610250789e-06, + "loss": 0.4252, + "step": 9418 + }, + { + "epoch": 0.1277672273467173, + "grad_norm": 6.998287200927734, + "learning_rate": 8.812114567630534e-06, + "loss": 0.2947, + "step": 9419 + }, + { + "epoch": 0.1277807921866522, + "grad_norm": 7.155975341796875, + "learning_rate": 8.811977525010279e-06, + "loss": 0.2957, + "step": 9420 + }, + { + "epoch": 0.1277943570265871, + "grad_norm": 7.168099403381348, + "learning_rate": 8.811840482390024e-06, + "loss": 0.3665, + "step": 9421 + }, + { + "epoch": 0.12780792186652198, + "grad_norm": 6.987526893615723, + "learning_rate": 8.81170343976977e-06, + "loss": 0.372, + "step": 9422 + }, + { + "epoch": 0.12782148670645688, + "grad_norm": 6.95559024810791, + "learning_rate": 8.811566397149514e-06, + "loss": 0.4738, + "step": 9423 + }, + { + "epoch": 0.12783505154639174, + "grad_norm": 6.355195999145508, + "learning_rate": 8.81142935452926e-06, + "loss": 0.3711, + "step": 9424 + }, + { + "epoch": 0.12784861638632664, + "grad_norm": 6.3852057456970215, + "learning_rate": 8.811292311909005e-06, + "loss": 0.4669, + "step": 9425 + }, + { + "epoch": 0.12786218122626153, + "grad_norm": 7.7700324058532715, + "learning_rate": 8.81115526928875e-06, + "loss": 0.4185, + "step": 9426 + }, + { + "epoch": 0.12787574606619642, + "grad_norm": 6.839609622955322, + "learning_rate": 8.811018226668495e-06, + "loss": 0.398, + "step": 9427 + }, + { + "epoch": 0.12788931090613131, + "grad_norm": 7.282422065734863, + "learning_rate": 8.81088118404824e-06, + "loss": 0.4607, + "step": 9428 + }, + { + "epoch": 0.1279028757460662, + "grad_norm": 6.3863043785095215, + "learning_rate": 8.810744141427986e-06, + "loss": 0.4826, + "step": 9429 + }, + { + "epoch": 0.12791644058600107, + "grad_norm": 7.120391368865967, + "learning_rate": 8.810607098807729e-06, + "loss": 0.4484, + "step": 9430 + }, + { + "epoch": 0.12793000542593597, + "grad_norm": 4.8772406578063965, + "learning_rate": 8.810470056187474e-06, + "loss": 0.2619, + "step": 9431 + }, + { + "epoch": 0.12794357026587086, + "grad_norm": 8.101133346557617, + "learning_rate": 8.810333013567221e-06, + "loss": 0.521, + "step": 9432 + }, + { + "epoch": 0.12795713510580575, + "grad_norm": 5.61976957321167, + "learning_rate": 8.810195970946965e-06, + "loss": 0.4034, + "step": 9433 + }, + { + "epoch": 0.12797069994574065, + "grad_norm": 4.876786708831787, + "learning_rate": 8.81005892832671e-06, + "loss": 0.3501, + "step": 9434 + }, + { + "epoch": 0.12798426478567554, + "grad_norm": 7.105213165283203, + "learning_rate": 8.809921885706455e-06, + "loss": 0.5402, + "step": 9435 + }, + { + "epoch": 0.1279978296256104, + "grad_norm": 5.177973747253418, + "learning_rate": 8.809784843086202e-06, + "loss": 0.3523, + "step": 9436 + }, + { + "epoch": 0.1280113944655453, + "grad_norm": 9.35348129272461, + "learning_rate": 8.809647800465945e-06, + "loss": 0.4837, + "step": 9437 + }, + { + "epoch": 0.1280249593054802, + "grad_norm": 4.509084701538086, + "learning_rate": 8.80951075784569e-06, + "loss": 0.2939, + "step": 9438 + }, + { + "epoch": 0.12803852414541508, + "grad_norm": 6.1620612144470215, + "learning_rate": 8.809373715225436e-06, + "loss": 0.4009, + "step": 9439 + }, + { + "epoch": 0.12805208898534998, + "grad_norm": 5.066526412963867, + "learning_rate": 8.80923667260518e-06, + "loss": 0.3032, + "step": 9440 + }, + { + "epoch": 0.12806565382528487, + "grad_norm": 5.660312652587891, + "learning_rate": 8.809099629984926e-06, + "loss": 0.299, + "step": 9441 + }, + { + "epoch": 0.12807921866521976, + "grad_norm": 5.149104118347168, + "learning_rate": 8.808962587364671e-06, + "loss": 0.4128, + "step": 9442 + }, + { + "epoch": 0.12809278350515463, + "grad_norm": 6.729113578796387, + "learning_rate": 8.808825544744416e-06, + "loss": 0.3707, + "step": 9443 + }, + { + "epoch": 0.12810634834508952, + "grad_norm": 5.784377098083496, + "learning_rate": 8.808688502124162e-06, + "loss": 0.3618, + "step": 9444 + }, + { + "epoch": 0.12811991318502441, + "grad_norm": 4.793200969696045, + "learning_rate": 8.808551459503907e-06, + "loss": 0.3187, + "step": 9445 + }, + { + "epoch": 0.1281334780249593, + "grad_norm": 3.443164587020874, + "learning_rate": 8.808414416883652e-06, + "loss": 0.2803, + "step": 9446 + }, + { + "epoch": 0.1281470428648942, + "grad_norm": 5.423295974731445, + "learning_rate": 8.808277374263397e-06, + "loss": 0.3246, + "step": 9447 + }, + { + "epoch": 0.1281606077048291, + "grad_norm": 4.734610080718994, + "learning_rate": 8.80814033164314e-06, + "loss": 0.2972, + "step": 9448 + }, + { + "epoch": 0.12817417254476396, + "grad_norm": 5.268932819366455, + "learning_rate": 8.808003289022887e-06, + "loss": 0.3134, + "step": 9449 + }, + { + "epoch": 0.12818773738469885, + "grad_norm": 5.403271198272705, + "learning_rate": 8.807866246402633e-06, + "loss": 0.3586, + "step": 9450 + }, + { + "epoch": 0.12820130222463375, + "grad_norm": 6.1735944747924805, + "learning_rate": 8.807729203782378e-06, + "loss": 0.341, + "step": 9451 + }, + { + "epoch": 0.12821486706456864, + "grad_norm": 7.038978099822998, + "learning_rate": 8.807592161162121e-06, + "loss": 0.5295, + "step": 9452 + }, + { + "epoch": 0.12822843190450353, + "grad_norm": 6.414157867431641, + "learning_rate": 8.807455118541868e-06, + "loss": 0.3984, + "step": 9453 + }, + { + "epoch": 0.12824199674443842, + "grad_norm": 5.12129020690918, + "learning_rate": 8.807318075921613e-06, + "loss": 0.2543, + "step": 9454 + }, + { + "epoch": 0.12825556158437332, + "grad_norm": 6.3106865882873535, + "learning_rate": 8.807181033301357e-06, + "loss": 0.4916, + "step": 9455 + }, + { + "epoch": 0.12826912642430818, + "grad_norm": 5.073246955871582, + "learning_rate": 8.807043990681102e-06, + "loss": 0.3117, + "step": 9456 + }, + { + "epoch": 0.12828269126424308, + "grad_norm": 6.317397594451904, + "learning_rate": 8.806906948060847e-06, + "loss": 0.4027, + "step": 9457 + }, + { + "epoch": 0.12829625610417797, + "grad_norm": 5.751558303833008, + "learning_rate": 8.806769905440592e-06, + "loss": 0.3245, + "step": 9458 + }, + { + "epoch": 0.12830982094411286, + "grad_norm": 6.40185022354126, + "learning_rate": 8.806632862820338e-06, + "loss": 0.3976, + "step": 9459 + }, + { + "epoch": 0.12832338578404776, + "grad_norm": 7.771173477172852, + "learning_rate": 8.806495820200083e-06, + "loss": 0.575, + "step": 9460 + }, + { + "epoch": 0.12833695062398265, + "grad_norm": 7.367980003356934, + "learning_rate": 8.806358777579828e-06, + "loss": 0.476, + "step": 9461 + }, + { + "epoch": 0.1283505154639175, + "grad_norm": 7.0257158279418945, + "learning_rate": 8.806221734959573e-06, + "loss": 0.4149, + "step": 9462 + }, + { + "epoch": 0.1283640803038524, + "grad_norm": 5.72233772277832, + "learning_rate": 8.806084692339318e-06, + "loss": 0.3742, + "step": 9463 + }, + { + "epoch": 0.1283776451437873, + "grad_norm": 6.113210201263428, + "learning_rate": 8.805947649719063e-06, + "loss": 0.4256, + "step": 9464 + }, + { + "epoch": 0.1283912099837222, + "grad_norm": 5.5373029708862305, + "learning_rate": 8.805810607098809e-06, + "loss": 0.435, + "step": 9465 + }, + { + "epoch": 0.1284047748236571, + "grad_norm": 6.5700836181640625, + "learning_rate": 8.805673564478554e-06, + "loss": 0.4448, + "step": 9466 + }, + { + "epoch": 0.12841833966359198, + "grad_norm": 8.129587173461914, + "learning_rate": 8.805536521858299e-06, + "loss": 0.354, + "step": 9467 + }, + { + "epoch": 0.12843190450352684, + "grad_norm": 5.937401294708252, + "learning_rate": 8.805399479238044e-06, + "loss": 0.5266, + "step": 9468 + }, + { + "epoch": 0.12844546934346174, + "grad_norm": 6.577179908752441, + "learning_rate": 8.80526243661779e-06, + "loss": 0.6406, + "step": 9469 + }, + { + "epoch": 0.12845903418339663, + "grad_norm": 7.066000938415527, + "learning_rate": 8.805125393997533e-06, + "loss": 0.4016, + "step": 9470 + }, + { + "epoch": 0.12847259902333152, + "grad_norm": 5.82532262802124, + "learning_rate": 8.80498835137728e-06, + "loss": 0.4655, + "step": 9471 + }, + { + "epoch": 0.12848616386326642, + "grad_norm": 6.254229545593262, + "learning_rate": 8.804851308757025e-06, + "loss": 0.4648, + "step": 9472 + }, + { + "epoch": 0.1284997287032013, + "grad_norm": 6.324362754821777, + "learning_rate": 8.804714266136768e-06, + "loss": 0.4075, + "step": 9473 + }, + { + "epoch": 0.1285132935431362, + "grad_norm": 5.589186668395996, + "learning_rate": 8.804577223516514e-06, + "loss": 0.4508, + "step": 9474 + }, + { + "epoch": 0.12852685838307107, + "grad_norm": 7.441007137298584, + "learning_rate": 8.80444018089626e-06, + "loss": 0.5351, + "step": 9475 + }, + { + "epoch": 0.12854042322300596, + "grad_norm": 6.498964786529541, + "learning_rate": 8.804303138276004e-06, + "loss": 0.4477, + "step": 9476 + }, + { + "epoch": 0.12855398806294085, + "grad_norm": 7.544116973876953, + "learning_rate": 8.804166095655749e-06, + "loss": 0.4446, + "step": 9477 + }, + { + "epoch": 0.12856755290287575, + "grad_norm": 6.211252689361572, + "learning_rate": 8.804029053035494e-06, + "loss": 0.4494, + "step": 9478 + }, + { + "epoch": 0.12858111774281064, + "grad_norm": 8.794121742248535, + "learning_rate": 8.803892010415241e-06, + "loss": 0.6286, + "step": 9479 + }, + { + "epoch": 0.12859468258274553, + "grad_norm": 7.9721879959106445, + "learning_rate": 8.803754967794985e-06, + "loss": 0.5855, + "step": 9480 + }, + { + "epoch": 0.1286082474226804, + "grad_norm": 10.121832847595215, + "learning_rate": 8.80361792517473e-06, + "loss": 0.6879, + "step": 9481 + }, + { + "epoch": 0.1286218122626153, + "grad_norm": 6.522327899932861, + "learning_rate": 8.803480882554475e-06, + "loss": 0.4942, + "step": 9482 + }, + { + "epoch": 0.12863537710255019, + "grad_norm": 4.621888637542725, + "learning_rate": 8.80334383993422e-06, + "loss": 0.3384, + "step": 9483 + }, + { + "epoch": 0.12864894194248508, + "grad_norm": 6.654333114624023, + "learning_rate": 8.803206797313965e-06, + "loss": 0.3411, + "step": 9484 + }, + { + "epoch": 0.12866250678241997, + "grad_norm": 6.504882335662842, + "learning_rate": 8.80306975469371e-06, + "loss": 0.3864, + "step": 9485 + }, + { + "epoch": 0.12867607162235487, + "grad_norm": 4.813384532928467, + "learning_rate": 8.802932712073456e-06, + "loss": 0.3466, + "step": 9486 + }, + { + "epoch": 0.12868963646228976, + "grad_norm": 11.601316452026367, + "learning_rate": 8.8027956694532e-06, + "loss": 0.4801, + "step": 9487 + }, + { + "epoch": 0.12870320130222462, + "grad_norm": 5.447522163391113, + "learning_rate": 8.802658626832946e-06, + "loss": 0.2911, + "step": 9488 + }, + { + "epoch": 0.12871676614215952, + "grad_norm": 8.586042404174805, + "learning_rate": 8.802521584212691e-06, + "loss": 0.3515, + "step": 9489 + }, + { + "epoch": 0.1287303309820944, + "grad_norm": 7.360052585601807, + "learning_rate": 8.802384541592436e-06, + "loss": 0.397, + "step": 9490 + }, + { + "epoch": 0.1287438958220293, + "grad_norm": 5.286809921264648, + "learning_rate": 8.80224749897218e-06, + "loss": 0.2079, + "step": 9491 + }, + { + "epoch": 0.1287574606619642, + "grad_norm": 6.110873699188232, + "learning_rate": 8.802110456351927e-06, + "loss": 0.4588, + "step": 9492 + }, + { + "epoch": 0.1287710255018991, + "grad_norm": 5.6765007972717285, + "learning_rate": 8.801973413731672e-06, + "loss": 0.287, + "step": 9493 + }, + { + "epoch": 0.12878459034183395, + "grad_norm": 7.5330891609191895, + "learning_rate": 8.801836371111417e-06, + "loss": 0.4577, + "step": 9494 + }, + { + "epoch": 0.12879815518176885, + "grad_norm": 4.70217752456665, + "learning_rate": 8.80169932849116e-06, + "loss": 0.303, + "step": 9495 + }, + { + "epoch": 0.12881172002170374, + "grad_norm": 4.743304252624512, + "learning_rate": 8.801562285870907e-06, + "loss": 0.2838, + "step": 9496 + }, + { + "epoch": 0.12882528486163863, + "grad_norm": 5.863199234008789, + "learning_rate": 8.801425243250653e-06, + "loss": 0.2777, + "step": 9497 + }, + { + "epoch": 0.12883884970157353, + "grad_norm": 5.872696876525879, + "learning_rate": 8.801288200630396e-06, + "loss": 0.3071, + "step": 9498 + }, + { + "epoch": 0.12885241454150842, + "grad_norm": 4.9680047035217285, + "learning_rate": 8.801151158010141e-06, + "loss": 0.3019, + "step": 9499 + }, + { + "epoch": 0.12886597938144329, + "grad_norm": 6.480469703674316, + "learning_rate": 8.801014115389886e-06, + "loss": 0.4678, + "step": 9500 + }, + { + "epoch": 0.12887954422137818, + "grad_norm": 7.21030330657959, + "learning_rate": 8.800877072769632e-06, + "loss": 0.486, + "step": 9501 + }, + { + "epoch": 0.12889310906131307, + "grad_norm": 5.586202621459961, + "learning_rate": 8.800740030149377e-06, + "loss": 0.4048, + "step": 9502 + }, + { + "epoch": 0.12890667390124796, + "grad_norm": 4.934842109680176, + "learning_rate": 8.800602987529122e-06, + "loss": 0.299, + "step": 9503 + }, + { + "epoch": 0.12892023874118286, + "grad_norm": 4.585310459136963, + "learning_rate": 8.800465944908867e-06, + "loss": 0.3386, + "step": 9504 + }, + { + "epoch": 0.12893380358111775, + "grad_norm": 5.990291595458984, + "learning_rate": 8.800328902288612e-06, + "loss": 0.4244, + "step": 9505 + }, + { + "epoch": 0.12894736842105264, + "grad_norm": 6.370843887329102, + "learning_rate": 8.800191859668358e-06, + "loss": 0.337, + "step": 9506 + }, + { + "epoch": 0.1289609332609875, + "grad_norm": 6.9363789558410645, + "learning_rate": 8.800054817048103e-06, + "loss": 0.3059, + "step": 9507 + }, + { + "epoch": 0.1289744981009224, + "grad_norm": 8.460136413574219, + "learning_rate": 8.799917774427848e-06, + "loss": 0.6245, + "step": 9508 + }, + { + "epoch": 0.1289880629408573, + "grad_norm": 6.098921775817871, + "learning_rate": 8.799780731807593e-06, + "loss": 0.4589, + "step": 9509 + }, + { + "epoch": 0.1290016277807922, + "grad_norm": 7.526154041290283, + "learning_rate": 8.799643689187338e-06, + "loss": 0.3528, + "step": 9510 + }, + { + "epoch": 0.12901519262072708, + "grad_norm": 5.546901226043701, + "learning_rate": 8.799506646567083e-06, + "loss": 0.3091, + "step": 9511 + }, + { + "epoch": 0.12902875746066197, + "grad_norm": 6.416955947875977, + "learning_rate": 8.799369603946829e-06, + "loss": 0.3342, + "step": 9512 + }, + { + "epoch": 0.12904232230059684, + "grad_norm": 5.928442478179932, + "learning_rate": 8.799232561326572e-06, + "loss": 0.4251, + "step": 9513 + }, + { + "epoch": 0.12905588714053173, + "grad_norm": 3.767824172973633, + "learning_rate": 8.799095518706319e-06, + "loss": 0.1768, + "step": 9514 + }, + { + "epoch": 0.12906945198046663, + "grad_norm": 6.076384544372559, + "learning_rate": 8.798958476086064e-06, + "loss": 0.3355, + "step": 9515 + }, + { + "epoch": 0.12908301682040152, + "grad_norm": 5.877745628356934, + "learning_rate": 8.798821433465808e-06, + "loss": 0.4021, + "step": 9516 + }, + { + "epoch": 0.1290965816603364, + "grad_norm": 4.683191776275635, + "learning_rate": 8.798684390845553e-06, + "loss": 0.2577, + "step": 9517 + }, + { + "epoch": 0.1291101465002713, + "grad_norm": 6.111527919769287, + "learning_rate": 8.7985473482253e-06, + "loss": 0.4088, + "step": 9518 + }, + { + "epoch": 0.1291237113402062, + "grad_norm": 4.645392417907715, + "learning_rate": 8.798410305605045e-06, + "loss": 0.2448, + "step": 9519 + }, + { + "epoch": 0.12913727618014106, + "grad_norm": 4.915872573852539, + "learning_rate": 8.798273262984788e-06, + "loss": 0.2325, + "step": 9520 + }, + { + "epoch": 0.12915084102007596, + "grad_norm": 6.788381576538086, + "learning_rate": 8.798136220364534e-06, + "loss": 0.3331, + "step": 9521 + }, + { + "epoch": 0.12916440586001085, + "grad_norm": 6.214980125427246, + "learning_rate": 8.79799917774428e-06, + "loss": 0.3904, + "step": 9522 + }, + { + "epoch": 0.12917797069994574, + "grad_norm": 6.4141411781311035, + "learning_rate": 8.797862135124024e-06, + "loss": 0.4618, + "step": 9523 + }, + { + "epoch": 0.12919153553988064, + "grad_norm": 7.383152961730957, + "learning_rate": 8.797725092503769e-06, + "loss": 0.4503, + "step": 9524 + }, + { + "epoch": 0.12920510037981553, + "grad_norm": 6.205418109893799, + "learning_rate": 8.797588049883514e-06, + "loss": 0.3123, + "step": 9525 + }, + { + "epoch": 0.1292186652197504, + "grad_norm": 5.742172718048096, + "learning_rate": 8.79745100726326e-06, + "loss": 0.3883, + "step": 9526 + }, + { + "epoch": 0.1292322300596853, + "grad_norm": 6.620988368988037, + "learning_rate": 8.797313964643005e-06, + "loss": 0.3802, + "step": 9527 + }, + { + "epoch": 0.12924579489962018, + "grad_norm": 7.095709800720215, + "learning_rate": 8.79717692202275e-06, + "loss": 0.4385, + "step": 9528 + }, + { + "epoch": 0.12925935973955507, + "grad_norm": 5.085083484649658, + "learning_rate": 8.797039879402495e-06, + "loss": 0.2343, + "step": 9529 + }, + { + "epoch": 0.12927292457948997, + "grad_norm": 4.595135688781738, + "learning_rate": 8.79690283678224e-06, + "loss": 0.2417, + "step": 9530 + }, + { + "epoch": 0.12928648941942486, + "grad_norm": 6.968440532684326, + "learning_rate": 8.796765794161985e-06, + "loss": 0.4298, + "step": 9531 + }, + { + "epoch": 0.12930005425935973, + "grad_norm": 4.494041442871094, + "learning_rate": 8.79662875154173e-06, + "loss": 0.3397, + "step": 9532 + }, + { + "epoch": 0.12931361909929462, + "grad_norm": 5.341174125671387, + "learning_rate": 8.796491708921476e-06, + "loss": 0.2533, + "step": 9533 + }, + { + "epoch": 0.1293271839392295, + "grad_norm": 6.333257675170898, + "learning_rate": 8.796354666301221e-06, + "loss": 0.3733, + "step": 9534 + }, + { + "epoch": 0.1293407487791644, + "grad_norm": 6.044010162353516, + "learning_rate": 8.796217623680966e-06, + "loss": 0.4702, + "step": 9535 + }, + { + "epoch": 0.1293543136190993, + "grad_norm": 6.5101823806762695, + "learning_rate": 8.796080581060711e-06, + "loss": 0.2792, + "step": 9536 + }, + { + "epoch": 0.1293678784590342, + "grad_norm": 4.922168731689453, + "learning_rate": 8.795943538440456e-06, + "loss": 0.2427, + "step": 9537 + }, + { + "epoch": 0.12938144329896908, + "grad_norm": 5.204797267913818, + "learning_rate": 8.7958064958202e-06, + "loss": 0.3679, + "step": 9538 + }, + { + "epoch": 0.12939500813890395, + "grad_norm": 4.833796501159668, + "learning_rate": 8.795669453199945e-06, + "loss": 0.2729, + "step": 9539 + }, + { + "epoch": 0.12940857297883884, + "grad_norm": 4.757225036621094, + "learning_rate": 8.795532410579692e-06, + "loss": 0.2484, + "step": 9540 + }, + { + "epoch": 0.12942213781877374, + "grad_norm": 5.194931507110596, + "learning_rate": 8.795395367959435e-06, + "loss": 0.3769, + "step": 9541 + }, + { + "epoch": 0.12943570265870863, + "grad_norm": 5.182471752166748, + "learning_rate": 8.79525832533918e-06, + "loss": 0.3006, + "step": 9542 + }, + { + "epoch": 0.12944926749864352, + "grad_norm": 4.703525543212891, + "learning_rate": 8.795121282718926e-06, + "loss": 0.2157, + "step": 9543 + }, + { + "epoch": 0.12946283233857842, + "grad_norm": 5.471092224121094, + "learning_rate": 8.794984240098673e-06, + "loss": 0.2924, + "step": 9544 + }, + { + "epoch": 0.12947639717851328, + "grad_norm": 5.061090469360352, + "learning_rate": 8.794847197478416e-06, + "loss": 0.3789, + "step": 9545 + }, + { + "epoch": 0.12948996201844817, + "grad_norm": 5.5330400466918945, + "learning_rate": 8.794710154858161e-06, + "loss": 0.2866, + "step": 9546 + }, + { + "epoch": 0.12950352685838307, + "grad_norm": 4.9239726066589355, + "learning_rate": 8.794573112237906e-06, + "loss": 0.2962, + "step": 9547 + }, + { + "epoch": 0.12951709169831796, + "grad_norm": 5.542669296264648, + "learning_rate": 8.794436069617652e-06, + "loss": 0.282, + "step": 9548 + }, + { + "epoch": 0.12953065653825285, + "grad_norm": 3.7232141494750977, + "learning_rate": 8.794299026997397e-06, + "loss": 0.1831, + "step": 9549 + }, + { + "epoch": 0.12954422137818775, + "grad_norm": 5.061459064483643, + "learning_rate": 8.794161984377142e-06, + "loss": 0.3151, + "step": 9550 + }, + { + "epoch": 0.12955778621812264, + "grad_norm": 6.037795066833496, + "learning_rate": 8.794024941756887e-06, + "loss": 0.2982, + "step": 9551 + }, + { + "epoch": 0.1295713510580575, + "grad_norm": 7.554645538330078, + "learning_rate": 8.793887899136632e-06, + "loss": 0.3897, + "step": 9552 + }, + { + "epoch": 0.1295849158979924, + "grad_norm": 6.24260139465332, + "learning_rate": 8.793750856516378e-06, + "loss": 0.3, + "step": 9553 + }, + { + "epoch": 0.1295984807379273, + "grad_norm": 5.910465717315674, + "learning_rate": 8.793613813896123e-06, + "loss": 0.2645, + "step": 9554 + }, + { + "epoch": 0.12961204557786218, + "grad_norm": 5.135303497314453, + "learning_rate": 8.793476771275868e-06, + "loss": 0.3687, + "step": 9555 + }, + { + "epoch": 0.12962561041779708, + "grad_norm": 5.247692584991455, + "learning_rate": 8.793339728655611e-06, + "loss": 0.3489, + "step": 9556 + }, + { + "epoch": 0.12963917525773197, + "grad_norm": 7.236728191375732, + "learning_rate": 8.793202686035358e-06, + "loss": 0.3388, + "step": 9557 + }, + { + "epoch": 0.12965274009766684, + "grad_norm": 5.825442314147949, + "learning_rate": 8.793065643415103e-06, + "loss": 0.3408, + "step": 9558 + }, + { + "epoch": 0.12966630493760173, + "grad_norm": 6.800344944000244, + "learning_rate": 8.792928600794847e-06, + "loss": 0.383, + "step": 9559 + }, + { + "epoch": 0.12967986977753662, + "grad_norm": 5.4862189292907715, + "learning_rate": 8.792791558174592e-06, + "loss": 0.3106, + "step": 9560 + }, + { + "epoch": 0.12969343461747151, + "grad_norm": 5.321022033691406, + "learning_rate": 8.792654515554339e-06, + "loss": 0.225, + "step": 9561 + }, + { + "epoch": 0.1297069994574064, + "grad_norm": 5.336562633514404, + "learning_rate": 8.792517472934084e-06, + "loss": 0.2651, + "step": 9562 + }, + { + "epoch": 0.1297205642973413, + "grad_norm": 5.537688255310059, + "learning_rate": 8.792380430313828e-06, + "loss": 0.22, + "step": 9563 + }, + { + "epoch": 0.1297341291372762, + "grad_norm": 6.948128700256348, + "learning_rate": 8.792243387693573e-06, + "loss": 0.3622, + "step": 9564 + }, + { + "epoch": 0.12974769397721106, + "grad_norm": 5.916669845581055, + "learning_rate": 8.79210634507332e-06, + "loss": 0.2856, + "step": 9565 + }, + { + "epoch": 0.12976125881714595, + "grad_norm": 4.824949264526367, + "learning_rate": 8.791969302453063e-06, + "loss": 0.2817, + "step": 9566 + }, + { + "epoch": 0.12977482365708085, + "grad_norm": 8.036179542541504, + "learning_rate": 8.791832259832808e-06, + "loss": 0.4, + "step": 9567 + }, + { + "epoch": 0.12978838849701574, + "grad_norm": 5.96492862701416, + "learning_rate": 8.791695217212554e-06, + "loss": 0.3006, + "step": 9568 + }, + { + "epoch": 0.12980195333695063, + "grad_norm": 6.101284027099609, + "learning_rate": 8.791558174592299e-06, + "loss": 0.4402, + "step": 9569 + }, + { + "epoch": 0.12981551817688552, + "grad_norm": 8.388908386230469, + "learning_rate": 8.791421131972044e-06, + "loss": 0.4698, + "step": 9570 + }, + { + "epoch": 0.1298290830168204, + "grad_norm": 7.136528491973877, + "learning_rate": 8.791284089351789e-06, + "loss": 0.3696, + "step": 9571 + }, + { + "epoch": 0.12984264785675528, + "grad_norm": 6.620447158813477, + "learning_rate": 8.791147046731534e-06, + "loss": 0.3872, + "step": 9572 + }, + { + "epoch": 0.12985621269669018, + "grad_norm": 6.852954387664795, + "learning_rate": 8.79101000411128e-06, + "loss": 0.4868, + "step": 9573 + }, + { + "epoch": 0.12986977753662507, + "grad_norm": 7.005537033081055, + "learning_rate": 8.790872961491025e-06, + "loss": 0.4706, + "step": 9574 + }, + { + "epoch": 0.12988334237655996, + "grad_norm": 5.64221715927124, + "learning_rate": 8.79073591887077e-06, + "loss": 0.2738, + "step": 9575 + }, + { + "epoch": 0.12989690721649486, + "grad_norm": 6.488274574279785, + "learning_rate": 8.790598876250515e-06, + "loss": 0.3656, + "step": 9576 + }, + { + "epoch": 0.12991047205642972, + "grad_norm": 7.032741069793701, + "learning_rate": 8.79046183363026e-06, + "loss": 0.4448, + "step": 9577 + }, + { + "epoch": 0.12992403689636461, + "grad_norm": 7.538451671600342, + "learning_rate": 8.790324791010005e-06, + "loss": 0.3659, + "step": 9578 + }, + { + "epoch": 0.1299376017362995, + "grad_norm": 7.783622741699219, + "learning_rate": 8.79018774838975e-06, + "loss": 0.3338, + "step": 9579 + }, + { + "epoch": 0.1299511665762344, + "grad_norm": 8.233739852905273, + "learning_rate": 8.790050705769496e-06, + "loss": 0.5907, + "step": 9580 + }, + { + "epoch": 0.1299647314161693, + "grad_norm": 6.422529220581055, + "learning_rate": 8.78991366314924e-06, + "loss": 0.4876, + "step": 9581 + }, + { + "epoch": 0.1299782962561042, + "grad_norm": 6.2548909187316895, + "learning_rate": 8.789776620528984e-06, + "loss": 0.3908, + "step": 9582 + }, + { + "epoch": 0.12999186109603908, + "grad_norm": 5.367825508117676, + "learning_rate": 8.789639577908731e-06, + "loss": 0.3388, + "step": 9583 + }, + { + "epoch": 0.13000542593597395, + "grad_norm": 4.427102088928223, + "learning_rate": 8.789502535288475e-06, + "loss": 0.3239, + "step": 9584 + }, + { + "epoch": 0.13001899077590884, + "grad_norm": 7.897709846496582, + "learning_rate": 8.78936549266822e-06, + "loss": 0.4212, + "step": 9585 + }, + { + "epoch": 0.13003255561584373, + "grad_norm": 6.521414279937744, + "learning_rate": 8.789228450047965e-06, + "loss": 0.3275, + "step": 9586 + }, + { + "epoch": 0.13004612045577862, + "grad_norm": 7.795353412628174, + "learning_rate": 8.789091407427712e-06, + "loss": 0.3265, + "step": 9587 + }, + { + "epoch": 0.13005968529571352, + "grad_norm": 7.1254754066467285, + "learning_rate": 8.788954364807455e-06, + "loss": 0.5127, + "step": 9588 + }, + { + "epoch": 0.1300732501356484, + "grad_norm": 8.300755500793457, + "learning_rate": 8.7888173221872e-06, + "loss": 0.4792, + "step": 9589 + }, + { + "epoch": 0.13008681497558328, + "grad_norm": 5.494011878967285, + "learning_rate": 8.788680279566946e-06, + "loss": 0.3027, + "step": 9590 + }, + { + "epoch": 0.13010037981551817, + "grad_norm": 7.771852970123291, + "learning_rate": 8.788543236946691e-06, + "loss": 0.4132, + "step": 9591 + }, + { + "epoch": 0.13011394465545306, + "grad_norm": 9.516011238098145, + "learning_rate": 8.788406194326436e-06, + "loss": 0.5646, + "step": 9592 + }, + { + "epoch": 0.13012750949538796, + "grad_norm": 6.277604579925537, + "learning_rate": 8.788269151706181e-06, + "loss": 0.3774, + "step": 9593 + }, + { + "epoch": 0.13014107433532285, + "grad_norm": 5.663750171661377, + "learning_rate": 8.788132109085927e-06, + "loss": 0.2332, + "step": 9594 + }, + { + "epoch": 0.13015463917525774, + "grad_norm": 6.713316440582275, + "learning_rate": 8.787995066465672e-06, + "loss": 0.3527, + "step": 9595 + }, + { + "epoch": 0.13016820401519263, + "grad_norm": 5.603292942047119, + "learning_rate": 8.787858023845417e-06, + "loss": 0.2865, + "step": 9596 + }, + { + "epoch": 0.1301817688551275, + "grad_norm": 6.004098892211914, + "learning_rate": 8.787720981225162e-06, + "loss": 0.347, + "step": 9597 + }, + { + "epoch": 0.1301953336950624, + "grad_norm": 7.04608154296875, + "learning_rate": 8.787583938604907e-06, + "loss": 0.3079, + "step": 9598 + }, + { + "epoch": 0.13020889853499729, + "grad_norm": 6.226680755615234, + "learning_rate": 8.78744689598465e-06, + "loss": 0.257, + "step": 9599 + }, + { + "epoch": 0.13022246337493218, + "grad_norm": 7.958026885986328, + "learning_rate": 8.787309853364398e-06, + "loss": 0.4974, + "step": 9600 + }, + { + "epoch": 0.13023602821486707, + "grad_norm": 7.479384899139404, + "learning_rate": 8.787172810744143e-06, + "loss": 0.324, + "step": 9601 + }, + { + "epoch": 0.13024959305480197, + "grad_norm": 5.824954032897949, + "learning_rate": 8.787035768123888e-06, + "loss": 0.3102, + "step": 9602 + }, + { + "epoch": 0.13026315789473683, + "grad_norm": 7.1194610595703125, + "learning_rate": 8.786898725503631e-06, + "loss": 0.3263, + "step": 9603 + }, + { + "epoch": 0.13027672273467172, + "grad_norm": 7.651744842529297, + "learning_rate": 8.786761682883378e-06, + "loss": 0.381, + "step": 9604 + }, + { + "epoch": 0.13029028757460662, + "grad_norm": 6.147503852844238, + "learning_rate": 8.786624640263123e-06, + "loss": 0.3922, + "step": 9605 + }, + { + "epoch": 0.1303038524145415, + "grad_norm": 5.893484115600586, + "learning_rate": 8.786487597642867e-06, + "loss": 0.3292, + "step": 9606 + }, + { + "epoch": 0.1303174172544764, + "grad_norm": 5.405019760131836, + "learning_rate": 8.786350555022612e-06, + "loss": 0.255, + "step": 9607 + }, + { + "epoch": 0.1303309820944113, + "grad_norm": 7.036563873291016, + "learning_rate": 8.786213512402357e-06, + "loss": 0.453, + "step": 9608 + }, + { + "epoch": 0.13034454693434616, + "grad_norm": 7.096198081970215, + "learning_rate": 8.786076469782102e-06, + "loss": 0.3004, + "step": 9609 + }, + { + "epoch": 0.13035811177428105, + "grad_norm": 6.251145839691162, + "learning_rate": 8.785939427161848e-06, + "loss": 0.367, + "step": 9610 + }, + { + "epoch": 0.13037167661421595, + "grad_norm": 7.646984577178955, + "learning_rate": 8.785802384541593e-06, + "loss": 0.4441, + "step": 9611 + }, + { + "epoch": 0.13038524145415084, + "grad_norm": 7.147789478302002, + "learning_rate": 8.785665341921338e-06, + "loss": 0.2828, + "step": 9612 + }, + { + "epoch": 0.13039880629408573, + "grad_norm": 6.016058921813965, + "learning_rate": 8.785528299301083e-06, + "loss": 0.4333, + "step": 9613 + }, + { + "epoch": 0.13041237113402063, + "grad_norm": 5.886025428771973, + "learning_rate": 8.785391256680828e-06, + "loss": 0.3097, + "step": 9614 + }, + { + "epoch": 0.13042593597395552, + "grad_norm": 7.944471836090088, + "learning_rate": 8.785254214060574e-06, + "loss": 0.3752, + "step": 9615 + }, + { + "epoch": 0.13043950081389039, + "grad_norm": 7.985036373138428, + "learning_rate": 8.785117171440319e-06, + "loss": 0.41, + "step": 9616 + }, + { + "epoch": 0.13045306565382528, + "grad_norm": 7.4168500900268555, + "learning_rate": 8.784980128820064e-06, + "loss": 0.3827, + "step": 9617 + }, + { + "epoch": 0.13046663049376017, + "grad_norm": 10.35243034362793, + "learning_rate": 8.784843086199809e-06, + "loss": 0.3774, + "step": 9618 + }, + { + "epoch": 0.13048019533369506, + "grad_norm": 7.720123291015625, + "learning_rate": 8.784706043579554e-06, + "loss": 0.4937, + "step": 9619 + }, + { + "epoch": 0.13049376017362996, + "grad_norm": 10.423319816589355, + "learning_rate": 8.7845690009593e-06, + "loss": 0.7364, + "step": 9620 + }, + { + "epoch": 0.13050732501356485, + "grad_norm": 7.69235372543335, + "learning_rate": 8.784431958339043e-06, + "loss": 0.4802, + "step": 9621 + }, + { + "epoch": 0.13052088985349972, + "grad_norm": 6.9326043128967285, + "learning_rate": 8.78429491571879e-06, + "loss": 0.4192, + "step": 9622 + }, + { + "epoch": 0.1305344546934346, + "grad_norm": 6.447765350341797, + "learning_rate": 8.784157873098535e-06, + "loss": 0.3462, + "step": 9623 + }, + { + "epoch": 0.1305480195333695, + "grad_norm": 7.245455265045166, + "learning_rate": 8.784020830478278e-06, + "loss": 0.3986, + "step": 9624 + }, + { + "epoch": 0.1305615843733044, + "grad_norm": 6.649042129516602, + "learning_rate": 8.783883787858024e-06, + "loss": 0.3733, + "step": 9625 + }, + { + "epoch": 0.1305751492132393, + "grad_norm": 7.583316802978516, + "learning_rate": 8.78374674523777e-06, + "loss": 0.4203, + "step": 9626 + }, + { + "epoch": 0.13058871405317418, + "grad_norm": 4.236422538757324, + "learning_rate": 8.783609702617516e-06, + "loss": 0.177, + "step": 9627 + }, + { + "epoch": 0.13060227889310908, + "grad_norm": 7.4552001953125, + "learning_rate": 8.78347265999726e-06, + "loss": 0.5245, + "step": 9628 + }, + { + "epoch": 0.13061584373304394, + "grad_norm": 6.639019012451172, + "learning_rate": 8.783335617377004e-06, + "loss": 0.3401, + "step": 9629 + }, + { + "epoch": 0.13062940857297883, + "grad_norm": 8.150103569030762, + "learning_rate": 8.783198574756751e-06, + "loss": 0.4715, + "step": 9630 + }, + { + "epoch": 0.13064297341291373, + "grad_norm": 6.534870147705078, + "learning_rate": 8.783061532136495e-06, + "loss": 0.4188, + "step": 9631 + }, + { + "epoch": 0.13065653825284862, + "grad_norm": 5.558417320251465, + "learning_rate": 8.78292448951624e-06, + "loss": 0.3476, + "step": 9632 + }, + { + "epoch": 0.1306701030927835, + "grad_norm": 7.397049427032471, + "learning_rate": 8.782787446895985e-06, + "loss": 0.4017, + "step": 9633 + }, + { + "epoch": 0.1306836679327184, + "grad_norm": 7.107193946838379, + "learning_rate": 8.78265040427573e-06, + "loss": 0.3526, + "step": 9634 + }, + { + "epoch": 0.13069723277265327, + "grad_norm": 6.176170825958252, + "learning_rate": 8.782513361655475e-06, + "loss": 0.3077, + "step": 9635 + }, + { + "epoch": 0.13071079761258816, + "grad_norm": 5.639369010925293, + "learning_rate": 8.78237631903522e-06, + "loss": 0.3037, + "step": 9636 + }, + { + "epoch": 0.13072436245252306, + "grad_norm": 3.9012842178344727, + "learning_rate": 8.782239276414966e-06, + "loss": 0.131, + "step": 9637 + }, + { + "epoch": 0.13073792729245795, + "grad_norm": 6.090043067932129, + "learning_rate": 8.782102233794711e-06, + "loss": 0.3813, + "step": 9638 + }, + { + "epoch": 0.13075149213239284, + "grad_norm": 5.409567356109619, + "learning_rate": 8.781965191174456e-06, + "loss": 0.3886, + "step": 9639 + }, + { + "epoch": 0.13076505697232774, + "grad_norm": 7.6708221435546875, + "learning_rate": 8.781828148554201e-06, + "loss": 0.4539, + "step": 9640 + }, + { + "epoch": 0.1307786218122626, + "grad_norm": 7.811886310577393, + "learning_rate": 8.781691105933947e-06, + "loss": 0.6041, + "step": 9641 + }, + { + "epoch": 0.1307921866521975, + "grad_norm": 6.351420879364014, + "learning_rate": 8.781554063313692e-06, + "loss": 0.4657, + "step": 9642 + }, + { + "epoch": 0.1308057514921324, + "grad_norm": 7.840840816497803, + "learning_rate": 8.781417020693437e-06, + "loss": 0.4199, + "step": 9643 + }, + { + "epoch": 0.13081931633206728, + "grad_norm": 5.6833014488220215, + "learning_rate": 8.781279978073182e-06, + "loss": 0.3469, + "step": 9644 + }, + { + "epoch": 0.13083288117200217, + "grad_norm": 8.236916542053223, + "learning_rate": 8.781142935452927e-06, + "loss": 0.6382, + "step": 9645 + }, + { + "epoch": 0.13084644601193707, + "grad_norm": 8.33704662322998, + "learning_rate": 8.78100589283267e-06, + "loss": 0.4473, + "step": 9646 + }, + { + "epoch": 0.13086001085187196, + "grad_norm": 7.401642799377441, + "learning_rate": 8.780868850212418e-06, + "loss": 0.4179, + "step": 9647 + }, + { + "epoch": 0.13087357569180683, + "grad_norm": 6.336109161376953, + "learning_rate": 8.780731807592163e-06, + "loss": 0.4786, + "step": 9648 + }, + { + "epoch": 0.13088714053174172, + "grad_norm": 8.4766845703125, + "learning_rate": 8.780594764971906e-06, + "loss": 0.392, + "step": 9649 + }, + { + "epoch": 0.1309007053716766, + "grad_norm": 6.784364223480225, + "learning_rate": 8.780457722351651e-06, + "loss": 0.3265, + "step": 9650 + }, + { + "epoch": 0.1309142702116115, + "grad_norm": 8.490394592285156, + "learning_rate": 8.780320679731397e-06, + "loss": 0.3364, + "step": 9651 + }, + { + "epoch": 0.1309278350515464, + "grad_norm": 7.246098041534424, + "learning_rate": 8.780183637111142e-06, + "loss": 0.2713, + "step": 9652 + }, + { + "epoch": 0.1309413998914813, + "grad_norm": 11.396208763122559, + "learning_rate": 8.780046594490887e-06, + "loss": 0.617, + "step": 9653 + }, + { + "epoch": 0.13095496473141616, + "grad_norm": 5.806854248046875, + "learning_rate": 8.779909551870632e-06, + "loss": 0.4156, + "step": 9654 + }, + { + "epoch": 0.13096852957135105, + "grad_norm": 8.057798385620117, + "learning_rate": 8.779772509250377e-06, + "loss": 0.396, + "step": 9655 + }, + { + "epoch": 0.13098209441128594, + "grad_norm": 5.342539310455322, + "learning_rate": 8.779635466630123e-06, + "loss": 0.2779, + "step": 9656 + }, + { + "epoch": 0.13099565925122084, + "grad_norm": 6.520411014556885, + "learning_rate": 8.779498424009868e-06, + "loss": 0.4153, + "step": 9657 + }, + { + "epoch": 0.13100922409115573, + "grad_norm": 6.177801609039307, + "learning_rate": 8.779361381389613e-06, + "loss": 0.2829, + "step": 9658 + }, + { + "epoch": 0.13102278893109062, + "grad_norm": 6.8115234375, + "learning_rate": 8.779224338769358e-06, + "loss": 0.4083, + "step": 9659 + }, + { + "epoch": 0.13103635377102552, + "grad_norm": 8.90988826751709, + "learning_rate": 8.779087296149103e-06, + "loss": 0.4123, + "step": 9660 + }, + { + "epoch": 0.13104991861096038, + "grad_norm": 7.267480373382568, + "learning_rate": 8.778950253528848e-06, + "loss": 0.4549, + "step": 9661 + }, + { + "epoch": 0.13106348345089527, + "grad_norm": 8.785486221313477, + "learning_rate": 8.778813210908594e-06, + "loss": 0.5417, + "step": 9662 + }, + { + "epoch": 0.13107704829083017, + "grad_norm": 6.517858982086182, + "learning_rate": 8.778676168288339e-06, + "loss": 0.3973, + "step": 9663 + }, + { + "epoch": 0.13109061313076506, + "grad_norm": 7.440919399261475, + "learning_rate": 8.778539125668082e-06, + "loss": 0.5075, + "step": 9664 + }, + { + "epoch": 0.13110417797069995, + "grad_norm": 6.440766334533691, + "learning_rate": 8.778402083047829e-06, + "loss": 0.5316, + "step": 9665 + }, + { + "epoch": 0.13111774281063485, + "grad_norm": 8.14819049835205, + "learning_rate": 8.778265040427574e-06, + "loss": 0.4677, + "step": 9666 + }, + { + "epoch": 0.1311313076505697, + "grad_norm": 6.180514335632324, + "learning_rate": 8.778127997807318e-06, + "loss": 0.3546, + "step": 9667 + }, + { + "epoch": 0.1311448724905046, + "grad_norm": 8.100735664367676, + "learning_rate": 8.777990955187063e-06, + "loss": 0.4469, + "step": 9668 + }, + { + "epoch": 0.1311584373304395, + "grad_norm": 4.8915934562683105, + "learning_rate": 8.77785391256681e-06, + "loss": 0.2892, + "step": 9669 + }, + { + "epoch": 0.1311720021703744, + "grad_norm": 8.566217422485352, + "learning_rate": 8.777716869946555e-06, + "loss": 0.5056, + "step": 9670 + }, + { + "epoch": 0.13118556701030928, + "grad_norm": 6.142951488494873, + "learning_rate": 8.777579827326299e-06, + "loss": 0.3126, + "step": 9671 + }, + { + "epoch": 0.13119913185024418, + "grad_norm": 5.137514114379883, + "learning_rate": 8.777442784706044e-06, + "loss": 0.2858, + "step": 9672 + }, + { + "epoch": 0.13121269669017904, + "grad_norm": 4.639937400817871, + "learning_rate": 8.77730574208579e-06, + "loss": 0.1999, + "step": 9673 + }, + { + "epoch": 0.13122626153011394, + "grad_norm": 6.4136061668396, + "learning_rate": 8.777168699465534e-06, + "loss": 0.4668, + "step": 9674 + }, + { + "epoch": 0.13123982637004883, + "grad_norm": 6.082613945007324, + "learning_rate": 8.77703165684528e-06, + "loss": 0.3592, + "step": 9675 + }, + { + "epoch": 0.13125339120998372, + "grad_norm": 6.655172824859619, + "learning_rate": 8.776894614225024e-06, + "loss": 0.3571, + "step": 9676 + }, + { + "epoch": 0.13126695604991861, + "grad_norm": 6.183568954467773, + "learning_rate": 8.77675757160477e-06, + "loss": 0.4466, + "step": 9677 + }, + { + "epoch": 0.1312805208898535, + "grad_norm": 4.834166049957275, + "learning_rate": 8.776620528984515e-06, + "loss": 0.3393, + "step": 9678 + }, + { + "epoch": 0.1312940857297884, + "grad_norm": 5.3336029052734375, + "learning_rate": 8.77648348636426e-06, + "loss": 0.3325, + "step": 9679 + }, + { + "epoch": 0.13130765056972327, + "grad_norm": 5.70692777633667, + "learning_rate": 8.776346443744005e-06, + "loss": 0.427, + "step": 9680 + }, + { + "epoch": 0.13132121540965816, + "grad_norm": 6.393347263336182, + "learning_rate": 8.77620940112375e-06, + "loss": 0.3181, + "step": 9681 + }, + { + "epoch": 0.13133478024959305, + "grad_norm": 7.267829895019531, + "learning_rate": 8.776072358503495e-06, + "loss": 0.3476, + "step": 9682 + }, + { + "epoch": 0.13134834508952795, + "grad_norm": 6.616368293762207, + "learning_rate": 8.77593531588324e-06, + "loss": 0.4192, + "step": 9683 + }, + { + "epoch": 0.13136190992946284, + "grad_norm": 5.3464837074279785, + "learning_rate": 8.775798273262986e-06, + "loss": 0.2336, + "step": 9684 + }, + { + "epoch": 0.13137547476939773, + "grad_norm": 6.800158500671387, + "learning_rate": 8.775661230642731e-06, + "loss": 0.3462, + "step": 9685 + }, + { + "epoch": 0.1313890396093326, + "grad_norm": 6.09452486038208, + "learning_rate": 8.775524188022476e-06, + "loss": 0.329, + "step": 9686 + }, + { + "epoch": 0.1314026044492675, + "grad_norm": 6.571394443511963, + "learning_rate": 8.775387145402221e-06, + "loss": 0.347, + "step": 9687 + }, + { + "epoch": 0.13141616928920238, + "grad_norm": 6.553527355194092, + "learning_rate": 8.775250102781967e-06, + "loss": 0.3533, + "step": 9688 + }, + { + "epoch": 0.13142973412913728, + "grad_norm": 7.2809014320373535, + "learning_rate": 8.77511306016171e-06, + "loss": 0.4265, + "step": 9689 + }, + { + "epoch": 0.13144329896907217, + "grad_norm": 4.92959451675415, + "learning_rate": 8.774976017541455e-06, + "loss": 0.4404, + "step": 9690 + }, + { + "epoch": 0.13145686380900706, + "grad_norm": 5.670311450958252, + "learning_rate": 8.774838974921202e-06, + "loss": 0.3171, + "step": 9691 + }, + { + "epoch": 0.13147042864894196, + "grad_norm": 6.511138916015625, + "learning_rate": 8.774701932300946e-06, + "loss": 0.3698, + "step": 9692 + }, + { + "epoch": 0.13148399348887682, + "grad_norm": 5.730086803436279, + "learning_rate": 8.77456488968069e-06, + "loss": 0.3399, + "step": 9693 + }, + { + "epoch": 0.13149755832881171, + "grad_norm": 5.699723243713379, + "learning_rate": 8.774427847060436e-06, + "loss": 0.2137, + "step": 9694 + }, + { + "epoch": 0.1315111231687466, + "grad_norm": 5.846185684204102, + "learning_rate": 8.774290804440183e-06, + "loss": 0.3454, + "step": 9695 + }, + { + "epoch": 0.1315246880086815, + "grad_norm": 6.769159317016602, + "learning_rate": 8.774153761819926e-06, + "loss": 0.3721, + "step": 9696 + }, + { + "epoch": 0.1315382528486164, + "grad_norm": 4.8044304847717285, + "learning_rate": 8.774016719199671e-06, + "loss": 0.3189, + "step": 9697 + }, + { + "epoch": 0.1315518176885513, + "grad_norm": 6.30729341506958, + "learning_rate": 8.773879676579417e-06, + "loss": 0.2704, + "step": 9698 + }, + { + "epoch": 0.13156538252848615, + "grad_norm": 6.472882270812988, + "learning_rate": 8.773742633959162e-06, + "loss": 0.3934, + "step": 9699 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 5.158924102783203, + "learning_rate": 8.773605591338907e-06, + "loss": 0.3591, + "step": 9700 + }, + { + "epoch": 0.13159251220835594, + "grad_norm": 5.0598931312561035, + "learning_rate": 8.773468548718652e-06, + "loss": 0.3066, + "step": 9701 + }, + { + "epoch": 0.13160607704829083, + "grad_norm": 3.678011894226074, + "learning_rate": 8.773331506098397e-06, + "loss": 0.2551, + "step": 9702 + }, + { + "epoch": 0.13161964188822572, + "grad_norm": 4.550026893615723, + "learning_rate": 8.773194463478143e-06, + "loss": 0.2367, + "step": 9703 + }, + { + "epoch": 0.13163320672816062, + "grad_norm": 5.312737941741943, + "learning_rate": 8.773057420857888e-06, + "loss": 0.3399, + "step": 9704 + }, + { + "epoch": 0.13164677156809548, + "grad_norm": 5.575189590454102, + "learning_rate": 8.772920378237633e-06, + "loss": 0.2328, + "step": 9705 + }, + { + "epoch": 0.13166033640803038, + "grad_norm": 6.189388275146484, + "learning_rate": 8.772783335617378e-06, + "loss": 0.4658, + "step": 9706 + }, + { + "epoch": 0.13167390124796527, + "grad_norm": 6.397900104522705, + "learning_rate": 8.772646292997122e-06, + "loss": 0.419, + "step": 9707 + }, + { + "epoch": 0.13168746608790016, + "grad_norm": 5.905891418457031, + "learning_rate": 8.772509250376868e-06, + "loss": 0.3388, + "step": 9708 + }, + { + "epoch": 0.13170103092783506, + "grad_norm": 4.5950446128845215, + "learning_rate": 8.772372207756614e-06, + "loss": 0.3207, + "step": 9709 + }, + { + "epoch": 0.13171459576776995, + "grad_norm": 3.5634682178497314, + "learning_rate": 8.772235165136359e-06, + "loss": 0.158, + "step": 9710 + }, + { + "epoch": 0.13172816060770484, + "grad_norm": 4.549288272857666, + "learning_rate": 8.772098122516102e-06, + "loss": 0.2986, + "step": 9711 + }, + { + "epoch": 0.1317417254476397, + "grad_norm": 3.9912331104278564, + "learning_rate": 8.771961079895849e-06, + "loss": 0.211, + "step": 9712 + }, + { + "epoch": 0.1317552902875746, + "grad_norm": 4.671599864959717, + "learning_rate": 8.771824037275594e-06, + "loss": 0.2662, + "step": 9713 + }, + { + "epoch": 0.1317688551275095, + "grad_norm": 5.285481929779053, + "learning_rate": 8.771686994655338e-06, + "loss": 0.2744, + "step": 9714 + }, + { + "epoch": 0.1317824199674444, + "grad_norm": 3.682687759399414, + "learning_rate": 8.771549952035083e-06, + "loss": 0.2446, + "step": 9715 + }, + { + "epoch": 0.13179598480737928, + "grad_norm": 5.553106784820557, + "learning_rate": 8.77141290941483e-06, + "loss": 0.3391, + "step": 9716 + }, + { + "epoch": 0.13180954964731417, + "grad_norm": 4.5522074699401855, + "learning_rate": 8.771275866794573e-06, + "loss": 0.2946, + "step": 9717 + }, + { + "epoch": 0.13182311448724904, + "grad_norm": 6.209023475646973, + "learning_rate": 8.771138824174319e-06, + "loss": 0.4753, + "step": 9718 + }, + { + "epoch": 0.13183667932718393, + "grad_norm": 4.6423492431640625, + "learning_rate": 8.771001781554064e-06, + "loss": 0.2379, + "step": 9719 + }, + { + "epoch": 0.13185024416711882, + "grad_norm": 7.374971389770508, + "learning_rate": 8.770864738933809e-06, + "loss": 0.3276, + "step": 9720 + }, + { + "epoch": 0.13186380900705372, + "grad_norm": 5.894476413726807, + "learning_rate": 8.770727696313554e-06, + "loss": 0.331, + "step": 9721 + }, + { + "epoch": 0.1318773738469886, + "grad_norm": 6.370143413543701, + "learning_rate": 8.7705906536933e-06, + "loss": 0.3828, + "step": 9722 + }, + { + "epoch": 0.1318909386869235, + "grad_norm": 6.692907333374023, + "learning_rate": 8.770453611073044e-06, + "loss": 0.3234, + "step": 9723 + }, + { + "epoch": 0.1319045035268584, + "grad_norm": 4.6915059089660645, + "learning_rate": 8.77031656845279e-06, + "loss": 0.2949, + "step": 9724 + }, + { + "epoch": 0.13191806836679326, + "grad_norm": 6.30043363571167, + "learning_rate": 8.770179525832535e-06, + "loss": 0.3086, + "step": 9725 + }, + { + "epoch": 0.13193163320672815, + "grad_norm": 6.674134254455566, + "learning_rate": 8.77004248321228e-06, + "loss": 0.3691, + "step": 9726 + }, + { + "epoch": 0.13194519804666305, + "grad_norm": 5.291707992553711, + "learning_rate": 8.769905440592025e-06, + "loss": 0.2798, + "step": 9727 + }, + { + "epoch": 0.13195876288659794, + "grad_norm": 4.984164714813232, + "learning_rate": 8.76976839797177e-06, + "loss": 0.2782, + "step": 9728 + }, + { + "epoch": 0.13197232772653283, + "grad_norm": 6.019286155700684, + "learning_rate": 8.769631355351515e-06, + "loss": 0.3378, + "step": 9729 + }, + { + "epoch": 0.13198589256646773, + "grad_norm": 7.995388984680176, + "learning_rate": 8.76949431273126e-06, + "loss": 0.4166, + "step": 9730 + }, + { + "epoch": 0.1319994574064026, + "grad_norm": 5.723376274108887, + "learning_rate": 8.769357270111006e-06, + "loss": 0.4433, + "step": 9731 + }, + { + "epoch": 0.13201302224633749, + "grad_norm": 6.18047571182251, + "learning_rate": 8.76922022749075e-06, + "loss": 0.3147, + "step": 9732 + }, + { + "epoch": 0.13202658708627238, + "grad_norm": 8.230741500854492, + "learning_rate": 8.769083184870495e-06, + "loss": 0.4154, + "step": 9733 + }, + { + "epoch": 0.13204015192620727, + "grad_norm": 5.038899898529053, + "learning_rate": 8.768946142250241e-06, + "loss": 0.2575, + "step": 9734 + }, + { + "epoch": 0.13205371676614217, + "grad_norm": 8.311594009399414, + "learning_rate": 8.768809099629987e-06, + "loss": 0.4551, + "step": 9735 + }, + { + "epoch": 0.13206728160607706, + "grad_norm": 4.717966079711914, + "learning_rate": 8.76867205700973e-06, + "loss": 0.2988, + "step": 9736 + }, + { + "epoch": 0.13208084644601192, + "grad_norm": 4.970431804656982, + "learning_rate": 8.768535014389475e-06, + "loss": 0.301, + "step": 9737 + }, + { + "epoch": 0.13209441128594682, + "grad_norm": 3.3356330394744873, + "learning_rate": 8.768397971769222e-06, + "loss": 0.1928, + "step": 9738 + }, + { + "epoch": 0.1321079761258817, + "grad_norm": 4.211009502410889, + "learning_rate": 8.768260929148966e-06, + "loss": 0.2373, + "step": 9739 + }, + { + "epoch": 0.1321215409658166, + "grad_norm": 7.359415054321289, + "learning_rate": 8.76812388652871e-06, + "loss": 0.3461, + "step": 9740 + }, + { + "epoch": 0.1321351058057515, + "grad_norm": 6.1899003982543945, + "learning_rate": 8.767986843908456e-06, + "loss": 0.3319, + "step": 9741 + }, + { + "epoch": 0.1321486706456864, + "grad_norm": 7.641109466552734, + "learning_rate": 8.767849801288201e-06, + "loss": 0.484, + "step": 9742 + }, + { + "epoch": 0.13216223548562128, + "grad_norm": 6.302103519439697, + "learning_rate": 8.767712758667946e-06, + "loss": 0.291, + "step": 9743 + }, + { + "epoch": 0.13217580032555615, + "grad_norm": 6.950103282928467, + "learning_rate": 8.767575716047691e-06, + "loss": 0.3349, + "step": 9744 + }, + { + "epoch": 0.13218936516549104, + "grad_norm": 4.956302642822266, + "learning_rate": 8.767438673427437e-06, + "loss": 0.3897, + "step": 9745 + }, + { + "epoch": 0.13220293000542593, + "grad_norm": 7.06347131729126, + "learning_rate": 8.767301630807182e-06, + "loss": 0.3808, + "step": 9746 + }, + { + "epoch": 0.13221649484536083, + "grad_norm": 5.523189544677734, + "learning_rate": 8.767164588186927e-06, + "loss": 0.4276, + "step": 9747 + }, + { + "epoch": 0.13223005968529572, + "grad_norm": 5.798789024353027, + "learning_rate": 8.767027545566672e-06, + "loss": 0.4247, + "step": 9748 + }, + { + "epoch": 0.1322436245252306, + "grad_norm": 12.828645706176758, + "learning_rate": 8.766890502946417e-06, + "loss": 0.4602, + "step": 9749 + }, + { + "epoch": 0.13225718936516548, + "grad_norm": 5.234737396240234, + "learning_rate": 8.766753460326161e-06, + "loss": 0.2808, + "step": 9750 + }, + { + "epoch": 0.13227075420510037, + "grad_norm": 4.898491859436035, + "learning_rate": 8.766616417705908e-06, + "loss": 0.2537, + "step": 9751 + }, + { + "epoch": 0.13228431904503526, + "grad_norm": 6.3142476081848145, + "learning_rate": 8.766479375085653e-06, + "loss": 0.2791, + "step": 9752 + }, + { + "epoch": 0.13229788388497016, + "grad_norm": 8.300540924072266, + "learning_rate": 8.766342332465398e-06, + "loss": 0.3632, + "step": 9753 + }, + { + "epoch": 0.13231144872490505, + "grad_norm": 4.872084140777588, + "learning_rate": 8.766205289845142e-06, + "loss": 0.2887, + "step": 9754 + }, + { + "epoch": 0.13232501356483994, + "grad_norm": 6.519929885864258, + "learning_rate": 8.766068247224888e-06, + "loss": 0.2796, + "step": 9755 + }, + { + "epoch": 0.13233857840477484, + "grad_norm": 7.204483509063721, + "learning_rate": 8.765931204604634e-06, + "loss": 0.3583, + "step": 9756 + }, + { + "epoch": 0.1323521432447097, + "grad_norm": 4.467578887939453, + "learning_rate": 8.765794161984377e-06, + "loss": 0.2025, + "step": 9757 + }, + { + "epoch": 0.1323657080846446, + "grad_norm": 5.741865158081055, + "learning_rate": 8.765657119364122e-06, + "loss": 0.365, + "step": 9758 + }, + { + "epoch": 0.1323792729245795, + "grad_norm": 5.303692817687988, + "learning_rate": 8.765520076743867e-06, + "loss": 0.2952, + "step": 9759 + }, + { + "epoch": 0.13239283776451438, + "grad_norm": 7.788690090179443, + "learning_rate": 8.765383034123613e-06, + "loss": 0.3611, + "step": 9760 + }, + { + "epoch": 0.13240640260444927, + "grad_norm": 6.833197593688965, + "learning_rate": 8.765245991503358e-06, + "loss": 0.3845, + "step": 9761 + }, + { + "epoch": 0.13241996744438417, + "grad_norm": 4.501234531402588, + "learning_rate": 8.765108948883103e-06, + "loss": 0.2349, + "step": 9762 + }, + { + "epoch": 0.13243353228431903, + "grad_norm": 5.6420440673828125, + "learning_rate": 8.764971906262848e-06, + "loss": 0.3105, + "step": 9763 + }, + { + "epoch": 0.13244709712425393, + "grad_norm": 6.126606464385986, + "learning_rate": 8.764834863642593e-06, + "loss": 0.3438, + "step": 9764 + }, + { + "epoch": 0.13246066196418882, + "grad_norm": 5.027839183807373, + "learning_rate": 8.764697821022339e-06, + "loss": 0.2799, + "step": 9765 + }, + { + "epoch": 0.1324742268041237, + "grad_norm": 5.429416656494141, + "learning_rate": 8.764560778402084e-06, + "loss": 0.2303, + "step": 9766 + }, + { + "epoch": 0.1324877916440586, + "grad_norm": 5.175467491149902, + "learning_rate": 8.764423735781829e-06, + "loss": 0.3525, + "step": 9767 + }, + { + "epoch": 0.1325013564839935, + "grad_norm": 4.578464984893799, + "learning_rate": 8.764286693161574e-06, + "loss": 0.3202, + "step": 9768 + }, + { + "epoch": 0.13251492132392836, + "grad_norm": 6.07462739944458, + "learning_rate": 8.76414965054132e-06, + "loss": 0.4693, + "step": 9769 + }, + { + "epoch": 0.13252848616386326, + "grad_norm": 6.370356559753418, + "learning_rate": 8.764012607921064e-06, + "loss": 0.3517, + "step": 9770 + }, + { + "epoch": 0.13254205100379815, + "grad_norm": 5.740285873413086, + "learning_rate": 8.76387556530081e-06, + "loss": 0.362, + "step": 9771 + }, + { + "epoch": 0.13255561584373304, + "grad_norm": 5.439290523529053, + "learning_rate": 8.763738522680555e-06, + "loss": 0.3658, + "step": 9772 + }, + { + "epoch": 0.13256918068366794, + "grad_norm": 6.586767673492432, + "learning_rate": 8.7636014800603e-06, + "loss": 0.3693, + "step": 9773 + }, + { + "epoch": 0.13258274552360283, + "grad_norm": 5.744421005249023, + "learning_rate": 8.763464437440045e-06, + "loss": 0.3717, + "step": 9774 + }, + { + "epoch": 0.13259631036353772, + "grad_norm": 6.820810794830322, + "learning_rate": 8.763327394819789e-06, + "loss": 0.3541, + "step": 9775 + }, + { + "epoch": 0.1326098752034726, + "grad_norm": 7.214556694030762, + "learning_rate": 8.763190352199534e-06, + "loss": 0.4224, + "step": 9776 + }, + { + "epoch": 0.13262344004340748, + "grad_norm": 6.745981216430664, + "learning_rate": 8.76305330957928e-06, + "loss": 0.4169, + "step": 9777 + }, + { + "epoch": 0.13263700488334237, + "grad_norm": 8.26801586151123, + "learning_rate": 8.762916266959026e-06, + "loss": 0.4723, + "step": 9778 + }, + { + "epoch": 0.13265056972327727, + "grad_norm": 5.7164998054504395, + "learning_rate": 8.76277922433877e-06, + "loss": 0.4033, + "step": 9779 + }, + { + "epoch": 0.13266413456321216, + "grad_norm": 7.627884864807129, + "learning_rate": 8.762642181718515e-06, + "loss": 0.4026, + "step": 9780 + }, + { + "epoch": 0.13267769940314705, + "grad_norm": 5.938911437988281, + "learning_rate": 8.762505139098261e-06, + "loss": 0.4074, + "step": 9781 + }, + { + "epoch": 0.13269126424308192, + "grad_norm": 3.864805221557617, + "learning_rate": 8.762368096478005e-06, + "loss": 0.3001, + "step": 9782 + }, + { + "epoch": 0.1327048290830168, + "grad_norm": 4.255274295806885, + "learning_rate": 8.76223105385775e-06, + "loss": 0.3657, + "step": 9783 + }, + { + "epoch": 0.1327183939229517, + "grad_norm": 5.732542037963867, + "learning_rate": 8.762094011237495e-06, + "loss": 0.4796, + "step": 9784 + }, + { + "epoch": 0.1327319587628866, + "grad_norm": 4.448753833770752, + "learning_rate": 8.76195696861724e-06, + "loss": 0.3246, + "step": 9785 + }, + { + "epoch": 0.1327455236028215, + "grad_norm": 5.331238746643066, + "learning_rate": 8.761819925996986e-06, + "loss": 0.3214, + "step": 9786 + }, + { + "epoch": 0.13275908844275638, + "grad_norm": 6.185420036315918, + "learning_rate": 8.76168288337673e-06, + "loss": 0.3805, + "step": 9787 + }, + { + "epoch": 0.13277265328269128, + "grad_norm": 5.931912422180176, + "learning_rate": 8.761545840756476e-06, + "loss": 0.5339, + "step": 9788 + }, + { + "epoch": 0.13278621812262614, + "grad_norm": 7.458765506744385, + "learning_rate": 8.761408798136221e-06, + "loss": 0.4897, + "step": 9789 + }, + { + "epoch": 0.13279978296256104, + "grad_norm": 5.610836029052734, + "learning_rate": 8.761271755515966e-06, + "loss": 0.3798, + "step": 9790 + }, + { + "epoch": 0.13281334780249593, + "grad_norm": 5.955480575561523, + "learning_rate": 8.761134712895711e-06, + "loss": 0.5015, + "step": 9791 + }, + { + "epoch": 0.13282691264243082, + "grad_norm": 5.729814052581787, + "learning_rate": 8.760997670275457e-06, + "loss": 0.4148, + "step": 9792 + }, + { + "epoch": 0.13284047748236572, + "grad_norm": 5.096227169036865, + "learning_rate": 8.760860627655202e-06, + "loss": 0.3156, + "step": 9793 + }, + { + "epoch": 0.1328540423223006, + "grad_norm": 4.9112548828125, + "learning_rate": 8.760723585034947e-06, + "loss": 0.403, + "step": 9794 + }, + { + "epoch": 0.13286760716223547, + "grad_norm": 6.547095775604248, + "learning_rate": 8.760586542414692e-06, + "loss": 0.2913, + "step": 9795 + }, + { + "epoch": 0.13288117200217037, + "grad_norm": 7.874242305755615, + "learning_rate": 8.760449499794437e-06, + "loss": 0.4634, + "step": 9796 + }, + { + "epoch": 0.13289473684210526, + "grad_norm": 5.466801166534424, + "learning_rate": 8.760312457174181e-06, + "loss": 0.3273, + "step": 9797 + }, + { + "epoch": 0.13290830168204015, + "grad_norm": 5.148651123046875, + "learning_rate": 8.760175414553928e-06, + "loss": 0.2608, + "step": 9798 + }, + { + "epoch": 0.13292186652197505, + "grad_norm": 5.901987075805664, + "learning_rate": 8.760038371933673e-06, + "loss": 0.4899, + "step": 9799 + }, + { + "epoch": 0.13293543136190994, + "grad_norm": 7.139894485473633, + "learning_rate": 8.759901329313416e-06, + "loss": 0.279, + "step": 9800 + }, + { + "epoch": 0.1329489962018448, + "grad_norm": 6.088993549346924, + "learning_rate": 8.759764286693162e-06, + "loss": 0.3231, + "step": 9801 + }, + { + "epoch": 0.1329625610417797, + "grad_norm": 6.603847980499268, + "learning_rate": 8.759627244072907e-06, + "loss": 0.3334, + "step": 9802 + }, + { + "epoch": 0.1329761258817146, + "grad_norm": 6.536016941070557, + "learning_rate": 8.759490201452654e-06, + "loss": 0.4814, + "step": 9803 + }, + { + "epoch": 0.13298969072164948, + "grad_norm": 7.6634674072265625, + "learning_rate": 8.759353158832397e-06, + "loss": 0.4223, + "step": 9804 + }, + { + "epoch": 0.13300325556158438, + "grad_norm": 6.306955814361572, + "learning_rate": 8.759216116212142e-06, + "loss": 0.4141, + "step": 9805 + }, + { + "epoch": 0.13301682040151927, + "grad_norm": 5.725619792938232, + "learning_rate": 8.759079073591887e-06, + "loss": 0.3758, + "step": 9806 + }, + { + "epoch": 0.13303038524145416, + "grad_norm": 4.986690998077393, + "learning_rate": 8.758942030971633e-06, + "loss": 0.3666, + "step": 9807 + }, + { + "epoch": 0.13304395008138903, + "grad_norm": 5.963035583496094, + "learning_rate": 8.758804988351378e-06, + "loss": 0.4119, + "step": 9808 + }, + { + "epoch": 0.13305751492132392, + "grad_norm": 6.074991703033447, + "learning_rate": 8.758667945731123e-06, + "loss": 0.2616, + "step": 9809 + }, + { + "epoch": 0.13307107976125881, + "grad_norm": 5.431123733520508, + "learning_rate": 8.758530903110868e-06, + "loss": 0.3322, + "step": 9810 + }, + { + "epoch": 0.1330846446011937, + "grad_norm": 5.967298984527588, + "learning_rate": 8.758393860490613e-06, + "loss": 0.3418, + "step": 9811 + }, + { + "epoch": 0.1330982094411286, + "grad_norm": 5.339149475097656, + "learning_rate": 8.758256817870359e-06, + "loss": 0.3508, + "step": 9812 + }, + { + "epoch": 0.1331117742810635, + "grad_norm": 5.008146286010742, + "learning_rate": 8.758119775250104e-06, + "loss": 0.3782, + "step": 9813 + }, + { + "epoch": 0.13312533912099836, + "grad_norm": 5.757988929748535, + "learning_rate": 8.757982732629849e-06, + "loss": 0.3229, + "step": 9814 + }, + { + "epoch": 0.13313890396093325, + "grad_norm": 4.4969868659973145, + "learning_rate": 8.757845690009592e-06, + "loss": 0.3909, + "step": 9815 + }, + { + "epoch": 0.13315246880086815, + "grad_norm": 6.010661602020264, + "learning_rate": 8.75770864738934e-06, + "loss": 0.3452, + "step": 9816 + }, + { + "epoch": 0.13316603364080304, + "grad_norm": 6.204958438873291, + "learning_rate": 8.757571604769084e-06, + "loss": 0.2984, + "step": 9817 + }, + { + "epoch": 0.13317959848073793, + "grad_norm": 4.833187103271484, + "learning_rate": 8.75743456214883e-06, + "loss": 0.33, + "step": 9818 + }, + { + "epoch": 0.13319316332067282, + "grad_norm": 5.406104564666748, + "learning_rate": 8.757297519528573e-06, + "loss": 0.3051, + "step": 9819 + }, + { + "epoch": 0.13320672816060772, + "grad_norm": 6.013457298278809, + "learning_rate": 8.75716047690832e-06, + "loss": 0.3455, + "step": 9820 + }, + { + "epoch": 0.13322029300054258, + "grad_norm": 5.232307434082031, + "learning_rate": 8.757023434288065e-06, + "loss": 0.2986, + "step": 9821 + }, + { + "epoch": 0.13323385784047748, + "grad_norm": 5.992379665374756, + "learning_rate": 8.756886391667809e-06, + "loss": 0.4429, + "step": 9822 + }, + { + "epoch": 0.13324742268041237, + "grad_norm": 5.651230812072754, + "learning_rate": 8.756749349047554e-06, + "loss": 0.3882, + "step": 9823 + }, + { + "epoch": 0.13326098752034726, + "grad_norm": 7.062747478485107, + "learning_rate": 8.7566123064273e-06, + "loss": 0.4019, + "step": 9824 + }, + { + "epoch": 0.13327455236028216, + "grad_norm": 6.8605875968933105, + "learning_rate": 8.756475263807044e-06, + "loss": 0.3854, + "step": 9825 + }, + { + "epoch": 0.13328811720021705, + "grad_norm": 5.257763385772705, + "learning_rate": 8.75633822118679e-06, + "loss": 0.2855, + "step": 9826 + }, + { + "epoch": 0.13330168204015191, + "grad_norm": 4.848256587982178, + "learning_rate": 8.756201178566535e-06, + "loss": 0.2979, + "step": 9827 + }, + { + "epoch": 0.1333152468800868, + "grad_norm": 9.866568565368652, + "learning_rate": 8.75606413594628e-06, + "loss": 0.493, + "step": 9828 + }, + { + "epoch": 0.1333288117200217, + "grad_norm": 4.699923038482666, + "learning_rate": 8.755927093326025e-06, + "loss": 0.2831, + "step": 9829 + }, + { + "epoch": 0.1333423765599566, + "grad_norm": 5.858040809631348, + "learning_rate": 8.75579005070577e-06, + "loss": 0.3982, + "step": 9830 + }, + { + "epoch": 0.1333559413998915, + "grad_norm": 6.465073108673096, + "learning_rate": 8.755653008085515e-06, + "loss": 0.4634, + "step": 9831 + }, + { + "epoch": 0.13336950623982638, + "grad_norm": 4.767824172973633, + "learning_rate": 8.75551596546526e-06, + "loss": 0.3577, + "step": 9832 + }, + { + "epoch": 0.13338307107976125, + "grad_norm": 5.6237101554870605, + "learning_rate": 8.755378922845006e-06, + "loss": 0.431, + "step": 9833 + }, + { + "epoch": 0.13339663591969614, + "grad_norm": 5.849287033081055, + "learning_rate": 8.75524188022475e-06, + "loss": 0.3324, + "step": 9834 + }, + { + "epoch": 0.13341020075963103, + "grad_norm": 6.241520881652832, + "learning_rate": 8.755104837604496e-06, + "loss": 0.274, + "step": 9835 + }, + { + "epoch": 0.13342376559956592, + "grad_norm": 4.575817584991455, + "learning_rate": 8.754967794984241e-06, + "loss": 0.362, + "step": 9836 + }, + { + "epoch": 0.13343733043950082, + "grad_norm": 5.793750762939453, + "learning_rate": 8.754830752363986e-06, + "loss": 0.3856, + "step": 9837 + }, + { + "epoch": 0.1334508952794357, + "grad_norm": 6.265534400939941, + "learning_rate": 8.754693709743732e-06, + "loss": 0.357, + "step": 9838 + }, + { + "epoch": 0.1334644601193706, + "grad_norm": 7.392005443572998, + "learning_rate": 8.754556667123477e-06, + "loss": 0.4514, + "step": 9839 + }, + { + "epoch": 0.13347802495930547, + "grad_norm": 5.63400411605835, + "learning_rate": 8.75441962450322e-06, + "loss": 0.4671, + "step": 9840 + }, + { + "epoch": 0.13349158979924036, + "grad_norm": 9.01618766784668, + "learning_rate": 8.754282581882967e-06, + "loss": 0.5064, + "step": 9841 + }, + { + "epoch": 0.13350515463917526, + "grad_norm": 5.607973098754883, + "learning_rate": 8.754145539262712e-06, + "loss": 0.3285, + "step": 9842 + }, + { + "epoch": 0.13351871947911015, + "grad_norm": 6.101891040802002, + "learning_rate": 8.754008496642456e-06, + "loss": 0.4853, + "step": 9843 + }, + { + "epoch": 0.13353228431904504, + "grad_norm": 6.317266941070557, + "learning_rate": 8.753871454022201e-06, + "loss": 0.3926, + "step": 9844 + }, + { + "epoch": 0.13354584915897993, + "grad_norm": 5.988515377044678, + "learning_rate": 8.753734411401946e-06, + "loss": 0.3998, + "step": 9845 + }, + { + "epoch": 0.1335594139989148, + "grad_norm": 8.098607063293457, + "learning_rate": 8.753597368781693e-06, + "loss": 0.5428, + "step": 9846 + }, + { + "epoch": 0.1335729788388497, + "grad_norm": 7.010625839233398, + "learning_rate": 8.753460326161436e-06, + "loss": 0.3961, + "step": 9847 + }, + { + "epoch": 0.1335865436787846, + "grad_norm": 7.9253950119018555, + "learning_rate": 8.753323283541182e-06, + "loss": 0.5274, + "step": 9848 + }, + { + "epoch": 0.13360010851871948, + "grad_norm": 5.373205184936523, + "learning_rate": 8.753186240920927e-06, + "loss": 0.3494, + "step": 9849 + }, + { + "epoch": 0.13361367335865437, + "grad_norm": 5.288830280303955, + "learning_rate": 8.753049198300672e-06, + "loss": 0.3783, + "step": 9850 + }, + { + "epoch": 0.13362723819858927, + "grad_norm": 4.617241859436035, + "learning_rate": 8.752912155680417e-06, + "loss": 0.3009, + "step": 9851 + }, + { + "epoch": 0.13364080303852416, + "grad_norm": 6.857008934020996, + "learning_rate": 8.752775113060162e-06, + "loss": 0.3423, + "step": 9852 + }, + { + "epoch": 0.13365436787845902, + "grad_norm": 6.2656660079956055, + "learning_rate": 8.752638070439908e-06, + "loss": 0.4005, + "step": 9853 + }, + { + "epoch": 0.13366793271839392, + "grad_norm": 5.896670818328857, + "learning_rate": 8.752501027819653e-06, + "loss": 0.3704, + "step": 9854 + }, + { + "epoch": 0.1336814975583288, + "grad_norm": 6.222388744354248, + "learning_rate": 8.752363985199398e-06, + "loss": 0.3165, + "step": 9855 + }, + { + "epoch": 0.1336950623982637, + "grad_norm": 7.130173683166504, + "learning_rate": 8.752226942579143e-06, + "loss": 0.4102, + "step": 9856 + }, + { + "epoch": 0.1337086272381986, + "grad_norm": 7.175823211669922, + "learning_rate": 8.752089899958888e-06, + "loss": 0.4284, + "step": 9857 + }, + { + "epoch": 0.1337221920781335, + "grad_norm": 5.790322303771973, + "learning_rate": 8.751952857338632e-06, + "loss": 0.3717, + "step": 9858 + }, + { + "epoch": 0.13373575691806835, + "grad_norm": 5.147396564483643, + "learning_rate": 8.751815814718379e-06, + "loss": 0.3021, + "step": 9859 + }, + { + "epoch": 0.13374932175800325, + "grad_norm": 7.279067039489746, + "learning_rate": 8.751678772098124e-06, + "loss": 0.4197, + "step": 9860 + }, + { + "epoch": 0.13376288659793814, + "grad_norm": 6.202226638793945, + "learning_rate": 8.751541729477869e-06, + "loss": 0.3438, + "step": 9861 + }, + { + "epoch": 0.13377645143787303, + "grad_norm": 5.094374179840088, + "learning_rate": 8.751404686857612e-06, + "loss": 0.2996, + "step": 9862 + }, + { + "epoch": 0.13379001627780793, + "grad_norm": 8.52355670928955, + "learning_rate": 8.75126764423736e-06, + "loss": 0.3683, + "step": 9863 + }, + { + "epoch": 0.13380358111774282, + "grad_norm": 6.996747016906738, + "learning_rate": 8.751130601617104e-06, + "loss": 0.4954, + "step": 9864 + }, + { + "epoch": 0.13381714595767769, + "grad_norm": 7.240680694580078, + "learning_rate": 8.750993558996848e-06, + "loss": 0.5082, + "step": 9865 + }, + { + "epoch": 0.13383071079761258, + "grad_norm": 6.130307197570801, + "learning_rate": 8.750856516376593e-06, + "loss": 0.33, + "step": 9866 + }, + { + "epoch": 0.13384427563754747, + "grad_norm": 5.976043701171875, + "learning_rate": 8.75071947375634e-06, + "loss": 0.3057, + "step": 9867 + }, + { + "epoch": 0.13385784047748236, + "grad_norm": 5.908549785614014, + "learning_rate": 8.750582431136083e-06, + "loss": 0.3418, + "step": 9868 + }, + { + "epoch": 0.13387140531741726, + "grad_norm": 6.140532970428467, + "learning_rate": 8.750445388515829e-06, + "loss": 0.2452, + "step": 9869 + }, + { + "epoch": 0.13388497015735215, + "grad_norm": 5.897700786590576, + "learning_rate": 8.750308345895574e-06, + "loss": 0.3398, + "step": 9870 + }, + { + "epoch": 0.13389853499728704, + "grad_norm": 7.3267340660095215, + "learning_rate": 8.750171303275319e-06, + "loss": 0.3956, + "step": 9871 + }, + { + "epoch": 0.1339120998372219, + "grad_norm": 4.652771472930908, + "learning_rate": 8.750034260655064e-06, + "loss": 0.3456, + "step": 9872 + }, + { + "epoch": 0.1339256646771568, + "grad_norm": 5.746942043304443, + "learning_rate": 8.74989721803481e-06, + "loss": 0.3117, + "step": 9873 + }, + { + "epoch": 0.1339392295170917, + "grad_norm": 6.419872760772705, + "learning_rate": 8.749760175414555e-06, + "loss": 0.3952, + "step": 9874 + }, + { + "epoch": 0.1339527943570266, + "grad_norm": 7.579748630523682, + "learning_rate": 8.7496231327943e-06, + "loss": 0.476, + "step": 9875 + }, + { + "epoch": 0.13396635919696148, + "grad_norm": 7.086772441864014, + "learning_rate": 8.749486090174045e-06, + "loss": 0.3628, + "step": 9876 + }, + { + "epoch": 0.13397992403689638, + "grad_norm": 6.018071174621582, + "learning_rate": 8.74934904755379e-06, + "loss": 0.3372, + "step": 9877 + }, + { + "epoch": 0.13399348887683124, + "grad_norm": 7.016120910644531, + "learning_rate": 8.749212004933535e-06, + "loss": 0.4282, + "step": 9878 + }, + { + "epoch": 0.13400705371676613, + "grad_norm": 7.70876407623291, + "learning_rate": 8.74907496231328e-06, + "loss": 0.4753, + "step": 9879 + }, + { + "epoch": 0.13402061855670103, + "grad_norm": 5.136069297790527, + "learning_rate": 8.748937919693026e-06, + "loss": 0.2155, + "step": 9880 + }, + { + "epoch": 0.13403418339663592, + "grad_norm": 4.838817596435547, + "learning_rate": 8.74880087707277e-06, + "loss": 0.2639, + "step": 9881 + }, + { + "epoch": 0.1340477482365708, + "grad_norm": 6.718809127807617, + "learning_rate": 8.748663834452516e-06, + "loss": 0.386, + "step": 9882 + }, + { + "epoch": 0.1340613130765057, + "grad_norm": 5.975987911224365, + "learning_rate": 8.74852679183226e-06, + "loss": 0.4043, + "step": 9883 + }, + { + "epoch": 0.1340748779164406, + "grad_norm": 5.313125133514404, + "learning_rate": 8.748389749212005e-06, + "loss": 0.3319, + "step": 9884 + }, + { + "epoch": 0.13408844275637546, + "grad_norm": 5.227423191070557, + "learning_rate": 8.748252706591752e-06, + "loss": 0.2425, + "step": 9885 + }, + { + "epoch": 0.13410200759631036, + "grad_norm": 3.956718683242798, + "learning_rate": 8.748115663971497e-06, + "loss": 0.2917, + "step": 9886 + }, + { + "epoch": 0.13411557243624525, + "grad_norm": 6.144656181335449, + "learning_rate": 8.74797862135124e-06, + "loss": 0.2563, + "step": 9887 + }, + { + "epoch": 0.13412913727618014, + "grad_norm": 6.268795490264893, + "learning_rate": 8.747841578730985e-06, + "loss": 0.4547, + "step": 9888 + }, + { + "epoch": 0.13414270211611504, + "grad_norm": 5.693535804748535, + "learning_rate": 8.747704536110732e-06, + "loss": 0.4612, + "step": 9889 + }, + { + "epoch": 0.13415626695604993, + "grad_norm": 8.077047348022461, + "learning_rate": 8.747567493490476e-06, + "loss": 0.4126, + "step": 9890 + }, + { + "epoch": 0.1341698317959848, + "grad_norm": 7.94024658203125, + "learning_rate": 8.747430450870221e-06, + "loss": 0.3809, + "step": 9891 + }, + { + "epoch": 0.1341833966359197, + "grad_norm": 6.268712043762207, + "learning_rate": 8.747293408249966e-06, + "loss": 0.2897, + "step": 9892 + }, + { + "epoch": 0.13419696147585458, + "grad_norm": 5.615865230560303, + "learning_rate": 8.747156365629711e-06, + "loss": 0.2758, + "step": 9893 + }, + { + "epoch": 0.13421052631578947, + "grad_norm": 5.195559024810791, + "learning_rate": 8.747019323009456e-06, + "loss": 0.274, + "step": 9894 + }, + { + "epoch": 0.13422409115572437, + "grad_norm": 5.381495952606201, + "learning_rate": 8.746882280389202e-06, + "loss": 0.2301, + "step": 9895 + }, + { + "epoch": 0.13423765599565926, + "grad_norm": 5.260166645050049, + "learning_rate": 8.746745237768947e-06, + "loss": 0.3111, + "step": 9896 + }, + { + "epoch": 0.13425122083559413, + "grad_norm": 5.189951419830322, + "learning_rate": 8.746608195148692e-06, + "loss": 0.2747, + "step": 9897 + }, + { + "epoch": 0.13426478567552902, + "grad_norm": 4.760782241821289, + "learning_rate": 8.746471152528437e-06, + "loss": 0.2894, + "step": 9898 + }, + { + "epoch": 0.1342783505154639, + "grad_norm": 5.8185882568359375, + "learning_rate": 8.746334109908182e-06, + "loss": 0.3473, + "step": 9899 + }, + { + "epoch": 0.1342919153553988, + "grad_norm": 5.858283519744873, + "learning_rate": 8.746197067287928e-06, + "loss": 0.2927, + "step": 9900 + }, + { + "epoch": 0.1343054801953337, + "grad_norm": 5.9378180503845215, + "learning_rate": 8.746060024667673e-06, + "loss": 0.3634, + "step": 9901 + }, + { + "epoch": 0.1343190450352686, + "grad_norm": 5.85731840133667, + "learning_rate": 8.745922982047418e-06, + "loss": 0.3435, + "step": 9902 + }, + { + "epoch": 0.13433260987520348, + "grad_norm": 6.655158996582031, + "learning_rate": 8.745785939427163e-06, + "loss": 0.2698, + "step": 9903 + }, + { + "epoch": 0.13434617471513835, + "grad_norm": 5.45338249206543, + "learning_rate": 8.745648896806908e-06, + "loss": 0.3488, + "step": 9904 + }, + { + "epoch": 0.13435973955507324, + "grad_norm": 6.602651596069336, + "learning_rate": 8.745511854186652e-06, + "loss": 0.2792, + "step": 9905 + }, + { + "epoch": 0.13437330439500814, + "grad_norm": 5.406306743621826, + "learning_rate": 8.745374811566399e-06, + "loss": 0.2447, + "step": 9906 + }, + { + "epoch": 0.13438686923494303, + "grad_norm": 4.806190013885498, + "learning_rate": 8.745237768946144e-06, + "loss": 0.2047, + "step": 9907 + }, + { + "epoch": 0.13440043407487792, + "grad_norm": 6.166857719421387, + "learning_rate": 8.745100726325887e-06, + "loss": 0.4592, + "step": 9908 + }, + { + "epoch": 0.13441399891481282, + "grad_norm": 5.5759077072143555, + "learning_rate": 8.744963683705632e-06, + "loss": 0.2679, + "step": 9909 + }, + { + "epoch": 0.13442756375474768, + "grad_norm": 3.92158842086792, + "learning_rate": 8.74482664108538e-06, + "loss": 0.2286, + "step": 9910 + }, + { + "epoch": 0.13444112859468257, + "grad_norm": 3.1220574378967285, + "learning_rate": 8.744689598465124e-06, + "loss": 0.179, + "step": 9911 + }, + { + "epoch": 0.13445469343461747, + "grad_norm": 5.091104030609131, + "learning_rate": 8.744552555844868e-06, + "loss": 0.27, + "step": 9912 + }, + { + "epoch": 0.13446825827455236, + "grad_norm": 5.7945685386657715, + "learning_rate": 8.744415513224613e-06, + "loss": 0.3175, + "step": 9913 + }, + { + "epoch": 0.13448182311448725, + "grad_norm": 3.502995729446411, + "learning_rate": 8.744278470604358e-06, + "loss": 0.2069, + "step": 9914 + }, + { + "epoch": 0.13449538795442215, + "grad_norm": 8.044403076171875, + "learning_rate": 8.744141427984104e-06, + "loss": 0.4614, + "step": 9915 + }, + { + "epoch": 0.13450895279435704, + "grad_norm": 5.5869035720825195, + "learning_rate": 8.744004385363849e-06, + "loss": 0.3217, + "step": 9916 + }, + { + "epoch": 0.1345225176342919, + "grad_norm": 6.585434913635254, + "learning_rate": 8.743867342743594e-06, + "loss": 0.3857, + "step": 9917 + }, + { + "epoch": 0.1345360824742268, + "grad_norm": 7.150252819061279, + "learning_rate": 8.743730300123339e-06, + "loss": 0.3652, + "step": 9918 + }, + { + "epoch": 0.1345496473141617, + "grad_norm": 6.068453311920166, + "learning_rate": 8.743593257503084e-06, + "loss": 0.3317, + "step": 9919 + }, + { + "epoch": 0.13456321215409658, + "grad_norm": 6.055080413818359, + "learning_rate": 8.74345621488283e-06, + "loss": 0.237, + "step": 9920 + }, + { + "epoch": 0.13457677699403148, + "grad_norm": 4.831462860107422, + "learning_rate": 8.743319172262575e-06, + "loss": 0.244, + "step": 9921 + }, + { + "epoch": 0.13459034183396637, + "grad_norm": 5.3262457847595215, + "learning_rate": 8.74318212964232e-06, + "loss": 0.1972, + "step": 9922 + }, + { + "epoch": 0.13460390667390124, + "grad_norm": 4.365741729736328, + "learning_rate": 8.743045087022065e-06, + "loss": 0.2095, + "step": 9923 + }, + { + "epoch": 0.13461747151383613, + "grad_norm": 5.17822265625, + "learning_rate": 8.74290804440181e-06, + "loss": 0.2482, + "step": 9924 + }, + { + "epoch": 0.13463103635377102, + "grad_norm": 5.385260581970215, + "learning_rate": 8.742771001781555e-06, + "loss": 0.194, + "step": 9925 + }, + { + "epoch": 0.13464460119370592, + "grad_norm": 5.4856672286987305, + "learning_rate": 8.7426339591613e-06, + "loss": 0.3426, + "step": 9926 + }, + { + "epoch": 0.1346581660336408, + "grad_norm": 7.59953498840332, + "learning_rate": 8.742496916541044e-06, + "loss": 0.301, + "step": 9927 + }, + { + "epoch": 0.1346717308735757, + "grad_norm": 5.407611846923828, + "learning_rate": 8.74235987392079e-06, + "loss": 0.3097, + "step": 9928 + }, + { + "epoch": 0.1346852957135106, + "grad_norm": 5.999100685119629, + "learning_rate": 8.742222831300536e-06, + "loss": 0.3154, + "step": 9929 + }, + { + "epoch": 0.13469886055344546, + "grad_norm": 4.374578475952148, + "learning_rate": 8.74208578868028e-06, + "loss": 0.2287, + "step": 9930 + }, + { + "epoch": 0.13471242539338035, + "grad_norm": 6.674715995788574, + "learning_rate": 8.741948746060025e-06, + "loss": 0.3822, + "step": 9931 + }, + { + "epoch": 0.13472599023331525, + "grad_norm": 9.446589469909668, + "learning_rate": 8.741811703439772e-06, + "loss": 0.6834, + "step": 9932 + }, + { + "epoch": 0.13473955507325014, + "grad_norm": 5.728161811828613, + "learning_rate": 8.741674660819515e-06, + "loss": 0.3156, + "step": 9933 + }, + { + "epoch": 0.13475311991318503, + "grad_norm": 4.065010070800781, + "learning_rate": 8.74153761819926e-06, + "loss": 0.3052, + "step": 9934 + }, + { + "epoch": 0.13476668475311993, + "grad_norm": 4.671703815460205, + "learning_rate": 8.741400575579005e-06, + "loss": 0.311, + "step": 9935 + }, + { + "epoch": 0.1347802495930548, + "grad_norm": 5.443467140197754, + "learning_rate": 8.74126353295875e-06, + "loss": 0.4612, + "step": 9936 + }, + { + "epoch": 0.13479381443298968, + "grad_norm": 5.571283340454102, + "learning_rate": 8.741126490338496e-06, + "loss": 0.3232, + "step": 9937 + }, + { + "epoch": 0.13480737927292458, + "grad_norm": 3.9971280097961426, + "learning_rate": 8.740989447718241e-06, + "loss": 0.189, + "step": 9938 + }, + { + "epoch": 0.13482094411285947, + "grad_norm": 4.6954426765441895, + "learning_rate": 8.740852405097986e-06, + "loss": 0.3096, + "step": 9939 + }, + { + "epoch": 0.13483450895279436, + "grad_norm": 6.207581996917725, + "learning_rate": 8.740715362477731e-06, + "loss": 0.402, + "step": 9940 + }, + { + "epoch": 0.13484807379272926, + "grad_norm": 6.954540252685547, + "learning_rate": 8.740578319857476e-06, + "loss": 0.4445, + "step": 9941 + }, + { + "epoch": 0.13486163863266412, + "grad_norm": 5.557058334350586, + "learning_rate": 8.740441277237222e-06, + "loss": 0.3925, + "step": 9942 + }, + { + "epoch": 0.13487520347259901, + "grad_norm": 6.014784336090088, + "learning_rate": 8.740304234616967e-06, + "loss": 0.4325, + "step": 9943 + }, + { + "epoch": 0.1348887683125339, + "grad_norm": 6.896883487701416, + "learning_rate": 8.740167191996712e-06, + "loss": 0.5222, + "step": 9944 + }, + { + "epoch": 0.1349023331524688, + "grad_norm": 4.439206123352051, + "learning_rate": 8.740030149376457e-06, + "loss": 0.2121, + "step": 9945 + }, + { + "epoch": 0.1349158979924037, + "grad_norm": 5.9936137199401855, + "learning_rate": 8.739893106756202e-06, + "loss": 0.4269, + "step": 9946 + }, + { + "epoch": 0.1349294628323386, + "grad_norm": 7.440878868103027, + "learning_rate": 8.739756064135948e-06, + "loss": 0.5383, + "step": 9947 + }, + { + "epoch": 0.13494302767227348, + "grad_norm": 6.924073696136475, + "learning_rate": 8.739619021515691e-06, + "loss": 0.4266, + "step": 9948 + }, + { + "epoch": 0.13495659251220835, + "grad_norm": 8.892297744750977, + "learning_rate": 8.739481978895438e-06, + "loss": 0.4267, + "step": 9949 + }, + { + "epoch": 0.13497015735214324, + "grad_norm": 4.920801639556885, + "learning_rate": 8.739344936275183e-06, + "loss": 0.3237, + "step": 9950 + }, + { + "epoch": 0.13498372219207813, + "grad_norm": 5.380276203155518, + "learning_rate": 8.739207893654927e-06, + "loss": 0.4219, + "step": 9951 + }, + { + "epoch": 0.13499728703201302, + "grad_norm": 5.03286075592041, + "learning_rate": 8.739070851034672e-06, + "loss": 0.3823, + "step": 9952 + }, + { + "epoch": 0.13501085187194792, + "grad_norm": 6.484428405761719, + "learning_rate": 8.738933808414417e-06, + "loss": 0.3524, + "step": 9953 + }, + { + "epoch": 0.1350244167118828, + "grad_norm": 5.503812789916992, + "learning_rate": 8.738796765794164e-06, + "loss": 0.3518, + "step": 9954 + }, + { + "epoch": 0.13503798155181768, + "grad_norm": 5.19677209854126, + "learning_rate": 8.738659723173907e-06, + "loss": 0.316, + "step": 9955 + }, + { + "epoch": 0.13505154639175257, + "grad_norm": 7.039409637451172, + "learning_rate": 8.738522680553652e-06, + "loss": 0.4103, + "step": 9956 + }, + { + "epoch": 0.13506511123168746, + "grad_norm": 6.150717735290527, + "learning_rate": 8.738385637933398e-06, + "loss": 0.4439, + "step": 9957 + }, + { + "epoch": 0.13507867607162236, + "grad_norm": 5.9460978507995605, + "learning_rate": 8.738248595313143e-06, + "loss": 0.5165, + "step": 9958 + }, + { + "epoch": 0.13509224091155725, + "grad_norm": 7.258893966674805, + "learning_rate": 8.738111552692888e-06, + "loss": 0.4339, + "step": 9959 + }, + { + "epoch": 0.13510580575149214, + "grad_norm": 6.585537910461426, + "learning_rate": 8.737974510072633e-06, + "loss": 0.4846, + "step": 9960 + }, + { + "epoch": 0.13511937059142703, + "grad_norm": 6.76524019241333, + "learning_rate": 8.737837467452378e-06, + "loss": 0.4141, + "step": 9961 + }, + { + "epoch": 0.1351329354313619, + "grad_norm": 6.575769901275635, + "learning_rate": 8.737700424832124e-06, + "loss": 0.404, + "step": 9962 + }, + { + "epoch": 0.1351465002712968, + "grad_norm": 5.0205230712890625, + "learning_rate": 8.737563382211869e-06, + "loss": 0.3303, + "step": 9963 + }, + { + "epoch": 0.1351600651112317, + "grad_norm": 6.661877155303955, + "learning_rate": 8.737426339591614e-06, + "loss": 0.4338, + "step": 9964 + }, + { + "epoch": 0.13517362995116658, + "grad_norm": 5.770737648010254, + "learning_rate": 8.737289296971359e-06, + "loss": 0.3824, + "step": 9965 + }, + { + "epoch": 0.13518719479110147, + "grad_norm": 6.506396293640137, + "learning_rate": 8.737152254351103e-06, + "loss": 0.3178, + "step": 9966 + }, + { + "epoch": 0.13520075963103637, + "grad_norm": 6.7886738777160645, + "learning_rate": 8.73701521173085e-06, + "loss": 0.4145, + "step": 9967 + }, + { + "epoch": 0.13521432447097123, + "grad_norm": 4.739322662353516, + "learning_rate": 8.736878169110595e-06, + "loss": 0.3533, + "step": 9968 + }, + { + "epoch": 0.13522788931090612, + "grad_norm": 8.46184253692627, + "learning_rate": 8.73674112649034e-06, + "loss": 0.5318, + "step": 9969 + }, + { + "epoch": 0.13524145415084102, + "grad_norm": 4.994670867919922, + "learning_rate": 8.736604083870083e-06, + "loss": 0.3123, + "step": 9970 + }, + { + "epoch": 0.1352550189907759, + "grad_norm": 8.658716201782227, + "learning_rate": 8.73646704124983e-06, + "loss": 0.4155, + "step": 9971 + }, + { + "epoch": 0.1352685838307108, + "grad_norm": 6.017419815063477, + "learning_rate": 8.736329998629575e-06, + "loss": 0.4071, + "step": 9972 + }, + { + "epoch": 0.1352821486706457, + "grad_norm": 5.4099907875061035, + "learning_rate": 8.736192956009319e-06, + "loss": 0.3216, + "step": 9973 + }, + { + "epoch": 0.13529571351058056, + "grad_norm": 6.346441268920898, + "learning_rate": 8.736055913389064e-06, + "loss": 0.4011, + "step": 9974 + }, + { + "epoch": 0.13530927835051546, + "grad_norm": 7.576297760009766, + "learning_rate": 8.735918870768811e-06, + "loss": 0.3867, + "step": 9975 + }, + { + "epoch": 0.13532284319045035, + "grad_norm": 6.030551910400391, + "learning_rate": 8.735781828148554e-06, + "loss": 0.4332, + "step": 9976 + }, + { + "epoch": 0.13533640803038524, + "grad_norm": 5.8580827713012695, + "learning_rate": 8.7356447855283e-06, + "loss": 0.4192, + "step": 9977 + }, + { + "epoch": 0.13534997287032013, + "grad_norm": 6.574447154998779, + "learning_rate": 8.735507742908045e-06, + "loss": 0.5922, + "step": 9978 + }, + { + "epoch": 0.13536353771025503, + "grad_norm": 7.504910945892334, + "learning_rate": 8.73537070028779e-06, + "loss": 0.41, + "step": 9979 + }, + { + "epoch": 0.13537710255018992, + "grad_norm": 8.442497253417969, + "learning_rate": 8.735233657667535e-06, + "loss": 0.5066, + "step": 9980 + }, + { + "epoch": 0.13539066739012479, + "grad_norm": 6.070981502532959, + "learning_rate": 8.73509661504728e-06, + "loss": 0.3531, + "step": 9981 + }, + { + "epoch": 0.13540423223005968, + "grad_norm": 8.236797332763672, + "learning_rate": 8.734959572427025e-06, + "loss": 0.394, + "step": 9982 + }, + { + "epoch": 0.13541779706999457, + "grad_norm": 5.911465644836426, + "learning_rate": 8.73482252980677e-06, + "loss": 0.4909, + "step": 9983 + }, + { + "epoch": 0.13543136190992947, + "grad_norm": 6.121901512145996, + "learning_rate": 8.734685487186516e-06, + "loss": 0.3036, + "step": 9984 + }, + { + "epoch": 0.13544492674986436, + "grad_norm": 5.369635105133057, + "learning_rate": 8.734548444566261e-06, + "loss": 0.3594, + "step": 9985 + }, + { + "epoch": 0.13545849158979925, + "grad_norm": 6.227592468261719, + "learning_rate": 8.734411401946006e-06, + "loss": 0.3084, + "step": 9986 + }, + { + "epoch": 0.13547205642973412, + "grad_norm": 6.8750457763671875, + "learning_rate": 8.734274359325751e-06, + "loss": 0.4788, + "step": 9987 + }, + { + "epoch": 0.135485621269669, + "grad_norm": 4.9922051429748535, + "learning_rate": 8.734137316705496e-06, + "loss": 0.3265, + "step": 9988 + }, + { + "epoch": 0.1354991861096039, + "grad_norm": 6.465062618255615, + "learning_rate": 8.734000274085242e-06, + "loss": 0.3216, + "step": 9989 + }, + { + "epoch": 0.1355127509495388, + "grad_norm": 5.860386848449707, + "learning_rate": 8.733863231464987e-06, + "loss": 0.3776, + "step": 9990 + }, + { + "epoch": 0.1355263157894737, + "grad_norm": 6.978604793548584, + "learning_rate": 8.73372618884473e-06, + "loss": 0.4758, + "step": 9991 + }, + { + "epoch": 0.13553988062940858, + "grad_norm": 8.971841812133789, + "learning_rate": 8.733589146224477e-06, + "loss": 0.6719, + "step": 9992 + }, + { + "epoch": 0.13555344546934348, + "grad_norm": 7.144806385040283, + "learning_rate": 8.733452103604222e-06, + "loss": 0.5971, + "step": 9993 + }, + { + "epoch": 0.13556701030927834, + "grad_norm": 6.019430160522461, + "learning_rate": 8.733315060983968e-06, + "loss": 0.3075, + "step": 9994 + }, + { + "epoch": 0.13558057514921323, + "grad_norm": 8.048163414001465, + "learning_rate": 8.733178018363711e-06, + "loss": 0.5891, + "step": 9995 + }, + { + "epoch": 0.13559413998914813, + "grad_norm": 5.560287952423096, + "learning_rate": 8.733040975743456e-06, + "loss": 0.3685, + "step": 9996 + }, + { + "epoch": 0.13560770482908302, + "grad_norm": 9.361146926879883, + "learning_rate": 8.732903933123203e-06, + "loss": 0.7197, + "step": 9997 + }, + { + "epoch": 0.1356212696690179, + "grad_norm": 5.9899091720581055, + "learning_rate": 8.732766890502947e-06, + "loss": 0.4254, + "step": 9998 + }, + { + "epoch": 0.1356348345089528, + "grad_norm": 7.26551628112793, + "learning_rate": 8.732629847882692e-06, + "loss": 0.7148, + "step": 9999 + }, + { + "epoch": 0.13564839934888767, + "grad_norm": 6.325384616851807, + "learning_rate": 8.732492805262437e-06, + "loss": 0.5539, + "step": 10000 + }, + { + "epoch": 0.13566196418882256, + "grad_norm": 7.674426555633545, + "learning_rate": 8.732355762642182e-06, + "loss": 0.4966, + "step": 10001 + }, + { + "epoch": 0.13567552902875746, + "grad_norm": 7.998277187347412, + "learning_rate": 8.732218720021927e-06, + "loss": 0.5013, + "step": 10002 + }, + { + "epoch": 0.13568909386869235, + "grad_norm": 6.585010528564453, + "learning_rate": 8.732081677401672e-06, + "loss": 0.5347, + "step": 10003 + }, + { + "epoch": 0.13570265870862724, + "grad_norm": 7.219786167144775, + "learning_rate": 8.731944634781418e-06, + "loss": 0.4295, + "step": 10004 + }, + { + "epoch": 0.13571622354856214, + "grad_norm": 5.770124435424805, + "learning_rate": 8.731807592161163e-06, + "loss": 0.3966, + "step": 10005 + }, + { + "epoch": 0.135729788388497, + "grad_norm": 6.587559223175049, + "learning_rate": 8.731670549540908e-06, + "loss": 0.3419, + "step": 10006 + }, + { + "epoch": 0.1357433532284319, + "grad_norm": 5.3960723876953125, + "learning_rate": 8.731533506920653e-06, + "loss": 0.2542, + "step": 10007 + }, + { + "epoch": 0.1357569180683668, + "grad_norm": 7.332982540130615, + "learning_rate": 8.731396464300398e-06, + "loss": 0.4574, + "step": 10008 + }, + { + "epoch": 0.13577048290830168, + "grad_norm": 6.377333641052246, + "learning_rate": 8.731259421680144e-06, + "loss": 0.3452, + "step": 10009 + }, + { + "epoch": 0.13578404774823657, + "grad_norm": 6.661908149719238, + "learning_rate": 8.731122379059889e-06, + "loss": 0.6158, + "step": 10010 + }, + { + "epoch": 0.13579761258817147, + "grad_norm": 5.5986857414245605, + "learning_rate": 8.730985336439634e-06, + "loss": 0.3711, + "step": 10011 + }, + { + "epoch": 0.13581117742810636, + "grad_norm": 6.081745624542236, + "learning_rate": 8.730848293819379e-06, + "loss": 0.3542, + "step": 10012 + }, + { + "epoch": 0.13582474226804123, + "grad_norm": 7.714528560638428, + "learning_rate": 8.730711251199123e-06, + "loss": 0.581, + "step": 10013 + }, + { + "epoch": 0.13583830710797612, + "grad_norm": 6.028285026550293, + "learning_rate": 8.73057420857887e-06, + "loss": 0.3698, + "step": 10014 + }, + { + "epoch": 0.135851871947911, + "grad_norm": 5.515350818634033, + "learning_rate": 8.730437165958615e-06, + "loss": 0.4358, + "step": 10015 + }, + { + "epoch": 0.1358654367878459, + "grad_norm": 6.519564628601074, + "learning_rate": 8.730300123338358e-06, + "loss": 0.347, + "step": 10016 + }, + { + "epoch": 0.1358790016277808, + "grad_norm": 7.241753578186035, + "learning_rate": 8.730163080718103e-06, + "loss": 0.6016, + "step": 10017 + }, + { + "epoch": 0.1358925664677157, + "grad_norm": 6.658722877502441, + "learning_rate": 8.73002603809785e-06, + "loss": 0.4329, + "step": 10018 + }, + { + "epoch": 0.13590613130765056, + "grad_norm": 5.823822975158691, + "learning_rate": 8.729888995477595e-06, + "loss": 0.3405, + "step": 10019 + }, + { + "epoch": 0.13591969614758545, + "grad_norm": 7.6362104415893555, + "learning_rate": 8.729751952857339e-06, + "loss": 0.5475, + "step": 10020 + }, + { + "epoch": 0.13593326098752034, + "grad_norm": 6.496086120605469, + "learning_rate": 8.729614910237084e-06, + "loss": 0.5243, + "step": 10021 + }, + { + "epoch": 0.13594682582745524, + "grad_norm": 6.160902500152588, + "learning_rate": 8.72947786761683e-06, + "loss": 0.259, + "step": 10022 + }, + { + "epoch": 0.13596039066739013, + "grad_norm": 6.534593105316162, + "learning_rate": 8.729340824996574e-06, + "loss": 0.3576, + "step": 10023 + }, + { + "epoch": 0.13597395550732502, + "grad_norm": 6.625800609588623, + "learning_rate": 8.72920378237632e-06, + "loss": 0.441, + "step": 10024 + }, + { + "epoch": 0.13598752034725992, + "grad_norm": 6.0368571281433105, + "learning_rate": 8.729066739756065e-06, + "loss": 0.3816, + "step": 10025 + }, + { + "epoch": 0.13600108518719478, + "grad_norm": 6.081247329711914, + "learning_rate": 8.72892969713581e-06, + "loss": 0.3481, + "step": 10026 + }, + { + "epoch": 0.13601465002712967, + "grad_norm": 6.792050361633301, + "learning_rate": 8.728792654515555e-06, + "loss": 0.339, + "step": 10027 + }, + { + "epoch": 0.13602821486706457, + "grad_norm": 6.720574378967285, + "learning_rate": 8.7286556118953e-06, + "loss": 0.3613, + "step": 10028 + }, + { + "epoch": 0.13604177970699946, + "grad_norm": 7.597392559051514, + "learning_rate": 8.728518569275045e-06, + "loss": 0.5089, + "step": 10029 + }, + { + "epoch": 0.13605534454693435, + "grad_norm": 6.198166370391846, + "learning_rate": 8.72838152665479e-06, + "loss": 0.2962, + "step": 10030 + }, + { + "epoch": 0.13606890938686925, + "grad_norm": 7.693325519561768, + "learning_rate": 8.728244484034536e-06, + "loss": 0.494, + "step": 10031 + }, + { + "epoch": 0.1360824742268041, + "grad_norm": 7.222593307495117, + "learning_rate": 8.728107441414281e-06, + "loss": 0.49, + "step": 10032 + }, + { + "epoch": 0.136096039066739, + "grad_norm": 5.902729034423828, + "learning_rate": 8.727970398794026e-06, + "loss": 0.2875, + "step": 10033 + }, + { + "epoch": 0.1361096039066739, + "grad_norm": 6.430828094482422, + "learning_rate": 8.72783335617377e-06, + "loss": 0.4305, + "step": 10034 + }, + { + "epoch": 0.1361231687466088, + "grad_norm": 9.322556495666504, + "learning_rate": 8.727696313553515e-06, + "loss": 0.4867, + "step": 10035 + }, + { + "epoch": 0.13613673358654368, + "grad_norm": 8.164275169372559, + "learning_rate": 8.727559270933262e-06, + "loss": 0.4499, + "step": 10036 + }, + { + "epoch": 0.13615029842647858, + "grad_norm": 7.490677356719971, + "learning_rate": 8.727422228313007e-06, + "loss": 0.4559, + "step": 10037 + }, + { + "epoch": 0.13616386326641344, + "grad_norm": 6.811707973480225, + "learning_rate": 8.72728518569275e-06, + "loss": 0.3493, + "step": 10038 + }, + { + "epoch": 0.13617742810634834, + "grad_norm": 6.045580863952637, + "learning_rate": 8.727148143072496e-06, + "loss": 0.4226, + "step": 10039 + }, + { + "epoch": 0.13619099294628323, + "grad_norm": 6.435094833374023, + "learning_rate": 8.727011100452242e-06, + "loss": 0.3092, + "step": 10040 + }, + { + "epoch": 0.13620455778621812, + "grad_norm": 6.28221321105957, + "learning_rate": 8.726874057831986e-06, + "loss": 0.4151, + "step": 10041 + }, + { + "epoch": 0.13621812262615302, + "grad_norm": 6.262073040008545, + "learning_rate": 8.726737015211731e-06, + "loss": 0.3431, + "step": 10042 + }, + { + "epoch": 0.1362316874660879, + "grad_norm": 9.301933288574219, + "learning_rate": 8.726599972591476e-06, + "loss": 0.5159, + "step": 10043 + }, + { + "epoch": 0.1362452523060228, + "grad_norm": 7.415355205535889, + "learning_rate": 8.726462929971221e-06, + "loss": 0.6526, + "step": 10044 + }, + { + "epoch": 0.13625881714595767, + "grad_norm": 8.285573959350586, + "learning_rate": 8.726325887350967e-06, + "loss": 0.5699, + "step": 10045 + }, + { + "epoch": 0.13627238198589256, + "grad_norm": 7.180477142333984, + "learning_rate": 8.726188844730712e-06, + "loss": 0.5214, + "step": 10046 + }, + { + "epoch": 0.13628594682582745, + "grad_norm": 7.072122573852539, + "learning_rate": 8.726051802110457e-06, + "loss": 0.6143, + "step": 10047 + }, + { + "epoch": 0.13629951166576235, + "grad_norm": 4.589900493621826, + "learning_rate": 8.725914759490202e-06, + "loss": 0.3646, + "step": 10048 + }, + { + "epoch": 0.13631307650569724, + "grad_norm": 8.091118812561035, + "learning_rate": 8.725777716869947e-06, + "loss": 0.4325, + "step": 10049 + }, + { + "epoch": 0.13632664134563213, + "grad_norm": 5.985348224639893, + "learning_rate": 8.725640674249692e-06, + "loss": 0.4626, + "step": 10050 + }, + { + "epoch": 0.136340206185567, + "grad_norm": 6.482504367828369, + "learning_rate": 8.725503631629438e-06, + "loss": 0.4769, + "step": 10051 + }, + { + "epoch": 0.1363537710255019, + "grad_norm": 6.449730396270752, + "learning_rate": 8.725366589009183e-06, + "loss": 0.608, + "step": 10052 + }, + { + "epoch": 0.13636733586543678, + "grad_norm": 5.533212184906006, + "learning_rate": 8.725229546388928e-06, + "loss": 0.4265, + "step": 10053 + }, + { + "epoch": 0.13638090070537168, + "grad_norm": 5.6803131103515625, + "learning_rate": 8.725092503768673e-06, + "loss": 0.3644, + "step": 10054 + }, + { + "epoch": 0.13639446554530657, + "grad_norm": 9.103705406188965, + "learning_rate": 8.724955461148418e-06, + "loss": 0.5938, + "step": 10055 + }, + { + "epoch": 0.13640803038524146, + "grad_norm": 6.001543045043945, + "learning_rate": 8.724818418528162e-06, + "loss": 0.4081, + "step": 10056 + }, + { + "epoch": 0.13642159522517636, + "grad_norm": 9.340054512023926, + "learning_rate": 8.724681375907909e-06, + "loss": 0.6603, + "step": 10057 + }, + { + "epoch": 0.13643516006511122, + "grad_norm": 7.573166847229004, + "learning_rate": 8.724544333287654e-06, + "loss": 0.5382, + "step": 10058 + }, + { + "epoch": 0.13644872490504611, + "grad_norm": 8.283116340637207, + "learning_rate": 8.724407290667397e-06, + "loss": 0.4281, + "step": 10059 + }, + { + "epoch": 0.136462289744981, + "grad_norm": 5.107236862182617, + "learning_rate": 8.724270248047143e-06, + "loss": 0.3136, + "step": 10060 + }, + { + "epoch": 0.1364758545849159, + "grad_norm": 6.973875999450684, + "learning_rate": 8.72413320542689e-06, + "loss": 0.4652, + "step": 10061 + }, + { + "epoch": 0.1364894194248508, + "grad_norm": 7.289099216461182, + "learning_rate": 8.723996162806635e-06, + "loss": 0.4923, + "step": 10062 + }, + { + "epoch": 0.1365029842647857, + "grad_norm": 7.459157466888428, + "learning_rate": 8.723859120186378e-06, + "loss": 0.511, + "step": 10063 + }, + { + "epoch": 0.13651654910472055, + "grad_norm": 7.8616943359375, + "learning_rate": 8.723722077566123e-06, + "loss": 0.607, + "step": 10064 + }, + { + "epoch": 0.13653011394465545, + "grad_norm": 7.855877876281738, + "learning_rate": 8.723585034945868e-06, + "loss": 0.416, + "step": 10065 + }, + { + "epoch": 0.13654367878459034, + "grad_norm": 5.96766996383667, + "learning_rate": 8.723447992325614e-06, + "loss": 0.5615, + "step": 10066 + }, + { + "epoch": 0.13655724362452523, + "grad_norm": 6.884050369262695, + "learning_rate": 8.723310949705359e-06, + "loss": 0.5113, + "step": 10067 + }, + { + "epoch": 0.13657080846446013, + "grad_norm": 5.129942417144775, + "learning_rate": 8.723173907085104e-06, + "loss": 0.2925, + "step": 10068 + }, + { + "epoch": 0.13658437330439502, + "grad_norm": 7.141401290893555, + "learning_rate": 8.72303686446485e-06, + "loss": 0.5887, + "step": 10069 + }, + { + "epoch": 0.13659793814432988, + "grad_norm": 6.287934303283691, + "learning_rate": 8.722899821844594e-06, + "loss": 0.4252, + "step": 10070 + }, + { + "epoch": 0.13661150298426478, + "grad_norm": 6.65350866317749, + "learning_rate": 8.72276277922434e-06, + "loss": 0.465, + "step": 10071 + }, + { + "epoch": 0.13662506782419967, + "grad_norm": 8.163008689880371, + "learning_rate": 8.722625736604085e-06, + "loss": 0.633, + "step": 10072 + }, + { + "epoch": 0.13663863266413456, + "grad_norm": 9.743778228759766, + "learning_rate": 8.72248869398383e-06, + "loss": 0.7611, + "step": 10073 + }, + { + "epoch": 0.13665219750406946, + "grad_norm": 6.203774929046631, + "learning_rate": 8.722351651363575e-06, + "loss": 0.5187, + "step": 10074 + }, + { + "epoch": 0.13666576234400435, + "grad_norm": 5.377631664276123, + "learning_rate": 8.72221460874332e-06, + "loss": 0.3997, + "step": 10075 + }, + { + "epoch": 0.13667932718393924, + "grad_norm": 7.058324337005615, + "learning_rate": 8.722077566123065e-06, + "loss": 0.4558, + "step": 10076 + }, + { + "epoch": 0.1366928920238741, + "grad_norm": 6.15090799331665, + "learning_rate": 8.72194052350281e-06, + "loss": 0.5863, + "step": 10077 + }, + { + "epoch": 0.136706456863809, + "grad_norm": 6.2216033935546875, + "learning_rate": 8.721803480882554e-06, + "loss": 0.3446, + "step": 10078 + }, + { + "epoch": 0.1367200217037439, + "grad_norm": 6.533848762512207, + "learning_rate": 8.721666438262301e-06, + "loss": 0.435, + "step": 10079 + }, + { + "epoch": 0.1367335865436788, + "grad_norm": 8.740541458129883, + "learning_rate": 8.721529395642046e-06, + "loss": 0.464, + "step": 10080 + }, + { + "epoch": 0.13674715138361368, + "grad_norm": 11.16641902923584, + "learning_rate": 8.72139235302179e-06, + "loss": 0.5546, + "step": 10081 + }, + { + "epoch": 0.13676071622354857, + "grad_norm": 6.164920330047607, + "learning_rate": 8.721255310401535e-06, + "loss": 0.432, + "step": 10082 + }, + { + "epoch": 0.13677428106348344, + "grad_norm": 6.0904083251953125, + "learning_rate": 8.721118267781282e-06, + "loss": 0.5749, + "step": 10083 + }, + { + "epoch": 0.13678784590341833, + "grad_norm": 5.704213619232178, + "learning_rate": 8.720981225161025e-06, + "loss": 0.4392, + "step": 10084 + }, + { + "epoch": 0.13680141074335322, + "grad_norm": 6.116606712341309, + "learning_rate": 8.72084418254077e-06, + "loss": 0.3844, + "step": 10085 + }, + { + "epoch": 0.13681497558328812, + "grad_norm": 6.4812750816345215, + "learning_rate": 8.720707139920516e-06, + "loss": 0.4338, + "step": 10086 + }, + { + "epoch": 0.136828540423223, + "grad_norm": 9.462328910827637, + "learning_rate": 8.720570097300262e-06, + "loss": 0.5095, + "step": 10087 + }, + { + "epoch": 0.1368421052631579, + "grad_norm": 4.836634159088135, + "learning_rate": 8.720433054680006e-06, + "loss": 0.3541, + "step": 10088 + }, + { + "epoch": 0.1368556701030928, + "grad_norm": 6.548897743225098, + "learning_rate": 8.720296012059751e-06, + "loss": 0.3672, + "step": 10089 + }, + { + "epoch": 0.13686923494302766, + "grad_norm": 7.918015480041504, + "learning_rate": 8.720158969439496e-06, + "loss": 0.4312, + "step": 10090 + }, + { + "epoch": 0.13688279978296256, + "grad_norm": 5.743622303009033, + "learning_rate": 8.720021926819241e-06, + "loss": 0.3575, + "step": 10091 + }, + { + "epoch": 0.13689636462289745, + "grad_norm": 5.4930033683776855, + "learning_rate": 8.719884884198987e-06, + "loss": 0.4772, + "step": 10092 + }, + { + "epoch": 0.13690992946283234, + "grad_norm": 4.7342939376831055, + "learning_rate": 8.719747841578732e-06, + "loss": 0.3105, + "step": 10093 + }, + { + "epoch": 0.13692349430276723, + "grad_norm": 8.832198143005371, + "learning_rate": 8.719610798958477e-06, + "loss": 0.4343, + "step": 10094 + }, + { + "epoch": 0.13693705914270213, + "grad_norm": 5.9080681800842285, + "learning_rate": 8.719473756338222e-06, + "loss": 0.349, + "step": 10095 + }, + { + "epoch": 0.136950623982637, + "grad_norm": 6.817637920379639, + "learning_rate": 8.719336713717967e-06, + "loss": 0.4934, + "step": 10096 + }, + { + "epoch": 0.1369641888225719, + "grad_norm": 6.2903361320495605, + "learning_rate": 8.719199671097713e-06, + "loss": 0.3557, + "step": 10097 + }, + { + "epoch": 0.13697775366250678, + "grad_norm": 7.2426042556762695, + "learning_rate": 8.719062628477458e-06, + "loss": 0.4204, + "step": 10098 + }, + { + "epoch": 0.13699131850244167, + "grad_norm": 8.84666919708252, + "learning_rate": 8.718925585857201e-06, + "loss": 0.5658, + "step": 10099 + }, + { + "epoch": 0.13700488334237657, + "grad_norm": 7.051046371459961, + "learning_rate": 8.718788543236948e-06, + "loss": 0.4836, + "step": 10100 + }, + { + "epoch": 0.13701844818231146, + "grad_norm": 6.833146572113037, + "learning_rate": 8.718651500616693e-06, + "loss": 0.4656, + "step": 10101 + }, + { + "epoch": 0.13703201302224632, + "grad_norm": 6.872221946716309, + "learning_rate": 8.718514457996438e-06, + "loss": 0.5354, + "step": 10102 + }, + { + "epoch": 0.13704557786218122, + "grad_norm": 6.686950206756592, + "learning_rate": 8.718377415376182e-06, + "loss": 0.3503, + "step": 10103 + }, + { + "epoch": 0.1370591427021161, + "grad_norm": 7.395849704742432, + "learning_rate": 8.718240372755927e-06, + "loss": 0.4335, + "step": 10104 + }, + { + "epoch": 0.137072707542051, + "grad_norm": 7.07086181640625, + "learning_rate": 8.718103330135674e-06, + "loss": 0.3507, + "step": 10105 + }, + { + "epoch": 0.1370862723819859, + "grad_norm": 6.6025824546813965, + "learning_rate": 8.717966287515417e-06, + "loss": 0.3958, + "step": 10106 + }, + { + "epoch": 0.1370998372219208, + "grad_norm": 6.936326026916504, + "learning_rate": 8.717829244895163e-06, + "loss": 0.2426, + "step": 10107 + }, + { + "epoch": 0.13711340206185568, + "grad_norm": 7.390388011932373, + "learning_rate": 8.717692202274908e-06, + "loss": 0.3664, + "step": 10108 + }, + { + "epoch": 0.13712696690179055, + "grad_norm": 6.712927341461182, + "learning_rate": 8.717555159654653e-06, + "loss": 0.3623, + "step": 10109 + }, + { + "epoch": 0.13714053174172544, + "grad_norm": 5.834977626800537, + "learning_rate": 8.717418117034398e-06, + "loss": 0.3677, + "step": 10110 + }, + { + "epoch": 0.13715409658166033, + "grad_norm": 5.950664520263672, + "learning_rate": 8.717281074414143e-06, + "loss": 0.2155, + "step": 10111 + }, + { + "epoch": 0.13716766142159523, + "grad_norm": 7.261538028717041, + "learning_rate": 8.717144031793889e-06, + "loss": 0.3535, + "step": 10112 + }, + { + "epoch": 0.13718122626153012, + "grad_norm": 5.918980598449707, + "learning_rate": 8.717006989173634e-06, + "loss": 0.3221, + "step": 10113 + }, + { + "epoch": 0.137194791101465, + "grad_norm": 6.93170166015625, + "learning_rate": 8.716869946553379e-06, + "loss": 0.3234, + "step": 10114 + }, + { + "epoch": 0.13720835594139988, + "grad_norm": 4.5634002685546875, + "learning_rate": 8.716732903933124e-06, + "loss": 0.2365, + "step": 10115 + }, + { + "epoch": 0.13722192078133477, + "grad_norm": 8.774062156677246, + "learning_rate": 8.71659586131287e-06, + "loss": 0.4723, + "step": 10116 + }, + { + "epoch": 0.13723548562126966, + "grad_norm": 6.883525848388672, + "learning_rate": 8.716458818692614e-06, + "loss": 0.5115, + "step": 10117 + }, + { + "epoch": 0.13724905046120456, + "grad_norm": 6.674900054931641, + "learning_rate": 8.71632177607236e-06, + "loss": 0.4273, + "step": 10118 + }, + { + "epoch": 0.13726261530113945, + "grad_norm": 6.6046013832092285, + "learning_rate": 8.716184733452105e-06, + "loss": 0.2779, + "step": 10119 + }, + { + "epoch": 0.13727618014107434, + "grad_norm": 6.402468681335449, + "learning_rate": 8.71604769083185e-06, + "loss": 0.4256, + "step": 10120 + }, + { + "epoch": 0.13728974498100924, + "grad_norm": 6.615974426269531, + "learning_rate": 8.715910648211593e-06, + "loss": 0.3716, + "step": 10121 + }, + { + "epoch": 0.1373033098209441, + "grad_norm": 7.300879955291748, + "learning_rate": 8.71577360559134e-06, + "loss": 0.4264, + "step": 10122 + }, + { + "epoch": 0.137316874660879, + "grad_norm": 5.843977928161621, + "learning_rate": 8.715636562971085e-06, + "loss": 0.373, + "step": 10123 + }, + { + "epoch": 0.1373304395008139, + "grad_norm": 5.25613260269165, + "learning_rate": 8.715499520350829e-06, + "loss": 0.3201, + "step": 10124 + }, + { + "epoch": 0.13734400434074878, + "grad_norm": 8.30195426940918, + "learning_rate": 8.715362477730574e-06, + "loss": 0.4114, + "step": 10125 + }, + { + "epoch": 0.13735756918068368, + "grad_norm": 7.3032941818237305, + "learning_rate": 8.715225435110321e-06, + "loss": 0.3596, + "step": 10126 + }, + { + "epoch": 0.13737113402061857, + "grad_norm": 5.9893598556518555, + "learning_rate": 8.715088392490064e-06, + "loss": 0.3809, + "step": 10127 + }, + { + "epoch": 0.13738469886055343, + "grad_norm": 4.272173881530762, + "learning_rate": 8.71495134986981e-06, + "loss": 0.21, + "step": 10128 + }, + { + "epoch": 0.13739826370048833, + "grad_norm": 7.894199371337891, + "learning_rate": 8.714814307249555e-06, + "loss": 0.3745, + "step": 10129 + }, + { + "epoch": 0.13741182854042322, + "grad_norm": 6.560247898101807, + "learning_rate": 8.714677264629302e-06, + "loss": 0.4929, + "step": 10130 + }, + { + "epoch": 0.1374253933803581, + "grad_norm": 8.429802894592285, + "learning_rate": 8.714540222009045e-06, + "loss": 0.3987, + "step": 10131 + }, + { + "epoch": 0.137438958220293, + "grad_norm": 4.456062316894531, + "learning_rate": 8.71440317938879e-06, + "loss": 0.2259, + "step": 10132 + }, + { + "epoch": 0.1374525230602279, + "grad_norm": 5.90196418762207, + "learning_rate": 8.714266136768536e-06, + "loss": 0.4033, + "step": 10133 + }, + { + "epoch": 0.13746608790016276, + "grad_norm": 6.000312805175781, + "learning_rate": 8.71412909414828e-06, + "loss": 0.3231, + "step": 10134 + }, + { + "epoch": 0.13747965274009766, + "grad_norm": 7.394511699676514, + "learning_rate": 8.713992051528026e-06, + "loss": 0.5061, + "step": 10135 + }, + { + "epoch": 0.13749321758003255, + "grad_norm": 7.354433536529541, + "learning_rate": 8.713855008907771e-06, + "loss": 0.5158, + "step": 10136 + }, + { + "epoch": 0.13750678241996744, + "grad_norm": 9.1061429977417, + "learning_rate": 8.713717966287516e-06, + "loss": 0.7793, + "step": 10137 + }, + { + "epoch": 0.13752034725990234, + "grad_norm": 8.003728866577148, + "learning_rate": 8.713580923667261e-06, + "loss": 0.5283, + "step": 10138 + }, + { + "epoch": 0.13753391209983723, + "grad_norm": 6.630816459655762, + "learning_rate": 8.713443881047007e-06, + "loss": 0.5398, + "step": 10139 + }, + { + "epoch": 0.13754747693977212, + "grad_norm": 8.019272804260254, + "learning_rate": 8.713306838426752e-06, + "loss": 0.4002, + "step": 10140 + }, + { + "epoch": 0.137561041779707, + "grad_norm": 5.809673309326172, + "learning_rate": 8.713169795806497e-06, + "loss": 0.4779, + "step": 10141 + }, + { + "epoch": 0.13757460661964188, + "grad_norm": 6.583925724029541, + "learning_rate": 8.71303275318624e-06, + "loss": 0.3073, + "step": 10142 + }, + { + "epoch": 0.13758817145957677, + "grad_norm": 7.090178966522217, + "learning_rate": 8.712895710565987e-06, + "loss": 0.58, + "step": 10143 + }, + { + "epoch": 0.13760173629951167, + "grad_norm": 9.00337028503418, + "learning_rate": 8.712758667945733e-06, + "loss": 0.7362, + "step": 10144 + }, + { + "epoch": 0.13761530113944656, + "grad_norm": 7.132111072540283, + "learning_rate": 8.712621625325478e-06, + "loss": 0.4039, + "step": 10145 + }, + { + "epoch": 0.13762886597938145, + "grad_norm": 6.3219990730285645, + "learning_rate": 8.712484582705221e-06, + "loss": 0.4128, + "step": 10146 + }, + { + "epoch": 0.13764243081931632, + "grad_norm": 9.198487281799316, + "learning_rate": 8.712347540084966e-06, + "loss": 0.6803, + "step": 10147 + }, + { + "epoch": 0.1376559956592512, + "grad_norm": 7.358509540557861, + "learning_rate": 8.712210497464713e-06, + "loss": 0.5299, + "step": 10148 + }, + { + "epoch": 0.1376695604991861, + "grad_norm": 7.841756343841553, + "learning_rate": 8.712073454844457e-06, + "loss": 0.7988, + "step": 10149 + }, + { + "epoch": 0.137683125339121, + "grad_norm": 9.13832950592041, + "learning_rate": 8.711936412224202e-06, + "loss": 0.6934, + "step": 10150 + }, + { + "epoch": 0.1376966901790559, + "grad_norm": 6.529710292816162, + "learning_rate": 8.711799369603947e-06, + "loss": 0.3248, + "step": 10151 + }, + { + "epoch": 0.13771025501899078, + "grad_norm": 7.345498561859131, + "learning_rate": 8.711662326983692e-06, + "loss": 0.5338, + "step": 10152 + }, + { + "epoch": 0.13772381985892568, + "grad_norm": 6.184235572814941, + "learning_rate": 8.711525284363437e-06, + "loss": 0.4089, + "step": 10153 + }, + { + "epoch": 0.13773738469886054, + "grad_norm": 6.92579460144043, + "learning_rate": 8.711388241743183e-06, + "loss": 0.4441, + "step": 10154 + }, + { + "epoch": 0.13775094953879544, + "grad_norm": 6.411614894866943, + "learning_rate": 8.711251199122928e-06, + "loss": 0.4183, + "step": 10155 + }, + { + "epoch": 0.13776451437873033, + "grad_norm": 6.307250499725342, + "learning_rate": 8.711114156502673e-06, + "loss": 0.4737, + "step": 10156 + }, + { + "epoch": 0.13777807921866522, + "grad_norm": 5.446476936340332, + "learning_rate": 8.710977113882418e-06, + "loss": 0.3167, + "step": 10157 + }, + { + "epoch": 0.13779164405860012, + "grad_norm": 7.862937927246094, + "learning_rate": 8.710840071262163e-06, + "loss": 0.6375, + "step": 10158 + }, + { + "epoch": 0.137805208898535, + "grad_norm": 7.053139686584473, + "learning_rate": 8.710703028641909e-06, + "loss": 0.2637, + "step": 10159 + }, + { + "epoch": 0.13781877373846987, + "grad_norm": 8.755316734313965, + "learning_rate": 8.710565986021654e-06, + "loss": 0.51, + "step": 10160 + }, + { + "epoch": 0.13783233857840477, + "grad_norm": 5.413143157958984, + "learning_rate": 8.710428943401399e-06, + "loss": 0.3295, + "step": 10161 + }, + { + "epoch": 0.13784590341833966, + "grad_norm": 9.035380363464355, + "learning_rate": 8.710291900781144e-06, + "loss": 0.4983, + "step": 10162 + }, + { + "epoch": 0.13785946825827455, + "grad_norm": 9.78725814819336, + "learning_rate": 8.71015485816089e-06, + "loss": 0.7721, + "step": 10163 + }, + { + "epoch": 0.13787303309820945, + "grad_norm": 7.48758602142334, + "learning_rate": 8.710017815540633e-06, + "loss": 0.6409, + "step": 10164 + }, + { + "epoch": 0.13788659793814434, + "grad_norm": 6.234004974365234, + "learning_rate": 8.70988077292038e-06, + "loss": 0.3281, + "step": 10165 + }, + { + "epoch": 0.1379001627780792, + "grad_norm": 6.699772834777832, + "learning_rate": 8.709743730300125e-06, + "loss": 0.4533, + "step": 10166 + }, + { + "epoch": 0.1379137276180141, + "grad_norm": 5.471526622772217, + "learning_rate": 8.709606687679868e-06, + "loss": 0.4324, + "step": 10167 + }, + { + "epoch": 0.137927292457949, + "grad_norm": 9.581517219543457, + "learning_rate": 8.709469645059613e-06, + "loss": 0.7025, + "step": 10168 + }, + { + "epoch": 0.13794085729788388, + "grad_norm": 6.666987895965576, + "learning_rate": 8.70933260243936e-06, + "loss": 0.5762, + "step": 10169 + }, + { + "epoch": 0.13795442213781878, + "grad_norm": 8.651838302612305, + "learning_rate": 8.709195559819105e-06, + "loss": 0.6159, + "step": 10170 + }, + { + "epoch": 0.13796798697775367, + "grad_norm": 5.314711570739746, + "learning_rate": 8.709058517198849e-06, + "loss": 0.4044, + "step": 10171 + }, + { + "epoch": 0.13798155181768856, + "grad_norm": 7.545529365539551, + "learning_rate": 8.708921474578594e-06, + "loss": 0.4759, + "step": 10172 + }, + { + "epoch": 0.13799511665762343, + "grad_norm": 6.028468132019043, + "learning_rate": 8.70878443195834e-06, + "loss": 0.3918, + "step": 10173 + }, + { + "epoch": 0.13800868149755832, + "grad_norm": 9.011685371398926, + "learning_rate": 8.708647389338085e-06, + "loss": 0.5263, + "step": 10174 + }, + { + "epoch": 0.13802224633749322, + "grad_norm": 8.431238174438477, + "learning_rate": 8.70851034671783e-06, + "loss": 0.4616, + "step": 10175 + }, + { + "epoch": 0.1380358111774281, + "grad_norm": 7.559764862060547, + "learning_rate": 8.708373304097575e-06, + "loss": 0.6712, + "step": 10176 + }, + { + "epoch": 0.138049376017363, + "grad_norm": 6.627633571624756, + "learning_rate": 8.70823626147732e-06, + "loss": 0.3717, + "step": 10177 + }, + { + "epoch": 0.1380629408572979, + "grad_norm": 8.196599960327148, + "learning_rate": 8.708099218857065e-06, + "loss": 0.5636, + "step": 10178 + }, + { + "epoch": 0.13807650569723276, + "grad_norm": 7.128231525421143, + "learning_rate": 8.70796217623681e-06, + "loss": 0.445, + "step": 10179 + }, + { + "epoch": 0.13809007053716765, + "grad_norm": 5.122460842132568, + "learning_rate": 8.707825133616556e-06, + "loss": 0.3864, + "step": 10180 + }, + { + "epoch": 0.13810363537710255, + "grad_norm": 8.806297302246094, + "learning_rate": 8.7076880909963e-06, + "loss": 0.6456, + "step": 10181 + }, + { + "epoch": 0.13811720021703744, + "grad_norm": 6.810640811920166, + "learning_rate": 8.707551048376046e-06, + "loss": 0.5278, + "step": 10182 + }, + { + "epoch": 0.13813076505697233, + "grad_norm": 6.722137451171875, + "learning_rate": 8.707414005755791e-06, + "loss": 0.3733, + "step": 10183 + }, + { + "epoch": 0.13814432989690723, + "grad_norm": 6.821512222290039, + "learning_rate": 8.707276963135536e-06, + "loss": 0.4087, + "step": 10184 + }, + { + "epoch": 0.13815789473684212, + "grad_norm": 6.69087028503418, + "learning_rate": 8.707139920515281e-06, + "loss": 0.3947, + "step": 10185 + }, + { + "epoch": 0.13817145957677698, + "grad_norm": 8.37869644165039, + "learning_rate": 8.707002877895025e-06, + "loss": 0.5272, + "step": 10186 + }, + { + "epoch": 0.13818502441671188, + "grad_norm": 8.551314353942871, + "learning_rate": 8.706865835274772e-06, + "loss": 0.6556, + "step": 10187 + }, + { + "epoch": 0.13819858925664677, + "grad_norm": 7.603114604949951, + "learning_rate": 8.706728792654517e-06, + "loss": 0.5071, + "step": 10188 + }, + { + "epoch": 0.13821215409658166, + "grad_norm": 7.378667831420898, + "learning_rate": 8.70659175003426e-06, + "loss": 0.5109, + "step": 10189 + }, + { + "epoch": 0.13822571893651656, + "grad_norm": 7.359630107879639, + "learning_rate": 8.706454707414006e-06, + "loss": 0.3197, + "step": 10190 + }, + { + "epoch": 0.13823928377645145, + "grad_norm": 8.175662994384766, + "learning_rate": 8.706317664793753e-06, + "loss": 0.4773, + "step": 10191 + }, + { + "epoch": 0.13825284861638631, + "grad_norm": 5.746622085571289, + "learning_rate": 8.706180622173496e-06, + "loss": 0.3318, + "step": 10192 + }, + { + "epoch": 0.1382664134563212, + "grad_norm": 5.8903608322143555, + "learning_rate": 8.706043579553241e-06, + "loss": 0.3607, + "step": 10193 + }, + { + "epoch": 0.1382799782962561, + "grad_norm": 8.085481643676758, + "learning_rate": 8.705906536932986e-06, + "loss": 0.381, + "step": 10194 + }, + { + "epoch": 0.138293543136191, + "grad_norm": 8.668500900268555, + "learning_rate": 8.705769494312733e-06, + "loss": 0.7344, + "step": 10195 + }, + { + "epoch": 0.1383071079761259, + "grad_norm": 9.3025484085083, + "learning_rate": 8.705632451692477e-06, + "loss": 0.6146, + "step": 10196 + }, + { + "epoch": 0.13832067281606078, + "grad_norm": 8.560988426208496, + "learning_rate": 8.705495409072222e-06, + "loss": 0.8125, + "step": 10197 + }, + { + "epoch": 0.13833423765599565, + "grad_norm": 8.670324325561523, + "learning_rate": 8.705358366451967e-06, + "loss": 0.6228, + "step": 10198 + }, + { + "epoch": 0.13834780249593054, + "grad_norm": 7.546762466430664, + "learning_rate": 8.705221323831712e-06, + "loss": 0.5624, + "step": 10199 + }, + { + "epoch": 0.13836136733586543, + "grad_norm": 6.964473724365234, + "learning_rate": 8.705084281211457e-06, + "loss": 0.487, + "step": 10200 + }, + { + "epoch": 0.13837493217580032, + "grad_norm": 6.42946720123291, + "learning_rate": 8.704947238591203e-06, + "loss": 0.3802, + "step": 10201 + }, + { + "epoch": 0.13838849701573522, + "grad_norm": 6.235193252563477, + "learning_rate": 8.704810195970948e-06, + "loss": 0.3992, + "step": 10202 + }, + { + "epoch": 0.1384020618556701, + "grad_norm": 7.453510284423828, + "learning_rate": 8.704673153350693e-06, + "loss": 0.5527, + "step": 10203 + }, + { + "epoch": 0.138415626695605, + "grad_norm": 8.286497116088867, + "learning_rate": 8.704536110730438e-06, + "loss": 0.457, + "step": 10204 + }, + { + "epoch": 0.13842919153553987, + "grad_norm": 5.747398376464844, + "learning_rate": 8.704399068110183e-06, + "loss": 0.406, + "step": 10205 + }, + { + "epoch": 0.13844275637547476, + "grad_norm": 7.36131477355957, + "learning_rate": 8.704262025489929e-06, + "loss": 0.5177, + "step": 10206 + }, + { + "epoch": 0.13845632121540966, + "grad_norm": 8.098776817321777, + "learning_rate": 8.704124982869672e-06, + "loss": 0.6368, + "step": 10207 + }, + { + "epoch": 0.13846988605534455, + "grad_norm": 8.759142875671387, + "learning_rate": 8.703987940249419e-06, + "loss": 0.499, + "step": 10208 + }, + { + "epoch": 0.13848345089527944, + "grad_norm": 7.778761386871338, + "learning_rate": 8.703850897629164e-06, + "loss": 0.4241, + "step": 10209 + }, + { + "epoch": 0.13849701573521433, + "grad_norm": 8.46677017211914, + "learning_rate": 8.70371385500891e-06, + "loss": 0.4837, + "step": 10210 + }, + { + "epoch": 0.1385105805751492, + "grad_norm": 7.378601551055908, + "learning_rate": 8.703576812388653e-06, + "loss": 0.3044, + "step": 10211 + }, + { + "epoch": 0.1385241454150841, + "grad_norm": 5.717768669128418, + "learning_rate": 8.7034397697684e-06, + "loss": 0.3819, + "step": 10212 + }, + { + "epoch": 0.138537710255019, + "grad_norm": 8.661169052124023, + "learning_rate": 8.703302727148145e-06, + "loss": 0.6197, + "step": 10213 + }, + { + "epoch": 0.13855127509495388, + "grad_norm": 7.845874786376953, + "learning_rate": 8.703165684527888e-06, + "loss": 0.4559, + "step": 10214 + }, + { + "epoch": 0.13856483993488877, + "grad_norm": 6.058469772338867, + "learning_rate": 8.703028641907633e-06, + "loss": 0.391, + "step": 10215 + }, + { + "epoch": 0.13857840477482367, + "grad_norm": 10.576333999633789, + "learning_rate": 8.702891599287379e-06, + "loss": 0.6367, + "step": 10216 + }, + { + "epoch": 0.13859196961475856, + "grad_norm": 5.198177814483643, + "learning_rate": 8.702754556667124e-06, + "loss": 0.3371, + "step": 10217 + }, + { + "epoch": 0.13860553445469342, + "grad_norm": 6.490383625030518, + "learning_rate": 8.702617514046869e-06, + "loss": 0.4732, + "step": 10218 + }, + { + "epoch": 0.13861909929462832, + "grad_norm": 8.241561889648438, + "learning_rate": 8.702480471426614e-06, + "loss": 0.5271, + "step": 10219 + }, + { + "epoch": 0.1386326641345632, + "grad_norm": 9.331195831298828, + "learning_rate": 8.70234342880636e-06, + "loss": 0.674, + "step": 10220 + }, + { + "epoch": 0.1386462289744981, + "grad_norm": 5.136390209197998, + "learning_rate": 8.702206386186105e-06, + "loss": 0.3271, + "step": 10221 + }, + { + "epoch": 0.138659793814433, + "grad_norm": 5.324254989624023, + "learning_rate": 8.70206934356585e-06, + "loss": 0.3344, + "step": 10222 + }, + { + "epoch": 0.1386733586543679, + "grad_norm": 5.636989593505859, + "learning_rate": 8.701932300945595e-06, + "loss": 0.5203, + "step": 10223 + }, + { + "epoch": 0.13868692349430276, + "grad_norm": 9.434192657470703, + "learning_rate": 8.70179525832534e-06, + "loss": 0.5399, + "step": 10224 + }, + { + "epoch": 0.13870048833423765, + "grad_norm": 5.929501056671143, + "learning_rate": 8.701658215705085e-06, + "loss": 0.3711, + "step": 10225 + }, + { + "epoch": 0.13871405317417254, + "grad_norm": 6.108343124389648, + "learning_rate": 8.70152117308483e-06, + "loss": 0.371, + "step": 10226 + }, + { + "epoch": 0.13872761801410743, + "grad_norm": 7.824246406555176, + "learning_rate": 8.701384130464576e-06, + "loss": 0.4636, + "step": 10227 + }, + { + "epoch": 0.13874118285404233, + "grad_norm": 6.649259567260742, + "learning_rate": 8.70124708784432e-06, + "loss": 0.5311, + "step": 10228 + }, + { + "epoch": 0.13875474769397722, + "grad_norm": 7.501158714294434, + "learning_rate": 8.701110045224064e-06, + "loss": 0.4354, + "step": 10229 + }, + { + "epoch": 0.13876831253391209, + "grad_norm": 6.847817420959473, + "learning_rate": 8.700973002603811e-06, + "loss": 0.5431, + "step": 10230 + }, + { + "epoch": 0.13878187737384698, + "grad_norm": 6.009395122528076, + "learning_rate": 8.700835959983556e-06, + "loss": 0.3906, + "step": 10231 + }, + { + "epoch": 0.13879544221378187, + "grad_norm": 7.6139655113220215, + "learning_rate": 8.7006989173633e-06, + "loss": 0.4953, + "step": 10232 + }, + { + "epoch": 0.13880900705371677, + "grad_norm": 5.9876885414123535, + "learning_rate": 8.700561874743045e-06, + "loss": 0.4017, + "step": 10233 + }, + { + "epoch": 0.13882257189365166, + "grad_norm": 6.311351299285889, + "learning_rate": 8.700424832122792e-06, + "loss": 0.352, + "step": 10234 + }, + { + "epoch": 0.13883613673358655, + "grad_norm": 8.56635570526123, + "learning_rate": 8.700287789502535e-06, + "loss": 0.5718, + "step": 10235 + }, + { + "epoch": 0.13884970157352144, + "grad_norm": 6.575900554656982, + "learning_rate": 8.70015074688228e-06, + "loss": 0.3188, + "step": 10236 + }, + { + "epoch": 0.1388632664134563, + "grad_norm": 7.234378337860107, + "learning_rate": 8.700013704262026e-06, + "loss": 0.5165, + "step": 10237 + }, + { + "epoch": 0.1388768312533912, + "grad_norm": 6.714644432067871, + "learning_rate": 8.699876661641773e-06, + "loss": 0.6889, + "step": 10238 + }, + { + "epoch": 0.1388903960933261, + "grad_norm": 8.005627632141113, + "learning_rate": 8.699739619021516e-06, + "loss": 0.5012, + "step": 10239 + }, + { + "epoch": 0.138903960933261, + "grad_norm": 7.709902286529541, + "learning_rate": 8.699602576401261e-06, + "loss": 0.602, + "step": 10240 + }, + { + "epoch": 0.13891752577319588, + "grad_norm": 6.218051433563232, + "learning_rate": 8.699465533781006e-06, + "loss": 0.5512, + "step": 10241 + }, + { + "epoch": 0.13893109061313078, + "grad_norm": 6.360171318054199, + "learning_rate": 8.699328491160752e-06, + "loss": 0.4842, + "step": 10242 + }, + { + "epoch": 0.13894465545306564, + "grad_norm": 8.309268951416016, + "learning_rate": 8.699191448540497e-06, + "loss": 0.6292, + "step": 10243 + }, + { + "epoch": 0.13895822029300053, + "grad_norm": 7.0335798263549805, + "learning_rate": 8.699054405920242e-06, + "loss": 0.4244, + "step": 10244 + }, + { + "epoch": 0.13897178513293543, + "grad_norm": 6.711827754974365, + "learning_rate": 8.698917363299987e-06, + "loss": 0.4151, + "step": 10245 + }, + { + "epoch": 0.13898534997287032, + "grad_norm": 7.2249932289123535, + "learning_rate": 8.698780320679732e-06, + "loss": 0.3507, + "step": 10246 + }, + { + "epoch": 0.1389989148128052, + "grad_norm": 6.109934329986572, + "learning_rate": 8.698643278059477e-06, + "loss": 0.5062, + "step": 10247 + }, + { + "epoch": 0.1390124796527401, + "grad_norm": 6.928986072540283, + "learning_rate": 8.698506235439223e-06, + "loss": 0.3231, + "step": 10248 + }, + { + "epoch": 0.139026044492675, + "grad_norm": 6.2490739822387695, + "learning_rate": 8.698369192818968e-06, + "loss": 0.5073, + "step": 10249 + }, + { + "epoch": 0.13903960933260986, + "grad_norm": 7.857712268829346, + "learning_rate": 8.698232150198711e-06, + "loss": 0.578, + "step": 10250 + }, + { + "epoch": 0.13905317417254476, + "grad_norm": 6.289180755615234, + "learning_rate": 8.698095107578458e-06, + "loss": 0.2886, + "step": 10251 + }, + { + "epoch": 0.13906673901247965, + "grad_norm": 5.4746832847595215, + "learning_rate": 8.697958064958203e-06, + "loss": 0.5485, + "step": 10252 + }, + { + "epoch": 0.13908030385241454, + "grad_norm": 6.593144416809082, + "learning_rate": 8.697821022337949e-06, + "loss": 0.3683, + "step": 10253 + }, + { + "epoch": 0.13909386869234944, + "grad_norm": 7.090428829193115, + "learning_rate": 8.697683979717692e-06, + "loss": 0.701, + "step": 10254 + }, + { + "epoch": 0.13910743353228433, + "grad_norm": 7.213279724121094, + "learning_rate": 8.697546937097437e-06, + "loss": 0.3735, + "step": 10255 + }, + { + "epoch": 0.1391209983722192, + "grad_norm": 6.4977617263793945, + "learning_rate": 8.697409894477184e-06, + "loss": 0.3744, + "step": 10256 + }, + { + "epoch": 0.1391345632121541, + "grad_norm": 5.391493797302246, + "learning_rate": 8.697272851856928e-06, + "loss": 0.3576, + "step": 10257 + }, + { + "epoch": 0.13914812805208898, + "grad_norm": 8.694579124450684, + "learning_rate": 8.697135809236673e-06, + "loss": 0.5949, + "step": 10258 + }, + { + "epoch": 0.13916169289202387, + "grad_norm": 7.577820777893066, + "learning_rate": 8.696998766616418e-06, + "loss": 0.4514, + "step": 10259 + }, + { + "epoch": 0.13917525773195877, + "grad_norm": 7.776391506195068, + "learning_rate": 8.696861723996163e-06, + "loss": 0.3994, + "step": 10260 + }, + { + "epoch": 0.13918882257189366, + "grad_norm": 5.132401943206787, + "learning_rate": 8.696724681375908e-06, + "loss": 0.2639, + "step": 10261 + }, + { + "epoch": 0.13920238741182855, + "grad_norm": 7.4593353271484375, + "learning_rate": 8.696587638755653e-06, + "loss": 0.4685, + "step": 10262 + }, + { + "epoch": 0.13921595225176342, + "grad_norm": 6.2701921463012695, + "learning_rate": 8.696450596135399e-06, + "loss": 0.3472, + "step": 10263 + }, + { + "epoch": 0.1392295170916983, + "grad_norm": 4.249057769775391, + "learning_rate": 8.696313553515144e-06, + "loss": 0.3474, + "step": 10264 + }, + { + "epoch": 0.1392430819316332, + "grad_norm": 5.047098636627197, + "learning_rate": 8.696176510894889e-06, + "loss": 0.2753, + "step": 10265 + }, + { + "epoch": 0.1392566467715681, + "grad_norm": 7.019970417022705, + "learning_rate": 8.696039468274634e-06, + "loss": 0.5691, + "step": 10266 + }, + { + "epoch": 0.139270211611503, + "grad_norm": 5.100500106811523, + "learning_rate": 8.69590242565438e-06, + "loss": 0.1996, + "step": 10267 + }, + { + "epoch": 0.13928377645143789, + "grad_norm": 5.919683933258057, + "learning_rate": 8.695765383034125e-06, + "loss": 0.3683, + "step": 10268 + }, + { + "epoch": 0.13929734129137275, + "grad_norm": 8.804198265075684, + "learning_rate": 8.69562834041387e-06, + "loss": 0.5499, + "step": 10269 + }, + { + "epoch": 0.13931090613130764, + "grad_norm": 6.643955707550049, + "learning_rate": 8.695491297793615e-06, + "loss": 0.5694, + "step": 10270 + }, + { + "epoch": 0.13932447097124254, + "grad_norm": 5.961172580718994, + "learning_rate": 8.69535425517336e-06, + "loss": 0.2626, + "step": 10271 + }, + { + "epoch": 0.13933803581117743, + "grad_norm": 5.550355434417725, + "learning_rate": 8.695217212553104e-06, + "loss": 0.3491, + "step": 10272 + }, + { + "epoch": 0.13935160065111232, + "grad_norm": 8.643507957458496, + "learning_rate": 8.69508016993285e-06, + "loss": 0.4841, + "step": 10273 + }, + { + "epoch": 0.13936516549104722, + "grad_norm": 5.687705039978027, + "learning_rate": 8.694943127312596e-06, + "loss": 0.3317, + "step": 10274 + }, + { + "epoch": 0.13937873033098208, + "grad_norm": 6.865848541259766, + "learning_rate": 8.694806084692339e-06, + "loss": 0.4129, + "step": 10275 + }, + { + "epoch": 0.13939229517091697, + "grad_norm": 5.89661979675293, + "learning_rate": 8.694669042072084e-06, + "loss": 0.3522, + "step": 10276 + }, + { + "epoch": 0.13940586001085187, + "grad_norm": 4.863620281219482, + "learning_rate": 8.694531999451831e-06, + "loss": 0.2263, + "step": 10277 + }, + { + "epoch": 0.13941942485078676, + "grad_norm": 5.13885498046875, + "learning_rate": 8.694394956831576e-06, + "loss": 0.3389, + "step": 10278 + }, + { + "epoch": 0.13943298969072165, + "grad_norm": 6.504998207092285, + "learning_rate": 8.69425791421132e-06, + "loss": 0.2401, + "step": 10279 + }, + { + "epoch": 0.13944655453065655, + "grad_norm": 6.168881893157959, + "learning_rate": 8.694120871591065e-06, + "loss": 0.3127, + "step": 10280 + }, + { + "epoch": 0.13946011937059144, + "grad_norm": 5.066605091094971, + "learning_rate": 8.693983828970812e-06, + "loss": 0.4806, + "step": 10281 + }, + { + "epoch": 0.1394736842105263, + "grad_norm": 6.454610824584961, + "learning_rate": 8.693846786350555e-06, + "loss": 0.2532, + "step": 10282 + }, + { + "epoch": 0.1394872490504612, + "grad_norm": 6.599959850311279, + "learning_rate": 8.6937097437303e-06, + "loss": 0.4058, + "step": 10283 + }, + { + "epoch": 0.1395008138903961, + "grad_norm": 4.9367218017578125, + "learning_rate": 8.693572701110046e-06, + "loss": 0.2904, + "step": 10284 + }, + { + "epoch": 0.13951437873033098, + "grad_norm": 6.719346523284912, + "learning_rate": 8.693435658489791e-06, + "loss": 0.3914, + "step": 10285 + }, + { + "epoch": 0.13952794357026588, + "grad_norm": 6.138978004455566, + "learning_rate": 8.693298615869536e-06, + "loss": 0.3435, + "step": 10286 + }, + { + "epoch": 0.13954150841020077, + "grad_norm": 3.9593074321746826, + "learning_rate": 8.693161573249281e-06, + "loss": 0.2698, + "step": 10287 + }, + { + "epoch": 0.13955507325013564, + "grad_norm": 8.075510025024414, + "learning_rate": 8.693024530629026e-06, + "loss": 0.4464, + "step": 10288 + }, + { + "epoch": 0.13956863809007053, + "grad_norm": 4.084163665771484, + "learning_rate": 8.692887488008772e-06, + "loss": 0.3199, + "step": 10289 + }, + { + "epoch": 0.13958220293000542, + "grad_norm": 4.891336917877197, + "learning_rate": 8.692750445388517e-06, + "loss": 0.314, + "step": 10290 + }, + { + "epoch": 0.13959576776994032, + "grad_norm": 5.0926971435546875, + "learning_rate": 8.692613402768262e-06, + "loss": 0.3267, + "step": 10291 + }, + { + "epoch": 0.1396093326098752, + "grad_norm": 5.464116096496582, + "learning_rate": 8.692476360148007e-06, + "loss": 0.3399, + "step": 10292 + }, + { + "epoch": 0.1396228974498101, + "grad_norm": 5.72117805480957, + "learning_rate": 8.692339317527752e-06, + "loss": 0.3511, + "step": 10293 + }, + { + "epoch": 0.139636462289745, + "grad_norm": 7.62744140625, + "learning_rate": 8.692202274907498e-06, + "loss": 0.4416, + "step": 10294 + }, + { + "epoch": 0.13965002712967986, + "grad_norm": 5.634842395782471, + "learning_rate": 8.692065232287243e-06, + "loss": 0.3477, + "step": 10295 + }, + { + "epoch": 0.13966359196961475, + "grad_norm": 5.9094038009643555, + "learning_rate": 8.691928189666988e-06, + "loss": 0.2907, + "step": 10296 + }, + { + "epoch": 0.13967715680954965, + "grad_norm": 5.200460433959961, + "learning_rate": 8.691791147046731e-06, + "loss": 0.2833, + "step": 10297 + }, + { + "epoch": 0.13969072164948454, + "grad_norm": 5.769811153411865, + "learning_rate": 8.691654104426477e-06, + "loss": 0.2986, + "step": 10298 + }, + { + "epoch": 0.13970428648941943, + "grad_norm": 6.51617431640625, + "learning_rate": 8.691517061806223e-06, + "loss": 0.4192, + "step": 10299 + }, + { + "epoch": 0.13971785132935433, + "grad_norm": 7.37066650390625, + "learning_rate": 8.691380019185967e-06, + "loss": 0.3588, + "step": 10300 + }, + { + "epoch": 0.1397314161692892, + "grad_norm": 5.797913074493408, + "learning_rate": 8.691242976565712e-06, + "loss": 0.3506, + "step": 10301 + }, + { + "epoch": 0.13974498100922408, + "grad_norm": 6.710554599761963, + "learning_rate": 8.691105933945457e-06, + "loss": 0.4737, + "step": 10302 + }, + { + "epoch": 0.13975854584915898, + "grad_norm": 5.520692348480225, + "learning_rate": 8.690968891325202e-06, + "loss": 0.2396, + "step": 10303 + }, + { + "epoch": 0.13977211068909387, + "grad_norm": 7.518486499786377, + "learning_rate": 8.690831848704948e-06, + "loss": 0.4156, + "step": 10304 + }, + { + "epoch": 0.13978567552902876, + "grad_norm": 5.992142200469971, + "learning_rate": 8.690694806084693e-06, + "loss": 0.3195, + "step": 10305 + }, + { + "epoch": 0.13979924036896366, + "grad_norm": 4.848492622375488, + "learning_rate": 8.690557763464438e-06, + "loss": 0.2768, + "step": 10306 + }, + { + "epoch": 0.13981280520889852, + "grad_norm": 5.378960132598877, + "learning_rate": 8.690420720844183e-06, + "loss": 0.4231, + "step": 10307 + }, + { + "epoch": 0.13982637004883341, + "grad_norm": 6.890070915222168, + "learning_rate": 8.690283678223928e-06, + "loss": 0.5547, + "step": 10308 + }, + { + "epoch": 0.1398399348887683, + "grad_norm": 6.056521415710449, + "learning_rate": 8.690146635603673e-06, + "loss": 0.4388, + "step": 10309 + }, + { + "epoch": 0.1398534997287032, + "grad_norm": 6.693378448486328, + "learning_rate": 8.690009592983419e-06, + "loss": 0.419, + "step": 10310 + }, + { + "epoch": 0.1398670645686381, + "grad_norm": 5.516960144042969, + "learning_rate": 8.689872550363164e-06, + "loss": 0.3225, + "step": 10311 + }, + { + "epoch": 0.139880629408573, + "grad_norm": 7.268494606018066, + "learning_rate": 8.689735507742909e-06, + "loss": 0.4221, + "step": 10312 + }, + { + "epoch": 0.13989419424850788, + "grad_norm": 6.581246852874756, + "learning_rate": 8.689598465122654e-06, + "loss": 0.3479, + "step": 10313 + }, + { + "epoch": 0.13990775908844275, + "grad_norm": 6.684040546417236, + "learning_rate": 8.6894614225024e-06, + "loss": 0.5134, + "step": 10314 + }, + { + "epoch": 0.13992132392837764, + "grad_norm": 6.455461502075195, + "learning_rate": 8.689324379882143e-06, + "loss": 0.4471, + "step": 10315 + }, + { + "epoch": 0.13993488876831253, + "grad_norm": 7.412742614746094, + "learning_rate": 8.68918733726189e-06, + "loss": 0.4484, + "step": 10316 + }, + { + "epoch": 0.13994845360824743, + "grad_norm": 5.855118274688721, + "learning_rate": 8.689050294641635e-06, + "loss": 0.4783, + "step": 10317 + }, + { + "epoch": 0.13996201844818232, + "grad_norm": 6.31565523147583, + "learning_rate": 8.688913252021378e-06, + "loss": 0.4474, + "step": 10318 + }, + { + "epoch": 0.1399755832881172, + "grad_norm": 5.4942121505737305, + "learning_rate": 8.688776209401124e-06, + "loss": 0.4499, + "step": 10319 + }, + { + "epoch": 0.13998914812805208, + "grad_norm": 6.4997124671936035, + "learning_rate": 8.68863916678087e-06, + "loss": 0.4103, + "step": 10320 + }, + { + "epoch": 0.14000271296798697, + "grad_norm": 6.104247093200684, + "learning_rate": 8.688502124160616e-06, + "loss": 0.3542, + "step": 10321 + }, + { + "epoch": 0.14001627780792186, + "grad_norm": 6.053161144256592, + "learning_rate": 8.688365081540359e-06, + "loss": 0.3991, + "step": 10322 + }, + { + "epoch": 0.14002984264785676, + "grad_norm": 6.040801048278809, + "learning_rate": 8.688228038920104e-06, + "loss": 0.4547, + "step": 10323 + }, + { + "epoch": 0.14004340748779165, + "grad_norm": 6.227509021759033, + "learning_rate": 8.68809099629985e-06, + "loss": 0.5568, + "step": 10324 + }, + { + "epoch": 0.14005697232772654, + "grad_norm": 6.787447929382324, + "learning_rate": 8.687953953679595e-06, + "loss": 0.5086, + "step": 10325 + }, + { + "epoch": 0.14007053716766144, + "grad_norm": 5.992518901824951, + "learning_rate": 8.68781691105934e-06, + "loss": 0.3604, + "step": 10326 + }, + { + "epoch": 0.1400841020075963, + "grad_norm": 8.707841873168945, + "learning_rate": 8.687679868439085e-06, + "loss": 0.497, + "step": 10327 + }, + { + "epoch": 0.1400976668475312, + "grad_norm": 6.770379066467285, + "learning_rate": 8.68754282581883e-06, + "loss": 0.4477, + "step": 10328 + }, + { + "epoch": 0.1401112316874661, + "grad_norm": 7.750580787658691, + "learning_rate": 8.687405783198575e-06, + "loss": 0.476, + "step": 10329 + }, + { + "epoch": 0.14012479652740098, + "grad_norm": 5.8924150466918945, + "learning_rate": 8.68726874057832e-06, + "loss": 0.4204, + "step": 10330 + }, + { + "epoch": 0.14013836136733587, + "grad_norm": 6.768065929412842, + "learning_rate": 8.687131697958066e-06, + "loss": 0.4386, + "step": 10331 + }, + { + "epoch": 0.14015192620727077, + "grad_norm": 13.326260566711426, + "learning_rate": 8.686994655337811e-06, + "loss": 0.4961, + "step": 10332 + }, + { + "epoch": 0.14016549104720563, + "grad_norm": 6.083540916442871, + "learning_rate": 8.686857612717556e-06, + "loss": 0.4409, + "step": 10333 + }, + { + "epoch": 0.14017905588714052, + "grad_norm": 6.605630397796631, + "learning_rate": 8.686720570097301e-06, + "loss": 0.3688, + "step": 10334 + }, + { + "epoch": 0.14019262072707542, + "grad_norm": 5.409664154052734, + "learning_rate": 8.686583527477046e-06, + "loss": 0.4973, + "step": 10335 + }, + { + "epoch": 0.1402061855670103, + "grad_norm": 5.77146053314209, + "learning_rate": 8.686446484856792e-06, + "loss": 0.3522, + "step": 10336 + }, + { + "epoch": 0.1402197504069452, + "grad_norm": 5.371885776519775, + "learning_rate": 8.686309442236537e-06, + "loss": 0.315, + "step": 10337 + }, + { + "epoch": 0.1402333152468801, + "grad_norm": 7.107211112976074, + "learning_rate": 8.686172399616282e-06, + "loss": 0.428, + "step": 10338 + }, + { + "epoch": 0.14024688008681496, + "grad_norm": 7.532037258148193, + "learning_rate": 8.686035356996027e-06, + "loss": 0.377, + "step": 10339 + }, + { + "epoch": 0.14026044492674986, + "grad_norm": 4.894454479217529, + "learning_rate": 8.68589831437577e-06, + "loss": 0.4269, + "step": 10340 + }, + { + "epoch": 0.14027400976668475, + "grad_norm": 7.739659786224365, + "learning_rate": 8.685761271755516e-06, + "loss": 0.6794, + "step": 10341 + }, + { + "epoch": 0.14028757460661964, + "grad_norm": 7.71343994140625, + "learning_rate": 8.685624229135263e-06, + "loss": 0.5079, + "step": 10342 + }, + { + "epoch": 0.14030113944655453, + "grad_norm": 5.4206318855285645, + "learning_rate": 8.685487186515006e-06, + "loss": 0.4281, + "step": 10343 + }, + { + "epoch": 0.14031470428648943, + "grad_norm": 8.044541358947754, + "learning_rate": 8.685350143894751e-06, + "loss": 0.6141, + "step": 10344 + }, + { + "epoch": 0.14032826912642432, + "grad_norm": 5.230978965759277, + "learning_rate": 8.685213101274497e-06, + "loss": 0.3829, + "step": 10345 + }, + { + "epoch": 0.1403418339663592, + "grad_norm": 6.616824150085449, + "learning_rate": 8.685076058654243e-06, + "loss": 0.5929, + "step": 10346 + }, + { + "epoch": 0.14035539880629408, + "grad_norm": 6.511713027954102, + "learning_rate": 8.684939016033987e-06, + "loss": 0.4369, + "step": 10347 + }, + { + "epoch": 0.14036896364622897, + "grad_norm": 6.606892108917236, + "learning_rate": 8.684801973413732e-06, + "loss": 0.3762, + "step": 10348 + }, + { + "epoch": 0.14038252848616387, + "grad_norm": 6.632441997528076, + "learning_rate": 8.684664930793477e-06, + "loss": 0.3112, + "step": 10349 + }, + { + "epoch": 0.14039609332609876, + "grad_norm": 5.885763645172119, + "learning_rate": 8.684527888173222e-06, + "loss": 0.3872, + "step": 10350 + }, + { + "epoch": 0.14040965816603365, + "grad_norm": 6.872829914093018, + "learning_rate": 8.684390845552968e-06, + "loss": 0.3959, + "step": 10351 + }, + { + "epoch": 0.14042322300596852, + "grad_norm": 6.580090045928955, + "learning_rate": 8.684253802932713e-06, + "loss": 0.4496, + "step": 10352 + }, + { + "epoch": 0.1404367878459034, + "grad_norm": 5.9533867835998535, + "learning_rate": 8.684116760312458e-06, + "loss": 0.2938, + "step": 10353 + }, + { + "epoch": 0.1404503526858383, + "grad_norm": 6.685843467712402, + "learning_rate": 8.683979717692203e-06, + "loss": 0.5183, + "step": 10354 + }, + { + "epoch": 0.1404639175257732, + "grad_norm": 5.936099052429199, + "learning_rate": 8.683842675071948e-06, + "loss": 0.4418, + "step": 10355 + }, + { + "epoch": 0.1404774823657081, + "grad_norm": 6.475640773773193, + "learning_rate": 8.683705632451694e-06, + "loss": 0.3859, + "step": 10356 + }, + { + "epoch": 0.14049104720564298, + "grad_norm": 6.956009387969971, + "learning_rate": 8.683568589831439e-06, + "loss": 0.4371, + "step": 10357 + }, + { + "epoch": 0.14050461204557788, + "grad_norm": 5.290243625640869, + "learning_rate": 8.683431547211182e-06, + "loss": 0.2907, + "step": 10358 + }, + { + "epoch": 0.14051817688551274, + "grad_norm": 6.041348934173584, + "learning_rate": 8.683294504590929e-06, + "loss": 0.2318, + "step": 10359 + }, + { + "epoch": 0.14053174172544763, + "grad_norm": 5.634278297424316, + "learning_rate": 8.683157461970674e-06, + "loss": 0.4, + "step": 10360 + }, + { + "epoch": 0.14054530656538253, + "grad_norm": 7.008796215057373, + "learning_rate": 8.68302041935042e-06, + "loss": 0.3075, + "step": 10361 + }, + { + "epoch": 0.14055887140531742, + "grad_norm": 7.304919719696045, + "learning_rate": 8.682883376730163e-06, + "loss": 0.5708, + "step": 10362 + }, + { + "epoch": 0.1405724362452523, + "grad_norm": 5.870928764343262, + "learning_rate": 8.68274633410991e-06, + "loss": 0.4063, + "step": 10363 + }, + { + "epoch": 0.1405860010851872, + "grad_norm": 7.807966232299805, + "learning_rate": 8.682609291489655e-06, + "loss": 0.5062, + "step": 10364 + }, + { + "epoch": 0.14059956592512207, + "grad_norm": 6.734631538391113, + "learning_rate": 8.682472248869398e-06, + "loss": 0.4568, + "step": 10365 + }, + { + "epoch": 0.14061313076505697, + "grad_norm": 6.162943363189697, + "learning_rate": 8.682335206249144e-06, + "loss": 0.3558, + "step": 10366 + }, + { + "epoch": 0.14062669560499186, + "grad_norm": 7.163835048675537, + "learning_rate": 8.682198163628889e-06, + "loss": 0.4895, + "step": 10367 + }, + { + "epoch": 0.14064026044492675, + "grad_norm": 8.075610160827637, + "learning_rate": 8.682061121008634e-06, + "loss": 0.4915, + "step": 10368 + }, + { + "epoch": 0.14065382528486164, + "grad_norm": 8.2147855758667, + "learning_rate": 8.681924078388379e-06, + "loss": 0.6801, + "step": 10369 + }, + { + "epoch": 0.14066739012479654, + "grad_norm": 4.984715461730957, + "learning_rate": 8.681787035768124e-06, + "loss": 0.3909, + "step": 10370 + }, + { + "epoch": 0.1406809549647314, + "grad_norm": 6.87612771987915, + "learning_rate": 8.68164999314787e-06, + "loss": 0.3012, + "step": 10371 + }, + { + "epoch": 0.1406945198046663, + "grad_norm": 5.348349094390869, + "learning_rate": 8.681512950527615e-06, + "loss": 0.3441, + "step": 10372 + }, + { + "epoch": 0.1407080846446012, + "grad_norm": 5.598530292510986, + "learning_rate": 8.68137590790736e-06, + "loss": 0.294, + "step": 10373 + }, + { + "epoch": 0.14072164948453608, + "grad_norm": 6.073416233062744, + "learning_rate": 8.681238865287105e-06, + "loss": 0.3758, + "step": 10374 + }, + { + "epoch": 0.14073521432447098, + "grad_norm": 7.186673164367676, + "learning_rate": 8.68110182266685e-06, + "loss": 0.3864, + "step": 10375 + }, + { + "epoch": 0.14074877916440587, + "grad_norm": 5.9940619468688965, + "learning_rate": 8.680964780046595e-06, + "loss": 0.4472, + "step": 10376 + }, + { + "epoch": 0.14076234400434076, + "grad_norm": 8.575772285461426, + "learning_rate": 8.68082773742634e-06, + "loss": 0.3869, + "step": 10377 + }, + { + "epoch": 0.14077590884427563, + "grad_norm": 6.30613899230957, + "learning_rate": 8.680690694806086e-06, + "loss": 0.4657, + "step": 10378 + }, + { + "epoch": 0.14078947368421052, + "grad_norm": 7.632806777954102, + "learning_rate": 8.680553652185831e-06, + "loss": 0.6976, + "step": 10379 + }, + { + "epoch": 0.1408030385241454, + "grad_norm": 7.706275463104248, + "learning_rate": 8.680416609565574e-06, + "loss": 0.5389, + "step": 10380 + }, + { + "epoch": 0.1408166033640803, + "grad_norm": 6.344025611877441, + "learning_rate": 8.680279566945321e-06, + "loss": 0.5595, + "step": 10381 + }, + { + "epoch": 0.1408301682040152, + "grad_norm": 6.42766809463501, + "learning_rate": 8.680142524325066e-06, + "loss": 0.5101, + "step": 10382 + }, + { + "epoch": 0.1408437330439501, + "grad_norm": 8.491917610168457, + "learning_rate": 8.68000548170481e-06, + "loss": 0.6215, + "step": 10383 + }, + { + "epoch": 0.14085729788388496, + "grad_norm": 7.708076477050781, + "learning_rate": 8.679868439084555e-06, + "loss": 0.4762, + "step": 10384 + }, + { + "epoch": 0.14087086272381985, + "grad_norm": 8.027609825134277, + "learning_rate": 8.679731396464302e-06, + "loss": 0.4879, + "step": 10385 + }, + { + "epoch": 0.14088442756375474, + "grad_norm": 7.0980658531188965, + "learning_rate": 8.679594353844047e-06, + "loss": 0.4134, + "step": 10386 + }, + { + "epoch": 0.14089799240368964, + "grad_norm": 8.138744354248047, + "learning_rate": 8.67945731122379e-06, + "loss": 0.5229, + "step": 10387 + }, + { + "epoch": 0.14091155724362453, + "grad_norm": 7.697350978851318, + "learning_rate": 8.679320268603536e-06, + "loss": 0.3955, + "step": 10388 + }, + { + "epoch": 0.14092512208355942, + "grad_norm": 11.72356128692627, + "learning_rate": 8.679183225983283e-06, + "loss": 1.0681, + "step": 10389 + }, + { + "epoch": 0.14093868692349432, + "grad_norm": 7.75393009185791, + "learning_rate": 8.679046183363026e-06, + "loss": 0.5469, + "step": 10390 + }, + { + "epoch": 0.14095225176342918, + "grad_norm": 5.868731498718262, + "learning_rate": 8.678909140742771e-06, + "loss": 0.3579, + "step": 10391 + }, + { + "epoch": 0.14096581660336407, + "grad_norm": 7.213723659515381, + "learning_rate": 8.678772098122517e-06, + "loss": 0.3931, + "step": 10392 + }, + { + "epoch": 0.14097938144329897, + "grad_norm": 7.139700412750244, + "learning_rate": 8.678635055502262e-06, + "loss": 0.5891, + "step": 10393 + }, + { + "epoch": 0.14099294628323386, + "grad_norm": 8.416633605957031, + "learning_rate": 8.678498012882007e-06, + "loss": 0.474, + "step": 10394 + }, + { + "epoch": 0.14100651112316875, + "grad_norm": 5.795490741729736, + "learning_rate": 8.678360970261752e-06, + "loss": 0.3935, + "step": 10395 + }, + { + "epoch": 0.14102007596310365, + "grad_norm": 7.896052837371826, + "learning_rate": 8.678223927641497e-06, + "loss": 0.6382, + "step": 10396 + }, + { + "epoch": 0.1410336408030385, + "grad_norm": 5.177103042602539, + "learning_rate": 8.678086885021242e-06, + "loss": 0.3315, + "step": 10397 + }, + { + "epoch": 0.1410472056429734, + "grad_norm": 5.834823131561279, + "learning_rate": 8.677949842400988e-06, + "loss": 0.4865, + "step": 10398 + }, + { + "epoch": 0.1410607704829083, + "grad_norm": 6.27133846282959, + "learning_rate": 8.677812799780733e-06, + "loss": 0.3957, + "step": 10399 + }, + { + "epoch": 0.1410743353228432, + "grad_norm": 5.960312366485596, + "learning_rate": 8.677675757160478e-06, + "loss": 0.3493, + "step": 10400 + }, + { + "epoch": 0.14108790016277808, + "grad_norm": 8.548744201660156, + "learning_rate": 8.677538714540223e-06, + "loss": 0.6122, + "step": 10401 + }, + { + "epoch": 0.14110146500271298, + "grad_norm": 8.264466285705566, + "learning_rate": 8.677401671919968e-06, + "loss": 0.6253, + "step": 10402 + }, + { + "epoch": 0.14111502984264784, + "grad_norm": 9.247590065002441, + "learning_rate": 8.677264629299714e-06, + "loss": 0.7658, + "step": 10403 + }, + { + "epoch": 0.14112859468258274, + "grad_norm": 6.370229721069336, + "learning_rate": 8.677127586679459e-06, + "loss": 0.4783, + "step": 10404 + }, + { + "epoch": 0.14114215952251763, + "grad_norm": 6.739009380340576, + "learning_rate": 8.676990544059202e-06, + "loss": 0.3678, + "step": 10405 + }, + { + "epoch": 0.14115572436245252, + "grad_norm": 8.004997253417969, + "learning_rate": 8.676853501438947e-06, + "loss": 0.5588, + "step": 10406 + }, + { + "epoch": 0.14116928920238742, + "grad_norm": 9.063637733459473, + "learning_rate": 8.676716458818694e-06, + "loss": 0.4235, + "step": 10407 + }, + { + "epoch": 0.1411828540423223, + "grad_norm": 7.835236072540283, + "learning_rate": 8.676579416198438e-06, + "loss": 0.5133, + "step": 10408 + }, + { + "epoch": 0.1411964188822572, + "grad_norm": 8.356260299682617, + "learning_rate": 8.676442373578183e-06, + "loss": 0.452, + "step": 10409 + }, + { + "epoch": 0.14120998372219207, + "grad_norm": 9.873422622680664, + "learning_rate": 8.676305330957928e-06, + "loss": 0.6172, + "step": 10410 + }, + { + "epoch": 0.14122354856212696, + "grad_norm": 6.149163246154785, + "learning_rate": 8.676168288337673e-06, + "loss": 0.3868, + "step": 10411 + }, + { + "epoch": 0.14123711340206185, + "grad_norm": 6.450953960418701, + "learning_rate": 8.676031245717418e-06, + "loss": 0.4877, + "step": 10412 + }, + { + "epoch": 0.14125067824199675, + "grad_norm": 7.535372734069824, + "learning_rate": 8.675894203097164e-06, + "loss": 0.5606, + "step": 10413 + }, + { + "epoch": 0.14126424308193164, + "grad_norm": 6.694319725036621, + "learning_rate": 8.675757160476909e-06, + "loss": 0.409, + "step": 10414 + }, + { + "epoch": 0.14127780792186653, + "grad_norm": 6.429244041442871, + "learning_rate": 8.675620117856654e-06, + "loss": 0.3643, + "step": 10415 + }, + { + "epoch": 0.1412913727618014, + "grad_norm": 5.400853633880615, + "learning_rate": 8.6754830752364e-06, + "loss": 0.3717, + "step": 10416 + }, + { + "epoch": 0.1413049376017363, + "grad_norm": 6.959139823913574, + "learning_rate": 8.675346032616144e-06, + "loss": 0.6264, + "step": 10417 + }, + { + "epoch": 0.14131850244167118, + "grad_norm": 7.347341060638428, + "learning_rate": 8.67520898999589e-06, + "loss": 0.5297, + "step": 10418 + }, + { + "epoch": 0.14133206728160608, + "grad_norm": 9.043452262878418, + "learning_rate": 8.675071947375635e-06, + "loss": 0.4581, + "step": 10419 + }, + { + "epoch": 0.14134563212154097, + "grad_norm": 7.490658283233643, + "learning_rate": 8.67493490475538e-06, + "loss": 0.4311, + "step": 10420 + }, + { + "epoch": 0.14135919696147586, + "grad_norm": 8.313036918640137, + "learning_rate": 8.674797862135125e-06, + "loss": 0.6149, + "step": 10421 + }, + { + "epoch": 0.14137276180141076, + "grad_norm": 5.0967512130737305, + "learning_rate": 8.67466081951487e-06, + "loss": 0.3446, + "step": 10422 + }, + { + "epoch": 0.14138632664134562, + "grad_norm": 5.85903787612915, + "learning_rate": 8.674523776894614e-06, + "loss": 0.4043, + "step": 10423 + }, + { + "epoch": 0.14139989148128052, + "grad_norm": 6.292046546936035, + "learning_rate": 8.67438673427436e-06, + "loss": 0.5159, + "step": 10424 + }, + { + "epoch": 0.1414134563212154, + "grad_norm": 9.343949317932129, + "learning_rate": 8.674249691654106e-06, + "loss": 0.4775, + "step": 10425 + }, + { + "epoch": 0.1414270211611503, + "grad_norm": 7.399735450744629, + "learning_rate": 8.67411264903385e-06, + "loss": 0.4134, + "step": 10426 + }, + { + "epoch": 0.1414405860010852, + "grad_norm": 8.978597640991211, + "learning_rate": 8.673975606413594e-06, + "loss": 0.5341, + "step": 10427 + }, + { + "epoch": 0.1414541508410201, + "grad_norm": 7.067063808441162, + "learning_rate": 8.673838563793341e-06, + "loss": 0.4214, + "step": 10428 + }, + { + "epoch": 0.14146771568095495, + "grad_norm": 8.122130393981934, + "learning_rate": 8.673701521173086e-06, + "loss": 0.4352, + "step": 10429 + }, + { + "epoch": 0.14148128052088985, + "grad_norm": 6.726694583892822, + "learning_rate": 8.67356447855283e-06, + "loss": 0.369, + "step": 10430 + }, + { + "epoch": 0.14149484536082474, + "grad_norm": 7.410377502441406, + "learning_rate": 8.673427435932575e-06, + "loss": 0.4589, + "step": 10431 + }, + { + "epoch": 0.14150841020075963, + "grad_norm": 5.8607988357543945, + "learning_rate": 8.673290393312322e-06, + "loss": 0.3591, + "step": 10432 + }, + { + "epoch": 0.14152197504069453, + "grad_norm": 5.598228454589844, + "learning_rate": 8.673153350692066e-06, + "loss": 0.3422, + "step": 10433 + }, + { + "epoch": 0.14153553988062942, + "grad_norm": 6.3002777099609375, + "learning_rate": 8.67301630807181e-06, + "loss": 0.3875, + "step": 10434 + }, + { + "epoch": 0.14154910472056428, + "grad_norm": 6.452958106994629, + "learning_rate": 8.672879265451556e-06, + "loss": 0.4657, + "step": 10435 + }, + { + "epoch": 0.14156266956049918, + "grad_norm": 9.155773162841797, + "learning_rate": 8.672742222831301e-06, + "loss": 0.4247, + "step": 10436 + }, + { + "epoch": 0.14157623440043407, + "grad_norm": 6.575094223022461, + "learning_rate": 8.672605180211046e-06, + "loss": 0.5008, + "step": 10437 + }, + { + "epoch": 0.14158979924036896, + "grad_norm": 7.632506847381592, + "learning_rate": 8.672468137590791e-06, + "loss": 0.3704, + "step": 10438 + }, + { + "epoch": 0.14160336408030386, + "grad_norm": 5.992527484893799, + "learning_rate": 8.672331094970537e-06, + "loss": 0.348, + "step": 10439 + }, + { + "epoch": 0.14161692892023875, + "grad_norm": 8.432534217834473, + "learning_rate": 8.672194052350282e-06, + "loss": 0.4189, + "step": 10440 + }, + { + "epoch": 0.14163049376017364, + "grad_norm": 7.021727561950684, + "learning_rate": 8.672057009730027e-06, + "loss": 0.5712, + "step": 10441 + }, + { + "epoch": 0.1416440586001085, + "grad_norm": 5.786710739135742, + "learning_rate": 8.671919967109772e-06, + "loss": 0.4581, + "step": 10442 + }, + { + "epoch": 0.1416576234400434, + "grad_norm": 6.929605007171631, + "learning_rate": 8.671782924489517e-06, + "loss": 0.3299, + "step": 10443 + }, + { + "epoch": 0.1416711882799783, + "grad_norm": 6.756417274475098, + "learning_rate": 8.671645881869262e-06, + "loss": 0.4139, + "step": 10444 + }, + { + "epoch": 0.1416847531199132, + "grad_norm": 7.713547706604004, + "learning_rate": 8.671508839249008e-06, + "loss": 0.5068, + "step": 10445 + }, + { + "epoch": 0.14169831795984808, + "grad_norm": 5.8393473625183105, + "learning_rate": 8.671371796628753e-06, + "loss": 0.3872, + "step": 10446 + }, + { + "epoch": 0.14171188279978297, + "grad_norm": 7.040712356567383, + "learning_rate": 8.671234754008498e-06, + "loss": 0.4656, + "step": 10447 + }, + { + "epoch": 0.14172544763971784, + "grad_norm": 7.157234191894531, + "learning_rate": 8.671097711388242e-06, + "loss": 0.5497, + "step": 10448 + }, + { + "epoch": 0.14173901247965273, + "grad_norm": 6.417447090148926, + "learning_rate": 8.670960668767987e-06, + "loss": 0.4152, + "step": 10449 + }, + { + "epoch": 0.14175257731958762, + "grad_norm": 7.706162452697754, + "learning_rate": 8.670823626147734e-06, + "loss": 0.4262, + "step": 10450 + }, + { + "epoch": 0.14176614215952252, + "grad_norm": 7.629234790802002, + "learning_rate": 8.670686583527477e-06, + "loss": 0.7019, + "step": 10451 + }, + { + "epoch": 0.1417797069994574, + "grad_norm": 7.630868434906006, + "learning_rate": 8.670549540907222e-06, + "loss": 0.3345, + "step": 10452 + }, + { + "epoch": 0.1417932718393923, + "grad_norm": 7.276468753814697, + "learning_rate": 8.670412498286967e-06, + "loss": 0.3125, + "step": 10453 + }, + { + "epoch": 0.1418068366793272, + "grad_norm": 7.506189823150635, + "learning_rate": 8.670275455666714e-06, + "loss": 0.6301, + "step": 10454 + }, + { + "epoch": 0.14182040151926206, + "grad_norm": 8.917739868164062, + "learning_rate": 8.670138413046458e-06, + "loss": 0.6582, + "step": 10455 + }, + { + "epoch": 0.14183396635919696, + "grad_norm": 6.839328289031982, + "learning_rate": 8.670001370426203e-06, + "loss": 0.398, + "step": 10456 + }, + { + "epoch": 0.14184753119913185, + "grad_norm": 8.178061485290527, + "learning_rate": 8.669864327805948e-06, + "loss": 0.4794, + "step": 10457 + }, + { + "epoch": 0.14186109603906674, + "grad_norm": 9.392793655395508, + "learning_rate": 8.669727285185693e-06, + "loss": 0.5109, + "step": 10458 + }, + { + "epoch": 0.14187466087900164, + "grad_norm": 7.2056121826171875, + "learning_rate": 8.669590242565438e-06, + "loss": 0.4361, + "step": 10459 + }, + { + "epoch": 0.14188822571893653, + "grad_norm": 5.9718017578125, + "learning_rate": 8.669453199945184e-06, + "loss": 0.4296, + "step": 10460 + }, + { + "epoch": 0.1419017905588714, + "grad_norm": 7.80630350112915, + "learning_rate": 8.669316157324929e-06, + "loss": 0.6261, + "step": 10461 + }, + { + "epoch": 0.1419153553988063, + "grad_norm": 7.06211519241333, + "learning_rate": 8.669179114704674e-06, + "loss": 0.4856, + "step": 10462 + }, + { + "epoch": 0.14192892023874118, + "grad_norm": 6.159117698669434, + "learning_rate": 8.66904207208442e-06, + "loss": 0.3744, + "step": 10463 + }, + { + "epoch": 0.14194248507867607, + "grad_norm": 8.443717002868652, + "learning_rate": 8.668905029464164e-06, + "loss": 0.5976, + "step": 10464 + }, + { + "epoch": 0.14195604991861097, + "grad_norm": 8.513772010803223, + "learning_rate": 8.66876798684391e-06, + "loss": 0.451, + "step": 10465 + }, + { + "epoch": 0.14196961475854586, + "grad_norm": 5.993826389312744, + "learning_rate": 8.668630944223653e-06, + "loss": 0.4334, + "step": 10466 + }, + { + "epoch": 0.14198317959848072, + "grad_norm": 6.844269275665283, + "learning_rate": 8.6684939016034e-06, + "loss": 0.3544, + "step": 10467 + }, + { + "epoch": 0.14199674443841562, + "grad_norm": 6.040504455566406, + "learning_rate": 8.668356858983145e-06, + "loss": 0.3881, + "step": 10468 + }, + { + "epoch": 0.1420103092783505, + "grad_norm": 7.4784064292907715, + "learning_rate": 8.66821981636289e-06, + "loss": 0.5128, + "step": 10469 + }, + { + "epoch": 0.1420238741182854, + "grad_norm": 7.193759918212891, + "learning_rate": 8.668082773742634e-06, + "loss": 0.4639, + "step": 10470 + }, + { + "epoch": 0.1420374389582203, + "grad_norm": 7.472148895263672, + "learning_rate": 8.66794573112238e-06, + "loss": 0.4532, + "step": 10471 + }, + { + "epoch": 0.1420510037981552, + "grad_norm": 7.805776596069336, + "learning_rate": 8.667808688502126e-06, + "loss": 0.5371, + "step": 10472 + }, + { + "epoch": 0.14206456863809008, + "grad_norm": 7.783554553985596, + "learning_rate": 8.66767164588187e-06, + "loss": 0.5504, + "step": 10473 + }, + { + "epoch": 0.14207813347802495, + "grad_norm": 5.628016948699951, + "learning_rate": 8.667534603261614e-06, + "loss": 0.4469, + "step": 10474 + }, + { + "epoch": 0.14209169831795984, + "grad_norm": 5.687433242797852, + "learning_rate": 8.66739756064136e-06, + "loss": 0.3703, + "step": 10475 + }, + { + "epoch": 0.14210526315789473, + "grad_norm": 7.039971828460693, + "learning_rate": 8.667260518021105e-06, + "loss": 0.3061, + "step": 10476 + }, + { + "epoch": 0.14211882799782963, + "grad_norm": 7.344165802001953, + "learning_rate": 8.66712347540085e-06, + "loss": 0.4421, + "step": 10477 + }, + { + "epoch": 0.14213239283776452, + "grad_norm": 6.463048934936523, + "learning_rate": 8.666986432780595e-06, + "loss": 0.352, + "step": 10478 + }, + { + "epoch": 0.1421459576776994, + "grad_norm": 7.149789333343506, + "learning_rate": 8.66684939016034e-06, + "loss": 0.3949, + "step": 10479 + }, + { + "epoch": 0.14215952251763428, + "grad_norm": 5.98779821395874, + "learning_rate": 8.666712347540086e-06, + "loss": 0.4231, + "step": 10480 + }, + { + "epoch": 0.14217308735756917, + "grad_norm": 5.308676242828369, + "learning_rate": 8.66657530491983e-06, + "loss": 0.2981, + "step": 10481 + }, + { + "epoch": 0.14218665219750407, + "grad_norm": 6.790525436401367, + "learning_rate": 8.666438262299576e-06, + "loss": 0.4117, + "step": 10482 + }, + { + "epoch": 0.14220021703743896, + "grad_norm": 5.6817169189453125, + "learning_rate": 8.666301219679321e-06, + "loss": 0.4003, + "step": 10483 + }, + { + "epoch": 0.14221378187737385, + "grad_norm": 5.846677780151367, + "learning_rate": 8.666164177059066e-06, + "loss": 0.2839, + "step": 10484 + }, + { + "epoch": 0.14222734671730874, + "grad_norm": 4.616160869598389, + "learning_rate": 8.666027134438811e-06, + "loss": 0.2487, + "step": 10485 + }, + { + "epoch": 0.14224091155724364, + "grad_norm": 8.630537033081055, + "learning_rate": 8.665890091818557e-06, + "loss": 0.5142, + "step": 10486 + }, + { + "epoch": 0.1422544763971785, + "grad_norm": 6.779980659484863, + "learning_rate": 8.665753049198302e-06, + "loss": 0.2598, + "step": 10487 + }, + { + "epoch": 0.1422680412371134, + "grad_norm": 7.091474533081055, + "learning_rate": 8.665616006578047e-06, + "loss": 0.3899, + "step": 10488 + }, + { + "epoch": 0.1422816060770483, + "grad_norm": 6.0539751052856445, + "learning_rate": 8.665478963957792e-06, + "loss": 0.3211, + "step": 10489 + }, + { + "epoch": 0.14229517091698318, + "grad_norm": 5.187897205352783, + "learning_rate": 8.665341921337537e-06, + "loss": 0.2618, + "step": 10490 + }, + { + "epoch": 0.14230873575691808, + "grad_norm": 9.164241790771484, + "learning_rate": 8.66520487871728e-06, + "loss": 0.4462, + "step": 10491 + }, + { + "epoch": 0.14232230059685297, + "grad_norm": 7.525217533111572, + "learning_rate": 8.665067836097026e-06, + "loss": 0.4688, + "step": 10492 + }, + { + "epoch": 0.14233586543678783, + "grad_norm": 5.557608127593994, + "learning_rate": 8.664930793476773e-06, + "loss": 0.3222, + "step": 10493 + }, + { + "epoch": 0.14234943027672273, + "grad_norm": 7.734316825866699, + "learning_rate": 8.664793750856518e-06, + "loss": 0.3726, + "step": 10494 + }, + { + "epoch": 0.14236299511665762, + "grad_norm": 4.8314595222473145, + "learning_rate": 8.664656708236262e-06, + "loss": 0.2223, + "step": 10495 + }, + { + "epoch": 0.1423765599565925, + "grad_norm": 5.324204444885254, + "learning_rate": 8.664519665616007e-06, + "loss": 0.3234, + "step": 10496 + }, + { + "epoch": 0.1423901247965274, + "grad_norm": 6.005496501922607, + "learning_rate": 8.664382622995754e-06, + "loss": 0.3082, + "step": 10497 + }, + { + "epoch": 0.1424036896364623, + "grad_norm": 6.756898880004883, + "learning_rate": 8.664245580375497e-06, + "loss": 0.3973, + "step": 10498 + }, + { + "epoch": 0.14241725447639716, + "grad_norm": 5.608520984649658, + "learning_rate": 8.664108537755242e-06, + "loss": 0.4457, + "step": 10499 + }, + { + "epoch": 0.14243081931633206, + "grad_norm": 6.417463302612305, + "learning_rate": 8.663971495134987e-06, + "loss": 0.2383, + "step": 10500 + }, + { + "epoch": 0.14244438415626695, + "grad_norm": 8.285778045654297, + "learning_rate": 8.663834452514733e-06, + "loss": 0.4093, + "step": 10501 + }, + { + "epoch": 0.14245794899620184, + "grad_norm": 4.513552188873291, + "learning_rate": 8.663697409894478e-06, + "loss": 0.2747, + "step": 10502 + }, + { + "epoch": 0.14247151383613674, + "grad_norm": 6.203390598297119, + "learning_rate": 8.663560367274223e-06, + "loss": 0.4328, + "step": 10503 + }, + { + "epoch": 0.14248507867607163, + "grad_norm": 6.9545111656188965, + "learning_rate": 8.663423324653968e-06, + "loss": 0.5578, + "step": 10504 + }, + { + "epoch": 0.14249864351600652, + "grad_norm": 7.5838775634765625, + "learning_rate": 8.663286282033713e-06, + "loss": 0.4546, + "step": 10505 + }, + { + "epoch": 0.1425122083559414, + "grad_norm": 7.610363483428955, + "learning_rate": 8.663149239413458e-06, + "loss": 0.6508, + "step": 10506 + }, + { + "epoch": 0.14252577319587628, + "grad_norm": 6.711118698120117, + "learning_rate": 8.663012196793204e-06, + "loss": 0.478, + "step": 10507 + }, + { + "epoch": 0.14253933803581118, + "grad_norm": 5.745343208312988, + "learning_rate": 8.662875154172949e-06, + "loss": 0.3514, + "step": 10508 + }, + { + "epoch": 0.14255290287574607, + "grad_norm": 5.308677673339844, + "learning_rate": 8.662738111552692e-06, + "loss": 0.3393, + "step": 10509 + }, + { + "epoch": 0.14256646771568096, + "grad_norm": 7.389786720275879, + "learning_rate": 8.66260106893244e-06, + "loss": 0.54, + "step": 10510 + }, + { + "epoch": 0.14258003255561585, + "grad_norm": 7.255308151245117, + "learning_rate": 8.662464026312184e-06, + "loss": 0.4936, + "step": 10511 + }, + { + "epoch": 0.14259359739555072, + "grad_norm": 9.914249420166016, + "learning_rate": 8.66232698369193e-06, + "loss": 0.7329, + "step": 10512 + }, + { + "epoch": 0.1426071622354856, + "grad_norm": 7.058861255645752, + "learning_rate": 8.662189941071673e-06, + "loss": 0.4426, + "step": 10513 + }, + { + "epoch": 0.1426207270754205, + "grad_norm": 6.626389980316162, + "learning_rate": 8.66205289845142e-06, + "loss": 0.3299, + "step": 10514 + }, + { + "epoch": 0.1426342919153554, + "grad_norm": 7.6027350425720215, + "learning_rate": 8.661915855831165e-06, + "loss": 0.3867, + "step": 10515 + }, + { + "epoch": 0.1426478567552903, + "grad_norm": 6.382197856903076, + "learning_rate": 8.661778813210909e-06, + "loss": 0.3728, + "step": 10516 + }, + { + "epoch": 0.14266142159522519, + "grad_norm": 5.923049449920654, + "learning_rate": 8.661641770590654e-06, + "loss": 0.374, + "step": 10517 + }, + { + "epoch": 0.14267498643516008, + "grad_norm": 6.330287456512451, + "learning_rate": 8.661504727970399e-06, + "loss": 0.3786, + "step": 10518 + }, + { + "epoch": 0.14268855127509494, + "grad_norm": 8.258779525756836, + "learning_rate": 8.661367685350144e-06, + "loss": 0.5137, + "step": 10519 + }, + { + "epoch": 0.14270211611502984, + "grad_norm": 5.895484924316406, + "learning_rate": 8.66123064272989e-06, + "loss": 0.4047, + "step": 10520 + }, + { + "epoch": 0.14271568095496473, + "grad_norm": 5.4167585372924805, + "learning_rate": 8.661093600109634e-06, + "loss": 0.4116, + "step": 10521 + }, + { + "epoch": 0.14272924579489962, + "grad_norm": 6.10890007019043, + "learning_rate": 8.66095655748938e-06, + "loss": 0.3889, + "step": 10522 + }, + { + "epoch": 0.14274281063483452, + "grad_norm": 5.576137542724609, + "learning_rate": 8.660819514869125e-06, + "loss": 0.4193, + "step": 10523 + }, + { + "epoch": 0.1427563754747694, + "grad_norm": 6.411550045013428, + "learning_rate": 8.66068247224887e-06, + "loss": 0.4816, + "step": 10524 + }, + { + "epoch": 0.14276994031470427, + "grad_norm": 9.103952407836914, + "learning_rate": 8.660545429628615e-06, + "loss": 0.7313, + "step": 10525 + }, + { + "epoch": 0.14278350515463917, + "grad_norm": 5.836127758026123, + "learning_rate": 8.66040838700836e-06, + "loss": 0.3639, + "step": 10526 + }, + { + "epoch": 0.14279706999457406, + "grad_norm": 6.769360065460205, + "learning_rate": 8.660271344388106e-06, + "loss": 0.3853, + "step": 10527 + }, + { + "epoch": 0.14281063483450895, + "grad_norm": 6.666043758392334, + "learning_rate": 8.66013430176785e-06, + "loss": 0.5, + "step": 10528 + }, + { + "epoch": 0.14282419967444385, + "grad_norm": 7.908755302429199, + "learning_rate": 8.659997259147596e-06, + "loss": 0.5643, + "step": 10529 + }, + { + "epoch": 0.14283776451437874, + "grad_norm": 4.610147476196289, + "learning_rate": 8.659860216527341e-06, + "loss": 0.3465, + "step": 10530 + }, + { + "epoch": 0.1428513293543136, + "grad_norm": 5.071578025817871, + "learning_rate": 8.659723173907085e-06, + "loss": 0.3335, + "step": 10531 + }, + { + "epoch": 0.1428648941942485, + "grad_norm": 5.10634183883667, + "learning_rate": 8.659586131286831e-06, + "loss": 0.2987, + "step": 10532 + }, + { + "epoch": 0.1428784590341834, + "grad_norm": 6.313145637512207, + "learning_rate": 8.659449088666577e-06, + "loss": 0.3044, + "step": 10533 + }, + { + "epoch": 0.14289202387411828, + "grad_norm": 6.1967010498046875, + "learning_rate": 8.65931204604632e-06, + "loss": 0.4507, + "step": 10534 + }, + { + "epoch": 0.14290558871405318, + "grad_norm": 6.394045352935791, + "learning_rate": 8.659175003426065e-06, + "loss": 0.4074, + "step": 10535 + }, + { + "epoch": 0.14291915355398807, + "grad_norm": 6.787099838256836, + "learning_rate": 8.659037960805812e-06, + "loss": 0.44, + "step": 10536 + }, + { + "epoch": 0.14293271839392296, + "grad_norm": 5.778404712677002, + "learning_rate": 8.658900918185557e-06, + "loss": 0.4672, + "step": 10537 + }, + { + "epoch": 0.14294628323385783, + "grad_norm": 5.699739933013916, + "learning_rate": 8.6587638755653e-06, + "loss": 0.3773, + "step": 10538 + }, + { + "epoch": 0.14295984807379272, + "grad_norm": 5.014864444732666, + "learning_rate": 8.658626832945046e-06, + "loss": 0.2002, + "step": 10539 + }, + { + "epoch": 0.14297341291372762, + "grad_norm": 7.04826545715332, + "learning_rate": 8.658489790324793e-06, + "loss": 0.4291, + "step": 10540 + }, + { + "epoch": 0.1429869777536625, + "grad_norm": 6.110723972320557, + "learning_rate": 8.658352747704536e-06, + "loss": 0.3258, + "step": 10541 + }, + { + "epoch": 0.1430005425935974, + "grad_norm": 5.067371368408203, + "learning_rate": 8.658215705084282e-06, + "loss": 0.2556, + "step": 10542 + }, + { + "epoch": 0.1430141074335323, + "grad_norm": 5.530787944793701, + "learning_rate": 8.658078662464027e-06, + "loss": 0.3127, + "step": 10543 + }, + { + "epoch": 0.14302767227346716, + "grad_norm": 6.7328267097473145, + "learning_rate": 8.657941619843772e-06, + "loss": 0.3182, + "step": 10544 + }, + { + "epoch": 0.14304123711340205, + "grad_norm": 5.873847484588623, + "learning_rate": 8.657804577223517e-06, + "loss": 0.4201, + "step": 10545 + }, + { + "epoch": 0.14305480195333695, + "grad_norm": 5.771218776702881, + "learning_rate": 8.657667534603262e-06, + "loss": 0.2169, + "step": 10546 + }, + { + "epoch": 0.14306836679327184, + "grad_norm": 6.3583760261535645, + "learning_rate": 8.657530491983007e-06, + "loss": 0.4972, + "step": 10547 + }, + { + "epoch": 0.14308193163320673, + "grad_norm": 7.515580177307129, + "learning_rate": 8.657393449362753e-06, + "loss": 0.416, + "step": 10548 + }, + { + "epoch": 0.14309549647314163, + "grad_norm": 7.339375972747803, + "learning_rate": 8.657256406742498e-06, + "loss": 0.3416, + "step": 10549 + }, + { + "epoch": 0.14310906131307652, + "grad_norm": 5.690878868103027, + "learning_rate": 8.657119364122243e-06, + "loss": 0.3307, + "step": 10550 + }, + { + "epoch": 0.14312262615301138, + "grad_norm": 8.096014976501465, + "learning_rate": 8.656982321501988e-06, + "loss": 0.5849, + "step": 10551 + }, + { + "epoch": 0.14313619099294628, + "grad_norm": 6.598565101623535, + "learning_rate": 8.656845278881733e-06, + "loss": 0.4083, + "step": 10552 + }, + { + "epoch": 0.14314975583288117, + "grad_norm": 6.487871170043945, + "learning_rate": 8.656708236261479e-06, + "loss": 0.256, + "step": 10553 + }, + { + "epoch": 0.14316332067281606, + "grad_norm": 4.593773365020752, + "learning_rate": 8.656571193641224e-06, + "loss": 0.2044, + "step": 10554 + }, + { + "epoch": 0.14317688551275096, + "grad_norm": 3.8691341876983643, + "learning_rate": 8.656434151020969e-06, + "loss": 0.1979, + "step": 10555 + }, + { + "epoch": 0.14319045035268585, + "grad_norm": 7.29129695892334, + "learning_rate": 8.656297108400712e-06, + "loss": 0.4809, + "step": 10556 + }, + { + "epoch": 0.14320401519262071, + "grad_norm": 6.493109226226807, + "learning_rate": 8.65616006578046e-06, + "loss": 0.479, + "step": 10557 + }, + { + "epoch": 0.1432175800325556, + "grad_norm": 7.265811443328857, + "learning_rate": 8.656023023160204e-06, + "loss": 0.4726, + "step": 10558 + }, + { + "epoch": 0.1432311448724905, + "grad_norm": 4.785660743713379, + "learning_rate": 8.655885980539948e-06, + "loss": 0.2748, + "step": 10559 + }, + { + "epoch": 0.1432447097124254, + "grad_norm": 9.418746948242188, + "learning_rate": 8.655748937919693e-06, + "loss": 0.5343, + "step": 10560 + }, + { + "epoch": 0.1432582745523603, + "grad_norm": 5.966996192932129, + "learning_rate": 8.655611895299438e-06, + "loss": 0.3693, + "step": 10561 + }, + { + "epoch": 0.14327183939229518, + "grad_norm": 6.501645565032959, + "learning_rate": 8.655474852679185e-06, + "loss": 0.5294, + "step": 10562 + }, + { + "epoch": 0.14328540423223005, + "grad_norm": 4.699335098266602, + "learning_rate": 8.655337810058929e-06, + "loss": 0.3217, + "step": 10563 + }, + { + "epoch": 0.14329896907216494, + "grad_norm": 7.191166400909424, + "learning_rate": 8.655200767438674e-06, + "loss": 0.7508, + "step": 10564 + }, + { + "epoch": 0.14331253391209983, + "grad_norm": 6.029871940612793, + "learning_rate": 8.655063724818419e-06, + "loss": 0.3417, + "step": 10565 + }, + { + "epoch": 0.14332609875203473, + "grad_norm": 7.055243492126465, + "learning_rate": 8.654926682198164e-06, + "loss": 0.4578, + "step": 10566 + }, + { + "epoch": 0.14333966359196962, + "grad_norm": 6.78193998336792, + "learning_rate": 8.65478963957791e-06, + "loss": 0.6146, + "step": 10567 + }, + { + "epoch": 0.1433532284319045, + "grad_norm": 5.5782623291015625, + "learning_rate": 8.654652596957655e-06, + "loss": 0.3453, + "step": 10568 + }, + { + "epoch": 0.1433667932718394, + "grad_norm": 5.395453929901123, + "learning_rate": 8.6545155543374e-06, + "loss": 0.2302, + "step": 10569 + }, + { + "epoch": 0.14338035811177427, + "grad_norm": 6.303896427154541, + "learning_rate": 8.654378511717145e-06, + "loss": 0.4445, + "step": 10570 + }, + { + "epoch": 0.14339392295170916, + "grad_norm": 6.557240962982178, + "learning_rate": 8.65424146909689e-06, + "loss": 0.4106, + "step": 10571 + }, + { + "epoch": 0.14340748779164406, + "grad_norm": 6.782855987548828, + "learning_rate": 8.654104426476635e-06, + "loss": 0.5425, + "step": 10572 + }, + { + "epoch": 0.14342105263157895, + "grad_norm": 5.3940749168396, + "learning_rate": 8.65396738385638e-06, + "loss": 0.3315, + "step": 10573 + }, + { + "epoch": 0.14343461747151384, + "grad_norm": 7.059296131134033, + "learning_rate": 8.653830341236124e-06, + "loss": 0.3818, + "step": 10574 + }, + { + "epoch": 0.14344818231144874, + "grad_norm": 7.443160057067871, + "learning_rate": 8.65369329861587e-06, + "loss": 0.4852, + "step": 10575 + }, + { + "epoch": 0.1434617471513836, + "grad_norm": 5.88469934463501, + "learning_rate": 8.653556255995616e-06, + "loss": 0.404, + "step": 10576 + }, + { + "epoch": 0.1434753119913185, + "grad_norm": 5.2710771560668945, + "learning_rate": 8.653419213375361e-06, + "loss": 0.3188, + "step": 10577 + }, + { + "epoch": 0.1434888768312534, + "grad_norm": 5.98598575592041, + "learning_rate": 8.653282170755105e-06, + "loss": 0.2794, + "step": 10578 + }, + { + "epoch": 0.14350244167118828, + "grad_norm": 5.686626434326172, + "learning_rate": 8.653145128134851e-06, + "loss": 0.2952, + "step": 10579 + }, + { + "epoch": 0.14351600651112317, + "grad_norm": 5.458011627197266, + "learning_rate": 8.653008085514597e-06, + "loss": 0.3122, + "step": 10580 + }, + { + "epoch": 0.14352957135105807, + "grad_norm": 5.655091285705566, + "learning_rate": 8.65287104289434e-06, + "loss": 0.2887, + "step": 10581 + }, + { + "epoch": 0.14354313619099296, + "grad_norm": 6.496124744415283, + "learning_rate": 8.652734000274085e-06, + "loss": 0.4128, + "step": 10582 + }, + { + "epoch": 0.14355670103092782, + "grad_norm": 10.125737190246582, + "learning_rate": 8.652596957653832e-06, + "loss": 0.4808, + "step": 10583 + }, + { + "epoch": 0.14357026587086272, + "grad_norm": 6.039556980133057, + "learning_rate": 8.652459915033576e-06, + "loss": 0.2939, + "step": 10584 + }, + { + "epoch": 0.1435838307107976, + "grad_norm": 6.589300155639648, + "learning_rate": 8.652322872413321e-06, + "loss": 0.4155, + "step": 10585 + }, + { + "epoch": 0.1435973955507325, + "grad_norm": 6.033478260040283, + "learning_rate": 8.652185829793066e-06, + "loss": 0.3404, + "step": 10586 + }, + { + "epoch": 0.1436109603906674, + "grad_norm": 7.471549034118652, + "learning_rate": 8.652048787172811e-06, + "loss": 0.4512, + "step": 10587 + }, + { + "epoch": 0.1436245252306023, + "grad_norm": 6.252683162689209, + "learning_rate": 8.651911744552556e-06, + "loss": 0.4889, + "step": 10588 + }, + { + "epoch": 0.14363809007053716, + "grad_norm": 6.853967189788818, + "learning_rate": 8.651774701932302e-06, + "loss": 0.4083, + "step": 10589 + }, + { + "epoch": 0.14365165491047205, + "grad_norm": 7.427216053009033, + "learning_rate": 8.651637659312047e-06, + "loss": 0.5453, + "step": 10590 + }, + { + "epoch": 0.14366521975040694, + "grad_norm": 6.38116455078125, + "learning_rate": 8.651500616691792e-06, + "loss": 0.4099, + "step": 10591 + }, + { + "epoch": 0.14367878459034183, + "grad_norm": 7.106180667877197, + "learning_rate": 8.651363574071537e-06, + "loss": 0.3825, + "step": 10592 + }, + { + "epoch": 0.14369234943027673, + "grad_norm": 7.725693702697754, + "learning_rate": 8.651226531451282e-06, + "loss": 0.3679, + "step": 10593 + }, + { + "epoch": 0.14370591427021162, + "grad_norm": 5.7857842445373535, + "learning_rate": 8.651089488831027e-06, + "loss": 0.4606, + "step": 10594 + }, + { + "epoch": 0.14371947911014651, + "grad_norm": 4.469252109527588, + "learning_rate": 8.650952446210773e-06, + "loss": 0.2731, + "step": 10595 + }, + { + "epoch": 0.14373304395008138, + "grad_norm": 7.474509239196777, + "learning_rate": 8.650815403590518e-06, + "loss": 0.4665, + "step": 10596 + }, + { + "epoch": 0.14374660879001627, + "grad_norm": 6.687150478363037, + "learning_rate": 8.650678360970263e-06, + "loss": 0.3398, + "step": 10597 + }, + { + "epoch": 0.14376017362995117, + "grad_norm": 6.04319429397583, + "learning_rate": 8.650541318350008e-06, + "loss": 0.3985, + "step": 10598 + }, + { + "epoch": 0.14377373846988606, + "grad_norm": 5.803151607513428, + "learning_rate": 8.650404275729752e-06, + "loss": 0.3463, + "step": 10599 + }, + { + "epoch": 0.14378730330982095, + "grad_norm": 5.4965009689331055, + "learning_rate": 8.650267233109497e-06, + "loss": 0.3903, + "step": 10600 + }, + { + "epoch": 0.14380086814975584, + "grad_norm": 7.323070526123047, + "learning_rate": 8.650130190489244e-06, + "loss": 0.393, + "step": 10601 + }, + { + "epoch": 0.1438144329896907, + "grad_norm": 5.8035407066345215, + "learning_rate": 8.649993147868987e-06, + "loss": 0.3209, + "step": 10602 + }, + { + "epoch": 0.1438279978296256, + "grad_norm": 5.5694193840026855, + "learning_rate": 8.649856105248732e-06, + "loss": 0.3, + "step": 10603 + }, + { + "epoch": 0.1438415626695605, + "grad_norm": 7.203901290893555, + "learning_rate": 8.649719062628478e-06, + "loss": 0.4813, + "step": 10604 + }, + { + "epoch": 0.1438551275094954, + "grad_norm": 6.673104763031006, + "learning_rate": 8.649582020008224e-06, + "loss": 0.4449, + "step": 10605 + }, + { + "epoch": 0.14386869234943028, + "grad_norm": 4.8152618408203125, + "learning_rate": 8.649444977387968e-06, + "loss": 0.3659, + "step": 10606 + }, + { + "epoch": 0.14388225718936518, + "grad_norm": 6.121631145477295, + "learning_rate": 8.649307934767713e-06, + "loss": 0.6655, + "step": 10607 + }, + { + "epoch": 0.14389582202930004, + "grad_norm": 5.999741077423096, + "learning_rate": 8.649170892147458e-06, + "loss": 0.3987, + "step": 10608 + }, + { + "epoch": 0.14390938686923493, + "grad_norm": 5.210629463195801, + "learning_rate": 8.649033849527203e-06, + "loss": 0.3907, + "step": 10609 + }, + { + "epoch": 0.14392295170916983, + "grad_norm": 4.461653709411621, + "learning_rate": 8.648896806906949e-06, + "loss": 0.3188, + "step": 10610 + }, + { + "epoch": 0.14393651654910472, + "grad_norm": 7.369960784912109, + "learning_rate": 8.648759764286694e-06, + "loss": 0.7047, + "step": 10611 + }, + { + "epoch": 0.1439500813890396, + "grad_norm": 7.775152683258057, + "learning_rate": 8.648622721666439e-06, + "loss": 0.4408, + "step": 10612 + }, + { + "epoch": 0.1439636462289745, + "grad_norm": 6.082947731018066, + "learning_rate": 8.648485679046184e-06, + "loss": 0.4587, + "step": 10613 + }, + { + "epoch": 0.1439772110689094, + "grad_norm": 4.402381896972656, + "learning_rate": 8.64834863642593e-06, + "loss": 0.257, + "step": 10614 + }, + { + "epoch": 0.14399077590884427, + "grad_norm": 6.442495822906494, + "learning_rate": 8.648211593805675e-06, + "loss": 0.3698, + "step": 10615 + }, + { + "epoch": 0.14400434074877916, + "grad_norm": 6.127766132354736, + "learning_rate": 8.64807455118542e-06, + "loss": 0.3684, + "step": 10616 + }, + { + "epoch": 0.14401790558871405, + "grad_norm": 4.956195831298828, + "learning_rate": 8.647937508565163e-06, + "loss": 0.3637, + "step": 10617 + }, + { + "epoch": 0.14403147042864894, + "grad_norm": 5.422576427459717, + "learning_rate": 8.64780046594491e-06, + "loss": 0.3027, + "step": 10618 + }, + { + "epoch": 0.14404503526858384, + "grad_norm": 6.903382301330566, + "learning_rate": 8.647663423324655e-06, + "loss": 0.4176, + "step": 10619 + }, + { + "epoch": 0.14405860010851873, + "grad_norm": 5.332482814788818, + "learning_rate": 8.6475263807044e-06, + "loss": 0.4394, + "step": 10620 + }, + { + "epoch": 0.1440721649484536, + "grad_norm": 4.30109167098999, + "learning_rate": 8.647389338084144e-06, + "loss": 0.3323, + "step": 10621 + }, + { + "epoch": 0.1440857297883885, + "grad_norm": 6.09187126159668, + "learning_rate": 8.64725229546389e-06, + "loss": 0.3795, + "step": 10622 + }, + { + "epoch": 0.14409929462832338, + "grad_norm": 4.754607200622559, + "learning_rate": 8.647115252843636e-06, + "loss": 0.4901, + "step": 10623 + }, + { + "epoch": 0.14411285946825828, + "grad_norm": 6.57153844833374, + "learning_rate": 8.64697821022338e-06, + "loss": 0.6128, + "step": 10624 + }, + { + "epoch": 0.14412642430819317, + "grad_norm": 6.216367244720459, + "learning_rate": 8.646841167603125e-06, + "loss": 0.4185, + "step": 10625 + }, + { + "epoch": 0.14413998914812806, + "grad_norm": 5.541033744812012, + "learning_rate": 8.646704124982871e-06, + "loss": 0.4278, + "step": 10626 + }, + { + "epoch": 0.14415355398806295, + "grad_norm": 6.294240474700928, + "learning_rate": 8.646567082362615e-06, + "loss": 0.3712, + "step": 10627 + }, + { + "epoch": 0.14416711882799782, + "grad_norm": 5.076178073883057, + "learning_rate": 8.64643003974236e-06, + "loss": 0.3969, + "step": 10628 + }, + { + "epoch": 0.1441806836679327, + "grad_norm": 6.682417392730713, + "learning_rate": 8.646292997122105e-06, + "loss": 0.6306, + "step": 10629 + }, + { + "epoch": 0.1441942485078676, + "grad_norm": 5.445986270904541, + "learning_rate": 8.64615595450185e-06, + "loss": 0.5652, + "step": 10630 + }, + { + "epoch": 0.1442078133478025, + "grad_norm": 4.623904228210449, + "learning_rate": 8.646018911881596e-06, + "loss": 0.406, + "step": 10631 + }, + { + "epoch": 0.1442213781877374, + "grad_norm": 5.4329094886779785, + "learning_rate": 8.645881869261341e-06, + "loss": 0.4476, + "step": 10632 + }, + { + "epoch": 0.14423494302767229, + "grad_norm": 6.976787090301514, + "learning_rate": 8.645744826641086e-06, + "loss": 0.4731, + "step": 10633 + }, + { + "epoch": 0.14424850786760715, + "grad_norm": 6.230726718902588, + "learning_rate": 8.645607784020831e-06, + "loss": 0.357, + "step": 10634 + }, + { + "epoch": 0.14426207270754204, + "grad_norm": 4.439204692840576, + "learning_rate": 8.645470741400576e-06, + "loss": 0.3273, + "step": 10635 + }, + { + "epoch": 0.14427563754747694, + "grad_norm": 5.279077053070068, + "learning_rate": 8.645333698780322e-06, + "loss": 0.3309, + "step": 10636 + }, + { + "epoch": 0.14428920238741183, + "grad_norm": 4.310202121734619, + "learning_rate": 8.645196656160067e-06, + "loss": 0.2851, + "step": 10637 + }, + { + "epoch": 0.14430276722734672, + "grad_norm": 4.466716766357422, + "learning_rate": 8.645059613539812e-06, + "loss": 0.22, + "step": 10638 + }, + { + "epoch": 0.14431633206728162, + "grad_norm": 5.660497665405273, + "learning_rate": 8.644922570919557e-06, + "loss": 0.4242, + "step": 10639 + }, + { + "epoch": 0.14432989690721648, + "grad_norm": 5.147482872009277, + "learning_rate": 8.644785528299302e-06, + "loss": 0.2198, + "step": 10640 + }, + { + "epoch": 0.14434346174715137, + "grad_norm": 6.171370029449463, + "learning_rate": 8.644648485679047e-06, + "loss": 0.4471, + "step": 10641 + }, + { + "epoch": 0.14435702658708627, + "grad_norm": 3.7221031188964844, + "learning_rate": 8.644511443058791e-06, + "loss": 0.3895, + "step": 10642 + }, + { + "epoch": 0.14437059142702116, + "grad_norm": 4.8566789627075195, + "learning_rate": 8.644374400438536e-06, + "loss": 0.3412, + "step": 10643 + }, + { + "epoch": 0.14438415626695605, + "grad_norm": 5.086557865142822, + "learning_rate": 8.644237357818283e-06, + "loss": 0.3305, + "step": 10644 + }, + { + "epoch": 0.14439772110689095, + "grad_norm": 3.6777076721191406, + "learning_rate": 8.644100315198028e-06, + "loss": 0.3115, + "step": 10645 + }, + { + "epoch": 0.14441128594682584, + "grad_norm": 4.463238716125488, + "learning_rate": 8.643963272577772e-06, + "loss": 0.3604, + "step": 10646 + }, + { + "epoch": 0.1444248507867607, + "grad_norm": 5.734208106994629, + "learning_rate": 8.643826229957517e-06, + "loss": 0.3986, + "step": 10647 + }, + { + "epoch": 0.1444384156266956, + "grad_norm": 4.296401023864746, + "learning_rate": 8.643689187337264e-06, + "loss": 0.3906, + "step": 10648 + }, + { + "epoch": 0.1444519804666305, + "grad_norm": 6.9502081871032715, + "learning_rate": 8.643552144717007e-06, + "loss": 0.6182, + "step": 10649 + }, + { + "epoch": 0.14446554530656538, + "grad_norm": 5.665464401245117, + "learning_rate": 8.643415102096752e-06, + "loss": 0.3247, + "step": 10650 + }, + { + "epoch": 0.14447911014650028, + "grad_norm": 6.461389064788818, + "learning_rate": 8.643278059476498e-06, + "loss": 0.4488, + "step": 10651 + }, + { + "epoch": 0.14449267498643517, + "grad_norm": 8.726472854614258, + "learning_rate": 8.643141016856243e-06, + "loss": 0.6793, + "step": 10652 + }, + { + "epoch": 0.14450623982637004, + "grad_norm": 5.344161033630371, + "learning_rate": 8.643003974235988e-06, + "loss": 0.4313, + "step": 10653 + }, + { + "epoch": 0.14451980466630493, + "grad_norm": 6.23590612411499, + "learning_rate": 8.642866931615733e-06, + "loss": 0.4616, + "step": 10654 + }, + { + "epoch": 0.14453336950623982, + "grad_norm": 7.544357776641846, + "learning_rate": 8.642729888995478e-06, + "loss": 0.4156, + "step": 10655 + }, + { + "epoch": 0.14454693434617472, + "grad_norm": 6.348668098449707, + "learning_rate": 8.642592846375223e-06, + "loss": 0.3811, + "step": 10656 + }, + { + "epoch": 0.1445604991861096, + "grad_norm": 5.358905792236328, + "learning_rate": 8.642455803754969e-06, + "loss": 0.2786, + "step": 10657 + }, + { + "epoch": 0.1445740640260445, + "grad_norm": 8.441142082214355, + "learning_rate": 8.642318761134714e-06, + "loss": 0.592, + "step": 10658 + }, + { + "epoch": 0.1445876288659794, + "grad_norm": 5.033048629760742, + "learning_rate": 8.642181718514459e-06, + "loss": 0.3375, + "step": 10659 + }, + { + "epoch": 0.14460119370591426, + "grad_norm": 6.201465129852295, + "learning_rate": 8.642044675894204e-06, + "loss": 0.4293, + "step": 10660 + }, + { + "epoch": 0.14461475854584915, + "grad_norm": 8.695765495300293, + "learning_rate": 8.64190763327395e-06, + "loss": 0.5999, + "step": 10661 + }, + { + "epoch": 0.14462832338578405, + "grad_norm": 5.613796234130859, + "learning_rate": 8.641770590653695e-06, + "loss": 0.3405, + "step": 10662 + }, + { + "epoch": 0.14464188822571894, + "grad_norm": 8.777588844299316, + "learning_rate": 8.64163354803344e-06, + "loss": 0.6126, + "step": 10663 + }, + { + "epoch": 0.14465545306565383, + "grad_norm": 7.250744342803955, + "learning_rate": 8.641496505413183e-06, + "loss": 0.5416, + "step": 10664 + }, + { + "epoch": 0.14466901790558873, + "grad_norm": 6.755357265472412, + "learning_rate": 8.64135946279293e-06, + "loss": 0.4968, + "step": 10665 + }, + { + "epoch": 0.1446825827455236, + "grad_norm": 7.154147148132324, + "learning_rate": 8.641222420172675e-06, + "loss": 0.3648, + "step": 10666 + }, + { + "epoch": 0.14469614758545848, + "grad_norm": 6.834244251251221, + "learning_rate": 8.641085377552419e-06, + "loss": 0.4582, + "step": 10667 + }, + { + "epoch": 0.14470971242539338, + "grad_norm": 5.867450714111328, + "learning_rate": 8.640948334932164e-06, + "loss": 0.4071, + "step": 10668 + }, + { + "epoch": 0.14472327726532827, + "grad_norm": 7.1463212966918945, + "learning_rate": 8.640811292311909e-06, + "loss": 0.5395, + "step": 10669 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 7.857137680053711, + "learning_rate": 8.640674249691656e-06, + "loss": 0.5548, + "step": 10670 + }, + { + "epoch": 0.14475040694519806, + "grad_norm": 6.728461265563965, + "learning_rate": 8.6405372070714e-06, + "loss": 0.6434, + "step": 10671 + }, + { + "epoch": 0.14476397178513292, + "grad_norm": 6.520116806030273, + "learning_rate": 8.640400164451145e-06, + "loss": 0.561, + "step": 10672 + }, + { + "epoch": 0.14477753662506782, + "grad_norm": 6.775640487670898, + "learning_rate": 8.64026312183089e-06, + "loss": 0.6497, + "step": 10673 + }, + { + "epoch": 0.1447911014650027, + "grad_norm": 7.775871276855469, + "learning_rate": 8.640126079210635e-06, + "loss": 0.5766, + "step": 10674 + }, + { + "epoch": 0.1448046663049376, + "grad_norm": 9.275957107543945, + "learning_rate": 8.63998903659038e-06, + "loss": 0.4808, + "step": 10675 + }, + { + "epoch": 0.1448182311448725, + "grad_norm": 6.005938529968262, + "learning_rate": 8.639851993970125e-06, + "loss": 0.431, + "step": 10676 + }, + { + "epoch": 0.1448317959848074, + "grad_norm": 5.889760494232178, + "learning_rate": 8.63971495134987e-06, + "loss": 0.458, + "step": 10677 + }, + { + "epoch": 0.14484536082474228, + "grad_norm": 7.308699131011963, + "learning_rate": 8.639577908729616e-06, + "loss": 0.5534, + "step": 10678 + }, + { + "epoch": 0.14485892566467715, + "grad_norm": 6.6435980796813965, + "learning_rate": 8.639440866109361e-06, + "loss": 0.4571, + "step": 10679 + }, + { + "epoch": 0.14487249050461204, + "grad_norm": 5.232545852661133, + "learning_rate": 8.639303823489106e-06, + "loss": 0.4069, + "step": 10680 + }, + { + "epoch": 0.14488605534454693, + "grad_norm": 5.568904399871826, + "learning_rate": 8.639166780868851e-06, + "loss": 0.4849, + "step": 10681 + }, + { + "epoch": 0.14489962018448183, + "grad_norm": 5.926767826080322, + "learning_rate": 8.639029738248595e-06, + "loss": 0.2523, + "step": 10682 + }, + { + "epoch": 0.14491318502441672, + "grad_norm": 5.003059387207031, + "learning_rate": 8.638892695628342e-06, + "loss": 0.4626, + "step": 10683 + }, + { + "epoch": 0.1449267498643516, + "grad_norm": 5.966547012329102, + "learning_rate": 8.638755653008087e-06, + "loss": 0.4751, + "step": 10684 + }, + { + "epoch": 0.14494031470428648, + "grad_norm": 5.804115295410156, + "learning_rate": 8.638618610387832e-06, + "loss": 0.4324, + "step": 10685 + }, + { + "epoch": 0.14495387954422137, + "grad_norm": 6.524879455566406, + "learning_rate": 8.638481567767575e-06, + "loss": 0.4547, + "step": 10686 + }, + { + "epoch": 0.14496744438415626, + "grad_norm": 6.99550724029541, + "learning_rate": 8.638344525147322e-06, + "loss": 0.4557, + "step": 10687 + }, + { + "epoch": 0.14498100922409116, + "grad_norm": 6.360071659088135, + "learning_rate": 8.638207482527067e-06, + "loss": 0.4934, + "step": 10688 + }, + { + "epoch": 0.14499457406402605, + "grad_norm": 7.2303361892700195, + "learning_rate": 8.638070439906811e-06, + "loss": 0.4256, + "step": 10689 + }, + { + "epoch": 0.14500813890396094, + "grad_norm": 9.271814346313477, + "learning_rate": 8.637933397286556e-06, + "loss": 0.8487, + "step": 10690 + }, + { + "epoch": 0.14502170374389584, + "grad_norm": 6.772697448730469, + "learning_rate": 8.637796354666303e-06, + "loss": 0.5208, + "step": 10691 + }, + { + "epoch": 0.1450352685838307, + "grad_norm": 5.270370006561279, + "learning_rate": 8.637659312046047e-06, + "loss": 0.3742, + "step": 10692 + }, + { + "epoch": 0.1450488334237656, + "grad_norm": 6.811211585998535, + "learning_rate": 8.637522269425792e-06, + "loss": 0.4918, + "step": 10693 + }, + { + "epoch": 0.1450623982637005, + "grad_norm": 8.340110778808594, + "learning_rate": 8.637385226805537e-06, + "loss": 0.6673, + "step": 10694 + }, + { + "epoch": 0.14507596310363538, + "grad_norm": 5.628327369689941, + "learning_rate": 8.637248184185282e-06, + "loss": 0.5986, + "step": 10695 + }, + { + "epoch": 0.14508952794357027, + "grad_norm": 5.250437259674072, + "learning_rate": 8.637111141565027e-06, + "loss": 0.3523, + "step": 10696 + }, + { + "epoch": 0.14510309278350517, + "grad_norm": 5.999332904815674, + "learning_rate": 8.636974098944772e-06, + "loss": 0.3988, + "step": 10697 + }, + { + "epoch": 0.14511665762344003, + "grad_norm": 6.864372730255127, + "learning_rate": 8.636837056324518e-06, + "loss": 0.5562, + "step": 10698 + }, + { + "epoch": 0.14513022246337492, + "grad_norm": 7.6635422706604, + "learning_rate": 8.636700013704263e-06, + "loss": 0.5737, + "step": 10699 + }, + { + "epoch": 0.14514378730330982, + "grad_norm": 5.424098014831543, + "learning_rate": 8.636562971084008e-06, + "loss": 0.4616, + "step": 10700 + }, + { + "epoch": 0.1451573521432447, + "grad_norm": 5.679169654846191, + "learning_rate": 8.636425928463753e-06, + "loss": 0.3096, + "step": 10701 + }, + { + "epoch": 0.1451709169831796, + "grad_norm": 5.361079692840576, + "learning_rate": 8.636288885843498e-06, + "loss": 0.3579, + "step": 10702 + }, + { + "epoch": 0.1451844818231145, + "grad_norm": 5.316638946533203, + "learning_rate": 8.636151843223243e-06, + "loss": 0.3928, + "step": 10703 + }, + { + "epoch": 0.14519804666304936, + "grad_norm": 5.86228084564209, + "learning_rate": 8.636014800602989e-06, + "loss": 0.4043, + "step": 10704 + }, + { + "epoch": 0.14521161150298426, + "grad_norm": 6.969207763671875, + "learning_rate": 8.635877757982734e-06, + "loss": 0.4174, + "step": 10705 + }, + { + "epoch": 0.14522517634291915, + "grad_norm": 7.349292278289795, + "learning_rate": 8.635740715362479e-06, + "loss": 0.5581, + "step": 10706 + }, + { + "epoch": 0.14523874118285404, + "grad_norm": 5.927756309509277, + "learning_rate": 8.635603672742223e-06, + "loss": 0.4255, + "step": 10707 + }, + { + "epoch": 0.14525230602278894, + "grad_norm": 6.410634517669678, + "learning_rate": 8.63546663012197e-06, + "loss": 0.4104, + "step": 10708 + }, + { + "epoch": 0.14526587086272383, + "grad_norm": 7.2202067375183105, + "learning_rate": 8.635329587501715e-06, + "loss": 0.4643, + "step": 10709 + }, + { + "epoch": 0.14527943570265872, + "grad_norm": 4.7444963455200195, + "learning_rate": 8.635192544881458e-06, + "loss": 0.3037, + "step": 10710 + }, + { + "epoch": 0.1452930005425936, + "grad_norm": 6.952388286590576, + "learning_rate": 8.635055502261203e-06, + "loss": 0.4499, + "step": 10711 + }, + { + "epoch": 0.14530656538252848, + "grad_norm": 7.275516033172607, + "learning_rate": 8.634918459640948e-06, + "loss": 0.3638, + "step": 10712 + }, + { + "epoch": 0.14532013022246337, + "grad_norm": 6.936154842376709, + "learning_rate": 8.634781417020695e-06, + "loss": 0.4807, + "step": 10713 + }, + { + "epoch": 0.14533369506239827, + "grad_norm": 6.184232234954834, + "learning_rate": 8.634644374400439e-06, + "loss": 0.4077, + "step": 10714 + }, + { + "epoch": 0.14534725990233316, + "grad_norm": 4.624492168426514, + "learning_rate": 8.634507331780184e-06, + "loss": 0.2692, + "step": 10715 + }, + { + "epoch": 0.14536082474226805, + "grad_norm": 7.173182010650635, + "learning_rate": 8.634370289159929e-06, + "loss": 0.3272, + "step": 10716 + }, + { + "epoch": 0.14537438958220292, + "grad_norm": 6.510496616363525, + "learning_rate": 8.634233246539674e-06, + "loss": 0.3675, + "step": 10717 + }, + { + "epoch": 0.1453879544221378, + "grad_norm": 5.8282952308654785, + "learning_rate": 8.63409620391942e-06, + "loss": 0.3898, + "step": 10718 + }, + { + "epoch": 0.1454015192620727, + "grad_norm": 4.831150531768799, + "learning_rate": 8.633959161299165e-06, + "loss": 0.3292, + "step": 10719 + }, + { + "epoch": 0.1454150841020076, + "grad_norm": 4.506537437438965, + "learning_rate": 8.63382211867891e-06, + "loss": 0.2419, + "step": 10720 + }, + { + "epoch": 0.1454286489419425, + "grad_norm": 5.4812421798706055, + "learning_rate": 8.633685076058655e-06, + "loss": 0.4837, + "step": 10721 + }, + { + "epoch": 0.14544221378187738, + "grad_norm": 4.314565181732178, + "learning_rate": 8.6335480334384e-06, + "loss": 0.2568, + "step": 10722 + }, + { + "epoch": 0.14545577862181228, + "grad_norm": 4.990719318389893, + "learning_rate": 8.633410990818145e-06, + "loss": 0.3069, + "step": 10723 + }, + { + "epoch": 0.14546934346174714, + "grad_norm": 5.933165550231934, + "learning_rate": 8.63327394819789e-06, + "loss": 0.3716, + "step": 10724 + }, + { + "epoch": 0.14548290830168203, + "grad_norm": 6.555728912353516, + "learning_rate": 8.633136905577634e-06, + "loss": 0.3528, + "step": 10725 + }, + { + "epoch": 0.14549647314161693, + "grad_norm": 4.88176965713501, + "learning_rate": 8.632999862957381e-06, + "loss": 0.3182, + "step": 10726 + }, + { + "epoch": 0.14551003798155182, + "grad_norm": 5.757215976715088, + "learning_rate": 8.632862820337126e-06, + "loss": 0.3553, + "step": 10727 + }, + { + "epoch": 0.14552360282148671, + "grad_norm": 7.251237869262695, + "learning_rate": 8.632725777716871e-06, + "loss": 0.298, + "step": 10728 + }, + { + "epoch": 0.1455371676614216, + "grad_norm": 4.927096843719482, + "learning_rate": 8.632588735096615e-06, + "loss": 0.2995, + "step": 10729 + }, + { + "epoch": 0.14555073250135647, + "grad_norm": 6.151144504547119, + "learning_rate": 8.632451692476362e-06, + "loss": 0.2861, + "step": 10730 + }, + { + "epoch": 0.14556429734129137, + "grad_norm": 4.7220611572265625, + "learning_rate": 8.632314649856107e-06, + "loss": 0.222, + "step": 10731 + }, + { + "epoch": 0.14557786218122626, + "grad_norm": 5.5195112228393555, + "learning_rate": 8.63217760723585e-06, + "loss": 0.4876, + "step": 10732 + }, + { + "epoch": 0.14559142702116115, + "grad_norm": 4.33803129196167, + "learning_rate": 8.632040564615595e-06, + "loss": 0.388, + "step": 10733 + }, + { + "epoch": 0.14560499186109604, + "grad_norm": 5.192509651184082, + "learning_rate": 8.631903521995342e-06, + "loss": 0.2844, + "step": 10734 + }, + { + "epoch": 0.14561855670103094, + "grad_norm": 4.717794895172119, + "learning_rate": 8.631766479375086e-06, + "loss": 0.2539, + "step": 10735 + }, + { + "epoch": 0.1456321215409658, + "grad_norm": 5.841053485870361, + "learning_rate": 8.631629436754831e-06, + "loss": 0.2903, + "step": 10736 + }, + { + "epoch": 0.1456456863809007, + "grad_norm": 6.138232707977295, + "learning_rate": 8.631492394134576e-06, + "loss": 0.4325, + "step": 10737 + }, + { + "epoch": 0.1456592512208356, + "grad_norm": 6.0289225578308105, + "learning_rate": 8.631355351514321e-06, + "loss": 0.343, + "step": 10738 + }, + { + "epoch": 0.14567281606077048, + "grad_norm": 7.472414970397949, + "learning_rate": 8.631218308894067e-06, + "loss": 0.5521, + "step": 10739 + }, + { + "epoch": 0.14568638090070538, + "grad_norm": 6.22152853012085, + "learning_rate": 8.631081266273812e-06, + "loss": 0.3558, + "step": 10740 + }, + { + "epoch": 0.14569994574064027, + "grad_norm": 5.345506191253662, + "learning_rate": 8.630944223653557e-06, + "loss": 0.2862, + "step": 10741 + }, + { + "epoch": 0.14571351058057516, + "grad_norm": 4.185894012451172, + "learning_rate": 8.630807181033302e-06, + "loss": 0.3606, + "step": 10742 + }, + { + "epoch": 0.14572707542051003, + "grad_norm": 6.27175235748291, + "learning_rate": 8.630670138413047e-06, + "loss": 0.344, + "step": 10743 + }, + { + "epoch": 0.14574064026044492, + "grad_norm": 5.50347900390625, + "learning_rate": 8.630533095792792e-06, + "loss": 0.3028, + "step": 10744 + }, + { + "epoch": 0.1457542051003798, + "grad_norm": 8.491273880004883, + "learning_rate": 8.630396053172538e-06, + "loss": 0.3987, + "step": 10745 + }, + { + "epoch": 0.1457677699403147, + "grad_norm": 5.824460983276367, + "learning_rate": 8.630259010552283e-06, + "loss": 0.2974, + "step": 10746 + }, + { + "epoch": 0.1457813347802496, + "grad_norm": 4.551273822784424, + "learning_rate": 8.630121967932028e-06, + "loss": 0.3806, + "step": 10747 + }, + { + "epoch": 0.1457948996201845, + "grad_norm": 5.128396511077881, + "learning_rate": 8.629984925311773e-06, + "loss": 0.3591, + "step": 10748 + }, + { + "epoch": 0.14580846446011936, + "grad_norm": 6.648977756500244, + "learning_rate": 8.629847882691518e-06, + "loss": 0.4613, + "step": 10749 + }, + { + "epoch": 0.14582202930005425, + "grad_norm": 4.301287651062012, + "learning_rate": 8.629710840071262e-06, + "loss": 0.2847, + "step": 10750 + }, + { + "epoch": 0.14583559413998914, + "grad_norm": 5.267543315887451, + "learning_rate": 8.629573797451007e-06, + "loss": 0.3814, + "step": 10751 + }, + { + "epoch": 0.14584915897992404, + "grad_norm": 5.431938171386719, + "learning_rate": 8.629436754830754e-06, + "loss": 0.3885, + "step": 10752 + }, + { + "epoch": 0.14586272381985893, + "grad_norm": 5.910903453826904, + "learning_rate": 8.629299712210499e-06, + "loss": 0.4612, + "step": 10753 + }, + { + "epoch": 0.14587628865979382, + "grad_norm": 6.5381178855896, + "learning_rate": 8.629162669590243e-06, + "loss": 0.4478, + "step": 10754 + }, + { + "epoch": 0.14588985349972872, + "grad_norm": 7.129181385040283, + "learning_rate": 8.629025626969988e-06, + "loss": 0.4472, + "step": 10755 + }, + { + "epoch": 0.14590341833966358, + "grad_norm": 6.456555366516113, + "learning_rate": 8.628888584349735e-06, + "loss": 0.3244, + "step": 10756 + }, + { + "epoch": 0.14591698317959848, + "grad_norm": 5.711275100708008, + "learning_rate": 8.628751541729478e-06, + "loss": 0.5633, + "step": 10757 + }, + { + "epoch": 0.14593054801953337, + "grad_norm": 4.652919292449951, + "learning_rate": 8.628614499109223e-06, + "loss": 0.337, + "step": 10758 + }, + { + "epoch": 0.14594411285946826, + "grad_norm": 6.757781982421875, + "learning_rate": 8.628477456488968e-06, + "loss": 0.3845, + "step": 10759 + }, + { + "epoch": 0.14595767769940315, + "grad_norm": 6.0658278465271, + "learning_rate": 8.628340413868714e-06, + "loss": 0.3651, + "step": 10760 + }, + { + "epoch": 0.14597124253933805, + "grad_norm": 5.553519248962402, + "learning_rate": 8.628203371248459e-06, + "loss": 0.3246, + "step": 10761 + }, + { + "epoch": 0.1459848073792729, + "grad_norm": 8.057621955871582, + "learning_rate": 8.628066328628204e-06, + "loss": 0.3785, + "step": 10762 + }, + { + "epoch": 0.1459983722192078, + "grad_norm": 4.735253810882568, + "learning_rate": 8.627929286007949e-06, + "loss": 0.3159, + "step": 10763 + }, + { + "epoch": 0.1460119370591427, + "grad_norm": 6.111423492431641, + "learning_rate": 8.627792243387694e-06, + "loss": 0.5106, + "step": 10764 + }, + { + "epoch": 0.1460255018990776, + "grad_norm": 4.775132656097412, + "learning_rate": 8.62765520076744e-06, + "loss": 0.4181, + "step": 10765 + }, + { + "epoch": 0.14603906673901249, + "grad_norm": 5.421469688415527, + "learning_rate": 8.627518158147185e-06, + "loss": 0.3257, + "step": 10766 + }, + { + "epoch": 0.14605263157894738, + "grad_norm": 9.34963321685791, + "learning_rate": 8.62738111552693e-06, + "loss": 0.5622, + "step": 10767 + }, + { + "epoch": 0.14606619641888224, + "grad_norm": 5.185481071472168, + "learning_rate": 8.627244072906675e-06, + "loss": 0.2374, + "step": 10768 + }, + { + "epoch": 0.14607976125881714, + "grad_norm": 7.6729302406311035, + "learning_rate": 8.62710703028642e-06, + "loss": 0.4975, + "step": 10769 + }, + { + "epoch": 0.14609332609875203, + "grad_norm": 7.646551609039307, + "learning_rate": 8.626969987666165e-06, + "loss": 0.3457, + "step": 10770 + }, + { + "epoch": 0.14610689093868692, + "grad_norm": 6.3849101066589355, + "learning_rate": 8.62683294504591e-06, + "loss": 0.4225, + "step": 10771 + }, + { + "epoch": 0.14612045577862182, + "grad_norm": 6.987218379974365, + "learning_rate": 8.626695902425654e-06, + "loss": 0.2604, + "step": 10772 + }, + { + "epoch": 0.1461340206185567, + "grad_norm": 6.37108039855957, + "learning_rate": 8.626558859805401e-06, + "loss": 0.4274, + "step": 10773 + }, + { + "epoch": 0.1461475854584916, + "grad_norm": 7.905296802520752, + "learning_rate": 8.626421817185146e-06, + "loss": 0.5764, + "step": 10774 + }, + { + "epoch": 0.14616115029842647, + "grad_norm": 6.819881916046143, + "learning_rate": 8.62628477456489e-06, + "loss": 0.3028, + "step": 10775 + }, + { + "epoch": 0.14617471513836136, + "grad_norm": 6.2443156242370605, + "learning_rate": 8.626147731944635e-06, + "loss": 0.3443, + "step": 10776 + }, + { + "epoch": 0.14618827997829625, + "grad_norm": 6.415040969848633, + "learning_rate": 8.626010689324382e-06, + "loss": 0.4075, + "step": 10777 + }, + { + "epoch": 0.14620184481823115, + "grad_norm": 5.416011333465576, + "learning_rate": 8.625873646704125e-06, + "loss": 0.252, + "step": 10778 + }, + { + "epoch": 0.14621540965816604, + "grad_norm": 7.774909973144531, + "learning_rate": 8.62573660408387e-06, + "loss": 0.5081, + "step": 10779 + }, + { + "epoch": 0.14622897449810093, + "grad_norm": 7.2113447189331055, + "learning_rate": 8.625599561463615e-06, + "loss": 0.514, + "step": 10780 + }, + { + "epoch": 0.1462425393380358, + "grad_norm": 7.0753936767578125, + "learning_rate": 8.62546251884336e-06, + "loss": 0.3708, + "step": 10781 + }, + { + "epoch": 0.1462561041779707, + "grad_norm": 8.098502159118652, + "learning_rate": 8.625325476223106e-06, + "loss": 0.5615, + "step": 10782 + }, + { + "epoch": 0.14626966901790558, + "grad_norm": 5.489559173583984, + "learning_rate": 8.625188433602851e-06, + "loss": 0.3345, + "step": 10783 + }, + { + "epoch": 0.14628323385784048, + "grad_norm": 6.196364879608154, + "learning_rate": 8.625051390982596e-06, + "loss": 0.2694, + "step": 10784 + }, + { + "epoch": 0.14629679869777537, + "grad_norm": 5.650742053985596, + "learning_rate": 8.624914348362341e-06, + "loss": 0.3337, + "step": 10785 + }, + { + "epoch": 0.14631036353771026, + "grad_norm": 6.9135870933532715, + "learning_rate": 8.624777305742087e-06, + "loss": 0.4503, + "step": 10786 + }, + { + "epoch": 0.14632392837764516, + "grad_norm": 4.978184223175049, + "learning_rate": 8.624640263121832e-06, + "loss": 0.271, + "step": 10787 + }, + { + "epoch": 0.14633749321758002, + "grad_norm": 5.712184906005859, + "learning_rate": 8.624503220501577e-06, + "loss": 0.2975, + "step": 10788 + }, + { + "epoch": 0.14635105805751492, + "grad_norm": 5.730088710784912, + "learning_rate": 8.624366177881322e-06, + "loss": 0.2827, + "step": 10789 + }, + { + "epoch": 0.1463646228974498, + "grad_norm": 4.618049144744873, + "learning_rate": 8.624229135261067e-06, + "loss": 0.3618, + "step": 10790 + }, + { + "epoch": 0.1463781877373847, + "grad_norm": 6.991499900817871, + "learning_rate": 8.624092092640812e-06, + "loss": 0.3342, + "step": 10791 + }, + { + "epoch": 0.1463917525773196, + "grad_norm": 5.629237651824951, + "learning_rate": 8.623955050020558e-06, + "loss": 0.2785, + "step": 10792 + }, + { + "epoch": 0.1464053174172545, + "grad_norm": 5.629465579986572, + "learning_rate": 8.623818007400301e-06, + "loss": 0.407, + "step": 10793 + }, + { + "epoch": 0.14641888225718935, + "grad_norm": 4.421086311340332, + "learning_rate": 8.623680964780046e-06, + "loss": 0.2803, + "step": 10794 + }, + { + "epoch": 0.14643244709712425, + "grad_norm": 5.616604804992676, + "learning_rate": 8.623543922159793e-06, + "loss": 0.3378, + "step": 10795 + }, + { + "epoch": 0.14644601193705914, + "grad_norm": 4.83960485458374, + "learning_rate": 8.623406879539538e-06, + "loss": 0.3127, + "step": 10796 + }, + { + "epoch": 0.14645957677699403, + "grad_norm": 5.798310279846191, + "learning_rate": 8.623269836919282e-06, + "loss": 0.3056, + "step": 10797 + }, + { + "epoch": 0.14647314161692893, + "grad_norm": 5.195356369018555, + "learning_rate": 8.623132794299027e-06, + "loss": 0.4798, + "step": 10798 + }, + { + "epoch": 0.14648670645686382, + "grad_norm": 4.705187797546387, + "learning_rate": 8.622995751678774e-06, + "loss": 0.2536, + "step": 10799 + }, + { + "epoch": 0.14650027129679868, + "grad_norm": 7.286779403686523, + "learning_rate": 8.622858709058517e-06, + "loss": 0.381, + "step": 10800 + }, + { + "epoch": 0.14651383613673358, + "grad_norm": 4.288989067077637, + "learning_rate": 8.622721666438263e-06, + "loss": 0.2221, + "step": 10801 + }, + { + "epoch": 0.14652740097666847, + "grad_norm": 5.507090091705322, + "learning_rate": 8.622584623818008e-06, + "loss": 0.2596, + "step": 10802 + }, + { + "epoch": 0.14654096581660336, + "grad_norm": 4.890621662139893, + "learning_rate": 8.622447581197753e-06, + "loss": 0.3256, + "step": 10803 + }, + { + "epoch": 0.14655453065653826, + "grad_norm": 7.9748945236206055, + "learning_rate": 8.622310538577498e-06, + "loss": 0.3766, + "step": 10804 + }, + { + "epoch": 0.14656809549647315, + "grad_norm": 5.189251899719238, + "learning_rate": 8.622173495957243e-06, + "loss": 0.1433, + "step": 10805 + }, + { + "epoch": 0.14658166033640804, + "grad_norm": 5.067616939544678, + "learning_rate": 8.622036453336988e-06, + "loss": 0.3363, + "step": 10806 + }, + { + "epoch": 0.1465952251763429, + "grad_norm": 6.959494113922119, + "learning_rate": 8.621899410716734e-06, + "loss": 0.4355, + "step": 10807 + }, + { + "epoch": 0.1466087900162778, + "grad_norm": 6.695410251617432, + "learning_rate": 8.621762368096479e-06, + "loss": 0.425, + "step": 10808 + }, + { + "epoch": 0.1466223548562127, + "grad_norm": 4.6331682205200195, + "learning_rate": 8.621625325476224e-06, + "loss": 0.2633, + "step": 10809 + }, + { + "epoch": 0.1466359196961476, + "grad_norm": 4.92287540435791, + "learning_rate": 8.62148828285597e-06, + "loss": 0.3646, + "step": 10810 + }, + { + "epoch": 0.14664948453608248, + "grad_norm": 5.491153717041016, + "learning_rate": 8.621351240235714e-06, + "loss": 0.2629, + "step": 10811 + }, + { + "epoch": 0.14666304937601737, + "grad_norm": 5.585596561431885, + "learning_rate": 8.62121419761546e-06, + "loss": 0.3538, + "step": 10812 + }, + { + "epoch": 0.14667661421595224, + "grad_norm": 7.526704788208008, + "learning_rate": 8.621077154995205e-06, + "loss": 0.5166, + "step": 10813 + }, + { + "epoch": 0.14669017905588713, + "grad_norm": 4.273128032684326, + "learning_rate": 8.62094011237495e-06, + "loss": 0.2295, + "step": 10814 + }, + { + "epoch": 0.14670374389582203, + "grad_norm": 4.3619279861450195, + "learning_rate": 8.620803069754693e-06, + "loss": 0.2781, + "step": 10815 + }, + { + "epoch": 0.14671730873575692, + "grad_norm": 6.2507243156433105, + "learning_rate": 8.62066602713444e-06, + "loss": 0.4092, + "step": 10816 + }, + { + "epoch": 0.1467308735756918, + "grad_norm": 7.323467254638672, + "learning_rate": 8.620528984514185e-06, + "loss": 0.322, + "step": 10817 + }, + { + "epoch": 0.1467444384156267, + "grad_norm": 5.964612007141113, + "learning_rate": 8.620391941893929e-06, + "loss": 0.3927, + "step": 10818 + }, + { + "epoch": 0.1467580032555616, + "grad_norm": 4.9194722175598145, + "learning_rate": 8.620254899273674e-06, + "loss": 0.2815, + "step": 10819 + }, + { + "epoch": 0.14677156809549646, + "grad_norm": 4.26525354385376, + "learning_rate": 8.62011785665342e-06, + "loss": 0.2315, + "step": 10820 + }, + { + "epoch": 0.14678513293543136, + "grad_norm": 4.371386528015137, + "learning_rate": 8.619980814033166e-06, + "loss": 0.3099, + "step": 10821 + }, + { + "epoch": 0.14679869777536625, + "grad_norm": 4.907731056213379, + "learning_rate": 8.61984377141291e-06, + "loss": 0.3434, + "step": 10822 + }, + { + "epoch": 0.14681226261530114, + "grad_norm": 6.696691036224365, + "learning_rate": 8.619706728792655e-06, + "loss": 0.3981, + "step": 10823 + }, + { + "epoch": 0.14682582745523604, + "grad_norm": 6.095755577087402, + "learning_rate": 8.6195696861724e-06, + "loss": 0.3637, + "step": 10824 + }, + { + "epoch": 0.14683939229517093, + "grad_norm": 5.001340866088867, + "learning_rate": 8.619432643552145e-06, + "loss": 0.2964, + "step": 10825 + }, + { + "epoch": 0.1468529571351058, + "grad_norm": 4.36555814743042, + "learning_rate": 8.61929560093189e-06, + "loss": 0.2894, + "step": 10826 + }, + { + "epoch": 0.1468665219750407, + "grad_norm": 4.8249077796936035, + "learning_rate": 8.619158558311636e-06, + "loss": 0.2351, + "step": 10827 + }, + { + "epoch": 0.14688008681497558, + "grad_norm": 4.294064044952393, + "learning_rate": 8.61902151569138e-06, + "loss": 0.3067, + "step": 10828 + }, + { + "epoch": 0.14689365165491047, + "grad_norm": 8.573663711547852, + "learning_rate": 8.618884473071126e-06, + "loss": 0.3558, + "step": 10829 + }, + { + "epoch": 0.14690721649484537, + "grad_norm": 4.872384071350098, + "learning_rate": 8.618747430450871e-06, + "loss": 0.304, + "step": 10830 + }, + { + "epoch": 0.14692078133478026, + "grad_norm": 4.50387716293335, + "learning_rate": 8.618610387830616e-06, + "loss": 0.2712, + "step": 10831 + }, + { + "epoch": 0.14693434617471512, + "grad_norm": 6.468750476837158, + "learning_rate": 8.618473345210361e-06, + "loss": 0.3479, + "step": 10832 + }, + { + "epoch": 0.14694791101465002, + "grad_norm": 4.613247394561768, + "learning_rate": 8.618336302590105e-06, + "loss": 0.3049, + "step": 10833 + }, + { + "epoch": 0.1469614758545849, + "grad_norm": 5.468775272369385, + "learning_rate": 8.618199259969852e-06, + "loss": 0.2889, + "step": 10834 + }, + { + "epoch": 0.1469750406945198, + "grad_norm": 5.7814202308654785, + "learning_rate": 8.618062217349597e-06, + "loss": 0.322, + "step": 10835 + }, + { + "epoch": 0.1469886055344547, + "grad_norm": 5.253184795379639, + "learning_rate": 8.617925174729342e-06, + "loss": 0.2853, + "step": 10836 + }, + { + "epoch": 0.1470021703743896, + "grad_norm": 5.423882484436035, + "learning_rate": 8.617788132109086e-06, + "loss": 0.2437, + "step": 10837 + }, + { + "epoch": 0.14701573521432448, + "grad_norm": 4.0054497718811035, + "learning_rate": 8.617651089488832e-06, + "loss": 0.2863, + "step": 10838 + }, + { + "epoch": 0.14702930005425935, + "grad_norm": 5.398726940155029, + "learning_rate": 8.617514046868578e-06, + "loss": 0.3624, + "step": 10839 + }, + { + "epoch": 0.14704286489419424, + "grad_norm": 5.7682108879089355, + "learning_rate": 8.617377004248321e-06, + "loss": 0.2404, + "step": 10840 + }, + { + "epoch": 0.14705642973412913, + "grad_norm": 6.251429080963135, + "learning_rate": 8.617239961628066e-06, + "loss": 0.2912, + "step": 10841 + }, + { + "epoch": 0.14706999457406403, + "grad_norm": 5.62039041519165, + "learning_rate": 8.617102919007813e-06, + "loss": 0.3338, + "step": 10842 + }, + { + "epoch": 0.14708355941399892, + "grad_norm": 5.532435894012451, + "learning_rate": 8.616965876387557e-06, + "loss": 0.3564, + "step": 10843 + }, + { + "epoch": 0.14709712425393381, + "grad_norm": 6.973025321960449, + "learning_rate": 8.616828833767302e-06, + "loss": 0.3979, + "step": 10844 + }, + { + "epoch": 0.14711068909386868, + "grad_norm": 5.074891567230225, + "learning_rate": 8.616691791147047e-06, + "loss": 0.3556, + "step": 10845 + }, + { + "epoch": 0.14712425393380357, + "grad_norm": 4.97915506362915, + "learning_rate": 8.616554748526794e-06, + "loss": 0.2581, + "step": 10846 + }, + { + "epoch": 0.14713781877373847, + "grad_norm": 5.588186740875244, + "learning_rate": 8.616417705906537e-06, + "loss": 0.292, + "step": 10847 + }, + { + "epoch": 0.14715138361367336, + "grad_norm": 6.2508745193481445, + "learning_rate": 8.616280663286283e-06, + "loss": 0.4787, + "step": 10848 + }, + { + "epoch": 0.14716494845360825, + "grad_norm": 4.880657196044922, + "learning_rate": 8.616143620666028e-06, + "loss": 0.3367, + "step": 10849 + }, + { + "epoch": 0.14717851329354315, + "grad_norm": 9.154658317565918, + "learning_rate": 8.616006578045773e-06, + "loss": 0.7885, + "step": 10850 + }, + { + "epoch": 0.14719207813347804, + "grad_norm": 5.694019317626953, + "learning_rate": 8.615869535425518e-06, + "loss": 0.343, + "step": 10851 + }, + { + "epoch": 0.1472056429734129, + "grad_norm": 4.425328254699707, + "learning_rate": 8.615732492805263e-06, + "loss": 0.2434, + "step": 10852 + }, + { + "epoch": 0.1472192078133478, + "grad_norm": 5.430419445037842, + "learning_rate": 8.615595450185008e-06, + "loss": 0.2553, + "step": 10853 + }, + { + "epoch": 0.1472327726532827, + "grad_norm": 5.102474689483643, + "learning_rate": 8.615458407564754e-06, + "loss": 0.288, + "step": 10854 + }, + { + "epoch": 0.14724633749321758, + "grad_norm": 5.846359729766846, + "learning_rate": 8.615321364944499e-06, + "loss": 0.4391, + "step": 10855 + }, + { + "epoch": 0.14725990233315248, + "grad_norm": 4.954289436340332, + "learning_rate": 8.615184322324244e-06, + "loss": 0.4276, + "step": 10856 + }, + { + "epoch": 0.14727346717308737, + "grad_norm": 6.600408554077148, + "learning_rate": 8.61504727970399e-06, + "loss": 0.38, + "step": 10857 + }, + { + "epoch": 0.14728703201302223, + "grad_norm": 6.520724296569824, + "learning_rate": 8.614910237083733e-06, + "loss": 0.5736, + "step": 10858 + }, + { + "epoch": 0.14730059685295713, + "grad_norm": 4.669440269470215, + "learning_rate": 8.61477319446348e-06, + "loss": 0.315, + "step": 10859 + }, + { + "epoch": 0.14731416169289202, + "grad_norm": 4.783502101898193, + "learning_rate": 8.614636151843225e-06, + "loss": 0.2561, + "step": 10860 + }, + { + "epoch": 0.1473277265328269, + "grad_norm": 6.68086051940918, + "learning_rate": 8.61449910922297e-06, + "loss": 0.4314, + "step": 10861 + }, + { + "epoch": 0.1473412913727618, + "grad_norm": 5.225028038024902, + "learning_rate": 8.614362066602713e-06, + "loss": 0.4771, + "step": 10862 + }, + { + "epoch": 0.1473548562126967, + "grad_norm": 6.607115745544434, + "learning_rate": 8.614225023982459e-06, + "loss": 0.525, + "step": 10863 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 5.868510723114014, + "learning_rate": 8.614087981362205e-06, + "loss": 0.2351, + "step": 10864 + }, + { + "epoch": 0.14738198589256646, + "grad_norm": 3.9286322593688965, + "learning_rate": 8.613950938741949e-06, + "loss": 0.232, + "step": 10865 + }, + { + "epoch": 0.14739555073250135, + "grad_norm": 6.011862277984619, + "learning_rate": 8.613813896121694e-06, + "loss": 0.3164, + "step": 10866 + }, + { + "epoch": 0.14740911557243624, + "grad_norm": 7.4634881019592285, + "learning_rate": 8.61367685350144e-06, + "loss": 0.5658, + "step": 10867 + }, + { + "epoch": 0.14742268041237114, + "grad_norm": 9.035993576049805, + "learning_rate": 8.613539810881184e-06, + "loss": 0.4932, + "step": 10868 + }, + { + "epoch": 0.14743624525230603, + "grad_norm": 5.705293655395508, + "learning_rate": 8.61340276826093e-06, + "loss": 0.3568, + "step": 10869 + }, + { + "epoch": 0.14744981009224092, + "grad_norm": 6.483756065368652, + "learning_rate": 8.613265725640675e-06, + "loss": 0.4417, + "step": 10870 + }, + { + "epoch": 0.1474633749321758, + "grad_norm": 5.0786848068237305, + "learning_rate": 8.61312868302042e-06, + "loss": 0.2315, + "step": 10871 + }, + { + "epoch": 0.14747693977211068, + "grad_norm": 7.116804122924805, + "learning_rate": 8.612991640400165e-06, + "loss": 0.3594, + "step": 10872 + }, + { + "epoch": 0.14749050461204558, + "grad_norm": 5.100592613220215, + "learning_rate": 8.61285459777991e-06, + "loss": 0.3662, + "step": 10873 + }, + { + "epoch": 0.14750406945198047, + "grad_norm": 6.578904628753662, + "learning_rate": 8.612717555159656e-06, + "loss": 0.3635, + "step": 10874 + }, + { + "epoch": 0.14751763429191536, + "grad_norm": 7.300057411193848, + "learning_rate": 8.6125805125394e-06, + "loss": 0.4037, + "step": 10875 + }, + { + "epoch": 0.14753119913185025, + "grad_norm": 5.458874702453613, + "learning_rate": 8.612443469919146e-06, + "loss": 0.2676, + "step": 10876 + }, + { + "epoch": 0.14754476397178512, + "grad_norm": 7.500325679779053, + "learning_rate": 8.612306427298891e-06, + "loss": 0.3695, + "step": 10877 + }, + { + "epoch": 0.14755832881172, + "grad_norm": 6.6129021644592285, + "learning_rate": 8.612169384678636e-06, + "loss": 0.4161, + "step": 10878 + }, + { + "epoch": 0.1475718936516549, + "grad_norm": 5.814011096954346, + "learning_rate": 8.612032342058381e-06, + "loss": 0.3172, + "step": 10879 + }, + { + "epoch": 0.1475854584915898, + "grad_norm": 7.141301155090332, + "learning_rate": 8.611895299438125e-06, + "loss": 0.2442, + "step": 10880 + }, + { + "epoch": 0.1475990233315247, + "grad_norm": 5.556678771972656, + "learning_rate": 8.611758256817872e-06, + "loss": 0.328, + "step": 10881 + }, + { + "epoch": 0.14761258817145959, + "grad_norm": 4.677433967590332, + "learning_rate": 8.611621214197617e-06, + "loss": 0.2806, + "step": 10882 + }, + { + "epoch": 0.14762615301139448, + "grad_norm": 5.6331787109375, + "learning_rate": 8.61148417157736e-06, + "loss": 0.2855, + "step": 10883 + }, + { + "epoch": 0.14763971785132934, + "grad_norm": 6.787789344787598, + "learning_rate": 8.611347128957106e-06, + "loss": 0.2892, + "step": 10884 + }, + { + "epoch": 0.14765328269126424, + "grad_norm": 5.784031867980957, + "learning_rate": 8.611210086336852e-06, + "loss": 0.428, + "step": 10885 + }, + { + "epoch": 0.14766684753119913, + "grad_norm": 5.948359966278076, + "learning_rate": 8.611073043716596e-06, + "loss": 0.3534, + "step": 10886 + }, + { + "epoch": 0.14768041237113402, + "grad_norm": 6.834518909454346, + "learning_rate": 8.610936001096341e-06, + "loss": 0.3743, + "step": 10887 + }, + { + "epoch": 0.14769397721106892, + "grad_norm": 4.833123207092285, + "learning_rate": 8.610798958476086e-06, + "loss": 0.2447, + "step": 10888 + }, + { + "epoch": 0.1477075420510038, + "grad_norm": 6.349453449249268, + "learning_rate": 8.610661915855832e-06, + "loss": 0.2961, + "step": 10889 + }, + { + "epoch": 0.14772110689093867, + "grad_norm": 4.860767841339111, + "learning_rate": 8.610524873235577e-06, + "loss": 0.2576, + "step": 10890 + }, + { + "epoch": 0.14773467173087357, + "grad_norm": 7.77432107925415, + "learning_rate": 8.610387830615322e-06, + "loss": 0.3823, + "step": 10891 + }, + { + "epoch": 0.14774823657080846, + "grad_norm": 7.4119110107421875, + "learning_rate": 8.610250787995067e-06, + "loss": 0.3503, + "step": 10892 + }, + { + "epoch": 0.14776180141074335, + "grad_norm": 4.115249156951904, + "learning_rate": 8.610113745374812e-06, + "loss": 0.2182, + "step": 10893 + }, + { + "epoch": 0.14777536625067825, + "grad_norm": 4.5958380699157715, + "learning_rate": 8.609976702754557e-06, + "loss": 0.3201, + "step": 10894 + }, + { + "epoch": 0.14778893109061314, + "grad_norm": 4.051063537597656, + "learning_rate": 8.609839660134303e-06, + "loss": 0.1903, + "step": 10895 + }, + { + "epoch": 0.147802495930548, + "grad_norm": 4.885313034057617, + "learning_rate": 8.609702617514048e-06, + "loss": 0.2236, + "step": 10896 + }, + { + "epoch": 0.1478160607704829, + "grad_norm": 5.101890563964844, + "learning_rate": 8.609565574893793e-06, + "loss": 0.2508, + "step": 10897 + }, + { + "epoch": 0.1478296256104178, + "grad_norm": 7.957259178161621, + "learning_rate": 8.609428532273538e-06, + "loss": 0.3648, + "step": 10898 + }, + { + "epoch": 0.14784319045035269, + "grad_norm": 7.177188873291016, + "learning_rate": 8.609291489653283e-06, + "loss": 0.3423, + "step": 10899 + }, + { + "epoch": 0.14785675529028758, + "grad_norm": 5.652586460113525, + "learning_rate": 8.609154447033028e-06, + "loss": 0.3376, + "step": 10900 + }, + { + "epoch": 0.14787032013022247, + "grad_norm": 4.777293682098389, + "learning_rate": 8.609017404412772e-06, + "loss": 0.3636, + "step": 10901 + }, + { + "epoch": 0.14788388497015736, + "grad_norm": 6.280613422393799, + "learning_rate": 8.608880361792517e-06, + "loss": 0.3333, + "step": 10902 + }, + { + "epoch": 0.14789744981009223, + "grad_norm": 4.705934524536133, + "learning_rate": 8.608743319172264e-06, + "loss": 0.2769, + "step": 10903 + }, + { + "epoch": 0.14791101465002712, + "grad_norm": 7.22348690032959, + "learning_rate": 8.60860627655201e-06, + "loss": 0.468, + "step": 10904 + }, + { + "epoch": 0.14792457948996202, + "grad_norm": 4.704843997955322, + "learning_rate": 8.608469233931753e-06, + "loss": 0.2732, + "step": 10905 + }, + { + "epoch": 0.1479381443298969, + "grad_norm": 4.7931060791015625, + "learning_rate": 8.608332191311498e-06, + "loss": 0.2943, + "step": 10906 + }, + { + "epoch": 0.1479517091698318, + "grad_norm": 5.228592395782471, + "learning_rate": 8.608195148691245e-06, + "loss": 0.3891, + "step": 10907 + }, + { + "epoch": 0.1479652740097667, + "grad_norm": 5.306735515594482, + "learning_rate": 8.608058106070988e-06, + "loss": 0.3613, + "step": 10908 + }, + { + "epoch": 0.14797883884970156, + "grad_norm": 5.582620620727539, + "learning_rate": 8.607921063450733e-06, + "loss": 0.4389, + "step": 10909 + }, + { + "epoch": 0.14799240368963645, + "grad_norm": 6.51298713684082, + "learning_rate": 8.607784020830479e-06, + "loss": 0.402, + "step": 10910 + }, + { + "epoch": 0.14800596852957135, + "grad_norm": 5.430807113647461, + "learning_rate": 8.607646978210224e-06, + "loss": 0.4189, + "step": 10911 + }, + { + "epoch": 0.14801953336950624, + "grad_norm": 4.996541500091553, + "learning_rate": 8.607509935589969e-06, + "loss": 0.3632, + "step": 10912 + }, + { + "epoch": 0.14803309820944113, + "grad_norm": 4.236408710479736, + "learning_rate": 8.607372892969714e-06, + "loss": 0.1739, + "step": 10913 + }, + { + "epoch": 0.14804666304937603, + "grad_norm": 4.715654373168945, + "learning_rate": 8.60723585034946e-06, + "loss": 0.281, + "step": 10914 + }, + { + "epoch": 0.14806022788931092, + "grad_norm": 5.431653022766113, + "learning_rate": 8.607098807729204e-06, + "loss": 0.33, + "step": 10915 + }, + { + "epoch": 0.14807379272924578, + "grad_norm": 6.842035293579102, + "learning_rate": 8.60696176510895e-06, + "loss": 0.3832, + "step": 10916 + }, + { + "epoch": 0.14808735756918068, + "grad_norm": 4.567404270172119, + "learning_rate": 8.606824722488695e-06, + "loss": 0.3589, + "step": 10917 + }, + { + "epoch": 0.14810092240911557, + "grad_norm": 5.020670413970947, + "learning_rate": 8.60668767986844e-06, + "loss": 0.3943, + "step": 10918 + }, + { + "epoch": 0.14811448724905046, + "grad_norm": 6.680320739746094, + "learning_rate": 8.606550637248185e-06, + "loss": 0.3789, + "step": 10919 + }, + { + "epoch": 0.14812805208898536, + "grad_norm": 4.8026933670043945, + "learning_rate": 8.60641359462793e-06, + "loss": 0.2575, + "step": 10920 + }, + { + "epoch": 0.14814161692892025, + "grad_norm": 5.400325775146484, + "learning_rate": 8.606276552007676e-06, + "loss": 0.3525, + "step": 10921 + }, + { + "epoch": 0.14815518176885512, + "grad_norm": 6.639288902282715, + "learning_rate": 8.60613950938742e-06, + "loss": 0.3792, + "step": 10922 + }, + { + "epoch": 0.14816874660879, + "grad_norm": 6.823256492614746, + "learning_rate": 8.606002466767164e-06, + "loss": 0.3238, + "step": 10923 + }, + { + "epoch": 0.1481823114487249, + "grad_norm": 5.696567058563232, + "learning_rate": 8.605865424146911e-06, + "loss": 0.4265, + "step": 10924 + }, + { + "epoch": 0.1481958762886598, + "grad_norm": 6.082564830780029, + "learning_rate": 8.605728381526656e-06, + "loss": 0.3468, + "step": 10925 + }, + { + "epoch": 0.1482094411285947, + "grad_norm": 7.639197826385498, + "learning_rate": 8.6055913389064e-06, + "loss": 0.5118, + "step": 10926 + }, + { + "epoch": 0.14822300596852958, + "grad_norm": 7.767913818359375, + "learning_rate": 8.605454296286145e-06, + "loss": 0.497, + "step": 10927 + }, + { + "epoch": 0.14823657080846447, + "grad_norm": 5.641254425048828, + "learning_rate": 8.605317253665892e-06, + "loss": 0.3992, + "step": 10928 + }, + { + "epoch": 0.14825013564839934, + "grad_norm": 6.9984917640686035, + "learning_rate": 8.605180211045637e-06, + "loss": 0.4508, + "step": 10929 + }, + { + "epoch": 0.14826370048833423, + "grad_norm": 4.597808361053467, + "learning_rate": 8.60504316842538e-06, + "loss": 0.4084, + "step": 10930 + }, + { + "epoch": 0.14827726532826913, + "grad_norm": 6.148552894592285, + "learning_rate": 8.604906125805126e-06, + "loss": 0.4061, + "step": 10931 + }, + { + "epoch": 0.14829083016820402, + "grad_norm": 8.385997772216797, + "learning_rate": 8.60476908318487e-06, + "loss": 0.4644, + "step": 10932 + }, + { + "epoch": 0.1483043950081389, + "grad_norm": 5.918285846710205, + "learning_rate": 8.604632040564616e-06, + "loss": 0.4213, + "step": 10933 + }, + { + "epoch": 0.1483179598480738, + "grad_norm": 10.543071746826172, + "learning_rate": 8.604494997944361e-06, + "loss": 0.3265, + "step": 10934 + }, + { + "epoch": 0.14833152468800867, + "grad_norm": 4.711264610290527, + "learning_rate": 8.604357955324106e-06, + "loss": 0.327, + "step": 10935 + }, + { + "epoch": 0.14834508952794356, + "grad_norm": 6.376436710357666, + "learning_rate": 8.604220912703852e-06, + "loss": 0.4266, + "step": 10936 + }, + { + "epoch": 0.14835865436787846, + "grad_norm": 6.774467468261719, + "learning_rate": 8.604083870083597e-06, + "loss": 0.6069, + "step": 10937 + }, + { + "epoch": 0.14837221920781335, + "grad_norm": 5.549246311187744, + "learning_rate": 8.603946827463342e-06, + "loss": 0.3505, + "step": 10938 + }, + { + "epoch": 0.14838578404774824, + "grad_norm": 8.082237243652344, + "learning_rate": 8.603809784843087e-06, + "loss": 0.6251, + "step": 10939 + }, + { + "epoch": 0.14839934888768314, + "grad_norm": 6.21284818649292, + "learning_rate": 8.603672742222832e-06, + "loss": 0.4019, + "step": 10940 + }, + { + "epoch": 0.148412913727618, + "grad_norm": 6.233545303344727, + "learning_rate": 8.603535699602577e-06, + "loss": 0.5542, + "step": 10941 + }, + { + "epoch": 0.1484264785675529, + "grad_norm": 7.513850212097168, + "learning_rate": 8.603398656982323e-06, + "loss": 0.3977, + "step": 10942 + }, + { + "epoch": 0.1484400434074878, + "grad_norm": 6.874285697937012, + "learning_rate": 8.603261614362068e-06, + "loss": 0.6741, + "step": 10943 + }, + { + "epoch": 0.14845360824742268, + "grad_norm": 5.611144542694092, + "learning_rate": 8.603124571741813e-06, + "loss": 0.3242, + "step": 10944 + }, + { + "epoch": 0.14846717308735757, + "grad_norm": 6.592973709106445, + "learning_rate": 8.602987529121556e-06, + "loss": 0.4922, + "step": 10945 + }, + { + "epoch": 0.14848073792729247, + "grad_norm": 6.002517223358154, + "learning_rate": 8.602850486501303e-06, + "loss": 0.4607, + "step": 10946 + }, + { + "epoch": 0.14849430276722736, + "grad_norm": 6.661028861999512, + "learning_rate": 8.602713443881048e-06, + "loss": 0.435, + "step": 10947 + }, + { + "epoch": 0.14850786760716223, + "grad_norm": 5.132875442504883, + "learning_rate": 8.602576401260792e-06, + "loss": 0.4485, + "step": 10948 + }, + { + "epoch": 0.14852143244709712, + "grad_norm": 5.873366832733154, + "learning_rate": 8.602439358640537e-06, + "loss": 0.4035, + "step": 10949 + }, + { + "epoch": 0.148534997287032, + "grad_norm": 7.1835527420043945, + "learning_rate": 8.602302316020284e-06, + "loss": 0.4523, + "step": 10950 + }, + { + "epoch": 0.1485485621269669, + "grad_norm": 5.730925559997559, + "learning_rate": 8.602165273400028e-06, + "loss": 0.4234, + "step": 10951 + }, + { + "epoch": 0.1485621269669018, + "grad_norm": 5.707712650299072, + "learning_rate": 8.602028230779773e-06, + "loss": 0.4191, + "step": 10952 + }, + { + "epoch": 0.1485756918068367, + "grad_norm": 8.75283145904541, + "learning_rate": 8.601891188159518e-06, + "loss": 0.542, + "step": 10953 + }, + { + "epoch": 0.14858925664677156, + "grad_norm": 5.616368293762207, + "learning_rate": 8.601754145539265e-06, + "loss": 0.4589, + "step": 10954 + }, + { + "epoch": 0.14860282148670645, + "grad_norm": 5.1427812576293945, + "learning_rate": 8.601617102919008e-06, + "loss": 0.3198, + "step": 10955 + }, + { + "epoch": 0.14861638632664134, + "grad_norm": 7.884969711303711, + "learning_rate": 8.601480060298753e-06, + "loss": 0.3687, + "step": 10956 + }, + { + "epoch": 0.14862995116657624, + "grad_norm": 6.186650276184082, + "learning_rate": 8.601343017678499e-06, + "loss": 0.4065, + "step": 10957 + }, + { + "epoch": 0.14864351600651113, + "grad_norm": 6.097390174865723, + "learning_rate": 8.601205975058244e-06, + "loss": 0.391, + "step": 10958 + }, + { + "epoch": 0.14865708084644602, + "grad_norm": 10.553999900817871, + "learning_rate": 8.601068932437989e-06, + "loss": 0.5405, + "step": 10959 + }, + { + "epoch": 0.14867064568638091, + "grad_norm": 4.411770343780518, + "learning_rate": 8.600931889817734e-06, + "loss": 0.3788, + "step": 10960 + }, + { + "epoch": 0.14868421052631578, + "grad_norm": 8.126697540283203, + "learning_rate": 8.60079484719748e-06, + "loss": 0.6626, + "step": 10961 + }, + { + "epoch": 0.14869777536625067, + "grad_norm": 5.481838703155518, + "learning_rate": 8.600657804577224e-06, + "loss": 0.3642, + "step": 10962 + }, + { + "epoch": 0.14871134020618557, + "grad_norm": 6.132723808288574, + "learning_rate": 8.60052076195697e-06, + "loss": 0.3859, + "step": 10963 + }, + { + "epoch": 0.14872490504612046, + "grad_norm": 8.547136306762695, + "learning_rate": 8.600383719336715e-06, + "loss": 0.5539, + "step": 10964 + }, + { + "epoch": 0.14873846988605535, + "grad_norm": 7.583253383636475, + "learning_rate": 8.60024667671646e-06, + "loss": 0.4574, + "step": 10965 + }, + { + "epoch": 0.14875203472599025, + "grad_norm": 6.170833110809326, + "learning_rate": 8.600109634096204e-06, + "loss": 0.4657, + "step": 10966 + }, + { + "epoch": 0.1487655995659251, + "grad_norm": 7.626426696777344, + "learning_rate": 8.59997259147595e-06, + "loss": 0.4453, + "step": 10967 + }, + { + "epoch": 0.14877916440586, + "grad_norm": 6.4883527755737305, + "learning_rate": 8.599835548855696e-06, + "loss": 0.5508, + "step": 10968 + }, + { + "epoch": 0.1487927292457949, + "grad_norm": 7.419590950012207, + "learning_rate": 8.599698506235439e-06, + "loss": 0.3909, + "step": 10969 + }, + { + "epoch": 0.1488062940857298, + "grad_norm": 5.7816481590271, + "learning_rate": 8.599561463615184e-06, + "loss": 0.316, + "step": 10970 + }, + { + "epoch": 0.14881985892566468, + "grad_norm": 6.232464790344238, + "learning_rate": 8.59942442099493e-06, + "loss": 0.4822, + "step": 10971 + }, + { + "epoch": 0.14883342376559958, + "grad_norm": 5.808386325836182, + "learning_rate": 8.599287378374676e-06, + "loss": 0.481, + "step": 10972 + }, + { + "epoch": 0.14884698860553444, + "grad_norm": 6.204714298248291, + "learning_rate": 8.59915033575442e-06, + "loss": 0.4037, + "step": 10973 + }, + { + "epoch": 0.14886055344546933, + "grad_norm": 6.040816307067871, + "learning_rate": 8.599013293134165e-06, + "loss": 0.3514, + "step": 10974 + }, + { + "epoch": 0.14887411828540423, + "grad_norm": 5.755744457244873, + "learning_rate": 8.59887625051391e-06, + "loss": 0.4358, + "step": 10975 + }, + { + "epoch": 0.14888768312533912, + "grad_norm": 8.837311744689941, + "learning_rate": 8.598739207893655e-06, + "loss": 0.5525, + "step": 10976 + }, + { + "epoch": 0.14890124796527401, + "grad_norm": 5.391183853149414, + "learning_rate": 8.5986021652734e-06, + "loss": 0.3506, + "step": 10977 + }, + { + "epoch": 0.1489148128052089, + "grad_norm": 6.350825309753418, + "learning_rate": 8.598465122653146e-06, + "loss": 0.4327, + "step": 10978 + }, + { + "epoch": 0.1489283776451438, + "grad_norm": 7.245776176452637, + "learning_rate": 8.59832808003289e-06, + "loss": 0.4194, + "step": 10979 + }, + { + "epoch": 0.14894194248507867, + "grad_norm": 5.681593894958496, + "learning_rate": 8.598191037412636e-06, + "loss": 0.358, + "step": 10980 + }, + { + "epoch": 0.14895550732501356, + "grad_norm": 4.990698337554932, + "learning_rate": 8.598053994792381e-06, + "loss": 0.2592, + "step": 10981 + }, + { + "epoch": 0.14896907216494845, + "grad_norm": 5.158817768096924, + "learning_rate": 8.597916952172126e-06, + "loss": 0.3172, + "step": 10982 + }, + { + "epoch": 0.14898263700488334, + "grad_norm": 5.838773250579834, + "learning_rate": 8.597779909551872e-06, + "loss": 0.2516, + "step": 10983 + }, + { + "epoch": 0.14899620184481824, + "grad_norm": 8.765568733215332, + "learning_rate": 8.597642866931617e-06, + "loss": 0.5629, + "step": 10984 + }, + { + "epoch": 0.14900976668475313, + "grad_norm": 7.166065692901611, + "learning_rate": 8.597505824311362e-06, + "loss": 0.4757, + "step": 10985 + }, + { + "epoch": 0.149023331524688, + "grad_norm": 4.875993251800537, + "learning_rate": 8.597368781691107e-06, + "loss": 0.2593, + "step": 10986 + }, + { + "epoch": 0.1490368963646229, + "grad_norm": 5.8314008712768555, + "learning_rate": 8.597231739070852e-06, + "loss": 0.3483, + "step": 10987 + }, + { + "epoch": 0.14905046120455778, + "grad_norm": 5.748601913452148, + "learning_rate": 8.597094696450596e-06, + "loss": 0.2588, + "step": 10988 + }, + { + "epoch": 0.14906402604449268, + "grad_norm": 8.65125846862793, + "learning_rate": 8.596957653830343e-06, + "loss": 0.4777, + "step": 10989 + }, + { + "epoch": 0.14907759088442757, + "grad_norm": 4.7347588539123535, + "learning_rate": 8.596820611210088e-06, + "loss": 0.2473, + "step": 10990 + }, + { + "epoch": 0.14909115572436246, + "grad_norm": 6.039515495300293, + "learning_rate": 8.596683568589831e-06, + "loss": 0.3092, + "step": 10991 + }, + { + "epoch": 0.14910472056429735, + "grad_norm": 7.286802768707275, + "learning_rate": 8.596546525969576e-06, + "loss": 0.3875, + "step": 10992 + }, + { + "epoch": 0.14911828540423222, + "grad_norm": 9.252460479736328, + "learning_rate": 8.596409483349323e-06, + "loss": 0.4066, + "step": 10993 + }, + { + "epoch": 0.1491318502441671, + "grad_norm": 5.135557651519775, + "learning_rate": 8.596272440729067e-06, + "loss": 0.2923, + "step": 10994 + }, + { + "epoch": 0.149145415084102, + "grad_norm": 7.272328853607178, + "learning_rate": 8.596135398108812e-06, + "loss": 0.4671, + "step": 10995 + }, + { + "epoch": 0.1491589799240369, + "grad_norm": 4.358537673950195, + "learning_rate": 8.595998355488557e-06, + "loss": 0.2148, + "step": 10996 + }, + { + "epoch": 0.1491725447639718, + "grad_norm": 5.315536975860596, + "learning_rate": 8.595861312868304e-06, + "loss": 0.302, + "step": 10997 + }, + { + "epoch": 0.14918610960390669, + "grad_norm": 7.992688179016113, + "learning_rate": 8.595724270248048e-06, + "loss": 0.366, + "step": 10998 + }, + { + "epoch": 0.14919967444384155, + "grad_norm": 5.7003021240234375, + "learning_rate": 8.595587227627793e-06, + "loss": 0.284, + "step": 10999 + }, + { + "epoch": 0.14921323928377644, + "grad_norm": 5.313231468200684, + "learning_rate": 8.595450185007538e-06, + "loss": 0.2833, + "step": 11000 + }, + { + "epoch": 0.14922680412371134, + "grad_norm": 5.863143444061279, + "learning_rate": 8.595313142387283e-06, + "loss": 0.2972, + "step": 11001 + }, + { + "epoch": 0.14924036896364623, + "grad_norm": 7.482583045959473, + "learning_rate": 8.595176099767028e-06, + "loss": 0.2879, + "step": 11002 + }, + { + "epoch": 0.14925393380358112, + "grad_norm": 6.627554893493652, + "learning_rate": 8.595039057146773e-06, + "loss": 0.333, + "step": 11003 + }, + { + "epoch": 0.14926749864351602, + "grad_norm": 8.154304504394531, + "learning_rate": 8.594902014526519e-06, + "loss": 0.4065, + "step": 11004 + }, + { + "epoch": 0.14928106348345088, + "grad_norm": 5.634890079498291, + "learning_rate": 8.594764971906264e-06, + "loss": 0.4335, + "step": 11005 + }, + { + "epoch": 0.14929462832338578, + "grad_norm": 4.454107284545898, + "learning_rate": 8.594627929286009e-06, + "loss": 0.3316, + "step": 11006 + }, + { + "epoch": 0.14930819316332067, + "grad_norm": 4.937817096710205, + "learning_rate": 8.594490886665754e-06, + "loss": 0.2416, + "step": 11007 + }, + { + "epoch": 0.14932175800325556, + "grad_norm": 5.822942733764648, + "learning_rate": 8.5943538440455e-06, + "loss": 0.3285, + "step": 11008 + }, + { + "epoch": 0.14933532284319045, + "grad_norm": 6.010222434997559, + "learning_rate": 8.594216801425243e-06, + "loss": 0.432, + "step": 11009 + }, + { + "epoch": 0.14934888768312535, + "grad_norm": 6.911641597747803, + "learning_rate": 8.59407975880499e-06, + "loss": 0.4358, + "step": 11010 + }, + { + "epoch": 0.14936245252306024, + "grad_norm": 6.046995639801025, + "learning_rate": 8.593942716184735e-06, + "loss": 0.3895, + "step": 11011 + }, + { + "epoch": 0.1493760173629951, + "grad_norm": 5.538368225097656, + "learning_rate": 8.59380567356448e-06, + "loss": 0.26, + "step": 11012 + }, + { + "epoch": 0.14938958220293, + "grad_norm": 5.283458232879639, + "learning_rate": 8.593668630944224e-06, + "loss": 0.3237, + "step": 11013 + }, + { + "epoch": 0.1494031470428649, + "grad_norm": 8.207464218139648, + "learning_rate": 8.593531588323969e-06, + "loss": 0.4968, + "step": 11014 + }, + { + "epoch": 0.14941671188279979, + "grad_norm": 5.956969738006592, + "learning_rate": 8.593394545703716e-06, + "loss": 0.4084, + "step": 11015 + }, + { + "epoch": 0.14943027672273468, + "grad_norm": 6.457127571105957, + "learning_rate": 8.593257503083459e-06, + "loss": 0.3306, + "step": 11016 + }, + { + "epoch": 0.14944384156266957, + "grad_norm": 4.670873165130615, + "learning_rate": 8.593120460463204e-06, + "loss": 0.2989, + "step": 11017 + }, + { + "epoch": 0.14945740640260444, + "grad_norm": 6.900859355926514, + "learning_rate": 8.59298341784295e-06, + "loss": 0.4776, + "step": 11018 + }, + { + "epoch": 0.14947097124253933, + "grad_norm": 6.01993989944458, + "learning_rate": 8.592846375222695e-06, + "loss": 0.4466, + "step": 11019 + }, + { + "epoch": 0.14948453608247422, + "grad_norm": 8.023490905761719, + "learning_rate": 8.59270933260244e-06, + "loss": 0.3122, + "step": 11020 + }, + { + "epoch": 0.14949810092240912, + "grad_norm": 6.031292915344238, + "learning_rate": 8.592572289982185e-06, + "loss": 0.3121, + "step": 11021 + }, + { + "epoch": 0.149511665762344, + "grad_norm": 6.417675018310547, + "learning_rate": 8.59243524736193e-06, + "loss": 0.4807, + "step": 11022 + }, + { + "epoch": 0.1495252306022789, + "grad_norm": 3.9106149673461914, + "learning_rate": 8.592298204741675e-06, + "loss": 0.3376, + "step": 11023 + }, + { + "epoch": 0.1495387954422138, + "grad_norm": 6.803354740142822, + "learning_rate": 8.59216116212142e-06, + "loss": 0.5527, + "step": 11024 + }, + { + "epoch": 0.14955236028214866, + "grad_norm": 5.157639026641846, + "learning_rate": 8.592024119501166e-06, + "loss": 0.2023, + "step": 11025 + }, + { + "epoch": 0.14956592512208355, + "grad_norm": 7.452229976654053, + "learning_rate": 8.591887076880911e-06, + "loss": 0.5954, + "step": 11026 + }, + { + "epoch": 0.14957948996201845, + "grad_norm": 5.10310697555542, + "learning_rate": 8.591750034260656e-06, + "loss": 0.4551, + "step": 11027 + }, + { + "epoch": 0.14959305480195334, + "grad_norm": 7.234725475311279, + "learning_rate": 8.591612991640401e-06, + "loss": 0.5298, + "step": 11028 + }, + { + "epoch": 0.14960661964188823, + "grad_norm": 7.451053619384766, + "learning_rate": 8.591475949020146e-06, + "loss": 0.4532, + "step": 11029 + }, + { + "epoch": 0.14962018448182313, + "grad_norm": 8.210060119628906, + "learning_rate": 8.591338906399892e-06, + "loss": 0.521, + "step": 11030 + }, + { + "epoch": 0.149633749321758, + "grad_norm": 6.669192314147949, + "learning_rate": 8.591201863779635e-06, + "loss": 0.6098, + "step": 11031 + }, + { + "epoch": 0.14964731416169288, + "grad_norm": 8.436727523803711, + "learning_rate": 8.591064821159382e-06, + "loss": 0.4748, + "step": 11032 + }, + { + "epoch": 0.14966087900162778, + "grad_norm": 5.149845600128174, + "learning_rate": 8.590927778539127e-06, + "loss": 0.4741, + "step": 11033 + }, + { + "epoch": 0.14967444384156267, + "grad_norm": 5.004453182220459, + "learning_rate": 8.59079073591887e-06, + "loss": 0.3807, + "step": 11034 + }, + { + "epoch": 0.14968800868149756, + "grad_norm": 6.877532482147217, + "learning_rate": 8.590653693298616e-06, + "loss": 0.4169, + "step": 11035 + }, + { + "epoch": 0.14970157352143246, + "grad_norm": 7.4897236824035645, + "learning_rate": 8.590516650678363e-06, + "loss": 0.4798, + "step": 11036 + }, + { + "epoch": 0.14971513836136732, + "grad_norm": 7.625338554382324, + "learning_rate": 8.590379608058108e-06, + "loss": 0.3528, + "step": 11037 + }, + { + "epoch": 0.14972870320130222, + "grad_norm": 6.56850528717041, + "learning_rate": 8.590242565437851e-06, + "loss": 0.5599, + "step": 11038 + }, + { + "epoch": 0.1497422680412371, + "grad_norm": 5.2960333824157715, + "learning_rate": 8.590105522817596e-06, + "loss": 0.2241, + "step": 11039 + }, + { + "epoch": 0.149755832881172, + "grad_norm": 7.1368727684021, + "learning_rate": 8.589968480197342e-06, + "loss": 0.3819, + "step": 11040 + }, + { + "epoch": 0.1497693977211069, + "grad_norm": 7.416837215423584, + "learning_rate": 8.589831437577087e-06, + "loss": 0.45, + "step": 11041 + }, + { + "epoch": 0.1497829625610418, + "grad_norm": 5.868650436401367, + "learning_rate": 8.589694394956832e-06, + "loss": 0.4112, + "step": 11042 + }, + { + "epoch": 0.14979652740097668, + "grad_norm": 6.9037089347839355, + "learning_rate": 8.589557352336577e-06, + "loss": 0.4107, + "step": 11043 + }, + { + "epoch": 0.14981009224091155, + "grad_norm": 5.602982997894287, + "learning_rate": 8.589420309716322e-06, + "loss": 0.376, + "step": 11044 + }, + { + "epoch": 0.14982365708084644, + "grad_norm": 4.999057292938232, + "learning_rate": 8.589283267096068e-06, + "loss": 0.3644, + "step": 11045 + }, + { + "epoch": 0.14983722192078133, + "grad_norm": 7.460063457489014, + "learning_rate": 8.589146224475813e-06, + "loss": 0.383, + "step": 11046 + }, + { + "epoch": 0.14985078676071623, + "grad_norm": 6.67645788192749, + "learning_rate": 8.589009181855558e-06, + "loss": 0.4748, + "step": 11047 + }, + { + "epoch": 0.14986435160065112, + "grad_norm": 6.032885551452637, + "learning_rate": 8.588872139235303e-06, + "loss": 0.4792, + "step": 11048 + }, + { + "epoch": 0.149877916440586, + "grad_norm": 4.971131801605225, + "learning_rate": 8.588735096615048e-06, + "loss": 0.3205, + "step": 11049 + }, + { + "epoch": 0.14989148128052088, + "grad_norm": 6.235006332397461, + "learning_rate": 8.588598053994793e-06, + "loss": 0.5369, + "step": 11050 + }, + { + "epoch": 0.14990504612045577, + "grad_norm": 7.254925727844238, + "learning_rate": 8.588461011374539e-06, + "loss": 0.4959, + "step": 11051 + }, + { + "epoch": 0.14991861096039066, + "grad_norm": 5.25140905380249, + "learning_rate": 8.588323968754284e-06, + "loss": 0.3941, + "step": 11052 + }, + { + "epoch": 0.14993217580032556, + "grad_norm": 7.796906471252441, + "learning_rate": 8.588186926134029e-06, + "loss": 0.4719, + "step": 11053 + }, + { + "epoch": 0.14994574064026045, + "grad_norm": 5.755126476287842, + "learning_rate": 8.588049883513774e-06, + "loss": 0.3747, + "step": 11054 + }, + { + "epoch": 0.14995930548019534, + "grad_norm": 5.389073848724365, + "learning_rate": 8.58791284089352e-06, + "loss": 0.3955, + "step": 11055 + }, + { + "epoch": 0.14997287032013024, + "grad_norm": 6.30488395690918, + "learning_rate": 8.587775798273263e-06, + "loss": 0.5775, + "step": 11056 + }, + { + "epoch": 0.1499864351600651, + "grad_norm": 4.6177849769592285, + "learning_rate": 8.587638755653008e-06, + "loss": 0.3592, + "step": 11057 + }, + { + "epoch": 0.15, + "grad_norm": 7.7452778816223145, + "learning_rate": 8.587501713032755e-06, + "loss": 0.4485, + "step": 11058 + }, + { + "epoch": 0.15, + "eval_loss": 0.365930438041687, + "eval_noise_accuracy": NaN, + "eval_runtime": 4659.4583, + "eval_samples_per_second": 1.078, + "eval_steps_per_second": 0.067, + "eval_wer": 33.07899045441908, + "step": 11058 + }, + { + "epoch": 0.1500135648399349, + "grad_norm": 6.669414043426514, + "learning_rate": 8.587364670412498e-06, + "loss": 0.5604, + "step": 11059 + }, + { + "epoch": 0.15002712967986978, + "grad_norm": 8.391818046569824, + "learning_rate": 8.587227627792244e-06, + "loss": 0.3594, + "step": 11060 + }, + { + "epoch": 0.15004069451980467, + "grad_norm": 8.840438842773438, + "learning_rate": 8.587090585171989e-06, + "loss": 0.464, + "step": 11061 + }, + { + "epoch": 0.15005425935973957, + "grad_norm": 5.626988887786865, + "learning_rate": 8.586953542551734e-06, + "loss": 0.4236, + "step": 11062 + }, + { + "epoch": 0.15006782419967443, + "grad_norm": 7.38307523727417, + "learning_rate": 8.586816499931479e-06, + "loss": 0.4281, + "step": 11063 + }, + { + "epoch": 0.15008138903960933, + "grad_norm": 4.70150899887085, + "learning_rate": 8.586679457311224e-06, + "loss": 0.314, + "step": 11064 + }, + { + "epoch": 0.15009495387954422, + "grad_norm": 5.93522834777832, + "learning_rate": 8.58654241469097e-06, + "loss": 0.3787, + "step": 11065 + }, + { + "epoch": 0.1501085187194791, + "grad_norm": 5.711450099945068, + "learning_rate": 8.586405372070715e-06, + "loss": 0.3398, + "step": 11066 + }, + { + "epoch": 0.150122083559414, + "grad_norm": 4.840072154998779, + "learning_rate": 8.58626832945046e-06, + "loss": 0.3841, + "step": 11067 + }, + { + "epoch": 0.1501356483993489, + "grad_norm": 4.809789657592773, + "learning_rate": 8.586131286830205e-06, + "loss": 0.3383, + "step": 11068 + }, + { + "epoch": 0.15014921323928376, + "grad_norm": 4.775641441345215, + "learning_rate": 8.58599424420995e-06, + "loss": 0.4521, + "step": 11069 + }, + { + "epoch": 0.15016277807921866, + "grad_norm": 7.13399600982666, + "learning_rate": 8.585857201589695e-06, + "loss": 0.4712, + "step": 11070 + }, + { + "epoch": 0.15017634291915355, + "grad_norm": 4.622450828552246, + "learning_rate": 8.58572015896944e-06, + "loss": 0.3374, + "step": 11071 + }, + { + "epoch": 0.15018990775908844, + "grad_norm": 5.749405384063721, + "learning_rate": 8.585583116349186e-06, + "loss": 0.3474, + "step": 11072 + }, + { + "epoch": 0.15020347259902334, + "grad_norm": 5.268463134765625, + "learning_rate": 8.585446073728931e-06, + "loss": 0.3357, + "step": 11073 + }, + { + "epoch": 0.15021703743895823, + "grad_norm": 5.699535846710205, + "learning_rate": 8.585309031108674e-06, + "loss": 0.3613, + "step": 11074 + }, + { + "epoch": 0.15023060227889312, + "grad_norm": 7.130804061889648, + "learning_rate": 8.585171988488421e-06, + "loss": 0.4328, + "step": 11075 + }, + { + "epoch": 0.150244167118828, + "grad_norm": 7.187844276428223, + "learning_rate": 8.585034945868166e-06, + "loss": 0.3308, + "step": 11076 + }, + { + "epoch": 0.15025773195876288, + "grad_norm": 7.163358688354492, + "learning_rate": 8.58489790324791e-06, + "loss": 0.5035, + "step": 11077 + }, + { + "epoch": 0.15027129679869777, + "grad_norm": 7.2495341300964355, + "learning_rate": 8.584760860627655e-06, + "loss": 0.3564, + "step": 11078 + }, + { + "epoch": 0.15028486163863267, + "grad_norm": 8.8447904586792, + "learning_rate": 8.584623818007402e-06, + "loss": 0.6329, + "step": 11079 + }, + { + "epoch": 0.15029842647856756, + "grad_norm": 5.209693908691406, + "learning_rate": 8.584486775387147e-06, + "loss": 0.3923, + "step": 11080 + }, + { + "epoch": 0.15031199131850245, + "grad_norm": 4.717147350311279, + "learning_rate": 8.58434973276689e-06, + "loss": 0.3625, + "step": 11081 + }, + { + "epoch": 0.15032555615843732, + "grad_norm": 7.194338321685791, + "learning_rate": 8.584212690146636e-06, + "loss": 0.4588, + "step": 11082 + }, + { + "epoch": 0.1503391209983722, + "grad_norm": 5.817358016967773, + "learning_rate": 8.584075647526381e-06, + "loss": 0.3576, + "step": 11083 + }, + { + "epoch": 0.1503526858383071, + "grad_norm": 8.00246810913086, + "learning_rate": 8.583938604906126e-06, + "loss": 0.4405, + "step": 11084 + }, + { + "epoch": 0.150366250678242, + "grad_norm": 8.0717134475708, + "learning_rate": 8.583801562285871e-06, + "loss": 0.5803, + "step": 11085 + }, + { + "epoch": 0.1503798155181769, + "grad_norm": 5.992298603057861, + "learning_rate": 8.583664519665617e-06, + "loss": 0.4637, + "step": 11086 + }, + { + "epoch": 0.15039338035811178, + "grad_norm": 8.536903381347656, + "learning_rate": 8.583527477045362e-06, + "loss": 0.5291, + "step": 11087 + }, + { + "epoch": 0.15040694519804668, + "grad_norm": 8.429411888122559, + "learning_rate": 8.583390434425107e-06, + "loss": 0.582, + "step": 11088 + }, + { + "epoch": 0.15042051003798154, + "grad_norm": 7.658401966094971, + "learning_rate": 8.583253391804852e-06, + "loss": 0.4978, + "step": 11089 + }, + { + "epoch": 0.15043407487791643, + "grad_norm": 6.090970039367676, + "learning_rate": 8.583116349184597e-06, + "loss": 0.2821, + "step": 11090 + }, + { + "epoch": 0.15044763971785133, + "grad_norm": 5.82404088973999, + "learning_rate": 8.582979306564342e-06, + "loss": 0.3532, + "step": 11091 + }, + { + "epoch": 0.15046120455778622, + "grad_norm": 5.250269889831543, + "learning_rate": 8.582842263944088e-06, + "loss": 0.3249, + "step": 11092 + }, + { + "epoch": 0.15047476939772111, + "grad_norm": 8.641983032226562, + "learning_rate": 8.582705221323833e-06, + "loss": 0.4975, + "step": 11093 + }, + { + "epoch": 0.150488334237656, + "grad_norm": 7.997742652893066, + "learning_rate": 8.582568178703578e-06, + "loss": 0.661, + "step": 11094 + }, + { + "epoch": 0.15050189907759087, + "grad_norm": 7.87379789352417, + "learning_rate": 8.582431136083323e-06, + "loss": 0.4871, + "step": 11095 + }, + { + "epoch": 0.15051546391752577, + "grad_norm": 6.0691399574279785, + "learning_rate": 8.582294093463067e-06, + "loss": 0.4276, + "step": 11096 + }, + { + "epoch": 0.15052902875746066, + "grad_norm": 5.939282417297363, + "learning_rate": 8.582157050842813e-06, + "loss": 0.3545, + "step": 11097 + }, + { + "epoch": 0.15054259359739555, + "grad_norm": 5.855377674102783, + "learning_rate": 8.582020008222559e-06, + "loss": 0.3957, + "step": 11098 + }, + { + "epoch": 0.15055615843733045, + "grad_norm": 4.444392681121826, + "learning_rate": 8.581882965602302e-06, + "loss": 0.3611, + "step": 11099 + }, + { + "epoch": 0.15056972327726534, + "grad_norm": 6.105868816375732, + "learning_rate": 8.581745922982047e-06, + "loss": 0.5358, + "step": 11100 + }, + { + "epoch": 0.1505832881172002, + "grad_norm": 4.663147926330566, + "learning_rate": 8.581608880361794e-06, + "loss": 0.3011, + "step": 11101 + }, + { + "epoch": 0.1505968529571351, + "grad_norm": 7.749690532684326, + "learning_rate": 8.581471837741538e-06, + "loss": 0.4519, + "step": 11102 + }, + { + "epoch": 0.15061041779707, + "grad_norm": 6.116151809692383, + "learning_rate": 8.581334795121283e-06, + "loss": 0.398, + "step": 11103 + }, + { + "epoch": 0.15062398263700488, + "grad_norm": 6.236160755157471, + "learning_rate": 8.581197752501028e-06, + "loss": 0.3613, + "step": 11104 + }, + { + "epoch": 0.15063754747693978, + "grad_norm": 4.817325592041016, + "learning_rate": 8.581060709880775e-06, + "loss": 0.3451, + "step": 11105 + }, + { + "epoch": 0.15065111231687467, + "grad_norm": 6.220145225524902, + "learning_rate": 8.580923667260518e-06, + "loss": 0.5676, + "step": 11106 + }, + { + "epoch": 0.15066467715680956, + "grad_norm": 6.438898086547852, + "learning_rate": 8.580786624640264e-06, + "loss": 0.4603, + "step": 11107 + }, + { + "epoch": 0.15067824199674443, + "grad_norm": 5.818187713623047, + "learning_rate": 8.580649582020009e-06, + "loss": 0.3323, + "step": 11108 + }, + { + "epoch": 0.15069180683667932, + "grad_norm": 5.938425540924072, + "learning_rate": 8.580512539399754e-06, + "loss": 0.4159, + "step": 11109 + }, + { + "epoch": 0.1507053716766142, + "grad_norm": 4.72567892074585, + "learning_rate": 8.580375496779499e-06, + "loss": 0.3602, + "step": 11110 + }, + { + "epoch": 0.1507189365165491, + "grad_norm": 10.372761726379395, + "learning_rate": 8.580238454159244e-06, + "loss": 0.4747, + "step": 11111 + }, + { + "epoch": 0.150732501356484, + "grad_norm": 6.975037097930908, + "learning_rate": 8.58010141153899e-06, + "loss": 0.4904, + "step": 11112 + }, + { + "epoch": 0.1507460661964189, + "grad_norm": 6.387598037719727, + "learning_rate": 8.579964368918735e-06, + "loss": 0.428, + "step": 11113 + }, + { + "epoch": 0.15075963103635376, + "grad_norm": 5.584796905517578, + "learning_rate": 8.57982732629848e-06, + "loss": 0.3067, + "step": 11114 + }, + { + "epoch": 0.15077319587628865, + "grad_norm": 4.4576568603515625, + "learning_rate": 8.579690283678225e-06, + "loss": 0.3328, + "step": 11115 + }, + { + "epoch": 0.15078676071622354, + "grad_norm": 6.400365829467773, + "learning_rate": 8.57955324105797e-06, + "loss": 0.4432, + "step": 11116 + }, + { + "epoch": 0.15080032555615844, + "grad_norm": 7.251195907592773, + "learning_rate": 8.579416198437714e-06, + "loss": 0.6644, + "step": 11117 + }, + { + "epoch": 0.15081389039609333, + "grad_norm": 6.803762912750244, + "learning_rate": 8.57927915581746e-06, + "loss": 0.3271, + "step": 11118 + }, + { + "epoch": 0.15082745523602822, + "grad_norm": 7.281054973602295, + "learning_rate": 8.579142113197206e-06, + "loss": 0.6605, + "step": 11119 + }, + { + "epoch": 0.15084102007596312, + "grad_norm": 5.2284955978393555, + "learning_rate": 8.579005070576951e-06, + "loss": 0.475, + "step": 11120 + }, + { + "epoch": 0.15085458491589798, + "grad_norm": 5.684724807739258, + "learning_rate": 8.578868027956694e-06, + "loss": 0.33, + "step": 11121 + }, + { + "epoch": 0.15086814975583288, + "grad_norm": 5.94687557220459, + "learning_rate": 8.578730985336441e-06, + "loss": 0.4225, + "step": 11122 + }, + { + "epoch": 0.15088171459576777, + "grad_norm": 5.087793350219727, + "learning_rate": 8.578593942716186e-06, + "loss": 0.3346, + "step": 11123 + }, + { + "epoch": 0.15089527943570266, + "grad_norm": 5.686838150024414, + "learning_rate": 8.57845690009593e-06, + "loss": 0.3866, + "step": 11124 + }, + { + "epoch": 0.15090884427563755, + "grad_norm": 5.553849697113037, + "learning_rate": 8.578319857475675e-06, + "loss": 0.3821, + "step": 11125 + }, + { + "epoch": 0.15092240911557245, + "grad_norm": 6.549323558807373, + "learning_rate": 8.57818281485542e-06, + "loss": 0.3019, + "step": 11126 + }, + { + "epoch": 0.1509359739555073, + "grad_norm": 5.795065402984619, + "learning_rate": 8.578045772235165e-06, + "loss": 0.3594, + "step": 11127 + }, + { + "epoch": 0.1509495387954422, + "grad_norm": 4.341304779052734, + "learning_rate": 8.57790872961491e-06, + "loss": 0.2517, + "step": 11128 + }, + { + "epoch": 0.1509631036353771, + "grad_norm": 5.492066860198975, + "learning_rate": 8.577771686994656e-06, + "loss": 0.4207, + "step": 11129 + }, + { + "epoch": 0.150976668475312, + "grad_norm": 4.971837997436523, + "learning_rate": 8.577634644374401e-06, + "loss": 0.3013, + "step": 11130 + }, + { + "epoch": 0.15099023331524689, + "grad_norm": 5.319420337677002, + "learning_rate": 8.577497601754146e-06, + "loss": 0.2923, + "step": 11131 + }, + { + "epoch": 0.15100379815518178, + "grad_norm": 7.346650123596191, + "learning_rate": 8.577360559133891e-06, + "loss": 0.4225, + "step": 11132 + }, + { + "epoch": 0.15101736299511664, + "grad_norm": 5.622995376586914, + "learning_rate": 8.577223516513637e-06, + "loss": 0.4029, + "step": 11133 + }, + { + "epoch": 0.15103092783505154, + "grad_norm": 7.18597412109375, + "learning_rate": 8.577086473893382e-06, + "loss": 0.4003, + "step": 11134 + }, + { + "epoch": 0.15104449267498643, + "grad_norm": 5.558771133422852, + "learning_rate": 8.576949431273127e-06, + "loss": 0.4244, + "step": 11135 + }, + { + "epoch": 0.15105805751492132, + "grad_norm": 4.724140644073486, + "learning_rate": 8.576812388652872e-06, + "loss": 0.2107, + "step": 11136 + }, + { + "epoch": 0.15107162235485622, + "grad_norm": 6.750668048858643, + "learning_rate": 8.576675346032617e-06, + "loss": 0.376, + "step": 11137 + }, + { + "epoch": 0.1510851871947911, + "grad_norm": 5.210720062255859, + "learning_rate": 8.576538303412362e-06, + "loss": 0.2659, + "step": 11138 + }, + { + "epoch": 0.151098752034726, + "grad_norm": 4.871608257293701, + "learning_rate": 8.576401260792106e-06, + "loss": 0.306, + "step": 11139 + }, + { + "epoch": 0.15111231687466087, + "grad_norm": 5.567786693572998, + "learning_rate": 8.576264218171853e-06, + "loss": 0.3862, + "step": 11140 + }, + { + "epoch": 0.15112588171459576, + "grad_norm": 6.745171546936035, + "learning_rate": 8.576127175551598e-06, + "loss": 0.2876, + "step": 11141 + }, + { + "epoch": 0.15113944655453065, + "grad_norm": 4.250367641448975, + "learning_rate": 8.575990132931341e-06, + "loss": 0.2943, + "step": 11142 + }, + { + "epoch": 0.15115301139446555, + "grad_norm": 5.327508926391602, + "learning_rate": 8.575853090311087e-06, + "loss": 0.2601, + "step": 11143 + }, + { + "epoch": 0.15116657623440044, + "grad_norm": 5.277046203613281, + "learning_rate": 8.575716047690833e-06, + "loss": 0.2985, + "step": 11144 + }, + { + "epoch": 0.15118014107433533, + "grad_norm": 6.130295753479004, + "learning_rate": 8.575579005070579e-06, + "loss": 0.5472, + "step": 11145 + }, + { + "epoch": 0.1511937059142702, + "grad_norm": 8.11406421661377, + "learning_rate": 8.575441962450322e-06, + "loss": 0.4687, + "step": 11146 + }, + { + "epoch": 0.1512072707542051, + "grad_norm": 7.132465362548828, + "learning_rate": 8.575304919830067e-06, + "loss": 0.3823, + "step": 11147 + }, + { + "epoch": 0.15122083559413999, + "grad_norm": 9.075345039367676, + "learning_rate": 8.575167877209814e-06, + "loss": 0.5042, + "step": 11148 + }, + { + "epoch": 0.15123440043407488, + "grad_norm": 7.775359630584717, + "learning_rate": 8.575030834589558e-06, + "loss": 0.401, + "step": 11149 + }, + { + "epoch": 0.15124796527400977, + "grad_norm": 5.287836074829102, + "learning_rate": 8.574893791969303e-06, + "loss": 0.2984, + "step": 11150 + }, + { + "epoch": 0.15126153011394466, + "grad_norm": 5.060666561126709, + "learning_rate": 8.574756749349048e-06, + "loss": 0.3084, + "step": 11151 + }, + { + "epoch": 0.15127509495387956, + "grad_norm": 7.169868469238281, + "learning_rate": 8.574619706728793e-06, + "loss": 0.4137, + "step": 11152 + }, + { + "epoch": 0.15128865979381442, + "grad_norm": 5.3594794273376465, + "learning_rate": 8.574482664108538e-06, + "loss": 0.4046, + "step": 11153 + }, + { + "epoch": 0.15130222463374932, + "grad_norm": 6.461056232452393, + "learning_rate": 8.574345621488284e-06, + "loss": 0.3352, + "step": 11154 + }, + { + "epoch": 0.1513157894736842, + "grad_norm": 6.744591236114502, + "learning_rate": 8.574208578868029e-06, + "loss": 0.3247, + "step": 11155 + }, + { + "epoch": 0.1513293543136191, + "grad_norm": 6.853153705596924, + "learning_rate": 8.574071536247774e-06, + "loss": 0.4009, + "step": 11156 + }, + { + "epoch": 0.151342919153554, + "grad_norm": 6.218510627746582, + "learning_rate": 8.573934493627519e-06, + "loss": 0.3886, + "step": 11157 + }, + { + "epoch": 0.1513564839934889, + "grad_norm": 5.884933948516846, + "learning_rate": 8.573797451007264e-06, + "loss": 0.3499, + "step": 11158 + }, + { + "epoch": 0.15137004883342375, + "grad_norm": 5.797464370727539, + "learning_rate": 8.57366040838701e-06, + "loss": 0.3424, + "step": 11159 + }, + { + "epoch": 0.15138361367335865, + "grad_norm": 6.505424976348877, + "learning_rate": 8.573523365766755e-06, + "loss": 0.5049, + "step": 11160 + }, + { + "epoch": 0.15139717851329354, + "grad_norm": 5.483203411102295, + "learning_rate": 8.5733863231465e-06, + "loss": 0.2976, + "step": 11161 + }, + { + "epoch": 0.15141074335322843, + "grad_norm": 5.709758758544922, + "learning_rate": 8.573249280526245e-06, + "loss": 0.3251, + "step": 11162 + }, + { + "epoch": 0.15142430819316333, + "grad_norm": 5.199642181396484, + "learning_rate": 8.57311223790599e-06, + "loss": 0.2405, + "step": 11163 + }, + { + "epoch": 0.15143787303309822, + "grad_norm": 5.428919792175293, + "learning_rate": 8.572975195285734e-06, + "loss": 0.3321, + "step": 11164 + }, + { + "epoch": 0.15145143787303308, + "grad_norm": 5.55981969833374, + "learning_rate": 8.572838152665479e-06, + "loss": 0.44, + "step": 11165 + }, + { + "epoch": 0.15146500271296798, + "grad_norm": 6.152133464813232, + "learning_rate": 8.572701110045226e-06, + "loss": 0.3365, + "step": 11166 + }, + { + "epoch": 0.15147856755290287, + "grad_norm": 6.250608444213867, + "learning_rate": 8.57256406742497e-06, + "loss": 0.3961, + "step": 11167 + }, + { + "epoch": 0.15149213239283776, + "grad_norm": 5.446068286895752, + "learning_rate": 8.572427024804714e-06, + "loss": 0.3557, + "step": 11168 + }, + { + "epoch": 0.15150569723277266, + "grad_norm": 6.829223155975342, + "learning_rate": 8.57228998218446e-06, + "loss": 0.374, + "step": 11169 + }, + { + "epoch": 0.15151926207270755, + "grad_norm": 5.531836032867432, + "learning_rate": 8.572152939564205e-06, + "loss": 0.3561, + "step": 11170 + }, + { + "epoch": 0.15153282691264244, + "grad_norm": 4.7434563636779785, + "learning_rate": 8.57201589694395e-06, + "loss": 0.2811, + "step": 11171 + }, + { + "epoch": 0.1515463917525773, + "grad_norm": 5.358663558959961, + "learning_rate": 8.571878854323695e-06, + "loss": 0.2556, + "step": 11172 + }, + { + "epoch": 0.1515599565925122, + "grad_norm": 6.316411018371582, + "learning_rate": 8.57174181170344e-06, + "loss": 0.3659, + "step": 11173 + }, + { + "epoch": 0.1515735214324471, + "grad_norm": 5.494054794311523, + "learning_rate": 8.571604769083185e-06, + "loss": 0.2729, + "step": 11174 + }, + { + "epoch": 0.151587086272382, + "grad_norm": 6.009372711181641, + "learning_rate": 8.57146772646293e-06, + "loss": 0.3225, + "step": 11175 + }, + { + "epoch": 0.15160065111231688, + "grad_norm": 4.175833702087402, + "learning_rate": 8.571330683842676e-06, + "loss": 0.2885, + "step": 11176 + }, + { + "epoch": 0.15161421595225177, + "grad_norm": 6.391233444213867, + "learning_rate": 8.571193641222421e-06, + "loss": 0.3347, + "step": 11177 + }, + { + "epoch": 0.15162778079218664, + "grad_norm": 5.043327331542969, + "learning_rate": 8.571056598602166e-06, + "loss": 0.4403, + "step": 11178 + }, + { + "epoch": 0.15164134563212153, + "grad_norm": 4.2263665199279785, + "learning_rate": 8.570919555981911e-06, + "loss": 0.2255, + "step": 11179 + }, + { + "epoch": 0.15165491047205643, + "grad_norm": 6.703799724578857, + "learning_rate": 8.570782513361657e-06, + "loss": 0.2807, + "step": 11180 + }, + { + "epoch": 0.15166847531199132, + "grad_norm": 5.492595195770264, + "learning_rate": 8.570645470741402e-06, + "loss": 0.3834, + "step": 11181 + }, + { + "epoch": 0.1516820401519262, + "grad_norm": 4.656878471374512, + "learning_rate": 8.570508428121145e-06, + "loss": 0.3143, + "step": 11182 + }, + { + "epoch": 0.1516956049918611, + "grad_norm": 20.955379486083984, + "learning_rate": 8.570371385500892e-06, + "loss": 0.3791, + "step": 11183 + }, + { + "epoch": 0.151709169831796, + "grad_norm": 5.672329902648926, + "learning_rate": 8.570234342880637e-06, + "loss": 0.36, + "step": 11184 + }, + { + "epoch": 0.15172273467173086, + "grad_norm": 4.141825199127197, + "learning_rate": 8.57009730026038e-06, + "loss": 0.2343, + "step": 11185 + }, + { + "epoch": 0.15173629951166576, + "grad_norm": 5.712063789367676, + "learning_rate": 8.569960257640126e-06, + "loss": 0.3106, + "step": 11186 + }, + { + "epoch": 0.15174986435160065, + "grad_norm": 6.282299518585205, + "learning_rate": 8.569823215019873e-06, + "loss": 0.3061, + "step": 11187 + }, + { + "epoch": 0.15176342919153554, + "grad_norm": 4.28666877746582, + "learning_rate": 8.569686172399618e-06, + "loss": 0.26, + "step": 11188 + }, + { + "epoch": 0.15177699403147044, + "grad_norm": 5.6189703941345215, + "learning_rate": 8.569549129779361e-06, + "loss": 0.2036, + "step": 11189 + }, + { + "epoch": 0.15179055887140533, + "grad_norm": 7.234889507293701, + "learning_rate": 8.569412087159107e-06, + "loss": 0.4394, + "step": 11190 + }, + { + "epoch": 0.1518041237113402, + "grad_norm": 4.3065290451049805, + "learning_rate": 8.569275044538854e-06, + "loss": 0.175, + "step": 11191 + }, + { + "epoch": 0.1518176885512751, + "grad_norm": 5.385239601135254, + "learning_rate": 8.569138001918597e-06, + "loss": 0.3853, + "step": 11192 + }, + { + "epoch": 0.15183125339120998, + "grad_norm": 6.339945316314697, + "learning_rate": 8.569000959298342e-06, + "loss": 0.3041, + "step": 11193 + }, + { + "epoch": 0.15184481823114487, + "grad_norm": 4.467218399047852, + "learning_rate": 8.568863916678087e-06, + "loss": 0.152, + "step": 11194 + }, + { + "epoch": 0.15185838307107977, + "grad_norm": 3.700460910797119, + "learning_rate": 8.568726874057833e-06, + "loss": 0.2103, + "step": 11195 + }, + { + "epoch": 0.15187194791101466, + "grad_norm": 6.0711493492126465, + "learning_rate": 8.568589831437578e-06, + "loss": 0.3151, + "step": 11196 + }, + { + "epoch": 0.15188551275094953, + "grad_norm": 6.501598358154297, + "learning_rate": 8.568452788817323e-06, + "loss": 0.4244, + "step": 11197 + }, + { + "epoch": 0.15189907759088442, + "grad_norm": 6.739923000335693, + "learning_rate": 8.568315746197068e-06, + "loss": 0.2513, + "step": 11198 + }, + { + "epoch": 0.1519126424308193, + "grad_norm": 6.449194431304932, + "learning_rate": 8.568178703576813e-06, + "loss": 0.3463, + "step": 11199 + }, + { + "epoch": 0.1519262072707542, + "grad_norm": 4.897917747497559, + "learning_rate": 8.568041660956558e-06, + "loss": 0.2037, + "step": 11200 + }, + { + "epoch": 0.1519397721106891, + "grad_norm": 5.938797473907471, + "learning_rate": 8.567904618336304e-06, + "loss": 0.2792, + "step": 11201 + }, + { + "epoch": 0.151953336950624, + "grad_norm": 4.545495986938477, + "learning_rate": 8.567767575716049e-06, + "loss": 0.2363, + "step": 11202 + }, + { + "epoch": 0.15196690179055888, + "grad_norm": 4.308159828186035, + "learning_rate": 8.567630533095794e-06, + "loss": 0.2452, + "step": 11203 + }, + { + "epoch": 0.15198046663049375, + "grad_norm": 7.246335983276367, + "learning_rate": 8.567493490475539e-06, + "loss": 0.3662, + "step": 11204 + }, + { + "epoch": 0.15199403147042864, + "grad_norm": 6.621921062469482, + "learning_rate": 8.567356447855284e-06, + "loss": 0.3931, + "step": 11205 + }, + { + "epoch": 0.15200759631036354, + "grad_norm": 5.521084308624268, + "learning_rate": 8.56721940523503e-06, + "loss": 0.3089, + "step": 11206 + }, + { + "epoch": 0.15202116115029843, + "grad_norm": 6.423239707946777, + "learning_rate": 8.567082362614773e-06, + "loss": 0.2925, + "step": 11207 + }, + { + "epoch": 0.15203472599023332, + "grad_norm": 6.686542510986328, + "learning_rate": 8.566945319994518e-06, + "loss": 0.3363, + "step": 11208 + }, + { + "epoch": 0.15204829083016821, + "grad_norm": 4.912893772125244, + "learning_rate": 8.566808277374265e-06, + "loss": 0.2803, + "step": 11209 + }, + { + "epoch": 0.15206185567010308, + "grad_norm": 5.857325077056885, + "learning_rate": 8.566671234754009e-06, + "loss": 0.3389, + "step": 11210 + }, + { + "epoch": 0.15207542051003797, + "grad_norm": 6.065587997436523, + "learning_rate": 8.566534192133754e-06, + "loss": 0.2646, + "step": 11211 + }, + { + "epoch": 0.15208898534997287, + "grad_norm": 5.432681560516357, + "learning_rate": 8.566397149513499e-06, + "loss": 0.2962, + "step": 11212 + }, + { + "epoch": 0.15210255018990776, + "grad_norm": 5.179108142852783, + "learning_rate": 8.566260106893246e-06, + "loss": 0.2066, + "step": 11213 + }, + { + "epoch": 0.15211611502984265, + "grad_norm": 6.15223503112793, + "learning_rate": 8.56612306427299e-06, + "loss": 0.4316, + "step": 11214 + }, + { + "epoch": 0.15212967986977755, + "grad_norm": 6.804435729980469, + "learning_rate": 8.565986021652734e-06, + "loss": 0.287, + "step": 11215 + }, + { + "epoch": 0.15214324470971244, + "grad_norm": 6.188068866729736, + "learning_rate": 8.56584897903248e-06, + "loss": 0.2435, + "step": 11216 + }, + { + "epoch": 0.1521568095496473, + "grad_norm": 4.809650897979736, + "learning_rate": 8.565711936412225e-06, + "loss": 0.2493, + "step": 11217 + }, + { + "epoch": 0.1521703743895822, + "grad_norm": 5.129018783569336, + "learning_rate": 8.56557489379197e-06, + "loss": 0.3091, + "step": 11218 + }, + { + "epoch": 0.1521839392295171, + "grad_norm": 4.0601701736450195, + "learning_rate": 8.565437851171715e-06, + "loss": 0.1554, + "step": 11219 + }, + { + "epoch": 0.15219750406945198, + "grad_norm": 4.197558403015137, + "learning_rate": 8.56530080855146e-06, + "loss": 0.1707, + "step": 11220 + }, + { + "epoch": 0.15221106890938688, + "grad_norm": 6.737423419952393, + "learning_rate": 8.565163765931205e-06, + "loss": 0.3082, + "step": 11221 + }, + { + "epoch": 0.15222463374932177, + "grad_norm": 11.4027681350708, + "learning_rate": 8.56502672331095e-06, + "loss": 0.3561, + "step": 11222 + }, + { + "epoch": 0.15223819858925663, + "grad_norm": 5.068445682525635, + "learning_rate": 8.564889680690696e-06, + "loss": 0.3219, + "step": 11223 + }, + { + "epoch": 0.15225176342919153, + "grad_norm": 6.369889736175537, + "learning_rate": 8.564752638070441e-06, + "loss": 0.4305, + "step": 11224 + }, + { + "epoch": 0.15226532826912642, + "grad_norm": 6.58279275894165, + "learning_rate": 8.564615595450185e-06, + "loss": 0.4678, + "step": 11225 + }, + { + "epoch": 0.15227889310906131, + "grad_norm": 7.276062965393066, + "learning_rate": 8.564478552829931e-06, + "loss": 0.4189, + "step": 11226 + }, + { + "epoch": 0.1522924579489962, + "grad_norm": 6.844357490539551, + "learning_rate": 8.564341510209677e-06, + "loss": 0.377, + "step": 11227 + }, + { + "epoch": 0.1523060227889311, + "grad_norm": 4.961215019226074, + "learning_rate": 8.564204467589422e-06, + "loss": 0.3061, + "step": 11228 + }, + { + "epoch": 0.15231958762886597, + "grad_norm": 5.7424187660217285, + "learning_rate": 8.564067424969165e-06, + "loss": 0.3517, + "step": 11229 + }, + { + "epoch": 0.15233315246880086, + "grad_norm": 7.996091842651367, + "learning_rate": 8.563930382348912e-06, + "loss": 0.5251, + "step": 11230 + }, + { + "epoch": 0.15234671730873575, + "grad_norm": 7.147181987762451, + "learning_rate": 8.563793339728657e-06, + "loss": 0.4099, + "step": 11231 + }, + { + "epoch": 0.15236028214867064, + "grad_norm": 5.336006164550781, + "learning_rate": 8.5636562971084e-06, + "loss": 0.3227, + "step": 11232 + }, + { + "epoch": 0.15237384698860554, + "grad_norm": 6.424168109893799, + "learning_rate": 8.563519254488146e-06, + "loss": 0.416, + "step": 11233 + }, + { + "epoch": 0.15238741182854043, + "grad_norm": 5.616665840148926, + "learning_rate": 8.563382211867891e-06, + "loss": 0.4572, + "step": 11234 + }, + { + "epoch": 0.15240097666847532, + "grad_norm": 5.763815879821777, + "learning_rate": 8.563245169247636e-06, + "loss": 0.5064, + "step": 11235 + }, + { + "epoch": 0.1524145415084102, + "grad_norm": 5.7890167236328125, + "learning_rate": 8.563108126627381e-06, + "loss": 0.4161, + "step": 11236 + }, + { + "epoch": 0.15242810634834508, + "grad_norm": 6.125300884246826, + "learning_rate": 8.562971084007127e-06, + "loss": 0.4004, + "step": 11237 + }, + { + "epoch": 0.15244167118827998, + "grad_norm": 6.0864481925964355, + "learning_rate": 8.562834041386872e-06, + "loss": 0.443, + "step": 11238 + }, + { + "epoch": 0.15245523602821487, + "grad_norm": 5.837602615356445, + "learning_rate": 8.562696998766617e-06, + "loss": 0.3713, + "step": 11239 + }, + { + "epoch": 0.15246880086814976, + "grad_norm": 4.9726738929748535, + "learning_rate": 8.562559956146362e-06, + "loss": 0.3017, + "step": 11240 + }, + { + "epoch": 0.15248236570808466, + "grad_norm": 6.425187110900879, + "learning_rate": 8.562422913526107e-06, + "loss": 0.2692, + "step": 11241 + }, + { + "epoch": 0.15249593054801952, + "grad_norm": 6.047688007354736, + "learning_rate": 8.562285870905853e-06, + "loss": 0.5024, + "step": 11242 + }, + { + "epoch": 0.1525094953879544, + "grad_norm": 8.42435359954834, + "learning_rate": 8.562148828285598e-06, + "loss": 0.5689, + "step": 11243 + }, + { + "epoch": 0.1525230602278893, + "grad_norm": 9.277480125427246, + "learning_rate": 8.562011785665343e-06, + "loss": 0.5749, + "step": 11244 + }, + { + "epoch": 0.1525366250678242, + "grad_norm": 7.768210411071777, + "learning_rate": 8.561874743045088e-06, + "loss": 0.5424, + "step": 11245 + }, + { + "epoch": 0.1525501899077591, + "grad_norm": 7.073139667510986, + "learning_rate": 8.561737700424833e-06, + "loss": 0.5325, + "step": 11246 + }, + { + "epoch": 0.15256375474769399, + "grad_norm": 4.260623931884766, + "learning_rate": 8.561600657804577e-06, + "loss": 0.2333, + "step": 11247 + }, + { + "epoch": 0.15257731958762888, + "grad_norm": 8.406986236572266, + "learning_rate": 8.561463615184324e-06, + "loss": 0.4584, + "step": 11248 + }, + { + "epoch": 0.15259088442756374, + "grad_norm": 5.786404132843018, + "learning_rate": 8.561326572564069e-06, + "loss": 0.3905, + "step": 11249 + }, + { + "epoch": 0.15260444926749864, + "grad_norm": 6.185511112213135, + "learning_rate": 8.561189529943812e-06, + "loss": 0.5561, + "step": 11250 + }, + { + "epoch": 0.15261801410743353, + "grad_norm": 6.4586334228515625, + "learning_rate": 8.561052487323557e-06, + "loss": 0.4121, + "step": 11251 + }, + { + "epoch": 0.15263157894736842, + "grad_norm": 8.897061347961426, + "learning_rate": 8.560915444703304e-06, + "loss": 0.4931, + "step": 11252 + }, + { + "epoch": 0.15264514378730332, + "grad_norm": 6.779706001281738, + "learning_rate": 8.560778402083048e-06, + "loss": 0.3767, + "step": 11253 + }, + { + "epoch": 0.1526587086272382, + "grad_norm": 7.463429927825928, + "learning_rate": 8.560641359462793e-06, + "loss": 0.4605, + "step": 11254 + }, + { + "epoch": 0.15267227346717308, + "grad_norm": 5.559743881225586, + "learning_rate": 8.560504316842538e-06, + "loss": 0.301, + "step": 11255 + }, + { + "epoch": 0.15268583830710797, + "grad_norm": 5.831579208374023, + "learning_rate": 8.560367274222285e-06, + "loss": 0.2919, + "step": 11256 + }, + { + "epoch": 0.15269940314704286, + "grad_norm": 5.060220718383789, + "learning_rate": 8.560230231602029e-06, + "loss": 0.3096, + "step": 11257 + }, + { + "epoch": 0.15271296798697775, + "grad_norm": 5.886857986450195, + "learning_rate": 8.560093188981774e-06, + "loss": 0.4284, + "step": 11258 + }, + { + "epoch": 0.15272653282691265, + "grad_norm": 5.510219573974609, + "learning_rate": 8.559956146361519e-06, + "loss": 0.4172, + "step": 11259 + }, + { + "epoch": 0.15274009766684754, + "grad_norm": 6.8804850578308105, + "learning_rate": 8.559819103741264e-06, + "loss": 0.3704, + "step": 11260 + }, + { + "epoch": 0.1527536625067824, + "grad_norm": 6.783374309539795, + "learning_rate": 8.55968206112101e-06, + "loss": 0.5266, + "step": 11261 + }, + { + "epoch": 0.1527672273467173, + "grad_norm": 7.924218654632568, + "learning_rate": 8.559545018500754e-06, + "loss": 0.45, + "step": 11262 + }, + { + "epoch": 0.1527807921866522, + "grad_norm": 6.856064319610596, + "learning_rate": 8.5594079758805e-06, + "loss": 0.4992, + "step": 11263 + }, + { + "epoch": 0.15279435702658709, + "grad_norm": 9.444524765014648, + "learning_rate": 8.559270933260245e-06, + "loss": 0.6847, + "step": 11264 + }, + { + "epoch": 0.15280792186652198, + "grad_norm": 6.272468566894531, + "learning_rate": 8.55913389063999e-06, + "loss": 0.5217, + "step": 11265 + }, + { + "epoch": 0.15282148670645687, + "grad_norm": 9.212902069091797, + "learning_rate": 8.558996848019735e-06, + "loss": 0.5803, + "step": 11266 + }, + { + "epoch": 0.15283505154639176, + "grad_norm": 6.884718418121338, + "learning_rate": 8.55885980539948e-06, + "loss": 0.3759, + "step": 11267 + }, + { + "epoch": 0.15284861638632663, + "grad_norm": 8.562100410461426, + "learning_rate": 8.558722762779224e-06, + "loss": 0.4835, + "step": 11268 + }, + { + "epoch": 0.15286218122626152, + "grad_norm": 10.142443656921387, + "learning_rate": 8.55858572015897e-06, + "loss": 0.6416, + "step": 11269 + }, + { + "epoch": 0.15287574606619642, + "grad_norm": 6.263644218444824, + "learning_rate": 8.558448677538716e-06, + "loss": 0.5209, + "step": 11270 + }, + { + "epoch": 0.1528893109061313, + "grad_norm": 6.318615436553955, + "learning_rate": 8.558311634918461e-06, + "loss": 0.4906, + "step": 11271 + }, + { + "epoch": 0.1529028757460662, + "grad_norm": 7.663491725921631, + "learning_rate": 8.558174592298205e-06, + "loss": 0.4188, + "step": 11272 + }, + { + "epoch": 0.1529164405860011, + "grad_norm": 8.433073997497559, + "learning_rate": 8.558037549677951e-06, + "loss": 0.5079, + "step": 11273 + }, + { + "epoch": 0.15293000542593596, + "grad_norm": 4.3037638664245605, + "learning_rate": 8.557900507057697e-06, + "loss": 0.351, + "step": 11274 + }, + { + "epoch": 0.15294357026587085, + "grad_norm": 5.873502731323242, + "learning_rate": 8.55776346443744e-06, + "loss": 0.4693, + "step": 11275 + }, + { + "epoch": 0.15295713510580575, + "grad_norm": 7.319426536560059, + "learning_rate": 8.557626421817185e-06, + "loss": 0.4888, + "step": 11276 + }, + { + "epoch": 0.15297069994574064, + "grad_norm": 7.942917346954346, + "learning_rate": 8.55748937919693e-06, + "loss": 0.55, + "step": 11277 + }, + { + "epoch": 0.15298426478567553, + "grad_norm": 6.7421956062316895, + "learning_rate": 8.557352336576676e-06, + "loss": 0.3622, + "step": 11278 + }, + { + "epoch": 0.15299782962561043, + "grad_norm": 6.293832778930664, + "learning_rate": 8.55721529395642e-06, + "loss": 0.4586, + "step": 11279 + }, + { + "epoch": 0.15301139446554532, + "grad_norm": 7.356494426727295, + "learning_rate": 8.557078251336166e-06, + "loss": 0.6096, + "step": 11280 + }, + { + "epoch": 0.15302495930548018, + "grad_norm": 6.63906192779541, + "learning_rate": 8.556941208715911e-06, + "loss": 0.3477, + "step": 11281 + }, + { + "epoch": 0.15303852414541508, + "grad_norm": 7.35821008682251, + "learning_rate": 8.556804166095656e-06, + "loss": 0.4471, + "step": 11282 + }, + { + "epoch": 0.15305208898534997, + "grad_norm": 6.14108419418335, + "learning_rate": 8.556667123475401e-06, + "loss": 0.4736, + "step": 11283 + }, + { + "epoch": 0.15306565382528486, + "grad_norm": 5.2253642082214355, + "learning_rate": 8.556530080855147e-06, + "loss": 0.3784, + "step": 11284 + }, + { + "epoch": 0.15307921866521976, + "grad_norm": 7.777222633361816, + "learning_rate": 8.556393038234892e-06, + "loss": 0.5506, + "step": 11285 + }, + { + "epoch": 0.15309278350515465, + "grad_norm": 8.216118812561035, + "learning_rate": 8.556255995614637e-06, + "loss": 0.5709, + "step": 11286 + }, + { + "epoch": 0.15310634834508952, + "grad_norm": 9.353501319885254, + "learning_rate": 8.556118952994382e-06, + "loss": 0.527, + "step": 11287 + }, + { + "epoch": 0.1531199131850244, + "grad_norm": 5.594208717346191, + "learning_rate": 8.555981910374127e-06, + "loss": 0.4465, + "step": 11288 + }, + { + "epoch": 0.1531334780249593, + "grad_norm": 5.946901798248291, + "learning_rate": 8.555844867753873e-06, + "loss": 0.4044, + "step": 11289 + }, + { + "epoch": 0.1531470428648942, + "grad_norm": 4.876906394958496, + "learning_rate": 8.555707825133616e-06, + "loss": 0.3727, + "step": 11290 + }, + { + "epoch": 0.1531606077048291, + "grad_norm": 5.6226606369018555, + "learning_rate": 8.555570782513363e-06, + "loss": 0.5422, + "step": 11291 + }, + { + "epoch": 0.15317417254476398, + "grad_norm": 7.300461769104004, + "learning_rate": 8.555433739893108e-06, + "loss": 0.4525, + "step": 11292 + }, + { + "epoch": 0.15318773738469887, + "grad_norm": 5.118399143218994, + "learning_rate": 8.555296697272852e-06, + "loss": 0.41, + "step": 11293 + }, + { + "epoch": 0.15320130222463374, + "grad_norm": 9.36037540435791, + "learning_rate": 8.555159654652597e-06, + "loss": 0.5354, + "step": 11294 + }, + { + "epoch": 0.15321486706456863, + "grad_norm": 6.325841903686523, + "learning_rate": 8.555022612032344e-06, + "loss": 0.3619, + "step": 11295 + }, + { + "epoch": 0.15322843190450353, + "grad_norm": 9.280242919921875, + "learning_rate": 8.554885569412089e-06, + "loss": 0.4711, + "step": 11296 + }, + { + "epoch": 0.15324199674443842, + "grad_norm": 6.3248796463012695, + "learning_rate": 8.554748526791832e-06, + "loss": 0.3444, + "step": 11297 + }, + { + "epoch": 0.1532555615843733, + "grad_norm": 8.04944896697998, + "learning_rate": 8.554611484171577e-06, + "loss": 0.5869, + "step": 11298 + }, + { + "epoch": 0.1532691264243082, + "grad_norm": 7.273743152618408, + "learning_rate": 8.554474441551324e-06, + "loss": 0.4039, + "step": 11299 + }, + { + "epoch": 0.15328269126424307, + "grad_norm": 5.105377674102783, + "learning_rate": 8.554337398931068e-06, + "loss": 0.3092, + "step": 11300 + }, + { + "epoch": 0.15329625610417796, + "grad_norm": 5.468822956085205, + "learning_rate": 8.554200356310813e-06, + "loss": 0.3886, + "step": 11301 + }, + { + "epoch": 0.15330982094411286, + "grad_norm": 6.555152416229248, + "learning_rate": 8.554063313690558e-06, + "loss": 0.5616, + "step": 11302 + }, + { + "epoch": 0.15332338578404775, + "grad_norm": 7.160092353820801, + "learning_rate": 8.553926271070303e-06, + "loss": 0.5272, + "step": 11303 + }, + { + "epoch": 0.15333695062398264, + "grad_norm": 8.372233390808105, + "learning_rate": 8.553789228450049e-06, + "loss": 0.4266, + "step": 11304 + }, + { + "epoch": 0.15335051546391754, + "grad_norm": 7.799172401428223, + "learning_rate": 8.553652185829794e-06, + "loss": 0.5633, + "step": 11305 + }, + { + "epoch": 0.1533640803038524, + "grad_norm": 7.046107769012451, + "learning_rate": 8.553515143209539e-06, + "loss": 0.564, + "step": 11306 + }, + { + "epoch": 0.1533776451437873, + "grad_norm": 6.511980056762695, + "learning_rate": 8.553378100589284e-06, + "loss": 0.3609, + "step": 11307 + }, + { + "epoch": 0.1533912099837222, + "grad_norm": 5.323617458343506, + "learning_rate": 8.55324105796903e-06, + "loss": 0.33, + "step": 11308 + }, + { + "epoch": 0.15340477482365708, + "grad_norm": 8.981836318969727, + "learning_rate": 8.553104015348774e-06, + "loss": 0.6779, + "step": 11309 + }, + { + "epoch": 0.15341833966359197, + "grad_norm": 9.798158645629883, + "learning_rate": 8.55296697272852e-06, + "loss": 0.49, + "step": 11310 + }, + { + "epoch": 0.15343190450352687, + "grad_norm": 7.580970287322998, + "learning_rate": 8.552829930108265e-06, + "loss": 0.428, + "step": 11311 + }, + { + "epoch": 0.15344546934346176, + "grad_norm": 7.258859157562256, + "learning_rate": 8.55269288748801e-06, + "loss": 0.5333, + "step": 11312 + }, + { + "epoch": 0.15345903418339663, + "grad_norm": 5.643923282623291, + "learning_rate": 8.552555844867755e-06, + "loss": 0.4505, + "step": 11313 + }, + { + "epoch": 0.15347259902333152, + "grad_norm": 5.848221302032471, + "learning_rate": 8.5524188022475e-06, + "loss": 0.3633, + "step": 11314 + }, + { + "epoch": 0.1534861638632664, + "grad_norm": 8.45413875579834, + "learning_rate": 8.552281759627244e-06, + "loss": 0.4629, + "step": 11315 + }, + { + "epoch": 0.1534997287032013, + "grad_norm": 6.69457483291626, + "learning_rate": 8.552144717006989e-06, + "loss": 0.5251, + "step": 11316 + }, + { + "epoch": 0.1535132935431362, + "grad_norm": 6.816157341003418, + "learning_rate": 8.552007674386736e-06, + "loss": 0.4883, + "step": 11317 + }, + { + "epoch": 0.1535268583830711, + "grad_norm": 6.127279281616211, + "learning_rate": 8.55187063176648e-06, + "loss": 0.3531, + "step": 11318 + }, + { + "epoch": 0.15354042322300596, + "grad_norm": 8.718622207641602, + "learning_rate": 8.551733589146225e-06, + "loss": 0.634, + "step": 11319 + }, + { + "epoch": 0.15355398806294085, + "grad_norm": 7.059290409088135, + "learning_rate": 8.55159654652597e-06, + "loss": 0.4512, + "step": 11320 + }, + { + "epoch": 0.15356755290287574, + "grad_norm": 7.9653730392456055, + "learning_rate": 8.551459503905717e-06, + "loss": 0.4791, + "step": 11321 + }, + { + "epoch": 0.15358111774281064, + "grad_norm": 6.217993259429932, + "learning_rate": 8.55132246128546e-06, + "loss": 0.3984, + "step": 11322 + }, + { + "epoch": 0.15359468258274553, + "grad_norm": 6.13215446472168, + "learning_rate": 8.551185418665205e-06, + "loss": 0.4222, + "step": 11323 + }, + { + "epoch": 0.15360824742268042, + "grad_norm": 7.3689703941345215, + "learning_rate": 8.55104837604495e-06, + "loss": 0.2931, + "step": 11324 + }, + { + "epoch": 0.15362181226261531, + "grad_norm": 9.38485050201416, + "learning_rate": 8.550911333424696e-06, + "loss": 0.412, + "step": 11325 + }, + { + "epoch": 0.15363537710255018, + "grad_norm": 6.258177757263184, + "learning_rate": 8.55077429080444e-06, + "loss": 0.3207, + "step": 11326 + }, + { + "epoch": 0.15364894194248507, + "grad_norm": 7.650767803192139, + "learning_rate": 8.550637248184186e-06, + "loss": 0.4626, + "step": 11327 + }, + { + "epoch": 0.15366250678241997, + "grad_norm": 7.492244720458984, + "learning_rate": 8.550500205563931e-06, + "loss": 0.369, + "step": 11328 + }, + { + "epoch": 0.15367607162235486, + "grad_norm": 8.663433074951172, + "learning_rate": 8.550363162943676e-06, + "loss": 0.6628, + "step": 11329 + }, + { + "epoch": 0.15368963646228975, + "grad_norm": 6.818439483642578, + "learning_rate": 8.550226120323422e-06, + "loss": 0.377, + "step": 11330 + }, + { + "epoch": 0.15370320130222465, + "grad_norm": 5.620218276977539, + "learning_rate": 8.550089077703167e-06, + "loss": 0.29, + "step": 11331 + }, + { + "epoch": 0.1537167661421595, + "grad_norm": 6.330146789550781, + "learning_rate": 8.549952035082912e-06, + "loss": 0.4662, + "step": 11332 + }, + { + "epoch": 0.1537303309820944, + "grad_norm": 5.371251583099365, + "learning_rate": 8.549814992462655e-06, + "loss": 0.2825, + "step": 11333 + }, + { + "epoch": 0.1537438958220293, + "grad_norm": 6.788933753967285, + "learning_rate": 8.549677949842402e-06, + "loss": 0.5357, + "step": 11334 + }, + { + "epoch": 0.1537574606619642, + "grad_norm": 7.503676414489746, + "learning_rate": 8.549540907222147e-06, + "loss": 0.4245, + "step": 11335 + }, + { + "epoch": 0.15377102550189908, + "grad_norm": 5.147612571716309, + "learning_rate": 8.549403864601893e-06, + "loss": 0.2425, + "step": 11336 + }, + { + "epoch": 0.15378459034183398, + "grad_norm": 8.503809928894043, + "learning_rate": 8.549266821981636e-06, + "loss": 0.3952, + "step": 11337 + }, + { + "epoch": 0.15379815518176884, + "grad_norm": 4.994952201843262, + "learning_rate": 8.549129779361383e-06, + "loss": 0.301, + "step": 11338 + }, + { + "epoch": 0.15381172002170374, + "grad_norm": 9.109411239624023, + "learning_rate": 8.548992736741128e-06, + "loss": 0.4574, + "step": 11339 + }, + { + "epoch": 0.15382528486163863, + "grad_norm": 8.82190227508545, + "learning_rate": 8.548855694120872e-06, + "loss": 0.406, + "step": 11340 + }, + { + "epoch": 0.15383884970157352, + "grad_norm": 6.051301956176758, + "learning_rate": 8.548718651500617e-06, + "loss": 0.3941, + "step": 11341 + }, + { + "epoch": 0.15385241454150841, + "grad_norm": 5.636016845703125, + "learning_rate": 8.548581608880364e-06, + "loss": 0.3404, + "step": 11342 + }, + { + "epoch": 0.1538659793814433, + "grad_norm": 7.926198482513428, + "learning_rate": 8.548444566260107e-06, + "loss": 0.3438, + "step": 11343 + }, + { + "epoch": 0.1538795442213782, + "grad_norm": 6.504409313201904, + "learning_rate": 8.548307523639852e-06, + "loss": 0.2482, + "step": 11344 + }, + { + "epoch": 0.15389310906131307, + "grad_norm": 4.616801738739014, + "learning_rate": 8.548170481019598e-06, + "loss": 0.2735, + "step": 11345 + }, + { + "epoch": 0.15390667390124796, + "grad_norm": 5.024205684661865, + "learning_rate": 8.548033438399343e-06, + "loss": 0.2306, + "step": 11346 + }, + { + "epoch": 0.15392023874118285, + "grad_norm": 6.251172065734863, + "learning_rate": 8.547896395779088e-06, + "loss": 0.3757, + "step": 11347 + }, + { + "epoch": 0.15393380358111775, + "grad_norm": 4.620380878448486, + "learning_rate": 8.547759353158833e-06, + "loss": 0.2054, + "step": 11348 + }, + { + "epoch": 0.15394736842105264, + "grad_norm": 4.369844913482666, + "learning_rate": 8.547622310538578e-06, + "loss": 0.1228, + "step": 11349 + }, + { + "epoch": 0.15396093326098753, + "grad_norm": 8.296189308166504, + "learning_rate": 8.547485267918323e-06, + "loss": 0.4677, + "step": 11350 + }, + { + "epoch": 0.1539744981009224, + "grad_norm": 6.114699363708496, + "learning_rate": 8.547348225298069e-06, + "loss": 0.3283, + "step": 11351 + }, + { + "epoch": 0.1539880629408573, + "grad_norm": 6.1653151512146, + "learning_rate": 8.547211182677814e-06, + "loss": 0.3363, + "step": 11352 + }, + { + "epoch": 0.15400162778079218, + "grad_norm": 6.8586578369140625, + "learning_rate": 8.547074140057559e-06, + "loss": 0.3654, + "step": 11353 + }, + { + "epoch": 0.15401519262072708, + "grad_norm": 6.944091320037842, + "learning_rate": 8.546937097437304e-06, + "loss": 0.3596, + "step": 11354 + }, + { + "epoch": 0.15402875746066197, + "grad_norm": 5.7958784103393555, + "learning_rate": 8.54680005481705e-06, + "loss": 0.3514, + "step": 11355 + }, + { + "epoch": 0.15404232230059686, + "grad_norm": 4.616502285003662, + "learning_rate": 8.546663012196794e-06, + "loss": 0.2656, + "step": 11356 + }, + { + "epoch": 0.15405588714053176, + "grad_norm": 7.506589889526367, + "learning_rate": 8.54652596957654e-06, + "loss": 0.2352, + "step": 11357 + }, + { + "epoch": 0.15406945198046662, + "grad_norm": 11.647848129272461, + "learning_rate": 8.546388926956283e-06, + "loss": 0.2482, + "step": 11358 + }, + { + "epoch": 0.1540830168204015, + "grad_norm": 6.506495952606201, + "learning_rate": 8.546251884336028e-06, + "loss": 0.3506, + "step": 11359 + }, + { + "epoch": 0.1540965816603364, + "grad_norm": 5.005837440490723, + "learning_rate": 8.546114841715775e-06, + "loss": 0.1791, + "step": 11360 + }, + { + "epoch": 0.1541101465002713, + "grad_norm": 5.863541126251221, + "learning_rate": 8.545977799095519e-06, + "loss": 0.3631, + "step": 11361 + }, + { + "epoch": 0.1541237113402062, + "grad_norm": 4.800090312957764, + "learning_rate": 8.545840756475264e-06, + "loss": 0.1959, + "step": 11362 + }, + { + "epoch": 0.1541372761801411, + "grad_norm": 6.285030841827393, + "learning_rate": 8.545703713855009e-06, + "loss": 0.3897, + "step": 11363 + }, + { + "epoch": 0.15415084102007595, + "grad_norm": 5.902576446533203, + "learning_rate": 8.545566671234756e-06, + "loss": 0.3943, + "step": 11364 + }, + { + "epoch": 0.15416440586001084, + "grad_norm": 5.859818458557129, + "learning_rate": 8.5454296286145e-06, + "loss": 0.2676, + "step": 11365 + }, + { + "epoch": 0.15417797069994574, + "grad_norm": 6.132271766662598, + "learning_rate": 8.545292585994245e-06, + "loss": 0.3166, + "step": 11366 + }, + { + "epoch": 0.15419153553988063, + "grad_norm": 5.813138008117676, + "learning_rate": 8.54515554337399e-06, + "loss": 0.2976, + "step": 11367 + }, + { + "epoch": 0.15420510037981552, + "grad_norm": 7.888082981109619, + "learning_rate": 8.545018500753735e-06, + "loss": 0.3907, + "step": 11368 + }, + { + "epoch": 0.15421866521975042, + "grad_norm": 5.601868629455566, + "learning_rate": 8.54488145813348e-06, + "loss": 0.2594, + "step": 11369 + }, + { + "epoch": 0.15423223005968528, + "grad_norm": 7.065622329711914, + "learning_rate": 8.544744415513225e-06, + "loss": 0.4223, + "step": 11370 + }, + { + "epoch": 0.15424579489962018, + "grad_norm": 5.40508508682251, + "learning_rate": 8.54460737289297e-06, + "loss": 0.3211, + "step": 11371 + }, + { + "epoch": 0.15425935973955507, + "grad_norm": 5.488354682922363, + "learning_rate": 8.544470330272716e-06, + "loss": 0.3647, + "step": 11372 + }, + { + "epoch": 0.15427292457948996, + "grad_norm": 6.078240394592285, + "learning_rate": 8.54433328765246e-06, + "loss": 0.3177, + "step": 11373 + }, + { + "epoch": 0.15428648941942485, + "grad_norm": 5.7215704917907715, + "learning_rate": 8.544196245032206e-06, + "loss": 0.2209, + "step": 11374 + }, + { + "epoch": 0.15430005425935975, + "grad_norm": 5.330687046051025, + "learning_rate": 8.544059202411951e-06, + "loss": 0.2472, + "step": 11375 + }, + { + "epoch": 0.15431361909929464, + "grad_norm": 6.189803600311279, + "learning_rate": 8.543922159791695e-06, + "loss": 0.249, + "step": 11376 + }, + { + "epoch": 0.1543271839392295, + "grad_norm": 6.122650623321533, + "learning_rate": 8.543785117171442e-06, + "loss": 0.3369, + "step": 11377 + }, + { + "epoch": 0.1543407487791644, + "grad_norm": 7.3432512283325195, + "learning_rate": 8.543648074551187e-06, + "loss": 0.4621, + "step": 11378 + }, + { + "epoch": 0.1543543136190993, + "grad_norm": 6.853071212768555, + "learning_rate": 8.543511031930932e-06, + "loss": 0.4079, + "step": 11379 + }, + { + "epoch": 0.15436787845903419, + "grad_norm": 6.861874580383301, + "learning_rate": 8.543373989310675e-06, + "loss": 0.3128, + "step": 11380 + }, + { + "epoch": 0.15438144329896908, + "grad_norm": 8.435577392578125, + "learning_rate": 8.543236946690422e-06, + "loss": 0.2836, + "step": 11381 + }, + { + "epoch": 0.15439500813890397, + "grad_norm": 6.350022792816162, + "learning_rate": 8.543099904070167e-06, + "loss": 0.3483, + "step": 11382 + }, + { + "epoch": 0.15440857297883884, + "grad_norm": 6.27016019821167, + "learning_rate": 8.542962861449911e-06, + "loss": 0.2831, + "step": 11383 + }, + { + "epoch": 0.15442213781877373, + "grad_norm": 6.881943225860596, + "learning_rate": 8.542825818829656e-06, + "loss": 0.4123, + "step": 11384 + }, + { + "epoch": 0.15443570265870862, + "grad_norm": 5.755082130432129, + "learning_rate": 8.542688776209401e-06, + "loss": 0.2977, + "step": 11385 + }, + { + "epoch": 0.15444926749864352, + "grad_norm": 7.041591167449951, + "learning_rate": 8.542551733589146e-06, + "loss": 0.3515, + "step": 11386 + }, + { + "epoch": 0.1544628323385784, + "grad_norm": 4.775789737701416, + "learning_rate": 8.542414690968892e-06, + "loss": 0.2864, + "step": 11387 + }, + { + "epoch": 0.1544763971785133, + "grad_norm": 5.792878150939941, + "learning_rate": 8.542277648348637e-06, + "loss": 0.2488, + "step": 11388 + }, + { + "epoch": 0.1544899620184482, + "grad_norm": 4.782592296600342, + "learning_rate": 8.542140605728382e-06, + "loss": 0.2762, + "step": 11389 + }, + { + "epoch": 0.15450352685838306, + "grad_norm": 7.380876064300537, + "learning_rate": 8.542003563108127e-06, + "loss": 0.5326, + "step": 11390 + }, + { + "epoch": 0.15451709169831795, + "grad_norm": 5.696420192718506, + "learning_rate": 8.541866520487872e-06, + "loss": 0.3366, + "step": 11391 + }, + { + "epoch": 0.15453065653825285, + "grad_norm": 6.72054386138916, + "learning_rate": 8.541729477867618e-06, + "loss": 0.4273, + "step": 11392 + }, + { + "epoch": 0.15454422137818774, + "grad_norm": 5.3750996589660645, + "learning_rate": 8.541592435247363e-06, + "loss": 0.345, + "step": 11393 + }, + { + "epoch": 0.15455778621812263, + "grad_norm": 5.353382587432861, + "learning_rate": 8.541455392627108e-06, + "loss": 0.3766, + "step": 11394 + }, + { + "epoch": 0.15457135105805753, + "grad_norm": 7.759500503540039, + "learning_rate": 8.541318350006853e-06, + "loss": 0.5818, + "step": 11395 + }, + { + "epoch": 0.1545849158979924, + "grad_norm": 6.970994472503662, + "learning_rate": 8.541181307386598e-06, + "loss": 0.4657, + "step": 11396 + }, + { + "epoch": 0.15459848073792729, + "grad_norm": 4.718031883239746, + "learning_rate": 8.541044264766343e-06, + "loss": 0.3164, + "step": 11397 + }, + { + "epoch": 0.15461204557786218, + "grad_norm": 4.820093154907227, + "learning_rate": 8.540907222146087e-06, + "loss": 0.3373, + "step": 11398 + }, + { + "epoch": 0.15462561041779707, + "grad_norm": 5.452808380126953, + "learning_rate": 8.540770179525834e-06, + "loss": 0.232, + "step": 11399 + }, + { + "epoch": 0.15463917525773196, + "grad_norm": 5.3946332931518555, + "learning_rate": 8.540633136905579e-06, + "loss": 0.3059, + "step": 11400 + }, + { + "epoch": 0.15465274009766686, + "grad_norm": 5.190818786621094, + "learning_rate": 8.540496094285322e-06, + "loss": 0.2994, + "step": 11401 + }, + { + "epoch": 0.15466630493760172, + "grad_norm": 6.4986748695373535, + "learning_rate": 8.540359051665068e-06, + "loss": 0.4867, + "step": 11402 + }, + { + "epoch": 0.15467986977753662, + "grad_norm": 5.021368503570557, + "learning_rate": 8.540222009044814e-06, + "loss": 0.2567, + "step": 11403 + }, + { + "epoch": 0.1546934346174715, + "grad_norm": 5.540224075317383, + "learning_rate": 8.54008496642456e-06, + "loss": 0.2674, + "step": 11404 + }, + { + "epoch": 0.1547069994574064, + "grad_norm": 6.80671501159668, + "learning_rate": 8.539947923804303e-06, + "loss": 0.4645, + "step": 11405 + }, + { + "epoch": 0.1547205642973413, + "grad_norm": 5.233470916748047, + "learning_rate": 8.539810881184048e-06, + "loss": 0.2816, + "step": 11406 + }, + { + "epoch": 0.1547341291372762, + "grad_norm": 5.7105302810668945, + "learning_rate": 8.539673838563795e-06, + "loss": 0.3014, + "step": 11407 + }, + { + "epoch": 0.15474769397721108, + "grad_norm": 5.409519672393799, + "learning_rate": 8.539536795943539e-06, + "loss": 0.3943, + "step": 11408 + }, + { + "epoch": 0.15476125881714595, + "grad_norm": 5.585697174072266, + "learning_rate": 8.539399753323284e-06, + "loss": 0.3593, + "step": 11409 + }, + { + "epoch": 0.15477482365708084, + "grad_norm": 4.161654949188232, + "learning_rate": 8.539262710703029e-06, + "loss": 0.2774, + "step": 11410 + }, + { + "epoch": 0.15478838849701573, + "grad_norm": 7.063803195953369, + "learning_rate": 8.539125668082774e-06, + "loss": 0.2756, + "step": 11411 + }, + { + "epoch": 0.15480195333695063, + "grad_norm": 7.259105205535889, + "learning_rate": 8.53898862546252e-06, + "loss": 0.4215, + "step": 11412 + }, + { + "epoch": 0.15481551817688552, + "grad_norm": 5.993277072906494, + "learning_rate": 8.538851582842265e-06, + "loss": 0.3235, + "step": 11413 + }, + { + "epoch": 0.1548290830168204, + "grad_norm": 5.436668395996094, + "learning_rate": 8.53871454022201e-06, + "loss": 0.4053, + "step": 11414 + }, + { + "epoch": 0.15484264785675528, + "grad_norm": 5.407629489898682, + "learning_rate": 8.538577497601755e-06, + "loss": 0.4505, + "step": 11415 + }, + { + "epoch": 0.15485621269669017, + "grad_norm": 6.867916584014893, + "learning_rate": 8.5384404549815e-06, + "loss": 0.4307, + "step": 11416 + }, + { + "epoch": 0.15486977753662506, + "grad_norm": 6.8657121658325195, + "learning_rate": 8.538303412361245e-06, + "loss": 0.4123, + "step": 11417 + }, + { + "epoch": 0.15488334237655996, + "grad_norm": 5.909729480743408, + "learning_rate": 8.53816636974099e-06, + "loss": 0.5055, + "step": 11418 + }, + { + "epoch": 0.15489690721649485, + "grad_norm": 5.7476806640625, + "learning_rate": 8.538029327120736e-06, + "loss": 0.3642, + "step": 11419 + }, + { + "epoch": 0.15491047205642974, + "grad_norm": 5.384915828704834, + "learning_rate": 8.53789228450048e-06, + "loss": 0.356, + "step": 11420 + }, + { + "epoch": 0.15492403689636464, + "grad_norm": 8.607186317443848, + "learning_rate": 8.537755241880226e-06, + "loss": 0.4502, + "step": 11421 + }, + { + "epoch": 0.1549376017362995, + "grad_norm": 5.817741394042969, + "learning_rate": 8.537618199259971e-06, + "loss": 0.3523, + "step": 11422 + }, + { + "epoch": 0.1549511665762344, + "grad_norm": 5.3087239265441895, + "learning_rate": 8.537481156639715e-06, + "loss": 0.3369, + "step": 11423 + }, + { + "epoch": 0.1549647314161693, + "grad_norm": 7.6984543800354, + "learning_rate": 8.537344114019462e-06, + "loss": 0.4643, + "step": 11424 + }, + { + "epoch": 0.15497829625610418, + "grad_norm": 5.242071628570557, + "learning_rate": 8.537207071399207e-06, + "loss": 0.2003, + "step": 11425 + }, + { + "epoch": 0.15499186109603907, + "grad_norm": 5.211216926574707, + "learning_rate": 8.53707002877895e-06, + "loss": 0.2493, + "step": 11426 + }, + { + "epoch": 0.15500542593597397, + "grad_norm": 6.401930809020996, + "learning_rate": 8.536932986158695e-06, + "loss": 0.3802, + "step": 11427 + }, + { + "epoch": 0.15501899077590883, + "grad_norm": 5.4636921882629395, + "learning_rate": 8.53679594353844e-06, + "loss": 0.2863, + "step": 11428 + }, + { + "epoch": 0.15503255561584373, + "grad_norm": 6.567657947540283, + "learning_rate": 8.536658900918187e-06, + "loss": 0.3618, + "step": 11429 + }, + { + "epoch": 0.15504612045577862, + "grad_norm": 4.443347454071045, + "learning_rate": 8.536521858297931e-06, + "loss": 0.3431, + "step": 11430 + }, + { + "epoch": 0.1550596852957135, + "grad_norm": 7.174187183380127, + "learning_rate": 8.536384815677676e-06, + "loss": 0.3582, + "step": 11431 + }, + { + "epoch": 0.1550732501356484, + "grad_norm": 4.666747093200684, + "learning_rate": 8.536247773057421e-06, + "loss": 0.2569, + "step": 11432 + }, + { + "epoch": 0.1550868149755833, + "grad_norm": 6.352858543395996, + "learning_rate": 8.536110730437166e-06, + "loss": 0.4671, + "step": 11433 + }, + { + "epoch": 0.15510037981551816, + "grad_norm": 4.721911430358887, + "learning_rate": 8.535973687816912e-06, + "loss": 0.3162, + "step": 11434 + }, + { + "epoch": 0.15511394465545306, + "grad_norm": 6.480866432189941, + "learning_rate": 8.535836645196657e-06, + "loss": 0.3871, + "step": 11435 + }, + { + "epoch": 0.15512750949538795, + "grad_norm": 5.637749195098877, + "learning_rate": 8.535699602576402e-06, + "loss": 0.4165, + "step": 11436 + }, + { + "epoch": 0.15514107433532284, + "grad_norm": 6.428820610046387, + "learning_rate": 8.535562559956147e-06, + "loss": 0.5087, + "step": 11437 + }, + { + "epoch": 0.15515463917525774, + "grad_norm": 7.045697212219238, + "learning_rate": 8.535425517335892e-06, + "loss": 0.4629, + "step": 11438 + }, + { + "epoch": 0.15516820401519263, + "grad_norm": 5.3753204345703125, + "learning_rate": 8.535288474715638e-06, + "loss": 0.2962, + "step": 11439 + }, + { + "epoch": 0.15518176885512752, + "grad_norm": 6.453970432281494, + "learning_rate": 8.535151432095383e-06, + "loss": 0.3954, + "step": 11440 + }, + { + "epoch": 0.1551953336950624, + "grad_norm": 5.492708683013916, + "learning_rate": 8.535014389475126e-06, + "loss": 0.4554, + "step": 11441 + }, + { + "epoch": 0.15520889853499728, + "grad_norm": 5.719161033630371, + "learning_rate": 8.534877346854873e-06, + "loss": 0.3983, + "step": 11442 + }, + { + "epoch": 0.15522246337493217, + "grad_norm": 7.3764519691467285, + "learning_rate": 8.534740304234618e-06, + "loss": 0.5741, + "step": 11443 + }, + { + "epoch": 0.15523602821486707, + "grad_norm": 5.432027816772461, + "learning_rate": 8.534603261614362e-06, + "loss": 0.3768, + "step": 11444 + }, + { + "epoch": 0.15524959305480196, + "grad_norm": 6.333942413330078, + "learning_rate": 8.534466218994107e-06, + "loss": 0.3218, + "step": 11445 + }, + { + "epoch": 0.15526315789473685, + "grad_norm": 6.841960430145264, + "learning_rate": 8.534329176373854e-06, + "loss": 0.5231, + "step": 11446 + }, + { + "epoch": 0.15527672273467172, + "grad_norm": 5.190036773681641, + "learning_rate": 8.534192133753599e-06, + "loss": 0.4736, + "step": 11447 + }, + { + "epoch": 0.1552902875746066, + "grad_norm": 8.456945419311523, + "learning_rate": 8.534055091133342e-06, + "loss": 0.5093, + "step": 11448 + }, + { + "epoch": 0.1553038524145415, + "grad_norm": 7.012587070465088, + "learning_rate": 8.533918048513088e-06, + "loss": 0.4568, + "step": 11449 + }, + { + "epoch": 0.1553174172544764, + "grad_norm": 7.613796710968018, + "learning_rate": 8.533781005892835e-06, + "loss": 0.5201, + "step": 11450 + }, + { + "epoch": 0.1553309820944113, + "grad_norm": 7.313101291656494, + "learning_rate": 8.533643963272578e-06, + "loss": 0.5214, + "step": 11451 + }, + { + "epoch": 0.15534454693434618, + "grad_norm": 7.539188861846924, + "learning_rate": 8.533506920652323e-06, + "loss": 0.4816, + "step": 11452 + }, + { + "epoch": 0.15535811177428108, + "grad_norm": 7.9539899826049805, + "learning_rate": 8.533369878032068e-06, + "loss": 0.4404, + "step": 11453 + }, + { + "epoch": 0.15537167661421594, + "grad_norm": 6.201564311981201, + "learning_rate": 8.533232835411814e-06, + "loss": 0.4823, + "step": 11454 + }, + { + "epoch": 0.15538524145415084, + "grad_norm": 5.583611488342285, + "learning_rate": 8.533095792791559e-06, + "loss": 0.3941, + "step": 11455 + }, + { + "epoch": 0.15539880629408573, + "grad_norm": 6.722030162811279, + "learning_rate": 8.532958750171304e-06, + "loss": 0.56, + "step": 11456 + }, + { + "epoch": 0.15541237113402062, + "grad_norm": 6.663395404815674, + "learning_rate": 8.532821707551049e-06, + "loss": 0.5869, + "step": 11457 + }, + { + "epoch": 0.15542593597395551, + "grad_norm": 5.084124565124512, + "learning_rate": 8.532684664930794e-06, + "loss": 0.2089, + "step": 11458 + }, + { + "epoch": 0.1554395008138904, + "grad_norm": 6.27390718460083, + "learning_rate": 8.53254762231054e-06, + "loss": 0.5019, + "step": 11459 + }, + { + "epoch": 0.15545306565382527, + "grad_norm": 6.974583148956299, + "learning_rate": 8.532410579690285e-06, + "loss": 0.4699, + "step": 11460 + }, + { + "epoch": 0.15546663049376017, + "grad_norm": 6.6446003913879395, + "learning_rate": 8.53227353707003e-06, + "loss": 0.4091, + "step": 11461 + }, + { + "epoch": 0.15548019533369506, + "grad_norm": 5.541521072387695, + "learning_rate": 8.532136494449775e-06, + "loss": 0.3188, + "step": 11462 + }, + { + "epoch": 0.15549376017362995, + "grad_norm": 6.283696174621582, + "learning_rate": 8.53199945182952e-06, + "loss": 0.228, + "step": 11463 + }, + { + "epoch": 0.15550732501356485, + "grad_norm": 5.30516242980957, + "learning_rate": 8.531862409209265e-06, + "loss": 0.3982, + "step": 11464 + }, + { + "epoch": 0.15552088985349974, + "grad_norm": 6.778751373291016, + "learning_rate": 8.53172536658901e-06, + "loss": 0.4752, + "step": 11465 + }, + { + "epoch": 0.1555344546934346, + "grad_norm": 5.59296178817749, + "learning_rate": 8.531588323968754e-06, + "loss": 0.2686, + "step": 11466 + }, + { + "epoch": 0.1555480195333695, + "grad_norm": 6.200605392456055, + "learning_rate": 8.5314512813485e-06, + "loss": 0.3099, + "step": 11467 + }, + { + "epoch": 0.1555615843733044, + "grad_norm": 7.277381420135498, + "learning_rate": 8.531314238728246e-06, + "loss": 0.4011, + "step": 11468 + }, + { + "epoch": 0.15557514921323928, + "grad_norm": 5.931700229644775, + "learning_rate": 8.53117719610799e-06, + "loss": 0.2949, + "step": 11469 + }, + { + "epoch": 0.15558871405317418, + "grad_norm": 7.432751655578613, + "learning_rate": 8.531040153487735e-06, + "loss": 0.4281, + "step": 11470 + }, + { + "epoch": 0.15560227889310907, + "grad_norm": 5.948946952819824, + "learning_rate": 8.53090311086748e-06, + "loss": 0.3335, + "step": 11471 + }, + { + "epoch": 0.15561584373304396, + "grad_norm": 5.791539192199707, + "learning_rate": 8.530766068247227e-06, + "loss": 0.4424, + "step": 11472 + }, + { + "epoch": 0.15562940857297883, + "grad_norm": 7.804938316345215, + "learning_rate": 8.53062902562697e-06, + "loss": 0.4798, + "step": 11473 + }, + { + "epoch": 0.15564297341291372, + "grad_norm": 6.343837261199951, + "learning_rate": 8.530491983006715e-06, + "loss": 0.3307, + "step": 11474 + }, + { + "epoch": 0.15565653825284861, + "grad_norm": 6.290741920471191, + "learning_rate": 8.53035494038646e-06, + "loss": 0.313, + "step": 11475 + }, + { + "epoch": 0.1556701030927835, + "grad_norm": 6.999575138092041, + "learning_rate": 8.530217897766206e-06, + "loss": 0.4138, + "step": 11476 + }, + { + "epoch": 0.1556836679327184, + "grad_norm": 5.7340288162231445, + "learning_rate": 8.530080855145951e-06, + "loss": 0.2454, + "step": 11477 + }, + { + "epoch": 0.1556972327726533, + "grad_norm": 4.888777732849121, + "learning_rate": 8.529943812525696e-06, + "loss": 0.3103, + "step": 11478 + }, + { + "epoch": 0.15571079761258816, + "grad_norm": 4.1702351570129395, + "learning_rate": 8.529806769905441e-06, + "loss": 0.1944, + "step": 11479 + }, + { + "epoch": 0.15572436245252305, + "grad_norm": 6.090721130371094, + "learning_rate": 8.529669727285186e-06, + "loss": 0.4563, + "step": 11480 + }, + { + "epoch": 0.15573792729245794, + "grad_norm": 5.421535015106201, + "learning_rate": 8.529532684664932e-06, + "loss": 0.3296, + "step": 11481 + }, + { + "epoch": 0.15575149213239284, + "grad_norm": 5.341028690338135, + "learning_rate": 8.529395642044677e-06, + "loss": 0.2746, + "step": 11482 + }, + { + "epoch": 0.15576505697232773, + "grad_norm": 5.59531307220459, + "learning_rate": 8.529258599424422e-06, + "loss": 0.2884, + "step": 11483 + }, + { + "epoch": 0.15577862181226262, + "grad_norm": 6.685766696929932, + "learning_rate": 8.529121556804166e-06, + "loss": 0.3648, + "step": 11484 + }, + { + "epoch": 0.15579218665219752, + "grad_norm": 6.229109287261963, + "learning_rate": 8.528984514183912e-06, + "loss": 0.3614, + "step": 11485 + }, + { + "epoch": 0.15580575149213238, + "grad_norm": 7.097259521484375, + "learning_rate": 8.528847471563658e-06, + "loss": 0.5264, + "step": 11486 + }, + { + "epoch": 0.15581931633206728, + "grad_norm": 6.060471534729004, + "learning_rate": 8.528710428943403e-06, + "loss": 0.2713, + "step": 11487 + }, + { + "epoch": 0.15583288117200217, + "grad_norm": 4.484630584716797, + "learning_rate": 8.528573386323146e-06, + "loss": 0.239, + "step": 11488 + }, + { + "epoch": 0.15584644601193706, + "grad_norm": 6.901764392852783, + "learning_rate": 8.528436343702893e-06, + "loss": 0.3736, + "step": 11489 + }, + { + "epoch": 0.15586001085187196, + "grad_norm": 7.260006427764893, + "learning_rate": 8.528299301082638e-06, + "loss": 0.3405, + "step": 11490 + }, + { + "epoch": 0.15587357569180685, + "grad_norm": 4.410888671875, + "learning_rate": 8.528162258462382e-06, + "loss": 0.2375, + "step": 11491 + }, + { + "epoch": 0.1558871405317417, + "grad_norm": 7.5239949226379395, + "learning_rate": 8.528025215842127e-06, + "loss": 0.3896, + "step": 11492 + }, + { + "epoch": 0.1559007053716766, + "grad_norm": 5.679330825805664, + "learning_rate": 8.527888173221874e-06, + "loss": 0.2926, + "step": 11493 + }, + { + "epoch": 0.1559142702116115, + "grad_norm": 3.5446319580078125, + "learning_rate": 8.527751130601617e-06, + "loss": 0.2094, + "step": 11494 + }, + { + "epoch": 0.1559278350515464, + "grad_norm": 6.672401428222656, + "learning_rate": 8.527614087981362e-06, + "loss": 0.4834, + "step": 11495 + }, + { + "epoch": 0.15594139989148129, + "grad_norm": 5.303679943084717, + "learning_rate": 8.527477045361108e-06, + "loss": 0.2757, + "step": 11496 + }, + { + "epoch": 0.15595496473141618, + "grad_norm": 8.847475051879883, + "learning_rate": 8.527340002740853e-06, + "loss": 0.3046, + "step": 11497 + }, + { + "epoch": 0.15596852957135104, + "grad_norm": 5.253267288208008, + "learning_rate": 8.527202960120598e-06, + "loss": 0.3706, + "step": 11498 + }, + { + "epoch": 0.15598209441128594, + "grad_norm": 6.518492221832275, + "learning_rate": 8.527065917500343e-06, + "loss": 0.4148, + "step": 11499 + }, + { + "epoch": 0.15599565925122083, + "grad_norm": 6.2993927001953125, + "learning_rate": 8.526928874880088e-06, + "loss": 0.3449, + "step": 11500 + }, + { + "epoch": 0.15600922409115572, + "grad_norm": 6.448675155639648, + "learning_rate": 8.526791832259834e-06, + "loss": 0.3753, + "step": 11501 + }, + { + "epoch": 0.15602278893109062, + "grad_norm": 6.10054874420166, + "learning_rate": 8.526654789639579e-06, + "loss": 0.3772, + "step": 11502 + }, + { + "epoch": 0.1560363537710255, + "grad_norm": 7.630148410797119, + "learning_rate": 8.526517747019324e-06, + "loss": 0.4838, + "step": 11503 + }, + { + "epoch": 0.1560499186109604, + "grad_norm": 4.724804401397705, + "learning_rate": 8.526380704399069e-06, + "loss": 0.3665, + "step": 11504 + }, + { + "epoch": 0.15606348345089527, + "grad_norm": 6.403534889221191, + "learning_rate": 8.526243661778814e-06, + "loss": 0.4426, + "step": 11505 + }, + { + "epoch": 0.15607704829083016, + "grad_norm": 4.670475006103516, + "learning_rate": 8.52610661915856e-06, + "loss": 0.4635, + "step": 11506 + }, + { + "epoch": 0.15609061313076505, + "grad_norm": 5.998253345489502, + "learning_rate": 8.525969576538305e-06, + "loss": 0.3652, + "step": 11507 + }, + { + "epoch": 0.15610417797069995, + "grad_norm": 6.028100967407227, + "learning_rate": 8.52583253391805e-06, + "loss": 0.3933, + "step": 11508 + }, + { + "epoch": 0.15611774281063484, + "grad_norm": 6.453199863433838, + "learning_rate": 8.525695491297793e-06, + "loss": 0.4627, + "step": 11509 + }, + { + "epoch": 0.15613130765056973, + "grad_norm": 6.005499839782715, + "learning_rate": 8.525558448677538e-06, + "loss": 0.3964, + "step": 11510 + }, + { + "epoch": 0.1561448724905046, + "grad_norm": 7.372622013092041, + "learning_rate": 8.525421406057285e-06, + "loss": 0.5743, + "step": 11511 + }, + { + "epoch": 0.1561584373304395, + "grad_norm": 5.729162693023682, + "learning_rate": 8.52528436343703e-06, + "loss": 0.4101, + "step": 11512 + }, + { + "epoch": 0.15617200217037439, + "grad_norm": 8.95117473602295, + "learning_rate": 8.525147320816774e-06, + "loss": 0.5096, + "step": 11513 + }, + { + "epoch": 0.15618556701030928, + "grad_norm": 5.829867362976074, + "learning_rate": 8.52501027819652e-06, + "loss": 0.4488, + "step": 11514 + }, + { + "epoch": 0.15619913185024417, + "grad_norm": 7.016597270965576, + "learning_rate": 8.524873235576266e-06, + "loss": 0.4267, + "step": 11515 + }, + { + "epoch": 0.15621269669017906, + "grad_norm": 6.255025386810303, + "learning_rate": 8.52473619295601e-06, + "loss": 0.4025, + "step": 11516 + }, + { + "epoch": 0.15622626153011396, + "grad_norm": 5.669145107269287, + "learning_rate": 8.524599150335755e-06, + "loss": 0.3525, + "step": 11517 + }, + { + "epoch": 0.15623982637004882, + "grad_norm": 6.4912943840026855, + "learning_rate": 8.5244621077155e-06, + "loss": 0.3932, + "step": 11518 + }, + { + "epoch": 0.15625339120998372, + "grad_norm": 4.934025764465332, + "learning_rate": 8.524325065095245e-06, + "loss": 0.2757, + "step": 11519 + }, + { + "epoch": 0.1562669560499186, + "grad_norm": 4.660161972045898, + "learning_rate": 8.52418802247499e-06, + "loss": 0.3894, + "step": 11520 + }, + { + "epoch": 0.1562805208898535, + "grad_norm": 5.345636367797852, + "learning_rate": 8.524050979854735e-06, + "loss": 0.4126, + "step": 11521 + }, + { + "epoch": 0.1562940857297884, + "grad_norm": 5.352370738983154, + "learning_rate": 8.52391393723448e-06, + "loss": 0.365, + "step": 11522 + }, + { + "epoch": 0.1563076505697233, + "grad_norm": 6.366996765136719, + "learning_rate": 8.523776894614226e-06, + "loss": 0.3687, + "step": 11523 + }, + { + "epoch": 0.15632121540965815, + "grad_norm": 4.793577194213867, + "learning_rate": 8.523639851993971e-06, + "loss": 0.2993, + "step": 11524 + }, + { + "epoch": 0.15633478024959305, + "grad_norm": 7.4406890869140625, + "learning_rate": 8.523502809373716e-06, + "loss": 0.4453, + "step": 11525 + }, + { + "epoch": 0.15634834508952794, + "grad_norm": 5.378852367401123, + "learning_rate": 8.523365766753461e-06, + "loss": 0.3436, + "step": 11526 + }, + { + "epoch": 0.15636190992946283, + "grad_norm": 6.154429912567139, + "learning_rate": 8.523228724133207e-06, + "loss": 0.4564, + "step": 11527 + }, + { + "epoch": 0.15637547476939773, + "grad_norm": 4.633158206939697, + "learning_rate": 8.523091681512952e-06, + "loss": 0.3708, + "step": 11528 + }, + { + "epoch": 0.15638903960933262, + "grad_norm": 5.355016231536865, + "learning_rate": 8.522954638892697e-06, + "loss": 0.309, + "step": 11529 + }, + { + "epoch": 0.15640260444926748, + "grad_norm": 6.212268829345703, + "learning_rate": 8.522817596272442e-06, + "loss": 0.4361, + "step": 11530 + }, + { + "epoch": 0.15641616928920238, + "grad_norm": 5.4452080726623535, + "learning_rate": 8.522680553652186e-06, + "loss": 0.3276, + "step": 11531 + }, + { + "epoch": 0.15642973412913727, + "grad_norm": 6.480372905731201, + "learning_rate": 8.522543511031932e-06, + "loss": 0.3208, + "step": 11532 + }, + { + "epoch": 0.15644329896907216, + "grad_norm": 5.58476448059082, + "learning_rate": 8.522406468411678e-06, + "loss": 0.4247, + "step": 11533 + }, + { + "epoch": 0.15645686380900706, + "grad_norm": 5.190673351287842, + "learning_rate": 8.522269425791421e-06, + "loss": 0.4414, + "step": 11534 + }, + { + "epoch": 0.15647042864894195, + "grad_norm": 5.394602298736572, + "learning_rate": 8.522132383171166e-06, + "loss": 0.2849, + "step": 11535 + }, + { + "epoch": 0.15648399348887684, + "grad_norm": 5.154000282287598, + "learning_rate": 8.521995340550911e-06, + "loss": 0.3881, + "step": 11536 + }, + { + "epoch": 0.1564975583288117, + "grad_norm": 4.661288738250732, + "learning_rate": 8.521858297930657e-06, + "loss": 0.2938, + "step": 11537 + }, + { + "epoch": 0.1565111231687466, + "grad_norm": 4.923110008239746, + "learning_rate": 8.521721255310402e-06, + "loss": 0.3703, + "step": 11538 + }, + { + "epoch": 0.1565246880086815, + "grad_norm": 4.2717671394348145, + "learning_rate": 8.521584212690147e-06, + "loss": 0.2865, + "step": 11539 + }, + { + "epoch": 0.1565382528486164, + "grad_norm": 4.966344356536865, + "learning_rate": 8.521447170069892e-06, + "loss": 0.2962, + "step": 11540 + }, + { + "epoch": 0.15655181768855128, + "grad_norm": 7.480279922485352, + "learning_rate": 8.521310127449637e-06, + "loss": 0.3706, + "step": 11541 + }, + { + "epoch": 0.15656538252848617, + "grad_norm": 6.6853556632995605, + "learning_rate": 8.521173084829383e-06, + "loss": 0.3567, + "step": 11542 + }, + { + "epoch": 0.15657894736842104, + "grad_norm": 8.340155601501465, + "learning_rate": 8.521036042209128e-06, + "loss": 0.4333, + "step": 11543 + }, + { + "epoch": 0.15659251220835593, + "grad_norm": 5.613896369934082, + "learning_rate": 8.520898999588873e-06, + "loss": 0.2759, + "step": 11544 + }, + { + "epoch": 0.15660607704829083, + "grad_norm": 6.8740763664245605, + "learning_rate": 8.520761956968618e-06, + "loss": 0.4547, + "step": 11545 + }, + { + "epoch": 0.15661964188822572, + "grad_norm": 7.053813934326172, + "learning_rate": 8.520624914348363e-06, + "loss": 0.2667, + "step": 11546 + }, + { + "epoch": 0.1566332067281606, + "grad_norm": 4.682684898376465, + "learning_rate": 8.520487871728108e-06, + "loss": 0.225, + "step": 11547 + }, + { + "epoch": 0.1566467715680955, + "grad_norm": 4.153872013092041, + "learning_rate": 8.520350829107854e-06, + "loss": 0.3064, + "step": 11548 + }, + { + "epoch": 0.1566603364080304, + "grad_norm": 6.652745246887207, + "learning_rate": 8.520213786487599e-06, + "loss": 0.452, + "step": 11549 + }, + { + "epoch": 0.15667390124796526, + "grad_norm": 3.866304874420166, + "learning_rate": 8.520076743867344e-06, + "loss": 0.2796, + "step": 11550 + }, + { + "epoch": 0.15668746608790016, + "grad_norm": 8.780424118041992, + "learning_rate": 8.519939701247089e-06, + "loss": 0.5573, + "step": 11551 + }, + { + "epoch": 0.15670103092783505, + "grad_norm": 5.869811058044434, + "learning_rate": 8.519802658626833e-06, + "loss": 0.3313, + "step": 11552 + }, + { + "epoch": 0.15671459576776994, + "grad_norm": 4.183193206787109, + "learning_rate": 8.519665616006578e-06, + "loss": 0.328, + "step": 11553 + }, + { + "epoch": 0.15672816060770484, + "grad_norm": 4.796124458312988, + "learning_rate": 8.519528573386325e-06, + "loss": 0.3243, + "step": 11554 + }, + { + "epoch": 0.15674172544763973, + "grad_norm": 6.031182289123535, + "learning_rate": 8.51939153076607e-06, + "loss": 0.4414, + "step": 11555 + }, + { + "epoch": 0.1567552902875746, + "grad_norm": 5.569090366363525, + "learning_rate": 8.519254488145813e-06, + "loss": 0.3626, + "step": 11556 + }, + { + "epoch": 0.1567688551275095, + "grad_norm": 6.888721466064453, + "learning_rate": 8.519117445525558e-06, + "loss": 0.4377, + "step": 11557 + }, + { + "epoch": 0.15678241996744438, + "grad_norm": 5.847568035125732, + "learning_rate": 8.518980402905305e-06, + "loss": 0.4558, + "step": 11558 + }, + { + "epoch": 0.15679598480737927, + "grad_norm": 7.038395404815674, + "learning_rate": 8.518843360285049e-06, + "loss": 0.5769, + "step": 11559 + }, + { + "epoch": 0.15680954964731417, + "grad_norm": 5.406500816345215, + "learning_rate": 8.518706317664794e-06, + "loss": 0.3691, + "step": 11560 + }, + { + "epoch": 0.15682311448724906, + "grad_norm": 8.07144832611084, + "learning_rate": 8.51856927504454e-06, + "loss": 0.4239, + "step": 11561 + }, + { + "epoch": 0.15683667932718393, + "grad_norm": 6.44467830657959, + "learning_rate": 8.518432232424284e-06, + "loss": 0.4192, + "step": 11562 + }, + { + "epoch": 0.15685024416711882, + "grad_norm": 9.5709810256958, + "learning_rate": 8.51829518980403e-06, + "loss": 0.4474, + "step": 11563 + }, + { + "epoch": 0.1568638090070537, + "grad_norm": 8.299013137817383, + "learning_rate": 8.518158147183775e-06, + "loss": 0.6349, + "step": 11564 + }, + { + "epoch": 0.1568773738469886, + "grad_norm": 5.738038063049316, + "learning_rate": 8.51802110456352e-06, + "loss": 0.3942, + "step": 11565 + }, + { + "epoch": 0.1568909386869235, + "grad_norm": 7.728208541870117, + "learning_rate": 8.517884061943265e-06, + "loss": 0.6139, + "step": 11566 + }, + { + "epoch": 0.1569045035268584, + "grad_norm": 8.266444206237793, + "learning_rate": 8.51774701932301e-06, + "loss": 0.515, + "step": 11567 + }, + { + "epoch": 0.15691806836679328, + "grad_norm": 6.270998477935791, + "learning_rate": 8.517609976702755e-06, + "loss": 0.2747, + "step": 11568 + }, + { + "epoch": 0.15693163320672815, + "grad_norm": 9.470412254333496, + "learning_rate": 8.5174729340825e-06, + "loss": 0.4963, + "step": 11569 + }, + { + "epoch": 0.15694519804666304, + "grad_norm": 7.771163463592529, + "learning_rate": 8.517335891462246e-06, + "loss": 0.397, + "step": 11570 + }, + { + "epoch": 0.15695876288659794, + "grad_norm": 7.838553428649902, + "learning_rate": 8.517198848841991e-06, + "loss": 0.5476, + "step": 11571 + }, + { + "epoch": 0.15697232772653283, + "grad_norm": 5.106043815612793, + "learning_rate": 8.517061806221736e-06, + "loss": 0.344, + "step": 11572 + }, + { + "epoch": 0.15698589256646772, + "grad_norm": 6.463361740112305, + "learning_rate": 8.516924763601481e-06, + "loss": 0.3672, + "step": 11573 + }, + { + "epoch": 0.15699945740640261, + "grad_norm": 5.65068244934082, + "learning_rate": 8.516787720981225e-06, + "loss": 0.2897, + "step": 11574 + }, + { + "epoch": 0.15701302224633748, + "grad_norm": 8.793967247009277, + "learning_rate": 8.516650678360972e-06, + "loss": 0.5961, + "step": 11575 + }, + { + "epoch": 0.15702658708627237, + "grad_norm": 7.111428260803223, + "learning_rate": 8.516513635740717e-06, + "loss": 0.3282, + "step": 11576 + }, + { + "epoch": 0.15704015192620727, + "grad_norm": 6.612102031707764, + "learning_rate": 8.51637659312046e-06, + "loss": 0.3895, + "step": 11577 + }, + { + "epoch": 0.15705371676614216, + "grad_norm": 8.727936744689941, + "learning_rate": 8.516239550500206e-06, + "loss": 0.5472, + "step": 11578 + }, + { + "epoch": 0.15706728160607705, + "grad_norm": 6.729349613189697, + "learning_rate": 8.51610250787995e-06, + "loss": 0.4497, + "step": 11579 + }, + { + "epoch": 0.15708084644601195, + "grad_norm": 8.681366920471191, + "learning_rate": 8.515965465259698e-06, + "loss": 0.5896, + "step": 11580 + }, + { + "epoch": 0.15709441128594684, + "grad_norm": 7.361874103546143, + "learning_rate": 8.515828422639441e-06, + "loss": 0.3925, + "step": 11581 + }, + { + "epoch": 0.1571079761258817, + "grad_norm": 6.294552326202393, + "learning_rate": 8.515691380019186e-06, + "loss": 0.4346, + "step": 11582 + }, + { + "epoch": 0.1571215409658166, + "grad_norm": 10.371338844299316, + "learning_rate": 8.515554337398931e-06, + "loss": 0.4916, + "step": 11583 + }, + { + "epoch": 0.1571351058057515, + "grad_norm": 6.365665435791016, + "learning_rate": 8.515417294778677e-06, + "loss": 0.4188, + "step": 11584 + }, + { + "epoch": 0.15714867064568638, + "grad_norm": 7.179482460021973, + "learning_rate": 8.515280252158422e-06, + "loss": 0.3157, + "step": 11585 + }, + { + "epoch": 0.15716223548562128, + "grad_norm": 6.861819744110107, + "learning_rate": 8.515143209538167e-06, + "loss": 0.4852, + "step": 11586 + }, + { + "epoch": 0.15717580032555617, + "grad_norm": 5.92268705368042, + "learning_rate": 8.515006166917912e-06, + "loss": 0.292, + "step": 11587 + }, + { + "epoch": 0.15718936516549104, + "grad_norm": 6.8777241706848145, + "learning_rate": 8.514869124297657e-06, + "loss": 0.4786, + "step": 11588 + }, + { + "epoch": 0.15720293000542593, + "grad_norm": 8.46423625946045, + "learning_rate": 8.514732081677403e-06, + "loss": 0.5469, + "step": 11589 + }, + { + "epoch": 0.15721649484536082, + "grad_norm": 6.7363786697387695, + "learning_rate": 8.514595039057148e-06, + "loss": 0.4092, + "step": 11590 + }, + { + "epoch": 0.15723005968529571, + "grad_norm": 5.572811126708984, + "learning_rate": 8.514457996436893e-06, + "loss": 0.3025, + "step": 11591 + }, + { + "epoch": 0.1572436245252306, + "grad_norm": 6.19332218170166, + "learning_rate": 8.514320953816636e-06, + "loss": 0.3767, + "step": 11592 + }, + { + "epoch": 0.1572571893651655, + "grad_norm": 8.283190727233887, + "learning_rate": 8.514183911196383e-06, + "loss": 0.3272, + "step": 11593 + }, + { + "epoch": 0.15727075420510037, + "grad_norm": 7.159676551818848, + "learning_rate": 8.514046868576128e-06, + "loss": 0.4031, + "step": 11594 + }, + { + "epoch": 0.15728431904503526, + "grad_norm": 7.333547592163086, + "learning_rate": 8.513909825955874e-06, + "loss": 0.5415, + "step": 11595 + }, + { + "epoch": 0.15729788388497015, + "grad_norm": 5.946677207946777, + "learning_rate": 8.513772783335617e-06, + "loss": 0.3272, + "step": 11596 + }, + { + "epoch": 0.15731144872490505, + "grad_norm": 6.218808174133301, + "learning_rate": 8.513635740715364e-06, + "loss": 0.3898, + "step": 11597 + }, + { + "epoch": 0.15732501356483994, + "grad_norm": 6.446853160858154, + "learning_rate": 8.513498698095109e-06, + "loss": 0.3376, + "step": 11598 + }, + { + "epoch": 0.15733857840477483, + "grad_norm": 5.37197208404541, + "learning_rate": 8.513361655474853e-06, + "loss": 0.3683, + "step": 11599 + }, + { + "epoch": 0.15735214324470972, + "grad_norm": 6.237242698669434, + "learning_rate": 8.513224612854598e-06, + "loss": 0.4195, + "step": 11600 + }, + { + "epoch": 0.1573657080846446, + "grad_norm": 6.56829833984375, + "learning_rate": 8.513087570234345e-06, + "loss": 0.3426, + "step": 11601 + }, + { + "epoch": 0.15737927292457948, + "grad_norm": 8.495282173156738, + "learning_rate": 8.512950527614088e-06, + "loss": 0.5059, + "step": 11602 + }, + { + "epoch": 0.15739283776451438, + "grad_norm": 6.934779644012451, + "learning_rate": 8.512813484993833e-06, + "loss": 0.5514, + "step": 11603 + }, + { + "epoch": 0.15740640260444927, + "grad_norm": 6.722757816314697, + "learning_rate": 8.512676442373579e-06, + "loss": 0.4338, + "step": 11604 + }, + { + "epoch": 0.15741996744438416, + "grad_norm": 6.009469032287598, + "learning_rate": 8.512539399753324e-06, + "loss": 0.4052, + "step": 11605 + }, + { + "epoch": 0.15743353228431906, + "grad_norm": 7.029261589050293, + "learning_rate": 8.512402357133069e-06, + "loss": 0.4938, + "step": 11606 + }, + { + "epoch": 0.15744709712425392, + "grad_norm": 7.575536251068115, + "learning_rate": 8.512265314512814e-06, + "loss": 0.4956, + "step": 11607 + }, + { + "epoch": 0.15746066196418881, + "grad_norm": 8.113801002502441, + "learning_rate": 8.51212827189256e-06, + "loss": 0.4757, + "step": 11608 + }, + { + "epoch": 0.1574742268041237, + "grad_norm": 5.814547061920166, + "learning_rate": 8.511991229272304e-06, + "loss": 0.4096, + "step": 11609 + }, + { + "epoch": 0.1574877916440586, + "grad_norm": 7.298645496368408, + "learning_rate": 8.51185418665205e-06, + "loss": 0.4826, + "step": 11610 + }, + { + "epoch": 0.1575013564839935, + "grad_norm": 7.10610818862915, + "learning_rate": 8.511717144031795e-06, + "loss": 0.6482, + "step": 11611 + }, + { + "epoch": 0.1575149213239284, + "grad_norm": 6.0636210441589355, + "learning_rate": 8.51158010141154e-06, + "loss": 0.3177, + "step": 11612 + }, + { + "epoch": 0.15752848616386328, + "grad_norm": 4.22899866104126, + "learning_rate": 8.511443058791285e-06, + "loss": 0.1855, + "step": 11613 + }, + { + "epoch": 0.15754205100379814, + "grad_norm": 6.253177165985107, + "learning_rate": 8.51130601617103e-06, + "loss": 0.3096, + "step": 11614 + }, + { + "epoch": 0.15755561584373304, + "grad_norm": 5.9679460525512695, + "learning_rate": 8.511168973550775e-06, + "loss": 0.3686, + "step": 11615 + }, + { + "epoch": 0.15756918068366793, + "grad_norm": 6.789231300354004, + "learning_rate": 8.51103193093052e-06, + "loss": 0.3973, + "step": 11616 + }, + { + "epoch": 0.15758274552360282, + "grad_norm": 6.991113185882568, + "learning_rate": 8.510894888310264e-06, + "loss": 0.273, + "step": 11617 + }, + { + "epoch": 0.15759631036353772, + "grad_norm": 5.833690166473389, + "learning_rate": 8.51075784569001e-06, + "loss": 0.3673, + "step": 11618 + }, + { + "epoch": 0.1576098752034726, + "grad_norm": 5.130932331085205, + "learning_rate": 8.510620803069756e-06, + "loss": 0.3523, + "step": 11619 + }, + { + "epoch": 0.15762344004340748, + "grad_norm": 6.5829243659973145, + "learning_rate": 8.510483760449501e-06, + "loss": 0.4141, + "step": 11620 + }, + { + "epoch": 0.15763700488334237, + "grad_norm": 6.801628112792969, + "learning_rate": 8.510346717829245e-06, + "loss": 0.2948, + "step": 11621 + }, + { + "epoch": 0.15765056972327726, + "grad_norm": 7.462874412536621, + "learning_rate": 8.51020967520899e-06, + "loss": 0.6045, + "step": 11622 + }, + { + "epoch": 0.15766413456321215, + "grad_norm": 7.831698894500732, + "learning_rate": 8.510072632588737e-06, + "loss": 0.4829, + "step": 11623 + }, + { + "epoch": 0.15767769940314705, + "grad_norm": 6.347375392913818, + "learning_rate": 8.50993558996848e-06, + "loss": 0.522, + "step": 11624 + }, + { + "epoch": 0.15769126424308194, + "grad_norm": 4.911727428436279, + "learning_rate": 8.509798547348226e-06, + "loss": 0.3024, + "step": 11625 + }, + { + "epoch": 0.15770482908301683, + "grad_norm": 5.892440319061279, + "learning_rate": 8.50966150472797e-06, + "loss": 0.3262, + "step": 11626 + }, + { + "epoch": 0.1577183939229517, + "grad_norm": 9.056854248046875, + "learning_rate": 8.509524462107716e-06, + "loss": 0.4681, + "step": 11627 + }, + { + "epoch": 0.1577319587628866, + "grad_norm": 8.361059188842773, + "learning_rate": 8.509387419487461e-06, + "loss": 0.3934, + "step": 11628 + }, + { + "epoch": 0.15774552360282149, + "grad_norm": 7.21424674987793, + "learning_rate": 8.509250376867206e-06, + "loss": 0.4262, + "step": 11629 + }, + { + "epoch": 0.15775908844275638, + "grad_norm": 5.609371662139893, + "learning_rate": 8.509113334246951e-06, + "loss": 0.2866, + "step": 11630 + }, + { + "epoch": 0.15777265328269127, + "grad_norm": 7.0301713943481445, + "learning_rate": 8.508976291626697e-06, + "loss": 0.5538, + "step": 11631 + }, + { + "epoch": 0.15778621812262617, + "grad_norm": 6.751715660095215, + "learning_rate": 8.508839249006442e-06, + "loss": 0.4006, + "step": 11632 + }, + { + "epoch": 0.15779978296256103, + "grad_norm": 4.916189193725586, + "learning_rate": 8.508702206386187e-06, + "loss": 0.3406, + "step": 11633 + }, + { + "epoch": 0.15781334780249592, + "grad_norm": 6.6523003578186035, + "learning_rate": 8.508565163765932e-06, + "loss": 0.3273, + "step": 11634 + }, + { + "epoch": 0.15782691264243082, + "grad_norm": 10.54687786102295, + "learning_rate": 8.508428121145676e-06, + "loss": 0.793, + "step": 11635 + }, + { + "epoch": 0.1578404774823657, + "grad_norm": 9.071691513061523, + "learning_rate": 8.508291078525423e-06, + "loss": 0.5839, + "step": 11636 + }, + { + "epoch": 0.1578540423223006, + "grad_norm": 7.040275573730469, + "learning_rate": 8.508154035905168e-06, + "loss": 0.3504, + "step": 11637 + }, + { + "epoch": 0.1578676071622355, + "grad_norm": 6.595125198364258, + "learning_rate": 8.508016993284913e-06, + "loss": 0.4109, + "step": 11638 + }, + { + "epoch": 0.15788117200217036, + "grad_norm": 8.441422462463379, + "learning_rate": 8.507879950664656e-06, + "loss": 0.4186, + "step": 11639 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 5.8276824951171875, + "learning_rate": 8.507742908044403e-06, + "loss": 0.4222, + "step": 11640 + }, + { + "epoch": 0.15790830168204015, + "grad_norm": 5.524853229522705, + "learning_rate": 8.507605865424148e-06, + "loss": 0.4202, + "step": 11641 + }, + { + "epoch": 0.15792186652197504, + "grad_norm": 5.875197410583496, + "learning_rate": 8.507468822803892e-06, + "loss": 0.4066, + "step": 11642 + }, + { + "epoch": 0.15793543136190993, + "grad_norm": 5.420622825622559, + "learning_rate": 8.507331780183637e-06, + "loss": 0.3541, + "step": 11643 + }, + { + "epoch": 0.15794899620184483, + "grad_norm": 7.328155517578125, + "learning_rate": 8.507194737563384e-06, + "loss": 0.4831, + "step": 11644 + }, + { + "epoch": 0.15796256104177972, + "grad_norm": 5.595098495483398, + "learning_rate": 8.507057694943127e-06, + "loss": 0.4544, + "step": 11645 + }, + { + "epoch": 0.15797612588171459, + "grad_norm": 5.455155372619629, + "learning_rate": 8.506920652322873e-06, + "loss": 0.3678, + "step": 11646 + }, + { + "epoch": 0.15798969072164948, + "grad_norm": 6.664257526397705, + "learning_rate": 8.506783609702618e-06, + "loss": 0.3071, + "step": 11647 + }, + { + "epoch": 0.15800325556158437, + "grad_norm": 5.1243672370910645, + "learning_rate": 8.506646567082363e-06, + "loss": 0.412, + "step": 11648 + }, + { + "epoch": 0.15801682040151926, + "grad_norm": 7.569154739379883, + "learning_rate": 8.506509524462108e-06, + "loss": 0.3204, + "step": 11649 + }, + { + "epoch": 0.15803038524145416, + "grad_norm": 7.3382487297058105, + "learning_rate": 8.506372481841853e-06, + "loss": 0.3636, + "step": 11650 + }, + { + "epoch": 0.15804395008138905, + "grad_norm": 6.682053565979004, + "learning_rate": 8.506235439221599e-06, + "loss": 0.3776, + "step": 11651 + }, + { + "epoch": 0.15805751492132392, + "grad_norm": 6.671211242675781, + "learning_rate": 8.506098396601344e-06, + "loss": 0.4431, + "step": 11652 + }, + { + "epoch": 0.1580710797612588, + "grad_norm": 5.512150287628174, + "learning_rate": 8.505961353981089e-06, + "loss": 0.3173, + "step": 11653 + }, + { + "epoch": 0.1580846446011937, + "grad_norm": 5.257598400115967, + "learning_rate": 8.505824311360834e-06, + "loss": 0.3294, + "step": 11654 + }, + { + "epoch": 0.1580982094411286, + "grad_norm": 6.302951812744141, + "learning_rate": 8.50568726874058e-06, + "loss": 0.3204, + "step": 11655 + }, + { + "epoch": 0.1581117742810635, + "grad_norm": 9.740781784057617, + "learning_rate": 8.505550226120324e-06, + "loss": 0.5709, + "step": 11656 + }, + { + "epoch": 0.15812533912099838, + "grad_norm": 5.57565450668335, + "learning_rate": 8.50541318350007e-06, + "loss": 0.2955, + "step": 11657 + }, + { + "epoch": 0.15813890396093327, + "grad_norm": 6.316195964813232, + "learning_rate": 8.505276140879815e-06, + "loss": 0.4168, + "step": 11658 + }, + { + "epoch": 0.15815246880086814, + "grad_norm": 8.06820297241211, + "learning_rate": 8.50513909825956e-06, + "loss": 0.424, + "step": 11659 + }, + { + "epoch": 0.15816603364080303, + "grad_norm": 5.406171798706055, + "learning_rate": 8.505002055639303e-06, + "loss": 0.5133, + "step": 11660 + }, + { + "epoch": 0.15817959848073793, + "grad_norm": 6.958815574645996, + "learning_rate": 8.504865013019049e-06, + "loss": 0.471, + "step": 11661 + }, + { + "epoch": 0.15819316332067282, + "grad_norm": 6.097813129425049, + "learning_rate": 8.504727970398795e-06, + "loss": 0.5408, + "step": 11662 + }, + { + "epoch": 0.1582067281606077, + "grad_norm": 8.516481399536133, + "learning_rate": 8.50459092777854e-06, + "loss": 0.6442, + "step": 11663 + }, + { + "epoch": 0.1582202930005426, + "grad_norm": 7.2705535888671875, + "learning_rate": 8.504453885158284e-06, + "loss": 0.4174, + "step": 11664 + }, + { + "epoch": 0.15823385784047747, + "grad_norm": 5.863336563110352, + "learning_rate": 8.50431684253803e-06, + "loss": 0.4089, + "step": 11665 + }, + { + "epoch": 0.15824742268041236, + "grad_norm": 6.041840076446533, + "learning_rate": 8.504179799917776e-06, + "loss": 0.366, + "step": 11666 + }, + { + "epoch": 0.15826098752034726, + "grad_norm": 5.669031620025635, + "learning_rate": 8.50404275729752e-06, + "loss": 0.5108, + "step": 11667 + }, + { + "epoch": 0.15827455236028215, + "grad_norm": 5.9837470054626465, + "learning_rate": 8.503905714677265e-06, + "loss": 0.3933, + "step": 11668 + }, + { + "epoch": 0.15828811720021704, + "grad_norm": 6.483503341674805, + "learning_rate": 8.50376867205701e-06, + "loss": 0.4736, + "step": 11669 + }, + { + "epoch": 0.15830168204015194, + "grad_norm": 8.262043952941895, + "learning_rate": 8.503631629436755e-06, + "loss": 0.4462, + "step": 11670 + }, + { + "epoch": 0.1583152468800868, + "grad_norm": 7.781821250915527, + "learning_rate": 8.5034945868165e-06, + "loss": 0.4525, + "step": 11671 + }, + { + "epoch": 0.1583288117200217, + "grad_norm": 5.415538787841797, + "learning_rate": 8.503357544196246e-06, + "loss": 0.3273, + "step": 11672 + }, + { + "epoch": 0.1583423765599566, + "grad_norm": 4.755737781524658, + "learning_rate": 8.50322050157599e-06, + "loss": 0.3105, + "step": 11673 + }, + { + "epoch": 0.15835594139989148, + "grad_norm": 6.730125904083252, + "learning_rate": 8.503083458955736e-06, + "loss": 0.3345, + "step": 11674 + }, + { + "epoch": 0.15836950623982637, + "grad_norm": 5.819712162017822, + "learning_rate": 8.502946416335481e-06, + "loss": 0.4935, + "step": 11675 + }, + { + "epoch": 0.15838307107976127, + "grad_norm": 6.521427154541016, + "learning_rate": 8.502809373715226e-06, + "loss": 0.4263, + "step": 11676 + }, + { + "epoch": 0.15839663591969616, + "grad_norm": 5.003021717071533, + "learning_rate": 8.502672331094971e-06, + "loss": 0.3035, + "step": 11677 + }, + { + "epoch": 0.15841020075963103, + "grad_norm": 7.688681602478027, + "learning_rate": 8.502535288474717e-06, + "loss": 0.4364, + "step": 11678 + }, + { + "epoch": 0.15842376559956592, + "grad_norm": 10.842077255249023, + "learning_rate": 8.502398245854462e-06, + "loss": 0.5796, + "step": 11679 + }, + { + "epoch": 0.1584373304395008, + "grad_norm": 5.940074920654297, + "learning_rate": 8.502261203234207e-06, + "loss": 0.3386, + "step": 11680 + }, + { + "epoch": 0.1584508952794357, + "grad_norm": 5.546331882476807, + "learning_rate": 8.502124160613952e-06, + "loss": 0.2925, + "step": 11681 + }, + { + "epoch": 0.1584644601193706, + "grad_norm": 5.495014190673828, + "learning_rate": 8.501987117993696e-06, + "loss": 0.2981, + "step": 11682 + }, + { + "epoch": 0.1584780249593055, + "grad_norm": 7.2510833740234375, + "learning_rate": 8.501850075373443e-06, + "loss": 0.4839, + "step": 11683 + }, + { + "epoch": 0.15849158979924036, + "grad_norm": 5.983634948730469, + "learning_rate": 8.501713032753188e-06, + "loss": 0.4348, + "step": 11684 + }, + { + "epoch": 0.15850515463917525, + "grad_norm": 5.973278522491455, + "learning_rate": 8.501575990132931e-06, + "loss": 0.4034, + "step": 11685 + }, + { + "epoch": 0.15851871947911014, + "grad_norm": 5.786810398101807, + "learning_rate": 8.501438947512676e-06, + "loss": 0.3894, + "step": 11686 + }, + { + "epoch": 0.15853228431904504, + "grad_norm": 6.390234470367432, + "learning_rate": 8.501301904892422e-06, + "loss": 0.3566, + "step": 11687 + }, + { + "epoch": 0.15854584915897993, + "grad_norm": 5.778083324432373, + "learning_rate": 8.501164862272168e-06, + "loss": 0.2535, + "step": 11688 + }, + { + "epoch": 0.15855941399891482, + "grad_norm": 6.605905055999756, + "learning_rate": 8.501027819651912e-06, + "loss": 0.2886, + "step": 11689 + }, + { + "epoch": 0.15857297883884972, + "grad_norm": 6.834146499633789, + "learning_rate": 8.500890777031657e-06, + "loss": 0.4245, + "step": 11690 + }, + { + "epoch": 0.15858654367878458, + "grad_norm": 6.948099136352539, + "learning_rate": 8.500753734411402e-06, + "loss": 0.2592, + "step": 11691 + }, + { + "epoch": 0.15860010851871947, + "grad_norm": 6.254024505615234, + "learning_rate": 8.500616691791147e-06, + "loss": 0.33, + "step": 11692 + }, + { + "epoch": 0.15861367335865437, + "grad_norm": 5.944091796875, + "learning_rate": 8.500479649170893e-06, + "loss": 0.4745, + "step": 11693 + }, + { + "epoch": 0.15862723819858926, + "grad_norm": 4.733989238739014, + "learning_rate": 8.500342606550638e-06, + "loss": 0.3324, + "step": 11694 + }, + { + "epoch": 0.15864080303852415, + "grad_norm": 5.504137992858887, + "learning_rate": 8.500205563930383e-06, + "loss": 0.2117, + "step": 11695 + }, + { + "epoch": 0.15865436787845905, + "grad_norm": 6.954818248748779, + "learning_rate": 8.500068521310128e-06, + "loss": 0.431, + "step": 11696 + }, + { + "epoch": 0.1586679327183939, + "grad_norm": 5.975978374481201, + "learning_rate": 8.499931478689873e-06, + "loss": 0.336, + "step": 11697 + }, + { + "epoch": 0.1586814975583288, + "grad_norm": 6.09842586517334, + "learning_rate": 8.499794436069619e-06, + "loss": 0.2719, + "step": 11698 + }, + { + "epoch": 0.1586950623982637, + "grad_norm": 4.271101951599121, + "learning_rate": 8.499657393449364e-06, + "loss": 0.1964, + "step": 11699 + }, + { + "epoch": 0.1587086272381986, + "grad_norm": 4.807662487030029, + "learning_rate": 8.499520350829109e-06, + "loss": 0.2647, + "step": 11700 + }, + { + "epoch": 0.15872219207813348, + "grad_norm": 6.045955181121826, + "learning_rate": 8.499383308208854e-06, + "loss": 0.4197, + "step": 11701 + }, + { + "epoch": 0.15873575691806838, + "grad_norm": 5.607292652130127, + "learning_rate": 8.4992462655886e-06, + "loss": 0.2551, + "step": 11702 + }, + { + "epoch": 0.15874932175800324, + "grad_norm": 5.7003493309021, + "learning_rate": 8.499109222968344e-06, + "loss": 0.3157, + "step": 11703 + }, + { + "epoch": 0.15876288659793814, + "grad_norm": 5.614954948425293, + "learning_rate": 8.498972180348088e-06, + "loss": 0.3705, + "step": 11704 + }, + { + "epoch": 0.15877645143787303, + "grad_norm": 4.759706497192383, + "learning_rate": 8.498835137727835e-06, + "loss": 0.2902, + "step": 11705 + }, + { + "epoch": 0.15879001627780792, + "grad_norm": 5.526489734649658, + "learning_rate": 8.49869809510758e-06, + "loss": 0.2878, + "step": 11706 + }, + { + "epoch": 0.15880358111774281, + "grad_norm": 4.915333271026611, + "learning_rate": 8.498561052487323e-06, + "loss": 0.2448, + "step": 11707 + }, + { + "epoch": 0.1588171459576777, + "grad_norm": 6.1563568115234375, + "learning_rate": 8.498424009867069e-06, + "loss": 0.2376, + "step": 11708 + }, + { + "epoch": 0.1588307107976126, + "grad_norm": 5.764602184295654, + "learning_rate": 8.498286967246816e-06, + "loss": 0.3501, + "step": 11709 + }, + { + "epoch": 0.15884427563754747, + "grad_norm": 6.351776123046875, + "learning_rate": 8.498149924626559e-06, + "loss": 0.4325, + "step": 11710 + }, + { + "epoch": 0.15885784047748236, + "grad_norm": 4.82932710647583, + "learning_rate": 8.498012882006304e-06, + "loss": 0.3741, + "step": 11711 + }, + { + "epoch": 0.15887140531741725, + "grad_norm": 6.417455196380615, + "learning_rate": 8.49787583938605e-06, + "loss": 0.2909, + "step": 11712 + }, + { + "epoch": 0.15888497015735215, + "grad_norm": 6.281325340270996, + "learning_rate": 8.497738796765795e-06, + "loss": 0.3408, + "step": 11713 + }, + { + "epoch": 0.15889853499728704, + "grad_norm": 6.579471588134766, + "learning_rate": 8.49760175414554e-06, + "loss": 0.3693, + "step": 11714 + }, + { + "epoch": 0.15891209983722193, + "grad_norm": 5.470376968383789, + "learning_rate": 8.497464711525285e-06, + "loss": 0.3248, + "step": 11715 + }, + { + "epoch": 0.1589256646771568, + "grad_norm": 6.466825008392334, + "learning_rate": 8.49732766890503e-06, + "loss": 0.257, + "step": 11716 + }, + { + "epoch": 0.1589392295170917, + "grad_norm": 5.800273895263672, + "learning_rate": 8.497190626284775e-06, + "loss": 0.4119, + "step": 11717 + }, + { + "epoch": 0.15895279435702658, + "grad_norm": 4.074289321899414, + "learning_rate": 8.49705358366452e-06, + "loss": 0.2866, + "step": 11718 + }, + { + "epoch": 0.15896635919696148, + "grad_norm": 5.331685543060303, + "learning_rate": 8.496916541044266e-06, + "loss": 0.3684, + "step": 11719 + }, + { + "epoch": 0.15897992403689637, + "grad_norm": 5.4412455558776855, + "learning_rate": 8.49677949842401e-06, + "loss": 0.3637, + "step": 11720 + }, + { + "epoch": 0.15899348887683126, + "grad_norm": 5.46159029006958, + "learning_rate": 8.496642455803756e-06, + "loss": 0.3501, + "step": 11721 + }, + { + "epoch": 0.15900705371676616, + "grad_norm": 4.110288143157959, + "learning_rate": 8.496505413183501e-06, + "loss": 0.1941, + "step": 11722 + }, + { + "epoch": 0.15902061855670102, + "grad_norm": 4.7301788330078125, + "learning_rate": 8.496368370563246e-06, + "loss": 0.3521, + "step": 11723 + }, + { + "epoch": 0.15903418339663591, + "grad_norm": 3.9642367362976074, + "learning_rate": 8.496231327942992e-06, + "loss": 0.258, + "step": 11724 + }, + { + "epoch": 0.1590477482365708, + "grad_norm": 5.032835006713867, + "learning_rate": 8.496094285322735e-06, + "loss": 0.366, + "step": 11725 + }, + { + "epoch": 0.1590613130765057, + "grad_norm": 5.731156826019287, + "learning_rate": 8.495957242702482e-06, + "loss": 0.3621, + "step": 11726 + }, + { + "epoch": 0.1590748779164406, + "grad_norm": 5.665318012237549, + "learning_rate": 8.495820200082227e-06, + "loss": 0.3884, + "step": 11727 + }, + { + "epoch": 0.1590884427563755, + "grad_norm": 4.730794429779053, + "learning_rate": 8.49568315746197e-06, + "loss": 0.2666, + "step": 11728 + }, + { + "epoch": 0.15910200759631035, + "grad_norm": 6.923519611358643, + "learning_rate": 8.495546114841716e-06, + "loss": 0.4468, + "step": 11729 + }, + { + "epoch": 0.15911557243624525, + "grad_norm": 5.898064613342285, + "learning_rate": 8.495409072221461e-06, + "loss": 0.3201, + "step": 11730 + }, + { + "epoch": 0.15912913727618014, + "grad_norm": 5.930343151092529, + "learning_rate": 8.495272029601208e-06, + "loss": 0.3213, + "step": 11731 + }, + { + "epoch": 0.15914270211611503, + "grad_norm": 7.951318740844727, + "learning_rate": 8.495134986980951e-06, + "loss": 0.4988, + "step": 11732 + }, + { + "epoch": 0.15915626695604992, + "grad_norm": 6.956562042236328, + "learning_rate": 8.494997944360696e-06, + "loss": 0.3591, + "step": 11733 + }, + { + "epoch": 0.15916983179598482, + "grad_norm": 9.220015525817871, + "learning_rate": 8.494860901740442e-06, + "loss": 0.3123, + "step": 11734 + }, + { + "epoch": 0.15918339663591968, + "grad_norm": 5.97031307220459, + "learning_rate": 8.494723859120187e-06, + "loss": 0.3357, + "step": 11735 + }, + { + "epoch": 0.15919696147585458, + "grad_norm": 7.585029602050781, + "learning_rate": 8.494586816499932e-06, + "loss": 0.4305, + "step": 11736 + }, + { + "epoch": 0.15921052631578947, + "grad_norm": 7.293946743011475, + "learning_rate": 8.494449773879677e-06, + "loss": 0.5094, + "step": 11737 + }, + { + "epoch": 0.15922409115572436, + "grad_norm": 5.195432186126709, + "learning_rate": 8.494312731259422e-06, + "loss": 0.3692, + "step": 11738 + }, + { + "epoch": 0.15923765599565926, + "grad_norm": 5.412491798400879, + "learning_rate": 8.494175688639167e-06, + "loss": 0.3624, + "step": 11739 + }, + { + "epoch": 0.15925122083559415, + "grad_norm": 6.474765300750732, + "learning_rate": 8.494038646018913e-06, + "loss": 0.4262, + "step": 11740 + }, + { + "epoch": 0.15926478567552904, + "grad_norm": 6.6903228759765625, + "learning_rate": 8.493901603398658e-06, + "loss": 0.339, + "step": 11741 + }, + { + "epoch": 0.1592783505154639, + "grad_norm": 7.463290691375732, + "learning_rate": 8.493764560778403e-06, + "loss": 0.3822, + "step": 11742 + }, + { + "epoch": 0.1592919153553988, + "grad_norm": 5.1709418296813965, + "learning_rate": 8.493627518158147e-06, + "loss": 0.3253, + "step": 11743 + }, + { + "epoch": 0.1593054801953337, + "grad_norm": 6.302470684051514, + "learning_rate": 8.493490475537893e-06, + "loss": 0.3484, + "step": 11744 + }, + { + "epoch": 0.15931904503526859, + "grad_norm": 5.037800312042236, + "learning_rate": 8.493353432917639e-06, + "loss": 0.3282, + "step": 11745 + }, + { + "epoch": 0.15933260987520348, + "grad_norm": 5.9961042404174805, + "learning_rate": 8.493216390297384e-06, + "loss": 0.2639, + "step": 11746 + }, + { + "epoch": 0.15934617471513837, + "grad_norm": 7.854069709777832, + "learning_rate": 8.493079347677127e-06, + "loss": 0.5659, + "step": 11747 + }, + { + "epoch": 0.15935973955507324, + "grad_norm": 7.60667610168457, + "learning_rate": 8.492942305056874e-06, + "loss": 0.392, + "step": 11748 + }, + { + "epoch": 0.15937330439500813, + "grad_norm": 7.463174343109131, + "learning_rate": 8.49280526243662e-06, + "loss": 0.5955, + "step": 11749 + }, + { + "epoch": 0.15938686923494302, + "grad_norm": 6.421631336212158, + "learning_rate": 8.492668219816363e-06, + "loss": 0.3646, + "step": 11750 + }, + { + "epoch": 0.15940043407487792, + "grad_norm": 6.478449821472168, + "learning_rate": 8.492531177196108e-06, + "loss": 0.3432, + "step": 11751 + }, + { + "epoch": 0.1594139989148128, + "grad_norm": 6.551289081573486, + "learning_rate": 8.492394134575855e-06, + "loss": 0.3334, + "step": 11752 + }, + { + "epoch": 0.1594275637547477, + "grad_norm": 9.134143829345703, + "learning_rate": 8.492257091955598e-06, + "loss": 0.4409, + "step": 11753 + }, + { + "epoch": 0.1594411285946826, + "grad_norm": 8.71929931640625, + "learning_rate": 8.492120049335343e-06, + "loss": 0.5128, + "step": 11754 + }, + { + "epoch": 0.15945469343461746, + "grad_norm": 5.285160064697266, + "learning_rate": 8.491983006715089e-06, + "loss": 0.2786, + "step": 11755 + }, + { + "epoch": 0.15946825827455235, + "grad_norm": 6.8961992263793945, + "learning_rate": 8.491845964094834e-06, + "loss": 0.4167, + "step": 11756 + }, + { + "epoch": 0.15948182311448725, + "grad_norm": 7.524195194244385, + "learning_rate": 8.491708921474579e-06, + "loss": 0.4585, + "step": 11757 + }, + { + "epoch": 0.15949538795442214, + "grad_norm": 4.2574381828308105, + "learning_rate": 8.491571878854324e-06, + "loss": 0.2302, + "step": 11758 + }, + { + "epoch": 0.15950895279435703, + "grad_norm": 6.5313310623168945, + "learning_rate": 8.49143483623407e-06, + "loss": 0.4005, + "step": 11759 + }, + { + "epoch": 0.15952251763429193, + "grad_norm": 6.501779079437256, + "learning_rate": 8.491297793613815e-06, + "loss": 0.3211, + "step": 11760 + }, + { + "epoch": 0.1595360824742268, + "grad_norm": 7.927577018737793, + "learning_rate": 8.49116075099356e-06, + "loss": 0.4522, + "step": 11761 + }, + { + "epoch": 0.15954964731416169, + "grad_norm": 7.189305782318115, + "learning_rate": 8.491023708373305e-06, + "loss": 0.4361, + "step": 11762 + }, + { + "epoch": 0.15956321215409658, + "grad_norm": 7.672161102294922, + "learning_rate": 8.49088666575305e-06, + "loss": 0.4819, + "step": 11763 + }, + { + "epoch": 0.15957677699403147, + "grad_norm": 6.850467205047607, + "learning_rate": 8.490749623132795e-06, + "loss": 0.5097, + "step": 11764 + }, + { + "epoch": 0.15959034183396636, + "grad_norm": 3.6906204223632812, + "learning_rate": 8.49061258051254e-06, + "loss": 0.2475, + "step": 11765 + }, + { + "epoch": 0.15960390667390126, + "grad_norm": 7.0537214279174805, + "learning_rate": 8.490475537892286e-06, + "loss": 0.4128, + "step": 11766 + }, + { + "epoch": 0.15961747151383612, + "grad_norm": 5.959044933319092, + "learning_rate": 8.49033849527203e-06, + "loss": 0.4305, + "step": 11767 + }, + { + "epoch": 0.15963103635377102, + "grad_norm": 6.685502529144287, + "learning_rate": 8.490201452651774e-06, + "loss": 0.4278, + "step": 11768 + }, + { + "epoch": 0.1596446011937059, + "grad_norm": 6.519890308380127, + "learning_rate": 8.490064410031521e-06, + "loss": 0.3345, + "step": 11769 + }, + { + "epoch": 0.1596581660336408, + "grad_norm": 7.007462024688721, + "learning_rate": 8.489927367411266e-06, + "loss": 0.5177, + "step": 11770 + }, + { + "epoch": 0.1596717308735757, + "grad_norm": 5.167903423309326, + "learning_rate": 8.489790324791012e-06, + "loss": 0.4494, + "step": 11771 + }, + { + "epoch": 0.1596852957135106, + "grad_norm": 7.337265491485596, + "learning_rate": 8.489653282170755e-06, + "loss": 0.4689, + "step": 11772 + }, + { + "epoch": 0.15969886055344548, + "grad_norm": 8.58018970489502, + "learning_rate": 8.4895162395505e-06, + "loss": 0.5123, + "step": 11773 + }, + { + "epoch": 0.15971242539338035, + "grad_norm": 6.522590637207031, + "learning_rate": 8.489379196930247e-06, + "loss": 0.3501, + "step": 11774 + }, + { + "epoch": 0.15972599023331524, + "grad_norm": 7.959064483642578, + "learning_rate": 8.48924215430999e-06, + "loss": 0.6739, + "step": 11775 + }, + { + "epoch": 0.15973955507325013, + "grad_norm": 6.516643524169922, + "learning_rate": 8.489105111689736e-06, + "loss": 0.3313, + "step": 11776 + }, + { + "epoch": 0.15975311991318503, + "grad_norm": 6.667094707489014, + "learning_rate": 8.488968069069481e-06, + "loss": 0.3989, + "step": 11777 + }, + { + "epoch": 0.15976668475311992, + "grad_norm": 6.357295036315918, + "learning_rate": 8.488831026449226e-06, + "loss": 0.5255, + "step": 11778 + }, + { + "epoch": 0.1597802495930548, + "grad_norm": 9.784723281860352, + "learning_rate": 8.488693983828971e-06, + "loss": 0.5659, + "step": 11779 + }, + { + "epoch": 0.15979381443298968, + "grad_norm": 7.447550296783447, + "learning_rate": 8.488556941208716e-06, + "loss": 0.5551, + "step": 11780 + }, + { + "epoch": 0.15980737927292457, + "grad_norm": 5.564870357513428, + "learning_rate": 8.488419898588462e-06, + "loss": 0.4332, + "step": 11781 + }, + { + "epoch": 0.15982094411285946, + "grad_norm": 7.10966682434082, + "learning_rate": 8.488282855968207e-06, + "loss": 0.3904, + "step": 11782 + }, + { + "epoch": 0.15983450895279436, + "grad_norm": 5.081770420074463, + "learning_rate": 8.488145813347952e-06, + "loss": 0.3552, + "step": 11783 + }, + { + "epoch": 0.15984807379272925, + "grad_norm": 6.256196975708008, + "learning_rate": 8.488008770727697e-06, + "loss": 0.4442, + "step": 11784 + }, + { + "epoch": 0.15986163863266414, + "grad_norm": 6.859849452972412, + "learning_rate": 8.487871728107442e-06, + "loss": 0.4083, + "step": 11785 + }, + { + "epoch": 0.15987520347259904, + "grad_norm": 5.9467926025390625, + "learning_rate": 8.487734685487188e-06, + "loss": 0.3514, + "step": 11786 + }, + { + "epoch": 0.1598887683125339, + "grad_norm": 4.872093677520752, + "learning_rate": 8.487597642866933e-06, + "loss": 0.2642, + "step": 11787 + }, + { + "epoch": 0.1599023331524688, + "grad_norm": 5.188451290130615, + "learning_rate": 8.487460600246678e-06, + "loss": 0.4194, + "step": 11788 + }, + { + "epoch": 0.1599158979924037, + "grad_norm": 6.165949821472168, + "learning_rate": 8.487323557626423e-06, + "loss": 0.585, + "step": 11789 + }, + { + "epoch": 0.15992946283233858, + "grad_norm": 5.123299598693848, + "learning_rate": 8.487186515006167e-06, + "loss": 0.4044, + "step": 11790 + }, + { + "epoch": 0.15994302767227347, + "grad_norm": 5.788045883178711, + "learning_rate": 8.487049472385913e-06, + "loss": 0.4368, + "step": 11791 + }, + { + "epoch": 0.15995659251220837, + "grad_norm": 6.746870040893555, + "learning_rate": 8.486912429765659e-06, + "loss": 0.4543, + "step": 11792 + }, + { + "epoch": 0.15997015735214323, + "grad_norm": 6.911814212799072, + "learning_rate": 8.486775387145402e-06, + "loss": 0.3867, + "step": 11793 + }, + { + "epoch": 0.15998372219207813, + "grad_norm": 6.054447650909424, + "learning_rate": 8.486638344525147e-06, + "loss": 0.423, + "step": 11794 + }, + { + "epoch": 0.15999728703201302, + "grad_norm": 9.197505950927734, + "learning_rate": 8.486501301904894e-06, + "loss": 0.5115, + "step": 11795 + }, + { + "epoch": 0.1600108518719479, + "grad_norm": 5.8012590408325195, + "learning_rate": 8.48636425928464e-06, + "loss": 0.4475, + "step": 11796 + }, + { + "epoch": 0.1600244167118828, + "grad_norm": 5.455972194671631, + "learning_rate": 8.486227216664383e-06, + "loss": 0.397, + "step": 11797 + }, + { + "epoch": 0.1600379815518177, + "grad_norm": 4.988344669342041, + "learning_rate": 8.486090174044128e-06, + "loss": 0.4447, + "step": 11798 + }, + { + "epoch": 0.16005154639175256, + "grad_norm": 6.7910685539245605, + "learning_rate": 8.485953131423873e-06, + "loss": 0.464, + "step": 11799 + }, + { + "epoch": 0.16006511123168746, + "grad_norm": 5.365777015686035, + "learning_rate": 8.485816088803618e-06, + "loss": 0.3386, + "step": 11800 + }, + { + "epoch": 0.16007867607162235, + "grad_norm": 5.580697059631348, + "learning_rate": 8.485679046183364e-06, + "loss": 0.3417, + "step": 11801 + }, + { + "epoch": 0.16009224091155724, + "grad_norm": 7.324523448944092, + "learning_rate": 8.485542003563109e-06, + "loss": 0.444, + "step": 11802 + }, + { + "epoch": 0.16010580575149214, + "grad_norm": 8.134329795837402, + "learning_rate": 8.485404960942854e-06, + "loss": 0.426, + "step": 11803 + }, + { + "epoch": 0.16011937059142703, + "grad_norm": 6.546031951904297, + "learning_rate": 8.485267918322599e-06, + "loss": 0.4027, + "step": 11804 + }, + { + "epoch": 0.16013293543136192, + "grad_norm": 5.765138149261475, + "learning_rate": 8.485130875702344e-06, + "loss": 0.4256, + "step": 11805 + }, + { + "epoch": 0.1601465002712968, + "grad_norm": 6.801455497741699, + "learning_rate": 8.48499383308209e-06, + "loss": 0.4349, + "step": 11806 + }, + { + "epoch": 0.16016006511123168, + "grad_norm": 5.794497966766357, + "learning_rate": 8.484856790461835e-06, + "loss": 0.4698, + "step": 11807 + }, + { + "epoch": 0.16017362995116657, + "grad_norm": 4.403530120849609, + "learning_rate": 8.48471974784158e-06, + "loss": 0.2085, + "step": 11808 + }, + { + "epoch": 0.16018719479110147, + "grad_norm": 5.524564266204834, + "learning_rate": 8.484582705221325e-06, + "loss": 0.2652, + "step": 11809 + }, + { + "epoch": 0.16020075963103636, + "grad_norm": 5.647326469421387, + "learning_rate": 8.48444566260107e-06, + "loss": 0.4541, + "step": 11810 + }, + { + "epoch": 0.16021432447097125, + "grad_norm": 6.750731468200684, + "learning_rate": 8.484308619980815e-06, + "loss": 0.484, + "step": 11811 + }, + { + "epoch": 0.16022788931090612, + "grad_norm": 6.995562553405762, + "learning_rate": 8.484171577360559e-06, + "loss": 0.3799, + "step": 11812 + }, + { + "epoch": 0.160241454150841, + "grad_norm": 5.14542818069458, + "learning_rate": 8.484034534740306e-06, + "loss": 0.3413, + "step": 11813 + }, + { + "epoch": 0.1602550189907759, + "grad_norm": 6.34212064743042, + "learning_rate": 8.48389749212005e-06, + "loss": 0.2836, + "step": 11814 + }, + { + "epoch": 0.1602685838307108, + "grad_norm": 7.067591190338135, + "learning_rate": 8.483760449499794e-06, + "loss": 0.4554, + "step": 11815 + }, + { + "epoch": 0.1602821486706457, + "grad_norm": 6.119629859924316, + "learning_rate": 8.48362340687954e-06, + "loss": 0.2483, + "step": 11816 + }, + { + "epoch": 0.16029571351058058, + "grad_norm": 5.462134838104248, + "learning_rate": 8.483486364259286e-06, + "loss": 0.3039, + "step": 11817 + }, + { + "epoch": 0.16030927835051548, + "grad_norm": 4.543563365936279, + "learning_rate": 8.48334932163903e-06, + "loss": 0.3362, + "step": 11818 + }, + { + "epoch": 0.16032284319045034, + "grad_norm": 4.822942733764648, + "learning_rate": 8.483212279018775e-06, + "loss": 0.2942, + "step": 11819 + }, + { + "epoch": 0.16033640803038524, + "grad_norm": 6.111002445220947, + "learning_rate": 8.48307523639852e-06, + "loss": 0.4788, + "step": 11820 + }, + { + "epoch": 0.16034997287032013, + "grad_norm": 6.493294715881348, + "learning_rate": 8.482938193778265e-06, + "loss": 0.4337, + "step": 11821 + }, + { + "epoch": 0.16036353771025502, + "grad_norm": 5.203524589538574, + "learning_rate": 8.48280115115801e-06, + "loss": 0.3063, + "step": 11822 + }, + { + "epoch": 0.16037710255018992, + "grad_norm": 6.736754894256592, + "learning_rate": 8.482664108537756e-06, + "loss": 0.4311, + "step": 11823 + }, + { + "epoch": 0.1603906673901248, + "grad_norm": 8.396513938903809, + "learning_rate": 8.482527065917501e-06, + "loss": 0.5079, + "step": 11824 + }, + { + "epoch": 0.16040423223005967, + "grad_norm": 7.077273845672607, + "learning_rate": 8.482390023297246e-06, + "loss": 0.3934, + "step": 11825 + }, + { + "epoch": 0.16041779706999457, + "grad_norm": 7.205737113952637, + "learning_rate": 8.482252980676991e-06, + "loss": 0.2678, + "step": 11826 + }, + { + "epoch": 0.16043136190992946, + "grad_norm": 6.663840293884277, + "learning_rate": 8.482115938056736e-06, + "loss": 0.3922, + "step": 11827 + }, + { + "epoch": 0.16044492674986435, + "grad_norm": 5.009943008422852, + "learning_rate": 8.481978895436482e-06, + "loss": 0.3068, + "step": 11828 + }, + { + "epoch": 0.16045849158979925, + "grad_norm": 5.903981685638428, + "learning_rate": 8.481841852816227e-06, + "loss": 0.2153, + "step": 11829 + }, + { + "epoch": 0.16047205642973414, + "grad_norm": 7.4491353034973145, + "learning_rate": 8.481704810195972e-06, + "loss": 0.3358, + "step": 11830 + }, + { + "epoch": 0.160485621269669, + "grad_norm": 5.438140392303467, + "learning_rate": 8.481567767575717e-06, + "loss": 0.2392, + "step": 11831 + }, + { + "epoch": 0.1604991861096039, + "grad_norm": 5.248794078826904, + "learning_rate": 8.481430724955462e-06, + "loss": 0.2182, + "step": 11832 + }, + { + "epoch": 0.1605127509495388, + "grad_norm": 7.195018291473389, + "learning_rate": 8.481293682335206e-06, + "loss": 0.3876, + "step": 11833 + }, + { + "epoch": 0.16052631578947368, + "grad_norm": 8.1058349609375, + "learning_rate": 8.481156639714953e-06, + "loss": 0.4293, + "step": 11834 + }, + { + "epoch": 0.16053988062940858, + "grad_norm": 7.053974151611328, + "learning_rate": 8.481019597094698e-06, + "loss": 0.3355, + "step": 11835 + }, + { + "epoch": 0.16055344546934347, + "grad_norm": 7.307062149047852, + "learning_rate": 8.480882554474441e-06, + "loss": 0.3681, + "step": 11836 + }, + { + "epoch": 0.16056701030927836, + "grad_norm": 7.431933403015137, + "learning_rate": 8.480745511854187e-06, + "loss": 0.4716, + "step": 11837 + }, + { + "epoch": 0.16058057514921323, + "grad_norm": 7.049337863922119, + "learning_rate": 8.480608469233933e-06, + "loss": 0.3206, + "step": 11838 + }, + { + "epoch": 0.16059413998914812, + "grad_norm": 6.141425609588623, + "learning_rate": 8.480471426613679e-06, + "loss": 0.2714, + "step": 11839 + }, + { + "epoch": 0.16060770482908301, + "grad_norm": 9.435579299926758, + "learning_rate": 8.480334383993422e-06, + "loss": 0.3717, + "step": 11840 + }, + { + "epoch": 0.1606212696690179, + "grad_norm": 7.202622890472412, + "learning_rate": 8.480197341373167e-06, + "loss": 0.3442, + "step": 11841 + }, + { + "epoch": 0.1606348345089528, + "grad_norm": 6.789344787597656, + "learning_rate": 8.480060298752912e-06, + "loss": 0.236, + "step": 11842 + }, + { + "epoch": 0.1606483993488877, + "grad_norm": 7.571081638336182, + "learning_rate": 8.479923256132658e-06, + "loss": 0.2792, + "step": 11843 + }, + { + "epoch": 0.16066196418882256, + "grad_norm": 6.856028079986572, + "learning_rate": 8.479786213512403e-06, + "loss": 0.2874, + "step": 11844 + }, + { + "epoch": 0.16067552902875745, + "grad_norm": 7.816951274871826, + "learning_rate": 8.479649170892148e-06, + "loss": 0.4323, + "step": 11845 + }, + { + "epoch": 0.16068909386869235, + "grad_norm": 4.59271240234375, + "learning_rate": 8.479512128271893e-06, + "loss": 0.2107, + "step": 11846 + }, + { + "epoch": 0.16070265870862724, + "grad_norm": 7.774786472320557, + "learning_rate": 8.479375085651638e-06, + "loss": 0.3486, + "step": 11847 + }, + { + "epoch": 0.16071622354856213, + "grad_norm": 7.311929702758789, + "learning_rate": 8.479238043031384e-06, + "loss": 0.3771, + "step": 11848 + }, + { + "epoch": 0.16072978838849702, + "grad_norm": 6.27879524230957, + "learning_rate": 8.479101000411129e-06, + "loss": 0.3483, + "step": 11849 + }, + { + "epoch": 0.16074335322843192, + "grad_norm": 6.996039390563965, + "learning_rate": 8.478963957790874e-06, + "loss": 0.3303, + "step": 11850 + }, + { + "epoch": 0.16075691806836678, + "grad_norm": 7.483651638031006, + "learning_rate": 8.478826915170619e-06, + "loss": 0.2845, + "step": 11851 + }, + { + "epoch": 0.16077048290830168, + "grad_norm": 6.934619426727295, + "learning_rate": 8.478689872550364e-06, + "loss": 0.3627, + "step": 11852 + }, + { + "epoch": 0.16078404774823657, + "grad_norm": 6.162098407745361, + "learning_rate": 8.47855282993011e-06, + "loss": 0.4343, + "step": 11853 + }, + { + "epoch": 0.16079761258817146, + "grad_norm": 5.08290433883667, + "learning_rate": 8.478415787309855e-06, + "loss": 0.2771, + "step": 11854 + }, + { + "epoch": 0.16081117742810636, + "grad_norm": 5.382961273193359, + "learning_rate": 8.478278744689598e-06, + "loss": 0.2502, + "step": 11855 + }, + { + "epoch": 0.16082474226804125, + "grad_norm": 6.728135108947754, + "learning_rate": 8.478141702069345e-06, + "loss": 0.2751, + "step": 11856 + }, + { + "epoch": 0.16083830710797611, + "grad_norm": 8.02000904083252, + "learning_rate": 8.47800465944909e-06, + "loss": 0.5433, + "step": 11857 + }, + { + "epoch": 0.160851871947911, + "grad_norm": 7.545498847961426, + "learning_rate": 8.477867616828834e-06, + "loss": 0.3376, + "step": 11858 + }, + { + "epoch": 0.1608654367878459, + "grad_norm": 7.274450302124023, + "learning_rate": 8.477730574208579e-06, + "loss": 0.3842, + "step": 11859 + }, + { + "epoch": 0.1608790016277808, + "grad_norm": 6.4418182373046875, + "learning_rate": 8.477593531588326e-06, + "loss": 0.299, + "step": 11860 + }, + { + "epoch": 0.1608925664677157, + "grad_norm": 6.20197868347168, + "learning_rate": 8.47745648896807e-06, + "loss": 0.3158, + "step": 11861 + }, + { + "epoch": 0.16090613130765058, + "grad_norm": 6.160574436187744, + "learning_rate": 8.477319446347814e-06, + "loss": 0.3586, + "step": 11862 + }, + { + "epoch": 0.16091969614758544, + "grad_norm": 4.750783920288086, + "learning_rate": 8.47718240372756e-06, + "loss": 0.2438, + "step": 11863 + }, + { + "epoch": 0.16093326098752034, + "grad_norm": 5.546408176422119, + "learning_rate": 8.477045361107306e-06, + "loss": 0.263, + "step": 11864 + }, + { + "epoch": 0.16094682582745523, + "grad_norm": 6.965887546539307, + "learning_rate": 8.47690831848705e-06, + "loss": 0.3127, + "step": 11865 + }, + { + "epoch": 0.16096039066739012, + "grad_norm": 6.253419876098633, + "learning_rate": 8.476771275866795e-06, + "loss": 0.4495, + "step": 11866 + }, + { + "epoch": 0.16097395550732502, + "grad_norm": 6.335179328918457, + "learning_rate": 8.47663423324654e-06, + "loss": 0.4903, + "step": 11867 + }, + { + "epoch": 0.1609875203472599, + "grad_norm": 7.537144184112549, + "learning_rate": 8.476497190626285e-06, + "loss": 0.5225, + "step": 11868 + }, + { + "epoch": 0.1610010851871948, + "grad_norm": 10.147746086120605, + "learning_rate": 8.47636014800603e-06, + "loss": 0.6289, + "step": 11869 + }, + { + "epoch": 0.16101465002712967, + "grad_norm": 7.038695812225342, + "learning_rate": 8.476223105385776e-06, + "loss": 0.3872, + "step": 11870 + }, + { + "epoch": 0.16102821486706456, + "grad_norm": 7.6736297607421875, + "learning_rate": 8.476086062765521e-06, + "loss": 0.3172, + "step": 11871 + }, + { + "epoch": 0.16104177970699945, + "grad_norm": 4.905659198760986, + "learning_rate": 8.475949020145266e-06, + "loss": 0.2777, + "step": 11872 + }, + { + "epoch": 0.16105534454693435, + "grad_norm": 7.411563396453857, + "learning_rate": 8.475811977525011e-06, + "loss": 0.3436, + "step": 11873 + }, + { + "epoch": 0.16106890938686924, + "grad_norm": 6.511383056640625, + "learning_rate": 8.475674934904756e-06, + "loss": 0.3996, + "step": 11874 + }, + { + "epoch": 0.16108247422680413, + "grad_norm": 6.052260398864746, + "learning_rate": 8.475537892284502e-06, + "loss": 0.3935, + "step": 11875 + }, + { + "epoch": 0.161096039066739, + "grad_norm": 5.584875583648682, + "learning_rate": 8.475400849664245e-06, + "loss": 0.3346, + "step": 11876 + }, + { + "epoch": 0.1611096039066739, + "grad_norm": 5.022412300109863, + "learning_rate": 8.475263807043992e-06, + "loss": 0.4573, + "step": 11877 + }, + { + "epoch": 0.16112316874660879, + "grad_norm": 7.338253498077393, + "learning_rate": 8.475126764423737e-06, + "loss": 0.4203, + "step": 11878 + }, + { + "epoch": 0.16113673358654368, + "grad_norm": 6.722139835357666, + "learning_rate": 8.474989721803482e-06, + "loss": 0.3561, + "step": 11879 + }, + { + "epoch": 0.16115029842647857, + "grad_norm": 7.679961681365967, + "learning_rate": 8.474852679183226e-06, + "loss": 0.3317, + "step": 11880 + }, + { + "epoch": 0.16116386326641347, + "grad_norm": 6.240593910217285, + "learning_rate": 8.474715636562971e-06, + "loss": 0.3365, + "step": 11881 + }, + { + "epoch": 0.16117742810634836, + "grad_norm": 6.936465263366699, + "learning_rate": 8.474578593942718e-06, + "loss": 0.3165, + "step": 11882 + }, + { + "epoch": 0.16119099294628322, + "grad_norm": 7.17486572265625, + "learning_rate": 8.474441551322461e-06, + "loss": 0.4292, + "step": 11883 + }, + { + "epoch": 0.16120455778621812, + "grad_norm": 7.060809135437012, + "learning_rate": 8.474304508702207e-06, + "loss": 0.3578, + "step": 11884 + }, + { + "epoch": 0.161218122626153, + "grad_norm": 5.473150730133057, + "learning_rate": 8.474167466081952e-06, + "loss": 0.2919, + "step": 11885 + }, + { + "epoch": 0.1612316874660879, + "grad_norm": 6.736245155334473, + "learning_rate": 8.474030423461697e-06, + "loss": 0.3088, + "step": 11886 + }, + { + "epoch": 0.1612452523060228, + "grad_norm": 5.3508219718933105, + "learning_rate": 8.473893380841442e-06, + "loss": 0.3197, + "step": 11887 + }, + { + "epoch": 0.1612588171459577, + "grad_norm": 6.938347816467285, + "learning_rate": 8.473756338221187e-06, + "loss": 0.3417, + "step": 11888 + }, + { + "epoch": 0.16127238198589255, + "grad_norm": 6.722756862640381, + "learning_rate": 8.473619295600932e-06, + "loss": 0.3728, + "step": 11889 + }, + { + "epoch": 0.16128594682582745, + "grad_norm": 12.07067584991455, + "learning_rate": 8.473482252980678e-06, + "loss": 0.4489, + "step": 11890 + }, + { + "epoch": 0.16129951166576234, + "grad_norm": 7.211819648742676, + "learning_rate": 8.473345210360423e-06, + "loss": 0.5197, + "step": 11891 + }, + { + "epoch": 0.16131307650569723, + "grad_norm": 10.049079895019531, + "learning_rate": 8.473208167740168e-06, + "loss": 0.4106, + "step": 11892 + }, + { + "epoch": 0.16132664134563213, + "grad_norm": 5.570562839508057, + "learning_rate": 8.473071125119913e-06, + "loss": 0.2922, + "step": 11893 + }, + { + "epoch": 0.16134020618556702, + "grad_norm": 6.732109546661377, + "learning_rate": 8.472934082499658e-06, + "loss": 0.3302, + "step": 11894 + }, + { + "epoch": 0.16135377102550189, + "grad_norm": 6.954076766967773, + "learning_rate": 8.472797039879404e-06, + "loss": 0.397, + "step": 11895 + }, + { + "epoch": 0.16136733586543678, + "grad_norm": 5.760664463043213, + "learning_rate": 8.472659997259149e-06, + "loss": 0.3355, + "step": 11896 + }, + { + "epoch": 0.16138090070537167, + "grad_norm": 5.588717460632324, + "learning_rate": 8.472522954638894e-06, + "loss": 0.2807, + "step": 11897 + }, + { + "epoch": 0.16139446554530656, + "grad_norm": 6.941647529602051, + "learning_rate": 8.472385912018637e-06, + "loss": 0.266, + "step": 11898 + }, + { + "epoch": 0.16140803038524146, + "grad_norm": 6.915946960449219, + "learning_rate": 8.472248869398384e-06, + "loss": 0.4091, + "step": 11899 + }, + { + "epoch": 0.16142159522517635, + "grad_norm": 6.426164627075195, + "learning_rate": 8.47211182677813e-06, + "loss": 0.4944, + "step": 11900 + }, + { + "epoch": 0.16143516006511124, + "grad_norm": 6.420992851257324, + "learning_rate": 8.471974784157873e-06, + "loss": 0.4134, + "step": 11901 + }, + { + "epoch": 0.1614487249050461, + "grad_norm": 10.250028610229492, + "learning_rate": 8.471837741537618e-06, + "loss": 0.4421, + "step": 11902 + }, + { + "epoch": 0.161462289744981, + "grad_norm": 6.011147499084473, + "learning_rate": 8.471700698917365e-06, + "loss": 0.4005, + "step": 11903 + }, + { + "epoch": 0.1614758545849159, + "grad_norm": 5.682679653167725, + "learning_rate": 8.47156365629711e-06, + "loss": 0.1804, + "step": 11904 + }, + { + "epoch": 0.1614894194248508, + "grad_norm": 6.855442047119141, + "learning_rate": 8.471426613676854e-06, + "loss": 0.4105, + "step": 11905 + }, + { + "epoch": 0.16150298426478568, + "grad_norm": 8.124957084655762, + "learning_rate": 8.471289571056599e-06, + "loss": 0.5141, + "step": 11906 + }, + { + "epoch": 0.16151654910472057, + "grad_norm": 9.061716079711914, + "learning_rate": 8.471152528436346e-06, + "loss": 0.5336, + "step": 11907 + }, + { + "epoch": 0.16153011394465544, + "grad_norm": 8.980498313903809, + "learning_rate": 8.47101548581609e-06, + "loss": 0.5887, + "step": 11908 + }, + { + "epoch": 0.16154367878459033, + "grad_norm": 4.913467884063721, + "learning_rate": 8.470878443195834e-06, + "loss": 0.3075, + "step": 11909 + }, + { + "epoch": 0.16155724362452523, + "grad_norm": 5.417251110076904, + "learning_rate": 8.47074140057558e-06, + "loss": 0.3106, + "step": 11910 + }, + { + "epoch": 0.16157080846446012, + "grad_norm": 4.804213523864746, + "learning_rate": 8.470604357955325e-06, + "loss": 0.2474, + "step": 11911 + }, + { + "epoch": 0.161584373304395, + "grad_norm": 5.693830490112305, + "learning_rate": 8.47046731533507e-06, + "loss": 0.2963, + "step": 11912 + }, + { + "epoch": 0.1615979381443299, + "grad_norm": 6.863053321838379, + "learning_rate": 8.470330272714815e-06, + "loss": 0.2978, + "step": 11913 + }, + { + "epoch": 0.1616115029842648, + "grad_norm": 8.847319602966309, + "learning_rate": 8.47019323009456e-06, + "loss": 0.4734, + "step": 11914 + }, + { + "epoch": 0.16162506782419966, + "grad_norm": 5.239073276519775, + "learning_rate": 8.470056187474305e-06, + "loss": 0.3273, + "step": 11915 + }, + { + "epoch": 0.16163863266413456, + "grad_norm": 5.609255790710449, + "learning_rate": 8.46991914485405e-06, + "loss": 0.3719, + "step": 11916 + }, + { + "epoch": 0.16165219750406945, + "grad_norm": 7.199468612670898, + "learning_rate": 8.469782102233796e-06, + "loss": 0.3951, + "step": 11917 + }, + { + "epoch": 0.16166576234400434, + "grad_norm": 7.113938331604004, + "learning_rate": 8.469645059613541e-06, + "loss": 0.4809, + "step": 11918 + }, + { + "epoch": 0.16167932718393924, + "grad_norm": 7.010352611541748, + "learning_rate": 8.469508016993284e-06, + "loss": 0.3968, + "step": 11919 + }, + { + "epoch": 0.16169289202387413, + "grad_norm": 4.304466247558594, + "learning_rate": 8.469370974373031e-06, + "loss": 0.1472, + "step": 11920 + }, + { + "epoch": 0.161706456863809, + "grad_norm": 6.330420017242432, + "learning_rate": 8.469233931752776e-06, + "loss": 0.3114, + "step": 11921 + }, + { + "epoch": 0.1617200217037439, + "grad_norm": 5.989176273345947, + "learning_rate": 8.469096889132522e-06, + "loss": 0.3294, + "step": 11922 + }, + { + "epoch": 0.16173358654367878, + "grad_norm": 6.425993919372559, + "learning_rate": 8.468959846512265e-06, + "loss": 0.4072, + "step": 11923 + }, + { + "epoch": 0.16174715138361367, + "grad_norm": 6.18353271484375, + "learning_rate": 8.46882280389201e-06, + "loss": 0.44, + "step": 11924 + }, + { + "epoch": 0.16176071622354857, + "grad_norm": 9.567754745483398, + "learning_rate": 8.468685761271757e-06, + "loss": 0.6155, + "step": 11925 + }, + { + "epoch": 0.16177428106348346, + "grad_norm": 7.135576248168945, + "learning_rate": 8.4685487186515e-06, + "loss": 0.4594, + "step": 11926 + }, + { + "epoch": 0.16178784590341833, + "grad_norm": 8.614940643310547, + "learning_rate": 8.468411676031246e-06, + "loss": 0.5941, + "step": 11927 + }, + { + "epoch": 0.16180141074335322, + "grad_norm": 6.51405143737793, + "learning_rate": 8.468274633410991e-06, + "loss": 0.3899, + "step": 11928 + }, + { + "epoch": 0.1618149755832881, + "grad_norm": 4.72587776184082, + "learning_rate": 8.468137590790736e-06, + "loss": 0.3926, + "step": 11929 + }, + { + "epoch": 0.161828540423223, + "grad_norm": 7.925445079803467, + "learning_rate": 8.468000548170481e-06, + "loss": 0.3921, + "step": 11930 + }, + { + "epoch": 0.1618421052631579, + "grad_norm": 6.708112716674805, + "learning_rate": 8.467863505550227e-06, + "loss": 0.4145, + "step": 11931 + }, + { + "epoch": 0.1618556701030928, + "grad_norm": 6.1072998046875, + "learning_rate": 8.467726462929972e-06, + "loss": 0.3396, + "step": 11932 + }, + { + "epoch": 0.16186923494302768, + "grad_norm": 7.758669376373291, + "learning_rate": 8.467589420309717e-06, + "loss": 0.3624, + "step": 11933 + }, + { + "epoch": 0.16188279978296255, + "grad_norm": 7.10306978225708, + "learning_rate": 8.467452377689462e-06, + "loss": 0.3958, + "step": 11934 + }, + { + "epoch": 0.16189636462289744, + "grad_norm": 5.42363166809082, + "learning_rate": 8.467315335069207e-06, + "loss": 0.3029, + "step": 11935 + }, + { + "epoch": 0.16190992946283234, + "grad_norm": 7.068767070770264, + "learning_rate": 8.467178292448952e-06, + "loss": 0.529, + "step": 11936 + }, + { + "epoch": 0.16192349430276723, + "grad_norm": 4.7933349609375, + "learning_rate": 8.467041249828698e-06, + "loss": 0.1983, + "step": 11937 + }, + { + "epoch": 0.16193705914270212, + "grad_norm": 8.182369232177734, + "learning_rate": 8.466904207208443e-06, + "loss": 0.4201, + "step": 11938 + }, + { + "epoch": 0.16195062398263702, + "grad_norm": 6.681442737579346, + "learning_rate": 8.466767164588188e-06, + "loss": 0.4403, + "step": 11939 + }, + { + "epoch": 0.16196418882257188, + "grad_norm": 7.757302761077881, + "learning_rate": 8.466630121967933e-06, + "loss": 0.4444, + "step": 11940 + }, + { + "epoch": 0.16197775366250677, + "grad_norm": 6.752236843109131, + "learning_rate": 8.466493079347677e-06, + "loss": 0.3858, + "step": 11941 + }, + { + "epoch": 0.16199131850244167, + "grad_norm": 5.877900123596191, + "learning_rate": 8.466356036727424e-06, + "loss": 0.2694, + "step": 11942 + }, + { + "epoch": 0.16200488334237656, + "grad_norm": 5.599492073059082, + "learning_rate": 8.466218994107169e-06, + "loss": 0.3288, + "step": 11943 + }, + { + "epoch": 0.16201844818231145, + "grad_norm": 6.2198567390441895, + "learning_rate": 8.466081951486912e-06, + "loss": 0.5122, + "step": 11944 + }, + { + "epoch": 0.16203201302224635, + "grad_norm": 7.996793270111084, + "learning_rate": 8.465944908866657e-06, + "loss": 0.3385, + "step": 11945 + }, + { + "epoch": 0.16204557786218124, + "grad_norm": 5.892063617706299, + "learning_rate": 8.465807866246404e-06, + "loss": 0.3868, + "step": 11946 + }, + { + "epoch": 0.1620591427021161, + "grad_norm": 6.565516948699951, + "learning_rate": 8.46567082362615e-06, + "loss": 0.345, + "step": 11947 + }, + { + "epoch": 0.162072707542051, + "grad_norm": 8.036127090454102, + "learning_rate": 8.465533781005893e-06, + "loss": 0.5299, + "step": 11948 + }, + { + "epoch": 0.1620862723819859, + "grad_norm": 7.016589164733887, + "learning_rate": 8.465396738385638e-06, + "loss": 0.3875, + "step": 11949 + }, + { + "epoch": 0.16209983722192078, + "grad_norm": 5.233409881591797, + "learning_rate": 8.465259695765383e-06, + "loss": 0.2426, + "step": 11950 + }, + { + "epoch": 0.16211340206185568, + "grad_norm": 6.855238437652588, + "learning_rate": 8.465122653145128e-06, + "loss": 0.5833, + "step": 11951 + }, + { + "epoch": 0.16212696690179057, + "grad_norm": 8.673754692077637, + "learning_rate": 8.464985610524874e-06, + "loss": 0.3785, + "step": 11952 + }, + { + "epoch": 0.16214053174172544, + "grad_norm": 6.235947132110596, + "learning_rate": 8.464848567904619e-06, + "loss": 0.3102, + "step": 11953 + }, + { + "epoch": 0.16215409658166033, + "grad_norm": 8.707834243774414, + "learning_rate": 8.464711525284364e-06, + "loss": 0.4229, + "step": 11954 + }, + { + "epoch": 0.16216766142159522, + "grad_norm": 7.227151393890381, + "learning_rate": 8.46457448266411e-06, + "loss": 0.3247, + "step": 11955 + }, + { + "epoch": 0.16218122626153011, + "grad_norm": 6.381353855133057, + "learning_rate": 8.464437440043854e-06, + "loss": 0.3273, + "step": 11956 + }, + { + "epoch": 0.162194791101465, + "grad_norm": 5.146489143371582, + "learning_rate": 8.4643003974236e-06, + "loss": 0.3819, + "step": 11957 + }, + { + "epoch": 0.1622083559413999, + "grad_norm": 5.350367546081543, + "learning_rate": 8.464163354803345e-06, + "loss": 0.2641, + "step": 11958 + }, + { + "epoch": 0.1622219207813348, + "grad_norm": 6.753889083862305, + "learning_rate": 8.46402631218309e-06, + "loss": 0.2913, + "step": 11959 + }, + { + "epoch": 0.16223548562126966, + "grad_norm": 7.413903713226318, + "learning_rate": 8.463889269562835e-06, + "loss": 0.4798, + "step": 11960 + }, + { + "epoch": 0.16224905046120455, + "grad_norm": 6.961067199707031, + "learning_rate": 8.46375222694258e-06, + "loss": 0.6079, + "step": 11961 + }, + { + "epoch": 0.16226261530113945, + "grad_norm": 10.119681358337402, + "learning_rate": 8.463615184322325e-06, + "loss": 0.6006, + "step": 11962 + }, + { + "epoch": 0.16227618014107434, + "grad_norm": 8.131078720092773, + "learning_rate": 8.463478141702069e-06, + "loss": 0.4558, + "step": 11963 + }, + { + "epoch": 0.16228974498100923, + "grad_norm": 7.4054856300354, + "learning_rate": 8.463341099081816e-06, + "loss": 0.4682, + "step": 11964 + }, + { + "epoch": 0.16230330982094412, + "grad_norm": 6.6507248878479, + "learning_rate": 8.463204056461561e-06, + "loss": 0.3055, + "step": 11965 + }, + { + "epoch": 0.162316874660879, + "grad_norm": 6.459211826324463, + "learning_rate": 8.463067013841304e-06, + "loss": 0.4599, + "step": 11966 + }, + { + "epoch": 0.16233043950081388, + "grad_norm": 6.942789077758789, + "learning_rate": 8.46292997122105e-06, + "loss": 0.5338, + "step": 11967 + }, + { + "epoch": 0.16234400434074878, + "grad_norm": 7.232558727264404, + "learning_rate": 8.462792928600797e-06, + "loss": 0.4001, + "step": 11968 + }, + { + "epoch": 0.16235756918068367, + "grad_norm": 7.557456970214844, + "learning_rate": 8.46265588598054e-06, + "loss": 0.598, + "step": 11969 + }, + { + "epoch": 0.16237113402061856, + "grad_norm": 8.899693489074707, + "learning_rate": 8.462518843360285e-06, + "loss": 0.5829, + "step": 11970 + }, + { + "epoch": 0.16238469886055346, + "grad_norm": 7.482513427734375, + "learning_rate": 8.46238180074003e-06, + "loss": 0.4923, + "step": 11971 + }, + { + "epoch": 0.16239826370048832, + "grad_norm": 6.101150989532471, + "learning_rate": 8.462244758119777e-06, + "loss": 0.4232, + "step": 11972 + }, + { + "epoch": 0.16241182854042321, + "grad_norm": 5.615514755249023, + "learning_rate": 8.46210771549952e-06, + "loss": 0.3846, + "step": 11973 + }, + { + "epoch": 0.1624253933803581, + "grad_norm": 7.4847822189331055, + "learning_rate": 8.461970672879266e-06, + "loss": 0.6095, + "step": 11974 + }, + { + "epoch": 0.162438958220293, + "grad_norm": 5.882638931274414, + "learning_rate": 8.461833630259011e-06, + "loss": 0.4216, + "step": 11975 + }, + { + "epoch": 0.1624525230602279, + "grad_norm": 6.662243843078613, + "learning_rate": 8.461696587638756e-06, + "loss": 0.411, + "step": 11976 + }, + { + "epoch": 0.1624660879001628, + "grad_norm": 6.117115020751953, + "learning_rate": 8.461559545018501e-06, + "loss": 0.3337, + "step": 11977 + }, + { + "epoch": 0.16247965274009768, + "grad_norm": 6.700484752655029, + "learning_rate": 8.461422502398247e-06, + "loss": 0.4446, + "step": 11978 + }, + { + "epoch": 0.16249321758003255, + "grad_norm": 7.072586536407471, + "learning_rate": 8.461285459777992e-06, + "loss": 0.5216, + "step": 11979 + }, + { + "epoch": 0.16250678241996744, + "grad_norm": 7.98089599609375, + "learning_rate": 8.461148417157737e-06, + "loss": 0.6588, + "step": 11980 + }, + { + "epoch": 0.16252034725990233, + "grad_norm": 7.749753475189209, + "learning_rate": 8.461011374537482e-06, + "loss": 0.5811, + "step": 11981 + }, + { + "epoch": 0.16253391209983722, + "grad_norm": 6.805503845214844, + "learning_rate": 8.460874331917227e-06, + "loss": 0.4139, + "step": 11982 + }, + { + "epoch": 0.16254747693977212, + "grad_norm": 7.678616046905518, + "learning_rate": 8.460737289296973e-06, + "loss": 0.4893, + "step": 11983 + }, + { + "epoch": 0.162561041779707, + "grad_norm": 7.1466803550720215, + "learning_rate": 8.460600246676716e-06, + "loss": 0.4928, + "step": 11984 + }, + { + "epoch": 0.16257460661964188, + "grad_norm": 6.7645697593688965, + "learning_rate": 8.460463204056463e-06, + "loss": 0.3391, + "step": 11985 + }, + { + "epoch": 0.16258817145957677, + "grad_norm": 7.10914421081543, + "learning_rate": 8.460326161436208e-06, + "loss": 0.5587, + "step": 11986 + }, + { + "epoch": 0.16260173629951166, + "grad_norm": 7.19525146484375, + "learning_rate": 8.460189118815953e-06, + "loss": 0.5332, + "step": 11987 + }, + { + "epoch": 0.16261530113944656, + "grad_norm": 5.034419059753418, + "learning_rate": 8.460052076195697e-06, + "loss": 0.3447, + "step": 11988 + }, + { + "epoch": 0.16262886597938145, + "grad_norm": 5.4990668296813965, + "learning_rate": 8.459915033575444e-06, + "loss": 0.3771, + "step": 11989 + }, + { + "epoch": 0.16264243081931634, + "grad_norm": 7.300875186920166, + "learning_rate": 8.459777990955189e-06, + "loss": 0.4705, + "step": 11990 + }, + { + "epoch": 0.16265599565925123, + "grad_norm": 6.175726413726807, + "learning_rate": 8.459640948334932e-06, + "loss": 0.5258, + "step": 11991 + }, + { + "epoch": 0.1626695604991861, + "grad_norm": 5.751072406768799, + "learning_rate": 8.459503905714677e-06, + "loss": 0.4195, + "step": 11992 + }, + { + "epoch": 0.162683125339121, + "grad_norm": 5.792398452758789, + "learning_rate": 8.459366863094423e-06, + "loss": 0.3629, + "step": 11993 + }, + { + "epoch": 0.1626966901790559, + "grad_norm": 5.428664207458496, + "learning_rate": 8.459229820474168e-06, + "loss": 0.3035, + "step": 11994 + }, + { + "epoch": 0.16271025501899078, + "grad_norm": 5.592040538787842, + "learning_rate": 8.459092777853913e-06, + "loss": 0.3661, + "step": 11995 + }, + { + "epoch": 0.16272381985892567, + "grad_norm": 5.7679057121276855, + "learning_rate": 8.458955735233658e-06, + "loss": 0.3508, + "step": 11996 + }, + { + "epoch": 0.16273738469886057, + "grad_norm": 5.7308349609375, + "learning_rate": 8.458818692613403e-06, + "loss": 0.4363, + "step": 11997 + }, + { + "epoch": 0.16275094953879543, + "grad_norm": 5.897585391998291, + "learning_rate": 8.458681649993148e-06, + "loss": 0.4465, + "step": 11998 + }, + { + "epoch": 0.16276451437873032, + "grad_norm": 10.37716293334961, + "learning_rate": 8.458544607372894e-06, + "loss": 0.381, + "step": 11999 + }, + { + "epoch": 0.16277807921866522, + "grad_norm": 8.153205871582031, + "learning_rate": 8.458407564752639e-06, + "loss": 0.4572, + "step": 12000 + }, + { + "epoch": 0.1627916440586001, + "grad_norm": 6.551041603088379, + "learning_rate": 8.458270522132384e-06, + "loss": 0.4542, + "step": 12001 + }, + { + "epoch": 0.162805208898535, + "grad_norm": 5.667660713195801, + "learning_rate": 8.45813347951213e-06, + "loss": 0.2898, + "step": 12002 + }, + { + "epoch": 0.1628187737384699, + "grad_norm": 7.2807087898254395, + "learning_rate": 8.457996436891874e-06, + "loss": 0.5269, + "step": 12003 + }, + { + "epoch": 0.16283233857840476, + "grad_norm": 6.144783020019531, + "learning_rate": 8.45785939427162e-06, + "loss": 0.3163, + "step": 12004 + }, + { + "epoch": 0.16284590341833965, + "grad_norm": 5.55825138092041, + "learning_rate": 8.457722351651365e-06, + "loss": 0.3588, + "step": 12005 + }, + { + "epoch": 0.16285946825827455, + "grad_norm": 8.00351619720459, + "learning_rate": 8.457585309031108e-06, + "loss": 0.3363, + "step": 12006 + }, + { + "epoch": 0.16287303309820944, + "grad_norm": 5.4123921394348145, + "learning_rate": 8.457448266410855e-06, + "loss": 0.2681, + "step": 12007 + }, + { + "epoch": 0.16288659793814433, + "grad_norm": 7.359386444091797, + "learning_rate": 8.4573112237906e-06, + "loss": 0.4403, + "step": 12008 + }, + { + "epoch": 0.16290016277807923, + "grad_norm": 7.803079605102539, + "learning_rate": 8.457174181170344e-06, + "loss": 0.5482, + "step": 12009 + }, + { + "epoch": 0.16291372761801412, + "grad_norm": 4.823597431182861, + "learning_rate": 8.457037138550089e-06, + "loss": 0.3823, + "step": 12010 + }, + { + "epoch": 0.16292729245794899, + "grad_norm": 6.366393566131592, + "learning_rate": 8.456900095929836e-06, + "loss": 0.2922, + "step": 12011 + }, + { + "epoch": 0.16294085729788388, + "grad_norm": 6.839661598205566, + "learning_rate": 8.45676305330958e-06, + "loss": 0.5531, + "step": 12012 + }, + { + "epoch": 0.16295442213781877, + "grad_norm": 7.161106109619141, + "learning_rate": 8.456626010689324e-06, + "loss": 0.4324, + "step": 12013 + }, + { + "epoch": 0.16296798697775366, + "grad_norm": 6.515002250671387, + "learning_rate": 8.45648896806907e-06, + "loss": 0.4139, + "step": 12014 + }, + { + "epoch": 0.16298155181768856, + "grad_norm": 7.127738952636719, + "learning_rate": 8.456351925448817e-06, + "loss": 0.5134, + "step": 12015 + }, + { + "epoch": 0.16299511665762345, + "grad_norm": 8.7048978805542, + "learning_rate": 8.45621488282856e-06, + "loss": 0.5768, + "step": 12016 + }, + { + "epoch": 0.16300868149755832, + "grad_norm": 8.380583763122559, + "learning_rate": 8.456077840208305e-06, + "loss": 0.4971, + "step": 12017 + }, + { + "epoch": 0.1630222463374932, + "grad_norm": 10.179224014282227, + "learning_rate": 8.45594079758805e-06, + "loss": 0.4672, + "step": 12018 + }, + { + "epoch": 0.1630358111774281, + "grad_norm": 6.951362133026123, + "learning_rate": 8.455803754967796e-06, + "loss": 0.4516, + "step": 12019 + }, + { + "epoch": 0.163049376017363, + "grad_norm": 5.644623756408691, + "learning_rate": 8.45566671234754e-06, + "loss": 0.4, + "step": 12020 + }, + { + "epoch": 0.1630629408572979, + "grad_norm": 9.420027732849121, + "learning_rate": 8.455529669727286e-06, + "loss": 0.7123, + "step": 12021 + }, + { + "epoch": 0.16307650569723278, + "grad_norm": 6.474523067474365, + "learning_rate": 8.455392627107031e-06, + "loss": 0.4909, + "step": 12022 + }, + { + "epoch": 0.16309007053716768, + "grad_norm": 7.071181774139404, + "learning_rate": 8.455255584486776e-06, + "loss": 0.3507, + "step": 12023 + }, + { + "epoch": 0.16310363537710254, + "grad_norm": 6.900833606719971, + "learning_rate": 8.455118541866521e-06, + "loss": 0.4168, + "step": 12024 + }, + { + "epoch": 0.16311720021703743, + "grad_norm": 6.753039360046387, + "learning_rate": 8.454981499246267e-06, + "loss": 0.4656, + "step": 12025 + }, + { + "epoch": 0.16313076505697233, + "grad_norm": 8.483065605163574, + "learning_rate": 8.454844456626012e-06, + "loss": 0.5503, + "step": 12026 + }, + { + "epoch": 0.16314432989690722, + "grad_norm": 7.40613317489624, + "learning_rate": 8.454707414005755e-06, + "loss": 0.5078, + "step": 12027 + }, + { + "epoch": 0.1631578947368421, + "grad_norm": 7.4937567710876465, + "learning_rate": 8.454570371385502e-06, + "loss": 0.4478, + "step": 12028 + }, + { + "epoch": 0.163171459576777, + "grad_norm": 7.33786678314209, + "learning_rate": 8.454433328765247e-06, + "loss": 0.3329, + "step": 12029 + }, + { + "epoch": 0.16318502441671187, + "grad_norm": 6.124750137329102, + "learning_rate": 8.454296286144993e-06, + "loss": 0.2432, + "step": 12030 + }, + { + "epoch": 0.16319858925664676, + "grad_norm": 7.0832319259643555, + "learning_rate": 8.454159243524736e-06, + "loss": 0.3791, + "step": 12031 + }, + { + "epoch": 0.16321215409658166, + "grad_norm": 6.827761650085449, + "learning_rate": 8.454022200904481e-06, + "loss": 0.4431, + "step": 12032 + }, + { + "epoch": 0.16322571893651655, + "grad_norm": 5.711122512817383, + "learning_rate": 8.453885158284228e-06, + "loss": 0.3378, + "step": 12033 + }, + { + "epoch": 0.16323928377645144, + "grad_norm": 6.700371742248535, + "learning_rate": 8.453748115663972e-06, + "loss": 0.3653, + "step": 12034 + }, + { + "epoch": 0.16325284861638634, + "grad_norm": 6.6605305671691895, + "learning_rate": 8.453611073043717e-06, + "loss": 0.4692, + "step": 12035 + }, + { + "epoch": 0.1632664134563212, + "grad_norm": 9.517463684082031, + "learning_rate": 8.453474030423462e-06, + "loss": 0.4423, + "step": 12036 + }, + { + "epoch": 0.1632799782962561, + "grad_norm": 7.751898288726807, + "learning_rate": 8.453336987803207e-06, + "loss": 0.3765, + "step": 12037 + }, + { + "epoch": 0.163293543136191, + "grad_norm": 7.549159526824951, + "learning_rate": 8.453199945182952e-06, + "loss": 0.3678, + "step": 12038 + }, + { + "epoch": 0.16330710797612588, + "grad_norm": 7.276884078979492, + "learning_rate": 8.453062902562697e-06, + "loss": 0.4122, + "step": 12039 + }, + { + "epoch": 0.16332067281606077, + "grad_norm": 6.447457790374756, + "learning_rate": 8.452925859942443e-06, + "loss": 0.3021, + "step": 12040 + }, + { + "epoch": 0.16333423765599567, + "grad_norm": 6.604013442993164, + "learning_rate": 8.452788817322188e-06, + "loss": 0.4134, + "step": 12041 + }, + { + "epoch": 0.16334780249593056, + "grad_norm": 7.430238246917725, + "learning_rate": 8.452651774701933e-06, + "loss": 0.4415, + "step": 12042 + }, + { + "epoch": 0.16336136733586543, + "grad_norm": 6.190214157104492, + "learning_rate": 8.452514732081678e-06, + "loss": 0.4153, + "step": 12043 + }, + { + "epoch": 0.16337493217580032, + "grad_norm": 6.477526664733887, + "learning_rate": 8.452377689461423e-06, + "loss": 0.3252, + "step": 12044 + }, + { + "epoch": 0.1633884970157352, + "grad_norm": 6.303393363952637, + "learning_rate": 8.452240646841169e-06, + "loss": 0.378, + "step": 12045 + }, + { + "epoch": 0.1634020618556701, + "grad_norm": 5.962584495544434, + "learning_rate": 8.452103604220914e-06, + "loss": 0.3251, + "step": 12046 + }, + { + "epoch": 0.163415626695605, + "grad_norm": 4.989958763122559, + "learning_rate": 8.451966561600659e-06, + "loss": 0.214, + "step": 12047 + }, + { + "epoch": 0.1634291915355399, + "grad_norm": 7.502835273742676, + "learning_rate": 8.451829518980404e-06, + "loss": 0.4247, + "step": 12048 + }, + { + "epoch": 0.16344275637547476, + "grad_norm": 6.4689178466796875, + "learning_rate": 8.451692476360148e-06, + "loss": 0.3997, + "step": 12049 + }, + { + "epoch": 0.16345632121540965, + "grad_norm": 5.490739822387695, + "learning_rate": 8.451555433739894e-06, + "loss": 0.3396, + "step": 12050 + }, + { + "epoch": 0.16346988605534454, + "grad_norm": 7.789859771728516, + "learning_rate": 8.45141839111964e-06, + "loss": 0.3495, + "step": 12051 + }, + { + "epoch": 0.16348345089527944, + "grad_norm": 7.856145858764648, + "learning_rate": 8.451281348499383e-06, + "loss": 0.3788, + "step": 12052 + }, + { + "epoch": 0.16349701573521433, + "grad_norm": 6.788635730743408, + "learning_rate": 8.451144305879128e-06, + "loss": 0.3737, + "step": 12053 + }, + { + "epoch": 0.16351058057514922, + "grad_norm": 6.765707969665527, + "learning_rate": 8.451007263258875e-06, + "loss": 0.3166, + "step": 12054 + }, + { + "epoch": 0.16352414541508412, + "grad_norm": 6.263398170471191, + "learning_rate": 8.45087022063862e-06, + "loss": 0.2959, + "step": 12055 + }, + { + "epoch": 0.16353771025501898, + "grad_norm": 6.370086669921875, + "learning_rate": 8.450733178018364e-06, + "loss": 0.3408, + "step": 12056 + }, + { + "epoch": 0.16355127509495387, + "grad_norm": 6.506677150726318, + "learning_rate": 8.450596135398109e-06, + "loss": 0.3075, + "step": 12057 + }, + { + "epoch": 0.16356483993488877, + "grad_norm": 5.899011611938477, + "learning_rate": 8.450459092777856e-06, + "loss": 0.3345, + "step": 12058 + }, + { + "epoch": 0.16357840477482366, + "grad_norm": 5.108409881591797, + "learning_rate": 8.4503220501576e-06, + "loss": 0.3016, + "step": 12059 + }, + { + "epoch": 0.16359196961475855, + "grad_norm": 7.8312859535217285, + "learning_rate": 8.450185007537345e-06, + "loss": 0.3706, + "step": 12060 + }, + { + "epoch": 0.16360553445469345, + "grad_norm": 8.26125717163086, + "learning_rate": 8.45004796491709e-06, + "loss": 0.3834, + "step": 12061 + }, + { + "epoch": 0.1636190992946283, + "grad_norm": 10.287626266479492, + "learning_rate": 8.449910922296835e-06, + "loss": 0.5714, + "step": 12062 + }, + { + "epoch": 0.1636326641345632, + "grad_norm": 6.411006450653076, + "learning_rate": 8.44977387967658e-06, + "loss": 0.345, + "step": 12063 + }, + { + "epoch": 0.1636462289744981, + "grad_norm": 5.0213303565979, + "learning_rate": 8.449636837056325e-06, + "loss": 0.3266, + "step": 12064 + }, + { + "epoch": 0.163659793814433, + "grad_norm": 7.982232570648193, + "learning_rate": 8.44949979443607e-06, + "loss": 0.3579, + "step": 12065 + }, + { + "epoch": 0.16367335865436788, + "grad_norm": 5.402946949005127, + "learning_rate": 8.449362751815816e-06, + "loss": 0.3054, + "step": 12066 + }, + { + "epoch": 0.16368692349430278, + "grad_norm": 6.699251651763916, + "learning_rate": 8.44922570919556e-06, + "loss": 0.3242, + "step": 12067 + }, + { + "epoch": 0.16370048833423764, + "grad_norm": 6.258535385131836, + "learning_rate": 8.449088666575306e-06, + "loss": 0.2885, + "step": 12068 + }, + { + "epoch": 0.16371405317417254, + "grad_norm": 6.6264543533325195, + "learning_rate": 8.448951623955051e-06, + "loss": 0.3711, + "step": 12069 + }, + { + "epoch": 0.16372761801410743, + "grad_norm": 6.4277024269104, + "learning_rate": 8.448814581334796e-06, + "loss": 0.3919, + "step": 12070 + }, + { + "epoch": 0.16374118285404232, + "grad_norm": 5.782377243041992, + "learning_rate": 8.448677538714541e-06, + "loss": 0.2882, + "step": 12071 + }, + { + "epoch": 0.16375474769397722, + "grad_norm": 6.648569107055664, + "learning_rate": 8.448540496094287e-06, + "loss": 0.3659, + "step": 12072 + }, + { + "epoch": 0.1637683125339121, + "grad_norm": 6.82586145401001, + "learning_rate": 8.448403453474032e-06, + "loss": 0.3107, + "step": 12073 + }, + { + "epoch": 0.163781877373847, + "grad_norm": 8.13663387298584, + "learning_rate": 8.448266410853775e-06, + "loss": 0.3911, + "step": 12074 + }, + { + "epoch": 0.16379544221378187, + "grad_norm": 6.106567859649658, + "learning_rate": 8.44812936823352e-06, + "loss": 0.2977, + "step": 12075 + }, + { + "epoch": 0.16380900705371676, + "grad_norm": 7.618409633636475, + "learning_rate": 8.447992325613267e-06, + "loss": 0.4297, + "step": 12076 + }, + { + "epoch": 0.16382257189365165, + "grad_norm": 7.389631748199463, + "learning_rate": 8.447855282993011e-06, + "loss": 0.4235, + "step": 12077 + }, + { + "epoch": 0.16383613673358655, + "grad_norm": 8.569759368896484, + "learning_rate": 8.447718240372756e-06, + "loss": 0.5216, + "step": 12078 + }, + { + "epoch": 0.16384970157352144, + "grad_norm": 6.151385307312012, + "learning_rate": 8.447581197752501e-06, + "loss": 0.3323, + "step": 12079 + }, + { + "epoch": 0.16386326641345633, + "grad_norm": 6.251616477966309, + "learning_rate": 8.447444155132248e-06, + "loss": 0.4276, + "step": 12080 + }, + { + "epoch": 0.1638768312533912, + "grad_norm": 5.618207931518555, + "learning_rate": 8.447307112511992e-06, + "loss": 0.4374, + "step": 12081 + }, + { + "epoch": 0.1638903960933261, + "grad_norm": 5.979750633239746, + "learning_rate": 8.447170069891737e-06, + "loss": 0.4432, + "step": 12082 + }, + { + "epoch": 0.16390396093326098, + "grad_norm": 5.263755798339844, + "learning_rate": 8.447033027271482e-06, + "loss": 0.4366, + "step": 12083 + }, + { + "epoch": 0.16391752577319588, + "grad_norm": 6.118418216705322, + "learning_rate": 8.446895984651227e-06, + "loss": 0.4169, + "step": 12084 + }, + { + "epoch": 0.16393109061313077, + "grad_norm": 7.2752604484558105, + "learning_rate": 8.446758942030972e-06, + "loss": 0.3583, + "step": 12085 + }, + { + "epoch": 0.16394465545306566, + "grad_norm": 7.299071311950684, + "learning_rate": 8.446621899410717e-06, + "loss": 0.4043, + "step": 12086 + }, + { + "epoch": 0.16395822029300056, + "grad_norm": 8.132368087768555, + "learning_rate": 8.446484856790463e-06, + "loss": 0.338, + "step": 12087 + }, + { + "epoch": 0.16397178513293542, + "grad_norm": 4.968807220458984, + "learning_rate": 8.446347814170208e-06, + "loss": 0.3009, + "step": 12088 + }, + { + "epoch": 0.16398534997287031, + "grad_norm": 5.980966567993164, + "learning_rate": 8.446210771549953e-06, + "loss": 0.3002, + "step": 12089 + }, + { + "epoch": 0.1639989148128052, + "grad_norm": 4.717428207397461, + "learning_rate": 8.446073728929698e-06, + "loss": 0.2986, + "step": 12090 + }, + { + "epoch": 0.1640124796527401, + "grad_norm": 7.4951934814453125, + "learning_rate": 8.445936686309443e-06, + "loss": 0.4677, + "step": 12091 + }, + { + "epoch": 0.164026044492675, + "grad_norm": 4.720541954040527, + "learning_rate": 8.445799643689187e-06, + "loss": 0.2219, + "step": 12092 + }, + { + "epoch": 0.1640396093326099, + "grad_norm": 4.785496711730957, + "learning_rate": 8.445662601068934e-06, + "loss": 0.2459, + "step": 12093 + }, + { + "epoch": 0.16405317417254475, + "grad_norm": 6.4153618812561035, + "learning_rate": 8.445525558448679e-06, + "loss": 0.4099, + "step": 12094 + }, + { + "epoch": 0.16406673901247965, + "grad_norm": 5.001276969909668, + "learning_rate": 8.445388515828424e-06, + "loss": 0.4276, + "step": 12095 + }, + { + "epoch": 0.16408030385241454, + "grad_norm": 5.379701614379883, + "learning_rate": 8.445251473208168e-06, + "loss": 0.4475, + "step": 12096 + }, + { + "epoch": 0.16409386869234943, + "grad_norm": 4.5921759605407715, + "learning_rate": 8.445114430587914e-06, + "loss": 0.1426, + "step": 12097 + }, + { + "epoch": 0.16410743353228432, + "grad_norm": 5.194815635681152, + "learning_rate": 8.44497738796766e-06, + "loss": 0.3617, + "step": 12098 + }, + { + "epoch": 0.16412099837221922, + "grad_norm": 4.814620494842529, + "learning_rate": 8.444840345347403e-06, + "loss": 0.3605, + "step": 12099 + }, + { + "epoch": 0.16413456321215408, + "grad_norm": 6.394733905792236, + "learning_rate": 8.444703302727148e-06, + "loss": 0.393, + "step": 12100 + }, + { + "epoch": 0.16414812805208898, + "grad_norm": 6.498347759246826, + "learning_rate": 8.444566260106893e-06, + "loss": 0.4145, + "step": 12101 + }, + { + "epoch": 0.16416169289202387, + "grad_norm": 6.1902852058410645, + "learning_rate": 8.444429217486639e-06, + "loss": 0.4311, + "step": 12102 + }, + { + "epoch": 0.16417525773195876, + "grad_norm": 5.603814125061035, + "learning_rate": 8.444292174866384e-06, + "loss": 0.3747, + "step": 12103 + }, + { + "epoch": 0.16418882257189366, + "grad_norm": 4.600372791290283, + "learning_rate": 8.444155132246129e-06, + "loss": 0.222, + "step": 12104 + }, + { + "epoch": 0.16420238741182855, + "grad_norm": 5.242251396179199, + "learning_rate": 8.444018089625874e-06, + "loss": 0.2725, + "step": 12105 + }, + { + "epoch": 0.16421595225176344, + "grad_norm": 5.677844047546387, + "learning_rate": 8.44388104700562e-06, + "loss": 0.4157, + "step": 12106 + }, + { + "epoch": 0.1642295170916983, + "grad_norm": 6.316489219665527, + "learning_rate": 8.443744004385365e-06, + "loss": 0.3975, + "step": 12107 + }, + { + "epoch": 0.1642430819316332, + "grad_norm": 5.859224796295166, + "learning_rate": 8.44360696176511e-06, + "loss": 0.3903, + "step": 12108 + }, + { + "epoch": 0.1642566467715681, + "grad_norm": 7.593347549438477, + "learning_rate": 8.443469919144855e-06, + "loss": 0.4593, + "step": 12109 + }, + { + "epoch": 0.164270211611503, + "grad_norm": 5.618531703948975, + "learning_rate": 8.4433328765246e-06, + "loss": 0.3622, + "step": 12110 + }, + { + "epoch": 0.16428377645143788, + "grad_norm": 6.183176040649414, + "learning_rate": 8.443195833904345e-06, + "loss": 0.2656, + "step": 12111 + }, + { + "epoch": 0.16429734129137277, + "grad_norm": 4.5673346519470215, + "learning_rate": 8.44305879128409e-06, + "loss": 0.2956, + "step": 12112 + }, + { + "epoch": 0.16431090613130764, + "grad_norm": 6.372870445251465, + "learning_rate": 8.442921748663836e-06, + "loss": 0.4613, + "step": 12113 + }, + { + "epoch": 0.16432447097124253, + "grad_norm": 4.296064853668213, + "learning_rate": 8.442784706043579e-06, + "loss": 0.2641, + "step": 12114 + }, + { + "epoch": 0.16433803581117742, + "grad_norm": 6.009721279144287, + "learning_rate": 8.442647663423326e-06, + "loss": 0.309, + "step": 12115 + }, + { + "epoch": 0.16435160065111232, + "grad_norm": 7.572810173034668, + "learning_rate": 8.442510620803071e-06, + "loss": 0.4874, + "step": 12116 + }, + { + "epoch": 0.1643651654910472, + "grad_norm": 4.371432781219482, + "learning_rate": 8.442373578182815e-06, + "loss": 0.3822, + "step": 12117 + }, + { + "epoch": 0.1643787303309821, + "grad_norm": 6.187825679779053, + "learning_rate": 8.44223653556256e-06, + "loss": 0.3228, + "step": 12118 + }, + { + "epoch": 0.164392295170917, + "grad_norm": 5.384653568267822, + "learning_rate": 8.442099492942307e-06, + "loss": 0.3019, + "step": 12119 + }, + { + "epoch": 0.16440586001085186, + "grad_norm": 5.567202091217041, + "learning_rate": 8.44196245032205e-06, + "loss": 0.3838, + "step": 12120 + }, + { + "epoch": 0.16441942485078676, + "grad_norm": 5.782050132751465, + "learning_rate": 8.441825407701795e-06, + "loss": 0.2905, + "step": 12121 + }, + { + "epoch": 0.16443298969072165, + "grad_norm": 6.005796432495117, + "learning_rate": 8.44168836508154e-06, + "loss": 0.5277, + "step": 12122 + }, + { + "epoch": 0.16444655453065654, + "grad_norm": 6.375955104827881, + "learning_rate": 8.441551322461287e-06, + "loss": 0.3151, + "step": 12123 + }, + { + "epoch": 0.16446011937059143, + "grad_norm": 5.753347873687744, + "learning_rate": 8.441414279841031e-06, + "loss": 0.3544, + "step": 12124 + }, + { + "epoch": 0.16447368421052633, + "grad_norm": 6.245108604431152, + "learning_rate": 8.441277237220776e-06, + "loss": 0.3231, + "step": 12125 + }, + { + "epoch": 0.1644872490504612, + "grad_norm": 4.9221415519714355, + "learning_rate": 8.441140194600521e-06, + "loss": 0.3263, + "step": 12126 + }, + { + "epoch": 0.16450081389039609, + "grad_norm": 4.638299942016602, + "learning_rate": 8.441003151980266e-06, + "loss": 0.2815, + "step": 12127 + }, + { + "epoch": 0.16451437873033098, + "grad_norm": 4.961704254150391, + "learning_rate": 8.440866109360012e-06, + "loss": 0.3066, + "step": 12128 + }, + { + "epoch": 0.16452794357026587, + "grad_norm": 6.4302239418029785, + "learning_rate": 8.440729066739757e-06, + "loss": 0.3706, + "step": 12129 + }, + { + "epoch": 0.16454150841020077, + "grad_norm": 6.35573148727417, + "learning_rate": 8.440592024119502e-06, + "loss": 0.3944, + "step": 12130 + }, + { + "epoch": 0.16455507325013566, + "grad_norm": 6.129939079284668, + "learning_rate": 8.440454981499247e-06, + "loss": 0.3813, + "step": 12131 + }, + { + "epoch": 0.16456863809007052, + "grad_norm": 6.697657108306885, + "learning_rate": 8.440317938878992e-06, + "loss": 0.3655, + "step": 12132 + }, + { + "epoch": 0.16458220293000542, + "grad_norm": 6.703118801116943, + "learning_rate": 8.440180896258737e-06, + "loss": 0.4493, + "step": 12133 + }, + { + "epoch": 0.1645957677699403, + "grad_norm": 5.9440999031066895, + "learning_rate": 8.440043853638483e-06, + "loss": 0.4294, + "step": 12134 + }, + { + "epoch": 0.1646093326098752, + "grad_norm": 6.430551528930664, + "learning_rate": 8.439906811018226e-06, + "loss": 0.4632, + "step": 12135 + }, + { + "epoch": 0.1646228974498101, + "grad_norm": 4.1502885818481445, + "learning_rate": 8.439769768397973e-06, + "loss": 0.2399, + "step": 12136 + }, + { + "epoch": 0.164636462289745, + "grad_norm": 6.214041709899902, + "learning_rate": 8.439632725777718e-06, + "loss": 0.3369, + "step": 12137 + }, + { + "epoch": 0.16465002712967988, + "grad_norm": 5.849399089813232, + "learning_rate": 8.439495683157463e-06, + "loss": 0.3208, + "step": 12138 + }, + { + "epoch": 0.16466359196961475, + "grad_norm": 5.65157413482666, + "learning_rate": 8.439358640537207e-06, + "loss": 0.3896, + "step": 12139 + }, + { + "epoch": 0.16467715680954964, + "grad_norm": 4.646707057952881, + "learning_rate": 8.439221597916954e-06, + "loss": 0.2842, + "step": 12140 + }, + { + "epoch": 0.16469072164948453, + "grad_norm": 6.024219989776611, + "learning_rate": 8.439084555296699e-06, + "loss": 0.2716, + "step": 12141 + }, + { + "epoch": 0.16470428648941943, + "grad_norm": 7.386404037475586, + "learning_rate": 8.438947512676442e-06, + "loss": 0.4864, + "step": 12142 + }, + { + "epoch": 0.16471785132935432, + "grad_norm": 5.180314064025879, + "learning_rate": 8.438810470056188e-06, + "loss": 0.3336, + "step": 12143 + }, + { + "epoch": 0.1647314161692892, + "grad_norm": 9.304461479187012, + "learning_rate": 8.438673427435933e-06, + "loss": 0.5473, + "step": 12144 + }, + { + "epoch": 0.16474498100922408, + "grad_norm": 5.678247451782227, + "learning_rate": 8.438536384815678e-06, + "loss": 0.3378, + "step": 12145 + }, + { + "epoch": 0.16475854584915897, + "grad_norm": 6.732650279998779, + "learning_rate": 8.438399342195423e-06, + "loss": 0.4573, + "step": 12146 + }, + { + "epoch": 0.16477211068909386, + "grad_norm": 7.070070743560791, + "learning_rate": 8.438262299575168e-06, + "loss": 0.383, + "step": 12147 + }, + { + "epoch": 0.16478567552902876, + "grad_norm": 6.471791744232178, + "learning_rate": 8.438125256954913e-06, + "loss": 0.361, + "step": 12148 + }, + { + "epoch": 0.16479924036896365, + "grad_norm": 8.638456344604492, + "learning_rate": 8.437988214334659e-06, + "loss": 0.5519, + "step": 12149 + }, + { + "epoch": 0.16481280520889854, + "grad_norm": 8.04865550994873, + "learning_rate": 8.437851171714404e-06, + "loss": 0.2909, + "step": 12150 + }, + { + "epoch": 0.16482637004883344, + "grad_norm": 6.621257305145264, + "learning_rate": 8.437714129094149e-06, + "loss": 0.5444, + "step": 12151 + }, + { + "epoch": 0.1648399348887683, + "grad_norm": 6.418248176574707, + "learning_rate": 8.437577086473894e-06, + "loss": 0.4432, + "step": 12152 + }, + { + "epoch": 0.1648534997287032, + "grad_norm": 6.091439723968506, + "learning_rate": 8.43744004385364e-06, + "loss": 0.3387, + "step": 12153 + }, + { + "epoch": 0.1648670645686381, + "grad_norm": 5.9979400634765625, + "learning_rate": 8.437303001233385e-06, + "loss": 0.358, + "step": 12154 + }, + { + "epoch": 0.16488062940857298, + "grad_norm": 7.446776866912842, + "learning_rate": 8.43716595861313e-06, + "loss": 0.4616, + "step": 12155 + }, + { + "epoch": 0.16489419424850787, + "grad_norm": 5.193438529968262, + "learning_rate": 8.437028915992875e-06, + "loss": 0.2803, + "step": 12156 + }, + { + "epoch": 0.16490775908844277, + "grad_norm": 6.996969223022461, + "learning_rate": 8.436891873372618e-06, + "loss": 0.4086, + "step": 12157 + }, + { + "epoch": 0.16492132392837763, + "grad_norm": 5.863053321838379, + "learning_rate": 8.436754830752365e-06, + "loss": 0.4243, + "step": 12158 + }, + { + "epoch": 0.16493488876831253, + "grad_norm": 7.937037467956543, + "learning_rate": 8.43661778813211e-06, + "loss": 0.5113, + "step": 12159 + }, + { + "epoch": 0.16494845360824742, + "grad_norm": 7.455841064453125, + "learning_rate": 8.436480745511854e-06, + "loss": 0.4758, + "step": 12160 + }, + { + "epoch": 0.1649620184481823, + "grad_norm": 6.609822750091553, + "learning_rate": 8.436343702891599e-06, + "loss": 0.4278, + "step": 12161 + }, + { + "epoch": 0.1649755832881172, + "grad_norm": 7.1411566734313965, + "learning_rate": 8.436206660271346e-06, + "loss": 0.4435, + "step": 12162 + }, + { + "epoch": 0.1649891481280521, + "grad_norm": 6.200859546661377, + "learning_rate": 8.436069617651091e-06, + "loss": 0.3782, + "step": 12163 + }, + { + "epoch": 0.16500271296798696, + "grad_norm": 4.72135591506958, + "learning_rate": 8.435932575030835e-06, + "loss": 0.2742, + "step": 12164 + }, + { + "epoch": 0.16501627780792186, + "grad_norm": 4.811033725738525, + "learning_rate": 8.43579553241058e-06, + "loss": 0.4093, + "step": 12165 + }, + { + "epoch": 0.16502984264785675, + "grad_norm": 5.11330509185791, + "learning_rate": 8.435658489790327e-06, + "loss": 0.414, + "step": 12166 + }, + { + "epoch": 0.16504340748779164, + "grad_norm": 5.669827938079834, + "learning_rate": 8.43552144717007e-06, + "loss": 0.4632, + "step": 12167 + }, + { + "epoch": 0.16505697232772654, + "grad_norm": 6.550622463226318, + "learning_rate": 8.435384404549815e-06, + "loss": 0.3744, + "step": 12168 + }, + { + "epoch": 0.16507053716766143, + "grad_norm": 7.426636695861816, + "learning_rate": 8.43524736192956e-06, + "loss": 0.3425, + "step": 12169 + }, + { + "epoch": 0.16508410200759632, + "grad_norm": 4.7783050537109375, + "learning_rate": 8.435110319309306e-06, + "loss": 0.3151, + "step": 12170 + }, + { + "epoch": 0.1650976668475312, + "grad_norm": 5.140785217285156, + "learning_rate": 8.434973276689051e-06, + "loss": 0.4357, + "step": 12171 + }, + { + "epoch": 0.16511123168746608, + "grad_norm": 6.269582748413086, + "learning_rate": 8.434836234068796e-06, + "loss": 0.4641, + "step": 12172 + }, + { + "epoch": 0.16512479652740097, + "grad_norm": 7.408116817474365, + "learning_rate": 8.434699191448541e-06, + "loss": 0.3605, + "step": 12173 + }, + { + "epoch": 0.16513836136733587, + "grad_norm": 7.041553974151611, + "learning_rate": 8.434562148828286e-06, + "loss": 0.4118, + "step": 12174 + }, + { + "epoch": 0.16515192620727076, + "grad_norm": 5.529245853424072, + "learning_rate": 8.434425106208032e-06, + "loss": 0.4929, + "step": 12175 + }, + { + "epoch": 0.16516549104720565, + "grad_norm": 6.393537521362305, + "learning_rate": 8.434288063587777e-06, + "loss": 0.3535, + "step": 12176 + }, + { + "epoch": 0.16517905588714052, + "grad_norm": 5.5541558265686035, + "learning_rate": 8.434151020967522e-06, + "loss": 0.2417, + "step": 12177 + }, + { + "epoch": 0.1651926207270754, + "grad_norm": 6.535806655883789, + "learning_rate": 8.434013978347267e-06, + "loss": 0.3519, + "step": 12178 + }, + { + "epoch": 0.1652061855670103, + "grad_norm": 5.925968170166016, + "learning_rate": 8.433876935727012e-06, + "loss": 0.3718, + "step": 12179 + }, + { + "epoch": 0.1652197504069452, + "grad_norm": 6.459280490875244, + "learning_rate": 8.433739893106758e-06, + "loss": 0.3901, + "step": 12180 + }, + { + "epoch": 0.1652333152468801, + "grad_norm": 7.960973739624023, + "learning_rate": 8.433602850486503e-06, + "loss": 0.5437, + "step": 12181 + }, + { + "epoch": 0.16524688008681498, + "grad_norm": 4.396331787109375, + "learning_rate": 8.433465807866246e-06, + "loss": 0.3802, + "step": 12182 + }, + { + "epoch": 0.16526044492674988, + "grad_norm": 6.919742584228516, + "learning_rate": 8.433328765245991e-06, + "loss": 0.4948, + "step": 12183 + }, + { + "epoch": 0.16527400976668474, + "grad_norm": 5.227790355682373, + "learning_rate": 8.433191722625738e-06, + "loss": 0.4724, + "step": 12184 + }, + { + "epoch": 0.16528757460661964, + "grad_norm": 7.867639541625977, + "learning_rate": 8.433054680005482e-06, + "loss": 0.4507, + "step": 12185 + }, + { + "epoch": 0.16530113944655453, + "grad_norm": 7.1714324951171875, + "learning_rate": 8.432917637385227e-06, + "loss": 0.4602, + "step": 12186 + }, + { + "epoch": 0.16531470428648942, + "grad_norm": 5.064708232879639, + "learning_rate": 8.432780594764972e-06, + "loss": 0.3948, + "step": 12187 + }, + { + "epoch": 0.16532826912642432, + "grad_norm": 6.662481307983398, + "learning_rate": 8.432643552144717e-06, + "loss": 0.4613, + "step": 12188 + }, + { + "epoch": 0.1653418339663592, + "grad_norm": 6.112710475921631, + "learning_rate": 8.432506509524462e-06, + "loss": 0.385, + "step": 12189 + }, + { + "epoch": 0.16535539880629407, + "grad_norm": 7.0817437171936035, + "learning_rate": 8.432369466904208e-06, + "loss": 0.5261, + "step": 12190 + }, + { + "epoch": 0.16536896364622897, + "grad_norm": 7.286290168762207, + "learning_rate": 8.432232424283953e-06, + "loss": 0.3796, + "step": 12191 + }, + { + "epoch": 0.16538252848616386, + "grad_norm": 4.464337348937988, + "learning_rate": 8.432095381663698e-06, + "loss": 0.3279, + "step": 12192 + }, + { + "epoch": 0.16539609332609875, + "grad_norm": 6.739080905914307, + "learning_rate": 8.431958339043443e-06, + "loss": 0.4178, + "step": 12193 + }, + { + "epoch": 0.16540965816603365, + "grad_norm": 5.267351150512695, + "learning_rate": 8.431821296423188e-06, + "loss": 0.4309, + "step": 12194 + }, + { + "epoch": 0.16542322300596854, + "grad_norm": 7.133444786071777, + "learning_rate": 8.431684253802933e-06, + "loss": 0.5945, + "step": 12195 + }, + { + "epoch": 0.1654367878459034, + "grad_norm": 7.601070404052734, + "learning_rate": 8.431547211182679e-06, + "loss": 0.6514, + "step": 12196 + }, + { + "epoch": 0.1654503526858383, + "grad_norm": 7.108713626861572, + "learning_rate": 8.431410168562424e-06, + "loss": 0.3888, + "step": 12197 + }, + { + "epoch": 0.1654639175257732, + "grad_norm": 7.44101095199585, + "learning_rate": 8.431273125942169e-06, + "loss": 0.6342, + "step": 12198 + }, + { + "epoch": 0.16547748236570808, + "grad_norm": 6.6608781814575195, + "learning_rate": 8.431136083321914e-06, + "loss": 0.5106, + "step": 12199 + }, + { + "epoch": 0.16549104720564298, + "grad_norm": 5.730293273925781, + "learning_rate": 8.430999040701658e-06, + "loss": 0.4363, + "step": 12200 + }, + { + "epoch": 0.16550461204557787, + "grad_norm": 7.751444339752197, + "learning_rate": 8.430861998081405e-06, + "loss": 0.4767, + "step": 12201 + }, + { + "epoch": 0.16551817688551276, + "grad_norm": 4.8852105140686035, + "learning_rate": 8.43072495546115e-06, + "loss": 0.3709, + "step": 12202 + }, + { + "epoch": 0.16553174172544763, + "grad_norm": 5.965075969696045, + "learning_rate": 8.430587912840893e-06, + "loss": 0.5215, + "step": 12203 + }, + { + "epoch": 0.16554530656538252, + "grad_norm": 6.741921424865723, + "learning_rate": 8.430450870220638e-06, + "loss": 0.4159, + "step": 12204 + }, + { + "epoch": 0.16555887140531741, + "grad_norm": 4.901025772094727, + "learning_rate": 8.430313827600385e-06, + "loss": 0.4245, + "step": 12205 + }, + { + "epoch": 0.1655724362452523, + "grad_norm": 7.605551719665527, + "learning_rate": 8.43017678498013e-06, + "loss": 0.4866, + "step": 12206 + }, + { + "epoch": 0.1655860010851872, + "grad_norm": 7.431743144989014, + "learning_rate": 8.430039742359874e-06, + "loss": 0.483, + "step": 12207 + }, + { + "epoch": 0.1655995659251221, + "grad_norm": 7.316726207733154, + "learning_rate": 8.429902699739619e-06, + "loss": 0.4733, + "step": 12208 + }, + { + "epoch": 0.16561313076505696, + "grad_norm": 6.993491172790527, + "learning_rate": 8.429765657119366e-06, + "loss": 0.5113, + "step": 12209 + }, + { + "epoch": 0.16562669560499185, + "grad_norm": 5.861526966094971, + "learning_rate": 8.42962861449911e-06, + "loss": 0.3785, + "step": 12210 + }, + { + "epoch": 0.16564026044492675, + "grad_norm": 5.508956432342529, + "learning_rate": 8.429491571878855e-06, + "loss": 0.4067, + "step": 12211 + }, + { + "epoch": 0.16565382528486164, + "grad_norm": 7.180076599121094, + "learning_rate": 8.4293545292586e-06, + "loss": 0.4516, + "step": 12212 + }, + { + "epoch": 0.16566739012479653, + "grad_norm": 5.118730545043945, + "learning_rate": 8.429217486638345e-06, + "loss": 0.4181, + "step": 12213 + }, + { + "epoch": 0.16568095496473143, + "grad_norm": 5.507533073425293, + "learning_rate": 8.42908044401809e-06, + "loss": 0.484, + "step": 12214 + }, + { + "epoch": 0.16569451980466632, + "grad_norm": 6.842198371887207, + "learning_rate": 8.428943401397835e-06, + "loss": 0.6407, + "step": 12215 + }, + { + "epoch": 0.16570808464460118, + "grad_norm": 5.781506538391113, + "learning_rate": 8.42880635877758e-06, + "loss": 0.5129, + "step": 12216 + }, + { + "epoch": 0.16572164948453608, + "grad_norm": 4.515210151672363, + "learning_rate": 8.428669316157326e-06, + "loss": 0.3371, + "step": 12217 + }, + { + "epoch": 0.16573521432447097, + "grad_norm": 6.231934547424316, + "learning_rate": 8.428532273537071e-06, + "loss": 0.3469, + "step": 12218 + }, + { + "epoch": 0.16574877916440586, + "grad_norm": 6.308058738708496, + "learning_rate": 8.428395230916816e-06, + "loss": 0.4382, + "step": 12219 + }, + { + "epoch": 0.16576234400434076, + "grad_norm": 6.08511209487915, + "learning_rate": 8.428258188296561e-06, + "loss": 0.5179, + "step": 12220 + }, + { + "epoch": 0.16577590884427565, + "grad_norm": 4.729233741760254, + "learning_rate": 8.428121145676306e-06, + "loss": 0.2648, + "step": 12221 + }, + { + "epoch": 0.16578947368421051, + "grad_norm": 5.9245219230651855, + "learning_rate": 8.427984103056052e-06, + "loss": 0.526, + "step": 12222 + }, + { + "epoch": 0.1658030385241454, + "grad_norm": 6.994899272918701, + "learning_rate": 8.427847060435797e-06, + "loss": 0.5314, + "step": 12223 + }, + { + "epoch": 0.1658166033640803, + "grad_norm": 6.792517185211182, + "learning_rate": 8.427710017815542e-06, + "loss": 0.5292, + "step": 12224 + }, + { + "epoch": 0.1658301682040152, + "grad_norm": 6.100817680358887, + "learning_rate": 8.427572975195285e-06, + "loss": 0.4511, + "step": 12225 + }, + { + "epoch": 0.1658437330439501, + "grad_norm": 7.272988796234131, + "learning_rate": 8.42743593257503e-06, + "loss": 0.4872, + "step": 12226 + }, + { + "epoch": 0.16585729788388498, + "grad_norm": 6.1742472648620605, + "learning_rate": 8.427298889954778e-06, + "loss": 0.4176, + "step": 12227 + }, + { + "epoch": 0.16587086272381985, + "grad_norm": 5.967830657958984, + "learning_rate": 8.427161847334521e-06, + "loss": 0.4552, + "step": 12228 + }, + { + "epoch": 0.16588442756375474, + "grad_norm": 8.636767387390137, + "learning_rate": 8.427024804714266e-06, + "loss": 0.6531, + "step": 12229 + }, + { + "epoch": 0.16589799240368963, + "grad_norm": 7.6502299308776855, + "learning_rate": 8.426887762094011e-06, + "loss": 0.4425, + "step": 12230 + }, + { + "epoch": 0.16591155724362452, + "grad_norm": 6.181210041046143, + "learning_rate": 8.426750719473758e-06, + "loss": 0.3343, + "step": 12231 + }, + { + "epoch": 0.16592512208355942, + "grad_norm": 5.122384548187256, + "learning_rate": 8.426613676853502e-06, + "loss": 0.3447, + "step": 12232 + }, + { + "epoch": 0.1659386869234943, + "grad_norm": 4.973871231079102, + "learning_rate": 8.426476634233247e-06, + "loss": 0.4438, + "step": 12233 + }, + { + "epoch": 0.1659522517634292, + "grad_norm": 5.299435615539551, + "learning_rate": 8.426339591612992e-06, + "loss": 0.2908, + "step": 12234 + }, + { + "epoch": 0.16596581660336407, + "grad_norm": 4.841035842895508, + "learning_rate": 8.426202548992737e-06, + "loss": 0.3077, + "step": 12235 + }, + { + "epoch": 0.16597938144329896, + "grad_norm": 7.5721821784973145, + "learning_rate": 8.426065506372482e-06, + "loss": 0.5426, + "step": 12236 + }, + { + "epoch": 0.16599294628323386, + "grad_norm": 4.939707279205322, + "learning_rate": 8.425928463752228e-06, + "loss": 0.2116, + "step": 12237 + }, + { + "epoch": 0.16600651112316875, + "grad_norm": 4.555453777313232, + "learning_rate": 8.425791421131973e-06, + "loss": 0.322, + "step": 12238 + }, + { + "epoch": 0.16602007596310364, + "grad_norm": 6.110024452209473, + "learning_rate": 8.425654378511718e-06, + "loss": 0.4816, + "step": 12239 + }, + { + "epoch": 0.16603364080303853, + "grad_norm": 6.061440944671631, + "learning_rate": 8.425517335891463e-06, + "loss": 0.4907, + "step": 12240 + }, + { + "epoch": 0.1660472056429734, + "grad_norm": 6.338626861572266, + "learning_rate": 8.425380293271208e-06, + "loss": 0.4056, + "step": 12241 + }, + { + "epoch": 0.1660607704829083, + "grad_norm": 6.122838973999023, + "learning_rate": 8.425243250650954e-06, + "loss": 0.3478, + "step": 12242 + }, + { + "epoch": 0.1660743353228432, + "grad_norm": 6.899070739746094, + "learning_rate": 8.425106208030697e-06, + "loss": 0.3603, + "step": 12243 + }, + { + "epoch": 0.16608790016277808, + "grad_norm": 7.68436861038208, + "learning_rate": 8.424969165410444e-06, + "loss": 0.4184, + "step": 12244 + }, + { + "epoch": 0.16610146500271297, + "grad_norm": 4.987961769104004, + "learning_rate": 8.424832122790189e-06, + "loss": 0.2854, + "step": 12245 + }, + { + "epoch": 0.16611502984264787, + "grad_norm": 9.097527503967285, + "learning_rate": 8.424695080169934e-06, + "loss": 0.4043, + "step": 12246 + }, + { + "epoch": 0.16612859468258276, + "grad_norm": 6.597470283508301, + "learning_rate": 8.424558037549678e-06, + "loss": 0.3508, + "step": 12247 + }, + { + "epoch": 0.16614215952251762, + "grad_norm": 5.5869011878967285, + "learning_rate": 8.424420994929425e-06, + "loss": 0.4185, + "step": 12248 + }, + { + "epoch": 0.16615572436245252, + "grad_norm": 5.540280818939209, + "learning_rate": 8.42428395230917e-06, + "loss": 0.3334, + "step": 12249 + }, + { + "epoch": 0.1661692892023874, + "grad_norm": 6.1583662033081055, + "learning_rate": 8.424146909688913e-06, + "loss": 0.2696, + "step": 12250 + }, + { + "epoch": 0.1661828540423223, + "grad_norm": 7.3782477378845215, + "learning_rate": 8.424009867068658e-06, + "loss": 0.5096, + "step": 12251 + }, + { + "epoch": 0.1661964188822572, + "grad_norm": 5.013638019561768, + "learning_rate": 8.423872824448404e-06, + "loss": 0.2853, + "step": 12252 + }, + { + "epoch": 0.1662099837221921, + "grad_norm": 6.386141300201416, + "learning_rate": 8.423735781828149e-06, + "loss": 0.3226, + "step": 12253 + }, + { + "epoch": 0.16622354856212695, + "grad_norm": 4.915837287902832, + "learning_rate": 8.423598739207894e-06, + "loss": 0.3406, + "step": 12254 + }, + { + "epoch": 0.16623711340206185, + "grad_norm": 6.689484596252441, + "learning_rate": 8.423461696587639e-06, + "loss": 0.332, + "step": 12255 + }, + { + "epoch": 0.16625067824199674, + "grad_norm": 6.577810287475586, + "learning_rate": 8.423324653967384e-06, + "loss": 0.3905, + "step": 12256 + }, + { + "epoch": 0.16626424308193163, + "grad_norm": 6.990068435668945, + "learning_rate": 8.42318761134713e-06, + "loss": 0.3541, + "step": 12257 + }, + { + "epoch": 0.16627780792186653, + "grad_norm": 7.377398490905762, + "learning_rate": 8.423050568726875e-06, + "loss": 0.4903, + "step": 12258 + }, + { + "epoch": 0.16629137276180142, + "grad_norm": 5.085748195648193, + "learning_rate": 8.42291352610662e-06, + "loss": 0.3872, + "step": 12259 + }, + { + "epoch": 0.16630493760173629, + "grad_norm": 6.269100666046143, + "learning_rate": 8.422776483486365e-06, + "loss": 0.4472, + "step": 12260 + }, + { + "epoch": 0.16631850244167118, + "grad_norm": 4.696188926696777, + "learning_rate": 8.42263944086611e-06, + "loss": 0.2793, + "step": 12261 + }, + { + "epoch": 0.16633206728160607, + "grad_norm": 5.362918853759766, + "learning_rate": 8.422502398245855e-06, + "loss": 0.3736, + "step": 12262 + }, + { + "epoch": 0.16634563212154097, + "grad_norm": 6.955193996429443, + "learning_rate": 8.4223653556256e-06, + "loss": 0.4183, + "step": 12263 + }, + { + "epoch": 0.16635919696147586, + "grad_norm": 5.879549026489258, + "learning_rate": 8.422228313005346e-06, + "loss": 0.3979, + "step": 12264 + }, + { + "epoch": 0.16637276180141075, + "grad_norm": 6.282439231872559, + "learning_rate": 8.422091270385091e-06, + "loss": 0.3651, + "step": 12265 + }, + { + "epoch": 0.16638632664134564, + "grad_norm": 4.546778678894043, + "learning_rate": 8.421954227764836e-06, + "loss": 0.2535, + "step": 12266 + }, + { + "epoch": 0.1663998914812805, + "grad_norm": 4.665377616882324, + "learning_rate": 8.421817185144581e-06, + "loss": 0.2565, + "step": 12267 + }, + { + "epoch": 0.1664134563212154, + "grad_norm": 6.541701793670654, + "learning_rate": 8.421680142524325e-06, + "loss": 0.3223, + "step": 12268 + }, + { + "epoch": 0.1664270211611503, + "grad_norm": 5.898636817932129, + "learning_rate": 8.42154309990407e-06, + "loss": 0.4265, + "step": 12269 + }, + { + "epoch": 0.1664405860010852, + "grad_norm": 6.204099178314209, + "learning_rate": 8.421406057283817e-06, + "loss": 0.3894, + "step": 12270 + }, + { + "epoch": 0.16645415084102008, + "grad_norm": 8.9777250289917, + "learning_rate": 8.421269014663562e-06, + "loss": 0.5092, + "step": 12271 + }, + { + "epoch": 0.16646771568095498, + "grad_norm": 6.202890872955322, + "learning_rate": 8.421131972043305e-06, + "loss": 0.3822, + "step": 12272 + }, + { + "epoch": 0.16648128052088984, + "grad_norm": 6.158069133758545, + "learning_rate": 8.42099492942305e-06, + "loss": 0.3763, + "step": 12273 + }, + { + "epoch": 0.16649484536082473, + "grad_norm": 6.678469181060791, + "learning_rate": 8.420857886802798e-06, + "loss": 0.3205, + "step": 12274 + }, + { + "epoch": 0.16650841020075963, + "grad_norm": 5.5431318283081055, + "learning_rate": 8.420720844182541e-06, + "loss": 0.3859, + "step": 12275 + }, + { + "epoch": 0.16652197504069452, + "grad_norm": 5.570821762084961, + "learning_rate": 8.420583801562286e-06, + "loss": 0.4167, + "step": 12276 + }, + { + "epoch": 0.1665355398806294, + "grad_norm": 4.886516571044922, + "learning_rate": 8.420446758942031e-06, + "loss": 0.2719, + "step": 12277 + }, + { + "epoch": 0.1665491047205643, + "grad_norm": 4.401644229888916, + "learning_rate": 8.420309716321777e-06, + "loss": 0.2251, + "step": 12278 + }, + { + "epoch": 0.1665626695604992, + "grad_norm": 5.395236492156982, + "learning_rate": 8.420172673701522e-06, + "loss": 0.3189, + "step": 12279 + }, + { + "epoch": 0.16657623440043406, + "grad_norm": 6.316933631896973, + "learning_rate": 8.420035631081267e-06, + "loss": 0.342, + "step": 12280 + }, + { + "epoch": 0.16658979924036896, + "grad_norm": 6.34937858581543, + "learning_rate": 8.419898588461012e-06, + "loss": 0.4115, + "step": 12281 + }, + { + "epoch": 0.16660336408030385, + "grad_norm": 6.14727258682251, + "learning_rate": 8.419761545840757e-06, + "loss": 0.3393, + "step": 12282 + }, + { + "epoch": 0.16661692892023874, + "grad_norm": 6.418065071105957, + "learning_rate": 8.419624503220502e-06, + "loss": 0.3484, + "step": 12283 + }, + { + "epoch": 0.16663049376017364, + "grad_norm": 5.543679237365723, + "learning_rate": 8.419487460600248e-06, + "loss": 0.4296, + "step": 12284 + }, + { + "epoch": 0.16664405860010853, + "grad_norm": 4.2428975105285645, + "learning_rate": 8.419350417979993e-06, + "loss": 0.2459, + "step": 12285 + }, + { + "epoch": 0.1666576234400434, + "grad_norm": 6.3995184898376465, + "learning_rate": 8.419213375359738e-06, + "loss": 0.241, + "step": 12286 + }, + { + "epoch": 0.1666711882799783, + "grad_norm": 6.4955339431762695, + "learning_rate": 8.419076332739483e-06, + "loss": 0.3302, + "step": 12287 + }, + { + "epoch": 0.16668475311991318, + "grad_norm": 5.264610290527344, + "learning_rate": 8.418939290119228e-06, + "loss": 0.3116, + "step": 12288 + }, + { + "epoch": 0.16669831795984807, + "grad_norm": 7.5930609703063965, + "learning_rate": 8.418802247498974e-06, + "loss": 0.4423, + "step": 12289 + }, + { + "epoch": 0.16671188279978297, + "grad_norm": 3.737563133239746, + "learning_rate": 8.418665204878717e-06, + "loss": 0.229, + "step": 12290 + }, + { + "epoch": 0.16672544763971786, + "grad_norm": 5.021454334259033, + "learning_rate": 8.418528162258464e-06, + "loss": 0.1573, + "step": 12291 + }, + { + "epoch": 0.16673901247965275, + "grad_norm": 5.310892581939697, + "learning_rate": 8.418391119638209e-06, + "loss": 0.3016, + "step": 12292 + }, + { + "epoch": 0.16675257731958762, + "grad_norm": 5.742420196533203, + "learning_rate": 8.418254077017953e-06, + "loss": 0.5724, + "step": 12293 + }, + { + "epoch": 0.1667661421595225, + "grad_norm": 5.646888732910156, + "learning_rate": 8.418117034397698e-06, + "loss": 0.2618, + "step": 12294 + }, + { + "epoch": 0.1667797069994574, + "grad_norm": 5.514575481414795, + "learning_rate": 8.417979991777443e-06, + "loss": 0.3856, + "step": 12295 + }, + { + "epoch": 0.1667932718393923, + "grad_norm": 4.8440046310424805, + "learning_rate": 8.417842949157188e-06, + "loss": 0.2796, + "step": 12296 + }, + { + "epoch": 0.1668068366793272, + "grad_norm": 7.830718040466309, + "learning_rate": 8.417705906536933e-06, + "loss": 0.3556, + "step": 12297 + }, + { + "epoch": 0.16682040151926208, + "grad_norm": 4.437819957733154, + "learning_rate": 8.417568863916678e-06, + "loss": 0.3187, + "step": 12298 + }, + { + "epoch": 0.16683396635919695, + "grad_norm": 7.97534704208374, + "learning_rate": 8.417431821296424e-06, + "loss": 0.3312, + "step": 12299 + }, + { + "epoch": 0.16684753119913184, + "grad_norm": 5.292691707611084, + "learning_rate": 8.417294778676169e-06, + "loss": 0.3734, + "step": 12300 + }, + { + "epoch": 0.16686109603906674, + "grad_norm": 6.549408912658691, + "learning_rate": 8.417157736055914e-06, + "loss": 0.3169, + "step": 12301 + }, + { + "epoch": 0.16687466087900163, + "grad_norm": 4.46396541595459, + "learning_rate": 8.41702069343566e-06, + "loss": 0.1913, + "step": 12302 + }, + { + "epoch": 0.16688822571893652, + "grad_norm": 6.910810470581055, + "learning_rate": 8.416883650815404e-06, + "loss": 0.3087, + "step": 12303 + }, + { + "epoch": 0.16690179055887142, + "grad_norm": 5.58491849899292, + "learning_rate": 8.41674660819515e-06, + "loss": 0.2771, + "step": 12304 + }, + { + "epoch": 0.16691535539880628, + "grad_norm": 4.041632652282715, + "learning_rate": 8.416609565574895e-06, + "loss": 0.2413, + "step": 12305 + }, + { + "epoch": 0.16692892023874117, + "grad_norm": 6.381218910217285, + "learning_rate": 8.41647252295464e-06, + "loss": 0.3584, + "step": 12306 + }, + { + "epoch": 0.16694248507867607, + "grad_norm": 6.060807228088379, + "learning_rate": 8.416335480334385e-06, + "loss": 0.2528, + "step": 12307 + }, + { + "epoch": 0.16695604991861096, + "grad_norm": 4.407201766967773, + "learning_rate": 8.416198437714129e-06, + "loss": 0.2333, + "step": 12308 + }, + { + "epoch": 0.16696961475854585, + "grad_norm": 4.919793128967285, + "learning_rate": 8.416061395093875e-06, + "loss": 0.3025, + "step": 12309 + }, + { + "epoch": 0.16698317959848075, + "grad_norm": 7.029292106628418, + "learning_rate": 8.41592435247362e-06, + "loss": 0.4071, + "step": 12310 + }, + { + "epoch": 0.16699674443841564, + "grad_norm": 7.679317951202393, + "learning_rate": 8.415787309853364e-06, + "loss": 0.4307, + "step": 12311 + }, + { + "epoch": 0.1670103092783505, + "grad_norm": 7.8962249755859375, + "learning_rate": 8.41565026723311e-06, + "loss": 0.4702, + "step": 12312 + }, + { + "epoch": 0.1670238741182854, + "grad_norm": 5.882174491882324, + "learning_rate": 8.415513224612856e-06, + "loss": 0.5481, + "step": 12313 + }, + { + "epoch": 0.1670374389582203, + "grad_norm": 6.641495227813721, + "learning_rate": 8.415376181992601e-06, + "loss": 0.5004, + "step": 12314 + }, + { + "epoch": 0.16705100379815518, + "grad_norm": 5.441781520843506, + "learning_rate": 8.415239139372345e-06, + "loss": 0.4656, + "step": 12315 + }, + { + "epoch": 0.16706456863809008, + "grad_norm": 6.963151454925537, + "learning_rate": 8.41510209675209e-06, + "loss": 0.4403, + "step": 12316 + }, + { + "epoch": 0.16707813347802497, + "grad_norm": 5.637389183044434, + "learning_rate": 8.414965054131837e-06, + "loss": 0.352, + "step": 12317 + }, + { + "epoch": 0.16709169831795984, + "grad_norm": 6.428391933441162, + "learning_rate": 8.41482801151158e-06, + "loss": 0.4926, + "step": 12318 + }, + { + "epoch": 0.16710526315789473, + "grad_norm": 6.625340461730957, + "learning_rate": 8.414690968891326e-06, + "loss": 0.3659, + "step": 12319 + }, + { + "epoch": 0.16711882799782962, + "grad_norm": 5.742856502532959, + "learning_rate": 8.41455392627107e-06, + "loss": 0.4103, + "step": 12320 + }, + { + "epoch": 0.16713239283776452, + "grad_norm": 6.677555561065674, + "learning_rate": 8.414416883650816e-06, + "loss": 0.3553, + "step": 12321 + }, + { + "epoch": 0.1671459576776994, + "grad_norm": 5.5376996994018555, + "learning_rate": 8.414279841030561e-06, + "loss": 0.4577, + "step": 12322 + }, + { + "epoch": 0.1671595225176343, + "grad_norm": 3.7984421253204346, + "learning_rate": 8.414142798410306e-06, + "loss": 0.189, + "step": 12323 + }, + { + "epoch": 0.1671730873575692, + "grad_norm": 6.7050065994262695, + "learning_rate": 8.414005755790051e-06, + "loss": 0.3425, + "step": 12324 + }, + { + "epoch": 0.16718665219750406, + "grad_norm": 5.843540668487549, + "learning_rate": 8.413868713169797e-06, + "loss": 0.3787, + "step": 12325 + }, + { + "epoch": 0.16720021703743895, + "grad_norm": 6.063526630401611, + "learning_rate": 8.413731670549542e-06, + "loss": 0.4681, + "step": 12326 + }, + { + "epoch": 0.16721378187737385, + "grad_norm": 4.858076095581055, + "learning_rate": 8.413594627929287e-06, + "loss": 0.2531, + "step": 12327 + }, + { + "epoch": 0.16722734671730874, + "grad_norm": 4.69968843460083, + "learning_rate": 8.413457585309032e-06, + "loss": 0.2288, + "step": 12328 + }, + { + "epoch": 0.16724091155724363, + "grad_norm": 6.291234016418457, + "learning_rate": 8.413320542688777e-06, + "loss": 0.4737, + "step": 12329 + }, + { + "epoch": 0.16725447639717853, + "grad_norm": 4.6365766525268555, + "learning_rate": 8.413183500068522e-06, + "loss": 0.2869, + "step": 12330 + }, + { + "epoch": 0.1672680412371134, + "grad_norm": 6.681015491485596, + "learning_rate": 8.413046457448268e-06, + "loss": 0.4395, + "step": 12331 + }, + { + "epoch": 0.16728160607704828, + "grad_norm": 8.30061149597168, + "learning_rate": 8.412909414828013e-06, + "loss": 0.493, + "step": 12332 + }, + { + "epoch": 0.16729517091698318, + "grad_norm": 5.61463737487793, + "learning_rate": 8.412772372207756e-06, + "loss": 0.3169, + "step": 12333 + }, + { + "epoch": 0.16730873575691807, + "grad_norm": 5.118105411529541, + "learning_rate": 8.412635329587503e-06, + "loss": 0.3092, + "step": 12334 + }, + { + "epoch": 0.16732230059685296, + "grad_norm": 5.229746341705322, + "learning_rate": 8.412498286967248e-06, + "loss": 0.21, + "step": 12335 + }, + { + "epoch": 0.16733586543678786, + "grad_norm": 4.744583606719971, + "learning_rate": 8.412361244346992e-06, + "loss": 0.2377, + "step": 12336 + }, + { + "epoch": 0.16734943027672272, + "grad_norm": 6.213165760040283, + "learning_rate": 8.412224201726737e-06, + "loss": 0.3276, + "step": 12337 + }, + { + "epoch": 0.16736299511665761, + "grad_norm": 6.26652193069458, + "learning_rate": 8.412087159106482e-06, + "loss": 0.3683, + "step": 12338 + }, + { + "epoch": 0.1673765599565925, + "grad_norm": 6.077304840087891, + "learning_rate": 8.411950116486229e-06, + "loss": 0.2709, + "step": 12339 + }, + { + "epoch": 0.1673901247965274, + "grad_norm": 5.217187404632568, + "learning_rate": 8.411813073865973e-06, + "loss": 0.227, + "step": 12340 + }, + { + "epoch": 0.1674036896364623, + "grad_norm": 6.474851131439209, + "learning_rate": 8.411676031245718e-06, + "loss": 0.2926, + "step": 12341 + }, + { + "epoch": 0.1674172544763972, + "grad_norm": 4.319947242736816, + "learning_rate": 8.411538988625463e-06, + "loss": 0.2252, + "step": 12342 + }, + { + "epoch": 0.16743081931633208, + "grad_norm": 7.384720325469971, + "learning_rate": 8.411401946005208e-06, + "loss": 0.4707, + "step": 12343 + }, + { + "epoch": 0.16744438415626695, + "grad_norm": 6.345503330230713, + "learning_rate": 8.411264903384953e-06, + "loss": 0.3165, + "step": 12344 + }, + { + "epoch": 0.16745794899620184, + "grad_norm": 5.709238052368164, + "learning_rate": 8.411127860764698e-06, + "loss": 0.3475, + "step": 12345 + }, + { + "epoch": 0.16747151383613673, + "grad_norm": 5.506000518798828, + "learning_rate": 8.410990818144444e-06, + "loss": 0.3808, + "step": 12346 + }, + { + "epoch": 0.16748507867607162, + "grad_norm": 6.864359378814697, + "learning_rate": 8.410853775524189e-06, + "loss": 0.3879, + "step": 12347 + }, + { + "epoch": 0.16749864351600652, + "grad_norm": 6.666354179382324, + "learning_rate": 8.410716732903934e-06, + "loss": 0.4827, + "step": 12348 + }, + { + "epoch": 0.1675122083559414, + "grad_norm": 5.55627965927124, + "learning_rate": 8.41057969028368e-06, + "loss": 0.3508, + "step": 12349 + }, + { + "epoch": 0.16752577319587628, + "grad_norm": 5.615560531616211, + "learning_rate": 8.410442647663424e-06, + "loss": 0.3353, + "step": 12350 + }, + { + "epoch": 0.16753933803581117, + "grad_norm": 6.998128890991211, + "learning_rate": 8.410305605043168e-06, + "loss": 0.4727, + "step": 12351 + }, + { + "epoch": 0.16755290287574606, + "grad_norm": 7.763524532318115, + "learning_rate": 8.410168562422915e-06, + "loss": 0.3398, + "step": 12352 + }, + { + "epoch": 0.16756646771568096, + "grad_norm": 4.575573444366455, + "learning_rate": 8.41003151980266e-06, + "loss": 0.3279, + "step": 12353 + }, + { + "epoch": 0.16758003255561585, + "grad_norm": 7.782784461975098, + "learning_rate": 8.409894477182405e-06, + "loss": 0.4736, + "step": 12354 + }, + { + "epoch": 0.16759359739555074, + "grad_norm": 6.737850189208984, + "learning_rate": 8.409757434562149e-06, + "loss": 0.3444, + "step": 12355 + }, + { + "epoch": 0.16760716223548563, + "grad_norm": 5.251576900482178, + "learning_rate": 8.409620391941895e-06, + "loss": 0.2521, + "step": 12356 + }, + { + "epoch": 0.1676207270754205, + "grad_norm": 7.172349452972412, + "learning_rate": 8.40948334932164e-06, + "loss": 0.4668, + "step": 12357 + }, + { + "epoch": 0.1676342919153554, + "grad_norm": 5.850586414337158, + "learning_rate": 8.409346306701384e-06, + "loss": 0.3171, + "step": 12358 + }, + { + "epoch": 0.1676478567552903, + "grad_norm": 5.464574337005615, + "learning_rate": 8.40920926408113e-06, + "loss": 0.3721, + "step": 12359 + }, + { + "epoch": 0.16766142159522518, + "grad_norm": 9.100605010986328, + "learning_rate": 8.409072221460876e-06, + "loss": 0.4082, + "step": 12360 + }, + { + "epoch": 0.16767498643516007, + "grad_norm": 6.679776191711426, + "learning_rate": 8.40893517884062e-06, + "loss": 0.3577, + "step": 12361 + }, + { + "epoch": 0.16768855127509497, + "grad_norm": 8.801973342895508, + "learning_rate": 8.408798136220365e-06, + "loss": 0.5154, + "step": 12362 + }, + { + "epoch": 0.16770211611502983, + "grad_norm": 7.20944356918335, + "learning_rate": 8.40866109360011e-06, + "loss": 0.4469, + "step": 12363 + }, + { + "epoch": 0.16771568095496472, + "grad_norm": 6.96645450592041, + "learning_rate": 8.408524050979855e-06, + "loss": 0.5501, + "step": 12364 + }, + { + "epoch": 0.16772924579489962, + "grad_norm": 5.472700595855713, + "learning_rate": 8.4083870083596e-06, + "loss": 0.3515, + "step": 12365 + }, + { + "epoch": 0.1677428106348345, + "grad_norm": 6.703450679779053, + "learning_rate": 8.408249965739346e-06, + "loss": 0.3524, + "step": 12366 + }, + { + "epoch": 0.1677563754747694, + "grad_norm": 5.141674041748047, + "learning_rate": 8.40811292311909e-06, + "loss": 0.2386, + "step": 12367 + }, + { + "epoch": 0.1677699403147043, + "grad_norm": 4.64528751373291, + "learning_rate": 8.407975880498836e-06, + "loss": 0.2096, + "step": 12368 + }, + { + "epoch": 0.16778350515463916, + "grad_norm": 7.81040096282959, + "learning_rate": 8.407838837878581e-06, + "loss": 0.3271, + "step": 12369 + }, + { + "epoch": 0.16779706999457406, + "grad_norm": 6.349728584289551, + "learning_rate": 8.407701795258326e-06, + "loss": 0.3119, + "step": 12370 + }, + { + "epoch": 0.16781063483450895, + "grad_norm": 7.547850131988525, + "learning_rate": 8.407564752638071e-06, + "loss": 0.3075, + "step": 12371 + }, + { + "epoch": 0.16782419967444384, + "grad_norm": 4.687827110290527, + "learning_rate": 8.407427710017817e-06, + "loss": 0.2853, + "step": 12372 + }, + { + "epoch": 0.16783776451437873, + "grad_norm": 8.387872695922852, + "learning_rate": 8.407290667397562e-06, + "loss": 0.5394, + "step": 12373 + }, + { + "epoch": 0.16785132935431363, + "grad_norm": 6.118710041046143, + "learning_rate": 8.407153624777307e-06, + "loss": 0.2849, + "step": 12374 + }, + { + "epoch": 0.16786489419424852, + "grad_norm": 6.73574161529541, + "learning_rate": 8.407016582157052e-06, + "loss": 0.3665, + "step": 12375 + }, + { + "epoch": 0.16787845903418339, + "grad_norm": 6.174158096313477, + "learning_rate": 8.406879539536796e-06, + "loss": 0.4214, + "step": 12376 + }, + { + "epoch": 0.16789202387411828, + "grad_norm": 6.200061798095703, + "learning_rate": 8.40674249691654e-06, + "loss": 0.5449, + "step": 12377 + }, + { + "epoch": 0.16790558871405317, + "grad_norm": 7.646008491516113, + "learning_rate": 8.406605454296288e-06, + "loss": 0.4154, + "step": 12378 + }, + { + "epoch": 0.16791915355398807, + "grad_norm": 6.752880573272705, + "learning_rate": 8.406468411676031e-06, + "loss": 0.387, + "step": 12379 + }, + { + "epoch": 0.16793271839392296, + "grad_norm": 6.2384748458862305, + "learning_rate": 8.406331369055776e-06, + "loss": 0.3441, + "step": 12380 + }, + { + "epoch": 0.16794628323385785, + "grad_norm": 6.036713123321533, + "learning_rate": 8.406194326435522e-06, + "loss": 0.374, + "step": 12381 + }, + { + "epoch": 0.16795984807379272, + "grad_norm": 7.774405002593994, + "learning_rate": 8.406057283815268e-06, + "loss": 0.4675, + "step": 12382 + }, + { + "epoch": 0.1679734129137276, + "grad_norm": 6.5473527908325195, + "learning_rate": 8.405920241195012e-06, + "loss": 0.4114, + "step": 12383 + }, + { + "epoch": 0.1679869777536625, + "grad_norm": 5.070858478546143, + "learning_rate": 8.405783198574757e-06, + "loss": 0.2933, + "step": 12384 + }, + { + "epoch": 0.1680005425935974, + "grad_norm": 7.267040729522705, + "learning_rate": 8.405646155954502e-06, + "loss": 0.28, + "step": 12385 + }, + { + "epoch": 0.1680141074335323, + "grad_norm": 7.2271223068237305, + "learning_rate": 8.405509113334247e-06, + "loss": 0.4429, + "step": 12386 + }, + { + "epoch": 0.16802767227346718, + "grad_norm": 6.294147491455078, + "learning_rate": 8.405372070713993e-06, + "loss": 0.5006, + "step": 12387 + }, + { + "epoch": 0.16804123711340208, + "grad_norm": 7.288929462432861, + "learning_rate": 8.405235028093738e-06, + "loss": 0.4762, + "step": 12388 + }, + { + "epoch": 0.16805480195333694, + "grad_norm": 8.102798461914062, + "learning_rate": 8.405097985473483e-06, + "loss": 0.5559, + "step": 12389 + }, + { + "epoch": 0.16806836679327183, + "grad_norm": 7.96359920501709, + "learning_rate": 8.404960942853228e-06, + "loss": 0.4132, + "step": 12390 + }, + { + "epoch": 0.16808193163320673, + "grad_norm": 7.061140537261963, + "learning_rate": 8.404823900232973e-06, + "loss": 0.3785, + "step": 12391 + }, + { + "epoch": 0.16809549647314162, + "grad_norm": 7.739774227142334, + "learning_rate": 8.404686857612718e-06, + "loss": 0.4138, + "step": 12392 + }, + { + "epoch": 0.1681090613130765, + "grad_norm": 8.550164222717285, + "learning_rate": 8.404549814992464e-06, + "loss": 0.4082, + "step": 12393 + }, + { + "epoch": 0.1681226261530114, + "grad_norm": 6.741666793823242, + "learning_rate": 8.404412772372207e-06, + "loss": 0.4045, + "step": 12394 + }, + { + "epoch": 0.16813619099294627, + "grad_norm": 7.519077777862549, + "learning_rate": 8.404275729751954e-06, + "loss": 0.4649, + "step": 12395 + }, + { + "epoch": 0.16814975583288116, + "grad_norm": 7.184960842132568, + "learning_rate": 8.4041386871317e-06, + "loss": 0.4266, + "step": 12396 + }, + { + "epoch": 0.16816332067281606, + "grad_norm": 7.1173200607299805, + "learning_rate": 8.404001644511444e-06, + "loss": 0.4928, + "step": 12397 + }, + { + "epoch": 0.16817688551275095, + "grad_norm": 7.576448917388916, + "learning_rate": 8.403864601891188e-06, + "loss": 0.3697, + "step": 12398 + }, + { + "epoch": 0.16819045035268584, + "grad_norm": 7.434964656829834, + "learning_rate": 8.403727559270935e-06, + "loss": 0.6853, + "step": 12399 + }, + { + "epoch": 0.16820401519262074, + "grad_norm": 8.522956848144531, + "learning_rate": 8.40359051665068e-06, + "loss": 0.4866, + "step": 12400 + }, + { + "epoch": 0.1682175800325556, + "grad_norm": 7.238915920257568, + "learning_rate": 8.403453474030423e-06, + "loss": 0.3946, + "step": 12401 + }, + { + "epoch": 0.1682311448724905, + "grad_norm": 6.948670387268066, + "learning_rate": 8.403316431410169e-06, + "loss": 0.4194, + "step": 12402 + }, + { + "epoch": 0.1682447097124254, + "grad_norm": 7.101099491119385, + "learning_rate": 8.403179388789915e-06, + "loss": 0.4898, + "step": 12403 + }, + { + "epoch": 0.16825827455236028, + "grad_norm": 8.61961841583252, + "learning_rate": 8.403042346169659e-06, + "loss": 0.5002, + "step": 12404 + }, + { + "epoch": 0.16827183939229517, + "grad_norm": 5.566004276275635, + "learning_rate": 8.402905303549404e-06, + "loss": 0.3171, + "step": 12405 + }, + { + "epoch": 0.16828540423223007, + "grad_norm": 11.553153991699219, + "learning_rate": 8.40276826092915e-06, + "loss": 0.5805, + "step": 12406 + }, + { + "epoch": 0.16829896907216496, + "grad_norm": 8.898443222045898, + "learning_rate": 8.402631218308894e-06, + "loss": 0.5013, + "step": 12407 + }, + { + "epoch": 0.16831253391209983, + "grad_norm": 6.746703147888184, + "learning_rate": 8.40249417568864e-06, + "loss": 0.4701, + "step": 12408 + }, + { + "epoch": 0.16832609875203472, + "grad_norm": 5.431395530700684, + "learning_rate": 8.402357133068385e-06, + "loss": 0.3593, + "step": 12409 + }, + { + "epoch": 0.1683396635919696, + "grad_norm": 7.256744384765625, + "learning_rate": 8.40222009044813e-06, + "loss": 0.4312, + "step": 12410 + }, + { + "epoch": 0.1683532284319045, + "grad_norm": 6.717574119567871, + "learning_rate": 8.402083047827875e-06, + "loss": 0.4333, + "step": 12411 + }, + { + "epoch": 0.1683667932718394, + "grad_norm": 6.899715423583984, + "learning_rate": 8.40194600520762e-06, + "loss": 0.3935, + "step": 12412 + }, + { + "epoch": 0.1683803581117743, + "grad_norm": 5.602436542510986, + "learning_rate": 8.401808962587366e-06, + "loss": 0.3247, + "step": 12413 + }, + { + "epoch": 0.16839392295170916, + "grad_norm": 6.820806980133057, + "learning_rate": 8.40167191996711e-06, + "loss": 0.4364, + "step": 12414 + }, + { + "epoch": 0.16840748779164405, + "grad_norm": 8.251086235046387, + "learning_rate": 8.401534877346856e-06, + "loss": 0.396, + "step": 12415 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 7.634778022766113, + "learning_rate": 8.401397834726601e-06, + "loss": 0.4807, + "step": 12416 + }, + { + "epoch": 0.16843461747151384, + "grad_norm": 7.218190670013428, + "learning_rate": 8.401260792106346e-06, + "loss": 0.402, + "step": 12417 + }, + { + "epoch": 0.16844818231144873, + "grad_norm": 7.478256702423096, + "learning_rate": 8.401123749486091e-06, + "loss": 0.3883, + "step": 12418 + }, + { + "epoch": 0.16846174715138362, + "grad_norm": 6.777125835418701, + "learning_rate": 8.400986706865835e-06, + "loss": 0.3266, + "step": 12419 + }, + { + "epoch": 0.16847531199131852, + "grad_norm": 6.659257411956787, + "learning_rate": 8.40084966424558e-06, + "loss": 0.4452, + "step": 12420 + }, + { + "epoch": 0.16848887683125338, + "grad_norm": 5.747942924499512, + "learning_rate": 8.400712621625327e-06, + "loss": 0.3433, + "step": 12421 + }, + { + "epoch": 0.16850244167118827, + "grad_norm": 7.511585712432861, + "learning_rate": 8.400575579005072e-06, + "loss": 0.4265, + "step": 12422 + }, + { + "epoch": 0.16851600651112317, + "grad_norm": 5.745375156402588, + "learning_rate": 8.400438536384816e-06, + "loss": 0.3605, + "step": 12423 + }, + { + "epoch": 0.16852957135105806, + "grad_norm": 8.507701873779297, + "learning_rate": 8.40030149376456e-06, + "loss": 0.644, + "step": 12424 + }, + { + "epoch": 0.16854313619099295, + "grad_norm": 7.746775150299072, + "learning_rate": 8.400164451144308e-06, + "loss": 0.443, + "step": 12425 + }, + { + "epoch": 0.16855670103092785, + "grad_norm": 5.3509674072265625, + "learning_rate": 8.400027408524051e-06, + "loss": 0.349, + "step": 12426 + }, + { + "epoch": 0.1685702658708627, + "grad_norm": 5.476999759674072, + "learning_rate": 8.399890365903796e-06, + "loss": 0.3137, + "step": 12427 + }, + { + "epoch": 0.1685838307107976, + "grad_norm": 6.858190536499023, + "learning_rate": 8.399753323283542e-06, + "loss": 0.4368, + "step": 12428 + }, + { + "epoch": 0.1685973955507325, + "grad_norm": 6.641676902770996, + "learning_rate": 8.399616280663287e-06, + "loss": 0.403, + "step": 12429 + }, + { + "epoch": 0.1686109603906674, + "grad_norm": 7.202661037445068, + "learning_rate": 8.399479238043032e-06, + "loss": 0.3908, + "step": 12430 + }, + { + "epoch": 0.16862452523060228, + "grad_norm": 9.804198265075684, + "learning_rate": 8.399342195422777e-06, + "loss": 0.4203, + "step": 12431 + }, + { + "epoch": 0.16863809007053718, + "grad_norm": 5.790496349334717, + "learning_rate": 8.399205152802522e-06, + "loss": 0.3594, + "step": 12432 + }, + { + "epoch": 0.16865165491047204, + "grad_norm": 6.49212646484375, + "learning_rate": 8.399068110182267e-06, + "loss": 0.3291, + "step": 12433 + }, + { + "epoch": 0.16866521975040694, + "grad_norm": 8.067169189453125, + "learning_rate": 8.398931067562013e-06, + "loss": 0.6194, + "step": 12434 + }, + { + "epoch": 0.16867878459034183, + "grad_norm": 9.388004302978516, + "learning_rate": 8.398794024941758e-06, + "loss": 0.5033, + "step": 12435 + }, + { + "epoch": 0.16869234943027672, + "grad_norm": 7.151809215545654, + "learning_rate": 8.398656982321503e-06, + "loss": 0.4034, + "step": 12436 + }, + { + "epoch": 0.16870591427021162, + "grad_norm": 7.463497161865234, + "learning_rate": 8.398519939701248e-06, + "loss": 0.5078, + "step": 12437 + }, + { + "epoch": 0.1687194791101465, + "grad_norm": 5.463535785675049, + "learning_rate": 8.398382897080993e-06, + "loss": 0.3632, + "step": 12438 + }, + { + "epoch": 0.1687330439500814, + "grad_norm": 5.992488861083984, + "learning_rate": 8.398245854460739e-06, + "loss": 0.2909, + "step": 12439 + }, + { + "epoch": 0.16874660879001627, + "grad_norm": 5.198352813720703, + "learning_rate": 8.398108811840484e-06, + "loss": 0.2793, + "step": 12440 + }, + { + "epoch": 0.16876017362995116, + "grad_norm": 7.00978422164917, + "learning_rate": 8.397971769220227e-06, + "loss": 0.3865, + "step": 12441 + }, + { + "epoch": 0.16877373846988605, + "grad_norm": 6.173305034637451, + "learning_rate": 8.397834726599974e-06, + "loss": 0.3806, + "step": 12442 + }, + { + "epoch": 0.16878730330982095, + "grad_norm": 7.906023025512695, + "learning_rate": 8.39769768397972e-06, + "loss": 0.4172, + "step": 12443 + }, + { + "epoch": 0.16880086814975584, + "grad_norm": 6.772793769836426, + "learning_rate": 8.397560641359463e-06, + "loss": 0.5123, + "step": 12444 + }, + { + "epoch": 0.16881443298969073, + "grad_norm": 7.366125106811523, + "learning_rate": 8.397423598739208e-06, + "loss": 0.4378, + "step": 12445 + }, + { + "epoch": 0.1688279978296256, + "grad_norm": 6.419791221618652, + "learning_rate": 8.397286556118953e-06, + "loss": 0.3114, + "step": 12446 + }, + { + "epoch": 0.1688415626695605, + "grad_norm": 5.610787391662598, + "learning_rate": 8.3971495134987e-06, + "loss": 0.2624, + "step": 12447 + }, + { + "epoch": 0.16885512750949538, + "grad_norm": 9.457368850708008, + "learning_rate": 8.397012470878443e-06, + "loss": 0.4656, + "step": 12448 + }, + { + "epoch": 0.16886869234943028, + "grad_norm": 4.026615619659424, + "learning_rate": 8.396875428258189e-06, + "loss": 0.2126, + "step": 12449 + }, + { + "epoch": 0.16888225718936517, + "grad_norm": 5.122289180755615, + "learning_rate": 8.396738385637934e-06, + "loss": 0.258, + "step": 12450 + }, + { + "epoch": 0.16889582202930006, + "grad_norm": 4.193695545196533, + "learning_rate": 8.396601343017679e-06, + "loss": 0.2083, + "step": 12451 + }, + { + "epoch": 0.16890938686923496, + "grad_norm": 8.105428695678711, + "learning_rate": 8.396464300397424e-06, + "loss": 0.4215, + "step": 12452 + }, + { + "epoch": 0.16892295170916982, + "grad_norm": 5.196881294250488, + "learning_rate": 8.39632725777717e-06, + "loss": 0.3487, + "step": 12453 + }, + { + "epoch": 0.16893651654910471, + "grad_norm": 4.6861796379089355, + "learning_rate": 8.396190215156914e-06, + "loss": 0.2622, + "step": 12454 + }, + { + "epoch": 0.1689500813890396, + "grad_norm": 4.305256366729736, + "learning_rate": 8.39605317253666e-06, + "loss": 0.3093, + "step": 12455 + }, + { + "epoch": 0.1689636462289745, + "grad_norm": 5.592179775238037, + "learning_rate": 8.395916129916405e-06, + "loss": 0.2084, + "step": 12456 + }, + { + "epoch": 0.1689772110689094, + "grad_norm": 6.863164901733398, + "learning_rate": 8.39577908729615e-06, + "loss": 0.3469, + "step": 12457 + }, + { + "epoch": 0.1689907759088443, + "grad_norm": 5.029214382171631, + "learning_rate": 8.395642044675895e-06, + "loss": 0.3617, + "step": 12458 + }, + { + "epoch": 0.16900434074877915, + "grad_norm": 6.759398937225342, + "learning_rate": 8.395505002055639e-06, + "loss": 0.4556, + "step": 12459 + }, + { + "epoch": 0.16901790558871405, + "grad_norm": 6.888267517089844, + "learning_rate": 8.395367959435386e-06, + "loss": 0.4638, + "step": 12460 + }, + { + "epoch": 0.16903147042864894, + "grad_norm": 5.291098117828369, + "learning_rate": 8.39523091681513e-06, + "loss": 0.2399, + "step": 12461 + }, + { + "epoch": 0.16904503526858383, + "grad_norm": 8.24073314666748, + "learning_rate": 8.395093874194876e-06, + "loss": 0.4531, + "step": 12462 + }, + { + "epoch": 0.16905860010851873, + "grad_norm": 6.366119861602783, + "learning_rate": 8.39495683157462e-06, + "loss": 0.3614, + "step": 12463 + }, + { + "epoch": 0.16907216494845362, + "grad_norm": 6.707075119018555, + "learning_rate": 8.394819788954366e-06, + "loss": 0.4949, + "step": 12464 + }, + { + "epoch": 0.16908572978838848, + "grad_norm": 8.291574478149414, + "learning_rate": 8.394682746334111e-06, + "loss": 0.4002, + "step": 12465 + }, + { + "epoch": 0.16909929462832338, + "grad_norm": 6.907360076904297, + "learning_rate": 8.394545703713855e-06, + "loss": 0.3874, + "step": 12466 + }, + { + "epoch": 0.16911285946825827, + "grad_norm": 5.595609664916992, + "learning_rate": 8.3944086610936e-06, + "loss": 0.3831, + "step": 12467 + }, + { + "epoch": 0.16912642430819316, + "grad_norm": 6.250299453735352, + "learning_rate": 8.394271618473347e-06, + "loss": 0.3934, + "step": 12468 + }, + { + "epoch": 0.16913998914812806, + "grad_norm": 7.233833312988281, + "learning_rate": 8.39413457585309e-06, + "loss": 0.3744, + "step": 12469 + }, + { + "epoch": 0.16915355398806295, + "grad_norm": 8.879958152770996, + "learning_rate": 8.393997533232836e-06, + "loss": 0.5175, + "step": 12470 + }, + { + "epoch": 0.16916711882799784, + "grad_norm": 5.276536464691162, + "learning_rate": 8.39386049061258e-06, + "loss": 0.35, + "step": 12471 + }, + { + "epoch": 0.1691806836679327, + "grad_norm": 5.233087062835693, + "learning_rate": 8.393723447992326e-06, + "loss": 0.3066, + "step": 12472 + }, + { + "epoch": 0.1691942485078676, + "grad_norm": 6.2095160484313965, + "learning_rate": 8.393586405372071e-06, + "loss": 0.3418, + "step": 12473 + }, + { + "epoch": 0.1692078133478025, + "grad_norm": 5.183384418487549, + "learning_rate": 8.393449362751816e-06, + "loss": 0.2383, + "step": 12474 + }, + { + "epoch": 0.1692213781877374, + "grad_norm": 5.713565826416016, + "learning_rate": 8.393312320131562e-06, + "loss": 0.2966, + "step": 12475 + }, + { + "epoch": 0.16923494302767228, + "grad_norm": 5.646799087524414, + "learning_rate": 8.393175277511307e-06, + "loss": 0.3488, + "step": 12476 + }, + { + "epoch": 0.16924850786760717, + "grad_norm": 8.177184104919434, + "learning_rate": 8.393038234891052e-06, + "loss": 0.3811, + "step": 12477 + }, + { + "epoch": 0.16926207270754204, + "grad_norm": 5.45257568359375, + "learning_rate": 8.392901192270797e-06, + "loss": 0.3458, + "step": 12478 + }, + { + "epoch": 0.16927563754747693, + "grad_norm": 4.982600688934326, + "learning_rate": 8.392764149650542e-06, + "loss": 0.2186, + "step": 12479 + }, + { + "epoch": 0.16928920238741182, + "grad_norm": 5.521381855010986, + "learning_rate": 8.392627107030287e-06, + "loss": 0.2086, + "step": 12480 + }, + { + "epoch": 0.16930276722734672, + "grad_norm": 4.542798042297363, + "learning_rate": 8.392490064410033e-06, + "loss": 0.2223, + "step": 12481 + }, + { + "epoch": 0.1693163320672816, + "grad_norm": 6.345457553863525, + "learning_rate": 8.392353021789778e-06, + "loss": 0.2314, + "step": 12482 + }, + { + "epoch": 0.1693298969072165, + "grad_norm": 5.718003749847412, + "learning_rate": 8.392215979169523e-06, + "loss": 0.3373, + "step": 12483 + }, + { + "epoch": 0.1693434617471514, + "grad_norm": 7.135181427001953, + "learning_rate": 8.392078936549266e-06, + "loss": 0.3494, + "step": 12484 + }, + { + "epoch": 0.16935702658708626, + "grad_norm": 5.285223484039307, + "learning_rate": 8.391941893929013e-06, + "loss": 0.3, + "step": 12485 + }, + { + "epoch": 0.16937059142702116, + "grad_norm": 4.688239574432373, + "learning_rate": 8.391804851308759e-06, + "loss": 0.319, + "step": 12486 + }, + { + "epoch": 0.16938415626695605, + "grad_norm": 4.132099151611328, + "learning_rate": 8.391667808688502e-06, + "loss": 0.1981, + "step": 12487 + }, + { + "epoch": 0.16939772110689094, + "grad_norm": 6.126128673553467, + "learning_rate": 8.391530766068247e-06, + "loss": 0.3465, + "step": 12488 + }, + { + "epoch": 0.16941128594682583, + "grad_norm": 6.905009746551514, + "learning_rate": 8.391393723447992e-06, + "loss": 0.2952, + "step": 12489 + }, + { + "epoch": 0.16942485078676073, + "grad_norm": 5.5080413818359375, + "learning_rate": 8.39125668082774e-06, + "loss": 0.256, + "step": 12490 + }, + { + "epoch": 0.1694384156266956, + "grad_norm": 5.432858943939209, + "learning_rate": 8.391119638207483e-06, + "loss": 0.3038, + "step": 12491 + }, + { + "epoch": 0.1694519804666305, + "grad_norm": 5.47570276260376, + "learning_rate": 8.390982595587228e-06, + "loss": 0.3703, + "step": 12492 + }, + { + "epoch": 0.16946554530656538, + "grad_norm": 4.670064449310303, + "learning_rate": 8.390845552966973e-06, + "loss": 0.2949, + "step": 12493 + }, + { + "epoch": 0.16947911014650027, + "grad_norm": 6.081723690032959, + "learning_rate": 8.390708510346718e-06, + "loss": 0.3241, + "step": 12494 + }, + { + "epoch": 0.16949267498643517, + "grad_norm": 4.791399955749512, + "learning_rate": 8.390571467726463e-06, + "loss": 0.2351, + "step": 12495 + }, + { + "epoch": 0.16950623982637006, + "grad_norm": 5.624195575714111, + "learning_rate": 8.390434425106209e-06, + "loss": 0.3058, + "step": 12496 + }, + { + "epoch": 0.16951980466630492, + "grad_norm": 5.826587200164795, + "learning_rate": 8.390297382485954e-06, + "loss": 0.3666, + "step": 12497 + }, + { + "epoch": 0.16953336950623982, + "grad_norm": 5.690701961517334, + "learning_rate": 8.390160339865699e-06, + "loss": 0.2784, + "step": 12498 + }, + { + "epoch": 0.1695469343461747, + "grad_norm": 6.396544933319092, + "learning_rate": 8.390023297245444e-06, + "loss": 0.3486, + "step": 12499 + }, + { + "epoch": 0.1695604991861096, + "grad_norm": 6.881139278411865, + "learning_rate": 8.38988625462519e-06, + "loss": 0.5052, + "step": 12500 + }, + { + "epoch": 0.1695740640260445, + "grad_norm": 6.509136199951172, + "learning_rate": 8.389749212004935e-06, + "loss": 0.4493, + "step": 12501 + }, + { + "epoch": 0.1695876288659794, + "grad_norm": 5.5384979248046875, + "learning_rate": 8.389612169384678e-06, + "loss": 0.3358, + "step": 12502 + }, + { + "epoch": 0.16960119370591428, + "grad_norm": 6.548900127410889, + "learning_rate": 8.389475126764425e-06, + "loss": 0.421, + "step": 12503 + }, + { + "epoch": 0.16961475854584915, + "grad_norm": 6.3850579261779785, + "learning_rate": 8.38933808414417e-06, + "loss": 0.2349, + "step": 12504 + }, + { + "epoch": 0.16962832338578404, + "grad_norm": 7.789031028747559, + "learning_rate": 8.389201041523915e-06, + "loss": 0.4827, + "step": 12505 + }, + { + "epoch": 0.16964188822571893, + "grad_norm": 5.423665523529053, + "learning_rate": 8.389063998903659e-06, + "loss": 0.2907, + "step": 12506 + }, + { + "epoch": 0.16965545306565383, + "grad_norm": 8.219799041748047, + "learning_rate": 8.388926956283406e-06, + "loss": 0.4946, + "step": 12507 + }, + { + "epoch": 0.16966901790558872, + "grad_norm": 9.152948379516602, + "learning_rate": 8.38878991366315e-06, + "loss": 0.6535, + "step": 12508 + }, + { + "epoch": 0.1696825827455236, + "grad_norm": 6.402649402618408, + "learning_rate": 8.388652871042894e-06, + "loss": 0.4367, + "step": 12509 + }, + { + "epoch": 0.16969614758545848, + "grad_norm": 5.163474082946777, + "learning_rate": 8.38851582842264e-06, + "loss": 0.2853, + "step": 12510 + }, + { + "epoch": 0.16970971242539337, + "grad_norm": 6.934104919433594, + "learning_rate": 8.388378785802386e-06, + "loss": 0.4662, + "step": 12511 + }, + { + "epoch": 0.16972327726532827, + "grad_norm": 6.444970607757568, + "learning_rate": 8.38824174318213e-06, + "loss": 0.3868, + "step": 12512 + }, + { + "epoch": 0.16973684210526316, + "grad_norm": 7.791102886199951, + "learning_rate": 8.388104700561875e-06, + "loss": 0.5091, + "step": 12513 + }, + { + "epoch": 0.16975040694519805, + "grad_norm": 4.941915035247803, + "learning_rate": 8.38796765794162e-06, + "loss": 0.3218, + "step": 12514 + }, + { + "epoch": 0.16976397178513294, + "grad_norm": 5.011888027191162, + "learning_rate": 8.387830615321365e-06, + "loss": 0.3176, + "step": 12515 + }, + { + "epoch": 0.16977753662506784, + "grad_norm": 5.995962619781494, + "learning_rate": 8.38769357270111e-06, + "loss": 0.3182, + "step": 12516 + }, + { + "epoch": 0.1697911014650027, + "grad_norm": 6.528904438018799, + "learning_rate": 8.387556530080856e-06, + "loss": 0.3308, + "step": 12517 + }, + { + "epoch": 0.1698046663049376, + "grad_norm": 6.189830780029297, + "learning_rate": 8.387419487460601e-06, + "loss": 0.3801, + "step": 12518 + }, + { + "epoch": 0.1698182311448725, + "grad_norm": 6.030525207519531, + "learning_rate": 8.387282444840346e-06, + "loss": 0.4066, + "step": 12519 + }, + { + "epoch": 0.16983179598480738, + "grad_norm": 8.686582565307617, + "learning_rate": 8.387145402220091e-06, + "loss": 0.4694, + "step": 12520 + }, + { + "epoch": 0.16984536082474228, + "grad_norm": 6.067732334136963, + "learning_rate": 8.387008359599836e-06, + "loss": 0.3574, + "step": 12521 + }, + { + "epoch": 0.16985892566467717, + "grad_norm": 4.905946731567383, + "learning_rate": 8.386871316979582e-06, + "loss": 0.3363, + "step": 12522 + }, + { + "epoch": 0.16987249050461203, + "grad_norm": 4.788383483886719, + "learning_rate": 8.386734274359327e-06, + "loss": 0.2548, + "step": 12523 + }, + { + "epoch": 0.16988605534454693, + "grad_norm": 5.769965648651123, + "learning_rate": 8.386597231739072e-06, + "loss": 0.3108, + "step": 12524 + }, + { + "epoch": 0.16989962018448182, + "grad_norm": 5.946588516235352, + "learning_rate": 8.386460189118817e-06, + "loss": 0.2679, + "step": 12525 + }, + { + "epoch": 0.1699131850244167, + "grad_norm": 3.6355233192443848, + "learning_rate": 8.386323146498562e-06, + "loss": 0.3034, + "step": 12526 + }, + { + "epoch": 0.1699267498643516, + "grad_norm": 4.769613742828369, + "learning_rate": 8.386186103878306e-06, + "loss": 0.2398, + "step": 12527 + }, + { + "epoch": 0.1699403147042865, + "grad_norm": 6.346920490264893, + "learning_rate": 8.386049061258051e-06, + "loss": 0.2686, + "step": 12528 + }, + { + "epoch": 0.16995387954422136, + "grad_norm": 7.263872146606445, + "learning_rate": 8.385912018637798e-06, + "loss": 0.4977, + "step": 12529 + }, + { + "epoch": 0.16996744438415626, + "grad_norm": 4.433160781860352, + "learning_rate": 8.385774976017543e-06, + "loss": 0.2142, + "step": 12530 + }, + { + "epoch": 0.16998100922409115, + "grad_norm": 6.097153663635254, + "learning_rate": 8.385637933397286e-06, + "loss": 0.288, + "step": 12531 + }, + { + "epoch": 0.16999457406402604, + "grad_norm": 5.914606094360352, + "learning_rate": 8.385500890777032e-06, + "loss": 0.3495, + "step": 12532 + }, + { + "epoch": 0.17000813890396094, + "grad_norm": 5.906896114349365, + "learning_rate": 8.385363848156779e-06, + "loss": 0.3324, + "step": 12533 + }, + { + "epoch": 0.17002170374389583, + "grad_norm": 4.94691801071167, + "learning_rate": 8.385226805536522e-06, + "loss": 0.372, + "step": 12534 + }, + { + "epoch": 0.17003526858383072, + "grad_norm": 6.348870277404785, + "learning_rate": 8.385089762916267e-06, + "loss": 0.3525, + "step": 12535 + }, + { + "epoch": 0.1700488334237656, + "grad_norm": 5.606594562530518, + "learning_rate": 8.384952720296012e-06, + "loss": 0.2625, + "step": 12536 + }, + { + "epoch": 0.17006239826370048, + "grad_norm": 4.861709117889404, + "learning_rate": 8.384815677675758e-06, + "loss": 0.2409, + "step": 12537 + }, + { + "epoch": 0.17007596310363537, + "grad_norm": 4.219320297241211, + "learning_rate": 8.384678635055503e-06, + "loss": 0.2117, + "step": 12538 + }, + { + "epoch": 0.17008952794357027, + "grad_norm": 4.986706256866455, + "learning_rate": 8.384541592435248e-06, + "loss": 0.2504, + "step": 12539 + }, + { + "epoch": 0.17010309278350516, + "grad_norm": 5.078813076019287, + "learning_rate": 8.384404549814993e-06, + "loss": 0.3172, + "step": 12540 + }, + { + "epoch": 0.17011665762344005, + "grad_norm": 5.816190242767334, + "learning_rate": 8.384267507194738e-06, + "loss": 0.3062, + "step": 12541 + }, + { + "epoch": 0.17013022246337492, + "grad_norm": 7.000774383544922, + "learning_rate": 8.384130464574483e-06, + "loss": 0.3966, + "step": 12542 + }, + { + "epoch": 0.1701437873033098, + "grad_norm": 7.583296775817871, + "learning_rate": 8.383993421954229e-06, + "loss": 0.3621, + "step": 12543 + }, + { + "epoch": 0.1701573521432447, + "grad_norm": 4.615065097808838, + "learning_rate": 8.383856379333974e-06, + "loss": 0.321, + "step": 12544 + }, + { + "epoch": 0.1701709169831796, + "grad_norm": 4.125748634338379, + "learning_rate": 8.383719336713719e-06, + "loss": 0.1602, + "step": 12545 + }, + { + "epoch": 0.1701844818231145, + "grad_norm": 7.292916297912598, + "learning_rate": 8.383582294093464e-06, + "loss": 0.2942, + "step": 12546 + }, + { + "epoch": 0.17019804666304938, + "grad_norm": 6.924049377441406, + "learning_rate": 8.38344525147321e-06, + "loss": 0.3909, + "step": 12547 + }, + { + "epoch": 0.17021161150298428, + "grad_norm": 7.251497745513916, + "learning_rate": 8.383308208852955e-06, + "loss": 0.4304, + "step": 12548 + }, + { + "epoch": 0.17022517634291914, + "grad_norm": 10.940756797790527, + "learning_rate": 8.383171166232698e-06, + "loss": 0.609, + "step": 12549 + }, + { + "epoch": 0.17023874118285404, + "grad_norm": 5.452278137207031, + "learning_rate": 8.383034123612445e-06, + "loss": 0.3294, + "step": 12550 + }, + { + "epoch": 0.17025230602278893, + "grad_norm": 6.201940059661865, + "learning_rate": 8.38289708099219e-06, + "loss": 0.2974, + "step": 12551 + }, + { + "epoch": 0.17026587086272382, + "grad_norm": 5.8427653312683105, + "learning_rate": 8.382760038371934e-06, + "loss": 0.312, + "step": 12552 + }, + { + "epoch": 0.17027943570265872, + "grad_norm": 7.117753505706787, + "learning_rate": 8.382622995751679e-06, + "loss": 0.3969, + "step": 12553 + }, + { + "epoch": 0.1702930005425936, + "grad_norm": 5.191544532775879, + "learning_rate": 8.382485953131426e-06, + "loss": 0.3428, + "step": 12554 + }, + { + "epoch": 0.17030656538252847, + "grad_norm": 7.278018951416016, + "learning_rate": 8.38234891051117e-06, + "loss": 0.2669, + "step": 12555 + }, + { + "epoch": 0.17032013022246337, + "grad_norm": 7.515079975128174, + "learning_rate": 8.382211867890914e-06, + "loss": 0.4504, + "step": 12556 + }, + { + "epoch": 0.17033369506239826, + "grad_norm": 4.741102695465088, + "learning_rate": 8.38207482527066e-06, + "loss": 0.2314, + "step": 12557 + }, + { + "epoch": 0.17034725990233315, + "grad_norm": 6.899774551391602, + "learning_rate": 8.381937782650405e-06, + "loss": 0.4535, + "step": 12558 + }, + { + "epoch": 0.17036082474226805, + "grad_norm": 5.199121952056885, + "learning_rate": 8.38180074003015e-06, + "loss": 0.3814, + "step": 12559 + }, + { + "epoch": 0.17037438958220294, + "grad_norm": 4.673211097717285, + "learning_rate": 8.381663697409895e-06, + "loss": 0.2523, + "step": 12560 + }, + { + "epoch": 0.1703879544221378, + "grad_norm": 4.7538275718688965, + "learning_rate": 8.38152665478964e-06, + "loss": 0.314, + "step": 12561 + }, + { + "epoch": 0.1704015192620727, + "grad_norm": 7.502269268035889, + "learning_rate": 8.381389612169385e-06, + "loss": 0.408, + "step": 12562 + }, + { + "epoch": 0.1704150841020076, + "grad_norm": 4.802136421203613, + "learning_rate": 8.38125256954913e-06, + "loss": 0.2947, + "step": 12563 + }, + { + "epoch": 0.17042864894194248, + "grad_norm": 5.885307788848877, + "learning_rate": 8.381115526928876e-06, + "loss": 0.4586, + "step": 12564 + }, + { + "epoch": 0.17044221378187738, + "grad_norm": 7.027400016784668, + "learning_rate": 8.380978484308621e-06, + "loss": 0.3414, + "step": 12565 + }, + { + "epoch": 0.17045577862181227, + "grad_norm": 5.897618770599365, + "learning_rate": 8.380841441688366e-06, + "loss": 0.2836, + "step": 12566 + }, + { + "epoch": 0.17046934346174716, + "grad_norm": 7.075929641723633, + "learning_rate": 8.380704399068111e-06, + "loss": 0.3751, + "step": 12567 + }, + { + "epoch": 0.17048290830168203, + "grad_norm": 4.951437473297119, + "learning_rate": 8.380567356447856e-06, + "loss": 0.2135, + "step": 12568 + }, + { + "epoch": 0.17049647314161692, + "grad_norm": 6.796904563903809, + "learning_rate": 8.380430313827602e-06, + "loss": 0.3927, + "step": 12569 + }, + { + "epoch": 0.17051003798155182, + "grad_norm": 5.652533531188965, + "learning_rate": 8.380293271207347e-06, + "loss": 0.3633, + "step": 12570 + }, + { + "epoch": 0.1705236028214867, + "grad_norm": 5.102812767028809, + "learning_rate": 8.38015622858709e-06, + "loss": 0.2462, + "step": 12571 + }, + { + "epoch": 0.1705371676614216, + "grad_norm": 5.247172832489014, + "learning_rate": 8.380019185966837e-06, + "loss": 0.2856, + "step": 12572 + }, + { + "epoch": 0.1705507325013565, + "grad_norm": 6.4342546463012695, + "learning_rate": 8.379882143346582e-06, + "loss": 0.3359, + "step": 12573 + }, + { + "epoch": 0.17056429734129136, + "grad_norm": 6.492471218109131, + "learning_rate": 8.379745100726326e-06, + "loss": 0.3698, + "step": 12574 + }, + { + "epoch": 0.17057786218122625, + "grad_norm": 5.960079669952393, + "learning_rate": 8.379608058106071e-06, + "loss": 0.2733, + "step": 12575 + }, + { + "epoch": 0.17059142702116115, + "grad_norm": 8.511343955993652, + "learning_rate": 8.379471015485818e-06, + "loss": 0.5388, + "step": 12576 + }, + { + "epoch": 0.17060499186109604, + "grad_norm": 7.289690971374512, + "learning_rate": 8.379333972865561e-06, + "loss": 0.3311, + "step": 12577 + }, + { + "epoch": 0.17061855670103093, + "grad_norm": 5.623840808868408, + "learning_rate": 8.379196930245307e-06, + "loss": 0.4164, + "step": 12578 + }, + { + "epoch": 0.17063212154096583, + "grad_norm": 5.503202438354492, + "learning_rate": 8.379059887625052e-06, + "loss": 0.2763, + "step": 12579 + }, + { + "epoch": 0.17064568638090072, + "grad_norm": 6.583444595336914, + "learning_rate": 8.378922845004797e-06, + "loss": 0.32, + "step": 12580 + }, + { + "epoch": 0.17065925122083558, + "grad_norm": 5.751636505126953, + "learning_rate": 8.378785802384542e-06, + "loss": 0.2828, + "step": 12581 + }, + { + "epoch": 0.17067281606077048, + "grad_norm": 7.422565460205078, + "learning_rate": 8.378648759764287e-06, + "loss": 0.3818, + "step": 12582 + }, + { + "epoch": 0.17068638090070537, + "grad_norm": 4.269686698913574, + "learning_rate": 8.378511717144032e-06, + "loss": 0.2293, + "step": 12583 + }, + { + "epoch": 0.17069994574064026, + "grad_norm": 3.961606025695801, + "learning_rate": 8.378374674523778e-06, + "loss": 0.1567, + "step": 12584 + }, + { + "epoch": 0.17071351058057516, + "grad_norm": 4.434669494628906, + "learning_rate": 8.378237631903523e-06, + "loss": 0.2874, + "step": 12585 + }, + { + "epoch": 0.17072707542051005, + "grad_norm": 4.034399509429932, + "learning_rate": 8.378100589283268e-06, + "loss": 0.2918, + "step": 12586 + }, + { + "epoch": 0.17074064026044491, + "grad_norm": 5.513166427612305, + "learning_rate": 8.377963546663013e-06, + "loss": 0.261, + "step": 12587 + }, + { + "epoch": 0.1707542051003798, + "grad_norm": 5.579530715942383, + "learning_rate": 8.377826504042758e-06, + "loss": 0.1625, + "step": 12588 + }, + { + "epoch": 0.1707677699403147, + "grad_norm": 5.988185882568359, + "learning_rate": 8.377689461422503e-06, + "loss": 0.3147, + "step": 12589 + }, + { + "epoch": 0.1707813347802496, + "grad_norm": 6.637105941772461, + "learning_rate": 8.377552418802249e-06, + "loss": 0.3684, + "step": 12590 + }, + { + "epoch": 0.1707948996201845, + "grad_norm": 4.776288986206055, + "learning_rate": 8.377415376181994e-06, + "loss": 0.3013, + "step": 12591 + }, + { + "epoch": 0.17080846446011938, + "grad_norm": 3.822871446609497, + "learning_rate": 8.377278333561737e-06, + "loss": 0.3217, + "step": 12592 + }, + { + "epoch": 0.17082202930005425, + "grad_norm": 7.007094860076904, + "learning_rate": 8.377141290941484e-06, + "loss": 0.3315, + "step": 12593 + }, + { + "epoch": 0.17083559413998914, + "grad_norm": 4.98909854888916, + "learning_rate": 8.37700424832123e-06, + "loss": 0.3522, + "step": 12594 + }, + { + "epoch": 0.17084915897992403, + "grad_norm": 6.024118423461914, + "learning_rate": 8.376867205700973e-06, + "loss": 0.3987, + "step": 12595 + }, + { + "epoch": 0.17086272381985892, + "grad_norm": 6.666292667388916, + "learning_rate": 8.376730163080718e-06, + "loss": 0.3877, + "step": 12596 + }, + { + "epoch": 0.17087628865979382, + "grad_norm": 5.674040794372559, + "learning_rate": 8.376593120460463e-06, + "loss": 0.312, + "step": 12597 + }, + { + "epoch": 0.1708898534997287, + "grad_norm": 5.489013671875, + "learning_rate": 8.37645607784021e-06, + "loss": 0.3353, + "step": 12598 + }, + { + "epoch": 0.1709034183396636, + "grad_norm": 5.293697357177734, + "learning_rate": 8.376319035219954e-06, + "loss": 0.5153, + "step": 12599 + }, + { + "epoch": 0.17091698317959847, + "grad_norm": 5.544060707092285, + "learning_rate": 8.376181992599699e-06, + "loss": 0.4253, + "step": 12600 + }, + { + "epoch": 0.17093054801953336, + "grad_norm": 6.135098457336426, + "learning_rate": 8.376044949979444e-06, + "loss": 0.3472, + "step": 12601 + }, + { + "epoch": 0.17094411285946826, + "grad_norm": 6.059926509857178, + "learning_rate": 8.375907907359189e-06, + "loss": 0.4219, + "step": 12602 + }, + { + "epoch": 0.17095767769940315, + "grad_norm": 3.85937237739563, + "learning_rate": 8.375770864738934e-06, + "loss": 0.2334, + "step": 12603 + }, + { + "epoch": 0.17097124253933804, + "grad_norm": 5.908238410949707, + "learning_rate": 8.37563382211868e-06, + "loss": 0.5217, + "step": 12604 + }, + { + "epoch": 0.17098480737927294, + "grad_norm": 5.817061901092529, + "learning_rate": 8.375496779498425e-06, + "loss": 0.4545, + "step": 12605 + }, + { + "epoch": 0.1709983722192078, + "grad_norm": 7.361856460571289, + "learning_rate": 8.37535973687817e-06, + "loss": 0.5153, + "step": 12606 + }, + { + "epoch": 0.1710119370591427, + "grad_norm": 6.981168270111084, + "learning_rate": 8.375222694257915e-06, + "loss": 0.583, + "step": 12607 + }, + { + "epoch": 0.1710255018990776, + "grad_norm": 5.400672912597656, + "learning_rate": 8.37508565163766e-06, + "loss": 0.3807, + "step": 12608 + }, + { + "epoch": 0.17103906673901248, + "grad_norm": 6.07806396484375, + "learning_rate": 8.374948609017405e-06, + "loss": 0.5332, + "step": 12609 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 5.77099609375, + "learning_rate": 8.374811566397149e-06, + "loss": 0.4922, + "step": 12610 + }, + { + "epoch": 0.17106619641888227, + "grad_norm": 4.59017276763916, + "learning_rate": 8.374674523776896e-06, + "loss": 0.3759, + "step": 12611 + }, + { + "epoch": 0.17107976125881716, + "grad_norm": 6.754301071166992, + "learning_rate": 8.374537481156641e-06, + "loss": 0.5238, + "step": 12612 + }, + { + "epoch": 0.17109332609875202, + "grad_norm": 6.527447700500488, + "learning_rate": 8.374400438536386e-06, + "loss": 0.3976, + "step": 12613 + }, + { + "epoch": 0.17110689093868692, + "grad_norm": 5.2445197105407715, + "learning_rate": 8.37426339591613e-06, + "loss": 0.3602, + "step": 12614 + }, + { + "epoch": 0.1711204557786218, + "grad_norm": 6.178770542144775, + "learning_rate": 8.374126353295876e-06, + "loss": 0.4804, + "step": 12615 + }, + { + "epoch": 0.1711340206185567, + "grad_norm": 6.137636661529541, + "learning_rate": 8.373989310675622e-06, + "loss": 0.4583, + "step": 12616 + }, + { + "epoch": 0.1711475854584916, + "grad_norm": 5.992091178894043, + "learning_rate": 8.373852268055365e-06, + "loss": 0.4265, + "step": 12617 + }, + { + "epoch": 0.1711611502984265, + "grad_norm": 6.590725421905518, + "learning_rate": 8.37371522543511e-06, + "loss": 0.4117, + "step": 12618 + }, + { + "epoch": 0.17117471513836136, + "grad_norm": 7.651945114135742, + "learning_rate": 8.373578182814857e-06, + "loss": 0.4351, + "step": 12619 + }, + { + "epoch": 0.17118827997829625, + "grad_norm": 7.112224102020264, + "learning_rate": 8.3734411401946e-06, + "loss": 0.5748, + "step": 12620 + }, + { + "epoch": 0.17120184481823114, + "grad_norm": 8.118658065795898, + "learning_rate": 8.373304097574346e-06, + "loss": 0.5058, + "step": 12621 + }, + { + "epoch": 0.17121540965816603, + "grad_norm": 6.647836208343506, + "learning_rate": 8.373167054954091e-06, + "loss": 0.3545, + "step": 12622 + }, + { + "epoch": 0.17122897449810093, + "grad_norm": 6.075175762176514, + "learning_rate": 8.373030012333838e-06, + "loss": 0.2604, + "step": 12623 + }, + { + "epoch": 0.17124253933803582, + "grad_norm": 8.461923599243164, + "learning_rate": 8.372892969713581e-06, + "loss": 0.6272, + "step": 12624 + }, + { + "epoch": 0.17125610417797069, + "grad_norm": 5.732715129852295, + "learning_rate": 8.372755927093327e-06, + "loss": 0.3701, + "step": 12625 + }, + { + "epoch": 0.17126966901790558, + "grad_norm": 6.080602645874023, + "learning_rate": 8.372618884473072e-06, + "loss": 0.3065, + "step": 12626 + }, + { + "epoch": 0.17128323385784047, + "grad_norm": 6.169887065887451, + "learning_rate": 8.372481841852817e-06, + "loss": 0.3561, + "step": 12627 + }, + { + "epoch": 0.17129679869777537, + "grad_norm": 5.865388870239258, + "learning_rate": 8.372344799232562e-06, + "loss": 0.5131, + "step": 12628 + }, + { + "epoch": 0.17131036353771026, + "grad_norm": 6.169945240020752, + "learning_rate": 8.372207756612307e-06, + "loss": 0.4674, + "step": 12629 + }, + { + "epoch": 0.17132392837764515, + "grad_norm": 7.002187252044678, + "learning_rate": 8.372070713992052e-06, + "loss": 0.4127, + "step": 12630 + }, + { + "epoch": 0.17133749321758004, + "grad_norm": 6.834867000579834, + "learning_rate": 8.371933671371798e-06, + "loss": 0.6038, + "step": 12631 + }, + { + "epoch": 0.1713510580575149, + "grad_norm": 7.155603408813477, + "learning_rate": 8.371796628751543e-06, + "loss": 0.5623, + "step": 12632 + }, + { + "epoch": 0.1713646228974498, + "grad_norm": 3.9780304431915283, + "learning_rate": 8.371659586131288e-06, + "loss": 0.2062, + "step": 12633 + }, + { + "epoch": 0.1713781877373847, + "grad_norm": 6.209799289703369, + "learning_rate": 8.371522543511033e-06, + "loss": 0.4607, + "step": 12634 + }, + { + "epoch": 0.1713917525773196, + "grad_norm": 6.7557597160339355, + "learning_rate": 8.371385500890777e-06, + "loss": 0.3499, + "step": 12635 + }, + { + "epoch": 0.17140531741725448, + "grad_norm": 7.146206855773926, + "learning_rate": 8.371248458270523e-06, + "loss": 0.4724, + "step": 12636 + }, + { + "epoch": 0.17141888225718938, + "grad_norm": 5.770370960235596, + "learning_rate": 8.371111415650269e-06, + "loss": 0.5947, + "step": 12637 + }, + { + "epoch": 0.17143244709712424, + "grad_norm": 6.0822672843933105, + "learning_rate": 8.370974373030014e-06, + "loss": 0.4376, + "step": 12638 + }, + { + "epoch": 0.17144601193705913, + "grad_norm": 7.650982856750488, + "learning_rate": 8.370837330409757e-06, + "loss": 0.4633, + "step": 12639 + }, + { + "epoch": 0.17145957677699403, + "grad_norm": 5.179485321044922, + "learning_rate": 8.370700287789503e-06, + "loss": 0.2953, + "step": 12640 + }, + { + "epoch": 0.17147314161692892, + "grad_norm": 5.038431644439697, + "learning_rate": 8.37056324516925e-06, + "loss": 0.3081, + "step": 12641 + }, + { + "epoch": 0.1714867064568638, + "grad_norm": 4.8875226974487305, + "learning_rate": 8.370426202548993e-06, + "loss": 0.2592, + "step": 12642 + }, + { + "epoch": 0.1715002712967987, + "grad_norm": 6.064601421356201, + "learning_rate": 8.370289159928738e-06, + "loss": 0.4197, + "step": 12643 + }, + { + "epoch": 0.1715138361367336, + "grad_norm": 3.628690481185913, + "learning_rate": 8.370152117308483e-06, + "loss": 0.1883, + "step": 12644 + }, + { + "epoch": 0.17152740097666846, + "grad_norm": 7.32820463180542, + "learning_rate": 8.370015074688228e-06, + "loss": 0.5626, + "step": 12645 + }, + { + "epoch": 0.17154096581660336, + "grad_norm": 4.8965983390808105, + "learning_rate": 8.369878032067974e-06, + "loss": 0.2591, + "step": 12646 + }, + { + "epoch": 0.17155453065653825, + "grad_norm": 5.452487468719482, + "learning_rate": 8.369740989447719e-06, + "loss": 0.3137, + "step": 12647 + }, + { + "epoch": 0.17156809549647314, + "grad_norm": 6.345117568969727, + "learning_rate": 8.369603946827464e-06, + "loss": 0.4022, + "step": 12648 + }, + { + "epoch": 0.17158166033640804, + "grad_norm": 5.071157932281494, + "learning_rate": 8.369466904207209e-06, + "loss": 0.3247, + "step": 12649 + }, + { + "epoch": 0.17159522517634293, + "grad_norm": 6.139552116394043, + "learning_rate": 8.369329861586954e-06, + "loss": 0.388, + "step": 12650 + }, + { + "epoch": 0.1716087900162778, + "grad_norm": 5.379919052124023, + "learning_rate": 8.3691928189667e-06, + "loss": 0.5172, + "step": 12651 + }, + { + "epoch": 0.1716223548562127, + "grad_norm": 7.865880966186523, + "learning_rate": 8.369055776346445e-06, + "loss": 0.385, + "step": 12652 + }, + { + "epoch": 0.17163591969614758, + "grad_norm": 6.761286735534668, + "learning_rate": 8.36891873372619e-06, + "loss": 0.3238, + "step": 12653 + }, + { + "epoch": 0.17164948453608248, + "grad_norm": 5.692591190338135, + "learning_rate": 8.368781691105935e-06, + "loss": 0.3837, + "step": 12654 + }, + { + "epoch": 0.17166304937601737, + "grad_norm": 6.968808650970459, + "learning_rate": 8.36864464848568e-06, + "loss": 0.4644, + "step": 12655 + }, + { + "epoch": 0.17167661421595226, + "grad_norm": 6.405490875244141, + "learning_rate": 8.368507605865425e-06, + "loss": 0.4413, + "step": 12656 + }, + { + "epoch": 0.17169017905588715, + "grad_norm": 4.936686992645264, + "learning_rate": 8.368370563245169e-06, + "loss": 0.405, + "step": 12657 + }, + { + "epoch": 0.17170374389582202, + "grad_norm": 6.658344268798828, + "learning_rate": 8.368233520624916e-06, + "loss": 0.4963, + "step": 12658 + }, + { + "epoch": 0.1717173087357569, + "grad_norm": 6.814687252044678, + "learning_rate": 8.368096478004661e-06, + "loss": 0.419, + "step": 12659 + }, + { + "epoch": 0.1717308735756918, + "grad_norm": 8.18826961517334, + "learning_rate": 8.367959435384404e-06, + "loss": 0.5164, + "step": 12660 + }, + { + "epoch": 0.1717444384156267, + "grad_norm": 7.340713024139404, + "learning_rate": 8.36782239276415e-06, + "loss": 0.4717, + "step": 12661 + }, + { + "epoch": 0.1717580032555616, + "grad_norm": 6.809130668640137, + "learning_rate": 8.367685350143896e-06, + "loss": 0.4619, + "step": 12662 + }, + { + "epoch": 0.17177156809549649, + "grad_norm": 7.92946720123291, + "learning_rate": 8.36754830752364e-06, + "loss": 0.558, + "step": 12663 + }, + { + "epoch": 0.17178513293543135, + "grad_norm": 6.395073413848877, + "learning_rate": 8.367411264903385e-06, + "loss": 0.4908, + "step": 12664 + }, + { + "epoch": 0.17179869777536624, + "grad_norm": 6.25176477432251, + "learning_rate": 8.36727422228313e-06, + "loss": 0.5332, + "step": 12665 + }, + { + "epoch": 0.17181226261530114, + "grad_norm": 7.488234996795654, + "learning_rate": 8.367137179662875e-06, + "loss": 0.4113, + "step": 12666 + }, + { + "epoch": 0.17182582745523603, + "grad_norm": 6.050863742828369, + "learning_rate": 8.36700013704262e-06, + "loss": 0.4901, + "step": 12667 + }, + { + "epoch": 0.17183939229517092, + "grad_norm": 6.387185096740723, + "learning_rate": 8.366863094422366e-06, + "loss": 0.4504, + "step": 12668 + }, + { + "epoch": 0.17185295713510582, + "grad_norm": 5.915403842926025, + "learning_rate": 8.366726051802111e-06, + "loss": 0.4012, + "step": 12669 + }, + { + "epoch": 0.17186652197504068, + "grad_norm": 7.620893955230713, + "learning_rate": 8.366589009181856e-06, + "loss": 0.4827, + "step": 12670 + }, + { + "epoch": 0.17188008681497557, + "grad_norm": 6.713852882385254, + "learning_rate": 8.366451966561601e-06, + "loss": 0.3915, + "step": 12671 + }, + { + "epoch": 0.17189365165491047, + "grad_norm": 6.2834601402282715, + "learning_rate": 8.366314923941347e-06, + "loss": 0.4816, + "step": 12672 + }, + { + "epoch": 0.17190721649484536, + "grad_norm": 5.5505757331848145, + "learning_rate": 8.366177881321092e-06, + "loss": 0.3794, + "step": 12673 + }, + { + "epoch": 0.17192078133478025, + "grad_norm": 6.944741725921631, + "learning_rate": 8.366040838700837e-06, + "loss": 0.4728, + "step": 12674 + }, + { + "epoch": 0.17193434617471515, + "grad_norm": 6.805820941925049, + "learning_rate": 8.365903796080582e-06, + "loss": 0.4972, + "step": 12675 + }, + { + "epoch": 0.17194791101465004, + "grad_norm": 5.867321968078613, + "learning_rate": 8.365766753460327e-06, + "loss": 0.4633, + "step": 12676 + }, + { + "epoch": 0.1719614758545849, + "grad_norm": 7.191789627075195, + "learning_rate": 8.365629710840072e-06, + "loss": 0.4284, + "step": 12677 + }, + { + "epoch": 0.1719750406945198, + "grad_norm": 6.7162957191467285, + "learning_rate": 8.365492668219816e-06, + "loss": 0.458, + "step": 12678 + }, + { + "epoch": 0.1719886055344547, + "grad_norm": 6.969515800476074, + "learning_rate": 8.365355625599561e-06, + "loss": 0.4883, + "step": 12679 + }, + { + "epoch": 0.17200217037438958, + "grad_norm": 7.145220756530762, + "learning_rate": 8.365218582979308e-06, + "loss": 0.6276, + "step": 12680 + }, + { + "epoch": 0.17201573521432448, + "grad_norm": 6.0563201904296875, + "learning_rate": 8.365081540359053e-06, + "loss": 0.4639, + "step": 12681 + }, + { + "epoch": 0.17202930005425937, + "grad_norm": 7.321856498718262, + "learning_rate": 8.364944497738797e-06, + "loss": 0.6041, + "step": 12682 + }, + { + "epoch": 0.17204286489419424, + "grad_norm": 4.461257457733154, + "learning_rate": 8.364807455118542e-06, + "loss": 0.3215, + "step": 12683 + }, + { + "epoch": 0.17205642973412913, + "grad_norm": 4.474446773529053, + "learning_rate": 8.364670412498289e-06, + "loss": 0.3169, + "step": 12684 + }, + { + "epoch": 0.17206999457406402, + "grad_norm": 8.663893699645996, + "learning_rate": 8.364533369878032e-06, + "loss": 0.5302, + "step": 12685 + }, + { + "epoch": 0.17208355941399892, + "grad_norm": 6.97629451751709, + "learning_rate": 8.364396327257777e-06, + "loss": 0.4938, + "step": 12686 + }, + { + "epoch": 0.1720971242539338, + "grad_norm": 5.504337310791016, + "learning_rate": 8.364259284637523e-06, + "loss": 0.4162, + "step": 12687 + }, + { + "epoch": 0.1721106890938687, + "grad_norm": 8.072619438171387, + "learning_rate": 8.364122242017268e-06, + "loss": 0.5619, + "step": 12688 + }, + { + "epoch": 0.1721242539338036, + "grad_norm": 4.626622676849365, + "learning_rate": 8.363985199397013e-06, + "loss": 0.4143, + "step": 12689 + }, + { + "epoch": 0.17213781877373846, + "grad_norm": 6.341518878936768, + "learning_rate": 8.363848156776758e-06, + "loss": 0.4941, + "step": 12690 + }, + { + "epoch": 0.17215138361367335, + "grad_norm": 6.217397689819336, + "learning_rate": 8.363711114156503e-06, + "loss": 0.4672, + "step": 12691 + }, + { + "epoch": 0.17216494845360825, + "grad_norm": 7.133105754852295, + "learning_rate": 8.363574071536248e-06, + "loss": 0.3857, + "step": 12692 + }, + { + "epoch": 0.17217851329354314, + "grad_norm": 6.629039287567139, + "learning_rate": 8.363437028915994e-06, + "loss": 0.5392, + "step": 12693 + }, + { + "epoch": 0.17219207813347803, + "grad_norm": 6.759429454803467, + "learning_rate": 8.363299986295739e-06, + "loss": 0.5275, + "step": 12694 + }, + { + "epoch": 0.17220564297341293, + "grad_norm": 6.214276313781738, + "learning_rate": 8.363162943675484e-06, + "loss": 0.4288, + "step": 12695 + }, + { + "epoch": 0.1722192078133478, + "grad_norm": 5.529086112976074, + "learning_rate": 8.363025901055229e-06, + "loss": 0.3664, + "step": 12696 + }, + { + "epoch": 0.17223277265328268, + "grad_norm": 4.80219030380249, + "learning_rate": 8.362888858434974e-06, + "loss": 0.3019, + "step": 12697 + }, + { + "epoch": 0.17224633749321758, + "grad_norm": 6.968858242034912, + "learning_rate": 8.36275181581472e-06, + "loss": 0.4413, + "step": 12698 + }, + { + "epoch": 0.17225990233315247, + "grad_norm": 5.451353549957275, + "learning_rate": 8.362614773194465e-06, + "loss": 0.4785, + "step": 12699 + }, + { + "epoch": 0.17227346717308736, + "grad_norm": 5.931373119354248, + "learning_rate": 8.362477730574208e-06, + "loss": 0.3052, + "step": 12700 + }, + { + "epoch": 0.17228703201302226, + "grad_norm": 5.926784992218018, + "learning_rate": 8.362340687953955e-06, + "loss": 0.4948, + "step": 12701 + }, + { + "epoch": 0.17230059685295712, + "grad_norm": 7.225817680358887, + "learning_rate": 8.3622036453337e-06, + "loss": 0.4447, + "step": 12702 + }, + { + "epoch": 0.17231416169289202, + "grad_norm": 8.679543495178223, + "learning_rate": 8.362066602713444e-06, + "loss": 0.6064, + "step": 12703 + }, + { + "epoch": 0.1723277265328269, + "grad_norm": 5.0901384353637695, + "learning_rate": 8.361929560093189e-06, + "loss": 0.3951, + "step": 12704 + }, + { + "epoch": 0.1723412913727618, + "grad_norm": 5.53167724609375, + "learning_rate": 8.361792517472936e-06, + "loss": 0.4867, + "step": 12705 + }, + { + "epoch": 0.1723548562126967, + "grad_norm": 6.9350786209106445, + "learning_rate": 8.361655474852681e-06, + "loss": 0.5259, + "step": 12706 + }, + { + "epoch": 0.1723684210526316, + "grad_norm": 6.261717319488525, + "learning_rate": 8.361518432232424e-06, + "loss": 0.3689, + "step": 12707 + }, + { + "epoch": 0.17238198589256648, + "grad_norm": 4.964408874511719, + "learning_rate": 8.36138138961217e-06, + "loss": 0.4584, + "step": 12708 + }, + { + "epoch": 0.17239555073250135, + "grad_norm": 5.0788893699646, + "learning_rate": 8.361244346991915e-06, + "loss": 0.4177, + "step": 12709 + }, + { + "epoch": 0.17240911557243624, + "grad_norm": 5.3112406730651855, + "learning_rate": 8.36110730437166e-06, + "loss": 0.3239, + "step": 12710 + }, + { + "epoch": 0.17242268041237113, + "grad_norm": 6.941345691680908, + "learning_rate": 8.360970261751405e-06, + "loss": 0.4507, + "step": 12711 + }, + { + "epoch": 0.17243624525230603, + "grad_norm": 6.672161102294922, + "learning_rate": 8.36083321913115e-06, + "loss": 0.469, + "step": 12712 + }, + { + "epoch": 0.17244981009224092, + "grad_norm": 4.40106201171875, + "learning_rate": 8.360696176510895e-06, + "loss": 0.3342, + "step": 12713 + }, + { + "epoch": 0.1724633749321758, + "grad_norm": 5.741341590881348, + "learning_rate": 8.36055913389064e-06, + "loss": 0.4572, + "step": 12714 + }, + { + "epoch": 0.17247693977211068, + "grad_norm": 5.500401496887207, + "learning_rate": 8.360422091270386e-06, + "loss": 0.352, + "step": 12715 + }, + { + "epoch": 0.17249050461204557, + "grad_norm": 7.28842830657959, + "learning_rate": 8.360285048650131e-06, + "loss": 0.731, + "step": 12716 + }, + { + "epoch": 0.17250406945198046, + "grad_norm": 6.815276622772217, + "learning_rate": 8.360148006029876e-06, + "loss": 0.4585, + "step": 12717 + }, + { + "epoch": 0.17251763429191536, + "grad_norm": 5.422051429748535, + "learning_rate": 8.360010963409621e-06, + "loss": 0.3875, + "step": 12718 + }, + { + "epoch": 0.17253119913185025, + "grad_norm": 7.227945804595947, + "learning_rate": 8.359873920789367e-06, + "loss": 0.4809, + "step": 12719 + }, + { + "epoch": 0.17254476397178514, + "grad_norm": 5.024757385253906, + "learning_rate": 8.359736878169112e-06, + "loss": 0.3716, + "step": 12720 + }, + { + "epoch": 0.17255832881172004, + "grad_norm": 5.108546733856201, + "learning_rate": 8.359599835548857e-06, + "loss": 0.3621, + "step": 12721 + }, + { + "epoch": 0.1725718936516549, + "grad_norm": 6.409648418426514, + "learning_rate": 8.3594627929286e-06, + "loss": 0.4105, + "step": 12722 + }, + { + "epoch": 0.1725854584915898, + "grad_norm": 5.070459842681885, + "learning_rate": 8.359325750308347e-06, + "loss": 0.5706, + "step": 12723 + }, + { + "epoch": 0.1725990233315247, + "grad_norm": 4.5597662925720215, + "learning_rate": 8.359188707688092e-06, + "loss": 0.3483, + "step": 12724 + }, + { + "epoch": 0.17261258817145958, + "grad_norm": 5.919748306274414, + "learning_rate": 8.359051665067836e-06, + "loss": 0.5207, + "step": 12725 + }, + { + "epoch": 0.17262615301139447, + "grad_norm": 4.870491981506348, + "learning_rate": 8.358914622447581e-06, + "loss": 0.4641, + "step": 12726 + }, + { + "epoch": 0.17263971785132937, + "grad_norm": 6.885532379150391, + "learning_rate": 8.358777579827328e-06, + "loss": 0.6098, + "step": 12727 + }, + { + "epoch": 0.17265328269126423, + "grad_norm": 5.177473068237305, + "learning_rate": 8.358640537207071e-06, + "loss": 0.445, + "step": 12728 + }, + { + "epoch": 0.17266684753119912, + "grad_norm": 4.814555644989014, + "learning_rate": 8.358503494586817e-06, + "loss": 0.4649, + "step": 12729 + }, + { + "epoch": 0.17268041237113402, + "grad_norm": 8.594037055969238, + "learning_rate": 8.358366451966562e-06, + "loss": 0.4417, + "step": 12730 + }, + { + "epoch": 0.1726939772110689, + "grad_norm": 4.423548698425293, + "learning_rate": 8.358229409346309e-06, + "loss": 0.2844, + "step": 12731 + }, + { + "epoch": 0.1727075420510038, + "grad_norm": 5.972203254699707, + "learning_rate": 8.358092366726052e-06, + "loss": 0.4849, + "step": 12732 + }, + { + "epoch": 0.1727211068909387, + "grad_norm": 6.303534030914307, + "learning_rate": 8.357955324105797e-06, + "loss": 0.5184, + "step": 12733 + }, + { + "epoch": 0.17273467173087356, + "grad_norm": 6.082722187042236, + "learning_rate": 8.357818281485543e-06, + "loss": 0.5383, + "step": 12734 + }, + { + "epoch": 0.17274823657080846, + "grad_norm": 4.259114742279053, + "learning_rate": 8.357681238865288e-06, + "loss": 0.2776, + "step": 12735 + }, + { + "epoch": 0.17276180141074335, + "grad_norm": 5.51801061630249, + "learning_rate": 8.357544196245033e-06, + "loss": 0.5797, + "step": 12736 + }, + { + "epoch": 0.17277536625067824, + "grad_norm": 5.926334857940674, + "learning_rate": 8.357407153624778e-06, + "loss": 0.3872, + "step": 12737 + }, + { + "epoch": 0.17278893109061313, + "grad_norm": 5.010077476501465, + "learning_rate": 8.357270111004523e-06, + "loss": 0.3553, + "step": 12738 + }, + { + "epoch": 0.17280249593054803, + "grad_norm": 5.290826797485352, + "learning_rate": 8.357133068384268e-06, + "loss": 0.4539, + "step": 12739 + }, + { + "epoch": 0.17281606077048292, + "grad_norm": 7.098767280578613, + "learning_rate": 8.356996025764014e-06, + "loss": 0.4802, + "step": 12740 + }, + { + "epoch": 0.1728296256104178, + "grad_norm": 6.070959568023682, + "learning_rate": 8.356858983143759e-06, + "loss": 0.4184, + "step": 12741 + }, + { + "epoch": 0.17284319045035268, + "grad_norm": 6.2914628982543945, + "learning_rate": 8.356721940523504e-06, + "loss": 0.5026, + "step": 12742 + }, + { + "epoch": 0.17285675529028757, + "grad_norm": 5.525749683380127, + "learning_rate": 8.356584897903247e-06, + "loss": 0.3672, + "step": 12743 + }, + { + "epoch": 0.17287032013022247, + "grad_norm": 8.78325080871582, + "learning_rate": 8.356447855282994e-06, + "loss": 0.6295, + "step": 12744 + }, + { + "epoch": 0.17288388497015736, + "grad_norm": 3.849605083465576, + "learning_rate": 8.35631081266274e-06, + "loss": 0.3314, + "step": 12745 + }, + { + "epoch": 0.17289744981009225, + "grad_norm": 5.102651119232178, + "learning_rate": 8.356173770042485e-06, + "loss": 0.2874, + "step": 12746 + }, + { + "epoch": 0.17291101465002712, + "grad_norm": 5.290246963500977, + "learning_rate": 8.356036727422228e-06, + "loss": 0.3481, + "step": 12747 + }, + { + "epoch": 0.172924579489962, + "grad_norm": 5.977834224700928, + "learning_rate": 8.355899684801973e-06, + "loss": 0.3182, + "step": 12748 + }, + { + "epoch": 0.1729381443298969, + "grad_norm": 4.48012113571167, + "learning_rate": 8.35576264218172e-06, + "loss": 0.3587, + "step": 12749 + }, + { + "epoch": 0.1729517091698318, + "grad_norm": 4.380703926086426, + "learning_rate": 8.355625599561464e-06, + "loss": 0.356, + "step": 12750 + }, + { + "epoch": 0.1729652740097667, + "grad_norm": 7.726386547088623, + "learning_rate": 8.355488556941209e-06, + "loss": 0.4132, + "step": 12751 + }, + { + "epoch": 0.17297883884970158, + "grad_norm": 6.492734432220459, + "learning_rate": 8.355351514320954e-06, + "loss": 0.3257, + "step": 12752 + }, + { + "epoch": 0.17299240368963648, + "grad_norm": 7.4388427734375, + "learning_rate": 8.3552144717007e-06, + "loss": 0.381, + "step": 12753 + }, + { + "epoch": 0.17300596852957134, + "grad_norm": 5.638716220855713, + "learning_rate": 8.355077429080444e-06, + "loss": 0.4651, + "step": 12754 + }, + { + "epoch": 0.17301953336950623, + "grad_norm": 7.203932762145996, + "learning_rate": 8.35494038646019e-06, + "loss": 0.5181, + "step": 12755 + }, + { + "epoch": 0.17303309820944113, + "grad_norm": 5.795904636383057, + "learning_rate": 8.354803343839935e-06, + "loss": 0.3144, + "step": 12756 + }, + { + "epoch": 0.17304666304937602, + "grad_norm": 7.194941997528076, + "learning_rate": 8.35466630121968e-06, + "loss": 0.3965, + "step": 12757 + }, + { + "epoch": 0.1730602278893109, + "grad_norm": 6.643050670623779, + "learning_rate": 8.354529258599425e-06, + "loss": 0.4678, + "step": 12758 + }, + { + "epoch": 0.1730737927292458, + "grad_norm": 6.354063987731934, + "learning_rate": 8.35439221597917e-06, + "loss": 0.4215, + "step": 12759 + }, + { + "epoch": 0.17308735756918067, + "grad_norm": 8.178866386413574, + "learning_rate": 8.354255173358916e-06, + "loss": 0.4677, + "step": 12760 + }, + { + "epoch": 0.17310092240911557, + "grad_norm": 5.65850830078125, + "learning_rate": 8.35411813073866e-06, + "loss": 0.4019, + "step": 12761 + }, + { + "epoch": 0.17311448724905046, + "grad_norm": 6.4745354652404785, + "learning_rate": 8.353981088118406e-06, + "loss": 0.452, + "step": 12762 + }, + { + "epoch": 0.17312805208898535, + "grad_norm": 5.360483169555664, + "learning_rate": 8.353844045498151e-06, + "loss": 0.4084, + "step": 12763 + }, + { + "epoch": 0.17314161692892024, + "grad_norm": 5.406743049621582, + "learning_rate": 8.353707002877896e-06, + "loss": 0.263, + "step": 12764 + }, + { + "epoch": 0.17315518176885514, + "grad_norm": 6.254209041595459, + "learning_rate": 8.35356996025764e-06, + "loss": 0.4426, + "step": 12765 + }, + { + "epoch": 0.17316874660879, + "grad_norm": 6.196353912353516, + "learning_rate": 8.353432917637387e-06, + "loss": 0.441, + "step": 12766 + }, + { + "epoch": 0.1731823114487249, + "grad_norm": 4.440738201141357, + "learning_rate": 8.353295875017132e-06, + "loss": 0.3624, + "step": 12767 + }, + { + "epoch": 0.1731958762886598, + "grad_norm": 7.0106401443481445, + "learning_rate": 8.353158832396875e-06, + "loss": 0.3392, + "step": 12768 + }, + { + "epoch": 0.17320944112859468, + "grad_norm": 7.064242362976074, + "learning_rate": 8.35302178977662e-06, + "loss": 0.4297, + "step": 12769 + }, + { + "epoch": 0.17322300596852958, + "grad_norm": 7.236154079437256, + "learning_rate": 8.352884747156367e-06, + "loss": 0.3878, + "step": 12770 + }, + { + "epoch": 0.17323657080846447, + "grad_norm": 5.997804641723633, + "learning_rate": 8.35274770453611e-06, + "loss": 0.3911, + "step": 12771 + }, + { + "epoch": 0.17325013564839936, + "grad_norm": 9.960155487060547, + "learning_rate": 8.352610661915856e-06, + "loss": 0.5033, + "step": 12772 + }, + { + "epoch": 0.17326370048833423, + "grad_norm": 5.89060640335083, + "learning_rate": 8.352473619295601e-06, + "loss": 0.3231, + "step": 12773 + }, + { + "epoch": 0.17327726532826912, + "grad_norm": 7.326956272125244, + "learning_rate": 8.352336576675348e-06, + "loss": 0.3798, + "step": 12774 + }, + { + "epoch": 0.173290830168204, + "grad_norm": 6.03324031829834, + "learning_rate": 8.352199534055092e-06, + "loss": 0.3826, + "step": 12775 + }, + { + "epoch": 0.1733043950081389, + "grad_norm": 6.181290149688721, + "learning_rate": 8.352062491434837e-06, + "loss": 0.4394, + "step": 12776 + }, + { + "epoch": 0.1733179598480738, + "grad_norm": 6.9346022605896, + "learning_rate": 8.351925448814582e-06, + "loss": 0.3428, + "step": 12777 + }, + { + "epoch": 0.1733315246880087, + "grad_norm": 7.744527339935303, + "learning_rate": 8.351788406194327e-06, + "loss": 0.3658, + "step": 12778 + }, + { + "epoch": 0.17334508952794356, + "grad_norm": 5.889974117279053, + "learning_rate": 8.351651363574072e-06, + "loss": 0.4871, + "step": 12779 + }, + { + "epoch": 0.17335865436787845, + "grad_norm": 9.237228393554688, + "learning_rate": 8.351514320953817e-06, + "loss": 0.5679, + "step": 12780 + }, + { + "epoch": 0.17337221920781334, + "grad_norm": 6.691098213195801, + "learning_rate": 8.351377278333563e-06, + "loss": 0.3989, + "step": 12781 + }, + { + "epoch": 0.17338578404774824, + "grad_norm": 7.03196382522583, + "learning_rate": 8.351240235713308e-06, + "loss": 0.419, + "step": 12782 + }, + { + "epoch": 0.17339934888768313, + "grad_norm": 5.872996807098389, + "learning_rate": 8.351103193093053e-06, + "loss": 0.3762, + "step": 12783 + }, + { + "epoch": 0.17341291372761802, + "grad_norm": 6.845762252807617, + "learning_rate": 8.350966150472798e-06, + "loss": 0.4033, + "step": 12784 + }, + { + "epoch": 0.17342647856755292, + "grad_norm": 6.575993537902832, + "learning_rate": 8.350829107852543e-06, + "loss": 0.3633, + "step": 12785 + }, + { + "epoch": 0.17344004340748778, + "grad_norm": 10.074963569641113, + "learning_rate": 8.350692065232287e-06, + "loss": 0.5536, + "step": 12786 + }, + { + "epoch": 0.17345360824742267, + "grad_norm": 6.092051029205322, + "learning_rate": 8.350555022612034e-06, + "loss": 0.2349, + "step": 12787 + }, + { + "epoch": 0.17346717308735757, + "grad_norm": 9.513463973999023, + "learning_rate": 8.350417979991779e-06, + "loss": 0.4612, + "step": 12788 + }, + { + "epoch": 0.17348073792729246, + "grad_norm": 4.706714630126953, + "learning_rate": 8.350280937371524e-06, + "loss": 0.2913, + "step": 12789 + }, + { + "epoch": 0.17349430276722735, + "grad_norm": 7.385503768920898, + "learning_rate": 8.350143894751267e-06, + "loss": 0.4241, + "step": 12790 + }, + { + "epoch": 0.17350786760716225, + "grad_norm": 7.242767810821533, + "learning_rate": 8.350006852131013e-06, + "loss": 0.4024, + "step": 12791 + }, + { + "epoch": 0.1735214324470971, + "grad_norm": 6.566612243652344, + "learning_rate": 8.34986980951076e-06, + "loss": 0.494, + "step": 12792 + }, + { + "epoch": 0.173534997287032, + "grad_norm": 7.331286907196045, + "learning_rate": 8.349732766890503e-06, + "loss": 0.3827, + "step": 12793 + }, + { + "epoch": 0.1735485621269669, + "grad_norm": 6.148956775665283, + "learning_rate": 8.349595724270248e-06, + "loss": 0.2978, + "step": 12794 + }, + { + "epoch": 0.1735621269669018, + "grad_norm": 5.3941330909729, + "learning_rate": 8.349458681649993e-06, + "loss": 0.4287, + "step": 12795 + }, + { + "epoch": 0.17357569180683668, + "grad_norm": 6.433730602264404, + "learning_rate": 8.349321639029739e-06, + "loss": 0.3271, + "step": 12796 + }, + { + "epoch": 0.17358925664677158, + "grad_norm": 7.78089714050293, + "learning_rate": 8.349184596409484e-06, + "loss": 0.4087, + "step": 12797 + }, + { + "epoch": 0.17360282148670644, + "grad_norm": 6.956143856048584, + "learning_rate": 8.349047553789229e-06, + "loss": 0.3374, + "step": 12798 + }, + { + "epoch": 0.17361638632664134, + "grad_norm": 4.959187030792236, + "learning_rate": 8.348910511168974e-06, + "loss": 0.3454, + "step": 12799 + }, + { + "epoch": 0.17362995116657623, + "grad_norm": 6.842704772949219, + "learning_rate": 8.34877346854872e-06, + "loss": 0.3862, + "step": 12800 + }, + { + "epoch": 0.17364351600651112, + "grad_norm": 4.486500263214111, + "learning_rate": 8.348636425928464e-06, + "loss": 0.3074, + "step": 12801 + }, + { + "epoch": 0.17365708084644602, + "grad_norm": 5.842264175415039, + "learning_rate": 8.34849938330821e-06, + "loss": 0.4301, + "step": 12802 + }, + { + "epoch": 0.1736706456863809, + "grad_norm": 7.072737216949463, + "learning_rate": 8.348362340687955e-06, + "loss": 0.4132, + "step": 12803 + }, + { + "epoch": 0.1736842105263158, + "grad_norm": 5.974741458892822, + "learning_rate": 8.3482252980677e-06, + "loss": 0.3966, + "step": 12804 + }, + { + "epoch": 0.17369777536625067, + "grad_norm": 7.621243000030518, + "learning_rate": 8.348088255447445e-06, + "loss": 0.5192, + "step": 12805 + }, + { + "epoch": 0.17371134020618556, + "grad_norm": 5.052315711975098, + "learning_rate": 8.34795121282719e-06, + "loss": 0.3073, + "step": 12806 + }, + { + "epoch": 0.17372490504612045, + "grad_norm": 6.114007949829102, + "learning_rate": 8.347814170206936e-06, + "loss": 0.3142, + "step": 12807 + }, + { + "epoch": 0.17373846988605535, + "grad_norm": 7.906190872192383, + "learning_rate": 8.347677127586679e-06, + "loss": 0.5159, + "step": 12808 + }, + { + "epoch": 0.17375203472599024, + "grad_norm": 6.0910258293151855, + "learning_rate": 8.347540084966426e-06, + "loss": 0.4611, + "step": 12809 + }, + { + "epoch": 0.17376559956592513, + "grad_norm": 8.137779235839844, + "learning_rate": 8.347403042346171e-06, + "loss": 0.3795, + "step": 12810 + }, + { + "epoch": 0.17377916440586, + "grad_norm": 6.480264186859131, + "learning_rate": 8.347265999725915e-06, + "loss": 0.3827, + "step": 12811 + }, + { + "epoch": 0.1737927292457949, + "grad_norm": 7.357575416564941, + "learning_rate": 8.34712895710566e-06, + "loss": 0.4661, + "step": 12812 + }, + { + "epoch": 0.17380629408572978, + "grad_norm": 5.393177509307861, + "learning_rate": 8.346991914485407e-06, + "loss": 0.4122, + "step": 12813 + }, + { + "epoch": 0.17381985892566468, + "grad_norm": 7.214751720428467, + "learning_rate": 8.346854871865152e-06, + "loss": 0.4711, + "step": 12814 + }, + { + "epoch": 0.17383342376559957, + "grad_norm": 7.4522881507873535, + "learning_rate": 8.346717829244895e-06, + "loss": 0.6171, + "step": 12815 + }, + { + "epoch": 0.17384698860553446, + "grad_norm": 8.059849739074707, + "learning_rate": 8.34658078662464e-06, + "loss": 0.3541, + "step": 12816 + }, + { + "epoch": 0.17386055344546936, + "grad_norm": 4.288132190704346, + "learning_rate": 8.346443744004386e-06, + "loss": 0.2712, + "step": 12817 + }, + { + "epoch": 0.17387411828540422, + "grad_norm": 6.560700416564941, + "learning_rate": 8.34630670138413e-06, + "loss": 0.4054, + "step": 12818 + }, + { + "epoch": 0.17388768312533912, + "grad_norm": 7.97568416595459, + "learning_rate": 8.346169658763876e-06, + "loss": 0.5862, + "step": 12819 + }, + { + "epoch": 0.173901247965274, + "grad_norm": 7.108450412750244, + "learning_rate": 8.346032616143621e-06, + "loss": 0.3758, + "step": 12820 + }, + { + "epoch": 0.1739148128052089, + "grad_norm": 6.763504505157471, + "learning_rate": 8.345895573523366e-06, + "loss": 0.3549, + "step": 12821 + }, + { + "epoch": 0.1739283776451438, + "grad_norm": 6.181743621826172, + "learning_rate": 8.345758530903112e-06, + "loss": 0.399, + "step": 12822 + }, + { + "epoch": 0.1739419424850787, + "grad_norm": 5.915903568267822, + "learning_rate": 8.345621488282857e-06, + "loss": 0.3749, + "step": 12823 + }, + { + "epoch": 0.17395550732501355, + "grad_norm": 6.166750431060791, + "learning_rate": 8.345484445662602e-06, + "loss": 0.3347, + "step": 12824 + }, + { + "epoch": 0.17396907216494845, + "grad_norm": 6.392689228057861, + "learning_rate": 8.345347403042347e-06, + "loss": 0.3644, + "step": 12825 + }, + { + "epoch": 0.17398263700488334, + "grad_norm": 6.586782932281494, + "learning_rate": 8.345210360422092e-06, + "loss": 0.3043, + "step": 12826 + }, + { + "epoch": 0.17399620184481823, + "grad_norm": 7.759390354156494, + "learning_rate": 8.345073317801837e-06, + "loss": 0.437, + "step": 12827 + }, + { + "epoch": 0.17400976668475313, + "grad_norm": 6.9448394775390625, + "learning_rate": 8.344936275181583e-06, + "loss": 0.4333, + "step": 12828 + }, + { + "epoch": 0.17402333152468802, + "grad_norm": 6.579123020172119, + "learning_rate": 8.344799232561328e-06, + "loss": 0.5183, + "step": 12829 + }, + { + "epoch": 0.17403689636462288, + "grad_norm": 8.5068359375, + "learning_rate": 8.344662189941073e-06, + "loss": 0.5507, + "step": 12830 + }, + { + "epoch": 0.17405046120455778, + "grad_norm": 5.963450908660889, + "learning_rate": 8.344525147320818e-06, + "loss": 0.4672, + "step": 12831 + }, + { + "epoch": 0.17406402604449267, + "grad_norm": 7.350037097930908, + "learning_rate": 8.344388104700563e-06, + "loss": 0.4595, + "step": 12832 + }, + { + "epoch": 0.17407759088442756, + "grad_norm": 7.166614055633545, + "learning_rate": 8.344251062080307e-06, + "loss": 0.3673, + "step": 12833 + }, + { + "epoch": 0.17409115572436246, + "grad_norm": 8.32739543914795, + "learning_rate": 8.344114019460052e-06, + "loss": 0.6, + "step": 12834 + }, + { + "epoch": 0.17410472056429735, + "grad_norm": 6.288634777069092, + "learning_rate": 8.343976976839799e-06, + "loss": 0.4641, + "step": 12835 + }, + { + "epoch": 0.17411828540423224, + "grad_norm": 6.645572662353516, + "learning_rate": 8.343839934219542e-06, + "loss": 0.4939, + "step": 12836 + }, + { + "epoch": 0.1741318502441671, + "grad_norm": 6.276261329650879, + "learning_rate": 8.343702891599288e-06, + "loss": 0.3926, + "step": 12837 + }, + { + "epoch": 0.174145415084102, + "grad_norm": 6.859340190887451, + "learning_rate": 8.343565848979033e-06, + "loss": 0.5063, + "step": 12838 + }, + { + "epoch": 0.1741589799240369, + "grad_norm": 6.356383800506592, + "learning_rate": 8.34342880635878e-06, + "loss": 0.5009, + "step": 12839 + }, + { + "epoch": 0.1741725447639718, + "grad_norm": 7.3217034339904785, + "learning_rate": 8.343291763738523e-06, + "loss": 0.4972, + "step": 12840 + }, + { + "epoch": 0.17418610960390668, + "grad_norm": 8.005636215209961, + "learning_rate": 8.343154721118268e-06, + "loss": 0.6126, + "step": 12841 + }, + { + "epoch": 0.17419967444384157, + "grad_norm": 7.677014350891113, + "learning_rate": 8.343017678498013e-06, + "loss": 0.6272, + "step": 12842 + }, + { + "epoch": 0.17421323928377644, + "grad_norm": 9.14512825012207, + "learning_rate": 8.342880635877759e-06, + "loss": 0.7558, + "step": 12843 + }, + { + "epoch": 0.17422680412371133, + "grad_norm": 7.092624187469482, + "learning_rate": 8.342743593257504e-06, + "loss": 0.5076, + "step": 12844 + }, + { + "epoch": 0.17424036896364622, + "grad_norm": 4.530056476593018, + "learning_rate": 8.342606550637249e-06, + "loss": 0.3621, + "step": 12845 + }, + { + "epoch": 0.17425393380358112, + "grad_norm": 7.795158386230469, + "learning_rate": 8.342469508016994e-06, + "loss": 0.4704, + "step": 12846 + }, + { + "epoch": 0.174267498643516, + "grad_norm": 6.597216606140137, + "learning_rate": 8.34233246539674e-06, + "loss": 0.469, + "step": 12847 + }, + { + "epoch": 0.1742810634834509, + "grad_norm": 6.942066669464111, + "learning_rate": 8.342195422776484e-06, + "loss": 0.4042, + "step": 12848 + }, + { + "epoch": 0.1742946283233858, + "grad_norm": 8.980622291564941, + "learning_rate": 8.34205838015623e-06, + "loss": 0.5546, + "step": 12849 + }, + { + "epoch": 0.17430819316332066, + "grad_norm": 9.307909965515137, + "learning_rate": 8.341921337535975e-06, + "loss": 0.6312, + "step": 12850 + }, + { + "epoch": 0.17432175800325556, + "grad_norm": 5.8168816566467285, + "learning_rate": 8.341784294915718e-06, + "loss": 0.4241, + "step": 12851 + }, + { + "epoch": 0.17433532284319045, + "grad_norm": 8.414472579956055, + "learning_rate": 8.341647252295465e-06, + "loss": 0.6234, + "step": 12852 + }, + { + "epoch": 0.17434888768312534, + "grad_norm": 9.034454345703125, + "learning_rate": 8.34151020967521e-06, + "loss": 0.7144, + "step": 12853 + }, + { + "epoch": 0.17436245252306024, + "grad_norm": 7.52626371383667, + "learning_rate": 8.341373167054954e-06, + "loss": 0.5283, + "step": 12854 + }, + { + "epoch": 0.17437601736299513, + "grad_norm": 7.6297125816345215, + "learning_rate": 8.341236124434699e-06, + "loss": 0.5552, + "step": 12855 + }, + { + "epoch": 0.17438958220293, + "grad_norm": 7.165008068084717, + "learning_rate": 8.341099081814446e-06, + "loss": 0.4136, + "step": 12856 + }, + { + "epoch": 0.1744031470428649, + "grad_norm": 8.849308013916016, + "learning_rate": 8.340962039194191e-06, + "loss": 0.571, + "step": 12857 + }, + { + "epoch": 0.17441671188279978, + "grad_norm": 6.383408069610596, + "learning_rate": 8.340824996573935e-06, + "loss": 0.3512, + "step": 12858 + }, + { + "epoch": 0.17443027672273467, + "grad_norm": 5.19313907623291, + "learning_rate": 8.34068795395368e-06, + "loss": 0.3507, + "step": 12859 + }, + { + "epoch": 0.17444384156266957, + "grad_norm": 8.511260032653809, + "learning_rate": 8.340550911333425e-06, + "loss": 0.5327, + "step": 12860 + }, + { + "epoch": 0.17445740640260446, + "grad_norm": 6.124979019165039, + "learning_rate": 8.34041386871317e-06, + "loss": 0.4431, + "step": 12861 + }, + { + "epoch": 0.17447097124253932, + "grad_norm": 4.413919925689697, + "learning_rate": 8.340276826092915e-06, + "loss": 0.281, + "step": 12862 + }, + { + "epoch": 0.17448453608247422, + "grad_norm": 5.13375997543335, + "learning_rate": 8.34013978347266e-06, + "loss": 0.3392, + "step": 12863 + }, + { + "epoch": 0.1744981009224091, + "grad_norm": 6.976029396057129, + "learning_rate": 8.340002740852406e-06, + "loss": 0.4927, + "step": 12864 + }, + { + "epoch": 0.174511665762344, + "grad_norm": 4.641989231109619, + "learning_rate": 8.33986569823215e-06, + "loss": 0.3517, + "step": 12865 + }, + { + "epoch": 0.1745252306022789, + "grad_norm": 5.197585105895996, + "learning_rate": 8.339728655611896e-06, + "loss": 0.3045, + "step": 12866 + }, + { + "epoch": 0.1745387954422138, + "grad_norm": 5.494890213012695, + "learning_rate": 8.339591612991641e-06, + "loss": 0.3979, + "step": 12867 + }, + { + "epoch": 0.17455236028214868, + "grad_norm": 6.0521745681762695, + "learning_rate": 8.339454570371386e-06, + "loss": 0.2562, + "step": 12868 + }, + { + "epoch": 0.17456592512208355, + "grad_norm": 4.818453311920166, + "learning_rate": 8.339317527751132e-06, + "loss": 0.3382, + "step": 12869 + }, + { + "epoch": 0.17457948996201844, + "grad_norm": 5.755685329437256, + "learning_rate": 8.339180485130877e-06, + "loss": 0.4029, + "step": 12870 + }, + { + "epoch": 0.17459305480195333, + "grad_norm": 6.019900798797607, + "learning_rate": 8.339043442510622e-06, + "loss": 0.3266, + "step": 12871 + }, + { + "epoch": 0.17460661964188823, + "grad_norm": 8.027511596679688, + "learning_rate": 8.338906399890367e-06, + "loss": 0.5076, + "step": 12872 + }, + { + "epoch": 0.17462018448182312, + "grad_norm": 3.613344669342041, + "learning_rate": 8.33876935727011e-06, + "loss": 0.2055, + "step": 12873 + }, + { + "epoch": 0.17463374932175801, + "grad_norm": 5.461398124694824, + "learning_rate": 8.338632314649857e-06, + "loss": 0.3649, + "step": 12874 + }, + { + "epoch": 0.17464731416169288, + "grad_norm": 5.531067371368408, + "learning_rate": 8.338495272029603e-06, + "loss": 0.3233, + "step": 12875 + }, + { + "epoch": 0.17466087900162777, + "grad_norm": 6.7645134925842285, + "learning_rate": 8.338358229409346e-06, + "loss": 0.392, + "step": 12876 + }, + { + "epoch": 0.17467444384156267, + "grad_norm": 4.6227827072143555, + "learning_rate": 8.338221186789091e-06, + "loss": 0.3264, + "step": 12877 + }, + { + "epoch": 0.17468800868149756, + "grad_norm": 6.231930732727051, + "learning_rate": 8.338084144168838e-06, + "loss": 0.3595, + "step": 12878 + }, + { + "epoch": 0.17470157352143245, + "grad_norm": 5.013583660125732, + "learning_rate": 8.337947101548582e-06, + "loss": 0.3668, + "step": 12879 + }, + { + "epoch": 0.17471513836136734, + "grad_norm": 5.565636157989502, + "learning_rate": 8.337810058928327e-06, + "loss": 0.3717, + "step": 12880 + }, + { + "epoch": 0.17472870320130224, + "grad_norm": 4.838046073913574, + "learning_rate": 8.337673016308072e-06, + "loss": 0.2709, + "step": 12881 + }, + { + "epoch": 0.1747422680412371, + "grad_norm": 6.305899143218994, + "learning_rate": 8.337535973687819e-06, + "loss": 0.3852, + "step": 12882 + }, + { + "epoch": 0.174755832881172, + "grad_norm": 6.187739372253418, + "learning_rate": 8.337398931067562e-06, + "loss": 0.2804, + "step": 12883 + }, + { + "epoch": 0.1747693977211069, + "grad_norm": 5.130259037017822, + "learning_rate": 8.337261888447308e-06, + "loss": 0.2329, + "step": 12884 + }, + { + "epoch": 0.17478296256104178, + "grad_norm": 5.821178913116455, + "learning_rate": 8.337124845827053e-06, + "loss": 0.3377, + "step": 12885 + }, + { + "epoch": 0.17479652740097668, + "grad_norm": 5.455888748168945, + "learning_rate": 8.336987803206798e-06, + "loss": 0.2687, + "step": 12886 + }, + { + "epoch": 0.17481009224091157, + "grad_norm": 6.155031204223633, + "learning_rate": 8.336850760586543e-06, + "loss": 0.2468, + "step": 12887 + }, + { + "epoch": 0.17482365708084643, + "grad_norm": 7.157480239868164, + "learning_rate": 8.336713717966288e-06, + "loss": 0.3741, + "step": 12888 + }, + { + "epoch": 0.17483722192078133, + "grad_norm": 4.055916786193848, + "learning_rate": 8.336576675346033e-06, + "loss": 0.3169, + "step": 12889 + }, + { + "epoch": 0.17485078676071622, + "grad_norm": 4.846077919006348, + "learning_rate": 8.336439632725779e-06, + "loss": 0.2739, + "step": 12890 + }, + { + "epoch": 0.1748643516006511, + "grad_norm": 8.250648498535156, + "learning_rate": 8.336302590105524e-06, + "loss": 0.4783, + "step": 12891 + }, + { + "epoch": 0.174877916440586, + "grad_norm": 7.201649188995361, + "learning_rate": 8.336165547485269e-06, + "loss": 0.4161, + "step": 12892 + }, + { + "epoch": 0.1748914812805209, + "grad_norm": 8.115151405334473, + "learning_rate": 8.336028504865014e-06, + "loss": 0.5878, + "step": 12893 + }, + { + "epoch": 0.17490504612045576, + "grad_norm": 6.528184413909912, + "learning_rate": 8.335891462244758e-06, + "loss": 0.3491, + "step": 12894 + }, + { + "epoch": 0.17491861096039066, + "grad_norm": 7.3643317222595215, + "learning_rate": 8.335754419624504e-06, + "loss": 0.4415, + "step": 12895 + }, + { + "epoch": 0.17493217580032555, + "grad_norm": 7.523855686187744, + "learning_rate": 8.33561737700425e-06, + "loss": 0.524, + "step": 12896 + }, + { + "epoch": 0.17494574064026044, + "grad_norm": 4.421506404876709, + "learning_rate": 8.335480334383995e-06, + "loss": 0.2598, + "step": 12897 + }, + { + "epoch": 0.17495930548019534, + "grad_norm": 5.840895175933838, + "learning_rate": 8.335343291763738e-06, + "loss": 0.359, + "step": 12898 + }, + { + "epoch": 0.17497287032013023, + "grad_norm": 8.88006591796875, + "learning_rate": 8.335206249143484e-06, + "loss": 0.6723, + "step": 12899 + }, + { + "epoch": 0.17498643516006512, + "grad_norm": 9.510856628417969, + "learning_rate": 8.33506920652323e-06, + "loss": 0.8365, + "step": 12900 + }, + { + "epoch": 0.175, + "grad_norm": 7.044816493988037, + "learning_rate": 8.334932163902974e-06, + "loss": 0.5499, + "step": 12901 + }, + { + "epoch": 0.17501356483993488, + "grad_norm": 5.338533401489258, + "learning_rate": 8.334795121282719e-06, + "loss": 0.3116, + "step": 12902 + }, + { + "epoch": 0.17502712967986978, + "grad_norm": 7.220023155212402, + "learning_rate": 8.334658078662464e-06, + "loss": 0.4546, + "step": 12903 + }, + { + "epoch": 0.17504069451980467, + "grad_norm": 7.473721981048584, + "learning_rate": 8.33452103604221e-06, + "loss": 0.4457, + "step": 12904 + }, + { + "epoch": 0.17505425935973956, + "grad_norm": 4.8099565505981445, + "learning_rate": 8.334383993421955e-06, + "loss": 0.3118, + "step": 12905 + }, + { + "epoch": 0.17506782419967445, + "grad_norm": 4.696988105773926, + "learning_rate": 8.3342469508017e-06, + "loss": 0.3555, + "step": 12906 + }, + { + "epoch": 0.17508138903960932, + "grad_norm": 7.188471794128418, + "learning_rate": 8.334109908181445e-06, + "loss": 0.4209, + "step": 12907 + }, + { + "epoch": 0.1750949538795442, + "grad_norm": 8.687273025512695, + "learning_rate": 8.33397286556119e-06, + "loss": 0.6166, + "step": 12908 + }, + { + "epoch": 0.1751085187194791, + "grad_norm": 9.270167350769043, + "learning_rate": 8.333835822940935e-06, + "loss": 0.4322, + "step": 12909 + }, + { + "epoch": 0.175122083559414, + "grad_norm": 7.553124904632568, + "learning_rate": 8.33369878032068e-06, + "loss": 0.3244, + "step": 12910 + }, + { + "epoch": 0.1751356483993489, + "grad_norm": 8.151555061340332, + "learning_rate": 8.333561737700426e-06, + "loss": 0.4389, + "step": 12911 + }, + { + "epoch": 0.17514921323928379, + "grad_norm": 7.963907241821289, + "learning_rate": 8.33342469508017e-06, + "loss": 0.4942, + "step": 12912 + }, + { + "epoch": 0.17516277807921868, + "grad_norm": 8.738919258117676, + "learning_rate": 8.333287652459916e-06, + "loss": 0.5212, + "step": 12913 + }, + { + "epoch": 0.17517634291915354, + "grad_norm": 4.85335111618042, + "learning_rate": 8.333150609839661e-06, + "loss": 0.282, + "step": 12914 + }, + { + "epoch": 0.17518990775908844, + "grad_norm": 6.063952922821045, + "learning_rate": 8.333013567219406e-06, + "loss": 0.2961, + "step": 12915 + }, + { + "epoch": 0.17520347259902333, + "grad_norm": 6.207266807556152, + "learning_rate": 8.33287652459915e-06, + "loss": 0.3654, + "step": 12916 + }, + { + "epoch": 0.17521703743895822, + "grad_norm": 5.6325154304504395, + "learning_rate": 8.332739481978897e-06, + "loss": 0.2965, + "step": 12917 + }, + { + "epoch": 0.17523060227889312, + "grad_norm": 7.606105804443359, + "learning_rate": 8.332602439358642e-06, + "loss": 0.4427, + "step": 12918 + }, + { + "epoch": 0.175244167118828, + "grad_norm": 6.686489105224609, + "learning_rate": 8.332465396738385e-06, + "loss": 0.2887, + "step": 12919 + }, + { + "epoch": 0.17525773195876287, + "grad_norm": 8.767486572265625, + "learning_rate": 8.33232835411813e-06, + "loss": 0.3531, + "step": 12920 + }, + { + "epoch": 0.17527129679869777, + "grad_norm": 4.235123157501221, + "learning_rate": 8.332191311497877e-06, + "loss": 0.2376, + "step": 12921 + }, + { + "epoch": 0.17528486163863266, + "grad_norm": 7.002651691436768, + "learning_rate": 8.332054268877623e-06, + "loss": 0.5162, + "step": 12922 + }, + { + "epoch": 0.17529842647856755, + "grad_norm": 6.409977436065674, + "learning_rate": 8.331917226257366e-06, + "loss": 0.3339, + "step": 12923 + }, + { + "epoch": 0.17531199131850245, + "grad_norm": 6.229799747467041, + "learning_rate": 8.331780183637111e-06, + "loss": 0.3468, + "step": 12924 + }, + { + "epoch": 0.17532555615843734, + "grad_norm": 5.518940448760986, + "learning_rate": 8.331643141016858e-06, + "loss": 0.4649, + "step": 12925 + }, + { + "epoch": 0.1753391209983722, + "grad_norm": 8.023058891296387, + "learning_rate": 8.331506098396602e-06, + "loss": 0.3816, + "step": 12926 + }, + { + "epoch": 0.1753526858383071, + "grad_norm": 8.283285140991211, + "learning_rate": 8.331369055776347e-06, + "loss": 0.581, + "step": 12927 + }, + { + "epoch": 0.175366250678242, + "grad_norm": 6.973442554473877, + "learning_rate": 8.331232013156092e-06, + "loss": 0.4056, + "step": 12928 + }, + { + "epoch": 0.17537981551817688, + "grad_norm": 8.889457702636719, + "learning_rate": 8.331094970535837e-06, + "loss": 0.3759, + "step": 12929 + }, + { + "epoch": 0.17539338035811178, + "grad_norm": 7.294217109680176, + "learning_rate": 8.330957927915582e-06, + "loss": 0.317, + "step": 12930 + }, + { + "epoch": 0.17540694519804667, + "grad_norm": 7.017988204956055, + "learning_rate": 8.330820885295328e-06, + "loss": 0.4852, + "step": 12931 + }, + { + "epoch": 0.17542051003798156, + "grad_norm": 7.485454082489014, + "learning_rate": 8.330683842675073e-06, + "loss": 0.5852, + "step": 12932 + }, + { + "epoch": 0.17543407487791643, + "grad_norm": 5.7111005783081055, + "learning_rate": 8.330546800054818e-06, + "loss": 0.3336, + "step": 12933 + }, + { + "epoch": 0.17544763971785132, + "grad_norm": 6.22459077835083, + "learning_rate": 8.330409757434563e-06, + "loss": 0.5296, + "step": 12934 + }, + { + "epoch": 0.17546120455778622, + "grad_norm": 5.9464569091796875, + "learning_rate": 8.330272714814308e-06, + "loss": 0.3297, + "step": 12935 + }, + { + "epoch": 0.1754747693977211, + "grad_norm": 5.929626941680908, + "learning_rate": 8.330135672194053e-06, + "loss": 0.3074, + "step": 12936 + }, + { + "epoch": 0.175488334237656, + "grad_norm": 6.156596660614014, + "learning_rate": 8.329998629573799e-06, + "loss": 0.4888, + "step": 12937 + }, + { + "epoch": 0.1755018990775909, + "grad_norm": 8.22143840789795, + "learning_rate": 8.329861586953544e-06, + "loss": 0.6753, + "step": 12938 + }, + { + "epoch": 0.17551546391752576, + "grad_norm": 7.014125347137451, + "learning_rate": 8.329724544333289e-06, + "loss": 0.5346, + "step": 12939 + }, + { + "epoch": 0.17552902875746065, + "grad_norm": 5.114738941192627, + "learning_rate": 8.329587501713034e-06, + "loss": 0.2616, + "step": 12940 + }, + { + "epoch": 0.17554259359739555, + "grad_norm": 5.361451625823975, + "learning_rate": 8.329450459092778e-06, + "loss": 0.3407, + "step": 12941 + }, + { + "epoch": 0.17555615843733044, + "grad_norm": 7.092883586883545, + "learning_rate": 8.329313416472523e-06, + "loss": 0.4637, + "step": 12942 + }, + { + "epoch": 0.17556972327726533, + "grad_norm": 6.469746112823486, + "learning_rate": 8.32917637385227e-06, + "loss": 0.4326, + "step": 12943 + }, + { + "epoch": 0.17558328811720023, + "grad_norm": 9.504548072814941, + "learning_rate": 8.329039331232013e-06, + "loss": 0.5034, + "step": 12944 + }, + { + "epoch": 0.17559685295713512, + "grad_norm": 8.353370666503906, + "learning_rate": 8.328902288611758e-06, + "loss": 0.4594, + "step": 12945 + }, + { + "epoch": 0.17561041779706998, + "grad_norm": 9.965781211853027, + "learning_rate": 8.328765245991504e-06, + "loss": 0.6635, + "step": 12946 + }, + { + "epoch": 0.17562398263700488, + "grad_norm": 6.0550923347473145, + "learning_rate": 8.328628203371249e-06, + "loss": 0.4362, + "step": 12947 + }, + { + "epoch": 0.17563754747693977, + "grad_norm": 9.894405364990234, + "learning_rate": 8.328491160750994e-06, + "loss": 0.6919, + "step": 12948 + }, + { + "epoch": 0.17565111231687466, + "grad_norm": 8.859478950500488, + "learning_rate": 8.328354118130739e-06, + "loss": 0.5297, + "step": 12949 + }, + { + "epoch": 0.17566467715680956, + "grad_norm": 8.588814735412598, + "learning_rate": 8.328217075510484e-06, + "loss": 0.6634, + "step": 12950 + }, + { + "epoch": 0.17567824199674445, + "grad_norm": 8.92271614074707, + "learning_rate": 8.32808003289023e-06, + "loss": 0.6122, + "step": 12951 + }, + { + "epoch": 0.17569180683667932, + "grad_norm": 6.339015007019043, + "learning_rate": 8.327942990269975e-06, + "loss": 0.4657, + "step": 12952 + }, + { + "epoch": 0.1757053716766142, + "grad_norm": 7.002791404724121, + "learning_rate": 8.32780594764972e-06, + "loss": 0.3838, + "step": 12953 + }, + { + "epoch": 0.1757189365165491, + "grad_norm": 9.95356273651123, + "learning_rate": 8.327668905029465e-06, + "loss": 0.6218, + "step": 12954 + }, + { + "epoch": 0.175732501356484, + "grad_norm": 8.321144104003906, + "learning_rate": 8.32753186240921e-06, + "loss": 0.543, + "step": 12955 + }, + { + "epoch": 0.1757460661964189, + "grad_norm": 6.4534101486206055, + "learning_rate": 8.327394819788955e-06, + "loss": 0.5052, + "step": 12956 + }, + { + "epoch": 0.17575963103635378, + "grad_norm": 6.555189609527588, + "learning_rate": 8.3272577771687e-06, + "loss": 0.3658, + "step": 12957 + }, + { + "epoch": 0.17577319587628865, + "grad_norm": 6.237368106842041, + "learning_rate": 8.327120734548446e-06, + "loss": 0.3522, + "step": 12958 + }, + { + "epoch": 0.17578676071622354, + "grad_norm": 7.193839073181152, + "learning_rate": 8.32698369192819e-06, + "loss": 0.3494, + "step": 12959 + }, + { + "epoch": 0.17580032555615843, + "grad_norm": 5.418681621551514, + "learning_rate": 8.326846649307936e-06, + "loss": 0.3286, + "step": 12960 + }, + { + "epoch": 0.17581389039609333, + "grad_norm": 10.524811744689941, + "learning_rate": 8.326709606687681e-06, + "loss": 0.6981, + "step": 12961 + }, + { + "epoch": 0.17582745523602822, + "grad_norm": 12.622749328613281, + "learning_rate": 8.326572564067425e-06, + "loss": 0.4963, + "step": 12962 + }, + { + "epoch": 0.1758410200759631, + "grad_norm": 7.174727916717529, + "learning_rate": 8.32643552144717e-06, + "loss": 0.5176, + "step": 12963 + }, + { + "epoch": 0.175854584915898, + "grad_norm": 6.778750896453857, + "learning_rate": 8.326298478826917e-06, + "loss": 0.4247, + "step": 12964 + }, + { + "epoch": 0.17586814975583287, + "grad_norm": 8.521376609802246, + "learning_rate": 8.326161436206662e-06, + "loss": 0.4365, + "step": 12965 + }, + { + "epoch": 0.17588171459576776, + "grad_norm": 8.747749328613281, + "learning_rate": 8.326024393586405e-06, + "loss": 0.6582, + "step": 12966 + }, + { + "epoch": 0.17589527943570266, + "grad_norm": 9.162130355834961, + "learning_rate": 8.32588735096615e-06, + "loss": 0.653, + "step": 12967 + }, + { + "epoch": 0.17590884427563755, + "grad_norm": 10.175189018249512, + "learning_rate": 8.325750308345896e-06, + "loss": 0.6086, + "step": 12968 + }, + { + "epoch": 0.17592240911557244, + "grad_norm": 8.475354194641113, + "learning_rate": 8.325613265725641e-06, + "loss": 0.481, + "step": 12969 + }, + { + "epoch": 0.17593597395550734, + "grad_norm": 8.5078125, + "learning_rate": 8.325476223105386e-06, + "loss": 0.4679, + "step": 12970 + }, + { + "epoch": 0.1759495387954422, + "grad_norm": 8.673979759216309, + "learning_rate": 8.325339180485131e-06, + "loss": 0.5088, + "step": 12971 + }, + { + "epoch": 0.1759631036353771, + "grad_norm": 5.474225997924805, + "learning_rate": 8.325202137864876e-06, + "loss": 0.2995, + "step": 12972 + }, + { + "epoch": 0.175976668475312, + "grad_norm": 7.045039176940918, + "learning_rate": 8.325065095244622e-06, + "loss": 0.4558, + "step": 12973 + }, + { + "epoch": 0.17599023331524688, + "grad_norm": 8.438815116882324, + "learning_rate": 8.324928052624367e-06, + "loss": 0.4507, + "step": 12974 + }, + { + "epoch": 0.17600379815518177, + "grad_norm": 5.784244537353516, + "learning_rate": 8.324791010004112e-06, + "loss": 0.3651, + "step": 12975 + }, + { + "epoch": 0.17601736299511667, + "grad_norm": 7.898000240325928, + "learning_rate": 8.324653967383857e-06, + "loss": 0.4585, + "step": 12976 + }, + { + "epoch": 0.17603092783505156, + "grad_norm": 10.631619453430176, + "learning_rate": 8.324516924763602e-06, + "loss": 0.6569, + "step": 12977 + }, + { + "epoch": 0.17604449267498642, + "grad_norm": 9.151053428649902, + "learning_rate": 8.324379882143348e-06, + "loss": 0.4608, + "step": 12978 + }, + { + "epoch": 0.17605805751492132, + "grad_norm": 8.552066802978516, + "learning_rate": 8.324242839523093e-06, + "loss": 0.5957, + "step": 12979 + }, + { + "epoch": 0.1760716223548562, + "grad_norm": 7.635260105133057, + "learning_rate": 8.324105796902838e-06, + "loss": 0.4592, + "step": 12980 + }, + { + "epoch": 0.1760851871947911, + "grad_norm": 8.416001319885254, + "learning_rate": 8.323968754282583e-06, + "loss": 0.59, + "step": 12981 + }, + { + "epoch": 0.176098752034726, + "grad_norm": 8.864763259887695, + "learning_rate": 8.323831711662328e-06, + "loss": 0.4673, + "step": 12982 + }, + { + "epoch": 0.1761123168746609, + "grad_norm": 8.139775276184082, + "learning_rate": 8.323694669042073e-06, + "loss": 0.4627, + "step": 12983 + }, + { + "epoch": 0.17612588171459576, + "grad_norm": 7.506629943847656, + "learning_rate": 8.323557626421817e-06, + "loss": 0.3619, + "step": 12984 + }, + { + "epoch": 0.17613944655453065, + "grad_norm": 8.800199508666992, + "learning_rate": 8.323420583801562e-06, + "loss": 0.5956, + "step": 12985 + }, + { + "epoch": 0.17615301139446554, + "grad_norm": 6.7958598136901855, + "learning_rate": 8.323283541181309e-06, + "loss": 0.3862, + "step": 12986 + }, + { + "epoch": 0.17616657623440043, + "grad_norm": 7.319971561431885, + "learning_rate": 8.323146498561052e-06, + "loss": 0.5324, + "step": 12987 + }, + { + "epoch": 0.17618014107433533, + "grad_norm": 5.040694236755371, + "learning_rate": 8.323009455940798e-06, + "loss": 0.3055, + "step": 12988 + }, + { + "epoch": 0.17619370591427022, + "grad_norm": 5.986558437347412, + "learning_rate": 8.322872413320543e-06, + "loss": 0.4056, + "step": 12989 + }, + { + "epoch": 0.17620727075420511, + "grad_norm": 7.763227939605713, + "learning_rate": 8.32273537070029e-06, + "loss": 0.5384, + "step": 12990 + }, + { + "epoch": 0.17622083559413998, + "grad_norm": 6.375207424163818, + "learning_rate": 8.322598328080033e-06, + "loss": 0.4106, + "step": 12991 + }, + { + "epoch": 0.17623440043407487, + "grad_norm": 7.323101043701172, + "learning_rate": 8.322461285459778e-06, + "loss": 0.4153, + "step": 12992 + }, + { + "epoch": 0.17624796527400977, + "grad_norm": 6.850687026977539, + "learning_rate": 8.322324242839524e-06, + "loss": 0.4047, + "step": 12993 + }, + { + "epoch": 0.17626153011394466, + "grad_norm": 8.306719779968262, + "learning_rate": 8.322187200219269e-06, + "loss": 0.5174, + "step": 12994 + }, + { + "epoch": 0.17627509495387955, + "grad_norm": 6.338250637054443, + "learning_rate": 8.322050157599014e-06, + "loss": 0.3772, + "step": 12995 + }, + { + "epoch": 0.17628865979381445, + "grad_norm": 8.920125007629395, + "learning_rate": 8.321913114978759e-06, + "loss": 0.5088, + "step": 12996 + }, + { + "epoch": 0.1763022246337493, + "grad_norm": 9.234701156616211, + "learning_rate": 8.321776072358504e-06, + "loss": 0.4377, + "step": 12997 + }, + { + "epoch": 0.1763157894736842, + "grad_norm": 6.632273197174072, + "learning_rate": 8.32163902973825e-06, + "loss": 0.5343, + "step": 12998 + }, + { + "epoch": 0.1763293543136191, + "grad_norm": 6.67644739151001, + "learning_rate": 8.321501987117995e-06, + "loss": 0.4533, + "step": 12999 + }, + { + "epoch": 0.176342919153554, + "grad_norm": 8.13897705078125, + "learning_rate": 8.32136494449774e-06, + "loss": 0.6722, + "step": 13000 + }, + { + "epoch": 0.17635648399348888, + "grad_norm": 7.815348148345947, + "learning_rate": 8.321227901877485e-06, + "loss": 0.4354, + "step": 13001 + }, + { + "epoch": 0.17637004883342378, + "grad_norm": 7.5583882331848145, + "learning_rate": 8.321090859257228e-06, + "loss": 0.495, + "step": 13002 + }, + { + "epoch": 0.17638361367335864, + "grad_norm": 5.74264669418335, + "learning_rate": 8.320953816636975e-06, + "loss": 0.342, + "step": 13003 + }, + { + "epoch": 0.17639717851329353, + "grad_norm": 8.932511329650879, + "learning_rate": 8.32081677401672e-06, + "loss": 0.5644, + "step": 13004 + }, + { + "epoch": 0.17641074335322843, + "grad_norm": 6.484068393707275, + "learning_rate": 8.320679731396466e-06, + "loss": 0.3997, + "step": 13005 + }, + { + "epoch": 0.17642430819316332, + "grad_norm": 7.246706008911133, + "learning_rate": 8.32054268877621e-06, + "loss": 0.5168, + "step": 13006 + }, + { + "epoch": 0.1764378730330982, + "grad_norm": 8.7633695602417, + "learning_rate": 8.320405646155956e-06, + "loss": 0.4882, + "step": 13007 + }, + { + "epoch": 0.1764514378730331, + "grad_norm": 6.959556579589844, + "learning_rate": 8.320268603535701e-06, + "loss": 0.499, + "step": 13008 + }, + { + "epoch": 0.176465002712968, + "grad_norm": 7.56294059753418, + "learning_rate": 8.320131560915445e-06, + "loss": 0.5151, + "step": 13009 + }, + { + "epoch": 0.17647856755290287, + "grad_norm": 5.513092041015625, + "learning_rate": 8.31999451829519e-06, + "loss": 0.4229, + "step": 13010 + }, + { + "epoch": 0.17649213239283776, + "grad_norm": 5.8249993324279785, + "learning_rate": 8.319857475674935e-06, + "loss": 0.3065, + "step": 13011 + }, + { + "epoch": 0.17650569723277265, + "grad_norm": 5.866919994354248, + "learning_rate": 8.31972043305468e-06, + "loss": 0.4295, + "step": 13012 + }, + { + "epoch": 0.17651926207270754, + "grad_norm": 7.692643165588379, + "learning_rate": 8.319583390434425e-06, + "loss": 0.5831, + "step": 13013 + }, + { + "epoch": 0.17653282691264244, + "grad_norm": 7.734164237976074, + "learning_rate": 8.31944634781417e-06, + "loss": 0.4475, + "step": 13014 + }, + { + "epoch": 0.17654639175257733, + "grad_norm": 6.407224178314209, + "learning_rate": 8.319309305193916e-06, + "loss": 0.4467, + "step": 13015 + }, + { + "epoch": 0.1765599565925122, + "grad_norm": 7.407998561859131, + "learning_rate": 8.319172262573661e-06, + "loss": 0.4406, + "step": 13016 + }, + { + "epoch": 0.1765735214324471, + "grad_norm": 8.673693656921387, + "learning_rate": 8.319035219953406e-06, + "loss": 0.4754, + "step": 13017 + }, + { + "epoch": 0.17658708627238198, + "grad_norm": 7.574557781219482, + "learning_rate": 8.318898177333151e-06, + "loss": 0.3449, + "step": 13018 + }, + { + "epoch": 0.17660065111231688, + "grad_norm": 6.016320705413818, + "learning_rate": 8.318761134712897e-06, + "loss": 0.3095, + "step": 13019 + }, + { + "epoch": 0.17661421595225177, + "grad_norm": 7.719284534454346, + "learning_rate": 8.318624092092642e-06, + "loss": 0.3816, + "step": 13020 + }, + { + "epoch": 0.17662778079218666, + "grad_norm": 5.60190486907959, + "learning_rate": 8.318487049472387e-06, + "loss": 0.3616, + "step": 13021 + }, + { + "epoch": 0.17664134563212155, + "grad_norm": 7.211787223815918, + "learning_rate": 8.318350006852132e-06, + "loss": 0.3256, + "step": 13022 + }, + { + "epoch": 0.17665491047205642, + "grad_norm": 7.327197074890137, + "learning_rate": 8.318212964231877e-06, + "loss": 0.4517, + "step": 13023 + }, + { + "epoch": 0.1766684753119913, + "grad_norm": 7.942503452301025, + "learning_rate": 8.31807592161162e-06, + "loss": 0.4622, + "step": 13024 + }, + { + "epoch": 0.1766820401519262, + "grad_norm": 9.106606483459473, + "learning_rate": 8.317938878991368e-06, + "loss": 0.4878, + "step": 13025 + }, + { + "epoch": 0.1766956049918611, + "grad_norm": 7.081244468688965, + "learning_rate": 8.317801836371113e-06, + "loss": 0.4753, + "step": 13026 + }, + { + "epoch": 0.176709169831796, + "grad_norm": 8.7333984375, + "learning_rate": 8.317664793750856e-06, + "loss": 0.5851, + "step": 13027 + }, + { + "epoch": 0.17672273467173089, + "grad_norm": 6.207739353179932, + "learning_rate": 8.317527751130601e-06, + "loss": 0.3203, + "step": 13028 + }, + { + "epoch": 0.17673629951166575, + "grad_norm": 9.07207202911377, + "learning_rate": 8.317390708510348e-06, + "loss": 0.3028, + "step": 13029 + }, + { + "epoch": 0.17674986435160064, + "grad_norm": 5.148008346557617, + "learning_rate": 8.317253665890093e-06, + "loss": 0.4325, + "step": 13030 + }, + { + "epoch": 0.17676342919153554, + "grad_norm": 6.308037757873535, + "learning_rate": 8.317116623269837e-06, + "loss": 0.354, + "step": 13031 + }, + { + "epoch": 0.17677699403147043, + "grad_norm": 6.8759050369262695, + "learning_rate": 8.316979580649582e-06, + "loss": 0.532, + "step": 13032 + }, + { + "epoch": 0.17679055887140532, + "grad_norm": 10.157116889953613, + "learning_rate": 8.316842538029329e-06, + "loss": 0.6166, + "step": 13033 + }, + { + "epoch": 0.17680412371134022, + "grad_norm": 4.7756667137146, + "learning_rate": 8.316705495409073e-06, + "loss": 0.3692, + "step": 13034 + }, + { + "epoch": 0.17681768855127508, + "grad_norm": 6.079866886138916, + "learning_rate": 8.316568452788818e-06, + "loss": 0.3907, + "step": 13035 + }, + { + "epoch": 0.17683125339120997, + "grad_norm": 6.434382915496826, + "learning_rate": 8.316431410168563e-06, + "loss": 0.3071, + "step": 13036 + }, + { + "epoch": 0.17684481823114487, + "grad_norm": 4.713453769683838, + "learning_rate": 8.316294367548308e-06, + "loss": 0.3719, + "step": 13037 + }, + { + "epoch": 0.17685838307107976, + "grad_norm": 8.140364646911621, + "learning_rate": 8.316157324928053e-06, + "loss": 0.3484, + "step": 13038 + }, + { + "epoch": 0.17687194791101465, + "grad_norm": 7.408039569854736, + "learning_rate": 8.316020282307798e-06, + "loss": 0.53, + "step": 13039 + }, + { + "epoch": 0.17688551275094955, + "grad_norm": 7.054605484008789, + "learning_rate": 8.315883239687544e-06, + "loss": 0.4324, + "step": 13040 + }, + { + "epoch": 0.17689907759088444, + "grad_norm": 5.1525115966796875, + "learning_rate": 8.315746197067289e-06, + "loss": 0.3279, + "step": 13041 + }, + { + "epoch": 0.1769126424308193, + "grad_norm": 5.208045482635498, + "learning_rate": 8.315609154447034e-06, + "loss": 0.3517, + "step": 13042 + }, + { + "epoch": 0.1769262072707542, + "grad_norm": 7.368453502655029, + "learning_rate": 8.315472111826779e-06, + "loss": 0.5805, + "step": 13043 + }, + { + "epoch": 0.1769397721106891, + "grad_norm": 6.914813995361328, + "learning_rate": 8.315335069206524e-06, + "loss": 0.3966, + "step": 13044 + }, + { + "epoch": 0.17695333695062399, + "grad_norm": 6.107807636260986, + "learning_rate": 8.315198026586268e-06, + "loss": 0.4554, + "step": 13045 + }, + { + "epoch": 0.17696690179055888, + "grad_norm": 5.21540641784668, + "learning_rate": 8.315060983966015e-06, + "loss": 0.4532, + "step": 13046 + }, + { + "epoch": 0.17698046663049377, + "grad_norm": 9.83033275604248, + "learning_rate": 8.31492394134576e-06, + "loss": 0.6199, + "step": 13047 + }, + { + "epoch": 0.17699403147042864, + "grad_norm": 6.395434379577637, + "learning_rate": 8.314786898725505e-06, + "loss": 0.4615, + "step": 13048 + }, + { + "epoch": 0.17700759631036353, + "grad_norm": 6.680382251739502, + "learning_rate": 8.314649856105248e-06, + "loss": 0.4744, + "step": 13049 + }, + { + "epoch": 0.17702116115029842, + "grad_norm": 5.573028564453125, + "learning_rate": 8.314512813484995e-06, + "loss": 0.3873, + "step": 13050 + }, + { + "epoch": 0.17703472599023332, + "grad_norm": 8.164243698120117, + "learning_rate": 8.31437577086474e-06, + "loss": 0.5716, + "step": 13051 + }, + { + "epoch": 0.1770482908301682, + "grad_norm": 7.345139980316162, + "learning_rate": 8.314238728244484e-06, + "loss": 0.5213, + "step": 13052 + }, + { + "epoch": 0.1770618556701031, + "grad_norm": 8.027444839477539, + "learning_rate": 8.31410168562423e-06, + "loss": 0.4728, + "step": 13053 + }, + { + "epoch": 0.177075420510038, + "grad_norm": 6.16116189956665, + "learning_rate": 8.313964643003974e-06, + "loss": 0.4551, + "step": 13054 + }, + { + "epoch": 0.17708898534997286, + "grad_norm": 8.14078140258789, + "learning_rate": 8.31382760038372e-06, + "loss": 0.6069, + "step": 13055 + }, + { + "epoch": 0.17710255018990775, + "grad_norm": 5.7300238609313965, + "learning_rate": 8.313690557763465e-06, + "loss": 0.376, + "step": 13056 + }, + { + "epoch": 0.17711611502984265, + "grad_norm": 4.705436706542969, + "learning_rate": 8.31355351514321e-06, + "loss": 0.2906, + "step": 13057 + }, + { + "epoch": 0.17712967986977754, + "grad_norm": 5.628925800323486, + "learning_rate": 8.313416472522955e-06, + "loss": 0.3805, + "step": 13058 + }, + { + "epoch": 0.17714324470971243, + "grad_norm": 5.881781578063965, + "learning_rate": 8.3132794299027e-06, + "loss": 0.4314, + "step": 13059 + }, + { + "epoch": 0.17715680954964733, + "grad_norm": 4.669981479644775, + "learning_rate": 8.313142387282445e-06, + "loss": 0.3684, + "step": 13060 + }, + { + "epoch": 0.1771703743895822, + "grad_norm": 5.826742649078369, + "learning_rate": 8.31300534466219e-06, + "loss": 0.3829, + "step": 13061 + }, + { + "epoch": 0.17718393922951708, + "grad_norm": 7.520499229431152, + "learning_rate": 8.312868302041936e-06, + "loss": 0.3753, + "step": 13062 + }, + { + "epoch": 0.17719750406945198, + "grad_norm": 6.370738506317139, + "learning_rate": 8.312731259421681e-06, + "loss": 0.291, + "step": 13063 + }, + { + "epoch": 0.17721106890938687, + "grad_norm": 6.643624305725098, + "learning_rate": 8.312594216801426e-06, + "loss": 0.3347, + "step": 13064 + }, + { + "epoch": 0.17722463374932176, + "grad_norm": 8.566998481750488, + "learning_rate": 8.312457174181171e-06, + "loss": 0.374, + "step": 13065 + }, + { + "epoch": 0.17723819858925666, + "grad_norm": 4.449951171875, + "learning_rate": 8.312320131560917e-06, + "loss": 0.2652, + "step": 13066 + }, + { + "epoch": 0.17725176342919152, + "grad_norm": 4.703823089599609, + "learning_rate": 8.31218308894066e-06, + "loss": 0.2592, + "step": 13067 + }, + { + "epoch": 0.17726532826912642, + "grad_norm": 6.153239727020264, + "learning_rate": 8.312046046320407e-06, + "loss": 0.3567, + "step": 13068 + }, + { + "epoch": 0.1772788931090613, + "grad_norm": 4.688961982727051, + "learning_rate": 8.311909003700152e-06, + "loss": 0.2349, + "step": 13069 + }, + { + "epoch": 0.1772924579489962, + "grad_norm": 6.785751819610596, + "learning_rate": 8.311771961079896e-06, + "loss": 0.5831, + "step": 13070 + }, + { + "epoch": 0.1773060227889311, + "grad_norm": 5.617741107940674, + "learning_rate": 8.31163491845964e-06, + "loss": 0.4393, + "step": 13071 + }, + { + "epoch": 0.177319587628866, + "grad_norm": 7.024160861968994, + "learning_rate": 8.311497875839388e-06, + "loss": 0.6515, + "step": 13072 + }, + { + "epoch": 0.17733315246880088, + "grad_norm": 6.589621067047119, + "learning_rate": 8.311360833219133e-06, + "loss": 0.482, + "step": 13073 + }, + { + "epoch": 0.17734671730873575, + "grad_norm": 4.957956314086914, + "learning_rate": 8.311223790598876e-06, + "loss": 0.3288, + "step": 13074 + }, + { + "epoch": 0.17736028214867064, + "grad_norm": 6.541713714599609, + "learning_rate": 8.311086747978621e-06, + "loss": 0.4555, + "step": 13075 + }, + { + "epoch": 0.17737384698860553, + "grad_norm": 4.70494270324707, + "learning_rate": 8.310949705358368e-06, + "loss": 0.4828, + "step": 13076 + }, + { + "epoch": 0.17738741182854043, + "grad_norm": 7.8612236976623535, + "learning_rate": 8.310812662738112e-06, + "loss": 0.4949, + "step": 13077 + }, + { + "epoch": 0.17740097666847532, + "grad_norm": 5.062186241149902, + "learning_rate": 8.310675620117857e-06, + "loss": 0.2566, + "step": 13078 + }, + { + "epoch": 0.1774145415084102, + "grad_norm": 6.2094831466674805, + "learning_rate": 8.310538577497602e-06, + "loss": 0.3328, + "step": 13079 + }, + { + "epoch": 0.17742810634834508, + "grad_norm": 5.788604736328125, + "learning_rate": 8.310401534877347e-06, + "loss": 0.3464, + "step": 13080 + }, + { + "epoch": 0.17744167118827997, + "grad_norm": 5.3021626472473145, + "learning_rate": 8.310264492257093e-06, + "loss": 0.3375, + "step": 13081 + }, + { + "epoch": 0.17745523602821486, + "grad_norm": 6.172534465789795, + "learning_rate": 8.310127449636838e-06, + "loss": 0.3589, + "step": 13082 + }, + { + "epoch": 0.17746880086814976, + "grad_norm": 7.156558513641357, + "learning_rate": 8.309990407016583e-06, + "loss": 0.3455, + "step": 13083 + }, + { + "epoch": 0.17748236570808465, + "grad_norm": 6.784675598144531, + "learning_rate": 8.309853364396328e-06, + "loss": 0.3254, + "step": 13084 + }, + { + "epoch": 0.17749593054801954, + "grad_norm": 5.374565601348877, + "learning_rate": 8.309716321776073e-06, + "loss": 0.4831, + "step": 13085 + }, + { + "epoch": 0.17750949538795444, + "grad_norm": 6.345241546630859, + "learning_rate": 8.309579279155818e-06, + "loss": 0.423, + "step": 13086 + }, + { + "epoch": 0.1775230602278893, + "grad_norm": 6.473802089691162, + "learning_rate": 8.309442236535564e-06, + "loss": 0.3994, + "step": 13087 + }, + { + "epoch": 0.1775366250678242, + "grad_norm": 6.2551798820495605, + "learning_rate": 8.309305193915309e-06, + "loss": 0.3503, + "step": 13088 + }, + { + "epoch": 0.1775501899077591, + "grad_norm": 7.530636787414551, + "learning_rate": 8.309168151295054e-06, + "loss": 0.4936, + "step": 13089 + }, + { + "epoch": 0.17756375474769398, + "grad_norm": 6.3887457847595215, + "learning_rate": 8.309031108674799e-06, + "loss": 0.3389, + "step": 13090 + }, + { + "epoch": 0.17757731958762887, + "grad_norm": 6.722297191619873, + "learning_rate": 8.308894066054544e-06, + "loss": 0.3745, + "step": 13091 + }, + { + "epoch": 0.17759088442756377, + "grad_norm": 5.005014896392822, + "learning_rate": 8.308757023434288e-06, + "loss": 0.2306, + "step": 13092 + }, + { + "epoch": 0.17760444926749863, + "grad_norm": 7.052994251251221, + "learning_rate": 8.308619980814033e-06, + "loss": 0.3604, + "step": 13093 + }, + { + "epoch": 0.17761801410743353, + "grad_norm": 5.664450645446777, + "learning_rate": 8.30848293819378e-06, + "loss": 0.2309, + "step": 13094 + }, + { + "epoch": 0.17763157894736842, + "grad_norm": 7.145622730255127, + "learning_rate": 8.308345895573523e-06, + "loss": 0.4323, + "step": 13095 + }, + { + "epoch": 0.1776451437873033, + "grad_norm": 6.389457702636719, + "learning_rate": 8.308208852953269e-06, + "loss": 0.3924, + "step": 13096 + }, + { + "epoch": 0.1776587086272382, + "grad_norm": 8.94940185546875, + "learning_rate": 8.308071810333014e-06, + "loss": 0.5222, + "step": 13097 + }, + { + "epoch": 0.1776722734671731, + "grad_norm": 5.103153228759766, + "learning_rate": 8.30793476771276e-06, + "loss": 0.2898, + "step": 13098 + }, + { + "epoch": 0.17768583830710796, + "grad_norm": 6.386083602905273, + "learning_rate": 8.307797725092504e-06, + "loss": 0.463, + "step": 13099 + }, + { + "epoch": 0.17769940314704286, + "grad_norm": 5.296080589294434, + "learning_rate": 8.30766068247225e-06, + "loss": 0.3132, + "step": 13100 + }, + { + "epoch": 0.17771296798697775, + "grad_norm": 5.479735851287842, + "learning_rate": 8.307523639851994e-06, + "loss": 0.3573, + "step": 13101 + }, + { + "epoch": 0.17772653282691264, + "grad_norm": 6.868685722351074, + "learning_rate": 8.30738659723174e-06, + "loss": 0.2406, + "step": 13102 + }, + { + "epoch": 0.17774009766684754, + "grad_norm": 6.822082996368408, + "learning_rate": 8.307249554611485e-06, + "loss": 0.4033, + "step": 13103 + }, + { + "epoch": 0.17775366250678243, + "grad_norm": 6.337291240692139, + "learning_rate": 8.30711251199123e-06, + "loss": 0.4173, + "step": 13104 + }, + { + "epoch": 0.17776722734671732, + "grad_norm": 5.213181495666504, + "learning_rate": 8.306975469370975e-06, + "loss": 0.4728, + "step": 13105 + }, + { + "epoch": 0.1777807921866522, + "grad_norm": 4.787171363830566, + "learning_rate": 8.30683842675072e-06, + "loss": 0.2794, + "step": 13106 + }, + { + "epoch": 0.17779435702658708, + "grad_norm": 7.756700038909912, + "learning_rate": 8.306701384130465e-06, + "loss": 0.4752, + "step": 13107 + }, + { + "epoch": 0.17780792186652197, + "grad_norm": 6.070430755615234, + "learning_rate": 8.30656434151021e-06, + "loss": 0.3185, + "step": 13108 + }, + { + "epoch": 0.17782148670645687, + "grad_norm": 5.5103278160095215, + "learning_rate": 8.306427298889956e-06, + "loss": 0.3623, + "step": 13109 + }, + { + "epoch": 0.17783505154639176, + "grad_norm": 5.835883140563965, + "learning_rate": 8.3062902562697e-06, + "loss": 0.3173, + "step": 13110 + }, + { + "epoch": 0.17784861638632665, + "grad_norm": 7.5652995109558105, + "learning_rate": 8.306153213649446e-06, + "loss": 0.4268, + "step": 13111 + }, + { + "epoch": 0.17786218122626152, + "grad_norm": 4.72974967956543, + "learning_rate": 8.306016171029191e-06, + "loss": 0.308, + "step": 13112 + }, + { + "epoch": 0.1778757460661964, + "grad_norm": 5.026423931121826, + "learning_rate": 8.305879128408937e-06, + "loss": 0.2834, + "step": 13113 + }, + { + "epoch": 0.1778893109061313, + "grad_norm": 5.699146270751953, + "learning_rate": 8.30574208578868e-06, + "loss": 0.2917, + "step": 13114 + }, + { + "epoch": 0.1779028757460662, + "grad_norm": 5.362853527069092, + "learning_rate": 8.305605043168427e-06, + "loss": 0.3426, + "step": 13115 + }, + { + "epoch": 0.1779164405860011, + "grad_norm": 5.250030994415283, + "learning_rate": 8.305468000548172e-06, + "loss": 0.3399, + "step": 13116 + }, + { + "epoch": 0.17793000542593598, + "grad_norm": 8.202091217041016, + "learning_rate": 8.305330957927916e-06, + "loss": 0.2971, + "step": 13117 + }, + { + "epoch": 0.17794357026587088, + "grad_norm": 5.582093238830566, + "learning_rate": 8.30519391530766e-06, + "loss": 0.337, + "step": 13118 + }, + { + "epoch": 0.17795713510580574, + "grad_norm": 5.976607322692871, + "learning_rate": 8.305056872687408e-06, + "loss": 0.3506, + "step": 13119 + }, + { + "epoch": 0.17797069994574063, + "grad_norm": 4.778857231140137, + "learning_rate": 8.304919830067151e-06, + "loss": 0.2691, + "step": 13120 + }, + { + "epoch": 0.17798426478567553, + "grad_norm": 4.31061315536499, + "learning_rate": 8.304782787446896e-06, + "loss": 0.1931, + "step": 13121 + }, + { + "epoch": 0.17799782962561042, + "grad_norm": 6.252143383026123, + "learning_rate": 8.304645744826641e-06, + "loss": 0.3062, + "step": 13122 + }, + { + "epoch": 0.17801139446554531, + "grad_norm": 5.720866680145264, + "learning_rate": 8.304508702206387e-06, + "loss": 0.3174, + "step": 13123 + }, + { + "epoch": 0.1780249593054802, + "grad_norm": 5.089187145233154, + "learning_rate": 8.304371659586132e-06, + "loss": 0.2611, + "step": 13124 + }, + { + "epoch": 0.17803852414541507, + "grad_norm": 6.717165470123291, + "learning_rate": 8.304234616965877e-06, + "loss": 0.4421, + "step": 13125 + }, + { + "epoch": 0.17805208898534997, + "grad_norm": 7.118844985961914, + "learning_rate": 8.304097574345622e-06, + "loss": 0.3877, + "step": 13126 + }, + { + "epoch": 0.17806565382528486, + "grad_norm": 5.551808834075928, + "learning_rate": 8.303960531725367e-06, + "loss": 0.3428, + "step": 13127 + }, + { + "epoch": 0.17807921866521975, + "grad_norm": 6.647404670715332, + "learning_rate": 8.303823489105113e-06, + "loss": 0.3479, + "step": 13128 + }, + { + "epoch": 0.17809278350515464, + "grad_norm": 4.055004596710205, + "learning_rate": 8.303686446484858e-06, + "loss": 0.2145, + "step": 13129 + }, + { + "epoch": 0.17810634834508954, + "grad_norm": 4.931039333343506, + "learning_rate": 8.303549403864603e-06, + "loss": 0.3292, + "step": 13130 + }, + { + "epoch": 0.1781199131850244, + "grad_norm": 7.746970176696777, + "learning_rate": 8.303412361244348e-06, + "loss": 0.541, + "step": 13131 + }, + { + "epoch": 0.1781334780249593, + "grad_norm": 5.55645227432251, + "learning_rate": 8.303275318624093e-06, + "loss": 0.4753, + "step": 13132 + }, + { + "epoch": 0.1781470428648942, + "grad_norm": 5.471778392791748, + "learning_rate": 8.303138276003838e-06, + "loss": 0.4064, + "step": 13133 + }, + { + "epoch": 0.17816060770482908, + "grad_norm": 6.076466083526611, + "learning_rate": 8.303001233383584e-06, + "loss": 0.2367, + "step": 13134 + }, + { + "epoch": 0.17817417254476398, + "grad_norm": 5.447292804718018, + "learning_rate": 8.302864190763327e-06, + "loss": 0.3875, + "step": 13135 + }, + { + "epoch": 0.17818773738469887, + "grad_norm": 4.641476631164551, + "learning_rate": 8.302727148143072e-06, + "loss": 0.273, + "step": 13136 + }, + { + "epoch": 0.17820130222463376, + "grad_norm": 8.35655689239502, + "learning_rate": 8.302590105522819e-06, + "loss": 0.5542, + "step": 13137 + }, + { + "epoch": 0.17821486706456863, + "grad_norm": 6.162149429321289, + "learning_rate": 8.302453062902563e-06, + "loss": 0.3637, + "step": 13138 + }, + { + "epoch": 0.17822843190450352, + "grad_norm": 5.2710747718811035, + "learning_rate": 8.302316020282308e-06, + "loss": 0.3365, + "step": 13139 + }, + { + "epoch": 0.1782419967444384, + "grad_norm": 5.180593490600586, + "learning_rate": 8.302178977662053e-06, + "loss": 0.3545, + "step": 13140 + }, + { + "epoch": 0.1782555615843733, + "grad_norm": 7.463088512420654, + "learning_rate": 8.3020419350418e-06, + "loss": 0.4254, + "step": 13141 + }, + { + "epoch": 0.1782691264243082, + "grad_norm": 5.0617547035217285, + "learning_rate": 8.301904892421543e-06, + "loss": 0.3269, + "step": 13142 + }, + { + "epoch": 0.1782826912642431, + "grad_norm": 6.850475788116455, + "learning_rate": 8.301767849801289e-06, + "loss": 0.4067, + "step": 13143 + }, + { + "epoch": 0.17829625610417796, + "grad_norm": 5.490926265716553, + "learning_rate": 8.301630807181034e-06, + "loss": 0.2975, + "step": 13144 + }, + { + "epoch": 0.17830982094411285, + "grad_norm": 6.340959548950195, + "learning_rate": 8.301493764560779e-06, + "loss": 0.3675, + "step": 13145 + }, + { + "epoch": 0.17832338578404774, + "grad_norm": 4.742635726928711, + "learning_rate": 8.301356721940524e-06, + "loss": 0.3878, + "step": 13146 + }, + { + "epoch": 0.17833695062398264, + "grad_norm": 6.060959339141846, + "learning_rate": 8.30121967932027e-06, + "loss": 0.334, + "step": 13147 + }, + { + "epoch": 0.17835051546391753, + "grad_norm": 6.865283966064453, + "learning_rate": 8.301082636700014e-06, + "loss": 0.4233, + "step": 13148 + }, + { + "epoch": 0.17836408030385242, + "grad_norm": 6.795452117919922, + "learning_rate": 8.30094559407976e-06, + "loss": 0.265, + "step": 13149 + }, + { + "epoch": 0.17837764514378732, + "grad_norm": 6.739559650421143, + "learning_rate": 8.300808551459505e-06, + "loss": 0.4435, + "step": 13150 + }, + { + "epoch": 0.17839120998372218, + "grad_norm": 5.289524078369141, + "learning_rate": 8.30067150883925e-06, + "loss": 0.208, + "step": 13151 + }, + { + "epoch": 0.17840477482365708, + "grad_norm": 7.498579502105713, + "learning_rate": 8.300534466218995e-06, + "loss": 0.4791, + "step": 13152 + }, + { + "epoch": 0.17841833966359197, + "grad_norm": 5.634328842163086, + "learning_rate": 8.300397423598739e-06, + "loss": 0.2654, + "step": 13153 + }, + { + "epoch": 0.17843190450352686, + "grad_norm": 5.4753522872924805, + "learning_rate": 8.300260380978485e-06, + "loss": 0.3379, + "step": 13154 + }, + { + "epoch": 0.17844546934346175, + "grad_norm": 6.037502288818359, + "learning_rate": 8.30012333835823e-06, + "loss": 0.3491, + "step": 13155 + }, + { + "epoch": 0.17845903418339665, + "grad_norm": 7.207313537597656, + "learning_rate": 8.299986295737976e-06, + "loss": 0.4674, + "step": 13156 + }, + { + "epoch": 0.1784725990233315, + "grad_norm": 5.681244850158691, + "learning_rate": 8.29984925311772e-06, + "loss": 0.2983, + "step": 13157 + }, + { + "epoch": 0.1784861638632664, + "grad_norm": 5.885179042816162, + "learning_rate": 8.299712210497466e-06, + "loss": 0.2461, + "step": 13158 + }, + { + "epoch": 0.1784997287032013, + "grad_norm": 5.794534206390381, + "learning_rate": 8.299575167877211e-06, + "loss": 0.3352, + "step": 13159 + }, + { + "epoch": 0.1785132935431362, + "grad_norm": 5.561667442321777, + "learning_rate": 8.299438125256955e-06, + "loss": 0.2939, + "step": 13160 + }, + { + "epoch": 0.17852685838307109, + "grad_norm": 7.384519100189209, + "learning_rate": 8.2993010826367e-06, + "loss": 0.3036, + "step": 13161 + }, + { + "epoch": 0.17854042322300598, + "grad_norm": 5.216132640838623, + "learning_rate": 8.299164040016445e-06, + "loss": 0.3759, + "step": 13162 + }, + { + "epoch": 0.17855398806294084, + "grad_norm": 3.8409423828125, + "learning_rate": 8.29902699739619e-06, + "loss": 0.2171, + "step": 13163 + }, + { + "epoch": 0.17856755290287574, + "grad_norm": 6.331295967102051, + "learning_rate": 8.298889954775936e-06, + "loss": 0.3545, + "step": 13164 + }, + { + "epoch": 0.17858111774281063, + "grad_norm": 7.044109344482422, + "learning_rate": 8.29875291215568e-06, + "loss": 0.4882, + "step": 13165 + }, + { + "epoch": 0.17859468258274552, + "grad_norm": 5.7450079917907715, + "learning_rate": 8.298615869535426e-06, + "loss": 0.3476, + "step": 13166 + }, + { + "epoch": 0.17860824742268042, + "grad_norm": 7.1170501708984375, + "learning_rate": 8.298478826915171e-06, + "loss": 0.4995, + "step": 13167 + }, + { + "epoch": 0.1786218122626153, + "grad_norm": 6.007618427276611, + "learning_rate": 8.298341784294916e-06, + "loss": 0.2493, + "step": 13168 + }, + { + "epoch": 0.1786353771025502, + "grad_norm": 6.735451698303223, + "learning_rate": 8.298204741674661e-06, + "loss": 0.555, + "step": 13169 + }, + { + "epoch": 0.17864894194248507, + "grad_norm": 6.455972671508789, + "learning_rate": 8.298067699054407e-06, + "loss": 0.3334, + "step": 13170 + }, + { + "epoch": 0.17866250678241996, + "grad_norm": 4.934111595153809, + "learning_rate": 8.297930656434152e-06, + "loss": 0.2377, + "step": 13171 + }, + { + "epoch": 0.17867607162235485, + "grad_norm": 6.3308258056640625, + "learning_rate": 8.297793613813897e-06, + "loss": 0.2996, + "step": 13172 + }, + { + "epoch": 0.17868963646228975, + "grad_norm": 3.6330955028533936, + "learning_rate": 8.297656571193642e-06, + "loss": 0.2157, + "step": 13173 + }, + { + "epoch": 0.17870320130222464, + "grad_norm": 4.617695331573486, + "learning_rate": 8.297519528573387e-06, + "loss": 0.2426, + "step": 13174 + }, + { + "epoch": 0.17871676614215953, + "grad_norm": 11.032210350036621, + "learning_rate": 8.297382485953131e-06, + "loss": 0.5115, + "step": 13175 + }, + { + "epoch": 0.1787303309820944, + "grad_norm": 3.9100003242492676, + "learning_rate": 8.297245443332878e-06, + "loss": 0.2289, + "step": 13176 + }, + { + "epoch": 0.1787438958220293, + "grad_norm": 5.566987991333008, + "learning_rate": 8.297108400712623e-06, + "loss": 0.2165, + "step": 13177 + }, + { + "epoch": 0.17875746066196418, + "grad_norm": 8.483139991760254, + "learning_rate": 8.296971358092366e-06, + "loss": 0.3468, + "step": 13178 + }, + { + "epoch": 0.17877102550189908, + "grad_norm": 6.2520670890808105, + "learning_rate": 8.296834315472112e-06, + "loss": 0.3655, + "step": 13179 + }, + { + "epoch": 0.17878459034183397, + "grad_norm": 6.283144474029541, + "learning_rate": 8.296697272851858e-06, + "loss": 0.2878, + "step": 13180 + }, + { + "epoch": 0.17879815518176886, + "grad_norm": 4.4804606437683105, + "learning_rate": 8.296560230231604e-06, + "loss": 0.2777, + "step": 13181 + }, + { + "epoch": 0.17881172002170376, + "grad_norm": 4.90466833114624, + "learning_rate": 8.296423187611347e-06, + "loss": 0.2441, + "step": 13182 + }, + { + "epoch": 0.17882528486163862, + "grad_norm": 4.776248455047607, + "learning_rate": 8.296286144991092e-06, + "loss": 0.2449, + "step": 13183 + }, + { + "epoch": 0.17883884970157352, + "grad_norm": 6.956377983093262, + "learning_rate": 8.29614910237084e-06, + "loss": 0.4217, + "step": 13184 + }, + { + "epoch": 0.1788524145415084, + "grad_norm": 5.875592231750488, + "learning_rate": 8.296012059750583e-06, + "loss": 0.3485, + "step": 13185 + }, + { + "epoch": 0.1788659793814433, + "grad_norm": 3.9463887214660645, + "learning_rate": 8.295875017130328e-06, + "loss": 0.2183, + "step": 13186 + }, + { + "epoch": 0.1788795442213782, + "grad_norm": 5.74333381652832, + "learning_rate": 8.295737974510073e-06, + "loss": 0.2902, + "step": 13187 + }, + { + "epoch": 0.1788931090613131, + "grad_norm": 6.766179084777832, + "learning_rate": 8.295600931889818e-06, + "loss": 0.3393, + "step": 13188 + }, + { + "epoch": 0.17890667390124795, + "grad_norm": 7.615203380584717, + "learning_rate": 8.295463889269563e-06, + "loss": 0.4098, + "step": 13189 + }, + { + "epoch": 0.17892023874118285, + "grad_norm": 6.166472434997559, + "learning_rate": 8.295326846649309e-06, + "loss": 0.3614, + "step": 13190 + }, + { + "epoch": 0.17893380358111774, + "grad_norm": 5.676845073699951, + "learning_rate": 8.295189804029054e-06, + "loss": 0.2826, + "step": 13191 + }, + { + "epoch": 0.17894736842105263, + "grad_norm": 5.299057960510254, + "learning_rate": 8.295052761408799e-06, + "loss": 0.3, + "step": 13192 + }, + { + "epoch": 0.17896093326098753, + "grad_norm": 7.072418212890625, + "learning_rate": 8.294915718788544e-06, + "loss": 0.2701, + "step": 13193 + }, + { + "epoch": 0.17897449810092242, + "grad_norm": 4.50159215927124, + "learning_rate": 8.29477867616829e-06, + "loss": 0.2127, + "step": 13194 + }, + { + "epoch": 0.17898806294085728, + "grad_norm": 4.467145919799805, + "learning_rate": 8.294641633548034e-06, + "loss": 0.2696, + "step": 13195 + }, + { + "epoch": 0.17900162778079218, + "grad_norm": 8.206402778625488, + "learning_rate": 8.29450459092778e-06, + "loss": 0.4382, + "step": 13196 + }, + { + "epoch": 0.17901519262072707, + "grad_norm": 7.7242112159729, + "learning_rate": 8.294367548307525e-06, + "loss": 0.3375, + "step": 13197 + }, + { + "epoch": 0.17902875746066196, + "grad_norm": 6.201431751251221, + "learning_rate": 8.29423050568727e-06, + "loss": 0.2993, + "step": 13198 + }, + { + "epoch": 0.17904232230059686, + "grad_norm": 6.488588809967041, + "learning_rate": 8.294093463067015e-06, + "loss": 0.429, + "step": 13199 + }, + { + "epoch": 0.17905588714053175, + "grad_norm": 5.425548553466797, + "learning_rate": 8.293956420446759e-06, + "loss": 0.3206, + "step": 13200 + }, + { + "epoch": 0.17906945198046664, + "grad_norm": 5.4782586097717285, + "learning_rate": 8.293819377826506e-06, + "loss": 0.2869, + "step": 13201 + }, + { + "epoch": 0.1790830168204015, + "grad_norm": 4.314181327819824, + "learning_rate": 8.29368233520625e-06, + "loss": 0.1809, + "step": 13202 + }, + { + "epoch": 0.1790965816603364, + "grad_norm": 8.179025650024414, + "learning_rate": 8.293545292585994e-06, + "loss": 0.4249, + "step": 13203 + }, + { + "epoch": 0.1791101465002713, + "grad_norm": 7.34528112411499, + "learning_rate": 8.29340824996574e-06, + "loss": 0.4169, + "step": 13204 + }, + { + "epoch": 0.1791237113402062, + "grad_norm": 6.492861270904541, + "learning_rate": 8.293271207345485e-06, + "loss": 0.3412, + "step": 13205 + }, + { + "epoch": 0.17913727618014108, + "grad_norm": 8.230840682983398, + "learning_rate": 8.293134164725231e-06, + "loss": 0.3483, + "step": 13206 + }, + { + "epoch": 0.17915084102007597, + "grad_norm": 7.059435844421387, + "learning_rate": 8.292997122104975e-06, + "loss": 0.3967, + "step": 13207 + }, + { + "epoch": 0.17916440586001084, + "grad_norm": 6.1296539306640625, + "learning_rate": 8.29286007948472e-06, + "loss": 0.2832, + "step": 13208 + }, + { + "epoch": 0.17917797069994573, + "grad_norm": 7.43215274810791, + "learning_rate": 8.292723036864465e-06, + "loss": 0.3847, + "step": 13209 + }, + { + "epoch": 0.17919153553988063, + "grad_norm": 5.258502960205078, + "learning_rate": 8.29258599424421e-06, + "loss": 0.298, + "step": 13210 + }, + { + "epoch": 0.17920510037981552, + "grad_norm": 6.6519246101379395, + "learning_rate": 8.292448951623956e-06, + "loss": 0.2433, + "step": 13211 + }, + { + "epoch": 0.1792186652197504, + "grad_norm": 7.191584587097168, + "learning_rate": 8.2923119090037e-06, + "loss": 0.3914, + "step": 13212 + }, + { + "epoch": 0.1792322300596853, + "grad_norm": 6.547895908355713, + "learning_rate": 8.292174866383446e-06, + "loss": 0.3186, + "step": 13213 + }, + { + "epoch": 0.1792457948996202, + "grad_norm": 5.360866069793701, + "learning_rate": 8.292037823763191e-06, + "loss": 0.3026, + "step": 13214 + }, + { + "epoch": 0.17925935973955506, + "grad_norm": 5.327215671539307, + "learning_rate": 8.291900781142936e-06, + "loss": 0.2884, + "step": 13215 + }, + { + "epoch": 0.17927292457948996, + "grad_norm": 8.168344497680664, + "learning_rate": 8.291763738522682e-06, + "loss": 0.4908, + "step": 13216 + }, + { + "epoch": 0.17928648941942485, + "grad_norm": 5.517652988433838, + "learning_rate": 8.291626695902427e-06, + "loss": 0.3597, + "step": 13217 + }, + { + "epoch": 0.17930005425935974, + "grad_norm": 6.8577165603637695, + "learning_rate": 8.29148965328217e-06, + "loss": 0.2471, + "step": 13218 + }, + { + "epoch": 0.17931361909929464, + "grad_norm": 5.977726936340332, + "learning_rate": 8.291352610661917e-06, + "loss": 0.2949, + "step": 13219 + }, + { + "epoch": 0.17932718393922953, + "grad_norm": 6.059152603149414, + "learning_rate": 8.291215568041662e-06, + "loss": 0.3301, + "step": 13220 + }, + { + "epoch": 0.1793407487791644, + "grad_norm": 5.540793418884277, + "learning_rate": 8.291078525421407e-06, + "loss": 0.265, + "step": 13221 + }, + { + "epoch": 0.1793543136190993, + "grad_norm": 5.620658874511719, + "learning_rate": 8.290941482801151e-06, + "loss": 0.2708, + "step": 13222 + }, + { + "epoch": 0.17936787845903418, + "grad_norm": 5.374488353729248, + "learning_rate": 8.290804440180898e-06, + "loss": 0.2279, + "step": 13223 + }, + { + "epoch": 0.17938144329896907, + "grad_norm": 5.013862609863281, + "learning_rate": 8.290667397560643e-06, + "loss": 0.2183, + "step": 13224 + }, + { + "epoch": 0.17939500813890397, + "grad_norm": 7.901904582977295, + "learning_rate": 8.290530354940386e-06, + "loss": 0.3883, + "step": 13225 + }, + { + "epoch": 0.17940857297883886, + "grad_norm": 5.149357795715332, + "learning_rate": 8.290393312320132e-06, + "loss": 0.3128, + "step": 13226 + }, + { + "epoch": 0.17942213781877372, + "grad_norm": 9.684961318969727, + "learning_rate": 8.290256269699878e-06, + "loss": 0.3542, + "step": 13227 + }, + { + "epoch": 0.17943570265870862, + "grad_norm": 5.001236438751221, + "learning_rate": 8.290119227079622e-06, + "loss": 0.2655, + "step": 13228 + }, + { + "epoch": 0.1794492674986435, + "grad_norm": 4.529270648956299, + "learning_rate": 8.289982184459367e-06, + "loss": 0.2364, + "step": 13229 + }, + { + "epoch": 0.1794628323385784, + "grad_norm": 6.121013164520264, + "learning_rate": 8.289845141839112e-06, + "loss": 0.3258, + "step": 13230 + }, + { + "epoch": 0.1794763971785133, + "grad_norm": 5.930697917938232, + "learning_rate": 8.289708099218857e-06, + "loss": 0.3118, + "step": 13231 + }, + { + "epoch": 0.1794899620184482, + "grad_norm": 5.235206127166748, + "learning_rate": 8.289571056598603e-06, + "loss": 0.2921, + "step": 13232 + }, + { + "epoch": 0.17950352685838308, + "grad_norm": 5.6392107009887695, + "learning_rate": 8.289434013978348e-06, + "loss": 0.2467, + "step": 13233 + }, + { + "epoch": 0.17951709169831795, + "grad_norm": 7.18669319152832, + "learning_rate": 8.289296971358093e-06, + "loss": 0.3509, + "step": 13234 + }, + { + "epoch": 0.17953065653825284, + "grad_norm": 4.676509857177734, + "learning_rate": 8.289159928737838e-06, + "loss": 0.1876, + "step": 13235 + }, + { + "epoch": 0.17954422137818773, + "grad_norm": 4.536855220794678, + "learning_rate": 8.289022886117583e-06, + "loss": 0.1775, + "step": 13236 + }, + { + "epoch": 0.17955778621812263, + "grad_norm": 6.356355667114258, + "learning_rate": 8.288885843497329e-06, + "loss": 0.264, + "step": 13237 + }, + { + "epoch": 0.17957135105805752, + "grad_norm": 5.05686092376709, + "learning_rate": 8.288748800877074e-06, + "loss": 0.2087, + "step": 13238 + }, + { + "epoch": 0.17958491589799241, + "grad_norm": 4.724864482879639, + "learning_rate": 8.288611758256819e-06, + "loss": 0.1778, + "step": 13239 + }, + { + "epoch": 0.17959848073792728, + "grad_norm": 6.525926113128662, + "learning_rate": 8.288474715636564e-06, + "loss": 0.4824, + "step": 13240 + }, + { + "epoch": 0.17961204557786217, + "grad_norm": 5.390065670013428, + "learning_rate": 8.28833767301631e-06, + "loss": 0.3101, + "step": 13241 + }, + { + "epoch": 0.17962561041779707, + "grad_norm": 6.022335529327393, + "learning_rate": 8.288200630396054e-06, + "loss": 0.3515, + "step": 13242 + }, + { + "epoch": 0.17963917525773196, + "grad_norm": 5.025282859802246, + "learning_rate": 8.288063587775798e-06, + "loss": 0.2728, + "step": 13243 + }, + { + "epoch": 0.17965274009766685, + "grad_norm": 7.592372417449951, + "learning_rate": 8.287926545155543e-06, + "loss": 0.3538, + "step": 13244 + }, + { + "epoch": 0.17966630493760175, + "grad_norm": 6.4966840744018555, + "learning_rate": 8.28778950253529e-06, + "loss": 0.5529, + "step": 13245 + }, + { + "epoch": 0.17967986977753664, + "grad_norm": 7.039313316345215, + "learning_rate": 8.287652459915033e-06, + "loss": 0.3105, + "step": 13246 + }, + { + "epoch": 0.1796934346174715, + "grad_norm": 6.192300319671631, + "learning_rate": 8.287515417294779e-06, + "loss": 0.271, + "step": 13247 + }, + { + "epoch": 0.1797069994574064, + "grad_norm": 5.676523685455322, + "learning_rate": 8.287378374674524e-06, + "loss": 0.3759, + "step": 13248 + }, + { + "epoch": 0.1797205642973413, + "grad_norm": 6.188041687011719, + "learning_rate": 8.28724133205427e-06, + "loss": 0.3815, + "step": 13249 + }, + { + "epoch": 0.17973412913727618, + "grad_norm": 4.152912616729736, + "learning_rate": 8.287104289434014e-06, + "loss": 0.206, + "step": 13250 + }, + { + "epoch": 0.17974769397721108, + "grad_norm": 4.847492694854736, + "learning_rate": 8.28696724681376e-06, + "loss": 0.205, + "step": 13251 + }, + { + "epoch": 0.17976125881714597, + "grad_norm": 7.573049068450928, + "learning_rate": 8.286830204193505e-06, + "loss": 0.3377, + "step": 13252 + }, + { + "epoch": 0.17977482365708083, + "grad_norm": 7.443642616271973, + "learning_rate": 8.28669316157325e-06, + "loss": 0.4954, + "step": 13253 + }, + { + "epoch": 0.17978838849701573, + "grad_norm": 9.039201736450195, + "learning_rate": 8.286556118952995e-06, + "loss": 0.4678, + "step": 13254 + }, + { + "epoch": 0.17980195333695062, + "grad_norm": 8.418996810913086, + "learning_rate": 8.28641907633274e-06, + "loss": 0.4912, + "step": 13255 + }, + { + "epoch": 0.1798155181768855, + "grad_norm": 6.913078308105469, + "learning_rate": 8.286282033712485e-06, + "loss": 0.2923, + "step": 13256 + }, + { + "epoch": 0.1798290830168204, + "grad_norm": 6.835611820220947, + "learning_rate": 8.28614499109223e-06, + "loss": 0.4587, + "step": 13257 + }, + { + "epoch": 0.1798426478567553, + "grad_norm": 6.993625164031982, + "learning_rate": 8.286007948471976e-06, + "loss": 0.439, + "step": 13258 + }, + { + "epoch": 0.17985621269669017, + "grad_norm": 7.5877580642700195, + "learning_rate": 8.28587090585172e-06, + "loss": 0.4995, + "step": 13259 + }, + { + "epoch": 0.17986977753662506, + "grad_norm": 5.636757850646973, + "learning_rate": 8.285733863231466e-06, + "loss": 0.3494, + "step": 13260 + }, + { + "epoch": 0.17988334237655995, + "grad_norm": 5.284242630004883, + "learning_rate": 8.28559682061121e-06, + "loss": 0.3517, + "step": 13261 + }, + { + "epoch": 0.17989690721649484, + "grad_norm": 5.793546199798584, + "learning_rate": 8.285459777990956e-06, + "loss": 0.2645, + "step": 13262 + }, + { + "epoch": 0.17991047205642974, + "grad_norm": 5.0826802253723145, + "learning_rate": 8.285322735370702e-06, + "loss": 0.2995, + "step": 13263 + }, + { + "epoch": 0.17992403689636463, + "grad_norm": 6.5742387771606445, + "learning_rate": 8.285185692750447e-06, + "loss": 0.3805, + "step": 13264 + }, + { + "epoch": 0.17993760173629952, + "grad_norm": 7.12569522857666, + "learning_rate": 8.28504865013019e-06, + "loss": 0.5254, + "step": 13265 + }, + { + "epoch": 0.1799511665762344, + "grad_norm": 6.938061237335205, + "learning_rate": 8.284911607509937e-06, + "loss": 0.4462, + "step": 13266 + }, + { + "epoch": 0.17996473141616928, + "grad_norm": 6.389140605926514, + "learning_rate": 8.284774564889682e-06, + "loss": 0.3348, + "step": 13267 + }, + { + "epoch": 0.17997829625610418, + "grad_norm": 7.490574836730957, + "learning_rate": 8.284637522269426e-06, + "loss": 0.4882, + "step": 13268 + }, + { + "epoch": 0.17999186109603907, + "grad_norm": 6.80682897567749, + "learning_rate": 8.284500479649171e-06, + "loss": 0.3705, + "step": 13269 + }, + { + "epoch": 0.18000542593597396, + "grad_norm": 5.868730068206787, + "learning_rate": 8.284363437028918e-06, + "loss": 0.3697, + "step": 13270 + }, + { + "epoch": 0.18001899077590885, + "grad_norm": 6.601104736328125, + "learning_rate": 8.284226394408661e-06, + "loss": 0.3803, + "step": 13271 + }, + { + "epoch": 0.18003255561584372, + "grad_norm": 6.562840461730957, + "learning_rate": 8.284089351788406e-06, + "loss": 0.3835, + "step": 13272 + }, + { + "epoch": 0.1800461204557786, + "grad_norm": 8.162854194641113, + "learning_rate": 8.283952309168152e-06, + "loss": 0.4647, + "step": 13273 + }, + { + "epoch": 0.1800596852957135, + "grad_norm": 6.847372055053711, + "learning_rate": 8.283815266547897e-06, + "loss": 0.4107, + "step": 13274 + }, + { + "epoch": 0.1800732501356484, + "grad_norm": 6.559881210327148, + "learning_rate": 8.283678223927642e-06, + "loss": 0.3443, + "step": 13275 + }, + { + "epoch": 0.1800868149755833, + "grad_norm": 5.589729309082031, + "learning_rate": 8.283541181307387e-06, + "loss": 0.3894, + "step": 13276 + }, + { + "epoch": 0.18010037981551819, + "grad_norm": 7.429443836212158, + "learning_rate": 8.283404138687132e-06, + "loss": 0.4153, + "step": 13277 + }, + { + "epoch": 0.18011394465545308, + "grad_norm": 7.920071601867676, + "learning_rate": 8.283267096066878e-06, + "loss": 0.4808, + "step": 13278 + }, + { + "epoch": 0.18012750949538794, + "grad_norm": 4.8240132331848145, + "learning_rate": 8.283130053446623e-06, + "loss": 0.2479, + "step": 13279 + }, + { + "epoch": 0.18014107433532284, + "grad_norm": 7.309937477111816, + "learning_rate": 8.282993010826368e-06, + "loss": 0.3656, + "step": 13280 + }, + { + "epoch": 0.18015463917525773, + "grad_norm": 8.589033126831055, + "learning_rate": 8.282855968206113e-06, + "loss": 0.3875, + "step": 13281 + }, + { + "epoch": 0.18016820401519262, + "grad_norm": 7.301487445831299, + "learning_rate": 8.282718925585858e-06, + "loss": 0.4389, + "step": 13282 + }, + { + "epoch": 0.18018176885512752, + "grad_norm": 8.769438743591309, + "learning_rate": 8.282581882965603e-06, + "loss": 0.5106, + "step": 13283 + }, + { + "epoch": 0.1801953336950624, + "grad_norm": 7.228612899780273, + "learning_rate": 8.282444840345349e-06, + "loss": 0.4519, + "step": 13284 + }, + { + "epoch": 0.18020889853499727, + "grad_norm": 8.023419380187988, + "learning_rate": 8.282307797725094e-06, + "loss": 0.3233, + "step": 13285 + }, + { + "epoch": 0.18022246337493217, + "grad_norm": 6.805908203125, + "learning_rate": 8.282170755104837e-06, + "loss": 0.495, + "step": 13286 + }, + { + "epoch": 0.18023602821486706, + "grad_norm": 7.055903434753418, + "learning_rate": 8.282033712484582e-06, + "loss": 0.5247, + "step": 13287 + }, + { + "epoch": 0.18024959305480195, + "grad_norm": 5.767543315887451, + "learning_rate": 8.28189666986433e-06, + "loss": 0.2989, + "step": 13288 + }, + { + "epoch": 0.18026315789473685, + "grad_norm": 8.983428955078125, + "learning_rate": 8.281759627244074e-06, + "loss": 0.5311, + "step": 13289 + }, + { + "epoch": 0.18027672273467174, + "grad_norm": 7.211045265197754, + "learning_rate": 8.281622584623818e-06, + "loss": 0.3489, + "step": 13290 + }, + { + "epoch": 0.1802902875746066, + "grad_norm": 7.2532830238342285, + "learning_rate": 8.281485542003563e-06, + "loss": 0.3777, + "step": 13291 + }, + { + "epoch": 0.1803038524145415, + "grad_norm": 5.112174987792969, + "learning_rate": 8.28134849938331e-06, + "loss": 0.3813, + "step": 13292 + }, + { + "epoch": 0.1803174172544764, + "grad_norm": 6.78621244430542, + "learning_rate": 8.281211456763054e-06, + "loss": 0.3572, + "step": 13293 + }, + { + "epoch": 0.18033098209441129, + "grad_norm": 4.700019836425781, + "learning_rate": 8.281074414142799e-06, + "loss": 0.2461, + "step": 13294 + }, + { + "epoch": 0.18034454693434618, + "grad_norm": 8.059571266174316, + "learning_rate": 8.280937371522544e-06, + "loss": 0.5592, + "step": 13295 + }, + { + "epoch": 0.18035811177428107, + "grad_norm": 5.596524715423584, + "learning_rate": 8.280800328902289e-06, + "loss": 0.3807, + "step": 13296 + }, + { + "epoch": 0.18037167661421596, + "grad_norm": 7.2244954109191895, + "learning_rate": 8.280663286282034e-06, + "loss": 0.3567, + "step": 13297 + }, + { + "epoch": 0.18038524145415083, + "grad_norm": 6.669574737548828, + "learning_rate": 8.28052624366178e-06, + "loss": 0.4574, + "step": 13298 + }, + { + "epoch": 0.18039880629408572, + "grad_norm": 8.620906829833984, + "learning_rate": 8.280389201041525e-06, + "loss": 0.4602, + "step": 13299 + }, + { + "epoch": 0.18041237113402062, + "grad_norm": 6.037768840789795, + "learning_rate": 8.28025215842127e-06, + "loss": 0.4282, + "step": 13300 + }, + { + "epoch": 0.1804259359739555, + "grad_norm": 6.455239772796631, + "learning_rate": 8.280115115801015e-06, + "loss": 0.3815, + "step": 13301 + }, + { + "epoch": 0.1804395008138904, + "grad_norm": 9.134722709655762, + "learning_rate": 8.27997807318076e-06, + "loss": 0.4694, + "step": 13302 + }, + { + "epoch": 0.1804530656538253, + "grad_norm": 6.822131156921387, + "learning_rate": 8.279841030560505e-06, + "loss": 0.2987, + "step": 13303 + }, + { + "epoch": 0.18046663049376016, + "grad_norm": 6.089964866638184, + "learning_rate": 8.27970398794025e-06, + "loss": 0.247, + "step": 13304 + }, + { + "epoch": 0.18048019533369505, + "grad_norm": 5.029089450836182, + "learning_rate": 8.279566945319996e-06, + "loss": 0.2184, + "step": 13305 + }, + { + "epoch": 0.18049376017362995, + "grad_norm": 5.1171345710754395, + "learning_rate": 8.27942990269974e-06, + "loss": 0.3551, + "step": 13306 + }, + { + "epoch": 0.18050732501356484, + "grad_norm": 5.087101459503174, + "learning_rate": 8.279292860079486e-06, + "loss": 0.2812, + "step": 13307 + }, + { + "epoch": 0.18052088985349973, + "grad_norm": 7.019190311431885, + "learning_rate": 8.27915581745923e-06, + "loss": 0.3468, + "step": 13308 + }, + { + "epoch": 0.18053445469343463, + "grad_norm": 5.887853145599365, + "learning_rate": 8.279018774838976e-06, + "loss": 0.2933, + "step": 13309 + }, + { + "epoch": 0.18054801953336952, + "grad_norm": 6.478085517883301, + "learning_rate": 8.278881732218722e-06, + "loss": 0.3362, + "step": 13310 + }, + { + "epoch": 0.18056158437330438, + "grad_norm": 5.699774742126465, + "learning_rate": 8.278744689598465e-06, + "loss": 0.2742, + "step": 13311 + }, + { + "epoch": 0.18057514921323928, + "grad_norm": 5.618751525878906, + "learning_rate": 8.27860764697821e-06, + "loss": 0.3593, + "step": 13312 + }, + { + "epoch": 0.18058871405317417, + "grad_norm": 5.183297634124756, + "learning_rate": 8.278470604357955e-06, + "loss": 0.2805, + "step": 13313 + }, + { + "epoch": 0.18060227889310906, + "grad_norm": 6.18168306350708, + "learning_rate": 8.278333561737702e-06, + "loss": 0.3598, + "step": 13314 + }, + { + "epoch": 0.18061584373304396, + "grad_norm": 6.005351543426514, + "learning_rate": 8.278196519117446e-06, + "loss": 0.3968, + "step": 13315 + }, + { + "epoch": 0.18062940857297885, + "grad_norm": 3.98990535736084, + "learning_rate": 8.278059476497191e-06, + "loss": 0.1755, + "step": 13316 + }, + { + "epoch": 0.18064297341291372, + "grad_norm": 7.321100234985352, + "learning_rate": 8.277922433876936e-06, + "loss": 0.2968, + "step": 13317 + }, + { + "epoch": 0.1806565382528486, + "grad_norm": 5.929632663726807, + "learning_rate": 8.277785391256681e-06, + "loss": 0.3429, + "step": 13318 + }, + { + "epoch": 0.1806701030927835, + "grad_norm": 7.910882472991943, + "learning_rate": 8.277648348636426e-06, + "loss": 0.4431, + "step": 13319 + }, + { + "epoch": 0.1806836679327184, + "grad_norm": 6.00355339050293, + "learning_rate": 8.277511306016172e-06, + "loss": 0.2908, + "step": 13320 + }, + { + "epoch": 0.1806972327726533, + "grad_norm": 6.652039051055908, + "learning_rate": 8.277374263395917e-06, + "loss": 0.4483, + "step": 13321 + }, + { + "epoch": 0.18071079761258818, + "grad_norm": 5.393139839172363, + "learning_rate": 8.277237220775662e-06, + "loss": 0.3312, + "step": 13322 + }, + { + "epoch": 0.18072436245252307, + "grad_norm": 5.805974006652832, + "learning_rate": 8.277100178155407e-06, + "loss": 0.2996, + "step": 13323 + }, + { + "epoch": 0.18073792729245794, + "grad_norm": 5.567653656005859, + "learning_rate": 8.276963135535152e-06, + "loss": 0.4093, + "step": 13324 + }, + { + "epoch": 0.18075149213239283, + "grad_norm": 4.665774822235107, + "learning_rate": 8.276826092914898e-06, + "loss": 0.289, + "step": 13325 + }, + { + "epoch": 0.18076505697232773, + "grad_norm": 6.644190788269043, + "learning_rate": 8.276689050294641e-06, + "loss": 0.3985, + "step": 13326 + }, + { + "epoch": 0.18077862181226262, + "grad_norm": 5.754733085632324, + "learning_rate": 8.276552007674388e-06, + "loss": 0.2513, + "step": 13327 + }, + { + "epoch": 0.1807921866521975, + "grad_norm": 5.226111888885498, + "learning_rate": 8.276414965054133e-06, + "loss": 0.2363, + "step": 13328 + }, + { + "epoch": 0.1808057514921324, + "grad_norm": 5.5458550453186035, + "learning_rate": 8.276277922433877e-06, + "loss": 0.2769, + "step": 13329 + }, + { + "epoch": 0.18081931633206727, + "grad_norm": 7.014982223510742, + "learning_rate": 8.276140879813622e-06, + "loss": 0.5042, + "step": 13330 + }, + { + "epoch": 0.18083288117200216, + "grad_norm": 4.010850429534912, + "learning_rate": 8.276003837193369e-06, + "loss": 0.2351, + "step": 13331 + }, + { + "epoch": 0.18084644601193706, + "grad_norm": 5.213143825531006, + "learning_rate": 8.275866794573114e-06, + "loss": 0.2527, + "step": 13332 + }, + { + "epoch": 0.18086001085187195, + "grad_norm": 4.930777072906494, + "learning_rate": 8.275729751952857e-06, + "loss": 0.3079, + "step": 13333 + }, + { + "epoch": 0.18087357569180684, + "grad_norm": 5.341472148895264, + "learning_rate": 8.275592709332602e-06, + "loss": 0.3209, + "step": 13334 + }, + { + "epoch": 0.18088714053174174, + "grad_norm": 5.8141961097717285, + "learning_rate": 8.27545566671235e-06, + "loss": 0.196, + "step": 13335 + }, + { + "epoch": 0.1809007053716766, + "grad_norm": 7.196123123168945, + "learning_rate": 8.275318624092093e-06, + "loss": 0.3788, + "step": 13336 + }, + { + "epoch": 0.1809142702116115, + "grad_norm": 7.111973762512207, + "learning_rate": 8.275181581471838e-06, + "loss": 0.3974, + "step": 13337 + }, + { + "epoch": 0.1809278350515464, + "grad_norm": 4.865838050842285, + "learning_rate": 8.275044538851583e-06, + "loss": 0.257, + "step": 13338 + }, + { + "epoch": 0.18094139989148128, + "grad_norm": 5.657694339752197, + "learning_rate": 8.274907496231328e-06, + "loss": 0.3898, + "step": 13339 + }, + { + "epoch": 0.18095496473141617, + "grad_norm": 5.418027400970459, + "learning_rate": 8.274770453611074e-06, + "loss": 0.3048, + "step": 13340 + }, + { + "epoch": 0.18096852957135107, + "grad_norm": 5.4100494384765625, + "learning_rate": 8.274633410990819e-06, + "loss": 0.3552, + "step": 13341 + }, + { + "epoch": 0.18098209441128596, + "grad_norm": 8.75229549407959, + "learning_rate": 8.274496368370564e-06, + "loss": 0.3216, + "step": 13342 + }, + { + "epoch": 0.18099565925122083, + "grad_norm": 6.2220611572265625, + "learning_rate": 8.274359325750309e-06, + "loss": 0.3433, + "step": 13343 + }, + { + "epoch": 0.18100922409115572, + "grad_norm": 5.236563205718994, + "learning_rate": 8.274222283130054e-06, + "loss": 0.2135, + "step": 13344 + }, + { + "epoch": 0.1810227889310906, + "grad_norm": 5.864542484283447, + "learning_rate": 8.2740852405098e-06, + "loss": 0.3415, + "step": 13345 + }, + { + "epoch": 0.1810363537710255, + "grad_norm": 7.904994964599609, + "learning_rate": 8.273948197889545e-06, + "loss": 0.2873, + "step": 13346 + }, + { + "epoch": 0.1810499186109604, + "grad_norm": 5.011319637298584, + "learning_rate": 8.27381115526929e-06, + "loss": 0.2444, + "step": 13347 + }, + { + "epoch": 0.1810634834508953, + "grad_norm": 7.778831481933594, + "learning_rate": 8.273674112649035e-06, + "loss": 0.4552, + "step": 13348 + }, + { + "epoch": 0.18107704829083016, + "grad_norm": 5.110291004180908, + "learning_rate": 8.27353707002878e-06, + "loss": 0.2633, + "step": 13349 + }, + { + "epoch": 0.18109061313076505, + "grad_norm": 6.350872039794922, + "learning_rate": 8.273400027408525e-06, + "loss": 0.3763, + "step": 13350 + }, + { + "epoch": 0.18110417797069994, + "grad_norm": 4.146731376647949, + "learning_rate": 8.273262984788269e-06, + "loss": 0.2433, + "step": 13351 + }, + { + "epoch": 0.18111774281063484, + "grad_norm": 5.223596572875977, + "learning_rate": 8.273125942168016e-06, + "loss": 0.2235, + "step": 13352 + }, + { + "epoch": 0.18113130765056973, + "grad_norm": 6.253279685974121, + "learning_rate": 8.272988899547761e-06, + "loss": 0.3977, + "step": 13353 + }, + { + "epoch": 0.18114487249050462, + "grad_norm": 7.62566614151001, + "learning_rate": 8.272851856927504e-06, + "loss": 0.3362, + "step": 13354 + }, + { + "epoch": 0.18115843733043951, + "grad_norm": 6.376695156097412, + "learning_rate": 8.27271481430725e-06, + "loss": 0.3957, + "step": 13355 + }, + { + "epoch": 0.18117200217037438, + "grad_norm": 5.0793657302856445, + "learning_rate": 8.272577771686995e-06, + "loss": 0.3287, + "step": 13356 + }, + { + "epoch": 0.18118556701030927, + "grad_norm": 6.100083827972412, + "learning_rate": 8.272440729066742e-06, + "loss": 0.3523, + "step": 13357 + }, + { + "epoch": 0.18119913185024417, + "grad_norm": 7.563191890716553, + "learning_rate": 8.272303686446485e-06, + "loss": 0.3135, + "step": 13358 + }, + { + "epoch": 0.18121269669017906, + "grad_norm": 6.449779033660889, + "learning_rate": 8.27216664382623e-06, + "loss": 0.3473, + "step": 13359 + }, + { + "epoch": 0.18122626153011395, + "grad_norm": 6.101245403289795, + "learning_rate": 8.272029601205975e-06, + "loss": 0.5083, + "step": 13360 + }, + { + "epoch": 0.18123982637004885, + "grad_norm": 5.504655361175537, + "learning_rate": 8.27189255858572e-06, + "loss": 0.2354, + "step": 13361 + }, + { + "epoch": 0.1812533912099837, + "grad_norm": 6.692554950714111, + "learning_rate": 8.271755515965466e-06, + "loss": 0.3728, + "step": 13362 + }, + { + "epoch": 0.1812669560499186, + "grad_norm": 6.316214084625244, + "learning_rate": 8.271618473345211e-06, + "loss": 0.3772, + "step": 13363 + }, + { + "epoch": 0.1812805208898535, + "grad_norm": 5.474367618560791, + "learning_rate": 8.271481430724956e-06, + "loss": 0.2938, + "step": 13364 + }, + { + "epoch": 0.1812940857297884, + "grad_norm": 6.161708354949951, + "learning_rate": 8.271344388104701e-06, + "loss": 0.5157, + "step": 13365 + }, + { + "epoch": 0.18130765056972328, + "grad_norm": 7.024778842926025, + "learning_rate": 8.271207345484446e-06, + "loss": 0.4187, + "step": 13366 + }, + { + "epoch": 0.18132121540965818, + "grad_norm": 6.684330463409424, + "learning_rate": 8.271070302864192e-06, + "loss": 0.3603, + "step": 13367 + }, + { + "epoch": 0.18133478024959304, + "grad_norm": 9.190983772277832, + "learning_rate": 8.270933260243937e-06, + "loss": 0.4918, + "step": 13368 + }, + { + "epoch": 0.18134834508952793, + "grad_norm": 6.01159143447876, + "learning_rate": 8.27079621762368e-06, + "loss": 0.3881, + "step": 13369 + }, + { + "epoch": 0.18136190992946283, + "grad_norm": 4.296907901763916, + "learning_rate": 8.270659175003427e-06, + "loss": 0.2267, + "step": 13370 + }, + { + "epoch": 0.18137547476939772, + "grad_norm": 5.752720355987549, + "learning_rate": 8.270522132383172e-06, + "loss": 0.4183, + "step": 13371 + }, + { + "epoch": 0.18138903960933261, + "grad_norm": 5.7251739501953125, + "learning_rate": 8.270385089762918e-06, + "loss": 0.4003, + "step": 13372 + }, + { + "epoch": 0.1814026044492675, + "grad_norm": 5.425194263458252, + "learning_rate": 8.270248047142661e-06, + "loss": 0.4117, + "step": 13373 + }, + { + "epoch": 0.1814161692892024, + "grad_norm": 5.81908655166626, + "learning_rate": 8.270111004522408e-06, + "loss": 0.363, + "step": 13374 + }, + { + "epoch": 0.18142973412913727, + "grad_norm": 6.914282321929932, + "learning_rate": 8.269973961902153e-06, + "loss": 0.4928, + "step": 13375 + }, + { + "epoch": 0.18144329896907216, + "grad_norm": 7.2248406410217285, + "learning_rate": 8.269836919281897e-06, + "loss": 0.3528, + "step": 13376 + }, + { + "epoch": 0.18145686380900705, + "grad_norm": 9.426922798156738, + "learning_rate": 8.269699876661642e-06, + "loss": 0.5406, + "step": 13377 + }, + { + "epoch": 0.18147042864894194, + "grad_norm": 6.066367149353027, + "learning_rate": 8.269562834041389e-06, + "loss": 0.2704, + "step": 13378 + }, + { + "epoch": 0.18148399348887684, + "grad_norm": 6.260489463806152, + "learning_rate": 8.269425791421132e-06, + "loss": 0.5085, + "step": 13379 + }, + { + "epoch": 0.18149755832881173, + "grad_norm": 5.678675174713135, + "learning_rate": 8.269288748800877e-06, + "loss": 0.3474, + "step": 13380 + }, + { + "epoch": 0.1815111231687466, + "grad_norm": 5.381645202636719, + "learning_rate": 8.269151706180622e-06, + "loss": 0.3279, + "step": 13381 + }, + { + "epoch": 0.1815246880086815, + "grad_norm": 5.372577667236328, + "learning_rate": 8.269014663560368e-06, + "loss": 0.3936, + "step": 13382 + }, + { + "epoch": 0.18153825284861638, + "grad_norm": 5.891077518463135, + "learning_rate": 8.268877620940113e-06, + "loss": 0.3573, + "step": 13383 + }, + { + "epoch": 0.18155181768855128, + "grad_norm": 7.486580848693848, + "learning_rate": 8.268740578319858e-06, + "loss": 0.3489, + "step": 13384 + }, + { + "epoch": 0.18156538252848617, + "grad_norm": 4.572498321533203, + "learning_rate": 8.268603535699603e-06, + "loss": 0.2408, + "step": 13385 + }, + { + "epoch": 0.18157894736842106, + "grad_norm": 5.573204517364502, + "learning_rate": 8.268466493079348e-06, + "loss": 0.3862, + "step": 13386 + }, + { + "epoch": 0.18159251220835596, + "grad_norm": 5.448040962219238, + "learning_rate": 8.268329450459094e-06, + "loss": 0.2735, + "step": 13387 + }, + { + "epoch": 0.18160607704829082, + "grad_norm": 6.0117716789245605, + "learning_rate": 8.268192407838839e-06, + "loss": 0.4557, + "step": 13388 + }, + { + "epoch": 0.1816196418882257, + "grad_norm": 7.714721202850342, + "learning_rate": 8.268055365218584e-06, + "loss": 0.3902, + "step": 13389 + }, + { + "epoch": 0.1816332067281606, + "grad_norm": 5.166287422180176, + "learning_rate": 8.267918322598329e-06, + "loss": 0.2691, + "step": 13390 + }, + { + "epoch": 0.1816467715680955, + "grad_norm": 4.809632301330566, + "learning_rate": 8.267781279978074e-06, + "loss": 0.2519, + "step": 13391 + }, + { + "epoch": 0.1816603364080304, + "grad_norm": 5.096078395843506, + "learning_rate": 8.26764423735782e-06, + "loss": 0.2261, + "step": 13392 + }, + { + "epoch": 0.18167390124796529, + "grad_norm": 8.577933311462402, + "learning_rate": 8.267507194737565e-06, + "loss": 0.486, + "step": 13393 + }, + { + "epoch": 0.18168746608790015, + "grad_norm": 8.03455924987793, + "learning_rate": 8.267370152117308e-06, + "loss": 0.5744, + "step": 13394 + }, + { + "epoch": 0.18170103092783504, + "grad_norm": 3.875352621078491, + "learning_rate": 8.267233109497053e-06, + "loss": 0.2143, + "step": 13395 + }, + { + "epoch": 0.18171459576776994, + "grad_norm": 5.591806411743164, + "learning_rate": 8.2670960668768e-06, + "loss": 0.244, + "step": 13396 + }, + { + "epoch": 0.18172816060770483, + "grad_norm": 5.06946325302124, + "learning_rate": 8.266959024256545e-06, + "loss": 0.3298, + "step": 13397 + }, + { + "epoch": 0.18174172544763972, + "grad_norm": 7.067358016967773, + "learning_rate": 8.266821981636289e-06, + "loss": 0.3849, + "step": 13398 + }, + { + "epoch": 0.18175529028757462, + "grad_norm": 5.5179033279418945, + "learning_rate": 8.266684939016034e-06, + "loss": 0.2597, + "step": 13399 + }, + { + "epoch": 0.18176885512750948, + "grad_norm": 5.74722146987915, + "learning_rate": 8.266547896395781e-06, + "loss": 0.4262, + "step": 13400 + }, + { + "epoch": 0.18178241996744438, + "grad_norm": 5.090507507324219, + "learning_rate": 8.266410853775524e-06, + "loss": 0.3598, + "step": 13401 + }, + { + "epoch": 0.18179598480737927, + "grad_norm": 6.513262748718262, + "learning_rate": 8.26627381115527e-06, + "loss": 0.3866, + "step": 13402 + }, + { + "epoch": 0.18180954964731416, + "grad_norm": 6.403619289398193, + "learning_rate": 8.266136768535015e-06, + "loss": 0.3537, + "step": 13403 + }, + { + "epoch": 0.18182311448724905, + "grad_norm": 4.121766567230225, + "learning_rate": 8.26599972591476e-06, + "loss": 0.3174, + "step": 13404 + }, + { + "epoch": 0.18183667932718395, + "grad_norm": 5.688201427459717, + "learning_rate": 8.265862683294505e-06, + "loss": 0.3074, + "step": 13405 + }, + { + "epoch": 0.18185024416711884, + "grad_norm": 6.369019508361816, + "learning_rate": 8.26572564067425e-06, + "loss": 0.3167, + "step": 13406 + }, + { + "epoch": 0.1818638090070537, + "grad_norm": 9.516244888305664, + "learning_rate": 8.265588598053995e-06, + "loss": 0.4192, + "step": 13407 + }, + { + "epoch": 0.1818773738469886, + "grad_norm": 4.930335521697998, + "learning_rate": 8.26545155543374e-06, + "loss": 0.3062, + "step": 13408 + }, + { + "epoch": 0.1818909386869235, + "grad_norm": 6.5753607749938965, + "learning_rate": 8.265314512813486e-06, + "loss": 0.3219, + "step": 13409 + }, + { + "epoch": 0.18190450352685839, + "grad_norm": 5.823453903198242, + "learning_rate": 8.265177470193231e-06, + "loss": 0.2562, + "step": 13410 + }, + { + "epoch": 0.18191806836679328, + "grad_norm": 5.621798992156982, + "learning_rate": 8.265040427572976e-06, + "loss": 0.2542, + "step": 13411 + }, + { + "epoch": 0.18193163320672817, + "grad_norm": 4.61233377456665, + "learning_rate": 8.264903384952721e-06, + "loss": 0.2592, + "step": 13412 + }, + { + "epoch": 0.18194519804666304, + "grad_norm": 4.955948829650879, + "learning_rate": 8.264766342332467e-06, + "loss": 0.1864, + "step": 13413 + }, + { + "epoch": 0.18195876288659793, + "grad_norm": 7.360172748565674, + "learning_rate": 8.264629299712212e-06, + "loss": 0.4745, + "step": 13414 + }, + { + "epoch": 0.18197232772653282, + "grad_norm": 7.2540669441223145, + "learning_rate": 8.264492257091957e-06, + "loss": 0.322, + "step": 13415 + }, + { + "epoch": 0.18198589256646772, + "grad_norm": 6.074439525604248, + "learning_rate": 8.2643552144717e-06, + "loss": 0.3087, + "step": 13416 + }, + { + "epoch": 0.1819994574064026, + "grad_norm": 6.8394598960876465, + "learning_rate": 8.264218171851447e-06, + "loss": 0.4408, + "step": 13417 + }, + { + "epoch": 0.1820130222463375, + "grad_norm": 5.007289886474609, + "learning_rate": 8.264081129231192e-06, + "loss": 0.2939, + "step": 13418 + }, + { + "epoch": 0.1820265870862724, + "grad_norm": 9.230547904968262, + "learning_rate": 8.263944086610936e-06, + "loss": 0.4134, + "step": 13419 + }, + { + "epoch": 0.18204015192620726, + "grad_norm": 5.527885913848877, + "learning_rate": 8.263807043990681e-06, + "loss": 0.3445, + "step": 13420 + }, + { + "epoch": 0.18205371676614215, + "grad_norm": 5.607303142547607, + "learning_rate": 8.263670001370428e-06, + "loss": 0.2909, + "step": 13421 + }, + { + "epoch": 0.18206728160607705, + "grad_norm": 6.350986957550049, + "learning_rate": 8.263532958750171e-06, + "loss": 0.3313, + "step": 13422 + }, + { + "epoch": 0.18208084644601194, + "grad_norm": 8.408710479736328, + "learning_rate": 8.263395916129917e-06, + "loss": 0.4018, + "step": 13423 + }, + { + "epoch": 0.18209441128594683, + "grad_norm": 6.048239231109619, + "learning_rate": 8.263258873509662e-06, + "loss": 0.3077, + "step": 13424 + }, + { + "epoch": 0.18210797612588173, + "grad_norm": 6.268813610076904, + "learning_rate": 8.263121830889407e-06, + "loss": 0.4566, + "step": 13425 + }, + { + "epoch": 0.1821215409658166, + "grad_norm": 6.189331531524658, + "learning_rate": 8.262984788269152e-06, + "loss": 0.3173, + "step": 13426 + }, + { + "epoch": 0.18213510580575148, + "grad_norm": 4.537499904632568, + "learning_rate": 8.262847745648897e-06, + "loss": 0.2714, + "step": 13427 + }, + { + "epoch": 0.18214867064568638, + "grad_norm": 4.86562442779541, + "learning_rate": 8.262710703028642e-06, + "loss": 0.2765, + "step": 13428 + }, + { + "epoch": 0.18216223548562127, + "grad_norm": 5.722329616546631, + "learning_rate": 8.262573660408388e-06, + "loss": 0.325, + "step": 13429 + }, + { + "epoch": 0.18217580032555616, + "grad_norm": 6.327053070068359, + "learning_rate": 8.262436617788133e-06, + "loss": 0.3198, + "step": 13430 + }, + { + "epoch": 0.18218936516549106, + "grad_norm": 6.1313066482543945, + "learning_rate": 8.262299575167878e-06, + "loss": 0.3483, + "step": 13431 + }, + { + "epoch": 0.18220293000542592, + "grad_norm": 6.4424896240234375, + "learning_rate": 8.262162532547623e-06, + "loss": 0.3788, + "step": 13432 + }, + { + "epoch": 0.18221649484536082, + "grad_norm": 6.448067665100098, + "learning_rate": 8.262025489927368e-06, + "loss": 0.4364, + "step": 13433 + }, + { + "epoch": 0.1822300596852957, + "grad_norm": 6.068521499633789, + "learning_rate": 8.261888447307114e-06, + "loss": 0.3551, + "step": 13434 + }, + { + "epoch": 0.1822436245252306, + "grad_norm": 7.854836463928223, + "learning_rate": 8.261751404686859e-06, + "loss": 0.4323, + "step": 13435 + }, + { + "epoch": 0.1822571893651655, + "grad_norm": 4.613903999328613, + "learning_rate": 8.261614362066604e-06, + "loss": 0.2998, + "step": 13436 + }, + { + "epoch": 0.1822707542051004, + "grad_norm": 4.654789924621582, + "learning_rate": 8.261477319446347e-06, + "loss": 0.3805, + "step": 13437 + }, + { + "epoch": 0.18228431904503528, + "grad_norm": 6.801981449127197, + "learning_rate": 8.261340276826093e-06, + "loss": 0.4057, + "step": 13438 + }, + { + "epoch": 0.18229788388497015, + "grad_norm": 12.336652755737305, + "learning_rate": 8.26120323420584e-06, + "loss": 0.4141, + "step": 13439 + }, + { + "epoch": 0.18231144872490504, + "grad_norm": 6.443044662475586, + "learning_rate": 8.261066191585585e-06, + "loss": 0.3829, + "step": 13440 + }, + { + "epoch": 0.18232501356483993, + "grad_norm": 4.616369724273682, + "learning_rate": 8.260929148965328e-06, + "loss": 0.2958, + "step": 13441 + }, + { + "epoch": 0.18233857840477483, + "grad_norm": 7.308426856994629, + "learning_rate": 8.260792106345073e-06, + "loss": 0.3317, + "step": 13442 + }, + { + "epoch": 0.18235214324470972, + "grad_norm": 5.752607345581055, + "learning_rate": 8.26065506372482e-06, + "loss": 0.3927, + "step": 13443 + }, + { + "epoch": 0.1823657080846446, + "grad_norm": 7.342332363128662, + "learning_rate": 8.260518021104564e-06, + "loss": 0.5661, + "step": 13444 + }, + { + "epoch": 0.18237927292457948, + "grad_norm": 5.965977668762207, + "learning_rate": 8.260380978484309e-06, + "loss": 0.4004, + "step": 13445 + }, + { + "epoch": 0.18239283776451437, + "grad_norm": 5.525208473205566, + "learning_rate": 8.260243935864054e-06, + "loss": 0.3293, + "step": 13446 + }, + { + "epoch": 0.18240640260444926, + "grad_norm": 5.020605087280273, + "learning_rate": 8.2601068932438e-06, + "loss": 0.3196, + "step": 13447 + }, + { + "epoch": 0.18241996744438416, + "grad_norm": 7.644707202911377, + "learning_rate": 8.259969850623544e-06, + "loss": 0.5087, + "step": 13448 + }, + { + "epoch": 0.18243353228431905, + "grad_norm": 6.83765983581543, + "learning_rate": 8.25983280800329e-06, + "loss": 0.4766, + "step": 13449 + }, + { + "epoch": 0.18244709712425394, + "grad_norm": 6.034869194030762, + "learning_rate": 8.259695765383035e-06, + "loss": 0.3607, + "step": 13450 + }, + { + "epoch": 0.18246066196418884, + "grad_norm": 9.179943084716797, + "learning_rate": 8.25955872276278e-06, + "loss": 0.4793, + "step": 13451 + }, + { + "epoch": 0.1824742268041237, + "grad_norm": 7.438990116119385, + "learning_rate": 8.259421680142525e-06, + "loss": 0.4634, + "step": 13452 + }, + { + "epoch": 0.1824877916440586, + "grad_norm": 5.282742977142334, + "learning_rate": 8.25928463752227e-06, + "loss": 0.2742, + "step": 13453 + }, + { + "epoch": 0.1825013564839935, + "grad_norm": 6.931341171264648, + "learning_rate": 8.259147594902015e-06, + "loss": 0.4553, + "step": 13454 + }, + { + "epoch": 0.18251492132392838, + "grad_norm": 5.887728214263916, + "learning_rate": 8.25901055228176e-06, + "loss": 0.3391, + "step": 13455 + }, + { + "epoch": 0.18252848616386327, + "grad_norm": 5.595891952514648, + "learning_rate": 8.258873509661506e-06, + "loss": 0.3459, + "step": 13456 + }, + { + "epoch": 0.18254205100379817, + "grad_norm": 5.450949192047119, + "learning_rate": 8.258736467041251e-06, + "loss": 0.4219, + "step": 13457 + }, + { + "epoch": 0.18255561584373303, + "grad_norm": 6.042112827301025, + "learning_rate": 8.258599424420996e-06, + "loss": 0.3663, + "step": 13458 + }, + { + "epoch": 0.18256918068366793, + "grad_norm": 6.907910346984863, + "learning_rate": 8.25846238180074e-06, + "loss": 0.4836, + "step": 13459 + }, + { + "epoch": 0.18258274552360282, + "grad_norm": 5.368495464324951, + "learning_rate": 8.258325339180487e-06, + "loss": 0.3082, + "step": 13460 + }, + { + "epoch": 0.1825963103635377, + "grad_norm": 6.46555757522583, + "learning_rate": 8.258188296560232e-06, + "loss": 0.3333, + "step": 13461 + }, + { + "epoch": 0.1826098752034726, + "grad_norm": 5.569995880126953, + "learning_rate": 8.258051253939975e-06, + "loss": 0.2997, + "step": 13462 + }, + { + "epoch": 0.1826234400434075, + "grad_norm": 6.143776893615723, + "learning_rate": 8.25791421131972e-06, + "loss": 0.3811, + "step": 13463 + }, + { + "epoch": 0.18263700488334236, + "grad_norm": 6.460106372833252, + "learning_rate": 8.257777168699466e-06, + "loss": 0.262, + "step": 13464 + }, + { + "epoch": 0.18265056972327726, + "grad_norm": 5.854684829711914, + "learning_rate": 8.257640126079212e-06, + "loss": 0.4292, + "step": 13465 + }, + { + "epoch": 0.18266413456321215, + "grad_norm": 7.959706783294678, + "learning_rate": 8.257503083458956e-06, + "loss": 0.5293, + "step": 13466 + }, + { + "epoch": 0.18267769940314704, + "grad_norm": 6.128259658813477, + "learning_rate": 8.257366040838701e-06, + "loss": 0.4012, + "step": 13467 + }, + { + "epoch": 0.18269126424308194, + "grad_norm": 4.762041091918945, + "learning_rate": 8.257228998218446e-06, + "loss": 0.2455, + "step": 13468 + }, + { + "epoch": 0.18270482908301683, + "grad_norm": 5.98983907699585, + "learning_rate": 8.257091955598191e-06, + "loss": 0.2919, + "step": 13469 + }, + { + "epoch": 0.18271839392295172, + "grad_norm": 8.746758460998535, + "learning_rate": 8.256954912977937e-06, + "loss": 0.4782, + "step": 13470 + }, + { + "epoch": 0.1827319587628866, + "grad_norm": 8.277321815490723, + "learning_rate": 8.256817870357682e-06, + "loss": 0.3865, + "step": 13471 + }, + { + "epoch": 0.18274552360282148, + "grad_norm": 5.162832260131836, + "learning_rate": 8.256680827737427e-06, + "loss": 0.3072, + "step": 13472 + }, + { + "epoch": 0.18275908844275637, + "grad_norm": 5.536201000213623, + "learning_rate": 8.256543785117172e-06, + "loss": 0.3335, + "step": 13473 + }, + { + "epoch": 0.18277265328269127, + "grad_norm": 6.919642448425293, + "learning_rate": 8.256406742496917e-06, + "loss": 0.4396, + "step": 13474 + }, + { + "epoch": 0.18278621812262616, + "grad_norm": 5.753433704376221, + "learning_rate": 8.256269699876663e-06, + "loss": 0.2776, + "step": 13475 + }, + { + "epoch": 0.18279978296256105, + "grad_norm": 5.352236270904541, + "learning_rate": 8.256132657256408e-06, + "loss": 0.2683, + "step": 13476 + }, + { + "epoch": 0.18281334780249592, + "grad_norm": 6.680312633514404, + "learning_rate": 8.255995614636153e-06, + "loss": 0.3422, + "step": 13477 + }, + { + "epoch": 0.1828269126424308, + "grad_norm": 6.721122741699219, + "learning_rate": 8.255858572015898e-06, + "loss": 0.5478, + "step": 13478 + }, + { + "epoch": 0.1828404774823657, + "grad_norm": 6.690126895904541, + "learning_rate": 8.255721529395643e-06, + "loss": 0.371, + "step": 13479 + }, + { + "epoch": 0.1828540423223006, + "grad_norm": 7.215445041656494, + "learning_rate": 8.255584486775388e-06, + "loss": 0.3985, + "step": 13480 + }, + { + "epoch": 0.1828676071622355, + "grad_norm": 6.327004432678223, + "learning_rate": 8.255447444155132e-06, + "loss": 0.4466, + "step": 13481 + }, + { + "epoch": 0.18288117200217038, + "grad_norm": 4.8172783851623535, + "learning_rate": 8.255310401534879e-06, + "loss": 0.2848, + "step": 13482 + }, + { + "epoch": 0.18289473684210528, + "grad_norm": 6.158184051513672, + "learning_rate": 8.255173358914624e-06, + "loss": 0.4033, + "step": 13483 + }, + { + "epoch": 0.18290830168204014, + "grad_norm": 5.3946404457092285, + "learning_rate": 8.255036316294367e-06, + "loss": 0.3539, + "step": 13484 + }, + { + "epoch": 0.18292186652197504, + "grad_norm": 7.582634449005127, + "learning_rate": 8.254899273674113e-06, + "loss": 0.4421, + "step": 13485 + }, + { + "epoch": 0.18293543136190993, + "grad_norm": 7.238615036010742, + "learning_rate": 8.25476223105386e-06, + "loss": 0.519, + "step": 13486 + }, + { + "epoch": 0.18294899620184482, + "grad_norm": 5.206300258636475, + "learning_rate": 8.254625188433603e-06, + "loss": 0.3122, + "step": 13487 + }, + { + "epoch": 0.18296256104177971, + "grad_norm": 5.3816962242126465, + "learning_rate": 8.254488145813348e-06, + "loss": 0.2724, + "step": 13488 + }, + { + "epoch": 0.1829761258817146, + "grad_norm": 7.065772533416748, + "learning_rate": 8.254351103193093e-06, + "loss": 0.4447, + "step": 13489 + }, + { + "epoch": 0.18298969072164947, + "grad_norm": 6.067612648010254, + "learning_rate": 8.25421406057284e-06, + "loss": 0.404, + "step": 13490 + }, + { + "epoch": 0.18300325556158437, + "grad_norm": 6.404355049133301, + "learning_rate": 8.254077017952584e-06, + "loss": 0.2563, + "step": 13491 + }, + { + "epoch": 0.18301682040151926, + "grad_norm": 7.260431289672852, + "learning_rate": 8.253939975332329e-06, + "loss": 0.2996, + "step": 13492 + }, + { + "epoch": 0.18303038524145415, + "grad_norm": 4.587018966674805, + "learning_rate": 8.253802932712074e-06, + "loss": 0.337, + "step": 13493 + }, + { + "epoch": 0.18304395008138905, + "grad_norm": 7.84768533706665, + "learning_rate": 8.25366589009182e-06, + "loss": 0.4243, + "step": 13494 + }, + { + "epoch": 0.18305751492132394, + "grad_norm": 5.8909735679626465, + "learning_rate": 8.253528847471564e-06, + "loss": 0.2238, + "step": 13495 + }, + { + "epoch": 0.1830710797612588, + "grad_norm": 8.047229766845703, + "learning_rate": 8.25339180485131e-06, + "loss": 0.859, + "step": 13496 + }, + { + "epoch": 0.1830846446011937, + "grad_norm": 5.2133097648620605, + "learning_rate": 8.253254762231055e-06, + "loss": 0.3077, + "step": 13497 + }, + { + "epoch": 0.1830982094411286, + "grad_norm": 8.208646774291992, + "learning_rate": 8.2531177196108e-06, + "loss": 0.3973, + "step": 13498 + }, + { + "epoch": 0.18311177428106348, + "grad_norm": 6.987485408782959, + "learning_rate": 8.252980676990545e-06, + "loss": 0.4327, + "step": 13499 + }, + { + "epoch": 0.18312533912099838, + "grad_norm": 8.523885726928711, + "learning_rate": 8.25284363437029e-06, + "loss": 0.8414, + "step": 13500 + }, + { + "epoch": 0.18313890396093327, + "grad_norm": 6.292126178741455, + "learning_rate": 8.252706591750035e-06, + "loss": 0.4357, + "step": 13501 + }, + { + "epoch": 0.18315246880086816, + "grad_norm": 7.221829414367676, + "learning_rate": 8.252569549129779e-06, + "loss": 0.459, + "step": 13502 + }, + { + "epoch": 0.18316603364080303, + "grad_norm": 5.183452129364014, + "learning_rate": 8.252432506509526e-06, + "loss": 0.4058, + "step": 13503 + }, + { + "epoch": 0.18317959848073792, + "grad_norm": 8.23693561553955, + "learning_rate": 8.252295463889271e-06, + "loss": 0.4806, + "step": 13504 + }, + { + "epoch": 0.1831931633206728, + "grad_norm": 7.4726881980896, + "learning_rate": 8.252158421269016e-06, + "loss": 0.682, + "step": 13505 + }, + { + "epoch": 0.1832067281606077, + "grad_norm": 6.673652172088623, + "learning_rate": 8.25202137864876e-06, + "loss": 0.3242, + "step": 13506 + }, + { + "epoch": 0.1832202930005426, + "grad_norm": 5.516979694366455, + "learning_rate": 8.251884336028505e-06, + "loss": 0.3774, + "step": 13507 + }, + { + "epoch": 0.1832338578404775, + "grad_norm": 7.223714828491211, + "learning_rate": 8.251747293408252e-06, + "loss": 0.4914, + "step": 13508 + }, + { + "epoch": 0.18324742268041236, + "grad_norm": 7.623312473297119, + "learning_rate": 8.251610250787995e-06, + "loss": 0.5367, + "step": 13509 + }, + { + "epoch": 0.18326098752034725, + "grad_norm": 5.367812156677246, + "learning_rate": 8.25147320816774e-06, + "loss": 0.2815, + "step": 13510 + }, + { + "epoch": 0.18327455236028214, + "grad_norm": 5.427034854888916, + "learning_rate": 8.251336165547486e-06, + "loss": 0.3053, + "step": 13511 + }, + { + "epoch": 0.18328811720021704, + "grad_norm": 7.828582763671875, + "learning_rate": 8.25119912292723e-06, + "loss": 0.4476, + "step": 13512 + }, + { + "epoch": 0.18330168204015193, + "grad_norm": 7.697542667388916, + "learning_rate": 8.251062080306976e-06, + "loss": 0.2362, + "step": 13513 + }, + { + "epoch": 0.18331524688008682, + "grad_norm": 6.3035454750061035, + "learning_rate": 8.250925037686721e-06, + "loss": 0.3271, + "step": 13514 + }, + { + "epoch": 0.18332881172002172, + "grad_norm": 6.606174945831299, + "learning_rate": 8.250787995066466e-06, + "loss": 0.3837, + "step": 13515 + }, + { + "epoch": 0.18334237655995658, + "grad_norm": 6.340851783752441, + "learning_rate": 8.250650952446211e-06, + "loss": 0.4778, + "step": 13516 + }, + { + "epoch": 0.18335594139989148, + "grad_norm": 6.839977741241455, + "learning_rate": 8.250513909825957e-06, + "loss": 0.3522, + "step": 13517 + }, + { + "epoch": 0.18336950623982637, + "grad_norm": 6.935298919677734, + "learning_rate": 8.250376867205702e-06, + "loss": 0.4126, + "step": 13518 + }, + { + "epoch": 0.18338307107976126, + "grad_norm": 5.716131687164307, + "learning_rate": 8.250239824585447e-06, + "loss": 0.2381, + "step": 13519 + }, + { + "epoch": 0.18339663591969615, + "grad_norm": 5.716177463531494, + "learning_rate": 8.25010278196519e-06, + "loss": 0.339, + "step": 13520 + }, + { + "epoch": 0.18341020075963105, + "grad_norm": 12.466224670410156, + "learning_rate": 8.249965739344937e-06, + "loss": 0.4696, + "step": 13521 + }, + { + "epoch": 0.1834237655995659, + "grad_norm": 4.869205951690674, + "learning_rate": 8.249828696724683e-06, + "loss": 0.2718, + "step": 13522 + }, + { + "epoch": 0.1834373304395008, + "grad_norm": 6.677178859710693, + "learning_rate": 8.249691654104428e-06, + "loss": 0.4123, + "step": 13523 + }, + { + "epoch": 0.1834508952794357, + "grad_norm": 6.925345420837402, + "learning_rate": 8.249554611484171e-06, + "loss": 0.345, + "step": 13524 + }, + { + "epoch": 0.1834644601193706, + "grad_norm": 6.07776403427124, + "learning_rate": 8.249417568863918e-06, + "loss": 0.2676, + "step": 13525 + }, + { + "epoch": 0.18347802495930549, + "grad_norm": 8.409926414489746, + "learning_rate": 8.249280526243663e-06, + "loss": 0.6244, + "step": 13526 + }, + { + "epoch": 0.18349158979924038, + "grad_norm": 6.832396030426025, + "learning_rate": 8.249143483623407e-06, + "loss": 0.3749, + "step": 13527 + }, + { + "epoch": 0.18350515463917524, + "grad_norm": 5.269766330718994, + "learning_rate": 8.249006441003152e-06, + "loss": 0.2802, + "step": 13528 + }, + { + "epoch": 0.18351871947911014, + "grad_norm": 5.396020412445068, + "learning_rate": 8.248869398382899e-06, + "loss": 0.2979, + "step": 13529 + }, + { + "epoch": 0.18353228431904503, + "grad_norm": 6.791731357574463, + "learning_rate": 8.248732355762642e-06, + "loss": 0.391, + "step": 13530 + }, + { + "epoch": 0.18354584915897992, + "grad_norm": 7.2865376472473145, + "learning_rate": 8.248595313142387e-06, + "loss": 0.4989, + "step": 13531 + }, + { + "epoch": 0.18355941399891482, + "grad_norm": 4.957555294036865, + "learning_rate": 8.248458270522133e-06, + "loss": 0.2034, + "step": 13532 + }, + { + "epoch": 0.1835729788388497, + "grad_norm": 7.034811496734619, + "learning_rate": 8.248321227901878e-06, + "loss": 0.3522, + "step": 13533 + }, + { + "epoch": 0.1835865436787846, + "grad_norm": 6.015563011169434, + "learning_rate": 8.248184185281623e-06, + "loss": 0.2545, + "step": 13534 + }, + { + "epoch": 0.18360010851871947, + "grad_norm": 6.465078353881836, + "learning_rate": 8.248047142661368e-06, + "loss": 0.3669, + "step": 13535 + }, + { + "epoch": 0.18361367335865436, + "grad_norm": 8.19060230255127, + "learning_rate": 8.247910100041113e-06, + "loss": 0.5068, + "step": 13536 + }, + { + "epoch": 0.18362723819858925, + "grad_norm": 5.860910892486572, + "learning_rate": 8.247773057420859e-06, + "loss": 0.3052, + "step": 13537 + }, + { + "epoch": 0.18364080303852415, + "grad_norm": 7.772897243499756, + "learning_rate": 8.247636014800604e-06, + "loss": 0.3732, + "step": 13538 + }, + { + "epoch": 0.18365436787845904, + "grad_norm": 5.366546154022217, + "learning_rate": 8.247498972180349e-06, + "loss": 0.4055, + "step": 13539 + }, + { + "epoch": 0.18366793271839393, + "grad_norm": 6.086034297943115, + "learning_rate": 8.247361929560094e-06, + "loss": 0.3582, + "step": 13540 + }, + { + "epoch": 0.1836814975583288, + "grad_norm": 5.685585975646973, + "learning_rate": 8.24722488693984e-06, + "loss": 0.3846, + "step": 13541 + }, + { + "epoch": 0.1836950623982637, + "grad_norm": 7.753395080566406, + "learning_rate": 8.247087844319584e-06, + "loss": 0.4255, + "step": 13542 + }, + { + "epoch": 0.18370862723819859, + "grad_norm": 6.809804916381836, + "learning_rate": 8.24695080169933e-06, + "loss": 0.5322, + "step": 13543 + }, + { + "epoch": 0.18372219207813348, + "grad_norm": 6.461348533630371, + "learning_rate": 8.246813759079075e-06, + "loss": 0.4431, + "step": 13544 + }, + { + "epoch": 0.18373575691806837, + "grad_norm": 7.371434688568115, + "learning_rate": 8.246676716458818e-06, + "loss": 0.4195, + "step": 13545 + }, + { + "epoch": 0.18374932175800326, + "grad_norm": 4.59384298324585, + "learning_rate": 8.246539673838565e-06, + "loss": 0.3311, + "step": 13546 + }, + { + "epoch": 0.18376288659793816, + "grad_norm": 8.38806438446045, + "learning_rate": 8.24640263121831e-06, + "loss": 0.4463, + "step": 13547 + }, + { + "epoch": 0.18377645143787302, + "grad_norm": 6.770242691040039, + "learning_rate": 8.246265588598055e-06, + "loss": 0.552, + "step": 13548 + }, + { + "epoch": 0.18379001627780792, + "grad_norm": 9.723455429077148, + "learning_rate": 8.246128545977799e-06, + "loss": 0.4744, + "step": 13549 + }, + { + "epoch": 0.1838035811177428, + "grad_norm": 8.193828582763672, + "learning_rate": 8.245991503357544e-06, + "loss": 0.401, + "step": 13550 + }, + { + "epoch": 0.1838171459576777, + "grad_norm": 5.011817932128906, + "learning_rate": 8.245854460737291e-06, + "loss": 0.3125, + "step": 13551 + }, + { + "epoch": 0.1838307107976126, + "grad_norm": 6.003725051879883, + "learning_rate": 8.245717418117035e-06, + "loss": 0.2922, + "step": 13552 + }, + { + "epoch": 0.1838442756375475, + "grad_norm": 5.934176921844482, + "learning_rate": 8.24558037549678e-06, + "loss": 0.2343, + "step": 13553 + }, + { + "epoch": 0.18385784047748235, + "grad_norm": 6.2310333251953125, + "learning_rate": 8.245443332876525e-06, + "loss": 0.3447, + "step": 13554 + }, + { + "epoch": 0.18387140531741725, + "grad_norm": 6.772974967956543, + "learning_rate": 8.24530629025627e-06, + "loss": 0.4594, + "step": 13555 + }, + { + "epoch": 0.18388497015735214, + "grad_norm": 6.777181625366211, + "learning_rate": 8.245169247636015e-06, + "loss": 0.3961, + "step": 13556 + }, + { + "epoch": 0.18389853499728703, + "grad_norm": 5.786120414733887, + "learning_rate": 8.24503220501576e-06, + "loss": 0.289, + "step": 13557 + }, + { + "epoch": 0.18391209983722193, + "grad_norm": 7.0000176429748535, + "learning_rate": 8.244895162395506e-06, + "loss": 0.2625, + "step": 13558 + }, + { + "epoch": 0.18392566467715682, + "grad_norm": 5.883935451507568, + "learning_rate": 8.24475811977525e-06, + "loss": 0.3281, + "step": 13559 + }, + { + "epoch": 0.18393922951709168, + "grad_norm": 5.494534492492676, + "learning_rate": 8.244621077154996e-06, + "loss": 0.314, + "step": 13560 + }, + { + "epoch": 0.18395279435702658, + "grad_norm": 6.51407527923584, + "learning_rate": 8.244484034534741e-06, + "loss": 0.3921, + "step": 13561 + }, + { + "epoch": 0.18396635919696147, + "grad_norm": 8.275086402893066, + "learning_rate": 8.244346991914486e-06, + "loss": 0.429, + "step": 13562 + }, + { + "epoch": 0.18397992403689636, + "grad_norm": 8.339293479919434, + "learning_rate": 8.244209949294231e-06, + "loss": 0.2928, + "step": 13563 + }, + { + "epoch": 0.18399348887683126, + "grad_norm": 6.800079822540283, + "learning_rate": 8.244072906673977e-06, + "loss": 0.571, + "step": 13564 + }, + { + "epoch": 0.18400705371676615, + "grad_norm": 5.85891056060791, + "learning_rate": 8.243935864053722e-06, + "loss": 0.4495, + "step": 13565 + }, + { + "epoch": 0.18402061855670104, + "grad_norm": 5.758090972900391, + "learning_rate": 8.243798821433467e-06, + "loss": 0.4399, + "step": 13566 + }, + { + "epoch": 0.1840341833966359, + "grad_norm": 6.602797985076904, + "learning_rate": 8.24366177881321e-06, + "loss": 0.4822, + "step": 13567 + }, + { + "epoch": 0.1840477482365708, + "grad_norm": 5.2829413414001465, + "learning_rate": 8.243524736192957e-06, + "loss": 0.3995, + "step": 13568 + }, + { + "epoch": 0.1840613130765057, + "grad_norm": 4.968446731567383, + "learning_rate": 8.243387693572703e-06, + "loss": 0.3503, + "step": 13569 + }, + { + "epoch": 0.1840748779164406, + "grad_norm": 5.080721378326416, + "learning_rate": 8.243250650952446e-06, + "loss": 0.3097, + "step": 13570 + }, + { + "epoch": 0.18408844275637548, + "grad_norm": 6.58984899520874, + "learning_rate": 8.243113608332191e-06, + "loss": 0.3967, + "step": 13571 + }, + { + "epoch": 0.18410200759631037, + "grad_norm": 6.002063274383545, + "learning_rate": 8.242976565711938e-06, + "loss": 0.282, + "step": 13572 + }, + { + "epoch": 0.18411557243624524, + "grad_norm": 5.255213260650635, + "learning_rate": 8.242839523091683e-06, + "loss": 0.4098, + "step": 13573 + }, + { + "epoch": 0.18412913727618013, + "grad_norm": 5.26235818862915, + "learning_rate": 8.242702480471427e-06, + "loss": 0.3585, + "step": 13574 + }, + { + "epoch": 0.18414270211611503, + "grad_norm": 4.614306449890137, + "learning_rate": 8.242565437851172e-06, + "loss": 0.3119, + "step": 13575 + }, + { + "epoch": 0.18415626695604992, + "grad_norm": 5.578242778778076, + "learning_rate": 8.242428395230917e-06, + "loss": 0.3126, + "step": 13576 + }, + { + "epoch": 0.1841698317959848, + "grad_norm": 5.904330253601074, + "learning_rate": 8.242291352610662e-06, + "loss": 0.2871, + "step": 13577 + }, + { + "epoch": 0.1841833966359197, + "grad_norm": 6.044366359710693, + "learning_rate": 8.242154309990407e-06, + "loss": 0.2857, + "step": 13578 + }, + { + "epoch": 0.1841969614758546, + "grad_norm": 6.653443813323975, + "learning_rate": 8.242017267370153e-06, + "loss": 0.6119, + "step": 13579 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 4.571200847625732, + "learning_rate": 8.241880224749898e-06, + "loss": 0.2365, + "step": 13580 + }, + { + "epoch": 0.18422409115572436, + "grad_norm": 5.4421539306640625, + "learning_rate": 8.241743182129643e-06, + "loss": 0.4076, + "step": 13581 + }, + { + "epoch": 0.18423765599565925, + "grad_norm": 5.478764057159424, + "learning_rate": 8.241606139509388e-06, + "loss": 0.3413, + "step": 13582 + }, + { + "epoch": 0.18425122083559414, + "grad_norm": 5.335917949676514, + "learning_rate": 8.241469096889133e-06, + "loss": 0.261, + "step": 13583 + }, + { + "epoch": 0.18426478567552904, + "grad_norm": 4.38938570022583, + "learning_rate": 8.241332054268879e-06, + "loss": 0.2036, + "step": 13584 + }, + { + "epoch": 0.18427835051546393, + "grad_norm": 4.59692907333374, + "learning_rate": 8.241195011648624e-06, + "loss": 0.229, + "step": 13585 + }, + { + "epoch": 0.1842919153553988, + "grad_norm": 5.67336893081665, + "learning_rate": 8.241057969028369e-06, + "loss": 0.3703, + "step": 13586 + }, + { + "epoch": 0.1843054801953337, + "grad_norm": 6.190459728240967, + "learning_rate": 8.240920926408114e-06, + "loss": 0.3499, + "step": 13587 + }, + { + "epoch": 0.18431904503526858, + "grad_norm": 8.550955772399902, + "learning_rate": 8.24078388378786e-06, + "loss": 0.5567, + "step": 13588 + }, + { + "epoch": 0.18433260987520347, + "grad_norm": 6.821792125701904, + "learning_rate": 8.240646841167603e-06, + "loss": 0.3754, + "step": 13589 + }, + { + "epoch": 0.18434617471513837, + "grad_norm": 6.484344959259033, + "learning_rate": 8.24050979854735e-06, + "loss": 0.3968, + "step": 13590 + }, + { + "epoch": 0.18435973955507326, + "grad_norm": 7.768712520599365, + "learning_rate": 8.240372755927095e-06, + "loss": 0.7061, + "step": 13591 + }, + { + "epoch": 0.18437330439500813, + "grad_norm": 7.068560600280762, + "learning_rate": 8.240235713306838e-06, + "loss": 0.3982, + "step": 13592 + }, + { + "epoch": 0.18438686923494302, + "grad_norm": 5.766315460205078, + "learning_rate": 8.240098670686583e-06, + "loss": 0.27, + "step": 13593 + }, + { + "epoch": 0.1844004340748779, + "grad_norm": 6.725499153137207, + "learning_rate": 8.23996162806633e-06, + "loss": 0.332, + "step": 13594 + }, + { + "epoch": 0.1844139989148128, + "grad_norm": 6.756999492645264, + "learning_rate": 8.239824585446074e-06, + "loss": 0.4002, + "step": 13595 + }, + { + "epoch": 0.1844275637547477, + "grad_norm": 4.957364559173584, + "learning_rate": 8.239687542825819e-06, + "loss": 0.2238, + "step": 13596 + }, + { + "epoch": 0.1844411285946826, + "grad_norm": 8.525105476379395, + "learning_rate": 8.239550500205564e-06, + "loss": 0.6104, + "step": 13597 + }, + { + "epoch": 0.18445469343461748, + "grad_norm": 6.6917338371276855, + "learning_rate": 8.23941345758531e-06, + "loss": 0.3485, + "step": 13598 + }, + { + "epoch": 0.18446825827455235, + "grad_norm": 4.481215953826904, + "learning_rate": 8.239276414965055e-06, + "loss": 0.2916, + "step": 13599 + }, + { + "epoch": 0.18448182311448724, + "grad_norm": 5.471769332885742, + "learning_rate": 8.2391393723448e-06, + "loss": 0.3847, + "step": 13600 + }, + { + "epoch": 0.18449538795442214, + "grad_norm": 6.864701271057129, + "learning_rate": 8.239002329724545e-06, + "loss": 0.3397, + "step": 13601 + }, + { + "epoch": 0.18450895279435703, + "grad_norm": 5.711800575256348, + "learning_rate": 8.23886528710429e-06, + "loss": 0.3061, + "step": 13602 + }, + { + "epoch": 0.18452251763429192, + "grad_norm": 7.109445095062256, + "learning_rate": 8.238728244484035e-06, + "loss": 0.4678, + "step": 13603 + }, + { + "epoch": 0.18453608247422681, + "grad_norm": 6.506644248962402, + "learning_rate": 8.23859120186378e-06, + "loss": 0.4672, + "step": 13604 + }, + { + "epoch": 0.18454964731416168, + "grad_norm": 6.583085536956787, + "learning_rate": 8.238454159243526e-06, + "loss": 0.449, + "step": 13605 + }, + { + "epoch": 0.18456321215409657, + "grad_norm": 5.247978687286377, + "learning_rate": 8.23831711662327e-06, + "loss": 0.3237, + "step": 13606 + }, + { + "epoch": 0.18457677699403147, + "grad_norm": 5.6292877197265625, + "learning_rate": 8.238180074003016e-06, + "loss": 0.2732, + "step": 13607 + }, + { + "epoch": 0.18459034183396636, + "grad_norm": 7.217862129211426, + "learning_rate": 8.238043031382761e-06, + "loss": 0.3696, + "step": 13608 + }, + { + "epoch": 0.18460390667390125, + "grad_norm": 6.701678276062012, + "learning_rate": 8.237905988762506e-06, + "loss": 0.3898, + "step": 13609 + }, + { + "epoch": 0.18461747151383615, + "grad_norm": 6.161136150360107, + "learning_rate": 8.23776894614225e-06, + "loss": 0.4007, + "step": 13610 + }, + { + "epoch": 0.18463103635377104, + "grad_norm": 5.970709323883057, + "learning_rate": 8.237631903521997e-06, + "loss": 0.3475, + "step": 13611 + }, + { + "epoch": 0.1846446011937059, + "grad_norm": 7.331027030944824, + "learning_rate": 8.237494860901742e-06, + "loss": 0.3677, + "step": 13612 + }, + { + "epoch": 0.1846581660336408, + "grad_norm": 6.4324517250061035, + "learning_rate": 8.237357818281485e-06, + "loss": 0.4383, + "step": 13613 + }, + { + "epoch": 0.1846717308735757, + "grad_norm": 6.7688398361206055, + "learning_rate": 8.23722077566123e-06, + "loss": 0.3904, + "step": 13614 + }, + { + "epoch": 0.18468529571351058, + "grad_norm": 7.937685012817383, + "learning_rate": 8.237083733040977e-06, + "loss": 0.5567, + "step": 13615 + }, + { + "epoch": 0.18469886055344548, + "grad_norm": 7.015048027038574, + "learning_rate": 8.236946690420723e-06, + "loss": 0.3985, + "step": 13616 + }, + { + "epoch": 0.18471242539338037, + "grad_norm": 6.227080345153809, + "learning_rate": 8.236809647800466e-06, + "loss": 0.3922, + "step": 13617 + }, + { + "epoch": 0.18472599023331523, + "grad_norm": 5.562592506408691, + "learning_rate": 8.236672605180211e-06, + "loss": 0.2877, + "step": 13618 + }, + { + "epoch": 0.18473955507325013, + "grad_norm": 5.230879306793213, + "learning_rate": 8.236535562559956e-06, + "loss": 0.3625, + "step": 13619 + }, + { + "epoch": 0.18475311991318502, + "grad_norm": 5.844698905944824, + "learning_rate": 8.236398519939702e-06, + "loss": 0.3481, + "step": 13620 + }, + { + "epoch": 0.18476668475311991, + "grad_norm": 7.024789810180664, + "learning_rate": 8.236261477319447e-06, + "loss": 0.3436, + "step": 13621 + }, + { + "epoch": 0.1847802495930548, + "grad_norm": 6.874062538146973, + "learning_rate": 8.236124434699192e-06, + "loss": 0.4367, + "step": 13622 + }, + { + "epoch": 0.1847938144329897, + "grad_norm": 5.137616157531738, + "learning_rate": 8.235987392078937e-06, + "loss": 0.3183, + "step": 13623 + }, + { + "epoch": 0.18480737927292457, + "grad_norm": 6.897027492523193, + "learning_rate": 8.235850349458682e-06, + "loss": 0.3928, + "step": 13624 + }, + { + "epoch": 0.18482094411285946, + "grad_norm": 5.534662246704102, + "learning_rate": 8.235713306838427e-06, + "loss": 0.3363, + "step": 13625 + }, + { + "epoch": 0.18483450895279435, + "grad_norm": 5.767733097076416, + "learning_rate": 8.235576264218173e-06, + "loss": 0.2922, + "step": 13626 + }, + { + "epoch": 0.18484807379272924, + "grad_norm": 5.01489782333374, + "learning_rate": 8.235439221597918e-06, + "loss": 0.2611, + "step": 13627 + }, + { + "epoch": 0.18486163863266414, + "grad_norm": 7.737407684326172, + "learning_rate": 8.235302178977663e-06, + "loss": 0.3453, + "step": 13628 + }, + { + "epoch": 0.18487520347259903, + "grad_norm": 5.9917802810668945, + "learning_rate": 8.235165136357408e-06, + "loss": 0.2863, + "step": 13629 + }, + { + "epoch": 0.18488876831253392, + "grad_norm": 4.141965866088867, + "learning_rate": 8.235028093737153e-06, + "loss": 0.25, + "step": 13630 + }, + { + "epoch": 0.1849023331524688, + "grad_norm": 4.281432151794434, + "learning_rate": 8.234891051116899e-06, + "loss": 0.2857, + "step": 13631 + }, + { + "epoch": 0.18491589799240368, + "grad_norm": 4.606733798980713, + "learning_rate": 8.234754008496642e-06, + "loss": 0.2283, + "step": 13632 + }, + { + "epoch": 0.18492946283233858, + "grad_norm": 6.836503505706787, + "learning_rate": 8.234616965876389e-06, + "loss": 0.2758, + "step": 13633 + }, + { + "epoch": 0.18494302767227347, + "grad_norm": 5.098140716552734, + "learning_rate": 8.234479923256134e-06, + "loss": 0.3832, + "step": 13634 + }, + { + "epoch": 0.18495659251220836, + "grad_norm": 6.053249359130859, + "learning_rate": 8.234342880635878e-06, + "loss": 0.3591, + "step": 13635 + }, + { + "epoch": 0.18497015735214326, + "grad_norm": 4.151370525360107, + "learning_rate": 8.234205838015623e-06, + "loss": 0.3182, + "step": 13636 + }, + { + "epoch": 0.18498372219207812, + "grad_norm": 5.175814628601074, + "learning_rate": 8.23406879539537e-06, + "loss": 0.3068, + "step": 13637 + }, + { + "epoch": 0.184997287032013, + "grad_norm": 3.7345430850982666, + "learning_rate": 8.233931752775113e-06, + "loss": 0.2128, + "step": 13638 + }, + { + "epoch": 0.1850108518719479, + "grad_norm": 5.141608715057373, + "learning_rate": 8.233794710154858e-06, + "loss": 0.331, + "step": 13639 + }, + { + "epoch": 0.1850244167118828, + "grad_norm": 4.745635032653809, + "learning_rate": 8.233657667534603e-06, + "loss": 0.1857, + "step": 13640 + }, + { + "epoch": 0.1850379815518177, + "grad_norm": 5.451821804046631, + "learning_rate": 8.23352062491435e-06, + "loss": 0.443, + "step": 13641 + }, + { + "epoch": 0.18505154639175259, + "grad_norm": 4.547729015350342, + "learning_rate": 8.233383582294094e-06, + "loss": 0.1833, + "step": 13642 + }, + { + "epoch": 0.18506511123168748, + "grad_norm": 5.975360870361328, + "learning_rate": 8.233246539673839e-06, + "loss": 0.4576, + "step": 13643 + }, + { + "epoch": 0.18507867607162234, + "grad_norm": 3.6100454330444336, + "learning_rate": 8.233109497053584e-06, + "loss": 0.2122, + "step": 13644 + }, + { + "epoch": 0.18509224091155724, + "grad_norm": 7.022332191467285, + "learning_rate": 8.23297245443333e-06, + "loss": 0.3001, + "step": 13645 + }, + { + "epoch": 0.18510580575149213, + "grad_norm": 6.9762396812438965, + "learning_rate": 8.232835411813075e-06, + "loss": 0.3296, + "step": 13646 + }, + { + "epoch": 0.18511937059142702, + "grad_norm": 4.9413909912109375, + "learning_rate": 8.23269836919282e-06, + "loss": 0.1924, + "step": 13647 + }, + { + "epoch": 0.18513293543136192, + "grad_norm": 5.920393466949463, + "learning_rate": 8.232561326572565e-06, + "loss": 0.4735, + "step": 13648 + }, + { + "epoch": 0.1851465002712968, + "grad_norm": 3.5644404888153076, + "learning_rate": 8.23242428395231e-06, + "loss": 0.2104, + "step": 13649 + }, + { + "epoch": 0.18516006511123168, + "grad_norm": 5.062504291534424, + "learning_rate": 8.232287241332055e-06, + "loss": 0.3924, + "step": 13650 + }, + { + "epoch": 0.18517362995116657, + "grad_norm": 4.184950351715088, + "learning_rate": 8.2321501987118e-06, + "loss": 0.2141, + "step": 13651 + }, + { + "epoch": 0.18518719479110146, + "grad_norm": 4.772414684295654, + "learning_rate": 8.232013156091546e-06, + "loss": 0.2615, + "step": 13652 + }, + { + "epoch": 0.18520075963103635, + "grad_norm": 5.01185941696167, + "learning_rate": 8.231876113471289e-06, + "loss": 0.3967, + "step": 13653 + }, + { + "epoch": 0.18521432447097125, + "grad_norm": 5.241315841674805, + "learning_rate": 8.231739070851036e-06, + "loss": 0.2974, + "step": 13654 + }, + { + "epoch": 0.18522788931090614, + "grad_norm": 5.6840410232543945, + "learning_rate": 8.231602028230781e-06, + "loss": 0.366, + "step": 13655 + }, + { + "epoch": 0.18524145415084103, + "grad_norm": 5.761959552764893, + "learning_rate": 8.231464985610526e-06, + "loss": 0.5081, + "step": 13656 + }, + { + "epoch": 0.1852550189907759, + "grad_norm": 6.17172908782959, + "learning_rate": 8.23132794299027e-06, + "loss": 0.3458, + "step": 13657 + }, + { + "epoch": 0.1852685838307108, + "grad_norm": 6.029958724975586, + "learning_rate": 8.231190900370015e-06, + "loss": 0.3802, + "step": 13658 + }, + { + "epoch": 0.18528214867064569, + "grad_norm": 4.963016986846924, + "learning_rate": 8.231053857749762e-06, + "loss": 0.3107, + "step": 13659 + }, + { + "epoch": 0.18529571351058058, + "grad_norm": 7.936525821685791, + "learning_rate": 8.230916815129505e-06, + "loss": 0.4299, + "step": 13660 + }, + { + "epoch": 0.18530927835051547, + "grad_norm": 5.466644287109375, + "learning_rate": 8.23077977250925e-06, + "loss": 0.3424, + "step": 13661 + }, + { + "epoch": 0.18532284319045036, + "grad_norm": 6.698638916015625, + "learning_rate": 8.230642729888996e-06, + "loss": 0.4598, + "step": 13662 + }, + { + "epoch": 0.18533640803038523, + "grad_norm": 5.8603081703186035, + "learning_rate": 8.230505687268741e-06, + "loss": 0.4224, + "step": 13663 + }, + { + "epoch": 0.18534997287032012, + "grad_norm": 7.17551851272583, + "learning_rate": 8.230368644648486e-06, + "loss": 0.4856, + "step": 13664 + }, + { + "epoch": 0.18536353771025502, + "grad_norm": 5.9949445724487305, + "learning_rate": 8.230231602028231e-06, + "loss": 0.4742, + "step": 13665 + }, + { + "epoch": 0.1853771025501899, + "grad_norm": 3.5823781490325928, + "learning_rate": 8.230094559407976e-06, + "loss": 0.279, + "step": 13666 + }, + { + "epoch": 0.1853906673901248, + "grad_norm": 4.395098686218262, + "learning_rate": 8.229957516787722e-06, + "loss": 0.3591, + "step": 13667 + }, + { + "epoch": 0.1854042322300597, + "grad_norm": 8.113446235656738, + "learning_rate": 8.229820474167467e-06, + "loss": 0.5759, + "step": 13668 + }, + { + "epoch": 0.18541779706999456, + "grad_norm": 6.742560863494873, + "learning_rate": 8.229683431547212e-06, + "loss": 0.4688, + "step": 13669 + }, + { + "epoch": 0.18543136190992945, + "grad_norm": 6.231607913970947, + "learning_rate": 8.229546388926957e-06, + "loss": 0.4041, + "step": 13670 + }, + { + "epoch": 0.18544492674986435, + "grad_norm": 3.452695608139038, + "learning_rate": 8.229409346306702e-06, + "loss": 0.2917, + "step": 13671 + }, + { + "epoch": 0.18545849158979924, + "grad_norm": 7.690509796142578, + "learning_rate": 8.229272303686448e-06, + "loss": 0.5366, + "step": 13672 + }, + { + "epoch": 0.18547205642973413, + "grad_norm": 5.706130027770996, + "learning_rate": 8.229135261066193e-06, + "loss": 0.2209, + "step": 13673 + }, + { + "epoch": 0.18548562126966903, + "grad_norm": 5.738473415374756, + "learning_rate": 8.228998218445938e-06, + "loss": 0.3828, + "step": 13674 + }, + { + "epoch": 0.18549918610960392, + "grad_norm": 5.498593330383301, + "learning_rate": 8.228861175825681e-06, + "loss": 0.4566, + "step": 13675 + }, + { + "epoch": 0.18551275094953878, + "grad_norm": 7.164052486419678, + "learning_rate": 8.228724133205428e-06, + "loss": 0.4747, + "step": 13676 + }, + { + "epoch": 0.18552631578947368, + "grad_norm": 7.170546531677246, + "learning_rate": 8.228587090585173e-06, + "loss": 0.375, + "step": 13677 + }, + { + "epoch": 0.18553988062940857, + "grad_norm": 5.179811954498291, + "learning_rate": 8.228450047964917e-06, + "loss": 0.3986, + "step": 13678 + }, + { + "epoch": 0.18555344546934346, + "grad_norm": 5.581114292144775, + "learning_rate": 8.228313005344662e-06, + "loss": 0.3443, + "step": 13679 + }, + { + "epoch": 0.18556701030927836, + "grad_norm": 6.506448745727539, + "learning_rate": 8.228175962724409e-06, + "loss": 0.5376, + "step": 13680 + }, + { + "epoch": 0.18558057514921325, + "grad_norm": 5.395453453063965, + "learning_rate": 8.228038920104154e-06, + "loss": 0.3954, + "step": 13681 + }, + { + "epoch": 0.18559413998914812, + "grad_norm": 5.304163455963135, + "learning_rate": 8.227901877483898e-06, + "loss": 0.4306, + "step": 13682 + }, + { + "epoch": 0.185607704829083, + "grad_norm": 6.878720283508301, + "learning_rate": 8.227764834863643e-06, + "loss": 0.5234, + "step": 13683 + }, + { + "epoch": 0.1856212696690179, + "grad_norm": 6.139754295349121, + "learning_rate": 8.227627792243388e-06, + "loss": 0.5151, + "step": 13684 + }, + { + "epoch": 0.1856348345089528, + "grad_norm": 7.61492395401001, + "learning_rate": 8.227490749623133e-06, + "loss": 0.434, + "step": 13685 + }, + { + "epoch": 0.1856483993488877, + "grad_norm": 5.680761337280273, + "learning_rate": 8.227353707002878e-06, + "loss": 0.4193, + "step": 13686 + }, + { + "epoch": 0.18566196418882258, + "grad_norm": 8.321112632751465, + "learning_rate": 8.227216664382623e-06, + "loss": 0.4863, + "step": 13687 + }, + { + "epoch": 0.18567552902875747, + "grad_norm": 5.078488349914551, + "learning_rate": 8.227079621762369e-06, + "loss": 0.3066, + "step": 13688 + }, + { + "epoch": 0.18568909386869234, + "grad_norm": 6.515858173370361, + "learning_rate": 8.226942579142114e-06, + "loss": 0.4449, + "step": 13689 + }, + { + "epoch": 0.18570265870862723, + "grad_norm": 5.345226764678955, + "learning_rate": 8.226805536521859e-06, + "loss": 0.3524, + "step": 13690 + }, + { + "epoch": 0.18571622354856213, + "grad_norm": 6.199385643005371, + "learning_rate": 8.226668493901604e-06, + "loss": 0.5426, + "step": 13691 + }, + { + "epoch": 0.18572978838849702, + "grad_norm": 4.566125392913818, + "learning_rate": 8.22653145128135e-06, + "loss": 0.2721, + "step": 13692 + }, + { + "epoch": 0.1857433532284319, + "grad_norm": 5.815080165863037, + "learning_rate": 8.226394408661095e-06, + "loss": 0.4256, + "step": 13693 + }, + { + "epoch": 0.1857569180683668, + "grad_norm": 7.043276309967041, + "learning_rate": 8.22625736604084e-06, + "loss": 0.4974, + "step": 13694 + }, + { + "epoch": 0.18577048290830167, + "grad_norm": 4.215188026428223, + "learning_rate": 8.226120323420585e-06, + "loss": 0.336, + "step": 13695 + }, + { + "epoch": 0.18578404774823656, + "grad_norm": 5.908486366271973, + "learning_rate": 8.22598328080033e-06, + "loss": 0.4692, + "step": 13696 + }, + { + "epoch": 0.18579761258817146, + "grad_norm": 4.596137046813965, + "learning_rate": 8.225846238180075e-06, + "loss": 0.2527, + "step": 13697 + }, + { + "epoch": 0.18581117742810635, + "grad_norm": 7.182257652282715, + "learning_rate": 8.22570919555982e-06, + "loss": 0.4584, + "step": 13698 + }, + { + "epoch": 0.18582474226804124, + "grad_norm": 4.600341320037842, + "learning_rate": 8.225572152939566e-06, + "loss": 0.3782, + "step": 13699 + }, + { + "epoch": 0.18583830710797614, + "grad_norm": 5.1816887855529785, + "learning_rate": 8.225435110319309e-06, + "loss": 0.3505, + "step": 13700 + }, + { + "epoch": 0.185851871947911, + "grad_norm": 5.784499645233154, + "learning_rate": 8.225298067699054e-06, + "loss": 0.4165, + "step": 13701 + }, + { + "epoch": 0.1858654367878459, + "grad_norm": 6.5613017082214355, + "learning_rate": 8.225161025078801e-06, + "loss": 0.7057, + "step": 13702 + }, + { + "epoch": 0.1858790016277808, + "grad_norm": 7.577389717102051, + "learning_rate": 8.225023982458545e-06, + "loss": 0.6738, + "step": 13703 + }, + { + "epoch": 0.18589256646771568, + "grad_norm": 6.84206485748291, + "learning_rate": 8.22488693983829e-06, + "loss": 0.451, + "step": 13704 + }, + { + "epoch": 0.18590613130765057, + "grad_norm": 4.405789852142334, + "learning_rate": 8.224749897218035e-06, + "loss": 0.4606, + "step": 13705 + }, + { + "epoch": 0.18591969614758547, + "grad_norm": 6.873775959014893, + "learning_rate": 8.22461285459778e-06, + "loss": 0.4608, + "step": 13706 + }, + { + "epoch": 0.18593326098752036, + "grad_norm": 9.517232894897461, + "learning_rate": 8.224475811977525e-06, + "loss": 0.6051, + "step": 13707 + }, + { + "epoch": 0.18594682582745523, + "grad_norm": 5.977022647857666, + "learning_rate": 8.22433876935727e-06, + "loss": 0.5108, + "step": 13708 + }, + { + "epoch": 0.18596039066739012, + "grad_norm": 6.528868198394775, + "learning_rate": 8.224201726737016e-06, + "loss": 0.6063, + "step": 13709 + }, + { + "epoch": 0.185973955507325, + "grad_norm": 4.721982955932617, + "learning_rate": 8.224064684116761e-06, + "loss": 0.3845, + "step": 13710 + }, + { + "epoch": 0.1859875203472599, + "grad_norm": 5.365521430969238, + "learning_rate": 8.223927641496506e-06, + "loss": 0.5002, + "step": 13711 + }, + { + "epoch": 0.1860010851871948, + "grad_norm": 8.298107147216797, + "learning_rate": 8.223790598876251e-06, + "loss": 0.4543, + "step": 13712 + }, + { + "epoch": 0.1860146500271297, + "grad_norm": 6.824392318725586, + "learning_rate": 8.223653556255996e-06, + "loss": 0.4854, + "step": 13713 + }, + { + "epoch": 0.18602821486706456, + "grad_norm": 6.86051082611084, + "learning_rate": 8.223516513635742e-06, + "loss": 0.4268, + "step": 13714 + }, + { + "epoch": 0.18604177970699945, + "grad_norm": 4.899285316467285, + "learning_rate": 8.223379471015487e-06, + "loss": 0.3734, + "step": 13715 + }, + { + "epoch": 0.18605534454693434, + "grad_norm": 6.760961532592773, + "learning_rate": 8.223242428395232e-06, + "loss": 0.4987, + "step": 13716 + }, + { + "epoch": 0.18606890938686924, + "grad_norm": 6.735617160797119, + "learning_rate": 8.223105385774977e-06, + "loss": 0.3601, + "step": 13717 + }, + { + "epoch": 0.18608247422680413, + "grad_norm": 5.218008518218994, + "learning_rate": 8.22296834315472e-06, + "loss": 0.4512, + "step": 13718 + }, + { + "epoch": 0.18609603906673902, + "grad_norm": 7.050449371337891, + "learning_rate": 8.222831300534468e-06, + "loss": 0.4066, + "step": 13719 + }, + { + "epoch": 0.18610960390667391, + "grad_norm": 5.980092525482178, + "learning_rate": 8.222694257914213e-06, + "loss": 0.4495, + "step": 13720 + }, + { + "epoch": 0.18612316874660878, + "grad_norm": 5.084059715270996, + "learning_rate": 8.222557215293956e-06, + "loss": 0.3675, + "step": 13721 + }, + { + "epoch": 0.18613673358654367, + "grad_norm": 6.1771416664123535, + "learning_rate": 8.222420172673701e-06, + "loss": 0.4235, + "step": 13722 + }, + { + "epoch": 0.18615029842647857, + "grad_norm": 5.932520866394043, + "learning_rate": 8.222283130053448e-06, + "loss": 0.4371, + "step": 13723 + }, + { + "epoch": 0.18616386326641346, + "grad_norm": 5.964991569519043, + "learning_rate": 8.222146087433193e-06, + "loss": 0.4158, + "step": 13724 + }, + { + "epoch": 0.18617742810634835, + "grad_norm": 7.836608409881592, + "learning_rate": 8.222009044812937e-06, + "loss": 0.4708, + "step": 13725 + }, + { + "epoch": 0.18619099294628325, + "grad_norm": 6.803744316101074, + "learning_rate": 8.221872002192682e-06, + "loss": 0.5752, + "step": 13726 + }, + { + "epoch": 0.1862045577862181, + "grad_norm": 4.940694808959961, + "learning_rate": 8.221734959572427e-06, + "loss": 0.4243, + "step": 13727 + }, + { + "epoch": 0.186218122626153, + "grad_norm": 6.741756916046143, + "learning_rate": 8.221597916952172e-06, + "loss": 0.669, + "step": 13728 + }, + { + "epoch": 0.1862316874660879, + "grad_norm": 5.574113845825195, + "learning_rate": 8.221460874331918e-06, + "loss": 0.475, + "step": 13729 + }, + { + "epoch": 0.1862452523060228, + "grad_norm": 7.238336086273193, + "learning_rate": 8.221323831711663e-06, + "loss": 0.6618, + "step": 13730 + }, + { + "epoch": 0.18625881714595768, + "grad_norm": 5.650284767150879, + "learning_rate": 8.221186789091408e-06, + "loss": 0.3631, + "step": 13731 + }, + { + "epoch": 0.18627238198589258, + "grad_norm": 4.98298978805542, + "learning_rate": 8.221049746471153e-06, + "loss": 0.4114, + "step": 13732 + }, + { + "epoch": 0.18628594682582744, + "grad_norm": 4.921231269836426, + "learning_rate": 8.220912703850898e-06, + "loss": 0.2947, + "step": 13733 + }, + { + "epoch": 0.18629951166576234, + "grad_norm": 5.52473258972168, + "learning_rate": 8.220775661230644e-06, + "loss": 0.3485, + "step": 13734 + }, + { + "epoch": 0.18631307650569723, + "grad_norm": 5.425942897796631, + "learning_rate": 8.220638618610389e-06, + "loss": 0.446, + "step": 13735 + }, + { + "epoch": 0.18632664134563212, + "grad_norm": 8.51694393157959, + "learning_rate": 8.220501575990134e-06, + "loss": 0.4575, + "step": 13736 + }, + { + "epoch": 0.18634020618556701, + "grad_norm": 5.748275279998779, + "learning_rate": 8.220364533369879e-06, + "loss": 0.5681, + "step": 13737 + }, + { + "epoch": 0.1863537710255019, + "grad_norm": 5.654996395111084, + "learning_rate": 8.220227490749624e-06, + "loss": 0.3501, + "step": 13738 + }, + { + "epoch": 0.1863673358654368, + "grad_norm": 5.499091625213623, + "learning_rate": 8.22009044812937e-06, + "loss": 0.4213, + "step": 13739 + }, + { + "epoch": 0.18638090070537167, + "grad_norm": 5.258005142211914, + "learning_rate": 8.219953405509113e-06, + "loss": 0.3791, + "step": 13740 + }, + { + "epoch": 0.18639446554530656, + "grad_norm": 6.330808639526367, + "learning_rate": 8.21981636288886e-06, + "loss": 0.5324, + "step": 13741 + }, + { + "epoch": 0.18640803038524145, + "grad_norm": 7.45542573928833, + "learning_rate": 8.219679320268605e-06, + "loss": 0.5915, + "step": 13742 + }, + { + "epoch": 0.18642159522517635, + "grad_norm": 6.047036647796631, + "learning_rate": 8.219542277648348e-06, + "loss": 0.4847, + "step": 13743 + }, + { + "epoch": 0.18643516006511124, + "grad_norm": 7.626928329467773, + "learning_rate": 8.219405235028094e-06, + "loss": 0.5747, + "step": 13744 + }, + { + "epoch": 0.18644872490504613, + "grad_norm": 8.068309783935547, + "learning_rate": 8.21926819240784e-06, + "loss": 0.4805, + "step": 13745 + }, + { + "epoch": 0.186462289744981, + "grad_norm": 5.2319560050964355, + "learning_rate": 8.219131149787584e-06, + "loss": 0.4083, + "step": 13746 + }, + { + "epoch": 0.1864758545849159, + "grad_norm": 4.694599151611328, + "learning_rate": 8.218994107167329e-06, + "loss": 0.3524, + "step": 13747 + }, + { + "epoch": 0.18648941942485078, + "grad_norm": 8.13472843170166, + "learning_rate": 8.218857064547074e-06, + "loss": 0.6374, + "step": 13748 + }, + { + "epoch": 0.18650298426478568, + "grad_norm": 6.218682765960693, + "learning_rate": 8.218720021926821e-06, + "loss": 0.512, + "step": 13749 + }, + { + "epoch": 0.18651654910472057, + "grad_norm": 7.220652103424072, + "learning_rate": 8.218582979306565e-06, + "loss": 0.5095, + "step": 13750 + }, + { + "epoch": 0.18653011394465546, + "grad_norm": 5.656858444213867, + "learning_rate": 8.21844593668631e-06, + "loss": 0.4428, + "step": 13751 + }, + { + "epoch": 0.18654367878459036, + "grad_norm": 7.152047157287598, + "learning_rate": 8.218308894066055e-06, + "loss": 0.47, + "step": 13752 + }, + { + "epoch": 0.18655724362452522, + "grad_norm": 7.899967193603516, + "learning_rate": 8.2181718514458e-06, + "loss": 0.5995, + "step": 13753 + }, + { + "epoch": 0.18657080846446011, + "grad_norm": 5.93056058883667, + "learning_rate": 8.218034808825545e-06, + "loss": 0.4354, + "step": 13754 + }, + { + "epoch": 0.186584373304395, + "grad_norm": 6.845528602600098, + "learning_rate": 8.21789776620529e-06, + "loss": 0.7238, + "step": 13755 + }, + { + "epoch": 0.1865979381443299, + "grad_norm": 6.204400539398193, + "learning_rate": 8.217760723585036e-06, + "loss": 0.4543, + "step": 13756 + }, + { + "epoch": 0.1866115029842648, + "grad_norm": 5.454187393188477, + "learning_rate": 8.217623680964781e-06, + "loss": 0.4697, + "step": 13757 + }, + { + "epoch": 0.1866250678241997, + "grad_norm": 5.1478753089904785, + "learning_rate": 8.217486638344526e-06, + "loss": 0.3417, + "step": 13758 + }, + { + "epoch": 0.18663863266413455, + "grad_norm": 6.0909247398376465, + "learning_rate": 8.217349595724271e-06, + "loss": 0.4031, + "step": 13759 + }, + { + "epoch": 0.18665219750406944, + "grad_norm": 5.891741752624512, + "learning_rate": 8.217212553104016e-06, + "loss": 0.4285, + "step": 13760 + }, + { + "epoch": 0.18666576234400434, + "grad_norm": 6.937044620513916, + "learning_rate": 8.21707551048376e-06, + "loss": 0.4638, + "step": 13761 + }, + { + "epoch": 0.18667932718393923, + "grad_norm": 4.820916652679443, + "learning_rate": 8.216938467863507e-06, + "loss": 0.2921, + "step": 13762 + }, + { + "epoch": 0.18669289202387412, + "grad_norm": 6.011247158050537, + "learning_rate": 8.216801425243252e-06, + "loss": 0.4609, + "step": 13763 + }, + { + "epoch": 0.18670645686380902, + "grad_norm": 6.755220413208008, + "learning_rate": 8.216664382622997e-06, + "loss": 0.4864, + "step": 13764 + }, + { + "epoch": 0.18672002170374388, + "grad_norm": 8.137676239013672, + "learning_rate": 8.21652734000274e-06, + "loss": 0.7231, + "step": 13765 + }, + { + "epoch": 0.18673358654367878, + "grad_norm": 9.238081932067871, + "learning_rate": 8.216390297382488e-06, + "loss": 0.5865, + "step": 13766 + }, + { + "epoch": 0.18674715138361367, + "grad_norm": 5.743433475494385, + "learning_rate": 8.216253254762233e-06, + "loss": 0.3665, + "step": 13767 + }, + { + "epoch": 0.18676071622354856, + "grad_norm": 7.673309326171875, + "learning_rate": 8.216116212141976e-06, + "loss": 0.4685, + "step": 13768 + }, + { + "epoch": 0.18677428106348345, + "grad_norm": 6.665100574493408, + "learning_rate": 8.215979169521721e-06, + "loss": 0.3395, + "step": 13769 + }, + { + "epoch": 0.18678784590341835, + "grad_norm": 5.951933860778809, + "learning_rate": 8.215842126901467e-06, + "loss": 0.329, + "step": 13770 + }, + { + "epoch": 0.18680141074335324, + "grad_norm": 8.68554401397705, + "learning_rate": 8.215705084281212e-06, + "loss": 0.5395, + "step": 13771 + }, + { + "epoch": 0.1868149755832881, + "grad_norm": 5.754349231719971, + "learning_rate": 8.215568041660957e-06, + "loss": 0.5899, + "step": 13772 + }, + { + "epoch": 0.186828540423223, + "grad_norm": 3.666478395462036, + "learning_rate": 8.215430999040702e-06, + "loss": 0.3077, + "step": 13773 + }, + { + "epoch": 0.1868421052631579, + "grad_norm": 5.6492743492126465, + "learning_rate": 8.215293956420447e-06, + "loss": 0.4297, + "step": 13774 + }, + { + "epoch": 0.18685567010309279, + "grad_norm": 6.508048057556152, + "learning_rate": 8.215156913800192e-06, + "loss": 0.4252, + "step": 13775 + }, + { + "epoch": 0.18686923494302768, + "grad_norm": 6.247976779937744, + "learning_rate": 8.215019871179938e-06, + "loss": 0.3804, + "step": 13776 + }, + { + "epoch": 0.18688279978296257, + "grad_norm": 5.821223258972168, + "learning_rate": 8.214882828559683e-06, + "loss": 0.4063, + "step": 13777 + }, + { + "epoch": 0.18689636462289744, + "grad_norm": 6.4343581199646, + "learning_rate": 8.214745785939428e-06, + "loss": 0.394, + "step": 13778 + }, + { + "epoch": 0.18690992946283233, + "grad_norm": 4.587652206420898, + "learning_rate": 8.214608743319173e-06, + "loss": 0.306, + "step": 13779 + }, + { + "epoch": 0.18692349430276722, + "grad_norm": 5.757845401763916, + "learning_rate": 8.214471700698918e-06, + "loss": 0.3819, + "step": 13780 + }, + { + "epoch": 0.18693705914270212, + "grad_norm": 6.3602375984191895, + "learning_rate": 8.214334658078664e-06, + "loss": 0.2855, + "step": 13781 + }, + { + "epoch": 0.186950623982637, + "grad_norm": 5.508847236633301, + "learning_rate": 8.214197615458409e-06, + "loss": 0.4291, + "step": 13782 + }, + { + "epoch": 0.1869641888225719, + "grad_norm": 6.241469860076904, + "learning_rate": 8.214060572838152e-06, + "loss": 0.3819, + "step": 13783 + }, + { + "epoch": 0.1869777536625068, + "grad_norm": 5.4220123291015625, + "learning_rate": 8.213923530217899e-06, + "loss": 0.4371, + "step": 13784 + }, + { + "epoch": 0.18699131850244166, + "grad_norm": 5.946225166320801, + "learning_rate": 8.213786487597644e-06, + "loss": 0.3663, + "step": 13785 + }, + { + "epoch": 0.18700488334237655, + "grad_norm": 6.703976154327393, + "learning_rate": 8.213649444977388e-06, + "loss": 0.4232, + "step": 13786 + }, + { + "epoch": 0.18701844818231145, + "grad_norm": 4.926299571990967, + "learning_rate": 8.213512402357133e-06, + "loss": 0.5004, + "step": 13787 + }, + { + "epoch": 0.18703201302224634, + "grad_norm": 6.868411540985107, + "learning_rate": 8.21337535973688e-06, + "loss": 0.3862, + "step": 13788 + }, + { + "epoch": 0.18704557786218123, + "grad_norm": 4.852699279785156, + "learning_rate": 8.213238317116623e-06, + "loss": 0.3844, + "step": 13789 + }, + { + "epoch": 0.18705914270211613, + "grad_norm": 5.6598100662231445, + "learning_rate": 8.213101274496368e-06, + "loss": 0.3325, + "step": 13790 + }, + { + "epoch": 0.187072707542051, + "grad_norm": 5.80766487121582, + "learning_rate": 8.212964231876114e-06, + "loss": 0.3163, + "step": 13791 + }, + { + "epoch": 0.18708627238198589, + "grad_norm": 4.206270217895508, + "learning_rate": 8.21282718925586e-06, + "loss": 0.2711, + "step": 13792 + }, + { + "epoch": 0.18709983722192078, + "grad_norm": 7.30620813369751, + "learning_rate": 8.212690146635604e-06, + "loss": 0.6169, + "step": 13793 + }, + { + "epoch": 0.18711340206185567, + "grad_norm": 7.018466949462891, + "learning_rate": 8.21255310401535e-06, + "loss": 0.4014, + "step": 13794 + }, + { + "epoch": 0.18712696690179056, + "grad_norm": 5.876809120178223, + "learning_rate": 8.212416061395094e-06, + "loss": 0.4283, + "step": 13795 + }, + { + "epoch": 0.18714053174172546, + "grad_norm": 5.439571380615234, + "learning_rate": 8.21227901877484e-06, + "loss": 0.4666, + "step": 13796 + }, + { + "epoch": 0.18715409658166032, + "grad_norm": 5.903409957885742, + "learning_rate": 8.212141976154585e-06, + "loss": 0.4401, + "step": 13797 + }, + { + "epoch": 0.18716766142159522, + "grad_norm": 4.919546127319336, + "learning_rate": 8.21200493353433e-06, + "loss": 0.4158, + "step": 13798 + }, + { + "epoch": 0.1871812262615301, + "grad_norm": 5.7511162757873535, + "learning_rate": 8.211867890914075e-06, + "loss": 0.3336, + "step": 13799 + }, + { + "epoch": 0.187194791101465, + "grad_norm": 5.237600326538086, + "learning_rate": 8.21173084829382e-06, + "loss": 0.465, + "step": 13800 + }, + { + "epoch": 0.1872083559413999, + "grad_norm": 7.833162784576416, + "learning_rate": 8.211593805673565e-06, + "loss": 0.6247, + "step": 13801 + }, + { + "epoch": 0.1872219207813348, + "grad_norm": 5.523808002471924, + "learning_rate": 8.21145676305331e-06, + "loss": 0.3844, + "step": 13802 + }, + { + "epoch": 0.18723548562126968, + "grad_norm": 4.994436740875244, + "learning_rate": 8.211319720433056e-06, + "loss": 0.2763, + "step": 13803 + }, + { + "epoch": 0.18724905046120455, + "grad_norm": 5.718303203582764, + "learning_rate": 8.2111826778128e-06, + "loss": 0.4201, + "step": 13804 + }, + { + "epoch": 0.18726261530113944, + "grad_norm": 4.752536773681641, + "learning_rate": 8.211045635192546e-06, + "loss": 0.4174, + "step": 13805 + }, + { + "epoch": 0.18727618014107433, + "grad_norm": 6.551331043243408, + "learning_rate": 8.210908592572291e-06, + "loss": 0.472, + "step": 13806 + }, + { + "epoch": 0.18728974498100923, + "grad_norm": 6.260222911834717, + "learning_rate": 8.210771549952036e-06, + "loss": 0.3972, + "step": 13807 + }, + { + "epoch": 0.18730330982094412, + "grad_norm": 6.553835868835449, + "learning_rate": 8.21063450733178e-06, + "loss": 0.4058, + "step": 13808 + }, + { + "epoch": 0.187316874660879, + "grad_norm": 6.145380020141602, + "learning_rate": 8.210497464711525e-06, + "loss": 0.3311, + "step": 13809 + }, + { + "epoch": 0.18733043950081388, + "grad_norm": 6.744847297668457, + "learning_rate": 8.210360422091272e-06, + "loss": 0.4152, + "step": 13810 + }, + { + "epoch": 0.18734400434074877, + "grad_norm": 7.158385276794434, + "learning_rate": 8.210223379471016e-06, + "loss": 0.4504, + "step": 13811 + }, + { + "epoch": 0.18735756918068366, + "grad_norm": 5.6679863929748535, + "learning_rate": 8.21008633685076e-06, + "loss": 0.415, + "step": 13812 + }, + { + "epoch": 0.18737113402061856, + "grad_norm": 5.455498218536377, + "learning_rate": 8.209949294230506e-06, + "loss": 0.3387, + "step": 13813 + }, + { + "epoch": 0.18738469886055345, + "grad_norm": 5.190299034118652, + "learning_rate": 8.209812251610251e-06, + "loss": 0.2745, + "step": 13814 + }, + { + "epoch": 0.18739826370048834, + "grad_norm": 5.4270148277282715, + "learning_rate": 8.209675208989996e-06, + "loss": 0.4083, + "step": 13815 + }, + { + "epoch": 0.18741182854042324, + "grad_norm": 6.621206760406494, + "learning_rate": 8.209538166369741e-06, + "loss": 0.4115, + "step": 13816 + }, + { + "epoch": 0.1874253933803581, + "grad_norm": 5.124413013458252, + "learning_rate": 8.209401123749487e-06, + "loss": 0.3297, + "step": 13817 + }, + { + "epoch": 0.187438958220293, + "grad_norm": 5.357955455780029, + "learning_rate": 8.209264081129232e-06, + "loss": 0.2483, + "step": 13818 + }, + { + "epoch": 0.1874525230602279, + "grad_norm": 6.256579875946045, + "learning_rate": 8.209127038508977e-06, + "loss": 0.3252, + "step": 13819 + }, + { + "epoch": 0.18746608790016278, + "grad_norm": 4.509673595428467, + "learning_rate": 8.208989995888722e-06, + "loss": 0.2736, + "step": 13820 + }, + { + "epoch": 0.18747965274009767, + "grad_norm": 7.025852203369141, + "learning_rate": 8.208852953268467e-06, + "loss": 0.4024, + "step": 13821 + }, + { + "epoch": 0.18749321758003257, + "grad_norm": 8.084632873535156, + "learning_rate": 8.208715910648212e-06, + "loss": 0.4221, + "step": 13822 + }, + { + "epoch": 0.18750678241996743, + "grad_norm": 5.114243030548096, + "learning_rate": 8.208578868027958e-06, + "loss": 0.3469, + "step": 13823 + }, + { + "epoch": 0.18752034725990233, + "grad_norm": 5.176779270172119, + "learning_rate": 8.208441825407703e-06, + "loss": 0.2865, + "step": 13824 + }, + { + "epoch": 0.18753391209983722, + "grad_norm": 4.226426601409912, + "learning_rate": 8.208304782787448e-06, + "loss": 0.2413, + "step": 13825 + }, + { + "epoch": 0.1875474769397721, + "grad_norm": 5.6660871505737305, + "learning_rate": 8.208167740167191e-06, + "loss": 0.4305, + "step": 13826 + }, + { + "epoch": 0.187561041779707, + "grad_norm": 5.168315410614014, + "learning_rate": 8.208030697546938e-06, + "loss": 0.3284, + "step": 13827 + }, + { + "epoch": 0.1875746066196419, + "grad_norm": 5.445887088775635, + "learning_rate": 8.207893654926684e-06, + "loss": 0.1749, + "step": 13828 + }, + { + "epoch": 0.18758817145957676, + "grad_norm": 6.977290153503418, + "learning_rate": 8.207756612306427e-06, + "loss": 0.439, + "step": 13829 + }, + { + "epoch": 0.18760173629951166, + "grad_norm": 5.742663860321045, + "learning_rate": 8.207619569686172e-06, + "loss": 0.3531, + "step": 13830 + }, + { + "epoch": 0.18761530113944655, + "grad_norm": 6.630847930908203, + "learning_rate": 8.207482527065919e-06, + "loss": 0.4537, + "step": 13831 + }, + { + "epoch": 0.18762886597938144, + "grad_norm": 6.283926010131836, + "learning_rate": 8.207345484445664e-06, + "loss": 0.3246, + "step": 13832 + }, + { + "epoch": 0.18764243081931634, + "grad_norm": 5.784730434417725, + "learning_rate": 8.207208441825408e-06, + "loss": 0.3037, + "step": 13833 + }, + { + "epoch": 0.18765599565925123, + "grad_norm": 5.4100446701049805, + "learning_rate": 8.207071399205153e-06, + "loss": 0.4797, + "step": 13834 + }, + { + "epoch": 0.18766956049918612, + "grad_norm": 5.5113725662231445, + "learning_rate": 8.2069343565849e-06, + "loss": 0.271, + "step": 13835 + }, + { + "epoch": 0.187683125339121, + "grad_norm": 5.249514579772949, + "learning_rate": 8.206797313964643e-06, + "loss": 0.263, + "step": 13836 + }, + { + "epoch": 0.18769669017905588, + "grad_norm": 5.70444393157959, + "learning_rate": 8.206660271344388e-06, + "loss": 0.3361, + "step": 13837 + }, + { + "epoch": 0.18771025501899077, + "grad_norm": 6.552860260009766, + "learning_rate": 8.206523228724134e-06, + "loss": 0.4727, + "step": 13838 + }, + { + "epoch": 0.18772381985892567, + "grad_norm": 6.169086933135986, + "learning_rate": 8.206386186103879e-06, + "loss": 0.2052, + "step": 13839 + }, + { + "epoch": 0.18773738469886056, + "grad_norm": 7.693750858306885, + "learning_rate": 8.206249143483624e-06, + "loss": 0.4377, + "step": 13840 + }, + { + "epoch": 0.18775094953879545, + "grad_norm": 5.682271480560303, + "learning_rate": 8.20611210086337e-06, + "loss": 0.2962, + "step": 13841 + }, + { + "epoch": 0.18776451437873032, + "grad_norm": 6.1705708503723145, + "learning_rate": 8.205975058243114e-06, + "loss": 0.4203, + "step": 13842 + }, + { + "epoch": 0.1877780792186652, + "grad_norm": 4.653284072875977, + "learning_rate": 8.20583801562286e-06, + "loss": 0.3, + "step": 13843 + }, + { + "epoch": 0.1877916440586001, + "grad_norm": 4.992091178894043, + "learning_rate": 8.205700973002605e-06, + "loss": 0.2266, + "step": 13844 + }, + { + "epoch": 0.187805208898535, + "grad_norm": 6.336411952972412, + "learning_rate": 8.20556393038235e-06, + "loss": 0.3491, + "step": 13845 + }, + { + "epoch": 0.1878187737384699, + "grad_norm": 5.84792423248291, + "learning_rate": 8.205426887762095e-06, + "loss": 0.4381, + "step": 13846 + }, + { + "epoch": 0.18783233857840478, + "grad_norm": 5.264190196990967, + "learning_rate": 8.20528984514184e-06, + "loss": 0.1672, + "step": 13847 + }, + { + "epoch": 0.18784590341833968, + "grad_norm": 5.180126190185547, + "learning_rate": 8.205152802521585e-06, + "loss": 0.3058, + "step": 13848 + }, + { + "epoch": 0.18785946825827454, + "grad_norm": 7.171006202697754, + "learning_rate": 8.20501575990133e-06, + "loss": 0.2985, + "step": 13849 + }, + { + "epoch": 0.18787303309820944, + "grad_norm": 5.333992004394531, + "learning_rate": 8.204878717281076e-06, + "loss": 0.3083, + "step": 13850 + }, + { + "epoch": 0.18788659793814433, + "grad_norm": 4.42685604095459, + "learning_rate": 8.20474167466082e-06, + "loss": 0.2741, + "step": 13851 + }, + { + "epoch": 0.18790016277807922, + "grad_norm": 4.826344966888428, + "learning_rate": 8.204604632040564e-06, + "loss": 0.3135, + "step": 13852 + }, + { + "epoch": 0.18791372761801411, + "grad_norm": 4.857753276824951, + "learning_rate": 8.204467589420311e-06, + "loss": 0.2799, + "step": 13853 + }, + { + "epoch": 0.187927292457949, + "grad_norm": 4.735535621643066, + "learning_rate": 8.204330546800055e-06, + "loss": 0.2574, + "step": 13854 + }, + { + "epoch": 0.18794085729788387, + "grad_norm": 5.274987697601318, + "learning_rate": 8.2041935041798e-06, + "loss": 0.4083, + "step": 13855 + }, + { + "epoch": 0.18795442213781877, + "grad_norm": 7.592294216156006, + "learning_rate": 8.204056461559545e-06, + "loss": 0.3449, + "step": 13856 + }, + { + "epoch": 0.18796798697775366, + "grad_norm": 5.019386291503906, + "learning_rate": 8.203919418939292e-06, + "loss": 0.1984, + "step": 13857 + }, + { + "epoch": 0.18798155181768855, + "grad_norm": 5.194469451904297, + "learning_rate": 8.203782376319036e-06, + "loss": 0.296, + "step": 13858 + }, + { + "epoch": 0.18799511665762345, + "grad_norm": 4.947722911834717, + "learning_rate": 8.20364533369878e-06, + "loss": 0.2806, + "step": 13859 + }, + { + "epoch": 0.18800868149755834, + "grad_norm": 5.773852348327637, + "learning_rate": 8.203508291078526e-06, + "loss": 0.3541, + "step": 13860 + }, + { + "epoch": 0.1880222463374932, + "grad_norm": 5.577038764953613, + "learning_rate": 8.203371248458271e-06, + "loss": 0.308, + "step": 13861 + }, + { + "epoch": 0.1880358111774281, + "grad_norm": 9.71997356414795, + "learning_rate": 8.203234205838016e-06, + "loss": 0.5309, + "step": 13862 + }, + { + "epoch": 0.188049376017363, + "grad_norm": 6.847711563110352, + "learning_rate": 8.203097163217761e-06, + "loss": 0.3355, + "step": 13863 + }, + { + "epoch": 0.18806294085729788, + "grad_norm": 5.389049053192139, + "learning_rate": 8.202960120597507e-06, + "loss": 0.2742, + "step": 13864 + }, + { + "epoch": 0.18807650569723278, + "grad_norm": 6.002936363220215, + "learning_rate": 8.202823077977252e-06, + "loss": 0.3068, + "step": 13865 + }, + { + "epoch": 0.18809007053716767, + "grad_norm": 5.3636040687561035, + "learning_rate": 8.202686035356997e-06, + "loss": 0.3385, + "step": 13866 + }, + { + "epoch": 0.18810363537710256, + "grad_norm": 6.525244235992432, + "learning_rate": 8.202548992736742e-06, + "loss": 0.4567, + "step": 13867 + }, + { + "epoch": 0.18811720021703743, + "grad_norm": 5.885673522949219, + "learning_rate": 8.202411950116487e-06, + "loss": 0.3855, + "step": 13868 + }, + { + "epoch": 0.18813076505697232, + "grad_norm": 6.258731842041016, + "learning_rate": 8.20227490749623e-06, + "loss": 0.3822, + "step": 13869 + }, + { + "epoch": 0.18814432989690721, + "grad_norm": 6.109626770019531, + "learning_rate": 8.202137864875978e-06, + "loss": 0.3011, + "step": 13870 + }, + { + "epoch": 0.1881578947368421, + "grad_norm": 5.577147960662842, + "learning_rate": 8.202000822255723e-06, + "loss": 0.3633, + "step": 13871 + }, + { + "epoch": 0.188171459576777, + "grad_norm": 5.122972011566162, + "learning_rate": 8.201863779635468e-06, + "loss": 0.3237, + "step": 13872 + }, + { + "epoch": 0.1881850244167119, + "grad_norm": 6.091273784637451, + "learning_rate": 8.201726737015212e-06, + "loss": 0.4441, + "step": 13873 + }, + { + "epoch": 0.18819858925664676, + "grad_norm": 7.713595390319824, + "learning_rate": 8.201589694394958e-06, + "loss": 0.5623, + "step": 13874 + }, + { + "epoch": 0.18821215409658165, + "grad_norm": 5.184560298919678, + "learning_rate": 8.201452651774704e-06, + "loss": 0.3281, + "step": 13875 + }, + { + "epoch": 0.18822571893651655, + "grad_norm": 6.526390075683594, + "learning_rate": 8.201315609154447e-06, + "loss": 0.4555, + "step": 13876 + }, + { + "epoch": 0.18823928377645144, + "grad_norm": 4.15150260925293, + "learning_rate": 8.201178566534192e-06, + "loss": 0.3766, + "step": 13877 + }, + { + "epoch": 0.18825284861638633, + "grad_norm": 6.0930256843566895, + "learning_rate": 8.201041523913937e-06, + "loss": 0.3451, + "step": 13878 + }, + { + "epoch": 0.18826641345632122, + "grad_norm": 6.318484783172607, + "learning_rate": 8.200904481293683e-06, + "loss": 0.3946, + "step": 13879 + }, + { + "epoch": 0.18827997829625612, + "grad_norm": 6.660577774047852, + "learning_rate": 8.200767438673428e-06, + "loss": 0.5013, + "step": 13880 + }, + { + "epoch": 0.18829354313619098, + "grad_norm": 7.111180305480957, + "learning_rate": 8.200630396053173e-06, + "loss": 0.4696, + "step": 13881 + }, + { + "epoch": 0.18830710797612588, + "grad_norm": 5.061542510986328, + "learning_rate": 8.200493353432918e-06, + "loss": 0.3722, + "step": 13882 + }, + { + "epoch": 0.18832067281606077, + "grad_norm": 6.0146050453186035, + "learning_rate": 8.200356310812663e-06, + "loss": 0.371, + "step": 13883 + }, + { + "epoch": 0.18833423765599566, + "grad_norm": 8.182628631591797, + "learning_rate": 8.200219268192408e-06, + "loss": 0.5358, + "step": 13884 + }, + { + "epoch": 0.18834780249593056, + "grad_norm": 5.522884368896484, + "learning_rate": 8.200082225572154e-06, + "loss": 0.2916, + "step": 13885 + }, + { + "epoch": 0.18836136733586545, + "grad_norm": 6.194279670715332, + "learning_rate": 8.199945182951899e-06, + "loss": 0.3889, + "step": 13886 + }, + { + "epoch": 0.1883749321758003, + "grad_norm": 5.824500560760498, + "learning_rate": 8.199808140331644e-06, + "loss": 0.3276, + "step": 13887 + }, + { + "epoch": 0.1883884970157352, + "grad_norm": 5.780914783477783, + "learning_rate": 8.19967109771139e-06, + "loss": 0.3625, + "step": 13888 + }, + { + "epoch": 0.1884020618556701, + "grad_norm": 6.787834167480469, + "learning_rate": 8.199534055091134e-06, + "loss": 0.385, + "step": 13889 + }, + { + "epoch": 0.188415626695605, + "grad_norm": 4.076481342315674, + "learning_rate": 8.19939701247088e-06, + "loss": 0.2844, + "step": 13890 + }, + { + "epoch": 0.18842919153553989, + "grad_norm": 5.309345245361328, + "learning_rate": 8.199259969850623e-06, + "loss": 0.2618, + "step": 13891 + }, + { + "epoch": 0.18844275637547478, + "grad_norm": 4.195186614990234, + "learning_rate": 8.19912292723037e-06, + "loss": 0.2508, + "step": 13892 + }, + { + "epoch": 0.18845632121540964, + "grad_norm": 4.266801834106445, + "learning_rate": 8.198985884610115e-06, + "loss": 0.1969, + "step": 13893 + }, + { + "epoch": 0.18846988605534454, + "grad_norm": 4.047187328338623, + "learning_rate": 8.198848841989859e-06, + "loss": 0.2421, + "step": 13894 + }, + { + "epoch": 0.18848345089527943, + "grad_norm": 5.60728120803833, + "learning_rate": 8.198711799369604e-06, + "loss": 0.3164, + "step": 13895 + }, + { + "epoch": 0.18849701573521432, + "grad_norm": 4.070285797119141, + "learning_rate": 8.19857475674935e-06, + "loss": 0.2741, + "step": 13896 + }, + { + "epoch": 0.18851058057514922, + "grad_norm": 4.505557537078857, + "learning_rate": 8.198437714129094e-06, + "loss": 0.1872, + "step": 13897 + }, + { + "epoch": 0.1885241454150841, + "grad_norm": 5.7310638427734375, + "learning_rate": 8.19830067150884e-06, + "loss": 0.3372, + "step": 13898 + }, + { + "epoch": 0.188537710255019, + "grad_norm": 6.992790699005127, + "learning_rate": 8.198163628888584e-06, + "loss": 0.4044, + "step": 13899 + }, + { + "epoch": 0.18855127509495387, + "grad_norm": 4.3462958335876465, + "learning_rate": 8.198026586268331e-06, + "loss": 0.2912, + "step": 13900 + }, + { + "epoch": 0.18856483993488876, + "grad_norm": 3.8970491886138916, + "learning_rate": 8.197889543648075e-06, + "loss": 0.2231, + "step": 13901 + }, + { + "epoch": 0.18857840477482365, + "grad_norm": 4.726510047912598, + "learning_rate": 8.19775250102782e-06, + "loss": 0.3289, + "step": 13902 + }, + { + "epoch": 0.18859196961475855, + "grad_norm": 6.360950469970703, + "learning_rate": 8.197615458407565e-06, + "loss": 0.3059, + "step": 13903 + }, + { + "epoch": 0.18860553445469344, + "grad_norm": 5.529661655426025, + "learning_rate": 8.19747841578731e-06, + "loss": 0.294, + "step": 13904 + }, + { + "epoch": 0.18861909929462833, + "grad_norm": 5.449735164642334, + "learning_rate": 8.197341373167056e-06, + "loss": 0.2759, + "step": 13905 + }, + { + "epoch": 0.1886326641345632, + "grad_norm": 5.308660507202148, + "learning_rate": 8.1972043305468e-06, + "loss": 0.1717, + "step": 13906 + }, + { + "epoch": 0.1886462289744981, + "grad_norm": 3.8664193153381348, + "learning_rate": 8.197067287926546e-06, + "loss": 0.1486, + "step": 13907 + }, + { + "epoch": 0.18865979381443299, + "grad_norm": 4.331371784210205, + "learning_rate": 8.196930245306291e-06, + "loss": 0.2724, + "step": 13908 + }, + { + "epoch": 0.18867335865436788, + "grad_norm": 4.060304641723633, + "learning_rate": 8.196793202686036e-06, + "loss": 0.2123, + "step": 13909 + }, + { + "epoch": 0.18868692349430277, + "grad_norm": 5.118837833404541, + "learning_rate": 8.196656160065781e-06, + "loss": 0.3326, + "step": 13910 + }, + { + "epoch": 0.18870048833423766, + "grad_norm": 6.937408447265625, + "learning_rate": 8.196519117445527e-06, + "loss": 0.3426, + "step": 13911 + }, + { + "epoch": 0.18871405317417256, + "grad_norm": 7.135918140411377, + "learning_rate": 8.19638207482527e-06, + "loss": 0.403, + "step": 13912 + }, + { + "epoch": 0.18872761801410742, + "grad_norm": 7.0136799812316895, + "learning_rate": 8.196245032205017e-06, + "loss": 0.388, + "step": 13913 + }, + { + "epoch": 0.18874118285404232, + "grad_norm": 5.426177978515625, + "learning_rate": 8.196107989584762e-06, + "loss": 0.4032, + "step": 13914 + }, + { + "epoch": 0.1887547476939772, + "grad_norm": 5.904170989990234, + "learning_rate": 8.195970946964507e-06, + "loss": 0.3107, + "step": 13915 + }, + { + "epoch": 0.1887683125339121, + "grad_norm": 4.454639434814453, + "learning_rate": 8.19583390434425e-06, + "loss": 0.2877, + "step": 13916 + }, + { + "epoch": 0.188781877373847, + "grad_norm": 5.166262626647949, + "learning_rate": 8.195696861723998e-06, + "loss": 0.3242, + "step": 13917 + }, + { + "epoch": 0.1887954422137819, + "grad_norm": 4.399059295654297, + "learning_rate": 8.195559819103743e-06, + "loss": 0.2189, + "step": 13918 + }, + { + "epoch": 0.18880900705371675, + "grad_norm": 5.833034038543701, + "learning_rate": 8.195422776483486e-06, + "loss": 0.4245, + "step": 13919 + }, + { + "epoch": 0.18882257189365165, + "grad_norm": 7.69997501373291, + "learning_rate": 8.195285733863232e-06, + "loss": 0.511, + "step": 13920 + }, + { + "epoch": 0.18883613673358654, + "grad_norm": 8.423707962036133, + "learning_rate": 8.195148691242977e-06, + "loss": 0.4304, + "step": 13921 + }, + { + "epoch": 0.18884970157352143, + "grad_norm": 5.98729944229126, + "learning_rate": 8.195011648622722e-06, + "loss": 0.4401, + "step": 13922 + }, + { + "epoch": 0.18886326641345633, + "grad_norm": 5.976038932800293, + "learning_rate": 8.194874606002467e-06, + "loss": 0.2914, + "step": 13923 + }, + { + "epoch": 0.18887683125339122, + "grad_norm": 5.307066440582275, + "learning_rate": 8.194737563382212e-06, + "loss": 0.3168, + "step": 13924 + }, + { + "epoch": 0.18889039609332609, + "grad_norm": 7.051836013793945, + "learning_rate": 8.194600520761957e-06, + "loss": 0.4467, + "step": 13925 + }, + { + "epoch": 0.18890396093326098, + "grad_norm": 4.927231788635254, + "learning_rate": 8.194463478141703e-06, + "loss": 0.3326, + "step": 13926 + }, + { + "epoch": 0.18891752577319587, + "grad_norm": 4.884829521179199, + "learning_rate": 8.194326435521448e-06, + "loss": 0.2744, + "step": 13927 + }, + { + "epoch": 0.18893109061313076, + "grad_norm": 6.0401129722595215, + "learning_rate": 8.194189392901193e-06, + "loss": 0.3534, + "step": 13928 + }, + { + "epoch": 0.18894465545306566, + "grad_norm": 5.182314395904541, + "learning_rate": 8.194052350280938e-06, + "loss": 0.3011, + "step": 13929 + }, + { + "epoch": 0.18895822029300055, + "grad_norm": 6.348968029022217, + "learning_rate": 8.193915307660683e-06, + "loss": 0.373, + "step": 13930 + }, + { + "epoch": 0.18897178513293544, + "grad_norm": 5.790460586547852, + "learning_rate": 8.193778265040429e-06, + "loss": 0.2436, + "step": 13931 + }, + { + "epoch": 0.1889853499728703, + "grad_norm": 6.186721324920654, + "learning_rate": 8.193641222420174e-06, + "loss": 0.4433, + "step": 13932 + }, + { + "epoch": 0.1889989148128052, + "grad_norm": 4.226690769195557, + "learning_rate": 8.193504179799919e-06, + "loss": 0.1944, + "step": 13933 + }, + { + "epoch": 0.1890124796527401, + "grad_norm": 7.712936878204346, + "learning_rate": 8.193367137179662e-06, + "loss": 0.559, + "step": 13934 + }, + { + "epoch": 0.189026044492675, + "grad_norm": 4.152501106262207, + "learning_rate": 8.19323009455941e-06, + "loss": 0.2438, + "step": 13935 + }, + { + "epoch": 0.18903960933260988, + "grad_norm": 5.623419284820557, + "learning_rate": 8.193093051939154e-06, + "loss": 0.305, + "step": 13936 + }, + { + "epoch": 0.18905317417254477, + "grad_norm": 6.975953578948975, + "learning_rate": 8.192956009318898e-06, + "loss": 0.5224, + "step": 13937 + }, + { + "epoch": 0.18906673901247964, + "grad_norm": 4.975018501281738, + "learning_rate": 8.192818966698643e-06, + "loss": 0.3134, + "step": 13938 + }, + { + "epoch": 0.18908030385241453, + "grad_norm": 5.030195713043213, + "learning_rate": 8.19268192407839e-06, + "loss": 0.3298, + "step": 13939 + }, + { + "epoch": 0.18909386869234943, + "grad_norm": 5.594305992126465, + "learning_rate": 8.192544881458135e-06, + "loss": 0.2886, + "step": 13940 + }, + { + "epoch": 0.18910743353228432, + "grad_norm": 6.998929977416992, + "learning_rate": 8.192407838837879e-06, + "loss": 0.4107, + "step": 13941 + }, + { + "epoch": 0.1891209983722192, + "grad_norm": 7.012583255767822, + "learning_rate": 8.192270796217624e-06, + "loss": 0.4641, + "step": 13942 + }, + { + "epoch": 0.1891345632121541, + "grad_norm": 4.999155044555664, + "learning_rate": 8.19213375359737e-06, + "loss": 0.3992, + "step": 13943 + }, + { + "epoch": 0.189148128052089, + "grad_norm": 5.1005706787109375, + "learning_rate": 8.191996710977114e-06, + "loss": 0.2961, + "step": 13944 + }, + { + "epoch": 0.18916169289202386, + "grad_norm": 6.797049045562744, + "learning_rate": 8.19185966835686e-06, + "loss": 0.4784, + "step": 13945 + }, + { + "epoch": 0.18917525773195876, + "grad_norm": 6.840802192687988, + "learning_rate": 8.191722625736604e-06, + "loss": 0.5948, + "step": 13946 + }, + { + "epoch": 0.18918882257189365, + "grad_norm": 6.9166669845581055, + "learning_rate": 8.19158558311635e-06, + "loss": 0.3398, + "step": 13947 + }, + { + "epoch": 0.18920238741182854, + "grad_norm": 5.94614315032959, + "learning_rate": 8.191448540496095e-06, + "loss": 0.3566, + "step": 13948 + }, + { + "epoch": 0.18921595225176344, + "grad_norm": 4.951136112213135, + "learning_rate": 8.19131149787584e-06, + "loss": 0.242, + "step": 13949 + }, + { + "epoch": 0.18922951709169833, + "grad_norm": 7.298843860626221, + "learning_rate": 8.191174455255585e-06, + "loss": 0.5016, + "step": 13950 + }, + { + "epoch": 0.1892430819316332, + "grad_norm": 7.018736839294434, + "learning_rate": 8.19103741263533e-06, + "loss": 0.4716, + "step": 13951 + }, + { + "epoch": 0.1892566467715681, + "grad_norm": 6.432234287261963, + "learning_rate": 8.190900370015076e-06, + "loss": 0.4273, + "step": 13952 + }, + { + "epoch": 0.18927021161150298, + "grad_norm": 4.36922025680542, + "learning_rate": 8.19076332739482e-06, + "loss": 0.3013, + "step": 13953 + }, + { + "epoch": 0.18928377645143787, + "grad_norm": 6.101027965545654, + "learning_rate": 8.190626284774566e-06, + "loss": 0.3505, + "step": 13954 + }, + { + "epoch": 0.18929734129137277, + "grad_norm": 5.491897106170654, + "learning_rate": 8.190489242154311e-06, + "loss": 0.4243, + "step": 13955 + }, + { + "epoch": 0.18931090613130766, + "grad_norm": 7.2916975021362305, + "learning_rate": 8.190352199534056e-06, + "loss": 0.4376, + "step": 13956 + }, + { + "epoch": 0.18932447097124253, + "grad_norm": 5.010251045227051, + "learning_rate": 8.190215156913801e-06, + "loss": 0.4976, + "step": 13957 + }, + { + "epoch": 0.18933803581117742, + "grad_norm": 5.3727216720581055, + "learning_rate": 8.190078114293547e-06, + "loss": 0.3133, + "step": 13958 + }, + { + "epoch": 0.1893516006511123, + "grad_norm": 5.559458255767822, + "learning_rate": 8.18994107167329e-06, + "loss": 0.3763, + "step": 13959 + }, + { + "epoch": 0.1893651654910472, + "grad_norm": 4.783300399780273, + "learning_rate": 8.189804029053035e-06, + "loss": 0.4057, + "step": 13960 + }, + { + "epoch": 0.1893787303309821, + "grad_norm": 7.106045722961426, + "learning_rate": 8.189666986432782e-06, + "loss": 0.5443, + "step": 13961 + }, + { + "epoch": 0.189392295170917, + "grad_norm": 5.57811164855957, + "learning_rate": 8.189529943812526e-06, + "loss": 0.4655, + "step": 13962 + }, + { + "epoch": 0.18940586001085188, + "grad_norm": 6.918463230133057, + "learning_rate": 8.18939290119227e-06, + "loss": 0.6326, + "step": 13963 + }, + { + "epoch": 0.18941942485078675, + "grad_norm": 6.029601097106934, + "learning_rate": 8.189255858572016e-06, + "loss": 0.3991, + "step": 13964 + }, + { + "epoch": 0.18943298969072164, + "grad_norm": 5.421029567718506, + "learning_rate": 8.189118815951763e-06, + "loss": 0.3761, + "step": 13965 + }, + { + "epoch": 0.18944655453065654, + "grad_norm": 4.817491054534912, + "learning_rate": 8.188981773331506e-06, + "loss": 0.3032, + "step": 13966 + }, + { + "epoch": 0.18946011937059143, + "grad_norm": 5.328281402587891, + "learning_rate": 8.188844730711252e-06, + "loss": 0.4757, + "step": 13967 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 4.919144630432129, + "learning_rate": 8.188707688090997e-06, + "loss": 0.3406, + "step": 13968 + }, + { + "epoch": 0.18948724905046122, + "grad_norm": 5.166126728057861, + "learning_rate": 8.188570645470742e-06, + "loss": 0.2967, + "step": 13969 + }, + { + "epoch": 0.18950081389039608, + "grad_norm": 6.132217884063721, + "learning_rate": 8.188433602850487e-06, + "loss": 0.343, + "step": 13970 + }, + { + "epoch": 0.18951437873033097, + "grad_norm": 7.873373985290527, + "learning_rate": 8.188296560230232e-06, + "loss": 0.5011, + "step": 13971 + }, + { + "epoch": 0.18952794357026587, + "grad_norm": 12.41915225982666, + "learning_rate": 8.188159517609977e-06, + "loss": 0.3938, + "step": 13972 + }, + { + "epoch": 0.18954150841020076, + "grad_norm": 5.883265018463135, + "learning_rate": 8.188022474989723e-06, + "loss": 0.4781, + "step": 13973 + }, + { + "epoch": 0.18955507325013565, + "grad_norm": 5.5843505859375, + "learning_rate": 8.187885432369468e-06, + "loss": 0.4385, + "step": 13974 + }, + { + "epoch": 0.18956863809007055, + "grad_norm": 4.8665313720703125, + "learning_rate": 8.187748389749213e-06, + "loss": 0.2998, + "step": 13975 + }, + { + "epoch": 0.18958220293000544, + "grad_norm": 6.332406520843506, + "learning_rate": 8.187611347128958e-06, + "loss": 0.5775, + "step": 13976 + }, + { + "epoch": 0.1895957677699403, + "grad_norm": 5.905489921569824, + "learning_rate": 8.187474304508702e-06, + "loss": 0.3074, + "step": 13977 + }, + { + "epoch": 0.1896093326098752, + "grad_norm": 4.654481887817383, + "learning_rate": 8.187337261888449e-06, + "loss": 0.3449, + "step": 13978 + }, + { + "epoch": 0.1896228974498101, + "grad_norm": 6.653938293457031, + "learning_rate": 8.187200219268194e-06, + "loss": 0.4205, + "step": 13979 + }, + { + "epoch": 0.18963646228974498, + "grad_norm": 7.181497573852539, + "learning_rate": 8.187063176647939e-06, + "loss": 0.6426, + "step": 13980 + }, + { + "epoch": 0.18965002712967988, + "grad_norm": 5.166650295257568, + "learning_rate": 8.186926134027682e-06, + "loss": 0.5015, + "step": 13981 + }, + { + "epoch": 0.18966359196961477, + "grad_norm": 4.417681694030762, + "learning_rate": 8.18678909140743e-06, + "loss": 0.2495, + "step": 13982 + }, + { + "epoch": 0.18967715680954964, + "grad_norm": 5.889858722686768, + "learning_rate": 8.186652048787174e-06, + "loss": 0.3593, + "step": 13983 + }, + { + "epoch": 0.18969072164948453, + "grad_norm": 8.663117408752441, + "learning_rate": 8.186515006166918e-06, + "loss": 0.5982, + "step": 13984 + }, + { + "epoch": 0.18970428648941942, + "grad_norm": 6.815353870391846, + "learning_rate": 8.186377963546663e-06, + "loss": 0.3887, + "step": 13985 + }, + { + "epoch": 0.18971785132935431, + "grad_norm": 6.791165351867676, + "learning_rate": 8.18624092092641e-06, + "loss": 0.5833, + "step": 13986 + }, + { + "epoch": 0.1897314161692892, + "grad_norm": 7.313876152038574, + "learning_rate": 8.186103878306153e-06, + "loss": 0.3936, + "step": 13987 + }, + { + "epoch": 0.1897449810092241, + "grad_norm": 5.842690944671631, + "learning_rate": 8.185966835685899e-06, + "loss": 0.4124, + "step": 13988 + }, + { + "epoch": 0.18975854584915897, + "grad_norm": 5.061460494995117, + "learning_rate": 8.185829793065644e-06, + "loss": 0.3734, + "step": 13989 + }, + { + "epoch": 0.18977211068909386, + "grad_norm": 5.8787078857421875, + "learning_rate": 8.185692750445389e-06, + "loss": 0.5238, + "step": 13990 + }, + { + "epoch": 0.18978567552902875, + "grad_norm": 6.1547369956970215, + "learning_rate": 8.185555707825134e-06, + "loss": 0.3983, + "step": 13991 + }, + { + "epoch": 0.18979924036896365, + "grad_norm": 3.924499273300171, + "learning_rate": 8.18541866520488e-06, + "loss": 0.2792, + "step": 13992 + }, + { + "epoch": 0.18981280520889854, + "grad_norm": 6.7327985763549805, + "learning_rate": 8.185281622584625e-06, + "loss": 0.4479, + "step": 13993 + }, + { + "epoch": 0.18982637004883343, + "grad_norm": 8.28708267211914, + "learning_rate": 8.18514457996437e-06, + "loss": 0.5716, + "step": 13994 + }, + { + "epoch": 0.18983993488876832, + "grad_norm": 6.726132392883301, + "learning_rate": 8.185007537344115e-06, + "loss": 0.3692, + "step": 13995 + }, + { + "epoch": 0.1898534997287032, + "grad_norm": 6.527833461761475, + "learning_rate": 8.18487049472386e-06, + "loss": 0.445, + "step": 13996 + }, + { + "epoch": 0.18986706456863808, + "grad_norm": 5.44334077835083, + "learning_rate": 8.184733452103605e-06, + "loss": 0.2702, + "step": 13997 + }, + { + "epoch": 0.18988062940857298, + "grad_norm": 4.418597221374512, + "learning_rate": 8.18459640948335e-06, + "loss": 0.4787, + "step": 13998 + }, + { + "epoch": 0.18989419424850787, + "grad_norm": 5.276154518127441, + "learning_rate": 8.184459366863096e-06, + "loss": 0.3661, + "step": 13999 + }, + { + "epoch": 0.18990775908844276, + "grad_norm": 4.680639743804932, + "learning_rate": 8.18432232424284e-06, + "loss": 0.4579, + "step": 14000 + }, + { + "epoch": 0.18992132392837766, + "grad_norm": 7.376703262329102, + "learning_rate": 8.184185281622586e-06, + "loss": 0.5129, + "step": 14001 + }, + { + "epoch": 0.18993488876831252, + "grad_norm": 4.5496344566345215, + "learning_rate": 8.18404823900233e-06, + "loss": 0.3406, + "step": 14002 + }, + { + "epoch": 0.18994845360824741, + "grad_norm": 5.509461402893066, + "learning_rate": 8.183911196382075e-06, + "loss": 0.3552, + "step": 14003 + }, + { + "epoch": 0.1899620184481823, + "grad_norm": 6.262775897979736, + "learning_rate": 8.183774153761821e-06, + "loss": 0.4458, + "step": 14004 + }, + { + "epoch": 0.1899755832881172, + "grad_norm": 6.930265426635742, + "learning_rate": 8.183637111141565e-06, + "loss": 0.3772, + "step": 14005 + }, + { + "epoch": 0.1899891481280521, + "grad_norm": 5.194793701171875, + "learning_rate": 8.18350006852131e-06, + "loss": 0.3489, + "step": 14006 + }, + { + "epoch": 0.190002712967987, + "grad_norm": 4.869900703430176, + "learning_rate": 8.183363025901055e-06, + "loss": 0.3029, + "step": 14007 + }, + { + "epoch": 0.19001627780792188, + "grad_norm": 6.568742275238037, + "learning_rate": 8.183225983280802e-06, + "loss": 0.3931, + "step": 14008 + }, + { + "epoch": 0.19002984264785674, + "grad_norm": 7.953954696655273, + "learning_rate": 8.183088940660546e-06, + "loss": 0.4363, + "step": 14009 + }, + { + "epoch": 0.19004340748779164, + "grad_norm": 6.59761381149292, + "learning_rate": 8.182951898040291e-06, + "loss": 0.3745, + "step": 14010 + }, + { + "epoch": 0.19005697232772653, + "grad_norm": 6.354248523712158, + "learning_rate": 8.182814855420036e-06, + "loss": 0.3299, + "step": 14011 + }, + { + "epoch": 0.19007053716766142, + "grad_norm": 4.602102756500244, + "learning_rate": 8.182677812799781e-06, + "loss": 0.3436, + "step": 14012 + }, + { + "epoch": 0.19008410200759632, + "grad_norm": 6.152233123779297, + "learning_rate": 8.182540770179526e-06, + "loss": 0.4011, + "step": 14013 + }, + { + "epoch": 0.1900976668475312, + "grad_norm": 5.580960750579834, + "learning_rate": 8.182403727559272e-06, + "loss": 0.4506, + "step": 14014 + }, + { + "epoch": 0.19011123168746608, + "grad_norm": 5.412355422973633, + "learning_rate": 8.182266684939017e-06, + "loss": 0.3981, + "step": 14015 + }, + { + "epoch": 0.19012479652740097, + "grad_norm": 5.954122066497803, + "learning_rate": 8.182129642318762e-06, + "loss": 0.4004, + "step": 14016 + }, + { + "epoch": 0.19013836136733586, + "grad_norm": 6.377519130706787, + "learning_rate": 8.181992599698507e-06, + "loss": 0.4101, + "step": 14017 + }, + { + "epoch": 0.19015192620727076, + "grad_norm": 7.051529407501221, + "learning_rate": 8.181855557078252e-06, + "loss": 0.4686, + "step": 14018 + }, + { + "epoch": 0.19016549104720565, + "grad_norm": 7.364290714263916, + "learning_rate": 8.181718514457997e-06, + "loss": 0.3942, + "step": 14019 + }, + { + "epoch": 0.19017905588714054, + "grad_norm": 8.133929252624512, + "learning_rate": 8.181581471837741e-06, + "loss": 0.4592, + "step": 14020 + }, + { + "epoch": 0.19019262072707543, + "grad_norm": 6.096273422241211, + "learning_rate": 8.181444429217488e-06, + "loss": 0.3768, + "step": 14021 + }, + { + "epoch": 0.1902061855670103, + "grad_norm": 6.818755149841309, + "learning_rate": 8.181307386597233e-06, + "loss": 0.351, + "step": 14022 + }, + { + "epoch": 0.1902197504069452, + "grad_norm": 5.975238800048828, + "learning_rate": 8.181170343976978e-06, + "loss": 0.3166, + "step": 14023 + }, + { + "epoch": 0.19023331524688009, + "grad_norm": 5.003284931182861, + "learning_rate": 8.181033301356722e-06, + "loss": 0.3492, + "step": 14024 + }, + { + "epoch": 0.19024688008681498, + "grad_norm": 18.477096557617188, + "learning_rate": 8.180896258736469e-06, + "loss": 0.4394, + "step": 14025 + }, + { + "epoch": 0.19026044492674987, + "grad_norm": 7.803028106689453, + "learning_rate": 8.180759216116214e-06, + "loss": 0.4176, + "step": 14026 + }, + { + "epoch": 0.19027400976668477, + "grad_norm": 5.372293949127197, + "learning_rate": 8.180622173495957e-06, + "loss": 0.3035, + "step": 14027 + }, + { + "epoch": 0.19028757460661963, + "grad_norm": 5.858569145202637, + "learning_rate": 8.180485130875702e-06, + "loss": 0.3695, + "step": 14028 + }, + { + "epoch": 0.19030113944655452, + "grad_norm": 6.045593738555908, + "learning_rate": 8.180348088255448e-06, + "loss": 0.3496, + "step": 14029 + }, + { + "epoch": 0.19031470428648942, + "grad_norm": 7.041123867034912, + "learning_rate": 8.180211045635193e-06, + "loss": 0.3858, + "step": 14030 + }, + { + "epoch": 0.1903282691264243, + "grad_norm": 6.959781646728516, + "learning_rate": 8.180074003014938e-06, + "loss": 0.4596, + "step": 14031 + }, + { + "epoch": 0.1903418339663592, + "grad_norm": 6.836175441741943, + "learning_rate": 8.179936960394683e-06, + "loss": 0.2935, + "step": 14032 + }, + { + "epoch": 0.1903553988062941, + "grad_norm": 6.216264724731445, + "learning_rate": 8.179799917774428e-06, + "loss": 0.3541, + "step": 14033 + }, + { + "epoch": 0.19036896364622896, + "grad_norm": 6.463839530944824, + "learning_rate": 8.179662875154173e-06, + "loss": 0.4131, + "step": 14034 + }, + { + "epoch": 0.19038252848616385, + "grad_norm": 9.051246643066406, + "learning_rate": 8.179525832533919e-06, + "loss": 0.46, + "step": 14035 + }, + { + "epoch": 0.19039609332609875, + "grad_norm": 5.389129161834717, + "learning_rate": 8.179388789913664e-06, + "loss": 0.3108, + "step": 14036 + }, + { + "epoch": 0.19040965816603364, + "grad_norm": 6.856922626495361, + "learning_rate": 8.179251747293409e-06, + "loss": 0.3707, + "step": 14037 + }, + { + "epoch": 0.19042322300596853, + "grad_norm": 7.276634693145752, + "learning_rate": 8.179114704673154e-06, + "loss": 0.4223, + "step": 14038 + }, + { + "epoch": 0.19043678784590343, + "grad_norm": 6.805322647094727, + "learning_rate": 8.1789776620529e-06, + "loss": 0.2983, + "step": 14039 + }, + { + "epoch": 0.19045035268583832, + "grad_norm": 6.6873273849487305, + "learning_rate": 8.178840619432645e-06, + "loss": 0.3141, + "step": 14040 + }, + { + "epoch": 0.19046391752577319, + "grad_norm": 6.419933795928955, + "learning_rate": 8.17870357681239e-06, + "loss": 0.308, + "step": 14041 + }, + { + "epoch": 0.19047748236570808, + "grad_norm": 5.708689212799072, + "learning_rate": 8.178566534192135e-06, + "loss": 0.4224, + "step": 14042 + }, + { + "epoch": 0.19049104720564297, + "grad_norm": 5.818568706512451, + "learning_rate": 8.17842949157188e-06, + "loss": 0.2721, + "step": 14043 + }, + { + "epoch": 0.19050461204557786, + "grad_norm": 6.276803970336914, + "learning_rate": 8.178292448951625e-06, + "loss": 0.2781, + "step": 14044 + }, + { + "epoch": 0.19051817688551276, + "grad_norm": 5.161899566650391, + "learning_rate": 8.178155406331369e-06, + "loss": 0.283, + "step": 14045 + }, + { + "epoch": 0.19053174172544765, + "grad_norm": 7.139949798583984, + "learning_rate": 8.178018363711114e-06, + "loss": 0.4887, + "step": 14046 + }, + { + "epoch": 0.19054530656538252, + "grad_norm": 5.727482795715332, + "learning_rate": 8.17788132109086e-06, + "loss": 0.2766, + "step": 14047 + }, + { + "epoch": 0.1905588714053174, + "grad_norm": 6.462667942047119, + "learning_rate": 8.177744278470606e-06, + "loss": 0.366, + "step": 14048 + }, + { + "epoch": 0.1905724362452523, + "grad_norm": 6.142604351043701, + "learning_rate": 8.17760723585035e-06, + "loss": 0.2941, + "step": 14049 + }, + { + "epoch": 0.1905860010851872, + "grad_norm": 7.190966606140137, + "learning_rate": 8.177470193230095e-06, + "loss": 0.4291, + "step": 14050 + }, + { + "epoch": 0.1905995659251221, + "grad_norm": 8.406675338745117, + "learning_rate": 8.177333150609842e-06, + "loss": 0.3898, + "step": 14051 + }, + { + "epoch": 0.19061313076505698, + "grad_norm": 5.472235202789307, + "learning_rate": 8.177196107989585e-06, + "loss": 0.2332, + "step": 14052 + }, + { + "epoch": 0.19062669560499187, + "grad_norm": 5.484139442443848, + "learning_rate": 8.17705906536933e-06, + "loss": 0.4387, + "step": 14053 + }, + { + "epoch": 0.19064026044492674, + "grad_norm": 7.753280162811279, + "learning_rate": 8.176922022749075e-06, + "loss": 0.3054, + "step": 14054 + }, + { + "epoch": 0.19065382528486163, + "grad_norm": 6.604395866394043, + "learning_rate": 8.17678498012882e-06, + "loss": 0.3626, + "step": 14055 + }, + { + "epoch": 0.19066739012479653, + "grad_norm": 7.731407165527344, + "learning_rate": 8.176647937508566e-06, + "loss": 0.4095, + "step": 14056 + }, + { + "epoch": 0.19068095496473142, + "grad_norm": 5.620826244354248, + "learning_rate": 8.176510894888311e-06, + "loss": 0.2559, + "step": 14057 + }, + { + "epoch": 0.1906945198046663, + "grad_norm": 5.811288833618164, + "learning_rate": 8.176373852268056e-06, + "loss": 0.2863, + "step": 14058 + }, + { + "epoch": 0.1907080846446012, + "grad_norm": 9.413264274597168, + "learning_rate": 8.176236809647801e-06, + "loss": 0.4657, + "step": 14059 + }, + { + "epoch": 0.19072164948453607, + "grad_norm": 7.080929756164551, + "learning_rate": 8.176099767027546e-06, + "loss": 0.4166, + "step": 14060 + }, + { + "epoch": 0.19073521432447096, + "grad_norm": 7.722756385803223, + "learning_rate": 8.175962724407292e-06, + "loss": 0.4191, + "step": 14061 + }, + { + "epoch": 0.19074877916440586, + "grad_norm": 8.182180404663086, + "learning_rate": 8.175825681787037e-06, + "loss": 0.3491, + "step": 14062 + }, + { + "epoch": 0.19076234400434075, + "grad_norm": 5.8397698402404785, + "learning_rate": 8.175688639166782e-06, + "loss": 0.3142, + "step": 14063 + }, + { + "epoch": 0.19077590884427564, + "grad_norm": 5.907791614532471, + "learning_rate": 8.175551596546527e-06, + "loss": 0.2412, + "step": 14064 + }, + { + "epoch": 0.19078947368421054, + "grad_norm": 6.310617446899414, + "learning_rate": 8.175414553926272e-06, + "loss": 0.3185, + "step": 14065 + }, + { + "epoch": 0.1908030385241454, + "grad_norm": 6.668659687042236, + "learning_rate": 8.175277511306017e-06, + "loss": 0.4557, + "step": 14066 + }, + { + "epoch": 0.1908166033640803, + "grad_norm": 9.809380531311035, + "learning_rate": 8.175140468685761e-06, + "loss": 0.6175, + "step": 14067 + }, + { + "epoch": 0.1908301682040152, + "grad_norm": 5.652737617492676, + "learning_rate": 8.175003426065508e-06, + "loss": 0.3419, + "step": 14068 + }, + { + "epoch": 0.19084373304395008, + "grad_norm": 7.257293224334717, + "learning_rate": 8.174866383445253e-06, + "loss": 0.3565, + "step": 14069 + }, + { + "epoch": 0.19085729788388497, + "grad_norm": 6.846682071685791, + "learning_rate": 8.174729340824997e-06, + "loss": 0.4242, + "step": 14070 + }, + { + "epoch": 0.19087086272381987, + "grad_norm": 6.823680877685547, + "learning_rate": 8.174592298204742e-06, + "loss": 0.3623, + "step": 14071 + }, + { + "epoch": 0.19088442756375476, + "grad_norm": 5.574707508087158, + "learning_rate": 8.174455255584487e-06, + "loss": 0.333, + "step": 14072 + }, + { + "epoch": 0.19089799240368963, + "grad_norm": 6.408257007598877, + "learning_rate": 8.174318212964232e-06, + "loss": 0.3583, + "step": 14073 + }, + { + "epoch": 0.19091155724362452, + "grad_norm": 4.634552001953125, + "learning_rate": 8.174181170343977e-06, + "loss": 0.279, + "step": 14074 + }, + { + "epoch": 0.1909251220835594, + "grad_norm": 7.266811370849609, + "learning_rate": 8.174044127723722e-06, + "loss": 0.3845, + "step": 14075 + }, + { + "epoch": 0.1909386869234943, + "grad_norm": 6.790839672088623, + "learning_rate": 8.173907085103468e-06, + "loss": 0.4399, + "step": 14076 + }, + { + "epoch": 0.1909522517634292, + "grad_norm": 6.49018669128418, + "learning_rate": 8.173770042483213e-06, + "loss": 0.3522, + "step": 14077 + }, + { + "epoch": 0.1909658166033641, + "grad_norm": 5.670827865600586, + "learning_rate": 8.173632999862958e-06, + "loss": 0.2206, + "step": 14078 + }, + { + "epoch": 0.19097938144329896, + "grad_norm": 5.7444000244140625, + "learning_rate": 8.173495957242703e-06, + "loss": 0.3639, + "step": 14079 + }, + { + "epoch": 0.19099294628323385, + "grad_norm": 5.999749660491943, + "learning_rate": 8.173358914622448e-06, + "loss": 0.2955, + "step": 14080 + }, + { + "epoch": 0.19100651112316874, + "grad_norm": 6.35155725479126, + "learning_rate": 8.173221872002193e-06, + "loss": 0.271, + "step": 14081 + }, + { + "epoch": 0.19102007596310364, + "grad_norm": 5.428788661956787, + "learning_rate": 8.173084829381939e-06, + "loss": 0.2365, + "step": 14082 + }, + { + "epoch": 0.19103364080303853, + "grad_norm": 5.600325584411621, + "learning_rate": 8.172947786761684e-06, + "loss": 0.3024, + "step": 14083 + }, + { + "epoch": 0.19104720564297342, + "grad_norm": 6.0495285987854, + "learning_rate": 8.172810744141429e-06, + "loss": 0.3312, + "step": 14084 + }, + { + "epoch": 0.19106077048290832, + "grad_norm": 11.19656753540039, + "learning_rate": 8.172673701521173e-06, + "loss": 0.4991, + "step": 14085 + }, + { + "epoch": 0.19107433532284318, + "grad_norm": 5.60538387298584, + "learning_rate": 8.17253665890092e-06, + "loss": 0.3441, + "step": 14086 + }, + { + "epoch": 0.19108790016277807, + "grad_norm": 6.307362079620361, + "learning_rate": 8.172399616280665e-06, + "loss": 0.2264, + "step": 14087 + }, + { + "epoch": 0.19110146500271297, + "grad_norm": 5.376869201660156, + "learning_rate": 8.172262573660408e-06, + "loss": 0.255, + "step": 14088 + }, + { + "epoch": 0.19111502984264786, + "grad_norm": 4.6177520751953125, + "learning_rate": 8.172125531040153e-06, + "loss": 0.3086, + "step": 14089 + }, + { + "epoch": 0.19112859468258275, + "grad_norm": 7.1888532638549805, + "learning_rate": 8.1719884884199e-06, + "loss": 0.6918, + "step": 14090 + }, + { + "epoch": 0.19114215952251765, + "grad_norm": 5.241617202758789, + "learning_rate": 8.171851445799645e-06, + "loss": 0.2464, + "step": 14091 + }, + { + "epoch": 0.1911557243624525, + "grad_norm": 6.9837117195129395, + "learning_rate": 8.171714403179389e-06, + "loss": 0.3584, + "step": 14092 + }, + { + "epoch": 0.1911692892023874, + "grad_norm": 3.893540620803833, + "learning_rate": 8.171577360559134e-06, + "loss": 0.1994, + "step": 14093 + }, + { + "epoch": 0.1911828540423223, + "grad_norm": 7.442408084869385, + "learning_rate": 8.17144031793888e-06, + "loss": 0.4064, + "step": 14094 + }, + { + "epoch": 0.1911964188822572, + "grad_norm": 5.8077826499938965, + "learning_rate": 8.171303275318624e-06, + "loss": 0.3294, + "step": 14095 + }, + { + "epoch": 0.19120998372219208, + "grad_norm": 8.36953353881836, + "learning_rate": 8.17116623269837e-06, + "loss": 0.4181, + "step": 14096 + }, + { + "epoch": 0.19122354856212698, + "grad_norm": 7.153566360473633, + "learning_rate": 8.171029190078115e-06, + "loss": 0.3175, + "step": 14097 + }, + { + "epoch": 0.19123711340206184, + "grad_norm": 6.888758659362793, + "learning_rate": 8.17089214745786e-06, + "loss": 0.3918, + "step": 14098 + }, + { + "epoch": 0.19125067824199674, + "grad_norm": 5.7100138664245605, + "learning_rate": 8.170755104837605e-06, + "loss": 0.2838, + "step": 14099 + }, + { + "epoch": 0.19126424308193163, + "grad_norm": 5.73823881149292, + "learning_rate": 8.17061806221735e-06, + "loss": 0.2765, + "step": 14100 + }, + { + "epoch": 0.19127780792186652, + "grad_norm": 6.98261022567749, + "learning_rate": 8.170481019597095e-06, + "loss": 0.4597, + "step": 14101 + }, + { + "epoch": 0.19129137276180141, + "grad_norm": 6.478520393371582, + "learning_rate": 8.17034397697684e-06, + "loss": 0.4116, + "step": 14102 + }, + { + "epoch": 0.1913049376017363, + "grad_norm": 6.015071868896484, + "learning_rate": 8.170206934356586e-06, + "loss": 0.3916, + "step": 14103 + }, + { + "epoch": 0.1913185024416712, + "grad_norm": 6.933437347412109, + "learning_rate": 8.170069891736331e-06, + "loss": 0.3682, + "step": 14104 + }, + { + "epoch": 0.19133206728160607, + "grad_norm": 4.386769771575928, + "learning_rate": 8.169932849116076e-06, + "loss": 0.2528, + "step": 14105 + }, + { + "epoch": 0.19134563212154096, + "grad_norm": 6.339069843292236, + "learning_rate": 8.169795806495821e-06, + "loss": 0.3932, + "step": 14106 + }, + { + "epoch": 0.19135919696147585, + "grad_norm": 6.964006423950195, + "learning_rate": 8.169658763875566e-06, + "loss": 0.3768, + "step": 14107 + }, + { + "epoch": 0.19137276180141075, + "grad_norm": 6.427331924438477, + "learning_rate": 8.169521721255312e-06, + "loss": 0.3284, + "step": 14108 + }, + { + "epoch": 0.19138632664134564, + "grad_norm": 5.167314529418945, + "learning_rate": 8.169384678635057e-06, + "loss": 0.3403, + "step": 14109 + }, + { + "epoch": 0.19139989148128053, + "grad_norm": 5.421138286590576, + "learning_rate": 8.1692476360148e-06, + "loss": 0.2675, + "step": 14110 + }, + { + "epoch": 0.1914134563212154, + "grad_norm": 4.763230800628662, + "learning_rate": 8.169110593394545e-06, + "loss": 0.3477, + "step": 14111 + }, + { + "epoch": 0.1914270211611503, + "grad_norm": 6.80792236328125, + "learning_rate": 8.168973550774292e-06, + "loss": 0.3136, + "step": 14112 + }, + { + "epoch": 0.19144058600108518, + "grad_norm": 4.692610263824463, + "learning_rate": 8.168836508154036e-06, + "loss": 0.1763, + "step": 14113 + }, + { + "epoch": 0.19145415084102008, + "grad_norm": 5.284366607666016, + "learning_rate": 8.168699465533781e-06, + "loss": 0.2033, + "step": 14114 + }, + { + "epoch": 0.19146771568095497, + "grad_norm": 6.449429988861084, + "learning_rate": 8.168562422913526e-06, + "loss": 0.3185, + "step": 14115 + }, + { + "epoch": 0.19148128052088986, + "grad_norm": 6.558922290802002, + "learning_rate": 8.168425380293273e-06, + "loss": 0.3008, + "step": 14116 + }, + { + "epoch": 0.19149484536082476, + "grad_norm": 5.494565486907959, + "learning_rate": 8.168288337673017e-06, + "loss": 0.3326, + "step": 14117 + }, + { + "epoch": 0.19150841020075962, + "grad_norm": 5.9319915771484375, + "learning_rate": 8.168151295052762e-06, + "loss": 0.2858, + "step": 14118 + }, + { + "epoch": 0.19152197504069451, + "grad_norm": 6.448119163513184, + "learning_rate": 8.168014252432507e-06, + "loss": 0.282, + "step": 14119 + }, + { + "epoch": 0.1915355398806294, + "grad_norm": 7.073143482208252, + "learning_rate": 8.167877209812252e-06, + "loss": 0.2793, + "step": 14120 + }, + { + "epoch": 0.1915491047205643, + "grad_norm": 5.144626617431641, + "learning_rate": 8.167740167191997e-06, + "loss": 0.2382, + "step": 14121 + }, + { + "epoch": 0.1915626695604992, + "grad_norm": 4.23270320892334, + "learning_rate": 8.167603124571742e-06, + "loss": 0.2136, + "step": 14122 + }, + { + "epoch": 0.1915762344004341, + "grad_norm": 4.175440788269043, + "learning_rate": 8.167466081951488e-06, + "loss": 0.2613, + "step": 14123 + }, + { + "epoch": 0.19158979924036895, + "grad_norm": 4.646079063415527, + "learning_rate": 8.167329039331233e-06, + "loss": 0.3512, + "step": 14124 + }, + { + "epoch": 0.19160336408030385, + "grad_norm": 6.606706142425537, + "learning_rate": 8.167191996710978e-06, + "loss": 0.2932, + "step": 14125 + }, + { + "epoch": 0.19161692892023874, + "grad_norm": 6.269661903381348, + "learning_rate": 8.167054954090723e-06, + "loss": 0.315, + "step": 14126 + }, + { + "epoch": 0.19163049376017363, + "grad_norm": 5.0316243171691895, + "learning_rate": 8.166917911470468e-06, + "loss": 0.3198, + "step": 14127 + }, + { + "epoch": 0.19164405860010852, + "grad_norm": 8.31116008758545, + "learning_rate": 8.166780868850212e-06, + "loss": 0.4468, + "step": 14128 + }, + { + "epoch": 0.19165762344004342, + "grad_norm": 6.165606498718262, + "learning_rate": 8.166643826229959e-06, + "loss": 0.4108, + "step": 14129 + }, + { + "epoch": 0.19167118827997828, + "grad_norm": 7.080409049987793, + "learning_rate": 8.166506783609704e-06, + "loss": 0.2563, + "step": 14130 + }, + { + "epoch": 0.19168475311991318, + "grad_norm": 7.7143354415893555, + "learning_rate": 8.166369740989449e-06, + "loss": 0.4766, + "step": 14131 + }, + { + "epoch": 0.19169831795984807, + "grad_norm": 6.762633323669434, + "learning_rate": 8.166232698369193e-06, + "loss": 0.5384, + "step": 14132 + }, + { + "epoch": 0.19171188279978296, + "grad_norm": 5.8891987800598145, + "learning_rate": 8.16609565574894e-06, + "loss": 0.3289, + "step": 14133 + }, + { + "epoch": 0.19172544763971786, + "grad_norm": 7.867786407470703, + "learning_rate": 8.165958613128685e-06, + "loss": 0.5287, + "step": 14134 + }, + { + "epoch": 0.19173901247965275, + "grad_norm": 5.545222282409668, + "learning_rate": 8.165821570508428e-06, + "loss": 0.3661, + "step": 14135 + }, + { + "epoch": 0.19175257731958764, + "grad_norm": 6.685629367828369, + "learning_rate": 8.165684527888173e-06, + "loss": 0.359, + "step": 14136 + }, + { + "epoch": 0.1917661421595225, + "grad_norm": 5.608922481536865, + "learning_rate": 8.16554748526792e-06, + "loss": 0.3654, + "step": 14137 + }, + { + "epoch": 0.1917797069994574, + "grad_norm": 5.581566333770752, + "learning_rate": 8.165410442647664e-06, + "loss": 0.3918, + "step": 14138 + }, + { + "epoch": 0.1917932718393923, + "grad_norm": 6.039770126342773, + "learning_rate": 8.165273400027409e-06, + "loss": 0.4757, + "step": 14139 + }, + { + "epoch": 0.1918068366793272, + "grad_norm": 4.997923374176025, + "learning_rate": 8.165136357407154e-06, + "loss": 0.3558, + "step": 14140 + }, + { + "epoch": 0.19182040151926208, + "grad_norm": 5.710089683532715, + "learning_rate": 8.164999314786899e-06, + "loss": 0.3682, + "step": 14141 + }, + { + "epoch": 0.19183396635919697, + "grad_norm": 6.494273662567139, + "learning_rate": 8.164862272166644e-06, + "loss": 0.4866, + "step": 14142 + }, + { + "epoch": 0.19184753119913184, + "grad_norm": 4.909788131713867, + "learning_rate": 8.16472522954639e-06, + "loss": 0.3982, + "step": 14143 + }, + { + "epoch": 0.19186109603906673, + "grad_norm": 5.753860950469971, + "learning_rate": 8.164588186926135e-06, + "loss": 0.2919, + "step": 14144 + }, + { + "epoch": 0.19187466087900162, + "grad_norm": 6.709502696990967, + "learning_rate": 8.16445114430588e-06, + "loss": 0.4124, + "step": 14145 + }, + { + "epoch": 0.19188822571893652, + "grad_norm": 6.436703681945801, + "learning_rate": 8.164314101685625e-06, + "loss": 0.4485, + "step": 14146 + }, + { + "epoch": 0.1919017905588714, + "grad_norm": 4.8133697509765625, + "learning_rate": 8.16417705906537e-06, + "loss": 0.1968, + "step": 14147 + }, + { + "epoch": 0.1919153553988063, + "grad_norm": 4.617457866668701, + "learning_rate": 8.164040016445115e-06, + "loss": 0.3032, + "step": 14148 + }, + { + "epoch": 0.1919289202387412, + "grad_norm": 4.65761661529541, + "learning_rate": 8.16390297382486e-06, + "loss": 0.3272, + "step": 14149 + }, + { + "epoch": 0.19194248507867606, + "grad_norm": 5.752230167388916, + "learning_rate": 8.163765931204606e-06, + "loss": 0.3676, + "step": 14150 + }, + { + "epoch": 0.19195604991861095, + "grad_norm": 6.221765518188477, + "learning_rate": 8.163628888584351e-06, + "loss": 0.3571, + "step": 14151 + }, + { + "epoch": 0.19196961475854585, + "grad_norm": 3.673671245574951, + "learning_rate": 8.163491845964096e-06, + "loss": 0.2896, + "step": 14152 + }, + { + "epoch": 0.19198317959848074, + "grad_norm": 5.081875801086426, + "learning_rate": 8.16335480334384e-06, + "loss": 0.3356, + "step": 14153 + }, + { + "epoch": 0.19199674443841563, + "grad_norm": 5.657028675079346, + "learning_rate": 8.163217760723585e-06, + "loss": 0.252, + "step": 14154 + }, + { + "epoch": 0.19201030927835053, + "grad_norm": 6.044492244720459, + "learning_rate": 8.163080718103332e-06, + "loss": 0.4217, + "step": 14155 + }, + { + "epoch": 0.1920238741182854, + "grad_norm": 6.803552150726318, + "learning_rate": 8.162943675483077e-06, + "loss": 0.4559, + "step": 14156 + }, + { + "epoch": 0.19203743895822029, + "grad_norm": 4.8194122314453125, + "learning_rate": 8.16280663286282e-06, + "loss": 0.3335, + "step": 14157 + }, + { + "epoch": 0.19205100379815518, + "grad_norm": 5.936041355133057, + "learning_rate": 8.162669590242565e-06, + "loss": 0.391, + "step": 14158 + }, + { + "epoch": 0.19206456863809007, + "grad_norm": 4.54529333114624, + "learning_rate": 8.162532547622312e-06, + "loss": 0.2563, + "step": 14159 + }, + { + "epoch": 0.19207813347802496, + "grad_norm": 5.9566192626953125, + "learning_rate": 8.162395505002056e-06, + "loss": 0.3928, + "step": 14160 + }, + { + "epoch": 0.19209169831795986, + "grad_norm": 7.0343475341796875, + "learning_rate": 8.162258462381801e-06, + "loss": 0.417, + "step": 14161 + }, + { + "epoch": 0.19210526315789472, + "grad_norm": 7.104079723358154, + "learning_rate": 8.162121419761546e-06, + "loss": 0.4168, + "step": 14162 + }, + { + "epoch": 0.19211882799782962, + "grad_norm": 6.979165077209473, + "learning_rate": 8.161984377141291e-06, + "loss": 0.3765, + "step": 14163 + }, + { + "epoch": 0.1921323928377645, + "grad_norm": 6.390807151794434, + "learning_rate": 8.161847334521037e-06, + "loss": 0.2906, + "step": 14164 + }, + { + "epoch": 0.1921459576776994, + "grad_norm": 5.70682430267334, + "learning_rate": 8.161710291900782e-06, + "loss": 0.4505, + "step": 14165 + }, + { + "epoch": 0.1921595225176343, + "grad_norm": 8.522321701049805, + "learning_rate": 8.161573249280527e-06, + "loss": 0.4795, + "step": 14166 + }, + { + "epoch": 0.1921730873575692, + "grad_norm": 9.900959968566895, + "learning_rate": 8.161436206660272e-06, + "loss": 0.5203, + "step": 14167 + }, + { + "epoch": 0.19218665219750408, + "grad_norm": 6.0258684158325195, + "learning_rate": 8.161299164040017e-06, + "loss": 0.3704, + "step": 14168 + }, + { + "epoch": 0.19220021703743895, + "grad_norm": 5.286050319671631, + "learning_rate": 8.161162121419762e-06, + "loss": 0.2862, + "step": 14169 + }, + { + "epoch": 0.19221378187737384, + "grad_norm": 6.605103969573975, + "learning_rate": 8.161025078799508e-06, + "loss": 0.4716, + "step": 14170 + }, + { + "epoch": 0.19222734671730873, + "grad_norm": 7.704184532165527, + "learning_rate": 8.160888036179253e-06, + "loss": 0.4663, + "step": 14171 + }, + { + "epoch": 0.19224091155724363, + "grad_norm": 6.065963268280029, + "learning_rate": 8.160750993558998e-06, + "loss": 0.4714, + "step": 14172 + }, + { + "epoch": 0.19225447639717852, + "grad_norm": 5.554594993591309, + "learning_rate": 8.160613950938743e-06, + "loss": 0.3729, + "step": 14173 + }, + { + "epoch": 0.1922680412371134, + "grad_norm": 5.025240898132324, + "learning_rate": 8.160476908318488e-06, + "loss": 0.3496, + "step": 14174 + }, + { + "epoch": 0.19228160607704828, + "grad_norm": 4.032158851623535, + "learning_rate": 8.160339865698232e-06, + "loss": 0.3687, + "step": 14175 + }, + { + "epoch": 0.19229517091698317, + "grad_norm": 4.863280296325684, + "learning_rate": 8.160202823077979e-06, + "loss": 0.4414, + "step": 14176 + }, + { + "epoch": 0.19230873575691806, + "grad_norm": 5.492260456085205, + "learning_rate": 8.160065780457724e-06, + "loss": 0.3841, + "step": 14177 + }, + { + "epoch": 0.19232230059685296, + "grad_norm": 6.880600929260254, + "learning_rate": 8.159928737837467e-06, + "loss": 0.5814, + "step": 14178 + }, + { + "epoch": 0.19233586543678785, + "grad_norm": 5.143429756164551, + "learning_rate": 8.159791695217213e-06, + "loss": 0.3486, + "step": 14179 + }, + { + "epoch": 0.19234943027672274, + "grad_norm": 5.765141010284424, + "learning_rate": 8.159654652596958e-06, + "loss": 0.3803, + "step": 14180 + }, + { + "epoch": 0.19236299511665764, + "grad_norm": 5.810072422027588, + "learning_rate": 8.159517609976703e-06, + "loss": 0.4676, + "step": 14181 + }, + { + "epoch": 0.1923765599565925, + "grad_norm": 5.276987552642822, + "learning_rate": 8.159380567356448e-06, + "loss": 0.3454, + "step": 14182 + }, + { + "epoch": 0.1923901247965274, + "grad_norm": 4.415961742401123, + "learning_rate": 8.159243524736193e-06, + "loss": 0.3645, + "step": 14183 + }, + { + "epoch": 0.1924036896364623, + "grad_norm": 5.977422714233398, + "learning_rate": 8.159106482115938e-06, + "loss": 0.3962, + "step": 14184 + }, + { + "epoch": 0.19241725447639718, + "grad_norm": 4.679018497467041, + "learning_rate": 8.158969439495684e-06, + "loss": 0.3061, + "step": 14185 + }, + { + "epoch": 0.19243081931633207, + "grad_norm": 9.425871849060059, + "learning_rate": 8.158832396875429e-06, + "loss": 0.5202, + "step": 14186 + }, + { + "epoch": 0.19244438415626697, + "grad_norm": 7.0117387771606445, + "learning_rate": 8.158695354255174e-06, + "loss": 0.3371, + "step": 14187 + }, + { + "epoch": 0.19245794899620183, + "grad_norm": 5.200222969055176, + "learning_rate": 8.158558311634919e-06, + "loss": 0.3421, + "step": 14188 + }, + { + "epoch": 0.19247151383613673, + "grad_norm": 4.541850566864014, + "learning_rate": 8.158421269014664e-06, + "loss": 0.2886, + "step": 14189 + }, + { + "epoch": 0.19248507867607162, + "grad_norm": 5.594569206237793, + "learning_rate": 8.15828422639441e-06, + "loss": 0.3394, + "step": 14190 + }, + { + "epoch": 0.1924986435160065, + "grad_norm": 5.091324329376221, + "learning_rate": 8.158147183774155e-06, + "loss": 0.266, + "step": 14191 + }, + { + "epoch": 0.1925122083559414, + "grad_norm": 5.664504528045654, + "learning_rate": 8.1580101411539e-06, + "loss": 0.3387, + "step": 14192 + }, + { + "epoch": 0.1925257731958763, + "grad_norm": 4.394266605377197, + "learning_rate": 8.157873098533645e-06, + "loss": 0.3282, + "step": 14193 + }, + { + "epoch": 0.19253933803581116, + "grad_norm": 4.794620037078857, + "learning_rate": 8.15773605591339e-06, + "loss": 0.2996, + "step": 14194 + }, + { + "epoch": 0.19255290287574606, + "grad_norm": 4.247023582458496, + "learning_rate": 8.157599013293135e-06, + "loss": 0.2683, + "step": 14195 + }, + { + "epoch": 0.19256646771568095, + "grad_norm": 6.143507957458496, + "learning_rate": 8.157461970672879e-06, + "loss": 0.2869, + "step": 14196 + }, + { + "epoch": 0.19258003255561584, + "grad_norm": 6.017621994018555, + "learning_rate": 8.157324928052624e-06, + "loss": 0.3281, + "step": 14197 + }, + { + "epoch": 0.19259359739555074, + "grad_norm": 5.997939586639404, + "learning_rate": 8.157187885432371e-06, + "loss": 0.4385, + "step": 14198 + }, + { + "epoch": 0.19260716223548563, + "grad_norm": 5.270591735839844, + "learning_rate": 8.157050842812116e-06, + "loss": 0.3645, + "step": 14199 + }, + { + "epoch": 0.19262072707542052, + "grad_norm": 4.725484848022461, + "learning_rate": 8.15691380019186e-06, + "loss": 0.3045, + "step": 14200 + }, + { + "epoch": 0.1926342919153554, + "grad_norm": 5.73333215713501, + "learning_rate": 8.156776757571605e-06, + "loss": 0.4417, + "step": 14201 + }, + { + "epoch": 0.19264785675529028, + "grad_norm": 5.703635215759277, + "learning_rate": 8.156639714951352e-06, + "loss": 0.3379, + "step": 14202 + }, + { + "epoch": 0.19266142159522517, + "grad_norm": 5.802382469177246, + "learning_rate": 8.156502672331095e-06, + "loss": 0.3101, + "step": 14203 + }, + { + "epoch": 0.19267498643516007, + "grad_norm": 5.526337623596191, + "learning_rate": 8.15636562971084e-06, + "loss": 0.3034, + "step": 14204 + }, + { + "epoch": 0.19268855127509496, + "grad_norm": 4.846706867218018, + "learning_rate": 8.156228587090585e-06, + "loss": 0.3324, + "step": 14205 + }, + { + "epoch": 0.19270211611502985, + "grad_norm": 8.658203125, + "learning_rate": 8.15609154447033e-06, + "loss": 0.5136, + "step": 14206 + }, + { + "epoch": 0.19271568095496472, + "grad_norm": 5.353655815124512, + "learning_rate": 8.155954501850076e-06, + "loss": 0.3596, + "step": 14207 + }, + { + "epoch": 0.1927292457948996, + "grad_norm": 5.405937194824219, + "learning_rate": 8.155817459229821e-06, + "loss": 0.2378, + "step": 14208 + }, + { + "epoch": 0.1927428106348345, + "grad_norm": 4.384707927703857, + "learning_rate": 8.155680416609566e-06, + "loss": 0.2592, + "step": 14209 + }, + { + "epoch": 0.1927563754747694, + "grad_norm": 4.919930934906006, + "learning_rate": 8.155543373989311e-06, + "loss": 0.2813, + "step": 14210 + }, + { + "epoch": 0.1927699403147043, + "grad_norm": 4.821085453033447, + "learning_rate": 8.155406331369057e-06, + "loss": 0.2162, + "step": 14211 + }, + { + "epoch": 0.19278350515463918, + "grad_norm": 5.375413417816162, + "learning_rate": 8.155269288748802e-06, + "loss": 0.2638, + "step": 14212 + }, + { + "epoch": 0.19279706999457408, + "grad_norm": 4.883310317993164, + "learning_rate": 8.155132246128547e-06, + "loss": 0.2941, + "step": 14213 + }, + { + "epoch": 0.19281063483450894, + "grad_norm": 3.8810176849365234, + "learning_rate": 8.154995203508292e-06, + "loss": 0.2076, + "step": 14214 + }, + { + "epoch": 0.19282419967444384, + "grad_norm": 4.180809497833252, + "learning_rate": 8.154858160888037e-06, + "loss": 0.3272, + "step": 14215 + }, + { + "epoch": 0.19283776451437873, + "grad_norm": 5.531719207763672, + "learning_rate": 8.154721118267782e-06, + "loss": 0.3143, + "step": 14216 + }, + { + "epoch": 0.19285132935431362, + "grad_norm": 5.045733451843262, + "learning_rate": 8.154584075647528e-06, + "loss": 0.3419, + "step": 14217 + }, + { + "epoch": 0.19286489419424852, + "grad_norm": 6.8516998291015625, + "learning_rate": 8.154447033027271e-06, + "loss": 0.3961, + "step": 14218 + }, + { + "epoch": 0.1928784590341834, + "grad_norm": 3.9515273571014404, + "learning_rate": 8.154309990407018e-06, + "loss": 0.3034, + "step": 14219 + }, + { + "epoch": 0.19289202387411827, + "grad_norm": 6.1469573974609375, + "learning_rate": 8.154172947786763e-06, + "loss": 0.3763, + "step": 14220 + }, + { + "epoch": 0.19290558871405317, + "grad_norm": 4.707149982452393, + "learning_rate": 8.154035905166507e-06, + "loss": 0.229, + "step": 14221 + }, + { + "epoch": 0.19291915355398806, + "grad_norm": 5.004936695098877, + "learning_rate": 8.153898862546252e-06, + "loss": 0.3517, + "step": 14222 + }, + { + "epoch": 0.19293271839392295, + "grad_norm": 5.257476329803467, + "learning_rate": 8.153761819925997e-06, + "loss": 0.3467, + "step": 14223 + }, + { + "epoch": 0.19294628323385785, + "grad_norm": 6.424355506896973, + "learning_rate": 8.153624777305744e-06, + "loss": 0.2673, + "step": 14224 + }, + { + "epoch": 0.19295984807379274, + "grad_norm": 6.053681373596191, + "learning_rate": 8.153487734685487e-06, + "loss": 0.28, + "step": 14225 + }, + { + "epoch": 0.1929734129137276, + "grad_norm": 5.0441670417785645, + "learning_rate": 8.153350692065233e-06, + "loss": 0.3842, + "step": 14226 + }, + { + "epoch": 0.1929869777536625, + "grad_norm": 5.181633472442627, + "learning_rate": 8.153213649444978e-06, + "loss": 0.3078, + "step": 14227 + }, + { + "epoch": 0.1930005425935974, + "grad_norm": 3.703782320022583, + "learning_rate": 8.153076606824723e-06, + "loss": 0.1985, + "step": 14228 + }, + { + "epoch": 0.19301410743353228, + "grad_norm": 5.049402713775635, + "learning_rate": 8.152939564204468e-06, + "loss": 0.3284, + "step": 14229 + }, + { + "epoch": 0.19302767227346718, + "grad_norm": 4.543796062469482, + "learning_rate": 8.152802521584213e-06, + "loss": 0.1769, + "step": 14230 + }, + { + "epoch": 0.19304123711340207, + "grad_norm": 5.757754802703857, + "learning_rate": 8.152665478963958e-06, + "loss": 0.289, + "step": 14231 + }, + { + "epoch": 0.19305480195333696, + "grad_norm": 5.387282371520996, + "learning_rate": 8.152528436343704e-06, + "loss": 0.2556, + "step": 14232 + }, + { + "epoch": 0.19306836679327183, + "grad_norm": 7.4175310134887695, + "learning_rate": 8.152391393723449e-06, + "loss": 0.4639, + "step": 14233 + }, + { + "epoch": 0.19308193163320672, + "grad_norm": 5.342935562133789, + "learning_rate": 8.152254351103194e-06, + "loss": 0.3187, + "step": 14234 + }, + { + "epoch": 0.19309549647314161, + "grad_norm": 6.308545112609863, + "learning_rate": 8.15211730848294e-06, + "loss": 0.4375, + "step": 14235 + }, + { + "epoch": 0.1931090613130765, + "grad_norm": 4.345129013061523, + "learning_rate": 8.151980265862683e-06, + "loss": 0.3099, + "step": 14236 + }, + { + "epoch": 0.1931226261530114, + "grad_norm": 4.8878912925720215, + "learning_rate": 8.15184322324243e-06, + "loss": 0.2675, + "step": 14237 + }, + { + "epoch": 0.1931361909929463, + "grad_norm": 5.310513496398926, + "learning_rate": 8.151706180622175e-06, + "loss": 0.2694, + "step": 14238 + }, + { + "epoch": 0.19314975583288116, + "grad_norm": 5.291018009185791, + "learning_rate": 8.15156913800192e-06, + "loss": 0.2701, + "step": 14239 + }, + { + "epoch": 0.19316332067281605, + "grad_norm": 6.31868839263916, + "learning_rate": 8.151432095381663e-06, + "loss": 0.326, + "step": 14240 + }, + { + "epoch": 0.19317688551275095, + "grad_norm": 5.24117374420166, + "learning_rate": 8.15129505276141e-06, + "loss": 0.2285, + "step": 14241 + }, + { + "epoch": 0.19319045035268584, + "grad_norm": 5.843963146209717, + "learning_rate": 8.151158010141155e-06, + "loss": 0.2988, + "step": 14242 + }, + { + "epoch": 0.19320401519262073, + "grad_norm": 6.507763862609863, + "learning_rate": 8.151020967520899e-06, + "loss": 0.2504, + "step": 14243 + }, + { + "epoch": 0.19321758003255562, + "grad_norm": 6.524658203125, + "learning_rate": 8.150883924900644e-06, + "loss": 0.3443, + "step": 14244 + }, + { + "epoch": 0.19323114487249052, + "grad_norm": 6.394744396209717, + "learning_rate": 8.150746882280391e-06, + "loss": 0.3142, + "step": 14245 + }, + { + "epoch": 0.19324470971242538, + "grad_norm": 6.060706615447998, + "learning_rate": 8.150609839660134e-06, + "loss": 0.3806, + "step": 14246 + }, + { + "epoch": 0.19325827455236028, + "grad_norm": 4.91711950302124, + "learning_rate": 8.15047279703988e-06, + "loss": 0.2483, + "step": 14247 + }, + { + "epoch": 0.19327183939229517, + "grad_norm": 8.458311080932617, + "learning_rate": 8.150335754419625e-06, + "loss": 0.5095, + "step": 14248 + }, + { + "epoch": 0.19328540423223006, + "grad_norm": 7.002152442932129, + "learning_rate": 8.15019871179937e-06, + "loss": 0.3096, + "step": 14249 + }, + { + "epoch": 0.19329896907216496, + "grad_norm": 6.250828742980957, + "learning_rate": 8.150061669179115e-06, + "loss": 0.3, + "step": 14250 + }, + { + "epoch": 0.19331253391209985, + "grad_norm": 6.204843997955322, + "learning_rate": 8.14992462655886e-06, + "loss": 0.2863, + "step": 14251 + }, + { + "epoch": 0.19332609875203471, + "grad_norm": 6.380445957183838, + "learning_rate": 8.149787583938606e-06, + "loss": 0.332, + "step": 14252 + }, + { + "epoch": 0.1933396635919696, + "grad_norm": 6.9607086181640625, + "learning_rate": 8.14965054131835e-06, + "loss": 0.3762, + "step": 14253 + }, + { + "epoch": 0.1933532284319045, + "grad_norm": 4.284040451049805, + "learning_rate": 8.149513498698096e-06, + "loss": 0.2094, + "step": 14254 + }, + { + "epoch": 0.1933667932718394, + "grad_norm": 5.9960103034973145, + "learning_rate": 8.149376456077841e-06, + "loss": 0.4011, + "step": 14255 + }, + { + "epoch": 0.1933803581117743, + "grad_norm": 5.0700364112854, + "learning_rate": 8.149239413457586e-06, + "loss": 0.2644, + "step": 14256 + }, + { + "epoch": 0.19339392295170918, + "grad_norm": 6.051250457763672, + "learning_rate": 8.149102370837331e-06, + "loss": 0.245, + "step": 14257 + }, + { + "epoch": 0.19340748779164404, + "grad_norm": 5.461134910583496, + "learning_rate": 8.148965328217077e-06, + "loss": 0.3279, + "step": 14258 + }, + { + "epoch": 0.19342105263157894, + "grad_norm": 6.488198280334473, + "learning_rate": 8.148828285596822e-06, + "loss": 0.3748, + "step": 14259 + }, + { + "epoch": 0.19343461747151383, + "grad_norm": 6.844749927520752, + "learning_rate": 8.148691242976567e-06, + "loss": 0.3283, + "step": 14260 + }, + { + "epoch": 0.19344818231144872, + "grad_norm": 4.265383243560791, + "learning_rate": 8.14855420035631e-06, + "loss": 0.1743, + "step": 14261 + }, + { + "epoch": 0.19346174715138362, + "grad_norm": 4.090991497039795, + "learning_rate": 8.148417157736057e-06, + "loss": 0.153, + "step": 14262 + }, + { + "epoch": 0.1934753119913185, + "grad_norm": 8.895841598510742, + "learning_rate": 8.148280115115802e-06, + "loss": 0.4742, + "step": 14263 + }, + { + "epoch": 0.1934888768312534, + "grad_norm": 5.976553440093994, + "learning_rate": 8.148143072495546e-06, + "loss": 0.4091, + "step": 14264 + }, + { + "epoch": 0.19350244167118827, + "grad_norm": 5.639764308929443, + "learning_rate": 8.148006029875291e-06, + "loss": 0.2684, + "step": 14265 + }, + { + "epoch": 0.19351600651112316, + "grad_norm": 8.513866424560547, + "learning_rate": 8.147868987255036e-06, + "loss": 0.5091, + "step": 14266 + }, + { + "epoch": 0.19352957135105806, + "grad_norm": 7.274280071258545, + "learning_rate": 8.147731944634783e-06, + "loss": 0.4641, + "step": 14267 + }, + { + "epoch": 0.19354313619099295, + "grad_norm": 5.2791032791137695, + "learning_rate": 8.147594902014527e-06, + "loss": 0.2297, + "step": 14268 + }, + { + "epoch": 0.19355670103092784, + "grad_norm": 7.9006757736206055, + "learning_rate": 8.147457859394272e-06, + "loss": 0.421, + "step": 14269 + }, + { + "epoch": 0.19357026587086273, + "grad_norm": 4.920236110687256, + "learning_rate": 8.147320816774017e-06, + "loss": 0.2906, + "step": 14270 + }, + { + "epoch": 0.1935838307107976, + "grad_norm": 5.153635501861572, + "learning_rate": 8.147183774153762e-06, + "loss": 0.2781, + "step": 14271 + }, + { + "epoch": 0.1935973955507325, + "grad_norm": 6.167893409729004, + "learning_rate": 8.147046731533507e-06, + "loss": 0.4632, + "step": 14272 + }, + { + "epoch": 0.19361096039066739, + "grad_norm": 7.480136394500732, + "learning_rate": 8.146909688913253e-06, + "loss": 0.5189, + "step": 14273 + }, + { + "epoch": 0.19362452523060228, + "grad_norm": 7.1258039474487305, + "learning_rate": 8.146772646292998e-06, + "loss": 0.3851, + "step": 14274 + }, + { + "epoch": 0.19363809007053717, + "grad_norm": 6.062307357788086, + "learning_rate": 8.146635603672743e-06, + "loss": 0.3358, + "step": 14275 + }, + { + "epoch": 0.19365165491047207, + "grad_norm": 6.033874034881592, + "learning_rate": 8.146498561052488e-06, + "loss": 0.3758, + "step": 14276 + }, + { + "epoch": 0.19366521975040696, + "grad_norm": 7.219758987426758, + "learning_rate": 8.146361518432233e-06, + "loss": 0.3964, + "step": 14277 + }, + { + "epoch": 0.19367878459034182, + "grad_norm": 7.252903461456299, + "learning_rate": 8.146224475811978e-06, + "loss": 0.4038, + "step": 14278 + }, + { + "epoch": 0.19369234943027672, + "grad_norm": 8.09618091583252, + "learning_rate": 8.146087433191722e-06, + "loss": 0.4608, + "step": 14279 + }, + { + "epoch": 0.1937059142702116, + "grad_norm": 7.14844274520874, + "learning_rate": 8.145950390571469e-06, + "loss": 0.3606, + "step": 14280 + }, + { + "epoch": 0.1937194791101465, + "grad_norm": 5.28922176361084, + "learning_rate": 8.145813347951214e-06, + "loss": 0.4017, + "step": 14281 + }, + { + "epoch": 0.1937330439500814, + "grad_norm": 4.227828502655029, + "learning_rate": 8.14567630533096e-06, + "loss": 0.2551, + "step": 14282 + }, + { + "epoch": 0.1937466087900163, + "grad_norm": 6.2472710609436035, + "learning_rate": 8.145539262710703e-06, + "loss": 0.2659, + "step": 14283 + }, + { + "epoch": 0.19376017362995115, + "grad_norm": 5.882026195526123, + "learning_rate": 8.14540222009045e-06, + "loss": 0.3777, + "step": 14284 + }, + { + "epoch": 0.19377373846988605, + "grad_norm": 6.1064558029174805, + "learning_rate": 8.145265177470195e-06, + "loss": 0.2899, + "step": 14285 + }, + { + "epoch": 0.19378730330982094, + "grad_norm": 6.351465225219727, + "learning_rate": 8.145128134849938e-06, + "loss": 0.3542, + "step": 14286 + }, + { + "epoch": 0.19380086814975583, + "grad_norm": 7.554667949676514, + "learning_rate": 8.144991092229683e-06, + "loss": 0.4684, + "step": 14287 + }, + { + "epoch": 0.19381443298969073, + "grad_norm": 5.971450328826904, + "learning_rate": 8.14485404960943e-06, + "loss": 0.2292, + "step": 14288 + }, + { + "epoch": 0.19382799782962562, + "grad_norm": 5.613182067871094, + "learning_rate": 8.144717006989174e-06, + "loss": 0.2719, + "step": 14289 + }, + { + "epoch": 0.19384156266956049, + "grad_norm": 4.5443878173828125, + "learning_rate": 8.144579964368919e-06, + "loss": 0.2861, + "step": 14290 + }, + { + "epoch": 0.19385512750949538, + "grad_norm": 5.038177490234375, + "learning_rate": 8.144442921748664e-06, + "loss": 0.2724, + "step": 14291 + }, + { + "epoch": 0.19386869234943027, + "grad_norm": 7.584300518035889, + "learning_rate": 8.14430587912841e-06, + "loss": 0.3092, + "step": 14292 + }, + { + "epoch": 0.19388225718936516, + "grad_norm": 5.5119829177856445, + "learning_rate": 8.144168836508154e-06, + "loss": 0.3849, + "step": 14293 + }, + { + "epoch": 0.19389582202930006, + "grad_norm": 6.659787178039551, + "learning_rate": 8.1440317938879e-06, + "loss": 0.374, + "step": 14294 + }, + { + "epoch": 0.19390938686923495, + "grad_norm": 6.950528621673584, + "learning_rate": 8.143894751267645e-06, + "loss": 0.499, + "step": 14295 + }, + { + "epoch": 0.19392295170916984, + "grad_norm": 6.570927619934082, + "learning_rate": 8.14375770864739e-06, + "loss": 0.3614, + "step": 14296 + }, + { + "epoch": 0.1939365165491047, + "grad_norm": 8.745306015014648, + "learning_rate": 8.143620666027135e-06, + "loss": 0.7141, + "step": 14297 + }, + { + "epoch": 0.1939500813890396, + "grad_norm": 8.226370811462402, + "learning_rate": 8.14348362340688e-06, + "loss": 0.4365, + "step": 14298 + }, + { + "epoch": 0.1939636462289745, + "grad_norm": 6.486948013305664, + "learning_rate": 8.143346580786626e-06, + "loss": 0.4154, + "step": 14299 + }, + { + "epoch": 0.1939772110689094, + "grad_norm": 6.1164984703063965, + "learning_rate": 8.14320953816637e-06, + "loss": 0.4229, + "step": 14300 + }, + { + "epoch": 0.19399077590884428, + "grad_norm": 6.646454334259033, + "learning_rate": 8.143072495546116e-06, + "loss": 0.4297, + "step": 14301 + }, + { + "epoch": 0.19400434074877917, + "grad_norm": 6.8814849853515625, + "learning_rate": 8.142935452925861e-06, + "loss": 0.564, + "step": 14302 + }, + { + "epoch": 0.19401790558871404, + "grad_norm": 5.884591102600098, + "learning_rate": 8.142798410305606e-06, + "loss": 0.4156, + "step": 14303 + }, + { + "epoch": 0.19403147042864893, + "grad_norm": 5.915832042694092, + "learning_rate": 8.14266136768535e-06, + "loss": 0.412, + "step": 14304 + }, + { + "epoch": 0.19404503526858383, + "grad_norm": 5.8754448890686035, + "learning_rate": 8.142524325065095e-06, + "loss": 0.2656, + "step": 14305 + }, + { + "epoch": 0.19405860010851872, + "grad_norm": 5.073643684387207, + "learning_rate": 8.142387282444842e-06, + "loss": 0.3493, + "step": 14306 + }, + { + "epoch": 0.1940721649484536, + "grad_norm": 6.0509538650512695, + "learning_rate": 8.142250239824587e-06, + "loss": 0.3118, + "step": 14307 + }, + { + "epoch": 0.1940857297883885, + "grad_norm": 5.04714822769165, + "learning_rate": 8.14211319720433e-06, + "loss": 0.2381, + "step": 14308 + }, + { + "epoch": 0.1940992946283234, + "grad_norm": 6.083760738372803, + "learning_rate": 8.141976154584076e-06, + "loss": 0.3712, + "step": 14309 + }, + { + "epoch": 0.19411285946825826, + "grad_norm": 4.995960235595703, + "learning_rate": 8.141839111963823e-06, + "loss": 0.3275, + "step": 14310 + }, + { + "epoch": 0.19412642430819316, + "grad_norm": 5.098589897155762, + "learning_rate": 8.141702069343566e-06, + "loss": 0.4464, + "step": 14311 + }, + { + "epoch": 0.19413998914812805, + "grad_norm": 6.2965803146362305, + "learning_rate": 8.141565026723311e-06, + "loss": 0.4375, + "step": 14312 + }, + { + "epoch": 0.19415355398806294, + "grad_norm": 6.367603778839111, + "learning_rate": 8.141427984103056e-06, + "loss": 0.36, + "step": 14313 + }, + { + "epoch": 0.19416711882799784, + "grad_norm": 6.7015838623046875, + "learning_rate": 8.141290941482802e-06, + "loss": 0.7291, + "step": 14314 + }, + { + "epoch": 0.19418068366793273, + "grad_norm": 4.669753551483154, + "learning_rate": 8.141153898862547e-06, + "loss": 0.206, + "step": 14315 + }, + { + "epoch": 0.1941942485078676, + "grad_norm": 5.928323745727539, + "learning_rate": 8.141016856242292e-06, + "loss": 0.3241, + "step": 14316 + }, + { + "epoch": 0.1942078133478025, + "grad_norm": 5.373211860656738, + "learning_rate": 8.140879813622037e-06, + "loss": 0.313, + "step": 14317 + }, + { + "epoch": 0.19422137818773738, + "grad_norm": 6.48054313659668, + "learning_rate": 8.140742771001782e-06, + "loss": 0.341, + "step": 14318 + }, + { + "epoch": 0.19423494302767227, + "grad_norm": 7.616123199462891, + "learning_rate": 8.140605728381527e-06, + "loss": 0.4273, + "step": 14319 + }, + { + "epoch": 0.19424850786760717, + "grad_norm": 5.312366008758545, + "learning_rate": 8.140468685761273e-06, + "loss": 0.312, + "step": 14320 + }, + { + "epoch": 0.19426207270754206, + "grad_norm": 5.6110968589782715, + "learning_rate": 8.140331643141018e-06, + "loss": 0.3516, + "step": 14321 + }, + { + "epoch": 0.19427563754747693, + "grad_norm": 6.477969169616699, + "learning_rate": 8.140194600520763e-06, + "loss": 0.3998, + "step": 14322 + }, + { + "epoch": 0.19428920238741182, + "grad_norm": 7.74830961227417, + "learning_rate": 8.140057557900508e-06, + "loss": 0.4076, + "step": 14323 + }, + { + "epoch": 0.1943027672273467, + "grad_norm": 5.769705295562744, + "learning_rate": 8.139920515280253e-06, + "loss": 0.3371, + "step": 14324 + }, + { + "epoch": 0.1943163320672816, + "grad_norm": 7.547155857086182, + "learning_rate": 8.139783472659998e-06, + "loss": 0.4596, + "step": 14325 + }, + { + "epoch": 0.1943298969072165, + "grad_norm": 8.272071838378906, + "learning_rate": 8.139646430039742e-06, + "loss": 0.4613, + "step": 14326 + }, + { + "epoch": 0.1943434617471514, + "grad_norm": 5.795720100402832, + "learning_rate": 8.139509387419489e-06, + "loss": 0.3845, + "step": 14327 + }, + { + "epoch": 0.19435702658708628, + "grad_norm": 5.413476467132568, + "learning_rate": 8.139372344799234e-06, + "loss": 0.3833, + "step": 14328 + }, + { + "epoch": 0.19437059142702115, + "grad_norm": 5.095373630523682, + "learning_rate": 8.139235302178978e-06, + "loss": 0.2865, + "step": 14329 + }, + { + "epoch": 0.19438415626695604, + "grad_norm": 4.680815696716309, + "learning_rate": 8.139098259558723e-06, + "loss": 0.3855, + "step": 14330 + }, + { + "epoch": 0.19439772110689094, + "grad_norm": 7.8455681800842285, + "learning_rate": 8.13896121693847e-06, + "loss": 0.4076, + "step": 14331 + }, + { + "epoch": 0.19441128594682583, + "grad_norm": 6.405622482299805, + "learning_rate": 8.138824174318215e-06, + "loss": 0.4255, + "step": 14332 + }, + { + "epoch": 0.19442485078676072, + "grad_norm": 9.39245891571045, + "learning_rate": 8.138687131697958e-06, + "loss": 0.6085, + "step": 14333 + }, + { + "epoch": 0.19443841562669562, + "grad_norm": 7.002583026885986, + "learning_rate": 8.138550089077703e-06, + "loss": 0.4957, + "step": 14334 + }, + { + "epoch": 0.19445198046663048, + "grad_norm": 6.480003356933594, + "learning_rate": 8.138413046457449e-06, + "loss": 0.3864, + "step": 14335 + }, + { + "epoch": 0.19446554530656537, + "grad_norm": 7.1252121925354, + "learning_rate": 8.138276003837194e-06, + "loss": 0.5432, + "step": 14336 + }, + { + "epoch": 0.19447911014650027, + "grad_norm": 5.866519451141357, + "learning_rate": 8.138138961216939e-06, + "loss": 0.3302, + "step": 14337 + }, + { + "epoch": 0.19449267498643516, + "grad_norm": 4.498589515686035, + "learning_rate": 8.138001918596684e-06, + "loss": 0.2997, + "step": 14338 + }, + { + "epoch": 0.19450623982637005, + "grad_norm": 5.878735065460205, + "learning_rate": 8.13786487597643e-06, + "loss": 0.2633, + "step": 14339 + }, + { + "epoch": 0.19451980466630495, + "grad_norm": 7.491512298583984, + "learning_rate": 8.137727833356174e-06, + "loss": 0.3962, + "step": 14340 + }, + { + "epoch": 0.19453336950623984, + "grad_norm": 6.291327476501465, + "learning_rate": 8.13759079073592e-06, + "loss": 0.403, + "step": 14341 + }, + { + "epoch": 0.1945469343461747, + "grad_norm": 6.330262660980225, + "learning_rate": 8.137453748115665e-06, + "loss": 0.52, + "step": 14342 + }, + { + "epoch": 0.1945604991861096, + "grad_norm": 4.684292316436768, + "learning_rate": 8.13731670549541e-06, + "loss": 0.3047, + "step": 14343 + }, + { + "epoch": 0.1945740640260445, + "grad_norm": 6.526506423950195, + "learning_rate": 8.137179662875155e-06, + "loss": 0.2038, + "step": 14344 + }, + { + "epoch": 0.19458762886597938, + "grad_norm": 7.026280403137207, + "learning_rate": 8.1370426202549e-06, + "loss": 0.5528, + "step": 14345 + }, + { + "epoch": 0.19460119370591428, + "grad_norm": 6.026504993438721, + "learning_rate": 8.136905577634646e-06, + "loss": 0.3421, + "step": 14346 + }, + { + "epoch": 0.19461475854584917, + "grad_norm": 6.755553722381592, + "learning_rate": 8.13676853501439e-06, + "loss": 0.4071, + "step": 14347 + }, + { + "epoch": 0.19462832338578404, + "grad_norm": 6.780742168426514, + "learning_rate": 8.136631492394134e-06, + "loss": 0.4227, + "step": 14348 + }, + { + "epoch": 0.19464188822571893, + "grad_norm": 5.583523750305176, + "learning_rate": 8.136494449773881e-06, + "loss": 0.4827, + "step": 14349 + }, + { + "epoch": 0.19465545306565382, + "grad_norm": 5.962168216705322, + "learning_rate": 8.136357407153626e-06, + "loss": 0.3559, + "step": 14350 + }, + { + "epoch": 0.19466901790558871, + "grad_norm": 5.212188243865967, + "learning_rate": 8.13622036453337e-06, + "loss": 0.2926, + "step": 14351 + }, + { + "epoch": 0.1946825827455236, + "grad_norm": 5.045587062835693, + "learning_rate": 8.136083321913115e-06, + "loss": 0.3015, + "step": 14352 + }, + { + "epoch": 0.1946961475854585, + "grad_norm": 5.627458095550537, + "learning_rate": 8.135946279292862e-06, + "loss": 0.2696, + "step": 14353 + }, + { + "epoch": 0.1947097124253934, + "grad_norm": 5.811127662658691, + "learning_rate": 8.135809236672605e-06, + "loss": 0.3504, + "step": 14354 + }, + { + "epoch": 0.19472327726532826, + "grad_norm": 4.970917701721191, + "learning_rate": 8.13567219405235e-06, + "loss": 0.35, + "step": 14355 + }, + { + "epoch": 0.19473684210526315, + "grad_norm": 6.067661762237549, + "learning_rate": 8.135535151432096e-06, + "loss": 0.369, + "step": 14356 + }, + { + "epoch": 0.19475040694519805, + "grad_norm": 3.987863779067993, + "learning_rate": 8.13539810881184e-06, + "loss": 0.1793, + "step": 14357 + }, + { + "epoch": 0.19476397178513294, + "grad_norm": 6.440133094787598, + "learning_rate": 8.135261066191586e-06, + "loss": 0.2706, + "step": 14358 + }, + { + "epoch": 0.19477753662506783, + "grad_norm": 6.396653175354004, + "learning_rate": 8.135124023571331e-06, + "loss": 0.3763, + "step": 14359 + }, + { + "epoch": 0.19479110146500273, + "grad_norm": 5.671993255615234, + "learning_rate": 8.134986980951076e-06, + "loss": 0.4432, + "step": 14360 + }, + { + "epoch": 0.1948046663049376, + "grad_norm": 6.943820476531982, + "learning_rate": 8.134849938330822e-06, + "loss": 0.3636, + "step": 14361 + }, + { + "epoch": 0.19481823114487248, + "grad_norm": 7.80357551574707, + "learning_rate": 8.134712895710567e-06, + "loss": 0.4726, + "step": 14362 + }, + { + "epoch": 0.19483179598480738, + "grad_norm": 5.955730438232422, + "learning_rate": 8.134575853090312e-06, + "loss": 0.2817, + "step": 14363 + }, + { + "epoch": 0.19484536082474227, + "grad_norm": 5.841419219970703, + "learning_rate": 8.134438810470057e-06, + "loss": 0.3619, + "step": 14364 + }, + { + "epoch": 0.19485892566467716, + "grad_norm": 6.775628566741943, + "learning_rate": 8.134301767849802e-06, + "loss": 0.4709, + "step": 14365 + }, + { + "epoch": 0.19487249050461206, + "grad_norm": 4.839331150054932, + "learning_rate": 8.134164725229547e-06, + "loss": 0.2458, + "step": 14366 + }, + { + "epoch": 0.19488605534454692, + "grad_norm": 5.574466705322266, + "learning_rate": 8.134027682609293e-06, + "loss": 0.3169, + "step": 14367 + }, + { + "epoch": 0.19489962018448181, + "grad_norm": 4.065123558044434, + "learning_rate": 8.133890639989038e-06, + "loss": 0.2546, + "step": 14368 + }, + { + "epoch": 0.1949131850244167, + "grad_norm": 5.31442928314209, + "learning_rate": 8.133753597368781e-06, + "loss": 0.3105, + "step": 14369 + }, + { + "epoch": 0.1949267498643516, + "grad_norm": 4.864023208618164, + "learning_rate": 8.133616554748528e-06, + "loss": 0.3372, + "step": 14370 + }, + { + "epoch": 0.1949403147042865, + "grad_norm": 8.66169261932373, + "learning_rate": 8.133479512128273e-06, + "loss": 0.4989, + "step": 14371 + }, + { + "epoch": 0.1949538795442214, + "grad_norm": 4.066705226898193, + "learning_rate": 8.133342469508017e-06, + "loss": 0.2931, + "step": 14372 + }, + { + "epoch": 0.19496744438415628, + "grad_norm": 4.774709701538086, + "learning_rate": 8.133205426887762e-06, + "loss": 0.3448, + "step": 14373 + }, + { + "epoch": 0.19498100922409115, + "grad_norm": 6.086483955383301, + "learning_rate": 8.133068384267507e-06, + "loss": 0.3752, + "step": 14374 + }, + { + "epoch": 0.19499457406402604, + "grad_norm": 4.790730953216553, + "learning_rate": 8.132931341647254e-06, + "loss": 0.3719, + "step": 14375 + }, + { + "epoch": 0.19500813890396093, + "grad_norm": 6.213069438934326, + "learning_rate": 8.132794299026998e-06, + "loss": 0.4269, + "step": 14376 + }, + { + "epoch": 0.19502170374389582, + "grad_norm": 6.642096996307373, + "learning_rate": 8.132657256406743e-06, + "loss": 0.4556, + "step": 14377 + }, + { + "epoch": 0.19503526858383072, + "grad_norm": 5.3962507247924805, + "learning_rate": 8.132520213786488e-06, + "loss": 0.382, + "step": 14378 + }, + { + "epoch": 0.1950488334237656, + "grad_norm": 5.930113315582275, + "learning_rate": 8.132383171166233e-06, + "loss": 0.2942, + "step": 14379 + }, + { + "epoch": 0.19506239826370048, + "grad_norm": 6.234333038330078, + "learning_rate": 8.132246128545978e-06, + "loss": 0.3107, + "step": 14380 + }, + { + "epoch": 0.19507596310363537, + "grad_norm": 4.855418682098389, + "learning_rate": 8.132109085925723e-06, + "loss": 0.3044, + "step": 14381 + }, + { + "epoch": 0.19508952794357026, + "grad_norm": 5.607755661010742, + "learning_rate": 8.131972043305469e-06, + "loss": 0.3122, + "step": 14382 + }, + { + "epoch": 0.19510309278350516, + "grad_norm": 4.8758463859558105, + "learning_rate": 8.131835000685214e-06, + "loss": 0.2691, + "step": 14383 + }, + { + "epoch": 0.19511665762344005, + "grad_norm": 5.131258010864258, + "learning_rate": 8.131697958064959e-06, + "loss": 0.3371, + "step": 14384 + }, + { + "epoch": 0.19513022246337494, + "grad_norm": 5.739354610443115, + "learning_rate": 8.131560915444704e-06, + "loss": 0.3582, + "step": 14385 + }, + { + "epoch": 0.19514378730330983, + "grad_norm": 8.186564445495605, + "learning_rate": 8.13142387282445e-06, + "loss": 0.4904, + "step": 14386 + }, + { + "epoch": 0.1951573521432447, + "grad_norm": 5.274497985839844, + "learning_rate": 8.131286830204193e-06, + "loss": 0.2868, + "step": 14387 + }, + { + "epoch": 0.1951709169831796, + "grad_norm": 6.7809038162231445, + "learning_rate": 8.13114978758394e-06, + "loss": 0.278, + "step": 14388 + }, + { + "epoch": 0.1951844818231145, + "grad_norm": 9.03918743133545, + "learning_rate": 8.131012744963685e-06, + "loss": 0.517, + "step": 14389 + }, + { + "epoch": 0.19519804666304938, + "grad_norm": 8.017101287841797, + "learning_rate": 8.13087570234343e-06, + "loss": 0.5533, + "step": 14390 + }, + { + "epoch": 0.19521161150298427, + "grad_norm": 8.778839111328125, + "learning_rate": 8.130738659723174e-06, + "loss": 0.4761, + "step": 14391 + }, + { + "epoch": 0.19522517634291917, + "grad_norm": 6.5464701652526855, + "learning_rate": 8.13060161710292e-06, + "loss": 0.3376, + "step": 14392 + }, + { + "epoch": 0.19523874118285403, + "grad_norm": 6.436671733856201, + "learning_rate": 8.130464574482666e-06, + "loss": 0.327, + "step": 14393 + }, + { + "epoch": 0.19525230602278892, + "grad_norm": 5.4458489418029785, + "learning_rate": 8.130327531862409e-06, + "loss": 0.2886, + "step": 14394 + }, + { + "epoch": 0.19526587086272382, + "grad_norm": 8.654898643493652, + "learning_rate": 8.130190489242154e-06, + "loss": 0.4681, + "step": 14395 + }, + { + "epoch": 0.1952794357026587, + "grad_norm": 4.619458198547363, + "learning_rate": 8.130053446621901e-06, + "loss": 0.3797, + "step": 14396 + }, + { + "epoch": 0.1952930005425936, + "grad_norm": 5.790047645568848, + "learning_rate": 8.129916404001645e-06, + "loss": 0.2504, + "step": 14397 + }, + { + "epoch": 0.1953065653825285, + "grad_norm": 5.4295196533203125, + "learning_rate": 8.12977936138139e-06, + "loss": 0.3461, + "step": 14398 + }, + { + "epoch": 0.19532013022246336, + "grad_norm": 4.458568572998047, + "learning_rate": 8.129642318761135e-06, + "loss": 0.2838, + "step": 14399 + }, + { + "epoch": 0.19533369506239825, + "grad_norm": 6.6198954582214355, + "learning_rate": 8.129505276140882e-06, + "loss": 0.3901, + "step": 14400 + }, + { + "epoch": 0.19534725990233315, + "grad_norm": 7.370729446411133, + "learning_rate": 8.129368233520625e-06, + "loss": 0.5459, + "step": 14401 + }, + { + "epoch": 0.19536082474226804, + "grad_norm": 4.455202579498291, + "learning_rate": 8.12923119090037e-06, + "loss": 0.3093, + "step": 14402 + }, + { + "epoch": 0.19537438958220293, + "grad_norm": 7.179758071899414, + "learning_rate": 8.129094148280116e-06, + "loss": 0.4025, + "step": 14403 + }, + { + "epoch": 0.19538795442213783, + "grad_norm": 10.18262767791748, + "learning_rate": 8.128957105659861e-06, + "loss": 0.4006, + "step": 14404 + }, + { + "epoch": 0.19540151926207272, + "grad_norm": 5.770803451538086, + "learning_rate": 8.128820063039606e-06, + "loss": 0.4205, + "step": 14405 + }, + { + "epoch": 0.19541508410200759, + "grad_norm": 6.955652236938477, + "learning_rate": 8.128683020419351e-06, + "loss": 0.4228, + "step": 14406 + }, + { + "epoch": 0.19542864894194248, + "grad_norm": 5.774896621704102, + "learning_rate": 8.128545977799096e-06, + "loss": 0.4106, + "step": 14407 + }, + { + "epoch": 0.19544221378187737, + "grad_norm": 5.11589241027832, + "learning_rate": 8.128408935178842e-06, + "loss": 0.2228, + "step": 14408 + }, + { + "epoch": 0.19545577862181227, + "grad_norm": 5.210648059844971, + "learning_rate": 8.128271892558587e-06, + "loss": 0.3267, + "step": 14409 + }, + { + "epoch": 0.19546934346174716, + "grad_norm": 5.382449626922607, + "learning_rate": 8.128134849938332e-06, + "loss": 0.2898, + "step": 14410 + }, + { + "epoch": 0.19548290830168205, + "grad_norm": 8.1456880569458, + "learning_rate": 8.127997807318077e-06, + "loss": 0.3536, + "step": 14411 + }, + { + "epoch": 0.19549647314161692, + "grad_norm": 4.945853233337402, + "learning_rate": 8.12786076469782e-06, + "loss": 0.2101, + "step": 14412 + }, + { + "epoch": 0.1955100379815518, + "grad_norm": 6.453150749206543, + "learning_rate": 8.127723722077567e-06, + "loss": 0.4427, + "step": 14413 + }, + { + "epoch": 0.1955236028214867, + "grad_norm": 5.464242935180664, + "learning_rate": 8.127586679457313e-06, + "loss": 0.3601, + "step": 14414 + }, + { + "epoch": 0.1955371676614216, + "grad_norm": 6.381328105926514, + "learning_rate": 8.127449636837058e-06, + "loss": 0.3684, + "step": 14415 + }, + { + "epoch": 0.1955507325013565, + "grad_norm": 4.5989556312561035, + "learning_rate": 8.127312594216801e-06, + "loss": 0.337, + "step": 14416 + }, + { + "epoch": 0.19556429734129138, + "grad_norm": 6.707930564880371, + "learning_rate": 8.127175551596546e-06, + "loss": 0.3771, + "step": 14417 + }, + { + "epoch": 0.19557786218122628, + "grad_norm": 6.845596790313721, + "learning_rate": 8.127038508976293e-06, + "loss": 0.383, + "step": 14418 + }, + { + "epoch": 0.19559142702116114, + "grad_norm": 7.537614822387695, + "learning_rate": 8.126901466356037e-06, + "loss": 0.3896, + "step": 14419 + }, + { + "epoch": 0.19560499186109603, + "grad_norm": 7.031769752502441, + "learning_rate": 8.126764423735782e-06, + "loss": 0.4084, + "step": 14420 + }, + { + "epoch": 0.19561855670103093, + "grad_norm": 7.1245293617248535, + "learning_rate": 8.126627381115527e-06, + "loss": 0.3509, + "step": 14421 + }, + { + "epoch": 0.19563212154096582, + "grad_norm": 5.763484954833984, + "learning_rate": 8.126490338495272e-06, + "loss": 0.3308, + "step": 14422 + }, + { + "epoch": 0.1956456863809007, + "grad_norm": 5.760036945343018, + "learning_rate": 8.126353295875018e-06, + "loss": 0.4152, + "step": 14423 + }, + { + "epoch": 0.1956592512208356, + "grad_norm": 5.259757995605469, + "learning_rate": 8.126216253254763e-06, + "loss": 0.3664, + "step": 14424 + }, + { + "epoch": 0.19567281606077047, + "grad_norm": 7.309767246246338, + "learning_rate": 8.126079210634508e-06, + "loss": 0.3869, + "step": 14425 + }, + { + "epoch": 0.19568638090070536, + "grad_norm": 7.287147521972656, + "learning_rate": 8.125942168014253e-06, + "loss": 0.3998, + "step": 14426 + }, + { + "epoch": 0.19569994574064026, + "grad_norm": 4.610508918762207, + "learning_rate": 8.125805125393998e-06, + "loss": 0.2443, + "step": 14427 + }, + { + "epoch": 0.19571351058057515, + "grad_norm": 6.473265171051025, + "learning_rate": 8.125668082773743e-06, + "loss": 0.3789, + "step": 14428 + }, + { + "epoch": 0.19572707542051004, + "grad_norm": 5.177221775054932, + "learning_rate": 8.125531040153489e-06, + "loss": 0.3947, + "step": 14429 + }, + { + "epoch": 0.19574064026044494, + "grad_norm": 6.248337745666504, + "learning_rate": 8.125393997533234e-06, + "loss": 0.3269, + "step": 14430 + }, + { + "epoch": 0.1957542051003798, + "grad_norm": 8.262025833129883, + "learning_rate": 8.125256954912979e-06, + "loss": 0.4781, + "step": 14431 + }, + { + "epoch": 0.1957677699403147, + "grad_norm": 7.635774612426758, + "learning_rate": 8.125119912292724e-06, + "loss": 0.445, + "step": 14432 + }, + { + "epoch": 0.1957813347802496, + "grad_norm": 5.624651908874512, + "learning_rate": 8.12498286967247e-06, + "loss": 0.2661, + "step": 14433 + }, + { + "epoch": 0.19579489962018448, + "grad_norm": 5.50012731552124, + "learning_rate": 8.124845827052213e-06, + "loss": 0.3533, + "step": 14434 + }, + { + "epoch": 0.19580846446011937, + "grad_norm": 7.523571491241455, + "learning_rate": 8.12470878443196e-06, + "loss": 0.5585, + "step": 14435 + }, + { + "epoch": 0.19582202930005427, + "grad_norm": 6.6461873054504395, + "learning_rate": 8.124571741811705e-06, + "loss": 0.2707, + "step": 14436 + }, + { + "epoch": 0.19583559413998916, + "grad_norm": 6.673670768737793, + "learning_rate": 8.124434699191448e-06, + "loss": 0.3717, + "step": 14437 + }, + { + "epoch": 0.19584915897992403, + "grad_norm": 6.752872943878174, + "learning_rate": 8.124297656571194e-06, + "loss": 0.3786, + "step": 14438 + }, + { + "epoch": 0.19586272381985892, + "grad_norm": 7.688185691833496, + "learning_rate": 8.12416061395094e-06, + "loss": 0.4457, + "step": 14439 + }, + { + "epoch": 0.1958762886597938, + "grad_norm": 7.004549026489258, + "learning_rate": 8.124023571330686e-06, + "loss": 0.4252, + "step": 14440 + }, + { + "epoch": 0.1958898534997287, + "grad_norm": 11.019296646118164, + "learning_rate": 8.123886528710429e-06, + "loss": 0.6098, + "step": 14441 + }, + { + "epoch": 0.1959034183396636, + "grad_norm": 6.642143726348877, + "learning_rate": 8.123749486090174e-06, + "loss": 0.2178, + "step": 14442 + }, + { + "epoch": 0.1959169831795985, + "grad_norm": 8.96951961517334, + "learning_rate": 8.12361244346992e-06, + "loss": 0.5128, + "step": 14443 + }, + { + "epoch": 0.19593054801953336, + "grad_norm": 8.551429748535156, + "learning_rate": 8.123475400849665e-06, + "loss": 0.4253, + "step": 14444 + }, + { + "epoch": 0.19594411285946825, + "grad_norm": 7.567966461181641, + "learning_rate": 8.12333835822941e-06, + "loss": 0.3872, + "step": 14445 + }, + { + "epoch": 0.19595767769940314, + "grad_norm": 6.630322456359863, + "learning_rate": 8.123201315609155e-06, + "loss": 0.3791, + "step": 14446 + }, + { + "epoch": 0.19597124253933804, + "grad_norm": 7.239665508270264, + "learning_rate": 8.1230642729889e-06, + "loss": 0.3834, + "step": 14447 + }, + { + "epoch": 0.19598480737927293, + "grad_norm": 9.03656005859375, + "learning_rate": 8.122927230368645e-06, + "loss": 0.4706, + "step": 14448 + }, + { + "epoch": 0.19599837221920782, + "grad_norm": 7.166490077972412, + "learning_rate": 8.12279018774839e-06, + "loss": 0.3933, + "step": 14449 + }, + { + "epoch": 0.19601193705914272, + "grad_norm": 7.190013408660889, + "learning_rate": 8.122653145128136e-06, + "loss": 0.3522, + "step": 14450 + }, + { + "epoch": 0.19602550189907758, + "grad_norm": 6.578561782836914, + "learning_rate": 8.122516102507881e-06, + "loss": 0.3894, + "step": 14451 + }, + { + "epoch": 0.19603906673901247, + "grad_norm": 8.744507789611816, + "learning_rate": 8.122379059887626e-06, + "loss": 0.4923, + "step": 14452 + }, + { + "epoch": 0.19605263157894737, + "grad_norm": 6.721408843994141, + "learning_rate": 8.122242017267371e-06, + "loss": 0.3014, + "step": 14453 + }, + { + "epoch": 0.19606619641888226, + "grad_norm": 6.385499000549316, + "learning_rate": 8.122104974647116e-06, + "loss": 0.3548, + "step": 14454 + }, + { + "epoch": 0.19607976125881715, + "grad_norm": 5.857288837432861, + "learning_rate": 8.12196793202686e-06, + "loss": 0.4023, + "step": 14455 + }, + { + "epoch": 0.19609332609875205, + "grad_norm": 6.951540470123291, + "learning_rate": 8.121830889406605e-06, + "loss": 0.4229, + "step": 14456 + }, + { + "epoch": 0.1961068909386869, + "grad_norm": 6.606766223907471, + "learning_rate": 8.121693846786352e-06, + "loss": 0.3712, + "step": 14457 + }, + { + "epoch": 0.1961204557786218, + "grad_norm": 5.3477044105529785, + "learning_rate": 8.121556804166097e-06, + "loss": 0.3197, + "step": 14458 + }, + { + "epoch": 0.1961340206185567, + "grad_norm": 5.573415756225586, + "learning_rate": 8.12141976154584e-06, + "loss": 0.2615, + "step": 14459 + }, + { + "epoch": 0.1961475854584916, + "grad_norm": 8.307462692260742, + "learning_rate": 8.121282718925586e-06, + "loss": 0.3821, + "step": 14460 + }, + { + "epoch": 0.19616115029842648, + "grad_norm": 6.478147983551025, + "learning_rate": 8.121145676305333e-06, + "loss": 0.3532, + "step": 14461 + }, + { + "epoch": 0.19617471513836138, + "grad_norm": 7.260720252990723, + "learning_rate": 8.121008633685076e-06, + "loss": 0.4791, + "step": 14462 + }, + { + "epoch": 0.19618827997829624, + "grad_norm": 6.266078948974609, + "learning_rate": 8.120871591064821e-06, + "loss": 0.43, + "step": 14463 + }, + { + "epoch": 0.19620184481823114, + "grad_norm": 7.647133827209473, + "learning_rate": 8.120734548444566e-06, + "loss": 0.5243, + "step": 14464 + }, + { + "epoch": 0.19621540965816603, + "grad_norm": 7.280693054199219, + "learning_rate": 8.120597505824312e-06, + "loss": 0.4169, + "step": 14465 + }, + { + "epoch": 0.19622897449810092, + "grad_norm": 6.500672340393066, + "learning_rate": 8.120460463204057e-06, + "loss": 0.3806, + "step": 14466 + }, + { + "epoch": 0.19624253933803582, + "grad_norm": 6.195103168487549, + "learning_rate": 8.120323420583802e-06, + "loss": 0.4015, + "step": 14467 + }, + { + "epoch": 0.1962561041779707, + "grad_norm": 7.102145195007324, + "learning_rate": 8.120186377963547e-06, + "loss": 0.3666, + "step": 14468 + }, + { + "epoch": 0.1962696690179056, + "grad_norm": 7.608457088470459, + "learning_rate": 8.120049335343292e-06, + "loss": 0.422, + "step": 14469 + }, + { + "epoch": 0.19628323385784047, + "grad_norm": 9.043615341186523, + "learning_rate": 8.119912292723038e-06, + "loss": 0.5471, + "step": 14470 + }, + { + "epoch": 0.19629679869777536, + "grad_norm": 5.386162281036377, + "learning_rate": 8.119775250102783e-06, + "loss": 0.3201, + "step": 14471 + }, + { + "epoch": 0.19631036353771025, + "grad_norm": 6.975903511047363, + "learning_rate": 8.119638207482528e-06, + "loss": 0.4259, + "step": 14472 + }, + { + "epoch": 0.19632392837764515, + "grad_norm": 6.611083030700684, + "learning_rate": 8.119501164862273e-06, + "loss": 0.3388, + "step": 14473 + }, + { + "epoch": 0.19633749321758004, + "grad_norm": 7.870934963226318, + "learning_rate": 8.119364122242018e-06, + "loss": 0.406, + "step": 14474 + }, + { + "epoch": 0.19635105805751493, + "grad_norm": 6.114737033843994, + "learning_rate": 8.119227079621763e-06, + "loss": 0.4022, + "step": 14475 + }, + { + "epoch": 0.1963646228974498, + "grad_norm": 7.312180519104004, + "learning_rate": 8.119090037001509e-06, + "loss": 0.3512, + "step": 14476 + }, + { + "epoch": 0.1963781877373847, + "grad_norm": 6.705749034881592, + "learning_rate": 8.118952994381252e-06, + "loss": 0.4126, + "step": 14477 + }, + { + "epoch": 0.19639175257731958, + "grad_norm": 6.712070941925049, + "learning_rate": 8.118815951760999e-06, + "loss": 0.4105, + "step": 14478 + }, + { + "epoch": 0.19640531741725448, + "grad_norm": 6.337400436401367, + "learning_rate": 8.118678909140744e-06, + "loss": 0.4268, + "step": 14479 + }, + { + "epoch": 0.19641888225718937, + "grad_norm": 7.270570755004883, + "learning_rate": 8.118541866520488e-06, + "loss": 0.3686, + "step": 14480 + }, + { + "epoch": 0.19643244709712426, + "grad_norm": 6.822224140167236, + "learning_rate": 8.118404823900233e-06, + "loss": 0.5055, + "step": 14481 + }, + { + "epoch": 0.19644601193705916, + "grad_norm": 8.0436429977417, + "learning_rate": 8.11826778127998e-06, + "loss": 0.4137, + "step": 14482 + }, + { + "epoch": 0.19645957677699402, + "grad_norm": 8.263797760009766, + "learning_rate": 8.118130738659725e-06, + "loss": 0.5187, + "step": 14483 + }, + { + "epoch": 0.19647314161692891, + "grad_norm": 7.587871551513672, + "learning_rate": 8.117993696039468e-06, + "loss": 0.4376, + "step": 14484 + }, + { + "epoch": 0.1964867064568638, + "grad_norm": 7.011927127838135, + "learning_rate": 8.117856653419214e-06, + "loss": 0.4069, + "step": 14485 + }, + { + "epoch": 0.1965002712967987, + "grad_norm": 6.481893539428711, + "learning_rate": 8.117719610798959e-06, + "loss": 0.4734, + "step": 14486 + }, + { + "epoch": 0.1965138361367336, + "grad_norm": 5.908204555511475, + "learning_rate": 8.117582568178704e-06, + "loss": 0.4083, + "step": 14487 + }, + { + "epoch": 0.1965274009766685, + "grad_norm": 5.91915225982666, + "learning_rate": 8.117445525558449e-06, + "loss": 0.3807, + "step": 14488 + }, + { + "epoch": 0.19654096581660335, + "grad_norm": 5.425825595855713, + "learning_rate": 8.117308482938194e-06, + "loss": 0.3287, + "step": 14489 + }, + { + "epoch": 0.19655453065653825, + "grad_norm": 6.283257961273193, + "learning_rate": 8.11717144031794e-06, + "loss": 0.4843, + "step": 14490 + }, + { + "epoch": 0.19656809549647314, + "grad_norm": 5.8268280029296875, + "learning_rate": 8.117034397697685e-06, + "loss": 0.3817, + "step": 14491 + }, + { + "epoch": 0.19658166033640803, + "grad_norm": 6.742135047912598, + "learning_rate": 8.11689735507743e-06, + "loss": 0.4839, + "step": 14492 + }, + { + "epoch": 0.19659522517634292, + "grad_norm": 5.02752685546875, + "learning_rate": 8.116760312457175e-06, + "loss": 0.2912, + "step": 14493 + }, + { + "epoch": 0.19660879001627782, + "grad_norm": 5.4282941818237305, + "learning_rate": 8.11662326983692e-06, + "loss": 0.3354, + "step": 14494 + }, + { + "epoch": 0.19662235485621268, + "grad_norm": 6.071567058563232, + "learning_rate": 8.116486227216665e-06, + "loss": 0.4772, + "step": 14495 + }, + { + "epoch": 0.19663591969614758, + "grad_norm": 6.764385223388672, + "learning_rate": 8.11634918459641e-06, + "loss": 0.3813, + "step": 14496 + }, + { + "epoch": 0.19664948453608247, + "grad_norm": 7.605956554412842, + "learning_rate": 8.116212141976156e-06, + "loss": 0.3262, + "step": 14497 + }, + { + "epoch": 0.19666304937601736, + "grad_norm": 7.193119049072266, + "learning_rate": 8.116075099355901e-06, + "loss": 0.3422, + "step": 14498 + }, + { + "epoch": 0.19667661421595226, + "grad_norm": 4.966063022613525, + "learning_rate": 8.115938056735644e-06, + "loss": 0.301, + "step": 14499 + }, + { + "epoch": 0.19669017905588715, + "grad_norm": 7.796757698059082, + "learning_rate": 8.115801014115391e-06, + "loss": 0.5092, + "step": 14500 + }, + { + "epoch": 0.19670374389582204, + "grad_norm": 4.967424392700195, + "learning_rate": 8.115663971495136e-06, + "loss": 0.2805, + "step": 14501 + }, + { + "epoch": 0.1967173087357569, + "grad_norm": 4.98881721496582, + "learning_rate": 8.11552692887488e-06, + "loss": 0.2659, + "step": 14502 + }, + { + "epoch": 0.1967308735756918, + "grad_norm": 7.13886022567749, + "learning_rate": 8.115389886254625e-06, + "loss": 0.5053, + "step": 14503 + }, + { + "epoch": 0.1967444384156267, + "grad_norm": 4.480016231536865, + "learning_rate": 8.115252843634372e-06, + "loss": 0.3669, + "step": 14504 + }, + { + "epoch": 0.1967580032555616, + "grad_norm": 5.920527935028076, + "learning_rate": 8.115115801014115e-06, + "loss": 0.3551, + "step": 14505 + }, + { + "epoch": 0.19677156809549648, + "grad_norm": 7.74232292175293, + "learning_rate": 8.11497875839386e-06, + "loss": 0.4567, + "step": 14506 + }, + { + "epoch": 0.19678513293543137, + "grad_norm": 4.501244068145752, + "learning_rate": 8.114841715773606e-06, + "loss": 0.3492, + "step": 14507 + }, + { + "epoch": 0.19679869777536624, + "grad_norm": 6.011326313018799, + "learning_rate": 8.114704673153353e-06, + "loss": 0.3928, + "step": 14508 + }, + { + "epoch": 0.19681226261530113, + "grad_norm": 8.148894309997559, + "learning_rate": 8.114567630533096e-06, + "loss": 0.3279, + "step": 14509 + }, + { + "epoch": 0.19682582745523602, + "grad_norm": 5.408169269561768, + "learning_rate": 8.114430587912841e-06, + "loss": 0.2303, + "step": 14510 + }, + { + "epoch": 0.19683939229517092, + "grad_norm": 5.882409572601318, + "learning_rate": 8.114293545292587e-06, + "loss": 0.4212, + "step": 14511 + }, + { + "epoch": 0.1968529571351058, + "grad_norm": 5.6989359855651855, + "learning_rate": 8.114156502672332e-06, + "loss": 0.3974, + "step": 14512 + }, + { + "epoch": 0.1968665219750407, + "grad_norm": 5.415829181671143, + "learning_rate": 8.114019460052077e-06, + "loss": 0.458, + "step": 14513 + }, + { + "epoch": 0.1968800868149756, + "grad_norm": 8.366894721984863, + "learning_rate": 8.113882417431822e-06, + "loss": 0.4057, + "step": 14514 + }, + { + "epoch": 0.19689365165491046, + "grad_norm": 6.8979034423828125, + "learning_rate": 8.113745374811567e-06, + "loss": 0.3589, + "step": 14515 + }, + { + "epoch": 0.19690721649484536, + "grad_norm": 6.7420759201049805, + "learning_rate": 8.113608332191312e-06, + "loss": 0.3674, + "step": 14516 + }, + { + "epoch": 0.19692078133478025, + "grad_norm": 4.309848785400391, + "learning_rate": 8.113471289571058e-06, + "loss": 0.2883, + "step": 14517 + }, + { + "epoch": 0.19693434617471514, + "grad_norm": 6.073111534118652, + "learning_rate": 8.113334246950803e-06, + "loss": 0.3129, + "step": 14518 + }, + { + "epoch": 0.19694791101465003, + "grad_norm": 4.920838832855225, + "learning_rate": 8.113197204330548e-06, + "loss": 0.3141, + "step": 14519 + }, + { + "epoch": 0.19696147585458493, + "grad_norm": 6.876121997833252, + "learning_rate": 8.113060161710291e-06, + "loss": 0.4379, + "step": 14520 + }, + { + "epoch": 0.1969750406945198, + "grad_norm": 5.486571788787842, + "learning_rate": 8.112923119090038e-06, + "loss": 0.4433, + "step": 14521 + }, + { + "epoch": 0.19698860553445469, + "grad_norm": 4.946741580963135, + "learning_rate": 8.112786076469783e-06, + "loss": 0.415, + "step": 14522 + }, + { + "epoch": 0.19700217037438958, + "grad_norm": 6.014376163482666, + "learning_rate": 8.112649033849529e-06, + "loss": 0.4106, + "step": 14523 + }, + { + "epoch": 0.19701573521432447, + "grad_norm": 6.567007064819336, + "learning_rate": 8.112511991229272e-06, + "loss": 0.3359, + "step": 14524 + }, + { + "epoch": 0.19702930005425937, + "grad_norm": 7.660557746887207, + "learning_rate": 8.112374948609017e-06, + "loss": 0.5458, + "step": 14525 + }, + { + "epoch": 0.19704286489419426, + "grad_norm": 7.446781158447266, + "learning_rate": 8.112237905988764e-06, + "loss": 0.5339, + "step": 14526 + }, + { + "epoch": 0.19705642973412912, + "grad_norm": 6.801164150238037, + "learning_rate": 8.112100863368508e-06, + "loss": 0.4343, + "step": 14527 + }, + { + "epoch": 0.19706999457406402, + "grad_norm": 6.367506980895996, + "learning_rate": 8.111963820748253e-06, + "loss": 0.41, + "step": 14528 + }, + { + "epoch": 0.1970835594139989, + "grad_norm": 7.352313995361328, + "learning_rate": 8.111826778127998e-06, + "loss": 0.6553, + "step": 14529 + }, + { + "epoch": 0.1970971242539338, + "grad_norm": 6.942520618438721, + "learning_rate": 8.111689735507743e-06, + "loss": 0.4153, + "step": 14530 + }, + { + "epoch": 0.1971106890938687, + "grad_norm": 9.022764205932617, + "learning_rate": 8.111552692887488e-06, + "loss": 0.5092, + "step": 14531 + }, + { + "epoch": 0.1971242539338036, + "grad_norm": 6.0184783935546875, + "learning_rate": 8.111415650267234e-06, + "loss": 0.3767, + "step": 14532 + }, + { + "epoch": 0.19713781877373848, + "grad_norm": 6.770558834075928, + "learning_rate": 8.111278607646979e-06, + "loss": 0.3542, + "step": 14533 + }, + { + "epoch": 0.19715138361367335, + "grad_norm": 5.88231897354126, + "learning_rate": 8.111141565026724e-06, + "loss": 0.466, + "step": 14534 + }, + { + "epoch": 0.19716494845360824, + "grad_norm": 6.249835968017578, + "learning_rate": 8.111004522406469e-06, + "loss": 0.3979, + "step": 14535 + }, + { + "epoch": 0.19717851329354313, + "grad_norm": 5.005854606628418, + "learning_rate": 8.110867479786214e-06, + "loss": 0.3653, + "step": 14536 + }, + { + "epoch": 0.19719207813347803, + "grad_norm": 3.8633015155792236, + "learning_rate": 8.11073043716596e-06, + "loss": 0.2196, + "step": 14537 + }, + { + "epoch": 0.19720564297341292, + "grad_norm": 6.106466293334961, + "learning_rate": 8.110593394545705e-06, + "loss": 0.3152, + "step": 14538 + }, + { + "epoch": 0.1972192078133478, + "grad_norm": 6.049648284912109, + "learning_rate": 8.11045635192545e-06, + "loss": 0.3416, + "step": 14539 + }, + { + "epoch": 0.19723277265328268, + "grad_norm": 7.240511417388916, + "learning_rate": 8.110319309305195e-06, + "loss": 0.3229, + "step": 14540 + }, + { + "epoch": 0.19724633749321757, + "grad_norm": 5.095923900604248, + "learning_rate": 8.11018226668494e-06, + "loss": 0.3401, + "step": 14541 + }, + { + "epoch": 0.19725990233315246, + "grad_norm": 5.253385066986084, + "learning_rate": 8.110045224064684e-06, + "loss": 0.3377, + "step": 14542 + }, + { + "epoch": 0.19727346717308736, + "grad_norm": 5.933538436889648, + "learning_rate": 8.10990818144443e-06, + "loss": 0.2912, + "step": 14543 + }, + { + "epoch": 0.19728703201302225, + "grad_norm": 4.78742790222168, + "learning_rate": 8.109771138824176e-06, + "loss": 0.3454, + "step": 14544 + }, + { + "epoch": 0.19730059685295714, + "grad_norm": 6.829317092895508, + "learning_rate": 8.10963409620392e-06, + "loss": 0.3363, + "step": 14545 + }, + { + "epoch": 0.19731416169289204, + "grad_norm": 7.220126152038574, + "learning_rate": 8.109497053583664e-06, + "loss": 0.3327, + "step": 14546 + }, + { + "epoch": 0.1973277265328269, + "grad_norm": 7.446463108062744, + "learning_rate": 8.109360010963411e-06, + "loss": 0.4434, + "step": 14547 + }, + { + "epoch": 0.1973412913727618, + "grad_norm": 5.038612365722656, + "learning_rate": 8.109222968343155e-06, + "loss": 0.3023, + "step": 14548 + }, + { + "epoch": 0.1973548562126967, + "grad_norm": 5.423382759094238, + "learning_rate": 8.1090859257229e-06, + "loss": 0.4042, + "step": 14549 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 6.902701377868652, + "learning_rate": 8.108948883102645e-06, + "loss": 0.3866, + "step": 14550 + }, + { + "epoch": 0.19738198589256647, + "grad_norm": 5.235330581665039, + "learning_rate": 8.108811840482392e-06, + "loss": 0.3531, + "step": 14551 + }, + { + "epoch": 0.19739555073250137, + "grad_norm": 6.096275806427002, + "learning_rate": 8.108674797862135e-06, + "loss": 0.3889, + "step": 14552 + }, + { + "epoch": 0.19740911557243623, + "grad_norm": 7.28053092956543, + "learning_rate": 8.10853775524188e-06, + "loss": 0.4967, + "step": 14553 + }, + { + "epoch": 0.19742268041237113, + "grad_norm": 6.220334053039551, + "learning_rate": 8.108400712621626e-06, + "loss": 0.359, + "step": 14554 + }, + { + "epoch": 0.19743624525230602, + "grad_norm": 4.86581563949585, + "learning_rate": 8.108263670001371e-06, + "loss": 0.2312, + "step": 14555 + }, + { + "epoch": 0.1974498100922409, + "grad_norm": 5.1860032081604, + "learning_rate": 8.108126627381116e-06, + "loss": 0.2855, + "step": 14556 + }, + { + "epoch": 0.1974633749321758, + "grad_norm": 3.46342134475708, + "learning_rate": 8.107989584760861e-06, + "loss": 0.2049, + "step": 14557 + }, + { + "epoch": 0.1974769397721107, + "grad_norm": 5.600778579711914, + "learning_rate": 8.107852542140607e-06, + "loss": 0.5252, + "step": 14558 + }, + { + "epoch": 0.19749050461204556, + "grad_norm": 5.3563618659973145, + "learning_rate": 8.107715499520352e-06, + "loss": 0.3143, + "step": 14559 + }, + { + "epoch": 0.19750406945198046, + "grad_norm": 6.030886173248291, + "learning_rate": 8.107578456900097e-06, + "loss": 0.3555, + "step": 14560 + }, + { + "epoch": 0.19751763429191535, + "grad_norm": 6.522048473358154, + "learning_rate": 8.107441414279842e-06, + "loss": 0.4679, + "step": 14561 + }, + { + "epoch": 0.19753119913185024, + "grad_norm": 5.547435760498047, + "learning_rate": 8.107304371659587e-06, + "loss": 0.3818, + "step": 14562 + }, + { + "epoch": 0.19754476397178514, + "grad_norm": 4.2503790855407715, + "learning_rate": 8.10716732903933e-06, + "loss": 0.286, + "step": 14563 + }, + { + "epoch": 0.19755832881172003, + "grad_norm": 4.7151970863342285, + "learning_rate": 8.107030286419078e-06, + "loss": 0.4564, + "step": 14564 + }, + { + "epoch": 0.19757189365165492, + "grad_norm": 4.478053092956543, + "learning_rate": 8.106893243798823e-06, + "loss": 0.2654, + "step": 14565 + }, + { + "epoch": 0.1975854584915898, + "grad_norm": 5.483048915863037, + "learning_rate": 8.106756201178568e-06, + "loss": 0.283, + "step": 14566 + }, + { + "epoch": 0.19759902333152468, + "grad_norm": 4.941550254821777, + "learning_rate": 8.106619158558311e-06, + "loss": 0.3617, + "step": 14567 + }, + { + "epoch": 0.19761258817145957, + "grad_norm": 6.095035076141357, + "learning_rate": 8.106482115938057e-06, + "loss": 0.3706, + "step": 14568 + }, + { + "epoch": 0.19762615301139447, + "grad_norm": 6.214139461517334, + "learning_rate": 8.106345073317804e-06, + "loss": 0.4975, + "step": 14569 + }, + { + "epoch": 0.19763971785132936, + "grad_norm": 4.970876693725586, + "learning_rate": 8.106208030697547e-06, + "loss": 0.3665, + "step": 14570 + }, + { + "epoch": 0.19765328269126425, + "grad_norm": 4.426906585693359, + "learning_rate": 8.106070988077292e-06, + "loss": 0.2996, + "step": 14571 + }, + { + "epoch": 0.19766684753119912, + "grad_norm": 4.977828502655029, + "learning_rate": 8.105933945457037e-06, + "loss": 0.2791, + "step": 14572 + }, + { + "epoch": 0.197680412371134, + "grad_norm": 5.71056604385376, + "learning_rate": 8.105796902836783e-06, + "loss": 0.3521, + "step": 14573 + }, + { + "epoch": 0.1976939772110689, + "grad_norm": 6.8168864250183105, + "learning_rate": 8.105659860216528e-06, + "loss": 0.5008, + "step": 14574 + }, + { + "epoch": 0.1977075420510038, + "grad_norm": 9.265439987182617, + "learning_rate": 8.105522817596273e-06, + "loss": 0.5644, + "step": 14575 + }, + { + "epoch": 0.1977211068909387, + "grad_norm": 4.755455017089844, + "learning_rate": 8.105385774976018e-06, + "loss": 0.3893, + "step": 14576 + }, + { + "epoch": 0.19773467173087358, + "grad_norm": 6.787478923797607, + "learning_rate": 8.105248732355763e-06, + "loss": 0.3626, + "step": 14577 + }, + { + "epoch": 0.19774823657080848, + "grad_norm": 7.053581237792969, + "learning_rate": 8.105111689735508e-06, + "loss": 0.4463, + "step": 14578 + }, + { + "epoch": 0.19776180141074334, + "grad_norm": 5.379461288452148, + "learning_rate": 8.104974647115254e-06, + "loss": 0.3049, + "step": 14579 + }, + { + "epoch": 0.19777536625067824, + "grad_norm": 5.789447784423828, + "learning_rate": 8.104837604494999e-06, + "loss": 0.5122, + "step": 14580 + }, + { + "epoch": 0.19778893109061313, + "grad_norm": 7.360311985015869, + "learning_rate": 8.104700561874744e-06, + "loss": 0.4564, + "step": 14581 + }, + { + "epoch": 0.19780249593054802, + "grad_norm": 7.876959800720215, + "learning_rate": 8.104563519254489e-06, + "loss": 0.4906, + "step": 14582 + }, + { + "epoch": 0.19781606077048292, + "grad_norm": 7.082386016845703, + "learning_rate": 8.104426476634234e-06, + "loss": 0.3936, + "step": 14583 + }, + { + "epoch": 0.1978296256104178, + "grad_norm": 5.137558460235596, + "learning_rate": 8.10428943401398e-06, + "loss": 0.2904, + "step": 14584 + }, + { + "epoch": 0.19784319045035267, + "grad_norm": 6.503767967224121, + "learning_rate": 8.104152391393723e-06, + "loss": 0.3791, + "step": 14585 + }, + { + "epoch": 0.19785675529028757, + "grad_norm": 8.058661460876465, + "learning_rate": 8.10401534877347e-06, + "loss": 0.4972, + "step": 14586 + }, + { + "epoch": 0.19787032013022246, + "grad_norm": 5.669619560241699, + "learning_rate": 8.103878306153215e-06, + "loss": 0.3704, + "step": 14587 + }, + { + "epoch": 0.19788388497015735, + "grad_norm": 7.020537853240967, + "learning_rate": 8.103741263532959e-06, + "loss": 0.3716, + "step": 14588 + }, + { + "epoch": 0.19789744981009225, + "grad_norm": 6.2149200439453125, + "learning_rate": 8.103604220912704e-06, + "loss": 0.406, + "step": 14589 + }, + { + "epoch": 0.19791101465002714, + "grad_norm": 6.247465133666992, + "learning_rate": 8.10346717829245e-06, + "loss": 0.3065, + "step": 14590 + }, + { + "epoch": 0.197924579489962, + "grad_norm": 7.6015472412109375, + "learning_rate": 8.103330135672196e-06, + "loss": 0.3668, + "step": 14591 + }, + { + "epoch": 0.1979381443298969, + "grad_norm": 5.973147869110107, + "learning_rate": 8.10319309305194e-06, + "loss": 0.2898, + "step": 14592 + }, + { + "epoch": 0.1979517091698318, + "grad_norm": 6.674647808074951, + "learning_rate": 8.103056050431684e-06, + "loss": 0.4336, + "step": 14593 + }, + { + "epoch": 0.19796527400976668, + "grad_norm": 4.445521831512451, + "learning_rate": 8.10291900781143e-06, + "loss": 0.2454, + "step": 14594 + }, + { + "epoch": 0.19797883884970158, + "grad_norm": 4.3296709060668945, + "learning_rate": 8.102781965191175e-06, + "loss": 0.3029, + "step": 14595 + }, + { + "epoch": 0.19799240368963647, + "grad_norm": 4.928738594055176, + "learning_rate": 8.10264492257092e-06, + "loss": 0.3599, + "step": 14596 + }, + { + "epoch": 0.19800596852957136, + "grad_norm": 6.095935821533203, + "learning_rate": 8.102507879950665e-06, + "loss": 0.3752, + "step": 14597 + }, + { + "epoch": 0.19801953336950623, + "grad_norm": 5.647626876831055, + "learning_rate": 8.10237083733041e-06, + "loss": 0.4006, + "step": 14598 + }, + { + "epoch": 0.19803309820944112, + "grad_norm": 3.8257689476013184, + "learning_rate": 8.102233794710155e-06, + "loss": 0.2476, + "step": 14599 + }, + { + "epoch": 0.19804666304937601, + "grad_norm": 6.106451034545898, + "learning_rate": 8.1020967520899e-06, + "loss": 0.4152, + "step": 14600 + }, + { + "epoch": 0.1980602278893109, + "grad_norm": 4.990822792053223, + "learning_rate": 8.101959709469646e-06, + "loss": 0.245, + "step": 14601 + }, + { + "epoch": 0.1980737927292458, + "grad_norm": 4.956543922424316, + "learning_rate": 8.101822666849391e-06, + "loss": 0.3449, + "step": 14602 + }, + { + "epoch": 0.1980873575691807, + "grad_norm": 5.250104904174805, + "learning_rate": 8.101685624229136e-06, + "loss": 0.4131, + "step": 14603 + }, + { + "epoch": 0.19810092240911556, + "grad_norm": 5.114770889282227, + "learning_rate": 8.101548581608881e-06, + "loss": 0.3041, + "step": 14604 + }, + { + "epoch": 0.19811448724905045, + "grad_norm": 5.0599188804626465, + "learning_rate": 8.101411538988627e-06, + "loss": 0.3983, + "step": 14605 + }, + { + "epoch": 0.19812805208898535, + "grad_norm": 8.15701675415039, + "learning_rate": 8.101274496368372e-06, + "loss": 0.4202, + "step": 14606 + }, + { + "epoch": 0.19814161692892024, + "grad_norm": 5.007097244262695, + "learning_rate": 8.101137453748115e-06, + "loss": 0.3352, + "step": 14607 + }, + { + "epoch": 0.19815518176885513, + "grad_norm": 5.4081950187683105, + "learning_rate": 8.101000411127862e-06, + "loss": 0.3459, + "step": 14608 + }, + { + "epoch": 0.19816874660879003, + "grad_norm": 3.379072427749634, + "learning_rate": 8.100863368507607e-06, + "loss": 0.1577, + "step": 14609 + }, + { + "epoch": 0.19818231144872492, + "grad_norm": 6.289524555206299, + "learning_rate": 8.10072632588735e-06, + "loss": 0.3986, + "step": 14610 + }, + { + "epoch": 0.19819587628865978, + "grad_norm": 5.44557523727417, + "learning_rate": 8.100589283267096e-06, + "loss": 0.2449, + "step": 14611 + }, + { + "epoch": 0.19820944112859468, + "grad_norm": 5.763784885406494, + "learning_rate": 8.100452240646843e-06, + "loss": 0.3398, + "step": 14612 + }, + { + "epoch": 0.19822300596852957, + "grad_norm": 6.771361827850342, + "learning_rate": 8.100315198026586e-06, + "loss": 0.4006, + "step": 14613 + }, + { + "epoch": 0.19823657080846446, + "grad_norm": 5.188754081726074, + "learning_rate": 8.100178155406331e-06, + "loss": 0.2611, + "step": 14614 + }, + { + "epoch": 0.19825013564839936, + "grad_norm": 5.412836074829102, + "learning_rate": 8.100041112786077e-06, + "loss": 0.3877, + "step": 14615 + }, + { + "epoch": 0.19826370048833425, + "grad_norm": 4.861743927001953, + "learning_rate": 8.099904070165824e-06, + "loss": 0.299, + "step": 14616 + }, + { + "epoch": 0.19827726532826911, + "grad_norm": 4.397434234619141, + "learning_rate": 8.099767027545567e-06, + "loss": 0.2928, + "step": 14617 + }, + { + "epoch": 0.198290830168204, + "grad_norm": 5.195379257202148, + "learning_rate": 8.099629984925312e-06, + "loss": 0.3177, + "step": 14618 + }, + { + "epoch": 0.1983043950081389, + "grad_norm": 6.989655494689941, + "learning_rate": 8.099492942305057e-06, + "loss": 0.3646, + "step": 14619 + }, + { + "epoch": 0.1983179598480738, + "grad_norm": 8.833510398864746, + "learning_rate": 8.099355899684803e-06, + "loss": 0.5477, + "step": 14620 + }, + { + "epoch": 0.1983315246880087, + "grad_norm": 5.970418453216553, + "learning_rate": 8.099218857064548e-06, + "loss": 0.4578, + "step": 14621 + }, + { + "epoch": 0.19834508952794358, + "grad_norm": 6.502279281616211, + "learning_rate": 8.099081814444293e-06, + "loss": 0.3071, + "step": 14622 + }, + { + "epoch": 0.19835865436787845, + "grad_norm": 6.529483318328857, + "learning_rate": 8.098944771824038e-06, + "loss": 0.2788, + "step": 14623 + }, + { + "epoch": 0.19837221920781334, + "grad_norm": 5.938624858856201, + "learning_rate": 8.098807729203783e-06, + "loss": 0.4223, + "step": 14624 + }, + { + "epoch": 0.19838578404774823, + "grad_norm": 6.150099277496338, + "learning_rate": 8.098670686583528e-06, + "loss": 0.2461, + "step": 14625 + }, + { + "epoch": 0.19839934888768312, + "grad_norm": 6.229095935821533, + "learning_rate": 8.098533643963274e-06, + "loss": 0.4515, + "step": 14626 + }, + { + "epoch": 0.19841291372761802, + "grad_norm": 6.118956565856934, + "learning_rate": 8.098396601343019e-06, + "loss": 0.4588, + "step": 14627 + }, + { + "epoch": 0.1984264785675529, + "grad_norm": 5.3375725746154785, + "learning_rate": 8.098259558722762e-06, + "loss": 0.2301, + "step": 14628 + }, + { + "epoch": 0.1984400434074878, + "grad_norm": 6.339353084564209, + "learning_rate": 8.09812251610251e-06, + "loss": 0.4111, + "step": 14629 + }, + { + "epoch": 0.19845360824742267, + "grad_norm": 5.905339241027832, + "learning_rate": 8.097985473482254e-06, + "loss": 0.4517, + "step": 14630 + }, + { + "epoch": 0.19846717308735756, + "grad_norm": 7.096480846405029, + "learning_rate": 8.097848430862e-06, + "loss": 0.5267, + "step": 14631 + }, + { + "epoch": 0.19848073792729246, + "grad_norm": 5.076213836669922, + "learning_rate": 8.097711388241743e-06, + "loss": 0.2529, + "step": 14632 + }, + { + "epoch": 0.19849430276722735, + "grad_norm": 5.1324968338012695, + "learning_rate": 8.09757434562149e-06, + "loss": 0.2728, + "step": 14633 + }, + { + "epoch": 0.19850786760716224, + "grad_norm": 7.237133979797363, + "learning_rate": 8.097437303001235e-06, + "loss": 0.4442, + "step": 14634 + }, + { + "epoch": 0.19852143244709713, + "grad_norm": 9.94486141204834, + "learning_rate": 8.097300260380979e-06, + "loss": 0.4166, + "step": 14635 + }, + { + "epoch": 0.198534997287032, + "grad_norm": 8.07753849029541, + "learning_rate": 8.097163217760724e-06, + "loss": 0.5296, + "step": 14636 + }, + { + "epoch": 0.1985485621269669, + "grad_norm": 7.365403175354004, + "learning_rate": 8.097026175140469e-06, + "loss": 0.4646, + "step": 14637 + }, + { + "epoch": 0.1985621269669018, + "grad_norm": 5.630356788635254, + "learning_rate": 8.096889132520214e-06, + "loss": 0.3734, + "step": 14638 + }, + { + "epoch": 0.19857569180683668, + "grad_norm": 6.704692840576172, + "learning_rate": 8.09675208989996e-06, + "loss": 0.3612, + "step": 14639 + }, + { + "epoch": 0.19858925664677157, + "grad_norm": 6.834286689758301, + "learning_rate": 8.096615047279704e-06, + "loss": 0.318, + "step": 14640 + }, + { + "epoch": 0.19860282148670647, + "grad_norm": 6.766645431518555, + "learning_rate": 8.09647800465945e-06, + "loss": 0.388, + "step": 14641 + }, + { + "epoch": 0.19861638632664136, + "grad_norm": 8.334283828735352, + "learning_rate": 8.096340962039195e-06, + "loss": 0.4496, + "step": 14642 + }, + { + "epoch": 0.19862995116657622, + "grad_norm": 5.82867431640625, + "learning_rate": 8.09620391941894e-06, + "loss": 0.3141, + "step": 14643 + }, + { + "epoch": 0.19864351600651112, + "grad_norm": 6.428017616271973, + "learning_rate": 8.096066876798685e-06, + "loss": 0.2996, + "step": 14644 + }, + { + "epoch": 0.198657080846446, + "grad_norm": 7.436252117156982, + "learning_rate": 8.09592983417843e-06, + "loss": 0.3402, + "step": 14645 + }, + { + "epoch": 0.1986706456863809, + "grad_norm": 8.598910331726074, + "learning_rate": 8.095792791558176e-06, + "loss": 0.4523, + "step": 14646 + }, + { + "epoch": 0.1986842105263158, + "grad_norm": 5.970219135284424, + "learning_rate": 8.09565574893792e-06, + "loss": 0.3373, + "step": 14647 + }, + { + "epoch": 0.1986977753662507, + "grad_norm": 6.054995059967041, + "learning_rate": 8.095518706317666e-06, + "loss": 0.3258, + "step": 14648 + }, + { + "epoch": 0.19871134020618555, + "grad_norm": 6.871496677398682, + "learning_rate": 8.095381663697411e-06, + "loss": 0.3626, + "step": 14649 + }, + { + "epoch": 0.19872490504612045, + "grad_norm": 6.731443881988525, + "learning_rate": 8.095244621077155e-06, + "loss": 0.363, + "step": 14650 + }, + { + "epoch": 0.19873846988605534, + "grad_norm": 8.034820556640625, + "learning_rate": 8.095107578456901e-06, + "loss": 0.472, + "step": 14651 + }, + { + "epoch": 0.19875203472599023, + "grad_norm": 4.890596866607666, + "learning_rate": 8.094970535836647e-06, + "loss": 0.3367, + "step": 14652 + }, + { + "epoch": 0.19876559956592513, + "grad_norm": 7.622481346130371, + "learning_rate": 8.09483349321639e-06, + "loss": 0.4656, + "step": 14653 + }, + { + "epoch": 0.19877916440586002, + "grad_norm": 7.702210903167725, + "learning_rate": 8.094696450596135e-06, + "loss": 0.5452, + "step": 14654 + }, + { + "epoch": 0.19879272924579489, + "grad_norm": 8.313352584838867, + "learning_rate": 8.094559407975882e-06, + "loss": 0.5376, + "step": 14655 + }, + { + "epoch": 0.19880629408572978, + "grad_norm": 6.663395404815674, + "learning_rate": 8.094422365355626e-06, + "loss": 0.3937, + "step": 14656 + }, + { + "epoch": 0.19881985892566467, + "grad_norm": 5.813442230224609, + "learning_rate": 8.09428532273537e-06, + "loss": 0.4901, + "step": 14657 + }, + { + "epoch": 0.19883342376559957, + "grad_norm": 6.18284797668457, + "learning_rate": 8.094148280115116e-06, + "loss": 0.3814, + "step": 14658 + }, + { + "epoch": 0.19884698860553446, + "grad_norm": 7.5629801750183105, + "learning_rate": 8.094011237494863e-06, + "loss": 0.4669, + "step": 14659 + }, + { + "epoch": 0.19886055344546935, + "grad_norm": 7.001232624053955, + "learning_rate": 8.093874194874606e-06, + "loss": 0.3566, + "step": 14660 + }, + { + "epoch": 0.19887411828540424, + "grad_norm": 5.235766410827637, + "learning_rate": 8.093737152254351e-06, + "loss": 0.449, + "step": 14661 + }, + { + "epoch": 0.1988876831253391, + "grad_norm": 7.057112693786621, + "learning_rate": 8.093600109634097e-06, + "loss": 0.4033, + "step": 14662 + }, + { + "epoch": 0.198901247965274, + "grad_norm": 9.031946182250977, + "learning_rate": 8.093463067013842e-06, + "loss": 0.5247, + "step": 14663 + }, + { + "epoch": 0.1989148128052089, + "grad_norm": 7.9750776290893555, + "learning_rate": 8.093326024393587e-06, + "loss": 0.5316, + "step": 14664 + }, + { + "epoch": 0.1989283776451438, + "grad_norm": 5.823146343231201, + "learning_rate": 8.093188981773332e-06, + "loss": 0.3116, + "step": 14665 + }, + { + "epoch": 0.19894194248507868, + "grad_norm": 8.599228858947754, + "learning_rate": 8.093051939153077e-06, + "loss": 0.5532, + "step": 14666 + }, + { + "epoch": 0.19895550732501358, + "grad_norm": 7.638144016265869, + "learning_rate": 8.092914896532823e-06, + "loss": 0.4004, + "step": 14667 + }, + { + "epoch": 0.19896907216494844, + "grad_norm": 7.228383541107178, + "learning_rate": 8.092777853912568e-06, + "loss": 0.5092, + "step": 14668 + }, + { + "epoch": 0.19898263700488333, + "grad_norm": 10.273727416992188, + "learning_rate": 8.092640811292313e-06, + "loss": 0.7775, + "step": 14669 + }, + { + "epoch": 0.19899620184481823, + "grad_norm": 4.402703285217285, + "learning_rate": 8.092503768672058e-06, + "loss": 0.3008, + "step": 14670 + }, + { + "epoch": 0.19900976668475312, + "grad_norm": 10.52557373046875, + "learning_rate": 8.092366726051802e-06, + "loss": 0.7884, + "step": 14671 + }, + { + "epoch": 0.199023331524688, + "grad_norm": 5.865886211395264, + "learning_rate": 8.092229683431548e-06, + "loss": 0.6026, + "step": 14672 + }, + { + "epoch": 0.1990368963646229, + "grad_norm": 7.4099931716918945, + "learning_rate": 8.092092640811294e-06, + "loss": 0.6732, + "step": 14673 + }, + { + "epoch": 0.1990504612045578, + "grad_norm": 4.999007225036621, + "learning_rate": 8.091955598191039e-06, + "loss": 0.4125, + "step": 14674 + }, + { + "epoch": 0.19906402604449266, + "grad_norm": 6.344309329986572, + "learning_rate": 8.091818555570782e-06, + "loss": 0.4099, + "step": 14675 + }, + { + "epoch": 0.19907759088442756, + "grad_norm": 8.291714668273926, + "learning_rate": 8.091681512950527e-06, + "loss": 0.6941, + "step": 14676 + }, + { + "epoch": 0.19909115572436245, + "grad_norm": 5.4994049072265625, + "learning_rate": 8.091544470330274e-06, + "loss": 0.3451, + "step": 14677 + }, + { + "epoch": 0.19910472056429734, + "grad_norm": 8.494041442871094, + "learning_rate": 8.091407427710018e-06, + "loss": 0.4897, + "step": 14678 + }, + { + "epoch": 0.19911828540423224, + "grad_norm": 6.696977138519287, + "learning_rate": 8.091270385089763e-06, + "loss": 0.3613, + "step": 14679 + }, + { + "epoch": 0.19913185024416713, + "grad_norm": 6.1275529861450195, + "learning_rate": 8.091133342469508e-06, + "loss": 0.4985, + "step": 14680 + }, + { + "epoch": 0.199145415084102, + "grad_norm": 6.356073379516602, + "learning_rate": 8.090996299849253e-06, + "loss": 0.4786, + "step": 14681 + }, + { + "epoch": 0.1991589799240369, + "grad_norm": 4.066349983215332, + "learning_rate": 8.090859257228999e-06, + "loss": 0.2686, + "step": 14682 + }, + { + "epoch": 0.19917254476397178, + "grad_norm": 7.2308573722839355, + "learning_rate": 8.090722214608744e-06, + "loss": 0.6519, + "step": 14683 + }, + { + "epoch": 0.19918610960390667, + "grad_norm": 5.637345790863037, + "learning_rate": 8.090585171988489e-06, + "loss": 0.395, + "step": 14684 + }, + { + "epoch": 0.19919967444384157, + "grad_norm": 6.1158857345581055, + "learning_rate": 8.090448129368234e-06, + "loss": 0.4583, + "step": 14685 + }, + { + "epoch": 0.19921323928377646, + "grad_norm": 6.90792179107666, + "learning_rate": 8.09031108674798e-06, + "loss": 0.45, + "step": 14686 + }, + { + "epoch": 0.19922680412371135, + "grad_norm": 6.170573711395264, + "learning_rate": 8.090174044127724e-06, + "loss": 0.4559, + "step": 14687 + }, + { + "epoch": 0.19924036896364622, + "grad_norm": 7.420011520385742, + "learning_rate": 8.09003700150747e-06, + "loss": 0.5569, + "step": 14688 + }, + { + "epoch": 0.1992539338035811, + "grad_norm": 7.524966716766357, + "learning_rate": 8.089899958887215e-06, + "loss": 0.4797, + "step": 14689 + }, + { + "epoch": 0.199267498643516, + "grad_norm": 8.408851623535156, + "learning_rate": 8.08976291626696e-06, + "loss": 0.466, + "step": 14690 + }, + { + "epoch": 0.1992810634834509, + "grad_norm": 5.102217674255371, + "learning_rate": 8.089625873646705e-06, + "loss": 0.4423, + "step": 14691 + }, + { + "epoch": 0.1992946283233858, + "grad_norm": 5.056593418121338, + "learning_rate": 8.08948883102645e-06, + "loss": 0.393, + "step": 14692 + }, + { + "epoch": 0.19930819316332068, + "grad_norm": 5.362679958343506, + "learning_rate": 8.089351788406194e-06, + "loss": 0.4457, + "step": 14693 + }, + { + "epoch": 0.19932175800325555, + "grad_norm": 6.134350776672363, + "learning_rate": 8.08921474578594e-06, + "loss": 0.4402, + "step": 14694 + }, + { + "epoch": 0.19933532284319044, + "grad_norm": 6.767082691192627, + "learning_rate": 8.089077703165686e-06, + "loss": 0.3556, + "step": 14695 + }, + { + "epoch": 0.19934888768312534, + "grad_norm": 4.35929536819458, + "learning_rate": 8.08894066054543e-06, + "loss": 0.3879, + "step": 14696 + }, + { + "epoch": 0.19936245252306023, + "grad_norm": 8.834997177124023, + "learning_rate": 8.088803617925175e-06, + "loss": 0.7759, + "step": 14697 + }, + { + "epoch": 0.19937601736299512, + "grad_norm": 8.044877052307129, + "learning_rate": 8.088666575304921e-06, + "loss": 0.3615, + "step": 14698 + }, + { + "epoch": 0.19938958220293002, + "grad_norm": 7.568475246429443, + "learning_rate": 8.088529532684667e-06, + "loss": 0.3662, + "step": 14699 + }, + { + "epoch": 0.19940314704286488, + "grad_norm": 5.3068084716796875, + "learning_rate": 8.08839249006441e-06, + "loss": 0.3936, + "step": 14700 + }, + { + "epoch": 0.19941671188279977, + "grad_norm": 5.684213638305664, + "learning_rate": 8.088255447444155e-06, + "loss": 0.3231, + "step": 14701 + }, + { + "epoch": 0.19943027672273467, + "grad_norm": 7.0194525718688965, + "learning_rate": 8.088118404823902e-06, + "loss": 0.5114, + "step": 14702 + }, + { + "epoch": 0.19944384156266956, + "grad_norm": 6.506568431854248, + "learning_rate": 8.087981362203646e-06, + "loss": 0.3244, + "step": 14703 + }, + { + "epoch": 0.19945740640260445, + "grad_norm": 6.523426532745361, + "learning_rate": 8.08784431958339e-06, + "loss": 0.3606, + "step": 14704 + }, + { + "epoch": 0.19947097124253935, + "grad_norm": 14.459775924682617, + "learning_rate": 8.087707276963136e-06, + "loss": 0.4378, + "step": 14705 + }, + { + "epoch": 0.19948453608247424, + "grad_norm": 9.062384605407715, + "learning_rate": 8.087570234342881e-06, + "loss": 0.5217, + "step": 14706 + }, + { + "epoch": 0.1994981009224091, + "grad_norm": 5.907467842102051, + "learning_rate": 8.087433191722626e-06, + "loss": 0.3957, + "step": 14707 + }, + { + "epoch": 0.199511665762344, + "grad_norm": 7.524012088775635, + "learning_rate": 8.087296149102372e-06, + "loss": 0.5872, + "step": 14708 + }, + { + "epoch": 0.1995252306022789, + "grad_norm": 6.453045845031738, + "learning_rate": 8.087159106482117e-06, + "loss": 0.4421, + "step": 14709 + }, + { + "epoch": 0.19953879544221378, + "grad_norm": 6.539208889007568, + "learning_rate": 8.087022063861862e-06, + "loss": 0.452, + "step": 14710 + }, + { + "epoch": 0.19955236028214868, + "grad_norm": 4.791964530944824, + "learning_rate": 8.086885021241607e-06, + "loss": 0.4093, + "step": 14711 + }, + { + "epoch": 0.19956592512208357, + "grad_norm": 7.853217601776123, + "learning_rate": 8.086747978621352e-06, + "loss": 0.464, + "step": 14712 + }, + { + "epoch": 0.19957948996201844, + "grad_norm": 5.889479160308838, + "learning_rate": 8.086610936001097e-06, + "loss": 0.3359, + "step": 14713 + }, + { + "epoch": 0.19959305480195333, + "grad_norm": 5.07119083404541, + "learning_rate": 8.086473893380843e-06, + "loss": 0.383, + "step": 14714 + }, + { + "epoch": 0.19960661964188822, + "grad_norm": 4.977444171905518, + "learning_rate": 8.086336850760588e-06, + "loss": 0.2995, + "step": 14715 + }, + { + "epoch": 0.19962018448182312, + "grad_norm": 4.628533363342285, + "learning_rate": 8.086199808140333e-06, + "loss": 0.2454, + "step": 14716 + }, + { + "epoch": 0.199633749321758, + "grad_norm": 5.5598368644714355, + "learning_rate": 8.086062765520078e-06, + "loss": 0.3743, + "step": 14717 + }, + { + "epoch": 0.1996473141616929, + "grad_norm": 11.52074909210205, + "learning_rate": 8.085925722899822e-06, + "loss": 0.6973, + "step": 14718 + }, + { + "epoch": 0.1996608790016278, + "grad_norm": 7.3644514083862305, + "learning_rate": 8.085788680279567e-06, + "loss": 0.479, + "step": 14719 + }, + { + "epoch": 0.19967444384156266, + "grad_norm": 7.058644771575928, + "learning_rate": 8.085651637659314e-06, + "loss": 0.4061, + "step": 14720 + }, + { + "epoch": 0.19968800868149755, + "grad_norm": 6.176384925842285, + "learning_rate": 8.085514595039057e-06, + "loss": 0.4626, + "step": 14721 + }, + { + "epoch": 0.19970157352143245, + "grad_norm": 6.412537574768066, + "learning_rate": 8.085377552418802e-06, + "loss": 0.489, + "step": 14722 + }, + { + "epoch": 0.19971513836136734, + "grad_norm": 4.866840362548828, + "learning_rate": 8.085240509798548e-06, + "loss": 0.2936, + "step": 14723 + }, + { + "epoch": 0.19972870320130223, + "grad_norm": 5.551011085510254, + "learning_rate": 8.085103467178294e-06, + "loss": 0.2888, + "step": 14724 + }, + { + "epoch": 0.19974226804123713, + "grad_norm": 6.072602272033691, + "learning_rate": 8.084966424558038e-06, + "loss": 0.3572, + "step": 14725 + }, + { + "epoch": 0.199755832881172, + "grad_norm": 7.014140605926514, + "learning_rate": 8.084829381937783e-06, + "loss": 0.4019, + "step": 14726 + }, + { + "epoch": 0.19976939772110688, + "grad_norm": 6.761993408203125, + "learning_rate": 8.084692339317528e-06, + "loss": 0.5901, + "step": 14727 + }, + { + "epoch": 0.19978296256104178, + "grad_norm": 5.723880290985107, + "learning_rate": 8.084555296697273e-06, + "loss": 0.3428, + "step": 14728 + }, + { + "epoch": 0.19979652740097667, + "grad_norm": 5.927894592285156, + "learning_rate": 8.084418254077019e-06, + "loss": 0.3845, + "step": 14729 + }, + { + "epoch": 0.19981009224091156, + "grad_norm": 8.125760078430176, + "learning_rate": 8.084281211456764e-06, + "loss": 0.5845, + "step": 14730 + }, + { + "epoch": 0.19982365708084646, + "grad_norm": 7.7726969718933105, + "learning_rate": 8.084144168836509e-06, + "loss": 0.7118, + "step": 14731 + }, + { + "epoch": 0.19983722192078132, + "grad_norm": 5.687027931213379, + "learning_rate": 8.084007126216254e-06, + "loss": 0.442, + "step": 14732 + }, + { + "epoch": 0.19985078676071621, + "grad_norm": 5.877281665802002, + "learning_rate": 8.083870083596e-06, + "loss": 0.3702, + "step": 14733 + }, + { + "epoch": 0.1998643516006511, + "grad_norm": 5.0581374168396, + "learning_rate": 8.083733040975744e-06, + "loss": 0.3697, + "step": 14734 + }, + { + "epoch": 0.199877916440586, + "grad_norm": 6.598159313201904, + "learning_rate": 8.08359599835549e-06, + "loss": 0.5253, + "step": 14735 + }, + { + "epoch": 0.1998914812805209, + "grad_norm": 6.658522605895996, + "learning_rate": 8.083458955735233e-06, + "loss": 0.4158, + "step": 14736 + }, + { + "epoch": 0.1999050461204558, + "grad_norm": 6.1245927810668945, + "learning_rate": 8.08332191311498e-06, + "loss": 0.3888, + "step": 14737 + }, + { + "epoch": 0.19991861096039068, + "grad_norm": 6.753973484039307, + "learning_rate": 8.083184870494725e-06, + "loss": 0.3702, + "step": 14738 + }, + { + "epoch": 0.19993217580032555, + "grad_norm": 5.275925159454346, + "learning_rate": 8.083047827874469e-06, + "loss": 0.3314, + "step": 14739 + }, + { + "epoch": 0.19994574064026044, + "grad_norm": 5.085633277893066, + "learning_rate": 8.082910785254214e-06, + "loss": 0.3695, + "step": 14740 + }, + { + "epoch": 0.19995930548019533, + "grad_norm": 6.859382629394531, + "learning_rate": 8.08277374263396e-06, + "loss": 0.2963, + "step": 14741 + }, + { + "epoch": 0.19997287032013022, + "grad_norm": 5.35659122467041, + "learning_rate": 8.082636700013706e-06, + "loss": 0.4073, + "step": 14742 + }, + { + "epoch": 0.19998643516006512, + "grad_norm": 9.325766563415527, + "learning_rate": 8.08249965739345e-06, + "loss": 0.5644, + "step": 14743 + }, + { + "epoch": 0.2, + "grad_norm": 6.37679386138916, + "learning_rate": 8.082362614773195e-06, + "loss": 0.4302, + "step": 14744 + }, + { + "epoch": 0.2, + "eval_loss": 0.35063454508781433, + "eval_noise_accuracy": NaN, + "eval_runtime": 4646.4076, + "eval_samples_per_second": 1.081, + "eval_steps_per_second": 0.068, + "eval_wer": 34.550986994532146, + "step": 14744 + }, + { + "epoch": 0.20001356483993488, + "grad_norm": 6.800195217132568, + "learning_rate": 8.08222557215294e-06, + "loss": 0.5556, + "step": 14745 + }, + { + "epoch": 0.20002712967986977, + "grad_norm": 5.6193928718566895, + "learning_rate": 8.082088529532685e-06, + "loss": 0.3733, + "step": 14746 + }, + { + "epoch": 1.0000135648399349, + "grad_norm": 6.0258917808532715, + "learning_rate": 8.08195148691243e-06, + "loss": 0.2582, + "step": 14747 + }, + { + "epoch": 1.0000271296798697, + "grad_norm": 4.2262396812438965, + "learning_rate": 8.081814444292175e-06, + "loss": 0.1897, + "step": 14748 + }, + { + "epoch": 1.0000406945198046, + "grad_norm": 6.033735275268555, + "learning_rate": 8.08167740167192e-06, + "loss": 0.3912, + "step": 14749 + }, + { + "epoch": 1.0000542593597395, + "grad_norm": 4.129775047302246, + "learning_rate": 8.081540359051666e-06, + "loss": 0.1763, + "step": 14750 + }, + { + "epoch": 1.0000678241996745, + "grad_norm": 5.763254165649414, + "learning_rate": 8.08140331643141e-06, + "loss": 0.3172, + "step": 14751 + }, + { + "epoch": 1.0000813890396094, + "grad_norm": 5.134901523590088, + "learning_rate": 8.081266273811156e-06, + "loss": 0.2268, + "step": 14752 + }, + { + "epoch": 1.0000949538795443, + "grad_norm": 3.448305368423462, + "learning_rate": 8.081129231190901e-06, + "loss": 0.1874, + "step": 14753 + }, + { + "epoch": 1.0001085187194791, + "grad_norm": 4.606448650360107, + "learning_rate": 8.080992188570646e-06, + "loss": 0.2812, + "step": 14754 + }, + { + "epoch": 1.000122083559414, + "grad_norm": 5.534090995788574, + "learning_rate": 8.080855145950392e-06, + "loss": 0.4423, + "step": 14755 + }, + { + "epoch": 1.0001356483993489, + "grad_norm": 5.357290744781494, + "learning_rate": 8.080718103330137e-06, + "loss": 0.4325, + "step": 14756 + }, + { + "epoch": 1.0001492132392837, + "grad_norm": 3.357520818710327, + "learning_rate": 8.080581060709882e-06, + "loss": 0.1435, + "step": 14757 + }, + { + "epoch": 1.0001627780792186, + "grad_norm": 4.8194169998168945, + "learning_rate": 8.080444018089627e-06, + "loss": 0.2108, + "step": 14758 + }, + { + "epoch": 1.0001763429191535, + "grad_norm": 5.327221870422363, + "learning_rate": 8.080306975469372e-06, + "loss": 0.3918, + "step": 14759 + }, + { + "epoch": 1.0001899077590883, + "grad_norm": 4.4569268226623535, + "learning_rate": 8.080169932849117e-06, + "loss": 0.2003, + "step": 14760 + }, + { + "epoch": 1.0002034725990234, + "grad_norm": 6.262768745422363, + "learning_rate": 8.080032890228861e-06, + "loss": 0.288, + "step": 14761 + }, + { + "epoch": 1.0002170374389583, + "grad_norm": 4.6629838943481445, + "learning_rate": 8.079895847608606e-06, + "loss": 0.1593, + "step": 14762 + }, + { + "epoch": 1.0002306022788932, + "grad_norm": 7.752167224884033, + "learning_rate": 8.079758804988353e-06, + "loss": 0.423, + "step": 14763 + }, + { + "epoch": 1.000244167118828, + "grad_norm": 4.390713214874268, + "learning_rate": 8.079621762368096e-06, + "loss": 0.2769, + "step": 14764 + }, + { + "epoch": 1.0002577319587629, + "grad_norm": 5.603749752044678, + "learning_rate": 8.079484719747842e-06, + "loss": 0.3339, + "step": 14765 + }, + { + "epoch": 1.0002712967986978, + "grad_norm": 6.008617401123047, + "learning_rate": 8.079347677127587e-06, + "loss": 0.3355, + "step": 14766 + }, + { + "epoch": 1.0002848616386326, + "grad_norm": 6.923316955566406, + "learning_rate": 8.079210634507334e-06, + "loss": 0.2833, + "step": 14767 + }, + { + "epoch": 1.0002984264785675, + "grad_norm": 5.539759159088135, + "learning_rate": 8.079073591887077e-06, + "loss": 0.288, + "step": 14768 + }, + { + "epoch": 1.0003119913185023, + "grad_norm": 4.912669658660889, + "learning_rate": 8.078936549266822e-06, + "loss": 0.1854, + "step": 14769 + }, + { + "epoch": 1.0003255561584374, + "grad_norm": 4.9723358154296875, + "learning_rate": 8.078799506646568e-06, + "loss": 0.3627, + "step": 14770 + }, + { + "epoch": 1.0003391209983723, + "grad_norm": 4.4729461669921875, + "learning_rate": 8.078662464026313e-06, + "loss": 0.26, + "step": 14771 + }, + { + "epoch": 1.0003526858383072, + "grad_norm": 7.089022159576416, + "learning_rate": 8.078525421406058e-06, + "loss": 0.3725, + "step": 14772 + }, + { + "epoch": 1.000366250678242, + "grad_norm": 4.935999870300293, + "learning_rate": 8.078388378785803e-06, + "loss": 0.2206, + "step": 14773 + }, + { + "epoch": 1.000379815518177, + "grad_norm": 5.797290325164795, + "learning_rate": 8.078251336165548e-06, + "loss": 0.3288, + "step": 14774 + }, + { + "epoch": 1.0003933803581118, + "grad_norm": 4.104628086090088, + "learning_rate": 8.078114293545293e-06, + "loss": 0.129, + "step": 14775 + }, + { + "epoch": 1.0004069451980466, + "grad_norm": 5.0842180252075195, + "learning_rate": 8.077977250925039e-06, + "loss": 0.271, + "step": 14776 + }, + { + "epoch": 1.0004205100379815, + "grad_norm": 5.1529974937438965, + "learning_rate": 8.077840208304784e-06, + "loss": 0.2618, + "step": 14777 + }, + { + "epoch": 1.0004340748779164, + "grad_norm": 5.717156410217285, + "learning_rate": 8.077703165684529e-06, + "loss": 0.3453, + "step": 14778 + }, + { + "epoch": 1.0004476397178512, + "grad_norm": 4.846607208251953, + "learning_rate": 8.077566123064272e-06, + "loss": 0.2763, + "step": 14779 + }, + { + "epoch": 1.0004612045577863, + "grad_norm": 4.602531909942627, + "learning_rate": 8.07742908044402e-06, + "loss": 0.1775, + "step": 14780 + }, + { + "epoch": 1.0004747693977212, + "grad_norm": 5.903252601623535, + "learning_rate": 8.077292037823764e-06, + "loss": 0.2415, + "step": 14781 + }, + { + "epoch": 1.000488334237656, + "grad_norm": 4.873730659484863, + "learning_rate": 8.07715499520351e-06, + "loss": 0.2719, + "step": 14782 + }, + { + "epoch": 1.000501899077591, + "grad_norm": 4.50223970413208, + "learning_rate": 8.077017952583253e-06, + "loss": 0.4192, + "step": 14783 + }, + { + "epoch": 1.0005154639175258, + "grad_norm": 5.71973180770874, + "learning_rate": 8.076880909963e-06, + "loss": 0.2429, + "step": 14784 + }, + { + "epoch": 1.0005290287574606, + "grad_norm": 6.238398551940918, + "learning_rate": 8.076743867342745e-06, + "loss": 0.2571, + "step": 14785 + }, + { + "epoch": 1.0005425935973955, + "grad_norm": 5.8630452156066895, + "learning_rate": 8.076606824722489e-06, + "loss": 0.5149, + "step": 14786 + }, + { + "epoch": 1.0005561584373304, + "grad_norm": 5.477667808532715, + "learning_rate": 8.076469782102234e-06, + "loss": 0.3082, + "step": 14787 + }, + { + "epoch": 1.0005697232772652, + "grad_norm": 6.626049518585205, + "learning_rate": 8.076332739481979e-06, + "loss": 0.2976, + "step": 14788 + }, + { + "epoch": 1.0005832881172003, + "grad_norm": 5.889956951141357, + "learning_rate": 8.076195696861724e-06, + "loss": 0.3686, + "step": 14789 + }, + { + "epoch": 1.0005968529571352, + "grad_norm": 5.438870906829834, + "learning_rate": 8.07605865424147e-06, + "loss": 0.2387, + "step": 14790 + }, + { + "epoch": 1.00061041779707, + "grad_norm": 4.828776836395264, + "learning_rate": 8.075921611621215e-06, + "loss": 0.282, + "step": 14791 + }, + { + "epoch": 1.000623982637005, + "grad_norm": 4.846008777618408, + "learning_rate": 8.07578456900096e-06, + "loss": 0.3633, + "step": 14792 + }, + { + "epoch": 1.0006375474769398, + "grad_norm": 3.5919361114501953, + "learning_rate": 8.075647526380705e-06, + "loss": 0.2285, + "step": 14793 + }, + { + "epoch": 1.0006511123168746, + "grad_norm": 5.555774688720703, + "learning_rate": 8.07551048376045e-06, + "loss": 0.2325, + "step": 14794 + }, + { + "epoch": 1.0006646771568095, + "grad_norm": 7.472517967224121, + "learning_rate": 8.075373441140195e-06, + "loss": 0.4623, + "step": 14795 + }, + { + "epoch": 1.0006782419967444, + "grad_norm": 5.6677937507629395, + "learning_rate": 8.07523639851994e-06, + "loss": 0.2061, + "step": 14796 + }, + { + "epoch": 1.0006918068366792, + "grad_norm": 5.80961799621582, + "learning_rate": 8.075099355899686e-06, + "loss": 0.2899, + "step": 14797 + }, + { + "epoch": 1.000705371676614, + "grad_norm": 4.376371383666992, + "learning_rate": 8.07496231327943e-06, + "loss": 0.1984, + "step": 14798 + }, + { + "epoch": 1.0007189365165492, + "grad_norm": 7.532887935638428, + "learning_rate": 8.074825270659176e-06, + "loss": 0.4097, + "step": 14799 + }, + { + "epoch": 1.000732501356484, + "grad_norm": 3.3866376876831055, + "learning_rate": 8.074688228038921e-06, + "loss": 0.1677, + "step": 14800 + }, + { + "epoch": 1.000746066196419, + "grad_norm": 3.9204511642456055, + "learning_rate": 8.074551185418665e-06, + "loss": 0.3581, + "step": 14801 + }, + { + "epoch": 1.0007596310363538, + "grad_norm": 5.647553443908691, + "learning_rate": 8.074414142798412e-06, + "loss": 0.323, + "step": 14802 + }, + { + "epoch": 1.0007731958762887, + "grad_norm": 3.95947003364563, + "learning_rate": 8.074277100178157e-06, + "loss": 0.2278, + "step": 14803 + }, + { + "epoch": 1.0007867607162235, + "grad_norm": 5.572716236114502, + "learning_rate": 8.0741400575579e-06, + "loss": 0.2634, + "step": 14804 + }, + { + "epoch": 1.0008003255561584, + "grad_norm": 4.676078796386719, + "learning_rate": 8.074003014937645e-06, + "loss": 0.3152, + "step": 14805 + }, + { + "epoch": 1.0008138903960933, + "grad_norm": 5.394859313964844, + "learning_rate": 8.073865972317392e-06, + "loss": 0.2256, + "step": 14806 + }, + { + "epoch": 1.0008274552360281, + "grad_norm": 5.144838809967041, + "learning_rate": 8.073728929697137e-06, + "loss": 0.4343, + "step": 14807 + }, + { + "epoch": 1.0008410200759632, + "grad_norm": 4.453055381774902, + "learning_rate": 8.073591887076881e-06, + "loss": 0.2361, + "step": 14808 + }, + { + "epoch": 1.000854584915898, + "grad_norm": 5.309354782104492, + "learning_rate": 8.073454844456626e-06, + "loss": 0.2924, + "step": 14809 + }, + { + "epoch": 1.000868149755833, + "grad_norm": 5.561428070068359, + "learning_rate": 8.073317801836373e-06, + "loss": 0.2563, + "step": 14810 + }, + { + "epoch": 1.0008817145957678, + "grad_norm": 5.239142894744873, + "learning_rate": 8.073180759216116e-06, + "loss": 0.23, + "step": 14811 + }, + { + "epoch": 1.0008952794357027, + "grad_norm": 5.455413818359375, + "learning_rate": 8.073043716595862e-06, + "loss": 0.3279, + "step": 14812 + }, + { + "epoch": 1.0009088442756375, + "grad_norm": 3.579423189163208, + "learning_rate": 8.072906673975607e-06, + "loss": 0.195, + "step": 14813 + }, + { + "epoch": 1.0009224091155724, + "grad_norm": 5.456503391265869, + "learning_rate": 8.072769631355352e-06, + "loss": 0.2607, + "step": 14814 + }, + { + "epoch": 1.0009359739555073, + "grad_norm": 4.704913139343262, + "learning_rate": 8.072632588735097e-06, + "loss": 0.1806, + "step": 14815 + }, + { + "epoch": 1.0009495387954421, + "grad_norm": 5.436882495880127, + "learning_rate": 8.072495546114842e-06, + "loss": 0.3065, + "step": 14816 + }, + { + "epoch": 1.000963103635377, + "grad_norm": 5.813786029815674, + "learning_rate": 8.072358503494588e-06, + "loss": 0.3814, + "step": 14817 + }, + { + "epoch": 1.000976668475312, + "grad_norm": 6.033076763153076, + "learning_rate": 8.072221460874333e-06, + "loss": 0.249, + "step": 14818 + }, + { + "epoch": 1.000990233315247, + "grad_norm": 4.213903903961182, + "learning_rate": 8.072084418254078e-06, + "loss": 0.2452, + "step": 14819 + }, + { + "epoch": 1.0010037981551818, + "grad_norm": 5.065183162689209, + "learning_rate": 8.071947375633823e-06, + "loss": 0.2417, + "step": 14820 + }, + { + "epoch": 1.0010173629951167, + "grad_norm": 3.9195969104766846, + "learning_rate": 8.071810333013568e-06, + "loss": 0.2145, + "step": 14821 + }, + { + "epoch": 1.0010309278350515, + "grad_norm": 5.366652965545654, + "learning_rate": 8.071673290393313e-06, + "loss": 0.2969, + "step": 14822 + }, + { + "epoch": 1.0010444926749864, + "grad_norm": 6.326091289520264, + "learning_rate": 8.071536247773059e-06, + "loss": 0.2789, + "step": 14823 + }, + { + "epoch": 1.0010580575149213, + "grad_norm": 5.8661041259765625, + "learning_rate": 8.071399205152804e-06, + "loss": 0.2452, + "step": 14824 + }, + { + "epoch": 1.0010716223548561, + "grad_norm": 3.6573755741119385, + "learning_rate": 8.071262162532549e-06, + "loss": 0.1778, + "step": 14825 + }, + { + "epoch": 1.001085187194791, + "grad_norm": 3.807138442993164, + "learning_rate": 8.071125119912292e-06, + "loss": 0.2473, + "step": 14826 + }, + { + "epoch": 1.001098752034726, + "grad_norm": 4.453393936157227, + "learning_rate": 8.07098807729204e-06, + "loss": 0.1612, + "step": 14827 + }, + { + "epoch": 1.001112316874661, + "grad_norm": 4.592589855194092, + "learning_rate": 8.070851034671785e-06, + "loss": 0.2145, + "step": 14828 + }, + { + "epoch": 1.0011258817145958, + "grad_norm": 5.734989643096924, + "learning_rate": 8.070713992051528e-06, + "loss": 0.2326, + "step": 14829 + }, + { + "epoch": 1.0011394465545307, + "grad_norm": 3.7466859817504883, + "learning_rate": 8.070576949431273e-06, + "loss": 0.1579, + "step": 14830 + }, + { + "epoch": 1.0011530113944656, + "grad_norm": 5.049147605895996, + "learning_rate": 8.070439906811018e-06, + "loss": 0.2281, + "step": 14831 + }, + { + "epoch": 1.0011665762344004, + "grad_norm": 3.7946293354034424, + "learning_rate": 8.070302864190764e-06, + "loss": 0.2178, + "step": 14832 + }, + { + "epoch": 1.0011801410743353, + "grad_norm": 4.970941066741943, + "learning_rate": 8.070165821570509e-06, + "loss": 0.2766, + "step": 14833 + }, + { + "epoch": 1.0011937059142701, + "grad_norm": 5.190209865570068, + "learning_rate": 8.070028778950254e-06, + "loss": 0.2591, + "step": 14834 + }, + { + "epoch": 1.001207270754205, + "grad_norm": 3.904682159423828, + "learning_rate": 8.069891736329999e-06, + "loss": 0.1766, + "step": 14835 + }, + { + "epoch": 1.0012208355941399, + "grad_norm": 4.298942565917969, + "learning_rate": 8.069754693709744e-06, + "loss": 0.1855, + "step": 14836 + }, + { + "epoch": 1.001234400434075, + "grad_norm": 6.17646598815918, + "learning_rate": 8.06961765108949e-06, + "loss": 0.2289, + "step": 14837 + }, + { + "epoch": 1.0012479652740098, + "grad_norm": 4.316057205200195, + "learning_rate": 8.069480608469235e-06, + "loss": 0.2503, + "step": 14838 + }, + { + "epoch": 1.0012615301139447, + "grad_norm": 6.141165256500244, + "learning_rate": 8.06934356584898e-06, + "loss": 0.3494, + "step": 14839 + }, + { + "epoch": 1.0012750949538796, + "grad_norm": 4.17507266998291, + "learning_rate": 8.069206523228725e-06, + "loss": 0.1886, + "step": 14840 + }, + { + "epoch": 1.0012886597938144, + "grad_norm": 4.704710006713867, + "learning_rate": 8.06906948060847e-06, + "loss": 0.1848, + "step": 14841 + }, + { + "epoch": 1.0013022246337493, + "grad_norm": 5.534587383270264, + "learning_rate": 8.068932437988215e-06, + "loss": 0.2753, + "step": 14842 + }, + { + "epoch": 1.0013157894736842, + "grad_norm": 5.2359938621521, + "learning_rate": 8.06879539536796e-06, + "loss": 0.2337, + "step": 14843 + }, + { + "epoch": 1.001329354313619, + "grad_norm": 3.5912935733795166, + "learning_rate": 8.068658352747704e-06, + "loss": 0.1308, + "step": 14844 + }, + { + "epoch": 1.001342919153554, + "grad_norm": 5.63086462020874, + "learning_rate": 8.068521310127451e-06, + "loss": 0.2501, + "step": 14845 + }, + { + "epoch": 1.001356483993489, + "grad_norm": 4.72758150100708, + "learning_rate": 8.068384267507196e-06, + "loss": 0.2146, + "step": 14846 + }, + { + "epoch": 1.0013700488334238, + "grad_norm": 6.211514472961426, + "learning_rate": 8.06824722488694e-06, + "loss": 0.249, + "step": 14847 + }, + { + "epoch": 1.0013836136733587, + "grad_norm": 5.730206489562988, + "learning_rate": 8.068110182266685e-06, + "loss": 0.1927, + "step": 14848 + }, + { + "epoch": 1.0013971785132936, + "grad_norm": 5.194157600402832, + "learning_rate": 8.067973139646432e-06, + "loss": 0.2101, + "step": 14849 + }, + { + "epoch": 1.0014107433532284, + "grad_norm": 3.593618392944336, + "learning_rate": 8.067836097026177e-06, + "loss": 0.143, + "step": 14850 + }, + { + "epoch": 1.0014243081931633, + "grad_norm": 3.886314868927002, + "learning_rate": 8.06769905440592e-06, + "loss": 0.1724, + "step": 14851 + }, + { + "epoch": 1.0014378730330982, + "grad_norm": 4.7735490798950195, + "learning_rate": 8.067562011785665e-06, + "loss": 0.2138, + "step": 14852 + }, + { + "epoch": 1.001451437873033, + "grad_norm": 4.626741409301758, + "learning_rate": 8.067424969165412e-06, + "loss": 0.1821, + "step": 14853 + }, + { + "epoch": 1.001465002712968, + "grad_norm": 4.023372650146484, + "learning_rate": 8.067287926545156e-06, + "loss": 0.1691, + "step": 14854 + }, + { + "epoch": 1.0014785675529028, + "grad_norm": 6.874668121337891, + "learning_rate": 8.067150883924901e-06, + "loss": 0.3516, + "step": 14855 + }, + { + "epoch": 1.0014921323928379, + "grad_norm": 11.312691688537598, + "learning_rate": 8.067013841304646e-06, + "loss": 0.3318, + "step": 14856 + }, + { + "epoch": 1.0015056972327727, + "grad_norm": 4.419126033782959, + "learning_rate": 8.066876798684391e-06, + "loss": 0.1823, + "step": 14857 + }, + { + "epoch": 1.0015192620727076, + "grad_norm": 5.519974708557129, + "learning_rate": 8.066739756064136e-06, + "loss": 0.3088, + "step": 14858 + }, + { + "epoch": 1.0015328269126424, + "grad_norm": 5.52072286605835, + "learning_rate": 8.066602713443882e-06, + "loss": 0.2801, + "step": 14859 + }, + { + "epoch": 1.0015463917525773, + "grad_norm": 4.664029121398926, + "learning_rate": 8.066465670823627e-06, + "loss": 0.173, + "step": 14860 + }, + { + "epoch": 1.0015599565925122, + "grad_norm": 4.922971725463867, + "learning_rate": 8.066328628203372e-06, + "loss": 0.3494, + "step": 14861 + }, + { + "epoch": 1.001573521432447, + "grad_norm": 6.682237148284912, + "learning_rate": 8.066191585583117e-06, + "loss": 0.353, + "step": 14862 + }, + { + "epoch": 1.001587086272382, + "grad_norm": 5.368105411529541, + "learning_rate": 8.066054542962862e-06, + "loss": 0.2903, + "step": 14863 + }, + { + "epoch": 1.0016006511123168, + "grad_norm": 7.127165794372559, + "learning_rate": 8.065917500342608e-06, + "loss": 0.5137, + "step": 14864 + }, + { + "epoch": 1.0016142159522519, + "grad_norm": 3.2876269817352295, + "learning_rate": 8.065780457722353e-06, + "loss": 0.157, + "step": 14865 + }, + { + "epoch": 1.0016277807921867, + "grad_norm": 4.887798309326172, + "learning_rate": 8.065643415102098e-06, + "loss": 0.2762, + "step": 14866 + }, + { + "epoch": 1.0016413456321216, + "grad_norm": 4.65902853012085, + "learning_rate": 8.065506372481843e-06, + "loss": 0.2659, + "step": 14867 + }, + { + "epoch": 1.0016549104720565, + "grad_norm": 6.31975793838501, + "learning_rate": 8.065369329861588e-06, + "loss": 0.2994, + "step": 14868 + }, + { + "epoch": 1.0016684753119913, + "grad_norm": 6.904699802398682, + "learning_rate": 8.065232287241332e-06, + "loss": 0.2618, + "step": 14869 + }, + { + "epoch": 1.0016820401519262, + "grad_norm": 4.641022682189941, + "learning_rate": 8.065095244621077e-06, + "loss": 0.2514, + "step": 14870 + }, + { + "epoch": 1.001695604991861, + "grad_norm": 9.454292297363281, + "learning_rate": 8.064958202000824e-06, + "loss": 0.5158, + "step": 14871 + }, + { + "epoch": 1.001709169831796, + "grad_norm": 4.785866737365723, + "learning_rate": 8.064821159380567e-06, + "loss": 0.1906, + "step": 14872 + }, + { + "epoch": 1.0017227346717308, + "grad_norm": 5.762378215789795, + "learning_rate": 8.064684116760312e-06, + "loss": 0.244, + "step": 14873 + }, + { + "epoch": 1.0017362995116659, + "grad_norm": 5.808207988739014, + "learning_rate": 8.064547074140058e-06, + "loss": 0.2818, + "step": 14874 + }, + { + "epoch": 1.0017498643516007, + "grad_norm": 6.581339359283447, + "learning_rate": 8.064410031519805e-06, + "loss": 0.4086, + "step": 14875 + }, + { + "epoch": 1.0017634291915356, + "grad_norm": 8.421963691711426, + "learning_rate": 8.064272988899548e-06, + "loss": 0.3385, + "step": 14876 + }, + { + "epoch": 1.0017769940314705, + "grad_norm": 5.332812309265137, + "learning_rate": 8.064135946279293e-06, + "loss": 0.2675, + "step": 14877 + }, + { + "epoch": 1.0017905588714053, + "grad_norm": 4.445125102996826, + "learning_rate": 8.063998903659038e-06, + "loss": 0.2287, + "step": 14878 + }, + { + "epoch": 1.0018041237113402, + "grad_norm": 7.030791282653809, + "learning_rate": 8.063861861038784e-06, + "loss": 0.4137, + "step": 14879 + }, + { + "epoch": 1.001817688551275, + "grad_norm": 5.110599040985107, + "learning_rate": 8.063724818418529e-06, + "loss": 0.2536, + "step": 14880 + }, + { + "epoch": 1.00183125339121, + "grad_norm": 4.922320365905762, + "learning_rate": 8.063587775798274e-06, + "loss": 0.294, + "step": 14881 + }, + { + "epoch": 1.0018448182311448, + "grad_norm": 4.140431880950928, + "learning_rate": 8.063450733178019e-06, + "loss": 0.1682, + "step": 14882 + }, + { + "epoch": 1.0018583830710797, + "grad_norm": 5.558441162109375, + "learning_rate": 8.063313690557764e-06, + "loss": 0.2663, + "step": 14883 + }, + { + "epoch": 1.0018719479110147, + "grad_norm": 5.801197052001953, + "learning_rate": 8.06317664793751e-06, + "loss": 0.2604, + "step": 14884 + }, + { + "epoch": 1.0018855127509496, + "grad_norm": 5.719603538513184, + "learning_rate": 8.063039605317255e-06, + "loss": 0.2632, + "step": 14885 + }, + { + "epoch": 1.0018990775908845, + "grad_norm": 6.000813961029053, + "learning_rate": 8.062902562697e-06, + "loss": 0.3011, + "step": 14886 + }, + { + "epoch": 1.0019126424308193, + "grad_norm": 6.436274528503418, + "learning_rate": 8.062765520076743e-06, + "loss": 0.2528, + "step": 14887 + }, + { + "epoch": 1.0019262072707542, + "grad_norm": 5.081475734710693, + "learning_rate": 8.06262847745649e-06, + "loss": 0.2161, + "step": 14888 + }, + { + "epoch": 1.001939772110689, + "grad_norm": 8.441722869873047, + "learning_rate": 8.062491434836235e-06, + "loss": 0.3168, + "step": 14889 + }, + { + "epoch": 1.001953336950624, + "grad_norm": 7.747360706329346, + "learning_rate": 8.06235439221598e-06, + "loss": 0.3608, + "step": 14890 + }, + { + "epoch": 1.0019669017905588, + "grad_norm": 6.499897480010986, + "learning_rate": 8.062217349595724e-06, + "loss": 0.3295, + "step": 14891 + }, + { + "epoch": 1.0019804666304937, + "grad_norm": 8.183266639709473, + "learning_rate": 8.062080306975471e-06, + "loss": 0.3484, + "step": 14892 + }, + { + "epoch": 1.0019940314704288, + "grad_norm": 4.173401355743408, + "learning_rate": 8.061943264355216e-06, + "loss": 0.1724, + "step": 14893 + }, + { + "epoch": 1.0020075963103636, + "grad_norm": 5.1321306228637695, + "learning_rate": 8.06180622173496e-06, + "loss": 0.2862, + "step": 14894 + }, + { + "epoch": 1.0020211611502985, + "grad_norm": 6.095034599304199, + "learning_rate": 8.061669179114705e-06, + "loss": 0.2345, + "step": 14895 + }, + { + "epoch": 1.0020347259902334, + "grad_norm": 6.151951789855957, + "learning_rate": 8.06153213649445e-06, + "loss": 0.2181, + "step": 14896 + }, + { + "epoch": 1.0020482908301682, + "grad_norm": 5.110756874084473, + "learning_rate": 8.061395093874195e-06, + "loss": 0.227, + "step": 14897 + }, + { + "epoch": 1.002061855670103, + "grad_norm": 4.277800559997559, + "learning_rate": 8.06125805125394e-06, + "loss": 0.1649, + "step": 14898 + }, + { + "epoch": 1.002075420510038, + "grad_norm": 4.838335990905762, + "learning_rate": 8.061121008633685e-06, + "loss": 0.2482, + "step": 14899 + }, + { + "epoch": 1.0020889853499728, + "grad_norm": 6.062623977661133, + "learning_rate": 8.06098396601343e-06, + "loss": 0.3223, + "step": 14900 + }, + { + "epoch": 1.0021025501899077, + "grad_norm": 6.594548225402832, + "learning_rate": 8.060846923393176e-06, + "loss": 0.2553, + "step": 14901 + }, + { + "epoch": 1.0021161150298425, + "grad_norm": 5.1291608810424805, + "learning_rate": 8.060709880772921e-06, + "loss": 0.2659, + "step": 14902 + }, + { + "epoch": 1.0021296798697776, + "grad_norm": 5.845102787017822, + "learning_rate": 8.060572838152666e-06, + "loss": 0.2829, + "step": 14903 + }, + { + "epoch": 1.0021432447097125, + "grad_norm": 5.900684356689453, + "learning_rate": 8.060435795532411e-06, + "loss": 0.3154, + "step": 14904 + }, + { + "epoch": 1.0021568095496474, + "grad_norm": 3.904967784881592, + "learning_rate": 8.060298752912157e-06, + "loss": 0.1977, + "step": 14905 + }, + { + "epoch": 1.0021703743895822, + "grad_norm": 5.850626468658447, + "learning_rate": 8.060161710291902e-06, + "loss": 0.3091, + "step": 14906 + }, + { + "epoch": 1.002183939229517, + "grad_norm": 4.193957805633545, + "learning_rate": 8.060024667671647e-06, + "loss": 0.1894, + "step": 14907 + }, + { + "epoch": 1.002197504069452, + "grad_norm": 4.985541343688965, + "learning_rate": 8.059887625051392e-06, + "loss": 0.3185, + "step": 14908 + }, + { + "epoch": 1.0022110689093868, + "grad_norm": 6.852181434631348, + "learning_rate": 8.059750582431137e-06, + "loss": 0.3836, + "step": 14909 + }, + { + "epoch": 1.0022246337493217, + "grad_norm": 6.441447734832764, + "learning_rate": 8.059613539810882e-06, + "loss": 0.3096, + "step": 14910 + }, + { + "epoch": 1.0022381985892566, + "grad_norm": 5.906408309936523, + "learning_rate": 8.059476497190628e-06, + "loss": 0.2356, + "step": 14911 + }, + { + "epoch": 1.0022517634291916, + "grad_norm": 6.470837116241455, + "learning_rate": 8.059339454570371e-06, + "loss": 0.3289, + "step": 14912 + }, + { + "epoch": 1.0022653282691265, + "grad_norm": 5.012482166290283, + "learning_rate": 8.059202411950116e-06, + "loss": 0.2709, + "step": 14913 + }, + { + "epoch": 1.0022788931090614, + "grad_norm": 5.7586774826049805, + "learning_rate": 8.059065369329863e-06, + "loss": 0.2762, + "step": 14914 + }, + { + "epoch": 1.0022924579489962, + "grad_norm": 4.6072187423706055, + "learning_rate": 8.058928326709608e-06, + "loss": 0.2875, + "step": 14915 + }, + { + "epoch": 1.002306022788931, + "grad_norm": 7.409695625305176, + "learning_rate": 8.058791284089352e-06, + "loss": 0.4711, + "step": 14916 + }, + { + "epoch": 1.002319587628866, + "grad_norm": 4.80631685256958, + "learning_rate": 8.058654241469097e-06, + "loss": 0.2539, + "step": 14917 + }, + { + "epoch": 1.0023331524688008, + "grad_norm": 6.004725456237793, + "learning_rate": 8.058517198848844e-06, + "loss": 0.3912, + "step": 14918 + }, + { + "epoch": 1.0023467173087357, + "grad_norm": 4.619677543640137, + "learning_rate": 8.058380156228587e-06, + "loss": 0.3254, + "step": 14919 + }, + { + "epoch": 1.0023602821486706, + "grad_norm": 5.742053031921387, + "learning_rate": 8.058243113608332e-06, + "loss": 0.2849, + "step": 14920 + }, + { + "epoch": 1.0023738469886054, + "grad_norm": 6.7600202560424805, + "learning_rate": 8.058106070988078e-06, + "loss": 0.3299, + "step": 14921 + }, + { + "epoch": 1.0023874118285405, + "grad_norm": 5.35109806060791, + "learning_rate": 8.057969028367823e-06, + "loss": 0.3461, + "step": 14922 + }, + { + "epoch": 1.0024009766684754, + "grad_norm": 7.54236364364624, + "learning_rate": 8.057831985747568e-06, + "loss": 0.482, + "step": 14923 + }, + { + "epoch": 1.0024145415084103, + "grad_norm": 9.064925193786621, + "learning_rate": 8.057694943127313e-06, + "loss": 0.6854, + "step": 14924 + }, + { + "epoch": 1.0024281063483451, + "grad_norm": 6.339097499847412, + "learning_rate": 8.057557900507058e-06, + "loss": 0.3369, + "step": 14925 + }, + { + "epoch": 1.00244167118828, + "grad_norm": 9.080251693725586, + "learning_rate": 8.057420857886804e-06, + "loss": 0.3857, + "step": 14926 + }, + { + "epoch": 1.0024552360282148, + "grad_norm": 4.690685272216797, + "learning_rate": 8.057283815266549e-06, + "loss": 0.2562, + "step": 14927 + }, + { + "epoch": 1.0024688008681497, + "grad_norm": 7.084031581878662, + "learning_rate": 8.057146772646294e-06, + "loss": 0.3293, + "step": 14928 + }, + { + "epoch": 1.0024823657080846, + "grad_norm": 7.219849109649658, + "learning_rate": 8.057009730026039e-06, + "loss": 0.446, + "step": 14929 + }, + { + "epoch": 1.0024959305480194, + "grad_norm": 6.014969825744629, + "learning_rate": 8.056872687405783e-06, + "loss": 0.3523, + "step": 14930 + }, + { + "epoch": 1.0025094953879545, + "grad_norm": 5.548582553863525, + "learning_rate": 8.05673564478553e-06, + "loss": 0.2918, + "step": 14931 + }, + { + "epoch": 1.0025230602278894, + "grad_norm": 3.9115560054779053, + "learning_rate": 8.056598602165275e-06, + "loss": 0.1406, + "step": 14932 + }, + { + "epoch": 1.0025366250678243, + "grad_norm": 6.933712482452393, + "learning_rate": 8.05646155954502e-06, + "loss": 0.2875, + "step": 14933 + }, + { + "epoch": 1.0025501899077591, + "grad_norm": 6.588930606842041, + "learning_rate": 8.056324516924763e-06, + "loss": 0.2714, + "step": 14934 + }, + { + "epoch": 1.002563754747694, + "grad_norm": 4.7309136390686035, + "learning_rate": 8.05618747430451e-06, + "loss": 0.2426, + "step": 14935 + }, + { + "epoch": 1.0025773195876289, + "grad_norm": 4.010119438171387, + "learning_rate": 8.056050431684255e-06, + "loss": 0.2885, + "step": 14936 + }, + { + "epoch": 1.0025908844275637, + "grad_norm": 5.835457801818848, + "learning_rate": 8.055913389063999e-06, + "loss": 0.4094, + "step": 14937 + }, + { + "epoch": 1.0026044492674986, + "grad_norm": 5.268833160400391, + "learning_rate": 8.055776346443744e-06, + "loss": 0.3541, + "step": 14938 + }, + { + "epoch": 1.0026180141074335, + "grad_norm": 7.255650043487549, + "learning_rate": 8.05563930382349e-06, + "loss": 0.3061, + "step": 14939 + }, + { + "epoch": 1.0026315789473683, + "grad_norm": 6.543017864227295, + "learning_rate": 8.055502261203234e-06, + "loss": 0.3918, + "step": 14940 + }, + { + "epoch": 1.0026451437873034, + "grad_norm": 6.271376609802246, + "learning_rate": 8.05536521858298e-06, + "loss": 0.3839, + "step": 14941 + }, + { + "epoch": 1.0026587086272383, + "grad_norm": 5.960376739501953, + "learning_rate": 8.055228175962725e-06, + "loss": 0.3069, + "step": 14942 + }, + { + "epoch": 1.0026722734671731, + "grad_norm": 6.080454349517822, + "learning_rate": 8.05509113334247e-06, + "loss": 0.3515, + "step": 14943 + }, + { + "epoch": 1.002685838307108, + "grad_norm": 5.107316970825195, + "learning_rate": 8.054954090722215e-06, + "loss": 0.3671, + "step": 14944 + }, + { + "epoch": 1.0026994031470429, + "grad_norm": 5.477987766265869, + "learning_rate": 8.05481704810196e-06, + "loss": 0.3291, + "step": 14945 + }, + { + "epoch": 1.0027129679869777, + "grad_norm": 4.935527324676514, + "learning_rate": 8.054680005481705e-06, + "loss": 0.3272, + "step": 14946 + }, + { + "epoch": 1.0027265328269126, + "grad_norm": 7.639787197113037, + "learning_rate": 8.05454296286145e-06, + "loss": 0.4322, + "step": 14947 + }, + { + "epoch": 1.0027400976668475, + "grad_norm": 5.4858832359313965, + "learning_rate": 8.054405920241196e-06, + "loss": 0.3454, + "step": 14948 + }, + { + "epoch": 1.0027536625067823, + "grad_norm": 6.124469757080078, + "learning_rate": 8.054268877620941e-06, + "loss": 0.3548, + "step": 14949 + }, + { + "epoch": 1.0027672273467174, + "grad_norm": 6.5296831130981445, + "learning_rate": 8.054131835000686e-06, + "loss": 0.2972, + "step": 14950 + }, + { + "epoch": 1.0027807921866523, + "grad_norm": 5.1396708488464355, + "learning_rate": 8.053994792380431e-06, + "loss": 0.2358, + "step": 14951 + }, + { + "epoch": 1.0027943570265871, + "grad_norm": 6.703770160675049, + "learning_rate": 8.053857749760175e-06, + "loss": 0.3919, + "step": 14952 + }, + { + "epoch": 1.002807921866522, + "grad_norm": 4.779472351074219, + "learning_rate": 8.053720707139922e-06, + "loss": 0.224, + "step": 14953 + }, + { + "epoch": 1.0028214867064569, + "grad_norm": 3.4412150382995605, + "learning_rate": 8.053583664519667e-06, + "loss": 0.2358, + "step": 14954 + }, + { + "epoch": 1.0028350515463917, + "grad_norm": 5.5114006996154785, + "learning_rate": 8.05344662189941e-06, + "loss": 0.3414, + "step": 14955 + }, + { + "epoch": 1.0028486163863266, + "grad_norm": 5.797542572021484, + "learning_rate": 8.053309579279156e-06, + "loss": 0.2629, + "step": 14956 + }, + { + "epoch": 1.0028621812262615, + "grad_norm": 6.491927623748779, + "learning_rate": 8.053172536658902e-06, + "loss": 0.4242, + "step": 14957 + }, + { + "epoch": 1.0028757460661963, + "grad_norm": 6.810025691986084, + "learning_rate": 8.053035494038648e-06, + "loss": 0.4696, + "step": 14958 + }, + { + "epoch": 1.0028893109061312, + "grad_norm": 4.668208122253418, + "learning_rate": 8.052898451418391e-06, + "loss": 0.2283, + "step": 14959 + }, + { + "epoch": 1.0029028757460663, + "grad_norm": 5.187962055206299, + "learning_rate": 8.052761408798136e-06, + "loss": 0.3324, + "step": 14960 + }, + { + "epoch": 1.0029164405860012, + "grad_norm": 5.453437328338623, + "learning_rate": 8.052624366177883e-06, + "loss": 0.2438, + "step": 14961 + }, + { + "epoch": 1.002930005425936, + "grad_norm": 8.398664474487305, + "learning_rate": 8.052487323557627e-06, + "loss": 0.4755, + "step": 14962 + }, + { + "epoch": 1.0029435702658709, + "grad_norm": 6.0657172203063965, + "learning_rate": 8.052350280937372e-06, + "loss": 0.2464, + "step": 14963 + }, + { + "epoch": 1.0029571351058058, + "grad_norm": 4.736496925354004, + "learning_rate": 8.052213238317117e-06, + "loss": 0.2825, + "step": 14964 + }, + { + "epoch": 1.0029706999457406, + "grad_norm": 4.628909587860107, + "learning_rate": 8.052076195696862e-06, + "loss": 0.2043, + "step": 14965 + }, + { + "epoch": 1.0029842647856755, + "grad_norm": 6.297091007232666, + "learning_rate": 8.051939153076607e-06, + "loss": 0.4271, + "step": 14966 + }, + { + "epoch": 1.0029978296256103, + "grad_norm": 6.0940842628479, + "learning_rate": 8.051802110456353e-06, + "loss": 0.4104, + "step": 14967 + }, + { + "epoch": 1.0030113944655452, + "grad_norm": 6.189389228820801, + "learning_rate": 8.051665067836098e-06, + "loss": 0.3397, + "step": 14968 + }, + { + "epoch": 1.0030249593054803, + "grad_norm": 5.187692642211914, + "learning_rate": 8.051528025215843e-06, + "loss": 0.3334, + "step": 14969 + }, + { + "epoch": 1.0030385241454152, + "grad_norm": 4.429109573364258, + "learning_rate": 8.051390982595588e-06, + "loss": 0.2816, + "step": 14970 + }, + { + "epoch": 1.00305208898535, + "grad_norm": 4.520811557769775, + "learning_rate": 8.051253939975333e-06, + "loss": 0.1554, + "step": 14971 + }, + { + "epoch": 1.003065653825285, + "grad_norm": 6.2105278968811035, + "learning_rate": 8.051116897355078e-06, + "loss": 0.3368, + "step": 14972 + }, + { + "epoch": 1.0030792186652198, + "grad_norm": 4.700309753417969, + "learning_rate": 8.050979854734824e-06, + "loss": 0.2744, + "step": 14973 + }, + { + "epoch": 1.0030927835051546, + "grad_norm": 4.650252342224121, + "learning_rate": 8.050842812114569e-06, + "loss": 0.2015, + "step": 14974 + }, + { + "epoch": 1.0031063483450895, + "grad_norm": 4.355146884918213, + "learning_rate": 8.050705769494314e-06, + "loss": 0.1823, + "step": 14975 + }, + { + "epoch": 1.0031199131850244, + "grad_norm": 6.859040260314941, + "learning_rate": 8.050568726874059e-06, + "loss": 0.3454, + "step": 14976 + }, + { + "epoch": 1.0031334780249592, + "grad_norm": 5.371952533721924, + "learning_rate": 8.050431684253803e-06, + "loss": 0.2876, + "step": 14977 + }, + { + "epoch": 1.003147042864894, + "grad_norm": 4.815741539001465, + "learning_rate": 8.05029464163355e-06, + "loss": 0.1637, + "step": 14978 + }, + { + "epoch": 1.0031606077048292, + "grad_norm": 3.9190871715545654, + "learning_rate": 8.050157599013295e-06, + "loss": 0.2082, + "step": 14979 + }, + { + "epoch": 1.003174172544764, + "grad_norm": 4.454131603240967, + "learning_rate": 8.050020556393038e-06, + "loss": 0.2343, + "step": 14980 + }, + { + "epoch": 1.003187737384699, + "grad_norm": 4.672544479370117, + "learning_rate": 8.049883513772783e-06, + "loss": 0.3044, + "step": 14981 + }, + { + "epoch": 1.0032013022246338, + "grad_norm": 5.952506065368652, + "learning_rate": 8.049746471152529e-06, + "loss": 0.2847, + "step": 14982 + }, + { + "epoch": 1.0032148670645686, + "grad_norm": 5.084697246551514, + "learning_rate": 8.049609428532275e-06, + "loss": 0.3045, + "step": 14983 + }, + { + "epoch": 1.0032284319045035, + "grad_norm": 3.5508646965026855, + "learning_rate": 8.049472385912019e-06, + "loss": 0.23, + "step": 14984 + }, + { + "epoch": 1.0032419967444384, + "grad_norm": 6.321219444274902, + "learning_rate": 8.049335343291764e-06, + "loss": 0.3409, + "step": 14985 + }, + { + "epoch": 1.0032555615843732, + "grad_norm": 5.376039981842041, + "learning_rate": 8.04919830067151e-06, + "loss": 0.1902, + "step": 14986 + }, + { + "epoch": 1.003269126424308, + "grad_norm": 4.23654842376709, + "learning_rate": 8.049061258051254e-06, + "loss": 0.2109, + "step": 14987 + }, + { + "epoch": 1.0032826912642432, + "grad_norm": 6.373482704162598, + "learning_rate": 8.048924215431e-06, + "loss": 0.3561, + "step": 14988 + }, + { + "epoch": 1.003296256104178, + "grad_norm": 10.727145195007324, + "learning_rate": 8.048787172810745e-06, + "loss": 0.3476, + "step": 14989 + }, + { + "epoch": 1.003309820944113, + "grad_norm": 5.713454723358154, + "learning_rate": 8.04865013019049e-06, + "loss": 0.2505, + "step": 14990 + }, + { + "epoch": 1.0033233857840478, + "grad_norm": 5.688849925994873, + "learning_rate": 8.048513087570235e-06, + "loss": 0.3373, + "step": 14991 + }, + { + "epoch": 1.0033369506239826, + "grad_norm": 4.971365928649902, + "learning_rate": 8.04837604494998e-06, + "loss": 0.3108, + "step": 14992 + }, + { + "epoch": 1.0033505154639175, + "grad_norm": 5.607809066772461, + "learning_rate": 8.048239002329725e-06, + "loss": 0.2715, + "step": 14993 + }, + { + "epoch": 1.0033640803038524, + "grad_norm": 5.751819610595703, + "learning_rate": 8.04810195970947e-06, + "loss": 0.4144, + "step": 14994 + }, + { + "epoch": 1.0033776451437872, + "grad_norm": 5.830623149871826, + "learning_rate": 8.047964917089214e-06, + "loss": 0.3731, + "step": 14995 + }, + { + "epoch": 1.003391209983722, + "grad_norm": 6.473817348480225, + "learning_rate": 8.047827874468961e-06, + "loss": 0.2744, + "step": 14996 + }, + { + "epoch": 1.003404774823657, + "grad_norm": 6.09193754196167, + "learning_rate": 8.047690831848706e-06, + "loss": 0.3083, + "step": 14997 + }, + { + "epoch": 1.003418339663592, + "grad_norm": 5.353208541870117, + "learning_rate": 8.047553789228451e-06, + "loss": 0.3175, + "step": 14998 + }, + { + "epoch": 1.003431904503527, + "grad_norm": 6.796580791473389, + "learning_rate": 8.047416746608195e-06, + "loss": 0.4733, + "step": 14999 + }, + { + "epoch": 1.0034454693434618, + "grad_norm": 5.668054103851318, + "learning_rate": 8.047279703987942e-06, + "loss": 0.3047, + "step": 15000 + }, + { + "epoch": 1.0034590341833967, + "grad_norm": 8.291436195373535, + "learning_rate": 8.047142661367687e-06, + "loss": 0.6597, + "step": 15001 + }, + { + "epoch": 1.0034725990233315, + "grad_norm": 5.342665195465088, + "learning_rate": 8.04700561874743e-06, + "loss": 0.3351, + "step": 15002 + }, + { + "epoch": 1.0034861638632664, + "grad_norm": 7.161826133728027, + "learning_rate": 8.046868576127176e-06, + "loss": 0.3465, + "step": 15003 + }, + { + "epoch": 1.0034997287032013, + "grad_norm": 5.959744930267334, + "learning_rate": 8.046731533506922e-06, + "loss": 0.5011, + "step": 15004 + }, + { + "epoch": 1.0035132935431361, + "grad_norm": 5.0220513343811035, + "learning_rate": 8.046594490886666e-06, + "loss": 0.2897, + "step": 15005 + }, + { + "epoch": 1.003526858383071, + "grad_norm": 6.75865364074707, + "learning_rate": 8.046457448266411e-06, + "loss": 0.4217, + "step": 15006 + }, + { + "epoch": 1.003540423223006, + "grad_norm": 3.796192169189453, + "learning_rate": 8.046320405646156e-06, + "loss": 0.1291, + "step": 15007 + }, + { + "epoch": 1.003553988062941, + "grad_norm": 4.8590874671936035, + "learning_rate": 8.046183363025901e-06, + "loss": 0.2259, + "step": 15008 + }, + { + "epoch": 1.0035675529028758, + "grad_norm": 8.042312622070312, + "learning_rate": 8.046046320405647e-06, + "loss": 0.3945, + "step": 15009 + }, + { + "epoch": 1.0035811177428107, + "grad_norm": 4.290974140167236, + "learning_rate": 8.045909277785392e-06, + "loss": 0.3299, + "step": 15010 + }, + { + "epoch": 1.0035946825827455, + "grad_norm": 7.177584648132324, + "learning_rate": 8.045772235165137e-06, + "loss": 0.4098, + "step": 15011 + }, + { + "epoch": 1.0036082474226804, + "grad_norm": 5.380190372467041, + "learning_rate": 8.045635192544882e-06, + "loss": 0.2921, + "step": 15012 + }, + { + "epoch": 1.0036218122626153, + "grad_norm": 4.846046447753906, + "learning_rate": 8.045498149924627e-06, + "loss": 0.2004, + "step": 15013 + }, + { + "epoch": 1.0036353771025501, + "grad_norm": 6.1801347732543945, + "learning_rate": 8.045361107304373e-06, + "loss": 0.3637, + "step": 15014 + }, + { + "epoch": 1.003648941942485, + "grad_norm": 6.010442733764648, + "learning_rate": 8.045224064684118e-06, + "loss": 0.3274, + "step": 15015 + }, + { + "epoch": 1.0036625067824199, + "grad_norm": 4.99002742767334, + "learning_rate": 8.045087022063863e-06, + "loss": 0.223, + "step": 15016 + }, + { + "epoch": 1.003676071622355, + "grad_norm": 5.604731559753418, + "learning_rate": 8.044949979443608e-06, + "loss": 0.2991, + "step": 15017 + }, + { + "epoch": 1.0036896364622898, + "grad_norm": 5.51801872253418, + "learning_rate": 8.044812936823353e-06, + "loss": 0.3735, + "step": 15018 + }, + { + "epoch": 1.0037032013022247, + "grad_norm": 5.864407062530518, + "learning_rate": 8.044675894203098e-06, + "loss": 0.2368, + "step": 15019 + }, + { + "epoch": 1.0037167661421595, + "grad_norm": 6.436901569366455, + "learning_rate": 8.044538851582842e-06, + "loss": 0.2637, + "step": 15020 + }, + { + "epoch": 1.0037303309820944, + "grad_norm": 5.410538673400879, + "learning_rate": 8.044401808962587e-06, + "loss": 0.2879, + "step": 15021 + }, + { + "epoch": 1.0037438958220293, + "grad_norm": 4.539417743682861, + "learning_rate": 8.044264766342334e-06, + "loss": 0.2366, + "step": 15022 + }, + { + "epoch": 1.0037574606619641, + "grad_norm": 4.9378342628479, + "learning_rate": 8.044127723722077e-06, + "loss": 0.2126, + "step": 15023 + }, + { + "epoch": 1.003771025501899, + "grad_norm": 5.597629547119141, + "learning_rate": 8.043990681101823e-06, + "loss": 0.2464, + "step": 15024 + }, + { + "epoch": 1.0037845903418339, + "grad_norm": 4.347342491149902, + "learning_rate": 8.043853638481568e-06, + "loss": 0.2776, + "step": 15025 + }, + { + "epoch": 1.003798155181769, + "grad_norm": 4.880950927734375, + "learning_rate": 8.043716595861315e-06, + "loss": 0.3395, + "step": 15026 + }, + { + "epoch": 1.0038117200217038, + "grad_norm": 6.911513805389404, + "learning_rate": 8.043579553241058e-06, + "loss": 0.3351, + "step": 15027 + }, + { + "epoch": 1.0038252848616387, + "grad_norm": 4.383606433868408, + "learning_rate": 8.043442510620803e-06, + "loss": 0.2402, + "step": 15028 + }, + { + "epoch": 1.0038388497015736, + "grad_norm": 7.879003047943115, + "learning_rate": 8.043305468000549e-06, + "loss": 0.2897, + "step": 15029 + }, + { + "epoch": 1.0038524145415084, + "grad_norm": 6.5891289710998535, + "learning_rate": 8.043168425380294e-06, + "loss": 0.3405, + "step": 15030 + }, + { + "epoch": 1.0038659793814433, + "grad_norm": 4.479444980621338, + "learning_rate": 8.043031382760039e-06, + "loss": 0.1782, + "step": 15031 + }, + { + "epoch": 1.0038795442213782, + "grad_norm": 5.653338432312012, + "learning_rate": 8.042894340139784e-06, + "loss": 0.3165, + "step": 15032 + }, + { + "epoch": 1.003893109061313, + "grad_norm": 3.637025833129883, + "learning_rate": 8.04275729751953e-06, + "loss": 0.1844, + "step": 15033 + }, + { + "epoch": 1.0039066739012479, + "grad_norm": 4.805109977722168, + "learning_rate": 8.042620254899274e-06, + "loss": 0.3035, + "step": 15034 + }, + { + "epoch": 1.0039202387411827, + "grad_norm": 6.088316440582275, + "learning_rate": 8.04248321227902e-06, + "loss": 0.2718, + "step": 15035 + }, + { + "epoch": 1.0039338035811178, + "grad_norm": 7.708918571472168, + "learning_rate": 8.042346169658765e-06, + "loss": 0.336, + "step": 15036 + }, + { + "epoch": 1.0039473684210527, + "grad_norm": 5.964537143707275, + "learning_rate": 8.04220912703851e-06, + "loss": 0.3035, + "step": 15037 + }, + { + "epoch": 1.0039609332609876, + "grad_norm": 6.78738260269165, + "learning_rate": 8.042072084418253e-06, + "loss": 0.3337, + "step": 15038 + }, + { + "epoch": 1.0039744981009224, + "grad_norm": 4.617880821228027, + "learning_rate": 8.041935041798e-06, + "loss": 0.3123, + "step": 15039 + }, + { + "epoch": 1.0039880629408573, + "grad_norm": 5.680215358734131, + "learning_rate": 8.041797999177745e-06, + "loss": 0.3187, + "step": 15040 + }, + { + "epoch": 1.0040016277807922, + "grad_norm": 4.022759914398193, + "learning_rate": 8.04166095655749e-06, + "loss": 0.3218, + "step": 15041 + }, + { + "epoch": 1.004015192620727, + "grad_norm": 5.487396240234375, + "learning_rate": 8.041523913937234e-06, + "loss": 0.1564, + "step": 15042 + }, + { + "epoch": 1.004028757460662, + "grad_norm": 4.389580726623535, + "learning_rate": 8.041386871316981e-06, + "loss": 0.2217, + "step": 15043 + }, + { + "epoch": 1.0040423223005968, + "grad_norm": 3.767688035964966, + "learning_rate": 8.041249828696726e-06, + "loss": 0.1678, + "step": 15044 + }, + { + "epoch": 1.0040558871405318, + "grad_norm": 5.303073406219482, + "learning_rate": 8.04111278607647e-06, + "loss": 0.3529, + "step": 15045 + }, + { + "epoch": 1.0040694519804667, + "grad_norm": 5.524463653564453, + "learning_rate": 8.040975743456215e-06, + "loss": 0.2985, + "step": 15046 + }, + { + "epoch": 1.0040830168204016, + "grad_norm": 7.9762654304504395, + "learning_rate": 8.040838700835962e-06, + "loss": 0.3898, + "step": 15047 + }, + { + "epoch": 1.0040965816603364, + "grad_norm": 6.414556980133057, + "learning_rate": 8.040701658215705e-06, + "loss": 0.3066, + "step": 15048 + }, + { + "epoch": 1.0041101465002713, + "grad_norm": 4.912889003753662, + "learning_rate": 8.04056461559545e-06, + "loss": 0.1911, + "step": 15049 + }, + { + "epoch": 1.0041237113402062, + "grad_norm": 4.1171746253967285, + "learning_rate": 8.040427572975196e-06, + "loss": 0.1726, + "step": 15050 + }, + { + "epoch": 1.004137276180141, + "grad_norm": 5.571025848388672, + "learning_rate": 8.04029053035494e-06, + "loss": 0.203, + "step": 15051 + }, + { + "epoch": 1.004150841020076, + "grad_norm": 4.992640018463135, + "learning_rate": 8.040153487734686e-06, + "loss": 0.2081, + "step": 15052 + }, + { + "epoch": 1.0041644058600108, + "grad_norm": 6.2114410400390625, + "learning_rate": 8.040016445114431e-06, + "loss": 0.2267, + "step": 15053 + }, + { + "epoch": 1.0041779706999456, + "grad_norm": 5.715400218963623, + "learning_rate": 8.039879402494176e-06, + "loss": 0.2385, + "step": 15054 + }, + { + "epoch": 1.0041915355398807, + "grad_norm": 4.494435787200928, + "learning_rate": 8.039742359873921e-06, + "loss": 0.215, + "step": 15055 + }, + { + "epoch": 1.0042051003798156, + "grad_norm": 5.964955806732178, + "learning_rate": 8.039605317253667e-06, + "loss": 0.1779, + "step": 15056 + }, + { + "epoch": 1.0042186652197505, + "grad_norm": 4.218923091888428, + "learning_rate": 8.039468274633412e-06, + "loss": 0.1338, + "step": 15057 + }, + { + "epoch": 1.0042322300596853, + "grad_norm": 4.199399471282959, + "learning_rate": 8.039331232013157e-06, + "loss": 0.1849, + "step": 15058 + }, + { + "epoch": 1.0042457948996202, + "grad_norm": 4.2862958908081055, + "learning_rate": 8.039194189392902e-06, + "loss": 0.1587, + "step": 15059 + }, + { + "epoch": 1.004259359739555, + "grad_norm": 6.4007697105407715, + "learning_rate": 8.039057146772647e-06, + "loss": 0.2839, + "step": 15060 + }, + { + "epoch": 1.00427292457949, + "grad_norm": 4.7022600173950195, + "learning_rate": 8.038920104152393e-06, + "loss": 0.235, + "step": 15061 + }, + { + "epoch": 1.0042864894194248, + "grad_norm": 5.9082183837890625, + "learning_rate": 8.038783061532138e-06, + "loss": 0.2761, + "step": 15062 + }, + { + "epoch": 1.0043000542593596, + "grad_norm": 5.984780311584473, + "learning_rate": 8.038646018911881e-06, + "loss": 0.2334, + "step": 15063 + }, + { + "epoch": 1.0043136190992947, + "grad_norm": 6.063122749328613, + "learning_rate": 8.038508976291626e-06, + "loss": 0.3083, + "step": 15064 + }, + { + "epoch": 1.0043271839392296, + "grad_norm": 4.328074932098389, + "learning_rate": 8.038371933671373e-06, + "loss": 0.1018, + "step": 15065 + }, + { + "epoch": 1.0043407487791645, + "grad_norm": 8.110403060913086, + "learning_rate": 8.038234891051118e-06, + "loss": 0.415, + "step": 15066 + }, + { + "epoch": 1.0043543136190993, + "grad_norm": 5.475956439971924, + "learning_rate": 8.038097848430862e-06, + "loss": 0.313, + "step": 15067 + }, + { + "epoch": 1.0043678784590342, + "grad_norm": 4.607039928436279, + "learning_rate": 8.037960805810607e-06, + "loss": 0.1861, + "step": 15068 + }, + { + "epoch": 1.004381443298969, + "grad_norm": 3.604198694229126, + "learning_rate": 8.037823763190354e-06, + "loss": 0.192, + "step": 15069 + }, + { + "epoch": 1.004395008138904, + "grad_norm": 5.017603874206543, + "learning_rate": 8.037686720570097e-06, + "loss": 0.3536, + "step": 15070 + }, + { + "epoch": 1.0044085729788388, + "grad_norm": 5.384247779846191, + "learning_rate": 8.037549677949843e-06, + "loss": 0.1983, + "step": 15071 + }, + { + "epoch": 1.0044221378187737, + "grad_norm": 7.127419948577881, + "learning_rate": 8.037412635329588e-06, + "loss": 0.4003, + "step": 15072 + }, + { + "epoch": 1.0044357026587085, + "grad_norm": 6.137596607208252, + "learning_rate": 8.037275592709333e-06, + "loss": 0.3031, + "step": 15073 + }, + { + "epoch": 1.0044492674986436, + "grad_norm": 6.118683338165283, + "learning_rate": 8.037138550089078e-06, + "loss": 0.2663, + "step": 15074 + }, + { + "epoch": 1.0044628323385785, + "grad_norm": 7.759317874908447, + "learning_rate": 8.037001507468823e-06, + "loss": 0.4417, + "step": 15075 + }, + { + "epoch": 1.0044763971785133, + "grad_norm": 4.925467014312744, + "learning_rate": 8.036864464848569e-06, + "loss": 0.267, + "step": 15076 + }, + { + "epoch": 1.0044899620184482, + "grad_norm": 8.876575469970703, + "learning_rate": 8.036727422228314e-06, + "loss": 0.3122, + "step": 15077 + }, + { + "epoch": 1.004503526858383, + "grad_norm": 6.534013748168945, + "learning_rate": 8.036590379608059e-06, + "loss": 0.2696, + "step": 15078 + }, + { + "epoch": 1.004517091698318, + "grad_norm": 5.560133457183838, + "learning_rate": 8.036453336987804e-06, + "loss": 0.2222, + "step": 15079 + }, + { + "epoch": 1.0045306565382528, + "grad_norm": 5.115878582000732, + "learning_rate": 8.03631629436755e-06, + "loss": 0.3001, + "step": 15080 + }, + { + "epoch": 1.0045442213781877, + "grad_norm": 6.243271350860596, + "learning_rate": 8.036179251747294e-06, + "loss": 0.3403, + "step": 15081 + }, + { + "epoch": 1.0045577862181225, + "grad_norm": 5.988765239715576, + "learning_rate": 8.03604220912704e-06, + "loss": 0.387, + "step": 15082 + }, + { + "epoch": 1.0045713510580576, + "grad_norm": 6.113719463348389, + "learning_rate": 8.035905166506785e-06, + "loss": 0.2609, + "step": 15083 + }, + { + "epoch": 1.0045849158979925, + "grad_norm": 5.911016464233398, + "learning_rate": 8.03576812388653e-06, + "loss": 0.2526, + "step": 15084 + }, + { + "epoch": 1.0045984807379273, + "grad_norm": 8.145856857299805, + "learning_rate": 8.035631081266273e-06, + "loss": 0.2966, + "step": 15085 + }, + { + "epoch": 1.0046120455778622, + "grad_norm": 6.795959949493408, + "learning_rate": 8.03549403864602e-06, + "loss": 0.3566, + "step": 15086 + }, + { + "epoch": 1.004625610417797, + "grad_norm": 5.221805095672607, + "learning_rate": 8.035356996025766e-06, + "loss": 0.2669, + "step": 15087 + }, + { + "epoch": 1.004639175257732, + "grad_norm": 7.154123783111572, + "learning_rate": 8.035219953405509e-06, + "loss": 0.3176, + "step": 15088 + }, + { + "epoch": 1.0046527400976668, + "grad_norm": 6.088968753814697, + "learning_rate": 8.035082910785254e-06, + "loss": 0.254, + "step": 15089 + }, + { + "epoch": 1.0046663049376017, + "grad_norm": 6.027446269989014, + "learning_rate": 8.034945868165e-06, + "loss": 0.3563, + "step": 15090 + }, + { + "epoch": 1.0046798697775365, + "grad_norm": 4.318092346191406, + "learning_rate": 8.034808825544746e-06, + "loss": 0.1809, + "step": 15091 + }, + { + "epoch": 1.0046934346174714, + "grad_norm": 6.998733997344971, + "learning_rate": 8.03467178292449e-06, + "loss": 0.3918, + "step": 15092 + }, + { + "epoch": 1.0047069994574065, + "grad_norm": 7.317354679107666, + "learning_rate": 8.034534740304235e-06, + "loss": 0.3589, + "step": 15093 + }, + { + "epoch": 1.0047205642973414, + "grad_norm": 5.908191680908203, + "learning_rate": 8.03439769768398e-06, + "loss": 0.3325, + "step": 15094 + }, + { + "epoch": 1.0047341291372762, + "grad_norm": 6.156616687774658, + "learning_rate": 8.034260655063725e-06, + "loss": 0.3586, + "step": 15095 + }, + { + "epoch": 1.004747693977211, + "grad_norm": 5.301631927490234, + "learning_rate": 8.03412361244347e-06, + "loss": 0.3179, + "step": 15096 + }, + { + "epoch": 1.004761258817146, + "grad_norm": 5.621920108795166, + "learning_rate": 8.033986569823216e-06, + "loss": 0.2693, + "step": 15097 + }, + { + "epoch": 1.0047748236570808, + "grad_norm": 5.413774490356445, + "learning_rate": 8.03384952720296e-06, + "loss": 0.2367, + "step": 15098 + }, + { + "epoch": 1.0047883884970157, + "grad_norm": 5.746584892272949, + "learning_rate": 8.033712484582706e-06, + "loss": 0.2866, + "step": 15099 + }, + { + "epoch": 1.0048019533369505, + "grad_norm": 8.477224349975586, + "learning_rate": 8.033575441962451e-06, + "loss": 0.3769, + "step": 15100 + }, + { + "epoch": 1.0048155181768854, + "grad_norm": 5.042596817016602, + "learning_rate": 8.033438399342196e-06, + "loss": 0.2973, + "step": 15101 + }, + { + "epoch": 1.0048290830168205, + "grad_norm": 4.187417507171631, + "learning_rate": 8.033301356721941e-06, + "loss": 0.263, + "step": 15102 + }, + { + "epoch": 1.0048426478567554, + "grad_norm": 5.581655979156494, + "learning_rate": 8.033164314101685e-06, + "loss": 0.2766, + "step": 15103 + }, + { + "epoch": 1.0048562126966902, + "grad_norm": 6.208231449127197, + "learning_rate": 8.033027271481432e-06, + "loss": 0.3335, + "step": 15104 + }, + { + "epoch": 1.004869777536625, + "grad_norm": 6.842191219329834, + "learning_rate": 8.032890228861177e-06, + "loss": 0.3606, + "step": 15105 + }, + { + "epoch": 1.00488334237656, + "grad_norm": 3.851562738418579, + "learning_rate": 8.032753186240922e-06, + "loss": 0.2229, + "step": 15106 + }, + { + "epoch": 1.0048969072164948, + "grad_norm": 5.017131805419922, + "learning_rate": 8.032616143620666e-06, + "loss": 0.2209, + "step": 15107 + }, + { + "epoch": 1.0049104720564297, + "grad_norm": 4.590054512023926, + "learning_rate": 8.032479101000413e-06, + "loss": 0.2232, + "step": 15108 + }, + { + "epoch": 1.0049240368963646, + "grad_norm": 5.157252311706543, + "learning_rate": 8.032342058380158e-06, + "loss": 0.2094, + "step": 15109 + }, + { + "epoch": 1.0049376017362994, + "grad_norm": 5.224742412567139, + "learning_rate": 8.032205015759901e-06, + "loss": 0.2475, + "step": 15110 + }, + { + "epoch": 1.0049511665762343, + "grad_norm": 5.068774700164795, + "learning_rate": 8.032067973139646e-06, + "loss": 0.2347, + "step": 15111 + }, + { + "epoch": 1.0049647314161694, + "grad_norm": 5.171662330627441, + "learning_rate": 8.031930930519393e-06, + "loss": 0.3427, + "step": 15112 + }, + { + "epoch": 1.0049782962561042, + "grad_norm": 5.300124645233154, + "learning_rate": 8.031793887899137e-06, + "loss": 0.1985, + "step": 15113 + }, + { + "epoch": 1.004991861096039, + "grad_norm": 5.8696513175964355, + "learning_rate": 8.031656845278882e-06, + "loss": 0.2618, + "step": 15114 + }, + { + "epoch": 1.005005425935974, + "grad_norm": 4.465357303619385, + "learning_rate": 8.031519802658627e-06, + "loss": 0.2726, + "step": 15115 + }, + { + "epoch": 1.0050189907759088, + "grad_norm": 4.581040382385254, + "learning_rate": 8.031382760038372e-06, + "loss": 0.1974, + "step": 15116 + }, + { + "epoch": 1.0050325556158437, + "grad_norm": 4.744213581085205, + "learning_rate": 8.031245717418117e-06, + "loss": 0.2737, + "step": 15117 + }, + { + "epoch": 1.0050461204557786, + "grad_norm": 5.441195011138916, + "learning_rate": 8.031108674797863e-06, + "loss": 0.2804, + "step": 15118 + }, + { + "epoch": 1.0050596852957134, + "grad_norm": 7.093509197235107, + "learning_rate": 8.030971632177608e-06, + "loss": 0.3608, + "step": 15119 + }, + { + "epoch": 1.0050732501356483, + "grad_norm": 6.002425670623779, + "learning_rate": 8.030834589557353e-06, + "loss": 0.2815, + "step": 15120 + }, + { + "epoch": 1.0050868149755834, + "grad_norm": 3.788926124572754, + "learning_rate": 8.030697546937098e-06, + "loss": 0.2002, + "step": 15121 + }, + { + "epoch": 1.0051003798155183, + "grad_norm": 6.3231072425842285, + "learning_rate": 8.030560504316843e-06, + "loss": 0.3293, + "step": 15122 + }, + { + "epoch": 1.0051139446554531, + "grad_norm": 4.276627540588379, + "learning_rate": 8.030423461696589e-06, + "loss": 0.2017, + "step": 15123 + }, + { + "epoch": 1.005127509495388, + "grad_norm": 5.808063983917236, + "learning_rate": 8.030286419076334e-06, + "loss": 0.2615, + "step": 15124 + }, + { + "epoch": 1.0051410743353228, + "grad_norm": 6.847846031188965, + "learning_rate": 8.030149376456079e-06, + "loss": 0.2802, + "step": 15125 + }, + { + "epoch": 1.0051546391752577, + "grad_norm": 5.081049919128418, + "learning_rate": 8.030012333835824e-06, + "loss": 0.1796, + "step": 15126 + }, + { + "epoch": 1.0051682040151926, + "grad_norm": 7.427545070648193, + "learning_rate": 8.02987529121557e-06, + "loss": 0.3214, + "step": 15127 + }, + { + "epoch": 1.0051817688551274, + "grad_norm": 6.591705799102783, + "learning_rate": 8.029738248595313e-06, + "loss": 0.3085, + "step": 15128 + }, + { + "epoch": 1.0051953336950623, + "grad_norm": 6.061633110046387, + "learning_rate": 8.02960120597506e-06, + "loss": 0.3194, + "step": 15129 + }, + { + "epoch": 1.0052088985349974, + "grad_norm": 4.1930365562438965, + "learning_rate": 8.029464163354805e-06, + "loss": 0.2638, + "step": 15130 + }, + { + "epoch": 1.0052224633749323, + "grad_norm": 6.639125347137451, + "learning_rate": 8.029327120734548e-06, + "loss": 0.3531, + "step": 15131 + }, + { + "epoch": 1.0052360282148671, + "grad_norm": 5.491297721862793, + "learning_rate": 8.029190078114293e-06, + "loss": 0.2426, + "step": 15132 + }, + { + "epoch": 1.005249593054802, + "grad_norm": 5.82700252532959, + "learning_rate": 8.029053035494039e-06, + "loss": 0.4555, + "step": 15133 + }, + { + "epoch": 1.0052631578947369, + "grad_norm": 7.600548267364502, + "learning_rate": 8.028915992873786e-06, + "loss": 0.4314, + "step": 15134 + }, + { + "epoch": 1.0052767227346717, + "grad_norm": 5.431621074676514, + "learning_rate": 8.028778950253529e-06, + "loss": 0.3552, + "step": 15135 + }, + { + "epoch": 1.0052902875746066, + "grad_norm": 6.382257461547852, + "learning_rate": 8.028641907633274e-06, + "loss": 0.4466, + "step": 15136 + }, + { + "epoch": 1.0053038524145415, + "grad_norm": 5.787542343139648, + "learning_rate": 8.02850486501302e-06, + "loss": 0.2925, + "step": 15137 + }, + { + "epoch": 1.0053174172544763, + "grad_norm": 5.425489902496338, + "learning_rate": 8.028367822392765e-06, + "loss": 0.1953, + "step": 15138 + }, + { + "epoch": 1.0053309820944112, + "grad_norm": 4.134778022766113, + "learning_rate": 8.02823077977251e-06, + "loss": 0.3411, + "step": 15139 + }, + { + "epoch": 1.0053445469343463, + "grad_norm": 6.122762680053711, + "learning_rate": 8.028093737152255e-06, + "loss": 0.3353, + "step": 15140 + }, + { + "epoch": 1.0053581117742811, + "grad_norm": 4.979694366455078, + "learning_rate": 8.027956694532e-06, + "loss": 0.2926, + "step": 15141 + }, + { + "epoch": 1.005371676614216, + "grad_norm": 4.401403427124023, + "learning_rate": 8.027819651911745e-06, + "loss": 0.2164, + "step": 15142 + }, + { + "epoch": 1.0053852414541509, + "grad_norm": 5.583656311035156, + "learning_rate": 8.02768260929149e-06, + "loss": 0.2361, + "step": 15143 + }, + { + "epoch": 1.0053988062940857, + "grad_norm": 7.024211406707764, + "learning_rate": 8.027545566671236e-06, + "loss": 0.5608, + "step": 15144 + }, + { + "epoch": 1.0054123711340206, + "grad_norm": 4.958329677581787, + "learning_rate": 8.02740852405098e-06, + "loss": 0.2524, + "step": 15145 + }, + { + "epoch": 1.0054259359739555, + "grad_norm": 4.507344722747803, + "learning_rate": 8.027271481430724e-06, + "loss": 0.2417, + "step": 15146 + }, + { + "epoch": 1.0054395008138903, + "grad_norm": 7.393750190734863, + "learning_rate": 8.027134438810471e-06, + "loss": 0.2665, + "step": 15147 + }, + { + "epoch": 1.0054530656538252, + "grad_norm": 4.226733207702637, + "learning_rate": 8.026997396190216e-06, + "loss": 0.2672, + "step": 15148 + }, + { + "epoch": 1.0054666304937603, + "grad_norm": 4.262189865112305, + "learning_rate": 8.026860353569962e-06, + "loss": 0.2863, + "step": 15149 + }, + { + "epoch": 1.0054801953336951, + "grad_norm": 5.384862899780273, + "learning_rate": 8.026723310949705e-06, + "loss": 0.282, + "step": 15150 + }, + { + "epoch": 1.00549376017363, + "grad_norm": 5.084153175354004, + "learning_rate": 8.026586268329452e-06, + "loss": 0.3028, + "step": 15151 + }, + { + "epoch": 1.0055073250135649, + "grad_norm": 6.013461112976074, + "learning_rate": 8.026449225709197e-06, + "loss": 0.2997, + "step": 15152 + }, + { + "epoch": 1.0055208898534997, + "grad_norm": 4.778938293457031, + "learning_rate": 8.02631218308894e-06, + "loss": 0.3355, + "step": 15153 + }, + { + "epoch": 1.0055344546934346, + "grad_norm": 7.232961177825928, + "learning_rate": 8.026175140468686e-06, + "loss": 0.3697, + "step": 15154 + }, + { + "epoch": 1.0055480195333695, + "grad_norm": 6.278641223907471, + "learning_rate": 8.026038097848433e-06, + "loss": 0.3748, + "step": 15155 + }, + { + "epoch": 1.0055615843733043, + "grad_norm": 4.688333034515381, + "learning_rate": 8.025901055228176e-06, + "loss": 0.2679, + "step": 15156 + }, + { + "epoch": 1.0055751492132392, + "grad_norm": 6.202794075012207, + "learning_rate": 8.025764012607921e-06, + "loss": 0.29, + "step": 15157 + }, + { + "epoch": 1.005588714053174, + "grad_norm": 5.148886680603027, + "learning_rate": 8.025626969987666e-06, + "loss": 0.3224, + "step": 15158 + }, + { + "epoch": 1.0056022788931092, + "grad_norm": 6.0243096351623535, + "learning_rate": 8.025489927367412e-06, + "loss": 0.4176, + "step": 15159 + }, + { + "epoch": 1.005615843733044, + "grad_norm": 4.161607265472412, + "learning_rate": 8.025352884747157e-06, + "loss": 0.1914, + "step": 15160 + }, + { + "epoch": 1.0056294085729789, + "grad_norm": 3.554285764694214, + "learning_rate": 8.025215842126902e-06, + "loss": 0.1967, + "step": 15161 + }, + { + "epoch": 1.0056429734129138, + "grad_norm": 4.338027000427246, + "learning_rate": 8.025078799506647e-06, + "loss": 0.2991, + "step": 15162 + }, + { + "epoch": 1.0056565382528486, + "grad_norm": 5.115423202514648, + "learning_rate": 8.024941756886392e-06, + "loss": 0.3486, + "step": 15163 + }, + { + "epoch": 1.0056701030927835, + "grad_norm": 4.639038562774658, + "learning_rate": 8.024804714266138e-06, + "loss": 0.2641, + "step": 15164 + }, + { + "epoch": 1.0056836679327184, + "grad_norm": 5.5227484703063965, + "learning_rate": 8.024667671645883e-06, + "loss": 0.3471, + "step": 15165 + }, + { + "epoch": 1.0056972327726532, + "grad_norm": 5.214727401733398, + "learning_rate": 8.024530629025628e-06, + "loss": 0.3195, + "step": 15166 + }, + { + "epoch": 1.005710797612588, + "grad_norm": 4.625952243804932, + "learning_rate": 8.024393586405373e-06, + "loss": 0.1805, + "step": 15167 + }, + { + "epoch": 1.0057243624525232, + "grad_norm": 6.488541126251221, + "learning_rate": 8.024256543785118e-06, + "loss": 0.3081, + "step": 15168 + }, + { + "epoch": 1.005737927292458, + "grad_norm": 4.052435874938965, + "learning_rate": 8.024119501164863e-06, + "loss": 0.2006, + "step": 15169 + }, + { + "epoch": 1.005751492132393, + "grad_norm": 3.7351572513580322, + "learning_rate": 8.023982458544609e-06, + "loss": 0.2379, + "step": 15170 + }, + { + "epoch": 1.0057650569723278, + "grad_norm": 5.351043224334717, + "learning_rate": 8.023845415924352e-06, + "loss": 0.2585, + "step": 15171 + }, + { + "epoch": 1.0057786218122626, + "grad_norm": 4.30222225189209, + "learning_rate": 8.023708373304097e-06, + "loss": 0.2398, + "step": 15172 + }, + { + "epoch": 1.0057921866521975, + "grad_norm": 4.633973598480225, + "learning_rate": 8.023571330683844e-06, + "loss": 0.2673, + "step": 15173 + }, + { + "epoch": 1.0058057514921324, + "grad_norm": 5.078998565673828, + "learning_rate": 8.02343428806359e-06, + "loss": 0.3642, + "step": 15174 + }, + { + "epoch": 1.0058193163320672, + "grad_norm": 4.747900485992432, + "learning_rate": 8.023297245443333e-06, + "loss": 0.1829, + "step": 15175 + }, + { + "epoch": 1.005832881172002, + "grad_norm": 4.630624771118164, + "learning_rate": 8.023160202823078e-06, + "loss": 0.2111, + "step": 15176 + }, + { + "epoch": 1.005846446011937, + "grad_norm": 5.632998943328857, + "learning_rate": 8.023023160202825e-06, + "loss": 0.3018, + "step": 15177 + }, + { + "epoch": 1.005860010851872, + "grad_norm": 4.035528182983398, + "learning_rate": 8.022886117582568e-06, + "loss": 0.2137, + "step": 15178 + }, + { + "epoch": 1.005873575691807, + "grad_norm": 4.572933673858643, + "learning_rate": 8.022749074962313e-06, + "loss": 0.2972, + "step": 15179 + }, + { + "epoch": 1.0058871405317418, + "grad_norm": 4.528546333312988, + "learning_rate": 8.022612032342059e-06, + "loss": 0.2673, + "step": 15180 + }, + { + "epoch": 1.0059007053716766, + "grad_norm": 5.34894323348999, + "learning_rate": 8.022474989721804e-06, + "loss": 0.288, + "step": 15181 + }, + { + "epoch": 1.0059142702116115, + "grad_norm": 4.923062801361084, + "learning_rate": 8.022337947101549e-06, + "loss": 0.2339, + "step": 15182 + }, + { + "epoch": 1.0059278350515464, + "grad_norm": 5.152289390563965, + "learning_rate": 8.022200904481294e-06, + "loss": 0.2154, + "step": 15183 + }, + { + "epoch": 1.0059413998914812, + "grad_norm": 6.00935697555542, + "learning_rate": 8.02206386186104e-06, + "loss": 0.371, + "step": 15184 + }, + { + "epoch": 1.005954964731416, + "grad_norm": 6.052182674407959, + "learning_rate": 8.021926819240785e-06, + "loss": 0.4412, + "step": 15185 + }, + { + "epoch": 1.005968529571351, + "grad_norm": 5.343756675720215, + "learning_rate": 8.02178977662053e-06, + "loss": 0.2999, + "step": 15186 + }, + { + "epoch": 1.005982094411286, + "grad_norm": 7.852680206298828, + "learning_rate": 8.021652734000275e-06, + "loss": 0.403, + "step": 15187 + }, + { + "epoch": 1.005995659251221, + "grad_norm": 5.384501934051514, + "learning_rate": 8.02151569138002e-06, + "loss": 0.2276, + "step": 15188 + }, + { + "epoch": 1.0060092240911558, + "grad_norm": 4.856984615325928, + "learning_rate": 8.021378648759765e-06, + "loss": 0.3335, + "step": 15189 + }, + { + "epoch": 1.0060227889310907, + "grad_norm": 5.266666412353516, + "learning_rate": 8.02124160613951e-06, + "loss": 0.1951, + "step": 15190 + }, + { + "epoch": 1.0060363537710255, + "grad_norm": 6.36564302444458, + "learning_rate": 8.021104563519256e-06, + "loss": 0.3358, + "step": 15191 + }, + { + "epoch": 1.0060499186109604, + "grad_norm": 6.003376007080078, + "learning_rate": 8.020967520899e-06, + "loss": 0.28, + "step": 15192 + }, + { + "epoch": 1.0060634834508952, + "grad_norm": 5.524771213531494, + "learning_rate": 8.020830478278744e-06, + "loss": 0.2963, + "step": 15193 + }, + { + "epoch": 1.0060770482908301, + "grad_norm": 5.731869220733643, + "learning_rate": 8.020693435658491e-06, + "loss": 0.3084, + "step": 15194 + }, + { + "epoch": 1.006090613130765, + "grad_norm": 6.211330413818359, + "learning_rate": 8.020556393038236e-06, + "loss": 0.2632, + "step": 15195 + }, + { + "epoch": 1.0061041779706998, + "grad_norm": 6.982635021209717, + "learning_rate": 8.02041935041798e-06, + "loss": 0.2608, + "step": 15196 + }, + { + "epoch": 1.006117742810635, + "grad_norm": 5.841463088989258, + "learning_rate": 8.020282307797725e-06, + "loss": 0.2715, + "step": 15197 + }, + { + "epoch": 1.0061313076505698, + "grad_norm": 5.901645660400391, + "learning_rate": 8.020145265177472e-06, + "loss": 0.2712, + "step": 15198 + }, + { + "epoch": 1.0061448724905047, + "grad_norm": 7.472258567810059, + "learning_rate": 8.020008222557217e-06, + "loss": 0.3531, + "step": 15199 + }, + { + "epoch": 1.0061584373304395, + "grad_norm": 7.20953369140625, + "learning_rate": 8.01987117993696e-06, + "loss": 0.383, + "step": 15200 + }, + { + "epoch": 1.0061720021703744, + "grad_norm": 10.687433242797852, + "learning_rate": 8.019734137316706e-06, + "loss": 0.5786, + "step": 15201 + }, + { + "epoch": 1.0061855670103093, + "grad_norm": 5.6677327156066895, + "learning_rate": 8.019597094696451e-06, + "loss": 0.3073, + "step": 15202 + }, + { + "epoch": 1.0061991318502441, + "grad_norm": 6.134860038757324, + "learning_rate": 8.019460052076196e-06, + "loss": 0.2514, + "step": 15203 + }, + { + "epoch": 1.006212696690179, + "grad_norm": 7.8116888999938965, + "learning_rate": 8.019323009455941e-06, + "loss": 0.3777, + "step": 15204 + }, + { + "epoch": 1.0062262615301139, + "grad_norm": 5.381431579589844, + "learning_rate": 8.019185966835686e-06, + "loss": 0.3315, + "step": 15205 + }, + { + "epoch": 1.006239826370049, + "grad_norm": 5.472871780395508, + "learning_rate": 8.019048924215432e-06, + "loss": 0.2473, + "step": 15206 + }, + { + "epoch": 1.0062533912099838, + "grad_norm": 5.492198467254639, + "learning_rate": 8.018911881595177e-06, + "loss": 0.3358, + "step": 15207 + }, + { + "epoch": 1.0062669560499187, + "grad_norm": 6.58248233795166, + "learning_rate": 8.018774838974922e-06, + "loss": 0.2407, + "step": 15208 + }, + { + "epoch": 1.0062805208898535, + "grad_norm": 6.873067378997803, + "learning_rate": 8.018637796354667e-06, + "loss": 0.3519, + "step": 15209 + }, + { + "epoch": 1.0062940857297884, + "grad_norm": 5.127682685852051, + "learning_rate": 8.018500753734412e-06, + "loss": 0.3269, + "step": 15210 + }, + { + "epoch": 1.0063076505697233, + "grad_norm": 4.5548601150512695, + "learning_rate": 8.018363711114158e-06, + "loss": 0.3614, + "step": 15211 + }, + { + "epoch": 1.0063212154096581, + "grad_norm": 4.904389381408691, + "learning_rate": 8.018226668493903e-06, + "loss": 0.2698, + "step": 15212 + }, + { + "epoch": 1.006334780249593, + "grad_norm": 6.692370891571045, + "learning_rate": 8.018089625873648e-06, + "loss": 0.3081, + "step": 15213 + }, + { + "epoch": 1.0063483450895279, + "grad_norm": 4.446905136108398, + "learning_rate": 8.017952583253391e-06, + "loss": 0.292, + "step": 15214 + }, + { + "epoch": 1.0063619099294627, + "grad_norm": 5.7462849617004395, + "learning_rate": 8.017815540633137e-06, + "loss": 0.328, + "step": 15215 + }, + { + "epoch": 1.0063754747693978, + "grad_norm": 7.0821146965026855, + "learning_rate": 8.017678498012883e-06, + "loss": 0.4226, + "step": 15216 + }, + { + "epoch": 1.0063890396093327, + "grad_norm": 6.169015884399414, + "learning_rate": 8.017541455392629e-06, + "loss": 0.3514, + "step": 15217 + }, + { + "epoch": 1.0064026044492675, + "grad_norm": 5.626319885253906, + "learning_rate": 8.017404412772372e-06, + "loss": 0.2791, + "step": 15218 + }, + { + "epoch": 1.0064161692892024, + "grad_norm": 6.3792405128479, + "learning_rate": 8.017267370152117e-06, + "loss": 0.3063, + "step": 15219 + }, + { + "epoch": 1.0064297341291373, + "grad_norm": 6.902448654174805, + "learning_rate": 8.017130327531864e-06, + "loss": 0.3762, + "step": 15220 + }, + { + "epoch": 1.0064432989690721, + "grad_norm": 7.0163164138793945, + "learning_rate": 8.016993284911608e-06, + "loss": 0.4651, + "step": 15221 + }, + { + "epoch": 1.006456863809007, + "grad_norm": 7.678739547729492, + "learning_rate": 8.016856242291353e-06, + "loss": 0.3978, + "step": 15222 + }, + { + "epoch": 1.0064704286489419, + "grad_norm": 6.476564407348633, + "learning_rate": 8.016719199671098e-06, + "loss": 0.3296, + "step": 15223 + }, + { + "epoch": 1.0064839934888767, + "grad_norm": 6.843644618988037, + "learning_rate": 8.016582157050843e-06, + "loss": 0.403, + "step": 15224 + }, + { + "epoch": 1.0064975583288118, + "grad_norm": 4.541141986846924, + "learning_rate": 8.016445114430588e-06, + "loss": 0.3661, + "step": 15225 + }, + { + "epoch": 1.0065111231687467, + "grad_norm": 4.730322360992432, + "learning_rate": 8.016308071810334e-06, + "loss": 0.3002, + "step": 15226 + }, + { + "epoch": 1.0065246880086816, + "grad_norm": 6.789733409881592, + "learning_rate": 8.016171029190079e-06, + "loss": 0.3437, + "step": 15227 + }, + { + "epoch": 1.0065382528486164, + "grad_norm": 8.670904159545898, + "learning_rate": 8.016033986569824e-06, + "loss": 0.4687, + "step": 15228 + }, + { + "epoch": 1.0065518176885513, + "grad_norm": 9.283117294311523, + "learning_rate": 8.015896943949569e-06, + "loss": 0.4695, + "step": 15229 + }, + { + "epoch": 1.0065653825284862, + "grad_norm": 6.733113765716553, + "learning_rate": 8.015759901329314e-06, + "loss": 0.301, + "step": 15230 + }, + { + "epoch": 1.006578947368421, + "grad_norm": 6.281452178955078, + "learning_rate": 8.01562285870906e-06, + "loss": 0.2662, + "step": 15231 + }, + { + "epoch": 1.0065925122083559, + "grad_norm": 5.6582417488098145, + "learning_rate": 8.015485816088805e-06, + "loss": 0.3152, + "step": 15232 + }, + { + "epoch": 1.0066060770482907, + "grad_norm": 5.278957366943359, + "learning_rate": 8.01534877346855e-06, + "loss": 0.253, + "step": 15233 + }, + { + "epoch": 1.0066196418882256, + "grad_norm": 6.00456428527832, + "learning_rate": 8.015211730848295e-06, + "loss": 0.2729, + "step": 15234 + }, + { + "epoch": 1.0066332067281607, + "grad_norm": 6.392640113830566, + "learning_rate": 8.01507468822804e-06, + "loss": 0.435, + "step": 15235 + }, + { + "epoch": 1.0066467715680956, + "grad_norm": 5.692556858062744, + "learning_rate": 8.014937645607784e-06, + "loss": 0.2071, + "step": 15236 + }, + { + "epoch": 1.0066603364080304, + "grad_norm": 4.7591938972473145, + "learning_rate": 8.01480060298753e-06, + "loss": 0.268, + "step": 15237 + }, + { + "epoch": 1.0066739012479653, + "grad_norm": 4.991698741912842, + "learning_rate": 8.014663560367276e-06, + "loss": 0.2208, + "step": 15238 + }, + { + "epoch": 1.0066874660879002, + "grad_norm": 5.05987024307251, + "learning_rate": 8.014526517747019e-06, + "loss": 0.2427, + "step": 15239 + }, + { + "epoch": 1.006701030927835, + "grad_norm": 5.92965030670166, + "learning_rate": 8.014389475126764e-06, + "loss": 0.2604, + "step": 15240 + }, + { + "epoch": 1.00671459576777, + "grad_norm": 5.5594587326049805, + "learning_rate": 8.01425243250651e-06, + "loss": 0.3196, + "step": 15241 + }, + { + "epoch": 1.0067281606077048, + "grad_norm": 4.038362979888916, + "learning_rate": 8.014115389886256e-06, + "loss": 0.2046, + "step": 15242 + }, + { + "epoch": 1.0067417254476396, + "grad_norm": 5.7191290855407715, + "learning_rate": 8.013978347266e-06, + "loss": 0.3142, + "step": 15243 + }, + { + "epoch": 1.0067552902875747, + "grad_norm": 5.77618932723999, + "learning_rate": 8.013841304645745e-06, + "loss": 0.2797, + "step": 15244 + }, + { + "epoch": 1.0067688551275096, + "grad_norm": 6.601249694824219, + "learning_rate": 8.01370426202549e-06, + "loss": 0.3249, + "step": 15245 + }, + { + "epoch": 1.0067824199674444, + "grad_norm": 5.420748233795166, + "learning_rate": 8.013567219405235e-06, + "loss": 0.339, + "step": 15246 + }, + { + "epoch": 1.0067959848073793, + "grad_norm": 7.026862144470215, + "learning_rate": 8.01343017678498e-06, + "loss": 0.3082, + "step": 15247 + }, + { + "epoch": 1.0068095496473142, + "grad_norm": 5.304634094238281, + "learning_rate": 8.013293134164726e-06, + "loss": 0.1952, + "step": 15248 + }, + { + "epoch": 1.006823114487249, + "grad_norm": 4.462350368499756, + "learning_rate": 8.013156091544471e-06, + "loss": 0.2192, + "step": 15249 + }, + { + "epoch": 1.006836679327184, + "grad_norm": 6.480578899383545, + "learning_rate": 8.013019048924216e-06, + "loss": 0.257, + "step": 15250 + }, + { + "epoch": 1.0068502441671188, + "grad_norm": 4.355633735656738, + "learning_rate": 8.012882006303961e-06, + "loss": 0.2136, + "step": 15251 + }, + { + "epoch": 1.0068638090070536, + "grad_norm": 3.1808483600616455, + "learning_rate": 8.012744963683706e-06, + "loss": 0.1851, + "step": 15252 + }, + { + "epoch": 1.0068773738469885, + "grad_norm": 5.172558784484863, + "learning_rate": 8.012607921063452e-06, + "loss": 0.2819, + "step": 15253 + }, + { + "epoch": 1.0068909386869236, + "grad_norm": 5.389993667602539, + "learning_rate": 8.012470878443197e-06, + "loss": 0.2788, + "step": 15254 + }, + { + "epoch": 1.0069045035268585, + "grad_norm": 5.589458465576172, + "learning_rate": 8.012333835822942e-06, + "loss": 0.202, + "step": 15255 + }, + { + "epoch": 1.0069180683667933, + "grad_norm": 5.506933689117432, + "learning_rate": 8.012196793202687e-06, + "loss": 0.2035, + "step": 15256 + }, + { + "epoch": 1.0069316332067282, + "grad_norm": 5.0213799476623535, + "learning_rate": 8.012059750582432e-06, + "loss": 0.2248, + "step": 15257 + }, + { + "epoch": 1.006945198046663, + "grad_norm": 4.551644802093506, + "learning_rate": 8.011922707962176e-06, + "loss": 0.2706, + "step": 15258 + }, + { + "epoch": 1.006958762886598, + "grad_norm": 4.661775588989258, + "learning_rate": 8.011785665341923e-06, + "loss": 0.1986, + "step": 15259 + }, + { + "epoch": 1.0069723277265328, + "grad_norm": 4.381368160247803, + "learning_rate": 8.011648622721668e-06, + "loss": 0.2366, + "step": 15260 + }, + { + "epoch": 1.0069858925664676, + "grad_norm": 4.840775012969971, + "learning_rate": 8.011511580101411e-06, + "loss": 0.2779, + "step": 15261 + }, + { + "epoch": 1.0069994574064025, + "grad_norm": 6.564239978790283, + "learning_rate": 8.011374537481157e-06, + "loss": 0.3646, + "step": 15262 + }, + { + "epoch": 1.0070130222463376, + "grad_norm": 4.2808918952941895, + "learning_rate": 8.011237494860903e-06, + "loss": 0.2536, + "step": 15263 + }, + { + "epoch": 1.0070265870862725, + "grad_norm": 8.869726181030273, + "learning_rate": 8.011100452240647e-06, + "loss": 0.4273, + "step": 15264 + }, + { + "epoch": 1.0070401519262073, + "grad_norm": 4.1470627784729, + "learning_rate": 8.010963409620392e-06, + "loss": 0.2305, + "step": 15265 + }, + { + "epoch": 1.0070537167661422, + "grad_norm": 5.733607769012451, + "learning_rate": 8.010826367000137e-06, + "loss": 0.3416, + "step": 15266 + }, + { + "epoch": 1.007067281606077, + "grad_norm": 6.09649133682251, + "learning_rate": 8.010689324379884e-06, + "loss": 0.2625, + "step": 15267 + }, + { + "epoch": 1.007080846446012, + "grad_norm": 6.608217239379883, + "learning_rate": 8.010552281759628e-06, + "loss": 0.3195, + "step": 15268 + }, + { + "epoch": 1.0070944112859468, + "grad_norm": 3.1914055347442627, + "learning_rate": 8.010415239139373e-06, + "loss": 0.1351, + "step": 15269 + }, + { + "epoch": 1.0071079761258817, + "grad_norm": 5.630845546722412, + "learning_rate": 8.010278196519118e-06, + "loss": 0.265, + "step": 15270 + }, + { + "epoch": 1.0071215409658165, + "grad_norm": 6.946951866149902, + "learning_rate": 8.010141153898863e-06, + "loss": 0.3171, + "step": 15271 + }, + { + "epoch": 1.0071351058057514, + "grad_norm": 6.025160789489746, + "learning_rate": 8.010004111278608e-06, + "loss": 0.249, + "step": 15272 + }, + { + "epoch": 1.0071486706456865, + "grad_norm": 5.259746551513672, + "learning_rate": 8.009867068658354e-06, + "loss": 0.27, + "step": 15273 + }, + { + "epoch": 1.0071622354856213, + "grad_norm": 5.807216167449951, + "learning_rate": 8.009730026038099e-06, + "loss": 0.3556, + "step": 15274 + }, + { + "epoch": 1.0071758003255562, + "grad_norm": 3.8520255088806152, + "learning_rate": 8.009592983417844e-06, + "loss": 0.2219, + "step": 15275 + }, + { + "epoch": 1.007189365165491, + "grad_norm": 4.539440631866455, + "learning_rate": 8.009455940797589e-06, + "loss": 0.2064, + "step": 15276 + }, + { + "epoch": 1.007202930005426, + "grad_norm": 4.857297897338867, + "learning_rate": 8.009318898177334e-06, + "loss": 0.1978, + "step": 15277 + }, + { + "epoch": 1.0072164948453608, + "grad_norm": 6.350337028503418, + "learning_rate": 8.00918185555708e-06, + "loss": 0.246, + "step": 15278 + }, + { + "epoch": 1.0072300596852957, + "grad_norm": 5.804589748382568, + "learning_rate": 8.009044812936823e-06, + "loss": 0.278, + "step": 15279 + }, + { + "epoch": 1.0072436245252305, + "grad_norm": 7.0521321296691895, + "learning_rate": 8.00890777031657e-06, + "loss": 0.3198, + "step": 15280 + }, + { + "epoch": 1.0072571893651654, + "grad_norm": 6.087032318115234, + "learning_rate": 8.008770727696315e-06, + "loss": 0.3139, + "step": 15281 + }, + { + "epoch": 1.0072707542051005, + "grad_norm": 4.390254974365234, + "learning_rate": 8.00863368507606e-06, + "loss": 0.2391, + "step": 15282 + }, + { + "epoch": 1.0072843190450353, + "grad_norm": 7.232161045074463, + "learning_rate": 8.008496642455804e-06, + "loss": 0.2641, + "step": 15283 + }, + { + "epoch": 1.0072978838849702, + "grad_norm": 6.0610575675964355, + "learning_rate": 8.008359599835549e-06, + "loss": 0.2597, + "step": 15284 + }, + { + "epoch": 1.007311448724905, + "grad_norm": 4.538976192474365, + "learning_rate": 8.008222557215296e-06, + "loss": 0.1742, + "step": 15285 + }, + { + "epoch": 1.00732501356484, + "grad_norm": 5.727993965148926, + "learning_rate": 8.00808551459504e-06, + "loss": 0.2997, + "step": 15286 + }, + { + "epoch": 1.0073385784047748, + "grad_norm": 5.904645919799805, + "learning_rate": 8.007948471974784e-06, + "loss": 0.294, + "step": 15287 + }, + { + "epoch": 1.0073521432447097, + "grad_norm": 6.1570892333984375, + "learning_rate": 8.00781142935453e-06, + "loss": 0.4171, + "step": 15288 + }, + { + "epoch": 1.0073657080846445, + "grad_norm": 6.72199821472168, + "learning_rate": 8.007674386734275e-06, + "loss": 0.3023, + "step": 15289 + }, + { + "epoch": 1.0073792729245794, + "grad_norm": 6.462567329406738, + "learning_rate": 8.00753734411402e-06, + "loss": 0.4194, + "step": 15290 + }, + { + "epoch": 1.0073928377645143, + "grad_norm": 5.9930195808410645, + "learning_rate": 8.007400301493765e-06, + "loss": 0.2954, + "step": 15291 + }, + { + "epoch": 1.0074064026044494, + "grad_norm": 4.850340843200684, + "learning_rate": 8.00726325887351e-06, + "loss": 0.2474, + "step": 15292 + }, + { + "epoch": 1.0074199674443842, + "grad_norm": 5.3575053215026855, + "learning_rate": 8.007126216253255e-06, + "loss": 0.2803, + "step": 15293 + }, + { + "epoch": 1.007433532284319, + "grad_norm": 7.459170341491699, + "learning_rate": 8.006989173633e-06, + "loss": 0.3222, + "step": 15294 + }, + { + "epoch": 1.007447097124254, + "grad_norm": 5.2099456787109375, + "learning_rate": 8.006852131012746e-06, + "loss": 0.332, + "step": 15295 + }, + { + "epoch": 1.0074606619641888, + "grad_norm": 4.892189979553223, + "learning_rate": 8.006715088392491e-06, + "loss": 0.265, + "step": 15296 + }, + { + "epoch": 1.0074742268041237, + "grad_norm": 4.726377964019775, + "learning_rate": 8.006578045772236e-06, + "loss": 0.2432, + "step": 15297 + }, + { + "epoch": 1.0074877916440586, + "grad_norm": 5.929107666015625, + "learning_rate": 8.006441003151981e-06, + "loss": 0.2526, + "step": 15298 + }, + { + "epoch": 1.0075013564839934, + "grad_norm": 6.544143199920654, + "learning_rate": 8.006303960531726e-06, + "loss": 0.3384, + "step": 15299 + }, + { + "epoch": 1.0075149213239283, + "grad_norm": 4.0601701736450195, + "learning_rate": 8.006166917911472e-06, + "loss": 0.1914, + "step": 15300 + }, + { + "epoch": 1.0075284861638634, + "grad_norm": 3.9642159938812256, + "learning_rate": 8.006029875291215e-06, + "loss": 0.2524, + "step": 15301 + }, + { + "epoch": 1.0075420510037982, + "grad_norm": 5.392520427703857, + "learning_rate": 8.005892832670962e-06, + "loss": 0.2384, + "step": 15302 + }, + { + "epoch": 1.007555615843733, + "grad_norm": 7.5465288162231445, + "learning_rate": 8.005755790050707e-06, + "loss": 0.3424, + "step": 15303 + }, + { + "epoch": 1.007569180683668, + "grad_norm": 4.255457878112793, + "learning_rate": 8.00561874743045e-06, + "loss": 0.2805, + "step": 15304 + }, + { + "epoch": 1.0075827455236028, + "grad_norm": 5.4627766609191895, + "learning_rate": 8.005481704810196e-06, + "loss": 0.2811, + "step": 15305 + }, + { + "epoch": 1.0075963103635377, + "grad_norm": 4.073343753814697, + "learning_rate": 8.005344662189943e-06, + "loss": 0.1933, + "step": 15306 + }, + { + "epoch": 1.0076098752034726, + "grad_norm": 3.9019317626953125, + "learning_rate": 8.005207619569686e-06, + "loss": 0.2091, + "step": 15307 + }, + { + "epoch": 1.0076234400434074, + "grad_norm": 5.535930633544922, + "learning_rate": 8.005070576949431e-06, + "loss": 0.2735, + "step": 15308 + }, + { + "epoch": 1.0076370048833423, + "grad_norm": 3.592935800552368, + "learning_rate": 8.004933534329177e-06, + "loss": 0.1924, + "step": 15309 + }, + { + "epoch": 1.0076505697232772, + "grad_norm": 3.888251781463623, + "learning_rate": 8.004796491708922e-06, + "loss": 0.2566, + "step": 15310 + }, + { + "epoch": 1.0076641345632122, + "grad_norm": 3.681950330734253, + "learning_rate": 8.004659449088667e-06, + "loss": 0.2383, + "step": 15311 + }, + { + "epoch": 1.007677699403147, + "grad_norm": 4.083428859710693, + "learning_rate": 8.004522406468412e-06, + "loss": 0.1848, + "step": 15312 + }, + { + "epoch": 1.007691264243082, + "grad_norm": 4.629627704620361, + "learning_rate": 8.004385363848157e-06, + "loss": 0.2997, + "step": 15313 + }, + { + "epoch": 1.0077048290830168, + "grad_norm": 4.503089427947998, + "learning_rate": 8.004248321227902e-06, + "loss": 0.1887, + "step": 15314 + }, + { + "epoch": 1.0077183939229517, + "grad_norm": 3.8684678077697754, + "learning_rate": 8.004111278607648e-06, + "loss": 0.1906, + "step": 15315 + }, + { + "epoch": 1.0077319587628866, + "grad_norm": 4.591444492340088, + "learning_rate": 8.003974235987393e-06, + "loss": 0.2427, + "step": 15316 + }, + { + "epoch": 1.0077455236028214, + "grad_norm": 4.484841346740723, + "learning_rate": 8.003837193367138e-06, + "loss": 0.2929, + "step": 15317 + }, + { + "epoch": 1.0077590884427563, + "grad_norm": 3.919616937637329, + "learning_rate": 8.003700150746883e-06, + "loss": 0.2337, + "step": 15318 + }, + { + "epoch": 1.0077726532826912, + "grad_norm": 4.13856315612793, + "learning_rate": 8.003563108126628e-06, + "loss": 0.1712, + "step": 15319 + }, + { + "epoch": 1.0077862181226263, + "grad_norm": 5.359622955322266, + "learning_rate": 8.003426065506374e-06, + "loss": 0.3093, + "step": 15320 + }, + { + "epoch": 1.0077997829625611, + "grad_norm": 3.3300230503082275, + "learning_rate": 8.003289022886119e-06, + "loss": 0.1686, + "step": 15321 + }, + { + "epoch": 1.007813347802496, + "grad_norm": 4.872478485107422, + "learning_rate": 8.003151980265862e-06, + "loss": 0.2736, + "step": 15322 + }, + { + "epoch": 1.0078269126424308, + "grad_norm": 4.91886043548584, + "learning_rate": 8.003014937645607e-06, + "loss": 0.2603, + "step": 15323 + }, + { + "epoch": 1.0078404774823657, + "grad_norm": 3.9816222190856934, + "learning_rate": 8.002877895025354e-06, + "loss": 0.1538, + "step": 15324 + }, + { + "epoch": 1.0078540423223006, + "grad_norm": 4.115257740020752, + "learning_rate": 8.0027408524051e-06, + "loss": 0.2414, + "step": 15325 + }, + { + "epoch": 1.0078676071622354, + "grad_norm": 3.942568302154541, + "learning_rate": 8.002603809784843e-06, + "loss": 0.182, + "step": 15326 + }, + { + "epoch": 1.0078811720021703, + "grad_norm": 4.558807849884033, + "learning_rate": 8.002466767164588e-06, + "loss": 0.1898, + "step": 15327 + }, + { + "epoch": 1.0078947368421052, + "grad_norm": 5.391519069671631, + "learning_rate": 8.002329724544335e-06, + "loss": 0.2012, + "step": 15328 + }, + { + "epoch": 1.00790830168204, + "grad_norm": 3.546560287475586, + "learning_rate": 8.002192681924078e-06, + "loss": 0.1315, + "step": 15329 + }, + { + "epoch": 1.0079218665219751, + "grad_norm": 5.706191062927246, + "learning_rate": 8.002055639303824e-06, + "loss": 0.245, + "step": 15330 + }, + { + "epoch": 1.00793543136191, + "grad_norm": 4.63547945022583, + "learning_rate": 8.001918596683569e-06, + "loss": 0.325, + "step": 15331 + }, + { + "epoch": 1.0079489962018449, + "grad_norm": 4.600983619689941, + "learning_rate": 8.001781554063314e-06, + "loss": 0.2274, + "step": 15332 + }, + { + "epoch": 1.0079625610417797, + "grad_norm": 5.072913646697998, + "learning_rate": 8.00164451144306e-06, + "loss": 0.2803, + "step": 15333 + }, + { + "epoch": 1.0079761258817146, + "grad_norm": 5.1763482093811035, + "learning_rate": 8.001507468822804e-06, + "loss": 0.369, + "step": 15334 + }, + { + "epoch": 1.0079896907216495, + "grad_norm": 4.271310329437256, + "learning_rate": 8.00137042620255e-06, + "loss": 0.2607, + "step": 15335 + }, + { + "epoch": 1.0080032555615843, + "grad_norm": 4.197829723358154, + "learning_rate": 8.001233383582295e-06, + "loss": 0.2345, + "step": 15336 + }, + { + "epoch": 1.0080168204015192, + "grad_norm": 5.548948764801025, + "learning_rate": 8.00109634096204e-06, + "loss": 0.2863, + "step": 15337 + }, + { + "epoch": 1.008030385241454, + "grad_norm": 3.8819074630737305, + "learning_rate": 8.000959298341785e-06, + "loss": 0.1844, + "step": 15338 + }, + { + "epoch": 1.0080439500813891, + "grad_norm": 5.42387056350708, + "learning_rate": 8.00082225572153e-06, + "loss": 0.3861, + "step": 15339 + }, + { + "epoch": 1.008057514921324, + "grad_norm": 3.7852494716644287, + "learning_rate": 8.000685213101275e-06, + "loss": 0.2613, + "step": 15340 + }, + { + "epoch": 1.0080710797612589, + "grad_norm": 7.433146953582764, + "learning_rate": 8.00054817048102e-06, + "loss": 0.3327, + "step": 15341 + }, + { + "epoch": 1.0080846446011937, + "grad_norm": 5.054966449737549, + "learning_rate": 8.000411127860766e-06, + "loss": 0.3187, + "step": 15342 + }, + { + "epoch": 1.0080982094411286, + "grad_norm": 5.975323677062988, + "learning_rate": 8.000274085240511e-06, + "loss": 0.3009, + "step": 15343 + }, + { + "epoch": 1.0081117742810635, + "grad_norm": 4.84951639175415, + "learning_rate": 8.000137042620254e-06, + "loss": 0.2186, + "step": 15344 + }, + { + "epoch": 1.0081253391209983, + "grad_norm": 4.9378204345703125, + "learning_rate": 8.000000000000001e-06, + "loss": 0.2023, + "step": 15345 + }, + { + "epoch": 1.0081389039609332, + "grad_norm": 5.923002243041992, + "learning_rate": 7.999862957379747e-06, + "loss": 0.3438, + "step": 15346 + }, + { + "epoch": 1.008152468800868, + "grad_norm": 5.323639392852783, + "learning_rate": 7.99972591475949e-06, + "loss": 0.4244, + "step": 15347 + }, + { + "epoch": 1.0081660336408031, + "grad_norm": 4.50208044052124, + "learning_rate": 7.999588872139235e-06, + "loss": 0.2636, + "step": 15348 + }, + { + "epoch": 1.008179598480738, + "grad_norm": 5.206104278564453, + "learning_rate": 7.999451829518982e-06, + "loss": 0.1457, + "step": 15349 + }, + { + "epoch": 1.0081931633206729, + "grad_norm": 6.642426013946533, + "learning_rate": 7.999314786898727e-06, + "loss": 0.2994, + "step": 15350 + }, + { + "epoch": 1.0082067281606077, + "grad_norm": 6.079967498779297, + "learning_rate": 7.99917774427847e-06, + "loss": 0.3419, + "step": 15351 + }, + { + "epoch": 1.0082202930005426, + "grad_norm": 4.293821811676025, + "learning_rate": 7.999040701658216e-06, + "loss": 0.2695, + "step": 15352 + }, + { + "epoch": 1.0082338578404775, + "grad_norm": 3.9046099185943604, + "learning_rate": 7.998903659037961e-06, + "loss": 0.2314, + "step": 15353 + }, + { + "epoch": 1.0082474226804123, + "grad_norm": 4.245595455169678, + "learning_rate": 7.998766616417706e-06, + "loss": 0.3928, + "step": 15354 + }, + { + "epoch": 1.0082609875203472, + "grad_norm": 4.932961463928223, + "learning_rate": 7.998629573797451e-06, + "loss": 0.4539, + "step": 15355 + }, + { + "epoch": 1.008274552360282, + "grad_norm": 5.152522563934326, + "learning_rate": 7.998492531177197e-06, + "loss": 0.2827, + "step": 15356 + }, + { + "epoch": 1.008288117200217, + "grad_norm": 5.706017017364502, + "learning_rate": 7.998355488556942e-06, + "loss": 0.3875, + "step": 15357 + }, + { + "epoch": 1.008301682040152, + "grad_norm": 6.142543792724609, + "learning_rate": 7.998218445936687e-06, + "loss": 0.3451, + "step": 15358 + }, + { + "epoch": 1.008315246880087, + "grad_norm": 4.545180797576904, + "learning_rate": 7.998081403316432e-06, + "loss": 0.2453, + "step": 15359 + }, + { + "epoch": 1.0083288117200218, + "grad_norm": 4.3503737449646, + "learning_rate": 7.997944360696177e-06, + "loss": 0.2205, + "step": 15360 + }, + { + "epoch": 1.0083423765599566, + "grad_norm": 5.965540409088135, + "learning_rate": 7.997807318075923e-06, + "loss": 0.3018, + "step": 15361 + }, + { + "epoch": 1.0083559413998915, + "grad_norm": 6.689228534698486, + "learning_rate": 7.997670275455668e-06, + "loss": 0.3117, + "step": 15362 + }, + { + "epoch": 1.0083695062398264, + "grad_norm": 4.690853118896484, + "learning_rate": 7.997533232835413e-06, + "loss": 0.2823, + "step": 15363 + }, + { + "epoch": 1.0083830710797612, + "grad_norm": 6.157199382781982, + "learning_rate": 7.997396190215158e-06, + "loss": 0.3068, + "step": 15364 + }, + { + "epoch": 1.008396635919696, + "grad_norm": 5.556918621063232, + "learning_rate": 7.997259147594903e-06, + "loss": 0.3444, + "step": 15365 + }, + { + "epoch": 1.008410200759631, + "grad_norm": 5.250025749206543, + "learning_rate": 7.997122104974647e-06, + "loss": 0.2684, + "step": 15366 + }, + { + "epoch": 1.008423765599566, + "grad_norm": 6.64558744430542, + "learning_rate": 7.996985062354394e-06, + "loss": 0.2942, + "step": 15367 + }, + { + "epoch": 1.008437330439501, + "grad_norm": 6.275256156921387, + "learning_rate": 7.996848019734139e-06, + "loss": 0.2781, + "step": 15368 + }, + { + "epoch": 1.0084508952794358, + "grad_norm": 6.124628067016602, + "learning_rate": 7.996710977113882e-06, + "loss": 0.3361, + "step": 15369 + }, + { + "epoch": 1.0084644601193706, + "grad_norm": 5.091064453125, + "learning_rate": 7.996573934493627e-06, + "loss": 0.5201, + "step": 15370 + }, + { + "epoch": 1.0084780249593055, + "grad_norm": 4.580848693847656, + "learning_rate": 7.996436891873374e-06, + "loss": 0.2376, + "step": 15371 + }, + { + "epoch": 1.0084915897992404, + "grad_norm": 5.964513301849365, + "learning_rate": 7.996299849253118e-06, + "loss": 0.3255, + "step": 15372 + }, + { + "epoch": 1.0085051546391752, + "grad_norm": 5.847951412200928, + "learning_rate": 7.996162806632863e-06, + "loss": 0.3301, + "step": 15373 + }, + { + "epoch": 1.00851871947911, + "grad_norm": 4.605025768280029, + "learning_rate": 7.996025764012608e-06, + "loss": 0.2311, + "step": 15374 + }, + { + "epoch": 1.008532284319045, + "grad_norm": 3.8780887126922607, + "learning_rate": 7.995888721392355e-06, + "loss": 0.2191, + "step": 15375 + }, + { + "epoch": 1.0085458491589798, + "grad_norm": 6.985201835632324, + "learning_rate": 7.995751678772098e-06, + "loss": 0.4073, + "step": 15376 + }, + { + "epoch": 1.008559413998915, + "grad_norm": 5.218849182128906, + "learning_rate": 7.995614636151844e-06, + "loss": 0.4338, + "step": 15377 + }, + { + "epoch": 1.0085729788388498, + "grad_norm": 4.2816643714904785, + "learning_rate": 7.995477593531589e-06, + "loss": 0.2077, + "step": 15378 + }, + { + "epoch": 1.0085865436787846, + "grad_norm": 7.340961933135986, + "learning_rate": 7.995340550911334e-06, + "loss": 0.4565, + "step": 15379 + }, + { + "epoch": 1.0086001085187195, + "grad_norm": 7.796713352203369, + "learning_rate": 7.99520350829108e-06, + "loss": 0.4048, + "step": 15380 + }, + { + "epoch": 1.0086136733586544, + "grad_norm": 5.130324363708496, + "learning_rate": 7.995066465670824e-06, + "loss": 0.2953, + "step": 15381 + }, + { + "epoch": 1.0086272381985892, + "grad_norm": 7.464971542358398, + "learning_rate": 7.99492942305057e-06, + "loss": 0.2697, + "step": 15382 + }, + { + "epoch": 1.008640803038524, + "grad_norm": 5.448214530944824, + "learning_rate": 7.994792380430315e-06, + "loss": 0.3163, + "step": 15383 + }, + { + "epoch": 1.008654367878459, + "grad_norm": 5.426538467407227, + "learning_rate": 7.99465533781006e-06, + "loss": 0.2915, + "step": 15384 + }, + { + "epoch": 1.0086679327183938, + "grad_norm": 6.104264736175537, + "learning_rate": 7.994518295189805e-06, + "loss": 0.2454, + "step": 15385 + }, + { + "epoch": 1.008681497558329, + "grad_norm": 6.540727615356445, + "learning_rate": 7.99438125256955e-06, + "loss": 0.2834, + "step": 15386 + }, + { + "epoch": 1.0086950623982638, + "grad_norm": 6.181093215942383, + "learning_rate": 7.994244209949294e-06, + "loss": 0.2678, + "step": 15387 + }, + { + "epoch": 1.0087086272381987, + "grad_norm": 5.464309215545654, + "learning_rate": 7.99410716732904e-06, + "loss": 0.3628, + "step": 15388 + }, + { + "epoch": 1.0087221920781335, + "grad_norm": 5.274040699005127, + "learning_rate": 7.993970124708786e-06, + "loss": 0.1742, + "step": 15389 + }, + { + "epoch": 1.0087357569180684, + "grad_norm": 6.673478603363037, + "learning_rate": 7.993833082088531e-06, + "loss": 0.2908, + "step": 15390 + }, + { + "epoch": 1.0087493217580032, + "grad_norm": 4.212286949157715, + "learning_rate": 7.993696039468274e-06, + "loss": 0.206, + "step": 15391 + }, + { + "epoch": 1.0087628865979381, + "grad_norm": 5.533988952636719, + "learning_rate": 7.99355899684802e-06, + "loss": 0.2143, + "step": 15392 + }, + { + "epoch": 1.008776451437873, + "grad_norm": 6.879020690917969, + "learning_rate": 7.993421954227767e-06, + "loss": 0.3622, + "step": 15393 + }, + { + "epoch": 1.0087900162778078, + "grad_norm": 5.575109481811523, + "learning_rate": 7.99328491160751e-06, + "loss": 0.3171, + "step": 15394 + }, + { + "epoch": 1.0088035811177427, + "grad_norm": 5.5102643966674805, + "learning_rate": 7.993147868987255e-06, + "loss": 0.2514, + "step": 15395 + }, + { + "epoch": 1.0088171459576778, + "grad_norm": 5.861166000366211, + "learning_rate": 7.993010826367e-06, + "loss": 0.2565, + "step": 15396 + }, + { + "epoch": 1.0088307107976127, + "grad_norm": 5.878359794616699, + "learning_rate": 7.992873783746746e-06, + "loss": 0.3969, + "step": 15397 + }, + { + "epoch": 1.0088442756375475, + "grad_norm": 6.639707088470459, + "learning_rate": 7.99273674112649e-06, + "loss": 0.2399, + "step": 15398 + }, + { + "epoch": 1.0088578404774824, + "grad_norm": 5.196722507476807, + "learning_rate": 7.992599698506236e-06, + "loss": 0.2664, + "step": 15399 + }, + { + "epoch": 1.0088714053174173, + "grad_norm": 5.910519599914551, + "learning_rate": 7.992462655885981e-06, + "loss": 0.4276, + "step": 15400 + }, + { + "epoch": 1.0088849701573521, + "grad_norm": 7.473750591278076, + "learning_rate": 7.992325613265726e-06, + "loss": 0.3458, + "step": 15401 + }, + { + "epoch": 1.008898534997287, + "grad_norm": 6.043802738189697, + "learning_rate": 7.992188570645471e-06, + "loss": 0.2991, + "step": 15402 + }, + { + "epoch": 1.0089120998372219, + "grad_norm": 4.443703651428223, + "learning_rate": 7.992051528025217e-06, + "loss": 0.2444, + "step": 15403 + }, + { + "epoch": 1.0089256646771567, + "grad_norm": 6.036920547485352, + "learning_rate": 7.991914485404962e-06, + "loss": 0.2933, + "step": 15404 + }, + { + "epoch": 1.0089392295170918, + "grad_norm": 5.666077613830566, + "learning_rate": 7.991777442784707e-06, + "loss": 0.244, + "step": 15405 + }, + { + "epoch": 1.0089527943570267, + "grad_norm": 6.023773193359375, + "learning_rate": 7.991640400164452e-06, + "loss": 0.2188, + "step": 15406 + }, + { + "epoch": 1.0089663591969615, + "grad_norm": 6.983550071716309, + "learning_rate": 7.991503357544197e-06, + "loss": 0.4214, + "step": 15407 + }, + { + "epoch": 1.0089799240368964, + "grad_norm": 6.917233467102051, + "learning_rate": 7.991366314923943e-06, + "loss": 0.3504, + "step": 15408 + }, + { + "epoch": 1.0089934888768313, + "grad_norm": 5.618245601654053, + "learning_rate": 7.991229272303686e-06, + "loss": 0.2939, + "step": 15409 + }, + { + "epoch": 1.0090070537167661, + "grad_norm": 6.320700168609619, + "learning_rate": 7.991092229683433e-06, + "loss": 0.2368, + "step": 15410 + }, + { + "epoch": 1.009020618556701, + "grad_norm": 7.052990436553955, + "learning_rate": 7.990955187063178e-06, + "loss": 0.4267, + "step": 15411 + }, + { + "epoch": 1.0090341833966359, + "grad_norm": 6.743345737457275, + "learning_rate": 7.990818144442922e-06, + "loss": 0.3937, + "step": 15412 + }, + { + "epoch": 1.0090477482365707, + "grad_norm": 5.677554607391357, + "learning_rate": 7.990681101822667e-06, + "loss": 0.4581, + "step": 15413 + }, + { + "epoch": 1.0090613130765056, + "grad_norm": 6.160210132598877, + "learning_rate": 7.990544059202414e-06, + "loss": 0.3031, + "step": 15414 + }, + { + "epoch": 1.0090748779164407, + "grad_norm": 6.437899589538574, + "learning_rate": 7.990407016582157e-06, + "loss": 0.3202, + "step": 15415 + }, + { + "epoch": 1.0090884427563755, + "grad_norm": 5.713270664215088, + "learning_rate": 7.990269973961902e-06, + "loss": 0.3356, + "step": 15416 + }, + { + "epoch": 1.0091020075963104, + "grad_norm": 6.252500534057617, + "learning_rate": 7.990132931341647e-06, + "loss": 0.2663, + "step": 15417 + }, + { + "epoch": 1.0091155724362453, + "grad_norm": 5.371798038482666, + "learning_rate": 7.989995888721394e-06, + "loss": 0.3612, + "step": 15418 + }, + { + "epoch": 1.0091291372761801, + "grad_norm": 5.694352149963379, + "learning_rate": 7.989858846101138e-06, + "loss": 0.4359, + "step": 15419 + }, + { + "epoch": 1.009142702116115, + "grad_norm": 5.150974750518799, + "learning_rate": 7.989721803480883e-06, + "loss": 0.269, + "step": 15420 + }, + { + "epoch": 1.0091562669560499, + "grad_norm": 7.174580097198486, + "learning_rate": 7.989584760860628e-06, + "loss": 0.3739, + "step": 15421 + }, + { + "epoch": 1.0091698317959847, + "grad_norm": 6.883986473083496, + "learning_rate": 7.989447718240373e-06, + "loss": 0.3604, + "step": 15422 + }, + { + "epoch": 1.0091833966359196, + "grad_norm": 5.910471439361572, + "learning_rate": 7.989310675620119e-06, + "loss": 0.2526, + "step": 15423 + }, + { + "epoch": 1.0091969614758547, + "grad_norm": 6.204023838043213, + "learning_rate": 7.989173632999864e-06, + "loss": 0.4031, + "step": 15424 + }, + { + "epoch": 1.0092105263157896, + "grad_norm": 7.917608737945557, + "learning_rate": 7.989036590379609e-06, + "loss": 0.3644, + "step": 15425 + }, + { + "epoch": 1.0092240911557244, + "grad_norm": 5.231281757354736, + "learning_rate": 7.988899547759354e-06, + "loss": 0.3358, + "step": 15426 + }, + { + "epoch": 1.0092376559956593, + "grad_norm": 6.282519340515137, + "learning_rate": 7.9887625051391e-06, + "loss": 0.2201, + "step": 15427 + }, + { + "epoch": 1.0092512208355942, + "grad_norm": 9.140055656433105, + "learning_rate": 7.988625462518844e-06, + "loss": 0.5496, + "step": 15428 + }, + { + "epoch": 1.009264785675529, + "grad_norm": 4.946041107177734, + "learning_rate": 7.98848841989859e-06, + "loss": 0.2618, + "step": 15429 + }, + { + "epoch": 1.0092783505154639, + "grad_norm": 5.646052360534668, + "learning_rate": 7.988351377278333e-06, + "loss": 0.3271, + "step": 15430 + }, + { + "epoch": 1.0092919153553987, + "grad_norm": 7.931426048278809, + "learning_rate": 7.98821433465808e-06, + "loss": 0.376, + "step": 15431 + }, + { + "epoch": 1.0093054801953336, + "grad_norm": 4.785085201263428, + "learning_rate": 7.988077292037825e-06, + "loss": 0.2466, + "step": 15432 + }, + { + "epoch": 1.0093190450352685, + "grad_norm": 7.971913814544678, + "learning_rate": 7.98794024941757e-06, + "loss": 0.6013, + "step": 15433 + }, + { + "epoch": 1.0093326098752036, + "grad_norm": 5.740139484405518, + "learning_rate": 7.987803206797314e-06, + "loss": 0.3391, + "step": 15434 + }, + { + "epoch": 1.0093461747151384, + "grad_norm": 5.122690200805664, + "learning_rate": 7.987666164177059e-06, + "loss": 0.3031, + "step": 15435 + }, + { + "epoch": 1.0093597395550733, + "grad_norm": 4.573774814605713, + "learning_rate": 7.987529121556806e-06, + "loss": 0.2319, + "step": 15436 + }, + { + "epoch": 1.0093733043950082, + "grad_norm": 4.371373653411865, + "learning_rate": 7.98739207893655e-06, + "loss": 0.2406, + "step": 15437 + }, + { + "epoch": 1.009386869234943, + "grad_norm": 5.1142401695251465, + "learning_rate": 7.987255036316294e-06, + "loss": 0.2524, + "step": 15438 + }, + { + "epoch": 1.009400434074878, + "grad_norm": 4.27216100692749, + "learning_rate": 7.98711799369604e-06, + "loss": 0.2502, + "step": 15439 + }, + { + "epoch": 1.0094139989148128, + "grad_norm": 6.047669410705566, + "learning_rate": 7.986980951075785e-06, + "loss": 0.2715, + "step": 15440 + }, + { + "epoch": 1.0094275637547476, + "grad_norm": 6.500588417053223, + "learning_rate": 7.98684390845553e-06, + "loss": 0.4143, + "step": 15441 + }, + { + "epoch": 1.0094411285946825, + "grad_norm": 5.706118583679199, + "learning_rate": 7.986706865835275e-06, + "loss": 0.316, + "step": 15442 + }, + { + "epoch": 1.0094546934346176, + "grad_norm": 4.120292663574219, + "learning_rate": 7.98656982321502e-06, + "loss": 0.2263, + "step": 15443 + }, + { + "epoch": 1.0094682582745524, + "grad_norm": 4.205305099487305, + "learning_rate": 7.986432780594766e-06, + "loss": 0.2528, + "step": 15444 + }, + { + "epoch": 1.0094818231144873, + "grad_norm": 5.297993183135986, + "learning_rate": 7.98629573797451e-06, + "loss": 0.3162, + "step": 15445 + }, + { + "epoch": 1.0094953879544222, + "grad_norm": 5.846748352050781, + "learning_rate": 7.986158695354256e-06, + "loss": 0.2214, + "step": 15446 + }, + { + "epoch": 1.009508952794357, + "grad_norm": 4.801079750061035, + "learning_rate": 7.986021652734001e-06, + "loss": 0.1514, + "step": 15447 + }, + { + "epoch": 1.009522517634292, + "grad_norm": 5.1419243812561035, + "learning_rate": 7.985884610113746e-06, + "loss": 0.3093, + "step": 15448 + }, + { + "epoch": 1.0095360824742268, + "grad_norm": 7.087690353393555, + "learning_rate": 7.985747567493491e-06, + "loss": 0.379, + "step": 15449 + }, + { + "epoch": 1.0095496473141616, + "grad_norm": 3.7902886867523193, + "learning_rate": 7.985610524873237e-06, + "loss": 0.2596, + "step": 15450 + }, + { + "epoch": 1.0095632121540965, + "grad_norm": 5.701067924499512, + "learning_rate": 7.985473482252982e-06, + "loss": 0.1984, + "step": 15451 + }, + { + "epoch": 1.0095767769940314, + "grad_norm": 3.796286106109619, + "learning_rate": 7.985336439632725e-06, + "loss": 0.158, + "step": 15452 + }, + { + "epoch": 1.0095903418339665, + "grad_norm": 5.111917495727539, + "learning_rate": 7.985199397012472e-06, + "loss": 0.1397, + "step": 15453 + }, + { + "epoch": 1.0096039066739013, + "grad_norm": 5.383553504943848, + "learning_rate": 7.985062354392217e-06, + "loss": 0.2102, + "step": 15454 + }, + { + "epoch": 1.0096174715138362, + "grad_norm": 5.197627067565918, + "learning_rate": 7.98492531177196e-06, + "loss": 0.2552, + "step": 15455 + }, + { + "epoch": 1.009631036353771, + "grad_norm": 3.4046285152435303, + "learning_rate": 7.984788269151706e-06, + "loss": 0.1741, + "step": 15456 + }, + { + "epoch": 1.009644601193706, + "grad_norm": 4.798656940460205, + "learning_rate": 7.984651226531453e-06, + "loss": 0.211, + "step": 15457 + }, + { + "epoch": 1.0096581660336408, + "grad_norm": 5.312453269958496, + "learning_rate": 7.984514183911198e-06, + "loss": 0.3097, + "step": 15458 + }, + { + "epoch": 1.0096717308735756, + "grad_norm": 4.986576080322266, + "learning_rate": 7.984377141290942e-06, + "loss": 0.2365, + "step": 15459 + }, + { + "epoch": 1.0096852957135105, + "grad_norm": 5.044827938079834, + "learning_rate": 7.984240098670687e-06, + "loss": 0.2433, + "step": 15460 + }, + { + "epoch": 1.0096988605534454, + "grad_norm": 6.018990516662598, + "learning_rate": 7.984103056050432e-06, + "loss": 0.2999, + "step": 15461 + }, + { + "epoch": 1.0097124253933805, + "grad_norm": 5.437066555023193, + "learning_rate": 7.983966013430177e-06, + "loss": 0.2834, + "step": 15462 + }, + { + "epoch": 1.0097259902333153, + "grad_norm": 4.333658695220947, + "learning_rate": 7.983828970809922e-06, + "loss": 0.2942, + "step": 15463 + }, + { + "epoch": 1.0097395550732502, + "grad_norm": 4.773878574371338, + "learning_rate": 7.983691928189667e-06, + "loss": 0.1805, + "step": 15464 + }, + { + "epoch": 1.009753119913185, + "grad_norm": 4.627000331878662, + "learning_rate": 7.983554885569413e-06, + "loss": 0.2313, + "step": 15465 + }, + { + "epoch": 1.00976668475312, + "grad_norm": 4.49127197265625, + "learning_rate": 7.983417842949158e-06, + "loss": 0.1813, + "step": 15466 + }, + { + "epoch": 1.0097802495930548, + "grad_norm": 5.698016166687012, + "learning_rate": 7.983280800328903e-06, + "loss": 0.266, + "step": 15467 + }, + { + "epoch": 1.0097938144329897, + "grad_norm": 4.983004570007324, + "learning_rate": 7.983143757708648e-06, + "loss": 0.1772, + "step": 15468 + }, + { + "epoch": 1.0098073792729245, + "grad_norm": 4.723086357116699, + "learning_rate": 7.983006715088393e-06, + "loss": 0.1877, + "step": 15469 + }, + { + "epoch": 1.0098209441128594, + "grad_norm": 4.518036842346191, + "learning_rate": 7.982869672468139e-06, + "loss": 0.2234, + "step": 15470 + }, + { + "epoch": 1.0098345089527943, + "grad_norm": 3.3763816356658936, + "learning_rate": 7.982732629847884e-06, + "loss": 0.1603, + "step": 15471 + }, + { + "epoch": 1.0098480737927293, + "grad_norm": 4.362515926361084, + "learning_rate": 7.982595587227629e-06, + "loss": 0.2335, + "step": 15472 + }, + { + "epoch": 1.0098616386326642, + "grad_norm": 5.568718910217285, + "learning_rate": 7.982458544607374e-06, + "loss": 0.325, + "step": 15473 + }, + { + "epoch": 1.009875203472599, + "grad_norm": 3.980112314224243, + "learning_rate": 7.98232150198712e-06, + "loss": 0.1937, + "step": 15474 + }, + { + "epoch": 1.009888768312534, + "grad_norm": 4.5483856201171875, + "learning_rate": 7.982184459366864e-06, + "loss": 0.2196, + "step": 15475 + }, + { + "epoch": 1.0099023331524688, + "grad_norm": 4.5018439292907715, + "learning_rate": 7.98204741674661e-06, + "loss": 0.1804, + "step": 15476 + }, + { + "epoch": 1.0099158979924037, + "grad_norm": 6.100078582763672, + "learning_rate": 7.981910374126353e-06, + "loss": 0.2599, + "step": 15477 + }, + { + "epoch": 1.0099294628323385, + "grad_norm": 3.852926015853882, + "learning_rate": 7.981773331506098e-06, + "loss": 0.2158, + "step": 15478 + }, + { + "epoch": 1.0099430276722734, + "grad_norm": 4.319788932800293, + "learning_rate": 7.981636288885845e-06, + "loss": 0.1222, + "step": 15479 + }, + { + "epoch": 1.0099565925122083, + "grad_norm": 4.23624324798584, + "learning_rate": 7.981499246265589e-06, + "loss": 0.2006, + "step": 15480 + }, + { + "epoch": 1.0099701573521433, + "grad_norm": 6.253654479980469, + "learning_rate": 7.981362203645334e-06, + "loss": 0.3162, + "step": 15481 + }, + { + "epoch": 1.0099837221920782, + "grad_norm": 4.004226207733154, + "learning_rate": 7.981225161025079e-06, + "loss": 0.1852, + "step": 15482 + }, + { + "epoch": 1.009997287032013, + "grad_norm": 3.5497565269470215, + "learning_rate": 7.981088118404824e-06, + "loss": 0.1583, + "step": 15483 + }, + { + "epoch": 1.010010851871948, + "grad_norm": 3.989377021789551, + "learning_rate": 7.98095107578457e-06, + "loss": 0.2225, + "step": 15484 + }, + { + "epoch": 1.0100244167118828, + "grad_norm": 3.667205333709717, + "learning_rate": 7.980814033164315e-06, + "loss": 0.1798, + "step": 15485 + }, + { + "epoch": 1.0100379815518177, + "grad_norm": 4.949525356292725, + "learning_rate": 7.98067699054406e-06, + "loss": 0.2344, + "step": 15486 + }, + { + "epoch": 1.0100515463917525, + "grad_norm": 3.2417664527893066, + "learning_rate": 7.980539947923805e-06, + "loss": 0.1761, + "step": 15487 + }, + { + "epoch": 1.0100651112316874, + "grad_norm": 5.647882461547852, + "learning_rate": 7.98040290530355e-06, + "loss": 0.2128, + "step": 15488 + }, + { + "epoch": 1.0100786760716223, + "grad_norm": 3.3720850944519043, + "learning_rate": 7.980265862683295e-06, + "loss": 0.1393, + "step": 15489 + }, + { + "epoch": 1.0100922409115571, + "grad_norm": 5.6267595291137695, + "learning_rate": 7.98012882006304e-06, + "loss": 0.3467, + "step": 15490 + }, + { + "epoch": 1.0101058057514922, + "grad_norm": 4.77432107925415, + "learning_rate": 7.979991777442786e-06, + "loss": 0.2776, + "step": 15491 + }, + { + "epoch": 1.010119370591427, + "grad_norm": 3.389702320098877, + "learning_rate": 7.97985473482253e-06, + "loss": 0.2281, + "step": 15492 + }, + { + "epoch": 1.010132935431362, + "grad_norm": 3.4759182929992676, + "learning_rate": 7.979717692202276e-06, + "loss": 0.1988, + "step": 15493 + }, + { + "epoch": 1.0101465002712968, + "grad_norm": 4.803971767425537, + "learning_rate": 7.979580649582021e-06, + "loss": 0.2178, + "step": 15494 + }, + { + "epoch": 1.0101600651112317, + "grad_norm": 4.992018222808838, + "learning_rate": 7.979443606961765e-06, + "loss": 0.301, + "step": 15495 + }, + { + "epoch": 1.0101736299511666, + "grad_norm": 4.6022491455078125, + "learning_rate": 7.979306564341511e-06, + "loss": 0.1328, + "step": 15496 + }, + { + "epoch": 1.0101871947911014, + "grad_norm": 4.267611026763916, + "learning_rate": 7.979169521721257e-06, + "loss": 0.2018, + "step": 15497 + }, + { + "epoch": 1.0102007596310363, + "grad_norm": 4.914949893951416, + "learning_rate": 7.979032479101e-06, + "loss": 0.308, + "step": 15498 + }, + { + "epoch": 1.0102143244709711, + "grad_norm": 4.917319297790527, + "learning_rate": 7.978895436480745e-06, + "loss": 0.2821, + "step": 15499 + }, + { + "epoch": 1.0102278893109062, + "grad_norm": 4.769347190856934, + "learning_rate": 7.978758393860492e-06, + "loss": 0.2234, + "step": 15500 + }, + { + "epoch": 1.010241454150841, + "grad_norm": 2.7120561599731445, + "learning_rate": 7.978621351240237e-06, + "loss": 0.1814, + "step": 15501 + }, + { + "epoch": 1.010255018990776, + "grad_norm": 3.586052179336548, + "learning_rate": 7.978484308619981e-06, + "loss": 0.2237, + "step": 15502 + }, + { + "epoch": 1.0102685838307108, + "grad_norm": 3.8953793048858643, + "learning_rate": 7.978347265999726e-06, + "loss": 0.1753, + "step": 15503 + }, + { + "epoch": 1.0102821486706457, + "grad_norm": 3.7568984031677246, + "learning_rate": 7.978210223379471e-06, + "loss": 0.1731, + "step": 15504 + }, + { + "epoch": 1.0102957135105806, + "grad_norm": 4.5914812088012695, + "learning_rate": 7.978073180759216e-06, + "loss": 0.2307, + "step": 15505 + }, + { + "epoch": 1.0103092783505154, + "grad_norm": 3.8448259830474854, + "learning_rate": 7.977936138138962e-06, + "loss": 0.1658, + "step": 15506 + }, + { + "epoch": 1.0103228431904503, + "grad_norm": 3.624255895614624, + "learning_rate": 7.977799095518707e-06, + "loss": 0.2218, + "step": 15507 + }, + { + "epoch": 1.0103364080303852, + "grad_norm": 4.4208269119262695, + "learning_rate": 7.977662052898452e-06, + "loss": 0.2508, + "step": 15508 + }, + { + "epoch": 1.01034997287032, + "grad_norm": 4.416804313659668, + "learning_rate": 7.977525010278197e-06, + "loss": 0.2221, + "step": 15509 + }, + { + "epoch": 1.010363537710255, + "grad_norm": 3.238468647003174, + "learning_rate": 7.977387967657942e-06, + "loss": 0.1783, + "step": 15510 + }, + { + "epoch": 1.01037710255019, + "grad_norm": 5.117467403411865, + "learning_rate": 7.977250925037687e-06, + "loss": 0.2666, + "step": 15511 + }, + { + "epoch": 1.0103906673901248, + "grad_norm": 3.357419013977051, + "learning_rate": 7.977113882417433e-06, + "loss": 0.1724, + "step": 15512 + }, + { + "epoch": 1.0104042322300597, + "grad_norm": 5.478661060333252, + "learning_rate": 7.976976839797178e-06, + "loss": 0.2794, + "step": 15513 + }, + { + "epoch": 1.0104177970699946, + "grad_norm": 6.162991046905518, + "learning_rate": 7.976839797176923e-06, + "loss": 0.2105, + "step": 15514 + }, + { + "epoch": 1.0104313619099294, + "grad_norm": 3.7918450832366943, + "learning_rate": 7.976702754556668e-06, + "loss": 0.1885, + "step": 15515 + }, + { + "epoch": 1.0104449267498643, + "grad_norm": 6.387773036956787, + "learning_rate": 7.976565711936413e-06, + "loss": 0.2554, + "step": 15516 + }, + { + "epoch": 1.0104584915897992, + "grad_norm": 3.124281167984009, + "learning_rate": 7.976428669316157e-06, + "loss": 0.1659, + "step": 15517 + }, + { + "epoch": 1.010472056429734, + "grad_norm": 4.085077285766602, + "learning_rate": 7.976291626695904e-06, + "loss": 0.2342, + "step": 15518 + }, + { + "epoch": 1.0104856212696691, + "grad_norm": 5.540498733520508, + "learning_rate": 7.976154584075649e-06, + "loss": 0.3265, + "step": 15519 + }, + { + "epoch": 1.010499186109604, + "grad_norm": 4.570056915283203, + "learning_rate": 7.976017541455392e-06, + "loss": 0.2411, + "step": 15520 + }, + { + "epoch": 1.0105127509495389, + "grad_norm": 5.091660499572754, + "learning_rate": 7.975880498835138e-06, + "loss": 0.2365, + "step": 15521 + }, + { + "epoch": 1.0105263157894737, + "grad_norm": 6.643990516662598, + "learning_rate": 7.975743456214884e-06, + "loss": 0.3021, + "step": 15522 + }, + { + "epoch": 1.0105398806294086, + "grad_norm": 5.689020156860352, + "learning_rate": 7.975606413594628e-06, + "loss": 0.5129, + "step": 15523 + }, + { + "epoch": 1.0105534454693434, + "grad_norm": 4.467434406280518, + "learning_rate": 7.975469370974373e-06, + "loss": 0.2578, + "step": 15524 + }, + { + "epoch": 1.0105670103092783, + "grad_norm": 5.395776748657227, + "learning_rate": 7.975332328354118e-06, + "loss": 0.3806, + "step": 15525 + }, + { + "epoch": 1.0105805751492132, + "grad_norm": 4.255511283874512, + "learning_rate": 7.975195285733865e-06, + "loss": 0.2609, + "step": 15526 + }, + { + "epoch": 1.010594139989148, + "grad_norm": 6.319108486175537, + "learning_rate": 7.975058243113609e-06, + "loss": 0.2515, + "step": 15527 + }, + { + "epoch": 1.010607704829083, + "grad_norm": 3.4065582752227783, + "learning_rate": 7.974921200493354e-06, + "loss": 0.2038, + "step": 15528 + }, + { + "epoch": 1.010621269669018, + "grad_norm": 4.468949317932129, + "learning_rate": 7.974784157873099e-06, + "loss": 0.2537, + "step": 15529 + }, + { + "epoch": 1.0106348345089529, + "grad_norm": 8.570937156677246, + "learning_rate": 7.974647115252844e-06, + "loss": 0.4995, + "step": 15530 + }, + { + "epoch": 1.0106483993488877, + "grad_norm": 5.239192962646484, + "learning_rate": 7.97451007263259e-06, + "loss": 0.2445, + "step": 15531 + }, + { + "epoch": 1.0106619641888226, + "grad_norm": 5.475651741027832, + "learning_rate": 7.974373030012335e-06, + "loss": 0.3862, + "step": 15532 + }, + { + "epoch": 1.0106755290287575, + "grad_norm": 4.437652111053467, + "learning_rate": 7.97423598739208e-06, + "loss": 0.2168, + "step": 15533 + }, + { + "epoch": 1.0106890938686923, + "grad_norm": 4.424399375915527, + "learning_rate": 7.974098944771825e-06, + "loss": 0.2294, + "step": 15534 + }, + { + "epoch": 1.0107026587086272, + "grad_norm": 6.425760269165039, + "learning_rate": 7.97396190215157e-06, + "loss": 0.3295, + "step": 15535 + }, + { + "epoch": 1.010716223548562, + "grad_norm": 4.229187965393066, + "learning_rate": 7.973824859531315e-06, + "loss": 0.2094, + "step": 15536 + }, + { + "epoch": 1.010729788388497, + "grad_norm": 5.344478130340576, + "learning_rate": 7.97368781691106e-06, + "loss": 0.2931, + "step": 15537 + }, + { + "epoch": 1.010743353228432, + "grad_norm": 5.446079254150391, + "learning_rate": 7.973550774290804e-06, + "loss": 0.3075, + "step": 15538 + }, + { + "epoch": 1.0107569180683669, + "grad_norm": 6.376369476318359, + "learning_rate": 7.97341373167055e-06, + "loss": 0.5536, + "step": 15539 + }, + { + "epoch": 1.0107704829083017, + "grad_norm": 5.597986698150635, + "learning_rate": 7.973276689050296e-06, + "loss": 0.2487, + "step": 15540 + }, + { + "epoch": 1.0107840477482366, + "grad_norm": 6.281111717224121, + "learning_rate": 7.973139646430041e-06, + "loss": 0.1978, + "step": 15541 + }, + { + "epoch": 1.0107976125881715, + "grad_norm": 5.118190765380859, + "learning_rate": 7.973002603809785e-06, + "loss": 0.3519, + "step": 15542 + }, + { + "epoch": 1.0108111774281063, + "grad_norm": 5.017704486846924, + "learning_rate": 7.972865561189532e-06, + "loss": 0.3034, + "step": 15543 + }, + { + "epoch": 1.0108247422680412, + "grad_norm": 4.886582374572754, + "learning_rate": 7.972728518569277e-06, + "loss": 0.2791, + "step": 15544 + }, + { + "epoch": 1.010838307107976, + "grad_norm": 4.248204231262207, + "learning_rate": 7.97259147594902e-06, + "loss": 0.2764, + "step": 15545 + }, + { + "epoch": 1.010851871947911, + "grad_norm": 6.685595512390137, + "learning_rate": 7.972454433328765e-06, + "loss": 0.3829, + "step": 15546 + }, + { + "epoch": 1.0108654367878458, + "grad_norm": 6.9133219718933105, + "learning_rate": 7.97231739070851e-06, + "loss": 0.2613, + "step": 15547 + }, + { + "epoch": 1.0108790016277809, + "grad_norm": 4.124736309051514, + "learning_rate": 7.972180348088256e-06, + "loss": 0.1896, + "step": 15548 + }, + { + "epoch": 1.0108925664677157, + "grad_norm": 5.149864196777344, + "learning_rate": 7.972043305468001e-06, + "loss": 0.2864, + "step": 15549 + }, + { + "epoch": 1.0109061313076506, + "grad_norm": 6.386865615844727, + "learning_rate": 7.971906262847746e-06, + "loss": 0.3309, + "step": 15550 + }, + { + "epoch": 1.0109196961475855, + "grad_norm": 6.19879674911499, + "learning_rate": 7.971769220227491e-06, + "loss": 0.2959, + "step": 15551 + }, + { + "epoch": 1.0109332609875203, + "grad_norm": 4.7564496994018555, + "learning_rate": 7.971632177607236e-06, + "loss": 0.2095, + "step": 15552 + }, + { + "epoch": 1.0109468258274552, + "grad_norm": 5.9600114822387695, + "learning_rate": 7.971495134986982e-06, + "loss": 0.342, + "step": 15553 + }, + { + "epoch": 1.01096039066739, + "grad_norm": 5.9277024269104, + "learning_rate": 7.971358092366727e-06, + "loss": 0.3437, + "step": 15554 + }, + { + "epoch": 1.010973955507325, + "grad_norm": 6.251729488372803, + "learning_rate": 7.971221049746472e-06, + "loss": 0.357, + "step": 15555 + }, + { + "epoch": 1.0109875203472598, + "grad_norm": 4.7514190673828125, + "learning_rate": 7.971084007126217e-06, + "loss": 0.2318, + "step": 15556 + }, + { + "epoch": 1.011001085187195, + "grad_norm": 5.241632461547852, + "learning_rate": 7.970946964505962e-06, + "loss": 0.321, + "step": 15557 + }, + { + "epoch": 1.0110146500271298, + "grad_norm": 5.001289367675781, + "learning_rate": 7.970809921885707e-06, + "loss": 0.2867, + "step": 15558 + }, + { + "epoch": 1.0110282148670646, + "grad_norm": 5.805257797241211, + "learning_rate": 7.970672879265453e-06, + "loss": 0.3683, + "step": 15559 + }, + { + "epoch": 1.0110417797069995, + "grad_norm": 6.474325180053711, + "learning_rate": 7.970535836645196e-06, + "loss": 0.3552, + "step": 15560 + }, + { + "epoch": 1.0110553445469344, + "grad_norm": 5.9062347412109375, + "learning_rate": 7.970398794024943e-06, + "loss": 0.3775, + "step": 15561 + }, + { + "epoch": 1.0110689093868692, + "grad_norm": 5.088146686553955, + "learning_rate": 7.970261751404688e-06, + "loss": 0.2889, + "step": 15562 + }, + { + "epoch": 1.011082474226804, + "grad_norm": 5.971416473388672, + "learning_rate": 7.970124708784432e-06, + "loss": 0.3614, + "step": 15563 + }, + { + "epoch": 1.011096039066739, + "grad_norm": 3.9855237007141113, + "learning_rate": 7.969987666164177e-06, + "loss": 0.2486, + "step": 15564 + }, + { + "epoch": 1.0111096039066738, + "grad_norm": 6.156308650970459, + "learning_rate": 7.969850623543924e-06, + "loss": 0.4166, + "step": 15565 + }, + { + "epoch": 1.0111231687466087, + "grad_norm": 5.796081066131592, + "learning_rate": 7.969713580923669e-06, + "loss": 0.3009, + "step": 15566 + }, + { + "epoch": 1.0111367335865438, + "grad_norm": 5.8007097244262695, + "learning_rate": 7.969576538303412e-06, + "loss": 0.2257, + "step": 15567 + }, + { + "epoch": 1.0111502984264786, + "grad_norm": 4.766583442687988, + "learning_rate": 7.969439495683158e-06, + "loss": 0.2087, + "step": 15568 + }, + { + "epoch": 1.0111638632664135, + "grad_norm": 6.260830402374268, + "learning_rate": 7.969302453062904e-06, + "loss": 0.3125, + "step": 15569 + }, + { + "epoch": 1.0111774281063484, + "grad_norm": 6.018795490264893, + "learning_rate": 7.969165410442648e-06, + "loss": 0.2504, + "step": 15570 + }, + { + "epoch": 1.0111909929462832, + "grad_norm": 6.514252662658691, + "learning_rate": 7.969028367822393e-06, + "loss": 0.2211, + "step": 15571 + }, + { + "epoch": 1.011204557786218, + "grad_norm": 5.800174236297607, + "learning_rate": 7.968891325202138e-06, + "loss": 0.274, + "step": 15572 + }, + { + "epoch": 1.011218122626153, + "grad_norm": 4.8935089111328125, + "learning_rate": 7.968754282581883e-06, + "loss": 0.2586, + "step": 15573 + }, + { + "epoch": 1.0112316874660878, + "grad_norm": 5.5856194496154785, + "learning_rate": 7.968617239961629e-06, + "loss": 0.2751, + "step": 15574 + }, + { + "epoch": 1.0112452523060227, + "grad_norm": 4.5875773429870605, + "learning_rate": 7.968480197341374e-06, + "loss": 0.2493, + "step": 15575 + }, + { + "epoch": 1.0112588171459578, + "grad_norm": 4.70670223236084, + "learning_rate": 7.968343154721119e-06, + "loss": 0.2877, + "step": 15576 + }, + { + "epoch": 1.0112723819858926, + "grad_norm": 4.443944931030273, + "learning_rate": 7.968206112100864e-06, + "loss": 0.2229, + "step": 15577 + }, + { + "epoch": 1.0112859468258275, + "grad_norm": 4.2819108963012695, + "learning_rate": 7.96806906948061e-06, + "loss": 0.193, + "step": 15578 + }, + { + "epoch": 1.0112995116657624, + "grad_norm": 4.6937079429626465, + "learning_rate": 7.967932026860355e-06, + "loss": 0.3004, + "step": 15579 + }, + { + "epoch": 1.0113130765056972, + "grad_norm": 6.848104000091553, + "learning_rate": 7.9677949842401e-06, + "loss": 0.2638, + "step": 15580 + }, + { + "epoch": 1.011326641345632, + "grad_norm": 3.951744556427002, + "learning_rate": 7.967657941619845e-06, + "loss": 0.1765, + "step": 15581 + }, + { + "epoch": 1.011340206185567, + "grad_norm": 4.508567810058594, + "learning_rate": 7.96752089899959e-06, + "loss": 0.205, + "step": 15582 + }, + { + "epoch": 1.0113537710255018, + "grad_norm": 5.1172871589660645, + "learning_rate": 7.967383856379335e-06, + "loss": 0.2972, + "step": 15583 + }, + { + "epoch": 1.0113673358654367, + "grad_norm": 4.291250705718994, + "learning_rate": 7.96724681375908e-06, + "loss": 0.191, + "step": 15584 + }, + { + "epoch": 1.0113809007053716, + "grad_norm": 3.664478063583374, + "learning_rate": 7.967109771138824e-06, + "loss": 0.2002, + "step": 15585 + }, + { + "epoch": 1.0113944655453067, + "grad_norm": 3.5563302040100098, + "learning_rate": 7.966972728518569e-06, + "loss": 0.1904, + "step": 15586 + }, + { + "epoch": 1.0114080303852415, + "grad_norm": 4.6490797996521, + "learning_rate": 7.966835685898316e-06, + "loss": 0.2279, + "step": 15587 + }, + { + "epoch": 1.0114215952251764, + "grad_norm": 3.6395645141601562, + "learning_rate": 7.96669864327806e-06, + "loss": 0.1873, + "step": 15588 + }, + { + "epoch": 1.0114351600651112, + "grad_norm": 5.414516925811768, + "learning_rate": 7.966561600657805e-06, + "loss": 0.2872, + "step": 15589 + }, + { + "epoch": 1.0114487249050461, + "grad_norm": 4.763138294219971, + "learning_rate": 7.96642455803755e-06, + "loss": 0.1933, + "step": 15590 + }, + { + "epoch": 1.011462289744981, + "grad_norm": 3.8383140563964844, + "learning_rate": 7.966287515417295e-06, + "loss": 0.2169, + "step": 15591 + }, + { + "epoch": 1.0114758545849158, + "grad_norm": 3.3302671909332275, + "learning_rate": 7.96615047279704e-06, + "loss": 0.1471, + "step": 15592 + }, + { + "epoch": 1.0114894194248507, + "grad_norm": 7.495439052581787, + "learning_rate": 7.966013430176785e-06, + "loss": 0.4353, + "step": 15593 + }, + { + "epoch": 1.0115029842647856, + "grad_norm": 3.0816078186035156, + "learning_rate": 7.96587638755653e-06, + "loss": 0.1178, + "step": 15594 + }, + { + "epoch": 1.0115165491047207, + "grad_norm": 5.122756481170654, + "learning_rate": 7.965739344936276e-06, + "loss": 0.2611, + "step": 15595 + }, + { + "epoch": 1.0115301139446555, + "grad_norm": 4.5590386390686035, + "learning_rate": 7.965602302316021e-06, + "loss": 0.1949, + "step": 15596 + }, + { + "epoch": 1.0115436787845904, + "grad_norm": 3.2512078285217285, + "learning_rate": 7.965465259695766e-06, + "loss": 0.178, + "step": 15597 + }, + { + "epoch": 1.0115572436245253, + "grad_norm": 3.814821243286133, + "learning_rate": 7.965328217075511e-06, + "loss": 0.1849, + "step": 15598 + }, + { + "epoch": 1.0115708084644601, + "grad_norm": 4.857402801513672, + "learning_rate": 7.965191174455256e-06, + "loss": 0.1963, + "step": 15599 + }, + { + "epoch": 1.011584373304395, + "grad_norm": 3.8486263751983643, + "learning_rate": 7.965054131835002e-06, + "loss": 0.1807, + "step": 15600 + }, + { + "epoch": 1.0115979381443299, + "grad_norm": 4.227337837219238, + "learning_rate": 7.964917089214747e-06, + "loss": 0.216, + "step": 15601 + }, + { + "epoch": 1.0116115029842647, + "grad_norm": 5.705799102783203, + "learning_rate": 7.964780046594492e-06, + "loss": 0.2164, + "step": 15602 + }, + { + "epoch": 1.0116250678241996, + "grad_norm": 5.665639400482178, + "learning_rate": 7.964643003974235e-06, + "loss": 0.2679, + "step": 15603 + }, + { + "epoch": 1.0116386326641345, + "grad_norm": 4.3581132888793945, + "learning_rate": 7.964505961353982e-06, + "loss": 0.2836, + "step": 15604 + }, + { + "epoch": 1.0116521975040695, + "grad_norm": 4.758267879486084, + "learning_rate": 7.964368918733728e-06, + "loss": 0.2134, + "step": 15605 + }, + { + "epoch": 1.0116657623440044, + "grad_norm": 3.1677544116973877, + "learning_rate": 7.964231876113471e-06, + "loss": 0.2118, + "step": 15606 + }, + { + "epoch": 1.0116793271839393, + "grad_norm": 3.051116943359375, + "learning_rate": 7.964094833493216e-06, + "loss": 0.1341, + "step": 15607 + }, + { + "epoch": 1.0116928920238741, + "grad_norm": 2.8144123554229736, + "learning_rate": 7.963957790872963e-06, + "loss": 0.0726, + "step": 15608 + }, + { + "epoch": 1.011706456863809, + "grad_norm": 5.863626003265381, + "learning_rate": 7.963820748252708e-06, + "loss": 0.3467, + "step": 15609 + }, + { + "epoch": 1.0117200217037439, + "grad_norm": 4.016258239746094, + "learning_rate": 7.963683705632452e-06, + "loss": 0.1592, + "step": 15610 + }, + { + "epoch": 1.0117335865436787, + "grad_norm": 4.41050910949707, + "learning_rate": 7.963546663012197e-06, + "loss": 0.2313, + "step": 15611 + }, + { + "epoch": 1.0117471513836136, + "grad_norm": 5.300055027008057, + "learning_rate": 7.963409620391944e-06, + "loss": 0.2087, + "step": 15612 + }, + { + "epoch": 1.0117607162235485, + "grad_norm": 4.013779640197754, + "learning_rate": 7.963272577771687e-06, + "loss": 0.2093, + "step": 15613 + }, + { + "epoch": 1.0117742810634835, + "grad_norm": 4.912789344787598, + "learning_rate": 7.963135535151432e-06, + "loss": 0.2242, + "step": 15614 + }, + { + "epoch": 1.0117878459034184, + "grad_norm": 3.59717059135437, + "learning_rate": 7.962998492531178e-06, + "loss": 0.1514, + "step": 15615 + }, + { + "epoch": 1.0118014107433533, + "grad_norm": 4.78895902633667, + "learning_rate": 7.962861449910923e-06, + "loss": 0.2954, + "step": 15616 + }, + { + "epoch": 1.0118149755832881, + "grad_norm": 4.785504341125488, + "learning_rate": 7.962724407290668e-06, + "loss": 0.2381, + "step": 15617 + }, + { + "epoch": 1.011828540423223, + "grad_norm": 5.4475483894348145, + "learning_rate": 7.962587364670413e-06, + "loss": 0.2142, + "step": 15618 + }, + { + "epoch": 1.0118421052631579, + "grad_norm": 4.267998695373535, + "learning_rate": 7.962450322050158e-06, + "loss": 0.1918, + "step": 15619 + }, + { + "epoch": 1.0118556701030927, + "grad_norm": 5.144200801849365, + "learning_rate": 7.962313279429904e-06, + "loss": 0.1974, + "step": 15620 + }, + { + "epoch": 1.0118692349430276, + "grad_norm": 4.495233058929443, + "learning_rate": 7.962176236809649e-06, + "loss": 0.1913, + "step": 15621 + }, + { + "epoch": 1.0118827997829625, + "grad_norm": 6.109568119049072, + "learning_rate": 7.962039194189394e-06, + "loss": 0.2165, + "step": 15622 + }, + { + "epoch": 1.0118963646228976, + "grad_norm": 4.999785423278809, + "learning_rate": 7.961902151569139e-06, + "loss": 0.2334, + "step": 15623 + }, + { + "epoch": 1.0119099294628324, + "grad_norm": 4.979470729827881, + "learning_rate": 7.961765108948884e-06, + "loss": 0.1961, + "step": 15624 + }, + { + "epoch": 1.0119234943027673, + "grad_norm": 5.191701412200928, + "learning_rate": 7.96162806632863e-06, + "loss": 0.2836, + "step": 15625 + }, + { + "epoch": 1.0119370591427022, + "grad_norm": 5.134594440460205, + "learning_rate": 7.961491023708375e-06, + "loss": 0.2797, + "step": 15626 + }, + { + "epoch": 1.011950623982637, + "grad_norm": 5.018108367919922, + "learning_rate": 7.96135398108812e-06, + "loss": 0.2271, + "step": 15627 + }, + { + "epoch": 1.0119641888225719, + "grad_norm": 6.171719074249268, + "learning_rate": 7.961216938467863e-06, + "loss": 0.327, + "step": 15628 + }, + { + "epoch": 1.0119777536625068, + "grad_norm": 4.554595470428467, + "learning_rate": 7.961079895847608e-06, + "loss": 0.1837, + "step": 15629 + }, + { + "epoch": 1.0119913185024416, + "grad_norm": 4.950405597686768, + "learning_rate": 7.960942853227355e-06, + "loss": 0.2538, + "step": 15630 + }, + { + "epoch": 1.0120048833423765, + "grad_norm": 5.9521613121032715, + "learning_rate": 7.960805810607099e-06, + "loss": 0.3248, + "step": 15631 + }, + { + "epoch": 1.0120184481823113, + "grad_norm": 4.835526466369629, + "learning_rate": 7.960668767986844e-06, + "loss": 0.167, + "step": 15632 + }, + { + "epoch": 1.0120320130222464, + "grad_norm": 3.9359734058380127, + "learning_rate": 7.960531725366589e-06, + "loss": 0.2142, + "step": 15633 + }, + { + "epoch": 1.0120455778621813, + "grad_norm": 4.881103038787842, + "learning_rate": 7.960394682746336e-06, + "loss": 0.2869, + "step": 15634 + }, + { + "epoch": 1.0120591427021162, + "grad_norm": 3.7404074668884277, + "learning_rate": 7.96025764012608e-06, + "loss": 0.1607, + "step": 15635 + }, + { + "epoch": 1.012072707542051, + "grad_norm": 6.683023929595947, + "learning_rate": 7.960120597505825e-06, + "loss": 0.3004, + "step": 15636 + }, + { + "epoch": 1.012086272381986, + "grad_norm": 4.520568370819092, + "learning_rate": 7.95998355488557e-06, + "loss": 0.2403, + "step": 15637 + }, + { + "epoch": 1.0120998372219208, + "grad_norm": 3.8151164054870605, + "learning_rate": 7.959846512265315e-06, + "loss": 0.2004, + "step": 15638 + }, + { + "epoch": 1.0121134020618556, + "grad_norm": 4.29590368270874, + "learning_rate": 7.95970946964506e-06, + "loss": 0.2293, + "step": 15639 + }, + { + "epoch": 1.0121269669017905, + "grad_norm": 4.158545970916748, + "learning_rate": 7.959572427024805e-06, + "loss": 0.2177, + "step": 15640 + }, + { + "epoch": 1.0121405317417254, + "grad_norm": 4.874756813049316, + "learning_rate": 7.95943538440455e-06, + "loss": 0.2883, + "step": 15641 + }, + { + "epoch": 1.0121540965816604, + "grad_norm": 5.172357082366943, + "learning_rate": 7.959298341784296e-06, + "loss": 0.2441, + "step": 15642 + }, + { + "epoch": 1.0121676614215953, + "grad_norm": 5.206993103027344, + "learning_rate": 7.959161299164041e-06, + "loss": 0.2585, + "step": 15643 + }, + { + "epoch": 1.0121812262615302, + "grad_norm": 4.583245754241943, + "learning_rate": 7.959024256543786e-06, + "loss": 0.1147, + "step": 15644 + }, + { + "epoch": 1.012194791101465, + "grad_norm": 4.808078289031982, + "learning_rate": 7.958887213923531e-06, + "loss": 0.2612, + "step": 15645 + }, + { + "epoch": 1.0122083559414, + "grad_norm": 3.4209799766540527, + "learning_rate": 7.958750171303275e-06, + "loss": 0.114, + "step": 15646 + }, + { + "epoch": 1.0122219207813348, + "grad_norm": 5.999975681304932, + "learning_rate": 7.958613128683022e-06, + "loss": 0.2731, + "step": 15647 + }, + { + "epoch": 1.0122354856212696, + "grad_norm": 4.518220901489258, + "learning_rate": 7.958476086062767e-06, + "loss": 0.2421, + "step": 15648 + }, + { + "epoch": 1.0122490504612045, + "grad_norm": 5.266968727111816, + "learning_rate": 7.958339043442512e-06, + "loss": 0.2168, + "step": 15649 + }, + { + "epoch": 1.0122626153011394, + "grad_norm": 5.635858535766602, + "learning_rate": 7.958202000822255e-06, + "loss": 0.1871, + "step": 15650 + }, + { + "epoch": 1.0122761801410742, + "grad_norm": 4.814388751983643, + "learning_rate": 7.958064958202002e-06, + "loss": 0.2688, + "step": 15651 + }, + { + "epoch": 1.0122897449810093, + "grad_norm": 4.402621746063232, + "learning_rate": 7.957927915581748e-06, + "loss": 0.2464, + "step": 15652 + }, + { + "epoch": 1.0123033098209442, + "grad_norm": 4.431020259857178, + "learning_rate": 7.957790872961491e-06, + "loss": 0.2173, + "step": 15653 + }, + { + "epoch": 1.012316874660879, + "grad_norm": 4.46183967590332, + "learning_rate": 7.957653830341236e-06, + "loss": 0.2044, + "step": 15654 + }, + { + "epoch": 1.012330439500814, + "grad_norm": 6.63139533996582, + "learning_rate": 7.957516787720981e-06, + "loss": 0.3337, + "step": 15655 + }, + { + "epoch": 1.0123440043407488, + "grad_norm": 4.179649353027344, + "learning_rate": 7.957379745100727e-06, + "loss": 0.1603, + "step": 15656 + }, + { + "epoch": 1.0123575691806836, + "grad_norm": 3.7742176055908203, + "learning_rate": 7.957242702480472e-06, + "loss": 0.1808, + "step": 15657 + }, + { + "epoch": 1.0123711340206185, + "grad_norm": 4.261166095733643, + "learning_rate": 7.957105659860217e-06, + "loss": 0.2417, + "step": 15658 + }, + { + "epoch": 1.0123846988605534, + "grad_norm": 3.4110279083251953, + "learning_rate": 7.956968617239962e-06, + "loss": 0.1417, + "step": 15659 + }, + { + "epoch": 1.0123982637004882, + "grad_norm": 4.960216999053955, + "learning_rate": 7.956831574619707e-06, + "loss": 0.2863, + "step": 15660 + }, + { + "epoch": 1.0124118285404233, + "grad_norm": 6.756330966949463, + "learning_rate": 7.956694531999452e-06, + "loss": 0.3444, + "step": 15661 + }, + { + "epoch": 1.0124253933803582, + "grad_norm": 4.89208984375, + "learning_rate": 7.956557489379198e-06, + "loss": 0.2904, + "step": 15662 + }, + { + "epoch": 1.012438958220293, + "grad_norm": 4.6068925857543945, + "learning_rate": 7.956420446758943e-06, + "loss": 0.2392, + "step": 15663 + }, + { + "epoch": 1.012452523060228, + "grad_norm": 3.9919204711914062, + "learning_rate": 7.956283404138688e-06, + "loss": 0.2121, + "step": 15664 + }, + { + "epoch": 1.0124660879001628, + "grad_norm": 4.059140205383301, + "learning_rate": 7.956146361518433e-06, + "loss": 0.2288, + "step": 15665 + }, + { + "epoch": 1.0124796527400977, + "grad_norm": 4.67175817489624, + "learning_rate": 7.956009318898178e-06, + "loss": 0.2248, + "step": 15666 + }, + { + "epoch": 1.0124932175800325, + "grad_norm": 4.29484224319458, + "learning_rate": 7.955872276277924e-06, + "loss": 0.2349, + "step": 15667 + }, + { + "epoch": 1.0125067824199674, + "grad_norm": 5.418098449707031, + "learning_rate": 7.955735233657667e-06, + "loss": 0.2824, + "step": 15668 + }, + { + "epoch": 1.0125203472599023, + "grad_norm": 6.725034236907959, + "learning_rate": 7.955598191037414e-06, + "loss": 0.3142, + "step": 15669 + }, + { + "epoch": 1.0125339120998371, + "grad_norm": 4.520270347595215, + "learning_rate": 7.955461148417159e-06, + "loss": 0.1988, + "step": 15670 + }, + { + "epoch": 1.0125474769397722, + "grad_norm": 4.890805721282959, + "learning_rate": 7.955324105796903e-06, + "loss": 0.222, + "step": 15671 + }, + { + "epoch": 1.012561041779707, + "grad_norm": 5.761971473693848, + "learning_rate": 7.955187063176648e-06, + "loss": 0.2976, + "step": 15672 + }, + { + "epoch": 1.012574606619642, + "grad_norm": 3.5086958408355713, + "learning_rate": 7.955050020556395e-06, + "loss": 0.2188, + "step": 15673 + }, + { + "epoch": 1.0125881714595768, + "grad_norm": 4.503308296203613, + "learning_rate": 7.954912977936138e-06, + "loss": 0.272, + "step": 15674 + }, + { + "epoch": 1.0126017362995117, + "grad_norm": 4.470958232879639, + "learning_rate": 7.954775935315883e-06, + "loss": 0.2599, + "step": 15675 + }, + { + "epoch": 1.0126153011394465, + "grad_norm": 4.688629150390625, + "learning_rate": 7.954638892695628e-06, + "loss": 0.1681, + "step": 15676 + }, + { + "epoch": 1.0126288659793814, + "grad_norm": 6.219448566436768, + "learning_rate": 7.954501850075375e-06, + "loss": 0.2636, + "step": 15677 + }, + { + "epoch": 1.0126424308193163, + "grad_norm": 5.638312816619873, + "learning_rate": 7.954364807455119e-06, + "loss": 0.3553, + "step": 15678 + }, + { + "epoch": 1.0126559956592511, + "grad_norm": 4.070746898651123, + "learning_rate": 7.954227764834864e-06, + "loss": 0.2793, + "step": 15679 + }, + { + "epoch": 1.0126695604991862, + "grad_norm": 4.4022979736328125, + "learning_rate": 7.954090722214609e-06, + "loss": 0.2422, + "step": 15680 + }, + { + "epoch": 1.012683125339121, + "grad_norm": 5.384944915771484, + "learning_rate": 7.953953679594354e-06, + "loss": 0.2879, + "step": 15681 + }, + { + "epoch": 1.012696690179056, + "grad_norm": 4.7857794761657715, + "learning_rate": 7.9538166369741e-06, + "loss": 0.1373, + "step": 15682 + }, + { + "epoch": 1.0127102550189908, + "grad_norm": 5.39399528503418, + "learning_rate": 7.953679594353845e-06, + "loss": 0.1298, + "step": 15683 + }, + { + "epoch": 1.0127238198589257, + "grad_norm": 4.845790386199951, + "learning_rate": 7.95354255173359e-06, + "loss": 0.4078, + "step": 15684 + }, + { + "epoch": 1.0127373846988605, + "grad_norm": 5.89417028427124, + "learning_rate": 7.953405509113335e-06, + "loss": 0.3132, + "step": 15685 + }, + { + "epoch": 1.0127509495387954, + "grad_norm": 6.188899517059326, + "learning_rate": 7.95326846649308e-06, + "loss": 0.262, + "step": 15686 + }, + { + "epoch": 1.0127645143787303, + "grad_norm": 4.911748886108398, + "learning_rate": 7.953131423872825e-06, + "loss": 0.199, + "step": 15687 + }, + { + "epoch": 1.0127780792186651, + "grad_norm": 4.709322929382324, + "learning_rate": 7.95299438125257e-06, + "loss": 0.2166, + "step": 15688 + }, + { + "epoch": 1.0127916440586, + "grad_norm": 5.7810893058776855, + "learning_rate": 7.952857338632314e-06, + "loss": 0.2998, + "step": 15689 + }, + { + "epoch": 1.012805208898535, + "grad_norm": 5.016318321228027, + "learning_rate": 7.952720296012061e-06, + "loss": 0.3022, + "step": 15690 + }, + { + "epoch": 1.01281877373847, + "grad_norm": 5.3161187171936035, + "learning_rate": 7.952583253391806e-06, + "loss": 0.2629, + "step": 15691 + }, + { + "epoch": 1.0128323385784048, + "grad_norm": 4.923661708831787, + "learning_rate": 7.952446210771551e-06, + "loss": 0.3853, + "step": 15692 + }, + { + "epoch": 1.0128459034183397, + "grad_norm": 7.072931289672852, + "learning_rate": 7.952309168151295e-06, + "loss": 0.2472, + "step": 15693 + }, + { + "epoch": 1.0128594682582746, + "grad_norm": 5.613077163696289, + "learning_rate": 7.952172125531042e-06, + "loss": 0.2902, + "step": 15694 + }, + { + "epoch": 1.0128730330982094, + "grad_norm": 5.94118070602417, + "learning_rate": 7.952035082910787e-06, + "loss": 0.3181, + "step": 15695 + }, + { + "epoch": 1.0128865979381443, + "grad_norm": 5.89771842956543, + "learning_rate": 7.95189804029053e-06, + "loss": 0.2455, + "step": 15696 + }, + { + "epoch": 1.0129001627780791, + "grad_norm": 6.972736835479736, + "learning_rate": 7.951760997670275e-06, + "loss": 0.5313, + "step": 15697 + }, + { + "epoch": 1.012913727618014, + "grad_norm": 5.630451202392578, + "learning_rate": 7.95162395505002e-06, + "loss": 0.2397, + "step": 15698 + }, + { + "epoch": 1.012927292457949, + "grad_norm": 7.632232189178467, + "learning_rate": 7.951486912429766e-06, + "loss": 0.3951, + "step": 15699 + }, + { + "epoch": 1.012940857297884, + "grad_norm": 4.767345905303955, + "learning_rate": 7.951349869809511e-06, + "loss": 0.1987, + "step": 15700 + }, + { + "epoch": 1.0129544221378188, + "grad_norm": 7.418100357055664, + "learning_rate": 7.951212827189256e-06, + "loss": 0.4573, + "step": 15701 + }, + { + "epoch": 1.0129679869777537, + "grad_norm": 4.85147762298584, + "learning_rate": 7.951075784569001e-06, + "loss": 0.2812, + "step": 15702 + }, + { + "epoch": 1.0129815518176886, + "grad_norm": 7.917899131774902, + "learning_rate": 7.950938741948747e-06, + "loss": 0.4394, + "step": 15703 + }, + { + "epoch": 1.0129951166576234, + "grad_norm": 6.700867176055908, + "learning_rate": 7.950801699328492e-06, + "loss": 0.2807, + "step": 15704 + }, + { + "epoch": 1.0130086814975583, + "grad_norm": 5.921830654144287, + "learning_rate": 7.950664656708237e-06, + "loss": 0.303, + "step": 15705 + }, + { + "epoch": 1.0130222463374932, + "grad_norm": 3.320244550704956, + "learning_rate": 7.950527614087982e-06, + "loss": 0.131, + "step": 15706 + }, + { + "epoch": 1.013035811177428, + "grad_norm": 5.127596855163574, + "learning_rate": 7.950390571467727e-06, + "loss": 0.3021, + "step": 15707 + }, + { + "epoch": 1.013049376017363, + "grad_norm": 5.984931468963623, + "learning_rate": 7.950253528847472e-06, + "loss": 0.378, + "step": 15708 + }, + { + "epoch": 1.013062940857298, + "grad_norm": 5.10573148727417, + "learning_rate": 7.950116486227218e-06, + "loss": 0.311, + "step": 15709 + }, + { + "epoch": 1.0130765056972328, + "grad_norm": 5.0985918045043945, + "learning_rate": 7.949979443606963e-06, + "loss": 0.3327, + "step": 15710 + }, + { + "epoch": 1.0130900705371677, + "grad_norm": 4.014596939086914, + "learning_rate": 7.949842400986706e-06, + "loss": 0.1678, + "step": 15711 + }, + { + "epoch": 1.0131036353771026, + "grad_norm": 5.325283527374268, + "learning_rate": 7.949705358366453e-06, + "loss": 0.2365, + "step": 15712 + }, + { + "epoch": 1.0131172002170374, + "grad_norm": 12.160253524780273, + "learning_rate": 7.949568315746198e-06, + "loss": 0.2926, + "step": 15713 + }, + { + "epoch": 1.0131307650569723, + "grad_norm": 4.755122184753418, + "learning_rate": 7.949431273125942e-06, + "loss": 0.2524, + "step": 15714 + }, + { + "epoch": 1.0131443298969072, + "grad_norm": 5.18956995010376, + "learning_rate": 7.949294230505687e-06, + "loss": 0.3118, + "step": 15715 + }, + { + "epoch": 1.013157894736842, + "grad_norm": 6.191045761108398, + "learning_rate": 7.949157187885434e-06, + "loss": 0.3745, + "step": 15716 + }, + { + "epoch": 1.013171459576777, + "grad_norm": 5.905436992645264, + "learning_rate": 7.949020145265179e-06, + "loss": 0.3225, + "step": 15717 + }, + { + "epoch": 1.013185024416712, + "grad_norm": 5.775698184967041, + "learning_rate": 7.948883102644923e-06, + "loss": 0.3517, + "step": 15718 + }, + { + "epoch": 1.0131985892566469, + "grad_norm": 5.270929336547852, + "learning_rate": 7.948746060024668e-06, + "loss": 0.2181, + "step": 15719 + }, + { + "epoch": 1.0132121540965817, + "grad_norm": 5.980667591094971, + "learning_rate": 7.948609017404415e-06, + "loss": 0.3216, + "step": 15720 + }, + { + "epoch": 1.0132257189365166, + "grad_norm": 6.567871570587158, + "learning_rate": 7.948471974784158e-06, + "loss": 0.4237, + "step": 15721 + }, + { + "epoch": 1.0132392837764514, + "grad_norm": 5.766387939453125, + "learning_rate": 7.948334932163903e-06, + "loss": 0.2173, + "step": 15722 + }, + { + "epoch": 1.0132528486163863, + "grad_norm": 5.900300025939941, + "learning_rate": 7.948197889543648e-06, + "loss": 0.2027, + "step": 15723 + }, + { + "epoch": 1.0132664134563212, + "grad_norm": 5.558285236358643, + "learning_rate": 7.948060846923394e-06, + "loss": 0.2218, + "step": 15724 + }, + { + "epoch": 1.013279978296256, + "grad_norm": 7.348741054534912, + "learning_rate": 7.947923804303139e-06, + "loss": 0.5025, + "step": 15725 + }, + { + "epoch": 1.013293543136191, + "grad_norm": 6.959611415863037, + "learning_rate": 7.947786761682884e-06, + "loss": 0.324, + "step": 15726 + }, + { + "epoch": 1.0133071079761258, + "grad_norm": 6.718508243560791, + "learning_rate": 7.94764971906263e-06, + "loss": 0.316, + "step": 15727 + }, + { + "epoch": 1.0133206728160609, + "grad_norm": 5.5262956619262695, + "learning_rate": 7.947512676442374e-06, + "loss": 0.273, + "step": 15728 + }, + { + "epoch": 1.0133342376559957, + "grad_norm": 5.723686218261719, + "learning_rate": 7.94737563382212e-06, + "loss": 0.3296, + "step": 15729 + }, + { + "epoch": 1.0133478024959306, + "grad_norm": 5.359419345855713, + "learning_rate": 7.947238591201865e-06, + "loss": 0.3745, + "step": 15730 + }, + { + "epoch": 1.0133613673358655, + "grad_norm": 5.6830244064331055, + "learning_rate": 7.94710154858161e-06, + "loss": 0.2446, + "step": 15731 + }, + { + "epoch": 1.0133749321758003, + "grad_norm": 5.987954139709473, + "learning_rate": 7.946964505961355e-06, + "loss": 0.2433, + "step": 15732 + }, + { + "epoch": 1.0133884970157352, + "grad_norm": 6.495339393615723, + "learning_rate": 7.9468274633411e-06, + "loss": 0.3231, + "step": 15733 + }, + { + "epoch": 1.01340206185567, + "grad_norm": 6.378973960876465, + "learning_rate": 7.946690420720845e-06, + "loss": 0.3129, + "step": 15734 + }, + { + "epoch": 1.013415626695605, + "grad_norm": 5.701741695404053, + "learning_rate": 7.94655337810059e-06, + "loss": 0.2316, + "step": 15735 + }, + { + "epoch": 1.0134291915355398, + "grad_norm": 7.099954605102539, + "learning_rate": 7.946416335480334e-06, + "loss": 0.395, + "step": 15736 + }, + { + "epoch": 1.0134427563754749, + "grad_norm": 4.999474048614502, + "learning_rate": 7.94627929286008e-06, + "loss": 0.3586, + "step": 15737 + }, + { + "epoch": 1.0134563212154097, + "grad_norm": 7.001155853271484, + "learning_rate": 7.946142250239826e-06, + "loss": 0.3604, + "step": 15738 + }, + { + "epoch": 1.0134698860553446, + "grad_norm": 7.657862186431885, + "learning_rate": 7.94600520761957e-06, + "loss": 0.4945, + "step": 15739 + }, + { + "epoch": 1.0134834508952795, + "grad_norm": 8.71147346496582, + "learning_rate": 7.945868164999315e-06, + "loss": 0.3595, + "step": 15740 + }, + { + "epoch": 1.0134970157352143, + "grad_norm": 5.485509872436523, + "learning_rate": 7.94573112237906e-06, + "loss": 0.2905, + "step": 15741 + }, + { + "epoch": 1.0135105805751492, + "grad_norm": 5.771115779876709, + "learning_rate": 7.945594079758807e-06, + "loss": 0.3307, + "step": 15742 + }, + { + "epoch": 1.013524145415084, + "grad_norm": 6.277566432952881, + "learning_rate": 7.94545703713855e-06, + "loss": 0.3603, + "step": 15743 + }, + { + "epoch": 1.013537710255019, + "grad_norm": 5.870394706726074, + "learning_rate": 7.945319994518296e-06, + "loss": 0.2354, + "step": 15744 + }, + { + "epoch": 1.0135512750949538, + "grad_norm": 5.111703872680664, + "learning_rate": 7.94518295189804e-06, + "loss": 0.1766, + "step": 15745 + }, + { + "epoch": 1.0135648399348887, + "grad_norm": 6.5655412673950195, + "learning_rate": 7.945045909277786e-06, + "loss": 0.26, + "step": 15746 + }, + { + "epoch": 1.0135784047748237, + "grad_norm": 5.930330276489258, + "learning_rate": 7.944908866657531e-06, + "loss": 0.2295, + "step": 15747 + }, + { + "epoch": 1.0135919696147586, + "grad_norm": 5.225882053375244, + "learning_rate": 7.944771824037276e-06, + "loss": 0.2925, + "step": 15748 + }, + { + "epoch": 1.0136055344546935, + "grad_norm": 4.862267017364502, + "learning_rate": 7.944634781417021e-06, + "loss": 0.3044, + "step": 15749 + }, + { + "epoch": 1.0136190992946283, + "grad_norm": 5.296502113342285, + "learning_rate": 7.944497738796767e-06, + "loss": 0.2222, + "step": 15750 + }, + { + "epoch": 1.0136326641345632, + "grad_norm": 6.161423683166504, + "learning_rate": 7.944360696176512e-06, + "loss": 0.3087, + "step": 15751 + }, + { + "epoch": 1.013646228974498, + "grad_norm": 6.060306549072266, + "learning_rate": 7.944223653556257e-06, + "loss": 0.2269, + "step": 15752 + }, + { + "epoch": 1.013659793814433, + "grad_norm": 4.686872959136963, + "learning_rate": 7.944086610936002e-06, + "loss": 0.2266, + "step": 15753 + }, + { + "epoch": 1.0136733586543678, + "grad_norm": 4.966587543487549, + "learning_rate": 7.943949568315746e-06, + "loss": 0.2504, + "step": 15754 + }, + { + "epoch": 1.0136869234943027, + "grad_norm": 4.7539520263671875, + "learning_rate": 7.943812525695492e-06, + "loss": 0.2755, + "step": 15755 + }, + { + "epoch": 1.0137004883342378, + "grad_norm": 6.400981903076172, + "learning_rate": 7.943675483075238e-06, + "loss": 0.2697, + "step": 15756 + }, + { + "epoch": 1.0137140531741726, + "grad_norm": 4.106328964233398, + "learning_rate": 7.943538440454983e-06, + "loss": 0.1915, + "step": 15757 + }, + { + "epoch": 1.0137276180141075, + "grad_norm": 5.0444159507751465, + "learning_rate": 7.943401397834726e-06, + "loss": 0.1742, + "step": 15758 + }, + { + "epoch": 1.0137411828540424, + "grad_norm": 6.25094747543335, + "learning_rate": 7.943264355214473e-06, + "loss": 0.2158, + "step": 15759 + }, + { + "epoch": 1.0137547476939772, + "grad_norm": 5.051972389221191, + "learning_rate": 7.943127312594218e-06, + "loss": 0.1867, + "step": 15760 + }, + { + "epoch": 1.013768312533912, + "grad_norm": 6.895787715911865, + "learning_rate": 7.942990269973962e-06, + "loss": 0.3622, + "step": 15761 + }, + { + "epoch": 1.013781877373847, + "grad_norm": 5.705833911895752, + "learning_rate": 7.942853227353707e-06, + "loss": 0.1829, + "step": 15762 + }, + { + "epoch": 1.0137954422137818, + "grad_norm": 6.5879411697387695, + "learning_rate": 7.942716184733454e-06, + "loss": 0.2715, + "step": 15763 + }, + { + "epoch": 1.0138090070537167, + "grad_norm": 4.139573097229004, + "learning_rate": 7.942579142113197e-06, + "loss": 0.1911, + "step": 15764 + }, + { + "epoch": 1.0138225718936515, + "grad_norm": 5.414219856262207, + "learning_rate": 7.942442099492943e-06, + "loss": 0.2646, + "step": 15765 + }, + { + "epoch": 1.0138361367335866, + "grad_norm": 4.668193817138672, + "learning_rate": 7.942305056872688e-06, + "loss": 0.2274, + "step": 15766 + }, + { + "epoch": 1.0138497015735215, + "grad_norm": 3.8521695137023926, + "learning_rate": 7.942168014252433e-06, + "loss": 0.1211, + "step": 15767 + }, + { + "epoch": 1.0138632664134564, + "grad_norm": 5.2388386726379395, + "learning_rate": 7.942030971632178e-06, + "loss": 0.2576, + "step": 15768 + }, + { + "epoch": 1.0138768312533912, + "grad_norm": 5.7375288009643555, + "learning_rate": 7.941893929011923e-06, + "loss": 0.3805, + "step": 15769 + }, + { + "epoch": 1.013890396093326, + "grad_norm": 6.402678489685059, + "learning_rate": 7.941756886391668e-06, + "loss": 0.2012, + "step": 15770 + }, + { + "epoch": 1.013903960933261, + "grad_norm": 8.492972373962402, + "learning_rate": 7.941619843771414e-06, + "loss": 0.5115, + "step": 15771 + }, + { + "epoch": 1.0139175257731958, + "grad_norm": 8.104186058044434, + "learning_rate": 7.941482801151159e-06, + "loss": 0.2844, + "step": 15772 + }, + { + "epoch": 1.0139310906131307, + "grad_norm": 7.835391521453857, + "learning_rate": 7.941345758530904e-06, + "loss": 0.2797, + "step": 15773 + }, + { + "epoch": 1.0139446554530656, + "grad_norm": 5.426365375518799, + "learning_rate": 7.94120871591065e-06, + "loss": 0.208, + "step": 15774 + }, + { + "epoch": 1.0139582202930006, + "grad_norm": 4.098861217498779, + "learning_rate": 7.941071673290394e-06, + "loss": 0.1069, + "step": 15775 + }, + { + "epoch": 1.0139717851329355, + "grad_norm": 5.559014797210693, + "learning_rate": 7.94093463067014e-06, + "loss": 0.2424, + "step": 15776 + }, + { + "epoch": 1.0139853499728704, + "grad_norm": 5.890511989593506, + "learning_rate": 7.940797588049885e-06, + "loss": 0.1958, + "step": 15777 + }, + { + "epoch": 1.0139989148128052, + "grad_norm": 5.016499042510986, + "learning_rate": 7.94066054542963e-06, + "loss": 0.2994, + "step": 15778 + }, + { + "epoch": 1.01401247965274, + "grad_norm": 7.137155532836914, + "learning_rate": 7.940523502809373e-06, + "loss": 0.3381, + "step": 15779 + }, + { + "epoch": 1.014026044492675, + "grad_norm": 6.258042335510254, + "learning_rate": 7.940386460189119e-06, + "loss": 0.2938, + "step": 15780 + }, + { + "epoch": 1.0140396093326098, + "grad_norm": 4.148909568786621, + "learning_rate": 7.940249417568865e-06, + "loss": 0.1991, + "step": 15781 + }, + { + "epoch": 1.0140531741725447, + "grad_norm": 5.337896347045898, + "learning_rate": 7.940112374948609e-06, + "loss": 0.2387, + "step": 15782 + }, + { + "epoch": 1.0140667390124796, + "grad_norm": 5.1929216384887695, + "learning_rate": 7.939975332328354e-06, + "loss": 0.1968, + "step": 15783 + }, + { + "epoch": 1.0140803038524144, + "grad_norm": 4.774904251098633, + "learning_rate": 7.9398382897081e-06, + "loss": 0.1904, + "step": 15784 + }, + { + "epoch": 1.0140938686923495, + "grad_norm": 5.885346412658691, + "learning_rate": 7.939701247087846e-06, + "loss": 0.3636, + "step": 15785 + }, + { + "epoch": 1.0141074335322844, + "grad_norm": 4.784205436706543, + "learning_rate": 7.93956420446759e-06, + "loss": 0.184, + "step": 15786 + }, + { + "epoch": 1.0141209983722193, + "grad_norm": 5.425730228424072, + "learning_rate": 7.939427161847335e-06, + "loss": 0.2248, + "step": 15787 + }, + { + "epoch": 1.0141345632121541, + "grad_norm": 7.484267234802246, + "learning_rate": 7.93929011922708e-06, + "loss": 0.3033, + "step": 15788 + }, + { + "epoch": 1.014148128052089, + "grad_norm": 6.04221248626709, + "learning_rate": 7.939153076606825e-06, + "loss": 0.3812, + "step": 15789 + }, + { + "epoch": 1.0141616928920238, + "grad_norm": 6.138100624084473, + "learning_rate": 7.93901603398657e-06, + "loss": 0.1433, + "step": 15790 + }, + { + "epoch": 1.0141752577319587, + "grad_norm": 5.126507759094238, + "learning_rate": 7.938878991366316e-06, + "loss": 0.222, + "step": 15791 + }, + { + "epoch": 1.0141888225718936, + "grad_norm": 7.411239147186279, + "learning_rate": 7.93874194874606e-06, + "loss": 0.3828, + "step": 15792 + }, + { + "epoch": 1.0142023874118284, + "grad_norm": 5.517079830169678, + "learning_rate": 7.938604906125806e-06, + "loss": 0.3531, + "step": 15793 + }, + { + "epoch": 1.0142159522517635, + "grad_norm": 5.6941819190979, + "learning_rate": 7.938467863505551e-06, + "loss": 0.3326, + "step": 15794 + }, + { + "epoch": 1.0142295170916984, + "grad_norm": 4.521889686584473, + "learning_rate": 7.938330820885296e-06, + "loss": 0.2422, + "step": 15795 + }, + { + "epoch": 1.0142430819316333, + "grad_norm": 8.154979705810547, + "learning_rate": 7.938193778265041e-06, + "loss": 0.2378, + "step": 15796 + }, + { + "epoch": 1.0142566467715681, + "grad_norm": 4.555187702178955, + "learning_rate": 7.938056735644785e-06, + "loss": 0.1978, + "step": 15797 + }, + { + "epoch": 1.014270211611503, + "grad_norm": 5.415014743804932, + "learning_rate": 7.937919693024532e-06, + "loss": 0.3596, + "step": 15798 + }, + { + "epoch": 1.0142837764514379, + "grad_norm": 4.93405818939209, + "learning_rate": 7.937782650404277e-06, + "loss": 0.2026, + "step": 15799 + }, + { + "epoch": 1.0142973412913727, + "grad_norm": 4.9376139640808105, + "learning_rate": 7.937645607784022e-06, + "loss": 0.1787, + "step": 15800 + }, + { + "epoch": 1.0143109061313076, + "grad_norm": 6.496572971343994, + "learning_rate": 7.937508565163766e-06, + "loss": 0.247, + "step": 15801 + }, + { + "epoch": 1.0143244709712425, + "grad_norm": 5.6231255531311035, + "learning_rate": 7.937371522543513e-06, + "loss": 0.2172, + "step": 15802 + }, + { + "epoch": 1.0143380358111773, + "grad_norm": 4.640796184539795, + "learning_rate": 7.937234479923258e-06, + "loss": 0.1715, + "step": 15803 + }, + { + "epoch": 1.0143516006511124, + "grad_norm": 6.066932678222656, + "learning_rate": 7.937097437303001e-06, + "loss": 0.2798, + "step": 15804 + }, + { + "epoch": 1.0143651654910473, + "grad_norm": 6.258967876434326, + "learning_rate": 7.936960394682746e-06, + "loss": 0.2469, + "step": 15805 + }, + { + "epoch": 1.0143787303309821, + "grad_norm": 3.8822743892669678, + "learning_rate": 7.936823352062492e-06, + "loss": 0.1769, + "step": 15806 + }, + { + "epoch": 1.014392295170917, + "grad_norm": 6.626861572265625, + "learning_rate": 7.936686309442237e-06, + "loss": 0.3463, + "step": 15807 + }, + { + "epoch": 1.0144058600108519, + "grad_norm": 4.599116802215576, + "learning_rate": 7.936549266821982e-06, + "loss": 0.1583, + "step": 15808 + }, + { + "epoch": 1.0144194248507867, + "grad_norm": 4.945906639099121, + "learning_rate": 7.936412224201727e-06, + "loss": 0.2458, + "step": 15809 + }, + { + "epoch": 1.0144329896907216, + "grad_norm": 6.801535129547119, + "learning_rate": 7.936275181581472e-06, + "loss": 0.2795, + "step": 15810 + }, + { + "epoch": 1.0144465545306565, + "grad_norm": 7.039412975311279, + "learning_rate": 7.936138138961217e-06, + "loss": 0.2305, + "step": 15811 + }, + { + "epoch": 1.0144601193705913, + "grad_norm": 5.69240140914917, + "learning_rate": 7.936001096340963e-06, + "loss": 0.2451, + "step": 15812 + }, + { + "epoch": 1.0144736842105264, + "grad_norm": 6.8086652755737305, + "learning_rate": 7.935864053720708e-06, + "loss": 0.2778, + "step": 15813 + }, + { + "epoch": 1.0144872490504613, + "grad_norm": 6.693373680114746, + "learning_rate": 7.935727011100453e-06, + "loss": 0.2544, + "step": 15814 + }, + { + "epoch": 1.0145008138903961, + "grad_norm": 8.034469604492188, + "learning_rate": 7.935589968480198e-06, + "loss": 0.3926, + "step": 15815 + }, + { + "epoch": 1.014514378730331, + "grad_norm": 6.7782158851623535, + "learning_rate": 7.935452925859943e-06, + "loss": 0.3491, + "step": 15816 + }, + { + "epoch": 1.0145279435702659, + "grad_norm": 6.146936893463135, + "learning_rate": 7.935315883239688e-06, + "loss": 0.2887, + "step": 15817 + }, + { + "epoch": 1.0145415084102007, + "grad_norm": 5.3296613693237305, + "learning_rate": 7.935178840619434e-06, + "loss": 0.1835, + "step": 15818 + }, + { + "epoch": 1.0145550732501356, + "grad_norm": 4.345478534698486, + "learning_rate": 7.935041797999177e-06, + "loss": 0.1875, + "step": 15819 + }, + { + "epoch": 1.0145686380900705, + "grad_norm": 5.535057067871094, + "learning_rate": 7.934904755378924e-06, + "loss": 0.2342, + "step": 15820 + }, + { + "epoch": 1.0145822029300053, + "grad_norm": 7.598711013793945, + "learning_rate": 7.93476771275867e-06, + "loss": 0.3264, + "step": 15821 + }, + { + "epoch": 1.0145957677699404, + "grad_norm": 6.273486137390137, + "learning_rate": 7.934630670138413e-06, + "loss": 0.3727, + "step": 15822 + }, + { + "epoch": 1.0146093326098753, + "grad_norm": 4.668432712554932, + "learning_rate": 7.934493627518158e-06, + "loss": 0.1791, + "step": 15823 + }, + { + "epoch": 1.0146228974498102, + "grad_norm": 4.6212053298950195, + "learning_rate": 7.934356584897905e-06, + "loss": 0.2235, + "step": 15824 + }, + { + "epoch": 1.014636462289745, + "grad_norm": 4.58770751953125, + "learning_rate": 7.93421954227765e-06, + "loss": 0.2377, + "step": 15825 + }, + { + "epoch": 1.0146500271296799, + "grad_norm": 6.197422981262207, + "learning_rate": 7.934082499657393e-06, + "loss": 0.3369, + "step": 15826 + }, + { + "epoch": 1.0146635919696148, + "grad_norm": 6.795252323150635, + "learning_rate": 7.933945457037139e-06, + "loss": 0.3202, + "step": 15827 + }, + { + "epoch": 1.0146771568095496, + "grad_norm": 6.621179580688477, + "learning_rate": 7.933808414416885e-06, + "loss": 0.2737, + "step": 15828 + }, + { + "epoch": 1.0146907216494845, + "grad_norm": 5.676478385925293, + "learning_rate": 7.933671371796629e-06, + "loss": 0.2877, + "step": 15829 + }, + { + "epoch": 1.0147042864894193, + "grad_norm": 6.8858962059021, + "learning_rate": 7.933534329176374e-06, + "loss": 0.3815, + "step": 15830 + }, + { + "epoch": 1.0147178513293542, + "grad_norm": 5.590328216552734, + "learning_rate": 7.93339728655612e-06, + "loss": 0.3431, + "step": 15831 + }, + { + "epoch": 1.0147314161692893, + "grad_norm": 4.641382217407227, + "learning_rate": 7.933260243935864e-06, + "loss": 0.2766, + "step": 15832 + }, + { + "epoch": 1.0147449810092242, + "grad_norm": 7.0087456703186035, + "learning_rate": 7.93312320131561e-06, + "loss": 0.376, + "step": 15833 + }, + { + "epoch": 1.014758545849159, + "grad_norm": 6.418429851531982, + "learning_rate": 7.932986158695355e-06, + "loss": 0.4075, + "step": 15834 + }, + { + "epoch": 1.014772110689094, + "grad_norm": 6.573215961456299, + "learning_rate": 7.9328491160751e-06, + "loss": 0.2254, + "step": 15835 + }, + { + "epoch": 1.0147856755290288, + "grad_norm": 6.78688383102417, + "learning_rate": 7.932712073454845e-06, + "loss": 0.3269, + "step": 15836 + }, + { + "epoch": 1.0147992403689636, + "grad_norm": 5.442883014678955, + "learning_rate": 7.93257503083459e-06, + "loss": 0.3136, + "step": 15837 + }, + { + "epoch": 1.0148128052088985, + "grad_norm": 5.952253341674805, + "learning_rate": 7.932437988214336e-06, + "loss": 0.3459, + "step": 15838 + }, + { + "epoch": 1.0148263700488334, + "grad_norm": 4.588817596435547, + "learning_rate": 7.93230094559408e-06, + "loss": 0.2568, + "step": 15839 + }, + { + "epoch": 1.0148399348887682, + "grad_norm": 5.168046474456787, + "learning_rate": 7.932163902973826e-06, + "loss": 0.215, + "step": 15840 + }, + { + "epoch": 1.0148534997287033, + "grad_norm": 4.489185810089111, + "learning_rate": 7.932026860353571e-06, + "loss": 0.2672, + "step": 15841 + }, + { + "epoch": 1.0148670645686382, + "grad_norm": 7.843742847442627, + "learning_rate": 7.931889817733316e-06, + "loss": 0.3375, + "step": 15842 + }, + { + "epoch": 1.014880629408573, + "grad_norm": 6.692503929138184, + "learning_rate": 7.931752775113061e-06, + "loss": 0.4209, + "step": 15843 + }, + { + "epoch": 1.014894194248508, + "grad_norm": 6.118607044219971, + "learning_rate": 7.931615732492805e-06, + "loss": 0.286, + "step": 15844 + }, + { + "epoch": 1.0149077590884428, + "grad_norm": 6.863707065582275, + "learning_rate": 7.931478689872552e-06, + "loss": 0.2844, + "step": 15845 + }, + { + "epoch": 1.0149213239283776, + "grad_norm": 6.54550838470459, + "learning_rate": 7.931341647252297e-06, + "loss": 0.2777, + "step": 15846 + }, + { + "epoch": 1.0149348887683125, + "grad_norm": 4.7045464515686035, + "learning_rate": 7.93120460463204e-06, + "loss": 0.26, + "step": 15847 + }, + { + "epoch": 1.0149484536082474, + "grad_norm": 6.233334064483643, + "learning_rate": 7.931067562011786e-06, + "loss": 0.3351, + "step": 15848 + }, + { + "epoch": 1.0149620184481822, + "grad_norm": 5.581233501434326, + "learning_rate": 7.93093051939153e-06, + "loss": 0.2692, + "step": 15849 + }, + { + "epoch": 1.014975583288117, + "grad_norm": 7.656021595001221, + "learning_rate": 7.930793476771278e-06, + "loss": 0.4219, + "step": 15850 + }, + { + "epoch": 1.0149891481280522, + "grad_norm": 5.5445051193237305, + "learning_rate": 7.930656434151021e-06, + "loss": 0.3702, + "step": 15851 + }, + { + "epoch": 1.015002712967987, + "grad_norm": 6.947341442108154, + "learning_rate": 7.930519391530766e-06, + "loss": 0.3872, + "step": 15852 + }, + { + "epoch": 1.015016277807922, + "grad_norm": 4.326298236846924, + "learning_rate": 7.930382348910512e-06, + "loss": 0.3283, + "step": 15853 + }, + { + "epoch": 1.0150298426478568, + "grad_norm": 5.190774440765381, + "learning_rate": 7.930245306290257e-06, + "loss": 0.3028, + "step": 15854 + }, + { + "epoch": 1.0150434074877916, + "grad_norm": 4.418176174163818, + "learning_rate": 7.930108263670002e-06, + "loss": 0.2428, + "step": 15855 + }, + { + "epoch": 1.0150569723277265, + "grad_norm": 4.976489543914795, + "learning_rate": 7.929971221049747e-06, + "loss": 0.4119, + "step": 15856 + }, + { + "epoch": 1.0150705371676614, + "grad_norm": 6.4579949378967285, + "learning_rate": 7.929834178429492e-06, + "loss": 0.3444, + "step": 15857 + }, + { + "epoch": 1.0150841020075962, + "grad_norm": 4.396227836608887, + "learning_rate": 7.929697135809237e-06, + "loss": 0.2461, + "step": 15858 + }, + { + "epoch": 1.015097666847531, + "grad_norm": 4.019779205322266, + "learning_rate": 7.929560093188983e-06, + "loss": 0.1711, + "step": 15859 + }, + { + "epoch": 1.0151112316874662, + "grad_norm": 4.413082599639893, + "learning_rate": 7.929423050568728e-06, + "loss": 0.1737, + "step": 15860 + }, + { + "epoch": 1.015124796527401, + "grad_norm": 3.855212688446045, + "learning_rate": 7.929286007948473e-06, + "loss": 0.2303, + "step": 15861 + }, + { + "epoch": 1.015138361367336, + "grad_norm": 5.476507663726807, + "learning_rate": 7.929148965328216e-06, + "loss": 0.3125, + "step": 15862 + }, + { + "epoch": 1.0151519262072708, + "grad_norm": 6.2260026931762695, + "learning_rate": 7.929011922707963e-06, + "loss": 0.2544, + "step": 15863 + }, + { + "epoch": 1.0151654910472057, + "grad_norm": 5.060756206512451, + "learning_rate": 7.928874880087709e-06, + "loss": 0.3657, + "step": 15864 + }, + { + "epoch": 1.0151790558871405, + "grad_norm": 5.495786666870117, + "learning_rate": 7.928737837467454e-06, + "loss": 0.315, + "step": 15865 + }, + { + "epoch": 1.0151926207270754, + "grad_norm": 6.137124538421631, + "learning_rate": 7.928600794847197e-06, + "loss": 0.288, + "step": 15866 + }, + { + "epoch": 1.0152061855670103, + "grad_norm": 4.476141929626465, + "learning_rate": 7.928463752226944e-06, + "loss": 0.2063, + "step": 15867 + }, + { + "epoch": 1.0152197504069451, + "grad_norm": 6.734859466552734, + "learning_rate": 7.92832670960669e-06, + "loss": 0.398, + "step": 15868 + }, + { + "epoch": 1.01523331524688, + "grad_norm": 5.1587395668029785, + "learning_rate": 7.928189666986433e-06, + "loss": 0.3832, + "step": 15869 + }, + { + "epoch": 1.015246880086815, + "grad_norm": 3.945496082305908, + "learning_rate": 7.928052624366178e-06, + "loss": 0.1918, + "step": 15870 + }, + { + "epoch": 1.01526044492675, + "grad_norm": 4.4337849617004395, + "learning_rate": 7.927915581745925e-06, + "loss": 0.2162, + "step": 15871 + }, + { + "epoch": 1.0152740097666848, + "grad_norm": 5.372912406921387, + "learning_rate": 7.927778539125668e-06, + "loss": 0.3139, + "step": 15872 + }, + { + "epoch": 1.0152875746066197, + "grad_norm": 4.720937252044678, + "learning_rate": 7.927641496505413e-06, + "loss": 0.2099, + "step": 15873 + }, + { + "epoch": 1.0153011394465545, + "grad_norm": 4.247608184814453, + "learning_rate": 7.927504453885159e-06, + "loss": 0.2866, + "step": 15874 + }, + { + "epoch": 1.0153147042864894, + "grad_norm": 6.071672439575195, + "learning_rate": 7.927367411264904e-06, + "loss": 0.3727, + "step": 15875 + }, + { + "epoch": 1.0153282691264243, + "grad_norm": 3.864650249481201, + "learning_rate": 7.927230368644649e-06, + "loss": 0.2082, + "step": 15876 + }, + { + "epoch": 1.0153418339663591, + "grad_norm": 7.162680149078369, + "learning_rate": 7.927093326024394e-06, + "loss": 0.5268, + "step": 15877 + }, + { + "epoch": 1.015355398806294, + "grad_norm": 8.232931137084961, + "learning_rate": 7.92695628340414e-06, + "loss": 0.423, + "step": 15878 + }, + { + "epoch": 1.015368963646229, + "grad_norm": 6.662493705749512, + "learning_rate": 7.926819240783885e-06, + "loss": 0.3562, + "step": 15879 + }, + { + "epoch": 1.015382528486164, + "grad_norm": 5.212189674377441, + "learning_rate": 7.92668219816363e-06, + "loss": 0.2248, + "step": 15880 + }, + { + "epoch": 1.0153960933260988, + "grad_norm": 4.8180131912231445, + "learning_rate": 7.926545155543375e-06, + "loss": 0.2275, + "step": 15881 + }, + { + "epoch": 1.0154096581660337, + "grad_norm": 6.201505661010742, + "learning_rate": 7.92640811292312e-06, + "loss": 0.2916, + "step": 15882 + }, + { + "epoch": 1.0154232230059685, + "grad_norm": 4.91684103012085, + "learning_rate": 7.926271070302865e-06, + "loss": 0.2365, + "step": 15883 + }, + { + "epoch": 1.0154367878459034, + "grad_norm": 5.247581481933594, + "learning_rate": 7.92613402768261e-06, + "loss": 0.2386, + "step": 15884 + }, + { + "epoch": 1.0154503526858383, + "grad_norm": 4.251084327697754, + "learning_rate": 7.925996985062356e-06, + "loss": 0.202, + "step": 15885 + }, + { + "epoch": 1.0154639175257731, + "grad_norm": 4.117406845092773, + "learning_rate": 7.9258599424421e-06, + "loss": 0.1423, + "step": 15886 + }, + { + "epoch": 1.015477482365708, + "grad_norm": 5.487328052520752, + "learning_rate": 7.925722899821844e-06, + "loss": 0.2748, + "step": 15887 + }, + { + "epoch": 1.0154910472056429, + "grad_norm": 5.605180263519287, + "learning_rate": 7.92558585720159e-06, + "loss": 0.2677, + "step": 15888 + }, + { + "epoch": 1.015504612045578, + "grad_norm": 4.956093788146973, + "learning_rate": 7.925448814581336e-06, + "loss": 0.3009, + "step": 15889 + }, + { + "epoch": 1.0155181768855128, + "grad_norm": 5.517490386962891, + "learning_rate": 7.92531177196108e-06, + "loss": 0.3494, + "step": 15890 + }, + { + "epoch": 1.0155317417254477, + "grad_norm": 6.107114315032959, + "learning_rate": 7.925174729340825e-06, + "loss": 0.4596, + "step": 15891 + }, + { + "epoch": 1.0155453065653826, + "grad_norm": 5.956455230712891, + "learning_rate": 7.92503768672057e-06, + "loss": 0.2269, + "step": 15892 + }, + { + "epoch": 1.0155588714053174, + "grad_norm": 5.446667194366455, + "learning_rate": 7.924900644100317e-06, + "loss": 0.2864, + "step": 15893 + }, + { + "epoch": 1.0155724362452523, + "grad_norm": 5.534738540649414, + "learning_rate": 7.92476360148006e-06, + "loss": 0.211, + "step": 15894 + }, + { + "epoch": 1.0155860010851872, + "grad_norm": 6.302571773529053, + "learning_rate": 7.924626558859806e-06, + "loss": 0.3259, + "step": 15895 + }, + { + "epoch": 1.015599565925122, + "grad_norm": 5.753692626953125, + "learning_rate": 7.924489516239551e-06, + "loss": 0.2568, + "step": 15896 + }, + { + "epoch": 1.0156131307650569, + "grad_norm": 6.426350116729736, + "learning_rate": 7.924352473619296e-06, + "loss": 0.3794, + "step": 15897 + }, + { + "epoch": 1.015626695604992, + "grad_norm": 5.022762298583984, + "learning_rate": 7.924215430999041e-06, + "loss": 0.2018, + "step": 15898 + }, + { + "epoch": 1.0156402604449268, + "grad_norm": 5.9055705070495605, + "learning_rate": 7.924078388378786e-06, + "loss": 0.2626, + "step": 15899 + }, + { + "epoch": 1.0156538252848617, + "grad_norm": 5.095238208770752, + "learning_rate": 7.923941345758532e-06, + "loss": 0.3084, + "step": 15900 + }, + { + "epoch": 1.0156673901247966, + "grad_norm": 7.725504398345947, + "learning_rate": 7.923804303138277e-06, + "loss": 0.4633, + "step": 15901 + }, + { + "epoch": 1.0156809549647314, + "grad_norm": 5.431142807006836, + "learning_rate": 7.923667260518022e-06, + "loss": 0.3462, + "step": 15902 + }, + { + "epoch": 1.0156945198046663, + "grad_norm": 4.781754970550537, + "learning_rate": 7.923530217897767e-06, + "loss": 0.2596, + "step": 15903 + }, + { + "epoch": 1.0157080846446012, + "grad_norm": 6.951389789581299, + "learning_rate": 7.923393175277512e-06, + "loss": 0.3383, + "step": 15904 + }, + { + "epoch": 1.015721649484536, + "grad_norm": 5.977967739105225, + "learning_rate": 7.923256132657256e-06, + "loss": 0.2784, + "step": 15905 + }, + { + "epoch": 1.015735214324471, + "grad_norm": 4.186101913452148, + "learning_rate": 7.923119090037003e-06, + "loss": 0.194, + "step": 15906 + }, + { + "epoch": 1.0157487791644058, + "grad_norm": 5.59775972366333, + "learning_rate": 7.922982047416748e-06, + "loss": 0.2513, + "step": 15907 + }, + { + "epoch": 1.0157623440043408, + "grad_norm": 5.794796466827393, + "learning_rate": 7.922845004796493e-06, + "loss": 0.3761, + "step": 15908 + }, + { + "epoch": 1.0157759088442757, + "grad_norm": 6.86356258392334, + "learning_rate": 7.922707962176236e-06, + "loss": 0.4412, + "step": 15909 + }, + { + "epoch": 1.0157894736842106, + "grad_norm": 5.983849048614502, + "learning_rate": 7.922570919555983e-06, + "loss": 0.3217, + "step": 15910 + }, + { + "epoch": 1.0158030385241454, + "grad_norm": 6.189972877502441, + "learning_rate": 7.922433876935729e-06, + "loss": 0.3241, + "step": 15911 + }, + { + "epoch": 1.0158166033640803, + "grad_norm": 4.668858051300049, + "learning_rate": 7.922296834315472e-06, + "loss": 0.166, + "step": 15912 + }, + { + "epoch": 1.0158301682040152, + "grad_norm": 7.453067302703857, + "learning_rate": 7.922159791695217e-06, + "loss": 0.3209, + "step": 15913 + }, + { + "epoch": 1.01584373304395, + "grad_norm": 4.608414649963379, + "learning_rate": 7.922022749074964e-06, + "loss": 0.1876, + "step": 15914 + }, + { + "epoch": 1.015857297883885, + "grad_norm": 5.942353248596191, + "learning_rate": 7.921885706454708e-06, + "loss": 0.3367, + "step": 15915 + }, + { + "epoch": 1.0158708627238198, + "grad_norm": 4.639171123504639, + "learning_rate": 7.921748663834453e-06, + "loss": 0.2441, + "step": 15916 + }, + { + "epoch": 1.0158844275637549, + "grad_norm": 4.825697898864746, + "learning_rate": 7.921611621214198e-06, + "loss": 0.2424, + "step": 15917 + }, + { + "epoch": 1.0158979924036897, + "grad_norm": 5.896303653717041, + "learning_rate": 7.921474578593943e-06, + "loss": 0.2027, + "step": 15918 + }, + { + "epoch": 1.0159115572436246, + "grad_norm": 8.108917236328125, + "learning_rate": 7.921337535973688e-06, + "loss": 0.3939, + "step": 15919 + }, + { + "epoch": 1.0159251220835595, + "grad_norm": 6.803304195404053, + "learning_rate": 7.921200493353433e-06, + "loss": 0.3188, + "step": 15920 + }, + { + "epoch": 1.0159386869234943, + "grad_norm": 6.042364120483398, + "learning_rate": 7.921063450733179e-06, + "loss": 0.2892, + "step": 15921 + }, + { + "epoch": 1.0159522517634292, + "grad_norm": 6.490031719207764, + "learning_rate": 7.920926408112924e-06, + "loss": 0.2578, + "step": 15922 + }, + { + "epoch": 1.015965816603364, + "grad_norm": 6.338657855987549, + "learning_rate": 7.920789365492669e-06, + "loss": 0.3446, + "step": 15923 + }, + { + "epoch": 1.015979381443299, + "grad_norm": 7.290718078613281, + "learning_rate": 7.920652322872414e-06, + "loss": 0.2811, + "step": 15924 + }, + { + "epoch": 1.0159929462832338, + "grad_norm": 6.808869361877441, + "learning_rate": 7.92051528025216e-06, + "loss": 0.3843, + "step": 15925 + }, + { + "epoch": 1.0160065111231686, + "grad_norm": 7.469825744628906, + "learning_rate": 7.920378237631905e-06, + "loss": 0.4464, + "step": 15926 + }, + { + "epoch": 1.0160200759631037, + "grad_norm": 5.343947887420654, + "learning_rate": 7.92024119501165e-06, + "loss": 0.287, + "step": 15927 + }, + { + "epoch": 1.0160336408030386, + "grad_norm": 7.617031574249268, + "learning_rate": 7.920104152391395e-06, + "loss": 0.3594, + "step": 15928 + }, + { + "epoch": 1.0160472056429735, + "grad_norm": 5.385975360870361, + "learning_rate": 7.91996710977114e-06, + "loss": 0.3469, + "step": 15929 + }, + { + "epoch": 1.0160607704829083, + "grad_norm": 6.861351013183594, + "learning_rate": 7.919830067150884e-06, + "loss": 0.3036, + "step": 15930 + }, + { + "epoch": 1.0160743353228432, + "grad_norm": 4.210237979888916, + "learning_rate": 7.919693024530629e-06, + "loss": 0.1935, + "step": 15931 + }, + { + "epoch": 1.016087900162778, + "grad_norm": 6.226799964904785, + "learning_rate": 7.919555981910376e-06, + "loss": 0.3893, + "step": 15932 + }, + { + "epoch": 1.016101465002713, + "grad_norm": 4.773007392883301, + "learning_rate": 7.91941893929012e-06, + "loss": 0.2097, + "step": 15933 + }, + { + "epoch": 1.0161150298426478, + "grad_norm": 5.880929946899414, + "learning_rate": 7.919281896669864e-06, + "loss": 0.2446, + "step": 15934 + }, + { + "epoch": 1.0161285946825827, + "grad_norm": 5.163087844848633, + "learning_rate": 7.91914485404961e-06, + "loss": 0.2679, + "step": 15935 + }, + { + "epoch": 1.0161421595225177, + "grad_norm": 4.64190149307251, + "learning_rate": 7.919007811429356e-06, + "loss": 0.1941, + "step": 15936 + }, + { + "epoch": 1.0161557243624526, + "grad_norm": 4.464310169219971, + "learning_rate": 7.9188707688091e-06, + "loss": 0.2957, + "step": 15937 + }, + { + "epoch": 1.0161692892023875, + "grad_norm": 4.508310317993164, + "learning_rate": 7.918733726188845e-06, + "loss": 0.2593, + "step": 15938 + }, + { + "epoch": 1.0161828540423223, + "grad_norm": 6.048851013183594, + "learning_rate": 7.91859668356859e-06, + "loss": 0.3258, + "step": 15939 + }, + { + "epoch": 1.0161964188822572, + "grad_norm": 6.118239402770996, + "learning_rate": 7.918459640948335e-06, + "loss": 0.3795, + "step": 15940 + }, + { + "epoch": 1.016209983722192, + "grad_norm": 7.220101833343506, + "learning_rate": 7.91832259832808e-06, + "loss": 0.3801, + "step": 15941 + }, + { + "epoch": 1.016223548562127, + "grad_norm": 6.94920539855957, + "learning_rate": 7.918185555707826e-06, + "loss": 0.466, + "step": 15942 + }, + { + "epoch": 1.0162371134020618, + "grad_norm": 5.764710426330566, + "learning_rate": 7.918048513087571e-06, + "loss": 0.2854, + "step": 15943 + }, + { + "epoch": 1.0162506782419967, + "grad_norm": 6.0468573570251465, + "learning_rate": 7.917911470467316e-06, + "loss": 0.3461, + "step": 15944 + }, + { + "epoch": 1.0162642430819315, + "grad_norm": 6.819165229797363, + "learning_rate": 7.917774427847061e-06, + "loss": 0.3717, + "step": 15945 + }, + { + "epoch": 1.0162778079218666, + "grad_norm": 4.683807373046875, + "learning_rate": 7.917637385226806e-06, + "loss": 0.3648, + "step": 15946 + }, + { + "epoch": 1.0162913727618015, + "grad_norm": 6.83878755569458, + "learning_rate": 7.917500342606552e-06, + "loss": 0.326, + "step": 15947 + }, + { + "epoch": 1.0163049376017363, + "grad_norm": 4.408483028411865, + "learning_rate": 7.917363299986297e-06, + "loss": 0.2361, + "step": 15948 + }, + { + "epoch": 1.0163185024416712, + "grad_norm": 6.767213821411133, + "learning_rate": 7.917226257366042e-06, + "loss": 0.2614, + "step": 15949 + }, + { + "epoch": 1.016332067281606, + "grad_norm": 5.38482666015625, + "learning_rate": 7.917089214745787e-06, + "loss": 0.3279, + "step": 15950 + }, + { + "epoch": 1.016345632121541, + "grad_norm": 6.818838119506836, + "learning_rate": 7.916952172125532e-06, + "loss": 0.4323, + "step": 15951 + }, + { + "epoch": 1.0163591969614758, + "grad_norm": 4.991490364074707, + "learning_rate": 7.916815129505276e-06, + "loss": 0.293, + "step": 15952 + }, + { + "epoch": 1.0163727618014107, + "grad_norm": 5.25674295425415, + "learning_rate": 7.916678086885023e-06, + "loss": 0.2723, + "step": 15953 + }, + { + "epoch": 1.0163863266413455, + "grad_norm": 7.26515531539917, + "learning_rate": 7.916541044264768e-06, + "loss": 0.3811, + "step": 15954 + }, + { + "epoch": 1.0163998914812806, + "grad_norm": 5.579030990600586, + "learning_rate": 7.916404001644511e-06, + "loss": 0.2935, + "step": 15955 + }, + { + "epoch": 1.0164134563212155, + "grad_norm": 5.838875770568848, + "learning_rate": 7.916266959024257e-06, + "loss": 0.2438, + "step": 15956 + }, + { + "epoch": 1.0164270211611504, + "grad_norm": 5.9237542152404785, + "learning_rate": 7.916129916404002e-06, + "loss": 0.2841, + "step": 15957 + }, + { + "epoch": 1.0164405860010852, + "grad_norm": 5.952014923095703, + "learning_rate": 7.915992873783747e-06, + "loss": 0.2769, + "step": 15958 + }, + { + "epoch": 1.01645415084102, + "grad_norm": 6.274308681488037, + "learning_rate": 7.915855831163492e-06, + "loss": 0.2902, + "step": 15959 + }, + { + "epoch": 1.016467715680955, + "grad_norm": 5.000537872314453, + "learning_rate": 7.915718788543237e-06, + "loss": 0.2397, + "step": 15960 + }, + { + "epoch": 1.0164812805208898, + "grad_norm": 7.879330158233643, + "learning_rate": 7.915581745922982e-06, + "loss": 0.3616, + "step": 15961 + }, + { + "epoch": 1.0164948453608247, + "grad_norm": 4.733438491821289, + "learning_rate": 7.915444703302728e-06, + "loss": 0.2841, + "step": 15962 + }, + { + "epoch": 1.0165084102007595, + "grad_norm": 4.54692268371582, + "learning_rate": 7.915307660682473e-06, + "loss": 0.1886, + "step": 15963 + }, + { + "epoch": 1.0165219750406944, + "grad_norm": 5.872048377990723, + "learning_rate": 7.915170618062218e-06, + "loss": 0.3624, + "step": 15964 + }, + { + "epoch": 1.0165355398806295, + "grad_norm": 5.028322696685791, + "learning_rate": 7.915033575441963e-06, + "loss": 0.3503, + "step": 15965 + }, + { + "epoch": 1.0165491047205644, + "grad_norm": 6.961627006530762, + "learning_rate": 7.914896532821708e-06, + "loss": 0.3447, + "step": 15966 + }, + { + "epoch": 1.0165626695604992, + "grad_norm": 4.278672218322754, + "learning_rate": 7.914759490201453e-06, + "loss": 0.3391, + "step": 15967 + }, + { + "epoch": 1.016576234400434, + "grad_norm": 5.718221664428711, + "learning_rate": 7.914622447581199e-06, + "loss": 0.1768, + "step": 15968 + }, + { + "epoch": 1.016589799240369, + "grad_norm": 4.3300909996032715, + "learning_rate": 7.914485404960944e-06, + "loss": 0.3064, + "step": 15969 + }, + { + "epoch": 1.0166033640803038, + "grad_norm": 4.589625835418701, + "learning_rate": 7.914348362340689e-06, + "loss": 0.3305, + "step": 15970 + }, + { + "epoch": 1.0166169289202387, + "grad_norm": 4.729247093200684, + "learning_rate": 7.914211319720434e-06, + "loss": 0.3405, + "step": 15971 + }, + { + "epoch": 1.0166304937601736, + "grad_norm": 6.025145530700684, + "learning_rate": 7.91407427710018e-06, + "loss": 0.2991, + "step": 15972 + }, + { + "epoch": 1.0166440586001084, + "grad_norm": 5.793961048126221, + "learning_rate": 7.913937234479923e-06, + "loss": 0.2557, + "step": 15973 + }, + { + "epoch": 1.0166576234400435, + "grad_norm": 5.913898944854736, + "learning_rate": 7.913800191859668e-06, + "loss": 0.3188, + "step": 15974 + }, + { + "epoch": 1.0166711882799784, + "grad_norm": 6.232263088226318, + "learning_rate": 7.913663149239415e-06, + "loss": 0.241, + "step": 15975 + }, + { + "epoch": 1.0166847531199132, + "grad_norm": 4.860380172729492, + "learning_rate": 7.91352610661916e-06, + "loss": 0.2924, + "step": 15976 + }, + { + "epoch": 1.016698317959848, + "grad_norm": 5.595130443572998, + "learning_rate": 7.913389063998904e-06, + "loss": 0.307, + "step": 15977 + }, + { + "epoch": 1.016711882799783, + "grad_norm": 5.209598064422607, + "learning_rate": 7.913252021378649e-06, + "loss": 0.2899, + "step": 15978 + }, + { + "epoch": 1.0167254476397178, + "grad_norm": 5.568440914154053, + "learning_rate": 7.913114978758396e-06, + "loss": 0.2314, + "step": 15979 + }, + { + "epoch": 1.0167390124796527, + "grad_norm": 6.4699273109436035, + "learning_rate": 7.912977936138139e-06, + "loss": 0.3584, + "step": 15980 + }, + { + "epoch": 1.0167525773195876, + "grad_norm": 5.725692272186279, + "learning_rate": 7.912840893517884e-06, + "loss": 0.3448, + "step": 15981 + }, + { + "epoch": 1.0167661421595224, + "grad_norm": 7.271390914916992, + "learning_rate": 7.91270385089763e-06, + "loss": 0.2726, + "step": 15982 + }, + { + "epoch": 1.0167797069994573, + "grad_norm": 6.027499675750732, + "learning_rate": 7.912566808277375e-06, + "loss": 0.2992, + "step": 15983 + }, + { + "epoch": 1.0167932718393924, + "grad_norm": 4.924431800842285, + "learning_rate": 7.91242976565712e-06, + "loss": 0.2644, + "step": 15984 + }, + { + "epoch": 1.0168068366793273, + "grad_norm": 5.745489120483398, + "learning_rate": 7.912292723036865e-06, + "loss": 0.2796, + "step": 15985 + }, + { + "epoch": 1.0168204015192621, + "grad_norm": 5.331393241882324, + "learning_rate": 7.91215568041661e-06, + "loss": 0.3131, + "step": 15986 + }, + { + "epoch": 1.016833966359197, + "grad_norm": 6.563957214355469, + "learning_rate": 7.912018637796355e-06, + "loss": 0.3988, + "step": 15987 + }, + { + "epoch": 1.0168475311991318, + "grad_norm": 5.666750431060791, + "learning_rate": 7.9118815951761e-06, + "loss": 0.2485, + "step": 15988 + }, + { + "epoch": 1.0168610960390667, + "grad_norm": 7.159228801727295, + "learning_rate": 7.911744552555846e-06, + "loss": 0.3761, + "step": 15989 + }, + { + "epoch": 1.0168746608790016, + "grad_norm": 6.290239334106445, + "learning_rate": 7.911607509935591e-06, + "loss": 0.2667, + "step": 15990 + }, + { + "epoch": 1.0168882257189364, + "grad_norm": 6.426596164703369, + "learning_rate": 7.911470467315336e-06, + "loss": 0.2965, + "step": 15991 + }, + { + "epoch": 1.0169017905588713, + "grad_norm": 4.223938465118408, + "learning_rate": 7.911333424695081e-06, + "loss": 0.1371, + "step": 15992 + }, + { + "epoch": 1.0169153553988064, + "grad_norm": 6.423480033874512, + "learning_rate": 7.911196382074826e-06, + "loss": 0.3269, + "step": 15993 + }, + { + "epoch": 1.0169289202387413, + "grad_norm": 6.554468631744385, + "learning_rate": 7.911059339454572e-06, + "loss": 0.4289, + "step": 15994 + }, + { + "epoch": 1.0169424850786761, + "grad_norm": 6.309236526489258, + "learning_rate": 7.910922296834315e-06, + "loss": 0.3687, + "step": 15995 + }, + { + "epoch": 1.016956049918611, + "grad_norm": 5.608130931854248, + "learning_rate": 7.910785254214062e-06, + "loss": 0.2318, + "step": 15996 + }, + { + "epoch": 1.0169696147585459, + "grad_norm": 5.222724437713623, + "learning_rate": 7.910648211593807e-06, + "loss": 0.2602, + "step": 15997 + }, + { + "epoch": 1.0169831795984807, + "grad_norm": 5.92506217956543, + "learning_rate": 7.91051116897355e-06, + "loss": 0.2707, + "step": 15998 + }, + { + "epoch": 1.0169967444384156, + "grad_norm": 6.059469699859619, + "learning_rate": 7.910374126353296e-06, + "loss": 0.3709, + "step": 15999 + }, + { + "epoch": 1.0170103092783505, + "grad_norm": 6.901880264282227, + "learning_rate": 7.910237083733041e-06, + "loss": 0.3388, + "step": 16000 + }, + { + "epoch": 1.0170238741182853, + "grad_norm": 5.86766242980957, + "learning_rate": 7.910100041112788e-06, + "loss": 0.3236, + "step": 16001 + }, + { + "epoch": 1.0170374389582202, + "grad_norm": 5.441189765930176, + "learning_rate": 7.909962998492531e-06, + "loss": 0.3096, + "step": 16002 + }, + { + "epoch": 1.0170510037981553, + "grad_norm": 5.677998065948486, + "learning_rate": 7.909825955872277e-06, + "loss": 0.2953, + "step": 16003 + }, + { + "epoch": 1.0170645686380901, + "grad_norm": 5.908392906188965, + "learning_rate": 7.909688913252022e-06, + "loss": 0.2628, + "step": 16004 + }, + { + "epoch": 1.017078133478025, + "grad_norm": 5.2981181144714355, + "learning_rate": 7.909551870631767e-06, + "loss": 0.2777, + "step": 16005 + }, + { + "epoch": 1.0170916983179599, + "grad_norm": 4.759017467498779, + "learning_rate": 7.909414828011512e-06, + "loss": 0.2604, + "step": 16006 + }, + { + "epoch": 1.0171052631578947, + "grad_norm": 6.414813041687012, + "learning_rate": 7.909277785391257e-06, + "loss": 0.2423, + "step": 16007 + }, + { + "epoch": 1.0171188279978296, + "grad_norm": 5.474057197570801, + "learning_rate": 7.909140742771002e-06, + "loss": 0.3133, + "step": 16008 + }, + { + "epoch": 1.0171323928377645, + "grad_norm": 6.288939476013184, + "learning_rate": 7.909003700150748e-06, + "loss": 0.2893, + "step": 16009 + }, + { + "epoch": 1.0171459576776993, + "grad_norm": 6.112009525299072, + "learning_rate": 7.908866657530493e-06, + "loss": 0.2941, + "step": 16010 + }, + { + "epoch": 1.0171595225176342, + "grad_norm": 4.465234756469727, + "learning_rate": 7.908729614910238e-06, + "loss": 0.1806, + "step": 16011 + }, + { + "epoch": 1.0171730873575693, + "grad_norm": 5.824834823608398, + "learning_rate": 7.908592572289983e-06, + "loss": 0.314, + "step": 16012 + }, + { + "epoch": 1.0171866521975041, + "grad_norm": 7.278061866760254, + "learning_rate": 7.908455529669727e-06, + "loss": 0.3675, + "step": 16013 + }, + { + "epoch": 1.017200217037439, + "grad_norm": 6.648882865905762, + "learning_rate": 7.908318487049473e-06, + "loss": 0.3737, + "step": 16014 + }, + { + "epoch": 1.0172137818773739, + "grad_norm": 4.568985939025879, + "learning_rate": 7.908181444429219e-06, + "loss": 0.2011, + "step": 16015 + }, + { + "epoch": 1.0172273467173087, + "grad_norm": 4.892397403717041, + "learning_rate": 7.908044401808964e-06, + "loss": 0.2233, + "step": 16016 + }, + { + "epoch": 1.0172409115572436, + "grad_norm": 6.182493686676025, + "learning_rate": 7.907907359188707e-06, + "loss": 0.293, + "step": 16017 + }, + { + "epoch": 1.0172544763971785, + "grad_norm": 5.499197006225586, + "learning_rate": 7.907770316568454e-06, + "loss": 0.3556, + "step": 16018 + }, + { + "epoch": 1.0172680412371133, + "grad_norm": 5.718596458435059, + "learning_rate": 7.9076332739482e-06, + "loss": 0.2613, + "step": 16019 + }, + { + "epoch": 1.0172816060770482, + "grad_norm": 5.4555134773254395, + "learning_rate": 7.907496231327943e-06, + "loss": 0.2726, + "step": 16020 + }, + { + "epoch": 1.017295170916983, + "grad_norm": 5.384085178375244, + "learning_rate": 7.907359188707688e-06, + "loss": 0.2287, + "step": 16021 + }, + { + "epoch": 1.0173087357569182, + "grad_norm": 7.187456130981445, + "learning_rate": 7.907222146087435e-06, + "loss": 0.3607, + "step": 16022 + }, + { + "epoch": 1.017322300596853, + "grad_norm": 4.463510513305664, + "learning_rate": 7.907085103467178e-06, + "loss": 0.227, + "step": 16023 + }, + { + "epoch": 1.017335865436788, + "grad_norm": 6.074104309082031, + "learning_rate": 7.906948060846924e-06, + "loss": 0.2979, + "step": 16024 + }, + { + "epoch": 1.0173494302767228, + "grad_norm": 6.393637180328369, + "learning_rate": 7.906811018226669e-06, + "loss": 0.3584, + "step": 16025 + }, + { + "epoch": 1.0173629951166576, + "grad_norm": 5.621632099151611, + "learning_rate": 7.906673975606414e-06, + "loss": 0.201, + "step": 16026 + }, + { + "epoch": 1.0173765599565925, + "grad_norm": 6.565907955169678, + "learning_rate": 7.906536932986159e-06, + "loss": 0.3804, + "step": 16027 + }, + { + "epoch": 1.0173901247965274, + "grad_norm": 7.880030632019043, + "learning_rate": 7.906399890365904e-06, + "loss": 0.3258, + "step": 16028 + }, + { + "epoch": 1.0174036896364622, + "grad_norm": 5.283210277557373, + "learning_rate": 7.90626284774565e-06, + "loss": 0.2629, + "step": 16029 + }, + { + "epoch": 1.017417254476397, + "grad_norm": 4.89066219329834, + "learning_rate": 7.906125805125395e-06, + "loss": 0.2106, + "step": 16030 + }, + { + "epoch": 1.0174308193163322, + "grad_norm": 4.335445404052734, + "learning_rate": 7.90598876250514e-06, + "loss": 0.2228, + "step": 16031 + }, + { + "epoch": 1.017444384156267, + "grad_norm": 6.023960113525391, + "learning_rate": 7.905851719884885e-06, + "loss": 0.2833, + "step": 16032 + }, + { + "epoch": 1.017457948996202, + "grad_norm": 4.574112415313721, + "learning_rate": 7.90571467726463e-06, + "loss": 0.2154, + "step": 16033 + }, + { + "epoch": 1.0174715138361368, + "grad_norm": 7.440967559814453, + "learning_rate": 7.905577634644375e-06, + "loss": 0.483, + "step": 16034 + }, + { + "epoch": 1.0174850786760716, + "grad_norm": 7.970400333404541, + "learning_rate": 7.90544059202412e-06, + "loss": 0.2831, + "step": 16035 + }, + { + "epoch": 1.0174986435160065, + "grad_norm": 5.586824893951416, + "learning_rate": 7.905303549403866e-06, + "loss": 0.2625, + "step": 16036 + }, + { + "epoch": 1.0175122083559414, + "grad_norm": 5.601457595825195, + "learning_rate": 7.905166506783611e-06, + "loss": 0.319, + "step": 16037 + }, + { + "epoch": 1.0175257731958762, + "grad_norm": 6.235196113586426, + "learning_rate": 7.905029464163354e-06, + "loss": 0.247, + "step": 16038 + }, + { + "epoch": 1.017539338035811, + "grad_norm": 5.8201904296875, + "learning_rate": 7.904892421543101e-06, + "loss": 0.2607, + "step": 16039 + }, + { + "epoch": 1.017552902875746, + "grad_norm": 6.444372653961182, + "learning_rate": 7.904755378922846e-06, + "loss": 0.279, + "step": 16040 + }, + { + "epoch": 1.017566467715681, + "grad_norm": 7.524982929229736, + "learning_rate": 7.904618336302592e-06, + "loss": 0.4015, + "step": 16041 + }, + { + "epoch": 1.017580032555616, + "grad_norm": 4.7386794090271, + "learning_rate": 7.904481293682335e-06, + "loss": 0.2587, + "step": 16042 + }, + { + "epoch": 1.0175935973955508, + "grad_norm": 8.477706909179688, + "learning_rate": 7.90434425106208e-06, + "loss": 0.382, + "step": 16043 + }, + { + "epoch": 1.0176071622354856, + "grad_norm": 7.366369724273682, + "learning_rate": 7.904207208441827e-06, + "loss": 0.431, + "step": 16044 + }, + { + "epoch": 1.0176207270754205, + "grad_norm": 6.30072546005249, + "learning_rate": 7.90407016582157e-06, + "loss": 0.3589, + "step": 16045 + }, + { + "epoch": 1.0176342919153554, + "grad_norm": 8.283690452575684, + "learning_rate": 7.903933123201316e-06, + "loss": 0.3572, + "step": 16046 + }, + { + "epoch": 1.0176478567552902, + "grad_norm": 4.684642314910889, + "learning_rate": 7.903796080581061e-06, + "loss": 0.2488, + "step": 16047 + }, + { + "epoch": 1.017661421595225, + "grad_norm": 5.291536808013916, + "learning_rate": 7.903659037960806e-06, + "loss": 0.3109, + "step": 16048 + }, + { + "epoch": 1.01767498643516, + "grad_norm": 5.435524940490723, + "learning_rate": 7.903521995340551e-06, + "loss": 0.2595, + "step": 16049 + }, + { + "epoch": 1.017688551275095, + "grad_norm": 4.972982406616211, + "learning_rate": 7.903384952720297e-06, + "loss": 0.1671, + "step": 16050 + }, + { + "epoch": 1.01770211611503, + "grad_norm": 7.238722801208496, + "learning_rate": 7.903247910100042e-06, + "loss": 0.4598, + "step": 16051 + }, + { + "epoch": 1.0177156809549648, + "grad_norm": 7.326442718505859, + "learning_rate": 7.903110867479787e-06, + "loss": 0.2512, + "step": 16052 + }, + { + "epoch": 1.0177292457948997, + "grad_norm": 4.759247303009033, + "learning_rate": 7.902973824859532e-06, + "loss": 0.2137, + "step": 16053 + }, + { + "epoch": 1.0177428106348345, + "grad_norm": 5.4003729820251465, + "learning_rate": 7.902836782239277e-06, + "loss": 0.2553, + "step": 16054 + }, + { + "epoch": 1.0177563754747694, + "grad_norm": 4.347563743591309, + "learning_rate": 7.902699739619022e-06, + "loss": 0.1568, + "step": 16055 + }, + { + "epoch": 1.0177699403147042, + "grad_norm": 6.818612098693848, + "learning_rate": 7.902562696998768e-06, + "loss": 0.4376, + "step": 16056 + }, + { + "epoch": 1.0177835051546391, + "grad_norm": 6.755580902099609, + "learning_rate": 7.902425654378513e-06, + "loss": 0.3534, + "step": 16057 + }, + { + "epoch": 1.017797069994574, + "grad_norm": 6.800750255584717, + "learning_rate": 7.902288611758258e-06, + "loss": 0.2714, + "step": 16058 + }, + { + "epoch": 1.0178106348345088, + "grad_norm": 6.900210857391357, + "learning_rate": 7.902151569138003e-06, + "loss": 0.3258, + "step": 16059 + }, + { + "epoch": 1.017824199674444, + "grad_norm": 5.858254909515381, + "learning_rate": 7.902014526517747e-06, + "loss": 0.2447, + "step": 16060 + }, + { + "epoch": 1.0178377645143788, + "grad_norm": 4.997782230377197, + "learning_rate": 7.901877483897494e-06, + "loss": 0.2825, + "step": 16061 + }, + { + "epoch": 1.0178513293543137, + "grad_norm": 5.863989353179932, + "learning_rate": 7.901740441277239e-06, + "loss": 0.2878, + "step": 16062 + }, + { + "epoch": 1.0178648941942485, + "grad_norm": 5.558427810668945, + "learning_rate": 7.901603398656982e-06, + "loss": 0.2284, + "step": 16063 + }, + { + "epoch": 1.0178784590341834, + "grad_norm": 4.191459655761719, + "learning_rate": 7.901466356036727e-06, + "loss": 0.1297, + "step": 16064 + }, + { + "epoch": 1.0178920238741183, + "grad_norm": 6.207066059112549, + "learning_rate": 7.901329313416474e-06, + "loss": 0.2768, + "step": 16065 + }, + { + "epoch": 1.0179055887140531, + "grad_norm": 5.583162784576416, + "learning_rate": 7.901192270796218e-06, + "loss": 0.28, + "step": 16066 + }, + { + "epoch": 1.017919153553988, + "grad_norm": 4.122371673583984, + "learning_rate": 7.901055228175963e-06, + "loss": 0.1726, + "step": 16067 + }, + { + "epoch": 1.0179327183939229, + "grad_norm": 6.161792278289795, + "learning_rate": 7.900918185555708e-06, + "loss": 0.3607, + "step": 16068 + }, + { + "epoch": 1.017946283233858, + "grad_norm": 4.739904403686523, + "learning_rate": 7.900781142935453e-06, + "loss": 0.2219, + "step": 16069 + }, + { + "epoch": 1.0179598480737928, + "grad_norm": 4.725728988647461, + "learning_rate": 7.900644100315198e-06, + "loss": 0.2472, + "step": 16070 + }, + { + "epoch": 1.0179734129137277, + "grad_norm": 3.9446446895599365, + "learning_rate": 7.900507057694944e-06, + "loss": 0.1668, + "step": 16071 + }, + { + "epoch": 1.0179869777536625, + "grad_norm": 6.110358238220215, + "learning_rate": 7.900370015074689e-06, + "loss": 0.323, + "step": 16072 + }, + { + "epoch": 1.0180005425935974, + "grad_norm": 5.929605484008789, + "learning_rate": 7.900232972454434e-06, + "loss": 0.2367, + "step": 16073 + }, + { + "epoch": 1.0180141074335323, + "grad_norm": 4.62669563293457, + "learning_rate": 7.900095929834179e-06, + "loss": 0.2634, + "step": 16074 + }, + { + "epoch": 1.0180276722734671, + "grad_norm": 4.019739151000977, + "learning_rate": 7.899958887213924e-06, + "loss": 0.1816, + "step": 16075 + }, + { + "epoch": 1.018041237113402, + "grad_norm": 3.9723916053771973, + "learning_rate": 7.89982184459367e-06, + "loss": 0.2491, + "step": 16076 + }, + { + "epoch": 1.0180548019533369, + "grad_norm": 7.925947189331055, + "learning_rate": 7.899684801973415e-06, + "loss": 0.3144, + "step": 16077 + }, + { + "epoch": 1.0180683667932717, + "grad_norm": 5.5203938484191895, + "learning_rate": 7.89954775935316e-06, + "loss": 0.2988, + "step": 16078 + }, + { + "epoch": 1.0180819316332068, + "grad_norm": 6.5242509841918945, + "learning_rate": 7.899410716732905e-06, + "loss": 0.2867, + "step": 16079 + }, + { + "epoch": 1.0180954964731417, + "grad_norm": 5.66540002822876, + "learning_rate": 7.89927367411265e-06, + "loss": 0.2244, + "step": 16080 + }, + { + "epoch": 1.0181090613130765, + "grad_norm": 6.37709379196167, + "learning_rate": 7.899136631492394e-06, + "loss": 0.2436, + "step": 16081 + }, + { + "epoch": 1.0181226261530114, + "grad_norm": 4.7645792961120605, + "learning_rate": 7.898999588872139e-06, + "loss": 0.1987, + "step": 16082 + }, + { + "epoch": 1.0181361909929463, + "grad_norm": 5.713455677032471, + "learning_rate": 7.898862546251886e-06, + "loss": 0.268, + "step": 16083 + }, + { + "epoch": 1.0181497558328811, + "grad_norm": 5.58395528793335, + "learning_rate": 7.898725503631631e-06, + "loss": 0.3922, + "step": 16084 + }, + { + "epoch": 1.018163320672816, + "grad_norm": 5.22447395324707, + "learning_rate": 7.898588461011374e-06, + "loss": 0.315, + "step": 16085 + }, + { + "epoch": 1.0181768855127509, + "grad_norm": 5.268839359283447, + "learning_rate": 7.89845141839112e-06, + "loss": 0.2859, + "step": 16086 + }, + { + "epoch": 1.0181904503526857, + "grad_norm": 4.92682409286499, + "learning_rate": 7.898314375770866e-06, + "loss": 0.2659, + "step": 16087 + }, + { + "epoch": 1.0182040151926208, + "grad_norm": 5.671361923217773, + "learning_rate": 7.89817733315061e-06, + "loss": 0.3089, + "step": 16088 + }, + { + "epoch": 1.0182175800325557, + "grad_norm": 5.835974216461182, + "learning_rate": 7.898040290530355e-06, + "loss": 0.2943, + "step": 16089 + }, + { + "epoch": 1.0182311448724906, + "grad_norm": 5.206459999084473, + "learning_rate": 7.8979032479101e-06, + "loss": 0.3327, + "step": 16090 + }, + { + "epoch": 1.0182447097124254, + "grad_norm": 5.079563617706299, + "learning_rate": 7.897766205289845e-06, + "loss": 0.4083, + "step": 16091 + }, + { + "epoch": 1.0182582745523603, + "grad_norm": 6.6740336418151855, + "learning_rate": 7.89762916266959e-06, + "loss": 0.4282, + "step": 16092 + }, + { + "epoch": 1.0182718393922952, + "grad_norm": 4.972906112670898, + "learning_rate": 7.897492120049336e-06, + "loss": 0.3353, + "step": 16093 + }, + { + "epoch": 1.01828540423223, + "grad_norm": 3.5564475059509277, + "learning_rate": 7.897355077429081e-06, + "loss": 0.231, + "step": 16094 + }, + { + "epoch": 1.0182989690721649, + "grad_norm": 6.534174919128418, + "learning_rate": 7.897218034808826e-06, + "loss": 0.3919, + "step": 16095 + }, + { + "epoch": 1.0183125339120997, + "grad_norm": 4.730834007263184, + "learning_rate": 7.897080992188571e-06, + "loss": 0.1845, + "step": 16096 + }, + { + "epoch": 1.0183260987520346, + "grad_norm": 5.395130634307861, + "learning_rate": 7.896943949568317e-06, + "loss": 0.2997, + "step": 16097 + }, + { + "epoch": 1.0183396635919697, + "grad_norm": 4.830315589904785, + "learning_rate": 7.896806906948062e-06, + "loss": 0.2099, + "step": 16098 + }, + { + "epoch": 1.0183532284319046, + "grad_norm": 5.2047247886657715, + "learning_rate": 7.896669864327807e-06, + "loss": 0.2212, + "step": 16099 + }, + { + "epoch": 1.0183667932718394, + "grad_norm": 6.173455238342285, + "learning_rate": 7.896532821707552e-06, + "loss": 0.3289, + "step": 16100 + }, + { + "epoch": 1.0183803581117743, + "grad_norm": 4.904207706451416, + "learning_rate": 7.896395779087297e-06, + "loss": 0.2076, + "step": 16101 + }, + { + "epoch": 1.0183939229517092, + "grad_norm": 7.166809558868408, + "learning_rate": 7.896258736467042e-06, + "loss": 0.4283, + "step": 16102 + }, + { + "epoch": 1.018407487791644, + "grad_norm": 4.846743583679199, + "learning_rate": 7.896121693846786e-06, + "loss": 0.1946, + "step": 16103 + }, + { + "epoch": 1.018421052631579, + "grad_norm": 4.026425838470459, + "learning_rate": 7.895984651226533e-06, + "loss": 0.2462, + "step": 16104 + }, + { + "epoch": 1.0184346174715138, + "grad_norm": 4.270886421203613, + "learning_rate": 7.895847608606278e-06, + "loss": 0.2899, + "step": 16105 + }, + { + "epoch": 1.0184481823114486, + "grad_norm": 5.351601600646973, + "learning_rate": 7.895710565986021e-06, + "loss": 0.2068, + "step": 16106 + }, + { + "epoch": 1.0184617471513837, + "grad_norm": 4.517141342163086, + "learning_rate": 7.895573523365767e-06, + "loss": 0.2652, + "step": 16107 + }, + { + "epoch": 1.0184753119913186, + "grad_norm": 5.679062366485596, + "learning_rate": 7.895436480745514e-06, + "loss": 0.4065, + "step": 16108 + }, + { + "epoch": 1.0184888768312534, + "grad_norm": 3.1696083545684814, + "learning_rate": 7.895299438125259e-06, + "loss": 0.255, + "step": 16109 + }, + { + "epoch": 1.0185024416711883, + "grad_norm": 4.013091564178467, + "learning_rate": 7.895162395505002e-06, + "loss": 0.1799, + "step": 16110 + }, + { + "epoch": 1.0185160065111232, + "grad_norm": 5.126781940460205, + "learning_rate": 7.895025352884747e-06, + "loss": 0.2431, + "step": 16111 + }, + { + "epoch": 1.018529571351058, + "grad_norm": 5.735670566558838, + "learning_rate": 7.894888310264493e-06, + "loss": 0.3007, + "step": 16112 + }, + { + "epoch": 1.018543136190993, + "grad_norm": 4.966042518615723, + "learning_rate": 7.894751267644238e-06, + "loss": 0.24, + "step": 16113 + }, + { + "epoch": 1.0185567010309278, + "grad_norm": 6.322759628295898, + "learning_rate": 7.894614225023983e-06, + "loss": 0.3742, + "step": 16114 + }, + { + "epoch": 1.0185702658708626, + "grad_norm": 6.319315433502197, + "learning_rate": 7.894477182403728e-06, + "loss": 0.2558, + "step": 16115 + }, + { + "epoch": 1.0185838307107977, + "grad_norm": 4.335195541381836, + "learning_rate": 7.894340139783473e-06, + "loss": 0.1867, + "step": 16116 + }, + { + "epoch": 1.0185973955507326, + "grad_norm": 6.896032810211182, + "learning_rate": 7.894203097163218e-06, + "loss": 0.3698, + "step": 16117 + }, + { + "epoch": 1.0186109603906675, + "grad_norm": 5.417738914489746, + "learning_rate": 7.894066054542964e-06, + "loss": 0.238, + "step": 16118 + }, + { + "epoch": 1.0186245252306023, + "grad_norm": 6.728090763092041, + "learning_rate": 7.893929011922709e-06, + "loss": 0.2705, + "step": 16119 + }, + { + "epoch": 1.0186380900705372, + "grad_norm": 5.544424533843994, + "learning_rate": 7.893791969302454e-06, + "loss": 0.2632, + "step": 16120 + }, + { + "epoch": 1.018651654910472, + "grad_norm": 6.106490135192871, + "learning_rate": 7.8936549266822e-06, + "loss": 0.2887, + "step": 16121 + }, + { + "epoch": 1.018665219750407, + "grad_norm": 5.179046154022217, + "learning_rate": 7.893517884061944e-06, + "loss": 0.2881, + "step": 16122 + }, + { + "epoch": 1.0186787845903418, + "grad_norm": 7.882995128631592, + "learning_rate": 7.89338084144169e-06, + "loss": 0.4219, + "step": 16123 + }, + { + "epoch": 1.0186923494302766, + "grad_norm": 3.7852370738983154, + "learning_rate": 7.893243798821435e-06, + "loss": 0.1455, + "step": 16124 + }, + { + "epoch": 1.0187059142702115, + "grad_norm": 6.526675224304199, + "learning_rate": 7.893106756201178e-06, + "loss": 0.3385, + "step": 16125 + }, + { + "epoch": 1.0187194791101466, + "grad_norm": 5.695725917816162, + "learning_rate": 7.892969713580925e-06, + "loss": 0.2523, + "step": 16126 + }, + { + "epoch": 1.0187330439500815, + "grad_norm": 4.741641044616699, + "learning_rate": 7.89283267096067e-06, + "loss": 0.298, + "step": 16127 + }, + { + "epoch": 1.0187466087900163, + "grad_norm": 6.305732250213623, + "learning_rate": 7.892695628340414e-06, + "loss": 0.2566, + "step": 16128 + }, + { + "epoch": 1.0187601736299512, + "grad_norm": 6.2129807472229, + "learning_rate": 7.892558585720159e-06, + "loss": 0.2782, + "step": 16129 + }, + { + "epoch": 1.018773738469886, + "grad_norm": 4.564201831817627, + "learning_rate": 7.892421543099906e-06, + "loss": 0.2129, + "step": 16130 + }, + { + "epoch": 1.018787303309821, + "grad_norm": 6.724587917327881, + "learning_rate": 7.89228450047965e-06, + "loss": 0.3435, + "step": 16131 + }, + { + "epoch": 1.0188008681497558, + "grad_norm": 6.07753324508667, + "learning_rate": 7.892147457859394e-06, + "loss": 0.3236, + "step": 16132 + }, + { + "epoch": 1.0188144329896907, + "grad_norm": 5.632385730743408, + "learning_rate": 7.89201041523914e-06, + "loss": 0.3031, + "step": 16133 + }, + { + "epoch": 1.0188279978296255, + "grad_norm": 5.705281734466553, + "learning_rate": 7.891873372618886e-06, + "loss": 0.2744, + "step": 16134 + }, + { + "epoch": 1.0188415626695606, + "grad_norm": 4.55666971206665, + "learning_rate": 7.89173632999863e-06, + "loss": 0.1913, + "step": 16135 + }, + { + "epoch": 1.0188551275094955, + "grad_norm": 6.176394462585449, + "learning_rate": 7.891599287378375e-06, + "loss": 0.3098, + "step": 16136 + }, + { + "epoch": 1.0188686923494303, + "grad_norm": 5.901308059692383, + "learning_rate": 7.89146224475812e-06, + "loss": 0.3005, + "step": 16137 + }, + { + "epoch": 1.0188822571893652, + "grad_norm": 5.106616497039795, + "learning_rate": 7.891325202137866e-06, + "loss": 0.2534, + "step": 16138 + }, + { + "epoch": 1.0188958220293, + "grad_norm": 5.186194896697998, + "learning_rate": 7.89118815951761e-06, + "loss": 0.2779, + "step": 16139 + }, + { + "epoch": 1.018909386869235, + "grad_norm": 6.5887298583984375, + "learning_rate": 7.891051116897356e-06, + "loss": 0.3018, + "step": 16140 + }, + { + "epoch": 1.0189229517091698, + "grad_norm": 5.964601516723633, + "learning_rate": 7.890914074277101e-06, + "loss": 0.2442, + "step": 16141 + }, + { + "epoch": 1.0189365165491047, + "grad_norm": 4.550257205963135, + "learning_rate": 7.890777031656846e-06, + "loss": 0.1888, + "step": 16142 + }, + { + "epoch": 1.0189500813890395, + "grad_norm": 6.372226715087891, + "learning_rate": 7.890639989036591e-06, + "loss": 0.2728, + "step": 16143 + }, + { + "epoch": 1.0189636462289744, + "grad_norm": 6.135648250579834, + "learning_rate": 7.890502946416337e-06, + "loss": 0.2476, + "step": 16144 + }, + { + "epoch": 1.0189772110689095, + "grad_norm": 8.705957412719727, + "learning_rate": 7.890365903796082e-06, + "loss": 0.2111, + "step": 16145 + }, + { + "epoch": 1.0189907759088443, + "grad_norm": 8.550514221191406, + "learning_rate": 7.890228861175825e-06, + "loss": 0.4923, + "step": 16146 + }, + { + "epoch": 1.0190043407487792, + "grad_norm": 6.224287033081055, + "learning_rate": 7.890091818555572e-06, + "loss": 0.2704, + "step": 16147 + }, + { + "epoch": 1.019017905588714, + "grad_norm": 4.584964275360107, + "learning_rate": 7.889954775935317e-06, + "loss": 0.1782, + "step": 16148 + }, + { + "epoch": 1.019031470428649, + "grad_norm": 6.100836753845215, + "learning_rate": 7.88981773331506e-06, + "loss": 0.3476, + "step": 16149 + }, + { + "epoch": 1.0190450352685838, + "grad_norm": 8.373469352722168, + "learning_rate": 7.889680690694806e-06, + "loss": 0.3164, + "step": 16150 + }, + { + "epoch": 1.0190586001085187, + "grad_norm": 9.091238021850586, + "learning_rate": 7.889543648074551e-06, + "loss": 0.4901, + "step": 16151 + }, + { + "epoch": 1.0190721649484535, + "grad_norm": 6.789349555969238, + "learning_rate": 7.889406605454298e-06, + "loss": 0.3272, + "step": 16152 + }, + { + "epoch": 1.0190857297883884, + "grad_norm": 5.258569717407227, + "learning_rate": 7.889269562834041e-06, + "loss": 0.2801, + "step": 16153 + }, + { + "epoch": 1.0190992946283235, + "grad_norm": 7.154225826263428, + "learning_rate": 7.889132520213787e-06, + "loss": 0.3129, + "step": 16154 + }, + { + "epoch": 1.0191128594682584, + "grad_norm": 6.046051502227783, + "learning_rate": 7.888995477593532e-06, + "loss": 0.2924, + "step": 16155 + }, + { + "epoch": 1.0191264243081932, + "grad_norm": 6.188726425170898, + "learning_rate": 7.888858434973277e-06, + "loss": 0.2461, + "step": 16156 + }, + { + "epoch": 1.019139989148128, + "grad_norm": 6.1250786781311035, + "learning_rate": 7.888721392353022e-06, + "loss": 0.2617, + "step": 16157 + }, + { + "epoch": 1.019153553988063, + "grad_norm": 5.42231559753418, + "learning_rate": 7.888584349732767e-06, + "loss": 0.3682, + "step": 16158 + }, + { + "epoch": 1.0191671188279978, + "grad_norm": 7.689013957977295, + "learning_rate": 7.888447307112513e-06, + "loss": 0.4012, + "step": 16159 + }, + { + "epoch": 1.0191806836679327, + "grad_norm": 5.858963966369629, + "learning_rate": 7.888310264492258e-06, + "loss": 0.306, + "step": 16160 + }, + { + "epoch": 1.0191942485078676, + "grad_norm": 6.02546501159668, + "learning_rate": 7.888173221872003e-06, + "loss": 0.2925, + "step": 16161 + }, + { + "epoch": 1.0192078133478024, + "grad_norm": 7.387989521026611, + "learning_rate": 7.888036179251748e-06, + "loss": 0.2783, + "step": 16162 + }, + { + "epoch": 1.0192213781877373, + "grad_norm": 5.999553203582764, + "learning_rate": 7.887899136631493e-06, + "loss": 0.2319, + "step": 16163 + }, + { + "epoch": 1.0192349430276724, + "grad_norm": 5.288894176483154, + "learning_rate": 7.887762094011237e-06, + "loss": 0.3972, + "step": 16164 + }, + { + "epoch": 1.0192485078676072, + "grad_norm": 4.68366813659668, + "learning_rate": 7.887625051390984e-06, + "loss": 0.3056, + "step": 16165 + }, + { + "epoch": 1.019262072707542, + "grad_norm": 6.018702507019043, + "learning_rate": 7.887488008770729e-06, + "loss": 0.3801, + "step": 16166 + }, + { + "epoch": 1.019275637547477, + "grad_norm": 5.83829402923584, + "learning_rate": 7.887350966150474e-06, + "loss": 0.2381, + "step": 16167 + }, + { + "epoch": 1.0192892023874118, + "grad_norm": 7.305763244628906, + "learning_rate": 7.887213923530217e-06, + "loss": 0.3846, + "step": 16168 + }, + { + "epoch": 1.0193027672273467, + "grad_norm": 5.9070210456848145, + "learning_rate": 7.887076880909964e-06, + "loss": 0.3019, + "step": 16169 + }, + { + "epoch": 1.0193163320672816, + "grad_norm": 6.519379138946533, + "learning_rate": 7.88693983828971e-06, + "loss": 0.2899, + "step": 16170 + }, + { + "epoch": 1.0193298969072164, + "grad_norm": 5.83629846572876, + "learning_rate": 7.886802795669453e-06, + "loss": 0.3935, + "step": 16171 + }, + { + "epoch": 1.0193434617471513, + "grad_norm": 5.731019973754883, + "learning_rate": 7.886665753049198e-06, + "loss": 0.2647, + "step": 16172 + }, + { + "epoch": 1.0193570265870864, + "grad_norm": 5.433511734008789, + "learning_rate": 7.886528710428945e-06, + "loss": 0.3022, + "step": 16173 + }, + { + "epoch": 1.0193705914270212, + "grad_norm": 6.201343536376953, + "learning_rate": 7.886391667808689e-06, + "loss": 0.4318, + "step": 16174 + }, + { + "epoch": 1.019384156266956, + "grad_norm": 6.087795257568359, + "learning_rate": 7.886254625188434e-06, + "loss": 0.3938, + "step": 16175 + }, + { + "epoch": 1.019397721106891, + "grad_norm": 10.19924545288086, + "learning_rate": 7.886117582568179e-06, + "loss": 0.5376, + "step": 16176 + }, + { + "epoch": 1.0194112859468258, + "grad_norm": 4.943685054779053, + "learning_rate": 7.885980539947924e-06, + "loss": 0.1917, + "step": 16177 + }, + { + "epoch": 1.0194248507867607, + "grad_norm": 5.976179122924805, + "learning_rate": 7.88584349732767e-06, + "loss": 0.3622, + "step": 16178 + }, + { + "epoch": 1.0194384156266956, + "grad_norm": 4.58392858505249, + "learning_rate": 7.885706454707414e-06, + "loss": 0.2751, + "step": 16179 + }, + { + "epoch": 1.0194519804666304, + "grad_norm": 6.373414039611816, + "learning_rate": 7.88556941208716e-06, + "loss": 0.3579, + "step": 16180 + }, + { + "epoch": 1.0194655453065653, + "grad_norm": 3.7313430309295654, + "learning_rate": 7.885432369466905e-06, + "loss": 0.1319, + "step": 16181 + }, + { + "epoch": 1.0194791101465002, + "grad_norm": 4.936856746673584, + "learning_rate": 7.88529532684665e-06, + "loss": 0.2988, + "step": 16182 + }, + { + "epoch": 1.0194926749864353, + "grad_norm": 5.827963829040527, + "learning_rate": 7.885158284226395e-06, + "loss": 0.3679, + "step": 16183 + }, + { + "epoch": 1.0195062398263701, + "grad_norm": 5.302229404449463, + "learning_rate": 7.88502124160614e-06, + "loss": 0.2003, + "step": 16184 + }, + { + "epoch": 1.019519804666305, + "grad_norm": 5.266773700714111, + "learning_rate": 7.884884198985886e-06, + "loss": 0.3501, + "step": 16185 + }, + { + "epoch": 1.0195333695062399, + "grad_norm": 5.187260150909424, + "learning_rate": 7.88474715636563e-06, + "loss": 0.2949, + "step": 16186 + }, + { + "epoch": 1.0195469343461747, + "grad_norm": 7.507452011108398, + "learning_rate": 7.884610113745376e-06, + "loss": 0.4222, + "step": 16187 + }, + { + "epoch": 1.0195604991861096, + "grad_norm": 6.46480655670166, + "learning_rate": 7.884473071125121e-06, + "loss": 0.3602, + "step": 16188 + }, + { + "epoch": 1.0195740640260444, + "grad_norm": 4.260993957519531, + "learning_rate": 7.884336028504865e-06, + "loss": 0.3947, + "step": 16189 + }, + { + "epoch": 1.0195876288659793, + "grad_norm": 5.864684581756592, + "learning_rate": 7.884198985884611e-06, + "loss": 0.3035, + "step": 16190 + }, + { + "epoch": 1.0196011937059142, + "grad_norm": 5.356287479400635, + "learning_rate": 7.884061943264357e-06, + "loss": 0.2918, + "step": 16191 + }, + { + "epoch": 1.0196147585458493, + "grad_norm": 7.38453483581543, + "learning_rate": 7.883924900644102e-06, + "loss": 0.4453, + "step": 16192 + }, + { + "epoch": 1.0196283233857841, + "grad_norm": 6.178665637969971, + "learning_rate": 7.883787858023845e-06, + "loss": 0.5248, + "step": 16193 + }, + { + "epoch": 1.019641888225719, + "grad_norm": 5.465771675109863, + "learning_rate": 7.88365081540359e-06, + "loss": 0.3141, + "step": 16194 + }, + { + "epoch": 1.0196554530656539, + "grad_norm": 6.584676265716553, + "learning_rate": 7.883513772783337e-06, + "loss": 0.4179, + "step": 16195 + }, + { + "epoch": 1.0196690179055887, + "grad_norm": 7.500565052032471, + "learning_rate": 7.88337673016308e-06, + "loss": 0.3881, + "step": 16196 + }, + { + "epoch": 1.0196825827455236, + "grad_norm": 6.292859077453613, + "learning_rate": 7.883239687542826e-06, + "loss": 0.5693, + "step": 16197 + }, + { + "epoch": 1.0196961475854585, + "grad_norm": 5.7040581703186035, + "learning_rate": 7.883102644922571e-06, + "loss": 0.2986, + "step": 16198 + }, + { + "epoch": 1.0197097124253933, + "grad_norm": 6.318860054016113, + "learning_rate": 7.882965602302316e-06, + "loss": 0.4263, + "step": 16199 + }, + { + "epoch": 1.0197232772653282, + "grad_norm": 6.503018856048584, + "learning_rate": 7.882828559682062e-06, + "loss": 0.4261, + "step": 16200 + }, + { + "epoch": 1.019736842105263, + "grad_norm": 7.391196250915527, + "learning_rate": 7.882691517061807e-06, + "loss": 0.3418, + "step": 16201 + }, + { + "epoch": 1.0197504069451981, + "grad_norm": 5.727587699890137, + "learning_rate": 7.882554474441552e-06, + "loss": 0.2861, + "step": 16202 + }, + { + "epoch": 1.019763971785133, + "grad_norm": 5.428256034851074, + "learning_rate": 7.882417431821297e-06, + "loss": 0.282, + "step": 16203 + }, + { + "epoch": 1.0197775366250679, + "grad_norm": 5.4932451248168945, + "learning_rate": 7.882280389201042e-06, + "loss": 0.2735, + "step": 16204 + }, + { + "epoch": 1.0197911014650027, + "grad_norm": 6.719812393188477, + "learning_rate": 7.882143346580787e-06, + "loss": 0.5249, + "step": 16205 + }, + { + "epoch": 1.0198046663049376, + "grad_norm": 6.855802059173584, + "learning_rate": 7.882006303960533e-06, + "loss": 0.3716, + "step": 16206 + }, + { + "epoch": 1.0198182311448725, + "grad_norm": 5.824461936950684, + "learning_rate": 7.881869261340278e-06, + "loss": 0.3207, + "step": 16207 + }, + { + "epoch": 1.0198317959848073, + "grad_norm": 5.7803778648376465, + "learning_rate": 7.881732218720023e-06, + "loss": 0.3919, + "step": 16208 + }, + { + "epoch": 1.0198453608247422, + "grad_norm": 4.948718547821045, + "learning_rate": 7.881595176099768e-06, + "loss": 0.314, + "step": 16209 + }, + { + "epoch": 1.019858925664677, + "grad_norm": 3.7918922901153564, + "learning_rate": 7.881458133479513e-06, + "loss": 0.2041, + "step": 16210 + }, + { + "epoch": 1.0198724905046122, + "grad_norm": 4.505488872528076, + "learning_rate": 7.881321090859257e-06, + "loss": 0.2665, + "step": 16211 + }, + { + "epoch": 1.019886055344547, + "grad_norm": 6.754599571228027, + "learning_rate": 7.881184048239004e-06, + "loss": 0.3772, + "step": 16212 + }, + { + "epoch": 1.0198996201844819, + "grad_norm": 5.542133331298828, + "learning_rate": 7.881047005618749e-06, + "loss": 0.2911, + "step": 16213 + }, + { + "epoch": 1.0199131850244167, + "grad_norm": 5.212553024291992, + "learning_rate": 7.880909962998492e-06, + "loss": 0.2712, + "step": 16214 + }, + { + "epoch": 1.0199267498643516, + "grad_norm": 7.805336952209473, + "learning_rate": 7.880772920378238e-06, + "loss": 0.4242, + "step": 16215 + }, + { + "epoch": 1.0199403147042865, + "grad_norm": 5.8703389167785645, + "learning_rate": 7.880635877757984e-06, + "loss": 0.4129, + "step": 16216 + }, + { + "epoch": 1.0199538795442213, + "grad_norm": 4.0825324058532715, + "learning_rate": 7.88049883513773e-06, + "loss": 0.2403, + "step": 16217 + }, + { + "epoch": 1.0199674443841562, + "grad_norm": 4.669502258300781, + "learning_rate": 7.880361792517473e-06, + "loss": 0.2813, + "step": 16218 + }, + { + "epoch": 1.019981009224091, + "grad_norm": 6.1190056800842285, + "learning_rate": 7.880224749897218e-06, + "loss": 0.2176, + "step": 16219 + }, + { + "epoch": 1.019994574064026, + "grad_norm": 5.714360237121582, + "learning_rate": 7.880087707276963e-06, + "loss": 0.309, + "step": 16220 + }, + { + "epoch": 1.020008138903961, + "grad_norm": 4.562175750732422, + "learning_rate": 7.879950664656709e-06, + "loss": 0.2797, + "step": 16221 + }, + { + "epoch": 1.020021703743896, + "grad_norm": 7.449070453643799, + "learning_rate": 7.879813622036454e-06, + "loss": 0.4704, + "step": 16222 + }, + { + "epoch": 1.0200352685838308, + "grad_norm": 5.661283016204834, + "learning_rate": 7.879676579416199e-06, + "loss": 0.1548, + "step": 16223 + }, + { + "epoch": 1.0200488334237656, + "grad_norm": 7.8138275146484375, + "learning_rate": 7.879539536795944e-06, + "loss": 0.4275, + "step": 16224 + }, + { + "epoch": 1.0200623982637005, + "grad_norm": 5.170851230621338, + "learning_rate": 7.87940249417569e-06, + "loss": 0.2836, + "step": 16225 + }, + { + "epoch": 1.0200759631036354, + "grad_norm": 6.9656500816345215, + "learning_rate": 7.879265451555434e-06, + "loss": 0.4179, + "step": 16226 + }, + { + "epoch": 1.0200895279435702, + "grad_norm": 4.940554141998291, + "learning_rate": 7.87912840893518e-06, + "loss": 0.2584, + "step": 16227 + }, + { + "epoch": 1.020103092783505, + "grad_norm": 5.148190021514893, + "learning_rate": 7.878991366314925e-06, + "loss": 0.2296, + "step": 16228 + }, + { + "epoch": 1.02011665762344, + "grad_norm": 5.006452560424805, + "learning_rate": 7.87885432369467e-06, + "loss": 0.2951, + "step": 16229 + }, + { + "epoch": 1.020130222463375, + "grad_norm": 6.719802379608154, + "learning_rate": 7.878717281074415e-06, + "loss": 0.3709, + "step": 16230 + }, + { + "epoch": 1.02014378730331, + "grad_norm": 5.2854413986206055, + "learning_rate": 7.87858023845416e-06, + "loss": 0.3714, + "step": 16231 + }, + { + "epoch": 1.0201573521432448, + "grad_norm": 4.989621639251709, + "learning_rate": 7.878443195833906e-06, + "loss": 0.248, + "step": 16232 + }, + { + "epoch": 1.0201709169831796, + "grad_norm": 5.848728656768799, + "learning_rate": 7.878306153213649e-06, + "loss": 0.3816, + "step": 16233 + }, + { + "epoch": 1.0201844818231145, + "grad_norm": 6.126741886138916, + "learning_rate": 7.878169110593396e-06, + "loss": 0.3317, + "step": 16234 + }, + { + "epoch": 1.0201980466630494, + "grad_norm": 5.667135238647461, + "learning_rate": 7.878032067973141e-06, + "loss": 0.3178, + "step": 16235 + }, + { + "epoch": 1.0202116115029842, + "grad_norm": 4.67099666595459, + "learning_rate": 7.877895025352885e-06, + "loss": 0.2409, + "step": 16236 + }, + { + "epoch": 1.020225176342919, + "grad_norm": 6.136002063751221, + "learning_rate": 7.87775798273263e-06, + "loss": 0.321, + "step": 16237 + }, + { + "epoch": 1.020238741182854, + "grad_norm": 7.929913520812988, + "learning_rate": 7.877620940112377e-06, + "loss": 0.5023, + "step": 16238 + }, + { + "epoch": 1.0202523060227888, + "grad_norm": 4.94926643371582, + "learning_rate": 7.87748389749212e-06, + "loss": 0.2983, + "step": 16239 + }, + { + "epoch": 1.020265870862724, + "grad_norm": 5.300599098205566, + "learning_rate": 7.877346854871865e-06, + "loss": 0.3827, + "step": 16240 + }, + { + "epoch": 1.0202794357026588, + "grad_norm": 3.544422149658203, + "learning_rate": 7.87720981225161e-06, + "loss": 0.2815, + "step": 16241 + }, + { + "epoch": 1.0202930005425936, + "grad_norm": 4.94511604309082, + "learning_rate": 7.877072769631356e-06, + "loss": 0.1986, + "step": 16242 + }, + { + "epoch": 1.0203065653825285, + "grad_norm": 4.419857025146484, + "learning_rate": 7.8769357270111e-06, + "loss": 0.2778, + "step": 16243 + }, + { + "epoch": 1.0203201302224634, + "grad_norm": 5.017585754394531, + "learning_rate": 7.876798684390846e-06, + "loss": 0.2966, + "step": 16244 + }, + { + "epoch": 1.0203336950623982, + "grad_norm": 6.924830913543701, + "learning_rate": 7.876661641770591e-06, + "loss": 0.3274, + "step": 16245 + }, + { + "epoch": 1.020347259902333, + "grad_norm": 5.066377639770508, + "learning_rate": 7.876524599150336e-06, + "loss": 0.2337, + "step": 16246 + }, + { + "epoch": 1.020360824742268, + "grad_norm": 4.793039321899414, + "learning_rate": 7.876387556530082e-06, + "loss": 0.2941, + "step": 16247 + }, + { + "epoch": 1.0203743895822028, + "grad_norm": 7.463780403137207, + "learning_rate": 7.876250513909827e-06, + "loss": 0.4013, + "step": 16248 + }, + { + "epoch": 1.020387954422138, + "grad_norm": 4.331907272338867, + "learning_rate": 7.876113471289572e-06, + "loss": 0.3827, + "step": 16249 + }, + { + "epoch": 1.0204015192620728, + "grad_norm": 4.97153902053833, + "learning_rate": 7.875976428669317e-06, + "loss": 0.1531, + "step": 16250 + }, + { + "epoch": 1.0204150841020077, + "grad_norm": 6.449807167053223, + "learning_rate": 7.875839386049062e-06, + "loss": 0.263, + "step": 16251 + }, + { + "epoch": 1.0204286489419425, + "grad_norm": 5.156558990478516, + "learning_rate": 7.875702343428807e-06, + "loss": 0.2868, + "step": 16252 + }, + { + "epoch": 1.0204422137818774, + "grad_norm": 5.077413558959961, + "learning_rate": 7.875565300808553e-06, + "loss": 0.1988, + "step": 16253 + }, + { + "epoch": 1.0204557786218122, + "grad_norm": 4.241661071777344, + "learning_rate": 7.875428258188296e-06, + "loss": 0.1903, + "step": 16254 + }, + { + "epoch": 1.0204693434617471, + "grad_norm": 3.6756060123443604, + "learning_rate": 7.875291215568043e-06, + "loss": 0.2452, + "step": 16255 + }, + { + "epoch": 1.020482908301682, + "grad_norm": 5.294793128967285, + "learning_rate": 7.875154172947788e-06, + "loss": 0.2208, + "step": 16256 + }, + { + "epoch": 1.0204964731416168, + "grad_norm": 3.8173134326934814, + "learning_rate": 7.875017130327532e-06, + "loss": 0.17, + "step": 16257 + }, + { + "epoch": 1.0205100379815517, + "grad_norm": 5.274528503417969, + "learning_rate": 7.874880087707277e-06, + "loss": 0.3179, + "step": 16258 + }, + { + "epoch": 1.0205236028214868, + "grad_norm": 6.881925582885742, + "learning_rate": 7.874743045087024e-06, + "loss": 0.3851, + "step": 16259 + }, + { + "epoch": 1.0205371676614217, + "grad_norm": 4.132747650146484, + "learning_rate": 7.874606002466769e-06, + "loss": 0.2154, + "step": 16260 + }, + { + "epoch": 1.0205507325013565, + "grad_norm": 5.530838489532471, + "learning_rate": 7.874468959846512e-06, + "loss": 0.3595, + "step": 16261 + }, + { + "epoch": 1.0205642973412914, + "grad_norm": 3.908720016479492, + "learning_rate": 7.874331917226258e-06, + "loss": 0.1333, + "step": 16262 + }, + { + "epoch": 1.0205778621812263, + "grad_norm": 7.192833423614502, + "learning_rate": 7.874194874606003e-06, + "loss": 0.271, + "step": 16263 + }, + { + "epoch": 1.0205914270211611, + "grad_norm": 4.773914337158203, + "learning_rate": 7.874057831985748e-06, + "loss": 0.1637, + "step": 16264 + }, + { + "epoch": 1.020604991861096, + "grad_norm": 4.404316425323486, + "learning_rate": 7.873920789365493e-06, + "loss": 0.1932, + "step": 16265 + }, + { + "epoch": 1.0206185567010309, + "grad_norm": 4.038817405700684, + "learning_rate": 7.873783746745238e-06, + "loss": 0.2083, + "step": 16266 + }, + { + "epoch": 1.0206321215409657, + "grad_norm": 3.967719078063965, + "learning_rate": 7.873646704124983e-06, + "loss": 0.244, + "step": 16267 + }, + { + "epoch": 1.0206456863809008, + "grad_norm": 5.717923641204834, + "learning_rate": 7.873509661504729e-06, + "loss": 0.2951, + "step": 16268 + }, + { + "epoch": 1.0206592512208357, + "grad_norm": 5.414176940917969, + "learning_rate": 7.873372618884474e-06, + "loss": 0.2785, + "step": 16269 + }, + { + "epoch": 1.0206728160607705, + "grad_norm": 4.011970520019531, + "learning_rate": 7.873235576264219e-06, + "loss": 0.2038, + "step": 16270 + }, + { + "epoch": 1.0206863809007054, + "grad_norm": 5.339676380157471, + "learning_rate": 7.873098533643964e-06, + "loss": 0.2376, + "step": 16271 + }, + { + "epoch": 1.0206999457406403, + "grad_norm": 3.4382858276367188, + "learning_rate": 7.87296149102371e-06, + "loss": 0.2504, + "step": 16272 + }, + { + "epoch": 1.0207135105805751, + "grad_norm": 4.152359485626221, + "learning_rate": 7.872824448403454e-06, + "loss": 0.17, + "step": 16273 + }, + { + "epoch": 1.02072707542051, + "grad_norm": 6.754034996032715, + "learning_rate": 7.8726874057832e-06, + "loss": 0.2281, + "step": 16274 + }, + { + "epoch": 1.0207406402604449, + "grad_norm": 7.371191501617432, + "learning_rate": 7.872550363162945e-06, + "loss": 0.3822, + "step": 16275 + }, + { + "epoch": 1.0207542051003797, + "grad_norm": 6.281810760498047, + "learning_rate": 7.872413320542688e-06, + "loss": 0.2321, + "step": 16276 + }, + { + "epoch": 1.0207677699403146, + "grad_norm": 6.400392532348633, + "learning_rate": 7.872276277922435e-06, + "loss": 0.3006, + "step": 16277 + }, + { + "epoch": 1.0207813347802497, + "grad_norm": 7.124454021453857, + "learning_rate": 7.87213923530218e-06, + "loss": 0.3966, + "step": 16278 + }, + { + "epoch": 1.0207948996201845, + "grad_norm": 4.601531028747559, + "learning_rate": 7.872002192681924e-06, + "loss": 0.2281, + "step": 16279 + }, + { + "epoch": 1.0208084644601194, + "grad_norm": 4.970153331756592, + "learning_rate": 7.871865150061669e-06, + "loss": 0.3228, + "step": 16280 + }, + { + "epoch": 1.0208220293000543, + "grad_norm": 5.538094997406006, + "learning_rate": 7.871728107441416e-06, + "loss": 0.2733, + "step": 16281 + }, + { + "epoch": 1.0208355941399891, + "grad_norm": 5.3721795082092285, + "learning_rate": 7.87159106482116e-06, + "loss": 0.2971, + "step": 16282 + }, + { + "epoch": 1.020849158979924, + "grad_norm": 5.555177688598633, + "learning_rate": 7.871454022200905e-06, + "loss": 0.3362, + "step": 16283 + }, + { + "epoch": 1.0208627238198589, + "grad_norm": 6.715922832489014, + "learning_rate": 7.87131697958065e-06, + "loss": 0.3638, + "step": 16284 + }, + { + "epoch": 1.0208762886597937, + "grad_norm": 4.581467628479004, + "learning_rate": 7.871179936960397e-06, + "loss": 0.1869, + "step": 16285 + }, + { + "epoch": 1.0208898534997286, + "grad_norm": 5.622738838195801, + "learning_rate": 7.87104289434014e-06, + "loss": 0.2408, + "step": 16286 + }, + { + "epoch": 1.0209034183396637, + "grad_norm": 5.3239006996154785, + "learning_rate": 7.870905851719885e-06, + "loss": 0.1985, + "step": 16287 + }, + { + "epoch": 1.0209169831795986, + "grad_norm": 4.2862467765808105, + "learning_rate": 7.87076880909963e-06, + "loss": 0.2049, + "step": 16288 + }, + { + "epoch": 1.0209305480195334, + "grad_norm": 5.156361103057861, + "learning_rate": 7.870631766479376e-06, + "loss": 0.2377, + "step": 16289 + }, + { + "epoch": 1.0209441128594683, + "grad_norm": 5.380825042724609, + "learning_rate": 7.87049472385912e-06, + "loss": 0.2044, + "step": 16290 + }, + { + "epoch": 1.0209576776994032, + "grad_norm": 4.550876617431641, + "learning_rate": 7.870357681238866e-06, + "loss": 0.1683, + "step": 16291 + }, + { + "epoch": 1.020971242539338, + "grad_norm": 6.06179666519165, + "learning_rate": 7.870220638618611e-06, + "loss": 0.2565, + "step": 16292 + }, + { + "epoch": 1.0209848073792729, + "grad_norm": 5.613455772399902, + "learning_rate": 7.870083595998356e-06, + "loss": 0.2213, + "step": 16293 + }, + { + "epoch": 1.0209983722192078, + "grad_norm": 3.1959643363952637, + "learning_rate": 7.869946553378102e-06, + "loss": 0.1047, + "step": 16294 + }, + { + "epoch": 1.0210119370591426, + "grad_norm": 5.529357433319092, + "learning_rate": 7.869809510757847e-06, + "loss": 0.1934, + "step": 16295 + }, + { + "epoch": 1.0210255018990777, + "grad_norm": 4.977385997772217, + "learning_rate": 7.869672468137592e-06, + "loss": 0.2741, + "step": 16296 + }, + { + "epoch": 1.0210390667390126, + "grad_norm": 5.878890037536621, + "learning_rate": 7.869535425517335e-06, + "loss": 0.2446, + "step": 16297 + }, + { + "epoch": 1.0210526315789474, + "grad_norm": 4.566494464874268, + "learning_rate": 7.869398382897082e-06, + "loss": 0.167, + "step": 16298 + }, + { + "epoch": 1.0210661964188823, + "grad_norm": 5.703940391540527, + "learning_rate": 7.869261340276827e-06, + "loss": 0.1799, + "step": 16299 + }, + { + "epoch": 1.0210797612588172, + "grad_norm": 4.781775951385498, + "learning_rate": 7.869124297656573e-06, + "loss": 0.2071, + "step": 16300 + }, + { + "epoch": 1.021093326098752, + "grad_norm": 5.15116024017334, + "learning_rate": 7.868987255036316e-06, + "loss": 0.2221, + "step": 16301 + }, + { + "epoch": 1.021106890938687, + "grad_norm": 5.767906665802002, + "learning_rate": 7.868850212416061e-06, + "loss": 0.271, + "step": 16302 + }, + { + "epoch": 1.0211204557786218, + "grad_norm": 5.614482879638672, + "learning_rate": 7.868713169795808e-06, + "loss": 0.2354, + "step": 16303 + }, + { + "epoch": 1.0211340206185566, + "grad_norm": 4.817492485046387, + "learning_rate": 7.868576127175552e-06, + "loss": 0.2399, + "step": 16304 + }, + { + "epoch": 1.0211475854584915, + "grad_norm": 5.460385322570801, + "learning_rate": 7.868439084555297e-06, + "loss": 0.2036, + "step": 16305 + }, + { + "epoch": 1.0211611502984266, + "grad_norm": 5.442101955413818, + "learning_rate": 7.868302041935042e-06, + "loss": 0.251, + "step": 16306 + }, + { + "epoch": 1.0211747151383614, + "grad_norm": 6.374950408935547, + "learning_rate": 7.868164999314787e-06, + "loss": 0.3622, + "step": 16307 + }, + { + "epoch": 1.0211882799782963, + "grad_norm": 5.231292724609375, + "learning_rate": 7.868027956694532e-06, + "loss": 0.186, + "step": 16308 + }, + { + "epoch": 1.0212018448182312, + "grad_norm": 4.331464767456055, + "learning_rate": 7.867890914074278e-06, + "loss": 0.198, + "step": 16309 + }, + { + "epoch": 1.021215409658166, + "grad_norm": 7.272738456726074, + "learning_rate": 7.867753871454023e-06, + "loss": 0.3572, + "step": 16310 + }, + { + "epoch": 1.021228974498101, + "grad_norm": 5.321913719177246, + "learning_rate": 7.867616828833768e-06, + "loss": 0.2667, + "step": 16311 + }, + { + "epoch": 1.0212425393380358, + "grad_norm": 6.409482479095459, + "learning_rate": 7.867479786213513e-06, + "loss": 0.3065, + "step": 16312 + }, + { + "epoch": 1.0212561041779706, + "grad_norm": 6.649506568908691, + "learning_rate": 7.867342743593258e-06, + "loss": 0.364, + "step": 16313 + }, + { + "epoch": 1.0212696690179055, + "grad_norm": 5.721891403198242, + "learning_rate": 7.867205700973003e-06, + "loss": 0.3014, + "step": 16314 + }, + { + "epoch": 1.0212832338578406, + "grad_norm": 5.427515506744385, + "learning_rate": 7.867068658352749e-06, + "loss": 0.2959, + "step": 16315 + }, + { + "epoch": 1.0212967986977755, + "grad_norm": 4.634544849395752, + "learning_rate": 7.866931615732494e-06, + "loss": 0.2235, + "step": 16316 + }, + { + "epoch": 1.0213103635377103, + "grad_norm": 6.732154846191406, + "learning_rate": 7.866794573112239e-06, + "loss": 0.3493, + "step": 16317 + }, + { + "epoch": 1.0213239283776452, + "grad_norm": 4.6317315101623535, + "learning_rate": 7.866657530491984e-06, + "loss": 0.2793, + "step": 16318 + }, + { + "epoch": 1.02133749321758, + "grad_norm": 7.597341060638428, + "learning_rate": 7.866520487871728e-06, + "loss": 0.4281, + "step": 16319 + }, + { + "epoch": 1.021351058057515, + "grad_norm": 6.004715442657471, + "learning_rate": 7.866383445251475e-06, + "loss": 0.2594, + "step": 16320 + }, + { + "epoch": 1.0213646228974498, + "grad_norm": 5.8542962074279785, + "learning_rate": 7.86624640263122e-06, + "loss": 0.3403, + "step": 16321 + }, + { + "epoch": 1.0213781877373846, + "grad_norm": 4.629624366760254, + "learning_rate": 7.866109360010963e-06, + "loss": 0.2451, + "step": 16322 + }, + { + "epoch": 1.0213917525773195, + "grad_norm": 4.447299957275391, + "learning_rate": 7.865972317390708e-06, + "loss": 0.2487, + "step": 16323 + }, + { + "epoch": 1.0214053174172544, + "grad_norm": 4.64943265914917, + "learning_rate": 7.865835274770455e-06, + "loss": 0.246, + "step": 16324 + }, + { + "epoch": 1.0214188822571895, + "grad_norm": 5.3157806396484375, + "learning_rate": 7.8656982321502e-06, + "loss": 0.2382, + "step": 16325 + }, + { + "epoch": 1.0214324470971243, + "grad_norm": 4.808337211608887, + "learning_rate": 7.865561189529944e-06, + "loss": 0.2956, + "step": 16326 + }, + { + "epoch": 1.0214460119370592, + "grad_norm": 5.460665702819824, + "learning_rate": 7.865424146909689e-06, + "loss": 0.2235, + "step": 16327 + }, + { + "epoch": 1.021459576776994, + "grad_norm": 7.022233486175537, + "learning_rate": 7.865287104289436e-06, + "loss": 0.311, + "step": 16328 + }, + { + "epoch": 1.021473141616929, + "grad_norm": 7.063079357147217, + "learning_rate": 7.86515006166918e-06, + "loss": 0.3049, + "step": 16329 + }, + { + "epoch": 1.0214867064568638, + "grad_norm": 5.082183361053467, + "learning_rate": 7.865013019048925e-06, + "loss": 0.3499, + "step": 16330 + }, + { + "epoch": 1.0215002712967987, + "grad_norm": 5.182034492492676, + "learning_rate": 7.86487597642867e-06, + "loss": 0.3325, + "step": 16331 + }, + { + "epoch": 1.0215138361367335, + "grad_norm": 5.742542266845703, + "learning_rate": 7.864738933808415e-06, + "loss": 0.2082, + "step": 16332 + }, + { + "epoch": 1.0215274009766684, + "grad_norm": 5.167109966278076, + "learning_rate": 7.86460189118816e-06, + "loss": 0.3078, + "step": 16333 + }, + { + "epoch": 1.0215409658166035, + "grad_norm": 9.71628475189209, + "learning_rate": 7.864464848567905e-06, + "loss": 0.3073, + "step": 16334 + }, + { + "epoch": 1.0215545306565383, + "grad_norm": 5.475512504577637, + "learning_rate": 7.86432780594765e-06, + "loss": 0.3019, + "step": 16335 + }, + { + "epoch": 1.0215680954964732, + "grad_norm": 6.406837463378906, + "learning_rate": 7.864190763327396e-06, + "loss": 0.3128, + "step": 16336 + }, + { + "epoch": 1.021581660336408, + "grad_norm": 8.69416332244873, + "learning_rate": 7.864053720707141e-06, + "loss": 0.5503, + "step": 16337 + }, + { + "epoch": 1.021595225176343, + "grad_norm": 5.5192790031433105, + "learning_rate": 7.863916678086886e-06, + "loss": 0.2672, + "step": 16338 + }, + { + "epoch": 1.0216087900162778, + "grad_norm": 4.540343284606934, + "learning_rate": 7.863779635466631e-06, + "loss": 0.358, + "step": 16339 + }, + { + "epoch": 1.0216223548562127, + "grad_norm": 5.5295186042785645, + "learning_rate": 7.863642592846375e-06, + "loss": 0.2994, + "step": 16340 + }, + { + "epoch": 1.0216359196961475, + "grad_norm": 4.482665061950684, + "learning_rate": 7.863505550226122e-06, + "loss": 0.264, + "step": 16341 + }, + { + "epoch": 1.0216494845360824, + "grad_norm": 6.421626567840576, + "learning_rate": 7.863368507605867e-06, + "loss": 0.3555, + "step": 16342 + }, + { + "epoch": 1.0216630493760173, + "grad_norm": 5.965067386627197, + "learning_rate": 7.863231464985612e-06, + "loss": 0.3394, + "step": 16343 + }, + { + "epoch": 1.0216766142159524, + "grad_norm": 5.381970405578613, + "learning_rate": 7.863094422365355e-06, + "loss": 0.4424, + "step": 16344 + }, + { + "epoch": 1.0216901790558872, + "grad_norm": 5.457883358001709, + "learning_rate": 7.8629573797451e-06, + "loss": 0.3086, + "step": 16345 + }, + { + "epoch": 1.021703743895822, + "grad_norm": 5.782159328460693, + "learning_rate": 7.862820337124847e-06, + "loss": 0.2935, + "step": 16346 + }, + { + "epoch": 1.021717308735757, + "grad_norm": 6.091118812561035, + "learning_rate": 7.862683294504591e-06, + "loss": 0.3837, + "step": 16347 + }, + { + "epoch": 1.0217308735756918, + "grad_norm": 6.553847789764404, + "learning_rate": 7.862546251884336e-06, + "loss": 0.3611, + "step": 16348 + }, + { + "epoch": 1.0217444384156267, + "grad_norm": 5.309600830078125, + "learning_rate": 7.862409209264081e-06, + "loss": 0.2801, + "step": 16349 + }, + { + "epoch": 1.0217580032555615, + "grad_norm": 6.221393585205078, + "learning_rate": 7.862272166643826e-06, + "loss": 0.3624, + "step": 16350 + }, + { + "epoch": 1.0217715680954964, + "grad_norm": 5.277144908905029, + "learning_rate": 7.862135124023572e-06, + "loss": 0.2499, + "step": 16351 + }, + { + "epoch": 1.0217851329354313, + "grad_norm": 5.2914252281188965, + "learning_rate": 7.861998081403317e-06, + "loss": 0.3433, + "step": 16352 + }, + { + "epoch": 1.0217986977753664, + "grad_norm": 7.924820423126221, + "learning_rate": 7.861861038783062e-06, + "loss": 0.4037, + "step": 16353 + }, + { + "epoch": 1.0218122626153012, + "grad_norm": 6.436646461486816, + "learning_rate": 7.861723996162807e-06, + "loss": 0.4128, + "step": 16354 + }, + { + "epoch": 1.021825827455236, + "grad_norm": 6.577679634094238, + "learning_rate": 7.861586953542552e-06, + "loss": 0.3149, + "step": 16355 + }, + { + "epoch": 1.021839392295171, + "grad_norm": 6.96821403503418, + "learning_rate": 7.861449910922298e-06, + "loss": 0.4535, + "step": 16356 + }, + { + "epoch": 1.0218529571351058, + "grad_norm": 6.156445503234863, + "learning_rate": 7.861312868302043e-06, + "loss": 0.3081, + "step": 16357 + }, + { + "epoch": 1.0218665219750407, + "grad_norm": 5.919462203979492, + "learning_rate": 7.861175825681788e-06, + "loss": 0.3325, + "step": 16358 + }, + { + "epoch": 1.0218800868149756, + "grad_norm": 7.5323991775512695, + "learning_rate": 7.861038783061533e-06, + "loss": 0.2956, + "step": 16359 + }, + { + "epoch": 1.0218936516549104, + "grad_norm": 5.848796367645264, + "learning_rate": 7.860901740441278e-06, + "loss": 0.3244, + "step": 16360 + }, + { + "epoch": 1.0219072164948453, + "grad_norm": 4.744148254394531, + "learning_rate": 7.860764697821023e-06, + "loss": 0.2331, + "step": 16361 + }, + { + "epoch": 1.0219207813347801, + "grad_norm": 3.9697232246398926, + "learning_rate": 7.860627655200767e-06, + "loss": 0.2447, + "step": 16362 + }, + { + "epoch": 1.0219343461747152, + "grad_norm": 5.1572980880737305, + "learning_rate": 7.860490612580514e-06, + "loss": 0.2742, + "step": 16363 + }, + { + "epoch": 1.02194791101465, + "grad_norm": 5.868208408355713, + "learning_rate": 7.860353569960259e-06, + "loss": 0.4152, + "step": 16364 + }, + { + "epoch": 1.021961475854585, + "grad_norm": 5.222428798675537, + "learning_rate": 7.860216527340002e-06, + "loss": 0.2707, + "step": 16365 + }, + { + "epoch": 1.0219750406945198, + "grad_norm": 4.728939056396484, + "learning_rate": 7.860079484719748e-06, + "loss": 0.2939, + "step": 16366 + }, + { + "epoch": 1.0219886055344547, + "grad_norm": 7.37914514541626, + "learning_rate": 7.859942442099495e-06, + "loss": 0.5164, + "step": 16367 + }, + { + "epoch": 1.0220021703743896, + "grad_norm": 4.415847301483154, + "learning_rate": 7.85980539947924e-06, + "loss": 0.1874, + "step": 16368 + }, + { + "epoch": 1.0220157352143244, + "grad_norm": 3.5334219932556152, + "learning_rate": 7.859668356858983e-06, + "loss": 0.2189, + "step": 16369 + }, + { + "epoch": 1.0220293000542593, + "grad_norm": 4.467902183532715, + "learning_rate": 7.859531314238728e-06, + "loss": 0.2621, + "step": 16370 + }, + { + "epoch": 1.0220428648941942, + "grad_norm": 5.547165870666504, + "learning_rate": 7.859394271618474e-06, + "loss": 0.2948, + "step": 16371 + }, + { + "epoch": 1.0220564297341292, + "grad_norm": 5.632651329040527, + "learning_rate": 7.859257228998219e-06, + "loss": 0.3808, + "step": 16372 + }, + { + "epoch": 1.0220699945740641, + "grad_norm": 5.27165412902832, + "learning_rate": 7.859120186377964e-06, + "loss": 0.3208, + "step": 16373 + }, + { + "epoch": 1.022083559413999, + "grad_norm": 5.3841447830200195, + "learning_rate": 7.858983143757709e-06, + "loss": 0.2528, + "step": 16374 + }, + { + "epoch": 1.0220971242539338, + "grad_norm": 6.407102584838867, + "learning_rate": 7.858846101137454e-06, + "loss": 0.4914, + "step": 16375 + }, + { + "epoch": 1.0221106890938687, + "grad_norm": 4.793783187866211, + "learning_rate": 7.8587090585172e-06, + "loss": 0.243, + "step": 16376 + }, + { + "epoch": 1.0221242539338036, + "grad_norm": 5.3003315925598145, + "learning_rate": 7.858572015896945e-06, + "loss": 0.3384, + "step": 16377 + }, + { + "epoch": 1.0221378187737384, + "grad_norm": 5.515944957733154, + "learning_rate": 7.85843497327669e-06, + "loss": 0.3065, + "step": 16378 + }, + { + "epoch": 1.0221513836136733, + "grad_norm": 7.3017144203186035, + "learning_rate": 7.858297930656435e-06, + "loss": 0.3319, + "step": 16379 + }, + { + "epoch": 1.0221649484536082, + "grad_norm": 5.797325611114502, + "learning_rate": 7.85816088803618e-06, + "loss": 0.4903, + "step": 16380 + }, + { + "epoch": 1.022178513293543, + "grad_norm": 3.8085129261016846, + "learning_rate": 7.858023845415925e-06, + "loss": 0.2414, + "step": 16381 + }, + { + "epoch": 1.0221920781334781, + "grad_norm": 4.52668571472168, + "learning_rate": 7.85788680279567e-06, + "loss": 0.2992, + "step": 16382 + }, + { + "epoch": 1.022205642973413, + "grad_norm": 8.107095718383789, + "learning_rate": 7.857749760175416e-06, + "loss": 0.6748, + "step": 16383 + }, + { + "epoch": 1.0222192078133479, + "grad_norm": 6.605216026306152, + "learning_rate": 7.85761271755516e-06, + "loss": 0.3842, + "step": 16384 + }, + { + "epoch": 1.0222327726532827, + "grad_norm": 5.108030796051025, + "learning_rate": 7.857475674934906e-06, + "loss": 0.3682, + "step": 16385 + }, + { + "epoch": 1.0222463374932176, + "grad_norm": 5.318918228149414, + "learning_rate": 7.857338632314651e-06, + "loss": 0.3016, + "step": 16386 + }, + { + "epoch": 1.0222599023331524, + "grad_norm": 6.894967555999756, + "learning_rate": 7.857201589694395e-06, + "loss": 0.3108, + "step": 16387 + }, + { + "epoch": 1.0222734671730873, + "grad_norm": 5.881556987762451, + "learning_rate": 7.85706454707414e-06, + "loss": 0.3986, + "step": 16388 + }, + { + "epoch": 1.0222870320130222, + "grad_norm": 3.97418212890625, + "learning_rate": 7.856927504453887e-06, + "loss": 0.2313, + "step": 16389 + }, + { + "epoch": 1.022300596852957, + "grad_norm": 4.643820285797119, + "learning_rate": 7.85679046183363e-06, + "loss": 0.2993, + "step": 16390 + }, + { + "epoch": 1.0223141616928921, + "grad_norm": 4.366233825683594, + "learning_rate": 7.856653419213375e-06, + "loss": 0.2396, + "step": 16391 + }, + { + "epoch": 1.022327726532827, + "grad_norm": 5.181544780731201, + "learning_rate": 7.85651637659312e-06, + "loss": 0.2719, + "step": 16392 + }, + { + "epoch": 1.0223412913727619, + "grad_norm": 5.441892623901367, + "learning_rate": 7.856379333972867e-06, + "loss": 0.2446, + "step": 16393 + }, + { + "epoch": 1.0223548562126967, + "grad_norm": 4.593164920806885, + "learning_rate": 7.856242291352611e-06, + "loss": 0.3055, + "step": 16394 + }, + { + "epoch": 1.0223684210526316, + "grad_norm": 5.739780426025391, + "learning_rate": 7.856105248732356e-06, + "loss": 0.3559, + "step": 16395 + }, + { + "epoch": 1.0223819858925665, + "grad_norm": 4.316882133483887, + "learning_rate": 7.855968206112101e-06, + "loss": 0.2586, + "step": 16396 + }, + { + "epoch": 1.0223955507325013, + "grad_norm": 5.481861591339111, + "learning_rate": 7.855831163491847e-06, + "loss": 0.4124, + "step": 16397 + }, + { + "epoch": 1.0224091155724362, + "grad_norm": 4.80151891708374, + "learning_rate": 7.855694120871592e-06, + "loss": 0.2865, + "step": 16398 + }, + { + "epoch": 1.022422680412371, + "grad_norm": 4.9537763595581055, + "learning_rate": 7.855557078251337e-06, + "loss": 0.4454, + "step": 16399 + }, + { + "epoch": 1.022436245252306, + "grad_norm": 4.03760290145874, + "learning_rate": 7.855420035631082e-06, + "loss": 0.2401, + "step": 16400 + }, + { + "epoch": 1.022449810092241, + "grad_norm": 5.714296340942383, + "learning_rate": 7.855282993010827e-06, + "loss": 0.3687, + "step": 16401 + }, + { + "epoch": 1.0224633749321759, + "grad_norm": 5.913295745849609, + "learning_rate": 7.855145950390572e-06, + "loss": 0.3768, + "step": 16402 + }, + { + "epoch": 1.0224769397721107, + "grad_norm": 6.760695457458496, + "learning_rate": 7.855008907770318e-06, + "loss": 0.3779, + "step": 16403 + }, + { + "epoch": 1.0224905046120456, + "grad_norm": 6.822667598724365, + "learning_rate": 7.854871865150063e-06, + "loss": 0.4556, + "step": 16404 + }, + { + "epoch": 1.0225040694519805, + "grad_norm": 5.611288547515869, + "learning_rate": 7.854734822529806e-06, + "loss": 0.5063, + "step": 16405 + }, + { + "epoch": 1.0225176342919153, + "grad_norm": 6.355534076690674, + "learning_rate": 7.854597779909553e-06, + "loss": 0.3489, + "step": 16406 + }, + { + "epoch": 1.0225311991318502, + "grad_norm": 5.76102876663208, + "learning_rate": 7.854460737289298e-06, + "loss": 0.3814, + "step": 16407 + }, + { + "epoch": 1.022544763971785, + "grad_norm": 5.338846683502197, + "learning_rate": 7.854323694669043e-06, + "loss": 0.3447, + "step": 16408 + }, + { + "epoch": 1.02255832881172, + "grad_norm": 6.984132289886475, + "learning_rate": 7.854186652048787e-06, + "loss": 0.3819, + "step": 16409 + }, + { + "epoch": 1.022571893651655, + "grad_norm": 4.473879337310791, + "learning_rate": 7.854049609428534e-06, + "loss": 0.2478, + "step": 16410 + }, + { + "epoch": 1.0225854584915899, + "grad_norm": 6.089578151702881, + "learning_rate": 7.853912566808279e-06, + "loss": 0.3961, + "step": 16411 + }, + { + "epoch": 1.0225990233315247, + "grad_norm": 6.774605751037598, + "learning_rate": 7.853775524188022e-06, + "loss": 0.4334, + "step": 16412 + }, + { + "epoch": 1.0226125881714596, + "grad_norm": 5.762448787689209, + "learning_rate": 7.853638481567768e-06, + "loss": 0.4076, + "step": 16413 + }, + { + "epoch": 1.0226261530113945, + "grad_norm": 6.032712936401367, + "learning_rate": 7.853501438947513e-06, + "loss": 0.2688, + "step": 16414 + }, + { + "epoch": 1.0226397178513293, + "grad_norm": 4.563484191894531, + "learning_rate": 7.853364396327258e-06, + "loss": 0.262, + "step": 16415 + }, + { + "epoch": 1.0226532826912642, + "grad_norm": 6.593796253204346, + "learning_rate": 7.853227353707003e-06, + "loss": 0.5032, + "step": 16416 + }, + { + "epoch": 1.022666847531199, + "grad_norm": 5.792290687561035, + "learning_rate": 7.853090311086748e-06, + "loss": 0.4533, + "step": 16417 + }, + { + "epoch": 1.022680412371134, + "grad_norm": 6.690173149108887, + "learning_rate": 7.852953268466494e-06, + "loss": 0.4349, + "step": 16418 + }, + { + "epoch": 1.0226939772110688, + "grad_norm": 5.175749778747559, + "learning_rate": 7.852816225846239e-06, + "loss": 0.3243, + "step": 16419 + }, + { + "epoch": 1.022707542051004, + "grad_norm": 4.952328681945801, + "learning_rate": 7.852679183225984e-06, + "loss": 0.3171, + "step": 16420 + }, + { + "epoch": 1.0227211068909388, + "grad_norm": 4.187361240386963, + "learning_rate": 7.852542140605729e-06, + "loss": 0.2681, + "step": 16421 + }, + { + "epoch": 1.0227346717308736, + "grad_norm": 5.210773468017578, + "learning_rate": 7.852405097985474e-06, + "loss": 0.2949, + "step": 16422 + }, + { + "epoch": 1.0227482365708085, + "grad_norm": 5.098575115203857, + "learning_rate": 7.85226805536522e-06, + "loss": 0.2285, + "step": 16423 + }, + { + "epoch": 1.0227618014107434, + "grad_norm": 5.064241409301758, + "learning_rate": 7.852131012744965e-06, + "loss": 0.2716, + "step": 16424 + }, + { + "epoch": 1.0227753662506782, + "grad_norm": 7.313899993896484, + "learning_rate": 7.85199397012471e-06, + "loss": 0.5044, + "step": 16425 + }, + { + "epoch": 1.022788931090613, + "grad_norm": 5.090393543243408, + "learning_rate": 7.851856927504455e-06, + "loss": 0.3555, + "step": 16426 + }, + { + "epoch": 1.022802495930548, + "grad_norm": 5.3407368659973145, + "learning_rate": 7.851719884884198e-06, + "loss": 0.2481, + "step": 16427 + }, + { + "epoch": 1.0228160607704828, + "grad_norm": 4.5863752365112305, + "learning_rate": 7.851582842263945e-06, + "loss": 0.2099, + "step": 16428 + }, + { + "epoch": 1.022829625610418, + "grad_norm": 3.431042432785034, + "learning_rate": 7.85144579964369e-06, + "loss": 0.1645, + "step": 16429 + }, + { + "epoch": 1.0228431904503528, + "grad_norm": 6.757532596588135, + "learning_rate": 7.851308757023434e-06, + "loss": 0.4009, + "step": 16430 + }, + { + "epoch": 1.0228567552902876, + "grad_norm": 4.966290473937988, + "learning_rate": 7.85117171440318e-06, + "loss": 0.3044, + "step": 16431 + }, + { + "epoch": 1.0228703201302225, + "grad_norm": 5.3099775314331055, + "learning_rate": 7.851034671782926e-06, + "loss": 0.2601, + "step": 16432 + }, + { + "epoch": 1.0228838849701574, + "grad_norm": 5.391044616699219, + "learning_rate": 7.85089762916267e-06, + "loss": 0.2017, + "step": 16433 + }, + { + "epoch": 1.0228974498100922, + "grad_norm": 4.1211113929748535, + "learning_rate": 7.850760586542415e-06, + "loss": 0.1986, + "step": 16434 + }, + { + "epoch": 1.022911014650027, + "grad_norm": 4.317183017730713, + "learning_rate": 7.85062354392216e-06, + "loss": 0.284, + "step": 16435 + }, + { + "epoch": 1.022924579489962, + "grad_norm": 4.2706427574157715, + "learning_rate": 7.850486501301907e-06, + "loss": 0.2192, + "step": 16436 + }, + { + "epoch": 1.0229381443298968, + "grad_norm": 3.4612436294555664, + "learning_rate": 7.85034945868165e-06, + "loss": 0.1972, + "step": 16437 + }, + { + "epoch": 1.0229517091698317, + "grad_norm": 5.459146022796631, + "learning_rate": 7.850212416061395e-06, + "loss": 0.2516, + "step": 16438 + }, + { + "epoch": 1.0229652740097668, + "grad_norm": 6.090274333953857, + "learning_rate": 7.85007537344114e-06, + "loss": 0.2459, + "step": 16439 + }, + { + "epoch": 1.0229788388497016, + "grad_norm": 5.413323879241943, + "learning_rate": 7.849938330820886e-06, + "loss": 0.2395, + "step": 16440 + }, + { + "epoch": 1.0229924036896365, + "grad_norm": 4.591625690460205, + "learning_rate": 7.849801288200631e-06, + "loss": 0.2543, + "step": 16441 + }, + { + "epoch": 1.0230059685295714, + "grad_norm": 6.0254926681518555, + "learning_rate": 7.849664245580376e-06, + "loss": 0.2513, + "step": 16442 + }, + { + "epoch": 1.0230195333695062, + "grad_norm": 6.524773120880127, + "learning_rate": 7.849527202960121e-06, + "loss": 0.3654, + "step": 16443 + }, + { + "epoch": 1.023033098209441, + "grad_norm": 4.891541004180908, + "learning_rate": 7.849390160339867e-06, + "loss": 0.2239, + "step": 16444 + }, + { + "epoch": 1.023046663049376, + "grad_norm": 4.546419620513916, + "learning_rate": 7.849253117719612e-06, + "loss": 0.2034, + "step": 16445 + }, + { + "epoch": 1.0230602278893108, + "grad_norm": 6.222914218902588, + "learning_rate": 7.849116075099357e-06, + "loss": 0.4232, + "step": 16446 + }, + { + "epoch": 1.0230737927292457, + "grad_norm": 7.776647090911865, + "learning_rate": 7.848979032479102e-06, + "loss": 0.3481, + "step": 16447 + }, + { + "epoch": 1.0230873575691808, + "grad_norm": 4.996371746063232, + "learning_rate": 7.848841989858846e-06, + "loss": 0.2781, + "step": 16448 + }, + { + "epoch": 1.0231009224091157, + "grad_norm": 5.602666854858398, + "learning_rate": 7.848704947238592e-06, + "loss": 0.2651, + "step": 16449 + }, + { + "epoch": 1.0231144872490505, + "grad_norm": 7.435891151428223, + "learning_rate": 7.848567904618338e-06, + "loss": 0.3144, + "step": 16450 + }, + { + "epoch": 1.0231280520889854, + "grad_norm": 5.676605701446533, + "learning_rate": 7.848430861998083e-06, + "loss": 0.2618, + "step": 16451 + }, + { + "epoch": 1.0231416169289203, + "grad_norm": 5.358592510223389, + "learning_rate": 7.848293819377826e-06, + "loss": 0.3397, + "step": 16452 + }, + { + "epoch": 1.0231551817688551, + "grad_norm": 5.105745315551758, + "learning_rate": 7.848156776757571e-06, + "loss": 0.2762, + "step": 16453 + }, + { + "epoch": 1.02316874660879, + "grad_norm": 4.718258857727051, + "learning_rate": 7.848019734137318e-06, + "loss": 0.2442, + "step": 16454 + }, + { + "epoch": 1.0231823114487248, + "grad_norm": 5.94035005569458, + "learning_rate": 7.847882691517062e-06, + "loss": 0.2611, + "step": 16455 + }, + { + "epoch": 1.0231958762886597, + "grad_norm": 4.794371128082275, + "learning_rate": 7.847745648896807e-06, + "loss": 0.304, + "step": 16456 + }, + { + "epoch": 1.0232094411285946, + "grad_norm": 7.389484882354736, + "learning_rate": 7.847608606276552e-06, + "loss": 0.4146, + "step": 16457 + }, + { + "epoch": 1.0232230059685297, + "grad_norm": 4.235584259033203, + "learning_rate": 7.847471563656297e-06, + "loss": 0.1727, + "step": 16458 + }, + { + "epoch": 1.0232365708084645, + "grad_norm": 4.782838344573975, + "learning_rate": 7.847334521036043e-06, + "loss": 0.3541, + "step": 16459 + }, + { + "epoch": 1.0232501356483994, + "grad_norm": 5.8106608390808105, + "learning_rate": 7.847197478415788e-06, + "loss": 0.2588, + "step": 16460 + }, + { + "epoch": 1.0232637004883343, + "grad_norm": 4.8242669105529785, + "learning_rate": 7.847060435795533e-06, + "loss": 0.1789, + "step": 16461 + }, + { + "epoch": 1.0232772653282691, + "grad_norm": 7.636277675628662, + "learning_rate": 7.846923393175278e-06, + "loss": 0.3763, + "step": 16462 + }, + { + "epoch": 1.023290830168204, + "grad_norm": 5.296022415161133, + "learning_rate": 7.846786350555023e-06, + "loss": 0.1986, + "step": 16463 + }, + { + "epoch": 1.0233043950081389, + "grad_norm": 5.593076229095459, + "learning_rate": 7.846649307934768e-06, + "loss": 0.2919, + "step": 16464 + }, + { + "epoch": 1.0233179598480737, + "grad_norm": 7.82378625869751, + "learning_rate": 7.846512265314514e-06, + "loss": 0.3173, + "step": 16465 + }, + { + "epoch": 1.0233315246880086, + "grad_norm": 5.38529109954834, + "learning_rate": 7.846375222694259e-06, + "loss": 0.2599, + "step": 16466 + }, + { + "epoch": 1.0233450895279437, + "grad_norm": 4.89611291885376, + "learning_rate": 7.846238180074004e-06, + "loss": 0.2591, + "step": 16467 + }, + { + "epoch": 1.0233586543678785, + "grad_norm": 4.692565441131592, + "learning_rate": 7.846101137453749e-06, + "loss": 0.2054, + "step": 16468 + }, + { + "epoch": 1.0233722192078134, + "grad_norm": 5.942986011505127, + "learning_rate": 7.845964094833494e-06, + "loss": 0.3192, + "step": 16469 + }, + { + "epoch": 1.0233857840477483, + "grad_norm": 5.248490333557129, + "learning_rate": 7.845827052213238e-06, + "loss": 0.2877, + "step": 16470 + }, + { + "epoch": 1.0233993488876831, + "grad_norm": 4.808775424957275, + "learning_rate": 7.845690009592985e-06, + "loss": 0.3086, + "step": 16471 + }, + { + "epoch": 1.023412913727618, + "grad_norm": 5.488687992095947, + "learning_rate": 7.84555296697273e-06, + "loss": 0.2195, + "step": 16472 + }, + { + "epoch": 1.0234264785675529, + "grad_norm": 5.278878211975098, + "learning_rate": 7.845415924352473e-06, + "loss": 0.2282, + "step": 16473 + }, + { + "epoch": 1.0234400434074877, + "grad_norm": 5.401824951171875, + "learning_rate": 7.845278881732219e-06, + "loss": 0.3732, + "step": 16474 + }, + { + "epoch": 1.0234536082474226, + "grad_norm": 3.7022945880889893, + "learning_rate": 7.845141839111965e-06, + "loss": 0.1273, + "step": 16475 + }, + { + "epoch": 1.0234671730873575, + "grad_norm": 6.024720668792725, + "learning_rate": 7.84500479649171e-06, + "loss": 0.2881, + "step": 16476 + }, + { + "epoch": 1.0234807379272925, + "grad_norm": 6.685038089752197, + "learning_rate": 7.844867753871454e-06, + "loss": 0.4493, + "step": 16477 + }, + { + "epoch": 1.0234943027672274, + "grad_norm": 4.198681831359863, + "learning_rate": 7.8447307112512e-06, + "loss": 0.2505, + "step": 16478 + }, + { + "epoch": 1.0235078676071623, + "grad_norm": 5.837563991546631, + "learning_rate": 7.844593668630946e-06, + "loss": 0.2721, + "step": 16479 + }, + { + "epoch": 1.0235214324470971, + "grad_norm": 5.290709018707275, + "learning_rate": 7.84445662601069e-06, + "loss": 0.3157, + "step": 16480 + }, + { + "epoch": 1.023534997287032, + "grad_norm": 7.57607364654541, + "learning_rate": 7.844319583390435e-06, + "loss": 0.3188, + "step": 16481 + }, + { + "epoch": 1.0235485621269669, + "grad_norm": 4.682220935821533, + "learning_rate": 7.84418254077018e-06, + "loss": 0.2314, + "step": 16482 + }, + { + "epoch": 1.0235621269669017, + "grad_norm": 6.081136226654053, + "learning_rate": 7.844045498149925e-06, + "loss": 0.2819, + "step": 16483 + }, + { + "epoch": 1.0235756918068366, + "grad_norm": 5.665353298187256, + "learning_rate": 7.84390845552967e-06, + "loss": 0.2753, + "step": 16484 + }, + { + "epoch": 1.0235892566467715, + "grad_norm": 3.9837749004364014, + "learning_rate": 7.843771412909415e-06, + "loss": 0.1826, + "step": 16485 + }, + { + "epoch": 1.0236028214867066, + "grad_norm": 9.842890739440918, + "learning_rate": 7.84363437028916e-06, + "loss": 0.4018, + "step": 16486 + }, + { + "epoch": 1.0236163863266414, + "grad_norm": 6.664912700653076, + "learning_rate": 7.843497327668906e-06, + "loss": 0.3727, + "step": 16487 + }, + { + "epoch": 1.0236299511665763, + "grad_norm": 6.916849136352539, + "learning_rate": 7.843360285048651e-06, + "loss": 0.4214, + "step": 16488 + }, + { + "epoch": 1.0236435160065112, + "grad_norm": 4.828225135803223, + "learning_rate": 7.843223242428396e-06, + "loss": 0.2787, + "step": 16489 + }, + { + "epoch": 1.023657080846446, + "grad_norm": 5.490734100341797, + "learning_rate": 7.843086199808141e-06, + "loss": 0.3429, + "step": 16490 + }, + { + "epoch": 1.0236706456863809, + "grad_norm": 5.438986301422119, + "learning_rate": 7.842949157187887e-06, + "loss": 0.2231, + "step": 16491 + }, + { + "epoch": 1.0236842105263158, + "grad_norm": 7.302206039428711, + "learning_rate": 7.842812114567632e-06, + "loss": 0.3725, + "step": 16492 + }, + { + "epoch": 1.0236977753662506, + "grad_norm": 4.594425678253174, + "learning_rate": 7.842675071947377e-06, + "loss": 0.222, + "step": 16493 + }, + { + "epoch": 1.0237113402061855, + "grad_norm": 4.774195671081543, + "learning_rate": 7.842538029327122e-06, + "loss": 0.2126, + "step": 16494 + }, + { + "epoch": 1.0237249050461203, + "grad_norm": 6.356812953948975, + "learning_rate": 7.842400986706866e-06, + "loss": 0.3429, + "step": 16495 + }, + { + "epoch": 1.0237384698860554, + "grad_norm": 5.6794939041137695, + "learning_rate": 7.84226394408661e-06, + "loss": 0.3467, + "step": 16496 + }, + { + "epoch": 1.0237520347259903, + "grad_norm": 6.634791851043701, + "learning_rate": 7.842126901466358e-06, + "loss": 0.3644, + "step": 16497 + }, + { + "epoch": 1.0237655995659252, + "grad_norm": 7.07133150100708, + "learning_rate": 7.841989858846101e-06, + "loss": 0.4615, + "step": 16498 + }, + { + "epoch": 1.02377916440586, + "grad_norm": 5.553718566894531, + "learning_rate": 7.841852816225846e-06, + "loss": 0.2846, + "step": 16499 + }, + { + "epoch": 1.023792729245795, + "grad_norm": 5.625476360321045, + "learning_rate": 7.841715773605591e-06, + "loss": 0.3167, + "step": 16500 + }, + { + "epoch": 1.0238062940857298, + "grad_norm": 5.0884928703308105, + "learning_rate": 7.841578730985338e-06, + "loss": 0.3154, + "step": 16501 + }, + { + "epoch": 1.0238198589256646, + "grad_norm": 7.727067947387695, + "learning_rate": 7.841441688365082e-06, + "loss": 0.4169, + "step": 16502 + }, + { + "epoch": 1.0238334237655995, + "grad_norm": 4.52079439163208, + "learning_rate": 7.841304645744827e-06, + "loss": 0.2337, + "step": 16503 + }, + { + "epoch": 1.0238469886055344, + "grad_norm": 7.781192779541016, + "learning_rate": 7.841167603124572e-06, + "loss": 0.307, + "step": 16504 + }, + { + "epoch": 1.0238605534454694, + "grad_norm": 6.912903308868408, + "learning_rate": 7.841030560504317e-06, + "loss": 0.5256, + "step": 16505 + }, + { + "epoch": 1.0238741182854043, + "grad_norm": 6.327149868011475, + "learning_rate": 7.840893517884063e-06, + "loss": 0.451, + "step": 16506 + }, + { + "epoch": 1.0238876831253392, + "grad_norm": 7.028332233428955, + "learning_rate": 7.840756475263808e-06, + "loss": 0.3506, + "step": 16507 + }, + { + "epoch": 1.023901247965274, + "grad_norm": 6.466876029968262, + "learning_rate": 7.840619432643553e-06, + "loss": 0.2732, + "step": 16508 + }, + { + "epoch": 1.023914812805209, + "grad_norm": 6.102750778198242, + "learning_rate": 7.840482390023298e-06, + "loss": 0.1995, + "step": 16509 + }, + { + "epoch": 1.0239283776451438, + "grad_norm": 5.5438337326049805, + "learning_rate": 7.840345347403043e-06, + "loss": 0.3066, + "step": 16510 + }, + { + "epoch": 1.0239419424850786, + "grad_norm": 5.614818096160889, + "learning_rate": 7.840208304782788e-06, + "loss": 0.3287, + "step": 16511 + }, + { + "epoch": 1.0239555073250135, + "grad_norm": 4.803893089294434, + "learning_rate": 7.840071262162534e-06, + "loss": 0.2192, + "step": 16512 + }, + { + "epoch": 1.0239690721649484, + "grad_norm": 8.020936965942383, + "learning_rate": 7.839934219542277e-06, + "loss": 0.3883, + "step": 16513 + }, + { + "epoch": 1.0239826370048832, + "grad_norm": 8.682918548583984, + "learning_rate": 7.839797176922024e-06, + "loss": 0.3331, + "step": 16514 + }, + { + "epoch": 1.0239962018448183, + "grad_norm": 5.54893684387207, + "learning_rate": 7.839660134301769e-06, + "loss": 0.2746, + "step": 16515 + }, + { + "epoch": 1.0240097666847532, + "grad_norm": 7.159183025360107, + "learning_rate": 7.839523091681514e-06, + "loss": 0.3083, + "step": 16516 + }, + { + "epoch": 1.024023331524688, + "grad_norm": 4.9267706871032715, + "learning_rate": 7.839386049061258e-06, + "loss": 0.253, + "step": 16517 + }, + { + "epoch": 1.024036896364623, + "grad_norm": 5.117789268493652, + "learning_rate": 7.839249006441005e-06, + "loss": 0.2209, + "step": 16518 + }, + { + "epoch": 1.0240504612045578, + "grad_norm": 4.8530426025390625, + "learning_rate": 7.83911196382075e-06, + "loss": 0.2827, + "step": 16519 + }, + { + "epoch": 1.0240640260444926, + "grad_norm": 6.646688938140869, + "learning_rate": 7.838974921200493e-06, + "loss": 0.2989, + "step": 16520 + }, + { + "epoch": 1.0240775908844275, + "grad_norm": 5.6898722648620605, + "learning_rate": 7.838837878580239e-06, + "loss": 0.2057, + "step": 16521 + }, + { + "epoch": 1.0240911557243624, + "grad_norm": 4.593934535980225, + "learning_rate": 7.838700835959984e-06, + "loss": 0.2537, + "step": 16522 + }, + { + "epoch": 1.0241047205642972, + "grad_norm": 5.036507606506348, + "learning_rate": 7.838563793339729e-06, + "loss": 0.3489, + "step": 16523 + }, + { + "epoch": 1.0241182854042323, + "grad_norm": 6.14997673034668, + "learning_rate": 7.838426750719474e-06, + "loss": 0.3027, + "step": 16524 + }, + { + "epoch": 1.0241318502441672, + "grad_norm": 5.891469955444336, + "learning_rate": 7.83828970809922e-06, + "loss": 0.2842, + "step": 16525 + }, + { + "epoch": 1.024145415084102, + "grad_norm": 4.466601371765137, + "learning_rate": 7.838152665478964e-06, + "loss": 0.2157, + "step": 16526 + }, + { + "epoch": 1.024158979924037, + "grad_norm": 9.843965530395508, + "learning_rate": 7.83801562285871e-06, + "loss": 0.284, + "step": 16527 + }, + { + "epoch": 1.0241725447639718, + "grad_norm": 5.70090913772583, + "learning_rate": 7.837878580238455e-06, + "loss": 0.2219, + "step": 16528 + }, + { + "epoch": 1.0241861096039067, + "grad_norm": 4.6864333152771, + "learning_rate": 7.8377415376182e-06, + "loss": 0.2567, + "step": 16529 + }, + { + "epoch": 1.0241996744438415, + "grad_norm": 5.295551300048828, + "learning_rate": 7.837604494997945e-06, + "loss": 0.2898, + "step": 16530 + }, + { + "epoch": 1.0242132392837764, + "grad_norm": 6.657618522644043, + "learning_rate": 7.83746745237769e-06, + "loss": 0.2803, + "step": 16531 + }, + { + "epoch": 1.0242268041237113, + "grad_norm": 5.840975761413574, + "learning_rate": 7.837330409757435e-06, + "loss": 0.2085, + "step": 16532 + }, + { + "epoch": 1.0242403689636461, + "grad_norm": 5.0134429931640625, + "learning_rate": 7.83719336713718e-06, + "loss": 0.1657, + "step": 16533 + }, + { + "epoch": 1.0242539338035812, + "grad_norm": 6.225395202636719, + "learning_rate": 7.837056324516926e-06, + "loss": 0.3548, + "step": 16534 + }, + { + "epoch": 1.024267498643516, + "grad_norm": 5.187850475311279, + "learning_rate": 7.83691928189667e-06, + "loss": 0.2519, + "step": 16535 + }, + { + "epoch": 1.024281063483451, + "grad_norm": 5.984487056732178, + "learning_rate": 7.836782239276416e-06, + "loss": 0.2601, + "step": 16536 + }, + { + "epoch": 1.0242946283233858, + "grad_norm": 4.606541633605957, + "learning_rate": 7.836645196656161e-06, + "loss": 0.2326, + "step": 16537 + }, + { + "epoch": 1.0243081931633207, + "grad_norm": 5.307666778564453, + "learning_rate": 7.836508154035905e-06, + "loss": 0.2197, + "step": 16538 + }, + { + "epoch": 1.0243217580032555, + "grad_norm": 5.306554794311523, + "learning_rate": 7.83637111141565e-06, + "loss": 0.2699, + "step": 16539 + }, + { + "epoch": 1.0243353228431904, + "grad_norm": 4.3949971199035645, + "learning_rate": 7.836234068795397e-06, + "loss": 0.1423, + "step": 16540 + }, + { + "epoch": 1.0243488876831253, + "grad_norm": 4.504007816314697, + "learning_rate": 7.83609702617514e-06, + "loss": 0.2007, + "step": 16541 + }, + { + "epoch": 1.0243624525230601, + "grad_norm": 7.301656246185303, + "learning_rate": 7.835959983554886e-06, + "loss": 0.3165, + "step": 16542 + }, + { + "epoch": 1.0243760173629952, + "grad_norm": 5.60644006729126, + "learning_rate": 7.83582294093463e-06, + "loss": 0.1814, + "step": 16543 + }, + { + "epoch": 1.02438958220293, + "grad_norm": 7.003950595855713, + "learning_rate": 7.835685898314378e-06, + "loss": 0.3024, + "step": 16544 + }, + { + "epoch": 1.024403147042865, + "grad_norm": 3.2014174461364746, + "learning_rate": 7.835548855694121e-06, + "loss": 0.1404, + "step": 16545 + }, + { + "epoch": 1.0244167118827998, + "grad_norm": 4.480578899383545, + "learning_rate": 7.835411813073866e-06, + "loss": 0.2091, + "step": 16546 + }, + { + "epoch": 1.0244302767227347, + "grad_norm": 6.476655960083008, + "learning_rate": 7.835274770453611e-06, + "loss": 0.3429, + "step": 16547 + }, + { + "epoch": 1.0244438415626695, + "grad_norm": 7.787850856781006, + "learning_rate": 7.835137727833357e-06, + "loss": 0.3745, + "step": 16548 + }, + { + "epoch": 1.0244574064026044, + "grad_norm": 5.5047993659973145, + "learning_rate": 7.835000685213102e-06, + "loss": 0.2822, + "step": 16549 + }, + { + "epoch": 1.0244709712425393, + "grad_norm": 7.8891496658325195, + "learning_rate": 7.834863642592847e-06, + "loss": 0.3342, + "step": 16550 + }, + { + "epoch": 1.0244845360824741, + "grad_norm": 5.813490867614746, + "learning_rate": 7.834726599972592e-06, + "loss": 0.3369, + "step": 16551 + }, + { + "epoch": 1.024498100922409, + "grad_norm": 4.1425628662109375, + "learning_rate": 7.834589557352337e-06, + "loss": 0.1927, + "step": 16552 + }, + { + "epoch": 1.024511665762344, + "grad_norm": 4.38266134262085, + "learning_rate": 7.834452514732083e-06, + "loss": 0.2115, + "step": 16553 + }, + { + "epoch": 1.024525230602279, + "grad_norm": 4.514869689941406, + "learning_rate": 7.834315472111828e-06, + "loss": 0.1905, + "step": 16554 + }, + { + "epoch": 1.0245387954422138, + "grad_norm": 8.36175537109375, + "learning_rate": 7.834178429491573e-06, + "loss": 0.4598, + "step": 16555 + }, + { + "epoch": 1.0245523602821487, + "grad_norm": 4.936639785766602, + "learning_rate": 7.834041386871316e-06, + "loss": 0.2258, + "step": 16556 + }, + { + "epoch": 1.0245659251220836, + "grad_norm": 4.634466648101807, + "learning_rate": 7.833904344251063e-06, + "loss": 0.2422, + "step": 16557 + }, + { + "epoch": 1.0245794899620184, + "grad_norm": 3.777777910232544, + "learning_rate": 7.833767301630808e-06, + "loss": 0.2364, + "step": 16558 + }, + { + "epoch": 1.0245930548019533, + "grad_norm": 7.6560444831848145, + "learning_rate": 7.833630259010554e-06, + "loss": 0.4954, + "step": 16559 + }, + { + "epoch": 1.0246066196418881, + "grad_norm": 4.118927001953125, + "learning_rate": 7.833493216390297e-06, + "loss": 0.242, + "step": 16560 + }, + { + "epoch": 1.024620184481823, + "grad_norm": 5.127315998077393, + "learning_rate": 7.833356173770044e-06, + "loss": 0.2808, + "step": 16561 + }, + { + "epoch": 1.024633749321758, + "grad_norm": 4.847870826721191, + "learning_rate": 7.83321913114979e-06, + "loss": 0.2234, + "step": 16562 + }, + { + "epoch": 1.024647314161693, + "grad_norm": 4.810958385467529, + "learning_rate": 7.833082088529533e-06, + "loss": 0.3162, + "step": 16563 + }, + { + "epoch": 1.0246608790016278, + "grad_norm": 5.3687872886657715, + "learning_rate": 7.832945045909278e-06, + "loss": 0.2291, + "step": 16564 + }, + { + "epoch": 1.0246744438415627, + "grad_norm": 4.1519269943237305, + "learning_rate": 7.832808003289023e-06, + "loss": 0.2686, + "step": 16565 + }, + { + "epoch": 1.0246880086814976, + "grad_norm": 4.708798885345459, + "learning_rate": 7.832670960668768e-06, + "loss": 0.2755, + "step": 16566 + }, + { + "epoch": 1.0247015735214324, + "grad_norm": 4.718486785888672, + "learning_rate": 7.832533918048513e-06, + "loss": 0.2451, + "step": 16567 + }, + { + "epoch": 1.0247151383613673, + "grad_norm": 5.977499485015869, + "learning_rate": 7.832396875428259e-06, + "loss": 0.3076, + "step": 16568 + }, + { + "epoch": 1.0247287032013022, + "grad_norm": 4.962883472442627, + "learning_rate": 7.832259832808004e-06, + "loss": 0.2632, + "step": 16569 + }, + { + "epoch": 1.024742268041237, + "grad_norm": 5.012930393218994, + "learning_rate": 7.832122790187749e-06, + "loss": 0.2572, + "step": 16570 + }, + { + "epoch": 1.024755832881172, + "grad_norm": 5.476528167724609, + "learning_rate": 7.831985747567494e-06, + "loss": 0.3695, + "step": 16571 + }, + { + "epoch": 1.024769397721107, + "grad_norm": 4.930349826812744, + "learning_rate": 7.83184870494724e-06, + "loss": 0.2465, + "step": 16572 + }, + { + "epoch": 1.0247829625610418, + "grad_norm": 6.577532768249512, + "learning_rate": 7.831711662326984e-06, + "loss": 0.2503, + "step": 16573 + }, + { + "epoch": 1.0247965274009767, + "grad_norm": 4.765666961669922, + "learning_rate": 7.83157461970673e-06, + "loss": 0.2612, + "step": 16574 + }, + { + "epoch": 1.0248100922409116, + "grad_norm": 4.711646556854248, + "learning_rate": 7.831437577086475e-06, + "loss": 0.2331, + "step": 16575 + }, + { + "epoch": 1.0248236570808464, + "grad_norm": 5.086430072784424, + "learning_rate": 7.83130053446622e-06, + "loss": 0.2446, + "step": 16576 + }, + { + "epoch": 1.0248372219207813, + "grad_norm": 4.820980548858643, + "learning_rate": 7.831163491845965e-06, + "loss": 0.3435, + "step": 16577 + }, + { + "epoch": 1.0248507867607162, + "grad_norm": 5.080136299133301, + "learning_rate": 7.831026449225709e-06, + "loss": 0.3107, + "step": 16578 + }, + { + "epoch": 1.024864351600651, + "grad_norm": 3.3406219482421875, + "learning_rate": 7.830889406605456e-06, + "loss": 0.1111, + "step": 16579 + }, + { + "epoch": 1.024877916440586, + "grad_norm": 4.049747943878174, + "learning_rate": 7.8307523639852e-06, + "loss": 0.1538, + "step": 16580 + }, + { + "epoch": 1.024891481280521, + "grad_norm": 4.266133785247803, + "learning_rate": 7.830615321364944e-06, + "loss": 0.1827, + "step": 16581 + }, + { + "epoch": 1.0249050461204559, + "grad_norm": 4.133293628692627, + "learning_rate": 7.83047827874469e-06, + "loss": 0.1907, + "step": 16582 + }, + { + "epoch": 1.0249186109603907, + "grad_norm": 5.891274452209473, + "learning_rate": 7.830341236124436e-06, + "loss": 0.2198, + "step": 16583 + }, + { + "epoch": 1.0249321758003256, + "grad_norm": 3.1844122409820557, + "learning_rate": 7.830204193504181e-06, + "loss": 0.1566, + "step": 16584 + }, + { + "epoch": 1.0249457406402604, + "grad_norm": 6.549688339233398, + "learning_rate": 7.830067150883925e-06, + "loss": 0.3481, + "step": 16585 + }, + { + "epoch": 1.0249593054801953, + "grad_norm": 4.696863651275635, + "learning_rate": 7.82993010826367e-06, + "loss": 0.2178, + "step": 16586 + }, + { + "epoch": 1.0249728703201302, + "grad_norm": 4.141533374786377, + "learning_rate": 7.829793065643417e-06, + "loss": 0.2393, + "step": 16587 + }, + { + "epoch": 1.024986435160065, + "grad_norm": 7.5283732414245605, + "learning_rate": 7.82965602302316e-06, + "loss": 0.3133, + "step": 16588 + }, + { + "epoch": 1.025, + "grad_norm": 4.5198235511779785, + "learning_rate": 7.829518980402906e-06, + "loss": 0.2616, + "step": 16589 + }, + { + "epoch": 1.0250135648399348, + "grad_norm": 4.475947856903076, + "learning_rate": 7.82938193778265e-06, + "loss": 0.1606, + "step": 16590 + }, + { + "epoch": 1.0250271296798699, + "grad_norm": 4.62088680267334, + "learning_rate": 7.829244895162396e-06, + "loss": 0.2568, + "step": 16591 + }, + { + "epoch": 1.0250406945198047, + "grad_norm": 5.676570415496826, + "learning_rate": 7.829107852542141e-06, + "loss": 0.2505, + "step": 16592 + }, + { + "epoch": 1.0250542593597396, + "grad_norm": 6.124135494232178, + "learning_rate": 7.828970809921886e-06, + "loss": 0.3462, + "step": 16593 + }, + { + "epoch": 1.0250678241996745, + "grad_norm": 3.7003462314605713, + "learning_rate": 7.828833767301632e-06, + "loss": 0.1803, + "step": 16594 + }, + { + "epoch": 1.0250813890396093, + "grad_norm": 5.980307579040527, + "learning_rate": 7.828696724681377e-06, + "loss": 0.2729, + "step": 16595 + }, + { + "epoch": 1.0250949538795442, + "grad_norm": 5.660566806793213, + "learning_rate": 7.828559682061122e-06, + "loss": 0.2402, + "step": 16596 + }, + { + "epoch": 1.025108518719479, + "grad_norm": 4.535070419311523, + "learning_rate": 7.828422639440867e-06, + "loss": 0.2455, + "step": 16597 + }, + { + "epoch": 1.025122083559414, + "grad_norm": 4.507297039031982, + "learning_rate": 7.828285596820612e-06, + "loss": 0.2377, + "step": 16598 + }, + { + "epoch": 1.0251356483993488, + "grad_norm": 5.774580955505371, + "learning_rate": 7.828148554200357e-06, + "loss": 0.2246, + "step": 16599 + }, + { + "epoch": 1.0251492132392839, + "grad_norm": 5.027997016906738, + "learning_rate": 7.828011511580103e-06, + "loss": 0.2373, + "step": 16600 + }, + { + "epoch": 1.0251627780792187, + "grad_norm": 5.661487102508545, + "learning_rate": 7.827874468959848e-06, + "loss": 0.2481, + "step": 16601 + }, + { + "epoch": 1.0251763429191536, + "grad_norm": 6.355083465576172, + "learning_rate": 7.827737426339593e-06, + "loss": 0.2921, + "step": 16602 + }, + { + "epoch": 1.0251899077590885, + "grad_norm": 4.490642070770264, + "learning_rate": 7.827600383719336e-06, + "loss": 0.2038, + "step": 16603 + }, + { + "epoch": 1.0252034725990233, + "grad_norm": 4.675296783447266, + "learning_rate": 7.827463341099082e-06, + "loss": 0.2706, + "step": 16604 + }, + { + "epoch": 1.0252170374389582, + "grad_norm": 4.428818702697754, + "learning_rate": 7.827326298478828e-06, + "loss": 0.1489, + "step": 16605 + }, + { + "epoch": 1.025230602278893, + "grad_norm": 4.940815448760986, + "learning_rate": 7.827189255858572e-06, + "loss": 0.1959, + "step": 16606 + }, + { + "epoch": 1.025244167118828, + "grad_norm": 3.0216472148895264, + "learning_rate": 7.827052213238317e-06, + "loss": 0.1283, + "step": 16607 + }, + { + "epoch": 1.0252577319587628, + "grad_norm": 5.34396505355835, + "learning_rate": 7.826915170618062e-06, + "loss": 0.2699, + "step": 16608 + }, + { + "epoch": 1.0252712967986979, + "grad_norm": 3.911205291748047, + "learning_rate": 7.82677812799781e-06, + "loss": 0.1953, + "step": 16609 + }, + { + "epoch": 1.0252848616386327, + "grad_norm": 3.2440922260284424, + "learning_rate": 7.826641085377553e-06, + "loss": 0.1341, + "step": 16610 + }, + { + "epoch": 1.0252984264785676, + "grad_norm": 4.075784206390381, + "learning_rate": 7.826504042757298e-06, + "loss": 0.205, + "step": 16611 + }, + { + "epoch": 1.0253119913185025, + "grad_norm": 4.442863941192627, + "learning_rate": 7.826367000137043e-06, + "loss": 0.1893, + "step": 16612 + }, + { + "epoch": 1.0253255561584373, + "grad_norm": 5.279369354248047, + "learning_rate": 7.826229957516788e-06, + "loss": 0.2331, + "step": 16613 + }, + { + "epoch": 1.0253391209983722, + "grad_norm": 4.5792155265808105, + "learning_rate": 7.826092914896533e-06, + "loss": 0.2036, + "step": 16614 + }, + { + "epoch": 1.025352685838307, + "grad_norm": 5.458748817443848, + "learning_rate": 7.825955872276279e-06, + "loss": 0.1911, + "step": 16615 + }, + { + "epoch": 1.025366250678242, + "grad_norm": 5.520580768585205, + "learning_rate": 7.825818829656024e-06, + "loss": 0.2959, + "step": 16616 + }, + { + "epoch": 1.0253798155181768, + "grad_norm": 4.006349086761475, + "learning_rate": 7.825681787035769e-06, + "loss": 0.1945, + "step": 16617 + }, + { + "epoch": 1.0253933803581117, + "grad_norm": 5.756688594818115, + "learning_rate": 7.825544744415514e-06, + "loss": 0.3264, + "step": 16618 + }, + { + "epoch": 1.0254069451980468, + "grad_norm": 3.7835853099823, + "learning_rate": 7.82540770179526e-06, + "loss": 0.2061, + "step": 16619 + }, + { + "epoch": 1.0254205100379816, + "grad_norm": 6.121003150939941, + "learning_rate": 7.825270659175004e-06, + "loss": 0.2456, + "step": 16620 + }, + { + "epoch": 1.0254340748779165, + "grad_norm": 5.292691707611084, + "learning_rate": 7.825133616554748e-06, + "loss": 0.2913, + "step": 16621 + }, + { + "epoch": 1.0254476397178514, + "grad_norm": 4.5865607261657715, + "learning_rate": 7.824996573934495e-06, + "loss": 0.2025, + "step": 16622 + }, + { + "epoch": 1.0254612045577862, + "grad_norm": 5.38189172744751, + "learning_rate": 7.82485953131424e-06, + "loss": 0.3185, + "step": 16623 + }, + { + "epoch": 1.025474769397721, + "grad_norm": 4.862104892730713, + "learning_rate": 7.824722488693983e-06, + "loss": 0.2126, + "step": 16624 + }, + { + "epoch": 1.025488334237656, + "grad_norm": 5.410445213317871, + "learning_rate": 7.824585446073729e-06, + "loss": 0.2288, + "step": 16625 + }, + { + "epoch": 1.0255018990775908, + "grad_norm": 4.450624942779541, + "learning_rate": 7.824448403453476e-06, + "loss": 0.2053, + "step": 16626 + }, + { + "epoch": 1.0255154639175257, + "grad_norm": 4.231311321258545, + "learning_rate": 7.82431136083322e-06, + "loss": 0.2048, + "step": 16627 + }, + { + "epoch": 1.0255290287574608, + "grad_norm": 5.403324604034424, + "learning_rate": 7.824174318212964e-06, + "loss": 0.276, + "step": 16628 + }, + { + "epoch": 1.0255425935973956, + "grad_norm": 5.091404914855957, + "learning_rate": 7.82403727559271e-06, + "loss": 0.2147, + "step": 16629 + }, + { + "epoch": 1.0255561584373305, + "grad_norm": 6.532826900482178, + "learning_rate": 7.823900232972456e-06, + "loss": 0.2823, + "step": 16630 + }, + { + "epoch": 1.0255697232772654, + "grad_norm": 8.104373931884766, + "learning_rate": 7.8237631903522e-06, + "loss": 0.4846, + "step": 16631 + }, + { + "epoch": 1.0255832881172002, + "grad_norm": 4.965483665466309, + "learning_rate": 7.823626147731945e-06, + "loss": 0.2135, + "step": 16632 + }, + { + "epoch": 1.025596852957135, + "grad_norm": 5.008089542388916, + "learning_rate": 7.82348910511169e-06, + "loss": 0.2829, + "step": 16633 + }, + { + "epoch": 1.02561041779707, + "grad_norm": 7.128828048706055, + "learning_rate": 7.823352062491435e-06, + "loss": 0.4129, + "step": 16634 + }, + { + "epoch": 1.0256239826370048, + "grad_norm": 4.703209400177002, + "learning_rate": 7.82321501987118e-06, + "loss": 0.2661, + "step": 16635 + }, + { + "epoch": 1.0256375474769397, + "grad_norm": 7.068612098693848, + "learning_rate": 7.823077977250926e-06, + "loss": 0.2891, + "step": 16636 + }, + { + "epoch": 1.0256511123168746, + "grad_norm": 5.808395862579346, + "learning_rate": 7.82294093463067e-06, + "loss": 0.363, + "step": 16637 + }, + { + "epoch": 1.0256646771568096, + "grad_norm": 3.6758031845092773, + "learning_rate": 7.822803892010416e-06, + "loss": 0.234, + "step": 16638 + }, + { + "epoch": 1.0256782419967445, + "grad_norm": 7.276554107666016, + "learning_rate": 7.822666849390161e-06, + "loss": 0.3147, + "step": 16639 + }, + { + "epoch": 1.0256918068366794, + "grad_norm": 5.536870002746582, + "learning_rate": 7.822529806769906e-06, + "loss": 0.2956, + "step": 16640 + }, + { + "epoch": 1.0257053716766142, + "grad_norm": 3.942549228668213, + "learning_rate": 7.822392764149652e-06, + "loss": 0.2293, + "step": 16641 + }, + { + "epoch": 1.025718936516549, + "grad_norm": 6.177892208099365, + "learning_rate": 7.822255721529397e-06, + "loss": 0.3074, + "step": 16642 + }, + { + "epoch": 1.025732501356484, + "grad_norm": 4.000006198883057, + "learning_rate": 7.822118678909142e-06, + "loss": 0.2268, + "step": 16643 + }, + { + "epoch": 1.0257460661964188, + "grad_norm": 5.717620372772217, + "learning_rate": 7.821981636288887e-06, + "loss": 0.3093, + "step": 16644 + }, + { + "epoch": 1.0257596310363537, + "grad_norm": 6.108405113220215, + "learning_rate": 7.821844593668632e-06, + "loss": 0.3275, + "step": 16645 + }, + { + "epoch": 1.0257731958762886, + "grad_norm": 7.379837512969971, + "learning_rate": 7.821707551048376e-06, + "loss": 0.3233, + "step": 16646 + }, + { + "epoch": 1.0257867607162237, + "grad_norm": 6.183061122894287, + "learning_rate": 7.821570508428121e-06, + "loss": 0.361, + "step": 16647 + }, + { + "epoch": 1.0258003255561585, + "grad_norm": 6.308075904846191, + "learning_rate": 7.821433465807868e-06, + "loss": 0.4585, + "step": 16648 + }, + { + "epoch": 1.0258138903960934, + "grad_norm": 6.347921371459961, + "learning_rate": 7.821296423187611e-06, + "loss": 0.5029, + "step": 16649 + }, + { + "epoch": 1.0258274552360283, + "grad_norm": 7.572250843048096, + "learning_rate": 7.821159380567356e-06, + "loss": 0.3391, + "step": 16650 + }, + { + "epoch": 1.0258410200759631, + "grad_norm": 3.8917136192321777, + "learning_rate": 7.821022337947102e-06, + "loss": 0.2139, + "step": 16651 + }, + { + "epoch": 1.025854584915898, + "grad_norm": 6.089853763580322, + "learning_rate": 7.820885295326848e-06, + "loss": 0.2882, + "step": 16652 + }, + { + "epoch": 1.0258681497558328, + "grad_norm": 6.114630699157715, + "learning_rate": 7.820748252706592e-06, + "loss": 0.3424, + "step": 16653 + }, + { + "epoch": 1.0258817145957677, + "grad_norm": 5.57623291015625, + "learning_rate": 7.820611210086337e-06, + "loss": 0.3239, + "step": 16654 + }, + { + "epoch": 1.0258952794357026, + "grad_norm": 5.1699748039245605, + "learning_rate": 7.820474167466082e-06, + "loss": 0.3441, + "step": 16655 + }, + { + "epoch": 1.0259088442756374, + "grad_norm": 7.210094928741455, + "learning_rate": 7.820337124845828e-06, + "loss": 0.3642, + "step": 16656 + }, + { + "epoch": 1.0259224091155725, + "grad_norm": 5.188753604888916, + "learning_rate": 7.820200082225573e-06, + "loss": 0.4388, + "step": 16657 + }, + { + "epoch": 1.0259359739555074, + "grad_norm": 6.342613220214844, + "learning_rate": 7.820063039605318e-06, + "loss": 0.3045, + "step": 16658 + }, + { + "epoch": 1.0259495387954423, + "grad_norm": 5.388808250427246, + "learning_rate": 7.819925996985063e-06, + "loss": 0.2147, + "step": 16659 + }, + { + "epoch": 1.0259631036353771, + "grad_norm": 7.247949123382568, + "learning_rate": 7.819788954364808e-06, + "loss": 0.3416, + "step": 16660 + }, + { + "epoch": 1.025976668475312, + "grad_norm": 8.062260627746582, + "learning_rate": 7.819651911744553e-06, + "loss": 0.4142, + "step": 16661 + }, + { + "epoch": 1.0259902333152469, + "grad_norm": 7.349555492401123, + "learning_rate": 7.819514869124299e-06, + "loss": 0.4961, + "step": 16662 + }, + { + "epoch": 1.0260037981551817, + "grad_norm": 5.841933727264404, + "learning_rate": 7.819377826504044e-06, + "loss": 0.3489, + "step": 16663 + }, + { + "epoch": 1.0260173629951166, + "grad_norm": 7.36085844039917, + "learning_rate": 7.819240783883787e-06, + "loss": 0.5197, + "step": 16664 + }, + { + "epoch": 1.0260309278350515, + "grad_norm": 7.142892837524414, + "learning_rate": 7.819103741263534e-06, + "loss": 0.5156, + "step": 16665 + }, + { + "epoch": 1.0260444926749865, + "grad_norm": 5.265274524688721, + "learning_rate": 7.81896669864328e-06, + "loss": 0.3187, + "step": 16666 + }, + { + "epoch": 1.0260580575149214, + "grad_norm": 7.110794544219971, + "learning_rate": 7.818829656023024e-06, + "loss": 0.4381, + "step": 16667 + }, + { + "epoch": 1.0260716223548563, + "grad_norm": 5.435785293579102, + "learning_rate": 7.818692613402768e-06, + "loss": 0.3531, + "step": 16668 + }, + { + "epoch": 1.0260851871947911, + "grad_norm": 6.643973350524902, + "learning_rate": 7.818555570782515e-06, + "loss": 0.301, + "step": 16669 + }, + { + "epoch": 1.026098752034726, + "grad_norm": 9.073576927185059, + "learning_rate": 7.81841852816226e-06, + "loss": 0.4774, + "step": 16670 + }, + { + "epoch": 1.0261123168746609, + "grad_norm": 5.936556339263916, + "learning_rate": 7.818281485542003e-06, + "loss": 0.3234, + "step": 16671 + }, + { + "epoch": 1.0261258817145957, + "grad_norm": 8.316442489624023, + "learning_rate": 7.818144442921749e-06, + "loss": 0.4585, + "step": 16672 + }, + { + "epoch": 1.0261394465545306, + "grad_norm": 5.5418219566345215, + "learning_rate": 7.818007400301494e-06, + "loss": 0.206, + "step": 16673 + }, + { + "epoch": 1.0261530113944655, + "grad_norm": 8.063094139099121, + "learning_rate": 7.817870357681239e-06, + "loss": 0.362, + "step": 16674 + }, + { + "epoch": 1.0261665762344003, + "grad_norm": 7.5179338455200195, + "learning_rate": 7.817733315060984e-06, + "loss": 0.4582, + "step": 16675 + }, + { + "epoch": 1.0261801410743354, + "grad_norm": 6.505186557769775, + "learning_rate": 7.81759627244073e-06, + "loss": 0.2932, + "step": 16676 + }, + { + "epoch": 1.0261937059142703, + "grad_norm": 5.370854377746582, + "learning_rate": 7.817459229820475e-06, + "loss": 0.2412, + "step": 16677 + }, + { + "epoch": 1.0262072707542051, + "grad_norm": 4.371764659881592, + "learning_rate": 7.81732218720022e-06, + "loss": 0.2159, + "step": 16678 + }, + { + "epoch": 1.02622083559414, + "grad_norm": 6.877368927001953, + "learning_rate": 7.817185144579965e-06, + "loss": 0.2351, + "step": 16679 + }, + { + "epoch": 1.0262344004340749, + "grad_norm": 6.6314592361450195, + "learning_rate": 7.81704810195971e-06, + "loss": 0.3596, + "step": 16680 + }, + { + "epoch": 1.0262479652740097, + "grad_norm": 5.565360069274902, + "learning_rate": 7.816911059339455e-06, + "loss": 0.2531, + "step": 16681 + }, + { + "epoch": 1.0262615301139446, + "grad_norm": 6.296558380126953, + "learning_rate": 7.8167740167192e-06, + "loss": 0.3234, + "step": 16682 + }, + { + "epoch": 1.0262750949538795, + "grad_norm": 6.416445255279541, + "learning_rate": 7.816636974098946e-06, + "loss": 0.2405, + "step": 16683 + }, + { + "epoch": 1.0262886597938143, + "grad_norm": 6.017696857452393, + "learning_rate": 7.81649993147869e-06, + "loss": 0.3293, + "step": 16684 + }, + { + "epoch": 1.0263022246337494, + "grad_norm": 4.973308086395264, + "learning_rate": 7.816362888858436e-06, + "loss": 0.2535, + "step": 16685 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 6.404247283935547, + "learning_rate": 7.816225846238181e-06, + "loss": 0.3845, + "step": 16686 + }, + { + "epoch": 1.0263293543136192, + "grad_norm": 5.803933620452881, + "learning_rate": 7.816088803617926e-06, + "loss": 0.2675, + "step": 16687 + }, + { + "epoch": 1.026342919153554, + "grad_norm": 6.749136924743652, + "learning_rate": 7.815951760997672e-06, + "loss": 0.4137, + "step": 16688 + }, + { + "epoch": 1.0263564839934889, + "grad_norm": 6.873188018798828, + "learning_rate": 7.815814718377415e-06, + "loss": 0.4085, + "step": 16689 + }, + { + "epoch": 1.0263700488334238, + "grad_norm": 9.235559463500977, + "learning_rate": 7.81567767575716e-06, + "loss": 0.4152, + "step": 16690 + }, + { + "epoch": 1.0263836136733586, + "grad_norm": 5.214183807373047, + "learning_rate": 7.815540633136907e-06, + "loss": 0.2298, + "step": 16691 + }, + { + "epoch": 1.0263971785132935, + "grad_norm": 8.992780685424805, + "learning_rate": 7.815403590516652e-06, + "loss": 0.4145, + "step": 16692 + }, + { + "epoch": 1.0264107433532283, + "grad_norm": 4.600347995758057, + "learning_rate": 7.815266547896396e-06, + "loss": 0.181, + "step": 16693 + }, + { + "epoch": 1.0264243081931632, + "grad_norm": 5.6876373291015625, + "learning_rate": 7.815129505276141e-06, + "loss": 0.2137, + "step": 16694 + }, + { + "epoch": 1.0264378730330983, + "grad_norm": 7.240267276763916, + "learning_rate": 7.814992462655888e-06, + "loss": 0.4493, + "step": 16695 + }, + { + "epoch": 1.0264514378730332, + "grad_norm": 5.6565680503845215, + "learning_rate": 7.814855420035631e-06, + "loss": 0.3085, + "step": 16696 + }, + { + "epoch": 1.026465002712968, + "grad_norm": 6.697004795074463, + "learning_rate": 7.814718377415376e-06, + "loss": 0.389, + "step": 16697 + }, + { + "epoch": 1.026478567552903, + "grad_norm": 5.221721172332764, + "learning_rate": 7.814581334795122e-06, + "loss": 0.2638, + "step": 16698 + }, + { + "epoch": 1.0264921323928378, + "grad_norm": 7.796308994293213, + "learning_rate": 7.814444292174867e-06, + "loss": 0.387, + "step": 16699 + }, + { + "epoch": 1.0265056972327726, + "grad_norm": 6.16461181640625, + "learning_rate": 7.814307249554612e-06, + "loss": 0.3001, + "step": 16700 + }, + { + "epoch": 1.0265192620727075, + "grad_norm": 5.6640777587890625, + "learning_rate": 7.814170206934357e-06, + "loss": 0.4016, + "step": 16701 + }, + { + "epoch": 1.0265328269126424, + "grad_norm": 6.510904788970947, + "learning_rate": 7.814033164314102e-06, + "loss": 0.2464, + "step": 16702 + }, + { + "epoch": 1.0265463917525772, + "grad_norm": 3.7525475025177, + "learning_rate": 7.813896121693848e-06, + "loss": 0.1684, + "step": 16703 + }, + { + "epoch": 1.0265599565925123, + "grad_norm": 10.542823791503906, + "learning_rate": 7.813759079073593e-06, + "loss": 0.3086, + "step": 16704 + }, + { + "epoch": 1.0265735214324472, + "grad_norm": 6.257174968719482, + "learning_rate": 7.813622036453338e-06, + "loss": 0.486, + "step": 16705 + }, + { + "epoch": 1.026587086272382, + "grad_norm": 9.499874114990234, + "learning_rate": 7.813484993833083e-06, + "loss": 0.7401, + "step": 16706 + }, + { + "epoch": 1.026600651112317, + "grad_norm": 5.7649102210998535, + "learning_rate": 7.813347951212828e-06, + "loss": 0.2465, + "step": 16707 + }, + { + "epoch": 1.0266142159522518, + "grad_norm": 7.684197902679443, + "learning_rate": 7.813210908592573e-06, + "loss": 0.4205, + "step": 16708 + }, + { + "epoch": 1.0266277807921866, + "grad_norm": 5.693502426147461, + "learning_rate": 7.813073865972319e-06, + "loss": 0.3574, + "step": 16709 + }, + { + "epoch": 1.0266413456321215, + "grad_norm": 5.398646831512451, + "learning_rate": 7.812936823352064e-06, + "loss": 0.3011, + "step": 16710 + }, + { + "epoch": 1.0266549104720564, + "grad_norm": 4.829864025115967, + "learning_rate": 7.812799780731807e-06, + "loss": 0.2937, + "step": 16711 + }, + { + "epoch": 1.0266684753119912, + "grad_norm": 4.0429463386535645, + "learning_rate": 7.812662738111554e-06, + "loss": 0.3295, + "step": 16712 + }, + { + "epoch": 1.026682040151926, + "grad_norm": 6.461750030517578, + "learning_rate": 7.8125256954913e-06, + "loss": 0.4308, + "step": 16713 + }, + { + "epoch": 1.0266956049918612, + "grad_norm": 5.981158256530762, + "learning_rate": 7.812388652871043e-06, + "loss": 0.3198, + "step": 16714 + }, + { + "epoch": 1.026709169831796, + "grad_norm": 5.590553283691406, + "learning_rate": 7.812251610250788e-06, + "loss": 0.3191, + "step": 16715 + }, + { + "epoch": 1.026722734671731, + "grad_norm": 6.14408540725708, + "learning_rate": 7.812114567630533e-06, + "loss": 0.3045, + "step": 16716 + }, + { + "epoch": 1.0267362995116658, + "grad_norm": 4.93314790725708, + "learning_rate": 7.811977525010278e-06, + "loss": 0.3538, + "step": 16717 + }, + { + "epoch": 1.0267498643516006, + "grad_norm": 4.4708404541015625, + "learning_rate": 7.811840482390024e-06, + "loss": 0.1814, + "step": 16718 + }, + { + "epoch": 1.0267634291915355, + "grad_norm": 4.495426177978516, + "learning_rate": 7.811703439769769e-06, + "loss": 0.3147, + "step": 16719 + }, + { + "epoch": 1.0267769940314704, + "grad_norm": 5.663232803344727, + "learning_rate": 7.811566397149514e-06, + "loss": 0.334, + "step": 16720 + }, + { + "epoch": 1.0267905588714052, + "grad_norm": 4.9195780754089355, + "learning_rate": 7.811429354529259e-06, + "loss": 0.252, + "step": 16721 + }, + { + "epoch": 1.02680412371134, + "grad_norm": 6.568210601806641, + "learning_rate": 7.811292311909004e-06, + "loss": 0.5668, + "step": 16722 + }, + { + "epoch": 1.0268176885512752, + "grad_norm": 6.35611629486084, + "learning_rate": 7.81115526928875e-06, + "loss": 0.3513, + "step": 16723 + }, + { + "epoch": 1.02683125339121, + "grad_norm": 7.171986103057861, + "learning_rate": 7.811018226668495e-06, + "loss": 0.4841, + "step": 16724 + }, + { + "epoch": 1.026844818231145, + "grad_norm": 5.941717624664307, + "learning_rate": 7.81088118404824e-06, + "loss": 0.3059, + "step": 16725 + }, + { + "epoch": 1.0268583830710798, + "grad_norm": 3.8973731994628906, + "learning_rate": 7.810744141427985e-06, + "loss": 0.3008, + "step": 16726 + }, + { + "epoch": 1.0268719479110147, + "grad_norm": 5.016443729400635, + "learning_rate": 7.81060709880773e-06, + "loss": 0.3296, + "step": 16727 + }, + { + "epoch": 1.0268855127509495, + "grad_norm": 6.925873279571533, + "learning_rate": 7.810470056187475e-06, + "loss": 0.3401, + "step": 16728 + }, + { + "epoch": 1.0268990775908844, + "grad_norm": 5.1914873123168945, + "learning_rate": 7.810333013567219e-06, + "loss": 0.347, + "step": 16729 + }, + { + "epoch": 1.0269126424308193, + "grad_norm": 5.161990165710449, + "learning_rate": 7.810195970946966e-06, + "loss": 0.3737, + "step": 16730 + }, + { + "epoch": 1.0269262072707541, + "grad_norm": 7.337595462799072, + "learning_rate": 7.81005892832671e-06, + "loss": 0.4561, + "step": 16731 + }, + { + "epoch": 1.026939772110689, + "grad_norm": 6.673539161682129, + "learning_rate": 7.809921885706454e-06, + "loss": 0.4222, + "step": 16732 + }, + { + "epoch": 1.026953336950624, + "grad_norm": 4.91811466217041, + "learning_rate": 7.8097848430862e-06, + "loss": 0.3228, + "step": 16733 + }, + { + "epoch": 1.026966901790559, + "grad_norm": 9.25660514831543, + "learning_rate": 7.809647800465946e-06, + "loss": 0.6455, + "step": 16734 + }, + { + "epoch": 1.0269804666304938, + "grad_norm": 4.751509189605713, + "learning_rate": 7.809510757845692e-06, + "loss": 0.2527, + "step": 16735 + }, + { + "epoch": 1.0269940314704287, + "grad_norm": 5.602008819580078, + "learning_rate": 7.809373715225435e-06, + "loss": 0.2831, + "step": 16736 + }, + { + "epoch": 1.0270075963103635, + "grad_norm": 5.83569860458374, + "learning_rate": 7.80923667260518e-06, + "loss": 0.3719, + "step": 16737 + }, + { + "epoch": 1.0270211611502984, + "grad_norm": 4.137823104858398, + "learning_rate": 7.809099629984927e-06, + "loss": 0.195, + "step": 16738 + }, + { + "epoch": 1.0270347259902333, + "grad_norm": 6.511995792388916, + "learning_rate": 7.80896258736467e-06, + "loss": 0.4003, + "step": 16739 + }, + { + "epoch": 1.0270482908301681, + "grad_norm": 4.357247352600098, + "learning_rate": 7.808825544744416e-06, + "loss": 0.3897, + "step": 16740 + }, + { + "epoch": 1.027061855670103, + "grad_norm": 5.817166805267334, + "learning_rate": 7.808688502124161e-06, + "loss": 0.318, + "step": 16741 + }, + { + "epoch": 1.027075420510038, + "grad_norm": 6.173155307769775, + "learning_rate": 7.808551459503906e-06, + "loss": 0.4538, + "step": 16742 + }, + { + "epoch": 1.027088985349973, + "grad_norm": 5.00541877746582, + "learning_rate": 7.808414416883651e-06, + "loss": 0.2511, + "step": 16743 + }, + { + "epoch": 1.0271025501899078, + "grad_norm": 5.213761329650879, + "learning_rate": 7.808277374263396e-06, + "loss": 0.2852, + "step": 16744 + }, + { + "epoch": 1.0271161150298427, + "grad_norm": 3.8460497856140137, + "learning_rate": 7.808140331643142e-06, + "loss": 0.2311, + "step": 16745 + }, + { + "epoch": 1.0271296798697775, + "grad_norm": 6.188290596008301, + "learning_rate": 7.808003289022887e-06, + "loss": 0.2698, + "step": 16746 + }, + { + "epoch": 1.0271432447097124, + "grad_norm": 6.407607555389404, + "learning_rate": 7.807866246402632e-06, + "loss": 0.4442, + "step": 16747 + }, + { + "epoch": 1.0271568095496473, + "grad_norm": 5.537024974822998, + "learning_rate": 7.807729203782377e-06, + "loss": 0.3377, + "step": 16748 + }, + { + "epoch": 1.0271703743895821, + "grad_norm": 4.789947509765625, + "learning_rate": 7.807592161162122e-06, + "loss": 0.416, + "step": 16749 + }, + { + "epoch": 1.027183939229517, + "grad_norm": 6.378229141235352, + "learning_rate": 7.807455118541868e-06, + "loss": 0.3268, + "step": 16750 + }, + { + "epoch": 1.0271975040694519, + "grad_norm": 5.8842267990112305, + "learning_rate": 7.807318075921613e-06, + "loss": 0.4188, + "step": 16751 + }, + { + "epoch": 1.027211068909387, + "grad_norm": 5.339206218719482, + "learning_rate": 7.807181033301358e-06, + "loss": 0.3463, + "step": 16752 + }, + { + "epoch": 1.0272246337493218, + "grad_norm": 4.004238605499268, + "learning_rate": 7.807043990681103e-06, + "loss": 0.2687, + "step": 16753 + }, + { + "epoch": 1.0272381985892567, + "grad_norm": 5.364803314208984, + "learning_rate": 7.806906948060847e-06, + "loss": 0.324, + "step": 16754 + }, + { + "epoch": 1.0272517634291916, + "grad_norm": 4.6053080558776855, + "learning_rate": 7.806769905440593e-06, + "loss": 0.3443, + "step": 16755 + }, + { + "epoch": 1.0272653282691264, + "grad_norm": 5.021337032318115, + "learning_rate": 7.806632862820339e-06, + "loss": 0.4695, + "step": 16756 + }, + { + "epoch": 1.0272788931090613, + "grad_norm": 5.260613441467285, + "learning_rate": 7.806495820200082e-06, + "loss": 0.2848, + "step": 16757 + }, + { + "epoch": 1.0272924579489962, + "grad_norm": 5.217787265777588, + "learning_rate": 7.806358777579827e-06, + "loss": 0.2541, + "step": 16758 + }, + { + "epoch": 1.027306022788931, + "grad_norm": 5.639203071594238, + "learning_rate": 7.806221734959572e-06, + "loss": 0.3486, + "step": 16759 + }, + { + "epoch": 1.0273195876288659, + "grad_norm": 4.499730587005615, + "learning_rate": 7.80608469233932e-06, + "loss": 0.2826, + "step": 16760 + }, + { + "epoch": 1.027333152468801, + "grad_norm": 4.67410945892334, + "learning_rate": 7.805947649719063e-06, + "loss": 0.348, + "step": 16761 + }, + { + "epoch": 1.0273467173087358, + "grad_norm": 6.573784828186035, + "learning_rate": 7.805810607098808e-06, + "loss": 0.4791, + "step": 16762 + }, + { + "epoch": 1.0273602821486707, + "grad_norm": 4.863630294799805, + "learning_rate": 7.805673564478553e-06, + "loss": 0.3967, + "step": 16763 + }, + { + "epoch": 1.0273738469886056, + "grad_norm": 4.7612128257751465, + "learning_rate": 7.805536521858298e-06, + "loss": 0.3288, + "step": 16764 + }, + { + "epoch": 1.0273874118285404, + "grad_norm": 5.191773891448975, + "learning_rate": 7.805399479238044e-06, + "loss": 0.2907, + "step": 16765 + }, + { + "epoch": 1.0274009766684753, + "grad_norm": 3.940032720565796, + "learning_rate": 7.805262436617789e-06, + "loss": 0.2422, + "step": 16766 + }, + { + "epoch": 1.0274145415084102, + "grad_norm": 5.5391764640808105, + "learning_rate": 7.805125393997534e-06, + "loss": 0.3376, + "step": 16767 + }, + { + "epoch": 1.027428106348345, + "grad_norm": 5.27973747253418, + "learning_rate": 7.804988351377279e-06, + "loss": 0.3221, + "step": 16768 + }, + { + "epoch": 1.02744167118828, + "grad_norm": 5.686532020568848, + "learning_rate": 7.804851308757024e-06, + "loss": 0.4458, + "step": 16769 + }, + { + "epoch": 1.0274552360282148, + "grad_norm": 3.8122458457946777, + "learning_rate": 7.80471426613677e-06, + "loss": 0.2412, + "step": 16770 + }, + { + "epoch": 1.0274688008681498, + "grad_norm": 6.824237823486328, + "learning_rate": 7.804577223516515e-06, + "loss": 0.4027, + "step": 16771 + }, + { + "epoch": 1.0274823657080847, + "grad_norm": 5.028258800506592, + "learning_rate": 7.804440180896258e-06, + "loss": 0.3005, + "step": 16772 + }, + { + "epoch": 1.0274959305480196, + "grad_norm": 4.7146501541137695, + "learning_rate": 7.804303138276005e-06, + "loss": 0.304, + "step": 16773 + }, + { + "epoch": 1.0275094953879544, + "grad_norm": 5.2629075050354, + "learning_rate": 7.80416609565575e-06, + "loss": 0.3206, + "step": 16774 + }, + { + "epoch": 1.0275230602278893, + "grad_norm": 5.113465785980225, + "learning_rate": 7.804029053035495e-06, + "loss": 0.3257, + "step": 16775 + }, + { + "epoch": 1.0275366250678242, + "grad_norm": 3.8032894134521484, + "learning_rate": 7.803892010415239e-06, + "loss": 0.2441, + "step": 16776 + }, + { + "epoch": 1.027550189907759, + "grad_norm": 5.538205623626709, + "learning_rate": 7.803754967794986e-06, + "loss": 0.2747, + "step": 16777 + }, + { + "epoch": 1.027563754747694, + "grad_norm": 4.066879749298096, + "learning_rate": 7.803617925174731e-06, + "loss": 0.2596, + "step": 16778 + }, + { + "epoch": 1.0275773195876288, + "grad_norm": 3.7514901161193848, + "learning_rate": 7.803480882554474e-06, + "loss": 0.2486, + "step": 16779 + }, + { + "epoch": 1.0275908844275639, + "grad_norm": 5.155344486236572, + "learning_rate": 7.80334383993422e-06, + "loss": 0.1874, + "step": 16780 + }, + { + "epoch": 1.0276044492674987, + "grad_norm": 4.309585094451904, + "learning_rate": 7.803206797313966e-06, + "loss": 0.2972, + "step": 16781 + }, + { + "epoch": 1.0276180141074336, + "grad_norm": 5.351478099822998, + "learning_rate": 7.80306975469371e-06, + "loss": 0.3133, + "step": 16782 + }, + { + "epoch": 1.0276315789473685, + "grad_norm": 4.839629173278809, + "learning_rate": 7.802932712073455e-06, + "loss": 0.2175, + "step": 16783 + }, + { + "epoch": 1.0276451437873033, + "grad_norm": 6.831620216369629, + "learning_rate": 7.8027956694532e-06, + "loss": 0.364, + "step": 16784 + }, + { + "epoch": 1.0276587086272382, + "grad_norm": 4.854011535644531, + "learning_rate": 7.802658626832945e-06, + "loss": 0.2677, + "step": 16785 + }, + { + "epoch": 1.027672273467173, + "grad_norm": 5.312727451324463, + "learning_rate": 7.80252158421269e-06, + "loss": 0.2924, + "step": 16786 + }, + { + "epoch": 1.027685838307108, + "grad_norm": 4.618732452392578, + "learning_rate": 7.802384541592436e-06, + "loss": 0.2137, + "step": 16787 + }, + { + "epoch": 1.0276994031470428, + "grad_norm": 5.1610188484191895, + "learning_rate": 7.802247498972181e-06, + "loss": 0.2849, + "step": 16788 + }, + { + "epoch": 1.0277129679869779, + "grad_norm": 3.6220693588256836, + "learning_rate": 7.802110456351926e-06, + "loss": 0.2743, + "step": 16789 + }, + { + "epoch": 1.0277265328269127, + "grad_norm": 5.741916179656982, + "learning_rate": 7.801973413731671e-06, + "loss": 0.2283, + "step": 16790 + }, + { + "epoch": 1.0277400976668476, + "grad_norm": 4.401331901550293, + "learning_rate": 7.801836371111416e-06, + "loss": 0.2598, + "step": 16791 + }, + { + "epoch": 1.0277536625067825, + "grad_norm": 4.70026159286499, + "learning_rate": 7.801699328491162e-06, + "loss": 0.1847, + "step": 16792 + }, + { + "epoch": 1.0277672273467173, + "grad_norm": 5.807643890380859, + "learning_rate": 7.801562285870907e-06, + "loss": 0.2226, + "step": 16793 + }, + { + "epoch": 1.0277807921866522, + "grad_norm": 6.5665717124938965, + "learning_rate": 7.801425243250652e-06, + "loss": 0.4687, + "step": 16794 + }, + { + "epoch": 1.027794357026587, + "grad_norm": 4.027847766876221, + "learning_rate": 7.801288200630397e-06, + "loss": 0.2834, + "step": 16795 + }, + { + "epoch": 1.027807921866522, + "grad_norm": 4.8027191162109375, + "learning_rate": 7.801151158010142e-06, + "loss": 0.2953, + "step": 16796 + }, + { + "epoch": 1.0278214867064568, + "grad_norm": 5.929208755493164, + "learning_rate": 7.801014115389886e-06, + "loss": 0.296, + "step": 16797 + }, + { + "epoch": 1.0278350515463917, + "grad_norm": 6.596484661102295, + "learning_rate": 7.800877072769631e-06, + "loss": 0.3125, + "step": 16798 + }, + { + "epoch": 1.0278486163863267, + "grad_norm": 5.7578206062316895, + "learning_rate": 7.800740030149378e-06, + "loss": 0.4313, + "step": 16799 + }, + { + "epoch": 1.0278621812262616, + "grad_norm": 6.24771785736084, + "learning_rate": 7.800602987529123e-06, + "loss": 0.3473, + "step": 16800 + }, + { + "epoch": 1.0278757460661965, + "grad_norm": 5.540411949157715, + "learning_rate": 7.800465944908867e-06, + "loss": 0.2934, + "step": 16801 + }, + { + "epoch": 1.0278893109061313, + "grad_norm": 4.841291904449463, + "learning_rate": 7.800328902288612e-06, + "loss": 0.2574, + "step": 16802 + }, + { + "epoch": 1.0279028757460662, + "grad_norm": 4.890781879425049, + "learning_rate": 7.800191859668359e-06, + "loss": 0.292, + "step": 16803 + }, + { + "epoch": 1.027916440586001, + "grad_norm": 4.518641948699951, + "learning_rate": 7.800054817048102e-06, + "loss": 0.2987, + "step": 16804 + }, + { + "epoch": 1.027930005425936, + "grad_norm": 4.79142427444458, + "learning_rate": 7.799917774427847e-06, + "loss": 0.2466, + "step": 16805 + }, + { + "epoch": 1.0279435702658708, + "grad_norm": 6.692363739013672, + "learning_rate": 7.799780731807592e-06, + "loss": 0.2699, + "step": 16806 + }, + { + "epoch": 1.0279571351058057, + "grad_norm": 5.187055587768555, + "learning_rate": 7.799643689187338e-06, + "loss": 0.3323, + "step": 16807 + }, + { + "epoch": 1.0279706999457408, + "grad_norm": 5.997653007507324, + "learning_rate": 7.799506646567083e-06, + "loss": 0.3388, + "step": 16808 + }, + { + "epoch": 1.0279842647856756, + "grad_norm": 4.834702014923096, + "learning_rate": 7.799369603946828e-06, + "loss": 0.2962, + "step": 16809 + }, + { + "epoch": 1.0279978296256105, + "grad_norm": 7.085797309875488, + "learning_rate": 7.799232561326573e-06, + "loss": 0.2996, + "step": 16810 + }, + { + "epoch": 1.0280113944655453, + "grad_norm": 5.465210437774658, + "learning_rate": 7.799095518706318e-06, + "loss": 0.3493, + "step": 16811 + }, + { + "epoch": 1.0280249593054802, + "grad_norm": 6.375496864318848, + "learning_rate": 7.798958476086064e-06, + "loss": 0.356, + "step": 16812 + }, + { + "epoch": 1.028038524145415, + "grad_norm": 5.129861354827881, + "learning_rate": 7.798821433465809e-06, + "loss": 0.2427, + "step": 16813 + }, + { + "epoch": 1.02805208898535, + "grad_norm": 5.170408725738525, + "learning_rate": 7.798684390845554e-06, + "loss": 0.2171, + "step": 16814 + }, + { + "epoch": 1.0280656538252848, + "grad_norm": 5.723914623260498, + "learning_rate": 7.798547348225297e-06, + "loss": 0.2728, + "step": 16815 + }, + { + "epoch": 1.0280792186652197, + "grad_norm": 6.047379970550537, + "learning_rate": 7.798410305605044e-06, + "loss": 0.2948, + "step": 16816 + }, + { + "epoch": 1.0280927835051545, + "grad_norm": 5.304862976074219, + "learning_rate": 7.79827326298479e-06, + "loss": 0.3194, + "step": 16817 + }, + { + "epoch": 1.0281063483450896, + "grad_norm": 6.8103132247924805, + "learning_rate": 7.798136220364535e-06, + "loss": 0.3012, + "step": 16818 + }, + { + "epoch": 1.0281199131850245, + "grad_norm": 7.244378089904785, + "learning_rate": 7.797999177744278e-06, + "loss": 0.2505, + "step": 16819 + }, + { + "epoch": 1.0281334780249594, + "grad_norm": 4.822791576385498, + "learning_rate": 7.797862135124025e-06, + "loss": 0.2157, + "step": 16820 + }, + { + "epoch": 1.0281470428648942, + "grad_norm": 6.101291179656982, + "learning_rate": 7.79772509250377e-06, + "loss": 0.3336, + "step": 16821 + }, + { + "epoch": 1.028160607704829, + "grad_norm": 6.322148323059082, + "learning_rate": 7.797588049883514e-06, + "loss": 0.3338, + "step": 16822 + }, + { + "epoch": 1.028174172544764, + "grad_norm": 5.082040786743164, + "learning_rate": 7.797451007263259e-06, + "loss": 0.2038, + "step": 16823 + }, + { + "epoch": 1.0281877373846988, + "grad_norm": 6.517049312591553, + "learning_rate": 7.797313964643006e-06, + "loss": 0.2165, + "step": 16824 + }, + { + "epoch": 1.0282013022246337, + "grad_norm": 5.620024681091309, + "learning_rate": 7.79717692202275e-06, + "loss": 0.3618, + "step": 16825 + }, + { + "epoch": 1.0282148670645685, + "grad_norm": 5.938526630401611, + "learning_rate": 7.797039879402494e-06, + "loss": 0.1921, + "step": 16826 + }, + { + "epoch": 1.0282284319045036, + "grad_norm": 4.923459529876709, + "learning_rate": 7.79690283678224e-06, + "loss": 0.3448, + "step": 16827 + }, + { + "epoch": 1.0282419967444385, + "grad_norm": 6.038447856903076, + "learning_rate": 7.796765794161985e-06, + "loss": 0.2844, + "step": 16828 + }, + { + "epoch": 1.0282555615843734, + "grad_norm": 5.410032272338867, + "learning_rate": 7.79662875154173e-06, + "loss": 0.2159, + "step": 16829 + }, + { + "epoch": 1.0282691264243082, + "grad_norm": 7.789093971252441, + "learning_rate": 7.796491708921475e-06, + "loss": 0.5246, + "step": 16830 + }, + { + "epoch": 1.028282691264243, + "grad_norm": 6.456366062164307, + "learning_rate": 7.79635466630122e-06, + "loss": 0.2978, + "step": 16831 + }, + { + "epoch": 1.028296256104178, + "grad_norm": 6.276018142700195, + "learning_rate": 7.796217623680965e-06, + "loss": 0.3827, + "step": 16832 + }, + { + "epoch": 1.0283098209441128, + "grad_norm": 7.016636848449707, + "learning_rate": 7.79608058106071e-06, + "loss": 0.4425, + "step": 16833 + }, + { + "epoch": 1.0283233857840477, + "grad_norm": 4.648826599121094, + "learning_rate": 7.795943538440456e-06, + "loss": 0.2128, + "step": 16834 + }, + { + "epoch": 1.0283369506239826, + "grad_norm": 6.7057108879089355, + "learning_rate": 7.795806495820201e-06, + "loss": 0.4431, + "step": 16835 + }, + { + "epoch": 1.0283505154639174, + "grad_norm": 5.095155715942383, + "learning_rate": 7.795669453199946e-06, + "loss": 0.2449, + "step": 16836 + }, + { + "epoch": 1.0283640803038525, + "grad_norm": 5.79716157913208, + "learning_rate": 7.795532410579691e-06, + "loss": 0.3994, + "step": 16837 + }, + { + "epoch": 1.0283776451437874, + "grad_norm": 6.15153694152832, + "learning_rate": 7.795395367959437e-06, + "loss": 0.4333, + "step": 16838 + }, + { + "epoch": 1.0283912099837222, + "grad_norm": 6.0289506912231445, + "learning_rate": 7.795258325339182e-06, + "loss": 0.4161, + "step": 16839 + }, + { + "epoch": 1.028404774823657, + "grad_norm": 7.091749668121338, + "learning_rate": 7.795121282718925e-06, + "loss": 0.3314, + "step": 16840 + }, + { + "epoch": 1.028418339663592, + "grad_norm": 6.375610828399658, + "learning_rate": 7.79498424009867e-06, + "loss": 0.2813, + "step": 16841 + }, + { + "epoch": 1.0284319045035268, + "grad_norm": 5.102758884429932, + "learning_rate": 7.794847197478417e-06, + "loss": 0.3587, + "step": 16842 + }, + { + "epoch": 1.0284454693434617, + "grad_norm": 6.02579927444458, + "learning_rate": 7.794710154858162e-06, + "loss": 0.3831, + "step": 16843 + }, + { + "epoch": 1.0284590341833966, + "grad_norm": 5.872684955596924, + "learning_rate": 7.794573112237906e-06, + "loss": 0.3824, + "step": 16844 + }, + { + "epoch": 1.0284725990233314, + "grad_norm": 6.388395309448242, + "learning_rate": 7.794436069617651e-06, + "loss": 0.3644, + "step": 16845 + }, + { + "epoch": 1.0284861638632665, + "grad_norm": 6.047447681427002, + "learning_rate": 7.794299026997398e-06, + "loss": 0.299, + "step": 16846 + }, + { + "epoch": 1.0284997287032014, + "grad_norm": 4.939642906188965, + "learning_rate": 7.794161984377141e-06, + "loss": 0.3963, + "step": 16847 + }, + { + "epoch": 1.0285132935431363, + "grad_norm": 8.117121696472168, + "learning_rate": 7.794024941756887e-06, + "loss": 0.3229, + "step": 16848 + }, + { + "epoch": 1.0285268583830711, + "grad_norm": 6.52750301361084, + "learning_rate": 7.793887899136632e-06, + "loss": 0.295, + "step": 16849 + }, + { + "epoch": 1.028540423223006, + "grad_norm": 7.170820713043213, + "learning_rate": 7.793750856516377e-06, + "loss": 0.3654, + "step": 16850 + }, + { + "epoch": 1.0285539880629408, + "grad_norm": 6.513893127441406, + "learning_rate": 7.793613813896122e-06, + "loss": 0.4547, + "step": 16851 + }, + { + "epoch": 1.0285675529028757, + "grad_norm": 5.564383029937744, + "learning_rate": 7.793476771275867e-06, + "loss": 0.4352, + "step": 16852 + }, + { + "epoch": 1.0285811177428106, + "grad_norm": 4.393758296966553, + "learning_rate": 7.793339728655613e-06, + "loss": 0.2999, + "step": 16853 + }, + { + "epoch": 1.0285946825827454, + "grad_norm": 9.125061988830566, + "learning_rate": 7.793202686035358e-06, + "loss": 0.5562, + "step": 16854 + }, + { + "epoch": 1.0286082474226803, + "grad_norm": 6.050673484802246, + "learning_rate": 7.793065643415103e-06, + "loss": 0.3016, + "step": 16855 + }, + { + "epoch": 1.0286218122626154, + "grad_norm": 5.730571746826172, + "learning_rate": 7.792928600794848e-06, + "loss": 0.2817, + "step": 16856 + }, + { + "epoch": 1.0286353771025503, + "grad_norm": 5.489126682281494, + "learning_rate": 7.792791558174593e-06, + "loss": 0.3184, + "step": 16857 + }, + { + "epoch": 1.0286489419424851, + "grad_norm": 7.06044340133667, + "learning_rate": 7.792654515554338e-06, + "loss": 0.431, + "step": 16858 + }, + { + "epoch": 1.02866250678242, + "grad_norm": 4.966403961181641, + "learning_rate": 7.792517472934084e-06, + "loss": 0.3142, + "step": 16859 + }, + { + "epoch": 1.0286760716223549, + "grad_norm": 5.17701530456543, + "learning_rate": 7.792380430313829e-06, + "loss": 0.4008, + "step": 16860 + }, + { + "epoch": 1.0286896364622897, + "grad_norm": 6.514538288116455, + "learning_rate": 7.792243387693574e-06, + "loss": 0.3392, + "step": 16861 + }, + { + "epoch": 1.0287032013022246, + "grad_norm": 7.163195610046387, + "learning_rate": 7.792106345073317e-06, + "loss": 0.4519, + "step": 16862 + }, + { + "epoch": 1.0287167661421595, + "grad_norm": 6.089352130889893, + "learning_rate": 7.791969302453064e-06, + "loss": 0.3823, + "step": 16863 + }, + { + "epoch": 1.0287303309820943, + "grad_norm": 5.847503662109375, + "learning_rate": 7.79183225983281e-06, + "loss": 0.4108, + "step": 16864 + }, + { + "epoch": 1.0287438958220294, + "grad_norm": 9.124229431152344, + "learning_rate": 7.791695217212553e-06, + "loss": 0.6001, + "step": 16865 + }, + { + "epoch": 1.0287574606619643, + "grad_norm": 7.848627090454102, + "learning_rate": 7.791558174592298e-06, + "loss": 0.377, + "step": 16866 + }, + { + "epoch": 1.0287710255018991, + "grad_norm": 7.99132776260376, + "learning_rate": 7.791421131972043e-06, + "loss": 0.4864, + "step": 16867 + }, + { + "epoch": 1.028784590341834, + "grad_norm": 5.5973310470581055, + "learning_rate": 7.79128408935179e-06, + "loss": 0.4214, + "step": 16868 + }, + { + "epoch": 1.0287981551817689, + "grad_norm": 7.040485858917236, + "learning_rate": 7.791147046731534e-06, + "loss": 0.3461, + "step": 16869 + }, + { + "epoch": 1.0288117200217037, + "grad_norm": 4.630248546600342, + "learning_rate": 7.791010004111279e-06, + "loss": 0.3112, + "step": 16870 + }, + { + "epoch": 1.0288252848616386, + "grad_norm": 5.704917907714844, + "learning_rate": 7.790872961491024e-06, + "loss": 0.3705, + "step": 16871 + }, + { + "epoch": 1.0288388497015735, + "grad_norm": 4.520031929016113, + "learning_rate": 7.79073591887077e-06, + "loss": 0.3363, + "step": 16872 + }, + { + "epoch": 1.0288524145415083, + "grad_norm": 6.465353488922119, + "learning_rate": 7.790598876250514e-06, + "loss": 0.4324, + "step": 16873 + }, + { + "epoch": 1.0288659793814432, + "grad_norm": 5.987417697906494, + "learning_rate": 7.79046183363026e-06, + "loss": 0.3665, + "step": 16874 + }, + { + "epoch": 1.0288795442213783, + "grad_norm": 4.632516860961914, + "learning_rate": 7.790324791010005e-06, + "loss": 0.272, + "step": 16875 + }, + { + "epoch": 1.0288931090613131, + "grad_norm": 6.125087738037109, + "learning_rate": 7.79018774838975e-06, + "loss": 0.3724, + "step": 16876 + }, + { + "epoch": 1.028906673901248, + "grad_norm": 4.878722667694092, + "learning_rate": 7.790050705769495e-06, + "loss": 0.3635, + "step": 16877 + }, + { + "epoch": 1.0289202387411829, + "grad_norm": 4.766650676727295, + "learning_rate": 7.78991366314924e-06, + "loss": 0.2729, + "step": 16878 + }, + { + "epoch": 1.0289338035811177, + "grad_norm": 6.2981414794921875, + "learning_rate": 7.789776620528985e-06, + "loss": 0.4552, + "step": 16879 + }, + { + "epoch": 1.0289473684210526, + "grad_norm": 4.635488033294678, + "learning_rate": 7.789639577908729e-06, + "loss": 0.305, + "step": 16880 + }, + { + "epoch": 1.0289609332609875, + "grad_norm": 5.507959842681885, + "learning_rate": 7.789502535288476e-06, + "loss": 0.2835, + "step": 16881 + }, + { + "epoch": 1.0289744981009223, + "grad_norm": 5.161448955535889, + "learning_rate": 7.789365492668221e-06, + "loss": 0.3818, + "step": 16882 + }, + { + "epoch": 1.0289880629408572, + "grad_norm": 4.005366325378418, + "learning_rate": 7.789228450047966e-06, + "loss": 0.3202, + "step": 16883 + }, + { + "epoch": 1.0290016277807923, + "grad_norm": 5.949532985687256, + "learning_rate": 7.78909140742771e-06, + "loss": 0.4183, + "step": 16884 + }, + { + "epoch": 1.0290151926207272, + "grad_norm": 4.830308437347412, + "learning_rate": 7.788954364807457e-06, + "loss": 0.4476, + "step": 16885 + }, + { + "epoch": 1.029028757460662, + "grad_norm": 5.61691951751709, + "learning_rate": 7.788817322187202e-06, + "loss": 0.2821, + "step": 16886 + }, + { + "epoch": 1.029042322300597, + "grad_norm": 5.449004650115967, + "learning_rate": 7.788680279566945e-06, + "loss": 0.2993, + "step": 16887 + }, + { + "epoch": 1.0290558871405318, + "grad_norm": 5.8557963371276855, + "learning_rate": 7.78854323694669e-06, + "loss": 0.4669, + "step": 16888 + }, + { + "epoch": 1.0290694519804666, + "grad_norm": 5.194060325622559, + "learning_rate": 7.788406194326437e-06, + "loss": 0.2663, + "step": 16889 + }, + { + "epoch": 1.0290830168204015, + "grad_norm": 5.478489875793457, + "learning_rate": 7.78826915170618e-06, + "loss": 0.3741, + "step": 16890 + }, + { + "epoch": 1.0290965816603364, + "grad_norm": 4.890510559082031, + "learning_rate": 7.788132109085926e-06, + "loss": 0.2283, + "step": 16891 + }, + { + "epoch": 1.0291101465002712, + "grad_norm": 4.706763744354248, + "learning_rate": 7.787995066465671e-06, + "loss": 0.2248, + "step": 16892 + }, + { + "epoch": 1.029123711340206, + "grad_norm": 4.126096248626709, + "learning_rate": 7.787858023845416e-06, + "loss": 0.2778, + "step": 16893 + }, + { + "epoch": 1.0291372761801412, + "grad_norm": 5.872639179229736, + "learning_rate": 7.787720981225161e-06, + "loss": 0.3344, + "step": 16894 + }, + { + "epoch": 1.029150841020076, + "grad_norm": 4.629827499389648, + "learning_rate": 7.787583938604907e-06, + "loss": 0.2624, + "step": 16895 + }, + { + "epoch": 1.029164405860011, + "grad_norm": 7.820071697235107, + "learning_rate": 7.787446895984652e-06, + "loss": 0.5637, + "step": 16896 + }, + { + "epoch": 1.0291779706999458, + "grad_norm": 4.309426307678223, + "learning_rate": 7.787309853364397e-06, + "loss": 0.2601, + "step": 16897 + }, + { + "epoch": 1.0291915355398806, + "grad_norm": 5.739808559417725, + "learning_rate": 7.787172810744142e-06, + "loss": 0.3118, + "step": 16898 + }, + { + "epoch": 1.0292051003798155, + "grad_norm": 6.712731838226318, + "learning_rate": 7.787035768123887e-06, + "loss": 0.3973, + "step": 16899 + }, + { + "epoch": 1.0292186652197504, + "grad_norm": 6.820466995239258, + "learning_rate": 7.786898725503633e-06, + "loss": 0.4021, + "step": 16900 + }, + { + "epoch": 1.0292322300596852, + "grad_norm": 6.465328693389893, + "learning_rate": 7.786761682883378e-06, + "loss": 0.4004, + "step": 16901 + }, + { + "epoch": 1.02924579489962, + "grad_norm": 4.699825286865234, + "learning_rate": 7.786624640263123e-06, + "loss": 0.3511, + "step": 16902 + }, + { + "epoch": 1.0292593597395552, + "grad_norm": 5.729089736938477, + "learning_rate": 7.786487597642868e-06, + "loss": 0.2081, + "step": 16903 + }, + { + "epoch": 1.02927292457949, + "grad_norm": 4.914472579956055, + "learning_rate": 7.786350555022613e-06, + "loss": 0.3228, + "step": 16904 + }, + { + "epoch": 1.029286489419425, + "grad_norm": 5.963322639465332, + "learning_rate": 7.786213512402357e-06, + "loss": 0.3335, + "step": 16905 + }, + { + "epoch": 1.0293000542593598, + "grad_norm": 7.480273246765137, + "learning_rate": 7.786076469782104e-06, + "loss": 0.3348, + "step": 16906 + }, + { + "epoch": 1.0293136190992946, + "grad_norm": 4.389001369476318, + "learning_rate": 7.785939427161849e-06, + "loss": 0.2075, + "step": 16907 + }, + { + "epoch": 1.0293271839392295, + "grad_norm": 6.233772277832031, + "learning_rate": 7.785802384541592e-06, + "loss": 0.2132, + "step": 16908 + }, + { + "epoch": 1.0293407487791644, + "grad_norm": 6.807588577270508, + "learning_rate": 7.785665341921337e-06, + "loss": 0.3577, + "step": 16909 + }, + { + "epoch": 1.0293543136190992, + "grad_norm": 6.289849281311035, + "learning_rate": 7.785528299301083e-06, + "loss": 0.3277, + "step": 16910 + }, + { + "epoch": 1.029367878459034, + "grad_norm": 5.090252876281738, + "learning_rate": 7.78539125668083e-06, + "loss": 0.2644, + "step": 16911 + }, + { + "epoch": 1.029381443298969, + "grad_norm": 5.619244575500488, + "learning_rate": 7.785254214060573e-06, + "loss": 0.2918, + "step": 16912 + }, + { + "epoch": 1.029395008138904, + "grad_norm": 5.057353973388672, + "learning_rate": 7.785117171440318e-06, + "loss": 0.3233, + "step": 16913 + }, + { + "epoch": 1.029408572978839, + "grad_norm": 6.550892353057861, + "learning_rate": 7.784980128820063e-06, + "loss": 0.293, + "step": 16914 + }, + { + "epoch": 1.0294221378187738, + "grad_norm": 6.529486656188965, + "learning_rate": 7.784843086199809e-06, + "loss": 0.3645, + "step": 16915 + }, + { + "epoch": 1.0294357026587087, + "grad_norm": 4.368000507354736, + "learning_rate": 7.784706043579554e-06, + "loss": 0.3603, + "step": 16916 + }, + { + "epoch": 1.0294492674986435, + "grad_norm": 6.153021812438965, + "learning_rate": 7.784569000959299e-06, + "loss": 0.3224, + "step": 16917 + }, + { + "epoch": 1.0294628323385784, + "grad_norm": 6.343841552734375, + "learning_rate": 7.784431958339044e-06, + "loss": 0.5513, + "step": 16918 + }, + { + "epoch": 1.0294763971785132, + "grad_norm": 5.153253555297852, + "learning_rate": 7.78429491571879e-06, + "loss": 0.2864, + "step": 16919 + }, + { + "epoch": 1.0294899620184481, + "grad_norm": 4.445765972137451, + "learning_rate": 7.784157873098534e-06, + "loss": 0.2415, + "step": 16920 + }, + { + "epoch": 1.029503526858383, + "grad_norm": 4.6582417488098145, + "learning_rate": 7.78402083047828e-06, + "loss": 0.2455, + "step": 16921 + }, + { + "epoch": 1.029517091698318, + "grad_norm": 5.703700065612793, + "learning_rate": 7.783883787858025e-06, + "loss": 0.2905, + "step": 16922 + }, + { + "epoch": 1.029530656538253, + "grad_norm": 4.930100440979004, + "learning_rate": 7.783746745237768e-06, + "loss": 0.311, + "step": 16923 + }, + { + "epoch": 1.0295442213781878, + "grad_norm": 5.241905689239502, + "learning_rate": 7.783609702617515e-06, + "loss": 0.2548, + "step": 16924 + }, + { + "epoch": 1.0295577862181227, + "grad_norm": 4.585961818695068, + "learning_rate": 7.78347265999726e-06, + "loss": 0.2026, + "step": 16925 + }, + { + "epoch": 1.0295713510580575, + "grad_norm": 5.446207046508789, + "learning_rate": 7.783335617377005e-06, + "loss": 0.3274, + "step": 16926 + }, + { + "epoch": 1.0295849158979924, + "grad_norm": 4.915830612182617, + "learning_rate": 7.783198574756749e-06, + "loss": 0.3171, + "step": 16927 + }, + { + "epoch": 1.0295984807379273, + "grad_norm": 4.910306453704834, + "learning_rate": 7.783061532136496e-06, + "loss": 0.2638, + "step": 16928 + }, + { + "epoch": 1.0296120455778621, + "grad_norm": 7.312687397003174, + "learning_rate": 7.782924489516241e-06, + "loss": 0.4757, + "step": 16929 + }, + { + "epoch": 1.029625610417797, + "grad_norm": 5.387050628662109, + "learning_rate": 7.782787446895985e-06, + "loss": 0.1412, + "step": 16930 + }, + { + "epoch": 1.0296391752577319, + "grad_norm": 4.8779754638671875, + "learning_rate": 7.78265040427573e-06, + "loss": 0.4667, + "step": 16931 + }, + { + "epoch": 1.029652740097667, + "grad_norm": 4.847817897796631, + "learning_rate": 7.782513361655477e-06, + "loss": 0.2753, + "step": 16932 + }, + { + "epoch": 1.0296663049376018, + "grad_norm": 6.0036492347717285, + "learning_rate": 7.78237631903522e-06, + "loss": 0.3598, + "step": 16933 + }, + { + "epoch": 1.0296798697775367, + "grad_norm": 6.226648330688477, + "learning_rate": 7.782239276414965e-06, + "loss": 0.4595, + "step": 16934 + }, + { + "epoch": 1.0296934346174715, + "grad_norm": 6.547693729400635, + "learning_rate": 7.78210223379471e-06, + "loss": 0.2252, + "step": 16935 + }, + { + "epoch": 1.0297069994574064, + "grad_norm": 4.645458698272705, + "learning_rate": 7.781965191174456e-06, + "loss": 0.4339, + "step": 16936 + }, + { + "epoch": 1.0297205642973413, + "grad_norm": 4.687073230743408, + "learning_rate": 7.7818281485542e-06, + "loss": 0.227, + "step": 16937 + }, + { + "epoch": 1.0297341291372761, + "grad_norm": 7.0086164474487305, + "learning_rate": 7.781691105933946e-06, + "loss": 0.4784, + "step": 16938 + }, + { + "epoch": 1.029747693977211, + "grad_norm": 5.750837326049805, + "learning_rate": 7.781554063313691e-06, + "loss": 0.2545, + "step": 16939 + }, + { + "epoch": 1.0297612588171459, + "grad_norm": 6.192633628845215, + "learning_rate": 7.781417020693436e-06, + "loss": 0.3775, + "step": 16940 + }, + { + "epoch": 1.029774823657081, + "grad_norm": 6.154422760009766, + "learning_rate": 7.781279978073181e-06, + "loss": 0.2807, + "step": 16941 + }, + { + "epoch": 1.0297883884970158, + "grad_norm": 6.216113090515137, + "learning_rate": 7.781142935452927e-06, + "loss": 0.305, + "step": 16942 + }, + { + "epoch": 1.0298019533369507, + "grad_norm": 9.310563087463379, + "learning_rate": 7.781005892832672e-06, + "loss": 0.697, + "step": 16943 + }, + { + "epoch": 1.0298155181768855, + "grad_norm": 5.1586222648620605, + "learning_rate": 7.780868850212417e-06, + "loss": 0.4032, + "step": 16944 + }, + { + "epoch": 1.0298290830168204, + "grad_norm": 6.7604899406433105, + "learning_rate": 7.780731807592162e-06, + "loss": 0.4295, + "step": 16945 + }, + { + "epoch": 1.0298426478567553, + "grad_norm": 5.549985885620117, + "learning_rate": 7.780594764971907e-06, + "loss": 0.372, + "step": 16946 + }, + { + "epoch": 1.0298562126966901, + "grad_norm": 5.466963768005371, + "learning_rate": 7.780457722351653e-06, + "loss": 0.4034, + "step": 16947 + }, + { + "epoch": 1.029869777536625, + "grad_norm": 6.134471416473389, + "learning_rate": 7.780320679731396e-06, + "loss": 0.2697, + "step": 16948 + }, + { + "epoch": 1.0298833423765599, + "grad_norm": 6.503002643585205, + "learning_rate": 7.780183637111141e-06, + "loss": 0.3892, + "step": 16949 + }, + { + "epoch": 1.0298969072164947, + "grad_norm": 5.977264404296875, + "learning_rate": 7.780046594490888e-06, + "loss": 0.557, + "step": 16950 + }, + { + "epoch": 1.0299104720564298, + "grad_norm": 5.669976234436035, + "learning_rate": 7.779909551870633e-06, + "loss": 0.2778, + "step": 16951 + }, + { + "epoch": 1.0299240368963647, + "grad_norm": 6.245905876159668, + "learning_rate": 7.779772509250377e-06, + "loss": 0.3589, + "step": 16952 + }, + { + "epoch": 1.0299376017362996, + "grad_norm": 5.573307991027832, + "learning_rate": 7.779635466630122e-06, + "loss": 0.2787, + "step": 16953 + }, + { + "epoch": 1.0299511665762344, + "grad_norm": 6.379801273345947, + "learning_rate": 7.779498424009869e-06, + "loss": 0.4229, + "step": 16954 + }, + { + "epoch": 1.0299647314161693, + "grad_norm": 5.265333652496338, + "learning_rate": 7.779361381389612e-06, + "loss": 0.3374, + "step": 16955 + }, + { + "epoch": 1.0299782962561042, + "grad_norm": 5.485803127288818, + "learning_rate": 7.779224338769357e-06, + "loss": 0.2997, + "step": 16956 + }, + { + "epoch": 1.029991861096039, + "grad_norm": 5.543673992156982, + "learning_rate": 7.779087296149103e-06, + "loss": 0.2823, + "step": 16957 + }, + { + "epoch": 1.0300054259359739, + "grad_norm": 7.244720935821533, + "learning_rate": 7.778950253528848e-06, + "loss": 0.5097, + "step": 16958 + }, + { + "epoch": 1.0300189907759087, + "grad_norm": 4.903973579406738, + "learning_rate": 7.778813210908593e-06, + "loss": 0.3027, + "step": 16959 + }, + { + "epoch": 1.0300325556158438, + "grad_norm": 7.096414566040039, + "learning_rate": 7.778676168288338e-06, + "loss": 0.3905, + "step": 16960 + }, + { + "epoch": 1.0300461204557787, + "grad_norm": 6.393336296081543, + "learning_rate": 7.778539125668083e-06, + "loss": 0.2611, + "step": 16961 + }, + { + "epoch": 1.0300596852957136, + "grad_norm": 5.498850345611572, + "learning_rate": 7.778402083047829e-06, + "loss": 0.2732, + "step": 16962 + }, + { + "epoch": 1.0300732501356484, + "grad_norm": 5.729490280151367, + "learning_rate": 7.778265040427574e-06, + "loss": 0.3484, + "step": 16963 + }, + { + "epoch": 1.0300868149755833, + "grad_norm": 5.194888591766357, + "learning_rate": 7.778127997807319e-06, + "loss": 0.2531, + "step": 16964 + }, + { + "epoch": 1.0301003798155182, + "grad_norm": 5.874114036560059, + "learning_rate": 7.777990955187064e-06, + "loss": 0.2572, + "step": 16965 + }, + { + "epoch": 1.030113944655453, + "grad_norm": 4.809067249298096, + "learning_rate": 7.77785391256681e-06, + "loss": 0.3571, + "step": 16966 + }, + { + "epoch": 1.030127509495388, + "grad_norm": 6.547532081604004, + "learning_rate": 7.777716869946554e-06, + "loss": 0.3897, + "step": 16967 + }, + { + "epoch": 1.0301410743353228, + "grad_norm": 5.929562568664551, + "learning_rate": 7.7775798273263e-06, + "loss": 0.33, + "step": 16968 + }, + { + "epoch": 1.0301546391752576, + "grad_norm": 5.568596839904785, + "learning_rate": 7.777442784706045e-06, + "loss": 0.206, + "step": 16969 + }, + { + "epoch": 1.0301682040151927, + "grad_norm": 4.8481597900390625, + "learning_rate": 7.777305742085788e-06, + "loss": 0.2, + "step": 16970 + }, + { + "epoch": 1.0301817688551276, + "grad_norm": 6.246772289276123, + "learning_rate": 7.777168699465535e-06, + "loss": 0.4697, + "step": 16971 + }, + { + "epoch": 1.0301953336950624, + "grad_norm": 3.775421619415283, + "learning_rate": 7.77703165684528e-06, + "loss": 0.1571, + "step": 16972 + }, + { + "epoch": 1.0302088985349973, + "grad_norm": 4.231236934661865, + "learning_rate": 7.776894614225024e-06, + "loss": 0.2534, + "step": 16973 + }, + { + "epoch": 1.0302224633749322, + "grad_norm": 6.651515007019043, + "learning_rate": 7.776757571604769e-06, + "loss": 0.3706, + "step": 16974 + }, + { + "epoch": 1.030236028214867, + "grad_norm": 5.661811828613281, + "learning_rate": 7.776620528984516e-06, + "loss": 0.3283, + "step": 16975 + }, + { + "epoch": 1.030249593054802, + "grad_norm": 5.751795768737793, + "learning_rate": 7.776483486364261e-06, + "loss": 0.2923, + "step": 16976 + }, + { + "epoch": 1.0302631578947368, + "grad_norm": 5.324193000793457, + "learning_rate": 7.776346443744005e-06, + "loss": 0.2316, + "step": 16977 + }, + { + "epoch": 1.0302767227346716, + "grad_norm": 7.14774751663208, + "learning_rate": 7.77620940112375e-06, + "loss": 0.3732, + "step": 16978 + }, + { + "epoch": 1.0302902875746067, + "grad_norm": 8.891043663024902, + "learning_rate": 7.776072358503495e-06, + "loss": 0.3933, + "step": 16979 + }, + { + "epoch": 1.0303038524145416, + "grad_norm": 5.054752826690674, + "learning_rate": 7.77593531588324e-06, + "loss": 0.2454, + "step": 16980 + }, + { + "epoch": 1.0303174172544765, + "grad_norm": 6.601776123046875, + "learning_rate": 7.775798273262985e-06, + "loss": 0.3452, + "step": 16981 + }, + { + "epoch": 1.0303309820944113, + "grad_norm": 6.40760612487793, + "learning_rate": 7.77566123064273e-06, + "loss": 0.4451, + "step": 16982 + }, + { + "epoch": 1.0303445469343462, + "grad_norm": 5.1241936683654785, + "learning_rate": 7.775524188022476e-06, + "loss": 0.3697, + "step": 16983 + }, + { + "epoch": 1.030358111774281, + "grad_norm": 7.147254943847656, + "learning_rate": 7.77538714540222e-06, + "loss": 0.3257, + "step": 16984 + }, + { + "epoch": 1.030371676614216, + "grad_norm": 4.999169826507568, + "learning_rate": 7.775250102781966e-06, + "loss": 0.3094, + "step": 16985 + }, + { + "epoch": 1.0303852414541508, + "grad_norm": 9.900184631347656, + "learning_rate": 7.775113060161711e-06, + "loss": 0.2927, + "step": 16986 + }, + { + "epoch": 1.0303988062940856, + "grad_norm": 5.601075649261475, + "learning_rate": 7.774976017541456e-06, + "loss": 0.251, + "step": 16987 + }, + { + "epoch": 1.0304123711340205, + "grad_norm": 6.20790433883667, + "learning_rate": 7.774838974921201e-06, + "loss": 0.4976, + "step": 16988 + }, + { + "epoch": 1.0304259359739556, + "grad_norm": 6.652388572692871, + "learning_rate": 7.774701932300947e-06, + "loss": 0.3329, + "step": 16989 + }, + { + "epoch": 1.0304395008138905, + "grad_norm": 5.556983947753906, + "learning_rate": 7.774564889680692e-06, + "loss": 0.383, + "step": 16990 + }, + { + "epoch": 1.0304530656538253, + "grad_norm": 7.21917200088501, + "learning_rate": 7.774427847060437e-06, + "loss": 0.4512, + "step": 16991 + }, + { + "epoch": 1.0304666304937602, + "grad_norm": 7.244024276733398, + "learning_rate": 7.77429080444018e-06, + "loss": 0.4176, + "step": 16992 + }, + { + "epoch": 1.030480195333695, + "grad_norm": 5.110360622406006, + "learning_rate": 7.774153761819927e-06, + "loss": 0.3434, + "step": 16993 + }, + { + "epoch": 1.03049376017363, + "grad_norm": 5.811922073364258, + "learning_rate": 7.774016719199673e-06, + "loss": 0.263, + "step": 16994 + }, + { + "epoch": 1.0305073250135648, + "grad_norm": 5.329823017120361, + "learning_rate": 7.773879676579416e-06, + "loss": 0.233, + "step": 16995 + }, + { + "epoch": 1.0305208898534997, + "grad_norm": 4.839576244354248, + "learning_rate": 7.773742633959161e-06, + "loss": 0.3478, + "step": 16996 + }, + { + "epoch": 1.0305344546934345, + "grad_norm": 4.909545421600342, + "learning_rate": 7.773605591338908e-06, + "loss": 0.3018, + "step": 16997 + }, + { + "epoch": 1.0305480195333696, + "grad_norm": 5.8362650871276855, + "learning_rate": 7.773468548718652e-06, + "loss": 0.3062, + "step": 16998 + }, + { + "epoch": 1.0305615843733045, + "grad_norm": 6.474332332611084, + "learning_rate": 7.773331506098397e-06, + "loss": 0.3015, + "step": 16999 + }, + { + "epoch": 1.0305751492132393, + "grad_norm": 4.833578109741211, + "learning_rate": 7.773194463478142e-06, + "loss": 0.2976, + "step": 17000 + }, + { + "epoch": 1.0305887140531742, + "grad_norm": 6.323519706726074, + "learning_rate": 7.773057420857887e-06, + "loss": 0.5546, + "step": 17001 + }, + { + "epoch": 1.030602278893109, + "grad_norm": 5.702560901641846, + "learning_rate": 7.772920378237632e-06, + "loss": 0.2606, + "step": 17002 + }, + { + "epoch": 1.030615843733044, + "grad_norm": 8.704621315002441, + "learning_rate": 7.772783335617377e-06, + "loss": 0.381, + "step": 17003 + }, + { + "epoch": 1.0306294085729788, + "grad_norm": 4.500090599060059, + "learning_rate": 7.772646292997123e-06, + "loss": 0.2812, + "step": 17004 + }, + { + "epoch": 1.0306429734129137, + "grad_norm": 3.9132864475250244, + "learning_rate": 7.772509250376868e-06, + "loss": 0.2627, + "step": 17005 + }, + { + "epoch": 1.0306565382528485, + "grad_norm": 6.893381595611572, + "learning_rate": 7.772372207756613e-06, + "loss": 0.3353, + "step": 17006 + }, + { + "epoch": 1.0306701030927834, + "grad_norm": 3.965388774871826, + "learning_rate": 7.772235165136358e-06, + "loss": 0.1555, + "step": 17007 + }, + { + "epoch": 1.0306836679327185, + "grad_norm": 3.8322107791900635, + "learning_rate": 7.772098122516103e-06, + "loss": 0.179, + "step": 17008 + }, + { + "epoch": 1.0306972327726533, + "grad_norm": 5.355165481567383, + "learning_rate": 7.771961079895849e-06, + "loss": 0.2894, + "step": 17009 + }, + { + "epoch": 1.0307107976125882, + "grad_norm": 5.0456929206848145, + "learning_rate": 7.771824037275594e-06, + "loss": 0.2026, + "step": 17010 + }, + { + "epoch": 1.030724362452523, + "grad_norm": 5.338719367980957, + "learning_rate": 7.771686994655339e-06, + "loss": 0.2841, + "step": 17011 + }, + { + "epoch": 1.030737927292458, + "grad_norm": 5.226385593414307, + "learning_rate": 7.771549952035084e-06, + "loss": 0.2309, + "step": 17012 + }, + { + "epoch": 1.0307514921323928, + "grad_norm": 6.266021728515625, + "learning_rate": 7.771412909414828e-06, + "loss": 0.2528, + "step": 17013 + }, + { + "epoch": 1.0307650569723277, + "grad_norm": 6.071737766265869, + "learning_rate": 7.771275866794574e-06, + "loss": 0.2274, + "step": 17014 + }, + { + "epoch": 1.0307786218122625, + "grad_norm": 5.677900791168213, + "learning_rate": 7.77113882417432e-06, + "loss": 0.3024, + "step": 17015 + }, + { + "epoch": 1.0307921866521974, + "grad_norm": 6.992337703704834, + "learning_rate": 7.771001781554063e-06, + "loss": 0.3466, + "step": 17016 + }, + { + "epoch": 1.0308057514921325, + "grad_norm": 5.568085193634033, + "learning_rate": 7.770864738933808e-06, + "loss": 0.2341, + "step": 17017 + }, + { + "epoch": 1.0308193163320674, + "grad_norm": 4.523063659667969, + "learning_rate": 7.770727696313553e-06, + "loss": 0.2737, + "step": 17018 + }, + { + "epoch": 1.0308328811720022, + "grad_norm": 4.670398712158203, + "learning_rate": 7.7705906536933e-06, + "loss": 0.2111, + "step": 17019 + }, + { + "epoch": 1.030846446011937, + "grad_norm": 6.127691745758057, + "learning_rate": 7.770453611073044e-06, + "loss": 0.3229, + "step": 17020 + }, + { + "epoch": 1.030860010851872, + "grad_norm": 5.848003387451172, + "learning_rate": 7.770316568452789e-06, + "loss": 0.3528, + "step": 17021 + }, + { + "epoch": 1.0308735756918068, + "grad_norm": 5.450939655303955, + "learning_rate": 7.770179525832534e-06, + "loss": 0.2699, + "step": 17022 + }, + { + "epoch": 1.0308871405317417, + "grad_norm": 5.767191410064697, + "learning_rate": 7.77004248321228e-06, + "loss": 0.3654, + "step": 17023 + }, + { + "epoch": 1.0309007053716766, + "grad_norm": 4.511509418487549, + "learning_rate": 7.769905440592025e-06, + "loss": 0.2016, + "step": 17024 + }, + { + "epoch": 1.0309142702116114, + "grad_norm": 5.609715461730957, + "learning_rate": 7.76976839797177e-06, + "loss": 0.2664, + "step": 17025 + }, + { + "epoch": 1.0309278350515463, + "grad_norm": 7.727478981018066, + "learning_rate": 7.769631355351515e-06, + "loss": 0.3538, + "step": 17026 + }, + { + "epoch": 1.0309413998914814, + "grad_norm": 4.523892879486084, + "learning_rate": 7.76949431273126e-06, + "loss": 0.1969, + "step": 17027 + }, + { + "epoch": 1.0309549647314162, + "grad_norm": 4.755564212799072, + "learning_rate": 7.769357270111005e-06, + "loss": 0.1867, + "step": 17028 + }, + { + "epoch": 1.030968529571351, + "grad_norm": 5.085555076599121, + "learning_rate": 7.76922022749075e-06, + "loss": 0.2192, + "step": 17029 + }, + { + "epoch": 1.030982094411286, + "grad_norm": 5.229373455047607, + "learning_rate": 7.769083184870496e-06, + "loss": 0.3424, + "step": 17030 + }, + { + "epoch": 1.0309956592512208, + "grad_norm": 5.2573652267456055, + "learning_rate": 7.768946142250239e-06, + "loss": 0.3247, + "step": 17031 + }, + { + "epoch": 1.0310092240911557, + "grad_norm": 6.793453216552734, + "learning_rate": 7.768809099629986e-06, + "loss": 0.295, + "step": 17032 + }, + { + "epoch": 1.0310227889310906, + "grad_norm": 5.09979248046875, + "learning_rate": 7.768672057009731e-06, + "loss": 0.3387, + "step": 17033 + }, + { + "epoch": 1.0310363537710254, + "grad_norm": 6.1743011474609375, + "learning_rate": 7.768535014389476e-06, + "loss": 0.3844, + "step": 17034 + }, + { + "epoch": 1.0310499186109603, + "grad_norm": 5.827332496643066, + "learning_rate": 7.76839797176922e-06, + "loss": 0.3195, + "step": 17035 + }, + { + "epoch": 1.0310634834508954, + "grad_norm": 6.958630561828613, + "learning_rate": 7.768260929148967e-06, + "loss": 0.426, + "step": 17036 + }, + { + "epoch": 1.0310770482908302, + "grad_norm": 5.214702129364014, + "learning_rate": 7.768123886528712e-06, + "loss": 0.296, + "step": 17037 + }, + { + "epoch": 1.031090613130765, + "grad_norm": 4.894142150878906, + "learning_rate": 7.767986843908455e-06, + "loss": 0.2903, + "step": 17038 + }, + { + "epoch": 1.0311041779707, + "grad_norm": 7.768951892852783, + "learning_rate": 7.7678498012882e-06, + "loss": 0.3445, + "step": 17039 + }, + { + "epoch": 1.0311177428106348, + "grad_norm": 4.627758979797363, + "learning_rate": 7.767712758667947e-06, + "loss": 0.399, + "step": 17040 + }, + { + "epoch": 1.0311313076505697, + "grad_norm": 6.3379902839660645, + "learning_rate": 7.767575716047691e-06, + "loss": 0.2774, + "step": 17041 + }, + { + "epoch": 1.0311448724905046, + "grad_norm": 5.894220352172852, + "learning_rate": 7.767438673427436e-06, + "loss": 0.3102, + "step": 17042 + }, + { + "epoch": 1.0311584373304394, + "grad_norm": 7.526817798614502, + "learning_rate": 7.767301630807181e-06, + "loss": 0.3239, + "step": 17043 + }, + { + "epoch": 1.0311720021703743, + "grad_norm": 6.690789222717285, + "learning_rate": 7.767164588186928e-06, + "loss": 0.3421, + "step": 17044 + }, + { + "epoch": 1.0311855670103092, + "grad_norm": 4.793669700622559, + "learning_rate": 7.767027545566672e-06, + "loss": 0.2641, + "step": 17045 + }, + { + "epoch": 1.0311991318502443, + "grad_norm": 5.589808940887451, + "learning_rate": 7.766890502946417e-06, + "loss": 0.3602, + "step": 17046 + }, + { + "epoch": 1.0312126966901791, + "grad_norm": 6.938614368438721, + "learning_rate": 7.766753460326162e-06, + "loss": 0.3563, + "step": 17047 + }, + { + "epoch": 1.031226261530114, + "grad_norm": 5.38962984085083, + "learning_rate": 7.766616417705907e-06, + "loss": 0.2794, + "step": 17048 + }, + { + "epoch": 1.0312398263700489, + "grad_norm": 5.275141716003418, + "learning_rate": 7.766479375085652e-06, + "loss": 0.2905, + "step": 17049 + }, + { + "epoch": 1.0312533912099837, + "grad_norm": 4.1948676109313965, + "learning_rate": 7.766342332465397e-06, + "loss": 0.2163, + "step": 17050 + }, + { + "epoch": 1.0312669560499186, + "grad_norm": 7.749621391296387, + "learning_rate": 7.766205289845143e-06, + "loss": 0.3264, + "step": 17051 + }, + { + "epoch": 1.0312805208898534, + "grad_norm": 6.418409824371338, + "learning_rate": 7.766068247224888e-06, + "loss": 0.3799, + "step": 17052 + }, + { + "epoch": 1.0312940857297883, + "grad_norm": 5.255415916442871, + "learning_rate": 7.765931204604633e-06, + "loss": 0.2336, + "step": 17053 + }, + { + "epoch": 1.0313076505697232, + "grad_norm": 5.528520107269287, + "learning_rate": 7.765794161984378e-06, + "loss": 0.3158, + "step": 17054 + }, + { + "epoch": 1.0313212154096583, + "grad_norm": 4.600758075714111, + "learning_rate": 7.765657119364123e-06, + "loss": 0.3036, + "step": 17055 + }, + { + "epoch": 1.0313347802495931, + "grad_norm": 5.947307586669922, + "learning_rate": 7.765520076743867e-06, + "loss": 0.4571, + "step": 17056 + }, + { + "epoch": 1.031348345089528, + "grad_norm": 4.423388957977295, + "learning_rate": 7.765383034123614e-06, + "loss": 0.3278, + "step": 17057 + }, + { + "epoch": 1.0313619099294629, + "grad_norm": 8.44514274597168, + "learning_rate": 7.765245991503359e-06, + "loss": 0.6585, + "step": 17058 + }, + { + "epoch": 1.0313754747693977, + "grad_norm": 4.722385406494141, + "learning_rate": 7.765108948883104e-06, + "loss": 0.2633, + "step": 17059 + }, + { + "epoch": 1.0313890396093326, + "grad_norm": 7.227344036102295, + "learning_rate": 7.764971906262848e-06, + "loss": 0.4849, + "step": 17060 + }, + { + "epoch": 1.0314026044492675, + "grad_norm": 7.9017486572265625, + "learning_rate": 7.764834863642593e-06, + "loss": 0.498, + "step": 17061 + }, + { + "epoch": 1.0314161692892023, + "grad_norm": 5.649850368499756, + "learning_rate": 7.76469782102234e-06, + "loss": 0.2164, + "step": 17062 + }, + { + "epoch": 1.0314297341291372, + "grad_norm": 7.375102996826172, + "learning_rate": 7.764560778402083e-06, + "loss": 0.431, + "step": 17063 + }, + { + "epoch": 1.031443298969072, + "grad_norm": 5.057755947113037, + "learning_rate": 7.764423735781828e-06, + "loss": 0.3042, + "step": 17064 + }, + { + "epoch": 1.0314568638090071, + "grad_norm": 6.5451436042785645, + "learning_rate": 7.764286693161573e-06, + "loss": 0.2811, + "step": 17065 + }, + { + "epoch": 1.031470428648942, + "grad_norm": 8.16409683227539, + "learning_rate": 7.764149650541319e-06, + "loss": 0.3685, + "step": 17066 + }, + { + "epoch": 1.0314839934888769, + "grad_norm": 6.72027587890625, + "learning_rate": 7.764012607921064e-06, + "loss": 0.454, + "step": 17067 + }, + { + "epoch": 1.0314975583288117, + "grad_norm": 4.703108787536621, + "learning_rate": 7.763875565300809e-06, + "loss": 0.2019, + "step": 17068 + }, + { + "epoch": 1.0315111231687466, + "grad_norm": 4.120127201080322, + "learning_rate": 7.763738522680554e-06, + "loss": 0.2193, + "step": 17069 + }, + { + "epoch": 1.0315246880086815, + "grad_norm": 4.46128511428833, + "learning_rate": 7.7636014800603e-06, + "loss": 0.2456, + "step": 17070 + }, + { + "epoch": 1.0315382528486163, + "grad_norm": 5.556204795837402, + "learning_rate": 7.763464437440045e-06, + "loss": 0.2732, + "step": 17071 + }, + { + "epoch": 1.0315518176885512, + "grad_norm": 5.753029823303223, + "learning_rate": 7.76332739481979e-06, + "loss": 0.387, + "step": 17072 + }, + { + "epoch": 1.031565382528486, + "grad_norm": 6.817981243133545, + "learning_rate": 7.763190352199535e-06, + "loss": 0.2914, + "step": 17073 + }, + { + "epoch": 1.0315789473684212, + "grad_norm": 5.351951599121094, + "learning_rate": 7.76305330957928e-06, + "loss": 0.2738, + "step": 17074 + }, + { + "epoch": 1.031592512208356, + "grad_norm": 5.462994575500488, + "learning_rate": 7.762916266959025e-06, + "loss": 0.3502, + "step": 17075 + }, + { + "epoch": 1.0316060770482909, + "grad_norm": 6.987705707550049, + "learning_rate": 7.76277922433877e-06, + "loss": 0.315, + "step": 17076 + }, + { + "epoch": 1.0316196418882257, + "grad_norm": 5.691708087921143, + "learning_rate": 7.762642181718516e-06, + "loss": 0.2594, + "step": 17077 + }, + { + "epoch": 1.0316332067281606, + "grad_norm": 6.425474643707275, + "learning_rate": 7.762505139098259e-06, + "loss": 0.3033, + "step": 17078 + }, + { + "epoch": 1.0316467715680955, + "grad_norm": 6.073252201080322, + "learning_rate": 7.762368096478006e-06, + "loss": 0.2885, + "step": 17079 + }, + { + "epoch": 1.0316603364080303, + "grad_norm": 6.661872386932373, + "learning_rate": 7.762231053857751e-06, + "loss": 0.3655, + "step": 17080 + }, + { + "epoch": 1.0316739012479652, + "grad_norm": 5.987252235412598, + "learning_rate": 7.762094011237495e-06, + "loss": 0.3653, + "step": 17081 + }, + { + "epoch": 1.0316874660879, + "grad_norm": 7.711994647979736, + "learning_rate": 7.76195696861724e-06, + "loss": 0.5364, + "step": 17082 + }, + { + "epoch": 1.031701030927835, + "grad_norm": 5.042632102966309, + "learning_rate": 7.761819925996987e-06, + "loss": 0.2112, + "step": 17083 + }, + { + "epoch": 1.03171459576777, + "grad_norm": 6.4864349365234375, + "learning_rate": 7.76168288337673e-06, + "loss": 0.3144, + "step": 17084 + }, + { + "epoch": 1.031728160607705, + "grad_norm": 6.159891605377197, + "learning_rate": 7.761545840756475e-06, + "loss": 0.3863, + "step": 17085 + }, + { + "epoch": 1.0317417254476398, + "grad_norm": 6.247040748596191, + "learning_rate": 7.76140879813622e-06, + "loss": 0.2711, + "step": 17086 + }, + { + "epoch": 1.0317552902875746, + "grad_norm": 5.545709133148193, + "learning_rate": 7.761271755515966e-06, + "loss": 0.27, + "step": 17087 + }, + { + "epoch": 1.0317688551275095, + "grad_norm": 6.927506923675537, + "learning_rate": 7.761134712895711e-06, + "loss": 0.4005, + "step": 17088 + }, + { + "epoch": 1.0317824199674444, + "grad_norm": 6.372819900512695, + "learning_rate": 7.760997670275456e-06, + "loss": 0.2426, + "step": 17089 + }, + { + "epoch": 1.0317959848073792, + "grad_norm": 6.544864654541016, + "learning_rate": 7.760860627655201e-06, + "loss": 0.3258, + "step": 17090 + }, + { + "epoch": 1.031809549647314, + "grad_norm": 9.661921501159668, + "learning_rate": 7.760723585034946e-06, + "loss": 0.5456, + "step": 17091 + }, + { + "epoch": 1.031823114487249, + "grad_norm": 6.621149063110352, + "learning_rate": 7.760586542414692e-06, + "loss": 0.2911, + "step": 17092 + }, + { + "epoch": 1.031836679327184, + "grad_norm": 5.2533345222473145, + "learning_rate": 7.760449499794437e-06, + "loss": 0.238, + "step": 17093 + }, + { + "epoch": 1.031850244167119, + "grad_norm": 6.052444934844971, + "learning_rate": 7.760312457174182e-06, + "loss": 0.2354, + "step": 17094 + }, + { + "epoch": 1.0318638090070538, + "grad_norm": 6.220086574554443, + "learning_rate": 7.760175414553927e-06, + "loss": 0.3617, + "step": 17095 + }, + { + "epoch": 1.0318773738469886, + "grad_norm": 4.659417152404785, + "learning_rate": 7.760038371933672e-06, + "loss": 0.2047, + "step": 17096 + }, + { + "epoch": 1.0318909386869235, + "grad_norm": 5.955836772918701, + "learning_rate": 7.759901329313418e-06, + "loss": 0.2854, + "step": 17097 + }, + { + "epoch": 1.0319045035268584, + "grad_norm": 5.998847007751465, + "learning_rate": 7.759764286693163e-06, + "loss": 0.25, + "step": 17098 + }, + { + "epoch": 1.0319180683667932, + "grad_norm": 5.7403950691223145, + "learning_rate": 7.759627244072906e-06, + "loss": 0.37, + "step": 17099 + }, + { + "epoch": 1.031931633206728, + "grad_norm": 6.254076957702637, + "learning_rate": 7.759490201452651e-06, + "loss": 0.3913, + "step": 17100 + }, + { + "epoch": 1.031945198046663, + "grad_norm": 5.973339080810547, + "learning_rate": 7.759353158832398e-06, + "loss": 0.3855, + "step": 17101 + }, + { + "epoch": 1.0319587628865978, + "grad_norm": 6.880521297454834, + "learning_rate": 7.759216116212143e-06, + "loss": 0.3543, + "step": 17102 + }, + { + "epoch": 1.031972327726533, + "grad_norm": 7.84480094909668, + "learning_rate": 7.759079073591887e-06, + "loss": 0.3683, + "step": 17103 + }, + { + "epoch": 1.0319858925664678, + "grad_norm": 7.280384063720703, + "learning_rate": 7.758942030971632e-06, + "loss": 0.5295, + "step": 17104 + }, + { + "epoch": 1.0319994574064026, + "grad_norm": 6.578532695770264, + "learning_rate": 7.758804988351379e-06, + "loss": 0.345, + "step": 17105 + }, + { + "epoch": 1.0320130222463375, + "grad_norm": 4.471423625946045, + "learning_rate": 7.758667945731122e-06, + "loss": 0.2645, + "step": 17106 + }, + { + "epoch": 1.0320265870862724, + "grad_norm": 5.209489345550537, + "learning_rate": 7.758530903110868e-06, + "loss": 0.2566, + "step": 17107 + }, + { + "epoch": 1.0320401519262072, + "grad_norm": 5.5378899574279785, + "learning_rate": 7.758393860490613e-06, + "loss": 0.2864, + "step": 17108 + }, + { + "epoch": 1.032053716766142, + "grad_norm": 5.610507011413574, + "learning_rate": 7.758256817870358e-06, + "loss": 0.2575, + "step": 17109 + }, + { + "epoch": 1.032067281606077, + "grad_norm": 6.0413360595703125, + "learning_rate": 7.758119775250103e-06, + "loss": 0.3611, + "step": 17110 + }, + { + "epoch": 1.0320808464460118, + "grad_norm": 5.644893169403076, + "learning_rate": 7.757982732629848e-06, + "loss": 0.3043, + "step": 17111 + }, + { + "epoch": 1.032094411285947, + "grad_norm": 5.993097305297852, + "learning_rate": 7.757845690009594e-06, + "loss": 0.2806, + "step": 17112 + }, + { + "epoch": 1.0321079761258818, + "grad_norm": 6.6281633377075195, + "learning_rate": 7.757708647389339e-06, + "loss": 0.2193, + "step": 17113 + }, + { + "epoch": 1.0321215409658167, + "grad_norm": 6.638415813446045, + "learning_rate": 7.757571604769084e-06, + "loss": 0.3548, + "step": 17114 + }, + { + "epoch": 1.0321351058057515, + "grad_norm": 7.457523345947266, + "learning_rate": 7.757434562148829e-06, + "loss": 0.3098, + "step": 17115 + }, + { + "epoch": 1.0321486706456864, + "grad_norm": 4.984119415283203, + "learning_rate": 7.757297519528574e-06, + "loss": 0.2286, + "step": 17116 + }, + { + "epoch": 1.0321622354856212, + "grad_norm": 5.174525737762451, + "learning_rate": 7.75716047690832e-06, + "loss": 0.4007, + "step": 17117 + }, + { + "epoch": 1.0321758003255561, + "grad_norm": 5.153576850891113, + "learning_rate": 7.757023434288065e-06, + "loss": 0.3259, + "step": 17118 + }, + { + "epoch": 1.032189365165491, + "grad_norm": 6.873710632324219, + "learning_rate": 7.75688639166781e-06, + "loss": 0.2959, + "step": 17119 + }, + { + "epoch": 1.0322029300054258, + "grad_norm": 5.121338844299316, + "learning_rate": 7.756749349047555e-06, + "loss": 0.3773, + "step": 17120 + }, + { + "epoch": 1.0322164948453607, + "grad_norm": 5.098752498626709, + "learning_rate": 7.756612306427298e-06, + "loss": 0.2972, + "step": 17121 + }, + { + "epoch": 1.0322300596852958, + "grad_norm": 6.8759894371032715, + "learning_rate": 7.756475263807045e-06, + "loss": 0.3176, + "step": 17122 + }, + { + "epoch": 1.0322436245252307, + "grad_norm": 5.415785312652588, + "learning_rate": 7.75633822118679e-06, + "loss": 0.2942, + "step": 17123 + }, + { + "epoch": 1.0322571893651655, + "grad_norm": 5.197279453277588, + "learning_rate": 7.756201178566534e-06, + "loss": 0.3228, + "step": 17124 + }, + { + "epoch": 1.0322707542051004, + "grad_norm": 6.577213764190674, + "learning_rate": 7.756064135946279e-06, + "loss": 0.2441, + "step": 17125 + }, + { + "epoch": 1.0322843190450353, + "grad_norm": 4.536159515380859, + "learning_rate": 7.755927093326026e-06, + "loss": 0.2897, + "step": 17126 + }, + { + "epoch": 1.0322978838849701, + "grad_norm": 6.208682537078857, + "learning_rate": 7.755790050705771e-06, + "loss": 0.2871, + "step": 17127 + }, + { + "epoch": 1.032311448724905, + "grad_norm": 5.020267009735107, + "learning_rate": 7.755653008085515e-06, + "loss": 0.2441, + "step": 17128 + }, + { + "epoch": 1.0323250135648399, + "grad_norm": 6.649623870849609, + "learning_rate": 7.75551596546526e-06, + "loss": 0.3681, + "step": 17129 + }, + { + "epoch": 1.0323385784047747, + "grad_norm": 6.4858717918396, + "learning_rate": 7.755378922845005e-06, + "loss": 0.3143, + "step": 17130 + }, + { + "epoch": 1.0323521432447098, + "grad_norm": 5.619024753570557, + "learning_rate": 7.75524188022475e-06, + "loss": 0.2389, + "step": 17131 + }, + { + "epoch": 1.0323657080846447, + "grad_norm": 6.803748607635498, + "learning_rate": 7.755104837604495e-06, + "loss": 0.3299, + "step": 17132 + }, + { + "epoch": 1.0323792729245795, + "grad_norm": 5.821036338806152, + "learning_rate": 7.75496779498424e-06, + "loss": 0.158, + "step": 17133 + }, + { + "epoch": 1.0323928377645144, + "grad_norm": 5.00742244720459, + "learning_rate": 7.754830752363986e-06, + "loss": 0.2009, + "step": 17134 + }, + { + "epoch": 1.0324064026044493, + "grad_norm": 5.104460716247559, + "learning_rate": 7.754693709743731e-06, + "loss": 0.2522, + "step": 17135 + }, + { + "epoch": 1.0324199674443841, + "grad_norm": 5.375964641571045, + "learning_rate": 7.754556667123476e-06, + "loss": 0.3427, + "step": 17136 + }, + { + "epoch": 1.032433532284319, + "grad_norm": 4.303487300872803, + "learning_rate": 7.754419624503221e-06, + "loss": 0.2266, + "step": 17137 + }, + { + "epoch": 1.0324470971242539, + "grad_norm": 5.975584983825684, + "learning_rate": 7.754282581882966e-06, + "loss": 0.3431, + "step": 17138 + }, + { + "epoch": 1.0324606619641887, + "grad_norm": 6.470440864562988, + "learning_rate": 7.754145539262712e-06, + "loss": 0.3158, + "step": 17139 + }, + { + "epoch": 1.0324742268041238, + "grad_norm": 3.9747581481933594, + "learning_rate": 7.754008496642457e-06, + "loss": 0.2334, + "step": 17140 + }, + { + "epoch": 1.0324877916440587, + "grad_norm": 5.487870216369629, + "learning_rate": 7.753871454022202e-06, + "loss": 0.2722, + "step": 17141 + }, + { + "epoch": 1.0325013564839935, + "grad_norm": 4.335729598999023, + "learning_rate": 7.753734411401947e-06, + "loss": 0.1769, + "step": 17142 + }, + { + "epoch": 1.0325149213239284, + "grad_norm": 6.062392711639404, + "learning_rate": 7.75359736878169e-06, + "loss": 0.5418, + "step": 17143 + }, + { + "epoch": 1.0325284861638633, + "grad_norm": 4.197197437286377, + "learning_rate": 7.753460326161438e-06, + "loss": 0.3109, + "step": 17144 + }, + { + "epoch": 1.0325420510037981, + "grad_norm": 4.861471652984619, + "learning_rate": 7.753323283541183e-06, + "loss": 0.2632, + "step": 17145 + }, + { + "epoch": 1.032555615843733, + "grad_norm": 5.458678722381592, + "learning_rate": 7.753186240920926e-06, + "loss": 0.2836, + "step": 17146 + }, + { + "epoch": 1.0325691806836679, + "grad_norm": 4.3935394287109375, + "learning_rate": 7.753049198300671e-06, + "loss": 0.2299, + "step": 17147 + }, + { + "epoch": 1.0325827455236027, + "grad_norm": 4.66689395904541, + "learning_rate": 7.752912155680418e-06, + "loss": 0.2142, + "step": 17148 + }, + { + "epoch": 1.0325963103635376, + "grad_norm": 6.347103118896484, + "learning_rate": 7.752775113060162e-06, + "loss": 0.4388, + "step": 17149 + }, + { + "epoch": 1.0326098752034727, + "grad_norm": 5.874029636383057, + "learning_rate": 7.752638070439907e-06, + "loss": 0.311, + "step": 17150 + }, + { + "epoch": 1.0326234400434076, + "grad_norm": 5.155548572540283, + "learning_rate": 7.752501027819652e-06, + "loss": 0.2338, + "step": 17151 + }, + { + "epoch": 1.0326370048833424, + "grad_norm": 7.401393890380859, + "learning_rate": 7.752363985199399e-06, + "loss": 0.2509, + "step": 17152 + }, + { + "epoch": 1.0326505697232773, + "grad_norm": 5.0422163009643555, + "learning_rate": 7.752226942579142e-06, + "loss": 0.2175, + "step": 17153 + }, + { + "epoch": 1.0326641345632122, + "grad_norm": 5.950882434844971, + "learning_rate": 7.752089899958888e-06, + "loss": 0.3921, + "step": 17154 + }, + { + "epoch": 1.032677699403147, + "grad_norm": 5.1741766929626465, + "learning_rate": 7.751952857338633e-06, + "loss": 0.2911, + "step": 17155 + }, + { + "epoch": 1.0326912642430819, + "grad_norm": 4.7792205810546875, + "learning_rate": 7.751815814718378e-06, + "loss": 0.2598, + "step": 17156 + }, + { + "epoch": 1.0327048290830168, + "grad_norm": 4.942008972167969, + "learning_rate": 7.751678772098123e-06, + "loss": 0.1823, + "step": 17157 + }, + { + "epoch": 1.0327183939229516, + "grad_norm": 4.211537837982178, + "learning_rate": 7.751541729477868e-06, + "loss": 0.1805, + "step": 17158 + }, + { + "epoch": 1.0327319587628867, + "grad_norm": 3.955711603164673, + "learning_rate": 7.751404686857614e-06, + "loss": 0.2862, + "step": 17159 + }, + { + "epoch": 1.0327455236028216, + "grad_norm": 4.559247016906738, + "learning_rate": 7.751267644237359e-06, + "loss": 0.2394, + "step": 17160 + }, + { + "epoch": 1.0327590884427564, + "grad_norm": 4.615226745605469, + "learning_rate": 7.751130601617104e-06, + "loss": 0.2062, + "step": 17161 + }, + { + "epoch": 1.0327726532826913, + "grad_norm": 4.85961389541626, + "learning_rate": 7.750993558996849e-06, + "loss": 0.2376, + "step": 17162 + }, + { + "epoch": 1.0327862181226262, + "grad_norm": 6.526954650878906, + "learning_rate": 7.750856516376594e-06, + "loss": 0.4129, + "step": 17163 + }, + { + "epoch": 1.032799782962561, + "grad_norm": 3.9947030544281006, + "learning_rate": 7.750719473756338e-06, + "loss": 0.1995, + "step": 17164 + }, + { + "epoch": 1.032813347802496, + "grad_norm": 3.6645846366882324, + "learning_rate": 7.750582431136085e-06, + "loss": 0.188, + "step": 17165 + }, + { + "epoch": 1.0328269126424308, + "grad_norm": 4.962390899658203, + "learning_rate": 7.75044538851583e-06, + "loss": 0.2579, + "step": 17166 + }, + { + "epoch": 1.0328404774823656, + "grad_norm": 4.784311771392822, + "learning_rate": 7.750308345895575e-06, + "loss": 0.2486, + "step": 17167 + }, + { + "epoch": 1.0328540423223005, + "grad_norm": 6.18760347366333, + "learning_rate": 7.750171303275318e-06, + "loss": 0.2379, + "step": 17168 + }, + { + "epoch": 1.0328676071622356, + "grad_norm": 6.394104480743408, + "learning_rate": 7.750034260655064e-06, + "loss": 0.2469, + "step": 17169 + }, + { + "epoch": 1.0328811720021704, + "grad_norm": 6.558886528015137, + "learning_rate": 7.74989721803481e-06, + "loss": 0.3146, + "step": 17170 + }, + { + "epoch": 1.0328947368421053, + "grad_norm": 4.7644877433776855, + "learning_rate": 7.749760175414554e-06, + "loss": 0.207, + "step": 17171 + }, + { + "epoch": 1.0329083016820402, + "grad_norm": 5.922274589538574, + "learning_rate": 7.7496231327943e-06, + "loss": 0.2844, + "step": 17172 + }, + { + "epoch": 1.032921866521975, + "grad_norm": 5.85653829574585, + "learning_rate": 7.749486090174044e-06, + "loss": 0.263, + "step": 17173 + }, + { + "epoch": 1.03293543136191, + "grad_norm": 4.799165725708008, + "learning_rate": 7.74934904755379e-06, + "loss": 0.2748, + "step": 17174 + }, + { + "epoch": 1.0329489962018448, + "grad_norm": 7.206075668334961, + "learning_rate": 7.749212004933535e-06, + "loss": 0.4128, + "step": 17175 + }, + { + "epoch": 1.0329625610417796, + "grad_norm": 6.409852504730225, + "learning_rate": 7.74907496231328e-06, + "loss": 0.3061, + "step": 17176 + }, + { + "epoch": 1.0329761258817145, + "grad_norm": 4.395328044891357, + "learning_rate": 7.748937919693025e-06, + "loss": 0.2736, + "step": 17177 + }, + { + "epoch": 1.0329896907216496, + "grad_norm": 5.2846903800964355, + "learning_rate": 7.74880087707277e-06, + "loss": 0.3028, + "step": 17178 + }, + { + "epoch": 1.0330032555615845, + "grad_norm": 5.44256067276001, + "learning_rate": 7.748663834452515e-06, + "loss": 0.3071, + "step": 17179 + }, + { + "epoch": 1.0330168204015193, + "grad_norm": 5.651653289794922, + "learning_rate": 7.74852679183226e-06, + "loss": 0.2864, + "step": 17180 + }, + { + "epoch": 1.0330303852414542, + "grad_norm": 6.881239414215088, + "learning_rate": 7.748389749212006e-06, + "loss": 0.2924, + "step": 17181 + }, + { + "epoch": 1.033043950081389, + "grad_norm": 5.557915687561035, + "learning_rate": 7.748252706591751e-06, + "loss": 0.3093, + "step": 17182 + }, + { + "epoch": 1.033057514921324, + "grad_norm": 5.143612384796143, + "learning_rate": 7.748115663971496e-06, + "loss": 0.2537, + "step": 17183 + }, + { + "epoch": 1.0330710797612588, + "grad_norm": 4.046596527099609, + "learning_rate": 7.747978621351241e-06, + "loss": 0.2334, + "step": 17184 + }, + { + "epoch": 1.0330846446011936, + "grad_norm": 5.9175333976745605, + "learning_rate": 7.747841578730986e-06, + "loss": 0.2014, + "step": 17185 + }, + { + "epoch": 1.0330982094411285, + "grad_norm": 5.427663803100586, + "learning_rate": 7.74770453611073e-06, + "loss": 0.175, + "step": 17186 + }, + { + "epoch": 1.0331117742810634, + "grad_norm": 5.131260395050049, + "learning_rate": 7.747567493490477e-06, + "loss": 0.3113, + "step": 17187 + }, + { + "epoch": 1.0331253391209985, + "grad_norm": 6.786262035369873, + "learning_rate": 7.747430450870222e-06, + "loss": 0.3321, + "step": 17188 + }, + { + "epoch": 1.0331389039609333, + "grad_norm": 4.554327964782715, + "learning_rate": 7.747293408249966e-06, + "loss": 0.2606, + "step": 17189 + }, + { + "epoch": 1.0331524688008682, + "grad_norm": 4.914264678955078, + "learning_rate": 7.74715636562971e-06, + "loss": 0.3454, + "step": 17190 + }, + { + "epoch": 1.033166033640803, + "grad_norm": 6.060290813446045, + "learning_rate": 7.747019323009458e-06, + "loss": 0.3064, + "step": 17191 + }, + { + "epoch": 1.033179598480738, + "grad_norm": 5.568665504455566, + "learning_rate": 7.746882280389201e-06, + "loss": 0.3744, + "step": 17192 + }, + { + "epoch": 1.0331931633206728, + "grad_norm": 6.198572158813477, + "learning_rate": 7.746745237768946e-06, + "loss": 0.2658, + "step": 17193 + }, + { + "epoch": 1.0332067281606077, + "grad_norm": 4.482203483581543, + "learning_rate": 7.746608195148691e-06, + "loss": 0.2576, + "step": 17194 + }, + { + "epoch": 1.0332202930005425, + "grad_norm": 5.613503932952881, + "learning_rate": 7.746471152528438e-06, + "loss": 0.2816, + "step": 17195 + }, + { + "epoch": 1.0332338578404774, + "grad_norm": 6.003726959228516, + "learning_rate": 7.746334109908182e-06, + "loss": 0.2599, + "step": 17196 + }, + { + "epoch": 1.0332474226804125, + "grad_norm": 5.214609146118164, + "learning_rate": 7.746197067287927e-06, + "loss": 0.3824, + "step": 17197 + }, + { + "epoch": 1.0332609875203473, + "grad_norm": 6.252859592437744, + "learning_rate": 7.746060024667672e-06, + "loss": 0.3899, + "step": 17198 + }, + { + "epoch": 1.0332745523602822, + "grad_norm": 6.238354206085205, + "learning_rate": 7.745922982047417e-06, + "loss": 0.2898, + "step": 17199 + }, + { + "epoch": 1.033288117200217, + "grad_norm": 4.999953269958496, + "learning_rate": 7.745785939427162e-06, + "loss": 0.2519, + "step": 17200 + }, + { + "epoch": 1.033301682040152, + "grad_norm": 5.420114040374756, + "learning_rate": 7.745648896806908e-06, + "loss": 0.2265, + "step": 17201 + }, + { + "epoch": 1.0333152468800868, + "grad_norm": 6.578396320343018, + "learning_rate": 7.745511854186653e-06, + "loss": 0.3238, + "step": 17202 + }, + { + "epoch": 1.0333288117200217, + "grad_norm": 5.678462505340576, + "learning_rate": 7.745374811566398e-06, + "loss": 0.2915, + "step": 17203 + }, + { + "epoch": 1.0333423765599565, + "grad_norm": 4.606773376464844, + "learning_rate": 7.745237768946143e-06, + "loss": 0.1891, + "step": 17204 + }, + { + "epoch": 1.0333559413998914, + "grad_norm": 3.4194555282592773, + "learning_rate": 7.745100726325888e-06, + "loss": 0.2042, + "step": 17205 + }, + { + "epoch": 1.0333695062398263, + "grad_norm": 4.972505569458008, + "learning_rate": 7.744963683705634e-06, + "loss": 0.2484, + "step": 17206 + }, + { + "epoch": 1.0333830710797614, + "grad_norm": 4.7918782234191895, + "learning_rate": 7.744826641085377e-06, + "loss": 0.3381, + "step": 17207 + }, + { + "epoch": 1.0333966359196962, + "grad_norm": 4.4849090576171875, + "learning_rate": 7.744689598465124e-06, + "loss": 0.2619, + "step": 17208 + }, + { + "epoch": 1.033410200759631, + "grad_norm": 6.429628372192383, + "learning_rate": 7.744552555844869e-06, + "loss": 0.4075, + "step": 17209 + }, + { + "epoch": 1.033423765599566, + "grad_norm": 8.181811332702637, + "learning_rate": 7.744415513224614e-06, + "loss": 0.3205, + "step": 17210 + }, + { + "epoch": 1.0334373304395008, + "grad_norm": 4.627436637878418, + "learning_rate": 7.744278470604358e-06, + "loss": 0.184, + "step": 17211 + }, + { + "epoch": 1.0334508952794357, + "grad_norm": 4.840027809143066, + "learning_rate": 7.744141427984103e-06, + "loss": 0.2772, + "step": 17212 + }, + { + "epoch": 1.0334644601193705, + "grad_norm": 3.8820035457611084, + "learning_rate": 7.74400438536385e-06, + "loss": 0.1959, + "step": 17213 + }, + { + "epoch": 1.0334780249593054, + "grad_norm": 8.456530570983887, + "learning_rate": 7.743867342743593e-06, + "loss": 0.3909, + "step": 17214 + }, + { + "epoch": 1.0334915897992403, + "grad_norm": 4.517323970794678, + "learning_rate": 7.743730300123338e-06, + "loss": 0.284, + "step": 17215 + }, + { + "epoch": 1.0335051546391754, + "grad_norm": 6.2405548095703125, + "learning_rate": 7.743593257503084e-06, + "loss": 0.2268, + "step": 17216 + }, + { + "epoch": 1.0335187194791102, + "grad_norm": 7.227353096008301, + "learning_rate": 7.743456214882829e-06, + "loss": 0.485, + "step": 17217 + }, + { + "epoch": 1.033532284319045, + "grad_norm": 8.288671493530273, + "learning_rate": 7.743319172262574e-06, + "loss": 0.4049, + "step": 17218 + }, + { + "epoch": 1.03354584915898, + "grad_norm": 4.73986291885376, + "learning_rate": 7.74318212964232e-06, + "loss": 0.2083, + "step": 17219 + }, + { + "epoch": 1.0335594139989148, + "grad_norm": 6.863632678985596, + "learning_rate": 7.743045087022064e-06, + "loss": 0.255, + "step": 17220 + }, + { + "epoch": 1.0335729788388497, + "grad_norm": 5.439537048339844, + "learning_rate": 7.74290804440181e-06, + "loss": 0.2486, + "step": 17221 + }, + { + "epoch": 1.0335865436787846, + "grad_norm": 4.3593010902404785, + "learning_rate": 7.742771001781555e-06, + "loss": 0.2702, + "step": 17222 + }, + { + "epoch": 1.0336001085187194, + "grad_norm": 6.070357322692871, + "learning_rate": 7.7426339591613e-06, + "loss": 0.3134, + "step": 17223 + }, + { + "epoch": 1.0336136733586543, + "grad_norm": 7.123726844787598, + "learning_rate": 7.742496916541045e-06, + "loss": 0.3113, + "step": 17224 + }, + { + "epoch": 1.0336272381985894, + "grad_norm": 6.2620744705200195, + "learning_rate": 7.74235987392079e-06, + "loss": 0.3188, + "step": 17225 + }, + { + "epoch": 1.0336408030385242, + "grad_norm": 6.091019153594971, + "learning_rate": 7.742222831300535e-06, + "loss": 0.4097, + "step": 17226 + }, + { + "epoch": 1.033654367878459, + "grad_norm": 5.324513912200928, + "learning_rate": 7.74208578868028e-06, + "loss": 0.2941, + "step": 17227 + }, + { + "epoch": 1.033667932718394, + "grad_norm": 5.983955383300781, + "learning_rate": 7.741948746060026e-06, + "loss": 0.3833, + "step": 17228 + }, + { + "epoch": 1.0336814975583288, + "grad_norm": 4.517548561096191, + "learning_rate": 7.74181170343977e-06, + "loss": 0.2442, + "step": 17229 + }, + { + "epoch": 1.0336950623982637, + "grad_norm": 5.252557754516602, + "learning_rate": 7.741674660819516e-06, + "loss": 0.3108, + "step": 17230 + }, + { + "epoch": 1.0337086272381986, + "grad_norm": 7.071165561676025, + "learning_rate": 7.741537618199261e-06, + "loss": 0.354, + "step": 17231 + }, + { + "epoch": 1.0337221920781334, + "grad_norm": 4.192379951477051, + "learning_rate": 7.741400575579005e-06, + "loss": 0.1608, + "step": 17232 + }, + { + "epoch": 1.0337357569180683, + "grad_norm": 4.099571228027344, + "learning_rate": 7.74126353295875e-06, + "loss": 0.1835, + "step": 17233 + }, + { + "epoch": 1.0337493217580032, + "grad_norm": 5.549790859222412, + "learning_rate": 7.741126490338497e-06, + "loss": 0.2348, + "step": 17234 + }, + { + "epoch": 1.0337628865979382, + "grad_norm": 4.420032978057861, + "learning_rate": 7.740989447718242e-06, + "loss": 0.2115, + "step": 17235 + }, + { + "epoch": 1.0337764514378731, + "grad_norm": 6.250587463378906, + "learning_rate": 7.740852405097986e-06, + "loss": 0.2648, + "step": 17236 + }, + { + "epoch": 1.033790016277808, + "grad_norm": 5.362991809844971, + "learning_rate": 7.74071536247773e-06, + "loss": 0.2751, + "step": 17237 + }, + { + "epoch": 1.0338035811177428, + "grad_norm": 6.129626750946045, + "learning_rate": 7.740578319857476e-06, + "loss": 0.3526, + "step": 17238 + }, + { + "epoch": 1.0338171459576777, + "grad_norm": 3.8000495433807373, + "learning_rate": 7.740441277237221e-06, + "loss": 0.2217, + "step": 17239 + }, + { + "epoch": 1.0338307107976126, + "grad_norm": 5.138679027557373, + "learning_rate": 7.740304234616966e-06, + "loss": 0.3545, + "step": 17240 + }, + { + "epoch": 1.0338442756375474, + "grad_norm": 5.047530174255371, + "learning_rate": 7.740167191996711e-06, + "loss": 0.2297, + "step": 17241 + }, + { + "epoch": 1.0338578404774823, + "grad_norm": 5.751160144805908, + "learning_rate": 7.740030149376457e-06, + "loss": 0.3648, + "step": 17242 + }, + { + "epoch": 1.0338714053174172, + "grad_norm": 4.813157081604004, + "learning_rate": 7.739893106756202e-06, + "loss": 0.2451, + "step": 17243 + }, + { + "epoch": 1.0338849701573523, + "grad_norm": 5.023704528808594, + "learning_rate": 7.739756064135947e-06, + "loss": 0.2605, + "step": 17244 + }, + { + "epoch": 1.0338985349972871, + "grad_norm": 5.375853538513184, + "learning_rate": 7.739619021515692e-06, + "loss": 0.3451, + "step": 17245 + }, + { + "epoch": 1.033912099837222, + "grad_norm": 6.492523193359375, + "learning_rate": 7.739481978895437e-06, + "loss": 0.2996, + "step": 17246 + }, + { + "epoch": 1.0339256646771569, + "grad_norm": 4.697591304779053, + "learning_rate": 7.739344936275182e-06, + "loss": 0.2113, + "step": 17247 + }, + { + "epoch": 1.0339392295170917, + "grad_norm": 5.646881103515625, + "learning_rate": 7.739207893654928e-06, + "loss": 0.3666, + "step": 17248 + }, + { + "epoch": 1.0339527943570266, + "grad_norm": 5.909848690032959, + "learning_rate": 7.739070851034673e-06, + "loss": 0.358, + "step": 17249 + }, + { + "epoch": 1.0339663591969614, + "grad_norm": 3.663059711456299, + "learning_rate": 7.738933808414418e-06, + "loss": 0.1528, + "step": 17250 + }, + { + "epoch": 1.0339799240368963, + "grad_norm": 5.665704250335693, + "learning_rate": 7.738796765794163e-06, + "loss": 0.3199, + "step": 17251 + }, + { + "epoch": 1.0339934888768312, + "grad_norm": 5.230912208557129, + "learning_rate": 7.738659723173908e-06, + "loss": 0.2496, + "step": 17252 + }, + { + "epoch": 1.034007053716766, + "grad_norm": 6.0922160148620605, + "learning_rate": 7.738522680553654e-06, + "loss": 0.3001, + "step": 17253 + }, + { + "epoch": 1.0340206185567011, + "grad_norm": 4.764433860778809, + "learning_rate": 7.738385637933397e-06, + "loss": 0.192, + "step": 17254 + }, + { + "epoch": 1.034034183396636, + "grad_norm": 6.1218061447143555, + "learning_rate": 7.738248595313142e-06, + "loss": 0.2926, + "step": 17255 + }, + { + "epoch": 1.0340477482365709, + "grad_norm": 4.859133720397949, + "learning_rate": 7.738111552692889e-06, + "loss": 0.1877, + "step": 17256 + }, + { + "epoch": 1.0340613130765057, + "grad_norm": 4.905460834503174, + "learning_rate": 7.737974510072633e-06, + "loss": 0.2548, + "step": 17257 + }, + { + "epoch": 1.0340748779164406, + "grad_norm": 6.383270263671875, + "learning_rate": 7.737837467452378e-06, + "loss": 0.3098, + "step": 17258 + }, + { + "epoch": 1.0340884427563755, + "grad_norm": 5.690480709075928, + "learning_rate": 7.737700424832123e-06, + "loss": 0.1892, + "step": 17259 + }, + { + "epoch": 1.0341020075963103, + "grad_norm": 2.9637725353240967, + "learning_rate": 7.73756338221187e-06, + "loss": 0.093, + "step": 17260 + }, + { + "epoch": 1.0341155724362452, + "grad_norm": 7.432221412658691, + "learning_rate": 7.737426339591613e-06, + "loss": 0.3409, + "step": 17261 + }, + { + "epoch": 1.03412913727618, + "grad_norm": 4.176438331604004, + "learning_rate": 7.737289296971358e-06, + "loss": 0.2379, + "step": 17262 + }, + { + "epoch": 1.0341427021161151, + "grad_norm": 3.764690399169922, + "learning_rate": 7.737152254351104e-06, + "loss": 0.1415, + "step": 17263 + }, + { + "epoch": 1.03415626695605, + "grad_norm": 4.301392555236816, + "learning_rate": 7.737015211730849e-06, + "loss": 0.2065, + "step": 17264 + }, + { + "epoch": 1.0341698317959849, + "grad_norm": 4.882274150848389, + "learning_rate": 7.736878169110594e-06, + "loss": 0.2591, + "step": 17265 + }, + { + "epoch": 1.0341833966359197, + "grad_norm": 7.180318832397461, + "learning_rate": 7.73674112649034e-06, + "loss": 0.3545, + "step": 17266 + }, + { + "epoch": 1.0341969614758546, + "grad_norm": 4.936075210571289, + "learning_rate": 7.736604083870084e-06, + "loss": 0.2087, + "step": 17267 + }, + { + "epoch": 1.0342105263157895, + "grad_norm": 4.011687278747559, + "learning_rate": 7.73646704124983e-06, + "loss": 0.1472, + "step": 17268 + }, + { + "epoch": 1.0342240911557243, + "grad_norm": 4.886457443237305, + "learning_rate": 7.736329998629575e-06, + "loss": 0.1652, + "step": 17269 + }, + { + "epoch": 1.0342376559956592, + "grad_norm": 5.706419467926025, + "learning_rate": 7.73619295600932e-06, + "loss": 0.2096, + "step": 17270 + }, + { + "epoch": 1.034251220835594, + "grad_norm": 5.196605682373047, + "learning_rate": 7.736055913389065e-06, + "loss": 0.3966, + "step": 17271 + }, + { + "epoch": 1.034264785675529, + "grad_norm": 5.2410993576049805, + "learning_rate": 7.735918870768809e-06, + "loss": 0.1293, + "step": 17272 + }, + { + "epoch": 1.034278350515464, + "grad_norm": 5.44114875793457, + "learning_rate": 7.735781828148555e-06, + "loss": 0.2172, + "step": 17273 + }, + { + "epoch": 1.0342919153553989, + "grad_norm": 5.632103443145752, + "learning_rate": 7.7356447855283e-06, + "loss": 0.3453, + "step": 17274 + }, + { + "epoch": 1.0343054801953337, + "grad_norm": 5.227295398712158, + "learning_rate": 7.735507742908046e-06, + "loss": 0.1625, + "step": 17275 + }, + { + "epoch": 1.0343190450352686, + "grad_norm": 6.286929130554199, + "learning_rate": 7.73537070028779e-06, + "loss": 0.2514, + "step": 17276 + }, + { + "epoch": 1.0343326098752035, + "grad_norm": 4.530883312225342, + "learning_rate": 7.735233657667536e-06, + "loss": 0.2706, + "step": 17277 + }, + { + "epoch": 1.0343461747151383, + "grad_norm": 6.0618462562561035, + "learning_rate": 7.735096615047281e-06, + "loss": 0.2756, + "step": 17278 + }, + { + "epoch": 1.0343597395550732, + "grad_norm": 5.897520065307617, + "learning_rate": 7.734959572427025e-06, + "loss": 0.2887, + "step": 17279 + }, + { + "epoch": 1.034373304395008, + "grad_norm": 3.9956772327423096, + "learning_rate": 7.73482252980677e-06, + "loss": 0.1574, + "step": 17280 + }, + { + "epoch": 1.034386869234943, + "grad_norm": 6.3073296546936035, + "learning_rate": 7.734685487186515e-06, + "loss": 0.2832, + "step": 17281 + }, + { + "epoch": 1.034400434074878, + "grad_norm": 6.317028999328613, + "learning_rate": 7.73454844456626e-06, + "loss": 0.2456, + "step": 17282 + }, + { + "epoch": 1.034413998914813, + "grad_norm": 3.7434701919555664, + "learning_rate": 7.734411401946006e-06, + "loss": 0.1227, + "step": 17283 + }, + { + "epoch": 1.0344275637547478, + "grad_norm": 5.97585916519165, + "learning_rate": 7.73427435932575e-06, + "loss": 0.2432, + "step": 17284 + }, + { + "epoch": 1.0344411285946826, + "grad_norm": 4.5258073806762695, + "learning_rate": 7.734137316705496e-06, + "loss": 0.2229, + "step": 17285 + }, + { + "epoch": 1.0344546934346175, + "grad_norm": 3.3347675800323486, + "learning_rate": 7.734000274085241e-06, + "loss": 0.1966, + "step": 17286 + }, + { + "epoch": 1.0344682582745524, + "grad_norm": 5.087466239929199, + "learning_rate": 7.733863231464986e-06, + "loss": 0.2252, + "step": 17287 + }, + { + "epoch": 1.0344818231144872, + "grad_norm": 4.903646945953369, + "learning_rate": 7.733726188844731e-06, + "loss": 0.2182, + "step": 17288 + }, + { + "epoch": 1.034495387954422, + "grad_norm": 4.521949768066406, + "learning_rate": 7.733589146224477e-06, + "loss": 0.1538, + "step": 17289 + }, + { + "epoch": 1.034508952794357, + "grad_norm": 4.955201148986816, + "learning_rate": 7.733452103604222e-06, + "loss": 0.2686, + "step": 17290 + }, + { + "epoch": 1.0345225176342918, + "grad_norm": 4.18762731552124, + "learning_rate": 7.733315060983967e-06, + "loss": 0.2411, + "step": 17291 + }, + { + "epoch": 1.034536082474227, + "grad_norm": 5.6754326820373535, + "learning_rate": 7.733178018363712e-06, + "loss": 0.2569, + "step": 17292 + }, + { + "epoch": 1.0345496473141618, + "grad_norm": 2.7254252433776855, + "learning_rate": 7.733040975743457e-06, + "loss": 0.1637, + "step": 17293 + }, + { + "epoch": 1.0345632121540966, + "grad_norm": 5.415192604064941, + "learning_rate": 7.7329039331232e-06, + "loss": 0.3395, + "step": 17294 + }, + { + "epoch": 1.0345767769940315, + "grad_norm": 3.121965169906616, + "learning_rate": 7.732766890502948e-06, + "loss": 0.1571, + "step": 17295 + }, + { + "epoch": 1.0345903418339664, + "grad_norm": 4.668130397796631, + "learning_rate": 7.732629847882693e-06, + "loss": 0.2897, + "step": 17296 + }, + { + "epoch": 1.0346039066739012, + "grad_norm": 3.9517548084259033, + "learning_rate": 7.732492805262436e-06, + "loss": 0.2237, + "step": 17297 + }, + { + "epoch": 1.034617471513836, + "grad_norm": 4.596060276031494, + "learning_rate": 7.732355762642182e-06, + "loss": 0.3196, + "step": 17298 + }, + { + "epoch": 1.034631036353771, + "grad_norm": 3.631505012512207, + "learning_rate": 7.732218720021928e-06, + "loss": 0.1649, + "step": 17299 + }, + { + "epoch": 1.0346446011937058, + "grad_norm": 4.569540977478027, + "learning_rate": 7.732081677401672e-06, + "loss": 0.2214, + "step": 17300 + }, + { + "epoch": 1.034658166033641, + "grad_norm": 5.539119243621826, + "learning_rate": 7.731944634781417e-06, + "loss": 0.1935, + "step": 17301 + }, + { + "epoch": 1.0346717308735758, + "grad_norm": 4.794641017913818, + "learning_rate": 7.731807592161162e-06, + "loss": 0.201, + "step": 17302 + }, + { + "epoch": 1.0346852957135106, + "grad_norm": 4.0232110023498535, + "learning_rate": 7.731670549540909e-06, + "loss": 0.2058, + "step": 17303 + }, + { + "epoch": 1.0346988605534455, + "grad_norm": 4.230726718902588, + "learning_rate": 7.731533506920653e-06, + "loss": 0.1333, + "step": 17304 + }, + { + "epoch": 1.0347124253933804, + "grad_norm": 5.141375541687012, + "learning_rate": 7.731396464300398e-06, + "loss": 0.2997, + "step": 17305 + }, + { + "epoch": 1.0347259902333152, + "grad_norm": 5.6615309715271, + "learning_rate": 7.731259421680143e-06, + "loss": 0.3923, + "step": 17306 + }, + { + "epoch": 1.03473955507325, + "grad_norm": 3.762012243270874, + "learning_rate": 7.731122379059888e-06, + "loss": 0.1862, + "step": 17307 + }, + { + "epoch": 1.034753119913185, + "grad_norm": 4.519357681274414, + "learning_rate": 7.730985336439633e-06, + "loss": 0.2706, + "step": 17308 + }, + { + "epoch": 1.0347666847531198, + "grad_norm": 6.078516006469727, + "learning_rate": 7.730848293819378e-06, + "loss": 0.3171, + "step": 17309 + }, + { + "epoch": 1.0347802495930547, + "grad_norm": 4.379427909851074, + "learning_rate": 7.730711251199124e-06, + "loss": 0.2667, + "step": 17310 + }, + { + "epoch": 1.0347938144329898, + "grad_norm": 5.024237632751465, + "learning_rate": 7.730574208578869e-06, + "loss": 0.2514, + "step": 17311 + }, + { + "epoch": 1.0348073792729247, + "grad_norm": 4.476958751678467, + "learning_rate": 7.730437165958614e-06, + "loss": 0.2184, + "step": 17312 + }, + { + "epoch": 1.0348209441128595, + "grad_norm": 5.964646339416504, + "learning_rate": 7.73030012333836e-06, + "loss": 0.2933, + "step": 17313 + }, + { + "epoch": 1.0348345089527944, + "grad_norm": 5.072401523590088, + "learning_rate": 7.730163080718104e-06, + "loss": 0.3237, + "step": 17314 + }, + { + "epoch": 1.0348480737927293, + "grad_norm": 5.291253089904785, + "learning_rate": 7.730026038097848e-06, + "loss": 0.26, + "step": 17315 + }, + { + "epoch": 1.0348616386326641, + "grad_norm": 8.473931312561035, + "learning_rate": 7.729888995477595e-06, + "loss": 0.3507, + "step": 17316 + }, + { + "epoch": 1.034875203472599, + "grad_norm": 7.288971900939941, + "learning_rate": 7.72975195285734e-06, + "loss": 0.2956, + "step": 17317 + }, + { + "epoch": 1.0348887683125338, + "grad_norm": 5.1445088386535645, + "learning_rate": 7.729614910237085e-06, + "loss": 0.2539, + "step": 17318 + }, + { + "epoch": 1.0349023331524687, + "grad_norm": 6.594064712524414, + "learning_rate": 7.729477867616829e-06, + "loss": 0.2609, + "step": 17319 + }, + { + "epoch": 1.0349158979924038, + "grad_norm": 4.389289855957031, + "learning_rate": 7.729340824996575e-06, + "loss": 0.2297, + "step": 17320 + }, + { + "epoch": 1.0349294628323387, + "grad_norm": 5.271124839782715, + "learning_rate": 7.72920378237632e-06, + "loss": 0.2486, + "step": 17321 + }, + { + "epoch": 1.0349430276722735, + "grad_norm": 5.57600736618042, + "learning_rate": 7.729066739756064e-06, + "loss": 0.3404, + "step": 17322 + }, + { + "epoch": 1.0349565925122084, + "grad_norm": 4.08729362487793, + "learning_rate": 7.72892969713581e-06, + "loss": 0.2057, + "step": 17323 + }, + { + "epoch": 1.0349701573521433, + "grad_norm": 4.561936378479004, + "learning_rate": 7.728792654515554e-06, + "loss": 0.2373, + "step": 17324 + }, + { + "epoch": 1.0349837221920781, + "grad_norm": 5.790203094482422, + "learning_rate": 7.7286556118953e-06, + "loss": 0.323, + "step": 17325 + }, + { + "epoch": 1.034997287032013, + "grad_norm": 3.9158217906951904, + "learning_rate": 7.728518569275045e-06, + "loss": 0.1755, + "step": 17326 + }, + { + "epoch": 1.0350108518719479, + "grad_norm": 7.132404327392578, + "learning_rate": 7.72838152665479e-06, + "loss": 0.4025, + "step": 17327 + }, + { + "epoch": 1.0350244167118827, + "grad_norm": 5.0948662757873535, + "learning_rate": 7.728244484034535e-06, + "loss": 0.2283, + "step": 17328 + }, + { + "epoch": 1.0350379815518176, + "grad_norm": 4.116907119750977, + "learning_rate": 7.72810744141428e-06, + "loss": 0.213, + "step": 17329 + }, + { + "epoch": 1.0350515463917527, + "grad_norm": 5.575109481811523, + "learning_rate": 7.727970398794026e-06, + "loss": 0.2954, + "step": 17330 + }, + { + "epoch": 1.0350651112316875, + "grad_norm": 5.79569149017334, + "learning_rate": 7.72783335617377e-06, + "loss": 0.3821, + "step": 17331 + }, + { + "epoch": 1.0350786760716224, + "grad_norm": 7.100188732147217, + "learning_rate": 7.727696313553516e-06, + "loss": 0.3945, + "step": 17332 + }, + { + "epoch": 1.0350922409115573, + "grad_norm": 6.694638729095459, + "learning_rate": 7.727559270933261e-06, + "loss": 0.4054, + "step": 17333 + }, + { + "epoch": 1.0351058057514921, + "grad_norm": 6.991446495056152, + "learning_rate": 7.727422228313006e-06, + "loss": 0.4038, + "step": 17334 + }, + { + "epoch": 1.035119370591427, + "grad_norm": 5.594857692718506, + "learning_rate": 7.727285185692751e-06, + "loss": 0.3816, + "step": 17335 + }, + { + "epoch": 1.0351329354313619, + "grad_norm": 5.071791648864746, + "learning_rate": 7.727148143072497e-06, + "loss": 0.3302, + "step": 17336 + }, + { + "epoch": 1.0351465002712967, + "grad_norm": 5.158814430236816, + "learning_rate": 7.72701110045224e-06, + "loss": 0.2729, + "step": 17337 + }, + { + "epoch": 1.0351600651112316, + "grad_norm": 5.323867321014404, + "learning_rate": 7.726874057831987e-06, + "loss": 0.2707, + "step": 17338 + }, + { + "epoch": 1.0351736299511667, + "grad_norm": 6.028167724609375, + "learning_rate": 7.726737015211732e-06, + "loss": 0.2268, + "step": 17339 + }, + { + "epoch": 1.0351871947911016, + "grad_norm": 5.624847888946533, + "learning_rate": 7.726599972591476e-06, + "loss": 0.2115, + "step": 17340 + }, + { + "epoch": 1.0352007596310364, + "grad_norm": 6.013112545013428, + "learning_rate": 7.72646292997122e-06, + "loss": 0.2474, + "step": 17341 + }, + { + "epoch": 1.0352143244709713, + "grad_norm": 6.966805458068848, + "learning_rate": 7.726325887350968e-06, + "loss": 0.4416, + "step": 17342 + }, + { + "epoch": 1.0352278893109061, + "grad_norm": 8.288679122924805, + "learning_rate": 7.726188844730713e-06, + "loss": 0.4322, + "step": 17343 + }, + { + "epoch": 1.035241454150841, + "grad_norm": 5.454951763153076, + "learning_rate": 7.726051802110456e-06, + "loss": 0.2918, + "step": 17344 + }, + { + "epoch": 1.0352550189907759, + "grad_norm": 4.550121307373047, + "learning_rate": 7.725914759490202e-06, + "loss": 0.1931, + "step": 17345 + }, + { + "epoch": 1.0352685838307107, + "grad_norm": 5.550687313079834, + "learning_rate": 7.725777716869948e-06, + "loss": 0.3004, + "step": 17346 + }, + { + "epoch": 1.0352821486706456, + "grad_norm": 4.955684661865234, + "learning_rate": 7.725640674249692e-06, + "loss": 0.2608, + "step": 17347 + }, + { + "epoch": 1.0352957135105805, + "grad_norm": 7.243487358093262, + "learning_rate": 7.725503631629437e-06, + "loss": 0.334, + "step": 17348 + }, + { + "epoch": 1.0353092783505156, + "grad_norm": 4.881293773651123, + "learning_rate": 7.725366589009182e-06, + "loss": 0.1836, + "step": 17349 + }, + { + "epoch": 1.0353228431904504, + "grad_norm": 5.450257778167725, + "learning_rate": 7.725229546388927e-06, + "loss": 0.2995, + "step": 17350 + }, + { + "epoch": 1.0353364080303853, + "grad_norm": 4.6050214767456055, + "learning_rate": 7.725092503768673e-06, + "loss": 0.2388, + "step": 17351 + }, + { + "epoch": 1.0353499728703202, + "grad_norm": 5.735688209533691, + "learning_rate": 7.724955461148418e-06, + "loss": 0.2814, + "step": 17352 + }, + { + "epoch": 1.035363537710255, + "grad_norm": 5.301875591278076, + "learning_rate": 7.724818418528163e-06, + "loss": 0.3074, + "step": 17353 + }, + { + "epoch": 1.0353771025501899, + "grad_norm": 5.28073787689209, + "learning_rate": 7.724681375907908e-06, + "loss": 0.285, + "step": 17354 + }, + { + "epoch": 1.0353906673901248, + "grad_norm": 5.460992813110352, + "learning_rate": 7.724544333287653e-06, + "loss": 0.3163, + "step": 17355 + }, + { + "epoch": 1.0354042322300596, + "grad_norm": 5.936747074127197, + "learning_rate": 7.724407290667399e-06, + "loss": 0.3184, + "step": 17356 + }, + { + "epoch": 1.0354177970699945, + "grad_norm": 4.704186916351318, + "learning_rate": 7.724270248047144e-06, + "loss": 0.2952, + "step": 17357 + }, + { + "epoch": 1.0354313619099296, + "grad_norm": 4.957713603973389, + "learning_rate": 7.724133205426889e-06, + "loss": 0.21, + "step": 17358 + }, + { + "epoch": 1.0354449267498644, + "grad_norm": 7.196929931640625, + "learning_rate": 7.723996162806634e-06, + "loss": 0.3097, + "step": 17359 + }, + { + "epoch": 1.0354584915897993, + "grad_norm": 4.709621906280518, + "learning_rate": 7.72385912018638e-06, + "loss": 0.2713, + "step": 17360 + }, + { + "epoch": 1.0354720564297342, + "grad_norm": 7.8479323387146, + "learning_rate": 7.723722077566124e-06, + "loss": 0.3512, + "step": 17361 + }, + { + "epoch": 1.035485621269669, + "grad_norm": 5.838984489440918, + "learning_rate": 7.723585034945868e-06, + "loss": 0.3266, + "step": 17362 + }, + { + "epoch": 1.035499186109604, + "grad_norm": 5.364752769470215, + "learning_rate": 7.723447992325613e-06, + "loss": 0.2698, + "step": 17363 + }, + { + "epoch": 1.0355127509495388, + "grad_norm": 6.281554698944092, + "learning_rate": 7.72331094970536e-06, + "loss": 0.3126, + "step": 17364 + }, + { + "epoch": 1.0355263157894736, + "grad_norm": 6.415921211242676, + "learning_rate": 7.723173907085103e-06, + "loss": 0.2669, + "step": 17365 + }, + { + "epoch": 1.0355398806294085, + "grad_norm": 5.633338451385498, + "learning_rate": 7.723036864464849e-06, + "loss": 0.2637, + "step": 17366 + }, + { + "epoch": 1.0355534454693434, + "grad_norm": 5.1642656326293945, + "learning_rate": 7.722899821844594e-06, + "loss": 0.2108, + "step": 17367 + }, + { + "epoch": 1.0355670103092784, + "grad_norm": 5.193124771118164, + "learning_rate": 7.722762779224339e-06, + "loss": 0.2878, + "step": 17368 + }, + { + "epoch": 1.0355805751492133, + "grad_norm": 5.761780738830566, + "learning_rate": 7.722625736604084e-06, + "loss": 0.2547, + "step": 17369 + }, + { + "epoch": 1.0355941399891482, + "grad_norm": 5.18543815612793, + "learning_rate": 7.72248869398383e-06, + "loss": 0.2002, + "step": 17370 + }, + { + "epoch": 1.035607704829083, + "grad_norm": 4.359597682952881, + "learning_rate": 7.722351651363575e-06, + "loss": 0.2133, + "step": 17371 + }, + { + "epoch": 1.035621269669018, + "grad_norm": 6.879873752593994, + "learning_rate": 7.72221460874332e-06, + "loss": 0.3285, + "step": 17372 + }, + { + "epoch": 1.0356348345089528, + "grad_norm": 8.185608863830566, + "learning_rate": 7.722077566123065e-06, + "loss": 0.3829, + "step": 17373 + }, + { + "epoch": 1.0356483993488876, + "grad_norm": 7.348135471343994, + "learning_rate": 7.72194052350281e-06, + "loss": 0.3501, + "step": 17374 + }, + { + "epoch": 1.0356619641888225, + "grad_norm": 7.083847522735596, + "learning_rate": 7.721803480882555e-06, + "loss": 0.2878, + "step": 17375 + }, + { + "epoch": 1.0356755290287574, + "grad_norm": 6.914696216583252, + "learning_rate": 7.7216664382623e-06, + "loss": 0.3445, + "step": 17376 + }, + { + "epoch": 1.0356890938686925, + "grad_norm": 6.985241889953613, + "learning_rate": 7.721529395642046e-06, + "loss": 0.2855, + "step": 17377 + }, + { + "epoch": 1.0357026587086273, + "grad_norm": 5.859086036682129, + "learning_rate": 7.72139235302179e-06, + "loss": 0.21, + "step": 17378 + }, + { + "epoch": 1.0357162235485622, + "grad_norm": 7.78163480758667, + "learning_rate": 7.721255310401536e-06, + "loss": 0.3146, + "step": 17379 + }, + { + "epoch": 1.035729788388497, + "grad_norm": 5.176403045654297, + "learning_rate": 7.72111826778128e-06, + "loss": 0.399, + "step": 17380 + }, + { + "epoch": 1.035743353228432, + "grad_norm": 4.85769510269165, + "learning_rate": 7.720981225161026e-06, + "loss": 0.3316, + "step": 17381 + }, + { + "epoch": 1.0357569180683668, + "grad_norm": 5.339495658874512, + "learning_rate": 7.720844182540771e-06, + "loss": 0.2356, + "step": 17382 + }, + { + "epoch": 1.0357704829083016, + "grad_norm": 4.840524673461914, + "learning_rate": 7.720707139920515e-06, + "loss": 0.2415, + "step": 17383 + }, + { + "epoch": 1.0357840477482365, + "grad_norm": 6.163599967956543, + "learning_rate": 7.72057009730026e-06, + "loss": 0.3196, + "step": 17384 + }, + { + "epoch": 1.0357976125881714, + "grad_norm": 5.050950050354004, + "learning_rate": 7.720433054680007e-06, + "loss": 0.1837, + "step": 17385 + }, + { + "epoch": 1.0358111774281062, + "grad_norm": 4.974164962768555, + "learning_rate": 7.720296012059752e-06, + "loss": 0.2657, + "step": 17386 + }, + { + "epoch": 1.0358247422680413, + "grad_norm": 5.36498498916626, + "learning_rate": 7.720158969439496e-06, + "loss": 0.3046, + "step": 17387 + }, + { + "epoch": 1.0358383071079762, + "grad_norm": 5.471604824066162, + "learning_rate": 7.720021926819241e-06, + "loss": 0.2758, + "step": 17388 + }, + { + "epoch": 1.035851871947911, + "grad_norm": 6.4086594581604, + "learning_rate": 7.719884884198986e-06, + "loss": 0.3213, + "step": 17389 + }, + { + "epoch": 1.035865436787846, + "grad_norm": 5.378392219543457, + "learning_rate": 7.719747841578731e-06, + "loss": 0.2714, + "step": 17390 + }, + { + "epoch": 1.0358790016277808, + "grad_norm": 6.972303867340088, + "learning_rate": 7.719610798958476e-06, + "loss": 0.3172, + "step": 17391 + }, + { + "epoch": 1.0358925664677157, + "grad_norm": 6.309717655181885, + "learning_rate": 7.719473756338222e-06, + "loss": 0.4298, + "step": 17392 + }, + { + "epoch": 1.0359061313076505, + "grad_norm": 5.58989953994751, + "learning_rate": 7.719336713717967e-06, + "loss": 0.283, + "step": 17393 + }, + { + "epoch": 1.0359196961475854, + "grad_norm": 7.103089809417725, + "learning_rate": 7.719199671097712e-06, + "loss": 0.355, + "step": 17394 + }, + { + "epoch": 1.0359332609875203, + "grad_norm": 4.622056007385254, + "learning_rate": 7.719062628477457e-06, + "loss": 0.2484, + "step": 17395 + }, + { + "epoch": 1.0359468258274553, + "grad_norm": 5.010159015655518, + "learning_rate": 7.718925585857202e-06, + "loss": 0.3335, + "step": 17396 + }, + { + "epoch": 1.0359603906673902, + "grad_norm": 3.556124687194824, + "learning_rate": 7.718788543236947e-06, + "loss": 0.2243, + "step": 17397 + }, + { + "epoch": 1.035973955507325, + "grad_norm": 5.7297282218933105, + "learning_rate": 7.718651500616693e-06, + "loss": 0.288, + "step": 17398 + }, + { + "epoch": 1.03598752034726, + "grad_norm": 5.8542985916137695, + "learning_rate": 7.718514457996438e-06, + "loss": 0.3696, + "step": 17399 + }, + { + "epoch": 1.0360010851871948, + "grad_norm": 4.444614887237549, + "learning_rate": 7.718377415376183e-06, + "loss": 0.1844, + "step": 17400 + }, + { + "epoch": 1.0360146500271297, + "grad_norm": 5.891427040100098, + "learning_rate": 7.718240372755928e-06, + "loss": 0.227, + "step": 17401 + }, + { + "epoch": 1.0360282148670645, + "grad_norm": 5.8394670486450195, + "learning_rate": 7.718103330135673e-06, + "loss": 0.2793, + "step": 17402 + }, + { + "epoch": 1.0360417797069994, + "grad_norm": 6.853301525115967, + "learning_rate": 7.717966287515419e-06, + "loss": 0.374, + "step": 17403 + }, + { + "epoch": 1.0360553445469343, + "grad_norm": 5.0164899826049805, + "learning_rate": 7.717829244895164e-06, + "loss": 0.3617, + "step": 17404 + }, + { + "epoch": 1.0360689093868691, + "grad_norm": 5.458151817321777, + "learning_rate": 7.717692202274907e-06, + "loss": 0.301, + "step": 17405 + }, + { + "epoch": 1.0360824742268042, + "grad_norm": 5.59798526763916, + "learning_rate": 7.717555159654652e-06, + "loss": 0.3991, + "step": 17406 + }, + { + "epoch": 1.036096039066739, + "grad_norm": 6.80183219909668, + "learning_rate": 7.7174181170344e-06, + "loss": 0.4125, + "step": 17407 + }, + { + "epoch": 1.036109603906674, + "grad_norm": 3.8867897987365723, + "learning_rate": 7.717281074414143e-06, + "loss": 0.1932, + "step": 17408 + }, + { + "epoch": 1.0361231687466088, + "grad_norm": 7.347028732299805, + "learning_rate": 7.717144031793888e-06, + "loss": 0.2436, + "step": 17409 + }, + { + "epoch": 1.0361367335865437, + "grad_norm": 6.982946395874023, + "learning_rate": 7.717006989173633e-06, + "loss": 0.4068, + "step": 17410 + }, + { + "epoch": 1.0361502984264785, + "grad_norm": 5.8063273429870605, + "learning_rate": 7.71686994655338e-06, + "loss": 0.2164, + "step": 17411 + }, + { + "epoch": 1.0361638632664134, + "grad_norm": 4.492346286773682, + "learning_rate": 7.716732903933123e-06, + "loss": 0.2149, + "step": 17412 + }, + { + "epoch": 1.0361774281063483, + "grad_norm": 5.926486968994141, + "learning_rate": 7.716595861312869e-06, + "loss": 0.422, + "step": 17413 + }, + { + "epoch": 1.0361909929462831, + "grad_norm": 3.8089959621429443, + "learning_rate": 7.716458818692614e-06, + "loss": 0.2188, + "step": 17414 + }, + { + "epoch": 1.0362045577862182, + "grad_norm": 4.2767333984375, + "learning_rate": 7.716321776072359e-06, + "loss": 0.2353, + "step": 17415 + }, + { + "epoch": 1.036218122626153, + "grad_norm": 4.457381725311279, + "learning_rate": 7.716184733452104e-06, + "loss": 0.2508, + "step": 17416 + }, + { + "epoch": 1.036231687466088, + "grad_norm": 5.215469837188721, + "learning_rate": 7.71604769083185e-06, + "loss": 0.2195, + "step": 17417 + }, + { + "epoch": 1.0362452523060228, + "grad_norm": 4.445319175720215, + "learning_rate": 7.715910648211595e-06, + "loss": 0.2709, + "step": 17418 + }, + { + "epoch": 1.0362588171459577, + "grad_norm": 4.915741920471191, + "learning_rate": 7.71577360559134e-06, + "loss": 0.2802, + "step": 17419 + }, + { + "epoch": 1.0362723819858926, + "grad_norm": 7.170178413391113, + "learning_rate": 7.715636562971085e-06, + "loss": 0.3728, + "step": 17420 + }, + { + "epoch": 1.0362859468258274, + "grad_norm": 2.965484380722046, + "learning_rate": 7.71549952035083e-06, + "loss": 0.0974, + "step": 17421 + }, + { + "epoch": 1.0362995116657623, + "grad_norm": 5.2790632247924805, + "learning_rate": 7.715362477730575e-06, + "loss": 0.2626, + "step": 17422 + }, + { + "epoch": 1.0363130765056972, + "grad_norm": 5.960120677947998, + "learning_rate": 7.715225435110319e-06, + "loss": 0.3103, + "step": 17423 + }, + { + "epoch": 1.036326641345632, + "grad_norm": 5.106897830963135, + "learning_rate": 7.715088392490066e-06, + "loss": 0.3056, + "step": 17424 + }, + { + "epoch": 1.036340206185567, + "grad_norm": 3.897451877593994, + "learning_rate": 7.71495134986981e-06, + "loss": 0.1732, + "step": 17425 + }, + { + "epoch": 1.036353771025502, + "grad_norm": 5.3313446044921875, + "learning_rate": 7.714814307249556e-06, + "loss": 0.2734, + "step": 17426 + }, + { + "epoch": 1.0363673358654368, + "grad_norm": 5.467520713806152, + "learning_rate": 7.7146772646293e-06, + "loss": 0.2157, + "step": 17427 + }, + { + "epoch": 1.0363809007053717, + "grad_norm": 4.722201824188232, + "learning_rate": 7.714540222009046e-06, + "loss": 0.2009, + "step": 17428 + }, + { + "epoch": 1.0363944655453066, + "grad_norm": 6.907276153564453, + "learning_rate": 7.714403179388791e-06, + "loss": 0.3171, + "step": 17429 + }, + { + "epoch": 1.0364080303852414, + "grad_norm": 4.4461283683776855, + "learning_rate": 7.714266136768535e-06, + "loss": 0.1786, + "step": 17430 + }, + { + "epoch": 1.0364215952251763, + "grad_norm": 4.8040032386779785, + "learning_rate": 7.71412909414828e-06, + "loss": 0.1957, + "step": 17431 + }, + { + "epoch": 1.0364351600651112, + "grad_norm": 5.618424415588379, + "learning_rate": 7.713992051528025e-06, + "loss": 0.2865, + "step": 17432 + }, + { + "epoch": 1.036448724905046, + "grad_norm": 4.589020252227783, + "learning_rate": 7.71385500890777e-06, + "loss": 0.2234, + "step": 17433 + }, + { + "epoch": 1.0364622897449811, + "grad_norm": 5.770480155944824, + "learning_rate": 7.713717966287516e-06, + "loss": 0.3296, + "step": 17434 + }, + { + "epoch": 1.036475854584916, + "grad_norm": 4.216434478759766, + "learning_rate": 7.713580923667261e-06, + "loss": 0.1939, + "step": 17435 + }, + { + "epoch": 1.0364894194248508, + "grad_norm": 4.626877784729004, + "learning_rate": 7.713443881047006e-06, + "loss": 0.3369, + "step": 17436 + }, + { + "epoch": 1.0365029842647857, + "grad_norm": 6.92100191116333, + "learning_rate": 7.713306838426751e-06, + "loss": 0.3155, + "step": 17437 + }, + { + "epoch": 1.0365165491047206, + "grad_norm": 3.851703643798828, + "learning_rate": 7.713169795806496e-06, + "loss": 0.228, + "step": 17438 + }, + { + "epoch": 1.0365301139446554, + "grad_norm": 4.043344497680664, + "learning_rate": 7.713032753186242e-06, + "loss": 0.2089, + "step": 17439 + }, + { + "epoch": 1.0365436787845903, + "grad_norm": 6.017384052276611, + "learning_rate": 7.712895710565987e-06, + "loss": 0.246, + "step": 17440 + }, + { + "epoch": 1.0365572436245252, + "grad_norm": 5.971049785614014, + "learning_rate": 7.712758667945732e-06, + "loss": 0.3341, + "step": 17441 + }, + { + "epoch": 1.03657080846446, + "grad_norm": 4.071714878082275, + "learning_rate": 7.712621625325477e-06, + "loss": 0.2004, + "step": 17442 + }, + { + "epoch": 1.036584373304395, + "grad_norm": 4.341926574707031, + "learning_rate": 7.712484582705222e-06, + "loss": 0.2756, + "step": 17443 + }, + { + "epoch": 1.03659793814433, + "grad_norm": 4.652883529663086, + "learning_rate": 7.712347540084967e-06, + "loss": 0.2292, + "step": 17444 + }, + { + "epoch": 1.0366115029842649, + "grad_norm": 5.870887756347656, + "learning_rate": 7.712210497464711e-06, + "loss": 0.2043, + "step": 17445 + }, + { + "epoch": 1.0366250678241997, + "grad_norm": 4.25833797454834, + "learning_rate": 7.712073454844458e-06, + "loss": 0.2683, + "step": 17446 + }, + { + "epoch": 1.0366386326641346, + "grad_norm": 3.516021251678467, + "learning_rate": 7.711936412224203e-06, + "loss": 0.2789, + "step": 17447 + }, + { + "epoch": 1.0366521975040695, + "grad_norm": 4.368373870849609, + "learning_rate": 7.711799369603947e-06, + "loss": 0.2247, + "step": 17448 + }, + { + "epoch": 1.0366657623440043, + "grad_norm": 5.489014148712158, + "learning_rate": 7.711662326983692e-06, + "loss": 0.2953, + "step": 17449 + }, + { + "epoch": 1.0366793271839392, + "grad_norm": 4.940067291259766, + "learning_rate": 7.711525284363439e-06, + "loss": 0.2254, + "step": 17450 + }, + { + "epoch": 1.036692892023874, + "grad_norm": 5.773608207702637, + "learning_rate": 7.711388241743184e-06, + "loss": 0.209, + "step": 17451 + }, + { + "epoch": 1.036706456863809, + "grad_norm": 5.138698101043701, + "learning_rate": 7.711251199122927e-06, + "loss": 0.2333, + "step": 17452 + }, + { + "epoch": 1.036720021703744, + "grad_norm": 4.9586968421936035, + "learning_rate": 7.711114156502672e-06, + "loss": 0.2019, + "step": 17453 + }, + { + "epoch": 1.0367335865436789, + "grad_norm": 6.890323638916016, + "learning_rate": 7.71097711388242e-06, + "loss": 0.2021, + "step": 17454 + }, + { + "epoch": 1.0367471513836137, + "grad_norm": 6.754395008087158, + "learning_rate": 7.710840071262163e-06, + "loss": 0.2692, + "step": 17455 + }, + { + "epoch": 1.0367607162235486, + "grad_norm": 4.096047401428223, + "learning_rate": 7.710703028641908e-06, + "loss": 0.2165, + "step": 17456 + }, + { + "epoch": 1.0367742810634835, + "grad_norm": 3.6466965675354004, + "learning_rate": 7.710565986021653e-06, + "loss": 0.1961, + "step": 17457 + }, + { + "epoch": 1.0367878459034183, + "grad_norm": 4.020252704620361, + "learning_rate": 7.710428943401398e-06, + "loss": 0.2034, + "step": 17458 + }, + { + "epoch": 1.0368014107433532, + "grad_norm": 3.5320322513580322, + "learning_rate": 7.710291900781143e-06, + "loss": 0.1622, + "step": 17459 + }, + { + "epoch": 1.036814975583288, + "grad_norm": 4.339419364929199, + "learning_rate": 7.710154858160889e-06, + "loss": 0.1977, + "step": 17460 + }, + { + "epoch": 1.036828540423223, + "grad_norm": 5.918131351470947, + "learning_rate": 7.710017815540634e-06, + "loss": 0.322, + "step": 17461 + }, + { + "epoch": 1.0368421052631578, + "grad_norm": 4.408801555633545, + "learning_rate": 7.709880772920379e-06, + "loss": 0.21, + "step": 17462 + }, + { + "epoch": 1.0368556701030929, + "grad_norm": 4.266882419586182, + "learning_rate": 7.709743730300124e-06, + "loss": 0.2391, + "step": 17463 + }, + { + "epoch": 1.0368692349430277, + "grad_norm": 4.324426651000977, + "learning_rate": 7.70960668767987e-06, + "loss": 0.2322, + "step": 17464 + }, + { + "epoch": 1.0368827997829626, + "grad_norm": 4.575600624084473, + "learning_rate": 7.709469645059615e-06, + "loss": 0.2738, + "step": 17465 + }, + { + "epoch": 1.0368963646228975, + "grad_norm": 5.198936939239502, + "learning_rate": 7.70933260243936e-06, + "loss": 0.2872, + "step": 17466 + }, + { + "epoch": 1.0369099294628323, + "grad_norm": 7.352774620056152, + "learning_rate": 7.709195559819105e-06, + "loss": 0.3481, + "step": 17467 + }, + { + "epoch": 1.0369234943027672, + "grad_norm": 6.993385314941406, + "learning_rate": 7.70905851719885e-06, + "loss": 0.2927, + "step": 17468 + }, + { + "epoch": 1.036937059142702, + "grad_norm": 4.451369762420654, + "learning_rate": 7.708921474578595e-06, + "loss": 0.249, + "step": 17469 + }, + { + "epoch": 1.036950623982637, + "grad_norm": 4.7072319984436035, + "learning_rate": 7.708784431958339e-06, + "loss": 0.1936, + "step": 17470 + }, + { + "epoch": 1.0369641888225718, + "grad_norm": 4.655514240264893, + "learning_rate": 7.708647389338086e-06, + "loss": 0.3554, + "step": 17471 + }, + { + "epoch": 1.0369777536625069, + "grad_norm": 3.6360857486724854, + "learning_rate": 7.70851034671783e-06, + "loss": 0.1923, + "step": 17472 + }, + { + "epoch": 1.0369913185024418, + "grad_norm": 5.157948970794678, + "learning_rate": 7.708373304097574e-06, + "loss": 0.3172, + "step": 17473 + }, + { + "epoch": 1.0370048833423766, + "grad_norm": 4.7992682456970215, + "learning_rate": 7.70823626147732e-06, + "loss": 0.2498, + "step": 17474 + }, + { + "epoch": 1.0370184481823115, + "grad_norm": 3.9349071979522705, + "learning_rate": 7.708099218857065e-06, + "loss": 0.2312, + "step": 17475 + }, + { + "epoch": 1.0370320130222463, + "grad_norm": 5.538571834564209, + "learning_rate": 7.70796217623681e-06, + "loss": 0.2772, + "step": 17476 + }, + { + "epoch": 1.0370455778621812, + "grad_norm": 4.645421981811523, + "learning_rate": 7.707825133616555e-06, + "loss": 0.243, + "step": 17477 + }, + { + "epoch": 1.037059142702116, + "grad_norm": 4.871949195861816, + "learning_rate": 7.7076880909963e-06, + "loss": 0.2952, + "step": 17478 + }, + { + "epoch": 1.037072707542051, + "grad_norm": 4.742945194244385, + "learning_rate": 7.707551048376045e-06, + "loss": 0.2358, + "step": 17479 + }, + { + "epoch": 1.0370862723819858, + "grad_norm": 4.493189334869385, + "learning_rate": 7.70741400575579e-06, + "loss": 0.241, + "step": 17480 + }, + { + "epoch": 1.0370998372219207, + "grad_norm": 4.401591777801514, + "learning_rate": 7.707276963135536e-06, + "loss": 0.3244, + "step": 17481 + }, + { + "epoch": 1.0371134020618558, + "grad_norm": 3.7258718013763428, + "learning_rate": 7.707139920515281e-06, + "loss": 0.243, + "step": 17482 + }, + { + "epoch": 1.0371269669017906, + "grad_norm": 5.022218227386475, + "learning_rate": 7.707002877895026e-06, + "loss": 0.2821, + "step": 17483 + }, + { + "epoch": 1.0371405317417255, + "grad_norm": 4.581308364868164, + "learning_rate": 7.706865835274771e-06, + "loss": 0.336, + "step": 17484 + }, + { + "epoch": 1.0371540965816604, + "grad_norm": 4.881860256195068, + "learning_rate": 7.706728792654516e-06, + "loss": 0.2833, + "step": 17485 + }, + { + "epoch": 1.0371676614215952, + "grad_norm": 4.856276512145996, + "learning_rate": 7.706591750034262e-06, + "loss": 0.2951, + "step": 17486 + }, + { + "epoch": 1.03718122626153, + "grad_norm": 5.531569480895996, + "learning_rate": 7.706454707414007e-06, + "loss": 0.2884, + "step": 17487 + }, + { + "epoch": 1.037194791101465, + "grad_norm": 4.384614944458008, + "learning_rate": 7.70631766479375e-06, + "loss": 0.2424, + "step": 17488 + }, + { + "epoch": 1.0372083559413998, + "grad_norm": 5.291103363037109, + "learning_rate": 7.706180622173497e-06, + "loss": 0.3089, + "step": 17489 + }, + { + "epoch": 1.0372219207813347, + "grad_norm": 6.664453029632568, + "learning_rate": 7.706043579553242e-06, + "loss": 0.3864, + "step": 17490 + }, + { + "epoch": 1.0372354856212698, + "grad_norm": 4.254518032073975, + "learning_rate": 7.705906536932986e-06, + "loss": 0.2044, + "step": 17491 + }, + { + "epoch": 1.0372490504612046, + "grad_norm": 6.973074913024902, + "learning_rate": 7.705769494312731e-06, + "loss": 0.5261, + "step": 17492 + }, + { + "epoch": 1.0372626153011395, + "grad_norm": 5.510763645172119, + "learning_rate": 7.705632451692478e-06, + "loss": 0.3419, + "step": 17493 + }, + { + "epoch": 1.0372761801410744, + "grad_norm": 5.682371616363525, + "learning_rate": 7.705495409072223e-06, + "loss": 0.3717, + "step": 17494 + }, + { + "epoch": 1.0372897449810092, + "grad_norm": 5.748353004455566, + "learning_rate": 7.705358366451967e-06, + "loss": 0.313, + "step": 17495 + }, + { + "epoch": 1.037303309820944, + "grad_norm": 5.3530402183532715, + "learning_rate": 7.705221323831712e-06, + "loss": 0.2808, + "step": 17496 + }, + { + "epoch": 1.037316874660879, + "grad_norm": 7.2270331382751465, + "learning_rate": 7.705084281211459e-06, + "loss": 0.5163, + "step": 17497 + }, + { + "epoch": 1.0373304395008138, + "grad_norm": 5.732840538024902, + "learning_rate": 7.704947238591202e-06, + "loss": 0.386, + "step": 17498 + }, + { + "epoch": 1.0373440043407487, + "grad_norm": 5.918055057525635, + "learning_rate": 7.704810195970947e-06, + "loss": 0.4332, + "step": 17499 + }, + { + "epoch": 1.0373575691806836, + "grad_norm": 6.508420944213867, + "learning_rate": 7.704673153350692e-06, + "loss": 0.5029, + "step": 17500 + }, + { + "epoch": 1.0373711340206186, + "grad_norm": 5.324020862579346, + "learning_rate": 7.704536110730438e-06, + "loss": 0.2686, + "step": 17501 + }, + { + "epoch": 1.0373846988605535, + "grad_norm": 6.263280391693115, + "learning_rate": 7.704399068110183e-06, + "loss": 0.296, + "step": 17502 + }, + { + "epoch": 1.0373982637004884, + "grad_norm": 6.677089214324951, + "learning_rate": 7.704262025489928e-06, + "loss": 0.3574, + "step": 17503 + }, + { + "epoch": 1.0374118285404232, + "grad_norm": 7.21347188949585, + "learning_rate": 7.704124982869673e-06, + "loss": 0.526, + "step": 17504 + }, + { + "epoch": 1.037425393380358, + "grad_norm": 4.179215431213379, + "learning_rate": 7.703987940249418e-06, + "loss": 0.3405, + "step": 17505 + }, + { + "epoch": 1.037438958220293, + "grad_norm": 5.689074516296387, + "learning_rate": 7.703850897629163e-06, + "loss": 0.4373, + "step": 17506 + }, + { + "epoch": 1.0374525230602278, + "grad_norm": 6.57806921005249, + "learning_rate": 7.703713855008909e-06, + "loss": 0.4754, + "step": 17507 + }, + { + "epoch": 1.0374660879001627, + "grad_norm": 5.850845813751221, + "learning_rate": 7.703576812388654e-06, + "loss": 0.3298, + "step": 17508 + }, + { + "epoch": 1.0374796527400976, + "grad_norm": 5.969182968139648, + "learning_rate": 7.703439769768399e-06, + "loss": 0.4095, + "step": 17509 + }, + { + "epoch": 1.0374932175800327, + "grad_norm": 6.203417778015137, + "learning_rate": 7.703302727148144e-06, + "loss": 0.4081, + "step": 17510 + }, + { + "epoch": 1.0375067824199675, + "grad_norm": 5.186092853546143, + "learning_rate": 7.70316568452789e-06, + "loss": 0.3059, + "step": 17511 + }, + { + "epoch": 1.0375203472599024, + "grad_norm": 4.327948570251465, + "learning_rate": 7.703028641907635e-06, + "loss": 0.2425, + "step": 17512 + }, + { + "epoch": 1.0375339120998373, + "grad_norm": 6.559133052825928, + "learning_rate": 7.702891599287378e-06, + "loss": 0.4229, + "step": 17513 + }, + { + "epoch": 1.0375474769397721, + "grad_norm": 6.160538196563721, + "learning_rate": 7.702754556667123e-06, + "loss": 0.4476, + "step": 17514 + }, + { + "epoch": 1.037561041779707, + "grad_norm": 4.850239276885986, + "learning_rate": 7.70261751404687e-06, + "loss": 0.3177, + "step": 17515 + }, + { + "epoch": 1.0375746066196418, + "grad_norm": 7.050732135772705, + "learning_rate": 7.702480471426614e-06, + "loss": 0.4418, + "step": 17516 + }, + { + "epoch": 1.0375881714595767, + "grad_norm": 4.018774509429932, + "learning_rate": 7.702343428806359e-06, + "loss": 0.3088, + "step": 17517 + }, + { + "epoch": 1.0376017362995116, + "grad_norm": 7.3220133781433105, + "learning_rate": 7.702206386186104e-06, + "loss": 0.406, + "step": 17518 + }, + { + "epoch": 1.0376153011394464, + "grad_norm": 5.885805606842041, + "learning_rate": 7.70206934356585e-06, + "loss": 0.3984, + "step": 17519 + }, + { + "epoch": 1.0376288659793815, + "grad_norm": 7.596328258514404, + "learning_rate": 7.701932300945594e-06, + "loss": 0.423, + "step": 17520 + }, + { + "epoch": 1.0376424308193164, + "grad_norm": 4.1974711418151855, + "learning_rate": 7.70179525832534e-06, + "loss": 0.2435, + "step": 17521 + }, + { + "epoch": 1.0376559956592513, + "grad_norm": 8.431069374084473, + "learning_rate": 7.701658215705085e-06, + "loss": 0.5144, + "step": 17522 + }, + { + "epoch": 1.0376695604991861, + "grad_norm": 6.248117446899414, + "learning_rate": 7.70152117308483e-06, + "loss": 0.4704, + "step": 17523 + }, + { + "epoch": 1.037683125339121, + "grad_norm": 6.196887969970703, + "learning_rate": 7.701384130464575e-06, + "loss": 0.3953, + "step": 17524 + }, + { + "epoch": 1.0376966901790559, + "grad_norm": 5.813831806182861, + "learning_rate": 7.70124708784432e-06, + "loss": 0.3828, + "step": 17525 + }, + { + "epoch": 1.0377102550189907, + "grad_norm": 5.59190034866333, + "learning_rate": 7.701110045224065e-06, + "loss": 0.3365, + "step": 17526 + }, + { + "epoch": 1.0377238198589256, + "grad_norm": 5.837062358856201, + "learning_rate": 7.70097300260381e-06, + "loss": 0.3802, + "step": 17527 + }, + { + "epoch": 1.0377373846988605, + "grad_norm": 4.769438743591309, + "learning_rate": 7.700835959983556e-06, + "loss": 0.3383, + "step": 17528 + }, + { + "epoch": 1.0377509495387955, + "grad_norm": 5.951629638671875, + "learning_rate": 7.700698917363301e-06, + "loss": 0.3035, + "step": 17529 + }, + { + "epoch": 1.0377645143787304, + "grad_norm": 5.95713472366333, + "learning_rate": 7.700561874743046e-06, + "loss": 0.3174, + "step": 17530 + }, + { + "epoch": 1.0377780792186653, + "grad_norm": 6.624805450439453, + "learning_rate": 7.70042483212279e-06, + "loss": 0.4538, + "step": 17531 + }, + { + "epoch": 1.0377916440586001, + "grad_norm": 4.012548446655273, + "learning_rate": 7.700287789502536e-06, + "loss": 0.3603, + "step": 17532 + }, + { + "epoch": 1.037805208898535, + "grad_norm": 5.082812786102295, + "learning_rate": 7.700150746882282e-06, + "loss": 0.3241, + "step": 17533 + }, + { + "epoch": 1.0378187737384699, + "grad_norm": 6.627520561218262, + "learning_rate": 7.700013704262027e-06, + "loss": 0.5475, + "step": 17534 + }, + { + "epoch": 1.0378323385784047, + "grad_norm": 5.908256530761719, + "learning_rate": 7.69987666164177e-06, + "loss": 0.3478, + "step": 17535 + }, + { + "epoch": 1.0378459034183396, + "grad_norm": 6.2837090492248535, + "learning_rate": 7.699739619021517e-06, + "loss": 0.403, + "step": 17536 + }, + { + "epoch": 1.0378594682582745, + "grad_norm": 5.533764362335205, + "learning_rate": 7.699602576401262e-06, + "loss": 0.3554, + "step": 17537 + }, + { + "epoch": 1.0378730330982093, + "grad_norm": 6.475522994995117, + "learning_rate": 7.699465533781006e-06, + "loss": 0.3693, + "step": 17538 + }, + { + "epoch": 1.0378865979381444, + "grad_norm": 6.7326130867004395, + "learning_rate": 7.699328491160751e-06, + "loss": 0.443, + "step": 17539 + }, + { + "epoch": 1.0379001627780793, + "grad_norm": 6.002285003662109, + "learning_rate": 7.699191448540498e-06, + "loss": 0.4388, + "step": 17540 + }, + { + "epoch": 1.0379137276180141, + "grad_norm": 6.4200358390808105, + "learning_rate": 7.699054405920241e-06, + "loss": 0.4286, + "step": 17541 + }, + { + "epoch": 1.037927292457949, + "grad_norm": 5.626229286193848, + "learning_rate": 7.698917363299987e-06, + "loss": 0.3543, + "step": 17542 + }, + { + "epoch": 1.0379408572978839, + "grad_norm": 7.637706279754639, + "learning_rate": 7.698780320679732e-06, + "loss": 0.3151, + "step": 17543 + }, + { + "epoch": 1.0379544221378187, + "grad_norm": 5.901771545410156, + "learning_rate": 7.698643278059477e-06, + "loss": 0.4142, + "step": 17544 + }, + { + "epoch": 1.0379679869777536, + "grad_norm": 6.267301082611084, + "learning_rate": 7.698506235439222e-06, + "loss": 0.2835, + "step": 17545 + }, + { + "epoch": 1.0379815518176885, + "grad_norm": 6.645733833312988, + "learning_rate": 7.698369192818967e-06, + "loss": 0.3984, + "step": 17546 + }, + { + "epoch": 1.0379951166576233, + "grad_norm": 8.28714370727539, + "learning_rate": 7.698232150198712e-06, + "loss": 0.501, + "step": 17547 + }, + { + "epoch": 1.0380086814975584, + "grad_norm": 6.045841693878174, + "learning_rate": 7.698095107578458e-06, + "loss": 0.2547, + "step": 17548 + }, + { + "epoch": 1.0380222463374933, + "grad_norm": 7.037654876708984, + "learning_rate": 7.697958064958203e-06, + "loss": 0.4296, + "step": 17549 + }, + { + "epoch": 1.0380358111774282, + "grad_norm": 7.202612400054932, + "learning_rate": 7.697821022337948e-06, + "loss": 0.3457, + "step": 17550 + }, + { + "epoch": 1.038049376017363, + "grad_norm": 5.410060405731201, + "learning_rate": 7.697683979717693e-06, + "loss": 0.2682, + "step": 17551 + }, + { + "epoch": 1.0380629408572979, + "grad_norm": 9.09554672241211, + "learning_rate": 7.697546937097438e-06, + "loss": 0.4918, + "step": 17552 + }, + { + "epoch": 1.0380765056972328, + "grad_norm": 7.597428798675537, + "learning_rate": 7.697409894477184e-06, + "loss": 0.4976, + "step": 17553 + }, + { + "epoch": 1.0380900705371676, + "grad_norm": 5.200166702270508, + "learning_rate": 7.697272851856929e-06, + "loss": 0.2591, + "step": 17554 + }, + { + "epoch": 1.0381036353771025, + "grad_norm": 7.884753227233887, + "learning_rate": 7.697135809236674e-06, + "loss": 0.4732, + "step": 17555 + }, + { + "epoch": 1.0381172002170374, + "grad_norm": 5.892230033874512, + "learning_rate": 7.696998766616417e-06, + "loss": 0.3936, + "step": 17556 + }, + { + "epoch": 1.0381307650569722, + "grad_norm": 7.624959945678711, + "learning_rate": 7.696861723996163e-06, + "loss": 0.421, + "step": 17557 + }, + { + "epoch": 1.0381443298969073, + "grad_norm": 4.881446838378906, + "learning_rate": 7.69672468137591e-06, + "loss": 0.3915, + "step": 17558 + }, + { + "epoch": 1.0381578947368422, + "grad_norm": 5.338603496551514, + "learning_rate": 7.696587638755653e-06, + "loss": 0.3173, + "step": 17559 + }, + { + "epoch": 1.038171459576777, + "grad_norm": 4.52000617980957, + "learning_rate": 7.696450596135398e-06, + "loss": 0.1668, + "step": 17560 + }, + { + "epoch": 1.038185024416712, + "grad_norm": 7.045941352844238, + "learning_rate": 7.696313553515143e-06, + "loss": 0.3923, + "step": 17561 + }, + { + "epoch": 1.0381985892566468, + "grad_norm": 6.403904438018799, + "learning_rate": 7.69617651089489e-06, + "loss": 0.4669, + "step": 17562 + }, + { + "epoch": 1.0382121540965816, + "grad_norm": 6.071524620056152, + "learning_rate": 7.696039468274634e-06, + "loss": 0.4092, + "step": 17563 + }, + { + "epoch": 1.0382257189365165, + "grad_norm": 5.314179420471191, + "learning_rate": 7.695902425654379e-06, + "loss": 0.2953, + "step": 17564 + }, + { + "epoch": 1.0382392837764514, + "grad_norm": 6.817562580108643, + "learning_rate": 7.695765383034124e-06, + "loss": 0.5463, + "step": 17565 + }, + { + "epoch": 1.0382528486163862, + "grad_norm": 6.642795562744141, + "learning_rate": 7.695628340413869e-06, + "loss": 0.3359, + "step": 17566 + }, + { + "epoch": 1.0382664134563213, + "grad_norm": 4.3734025955200195, + "learning_rate": 7.695491297793614e-06, + "loss": 0.2082, + "step": 17567 + }, + { + "epoch": 1.0382799782962562, + "grad_norm": 7.071183204650879, + "learning_rate": 7.69535425517336e-06, + "loss": 0.3524, + "step": 17568 + }, + { + "epoch": 1.038293543136191, + "grad_norm": 6.0240278244018555, + "learning_rate": 7.695217212553105e-06, + "loss": 0.4284, + "step": 17569 + }, + { + "epoch": 1.038307107976126, + "grad_norm": 6.559610843658447, + "learning_rate": 7.69508016993285e-06, + "loss": 0.2599, + "step": 17570 + }, + { + "epoch": 1.0383206728160608, + "grad_norm": 6.169216632843018, + "learning_rate": 7.694943127312595e-06, + "loss": 0.2764, + "step": 17571 + }, + { + "epoch": 1.0383342376559956, + "grad_norm": 6.476543426513672, + "learning_rate": 7.69480608469234e-06, + "loss": 0.3391, + "step": 17572 + }, + { + "epoch": 1.0383478024959305, + "grad_norm": 5.0281829833984375, + "learning_rate": 7.694669042072085e-06, + "loss": 0.3277, + "step": 17573 + }, + { + "epoch": 1.0383613673358654, + "grad_norm": 7.782224178314209, + "learning_rate": 7.694531999451829e-06, + "loss": 0.3694, + "step": 17574 + }, + { + "epoch": 1.0383749321758002, + "grad_norm": 4.5531206130981445, + "learning_rate": 7.694394956831576e-06, + "loss": 0.2193, + "step": 17575 + }, + { + "epoch": 1.038388497015735, + "grad_norm": 5.600186347961426, + "learning_rate": 7.694257914211321e-06, + "loss": 0.3318, + "step": 17576 + }, + { + "epoch": 1.0384020618556702, + "grad_norm": 5.508945465087891, + "learning_rate": 7.694120871591066e-06, + "loss": 0.2967, + "step": 17577 + }, + { + "epoch": 1.038415626695605, + "grad_norm": 5.171947479248047, + "learning_rate": 7.69398382897081e-06, + "loss": 0.3562, + "step": 17578 + }, + { + "epoch": 1.03842919153554, + "grad_norm": 4.603915214538574, + "learning_rate": 7.693846786350556e-06, + "loss": 0.3507, + "step": 17579 + }, + { + "epoch": 1.0384427563754748, + "grad_norm": 6.544904708862305, + "learning_rate": 7.693709743730302e-06, + "loss": 0.3023, + "step": 17580 + }, + { + "epoch": 1.0384563212154097, + "grad_norm": 6.2544941902160645, + "learning_rate": 7.693572701110045e-06, + "loss": 0.3726, + "step": 17581 + }, + { + "epoch": 1.0384698860553445, + "grad_norm": 4.270179271697998, + "learning_rate": 7.69343565848979e-06, + "loss": 0.2386, + "step": 17582 + }, + { + "epoch": 1.0384834508952794, + "grad_norm": 5.42340612411499, + "learning_rate": 7.693298615869535e-06, + "loss": 0.3324, + "step": 17583 + }, + { + "epoch": 1.0384970157352142, + "grad_norm": 4.779316425323486, + "learning_rate": 7.69316157324928e-06, + "loss": 0.2491, + "step": 17584 + }, + { + "epoch": 1.0385105805751491, + "grad_norm": 4.707476615905762, + "learning_rate": 7.693024530629026e-06, + "loss": 0.3132, + "step": 17585 + }, + { + "epoch": 1.0385241454150842, + "grad_norm": 7.067795753479004, + "learning_rate": 7.692887488008771e-06, + "loss": 0.3743, + "step": 17586 + }, + { + "epoch": 1.038537710255019, + "grad_norm": 6.909673690795898, + "learning_rate": 7.692750445388516e-06, + "loss": 0.3553, + "step": 17587 + }, + { + "epoch": 1.038551275094954, + "grad_norm": 6.891795635223389, + "learning_rate": 7.692613402768261e-06, + "loss": 0.2935, + "step": 17588 + }, + { + "epoch": 1.0385648399348888, + "grad_norm": 6.556041717529297, + "learning_rate": 7.692476360148007e-06, + "loss": 0.3687, + "step": 17589 + }, + { + "epoch": 1.0385784047748237, + "grad_norm": 6.412471294403076, + "learning_rate": 7.692339317527752e-06, + "loss": 0.3259, + "step": 17590 + }, + { + "epoch": 1.0385919696147585, + "grad_norm": 6.285670757293701, + "learning_rate": 7.692202274907497e-06, + "loss": 0.2885, + "step": 17591 + }, + { + "epoch": 1.0386055344546934, + "grad_norm": 8.22413444519043, + "learning_rate": 7.692065232287242e-06, + "loss": 0.4657, + "step": 17592 + }, + { + "epoch": 1.0386190992946283, + "grad_norm": 7.327043533325195, + "learning_rate": 7.691928189666987e-06, + "loss": 0.4251, + "step": 17593 + }, + { + "epoch": 1.0386326641345631, + "grad_norm": 6.3987016677856445, + "learning_rate": 7.691791147046732e-06, + "loss": 0.4577, + "step": 17594 + }, + { + "epoch": 1.038646228974498, + "grad_norm": 6.169328212738037, + "learning_rate": 7.691654104426478e-06, + "loss": 0.2965, + "step": 17595 + }, + { + "epoch": 1.038659793814433, + "grad_norm": 4.967896938323975, + "learning_rate": 7.691517061806221e-06, + "loss": 0.2375, + "step": 17596 + }, + { + "epoch": 1.038673358654368, + "grad_norm": 8.209418296813965, + "learning_rate": 7.691380019185968e-06, + "loss": 0.4005, + "step": 17597 + }, + { + "epoch": 1.0386869234943028, + "grad_norm": 6.010303974151611, + "learning_rate": 7.691242976565713e-06, + "loss": 0.3275, + "step": 17598 + }, + { + "epoch": 1.0387004883342377, + "grad_norm": 4.62246036529541, + "learning_rate": 7.691105933945457e-06, + "loss": 0.2252, + "step": 17599 + }, + { + "epoch": 1.0387140531741725, + "grad_norm": 6.378974914550781, + "learning_rate": 7.690968891325202e-06, + "loss": 0.3567, + "step": 17600 + }, + { + "epoch": 1.0387276180141074, + "grad_norm": 6.412927627563477, + "learning_rate": 7.690831848704949e-06, + "loss": 0.3193, + "step": 17601 + }, + { + "epoch": 1.0387411828540423, + "grad_norm": 7.361081123352051, + "learning_rate": 7.690694806084694e-06, + "loss": 0.3242, + "step": 17602 + }, + { + "epoch": 1.0387547476939771, + "grad_norm": 6.928621292114258, + "learning_rate": 7.690557763464437e-06, + "loss": 0.283, + "step": 17603 + }, + { + "epoch": 1.038768312533912, + "grad_norm": 5.270568370819092, + "learning_rate": 7.690420720844183e-06, + "loss": 0.2307, + "step": 17604 + }, + { + "epoch": 1.038781877373847, + "grad_norm": 5.520598888397217, + "learning_rate": 7.69028367822393e-06, + "loss": 0.2904, + "step": 17605 + }, + { + "epoch": 1.038795442213782, + "grad_norm": 4.124314308166504, + "learning_rate": 7.690146635603673e-06, + "loss": 0.1721, + "step": 17606 + }, + { + "epoch": 1.0388090070537168, + "grad_norm": 6.140494346618652, + "learning_rate": 7.690009592983418e-06, + "loss": 0.4447, + "step": 17607 + }, + { + "epoch": 1.0388225718936517, + "grad_norm": 3.140389919281006, + "learning_rate": 7.689872550363163e-06, + "loss": 0.1737, + "step": 17608 + }, + { + "epoch": 1.0388361367335865, + "grad_norm": 3.8210813999176025, + "learning_rate": 7.689735507742908e-06, + "loss": 0.1418, + "step": 17609 + }, + { + "epoch": 1.0388497015735214, + "grad_norm": 5.157754421234131, + "learning_rate": 7.689598465122654e-06, + "loss": 0.2613, + "step": 17610 + }, + { + "epoch": 1.0388632664134563, + "grad_norm": 5.475571155548096, + "learning_rate": 7.689461422502399e-06, + "loss": 0.2088, + "step": 17611 + }, + { + "epoch": 1.0388768312533911, + "grad_norm": 5.492080211639404, + "learning_rate": 7.689324379882144e-06, + "loss": 0.2836, + "step": 17612 + }, + { + "epoch": 1.038890396093326, + "grad_norm": 4.831981182098389, + "learning_rate": 7.68918733726189e-06, + "loss": 0.2576, + "step": 17613 + }, + { + "epoch": 1.038903960933261, + "grad_norm": 5.042276382446289, + "learning_rate": 7.689050294641634e-06, + "loss": 0.3448, + "step": 17614 + }, + { + "epoch": 1.038917525773196, + "grad_norm": 6.972874164581299, + "learning_rate": 7.68891325202138e-06, + "loss": 0.3615, + "step": 17615 + }, + { + "epoch": 1.0389310906131308, + "grad_norm": 6.826410293579102, + "learning_rate": 7.688776209401125e-06, + "loss": 0.3119, + "step": 17616 + }, + { + "epoch": 1.0389446554530657, + "grad_norm": 6.265015602111816, + "learning_rate": 7.68863916678087e-06, + "loss": 0.324, + "step": 17617 + }, + { + "epoch": 1.0389582202930006, + "grad_norm": 5.980297565460205, + "learning_rate": 7.688502124160615e-06, + "loss": 0.3738, + "step": 17618 + }, + { + "epoch": 1.0389717851329354, + "grad_norm": 5.343655109405518, + "learning_rate": 7.68836508154036e-06, + "loss": 0.2027, + "step": 17619 + }, + { + "epoch": 1.0389853499728703, + "grad_norm": 4.697911739349365, + "learning_rate": 7.688228038920105e-06, + "loss": 0.288, + "step": 17620 + }, + { + "epoch": 1.0389989148128052, + "grad_norm": 7.612469673156738, + "learning_rate": 7.688090996299849e-06, + "loss": 0.3265, + "step": 17621 + }, + { + "epoch": 1.03901247965274, + "grad_norm": 5.656998157501221, + "learning_rate": 7.687953953679596e-06, + "loss": 0.3243, + "step": 17622 + }, + { + "epoch": 1.0390260444926749, + "grad_norm": 5.464348793029785, + "learning_rate": 7.687816911059341e-06, + "loss": 0.3275, + "step": 17623 + }, + { + "epoch": 1.03903960933261, + "grad_norm": 5.436733722686768, + "learning_rate": 7.687679868439084e-06, + "loss": 0.267, + "step": 17624 + }, + { + "epoch": 1.0390531741725448, + "grad_norm": 8.196928977966309, + "learning_rate": 7.68754282581883e-06, + "loss": 0.3589, + "step": 17625 + }, + { + "epoch": 1.0390667390124797, + "grad_norm": 7.161203384399414, + "learning_rate": 7.687405783198575e-06, + "loss": 0.4281, + "step": 17626 + }, + { + "epoch": 1.0390803038524146, + "grad_norm": 7.404742240905762, + "learning_rate": 7.687268740578322e-06, + "loss": 0.3254, + "step": 17627 + }, + { + "epoch": 1.0390938686923494, + "grad_norm": 5.453803539276123, + "learning_rate": 7.687131697958065e-06, + "loss": 0.294, + "step": 17628 + }, + { + "epoch": 1.0391074335322843, + "grad_norm": 4.32001256942749, + "learning_rate": 7.68699465533781e-06, + "loss": 0.2284, + "step": 17629 + }, + { + "epoch": 1.0391209983722192, + "grad_norm": 4.877782344818115, + "learning_rate": 7.686857612717556e-06, + "loss": 0.2581, + "step": 17630 + }, + { + "epoch": 1.039134563212154, + "grad_norm": 4.479351997375488, + "learning_rate": 7.6867205700973e-06, + "loss": 0.1987, + "step": 17631 + }, + { + "epoch": 1.039148128052089, + "grad_norm": 5.27691650390625, + "learning_rate": 7.686583527477046e-06, + "loss": 0.2114, + "step": 17632 + }, + { + "epoch": 1.039161692892024, + "grad_norm": 6.032449722290039, + "learning_rate": 7.686446484856791e-06, + "loss": 0.2684, + "step": 17633 + }, + { + "epoch": 1.0391752577319588, + "grad_norm": 7.057419300079346, + "learning_rate": 7.686309442236536e-06, + "loss": 0.2783, + "step": 17634 + }, + { + "epoch": 1.0391888225718937, + "grad_norm": 5.07496976852417, + "learning_rate": 7.686172399616281e-06, + "loss": 0.193, + "step": 17635 + }, + { + "epoch": 1.0392023874118286, + "grad_norm": 7.320675849914551, + "learning_rate": 7.686035356996027e-06, + "loss": 0.3904, + "step": 17636 + }, + { + "epoch": 1.0392159522517634, + "grad_norm": 6.877324104309082, + "learning_rate": 7.685898314375772e-06, + "loss": 0.3883, + "step": 17637 + }, + { + "epoch": 1.0392295170916983, + "grad_norm": 5.677513122558594, + "learning_rate": 7.685761271755517e-06, + "loss": 0.2144, + "step": 17638 + }, + { + "epoch": 1.0392430819316332, + "grad_norm": 8.162066459655762, + "learning_rate": 7.68562422913526e-06, + "loss": 0.3588, + "step": 17639 + }, + { + "epoch": 1.039256646771568, + "grad_norm": 6.8202104568481445, + "learning_rate": 7.685487186515007e-06, + "loss": 0.3586, + "step": 17640 + }, + { + "epoch": 1.039270211611503, + "grad_norm": 5.790410041809082, + "learning_rate": 7.685350143894752e-06, + "loss": 0.2709, + "step": 17641 + }, + { + "epoch": 1.0392837764514378, + "grad_norm": 6.028787136077881, + "learning_rate": 7.685213101274498e-06, + "loss": 0.3154, + "step": 17642 + }, + { + "epoch": 1.0392973412913729, + "grad_norm": 5.883472442626953, + "learning_rate": 7.685076058654241e-06, + "loss": 0.2916, + "step": 17643 + }, + { + "epoch": 1.0393109061313077, + "grad_norm": 5.155710220336914, + "learning_rate": 7.684939016033988e-06, + "loss": 0.2477, + "step": 17644 + }, + { + "epoch": 1.0393244709712426, + "grad_norm": 5.914370059967041, + "learning_rate": 7.684801973413733e-06, + "loss": 0.2192, + "step": 17645 + }, + { + "epoch": 1.0393380358111775, + "grad_norm": 5.886533260345459, + "learning_rate": 7.684664930793477e-06, + "loss": 0.2253, + "step": 17646 + }, + { + "epoch": 1.0393516006511123, + "grad_norm": 6.824419021606445, + "learning_rate": 7.684527888173222e-06, + "loss": 0.2713, + "step": 17647 + }, + { + "epoch": 1.0393651654910472, + "grad_norm": 4.584811210632324, + "learning_rate": 7.684390845552969e-06, + "loss": 0.1838, + "step": 17648 + }, + { + "epoch": 1.039378730330982, + "grad_norm": 8.531167030334473, + "learning_rate": 7.684253802932712e-06, + "loss": 0.4598, + "step": 17649 + }, + { + "epoch": 1.039392295170917, + "grad_norm": 9.676966667175293, + "learning_rate": 7.684116760312457e-06, + "loss": 0.3283, + "step": 17650 + }, + { + "epoch": 1.0394058600108518, + "grad_norm": 5.487280368804932, + "learning_rate": 7.683979717692203e-06, + "loss": 0.2577, + "step": 17651 + }, + { + "epoch": 1.0394194248507869, + "grad_norm": 5.406973361968994, + "learning_rate": 7.683842675071948e-06, + "loss": 0.2302, + "step": 17652 + }, + { + "epoch": 1.0394329896907217, + "grad_norm": 5.694457054138184, + "learning_rate": 7.683705632451693e-06, + "loss": 0.258, + "step": 17653 + }, + { + "epoch": 1.0394465545306566, + "grad_norm": 3.9812495708465576, + "learning_rate": 7.683568589831438e-06, + "loss": 0.2033, + "step": 17654 + }, + { + "epoch": 1.0394601193705915, + "grad_norm": 4.433700084686279, + "learning_rate": 7.683431547211183e-06, + "loss": 0.2408, + "step": 17655 + }, + { + "epoch": 1.0394736842105263, + "grad_norm": 6.007261753082275, + "learning_rate": 7.683294504590928e-06, + "loss": 0.39, + "step": 17656 + }, + { + "epoch": 1.0394872490504612, + "grad_norm": 6.80952262878418, + "learning_rate": 7.683157461970674e-06, + "loss": 0.3403, + "step": 17657 + }, + { + "epoch": 1.039500813890396, + "grad_norm": 5.149971961975098, + "learning_rate": 7.683020419350419e-06, + "loss": 0.3295, + "step": 17658 + }, + { + "epoch": 1.039514378730331, + "grad_norm": 6.6232123374938965, + "learning_rate": 7.682883376730164e-06, + "loss": 0.2612, + "step": 17659 + }, + { + "epoch": 1.0395279435702658, + "grad_norm": 4.888724327087402, + "learning_rate": 7.68274633410991e-06, + "loss": 0.2107, + "step": 17660 + }, + { + "epoch": 1.0395415084102007, + "grad_norm": 5.793041229248047, + "learning_rate": 7.682609291489654e-06, + "loss": 0.2834, + "step": 17661 + }, + { + "epoch": 1.0395550732501357, + "grad_norm": 4.373706340789795, + "learning_rate": 7.6824722488694e-06, + "loss": 0.2642, + "step": 17662 + }, + { + "epoch": 1.0395686380900706, + "grad_norm": 4.238444805145264, + "learning_rate": 7.682335206249145e-06, + "loss": 0.2114, + "step": 17663 + }, + { + "epoch": 1.0395822029300055, + "grad_norm": 4.760673522949219, + "learning_rate": 7.682198163628888e-06, + "loss": 0.2322, + "step": 17664 + }, + { + "epoch": 1.0395957677699403, + "grad_norm": 6.5664963722229, + "learning_rate": 7.682061121008633e-06, + "loss": 0.3228, + "step": 17665 + }, + { + "epoch": 1.0396093326098752, + "grad_norm": 3.886504650115967, + "learning_rate": 7.68192407838838e-06, + "loss": 0.1531, + "step": 17666 + }, + { + "epoch": 1.03962289744981, + "grad_norm": 5.238794803619385, + "learning_rate": 7.681787035768124e-06, + "loss": 0.337, + "step": 17667 + }, + { + "epoch": 1.039636462289745, + "grad_norm": 5.241323947906494, + "learning_rate": 7.681649993147869e-06, + "loss": 0.2127, + "step": 17668 + }, + { + "epoch": 1.0396500271296798, + "grad_norm": 5.477580547332764, + "learning_rate": 7.681512950527614e-06, + "loss": 0.2811, + "step": 17669 + }, + { + "epoch": 1.0396635919696147, + "grad_norm": 3.991856813430786, + "learning_rate": 7.681375907907361e-06, + "loss": 0.2108, + "step": 17670 + }, + { + "epoch": 1.0396771568095498, + "grad_norm": 4.056742191314697, + "learning_rate": 7.681238865287104e-06, + "loss": 0.2391, + "step": 17671 + }, + { + "epoch": 1.0396907216494846, + "grad_norm": 4.407683849334717, + "learning_rate": 7.68110182266685e-06, + "loss": 0.2061, + "step": 17672 + }, + { + "epoch": 1.0397042864894195, + "grad_norm": 5.479559898376465, + "learning_rate": 7.680964780046595e-06, + "loss": 0.2102, + "step": 17673 + }, + { + "epoch": 1.0397178513293543, + "grad_norm": 6.6834845542907715, + "learning_rate": 7.68082773742634e-06, + "loss": 0.28, + "step": 17674 + }, + { + "epoch": 1.0397314161692892, + "grad_norm": 4.89539098739624, + "learning_rate": 7.680690694806085e-06, + "loss": 0.2471, + "step": 17675 + }, + { + "epoch": 1.039744981009224, + "grad_norm": 4.64373779296875, + "learning_rate": 7.68055365218583e-06, + "loss": 0.268, + "step": 17676 + }, + { + "epoch": 1.039758545849159, + "grad_norm": 6.83430290222168, + "learning_rate": 7.680416609565576e-06, + "loss": 0.2629, + "step": 17677 + }, + { + "epoch": 1.0397721106890938, + "grad_norm": 4.532622337341309, + "learning_rate": 7.68027956694532e-06, + "loss": 0.2971, + "step": 17678 + }, + { + "epoch": 1.0397856755290287, + "grad_norm": 6.490972995758057, + "learning_rate": 7.680142524325066e-06, + "loss": 0.2341, + "step": 17679 + }, + { + "epoch": 1.0397992403689635, + "grad_norm": 5.884147644042969, + "learning_rate": 7.680005481704811e-06, + "loss": 0.2536, + "step": 17680 + }, + { + "epoch": 1.0398128052088986, + "grad_norm": 4.556156635284424, + "learning_rate": 7.679868439084556e-06, + "loss": 0.3145, + "step": 17681 + }, + { + "epoch": 1.0398263700488335, + "grad_norm": 6.655795574188232, + "learning_rate": 7.6797313964643e-06, + "loss": 0.2997, + "step": 17682 + }, + { + "epoch": 1.0398399348887684, + "grad_norm": 4.3717360496521, + "learning_rate": 7.679594353844047e-06, + "loss": 0.2157, + "step": 17683 + }, + { + "epoch": 1.0398534997287032, + "grad_norm": 4.925510883331299, + "learning_rate": 7.679457311223792e-06, + "loss": 0.2788, + "step": 17684 + }, + { + "epoch": 1.039867064568638, + "grad_norm": 5.827782154083252, + "learning_rate": 7.679320268603537e-06, + "loss": 0.277, + "step": 17685 + }, + { + "epoch": 1.039880629408573, + "grad_norm": 7.4871296882629395, + "learning_rate": 7.67918322598328e-06, + "loss": 0.2482, + "step": 17686 + }, + { + "epoch": 1.0398941942485078, + "grad_norm": 6.155755043029785, + "learning_rate": 7.679046183363027e-06, + "loss": 0.2862, + "step": 17687 + }, + { + "epoch": 1.0399077590884427, + "grad_norm": 7.059428691864014, + "learning_rate": 7.678909140742772e-06, + "loss": 0.4067, + "step": 17688 + }, + { + "epoch": 1.0399213239283776, + "grad_norm": 5.166317939758301, + "learning_rate": 7.678772098122516e-06, + "loss": 0.2074, + "step": 17689 + }, + { + "epoch": 1.0399348887683126, + "grad_norm": 5.060512542724609, + "learning_rate": 7.678635055502261e-06, + "loss": 0.2529, + "step": 17690 + }, + { + "epoch": 1.0399484536082475, + "grad_norm": 6.045583724975586, + "learning_rate": 7.678498012882008e-06, + "loss": 0.3621, + "step": 17691 + }, + { + "epoch": 1.0399620184481824, + "grad_norm": 5.926126003265381, + "learning_rate": 7.678360970261752e-06, + "loss": 0.3265, + "step": 17692 + }, + { + "epoch": 1.0399755832881172, + "grad_norm": 6.263721942901611, + "learning_rate": 7.678223927641497e-06, + "loss": 0.4154, + "step": 17693 + }, + { + "epoch": 1.039989148128052, + "grad_norm": 7.901873588562012, + "learning_rate": 7.678086885021242e-06, + "loss": 0.4296, + "step": 17694 + }, + { + "epoch": 1.040002712967987, + "grad_norm": 5.505758762359619, + "learning_rate": 7.677949842400987e-06, + "loss": 0.2825, + "step": 17695 + }, + { + "epoch": 1.0400162778079218, + "grad_norm": 5.652427673339844, + "learning_rate": 7.677812799780732e-06, + "loss": 0.3045, + "step": 17696 + }, + { + "epoch": 1.0400298426478567, + "grad_norm": 8.609212875366211, + "learning_rate": 7.677675757160477e-06, + "loss": 0.3319, + "step": 17697 + }, + { + "epoch": 1.0400434074877916, + "grad_norm": 5.601275444030762, + "learning_rate": 7.677538714540223e-06, + "loss": 0.2737, + "step": 17698 + }, + { + "epoch": 1.0400569723277264, + "grad_norm": 6.942990779876709, + "learning_rate": 7.677401671919968e-06, + "loss": 0.4029, + "step": 17699 + }, + { + "epoch": 1.0400705371676615, + "grad_norm": 5.807370185852051, + "learning_rate": 7.677264629299713e-06, + "loss": 0.3785, + "step": 17700 + }, + { + "epoch": 1.0400841020075964, + "grad_norm": 8.382254600524902, + "learning_rate": 7.677127586679458e-06, + "loss": 0.3275, + "step": 17701 + }, + { + "epoch": 1.0400976668475312, + "grad_norm": 8.251533508300781, + "learning_rate": 7.676990544059203e-06, + "loss": 0.4451, + "step": 17702 + }, + { + "epoch": 1.040111231687466, + "grad_norm": 7.162902355194092, + "learning_rate": 7.676853501438948e-06, + "loss": 0.2275, + "step": 17703 + }, + { + "epoch": 1.040124796527401, + "grad_norm": 6.257731914520264, + "learning_rate": 7.676716458818694e-06, + "loss": 0.3119, + "step": 17704 + }, + { + "epoch": 1.0401383613673358, + "grad_norm": 6.482021331787109, + "learning_rate": 7.676579416198439e-06, + "loss": 0.3523, + "step": 17705 + }, + { + "epoch": 1.0401519262072707, + "grad_norm": 8.22184944152832, + "learning_rate": 7.676442373578184e-06, + "loss": 0.4199, + "step": 17706 + }, + { + "epoch": 1.0401654910472056, + "grad_norm": 5.5871124267578125, + "learning_rate": 7.676305330957928e-06, + "loss": 0.2912, + "step": 17707 + }, + { + "epoch": 1.0401790558871404, + "grad_norm": 4.758833885192871, + "learning_rate": 7.676168288337673e-06, + "loss": 0.237, + "step": 17708 + }, + { + "epoch": 1.0401926207270755, + "grad_norm": 5.038266181945801, + "learning_rate": 7.67603124571742e-06, + "loss": 0.3385, + "step": 17709 + }, + { + "epoch": 1.0402061855670104, + "grad_norm": 7.065070629119873, + "learning_rate": 7.675894203097165e-06, + "loss": 0.3746, + "step": 17710 + }, + { + "epoch": 1.0402197504069453, + "grad_norm": 6.673791408538818, + "learning_rate": 7.675757160476908e-06, + "loss": 0.31, + "step": 17711 + }, + { + "epoch": 1.0402333152468801, + "grad_norm": 5.248042583465576, + "learning_rate": 7.675620117856653e-06, + "loss": 0.3865, + "step": 17712 + }, + { + "epoch": 1.040246880086815, + "grad_norm": 5.1419172286987305, + "learning_rate": 7.6754830752364e-06, + "loss": 0.3612, + "step": 17713 + }, + { + "epoch": 1.0402604449267498, + "grad_norm": 10.042607307434082, + "learning_rate": 7.675346032616144e-06, + "loss": 0.4727, + "step": 17714 + }, + { + "epoch": 1.0402740097666847, + "grad_norm": 5.2993083000183105, + "learning_rate": 7.675208989995889e-06, + "loss": 0.3515, + "step": 17715 + }, + { + "epoch": 1.0402875746066196, + "grad_norm": 8.292531967163086, + "learning_rate": 7.675071947375634e-06, + "loss": 0.3403, + "step": 17716 + }, + { + "epoch": 1.0403011394465544, + "grad_norm": 6.0972490310668945, + "learning_rate": 7.67493490475538e-06, + "loss": 0.2961, + "step": 17717 + }, + { + "epoch": 1.0403147042864895, + "grad_norm": 4.775275707244873, + "learning_rate": 7.674797862135124e-06, + "loss": 0.263, + "step": 17718 + }, + { + "epoch": 1.0403282691264244, + "grad_norm": 7.105576515197754, + "learning_rate": 7.67466081951487e-06, + "loss": 0.2579, + "step": 17719 + }, + { + "epoch": 1.0403418339663593, + "grad_norm": 4.878469467163086, + "learning_rate": 7.674523776894615e-06, + "loss": 0.238, + "step": 17720 + }, + { + "epoch": 1.0403553988062941, + "grad_norm": 4.40350341796875, + "learning_rate": 7.67438673427436e-06, + "loss": 0.3512, + "step": 17721 + }, + { + "epoch": 1.040368963646229, + "grad_norm": 6.3108320236206055, + "learning_rate": 7.674249691654105e-06, + "loss": 0.4596, + "step": 17722 + }, + { + "epoch": 1.0403825284861639, + "grad_norm": 7.287433624267578, + "learning_rate": 7.67411264903385e-06, + "loss": 0.3283, + "step": 17723 + }, + { + "epoch": 1.0403960933260987, + "grad_norm": 4.778316497802734, + "learning_rate": 7.673975606413596e-06, + "loss": 0.3079, + "step": 17724 + }, + { + "epoch": 1.0404096581660336, + "grad_norm": 6.351926803588867, + "learning_rate": 7.67383856379334e-06, + "loss": 0.3128, + "step": 17725 + }, + { + "epoch": 1.0404232230059685, + "grad_norm": 7.080634593963623, + "learning_rate": 7.673701521173086e-06, + "loss": 0.4095, + "step": 17726 + }, + { + "epoch": 1.0404367878459033, + "grad_norm": 6.739854335784912, + "learning_rate": 7.673564478552831e-06, + "loss": 0.448, + "step": 17727 + }, + { + "epoch": 1.0404503526858384, + "grad_norm": 5.554154396057129, + "learning_rate": 7.673427435932576e-06, + "loss": 0.3212, + "step": 17728 + }, + { + "epoch": 1.0404639175257733, + "grad_norm": 5.877597332000732, + "learning_rate": 7.67329039331232e-06, + "loss": 0.3295, + "step": 17729 + }, + { + "epoch": 1.0404774823657081, + "grad_norm": 8.18401050567627, + "learning_rate": 7.673153350692067e-06, + "loss": 0.5086, + "step": 17730 + }, + { + "epoch": 1.040491047205643, + "grad_norm": 6.00286865234375, + "learning_rate": 7.673016308071812e-06, + "loss": 0.3554, + "step": 17731 + }, + { + "epoch": 1.0405046120455779, + "grad_norm": 7.5697503089904785, + "learning_rate": 7.672879265451555e-06, + "loss": 0.3514, + "step": 17732 + }, + { + "epoch": 1.0405181768855127, + "grad_norm": 7.0207109451293945, + "learning_rate": 7.6727422228313e-06, + "loss": 0.3339, + "step": 17733 + }, + { + "epoch": 1.0405317417254476, + "grad_norm": 3.5170626640319824, + "learning_rate": 7.672605180211046e-06, + "loss": 0.2489, + "step": 17734 + }, + { + "epoch": 1.0405453065653825, + "grad_norm": 5.356331825256348, + "learning_rate": 7.672468137590793e-06, + "loss": 0.3022, + "step": 17735 + }, + { + "epoch": 1.0405588714053173, + "grad_norm": 7.006809711456299, + "learning_rate": 7.672331094970536e-06, + "loss": 0.3706, + "step": 17736 + }, + { + "epoch": 1.0405724362452524, + "grad_norm": 4.804441928863525, + "learning_rate": 7.672194052350281e-06, + "loss": 0.3224, + "step": 17737 + }, + { + "epoch": 1.0405860010851873, + "grad_norm": 6.294196128845215, + "learning_rate": 7.672057009730026e-06, + "loss": 0.3051, + "step": 17738 + }, + { + "epoch": 1.0405995659251221, + "grad_norm": 7.093759536743164, + "learning_rate": 7.671919967109772e-06, + "loss": 0.3881, + "step": 17739 + }, + { + "epoch": 1.040613130765057, + "grad_norm": 5.800421237945557, + "learning_rate": 7.671782924489517e-06, + "loss": 0.3971, + "step": 17740 + }, + { + "epoch": 1.0406266956049919, + "grad_norm": 6.485775947570801, + "learning_rate": 7.671645881869262e-06, + "loss": 0.334, + "step": 17741 + }, + { + "epoch": 1.0406402604449267, + "grad_norm": 6.881815433502197, + "learning_rate": 7.671508839249007e-06, + "loss": 0.2492, + "step": 17742 + }, + { + "epoch": 1.0406538252848616, + "grad_norm": 4.564511775970459, + "learning_rate": 7.671371796628752e-06, + "loss": 0.191, + "step": 17743 + }, + { + "epoch": 1.0406673901247965, + "grad_norm": 5.871431827545166, + "learning_rate": 7.671234754008497e-06, + "loss": 0.3374, + "step": 17744 + }, + { + "epoch": 1.0406809549647313, + "grad_norm": 5.874326229095459, + "learning_rate": 7.671097711388243e-06, + "loss": 0.3933, + "step": 17745 + }, + { + "epoch": 1.0406945198046662, + "grad_norm": 7.20310115814209, + "learning_rate": 7.670960668767988e-06, + "loss": 0.3845, + "step": 17746 + }, + { + "epoch": 1.0407080846446013, + "grad_norm": 6.561047077178955, + "learning_rate": 7.670823626147733e-06, + "loss": 0.3121, + "step": 17747 + }, + { + "epoch": 1.0407216494845362, + "grad_norm": 5.698546886444092, + "learning_rate": 7.670686583527478e-06, + "loss": 0.2749, + "step": 17748 + }, + { + "epoch": 1.040735214324471, + "grad_norm": 4.888818740844727, + "learning_rate": 7.670549540907223e-06, + "loss": 0.324, + "step": 17749 + }, + { + "epoch": 1.040748779164406, + "grad_norm": 6.582786560058594, + "learning_rate": 7.670412498286967e-06, + "loss": 0.339, + "step": 17750 + }, + { + "epoch": 1.0407623440043408, + "grad_norm": 5.243333339691162, + "learning_rate": 7.670275455666712e-06, + "loss": 0.3025, + "step": 17751 + }, + { + "epoch": 1.0407759088442756, + "grad_norm": 6.102072238922119, + "learning_rate": 7.670138413046459e-06, + "loss": 0.3394, + "step": 17752 + }, + { + "epoch": 1.0407894736842105, + "grad_norm": 4.5942230224609375, + "learning_rate": 7.670001370426204e-06, + "loss": 0.2831, + "step": 17753 + }, + { + "epoch": 1.0408030385241454, + "grad_norm": 6.49324369430542, + "learning_rate": 7.669864327805948e-06, + "loss": 0.3715, + "step": 17754 + }, + { + "epoch": 1.0408166033640802, + "grad_norm": 5.095782279968262, + "learning_rate": 7.669727285185693e-06, + "loss": 0.2346, + "step": 17755 + }, + { + "epoch": 1.0408301682040153, + "grad_norm": 3.992415189743042, + "learning_rate": 7.66959024256544e-06, + "loss": 0.2899, + "step": 17756 + }, + { + "epoch": 1.0408437330439502, + "grad_norm": 4.3604817390441895, + "learning_rate": 7.669453199945183e-06, + "loss": 0.2215, + "step": 17757 + }, + { + "epoch": 1.040857297883885, + "grad_norm": 4.6842265129089355, + "learning_rate": 7.669316157324928e-06, + "loss": 0.1465, + "step": 17758 + }, + { + "epoch": 1.04087086272382, + "grad_norm": 6.720714569091797, + "learning_rate": 7.669179114704673e-06, + "loss": 0.3082, + "step": 17759 + }, + { + "epoch": 1.0408844275637548, + "grad_norm": 6.233506679534912, + "learning_rate": 7.669042072084419e-06, + "loss": 0.4426, + "step": 17760 + }, + { + "epoch": 1.0408979924036896, + "grad_norm": 7.413999080657959, + "learning_rate": 7.668905029464164e-06, + "loss": 0.4005, + "step": 17761 + }, + { + "epoch": 1.0409115572436245, + "grad_norm": 5.0278167724609375, + "learning_rate": 7.668767986843909e-06, + "loss": 0.2043, + "step": 17762 + }, + { + "epoch": 1.0409251220835594, + "grad_norm": 4.684789657592773, + "learning_rate": 7.668630944223654e-06, + "loss": 0.2476, + "step": 17763 + }, + { + "epoch": 1.0409386869234942, + "grad_norm": 4.939156532287598, + "learning_rate": 7.6684939016034e-06, + "loss": 0.3111, + "step": 17764 + }, + { + "epoch": 1.040952251763429, + "grad_norm": 6.086126327514648, + "learning_rate": 7.668356858983144e-06, + "loss": 0.2525, + "step": 17765 + }, + { + "epoch": 1.0409658166033642, + "grad_norm": 5.5237836837768555, + "learning_rate": 7.66821981636289e-06, + "loss": 0.309, + "step": 17766 + }, + { + "epoch": 1.040979381443299, + "grad_norm": 6.646378517150879, + "learning_rate": 7.668082773742635e-06, + "loss": 0.2739, + "step": 17767 + }, + { + "epoch": 1.040992946283234, + "grad_norm": 6.2583794593811035, + "learning_rate": 7.66794573112238e-06, + "loss": 0.3125, + "step": 17768 + }, + { + "epoch": 1.0410065111231688, + "grad_norm": 13.496192932128906, + "learning_rate": 7.667808688502125e-06, + "loss": 0.2614, + "step": 17769 + }, + { + "epoch": 1.0410200759631036, + "grad_norm": 6.205106735229492, + "learning_rate": 7.66767164588187e-06, + "loss": 0.3831, + "step": 17770 + }, + { + "epoch": 1.0410336408030385, + "grad_norm": 3.550757884979248, + "learning_rate": 7.667534603261616e-06, + "loss": 0.1823, + "step": 17771 + }, + { + "epoch": 1.0410472056429734, + "grad_norm": 5.368886470794678, + "learning_rate": 7.667397560641359e-06, + "loss": 0.3336, + "step": 17772 + }, + { + "epoch": 1.0410607704829082, + "grad_norm": 5.502018451690674, + "learning_rate": 7.667260518021106e-06, + "loss": 0.2683, + "step": 17773 + }, + { + "epoch": 1.041074335322843, + "grad_norm": 4.984633922576904, + "learning_rate": 7.667123475400851e-06, + "loss": 0.3009, + "step": 17774 + }, + { + "epoch": 1.0410879001627782, + "grad_norm": 5.632351875305176, + "learning_rate": 7.666986432780595e-06, + "loss": 0.2463, + "step": 17775 + }, + { + "epoch": 1.041101465002713, + "grad_norm": 6.994905471801758, + "learning_rate": 7.66684939016034e-06, + "loss": 0.435, + "step": 17776 + }, + { + "epoch": 1.041115029842648, + "grad_norm": 3.4451282024383545, + "learning_rate": 7.666712347540085e-06, + "loss": 0.1981, + "step": 17777 + }, + { + "epoch": 1.0411285946825828, + "grad_norm": 4.756447792053223, + "learning_rate": 7.666575304919832e-06, + "loss": 0.2185, + "step": 17778 + }, + { + "epoch": 1.0411421595225177, + "grad_norm": 6.327884674072266, + "learning_rate": 7.666438262299575e-06, + "loss": 0.3029, + "step": 17779 + }, + { + "epoch": 1.0411557243624525, + "grad_norm": 7.409245014190674, + "learning_rate": 7.66630121967932e-06, + "loss": 0.3662, + "step": 17780 + }, + { + "epoch": 1.0411692892023874, + "grad_norm": 6.590825080871582, + "learning_rate": 7.666164177059066e-06, + "loss": 0.2703, + "step": 17781 + }, + { + "epoch": 1.0411828540423222, + "grad_norm": 7.563324451446533, + "learning_rate": 7.66602713443881e-06, + "loss": 0.3478, + "step": 17782 + }, + { + "epoch": 1.0411964188822571, + "grad_norm": 3.8815579414367676, + "learning_rate": 7.665890091818556e-06, + "loss": 0.2292, + "step": 17783 + }, + { + "epoch": 1.041209983722192, + "grad_norm": 5.949525356292725, + "learning_rate": 7.665753049198301e-06, + "loss": 0.2157, + "step": 17784 + }, + { + "epoch": 1.041223548562127, + "grad_norm": 6.993226051330566, + "learning_rate": 7.665616006578046e-06, + "loss": 0.3713, + "step": 17785 + }, + { + "epoch": 1.041237113402062, + "grad_norm": 5.8826727867126465, + "learning_rate": 7.665478963957792e-06, + "loss": 0.2813, + "step": 17786 + }, + { + "epoch": 1.0412506782419968, + "grad_norm": 7.9456467628479, + "learning_rate": 7.665341921337537e-06, + "loss": 0.3757, + "step": 17787 + }, + { + "epoch": 1.0412642430819317, + "grad_norm": 6.552990436553955, + "learning_rate": 7.665204878717282e-06, + "loss": 0.2483, + "step": 17788 + }, + { + "epoch": 1.0412778079218665, + "grad_norm": 6.514076232910156, + "learning_rate": 7.665067836097027e-06, + "loss": 0.3222, + "step": 17789 + }, + { + "epoch": 1.0412913727618014, + "grad_norm": 4.954232215881348, + "learning_rate": 7.66493079347677e-06, + "loss": 0.1796, + "step": 17790 + }, + { + "epoch": 1.0413049376017363, + "grad_norm": 6.157423496246338, + "learning_rate": 7.664793750856517e-06, + "loss": 0.297, + "step": 17791 + }, + { + "epoch": 1.0413185024416711, + "grad_norm": 4.931014060974121, + "learning_rate": 7.664656708236263e-06, + "loss": 0.1978, + "step": 17792 + }, + { + "epoch": 1.041332067281606, + "grad_norm": 6.265263080596924, + "learning_rate": 7.664519665616008e-06, + "loss": 0.3045, + "step": 17793 + }, + { + "epoch": 1.041345632121541, + "grad_norm": 5.468201637268066, + "learning_rate": 7.664382622995751e-06, + "loss": 0.2814, + "step": 17794 + }, + { + "epoch": 1.041359196961476, + "grad_norm": 6.833204746246338, + "learning_rate": 7.664245580375498e-06, + "loss": 0.3561, + "step": 17795 + }, + { + "epoch": 1.0413727618014108, + "grad_norm": 6.465656757354736, + "learning_rate": 7.664108537755243e-06, + "loss": 0.2866, + "step": 17796 + }, + { + "epoch": 1.0413863266413457, + "grad_norm": 6.131727695465088, + "learning_rate": 7.663971495134987e-06, + "loss": 0.3869, + "step": 17797 + }, + { + "epoch": 1.0413998914812805, + "grad_norm": 5.1312665939331055, + "learning_rate": 7.663834452514732e-06, + "loss": 0.286, + "step": 17798 + }, + { + "epoch": 1.0414134563212154, + "grad_norm": 6.518374443054199, + "learning_rate": 7.663697409894479e-06, + "loss": 0.3482, + "step": 17799 + }, + { + "epoch": 1.0414270211611503, + "grad_norm": 4.331299304962158, + "learning_rate": 7.663560367274222e-06, + "loss": 0.234, + "step": 17800 + }, + { + "epoch": 1.0414405860010851, + "grad_norm": 5.624861717224121, + "learning_rate": 7.663423324653968e-06, + "loss": 0.2849, + "step": 17801 + }, + { + "epoch": 1.04145415084102, + "grad_norm": 4.9799981117248535, + "learning_rate": 7.663286282033713e-06, + "loss": 0.2691, + "step": 17802 + }, + { + "epoch": 1.0414677156809549, + "grad_norm": 6.141937255859375, + "learning_rate": 7.663149239413458e-06, + "loss": 0.2249, + "step": 17803 + }, + { + "epoch": 1.04148128052089, + "grad_norm": 5.656289100646973, + "learning_rate": 7.663012196793203e-06, + "loss": 0.2016, + "step": 17804 + }, + { + "epoch": 1.0414948453608248, + "grad_norm": 6.495238780975342, + "learning_rate": 7.662875154172948e-06, + "loss": 0.3, + "step": 17805 + }, + { + "epoch": 1.0415084102007597, + "grad_norm": 6.591668605804443, + "learning_rate": 7.662738111552693e-06, + "loss": 0.2357, + "step": 17806 + }, + { + "epoch": 1.0415219750406945, + "grad_norm": 5.7874603271484375, + "learning_rate": 7.662601068932439e-06, + "loss": 0.2219, + "step": 17807 + }, + { + "epoch": 1.0415355398806294, + "grad_norm": 5.759100437164307, + "learning_rate": 7.662464026312184e-06, + "loss": 0.2488, + "step": 17808 + }, + { + "epoch": 1.0415491047205643, + "grad_norm": 6.21157169342041, + "learning_rate": 7.662326983691929e-06, + "loss": 0.2772, + "step": 17809 + }, + { + "epoch": 1.0415626695604991, + "grad_norm": 5.808743476867676, + "learning_rate": 7.662189941071674e-06, + "loss": 0.2178, + "step": 17810 + }, + { + "epoch": 1.041576234400434, + "grad_norm": 7.983504295349121, + "learning_rate": 7.66205289845142e-06, + "loss": 0.3947, + "step": 17811 + }, + { + "epoch": 1.0415897992403689, + "grad_norm": 5.042590618133545, + "learning_rate": 7.661915855831165e-06, + "loss": 0.2276, + "step": 17812 + }, + { + "epoch": 1.041603364080304, + "grad_norm": 3.8040874004364014, + "learning_rate": 7.66177881321091e-06, + "loss": 0.1392, + "step": 17813 + }, + { + "epoch": 1.0416169289202388, + "grad_norm": 7.160871982574463, + "learning_rate": 7.661641770590655e-06, + "loss": 0.2764, + "step": 17814 + }, + { + "epoch": 1.0416304937601737, + "grad_norm": 5.9501848220825195, + "learning_rate": 7.661504727970398e-06, + "loss": 0.2516, + "step": 17815 + }, + { + "epoch": 1.0416440586001086, + "grad_norm": 5.489232063293457, + "learning_rate": 7.661367685350144e-06, + "loss": 0.1818, + "step": 17816 + }, + { + "epoch": 1.0416576234400434, + "grad_norm": 5.681253910064697, + "learning_rate": 7.66123064272989e-06, + "loss": 0.2788, + "step": 17817 + }, + { + "epoch": 1.0416711882799783, + "grad_norm": 4.625288486480713, + "learning_rate": 7.661093600109636e-06, + "loss": 0.1948, + "step": 17818 + }, + { + "epoch": 1.0416847531199132, + "grad_norm": 8.196656227111816, + "learning_rate": 7.660956557489379e-06, + "loss": 0.3348, + "step": 17819 + }, + { + "epoch": 1.041698317959848, + "grad_norm": 5.465452194213867, + "learning_rate": 7.660819514869124e-06, + "loss": 0.3083, + "step": 17820 + }, + { + "epoch": 1.0417118827997829, + "grad_norm": 4.921830654144287, + "learning_rate": 7.660682472248871e-06, + "loss": 0.1426, + "step": 17821 + }, + { + "epoch": 1.0417254476397177, + "grad_norm": 7.314966201782227, + "learning_rate": 7.660545429628615e-06, + "loss": 0.2765, + "step": 17822 + }, + { + "epoch": 1.0417390124796528, + "grad_norm": 4.687954425811768, + "learning_rate": 7.66040838700836e-06, + "loss": 0.2228, + "step": 17823 + }, + { + "epoch": 1.0417525773195877, + "grad_norm": 7.6182146072387695, + "learning_rate": 7.660271344388105e-06, + "loss": 0.2044, + "step": 17824 + }, + { + "epoch": 1.0417661421595226, + "grad_norm": 4.540556907653809, + "learning_rate": 7.66013430176785e-06, + "loss": 0.1975, + "step": 17825 + }, + { + "epoch": 1.0417797069994574, + "grad_norm": 3.3181638717651367, + "learning_rate": 7.659997259147595e-06, + "loss": 0.1523, + "step": 17826 + }, + { + "epoch": 1.0417932718393923, + "grad_norm": 4.725897789001465, + "learning_rate": 7.65986021652734e-06, + "loss": 0.1687, + "step": 17827 + }, + { + "epoch": 1.0418068366793272, + "grad_norm": 4.666100025177002, + "learning_rate": 7.659723173907086e-06, + "loss": 0.2431, + "step": 17828 + }, + { + "epoch": 1.041820401519262, + "grad_norm": 4.933299541473389, + "learning_rate": 7.659586131286831e-06, + "loss": 0.159, + "step": 17829 + }, + { + "epoch": 1.041833966359197, + "grad_norm": 5.905338287353516, + "learning_rate": 7.659449088666576e-06, + "loss": 0.2441, + "step": 17830 + }, + { + "epoch": 1.0418475311991318, + "grad_norm": 5.629512310028076, + "learning_rate": 7.659312046046321e-06, + "loss": 0.263, + "step": 17831 + }, + { + "epoch": 1.0418610960390668, + "grad_norm": 6.472352504730225, + "learning_rate": 7.659175003426066e-06, + "loss": 0.2982, + "step": 17832 + }, + { + "epoch": 1.0418746608790017, + "grad_norm": 4.927412509918213, + "learning_rate": 7.659037960805812e-06, + "loss": 0.2435, + "step": 17833 + }, + { + "epoch": 1.0418882257189366, + "grad_norm": 6.541058540344238, + "learning_rate": 7.658900918185557e-06, + "loss": 0.2958, + "step": 17834 + }, + { + "epoch": 1.0419017905588714, + "grad_norm": 5.293844223022461, + "learning_rate": 7.658763875565302e-06, + "loss": 0.2353, + "step": 17835 + }, + { + "epoch": 1.0419153553988063, + "grad_norm": 5.013166904449463, + "learning_rate": 7.658626832945047e-06, + "loss": 0.2534, + "step": 17836 + }, + { + "epoch": 1.0419289202387412, + "grad_norm": 5.023099422454834, + "learning_rate": 7.65848979032479e-06, + "loss": 0.2189, + "step": 17837 + }, + { + "epoch": 1.041942485078676, + "grad_norm": 3.146228790283203, + "learning_rate": 7.658352747704537e-06, + "loss": 0.1024, + "step": 17838 + }, + { + "epoch": 1.041956049918611, + "grad_norm": 5.673202037811279, + "learning_rate": 7.658215705084283e-06, + "loss": 0.2448, + "step": 17839 + }, + { + "epoch": 1.0419696147585458, + "grad_norm": 4.76392126083374, + "learning_rate": 7.658078662464026e-06, + "loss": 0.231, + "step": 17840 + }, + { + "epoch": 1.0419831795984806, + "grad_norm": 6.209117889404297, + "learning_rate": 7.657941619843771e-06, + "loss": 0.2157, + "step": 17841 + }, + { + "epoch": 1.0419967444384157, + "grad_norm": 4.307928085327148, + "learning_rate": 7.657804577223518e-06, + "loss": 0.1932, + "step": 17842 + }, + { + "epoch": 1.0420103092783506, + "grad_norm": 6.453895568847656, + "learning_rate": 7.657667534603262e-06, + "loss": 0.3169, + "step": 17843 + }, + { + "epoch": 1.0420238741182855, + "grad_norm": 3.97310471534729, + "learning_rate": 7.657530491983007e-06, + "loss": 0.162, + "step": 17844 + }, + { + "epoch": 1.0420374389582203, + "grad_norm": 7.127921104431152, + "learning_rate": 7.657393449362752e-06, + "loss": 0.3265, + "step": 17845 + }, + { + "epoch": 1.0420510037981552, + "grad_norm": 6.028442859649658, + "learning_rate": 7.657256406742497e-06, + "loss": 0.2161, + "step": 17846 + }, + { + "epoch": 1.04206456863809, + "grad_norm": 6.697978496551514, + "learning_rate": 7.657119364122242e-06, + "loss": 0.2869, + "step": 17847 + }, + { + "epoch": 1.042078133478025, + "grad_norm": 4.955164909362793, + "learning_rate": 7.656982321501988e-06, + "loss": 0.2149, + "step": 17848 + }, + { + "epoch": 1.0420916983179598, + "grad_norm": 5.143742084503174, + "learning_rate": 7.656845278881733e-06, + "loss": 0.199, + "step": 17849 + }, + { + "epoch": 1.0421052631578946, + "grad_norm": 6.883133411407471, + "learning_rate": 7.656708236261478e-06, + "loss": 0.5044, + "step": 17850 + }, + { + "epoch": 1.0421188279978297, + "grad_norm": 4.525451183319092, + "learning_rate": 7.656571193641223e-06, + "loss": 0.1798, + "step": 17851 + }, + { + "epoch": 1.0421323928377646, + "grad_norm": 6.742828369140625, + "learning_rate": 7.656434151020968e-06, + "loss": 0.3592, + "step": 17852 + }, + { + "epoch": 1.0421459576776995, + "grad_norm": 4.522529602050781, + "learning_rate": 7.656297108400713e-06, + "loss": 0.1943, + "step": 17853 + }, + { + "epoch": 1.0421595225176343, + "grad_norm": 4.683455467224121, + "learning_rate": 7.656160065780459e-06, + "loss": 0.1935, + "step": 17854 + }, + { + "epoch": 1.0421730873575692, + "grad_norm": 3.5657906532287598, + "learning_rate": 7.656023023160204e-06, + "loss": 0.1545, + "step": 17855 + }, + { + "epoch": 1.042186652197504, + "grad_norm": 5.876086235046387, + "learning_rate": 7.655885980539949e-06, + "loss": 0.265, + "step": 17856 + }, + { + "epoch": 1.042200217037439, + "grad_norm": 5.647581577301025, + "learning_rate": 7.655748937919694e-06, + "loss": 0.2627, + "step": 17857 + }, + { + "epoch": 1.0422137818773738, + "grad_norm": 6.913881301879883, + "learning_rate": 7.655611895299438e-06, + "loss": 0.3947, + "step": 17858 + }, + { + "epoch": 1.0422273467173087, + "grad_norm": 4.204385280609131, + "learning_rate": 7.655474852679183e-06, + "loss": 0.1868, + "step": 17859 + }, + { + "epoch": 1.0422409115572435, + "grad_norm": 5.112208366394043, + "learning_rate": 7.65533781005893e-06, + "loss": 0.2796, + "step": 17860 + }, + { + "epoch": 1.0422544763971786, + "grad_norm": 5.176498889923096, + "learning_rate": 7.655200767438675e-06, + "loss": 0.2019, + "step": 17861 + }, + { + "epoch": 1.0422680412371135, + "grad_norm": 4.928133487701416, + "learning_rate": 7.655063724818418e-06, + "loss": 0.1809, + "step": 17862 + }, + { + "epoch": 1.0422816060770483, + "grad_norm": 3.6376612186431885, + "learning_rate": 7.654926682198164e-06, + "loss": 0.1499, + "step": 17863 + }, + { + "epoch": 1.0422951709169832, + "grad_norm": 5.14940881729126, + "learning_rate": 7.65478963957791e-06, + "loss": 0.2508, + "step": 17864 + }, + { + "epoch": 1.042308735756918, + "grad_norm": 6.616940498352051, + "learning_rate": 7.654652596957654e-06, + "loss": 0.2404, + "step": 17865 + }, + { + "epoch": 1.042322300596853, + "grad_norm": 3.7956809997558594, + "learning_rate": 7.654515554337399e-06, + "loss": 0.0918, + "step": 17866 + }, + { + "epoch": 1.0423358654367878, + "grad_norm": 3.7806928157806396, + "learning_rate": 7.654378511717144e-06, + "loss": 0.2257, + "step": 17867 + }, + { + "epoch": 1.0423494302767227, + "grad_norm": 4.829227924346924, + "learning_rate": 7.65424146909689e-06, + "loss": 0.2096, + "step": 17868 + }, + { + "epoch": 1.0423629951166575, + "grad_norm": 5.873241901397705, + "learning_rate": 7.654104426476635e-06, + "loss": 0.1876, + "step": 17869 + }, + { + "epoch": 1.0423765599565926, + "grad_norm": 3.7301266193389893, + "learning_rate": 7.65396738385638e-06, + "loss": 0.1882, + "step": 17870 + }, + { + "epoch": 1.0423901247965275, + "grad_norm": 5.028961181640625, + "learning_rate": 7.653830341236125e-06, + "loss": 0.1921, + "step": 17871 + }, + { + "epoch": 1.0424036896364623, + "grad_norm": 2.985529899597168, + "learning_rate": 7.65369329861587e-06, + "loss": 0.1118, + "step": 17872 + }, + { + "epoch": 1.0424172544763972, + "grad_norm": 4.5663957595825195, + "learning_rate": 7.653556255995615e-06, + "loss": 0.2193, + "step": 17873 + }, + { + "epoch": 1.042430819316332, + "grad_norm": 6.386663436889648, + "learning_rate": 7.65341921337536e-06, + "loss": 0.1583, + "step": 17874 + }, + { + "epoch": 1.042444384156267, + "grad_norm": 5.531551837921143, + "learning_rate": 7.653282170755106e-06, + "loss": 0.2963, + "step": 17875 + }, + { + "epoch": 1.0424579489962018, + "grad_norm": 5.669804573059082, + "learning_rate": 7.653145128134851e-06, + "loss": 0.2186, + "step": 17876 + }, + { + "epoch": 1.0424715138361367, + "grad_norm": 4.773687362670898, + "learning_rate": 7.653008085514596e-06, + "loss": 0.2382, + "step": 17877 + }, + { + "epoch": 1.0424850786760715, + "grad_norm": 6.536593437194824, + "learning_rate": 7.652871042894341e-06, + "loss": 0.2885, + "step": 17878 + }, + { + "epoch": 1.0424986435160064, + "grad_norm": 5.115184783935547, + "learning_rate": 7.652734000274086e-06, + "loss": 0.2304, + "step": 17879 + }, + { + "epoch": 1.0425122083559415, + "grad_norm": 6.014224529266357, + "learning_rate": 7.65259695765383e-06, + "loss": 0.2825, + "step": 17880 + }, + { + "epoch": 1.0425257731958764, + "grad_norm": 4.440776824951172, + "learning_rate": 7.652459915033577e-06, + "loss": 0.1755, + "step": 17881 + }, + { + "epoch": 1.0425393380358112, + "grad_norm": 5.290576457977295, + "learning_rate": 7.652322872413322e-06, + "loss": 0.3134, + "step": 17882 + }, + { + "epoch": 1.042552902875746, + "grad_norm": 3.9141368865966797, + "learning_rate": 7.652185829793065e-06, + "loss": 0.1761, + "step": 17883 + }, + { + "epoch": 1.042566467715681, + "grad_norm": 5.557185173034668, + "learning_rate": 7.65204878717281e-06, + "loss": 0.2365, + "step": 17884 + }, + { + "epoch": 1.0425800325556158, + "grad_norm": 5.937056541442871, + "learning_rate": 7.651911744552556e-06, + "loss": 0.2888, + "step": 17885 + }, + { + "epoch": 1.0425935973955507, + "grad_norm": 5.352377414703369, + "learning_rate": 7.651774701932303e-06, + "loss": 0.2726, + "step": 17886 + }, + { + "epoch": 1.0426071622354856, + "grad_norm": 7.033565998077393, + "learning_rate": 7.651637659312046e-06, + "loss": 0.3581, + "step": 17887 + }, + { + "epoch": 1.0426207270754204, + "grad_norm": 5.984304428100586, + "learning_rate": 7.651500616691791e-06, + "loss": 0.1727, + "step": 17888 + }, + { + "epoch": 1.0426342919153555, + "grad_norm": 6.423558235168457, + "learning_rate": 7.651363574071537e-06, + "loss": 0.2908, + "step": 17889 + }, + { + "epoch": 1.0426478567552904, + "grad_norm": 6.357278347015381, + "learning_rate": 7.651226531451282e-06, + "loss": 0.3337, + "step": 17890 + }, + { + "epoch": 1.0426614215952252, + "grad_norm": 7.527837753295898, + "learning_rate": 7.651089488831027e-06, + "loss": 0.2935, + "step": 17891 + }, + { + "epoch": 1.04267498643516, + "grad_norm": 7.275649547576904, + "learning_rate": 7.650952446210772e-06, + "loss": 0.4095, + "step": 17892 + }, + { + "epoch": 1.042688551275095, + "grad_norm": 4.341866493225098, + "learning_rate": 7.650815403590517e-06, + "loss": 0.1912, + "step": 17893 + }, + { + "epoch": 1.0427021161150298, + "grad_norm": 4.8496928215026855, + "learning_rate": 7.650678360970262e-06, + "loss": 0.2743, + "step": 17894 + }, + { + "epoch": 1.0427156809549647, + "grad_norm": 5.1149091720581055, + "learning_rate": 7.650541318350008e-06, + "loss": 0.3304, + "step": 17895 + }, + { + "epoch": 1.0427292457948996, + "grad_norm": 6.648166179656982, + "learning_rate": 7.650404275729753e-06, + "loss": 0.2252, + "step": 17896 + }, + { + "epoch": 1.0427428106348344, + "grad_norm": 5.746749401092529, + "learning_rate": 7.650267233109498e-06, + "loss": 0.3439, + "step": 17897 + }, + { + "epoch": 1.0427563754747693, + "grad_norm": 5.873103618621826, + "learning_rate": 7.650130190489243e-06, + "loss": 0.2716, + "step": 17898 + }, + { + "epoch": 1.0427699403147044, + "grad_norm": 5.66044282913208, + "learning_rate": 7.649993147868988e-06, + "loss": 0.1822, + "step": 17899 + }, + { + "epoch": 1.0427835051546392, + "grad_norm": 4.574548721313477, + "learning_rate": 7.649856105248733e-06, + "loss": 0.1602, + "step": 17900 + }, + { + "epoch": 1.042797069994574, + "grad_norm": 6.868495464324951, + "learning_rate": 7.649719062628479e-06, + "loss": 0.3521, + "step": 17901 + }, + { + "epoch": 1.042810634834509, + "grad_norm": 6.0976762771606445, + "learning_rate": 7.649582020008222e-06, + "loss": 0.3143, + "step": 17902 + }, + { + "epoch": 1.0428241996744438, + "grad_norm": 5.076070308685303, + "learning_rate": 7.649444977387969e-06, + "loss": 0.2872, + "step": 17903 + }, + { + "epoch": 1.0428377645143787, + "grad_norm": 3.8566250801086426, + "learning_rate": 7.649307934767714e-06, + "loss": 0.295, + "step": 17904 + }, + { + "epoch": 1.0428513293543136, + "grad_norm": 4.587963581085205, + "learning_rate": 7.649170892147458e-06, + "loss": 0.2749, + "step": 17905 + }, + { + "epoch": 1.0428648941942484, + "grad_norm": 4.5305376052856445, + "learning_rate": 7.649033849527203e-06, + "loss": 0.2845, + "step": 17906 + }, + { + "epoch": 1.0428784590341833, + "grad_norm": 4.224540710449219, + "learning_rate": 7.64889680690695e-06, + "loss": 0.2859, + "step": 17907 + }, + { + "epoch": 1.0428920238741184, + "grad_norm": 4.465244770050049, + "learning_rate": 7.648759764286693e-06, + "loss": 0.2544, + "step": 17908 + }, + { + "epoch": 1.0429055887140533, + "grad_norm": 5.87827730178833, + "learning_rate": 7.648622721666438e-06, + "loss": 0.3587, + "step": 17909 + }, + { + "epoch": 1.0429191535539881, + "grad_norm": 4.411681175231934, + "learning_rate": 7.648485679046184e-06, + "loss": 0.2848, + "step": 17910 + }, + { + "epoch": 1.042932718393923, + "grad_norm": 5.140120506286621, + "learning_rate": 7.64834863642593e-06, + "loss": 0.3125, + "step": 17911 + }, + { + "epoch": 1.0429462832338579, + "grad_norm": 5.270447731018066, + "learning_rate": 7.648211593805674e-06, + "loss": 0.3727, + "step": 17912 + }, + { + "epoch": 1.0429598480737927, + "grad_norm": 4.350338459014893, + "learning_rate": 7.648074551185419e-06, + "loss": 0.2724, + "step": 17913 + }, + { + "epoch": 1.0429734129137276, + "grad_norm": 4.406126499176025, + "learning_rate": 7.647937508565164e-06, + "loss": 0.2359, + "step": 17914 + }, + { + "epoch": 1.0429869777536624, + "grad_norm": 4.467129230499268, + "learning_rate": 7.64780046594491e-06, + "loss": 0.2645, + "step": 17915 + }, + { + "epoch": 1.0430005425935973, + "grad_norm": 4.707637310028076, + "learning_rate": 7.647663423324655e-06, + "loss": 0.3193, + "step": 17916 + }, + { + "epoch": 1.0430141074335322, + "grad_norm": 4.9112043380737305, + "learning_rate": 7.6475263807044e-06, + "loss": 0.218, + "step": 17917 + }, + { + "epoch": 1.0430276722734673, + "grad_norm": 4.616091251373291, + "learning_rate": 7.647389338084145e-06, + "loss": 0.2968, + "step": 17918 + }, + { + "epoch": 1.0430412371134021, + "grad_norm": 5.046925067901611, + "learning_rate": 7.64725229546389e-06, + "loss": 0.2687, + "step": 17919 + }, + { + "epoch": 1.043054801953337, + "grad_norm": 7.3688063621521, + "learning_rate": 7.647115252843635e-06, + "loss": 0.608, + "step": 17920 + }, + { + "epoch": 1.0430683667932719, + "grad_norm": 3.824664354324341, + "learning_rate": 7.64697821022338e-06, + "loss": 0.2752, + "step": 17921 + }, + { + "epoch": 1.0430819316332067, + "grad_norm": 5.652849197387695, + "learning_rate": 7.646841167603126e-06, + "loss": 0.2905, + "step": 17922 + }, + { + "epoch": 1.0430954964731416, + "grad_norm": 5.437123775482178, + "learning_rate": 7.64670412498287e-06, + "loss": 0.3513, + "step": 17923 + }, + { + "epoch": 1.0431090613130765, + "grad_norm": 4.343317031860352, + "learning_rate": 7.646567082362616e-06, + "loss": 0.3189, + "step": 17924 + }, + { + "epoch": 1.0431226261530113, + "grad_norm": 5.049375534057617, + "learning_rate": 7.646430039742361e-06, + "loss": 0.3199, + "step": 17925 + }, + { + "epoch": 1.0431361909929462, + "grad_norm": 4.65175724029541, + "learning_rate": 7.646292997122106e-06, + "loss": 0.2102, + "step": 17926 + }, + { + "epoch": 1.0431497558328813, + "grad_norm": 4.13408088684082, + "learning_rate": 7.64615595450185e-06, + "loss": 0.2598, + "step": 17927 + }, + { + "epoch": 1.0431633206728161, + "grad_norm": 5.158968925476074, + "learning_rate": 7.646018911881595e-06, + "loss": 0.3598, + "step": 17928 + }, + { + "epoch": 1.043176885512751, + "grad_norm": 6.381173610687256, + "learning_rate": 7.645881869261342e-06, + "loss": 0.3662, + "step": 17929 + }, + { + "epoch": 1.0431904503526859, + "grad_norm": 4.912486553192139, + "learning_rate": 7.645744826641085e-06, + "loss": 0.3168, + "step": 17930 + }, + { + "epoch": 1.0432040151926207, + "grad_norm": 5.940362453460693, + "learning_rate": 7.64560778402083e-06, + "loss": 0.3746, + "step": 17931 + }, + { + "epoch": 1.0432175800325556, + "grad_norm": 4.391338348388672, + "learning_rate": 7.645470741400576e-06, + "loss": 0.2413, + "step": 17932 + }, + { + "epoch": 1.0432311448724905, + "grad_norm": 5.217895030975342, + "learning_rate": 7.645333698780321e-06, + "loss": 0.2851, + "step": 17933 + }, + { + "epoch": 1.0432447097124253, + "grad_norm": 7.471206188201904, + "learning_rate": 7.645196656160066e-06, + "loss": 0.4099, + "step": 17934 + }, + { + "epoch": 1.0432582745523602, + "grad_norm": 4.117136478424072, + "learning_rate": 7.645059613539811e-06, + "loss": 0.2308, + "step": 17935 + }, + { + "epoch": 1.043271839392295, + "grad_norm": 4.869892597198486, + "learning_rate": 7.644922570919557e-06, + "loss": 0.2684, + "step": 17936 + }, + { + "epoch": 1.0432854042322302, + "grad_norm": 4.367203235626221, + "learning_rate": 7.644785528299302e-06, + "loss": 0.2018, + "step": 17937 + }, + { + "epoch": 1.043298969072165, + "grad_norm": 4.884138107299805, + "learning_rate": 7.644648485679047e-06, + "loss": 0.3258, + "step": 17938 + }, + { + "epoch": 1.0433125339120999, + "grad_norm": 5.127203941345215, + "learning_rate": 7.644511443058792e-06, + "loss": 0.3405, + "step": 17939 + }, + { + "epoch": 1.0433260987520347, + "grad_norm": 8.17403507232666, + "learning_rate": 7.644374400438537e-06, + "loss": 0.5371, + "step": 17940 + }, + { + "epoch": 1.0433396635919696, + "grad_norm": 4.300686836242676, + "learning_rate": 7.644237357818282e-06, + "loss": 0.3317, + "step": 17941 + }, + { + "epoch": 1.0433532284319045, + "grad_norm": 4.926233768463135, + "learning_rate": 7.644100315198028e-06, + "loss": 0.3025, + "step": 17942 + }, + { + "epoch": 1.0433667932718393, + "grad_norm": 5.108022689819336, + "learning_rate": 7.643963272577773e-06, + "loss": 0.3006, + "step": 17943 + }, + { + "epoch": 1.0433803581117742, + "grad_norm": 6.085840702056885, + "learning_rate": 7.643826229957518e-06, + "loss": 0.4103, + "step": 17944 + }, + { + "epoch": 1.043393922951709, + "grad_norm": 5.801023960113525, + "learning_rate": 7.643689187337261e-06, + "loss": 0.3326, + "step": 17945 + }, + { + "epoch": 1.0434074877916442, + "grad_norm": 5.106812000274658, + "learning_rate": 7.643552144717008e-06, + "loss": 0.3406, + "step": 17946 + }, + { + "epoch": 1.043421052631579, + "grad_norm": 4.625402450561523, + "learning_rate": 7.643415102096753e-06, + "loss": 0.2721, + "step": 17947 + }, + { + "epoch": 1.043434617471514, + "grad_norm": 4.988773345947266, + "learning_rate": 7.643278059476497e-06, + "loss": 0.4148, + "step": 17948 + }, + { + "epoch": 1.0434481823114488, + "grad_norm": 5.830373287200928, + "learning_rate": 7.643141016856242e-06, + "loss": 0.2835, + "step": 17949 + }, + { + "epoch": 1.0434617471513836, + "grad_norm": 5.488017559051514, + "learning_rate": 7.643003974235989e-06, + "loss": 0.2928, + "step": 17950 + }, + { + "epoch": 1.0434753119913185, + "grad_norm": 5.053210735321045, + "learning_rate": 7.642866931615733e-06, + "loss": 0.3741, + "step": 17951 + }, + { + "epoch": 1.0434888768312534, + "grad_norm": 4.8340935707092285, + "learning_rate": 7.642729888995478e-06, + "loss": 0.3898, + "step": 17952 + }, + { + "epoch": 1.0435024416711882, + "grad_norm": 6.098124027252197, + "learning_rate": 7.642592846375223e-06, + "loss": 0.3224, + "step": 17953 + }, + { + "epoch": 1.043516006511123, + "grad_norm": 6.043883800506592, + "learning_rate": 7.642455803754968e-06, + "loss": 0.3152, + "step": 17954 + }, + { + "epoch": 1.043529571351058, + "grad_norm": 5.489161491394043, + "learning_rate": 7.642318761134713e-06, + "loss": 0.3972, + "step": 17955 + }, + { + "epoch": 1.043543136190993, + "grad_norm": 5.933165550231934, + "learning_rate": 7.642181718514458e-06, + "loss": 0.3527, + "step": 17956 + }, + { + "epoch": 1.043556701030928, + "grad_norm": 6.069260120391846, + "learning_rate": 7.642044675894204e-06, + "loss": 0.2993, + "step": 17957 + }, + { + "epoch": 1.0435702658708628, + "grad_norm": 4.003778457641602, + "learning_rate": 7.641907633273949e-06, + "loss": 0.2193, + "step": 17958 + }, + { + "epoch": 1.0435838307107976, + "grad_norm": 5.319691181182861, + "learning_rate": 7.641770590653694e-06, + "loss": 0.29, + "step": 17959 + }, + { + "epoch": 1.0435973955507325, + "grad_norm": 7.798238754272461, + "learning_rate": 7.641633548033439e-06, + "loss": 0.3951, + "step": 17960 + }, + { + "epoch": 1.0436109603906674, + "grad_norm": 5.080428600311279, + "learning_rate": 7.641496505413184e-06, + "loss": 0.2733, + "step": 17961 + }, + { + "epoch": 1.0436245252306022, + "grad_norm": 5.686291694641113, + "learning_rate": 7.64135946279293e-06, + "loss": 0.279, + "step": 17962 + }, + { + "epoch": 1.043638090070537, + "grad_norm": 5.469648838043213, + "learning_rate": 7.641222420172675e-06, + "loss": 0.2949, + "step": 17963 + }, + { + "epoch": 1.043651654910472, + "grad_norm": 4.845553874969482, + "learning_rate": 7.64108537755242e-06, + "loss": 0.2731, + "step": 17964 + }, + { + "epoch": 1.043665219750407, + "grad_norm": 3.806523561477661, + "learning_rate": 7.640948334932165e-06, + "loss": 0.2971, + "step": 17965 + }, + { + "epoch": 1.043678784590342, + "grad_norm": 5.504894733428955, + "learning_rate": 7.640811292311909e-06, + "loss": 0.3187, + "step": 17966 + }, + { + "epoch": 1.0436923494302768, + "grad_norm": 5.7375168800354, + "learning_rate": 7.640674249691655e-06, + "loss": 0.3264, + "step": 17967 + }, + { + "epoch": 1.0437059142702116, + "grad_norm": 6.9380035400390625, + "learning_rate": 7.6405372070714e-06, + "loss": 0.3357, + "step": 17968 + }, + { + "epoch": 1.0437194791101465, + "grad_norm": 7.191023349761963, + "learning_rate": 7.640400164451146e-06, + "loss": 0.4286, + "step": 17969 + }, + { + "epoch": 1.0437330439500814, + "grad_norm": 5.769354820251465, + "learning_rate": 7.64026312183089e-06, + "loss": 0.3052, + "step": 17970 + }, + { + "epoch": 1.0437466087900162, + "grad_norm": 6.748406887054443, + "learning_rate": 7.640126079210634e-06, + "loss": 0.3843, + "step": 17971 + }, + { + "epoch": 1.043760173629951, + "grad_norm": 7.420024871826172, + "learning_rate": 7.639989036590381e-06, + "loss": 0.2435, + "step": 17972 + }, + { + "epoch": 1.043773738469886, + "grad_norm": 6.796247959136963, + "learning_rate": 7.639851993970125e-06, + "loss": 0.41, + "step": 17973 + }, + { + "epoch": 1.0437873033098208, + "grad_norm": 5.2350077629089355, + "learning_rate": 7.63971495134987e-06, + "loss": 0.321, + "step": 17974 + }, + { + "epoch": 1.043800868149756, + "grad_norm": 6.358583927154541, + "learning_rate": 7.639577908729615e-06, + "loss": 0.3532, + "step": 17975 + }, + { + "epoch": 1.0438144329896908, + "grad_norm": 5.92753791809082, + "learning_rate": 7.63944086610936e-06, + "loss": 0.3039, + "step": 17976 + }, + { + "epoch": 1.0438279978296257, + "grad_norm": 6.138668060302734, + "learning_rate": 7.639303823489105e-06, + "loss": 0.2926, + "step": 17977 + }, + { + "epoch": 1.0438415626695605, + "grad_norm": 5.056540489196777, + "learning_rate": 7.63916678086885e-06, + "loss": 0.2858, + "step": 17978 + }, + { + "epoch": 1.0438551275094954, + "grad_norm": 5.461044788360596, + "learning_rate": 7.639029738248596e-06, + "loss": 0.2663, + "step": 17979 + }, + { + "epoch": 1.0438686923494302, + "grad_norm": 5.500417709350586, + "learning_rate": 7.638892695628341e-06, + "loss": 0.3098, + "step": 17980 + }, + { + "epoch": 1.0438822571893651, + "grad_norm": 4.552534580230713, + "learning_rate": 7.638755653008086e-06, + "loss": 0.2358, + "step": 17981 + }, + { + "epoch": 1.0438958220293, + "grad_norm": 8.729493141174316, + "learning_rate": 7.638618610387831e-06, + "loss": 0.4489, + "step": 17982 + }, + { + "epoch": 1.0439093868692348, + "grad_norm": 4.2435784339904785, + "learning_rate": 7.638481567767577e-06, + "loss": 0.223, + "step": 17983 + }, + { + "epoch": 1.04392295170917, + "grad_norm": 4.29571533203125, + "learning_rate": 7.638344525147322e-06, + "loss": 0.3078, + "step": 17984 + }, + { + "epoch": 1.0439365165491048, + "grad_norm": 4.522462368011475, + "learning_rate": 7.638207482527067e-06, + "loss": 0.2865, + "step": 17985 + }, + { + "epoch": 1.0439500813890397, + "grad_norm": 5.040832996368408, + "learning_rate": 7.638070439906812e-06, + "loss": 0.2968, + "step": 17986 + }, + { + "epoch": 1.0439636462289745, + "grad_norm": 4.635866165161133, + "learning_rate": 7.637933397286557e-06, + "loss": 0.2792, + "step": 17987 + }, + { + "epoch": 1.0439772110689094, + "grad_norm": 4.763883590698242, + "learning_rate": 7.6377963546663e-06, + "loss": 0.2676, + "step": 17988 + }, + { + "epoch": 1.0439907759088443, + "grad_norm": 6.435065269470215, + "learning_rate": 7.637659312046048e-06, + "loss": 0.4386, + "step": 17989 + }, + { + "epoch": 1.0440043407487791, + "grad_norm": 5.276548385620117, + "learning_rate": 7.637522269425793e-06, + "loss": 0.3374, + "step": 17990 + }, + { + "epoch": 1.044017905588714, + "grad_norm": 6.273602485656738, + "learning_rate": 7.637385226805536e-06, + "loss": 0.3029, + "step": 17991 + }, + { + "epoch": 1.0440314704286489, + "grad_norm": 5.903364181518555, + "learning_rate": 7.637248184185281e-06, + "loss": 0.3348, + "step": 17992 + }, + { + "epoch": 1.0440450352685837, + "grad_norm": 4.9772233963012695, + "learning_rate": 7.637111141565028e-06, + "loss": 0.2565, + "step": 17993 + }, + { + "epoch": 1.0440586001085188, + "grad_norm": 6.087172508239746, + "learning_rate": 7.636974098944774e-06, + "loss": 0.356, + "step": 17994 + }, + { + "epoch": 1.0440721649484537, + "grad_norm": 5.730906963348389, + "learning_rate": 7.636837056324517e-06, + "loss": 0.2832, + "step": 17995 + }, + { + "epoch": 1.0440857297883885, + "grad_norm": 5.349349021911621, + "learning_rate": 7.636700013704262e-06, + "loss": 0.2148, + "step": 17996 + }, + { + "epoch": 1.0440992946283234, + "grad_norm": 5.442267894744873, + "learning_rate": 7.636562971084007e-06, + "loss": 0.312, + "step": 17997 + }, + { + "epoch": 1.0441128594682583, + "grad_norm": 5.3135175704956055, + "learning_rate": 7.636425928463753e-06, + "loss": 0.3302, + "step": 17998 + }, + { + "epoch": 1.0441264243081931, + "grad_norm": 5.853421688079834, + "learning_rate": 7.636288885843498e-06, + "loss": 0.3041, + "step": 17999 + }, + { + "epoch": 1.044139989148128, + "grad_norm": 5.8371148109436035, + "learning_rate": 7.636151843223243e-06, + "loss": 0.338, + "step": 18000 + }, + { + "epoch": 1.0441535539880629, + "grad_norm": 4.48391580581665, + "learning_rate": 7.636014800602988e-06, + "loss": 0.2744, + "step": 18001 + }, + { + "epoch": 1.0441671188279977, + "grad_norm": 4.458827972412109, + "learning_rate": 7.635877757982733e-06, + "loss": 0.2791, + "step": 18002 + }, + { + "epoch": 1.0441806836679328, + "grad_norm": 6.98690938949585, + "learning_rate": 7.635740715362478e-06, + "loss": 0.3916, + "step": 18003 + }, + { + "epoch": 1.0441942485078677, + "grad_norm": 5.813494682312012, + "learning_rate": 7.635603672742224e-06, + "loss": 0.3115, + "step": 18004 + }, + { + "epoch": 1.0442078133478025, + "grad_norm": 7.873904228210449, + "learning_rate": 7.635466630121969e-06, + "loss": 0.4273, + "step": 18005 + }, + { + "epoch": 1.0442213781877374, + "grad_norm": 6.917181968688965, + "learning_rate": 7.635329587501714e-06, + "loss": 0.4338, + "step": 18006 + }, + { + "epoch": 1.0442349430276723, + "grad_norm": 4.255307674407959, + "learning_rate": 7.635192544881459e-06, + "loss": 0.2544, + "step": 18007 + }, + { + "epoch": 1.0442485078676071, + "grad_norm": 5.333606243133545, + "learning_rate": 7.635055502261204e-06, + "loss": 0.2784, + "step": 18008 + }, + { + "epoch": 1.044262072707542, + "grad_norm": 4.749430179595947, + "learning_rate": 7.63491845964095e-06, + "loss": 0.2376, + "step": 18009 + }, + { + "epoch": 1.0442756375474769, + "grad_norm": 4.841279983520508, + "learning_rate": 7.634781417020693e-06, + "loss": 0.256, + "step": 18010 + }, + { + "epoch": 1.0442892023874117, + "grad_norm": 4.183144569396973, + "learning_rate": 7.63464437440044e-06, + "loss": 0.2718, + "step": 18011 + }, + { + "epoch": 1.0443027672273466, + "grad_norm": 5.769683361053467, + "learning_rate": 7.634507331780185e-06, + "loss": 0.3327, + "step": 18012 + }, + { + "epoch": 1.0443163320672817, + "grad_norm": 4.870796203613281, + "learning_rate": 7.634370289159929e-06, + "loss": 0.235, + "step": 18013 + }, + { + "epoch": 1.0443298969072166, + "grad_norm": 4.861220836639404, + "learning_rate": 7.634233246539674e-06, + "loss": 0.204, + "step": 18014 + }, + { + "epoch": 1.0443434617471514, + "grad_norm": 3.7758147716522217, + "learning_rate": 7.63409620391942e-06, + "loss": 0.2441, + "step": 18015 + }, + { + "epoch": 1.0443570265870863, + "grad_norm": 4.63469934463501, + "learning_rate": 7.633959161299164e-06, + "loss": 0.2071, + "step": 18016 + }, + { + "epoch": 1.0443705914270212, + "grad_norm": 5.268373012542725, + "learning_rate": 7.63382211867891e-06, + "loss": 0.3489, + "step": 18017 + }, + { + "epoch": 1.044384156266956, + "grad_norm": 4.460427761077881, + "learning_rate": 7.633685076058654e-06, + "loss": 0.2972, + "step": 18018 + }, + { + "epoch": 1.0443977211068909, + "grad_norm": 5.249978065490723, + "learning_rate": 7.633548033438401e-06, + "loss": 0.2729, + "step": 18019 + }, + { + "epoch": 1.0444112859468258, + "grad_norm": 6.009199142456055, + "learning_rate": 7.633410990818145e-06, + "loss": 0.4109, + "step": 18020 + }, + { + "epoch": 1.0444248507867606, + "grad_norm": 5.634664535522461, + "learning_rate": 7.63327394819789e-06, + "loss": 0.3045, + "step": 18021 + }, + { + "epoch": 1.0444384156266957, + "grad_norm": 4.746618747711182, + "learning_rate": 7.633136905577635e-06, + "loss": 0.2285, + "step": 18022 + }, + { + "epoch": 1.0444519804666306, + "grad_norm": 3.9213247299194336, + "learning_rate": 7.63299986295738e-06, + "loss": 0.2168, + "step": 18023 + }, + { + "epoch": 1.0444655453065654, + "grad_norm": 6.027839660644531, + "learning_rate": 7.632862820337125e-06, + "loss": 0.3365, + "step": 18024 + }, + { + "epoch": 1.0444791101465003, + "grad_norm": 4.308848857879639, + "learning_rate": 7.63272577771687e-06, + "loss": 0.2855, + "step": 18025 + }, + { + "epoch": 1.0444926749864352, + "grad_norm": 6.931826114654541, + "learning_rate": 7.632588735096616e-06, + "loss": 0.3245, + "step": 18026 + }, + { + "epoch": 1.04450623982637, + "grad_norm": 4.757723331451416, + "learning_rate": 7.632451692476361e-06, + "loss": 0.2698, + "step": 18027 + }, + { + "epoch": 1.044519804666305, + "grad_norm": 3.8764660358428955, + "learning_rate": 7.632314649856106e-06, + "loss": 0.2528, + "step": 18028 + }, + { + "epoch": 1.0445333695062398, + "grad_norm": 4.299468994140625, + "learning_rate": 7.632177607235851e-06, + "loss": 0.2115, + "step": 18029 + }, + { + "epoch": 1.0445469343461746, + "grad_norm": 3.8161561489105225, + "learning_rate": 7.632040564615597e-06, + "loss": 0.2232, + "step": 18030 + }, + { + "epoch": 1.0445604991861095, + "grad_norm": 4.817492961883545, + "learning_rate": 7.63190352199534e-06, + "loss": 0.2523, + "step": 18031 + }, + { + "epoch": 1.0445740640260446, + "grad_norm": 4.256165504455566, + "learning_rate": 7.631766479375087e-06, + "loss": 0.1554, + "step": 18032 + }, + { + "epoch": 1.0445876288659794, + "grad_norm": 4.3264384269714355, + "learning_rate": 7.631629436754832e-06, + "loss": 0.2875, + "step": 18033 + }, + { + "epoch": 1.0446011937059143, + "grad_norm": 4.206066608428955, + "learning_rate": 7.631492394134576e-06, + "loss": 0.2666, + "step": 18034 + }, + { + "epoch": 1.0446147585458492, + "grad_norm": 5.039616584777832, + "learning_rate": 7.63135535151432e-06, + "loss": 0.2788, + "step": 18035 + }, + { + "epoch": 1.044628323385784, + "grad_norm": 5.5838189125061035, + "learning_rate": 7.631218308894068e-06, + "loss": 0.3488, + "step": 18036 + }, + { + "epoch": 1.044641888225719, + "grad_norm": 3.6410787105560303, + "learning_rate": 7.631081266273813e-06, + "loss": 0.2079, + "step": 18037 + }, + { + "epoch": 1.0446554530656538, + "grad_norm": 4.0497002601623535, + "learning_rate": 7.630944223653556e-06, + "loss": 0.1818, + "step": 18038 + }, + { + "epoch": 1.0446690179055886, + "grad_norm": 3.2987060546875, + "learning_rate": 7.630807181033301e-06, + "loss": 0.1804, + "step": 18039 + }, + { + "epoch": 1.0446825827455235, + "grad_norm": 5.6327033042907715, + "learning_rate": 7.630670138413047e-06, + "loss": 0.2666, + "step": 18040 + }, + { + "epoch": 1.0446961475854586, + "grad_norm": 3.224544048309326, + "learning_rate": 7.630533095792792e-06, + "loss": 0.2105, + "step": 18041 + }, + { + "epoch": 1.0447097124253935, + "grad_norm": 3.28340220451355, + "learning_rate": 7.630396053172537e-06, + "loss": 0.1192, + "step": 18042 + }, + { + "epoch": 1.0447232772653283, + "grad_norm": 5.082887649536133, + "learning_rate": 7.630259010552282e-06, + "loss": 0.4294, + "step": 18043 + }, + { + "epoch": 1.0447368421052632, + "grad_norm": 4.986810684204102, + "learning_rate": 7.630121967932027e-06, + "loss": 0.2104, + "step": 18044 + }, + { + "epoch": 1.044750406945198, + "grad_norm": 4.9376912117004395, + "learning_rate": 7.629984925311773e-06, + "loss": 0.2516, + "step": 18045 + }, + { + "epoch": 1.044763971785133, + "grad_norm": 4.77465295791626, + "learning_rate": 7.629847882691518e-06, + "loss": 0.2267, + "step": 18046 + }, + { + "epoch": 1.0447775366250678, + "grad_norm": 4.846184730529785, + "learning_rate": 7.629710840071263e-06, + "loss": 0.2452, + "step": 18047 + }, + { + "epoch": 1.0447911014650026, + "grad_norm": 5.738011360168457, + "learning_rate": 7.629573797451008e-06, + "loss": 0.3266, + "step": 18048 + }, + { + "epoch": 1.0448046663049375, + "grad_norm": 4.45827579498291, + "learning_rate": 7.629436754830753e-06, + "loss": 0.3122, + "step": 18049 + }, + { + "epoch": 1.0448182311448724, + "grad_norm": 7.248628616333008, + "learning_rate": 7.6292997122104985e-06, + "loss": 0.324, + "step": 18050 + }, + { + "epoch": 1.0448317959848075, + "grad_norm": 7.396246910095215, + "learning_rate": 7.629162669590244e-06, + "loss": 0.3112, + "step": 18051 + }, + { + "epoch": 1.0448453608247423, + "grad_norm": 5.372079372406006, + "learning_rate": 7.629025626969988e-06, + "loss": 0.2529, + "step": 18052 + }, + { + "epoch": 1.0448589256646772, + "grad_norm": 4.908661842346191, + "learning_rate": 7.628888584349733e-06, + "loss": 0.1841, + "step": 18053 + }, + { + "epoch": 1.044872490504612, + "grad_norm": 7.062000751495361, + "learning_rate": 7.628751541729479e-06, + "loss": 0.3887, + "step": 18054 + }, + { + "epoch": 1.044886055344547, + "grad_norm": 6.359062194824219, + "learning_rate": 7.6286144991092235e-06, + "loss": 0.2327, + "step": 18055 + }, + { + "epoch": 1.0448996201844818, + "grad_norm": 4.54557466506958, + "learning_rate": 7.628477456488969e-06, + "loss": 0.2501, + "step": 18056 + }, + { + "epoch": 1.0449131850244167, + "grad_norm": 3.9538395404815674, + "learning_rate": 7.628340413868713e-06, + "loss": 0.1992, + "step": 18057 + }, + { + "epoch": 1.0449267498643515, + "grad_norm": 3.768237352371216, + "learning_rate": 7.628203371248459e-06, + "loss": 0.2086, + "step": 18058 + }, + { + "epoch": 1.0449403147042864, + "grad_norm": 5.862695217132568, + "learning_rate": 7.628066328628204e-06, + "loss": 0.2846, + "step": 18059 + }, + { + "epoch": 1.0449538795442215, + "grad_norm": 5.980954647064209, + "learning_rate": 7.627929286007949e-06, + "loss": 0.257, + "step": 18060 + }, + { + "epoch": 1.0449674443841563, + "grad_norm": 5.102502346038818, + "learning_rate": 7.627792243387694e-06, + "loss": 0.3309, + "step": 18061 + }, + { + "epoch": 1.0449810092240912, + "grad_norm": 5.913576126098633, + "learning_rate": 7.62765520076744e-06, + "loss": 0.3768, + "step": 18062 + }, + { + "epoch": 1.044994574064026, + "grad_norm": 5.760477542877197, + "learning_rate": 7.627518158147185e-06, + "loss": 0.3238, + "step": 18063 + }, + { + "epoch": 1.045008138903961, + "grad_norm": 4.886193752288818, + "learning_rate": 7.627381115526929e-06, + "loss": 0.2391, + "step": 18064 + }, + { + "epoch": 1.0450217037438958, + "grad_norm": 5.193778038024902, + "learning_rate": 7.6272440729066744e-06, + "loss": 0.2772, + "step": 18065 + }, + { + "epoch": 1.0450352685838307, + "grad_norm": 4.383155345916748, + "learning_rate": 7.627107030286419e-06, + "loss": 0.2357, + "step": 18066 + }, + { + "epoch": 1.0450488334237655, + "grad_norm": 5.801131248474121, + "learning_rate": 7.626969987666165e-06, + "loss": 0.3583, + "step": 18067 + }, + { + "epoch": 1.0450623982637004, + "grad_norm": 5.758592128753662, + "learning_rate": 7.62683294504591e-06, + "loss": 0.2936, + "step": 18068 + }, + { + "epoch": 1.0450759631036353, + "grad_norm": 5.885883331298828, + "learning_rate": 7.626695902425655e-06, + "loss": 0.3689, + "step": 18069 + }, + { + "epoch": 1.0450895279435704, + "grad_norm": 6.2722649574279785, + "learning_rate": 7.6265588598053995e-06, + "loss": 0.3122, + "step": 18070 + }, + { + "epoch": 1.0451030927835052, + "grad_norm": 6.720329284667969, + "learning_rate": 7.6264218171851455e-06, + "loss": 0.3568, + "step": 18071 + }, + { + "epoch": 1.04511665762344, + "grad_norm": 4.479892253875732, + "learning_rate": 7.626284774564891e-06, + "loss": 0.2325, + "step": 18072 + }, + { + "epoch": 1.045130222463375, + "grad_norm": 3.661410093307495, + "learning_rate": 7.626147731944635e-06, + "loss": 0.2027, + "step": 18073 + }, + { + "epoch": 1.0451437873033098, + "grad_norm": 6.217332363128662, + "learning_rate": 7.62601068932438e-06, + "loss": 0.2916, + "step": 18074 + }, + { + "epoch": 1.0451573521432447, + "grad_norm": 4.741457939147949, + "learning_rate": 7.625873646704126e-06, + "loss": 0.2829, + "step": 18075 + }, + { + "epoch": 1.0451709169831795, + "grad_norm": 5.670846462249756, + "learning_rate": 7.6257366040838706e-06, + "loss": 0.253, + "step": 18076 + }, + { + "epoch": 1.0451844818231144, + "grad_norm": 6.10175895690918, + "learning_rate": 7.625599561463616e-06, + "loss": 0.28, + "step": 18077 + }, + { + "epoch": 1.0451980466630493, + "grad_norm": 6.382174968719482, + "learning_rate": 7.625462518843361e-06, + "loss": 0.2358, + "step": 18078 + }, + { + "epoch": 1.0452116115029844, + "grad_norm": 5.093975067138672, + "learning_rate": 7.625325476223105e-06, + "loss": 0.3461, + "step": 18079 + }, + { + "epoch": 1.0452251763429192, + "grad_norm": 6.672612190246582, + "learning_rate": 7.625188433602851e-06, + "loss": 0.3493, + "step": 18080 + }, + { + "epoch": 1.045238741182854, + "grad_norm": 4.67360258102417, + "learning_rate": 7.6250513909825965e-06, + "loss": 0.2781, + "step": 18081 + }, + { + "epoch": 1.045252306022789, + "grad_norm": 6.586376190185547, + "learning_rate": 7.624914348362341e-06, + "loss": 0.3478, + "step": 18082 + }, + { + "epoch": 1.0452658708627238, + "grad_norm": 6.354097843170166, + "learning_rate": 7.624777305742086e-06, + "loss": 0.3592, + "step": 18083 + }, + { + "epoch": 1.0452794357026587, + "grad_norm": 4.348651885986328, + "learning_rate": 7.624640263121832e-06, + "loss": 0.1818, + "step": 18084 + }, + { + "epoch": 1.0452930005425936, + "grad_norm": 6.571967601776123, + "learning_rate": 7.624503220501577e-06, + "loss": 0.4287, + "step": 18085 + }, + { + "epoch": 1.0453065653825284, + "grad_norm": 3.829996347427368, + "learning_rate": 7.6243661778813215e-06, + "loss": 0.1676, + "step": 18086 + }, + { + "epoch": 1.0453201302224633, + "grad_norm": 8.391252517700195, + "learning_rate": 7.624229135261067e-06, + "loss": 0.419, + "step": 18087 + }, + { + "epoch": 1.0453336950623981, + "grad_norm": 4.652556896209717, + "learning_rate": 7.624092092640813e-06, + "loss": 0.2473, + "step": 18088 + }, + { + "epoch": 1.0453472599023332, + "grad_norm": 6.266218662261963, + "learning_rate": 7.623955050020557e-06, + "loss": 0.3874, + "step": 18089 + }, + { + "epoch": 1.045360824742268, + "grad_norm": 4.925468444824219, + "learning_rate": 7.623818007400302e-06, + "loss": 0.2269, + "step": 18090 + }, + { + "epoch": 1.045374389582203, + "grad_norm": 7.5246901512146, + "learning_rate": 7.6236809647800466e-06, + "loss": 0.2301, + "step": 18091 + }, + { + "epoch": 1.0453879544221378, + "grad_norm": 4.366831302642822, + "learning_rate": 7.623543922159792e-06, + "loss": 0.233, + "step": 18092 + }, + { + "epoch": 1.0454015192620727, + "grad_norm": 5.454896450042725, + "learning_rate": 7.623406879539538e-06, + "loss": 0.178, + "step": 18093 + }, + { + "epoch": 1.0454150841020076, + "grad_norm": 7.047799587249756, + "learning_rate": 7.623269836919283e-06, + "loss": 0.3679, + "step": 18094 + }, + { + "epoch": 1.0454286489419424, + "grad_norm": 5.48197603225708, + "learning_rate": 7.623132794299027e-06, + "loss": 0.2458, + "step": 18095 + }, + { + "epoch": 1.0454422137818773, + "grad_norm": 5.4594316482543945, + "learning_rate": 7.6229957516787724e-06, + "loss": 0.2503, + "step": 18096 + }, + { + "epoch": 1.0454557786218122, + "grad_norm": 5.764177322387695, + "learning_rate": 7.6228587090585185e-06, + "loss": 0.2601, + "step": 18097 + }, + { + "epoch": 1.0454693434617472, + "grad_norm": 7.036454200744629, + "learning_rate": 7.622721666438263e-06, + "loss": 0.3375, + "step": 18098 + }, + { + "epoch": 1.0454829083016821, + "grad_norm": 4.355310440063477, + "learning_rate": 7.622584623818008e-06, + "loss": 0.1541, + "step": 18099 + }, + { + "epoch": 1.045496473141617, + "grad_norm": 5.831972122192383, + "learning_rate": 7.622447581197753e-06, + "loss": 0.2226, + "step": 18100 + }, + { + "epoch": 1.0455100379815518, + "grad_norm": 5.83275842666626, + "learning_rate": 7.622310538577498e-06, + "loss": 0.2815, + "step": 18101 + }, + { + "epoch": 1.0455236028214867, + "grad_norm": 5.366308689117432, + "learning_rate": 7.6221734959572435e-06, + "loss": 0.2162, + "step": 18102 + }, + { + "epoch": 1.0455371676614216, + "grad_norm": 4.920313358306885, + "learning_rate": 7.622036453336989e-06, + "loss": 0.2186, + "step": 18103 + }, + { + "epoch": 1.0455507325013564, + "grad_norm": 4.977092742919922, + "learning_rate": 7.621899410716733e-06, + "loss": 0.2571, + "step": 18104 + }, + { + "epoch": 1.0455642973412913, + "grad_norm": 5.206348419189453, + "learning_rate": 7.621762368096479e-06, + "loss": 0.2669, + "step": 18105 + }, + { + "epoch": 1.0455778621812262, + "grad_norm": 6.728978633880615, + "learning_rate": 7.621625325476224e-06, + "loss": 0.2656, + "step": 18106 + }, + { + "epoch": 1.0455914270211613, + "grad_norm": 5.742144584655762, + "learning_rate": 7.6214882828559686e-06, + "loss": 0.1969, + "step": 18107 + }, + { + "epoch": 1.0456049918610961, + "grad_norm": 4.770848751068115, + "learning_rate": 7.621351240235714e-06, + "loss": 0.1801, + "step": 18108 + }, + { + "epoch": 1.045618556701031, + "grad_norm": 6.126428127288818, + "learning_rate": 7.621214197615459e-06, + "loss": 0.2321, + "step": 18109 + }, + { + "epoch": 1.0456321215409659, + "grad_norm": 6.4039812088012695, + "learning_rate": 7.621077154995204e-06, + "loss": 0.2749, + "step": 18110 + }, + { + "epoch": 1.0456456863809007, + "grad_norm": 5.536315441131592, + "learning_rate": 7.620940112374949e-06, + "loss": 0.2483, + "step": 18111 + }, + { + "epoch": 1.0456592512208356, + "grad_norm": 4.787563800811768, + "learning_rate": 7.6208030697546945e-06, + "loss": 0.1824, + "step": 18112 + }, + { + "epoch": 1.0456728160607704, + "grad_norm": 5.007608890533447, + "learning_rate": 7.620666027134439e-06, + "loss": 0.1513, + "step": 18113 + }, + { + "epoch": 1.0456863809007053, + "grad_norm": 5.935624122619629, + "learning_rate": 7.620528984514185e-06, + "loss": 0.1796, + "step": 18114 + }, + { + "epoch": 1.0456999457406402, + "grad_norm": 4.759275436401367, + "learning_rate": 7.62039194189393e-06, + "loss": 0.1552, + "step": 18115 + }, + { + "epoch": 1.045713510580575, + "grad_norm": 4.802550792694092, + "learning_rate": 7.620254899273674e-06, + "loss": 0.2564, + "step": 18116 + }, + { + "epoch": 1.0457270754205101, + "grad_norm": 5.710957050323486, + "learning_rate": 7.6201178566534195e-06, + "loss": 0.2215, + "step": 18117 + }, + { + "epoch": 1.045740640260445, + "grad_norm": 6.31604528427124, + "learning_rate": 7.6199808140331655e-06, + "loss": 0.2982, + "step": 18118 + }, + { + "epoch": 1.0457542051003799, + "grad_norm": 5.1404008865356445, + "learning_rate": 7.619843771412911e-06, + "loss": 0.2772, + "step": 18119 + }, + { + "epoch": 1.0457677699403147, + "grad_norm": 6.266955375671387, + "learning_rate": 7.619706728792655e-06, + "loss": 0.3039, + "step": 18120 + }, + { + "epoch": 1.0457813347802496, + "grad_norm": 4.477024555206299, + "learning_rate": 7.6195696861724e-06, + "loss": 0.1519, + "step": 18121 + }, + { + "epoch": 1.0457948996201845, + "grad_norm": 5.434049129486084, + "learning_rate": 7.6194326435521446e-06, + "loss": 0.2551, + "step": 18122 + }, + { + "epoch": 1.0458084644601193, + "grad_norm": 4.1077375411987305, + "learning_rate": 7.619295600931891e-06, + "loss": 0.1699, + "step": 18123 + }, + { + "epoch": 1.0458220293000542, + "grad_norm": 5.098395347595215, + "learning_rate": 7.619158558311636e-06, + "loss": 0.1864, + "step": 18124 + }, + { + "epoch": 1.045835594139989, + "grad_norm": 5.963890552520752, + "learning_rate": 7.61902151569138e-06, + "loss": 0.2288, + "step": 18125 + }, + { + "epoch": 1.0458491589799241, + "grad_norm": 6.526739597320557, + "learning_rate": 7.618884473071125e-06, + "loss": 0.2459, + "step": 18126 + }, + { + "epoch": 1.045862723819859, + "grad_norm": 6.921295166015625, + "learning_rate": 7.618747430450871e-06, + "loss": 0.3783, + "step": 18127 + }, + { + "epoch": 1.0458762886597939, + "grad_norm": 5.643820762634277, + "learning_rate": 7.6186103878306165e-06, + "loss": 0.3989, + "step": 18128 + }, + { + "epoch": 1.0458898534997287, + "grad_norm": 6.674653053283691, + "learning_rate": 7.618473345210361e-06, + "loss": 0.4734, + "step": 18129 + }, + { + "epoch": 1.0459034183396636, + "grad_norm": 5.908012866973877, + "learning_rate": 7.618336302590106e-06, + "loss": 0.3448, + "step": 18130 + }, + { + "epoch": 1.0459169831795985, + "grad_norm": 9.710968971252441, + "learning_rate": 7.618199259969852e-06, + "loss": 0.5125, + "step": 18131 + }, + { + "epoch": 1.0459305480195333, + "grad_norm": 5.850576877593994, + "learning_rate": 7.618062217349596e-06, + "loss": 0.445, + "step": 18132 + }, + { + "epoch": 1.0459441128594682, + "grad_norm": 5.6990766525268555, + "learning_rate": 7.6179251747293415e-06, + "loss": 0.295, + "step": 18133 + }, + { + "epoch": 1.045957677699403, + "grad_norm": 7.929628849029541, + "learning_rate": 7.617788132109087e-06, + "loss": 0.3408, + "step": 18134 + }, + { + "epoch": 1.045971242539338, + "grad_norm": 6.484339237213135, + "learning_rate": 7.617651089488831e-06, + "loss": 0.3345, + "step": 18135 + }, + { + "epoch": 1.045984807379273, + "grad_norm": 5.360487937927246, + "learning_rate": 7.617514046868577e-06, + "loss": 0.2354, + "step": 18136 + }, + { + "epoch": 1.0459983722192079, + "grad_norm": 5.787736415863037, + "learning_rate": 7.617377004248322e-06, + "loss": 0.268, + "step": 18137 + }, + { + "epoch": 1.0460119370591427, + "grad_norm": 5.128540515899658, + "learning_rate": 7.6172399616280666e-06, + "loss": 0.2158, + "step": 18138 + }, + { + "epoch": 1.0460255018990776, + "grad_norm": 6.1075239181518555, + "learning_rate": 7.617102919007812e-06, + "loss": 0.2404, + "step": 18139 + }, + { + "epoch": 1.0460390667390125, + "grad_norm": 6.113154411315918, + "learning_rate": 7.616965876387558e-06, + "loss": 0.3307, + "step": 18140 + }, + { + "epoch": 1.0460526315789473, + "grad_norm": 5.746210098266602, + "learning_rate": 7.616828833767302e-06, + "loss": 0.2786, + "step": 18141 + }, + { + "epoch": 1.0460661964188822, + "grad_norm": 5.351003646850586, + "learning_rate": 7.616691791147047e-06, + "loss": 0.2991, + "step": 18142 + }, + { + "epoch": 1.046079761258817, + "grad_norm": 7.120243072509766, + "learning_rate": 7.6165547485267925e-06, + "loss": 0.4235, + "step": 18143 + }, + { + "epoch": 1.046093326098752, + "grad_norm": 5.073232173919678, + "learning_rate": 7.616417705906538e-06, + "loss": 0.2633, + "step": 18144 + }, + { + "epoch": 1.046106890938687, + "grad_norm": 6.015040874481201, + "learning_rate": 7.616280663286283e-06, + "loss": 0.2103, + "step": 18145 + }, + { + "epoch": 1.046120455778622, + "grad_norm": 7.679179668426514, + "learning_rate": 7.616143620666028e-06, + "loss": 0.4847, + "step": 18146 + }, + { + "epoch": 1.0461340206185568, + "grad_norm": 5.764746189117432, + "learning_rate": 7.616006578045772e-06, + "loss": 0.3559, + "step": 18147 + }, + { + "epoch": 1.0461475854584916, + "grad_norm": 6.526773452758789, + "learning_rate": 7.6158695354255175e-06, + "loss": 0.3407, + "step": 18148 + }, + { + "epoch": 1.0461611502984265, + "grad_norm": 4.346067905426025, + "learning_rate": 7.6157324928052635e-06, + "loss": 0.255, + "step": 18149 + }, + { + "epoch": 1.0461747151383614, + "grad_norm": 3.600058078765869, + "learning_rate": 7.615595450185008e-06, + "loss": 0.2002, + "step": 18150 + }, + { + "epoch": 1.0461882799782962, + "grad_norm": 3.859549045562744, + "learning_rate": 7.615458407564753e-06, + "loss": 0.2419, + "step": 18151 + }, + { + "epoch": 1.046201844818231, + "grad_norm": 4.081263542175293, + "learning_rate": 7.615321364944498e-06, + "loss": 0.2264, + "step": 18152 + }, + { + "epoch": 1.046215409658166, + "grad_norm": 4.674374103546143, + "learning_rate": 7.615184322324244e-06, + "loss": 0.2534, + "step": 18153 + }, + { + "epoch": 1.0462289744981008, + "grad_norm": 3.8979434967041016, + "learning_rate": 7.615047279703989e-06, + "loss": 0.223, + "step": 18154 + }, + { + "epoch": 1.046242539338036, + "grad_norm": 5.451224327087402, + "learning_rate": 7.614910237083734e-06, + "loss": 0.2576, + "step": 18155 + }, + { + "epoch": 1.0462561041779708, + "grad_norm": 4.3102827072143555, + "learning_rate": 7.614773194463478e-06, + "loss": 0.2036, + "step": 18156 + }, + { + "epoch": 1.0462696690179056, + "grad_norm": 3.7192413806915283, + "learning_rate": 7.614636151843224e-06, + "loss": 0.1884, + "step": 18157 + }, + { + "epoch": 1.0462832338578405, + "grad_norm": 3.191497564315796, + "learning_rate": 7.614499109222969e-06, + "loss": 0.1695, + "step": 18158 + }, + { + "epoch": 1.0462967986977754, + "grad_norm": 4.8541789054870605, + "learning_rate": 7.614362066602714e-06, + "loss": 0.3653, + "step": 18159 + }, + { + "epoch": 1.0463103635377102, + "grad_norm": 4.0384063720703125, + "learning_rate": 7.614225023982459e-06, + "loss": 0.1268, + "step": 18160 + }, + { + "epoch": 1.046323928377645, + "grad_norm": 7.392300605773926, + "learning_rate": 7.614087981362204e-06, + "loss": 0.3413, + "step": 18161 + }, + { + "epoch": 1.04633749321758, + "grad_norm": 4.799880504608154, + "learning_rate": 7.61395093874195e-06, + "loss": 0.2497, + "step": 18162 + }, + { + "epoch": 1.0463510580575148, + "grad_norm": 3.7666962146759033, + "learning_rate": 7.613813896121694e-06, + "loss": 0.174, + "step": 18163 + }, + { + "epoch": 1.04636462289745, + "grad_norm": 7.26543664932251, + "learning_rate": 7.6136768535014395e-06, + "loss": 0.4553, + "step": 18164 + }, + { + "epoch": 1.0463781877373848, + "grad_norm": 5.9583210945129395, + "learning_rate": 7.613539810881184e-06, + "loss": 0.3699, + "step": 18165 + }, + { + "epoch": 1.0463917525773196, + "grad_norm": 3.5847573280334473, + "learning_rate": 7.61340276826093e-06, + "loss": 0.179, + "step": 18166 + }, + { + "epoch": 1.0464053174172545, + "grad_norm": 3.7802846431732178, + "learning_rate": 7.613265725640675e-06, + "loss": 0.198, + "step": 18167 + }, + { + "epoch": 1.0464188822571894, + "grad_norm": 3.2966830730438232, + "learning_rate": 7.61312868302042e-06, + "loss": 0.1468, + "step": 18168 + }, + { + "epoch": 1.0464324470971242, + "grad_norm": 5.396663188934326, + "learning_rate": 7.6129916404001646e-06, + "loss": 0.3746, + "step": 18169 + }, + { + "epoch": 1.046446011937059, + "grad_norm": 4.561422824859619, + "learning_rate": 7.612854597779911e-06, + "loss": 0.169, + "step": 18170 + }, + { + "epoch": 1.046459576776994, + "grad_norm": 4.5842671394348145, + "learning_rate": 7.612717555159656e-06, + "loss": 0.3467, + "step": 18171 + }, + { + "epoch": 1.0464731416169288, + "grad_norm": 4.0923943519592285, + "learning_rate": 7.6125805125394e-06, + "loss": 0.2226, + "step": 18172 + }, + { + "epoch": 1.0464867064568637, + "grad_norm": 4.7792744636535645, + "learning_rate": 7.612443469919145e-06, + "loss": 0.2716, + "step": 18173 + }, + { + "epoch": 1.0465002712967988, + "grad_norm": 5.549373626708984, + "learning_rate": 7.61230642729889e-06, + "loss": 0.3608, + "step": 18174 + }, + { + "epoch": 1.0465138361367337, + "grad_norm": 5.628282070159912, + "learning_rate": 7.612169384678636e-06, + "loss": 0.2707, + "step": 18175 + }, + { + "epoch": 1.0465274009766685, + "grad_norm": 4.800775527954102, + "learning_rate": 7.612032342058381e-06, + "loss": 0.3066, + "step": 18176 + }, + { + "epoch": 1.0465409658166034, + "grad_norm": 6.5683441162109375, + "learning_rate": 7.611895299438126e-06, + "loss": 0.2631, + "step": 18177 + }, + { + "epoch": 1.0465545306565383, + "grad_norm": 6.9833598136901855, + "learning_rate": 7.61175825681787e-06, + "loss": 0.3285, + "step": 18178 + }, + { + "epoch": 1.0465680954964731, + "grad_norm": 3.6866297721862793, + "learning_rate": 7.611621214197616e-06, + "loss": 0.1895, + "step": 18179 + }, + { + "epoch": 1.046581660336408, + "grad_norm": 5.6092658042907715, + "learning_rate": 7.6114841715773615e-06, + "loss": 0.2937, + "step": 18180 + }, + { + "epoch": 1.0465952251763428, + "grad_norm": 5.423159122467041, + "learning_rate": 7.611347128957106e-06, + "loss": 0.3747, + "step": 18181 + }, + { + "epoch": 1.0466087900162777, + "grad_norm": 5.66457986831665, + "learning_rate": 7.611210086336851e-06, + "loss": 0.3926, + "step": 18182 + }, + { + "epoch": 1.0466223548562128, + "grad_norm": 6.49010705947876, + "learning_rate": 7.611073043716597e-06, + "loss": 0.3065, + "step": 18183 + }, + { + "epoch": 1.0466359196961477, + "grad_norm": 4.743784427642822, + "learning_rate": 7.610936001096341e-06, + "loss": 0.2643, + "step": 18184 + }, + { + "epoch": 1.0466494845360825, + "grad_norm": 5.912408828735352, + "learning_rate": 7.610798958476087e-06, + "loss": 0.2611, + "step": 18185 + }, + { + "epoch": 1.0466630493760174, + "grad_norm": 6.660097122192383, + "learning_rate": 7.610661915855832e-06, + "loss": 0.3622, + "step": 18186 + }, + { + "epoch": 1.0466766142159523, + "grad_norm": 5.589385986328125, + "learning_rate": 7.610524873235578e-06, + "loss": 0.3066, + "step": 18187 + }, + { + "epoch": 1.0466901790558871, + "grad_norm": 7.154914379119873, + "learning_rate": 7.610387830615322e-06, + "loss": 0.4302, + "step": 18188 + }, + { + "epoch": 1.046703743895822, + "grad_norm": 4.550364017486572, + "learning_rate": 7.610250787995067e-06, + "loss": 0.2905, + "step": 18189 + }, + { + "epoch": 1.0467173087357569, + "grad_norm": 5.550923824310303, + "learning_rate": 7.610113745374812e-06, + "loss": 0.3451, + "step": 18190 + }, + { + "epoch": 1.0467308735756917, + "grad_norm": 6.458390712738037, + "learning_rate": 7.609976702754557e-06, + "loss": 0.3544, + "step": 18191 + }, + { + "epoch": 1.0467444384156268, + "grad_norm": 5.407955646514893, + "learning_rate": 7.609839660134303e-06, + "loss": 0.2027, + "step": 18192 + }, + { + "epoch": 1.0467580032555617, + "grad_norm": 6.781627655029297, + "learning_rate": 7.609702617514047e-06, + "loss": 0.4316, + "step": 18193 + }, + { + "epoch": 1.0467715680954965, + "grad_norm": 6.3210368156433105, + "learning_rate": 7.609565574893792e-06, + "loss": 0.465, + "step": 18194 + }, + { + "epoch": 1.0467851329354314, + "grad_norm": 4.492199897766113, + "learning_rate": 7.6094285322735375e-06, + "loss": 0.2776, + "step": 18195 + }, + { + "epoch": 1.0467986977753663, + "grad_norm": 5.648542404174805, + "learning_rate": 7.6092914896532836e-06, + "loss": 0.3524, + "step": 18196 + }, + { + "epoch": 1.0468122626153011, + "grad_norm": 5.404273986816406, + "learning_rate": 7.609154447033028e-06, + "loss": 0.2824, + "step": 18197 + }, + { + "epoch": 1.046825827455236, + "grad_norm": 7.7862701416015625, + "learning_rate": 7.609017404412773e-06, + "loss": 0.5787, + "step": 18198 + }, + { + "epoch": 1.0468393922951709, + "grad_norm": 5.138071060180664, + "learning_rate": 7.608880361792517e-06, + "loss": 0.2073, + "step": 18199 + }, + { + "epoch": 1.0468529571351057, + "grad_norm": 6.165859222412109, + "learning_rate": 7.6087433191722634e-06, + "loss": 0.3314, + "step": 18200 + }, + { + "epoch": 1.0468665219750406, + "grad_norm": 6.269939422607422, + "learning_rate": 7.608606276552009e-06, + "loss": 0.3579, + "step": 18201 + }, + { + "epoch": 1.0468800868149757, + "grad_norm": 5.639683723449707, + "learning_rate": 7.608469233931754e-06, + "loss": 0.2429, + "step": 18202 + }, + { + "epoch": 1.0468936516549106, + "grad_norm": 5.372283935546875, + "learning_rate": 7.608332191311498e-06, + "loss": 0.2732, + "step": 18203 + }, + { + "epoch": 1.0469072164948454, + "grad_norm": 5.1437554359436035, + "learning_rate": 7.608195148691243e-06, + "loss": 0.4101, + "step": 18204 + }, + { + "epoch": 1.0469207813347803, + "grad_norm": 6.979921340942383, + "learning_rate": 7.608058106070989e-06, + "loss": 0.4022, + "step": 18205 + }, + { + "epoch": 1.0469343461747151, + "grad_norm": 5.381912708282471, + "learning_rate": 7.607921063450734e-06, + "loss": 0.3606, + "step": 18206 + }, + { + "epoch": 1.04694791101465, + "grad_norm": 7.6378397941589355, + "learning_rate": 7.607784020830479e-06, + "loss": 0.3026, + "step": 18207 + }, + { + "epoch": 1.0469614758545849, + "grad_norm": 7.569997787475586, + "learning_rate": 7.607646978210223e-06, + "loss": 0.393, + "step": 18208 + }, + { + "epoch": 1.0469750406945197, + "grad_norm": 5.1323957443237305, + "learning_rate": 7.607509935589969e-06, + "loss": 0.307, + "step": 18209 + }, + { + "epoch": 1.0469886055344546, + "grad_norm": 5.840086460113525, + "learning_rate": 7.607372892969714e-06, + "loss": 0.3272, + "step": 18210 + }, + { + "epoch": 1.0470021703743897, + "grad_norm": 4.439738750457764, + "learning_rate": 7.6072358503494595e-06, + "loss": 0.2784, + "step": 18211 + }, + { + "epoch": 1.0470157352143246, + "grad_norm": 5.4276251792907715, + "learning_rate": 7.607098807729204e-06, + "loss": 0.3979, + "step": 18212 + }, + { + "epoch": 1.0470293000542594, + "grad_norm": 5.619185924530029, + "learning_rate": 7.60696176510895e-06, + "loss": 0.2367, + "step": 18213 + }, + { + "epoch": 1.0470428648941943, + "grad_norm": 4.563059329986572, + "learning_rate": 7.606824722488695e-06, + "loss": 0.2347, + "step": 18214 + }, + { + "epoch": 1.0470564297341292, + "grad_norm": 8.684343338012695, + "learning_rate": 7.606687679868439e-06, + "loss": 0.4025, + "step": 18215 + }, + { + "epoch": 1.047069994574064, + "grad_norm": 4.278284549713135, + "learning_rate": 7.606550637248185e-06, + "loss": 0.2048, + "step": 18216 + }, + { + "epoch": 1.0470835594139989, + "grad_norm": 3.7689735889434814, + "learning_rate": 7.60641359462793e-06, + "loss": 0.2693, + "step": 18217 + }, + { + "epoch": 1.0470971242539338, + "grad_norm": 5.180812358856201, + "learning_rate": 7.606276552007675e-06, + "loss": 0.2632, + "step": 18218 + }, + { + "epoch": 1.0471106890938686, + "grad_norm": 6.761843681335449, + "learning_rate": 7.60613950938742e-06, + "loss": 0.3113, + "step": 18219 + }, + { + "epoch": 1.0471242539338035, + "grad_norm": 5.412944793701172, + "learning_rate": 7.606002466767165e-06, + "loss": 0.2438, + "step": 18220 + }, + { + "epoch": 1.0471378187737386, + "grad_norm": 6.559565544128418, + "learning_rate": 7.60586542414691e-06, + "loss": 0.3982, + "step": 18221 + }, + { + "epoch": 1.0471513836136734, + "grad_norm": 4.628942012786865, + "learning_rate": 7.605728381526656e-06, + "loss": 0.2409, + "step": 18222 + }, + { + "epoch": 1.0471649484536083, + "grad_norm": 4.5625386238098145, + "learning_rate": 7.605591338906401e-06, + "loss": 0.2588, + "step": 18223 + }, + { + "epoch": 1.0471785132935432, + "grad_norm": 5.44878625869751, + "learning_rate": 7.605454296286145e-06, + "loss": 0.3294, + "step": 18224 + }, + { + "epoch": 1.047192078133478, + "grad_norm": 5.843454360961914, + "learning_rate": 7.60531725366589e-06, + "loss": 0.5332, + "step": 18225 + }, + { + "epoch": 1.047205642973413, + "grad_norm": 6.677030563354492, + "learning_rate": 7.605180211045636e-06, + "loss": 0.4219, + "step": 18226 + }, + { + "epoch": 1.0472192078133478, + "grad_norm": 4.097657203674316, + "learning_rate": 7.6050431684253816e-06, + "loss": 0.2511, + "step": 18227 + }, + { + "epoch": 1.0472327726532826, + "grad_norm": 5.36016321182251, + "learning_rate": 7.604906125805126e-06, + "loss": 0.2404, + "step": 18228 + }, + { + "epoch": 1.0472463374932175, + "grad_norm": 4.9246907234191895, + "learning_rate": 7.604769083184871e-06, + "loss": 0.2657, + "step": 18229 + }, + { + "epoch": 1.0472599023331526, + "grad_norm": 5.682837963104248, + "learning_rate": 7.604632040564615e-06, + "loss": 0.2809, + "step": 18230 + }, + { + "epoch": 1.0472734671730874, + "grad_norm": 4.995922088623047, + "learning_rate": 7.6044949979443614e-06, + "loss": 0.2847, + "step": 18231 + }, + { + "epoch": 1.0472870320130223, + "grad_norm": 5.432033538818359, + "learning_rate": 7.604357955324107e-06, + "loss": 0.2059, + "step": 18232 + }, + { + "epoch": 1.0473005968529572, + "grad_norm": 6.819770336151123, + "learning_rate": 7.604220912703851e-06, + "loss": 0.2917, + "step": 18233 + }, + { + "epoch": 1.047314161692892, + "grad_norm": 5.5148444175720215, + "learning_rate": 7.604083870083596e-06, + "loss": 0.2788, + "step": 18234 + }, + { + "epoch": 1.047327726532827, + "grad_norm": 5.351809978485107, + "learning_rate": 7.603946827463342e-06, + "loss": 0.2079, + "step": 18235 + }, + { + "epoch": 1.0473412913727618, + "grad_norm": 6.03864860534668, + "learning_rate": 7.603809784843087e-06, + "loss": 0.2974, + "step": 18236 + }, + { + "epoch": 1.0473548562126966, + "grad_norm": 5.027560234069824, + "learning_rate": 7.603672742222832e-06, + "loss": 0.165, + "step": 18237 + }, + { + "epoch": 1.0473684210526315, + "grad_norm": 6.06968355178833, + "learning_rate": 7.603535699602577e-06, + "loss": 0.3108, + "step": 18238 + }, + { + "epoch": 1.0473819858925664, + "grad_norm": 4.957729816436768, + "learning_rate": 7.603398656982323e-06, + "loss": 0.227, + "step": 18239 + }, + { + "epoch": 1.0473955507325015, + "grad_norm": 5.303678035736084, + "learning_rate": 7.603261614362067e-06, + "loss": 0.3411, + "step": 18240 + }, + { + "epoch": 1.0474091155724363, + "grad_norm": 5.012340545654297, + "learning_rate": 7.603124571741812e-06, + "loss": 0.2369, + "step": 18241 + }, + { + "epoch": 1.0474226804123712, + "grad_norm": 5.4017863273620605, + "learning_rate": 7.6029875291215576e-06, + "loss": 0.29, + "step": 18242 + }, + { + "epoch": 1.047436245252306, + "grad_norm": 4.643033027648926, + "learning_rate": 7.602850486501302e-06, + "loss": 0.2995, + "step": 18243 + }, + { + "epoch": 1.047449810092241, + "grad_norm": 7.0851006507873535, + "learning_rate": 7.602713443881048e-06, + "loss": 0.3107, + "step": 18244 + }, + { + "epoch": 1.0474633749321758, + "grad_norm": 6.140091419219971, + "learning_rate": 7.602576401260793e-06, + "loss": 0.3212, + "step": 18245 + }, + { + "epoch": 1.0474769397721106, + "grad_norm": 5.943788051605225, + "learning_rate": 7.602439358640537e-06, + "loss": 0.3733, + "step": 18246 + }, + { + "epoch": 1.0474905046120455, + "grad_norm": 6.095920562744141, + "learning_rate": 7.602302316020283e-06, + "loss": 0.2584, + "step": 18247 + }, + { + "epoch": 1.0475040694519804, + "grad_norm": 5.427087783813477, + "learning_rate": 7.602165273400029e-06, + "loss": 0.2222, + "step": 18248 + }, + { + "epoch": 1.0475176342919155, + "grad_norm": 5.607492446899414, + "learning_rate": 7.602028230779773e-06, + "loss": 0.2727, + "step": 18249 + }, + { + "epoch": 1.0475311991318503, + "grad_norm": 4.761994361877441, + "learning_rate": 7.601891188159518e-06, + "loss": 0.2585, + "step": 18250 + }, + { + "epoch": 1.0475447639717852, + "grad_norm": 5.059879779815674, + "learning_rate": 7.601754145539263e-06, + "loss": 0.3152, + "step": 18251 + }, + { + "epoch": 1.04755832881172, + "grad_norm": 6.956537246704102, + "learning_rate": 7.6016171029190085e-06, + "loss": 0.3352, + "step": 18252 + }, + { + "epoch": 1.047571893651655, + "grad_norm": 6.37165641784668, + "learning_rate": 7.601480060298754e-06, + "loss": 0.3239, + "step": 18253 + }, + { + "epoch": 1.0475854584915898, + "grad_norm": 6.325812816619873, + "learning_rate": 7.601343017678499e-06, + "loss": 0.3227, + "step": 18254 + }, + { + "epoch": 1.0475990233315247, + "grad_norm": 6.9959306716918945, + "learning_rate": 7.601205975058243e-06, + "loss": 0.4166, + "step": 18255 + }, + { + "epoch": 1.0476125881714595, + "grad_norm": 5.137897968292236, + "learning_rate": 7.601068932437989e-06, + "loss": 0.268, + "step": 18256 + }, + { + "epoch": 1.0476261530113944, + "grad_norm": 4.680565357208252, + "learning_rate": 7.600931889817734e-06, + "loss": 0.2274, + "step": 18257 + }, + { + "epoch": 1.0476397178513293, + "grad_norm": 4.7010040283203125, + "learning_rate": 7.600794847197479e-06, + "loss": 0.2257, + "step": 18258 + }, + { + "epoch": 1.0476532826912643, + "grad_norm": 6.4533257484436035, + "learning_rate": 7.600657804577224e-06, + "loss": 0.4291, + "step": 18259 + }, + { + "epoch": 1.0476668475311992, + "grad_norm": 5.248493671417236, + "learning_rate": 7.600520761956969e-06, + "loss": 0.2924, + "step": 18260 + }, + { + "epoch": 1.047680412371134, + "grad_norm": 5.679314613342285, + "learning_rate": 7.600383719336715e-06, + "loss": 0.3578, + "step": 18261 + }, + { + "epoch": 1.047693977211069, + "grad_norm": 5.686367511749268, + "learning_rate": 7.6002466767164594e-06, + "loss": 0.2363, + "step": 18262 + }, + { + "epoch": 1.0477075420510038, + "grad_norm": 4.819868087768555, + "learning_rate": 7.600109634096205e-06, + "loss": 0.2007, + "step": 18263 + }, + { + "epoch": 1.0477211068909387, + "grad_norm": 4.369063854217529, + "learning_rate": 7.599972591475949e-06, + "loss": 0.2484, + "step": 18264 + }, + { + "epoch": 1.0477346717308735, + "grad_norm": 5.025028705596924, + "learning_rate": 7.599835548855695e-06, + "loss": 0.4627, + "step": 18265 + }, + { + "epoch": 1.0477482365708084, + "grad_norm": 7.915596008300781, + "learning_rate": 7.59969850623544e-06, + "loss": 0.3507, + "step": 18266 + }, + { + "epoch": 1.0477618014107433, + "grad_norm": 5.9520792961120605, + "learning_rate": 7.5995614636151845e-06, + "loss": 0.3285, + "step": 18267 + }, + { + "epoch": 1.0477753662506784, + "grad_norm": 6.125327110290527, + "learning_rate": 7.59942442099493e-06, + "loss": 0.257, + "step": 18268 + }, + { + "epoch": 1.0477889310906132, + "grad_norm": 5.72685432434082, + "learning_rate": 7.599287378374676e-06, + "loss": 0.2679, + "step": 18269 + }, + { + "epoch": 1.047802495930548, + "grad_norm": 5.060112953186035, + "learning_rate": 7.599150335754421e-06, + "loss": 0.2359, + "step": 18270 + }, + { + "epoch": 1.047816060770483, + "grad_norm": 5.236216068267822, + "learning_rate": 7.599013293134165e-06, + "loss": 0.1849, + "step": 18271 + }, + { + "epoch": 1.0478296256104178, + "grad_norm": 5.144791603088379, + "learning_rate": 7.59887625051391e-06, + "loss": 0.3224, + "step": 18272 + }, + { + "epoch": 1.0478431904503527, + "grad_norm": 4.586639881134033, + "learning_rate": 7.598739207893655e-06, + "loss": 0.1692, + "step": 18273 + }, + { + "epoch": 1.0478567552902875, + "grad_norm": 5.446622371673584, + "learning_rate": 7.598602165273401e-06, + "loss": 0.3307, + "step": 18274 + }, + { + "epoch": 1.0478703201302224, + "grad_norm": 5.403307914733887, + "learning_rate": 7.598465122653146e-06, + "loss": 0.24, + "step": 18275 + }, + { + "epoch": 1.0478838849701573, + "grad_norm": 7.359179973602295, + "learning_rate": 7.598328080032891e-06, + "loss": 0.3518, + "step": 18276 + }, + { + "epoch": 1.0478974498100921, + "grad_norm": 5.03028678894043, + "learning_rate": 7.5981910374126354e-06, + "loss": 0.224, + "step": 18277 + }, + { + "epoch": 1.0479110146500272, + "grad_norm": 6.944025993347168, + "learning_rate": 7.5980539947923815e-06, + "loss": 0.2592, + "step": 18278 + }, + { + "epoch": 1.047924579489962, + "grad_norm": 3.9644010066986084, + "learning_rate": 7.597916952172127e-06, + "loss": 0.278, + "step": 18279 + }, + { + "epoch": 1.047938144329897, + "grad_norm": 5.79721212387085, + "learning_rate": 7.597779909551871e-06, + "loss": 0.2707, + "step": 18280 + }, + { + "epoch": 1.0479517091698318, + "grad_norm": 5.03768253326416, + "learning_rate": 7.597642866931616e-06, + "loss": 0.2374, + "step": 18281 + }, + { + "epoch": 1.0479652740097667, + "grad_norm": 6.023580074310303, + "learning_rate": 7.597505824311362e-06, + "loss": 0.2963, + "step": 18282 + }, + { + "epoch": 1.0479788388497016, + "grad_norm": 4.857207775115967, + "learning_rate": 7.5973687816911065e-06, + "loss": 0.2797, + "step": 18283 + }, + { + "epoch": 1.0479924036896364, + "grad_norm": 6.051830291748047, + "learning_rate": 7.597231739070852e-06, + "loss": 0.3011, + "step": 18284 + }, + { + "epoch": 1.0480059685295713, + "grad_norm": 4.213793754577637, + "learning_rate": 7.597094696450597e-06, + "loss": 0.2316, + "step": 18285 + }, + { + "epoch": 1.0480195333695062, + "grad_norm": 4.394707679748535, + "learning_rate": 7.596957653830341e-06, + "loss": 0.3271, + "step": 18286 + }, + { + "epoch": 1.0480330982094412, + "grad_norm": 5.483471393585205, + "learning_rate": 7.596820611210087e-06, + "loss": 0.2708, + "step": 18287 + }, + { + "epoch": 1.048046663049376, + "grad_norm": 5.478943824768066, + "learning_rate": 7.596683568589832e-06, + "loss": 0.2701, + "step": 18288 + }, + { + "epoch": 1.048060227889311, + "grad_norm": 7.323968410491943, + "learning_rate": 7.596546525969577e-06, + "loss": 0.2967, + "step": 18289 + }, + { + "epoch": 1.0480737927292458, + "grad_norm": 5.073818683624268, + "learning_rate": 7.596409483349322e-06, + "loss": 0.2653, + "step": 18290 + }, + { + "epoch": 1.0480873575691807, + "grad_norm": 6.261026382446289, + "learning_rate": 7.596272440729068e-06, + "loss": 0.415, + "step": 18291 + }, + { + "epoch": 1.0481009224091156, + "grad_norm": 5.813987731933594, + "learning_rate": 7.596135398108812e-06, + "loss": 0.3504, + "step": 18292 + }, + { + "epoch": 1.0481144872490504, + "grad_norm": 5.869968414306641, + "learning_rate": 7.5959983554885574e-06, + "loss": 0.2132, + "step": 18293 + }, + { + "epoch": 1.0481280520889853, + "grad_norm": 4.794439792633057, + "learning_rate": 7.595861312868303e-06, + "loss": 0.3356, + "step": 18294 + }, + { + "epoch": 1.0481416169289202, + "grad_norm": 5.124124050140381, + "learning_rate": 7.595724270248049e-06, + "loss": 0.3445, + "step": 18295 + }, + { + "epoch": 1.048155181768855, + "grad_norm": 4.2283430099487305, + "learning_rate": 7.595587227627793e-06, + "loss": 0.1901, + "step": 18296 + }, + { + "epoch": 1.0481687466087901, + "grad_norm": 5.72269868850708, + "learning_rate": 7.595450185007538e-06, + "loss": 0.3529, + "step": 18297 + }, + { + "epoch": 1.048182311448725, + "grad_norm": 5.758261680603027, + "learning_rate": 7.5953131423872825e-06, + "loss": 0.3053, + "step": 18298 + }, + { + "epoch": 1.0481958762886598, + "grad_norm": 6.090411186218262, + "learning_rate": 7.595176099767028e-06, + "loss": 0.2816, + "step": 18299 + }, + { + "epoch": 1.0482094411285947, + "grad_norm": 5.4059600830078125, + "learning_rate": 7.595039057146774e-06, + "loss": 0.2648, + "step": 18300 + }, + { + "epoch": 1.0482230059685296, + "grad_norm": 4.945456504821777, + "learning_rate": 7.594902014526518e-06, + "loss": 0.2248, + "step": 18301 + }, + { + "epoch": 1.0482365708084644, + "grad_norm": 4.6195220947265625, + "learning_rate": 7.594764971906263e-06, + "loss": 0.1944, + "step": 18302 + }, + { + "epoch": 1.0482501356483993, + "grad_norm": 5.645791053771973, + "learning_rate": 7.594627929286008e-06, + "loss": 0.1781, + "step": 18303 + }, + { + "epoch": 1.0482637004883342, + "grad_norm": 7.124150276184082, + "learning_rate": 7.594490886665754e-06, + "loss": 0.2874, + "step": 18304 + }, + { + "epoch": 1.048277265328269, + "grad_norm": 5.993952751159668, + "learning_rate": 7.594353844045499e-06, + "loss": 0.2948, + "step": 18305 + }, + { + "epoch": 1.0482908301682041, + "grad_norm": 4.202480792999268, + "learning_rate": 7.594216801425244e-06, + "loss": 0.2268, + "step": 18306 + }, + { + "epoch": 1.048304395008139, + "grad_norm": 7.121936798095703, + "learning_rate": 7.594079758804988e-06, + "loss": 0.3207, + "step": 18307 + }, + { + "epoch": 1.0483179598480739, + "grad_norm": 6.240850448608398, + "learning_rate": 7.593942716184734e-06, + "loss": 0.2861, + "step": 18308 + }, + { + "epoch": 1.0483315246880087, + "grad_norm": 5.0178937911987305, + "learning_rate": 7.5938056735644795e-06, + "loss": 0.2973, + "step": 18309 + }, + { + "epoch": 1.0483450895279436, + "grad_norm": 4.798532485961914, + "learning_rate": 7.593668630944225e-06, + "loss": 0.1966, + "step": 18310 + }, + { + "epoch": 1.0483586543678785, + "grad_norm": 3.8718087673187256, + "learning_rate": 7.593531588323969e-06, + "loss": 0.1546, + "step": 18311 + }, + { + "epoch": 1.0483722192078133, + "grad_norm": 4.599048137664795, + "learning_rate": 7.593394545703714e-06, + "loss": 0.2448, + "step": 18312 + }, + { + "epoch": 1.0483857840477482, + "grad_norm": 4.261128902435303, + "learning_rate": 7.59325750308346e-06, + "loss": 0.1573, + "step": 18313 + }, + { + "epoch": 1.048399348887683, + "grad_norm": 2.670206069946289, + "learning_rate": 7.5931204604632045e-06, + "loss": 0.132, + "step": 18314 + }, + { + "epoch": 1.048412913727618, + "grad_norm": 3.7823071479797363, + "learning_rate": 7.59298341784295e-06, + "loss": 0.189, + "step": 18315 + }, + { + "epoch": 1.048426478567553, + "grad_norm": 4.600653648376465, + "learning_rate": 7.592846375222694e-06, + "loss": 0.2804, + "step": 18316 + }, + { + "epoch": 1.0484400434074879, + "grad_norm": 4.2303690910339355, + "learning_rate": 7.59270933260244e-06, + "loss": 0.1582, + "step": 18317 + }, + { + "epoch": 1.0484536082474227, + "grad_norm": 4.763163089752197, + "learning_rate": 7.592572289982185e-06, + "loss": 0.3362, + "step": 18318 + }, + { + "epoch": 1.0484671730873576, + "grad_norm": 4.871206760406494, + "learning_rate": 7.59243524736193e-06, + "loss": 0.3429, + "step": 18319 + }, + { + "epoch": 1.0484807379272925, + "grad_norm": 5.718414306640625, + "learning_rate": 7.592298204741675e-06, + "loss": 0.2853, + "step": 18320 + }, + { + "epoch": 1.0484943027672273, + "grad_norm": 4.175650119781494, + "learning_rate": 7.592161162121421e-06, + "loss": 0.2761, + "step": 18321 + }, + { + "epoch": 1.0485078676071622, + "grad_norm": 4.602146625518799, + "learning_rate": 7.592024119501166e-06, + "loss": 0.198, + "step": 18322 + }, + { + "epoch": 1.048521432447097, + "grad_norm": 4.168391704559326, + "learning_rate": 7.59188707688091e-06, + "loss": 0.2537, + "step": 18323 + }, + { + "epoch": 1.048534997287032, + "grad_norm": 3.238105058670044, + "learning_rate": 7.5917500342606554e-06, + "loss": 0.189, + "step": 18324 + }, + { + "epoch": 1.048548562126967, + "grad_norm": 12.324953079223633, + "learning_rate": 7.5916129916404015e-06, + "loss": 0.3808, + "step": 18325 + }, + { + "epoch": 1.0485621269669019, + "grad_norm": 3.352015256881714, + "learning_rate": 7.591475949020146e-06, + "loss": 0.2037, + "step": 18326 + }, + { + "epoch": 1.0485756918068367, + "grad_norm": 4.268775463104248, + "learning_rate": 7.591338906399891e-06, + "loss": 0.2338, + "step": 18327 + }, + { + "epoch": 1.0485892566467716, + "grad_norm": 4.302100658416748, + "learning_rate": 7.591201863779636e-06, + "loss": 0.1645, + "step": 18328 + }, + { + "epoch": 1.0486028214867065, + "grad_norm": 6.58286714553833, + "learning_rate": 7.5910648211593805e-06, + "loss": 0.2858, + "step": 18329 + }, + { + "epoch": 1.0486163863266413, + "grad_norm": 5.506366729736328, + "learning_rate": 7.5909277785391265e-06, + "loss": 0.4067, + "step": 18330 + }, + { + "epoch": 1.0486299511665762, + "grad_norm": 4.5933637619018555, + "learning_rate": 7.590790735918872e-06, + "loss": 0.2013, + "step": 18331 + }, + { + "epoch": 1.048643516006511, + "grad_norm": 4.969138145446777, + "learning_rate": 7.590653693298616e-06, + "loss": 0.2855, + "step": 18332 + }, + { + "epoch": 1.048657080846446, + "grad_norm": 5.515780448913574, + "learning_rate": 7.590516650678361e-06, + "loss": 0.3289, + "step": 18333 + }, + { + "epoch": 1.0486706456863808, + "grad_norm": 6.28177547454834, + "learning_rate": 7.590379608058107e-06, + "loss": 0.3226, + "step": 18334 + }, + { + "epoch": 1.0486842105263159, + "grad_norm": 6.119784355163574, + "learning_rate": 7.5902425654378516e-06, + "loss": 0.2239, + "step": 18335 + }, + { + "epoch": 1.0486977753662508, + "grad_norm": 4.527782440185547, + "learning_rate": 7.590105522817597e-06, + "loss": 0.1849, + "step": 18336 + }, + { + "epoch": 1.0487113402061856, + "grad_norm": 6.004769802093506, + "learning_rate": 7.589968480197342e-06, + "loss": 0.3031, + "step": 18337 + }, + { + "epoch": 1.0487249050461205, + "grad_norm": 3.6968841552734375, + "learning_rate": 7.589831437577088e-06, + "loss": 0.1595, + "step": 18338 + }, + { + "epoch": 1.0487384698860553, + "grad_norm": 5.844153881072998, + "learning_rate": 7.589694394956832e-06, + "loss": 0.3375, + "step": 18339 + }, + { + "epoch": 1.0487520347259902, + "grad_norm": 6.327310562133789, + "learning_rate": 7.5895573523365775e-06, + "loss": 0.2972, + "step": 18340 + }, + { + "epoch": 1.048765599565925, + "grad_norm": 6.349325656890869, + "learning_rate": 7.589420309716322e-06, + "loss": 0.2838, + "step": 18341 + }, + { + "epoch": 1.04877916440586, + "grad_norm": 5.970293045043945, + "learning_rate": 7.589283267096067e-06, + "loss": 0.345, + "step": 18342 + }, + { + "epoch": 1.0487927292457948, + "grad_norm": 5.179495811462402, + "learning_rate": 7.589146224475813e-06, + "loss": 0.3138, + "step": 18343 + }, + { + "epoch": 1.04880629408573, + "grad_norm": 4.65814208984375, + "learning_rate": 7.589009181855558e-06, + "loss": 0.2885, + "step": 18344 + }, + { + "epoch": 1.0488198589256648, + "grad_norm": 6.35587739944458, + "learning_rate": 7.5888721392353025e-06, + "loss": 0.2852, + "step": 18345 + }, + { + "epoch": 1.0488334237655996, + "grad_norm": 3.818815231323242, + "learning_rate": 7.588735096615048e-06, + "loss": 0.2115, + "step": 18346 + }, + { + "epoch": 1.0488469886055345, + "grad_norm": 3.791356325149536, + "learning_rate": 7.588598053994794e-06, + "loss": 0.1546, + "step": 18347 + }, + { + "epoch": 1.0488605534454694, + "grad_norm": 4.104461193084717, + "learning_rate": 7.588461011374538e-06, + "loss": 0.2698, + "step": 18348 + }, + { + "epoch": 1.0488741182854042, + "grad_norm": 4.116218566894531, + "learning_rate": 7.588323968754283e-06, + "loss": 0.2235, + "step": 18349 + }, + { + "epoch": 1.048887683125339, + "grad_norm": 4.767809867858887, + "learning_rate": 7.5881869261340276e-06, + "loss": 0.1946, + "step": 18350 + }, + { + "epoch": 1.048901247965274, + "grad_norm": 4.1267008781433105, + "learning_rate": 7.588049883513774e-06, + "loss": 0.1899, + "step": 18351 + }, + { + "epoch": 1.0489148128052088, + "grad_norm": 4.556158542633057, + "learning_rate": 7.587912840893519e-06, + "loss": 0.1538, + "step": 18352 + }, + { + "epoch": 1.0489283776451437, + "grad_norm": 5.205505847930908, + "learning_rate": 7.587775798273264e-06, + "loss": 0.2488, + "step": 18353 + }, + { + "epoch": 1.0489419424850788, + "grad_norm": 4.292879581451416, + "learning_rate": 7.587638755653008e-06, + "loss": 0.2295, + "step": 18354 + }, + { + "epoch": 1.0489555073250136, + "grad_norm": 3.7339859008789062, + "learning_rate": 7.5875017130327534e-06, + "loss": 0.2371, + "step": 18355 + }, + { + "epoch": 1.0489690721649485, + "grad_norm": 5.393531799316406, + "learning_rate": 7.5873646704124995e-06, + "loss": 0.2607, + "step": 18356 + }, + { + "epoch": 1.0489826370048834, + "grad_norm": 5.120926380157471, + "learning_rate": 7.587227627792244e-06, + "loss": 0.2151, + "step": 18357 + }, + { + "epoch": 1.0489962018448182, + "grad_norm": 5.428021430969238, + "learning_rate": 7.587090585171989e-06, + "loss": 0.3454, + "step": 18358 + }, + { + "epoch": 1.049009766684753, + "grad_norm": 5.250650882720947, + "learning_rate": 7.586953542551734e-06, + "loss": 0.2154, + "step": 18359 + }, + { + "epoch": 1.049023331524688, + "grad_norm": 4.830789089202881, + "learning_rate": 7.586816499931479e-06, + "loss": 0.1493, + "step": 18360 + }, + { + "epoch": 1.0490368963646228, + "grad_norm": 4.344332218170166, + "learning_rate": 7.5866794573112245e-06, + "loss": 0.1986, + "step": 18361 + }, + { + "epoch": 1.0490504612045577, + "grad_norm": 4.627732753753662, + "learning_rate": 7.58654241469097e-06, + "loss": 0.2442, + "step": 18362 + }, + { + "epoch": 1.0490640260444928, + "grad_norm": 5.414709568023682, + "learning_rate": 7.586405372070714e-06, + "loss": 0.1827, + "step": 18363 + }, + { + "epoch": 1.0490775908844276, + "grad_norm": 4.178684234619141, + "learning_rate": 7.58626832945046e-06, + "loss": 0.1669, + "step": 18364 + }, + { + "epoch": 1.0490911557243625, + "grad_norm": 5.642507553100586, + "learning_rate": 7.586131286830205e-06, + "loss": 0.2897, + "step": 18365 + }, + { + "epoch": 1.0491047205642974, + "grad_norm": 3.947929859161377, + "learning_rate": 7.5859942442099496e-06, + "loss": 0.1923, + "step": 18366 + }, + { + "epoch": 1.0491182854042322, + "grad_norm": 4.630324363708496, + "learning_rate": 7.585857201589695e-06, + "loss": 0.2395, + "step": 18367 + }, + { + "epoch": 1.049131850244167, + "grad_norm": 4.287143230438232, + "learning_rate": 7.58572015896944e-06, + "loss": 0.2111, + "step": 18368 + }, + { + "epoch": 1.049145415084102, + "grad_norm": 3.4321775436401367, + "learning_rate": 7.585583116349186e-06, + "loss": 0.2066, + "step": 18369 + }, + { + "epoch": 1.0491589799240368, + "grad_norm": 4.774073600769043, + "learning_rate": 7.58544607372893e-06, + "loss": 0.209, + "step": 18370 + }, + { + "epoch": 1.0491725447639717, + "grad_norm": 4.373049259185791, + "learning_rate": 7.5853090311086755e-06, + "loss": 0.1613, + "step": 18371 + }, + { + "epoch": 1.0491861096039066, + "grad_norm": 3.902895212173462, + "learning_rate": 7.58517198848842e-06, + "loss": 0.1848, + "step": 18372 + }, + { + "epoch": 1.0491996744438417, + "grad_norm": 5.6622843742370605, + "learning_rate": 7.585034945868166e-06, + "loss": 0.2772, + "step": 18373 + }, + { + "epoch": 1.0492132392837765, + "grad_norm": 3.7729856967926025, + "learning_rate": 7.584897903247911e-06, + "loss": 0.1866, + "step": 18374 + }, + { + "epoch": 1.0492268041237114, + "grad_norm": 3.832066774368286, + "learning_rate": 7.584760860627655e-06, + "loss": 0.1877, + "step": 18375 + }, + { + "epoch": 1.0492403689636463, + "grad_norm": 4.504273891448975, + "learning_rate": 7.5846238180074005e-06, + "loss": 0.2719, + "step": 18376 + }, + { + "epoch": 1.0492539338035811, + "grad_norm": 5.37227725982666, + "learning_rate": 7.5844867753871465e-06, + "loss": 0.2088, + "step": 18377 + }, + { + "epoch": 1.049267498643516, + "grad_norm": 3.947373151779175, + "learning_rate": 7.584349732766892e-06, + "loss": 0.2308, + "step": 18378 + }, + { + "epoch": 1.0492810634834508, + "grad_norm": 6.199617385864258, + "learning_rate": 7.584212690146636e-06, + "loss": 0.1906, + "step": 18379 + }, + { + "epoch": 1.0492946283233857, + "grad_norm": 5.782829284667969, + "learning_rate": 7.584075647526381e-06, + "loss": 0.2046, + "step": 18380 + }, + { + "epoch": 1.0493081931633206, + "grad_norm": 4.141817092895508, + "learning_rate": 7.5839386049061256e-06, + "loss": 0.1938, + "step": 18381 + }, + { + "epoch": 1.0493217580032557, + "grad_norm": 6.791335582733154, + "learning_rate": 7.583801562285872e-06, + "loss": 0.3128, + "step": 18382 + }, + { + "epoch": 1.0493353228431905, + "grad_norm": 5.635827541351318, + "learning_rate": 7.583664519665617e-06, + "loss": 0.2697, + "step": 18383 + }, + { + "epoch": 1.0493488876831254, + "grad_norm": 4.483277797698975, + "learning_rate": 7.583527477045362e-06, + "loss": 0.2299, + "step": 18384 + }, + { + "epoch": 1.0493624525230603, + "grad_norm": 5.598562717437744, + "learning_rate": 7.583390434425106e-06, + "loss": 0.3372, + "step": 18385 + }, + { + "epoch": 1.0493760173629951, + "grad_norm": 3.6147334575653076, + "learning_rate": 7.583253391804852e-06, + "loss": 0.1233, + "step": 18386 + }, + { + "epoch": 1.04938958220293, + "grad_norm": 4.704916954040527, + "learning_rate": 7.5831163491845975e-06, + "loss": 0.2009, + "step": 18387 + }, + { + "epoch": 1.0494031470428649, + "grad_norm": 4.537675857543945, + "learning_rate": 7.582979306564342e-06, + "loss": 0.1731, + "step": 18388 + }, + { + "epoch": 1.0494167118827997, + "grad_norm": 5.17520809173584, + "learning_rate": 7.582842263944087e-06, + "loss": 0.2252, + "step": 18389 + }, + { + "epoch": 1.0494302767227346, + "grad_norm": 4.867774963378906, + "learning_rate": 7.582705221323833e-06, + "loss": 0.2278, + "step": 18390 + }, + { + "epoch": 1.0494438415626695, + "grad_norm": 6.153672218322754, + "learning_rate": 7.582568178703577e-06, + "loss": 0.3472, + "step": 18391 + }, + { + "epoch": 1.0494574064026045, + "grad_norm": 7.650123596191406, + "learning_rate": 7.5824311360833225e-06, + "loss": 0.3285, + "step": 18392 + }, + { + "epoch": 1.0494709712425394, + "grad_norm": 3.951512575149536, + "learning_rate": 7.582294093463068e-06, + "loss": 0.1297, + "step": 18393 + }, + { + "epoch": 1.0494845360824743, + "grad_norm": 5.153362274169922, + "learning_rate": 7.582157050842813e-06, + "loss": 0.2759, + "step": 18394 + }, + { + "epoch": 1.0494981009224091, + "grad_norm": 4.9935383796691895, + "learning_rate": 7.582020008222558e-06, + "loss": 0.2571, + "step": 18395 + }, + { + "epoch": 1.049511665762344, + "grad_norm": 6.171785831451416, + "learning_rate": 7.581882965602303e-06, + "loss": 0.2688, + "step": 18396 + }, + { + "epoch": 1.0495252306022789, + "grad_norm": 6.979342460632324, + "learning_rate": 7.5817459229820476e-06, + "loss": 0.3347, + "step": 18397 + }, + { + "epoch": 1.0495387954422137, + "grad_norm": 5.7511162757873535, + "learning_rate": 7.581608880361793e-06, + "loss": 0.2328, + "step": 18398 + }, + { + "epoch": 1.0495523602821486, + "grad_norm": 5.34391975402832, + "learning_rate": 7.581471837741539e-06, + "loss": 0.2993, + "step": 18399 + }, + { + "epoch": 1.0495659251220835, + "grad_norm": 4.020502090454102, + "learning_rate": 7.581334795121283e-06, + "loss": 0.2379, + "step": 18400 + }, + { + "epoch": 1.0495794899620186, + "grad_norm": 4.028339862823486, + "learning_rate": 7.581197752501028e-06, + "loss": 0.234, + "step": 18401 + }, + { + "epoch": 1.0495930548019534, + "grad_norm": 5.502811431884766, + "learning_rate": 7.5810607098807735e-06, + "loss": 0.2407, + "step": 18402 + }, + { + "epoch": 1.0496066196418883, + "grad_norm": 4.382223606109619, + "learning_rate": 7.5809236672605195e-06, + "loss": 0.2809, + "step": 18403 + }, + { + "epoch": 1.0496201844818231, + "grad_norm": 5.124169826507568, + "learning_rate": 7.580786624640264e-06, + "loss": 0.3413, + "step": 18404 + }, + { + "epoch": 1.049633749321758, + "grad_norm": 4.5790605545043945, + "learning_rate": 7.580649582020009e-06, + "loss": 0.1767, + "step": 18405 + }, + { + "epoch": 1.0496473141616929, + "grad_norm": 5.120000839233398, + "learning_rate": 7.580512539399753e-06, + "loss": 0.2169, + "step": 18406 + }, + { + "epoch": 1.0496608790016277, + "grad_norm": 4.921546459197998, + "learning_rate": 7.580375496779499e-06, + "loss": 0.2601, + "step": 18407 + }, + { + "epoch": 1.0496744438415626, + "grad_norm": 5.099162578582764, + "learning_rate": 7.5802384541592445e-06, + "loss": 0.2624, + "step": 18408 + }, + { + "epoch": 1.0496880086814975, + "grad_norm": 4.668692111968994, + "learning_rate": 7.580101411538989e-06, + "loss": 0.1749, + "step": 18409 + }, + { + "epoch": 1.0497015735214323, + "grad_norm": 7.580813407897949, + "learning_rate": 7.579964368918734e-06, + "loss": 0.3361, + "step": 18410 + }, + { + "epoch": 1.0497151383613674, + "grad_norm": 5.005957126617432, + "learning_rate": 7.579827326298479e-06, + "loss": 0.2513, + "step": 18411 + }, + { + "epoch": 1.0497287032013023, + "grad_norm": 4.922773838043213, + "learning_rate": 7.579690283678225e-06, + "loss": 0.2492, + "step": 18412 + }, + { + "epoch": 1.0497422680412372, + "grad_norm": 5.263619422912598, + "learning_rate": 7.57955324105797e-06, + "loss": 0.2926, + "step": 18413 + }, + { + "epoch": 1.049755832881172, + "grad_norm": 4.301992416381836, + "learning_rate": 7.579416198437715e-06, + "loss": 0.2831, + "step": 18414 + }, + { + "epoch": 1.049769397721107, + "grad_norm": 5.774589538574219, + "learning_rate": 7.579279155817459e-06, + "loss": 0.2759, + "step": 18415 + }, + { + "epoch": 1.0497829625610418, + "grad_norm": 4.267400741577148, + "learning_rate": 7.579142113197205e-06, + "loss": 0.2043, + "step": 18416 + }, + { + "epoch": 1.0497965274009766, + "grad_norm": 5.252350330352783, + "learning_rate": 7.57900507057695e-06, + "loss": 0.3113, + "step": 18417 + }, + { + "epoch": 1.0498100922409115, + "grad_norm": 4.976059436798096, + "learning_rate": 7.5788680279566955e-06, + "loss": 0.195, + "step": 18418 + }, + { + "epoch": 1.0498236570808464, + "grad_norm": 7.180349349975586, + "learning_rate": 7.57873098533644e-06, + "loss": 0.2975, + "step": 18419 + }, + { + "epoch": 1.0498372219207814, + "grad_norm": 5.626366138458252, + "learning_rate": 7.578593942716186e-06, + "loss": 0.318, + "step": 18420 + }, + { + "epoch": 1.0498507867607163, + "grad_norm": 6.6093292236328125, + "learning_rate": 7.578456900095931e-06, + "loss": 0.3869, + "step": 18421 + }, + { + "epoch": 1.0498643516006512, + "grad_norm": 5.179046154022217, + "learning_rate": 7.578319857475675e-06, + "loss": 0.3761, + "step": 18422 + }, + { + "epoch": 1.049877916440586, + "grad_norm": 5.070349216461182, + "learning_rate": 7.5781828148554205e-06, + "loss": 0.2085, + "step": 18423 + }, + { + "epoch": 1.049891481280521, + "grad_norm": 5.882847309112549, + "learning_rate": 7.578045772235165e-06, + "loss": 0.3179, + "step": 18424 + }, + { + "epoch": 1.0499050461204558, + "grad_norm": 6.783071994781494, + "learning_rate": 7.577908729614911e-06, + "loss": 0.319, + "step": 18425 + }, + { + "epoch": 1.0499186109603906, + "grad_norm": 3.8942794799804688, + "learning_rate": 7.577771686994656e-06, + "loss": 0.3185, + "step": 18426 + }, + { + "epoch": 1.0499321758003255, + "grad_norm": 7.672930717468262, + "learning_rate": 7.577634644374401e-06, + "loss": 0.3758, + "step": 18427 + }, + { + "epoch": 1.0499457406402604, + "grad_norm": 5.367619037628174, + "learning_rate": 7.577497601754146e-06, + "loss": 0.1965, + "step": 18428 + }, + { + "epoch": 1.0499593054801952, + "grad_norm": 7.408793926239014, + "learning_rate": 7.577360559133892e-06, + "loss": 0.3129, + "step": 18429 + }, + { + "epoch": 1.0499728703201303, + "grad_norm": 7.560757637023926, + "learning_rate": 7.577223516513637e-06, + "loss": 0.3481, + "step": 18430 + }, + { + "epoch": 1.0499728703201303, + "eval_loss": 0.34797126054763794, + "eval_noise_accuracy": NaN, + "eval_runtime": 4534.4076, + "eval_samples_per_second": 1.108, + "eval_steps_per_second": 0.069, + "eval_wer": 28.13629483179389, + "step": 18430 + }, + { + "epoch": 1.0499864351600652, + "grad_norm": 4.168925762176514, + "learning_rate": 7.577086473893381e-06, + "loss": 0.2382, + "step": 18431 + }, + { + "epoch": 1.05, + "grad_norm": 4.354788780212402, + "learning_rate": 7.576949431273126e-06, + "loss": 0.1701, + "step": 18432 + }, + { + "epoch": 1.050013564839935, + "grad_norm": 6.125268459320068, + "learning_rate": 7.576812388652872e-06, + "loss": 0.2902, + "step": 18433 + }, + { + "epoch": 1.0500271296798698, + "grad_norm": 3.656670570373535, + "learning_rate": 7.576675346032617e-06, + "loss": 0.1577, + "step": 18434 + }, + { + "epoch": 1.0500406945198046, + "grad_norm": 5.398870944976807, + "learning_rate": 7.576538303412362e-06, + "loss": 0.3853, + "step": 18435 + }, + { + "epoch": 1.0500542593597395, + "grad_norm": 4.424790382385254, + "learning_rate": 7.576401260792107e-06, + "loss": 0.2404, + "step": 18436 + }, + { + "epoch": 1.0500678241996744, + "grad_norm": 5.609872817993164, + "learning_rate": 7.576264218171851e-06, + "loss": 0.2766, + "step": 18437 + }, + { + "epoch": 1.0500813890396092, + "grad_norm": 5.21174955368042, + "learning_rate": 7.576127175551597e-06, + "loss": 0.3225, + "step": 18438 + }, + { + "epoch": 1.0500949538795443, + "grad_norm": 6.158812522888184, + "learning_rate": 7.5759901329313425e-06, + "loss": 0.3317, + "step": 18439 + }, + { + "epoch": 1.0501085187194792, + "grad_norm": 5.152443885803223, + "learning_rate": 7.575853090311087e-06, + "loss": 0.3473, + "step": 18440 + }, + { + "epoch": 1.050122083559414, + "grad_norm": 4.0378923416137695, + "learning_rate": 7.575716047690832e-06, + "loss": 0.248, + "step": 18441 + }, + { + "epoch": 1.050135648399349, + "grad_norm": 6.712604999542236, + "learning_rate": 7.575579005070578e-06, + "loss": 0.3334, + "step": 18442 + }, + { + "epoch": 1.0501492132392838, + "grad_norm": 4.815308570861816, + "learning_rate": 7.575441962450322e-06, + "loss": 0.1975, + "step": 18443 + }, + { + "epoch": 1.0501627780792187, + "grad_norm": 4.656893253326416, + "learning_rate": 7.575304919830068e-06, + "loss": 0.1861, + "step": 18444 + }, + { + "epoch": 1.0501763429191535, + "grad_norm": 4.675276279449463, + "learning_rate": 7.575167877209813e-06, + "loss": 0.2948, + "step": 18445 + }, + { + "epoch": 1.0501899077590884, + "grad_norm": 5.379252910614014, + "learning_rate": 7.575030834589559e-06, + "loss": 0.3036, + "step": 18446 + }, + { + "epoch": 1.0502034725990232, + "grad_norm": 5.984286308288574, + "learning_rate": 7.574893791969303e-06, + "loss": 0.3171, + "step": 18447 + }, + { + "epoch": 1.0502170374389581, + "grad_norm": 6.034285545349121, + "learning_rate": 7.574756749349048e-06, + "loss": 0.3287, + "step": 18448 + }, + { + "epoch": 1.0502306022788932, + "grad_norm": 7.176599025726318, + "learning_rate": 7.574619706728793e-06, + "loss": 0.529, + "step": 18449 + }, + { + "epoch": 1.050244167118828, + "grad_norm": 7.164152145385742, + "learning_rate": 7.574482664108538e-06, + "loss": 0.3021, + "step": 18450 + }, + { + "epoch": 1.050257731958763, + "grad_norm": 5.69321346282959, + "learning_rate": 7.574345621488284e-06, + "loss": 0.2914, + "step": 18451 + }, + { + "epoch": 1.0502712967986978, + "grad_norm": 5.539464950561523, + "learning_rate": 7.574208578868029e-06, + "loss": 0.406, + "step": 18452 + }, + { + "epoch": 1.0502848616386327, + "grad_norm": 5.9473371505737305, + "learning_rate": 7.574071536247773e-06, + "loss": 0.2778, + "step": 18453 + }, + { + "epoch": 1.0502984264785675, + "grad_norm": 6.486056327819824, + "learning_rate": 7.5739344936275185e-06, + "loss": 0.4246, + "step": 18454 + }, + { + "epoch": 1.0503119913185024, + "grad_norm": 5.607243061065674, + "learning_rate": 7.5737974510072646e-06, + "loss": 0.4366, + "step": 18455 + }, + { + "epoch": 1.0503255561584373, + "grad_norm": 7.1199564933776855, + "learning_rate": 7.573660408387009e-06, + "loss": 0.3553, + "step": 18456 + }, + { + "epoch": 1.0503391209983721, + "grad_norm": 5.55867338180542, + "learning_rate": 7.573523365766754e-06, + "loss": 0.2996, + "step": 18457 + }, + { + "epoch": 1.0503526858383072, + "grad_norm": 4.539041519165039, + "learning_rate": 7.573386323146498e-06, + "loss": 0.1751, + "step": 18458 + }, + { + "epoch": 1.050366250678242, + "grad_norm": 5.2403788566589355, + "learning_rate": 7.5732492805262444e-06, + "loss": 0.2831, + "step": 18459 + }, + { + "epoch": 1.050379815518177, + "grad_norm": 7.629897117614746, + "learning_rate": 7.57311223790599e-06, + "loss": 0.4377, + "step": 18460 + }, + { + "epoch": 1.0503933803581118, + "grad_norm": 5.28642463684082, + "learning_rate": 7.572975195285735e-06, + "loss": 0.3759, + "step": 18461 + }, + { + "epoch": 1.0504069451980467, + "grad_norm": 6.190364837646484, + "learning_rate": 7.572838152665479e-06, + "loss": 0.3857, + "step": 18462 + }, + { + "epoch": 1.0504205100379815, + "grad_norm": 7.4917144775390625, + "learning_rate": 7.572701110045225e-06, + "loss": 0.5023, + "step": 18463 + }, + { + "epoch": 1.0504340748779164, + "grad_norm": 5.704315662384033, + "learning_rate": 7.57256406742497e-06, + "loss": 0.4244, + "step": 18464 + }, + { + "epoch": 1.0504476397178513, + "grad_norm": 4.157012939453125, + "learning_rate": 7.572427024804715e-06, + "loss": 0.2398, + "step": 18465 + }, + { + "epoch": 1.0504612045577861, + "grad_norm": 5.518191337585449, + "learning_rate": 7.57228998218446e-06, + "loss": 0.3219, + "step": 18466 + }, + { + "epoch": 1.050474769397721, + "grad_norm": 5.0712199211120605, + "learning_rate": 7.572152939564205e-06, + "loss": 0.2819, + "step": 18467 + }, + { + "epoch": 1.050488334237656, + "grad_norm": 4.983087539672852, + "learning_rate": 7.57201589694395e-06, + "loss": 0.2294, + "step": 18468 + }, + { + "epoch": 1.050501899077591, + "grad_norm": 5.984880447387695, + "learning_rate": 7.571878854323695e-06, + "loss": 0.3369, + "step": 18469 + }, + { + "epoch": 1.0505154639175258, + "grad_norm": 4.174939155578613, + "learning_rate": 7.5717418117034405e-06, + "loss": 0.2889, + "step": 18470 + }, + { + "epoch": 1.0505290287574607, + "grad_norm": 5.094361305236816, + "learning_rate": 7.571604769083185e-06, + "loss": 0.3723, + "step": 18471 + }, + { + "epoch": 1.0505425935973955, + "grad_norm": 5.093506336212158, + "learning_rate": 7.571467726462931e-06, + "loss": 0.2881, + "step": 18472 + }, + { + "epoch": 1.0505561584373304, + "grad_norm": 5.204371452331543, + "learning_rate": 7.571330683842676e-06, + "loss": 0.3478, + "step": 18473 + }, + { + "epoch": 1.0505697232772653, + "grad_norm": 5.983643054962158, + "learning_rate": 7.57119364122242e-06, + "loss": 0.2842, + "step": 18474 + }, + { + "epoch": 1.0505832881172001, + "grad_norm": 5.802605152130127, + "learning_rate": 7.571056598602166e-06, + "loss": 0.3241, + "step": 18475 + }, + { + "epoch": 1.050596852957135, + "grad_norm": 4.044837474822998, + "learning_rate": 7.570919555981912e-06, + "loss": 0.3567, + "step": 18476 + }, + { + "epoch": 1.05061041779707, + "grad_norm": 5.569909572601318, + "learning_rate": 7.570782513361656e-06, + "loss": 0.403, + "step": 18477 + }, + { + "epoch": 1.050623982637005, + "grad_norm": 4.600683689117432, + "learning_rate": 7.570645470741401e-06, + "loss": 0.3022, + "step": 18478 + }, + { + "epoch": 1.0506375474769398, + "grad_norm": 8.352293968200684, + "learning_rate": 7.570508428121146e-06, + "loss": 0.3595, + "step": 18479 + }, + { + "epoch": 1.0506511123168747, + "grad_norm": 4.57521390914917, + "learning_rate": 7.570371385500891e-06, + "loss": 0.2551, + "step": 18480 + }, + { + "epoch": 1.0506646771568096, + "grad_norm": 9.164478302001953, + "learning_rate": 7.570234342880637e-06, + "loss": 0.3785, + "step": 18481 + }, + { + "epoch": 1.0506782419967444, + "grad_norm": 6.852389335632324, + "learning_rate": 7.570097300260382e-06, + "loss": 0.4484, + "step": 18482 + }, + { + "epoch": 1.0506918068366793, + "grad_norm": 5.463761329650879, + "learning_rate": 7.569960257640126e-06, + "loss": 0.2767, + "step": 18483 + }, + { + "epoch": 1.0507053716766142, + "grad_norm": 4.64449405670166, + "learning_rate": 7.569823215019871e-06, + "loss": 0.2032, + "step": 18484 + }, + { + "epoch": 1.050718936516549, + "grad_norm": 7.785191059112549, + "learning_rate": 7.569686172399617e-06, + "loss": 0.2899, + "step": 18485 + }, + { + "epoch": 1.0507325013564839, + "grad_norm": 5.217334747314453, + "learning_rate": 7.5695491297793626e-06, + "loss": 0.2307, + "step": 18486 + }, + { + "epoch": 1.050746066196419, + "grad_norm": 5.415945529937744, + "learning_rate": 7.569412087159107e-06, + "loss": 0.2986, + "step": 18487 + }, + { + "epoch": 1.0507596310363538, + "grad_norm": 6.163363933563232, + "learning_rate": 7.569275044538852e-06, + "loss": 0.33, + "step": 18488 + }, + { + "epoch": 1.0507731958762887, + "grad_norm": 5.640991687774658, + "learning_rate": 7.569138001918598e-06, + "loss": 0.3855, + "step": 18489 + }, + { + "epoch": 1.0507867607162236, + "grad_norm": 6.532663345336914, + "learning_rate": 7.5690009592983424e-06, + "loss": 0.2705, + "step": 18490 + }, + { + "epoch": 1.0508003255561584, + "grad_norm": 3.705981731414795, + "learning_rate": 7.568863916678088e-06, + "loss": 0.237, + "step": 18491 + }, + { + "epoch": 1.0508138903960933, + "grad_norm": 5.8991007804870605, + "learning_rate": 7.568726874057832e-06, + "loss": 0.2286, + "step": 18492 + }, + { + "epoch": 1.0508274552360282, + "grad_norm": 6.295729160308838, + "learning_rate": 7.568589831437577e-06, + "loss": 0.322, + "step": 18493 + }, + { + "epoch": 1.050841020075963, + "grad_norm": 5.29301643371582, + "learning_rate": 7.568452788817323e-06, + "loss": 0.2753, + "step": 18494 + }, + { + "epoch": 1.050854584915898, + "grad_norm": 5.698147296905518, + "learning_rate": 7.568315746197068e-06, + "loss": 0.3043, + "step": 18495 + }, + { + "epoch": 1.050868149755833, + "grad_norm": 4.773892879486084, + "learning_rate": 7.568178703576813e-06, + "loss": 0.2111, + "step": 18496 + }, + { + "epoch": 1.0508817145957678, + "grad_norm": 5.878757476806641, + "learning_rate": 7.568041660956558e-06, + "loss": 0.3233, + "step": 18497 + }, + { + "epoch": 1.0508952794357027, + "grad_norm": 7.859836578369141, + "learning_rate": 7.567904618336304e-06, + "loss": 0.3065, + "step": 18498 + }, + { + "epoch": 1.0509088442756376, + "grad_norm": 5.384515762329102, + "learning_rate": 7.567767575716048e-06, + "loss": 0.2682, + "step": 18499 + }, + { + "epoch": 1.0509224091155724, + "grad_norm": 5.244321823120117, + "learning_rate": 7.567630533095793e-06, + "loss": 0.2149, + "step": 18500 + }, + { + "epoch": 1.0509359739555073, + "grad_norm": 6.954065799713135, + "learning_rate": 7.5674934904755386e-06, + "loss": 0.3041, + "step": 18501 + }, + { + "epoch": 1.0509495387954422, + "grad_norm": 6.130558967590332, + "learning_rate": 7.567356447855284e-06, + "loss": 0.2746, + "step": 18502 + }, + { + "epoch": 1.050963103635377, + "grad_norm": 6.4081878662109375, + "learning_rate": 7.567219405235029e-06, + "loss": 0.3174, + "step": 18503 + }, + { + "epoch": 1.050976668475312, + "grad_norm": 5.092771053314209, + "learning_rate": 7.567082362614774e-06, + "loss": 0.1933, + "step": 18504 + }, + { + "epoch": 1.0509902333152468, + "grad_norm": 7.009486675262451, + "learning_rate": 7.566945319994518e-06, + "loss": 0.2579, + "step": 18505 + }, + { + "epoch": 1.0510037981551819, + "grad_norm": 7.335385322570801, + "learning_rate": 7.566808277374264e-06, + "loss": 0.3617, + "step": 18506 + }, + { + "epoch": 1.0510173629951167, + "grad_norm": 5.4237565994262695, + "learning_rate": 7.56667123475401e-06, + "loss": 0.2366, + "step": 18507 + }, + { + "epoch": 1.0510309278350516, + "grad_norm": 5.732712268829346, + "learning_rate": 7.566534192133754e-06, + "loss": 0.2652, + "step": 18508 + }, + { + "epoch": 1.0510444926749865, + "grad_norm": 7.669715881347656, + "learning_rate": 7.566397149513499e-06, + "loss": 0.314, + "step": 18509 + }, + { + "epoch": 1.0510580575149213, + "grad_norm": 4.48713493347168, + "learning_rate": 7.566260106893244e-06, + "loss": 0.1881, + "step": 18510 + }, + { + "epoch": 1.0510716223548562, + "grad_norm": 5.208380222320557, + "learning_rate": 7.56612306427299e-06, + "loss": 0.212, + "step": 18511 + }, + { + "epoch": 1.051085187194791, + "grad_norm": 5.215540409088135, + "learning_rate": 7.565986021652735e-06, + "loss": 0.1837, + "step": 18512 + }, + { + "epoch": 1.051098752034726, + "grad_norm": 5.454411029815674, + "learning_rate": 7.56584897903248e-06, + "loss": 0.2466, + "step": 18513 + }, + { + "epoch": 1.0511123168746608, + "grad_norm": 6.7213134765625, + "learning_rate": 7.565711936412224e-06, + "loss": 0.2516, + "step": 18514 + }, + { + "epoch": 1.0511258817145959, + "grad_norm": 4.577330589294434, + "learning_rate": 7.56557489379197e-06, + "loss": 0.1564, + "step": 18515 + }, + { + "epoch": 1.0511394465545307, + "grad_norm": 6.534182548522949, + "learning_rate": 7.565437851171715e-06, + "loss": 0.3093, + "step": 18516 + }, + { + "epoch": 1.0511530113944656, + "grad_norm": 5.911357402801514, + "learning_rate": 7.56530080855146e-06, + "loss": 0.3287, + "step": 18517 + }, + { + "epoch": 1.0511665762344005, + "grad_norm": 5.8934502601623535, + "learning_rate": 7.565163765931205e-06, + "loss": 0.3033, + "step": 18518 + }, + { + "epoch": 1.0511801410743353, + "grad_norm": 7.705862522125244, + "learning_rate": 7.56502672331095e-06, + "loss": 0.2307, + "step": 18519 + }, + { + "epoch": 1.0511937059142702, + "grad_norm": 7.524319648742676, + "learning_rate": 7.564889680690696e-06, + "loss": 0.2931, + "step": 18520 + }, + { + "epoch": 1.051207270754205, + "grad_norm": 6.312933444976807, + "learning_rate": 7.5647526380704404e-06, + "loss": 0.2959, + "step": 18521 + }, + { + "epoch": 1.05122083559414, + "grad_norm": 7.708216190338135, + "learning_rate": 7.564615595450186e-06, + "loss": 0.5876, + "step": 18522 + }, + { + "epoch": 1.0512344004340748, + "grad_norm": 5.467614650726318, + "learning_rate": 7.56447855282993e-06, + "loss": 0.307, + "step": 18523 + }, + { + "epoch": 1.0512479652740097, + "grad_norm": 5.606828689575195, + "learning_rate": 7.564341510209676e-06, + "loss": 0.3327, + "step": 18524 + }, + { + "epoch": 1.0512615301139447, + "grad_norm": 4.925049781799316, + "learning_rate": 7.564204467589421e-06, + "loss": 0.2549, + "step": 18525 + }, + { + "epoch": 1.0512750949538796, + "grad_norm": 5.438410758972168, + "learning_rate": 7.5640674249691655e-06, + "loss": 0.2786, + "step": 18526 + }, + { + "epoch": 1.0512886597938145, + "grad_norm": 5.773740291595459, + "learning_rate": 7.563930382348911e-06, + "loss": 0.2461, + "step": 18527 + }, + { + "epoch": 1.0513022246337493, + "grad_norm": 8.144425392150879, + "learning_rate": 7.563793339728657e-06, + "loss": 0.4106, + "step": 18528 + }, + { + "epoch": 1.0513157894736842, + "grad_norm": 4.680975914001465, + "learning_rate": 7.563656297108402e-06, + "loss": 0.2173, + "step": 18529 + }, + { + "epoch": 1.051329354313619, + "grad_norm": 6.059060096740723, + "learning_rate": 7.563519254488146e-06, + "loss": 0.2988, + "step": 18530 + }, + { + "epoch": 1.051342919153554, + "grad_norm": 5.596226692199707, + "learning_rate": 7.563382211867891e-06, + "loss": 0.3048, + "step": 18531 + }, + { + "epoch": 1.0513564839934888, + "grad_norm": 5.240821838378906, + "learning_rate": 7.563245169247637e-06, + "loss": 0.2685, + "step": 18532 + }, + { + "epoch": 1.0513700488334237, + "grad_norm": 5.971137046813965, + "learning_rate": 7.563108126627382e-06, + "loss": 0.271, + "step": 18533 + }, + { + "epoch": 1.0513836136733588, + "grad_norm": 4.91404914855957, + "learning_rate": 7.562971084007127e-06, + "loss": 0.23, + "step": 18534 + }, + { + "epoch": 1.0513971785132936, + "grad_norm": 4.960956573486328, + "learning_rate": 7.562834041386872e-06, + "loss": 0.394, + "step": 18535 + }, + { + "epoch": 1.0514107433532285, + "grad_norm": 5.049346446990967, + "learning_rate": 7.5626969987666164e-06, + "loss": 0.3059, + "step": 18536 + }, + { + "epoch": 1.0514243081931633, + "grad_norm": 4.847475528717041, + "learning_rate": 7.5625599561463625e-06, + "loss": 0.2727, + "step": 18537 + }, + { + "epoch": 1.0514378730330982, + "grad_norm": 7.018542289733887, + "learning_rate": 7.562422913526108e-06, + "loss": 0.3866, + "step": 18538 + }, + { + "epoch": 1.051451437873033, + "grad_norm": 5.732588768005371, + "learning_rate": 7.562285870905852e-06, + "loss": 0.3133, + "step": 18539 + }, + { + "epoch": 1.051465002712968, + "grad_norm": 5.646676063537598, + "learning_rate": 7.562148828285597e-06, + "loss": 0.2398, + "step": 18540 + }, + { + "epoch": 1.0514785675529028, + "grad_norm": 3.9537885189056396, + "learning_rate": 7.562011785665343e-06, + "loss": 0.1706, + "step": 18541 + }, + { + "epoch": 1.0514921323928377, + "grad_norm": 6.560184478759766, + "learning_rate": 7.5618747430450875e-06, + "loss": 0.4781, + "step": 18542 + }, + { + "epoch": 1.0515056972327725, + "grad_norm": 4.925622463226318, + "learning_rate": 7.561737700424833e-06, + "loss": 0.3407, + "step": 18543 + }, + { + "epoch": 1.0515192620727076, + "grad_norm": 6.216310977935791, + "learning_rate": 7.561600657804578e-06, + "loss": 0.2784, + "step": 18544 + }, + { + "epoch": 1.0515328269126425, + "grad_norm": 4.146687984466553, + "learning_rate": 7.561463615184324e-06, + "loss": 0.1869, + "step": 18545 + }, + { + "epoch": 1.0515463917525774, + "grad_norm": 6.51530647277832, + "learning_rate": 7.561326572564068e-06, + "loss": 0.3365, + "step": 18546 + }, + { + "epoch": 1.0515599565925122, + "grad_norm": 7.221515655517578, + "learning_rate": 7.561189529943813e-06, + "loss": 0.3471, + "step": 18547 + }, + { + "epoch": 1.051573521432447, + "grad_norm": 4.573788166046143, + "learning_rate": 7.561052487323558e-06, + "loss": 0.1631, + "step": 18548 + }, + { + "epoch": 1.051587086272382, + "grad_norm": 6.404078483581543, + "learning_rate": 7.560915444703303e-06, + "loss": 0.4084, + "step": 18549 + }, + { + "epoch": 1.0516006511123168, + "grad_norm": 4.95184850692749, + "learning_rate": 7.560778402083049e-06, + "loss": 0.3484, + "step": 18550 + }, + { + "epoch": 1.0516142159522517, + "grad_norm": 6.896451950073242, + "learning_rate": 7.560641359462793e-06, + "loss": 0.3839, + "step": 18551 + }, + { + "epoch": 1.0516277807921866, + "grad_norm": 4.513515949249268, + "learning_rate": 7.5605043168425384e-06, + "loss": 0.2695, + "step": 18552 + }, + { + "epoch": 1.0516413456321216, + "grad_norm": 5.500943660736084, + "learning_rate": 7.560367274222284e-06, + "loss": 0.2154, + "step": 18553 + }, + { + "epoch": 1.0516549104720565, + "grad_norm": 6.101462364196777, + "learning_rate": 7.56023023160203e-06, + "loss": 0.3771, + "step": 18554 + }, + { + "epoch": 1.0516684753119914, + "grad_norm": 5.127666473388672, + "learning_rate": 7.560093188981774e-06, + "loss": 0.288, + "step": 18555 + }, + { + "epoch": 1.0516820401519262, + "grad_norm": 6.386368274688721, + "learning_rate": 7.559956146361519e-06, + "loss": 0.2992, + "step": 18556 + }, + { + "epoch": 1.051695604991861, + "grad_norm": 6.1475419998168945, + "learning_rate": 7.5598191037412635e-06, + "loss": 0.4991, + "step": 18557 + }, + { + "epoch": 1.051709169831796, + "grad_norm": 4.808414936065674, + "learning_rate": 7.5596820611210095e-06, + "loss": 0.3411, + "step": 18558 + }, + { + "epoch": 1.0517227346717308, + "grad_norm": 8.103814125061035, + "learning_rate": 7.559545018500755e-06, + "loss": 0.3582, + "step": 18559 + }, + { + "epoch": 1.0517362995116657, + "grad_norm": 5.821413516998291, + "learning_rate": 7.5594079758805e-06, + "loss": 0.261, + "step": 18560 + }, + { + "epoch": 1.0517498643516006, + "grad_norm": 6.505198955535889, + "learning_rate": 7.559270933260244e-06, + "loss": 0.2984, + "step": 18561 + }, + { + "epoch": 1.0517634291915354, + "grad_norm": 5.807586193084717, + "learning_rate": 7.559133890639989e-06, + "loss": 0.346, + "step": 18562 + }, + { + "epoch": 1.0517769940314705, + "grad_norm": 6.160665035247803, + "learning_rate": 7.558996848019735e-06, + "loss": 0.2451, + "step": 18563 + }, + { + "epoch": 1.0517905588714054, + "grad_norm": 5.55353307723999, + "learning_rate": 7.55885980539948e-06, + "loss": 0.3087, + "step": 18564 + }, + { + "epoch": 1.0518041237113402, + "grad_norm": 5.470317363739014, + "learning_rate": 7.558722762779225e-06, + "loss": 0.3313, + "step": 18565 + }, + { + "epoch": 1.051817688551275, + "grad_norm": 6.86551570892334, + "learning_rate": 7.558585720158969e-06, + "loss": 0.2847, + "step": 18566 + }, + { + "epoch": 1.05183125339121, + "grad_norm": 7.501993656158447, + "learning_rate": 7.558448677538715e-06, + "loss": 0.314, + "step": 18567 + }, + { + "epoch": 1.0518448182311448, + "grad_norm": 5.552555561065674, + "learning_rate": 7.5583116349184605e-06, + "loss": 0.4649, + "step": 18568 + }, + { + "epoch": 1.0518583830710797, + "grad_norm": 4.8074798583984375, + "learning_rate": 7.558174592298206e-06, + "loss": 0.4309, + "step": 18569 + }, + { + "epoch": 1.0518719479110146, + "grad_norm": 5.9980854988098145, + "learning_rate": 7.55803754967795e-06, + "loss": 0.2264, + "step": 18570 + }, + { + "epoch": 1.0518855127509494, + "grad_norm": 5.428356647491455, + "learning_rate": 7.557900507057696e-06, + "loss": 0.3005, + "step": 18571 + }, + { + "epoch": 1.0518990775908845, + "grad_norm": 6.563031196594238, + "learning_rate": 7.557763464437441e-06, + "loss": 0.2701, + "step": 18572 + }, + { + "epoch": 1.0519126424308194, + "grad_norm": 5.032731533050537, + "learning_rate": 7.5576264218171855e-06, + "loss": 0.313, + "step": 18573 + }, + { + "epoch": 1.0519262072707543, + "grad_norm": 7.2450690269470215, + "learning_rate": 7.557489379196931e-06, + "loss": 0.3418, + "step": 18574 + }, + { + "epoch": 1.0519397721106891, + "grad_norm": 4.975924015045166, + "learning_rate": 7.557352336576676e-06, + "loss": 0.218, + "step": 18575 + }, + { + "epoch": 1.051953336950624, + "grad_norm": 6.750397205352783, + "learning_rate": 7.557215293956421e-06, + "loss": 0.3447, + "step": 18576 + }, + { + "epoch": 1.0519669017905589, + "grad_norm": 5.745971202850342, + "learning_rate": 7.557078251336166e-06, + "loss": 0.3309, + "step": 18577 + }, + { + "epoch": 1.0519804666304937, + "grad_norm": 4.368301868438721, + "learning_rate": 7.556941208715911e-06, + "loss": 0.284, + "step": 18578 + }, + { + "epoch": 1.0519940314704286, + "grad_norm": 4.9890360832214355, + "learning_rate": 7.556804166095656e-06, + "loss": 0.2297, + "step": 18579 + }, + { + "epoch": 1.0520075963103634, + "grad_norm": 7.112928867340088, + "learning_rate": 7.556667123475402e-06, + "loss": 0.3064, + "step": 18580 + }, + { + "epoch": 1.0520211611502985, + "grad_norm": 5.834710597991943, + "learning_rate": 7.556530080855147e-06, + "loss": 0.2496, + "step": 18581 + }, + { + "epoch": 1.0520347259902334, + "grad_norm": 5.102975368499756, + "learning_rate": 7.556393038234891e-06, + "loss": 0.2472, + "step": 18582 + }, + { + "epoch": 1.0520482908301683, + "grad_norm": 7.415034770965576, + "learning_rate": 7.5562559956146364e-06, + "loss": 0.2611, + "step": 18583 + }, + { + "epoch": 1.0520618556701031, + "grad_norm": 5.261405944824219, + "learning_rate": 7.5561189529943825e-06, + "loss": 0.2576, + "step": 18584 + }, + { + "epoch": 1.052075420510038, + "grad_norm": 4.0365166664123535, + "learning_rate": 7.555981910374127e-06, + "loss": 0.1349, + "step": 18585 + }, + { + "epoch": 1.0520889853499729, + "grad_norm": 5.4883575439453125, + "learning_rate": 7.555844867753872e-06, + "loss": 0.3362, + "step": 18586 + }, + { + "epoch": 1.0521025501899077, + "grad_norm": 7.540837287902832, + "learning_rate": 7.555707825133617e-06, + "loss": 0.4627, + "step": 18587 + }, + { + "epoch": 1.0521161150298426, + "grad_norm": 6.481330394744873, + "learning_rate": 7.5555707825133615e-06, + "loss": 0.3433, + "step": 18588 + }, + { + "epoch": 1.0521296798697775, + "grad_norm": 6.027211666107178, + "learning_rate": 7.5554337398931075e-06, + "loss": 0.2391, + "step": 18589 + }, + { + "epoch": 1.0521432447097123, + "grad_norm": 5.4697980880737305, + "learning_rate": 7.555296697272853e-06, + "loss": 0.2232, + "step": 18590 + }, + { + "epoch": 1.0521568095496474, + "grad_norm": 6.407756805419922, + "learning_rate": 7.555159654652597e-06, + "loss": 0.3228, + "step": 18591 + }, + { + "epoch": 1.0521703743895823, + "grad_norm": 5.573204517364502, + "learning_rate": 7.555022612032342e-06, + "loss": 0.241, + "step": 18592 + }, + { + "epoch": 1.0521839392295171, + "grad_norm": 6.0674896240234375, + "learning_rate": 7.554885569412088e-06, + "loss": 0.24, + "step": 18593 + }, + { + "epoch": 1.052197504069452, + "grad_norm": 7.919186115264893, + "learning_rate": 7.554748526791833e-06, + "loss": 0.3801, + "step": 18594 + }, + { + "epoch": 1.0522110689093869, + "grad_norm": 6.259745121002197, + "learning_rate": 7.554611484171578e-06, + "loss": 0.3604, + "step": 18595 + }, + { + "epoch": 1.0522246337493217, + "grad_norm": 5.4527177810668945, + "learning_rate": 7.554474441551323e-06, + "loss": 0.1428, + "step": 18596 + }, + { + "epoch": 1.0522381985892566, + "grad_norm": 4.043707847595215, + "learning_rate": 7.554337398931069e-06, + "loss": 0.2447, + "step": 18597 + }, + { + "epoch": 1.0522517634291915, + "grad_norm": 7.704732894897461, + "learning_rate": 7.554200356310813e-06, + "loss": 0.4229, + "step": 18598 + }, + { + "epoch": 1.0522653282691263, + "grad_norm": 5.790292739868164, + "learning_rate": 7.5540633136905585e-06, + "loss": 0.1904, + "step": 18599 + }, + { + "epoch": 1.0522788931090614, + "grad_norm": 6.539689064025879, + "learning_rate": 7.553926271070303e-06, + "loss": 0.3616, + "step": 18600 + }, + { + "epoch": 1.0522924579489963, + "grad_norm": 5.974842548370361, + "learning_rate": 7.553789228450048e-06, + "loss": 0.3434, + "step": 18601 + }, + { + "epoch": 1.0523060227889312, + "grad_norm": 6.715542793273926, + "learning_rate": 7.553652185829794e-06, + "loss": 0.2783, + "step": 18602 + }, + { + "epoch": 1.052319587628866, + "grad_norm": 6.703165531158447, + "learning_rate": 7.553515143209539e-06, + "loss": 0.2783, + "step": 18603 + }, + { + "epoch": 1.0523331524688009, + "grad_norm": 4.6319098472595215, + "learning_rate": 7.5533781005892835e-06, + "loss": 0.2717, + "step": 18604 + }, + { + "epoch": 1.0523467173087357, + "grad_norm": 5.906975269317627, + "learning_rate": 7.553241057969029e-06, + "loss": 0.3766, + "step": 18605 + }, + { + "epoch": 1.0523602821486706, + "grad_norm": 7.763511657714844, + "learning_rate": 7.553104015348775e-06, + "loss": 0.3508, + "step": 18606 + }, + { + "epoch": 1.0523738469886055, + "grad_norm": 5.335697650909424, + "learning_rate": 7.552966972728519e-06, + "loss": 0.2465, + "step": 18607 + }, + { + "epoch": 1.0523874118285403, + "grad_norm": 5.629779815673828, + "learning_rate": 7.552829930108264e-06, + "loss": 0.2872, + "step": 18608 + }, + { + "epoch": 1.0524009766684752, + "grad_norm": 6.315840244293213, + "learning_rate": 7.552692887488009e-06, + "loss": 0.2862, + "step": 18609 + }, + { + "epoch": 1.0524145415084103, + "grad_norm": 5.759855270385742, + "learning_rate": 7.552555844867755e-06, + "loss": 0.3161, + "step": 18610 + }, + { + "epoch": 1.0524281063483452, + "grad_norm": 6.942453861236572, + "learning_rate": 7.5524188022475e-06, + "loss": 0.3013, + "step": 18611 + }, + { + "epoch": 1.05244167118828, + "grad_norm": 4.963557720184326, + "learning_rate": 7.552281759627245e-06, + "loss": 0.2595, + "step": 18612 + }, + { + "epoch": 1.052455236028215, + "grad_norm": 5.609123706817627, + "learning_rate": 7.552144717006989e-06, + "loss": 0.3316, + "step": 18613 + }, + { + "epoch": 1.0524688008681498, + "grad_norm": 8.004446983337402, + "learning_rate": 7.552007674386735e-06, + "loss": 0.5028, + "step": 18614 + }, + { + "epoch": 1.0524823657080846, + "grad_norm": 5.182316303253174, + "learning_rate": 7.5518706317664805e-06, + "loss": 0.2662, + "step": 18615 + }, + { + "epoch": 1.0524959305480195, + "grad_norm": 6.452136993408203, + "learning_rate": 7.551733589146225e-06, + "loss": 0.2514, + "step": 18616 + }, + { + "epoch": 1.0525094953879544, + "grad_norm": 6.696345329284668, + "learning_rate": 7.55159654652597e-06, + "loss": 0.2852, + "step": 18617 + }, + { + "epoch": 1.0525230602278892, + "grad_norm": 6.446379661560059, + "learning_rate": 7.551459503905715e-06, + "loss": 0.3138, + "step": 18618 + }, + { + "epoch": 1.0525366250678243, + "grad_norm": 7.995143890380859, + "learning_rate": 7.55132246128546e-06, + "loss": 0.6765, + "step": 18619 + }, + { + "epoch": 1.0525501899077592, + "grad_norm": 5.065332412719727, + "learning_rate": 7.5511854186652055e-06, + "loss": 0.2384, + "step": 18620 + }, + { + "epoch": 1.052563754747694, + "grad_norm": 5.502598762512207, + "learning_rate": 7.551048376044951e-06, + "loss": 0.2276, + "step": 18621 + }, + { + "epoch": 1.052577319587629, + "grad_norm": 6.126918792724609, + "learning_rate": 7.550911333424695e-06, + "loss": 0.3828, + "step": 18622 + }, + { + "epoch": 1.0525908844275638, + "grad_norm": 7.897407054901123, + "learning_rate": 7.550774290804441e-06, + "loss": 0.4243, + "step": 18623 + }, + { + "epoch": 1.0526044492674986, + "grad_norm": 7.461602687835693, + "learning_rate": 7.550637248184186e-06, + "loss": 0.4384, + "step": 18624 + }, + { + "epoch": 1.0526180141074335, + "grad_norm": 6.170085430145264, + "learning_rate": 7.5505002055639306e-06, + "loss": 0.4057, + "step": 18625 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 5.028871059417725, + "learning_rate": 7.550363162943676e-06, + "loss": 0.2081, + "step": 18626 + }, + { + "epoch": 1.0526451437873032, + "grad_norm": 6.408971309661865, + "learning_rate": 7.550226120323422e-06, + "loss": 0.322, + "step": 18627 + }, + { + "epoch": 1.052658708627238, + "grad_norm": 4.698835372924805, + "learning_rate": 7.550089077703167e-06, + "loss": 0.2827, + "step": 18628 + }, + { + "epoch": 1.0526722734671732, + "grad_norm": 5.213942527770996, + "learning_rate": 7.549952035082911e-06, + "loss": 0.3275, + "step": 18629 + }, + { + "epoch": 1.052685838307108, + "grad_norm": 5.2108612060546875, + "learning_rate": 7.5498149924626565e-06, + "loss": 0.2307, + "step": 18630 + }, + { + "epoch": 1.052699403147043, + "grad_norm": 4.312490940093994, + "learning_rate": 7.549677949842401e-06, + "loss": 0.225, + "step": 18631 + }, + { + "epoch": 1.0527129679869778, + "grad_norm": 4.821467876434326, + "learning_rate": 7.549540907222147e-06, + "loss": 0.2273, + "step": 18632 + }, + { + "epoch": 1.0527265328269126, + "grad_norm": 4.287898063659668, + "learning_rate": 7.549403864601892e-06, + "loss": 0.3167, + "step": 18633 + }, + { + "epoch": 1.0527400976668475, + "grad_norm": 5.712067604064941, + "learning_rate": 7.549266821981636e-06, + "loss": 0.3457, + "step": 18634 + }, + { + "epoch": 1.0527536625067824, + "grad_norm": 4.109470367431641, + "learning_rate": 7.5491297793613815e-06, + "loss": 0.1555, + "step": 18635 + }, + { + "epoch": 1.0527672273467172, + "grad_norm": 4.344234466552734, + "learning_rate": 7.5489927367411275e-06, + "loss": 0.1873, + "step": 18636 + }, + { + "epoch": 1.052780792186652, + "grad_norm": 6.503149032592773, + "learning_rate": 7.548855694120873e-06, + "loss": 0.3641, + "step": 18637 + }, + { + "epoch": 1.0527943570265872, + "grad_norm": 6.688456058502197, + "learning_rate": 7.548718651500617e-06, + "loss": 0.3015, + "step": 18638 + }, + { + "epoch": 1.052807921866522, + "grad_norm": 5.893544673919678, + "learning_rate": 7.548581608880362e-06, + "loss": 0.2138, + "step": 18639 + }, + { + "epoch": 1.052821486706457, + "grad_norm": 5.917618274688721, + "learning_rate": 7.548444566260108e-06, + "loss": 0.3362, + "step": 18640 + }, + { + "epoch": 1.0528350515463918, + "grad_norm": 4.422091960906982, + "learning_rate": 7.548307523639853e-06, + "loss": 0.3156, + "step": 18641 + }, + { + "epoch": 1.0528486163863267, + "grad_norm": 6.072519302368164, + "learning_rate": 7.548170481019598e-06, + "loss": 0.3607, + "step": 18642 + }, + { + "epoch": 1.0528621812262615, + "grad_norm": 6.562658309936523, + "learning_rate": 7.548033438399343e-06, + "loss": 0.3572, + "step": 18643 + }, + { + "epoch": 1.0528757460661964, + "grad_norm": 5.365809917449951, + "learning_rate": 7.547896395779087e-06, + "loss": 0.3873, + "step": 18644 + }, + { + "epoch": 1.0528893109061312, + "grad_norm": 4.603741645812988, + "learning_rate": 7.547759353158833e-06, + "loss": 0.215, + "step": 18645 + }, + { + "epoch": 1.0529028757460661, + "grad_norm": 7.898876190185547, + "learning_rate": 7.5476223105385785e-06, + "loss": 0.4034, + "step": 18646 + }, + { + "epoch": 1.052916440586001, + "grad_norm": 6.116644382476807, + "learning_rate": 7.547485267918323e-06, + "loss": 0.3203, + "step": 18647 + }, + { + "epoch": 1.052930005425936, + "grad_norm": 4.850048065185547, + "learning_rate": 7.547348225298068e-06, + "loss": 0.2205, + "step": 18648 + }, + { + "epoch": 1.052943570265871, + "grad_norm": 6.198765277862549, + "learning_rate": 7.547211182677814e-06, + "loss": 0.4778, + "step": 18649 + }, + { + "epoch": 1.0529571351058058, + "grad_norm": 6.772700309753418, + "learning_rate": 7.547074140057558e-06, + "loss": 0.3217, + "step": 18650 + }, + { + "epoch": 1.0529706999457407, + "grad_norm": 10.518241882324219, + "learning_rate": 7.5469370974373035e-06, + "loss": 0.519, + "step": 18651 + }, + { + "epoch": 1.0529842647856755, + "grad_norm": 5.899314880371094, + "learning_rate": 7.546800054817049e-06, + "loss": 0.2674, + "step": 18652 + }, + { + "epoch": 1.0529978296256104, + "grad_norm": 7.325019359588623, + "learning_rate": 7.546663012196795e-06, + "loss": 0.5261, + "step": 18653 + }, + { + "epoch": 1.0530113944655453, + "grad_norm": 5.2229437828063965, + "learning_rate": 7.546525969576539e-06, + "loss": 0.3777, + "step": 18654 + }, + { + "epoch": 1.0530249593054801, + "grad_norm": 5.432406902313232, + "learning_rate": 7.546388926956284e-06, + "loss": 0.3395, + "step": 18655 + }, + { + "epoch": 1.053038524145415, + "grad_norm": 4.505463123321533, + "learning_rate": 7.5462518843360286e-06, + "loss": 0.3345, + "step": 18656 + }, + { + "epoch": 1.05305208898535, + "grad_norm": 6.539085388183594, + "learning_rate": 7.546114841715774e-06, + "loss": 0.3776, + "step": 18657 + }, + { + "epoch": 1.053065653825285, + "grad_norm": 5.081206321716309, + "learning_rate": 7.54597779909552e-06, + "loss": 0.4415, + "step": 18658 + }, + { + "epoch": 1.0530792186652198, + "grad_norm": 6.465579986572266, + "learning_rate": 7.545840756475264e-06, + "loss": 0.2897, + "step": 18659 + }, + { + "epoch": 1.0530927835051547, + "grad_norm": 7.682189464569092, + "learning_rate": 7.545703713855009e-06, + "loss": 0.4166, + "step": 18660 + }, + { + "epoch": 1.0531063483450895, + "grad_norm": 4.393143177032471, + "learning_rate": 7.5455666712347545e-06, + "loss": 0.2368, + "step": 18661 + }, + { + "epoch": 1.0531199131850244, + "grad_norm": 5.120398998260498, + "learning_rate": 7.5454296286145005e-06, + "loss": 0.2763, + "step": 18662 + }, + { + "epoch": 1.0531334780249593, + "grad_norm": 5.796664237976074, + "learning_rate": 7.545292585994245e-06, + "loss": 0.3478, + "step": 18663 + }, + { + "epoch": 1.0531470428648941, + "grad_norm": 6.12701940536499, + "learning_rate": 7.54515554337399e-06, + "loss": 0.5113, + "step": 18664 + }, + { + "epoch": 1.053160607704829, + "grad_norm": 5.485937595367432, + "learning_rate": 7.545018500753734e-06, + "loss": 0.3807, + "step": 18665 + }, + { + "epoch": 1.0531741725447639, + "grad_norm": 6.717350959777832, + "learning_rate": 7.54488145813348e-06, + "loss": 0.3441, + "step": 18666 + }, + { + "epoch": 1.053187737384699, + "grad_norm": 6.898866176605225, + "learning_rate": 7.5447444155132255e-06, + "loss": 0.307, + "step": 18667 + }, + { + "epoch": 1.0532013022246338, + "grad_norm": 5.505677223205566, + "learning_rate": 7.54460737289297e-06, + "loss": 0.332, + "step": 18668 + }, + { + "epoch": 1.0532148670645687, + "grad_norm": 5.263008117675781, + "learning_rate": 7.544470330272715e-06, + "loss": 0.2899, + "step": 18669 + }, + { + "epoch": 1.0532284319045035, + "grad_norm": 6.645439147949219, + "learning_rate": 7.54433328765246e-06, + "loss": 0.5609, + "step": 18670 + }, + { + "epoch": 1.0532419967444384, + "grad_norm": 5.16217041015625, + "learning_rate": 7.544196245032206e-06, + "loss": 0.3185, + "step": 18671 + }, + { + "epoch": 1.0532555615843733, + "grad_norm": 5.550475120544434, + "learning_rate": 7.544059202411951e-06, + "loss": 0.2335, + "step": 18672 + }, + { + "epoch": 1.0532691264243081, + "grad_norm": 4.895991325378418, + "learning_rate": 7.543922159791696e-06, + "loss": 0.2562, + "step": 18673 + }, + { + "epoch": 1.053282691264243, + "grad_norm": 5.5540056228637695, + "learning_rate": 7.54378511717144e-06, + "loss": 0.2477, + "step": 18674 + }, + { + "epoch": 1.0532962561041779, + "grad_norm": 5.396631240844727, + "learning_rate": 7.543648074551186e-06, + "loss": 0.28, + "step": 18675 + }, + { + "epoch": 1.053309820944113, + "grad_norm": 6.239282608032227, + "learning_rate": 7.543511031930931e-06, + "loss": 0.2934, + "step": 18676 + }, + { + "epoch": 1.0533233857840478, + "grad_norm": 5.126572132110596, + "learning_rate": 7.5433739893106765e-06, + "loss": 0.2526, + "step": 18677 + }, + { + "epoch": 1.0533369506239827, + "grad_norm": 6.242693901062012, + "learning_rate": 7.543236946690421e-06, + "loss": 0.3432, + "step": 18678 + }, + { + "epoch": 1.0533505154639176, + "grad_norm": 7.473675727844238, + "learning_rate": 7.543099904070167e-06, + "loss": 0.4706, + "step": 18679 + }, + { + "epoch": 1.0533640803038524, + "grad_norm": 5.1618757247924805, + "learning_rate": 7.542962861449912e-06, + "loss": 0.4156, + "step": 18680 + }, + { + "epoch": 1.0533776451437873, + "grad_norm": 5.310155391693115, + "learning_rate": 7.542825818829656e-06, + "loss": 0.2715, + "step": 18681 + }, + { + "epoch": 1.0533912099837222, + "grad_norm": 5.363775730133057, + "learning_rate": 7.5426887762094015e-06, + "loss": 0.3292, + "step": 18682 + }, + { + "epoch": 1.053404774823657, + "grad_norm": 4.601696491241455, + "learning_rate": 7.5425517335891476e-06, + "loss": 0.2484, + "step": 18683 + }, + { + "epoch": 1.0534183396635919, + "grad_norm": 6.459589958190918, + "learning_rate": 7.542414690968892e-06, + "loss": 0.3688, + "step": 18684 + }, + { + "epoch": 1.053431904503527, + "grad_norm": 4.82787561416626, + "learning_rate": 7.542277648348637e-06, + "loss": 0.1719, + "step": 18685 + }, + { + "epoch": 1.0534454693434618, + "grad_norm": 6.069542407989502, + "learning_rate": 7.542140605728382e-06, + "loss": 0.2942, + "step": 18686 + }, + { + "epoch": 1.0534590341833967, + "grad_norm": 4.583578586578369, + "learning_rate": 7.542003563108127e-06, + "loss": 0.29, + "step": 18687 + }, + { + "epoch": 1.0534725990233316, + "grad_norm": 5.885513782501221, + "learning_rate": 7.541866520487873e-06, + "loss": 0.2079, + "step": 18688 + }, + { + "epoch": 1.0534861638632664, + "grad_norm": 7.52855920791626, + "learning_rate": 7.541729477867618e-06, + "loss": 0.3227, + "step": 18689 + }, + { + "epoch": 1.0534997287032013, + "grad_norm": 6.8938679695129395, + "learning_rate": 7.541592435247362e-06, + "loss": 0.2289, + "step": 18690 + }, + { + "epoch": 1.0535132935431362, + "grad_norm": 6.2324748039245605, + "learning_rate": 7.541455392627107e-06, + "loss": 0.2375, + "step": 18691 + }, + { + "epoch": 1.053526858383071, + "grad_norm": 4.428687572479248, + "learning_rate": 7.541318350006853e-06, + "loss": 0.2912, + "step": 18692 + }, + { + "epoch": 1.053540423223006, + "grad_norm": 6.1856207847595215, + "learning_rate": 7.541181307386598e-06, + "loss": 0.2282, + "step": 18693 + }, + { + "epoch": 1.0535539880629408, + "grad_norm": 4.643990993499756, + "learning_rate": 7.541044264766343e-06, + "loss": 0.2416, + "step": 18694 + }, + { + "epoch": 1.0535675529028758, + "grad_norm": 7.867641448974609, + "learning_rate": 7.540907222146088e-06, + "loss": 0.3697, + "step": 18695 + }, + { + "epoch": 1.0535811177428107, + "grad_norm": 4.970150470733643, + "learning_rate": 7.540770179525834e-06, + "loss": 0.3205, + "step": 18696 + }, + { + "epoch": 1.0535946825827456, + "grad_norm": 6.631535530090332, + "learning_rate": 7.540633136905578e-06, + "loss": 0.4114, + "step": 18697 + }, + { + "epoch": 1.0536082474226804, + "grad_norm": 7.144015789031982, + "learning_rate": 7.5404960942853235e-06, + "loss": 0.4496, + "step": 18698 + }, + { + "epoch": 1.0536218122626153, + "grad_norm": 5.623002529144287, + "learning_rate": 7.540359051665068e-06, + "loss": 0.3649, + "step": 18699 + }, + { + "epoch": 1.0536353771025502, + "grad_norm": 4.6288652420043945, + "learning_rate": 7.540222009044813e-06, + "loss": 0.2538, + "step": 18700 + }, + { + "epoch": 1.053648941942485, + "grad_norm": 5.092306613922119, + "learning_rate": 7.540084966424559e-06, + "loss": 0.266, + "step": 18701 + }, + { + "epoch": 1.05366250678242, + "grad_norm": 5.537083625793457, + "learning_rate": 7.539947923804304e-06, + "loss": 0.2466, + "step": 18702 + }, + { + "epoch": 1.0536760716223548, + "grad_norm": 6.122471332550049, + "learning_rate": 7.539810881184049e-06, + "loss": 0.3065, + "step": 18703 + }, + { + "epoch": 1.0536896364622899, + "grad_norm": 5.329137802124023, + "learning_rate": 7.539673838563794e-06, + "loss": 0.2397, + "step": 18704 + }, + { + "epoch": 1.0537032013022247, + "grad_norm": 6.543855667114258, + "learning_rate": 7.53953679594354e-06, + "loss": 0.2957, + "step": 18705 + }, + { + "epoch": 1.0537167661421596, + "grad_norm": 6.272363185882568, + "learning_rate": 7.539399753323284e-06, + "loss": 0.2623, + "step": 18706 + }, + { + "epoch": 1.0537303309820945, + "grad_norm": 6.108826637268066, + "learning_rate": 7.539262710703029e-06, + "loss": 0.3438, + "step": 18707 + }, + { + "epoch": 1.0537438958220293, + "grad_norm": 4.315948009490967, + "learning_rate": 7.539125668082774e-06, + "loss": 0.2795, + "step": 18708 + }, + { + "epoch": 1.0537574606619642, + "grad_norm": 5.969736576080322, + "learning_rate": 7.53898862546252e-06, + "loss": 0.3403, + "step": 18709 + }, + { + "epoch": 1.053771025501899, + "grad_norm": 5.805872440338135, + "learning_rate": 7.538851582842265e-06, + "loss": 0.3474, + "step": 18710 + }, + { + "epoch": 1.053784590341834, + "grad_norm": 3.8699331283569336, + "learning_rate": 7.53871454022201e-06, + "loss": 0.2122, + "step": 18711 + }, + { + "epoch": 1.0537981551817688, + "grad_norm": 6.9378814697265625, + "learning_rate": 7.538577497601754e-06, + "loss": 0.3269, + "step": 18712 + }, + { + "epoch": 1.0538117200217036, + "grad_norm": 6.554511547088623, + "learning_rate": 7.5384404549814995e-06, + "loss": 0.2898, + "step": 18713 + }, + { + "epoch": 1.0538252848616387, + "grad_norm": 4.6725358963012695, + "learning_rate": 7.5383034123612456e-06, + "loss": 0.217, + "step": 18714 + }, + { + "epoch": 1.0538388497015736, + "grad_norm": 5.739834785461426, + "learning_rate": 7.53816636974099e-06, + "loss": 0.2614, + "step": 18715 + }, + { + "epoch": 1.0538524145415085, + "grad_norm": 5.264492511749268, + "learning_rate": 7.538029327120735e-06, + "loss": 0.3741, + "step": 18716 + }, + { + "epoch": 1.0538659793814433, + "grad_norm": 4.771937847137451, + "learning_rate": 7.53789228450048e-06, + "loss": 0.1931, + "step": 18717 + }, + { + "epoch": 1.0538795442213782, + "grad_norm": 6.616918087005615, + "learning_rate": 7.5377552418802254e-06, + "loss": 0.2137, + "step": 18718 + }, + { + "epoch": 1.053893109061313, + "grad_norm": 5.584259510040283, + "learning_rate": 7.537618199259971e-06, + "loss": 0.2804, + "step": 18719 + }, + { + "epoch": 1.053906673901248, + "grad_norm": 5.188056945800781, + "learning_rate": 7.537481156639716e-06, + "loss": 0.2443, + "step": 18720 + }, + { + "epoch": 1.0539202387411828, + "grad_norm": 5.196420192718506, + "learning_rate": 7.53734411401946e-06, + "loss": 0.2935, + "step": 18721 + }, + { + "epoch": 1.0539338035811177, + "grad_norm": 7.129227638244629, + "learning_rate": 7.537207071399206e-06, + "loss": 0.3171, + "step": 18722 + }, + { + "epoch": 1.0539473684210527, + "grad_norm": 3.8985390663146973, + "learning_rate": 7.537070028778951e-06, + "loss": 0.207, + "step": 18723 + }, + { + "epoch": 1.0539609332609876, + "grad_norm": 4.474743366241455, + "learning_rate": 7.536932986158696e-06, + "loss": 0.2678, + "step": 18724 + }, + { + "epoch": 1.0539744981009225, + "grad_norm": 5.774545669555664, + "learning_rate": 7.536795943538441e-06, + "loss": 0.3015, + "step": 18725 + }, + { + "epoch": 1.0539880629408573, + "grad_norm": 5.456213474273682, + "learning_rate": 7.536658900918186e-06, + "loss": 0.214, + "step": 18726 + }, + { + "epoch": 1.0540016277807922, + "grad_norm": 5.021713733673096, + "learning_rate": 7.536521858297931e-06, + "loss": 0.2821, + "step": 18727 + }, + { + "epoch": 1.054015192620727, + "grad_norm": 5.890940189361572, + "learning_rate": 7.536384815677676e-06, + "loss": 0.2886, + "step": 18728 + }, + { + "epoch": 1.054028757460662, + "grad_norm": 3.9938392639160156, + "learning_rate": 7.5362477730574216e-06, + "loss": 0.116, + "step": 18729 + }, + { + "epoch": 1.0540423223005968, + "grad_norm": 5.4791693687438965, + "learning_rate": 7.536110730437166e-06, + "loss": 0.4012, + "step": 18730 + }, + { + "epoch": 1.0540558871405317, + "grad_norm": 7.221343517303467, + "learning_rate": 7.535973687816912e-06, + "loss": 0.311, + "step": 18731 + }, + { + "epoch": 1.0540694519804665, + "grad_norm": 5.63596248626709, + "learning_rate": 7.535836645196657e-06, + "loss": 0.2312, + "step": 18732 + }, + { + "epoch": 1.0540830168204016, + "grad_norm": 6.11590576171875, + "learning_rate": 7.535699602576401e-06, + "loss": 0.3751, + "step": 18733 + }, + { + "epoch": 1.0540965816603365, + "grad_norm": 4.789270401000977, + "learning_rate": 7.535562559956147e-06, + "loss": 0.1929, + "step": 18734 + }, + { + "epoch": 1.0541101465002713, + "grad_norm": 5.750185966491699, + "learning_rate": 7.535425517335893e-06, + "loss": 0.4923, + "step": 18735 + }, + { + "epoch": 1.0541237113402062, + "grad_norm": 5.268966197967529, + "learning_rate": 7.535288474715638e-06, + "loss": 0.2802, + "step": 18736 + }, + { + "epoch": 1.054137276180141, + "grad_norm": 3.970818042755127, + "learning_rate": 7.535151432095382e-06, + "loss": 0.2634, + "step": 18737 + }, + { + "epoch": 1.054150841020076, + "grad_norm": 4.615272045135498, + "learning_rate": 7.535014389475127e-06, + "loss": 0.1919, + "step": 18738 + }, + { + "epoch": 1.0541644058600108, + "grad_norm": 7.179872512817383, + "learning_rate": 7.534877346854872e-06, + "loss": 0.3403, + "step": 18739 + }, + { + "epoch": 1.0541779706999457, + "grad_norm": 6.370193004608154, + "learning_rate": 7.534740304234618e-06, + "loss": 0.3622, + "step": 18740 + }, + { + "epoch": 1.0541915355398805, + "grad_norm": 5.966946125030518, + "learning_rate": 7.534603261614363e-06, + "loss": 0.3218, + "step": 18741 + }, + { + "epoch": 1.0542051003798156, + "grad_norm": 4.613367557525635, + "learning_rate": 7.534466218994107e-06, + "loss": 0.157, + "step": 18742 + }, + { + "epoch": 1.0542186652197505, + "grad_norm": 4.151813983917236, + "learning_rate": 7.534329176373852e-06, + "loss": 0.1122, + "step": 18743 + }, + { + "epoch": 1.0542322300596854, + "grad_norm": 5.177392482757568, + "learning_rate": 7.534192133753598e-06, + "loss": 0.2704, + "step": 18744 + }, + { + "epoch": 1.0542457948996202, + "grad_norm": 7.417710304260254, + "learning_rate": 7.5340550911333436e-06, + "loss": 0.2323, + "step": 18745 + }, + { + "epoch": 1.054259359739555, + "grad_norm": 4.6428303718566895, + "learning_rate": 7.533918048513088e-06, + "loss": 0.3625, + "step": 18746 + }, + { + "epoch": 1.05427292457949, + "grad_norm": 5.577476978302002, + "learning_rate": 7.533781005892833e-06, + "loss": 0.2882, + "step": 18747 + }, + { + "epoch": 1.0542864894194248, + "grad_norm": 4.418259143829346, + "learning_rate": 7.533643963272579e-06, + "loss": 0.2427, + "step": 18748 + }, + { + "epoch": 1.0543000542593597, + "grad_norm": 5.306906223297119, + "learning_rate": 7.5335069206523234e-06, + "loss": 0.2637, + "step": 18749 + }, + { + "epoch": 1.0543136190992946, + "grad_norm": 6.930217266082764, + "learning_rate": 7.533369878032069e-06, + "loss": 0.2771, + "step": 18750 + }, + { + "epoch": 1.0543271839392294, + "grad_norm": 4.971061706542969, + "learning_rate": 7.533232835411814e-06, + "loss": 0.2188, + "step": 18751 + }, + { + "epoch": 1.0543407487791645, + "grad_norm": 5.133634567260742, + "learning_rate": 7.533095792791559e-06, + "loss": 0.2878, + "step": 18752 + }, + { + "epoch": 1.0543543136190994, + "grad_norm": 6.996585369110107, + "learning_rate": 7.532958750171304e-06, + "loss": 0.3695, + "step": 18753 + }, + { + "epoch": 1.0543678784590342, + "grad_norm": 5.8474578857421875, + "learning_rate": 7.532821707551049e-06, + "loss": 0.2818, + "step": 18754 + }, + { + "epoch": 1.054381443298969, + "grad_norm": 4.948092937469482, + "learning_rate": 7.532684664930794e-06, + "loss": 0.2412, + "step": 18755 + }, + { + "epoch": 1.054395008138904, + "grad_norm": 5.574760437011719, + "learning_rate": 7.532547622310539e-06, + "loss": 0.3357, + "step": 18756 + }, + { + "epoch": 1.0544085729788388, + "grad_norm": 4.750402927398682, + "learning_rate": 7.532410579690285e-06, + "loss": 0.2739, + "step": 18757 + }, + { + "epoch": 1.0544221378187737, + "grad_norm": 6.1762261390686035, + "learning_rate": 7.532273537070029e-06, + "loss": 0.2166, + "step": 18758 + }, + { + "epoch": 1.0544357026587086, + "grad_norm": 5.71412467956543, + "learning_rate": 7.532136494449774e-06, + "loss": 0.2717, + "step": 18759 + }, + { + "epoch": 1.0544492674986434, + "grad_norm": 6.658648490905762, + "learning_rate": 7.5319994518295196e-06, + "loss": 0.3829, + "step": 18760 + }, + { + "epoch": 1.0544628323385785, + "grad_norm": 5.680091857910156, + "learning_rate": 7.531862409209265e-06, + "loss": 0.3712, + "step": 18761 + }, + { + "epoch": 1.0544763971785134, + "grad_norm": 8.245695114135742, + "learning_rate": 7.53172536658901e-06, + "loss": 0.4, + "step": 18762 + }, + { + "epoch": 1.0544899620184482, + "grad_norm": 5.552726745605469, + "learning_rate": 7.531588323968755e-06, + "loss": 0.2447, + "step": 18763 + }, + { + "epoch": 1.0545035268583831, + "grad_norm": 6.812146186828613, + "learning_rate": 7.5314512813484994e-06, + "loss": 0.2302, + "step": 18764 + }, + { + "epoch": 1.054517091698318, + "grad_norm": 5.866966247558594, + "learning_rate": 7.5313142387282454e-06, + "loss": 0.3112, + "step": 18765 + }, + { + "epoch": 1.0545306565382528, + "grad_norm": 5.445219993591309, + "learning_rate": 7.531177196107991e-06, + "loss": 0.2545, + "step": 18766 + }, + { + "epoch": 1.0545442213781877, + "grad_norm": 5.777628421783447, + "learning_rate": 7.531040153487735e-06, + "loss": 0.3162, + "step": 18767 + }, + { + "epoch": 1.0545577862181226, + "grad_norm": 5.378759860992432, + "learning_rate": 7.53090311086748e-06, + "loss": 0.1803, + "step": 18768 + }, + { + "epoch": 1.0545713510580574, + "grad_norm": 5.814090728759766, + "learning_rate": 7.530766068247225e-06, + "loss": 0.2932, + "step": 18769 + }, + { + "epoch": 1.0545849158979923, + "grad_norm": 4.319347858428955, + "learning_rate": 7.530629025626971e-06, + "loss": 0.2318, + "step": 18770 + }, + { + "epoch": 1.0545984807379274, + "grad_norm": 7.042673110961914, + "learning_rate": 7.530491983006716e-06, + "loss": 0.2799, + "step": 18771 + }, + { + "epoch": 1.0546120455778623, + "grad_norm": 4.7967610359191895, + "learning_rate": 7.530354940386461e-06, + "loss": 0.2691, + "step": 18772 + }, + { + "epoch": 1.0546256104177971, + "grad_norm": 4.986471176147461, + "learning_rate": 7.530217897766205e-06, + "loss": 0.2757, + "step": 18773 + }, + { + "epoch": 1.054639175257732, + "grad_norm": 7.436698913574219, + "learning_rate": 7.530080855145951e-06, + "loss": 0.2857, + "step": 18774 + }, + { + "epoch": 1.0546527400976669, + "grad_norm": 5.9304728507995605, + "learning_rate": 7.529943812525696e-06, + "loss": 0.3148, + "step": 18775 + }, + { + "epoch": 1.0546663049376017, + "grad_norm": 6.012590408325195, + "learning_rate": 7.529806769905441e-06, + "loss": 0.3102, + "step": 18776 + }, + { + "epoch": 1.0546798697775366, + "grad_norm": 4.258087158203125, + "learning_rate": 7.529669727285186e-06, + "loss": 0.2059, + "step": 18777 + }, + { + "epoch": 1.0546934346174714, + "grad_norm": 4.906341552734375, + "learning_rate": 7.529532684664932e-06, + "loss": 0.2248, + "step": 18778 + }, + { + "epoch": 1.0547069994574063, + "grad_norm": 6.004798412322998, + "learning_rate": 7.529395642044677e-06, + "loss": 0.3282, + "step": 18779 + }, + { + "epoch": 1.0547205642973414, + "grad_norm": 6.092752933502197, + "learning_rate": 7.5292585994244214e-06, + "loss": 0.3262, + "step": 18780 + }, + { + "epoch": 1.0547341291372763, + "grad_norm": 5.19952392578125, + "learning_rate": 7.529121556804167e-06, + "loss": 0.1679, + "step": 18781 + }, + { + "epoch": 1.0547476939772111, + "grad_norm": 4.4271159172058105, + "learning_rate": 7.528984514183911e-06, + "loss": 0.172, + "step": 18782 + }, + { + "epoch": 1.054761258817146, + "grad_norm": 7.676052093505859, + "learning_rate": 7.528847471563657e-06, + "loss": 0.3237, + "step": 18783 + }, + { + "epoch": 1.0547748236570809, + "grad_norm": 6.461935043334961, + "learning_rate": 7.528710428943402e-06, + "loss": 0.3426, + "step": 18784 + }, + { + "epoch": 1.0547883884970157, + "grad_norm": 6.627806663513184, + "learning_rate": 7.528573386323147e-06, + "loss": 0.2902, + "step": 18785 + }, + { + "epoch": 1.0548019533369506, + "grad_norm": 7.388762474060059, + "learning_rate": 7.528436343702892e-06, + "loss": 0.3224, + "step": 18786 + }, + { + "epoch": 1.0548155181768855, + "grad_norm": 4.870514392852783, + "learning_rate": 7.528299301082638e-06, + "loss": 0.3024, + "step": 18787 + }, + { + "epoch": 1.0548290830168203, + "grad_norm": 9.331561088562012, + "learning_rate": 7.528162258462383e-06, + "loss": 0.5017, + "step": 18788 + }, + { + "epoch": 1.0548426478567552, + "grad_norm": 5.115967273712158, + "learning_rate": 7.528025215842127e-06, + "loss": 0.2229, + "step": 18789 + }, + { + "epoch": 1.0548562126966903, + "grad_norm": 5.164057731628418, + "learning_rate": 7.527888173221872e-06, + "loss": 0.2662, + "step": 18790 + }, + { + "epoch": 1.0548697775366251, + "grad_norm": 5.267357349395752, + "learning_rate": 7.527751130601618e-06, + "loss": 0.2879, + "step": 18791 + }, + { + "epoch": 1.05488334237656, + "grad_norm": 5.306943416595459, + "learning_rate": 7.527614087981363e-06, + "loss": 0.2157, + "step": 18792 + }, + { + "epoch": 1.0548969072164949, + "grad_norm": 5.295227527618408, + "learning_rate": 7.527477045361108e-06, + "loss": 0.2381, + "step": 18793 + }, + { + "epoch": 1.0549104720564297, + "grad_norm": 5.580071449279785, + "learning_rate": 7.527340002740853e-06, + "loss": 0.1973, + "step": 18794 + }, + { + "epoch": 1.0549240368963646, + "grad_norm": 3.3847496509552, + "learning_rate": 7.5272029601205974e-06, + "loss": 0.2306, + "step": 18795 + }, + { + "epoch": 1.0549376017362995, + "grad_norm": 4.431087493896484, + "learning_rate": 7.5270659175003435e-06, + "loss": 0.2499, + "step": 18796 + }, + { + "epoch": 1.0549511665762343, + "grad_norm": 5.036709308624268, + "learning_rate": 7.526928874880089e-06, + "loss": 0.1891, + "step": 18797 + }, + { + "epoch": 1.0549647314161692, + "grad_norm": 4.168204307556152, + "learning_rate": 7.526791832259833e-06, + "loss": 0.1929, + "step": 18798 + }, + { + "epoch": 1.0549782962561043, + "grad_norm": 4.574951171875, + "learning_rate": 7.526654789639578e-06, + "loss": 0.2501, + "step": 18799 + }, + { + "epoch": 1.0549918610960392, + "grad_norm": 5.851754188537598, + "learning_rate": 7.526517747019324e-06, + "loss": 0.2714, + "step": 18800 + }, + { + "epoch": 1.055005425935974, + "grad_norm": 5.580365180969238, + "learning_rate": 7.5263807043990685e-06, + "loss": 0.3434, + "step": 18801 + }, + { + "epoch": 1.0550189907759089, + "grad_norm": 5.787005424499512, + "learning_rate": 7.526243661778814e-06, + "loss": 0.2298, + "step": 18802 + }, + { + "epoch": 1.0550325556158437, + "grad_norm": 5.490899085998535, + "learning_rate": 7.526106619158559e-06, + "loss": 0.1966, + "step": 18803 + }, + { + "epoch": 1.0550461204557786, + "grad_norm": 6.022565841674805, + "learning_rate": 7.525969576538305e-06, + "loss": 0.2947, + "step": 18804 + }, + { + "epoch": 1.0550596852957135, + "grad_norm": 5.36767053604126, + "learning_rate": 7.525832533918049e-06, + "loss": 0.2346, + "step": 18805 + }, + { + "epoch": 1.0550732501356483, + "grad_norm": 6.123011589050293, + "learning_rate": 7.525695491297794e-06, + "loss": 0.3067, + "step": 18806 + }, + { + "epoch": 1.0550868149755832, + "grad_norm": 5.12809944152832, + "learning_rate": 7.525558448677539e-06, + "loss": 0.2961, + "step": 18807 + }, + { + "epoch": 1.055100379815518, + "grad_norm": 3.5589864253997803, + "learning_rate": 7.525421406057284e-06, + "loss": 0.1362, + "step": 18808 + }, + { + "epoch": 1.0551139446554532, + "grad_norm": 7.680519104003906, + "learning_rate": 7.52528436343703e-06, + "loss": 0.2779, + "step": 18809 + }, + { + "epoch": 1.055127509495388, + "grad_norm": 4.650167942047119, + "learning_rate": 7.525147320816774e-06, + "loss": 0.2711, + "step": 18810 + }, + { + "epoch": 1.055141074335323, + "grad_norm": 5.879973411560059, + "learning_rate": 7.5250102781965194e-06, + "loss": 0.3267, + "step": 18811 + }, + { + "epoch": 1.0551546391752578, + "grad_norm": 6.565334796905518, + "learning_rate": 7.524873235576265e-06, + "loss": 0.3333, + "step": 18812 + }, + { + "epoch": 1.0551682040151926, + "grad_norm": 5.728275299072266, + "learning_rate": 7.524736192956011e-06, + "loss": 0.2917, + "step": 18813 + }, + { + "epoch": 1.0551817688551275, + "grad_norm": 6.096113204956055, + "learning_rate": 7.524599150335755e-06, + "loss": 0.4617, + "step": 18814 + }, + { + "epoch": 1.0551953336950624, + "grad_norm": 4.137061595916748, + "learning_rate": 7.5244621077155e-06, + "loss": 0.2007, + "step": 18815 + }, + { + "epoch": 1.0552088985349972, + "grad_norm": 5.774477481842041, + "learning_rate": 7.5243250650952445e-06, + "loss": 0.2227, + "step": 18816 + }, + { + "epoch": 1.055222463374932, + "grad_norm": 5.103146553039551, + "learning_rate": 7.5241880224749905e-06, + "loss": 0.2925, + "step": 18817 + }, + { + "epoch": 1.0552360282148672, + "grad_norm": 5.37965726852417, + "learning_rate": 7.524050979854736e-06, + "loss": 0.2182, + "step": 18818 + }, + { + "epoch": 1.055249593054802, + "grad_norm": 7.0582499504089355, + "learning_rate": 7.523913937234481e-06, + "loss": 0.3278, + "step": 18819 + }, + { + "epoch": 1.055263157894737, + "grad_norm": 6.272571086883545, + "learning_rate": 7.523776894614225e-06, + "loss": 0.2528, + "step": 18820 + }, + { + "epoch": 1.0552767227346718, + "grad_norm": 3.362006664276123, + "learning_rate": 7.523639851993971e-06, + "loss": 0.1568, + "step": 18821 + }, + { + "epoch": 1.0552902875746066, + "grad_norm": 4.855204105377197, + "learning_rate": 7.523502809373716e-06, + "loss": 0.2709, + "step": 18822 + }, + { + "epoch": 1.0553038524145415, + "grad_norm": 6.344005584716797, + "learning_rate": 7.523365766753461e-06, + "loss": 0.3051, + "step": 18823 + }, + { + "epoch": 1.0553174172544764, + "grad_norm": 4.437247276306152, + "learning_rate": 7.523228724133206e-06, + "loss": 0.2089, + "step": 18824 + }, + { + "epoch": 1.0553309820944112, + "grad_norm": 5.499971389770508, + "learning_rate": 7.52309168151295e-06, + "loss": 0.2256, + "step": 18825 + }, + { + "epoch": 1.055344546934346, + "grad_norm": 4.364219665527344, + "learning_rate": 7.522954638892696e-06, + "loss": 0.148, + "step": 18826 + }, + { + "epoch": 1.055358111774281, + "grad_norm": 4.216881275177002, + "learning_rate": 7.5228175962724415e-06, + "loss": 0.2, + "step": 18827 + }, + { + "epoch": 1.055371676614216, + "grad_norm": 4.6031413078308105, + "learning_rate": 7.522680553652187e-06, + "loss": 0.1694, + "step": 18828 + }, + { + "epoch": 1.055385241454151, + "grad_norm": 5.197521209716797, + "learning_rate": 7.522543511031931e-06, + "loss": 0.3049, + "step": 18829 + }, + { + "epoch": 1.0553988062940858, + "grad_norm": 5.594489574432373, + "learning_rate": 7.522406468411677e-06, + "loss": 0.3801, + "step": 18830 + }, + { + "epoch": 1.0554123711340206, + "grad_norm": 4.99925422668457, + "learning_rate": 7.522269425791422e-06, + "loss": 0.2707, + "step": 18831 + }, + { + "epoch": 1.0554259359739555, + "grad_norm": 5.501832962036133, + "learning_rate": 7.5221323831711665e-06, + "loss": 0.3823, + "step": 18832 + }, + { + "epoch": 1.0554395008138904, + "grad_norm": 3.547463893890381, + "learning_rate": 7.521995340550912e-06, + "loss": 0.1381, + "step": 18833 + }, + { + "epoch": 1.0554530656538252, + "grad_norm": 5.398166656494141, + "learning_rate": 7.521858297930658e-06, + "loss": 0.2074, + "step": 18834 + }, + { + "epoch": 1.05546663049376, + "grad_norm": 5.183849811553955, + "learning_rate": 7.521721255310402e-06, + "loss": 0.2064, + "step": 18835 + }, + { + "epoch": 1.055480195333695, + "grad_norm": 4.726500511169434, + "learning_rate": 7.521584212690147e-06, + "loss": 0.2965, + "step": 18836 + }, + { + "epoch": 1.05549376017363, + "grad_norm": 4.609246253967285, + "learning_rate": 7.521447170069892e-06, + "loss": 0.2342, + "step": 18837 + }, + { + "epoch": 1.055507325013565, + "grad_norm": 9.016136169433594, + "learning_rate": 7.521310127449637e-06, + "loss": 0.4753, + "step": 18838 + }, + { + "epoch": 1.0555208898534998, + "grad_norm": 4.683621883392334, + "learning_rate": 7.521173084829383e-06, + "loss": 0.2483, + "step": 18839 + }, + { + "epoch": 1.0555344546934347, + "grad_norm": 5.374341011047363, + "learning_rate": 7.521036042209128e-06, + "loss": 0.2984, + "step": 18840 + }, + { + "epoch": 1.0555480195333695, + "grad_norm": 4.904206275939941, + "learning_rate": 7.520898999588872e-06, + "loss": 0.2925, + "step": 18841 + }, + { + "epoch": 1.0555615843733044, + "grad_norm": 6.239654541015625, + "learning_rate": 7.5207619569686174e-06, + "loss": 0.315, + "step": 18842 + }, + { + "epoch": 1.0555751492132392, + "grad_norm": 5.398308277130127, + "learning_rate": 7.5206249143483635e-06, + "loss": 0.3193, + "step": 18843 + }, + { + "epoch": 1.0555887140531741, + "grad_norm": 5.664247512817383, + "learning_rate": 7.520487871728109e-06, + "loss": 0.1993, + "step": 18844 + }, + { + "epoch": 1.055602278893109, + "grad_norm": 4.846452236175537, + "learning_rate": 7.520350829107853e-06, + "loss": 0.3071, + "step": 18845 + }, + { + "epoch": 1.0556158437330438, + "grad_norm": 4.448585033416748, + "learning_rate": 7.520213786487598e-06, + "loss": 0.259, + "step": 18846 + }, + { + "epoch": 1.055629408572979, + "grad_norm": 6.638645172119141, + "learning_rate": 7.520076743867344e-06, + "loss": 0.3793, + "step": 18847 + }, + { + "epoch": 1.0556429734129138, + "grad_norm": 4.640426158905029, + "learning_rate": 7.5199397012470885e-06, + "loss": 0.2595, + "step": 18848 + }, + { + "epoch": 1.0556565382528487, + "grad_norm": 5.194497585296631, + "learning_rate": 7.519802658626834e-06, + "loss": 0.1846, + "step": 18849 + }, + { + "epoch": 1.0556701030927835, + "grad_norm": 4.670617580413818, + "learning_rate": 7.519665616006578e-06, + "loss": 0.2241, + "step": 18850 + }, + { + "epoch": 1.0556836679327184, + "grad_norm": 3.3859195709228516, + "learning_rate": 7.519528573386323e-06, + "loss": 0.1924, + "step": 18851 + }, + { + "epoch": 1.0556972327726533, + "grad_norm": 6.413010120391846, + "learning_rate": 7.519391530766069e-06, + "loss": 0.3557, + "step": 18852 + }, + { + "epoch": 1.0557107976125881, + "grad_norm": 3.8759191036224365, + "learning_rate": 7.519254488145814e-06, + "loss": 0.2214, + "step": 18853 + }, + { + "epoch": 1.055724362452523, + "grad_norm": 5.047188758850098, + "learning_rate": 7.519117445525559e-06, + "loss": 0.2335, + "step": 18854 + }, + { + "epoch": 1.0557379272924579, + "grad_norm": 4.861820220947266, + "learning_rate": 7.518980402905304e-06, + "loss": 0.307, + "step": 18855 + }, + { + "epoch": 1.055751492132393, + "grad_norm": 6.6993536949157715, + "learning_rate": 7.51884336028505e-06, + "loss": 0.3899, + "step": 18856 + }, + { + "epoch": 1.0557650569723278, + "grad_norm": 5.536810874938965, + "learning_rate": 7.518706317664794e-06, + "loss": 0.2397, + "step": 18857 + }, + { + "epoch": 1.0557786218122627, + "grad_norm": 4.2248148918151855, + "learning_rate": 7.5185692750445395e-06, + "loss": 0.2016, + "step": 18858 + }, + { + "epoch": 1.0557921866521975, + "grad_norm": 4.398868560791016, + "learning_rate": 7.518432232424284e-06, + "loss": 0.2272, + "step": 18859 + }, + { + "epoch": 1.0558057514921324, + "grad_norm": 4.036266326904297, + "learning_rate": 7.51829518980403e-06, + "loss": 0.1723, + "step": 18860 + }, + { + "epoch": 1.0558193163320673, + "grad_norm": 3.5820040702819824, + "learning_rate": 7.518158147183775e-06, + "loss": 0.1323, + "step": 18861 + }, + { + "epoch": 1.0558328811720021, + "grad_norm": 4.758907794952393, + "learning_rate": 7.51802110456352e-06, + "loss": 0.2634, + "step": 18862 + }, + { + "epoch": 1.055846446011937, + "grad_norm": 5.968446254730225, + "learning_rate": 7.5178840619432645e-06, + "loss": 0.3613, + "step": 18863 + }, + { + "epoch": 1.0558600108518719, + "grad_norm": 3.7167861461639404, + "learning_rate": 7.51774701932301e-06, + "loss": 0.1535, + "step": 18864 + }, + { + "epoch": 1.0558735756918067, + "grad_norm": 3.8060693740844727, + "learning_rate": 7.517609976702756e-06, + "loss": 0.1327, + "step": 18865 + }, + { + "epoch": 1.0558871405317418, + "grad_norm": 4.77638578414917, + "learning_rate": 7.5174729340825e-06, + "loss": 0.2584, + "step": 18866 + }, + { + "epoch": 1.0559007053716767, + "grad_norm": 5.498180389404297, + "learning_rate": 7.517335891462245e-06, + "loss": 0.2194, + "step": 18867 + }, + { + "epoch": 1.0559142702116115, + "grad_norm": 4.401643753051758, + "learning_rate": 7.51719884884199e-06, + "loss": 0.2022, + "step": 18868 + }, + { + "epoch": 1.0559278350515464, + "grad_norm": 6.542230606079102, + "learning_rate": 7.517061806221736e-06, + "loss": 0.2662, + "step": 18869 + }, + { + "epoch": 1.0559413998914813, + "grad_norm": 4.133754730224609, + "learning_rate": 7.516924763601481e-06, + "loss": 0.2353, + "step": 18870 + }, + { + "epoch": 1.0559549647314161, + "grad_norm": 6.539503574371338, + "learning_rate": 7.516787720981226e-06, + "loss": 0.37, + "step": 18871 + }, + { + "epoch": 1.055968529571351, + "grad_norm": 4.718746185302734, + "learning_rate": 7.51665067836097e-06, + "loss": 0.2979, + "step": 18872 + }, + { + "epoch": 1.0559820944112859, + "grad_norm": 5.37339448928833, + "learning_rate": 7.516513635740716e-06, + "loss": 0.2727, + "step": 18873 + }, + { + "epoch": 1.0559956592512207, + "grad_norm": 4.973580360412598, + "learning_rate": 7.5163765931204615e-06, + "loss": 0.264, + "step": 18874 + }, + { + "epoch": 1.0560092240911558, + "grad_norm": 4.121664047241211, + "learning_rate": 7.516239550500206e-06, + "loss": 0.1861, + "step": 18875 + }, + { + "epoch": 1.0560227889310907, + "grad_norm": 5.334728240966797, + "learning_rate": 7.516102507879951e-06, + "loss": 0.2295, + "step": 18876 + }, + { + "epoch": 1.0560363537710256, + "grad_norm": 3.734969139099121, + "learning_rate": 7.515965465259696e-06, + "loss": 0.2295, + "step": 18877 + }, + { + "epoch": 1.0560499186109604, + "grad_norm": 5.425001621246338, + "learning_rate": 7.515828422639442e-06, + "loss": 0.3087, + "step": 18878 + }, + { + "epoch": 1.0560634834508953, + "grad_norm": 5.198612213134766, + "learning_rate": 7.5156913800191865e-06, + "loss": 0.3223, + "step": 18879 + }, + { + "epoch": 1.0560770482908302, + "grad_norm": 4.753841876983643, + "learning_rate": 7.515554337398932e-06, + "loss": 0.2672, + "step": 18880 + }, + { + "epoch": 1.056090613130765, + "grad_norm": 6.66448450088501, + "learning_rate": 7.515417294778676e-06, + "loss": 0.2466, + "step": 18881 + }, + { + "epoch": 1.0561041779706999, + "grad_norm": 5.4272847175598145, + "learning_rate": 7.515280252158422e-06, + "loss": 0.3999, + "step": 18882 + }, + { + "epoch": 1.0561177428106348, + "grad_norm": 3.5346367359161377, + "learning_rate": 7.515143209538167e-06, + "loss": 0.3061, + "step": 18883 + }, + { + "epoch": 1.0561313076505696, + "grad_norm": 5.81073522567749, + "learning_rate": 7.5150061669179116e-06, + "loss": 0.3109, + "step": 18884 + }, + { + "epoch": 1.0561448724905047, + "grad_norm": 4.28381872177124, + "learning_rate": 7.514869124297657e-06, + "loss": 0.1913, + "step": 18885 + }, + { + "epoch": 1.0561584373304396, + "grad_norm": 4.818196773529053, + "learning_rate": 7.514732081677403e-06, + "loss": 0.2527, + "step": 18886 + }, + { + "epoch": 1.0561720021703744, + "grad_norm": 4.402917861938477, + "learning_rate": 7.514595039057148e-06, + "loss": 0.2658, + "step": 18887 + }, + { + "epoch": 1.0561855670103093, + "grad_norm": 6.308120250701904, + "learning_rate": 7.514457996436892e-06, + "loss": 0.3291, + "step": 18888 + }, + { + "epoch": 1.0561991318502442, + "grad_norm": 4.521254062652588, + "learning_rate": 7.5143209538166375e-06, + "loss": 0.2624, + "step": 18889 + }, + { + "epoch": 1.056212696690179, + "grad_norm": 4.767230033874512, + "learning_rate": 7.5141839111963835e-06, + "loss": 0.2828, + "step": 18890 + }, + { + "epoch": 1.056226261530114, + "grad_norm": 3.5443968772888184, + "learning_rate": 7.514046868576128e-06, + "loss": 0.1212, + "step": 18891 + }, + { + "epoch": 1.0562398263700488, + "grad_norm": 6.217761039733887, + "learning_rate": 7.513909825955873e-06, + "loss": 0.4449, + "step": 18892 + }, + { + "epoch": 1.0562533912099836, + "grad_norm": 4.36441707611084, + "learning_rate": 7.513772783335618e-06, + "loss": 0.2062, + "step": 18893 + }, + { + "epoch": 1.0562669560499187, + "grad_norm": 5.408106803894043, + "learning_rate": 7.5136357407153625e-06, + "loss": 0.2466, + "step": 18894 + }, + { + "epoch": 1.0562805208898536, + "grad_norm": 6.701135635375977, + "learning_rate": 7.5134986980951085e-06, + "loss": 0.379, + "step": 18895 + }, + { + "epoch": 1.0562940857297884, + "grad_norm": 4.4619293212890625, + "learning_rate": 7.513361655474854e-06, + "loss": 0.2801, + "step": 18896 + }, + { + "epoch": 1.0563076505697233, + "grad_norm": 5.396537780761719, + "learning_rate": 7.513224612854598e-06, + "loss": 0.2305, + "step": 18897 + }, + { + "epoch": 1.0563212154096582, + "grad_norm": 4.107401371002197, + "learning_rate": 7.513087570234343e-06, + "loss": 0.3296, + "step": 18898 + }, + { + "epoch": 1.056334780249593, + "grad_norm": 5.480745315551758, + "learning_rate": 7.512950527614089e-06, + "loss": 0.2941, + "step": 18899 + }, + { + "epoch": 1.056348345089528, + "grad_norm": 5.868351459503174, + "learning_rate": 7.512813484993834e-06, + "loss": 0.2947, + "step": 18900 + }, + { + "epoch": 1.0563619099294628, + "grad_norm": 4.438015937805176, + "learning_rate": 7.512676442373579e-06, + "loss": 0.2422, + "step": 18901 + }, + { + "epoch": 1.0563754747693976, + "grad_norm": 5.872303009033203, + "learning_rate": 7.512539399753324e-06, + "loss": 0.3127, + "step": 18902 + }, + { + "epoch": 1.0563890396093325, + "grad_norm": 4.781434059143066, + "learning_rate": 7.512402357133069e-06, + "loss": 0.2871, + "step": 18903 + }, + { + "epoch": 1.0564026044492676, + "grad_norm": 3.665402889251709, + "learning_rate": 7.512265314512814e-06, + "loss": 0.1593, + "step": 18904 + }, + { + "epoch": 1.0564161692892025, + "grad_norm": 6.311384677886963, + "learning_rate": 7.5121282718925595e-06, + "loss": 0.3341, + "step": 18905 + }, + { + "epoch": 1.0564297341291373, + "grad_norm": 5.306581020355225, + "learning_rate": 7.511991229272304e-06, + "loss": 0.3902, + "step": 18906 + }, + { + "epoch": 1.0564432989690722, + "grad_norm": 4.744949817657471, + "learning_rate": 7.511854186652049e-06, + "loss": 0.2364, + "step": 18907 + }, + { + "epoch": 1.056456863809007, + "grad_norm": 5.52499532699585, + "learning_rate": 7.511717144031795e-06, + "loss": 0.3118, + "step": 18908 + }, + { + "epoch": 1.056470428648942, + "grad_norm": 6.035385608673096, + "learning_rate": 7.511580101411539e-06, + "loss": 0.363, + "step": 18909 + }, + { + "epoch": 1.0564839934888768, + "grad_norm": 6.0696845054626465, + "learning_rate": 7.5114430587912845e-06, + "loss": 0.3839, + "step": 18910 + }, + { + "epoch": 1.0564975583288116, + "grad_norm": 5.126686096191406, + "learning_rate": 7.51130601617103e-06, + "loss": 0.288, + "step": 18911 + }, + { + "epoch": 1.0565111231687465, + "grad_norm": 5.5692524909973145, + "learning_rate": 7.511168973550776e-06, + "loss": 0.3022, + "step": 18912 + }, + { + "epoch": 1.0565246880086816, + "grad_norm": 7.616581916809082, + "learning_rate": 7.51103193093052e-06, + "loss": 0.5609, + "step": 18913 + }, + { + "epoch": 1.0565382528486165, + "grad_norm": 5.445885181427002, + "learning_rate": 7.510894888310265e-06, + "loss": 0.2782, + "step": 18914 + }, + { + "epoch": 1.0565518176885513, + "grad_norm": 3.902341842651367, + "learning_rate": 7.51075784569001e-06, + "loss": 0.254, + "step": 18915 + }, + { + "epoch": 1.0565653825284862, + "grad_norm": 4.667590141296387, + "learning_rate": 7.510620803069756e-06, + "loss": 0.2268, + "step": 18916 + }, + { + "epoch": 1.056578947368421, + "grad_norm": 5.178542137145996, + "learning_rate": 7.510483760449501e-06, + "loss": 0.3064, + "step": 18917 + }, + { + "epoch": 1.056592512208356, + "grad_norm": 5.00506591796875, + "learning_rate": 7.510346717829245e-06, + "loss": 0.2668, + "step": 18918 + }, + { + "epoch": 1.0566060770482908, + "grad_norm": 4.57186222076416, + "learning_rate": 7.51020967520899e-06, + "loss": 0.2317, + "step": 18919 + }, + { + "epoch": 1.0566196418882257, + "grad_norm": 4.715208530426025, + "learning_rate": 7.5100726325887355e-06, + "loss": 0.3233, + "step": 18920 + }, + { + "epoch": 1.0566332067281605, + "grad_norm": 6.184406757354736, + "learning_rate": 7.5099355899684815e-06, + "loss": 0.3387, + "step": 18921 + }, + { + "epoch": 1.0566467715680954, + "grad_norm": 6.181589603424072, + "learning_rate": 7.509798547348226e-06, + "loss": 0.315, + "step": 18922 + }, + { + "epoch": 1.0566603364080305, + "grad_norm": 6.874151229858398, + "learning_rate": 7.509661504727971e-06, + "loss": 0.349, + "step": 18923 + }, + { + "epoch": 1.0566739012479653, + "grad_norm": 5.342845439910889, + "learning_rate": 7.509524462107715e-06, + "loss": 0.2488, + "step": 18924 + }, + { + "epoch": 1.0566874660879002, + "grad_norm": 8.415392875671387, + "learning_rate": 7.509387419487461e-06, + "loss": 0.4798, + "step": 18925 + }, + { + "epoch": 1.056701030927835, + "grad_norm": 4.726090431213379, + "learning_rate": 7.5092503768672065e-06, + "loss": 0.2695, + "step": 18926 + }, + { + "epoch": 1.05671459576777, + "grad_norm": 5.002720355987549, + "learning_rate": 7.509113334246952e-06, + "loss": 0.2892, + "step": 18927 + }, + { + "epoch": 1.0567281606077048, + "grad_norm": 5.399872303009033, + "learning_rate": 7.508976291626696e-06, + "loss": 0.3054, + "step": 18928 + }, + { + "epoch": 1.0567417254476397, + "grad_norm": 7.068441867828369, + "learning_rate": 7.508839249006442e-06, + "loss": 0.3664, + "step": 18929 + }, + { + "epoch": 1.0567552902875745, + "grad_norm": 6.099366664886475, + "learning_rate": 7.508702206386187e-06, + "loss": 0.3202, + "step": 18930 + }, + { + "epoch": 1.0567688551275094, + "grad_norm": 5.177568435668945, + "learning_rate": 7.508565163765932e-06, + "loss": 0.4557, + "step": 18931 + }, + { + "epoch": 1.0567824199674445, + "grad_norm": 5.995945453643799, + "learning_rate": 7.508428121145677e-06, + "loss": 0.315, + "step": 18932 + }, + { + "epoch": 1.0567959848073794, + "grad_norm": 5.536859512329102, + "learning_rate": 7.508291078525421e-06, + "loss": 0.3781, + "step": 18933 + }, + { + "epoch": 1.0568095496473142, + "grad_norm": 4.937904357910156, + "learning_rate": 7.508154035905167e-06, + "loss": 0.2565, + "step": 18934 + }, + { + "epoch": 1.056823114487249, + "grad_norm": 6.178007125854492, + "learning_rate": 7.508016993284912e-06, + "loss": 0.3339, + "step": 18935 + }, + { + "epoch": 1.056836679327184, + "grad_norm": 4.882397651672363, + "learning_rate": 7.5078799506646575e-06, + "loss": 0.3936, + "step": 18936 + }, + { + "epoch": 1.0568502441671188, + "grad_norm": 6.04365873336792, + "learning_rate": 7.507742908044402e-06, + "loss": 0.4787, + "step": 18937 + }, + { + "epoch": 1.0568638090070537, + "grad_norm": 5.357214450836182, + "learning_rate": 7.507605865424148e-06, + "loss": 0.3343, + "step": 18938 + }, + { + "epoch": 1.0568773738469885, + "grad_norm": 6.1104416847229, + "learning_rate": 7.507468822803893e-06, + "loss": 0.411, + "step": 18939 + }, + { + "epoch": 1.0568909386869234, + "grad_norm": 6.575032711029053, + "learning_rate": 7.507331780183637e-06, + "loss": 0.4028, + "step": 18940 + }, + { + "epoch": 1.0569045035268583, + "grad_norm": 4.4607744216918945, + "learning_rate": 7.5071947375633825e-06, + "loss": 0.2503, + "step": 18941 + }, + { + "epoch": 1.0569180683667934, + "grad_norm": 4.384420394897461, + "learning_rate": 7.5070576949431286e-06, + "loss": 0.2309, + "step": 18942 + }, + { + "epoch": 1.0569316332067282, + "grad_norm": 5.533458232879639, + "learning_rate": 7.506920652322873e-06, + "loss": 0.3052, + "step": 18943 + }, + { + "epoch": 1.056945198046663, + "grad_norm": 6.925340175628662, + "learning_rate": 7.506783609702618e-06, + "loss": 0.4804, + "step": 18944 + }, + { + "epoch": 1.056958762886598, + "grad_norm": 7.661330699920654, + "learning_rate": 7.506646567082363e-06, + "loss": 0.3537, + "step": 18945 + }, + { + "epoch": 1.0569723277265328, + "grad_norm": 7.075419902801514, + "learning_rate": 7.506509524462108e-06, + "loss": 0.409, + "step": 18946 + }, + { + "epoch": 1.0569858925664677, + "grad_norm": 4.632487773895264, + "learning_rate": 7.506372481841854e-06, + "loss": 0.432, + "step": 18947 + }, + { + "epoch": 1.0569994574064026, + "grad_norm": 5.221029758453369, + "learning_rate": 7.506235439221599e-06, + "loss": 0.4271, + "step": 18948 + }, + { + "epoch": 1.0570130222463374, + "grad_norm": 4.5943827629089355, + "learning_rate": 7.506098396601343e-06, + "loss": 0.3002, + "step": 18949 + }, + { + "epoch": 1.0570265870862723, + "grad_norm": 7.129660129547119, + "learning_rate": 7.505961353981088e-06, + "loss": 0.3773, + "step": 18950 + }, + { + "epoch": 1.0570401519262074, + "grad_norm": 4.9593586921691895, + "learning_rate": 7.505824311360834e-06, + "loss": 0.3118, + "step": 18951 + }, + { + "epoch": 1.0570537167661422, + "grad_norm": 5.484456539154053, + "learning_rate": 7.505687268740579e-06, + "loss": 0.2845, + "step": 18952 + }, + { + "epoch": 1.057067281606077, + "grad_norm": 6.0881500244140625, + "learning_rate": 7.505550226120324e-06, + "loss": 0.3671, + "step": 18953 + }, + { + "epoch": 1.057080846446012, + "grad_norm": 5.70030403137207, + "learning_rate": 7.505413183500069e-06, + "loss": 0.2998, + "step": 18954 + }, + { + "epoch": 1.0570944112859468, + "grad_norm": 5.782772541046143, + "learning_rate": 7.505276140879815e-06, + "loss": 0.2087, + "step": 18955 + }, + { + "epoch": 1.0571079761258817, + "grad_norm": 4.884124279022217, + "learning_rate": 7.505139098259559e-06, + "loss": 0.302, + "step": 18956 + }, + { + "epoch": 1.0571215409658166, + "grad_norm": 5.42062520980835, + "learning_rate": 7.5050020556393045e-06, + "loss": 0.2099, + "step": 18957 + }, + { + "epoch": 1.0571351058057514, + "grad_norm": 6.204818248748779, + "learning_rate": 7.504865013019049e-06, + "loss": 0.2581, + "step": 18958 + }, + { + "epoch": 1.0571486706456863, + "grad_norm": 4.092609882354736, + "learning_rate": 7.504727970398795e-06, + "loss": 0.2272, + "step": 18959 + }, + { + "epoch": 1.0571622354856212, + "grad_norm": 5.285808086395264, + "learning_rate": 7.50459092777854e-06, + "loss": 0.2778, + "step": 18960 + }, + { + "epoch": 1.0571758003255562, + "grad_norm": 5.12615442276001, + "learning_rate": 7.504453885158285e-06, + "loss": 0.2883, + "step": 18961 + }, + { + "epoch": 1.0571893651654911, + "grad_norm": 5.46058988571167, + "learning_rate": 7.50431684253803e-06, + "loss": 0.2765, + "step": 18962 + }, + { + "epoch": 1.057202930005426, + "grad_norm": 5.620815277099609, + "learning_rate": 7.504179799917775e-06, + "loss": 0.3375, + "step": 18963 + }, + { + "epoch": 1.0572164948453608, + "grad_norm": 4.362077236175537, + "learning_rate": 7.504042757297521e-06, + "loss": 0.1936, + "step": 18964 + }, + { + "epoch": 1.0572300596852957, + "grad_norm": 5.2529191970825195, + "learning_rate": 7.503905714677265e-06, + "loss": 0.2991, + "step": 18965 + }, + { + "epoch": 1.0572436245252306, + "grad_norm": 6.8435378074646, + "learning_rate": 7.50376867205701e-06, + "loss": 0.3307, + "step": 18966 + }, + { + "epoch": 1.0572571893651654, + "grad_norm": 5.6883392333984375, + "learning_rate": 7.503631629436755e-06, + "loss": 0.3398, + "step": 18967 + }, + { + "epoch": 1.0572707542051003, + "grad_norm": 5.56509256362915, + "learning_rate": 7.503494586816501e-06, + "loss": 0.323, + "step": 18968 + }, + { + "epoch": 1.0572843190450352, + "grad_norm": 6.069455623626709, + "learning_rate": 7.503357544196246e-06, + "loss": 0.2809, + "step": 18969 + }, + { + "epoch": 1.0572978838849703, + "grad_norm": 7.212531089782715, + "learning_rate": 7.503220501575991e-06, + "loss": 0.2563, + "step": 18970 + }, + { + "epoch": 1.0573114487249051, + "grad_norm": 3.5987277030944824, + "learning_rate": 7.503083458955735e-06, + "loss": 0.2279, + "step": 18971 + }, + { + "epoch": 1.05732501356484, + "grad_norm": 7.241856098175049, + "learning_rate": 7.502946416335481e-06, + "loss": 0.3369, + "step": 18972 + }, + { + "epoch": 1.0573385784047749, + "grad_norm": 5.361727237701416, + "learning_rate": 7.5028093737152266e-06, + "loss": 0.2925, + "step": 18973 + }, + { + "epoch": 1.0573521432447097, + "grad_norm": 4.8211236000061035, + "learning_rate": 7.502672331094971e-06, + "loss": 0.221, + "step": 18974 + }, + { + "epoch": 1.0573657080846446, + "grad_norm": 6.682262897491455, + "learning_rate": 7.502535288474716e-06, + "loss": 0.2472, + "step": 18975 + }, + { + "epoch": 1.0573792729245794, + "grad_norm": 6.427426815032959, + "learning_rate": 7.502398245854461e-06, + "loss": 0.2851, + "step": 18976 + }, + { + "epoch": 1.0573928377645143, + "grad_norm": 7.018509387969971, + "learning_rate": 7.5022612032342064e-06, + "loss": 0.3314, + "step": 18977 + }, + { + "epoch": 1.0574064026044492, + "grad_norm": 5.754281044006348, + "learning_rate": 7.502124160613952e-06, + "loss": 0.2932, + "step": 18978 + }, + { + "epoch": 1.057419967444384, + "grad_norm": 4.912890911102295, + "learning_rate": 7.501987117993697e-06, + "loss": 0.2313, + "step": 18979 + }, + { + "epoch": 1.0574335322843191, + "grad_norm": 6.544175148010254, + "learning_rate": 7.501850075373441e-06, + "loss": 0.3489, + "step": 18980 + }, + { + "epoch": 1.057447097124254, + "grad_norm": 5.102042198181152, + "learning_rate": 7.501713032753187e-06, + "loss": 0.3147, + "step": 18981 + }, + { + "epoch": 1.0574606619641889, + "grad_norm": 5.045434951782227, + "learning_rate": 7.501575990132932e-06, + "loss": 0.2909, + "step": 18982 + }, + { + "epoch": 1.0574742268041237, + "grad_norm": 7.496001720428467, + "learning_rate": 7.501438947512677e-06, + "loss": 0.3257, + "step": 18983 + }, + { + "epoch": 1.0574877916440586, + "grad_norm": 6.6433563232421875, + "learning_rate": 7.501301904892422e-06, + "loss": 0.3812, + "step": 18984 + }, + { + "epoch": 1.0575013564839935, + "grad_norm": 7.167738437652588, + "learning_rate": 7.501164862272168e-06, + "loss": 0.3418, + "step": 18985 + }, + { + "epoch": 1.0575149213239283, + "grad_norm": 4.404595851898193, + "learning_rate": 7.501027819651913e-06, + "loss": 0.2064, + "step": 18986 + }, + { + "epoch": 1.0575284861638632, + "grad_norm": 4.575056552886963, + "learning_rate": 7.500890777031657e-06, + "loss": 0.1855, + "step": 18987 + }, + { + "epoch": 1.057542051003798, + "grad_norm": 7.150454998016357, + "learning_rate": 7.5007537344114026e-06, + "loss": 0.3927, + "step": 18988 + }, + { + "epoch": 1.0575556158437331, + "grad_norm": 4.9205827713012695, + "learning_rate": 7.500616691791147e-06, + "loss": 0.2912, + "step": 18989 + }, + { + "epoch": 1.057569180683668, + "grad_norm": 6.309133529663086, + "learning_rate": 7.500479649170893e-06, + "loss": 0.297, + "step": 18990 + }, + { + "epoch": 1.0575827455236029, + "grad_norm": 5.263754844665527, + "learning_rate": 7.500342606550638e-06, + "loss": 0.1599, + "step": 18991 + }, + { + "epoch": 1.0575963103635377, + "grad_norm": 4.201655864715576, + "learning_rate": 7.500205563930382e-06, + "loss": 0.2249, + "step": 18992 + }, + { + "epoch": 1.0576098752034726, + "grad_norm": 8.535462379455566, + "learning_rate": 7.500068521310128e-06, + "loss": 0.4079, + "step": 18993 + }, + { + "epoch": 1.0576234400434075, + "grad_norm": 6.963809490203857, + "learning_rate": 7.499931478689874e-06, + "loss": 0.2728, + "step": 18994 + }, + { + "epoch": 1.0576370048833423, + "grad_norm": 5.077398300170898, + "learning_rate": 7.499794436069619e-06, + "loss": 0.3346, + "step": 18995 + }, + { + "epoch": 1.0576505697232772, + "grad_norm": 6.4559478759765625, + "learning_rate": 7.499657393449363e-06, + "loss": 0.3668, + "step": 18996 + }, + { + "epoch": 1.057664134563212, + "grad_norm": 7.576631546020508, + "learning_rate": 7.499520350829108e-06, + "loss": 0.3333, + "step": 18997 + }, + { + "epoch": 1.057677699403147, + "grad_norm": 6.84419584274292, + "learning_rate": 7.499383308208854e-06, + "loss": 0.3332, + "step": 18998 + }, + { + "epoch": 1.057691264243082, + "grad_norm": 5.1028923988342285, + "learning_rate": 7.499246265588599e-06, + "loss": 0.1906, + "step": 18999 + }, + { + "epoch": 1.0577048290830169, + "grad_norm": 5.281905651092529, + "learning_rate": 7.499109222968344e-06, + "loss": 0.2414, + "step": 19000 + }, + { + "epoch": 1.0577183939229517, + "grad_norm": 6.407622814178467, + "learning_rate": 7.498972180348088e-06, + "loss": 0.338, + "step": 19001 + }, + { + "epoch": 1.0577319587628866, + "grad_norm": 9.978135108947754, + "learning_rate": 7.498835137727833e-06, + "loss": 0.3341, + "step": 19002 + }, + { + "epoch": 1.0577455236028215, + "grad_norm": 6.388052463531494, + "learning_rate": 7.498698095107579e-06, + "loss": 0.3212, + "step": 19003 + }, + { + "epoch": 1.0577590884427563, + "grad_norm": 6.539393901824951, + "learning_rate": 7.4985610524873246e-06, + "loss": 0.2697, + "step": 19004 + }, + { + "epoch": 1.0577726532826912, + "grad_norm": 6.410655975341797, + "learning_rate": 7.498424009867069e-06, + "loss": 0.2934, + "step": 19005 + }, + { + "epoch": 1.057786218122626, + "grad_norm": 7.099924564361572, + "learning_rate": 7.498286967246814e-06, + "loss": 0.3094, + "step": 19006 + }, + { + "epoch": 1.057799782962561, + "grad_norm": 5.8631744384765625, + "learning_rate": 7.49814992462656e-06, + "loss": 0.1748, + "step": 19007 + }, + { + "epoch": 1.057813347802496, + "grad_norm": 5.185716152191162, + "learning_rate": 7.4980128820063044e-06, + "loss": 0.2649, + "step": 19008 + }, + { + "epoch": 1.057826912642431, + "grad_norm": 6.3758039474487305, + "learning_rate": 7.49787583938605e-06, + "loss": 0.3291, + "step": 19009 + }, + { + "epoch": 1.0578404774823658, + "grad_norm": 9.325446128845215, + "learning_rate": 7.497738796765795e-06, + "loss": 0.395, + "step": 19010 + }, + { + "epoch": 1.0578540423223006, + "grad_norm": 6.136246681213379, + "learning_rate": 7.49760175414554e-06, + "loss": 0.2504, + "step": 19011 + }, + { + "epoch": 1.0578676071622355, + "grad_norm": 5.453237056732178, + "learning_rate": 7.497464711525285e-06, + "loss": 0.3122, + "step": 19012 + }, + { + "epoch": 1.0578811720021704, + "grad_norm": 6.52677059173584, + "learning_rate": 7.49732766890503e-06, + "loss": 0.2785, + "step": 19013 + }, + { + "epoch": 1.0578947368421052, + "grad_norm": 6.516359329223633, + "learning_rate": 7.497190626284775e-06, + "loss": 0.3984, + "step": 19014 + }, + { + "epoch": 1.05790830168204, + "grad_norm": 5.315216064453125, + "learning_rate": 7.49705358366452e-06, + "loss": 0.2959, + "step": 19015 + }, + { + "epoch": 1.057921866521975, + "grad_norm": 7.344417572021484, + "learning_rate": 7.496916541044266e-06, + "loss": 0.254, + "step": 19016 + }, + { + "epoch": 1.0579354313619098, + "grad_norm": 5.850594997406006, + "learning_rate": 7.49677949842401e-06, + "loss": 0.346, + "step": 19017 + }, + { + "epoch": 1.057948996201845, + "grad_norm": 5.075804233551025, + "learning_rate": 7.496642455803755e-06, + "loss": 0.2506, + "step": 19018 + }, + { + "epoch": 1.0579625610417798, + "grad_norm": 5.810324192047119, + "learning_rate": 7.4965054131835006e-06, + "loss": 0.2747, + "step": 19019 + }, + { + "epoch": 1.0579761258817146, + "grad_norm": 4.405193328857422, + "learning_rate": 7.496368370563247e-06, + "loss": 0.2181, + "step": 19020 + }, + { + "epoch": 1.0579896907216495, + "grad_norm": 5.361435890197754, + "learning_rate": 7.496231327942991e-06, + "loss": 0.2972, + "step": 19021 + }, + { + "epoch": 1.0580032555615844, + "grad_norm": 5.871457576751709, + "learning_rate": 7.496094285322736e-06, + "loss": 0.2739, + "step": 19022 + }, + { + "epoch": 1.0580168204015192, + "grad_norm": 5.16490364074707, + "learning_rate": 7.4959572427024804e-06, + "loss": 0.1971, + "step": 19023 + }, + { + "epoch": 1.058030385241454, + "grad_norm": 5.315011024475098, + "learning_rate": 7.4958202000822265e-06, + "loss": 0.1538, + "step": 19024 + }, + { + "epoch": 1.058043950081389, + "grad_norm": 4.9722466468811035, + "learning_rate": 7.495683157461972e-06, + "loss": 0.2689, + "step": 19025 + }, + { + "epoch": 1.0580575149213238, + "grad_norm": 4.65140438079834, + "learning_rate": 7.495546114841716e-06, + "loss": 0.1778, + "step": 19026 + }, + { + "epoch": 1.058071079761259, + "grad_norm": 6.786530494689941, + "learning_rate": 7.495409072221461e-06, + "loss": 0.2012, + "step": 19027 + }, + { + "epoch": 1.0580846446011938, + "grad_norm": 5.179846286773682, + "learning_rate": 7.495272029601206e-06, + "loss": 0.237, + "step": 19028 + }, + { + "epoch": 1.0580982094411286, + "grad_norm": 6.14901065826416, + "learning_rate": 7.495134986980952e-06, + "loss": 0.2441, + "step": 19029 + }, + { + "epoch": 1.0581117742810635, + "grad_norm": 6.855720043182373, + "learning_rate": 7.494997944360697e-06, + "loss": 0.2713, + "step": 19030 + }, + { + "epoch": 1.0581253391209984, + "grad_norm": 4.596703052520752, + "learning_rate": 7.494860901740442e-06, + "loss": 0.213, + "step": 19031 + }, + { + "epoch": 1.0581389039609332, + "grad_norm": 5.162325859069824, + "learning_rate": 7.494723859120186e-06, + "loss": 0.2397, + "step": 19032 + }, + { + "epoch": 1.058152468800868, + "grad_norm": 6.173565864562988, + "learning_rate": 7.494586816499932e-06, + "loss": 0.3303, + "step": 19033 + }, + { + "epoch": 1.058166033640803, + "grad_norm": 5.5731024742126465, + "learning_rate": 7.494449773879677e-06, + "loss": 0.2949, + "step": 19034 + }, + { + "epoch": 1.0581795984807378, + "grad_norm": 4.295838832855225, + "learning_rate": 7.4943127312594226e-06, + "loss": 0.2028, + "step": 19035 + }, + { + "epoch": 1.0581931633206727, + "grad_norm": 6.9570088386535645, + "learning_rate": 7.494175688639167e-06, + "loss": 0.3645, + "step": 19036 + }, + { + "epoch": 1.0582067281606078, + "grad_norm": 9.006423950195312, + "learning_rate": 7.494038646018913e-06, + "loss": 0.3951, + "step": 19037 + }, + { + "epoch": 1.0582202930005427, + "grad_norm": 3.9130215644836426, + "learning_rate": 7.493901603398658e-06, + "loss": 0.143, + "step": 19038 + }, + { + "epoch": 1.0582338578404775, + "grad_norm": 7.055917739868164, + "learning_rate": 7.4937645607784024e-06, + "loss": 0.3243, + "step": 19039 + }, + { + "epoch": 1.0582474226804124, + "grad_norm": 5.575588703155518, + "learning_rate": 7.493627518158148e-06, + "loss": 0.2814, + "step": 19040 + }, + { + "epoch": 1.0582609875203473, + "grad_norm": 6.695507526397705, + "learning_rate": 7.493490475537894e-06, + "loss": 0.4179, + "step": 19041 + }, + { + "epoch": 1.0582745523602821, + "grad_norm": 5.480875015258789, + "learning_rate": 7.493353432917638e-06, + "loss": 0.1719, + "step": 19042 + }, + { + "epoch": 1.058288117200217, + "grad_norm": 5.718844890594482, + "learning_rate": 7.493216390297383e-06, + "loss": 0.2745, + "step": 19043 + }, + { + "epoch": 1.0583016820401518, + "grad_norm": 5.322837829589844, + "learning_rate": 7.493079347677128e-06, + "loss": 0.1939, + "step": 19044 + }, + { + "epoch": 1.0583152468800867, + "grad_norm": 6.037799835205078, + "learning_rate": 7.492942305056873e-06, + "loss": 0.3691, + "step": 19045 + }, + { + "epoch": 1.0583288117200218, + "grad_norm": 4.729723930358887, + "learning_rate": 7.492805262436619e-06, + "loss": 0.3028, + "step": 19046 + }, + { + "epoch": 1.0583423765599567, + "grad_norm": 6.231978893280029, + "learning_rate": 7.492668219816364e-06, + "loss": 0.294, + "step": 19047 + }, + { + "epoch": 1.0583559413998915, + "grad_norm": 5.928383827209473, + "learning_rate": 7.492531177196108e-06, + "loss": 0.3732, + "step": 19048 + }, + { + "epoch": 1.0583695062398264, + "grad_norm": 6.255996227264404, + "learning_rate": 7.492394134575853e-06, + "loss": 0.3143, + "step": 19049 + }, + { + "epoch": 1.0583830710797613, + "grad_norm": 5.556724548339844, + "learning_rate": 7.492257091955599e-06, + "loss": 0.2913, + "step": 19050 + }, + { + "epoch": 1.0583966359196961, + "grad_norm": 4.998826503753662, + "learning_rate": 7.492120049335344e-06, + "loss": 0.2789, + "step": 19051 + }, + { + "epoch": 1.058410200759631, + "grad_norm": 4.350419998168945, + "learning_rate": 7.491983006715089e-06, + "loss": 0.2513, + "step": 19052 + }, + { + "epoch": 1.0584237655995659, + "grad_norm": 5.552979469299316, + "learning_rate": 7.491845964094834e-06, + "loss": 0.341, + "step": 19053 + }, + { + "epoch": 1.0584373304395007, + "grad_norm": 4.9884443283081055, + "learning_rate": 7.49170892147458e-06, + "loss": 0.2751, + "step": 19054 + }, + { + "epoch": 1.0584508952794356, + "grad_norm": 6.2879204750061035, + "learning_rate": 7.4915718788543245e-06, + "loss": 0.3688, + "step": 19055 + }, + { + "epoch": 1.0584644601193707, + "grad_norm": 7.701291561126709, + "learning_rate": 7.49143483623407e-06, + "loss": 0.3933, + "step": 19056 + }, + { + "epoch": 1.0584780249593055, + "grad_norm": 5.475775718688965, + "learning_rate": 7.491297793613814e-06, + "loss": 0.236, + "step": 19057 + }, + { + "epoch": 1.0584915897992404, + "grad_norm": 5.52056360244751, + "learning_rate": 7.491160750993559e-06, + "loss": 0.3437, + "step": 19058 + }, + { + "epoch": 1.0585051546391753, + "grad_norm": 6.359504699707031, + "learning_rate": 7.491023708373305e-06, + "loss": 0.3358, + "step": 19059 + }, + { + "epoch": 1.0585187194791101, + "grad_norm": 6.424374580383301, + "learning_rate": 7.4908866657530495e-06, + "loss": 0.3999, + "step": 19060 + }, + { + "epoch": 1.058532284319045, + "grad_norm": 5.922886371612549, + "learning_rate": 7.490749623132795e-06, + "loss": 0.22, + "step": 19061 + }, + { + "epoch": 1.0585458491589799, + "grad_norm": 6.1284685134887695, + "learning_rate": 7.49061258051254e-06, + "loss": 0.3111, + "step": 19062 + }, + { + "epoch": 1.0585594139989147, + "grad_norm": 6.682738780975342, + "learning_rate": 7.490475537892286e-06, + "loss": 0.3499, + "step": 19063 + }, + { + "epoch": 1.0585729788388496, + "grad_norm": 6.830559253692627, + "learning_rate": 7.49033849527203e-06, + "loss": 0.2897, + "step": 19064 + }, + { + "epoch": 1.0585865436787847, + "grad_norm": 5.305764198303223, + "learning_rate": 7.490201452651775e-06, + "loss": 0.2706, + "step": 19065 + }, + { + "epoch": 1.0586001085187196, + "grad_norm": 6.093717575073242, + "learning_rate": 7.49006441003152e-06, + "loss": 0.3578, + "step": 19066 + }, + { + "epoch": 1.0586136733586544, + "grad_norm": 5.401597499847412, + "learning_rate": 7.489927367411266e-06, + "loss": 0.2868, + "step": 19067 + }, + { + "epoch": 1.0586272381985893, + "grad_norm": 5.898196220397949, + "learning_rate": 7.489790324791011e-06, + "loss": 0.3547, + "step": 19068 + }, + { + "epoch": 1.0586408030385241, + "grad_norm": 7.314105987548828, + "learning_rate": 7.489653282170756e-06, + "loss": 0.329, + "step": 19069 + }, + { + "epoch": 1.058654367878459, + "grad_norm": 6.3921122550964355, + "learning_rate": 7.4895162395505004e-06, + "loss": 0.3286, + "step": 19070 + }, + { + "epoch": 1.0586679327183939, + "grad_norm": 6.582440376281738, + "learning_rate": 7.489379196930246e-06, + "loss": 0.3527, + "step": 19071 + }, + { + "epoch": 1.0586814975583287, + "grad_norm": 5.802891254425049, + "learning_rate": 7.489242154309992e-06, + "loss": 0.2668, + "step": 19072 + }, + { + "epoch": 1.0586950623982636, + "grad_norm": 5.089707374572754, + "learning_rate": 7.489105111689736e-06, + "loss": 0.3506, + "step": 19073 + }, + { + "epoch": 1.0587086272381987, + "grad_norm": 7.648629665374756, + "learning_rate": 7.488968069069481e-06, + "loss": 0.3332, + "step": 19074 + }, + { + "epoch": 1.0587221920781336, + "grad_norm": 8.259235382080078, + "learning_rate": 7.4888310264492255e-06, + "loss": 0.4516, + "step": 19075 + }, + { + "epoch": 1.0587357569180684, + "grad_norm": 9.53333854675293, + "learning_rate": 7.4886939838289715e-06, + "loss": 0.5945, + "step": 19076 + }, + { + "epoch": 1.0587493217580033, + "grad_norm": 7.168148517608643, + "learning_rate": 7.488556941208717e-06, + "loss": 0.42, + "step": 19077 + }, + { + "epoch": 1.0587628865979382, + "grad_norm": 6.3553314208984375, + "learning_rate": 7.488419898588462e-06, + "loss": 0.3319, + "step": 19078 + }, + { + "epoch": 1.058776451437873, + "grad_norm": 6.682168960571289, + "learning_rate": 7.488282855968206e-06, + "loss": 0.3404, + "step": 19079 + }, + { + "epoch": 1.0587900162778079, + "grad_norm": 7.570343017578125, + "learning_rate": 7.488145813347952e-06, + "loss": 0.3834, + "step": 19080 + }, + { + "epoch": 1.0588035811177428, + "grad_norm": 6.482065677642822, + "learning_rate": 7.488008770727697e-06, + "loss": 0.385, + "step": 19081 + }, + { + "epoch": 1.0588171459576776, + "grad_norm": 6.480156898498535, + "learning_rate": 7.487871728107442e-06, + "loss": 0.4364, + "step": 19082 + }, + { + "epoch": 1.0588307107976125, + "grad_norm": 30.62212371826172, + "learning_rate": 7.487734685487187e-06, + "loss": 0.3869, + "step": 19083 + }, + { + "epoch": 1.0588442756375476, + "grad_norm": 8.625486373901367, + "learning_rate": 7.487597642866932e-06, + "loss": 0.434, + "step": 19084 + }, + { + "epoch": 1.0588578404774824, + "grad_norm": 4.686809062957764, + "learning_rate": 7.487460600246677e-06, + "loss": 0.2705, + "step": 19085 + }, + { + "epoch": 1.0588714053174173, + "grad_norm": 9.089544296264648, + "learning_rate": 7.4873235576264225e-06, + "loss": 0.3802, + "step": 19086 + }, + { + "epoch": 1.0588849701573522, + "grad_norm": 4.75861120223999, + "learning_rate": 7.487186515006168e-06, + "loss": 0.202, + "step": 19087 + }, + { + "epoch": 1.058898534997287, + "grad_norm": 6.061517715454102, + "learning_rate": 7.487049472385912e-06, + "loss": 0.3259, + "step": 19088 + }, + { + "epoch": 1.058912099837222, + "grad_norm": 5.449348449707031, + "learning_rate": 7.486912429765658e-06, + "loss": 0.1963, + "step": 19089 + }, + { + "epoch": 1.0589256646771568, + "grad_norm": 4.7800679206848145, + "learning_rate": 7.486775387145403e-06, + "loss": 0.2252, + "step": 19090 + }, + { + "epoch": 1.0589392295170916, + "grad_norm": 7.857306480407715, + "learning_rate": 7.4866383445251475e-06, + "loss": 0.4034, + "step": 19091 + }, + { + "epoch": 1.0589527943570265, + "grad_norm": 5.705777168273926, + "learning_rate": 7.486501301904893e-06, + "loss": 0.3052, + "step": 19092 + }, + { + "epoch": 1.0589663591969616, + "grad_norm": 6.190860748291016, + "learning_rate": 7.486364259284639e-06, + "loss": 0.3243, + "step": 19093 + }, + { + "epoch": 1.0589799240368964, + "grad_norm": 4.649007797241211, + "learning_rate": 7.486227216664383e-06, + "loss": 0.229, + "step": 19094 + }, + { + "epoch": 1.0589934888768313, + "grad_norm": 5.281556606292725, + "learning_rate": 7.486090174044128e-06, + "loss": 0.2441, + "step": 19095 + }, + { + "epoch": 1.0590070537167662, + "grad_norm": 6.499075412750244, + "learning_rate": 7.485953131423873e-06, + "loss": 0.2944, + "step": 19096 + }, + { + "epoch": 1.059020618556701, + "grad_norm": 5.282474040985107, + "learning_rate": 7.485816088803618e-06, + "loss": 0.2151, + "step": 19097 + }, + { + "epoch": 1.059034183396636, + "grad_norm": 4.905911922454834, + "learning_rate": 7.485679046183364e-06, + "loss": 0.1535, + "step": 19098 + }, + { + "epoch": 1.0590477482365708, + "grad_norm": 4.57959508895874, + "learning_rate": 7.485542003563109e-06, + "loss": 0.2892, + "step": 19099 + }, + { + "epoch": 1.0590613130765056, + "grad_norm": 5.013232231140137, + "learning_rate": 7.485404960942853e-06, + "loss": 0.2108, + "step": 19100 + }, + { + "epoch": 1.0590748779164405, + "grad_norm": 4.887276649475098, + "learning_rate": 7.4852679183225984e-06, + "loss": 0.1757, + "step": 19101 + }, + { + "epoch": 1.0590884427563754, + "grad_norm": 5.721208095550537, + "learning_rate": 7.4851308757023445e-06, + "loss": 0.231, + "step": 19102 + }, + { + "epoch": 1.0591020075963105, + "grad_norm": 6.344587326049805, + "learning_rate": 7.48499383308209e-06, + "loss": 0.311, + "step": 19103 + }, + { + "epoch": 1.0591155724362453, + "grad_norm": 5.854126453399658, + "learning_rate": 7.484856790461834e-06, + "loss": 0.122, + "step": 19104 + }, + { + "epoch": 1.0591291372761802, + "grad_norm": 5.516735076904297, + "learning_rate": 7.484719747841579e-06, + "loss": 0.2422, + "step": 19105 + }, + { + "epoch": 1.059142702116115, + "grad_norm": 3.972015619277954, + "learning_rate": 7.484582705221325e-06, + "loss": 0.1095, + "step": 19106 + }, + { + "epoch": 1.05915626695605, + "grad_norm": 5.59580135345459, + "learning_rate": 7.4844456626010695e-06, + "loss": 0.2661, + "step": 19107 + }, + { + "epoch": 1.0591698317959848, + "grad_norm": 6.15568733215332, + "learning_rate": 7.484308619980815e-06, + "loss": 0.2865, + "step": 19108 + }, + { + "epoch": 1.0591833966359196, + "grad_norm": 3.9858644008636475, + "learning_rate": 7.484171577360559e-06, + "loss": 0.1762, + "step": 19109 + }, + { + "epoch": 1.0591969614758545, + "grad_norm": 4.596560955047607, + "learning_rate": 7.484034534740305e-06, + "loss": 0.2182, + "step": 19110 + }, + { + "epoch": 1.0592105263157894, + "grad_norm": 5.208958148956299, + "learning_rate": 7.48389749212005e-06, + "loss": 0.1807, + "step": 19111 + }, + { + "epoch": 1.0592240911557245, + "grad_norm": 7.463254928588867, + "learning_rate": 7.483760449499795e-06, + "loss": 0.1975, + "step": 19112 + }, + { + "epoch": 1.0592376559956593, + "grad_norm": 4.638126850128174, + "learning_rate": 7.48362340687954e-06, + "loss": 0.2802, + "step": 19113 + }, + { + "epoch": 1.0592512208355942, + "grad_norm": 4.710082530975342, + "learning_rate": 7.483486364259285e-06, + "loss": 0.2582, + "step": 19114 + }, + { + "epoch": 1.059264785675529, + "grad_norm": 5.466379642486572, + "learning_rate": 7.483349321639031e-06, + "loss": 0.2518, + "step": 19115 + }, + { + "epoch": 1.059278350515464, + "grad_norm": 4.66403341293335, + "learning_rate": 7.483212279018775e-06, + "loss": 0.1981, + "step": 19116 + }, + { + "epoch": 1.0592919153553988, + "grad_norm": 6.4500532150268555, + "learning_rate": 7.4830752363985205e-06, + "loss": 0.3796, + "step": 19117 + }, + { + "epoch": 1.0593054801953337, + "grad_norm": 4.940221786499023, + "learning_rate": 7.482938193778266e-06, + "loss": 0.2185, + "step": 19118 + }, + { + "epoch": 1.0593190450352685, + "grad_norm": 4.484389305114746, + "learning_rate": 7.482801151158011e-06, + "loss": 0.2351, + "step": 19119 + }, + { + "epoch": 1.0593326098752034, + "grad_norm": 5.962414741516113, + "learning_rate": 7.482664108537756e-06, + "loss": 0.258, + "step": 19120 + }, + { + "epoch": 1.0593461747151383, + "grad_norm": 5.029326915740967, + "learning_rate": 7.482527065917501e-06, + "loss": 0.1771, + "step": 19121 + }, + { + "epoch": 1.0593597395550733, + "grad_norm": 4.951271057128906, + "learning_rate": 7.4823900232972455e-06, + "loss": 0.201, + "step": 19122 + }, + { + "epoch": 1.0593733043950082, + "grad_norm": 4.443363666534424, + "learning_rate": 7.4822529806769915e-06, + "loss": 0.1437, + "step": 19123 + }, + { + "epoch": 1.059386869234943, + "grad_norm": 6.484423637390137, + "learning_rate": 7.482115938056737e-06, + "loss": 0.2791, + "step": 19124 + }, + { + "epoch": 1.059400434074878, + "grad_norm": 5.243253231048584, + "learning_rate": 7.481978895436481e-06, + "loss": 0.2085, + "step": 19125 + }, + { + "epoch": 1.0594139989148128, + "grad_norm": 6.437046527862549, + "learning_rate": 7.481841852816226e-06, + "loss": 0.3622, + "step": 19126 + }, + { + "epoch": 1.0594275637547477, + "grad_norm": 4.611726760864258, + "learning_rate": 7.481704810195971e-06, + "loss": 0.1983, + "step": 19127 + }, + { + "epoch": 1.0594411285946825, + "grad_norm": 6.403162479400635, + "learning_rate": 7.4815677675757174e-06, + "loss": 0.2837, + "step": 19128 + }, + { + "epoch": 1.0594546934346174, + "grad_norm": 4.472581386566162, + "learning_rate": 7.481430724955462e-06, + "loss": 0.2178, + "step": 19129 + }, + { + "epoch": 1.0594682582745523, + "grad_norm": 5.0933380126953125, + "learning_rate": 7.481293682335207e-06, + "loss": 0.1948, + "step": 19130 + }, + { + "epoch": 1.0594818231144874, + "grad_norm": 7.198314666748047, + "learning_rate": 7.481156639714951e-06, + "loss": 0.332, + "step": 19131 + }, + { + "epoch": 1.0594953879544222, + "grad_norm": 5.334558963775635, + "learning_rate": 7.481019597094697e-06, + "loss": 0.2715, + "step": 19132 + }, + { + "epoch": 1.059508952794357, + "grad_norm": 5.797457218170166, + "learning_rate": 7.4808825544744425e-06, + "loss": 0.2739, + "step": 19133 + }, + { + "epoch": 1.059522517634292, + "grad_norm": 5.821531295776367, + "learning_rate": 7.480745511854187e-06, + "loss": 0.2995, + "step": 19134 + }, + { + "epoch": 1.0595360824742268, + "grad_norm": 4.7739787101745605, + "learning_rate": 7.480608469233932e-06, + "loss": 0.2466, + "step": 19135 + }, + { + "epoch": 1.0595496473141617, + "grad_norm": 5.226762771606445, + "learning_rate": 7.480471426613678e-06, + "loss": 0.1887, + "step": 19136 + }, + { + "epoch": 1.0595632121540965, + "grad_norm": 5.7418084144592285, + "learning_rate": 7.480334383993423e-06, + "loss": 0.2834, + "step": 19137 + }, + { + "epoch": 1.0595767769940314, + "grad_norm": 4.696787357330322, + "learning_rate": 7.4801973413731675e-06, + "loss": 0.2123, + "step": 19138 + }, + { + "epoch": 1.0595903418339663, + "grad_norm": 7.195002555847168, + "learning_rate": 7.480060298752913e-06, + "loss": 0.3038, + "step": 19139 + }, + { + "epoch": 1.0596039066739011, + "grad_norm": 4.916189670562744, + "learning_rate": 7.479923256132657e-06, + "loss": 0.1805, + "step": 19140 + }, + { + "epoch": 1.0596174715138362, + "grad_norm": 3.8241333961486816, + "learning_rate": 7.479786213512403e-06, + "loss": 0.1097, + "step": 19141 + }, + { + "epoch": 1.059631036353771, + "grad_norm": 5.399200439453125, + "learning_rate": 7.479649170892148e-06, + "loss": 0.2787, + "step": 19142 + }, + { + "epoch": 1.059644601193706, + "grad_norm": 5.6379876136779785, + "learning_rate": 7.4795121282718926e-06, + "loss": 0.2548, + "step": 19143 + }, + { + "epoch": 1.0596581660336408, + "grad_norm": 6.0209221839904785, + "learning_rate": 7.479375085651638e-06, + "loss": 0.2526, + "step": 19144 + }, + { + "epoch": 1.0596717308735757, + "grad_norm": 7.858265399932861, + "learning_rate": 7.479238043031384e-06, + "loss": 0.2455, + "step": 19145 + }, + { + "epoch": 1.0596852957135106, + "grad_norm": 5.43753719329834, + "learning_rate": 7.479101000411129e-06, + "loss": 0.2194, + "step": 19146 + }, + { + "epoch": 1.0596988605534454, + "grad_norm": 4.769888877868652, + "learning_rate": 7.478963957790873e-06, + "loss": 0.2394, + "step": 19147 + }, + { + "epoch": 1.0597124253933803, + "grad_norm": 4.8278679847717285, + "learning_rate": 7.4788269151706185e-06, + "loss": 0.1985, + "step": 19148 + }, + { + "epoch": 1.0597259902333152, + "grad_norm": 4.292165279388428, + "learning_rate": 7.4786898725503645e-06, + "loss": 0.18, + "step": 19149 + }, + { + "epoch": 1.0597395550732502, + "grad_norm": 4.634655952453613, + "learning_rate": 7.478552829930109e-06, + "loss": 0.1453, + "step": 19150 + }, + { + "epoch": 1.059753119913185, + "grad_norm": 4.837255477905273, + "learning_rate": 7.478415787309854e-06, + "loss": 0.2526, + "step": 19151 + }, + { + "epoch": 1.05976668475312, + "grad_norm": 4.33231782913208, + "learning_rate": 7.478278744689599e-06, + "loss": 0.1522, + "step": 19152 + }, + { + "epoch": 1.0597802495930548, + "grad_norm": 5.371460914611816, + "learning_rate": 7.4781417020693435e-06, + "loss": 0.2955, + "step": 19153 + }, + { + "epoch": 1.0597938144329897, + "grad_norm": 6.177471160888672, + "learning_rate": 7.4780046594490895e-06, + "loss": 0.2736, + "step": 19154 + }, + { + "epoch": 1.0598073792729246, + "grad_norm": 4.093869209289551, + "learning_rate": 7.477867616828835e-06, + "loss": 0.1385, + "step": 19155 + }, + { + "epoch": 1.0598209441128594, + "grad_norm": 4.710086822509766, + "learning_rate": 7.477730574208579e-06, + "loss": 0.2873, + "step": 19156 + }, + { + "epoch": 1.0598345089527943, + "grad_norm": 4.89724588394165, + "learning_rate": 7.477593531588324e-06, + "loss": 0.2632, + "step": 19157 + }, + { + "epoch": 1.0598480737927292, + "grad_norm": 3.9589645862579346, + "learning_rate": 7.47745648896807e-06, + "loss": 0.334, + "step": 19158 + }, + { + "epoch": 1.059861638632664, + "grad_norm": 3.9677181243896484, + "learning_rate": 7.477319446347815e-06, + "loss": 0.1846, + "step": 19159 + }, + { + "epoch": 1.0598752034725991, + "grad_norm": 3.4309732913970947, + "learning_rate": 7.47718240372756e-06, + "loss": 0.2189, + "step": 19160 + }, + { + "epoch": 1.059888768312534, + "grad_norm": 4.496744632720947, + "learning_rate": 7.477045361107305e-06, + "loss": 0.2562, + "step": 19161 + }, + { + "epoch": 1.0599023331524688, + "grad_norm": 2.984243392944336, + "learning_rate": 7.476908318487051e-06, + "loss": 0.1274, + "step": 19162 + }, + { + "epoch": 1.0599158979924037, + "grad_norm": 4.545811176300049, + "learning_rate": 7.476771275866795e-06, + "loss": 0.3679, + "step": 19163 + }, + { + "epoch": 1.0599294628323386, + "grad_norm": 5.110123634338379, + "learning_rate": 7.4766342332465405e-06, + "loss": 0.3075, + "step": 19164 + }, + { + "epoch": 1.0599430276722734, + "grad_norm": 5.892577648162842, + "learning_rate": 7.476497190626285e-06, + "loss": 0.2741, + "step": 19165 + }, + { + "epoch": 1.0599565925122083, + "grad_norm": 4.730564117431641, + "learning_rate": 7.47636014800603e-06, + "loss": 0.1485, + "step": 19166 + }, + { + "epoch": 1.0599701573521432, + "grad_norm": 4.4160637855529785, + "learning_rate": 7.476223105385776e-06, + "loss": 0.2603, + "step": 19167 + }, + { + "epoch": 1.059983722192078, + "grad_norm": 6.500504970550537, + "learning_rate": 7.47608606276552e-06, + "loss": 0.4365, + "step": 19168 + }, + { + "epoch": 1.0599972870320131, + "grad_norm": 4.568077087402344, + "learning_rate": 7.4759490201452655e-06, + "loss": 0.2195, + "step": 19169 + }, + { + "epoch": 1.060010851871948, + "grad_norm": 3.7991063594818115, + "learning_rate": 7.475811977525011e-06, + "loss": 0.2461, + "step": 19170 + }, + { + "epoch": 1.0600244167118829, + "grad_norm": 4.056876182556152, + "learning_rate": 7.475674934904757e-06, + "loss": 0.194, + "step": 19171 + }, + { + "epoch": 1.0600379815518177, + "grad_norm": 5.7204484939575195, + "learning_rate": 7.475537892284501e-06, + "loss": 0.2444, + "step": 19172 + }, + { + "epoch": 1.0600515463917526, + "grad_norm": 3.590240240097046, + "learning_rate": 7.475400849664246e-06, + "loss": 0.1531, + "step": 19173 + }, + { + "epoch": 1.0600651112316875, + "grad_norm": 5.018124580383301, + "learning_rate": 7.475263807043991e-06, + "loss": 0.1567, + "step": 19174 + }, + { + "epoch": 1.0600786760716223, + "grad_norm": 5.040228366851807, + "learning_rate": 7.475126764423737e-06, + "loss": 0.2228, + "step": 19175 + }, + { + "epoch": 1.0600922409115572, + "grad_norm": 4.63613224029541, + "learning_rate": 7.474989721803482e-06, + "loss": 0.1984, + "step": 19176 + }, + { + "epoch": 1.060105805751492, + "grad_norm": 4.229454040527344, + "learning_rate": 7.474852679183227e-06, + "loss": 0.2391, + "step": 19177 + }, + { + "epoch": 1.0601193705914271, + "grad_norm": 5.576592445373535, + "learning_rate": 7.474715636562971e-06, + "loss": 0.2701, + "step": 19178 + }, + { + "epoch": 1.060132935431362, + "grad_norm": 5.712947845458984, + "learning_rate": 7.474578593942717e-06, + "loss": 0.2912, + "step": 19179 + }, + { + "epoch": 1.0601465002712969, + "grad_norm": 4.9261016845703125, + "learning_rate": 7.4744415513224625e-06, + "loss": 0.3055, + "step": 19180 + }, + { + "epoch": 1.0601600651112317, + "grad_norm": 6.338593482971191, + "learning_rate": 7.474304508702207e-06, + "loss": 0.2682, + "step": 19181 + }, + { + "epoch": 1.0601736299511666, + "grad_norm": 5.031639099121094, + "learning_rate": 7.474167466081952e-06, + "loss": 0.1949, + "step": 19182 + }, + { + "epoch": 1.0601871947911015, + "grad_norm": 6.326103210449219, + "learning_rate": 7.474030423461696e-06, + "loss": 0.3133, + "step": 19183 + }, + { + "epoch": 1.0602007596310363, + "grad_norm": 4.546731948852539, + "learning_rate": 7.473893380841442e-06, + "loss": 0.1681, + "step": 19184 + }, + { + "epoch": 1.0602143244709712, + "grad_norm": 3.854877471923828, + "learning_rate": 7.4737563382211875e-06, + "loss": 0.1619, + "step": 19185 + }, + { + "epoch": 1.060227889310906, + "grad_norm": 5.5745768547058105, + "learning_rate": 7.473619295600933e-06, + "loss": 0.2538, + "step": 19186 + }, + { + "epoch": 1.060241454150841, + "grad_norm": 5.178686618804932, + "learning_rate": 7.473482252980677e-06, + "loss": 0.2171, + "step": 19187 + }, + { + "epoch": 1.060255018990776, + "grad_norm": 4.47737979888916, + "learning_rate": 7.473345210360423e-06, + "loss": 0.212, + "step": 19188 + }, + { + "epoch": 1.0602685838307109, + "grad_norm": 5.4073638916015625, + "learning_rate": 7.473208167740168e-06, + "loss": 0.2498, + "step": 19189 + }, + { + "epoch": 1.0602821486706457, + "grad_norm": 5.148838520050049, + "learning_rate": 7.473071125119913e-06, + "loss": 0.1906, + "step": 19190 + }, + { + "epoch": 1.0602957135105806, + "grad_norm": 4.248803615570068, + "learning_rate": 7.472934082499658e-06, + "loss": 0.2089, + "step": 19191 + }, + { + "epoch": 1.0603092783505155, + "grad_norm": 4.554885387420654, + "learning_rate": 7.472797039879404e-06, + "loss": 0.1783, + "step": 19192 + }, + { + "epoch": 1.0603228431904503, + "grad_norm": 5.949897289276123, + "learning_rate": 7.472659997259148e-06, + "loss": 0.3459, + "step": 19193 + }, + { + "epoch": 1.0603364080303852, + "grad_norm": 5.831995010375977, + "learning_rate": 7.472522954638893e-06, + "loss": 0.2944, + "step": 19194 + }, + { + "epoch": 1.06034997287032, + "grad_norm": 4.494042873382568, + "learning_rate": 7.4723859120186385e-06, + "loss": 0.2097, + "step": 19195 + }, + { + "epoch": 1.060363537710255, + "grad_norm": 3.907855749130249, + "learning_rate": 7.472248869398383e-06, + "loss": 0.1825, + "step": 19196 + }, + { + "epoch": 1.06037710255019, + "grad_norm": 4.37614107131958, + "learning_rate": 7.472111826778129e-06, + "loss": 0.2114, + "step": 19197 + }, + { + "epoch": 1.0603906673901249, + "grad_norm": 4.629652500152588, + "learning_rate": 7.471974784157874e-06, + "loss": 0.152, + "step": 19198 + }, + { + "epoch": 1.0604042322300598, + "grad_norm": 4.302363872528076, + "learning_rate": 7.471837741537618e-06, + "loss": 0.1947, + "step": 19199 + }, + { + "epoch": 1.0604177970699946, + "grad_norm": 5.717668533325195, + "learning_rate": 7.4717006989173635e-06, + "loss": 0.1856, + "step": 19200 + }, + { + "epoch": 1.0604313619099295, + "grad_norm": 3.125746488571167, + "learning_rate": 7.4715636562971096e-06, + "loss": 0.1365, + "step": 19201 + }, + { + "epoch": 1.0604449267498643, + "grad_norm": 4.291096210479736, + "learning_rate": 7.471426613676854e-06, + "loss": 0.2129, + "step": 19202 + }, + { + "epoch": 1.0604584915897992, + "grad_norm": 4.422873020172119, + "learning_rate": 7.471289571056599e-06, + "loss": 0.2519, + "step": 19203 + }, + { + "epoch": 1.060472056429734, + "grad_norm": 5.986020565032959, + "learning_rate": 7.471152528436344e-06, + "loss": 0.2154, + "step": 19204 + }, + { + "epoch": 1.060485621269669, + "grad_norm": 5.219454288482666, + "learning_rate": 7.47101548581609e-06, + "loss": 0.2689, + "step": 19205 + }, + { + "epoch": 1.0604991861096038, + "grad_norm": 5.844454765319824, + "learning_rate": 7.470878443195835e-06, + "loss": 0.4529, + "step": 19206 + }, + { + "epoch": 1.060512750949539, + "grad_norm": 5.65847635269165, + "learning_rate": 7.47074140057558e-06, + "loss": 0.2063, + "step": 19207 + }, + { + "epoch": 1.0605263157894738, + "grad_norm": 5.035462379455566, + "learning_rate": 7.470604357955324e-06, + "loss": 0.1997, + "step": 19208 + }, + { + "epoch": 1.0605398806294086, + "grad_norm": 5.423879623413086, + "learning_rate": 7.470467315335069e-06, + "loss": 0.2291, + "step": 19209 + }, + { + "epoch": 1.0605534454693435, + "grad_norm": 3.541816473007202, + "learning_rate": 7.470330272714815e-06, + "loss": 0.1835, + "step": 19210 + }, + { + "epoch": 1.0605670103092784, + "grad_norm": 5.225606441497803, + "learning_rate": 7.4701932300945605e-06, + "loss": 0.1923, + "step": 19211 + }, + { + "epoch": 1.0605805751492132, + "grad_norm": 5.075957298278809, + "learning_rate": 7.470056187474305e-06, + "loss": 0.242, + "step": 19212 + }, + { + "epoch": 1.060594139989148, + "grad_norm": 6.201220989227295, + "learning_rate": 7.46991914485405e-06, + "loss": 0.3471, + "step": 19213 + }, + { + "epoch": 1.060607704829083, + "grad_norm": 3.779500961303711, + "learning_rate": 7.469782102233796e-06, + "loss": 0.1627, + "step": 19214 + }, + { + "epoch": 1.0606212696690178, + "grad_norm": 3.4054133892059326, + "learning_rate": 7.46964505961354e-06, + "loss": 0.1862, + "step": 19215 + }, + { + "epoch": 1.060634834508953, + "grad_norm": 7.024600982666016, + "learning_rate": 7.4695080169932856e-06, + "loss": 0.4079, + "step": 19216 + }, + { + "epoch": 1.0606483993488878, + "grad_norm": 5.485069274902344, + "learning_rate": 7.46937097437303e-06, + "loss": 0.2843, + "step": 19217 + }, + { + "epoch": 1.0606619641888226, + "grad_norm": 4.895576477050781, + "learning_rate": 7.469233931752776e-06, + "loss": 0.1837, + "step": 19218 + }, + { + "epoch": 1.0606755290287575, + "grad_norm": 6.412138938903809, + "learning_rate": 7.469096889132521e-06, + "loss": 0.3835, + "step": 19219 + }, + { + "epoch": 1.0606890938686924, + "grad_norm": 6.099839687347412, + "learning_rate": 7.468959846512266e-06, + "loss": 0.4319, + "step": 19220 + }, + { + "epoch": 1.0607026587086272, + "grad_norm": 4.471290588378906, + "learning_rate": 7.468822803892011e-06, + "loss": 0.256, + "step": 19221 + }, + { + "epoch": 1.060716223548562, + "grad_norm": 5.382147789001465, + "learning_rate": 7.468685761271756e-06, + "loss": 0.396, + "step": 19222 + }, + { + "epoch": 1.060729788388497, + "grad_norm": 4.192431449890137, + "learning_rate": 7.468548718651502e-06, + "loss": 0.2303, + "step": 19223 + }, + { + "epoch": 1.0607433532284318, + "grad_norm": 5.330888748168945, + "learning_rate": 7.468411676031246e-06, + "loss": 0.2684, + "step": 19224 + }, + { + "epoch": 1.0607569180683667, + "grad_norm": 3.8752379417419434, + "learning_rate": 7.468274633410991e-06, + "loss": 0.1734, + "step": 19225 + }, + { + "epoch": 1.0607704829083018, + "grad_norm": 5.117772102355957, + "learning_rate": 7.4681375907907365e-06, + "loss": 0.2293, + "step": 19226 + }, + { + "epoch": 1.0607840477482366, + "grad_norm": 4.844027996063232, + "learning_rate": 7.468000548170482e-06, + "loss": 0.2225, + "step": 19227 + }, + { + "epoch": 1.0607976125881715, + "grad_norm": 5.125687599182129, + "learning_rate": 7.467863505550227e-06, + "loss": 0.2142, + "step": 19228 + }, + { + "epoch": 1.0608111774281064, + "grad_norm": 5.7304792404174805, + "learning_rate": 7.467726462929972e-06, + "loss": 0.3446, + "step": 19229 + }, + { + "epoch": 1.0608247422680412, + "grad_norm": 7.778083801269531, + "learning_rate": 7.467589420309716e-06, + "loss": 0.3932, + "step": 19230 + }, + { + "epoch": 1.060838307107976, + "grad_norm": 4.8294148445129395, + "learning_rate": 7.467452377689462e-06, + "loss": 0.267, + "step": 19231 + }, + { + "epoch": 1.060851871947911, + "grad_norm": 8.974101066589355, + "learning_rate": 7.4673153350692076e-06, + "loss": 0.528, + "step": 19232 + }, + { + "epoch": 1.0608654367878458, + "grad_norm": 5.739154815673828, + "learning_rate": 7.467178292448952e-06, + "loss": 0.2483, + "step": 19233 + }, + { + "epoch": 1.0608790016277807, + "grad_norm": 5.056047439575195, + "learning_rate": 7.467041249828697e-06, + "loss": 0.4427, + "step": 19234 + }, + { + "epoch": 1.0608925664677158, + "grad_norm": 5.838449954986572, + "learning_rate": 7.466904207208442e-06, + "loss": 0.4134, + "step": 19235 + }, + { + "epoch": 1.0609061313076507, + "grad_norm": 5.760802268981934, + "learning_rate": 7.4667671645881874e-06, + "loss": 0.3218, + "step": 19236 + }, + { + "epoch": 1.0609196961475855, + "grad_norm": 5.965017795562744, + "learning_rate": 7.466630121967933e-06, + "loss": 0.3109, + "step": 19237 + }, + { + "epoch": 1.0609332609875204, + "grad_norm": 7.641697883605957, + "learning_rate": 7.466493079347678e-06, + "loss": 0.4161, + "step": 19238 + }, + { + "epoch": 1.0609468258274553, + "grad_norm": 4.382580280303955, + "learning_rate": 7.466356036727422e-06, + "loss": 0.2523, + "step": 19239 + }, + { + "epoch": 1.0609603906673901, + "grad_norm": 5.990867614746094, + "learning_rate": 7.466218994107168e-06, + "loss": 0.2285, + "step": 19240 + }, + { + "epoch": 1.060973955507325, + "grad_norm": 7.490899085998535, + "learning_rate": 7.466081951486913e-06, + "loss": 0.4739, + "step": 19241 + }, + { + "epoch": 1.0609875203472598, + "grad_norm": 5.42231559753418, + "learning_rate": 7.465944908866658e-06, + "loss": 0.3229, + "step": 19242 + }, + { + "epoch": 1.0610010851871947, + "grad_norm": 4.780033111572266, + "learning_rate": 7.465807866246403e-06, + "loss": 0.2749, + "step": 19243 + }, + { + "epoch": 1.0610146500271296, + "grad_norm": 4.682584762573242, + "learning_rate": 7.465670823626149e-06, + "loss": 0.3234, + "step": 19244 + }, + { + "epoch": 1.0610282148670647, + "grad_norm": 6.324458122253418, + "learning_rate": 7.465533781005894e-06, + "loss": 0.3116, + "step": 19245 + }, + { + "epoch": 1.0610417797069995, + "grad_norm": 4.4231367111206055, + "learning_rate": 7.465396738385638e-06, + "loss": 0.3195, + "step": 19246 + }, + { + "epoch": 1.0610553445469344, + "grad_norm": 6.285242557525635, + "learning_rate": 7.4652596957653836e-06, + "loss": 0.3656, + "step": 19247 + }, + { + "epoch": 1.0610689093868693, + "grad_norm": 6.388981819152832, + "learning_rate": 7.46512265314513e-06, + "loss": 0.3001, + "step": 19248 + }, + { + "epoch": 1.0610824742268041, + "grad_norm": 5.697317600250244, + "learning_rate": 7.464985610524874e-06, + "loss": 0.3319, + "step": 19249 + }, + { + "epoch": 1.061096039066739, + "grad_norm": 6.066378116607666, + "learning_rate": 7.464848567904619e-06, + "loss": 0.3607, + "step": 19250 + }, + { + "epoch": 1.0611096039066739, + "grad_norm": 5.258832931518555, + "learning_rate": 7.4647115252843634e-06, + "loss": 0.3534, + "step": 19251 + }, + { + "epoch": 1.0611231687466087, + "grad_norm": 7.032448768615723, + "learning_rate": 7.464574482664109e-06, + "loss": 0.3833, + "step": 19252 + }, + { + "epoch": 1.0611367335865436, + "grad_norm": 6.935730457305908, + "learning_rate": 7.464437440043855e-06, + "loss": 0.4263, + "step": 19253 + }, + { + "epoch": 1.0611502984264787, + "grad_norm": 5.904191970825195, + "learning_rate": 7.4643003974236e-06, + "loss": 0.3971, + "step": 19254 + }, + { + "epoch": 1.0611638632664135, + "grad_norm": 5.518466472625732, + "learning_rate": 7.464163354803344e-06, + "loss": 0.3341, + "step": 19255 + }, + { + "epoch": 1.0611774281063484, + "grad_norm": 6.0162577629089355, + "learning_rate": 7.464026312183089e-06, + "loss": 0.3568, + "step": 19256 + }, + { + "epoch": 1.0611909929462833, + "grad_norm": 6.28824520111084, + "learning_rate": 7.463889269562835e-06, + "loss": 0.3687, + "step": 19257 + }, + { + "epoch": 1.0612045577862181, + "grad_norm": 5.560988426208496, + "learning_rate": 7.46375222694258e-06, + "loss": 0.3262, + "step": 19258 + }, + { + "epoch": 1.061218122626153, + "grad_norm": 5.807180404663086, + "learning_rate": 7.463615184322325e-06, + "loss": 0.4872, + "step": 19259 + }, + { + "epoch": 1.0612316874660879, + "grad_norm": 7.004922866821289, + "learning_rate": 7.46347814170207e-06, + "loss": 0.5464, + "step": 19260 + }, + { + "epoch": 1.0612452523060227, + "grad_norm": 6.290289878845215, + "learning_rate": 7.463341099081815e-06, + "loss": 0.3344, + "step": 19261 + }, + { + "epoch": 1.0612588171459576, + "grad_norm": 6.843183517456055, + "learning_rate": 7.46320405646156e-06, + "loss": 0.4303, + "step": 19262 + }, + { + "epoch": 1.0612723819858925, + "grad_norm": 7.445333480834961, + "learning_rate": 7.4630670138413056e-06, + "loss": 0.4715, + "step": 19263 + }, + { + "epoch": 1.0612859468258276, + "grad_norm": 6.1103835105896, + "learning_rate": 7.46292997122105e-06, + "loss": 0.457, + "step": 19264 + }, + { + "epoch": 1.0612995116657624, + "grad_norm": 4.644449710845947, + "learning_rate": 7.462792928600795e-06, + "loss": 0.3023, + "step": 19265 + }, + { + "epoch": 1.0613130765056973, + "grad_norm": 7.191397666931152, + "learning_rate": 7.462655885980541e-06, + "loss": 0.4508, + "step": 19266 + }, + { + "epoch": 1.0613266413456321, + "grad_norm": 6.395044803619385, + "learning_rate": 7.4625188433602854e-06, + "loss": 0.3985, + "step": 19267 + }, + { + "epoch": 1.061340206185567, + "grad_norm": 6.486697673797607, + "learning_rate": 7.462381800740031e-06, + "loss": 0.4034, + "step": 19268 + }, + { + "epoch": 1.0613537710255019, + "grad_norm": 4.1894941329956055, + "learning_rate": 7.462244758119776e-06, + "loss": 0.2797, + "step": 19269 + }, + { + "epoch": 1.0613673358654367, + "grad_norm": 5.550235748291016, + "learning_rate": 7.462107715499521e-06, + "loss": 0.3119, + "step": 19270 + }, + { + "epoch": 1.0613809007053716, + "grad_norm": 6.274193286895752, + "learning_rate": 7.461970672879266e-06, + "loss": 0.4777, + "step": 19271 + }, + { + "epoch": 1.0613944655453065, + "grad_norm": 7.111197471618652, + "learning_rate": 7.461833630259011e-06, + "loss": 0.4822, + "step": 19272 + }, + { + "epoch": 1.0614080303852416, + "grad_norm": 6.063173770904541, + "learning_rate": 7.461696587638756e-06, + "loss": 0.2751, + "step": 19273 + }, + { + "epoch": 1.0614215952251764, + "grad_norm": 4.428518772125244, + "learning_rate": 7.461559545018502e-06, + "loss": 0.2521, + "step": 19274 + }, + { + "epoch": 1.0614351600651113, + "grad_norm": 4.439868927001953, + "learning_rate": 7.461422502398247e-06, + "loss": 0.3985, + "step": 19275 + }, + { + "epoch": 1.0614487249050462, + "grad_norm": 5.438359260559082, + "learning_rate": 7.461285459777991e-06, + "loss": 0.3055, + "step": 19276 + }, + { + "epoch": 1.061462289744981, + "grad_norm": 4.578830718994141, + "learning_rate": 7.461148417157736e-06, + "loss": 0.2665, + "step": 19277 + }, + { + "epoch": 1.061475854584916, + "grad_norm": 5.698948860168457, + "learning_rate": 7.4610113745374816e-06, + "loss": 0.2758, + "step": 19278 + }, + { + "epoch": 1.0614894194248508, + "grad_norm": 4.477277755737305, + "learning_rate": 7.460874331917228e-06, + "loss": 0.2665, + "step": 19279 + }, + { + "epoch": 1.0615029842647856, + "grad_norm": 6.272139549255371, + "learning_rate": 7.460737289296972e-06, + "loss": 0.4282, + "step": 19280 + }, + { + "epoch": 1.0615165491047205, + "grad_norm": 6.456897258758545, + "learning_rate": 7.460600246676717e-06, + "loss": 0.3208, + "step": 19281 + }, + { + "epoch": 1.0615301139446554, + "grad_norm": 4.915979862213135, + "learning_rate": 7.4604632040564614e-06, + "loss": 0.2663, + "step": 19282 + }, + { + "epoch": 1.0615436787845904, + "grad_norm": 7.009235382080078, + "learning_rate": 7.4603261614362075e-06, + "loss": 0.3307, + "step": 19283 + }, + { + "epoch": 1.0615572436245253, + "grad_norm": 5.164477348327637, + "learning_rate": 7.460189118815953e-06, + "loss": 0.3109, + "step": 19284 + }, + { + "epoch": 1.0615708084644602, + "grad_norm": 6.015216827392578, + "learning_rate": 7.460052076195697e-06, + "loss": 0.245, + "step": 19285 + }, + { + "epoch": 1.061584373304395, + "grad_norm": 4.09065580368042, + "learning_rate": 7.459915033575442e-06, + "loss": 0.2037, + "step": 19286 + }, + { + "epoch": 1.06159793814433, + "grad_norm": 4.3173441886901855, + "learning_rate": 7.459777990955188e-06, + "loss": 0.2613, + "step": 19287 + }, + { + "epoch": 1.0616115029842648, + "grad_norm": 6.0137763023376465, + "learning_rate": 7.459640948334933e-06, + "loss": 0.4075, + "step": 19288 + }, + { + "epoch": 1.0616250678241996, + "grad_norm": 6.103264808654785, + "learning_rate": 7.459503905714678e-06, + "loss": 0.443, + "step": 19289 + }, + { + "epoch": 1.0616386326641345, + "grad_norm": 4.864404678344727, + "learning_rate": 7.459366863094423e-06, + "loss": 0.2929, + "step": 19290 + }, + { + "epoch": 1.0616521975040694, + "grad_norm": 5.995112895965576, + "learning_rate": 7.459229820474167e-06, + "loss": 0.3443, + "step": 19291 + }, + { + "epoch": 1.0616657623440044, + "grad_norm": 5.558794021606445, + "learning_rate": 7.459092777853913e-06, + "loss": 0.2878, + "step": 19292 + }, + { + "epoch": 1.0616793271839393, + "grad_norm": 4.577808380126953, + "learning_rate": 7.458955735233658e-06, + "loss": 0.3341, + "step": 19293 + }, + { + "epoch": 1.0616928920238742, + "grad_norm": 6.384937763214111, + "learning_rate": 7.4588186926134036e-06, + "loss": 0.4259, + "step": 19294 + }, + { + "epoch": 1.061706456863809, + "grad_norm": 5.337523937225342, + "learning_rate": 7.458681649993148e-06, + "loss": 0.3781, + "step": 19295 + }, + { + "epoch": 1.061720021703744, + "grad_norm": 5.105709552764893, + "learning_rate": 7.458544607372894e-06, + "loss": 0.345, + "step": 19296 + }, + { + "epoch": 1.0617335865436788, + "grad_norm": 5.173467636108398, + "learning_rate": 7.458407564752639e-06, + "loss": 0.4036, + "step": 19297 + }, + { + "epoch": 1.0617471513836136, + "grad_norm": 5.686624526977539, + "learning_rate": 7.4582705221323834e-06, + "loss": 0.3034, + "step": 19298 + }, + { + "epoch": 1.0617607162235485, + "grad_norm": 5.96480131149292, + "learning_rate": 7.458133479512129e-06, + "loss": 0.329, + "step": 19299 + }, + { + "epoch": 1.0617742810634834, + "grad_norm": 6.826742172241211, + "learning_rate": 7.457996436891875e-06, + "loss": 0.3161, + "step": 19300 + }, + { + "epoch": 1.0617878459034182, + "grad_norm": 5.396653175354004, + "learning_rate": 7.457859394271619e-06, + "loss": 0.3098, + "step": 19301 + }, + { + "epoch": 1.0618014107433533, + "grad_norm": 4.544051170349121, + "learning_rate": 7.457722351651364e-06, + "loss": 0.2653, + "step": 19302 + }, + { + "epoch": 1.0618149755832882, + "grad_norm": 6.022891998291016, + "learning_rate": 7.457585309031109e-06, + "loss": 0.4376, + "step": 19303 + }, + { + "epoch": 1.061828540423223, + "grad_norm": 4.342470169067383, + "learning_rate": 7.457448266410854e-06, + "loss": 0.2545, + "step": 19304 + }, + { + "epoch": 1.061842105263158, + "grad_norm": 7.721620082855225, + "learning_rate": 7.4573112237906e-06, + "loss": 0.4021, + "step": 19305 + }, + { + "epoch": 1.0618556701030928, + "grad_norm": 6.930805683135986, + "learning_rate": 7.457174181170345e-06, + "loss": 0.4106, + "step": 19306 + }, + { + "epoch": 1.0618692349430277, + "grad_norm": 7.800923824310303, + "learning_rate": 7.457037138550089e-06, + "loss": 0.3256, + "step": 19307 + }, + { + "epoch": 1.0618827997829625, + "grad_norm": 4.900252342224121, + "learning_rate": 7.456900095929834e-06, + "loss": 0.2124, + "step": 19308 + }, + { + "epoch": 1.0618963646228974, + "grad_norm": 6.427811145782471, + "learning_rate": 7.45676305330958e-06, + "loss": 0.4572, + "step": 19309 + }, + { + "epoch": 1.0619099294628322, + "grad_norm": 4.830986499786377, + "learning_rate": 7.456626010689325e-06, + "loss": 0.3623, + "step": 19310 + }, + { + "epoch": 1.0619234943027673, + "grad_norm": 5.6008195877075195, + "learning_rate": 7.45648896806907e-06, + "loss": 0.2641, + "step": 19311 + }, + { + "epoch": 1.0619370591427022, + "grad_norm": 5.120242595672607, + "learning_rate": 7.456351925448815e-06, + "loss": 0.3721, + "step": 19312 + }, + { + "epoch": 1.061950623982637, + "grad_norm": 7.571267604827881, + "learning_rate": 7.456214882828561e-06, + "loss": 0.4803, + "step": 19313 + }, + { + "epoch": 1.061964188822572, + "grad_norm": 5.036721706390381, + "learning_rate": 7.4560778402083055e-06, + "loss": 0.2516, + "step": 19314 + }, + { + "epoch": 1.0619777536625068, + "grad_norm": 5.379140853881836, + "learning_rate": 7.455940797588051e-06, + "loss": 0.3238, + "step": 19315 + }, + { + "epoch": 1.0619913185024417, + "grad_norm": 4.714986324310303, + "learning_rate": 7.455803754967795e-06, + "loss": 0.2982, + "step": 19316 + }, + { + "epoch": 1.0620048833423765, + "grad_norm": 5.191289901733398, + "learning_rate": 7.455666712347541e-06, + "loss": 0.309, + "step": 19317 + }, + { + "epoch": 1.0620184481823114, + "grad_norm": 4.9782843589782715, + "learning_rate": 7.455529669727286e-06, + "loss": 0.3027, + "step": 19318 + }, + { + "epoch": 1.0620320130222463, + "grad_norm": 7.776429653167725, + "learning_rate": 7.455392627107031e-06, + "loss": 0.3365, + "step": 19319 + }, + { + "epoch": 1.0620455778621811, + "grad_norm": 6.087592124938965, + "learning_rate": 7.455255584486776e-06, + "loss": 0.347, + "step": 19320 + }, + { + "epoch": 1.0620591427021162, + "grad_norm": 5.45408821105957, + "learning_rate": 7.455118541866521e-06, + "loss": 0.3363, + "step": 19321 + }, + { + "epoch": 1.062072707542051, + "grad_norm": 5.6980299949646, + "learning_rate": 7.454981499246267e-06, + "loss": 0.3278, + "step": 19322 + }, + { + "epoch": 1.062086272381986, + "grad_norm": 4.503848075866699, + "learning_rate": 7.454844456626011e-06, + "loss": 0.264, + "step": 19323 + }, + { + "epoch": 1.0620998372219208, + "grad_norm": 4.366907119750977, + "learning_rate": 7.454707414005756e-06, + "loss": 0.2801, + "step": 19324 + }, + { + "epoch": 1.0621134020618557, + "grad_norm": 4.260225296020508, + "learning_rate": 7.454570371385501e-06, + "loss": 0.2654, + "step": 19325 + }, + { + "epoch": 1.0621269669017905, + "grad_norm": 4.512495040893555, + "learning_rate": 7.454433328765247e-06, + "loss": 0.281, + "step": 19326 + }, + { + "epoch": 1.0621405317417254, + "grad_norm": 6.007624626159668, + "learning_rate": 7.454296286144992e-06, + "loss": 0.2334, + "step": 19327 + }, + { + "epoch": 1.0621540965816603, + "grad_norm": 6.323624134063721, + "learning_rate": 7.454159243524737e-06, + "loss": 0.4113, + "step": 19328 + }, + { + "epoch": 1.0621676614215951, + "grad_norm": 6.08518648147583, + "learning_rate": 7.4540222009044814e-06, + "loss": 0.2582, + "step": 19329 + }, + { + "epoch": 1.0621812262615302, + "grad_norm": 5.085709095001221, + "learning_rate": 7.4538851582842275e-06, + "loss": 0.2686, + "step": 19330 + }, + { + "epoch": 1.062194791101465, + "grad_norm": 3.9248790740966797, + "learning_rate": 7.453748115663973e-06, + "loss": 0.1824, + "step": 19331 + }, + { + "epoch": 1.0622083559414, + "grad_norm": 4.665960311889648, + "learning_rate": 7.453611073043717e-06, + "loss": 0.2474, + "step": 19332 + }, + { + "epoch": 1.0622219207813348, + "grad_norm": 4.3261799812316895, + "learning_rate": 7.453474030423462e-06, + "loss": 0.2752, + "step": 19333 + }, + { + "epoch": 1.0622354856212697, + "grad_norm": 5.800370216369629, + "learning_rate": 7.4533369878032065e-06, + "loss": 0.3552, + "step": 19334 + }, + { + "epoch": 1.0622490504612045, + "grad_norm": 7.719470977783203, + "learning_rate": 7.4531999451829525e-06, + "loss": 0.3294, + "step": 19335 + }, + { + "epoch": 1.0622626153011394, + "grad_norm": 5.77100133895874, + "learning_rate": 7.453062902562698e-06, + "loss": 0.2207, + "step": 19336 + }, + { + "epoch": 1.0622761801410743, + "grad_norm": 4.243401527404785, + "learning_rate": 7.452925859942443e-06, + "loss": 0.3436, + "step": 19337 + }, + { + "epoch": 1.0622897449810091, + "grad_norm": 5.382279396057129, + "learning_rate": 7.452788817322187e-06, + "loss": 0.4169, + "step": 19338 + }, + { + "epoch": 1.062303309820944, + "grad_norm": 6.792388439178467, + "learning_rate": 7.452651774701933e-06, + "loss": 0.2622, + "step": 19339 + }, + { + "epoch": 1.062316874660879, + "grad_norm": 4.457534313201904, + "learning_rate": 7.452514732081678e-06, + "loss": 0.2091, + "step": 19340 + }, + { + "epoch": 1.062330439500814, + "grad_norm": 7.341056823730469, + "learning_rate": 7.452377689461423e-06, + "loss": 0.3774, + "step": 19341 + }, + { + "epoch": 1.0623440043407488, + "grad_norm": 5.402281284332275, + "learning_rate": 7.452240646841168e-06, + "loss": 0.2144, + "step": 19342 + }, + { + "epoch": 1.0623575691806837, + "grad_norm": 6.096193313598633, + "learning_rate": 7.452103604220914e-06, + "loss": 0.3008, + "step": 19343 + }, + { + "epoch": 1.0623711340206186, + "grad_norm": 4.206879615783691, + "learning_rate": 7.451966561600658e-06, + "loss": 0.1872, + "step": 19344 + }, + { + "epoch": 1.0623846988605534, + "grad_norm": 4.074323654174805, + "learning_rate": 7.4518295189804035e-06, + "loss": 0.1956, + "step": 19345 + }, + { + "epoch": 1.0623982637004883, + "grad_norm": 4.156012535095215, + "learning_rate": 7.451692476360149e-06, + "loss": 0.25, + "step": 19346 + }, + { + "epoch": 1.0624118285404232, + "grad_norm": 6.788450241088867, + "learning_rate": 7.451555433739893e-06, + "loss": 0.3447, + "step": 19347 + }, + { + "epoch": 1.062425393380358, + "grad_norm": 6.121490001678467, + "learning_rate": 7.451418391119639e-06, + "loss": 0.2933, + "step": 19348 + }, + { + "epoch": 1.062438958220293, + "grad_norm": 5.170690536499023, + "learning_rate": 7.451281348499384e-06, + "loss": 0.2005, + "step": 19349 + }, + { + "epoch": 1.062452523060228, + "grad_norm": 5.104612827301025, + "learning_rate": 7.4511443058791285e-06, + "loss": 0.2308, + "step": 19350 + }, + { + "epoch": 1.0624660879001628, + "grad_norm": 5.843276500701904, + "learning_rate": 7.451007263258874e-06, + "loss": 0.2395, + "step": 19351 + }, + { + "epoch": 1.0624796527400977, + "grad_norm": 5.379220008850098, + "learning_rate": 7.45087022063862e-06, + "loss": 0.3409, + "step": 19352 + }, + { + "epoch": 1.0624932175800326, + "grad_norm": 6.406810283660889, + "learning_rate": 7.450733178018365e-06, + "loss": 0.3132, + "step": 19353 + }, + { + "epoch": 1.0625067824199674, + "grad_norm": 4.900324821472168, + "learning_rate": 7.450596135398109e-06, + "loss": 0.2275, + "step": 19354 + }, + { + "epoch": 1.0625203472599023, + "grad_norm": 7.3919196128845215, + "learning_rate": 7.450459092777854e-06, + "loss": 0.3528, + "step": 19355 + }, + { + "epoch": 1.0625339120998372, + "grad_norm": 6.91852331161499, + "learning_rate": 7.4503220501576004e-06, + "loss": 0.2515, + "step": 19356 + }, + { + "epoch": 1.062547476939772, + "grad_norm": 4.83200740814209, + "learning_rate": 7.450185007537345e-06, + "loss": 0.1921, + "step": 19357 + }, + { + "epoch": 1.062561041779707, + "grad_norm": 4.114842414855957, + "learning_rate": 7.45004796491709e-06, + "loss": 0.2038, + "step": 19358 + }, + { + "epoch": 1.062574606619642, + "grad_norm": 5.102302551269531, + "learning_rate": 7.449910922296834e-06, + "loss": 0.3469, + "step": 19359 + }, + { + "epoch": 1.0625881714595768, + "grad_norm": 8.34244441986084, + "learning_rate": 7.4497738796765795e-06, + "loss": 0.3335, + "step": 19360 + }, + { + "epoch": 1.0626017362995117, + "grad_norm": 5.094330787658691, + "learning_rate": 7.4496368370563255e-06, + "loss": 0.3123, + "step": 19361 + }, + { + "epoch": 1.0626153011394466, + "grad_norm": 4.564109802246094, + "learning_rate": 7.449499794436071e-06, + "loss": 0.2652, + "step": 19362 + }, + { + "epoch": 1.0626288659793814, + "grad_norm": 4.587156772613525, + "learning_rate": 7.449362751815815e-06, + "loss": 0.2919, + "step": 19363 + }, + { + "epoch": 1.0626424308193163, + "grad_norm": 3.766352891921997, + "learning_rate": 7.44922570919556e-06, + "loss": 0.2058, + "step": 19364 + }, + { + "epoch": 1.0626559956592512, + "grad_norm": 5.286566257476807, + "learning_rate": 7.449088666575306e-06, + "loss": 0.2591, + "step": 19365 + }, + { + "epoch": 1.062669560499186, + "grad_norm": 6.099964618682861, + "learning_rate": 7.4489516239550505e-06, + "loss": 0.4295, + "step": 19366 + }, + { + "epoch": 1.062683125339121, + "grad_norm": 5.471982002258301, + "learning_rate": 7.448814581334796e-06, + "loss": 0.2437, + "step": 19367 + }, + { + "epoch": 1.062696690179056, + "grad_norm": 6.657842636108398, + "learning_rate": 7.448677538714541e-06, + "loss": 0.2399, + "step": 19368 + }, + { + "epoch": 1.0627102550189909, + "grad_norm": 8.173365592956543, + "learning_rate": 7.448540496094286e-06, + "loss": 0.3858, + "step": 19369 + }, + { + "epoch": 1.0627238198589257, + "grad_norm": 6.4402618408203125, + "learning_rate": 7.448403453474031e-06, + "loss": 0.2903, + "step": 19370 + }, + { + "epoch": 1.0627373846988606, + "grad_norm": 5.9408793449401855, + "learning_rate": 7.448266410853776e-06, + "loss": 0.1498, + "step": 19371 + }, + { + "epoch": 1.0627509495387955, + "grad_norm": 5.275583744049072, + "learning_rate": 7.448129368233521e-06, + "loss": 0.3117, + "step": 19372 + }, + { + "epoch": 1.0627645143787303, + "grad_norm": 6.001817226409912, + "learning_rate": 7.447992325613266e-06, + "loss": 0.2201, + "step": 19373 + }, + { + "epoch": 1.0627780792186652, + "grad_norm": 7.896761894226074, + "learning_rate": 7.447855282993012e-06, + "loss": 0.4391, + "step": 19374 + }, + { + "epoch": 1.0627916440586, + "grad_norm": 5.243927955627441, + "learning_rate": 7.447718240372756e-06, + "loss": 0.1719, + "step": 19375 + }, + { + "epoch": 1.062805208898535, + "grad_norm": 4.11509895324707, + "learning_rate": 7.4475811977525015e-06, + "loss": 0.1797, + "step": 19376 + }, + { + "epoch": 1.0628187737384698, + "grad_norm": 4.5413618087768555, + "learning_rate": 7.447444155132247e-06, + "loss": 0.1915, + "step": 19377 + }, + { + "epoch": 1.0628323385784049, + "grad_norm": 5.222373008728027, + "learning_rate": 7.447307112511992e-06, + "loss": 0.2445, + "step": 19378 + }, + { + "epoch": 1.0628459034183397, + "grad_norm": 4.005066871643066, + "learning_rate": 7.447170069891737e-06, + "loss": 0.228, + "step": 19379 + }, + { + "epoch": 1.0628594682582746, + "grad_norm": 6.6395063400268555, + "learning_rate": 7.447033027271482e-06, + "loss": 0.2757, + "step": 19380 + }, + { + "epoch": 1.0628730330982095, + "grad_norm": 6.46602201461792, + "learning_rate": 7.4468959846512265e-06, + "loss": 0.2809, + "step": 19381 + }, + { + "epoch": 1.0628865979381443, + "grad_norm": 3.9648940563201904, + "learning_rate": 7.4467589420309725e-06, + "loss": 0.1673, + "step": 19382 + }, + { + "epoch": 1.0629001627780792, + "grad_norm": 5.047588348388672, + "learning_rate": 7.446621899410718e-06, + "loss": 0.246, + "step": 19383 + }, + { + "epoch": 1.062913727618014, + "grad_norm": 4.721644401550293, + "learning_rate": 7.446484856790462e-06, + "loss": 0.2366, + "step": 19384 + }, + { + "epoch": 1.062927292457949, + "grad_norm": 4.899663925170898, + "learning_rate": 7.446347814170207e-06, + "loss": 0.2038, + "step": 19385 + }, + { + "epoch": 1.0629408572978838, + "grad_norm": 8.440712928771973, + "learning_rate": 7.446210771549953e-06, + "loss": 0.453, + "step": 19386 + }, + { + "epoch": 1.0629544221378189, + "grad_norm": 5.702203750610352, + "learning_rate": 7.4460737289296984e-06, + "loss": 0.3021, + "step": 19387 + }, + { + "epoch": 1.0629679869777537, + "grad_norm": 6.10952091217041, + "learning_rate": 7.445936686309443e-06, + "loss": 0.2993, + "step": 19388 + }, + { + "epoch": 1.0629815518176886, + "grad_norm": 4.8399457931518555, + "learning_rate": 7.445799643689188e-06, + "loss": 0.2477, + "step": 19389 + }, + { + "epoch": 1.0629951166576235, + "grad_norm": 5.575171947479248, + "learning_rate": 7.445662601068932e-06, + "loss": 0.3064, + "step": 19390 + }, + { + "epoch": 1.0630086814975583, + "grad_norm": 7.51041841506958, + "learning_rate": 7.445525558448678e-06, + "loss": 0.3091, + "step": 19391 + }, + { + "epoch": 1.0630222463374932, + "grad_norm": 6.571982383728027, + "learning_rate": 7.4453885158284235e-06, + "loss": 0.2642, + "step": 19392 + }, + { + "epoch": 1.063035811177428, + "grad_norm": 6.149966239929199, + "learning_rate": 7.445251473208168e-06, + "loss": 0.2913, + "step": 19393 + }, + { + "epoch": 1.063049376017363, + "grad_norm": 4.001500129699707, + "learning_rate": 7.445114430587913e-06, + "loss": 0.2972, + "step": 19394 + }, + { + "epoch": 1.0630629408572978, + "grad_norm": 4.4998602867126465, + "learning_rate": 7.444977387967659e-06, + "loss": 0.2597, + "step": 19395 + }, + { + "epoch": 1.0630765056972327, + "grad_norm": 5.478524208068848, + "learning_rate": 7.444840345347404e-06, + "loss": 0.2979, + "step": 19396 + }, + { + "epoch": 1.0630900705371678, + "grad_norm": 6.106320858001709, + "learning_rate": 7.4447033027271485e-06, + "loss": 0.187, + "step": 19397 + }, + { + "epoch": 1.0631036353771026, + "grad_norm": 4.880719184875488, + "learning_rate": 7.444566260106894e-06, + "loss": 0.31, + "step": 19398 + }, + { + "epoch": 1.0631172002170375, + "grad_norm": 5.419113636016846, + "learning_rate": 7.44442921748664e-06, + "loss": 0.3334, + "step": 19399 + }, + { + "epoch": 1.0631307650569723, + "grad_norm": 4.176363468170166, + "learning_rate": 7.444292174866384e-06, + "loss": 0.1609, + "step": 19400 + }, + { + "epoch": 1.0631443298969072, + "grad_norm": 4.702383518218994, + "learning_rate": 7.444155132246129e-06, + "loss": 0.1858, + "step": 19401 + }, + { + "epoch": 1.063157894736842, + "grad_norm": 4.925546646118164, + "learning_rate": 7.4440180896258744e-06, + "loss": 0.1958, + "step": 19402 + }, + { + "epoch": 1.063171459576777, + "grad_norm": 5.0711894035339355, + "learning_rate": 7.443881047005619e-06, + "loss": 0.294, + "step": 19403 + }, + { + "epoch": 1.0631850244167118, + "grad_norm": 6.71936559677124, + "learning_rate": 7.443744004385365e-06, + "loss": 0.2967, + "step": 19404 + }, + { + "epoch": 1.0631985892566467, + "grad_norm": 3.9940683841705322, + "learning_rate": 7.44360696176511e-06, + "loss": 0.1916, + "step": 19405 + }, + { + "epoch": 1.0632121540965818, + "grad_norm": 4.4609785079956055, + "learning_rate": 7.443469919144854e-06, + "loss": 0.1745, + "step": 19406 + }, + { + "epoch": 1.0632257189365166, + "grad_norm": 4.557530879974365, + "learning_rate": 7.4433328765245995e-06, + "loss": 0.2033, + "step": 19407 + }, + { + "epoch": 1.0632392837764515, + "grad_norm": 3.303936243057251, + "learning_rate": 7.4431958339043455e-06, + "loss": 0.201, + "step": 19408 + }, + { + "epoch": 1.0632528486163864, + "grad_norm": 6.159511089324951, + "learning_rate": 7.44305879128409e-06, + "loss": 0.2554, + "step": 19409 + }, + { + "epoch": 1.0632664134563212, + "grad_norm": 5.400402069091797, + "learning_rate": 7.442921748663835e-06, + "loss": 0.2143, + "step": 19410 + }, + { + "epoch": 1.063279978296256, + "grad_norm": 4.4627156257629395, + "learning_rate": 7.44278470604358e-06, + "loss": 0.2329, + "step": 19411 + }, + { + "epoch": 1.063293543136191, + "grad_norm": 4.1752800941467285, + "learning_rate": 7.442647663423325e-06, + "loss": 0.223, + "step": 19412 + }, + { + "epoch": 1.0633071079761258, + "grad_norm": 4.400493621826172, + "learning_rate": 7.4425106208030705e-06, + "loss": 0.1659, + "step": 19413 + }, + { + "epoch": 1.0633206728160607, + "grad_norm": 4.276606559753418, + "learning_rate": 7.442373578182816e-06, + "loss": 0.1976, + "step": 19414 + }, + { + "epoch": 1.0633342376559956, + "grad_norm": 5.670490741729736, + "learning_rate": 7.44223653556256e-06, + "loss": 0.3776, + "step": 19415 + }, + { + "epoch": 1.0633478024959306, + "grad_norm": 4.833752632141113, + "learning_rate": 7.442099492942305e-06, + "loss": 0.2351, + "step": 19416 + }, + { + "epoch": 1.0633613673358655, + "grad_norm": 5.769617080688477, + "learning_rate": 7.441962450322051e-06, + "loss": 0.2724, + "step": 19417 + }, + { + "epoch": 1.0633749321758004, + "grad_norm": 4.569130897521973, + "learning_rate": 7.441825407701796e-06, + "loss": 0.1351, + "step": 19418 + }, + { + "epoch": 1.0633884970157352, + "grad_norm": 5.408354759216309, + "learning_rate": 7.441688365081541e-06, + "loss": 0.2332, + "step": 19419 + }, + { + "epoch": 1.06340206185567, + "grad_norm": 4.18747091293335, + "learning_rate": 7.441551322461286e-06, + "loss": 0.1889, + "step": 19420 + }, + { + "epoch": 1.063415626695605, + "grad_norm": 6.557375907897949, + "learning_rate": 7.441414279841032e-06, + "loss": 0.4163, + "step": 19421 + }, + { + "epoch": 1.0634291915355398, + "grad_norm": 6.249354362487793, + "learning_rate": 7.441277237220776e-06, + "loss": 0.3111, + "step": 19422 + }, + { + "epoch": 1.0634427563754747, + "grad_norm": 6.382862091064453, + "learning_rate": 7.4411401946005215e-06, + "loss": 0.2022, + "step": 19423 + }, + { + "epoch": 1.0634563212154096, + "grad_norm": 5.896036148071289, + "learning_rate": 7.441003151980266e-06, + "loss": 0.2586, + "step": 19424 + }, + { + "epoch": 1.0634698860553446, + "grad_norm": 5.058875560760498, + "learning_rate": 7.440866109360012e-06, + "loss": 0.2557, + "step": 19425 + }, + { + "epoch": 1.0634834508952795, + "grad_norm": 5.630974769592285, + "learning_rate": 7.440729066739757e-06, + "loss": 0.2708, + "step": 19426 + }, + { + "epoch": 1.0634970157352144, + "grad_norm": 5.2604851722717285, + "learning_rate": 7.440592024119501e-06, + "loss": 0.3241, + "step": 19427 + }, + { + "epoch": 1.0635105805751492, + "grad_norm": 5.150784492492676, + "learning_rate": 7.4404549814992465e-06, + "loss": 0.2279, + "step": 19428 + }, + { + "epoch": 1.063524145415084, + "grad_norm": 6.602084636688232, + "learning_rate": 7.440317938878992e-06, + "loss": 0.2801, + "step": 19429 + }, + { + "epoch": 1.063537710255019, + "grad_norm": 6.398077487945557, + "learning_rate": 7.440180896258738e-06, + "loss": 0.2244, + "step": 19430 + }, + { + "epoch": 1.0635512750949538, + "grad_norm": 4.070145606994629, + "learning_rate": 7.440043853638482e-06, + "loss": 0.1587, + "step": 19431 + }, + { + "epoch": 1.0635648399348887, + "grad_norm": 4.925486087799072, + "learning_rate": 7.439906811018227e-06, + "loss": 0.1821, + "step": 19432 + }, + { + "epoch": 1.0635784047748236, + "grad_norm": 5.7268853187561035, + "learning_rate": 7.439769768397972e-06, + "loss": 0.3469, + "step": 19433 + }, + { + "epoch": 1.0635919696147584, + "grad_norm": 5.964259624481201, + "learning_rate": 7.439632725777718e-06, + "loss": 0.1975, + "step": 19434 + }, + { + "epoch": 1.0636055344546935, + "grad_norm": 4.615743160247803, + "learning_rate": 7.439495683157463e-06, + "loss": 0.1446, + "step": 19435 + }, + { + "epoch": 1.0636190992946284, + "grad_norm": 4.571559429168701, + "learning_rate": 7.439358640537208e-06, + "loss": 0.2014, + "step": 19436 + }, + { + "epoch": 1.0636326641345633, + "grad_norm": 4.660514831542969, + "learning_rate": 7.439221597916952e-06, + "loss": 0.2783, + "step": 19437 + }, + { + "epoch": 1.0636462289744981, + "grad_norm": 6.23467493057251, + "learning_rate": 7.439084555296698e-06, + "loss": 0.2053, + "step": 19438 + }, + { + "epoch": 1.063659793814433, + "grad_norm": 8.588798522949219, + "learning_rate": 7.4389475126764435e-06, + "loss": 0.3027, + "step": 19439 + }, + { + "epoch": 1.0636733586543679, + "grad_norm": 4.513881683349609, + "learning_rate": 7.438810470056188e-06, + "loss": 0.1843, + "step": 19440 + }, + { + "epoch": 1.0636869234943027, + "grad_norm": 5.628443241119385, + "learning_rate": 7.438673427435933e-06, + "loss": 0.2972, + "step": 19441 + }, + { + "epoch": 1.0637004883342376, + "grad_norm": 5.568210124969482, + "learning_rate": 7.438536384815677e-06, + "loss": 0.1943, + "step": 19442 + }, + { + "epoch": 1.0637140531741724, + "grad_norm": 4.4884490966796875, + "learning_rate": 7.438399342195423e-06, + "loss": 0.1949, + "step": 19443 + }, + { + "epoch": 1.0637276180141075, + "grad_norm": 4.9381794929504395, + "learning_rate": 7.4382622995751685e-06, + "loss": 0.222, + "step": 19444 + }, + { + "epoch": 1.0637411828540424, + "grad_norm": 3.667214870452881, + "learning_rate": 7.438125256954914e-06, + "loss": 0.1388, + "step": 19445 + }, + { + "epoch": 1.0637547476939773, + "grad_norm": 4.715089321136475, + "learning_rate": 7.437988214334658e-06, + "loss": 0.2358, + "step": 19446 + }, + { + "epoch": 1.0637683125339121, + "grad_norm": 6.013891220092773, + "learning_rate": 7.437851171714404e-06, + "loss": 0.2689, + "step": 19447 + }, + { + "epoch": 1.063781877373847, + "grad_norm": 5.363309383392334, + "learning_rate": 7.437714129094149e-06, + "loss": 0.3326, + "step": 19448 + }, + { + "epoch": 1.0637954422137819, + "grad_norm": 7.719576835632324, + "learning_rate": 7.437577086473894e-06, + "loss": 0.3725, + "step": 19449 + }, + { + "epoch": 1.0638090070537167, + "grad_norm": 5.3447442054748535, + "learning_rate": 7.437440043853639e-06, + "loss": 0.2322, + "step": 19450 + }, + { + "epoch": 1.0638225718936516, + "grad_norm": 4.876293659210205, + "learning_rate": 7.437303001233385e-06, + "loss": 0.221, + "step": 19451 + }, + { + "epoch": 1.0638361367335865, + "grad_norm": 8.927749633789062, + "learning_rate": 7.437165958613129e-06, + "loss": 0.3623, + "step": 19452 + }, + { + "epoch": 1.0638497015735213, + "grad_norm": 5.468628406524658, + "learning_rate": 7.437028915992874e-06, + "loss": 0.2962, + "step": 19453 + }, + { + "epoch": 1.0638632664134564, + "grad_norm": 4.986574172973633, + "learning_rate": 7.4368918733726195e-06, + "loss": 0.2152, + "step": 19454 + }, + { + "epoch": 1.0638768312533913, + "grad_norm": 5.5876288414001465, + "learning_rate": 7.436754830752364e-06, + "loss": 0.2503, + "step": 19455 + }, + { + "epoch": 1.0638903960933261, + "grad_norm": 6.732699871063232, + "learning_rate": 7.43661778813211e-06, + "loss": 0.4121, + "step": 19456 + }, + { + "epoch": 1.063903960933261, + "grad_norm": 5.013633728027344, + "learning_rate": 7.436480745511855e-06, + "loss": 0.1498, + "step": 19457 + }, + { + "epoch": 1.0639175257731959, + "grad_norm": 4.683817386627197, + "learning_rate": 7.436343702891599e-06, + "loss": 0.2172, + "step": 19458 + }, + { + "epoch": 1.0639310906131307, + "grad_norm": 5.384319305419922, + "learning_rate": 7.4362066602713445e-06, + "loss": 0.3246, + "step": 19459 + }, + { + "epoch": 1.0639446554530656, + "grad_norm": 6.123396873474121, + "learning_rate": 7.4360696176510906e-06, + "loss": 0.3832, + "step": 19460 + }, + { + "epoch": 1.0639582202930005, + "grad_norm": 5.1178669929504395, + "learning_rate": 7.435932575030836e-06, + "loss": 0.3629, + "step": 19461 + }, + { + "epoch": 1.0639717851329353, + "grad_norm": 6.77789306640625, + "learning_rate": 7.43579553241058e-06, + "loss": 0.3767, + "step": 19462 + }, + { + "epoch": 1.0639853499728704, + "grad_norm": 5.8327202796936035, + "learning_rate": 7.435658489790325e-06, + "loss": 0.2778, + "step": 19463 + }, + { + "epoch": 1.0639989148128053, + "grad_norm": 4.222850322723389, + "learning_rate": 7.435521447170071e-06, + "loss": 0.1751, + "step": 19464 + }, + { + "epoch": 1.0640124796527402, + "grad_norm": 5.287806510925293, + "learning_rate": 7.435384404549816e-06, + "loss": 0.4444, + "step": 19465 + }, + { + "epoch": 1.064026044492675, + "grad_norm": 4.7192254066467285, + "learning_rate": 7.435247361929561e-06, + "loss": 0.3181, + "step": 19466 + }, + { + "epoch": 1.0640396093326099, + "grad_norm": 3.4822580814361572, + "learning_rate": 7.435110319309305e-06, + "loss": 0.1552, + "step": 19467 + }, + { + "epoch": 1.0640531741725447, + "grad_norm": 6.485044002532959, + "learning_rate": 7.434973276689051e-06, + "loss": 0.3811, + "step": 19468 + }, + { + "epoch": 1.0640667390124796, + "grad_norm": 5.811336040496826, + "learning_rate": 7.434836234068796e-06, + "loss": 0.3361, + "step": 19469 + }, + { + "epoch": 1.0640803038524145, + "grad_norm": 5.234499931335449, + "learning_rate": 7.4346991914485415e-06, + "loss": 0.259, + "step": 19470 + }, + { + "epoch": 1.0640938686923493, + "grad_norm": 5.6398749351501465, + "learning_rate": 7.434562148828286e-06, + "loss": 0.3446, + "step": 19471 + }, + { + "epoch": 1.0641074335322842, + "grad_norm": 5.484809398651123, + "learning_rate": 7.434425106208031e-06, + "loss": 0.3597, + "step": 19472 + }, + { + "epoch": 1.0641209983722193, + "grad_norm": 6.36421537399292, + "learning_rate": 7.434288063587777e-06, + "loss": 0.4067, + "step": 19473 + }, + { + "epoch": 1.0641345632121542, + "grad_norm": 4.874536514282227, + "learning_rate": 7.434151020967521e-06, + "loss": 0.2822, + "step": 19474 + }, + { + "epoch": 1.064148128052089, + "grad_norm": 5.343012809753418, + "learning_rate": 7.4340139783472666e-06, + "loss": 0.2632, + "step": 19475 + }, + { + "epoch": 1.064161692892024, + "grad_norm": 6.201269149780273, + "learning_rate": 7.433876935727011e-06, + "loss": 0.3641, + "step": 19476 + }, + { + "epoch": 1.0641752577319588, + "grad_norm": 4.973264217376709, + "learning_rate": 7.433739893106757e-06, + "loss": 0.3109, + "step": 19477 + }, + { + "epoch": 1.0641888225718936, + "grad_norm": 5.25579309463501, + "learning_rate": 7.433602850486502e-06, + "loss": 0.3492, + "step": 19478 + }, + { + "epoch": 1.0642023874118285, + "grad_norm": 3.185218334197998, + "learning_rate": 7.433465807866247e-06, + "loss": 0.1494, + "step": 19479 + }, + { + "epoch": 1.0642159522517634, + "grad_norm": 4.474177360534668, + "learning_rate": 7.433328765245992e-06, + "loss": 0.409, + "step": 19480 + }, + { + "epoch": 1.0642295170916982, + "grad_norm": 4.473262310028076, + "learning_rate": 7.433191722625738e-06, + "loss": 0.2715, + "step": 19481 + }, + { + "epoch": 1.0642430819316333, + "grad_norm": 6.814645767211914, + "learning_rate": 7.433054680005483e-06, + "loss": 0.3216, + "step": 19482 + }, + { + "epoch": 1.0642566467715682, + "grad_norm": 7.786805629730225, + "learning_rate": 7.432917637385227e-06, + "loss": 0.4952, + "step": 19483 + }, + { + "epoch": 1.064270211611503, + "grad_norm": 5.463968753814697, + "learning_rate": 7.432780594764972e-06, + "loss": 0.2801, + "step": 19484 + }, + { + "epoch": 1.064283776451438, + "grad_norm": 4.855696678161621, + "learning_rate": 7.4326435521447175e-06, + "loss": 0.2975, + "step": 19485 + }, + { + "epoch": 1.0642973412913728, + "grad_norm": 4.219235897064209, + "learning_rate": 7.432506509524463e-06, + "loss": 0.2098, + "step": 19486 + }, + { + "epoch": 1.0643109061313076, + "grad_norm": 4.303225517272949, + "learning_rate": 7.432369466904208e-06, + "loss": 0.2102, + "step": 19487 + }, + { + "epoch": 1.0643244709712425, + "grad_norm": 3.77892804145813, + "learning_rate": 7.432232424283953e-06, + "loss": 0.2139, + "step": 19488 + }, + { + "epoch": 1.0643380358111774, + "grad_norm": 4.357803821563721, + "learning_rate": 7.432095381663697e-06, + "loss": 0.1906, + "step": 19489 + }, + { + "epoch": 1.0643516006511122, + "grad_norm": 5.646161079406738, + "learning_rate": 7.431958339043443e-06, + "loss": 0.2809, + "step": 19490 + }, + { + "epoch": 1.064365165491047, + "grad_norm": 5.006218433380127, + "learning_rate": 7.4318212964231886e-06, + "loss": 0.3318, + "step": 19491 + }, + { + "epoch": 1.0643787303309822, + "grad_norm": 6.657922267913818, + "learning_rate": 7.431684253802933e-06, + "loss": 0.3077, + "step": 19492 + }, + { + "epoch": 1.064392295170917, + "grad_norm": 5.177605628967285, + "learning_rate": 7.431547211182678e-06, + "loss": 0.2714, + "step": 19493 + }, + { + "epoch": 1.064405860010852, + "grad_norm": 3.591872215270996, + "learning_rate": 7.431410168562424e-06, + "loss": 0.2071, + "step": 19494 + }, + { + "epoch": 1.0644194248507868, + "grad_norm": 5.491029739379883, + "learning_rate": 7.431273125942169e-06, + "loss": 0.3013, + "step": 19495 + }, + { + "epoch": 1.0644329896907216, + "grad_norm": 6.999357223510742, + "learning_rate": 7.431136083321914e-06, + "loss": 0.4557, + "step": 19496 + }, + { + "epoch": 1.0644465545306565, + "grad_norm": 4.222102642059326, + "learning_rate": 7.430999040701659e-06, + "loss": 0.2328, + "step": 19497 + }, + { + "epoch": 1.0644601193705914, + "grad_norm": 4.792910575866699, + "learning_rate": 7.430861998081403e-06, + "loss": 0.3099, + "step": 19498 + }, + { + "epoch": 1.0644736842105262, + "grad_norm": 4.908086776733398, + "learning_rate": 7.430724955461149e-06, + "loss": 0.4086, + "step": 19499 + }, + { + "epoch": 1.064487249050461, + "grad_norm": 4.647839069366455, + "learning_rate": 7.430587912840894e-06, + "loss": 0.2868, + "step": 19500 + }, + { + "epoch": 1.0645008138903962, + "grad_norm": 4.394136905670166, + "learning_rate": 7.430450870220639e-06, + "loss": 0.2972, + "step": 19501 + }, + { + "epoch": 1.064514378730331, + "grad_norm": 3.853240966796875, + "learning_rate": 7.430313827600384e-06, + "loss": 0.1802, + "step": 19502 + }, + { + "epoch": 1.064527943570266, + "grad_norm": 5.554488658905029, + "learning_rate": 7.43017678498013e-06, + "loss": 0.3115, + "step": 19503 + }, + { + "epoch": 1.0645415084102008, + "grad_norm": 3.4699461460113525, + "learning_rate": 7.430039742359875e-06, + "loss": 0.249, + "step": 19504 + }, + { + "epoch": 1.0645550732501357, + "grad_norm": 4.373700141906738, + "learning_rate": 7.429902699739619e-06, + "loss": 0.3124, + "step": 19505 + }, + { + "epoch": 1.0645686380900705, + "grad_norm": 3.926605463027954, + "learning_rate": 7.4297656571193646e-06, + "loss": 0.2151, + "step": 19506 + }, + { + "epoch": 1.0645822029300054, + "grad_norm": 4.491905212402344, + "learning_rate": 7.429628614499111e-06, + "loss": 0.2219, + "step": 19507 + }, + { + "epoch": 1.0645957677699402, + "grad_norm": 5.867763996124268, + "learning_rate": 7.429491571878855e-06, + "loss": 0.4127, + "step": 19508 + }, + { + "epoch": 1.0646093326098751, + "grad_norm": 5.399949073791504, + "learning_rate": 7.4293545292586e-06, + "loss": 0.2966, + "step": 19509 + }, + { + "epoch": 1.06462289744981, + "grad_norm": 5.927711009979248, + "learning_rate": 7.429217486638345e-06, + "loss": 0.2542, + "step": 19510 + }, + { + "epoch": 1.064636462289745, + "grad_norm": 4.707756519317627, + "learning_rate": 7.42908044401809e-06, + "loss": 0.2577, + "step": 19511 + }, + { + "epoch": 1.06465002712968, + "grad_norm": 5.330742835998535, + "learning_rate": 7.428943401397836e-06, + "loss": 0.3715, + "step": 19512 + }, + { + "epoch": 1.0646635919696148, + "grad_norm": 7.186373233795166, + "learning_rate": 7.428806358777581e-06, + "loss": 0.3951, + "step": 19513 + }, + { + "epoch": 1.0646771568095497, + "grad_norm": 4.168387413024902, + "learning_rate": 7.428669316157325e-06, + "loss": 0.1723, + "step": 19514 + }, + { + "epoch": 1.0646907216494845, + "grad_norm": 5.9033918380737305, + "learning_rate": 7.42853227353707e-06, + "loss": 0.2738, + "step": 19515 + }, + { + "epoch": 1.0647042864894194, + "grad_norm": 4.5665788650512695, + "learning_rate": 7.428395230916816e-06, + "loss": 0.2616, + "step": 19516 + }, + { + "epoch": 1.0647178513293543, + "grad_norm": 5.718075752258301, + "learning_rate": 7.428258188296561e-06, + "loss": 0.4084, + "step": 19517 + }, + { + "epoch": 1.0647314161692891, + "grad_norm": 4.9982781410217285, + "learning_rate": 7.428121145676306e-06, + "loss": 0.2672, + "step": 19518 + }, + { + "epoch": 1.064744981009224, + "grad_norm": 7.0138373374938965, + "learning_rate": 7.427984103056051e-06, + "loss": 0.2996, + "step": 19519 + }, + { + "epoch": 1.064758545849159, + "grad_norm": 5.819929599761963, + "learning_rate": 7.427847060435796e-06, + "loss": 0.2842, + "step": 19520 + }, + { + "epoch": 1.064772110689094, + "grad_norm": 5.999780178070068, + "learning_rate": 7.427710017815541e-06, + "loss": 0.3605, + "step": 19521 + }, + { + "epoch": 1.0647856755290288, + "grad_norm": 6.759286880493164, + "learning_rate": 7.4275729751952866e-06, + "loss": 0.4436, + "step": 19522 + }, + { + "epoch": 1.0647992403689637, + "grad_norm": 5.064914226531982, + "learning_rate": 7.427435932575031e-06, + "loss": 0.4671, + "step": 19523 + }, + { + "epoch": 1.0648128052088985, + "grad_norm": 6.042903423309326, + "learning_rate": 7.427298889954776e-06, + "loss": 0.3623, + "step": 19524 + }, + { + "epoch": 1.0648263700488334, + "grad_norm": 5.1763787269592285, + "learning_rate": 7.427161847334522e-06, + "loss": 0.3307, + "step": 19525 + }, + { + "epoch": 1.0648399348887683, + "grad_norm": 8.758563041687012, + "learning_rate": 7.4270248047142664e-06, + "loss": 0.7135, + "step": 19526 + }, + { + "epoch": 1.0648534997287031, + "grad_norm": 7.043988227844238, + "learning_rate": 7.426887762094012e-06, + "loss": 0.363, + "step": 19527 + }, + { + "epoch": 1.064867064568638, + "grad_norm": 6.120641708374023, + "learning_rate": 7.426750719473757e-06, + "loss": 0.2687, + "step": 19528 + }, + { + "epoch": 1.0648806294085729, + "grad_norm": 5.929839611053467, + "learning_rate": 7.426613676853503e-06, + "loss": 0.2347, + "step": 19529 + }, + { + "epoch": 1.064894194248508, + "grad_norm": 7.2781147956848145, + "learning_rate": 7.426476634233247e-06, + "loss": 0.2795, + "step": 19530 + }, + { + "epoch": 1.0649077590884428, + "grad_norm": 4.747318267822266, + "learning_rate": 7.426339591612992e-06, + "loss": 0.2786, + "step": 19531 + }, + { + "epoch": 1.0649213239283777, + "grad_norm": 6.7364091873168945, + "learning_rate": 7.426202548992737e-06, + "loss": 0.3388, + "step": 19532 + }, + { + "epoch": 1.0649348887683125, + "grad_norm": 6.053247928619385, + "learning_rate": 7.426065506372483e-06, + "loss": 0.3118, + "step": 19533 + }, + { + "epoch": 1.0649484536082474, + "grad_norm": 8.557053565979004, + "learning_rate": 7.425928463752228e-06, + "loss": 0.5764, + "step": 19534 + }, + { + "epoch": 1.0649620184481823, + "grad_norm": 5.932870388031006, + "learning_rate": 7.425791421131972e-06, + "loss": 0.3354, + "step": 19535 + }, + { + "epoch": 1.0649755832881171, + "grad_norm": 5.186185836791992, + "learning_rate": 7.425654378511717e-06, + "loss": 0.2901, + "step": 19536 + }, + { + "epoch": 1.064989148128052, + "grad_norm": 6.042712688446045, + "learning_rate": 7.425517335891463e-06, + "loss": 0.3912, + "step": 19537 + }, + { + "epoch": 1.0650027129679869, + "grad_norm": 5.428230285644531, + "learning_rate": 7.425380293271209e-06, + "loss": 0.2885, + "step": 19538 + }, + { + "epoch": 1.065016277807922, + "grad_norm": 5.063807964324951, + "learning_rate": 7.425243250650953e-06, + "loss": 0.221, + "step": 19539 + }, + { + "epoch": 1.0650298426478568, + "grad_norm": 6.75437068939209, + "learning_rate": 7.425106208030698e-06, + "loss": 0.4194, + "step": 19540 + }, + { + "epoch": 1.0650434074877917, + "grad_norm": 5.578726768493652, + "learning_rate": 7.4249691654104424e-06, + "loss": 0.3488, + "step": 19541 + }, + { + "epoch": 1.0650569723277266, + "grad_norm": 6.85921049118042, + "learning_rate": 7.4248321227901885e-06, + "loss": 0.3507, + "step": 19542 + }, + { + "epoch": 1.0650705371676614, + "grad_norm": 5.404152870178223, + "learning_rate": 7.424695080169934e-06, + "loss": 0.2845, + "step": 19543 + }, + { + "epoch": 1.0650841020075963, + "grad_norm": 4.91007661819458, + "learning_rate": 7.424558037549679e-06, + "loss": 0.4622, + "step": 19544 + }, + { + "epoch": 1.0650976668475312, + "grad_norm": 5.111151218414307, + "learning_rate": 7.424420994929423e-06, + "loss": 0.3811, + "step": 19545 + }, + { + "epoch": 1.065111231687466, + "grad_norm": 5.411294460296631, + "learning_rate": 7.424283952309169e-06, + "loss": 0.3035, + "step": 19546 + }, + { + "epoch": 1.0651247965274009, + "grad_norm": 5.388630390167236, + "learning_rate": 7.424146909688914e-06, + "loss": 0.2739, + "step": 19547 + }, + { + "epoch": 1.0651383613673358, + "grad_norm": 6.475419521331787, + "learning_rate": 7.424009867068659e-06, + "loss": 0.4091, + "step": 19548 + }, + { + "epoch": 1.0651519262072708, + "grad_norm": 4.731623649597168, + "learning_rate": 7.423872824448404e-06, + "loss": 0.2362, + "step": 19549 + }, + { + "epoch": 1.0651654910472057, + "grad_norm": 6.860350608825684, + "learning_rate": 7.42373578182815e-06, + "loss": 0.5124, + "step": 19550 + }, + { + "epoch": 1.0651790558871406, + "grad_norm": 6.27835750579834, + "learning_rate": 7.423598739207894e-06, + "loss": 0.3616, + "step": 19551 + }, + { + "epoch": 1.0651926207270754, + "grad_norm": 5.834394454956055, + "learning_rate": 7.423461696587639e-06, + "loss": 0.2926, + "step": 19552 + }, + { + "epoch": 1.0652061855670103, + "grad_norm": 5.874218940734863, + "learning_rate": 7.4233246539673846e-06, + "loss": 0.3162, + "step": 19553 + }, + { + "epoch": 1.0652197504069452, + "grad_norm": 8.91302490234375, + "learning_rate": 7.423187611347129e-06, + "loss": 0.3999, + "step": 19554 + }, + { + "epoch": 1.06523331524688, + "grad_norm": 5.2881693840026855, + "learning_rate": 7.423050568726875e-06, + "loss": 0.3354, + "step": 19555 + }, + { + "epoch": 1.065246880086815, + "grad_norm": 8.596586227416992, + "learning_rate": 7.42291352610662e-06, + "loss": 0.2247, + "step": 19556 + }, + { + "epoch": 1.0652604449267498, + "grad_norm": 6.2704315185546875, + "learning_rate": 7.4227764834863644e-06, + "loss": 0.2499, + "step": 19557 + }, + { + "epoch": 1.0652740097666848, + "grad_norm": 7.202363967895508, + "learning_rate": 7.42263944086611e-06, + "loss": 0.4223, + "step": 19558 + }, + { + "epoch": 1.0652875746066197, + "grad_norm": 4.514922142028809, + "learning_rate": 7.422502398245856e-06, + "loss": 0.2197, + "step": 19559 + }, + { + "epoch": 1.0653011394465546, + "grad_norm": 4.205116271972656, + "learning_rate": 7.4223653556256e-06, + "loss": 0.3025, + "step": 19560 + }, + { + "epoch": 1.0653147042864894, + "grad_norm": 4.971843242645264, + "learning_rate": 7.422228313005345e-06, + "loss": 0.2017, + "step": 19561 + }, + { + "epoch": 1.0653282691264243, + "grad_norm": 6.635756969451904, + "learning_rate": 7.42209127038509e-06, + "loss": 0.28, + "step": 19562 + }, + { + "epoch": 1.0653418339663592, + "grad_norm": 4.615896224975586, + "learning_rate": 7.421954227764836e-06, + "loss": 0.3449, + "step": 19563 + }, + { + "epoch": 1.065355398806294, + "grad_norm": 6.292897701263428, + "learning_rate": 7.421817185144581e-06, + "loss": 0.4, + "step": 19564 + }, + { + "epoch": 1.065368963646229, + "grad_norm": 4.399659633636475, + "learning_rate": 7.421680142524326e-06, + "loss": 0.284, + "step": 19565 + }, + { + "epoch": 1.0653825284861638, + "grad_norm": 4.499423503875732, + "learning_rate": 7.42154309990407e-06, + "loss": 0.2815, + "step": 19566 + }, + { + "epoch": 1.0653960933260986, + "grad_norm": 6.392192363739014, + "learning_rate": 7.421406057283815e-06, + "loss": 0.2396, + "step": 19567 + }, + { + "epoch": 1.0654096581660337, + "grad_norm": 3.988447904586792, + "learning_rate": 7.421269014663561e-06, + "loss": 0.1688, + "step": 19568 + }, + { + "epoch": 1.0654232230059686, + "grad_norm": 6.682200908660889, + "learning_rate": 7.421131972043306e-06, + "loss": 0.4921, + "step": 19569 + }, + { + "epoch": 1.0654367878459035, + "grad_norm": 4.821453094482422, + "learning_rate": 7.420994929423051e-06, + "loss": 0.2959, + "step": 19570 + }, + { + "epoch": 1.0654503526858383, + "grad_norm": 3.713205337524414, + "learning_rate": 7.420857886802796e-06, + "loss": 0.2859, + "step": 19571 + }, + { + "epoch": 1.0654639175257732, + "grad_norm": 3.8377952575683594, + "learning_rate": 7.420720844182542e-06, + "loss": 0.2633, + "step": 19572 + }, + { + "epoch": 1.065477482365708, + "grad_norm": 5.934772491455078, + "learning_rate": 7.4205838015622865e-06, + "loss": 0.3826, + "step": 19573 + }, + { + "epoch": 1.065491047205643, + "grad_norm": 4.170566082000732, + "learning_rate": 7.420446758942032e-06, + "loss": 0.1987, + "step": 19574 + }, + { + "epoch": 1.0655046120455778, + "grad_norm": 3.6777946949005127, + "learning_rate": 7.420309716321776e-06, + "loss": 0.1936, + "step": 19575 + }, + { + "epoch": 1.0655181768855129, + "grad_norm": 3.7565321922302246, + "learning_rate": 7.420172673701522e-06, + "loss": 0.2146, + "step": 19576 + }, + { + "epoch": 1.0655317417254477, + "grad_norm": 5.388603687286377, + "learning_rate": 7.420035631081267e-06, + "loss": 0.4348, + "step": 19577 + }, + { + "epoch": 1.0655453065653826, + "grad_norm": 4.75878381729126, + "learning_rate": 7.419898588461012e-06, + "loss": 0.2228, + "step": 19578 + }, + { + "epoch": 1.0655588714053175, + "grad_norm": 5.9474897384643555, + "learning_rate": 7.419761545840757e-06, + "loss": 0.4007, + "step": 19579 + }, + { + "epoch": 1.0655724362452523, + "grad_norm": 3.446223020553589, + "learning_rate": 7.419624503220502e-06, + "loss": 0.2131, + "step": 19580 + }, + { + "epoch": 1.0655860010851872, + "grad_norm": 3.4842522144317627, + "learning_rate": 7.419487460600248e-06, + "loss": 0.1944, + "step": 19581 + }, + { + "epoch": 1.065599565925122, + "grad_norm": 5.850998401641846, + "learning_rate": 7.419350417979992e-06, + "loss": 0.371, + "step": 19582 + }, + { + "epoch": 1.065613130765057, + "grad_norm": 5.452755451202393, + "learning_rate": 7.419213375359737e-06, + "loss": 0.2995, + "step": 19583 + }, + { + "epoch": 1.0656266956049918, + "grad_norm": 5.85765266418457, + "learning_rate": 7.419076332739482e-06, + "loss": 0.3299, + "step": 19584 + }, + { + "epoch": 1.0656402604449267, + "grad_norm": 4.897297382354736, + "learning_rate": 7.418939290119228e-06, + "loss": 0.4286, + "step": 19585 + }, + { + "epoch": 1.0656538252848615, + "grad_norm": 3.9487104415893555, + "learning_rate": 7.418802247498973e-06, + "loss": 0.2181, + "step": 19586 + }, + { + "epoch": 1.0656673901247966, + "grad_norm": 6.548304080963135, + "learning_rate": 7.418665204878718e-06, + "loss": 0.2742, + "step": 19587 + }, + { + "epoch": 1.0656809549647315, + "grad_norm": 3.5599420070648193, + "learning_rate": 7.4185281622584624e-06, + "loss": 0.179, + "step": 19588 + }, + { + "epoch": 1.0656945198046663, + "grad_norm": 4.217906951904297, + "learning_rate": 7.4183911196382085e-06, + "loss": 0.2133, + "step": 19589 + }, + { + "epoch": 1.0657080846446012, + "grad_norm": 6.338998794555664, + "learning_rate": 7.418254077017954e-06, + "loss": 0.2657, + "step": 19590 + }, + { + "epoch": 1.065721649484536, + "grad_norm": 5.443522930145264, + "learning_rate": 7.418117034397698e-06, + "loss": 0.3169, + "step": 19591 + }, + { + "epoch": 1.065735214324471, + "grad_norm": 3.3162190914154053, + "learning_rate": 7.417979991777443e-06, + "loss": 0.1895, + "step": 19592 + }, + { + "epoch": 1.0657487791644058, + "grad_norm": 3.955686569213867, + "learning_rate": 7.417842949157188e-06, + "loss": 0.2309, + "step": 19593 + }, + { + "epoch": 1.0657623440043407, + "grad_norm": 5.803969383239746, + "learning_rate": 7.4177059065369335e-06, + "loss": 0.2961, + "step": 19594 + }, + { + "epoch": 1.0657759088442758, + "grad_norm": 4.803073883056641, + "learning_rate": 7.417568863916679e-06, + "loss": 0.2926, + "step": 19595 + }, + { + "epoch": 1.0657894736842106, + "grad_norm": 5.313357830047607, + "learning_rate": 7.417431821296424e-06, + "loss": 0.244, + "step": 19596 + }, + { + "epoch": 1.0658030385241455, + "grad_norm": 4.838447093963623, + "learning_rate": 7.417294778676168e-06, + "loss": 0.288, + "step": 19597 + }, + { + "epoch": 1.0658166033640804, + "grad_norm": 5.034849166870117, + "learning_rate": 7.417157736055914e-06, + "loss": 0.3312, + "step": 19598 + }, + { + "epoch": 1.0658301682040152, + "grad_norm": 5.685725212097168, + "learning_rate": 7.417020693435659e-06, + "loss": 0.2927, + "step": 19599 + }, + { + "epoch": 1.06584373304395, + "grad_norm": 5.273719310760498, + "learning_rate": 7.416883650815404e-06, + "loss": 0.3883, + "step": 19600 + }, + { + "epoch": 1.065857297883885, + "grad_norm": 5.628851890563965, + "learning_rate": 7.416746608195149e-06, + "loss": 0.3841, + "step": 19601 + }, + { + "epoch": 1.0658708627238198, + "grad_norm": 5.002058982849121, + "learning_rate": 7.416609565574895e-06, + "loss": 0.2594, + "step": 19602 + }, + { + "epoch": 1.0658844275637547, + "grad_norm": 3.9141390323638916, + "learning_rate": 7.416472522954639e-06, + "loss": 0.3155, + "step": 19603 + }, + { + "epoch": 1.0658979924036895, + "grad_norm": 5.3045244216918945, + "learning_rate": 7.4163354803343845e-06, + "loss": 0.2467, + "step": 19604 + }, + { + "epoch": 1.0659115572436244, + "grad_norm": 8.072551727294922, + "learning_rate": 7.41619843771413e-06, + "loss": 0.3667, + "step": 19605 + }, + { + "epoch": 1.0659251220835595, + "grad_norm": 5.9827656745910645, + "learning_rate": 7.416061395093876e-06, + "loss": 0.5264, + "step": 19606 + }, + { + "epoch": 1.0659386869234944, + "grad_norm": 4.3121337890625, + "learning_rate": 7.41592435247362e-06, + "loss": 0.3532, + "step": 19607 + }, + { + "epoch": 1.0659522517634292, + "grad_norm": 4.582671165466309, + "learning_rate": 7.415787309853365e-06, + "loss": 0.1976, + "step": 19608 + }, + { + "epoch": 1.065965816603364, + "grad_norm": 5.055568218231201, + "learning_rate": 7.4156502672331095e-06, + "loss": 0.3436, + "step": 19609 + }, + { + "epoch": 1.065979381443299, + "grad_norm": 6.29876184463501, + "learning_rate": 7.415513224612855e-06, + "loss": 0.3647, + "step": 19610 + }, + { + "epoch": 1.0659929462832338, + "grad_norm": 3.7574245929718018, + "learning_rate": 7.415376181992601e-06, + "loss": 0.1915, + "step": 19611 + }, + { + "epoch": 1.0660065111231687, + "grad_norm": 5.137170314788818, + "learning_rate": 7.415239139372346e-06, + "loss": 0.3367, + "step": 19612 + }, + { + "epoch": 1.0660200759631036, + "grad_norm": 8.402274131774902, + "learning_rate": 7.41510209675209e-06, + "loss": 0.3432, + "step": 19613 + }, + { + "epoch": 1.0660336408030386, + "grad_norm": 4.342868328094482, + "learning_rate": 7.414965054131835e-06, + "loss": 0.2057, + "step": 19614 + }, + { + "epoch": 1.0660472056429735, + "grad_norm": 5.746233940124512, + "learning_rate": 7.4148280115115814e-06, + "loss": 0.4009, + "step": 19615 + }, + { + "epoch": 1.0660607704829084, + "grad_norm": 4.446502685546875, + "learning_rate": 7.414690968891326e-06, + "loss": 0.3199, + "step": 19616 + }, + { + "epoch": 1.0660743353228432, + "grad_norm": 4.785294055938721, + "learning_rate": 7.414553926271071e-06, + "loss": 0.3134, + "step": 19617 + }, + { + "epoch": 1.066087900162778, + "grad_norm": 7.828052043914795, + "learning_rate": 7.414416883650815e-06, + "loss": 0.3047, + "step": 19618 + }, + { + "epoch": 1.066101465002713, + "grad_norm": 5.646557331085205, + "learning_rate": 7.414279841030561e-06, + "loss": 0.2639, + "step": 19619 + }, + { + "epoch": 1.0661150298426478, + "grad_norm": 5.760022163391113, + "learning_rate": 7.4141427984103065e-06, + "loss": 0.3688, + "step": 19620 + }, + { + "epoch": 1.0661285946825827, + "grad_norm": 6.297337055206299, + "learning_rate": 7.414005755790052e-06, + "loss": 0.2728, + "step": 19621 + }, + { + "epoch": 1.0661421595225176, + "grad_norm": 6.246091842651367, + "learning_rate": 7.413868713169796e-06, + "loss": 0.2967, + "step": 19622 + }, + { + "epoch": 1.0661557243624524, + "grad_norm": 6.095180988311768, + "learning_rate": 7.413731670549541e-06, + "loss": 0.2793, + "step": 19623 + }, + { + "epoch": 1.0661692892023875, + "grad_norm": 6.24306058883667, + "learning_rate": 7.413594627929287e-06, + "loss": 0.2955, + "step": 19624 + }, + { + "epoch": 1.0661828540423224, + "grad_norm": 4.719077110290527, + "learning_rate": 7.4134575853090315e-06, + "loss": 0.1772, + "step": 19625 + }, + { + "epoch": 1.0661964188822572, + "grad_norm": 5.1491475105285645, + "learning_rate": 7.413320542688777e-06, + "loss": 0.2539, + "step": 19626 + }, + { + "epoch": 1.0662099837221921, + "grad_norm": 7.7291579246521, + "learning_rate": 7.413183500068522e-06, + "loss": 0.3513, + "step": 19627 + }, + { + "epoch": 1.066223548562127, + "grad_norm": 5.83556604385376, + "learning_rate": 7.413046457448267e-06, + "loss": 0.3154, + "step": 19628 + }, + { + "epoch": 1.0662371134020618, + "grad_norm": 5.621182441711426, + "learning_rate": 7.412909414828012e-06, + "loss": 0.323, + "step": 19629 + }, + { + "epoch": 1.0662506782419967, + "grad_norm": 6.793676376342773, + "learning_rate": 7.412772372207757e-06, + "loss": 0.3756, + "step": 19630 + }, + { + "epoch": 1.0662642430819316, + "grad_norm": 5.4137067794799805, + "learning_rate": 7.412635329587502e-06, + "loss": 0.1955, + "step": 19631 + }, + { + "epoch": 1.0662778079218664, + "grad_norm": 4.530628681182861, + "learning_rate": 7.412498286967248e-06, + "loss": 0.1997, + "step": 19632 + }, + { + "epoch": 1.0662913727618015, + "grad_norm": 6.670403480529785, + "learning_rate": 7.412361244346993e-06, + "loss": 0.3328, + "step": 19633 + }, + { + "epoch": 1.0663049376017364, + "grad_norm": 4.897768020629883, + "learning_rate": 7.412224201726737e-06, + "loss": 0.2605, + "step": 19634 + }, + { + "epoch": 1.0663185024416713, + "grad_norm": 5.038052558898926, + "learning_rate": 7.4120871591064825e-06, + "loss": 0.2671, + "step": 19635 + }, + { + "epoch": 1.0663320672816061, + "grad_norm": 7.326646327972412, + "learning_rate": 7.411950116486228e-06, + "loss": 0.4832, + "step": 19636 + }, + { + "epoch": 1.066345632121541, + "grad_norm": 7.01254940032959, + "learning_rate": 7.411813073865974e-06, + "loss": 0.4337, + "step": 19637 + }, + { + "epoch": 1.0663591969614759, + "grad_norm": 5.7534894943237305, + "learning_rate": 7.411676031245718e-06, + "loss": 0.3639, + "step": 19638 + }, + { + "epoch": 1.0663727618014107, + "grad_norm": 6.877018451690674, + "learning_rate": 7.411538988625463e-06, + "loss": 0.252, + "step": 19639 + }, + { + "epoch": 1.0663863266413456, + "grad_norm": 5.625518321990967, + "learning_rate": 7.4114019460052075e-06, + "loss": 0.289, + "step": 19640 + }, + { + "epoch": 1.0663998914812804, + "grad_norm": 7.948068618774414, + "learning_rate": 7.4112649033849535e-06, + "loss": 0.4747, + "step": 19641 + }, + { + "epoch": 1.0664134563212153, + "grad_norm": 6.06117582321167, + "learning_rate": 7.411127860764699e-06, + "loss": 0.3276, + "step": 19642 + }, + { + "epoch": 1.0664270211611504, + "grad_norm": 7.010546684265137, + "learning_rate": 7.410990818144443e-06, + "loss": 0.4196, + "step": 19643 + }, + { + "epoch": 1.0664405860010853, + "grad_norm": 4.286749839782715, + "learning_rate": 7.410853775524188e-06, + "loss": 0.2994, + "step": 19644 + }, + { + "epoch": 1.0664541508410201, + "grad_norm": 8.658270835876465, + "learning_rate": 7.410716732903934e-06, + "loss": 0.4548, + "step": 19645 + }, + { + "epoch": 1.066467715680955, + "grad_norm": 4.772757530212402, + "learning_rate": 7.4105796902836794e-06, + "loss": 0.4381, + "step": 19646 + }, + { + "epoch": 1.0664812805208899, + "grad_norm": 5.798964500427246, + "learning_rate": 7.410442647663424e-06, + "loss": 0.3051, + "step": 19647 + }, + { + "epoch": 1.0664948453608247, + "grad_norm": 3.966130495071411, + "learning_rate": 7.410305605043169e-06, + "loss": 0.2732, + "step": 19648 + }, + { + "epoch": 1.0665084102007596, + "grad_norm": 5.39328670501709, + "learning_rate": 7.410168562422913e-06, + "loss": 0.2356, + "step": 19649 + }, + { + "epoch": 1.0665219750406945, + "grad_norm": 4.46099328994751, + "learning_rate": 7.410031519802659e-06, + "loss": 0.2532, + "step": 19650 + }, + { + "epoch": 1.0665355398806293, + "grad_norm": 5.09470796585083, + "learning_rate": 7.4098944771824045e-06, + "loss": 0.319, + "step": 19651 + }, + { + "epoch": 1.0665491047205644, + "grad_norm": 4.32647180557251, + "learning_rate": 7.40975743456215e-06, + "loss": 0.1528, + "step": 19652 + }, + { + "epoch": 1.0665626695604993, + "grad_norm": 5.32994270324707, + "learning_rate": 7.409620391941894e-06, + "loss": 0.2496, + "step": 19653 + }, + { + "epoch": 1.0665762344004341, + "grad_norm": 7.524291038513184, + "learning_rate": 7.40948334932164e-06, + "loss": 0.2627, + "step": 19654 + }, + { + "epoch": 1.066589799240369, + "grad_norm": 5.383732318878174, + "learning_rate": 7.409346306701385e-06, + "loss": 0.2569, + "step": 19655 + }, + { + "epoch": 1.0666033640803039, + "grad_norm": 5.587719440460205, + "learning_rate": 7.4092092640811295e-06, + "loss": 0.3105, + "step": 19656 + }, + { + "epoch": 1.0666169289202387, + "grad_norm": 6.2043538093566895, + "learning_rate": 7.409072221460875e-06, + "loss": 0.3125, + "step": 19657 + }, + { + "epoch": 1.0666304937601736, + "grad_norm": 6.022186279296875, + "learning_rate": 7.408935178840621e-06, + "loss": 0.2512, + "step": 19658 + }, + { + "epoch": 1.0666440586001085, + "grad_norm": 4.4224700927734375, + "learning_rate": 7.408798136220365e-06, + "loss": 0.2548, + "step": 19659 + }, + { + "epoch": 1.0666576234400433, + "grad_norm": 5.914188385009766, + "learning_rate": 7.40866109360011e-06, + "loss": 0.3147, + "step": 19660 + }, + { + "epoch": 1.0666711882799782, + "grad_norm": 5.223707675933838, + "learning_rate": 7.4085240509798554e-06, + "loss": 0.2689, + "step": 19661 + }, + { + "epoch": 1.0666847531199133, + "grad_norm": 6.466767311096191, + "learning_rate": 7.4083870083596e-06, + "loss": 0.3688, + "step": 19662 + }, + { + "epoch": 1.0666983179598482, + "grad_norm": 6.346279621124268, + "learning_rate": 7.408249965739346e-06, + "loss": 0.2994, + "step": 19663 + }, + { + "epoch": 1.066711882799783, + "grad_norm": 4.6561970710754395, + "learning_rate": 7.408112923119091e-06, + "loss": 0.2413, + "step": 19664 + }, + { + "epoch": 1.0667254476397179, + "grad_norm": 3.9814705848693848, + "learning_rate": 7.407975880498835e-06, + "loss": 0.2002, + "step": 19665 + }, + { + "epoch": 1.0667390124796527, + "grad_norm": 5.380369663238525, + "learning_rate": 7.4078388378785805e-06, + "loss": 0.283, + "step": 19666 + }, + { + "epoch": 1.0667525773195876, + "grad_norm": 5.756214141845703, + "learning_rate": 7.4077017952583265e-06, + "loss": 0.3042, + "step": 19667 + }, + { + "epoch": 1.0667661421595225, + "grad_norm": 4.870522499084473, + "learning_rate": 7.407564752638071e-06, + "loss": 0.2103, + "step": 19668 + }, + { + "epoch": 1.0667797069994573, + "grad_norm": 4.878387451171875, + "learning_rate": 7.407427710017816e-06, + "loss": 0.1815, + "step": 19669 + }, + { + "epoch": 1.0667932718393922, + "grad_norm": 6.034724235534668, + "learning_rate": 7.407290667397561e-06, + "loss": 0.3165, + "step": 19670 + }, + { + "epoch": 1.0668068366793273, + "grad_norm": 6.029768466949463, + "learning_rate": 7.407153624777307e-06, + "loss": 0.2895, + "step": 19671 + }, + { + "epoch": 1.0668204015192622, + "grad_norm": 3.7717838287353516, + "learning_rate": 7.4070165821570515e-06, + "loss": 0.1692, + "step": 19672 + }, + { + "epoch": 1.066833966359197, + "grad_norm": 5.94248104095459, + "learning_rate": 7.406879539536797e-06, + "loss": 0.3223, + "step": 19673 + }, + { + "epoch": 1.066847531199132, + "grad_norm": 4.576573848724365, + "learning_rate": 7.406742496916541e-06, + "loss": 0.2616, + "step": 19674 + }, + { + "epoch": 1.0668610960390668, + "grad_norm": 4.899899959564209, + "learning_rate": 7.406605454296287e-06, + "loss": 0.247, + "step": 19675 + }, + { + "epoch": 1.0668746608790016, + "grad_norm": 3.6121490001678467, + "learning_rate": 7.406468411676032e-06, + "loss": 0.1719, + "step": 19676 + }, + { + "epoch": 1.0668882257189365, + "grad_norm": 5.489284038543701, + "learning_rate": 7.406331369055777e-06, + "loss": 0.2601, + "step": 19677 + }, + { + "epoch": 1.0669017905588714, + "grad_norm": 7.628442287445068, + "learning_rate": 7.406194326435522e-06, + "loss": 0.2431, + "step": 19678 + }, + { + "epoch": 1.0669153553988062, + "grad_norm": 5.640086650848389, + "learning_rate": 7.406057283815267e-06, + "loss": 0.3674, + "step": 19679 + }, + { + "epoch": 1.066928920238741, + "grad_norm": 6.243250370025635, + "learning_rate": 7.405920241195013e-06, + "loss": 0.2676, + "step": 19680 + }, + { + "epoch": 1.0669424850786762, + "grad_norm": 3.51261830329895, + "learning_rate": 7.405783198574757e-06, + "loss": 0.1309, + "step": 19681 + }, + { + "epoch": 1.066956049918611, + "grad_norm": 4.69684362411499, + "learning_rate": 7.4056461559545025e-06, + "loss": 0.2329, + "step": 19682 + }, + { + "epoch": 1.066969614758546, + "grad_norm": 5.674226760864258, + "learning_rate": 7.405509113334247e-06, + "loss": 0.2312, + "step": 19683 + }, + { + "epoch": 1.0669831795984808, + "grad_norm": 4.336952209472656, + "learning_rate": 7.405372070713993e-06, + "loss": 0.2723, + "step": 19684 + }, + { + "epoch": 1.0669967444384156, + "grad_norm": 5.076766490936279, + "learning_rate": 7.405235028093738e-06, + "loss": 0.2829, + "step": 19685 + }, + { + "epoch": 1.0670103092783505, + "grad_norm": 5.290139675140381, + "learning_rate": 7.405097985473483e-06, + "loss": 0.2795, + "step": 19686 + }, + { + "epoch": 1.0670238741182854, + "grad_norm": 3.5135960578918457, + "learning_rate": 7.4049609428532275e-06, + "loss": 0.123, + "step": 19687 + }, + { + "epoch": 1.0670374389582202, + "grad_norm": 4.404790878295898, + "learning_rate": 7.4048239002329736e-06, + "loss": 0.1482, + "step": 19688 + }, + { + "epoch": 1.067051003798155, + "grad_norm": 6.0517578125, + "learning_rate": 7.404686857612719e-06, + "loss": 0.2288, + "step": 19689 + }, + { + "epoch": 1.0670645686380902, + "grad_norm": 4.791879177093506, + "learning_rate": 7.404549814992463e-06, + "loss": 0.1793, + "step": 19690 + }, + { + "epoch": 1.067078133478025, + "grad_norm": 6.281363010406494, + "learning_rate": 7.404412772372208e-06, + "loss": 0.2997, + "step": 19691 + }, + { + "epoch": 1.06709169831796, + "grad_norm": 5.862916946411133, + "learning_rate": 7.404275729751953e-06, + "loss": 0.167, + "step": 19692 + }, + { + "epoch": 1.0671052631578948, + "grad_norm": 6.456879615783691, + "learning_rate": 7.404138687131699e-06, + "loss": 0.1743, + "step": 19693 + }, + { + "epoch": 1.0671188279978296, + "grad_norm": 4.209471702575684, + "learning_rate": 7.404001644511444e-06, + "loss": 0.1979, + "step": 19694 + }, + { + "epoch": 1.0671323928377645, + "grad_norm": 3.7352311611175537, + "learning_rate": 7.403864601891189e-06, + "loss": 0.1206, + "step": 19695 + }, + { + "epoch": 1.0671459576776994, + "grad_norm": 4.752147197723389, + "learning_rate": 7.403727559270933e-06, + "loss": 0.2114, + "step": 19696 + }, + { + "epoch": 1.0671595225176342, + "grad_norm": 5.524942398071289, + "learning_rate": 7.403590516650679e-06, + "loss": 0.2461, + "step": 19697 + }, + { + "epoch": 1.067173087357569, + "grad_norm": 4.97053337097168, + "learning_rate": 7.4034534740304245e-06, + "loss": 0.2186, + "step": 19698 + }, + { + "epoch": 1.067186652197504, + "grad_norm": 6.149260997772217, + "learning_rate": 7.403316431410169e-06, + "loss": 0.2245, + "step": 19699 + }, + { + "epoch": 1.067200217037439, + "grad_norm": 5.050596714019775, + "learning_rate": 7.403179388789914e-06, + "loss": 0.1962, + "step": 19700 + }, + { + "epoch": 1.067213781877374, + "grad_norm": 6.001474380493164, + "learning_rate": 7.40304234616966e-06, + "loss": 0.2811, + "step": 19701 + }, + { + "epoch": 1.0672273467173088, + "grad_norm": 4.950375556945801, + "learning_rate": 7.402905303549404e-06, + "loss": 0.2261, + "step": 19702 + }, + { + "epoch": 1.0672409115572437, + "grad_norm": 5.34727144241333, + "learning_rate": 7.4027682609291496e-06, + "loss": 0.213, + "step": 19703 + }, + { + "epoch": 1.0672544763971785, + "grad_norm": 5.766687393188477, + "learning_rate": 7.402631218308895e-06, + "loss": 0.2536, + "step": 19704 + }, + { + "epoch": 1.0672680412371134, + "grad_norm": 5.127859592437744, + "learning_rate": 7.402494175688639e-06, + "loss": 0.3825, + "step": 19705 + }, + { + "epoch": 1.0672816060770483, + "grad_norm": 5.5199995040893555, + "learning_rate": 7.402357133068385e-06, + "loss": 0.3434, + "step": 19706 + }, + { + "epoch": 1.0672951709169831, + "grad_norm": 4.5809102058410645, + "learning_rate": 7.40222009044813e-06, + "loss": 0.3198, + "step": 19707 + }, + { + "epoch": 1.067308735756918, + "grad_norm": 5.867728233337402, + "learning_rate": 7.402083047827875e-06, + "loss": 0.2603, + "step": 19708 + }, + { + "epoch": 1.067322300596853, + "grad_norm": 5.826539039611816, + "learning_rate": 7.40194600520762e-06, + "loss": 0.3436, + "step": 19709 + }, + { + "epoch": 1.067335865436788, + "grad_norm": 6.43575382232666, + "learning_rate": 7.401808962587366e-06, + "loss": 0.2985, + "step": 19710 + }, + { + "epoch": 1.0673494302767228, + "grad_norm": 6.910846710205078, + "learning_rate": 7.40167191996711e-06, + "loss": 0.378, + "step": 19711 + }, + { + "epoch": 1.0673629951166577, + "grad_norm": 6.950364112854004, + "learning_rate": 7.401534877346855e-06, + "loss": 0.3416, + "step": 19712 + }, + { + "epoch": 1.0673765599565925, + "grad_norm": 5.078545093536377, + "learning_rate": 7.4013978347266005e-06, + "loss": 0.2779, + "step": 19713 + }, + { + "epoch": 1.0673901247965274, + "grad_norm": 4.84175443649292, + "learning_rate": 7.4012607921063465e-06, + "loss": 0.216, + "step": 19714 + }, + { + "epoch": 1.0674036896364623, + "grad_norm": 7.166930675506592, + "learning_rate": 7.401123749486091e-06, + "loss": 0.3473, + "step": 19715 + }, + { + "epoch": 1.0674172544763971, + "grad_norm": 5.472562789916992, + "learning_rate": 7.400986706865836e-06, + "loss": 0.2951, + "step": 19716 + }, + { + "epoch": 1.067430819316332, + "grad_norm": 6.103353977203369, + "learning_rate": 7.40084966424558e-06, + "loss": 0.3146, + "step": 19717 + }, + { + "epoch": 1.0674443841562669, + "grad_norm": 3.5586888790130615, + "learning_rate": 7.4007126216253255e-06, + "loss": 0.1592, + "step": 19718 + }, + { + "epoch": 1.067457948996202, + "grad_norm": 4.562580585479736, + "learning_rate": 7.4005755790050716e-06, + "loss": 0.2834, + "step": 19719 + }, + { + "epoch": 1.0674715138361368, + "grad_norm": 5.685835361480713, + "learning_rate": 7.400438536384817e-06, + "loss": 0.2568, + "step": 19720 + }, + { + "epoch": 1.0674850786760717, + "grad_norm": 5.688620090484619, + "learning_rate": 7.400301493764561e-06, + "loss": 0.2664, + "step": 19721 + }, + { + "epoch": 1.0674986435160065, + "grad_norm": 6.119970321655273, + "learning_rate": 7.400164451144306e-06, + "loss": 0.3983, + "step": 19722 + }, + { + "epoch": 1.0675122083559414, + "grad_norm": 5.73566198348999, + "learning_rate": 7.400027408524052e-06, + "loss": 0.3189, + "step": 19723 + }, + { + "epoch": 1.0675257731958763, + "grad_norm": 7.503855228424072, + "learning_rate": 7.399890365903797e-06, + "loss": 0.3826, + "step": 19724 + }, + { + "epoch": 1.0675393380358111, + "grad_norm": 6.839295864105225, + "learning_rate": 7.399753323283542e-06, + "loss": 0.4132, + "step": 19725 + }, + { + "epoch": 1.067552902875746, + "grad_norm": 6.326011657714844, + "learning_rate": 7.399616280663286e-06, + "loss": 0.3745, + "step": 19726 + }, + { + "epoch": 1.0675664677156809, + "grad_norm": 6.197551250457764, + "learning_rate": 7.399479238043032e-06, + "loss": 0.307, + "step": 19727 + }, + { + "epoch": 1.067580032555616, + "grad_norm": 5.02962064743042, + "learning_rate": 7.399342195422777e-06, + "loss": 0.2034, + "step": 19728 + }, + { + "epoch": 1.0675935973955508, + "grad_norm": 4.691895484924316, + "learning_rate": 7.3992051528025225e-06, + "loss": 0.2222, + "step": 19729 + }, + { + "epoch": 1.0676071622354857, + "grad_norm": 5.425090312957764, + "learning_rate": 7.399068110182267e-06, + "loss": 0.1925, + "step": 19730 + }, + { + "epoch": 1.0676207270754206, + "grad_norm": 4.988784313201904, + "learning_rate": 7.398931067562012e-06, + "loss": 0.147, + "step": 19731 + }, + { + "epoch": 1.0676342919153554, + "grad_norm": 5.382867813110352, + "learning_rate": 7.398794024941758e-06, + "loss": 0.2481, + "step": 19732 + }, + { + "epoch": 1.0676478567552903, + "grad_norm": 6.022480010986328, + "learning_rate": 7.398656982321502e-06, + "loss": 0.2834, + "step": 19733 + }, + { + "epoch": 1.0676614215952251, + "grad_norm": 5.4618024826049805, + "learning_rate": 7.3985199397012476e-06, + "loss": 0.298, + "step": 19734 + }, + { + "epoch": 1.06767498643516, + "grad_norm": 4.869503021240234, + "learning_rate": 7.398382897080993e-06, + "loss": 0.2253, + "step": 19735 + }, + { + "epoch": 1.0676885512750949, + "grad_norm": 4.705501556396484, + "learning_rate": 7.398245854460738e-06, + "loss": 0.2341, + "step": 19736 + }, + { + "epoch": 1.0677021161150297, + "grad_norm": 5.151777267456055, + "learning_rate": 7.398108811840483e-06, + "loss": 0.436, + "step": 19737 + }, + { + "epoch": 1.0677156809549648, + "grad_norm": 5.657855987548828, + "learning_rate": 7.397971769220228e-06, + "loss": 0.3743, + "step": 19738 + }, + { + "epoch": 1.0677292457948997, + "grad_norm": 6.319851398468018, + "learning_rate": 7.397834726599973e-06, + "loss": 0.3769, + "step": 19739 + }, + { + "epoch": 1.0677428106348346, + "grad_norm": 7.972348690032959, + "learning_rate": 7.397697683979719e-06, + "loss": 0.2574, + "step": 19740 + }, + { + "epoch": 1.0677563754747694, + "grad_norm": 9.546144485473633, + "learning_rate": 7.397560641359464e-06, + "loss": 0.457, + "step": 19741 + }, + { + "epoch": 1.0677699403147043, + "grad_norm": 5.807061195373535, + "learning_rate": 7.397423598739208e-06, + "loss": 0.3068, + "step": 19742 + }, + { + "epoch": 1.0677835051546392, + "grad_norm": 5.129739284515381, + "learning_rate": 7.397286556118953e-06, + "loss": 0.2218, + "step": 19743 + }, + { + "epoch": 1.067797069994574, + "grad_norm": 4.700252056121826, + "learning_rate": 7.397149513498699e-06, + "loss": 0.1422, + "step": 19744 + }, + { + "epoch": 1.0678106348345089, + "grad_norm": 6.644899368286133, + "learning_rate": 7.397012470878444e-06, + "loss": 0.2997, + "step": 19745 + }, + { + "epoch": 1.0678241996744438, + "grad_norm": 5.977571964263916, + "learning_rate": 7.396875428258189e-06, + "loss": 0.3135, + "step": 19746 + }, + { + "epoch": 1.0678377645143788, + "grad_norm": 6.472391128540039, + "learning_rate": 7.396738385637934e-06, + "loss": 0.3357, + "step": 19747 + }, + { + "epoch": 1.0678513293543137, + "grad_norm": 5.921088218688965, + "learning_rate": 7.396601343017678e-06, + "loss": 0.3992, + "step": 19748 + }, + { + "epoch": 1.0678648941942486, + "grad_norm": 3.971924304962158, + "learning_rate": 7.396464300397424e-06, + "loss": 0.2279, + "step": 19749 + }, + { + "epoch": 1.0678784590341834, + "grad_norm": 5.282201766967773, + "learning_rate": 7.3963272577771696e-06, + "loss": 0.2871, + "step": 19750 + }, + { + "epoch": 1.0678920238741183, + "grad_norm": 5.277245998382568, + "learning_rate": 7.396190215156914e-06, + "loss": 0.3446, + "step": 19751 + }, + { + "epoch": 1.0679055887140532, + "grad_norm": 6.485008716583252, + "learning_rate": 7.396053172536659e-06, + "loss": 0.2977, + "step": 19752 + }, + { + "epoch": 1.067919153553988, + "grad_norm": 6.024084568023682, + "learning_rate": 7.395916129916405e-06, + "loss": 0.2647, + "step": 19753 + }, + { + "epoch": 1.067932718393923, + "grad_norm": 6.356564998626709, + "learning_rate": 7.39577908729615e-06, + "loss": 0.3496, + "step": 19754 + }, + { + "epoch": 1.0679462832338578, + "grad_norm": 4.474832534790039, + "learning_rate": 7.395642044675895e-06, + "loss": 0.2581, + "step": 19755 + }, + { + "epoch": 1.0679598480737926, + "grad_norm": 7.605313301086426, + "learning_rate": 7.39550500205564e-06, + "loss": 0.3685, + "step": 19756 + }, + { + "epoch": 1.0679734129137277, + "grad_norm": 4.952136993408203, + "learning_rate": 7.395367959435386e-06, + "loss": 0.309, + "step": 19757 + }, + { + "epoch": 1.0679869777536626, + "grad_norm": 5.671293258666992, + "learning_rate": 7.39523091681513e-06, + "loss": 0.313, + "step": 19758 + }, + { + "epoch": 1.0680005425935974, + "grad_norm": 5.120164394378662, + "learning_rate": 7.395093874194875e-06, + "loss": 0.2084, + "step": 19759 + }, + { + "epoch": 1.0680141074335323, + "grad_norm": 4.510991096496582, + "learning_rate": 7.39495683157462e-06, + "loss": 0.2702, + "step": 19760 + }, + { + "epoch": 1.0680276722734672, + "grad_norm": 6.162899017333984, + "learning_rate": 7.394819788954365e-06, + "loss": 0.2616, + "step": 19761 + }, + { + "epoch": 1.068041237113402, + "grad_norm": 5.418903350830078, + "learning_rate": 7.394682746334111e-06, + "loss": 0.3376, + "step": 19762 + }, + { + "epoch": 1.068054801953337, + "grad_norm": 5.077828407287598, + "learning_rate": 7.394545703713856e-06, + "loss": 0.3659, + "step": 19763 + }, + { + "epoch": 1.0680683667932718, + "grad_norm": 4.1117167472839355, + "learning_rate": 7.3944086610936e-06, + "loss": 0.2203, + "step": 19764 + }, + { + "epoch": 1.0680819316332066, + "grad_norm": 3.887322187423706, + "learning_rate": 7.3942716184733456e-06, + "loss": 0.1936, + "step": 19765 + }, + { + "epoch": 1.0680954964731417, + "grad_norm": 4.832974433898926, + "learning_rate": 7.394134575853092e-06, + "loss": 0.2859, + "step": 19766 + }, + { + "epoch": 1.0681090613130766, + "grad_norm": 5.0476765632629395, + "learning_rate": 7.393997533232836e-06, + "loss": 0.2478, + "step": 19767 + }, + { + "epoch": 1.0681226261530115, + "grad_norm": 6.102090358734131, + "learning_rate": 7.393860490612581e-06, + "loss": 0.2834, + "step": 19768 + }, + { + "epoch": 1.0681361909929463, + "grad_norm": 4.626381874084473, + "learning_rate": 7.393723447992326e-06, + "loss": 0.2059, + "step": 19769 + }, + { + "epoch": 1.0681497558328812, + "grad_norm": 4.4934587478637695, + "learning_rate": 7.3935864053720715e-06, + "loss": 0.2899, + "step": 19770 + }, + { + "epoch": 1.068163320672816, + "grad_norm": 3.532520055770874, + "learning_rate": 7.393449362751817e-06, + "loss": 0.1716, + "step": 19771 + }, + { + "epoch": 1.068176885512751, + "grad_norm": 5.882365703582764, + "learning_rate": 7.393312320131562e-06, + "loss": 0.3104, + "step": 19772 + }, + { + "epoch": 1.0681904503526858, + "grad_norm": 4.06548547744751, + "learning_rate": 7.393175277511306e-06, + "loss": 0.3177, + "step": 19773 + }, + { + "epoch": 1.0682040151926206, + "grad_norm": 4.783685684204102, + "learning_rate": 7.393038234891051e-06, + "loss": 0.2724, + "step": 19774 + }, + { + "epoch": 1.0682175800325555, + "grad_norm": 5.119953155517578, + "learning_rate": 7.392901192270797e-06, + "loss": 0.2133, + "step": 19775 + }, + { + "epoch": 1.0682311448724906, + "grad_norm": 4.171764373779297, + "learning_rate": 7.392764149650542e-06, + "loss": 0.207, + "step": 19776 + }, + { + "epoch": 1.0682447097124255, + "grad_norm": 4.755776882171631, + "learning_rate": 7.392627107030287e-06, + "loss": 0.2834, + "step": 19777 + }, + { + "epoch": 1.0682582745523603, + "grad_norm": 3.8920576572418213, + "learning_rate": 7.392490064410032e-06, + "loss": 0.2108, + "step": 19778 + }, + { + "epoch": 1.0682718393922952, + "grad_norm": 4.812739849090576, + "learning_rate": 7.392353021789778e-06, + "loss": 0.2708, + "step": 19779 + }, + { + "epoch": 1.06828540423223, + "grad_norm": 6.271587371826172, + "learning_rate": 7.392215979169522e-06, + "loss": 0.3051, + "step": 19780 + }, + { + "epoch": 1.068298969072165, + "grad_norm": 7.594704627990723, + "learning_rate": 7.3920789365492676e-06, + "loss": 0.4058, + "step": 19781 + }, + { + "epoch": 1.0683125339120998, + "grad_norm": 4.470160961151123, + "learning_rate": 7.391941893929012e-06, + "loss": 0.2818, + "step": 19782 + }, + { + "epoch": 1.0683260987520347, + "grad_norm": 3.669645309448242, + "learning_rate": 7.391804851308758e-06, + "loss": 0.1859, + "step": 19783 + }, + { + "epoch": 1.0683396635919695, + "grad_norm": 3.4004480838775635, + "learning_rate": 7.391667808688503e-06, + "loss": 0.1583, + "step": 19784 + }, + { + "epoch": 1.0683532284319046, + "grad_norm": 6.146620750427246, + "learning_rate": 7.3915307660682474e-06, + "loss": 0.2777, + "step": 19785 + }, + { + "epoch": 1.0683667932718395, + "grad_norm": 4.425568580627441, + "learning_rate": 7.391393723447993e-06, + "loss": 0.2083, + "step": 19786 + }, + { + "epoch": 1.0683803581117743, + "grad_norm": 5.379836559295654, + "learning_rate": 7.391256680827738e-06, + "loss": 0.2419, + "step": 19787 + }, + { + "epoch": 1.0683939229517092, + "grad_norm": 4.8515625, + "learning_rate": 7.391119638207484e-06, + "loss": 0.2841, + "step": 19788 + }, + { + "epoch": 1.068407487791644, + "grad_norm": 5.257531642913818, + "learning_rate": 7.390982595587228e-06, + "loss": 0.2625, + "step": 19789 + }, + { + "epoch": 1.068421052631579, + "grad_norm": 6.395059108734131, + "learning_rate": 7.390845552966973e-06, + "loss": 0.3969, + "step": 19790 + }, + { + "epoch": 1.0684346174715138, + "grad_norm": 4.791065692901611, + "learning_rate": 7.390708510346718e-06, + "loss": 0.2191, + "step": 19791 + }, + { + "epoch": 1.0684481823114487, + "grad_norm": 4.070605278015137, + "learning_rate": 7.390571467726464e-06, + "loss": 0.1764, + "step": 19792 + }, + { + "epoch": 1.0684617471513835, + "grad_norm": 4.909496784210205, + "learning_rate": 7.390434425106209e-06, + "loss": 0.2813, + "step": 19793 + }, + { + "epoch": 1.0684753119913184, + "grad_norm": 3.920719861984253, + "learning_rate": 7.390297382485954e-06, + "loss": 0.2572, + "step": 19794 + }, + { + "epoch": 1.0684888768312535, + "grad_norm": 5.2499213218688965, + "learning_rate": 7.390160339865698e-06, + "loss": 0.1902, + "step": 19795 + }, + { + "epoch": 1.0685024416711884, + "grad_norm": 6.357736110687256, + "learning_rate": 7.390023297245444e-06, + "loss": 0.257, + "step": 19796 + }, + { + "epoch": 1.0685160065111232, + "grad_norm": 4.681517601013184, + "learning_rate": 7.38988625462519e-06, + "loss": 0.3114, + "step": 19797 + }, + { + "epoch": 1.068529571351058, + "grad_norm": 4.007568359375, + "learning_rate": 7.389749212004934e-06, + "loss": 0.2228, + "step": 19798 + }, + { + "epoch": 1.068543136190993, + "grad_norm": 5.105198860168457, + "learning_rate": 7.389612169384679e-06, + "loss": 0.2291, + "step": 19799 + }, + { + "epoch": 1.0685567010309278, + "grad_norm": 4.536502838134766, + "learning_rate": 7.3894751267644234e-06, + "loss": 0.1596, + "step": 19800 + }, + { + "epoch": 1.0685702658708627, + "grad_norm": 4.18974494934082, + "learning_rate": 7.3893380841441695e-06, + "loss": 0.154, + "step": 19801 + }, + { + "epoch": 1.0685838307107975, + "grad_norm": 5.875781536102295, + "learning_rate": 7.389201041523915e-06, + "loss": 0.24, + "step": 19802 + }, + { + "epoch": 1.0685973955507324, + "grad_norm": 4.163471698760986, + "learning_rate": 7.38906399890366e-06, + "loss": 0.1629, + "step": 19803 + }, + { + "epoch": 1.0686109603906675, + "grad_norm": 6.233384132385254, + "learning_rate": 7.388926956283404e-06, + "loss": 0.2862, + "step": 19804 + }, + { + "epoch": 1.0686245252306024, + "grad_norm": 6.586507797241211, + "learning_rate": 7.38878991366315e-06, + "loss": 0.2846, + "step": 19805 + }, + { + "epoch": 1.0686380900705372, + "grad_norm": 4.964405536651611, + "learning_rate": 7.388652871042895e-06, + "loss": 0.2167, + "step": 19806 + }, + { + "epoch": 1.068651654910472, + "grad_norm": 4.937282562255859, + "learning_rate": 7.38851582842264e-06, + "loss": 0.1766, + "step": 19807 + }, + { + "epoch": 1.068665219750407, + "grad_norm": 5.636918067932129, + "learning_rate": 7.388378785802385e-06, + "loss": 0.3214, + "step": 19808 + }, + { + "epoch": 1.0686787845903418, + "grad_norm": 5.963113307952881, + "learning_rate": 7.388241743182131e-06, + "loss": 0.1987, + "step": 19809 + }, + { + "epoch": 1.0686923494302767, + "grad_norm": 4.718317985534668, + "learning_rate": 7.388104700561875e-06, + "loss": 0.2324, + "step": 19810 + }, + { + "epoch": 1.0687059142702116, + "grad_norm": 5.071180820465088, + "learning_rate": 7.38796765794162e-06, + "loss": 0.2963, + "step": 19811 + }, + { + "epoch": 1.0687194791101464, + "grad_norm": 6.402944087982178, + "learning_rate": 7.387830615321366e-06, + "loss": 0.2968, + "step": 19812 + }, + { + "epoch": 1.0687330439500813, + "grad_norm": 4.850956439971924, + "learning_rate": 7.387693572701112e-06, + "loss": 0.2391, + "step": 19813 + }, + { + "epoch": 1.0687466087900164, + "grad_norm": 5.099004745483398, + "learning_rate": 7.387556530080856e-06, + "loss": 0.1332, + "step": 19814 + }, + { + "epoch": 1.0687601736299512, + "grad_norm": 3.715866804122925, + "learning_rate": 7.387419487460601e-06, + "loss": 0.1222, + "step": 19815 + }, + { + "epoch": 1.068773738469886, + "grad_norm": 5.822450160980225, + "learning_rate": 7.3872824448403454e-06, + "loss": 0.282, + "step": 19816 + }, + { + "epoch": 1.068787303309821, + "grad_norm": 4.411820411682129, + "learning_rate": 7.387145402220091e-06, + "loss": 0.1536, + "step": 19817 + }, + { + "epoch": 1.0688008681497558, + "grad_norm": 5.066939830780029, + "learning_rate": 7.387008359599837e-06, + "loss": 0.2881, + "step": 19818 + }, + { + "epoch": 1.0688144329896907, + "grad_norm": 5.795604228973389, + "learning_rate": 7.386871316979581e-06, + "loss": 0.3157, + "step": 19819 + }, + { + "epoch": 1.0688279978296256, + "grad_norm": 5.503334999084473, + "learning_rate": 7.386734274359326e-06, + "loss": 0.2448, + "step": 19820 + }, + { + "epoch": 1.0688415626695604, + "grad_norm": 4.359498023986816, + "learning_rate": 7.386597231739071e-06, + "loss": 0.1706, + "step": 19821 + }, + { + "epoch": 1.0688551275094953, + "grad_norm": 4.572906017303467, + "learning_rate": 7.386460189118817e-06, + "loss": 0.2211, + "step": 19822 + }, + { + "epoch": 1.0688686923494304, + "grad_norm": 4.231996059417725, + "learning_rate": 7.386323146498562e-06, + "loss": 0.1869, + "step": 19823 + }, + { + "epoch": 1.0688822571893652, + "grad_norm": 3.9342691898345947, + "learning_rate": 7.386186103878307e-06, + "loss": 0.2242, + "step": 19824 + }, + { + "epoch": 1.0688958220293001, + "grad_norm": 4.553300380706787, + "learning_rate": 7.386049061258051e-06, + "loss": 0.2011, + "step": 19825 + }, + { + "epoch": 1.068909386869235, + "grad_norm": 4.766594886779785, + "learning_rate": 7.385912018637797e-06, + "loss": 0.1825, + "step": 19826 + }, + { + "epoch": 1.0689229517091698, + "grad_norm": 4.8094987869262695, + "learning_rate": 7.385774976017542e-06, + "loss": 0.2709, + "step": 19827 + }, + { + "epoch": 1.0689365165491047, + "grad_norm": 3.8110835552215576, + "learning_rate": 7.385637933397288e-06, + "loss": 0.165, + "step": 19828 + }, + { + "epoch": 1.0689500813890396, + "grad_norm": 6.440816879272461, + "learning_rate": 7.385500890777032e-06, + "loss": 0.3311, + "step": 19829 + }, + { + "epoch": 1.0689636462289744, + "grad_norm": 2.830108165740967, + "learning_rate": 7.385363848156777e-06, + "loss": 0.0993, + "step": 19830 + }, + { + "epoch": 1.0689772110689093, + "grad_norm": 4.850790023803711, + "learning_rate": 7.385226805536523e-06, + "loss": 0.1614, + "step": 19831 + }, + { + "epoch": 1.0689907759088442, + "grad_norm": 4.440735816955566, + "learning_rate": 7.3850897629162675e-06, + "loss": 0.2142, + "step": 19832 + }, + { + "epoch": 1.0690043407487793, + "grad_norm": 4.303083896636963, + "learning_rate": 7.384952720296013e-06, + "loss": 0.1443, + "step": 19833 + }, + { + "epoch": 1.0690179055887141, + "grad_norm": 5.221523284912109, + "learning_rate": 7.384815677675757e-06, + "loss": 0.2868, + "step": 19834 + }, + { + "epoch": 1.069031470428649, + "grad_norm": 4.73541259765625, + "learning_rate": 7.384678635055503e-06, + "loss": 0.3049, + "step": 19835 + }, + { + "epoch": 1.0690450352685839, + "grad_norm": 5.290765285491943, + "learning_rate": 7.384541592435248e-06, + "loss": 0.1623, + "step": 19836 + }, + { + "epoch": 1.0690586001085187, + "grad_norm": 3.84918212890625, + "learning_rate": 7.384404549814993e-06, + "loss": 0.1609, + "step": 19837 + }, + { + "epoch": 1.0690721649484536, + "grad_norm": 6.52439546585083, + "learning_rate": 7.384267507194738e-06, + "loss": 0.2146, + "step": 19838 + }, + { + "epoch": 1.0690857297883885, + "grad_norm": 6.173670291900635, + "learning_rate": 7.384130464574484e-06, + "loss": 0.2953, + "step": 19839 + }, + { + "epoch": 1.0690992946283233, + "grad_norm": 5.302811145782471, + "learning_rate": 7.383993421954229e-06, + "loss": 0.2401, + "step": 19840 + }, + { + "epoch": 1.0691128594682582, + "grad_norm": 4.641952037811279, + "learning_rate": 7.383856379333973e-06, + "loss": 0.1505, + "step": 19841 + }, + { + "epoch": 1.0691264243081933, + "grad_norm": 5.487680912017822, + "learning_rate": 7.383719336713718e-06, + "loss": 0.2292, + "step": 19842 + }, + { + "epoch": 1.0691399891481281, + "grad_norm": 5.489484786987305, + "learning_rate": 7.383582294093464e-06, + "loss": 0.2392, + "step": 19843 + }, + { + "epoch": 1.069153553988063, + "grad_norm": 4.505130290985107, + "learning_rate": 7.383445251473209e-06, + "loss": 0.126, + "step": 19844 + }, + { + "epoch": 1.0691671188279979, + "grad_norm": 4.376723766326904, + "learning_rate": 7.383308208852954e-06, + "loss": 0.2425, + "step": 19845 + }, + { + "epoch": 1.0691806836679327, + "grad_norm": 6.888271808624268, + "learning_rate": 7.383171166232699e-06, + "loss": 0.2985, + "step": 19846 + }, + { + "epoch": 1.0691942485078676, + "grad_norm": 5.708323001861572, + "learning_rate": 7.3830341236124435e-06, + "loss": 0.2073, + "step": 19847 + }, + { + "epoch": 1.0692078133478025, + "grad_norm": 5.9840874671936035, + "learning_rate": 7.3828970809921895e-06, + "loss": 0.2702, + "step": 19848 + }, + { + "epoch": 1.0692213781877373, + "grad_norm": 5.36956262588501, + "learning_rate": 7.382760038371935e-06, + "loss": 0.1496, + "step": 19849 + }, + { + "epoch": 1.0692349430276722, + "grad_norm": 6.124812602996826, + "learning_rate": 7.382622995751679e-06, + "loss": 0.2432, + "step": 19850 + }, + { + "epoch": 1.069248507867607, + "grad_norm": 4.461462020874023, + "learning_rate": 7.382485953131424e-06, + "loss": 0.1506, + "step": 19851 + }, + { + "epoch": 1.0692620727075421, + "grad_norm": 4.979564666748047, + "learning_rate": 7.38234891051117e-06, + "loss": 0.2026, + "step": 19852 + }, + { + "epoch": 1.069275637547477, + "grad_norm": 5.352316379547119, + "learning_rate": 7.3822118678909145e-06, + "loss": 0.2409, + "step": 19853 + }, + { + "epoch": 1.0692892023874119, + "grad_norm": 4.829456806182861, + "learning_rate": 7.38207482527066e-06, + "loss": 0.1656, + "step": 19854 + }, + { + "epoch": 1.0693027672273467, + "grad_norm": 7.786519527435303, + "learning_rate": 7.381937782650405e-06, + "loss": 0.2504, + "step": 19855 + }, + { + "epoch": 1.0693163320672816, + "grad_norm": 5.167067527770996, + "learning_rate": 7.381800740030149e-06, + "loss": 0.1681, + "step": 19856 + }, + { + "epoch": 1.0693298969072165, + "grad_norm": 4.83125638961792, + "learning_rate": 7.381663697409895e-06, + "loss": 0.1819, + "step": 19857 + }, + { + "epoch": 1.0693434617471513, + "grad_norm": 4.165529251098633, + "learning_rate": 7.38152665478964e-06, + "loss": 0.2008, + "step": 19858 + }, + { + "epoch": 1.0693570265870862, + "grad_norm": 5.555175304412842, + "learning_rate": 7.381389612169385e-06, + "loss": 0.3047, + "step": 19859 + }, + { + "epoch": 1.069370591427021, + "grad_norm": 4.16005802154541, + "learning_rate": 7.38125256954913e-06, + "loss": 0.1768, + "step": 19860 + }, + { + "epoch": 1.0693841562669562, + "grad_norm": 5.2741193771362305, + "learning_rate": 7.381115526928876e-06, + "loss": 0.2224, + "step": 19861 + }, + { + "epoch": 1.069397721106891, + "grad_norm": 5.801880359649658, + "learning_rate": 7.380978484308621e-06, + "loss": 0.2669, + "step": 19862 + }, + { + "epoch": 1.0694112859468259, + "grad_norm": 5.533834457397461, + "learning_rate": 7.3808414416883655e-06, + "loss": 0.3537, + "step": 19863 + }, + { + "epoch": 1.0694248507867608, + "grad_norm": 4.30733585357666, + "learning_rate": 7.380704399068111e-06, + "loss": 0.2765, + "step": 19864 + }, + { + "epoch": 1.0694384156266956, + "grad_norm": 3.4569506645202637, + "learning_rate": 7.380567356447857e-06, + "loss": 0.1243, + "step": 19865 + }, + { + "epoch": 1.0694519804666305, + "grad_norm": 6.690112113952637, + "learning_rate": 7.380430313827601e-06, + "loss": 0.44, + "step": 19866 + }, + { + "epoch": 1.0694655453065653, + "grad_norm": 5.0424652099609375, + "learning_rate": 7.380293271207346e-06, + "loss": 0.2158, + "step": 19867 + }, + { + "epoch": 1.0694791101465002, + "grad_norm": 4.0705718994140625, + "learning_rate": 7.3801562285870905e-06, + "loss": 0.1994, + "step": 19868 + }, + { + "epoch": 1.069492674986435, + "grad_norm": 4.390881061553955, + "learning_rate": 7.380019185966836e-06, + "loss": 0.2056, + "step": 19869 + }, + { + "epoch": 1.06950623982637, + "grad_norm": 4.7530341148376465, + "learning_rate": 7.379882143346582e-06, + "loss": 0.2264, + "step": 19870 + }, + { + "epoch": 1.069519804666305, + "grad_norm": 4.064240455627441, + "learning_rate": 7.379745100726327e-06, + "loss": 0.2408, + "step": 19871 + }, + { + "epoch": 1.06953336950624, + "grad_norm": 7.114400863647461, + "learning_rate": 7.379608058106071e-06, + "loss": 0.4109, + "step": 19872 + }, + { + "epoch": 1.0695469343461748, + "grad_norm": 6.726410388946533, + "learning_rate": 7.379471015485816e-06, + "loss": 0.3837, + "step": 19873 + }, + { + "epoch": 1.0695604991861096, + "grad_norm": 5.39828634262085, + "learning_rate": 7.3793339728655624e-06, + "loss": 0.3473, + "step": 19874 + }, + { + "epoch": 1.0695740640260445, + "grad_norm": 5.115161418914795, + "learning_rate": 7.379196930245307e-06, + "loss": 0.3441, + "step": 19875 + }, + { + "epoch": 1.0695876288659794, + "grad_norm": 5.565964698791504, + "learning_rate": 7.379059887625052e-06, + "loss": 0.288, + "step": 19876 + }, + { + "epoch": 1.0696011937059142, + "grad_norm": 5.885908126831055, + "learning_rate": 7.378922845004797e-06, + "loss": 0.2422, + "step": 19877 + }, + { + "epoch": 1.069614758545849, + "grad_norm": 4.851985931396484, + "learning_rate": 7.378785802384542e-06, + "loss": 0.2235, + "step": 19878 + }, + { + "epoch": 1.069628323385784, + "grad_norm": 7.719588279724121, + "learning_rate": 7.3786487597642875e-06, + "loss": 0.3717, + "step": 19879 + }, + { + "epoch": 1.069641888225719, + "grad_norm": 5.009894847869873, + "learning_rate": 7.378511717144033e-06, + "loss": 0.256, + "step": 19880 + }, + { + "epoch": 1.069655453065654, + "grad_norm": 6.871822834014893, + "learning_rate": 7.378374674523777e-06, + "loss": 0.3367, + "step": 19881 + }, + { + "epoch": 1.0696690179055888, + "grad_norm": 5.220508575439453, + "learning_rate": 7.378237631903522e-06, + "loss": 0.3928, + "step": 19882 + }, + { + "epoch": 1.0696825827455236, + "grad_norm": 5.311625003814697, + "learning_rate": 7.378100589283268e-06, + "loss": 0.286, + "step": 19883 + }, + { + "epoch": 1.0696961475854585, + "grad_norm": 6.313404083251953, + "learning_rate": 7.3779635466630125e-06, + "loss": 0.2353, + "step": 19884 + }, + { + "epoch": 1.0697097124253934, + "grad_norm": 5.234913349151611, + "learning_rate": 7.377826504042758e-06, + "loss": 0.2271, + "step": 19885 + }, + { + "epoch": 1.0697232772653282, + "grad_norm": 5.5702996253967285, + "learning_rate": 7.377689461422503e-06, + "loss": 0.3028, + "step": 19886 + }, + { + "epoch": 1.069736842105263, + "grad_norm": 6.403785705566406, + "learning_rate": 7.377552418802248e-06, + "loss": 0.284, + "step": 19887 + }, + { + "epoch": 1.069750406945198, + "grad_norm": 5.998819351196289, + "learning_rate": 7.377415376181993e-06, + "loss": 0.2619, + "step": 19888 + }, + { + "epoch": 1.0697639717851328, + "grad_norm": 4.956921100616455, + "learning_rate": 7.377278333561738e-06, + "loss": 0.3448, + "step": 19889 + }, + { + "epoch": 1.069777536625068, + "grad_norm": 3.9780638217926025, + "learning_rate": 7.377141290941483e-06, + "loss": 0.1701, + "step": 19890 + }, + { + "epoch": 1.0697911014650028, + "grad_norm": 4.004610538482666, + "learning_rate": 7.377004248321229e-06, + "loss": 0.1713, + "step": 19891 + }, + { + "epoch": 1.0698046663049376, + "grad_norm": 5.666964054107666, + "learning_rate": 7.376867205700974e-06, + "loss": 0.3713, + "step": 19892 + }, + { + "epoch": 1.0698182311448725, + "grad_norm": 4.523702621459961, + "learning_rate": 7.376730163080718e-06, + "loss": 0.2129, + "step": 19893 + }, + { + "epoch": 1.0698317959848074, + "grad_norm": 5.541628837585449, + "learning_rate": 7.3765931204604635e-06, + "loss": 0.3269, + "step": 19894 + }, + { + "epoch": 1.0698453608247422, + "grad_norm": 4.394146919250488, + "learning_rate": 7.3764560778402095e-06, + "loss": 0.19, + "step": 19895 + }, + { + "epoch": 1.069858925664677, + "grad_norm": 7.238893508911133, + "learning_rate": 7.376319035219955e-06, + "loss": 0.3774, + "step": 19896 + }, + { + "epoch": 1.069872490504612, + "grad_norm": 5.95554780960083, + "learning_rate": 7.376181992599699e-06, + "loss": 0.4278, + "step": 19897 + }, + { + "epoch": 1.0698860553445468, + "grad_norm": 7.825153350830078, + "learning_rate": 7.376044949979444e-06, + "loss": 0.3981, + "step": 19898 + }, + { + "epoch": 1.069899620184482, + "grad_norm": 5.172240257263184, + "learning_rate": 7.3759079073591885e-06, + "loss": 0.2866, + "step": 19899 + }, + { + "epoch": 1.0699131850244168, + "grad_norm": 6.3988142013549805, + "learning_rate": 7.3757708647389345e-06, + "loss": 0.3085, + "step": 19900 + }, + { + "epoch": 1.0699267498643517, + "grad_norm": 5.5819926261901855, + "learning_rate": 7.37563382211868e-06, + "loss": 0.2271, + "step": 19901 + }, + { + "epoch": 1.0699403147042865, + "grad_norm": 6.619974613189697, + "learning_rate": 7.375496779498424e-06, + "loss": 0.2512, + "step": 19902 + }, + { + "epoch": 1.0699538795442214, + "grad_norm": 4.5535502433776855, + "learning_rate": 7.375359736878169e-06, + "loss": 0.193, + "step": 19903 + }, + { + "epoch": 1.0699674443841563, + "grad_norm": 5.923943519592285, + "learning_rate": 7.375222694257915e-06, + "loss": 0.2254, + "step": 19904 + }, + { + "epoch": 1.0699810092240911, + "grad_norm": 5.015378475189209, + "learning_rate": 7.3750856516376604e-06, + "loss": 0.24, + "step": 19905 + }, + { + "epoch": 1.069994574064026, + "grad_norm": 4.791944980621338, + "learning_rate": 7.374948609017405e-06, + "loss": 0.2423, + "step": 19906 + }, + { + "epoch": 1.0700081389039608, + "grad_norm": 8.010700225830078, + "learning_rate": 7.37481156639715e-06, + "loss": 0.4473, + "step": 19907 + }, + { + "epoch": 1.0700217037438957, + "grad_norm": 7.835428714752197, + "learning_rate": 7.374674523776896e-06, + "loss": 0.2781, + "step": 19908 + }, + { + "epoch": 1.0700352685838308, + "grad_norm": 7.243872165679932, + "learning_rate": 7.37453748115664e-06, + "loss": 0.3015, + "step": 19909 + }, + { + "epoch": 1.0700488334237657, + "grad_norm": 7.877471923828125, + "learning_rate": 7.3744004385363855e-06, + "loss": 0.3634, + "step": 19910 + }, + { + "epoch": 1.0700623982637005, + "grad_norm": 5.326922416687012, + "learning_rate": 7.374263395916131e-06, + "loss": 0.2181, + "step": 19911 + }, + { + "epoch": 1.0700759631036354, + "grad_norm": 6.226703643798828, + "learning_rate": 7.374126353295875e-06, + "loss": 0.2776, + "step": 19912 + }, + { + "epoch": 1.0700895279435703, + "grad_norm": 6.611086845397949, + "learning_rate": 7.373989310675621e-06, + "loss": 0.3254, + "step": 19913 + }, + { + "epoch": 1.0701030927835051, + "grad_norm": 5.435279369354248, + "learning_rate": 7.373852268055366e-06, + "loss": 0.2273, + "step": 19914 + }, + { + "epoch": 1.07011665762344, + "grad_norm": 5.797354698181152, + "learning_rate": 7.3737152254351105e-06, + "loss": 0.2688, + "step": 19915 + }, + { + "epoch": 1.0701302224633749, + "grad_norm": 6.501707553863525, + "learning_rate": 7.373578182814856e-06, + "loss": 0.2988, + "step": 19916 + }, + { + "epoch": 1.0701437873033097, + "grad_norm": 5.324801921844482, + "learning_rate": 7.373441140194602e-06, + "loss": 0.2865, + "step": 19917 + }, + { + "epoch": 1.0701573521432448, + "grad_norm": 4.920741081237793, + "learning_rate": 7.373304097574346e-06, + "loss": 0.1969, + "step": 19918 + }, + { + "epoch": 1.0701709169831797, + "grad_norm": 6.54941987991333, + "learning_rate": 7.373167054954091e-06, + "loss": 0.3895, + "step": 19919 + }, + { + "epoch": 1.0701844818231145, + "grad_norm": 8.696361541748047, + "learning_rate": 7.3730300123338364e-06, + "loss": 0.4064, + "step": 19920 + }, + { + "epoch": 1.0701980466630494, + "grad_norm": 7.112307071685791, + "learning_rate": 7.3728929697135825e-06, + "loss": 0.3038, + "step": 19921 + }, + { + "epoch": 1.0702116115029843, + "grad_norm": 5.67449951171875, + "learning_rate": 7.372755927093327e-06, + "loss": 0.3907, + "step": 19922 + }, + { + "epoch": 1.0702251763429191, + "grad_norm": 5.781213283538818, + "learning_rate": 7.372618884473072e-06, + "loss": 0.2862, + "step": 19923 + }, + { + "epoch": 1.070238741182854, + "grad_norm": 5.606235027313232, + "learning_rate": 7.372481841852816e-06, + "loss": 0.3012, + "step": 19924 + }, + { + "epoch": 1.0702523060227889, + "grad_norm": 9.522202491760254, + "learning_rate": 7.3723447992325615e-06, + "loss": 0.5108, + "step": 19925 + }, + { + "epoch": 1.0702658708627237, + "grad_norm": 4.736066818237305, + "learning_rate": 7.3722077566123075e-06, + "loss": 0.2736, + "step": 19926 + }, + { + "epoch": 1.0702794357026586, + "grad_norm": 5.0245819091796875, + "learning_rate": 7.372070713992052e-06, + "loss": 0.2572, + "step": 19927 + }, + { + "epoch": 1.0702930005425937, + "grad_norm": 4.973031044006348, + "learning_rate": 7.371933671371797e-06, + "loss": 0.3255, + "step": 19928 + }, + { + "epoch": 1.0703065653825286, + "grad_norm": 5.91623067855835, + "learning_rate": 7.371796628751542e-06, + "loss": 0.3254, + "step": 19929 + }, + { + "epoch": 1.0703201302224634, + "grad_norm": 5.0445098876953125, + "learning_rate": 7.371659586131288e-06, + "loss": 0.2804, + "step": 19930 + }, + { + "epoch": 1.0703336950623983, + "grad_norm": 5.094147682189941, + "learning_rate": 7.3715225435110325e-06, + "loss": 0.2559, + "step": 19931 + }, + { + "epoch": 1.0703472599023331, + "grad_norm": 5.60631799697876, + "learning_rate": 7.371385500890778e-06, + "loss": 0.321, + "step": 19932 + }, + { + "epoch": 1.070360824742268, + "grad_norm": 4.828434944152832, + "learning_rate": 7.371248458270522e-06, + "loss": 0.254, + "step": 19933 + }, + { + "epoch": 1.0703743895822029, + "grad_norm": 7.473515510559082, + "learning_rate": 7.371111415650268e-06, + "loss": 0.3615, + "step": 19934 + }, + { + "epoch": 1.0703879544221377, + "grad_norm": 5.5420989990234375, + "learning_rate": 7.370974373030013e-06, + "loss": 0.2733, + "step": 19935 + }, + { + "epoch": 1.0704015192620726, + "grad_norm": 7.016155242919922, + "learning_rate": 7.370837330409758e-06, + "loss": 0.3266, + "step": 19936 + }, + { + "epoch": 1.0704150841020077, + "grad_norm": 3.7194366455078125, + "learning_rate": 7.370700287789503e-06, + "loss": 0.1595, + "step": 19937 + }, + { + "epoch": 1.0704286489419426, + "grad_norm": 4.802184104919434, + "learning_rate": 7.370563245169248e-06, + "loss": 0.3061, + "step": 19938 + }, + { + "epoch": 1.0704422137818774, + "grad_norm": 6.44828462600708, + "learning_rate": 7.370426202548994e-06, + "loss": 0.4852, + "step": 19939 + }, + { + "epoch": 1.0704557786218123, + "grad_norm": 5.435861110687256, + "learning_rate": 7.370289159928738e-06, + "loss": 0.3324, + "step": 19940 + }, + { + "epoch": 1.0704693434617472, + "grad_norm": 6.168603897094727, + "learning_rate": 7.3701521173084835e-06, + "loss": 0.3319, + "step": 19941 + }, + { + "epoch": 1.070482908301682, + "grad_norm": 4.914501667022705, + "learning_rate": 7.370015074688228e-06, + "loss": 0.2408, + "step": 19942 + }, + { + "epoch": 1.0704964731416169, + "grad_norm": 3.4708778858184814, + "learning_rate": 7.369878032067974e-06, + "loss": 0.1824, + "step": 19943 + }, + { + "epoch": 1.0705100379815518, + "grad_norm": 4.252392292022705, + "learning_rate": 7.369740989447719e-06, + "loss": 0.2723, + "step": 19944 + }, + { + "epoch": 1.0705236028214866, + "grad_norm": 5.214506149291992, + "learning_rate": 7.369603946827464e-06, + "loss": 0.3026, + "step": 19945 + }, + { + "epoch": 1.0705371676614215, + "grad_norm": 8.584158897399902, + "learning_rate": 7.3694669042072085e-06, + "loss": 0.3559, + "step": 19946 + }, + { + "epoch": 1.0705507325013566, + "grad_norm": 6.087651252746582, + "learning_rate": 7.3693298615869546e-06, + "loss": 0.3064, + "step": 19947 + }, + { + "epoch": 1.0705642973412914, + "grad_norm": 8.16900634765625, + "learning_rate": 7.3691928189667e-06, + "loss": 0.3723, + "step": 19948 + }, + { + "epoch": 1.0705778621812263, + "grad_norm": 4.259159088134766, + "learning_rate": 7.369055776346444e-06, + "loss": 0.2652, + "step": 19949 + }, + { + "epoch": 1.0705914270211612, + "grad_norm": 5.697803020477295, + "learning_rate": 7.368918733726189e-06, + "loss": 0.3413, + "step": 19950 + }, + { + "epoch": 1.070604991861096, + "grad_norm": 5.56087064743042, + "learning_rate": 7.368781691105934e-06, + "loss": 0.2824, + "step": 19951 + }, + { + "epoch": 1.070618556701031, + "grad_norm": 5.2379255294799805, + "learning_rate": 7.36864464848568e-06, + "loss": 0.3411, + "step": 19952 + }, + { + "epoch": 1.0706321215409658, + "grad_norm": 5.2128729820251465, + "learning_rate": 7.368507605865425e-06, + "loss": 0.2133, + "step": 19953 + }, + { + "epoch": 1.0706456863809006, + "grad_norm": 6.311440467834473, + "learning_rate": 7.36837056324517e-06, + "loss": 0.2595, + "step": 19954 + }, + { + "epoch": 1.0706592512208355, + "grad_norm": 4.456070423126221, + "learning_rate": 7.368233520624914e-06, + "loss": 0.2165, + "step": 19955 + }, + { + "epoch": 1.0706728160607706, + "grad_norm": 3.9241364002227783, + "learning_rate": 7.36809647800466e-06, + "loss": 0.1903, + "step": 19956 + }, + { + "epoch": 1.0706863809007054, + "grad_norm": 5.945341110229492, + "learning_rate": 7.3679594353844055e-06, + "loss": 0.2438, + "step": 19957 + }, + { + "epoch": 1.0706999457406403, + "grad_norm": 3.5144031047821045, + "learning_rate": 7.36782239276415e-06, + "loss": 0.1649, + "step": 19958 + }, + { + "epoch": 1.0707135105805752, + "grad_norm": 4.4644694328308105, + "learning_rate": 7.367685350143895e-06, + "loss": 0.2351, + "step": 19959 + }, + { + "epoch": 1.07072707542051, + "grad_norm": 4.2643866539001465, + "learning_rate": 7.367548307523641e-06, + "loss": 0.1815, + "step": 19960 + }, + { + "epoch": 1.070740640260445, + "grad_norm": 5.7579874992370605, + "learning_rate": 7.367411264903385e-06, + "loss": 0.2449, + "step": 19961 + }, + { + "epoch": 1.0707542051003798, + "grad_norm": 3.6755011081695557, + "learning_rate": 7.3672742222831306e-06, + "loss": 0.2171, + "step": 19962 + }, + { + "epoch": 1.0707677699403146, + "grad_norm": 6.124462604522705, + "learning_rate": 7.367137179662876e-06, + "loss": 0.2386, + "step": 19963 + }, + { + "epoch": 1.0707813347802495, + "grad_norm": 2.3646156787872314, + "learning_rate": 7.367000137042622e-06, + "loss": 0.1038, + "step": 19964 + }, + { + "epoch": 1.0707948996201844, + "grad_norm": 6.0044732093811035, + "learning_rate": 7.366863094422366e-06, + "loss": 0.3325, + "step": 19965 + }, + { + "epoch": 1.0708084644601195, + "grad_norm": 3.712888479232788, + "learning_rate": 7.366726051802111e-06, + "loss": 0.1722, + "step": 19966 + }, + { + "epoch": 1.0708220293000543, + "grad_norm": 4.482344627380371, + "learning_rate": 7.366589009181856e-06, + "loss": 0.2172, + "step": 19967 + }, + { + "epoch": 1.0708355941399892, + "grad_norm": 4.210380554199219, + "learning_rate": 7.366451966561601e-06, + "loss": 0.2938, + "step": 19968 + }, + { + "epoch": 1.070849158979924, + "grad_norm": 5.13201904296875, + "learning_rate": 7.366314923941347e-06, + "loss": 0.3433, + "step": 19969 + }, + { + "epoch": 1.070862723819859, + "grad_norm": 6.752475738525391, + "learning_rate": 7.366177881321092e-06, + "loss": 0.2482, + "step": 19970 + }, + { + "epoch": 1.0708762886597938, + "grad_norm": 5.28867244720459, + "learning_rate": 7.366040838700836e-06, + "loss": 0.3089, + "step": 19971 + }, + { + "epoch": 1.0708898534997287, + "grad_norm": 4.185189723968506, + "learning_rate": 7.3659037960805815e-06, + "loss": 0.2819, + "step": 19972 + }, + { + "epoch": 1.0709034183396635, + "grad_norm": 4.753018379211426, + "learning_rate": 7.3657667534603275e-06, + "loss": 0.3015, + "step": 19973 + }, + { + "epoch": 1.0709169831795984, + "grad_norm": 5.650609016418457, + "learning_rate": 7.365629710840072e-06, + "loss": 0.3016, + "step": 19974 + }, + { + "epoch": 1.0709305480195335, + "grad_norm": 7.10740327835083, + "learning_rate": 7.365492668219817e-06, + "loss": 0.3775, + "step": 19975 + }, + { + "epoch": 1.0709441128594683, + "grad_norm": 6.956289768218994, + "learning_rate": 7.365355625599561e-06, + "loss": 0.3536, + "step": 19976 + }, + { + "epoch": 1.0709576776994032, + "grad_norm": 6.735710144042969, + "learning_rate": 7.365218582979307e-06, + "loss": 0.4466, + "step": 19977 + }, + { + "epoch": 1.070971242539338, + "grad_norm": 5.167269706726074, + "learning_rate": 7.3650815403590526e-06, + "loss": 0.2224, + "step": 19978 + }, + { + "epoch": 1.070984807379273, + "grad_norm": 5.869156360626221, + "learning_rate": 7.364944497738798e-06, + "loss": 0.3256, + "step": 19979 + }, + { + "epoch": 1.0709983722192078, + "grad_norm": 6.847331523895264, + "learning_rate": 7.364807455118542e-06, + "loss": 0.2456, + "step": 19980 + }, + { + "epoch": 1.0710119370591427, + "grad_norm": 4.482121467590332, + "learning_rate": 7.364670412498287e-06, + "loss": 0.2693, + "step": 19981 + }, + { + "epoch": 1.0710255018990775, + "grad_norm": 7.271943092346191, + "learning_rate": 7.364533369878033e-06, + "loss": 0.4205, + "step": 19982 + }, + { + "epoch": 1.0710390667390124, + "grad_norm": 6.65131950378418, + "learning_rate": 7.364396327257778e-06, + "loss": 0.4042, + "step": 19983 + }, + { + "epoch": 1.0710526315789473, + "grad_norm": 5.564262390136719, + "learning_rate": 7.364259284637523e-06, + "loss": 0.3302, + "step": 19984 + }, + { + "epoch": 1.0710661964188823, + "grad_norm": 7.26967716217041, + "learning_rate": 7.364122242017268e-06, + "loss": 0.3543, + "step": 19985 + }, + { + "epoch": 1.0710797612588172, + "grad_norm": 5.2469682693481445, + "learning_rate": 7.363985199397013e-06, + "loss": 0.2577, + "step": 19986 + }, + { + "epoch": 1.071093326098752, + "grad_norm": 6.497317790985107, + "learning_rate": 7.363848156776758e-06, + "loss": 0.2982, + "step": 19987 + }, + { + "epoch": 1.071106890938687, + "grad_norm": 4.494437217712402, + "learning_rate": 7.3637111141565035e-06, + "loss": 0.2527, + "step": 19988 + }, + { + "epoch": 1.0711204557786218, + "grad_norm": 5.324680328369141, + "learning_rate": 7.363574071536248e-06, + "loss": 0.3156, + "step": 19989 + }, + { + "epoch": 1.0711340206185567, + "grad_norm": 6.79879903793335, + "learning_rate": 7.363437028915994e-06, + "loss": 0.3012, + "step": 19990 + }, + { + "epoch": 1.0711475854584915, + "grad_norm": 6.130378723144531, + "learning_rate": 7.363299986295739e-06, + "loss": 0.2823, + "step": 19991 + }, + { + "epoch": 1.0711611502984264, + "grad_norm": 5.074324131011963, + "learning_rate": 7.363162943675483e-06, + "loss": 0.258, + "step": 19992 + }, + { + "epoch": 1.0711747151383613, + "grad_norm": 5.228707790374756, + "learning_rate": 7.3630259010552286e-06, + "loss": 0.4082, + "step": 19993 + }, + { + "epoch": 1.0711882799782964, + "grad_norm": 6.194622039794922, + "learning_rate": 7.362888858434974e-06, + "loss": 0.4353, + "step": 19994 + }, + { + "epoch": 1.0712018448182312, + "grad_norm": 4.971776485443115, + "learning_rate": 7.362751815814719e-06, + "loss": 0.3404, + "step": 19995 + }, + { + "epoch": 1.071215409658166, + "grad_norm": 5.303466320037842, + "learning_rate": 7.362614773194464e-06, + "loss": 0.2449, + "step": 19996 + }, + { + "epoch": 1.071228974498101, + "grad_norm": 4.280148506164551, + "learning_rate": 7.362477730574209e-06, + "loss": 0.2265, + "step": 19997 + }, + { + "epoch": 1.0712425393380358, + "grad_norm": 6.448909282684326, + "learning_rate": 7.362340687953954e-06, + "loss": 0.1885, + "step": 19998 + }, + { + "epoch": 1.0712561041779707, + "grad_norm": 4.374844551086426, + "learning_rate": 7.3622036453337e-06, + "loss": 0.2115, + "step": 19999 + }, + { + "epoch": 1.0712696690179055, + "grad_norm": 4.159053325653076, + "learning_rate": 7.362066602713445e-06, + "loss": 0.4128, + "step": 20000 + }, + { + "epoch": 1.0712832338578404, + "grad_norm": 3.608896017074585, + "learning_rate": 7.361929560093189e-06, + "loss": 0.2112, + "step": 20001 + }, + { + "epoch": 1.0712967986977753, + "grad_norm": 5.418060302734375, + "learning_rate": 7.361792517472934e-06, + "loss": 0.3544, + "step": 20002 + }, + { + "epoch": 1.0713103635377101, + "grad_norm": 3.917064666748047, + "learning_rate": 7.36165547485268e-06, + "loss": 0.1334, + "step": 20003 + }, + { + "epoch": 1.0713239283776452, + "grad_norm": 8.299721717834473, + "learning_rate": 7.3615184322324255e-06, + "loss": 0.3217, + "step": 20004 + }, + { + "epoch": 1.07133749321758, + "grad_norm": 6.008577346801758, + "learning_rate": 7.36138138961217e-06, + "loss": 0.3508, + "step": 20005 + }, + { + "epoch": 1.071351058057515, + "grad_norm": 5.249358654022217, + "learning_rate": 7.361244346991915e-06, + "loss": 0.2004, + "step": 20006 + }, + { + "epoch": 1.0713646228974498, + "grad_norm": 4.514955043792725, + "learning_rate": 7.361107304371659e-06, + "loss": 0.2754, + "step": 20007 + }, + { + "epoch": 1.0713781877373847, + "grad_norm": 5.282358646392822, + "learning_rate": 7.360970261751405e-06, + "loss": 0.2741, + "step": 20008 + }, + { + "epoch": 1.0713917525773196, + "grad_norm": 5.326760292053223, + "learning_rate": 7.3608332191311506e-06, + "loss": 0.2822, + "step": 20009 + }, + { + "epoch": 1.0714053174172544, + "grad_norm": 5.161691665649414, + "learning_rate": 7.360696176510895e-06, + "loss": 0.2492, + "step": 20010 + }, + { + "epoch": 1.0714188822571893, + "grad_norm": 5.492958068847656, + "learning_rate": 7.36055913389064e-06, + "loss": 0.3775, + "step": 20011 + }, + { + "epoch": 1.0714324470971242, + "grad_norm": 5.2079033851623535, + "learning_rate": 7.360422091270386e-06, + "loss": 0.2265, + "step": 20012 + }, + { + "epoch": 1.0714460119370592, + "grad_norm": 5.493837356567383, + "learning_rate": 7.360285048650131e-06, + "loss": 0.2331, + "step": 20013 + }, + { + "epoch": 1.071459576776994, + "grad_norm": 6.159422397613525, + "learning_rate": 7.360148006029876e-06, + "loss": 0.3516, + "step": 20014 + }, + { + "epoch": 1.071473141616929, + "grad_norm": 4.568313121795654, + "learning_rate": 7.360010963409621e-06, + "loss": 0.1703, + "step": 20015 + }, + { + "epoch": 1.0714867064568638, + "grad_norm": 5.6321611404418945, + "learning_rate": 7.359873920789367e-06, + "loss": 0.2761, + "step": 20016 + }, + { + "epoch": 1.0715002712967987, + "grad_norm": 5.351968288421631, + "learning_rate": 7.359736878169111e-06, + "loss": 0.239, + "step": 20017 + }, + { + "epoch": 1.0715138361367336, + "grad_norm": 5.224619388580322, + "learning_rate": 7.359599835548856e-06, + "loss": 0.2256, + "step": 20018 + }, + { + "epoch": 1.0715274009766684, + "grad_norm": 5.252469062805176, + "learning_rate": 7.3594627929286015e-06, + "loss": 0.2575, + "step": 20019 + }, + { + "epoch": 1.0715409658166033, + "grad_norm": 4.971432685852051, + "learning_rate": 7.359325750308346e-06, + "loss": 0.3059, + "step": 20020 + }, + { + "epoch": 1.0715545306565382, + "grad_norm": 6.535022258758545, + "learning_rate": 7.359188707688092e-06, + "loss": 0.4791, + "step": 20021 + }, + { + "epoch": 1.071568095496473, + "grad_norm": 5.8447065353393555, + "learning_rate": 7.359051665067837e-06, + "loss": 0.2734, + "step": 20022 + }, + { + "epoch": 1.0715816603364081, + "grad_norm": 6.085793495178223, + "learning_rate": 7.358914622447581e-06, + "loss": 0.3228, + "step": 20023 + }, + { + "epoch": 1.071595225176343, + "grad_norm": 3.9409122467041016, + "learning_rate": 7.3587775798273266e-06, + "loss": 0.1591, + "step": 20024 + }, + { + "epoch": 1.0716087900162778, + "grad_norm": 6.0232038497924805, + "learning_rate": 7.358640537207073e-06, + "loss": 0.383, + "step": 20025 + }, + { + "epoch": 1.0716223548562127, + "grad_norm": 5.832759380340576, + "learning_rate": 7.358503494586817e-06, + "loss": 0.2911, + "step": 20026 + }, + { + "epoch": 1.0716359196961476, + "grad_norm": 5.931014060974121, + "learning_rate": 7.358366451966562e-06, + "loss": 0.2955, + "step": 20027 + }, + { + "epoch": 1.0716494845360824, + "grad_norm": 6.12196159362793, + "learning_rate": 7.358229409346307e-06, + "loss": 0.3867, + "step": 20028 + }, + { + "epoch": 1.0716630493760173, + "grad_norm": 5.06237268447876, + "learning_rate": 7.3580923667260525e-06, + "loss": 0.2106, + "step": 20029 + }, + { + "epoch": 1.0716766142159522, + "grad_norm": 5.4095587730407715, + "learning_rate": 7.357955324105798e-06, + "loss": 0.281, + "step": 20030 + }, + { + "epoch": 1.071690179055887, + "grad_norm": 6.18228816986084, + "learning_rate": 7.357818281485543e-06, + "loss": 0.386, + "step": 20031 + }, + { + "epoch": 1.0717037438958221, + "grad_norm": 5.7578935623168945, + "learning_rate": 7.357681238865287e-06, + "loss": 0.3946, + "step": 20032 + }, + { + "epoch": 1.071717308735757, + "grad_norm": 5.219448089599609, + "learning_rate": 7.357544196245033e-06, + "loss": 0.2304, + "step": 20033 + }, + { + "epoch": 1.0717308735756919, + "grad_norm": 3.9316842555999756, + "learning_rate": 7.357407153624778e-06, + "loss": 0.2811, + "step": 20034 + }, + { + "epoch": 1.0717444384156267, + "grad_norm": 6.427675724029541, + "learning_rate": 7.357270111004523e-06, + "loss": 0.3749, + "step": 20035 + }, + { + "epoch": 1.0717580032555616, + "grad_norm": 6.262622356414795, + "learning_rate": 7.357133068384268e-06, + "loss": 0.3324, + "step": 20036 + }, + { + "epoch": 1.0717715680954965, + "grad_norm": 5.61691951751709, + "learning_rate": 7.356996025764013e-06, + "loss": 0.2839, + "step": 20037 + }, + { + "epoch": 1.0717851329354313, + "grad_norm": 4.272159576416016, + "learning_rate": 7.356858983143759e-06, + "loss": 0.3336, + "step": 20038 + }, + { + "epoch": 1.0717986977753662, + "grad_norm": 5.196131706237793, + "learning_rate": 7.356721940523503e-06, + "loss": 0.2196, + "step": 20039 + }, + { + "epoch": 1.071812262615301, + "grad_norm": 6.368576526641846, + "learning_rate": 7.3565848979032486e-06, + "loss": 0.3225, + "step": 20040 + }, + { + "epoch": 1.071825827455236, + "grad_norm": 7.347773551940918, + "learning_rate": 7.356447855282993e-06, + "loss": 0.3634, + "step": 20041 + }, + { + "epoch": 1.071839392295171, + "grad_norm": 5.689134120941162, + "learning_rate": 7.356310812662739e-06, + "loss": 0.3753, + "step": 20042 + }, + { + "epoch": 1.0718529571351059, + "grad_norm": 4.521063804626465, + "learning_rate": 7.356173770042484e-06, + "loss": 0.235, + "step": 20043 + }, + { + "epoch": 1.0718665219750407, + "grad_norm": 5.7402448654174805, + "learning_rate": 7.3560367274222284e-06, + "loss": 0.3096, + "step": 20044 + }, + { + "epoch": 1.0718800868149756, + "grad_norm": 4.358186721801758, + "learning_rate": 7.355899684801974e-06, + "loss": 0.3123, + "step": 20045 + }, + { + "epoch": 1.0718936516549105, + "grad_norm": 5.829446792602539, + "learning_rate": 7.35576264218172e-06, + "loss": 0.3646, + "step": 20046 + }, + { + "epoch": 1.0719072164948453, + "grad_norm": 7.146193027496338, + "learning_rate": 7.355625599561465e-06, + "loss": 0.3548, + "step": 20047 + }, + { + "epoch": 1.0719207813347802, + "grad_norm": 5.712864875793457, + "learning_rate": 7.355488556941209e-06, + "loss": 0.3177, + "step": 20048 + }, + { + "epoch": 1.071934346174715, + "grad_norm": 5.712132453918457, + "learning_rate": 7.355351514320954e-06, + "loss": 0.3289, + "step": 20049 + }, + { + "epoch": 1.07194791101465, + "grad_norm": 9.624319076538086, + "learning_rate": 7.355214471700699e-06, + "loss": 0.5533, + "step": 20050 + }, + { + "epoch": 1.071961475854585, + "grad_norm": 4.7054009437561035, + "learning_rate": 7.355077429080445e-06, + "loss": 0.2751, + "step": 20051 + }, + { + "epoch": 1.0719750406945199, + "grad_norm": 6.517271041870117, + "learning_rate": 7.35494038646019e-06, + "loss": 0.3067, + "step": 20052 + }, + { + "epoch": 1.0719886055344547, + "grad_norm": 5.084458351135254, + "learning_rate": 7.354803343839935e-06, + "loss": 0.2628, + "step": 20053 + }, + { + "epoch": 1.0720021703743896, + "grad_norm": 6.715068340301514, + "learning_rate": 7.354666301219679e-06, + "loss": 0.2948, + "step": 20054 + }, + { + "epoch": 1.0720157352143245, + "grad_norm": 5.372523784637451, + "learning_rate": 7.354529258599425e-06, + "loss": 0.3675, + "step": 20055 + }, + { + "epoch": 1.0720293000542593, + "grad_norm": 6.456425189971924, + "learning_rate": 7.354392215979171e-06, + "loss": 0.3488, + "step": 20056 + }, + { + "epoch": 1.0720428648941942, + "grad_norm": 6.0656819343566895, + "learning_rate": 7.354255173358915e-06, + "loss": 0.3368, + "step": 20057 + }, + { + "epoch": 1.072056429734129, + "grad_norm": 7.589933395385742, + "learning_rate": 7.35411813073866e-06, + "loss": 0.3552, + "step": 20058 + }, + { + "epoch": 1.072069994574064, + "grad_norm": 6.359954833984375, + "learning_rate": 7.353981088118406e-06, + "loss": 0.4601, + "step": 20059 + }, + { + "epoch": 1.0720835594139988, + "grad_norm": 4.486208915710449, + "learning_rate": 7.3538440454981505e-06, + "loss": 0.3056, + "step": 20060 + }, + { + "epoch": 1.0720971242539339, + "grad_norm": 3.7811901569366455, + "learning_rate": 7.353707002877896e-06, + "loss": 0.2021, + "step": 20061 + }, + { + "epoch": 1.0721106890938688, + "grad_norm": 6.687878608703613, + "learning_rate": 7.353569960257641e-06, + "loss": 0.3591, + "step": 20062 + }, + { + "epoch": 1.0721242539338036, + "grad_norm": 4.662410259246826, + "learning_rate": 7.353432917637385e-06, + "loss": 0.2834, + "step": 20063 + }, + { + "epoch": 1.0721378187737385, + "grad_norm": 5.091307640075684, + "learning_rate": 7.353295875017131e-06, + "loss": 0.2342, + "step": 20064 + }, + { + "epoch": 1.0721513836136733, + "grad_norm": 5.872945308685303, + "learning_rate": 7.353158832396876e-06, + "loss": 0.3243, + "step": 20065 + }, + { + "epoch": 1.0721649484536082, + "grad_norm": 9.308130264282227, + "learning_rate": 7.353021789776621e-06, + "loss": 0.2788, + "step": 20066 + }, + { + "epoch": 1.072178513293543, + "grad_norm": 6.398244380950928, + "learning_rate": 7.352884747156366e-06, + "loss": 0.5544, + "step": 20067 + }, + { + "epoch": 1.072192078133478, + "grad_norm": 3.8805980682373047, + "learning_rate": 7.352747704536112e-06, + "loss": 0.2179, + "step": 20068 + }, + { + "epoch": 1.072205642973413, + "grad_norm": 8.677292823791504, + "learning_rate": 7.352610661915856e-06, + "loss": 0.373, + "step": 20069 + }, + { + "epoch": 1.072219207813348, + "grad_norm": 4.979426383972168, + "learning_rate": 7.352473619295601e-06, + "loss": 0.4077, + "step": 20070 + }, + { + "epoch": 1.0722327726532828, + "grad_norm": 4.65905237197876, + "learning_rate": 7.352336576675347e-06, + "loss": 0.303, + "step": 20071 + }, + { + "epoch": 1.0722463374932176, + "grad_norm": 4.939396381378174, + "learning_rate": 7.352199534055093e-06, + "loss": 0.1802, + "step": 20072 + }, + { + "epoch": 1.0722599023331525, + "grad_norm": 6.303566932678223, + "learning_rate": 7.352062491434837e-06, + "loss": 0.4576, + "step": 20073 + }, + { + "epoch": 1.0722734671730874, + "grad_norm": 4.5602030754089355, + "learning_rate": 7.351925448814582e-06, + "loss": 0.35, + "step": 20074 + }, + { + "epoch": 1.0722870320130222, + "grad_norm": 4.429572582244873, + "learning_rate": 7.3517884061943264e-06, + "loss": 0.1779, + "step": 20075 + }, + { + "epoch": 1.072300596852957, + "grad_norm": 4.751459121704102, + "learning_rate": 7.351651363574072e-06, + "loss": 0.3453, + "step": 20076 + }, + { + "epoch": 1.072314161692892, + "grad_norm": 5.236741542816162, + "learning_rate": 7.351514320953818e-06, + "loss": 0.2706, + "step": 20077 + }, + { + "epoch": 1.0723277265328268, + "grad_norm": 4.60523796081543, + "learning_rate": 7.351377278333562e-06, + "loss": 0.2724, + "step": 20078 + }, + { + "epoch": 1.0723412913727617, + "grad_norm": 4.9856858253479, + "learning_rate": 7.351240235713307e-06, + "loss": 0.3137, + "step": 20079 + }, + { + "epoch": 1.0723548562126968, + "grad_norm": 5.607289791107178, + "learning_rate": 7.351103193093052e-06, + "loss": 0.3201, + "step": 20080 + }, + { + "epoch": 1.0723684210526316, + "grad_norm": 5.953529357910156, + "learning_rate": 7.350966150472798e-06, + "loss": 0.2589, + "step": 20081 + }, + { + "epoch": 1.0723819858925665, + "grad_norm": 4.296003341674805, + "learning_rate": 7.350829107852543e-06, + "loss": 0.279, + "step": 20082 + }, + { + "epoch": 1.0723955507325014, + "grad_norm": 5.614505767822266, + "learning_rate": 7.350692065232288e-06, + "loss": 0.3625, + "step": 20083 + }, + { + "epoch": 1.0724091155724362, + "grad_norm": 4.956117630004883, + "learning_rate": 7.350555022612032e-06, + "loss": 0.2651, + "step": 20084 + }, + { + "epoch": 1.072422680412371, + "grad_norm": 4.858304023742676, + "learning_rate": 7.350417979991778e-06, + "loss": 0.2519, + "step": 20085 + }, + { + "epoch": 1.072436245252306, + "grad_norm": 6.63109827041626, + "learning_rate": 7.350280937371523e-06, + "loss": 0.3797, + "step": 20086 + }, + { + "epoch": 1.0724498100922408, + "grad_norm": 6.638239860534668, + "learning_rate": 7.350143894751269e-06, + "loss": 0.4419, + "step": 20087 + }, + { + "epoch": 1.072463374932176, + "grad_norm": 7.5586676597595215, + "learning_rate": 7.350006852131013e-06, + "loss": 0.3759, + "step": 20088 + }, + { + "epoch": 1.0724769397721108, + "grad_norm": 6.287164211273193, + "learning_rate": 7.349869809510758e-06, + "loss": 0.5421, + "step": 20089 + }, + { + "epoch": 1.0724905046120456, + "grad_norm": 5.647224426269531, + "learning_rate": 7.349732766890504e-06, + "loss": 0.3025, + "step": 20090 + }, + { + "epoch": 1.0725040694519805, + "grad_norm": 7.5230302810668945, + "learning_rate": 7.3495957242702485e-06, + "loss": 0.3082, + "step": 20091 + }, + { + "epoch": 1.0725176342919154, + "grad_norm": 5.393629550933838, + "learning_rate": 7.349458681649994e-06, + "loss": 0.1609, + "step": 20092 + }, + { + "epoch": 1.0725311991318502, + "grad_norm": 6.185021877288818, + "learning_rate": 7.349321639029738e-06, + "loss": 0.347, + "step": 20093 + }, + { + "epoch": 1.072544763971785, + "grad_norm": 4.289083003997803, + "learning_rate": 7.349184596409484e-06, + "loss": 0.2387, + "step": 20094 + }, + { + "epoch": 1.07255832881172, + "grad_norm": 6.376190185546875, + "learning_rate": 7.349047553789229e-06, + "loss": 0.3652, + "step": 20095 + }, + { + "epoch": 1.0725718936516548, + "grad_norm": 5.649677276611328, + "learning_rate": 7.348910511168974e-06, + "loss": 0.2322, + "step": 20096 + }, + { + "epoch": 1.0725854584915897, + "grad_norm": 5.99346399307251, + "learning_rate": 7.348773468548719e-06, + "loss": 0.3278, + "step": 20097 + }, + { + "epoch": 1.0725990233315246, + "grad_norm": 4.387596130371094, + "learning_rate": 7.348636425928465e-06, + "loss": 0.1801, + "step": 20098 + }, + { + "epoch": 1.0726125881714597, + "grad_norm": 5.627522945404053, + "learning_rate": 7.34849938330821e-06, + "loss": 0.2577, + "step": 20099 + }, + { + "epoch": 1.0726261530113945, + "grad_norm": 6.798850059509277, + "learning_rate": 7.348362340687954e-06, + "loss": 0.3007, + "step": 20100 + }, + { + "epoch": 1.0726397178513294, + "grad_norm": 7.716589450836182, + "learning_rate": 7.348225298067699e-06, + "loss": 0.3472, + "step": 20101 + }, + { + "epoch": 1.0726532826912643, + "grad_norm": 3.9325313568115234, + "learning_rate": 7.3480882554474454e-06, + "loss": 0.1717, + "step": 20102 + }, + { + "epoch": 1.0726668475311991, + "grad_norm": 6.010471820831299, + "learning_rate": 7.34795121282719e-06, + "loss": 0.3948, + "step": 20103 + }, + { + "epoch": 1.072680412371134, + "grad_norm": 4.987639427185059, + "learning_rate": 7.347814170206935e-06, + "loss": 0.3721, + "step": 20104 + }, + { + "epoch": 1.0726939772110688, + "grad_norm": 6.168851375579834, + "learning_rate": 7.34767712758668e-06, + "loss": 0.2953, + "step": 20105 + }, + { + "epoch": 1.0727075420510037, + "grad_norm": 7.537445545196533, + "learning_rate": 7.3475400849664245e-06, + "loss": 0.4395, + "step": 20106 + }, + { + "epoch": 1.0727211068909388, + "grad_norm": 6.774405479431152, + "learning_rate": 7.3474030423461705e-06, + "loss": 0.2885, + "step": 20107 + }, + { + "epoch": 1.0727346717308737, + "grad_norm": 5.259737014770508, + "learning_rate": 7.347265999725916e-06, + "loss": 0.35, + "step": 20108 + }, + { + "epoch": 1.0727482365708085, + "grad_norm": 6.400651931762695, + "learning_rate": 7.34712895710566e-06, + "loss": 0.3323, + "step": 20109 + }, + { + "epoch": 1.0727618014107434, + "grad_norm": 6.637699127197266, + "learning_rate": 7.346991914485405e-06, + "loss": 0.4201, + "step": 20110 + }, + { + "epoch": 1.0727753662506783, + "grad_norm": 6.13530969619751, + "learning_rate": 7.346854871865151e-06, + "loss": 0.2992, + "step": 20111 + }, + { + "epoch": 1.0727889310906131, + "grad_norm": 8.454144477844238, + "learning_rate": 7.346717829244896e-06, + "loss": 0.3871, + "step": 20112 + }, + { + "epoch": 1.072802495930548, + "grad_norm": 6.70395565032959, + "learning_rate": 7.346580786624641e-06, + "loss": 0.314, + "step": 20113 + }, + { + "epoch": 1.0728160607704829, + "grad_norm": 7.71892786026001, + "learning_rate": 7.346443744004386e-06, + "loss": 0.2517, + "step": 20114 + }, + { + "epoch": 1.0728296256104177, + "grad_norm": 6.561966896057129, + "learning_rate": 7.346306701384132e-06, + "loss": 0.3244, + "step": 20115 + }, + { + "epoch": 1.0728431904503526, + "grad_norm": 7.509285926818848, + "learning_rate": 7.346169658763876e-06, + "loss": 0.2988, + "step": 20116 + }, + { + "epoch": 1.0728567552902877, + "grad_norm": 6.849177837371826, + "learning_rate": 7.346032616143621e-06, + "loss": 0.3692, + "step": 20117 + }, + { + "epoch": 1.0728703201302225, + "grad_norm": 6.334578037261963, + "learning_rate": 7.345895573523366e-06, + "loss": 0.2403, + "step": 20118 + }, + { + "epoch": 1.0728838849701574, + "grad_norm": 5.287463665008545, + "learning_rate": 7.345758530903111e-06, + "loss": 0.3526, + "step": 20119 + }, + { + "epoch": 1.0728974498100923, + "grad_norm": 5.227907657623291, + "learning_rate": 7.345621488282857e-06, + "loss": 0.3646, + "step": 20120 + }, + { + "epoch": 1.0729110146500271, + "grad_norm": 5.412855625152588, + "learning_rate": 7.345484445662602e-06, + "loss": 0.231, + "step": 20121 + }, + { + "epoch": 1.072924579489962, + "grad_norm": 7.1987624168396, + "learning_rate": 7.3453474030423465e-06, + "loss": 0.253, + "step": 20122 + }, + { + "epoch": 1.0729381443298969, + "grad_norm": 6.824717998504639, + "learning_rate": 7.345210360422092e-06, + "loss": 0.2648, + "step": 20123 + }, + { + "epoch": 1.0729517091698317, + "grad_norm": 7.860110282897949, + "learning_rate": 7.345073317801838e-06, + "loss": 0.2305, + "step": 20124 + }, + { + "epoch": 1.0729652740097666, + "grad_norm": 6.114857196807861, + "learning_rate": 7.344936275181582e-06, + "loss": 0.3498, + "step": 20125 + }, + { + "epoch": 1.0729788388497017, + "grad_norm": 4.596642971038818, + "learning_rate": 7.344799232561327e-06, + "loss": 0.1666, + "step": 20126 + }, + { + "epoch": 1.0729924036896366, + "grad_norm": 5.4622015953063965, + "learning_rate": 7.344662189941072e-06, + "loss": 0.3276, + "step": 20127 + }, + { + "epoch": 1.0730059685295714, + "grad_norm": 5.987165927886963, + "learning_rate": 7.3445251473208175e-06, + "loss": 0.256, + "step": 20128 + }, + { + "epoch": 1.0730195333695063, + "grad_norm": 8.924687385559082, + "learning_rate": 7.344388104700563e-06, + "loss": 0.6518, + "step": 20129 + }, + { + "epoch": 1.0730330982094411, + "grad_norm": 6.792148113250732, + "learning_rate": 7.344251062080308e-06, + "loss": 0.4046, + "step": 20130 + }, + { + "epoch": 1.073046663049376, + "grad_norm": 6.572427749633789, + "learning_rate": 7.344114019460052e-06, + "loss": 0.251, + "step": 20131 + }, + { + "epoch": 1.0730602278893109, + "grad_norm": 7.03237247467041, + "learning_rate": 7.343976976839797e-06, + "loss": 0.3359, + "step": 20132 + }, + { + "epoch": 1.0730737927292457, + "grad_norm": 7.64322566986084, + "learning_rate": 7.3438399342195434e-06, + "loss": 0.401, + "step": 20133 + }, + { + "epoch": 1.0730873575691806, + "grad_norm": 7.867157459259033, + "learning_rate": 7.343702891599288e-06, + "loss": 0.4557, + "step": 20134 + }, + { + "epoch": 1.0731009224091155, + "grad_norm": 5.462695121765137, + "learning_rate": 7.343565848979033e-06, + "loss": 0.3236, + "step": 20135 + }, + { + "epoch": 1.0731144872490506, + "grad_norm": 7.678565502166748, + "learning_rate": 7.343428806358778e-06, + "loss": 0.3314, + "step": 20136 + }, + { + "epoch": 1.0731280520889854, + "grad_norm": 5.978720664978027, + "learning_rate": 7.343291763738523e-06, + "loss": 0.3195, + "step": 20137 + }, + { + "epoch": 1.0731416169289203, + "grad_norm": 7.961789131164551, + "learning_rate": 7.3431547211182685e-06, + "loss": 0.8091, + "step": 20138 + }, + { + "epoch": 1.0731551817688552, + "grad_norm": 5.212693214416504, + "learning_rate": 7.343017678498014e-06, + "loss": 0.1852, + "step": 20139 + }, + { + "epoch": 1.07316874660879, + "grad_norm": 4.941150188446045, + "learning_rate": 7.342880635877758e-06, + "loss": 0.3188, + "step": 20140 + }, + { + "epoch": 1.073182311448725, + "grad_norm": 6.975391387939453, + "learning_rate": 7.342743593257504e-06, + "loss": 0.3918, + "step": 20141 + }, + { + "epoch": 1.0731958762886598, + "grad_norm": 5.8399553298950195, + "learning_rate": 7.342606550637249e-06, + "loss": 0.2504, + "step": 20142 + }, + { + "epoch": 1.0732094411285946, + "grad_norm": 6.578885078430176, + "learning_rate": 7.3424695080169935e-06, + "loss": 0.2887, + "step": 20143 + }, + { + "epoch": 1.0732230059685295, + "grad_norm": 5.617358207702637, + "learning_rate": 7.342332465396739e-06, + "loss": 0.2615, + "step": 20144 + }, + { + "epoch": 1.0732365708084646, + "grad_norm": 5.775045394897461, + "learning_rate": 7.342195422776484e-06, + "loss": 0.2884, + "step": 20145 + }, + { + "epoch": 1.0732501356483994, + "grad_norm": 6.2220659255981445, + "learning_rate": 7.34205838015623e-06, + "loss": 0.3016, + "step": 20146 + }, + { + "epoch": 1.0732637004883343, + "grad_norm": 5.861302375793457, + "learning_rate": 7.341921337535974e-06, + "loss": 0.3953, + "step": 20147 + }, + { + "epoch": 1.0732772653282692, + "grad_norm": 6.078035831451416, + "learning_rate": 7.3417842949157194e-06, + "loss": 0.5567, + "step": 20148 + }, + { + "epoch": 1.073290830168204, + "grad_norm": 6.150425434112549, + "learning_rate": 7.341647252295464e-06, + "loss": 0.2207, + "step": 20149 + }, + { + "epoch": 1.073304395008139, + "grad_norm": 6.058618068695068, + "learning_rate": 7.34151020967521e-06, + "loss": 0.3307, + "step": 20150 + }, + { + "epoch": 1.0733179598480738, + "grad_norm": 6.535989284515381, + "learning_rate": 7.341373167054955e-06, + "loss": 0.2333, + "step": 20151 + }, + { + "epoch": 1.0733315246880086, + "grad_norm": 8.431034088134766, + "learning_rate": 7.341236124434699e-06, + "loss": 0.5154, + "step": 20152 + }, + { + "epoch": 1.0733450895279435, + "grad_norm": 6.2913737297058105, + "learning_rate": 7.3410990818144445e-06, + "loss": 0.2309, + "step": 20153 + }, + { + "epoch": 1.0733586543678784, + "grad_norm": 5.983887195587158, + "learning_rate": 7.3409620391941905e-06, + "loss": 0.2831, + "step": 20154 + }, + { + "epoch": 1.0733722192078134, + "grad_norm": 5.022743225097656, + "learning_rate": 7.340824996573936e-06, + "loss": 0.2032, + "step": 20155 + }, + { + "epoch": 1.0733857840477483, + "grad_norm": 4.733636856079102, + "learning_rate": 7.34068795395368e-06, + "loss": 0.2566, + "step": 20156 + }, + { + "epoch": 1.0733993488876832, + "grad_norm": 5.321532726287842, + "learning_rate": 7.340550911333425e-06, + "loss": 0.2861, + "step": 20157 + }, + { + "epoch": 1.073412913727618, + "grad_norm": 4.648841381072998, + "learning_rate": 7.3404138687131695e-06, + "loss": 0.2186, + "step": 20158 + }, + { + "epoch": 1.073426478567553, + "grad_norm": 5.192992687225342, + "learning_rate": 7.3402768260929155e-06, + "loss": 0.3471, + "step": 20159 + }, + { + "epoch": 1.0734400434074878, + "grad_norm": 5.809813499450684, + "learning_rate": 7.340139783472661e-06, + "loss": 0.2978, + "step": 20160 + }, + { + "epoch": 1.0734536082474226, + "grad_norm": 5.277416706085205, + "learning_rate": 7.340002740852406e-06, + "loss": 0.2339, + "step": 20161 + }, + { + "epoch": 1.0734671730873575, + "grad_norm": 7.202390670776367, + "learning_rate": 7.33986569823215e-06, + "loss": 0.2569, + "step": 20162 + }, + { + "epoch": 1.0734807379272924, + "grad_norm": 3.81382155418396, + "learning_rate": 7.339728655611896e-06, + "loss": 0.2238, + "step": 20163 + }, + { + "epoch": 1.0734943027672275, + "grad_norm": 5.544862270355225, + "learning_rate": 7.3395916129916414e-06, + "loss": 0.3547, + "step": 20164 + }, + { + "epoch": 1.0735078676071623, + "grad_norm": 5.179678916931152, + "learning_rate": 7.339454570371386e-06, + "loss": 0.2212, + "step": 20165 + }, + { + "epoch": 1.0735214324470972, + "grad_norm": 6.1062822341918945, + "learning_rate": 7.339317527751131e-06, + "loss": 0.2868, + "step": 20166 + }, + { + "epoch": 1.073534997287032, + "grad_norm": 5.249023914337158, + "learning_rate": 7.339180485130877e-06, + "loss": 0.324, + "step": 20167 + }, + { + "epoch": 1.073548562126967, + "grad_norm": 4.998528480529785, + "learning_rate": 7.339043442510621e-06, + "loss": 0.2942, + "step": 20168 + }, + { + "epoch": 1.0735621269669018, + "grad_norm": 5.562494277954102, + "learning_rate": 7.3389063998903665e-06, + "loss": 0.2845, + "step": 20169 + }, + { + "epoch": 1.0735756918068367, + "grad_norm": 5.065275192260742, + "learning_rate": 7.338769357270112e-06, + "loss": 0.2709, + "step": 20170 + }, + { + "epoch": 1.0735892566467715, + "grad_norm": 6.116713523864746, + "learning_rate": 7.338632314649857e-06, + "loss": 0.3396, + "step": 20171 + }, + { + "epoch": 1.0736028214867064, + "grad_norm": 5.188066005706787, + "learning_rate": 7.338495272029602e-06, + "loss": 0.3026, + "step": 20172 + }, + { + "epoch": 1.0736163863266412, + "grad_norm": 5.523085594177246, + "learning_rate": 7.338358229409347e-06, + "loss": 0.2437, + "step": 20173 + }, + { + "epoch": 1.0736299511665763, + "grad_norm": 4.893793106079102, + "learning_rate": 7.3382211867890915e-06, + "loss": 0.2834, + "step": 20174 + }, + { + "epoch": 1.0736435160065112, + "grad_norm": 6.5374040603637695, + "learning_rate": 7.338084144168837e-06, + "loss": 0.3231, + "step": 20175 + }, + { + "epoch": 1.073657080846446, + "grad_norm": 5.04859733581543, + "learning_rate": 7.337947101548583e-06, + "loss": 0.2464, + "step": 20176 + }, + { + "epoch": 1.073670645686381, + "grad_norm": 4.897261619567871, + "learning_rate": 7.337810058928327e-06, + "loss": 0.1549, + "step": 20177 + }, + { + "epoch": 1.0736842105263158, + "grad_norm": 4.462887763977051, + "learning_rate": 7.337673016308072e-06, + "loss": 0.418, + "step": 20178 + }, + { + "epoch": 1.0736977753662507, + "grad_norm": 6.1157636642456055, + "learning_rate": 7.3375359736878174e-06, + "loss": 0.3138, + "step": 20179 + }, + { + "epoch": 1.0737113402061855, + "grad_norm": 7.081308841705322, + "learning_rate": 7.3373989310675635e-06, + "loss": 0.377, + "step": 20180 + }, + { + "epoch": 1.0737249050461204, + "grad_norm": 4.553286075592041, + "learning_rate": 7.337261888447308e-06, + "loss": 0.2693, + "step": 20181 + }, + { + "epoch": 1.0737384698860553, + "grad_norm": 5.620932579040527, + "learning_rate": 7.337124845827053e-06, + "loss": 0.4487, + "step": 20182 + }, + { + "epoch": 1.0737520347259903, + "grad_norm": 5.43044900894165, + "learning_rate": 7.336987803206797e-06, + "loss": 0.3334, + "step": 20183 + }, + { + "epoch": 1.0737655995659252, + "grad_norm": 8.064385414123535, + "learning_rate": 7.336850760586543e-06, + "loss": 0.2367, + "step": 20184 + }, + { + "epoch": 1.07377916440586, + "grad_norm": 3.913902997970581, + "learning_rate": 7.3367137179662885e-06, + "loss": 0.2335, + "step": 20185 + }, + { + "epoch": 1.073792729245795, + "grad_norm": 5.7010040283203125, + "learning_rate": 7.336576675346033e-06, + "loss": 0.2622, + "step": 20186 + }, + { + "epoch": 1.0738062940857298, + "grad_norm": 6.754715442657471, + "learning_rate": 7.336439632725778e-06, + "loss": 0.3313, + "step": 20187 + }, + { + "epoch": 1.0738198589256647, + "grad_norm": 4.998048305511475, + "learning_rate": 7.336302590105523e-06, + "loss": 0.2735, + "step": 20188 + }, + { + "epoch": 1.0738334237655995, + "grad_norm": 4.774942398071289, + "learning_rate": 7.336165547485269e-06, + "loss": 0.2568, + "step": 20189 + }, + { + "epoch": 1.0738469886055344, + "grad_norm": 4.818658828735352, + "learning_rate": 7.3360285048650136e-06, + "loss": 0.2084, + "step": 20190 + }, + { + "epoch": 1.0738605534454693, + "grad_norm": 7.352548122406006, + "learning_rate": 7.335891462244759e-06, + "loss": 0.5706, + "step": 20191 + }, + { + "epoch": 1.0738741182854041, + "grad_norm": 4.679321765899658, + "learning_rate": 7.335754419624503e-06, + "loss": 0.2142, + "step": 20192 + }, + { + "epoch": 1.0738876831253392, + "grad_norm": 4.405388832092285, + "learning_rate": 7.335617377004249e-06, + "loss": 0.2738, + "step": 20193 + }, + { + "epoch": 1.073901247965274, + "grad_norm": 6.407062530517578, + "learning_rate": 7.335480334383994e-06, + "loss": 0.3561, + "step": 20194 + }, + { + "epoch": 1.073914812805209, + "grad_norm": 6.745220184326172, + "learning_rate": 7.3353432917637394e-06, + "loss": 0.346, + "step": 20195 + }, + { + "epoch": 1.0739283776451438, + "grad_norm": 4.9474873542785645, + "learning_rate": 7.335206249143484e-06, + "loss": 0.3219, + "step": 20196 + }, + { + "epoch": 1.0739419424850787, + "grad_norm": 4.626339912414551, + "learning_rate": 7.33506920652323e-06, + "loss": 0.2809, + "step": 20197 + }, + { + "epoch": 1.0739555073250135, + "grad_norm": 6.938333034515381, + "learning_rate": 7.334932163902975e-06, + "loss": 0.2708, + "step": 20198 + }, + { + "epoch": 1.0739690721649484, + "grad_norm": 5.2782769203186035, + "learning_rate": 7.334795121282719e-06, + "loss": 0.3154, + "step": 20199 + }, + { + "epoch": 1.0739826370048833, + "grad_norm": 4.551242828369141, + "learning_rate": 7.3346580786624645e-06, + "loss": 0.2167, + "step": 20200 + }, + { + "epoch": 1.0739962018448181, + "grad_norm": 4.946776866912842, + "learning_rate": 7.334521036042209e-06, + "loss": 0.2083, + "step": 20201 + }, + { + "epoch": 1.0740097666847532, + "grad_norm": 4.291856288909912, + "learning_rate": 7.334383993421955e-06, + "loss": 0.2626, + "step": 20202 + }, + { + "epoch": 1.074023331524688, + "grad_norm": 5.741386413574219, + "learning_rate": 7.3342469508017e-06, + "loss": 0.2869, + "step": 20203 + }, + { + "epoch": 1.074036896364623, + "grad_norm": 4.758274555206299, + "learning_rate": 7.334109908181445e-06, + "loss": 0.1362, + "step": 20204 + }, + { + "epoch": 1.0740504612045578, + "grad_norm": 4.660003185272217, + "learning_rate": 7.3339728655611895e-06, + "loss": 0.2072, + "step": 20205 + }, + { + "epoch": 1.0740640260444927, + "grad_norm": 5.79037618637085, + "learning_rate": 7.3338358229409356e-06, + "loss": 0.308, + "step": 20206 + }, + { + "epoch": 1.0740775908844276, + "grad_norm": 5.000626564025879, + "learning_rate": 7.333698780320681e-06, + "loss": 0.2757, + "step": 20207 + }, + { + "epoch": 1.0740911557243624, + "grad_norm": 4.803920269012451, + "learning_rate": 7.333561737700425e-06, + "loss": 0.2981, + "step": 20208 + }, + { + "epoch": 1.0741047205642973, + "grad_norm": 7.175632476806641, + "learning_rate": 7.33342469508017e-06, + "loss": 0.3404, + "step": 20209 + }, + { + "epoch": 1.0741182854042322, + "grad_norm": 3.3206393718719482, + "learning_rate": 7.333287652459916e-06, + "loss": 0.1295, + "step": 20210 + }, + { + "epoch": 1.074131850244167, + "grad_norm": 5.122354030609131, + "learning_rate": 7.333150609839661e-06, + "loss": 0.3017, + "step": 20211 + }, + { + "epoch": 1.074145415084102, + "grad_norm": 3.95961594581604, + "learning_rate": 7.333013567219406e-06, + "loss": 0.1772, + "step": 20212 + }, + { + "epoch": 1.074158979924037, + "grad_norm": 5.104889869689941, + "learning_rate": 7.332876524599151e-06, + "loss": 0.3505, + "step": 20213 + }, + { + "epoch": 1.0741725447639718, + "grad_norm": 4.157309532165527, + "learning_rate": 7.332739481978895e-06, + "loss": 0.2223, + "step": 20214 + }, + { + "epoch": 1.0741861096039067, + "grad_norm": 5.157149314880371, + "learning_rate": 7.332602439358641e-06, + "loss": 0.1936, + "step": 20215 + }, + { + "epoch": 1.0741996744438416, + "grad_norm": 5.375901222229004, + "learning_rate": 7.3324653967383865e-06, + "loss": 0.3501, + "step": 20216 + }, + { + "epoch": 1.0742132392837764, + "grad_norm": 3.3777551651000977, + "learning_rate": 7.332328354118131e-06, + "loss": 0.1202, + "step": 20217 + }, + { + "epoch": 1.0742268041237113, + "grad_norm": 3.4828271865844727, + "learning_rate": 7.332191311497876e-06, + "loss": 0.1729, + "step": 20218 + }, + { + "epoch": 1.0742403689636462, + "grad_norm": 3.8290631771087646, + "learning_rate": 7.332054268877622e-06, + "loss": 0.2196, + "step": 20219 + }, + { + "epoch": 1.074253933803581, + "grad_norm": 6.183184623718262, + "learning_rate": 7.331917226257366e-06, + "loss": 0.1913, + "step": 20220 + }, + { + "epoch": 1.0742674986435161, + "grad_norm": 3.5143256187438965, + "learning_rate": 7.3317801836371116e-06, + "loss": 0.1684, + "step": 20221 + }, + { + "epoch": 1.074281063483451, + "grad_norm": 3.7550437450408936, + "learning_rate": 7.331643141016857e-06, + "loss": 0.1859, + "step": 20222 + }, + { + "epoch": 1.0742946283233858, + "grad_norm": 4.564441204071045, + "learning_rate": 7.331506098396603e-06, + "loss": 0.2206, + "step": 20223 + }, + { + "epoch": 1.0743081931633207, + "grad_norm": 4.717051982879639, + "learning_rate": 7.331369055776347e-06, + "loss": 0.2182, + "step": 20224 + }, + { + "epoch": 1.0743217580032556, + "grad_norm": 3.6513659954071045, + "learning_rate": 7.331232013156092e-06, + "loss": 0.1733, + "step": 20225 + }, + { + "epoch": 1.0743353228431904, + "grad_norm": 3.5019125938415527, + "learning_rate": 7.331094970535837e-06, + "loss": 0.2055, + "step": 20226 + }, + { + "epoch": 1.0743488876831253, + "grad_norm": 2.9652187824249268, + "learning_rate": 7.330957927915582e-06, + "loss": 0.1355, + "step": 20227 + }, + { + "epoch": 1.0743624525230602, + "grad_norm": 5.042046070098877, + "learning_rate": 7.330820885295328e-06, + "loss": 0.2501, + "step": 20228 + }, + { + "epoch": 1.074376017362995, + "grad_norm": 3.611433506011963, + "learning_rate": 7.330683842675073e-06, + "loss": 0.1753, + "step": 20229 + }, + { + "epoch": 1.07438958220293, + "grad_norm": 3.5281522274017334, + "learning_rate": 7.330546800054817e-06, + "loss": 0.144, + "step": 20230 + }, + { + "epoch": 1.074403147042865, + "grad_norm": 4.177168846130371, + "learning_rate": 7.3304097574345625e-06, + "loss": 0.241, + "step": 20231 + }, + { + "epoch": 1.0744167118827999, + "grad_norm": 3.870222568511963, + "learning_rate": 7.3302727148143085e-06, + "loss": 0.2148, + "step": 20232 + }, + { + "epoch": 1.0744302767227347, + "grad_norm": 4.180438995361328, + "learning_rate": 7.330135672194053e-06, + "loss": 0.1884, + "step": 20233 + }, + { + "epoch": 1.0744438415626696, + "grad_norm": 6.757600784301758, + "learning_rate": 7.329998629573798e-06, + "loss": 0.4299, + "step": 20234 + }, + { + "epoch": 1.0744574064026045, + "grad_norm": 5.676223278045654, + "learning_rate": 7.329861586953542e-06, + "loss": 0.285, + "step": 20235 + }, + { + "epoch": 1.0744709712425393, + "grad_norm": 5.462081432342529, + "learning_rate": 7.329724544333288e-06, + "loss": 0.3623, + "step": 20236 + }, + { + "epoch": 1.0744845360824742, + "grad_norm": 3.7293519973754883, + "learning_rate": 7.3295875017130336e-06, + "loss": 0.1731, + "step": 20237 + }, + { + "epoch": 1.074498100922409, + "grad_norm": 3.2034952640533447, + "learning_rate": 7.329450459092779e-06, + "loss": 0.1658, + "step": 20238 + }, + { + "epoch": 1.074511665762344, + "grad_norm": 5.0164361000061035, + "learning_rate": 7.329313416472523e-06, + "loss": 0.2684, + "step": 20239 + }, + { + "epoch": 1.074525230602279, + "grad_norm": 5.349216938018799, + "learning_rate": 7.329176373852268e-06, + "loss": 0.2025, + "step": 20240 + }, + { + "epoch": 1.0745387954422139, + "grad_norm": 3.285724401473999, + "learning_rate": 7.329039331232014e-06, + "loss": 0.12, + "step": 20241 + }, + { + "epoch": 1.0745523602821487, + "grad_norm": 3.7460672855377197, + "learning_rate": 7.328902288611759e-06, + "loss": 0.1461, + "step": 20242 + }, + { + "epoch": 1.0745659251220836, + "grad_norm": 4.933957576751709, + "learning_rate": 7.328765245991504e-06, + "loss": 0.1983, + "step": 20243 + }, + { + "epoch": 1.0745794899620185, + "grad_norm": 6.61572790145874, + "learning_rate": 7.328628203371249e-06, + "loss": 0.2808, + "step": 20244 + }, + { + "epoch": 1.0745930548019533, + "grad_norm": 4.171508312225342, + "learning_rate": 7.328491160750994e-06, + "loss": 0.2531, + "step": 20245 + }, + { + "epoch": 1.0746066196418882, + "grad_norm": 4.290188789367676, + "learning_rate": 7.328354118130739e-06, + "loss": 0.2179, + "step": 20246 + }, + { + "epoch": 1.074620184481823, + "grad_norm": 3.485067129135132, + "learning_rate": 7.3282170755104845e-06, + "loss": 0.1761, + "step": 20247 + }, + { + "epoch": 1.074633749321758, + "grad_norm": 6.948599338531494, + "learning_rate": 7.328080032890229e-06, + "loss": 0.3868, + "step": 20248 + }, + { + "epoch": 1.0746473141616928, + "grad_norm": 4.161775588989258, + "learning_rate": 7.327942990269975e-06, + "loss": 0.1648, + "step": 20249 + }, + { + "epoch": 1.0746608790016279, + "grad_norm": 5.774268627166748, + "learning_rate": 7.32780594764972e-06, + "loss": 0.2785, + "step": 20250 + }, + { + "epoch": 1.0746744438415627, + "grad_norm": 5.902843475341797, + "learning_rate": 7.327668905029464e-06, + "loss": 0.3588, + "step": 20251 + }, + { + "epoch": 1.0746880086814976, + "grad_norm": 4.856714725494385, + "learning_rate": 7.3275318624092096e-06, + "loss": 0.2019, + "step": 20252 + }, + { + "epoch": 1.0747015735214325, + "grad_norm": 5.303917407989502, + "learning_rate": 7.327394819788956e-06, + "loss": 0.3558, + "step": 20253 + }, + { + "epoch": 1.0747151383613673, + "grad_norm": 5.391785144805908, + "learning_rate": 7.327257777168701e-06, + "loss": 0.2858, + "step": 20254 + }, + { + "epoch": 1.0747287032013022, + "grad_norm": 5.159326076507568, + "learning_rate": 7.327120734548445e-06, + "loss": 0.2059, + "step": 20255 + }, + { + "epoch": 1.074742268041237, + "grad_norm": 4.5261335372924805, + "learning_rate": 7.32698369192819e-06, + "loss": 0.2411, + "step": 20256 + }, + { + "epoch": 1.074755832881172, + "grad_norm": 5.648642063140869, + "learning_rate": 7.326846649307935e-06, + "loss": 0.3546, + "step": 20257 + }, + { + "epoch": 1.0747693977211068, + "grad_norm": 4.886298179626465, + "learning_rate": 7.326709606687681e-06, + "loss": 0.2434, + "step": 20258 + }, + { + "epoch": 1.0747829625610419, + "grad_norm": 5.021849632263184, + "learning_rate": 7.326572564067426e-06, + "loss": 0.3481, + "step": 20259 + }, + { + "epoch": 1.0747965274009768, + "grad_norm": 4.537327766418457, + "learning_rate": 7.32643552144717e-06, + "loss": 0.2319, + "step": 20260 + }, + { + "epoch": 1.0748100922409116, + "grad_norm": 4.754068851470947, + "learning_rate": 7.326298478826915e-06, + "loss": 0.3033, + "step": 20261 + }, + { + "epoch": 1.0748236570808465, + "grad_norm": 4.273252964019775, + "learning_rate": 7.326161436206661e-06, + "loss": 0.2124, + "step": 20262 + }, + { + "epoch": 1.0748372219207813, + "grad_norm": 4.464007377624512, + "learning_rate": 7.3260243935864065e-06, + "loss": 0.2159, + "step": 20263 + }, + { + "epoch": 1.0748507867607162, + "grad_norm": 5.011638164520264, + "learning_rate": 7.325887350966151e-06, + "loss": 0.2168, + "step": 20264 + }, + { + "epoch": 1.074864351600651, + "grad_norm": 7.022409439086914, + "learning_rate": 7.325750308345896e-06, + "loss": 0.3929, + "step": 20265 + }, + { + "epoch": 1.074877916440586, + "grad_norm": 3.2408628463745117, + "learning_rate": 7.325613265725642e-06, + "loss": 0.1486, + "step": 20266 + }, + { + "epoch": 1.0748914812805208, + "grad_norm": 5.784236907958984, + "learning_rate": 7.325476223105386e-06, + "loss": 0.3004, + "step": 20267 + }, + { + "epoch": 1.0749050461204557, + "grad_norm": 5.359628200531006, + "learning_rate": 7.3253391804851316e-06, + "loss": 0.2969, + "step": 20268 + }, + { + "epoch": 1.0749186109603908, + "grad_norm": 4.712464809417725, + "learning_rate": 7.325202137864876e-06, + "loss": 0.2033, + "step": 20269 + }, + { + "epoch": 1.0749321758003256, + "grad_norm": 4.269251346588135, + "learning_rate": 7.325065095244621e-06, + "loss": 0.294, + "step": 20270 + }, + { + "epoch": 1.0749457406402605, + "grad_norm": 5.400113105773926, + "learning_rate": 7.324928052624367e-06, + "loss": 0.3334, + "step": 20271 + }, + { + "epoch": 1.0749593054801954, + "grad_norm": 6.368345260620117, + "learning_rate": 7.324791010004112e-06, + "loss": 0.3506, + "step": 20272 + }, + { + "epoch": 1.0749728703201302, + "grad_norm": 5.243714809417725, + "learning_rate": 7.324653967383857e-06, + "loss": 0.2258, + "step": 20273 + }, + { + "epoch": 1.074986435160065, + "grad_norm": 4.653997421264648, + "learning_rate": 7.324516924763602e-06, + "loss": 0.2664, + "step": 20274 + }, + { + "epoch": 1.075, + "grad_norm": 4.746103763580322, + "learning_rate": 7.324379882143348e-06, + "loss": 0.4212, + "step": 20275 + }, + { + "epoch": 1.0750135648399348, + "grad_norm": 3.0074307918548584, + "learning_rate": 7.324242839523092e-06, + "loss": 0.1566, + "step": 20276 + }, + { + "epoch": 1.0750271296798697, + "grad_norm": 4.860381126403809, + "learning_rate": 7.324105796902837e-06, + "loss": 0.2189, + "step": 20277 + }, + { + "epoch": 1.0750406945198048, + "grad_norm": 4.335286617279053, + "learning_rate": 7.3239687542825825e-06, + "loss": 0.2112, + "step": 20278 + }, + { + "epoch": 1.0750542593597396, + "grad_norm": 5.188830852508545, + "learning_rate": 7.323831711662328e-06, + "loss": 0.2315, + "step": 20279 + }, + { + "epoch": 1.0750678241996745, + "grad_norm": 4.245560169219971, + "learning_rate": 7.323694669042073e-06, + "loss": 0.3025, + "step": 20280 + }, + { + "epoch": 1.0750813890396094, + "grad_norm": 5.860884666442871, + "learning_rate": 7.323557626421818e-06, + "loss": 0.3496, + "step": 20281 + }, + { + "epoch": 1.0750949538795442, + "grad_norm": 5.152486801147461, + "learning_rate": 7.323420583801562e-06, + "loss": 0.3007, + "step": 20282 + }, + { + "epoch": 1.075108518719479, + "grad_norm": 3.66202449798584, + "learning_rate": 7.3232835411813076e-06, + "loss": 0.158, + "step": 20283 + }, + { + "epoch": 1.075122083559414, + "grad_norm": 5.611891746520996, + "learning_rate": 7.323146498561054e-06, + "loss": 0.2916, + "step": 20284 + }, + { + "epoch": 1.0751356483993488, + "grad_norm": 5.028423309326172, + "learning_rate": 7.323009455940798e-06, + "loss": 0.2366, + "step": 20285 + }, + { + "epoch": 1.0751492132392837, + "grad_norm": 4.897732734680176, + "learning_rate": 7.322872413320543e-06, + "loss": 0.263, + "step": 20286 + }, + { + "epoch": 1.0751627780792186, + "grad_norm": 5.8219895362854, + "learning_rate": 7.322735370700288e-06, + "loss": 0.3727, + "step": 20287 + }, + { + "epoch": 1.0751763429191536, + "grad_norm": 4.66326379776001, + "learning_rate": 7.322598328080034e-06, + "loss": 0.2869, + "step": 20288 + }, + { + "epoch": 1.0751899077590885, + "grad_norm": 5.2192511558532715, + "learning_rate": 7.322461285459779e-06, + "loss": 0.3604, + "step": 20289 + }, + { + "epoch": 1.0752034725990234, + "grad_norm": 4.177823066711426, + "learning_rate": 7.322324242839524e-06, + "loss": 0.2383, + "step": 20290 + }, + { + "epoch": 1.0752170374389582, + "grad_norm": 5.36722469329834, + "learning_rate": 7.322187200219268e-06, + "loss": 0.341, + "step": 20291 + }, + { + "epoch": 1.075230602278893, + "grad_norm": 7.349575996398926, + "learning_rate": 7.322050157599014e-06, + "loss": 0.4133, + "step": 20292 + }, + { + "epoch": 1.075244167118828, + "grad_norm": 4.504207134246826, + "learning_rate": 7.321913114978759e-06, + "loss": 0.2434, + "step": 20293 + }, + { + "epoch": 1.0752577319587628, + "grad_norm": 4.804446220397949, + "learning_rate": 7.321776072358504e-06, + "loss": 0.319, + "step": 20294 + }, + { + "epoch": 1.0752712967986977, + "grad_norm": 4.280819416046143, + "learning_rate": 7.321639029738249e-06, + "loss": 0.2683, + "step": 20295 + }, + { + "epoch": 1.0752848616386326, + "grad_norm": 6.193167686462402, + "learning_rate": 7.321501987117994e-06, + "loss": 0.3042, + "step": 20296 + }, + { + "epoch": 1.0752984264785677, + "grad_norm": 4.972986698150635, + "learning_rate": 7.32136494449774e-06, + "loss": 0.2618, + "step": 20297 + }, + { + "epoch": 1.0753119913185025, + "grad_norm": 3.354616165161133, + "learning_rate": 7.321227901877484e-06, + "loss": 0.155, + "step": 20298 + }, + { + "epoch": 1.0753255561584374, + "grad_norm": 6.224721908569336, + "learning_rate": 7.32109085925723e-06, + "loss": 0.4889, + "step": 20299 + }, + { + "epoch": 1.0753391209983723, + "grad_norm": 4.4962687492370605, + "learning_rate": 7.320953816636974e-06, + "loss": 0.2905, + "step": 20300 + }, + { + "epoch": 1.0753526858383071, + "grad_norm": 3.515902519226074, + "learning_rate": 7.32081677401672e-06, + "loss": 0.1867, + "step": 20301 + }, + { + "epoch": 1.075366250678242, + "grad_norm": 3.9544782638549805, + "learning_rate": 7.320679731396465e-06, + "loss": 0.262, + "step": 20302 + }, + { + "epoch": 1.0753798155181769, + "grad_norm": 4.4737162590026855, + "learning_rate": 7.32054268877621e-06, + "loss": 0.2214, + "step": 20303 + }, + { + "epoch": 1.0753933803581117, + "grad_norm": 4.646058559417725, + "learning_rate": 7.320405646155955e-06, + "loss": 0.3075, + "step": 20304 + }, + { + "epoch": 1.0754069451980466, + "grad_norm": 6.134089946746826, + "learning_rate": 7.320268603535701e-06, + "loss": 0.2753, + "step": 20305 + }, + { + "epoch": 1.0754205100379814, + "grad_norm": 4.657467842102051, + "learning_rate": 7.320131560915446e-06, + "loss": 0.2172, + "step": 20306 + }, + { + "epoch": 1.0754340748779165, + "grad_norm": 4.785004615783691, + "learning_rate": 7.31999451829519e-06, + "loss": 0.1918, + "step": 20307 + }, + { + "epoch": 1.0754476397178514, + "grad_norm": 5.231617450714111, + "learning_rate": 7.319857475674935e-06, + "loss": 0.4167, + "step": 20308 + }, + { + "epoch": 1.0754612045577863, + "grad_norm": 5.07928991317749, + "learning_rate": 7.31972043305468e-06, + "loss": 0.2506, + "step": 20309 + }, + { + "epoch": 1.0754747693977211, + "grad_norm": 3.7094149589538574, + "learning_rate": 7.319583390434426e-06, + "loss": 0.2188, + "step": 20310 + }, + { + "epoch": 1.075488334237656, + "grad_norm": 6.8688273429870605, + "learning_rate": 7.319446347814171e-06, + "loss": 0.2911, + "step": 20311 + }, + { + "epoch": 1.0755018990775909, + "grad_norm": 4.647380352020264, + "learning_rate": 7.319309305193916e-06, + "loss": 0.2461, + "step": 20312 + }, + { + "epoch": 1.0755154639175257, + "grad_norm": 4.7643537521362305, + "learning_rate": 7.31917226257366e-06, + "loss": 0.3126, + "step": 20313 + }, + { + "epoch": 1.0755290287574606, + "grad_norm": 5.061466693878174, + "learning_rate": 7.319035219953406e-06, + "loss": 0.2385, + "step": 20314 + }, + { + "epoch": 1.0755425935973955, + "grad_norm": 3.281602621078491, + "learning_rate": 7.318898177333152e-06, + "loss": 0.1524, + "step": 20315 + }, + { + "epoch": 1.0755561584373305, + "grad_norm": 5.260527610778809, + "learning_rate": 7.318761134712896e-06, + "loss": 0.2441, + "step": 20316 + }, + { + "epoch": 1.0755697232772654, + "grad_norm": 4.545321464538574, + "learning_rate": 7.318624092092641e-06, + "loss": 0.2493, + "step": 20317 + }, + { + "epoch": 1.0755832881172003, + "grad_norm": 4.172822952270508, + "learning_rate": 7.318487049472387e-06, + "loss": 0.2203, + "step": 20318 + }, + { + "epoch": 1.0755968529571351, + "grad_norm": 4.269014835357666, + "learning_rate": 7.3183500068521315e-06, + "loss": 0.2716, + "step": 20319 + }, + { + "epoch": 1.07561041779707, + "grad_norm": 6.680327892303467, + "learning_rate": 7.318212964231877e-06, + "loss": 0.3877, + "step": 20320 + }, + { + "epoch": 1.0756239826370049, + "grad_norm": 3.7624876499176025, + "learning_rate": 7.318075921611622e-06, + "loss": 0.2019, + "step": 20321 + }, + { + "epoch": 1.0756375474769397, + "grad_norm": 5.882513046264648, + "learning_rate": 7.317938878991368e-06, + "loss": 0.3694, + "step": 20322 + }, + { + "epoch": 1.0756511123168746, + "grad_norm": 4.854310512542725, + "learning_rate": 7.317801836371112e-06, + "loss": 0.3412, + "step": 20323 + }, + { + "epoch": 1.0756646771568095, + "grad_norm": 4.796757221221924, + "learning_rate": 7.317664793750857e-06, + "loss": 0.2627, + "step": 20324 + }, + { + "epoch": 1.0756782419967443, + "grad_norm": 5.634153366088867, + "learning_rate": 7.317527751130602e-06, + "loss": 0.2257, + "step": 20325 + }, + { + "epoch": 1.0756918068366794, + "grad_norm": 5.277131080627441, + "learning_rate": 7.317390708510347e-06, + "loss": 0.2512, + "step": 20326 + }, + { + "epoch": 1.0757053716766143, + "grad_norm": 6.733997344970703, + "learning_rate": 7.317253665890093e-06, + "loss": 0.286, + "step": 20327 + }, + { + "epoch": 1.0757189365165492, + "grad_norm": 5.181530475616455, + "learning_rate": 7.317116623269837e-06, + "loss": 0.2676, + "step": 20328 + }, + { + "epoch": 1.075732501356484, + "grad_norm": 3.6367645263671875, + "learning_rate": 7.316979580649582e-06, + "loss": 0.1519, + "step": 20329 + }, + { + "epoch": 1.0757460661964189, + "grad_norm": 4.897533416748047, + "learning_rate": 7.316842538029328e-06, + "loss": 0.2706, + "step": 20330 + }, + { + "epoch": 1.0757596310363537, + "grad_norm": 3.774178981781006, + "learning_rate": 7.316705495409074e-06, + "loss": 0.179, + "step": 20331 + }, + { + "epoch": 1.0757731958762886, + "grad_norm": 4.041578769683838, + "learning_rate": 7.316568452788818e-06, + "loss": 0.215, + "step": 20332 + }, + { + "epoch": 1.0757867607162235, + "grad_norm": 3.754765272140503, + "learning_rate": 7.316431410168563e-06, + "loss": 0.2086, + "step": 20333 + }, + { + "epoch": 1.0758003255561583, + "grad_norm": 4.416054725646973, + "learning_rate": 7.3162943675483075e-06, + "loss": 0.2598, + "step": 20334 + }, + { + "epoch": 1.0758138903960934, + "grad_norm": 5.242317199707031, + "learning_rate": 7.3161573249280535e-06, + "loss": 0.2149, + "step": 20335 + }, + { + "epoch": 1.0758274552360283, + "grad_norm": 4.7330427169799805, + "learning_rate": 7.316020282307799e-06, + "loss": 0.1675, + "step": 20336 + }, + { + "epoch": 1.0758410200759632, + "grad_norm": 6.057637691497803, + "learning_rate": 7.315883239687544e-06, + "loss": 0.2302, + "step": 20337 + }, + { + "epoch": 1.075854584915898, + "grad_norm": 4.2580156326293945, + "learning_rate": 7.315746197067288e-06, + "loss": 0.2123, + "step": 20338 + }, + { + "epoch": 1.075868149755833, + "grad_norm": 5.486686706542969, + "learning_rate": 7.315609154447033e-06, + "loss": 0.2352, + "step": 20339 + }, + { + "epoch": 1.0758817145957678, + "grad_norm": 4.995721817016602, + "learning_rate": 7.315472111826779e-06, + "loss": 0.2849, + "step": 20340 + }, + { + "epoch": 1.0758952794357026, + "grad_norm": 5.8121867179870605, + "learning_rate": 7.315335069206524e-06, + "loss": 0.2502, + "step": 20341 + }, + { + "epoch": 1.0759088442756375, + "grad_norm": 4.566269874572754, + "learning_rate": 7.315198026586269e-06, + "loss": 0.1925, + "step": 20342 + }, + { + "epoch": 1.0759224091155724, + "grad_norm": 3.6479101181030273, + "learning_rate": 7.315060983966013e-06, + "loss": 0.1921, + "step": 20343 + }, + { + "epoch": 1.0759359739555072, + "grad_norm": 4.353686809539795, + "learning_rate": 7.314923941345759e-06, + "loss": 0.2335, + "step": 20344 + }, + { + "epoch": 1.0759495387954423, + "grad_norm": 3.6689226627349854, + "learning_rate": 7.314786898725504e-06, + "loss": 0.1908, + "step": 20345 + }, + { + "epoch": 1.0759631036353772, + "grad_norm": 5.623672008514404, + "learning_rate": 7.31464985610525e-06, + "loss": 0.29, + "step": 20346 + }, + { + "epoch": 1.075976668475312, + "grad_norm": 4.645468711853027, + "learning_rate": 7.314512813484994e-06, + "loss": 0.1914, + "step": 20347 + }, + { + "epoch": 1.075990233315247, + "grad_norm": 4.564651966094971, + "learning_rate": 7.31437577086474e-06, + "loss": 0.2073, + "step": 20348 + }, + { + "epoch": 1.0760037981551818, + "grad_norm": 5.583460807800293, + "learning_rate": 7.314238728244485e-06, + "loss": 0.2435, + "step": 20349 + }, + { + "epoch": 1.0760173629951166, + "grad_norm": 5.051485061645508, + "learning_rate": 7.3141016856242295e-06, + "loss": 0.214, + "step": 20350 + }, + { + "epoch": 1.0760309278350515, + "grad_norm": 5.154433250427246, + "learning_rate": 7.313964643003975e-06, + "loss": 0.2034, + "step": 20351 + }, + { + "epoch": 1.0760444926749864, + "grad_norm": 5.182791709899902, + "learning_rate": 7.31382760038372e-06, + "loss": 0.1723, + "step": 20352 + }, + { + "epoch": 1.0760580575149212, + "grad_norm": 4.287615776062012, + "learning_rate": 7.313690557763465e-06, + "loss": 0.1711, + "step": 20353 + }, + { + "epoch": 1.0760716223548563, + "grad_norm": 5.831982135772705, + "learning_rate": 7.31355351514321e-06, + "loss": 0.2627, + "step": 20354 + }, + { + "epoch": 1.0760851871947912, + "grad_norm": 4.111861228942871, + "learning_rate": 7.313416472522955e-06, + "loss": 0.218, + "step": 20355 + }, + { + "epoch": 1.076098752034726, + "grad_norm": 6.80314826965332, + "learning_rate": 7.3132794299027e-06, + "loss": 0.2775, + "step": 20356 + }, + { + "epoch": 1.076112316874661, + "grad_norm": 4.151077747344971, + "learning_rate": 7.313142387282446e-06, + "loss": 0.1804, + "step": 20357 + }, + { + "epoch": 1.0761258817145958, + "grad_norm": 4.804654598236084, + "learning_rate": 7.313005344662191e-06, + "loss": 0.1626, + "step": 20358 + }, + { + "epoch": 1.0761394465545306, + "grad_norm": 5.103378772735596, + "learning_rate": 7.312868302041935e-06, + "loss": 0.2306, + "step": 20359 + }, + { + "epoch": 1.0761530113944655, + "grad_norm": 5.2262282371521, + "learning_rate": 7.31273125942168e-06, + "loss": 0.1736, + "step": 20360 + }, + { + "epoch": 1.0761665762344004, + "grad_norm": 3.556824207305908, + "learning_rate": 7.3125942168014264e-06, + "loss": 0.1799, + "step": 20361 + }, + { + "epoch": 1.0761801410743352, + "grad_norm": 5.981227874755859, + "learning_rate": 7.312457174181171e-06, + "loss": 0.2139, + "step": 20362 + }, + { + "epoch": 1.07619370591427, + "grad_norm": 4.091015815734863, + "learning_rate": 7.312320131560916e-06, + "loss": 0.1644, + "step": 20363 + }, + { + "epoch": 1.0762072707542052, + "grad_norm": 5.218648433685303, + "learning_rate": 7.312183088940661e-06, + "loss": 0.1814, + "step": 20364 + }, + { + "epoch": 1.07622083559414, + "grad_norm": 3.976452589035034, + "learning_rate": 7.3120460463204055e-06, + "loss": 0.2524, + "step": 20365 + }, + { + "epoch": 1.076234400434075, + "grad_norm": 5.847280979156494, + "learning_rate": 7.3119090037001515e-06, + "loss": 0.302, + "step": 20366 + }, + { + "epoch": 1.0762479652740098, + "grad_norm": 3.867777109146118, + "learning_rate": 7.311771961079897e-06, + "loss": 0.2342, + "step": 20367 + }, + { + "epoch": 1.0762615301139447, + "grad_norm": 4.643908977508545, + "learning_rate": 7.311634918459641e-06, + "loss": 0.3415, + "step": 20368 + }, + { + "epoch": 1.0762750949538795, + "grad_norm": 6.901957035064697, + "learning_rate": 7.311497875839386e-06, + "loss": 0.3837, + "step": 20369 + }, + { + "epoch": 1.0762886597938144, + "grad_norm": 4.9810380935668945, + "learning_rate": 7.311360833219132e-06, + "loss": 0.2326, + "step": 20370 + }, + { + "epoch": 1.0763022246337492, + "grad_norm": 3.4377007484436035, + "learning_rate": 7.311223790598877e-06, + "loss": 0.1733, + "step": 20371 + }, + { + "epoch": 1.0763157894736841, + "grad_norm": 4.47074031829834, + "learning_rate": 7.311086747978622e-06, + "loss": 0.261, + "step": 20372 + }, + { + "epoch": 1.0763293543136192, + "grad_norm": 6.849998950958252, + "learning_rate": 7.310949705358367e-06, + "loss": 0.3527, + "step": 20373 + }, + { + "epoch": 1.076342919153554, + "grad_norm": 5.578062534332275, + "learning_rate": 7.310812662738113e-06, + "loss": 0.4154, + "step": 20374 + }, + { + "epoch": 1.076356483993489, + "grad_norm": 4.760511875152588, + "learning_rate": 7.310675620117857e-06, + "loss": 0.2827, + "step": 20375 + }, + { + "epoch": 1.0763700488334238, + "grad_norm": 4.346268653869629, + "learning_rate": 7.310538577497602e-06, + "loss": 0.291, + "step": 20376 + }, + { + "epoch": 1.0763836136733587, + "grad_norm": 5.54796838760376, + "learning_rate": 7.310401534877347e-06, + "loss": 0.3518, + "step": 20377 + }, + { + "epoch": 1.0763971785132935, + "grad_norm": 4.416690349578857, + "learning_rate": 7.310264492257092e-06, + "loss": 0.3347, + "step": 20378 + }, + { + "epoch": 1.0764107433532284, + "grad_norm": 4.183898448944092, + "learning_rate": 7.310127449636838e-06, + "loss": 0.33, + "step": 20379 + }, + { + "epoch": 1.0764243081931633, + "grad_norm": 7.696897506713867, + "learning_rate": 7.309990407016583e-06, + "loss": 0.3815, + "step": 20380 + }, + { + "epoch": 1.0764378730330981, + "grad_norm": 5.737391948699951, + "learning_rate": 7.3098533643963275e-06, + "loss": 0.3352, + "step": 20381 + }, + { + "epoch": 1.076451437873033, + "grad_norm": 5.527953624725342, + "learning_rate": 7.309716321776073e-06, + "loss": 0.3949, + "step": 20382 + }, + { + "epoch": 1.076465002712968, + "grad_norm": 4.925671100616455, + "learning_rate": 7.309579279155819e-06, + "loss": 0.3847, + "step": 20383 + }, + { + "epoch": 1.076478567552903, + "grad_norm": 4.660153388977051, + "learning_rate": 7.309442236535563e-06, + "loss": 0.3466, + "step": 20384 + }, + { + "epoch": 1.0764921323928378, + "grad_norm": 6.152077674865723, + "learning_rate": 7.309305193915308e-06, + "loss": 0.2838, + "step": 20385 + }, + { + "epoch": 1.0765056972327727, + "grad_norm": 5.566310882568359, + "learning_rate": 7.309168151295053e-06, + "loss": 0.4143, + "step": 20386 + }, + { + "epoch": 1.0765192620727075, + "grad_norm": 3.403186798095703, + "learning_rate": 7.3090311086747985e-06, + "loss": 0.1989, + "step": 20387 + }, + { + "epoch": 1.0765328269126424, + "grad_norm": 7.675201416015625, + "learning_rate": 7.308894066054544e-06, + "loss": 0.4037, + "step": 20388 + }, + { + "epoch": 1.0765463917525773, + "grad_norm": 5.37903356552124, + "learning_rate": 7.308757023434289e-06, + "loss": 0.3064, + "step": 20389 + }, + { + "epoch": 1.0765599565925121, + "grad_norm": 4.21948766708374, + "learning_rate": 7.308619980814033e-06, + "loss": 0.2937, + "step": 20390 + }, + { + "epoch": 1.076573521432447, + "grad_norm": 5.146777629852295, + "learning_rate": 7.308482938193779e-06, + "loss": 0.2644, + "step": 20391 + }, + { + "epoch": 1.076587086272382, + "grad_norm": 4.036950588226318, + "learning_rate": 7.3083458955735244e-06, + "loss": 0.3008, + "step": 20392 + }, + { + "epoch": 1.076600651112317, + "grad_norm": 4.910053253173828, + "learning_rate": 7.308208852953269e-06, + "loss": 0.3325, + "step": 20393 + }, + { + "epoch": 1.0766142159522518, + "grad_norm": 4.306646347045898, + "learning_rate": 7.308071810333014e-06, + "loss": 0.2655, + "step": 20394 + }, + { + "epoch": 1.0766277807921867, + "grad_norm": 8.326481819152832, + "learning_rate": 7.307934767712759e-06, + "loss": 0.3634, + "step": 20395 + }, + { + "epoch": 1.0766413456321215, + "grad_norm": 3.4719860553741455, + "learning_rate": 7.307797725092505e-06, + "loss": 0.196, + "step": 20396 + }, + { + "epoch": 1.0766549104720564, + "grad_norm": 4.7974348068237305, + "learning_rate": 7.3076606824722495e-06, + "loss": 0.3321, + "step": 20397 + }, + { + "epoch": 1.0766684753119913, + "grad_norm": 4.7158989906311035, + "learning_rate": 7.307523639851995e-06, + "loss": 0.2853, + "step": 20398 + }, + { + "epoch": 1.0766820401519261, + "grad_norm": 6.331728458404541, + "learning_rate": 7.307386597231739e-06, + "loss": 0.4136, + "step": 20399 + }, + { + "epoch": 1.076695604991861, + "grad_norm": 6.193660736083984, + "learning_rate": 7.307249554611485e-06, + "loss": 0.4139, + "step": 20400 + }, + { + "epoch": 1.0767091698317959, + "grad_norm": 4.689122676849365, + "learning_rate": 7.30711251199123e-06, + "loss": 0.2747, + "step": 20401 + }, + { + "epoch": 1.076722734671731, + "grad_norm": 5.46733283996582, + "learning_rate": 7.3069754693709745e-06, + "loss": 0.2956, + "step": 20402 + }, + { + "epoch": 1.0767362995116658, + "grad_norm": 5.520989418029785, + "learning_rate": 7.30683842675072e-06, + "loss": 0.2639, + "step": 20403 + }, + { + "epoch": 1.0767498643516007, + "grad_norm": 5.308661937713623, + "learning_rate": 7.306701384130466e-06, + "loss": 0.299, + "step": 20404 + }, + { + "epoch": 1.0767634291915356, + "grad_norm": 5.449430465698242, + "learning_rate": 7.306564341510211e-06, + "loss": 0.2611, + "step": 20405 + }, + { + "epoch": 1.0767769940314704, + "grad_norm": 5.812283992767334, + "learning_rate": 7.306427298889955e-06, + "loss": 0.2899, + "step": 20406 + }, + { + "epoch": 1.0767905588714053, + "grad_norm": 5.343015193939209, + "learning_rate": 7.3062902562697004e-06, + "loss": 0.1851, + "step": 20407 + }, + { + "epoch": 1.0768041237113402, + "grad_norm": 5.223095417022705, + "learning_rate": 7.306153213649445e-06, + "loss": 0.2551, + "step": 20408 + }, + { + "epoch": 1.076817688551275, + "grad_norm": 5.378609657287598, + "learning_rate": 7.306016171029191e-06, + "loss": 0.4947, + "step": 20409 + }, + { + "epoch": 1.0768312533912099, + "grad_norm": 6.843796730041504, + "learning_rate": 7.305879128408936e-06, + "loss": 0.3441, + "step": 20410 + }, + { + "epoch": 1.076844818231145, + "grad_norm": 7.08953857421875, + "learning_rate": 7.30574208578868e-06, + "loss": 0.3311, + "step": 20411 + }, + { + "epoch": 1.0768583830710798, + "grad_norm": 5.722989559173584, + "learning_rate": 7.3056050431684255e-06, + "loss": 0.3554, + "step": 20412 + }, + { + "epoch": 1.0768719479110147, + "grad_norm": 4.3960981369018555, + "learning_rate": 7.3054680005481715e-06, + "loss": 0.2969, + "step": 20413 + }, + { + "epoch": 1.0768855127509496, + "grad_norm": 5.513791084289551, + "learning_rate": 7.305330957927917e-06, + "loss": 0.3615, + "step": 20414 + }, + { + "epoch": 1.0768990775908844, + "grad_norm": 4.895232200622559, + "learning_rate": 7.305193915307661e-06, + "loss": 0.2347, + "step": 20415 + }, + { + "epoch": 1.0769126424308193, + "grad_norm": 5.040441989898682, + "learning_rate": 7.305056872687406e-06, + "loss": 0.3739, + "step": 20416 + }, + { + "epoch": 1.0769262072707542, + "grad_norm": 5.168239593505859, + "learning_rate": 7.304919830067152e-06, + "loss": 0.3782, + "step": 20417 + }, + { + "epoch": 1.076939772110689, + "grad_norm": 5.041851997375488, + "learning_rate": 7.3047827874468965e-06, + "loss": 0.3272, + "step": 20418 + }, + { + "epoch": 1.076953336950624, + "grad_norm": 4.573790073394775, + "learning_rate": 7.304645744826642e-06, + "loss": 0.3738, + "step": 20419 + }, + { + "epoch": 1.0769669017905588, + "grad_norm": 6.267971992492676, + "learning_rate": 7.304508702206387e-06, + "loss": 0.3995, + "step": 20420 + }, + { + "epoch": 1.0769804666304938, + "grad_norm": 6.961174011230469, + "learning_rate": 7.304371659586131e-06, + "loss": 0.3379, + "step": 20421 + }, + { + "epoch": 1.0769940314704287, + "grad_norm": 6.622485637664795, + "learning_rate": 7.304234616965877e-06, + "loss": 0.4455, + "step": 20422 + }, + { + "epoch": 1.0770075963103636, + "grad_norm": 5.885223388671875, + "learning_rate": 7.3040975743456224e-06, + "loss": 0.3626, + "step": 20423 + }, + { + "epoch": 1.0770211611502984, + "grad_norm": 4.724563121795654, + "learning_rate": 7.303960531725367e-06, + "loss": 0.3243, + "step": 20424 + }, + { + "epoch": 1.0770347259902333, + "grad_norm": 6.386837482452393, + "learning_rate": 7.303823489105112e-06, + "loss": 0.4082, + "step": 20425 + }, + { + "epoch": 1.0770482908301682, + "grad_norm": 4.549701690673828, + "learning_rate": 7.303686446484858e-06, + "loss": 0.2778, + "step": 20426 + }, + { + "epoch": 1.077061855670103, + "grad_norm": 6.91720724105835, + "learning_rate": 7.303549403864602e-06, + "loss": 0.5818, + "step": 20427 + }, + { + "epoch": 1.077075420510038, + "grad_norm": 4.5751752853393555, + "learning_rate": 7.3034123612443475e-06, + "loss": 0.27, + "step": 20428 + }, + { + "epoch": 1.0770889853499728, + "grad_norm": 6.260404109954834, + "learning_rate": 7.303275318624093e-06, + "loss": 0.4102, + "step": 20429 + }, + { + "epoch": 1.0771025501899079, + "grad_norm": 5.61606502532959, + "learning_rate": 7.303138276003839e-06, + "loss": 0.3559, + "step": 20430 + }, + { + "epoch": 1.0771161150298427, + "grad_norm": 7.336390018463135, + "learning_rate": 7.303001233383583e-06, + "loss": 0.432, + "step": 20431 + }, + { + "epoch": 1.0771296798697776, + "grad_norm": 4.50566291809082, + "learning_rate": 7.302864190763328e-06, + "loss": 0.2104, + "step": 20432 + }, + { + "epoch": 1.0771432447097125, + "grad_norm": 5.198517322540283, + "learning_rate": 7.3027271481430725e-06, + "loss": 0.3507, + "step": 20433 + }, + { + "epoch": 1.0771568095496473, + "grad_norm": 6.143844127655029, + "learning_rate": 7.302590105522818e-06, + "loss": 0.3484, + "step": 20434 + }, + { + "epoch": 1.0771703743895822, + "grad_norm": 6.017331123352051, + "learning_rate": 7.302453062902564e-06, + "loss": 0.3615, + "step": 20435 + }, + { + "epoch": 1.077183939229517, + "grad_norm": 5.872710704803467, + "learning_rate": 7.302316020282308e-06, + "loss": 0.315, + "step": 20436 + }, + { + "epoch": 1.077197504069452, + "grad_norm": 6.183056354522705, + "learning_rate": 7.302178977662053e-06, + "loss": 0.4574, + "step": 20437 + }, + { + "epoch": 1.0772110689093868, + "grad_norm": 7.124609470367432, + "learning_rate": 7.3020419350417984e-06, + "loss": 0.4076, + "step": 20438 + }, + { + "epoch": 1.0772246337493216, + "grad_norm": 5.784982681274414, + "learning_rate": 7.3019048924215445e-06, + "loss": 0.3511, + "step": 20439 + }, + { + "epoch": 1.0772381985892567, + "grad_norm": 5.957108497619629, + "learning_rate": 7.301767849801289e-06, + "loss": 0.5066, + "step": 20440 + }, + { + "epoch": 1.0772517634291916, + "grad_norm": 7.181273937225342, + "learning_rate": 7.301630807181034e-06, + "loss": 0.4598, + "step": 20441 + }, + { + "epoch": 1.0772653282691265, + "grad_norm": 7.314188003540039, + "learning_rate": 7.301493764560778e-06, + "loss": 0.4358, + "step": 20442 + }, + { + "epoch": 1.0772788931090613, + "grad_norm": 4.705317497253418, + "learning_rate": 7.301356721940524e-06, + "loss": 0.4196, + "step": 20443 + }, + { + "epoch": 1.0772924579489962, + "grad_norm": 8.131386756896973, + "learning_rate": 7.3012196793202695e-06, + "loss": 0.522, + "step": 20444 + }, + { + "epoch": 1.077306022788931, + "grad_norm": 4.6236186027526855, + "learning_rate": 7.301082636700015e-06, + "loss": 0.2871, + "step": 20445 + }, + { + "epoch": 1.077319587628866, + "grad_norm": 6.758553504943848, + "learning_rate": 7.300945594079759e-06, + "loss": 0.3611, + "step": 20446 + }, + { + "epoch": 1.0773331524688008, + "grad_norm": 5.353329658508301, + "learning_rate": 7.300808551459504e-06, + "loss": 0.2844, + "step": 20447 + }, + { + "epoch": 1.0773467173087357, + "grad_norm": 5.15479850769043, + "learning_rate": 7.30067150883925e-06, + "loss": 0.2492, + "step": 20448 + }, + { + "epoch": 1.0773602821486707, + "grad_norm": 6.150362014770508, + "learning_rate": 7.3005344662189946e-06, + "loss": 0.3949, + "step": 20449 + }, + { + "epoch": 1.0773738469886056, + "grad_norm": 6.581076145172119, + "learning_rate": 7.30039742359874e-06, + "loss": 0.6302, + "step": 20450 + }, + { + "epoch": 1.0773874118285405, + "grad_norm": 5.726677894592285, + "learning_rate": 7.300260380978484e-06, + "loss": 0.3692, + "step": 20451 + }, + { + "epoch": 1.0774009766684753, + "grad_norm": 5.88705587387085, + "learning_rate": 7.30012333835823e-06, + "loss": 0.3334, + "step": 20452 + }, + { + "epoch": 1.0774145415084102, + "grad_norm": 6.012632369995117, + "learning_rate": 7.299986295737975e-06, + "loss": 0.3003, + "step": 20453 + }, + { + "epoch": 1.077428106348345, + "grad_norm": 4.720519542694092, + "learning_rate": 7.2998492531177204e-06, + "loss": 0.3657, + "step": 20454 + }, + { + "epoch": 1.07744167118828, + "grad_norm": 5.993571758270264, + "learning_rate": 7.299712210497465e-06, + "loss": 0.3272, + "step": 20455 + }, + { + "epoch": 1.0774552360282148, + "grad_norm": 6.282689094543457, + "learning_rate": 7.299575167877211e-06, + "loss": 0.5329, + "step": 20456 + }, + { + "epoch": 1.0774688008681497, + "grad_norm": 6.570468902587891, + "learning_rate": 7.299438125256956e-06, + "loss": 0.4713, + "step": 20457 + }, + { + "epoch": 1.0774823657080845, + "grad_norm": 6.2234320640563965, + "learning_rate": 7.2993010826367e-06, + "loss": 0.3383, + "step": 20458 + }, + { + "epoch": 1.0774959305480196, + "grad_norm": 5.023251533508301, + "learning_rate": 7.2991640400164455e-06, + "loss": 0.291, + "step": 20459 + }, + { + "epoch": 1.0775094953879545, + "grad_norm": 7.517664432525635, + "learning_rate": 7.2990269973961915e-06, + "loss": 0.4217, + "step": 20460 + }, + { + "epoch": 1.0775230602278894, + "grad_norm": 5.658545970916748, + "learning_rate": 7.298889954775936e-06, + "loss": 0.3975, + "step": 20461 + }, + { + "epoch": 1.0775366250678242, + "grad_norm": 6.1132588386535645, + "learning_rate": 7.298752912155681e-06, + "loss": 0.4827, + "step": 20462 + }, + { + "epoch": 1.077550189907759, + "grad_norm": 6.733757495880127, + "learning_rate": 7.298615869535426e-06, + "loss": 0.4419, + "step": 20463 + }, + { + "epoch": 1.077563754747694, + "grad_norm": 6.288538455963135, + "learning_rate": 7.2984788269151705e-06, + "loss": 0.4897, + "step": 20464 + }, + { + "epoch": 1.0775773195876288, + "grad_norm": 6.0277419090271, + "learning_rate": 7.2983417842949166e-06, + "loss": 0.3688, + "step": 20465 + }, + { + "epoch": 1.0775908844275637, + "grad_norm": 5.5355224609375, + "learning_rate": 7.298204741674662e-06, + "loss": 0.3499, + "step": 20466 + }, + { + "epoch": 1.0776044492674985, + "grad_norm": 4.3612165451049805, + "learning_rate": 7.298067699054406e-06, + "loss": 0.2189, + "step": 20467 + }, + { + "epoch": 1.0776180141074336, + "grad_norm": 5.0963358879089355, + "learning_rate": 7.297930656434151e-06, + "loss": 0.2409, + "step": 20468 + }, + { + "epoch": 1.0776315789473685, + "grad_norm": 5.6746039390563965, + "learning_rate": 7.297793613813897e-06, + "loss": 0.465, + "step": 20469 + }, + { + "epoch": 1.0776451437873034, + "grad_norm": 4.441572189331055, + "learning_rate": 7.297656571193642e-06, + "loss": 0.2781, + "step": 20470 + }, + { + "epoch": 1.0776587086272382, + "grad_norm": 6.549131870269775, + "learning_rate": 7.297519528573387e-06, + "loss": 0.3511, + "step": 20471 + }, + { + "epoch": 1.077672273467173, + "grad_norm": 5.300637245178223, + "learning_rate": 7.297382485953132e-06, + "loss": 0.3806, + "step": 20472 + }, + { + "epoch": 1.077685838307108, + "grad_norm": 6.707560062408447, + "learning_rate": 7.297245443332878e-06, + "loss": 0.4401, + "step": 20473 + }, + { + "epoch": 1.0776994031470428, + "grad_norm": 7.510168075561523, + "learning_rate": 7.297108400712622e-06, + "loss": 0.395, + "step": 20474 + }, + { + "epoch": 1.0777129679869777, + "grad_norm": 6.197298049926758, + "learning_rate": 7.2969713580923675e-06, + "loss": 0.3829, + "step": 20475 + }, + { + "epoch": 1.0777265328269126, + "grad_norm": 5.880699634552002, + "learning_rate": 7.296834315472112e-06, + "loss": 0.5056, + "step": 20476 + }, + { + "epoch": 1.0777400976668474, + "grad_norm": 3.4006121158599854, + "learning_rate": 7.296697272851857e-06, + "loss": 0.1919, + "step": 20477 + }, + { + "epoch": 1.0777536625067825, + "grad_norm": 5.668041229248047, + "learning_rate": 7.296560230231603e-06, + "loss": 0.4212, + "step": 20478 + }, + { + "epoch": 1.0777672273467174, + "grad_norm": 6.266366481781006, + "learning_rate": 7.296423187611348e-06, + "loss": 0.4883, + "step": 20479 + }, + { + "epoch": 1.0777807921866522, + "grad_norm": 6.801294803619385, + "learning_rate": 7.2962861449910926e-06, + "loss": 0.4231, + "step": 20480 + }, + { + "epoch": 1.077794357026587, + "grad_norm": 4.922415733337402, + "learning_rate": 7.296149102370838e-06, + "loss": 0.4049, + "step": 20481 + }, + { + "epoch": 1.077807921866522, + "grad_norm": 6.723381042480469, + "learning_rate": 7.296012059750584e-06, + "loss": 0.4866, + "step": 20482 + }, + { + "epoch": 1.0778214867064568, + "grad_norm": 4.890109539031982, + "learning_rate": 7.295875017130328e-06, + "loss": 0.4085, + "step": 20483 + }, + { + "epoch": 1.0778350515463917, + "grad_norm": 4.388838768005371, + "learning_rate": 7.295737974510073e-06, + "loss": 0.344, + "step": 20484 + }, + { + "epoch": 1.0778486163863266, + "grad_norm": 6.310123920440674, + "learning_rate": 7.295600931889818e-06, + "loss": 0.3329, + "step": 20485 + }, + { + "epoch": 1.0778621812262614, + "grad_norm": 4.2836503982543945, + "learning_rate": 7.295463889269564e-06, + "loss": 0.3645, + "step": 20486 + }, + { + "epoch": 1.0778757460661965, + "grad_norm": 4.438728332519531, + "learning_rate": 7.295326846649309e-06, + "loss": 0.3573, + "step": 20487 + }, + { + "epoch": 1.0778893109061314, + "grad_norm": 3.648615598678589, + "learning_rate": 7.295189804029054e-06, + "loss": 0.2974, + "step": 20488 + }, + { + "epoch": 1.0779028757460662, + "grad_norm": 6.651584625244141, + "learning_rate": 7.295052761408798e-06, + "loss": 0.5081, + "step": 20489 + }, + { + "epoch": 1.0779164405860011, + "grad_norm": 4.760982990264893, + "learning_rate": 7.2949157187885435e-06, + "loss": 0.3128, + "step": 20490 + }, + { + "epoch": 1.077930005425936, + "grad_norm": 5.346629619598389, + "learning_rate": 7.2947786761682895e-06, + "loss": 0.3347, + "step": 20491 + }, + { + "epoch": 1.0779435702658708, + "grad_norm": 4.8768181800842285, + "learning_rate": 7.294641633548034e-06, + "loss": 0.3511, + "step": 20492 + }, + { + "epoch": 1.0779571351058057, + "grad_norm": 4.754467964172363, + "learning_rate": 7.294504590927779e-06, + "loss": 0.3099, + "step": 20493 + }, + { + "epoch": 1.0779706999457406, + "grad_norm": 5.123706340789795, + "learning_rate": 7.294367548307524e-06, + "loss": 0.3251, + "step": 20494 + }, + { + "epoch": 1.0779842647856754, + "grad_norm": 5.404285430908203, + "learning_rate": 7.294230505687269e-06, + "loss": 0.2967, + "step": 20495 + }, + { + "epoch": 1.0779978296256103, + "grad_norm": 4.85867166519165, + "learning_rate": 7.2940934630670146e-06, + "loss": 0.2214, + "step": 20496 + }, + { + "epoch": 1.0780113944655454, + "grad_norm": 4.437730312347412, + "learning_rate": 7.29395642044676e-06, + "loss": 0.2297, + "step": 20497 + }, + { + "epoch": 1.0780249593054803, + "grad_norm": 3.830575942993164, + "learning_rate": 7.293819377826504e-06, + "loss": 0.2822, + "step": 20498 + }, + { + "epoch": 1.0780385241454151, + "grad_norm": 3.7136943340301514, + "learning_rate": 7.29368233520625e-06, + "loss": 0.1837, + "step": 20499 + }, + { + "epoch": 1.07805208898535, + "grad_norm": 5.422721862792969, + "learning_rate": 7.293545292585995e-06, + "loss": 0.2443, + "step": 20500 + }, + { + "epoch": 1.0780656538252849, + "grad_norm": 4.172845840454102, + "learning_rate": 7.29340824996574e-06, + "loss": 0.255, + "step": 20501 + }, + { + "epoch": 1.0780792186652197, + "grad_norm": 3.6292312145233154, + "learning_rate": 7.293271207345485e-06, + "loss": 0.1608, + "step": 20502 + }, + { + "epoch": 1.0780927835051546, + "grad_norm": 4.799699306488037, + "learning_rate": 7.29313416472523e-06, + "loss": 0.1975, + "step": 20503 + }, + { + "epoch": 1.0781063483450894, + "grad_norm": 6.147984981536865, + "learning_rate": 7.292997122104975e-06, + "loss": 0.2877, + "step": 20504 + }, + { + "epoch": 1.0781199131850243, + "grad_norm": 4.702658653259277, + "learning_rate": 7.29286007948472e-06, + "loss": 0.2826, + "step": 20505 + }, + { + "epoch": 1.0781334780249594, + "grad_norm": 4.184589385986328, + "learning_rate": 7.2927230368644655e-06, + "loss": 0.2555, + "step": 20506 + }, + { + "epoch": 1.0781470428648943, + "grad_norm": 3.774644136428833, + "learning_rate": 7.29258599424421e-06, + "loss": 0.2333, + "step": 20507 + }, + { + "epoch": 1.0781606077048291, + "grad_norm": 3.433905601501465, + "learning_rate": 7.292448951623956e-06, + "loss": 0.156, + "step": 20508 + }, + { + "epoch": 1.078174172544764, + "grad_norm": 4.001842498779297, + "learning_rate": 7.292311909003701e-06, + "loss": 0.1966, + "step": 20509 + }, + { + "epoch": 1.0781877373846989, + "grad_norm": 4.6564435958862305, + "learning_rate": 7.292174866383445e-06, + "loss": 0.1934, + "step": 20510 + }, + { + "epoch": 1.0782013022246337, + "grad_norm": 5.725744247436523, + "learning_rate": 7.2920378237631906e-06, + "loss": 0.3294, + "step": 20511 + }, + { + "epoch": 1.0782148670645686, + "grad_norm": 4.3231587409973145, + "learning_rate": 7.291900781142937e-06, + "loss": 0.1691, + "step": 20512 + }, + { + "epoch": 1.0782284319045035, + "grad_norm": 5.973974227905273, + "learning_rate": 7.291763738522682e-06, + "loss": 0.3085, + "step": 20513 + }, + { + "epoch": 1.0782419967444383, + "grad_norm": 4.454546928405762, + "learning_rate": 7.291626695902426e-06, + "loss": 0.2532, + "step": 20514 + }, + { + "epoch": 1.0782555615843732, + "grad_norm": 4.183833122253418, + "learning_rate": 7.291489653282171e-06, + "loss": 0.2053, + "step": 20515 + }, + { + "epoch": 1.0782691264243083, + "grad_norm": 4.587450981140137, + "learning_rate": 7.291352610661916e-06, + "loss": 0.2183, + "step": 20516 + }, + { + "epoch": 1.0782826912642431, + "grad_norm": 6.224549293518066, + "learning_rate": 7.291215568041662e-06, + "loss": 0.2148, + "step": 20517 + }, + { + "epoch": 1.078296256104178, + "grad_norm": 4.3153181076049805, + "learning_rate": 7.291078525421407e-06, + "loss": 0.193, + "step": 20518 + }, + { + "epoch": 1.0783098209441129, + "grad_norm": 3.4701273441314697, + "learning_rate": 7.290941482801151e-06, + "loss": 0.1383, + "step": 20519 + }, + { + "epoch": 1.0783233857840477, + "grad_norm": 5.710078239440918, + "learning_rate": 7.290804440180896e-06, + "loss": 0.2694, + "step": 20520 + }, + { + "epoch": 1.0783369506239826, + "grad_norm": 4.27556848526001, + "learning_rate": 7.290667397560642e-06, + "loss": 0.1786, + "step": 20521 + }, + { + "epoch": 1.0783505154639175, + "grad_norm": 5.547560691833496, + "learning_rate": 7.2905303549403875e-06, + "loss": 0.2312, + "step": 20522 + }, + { + "epoch": 1.0783640803038523, + "grad_norm": 4.672183036804199, + "learning_rate": 7.290393312320132e-06, + "loss": 0.281, + "step": 20523 + }, + { + "epoch": 1.0783776451437872, + "grad_norm": 4.28143835067749, + "learning_rate": 7.290256269699877e-06, + "loss": 0.2373, + "step": 20524 + }, + { + "epoch": 1.0783912099837223, + "grad_norm": 5.242550373077393, + "learning_rate": 7.290119227079623e-06, + "loss": 0.2127, + "step": 20525 + }, + { + "epoch": 1.0784047748236572, + "grad_norm": 5.131383895874023, + "learning_rate": 7.289982184459367e-06, + "loss": 0.2628, + "step": 20526 + }, + { + "epoch": 1.078418339663592, + "grad_norm": 5.67358922958374, + "learning_rate": 7.2898451418391126e-06, + "loss": 0.2196, + "step": 20527 + }, + { + "epoch": 1.0784319045035269, + "grad_norm": 4.862024784088135, + "learning_rate": 7.289708099218858e-06, + "loss": 0.1794, + "step": 20528 + }, + { + "epoch": 1.0784454693434617, + "grad_norm": 5.538144588470459, + "learning_rate": 7.289571056598603e-06, + "loss": 0.2138, + "step": 20529 + }, + { + "epoch": 1.0784590341833966, + "grad_norm": 4.256980895996094, + "learning_rate": 7.289434013978348e-06, + "loss": 0.1811, + "step": 20530 + }, + { + "epoch": 1.0784725990233315, + "grad_norm": 3.3185267448425293, + "learning_rate": 7.289296971358093e-06, + "loss": 0.1209, + "step": 20531 + }, + { + "epoch": 1.0784861638632663, + "grad_norm": 6.469976425170898, + "learning_rate": 7.289159928737838e-06, + "loss": 0.2335, + "step": 20532 + }, + { + "epoch": 1.0784997287032012, + "grad_norm": 4.4914679527282715, + "learning_rate": 7.289022886117583e-06, + "loss": 0.1375, + "step": 20533 + }, + { + "epoch": 1.078513293543136, + "grad_norm": 4.97476863861084, + "learning_rate": 7.288885843497329e-06, + "loss": 0.2594, + "step": 20534 + }, + { + "epoch": 1.0785268583830712, + "grad_norm": 4.022530555725098, + "learning_rate": 7.288748800877073e-06, + "loss": 0.2217, + "step": 20535 + }, + { + "epoch": 1.078540423223006, + "grad_norm": 5.206331253051758, + "learning_rate": 7.288611758256818e-06, + "loss": 0.3301, + "step": 20536 + }, + { + "epoch": 1.078553988062941, + "grad_norm": 5.6566243171691895, + "learning_rate": 7.2884747156365635e-06, + "loss": 0.2592, + "step": 20537 + }, + { + "epoch": 1.0785675529028758, + "grad_norm": 3.667207956314087, + "learning_rate": 7.2883376730163095e-06, + "loss": 0.172, + "step": 20538 + }, + { + "epoch": 1.0785811177428106, + "grad_norm": 4.809754848480225, + "learning_rate": 7.288200630396054e-06, + "loss": 0.2415, + "step": 20539 + }, + { + "epoch": 1.0785946825827455, + "grad_norm": 7.231840133666992, + "learning_rate": 7.288063587775799e-06, + "loss": 0.4161, + "step": 20540 + }, + { + "epoch": 1.0786082474226804, + "grad_norm": 5.280357360839844, + "learning_rate": 7.287926545155543e-06, + "loss": 0.1942, + "step": 20541 + }, + { + "epoch": 1.0786218122626152, + "grad_norm": 6.818476676940918, + "learning_rate": 7.287789502535289e-06, + "loss": 0.2467, + "step": 20542 + }, + { + "epoch": 1.07863537710255, + "grad_norm": 5.204885959625244, + "learning_rate": 7.287652459915035e-06, + "loss": 0.262, + "step": 20543 + }, + { + "epoch": 1.0786489419424852, + "grad_norm": 4.735414028167725, + "learning_rate": 7.287515417294779e-06, + "loss": 0.2156, + "step": 20544 + }, + { + "epoch": 1.07866250678242, + "grad_norm": 5.114975929260254, + "learning_rate": 7.287378374674524e-06, + "loss": 0.24, + "step": 20545 + }, + { + "epoch": 1.078676071622355, + "grad_norm": 6.04576301574707, + "learning_rate": 7.287241332054269e-06, + "loss": 0.3306, + "step": 20546 + }, + { + "epoch": 1.0786896364622898, + "grad_norm": 6.641544818878174, + "learning_rate": 7.287104289434015e-06, + "loss": 0.3069, + "step": 20547 + }, + { + "epoch": 1.0787032013022246, + "grad_norm": 5.413010597229004, + "learning_rate": 7.28696724681376e-06, + "loss": 0.272, + "step": 20548 + }, + { + "epoch": 1.0787167661421595, + "grad_norm": 3.7571682929992676, + "learning_rate": 7.286830204193505e-06, + "loss": 0.1878, + "step": 20549 + }, + { + "epoch": 1.0787303309820944, + "grad_norm": 3.0717005729675293, + "learning_rate": 7.286693161573249e-06, + "loss": 0.1167, + "step": 20550 + }, + { + "epoch": 1.0787438958220292, + "grad_norm": 4.20003080368042, + "learning_rate": 7.286556118952995e-06, + "loss": 0.2523, + "step": 20551 + }, + { + "epoch": 1.078757460661964, + "grad_norm": 6.041978359222412, + "learning_rate": 7.28641907633274e-06, + "loss": 0.2364, + "step": 20552 + }, + { + "epoch": 1.078771025501899, + "grad_norm": 6.429193496704102, + "learning_rate": 7.286282033712485e-06, + "loss": 0.3119, + "step": 20553 + }, + { + "epoch": 1.078784590341834, + "grad_norm": 4.721269130706787, + "learning_rate": 7.28614499109223e-06, + "loss": 0.2468, + "step": 20554 + }, + { + "epoch": 1.078798155181769, + "grad_norm": 4.464472770690918, + "learning_rate": 7.286007948471976e-06, + "loss": 0.199, + "step": 20555 + }, + { + "epoch": 1.0788117200217038, + "grad_norm": 4.363779544830322, + "learning_rate": 7.285870905851721e-06, + "loss": 0.1738, + "step": 20556 + }, + { + "epoch": 1.0788252848616386, + "grad_norm": 5.437103748321533, + "learning_rate": 7.285733863231465e-06, + "loss": 0.1845, + "step": 20557 + }, + { + "epoch": 1.0788388497015735, + "grad_norm": 4.607624053955078, + "learning_rate": 7.285596820611211e-06, + "loss": 0.2343, + "step": 20558 + }, + { + "epoch": 1.0788524145415084, + "grad_norm": 5.307650566101074, + "learning_rate": 7.285459777990955e-06, + "loss": 0.1977, + "step": 20559 + }, + { + "epoch": 1.0788659793814432, + "grad_norm": 4.800487995147705, + "learning_rate": 7.285322735370701e-06, + "loss": 0.2482, + "step": 20560 + }, + { + "epoch": 1.078879544221378, + "grad_norm": 5.914621829986572, + "learning_rate": 7.285185692750446e-06, + "loss": 0.213, + "step": 20561 + }, + { + "epoch": 1.0788931090613132, + "grad_norm": 5.011648178100586, + "learning_rate": 7.285048650130191e-06, + "loss": 0.2507, + "step": 20562 + }, + { + "epoch": 1.078906673901248, + "grad_norm": 6.557849884033203, + "learning_rate": 7.284911607509936e-06, + "loss": 0.4572, + "step": 20563 + }, + { + "epoch": 1.078920238741183, + "grad_norm": 4.32505989074707, + "learning_rate": 7.284774564889682e-06, + "loss": 0.2563, + "step": 20564 + }, + { + "epoch": 1.0789338035811178, + "grad_norm": 6.385234832763672, + "learning_rate": 7.284637522269427e-06, + "loss": 0.2401, + "step": 20565 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 4.842954158782959, + "learning_rate": 7.284500479649171e-06, + "loss": 0.185, + "step": 20566 + }, + { + "epoch": 1.0789609332609875, + "grad_norm": 4.6530232429504395, + "learning_rate": 7.284363437028916e-06, + "loss": 0.2337, + "step": 20567 + }, + { + "epoch": 1.0789744981009224, + "grad_norm": 6.01016092300415, + "learning_rate": 7.284226394408662e-06, + "loss": 0.3061, + "step": 20568 + }, + { + "epoch": 1.0789880629408573, + "grad_norm": 4.882910251617432, + "learning_rate": 7.284089351788407e-06, + "loss": 0.2447, + "step": 20569 + }, + { + "epoch": 1.0790016277807921, + "grad_norm": 4.529464244842529, + "learning_rate": 7.283952309168152e-06, + "loss": 0.1921, + "step": 20570 + }, + { + "epoch": 1.079015192620727, + "grad_norm": 5.832765579223633, + "learning_rate": 7.283815266547897e-06, + "loss": 0.265, + "step": 20571 + }, + { + "epoch": 1.0790287574606618, + "grad_norm": 5.731863021850586, + "learning_rate": 7.283678223927641e-06, + "loss": 0.2674, + "step": 20572 + }, + { + "epoch": 1.079042322300597, + "grad_norm": 5.7384161949157715, + "learning_rate": 7.283541181307387e-06, + "loss": 0.2617, + "step": 20573 + }, + { + "epoch": 1.0790558871405318, + "grad_norm": 5.827972888946533, + "learning_rate": 7.283404138687133e-06, + "loss": 0.3731, + "step": 20574 + }, + { + "epoch": 1.0790694519804667, + "grad_norm": 8.183420181274414, + "learning_rate": 7.283267096066877e-06, + "loss": 0.4399, + "step": 20575 + }, + { + "epoch": 1.0790830168204015, + "grad_norm": 4.715784072875977, + "learning_rate": 7.283130053446622e-06, + "loss": 0.2451, + "step": 20576 + }, + { + "epoch": 1.0790965816603364, + "grad_norm": 4.564941883087158, + "learning_rate": 7.282993010826368e-06, + "loss": 0.1843, + "step": 20577 + }, + { + "epoch": 1.0791101465002713, + "grad_norm": 6.087192058563232, + "learning_rate": 7.2828559682061125e-06, + "loss": 0.2842, + "step": 20578 + }, + { + "epoch": 1.0791237113402061, + "grad_norm": 5.056011199951172, + "learning_rate": 7.282718925585858e-06, + "loss": 0.2619, + "step": 20579 + }, + { + "epoch": 1.079137276180141, + "grad_norm": 4.065464019775391, + "learning_rate": 7.282581882965603e-06, + "loss": 0.1687, + "step": 20580 + }, + { + "epoch": 1.079150841020076, + "grad_norm": 6.566651344299316, + "learning_rate": 7.282444840345349e-06, + "loss": 0.2426, + "step": 20581 + }, + { + "epoch": 1.079164405860011, + "grad_norm": 4.73669958114624, + "learning_rate": 7.282307797725093e-06, + "loss": 0.3002, + "step": 20582 + }, + { + "epoch": 1.0791779706999458, + "grad_norm": 5.851267337799072, + "learning_rate": 7.282170755104838e-06, + "loss": 0.2853, + "step": 20583 + }, + { + "epoch": 1.0791915355398807, + "grad_norm": 4.5020222663879395, + "learning_rate": 7.282033712484583e-06, + "loss": 0.2145, + "step": 20584 + }, + { + "epoch": 1.0792051003798155, + "grad_norm": 4.9930901527404785, + "learning_rate": 7.281896669864328e-06, + "loss": 0.1991, + "step": 20585 + }, + { + "epoch": 1.0792186652197504, + "grad_norm": 7.020435810089111, + "learning_rate": 7.281759627244074e-06, + "loss": 0.2845, + "step": 20586 + }, + { + "epoch": 1.0792322300596853, + "grad_norm": 5.521251678466797, + "learning_rate": 7.281622584623819e-06, + "loss": 0.2511, + "step": 20587 + }, + { + "epoch": 1.0792457948996201, + "grad_norm": 5.277407646179199, + "learning_rate": 7.281485542003563e-06, + "loss": 0.1952, + "step": 20588 + }, + { + "epoch": 1.079259359739555, + "grad_norm": 4.197327613830566, + "learning_rate": 7.281348499383309e-06, + "loss": 0.2687, + "step": 20589 + }, + { + "epoch": 1.0792729245794899, + "grad_norm": 6.414180755615234, + "learning_rate": 7.281211456763055e-06, + "loss": 0.2937, + "step": 20590 + }, + { + "epoch": 1.079286489419425, + "grad_norm": 5.386588096618652, + "learning_rate": 7.281074414142799e-06, + "loss": 0.2072, + "step": 20591 + }, + { + "epoch": 1.0793000542593598, + "grad_norm": 5.655841827392578, + "learning_rate": 7.280937371522544e-06, + "loss": 0.228, + "step": 20592 + }, + { + "epoch": 1.0793136190992947, + "grad_norm": 5.540809631347656, + "learning_rate": 7.2808003289022885e-06, + "loss": 0.2444, + "step": 20593 + }, + { + "epoch": 1.0793271839392296, + "grad_norm": 5.6675190925598145, + "learning_rate": 7.2806632862820345e-06, + "loss": 0.1658, + "step": 20594 + }, + { + "epoch": 1.0793407487791644, + "grad_norm": 5.788822650909424, + "learning_rate": 7.28052624366178e-06, + "loss": 0.2082, + "step": 20595 + }, + { + "epoch": 1.0793543136190993, + "grad_norm": 5.827078342437744, + "learning_rate": 7.280389201041525e-06, + "loss": 0.1969, + "step": 20596 + }, + { + "epoch": 1.0793678784590341, + "grad_norm": 4.304067611694336, + "learning_rate": 7.280252158421269e-06, + "loss": 0.2128, + "step": 20597 + }, + { + "epoch": 1.079381443298969, + "grad_norm": 10.572946548461914, + "learning_rate": 7.280115115801015e-06, + "loss": 0.3662, + "step": 20598 + }, + { + "epoch": 1.0793950081389039, + "grad_norm": 5.483517646789551, + "learning_rate": 7.27997807318076e-06, + "loss": 0.3005, + "step": 20599 + }, + { + "epoch": 1.079408572978839, + "grad_norm": 5.390173435211182, + "learning_rate": 7.279841030560505e-06, + "loss": 0.1904, + "step": 20600 + }, + { + "epoch": 1.0794221378187738, + "grad_norm": 6.895744323730469, + "learning_rate": 7.27970398794025e-06, + "loss": 0.2705, + "step": 20601 + }, + { + "epoch": 1.0794357026587087, + "grad_norm": 6.908659934997559, + "learning_rate": 7.279566945319994e-06, + "loss": 0.3724, + "step": 20602 + }, + { + "epoch": 1.0794492674986436, + "grad_norm": 3.7926337718963623, + "learning_rate": 7.27942990269974e-06, + "loss": 0.1382, + "step": 20603 + }, + { + "epoch": 1.0794628323385784, + "grad_norm": 5.5833964347839355, + "learning_rate": 7.279292860079485e-06, + "loss": 0.2204, + "step": 20604 + }, + { + "epoch": 1.0794763971785133, + "grad_norm": 6.398406028747559, + "learning_rate": 7.279155817459231e-06, + "loss": 0.2844, + "step": 20605 + }, + { + "epoch": 1.0794899620184482, + "grad_norm": 7.121192932128906, + "learning_rate": 7.279018774838975e-06, + "loss": 0.4108, + "step": 20606 + }, + { + "epoch": 1.079503526858383, + "grad_norm": 6.062007904052734, + "learning_rate": 7.278881732218721e-06, + "loss": 0.353, + "step": 20607 + }, + { + "epoch": 1.0795170916983179, + "grad_norm": 5.167120933532715, + "learning_rate": 7.278744689598466e-06, + "loss": 0.3122, + "step": 20608 + }, + { + "epoch": 1.0795306565382528, + "grad_norm": 5.1532673835754395, + "learning_rate": 7.2786076469782105e-06, + "loss": 0.2679, + "step": 20609 + }, + { + "epoch": 1.0795442213781878, + "grad_norm": 5.106410026550293, + "learning_rate": 7.278470604357956e-06, + "loss": 0.2689, + "step": 20610 + }, + { + "epoch": 1.0795577862181227, + "grad_norm": 5.499814987182617, + "learning_rate": 7.278333561737702e-06, + "loss": 0.2823, + "step": 20611 + }, + { + "epoch": 1.0795713510580576, + "grad_norm": 5.546471118927002, + "learning_rate": 7.278196519117446e-06, + "loss": 0.2307, + "step": 20612 + }, + { + "epoch": 1.0795849158979924, + "grad_norm": 4.605068683624268, + "learning_rate": 7.278059476497191e-06, + "loss": 0.226, + "step": 20613 + }, + { + "epoch": 1.0795984807379273, + "grad_norm": 8.073372840881348, + "learning_rate": 7.277922433876936e-06, + "loss": 0.4416, + "step": 20614 + }, + { + "epoch": 1.0796120455778622, + "grad_norm": 4.931776523590088, + "learning_rate": 7.277785391256681e-06, + "loss": 0.1897, + "step": 20615 + }, + { + "epoch": 1.079625610417797, + "grad_norm": 7.396421909332275, + "learning_rate": 7.277648348636427e-06, + "loss": 0.3556, + "step": 20616 + }, + { + "epoch": 1.079639175257732, + "grad_norm": 4.264686107635498, + "learning_rate": 7.277511306016172e-06, + "loss": 0.1582, + "step": 20617 + }, + { + "epoch": 1.0796527400976668, + "grad_norm": 6.540987491607666, + "learning_rate": 7.277374263395916e-06, + "loss": 0.4837, + "step": 20618 + }, + { + "epoch": 1.0796663049376019, + "grad_norm": 4.948439598083496, + "learning_rate": 7.277237220775661e-06, + "loss": 0.2214, + "step": 20619 + }, + { + "epoch": 1.0796798697775367, + "grad_norm": 5.110295295715332, + "learning_rate": 7.2771001781554074e-06, + "loss": 0.2415, + "step": 20620 + }, + { + "epoch": 1.0796934346174716, + "grad_norm": 4.925342082977295, + "learning_rate": 7.276963135535153e-06, + "loss": 0.2591, + "step": 20621 + }, + { + "epoch": 1.0797069994574064, + "grad_norm": 5.21679162979126, + "learning_rate": 7.276826092914897e-06, + "loss": 0.377, + "step": 20622 + }, + { + "epoch": 1.0797205642973413, + "grad_norm": 4.863822937011719, + "learning_rate": 7.276689050294642e-06, + "loss": 0.2507, + "step": 20623 + }, + { + "epoch": 1.0797341291372762, + "grad_norm": 6.091048240661621, + "learning_rate": 7.276552007674388e-06, + "loss": 0.3747, + "step": 20624 + }, + { + "epoch": 1.079747693977211, + "grad_norm": 5.293206214904785, + "learning_rate": 7.2764149650541325e-06, + "loss": 0.3421, + "step": 20625 + }, + { + "epoch": 1.079761258817146, + "grad_norm": 4.54841947555542, + "learning_rate": 7.276277922433878e-06, + "loss": 0.3135, + "step": 20626 + }, + { + "epoch": 1.0797748236570808, + "grad_norm": 5.093788146972656, + "learning_rate": 7.276140879813622e-06, + "loss": 0.2523, + "step": 20627 + }, + { + "epoch": 1.0797883884970156, + "grad_norm": 5.285837173461914, + "learning_rate": 7.276003837193367e-06, + "loss": 0.232, + "step": 20628 + }, + { + "epoch": 1.0798019533369507, + "grad_norm": 5.448535442352295, + "learning_rate": 7.275866794573113e-06, + "loss": 0.3089, + "step": 20629 + }, + { + "epoch": 1.0798155181768856, + "grad_norm": 4.861096382141113, + "learning_rate": 7.275729751952858e-06, + "loss": 0.2281, + "step": 20630 + }, + { + "epoch": 1.0798290830168205, + "grad_norm": 4.667933464050293, + "learning_rate": 7.275592709332603e-06, + "loss": 0.2719, + "step": 20631 + }, + { + "epoch": 1.0798426478567553, + "grad_norm": 4.5702385902404785, + "learning_rate": 7.275455666712348e-06, + "loss": 0.3547, + "step": 20632 + }, + { + "epoch": 1.0798562126966902, + "grad_norm": 4.45626163482666, + "learning_rate": 7.275318624092094e-06, + "loss": 0.2037, + "step": 20633 + }, + { + "epoch": 1.079869777536625, + "grad_norm": 5.413498401641846, + "learning_rate": 7.275181581471838e-06, + "loss": 0.2263, + "step": 20634 + }, + { + "epoch": 1.07988334237656, + "grad_norm": 6.182612895965576, + "learning_rate": 7.2750445388515834e-06, + "loss": 0.36, + "step": 20635 + }, + { + "epoch": 1.0798969072164948, + "grad_norm": 5.05517053604126, + "learning_rate": 7.274907496231329e-06, + "loss": 0.2887, + "step": 20636 + }, + { + "epoch": 1.0799104720564296, + "grad_norm": 6.010287284851074, + "learning_rate": 7.274770453611074e-06, + "loss": 0.3713, + "step": 20637 + }, + { + "epoch": 1.0799240368963647, + "grad_norm": 5.975183963775635, + "learning_rate": 7.274633410990819e-06, + "loss": 0.4097, + "step": 20638 + }, + { + "epoch": 1.0799376017362996, + "grad_norm": 5.751776695251465, + "learning_rate": 7.274496368370564e-06, + "loss": 0.2887, + "step": 20639 + }, + { + "epoch": 1.0799511665762345, + "grad_norm": 4.912980556488037, + "learning_rate": 7.2743593257503085e-06, + "loss": 0.3708, + "step": 20640 + }, + { + "epoch": 1.0799647314161693, + "grad_norm": 5.127286911010742, + "learning_rate": 7.274222283130054e-06, + "loss": 0.2423, + "step": 20641 + }, + { + "epoch": 1.0799782962561042, + "grad_norm": 5.6164093017578125, + "learning_rate": 7.2740852405098e-06, + "loss": 0.2541, + "step": 20642 + }, + { + "epoch": 1.079991861096039, + "grad_norm": 6.121524810791016, + "learning_rate": 7.273948197889544e-06, + "loss": 0.3522, + "step": 20643 + }, + { + "epoch": 1.080005425935974, + "grad_norm": 6.050113677978516, + "learning_rate": 7.273811155269289e-06, + "loss": 0.2695, + "step": 20644 + }, + { + "epoch": 1.0800189907759088, + "grad_norm": 5.809160232543945, + "learning_rate": 7.273674112649034e-06, + "loss": 0.4447, + "step": 20645 + }, + { + "epoch": 1.0800325556158437, + "grad_norm": 6.153130531311035, + "learning_rate": 7.2735370700287795e-06, + "loss": 0.2031, + "step": 20646 + }, + { + "epoch": 1.0800461204557785, + "grad_norm": 7.040111064910889, + "learning_rate": 7.273400027408525e-06, + "loss": 0.3794, + "step": 20647 + }, + { + "epoch": 1.0800596852957136, + "grad_norm": 5.95628023147583, + "learning_rate": 7.27326298478827e-06, + "loss": 0.2833, + "step": 20648 + }, + { + "epoch": 1.0800732501356485, + "grad_norm": 5.603017330169678, + "learning_rate": 7.273125942168014e-06, + "loss": 0.3824, + "step": 20649 + }, + { + "epoch": 1.0800868149755833, + "grad_norm": 6.70686149597168, + "learning_rate": 7.27298889954776e-06, + "loss": 0.2845, + "step": 20650 + }, + { + "epoch": 1.0801003798155182, + "grad_norm": 6.751331329345703, + "learning_rate": 7.2728518569275054e-06, + "loss": 0.2462, + "step": 20651 + }, + { + "epoch": 1.080113944655453, + "grad_norm": 6.661079406738281, + "learning_rate": 7.27271481430725e-06, + "loss": 0.4386, + "step": 20652 + }, + { + "epoch": 1.080127509495388, + "grad_norm": 4.986270427703857, + "learning_rate": 7.272577771686995e-06, + "loss": 0.2779, + "step": 20653 + }, + { + "epoch": 1.0801410743353228, + "grad_norm": 5.883049488067627, + "learning_rate": 7.27244072906674e-06, + "loss": 0.2415, + "step": 20654 + }, + { + "epoch": 1.0801546391752577, + "grad_norm": 6.831838607788086, + "learning_rate": 7.272303686446486e-06, + "loss": 0.3499, + "step": 20655 + }, + { + "epoch": 1.0801682040151925, + "grad_norm": 5.740375518798828, + "learning_rate": 7.2721666438262305e-06, + "loss": 0.2437, + "step": 20656 + }, + { + "epoch": 1.0801817688551276, + "grad_norm": 6.203274726867676, + "learning_rate": 7.272029601205976e-06, + "loss": 0.3255, + "step": 20657 + }, + { + "epoch": 1.0801953336950625, + "grad_norm": 6.111679553985596, + "learning_rate": 7.27189255858572e-06, + "loss": 0.3608, + "step": 20658 + }, + { + "epoch": 1.0802088985349974, + "grad_norm": 6.62043571472168, + "learning_rate": 7.271755515965466e-06, + "loss": 0.2736, + "step": 20659 + }, + { + "epoch": 1.0802224633749322, + "grad_norm": 6.719871997833252, + "learning_rate": 7.271618473345211e-06, + "loss": 0.3797, + "step": 20660 + }, + { + "epoch": 1.080236028214867, + "grad_norm": 5.689077377319336, + "learning_rate": 7.2714814307249555e-06, + "loss": 0.2273, + "step": 20661 + }, + { + "epoch": 1.080249593054802, + "grad_norm": 6.137019157409668, + "learning_rate": 7.271344388104701e-06, + "loss": 0.3065, + "step": 20662 + }, + { + "epoch": 1.0802631578947368, + "grad_norm": 5.081473350524902, + "learning_rate": 7.271207345484447e-06, + "loss": 0.3308, + "step": 20663 + }, + { + "epoch": 1.0802767227346717, + "grad_norm": 7.27256965637207, + "learning_rate": 7.271070302864192e-06, + "loss": 0.4283, + "step": 20664 + }, + { + "epoch": 1.0802902875746065, + "grad_norm": 6.2542924880981445, + "learning_rate": 7.270933260243936e-06, + "loss": 0.307, + "step": 20665 + }, + { + "epoch": 1.0803038524145414, + "grad_norm": 5.650587558746338, + "learning_rate": 7.2707962176236814e-06, + "loss": 0.4324, + "step": 20666 + }, + { + "epoch": 1.0803174172544765, + "grad_norm": 6.061064720153809, + "learning_rate": 7.270659175003426e-06, + "loss": 0.3573, + "step": 20667 + }, + { + "epoch": 1.0803309820944114, + "grad_norm": 4.649348735809326, + "learning_rate": 7.270522132383172e-06, + "loss": 0.2676, + "step": 20668 + }, + { + "epoch": 1.0803445469343462, + "grad_norm": 4.59821891784668, + "learning_rate": 7.270385089762917e-06, + "loss": 0.2221, + "step": 20669 + }, + { + "epoch": 1.080358111774281, + "grad_norm": 7.342850685119629, + "learning_rate": 7.270248047142662e-06, + "loss": 0.3356, + "step": 20670 + }, + { + "epoch": 1.080371676614216, + "grad_norm": 7.461329460144043, + "learning_rate": 7.2701110045224065e-06, + "loss": 0.2507, + "step": 20671 + }, + { + "epoch": 1.0803852414541508, + "grad_norm": 4.993282794952393, + "learning_rate": 7.2699739619021525e-06, + "loss": 0.2215, + "step": 20672 + }, + { + "epoch": 1.0803988062940857, + "grad_norm": 5.259868144989014, + "learning_rate": 7.269836919281898e-06, + "loss": 0.3397, + "step": 20673 + }, + { + "epoch": 1.0804123711340206, + "grad_norm": 5.504449367523193, + "learning_rate": 7.269699876661642e-06, + "loss": 0.2671, + "step": 20674 + }, + { + "epoch": 1.0804259359739554, + "grad_norm": 7.6120686531066895, + "learning_rate": 7.269562834041387e-06, + "loss": 0.3998, + "step": 20675 + }, + { + "epoch": 1.0804395008138905, + "grad_norm": 6.255636692047119, + "learning_rate": 7.269425791421133e-06, + "loss": 0.286, + "step": 20676 + }, + { + "epoch": 1.0804530656538254, + "grad_norm": 4.659327983856201, + "learning_rate": 7.2692887488008775e-06, + "loss": 0.2341, + "step": 20677 + }, + { + "epoch": 1.0804666304937602, + "grad_norm": 6.604005336761475, + "learning_rate": 7.269151706180623e-06, + "loss": 0.3396, + "step": 20678 + }, + { + "epoch": 1.080480195333695, + "grad_norm": 5.790160179138184, + "learning_rate": 7.269014663560368e-06, + "loss": 0.2979, + "step": 20679 + }, + { + "epoch": 1.08049376017363, + "grad_norm": 7.362782001495361, + "learning_rate": 7.268877620940114e-06, + "loss": 0.4898, + "step": 20680 + }, + { + "epoch": 1.0805073250135648, + "grad_norm": 6.199995517730713, + "learning_rate": 7.268740578319858e-06, + "loss": 0.2565, + "step": 20681 + }, + { + "epoch": 1.0805208898534997, + "grad_norm": 5.577614784240723, + "learning_rate": 7.2686035356996034e-06, + "loss": 0.304, + "step": 20682 + }, + { + "epoch": 1.0805344546934346, + "grad_norm": 5.945891857147217, + "learning_rate": 7.268466493079348e-06, + "loss": 0.3058, + "step": 20683 + }, + { + "epoch": 1.0805480195333694, + "grad_norm": 6.181454658508301, + "learning_rate": 7.268329450459093e-06, + "loss": 0.3573, + "step": 20684 + }, + { + "epoch": 1.0805615843733043, + "grad_norm": 5.783047676086426, + "learning_rate": 7.268192407838839e-06, + "loss": 0.3967, + "step": 20685 + }, + { + "epoch": 1.0805751492132394, + "grad_norm": 5.623231887817383, + "learning_rate": 7.268055365218583e-06, + "loss": 0.3096, + "step": 20686 + }, + { + "epoch": 1.0805887140531742, + "grad_norm": 4.976030349731445, + "learning_rate": 7.2679183225983285e-06, + "loss": 0.2496, + "step": 20687 + }, + { + "epoch": 1.0806022788931091, + "grad_norm": 5.52780818939209, + "learning_rate": 7.267781279978074e-06, + "loss": 0.3258, + "step": 20688 + }, + { + "epoch": 1.080615843733044, + "grad_norm": 4.931555271148682, + "learning_rate": 7.26764423735782e-06, + "loss": 0.3105, + "step": 20689 + }, + { + "epoch": 1.0806294085729788, + "grad_norm": 7.260551929473877, + "learning_rate": 7.267507194737564e-06, + "loss": 0.3125, + "step": 20690 + }, + { + "epoch": 1.0806429734129137, + "grad_norm": 5.3928070068359375, + "learning_rate": 7.267370152117309e-06, + "loss": 0.3912, + "step": 20691 + }, + { + "epoch": 1.0806565382528486, + "grad_norm": 5.378304958343506, + "learning_rate": 7.2672331094970535e-06, + "loss": 0.2443, + "step": 20692 + }, + { + "epoch": 1.0806701030927834, + "grad_norm": 5.654703617095947, + "learning_rate": 7.2670960668767996e-06, + "loss": 0.252, + "step": 20693 + }, + { + "epoch": 1.0806836679327183, + "grad_norm": 7.0288567543029785, + "learning_rate": 7.266959024256545e-06, + "loss": 0.3343, + "step": 20694 + }, + { + "epoch": 1.0806972327726534, + "grad_norm": 7.132638454437256, + "learning_rate": 7.266821981636289e-06, + "loss": 0.321, + "step": 20695 + }, + { + "epoch": 1.0807107976125883, + "grad_norm": 4.462576866149902, + "learning_rate": 7.266684939016034e-06, + "loss": 0.2024, + "step": 20696 + }, + { + "epoch": 1.0807243624525231, + "grad_norm": 5.748302936553955, + "learning_rate": 7.2665478963957794e-06, + "loss": 0.336, + "step": 20697 + }, + { + "epoch": 1.080737927292458, + "grad_norm": 6.708042621612549, + "learning_rate": 7.2664108537755255e-06, + "loss": 0.3699, + "step": 20698 + }, + { + "epoch": 1.0807514921323929, + "grad_norm": 5.911109924316406, + "learning_rate": 7.26627381115527e-06, + "loss": 0.361, + "step": 20699 + }, + { + "epoch": 1.0807650569723277, + "grad_norm": 6.552943229675293, + "learning_rate": 7.266136768535015e-06, + "loss": 0.4488, + "step": 20700 + }, + { + "epoch": 1.0807786218122626, + "grad_norm": 6.328695297241211, + "learning_rate": 7.265999725914759e-06, + "loss": 0.4326, + "step": 20701 + }, + { + "epoch": 1.0807921866521975, + "grad_norm": 4.763381481170654, + "learning_rate": 7.265862683294505e-06, + "loss": 0.2516, + "step": 20702 + }, + { + "epoch": 1.0808057514921323, + "grad_norm": 5.6299285888671875, + "learning_rate": 7.2657256406742505e-06, + "loss": 0.216, + "step": 20703 + }, + { + "epoch": 1.0808193163320672, + "grad_norm": 4.936309814453125, + "learning_rate": 7.265588598053996e-06, + "loss": 0.2306, + "step": 20704 + }, + { + "epoch": 1.0808328811720023, + "grad_norm": 3.752196788787842, + "learning_rate": 7.26545155543374e-06, + "loss": 0.2191, + "step": 20705 + }, + { + "epoch": 1.0808464460119371, + "grad_norm": 6.872091770172119, + "learning_rate": 7.265314512813486e-06, + "loss": 0.38, + "step": 20706 + }, + { + "epoch": 1.080860010851872, + "grad_norm": 5.85645055770874, + "learning_rate": 7.265177470193231e-06, + "loss": 0.2895, + "step": 20707 + }, + { + "epoch": 1.0808735756918069, + "grad_norm": 6.875241279602051, + "learning_rate": 7.2650404275729756e-06, + "loss": 0.4268, + "step": 20708 + }, + { + "epoch": 1.0808871405317417, + "grad_norm": 5.810258865356445, + "learning_rate": 7.264903384952721e-06, + "loss": 0.3455, + "step": 20709 + }, + { + "epoch": 1.0809007053716766, + "grad_norm": 5.346012115478516, + "learning_rate": 7.264766342332465e-06, + "loss": 0.2214, + "step": 20710 + }, + { + "epoch": 1.0809142702116115, + "grad_norm": 6.2829437255859375, + "learning_rate": 7.264629299712211e-06, + "loss": 0.299, + "step": 20711 + }, + { + "epoch": 1.0809278350515463, + "grad_norm": 5.243113040924072, + "learning_rate": 7.264492257091956e-06, + "loss": 0.321, + "step": 20712 + }, + { + "epoch": 1.0809413998914812, + "grad_norm": 4.889459133148193, + "learning_rate": 7.2643552144717014e-06, + "loss": 0.3266, + "step": 20713 + }, + { + "epoch": 1.0809549647314163, + "grad_norm": 5.481247425079346, + "learning_rate": 7.264218171851446e-06, + "loss": 0.3836, + "step": 20714 + }, + { + "epoch": 1.0809685295713511, + "grad_norm": 7.025491237640381, + "learning_rate": 7.264081129231192e-06, + "loss": 0.3293, + "step": 20715 + }, + { + "epoch": 1.080982094411286, + "grad_norm": 6.39168643951416, + "learning_rate": 7.263944086610937e-06, + "loss": 0.3759, + "step": 20716 + }, + { + "epoch": 1.0809956592512209, + "grad_norm": 6.558658599853516, + "learning_rate": 7.263807043990681e-06, + "loss": 0.3254, + "step": 20717 + }, + { + "epoch": 1.0810092240911557, + "grad_norm": 7.165185451507568, + "learning_rate": 7.2636700013704265e-06, + "loss": 0.5223, + "step": 20718 + }, + { + "epoch": 1.0810227889310906, + "grad_norm": 7.0735392570495605, + "learning_rate": 7.2635329587501725e-06, + "loss": 0.3828, + "step": 20719 + }, + { + "epoch": 1.0810363537710255, + "grad_norm": 4.650331020355225, + "learning_rate": 7.263395916129917e-06, + "loss": 0.2565, + "step": 20720 + }, + { + "epoch": 1.0810499186109603, + "grad_norm": 4.780245780944824, + "learning_rate": 7.263258873509662e-06, + "loss": 0.2242, + "step": 20721 + }, + { + "epoch": 1.0810634834508952, + "grad_norm": 4.048558712005615, + "learning_rate": 7.263121830889407e-06, + "loss": 0.144, + "step": 20722 + }, + { + "epoch": 1.08107704829083, + "grad_norm": 6.317596435546875, + "learning_rate": 7.2629847882691515e-06, + "loss": 0.2656, + "step": 20723 + }, + { + "epoch": 1.0810906131307652, + "grad_norm": 6.335227012634277, + "learning_rate": 7.2628477456488976e-06, + "loss": 0.4295, + "step": 20724 + }, + { + "epoch": 1.0811041779707, + "grad_norm": 6.244981288909912, + "learning_rate": 7.262710703028643e-06, + "loss": 0.2851, + "step": 20725 + }, + { + "epoch": 1.0811177428106349, + "grad_norm": 6.151612281799316, + "learning_rate": 7.262573660408387e-06, + "loss": 0.3795, + "step": 20726 + }, + { + "epoch": 1.0811313076505698, + "grad_norm": 4.257130146026611, + "learning_rate": 7.262436617788132e-06, + "loss": 0.2987, + "step": 20727 + }, + { + "epoch": 1.0811448724905046, + "grad_norm": 6.160741806030273, + "learning_rate": 7.262299575167878e-06, + "loss": 0.3506, + "step": 20728 + }, + { + "epoch": 1.0811584373304395, + "grad_norm": 5.667587757110596, + "learning_rate": 7.2621625325476235e-06, + "loss": 0.4054, + "step": 20729 + }, + { + "epoch": 1.0811720021703743, + "grad_norm": 5.451823711395264, + "learning_rate": 7.262025489927368e-06, + "loss": 0.393, + "step": 20730 + }, + { + "epoch": 1.0811855670103092, + "grad_norm": 5.586004257202148, + "learning_rate": 7.261888447307113e-06, + "loss": 0.2874, + "step": 20731 + }, + { + "epoch": 1.081199131850244, + "grad_norm": 5.459738731384277, + "learning_rate": 7.261751404686859e-06, + "loss": 0.3825, + "step": 20732 + }, + { + "epoch": 1.0812126966901792, + "grad_norm": 5.3106842041015625, + "learning_rate": 7.261614362066603e-06, + "loss": 0.3083, + "step": 20733 + }, + { + "epoch": 1.081226261530114, + "grad_norm": 8.370678901672363, + "learning_rate": 7.2614773194463485e-06, + "loss": 0.3775, + "step": 20734 + }, + { + "epoch": 1.081239826370049, + "grad_norm": 4.649501800537109, + "learning_rate": 7.261340276826093e-06, + "loss": 0.2118, + "step": 20735 + }, + { + "epoch": 1.0812533912099838, + "grad_norm": 6.869203090667725, + "learning_rate": 7.261203234205838e-06, + "loss": 0.2741, + "step": 20736 + }, + { + "epoch": 1.0812669560499186, + "grad_norm": 6.3555908203125, + "learning_rate": 7.261066191585584e-06, + "loss": 0.398, + "step": 20737 + }, + { + "epoch": 1.0812805208898535, + "grad_norm": 4.7144551277160645, + "learning_rate": 7.260929148965329e-06, + "loss": 0.2541, + "step": 20738 + }, + { + "epoch": 1.0812940857297884, + "grad_norm": 5.471858501434326, + "learning_rate": 7.2607921063450736e-06, + "loss": 0.2533, + "step": 20739 + }, + { + "epoch": 1.0813076505697232, + "grad_norm": 5.130539417266846, + "learning_rate": 7.260655063724819e-06, + "loss": 0.1695, + "step": 20740 + }, + { + "epoch": 1.081321215409658, + "grad_norm": 5.39549446105957, + "learning_rate": 7.260518021104565e-06, + "loss": 0.2763, + "step": 20741 + }, + { + "epoch": 1.081334780249593, + "grad_norm": 5.37129020690918, + "learning_rate": 7.260380978484309e-06, + "loss": 0.301, + "step": 20742 + }, + { + "epoch": 1.081348345089528, + "grad_norm": 7.473577976226807, + "learning_rate": 7.260243935864054e-06, + "loss": 0.5583, + "step": 20743 + }, + { + "epoch": 1.081361909929463, + "grad_norm": 4.232738971710205, + "learning_rate": 7.260106893243799e-06, + "loss": 0.2415, + "step": 20744 + }, + { + "epoch": 1.0813754747693978, + "grad_norm": 4.918837070465088, + "learning_rate": 7.259969850623545e-06, + "loss": 0.1842, + "step": 20745 + }, + { + "epoch": 1.0813890396093326, + "grad_norm": 6.662089824676514, + "learning_rate": 7.25983280800329e-06, + "loss": 0.348, + "step": 20746 + }, + { + "epoch": 1.0814026044492675, + "grad_norm": 6.506319999694824, + "learning_rate": 7.259695765383035e-06, + "loss": 0.3499, + "step": 20747 + }, + { + "epoch": 1.0814161692892024, + "grad_norm": 4.520670413970947, + "learning_rate": 7.259558722762779e-06, + "loss": 0.3283, + "step": 20748 + }, + { + "epoch": 1.0814297341291372, + "grad_norm": 4.686673641204834, + "learning_rate": 7.259421680142525e-06, + "loss": 0.3247, + "step": 20749 + }, + { + "epoch": 1.081443298969072, + "grad_norm": 6.822707653045654, + "learning_rate": 7.2592846375222705e-06, + "loss": 0.3525, + "step": 20750 + }, + { + "epoch": 1.081456863809007, + "grad_norm": 4.4857025146484375, + "learning_rate": 7.259147594902015e-06, + "loss": 0.2062, + "step": 20751 + }, + { + "epoch": 1.081470428648942, + "grad_norm": 8.908920288085938, + "learning_rate": 7.25901055228176e-06, + "loss": 0.439, + "step": 20752 + }, + { + "epoch": 1.081483993488877, + "grad_norm": 4.768057823181152, + "learning_rate": 7.258873509661505e-06, + "loss": 0.2542, + "step": 20753 + }, + { + "epoch": 1.0814975583288118, + "grad_norm": 5.257816791534424, + "learning_rate": 7.25873646704125e-06, + "loss": 0.2755, + "step": 20754 + }, + { + "epoch": 1.0815111231687466, + "grad_norm": 7.23950719833374, + "learning_rate": 7.2585994244209956e-06, + "loss": 0.4973, + "step": 20755 + }, + { + "epoch": 1.0815246880086815, + "grad_norm": 5.1151814460754395, + "learning_rate": 7.258462381800741e-06, + "loss": 0.2253, + "step": 20756 + }, + { + "epoch": 1.0815382528486164, + "grad_norm": 5.349310398101807, + "learning_rate": 7.258325339180485e-06, + "loss": 0.3054, + "step": 20757 + }, + { + "epoch": 1.0815518176885512, + "grad_norm": 3.9463918209075928, + "learning_rate": 7.258188296560231e-06, + "loss": 0.119, + "step": 20758 + }, + { + "epoch": 1.081565382528486, + "grad_norm": 5.639718532562256, + "learning_rate": 7.258051253939976e-06, + "loss": 0.3422, + "step": 20759 + }, + { + "epoch": 1.081578947368421, + "grad_norm": 5.616634845733643, + "learning_rate": 7.257914211319721e-06, + "loss": 0.2772, + "step": 20760 + }, + { + "epoch": 1.0815925122083558, + "grad_norm": 4.508190631866455, + "learning_rate": 7.257777168699466e-06, + "loss": 0.1861, + "step": 20761 + }, + { + "epoch": 1.081606077048291, + "grad_norm": 6.504543781280518, + "learning_rate": 7.257640126079212e-06, + "loss": 0.2171, + "step": 20762 + }, + { + "epoch": 1.0816196418882258, + "grad_norm": 6.555530548095703, + "learning_rate": 7.257503083458957e-06, + "loss": 0.325, + "step": 20763 + }, + { + "epoch": 1.0816332067281607, + "grad_norm": 5.258360385894775, + "learning_rate": 7.257366040838701e-06, + "loss": 0.2361, + "step": 20764 + }, + { + "epoch": 1.0816467715680955, + "grad_norm": 5.347052574157715, + "learning_rate": 7.2572289982184465e-06, + "loss": 0.2003, + "step": 20765 + }, + { + "epoch": 1.0816603364080304, + "grad_norm": 4.383852958679199, + "learning_rate": 7.257091955598191e-06, + "loss": 0.3242, + "step": 20766 + }, + { + "epoch": 1.0816739012479653, + "grad_norm": 5.361140251159668, + "learning_rate": 7.256954912977937e-06, + "loss": 0.3466, + "step": 20767 + }, + { + "epoch": 1.0816874660879001, + "grad_norm": 4.539410591125488, + "learning_rate": 7.256817870357682e-06, + "loss": 0.2216, + "step": 20768 + }, + { + "epoch": 1.081701030927835, + "grad_norm": 5.991466999053955, + "learning_rate": 7.256680827737426e-06, + "loss": 0.323, + "step": 20769 + }, + { + "epoch": 1.0817145957677698, + "grad_norm": 6.576825141906738, + "learning_rate": 7.2565437851171716e-06, + "loss": 0.3586, + "step": 20770 + }, + { + "epoch": 1.081728160607705, + "grad_norm": 5.398492813110352, + "learning_rate": 7.256406742496918e-06, + "loss": 0.2972, + "step": 20771 + }, + { + "epoch": 1.0817417254476398, + "grad_norm": 4.322465896606445, + "learning_rate": 7.256269699876663e-06, + "loss": 0.2251, + "step": 20772 + }, + { + "epoch": 1.0817552902875747, + "grad_norm": 8.123191833496094, + "learning_rate": 7.256132657256407e-06, + "loss": 0.4399, + "step": 20773 + }, + { + "epoch": 1.0817688551275095, + "grad_norm": 4.864993572235107, + "learning_rate": 7.255995614636152e-06, + "loss": 0.1176, + "step": 20774 + }, + { + "epoch": 1.0817824199674444, + "grad_norm": 3.7312495708465576, + "learning_rate": 7.255858572015898e-06, + "loss": 0.1632, + "step": 20775 + }, + { + "epoch": 1.0817959848073793, + "grad_norm": 5.20517110824585, + "learning_rate": 7.255721529395643e-06, + "loss": 0.3335, + "step": 20776 + }, + { + "epoch": 1.0818095496473141, + "grad_norm": 5.00988245010376, + "learning_rate": 7.255584486775388e-06, + "loss": 0.2042, + "step": 20777 + }, + { + "epoch": 1.081823114487249, + "grad_norm": 4.498767375946045, + "learning_rate": 7.255447444155133e-06, + "loss": 0.1395, + "step": 20778 + }, + { + "epoch": 1.0818366793271839, + "grad_norm": 5.442386150360107, + "learning_rate": 7.255310401534877e-06, + "loss": 0.301, + "step": 20779 + }, + { + "epoch": 1.0818502441671187, + "grad_norm": 5.832199573516846, + "learning_rate": 7.255173358914623e-06, + "loss": 0.3385, + "step": 20780 + }, + { + "epoch": 1.0818638090070538, + "grad_norm": 4.924316883087158, + "learning_rate": 7.2550363162943685e-06, + "loss": 0.2301, + "step": 20781 + }, + { + "epoch": 1.0818773738469887, + "grad_norm": 8.353166580200195, + "learning_rate": 7.254899273674113e-06, + "loss": 0.4665, + "step": 20782 + }, + { + "epoch": 1.0818909386869235, + "grad_norm": 6.488393306732178, + "learning_rate": 7.254762231053858e-06, + "loss": 0.3551, + "step": 20783 + }, + { + "epoch": 1.0819045035268584, + "grad_norm": 5.964015007019043, + "learning_rate": 7.254625188433604e-06, + "loss": 0.2138, + "step": 20784 + }, + { + "epoch": 1.0819180683667933, + "grad_norm": 7.033720970153809, + "learning_rate": 7.254488145813348e-06, + "loss": 0.3812, + "step": 20785 + }, + { + "epoch": 1.0819316332067281, + "grad_norm": 4.9988579750061035, + "learning_rate": 7.254351103193094e-06, + "loss": 0.2848, + "step": 20786 + }, + { + "epoch": 1.081945198046663, + "grad_norm": 7.039402961730957, + "learning_rate": 7.254214060572839e-06, + "loss": 0.3161, + "step": 20787 + }, + { + "epoch": 1.0819587628865979, + "grad_norm": 6.991529941558838, + "learning_rate": 7.254077017952584e-06, + "loss": 0.3834, + "step": 20788 + }, + { + "epoch": 1.0819723277265327, + "grad_norm": 6.406789302825928, + "learning_rate": 7.253939975332329e-06, + "loss": 0.4221, + "step": 20789 + }, + { + "epoch": 1.0819858925664678, + "grad_norm": 6.2146100997924805, + "learning_rate": 7.253802932712074e-06, + "loss": 0.3228, + "step": 20790 + }, + { + "epoch": 1.0819994574064027, + "grad_norm": 6.595275402069092, + "learning_rate": 7.253665890091819e-06, + "loss": 0.2742, + "step": 20791 + }, + { + "epoch": 1.0820130222463376, + "grad_norm": 9.616722106933594, + "learning_rate": 7.253528847471564e-06, + "loss": 0.3568, + "step": 20792 + }, + { + "epoch": 1.0820265870862724, + "grad_norm": 5.872809886932373, + "learning_rate": 7.25339180485131e-06, + "loss": 0.2293, + "step": 20793 + }, + { + "epoch": 1.0820401519262073, + "grad_norm": 4.65889835357666, + "learning_rate": 7.253254762231054e-06, + "loss": 0.256, + "step": 20794 + }, + { + "epoch": 1.0820537167661421, + "grad_norm": 3.467582941055298, + "learning_rate": 7.253117719610799e-06, + "loss": 0.173, + "step": 20795 + }, + { + "epoch": 1.082067281606077, + "grad_norm": 6.118210792541504, + "learning_rate": 7.2529806769905445e-06, + "loss": 0.2444, + "step": 20796 + }, + { + "epoch": 1.0820808464460119, + "grad_norm": 6.630795955657959, + "learning_rate": 7.2528436343702905e-06, + "loss": 0.3764, + "step": 20797 + }, + { + "epoch": 1.0820944112859467, + "grad_norm": 6.093745708465576, + "learning_rate": 7.252706591750035e-06, + "loss": 0.2944, + "step": 20798 + }, + { + "epoch": 1.0821079761258816, + "grad_norm": 5.2158684730529785, + "learning_rate": 7.25256954912978e-06, + "loss": 0.2401, + "step": 20799 + }, + { + "epoch": 1.0821215409658167, + "grad_norm": 6.232633113861084, + "learning_rate": 7.252432506509524e-06, + "loss": 0.2668, + "step": 20800 + }, + { + "epoch": 1.0821351058057516, + "grad_norm": 6.330411434173584, + "learning_rate": 7.25229546388927e-06, + "loss": 0.2, + "step": 20801 + }, + { + "epoch": 1.0821486706456864, + "grad_norm": 6.595464706420898, + "learning_rate": 7.252158421269016e-06, + "loss": 0.2689, + "step": 20802 + }, + { + "epoch": 1.0821622354856213, + "grad_norm": 5.911309242248535, + "learning_rate": 7.25202137864876e-06, + "loss": 0.327, + "step": 20803 + }, + { + "epoch": 1.0821758003255562, + "grad_norm": 4.828086853027344, + "learning_rate": 7.251884336028505e-06, + "loss": 0.145, + "step": 20804 + }, + { + "epoch": 1.082189365165491, + "grad_norm": 4.68461799621582, + "learning_rate": 7.25174729340825e-06, + "loss": 0.2358, + "step": 20805 + }, + { + "epoch": 1.082202930005426, + "grad_norm": 3.749070405960083, + "learning_rate": 7.251610250787996e-06, + "loss": 0.1644, + "step": 20806 + }, + { + "epoch": 1.0822164948453608, + "grad_norm": 4.029151916503906, + "learning_rate": 7.251473208167741e-06, + "loss": 0.1377, + "step": 20807 + }, + { + "epoch": 1.0822300596852956, + "grad_norm": 4.4212846755981445, + "learning_rate": 7.251336165547486e-06, + "loss": 0.1811, + "step": 20808 + }, + { + "epoch": 1.0822436245252307, + "grad_norm": 3.3907082080841064, + "learning_rate": 7.25119912292723e-06, + "loss": 0.1459, + "step": 20809 + }, + { + "epoch": 1.0822571893651656, + "grad_norm": 6.248401165008545, + "learning_rate": 7.251062080306976e-06, + "loss": 0.3382, + "step": 20810 + }, + { + "epoch": 1.0822707542051004, + "grad_norm": 7.369213581085205, + "learning_rate": 7.250925037686721e-06, + "loss": 0.3514, + "step": 20811 + }, + { + "epoch": 1.0822843190450353, + "grad_norm": 5.430294036865234, + "learning_rate": 7.2507879950664665e-06, + "loss": 0.1965, + "step": 20812 + }, + { + "epoch": 1.0822978838849702, + "grad_norm": 5.308088302612305, + "learning_rate": 7.250650952446211e-06, + "loss": 0.2207, + "step": 20813 + }, + { + "epoch": 1.082311448724905, + "grad_norm": 5.569225788116455, + "learning_rate": 7.250513909825957e-06, + "loss": 0.3009, + "step": 20814 + }, + { + "epoch": 1.08232501356484, + "grad_norm": 6.853817462921143, + "learning_rate": 7.250376867205702e-06, + "loss": 0.2488, + "step": 20815 + }, + { + "epoch": 1.0823385784047748, + "grad_norm": 5.159601211547852, + "learning_rate": 7.250239824585446e-06, + "loss": 0.2001, + "step": 20816 + }, + { + "epoch": 1.0823521432447096, + "grad_norm": 6.280028343200684, + "learning_rate": 7.250102781965192e-06, + "loss": 0.2319, + "step": 20817 + }, + { + "epoch": 1.0823657080846445, + "grad_norm": 6.48909854888916, + "learning_rate": 7.249965739344938e-06, + "loss": 0.2232, + "step": 20818 + }, + { + "epoch": 1.0823792729245796, + "grad_norm": 5.8805460929870605, + "learning_rate": 7.249828696724682e-06, + "loss": 0.206, + "step": 20819 + }, + { + "epoch": 1.0823928377645144, + "grad_norm": 5.020817756652832, + "learning_rate": 7.249691654104427e-06, + "loss": 0.1705, + "step": 20820 + }, + { + "epoch": 1.0824064026044493, + "grad_norm": 6.482549667358398, + "learning_rate": 7.249554611484172e-06, + "loss": 0.2457, + "step": 20821 + }, + { + "epoch": 1.0824199674443842, + "grad_norm": 5.721512317657471, + "learning_rate": 7.249417568863917e-06, + "loss": 0.2532, + "step": 20822 + }, + { + "epoch": 1.082433532284319, + "grad_norm": 4.283908843994141, + "learning_rate": 7.249280526243663e-06, + "loss": 0.2156, + "step": 20823 + }, + { + "epoch": 1.082447097124254, + "grad_norm": 4.641730785369873, + "learning_rate": 7.249143483623408e-06, + "loss": 0.1817, + "step": 20824 + }, + { + "epoch": 1.0824606619641888, + "grad_norm": 5.15027379989624, + "learning_rate": 7.249006441003152e-06, + "loss": 0.2353, + "step": 20825 + }, + { + "epoch": 1.0824742268041236, + "grad_norm": 5.124478816986084, + "learning_rate": 7.248869398382897e-06, + "loss": 0.2101, + "step": 20826 + }, + { + "epoch": 1.0824877916440585, + "grad_norm": 5.412572383880615, + "learning_rate": 7.248732355762643e-06, + "loss": 0.2951, + "step": 20827 + }, + { + "epoch": 1.0825013564839936, + "grad_norm": 4.341961860656738, + "learning_rate": 7.248595313142388e-06, + "loss": 0.2437, + "step": 20828 + }, + { + "epoch": 1.0825149213239285, + "grad_norm": 8.585800170898438, + "learning_rate": 7.248458270522133e-06, + "loss": 0.3407, + "step": 20829 + }, + { + "epoch": 1.0825284861638633, + "grad_norm": 6.25897741317749, + "learning_rate": 7.248321227901878e-06, + "loss": 0.3539, + "step": 20830 + }, + { + "epoch": 1.0825420510037982, + "grad_norm": 5.258633613586426, + "learning_rate": 7.248184185281624e-06, + "loss": 0.2225, + "step": 20831 + }, + { + "epoch": 1.082555615843733, + "grad_norm": 4.28334379196167, + "learning_rate": 7.248047142661368e-06, + "loss": 0.1994, + "step": 20832 + }, + { + "epoch": 1.082569180683668, + "grad_norm": 5.9151997566223145, + "learning_rate": 7.247910100041114e-06, + "loss": 0.3558, + "step": 20833 + }, + { + "epoch": 1.0825827455236028, + "grad_norm": 6.057720184326172, + "learning_rate": 7.247773057420858e-06, + "loss": 0.292, + "step": 20834 + }, + { + "epoch": 1.0825963103635377, + "grad_norm": 4.618385314941406, + "learning_rate": 7.247636014800603e-06, + "loss": 0.2085, + "step": 20835 + }, + { + "epoch": 1.0826098752034725, + "grad_norm": 5.705772399902344, + "learning_rate": 7.247498972180349e-06, + "loss": 0.3857, + "step": 20836 + }, + { + "epoch": 1.0826234400434074, + "grad_norm": 5.906977653503418, + "learning_rate": 7.2473619295600935e-06, + "loss": 0.3523, + "step": 20837 + }, + { + "epoch": 1.0826370048833425, + "grad_norm": 5.080636501312256, + "learning_rate": 7.247224886939839e-06, + "loss": 0.361, + "step": 20838 + }, + { + "epoch": 1.0826505697232773, + "grad_norm": 5.524499893188477, + "learning_rate": 7.247087844319584e-06, + "loss": 0.3114, + "step": 20839 + }, + { + "epoch": 1.0826641345632122, + "grad_norm": 6.089542865753174, + "learning_rate": 7.24695080169933e-06, + "loss": 0.1816, + "step": 20840 + }, + { + "epoch": 1.082677699403147, + "grad_norm": 4.8411335945129395, + "learning_rate": 7.246813759079074e-06, + "loss": 0.2204, + "step": 20841 + }, + { + "epoch": 1.082691264243082, + "grad_norm": 6.453458309173584, + "learning_rate": 7.246676716458819e-06, + "loss": 0.2724, + "step": 20842 + }, + { + "epoch": 1.0827048290830168, + "grad_norm": 6.956614971160889, + "learning_rate": 7.246539673838564e-06, + "loss": 0.4478, + "step": 20843 + }, + { + "epoch": 1.0827183939229517, + "grad_norm": 5.703083515167236, + "learning_rate": 7.24640263121831e-06, + "loss": 0.38, + "step": 20844 + }, + { + "epoch": 1.0827319587628865, + "grad_norm": 5.2168869972229, + "learning_rate": 7.246265588598055e-06, + "loss": 0.3072, + "step": 20845 + }, + { + "epoch": 1.0827455236028214, + "grad_norm": 5.357634544372559, + "learning_rate": 7.2461285459778e-06, + "loss": 0.2867, + "step": 20846 + }, + { + "epoch": 1.0827590884427565, + "grad_norm": 4.799534797668457, + "learning_rate": 7.245991503357544e-06, + "loss": 0.2698, + "step": 20847 + }, + { + "epoch": 1.0827726532826913, + "grad_norm": 5.904577255249023, + "learning_rate": 7.24585446073729e-06, + "loss": 0.398, + "step": 20848 + }, + { + "epoch": 1.0827862181226262, + "grad_norm": 6.380523681640625, + "learning_rate": 7.245717418117036e-06, + "loss": 0.3078, + "step": 20849 + }, + { + "epoch": 1.082799782962561, + "grad_norm": 5.231255531311035, + "learning_rate": 7.24558037549678e-06, + "loss": 0.2938, + "step": 20850 + }, + { + "epoch": 1.082813347802496, + "grad_norm": 4.2868547439575195, + "learning_rate": 7.245443332876525e-06, + "loss": 0.1601, + "step": 20851 + }, + { + "epoch": 1.0828269126424308, + "grad_norm": 3.872075080871582, + "learning_rate": 7.2453062902562695e-06, + "loss": 0.1753, + "step": 20852 + }, + { + "epoch": 1.0828404774823657, + "grad_norm": 4.570948123931885, + "learning_rate": 7.2451692476360155e-06, + "loss": 0.2708, + "step": 20853 + }, + { + "epoch": 1.0828540423223005, + "grad_norm": 5.614495754241943, + "learning_rate": 7.245032205015761e-06, + "loss": 0.2586, + "step": 20854 + }, + { + "epoch": 1.0828676071622354, + "grad_norm": 4.903537273406982, + "learning_rate": 7.244895162395506e-06, + "loss": 0.3096, + "step": 20855 + }, + { + "epoch": 1.0828811720021703, + "grad_norm": 5.560794830322266, + "learning_rate": 7.24475811977525e-06, + "loss": 0.2223, + "step": 20856 + }, + { + "epoch": 1.0828947368421054, + "grad_norm": 4.64922571182251, + "learning_rate": 7.244621077154996e-06, + "loss": 0.2211, + "step": 20857 + }, + { + "epoch": 1.0829083016820402, + "grad_norm": 5.240388870239258, + "learning_rate": 7.244484034534741e-06, + "loss": 0.3329, + "step": 20858 + }, + { + "epoch": 1.082921866521975, + "grad_norm": 5.492027759552002, + "learning_rate": 7.244346991914486e-06, + "loss": 0.2802, + "step": 20859 + }, + { + "epoch": 1.08293543136191, + "grad_norm": 4.439280033111572, + "learning_rate": 7.244209949294231e-06, + "loss": 0.2103, + "step": 20860 + }, + { + "epoch": 1.0829489962018448, + "grad_norm": 4.07358455657959, + "learning_rate": 7.244072906673976e-06, + "loss": 0.2182, + "step": 20861 + }, + { + "epoch": 1.0829625610417797, + "grad_norm": 5.6509690284729, + "learning_rate": 7.243935864053721e-06, + "loss": 0.2561, + "step": 20862 + }, + { + "epoch": 1.0829761258817145, + "grad_norm": 5.568795680999756, + "learning_rate": 7.243798821433466e-06, + "loss": 0.3073, + "step": 20863 + }, + { + "epoch": 1.0829896907216494, + "grad_norm": 4.1143717765808105, + "learning_rate": 7.243661778813212e-06, + "loss": 0.1816, + "step": 20864 + }, + { + "epoch": 1.0830032555615843, + "grad_norm": 4.959621429443359, + "learning_rate": 7.243524736192956e-06, + "loss": 0.296, + "step": 20865 + }, + { + "epoch": 1.0830168204015194, + "grad_norm": 5.37012243270874, + "learning_rate": 7.243387693572702e-06, + "loss": 0.2361, + "step": 20866 + }, + { + "epoch": 1.0830303852414542, + "grad_norm": 4.258942127227783, + "learning_rate": 7.243250650952447e-06, + "loss": 0.2079, + "step": 20867 + }, + { + "epoch": 1.083043950081389, + "grad_norm": 3.832693099975586, + "learning_rate": 7.2431136083321915e-06, + "loss": 0.1336, + "step": 20868 + }, + { + "epoch": 1.083057514921324, + "grad_norm": 4.444338798522949, + "learning_rate": 7.242976565711937e-06, + "loss": 0.1983, + "step": 20869 + }, + { + "epoch": 1.0830710797612588, + "grad_norm": 4.909639835357666, + "learning_rate": 7.242839523091683e-06, + "loss": 0.2774, + "step": 20870 + }, + { + "epoch": 1.0830846446011937, + "grad_norm": 3.849398612976074, + "learning_rate": 7.242702480471428e-06, + "loss": 0.1683, + "step": 20871 + }, + { + "epoch": 1.0830982094411286, + "grad_norm": 4.600011825561523, + "learning_rate": 7.242565437851172e-06, + "loss": 0.1877, + "step": 20872 + }, + { + "epoch": 1.0831117742810634, + "grad_norm": 5.542945384979248, + "learning_rate": 7.242428395230917e-06, + "loss": 0.2777, + "step": 20873 + }, + { + "epoch": 1.0831253391209983, + "grad_norm": 3.9501969814300537, + "learning_rate": 7.242291352610662e-06, + "loss": 0.1661, + "step": 20874 + }, + { + "epoch": 1.0831389039609332, + "grad_norm": 4.774489402770996, + "learning_rate": 7.242154309990408e-06, + "loss": 0.2732, + "step": 20875 + }, + { + "epoch": 1.0831524688008682, + "grad_norm": 4.462541103363037, + "learning_rate": 7.242017267370153e-06, + "loss": 0.3318, + "step": 20876 + }, + { + "epoch": 1.083166033640803, + "grad_norm": 3.9136288166046143, + "learning_rate": 7.241880224749897e-06, + "loss": 0.2678, + "step": 20877 + }, + { + "epoch": 1.083179598480738, + "grad_norm": 4.027106761932373, + "learning_rate": 7.241743182129642e-06, + "loss": 0.2562, + "step": 20878 + }, + { + "epoch": 1.0831931633206728, + "grad_norm": 4.560626029968262, + "learning_rate": 7.2416061395093884e-06, + "loss": 0.2363, + "step": 20879 + }, + { + "epoch": 1.0832067281606077, + "grad_norm": 4.540103912353516, + "learning_rate": 7.241469096889134e-06, + "loss": 0.3316, + "step": 20880 + }, + { + "epoch": 1.0832202930005426, + "grad_norm": 4.887163162231445, + "learning_rate": 7.241332054268878e-06, + "loss": 0.2472, + "step": 20881 + }, + { + "epoch": 1.0832338578404774, + "grad_norm": 5.436851978302002, + "learning_rate": 7.241195011648623e-06, + "loss": 0.2658, + "step": 20882 + }, + { + "epoch": 1.0832474226804123, + "grad_norm": 4.469284534454346, + "learning_rate": 7.241057969028369e-06, + "loss": 0.2361, + "step": 20883 + }, + { + "epoch": 1.0832609875203472, + "grad_norm": 3.948092460632324, + "learning_rate": 7.2409209264081135e-06, + "loss": 0.2341, + "step": 20884 + }, + { + "epoch": 1.0832745523602823, + "grad_norm": 4.201193332672119, + "learning_rate": 7.240783883787859e-06, + "loss": 0.2127, + "step": 20885 + }, + { + "epoch": 1.0832881172002171, + "grad_norm": 5.791634559631348, + "learning_rate": 7.240646841167603e-06, + "loss": 0.3338, + "step": 20886 + }, + { + "epoch": 1.083301682040152, + "grad_norm": 5.716239929199219, + "learning_rate": 7.240509798547349e-06, + "loss": 0.2763, + "step": 20887 + }, + { + "epoch": 1.0833152468800868, + "grad_norm": 5.949872970581055, + "learning_rate": 7.240372755927094e-06, + "loss": 0.2126, + "step": 20888 + }, + { + "epoch": 1.0833288117200217, + "grad_norm": 3.761014461517334, + "learning_rate": 7.240235713306839e-06, + "loss": 0.1984, + "step": 20889 + }, + { + "epoch": 1.0833423765599566, + "grad_norm": 7.372492790222168, + "learning_rate": 7.240098670686584e-06, + "loss": 0.3293, + "step": 20890 + }, + { + "epoch": 1.0833559413998914, + "grad_norm": 5.618335247039795, + "learning_rate": 7.239961628066329e-06, + "loss": 0.2716, + "step": 20891 + }, + { + "epoch": 1.0833695062398263, + "grad_norm": 4.364651679992676, + "learning_rate": 7.239824585446075e-06, + "loss": 0.1601, + "step": 20892 + }, + { + "epoch": 1.0833830710797612, + "grad_norm": 4.422512054443359, + "learning_rate": 7.239687542825819e-06, + "loss": 0.144, + "step": 20893 + }, + { + "epoch": 1.083396635919696, + "grad_norm": 5.149303913116455, + "learning_rate": 7.2395505002055644e-06, + "loss": 0.2448, + "step": 20894 + }, + { + "epoch": 1.0834102007596311, + "grad_norm": 5.157498836517334, + "learning_rate": 7.23941345758531e-06, + "loss": 0.2721, + "step": 20895 + }, + { + "epoch": 1.083423765599566, + "grad_norm": 4.568035125732422, + "learning_rate": 7.239276414965055e-06, + "loss": 0.1807, + "step": 20896 + }, + { + "epoch": 1.0834373304395009, + "grad_norm": 4.306035995483398, + "learning_rate": 7.2391393723448e-06, + "loss": 0.2039, + "step": 20897 + }, + { + "epoch": 1.0834508952794357, + "grad_norm": 3.396042823791504, + "learning_rate": 7.239002329724545e-06, + "loss": 0.1237, + "step": 20898 + }, + { + "epoch": 1.0834644601193706, + "grad_norm": 4.967942714691162, + "learning_rate": 7.2388652871042895e-06, + "loss": 0.1328, + "step": 20899 + }, + { + "epoch": 1.0834780249593055, + "grad_norm": 4.203227996826172, + "learning_rate": 7.2387282444840355e-06, + "loss": 0.1457, + "step": 20900 + }, + { + "epoch": 1.0834915897992403, + "grad_norm": 5.4363203048706055, + "learning_rate": 7.238591201863781e-06, + "loss": 0.2503, + "step": 20901 + }, + { + "epoch": 1.0835051546391752, + "grad_norm": 5.708967208862305, + "learning_rate": 7.238454159243525e-06, + "loss": 0.1856, + "step": 20902 + }, + { + "epoch": 1.08351871947911, + "grad_norm": 3.7550244331359863, + "learning_rate": 7.23831711662327e-06, + "loss": 0.1194, + "step": 20903 + }, + { + "epoch": 1.0835322843190451, + "grad_norm": 3.0662646293640137, + "learning_rate": 7.238180074003015e-06, + "loss": 0.1616, + "step": 20904 + }, + { + "epoch": 1.08354584915898, + "grad_norm": 6.071943283081055, + "learning_rate": 7.238043031382761e-06, + "loss": 0.238, + "step": 20905 + }, + { + "epoch": 1.0835594139989149, + "grad_norm": 4.57421350479126, + "learning_rate": 7.237905988762506e-06, + "loss": 0.1773, + "step": 20906 + }, + { + "epoch": 1.0835729788388497, + "grad_norm": 6.3961944580078125, + "learning_rate": 7.237768946142251e-06, + "loss": 0.215, + "step": 20907 + }, + { + "epoch": 1.0835865436787846, + "grad_norm": 4.63777494430542, + "learning_rate": 7.237631903521995e-06, + "loss": 0.0934, + "step": 20908 + }, + { + "epoch": 1.0836001085187195, + "grad_norm": 4.842268466949463, + "learning_rate": 7.237494860901741e-06, + "loss": 0.2694, + "step": 20909 + }, + { + "epoch": 1.0836136733586543, + "grad_norm": 4.244537830352783, + "learning_rate": 7.2373578182814864e-06, + "loss": 0.297, + "step": 20910 + }, + { + "epoch": 1.0836272381985892, + "grad_norm": 3.517399787902832, + "learning_rate": 7.237220775661231e-06, + "loss": 0.1455, + "step": 20911 + }, + { + "epoch": 1.083640803038524, + "grad_norm": 4.86461877822876, + "learning_rate": 7.237083733040976e-06, + "loss": 0.1736, + "step": 20912 + }, + { + "epoch": 1.083654367878459, + "grad_norm": 5.57005500793457, + "learning_rate": 7.236946690420722e-06, + "loss": 0.3376, + "step": 20913 + }, + { + "epoch": 1.083667932718394, + "grad_norm": 5.274788856506348, + "learning_rate": 7.236809647800467e-06, + "loss": 0.2016, + "step": 20914 + }, + { + "epoch": 1.0836814975583289, + "grad_norm": 6.553570747375488, + "learning_rate": 7.2366726051802115e-06, + "loss": 0.2116, + "step": 20915 + }, + { + "epoch": 1.0836950623982637, + "grad_norm": 5.522340297698975, + "learning_rate": 7.236535562559957e-06, + "loss": 0.3537, + "step": 20916 + }, + { + "epoch": 1.0837086272381986, + "grad_norm": 4.258124828338623, + "learning_rate": 7.236398519939701e-06, + "loss": 0.1732, + "step": 20917 + }, + { + "epoch": 1.0837221920781335, + "grad_norm": 5.542209625244141, + "learning_rate": 7.236261477319447e-06, + "loss": 0.2208, + "step": 20918 + }, + { + "epoch": 1.0837357569180683, + "grad_norm": 5.253922939300537, + "learning_rate": 7.236124434699192e-06, + "loss": 0.391, + "step": 20919 + }, + { + "epoch": 1.0837493217580032, + "grad_norm": 4.361988544464111, + "learning_rate": 7.235987392078937e-06, + "loss": 0.2342, + "step": 20920 + }, + { + "epoch": 1.083762886597938, + "grad_norm": 4.425815582275391, + "learning_rate": 7.235850349458682e-06, + "loss": 0.2444, + "step": 20921 + }, + { + "epoch": 1.083776451437873, + "grad_norm": 6.178679466247559, + "learning_rate": 7.235713306838428e-06, + "loss": 0.2378, + "step": 20922 + }, + { + "epoch": 1.083790016277808, + "grad_norm": 6.411380290985107, + "learning_rate": 7.235576264218173e-06, + "loss": 0.3845, + "step": 20923 + }, + { + "epoch": 1.0838035811177429, + "grad_norm": 5.900778770446777, + "learning_rate": 7.235439221597917e-06, + "loss": 0.2879, + "step": 20924 + }, + { + "epoch": 1.0838171459576778, + "grad_norm": 5.387191295623779, + "learning_rate": 7.2353021789776624e-06, + "loss": 0.2769, + "step": 20925 + }, + { + "epoch": 1.0838307107976126, + "grad_norm": 4.2836785316467285, + "learning_rate": 7.2351651363574085e-06, + "loss": 0.185, + "step": 20926 + }, + { + "epoch": 1.0838442756375475, + "grad_norm": 5.397891998291016, + "learning_rate": 7.235028093737153e-06, + "loss": 0.2671, + "step": 20927 + }, + { + "epoch": 1.0838578404774823, + "grad_norm": 5.087418556213379, + "learning_rate": 7.234891051116898e-06, + "loss": 0.1789, + "step": 20928 + }, + { + "epoch": 1.0838714053174172, + "grad_norm": 5.745080471038818, + "learning_rate": 7.234754008496643e-06, + "loss": 0.2944, + "step": 20929 + }, + { + "epoch": 1.083884970157352, + "grad_norm": 4.767582893371582, + "learning_rate": 7.2346169658763875e-06, + "loss": 0.234, + "step": 20930 + }, + { + "epoch": 1.083898534997287, + "grad_norm": 5.682858943939209, + "learning_rate": 7.2344799232561335e-06, + "loss": 0.3432, + "step": 20931 + }, + { + "epoch": 1.0839120998372218, + "grad_norm": 6.09846830368042, + "learning_rate": 7.234342880635879e-06, + "loss": 0.3232, + "step": 20932 + }, + { + "epoch": 1.083925664677157, + "grad_norm": 5.59224271774292, + "learning_rate": 7.234205838015623e-06, + "loss": 0.234, + "step": 20933 + }, + { + "epoch": 1.0839392295170918, + "grad_norm": 5.378735542297363, + "learning_rate": 7.234068795395368e-06, + "loss": 0.3284, + "step": 20934 + }, + { + "epoch": 1.0839527943570266, + "grad_norm": 5.347070217132568, + "learning_rate": 7.233931752775114e-06, + "loss": 0.2568, + "step": 20935 + }, + { + "epoch": 1.0839663591969615, + "grad_norm": 4.811437129974365, + "learning_rate": 7.2337947101548586e-06, + "loss": 0.2765, + "step": 20936 + }, + { + "epoch": 1.0839799240368964, + "grad_norm": 6.259610652923584, + "learning_rate": 7.233657667534604e-06, + "loss": 0.4004, + "step": 20937 + }, + { + "epoch": 1.0839934888768312, + "grad_norm": 5.25627326965332, + "learning_rate": 7.233520624914349e-06, + "loss": 0.2699, + "step": 20938 + }, + { + "epoch": 1.084007053716766, + "grad_norm": 6.351256370544434, + "learning_rate": 7.233383582294095e-06, + "loss": 0.2353, + "step": 20939 + }, + { + "epoch": 1.084020618556701, + "grad_norm": 6.49965238571167, + "learning_rate": 7.233246539673839e-06, + "loss": 0.3007, + "step": 20940 + }, + { + "epoch": 1.0840341833966358, + "grad_norm": 4.7104716300964355, + "learning_rate": 7.2331094970535844e-06, + "loss": 0.2836, + "step": 20941 + }, + { + "epoch": 1.084047748236571, + "grad_norm": 6.710052013397217, + "learning_rate": 7.232972454433329e-06, + "loss": 0.4607, + "step": 20942 + }, + { + "epoch": 1.0840613130765058, + "grad_norm": 5.968766212463379, + "learning_rate": 7.232835411813074e-06, + "loss": 0.2973, + "step": 20943 + }, + { + "epoch": 1.0840748779164406, + "grad_norm": 7.25742244720459, + "learning_rate": 7.23269836919282e-06, + "loss": 0.4894, + "step": 20944 + }, + { + "epoch": 1.0840884427563755, + "grad_norm": 6.062483310699463, + "learning_rate": 7.232561326572564e-06, + "loss": 0.3733, + "step": 20945 + }, + { + "epoch": 1.0841020075963104, + "grad_norm": 3.97226881980896, + "learning_rate": 7.2324242839523095e-06, + "loss": 0.1847, + "step": 20946 + }, + { + "epoch": 1.0841155724362452, + "grad_norm": 5.398727893829346, + "learning_rate": 7.232287241332055e-06, + "loss": 0.3627, + "step": 20947 + }, + { + "epoch": 1.08412913727618, + "grad_norm": 5.420177936553955, + "learning_rate": 7.232150198711801e-06, + "loss": 0.2831, + "step": 20948 + }, + { + "epoch": 1.084142702116115, + "grad_norm": 7.269344329833984, + "learning_rate": 7.232013156091545e-06, + "loss": 0.4284, + "step": 20949 + }, + { + "epoch": 1.0841562669560498, + "grad_norm": 7.800894737243652, + "learning_rate": 7.23187611347129e-06, + "loss": 0.4118, + "step": 20950 + }, + { + "epoch": 1.0841698317959847, + "grad_norm": 4.614266395568848, + "learning_rate": 7.2317390708510345e-06, + "loss": 0.3504, + "step": 20951 + }, + { + "epoch": 1.0841833966359198, + "grad_norm": 5.336808681488037, + "learning_rate": 7.2316020282307806e-06, + "loss": 0.4427, + "step": 20952 + }, + { + "epoch": 1.0841969614758546, + "grad_norm": 5.830266952514648, + "learning_rate": 7.231464985610526e-06, + "loss": 0.2776, + "step": 20953 + }, + { + "epoch": 1.0842105263157895, + "grad_norm": 7.757440090179443, + "learning_rate": 7.231327942990271e-06, + "loss": 0.5303, + "step": 20954 + }, + { + "epoch": 1.0842240911557244, + "grad_norm": 6.206363201141357, + "learning_rate": 7.231190900370015e-06, + "loss": 0.3606, + "step": 20955 + }, + { + "epoch": 1.0842376559956592, + "grad_norm": 5.509872913360596, + "learning_rate": 7.231053857749761e-06, + "loss": 0.342, + "step": 20956 + }, + { + "epoch": 1.084251220835594, + "grad_norm": 6.413431644439697, + "learning_rate": 7.2309168151295065e-06, + "loss": 0.3404, + "step": 20957 + }, + { + "epoch": 1.084264785675529, + "grad_norm": 7.889306545257568, + "learning_rate": 7.230779772509251e-06, + "loss": 0.5328, + "step": 20958 + }, + { + "epoch": 1.0842783505154638, + "grad_norm": 7.701141834259033, + "learning_rate": 7.230642729888996e-06, + "loss": 0.6033, + "step": 20959 + }, + { + "epoch": 1.0842919153553987, + "grad_norm": 5.996570110321045, + "learning_rate": 7.23050568726874e-06, + "loss": 0.3191, + "step": 20960 + }, + { + "epoch": 1.0843054801953338, + "grad_norm": 5.992147922515869, + "learning_rate": 7.230368644648486e-06, + "loss": 0.3522, + "step": 20961 + }, + { + "epoch": 1.0843190450352687, + "grad_norm": 8.676639556884766, + "learning_rate": 7.2302316020282315e-06, + "loss": 0.3644, + "step": 20962 + }, + { + "epoch": 1.0843326098752035, + "grad_norm": 5.936452388763428, + "learning_rate": 7.230094559407977e-06, + "loss": 0.2751, + "step": 20963 + }, + { + "epoch": 1.0843461747151384, + "grad_norm": 5.465178966522217, + "learning_rate": 7.229957516787721e-06, + "loss": 0.277, + "step": 20964 + }, + { + "epoch": 1.0843597395550733, + "grad_norm": 4.138449668884277, + "learning_rate": 7.229820474167467e-06, + "loss": 0.244, + "step": 20965 + }, + { + "epoch": 1.0843733043950081, + "grad_norm": 6.150457382202148, + "learning_rate": 7.229683431547212e-06, + "loss": 0.3586, + "step": 20966 + }, + { + "epoch": 1.084386869234943, + "grad_norm": 5.346713542938232, + "learning_rate": 7.2295463889269566e-06, + "loss": 0.2432, + "step": 20967 + }, + { + "epoch": 1.0844004340748779, + "grad_norm": 4.582415580749512, + "learning_rate": 7.229409346306702e-06, + "loss": 0.3967, + "step": 20968 + }, + { + "epoch": 1.0844139989148127, + "grad_norm": 5.222153663635254, + "learning_rate": 7.229272303686448e-06, + "loss": 0.2015, + "step": 20969 + }, + { + "epoch": 1.0844275637547476, + "grad_norm": 6.539583206176758, + "learning_rate": 7.229135261066192e-06, + "loss": 0.316, + "step": 20970 + }, + { + "epoch": 1.0844411285946827, + "grad_norm": 5.882848262786865, + "learning_rate": 7.228998218445937e-06, + "loss": 0.4114, + "step": 20971 + }, + { + "epoch": 1.0844546934346175, + "grad_norm": 6.854256629943848, + "learning_rate": 7.2288611758256825e-06, + "loss": 0.5277, + "step": 20972 + }, + { + "epoch": 1.0844682582745524, + "grad_norm": 6.695127487182617, + "learning_rate": 7.228724133205427e-06, + "loss": 0.3881, + "step": 20973 + }, + { + "epoch": 1.0844818231144873, + "grad_norm": 5.355132102966309, + "learning_rate": 7.228587090585173e-06, + "loss": 0.3013, + "step": 20974 + }, + { + "epoch": 1.0844953879544221, + "grad_norm": 6.526094436645508, + "learning_rate": 7.228450047964918e-06, + "loss": 0.4198, + "step": 20975 + }, + { + "epoch": 1.084508952794357, + "grad_norm": 4.605541706085205, + "learning_rate": 7.228313005344662e-06, + "loss": 0.3641, + "step": 20976 + }, + { + "epoch": 1.0845225176342919, + "grad_norm": 5.6515679359436035, + "learning_rate": 7.2281759627244075e-06, + "loss": 0.293, + "step": 20977 + }, + { + "epoch": 1.0845360824742267, + "grad_norm": 5.701219081878662, + "learning_rate": 7.2280389201041535e-06, + "loss": 0.3333, + "step": 20978 + }, + { + "epoch": 1.0845496473141616, + "grad_norm": 5.331916332244873, + "learning_rate": 7.227901877483898e-06, + "loss": 0.3323, + "step": 20979 + }, + { + "epoch": 1.0845632121540967, + "grad_norm": 4.866812705993652, + "learning_rate": 7.227764834863643e-06, + "loss": 0.3638, + "step": 20980 + }, + { + "epoch": 1.0845767769940315, + "grad_norm": 5.96571159362793, + "learning_rate": 7.227627792243388e-06, + "loss": 0.5182, + "step": 20981 + }, + { + "epoch": 1.0845903418339664, + "grad_norm": 4.886923789978027, + "learning_rate": 7.227490749623134e-06, + "loss": 0.3548, + "step": 20982 + }, + { + "epoch": 1.0846039066739013, + "grad_norm": 5.737141132354736, + "learning_rate": 7.2273537070028786e-06, + "loss": 0.56, + "step": 20983 + }, + { + "epoch": 1.0846174715138361, + "grad_norm": 6.8078789710998535, + "learning_rate": 7.227216664382624e-06, + "loss": 0.4081, + "step": 20984 + }, + { + "epoch": 1.084631036353771, + "grad_norm": 4.387343406677246, + "learning_rate": 7.227079621762368e-06, + "loss": 0.2104, + "step": 20985 + }, + { + "epoch": 1.0846446011937059, + "grad_norm": 5.925597667694092, + "learning_rate": 7.226942579142113e-06, + "loss": 0.2192, + "step": 20986 + }, + { + "epoch": 1.0846581660336407, + "grad_norm": 4.864142894744873, + "learning_rate": 7.226805536521859e-06, + "loss": 0.3006, + "step": 20987 + }, + { + "epoch": 1.0846717308735756, + "grad_norm": 5.5036211013793945, + "learning_rate": 7.2266684939016045e-06, + "loss": 0.3145, + "step": 20988 + }, + { + "epoch": 1.0846852957135105, + "grad_norm": 6.1739726066589355, + "learning_rate": 7.226531451281349e-06, + "loss": 0.2446, + "step": 20989 + }, + { + "epoch": 1.0846988605534456, + "grad_norm": 4.782867908477783, + "learning_rate": 7.226394408661094e-06, + "loss": 0.2513, + "step": 20990 + }, + { + "epoch": 1.0847124253933804, + "grad_norm": 5.660556316375732, + "learning_rate": 7.22625736604084e-06, + "loss": 0.3414, + "step": 20991 + }, + { + "epoch": 1.0847259902333153, + "grad_norm": 4.878398895263672, + "learning_rate": 7.226120323420584e-06, + "loss": 0.3354, + "step": 20992 + }, + { + "epoch": 1.0847395550732502, + "grad_norm": 5.939437389373779, + "learning_rate": 7.2259832808003295e-06, + "loss": 0.342, + "step": 20993 + }, + { + "epoch": 1.084753119913185, + "grad_norm": 4.456808090209961, + "learning_rate": 7.225846238180074e-06, + "loss": 0.2707, + "step": 20994 + }, + { + "epoch": 1.0847666847531199, + "grad_norm": 4.798226356506348, + "learning_rate": 7.22570919555982e-06, + "loss": 0.3336, + "step": 20995 + }, + { + "epoch": 1.0847802495930547, + "grad_norm": 5.105687141418457, + "learning_rate": 7.225572152939565e-06, + "loss": 0.3835, + "step": 20996 + }, + { + "epoch": 1.0847938144329896, + "grad_norm": 5.522549629211426, + "learning_rate": 7.22543511031931e-06, + "loss": 0.2933, + "step": 20997 + }, + { + "epoch": 1.0848073792729245, + "grad_norm": 5.06124210357666, + "learning_rate": 7.2252980676990546e-06, + "loss": 0.2319, + "step": 20998 + }, + { + "epoch": 1.0848209441128596, + "grad_norm": 5.92821741104126, + "learning_rate": 7.2251610250788e-06, + "loss": 0.407, + "step": 20999 + }, + { + "epoch": 1.0848345089527944, + "grad_norm": 5.402624130249023, + "learning_rate": 7.225023982458546e-06, + "loss": 0.4023, + "step": 21000 + }, + { + "epoch": 1.0848480737927293, + "grad_norm": 4.668753147125244, + "learning_rate": 7.22488693983829e-06, + "loss": 0.2474, + "step": 21001 + }, + { + "epoch": 1.0848616386326642, + "grad_norm": 5.217726707458496, + "learning_rate": 7.224749897218035e-06, + "loss": 0.2724, + "step": 21002 + }, + { + "epoch": 1.084875203472599, + "grad_norm": 4.0577850341796875, + "learning_rate": 7.2246128545977805e-06, + "loss": 0.2181, + "step": 21003 + }, + { + "epoch": 1.084888768312534, + "grad_norm": 4.535391330718994, + "learning_rate": 7.224475811977526e-06, + "loss": 0.3321, + "step": 21004 + }, + { + "epoch": 1.0849023331524688, + "grad_norm": 6.507805347442627, + "learning_rate": 7.224338769357271e-06, + "loss": 0.3066, + "step": 21005 + }, + { + "epoch": 1.0849158979924036, + "grad_norm": 5.376535415649414, + "learning_rate": 7.224201726737016e-06, + "loss": 0.3283, + "step": 21006 + }, + { + "epoch": 1.0849294628323385, + "grad_norm": 4.154804706573486, + "learning_rate": 7.22406468411676e-06, + "loss": 0.3429, + "step": 21007 + }, + { + "epoch": 1.0849430276722734, + "grad_norm": 6.933816432952881, + "learning_rate": 7.223927641496506e-06, + "loss": 0.3799, + "step": 21008 + }, + { + "epoch": 1.0849565925122084, + "grad_norm": 5.357234954833984, + "learning_rate": 7.2237905988762515e-06, + "loss": 0.3133, + "step": 21009 + }, + { + "epoch": 1.0849701573521433, + "grad_norm": 5.306432247161865, + "learning_rate": 7.223653556255996e-06, + "loss": 0.261, + "step": 21010 + }, + { + "epoch": 1.0849837221920782, + "grad_norm": 5.95938777923584, + "learning_rate": 7.223516513635741e-06, + "loss": 0.3749, + "step": 21011 + }, + { + "epoch": 1.084997287032013, + "grad_norm": 5.055357456207275, + "learning_rate": 7.223379471015486e-06, + "loss": 0.3211, + "step": 21012 + }, + { + "epoch": 1.085010851871948, + "grad_norm": 4.395177364349365, + "learning_rate": 7.223242428395232e-06, + "loss": 0.2162, + "step": 21013 + }, + { + "epoch": 1.0850244167118828, + "grad_norm": 4.8364577293396, + "learning_rate": 7.2231053857749766e-06, + "loss": 0.3359, + "step": 21014 + }, + { + "epoch": 1.0850379815518176, + "grad_norm": 6.396581172943115, + "learning_rate": 7.222968343154722e-06, + "loss": 0.5167, + "step": 21015 + }, + { + "epoch": 1.0850515463917525, + "grad_norm": 3.923670768737793, + "learning_rate": 7.222831300534466e-06, + "loss": 0.1409, + "step": 21016 + }, + { + "epoch": 1.0850651112316874, + "grad_norm": 7.223666191101074, + "learning_rate": 7.222694257914212e-06, + "loss": 0.37, + "step": 21017 + }, + { + "epoch": 1.0850786760716224, + "grad_norm": 6.96862268447876, + "learning_rate": 7.222557215293957e-06, + "loss": 0.454, + "step": 21018 + }, + { + "epoch": 1.0850922409115573, + "grad_norm": 5.023949146270752, + "learning_rate": 7.222420172673702e-06, + "loss": 0.3859, + "step": 21019 + }, + { + "epoch": 1.0851058057514922, + "grad_norm": 5.413416862487793, + "learning_rate": 7.222283130053447e-06, + "loss": 0.2856, + "step": 21020 + }, + { + "epoch": 1.085119370591427, + "grad_norm": 4.577193260192871, + "learning_rate": 7.222146087433193e-06, + "loss": 0.2507, + "step": 21021 + }, + { + "epoch": 1.085132935431362, + "grad_norm": 4.654657363891602, + "learning_rate": 7.222009044812938e-06, + "loss": 0.3553, + "step": 21022 + }, + { + "epoch": 1.0851465002712968, + "grad_norm": 4.702218055725098, + "learning_rate": 7.221872002192682e-06, + "loss": 0.4198, + "step": 21023 + }, + { + "epoch": 1.0851600651112316, + "grad_norm": 5.875777244567871, + "learning_rate": 7.2217349595724275e-06, + "loss": 0.3637, + "step": 21024 + }, + { + "epoch": 1.0851736299511665, + "grad_norm": 4.514785289764404, + "learning_rate": 7.2215979169521735e-06, + "loss": 0.3245, + "step": 21025 + }, + { + "epoch": 1.0851871947911014, + "grad_norm": 7.776416778564453, + "learning_rate": 7.221460874331918e-06, + "loss": 0.4126, + "step": 21026 + }, + { + "epoch": 1.0852007596310362, + "grad_norm": 6.221984386444092, + "learning_rate": 7.221323831711663e-06, + "loss": 0.4305, + "step": 21027 + }, + { + "epoch": 1.0852143244709713, + "grad_norm": 5.362582206726074, + "learning_rate": 7.221186789091407e-06, + "loss": 0.2815, + "step": 21028 + }, + { + "epoch": 1.0852278893109062, + "grad_norm": 4.016453266143799, + "learning_rate": 7.2210497464711526e-06, + "loss": 0.1945, + "step": 21029 + }, + { + "epoch": 1.085241454150841, + "grad_norm": 5.456981658935547, + "learning_rate": 7.220912703850899e-06, + "loss": 0.3015, + "step": 21030 + }, + { + "epoch": 1.085255018990776, + "grad_norm": 4.911811351776123, + "learning_rate": 7.220775661230644e-06, + "loss": 0.3599, + "step": 21031 + }, + { + "epoch": 1.0852685838307108, + "grad_norm": 5.359776973724365, + "learning_rate": 7.220638618610388e-06, + "loss": 0.3731, + "step": 21032 + }, + { + "epoch": 1.0852821486706457, + "grad_norm": 5.592791557312012, + "learning_rate": 7.220501575990133e-06, + "loss": 0.3248, + "step": 21033 + }, + { + "epoch": 1.0852957135105805, + "grad_norm": 7.271799564361572, + "learning_rate": 7.220364533369879e-06, + "loss": 0.631, + "step": 21034 + }, + { + "epoch": 1.0853092783505154, + "grad_norm": 5.196980953216553, + "learning_rate": 7.220227490749624e-06, + "loss": 0.3212, + "step": 21035 + }, + { + "epoch": 1.0853228431904505, + "grad_norm": 4.46614408493042, + "learning_rate": 7.220090448129369e-06, + "loss": 0.3483, + "step": 21036 + }, + { + "epoch": 1.0853364080303853, + "grad_norm": 4.5139946937561035, + "learning_rate": 7.219953405509114e-06, + "loss": 0.2201, + "step": 21037 + }, + { + "epoch": 1.0853499728703202, + "grad_norm": 5.759603977203369, + "learning_rate": 7.219816362888859e-06, + "loss": 0.3681, + "step": 21038 + }, + { + "epoch": 1.085363537710255, + "grad_norm": 4.40256404876709, + "learning_rate": 7.219679320268604e-06, + "loss": 0.2446, + "step": 21039 + }, + { + "epoch": 1.08537710255019, + "grad_norm": 4.485347270965576, + "learning_rate": 7.2195422776483495e-06, + "loss": 0.2167, + "step": 21040 + }, + { + "epoch": 1.0853906673901248, + "grad_norm": 6.237647533416748, + "learning_rate": 7.219405235028094e-06, + "loss": 0.4247, + "step": 21041 + }, + { + "epoch": 1.0854042322300597, + "grad_norm": 4.588376522064209, + "learning_rate": 7.219268192407839e-06, + "loss": 0.288, + "step": 21042 + }, + { + "epoch": 1.0854177970699945, + "grad_norm": 4.487445831298828, + "learning_rate": 7.219131149787585e-06, + "loss": 0.2803, + "step": 21043 + }, + { + "epoch": 1.0854313619099294, + "grad_norm": 6.321045875549316, + "learning_rate": 7.218994107167329e-06, + "loss": 0.3773, + "step": 21044 + }, + { + "epoch": 1.0854449267498643, + "grad_norm": 5.396479606628418, + "learning_rate": 7.218857064547075e-06, + "loss": 0.3333, + "step": 21045 + }, + { + "epoch": 1.0854584915897991, + "grad_norm": 5.079116344451904, + "learning_rate": 7.21872002192682e-06, + "loss": 0.2359, + "step": 21046 + }, + { + "epoch": 1.0854720564297342, + "grad_norm": 5.047358512878418, + "learning_rate": 7.218582979306566e-06, + "loss": 0.3156, + "step": 21047 + }, + { + "epoch": 1.085485621269669, + "grad_norm": 4.860149383544922, + "learning_rate": 7.21844593668631e-06, + "loss": 0.3266, + "step": 21048 + }, + { + "epoch": 1.085499186109604, + "grad_norm": 5.185268402099609, + "learning_rate": 7.218308894066055e-06, + "loss": 0.2777, + "step": 21049 + }, + { + "epoch": 1.0855127509495388, + "grad_norm": 4.872859954833984, + "learning_rate": 7.2181718514458e-06, + "loss": 0.386, + "step": 21050 + }, + { + "epoch": 1.0855263157894737, + "grad_norm": 3.5241401195526123, + "learning_rate": 7.218034808825546e-06, + "loss": 0.1799, + "step": 21051 + }, + { + "epoch": 1.0855398806294085, + "grad_norm": 6.088438987731934, + "learning_rate": 7.217897766205291e-06, + "loss": 0.2401, + "step": 21052 + }, + { + "epoch": 1.0855534454693434, + "grad_norm": 5.417239665985107, + "learning_rate": 7.217760723585035e-06, + "loss": 0.4003, + "step": 21053 + }, + { + "epoch": 1.0855670103092783, + "grad_norm": 5.614623069763184, + "learning_rate": 7.21762368096478e-06, + "loss": 0.3388, + "step": 21054 + }, + { + "epoch": 1.0855805751492134, + "grad_norm": 4.049798488616943, + "learning_rate": 7.2174866383445255e-06, + "loss": 0.1748, + "step": 21055 + }, + { + "epoch": 1.0855941399891482, + "grad_norm": 3.763493061065674, + "learning_rate": 7.2173495957242715e-06, + "loss": 0.2511, + "step": 21056 + }, + { + "epoch": 1.085607704829083, + "grad_norm": 5.780097961425781, + "learning_rate": 7.217212553104016e-06, + "loss": 0.355, + "step": 21057 + }, + { + "epoch": 1.085621269669018, + "grad_norm": 5.390098571777344, + "learning_rate": 7.217075510483761e-06, + "loss": 0.2414, + "step": 21058 + }, + { + "epoch": 1.0856348345089528, + "grad_norm": 5.214386940002441, + "learning_rate": 7.216938467863505e-06, + "loss": 0.2839, + "step": 21059 + }, + { + "epoch": 1.0856483993488877, + "grad_norm": 5.491428852081299, + "learning_rate": 7.216801425243251e-06, + "loss": 0.2324, + "step": 21060 + }, + { + "epoch": 1.0856619641888225, + "grad_norm": 4.648648262023926, + "learning_rate": 7.216664382622997e-06, + "loss": 0.3095, + "step": 21061 + }, + { + "epoch": 1.0856755290287574, + "grad_norm": 5.459575653076172, + "learning_rate": 7.216527340002742e-06, + "loss": 0.2873, + "step": 21062 + }, + { + "epoch": 1.0856890938686923, + "grad_norm": 6.250070095062256, + "learning_rate": 7.216390297382486e-06, + "loss": 0.4319, + "step": 21063 + }, + { + "epoch": 1.0857026587086271, + "grad_norm": 3.6626884937286377, + "learning_rate": 7.216253254762232e-06, + "loss": 0.1746, + "step": 21064 + }, + { + "epoch": 1.085716223548562, + "grad_norm": 3.942315101623535, + "learning_rate": 7.216116212141977e-06, + "loss": 0.1813, + "step": 21065 + }, + { + "epoch": 1.085729788388497, + "grad_norm": 4.8987908363342285, + "learning_rate": 7.215979169521722e-06, + "loss": 0.2426, + "step": 21066 + }, + { + "epoch": 1.085743353228432, + "grad_norm": 3.5327367782592773, + "learning_rate": 7.215842126901467e-06, + "loss": 0.1888, + "step": 21067 + }, + { + "epoch": 1.0857569180683668, + "grad_norm": 5.9155707359313965, + "learning_rate": 7.215705084281211e-06, + "loss": 0.3455, + "step": 21068 + }, + { + "epoch": 1.0857704829083017, + "grad_norm": 4.83095645904541, + "learning_rate": 7.215568041660957e-06, + "loss": 0.2956, + "step": 21069 + }, + { + "epoch": 1.0857840477482366, + "grad_norm": 4.045950412750244, + "learning_rate": 7.215430999040702e-06, + "loss": 0.1387, + "step": 21070 + }, + { + "epoch": 1.0857976125881714, + "grad_norm": 5.781581401824951, + "learning_rate": 7.2152939564204475e-06, + "loss": 0.3181, + "step": 21071 + }, + { + "epoch": 1.0858111774281063, + "grad_norm": 3.377873182296753, + "learning_rate": 7.215156913800192e-06, + "loss": 0.1296, + "step": 21072 + }, + { + "epoch": 1.0858247422680412, + "grad_norm": 6.36823844909668, + "learning_rate": 7.215019871179938e-06, + "loss": 0.3047, + "step": 21073 + }, + { + "epoch": 1.0858383071079762, + "grad_norm": 4.093776226043701, + "learning_rate": 7.214882828559683e-06, + "loss": 0.2304, + "step": 21074 + }, + { + "epoch": 1.085851871947911, + "grad_norm": 6.098893642425537, + "learning_rate": 7.214745785939427e-06, + "loss": 0.2201, + "step": 21075 + }, + { + "epoch": 1.085865436787846, + "grad_norm": 5.131186008453369, + "learning_rate": 7.214608743319173e-06, + "loss": 0.201, + "step": 21076 + }, + { + "epoch": 1.0858790016277808, + "grad_norm": 5.031842231750488, + "learning_rate": 7.214471700698919e-06, + "loss": 0.2369, + "step": 21077 + }, + { + "epoch": 1.0858925664677157, + "grad_norm": 5.188523292541504, + "learning_rate": 7.214334658078663e-06, + "loss": 0.1886, + "step": 21078 + }, + { + "epoch": 1.0859061313076506, + "grad_norm": 5.582026958465576, + "learning_rate": 7.214197615458408e-06, + "loss": 0.1786, + "step": 21079 + }, + { + "epoch": 1.0859196961475854, + "grad_norm": 4.834056377410889, + "learning_rate": 7.214060572838153e-06, + "loss": 0.227, + "step": 21080 + }, + { + "epoch": 1.0859332609875203, + "grad_norm": 6.122886657714844, + "learning_rate": 7.213923530217898e-06, + "loss": 0.2343, + "step": 21081 + }, + { + "epoch": 1.0859468258274552, + "grad_norm": 6.236581802368164, + "learning_rate": 7.213786487597644e-06, + "loss": 0.2391, + "step": 21082 + }, + { + "epoch": 1.08596039066739, + "grad_norm": 5.631651401519775, + "learning_rate": 7.213649444977389e-06, + "loss": 0.2921, + "step": 21083 + }, + { + "epoch": 1.0859739555073251, + "grad_norm": 10.213994979858398, + "learning_rate": 7.213512402357133e-06, + "loss": 0.4701, + "step": 21084 + }, + { + "epoch": 1.08598752034726, + "grad_norm": 6.398601531982422, + "learning_rate": 7.213375359736878e-06, + "loss": 0.1724, + "step": 21085 + }, + { + "epoch": 1.0860010851871948, + "grad_norm": 4.517728328704834, + "learning_rate": 7.213238317116624e-06, + "loss": 0.1689, + "step": 21086 + }, + { + "epoch": 1.0860146500271297, + "grad_norm": 7.196621417999268, + "learning_rate": 7.213101274496369e-06, + "loss": 0.3111, + "step": 21087 + }, + { + "epoch": 1.0860282148670646, + "grad_norm": 4.673101902008057, + "learning_rate": 7.212964231876114e-06, + "loss": 0.1979, + "step": 21088 + }, + { + "epoch": 1.0860417797069994, + "grad_norm": 4.499670028686523, + "learning_rate": 7.212827189255859e-06, + "loss": 0.2142, + "step": 21089 + }, + { + "epoch": 1.0860553445469343, + "grad_norm": 6.243876934051514, + "learning_rate": 7.212690146635605e-06, + "loss": 0.2742, + "step": 21090 + }, + { + "epoch": 1.0860689093868692, + "grad_norm": 5.7344489097595215, + "learning_rate": 7.212553104015349e-06, + "loss": 0.2417, + "step": 21091 + }, + { + "epoch": 1.086082474226804, + "grad_norm": 6.1092729568481445, + "learning_rate": 7.212416061395095e-06, + "loss": 0.3146, + "step": 21092 + }, + { + "epoch": 1.0860960390667391, + "grad_norm": 5.411993980407715, + "learning_rate": 7.212279018774839e-06, + "loss": 0.2472, + "step": 21093 + }, + { + "epoch": 1.086109603906674, + "grad_norm": 6.387742042541504, + "learning_rate": 7.212141976154584e-06, + "loss": 0.3351, + "step": 21094 + }, + { + "epoch": 1.0861231687466089, + "grad_norm": 4.523350238800049, + "learning_rate": 7.21200493353433e-06, + "loss": 0.1627, + "step": 21095 + }, + { + "epoch": 1.0861367335865437, + "grad_norm": 4.897469520568848, + "learning_rate": 7.211867890914075e-06, + "loss": 0.2747, + "step": 21096 + }, + { + "epoch": 1.0861502984264786, + "grad_norm": 5.908378601074219, + "learning_rate": 7.21173084829382e-06, + "loss": 0.262, + "step": 21097 + }, + { + "epoch": 1.0861638632664135, + "grad_norm": 3.5293474197387695, + "learning_rate": 7.211593805673565e-06, + "loss": 0.1213, + "step": 21098 + }, + { + "epoch": 1.0861774281063483, + "grad_norm": 5.58651876449585, + "learning_rate": 7.211456763053311e-06, + "loss": 0.2741, + "step": 21099 + }, + { + "epoch": 1.0861909929462832, + "grad_norm": 6.785565376281738, + "learning_rate": 7.211319720433055e-06, + "loss": 0.225, + "step": 21100 + }, + { + "epoch": 1.086204557786218, + "grad_norm": 6.135234832763672, + "learning_rate": 7.2111826778128e-06, + "loss": 0.2493, + "step": 21101 + }, + { + "epoch": 1.086218122626153, + "grad_norm": 4.426033973693848, + "learning_rate": 7.211045635192545e-06, + "loss": 0.1907, + "step": 21102 + }, + { + "epoch": 1.086231687466088, + "grad_norm": 5.853628635406494, + "learning_rate": 7.210908592572291e-06, + "loss": 0.2904, + "step": 21103 + }, + { + "epoch": 1.0862452523060229, + "grad_norm": 8.055161476135254, + "learning_rate": 7.210771549952036e-06, + "loss": 0.4039, + "step": 21104 + }, + { + "epoch": 1.0862588171459577, + "grad_norm": 5.048562526702881, + "learning_rate": 7.210634507331781e-06, + "loss": 0.2479, + "step": 21105 + }, + { + "epoch": 1.0862723819858926, + "grad_norm": 5.234737396240234, + "learning_rate": 7.210497464711525e-06, + "loss": 0.3488, + "step": 21106 + }, + { + "epoch": 1.0862859468258275, + "grad_norm": 7.743375778198242, + "learning_rate": 7.2103604220912714e-06, + "loss": 0.4118, + "step": 21107 + }, + { + "epoch": 1.0862995116657623, + "grad_norm": 5.169741153717041, + "learning_rate": 7.210223379471017e-06, + "loss": 0.2725, + "step": 21108 + }, + { + "epoch": 1.0863130765056972, + "grad_norm": 4.84299898147583, + "learning_rate": 7.210086336850761e-06, + "loss": 0.1955, + "step": 21109 + }, + { + "epoch": 1.086326641345632, + "grad_norm": 4.077748775482178, + "learning_rate": 7.209949294230506e-06, + "loss": 0.2109, + "step": 21110 + }, + { + "epoch": 1.086340206185567, + "grad_norm": 4.221746444702148, + "learning_rate": 7.209812251610251e-06, + "loss": 0.2131, + "step": 21111 + }, + { + "epoch": 1.086353771025502, + "grad_norm": 4.697456359863281, + "learning_rate": 7.2096752089899965e-06, + "loss": 0.244, + "step": 21112 + }, + { + "epoch": 1.0863673358654369, + "grad_norm": 5.990920543670654, + "learning_rate": 7.209538166369742e-06, + "loss": 0.2973, + "step": 21113 + }, + { + "epoch": 1.0863809007053717, + "grad_norm": 6.968891143798828, + "learning_rate": 7.209401123749487e-06, + "loss": 0.3788, + "step": 21114 + }, + { + "epoch": 1.0863944655453066, + "grad_norm": 6.837550163269043, + "learning_rate": 7.209264081129231e-06, + "loss": 0.2905, + "step": 21115 + }, + { + "epoch": 1.0864080303852415, + "grad_norm": 4.637333869934082, + "learning_rate": 7.209127038508977e-06, + "loss": 0.212, + "step": 21116 + }, + { + "epoch": 1.0864215952251763, + "grad_norm": 4.299849033355713, + "learning_rate": 7.208989995888722e-06, + "loss": 0.2378, + "step": 21117 + }, + { + "epoch": 1.0864351600651112, + "grad_norm": 7.777639865875244, + "learning_rate": 7.208852953268467e-06, + "loss": 0.5891, + "step": 21118 + }, + { + "epoch": 1.086448724905046, + "grad_norm": 6.663653373718262, + "learning_rate": 7.208715910648212e-06, + "loss": 0.3606, + "step": 21119 + }, + { + "epoch": 1.086462289744981, + "grad_norm": 6.3538737297058105, + "learning_rate": 7.208578868027958e-06, + "loss": 0.3067, + "step": 21120 + }, + { + "epoch": 1.0864758545849158, + "grad_norm": 5.205070495605469, + "learning_rate": 7.208441825407702e-06, + "loss": 0.2771, + "step": 21121 + }, + { + "epoch": 1.0864894194248509, + "grad_norm": 7.03267765045166, + "learning_rate": 7.2083047827874474e-06, + "loss": 0.3845, + "step": 21122 + }, + { + "epoch": 1.0865029842647858, + "grad_norm": 4.253734111785889, + "learning_rate": 7.208167740167193e-06, + "loss": 0.2558, + "step": 21123 + }, + { + "epoch": 1.0865165491047206, + "grad_norm": 6.034993648529053, + "learning_rate": 7.208030697546937e-06, + "loss": 0.6862, + "step": 21124 + }, + { + "epoch": 1.0865301139446555, + "grad_norm": 7.298794746398926, + "learning_rate": 7.207893654926683e-06, + "loss": 0.409, + "step": 21125 + }, + { + "epoch": 1.0865436787845903, + "grad_norm": 6.673977375030518, + "learning_rate": 7.207756612306428e-06, + "loss": 0.4818, + "step": 21126 + }, + { + "epoch": 1.0865572436245252, + "grad_norm": 5.438507080078125, + "learning_rate": 7.2076195696861725e-06, + "loss": 0.1691, + "step": 21127 + }, + { + "epoch": 1.08657080846446, + "grad_norm": 5.207962989807129, + "learning_rate": 7.207482527065918e-06, + "loss": 0.3365, + "step": 21128 + }, + { + "epoch": 1.086584373304395, + "grad_norm": 8.133466720581055, + "learning_rate": 7.207345484445664e-06, + "loss": 0.4136, + "step": 21129 + }, + { + "epoch": 1.0865979381443298, + "grad_norm": 5.735905170440674, + "learning_rate": 7.207208441825409e-06, + "loss": 0.2639, + "step": 21130 + }, + { + "epoch": 1.086611502984265, + "grad_norm": 5.643258571624756, + "learning_rate": 7.207071399205153e-06, + "loss": 0.3454, + "step": 21131 + }, + { + "epoch": 1.0866250678241998, + "grad_norm": 7.570263385772705, + "learning_rate": 7.206934356584898e-06, + "loss": 0.4629, + "step": 21132 + }, + { + "epoch": 1.0866386326641346, + "grad_norm": 7.661823272705078, + "learning_rate": 7.206797313964644e-06, + "loss": 0.4344, + "step": 21133 + }, + { + "epoch": 1.0866521975040695, + "grad_norm": 6.181204795837402, + "learning_rate": 7.206660271344389e-06, + "loss": 0.3246, + "step": 21134 + }, + { + "epoch": 1.0866657623440044, + "grad_norm": 5.128734588623047, + "learning_rate": 7.206523228724134e-06, + "loss": 0.2104, + "step": 21135 + }, + { + "epoch": 1.0866793271839392, + "grad_norm": 4.310850143432617, + "learning_rate": 7.206386186103878e-06, + "loss": 0.3133, + "step": 21136 + }, + { + "epoch": 1.086692892023874, + "grad_norm": 6.839291095733643, + "learning_rate": 7.206249143483623e-06, + "loss": 0.3833, + "step": 21137 + }, + { + "epoch": 1.086706456863809, + "grad_norm": 5.472560405731201, + "learning_rate": 7.2061121008633694e-06, + "loss": 0.2651, + "step": 21138 + }, + { + "epoch": 1.0867200217037438, + "grad_norm": 6.1139373779296875, + "learning_rate": 7.205975058243115e-06, + "loss": 0.3446, + "step": 21139 + }, + { + "epoch": 1.0867335865436787, + "grad_norm": 6.4092888832092285, + "learning_rate": 7.205838015622859e-06, + "loss": 0.3549, + "step": 21140 + }, + { + "epoch": 1.0867471513836138, + "grad_norm": 8.020947456359863, + "learning_rate": 7.205700973002604e-06, + "loss": 0.4656, + "step": 21141 + }, + { + "epoch": 1.0867607162235486, + "grad_norm": 5.326414108276367, + "learning_rate": 7.20556393038235e-06, + "loss": 0.3867, + "step": 21142 + }, + { + "epoch": 1.0867742810634835, + "grad_norm": 5.694010257720947, + "learning_rate": 7.2054268877620945e-06, + "loss": 0.2876, + "step": 21143 + }, + { + "epoch": 1.0867878459034184, + "grad_norm": 4.411014080047607, + "learning_rate": 7.20528984514184e-06, + "loss": 0.1993, + "step": 21144 + }, + { + "epoch": 1.0868014107433532, + "grad_norm": 5.474823474884033, + "learning_rate": 7.205152802521585e-06, + "loss": 0.2545, + "step": 21145 + }, + { + "epoch": 1.086814975583288, + "grad_norm": 6.2989325523376465, + "learning_rate": 7.20501575990133e-06, + "loss": 0.401, + "step": 21146 + }, + { + "epoch": 1.086828540423223, + "grad_norm": 8.151541709899902, + "learning_rate": 7.204878717281075e-06, + "loss": 0.5798, + "step": 21147 + }, + { + "epoch": 1.0868421052631578, + "grad_norm": 5.112245082855225, + "learning_rate": 7.20474167466082e-06, + "loss": 0.2087, + "step": 21148 + }, + { + "epoch": 1.0868556701030927, + "grad_norm": 5.7545084953308105, + "learning_rate": 7.204604632040565e-06, + "loss": 0.2788, + "step": 21149 + }, + { + "epoch": 1.0868692349430278, + "grad_norm": 6.477449417114258, + "learning_rate": 7.20446758942031e-06, + "loss": 0.3437, + "step": 21150 + }, + { + "epoch": 1.0868827997829626, + "grad_norm": 5.72595739364624, + "learning_rate": 7.204330546800056e-06, + "loss": 0.2846, + "step": 21151 + }, + { + "epoch": 1.0868963646228975, + "grad_norm": 6.329155921936035, + "learning_rate": 7.2041935041798e-06, + "loss": 0.3482, + "step": 21152 + }, + { + "epoch": 1.0869099294628324, + "grad_norm": 6.889890193939209, + "learning_rate": 7.2040564615595454e-06, + "loss": 0.3464, + "step": 21153 + }, + { + "epoch": 1.0869234943027672, + "grad_norm": 4.911589622497559, + "learning_rate": 7.203919418939291e-06, + "loss": 0.2095, + "step": 21154 + }, + { + "epoch": 1.0869370591427021, + "grad_norm": 10.028682708740234, + "learning_rate": 7.203782376319036e-06, + "loss": 0.4588, + "step": 21155 + }, + { + "epoch": 1.086950623982637, + "grad_norm": 4.7869553565979, + "learning_rate": 7.203645333698781e-06, + "loss": 0.3087, + "step": 21156 + }, + { + "epoch": 1.0869641888225718, + "grad_norm": 5.38815450668335, + "learning_rate": 7.203508291078526e-06, + "loss": 0.2281, + "step": 21157 + }, + { + "epoch": 1.0869777536625067, + "grad_norm": 5.847933292388916, + "learning_rate": 7.2033712484582705e-06, + "loss": 0.4765, + "step": 21158 + }, + { + "epoch": 1.0869913185024416, + "grad_norm": 4.613870620727539, + "learning_rate": 7.2032342058380165e-06, + "loss": 0.2568, + "step": 21159 + }, + { + "epoch": 1.0870048833423767, + "grad_norm": 5.554319858551025, + "learning_rate": 7.203097163217762e-06, + "loss": 0.2529, + "step": 21160 + }, + { + "epoch": 1.0870184481823115, + "grad_norm": 4.846285343170166, + "learning_rate": 7.202960120597506e-06, + "loss": 0.2235, + "step": 21161 + }, + { + "epoch": 1.0870320130222464, + "grad_norm": 4.199699401855469, + "learning_rate": 7.202823077977251e-06, + "loss": 0.2188, + "step": 21162 + }, + { + "epoch": 1.0870455778621813, + "grad_norm": 6.596773147583008, + "learning_rate": 7.202686035356996e-06, + "loss": 0.3968, + "step": 21163 + }, + { + "epoch": 1.0870591427021161, + "grad_norm": 6.94610595703125, + "learning_rate": 7.202548992736742e-06, + "loss": 0.3295, + "step": 21164 + }, + { + "epoch": 1.087072707542051, + "grad_norm": 6.723727703094482, + "learning_rate": 7.202411950116487e-06, + "loss": 0.3627, + "step": 21165 + }, + { + "epoch": 1.0870862723819859, + "grad_norm": 5.102255344390869, + "learning_rate": 7.202274907496232e-06, + "loss": 0.2784, + "step": 21166 + }, + { + "epoch": 1.0870998372219207, + "grad_norm": 6.199583530426025, + "learning_rate": 7.202137864875976e-06, + "loss": 0.3037, + "step": 21167 + }, + { + "epoch": 1.0871134020618556, + "grad_norm": 5.2116594314575195, + "learning_rate": 7.202000822255722e-06, + "loss": 0.3248, + "step": 21168 + }, + { + "epoch": 1.0871269669017907, + "grad_norm": 4.464649677276611, + "learning_rate": 7.2018637796354674e-06, + "loss": 0.3503, + "step": 21169 + }, + { + "epoch": 1.0871405317417255, + "grad_norm": 6.833108425140381, + "learning_rate": 7.201726737015212e-06, + "loss": 0.3042, + "step": 21170 + }, + { + "epoch": 1.0871540965816604, + "grad_norm": 4.454486846923828, + "learning_rate": 7.201589694394957e-06, + "loss": 0.3334, + "step": 21171 + }, + { + "epoch": 1.0871676614215953, + "grad_norm": 6.229340076446533, + "learning_rate": 7.201452651774703e-06, + "loss": 0.4447, + "step": 21172 + }, + { + "epoch": 1.0871812262615301, + "grad_norm": 5.9884352684021, + "learning_rate": 7.201315609154448e-06, + "loss": 0.2221, + "step": 21173 + }, + { + "epoch": 1.087194791101465, + "grad_norm": 6.671745300292969, + "learning_rate": 7.2011785665341925e-06, + "loss": 0.3526, + "step": 21174 + }, + { + "epoch": 1.0872083559413999, + "grad_norm": 4.820106506347656, + "learning_rate": 7.201041523913938e-06, + "loss": 0.303, + "step": 21175 + }, + { + "epoch": 1.0872219207813347, + "grad_norm": 5.395566940307617, + "learning_rate": 7.200904481293684e-06, + "loss": 0.5547, + "step": 21176 + }, + { + "epoch": 1.0872354856212696, + "grad_norm": 6.509768962860107, + "learning_rate": 7.200767438673428e-06, + "loss": 0.4079, + "step": 21177 + }, + { + "epoch": 1.0872490504612045, + "grad_norm": 4.600085735321045, + "learning_rate": 7.200630396053173e-06, + "loss": 0.2011, + "step": 21178 + }, + { + "epoch": 1.0872626153011395, + "grad_norm": 6.077058792114258, + "learning_rate": 7.200493353432918e-06, + "loss": 0.3649, + "step": 21179 + }, + { + "epoch": 1.0872761801410744, + "grad_norm": 4.567733287811279, + "learning_rate": 7.200356310812663e-06, + "loss": 0.338, + "step": 21180 + }, + { + "epoch": 1.0872897449810093, + "grad_norm": 4.977332592010498, + "learning_rate": 7.200219268192409e-06, + "loss": 0.3041, + "step": 21181 + }, + { + "epoch": 1.0873033098209441, + "grad_norm": 5.377434253692627, + "learning_rate": 7.200082225572154e-06, + "loss": 0.2857, + "step": 21182 + }, + { + "epoch": 1.087316874660879, + "grad_norm": 5.733633995056152, + "learning_rate": 7.199945182951898e-06, + "loss": 0.2718, + "step": 21183 + }, + { + "epoch": 1.0873304395008139, + "grad_norm": 5.70534086227417, + "learning_rate": 7.1998081403316434e-06, + "loss": 0.2794, + "step": 21184 + }, + { + "epoch": 1.0873440043407487, + "grad_norm": 4.868008613586426, + "learning_rate": 7.1996710977113895e-06, + "loss": 0.2953, + "step": 21185 + }, + { + "epoch": 1.0873575691806836, + "grad_norm": 5.513195037841797, + "learning_rate": 7.199534055091134e-06, + "loss": 0.3403, + "step": 21186 + }, + { + "epoch": 1.0873711340206185, + "grad_norm": 5.574132919311523, + "learning_rate": 7.199397012470879e-06, + "loss": 0.3193, + "step": 21187 + }, + { + "epoch": 1.0873846988605536, + "grad_norm": 3.7188539505004883, + "learning_rate": 7.199259969850624e-06, + "loss": 0.1886, + "step": 21188 + }, + { + "epoch": 1.0873982637004884, + "grad_norm": 4.097498893737793, + "learning_rate": 7.19912292723037e-06, + "loss": 0.1598, + "step": 21189 + }, + { + "epoch": 1.0874118285404233, + "grad_norm": 3.2753002643585205, + "learning_rate": 7.1989858846101145e-06, + "loss": 0.1835, + "step": 21190 + }, + { + "epoch": 1.0874253933803582, + "grad_norm": 5.152723789215088, + "learning_rate": 7.19884884198986e-06, + "loss": 0.25, + "step": 21191 + }, + { + "epoch": 1.087438958220293, + "grad_norm": 4.729091167449951, + "learning_rate": 7.198711799369604e-06, + "loss": 0.2478, + "step": 21192 + }, + { + "epoch": 1.0874525230602279, + "grad_norm": 4.220375061035156, + "learning_rate": 7.198574756749349e-06, + "loss": 0.2123, + "step": 21193 + }, + { + "epoch": 1.0874660879001627, + "grad_norm": 5.081284523010254, + "learning_rate": 7.198437714129095e-06, + "loss": 0.2361, + "step": 21194 + }, + { + "epoch": 1.0874796527400976, + "grad_norm": 5.512348651885986, + "learning_rate": 7.1983006715088396e-06, + "loss": 0.3072, + "step": 21195 + }, + { + "epoch": 1.0874932175800325, + "grad_norm": 3.6480793952941895, + "learning_rate": 7.198163628888585e-06, + "loss": 0.2418, + "step": 21196 + }, + { + "epoch": 1.0875067824199673, + "grad_norm": 4.387537002563477, + "learning_rate": 7.19802658626833e-06, + "loss": 0.1798, + "step": 21197 + }, + { + "epoch": 1.0875203472599024, + "grad_norm": 4.042046546936035, + "learning_rate": 7.197889543648076e-06, + "loss": 0.1976, + "step": 21198 + }, + { + "epoch": 1.0875339120998373, + "grad_norm": 5.644815921783447, + "learning_rate": 7.19775250102782e-06, + "loss": 0.2837, + "step": 21199 + }, + { + "epoch": 1.0875474769397722, + "grad_norm": 5.039129734039307, + "learning_rate": 7.1976154584075654e-06, + "loss": 0.2726, + "step": 21200 + }, + { + "epoch": 1.087561041779707, + "grad_norm": 3.6747047901153564, + "learning_rate": 7.19747841578731e-06, + "loss": 0.196, + "step": 21201 + }, + { + "epoch": 1.087574606619642, + "grad_norm": 4.55695915222168, + "learning_rate": 7.197341373167056e-06, + "loss": 0.3771, + "step": 21202 + }, + { + "epoch": 1.0875881714595768, + "grad_norm": 4.58982515335083, + "learning_rate": 7.197204330546801e-06, + "loss": 0.3578, + "step": 21203 + }, + { + "epoch": 1.0876017362995116, + "grad_norm": 3.688810110092163, + "learning_rate": 7.197067287926546e-06, + "loss": 0.1862, + "step": 21204 + }, + { + "epoch": 1.0876153011394465, + "grad_norm": 3.91459584236145, + "learning_rate": 7.1969302453062905e-06, + "loss": 0.1669, + "step": 21205 + }, + { + "epoch": 1.0876288659793814, + "grad_norm": 4.99177885055542, + "learning_rate": 7.196793202686036e-06, + "loss": 0.416, + "step": 21206 + }, + { + "epoch": 1.0876424308193164, + "grad_norm": 5.365012168884277, + "learning_rate": 7.196656160065782e-06, + "loss": 0.2914, + "step": 21207 + }, + { + "epoch": 1.0876559956592513, + "grad_norm": 4.356309413909912, + "learning_rate": 7.196519117445526e-06, + "loss": 0.2164, + "step": 21208 + }, + { + "epoch": 1.0876695604991862, + "grad_norm": 4.221670627593994, + "learning_rate": 7.196382074825271e-06, + "loss": 0.1642, + "step": 21209 + }, + { + "epoch": 1.087683125339121, + "grad_norm": 7.01273775100708, + "learning_rate": 7.1962450322050155e-06, + "loss": 0.4164, + "step": 21210 + }, + { + "epoch": 1.087696690179056, + "grad_norm": 6.660276889801025, + "learning_rate": 7.1961079895847616e-06, + "loss": 0.2678, + "step": 21211 + }, + { + "epoch": 1.0877102550189908, + "grad_norm": 5.861652851104736, + "learning_rate": 7.195970946964507e-06, + "loss": 0.2552, + "step": 21212 + }, + { + "epoch": 1.0877238198589256, + "grad_norm": 4.360105991363525, + "learning_rate": 7.195833904344252e-06, + "loss": 0.2988, + "step": 21213 + }, + { + "epoch": 1.0877373846988605, + "grad_norm": 5.6580891609191895, + "learning_rate": 7.195696861723996e-06, + "loss": 0.2545, + "step": 21214 + }, + { + "epoch": 1.0877509495387954, + "grad_norm": 5.155369281768799, + "learning_rate": 7.195559819103742e-06, + "loss": 0.2646, + "step": 21215 + }, + { + "epoch": 1.0877645143787302, + "grad_norm": 6.1335649490356445, + "learning_rate": 7.1954227764834875e-06, + "loss": 0.404, + "step": 21216 + }, + { + "epoch": 1.0877780792186653, + "grad_norm": 3.450620651245117, + "learning_rate": 7.195285733863232e-06, + "loss": 0.1977, + "step": 21217 + }, + { + "epoch": 1.0877916440586002, + "grad_norm": 5.5235981941223145, + "learning_rate": 7.195148691242977e-06, + "loss": 0.3824, + "step": 21218 + }, + { + "epoch": 1.087805208898535, + "grad_norm": 6.738949298858643, + "learning_rate": 7.195011648622721e-06, + "loss": 0.3279, + "step": 21219 + }, + { + "epoch": 1.08781877373847, + "grad_norm": 6.127615451812744, + "learning_rate": 7.194874606002467e-06, + "loss": 0.4473, + "step": 21220 + }, + { + "epoch": 1.0878323385784048, + "grad_norm": 4.86944055557251, + "learning_rate": 7.1947375633822125e-06, + "loss": 0.2354, + "step": 21221 + }, + { + "epoch": 1.0878459034183396, + "grad_norm": 4.264151096343994, + "learning_rate": 7.194600520761958e-06, + "loss": 0.2888, + "step": 21222 + }, + { + "epoch": 1.0878594682582745, + "grad_norm": 7.694788455963135, + "learning_rate": 7.194463478141702e-06, + "loss": 0.3679, + "step": 21223 + }, + { + "epoch": 1.0878730330982094, + "grad_norm": 4.379118919372559, + "learning_rate": 7.194326435521448e-06, + "loss": 0.2707, + "step": 21224 + }, + { + "epoch": 1.0878865979381442, + "grad_norm": 5.208324432373047, + "learning_rate": 7.194189392901193e-06, + "loss": 0.242, + "step": 21225 + }, + { + "epoch": 1.0879001627780793, + "grad_norm": 5.214879989624023, + "learning_rate": 7.1940523502809376e-06, + "loss": 0.3346, + "step": 21226 + }, + { + "epoch": 1.0879137276180142, + "grad_norm": 4.526856899261475, + "learning_rate": 7.193915307660683e-06, + "loss": 0.2505, + "step": 21227 + }, + { + "epoch": 1.087927292457949, + "grad_norm": 4.901917934417725, + "learning_rate": 7.193778265040429e-06, + "loss": 0.3648, + "step": 21228 + }, + { + "epoch": 1.087940857297884, + "grad_norm": 5.829356670379639, + "learning_rate": 7.193641222420173e-06, + "loss": 0.3139, + "step": 21229 + }, + { + "epoch": 1.0879544221378188, + "grad_norm": 5.248347282409668, + "learning_rate": 7.193504179799918e-06, + "loss": 0.2824, + "step": 21230 + }, + { + "epoch": 1.0879679869777537, + "grad_norm": 6.971297264099121, + "learning_rate": 7.1933671371796635e-06, + "loss": 0.3014, + "step": 21231 + }, + { + "epoch": 1.0879815518176885, + "grad_norm": 6.384610652923584, + "learning_rate": 7.193230094559408e-06, + "loss": 0.3365, + "step": 21232 + }, + { + "epoch": 1.0879951166576234, + "grad_norm": 6.183619022369385, + "learning_rate": 7.193093051939154e-06, + "loss": 0.4362, + "step": 21233 + }, + { + "epoch": 1.0880086814975582, + "grad_norm": 4.66755485534668, + "learning_rate": 7.192956009318899e-06, + "loss": 0.2872, + "step": 21234 + }, + { + "epoch": 1.0880222463374931, + "grad_norm": 5.3942131996154785, + "learning_rate": 7.192818966698643e-06, + "loss": 0.3285, + "step": 21235 + }, + { + "epoch": 1.0880358111774282, + "grad_norm": 6.502737998962402, + "learning_rate": 7.1926819240783885e-06, + "loss": 0.3185, + "step": 21236 + }, + { + "epoch": 1.088049376017363, + "grad_norm": 4.796534061431885, + "learning_rate": 7.1925448814581345e-06, + "loss": 0.2179, + "step": 21237 + }, + { + "epoch": 1.088062940857298, + "grad_norm": 7.277018070220947, + "learning_rate": 7.19240783883788e-06, + "loss": 0.2329, + "step": 21238 + }, + { + "epoch": 1.0880765056972328, + "grad_norm": 5.949092388153076, + "learning_rate": 7.192270796217624e-06, + "loss": 0.4102, + "step": 21239 + }, + { + "epoch": 1.0880900705371677, + "grad_norm": 7.244363307952881, + "learning_rate": 7.192133753597369e-06, + "loss": 0.4287, + "step": 21240 + }, + { + "epoch": 1.0881036353771025, + "grad_norm": 4.744899749755859, + "learning_rate": 7.191996710977115e-06, + "loss": 0.2643, + "step": 21241 + }, + { + "epoch": 1.0881172002170374, + "grad_norm": 4.402822494506836, + "learning_rate": 7.1918596683568596e-06, + "loss": 0.2063, + "step": 21242 + }, + { + "epoch": 1.0881307650569723, + "grad_norm": 6.289263725280762, + "learning_rate": 7.191722625736605e-06, + "loss": 0.4485, + "step": 21243 + }, + { + "epoch": 1.0881443298969071, + "grad_norm": 6.038023471832275, + "learning_rate": 7.191585583116349e-06, + "loss": 0.3828, + "step": 21244 + }, + { + "epoch": 1.0881578947368422, + "grad_norm": 5.147168159484863, + "learning_rate": 7.191448540496095e-06, + "loss": 0.224, + "step": 21245 + }, + { + "epoch": 1.088171459576777, + "grad_norm": 3.460996150970459, + "learning_rate": 7.19131149787584e-06, + "loss": 0.2122, + "step": 21246 + }, + { + "epoch": 1.088185024416712, + "grad_norm": 4.815566062927246, + "learning_rate": 7.1911744552555855e-06, + "loss": 0.2454, + "step": 21247 + }, + { + "epoch": 1.0881985892566468, + "grad_norm": 4.202095031738281, + "learning_rate": 7.19103741263533e-06, + "loss": 0.2712, + "step": 21248 + }, + { + "epoch": 1.0882121540965817, + "grad_norm": 4.2966389656066895, + "learning_rate": 7.190900370015075e-06, + "loss": 0.3135, + "step": 21249 + }, + { + "epoch": 1.0882257189365165, + "grad_norm": 7.499993324279785, + "learning_rate": 7.190763327394821e-06, + "loss": 0.3863, + "step": 21250 + }, + { + "epoch": 1.0882392837764514, + "grad_norm": 5.13712215423584, + "learning_rate": 7.190626284774565e-06, + "loss": 0.2751, + "step": 21251 + }, + { + "epoch": 1.0882528486163863, + "grad_norm": 5.672732353210449, + "learning_rate": 7.1904892421543105e-06, + "loss": 0.3423, + "step": 21252 + }, + { + "epoch": 1.0882664134563211, + "grad_norm": 4.591477394104004, + "learning_rate": 7.190352199534056e-06, + "loss": 0.2106, + "step": 21253 + }, + { + "epoch": 1.088279978296256, + "grad_norm": 5.620006084442139, + "learning_rate": 7.190215156913801e-06, + "loss": 0.2063, + "step": 21254 + }, + { + "epoch": 1.088293543136191, + "grad_norm": 4.8974609375, + "learning_rate": 7.190078114293546e-06, + "loss": 0.2687, + "step": 21255 + }, + { + "epoch": 1.088307107976126, + "grad_norm": 6.1841020584106445, + "learning_rate": 7.189941071673291e-06, + "loss": 0.3338, + "step": 21256 + }, + { + "epoch": 1.0883206728160608, + "grad_norm": 5.143790245056152, + "learning_rate": 7.1898040290530356e-06, + "loss": 0.2904, + "step": 21257 + }, + { + "epoch": 1.0883342376559957, + "grad_norm": 5.304933071136475, + "learning_rate": 7.189666986432782e-06, + "loss": 0.2699, + "step": 21258 + }, + { + "epoch": 1.0883478024959305, + "grad_norm": 5.658135890960693, + "learning_rate": 7.189529943812527e-06, + "loss": 0.2167, + "step": 21259 + }, + { + "epoch": 1.0883613673358654, + "grad_norm": 5.027367115020752, + "learning_rate": 7.189392901192271e-06, + "loss": 0.3223, + "step": 21260 + }, + { + "epoch": 1.0883749321758003, + "grad_norm": 5.174301624298096, + "learning_rate": 7.189255858572016e-06, + "loss": 0.2259, + "step": 21261 + }, + { + "epoch": 1.0883884970157351, + "grad_norm": 3.8327016830444336, + "learning_rate": 7.1891188159517615e-06, + "loss": 0.1842, + "step": 21262 + }, + { + "epoch": 1.08840206185567, + "grad_norm": 5.031716346740723, + "learning_rate": 7.188981773331507e-06, + "loss": 0.268, + "step": 21263 + }, + { + "epoch": 1.088415626695605, + "grad_norm": 5.519753456115723, + "learning_rate": 7.188844730711252e-06, + "loss": 0.2411, + "step": 21264 + }, + { + "epoch": 1.08842919153554, + "grad_norm": 5.303037643432617, + "learning_rate": 7.188707688090997e-06, + "loss": 0.2621, + "step": 21265 + }, + { + "epoch": 1.0884427563754748, + "grad_norm": 4.146564960479736, + "learning_rate": 7.188570645470741e-06, + "loss": 0.217, + "step": 21266 + }, + { + "epoch": 1.0884563212154097, + "grad_norm": 3.9929938316345215, + "learning_rate": 7.188433602850487e-06, + "loss": 0.1413, + "step": 21267 + }, + { + "epoch": 1.0884698860553446, + "grad_norm": 5.141767501831055, + "learning_rate": 7.1882965602302325e-06, + "loss": 0.2583, + "step": 21268 + }, + { + "epoch": 1.0884834508952794, + "grad_norm": 5.30571174621582, + "learning_rate": 7.188159517609977e-06, + "loss": 0.269, + "step": 21269 + }, + { + "epoch": 1.0884970157352143, + "grad_norm": 4.1545209884643555, + "learning_rate": 7.188022474989722e-06, + "loss": 0.1795, + "step": 21270 + }, + { + "epoch": 1.0885105805751492, + "grad_norm": 10.286075592041016, + "learning_rate": 7.187885432369468e-06, + "loss": 0.3071, + "step": 21271 + }, + { + "epoch": 1.088524145415084, + "grad_norm": 4.8994574546813965, + "learning_rate": 7.187748389749213e-06, + "loss": 0.3224, + "step": 21272 + }, + { + "epoch": 1.0885377102550189, + "grad_norm": 4.883784770965576, + "learning_rate": 7.187611347128958e-06, + "loss": 0.2488, + "step": 21273 + }, + { + "epoch": 1.088551275094954, + "grad_norm": 4.611770153045654, + "learning_rate": 7.187474304508703e-06, + "loss": 0.2921, + "step": 21274 + }, + { + "epoch": 1.0885648399348888, + "grad_norm": 5.3091206550598145, + "learning_rate": 7.187337261888447e-06, + "loss": 0.4039, + "step": 21275 + }, + { + "epoch": 1.0885784047748237, + "grad_norm": 9.033219337463379, + "learning_rate": 7.187200219268193e-06, + "loss": 0.3602, + "step": 21276 + }, + { + "epoch": 1.0885919696147586, + "grad_norm": 6.806333541870117, + "learning_rate": 7.187063176647938e-06, + "loss": 0.2289, + "step": 21277 + }, + { + "epoch": 1.0886055344546934, + "grad_norm": 4.1069464683532715, + "learning_rate": 7.186926134027683e-06, + "loss": 0.1612, + "step": 21278 + }, + { + "epoch": 1.0886190992946283, + "grad_norm": 6.193839073181152, + "learning_rate": 7.186789091407428e-06, + "loss": 0.2256, + "step": 21279 + }, + { + "epoch": 1.0886326641345632, + "grad_norm": 5.480370998382568, + "learning_rate": 7.186652048787174e-06, + "loss": 0.2482, + "step": 21280 + }, + { + "epoch": 1.088646228974498, + "grad_norm": 4.924530029296875, + "learning_rate": 7.186515006166919e-06, + "loss": 0.2039, + "step": 21281 + }, + { + "epoch": 1.088659793814433, + "grad_norm": 5.301053524017334, + "learning_rate": 7.186377963546663e-06, + "loss": 0.2472, + "step": 21282 + }, + { + "epoch": 1.088673358654368, + "grad_norm": 5.1106696128845215, + "learning_rate": 7.1862409209264085e-06, + "loss": 0.2376, + "step": 21283 + }, + { + "epoch": 1.0886869234943028, + "grad_norm": 5.408304214477539, + "learning_rate": 7.1861038783061545e-06, + "loss": 0.2284, + "step": 21284 + }, + { + "epoch": 1.0887004883342377, + "grad_norm": 5.425890922546387, + "learning_rate": 7.185966835685899e-06, + "loss": 0.2991, + "step": 21285 + }, + { + "epoch": 1.0887140531741726, + "grad_norm": 5.561683654785156, + "learning_rate": 7.185829793065644e-06, + "loss": 0.1897, + "step": 21286 + }, + { + "epoch": 1.0887276180141074, + "grad_norm": 4.505679130554199, + "learning_rate": 7.185692750445389e-06, + "loss": 0.2326, + "step": 21287 + }, + { + "epoch": 1.0887411828540423, + "grad_norm": 5.862009525299072, + "learning_rate": 7.1855557078251336e-06, + "loss": 0.2701, + "step": 21288 + }, + { + "epoch": 1.0887547476939772, + "grad_norm": 5.197031497955322, + "learning_rate": 7.18541866520488e-06, + "loss": 0.195, + "step": 21289 + }, + { + "epoch": 1.088768312533912, + "grad_norm": 3.956925630569458, + "learning_rate": 7.185281622584625e-06, + "loss": 0.1913, + "step": 21290 + }, + { + "epoch": 1.088781877373847, + "grad_norm": 4.976151943206787, + "learning_rate": 7.185144579964369e-06, + "loss": 0.2615, + "step": 21291 + }, + { + "epoch": 1.0887954422137818, + "grad_norm": 4.987800598144531, + "learning_rate": 7.185007537344114e-06, + "loss": 0.1946, + "step": 21292 + }, + { + "epoch": 1.0888090070537169, + "grad_norm": 3.848799467086792, + "learning_rate": 7.18487049472386e-06, + "loss": 0.211, + "step": 21293 + }, + { + "epoch": 1.0888225718936517, + "grad_norm": 5.364124774932861, + "learning_rate": 7.184733452103605e-06, + "loss": 0.3462, + "step": 21294 + }, + { + "epoch": 1.0888361367335866, + "grad_norm": 3.9448611736297607, + "learning_rate": 7.18459640948335e-06, + "loss": 0.2323, + "step": 21295 + }, + { + "epoch": 1.0888497015735215, + "grad_norm": 2.832460880279541, + "learning_rate": 7.184459366863095e-06, + "loss": 0.1878, + "step": 21296 + }, + { + "epoch": 1.0888632664134563, + "grad_norm": 6.293513774871826, + "learning_rate": 7.18432232424284e-06, + "loss": 0.3111, + "step": 21297 + }, + { + "epoch": 1.0888768312533912, + "grad_norm": 3.1830809116363525, + "learning_rate": 7.184185281622585e-06, + "loss": 0.1242, + "step": 21298 + }, + { + "epoch": 1.088890396093326, + "grad_norm": 3.894465446472168, + "learning_rate": 7.1840482390023305e-06, + "loss": 0.1662, + "step": 21299 + }, + { + "epoch": 1.088903960933261, + "grad_norm": 4.349307537078857, + "learning_rate": 7.183911196382075e-06, + "loss": 0.2562, + "step": 21300 + }, + { + "epoch": 1.0889175257731958, + "grad_norm": 3.5038018226623535, + "learning_rate": 7.18377415376182e-06, + "loss": 0.157, + "step": 21301 + }, + { + "epoch": 1.0889310906131309, + "grad_norm": 4.732187271118164, + "learning_rate": 7.183637111141566e-06, + "loss": 0.2767, + "step": 21302 + }, + { + "epoch": 1.0889446554530657, + "grad_norm": 4.433006763458252, + "learning_rate": 7.18350006852131e-06, + "loss": 0.2083, + "step": 21303 + }, + { + "epoch": 1.0889582202930006, + "grad_norm": 5.85750150680542, + "learning_rate": 7.183363025901056e-06, + "loss": 0.3999, + "step": 21304 + }, + { + "epoch": 1.0889717851329355, + "grad_norm": 5.163764953613281, + "learning_rate": 7.183225983280801e-06, + "loss": 0.2708, + "step": 21305 + }, + { + "epoch": 1.0889853499728703, + "grad_norm": 6.187337875366211, + "learning_rate": 7.183088940660547e-06, + "loss": 0.3752, + "step": 21306 + }, + { + "epoch": 1.0889989148128052, + "grad_norm": 4.933358669281006, + "learning_rate": 7.182951898040291e-06, + "loss": 0.2446, + "step": 21307 + }, + { + "epoch": 1.08901247965274, + "grad_norm": 5.519754409790039, + "learning_rate": 7.182814855420036e-06, + "loss": 0.3071, + "step": 21308 + }, + { + "epoch": 1.089026044492675, + "grad_norm": 6.363369464874268, + "learning_rate": 7.182677812799781e-06, + "loss": 0.3802, + "step": 21309 + }, + { + "epoch": 1.0890396093326098, + "grad_norm": 5.534015655517578, + "learning_rate": 7.182540770179527e-06, + "loss": 0.2241, + "step": 21310 + }, + { + "epoch": 1.0890531741725447, + "grad_norm": 4.641286373138428, + "learning_rate": 7.182403727559272e-06, + "loss": 0.2009, + "step": 21311 + }, + { + "epoch": 1.0890667390124797, + "grad_norm": 6.239930629730225, + "learning_rate": 7.182266684939016e-06, + "loss": 0.3094, + "step": 21312 + }, + { + "epoch": 1.0890803038524146, + "grad_norm": 5.378902912139893, + "learning_rate": 7.182129642318761e-06, + "loss": 0.2337, + "step": 21313 + }, + { + "epoch": 1.0890938686923495, + "grad_norm": 4.846951961517334, + "learning_rate": 7.181992599698507e-06, + "loss": 0.3482, + "step": 21314 + }, + { + "epoch": 1.0891074335322843, + "grad_norm": 5.116621494293213, + "learning_rate": 7.1818555570782525e-06, + "loss": 0.2507, + "step": 21315 + }, + { + "epoch": 1.0891209983722192, + "grad_norm": 5.790121555328369, + "learning_rate": 7.181718514457997e-06, + "loss": 0.3366, + "step": 21316 + }, + { + "epoch": 1.089134563212154, + "grad_norm": 7.831255912780762, + "learning_rate": 7.181581471837742e-06, + "loss": 0.37, + "step": 21317 + }, + { + "epoch": 1.089148128052089, + "grad_norm": 5.004767417907715, + "learning_rate": 7.181444429217486e-06, + "loss": 0.2153, + "step": 21318 + }, + { + "epoch": 1.0891616928920238, + "grad_norm": 4.791175842285156, + "learning_rate": 7.181307386597232e-06, + "loss": 0.2717, + "step": 21319 + }, + { + "epoch": 1.0891752577319587, + "grad_norm": 4.470953464508057, + "learning_rate": 7.181170343976978e-06, + "loss": 0.1856, + "step": 21320 + }, + { + "epoch": 1.0891888225718938, + "grad_norm": 6.121408462524414, + "learning_rate": 7.181033301356723e-06, + "loss": 0.2387, + "step": 21321 + }, + { + "epoch": 1.0892023874118286, + "grad_norm": 3.3671979904174805, + "learning_rate": 7.180896258736467e-06, + "loss": 0.1562, + "step": 21322 + }, + { + "epoch": 1.0892159522517635, + "grad_norm": 5.207481861114502, + "learning_rate": 7.180759216116213e-06, + "loss": 0.1643, + "step": 21323 + }, + { + "epoch": 1.0892295170916984, + "grad_norm": 3.6200690269470215, + "learning_rate": 7.180622173495958e-06, + "loss": 0.2076, + "step": 21324 + }, + { + "epoch": 1.0892430819316332, + "grad_norm": 6.432966709136963, + "learning_rate": 7.180485130875703e-06, + "loss": 0.3546, + "step": 21325 + }, + { + "epoch": 1.089256646771568, + "grad_norm": 4.188347339630127, + "learning_rate": 7.180348088255448e-06, + "loss": 0.2488, + "step": 21326 + }, + { + "epoch": 1.089270211611503, + "grad_norm": 7.421081066131592, + "learning_rate": 7.180211045635194e-06, + "loss": 0.2626, + "step": 21327 + }, + { + "epoch": 1.0892837764514378, + "grad_norm": 6.125565052032471, + "learning_rate": 7.180074003014938e-06, + "loss": 0.269, + "step": 21328 + }, + { + "epoch": 1.0892973412913727, + "grad_norm": 4.716238021850586, + "learning_rate": 7.179936960394683e-06, + "loss": 0.2257, + "step": 21329 + }, + { + "epoch": 1.0893109061313075, + "grad_norm": 4.395573616027832, + "learning_rate": 7.1797999177744285e-06, + "loss": 0.2094, + "step": 21330 + }, + { + "epoch": 1.0893244709712426, + "grad_norm": 5.1545538902282715, + "learning_rate": 7.179662875154173e-06, + "loss": 0.2312, + "step": 21331 + }, + { + "epoch": 1.0893380358111775, + "grad_norm": 3.3550658226013184, + "learning_rate": 7.179525832533919e-06, + "loss": 0.1822, + "step": 21332 + }, + { + "epoch": 1.0893516006511124, + "grad_norm": 3.8134777545928955, + "learning_rate": 7.179388789913664e-06, + "loss": 0.2337, + "step": 21333 + }, + { + "epoch": 1.0893651654910472, + "grad_norm": 5.984552383422852, + "learning_rate": 7.179251747293408e-06, + "loss": 0.253, + "step": 21334 + }, + { + "epoch": 1.089378730330982, + "grad_norm": 3.840991735458374, + "learning_rate": 7.179114704673154e-06, + "loss": 0.1688, + "step": 21335 + }, + { + "epoch": 1.089392295170917, + "grad_norm": 5.04596471786499, + "learning_rate": 7.1789776620529e-06, + "loss": 0.2251, + "step": 21336 + }, + { + "epoch": 1.0894058600108518, + "grad_norm": 4.632009506225586, + "learning_rate": 7.178840619432644e-06, + "loss": 0.1498, + "step": 21337 + }, + { + "epoch": 1.0894194248507867, + "grad_norm": 4.252864360809326, + "learning_rate": 7.178703576812389e-06, + "loss": 0.2006, + "step": 21338 + }, + { + "epoch": 1.0894329896907216, + "grad_norm": 4.289855003356934, + "learning_rate": 7.178566534192134e-06, + "loss": 0.1575, + "step": 21339 + }, + { + "epoch": 1.0894465545306566, + "grad_norm": 5.0292887687683105, + "learning_rate": 7.17842949157188e-06, + "loss": 0.2091, + "step": 21340 + }, + { + "epoch": 1.0894601193705915, + "grad_norm": 6.319511890411377, + "learning_rate": 7.178292448951625e-06, + "loss": 0.2403, + "step": 21341 + }, + { + "epoch": 1.0894736842105264, + "grad_norm": 5.496508598327637, + "learning_rate": 7.17815540633137e-06, + "loss": 0.2669, + "step": 21342 + }, + { + "epoch": 1.0894872490504612, + "grad_norm": 3.773836135864258, + "learning_rate": 7.178018363711114e-06, + "loss": 0.1836, + "step": 21343 + }, + { + "epoch": 1.089500813890396, + "grad_norm": 5.5446457862854, + "learning_rate": 7.177881321090859e-06, + "loss": 0.2501, + "step": 21344 + }, + { + "epoch": 1.089514378730331, + "grad_norm": 5.036447048187256, + "learning_rate": 7.177744278470605e-06, + "loss": 0.1704, + "step": 21345 + }, + { + "epoch": 1.0895279435702658, + "grad_norm": 4.4497270584106445, + "learning_rate": 7.1776072358503506e-06, + "loss": 0.2034, + "step": 21346 + }, + { + "epoch": 1.0895415084102007, + "grad_norm": 7.140420913696289, + "learning_rate": 7.177470193230095e-06, + "loss": 0.3189, + "step": 21347 + }, + { + "epoch": 1.0895550732501356, + "grad_norm": 7.4714508056640625, + "learning_rate": 7.17733315060984e-06, + "loss": 0.2355, + "step": 21348 + }, + { + "epoch": 1.0895686380900704, + "grad_norm": 8.092212677001953, + "learning_rate": 7.177196107989586e-06, + "loss": 0.271, + "step": 21349 + }, + { + "epoch": 1.0895822029300055, + "grad_norm": 4.614716053009033, + "learning_rate": 7.17705906536933e-06, + "loss": 0.3028, + "step": 21350 + }, + { + "epoch": 1.0895957677699404, + "grad_norm": 6.20950984954834, + "learning_rate": 7.176922022749076e-06, + "loss": 0.2851, + "step": 21351 + }, + { + "epoch": 1.0896093326098752, + "grad_norm": 4.857307434082031, + "learning_rate": 7.17678498012882e-06, + "loss": 0.2533, + "step": 21352 + }, + { + "epoch": 1.0896228974498101, + "grad_norm": 6.049795150756836, + "learning_rate": 7.176647937508566e-06, + "loss": 0.2887, + "step": 21353 + }, + { + "epoch": 1.089636462289745, + "grad_norm": 9.166814804077148, + "learning_rate": 7.176510894888311e-06, + "loss": 0.292, + "step": 21354 + }, + { + "epoch": 1.0896500271296798, + "grad_norm": 4.132400035858154, + "learning_rate": 7.176373852268056e-06, + "loss": 0.2429, + "step": 21355 + }, + { + "epoch": 1.0896635919696147, + "grad_norm": 5.318286895751953, + "learning_rate": 7.176236809647801e-06, + "loss": 0.288, + "step": 21356 + }, + { + "epoch": 1.0896771568095496, + "grad_norm": 6.522161960601807, + "learning_rate": 7.176099767027546e-06, + "loss": 0.2958, + "step": 21357 + }, + { + "epoch": 1.0896907216494844, + "grad_norm": 4.427718639373779, + "learning_rate": 7.175962724407292e-06, + "loss": 0.1836, + "step": 21358 + }, + { + "epoch": 1.0897042864894195, + "grad_norm": 4.471282482147217, + "learning_rate": 7.175825681787036e-06, + "loss": 0.2068, + "step": 21359 + }, + { + "epoch": 1.0897178513293544, + "grad_norm": 5.226888656616211, + "learning_rate": 7.175688639166781e-06, + "loss": 0.2662, + "step": 21360 + }, + { + "epoch": 1.0897314161692893, + "grad_norm": 4.887517929077148, + "learning_rate": 7.175551596546526e-06, + "loss": 0.2049, + "step": 21361 + }, + { + "epoch": 1.0897449810092241, + "grad_norm": 3.659177780151367, + "learning_rate": 7.175414553926272e-06, + "loss": 0.1771, + "step": 21362 + }, + { + "epoch": 1.089758545849159, + "grad_norm": 3.125110626220703, + "learning_rate": 7.175277511306017e-06, + "loss": 0.107, + "step": 21363 + }, + { + "epoch": 1.0897721106890939, + "grad_norm": 8.136918067932129, + "learning_rate": 7.175140468685762e-06, + "loss": 0.3097, + "step": 21364 + }, + { + "epoch": 1.0897856755290287, + "grad_norm": 6.485019207000732, + "learning_rate": 7.175003426065506e-06, + "loss": 0.2649, + "step": 21365 + }, + { + "epoch": 1.0897992403689636, + "grad_norm": 4.641440391540527, + "learning_rate": 7.1748663834452524e-06, + "loss": 0.2079, + "step": 21366 + }, + { + "epoch": 1.0898128052088984, + "grad_norm": 5.138162612915039, + "learning_rate": 7.174729340824998e-06, + "loss": 0.2315, + "step": 21367 + }, + { + "epoch": 1.0898263700488333, + "grad_norm": 4.853470325469971, + "learning_rate": 7.174592298204742e-06, + "loss": 0.222, + "step": 21368 + }, + { + "epoch": 1.0898399348887684, + "grad_norm": 5.067795753479004, + "learning_rate": 7.174455255584487e-06, + "loss": 0.2275, + "step": 21369 + }, + { + "epoch": 1.0898534997287033, + "grad_norm": 4.707117080688477, + "learning_rate": 7.174318212964232e-06, + "loss": 0.1787, + "step": 21370 + }, + { + "epoch": 1.0898670645686381, + "grad_norm": 4.869078636169434, + "learning_rate": 7.1741811703439775e-06, + "loss": 0.2304, + "step": 21371 + }, + { + "epoch": 1.089880629408573, + "grad_norm": 4.8348307609558105, + "learning_rate": 7.174044127723723e-06, + "loss": 0.2063, + "step": 21372 + }, + { + "epoch": 1.0898941942485079, + "grad_norm": 6.394606590270996, + "learning_rate": 7.173907085103468e-06, + "loss": 0.27, + "step": 21373 + }, + { + "epoch": 1.0899077590884427, + "grad_norm": 6.429757595062256, + "learning_rate": 7.173770042483212e-06, + "loss": 0.2728, + "step": 21374 + }, + { + "epoch": 1.0899213239283776, + "grad_norm": 5.140777111053467, + "learning_rate": 7.173632999862958e-06, + "loss": 0.2978, + "step": 21375 + }, + { + "epoch": 1.0899348887683125, + "grad_norm": 5.106133460998535, + "learning_rate": 7.173495957242703e-06, + "loss": 0.2328, + "step": 21376 + }, + { + "epoch": 1.0899484536082473, + "grad_norm": 6.365884304046631, + "learning_rate": 7.173358914622448e-06, + "loss": 0.2195, + "step": 21377 + }, + { + "epoch": 1.0899620184481824, + "grad_norm": 6.114985942840576, + "learning_rate": 7.173221872002193e-06, + "loss": 0.2568, + "step": 21378 + }, + { + "epoch": 1.0899755832881173, + "grad_norm": 4.475228786468506, + "learning_rate": 7.173084829381939e-06, + "loss": 0.2715, + "step": 21379 + }, + { + "epoch": 1.0899891481280521, + "grad_norm": 4.421509265899658, + "learning_rate": 7.172947786761684e-06, + "loss": 0.1881, + "step": 21380 + }, + { + "epoch": 1.090002712967987, + "grad_norm": 5.809153079986572, + "learning_rate": 7.1728107441414284e-06, + "loss": 0.3373, + "step": 21381 + }, + { + "epoch": 1.0900162778079219, + "grad_norm": 5.711907863616943, + "learning_rate": 7.172673701521174e-06, + "loss": 0.4119, + "step": 21382 + }, + { + "epoch": 1.0900298426478567, + "grad_norm": 7.32990837097168, + "learning_rate": 7.17253665890092e-06, + "loss": 0.3361, + "step": 21383 + }, + { + "epoch": 1.0900434074877916, + "grad_norm": 5.264699459075928, + "learning_rate": 7.172399616280664e-06, + "loss": 0.2492, + "step": 21384 + }, + { + "epoch": 1.0900569723277265, + "grad_norm": 5.6833720207214355, + "learning_rate": 7.172262573660409e-06, + "loss": 0.3377, + "step": 21385 + }, + { + "epoch": 1.0900705371676613, + "grad_norm": 4.678859710693359, + "learning_rate": 7.1721255310401535e-06, + "loss": 0.3003, + "step": 21386 + }, + { + "epoch": 1.0900841020075962, + "grad_norm": 4.165781497955322, + "learning_rate": 7.171988488419899e-06, + "loss": 0.2692, + "step": 21387 + }, + { + "epoch": 1.0900976668475313, + "grad_norm": 7.049841403961182, + "learning_rate": 7.171851445799645e-06, + "loss": 0.5425, + "step": 21388 + }, + { + "epoch": 1.0901112316874662, + "grad_norm": 4.2592363357543945, + "learning_rate": 7.17171440317939e-06, + "loss": 0.1948, + "step": 21389 + }, + { + "epoch": 1.090124796527401, + "grad_norm": 6.762002944946289, + "learning_rate": 7.171577360559134e-06, + "loss": 0.3132, + "step": 21390 + }, + { + "epoch": 1.0901383613673359, + "grad_norm": 4.802319526672363, + "learning_rate": 7.171440317938879e-06, + "loss": 0.1721, + "step": 21391 + }, + { + "epoch": 1.0901519262072707, + "grad_norm": 5.347358226776123, + "learning_rate": 7.171303275318625e-06, + "loss": 0.1415, + "step": 21392 + }, + { + "epoch": 1.0901654910472056, + "grad_norm": 5.961302757263184, + "learning_rate": 7.17116623269837e-06, + "loss": 0.3586, + "step": 21393 + }, + { + "epoch": 1.0901790558871405, + "grad_norm": 5.286764144897461, + "learning_rate": 7.171029190078115e-06, + "loss": 0.2342, + "step": 21394 + }, + { + "epoch": 1.0901926207270753, + "grad_norm": 6.662661552429199, + "learning_rate": 7.17089214745786e-06, + "loss": 0.301, + "step": 21395 + }, + { + "epoch": 1.0902061855670102, + "grad_norm": 6.448458194732666, + "learning_rate": 7.170755104837605e-06, + "loss": 0.3536, + "step": 21396 + }, + { + "epoch": 1.0902197504069453, + "grad_norm": 4.794825553894043, + "learning_rate": 7.1706180622173504e-06, + "loss": 0.1566, + "step": 21397 + }, + { + "epoch": 1.0902333152468802, + "grad_norm": 6.387272834777832, + "learning_rate": 7.170481019597096e-06, + "loss": 0.2858, + "step": 21398 + }, + { + "epoch": 1.090246880086815, + "grad_norm": 4.619758129119873, + "learning_rate": 7.17034397697684e-06, + "loss": 0.1893, + "step": 21399 + }, + { + "epoch": 1.09026044492675, + "grad_norm": 5.120323657989502, + "learning_rate": 7.170206934356585e-06, + "loss": 0.2483, + "step": 21400 + }, + { + "epoch": 1.0902740097666848, + "grad_norm": 5.21051549911499, + "learning_rate": 7.170069891736331e-06, + "loss": 0.2846, + "step": 21401 + }, + { + "epoch": 1.0902875746066196, + "grad_norm": 5.466496467590332, + "learning_rate": 7.1699328491160755e-06, + "loss": 0.2745, + "step": 21402 + }, + { + "epoch": 1.0903011394465545, + "grad_norm": 4.524738311767578, + "learning_rate": 7.169795806495821e-06, + "loss": 0.144, + "step": 21403 + }, + { + "epoch": 1.0903147042864894, + "grad_norm": 4.904040336608887, + "learning_rate": 7.169658763875566e-06, + "loss": 0.1702, + "step": 21404 + }, + { + "epoch": 1.0903282691264242, + "grad_norm": 4.581336975097656, + "learning_rate": 7.169521721255311e-06, + "loss": 0.1706, + "step": 21405 + }, + { + "epoch": 1.090341833966359, + "grad_norm": 5.162102222442627, + "learning_rate": 7.169384678635056e-06, + "loss": 0.1827, + "step": 21406 + }, + { + "epoch": 1.0903553988062942, + "grad_norm": 5.110695838928223, + "learning_rate": 7.169247636014801e-06, + "loss": 0.1912, + "step": 21407 + }, + { + "epoch": 1.090368963646229, + "grad_norm": 4.265084266662598, + "learning_rate": 7.169110593394546e-06, + "loss": 0.1797, + "step": 21408 + }, + { + "epoch": 1.090382528486164, + "grad_norm": 4.395366668701172, + "learning_rate": 7.168973550774292e-06, + "loss": 0.1841, + "step": 21409 + }, + { + "epoch": 1.0903960933260988, + "grad_norm": 4.020928382873535, + "learning_rate": 7.168836508154037e-06, + "loss": 0.1868, + "step": 21410 + }, + { + "epoch": 1.0904096581660336, + "grad_norm": 8.481294631958008, + "learning_rate": 7.168699465533781e-06, + "loss": 0.391, + "step": 21411 + }, + { + "epoch": 1.0904232230059685, + "grad_norm": 5.135838508605957, + "learning_rate": 7.1685624229135264e-06, + "loss": 0.2116, + "step": 21412 + }, + { + "epoch": 1.0904367878459034, + "grad_norm": 4.572552680969238, + "learning_rate": 7.168425380293272e-06, + "loss": 0.1654, + "step": 21413 + }, + { + "epoch": 1.0904503526858382, + "grad_norm": 6.716599941253662, + "learning_rate": 7.168288337673018e-06, + "loss": 0.4854, + "step": 21414 + }, + { + "epoch": 1.090463917525773, + "grad_norm": 5.015392303466797, + "learning_rate": 7.168151295052762e-06, + "loss": 0.2624, + "step": 21415 + }, + { + "epoch": 1.0904774823657082, + "grad_norm": 4.221385955810547, + "learning_rate": 7.168014252432507e-06, + "loss": 0.191, + "step": 21416 + }, + { + "epoch": 1.090491047205643, + "grad_norm": 4.921432018280029, + "learning_rate": 7.1678772098122515e-06, + "loss": 0.2552, + "step": 21417 + }, + { + "epoch": 1.090504612045578, + "grad_norm": 6.110498428344727, + "learning_rate": 7.1677401671919975e-06, + "loss": 0.3163, + "step": 21418 + }, + { + "epoch": 1.0905181768855128, + "grad_norm": 7.86931037902832, + "learning_rate": 7.167603124571743e-06, + "loss": 0.3211, + "step": 21419 + }, + { + "epoch": 1.0905317417254476, + "grad_norm": 5.579112529754639, + "learning_rate": 7.167466081951487e-06, + "loss": 0.2104, + "step": 21420 + }, + { + "epoch": 1.0905453065653825, + "grad_norm": 7.775659561157227, + "learning_rate": 7.167329039331232e-06, + "loss": 0.4153, + "step": 21421 + }, + { + "epoch": 1.0905588714053174, + "grad_norm": 4.561593055725098, + "learning_rate": 7.167191996710978e-06, + "loss": 0.161, + "step": 21422 + }, + { + "epoch": 1.0905724362452522, + "grad_norm": 4.854367733001709, + "learning_rate": 7.167054954090723e-06, + "loss": 0.2822, + "step": 21423 + }, + { + "epoch": 1.090586001085187, + "grad_norm": 6.716507911682129, + "learning_rate": 7.166917911470468e-06, + "loss": 0.4407, + "step": 21424 + }, + { + "epoch": 1.090599565925122, + "grad_norm": 5.054062843322754, + "learning_rate": 7.166780868850213e-06, + "loss": 0.2929, + "step": 21425 + }, + { + "epoch": 1.090613130765057, + "grad_norm": 6.443085670471191, + "learning_rate": 7.166643826229957e-06, + "loss": 0.2899, + "step": 21426 + }, + { + "epoch": 1.090626695604992, + "grad_norm": 5.130117416381836, + "learning_rate": 7.166506783609703e-06, + "loss": 0.2287, + "step": 21427 + }, + { + "epoch": 1.0906402604449268, + "grad_norm": 6.704253196716309, + "learning_rate": 7.1663697409894484e-06, + "loss": 0.2887, + "step": 21428 + }, + { + "epoch": 1.0906538252848617, + "grad_norm": 4.884685039520264, + "learning_rate": 7.166232698369194e-06, + "loss": 0.2681, + "step": 21429 + }, + { + "epoch": 1.0906673901247965, + "grad_norm": 5.984231472015381, + "learning_rate": 7.166095655748938e-06, + "loss": 0.2698, + "step": 21430 + }, + { + "epoch": 1.0906809549647314, + "grad_norm": 7.451918601989746, + "learning_rate": 7.165958613128684e-06, + "loss": 0.3748, + "step": 21431 + }, + { + "epoch": 1.0906945198046663, + "grad_norm": 4.079452037811279, + "learning_rate": 7.165821570508429e-06, + "loss": 0.2085, + "step": 21432 + }, + { + "epoch": 1.0907080846446011, + "grad_norm": 6.122523307800293, + "learning_rate": 7.1656845278881735e-06, + "loss": 0.2954, + "step": 21433 + }, + { + "epoch": 1.090721649484536, + "grad_norm": 7.036617755889893, + "learning_rate": 7.165547485267919e-06, + "loss": 0.3931, + "step": 21434 + }, + { + "epoch": 1.090735214324471, + "grad_norm": 5.848330020904541, + "learning_rate": 7.165410442647665e-06, + "loss": 0.3121, + "step": 21435 + }, + { + "epoch": 1.090748779164406, + "grad_norm": 5.4822869300842285, + "learning_rate": 7.165273400027409e-06, + "loss": 0.2992, + "step": 21436 + }, + { + "epoch": 1.0907623440043408, + "grad_norm": 4.503963470458984, + "learning_rate": 7.165136357407154e-06, + "loss": 0.2468, + "step": 21437 + }, + { + "epoch": 1.0907759088442757, + "grad_norm": 4.973617076873779, + "learning_rate": 7.164999314786899e-06, + "loss": 0.202, + "step": 21438 + }, + { + "epoch": 1.0907894736842105, + "grad_norm": 6.832031726837158, + "learning_rate": 7.164862272166644e-06, + "loss": 0.3535, + "step": 21439 + }, + { + "epoch": 1.0908030385241454, + "grad_norm": 6.514270782470703, + "learning_rate": 7.16472522954639e-06, + "loss": 0.2958, + "step": 21440 + }, + { + "epoch": 1.0908166033640803, + "grad_norm": 7.86079216003418, + "learning_rate": 7.164588186926135e-06, + "loss": 0.4198, + "step": 21441 + }, + { + "epoch": 1.0908301682040151, + "grad_norm": 4.672067165374756, + "learning_rate": 7.164451144305879e-06, + "loss": 0.2426, + "step": 21442 + }, + { + "epoch": 1.09084373304395, + "grad_norm": 6.287774085998535, + "learning_rate": 7.1643141016856244e-06, + "loss": 0.2407, + "step": 21443 + }, + { + "epoch": 1.0908572978838849, + "grad_norm": 4.315743923187256, + "learning_rate": 7.1641770590653705e-06, + "loss": 0.1905, + "step": 21444 + }, + { + "epoch": 1.09087086272382, + "grad_norm": 5.142693042755127, + "learning_rate": 7.164040016445115e-06, + "loss": 0.2694, + "step": 21445 + }, + { + "epoch": 1.0908844275637548, + "grad_norm": 5.197412490844727, + "learning_rate": 7.16390297382486e-06, + "loss": 0.1993, + "step": 21446 + }, + { + "epoch": 1.0908979924036897, + "grad_norm": 4.834805488586426, + "learning_rate": 7.163765931204605e-06, + "loss": 0.2169, + "step": 21447 + }, + { + "epoch": 1.0909115572436245, + "grad_norm": 5.872692108154297, + "learning_rate": 7.163628888584351e-06, + "loss": 0.2518, + "step": 21448 + }, + { + "epoch": 1.0909251220835594, + "grad_norm": 4.924690246582031, + "learning_rate": 7.1634918459640955e-06, + "loss": 0.2242, + "step": 21449 + }, + { + "epoch": 1.0909386869234943, + "grad_norm": 3.6401233673095703, + "learning_rate": 7.163354803343841e-06, + "loss": 0.1039, + "step": 21450 + }, + { + "epoch": 1.0909522517634291, + "grad_norm": 6.304055690765381, + "learning_rate": 7.163217760723585e-06, + "loss": 0.2327, + "step": 21451 + }, + { + "epoch": 1.090965816603364, + "grad_norm": 4.321004390716553, + "learning_rate": 7.163080718103331e-06, + "loss": 0.2348, + "step": 21452 + }, + { + "epoch": 1.0909793814432989, + "grad_norm": 6.45881986618042, + "learning_rate": 7.162943675483076e-06, + "loss": 0.2496, + "step": 21453 + }, + { + "epoch": 1.090992946283234, + "grad_norm": 6.655468463897705, + "learning_rate": 7.1628066328628206e-06, + "loss": 0.3456, + "step": 21454 + }, + { + "epoch": 1.0910065111231688, + "grad_norm": 4.684971809387207, + "learning_rate": 7.162669590242566e-06, + "loss": 0.1999, + "step": 21455 + }, + { + "epoch": 1.0910200759631037, + "grad_norm": 5.413912296295166, + "learning_rate": 7.162532547622311e-06, + "loss": 0.2, + "step": 21456 + }, + { + "epoch": 1.0910336408030386, + "grad_norm": 4.844438076019287, + "learning_rate": 7.162395505002057e-06, + "loss": 0.2964, + "step": 21457 + }, + { + "epoch": 1.0910472056429734, + "grad_norm": 3.8921804428100586, + "learning_rate": 7.162258462381801e-06, + "loss": 0.1824, + "step": 21458 + }, + { + "epoch": 1.0910607704829083, + "grad_norm": 5.507082462310791, + "learning_rate": 7.1621214197615464e-06, + "loss": 0.3097, + "step": 21459 + }, + { + "epoch": 1.0910743353228431, + "grad_norm": 6.087991237640381, + "learning_rate": 7.161984377141291e-06, + "loss": 0.3727, + "step": 21460 + }, + { + "epoch": 1.091087900162778, + "grad_norm": 5.536104679107666, + "learning_rate": 7.161847334521037e-06, + "loss": 0.2504, + "step": 21461 + }, + { + "epoch": 1.0911014650027129, + "grad_norm": 5.085700035095215, + "learning_rate": 7.161710291900782e-06, + "loss": 0.2352, + "step": 21462 + }, + { + "epoch": 1.0911150298426477, + "grad_norm": 5.1998677253723145, + "learning_rate": 7.161573249280527e-06, + "loss": 0.2048, + "step": 21463 + }, + { + "epoch": 1.0911285946825828, + "grad_norm": 6.0842814445495605, + "learning_rate": 7.1614362066602715e-06, + "loss": 0.2347, + "step": 21464 + }, + { + "epoch": 1.0911421595225177, + "grad_norm": 4.944477081298828, + "learning_rate": 7.1612991640400175e-06, + "loss": 0.207, + "step": 21465 + }, + { + "epoch": 1.0911557243624526, + "grad_norm": 4.7641377449035645, + "learning_rate": 7.161162121419763e-06, + "loss": 0.1952, + "step": 21466 + }, + { + "epoch": 1.0911692892023874, + "grad_norm": 4.692043304443359, + "learning_rate": 7.161025078799507e-06, + "loss": 0.1779, + "step": 21467 + }, + { + "epoch": 1.0911828540423223, + "grad_norm": 4.648370265960693, + "learning_rate": 7.160888036179252e-06, + "loss": 0.2999, + "step": 21468 + }, + { + "epoch": 1.0911964188822572, + "grad_norm": 3.814943313598633, + "learning_rate": 7.1607509935589965e-06, + "loss": 0.1303, + "step": 21469 + }, + { + "epoch": 1.091209983722192, + "grad_norm": 3.9739937782287598, + "learning_rate": 7.1606139509387426e-06, + "loss": 0.2231, + "step": 21470 + }, + { + "epoch": 1.0912235485621269, + "grad_norm": 5.3193793296813965, + "learning_rate": 7.160476908318488e-06, + "loss": 0.3087, + "step": 21471 + }, + { + "epoch": 1.0912371134020618, + "grad_norm": 4.479211330413818, + "learning_rate": 7.160339865698233e-06, + "loss": 0.2257, + "step": 21472 + }, + { + "epoch": 1.0912506782419968, + "grad_norm": 3.8289952278137207, + "learning_rate": 7.160202823077977e-06, + "loss": 0.1516, + "step": 21473 + }, + { + "epoch": 1.0912642430819317, + "grad_norm": 4.948535442352295, + "learning_rate": 7.160065780457723e-06, + "loss": 0.2014, + "step": 21474 + }, + { + "epoch": 1.0912778079218666, + "grad_norm": 5.532617092132568, + "learning_rate": 7.1599287378374685e-06, + "loss": 0.2704, + "step": 21475 + }, + { + "epoch": 1.0912913727618014, + "grad_norm": 5.671202182769775, + "learning_rate": 7.159791695217213e-06, + "loss": 0.2881, + "step": 21476 + }, + { + "epoch": 1.0913049376017363, + "grad_norm": 4.8348002433776855, + "learning_rate": 7.159654652596958e-06, + "loss": 0.213, + "step": 21477 + }, + { + "epoch": 1.0913185024416712, + "grad_norm": 5.962461471557617, + "learning_rate": 7.159517609976704e-06, + "loss": 0.322, + "step": 21478 + }, + { + "epoch": 1.091332067281606, + "grad_norm": 4.419256210327148, + "learning_rate": 7.159380567356448e-06, + "loss": 0.1643, + "step": 21479 + }, + { + "epoch": 1.091345632121541, + "grad_norm": 3.9482297897338867, + "learning_rate": 7.1592435247361935e-06, + "loss": 0.1928, + "step": 21480 + }, + { + "epoch": 1.0913591969614758, + "grad_norm": 4.975577354431152, + "learning_rate": 7.159106482115939e-06, + "loss": 0.2865, + "step": 21481 + }, + { + "epoch": 1.0913727618014106, + "grad_norm": 4.819057941436768, + "learning_rate": 7.158969439495683e-06, + "loss": 0.2002, + "step": 21482 + }, + { + "epoch": 1.0913863266413457, + "grad_norm": 6.416491985321045, + "learning_rate": 7.158832396875429e-06, + "loss": 0.2262, + "step": 21483 + }, + { + "epoch": 1.0913998914812806, + "grad_norm": 5.754147529602051, + "learning_rate": 7.158695354255174e-06, + "loss": 0.2764, + "step": 21484 + }, + { + "epoch": 1.0914134563212154, + "grad_norm": 3.766237735748291, + "learning_rate": 7.1585583116349186e-06, + "loss": 0.2139, + "step": 21485 + }, + { + "epoch": 1.0914270211611503, + "grad_norm": 4.758293151855469, + "learning_rate": 7.158421269014664e-06, + "loss": 0.248, + "step": 21486 + }, + { + "epoch": 1.0914405860010852, + "grad_norm": 5.4939188957214355, + "learning_rate": 7.15828422639441e-06, + "loss": 0.2724, + "step": 21487 + }, + { + "epoch": 1.09145415084102, + "grad_norm": 4.886425495147705, + "learning_rate": 7.158147183774154e-06, + "loss": 0.2683, + "step": 21488 + }, + { + "epoch": 1.091467715680955, + "grad_norm": 4.543388366699219, + "learning_rate": 7.158010141153899e-06, + "loss": 0.2178, + "step": 21489 + }, + { + "epoch": 1.0914812805208898, + "grad_norm": 6.013269901275635, + "learning_rate": 7.1578730985336445e-06, + "loss": 0.3826, + "step": 21490 + }, + { + "epoch": 1.0914948453608246, + "grad_norm": 4.8339033126831055, + "learning_rate": 7.1577360559133905e-06, + "loss": 0.2718, + "step": 21491 + }, + { + "epoch": 1.0915084102007597, + "grad_norm": 4.8381476402282715, + "learning_rate": 7.157599013293135e-06, + "loss": 0.2896, + "step": 21492 + }, + { + "epoch": 1.0915219750406946, + "grad_norm": 5.433690071105957, + "learning_rate": 7.15746197067288e-06, + "loss": 0.2082, + "step": 21493 + }, + { + "epoch": 1.0915355398806295, + "grad_norm": 3.85536789894104, + "learning_rate": 7.157324928052624e-06, + "loss": 0.247, + "step": 21494 + }, + { + "epoch": 1.0915491047205643, + "grad_norm": 4.923925399780273, + "learning_rate": 7.1571878854323695e-06, + "loss": 0.2331, + "step": 21495 + }, + { + "epoch": 1.0915626695604992, + "grad_norm": 6.77726411819458, + "learning_rate": 7.1570508428121155e-06, + "loss": 0.3173, + "step": 21496 + }, + { + "epoch": 1.091576234400434, + "grad_norm": 5.281700611114502, + "learning_rate": 7.156913800191861e-06, + "loss": 0.2113, + "step": 21497 + }, + { + "epoch": 1.091589799240369, + "grad_norm": 3.5499510765075684, + "learning_rate": 7.156776757571605e-06, + "loss": 0.1175, + "step": 21498 + }, + { + "epoch": 1.0916033640803038, + "grad_norm": 5.690040111541748, + "learning_rate": 7.15663971495135e-06, + "loss": 0.2886, + "step": 21499 + }, + { + "epoch": 1.0916169289202386, + "grad_norm": 6.274205684661865, + "learning_rate": 7.156502672331096e-06, + "loss": 0.2793, + "step": 21500 + }, + { + "epoch": 1.0916304937601735, + "grad_norm": 4.727559566497803, + "learning_rate": 7.1563656297108406e-06, + "loss": 0.2112, + "step": 21501 + }, + { + "epoch": 1.0916440586001086, + "grad_norm": 6.151980876922607, + "learning_rate": 7.156228587090586e-06, + "loss": 0.3028, + "step": 21502 + }, + { + "epoch": 1.0916576234400435, + "grad_norm": 4.964695930480957, + "learning_rate": 7.15609154447033e-06, + "loss": 0.2259, + "step": 21503 + }, + { + "epoch": 1.0916711882799783, + "grad_norm": 4.802073955535889, + "learning_rate": 7.155954501850076e-06, + "loss": 0.2577, + "step": 21504 + }, + { + "epoch": 1.0916847531199132, + "grad_norm": 4.610243797302246, + "learning_rate": 7.155817459229821e-06, + "loss": 0.2435, + "step": 21505 + }, + { + "epoch": 1.091698317959848, + "grad_norm": 5.500271320343018, + "learning_rate": 7.1556804166095665e-06, + "loss": 0.2175, + "step": 21506 + }, + { + "epoch": 1.091711882799783, + "grad_norm": 5.266176223754883, + "learning_rate": 7.155543373989311e-06, + "loss": 0.2257, + "step": 21507 + }, + { + "epoch": 1.0917254476397178, + "grad_norm": 5.186131954193115, + "learning_rate": 7.155406331369056e-06, + "loss": 0.2526, + "step": 21508 + }, + { + "epoch": 1.0917390124796527, + "grad_norm": 5.249450206756592, + "learning_rate": 7.155269288748802e-06, + "loss": 0.3608, + "step": 21509 + }, + { + "epoch": 1.0917525773195875, + "grad_norm": 5.60672664642334, + "learning_rate": 7.155132246128546e-06, + "loss": 0.2693, + "step": 21510 + }, + { + "epoch": 1.0917661421595226, + "grad_norm": 5.408098220825195, + "learning_rate": 7.1549952035082915e-06, + "loss": 0.2762, + "step": 21511 + }, + { + "epoch": 1.0917797069994575, + "grad_norm": 5.892107963562012, + "learning_rate": 7.154858160888037e-06, + "loss": 0.2596, + "step": 21512 + }, + { + "epoch": 1.0917932718393923, + "grad_norm": 4.77488899230957, + "learning_rate": 7.154721118267782e-06, + "loss": 0.208, + "step": 21513 + }, + { + "epoch": 1.0918068366793272, + "grad_norm": 5.375114440917969, + "learning_rate": 7.154584075647527e-06, + "loss": 0.1911, + "step": 21514 + }, + { + "epoch": 1.091820401519262, + "grad_norm": 4.952764511108398, + "learning_rate": 7.154447033027272e-06, + "loss": 0.2768, + "step": 21515 + }, + { + "epoch": 1.091833966359197, + "grad_norm": 4.93861722946167, + "learning_rate": 7.1543099904070166e-06, + "loss": 0.1641, + "step": 21516 + }, + { + "epoch": 1.0918475311991318, + "grad_norm": 6.490470886230469, + "learning_rate": 7.154172947786763e-06, + "loss": 0.3523, + "step": 21517 + }, + { + "epoch": 1.0918610960390667, + "grad_norm": 5.761507987976074, + "learning_rate": 7.154035905166508e-06, + "loss": 0.2967, + "step": 21518 + }, + { + "epoch": 1.0918746608790015, + "grad_norm": 4.852923393249512, + "learning_rate": 7.153898862546252e-06, + "loss": 0.1821, + "step": 21519 + }, + { + "epoch": 1.0918882257189364, + "grad_norm": 5.456852912902832, + "learning_rate": 7.153761819925997e-06, + "loss": 0.2624, + "step": 21520 + }, + { + "epoch": 1.0919017905588715, + "grad_norm": 5.191412448883057, + "learning_rate": 7.1536247773057425e-06, + "loss": 0.2015, + "step": 21521 + }, + { + "epoch": 1.0919153553988064, + "grad_norm": 4.590028762817383, + "learning_rate": 7.1534877346854885e-06, + "loss": 0.2032, + "step": 21522 + }, + { + "epoch": 1.0919289202387412, + "grad_norm": 4.907069683074951, + "learning_rate": 7.153350692065233e-06, + "loss": 0.2201, + "step": 21523 + }, + { + "epoch": 1.091942485078676, + "grad_norm": 5.233788967132568, + "learning_rate": 7.153213649444978e-06, + "loss": 0.2666, + "step": 21524 + }, + { + "epoch": 1.091956049918611, + "grad_norm": 4.751404762268066, + "learning_rate": 7.153076606824722e-06, + "loss": 0.2397, + "step": 21525 + }, + { + "epoch": 1.0919696147585458, + "grad_norm": 3.8275721073150635, + "learning_rate": 7.152939564204468e-06, + "loss": 0.1506, + "step": 21526 + }, + { + "epoch": 1.0919831795984807, + "grad_norm": 5.038022518157959, + "learning_rate": 7.1528025215842135e-06, + "loss": 0.2914, + "step": 21527 + }, + { + "epoch": 1.0919967444384155, + "grad_norm": 4.14140510559082, + "learning_rate": 7.152665478963958e-06, + "loss": 0.2014, + "step": 21528 + }, + { + "epoch": 1.0920103092783506, + "grad_norm": 5.004826545715332, + "learning_rate": 7.152528436343703e-06, + "loss": 0.2586, + "step": 21529 + }, + { + "epoch": 1.0920238741182855, + "grad_norm": 4.438161849975586, + "learning_rate": 7.152391393723449e-06, + "loss": 0.2447, + "step": 21530 + }, + { + "epoch": 1.0920374389582204, + "grad_norm": 6.686785697937012, + "learning_rate": 7.152254351103194e-06, + "loss": 0.2318, + "step": 21531 + }, + { + "epoch": 1.0920510037981552, + "grad_norm": 3.533543109893799, + "learning_rate": 7.152117308482939e-06, + "loss": 0.1135, + "step": 21532 + }, + { + "epoch": 1.09206456863809, + "grad_norm": 4.730162143707275, + "learning_rate": 7.151980265862684e-06, + "loss": 0.202, + "step": 21533 + }, + { + "epoch": 1.092078133478025, + "grad_norm": 4.112504482269287, + "learning_rate": 7.15184322324243e-06, + "loss": 0.1517, + "step": 21534 + }, + { + "epoch": 1.0920916983179598, + "grad_norm": 5.076402187347412, + "learning_rate": 7.151706180622174e-06, + "loss": 0.2071, + "step": 21535 + }, + { + "epoch": 1.0921052631578947, + "grad_norm": 5.0659332275390625, + "learning_rate": 7.151569138001919e-06, + "loss": 0.2807, + "step": 21536 + }, + { + "epoch": 1.0921188279978296, + "grad_norm": 5.539534568786621, + "learning_rate": 7.1514320953816645e-06, + "loss": 0.2303, + "step": 21537 + }, + { + "epoch": 1.0921323928377644, + "grad_norm": 4.1198811531066895, + "learning_rate": 7.151295052761409e-06, + "loss": 0.2156, + "step": 21538 + }, + { + "epoch": 1.0921459576776993, + "grad_norm": 4.317439079284668, + "learning_rate": 7.151158010141155e-06, + "loss": 0.2329, + "step": 21539 + }, + { + "epoch": 1.0921595225176344, + "grad_norm": 4.200206279754639, + "learning_rate": 7.1510209675209e-06, + "loss": 0.1913, + "step": 21540 + }, + { + "epoch": 1.0921730873575692, + "grad_norm": 4.050292015075684, + "learning_rate": 7.150883924900644e-06, + "loss": 0.1833, + "step": 21541 + }, + { + "epoch": 1.092186652197504, + "grad_norm": 4.328445911407471, + "learning_rate": 7.1507468822803895e-06, + "loss": 0.1744, + "step": 21542 + }, + { + "epoch": 1.092200217037439, + "grad_norm": 8.085193634033203, + "learning_rate": 7.1506098396601355e-06, + "loss": 0.1667, + "step": 21543 + }, + { + "epoch": 1.0922137818773738, + "grad_norm": 4.048030376434326, + "learning_rate": 7.15047279703988e-06, + "loss": 0.2115, + "step": 21544 + }, + { + "epoch": 1.0922273467173087, + "grad_norm": 6.074654579162598, + "learning_rate": 7.150335754419625e-06, + "loss": 0.2755, + "step": 21545 + }, + { + "epoch": 1.0922409115572436, + "grad_norm": 3.3951053619384766, + "learning_rate": 7.15019871179937e-06, + "loss": 0.1378, + "step": 21546 + }, + { + "epoch": 1.0922544763971784, + "grad_norm": 3.7930660247802734, + "learning_rate": 7.150061669179115e-06, + "loss": 0.137, + "step": 21547 + }, + { + "epoch": 1.0922680412371135, + "grad_norm": 3.778062343597412, + "learning_rate": 7.149924626558861e-06, + "loss": 0.1763, + "step": 21548 + }, + { + "epoch": 1.0922816060770484, + "grad_norm": 4.557783603668213, + "learning_rate": 7.149787583938606e-06, + "loss": 0.2571, + "step": 21549 + }, + { + "epoch": 1.0922951709169832, + "grad_norm": 4.420905113220215, + "learning_rate": 7.14965054131835e-06, + "loss": 0.1949, + "step": 21550 + }, + { + "epoch": 1.0923087357569181, + "grad_norm": 3.8512120246887207, + "learning_rate": 7.149513498698095e-06, + "loss": 0.1757, + "step": 21551 + }, + { + "epoch": 1.092322300596853, + "grad_norm": 3.2579803466796875, + "learning_rate": 7.149376456077841e-06, + "loss": 0.1966, + "step": 21552 + }, + { + "epoch": 1.0923358654367878, + "grad_norm": 5.09885835647583, + "learning_rate": 7.149239413457586e-06, + "loss": 0.3161, + "step": 21553 + }, + { + "epoch": 1.0923494302767227, + "grad_norm": 4.145253658294678, + "learning_rate": 7.149102370837331e-06, + "loss": 0.2021, + "step": 21554 + }, + { + "epoch": 1.0923629951166576, + "grad_norm": 4.9401679039001465, + "learning_rate": 7.148965328217076e-06, + "loss": 0.2994, + "step": 21555 + }, + { + "epoch": 1.0923765599565924, + "grad_norm": 5.02330207824707, + "learning_rate": 7.148828285596822e-06, + "loss": 0.2043, + "step": 21556 + }, + { + "epoch": 1.0923901247965273, + "grad_norm": 4.471630096435547, + "learning_rate": 7.148691242976566e-06, + "loss": 0.2172, + "step": 21557 + }, + { + "epoch": 1.0924036896364622, + "grad_norm": 5.614553928375244, + "learning_rate": 7.1485542003563115e-06, + "loss": 0.1569, + "step": 21558 + }, + { + "epoch": 1.0924172544763973, + "grad_norm": 4.2830915451049805, + "learning_rate": 7.148417157736056e-06, + "loss": 0.1683, + "step": 21559 + }, + { + "epoch": 1.0924308193163321, + "grad_norm": 3.9023044109344482, + "learning_rate": 7.148280115115802e-06, + "loss": 0.1579, + "step": 21560 + }, + { + "epoch": 1.092444384156267, + "grad_norm": 4.7924885749816895, + "learning_rate": 7.148143072495547e-06, + "loss": 0.2165, + "step": 21561 + }, + { + "epoch": 1.0924579489962019, + "grad_norm": 5.763791561126709, + "learning_rate": 7.148006029875291e-06, + "loss": 0.3016, + "step": 21562 + }, + { + "epoch": 1.0924715138361367, + "grad_norm": 3.4445202350616455, + "learning_rate": 7.147868987255037e-06, + "loss": 0.1757, + "step": 21563 + }, + { + "epoch": 1.0924850786760716, + "grad_norm": 3.9296863079071045, + "learning_rate": 7.147731944634782e-06, + "loss": 0.1853, + "step": 21564 + }, + { + "epoch": 1.0924986435160065, + "grad_norm": 5.255455493927002, + "learning_rate": 7.147594902014528e-06, + "loss": 0.2462, + "step": 21565 + }, + { + "epoch": 1.0925122083559413, + "grad_norm": 5.244866847991943, + "learning_rate": 7.147457859394272e-06, + "loss": 0.1842, + "step": 21566 + }, + { + "epoch": 1.0925257731958764, + "grad_norm": 3.692013740539551, + "learning_rate": 7.147320816774017e-06, + "loss": 0.2336, + "step": 21567 + }, + { + "epoch": 1.0925393380358113, + "grad_norm": 6.228684425354004, + "learning_rate": 7.147183774153762e-06, + "loss": 0.2723, + "step": 21568 + }, + { + "epoch": 1.0925529028757461, + "grad_norm": 7.838004112243652, + "learning_rate": 7.147046731533508e-06, + "loss": 0.3343, + "step": 21569 + }, + { + "epoch": 1.092566467715681, + "grad_norm": 4.506779670715332, + "learning_rate": 7.146909688913253e-06, + "loss": 0.2874, + "step": 21570 + }, + { + "epoch": 1.0925800325556159, + "grad_norm": 5.372279167175293, + "learning_rate": 7.146772646292998e-06, + "loss": 0.21, + "step": 21571 + }, + { + "epoch": 1.0925935973955507, + "grad_norm": 3.4822287559509277, + "learning_rate": 7.146635603672742e-06, + "loss": 0.1652, + "step": 21572 + }, + { + "epoch": 1.0926071622354856, + "grad_norm": 4.1480536460876465, + "learning_rate": 7.146498561052488e-06, + "loss": 0.252, + "step": 21573 + }, + { + "epoch": 1.0926207270754205, + "grad_norm": 3.691441059112549, + "learning_rate": 7.1463615184322336e-06, + "loss": 0.18, + "step": 21574 + }, + { + "epoch": 1.0926342919153553, + "grad_norm": 4.609952449798584, + "learning_rate": 7.146224475811978e-06, + "loss": 0.2532, + "step": 21575 + }, + { + "epoch": 1.0926478567552902, + "grad_norm": 4.713130474090576, + "learning_rate": 7.146087433191723e-06, + "loss": 0.2727, + "step": 21576 + }, + { + "epoch": 1.0926614215952253, + "grad_norm": 5.484837532043457, + "learning_rate": 7.145950390571467e-06, + "loss": 0.3004, + "step": 21577 + }, + { + "epoch": 1.0926749864351601, + "grad_norm": 3.6839566230773926, + "learning_rate": 7.145813347951213e-06, + "loss": 0.2024, + "step": 21578 + }, + { + "epoch": 1.092688551275095, + "grad_norm": 5.695631980895996, + "learning_rate": 7.145676305330959e-06, + "loss": 0.3335, + "step": 21579 + }, + { + "epoch": 1.0927021161150299, + "grad_norm": 6.545315265655518, + "learning_rate": 7.145539262710704e-06, + "loss": 0.358, + "step": 21580 + }, + { + "epoch": 1.0927156809549647, + "grad_norm": 6.761148452758789, + "learning_rate": 7.145402220090448e-06, + "loss": 0.372, + "step": 21581 + }, + { + "epoch": 1.0927292457948996, + "grad_norm": 5.656249523162842, + "learning_rate": 7.145265177470194e-06, + "loss": 0.2641, + "step": 21582 + }, + { + "epoch": 1.0927428106348345, + "grad_norm": 5.0814528465271, + "learning_rate": 7.145128134849939e-06, + "loss": 0.2454, + "step": 21583 + }, + { + "epoch": 1.0927563754747693, + "grad_norm": 5.214643955230713, + "learning_rate": 7.144991092229684e-06, + "loss": 0.3683, + "step": 21584 + }, + { + "epoch": 1.0927699403147042, + "grad_norm": 7.9712090492248535, + "learning_rate": 7.144854049609429e-06, + "loss": 0.4312, + "step": 21585 + }, + { + "epoch": 1.0927835051546393, + "grad_norm": 6.009909629821777, + "learning_rate": 7.144717006989175e-06, + "loss": 0.2501, + "step": 21586 + }, + { + "epoch": 1.0927970699945742, + "grad_norm": 5.485213279724121, + "learning_rate": 7.144579964368919e-06, + "loss": 0.2526, + "step": 21587 + }, + { + "epoch": 1.092810634834509, + "grad_norm": 6.254511833190918, + "learning_rate": 7.144442921748664e-06, + "loss": 0.3895, + "step": 21588 + }, + { + "epoch": 1.0928241996744439, + "grad_norm": 7.084348678588867, + "learning_rate": 7.1443058791284095e-06, + "loss": 0.4354, + "step": 21589 + }, + { + "epoch": 1.0928377645143788, + "grad_norm": 6.429505825042725, + "learning_rate": 7.144168836508154e-06, + "loss": 0.3063, + "step": 21590 + }, + { + "epoch": 1.0928513293543136, + "grad_norm": 6.01395845413208, + "learning_rate": 7.1440317938879e-06, + "loss": 0.2718, + "step": 21591 + }, + { + "epoch": 1.0928648941942485, + "grad_norm": 6.359217166900635, + "learning_rate": 7.143894751267645e-06, + "loss": 0.2784, + "step": 21592 + }, + { + "epoch": 1.0928784590341833, + "grad_norm": 4.229299545288086, + "learning_rate": 7.143757708647389e-06, + "loss": 0.2051, + "step": 21593 + }, + { + "epoch": 1.0928920238741182, + "grad_norm": 4.986174583435059, + "learning_rate": 7.143620666027135e-06, + "loss": 0.2786, + "step": 21594 + }, + { + "epoch": 1.092905588714053, + "grad_norm": 6.902231216430664, + "learning_rate": 7.143483623406881e-06, + "loss": 0.3749, + "step": 21595 + }, + { + "epoch": 1.0929191535539882, + "grad_norm": 6.506583213806152, + "learning_rate": 7.143346580786625e-06, + "loss": 0.3015, + "step": 21596 + }, + { + "epoch": 1.092932718393923, + "grad_norm": 5.014999866485596, + "learning_rate": 7.14320953816637e-06, + "loss": 0.2652, + "step": 21597 + }, + { + "epoch": 1.092946283233858, + "grad_norm": 3.6158833503723145, + "learning_rate": 7.143072495546115e-06, + "loss": 0.1631, + "step": 21598 + }, + { + "epoch": 1.0929598480737928, + "grad_norm": 6.205552577972412, + "learning_rate": 7.142935452925861e-06, + "loss": 0.3135, + "step": 21599 + }, + { + "epoch": 1.0929734129137276, + "grad_norm": 7.329055309295654, + "learning_rate": 7.142798410305606e-06, + "loss": 0.3167, + "step": 21600 + }, + { + "epoch": 1.0929869777536625, + "grad_norm": 6.576058387756348, + "learning_rate": 7.142661367685351e-06, + "loss": 0.2816, + "step": 21601 + }, + { + "epoch": 1.0930005425935974, + "grad_norm": 6.054511070251465, + "learning_rate": 7.142524325065095e-06, + "loss": 0.2649, + "step": 21602 + }, + { + "epoch": 1.0930141074335322, + "grad_norm": 7.699019432067871, + "learning_rate": 7.142387282444841e-06, + "loss": 0.3915, + "step": 21603 + }, + { + "epoch": 1.093027672273467, + "grad_norm": 5.9661736488342285, + "learning_rate": 7.142250239824586e-06, + "loss": 0.2353, + "step": 21604 + }, + { + "epoch": 1.0930412371134022, + "grad_norm": 7.269093990325928, + "learning_rate": 7.1421131972043316e-06, + "loss": 0.3665, + "step": 21605 + }, + { + "epoch": 1.093054801953337, + "grad_norm": 4.968216896057129, + "learning_rate": 7.141976154584076e-06, + "loss": 0.2648, + "step": 21606 + }, + { + "epoch": 1.093068366793272, + "grad_norm": 6.347280502319336, + "learning_rate": 7.141839111963821e-06, + "loss": 0.3907, + "step": 21607 + }, + { + "epoch": 1.0930819316332068, + "grad_norm": 6.180716514587402, + "learning_rate": 7.141702069343567e-06, + "loss": 0.3494, + "step": 21608 + }, + { + "epoch": 1.0930954964731416, + "grad_norm": 5.300248146057129, + "learning_rate": 7.1415650267233114e-06, + "loss": 0.2503, + "step": 21609 + }, + { + "epoch": 1.0931090613130765, + "grad_norm": 6.149145603179932, + "learning_rate": 7.141427984103057e-06, + "loss": 0.3681, + "step": 21610 + }, + { + "epoch": 1.0931226261530114, + "grad_norm": 5.981625556945801, + "learning_rate": 7.141290941482801e-06, + "loss": 0.4498, + "step": 21611 + }, + { + "epoch": 1.0931361909929462, + "grad_norm": 7.405697345733643, + "learning_rate": 7.141153898862547e-06, + "loss": 0.4075, + "step": 21612 + }, + { + "epoch": 1.093149755832881, + "grad_norm": 4.988957405090332, + "learning_rate": 7.141016856242292e-06, + "loss": 0.2153, + "step": 21613 + }, + { + "epoch": 1.093163320672816, + "grad_norm": 5.097752571105957, + "learning_rate": 7.140879813622037e-06, + "loss": 0.2301, + "step": 21614 + }, + { + "epoch": 1.093176885512751, + "grad_norm": 6.728554725646973, + "learning_rate": 7.140742771001782e-06, + "loss": 0.2881, + "step": 21615 + }, + { + "epoch": 1.093190450352686, + "grad_norm": 5.616624355316162, + "learning_rate": 7.140605728381528e-06, + "loss": 0.273, + "step": 21616 + }, + { + "epoch": 1.0932040151926208, + "grad_norm": 8.591562271118164, + "learning_rate": 7.140468685761273e-06, + "loss": 0.721, + "step": 21617 + }, + { + "epoch": 1.0932175800325556, + "grad_norm": 5.537740230560303, + "learning_rate": 7.140331643141017e-06, + "loss": 0.2641, + "step": 21618 + }, + { + "epoch": 1.0932311448724905, + "grad_norm": 6.585316181182861, + "learning_rate": 7.140194600520762e-06, + "loss": 0.3715, + "step": 21619 + }, + { + "epoch": 1.0932447097124254, + "grad_norm": 8.701704978942871, + "learning_rate": 7.1400575579005075e-06, + "loss": 0.547, + "step": 21620 + }, + { + "epoch": 1.0932582745523602, + "grad_norm": 6.208643913269043, + "learning_rate": 7.139920515280253e-06, + "loss": 0.3002, + "step": 21621 + }, + { + "epoch": 1.093271839392295, + "grad_norm": 6.302371978759766, + "learning_rate": 7.139783472659998e-06, + "loss": 0.2883, + "step": 21622 + }, + { + "epoch": 1.09328540423223, + "grad_norm": 6.597775936126709, + "learning_rate": 7.139646430039743e-06, + "loss": 0.2618, + "step": 21623 + }, + { + "epoch": 1.093298969072165, + "grad_norm": 6.234733581542969, + "learning_rate": 7.139509387419487e-06, + "loss": 0.2631, + "step": 21624 + }, + { + "epoch": 1.0933125339121, + "grad_norm": 7.0513529777526855, + "learning_rate": 7.1393723447992334e-06, + "loss": 0.311, + "step": 21625 + }, + { + "epoch": 1.0933260987520348, + "grad_norm": 6.6707763671875, + "learning_rate": 7.139235302178979e-06, + "loss": 0.2901, + "step": 21626 + }, + { + "epoch": 1.0933396635919697, + "grad_norm": 6.754992961883545, + "learning_rate": 7.139098259558723e-06, + "loss": 0.3056, + "step": 21627 + }, + { + "epoch": 1.0933532284319045, + "grad_norm": 5.416199207305908, + "learning_rate": 7.138961216938468e-06, + "loss": 0.3564, + "step": 21628 + }, + { + "epoch": 1.0933667932718394, + "grad_norm": 6.03277587890625, + "learning_rate": 7.138824174318214e-06, + "loss": 0.3587, + "step": 21629 + }, + { + "epoch": 1.0933803581117743, + "grad_norm": 6.252918720245361, + "learning_rate": 7.1386871316979585e-06, + "loss": 0.39, + "step": 21630 + }, + { + "epoch": 1.0933939229517091, + "grad_norm": 8.774600982666016, + "learning_rate": 7.138550089077704e-06, + "loss": 0.6489, + "step": 21631 + }, + { + "epoch": 1.093407487791644, + "grad_norm": 6.981926441192627, + "learning_rate": 7.138413046457449e-06, + "loss": 0.5076, + "step": 21632 + }, + { + "epoch": 1.0934210526315788, + "grad_norm": 7.325681209564209, + "learning_rate": 7.138276003837193e-06, + "loss": 0.3895, + "step": 21633 + }, + { + "epoch": 1.093434617471514, + "grad_norm": 4.818314075469971, + "learning_rate": 7.138138961216939e-06, + "loss": 0.2699, + "step": 21634 + }, + { + "epoch": 1.0934481823114488, + "grad_norm": 6.582585334777832, + "learning_rate": 7.138001918596684e-06, + "loss": 0.3809, + "step": 21635 + }, + { + "epoch": 1.0934617471513837, + "grad_norm": 8.84281063079834, + "learning_rate": 7.137864875976429e-06, + "loss": 0.456, + "step": 21636 + }, + { + "epoch": 1.0934753119913185, + "grad_norm": 5.879117012023926, + "learning_rate": 7.137727833356174e-06, + "loss": 0.3941, + "step": 21637 + }, + { + "epoch": 1.0934888768312534, + "grad_norm": 6.221065044403076, + "learning_rate": 7.13759079073592e-06, + "loss": 0.328, + "step": 21638 + }, + { + "epoch": 1.0935024416711883, + "grad_norm": 6.157934188842773, + "learning_rate": 7.137453748115665e-06, + "loss": 0.2725, + "step": 21639 + }, + { + "epoch": 1.0935160065111231, + "grad_norm": 4.738943576812744, + "learning_rate": 7.1373167054954094e-06, + "loss": 0.3086, + "step": 21640 + }, + { + "epoch": 1.093529571351058, + "grad_norm": 6.635967254638672, + "learning_rate": 7.137179662875155e-06, + "loss": 0.2777, + "step": 21641 + }, + { + "epoch": 1.0935431361909929, + "grad_norm": 8.037789344787598, + "learning_rate": 7.137042620254901e-06, + "loss": 0.3862, + "step": 21642 + }, + { + "epoch": 1.093556701030928, + "grad_norm": 5.716549873352051, + "learning_rate": 7.136905577634645e-06, + "loss": 0.3375, + "step": 21643 + }, + { + "epoch": 1.0935702658708628, + "grad_norm": 7.344781398773193, + "learning_rate": 7.13676853501439e-06, + "loss": 0.2717, + "step": 21644 + }, + { + "epoch": 1.0935838307107977, + "grad_norm": 5.724452972412109, + "learning_rate": 7.1366314923941345e-06, + "loss": 0.2854, + "step": 21645 + }, + { + "epoch": 1.0935973955507325, + "grad_norm": 7.3493828773498535, + "learning_rate": 7.13649444977388e-06, + "loss": 0.3927, + "step": 21646 + }, + { + "epoch": 1.0936109603906674, + "grad_norm": 7.861181259155273, + "learning_rate": 7.136357407153626e-06, + "loss": 0.4717, + "step": 21647 + }, + { + "epoch": 1.0936245252306023, + "grad_norm": 7.127492904663086, + "learning_rate": 7.136220364533371e-06, + "loss": 0.4947, + "step": 21648 + }, + { + "epoch": 1.0936380900705371, + "grad_norm": 5.632497310638428, + "learning_rate": 7.136083321913115e-06, + "loss": 0.215, + "step": 21649 + }, + { + "epoch": 1.093651654910472, + "grad_norm": 5.148318290710449, + "learning_rate": 7.13594627929286e-06, + "loss": 0.27, + "step": 21650 + }, + { + "epoch": 1.0936652197504069, + "grad_norm": 5.3981170654296875, + "learning_rate": 7.135809236672606e-06, + "loss": 0.2478, + "step": 21651 + }, + { + "epoch": 1.0936787845903417, + "grad_norm": 8.011492729187012, + "learning_rate": 7.135672194052351e-06, + "loss": 0.4344, + "step": 21652 + }, + { + "epoch": 1.0936923494302768, + "grad_norm": 5.518980503082275, + "learning_rate": 7.135535151432096e-06, + "loss": 0.272, + "step": 21653 + }, + { + "epoch": 1.0937059142702117, + "grad_norm": 6.0416975021362305, + "learning_rate": 7.135398108811841e-06, + "loss": 0.2927, + "step": 21654 + }, + { + "epoch": 1.0937194791101466, + "grad_norm": 6.672142505645752, + "learning_rate": 7.135261066191586e-06, + "loss": 0.318, + "step": 21655 + }, + { + "epoch": 1.0937330439500814, + "grad_norm": 4.822414398193359, + "learning_rate": 7.1351240235713314e-06, + "loss": 0.2657, + "step": 21656 + }, + { + "epoch": 1.0937466087900163, + "grad_norm": 5.72429895401001, + "learning_rate": 7.134986980951077e-06, + "loss": 0.2423, + "step": 21657 + }, + { + "epoch": 1.0937601736299511, + "grad_norm": 6.359409809112549, + "learning_rate": 7.134849938330821e-06, + "loss": 0.4598, + "step": 21658 + }, + { + "epoch": 1.093773738469886, + "grad_norm": 6.46493673324585, + "learning_rate": 7.134712895710566e-06, + "loss": 0.2707, + "step": 21659 + }, + { + "epoch": 1.0937873033098209, + "grad_norm": 7.230751991271973, + "learning_rate": 7.134575853090312e-06, + "loss": 0.2947, + "step": 21660 + }, + { + "epoch": 1.0938008681497557, + "grad_norm": 5.9447150230407715, + "learning_rate": 7.1344388104700565e-06, + "loss": 0.2457, + "step": 21661 + }, + { + "epoch": 1.0938144329896908, + "grad_norm": 7.033906936645508, + "learning_rate": 7.134301767849802e-06, + "loss": 0.3206, + "step": 21662 + }, + { + "epoch": 1.0938279978296257, + "grad_norm": 7.0497050285339355, + "learning_rate": 7.134164725229547e-06, + "loss": 0.4748, + "step": 21663 + }, + { + "epoch": 1.0938415626695606, + "grad_norm": 5.860743522644043, + "learning_rate": 7.134027682609293e-06, + "loss": 0.4061, + "step": 21664 + }, + { + "epoch": 1.0938551275094954, + "grad_norm": 6.159110069274902, + "learning_rate": 7.133890639989037e-06, + "loss": 0.3808, + "step": 21665 + }, + { + "epoch": 1.0938686923494303, + "grad_norm": 6.1964216232299805, + "learning_rate": 7.133753597368782e-06, + "loss": 0.2744, + "step": 21666 + }, + { + "epoch": 1.0938822571893652, + "grad_norm": 5.815526008605957, + "learning_rate": 7.133616554748527e-06, + "loss": 0.292, + "step": 21667 + }, + { + "epoch": 1.0938958220293, + "grad_norm": 6.137945175170898, + "learning_rate": 7.133479512128273e-06, + "loss": 0.3289, + "step": 21668 + }, + { + "epoch": 1.093909386869235, + "grad_norm": 6.418367385864258, + "learning_rate": 7.133342469508018e-06, + "loss": 0.3024, + "step": 21669 + }, + { + "epoch": 1.0939229517091698, + "grad_norm": 6.356346130371094, + "learning_rate": 7.133205426887762e-06, + "loss": 0.2732, + "step": 21670 + }, + { + "epoch": 1.0939365165491046, + "grad_norm": 4.538723468780518, + "learning_rate": 7.1330683842675074e-06, + "loss": 0.2198, + "step": 21671 + }, + { + "epoch": 1.0939500813890397, + "grad_norm": 5.833949089050293, + "learning_rate": 7.1329313416472535e-06, + "loss": 0.3275, + "step": 21672 + }, + { + "epoch": 1.0939636462289746, + "grad_norm": 3.905036449432373, + "learning_rate": 7.132794299026999e-06, + "loss": 0.2238, + "step": 21673 + }, + { + "epoch": 1.0939772110689094, + "grad_norm": 7.182972431182861, + "learning_rate": 7.132657256406743e-06, + "loss": 0.3808, + "step": 21674 + }, + { + "epoch": 1.0939907759088443, + "grad_norm": 5.803205966949463, + "learning_rate": 7.132520213786488e-06, + "loss": 0.3301, + "step": 21675 + }, + { + "epoch": 1.0940043407487792, + "grad_norm": 6.174719333648682, + "learning_rate": 7.1323831711662325e-06, + "loss": 0.2398, + "step": 21676 + }, + { + "epoch": 1.094017905588714, + "grad_norm": 5.888758659362793, + "learning_rate": 7.1322461285459785e-06, + "loss": 0.2773, + "step": 21677 + }, + { + "epoch": 1.094031470428649, + "grad_norm": 7.185013771057129, + "learning_rate": 7.132109085925724e-06, + "loss": 0.3465, + "step": 21678 + }, + { + "epoch": 1.0940450352685838, + "grad_norm": 6.472988605499268, + "learning_rate": 7.131972043305468e-06, + "loss": 0.3529, + "step": 21679 + }, + { + "epoch": 1.0940586001085186, + "grad_norm": 4.909549713134766, + "learning_rate": 7.131835000685213e-06, + "loss": 0.284, + "step": 21680 + }, + { + "epoch": 1.0940721649484537, + "grad_norm": 5.783632278442383, + "learning_rate": 7.131697958064959e-06, + "loss": 0.3676, + "step": 21681 + }, + { + "epoch": 1.0940857297883886, + "grad_norm": 5.844886302947998, + "learning_rate": 7.131560915444704e-06, + "loss": 0.258, + "step": 21682 + }, + { + "epoch": 1.0940992946283234, + "grad_norm": 5.322325706481934, + "learning_rate": 7.131423872824449e-06, + "loss": 0.3395, + "step": 21683 + }, + { + "epoch": 1.0941128594682583, + "grad_norm": 4.8915910720825195, + "learning_rate": 7.131286830204194e-06, + "loss": 0.2388, + "step": 21684 + }, + { + "epoch": 1.0941264243081932, + "grad_norm": 5.867087364196777, + "learning_rate": 7.13114978758394e-06, + "loss": 0.2194, + "step": 21685 + }, + { + "epoch": 1.094139989148128, + "grad_norm": 5.35695219039917, + "learning_rate": 7.131012744963684e-06, + "loss": 0.2898, + "step": 21686 + }, + { + "epoch": 1.094153553988063, + "grad_norm": 5.695276737213135, + "learning_rate": 7.1308757023434294e-06, + "loss": 0.3139, + "step": 21687 + }, + { + "epoch": 1.0941671188279978, + "grad_norm": 5.227087497711182, + "learning_rate": 7.130738659723175e-06, + "loss": 0.2745, + "step": 21688 + }, + { + "epoch": 1.0941806836679326, + "grad_norm": 5.65167760848999, + "learning_rate": 7.130601617102919e-06, + "loss": 0.2976, + "step": 21689 + }, + { + "epoch": 1.0941942485078675, + "grad_norm": 4.602997779846191, + "learning_rate": 7.130464574482665e-06, + "loss": 0.2029, + "step": 21690 + }, + { + "epoch": 1.0942078133478026, + "grad_norm": 5.03061056137085, + "learning_rate": 7.13032753186241e-06, + "loss": 0.2784, + "step": 21691 + }, + { + "epoch": 1.0942213781877375, + "grad_norm": 5.502901554107666, + "learning_rate": 7.1301904892421545e-06, + "loss": 0.3305, + "step": 21692 + }, + { + "epoch": 1.0942349430276723, + "grad_norm": 5.0882344245910645, + "learning_rate": 7.1300534466219e-06, + "loss": 0.2937, + "step": 21693 + }, + { + "epoch": 1.0942485078676072, + "grad_norm": 6.3666276931762695, + "learning_rate": 7.129916404001646e-06, + "loss": 0.3074, + "step": 21694 + }, + { + "epoch": 1.094262072707542, + "grad_norm": 5.563899993896484, + "learning_rate": 7.12977936138139e-06, + "loss": 0.3398, + "step": 21695 + }, + { + "epoch": 1.094275637547477, + "grad_norm": 6.781922340393066, + "learning_rate": 7.129642318761135e-06, + "loss": 0.3225, + "step": 21696 + }, + { + "epoch": 1.0942892023874118, + "grad_norm": 4.366235733032227, + "learning_rate": 7.12950527614088e-06, + "loss": 0.2162, + "step": 21697 + }, + { + "epoch": 1.0943027672273467, + "grad_norm": 6.641031265258789, + "learning_rate": 7.129368233520626e-06, + "loss": 0.3017, + "step": 21698 + }, + { + "epoch": 1.0943163320672815, + "grad_norm": 7.414429664611816, + "learning_rate": 7.129231190900371e-06, + "loss": 0.3989, + "step": 21699 + }, + { + "epoch": 1.0943298969072166, + "grad_norm": 4.6676788330078125, + "learning_rate": 7.129094148280116e-06, + "loss": 0.2315, + "step": 21700 + }, + { + "epoch": 1.0943434617471515, + "grad_norm": 7.31459379196167, + "learning_rate": 7.12895710565986e-06, + "loss": 0.1703, + "step": 21701 + }, + { + "epoch": 1.0943570265870863, + "grad_norm": 6.459529399871826, + "learning_rate": 7.1288200630396054e-06, + "loss": 0.4699, + "step": 21702 + }, + { + "epoch": 1.0943705914270212, + "grad_norm": 5.076016902923584, + "learning_rate": 7.1286830204193515e-06, + "loss": 0.2759, + "step": 21703 + }, + { + "epoch": 1.094384156266956, + "grad_norm": 8.35168170928955, + "learning_rate": 7.128545977799096e-06, + "loss": 0.5883, + "step": 21704 + }, + { + "epoch": 1.094397721106891, + "grad_norm": 5.6826605796813965, + "learning_rate": 7.128408935178841e-06, + "loss": 0.2855, + "step": 21705 + }, + { + "epoch": 1.0944112859468258, + "grad_norm": 5.040968894958496, + "learning_rate": 7.128271892558586e-06, + "loss": 0.2845, + "step": 21706 + }, + { + "epoch": 1.0944248507867607, + "grad_norm": 8.7802095413208, + "learning_rate": 7.128134849938332e-06, + "loss": 0.4104, + "step": 21707 + }, + { + "epoch": 1.0944384156266955, + "grad_norm": 6.591896057128906, + "learning_rate": 7.1279978073180765e-06, + "loss": 0.3096, + "step": 21708 + }, + { + "epoch": 1.0944519804666304, + "grad_norm": 5.241775035858154, + "learning_rate": 7.127860764697822e-06, + "loss": 0.3857, + "step": 21709 + }, + { + "epoch": 1.0944655453065655, + "grad_norm": 6.692252159118652, + "learning_rate": 7.127723722077566e-06, + "loss": 0.4537, + "step": 21710 + }, + { + "epoch": 1.0944791101465003, + "grad_norm": 7.283175945281982, + "learning_rate": 7.127586679457312e-06, + "loss": 0.3464, + "step": 21711 + }, + { + "epoch": 1.0944926749864352, + "grad_norm": 6.394084930419922, + "learning_rate": 7.127449636837057e-06, + "loss": 0.287, + "step": 21712 + }, + { + "epoch": 1.09450623982637, + "grad_norm": 4.7835798263549805, + "learning_rate": 7.127312594216802e-06, + "loss": 0.2263, + "step": 21713 + }, + { + "epoch": 1.094519804666305, + "grad_norm": 4.893475532531738, + "learning_rate": 7.127175551596547e-06, + "loss": 0.2052, + "step": 21714 + }, + { + "epoch": 1.0945333695062398, + "grad_norm": 4.896248817443848, + "learning_rate": 7.127038508976292e-06, + "loss": 0.2333, + "step": 21715 + }, + { + "epoch": 1.0945469343461747, + "grad_norm": 5.28591775894165, + "learning_rate": 7.126901466356038e-06, + "loss": 0.2259, + "step": 21716 + }, + { + "epoch": 1.0945604991861095, + "grad_norm": 4.915546417236328, + "learning_rate": 7.126764423735782e-06, + "loss": 0.4012, + "step": 21717 + }, + { + "epoch": 1.0945740640260444, + "grad_norm": 4.941303730010986, + "learning_rate": 7.1266273811155275e-06, + "loss": 0.2237, + "step": 21718 + }, + { + "epoch": 1.0945876288659795, + "grad_norm": 3.492216110229492, + "learning_rate": 7.126490338495272e-06, + "loss": 0.1777, + "step": 21719 + }, + { + "epoch": 1.0946011937059144, + "grad_norm": 3.4881842136383057, + "learning_rate": 7.126353295875018e-06, + "loss": 0.2095, + "step": 21720 + }, + { + "epoch": 1.0946147585458492, + "grad_norm": 5.2093048095703125, + "learning_rate": 7.126216253254763e-06, + "loss": 0.2784, + "step": 21721 + }, + { + "epoch": 1.094628323385784, + "grad_norm": 4.876402378082275, + "learning_rate": 7.126079210634508e-06, + "loss": 0.2929, + "step": 21722 + }, + { + "epoch": 1.094641888225719, + "grad_norm": 5.26588249206543, + "learning_rate": 7.1259421680142525e-06, + "loss": 0.2339, + "step": 21723 + }, + { + "epoch": 1.0946554530656538, + "grad_norm": 4.893180847167969, + "learning_rate": 7.1258051253939985e-06, + "loss": 0.1995, + "step": 21724 + }, + { + "epoch": 1.0946690179055887, + "grad_norm": 5.836080551147461, + "learning_rate": 7.125668082773744e-06, + "loss": 0.2899, + "step": 21725 + }, + { + "epoch": 1.0946825827455235, + "grad_norm": 4.457588195800781, + "learning_rate": 7.125531040153488e-06, + "loss": 0.1936, + "step": 21726 + }, + { + "epoch": 1.0946961475854584, + "grad_norm": 6.491934299468994, + "learning_rate": 7.125393997533233e-06, + "loss": 0.4323, + "step": 21727 + }, + { + "epoch": 1.0947097124253933, + "grad_norm": 4.780395984649658, + "learning_rate": 7.125256954912978e-06, + "loss": 0.1893, + "step": 21728 + }, + { + "epoch": 1.0947232772653284, + "grad_norm": 5.176377296447754, + "learning_rate": 7.1251199122927236e-06, + "loss": 0.2879, + "step": 21729 + }, + { + "epoch": 1.0947368421052632, + "grad_norm": 5.198300838470459, + "learning_rate": 7.124982869672469e-06, + "loss": 0.2323, + "step": 21730 + }, + { + "epoch": 1.094750406945198, + "grad_norm": 4.433948516845703, + "learning_rate": 7.124845827052214e-06, + "loss": 0.2518, + "step": 21731 + }, + { + "epoch": 1.094763971785133, + "grad_norm": 4.377017498016357, + "learning_rate": 7.124708784431958e-06, + "loss": 0.332, + "step": 21732 + }, + { + "epoch": 1.0947775366250678, + "grad_norm": 5.4756340980529785, + "learning_rate": 7.124571741811704e-06, + "loss": 0.3769, + "step": 21733 + }, + { + "epoch": 1.0947911014650027, + "grad_norm": 4.388008117675781, + "learning_rate": 7.1244346991914495e-06, + "loss": 0.2235, + "step": 21734 + }, + { + "epoch": 1.0948046663049376, + "grad_norm": 6.576022148132324, + "learning_rate": 7.124297656571194e-06, + "loss": 0.4358, + "step": 21735 + }, + { + "epoch": 1.0948182311448724, + "grad_norm": 4.138849258422852, + "learning_rate": 7.124160613950939e-06, + "loss": 0.1658, + "step": 21736 + }, + { + "epoch": 1.0948317959848073, + "grad_norm": 4.605193138122559, + "learning_rate": 7.124023571330685e-06, + "loss": 0.3002, + "step": 21737 + }, + { + "epoch": 1.0948453608247424, + "grad_norm": 5.420401573181152, + "learning_rate": 7.123886528710429e-06, + "loss": 0.3218, + "step": 21738 + }, + { + "epoch": 1.0948589256646772, + "grad_norm": 8.645590782165527, + "learning_rate": 7.1237494860901745e-06, + "loss": 0.3718, + "step": 21739 + }, + { + "epoch": 1.094872490504612, + "grad_norm": 4.761386394500732, + "learning_rate": 7.12361244346992e-06, + "loss": 0.2757, + "step": 21740 + }, + { + "epoch": 1.094886055344547, + "grad_norm": 4.695026874542236, + "learning_rate": 7.123475400849666e-06, + "loss": 0.2409, + "step": 21741 + }, + { + "epoch": 1.0948996201844818, + "grad_norm": 6.578399658203125, + "learning_rate": 7.12333835822941e-06, + "loss": 0.3162, + "step": 21742 + }, + { + "epoch": 1.0949131850244167, + "grad_norm": 4.156390190124512, + "learning_rate": 7.123201315609155e-06, + "loss": 0.1988, + "step": 21743 + }, + { + "epoch": 1.0949267498643516, + "grad_norm": 4.448807239532471, + "learning_rate": 7.1230642729888996e-06, + "loss": 0.1805, + "step": 21744 + }, + { + "epoch": 1.0949403147042864, + "grad_norm": 7.141829490661621, + "learning_rate": 7.122927230368645e-06, + "loss": 0.273, + "step": 21745 + }, + { + "epoch": 1.0949538795442213, + "grad_norm": 3.6393396854400635, + "learning_rate": 7.122790187748391e-06, + "loss": 0.2514, + "step": 21746 + }, + { + "epoch": 1.0949674443841562, + "grad_norm": 5.733617305755615, + "learning_rate": 7.122653145128136e-06, + "loss": 0.3664, + "step": 21747 + }, + { + "epoch": 1.0949810092240913, + "grad_norm": 4.140653610229492, + "learning_rate": 7.12251610250788e-06, + "loss": 0.1708, + "step": 21748 + }, + { + "epoch": 1.0949945740640261, + "grad_norm": 6.0732645988464355, + "learning_rate": 7.1223790598876255e-06, + "loss": 0.3471, + "step": 21749 + }, + { + "epoch": 1.095008138903961, + "grad_norm": 7.631553649902344, + "learning_rate": 7.1222420172673715e-06, + "loss": 0.3337, + "step": 21750 + }, + { + "epoch": 1.0950217037438958, + "grad_norm": 5.192935466766357, + "learning_rate": 7.122104974647116e-06, + "loss": 0.4363, + "step": 21751 + }, + { + "epoch": 1.0950352685838307, + "grad_norm": 6.700899124145508, + "learning_rate": 7.121967932026861e-06, + "loss": 0.3215, + "step": 21752 + }, + { + "epoch": 1.0950488334237656, + "grad_norm": 4.947521209716797, + "learning_rate": 7.121830889406605e-06, + "loss": 0.3091, + "step": 21753 + }, + { + "epoch": 1.0950623982637004, + "grad_norm": 6.253696441650391, + "learning_rate": 7.121693846786351e-06, + "loss": 0.3222, + "step": 21754 + }, + { + "epoch": 1.0950759631036353, + "grad_norm": 5.885452747344971, + "learning_rate": 7.1215568041660965e-06, + "loss": 0.3322, + "step": 21755 + }, + { + "epoch": 1.0950895279435702, + "grad_norm": 5.011152744293213, + "learning_rate": 7.121419761545842e-06, + "loss": 0.2592, + "step": 21756 + }, + { + "epoch": 1.0951030927835053, + "grad_norm": 5.190232753753662, + "learning_rate": 7.121282718925586e-06, + "loss": 0.3305, + "step": 21757 + }, + { + "epoch": 1.0951166576234401, + "grad_norm": 6.460390090942383, + "learning_rate": 7.121145676305331e-06, + "loss": 0.3986, + "step": 21758 + }, + { + "epoch": 1.095130222463375, + "grad_norm": 5.464489936828613, + "learning_rate": 7.121008633685077e-06, + "loss": 0.2675, + "step": 21759 + }, + { + "epoch": 1.0951437873033099, + "grad_norm": 5.724008560180664, + "learning_rate": 7.120871591064822e-06, + "loss": 0.2665, + "step": 21760 + }, + { + "epoch": 1.0951573521432447, + "grad_norm": 5.68177604675293, + "learning_rate": 7.120734548444567e-06, + "loss": 0.2666, + "step": 21761 + }, + { + "epoch": 1.0951709169831796, + "grad_norm": 3.9782087802886963, + "learning_rate": 7.120597505824312e-06, + "loss": 0.2277, + "step": 21762 + }, + { + "epoch": 1.0951844818231145, + "grad_norm": 5.958569049835205, + "learning_rate": 7.120460463204057e-06, + "loss": 0.3642, + "step": 21763 + }, + { + "epoch": 1.0951980466630493, + "grad_norm": 5.280285358428955, + "learning_rate": 7.120323420583802e-06, + "loss": 0.2525, + "step": 21764 + }, + { + "epoch": 1.0952116115029842, + "grad_norm": 6.093027114868164, + "learning_rate": 7.1201863779635475e-06, + "loss": 0.3218, + "step": 21765 + }, + { + "epoch": 1.095225176342919, + "grad_norm": 6.282039165496826, + "learning_rate": 7.120049335343292e-06, + "loss": 0.3663, + "step": 21766 + }, + { + "epoch": 1.0952387411828541, + "grad_norm": 6.0395426750183105, + "learning_rate": 7.119912292723038e-06, + "loss": 0.3788, + "step": 21767 + }, + { + "epoch": 1.095252306022789, + "grad_norm": 5.612651824951172, + "learning_rate": 7.119775250102783e-06, + "loss": 0.1895, + "step": 21768 + }, + { + "epoch": 1.0952658708627239, + "grad_norm": 4.751681804656982, + "learning_rate": 7.119638207482527e-06, + "loss": 0.2428, + "step": 21769 + }, + { + "epoch": 1.0952794357026587, + "grad_norm": 5.645586967468262, + "learning_rate": 7.1195011648622725e-06, + "loss": 0.2635, + "step": 21770 + }, + { + "epoch": 1.0952930005425936, + "grad_norm": 5.084458351135254, + "learning_rate": 7.119364122242018e-06, + "loss": 0.3163, + "step": 21771 + }, + { + "epoch": 1.0953065653825285, + "grad_norm": 5.501005172729492, + "learning_rate": 7.119227079621763e-06, + "loss": 0.3119, + "step": 21772 + }, + { + "epoch": 1.0953201302224633, + "grad_norm": 4.956139087677002, + "learning_rate": 7.119090037001508e-06, + "loss": 0.2909, + "step": 21773 + }, + { + "epoch": 1.0953336950623982, + "grad_norm": 5.303572177886963, + "learning_rate": 7.118952994381253e-06, + "loss": 0.333, + "step": 21774 + }, + { + "epoch": 1.095347259902333, + "grad_norm": 7.465935230255127, + "learning_rate": 7.1188159517609976e-06, + "loss": 0.4038, + "step": 21775 + }, + { + "epoch": 1.0953608247422681, + "grad_norm": 4.647185325622559, + "learning_rate": 7.118678909140744e-06, + "loss": 0.3592, + "step": 21776 + }, + { + "epoch": 1.095374389582203, + "grad_norm": 5.566580772399902, + "learning_rate": 7.118541866520489e-06, + "loss": 0.2022, + "step": 21777 + }, + { + "epoch": 1.0953879544221379, + "grad_norm": 5.857273101806641, + "learning_rate": 7.118404823900233e-06, + "loss": 0.4056, + "step": 21778 + }, + { + "epoch": 1.0954015192620727, + "grad_norm": 5.0530900955200195, + "learning_rate": 7.118267781279978e-06, + "loss": 0.2539, + "step": 21779 + }, + { + "epoch": 1.0954150841020076, + "grad_norm": 6.787904262542725, + "learning_rate": 7.118130738659724e-06, + "loss": 0.323, + "step": 21780 + }, + { + "epoch": 1.0954286489419425, + "grad_norm": 4.757369518280029, + "learning_rate": 7.1179936960394695e-06, + "loss": 0.2511, + "step": 21781 + }, + { + "epoch": 1.0954422137818773, + "grad_norm": 4.725958824157715, + "learning_rate": 7.117856653419214e-06, + "loss": 0.3131, + "step": 21782 + }, + { + "epoch": 1.0954557786218122, + "grad_norm": 7.298803806304932, + "learning_rate": 7.117719610798959e-06, + "loss": 0.3942, + "step": 21783 + }, + { + "epoch": 1.095469343461747, + "grad_norm": 5.843944072723389, + "learning_rate": 7.117582568178703e-06, + "loss": 0.2739, + "step": 21784 + }, + { + "epoch": 1.095482908301682, + "grad_norm": 5.23583984375, + "learning_rate": 7.117445525558449e-06, + "loss": 0.2321, + "step": 21785 + }, + { + "epoch": 1.095496473141617, + "grad_norm": 4.879977703094482, + "learning_rate": 7.1173084829381945e-06, + "loss": 0.2201, + "step": 21786 + }, + { + "epoch": 1.0955100379815519, + "grad_norm": 4.488457679748535, + "learning_rate": 7.117171440317939e-06, + "loss": 0.2333, + "step": 21787 + }, + { + "epoch": 1.0955236028214868, + "grad_norm": 5.066187858581543, + "learning_rate": 7.117034397697684e-06, + "loss": 0.2621, + "step": 21788 + }, + { + "epoch": 1.0955371676614216, + "grad_norm": 5.316336631774902, + "learning_rate": 7.11689735507743e-06, + "loss": 0.3158, + "step": 21789 + }, + { + "epoch": 1.0955507325013565, + "grad_norm": 5.269773483276367, + "learning_rate": 7.116760312457175e-06, + "loss": 0.3086, + "step": 21790 + }, + { + "epoch": 1.0955642973412913, + "grad_norm": 5.288434982299805, + "learning_rate": 7.11662326983692e-06, + "loss": 0.2504, + "step": 21791 + }, + { + "epoch": 1.0955778621812262, + "grad_norm": 8.11507511138916, + "learning_rate": 7.116486227216665e-06, + "loss": 0.3568, + "step": 21792 + }, + { + "epoch": 1.095591427021161, + "grad_norm": 4.8930344581604, + "learning_rate": 7.116349184596411e-06, + "loss": 0.2898, + "step": 21793 + }, + { + "epoch": 1.095604991861096, + "grad_norm": 7.840904712677002, + "learning_rate": 7.116212141976155e-06, + "loss": 0.4301, + "step": 21794 + }, + { + "epoch": 1.095618556701031, + "grad_norm": 5.401640892028809, + "learning_rate": 7.1160750993559e-06, + "loss": 0.2743, + "step": 21795 + }, + { + "epoch": 1.095632121540966, + "grad_norm": 5.020091533660889, + "learning_rate": 7.1159380567356455e-06, + "loss": 0.2542, + "step": 21796 + }, + { + "epoch": 1.0956456863809008, + "grad_norm": 5.1139655113220215, + "learning_rate": 7.11580101411539e-06, + "loss": 0.2464, + "step": 21797 + }, + { + "epoch": 1.0956592512208356, + "grad_norm": 6.998824119567871, + "learning_rate": 7.115663971495136e-06, + "loss": 0.4096, + "step": 21798 + }, + { + "epoch": 1.0956728160607705, + "grad_norm": 4.885361194610596, + "learning_rate": 7.115526928874881e-06, + "loss": 0.3001, + "step": 21799 + }, + { + "epoch": 1.0956863809007054, + "grad_norm": 6.054079055786133, + "learning_rate": 7.115389886254625e-06, + "loss": 0.2576, + "step": 21800 + }, + { + "epoch": 1.0956999457406402, + "grad_norm": 8.315034866333008, + "learning_rate": 7.1152528436343705e-06, + "loss": 0.1914, + "step": 21801 + }, + { + "epoch": 1.095713510580575, + "grad_norm": 6.401425361633301, + "learning_rate": 7.1151158010141165e-06, + "loss": 0.2365, + "step": 21802 + }, + { + "epoch": 1.09572707542051, + "grad_norm": 5.466670036315918, + "learning_rate": 7.114978758393861e-06, + "loss": 0.3269, + "step": 21803 + }, + { + "epoch": 1.0957406402604448, + "grad_norm": 4.9157209396362305, + "learning_rate": 7.114841715773606e-06, + "loss": 0.1773, + "step": 21804 + }, + { + "epoch": 1.09575420510038, + "grad_norm": 5.101130485534668, + "learning_rate": 7.114704673153351e-06, + "loss": 0.3206, + "step": 21805 + }, + { + "epoch": 1.0957677699403148, + "grad_norm": 4.342901229858398, + "learning_rate": 7.114567630533097e-06, + "loss": 0.2502, + "step": 21806 + }, + { + "epoch": 1.0957813347802496, + "grad_norm": 5.737757205963135, + "learning_rate": 7.114430587912842e-06, + "loss": 0.3998, + "step": 21807 + }, + { + "epoch": 1.0957948996201845, + "grad_norm": 5.733415603637695, + "learning_rate": 7.114293545292587e-06, + "loss": 0.3498, + "step": 21808 + }, + { + "epoch": 1.0958084644601194, + "grad_norm": 3.439154863357544, + "learning_rate": 7.114156502672331e-06, + "loss": 0.2846, + "step": 21809 + }, + { + "epoch": 1.0958220293000542, + "grad_norm": 3.458254814147949, + "learning_rate": 7.114019460052077e-06, + "loss": 0.1563, + "step": 21810 + }, + { + "epoch": 1.095835594139989, + "grad_norm": 6.264972686767578, + "learning_rate": 7.113882417431822e-06, + "loss": 0.278, + "step": 21811 + }, + { + "epoch": 1.095849158979924, + "grad_norm": 5.434539794921875, + "learning_rate": 7.113745374811567e-06, + "loss": 0.2633, + "step": 21812 + }, + { + "epoch": 1.0958627238198588, + "grad_norm": 5.112143039703369, + "learning_rate": 7.113608332191312e-06, + "loss": 0.3256, + "step": 21813 + }, + { + "epoch": 1.095876288659794, + "grad_norm": 5.696715354919434, + "learning_rate": 7.113471289571057e-06, + "loss": 0.325, + "step": 21814 + }, + { + "epoch": 1.0958898534997288, + "grad_norm": 4.715493202209473, + "learning_rate": 7.113334246950803e-06, + "loss": 0.1755, + "step": 21815 + }, + { + "epoch": 1.0959034183396636, + "grad_norm": 5.683924674987793, + "learning_rate": 7.113197204330547e-06, + "loss": 0.2926, + "step": 21816 + }, + { + "epoch": 1.0959169831795985, + "grad_norm": 4.838115215301514, + "learning_rate": 7.1130601617102925e-06, + "loss": 0.2759, + "step": 21817 + }, + { + "epoch": 1.0959305480195334, + "grad_norm": 5.195065498352051, + "learning_rate": 7.112923119090037e-06, + "loss": 0.3112, + "step": 21818 + }, + { + "epoch": 1.0959441128594682, + "grad_norm": 4.726194858551025, + "learning_rate": 7.112786076469783e-06, + "loss": 0.1746, + "step": 21819 + }, + { + "epoch": 1.095957677699403, + "grad_norm": 6.28338623046875, + "learning_rate": 7.112649033849528e-06, + "loss": 0.4278, + "step": 21820 + }, + { + "epoch": 1.095971242539338, + "grad_norm": 6.062770843505859, + "learning_rate": 7.112511991229272e-06, + "loss": 0.3666, + "step": 21821 + }, + { + "epoch": 1.0959848073792728, + "grad_norm": 5.259230136871338, + "learning_rate": 7.112374948609018e-06, + "loss": 0.2991, + "step": 21822 + }, + { + "epoch": 1.0959983722192077, + "grad_norm": 4.760700225830078, + "learning_rate": 7.112237905988764e-06, + "loss": 0.2864, + "step": 21823 + }, + { + "epoch": 1.0960119370591428, + "grad_norm": 4.377516746520996, + "learning_rate": 7.112100863368509e-06, + "loss": 0.2777, + "step": 21824 + }, + { + "epoch": 1.0960255018990777, + "grad_norm": 4.047619819641113, + "learning_rate": 7.111963820748253e-06, + "loss": 0.2171, + "step": 21825 + }, + { + "epoch": 1.0960390667390125, + "grad_norm": 5.709754467010498, + "learning_rate": 7.111826778127998e-06, + "loss": 0.2732, + "step": 21826 + }, + { + "epoch": 1.0960526315789474, + "grad_norm": 5.768986225128174, + "learning_rate": 7.111689735507743e-06, + "loss": 0.2542, + "step": 21827 + }, + { + "epoch": 1.0960661964188823, + "grad_norm": 8.593416213989258, + "learning_rate": 7.111552692887489e-06, + "loss": 0.2843, + "step": 21828 + }, + { + "epoch": 1.0960797612588171, + "grad_norm": 6.337528705596924, + "learning_rate": 7.111415650267234e-06, + "loss": 0.298, + "step": 21829 + }, + { + "epoch": 1.096093326098752, + "grad_norm": 5.776905059814453, + "learning_rate": 7.111278607646979e-06, + "loss": 0.243, + "step": 21830 + }, + { + "epoch": 1.0961068909386869, + "grad_norm": 4.575159072875977, + "learning_rate": 7.111141565026723e-06, + "loss": 0.2283, + "step": 21831 + }, + { + "epoch": 1.0961204557786217, + "grad_norm": 5.220122337341309, + "learning_rate": 7.111004522406469e-06, + "loss": 0.2764, + "step": 21832 + }, + { + "epoch": 1.0961340206185568, + "grad_norm": 5.992125511169434, + "learning_rate": 7.1108674797862146e-06, + "loss": 0.2981, + "step": 21833 + }, + { + "epoch": 1.0961475854584917, + "grad_norm": 6.035717487335205, + "learning_rate": 7.110730437165959e-06, + "loss": 0.3206, + "step": 21834 + }, + { + "epoch": 1.0961611502984265, + "grad_norm": 4.702311992645264, + "learning_rate": 7.110593394545704e-06, + "loss": 0.193, + "step": 21835 + }, + { + "epoch": 1.0961747151383614, + "grad_norm": 3.7932136058807373, + "learning_rate": 7.11045635192545e-06, + "loss": 0.1963, + "step": 21836 + }, + { + "epoch": 1.0961882799782963, + "grad_norm": 5.12087345123291, + "learning_rate": 7.110319309305194e-06, + "loss": 0.2464, + "step": 21837 + }, + { + "epoch": 1.0962018448182311, + "grad_norm": 5.67014217376709, + "learning_rate": 7.11018226668494e-06, + "loss": 0.2948, + "step": 21838 + }, + { + "epoch": 1.096215409658166, + "grad_norm": 4.660823822021484, + "learning_rate": 7.110045224064685e-06, + "loss": 0.2088, + "step": 21839 + }, + { + "epoch": 1.0962289744981009, + "grad_norm": 7.104545593261719, + "learning_rate": 7.109908181444429e-06, + "loss": 0.3831, + "step": 21840 + }, + { + "epoch": 1.0962425393380357, + "grad_norm": 4.631833553314209, + "learning_rate": 7.109771138824175e-06, + "loss": 0.2089, + "step": 21841 + }, + { + "epoch": 1.0962561041779706, + "grad_norm": 6.085644245147705, + "learning_rate": 7.10963409620392e-06, + "loss": 0.2585, + "step": 21842 + }, + { + "epoch": 1.0962696690179057, + "grad_norm": 6.063904762268066, + "learning_rate": 7.109497053583665e-06, + "loss": 0.401, + "step": 21843 + }, + { + "epoch": 1.0962832338578405, + "grad_norm": 6.374446392059326, + "learning_rate": 7.10936001096341e-06, + "loss": 0.4419, + "step": 21844 + }, + { + "epoch": 1.0962967986977754, + "grad_norm": 5.825305461883545, + "learning_rate": 7.109222968343156e-06, + "loss": 0.2887, + "step": 21845 + }, + { + "epoch": 1.0963103635377103, + "grad_norm": 5.03659725189209, + "learning_rate": 7.1090859257229e-06, + "loss": 0.2187, + "step": 21846 + }, + { + "epoch": 1.0963239283776451, + "grad_norm": 3.7529942989349365, + "learning_rate": 7.108948883102645e-06, + "loss": 0.2495, + "step": 21847 + }, + { + "epoch": 1.09633749321758, + "grad_norm": 4.75079345703125, + "learning_rate": 7.1088118404823905e-06, + "loss": 0.2497, + "step": 21848 + }, + { + "epoch": 1.0963510580575149, + "grad_norm": 8.486963272094727, + "learning_rate": 7.1086747978621366e-06, + "loss": 0.571, + "step": 21849 + }, + { + "epoch": 1.0963646228974497, + "grad_norm": 5.402505397796631, + "learning_rate": 7.108537755241881e-06, + "loss": 0.2778, + "step": 21850 + }, + { + "epoch": 1.0963781877373846, + "grad_norm": 4.644189357757568, + "learning_rate": 7.108400712621626e-06, + "loss": 0.2183, + "step": 21851 + }, + { + "epoch": 1.0963917525773197, + "grad_norm": 6.447035789489746, + "learning_rate": 7.10826367000137e-06, + "loss": 0.3007, + "step": 21852 + }, + { + "epoch": 1.0964053174172546, + "grad_norm": 6.252674102783203, + "learning_rate": 7.108126627381116e-06, + "loss": 0.3374, + "step": 21853 + }, + { + "epoch": 1.0964188822571894, + "grad_norm": 5.228938579559326, + "learning_rate": 7.107989584760862e-06, + "loss": 0.3863, + "step": 21854 + }, + { + "epoch": 1.0964324470971243, + "grad_norm": 4.1080427169799805, + "learning_rate": 7.107852542140607e-06, + "loss": 0.2201, + "step": 21855 + }, + { + "epoch": 1.0964460119370592, + "grad_norm": 6.392329216003418, + "learning_rate": 7.107715499520351e-06, + "loss": 0.3332, + "step": 21856 + }, + { + "epoch": 1.096459576776994, + "grad_norm": 4.6940507888793945, + "learning_rate": 7.107578456900096e-06, + "loss": 0.1974, + "step": 21857 + }, + { + "epoch": 1.0964731416169289, + "grad_norm": 6.7361159324646, + "learning_rate": 7.107441414279842e-06, + "loss": 0.2571, + "step": 21858 + }, + { + "epoch": 1.0964867064568637, + "grad_norm": 4.99262809753418, + "learning_rate": 7.107304371659587e-06, + "loss": 0.2418, + "step": 21859 + }, + { + "epoch": 1.0965002712967986, + "grad_norm": 5.303366661071777, + "learning_rate": 7.107167329039332e-06, + "loss": 0.3257, + "step": 21860 + }, + { + "epoch": 1.0965138361367335, + "grad_norm": 5.029211521148682, + "learning_rate": 7.107030286419076e-06, + "loss": 0.4454, + "step": 21861 + }, + { + "epoch": 1.0965274009766686, + "grad_norm": 5.2054572105407715, + "learning_rate": 7.106893243798822e-06, + "loss": 0.3285, + "step": 21862 + }, + { + "epoch": 1.0965409658166034, + "grad_norm": 4.156464099884033, + "learning_rate": 7.106756201178567e-06, + "loss": 0.266, + "step": 21863 + }, + { + "epoch": 1.0965545306565383, + "grad_norm": 4.709293842315674, + "learning_rate": 7.1066191585583126e-06, + "loss": 0.2786, + "step": 21864 + }, + { + "epoch": 1.0965680954964732, + "grad_norm": 4.347158908843994, + "learning_rate": 7.106482115938057e-06, + "loss": 0.2375, + "step": 21865 + }, + { + "epoch": 1.096581660336408, + "grad_norm": 5.515162944793701, + "learning_rate": 7.106345073317802e-06, + "loss": 0.3691, + "step": 21866 + }, + { + "epoch": 1.096595225176343, + "grad_norm": 4.903019428253174, + "learning_rate": 7.106208030697548e-06, + "loss": 0.2149, + "step": 21867 + }, + { + "epoch": 1.0966087900162778, + "grad_norm": 4.383420944213867, + "learning_rate": 7.1060709880772924e-06, + "loss": 0.3164, + "step": 21868 + }, + { + "epoch": 1.0966223548562126, + "grad_norm": 6.945392608642578, + "learning_rate": 7.105933945457038e-06, + "loss": 0.4558, + "step": 21869 + }, + { + "epoch": 1.0966359196961475, + "grad_norm": 5.370769023895264, + "learning_rate": 7.105796902836783e-06, + "loss": 0.3184, + "step": 21870 + }, + { + "epoch": 1.0966494845360826, + "grad_norm": 6.197181701660156, + "learning_rate": 7.105659860216528e-06, + "loss": 0.348, + "step": 21871 + }, + { + "epoch": 1.0966630493760174, + "grad_norm": 5.699104309082031, + "learning_rate": 7.105522817596273e-06, + "loss": 0.2967, + "step": 21872 + }, + { + "epoch": 1.0966766142159523, + "grad_norm": 6.540700435638428, + "learning_rate": 7.105385774976018e-06, + "loss": 0.3593, + "step": 21873 + }, + { + "epoch": 1.0966901790558872, + "grad_norm": 5.373517990112305, + "learning_rate": 7.105248732355763e-06, + "loss": 0.4688, + "step": 21874 + }, + { + "epoch": 1.096703743895822, + "grad_norm": 4.674490928649902, + "learning_rate": 7.105111689735509e-06, + "loss": 0.2856, + "step": 21875 + }, + { + "epoch": 1.096717308735757, + "grad_norm": 3.7234368324279785, + "learning_rate": 7.104974647115254e-06, + "loss": 0.2043, + "step": 21876 + }, + { + "epoch": 1.0967308735756918, + "grad_norm": 5.828885078430176, + "learning_rate": 7.104837604494998e-06, + "loss": 0.2272, + "step": 21877 + }, + { + "epoch": 1.0967444384156266, + "grad_norm": 6.162771701812744, + "learning_rate": 7.104700561874743e-06, + "loss": 0.3946, + "step": 21878 + }, + { + "epoch": 1.0967580032555615, + "grad_norm": 6.5711669921875, + "learning_rate": 7.1045635192544885e-06, + "loss": 0.3977, + "step": 21879 + }, + { + "epoch": 1.0967715680954964, + "grad_norm": 9.085555076599121, + "learning_rate": 7.104426476634234e-06, + "loss": 0.3663, + "step": 21880 + }, + { + "epoch": 1.0967851329354315, + "grad_norm": 6.113259315490723, + "learning_rate": 7.104289434013979e-06, + "loss": 0.2512, + "step": 21881 + }, + { + "epoch": 1.0967986977753663, + "grad_norm": 4.64170503616333, + "learning_rate": 7.104152391393724e-06, + "loss": 0.2923, + "step": 21882 + }, + { + "epoch": 1.0968122626153012, + "grad_norm": 7.025604724884033, + "learning_rate": 7.104015348773468e-06, + "loss": 0.4547, + "step": 21883 + }, + { + "epoch": 1.096825827455236, + "grad_norm": 5.787920951843262, + "learning_rate": 7.1038783061532144e-06, + "loss": 0.3149, + "step": 21884 + }, + { + "epoch": 1.096839392295171, + "grad_norm": 6.5947489738464355, + "learning_rate": 7.10374126353296e-06, + "loss": 0.2428, + "step": 21885 + }, + { + "epoch": 1.0968529571351058, + "grad_norm": 6.251158714294434, + "learning_rate": 7.103604220912704e-06, + "loss": 0.3152, + "step": 21886 + }, + { + "epoch": 1.0968665219750406, + "grad_norm": 4.630299091339111, + "learning_rate": 7.103467178292449e-06, + "loss": 0.1918, + "step": 21887 + }, + { + "epoch": 1.0968800868149755, + "grad_norm": 5.7710347175598145, + "learning_rate": 7.103330135672195e-06, + "loss": 0.3759, + "step": 21888 + }, + { + "epoch": 1.0968936516549104, + "grad_norm": 7.005426406860352, + "learning_rate": 7.10319309305194e-06, + "loss": 0.4825, + "step": 21889 + }, + { + "epoch": 1.0969072164948455, + "grad_norm": 4.4448065757751465, + "learning_rate": 7.103056050431685e-06, + "loss": 0.2615, + "step": 21890 + }, + { + "epoch": 1.0969207813347803, + "grad_norm": 5.3502960205078125, + "learning_rate": 7.10291900781143e-06, + "loss": 0.2094, + "step": 21891 + }, + { + "epoch": 1.0969343461747152, + "grad_norm": 5.171859264373779, + "learning_rate": 7.102781965191176e-06, + "loss": 0.2207, + "step": 21892 + }, + { + "epoch": 1.09694791101465, + "grad_norm": 5.17323637008667, + "learning_rate": 7.10264492257092e-06, + "loss": 0.3189, + "step": 21893 + }, + { + "epoch": 1.096961475854585, + "grad_norm": 4.079834461212158, + "learning_rate": 7.102507879950665e-06, + "loss": 0.2091, + "step": 21894 + }, + { + "epoch": 1.0969750406945198, + "grad_norm": 4.925255298614502, + "learning_rate": 7.10237083733041e-06, + "loss": 0.266, + "step": 21895 + }, + { + "epoch": 1.0969886055344547, + "grad_norm": 5.414430618286133, + "learning_rate": 7.102233794710155e-06, + "loss": 0.3301, + "step": 21896 + }, + { + "epoch": 1.0970021703743895, + "grad_norm": 5.379317760467529, + "learning_rate": 7.102096752089901e-06, + "loss": 0.3646, + "step": 21897 + }, + { + "epoch": 1.0970157352143244, + "grad_norm": 6.044541835784912, + "learning_rate": 7.101959709469646e-06, + "loss": 0.2422, + "step": 21898 + }, + { + "epoch": 1.0970293000542592, + "grad_norm": 6.11765718460083, + "learning_rate": 7.1018226668493904e-06, + "loss": 0.368, + "step": 21899 + }, + { + "epoch": 1.0970428648941943, + "grad_norm": 5.839915752410889, + "learning_rate": 7.101685624229136e-06, + "loss": 0.2748, + "step": 21900 + }, + { + "epoch": 1.0970564297341292, + "grad_norm": 5.815250396728516, + "learning_rate": 7.101548581608882e-06, + "loss": 0.3469, + "step": 21901 + }, + { + "epoch": 1.097069994574064, + "grad_norm": 7.102590084075928, + "learning_rate": 7.101411538988626e-06, + "loss": 0.2708, + "step": 21902 + }, + { + "epoch": 1.097083559413999, + "grad_norm": 5.111968994140625, + "learning_rate": 7.101274496368371e-06, + "loss": 0.255, + "step": 21903 + }, + { + "epoch": 1.0970971242539338, + "grad_norm": 3.9237220287323, + "learning_rate": 7.101137453748116e-06, + "loss": 0.2772, + "step": 21904 + }, + { + "epoch": 1.0971106890938687, + "grad_norm": 5.832965850830078, + "learning_rate": 7.1010004111278615e-06, + "loss": 0.2798, + "step": 21905 + }, + { + "epoch": 1.0971242539338035, + "grad_norm": 5.260103225708008, + "learning_rate": 7.100863368507607e-06, + "loss": 0.2441, + "step": 21906 + }, + { + "epoch": 1.0971378187737384, + "grad_norm": 4.995626449584961, + "learning_rate": 7.100726325887352e-06, + "loss": 0.2046, + "step": 21907 + }, + { + "epoch": 1.0971513836136733, + "grad_norm": 4.367041110992432, + "learning_rate": 7.100589283267096e-06, + "loss": 0.2988, + "step": 21908 + }, + { + "epoch": 1.0971649484536083, + "grad_norm": 4.820573806762695, + "learning_rate": 7.100452240646841e-06, + "loss": 0.2476, + "step": 21909 + }, + { + "epoch": 1.0971785132935432, + "grad_norm": 5.395162105560303, + "learning_rate": 7.100315198026587e-06, + "loss": 0.3585, + "step": 21910 + }, + { + "epoch": 1.097192078133478, + "grad_norm": 5.740469932556152, + "learning_rate": 7.100178155406332e-06, + "loss": 0.3639, + "step": 21911 + }, + { + "epoch": 1.097205642973413, + "grad_norm": 5.788589000701904, + "learning_rate": 7.100041112786077e-06, + "loss": 0.3327, + "step": 21912 + }, + { + "epoch": 1.0972192078133478, + "grad_norm": 4.692122459411621, + "learning_rate": 7.099904070165822e-06, + "loss": 0.2362, + "step": 21913 + }, + { + "epoch": 1.0972327726532827, + "grad_norm": 4.679430961608887, + "learning_rate": 7.099767027545567e-06, + "loss": 0.2146, + "step": 21914 + }, + { + "epoch": 1.0972463374932175, + "grad_norm": 5.675132751464844, + "learning_rate": 7.0996299849253124e-06, + "loss": 0.2711, + "step": 21915 + }, + { + "epoch": 1.0972599023331524, + "grad_norm": 5.77246618270874, + "learning_rate": 7.099492942305058e-06, + "loss": 0.3017, + "step": 21916 + }, + { + "epoch": 1.0972734671730873, + "grad_norm": 4.728547096252441, + "learning_rate": 7.099355899684802e-06, + "loss": 0.1865, + "step": 21917 + }, + { + "epoch": 1.0972870320130221, + "grad_norm": 6.9413957595825195, + "learning_rate": 7.099218857064548e-06, + "loss": 0.176, + "step": 21918 + }, + { + "epoch": 1.0973005968529572, + "grad_norm": 5.238336563110352, + "learning_rate": 7.099081814444293e-06, + "loss": 0.3182, + "step": 21919 + }, + { + "epoch": 1.097314161692892, + "grad_norm": 6.243986129760742, + "learning_rate": 7.0989447718240375e-06, + "loss": 0.2622, + "step": 21920 + }, + { + "epoch": 1.097327726532827, + "grad_norm": 4.38373327255249, + "learning_rate": 7.098807729203783e-06, + "loss": 0.2116, + "step": 21921 + }, + { + "epoch": 1.0973412913727618, + "grad_norm": 5.17917013168335, + "learning_rate": 7.098670686583528e-06, + "loss": 0.3906, + "step": 21922 + }, + { + "epoch": 1.0973548562126967, + "grad_norm": 5.4511919021606445, + "learning_rate": 7.098533643963274e-06, + "loss": 0.2815, + "step": 21923 + }, + { + "epoch": 1.0973684210526315, + "grad_norm": 6.7361555099487305, + "learning_rate": 7.098396601343018e-06, + "loss": 0.3715, + "step": 21924 + }, + { + "epoch": 1.0973819858925664, + "grad_norm": 5.548601150512695, + "learning_rate": 7.098259558722763e-06, + "loss": 0.18, + "step": 21925 + }, + { + "epoch": 1.0973955507325013, + "grad_norm": 4.854234218597412, + "learning_rate": 7.098122516102508e-06, + "loss": 0.2584, + "step": 21926 + }, + { + "epoch": 1.0974091155724361, + "grad_norm": 4.370520114898682, + "learning_rate": 7.097985473482254e-06, + "loss": 0.2357, + "step": 21927 + }, + { + "epoch": 1.0974226804123712, + "grad_norm": 6.06273889541626, + "learning_rate": 7.097848430861999e-06, + "loss": 0.3872, + "step": 21928 + }, + { + "epoch": 1.097436245252306, + "grad_norm": 5.284483909606934, + "learning_rate": 7.097711388241743e-06, + "loss": 0.3071, + "step": 21929 + }, + { + "epoch": 1.097449810092241, + "grad_norm": 5.661416530609131, + "learning_rate": 7.0975743456214884e-06, + "loss": 0.2876, + "step": 21930 + }, + { + "epoch": 1.0974633749321758, + "grad_norm": 5.436668872833252, + "learning_rate": 7.0974373030012345e-06, + "loss": 0.342, + "step": 21931 + }, + { + "epoch": 1.0974769397721107, + "grad_norm": 6.1207380294799805, + "learning_rate": 7.09730026038098e-06, + "loss": 0.2634, + "step": 21932 + }, + { + "epoch": 1.0974905046120456, + "grad_norm": 5.55327844619751, + "learning_rate": 7.097163217760724e-06, + "loss": 0.2975, + "step": 21933 + }, + { + "epoch": 1.0975040694519804, + "grad_norm": 5.932139873504639, + "learning_rate": 7.097026175140469e-06, + "loss": 0.288, + "step": 21934 + }, + { + "epoch": 1.0975176342919153, + "grad_norm": 4.78670597076416, + "learning_rate": 7.0968891325202135e-06, + "loss": 0.2457, + "step": 21935 + }, + { + "epoch": 1.0975311991318502, + "grad_norm": 4.544320106506348, + "learning_rate": 7.0967520898999595e-06, + "loss": 0.1344, + "step": 21936 + }, + { + "epoch": 1.097544763971785, + "grad_norm": 9.44189453125, + "learning_rate": 7.096615047279705e-06, + "loss": 0.3916, + "step": 21937 + }, + { + "epoch": 1.09755832881172, + "grad_norm": 5.323638916015625, + "learning_rate": 7.09647800465945e-06, + "loss": 0.2482, + "step": 21938 + }, + { + "epoch": 1.097571893651655, + "grad_norm": 7.131023406982422, + "learning_rate": 7.096340962039194e-06, + "loss": 0.5258, + "step": 21939 + }, + { + "epoch": 1.0975854584915898, + "grad_norm": 5.902606964111328, + "learning_rate": 7.09620391941894e-06, + "loss": 0.2223, + "step": 21940 + }, + { + "epoch": 1.0975990233315247, + "grad_norm": 6.023451328277588, + "learning_rate": 7.096066876798685e-06, + "loss": 0.2504, + "step": 21941 + }, + { + "epoch": 1.0976125881714596, + "grad_norm": 5.548132419586182, + "learning_rate": 7.09592983417843e-06, + "loss": 0.4556, + "step": 21942 + }, + { + "epoch": 1.0976261530113944, + "grad_norm": 3.941596508026123, + "learning_rate": 7.095792791558175e-06, + "loss": 0.2099, + "step": 21943 + }, + { + "epoch": 1.0976397178513293, + "grad_norm": 4.541030406951904, + "learning_rate": 7.095655748937921e-06, + "loss": 0.3005, + "step": 21944 + }, + { + "epoch": 1.0976532826912642, + "grad_norm": 6.7328338623046875, + "learning_rate": 7.095518706317665e-06, + "loss": 0.4688, + "step": 21945 + }, + { + "epoch": 1.097666847531199, + "grad_norm": 5.827572345733643, + "learning_rate": 7.0953816636974104e-06, + "loss": 0.3059, + "step": 21946 + }, + { + "epoch": 1.0976804123711341, + "grad_norm": 4.558542728424072, + "learning_rate": 7.095244621077156e-06, + "loss": 0.2466, + "step": 21947 + }, + { + "epoch": 1.097693977211069, + "grad_norm": 5.437931537628174, + "learning_rate": 7.0951075784569e-06, + "loss": 0.224, + "step": 21948 + }, + { + "epoch": 1.0977075420510038, + "grad_norm": 7.340507984161377, + "learning_rate": 7.094970535836646e-06, + "loss": 0.4616, + "step": 21949 + }, + { + "epoch": 1.0977211068909387, + "grad_norm": 5.416622161865234, + "learning_rate": 7.094833493216391e-06, + "loss": 0.2926, + "step": 21950 + }, + { + "epoch": 1.0977346717308736, + "grad_norm": 7.331825256347656, + "learning_rate": 7.0946964505961355e-06, + "loss": 0.2905, + "step": 21951 + }, + { + "epoch": 1.0977482365708084, + "grad_norm": 5.53997802734375, + "learning_rate": 7.094559407975881e-06, + "loss": 0.4835, + "step": 21952 + }, + { + "epoch": 1.0977618014107433, + "grad_norm": 6.637446880340576, + "learning_rate": 7.094422365355627e-06, + "loss": 0.3296, + "step": 21953 + }, + { + "epoch": 1.0977753662506782, + "grad_norm": 4.948609828948975, + "learning_rate": 7.094285322735371e-06, + "loss": 0.2573, + "step": 21954 + }, + { + "epoch": 1.097788931090613, + "grad_norm": 4.662065505981445, + "learning_rate": 7.094148280115116e-06, + "loss": 0.2171, + "step": 21955 + }, + { + "epoch": 1.097802495930548, + "grad_norm": 6.587978839874268, + "learning_rate": 7.094011237494861e-06, + "loss": 0.3024, + "step": 21956 + }, + { + "epoch": 1.097816060770483, + "grad_norm": 4.725028991699219, + "learning_rate": 7.093874194874607e-06, + "loss": 0.2203, + "step": 21957 + }, + { + "epoch": 1.0978296256104179, + "grad_norm": 4.516331672668457, + "learning_rate": 7.093737152254352e-06, + "loss": 0.2094, + "step": 21958 + }, + { + "epoch": 1.0978431904503527, + "grad_norm": 4.930362701416016, + "learning_rate": 7.093600109634097e-06, + "loss": 0.1982, + "step": 21959 + }, + { + "epoch": 1.0978567552902876, + "grad_norm": 6.993490695953369, + "learning_rate": 7.093463067013841e-06, + "loss": 0.3035, + "step": 21960 + }, + { + "epoch": 1.0978703201302225, + "grad_norm": 5.078475475311279, + "learning_rate": 7.093326024393587e-06, + "loss": 0.2113, + "step": 21961 + }, + { + "epoch": 1.0978838849701573, + "grad_norm": 5.025345802307129, + "learning_rate": 7.0931889817733325e-06, + "loss": 0.2483, + "step": 21962 + }, + { + "epoch": 1.0978974498100922, + "grad_norm": 4.982969284057617, + "learning_rate": 7.093051939153077e-06, + "loss": 0.3559, + "step": 21963 + }, + { + "epoch": 1.097911014650027, + "grad_norm": 4.626302242279053, + "learning_rate": 7.092914896532822e-06, + "loss": 0.2402, + "step": 21964 + }, + { + "epoch": 1.097924579489962, + "grad_norm": 5.117007255554199, + "learning_rate": 7.092777853912567e-06, + "loss": 0.2313, + "step": 21965 + }, + { + "epoch": 1.097938144329897, + "grad_norm": 4.410124778747559, + "learning_rate": 7.092640811292313e-06, + "loss": 0.2628, + "step": 21966 + }, + { + "epoch": 1.0979517091698319, + "grad_norm": 5.511808395385742, + "learning_rate": 7.0925037686720575e-06, + "loss": 0.343, + "step": 21967 + }, + { + "epoch": 1.0979652740097667, + "grad_norm": 5.07027530670166, + "learning_rate": 7.092366726051803e-06, + "loss": 0.2545, + "step": 21968 + }, + { + "epoch": 1.0979788388497016, + "grad_norm": 5.204248905181885, + "learning_rate": 7.092229683431547e-06, + "loss": 0.2686, + "step": 21969 + }, + { + "epoch": 1.0979924036896365, + "grad_norm": 4.596999168395996, + "learning_rate": 7.092092640811293e-06, + "loss": 0.1316, + "step": 21970 + }, + { + "epoch": 1.0980059685295713, + "grad_norm": 3.5792722702026367, + "learning_rate": 7.091955598191038e-06, + "loss": 0.2033, + "step": 21971 + }, + { + "epoch": 1.0980195333695062, + "grad_norm": 4.716969966888428, + "learning_rate": 7.091818555570783e-06, + "loss": 0.2107, + "step": 21972 + }, + { + "epoch": 1.098033098209441, + "grad_norm": 3.934905529022217, + "learning_rate": 7.091681512950528e-06, + "loss": 0.1451, + "step": 21973 + }, + { + "epoch": 1.098046663049376, + "grad_norm": 4.779356002807617, + "learning_rate": 7.091544470330274e-06, + "loss": 0.1758, + "step": 21974 + }, + { + "epoch": 1.0980602278893108, + "grad_norm": 4.440285682678223, + "learning_rate": 7.091407427710019e-06, + "loss": 0.1977, + "step": 21975 + }, + { + "epoch": 1.0980737927292459, + "grad_norm": 4.554553031921387, + "learning_rate": 7.091270385089763e-06, + "loss": 0.2352, + "step": 21976 + }, + { + "epoch": 1.0980873575691807, + "grad_norm": 9.013782501220703, + "learning_rate": 7.0911333424695085e-06, + "loss": 0.3216, + "step": 21977 + }, + { + "epoch": 1.0981009224091156, + "grad_norm": 4.794672012329102, + "learning_rate": 7.090996299849253e-06, + "loss": 0.2362, + "step": 21978 + }, + { + "epoch": 1.0981144872490505, + "grad_norm": 5.105018138885498, + "learning_rate": 7.090859257228999e-06, + "loss": 0.1938, + "step": 21979 + }, + { + "epoch": 1.0981280520889853, + "grad_norm": 4.154148578643799, + "learning_rate": 7.090722214608744e-06, + "loss": 0.1308, + "step": 21980 + }, + { + "epoch": 1.0981416169289202, + "grad_norm": 3.164353609085083, + "learning_rate": 7.090585171988489e-06, + "loss": 0.1331, + "step": 21981 + }, + { + "epoch": 1.098155181768855, + "grad_norm": 4.897207736968994, + "learning_rate": 7.0904481293682335e-06, + "loss": 0.1307, + "step": 21982 + }, + { + "epoch": 1.09816874660879, + "grad_norm": 4.57068395614624, + "learning_rate": 7.0903110867479795e-06, + "loss": 0.239, + "step": 21983 + }, + { + "epoch": 1.0981823114487248, + "grad_norm": 4.421357154846191, + "learning_rate": 7.090174044127725e-06, + "loss": 0.1801, + "step": 21984 + }, + { + "epoch": 1.09819587628866, + "grad_norm": 4.769463062286377, + "learning_rate": 7.090037001507469e-06, + "loss": 0.2577, + "step": 21985 + }, + { + "epoch": 1.0982094411285948, + "grad_norm": 5.41972541809082, + "learning_rate": 7.089899958887214e-06, + "loss": 0.3268, + "step": 21986 + }, + { + "epoch": 1.0982230059685296, + "grad_norm": 5.203128337860107, + "learning_rate": 7.08976291626696e-06, + "loss": 0.2286, + "step": 21987 + }, + { + "epoch": 1.0982365708084645, + "grad_norm": 3.3623476028442383, + "learning_rate": 7.0896258736467046e-06, + "loss": 0.1314, + "step": 21988 + }, + { + "epoch": 1.0982501356483994, + "grad_norm": 5.0637946128845215, + "learning_rate": 7.08948883102645e-06, + "loss": 0.1994, + "step": 21989 + }, + { + "epoch": 1.0982637004883342, + "grad_norm": 3.9108431339263916, + "learning_rate": 7.089351788406195e-06, + "loss": 0.2, + "step": 21990 + }, + { + "epoch": 1.098277265328269, + "grad_norm": 4.418205738067627, + "learning_rate": 7.089214745785939e-06, + "loss": 0.254, + "step": 21991 + }, + { + "epoch": 1.098290830168204, + "grad_norm": 3.90482234954834, + "learning_rate": 7.089077703165685e-06, + "loss": 0.1579, + "step": 21992 + }, + { + "epoch": 1.0983043950081388, + "grad_norm": 4.319961071014404, + "learning_rate": 7.0889406605454305e-06, + "loss": 0.1928, + "step": 21993 + }, + { + "epoch": 1.0983179598480737, + "grad_norm": 6.883055210113525, + "learning_rate": 7.088803617925175e-06, + "loss": 0.3061, + "step": 21994 + }, + { + "epoch": 1.0983315246880088, + "grad_norm": 3.6892881393432617, + "learning_rate": 7.08866657530492e-06, + "loss": 0.2147, + "step": 21995 + }, + { + "epoch": 1.0983450895279436, + "grad_norm": 3.2383532524108887, + "learning_rate": 7.088529532684666e-06, + "loss": 0.1364, + "step": 21996 + }, + { + "epoch": 1.0983586543678785, + "grad_norm": 5.053766250610352, + "learning_rate": 7.088392490064411e-06, + "loss": 0.1534, + "step": 21997 + }, + { + "epoch": 1.0983722192078134, + "grad_norm": 3.5844943523406982, + "learning_rate": 7.0882554474441555e-06, + "loss": 0.122, + "step": 21998 + }, + { + "epoch": 1.0983857840477482, + "grad_norm": 5.5424041748046875, + "learning_rate": 7.088118404823901e-06, + "loss": 0.2348, + "step": 21999 + }, + { + "epoch": 1.098399348887683, + "grad_norm": 8.672648429870605, + "learning_rate": 7.087981362203647e-06, + "loss": 0.2669, + "step": 22000 + }, + { + "epoch": 1.098412913727618, + "grad_norm": 5.5105366706848145, + "learning_rate": 7.087844319583391e-06, + "loss": 0.2352, + "step": 22001 + }, + { + "epoch": 1.0984264785675528, + "grad_norm": 5.37183952331543, + "learning_rate": 7.087707276963136e-06, + "loss": 0.1939, + "step": 22002 + }, + { + "epoch": 1.098440043407488, + "grad_norm": 4.509417533874512, + "learning_rate": 7.0875702343428806e-06, + "loss": 0.1738, + "step": 22003 + }, + { + "epoch": 1.0984536082474228, + "grad_norm": 4.333771228790283, + "learning_rate": 7.087433191722626e-06, + "loss": 0.1327, + "step": 22004 + }, + { + "epoch": 1.0984671730873576, + "grad_norm": 6.872291564941406, + "learning_rate": 7.087296149102372e-06, + "loss": 0.3028, + "step": 22005 + }, + { + "epoch": 1.0984807379272925, + "grad_norm": 6.644675254821777, + "learning_rate": 7.087159106482117e-06, + "loss": 0.2471, + "step": 22006 + }, + { + "epoch": 1.0984943027672274, + "grad_norm": 4.386356830596924, + "learning_rate": 7.087022063861861e-06, + "loss": 0.257, + "step": 22007 + }, + { + "epoch": 1.0985078676071622, + "grad_norm": 5.4162421226501465, + "learning_rate": 7.0868850212416065e-06, + "loss": 0.2661, + "step": 22008 + }, + { + "epoch": 1.098521432447097, + "grad_norm": 4.781247615814209, + "learning_rate": 7.0867479786213525e-06, + "loss": 0.1812, + "step": 22009 + }, + { + "epoch": 1.098534997287032, + "grad_norm": 6.11005973815918, + "learning_rate": 7.086610936001097e-06, + "loss": 0.3076, + "step": 22010 + }, + { + "epoch": 1.0985485621269668, + "grad_norm": 4.340310096740723, + "learning_rate": 7.086473893380842e-06, + "loss": 0.1783, + "step": 22011 + }, + { + "epoch": 1.0985621269669017, + "grad_norm": 3.096604585647583, + "learning_rate": 7.086336850760586e-06, + "loss": 0.1358, + "step": 22012 + }, + { + "epoch": 1.0985756918068366, + "grad_norm": 3.534686803817749, + "learning_rate": 7.086199808140332e-06, + "loss": 0.1815, + "step": 22013 + }, + { + "epoch": 1.0985892566467717, + "grad_norm": 5.118781089782715, + "learning_rate": 7.0860627655200775e-06, + "loss": 0.2383, + "step": 22014 + }, + { + "epoch": 1.0986028214867065, + "grad_norm": 6.710792064666748, + "learning_rate": 7.085925722899823e-06, + "loss": 0.2409, + "step": 22015 + }, + { + "epoch": 1.0986163863266414, + "grad_norm": 5.315142631530762, + "learning_rate": 7.085788680279567e-06, + "loss": 0.1972, + "step": 22016 + }, + { + "epoch": 1.0986299511665762, + "grad_norm": 5.095634937286377, + "learning_rate": 7.085651637659312e-06, + "loss": 0.2664, + "step": 22017 + }, + { + "epoch": 1.0986435160065111, + "grad_norm": 4.6775288581848145, + "learning_rate": 7.085514595039058e-06, + "loss": 0.2162, + "step": 22018 + }, + { + "epoch": 1.098657080846446, + "grad_norm": 7.718224048614502, + "learning_rate": 7.085377552418803e-06, + "loss": 0.2778, + "step": 22019 + }, + { + "epoch": 1.0986706456863808, + "grad_norm": 4.544410705566406, + "learning_rate": 7.085240509798548e-06, + "loss": 0.2176, + "step": 22020 + }, + { + "epoch": 1.0986842105263157, + "grad_norm": 5.925527095794678, + "learning_rate": 7.085103467178293e-06, + "loss": 0.2468, + "step": 22021 + }, + { + "epoch": 1.0986977753662508, + "grad_norm": 4.489197254180908, + "learning_rate": 7.084966424558038e-06, + "loss": 0.2264, + "step": 22022 + }, + { + "epoch": 1.0987113402061857, + "grad_norm": 3.4569854736328125, + "learning_rate": 7.084829381937783e-06, + "loss": 0.1058, + "step": 22023 + }, + { + "epoch": 1.0987249050461205, + "grad_norm": 3.3546905517578125, + "learning_rate": 7.0846923393175285e-06, + "loss": 0.1832, + "step": 22024 + }, + { + "epoch": 1.0987384698860554, + "grad_norm": 4.6452155113220215, + "learning_rate": 7.084555296697273e-06, + "loss": 0.3237, + "step": 22025 + }, + { + "epoch": 1.0987520347259903, + "grad_norm": 5.067624568939209, + "learning_rate": 7.084418254077019e-06, + "loss": 0.2622, + "step": 22026 + }, + { + "epoch": 1.0987655995659251, + "grad_norm": 4.614103317260742, + "learning_rate": 7.084281211456764e-06, + "loss": 0.1438, + "step": 22027 + }, + { + "epoch": 1.09877916440586, + "grad_norm": 5.110982418060303, + "learning_rate": 7.084144168836508e-06, + "loss": 0.2774, + "step": 22028 + }, + { + "epoch": 1.0987927292457949, + "grad_norm": 5.170418739318848, + "learning_rate": 7.0840071262162535e-06, + "loss": 0.2327, + "step": 22029 + }, + { + "epoch": 1.0988062940857297, + "grad_norm": 4.015316009521484, + "learning_rate": 7.0838700835959995e-06, + "loss": 0.212, + "step": 22030 + }, + { + "epoch": 1.0988198589256646, + "grad_norm": 5.825281620025635, + "learning_rate": 7.083733040975745e-06, + "loss": 0.2011, + "step": 22031 + }, + { + "epoch": 1.0988334237655994, + "grad_norm": 5.208900451660156, + "learning_rate": 7.083595998355489e-06, + "loss": 0.2893, + "step": 22032 + }, + { + "epoch": 1.0988469886055345, + "grad_norm": 5.186286449432373, + "learning_rate": 7.083458955735234e-06, + "loss": 0.2302, + "step": 22033 + }, + { + "epoch": 1.0988605534454694, + "grad_norm": 6.785554885864258, + "learning_rate": 7.0833219131149786e-06, + "loss": 0.2283, + "step": 22034 + }, + { + "epoch": 1.0988741182854043, + "grad_norm": 5.377516269683838, + "learning_rate": 7.083184870494725e-06, + "loss": 0.2408, + "step": 22035 + }, + { + "epoch": 1.0988876831253391, + "grad_norm": 5.191145420074463, + "learning_rate": 7.08304782787447e-06, + "loss": 0.2209, + "step": 22036 + }, + { + "epoch": 1.098901247965274, + "grad_norm": 5.480547904968262, + "learning_rate": 7.082910785254214e-06, + "loss": 0.2534, + "step": 22037 + }, + { + "epoch": 1.0989148128052089, + "grad_norm": 3.6922521591186523, + "learning_rate": 7.082773742633959e-06, + "loss": 0.1698, + "step": 22038 + }, + { + "epoch": 1.0989283776451437, + "grad_norm": 7.049023151397705, + "learning_rate": 7.082636700013705e-06, + "loss": 0.3222, + "step": 22039 + }, + { + "epoch": 1.0989419424850786, + "grad_norm": 4.892446517944336, + "learning_rate": 7.0824996573934505e-06, + "loss": 0.3182, + "step": 22040 + }, + { + "epoch": 1.0989555073250137, + "grad_norm": 4.981447219848633, + "learning_rate": 7.082362614773195e-06, + "loss": 0.2185, + "step": 22041 + }, + { + "epoch": 1.0989690721649485, + "grad_norm": 4.991519927978516, + "learning_rate": 7.08222557215294e-06, + "loss": 0.2014, + "step": 22042 + }, + { + "epoch": 1.0989826370048834, + "grad_norm": 3.7897260189056396, + "learning_rate": 7.082088529532686e-06, + "loss": 0.1864, + "step": 22043 + }, + { + "epoch": 1.0989962018448183, + "grad_norm": 3.990635633468628, + "learning_rate": 7.08195148691243e-06, + "loss": 0.1557, + "step": 22044 + }, + { + "epoch": 1.0990097666847531, + "grad_norm": 4.307246685028076, + "learning_rate": 7.0818144442921755e-06, + "loss": 0.1897, + "step": 22045 + }, + { + "epoch": 1.099023331524688, + "grad_norm": 5.864954471588135, + "learning_rate": 7.081677401671921e-06, + "loss": 0.2434, + "step": 22046 + }, + { + "epoch": 1.0990368963646229, + "grad_norm": 4.818382740020752, + "learning_rate": 7.081540359051665e-06, + "loss": 0.2668, + "step": 22047 + }, + { + "epoch": 1.0990504612045577, + "grad_norm": 5.57056999206543, + "learning_rate": 7.081403316431411e-06, + "loss": 0.3629, + "step": 22048 + }, + { + "epoch": 1.0990640260444926, + "grad_norm": 5.259387969970703, + "learning_rate": 7.081266273811156e-06, + "loss": 0.3028, + "step": 22049 + }, + { + "epoch": 1.0990775908844275, + "grad_norm": 4.536477565765381, + "learning_rate": 7.081129231190901e-06, + "loss": 0.1403, + "step": 22050 + }, + { + "epoch": 1.0990911557243626, + "grad_norm": 4.863345146179199, + "learning_rate": 7.080992188570646e-06, + "loss": 0.2596, + "step": 22051 + }, + { + "epoch": 1.0991047205642974, + "grad_norm": 4.067403793334961, + "learning_rate": 7.080855145950392e-06, + "loss": 0.2096, + "step": 22052 + }, + { + "epoch": 1.0991182854042323, + "grad_norm": 5.70448637008667, + "learning_rate": 7.080718103330136e-06, + "loss": 0.2528, + "step": 22053 + }, + { + "epoch": 1.0991318502441672, + "grad_norm": 4.08075475692749, + "learning_rate": 7.080581060709881e-06, + "loss": 0.2145, + "step": 22054 + }, + { + "epoch": 1.099145415084102, + "grad_norm": 5.807925224304199, + "learning_rate": 7.0804440180896265e-06, + "loss": 0.1875, + "step": 22055 + }, + { + "epoch": 1.0991589799240369, + "grad_norm": 4.308529376983643, + "learning_rate": 7.080306975469372e-06, + "loss": 0.1657, + "step": 22056 + }, + { + "epoch": 1.0991725447639717, + "grad_norm": 5.091372489929199, + "learning_rate": 7.080169932849117e-06, + "loss": 0.3183, + "step": 22057 + }, + { + "epoch": 1.0991861096039066, + "grad_norm": 3.929197072982788, + "learning_rate": 7.080032890228862e-06, + "loss": 0.1785, + "step": 22058 + }, + { + "epoch": 1.0991996744438415, + "grad_norm": 3.6779351234436035, + "learning_rate": 7.079895847608606e-06, + "loss": 0.1558, + "step": 22059 + }, + { + "epoch": 1.0992132392837766, + "grad_norm": 4.615276336669922, + "learning_rate": 7.0797588049883515e-06, + "loss": 0.1805, + "step": 22060 + }, + { + "epoch": 1.0992268041237114, + "grad_norm": 3.1436052322387695, + "learning_rate": 7.0796217623680976e-06, + "loss": 0.1215, + "step": 22061 + }, + { + "epoch": 1.0992403689636463, + "grad_norm": 3.402057647705078, + "learning_rate": 7.079484719747842e-06, + "loss": 0.1625, + "step": 22062 + }, + { + "epoch": 1.0992539338035812, + "grad_norm": 4.0770344734191895, + "learning_rate": 7.079347677127587e-06, + "loss": 0.2617, + "step": 22063 + }, + { + "epoch": 1.099267498643516, + "grad_norm": 5.355093002319336, + "learning_rate": 7.079210634507332e-06, + "loss": 0.1839, + "step": 22064 + }, + { + "epoch": 1.099281063483451, + "grad_norm": 3.7338948249816895, + "learning_rate": 7.079073591887078e-06, + "loss": 0.1587, + "step": 22065 + }, + { + "epoch": 1.0992946283233858, + "grad_norm": 4.2178544998168945, + "learning_rate": 7.078936549266823e-06, + "loss": 0.214, + "step": 22066 + }, + { + "epoch": 1.0993081931633206, + "grad_norm": 4.239331245422363, + "learning_rate": 7.078799506646568e-06, + "loss": 0.1866, + "step": 22067 + }, + { + "epoch": 1.0993217580032555, + "grad_norm": 4.7378435134887695, + "learning_rate": 7.078662464026312e-06, + "loss": 0.2028, + "step": 22068 + }, + { + "epoch": 1.0993353228431904, + "grad_norm": 5.0218377113342285, + "learning_rate": 7.078525421406058e-06, + "loss": 0.1978, + "step": 22069 + }, + { + "epoch": 1.0993488876831254, + "grad_norm": 5.5441412925720215, + "learning_rate": 7.078388378785803e-06, + "loss": 0.2596, + "step": 22070 + }, + { + "epoch": 1.0993624525230603, + "grad_norm": 6.420278072357178, + "learning_rate": 7.078251336165548e-06, + "loss": 0.3521, + "step": 22071 + }, + { + "epoch": 1.0993760173629952, + "grad_norm": 4.505357265472412, + "learning_rate": 7.078114293545293e-06, + "loss": 0.1518, + "step": 22072 + }, + { + "epoch": 1.09938958220293, + "grad_norm": 4.085700035095215, + "learning_rate": 7.077977250925038e-06, + "loss": 0.1779, + "step": 22073 + }, + { + "epoch": 1.099403147042865, + "grad_norm": 4.324955940246582, + "learning_rate": 7.077840208304784e-06, + "loss": 0.1479, + "step": 22074 + }, + { + "epoch": 1.0994167118827998, + "grad_norm": 4.7357497215271, + "learning_rate": 7.077703165684528e-06, + "loss": 0.26, + "step": 22075 + }, + { + "epoch": 1.0994302767227346, + "grad_norm": 7.445908069610596, + "learning_rate": 7.0775661230642735e-06, + "loss": 0.285, + "step": 22076 + }, + { + "epoch": 1.0994438415626695, + "grad_norm": 3.877079486846924, + "learning_rate": 7.077429080444018e-06, + "loss": 0.2019, + "step": 22077 + }, + { + "epoch": 1.0994574064026044, + "grad_norm": 6.2052130699157715, + "learning_rate": 7.077292037823764e-06, + "loss": 0.2593, + "step": 22078 + }, + { + "epoch": 1.0994709712425395, + "grad_norm": 5.127363681793213, + "learning_rate": 7.077154995203509e-06, + "loss": 0.2063, + "step": 22079 + }, + { + "epoch": 1.0994845360824743, + "grad_norm": 4.208507061004639, + "learning_rate": 7.077017952583254e-06, + "loss": 0.2029, + "step": 22080 + }, + { + "epoch": 1.0994981009224092, + "grad_norm": 5.0753583908081055, + "learning_rate": 7.076880909962999e-06, + "loss": 0.1772, + "step": 22081 + }, + { + "epoch": 1.099511665762344, + "grad_norm": 5.489319324493408, + "learning_rate": 7.076743867342745e-06, + "loss": 0.2148, + "step": 22082 + }, + { + "epoch": 1.099525230602279, + "grad_norm": 5.355477809906006, + "learning_rate": 7.07660682472249e-06, + "loss": 0.1952, + "step": 22083 + }, + { + "epoch": 1.0995387954422138, + "grad_norm": 3.8231754302978516, + "learning_rate": 7.076469782102234e-06, + "loss": 0.1841, + "step": 22084 + }, + { + "epoch": 1.0995523602821486, + "grad_norm": 3.9465901851654053, + "learning_rate": 7.076332739481979e-06, + "loss": 0.1763, + "step": 22085 + }, + { + "epoch": 1.0995659251220835, + "grad_norm": 6.083406448364258, + "learning_rate": 7.076195696861724e-06, + "loss": 0.4208, + "step": 22086 + }, + { + "epoch": 1.0995794899620184, + "grad_norm": 5.1996049880981445, + "learning_rate": 7.07605865424147e-06, + "loss": 0.2452, + "step": 22087 + }, + { + "epoch": 1.0995930548019532, + "grad_norm": 5.037075519561768, + "learning_rate": 7.075921611621215e-06, + "loss": 0.2333, + "step": 22088 + }, + { + "epoch": 1.0996066196418883, + "grad_norm": 4.5194411277771, + "learning_rate": 7.07578456900096e-06, + "loss": 0.2654, + "step": 22089 + }, + { + "epoch": 1.0996201844818232, + "grad_norm": 3.3906688690185547, + "learning_rate": 7.075647526380704e-06, + "loss": 0.1701, + "step": 22090 + }, + { + "epoch": 1.099633749321758, + "grad_norm": 3.803577184677124, + "learning_rate": 7.07551048376045e-06, + "loss": 0.1504, + "step": 22091 + }, + { + "epoch": 1.099647314161693, + "grad_norm": 5.12436580657959, + "learning_rate": 7.0753734411401956e-06, + "loss": 0.2718, + "step": 22092 + }, + { + "epoch": 1.0996608790016278, + "grad_norm": 3.720989465713501, + "learning_rate": 7.07523639851994e-06, + "loss": 0.1442, + "step": 22093 + }, + { + "epoch": 1.0996744438415627, + "grad_norm": 5.490756511688232, + "learning_rate": 7.075099355899685e-06, + "loss": 0.2839, + "step": 22094 + }, + { + "epoch": 1.0996880086814975, + "grad_norm": 4.709422588348389, + "learning_rate": 7.074962313279431e-06, + "loss": 0.2391, + "step": 22095 + }, + { + "epoch": 1.0997015735214324, + "grad_norm": 5.168466091156006, + "learning_rate": 7.074825270659175e-06, + "loss": 0.2952, + "step": 22096 + }, + { + "epoch": 1.0997151383613673, + "grad_norm": 4.468676567077637, + "learning_rate": 7.074688228038921e-06, + "loss": 0.1563, + "step": 22097 + }, + { + "epoch": 1.0997287032013023, + "grad_norm": 3.872215986251831, + "learning_rate": 7.074551185418666e-06, + "loss": 0.1977, + "step": 22098 + }, + { + "epoch": 1.0997422680412372, + "grad_norm": 5.9131669998168945, + "learning_rate": 7.074414142798412e-06, + "loss": 0.3047, + "step": 22099 + }, + { + "epoch": 1.099755832881172, + "grad_norm": 7.269438743591309, + "learning_rate": 7.074277100178156e-06, + "loss": 0.4547, + "step": 22100 + }, + { + "epoch": 1.099769397721107, + "grad_norm": 4.730832576751709, + "learning_rate": 7.074140057557901e-06, + "loss": 0.2086, + "step": 22101 + }, + { + "epoch": 1.0997829625610418, + "grad_norm": 6.776638031005859, + "learning_rate": 7.074003014937646e-06, + "loss": 0.2663, + "step": 22102 + }, + { + "epoch": 1.0997965274009767, + "grad_norm": 6.446747779846191, + "learning_rate": 7.073865972317391e-06, + "loss": 0.2668, + "step": 22103 + }, + { + "epoch": 1.0998100922409115, + "grad_norm": 5.3985161781311035, + "learning_rate": 7.073728929697137e-06, + "loss": 0.3985, + "step": 22104 + }, + { + "epoch": 1.0998236570808464, + "grad_norm": 4.0529093742370605, + "learning_rate": 7.073591887076881e-06, + "loss": 0.1502, + "step": 22105 + }, + { + "epoch": 1.0998372219207813, + "grad_norm": 3.829216957092285, + "learning_rate": 7.073454844456626e-06, + "loss": 0.198, + "step": 22106 + }, + { + "epoch": 1.0998507867607161, + "grad_norm": 5.169897079467773, + "learning_rate": 7.0733178018363715e-06, + "loss": 0.2562, + "step": 22107 + }, + { + "epoch": 1.0998643516006512, + "grad_norm": 4.145587921142578, + "learning_rate": 7.0731807592161176e-06, + "loss": 0.1398, + "step": 22108 + }, + { + "epoch": 1.099877916440586, + "grad_norm": 5.190341949462891, + "learning_rate": 7.073043716595862e-06, + "loss": 0.1914, + "step": 22109 + }, + { + "epoch": 1.099891481280521, + "grad_norm": 5.900058746337891, + "learning_rate": 7.072906673975607e-06, + "loss": 0.1974, + "step": 22110 + }, + { + "epoch": 1.0999050461204558, + "grad_norm": 5.196072578430176, + "learning_rate": 7.072769631355351e-06, + "loss": 0.1977, + "step": 22111 + }, + { + "epoch": 1.0999186109603907, + "grad_norm": 6.099088191986084, + "learning_rate": 7.0726325887350974e-06, + "loss": 0.1823, + "step": 22112 + }, + { + "epoch": 1.0999321758003255, + "grad_norm": 6.651485443115234, + "learning_rate": 7.072495546114843e-06, + "loss": 0.247, + "step": 22113 + }, + { + "epoch": 1.0999457406402604, + "grad_norm": 4.619965076446533, + "learning_rate": 7.072358503494588e-06, + "loss": 0.2911, + "step": 22114 + }, + { + "epoch": 1.0999593054801953, + "grad_norm": 4.339319705963135, + "learning_rate": 7.072221460874332e-06, + "loss": 0.2209, + "step": 22115 + }, + { + "epoch": 1.0999728703201301, + "grad_norm": 6.338413715362549, + "learning_rate": 7.072084418254077e-06, + "loss": 0.2304, + "step": 22116 + }, + { + "epoch": 1.0999728703201301, + "eval_loss": 0.3409672677516937, + "eval_noise_accuracy": NaN, + "eval_runtime": 4525.6371, + "eval_samples_per_second": 1.11, + "eval_steps_per_second": 0.069, + "eval_wer": 28.42976738438726, + "step": 22116 + }, + { + "epoch": 1.0999864351600652, + "grad_norm": 4.342178821563721, + "learning_rate": 7.071947375633823e-06, + "loss": 0.1895, + "step": 22117 + }, + { + "epoch": 1.1, + "grad_norm": 2.8186864852905273, + "learning_rate": 7.071810333013568e-06, + "loss": 0.1005, + "step": 22118 + }, + { + "epoch": 1.100013564839935, + "grad_norm": 3.6010429859161377, + "learning_rate": 7.071673290393313e-06, + "loss": 0.1662, + "step": 22119 + }, + { + "epoch": 1.1000271296798698, + "grad_norm": 5.324407577514648, + "learning_rate": 7.071536247773057e-06, + "loss": 0.2016, + "step": 22120 + }, + { + "epoch": 1.1000406945198047, + "grad_norm": 6.679689407348633, + "learning_rate": 7.071399205152803e-06, + "loss": 0.304, + "step": 22121 + }, + { + "epoch": 1.1000542593597396, + "grad_norm": 5.219043731689453, + "learning_rate": 7.071262162532548e-06, + "loss": 0.2221, + "step": 22122 + }, + { + "epoch": 1.1000678241996744, + "grad_norm": 6.146294593811035, + "learning_rate": 7.0711251199122936e-06, + "loss": 0.279, + "step": 22123 + }, + { + "epoch": 1.1000813890396093, + "grad_norm": 5.782711505889893, + "learning_rate": 7.070988077292038e-06, + "loss": 0.2443, + "step": 22124 + }, + { + "epoch": 1.1000949538795441, + "grad_norm": 7.0971598625183105, + "learning_rate": 7.070851034671784e-06, + "loss": 0.3707, + "step": 22125 + }, + { + "epoch": 1.100108518719479, + "grad_norm": 4.166907787322998, + "learning_rate": 7.070713992051529e-06, + "loss": 0.1915, + "step": 22126 + }, + { + "epoch": 1.100122083559414, + "grad_norm": 5.6028337478637695, + "learning_rate": 7.0705769494312734e-06, + "loss": 0.2113, + "step": 22127 + }, + { + "epoch": 1.100135648399349, + "grad_norm": 4.922213554382324, + "learning_rate": 7.070439906811019e-06, + "loss": 0.2463, + "step": 22128 + }, + { + "epoch": 1.1001492132392838, + "grad_norm": 5.333925724029541, + "learning_rate": 7.070302864190764e-06, + "loss": 0.2995, + "step": 22129 + }, + { + "epoch": 1.1001627780792187, + "grad_norm": 4.800012111663818, + "learning_rate": 7.070165821570509e-06, + "loss": 0.1916, + "step": 22130 + }, + { + "epoch": 1.1001763429191536, + "grad_norm": 4.989597797393799, + "learning_rate": 7.070028778950254e-06, + "loss": 0.2717, + "step": 22131 + }, + { + "epoch": 1.1001899077590884, + "grad_norm": 6.056686878204346, + "learning_rate": 7.069891736329999e-06, + "loss": 0.2783, + "step": 22132 + }, + { + "epoch": 1.1002034725990233, + "grad_norm": 6.755044460296631, + "learning_rate": 7.069754693709744e-06, + "loss": 0.3477, + "step": 22133 + }, + { + "epoch": 1.1002170374389582, + "grad_norm": 5.793794631958008, + "learning_rate": 7.06961765108949e-06, + "loss": 0.2644, + "step": 22134 + }, + { + "epoch": 1.100230602278893, + "grad_norm": 4.596336364746094, + "learning_rate": 7.069480608469235e-06, + "loss": 0.3127, + "step": 22135 + }, + { + "epoch": 1.100244167118828, + "grad_norm": 5.891688346862793, + "learning_rate": 7.069343565848979e-06, + "loss": 0.2098, + "step": 22136 + }, + { + "epoch": 1.100257731958763, + "grad_norm": 6.61419677734375, + "learning_rate": 7.069206523228724e-06, + "loss": 0.4042, + "step": 22137 + }, + { + "epoch": 1.1002712967986978, + "grad_norm": 4.163944721221924, + "learning_rate": 7.06906948060847e-06, + "loss": 0.1823, + "step": 22138 + }, + { + "epoch": 1.1002848616386327, + "grad_norm": 8.475963592529297, + "learning_rate": 7.0689324379882156e-06, + "loss": 0.4848, + "step": 22139 + }, + { + "epoch": 1.1002984264785676, + "grad_norm": 6.453361511230469, + "learning_rate": 7.06879539536796e-06, + "loss": 0.2709, + "step": 22140 + }, + { + "epoch": 1.1003119913185024, + "grad_norm": 6.573232173919678, + "learning_rate": 7.068658352747705e-06, + "loss": 0.2978, + "step": 22141 + }, + { + "epoch": 1.1003255561584373, + "grad_norm": 8.661720275878906, + "learning_rate": 7.068521310127449e-06, + "loss": 0.5242, + "step": 22142 + }, + { + "epoch": 1.1003391209983722, + "grad_norm": 5.966274738311768, + "learning_rate": 7.0683842675071954e-06, + "loss": 0.3737, + "step": 22143 + }, + { + "epoch": 1.100352685838307, + "grad_norm": 6.115427494049072, + "learning_rate": 7.068247224886941e-06, + "loss": 0.354, + "step": 22144 + }, + { + "epoch": 1.100366250678242, + "grad_norm": 5.188222885131836, + "learning_rate": 7.068110182266685e-06, + "loss": 0.2498, + "step": 22145 + }, + { + "epoch": 1.100379815518177, + "grad_norm": 5.7696661949157715, + "learning_rate": 7.06797313964643e-06, + "loss": 0.3281, + "step": 22146 + }, + { + "epoch": 1.1003933803581119, + "grad_norm": 11.495526313781738, + "learning_rate": 7.067836097026176e-06, + "loss": 0.5065, + "step": 22147 + }, + { + "epoch": 1.1004069451980467, + "grad_norm": 6.569002151489258, + "learning_rate": 7.067699054405921e-06, + "loss": 0.333, + "step": 22148 + }, + { + "epoch": 1.1004205100379816, + "grad_norm": 7.156731128692627, + "learning_rate": 7.067562011785666e-06, + "loss": 0.3483, + "step": 22149 + }, + { + "epoch": 1.1004340748779164, + "grad_norm": 4.352203845977783, + "learning_rate": 7.067424969165411e-06, + "loss": 0.2265, + "step": 22150 + }, + { + "epoch": 1.1004476397178513, + "grad_norm": 5.907749652862549, + "learning_rate": 7.067287926545157e-06, + "loss": 0.2647, + "step": 22151 + }, + { + "epoch": 1.1004612045577862, + "grad_norm": 5.801846504211426, + "learning_rate": 7.067150883924901e-06, + "loss": 0.2304, + "step": 22152 + }, + { + "epoch": 1.100474769397721, + "grad_norm": 7.216527462005615, + "learning_rate": 7.067013841304646e-06, + "loss": 0.4161, + "step": 22153 + }, + { + "epoch": 1.100488334237656, + "grad_norm": 5.016350269317627, + "learning_rate": 7.066876798684391e-06, + "loss": 0.4121, + "step": 22154 + }, + { + "epoch": 1.100501899077591, + "grad_norm": 6.260885238647461, + "learning_rate": 7.066739756064136e-06, + "loss": 0.3028, + "step": 22155 + }, + { + "epoch": 1.1005154639175259, + "grad_norm": 5.1233744621276855, + "learning_rate": 7.066602713443882e-06, + "loss": 0.3736, + "step": 22156 + }, + { + "epoch": 1.1005290287574607, + "grad_norm": 6.888559341430664, + "learning_rate": 7.066465670823627e-06, + "loss": 0.4357, + "step": 22157 + }, + { + "epoch": 1.1005425935973956, + "grad_norm": 7.633706092834473, + "learning_rate": 7.0663286282033714e-06, + "loss": 0.3798, + "step": 22158 + }, + { + "epoch": 1.1005561584373305, + "grad_norm": 4.7188544273376465, + "learning_rate": 7.066191585583117e-06, + "loss": 0.2827, + "step": 22159 + }, + { + "epoch": 1.1005697232772653, + "grad_norm": 5.425227165222168, + "learning_rate": 7.066054542962863e-06, + "loss": 0.2429, + "step": 22160 + }, + { + "epoch": 1.1005832881172002, + "grad_norm": 6.013676643371582, + "learning_rate": 7.065917500342607e-06, + "loss": 0.3995, + "step": 22161 + }, + { + "epoch": 1.100596852957135, + "grad_norm": 5.257756233215332, + "learning_rate": 7.065780457722352e-06, + "loss": 0.3692, + "step": 22162 + }, + { + "epoch": 1.10061041779707, + "grad_norm": 8.889521598815918, + "learning_rate": 7.065643415102097e-06, + "loss": 0.397, + "step": 22163 + }, + { + "epoch": 1.1006239826370048, + "grad_norm": 6.9564595222473145, + "learning_rate": 7.0655063724818425e-06, + "loss": 0.3292, + "step": 22164 + }, + { + "epoch": 1.1006375474769399, + "grad_norm": 5.660278797149658, + "learning_rate": 7.065369329861588e-06, + "loss": 0.2665, + "step": 22165 + }, + { + "epoch": 1.1006511123168747, + "grad_norm": 5.637693881988525, + "learning_rate": 7.065232287241333e-06, + "loss": 0.3765, + "step": 22166 + }, + { + "epoch": 1.1006646771568096, + "grad_norm": 8.307127952575684, + "learning_rate": 7.065095244621077e-06, + "loss": 0.4952, + "step": 22167 + }, + { + "epoch": 1.1006782419967445, + "grad_norm": 5.242025375366211, + "learning_rate": 7.064958202000823e-06, + "loss": 0.3053, + "step": 22168 + }, + { + "epoch": 1.1006918068366793, + "grad_norm": 4.1156744956970215, + "learning_rate": 7.064821159380568e-06, + "loss": 0.2022, + "step": 22169 + }, + { + "epoch": 1.1007053716766142, + "grad_norm": 5.888498783111572, + "learning_rate": 7.064684116760313e-06, + "loss": 0.3769, + "step": 22170 + }, + { + "epoch": 1.100718936516549, + "grad_norm": 5.015009880065918, + "learning_rate": 7.064547074140058e-06, + "loss": 0.3035, + "step": 22171 + }, + { + "epoch": 1.100732501356484, + "grad_norm": 7.526745319366455, + "learning_rate": 7.064410031519803e-06, + "loss": 0.3775, + "step": 22172 + }, + { + "epoch": 1.1007460661964188, + "grad_norm": 5.865839004516602, + "learning_rate": 7.064272988899549e-06, + "loss": 0.2458, + "step": 22173 + }, + { + "epoch": 1.1007596310363539, + "grad_norm": 6.369954586029053, + "learning_rate": 7.0641359462792934e-06, + "loss": 0.3949, + "step": 22174 + }, + { + "epoch": 1.1007731958762887, + "grad_norm": 6.732305526733398, + "learning_rate": 7.063998903659039e-06, + "loss": 0.4647, + "step": 22175 + }, + { + "epoch": 1.1007867607162236, + "grad_norm": 6.121123790740967, + "learning_rate": 7.063861861038783e-06, + "loss": 0.2754, + "step": 22176 + }, + { + "epoch": 1.1008003255561585, + "grad_norm": 4.5158867835998535, + "learning_rate": 7.063724818418529e-06, + "loss": 0.244, + "step": 22177 + }, + { + "epoch": 1.1008138903960933, + "grad_norm": 5.345893859863281, + "learning_rate": 7.063587775798274e-06, + "loss": 0.3966, + "step": 22178 + }, + { + "epoch": 1.1008274552360282, + "grad_norm": 5.755537509918213, + "learning_rate": 7.0634507331780185e-06, + "loss": 0.2989, + "step": 22179 + }, + { + "epoch": 1.100841020075963, + "grad_norm": 5.000060081481934, + "learning_rate": 7.063313690557764e-06, + "loss": 0.2665, + "step": 22180 + }, + { + "epoch": 1.100854584915898, + "grad_norm": 6.274930000305176, + "learning_rate": 7.06317664793751e-06, + "loss": 0.354, + "step": 22181 + }, + { + "epoch": 1.1008681497558328, + "grad_norm": 5.340269565582275, + "learning_rate": 7.063039605317255e-06, + "loss": 0.422, + "step": 22182 + }, + { + "epoch": 1.1008817145957677, + "grad_norm": 6.067886829376221, + "learning_rate": 7.062902562696999e-06, + "loss": 0.3388, + "step": 22183 + }, + { + "epoch": 1.1008952794357028, + "grad_norm": 5.779168128967285, + "learning_rate": 7.062765520076744e-06, + "loss": 0.412, + "step": 22184 + }, + { + "epoch": 1.1009088442756376, + "grad_norm": 5.562061786651611, + "learning_rate": 7.062628477456489e-06, + "loss": 0.3772, + "step": 22185 + }, + { + "epoch": 1.1009224091155725, + "grad_norm": 4.661840438842773, + "learning_rate": 7.062491434836235e-06, + "loss": 0.2572, + "step": 22186 + }, + { + "epoch": 1.1009359739555074, + "grad_norm": 7.56564998626709, + "learning_rate": 7.06235439221598e-06, + "loss": 0.4642, + "step": 22187 + }, + { + "epoch": 1.1009495387954422, + "grad_norm": 5.014139175415039, + "learning_rate": 7.062217349595725e-06, + "loss": 0.2604, + "step": 22188 + }, + { + "epoch": 1.100963103635377, + "grad_norm": 4.998748302459717, + "learning_rate": 7.0620803069754694e-06, + "loss": 0.2861, + "step": 22189 + }, + { + "epoch": 1.100976668475312, + "grad_norm": 5.120026588439941, + "learning_rate": 7.0619432643552155e-06, + "loss": 0.3049, + "step": 22190 + }, + { + "epoch": 1.1009902333152468, + "grad_norm": 8.073702812194824, + "learning_rate": 7.061806221734961e-06, + "loss": 0.4202, + "step": 22191 + }, + { + "epoch": 1.1010037981551817, + "grad_norm": 8.315970420837402, + "learning_rate": 7.061669179114705e-06, + "loss": 0.4609, + "step": 22192 + }, + { + "epoch": 1.1010173629951168, + "grad_norm": 7.91016960144043, + "learning_rate": 7.06153213649445e-06, + "loss": 0.4285, + "step": 22193 + }, + { + "epoch": 1.1010309278350516, + "grad_norm": 4.958006381988525, + "learning_rate": 7.061395093874196e-06, + "loss": 0.2702, + "step": 22194 + }, + { + "epoch": 1.1010444926749865, + "grad_norm": 5.045458793640137, + "learning_rate": 7.0612580512539405e-06, + "loss": 0.2525, + "step": 22195 + }, + { + "epoch": 1.1010580575149214, + "grad_norm": 5.0872273445129395, + "learning_rate": 7.061121008633686e-06, + "loss": 0.2998, + "step": 22196 + }, + { + "epoch": 1.1010716223548562, + "grad_norm": 6.578428745269775, + "learning_rate": 7.060983966013431e-06, + "loss": 0.41, + "step": 22197 + }, + { + "epoch": 1.101085187194791, + "grad_norm": 7.4604291915893555, + "learning_rate": 7.060846923393175e-06, + "loss": 0.323, + "step": 22198 + }, + { + "epoch": 1.101098752034726, + "grad_norm": 4.85577917098999, + "learning_rate": 7.060709880772921e-06, + "loss": 0.2877, + "step": 22199 + }, + { + "epoch": 1.1011123168746608, + "grad_norm": 6.451278209686279, + "learning_rate": 7.060572838152666e-06, + "loss": 0.2886, + "step": 22200 + }, + { + "epoch": 1.1011258817145957, + "grad_norm": 5.908693313598633, + "learning_rate": 7.060435795532411e-06, + "loss": 0.2441, + "step": 22201 + }, + { + "epoch": 1.1011394465545306, + "grad_norm": 6.568992614746094, + "learning_rate": 7.060298752912156e-06, + "loss": 0.3185, + "step": 22202 + }, + { + "epoch": 1.1011530113944656, + "grad_norm": 5.378365516662598, + "learning_rate": 7.060161710291902e-06, + "loss": 0.2898, + "step": 22203 + }, + { + "epoch": 1.1011665762344005, + "grad_norm": 4.6284613609313965, + "learning_rate": 7.060024667671646e-06, + "loss": 0.2052, + "step": 22204 + }, + { + "epoch": 1.1011801410743354, + "grad_norm": 3.9302117824554443, + "learning_rate": 7.0598876250513915e-06, + "loss": 0.1646, + "step": 22205 + }, + { + "epoch": 1.1011937059142702, + "grad_norm": 5.963443279266357, + "learning_rate": 7.059750582431137e-06, + "loss": 0.3455, + "step": 22206 + }, + { + "epoch": 1.101207270754205, + "grad_norm": 6.561220645904541, + "learning_rate": 7.059613539810883e-06, + "loss": 0.2882, + "step": 22207 + }, + { + "epoch": 1.10122083559414, + "grad_norm": 6.022069931030273, + "learning_rate": 7.059476497190627e-06, + "loss": 0.35, + "step": 22208 + }, + { + "epoch": 1.1012344004340748, + "grad_norm": 6.558172225952148, + "learning_rate": 7.059339454570372e-06, + "loss": 0.378, + "step": 22209 + }, + { + "epoch": 1.1012479652740097, + "grad_norm": 6.552077770233154, + "learning_rate": 7.0592024119501165e-06, + "loss": 0.2722, + "step": 22210 + }, + { + "epoch": 1.1012615301139446, + "grad_norm": 5.34236478805542, + "learning_rate": 7.059065369329862e-06, + "loss": 0.2995, + "step": 22211 + }, + { + "epoch": 1.1012750949538797, + "grad_norm": 7.027177810668945, + "learning_rate": 7.058928326709608e-06, + "loss": 0.3934, + "step": 22212 + }, + { + "epoch": 1.1012886597938145, + "grad_norm": 3.242919921875, + "learning_rate": 7.058791284089352e-06, + "loss": 0.1343, + "step": 22213 + }, + { + "epoch": 1.1013022246337494, + "grad_norm": 5.123309135437012, + "learning_rate": 7.058654241469097e-06, + "loss": 0.1907, + "step": 22214 + }, + { + "epoch": 1.1013157894736842, + "grad_norm": 5.207205772399902, + "learning_rate": 7.058517198848842e-06, + "loss": 0.2863, + "step": 22215 + }, + { + "epoch": 1.1013293543136191, + "grad_norm": 5.064603805541992, + "learning_rate": 7.058380156228588e-06, + "loss": 0.2307, + "step": 22216 + }, + { + "epoch": 1.101342919153554, + "grad_norm": 6.778295040130615, + "learning_rate": 7.058243113608333e-06, + "loss": 0.3556, + "step": 22217 + }, + { + "epoch": 1.1013564839934888, + "grad_norm": 6.096306324005127, + "learning_rate": 7.058106070988078e-06, + "loss": 0.3182, + "step": 22218 + }, + { + "epoch": 1.1013700488334237, + "grad_norm": 6.919361114501953, + "learning_rate": 7.057969028367822e-06, + "loss": 0.2738, + "step": 22219 + }, + { + "epoch": 1.1013836136733586, + "grad_norm": 5.014881134033203, + "learning_rate": 7.057831985747568e-06, + "loss": 0.2846, + "step": 22220 + }, + { + "epoch": 1.1013971785132934, + "grad_norm": 5.481448650360107, + "learning_rate": 7.0576949431273135e-06, + "loss": 0.2757, + "step": 22221 + }, + { + "epoch": 1.1014107433532285, + "grad_norm": 5.0959978103637695, + "learning_rate": 7.057557900507059e-06, + "loss": 0.2595, + "step": 22222 + }, + { + "epoch": 1.1014243081931634, + "grad_norm": 6.49135684967041, + "learning_rate": 7.057420857886803e-06, + "loss": 0.4244, + "step": 22223 + }, + { + "epoch": 1.1014378730330983, + "grad_norm": 5.546383857727051, + "learning_rate": 7.057283815266548e-06, + "loss": 0.2448, + "step": 22224 + }, + { + "epoch": 1.1014514378730331, + "grad_norm": 4.472758769989014, + "learning_rate": 7.057146772646294e-06, + "loss": 0.2409, + "step": 22225 + }, + { + "epoch": 1.101465002712968, + "grad_norm": 6.214320659637451, + "learning_rate": 7.0570097300260385e-06, + "loss": 0.3079, + "step": 22226 + }, + { + "epoch": 1.1014785675529029, + "grad_norm": 6.156529426574707, + "learning_rate": 7.056872687405784e-06, + "loss": 0.4732, + "step": 22227 + }, + { + "epoch": 1.1014921323928377, + "grad_norm": 5.089334487915039, + "learning_rate": 7.056735644785528e-06, + "loss": 0.2153, + "step": 22228 + }, + { + "epoch": 1.1015056972327726, + "grad_norm": 5.3060712814331055, + "learning_rate": 7.056598602165274e-06, + "loss": 0.2922, + "step": 22229 + }, + { + "epoch": 1.1015192620727075, + "grad_norm": 8.8412504196167, + "learning_rate": 7.056461559545019e-06, + "loss": 0.3541, + "step": 22230 + }, + { + "epoch": 1.1015328269126425, + "grad_norm": 6.420627117156982, + "learning_rate": 7.056324516924764e-06, + "loss": 0.31, + "step": 22231 + }, + { + "epoch": 1.1015463917525774, + "grad_norm": 5.704163551330566, + "learning_rate": 7.056187474304509e-06, + "loss": 0.2578, + "step": 22232 + }, + { + "epoch": 1.1015599565925123, + "grad_norm": 6.1225080490112305, + "learning_rate": 7.056050431684255e-06, + "loss": 0.2986, + "step": 22233 + }, + { + "epoch": 1.1015735214324471, + "grad_norm": 6.047370910644531, + "learning_rate": 7.055913389064e-06, + "loss": 0.5184, + "step": 22234 + }, + { + "epoch": 1.101587086272382, + "grad_norm": 5.8108978271484375, + "learning_rate": 7.055776346443744e-06, + "loss": 0.2405, + "step": 22235 + }, + { + "epoch": 1.1016006511123169, + "grad_norm": 5.736138820648193, + "learning_rate": 7.0556393038234895e-06, + "loss": 0.2894, + "step": 22236 + }, + { + "epoch": 1.1016142159522517, + "grad_norm": 5.009955406188965, + "learning_rate": 7.0555022612032355e-06, + "loss": 0.246, + "step": 22237 + }, + { + "epoch": 1.1016277807921866, + "grad_norm": 6.306241035461426, + "learning_rate": 7.05536521858298e-06, + "loss": 0.4858, + "step": 22238 + }, + { + "epoch": 1.1016413456321215, + "grad_norm": 5.451666831970215, + "learning_rate": 7.055228175962725e-06, + "loss": 0.2611, + "step": 22239 + }, + { + "epoch": 1.1016549104720563, + "grad_norm": 4.074771404266357, + "learning_rate": 7.05509113334247e-06, + "loss": 0.2266, + "step": 22240 + }, + { + "epoch": 1.1016684753119914, + "grad_norm": 4.703261375427246, + "learning_rate": 7.0549540907222145e-06, + "loss": 0.3184, + "step": 22241 + }, + { + "epoch": 1.1016820401519263, + "grad_norm": 4.9952311515808105, + "learning_rate": 7.0548170481019605e-06, + "loss": 0.2027, + "step": 22242 + }, + { + "epoch": 1.1016956049918611, + "grad_norm": 6.725867748260498, + "learning_rate": 7.054680005481706e-06, + "loss": 0.3415, + "step": 22243 + }, + { + "epoch": 1.101709169831796, + "grad_norm": 6.82282018661499, + "learning_rate": 7.05454296286145e-06, + "loss": 0.3024, + "step": 22244 + }, + { + "epoch": 1.1017227346717309, + "grad_norm": 5.408053398132324, + "learning_rate": 7.054405920241195e-06, + "loss": 0.2809, + "step": 22245 + }, + { + "epoch": 1.1017362995116657, + "grad_norm": 4.688809394836426, + "learning_rate": 7.054268877620941e-06, + "loss": 0.3237, + "step": 22246 + }, + { + "epoch": 1.1017498643516006, + "grad_norm": 5.376009941101074, + "learning_rate": 7.0541318350006856e-06, + "loss": 0.3098, + "step": 22247 + }, + { + "epoch": 1.1017634291915355, + "grad_norm": 5.648129940032959, + "learning_rate": 7.053994792380431e-06, + "loss": 0.2906, + "step": 22248 + }, + { + "epoch": 1.1017769940314703, + "grad_norm": 4.726088047027588, + "learning_rate": 7.053857749760176e-06, + "loss": 0.1638, + "step": 22249 + }, + { + "epoch": 1.1017905588714054, + "grad_norm": 5.077205181121826, + "learning_rate": 7.053720707139922e-06, + "loss": 0.3427, + "step": 22250 + }, + { + "epoch": 1.1018041237113403, + "grad_norm": 5.021187782287598, + "learning_rate": 7.053583664519666e-06, + "loss": 0.171, + "step": 22251 + }, + { + "epoch": 1.1018176885512752, + "grad_norm": 3.955005645751953, + "learning_rate": 7.0534466218994115e-06, + "loss": 0.2986, + "step": 22252 + }, + { + "epoch": 1.10183125339121, + "grad_norm": 5.921080589294434, + "learning_rate": 7.053309579279156e-06, + "loss": 0.3032, + "step": 22253 + }, + { + "epoch": 1.1018448182311449, + "grad_norm": 5.654928207397461, + "learning_rate": 7.053172536658901e-06, + "loss": 0.2593, + "step": 22254 + }, + { + "epoch": 1.1018583830710797, + "grad_norm": 7.421150207519531, + "learning_rate": 7.053035494038647e-06, + "loss": 0.4647, + "step": 22255 + }, + { + "epoch": 1.1018719479110146, + "grad_norm": 7.14251708984375, + "learning_rate": 7.052898451418392e-06, + "loss": 0.2989, + "step": 22256 + }, + { + "epoch": 1.1018855127509495, + "grad_norm": 8.840330123901367, + "learning_rate": 7.0527614087981365e-06, + "loss": 0.4088, + "step": 22257 + }, + { + "epoch": 1.1018990775908843, + "grad_norm": 5.885623931884766, + "learning_rate": 7.052624366177882e-06, + "loss": 0.2881, + "step": 22258 + }, + { + "epoch": 1.1019126424308192, + "grad_norm": 5.3073015213012695, + "learning_rate": 7.052487323557628e-06, + "loss": 0.3401, + "step": 22259 + }, + { + "epoch": 1.1019262072707543, + "grad_norm": 5.903637409210205, + "learning_rate": 7.052350280937372e-06, + "loss": 0.2221, + "step": 22260 + }, + { + "epoch": 1.1019397721106892, + "grad_norm": 4.891859531402588, + "learning_rate": 7.052213238317117e-06, + "loss": 0.2593, + "step": 22261 + }, + { + "epoch": 1.101953336950624, + "grad_norm": 3.9801528453826904, + "learning_rate": 7.0520761956968616e-06, + "loss": 0.2859, + "step": 22262 + }, + { + "epoch": 1.101966901790559, + "grad_norm": 5.608736038208008, + "learning_rate": 7.051939153076608e-06, + "loss": 0.4609, + "step": 22263 + }, + { + "epoch": 1.1019804666304938, + "grad_norm": 5.537185192108154, + "learning_rate": 7.051802110456353e-06, + "loss": 0.2664, + "step": 22264 + }, + { + "epoch": 1.1019940314704286, + "grad_norm": 3.745027780532837, + "learning_rate": 7.051665067836098e-06, + "loss": 0.251, + "step": 22265 + }, + { + "epoch": 1.1020075963103635, + "grad_norm": 5.316804885864258, + "learning_rate": 7.051528025215842e-06, + "loss": 0.3331, + "step": 22266 + }, + { + "epoch": 1.1020211611502984, + "grad_norm": 6.343299388885498, + "learning_rate": 7.0513909825955875e-06, + "loss": 0.2402, + "step": 22267 + }, + { + "epoch": 1.1020347259902332, + "grad_norm": 6.492865085601807, + "learning_rate": 7.0512539399753335e-06, + "loss": 0.3837, + "step": 22268 + }, + { + "epoch": 1.1020482908301683, + "grad_norm": 3.955577850341797, + "learning_rate": 7.051116897355078e-06, + "loss": 0.2232, + "step": 22269 + }, + { + "epoch": 1.1020618556701032, + "grad_norm": 6.082122802734375, + "learning_rate": 7.050979854734823e-06, + "loss": 0.2295, + "step": 22270 + }, + { + "epoch": 1.102075420510038, + "grad_norm": 5.971336364746094, + "learning_rate": 7.050842812114568e-06, + "loss": 0.3419, + "step": 22271 + }, + { + "epoch": 1.102088985349973, + "grad_norm": 6.206920146942139, + "learning_rate": 7.050705769494313e-06, + "loss": 0.2484, + "step": 22272 + }, + { + "epoch": 1.1021025501899078, + "grad_norm": 5.731200218200684, + "learning_rate": 7.0505687268740585e-06, + "loss": 0.286, + "step": 22273 + }, + { + "epoch": 1.1021161150298426, + "grad_norm": 6.572349548339844, + "learning_rate": 7.050431684253804e-06, + "loss": 0.2142, + "step": 22274 + }, + { + "epoch": 1.1021296798697775, + "grad_norm": 4.431945323944092, + "learning_rate": 7.050294641633548e-06, + "loss": 0.2493, + "step": 22275 + }, + { + "epoch": 1.1021432447097124, + "grad_norm": 4.661110877990723, + "learning_rate": 7.050157599013294e-06, + "loss": 0.2342, + "step": 22276 + }, + { + "epoch": 1.1021568095496472, + "grad_norm": 8.567874908447266, + "learning_rate": 7.050020556393039e-06, + "loss": 0.4262, + "step": 22277 + }, + { + "epoch": 1.102170374389582, + "grad_norm": 4.135006427764893, + "learning_rate": 7.049883513772784e-06, + "loss": 0.1984, + "step": 22278 + }, + { + "epoch": 1.1021839392295172, + "grad_norm": 5.4489240646362305, + "learning_rate": 7.049746471152529e-06, + "loss": 0.2347, + "step": 22279 + }, + { + "epoch": 1.102197504069452, + "grad_norm": 5.980684757232666, + "learning_rate": 7.049609428532274e-06, + "loss": 0.3294, + "step": 22280 + }, + { + "epoch": 1.102211068909387, + "grad_norm": 6.4206976890563965, + "learning_rate": 7.04947238591202e-06, + "loss": 0.2816, + "step": 22281 + }, + { + "epoch": 1.1022246337493218, + "grad_norm": 5.0074896812438965, + "learning_rate": 7.049335343291764e-06, + "loss": 0.2638, + "step": 22282 + }, + { + "epoch": 1.1022381985892566, + "grad_norm": 6.920567989349365, + "learning_rate": 7.0491983006715095e-06, + "loss": 0.3937, + "step": 22283 + }, + { + "epoch": 1.1022517634291915, + "grad_norm": 5.250699996948242, + "learning_rate": 7.049061258051254e-06, + "loss": 0.2845, + "step": 22284 + }, + { + "epoch": 1.1022653282691264, + "grad_norm": 6.585104465484619, + "learning_rate": 7.048924215431e-06, + "loss": 0.1922, + "step": 22285 + }, + { + "epoch": 1.1022788931090612, + "grad_norm": 4.14243221282959, + "learning_rate": 7.048787172810745e-06, + "loss": 0.1963, + "step": 22286 + }, + { + "epoch": 1.102292457948996, + "grad_norm": 8.292326927185059, + "learning_rate": 7.048650130190489e-06, + "loss": 0.4546, + "step": 22287 + }, + { + "epoch": 1.1023060227889312, + "grad_norm": 7.307889461517334, + "learning_rate": 7.0485130875702345e-06, + "loss": 0.2648, + "step": 22288 + }, + { + "epoch": 1.102319587628866, + "grad_norm": 5.028103351593018, + "learning_rate": 7.0483760449499805e-06, + "loss": 0.2611, + "step": 22289 + }, + { + "epoch": 1.102333152468801, + "grad_norm": 7.411575794219971, + "learning_rate": 7.048239002329726e-06, + "loss": 0.4385, + "step": 22290 + }, + { + "epoch": 1.1023467173087358, + "grad_norm": 7.63365364074707, + "learning_rate": 7.04810195970947e-06, + "loss": 0.4375, + "step": 22291 + }, + { + "epoch": 1.1023602821486707, + "grad_norm": 6.613205909729004, + "learning_rate": 7.047964917089215e-06, + "loss": 0.2532, + "step": 22292 + }, + { + "epoch": 1.1023738469886055, + "grad_norm": 6.323697090148926, + "learning_rate": 7.0478278744689596e-06, + "loss": 0.2666, + "step": 22293 + }, + { + "epoch": 1.1023874118285404, + "grad_norm": 5.103005886077881, + "learning_rate": 7.047690831848706e-06, + "loss": 0.3025, + "step": 22294 + }, + { + "epoch": 1.1024009766684753, + "grad_norm": 8.725479125976562, + "learning_rate": 7.047553789228451e-06, + "loss": 0.4629, + "step": 22295 + }, + { + "epoch": 1.1024145415084101, + "grad_norm": 5.0867767333984375, + "learning_rate": 7.047416746608195e-06, + "loss": 0.2861, + "step": 22296 + }, + { + "epoch": 1.102428106348345, + "grad_norm": 6.938305854797363, + "learning_rate": 7.04727970398794e-06, + "loss": 0.3089, + "step": 22297 + }, + { + "epoch": 1.10244167118828, + "grad_norm": 5.337311267852783, + "learning_rate": 7.047142661367686e-06, + "loss": 0.3112, + "step": 22298 + }, + { + "epoch": 1.102455236028215, + "grad_norm": 8.533307075500488, + "learning_rate": 7.0470056187474315e-06, + "loss": 0.3973, + "step": 22299 + }, + { + "epoch": 1.1024688008681498, + "grad_norm": 8.215502738952637, + "learning_rate": 7.046868576127176e-06, + "loss": 0.4074, + "step": 22300 + }, + { + "epoch": 1.1024823657080847, + "grad_norm": 6.365903377532959, + "learning_rate": 7.046731533506921e-06, + "loss": 0.4355, + "step": 22301 + }, + { + "epoch": 1.1024959305480195, + "grad_norm": 6.980865955352783, + "learning_rate": 7.046594490886667e-06, + "loss": 0.3223, + "step": 22302 + }, + { + "epoch": 1.1025094953879544, + "grad_norm": 6.954372406005859, + "learning_rate": 7.046457448266411e-06, + "loss": 0.2422, + "step": 22303 + }, + { + "epoch": 1.1025230602278893, + "grad_norm": 5.806086540222168, + "learning_rate": 7.0463204056461565e-06, + "loss": 0.3, + "step": 22304 + }, + { + "epoch": 1.1025366250678241, + "grad_norm": 7.228388786315918, + "learning_rate": 7.046183363025902e-06, + "loss": 0.3371, + "step": 22305 + }, + { + "epoch": 1.102550189907759, + "grad_norm": 5.271888732910156, + "learning_rate": 7.046046320405646e-06, + "loss": 0.3064, + "step": 22306 + }, + { + "epoch": 1.102563754747694, + "grad_norm": 7.0218000411987305, + "learning_rate": 7.045909277785392e-06, + "loss": 0.2613, + "step": 22307 + }, + { + "epoch": 1.102577319587629, + "grad_norm": 5.807258129119873, + "learning_rate": 7.045772235165137e-06, + "loss": 0.301, + "step": 22308 + }, + { + "epoch": 1.1025908844275638, + "grad_norm": 7.900836944580078, + "learning_rate": 7.045635192544882e-06, + "loss": 0.2627, + "step": 22309 + }, + { + "epoch": 1.1026044492674987, + "grad_norm": 4.880378723144531, + "learning_rate": 7.045498149924627e-06, + "loss": 0.2719, + "step": 22310 + }, + { + "epoch": 1.1026180141074335, + "grad_norm": 6.887501239776611, + "learning_rate": 7.045361107304373e-06, + "loss": 0.4345, + "step": 22311 + }, + { + "epoch": 1.1026315789473684, + "grad_norm": 5.601024150848389, + "learning_rate": 7.045224064684117e-06, + "loss": 0.2643, + "step": 22312 + }, + { + "epoch": 1.1026451437873033, + "grad_norm": 6.043396472930908, + "learning_rate": 7.045087022063862e-06, + "loss": 0.2686, + "step": 22313 + }, + { + "epoch": 1.1026587086272381, + "grad_norm": 5.220180988311768, + "learning_rate": 7.0449499794436075e-06, + "loss": 0.3103, + "step": 22314 + }, + { + "epoch": 1.102672273467173, + "grad_norm": 5.550661087036133, + "learning_rate": 7.0448129368233535e-06, + "loss": 0.2595, + "step": 22315 + }, + { + "epoch": 1.1026858383071079, + "grad_norm": 6.445455074310303, + "learning_rate": 7.044675894203098e-06, + "loss": 0.5152, + "step": 22316 + }, + { + "epoch": 1.102699403147043, + "grad_norm": 6.254515171051025, + "learning_rate": 7.044538851582843e-06, + "loss": 0.3843, + "step": 22317 + }, + { + "epoch": 1.1027129679869778, + "grad_norm": 5.193688869476318, + "learning_rate": 7.044401808962587e-06, + "loss": 0.2356, + "step": 22318 + }, + { + "epoch": 1.1027265328269127, + "grad_norm": 7.234555244445801, + "learning_rate": 7.044264766342333e-06, + "loss": 0.4657, + "step": 22319 + }, + { + "epoch": 1.1027400976668476, + "grad_norm": 5.541459083557129, + "learning_rate": 7.0441277237220786e-06, + "loss": 0.2424, + "step": 22320 + }, + { + "epoch": 1.1027536625067824, + "grad_norm": 5.889657497406006, + "learning_rate": 7.043990681101823e-06, + "loss": 0.2041, + "step": 22321 + }, + { + "epoch": 1.1027672273467173, + "grad_norm": 4.778313636779785, + "learning_rate": 7.043853638481568e-06, + "loss": 0.2623, + "step": 22322 + }, + { + "epoch": 1.1027807921866521, + "grad_norm": 6.470166206359863, + "learning_rate": 7.043716595861313e-06, + "loss": 0.4228, + "step": 22323 + }, + { + "epoch": 1.102794357026587, + "grad_norm": 6.82073450088501, + "learning_rate": 7.043579553241059e-06, + "loss": 0.3107, + "step": 22324 + }, + { + "epoch": 1.1028079218665219, + "grad_norm": 6.243547439575195, + "learning_rate": 7.043442510620804e-06, + "loss": 0.3531, + "step": 22325 + }, + { + "epoch": 1.102821486706457, + "grad_norm": 4.663461208343506, + "learning_rate": 7.043305468000549e-06, + "loss": 0.2451, + "step": 22326 + }, + { + "epoch": 1.1028350515463918, + "grad_norm": 6.81872034072876, + "learning_rate": 7.043168425380293e-06, + "loss": 0.3708, + "step": 22327 + }, + { + "epoch": 1.1028486163863267, + "grad_norm": 6.2216105461120605, + "learning_rate": 7.043031382760039e-06, + "loss": 0.2225, + "step": 22328 + }, + { + "epoch": 1.1028621812262616, + "grad_norm": 5.5998663902282715, + "learning_rate": 7.042894340139784e-06, + "loss": 0.1865, + "step": 22329 + }, + { + "epoch": 1.1028757460661964, + "grad_norm": 6.511547565460205, + "learning_rate": 7.0427572975195295e-06, + "loss": 0.2179, + "step": 22330 + }, + { + "epoch": 1.1028893109061313, + "grad_norm": 6.4962873458862305, + "learning_rate": 7.042620254899274e-06, + "loss": 0.3179, + "step": 22331 + }, + { + "epoch": 1.1029028757460662, + "grad_norm": 4.504475116729736, + "learning_rate": 7.04248321227902e-06, + "loss": 0.172, + "step": 22332 + }, + { + "epoch": 1.102916440586001, + "grad_norm": 6.052231311798096, + "learning_rate": 7.042346169658765e-06, + "loss": 0.3104, + "step": 22333 + }, + { + "epoch": 1.1029300054259359, + "grad_norm": 5.327849864959717, + "learning_rate": 7.042209127038509e-06, + "loss": 0.1995, + "step": 22334 + }, + { + "epoch": 1.1029435702658708, + "grad_norm": 5.84391975402832, + "learning_rate": 7.0420720844182545e-06, + "loss": 0.2482, + "step": 22335 + }, + { + "epoch": 1.1029571351058058, + "grad_norm": 4.759941577911377, + "learning_rate": 7.041935041797999e-06, + "loss": 0.2065, + "step": 22336 + }, + { + "epoch": 1.1029706999457407, + "grad_norm": 5.271084308624268, + "learning_rate": 7.041797999177745e-06, + "loss": 0.2194, + "step": 22337 + }, + { + "epoch": 1.1029842647856756, + "grad_norm": 4.862640380859375, + "learning_rate": 7.04166095655749e-06, + "loss": 0.2004, + "step": 22338 + }, + { + "epoch": 1.1029978296256104, + "grad_norm": 5.686720371246338, + "learning_rate": 7.041523913937235e-06, + "loss": 0.2459, + "step": 22339 + }, + { + "epoch": 1.1030113944655453, + "grad_norm": 5.262776851654053, + "learning_rate": 7.04138687131698e-06, + "loss": 0.2703, + "step": 22340 + }, + { + "epoch": 1.1030249593054802, + "grad_norm": 6.61023473739624, + "learning_rate": 7.041249828696726e-06, + "loss": 0.3048, + "step": 22341 + }, + { + "epoch": 1.103038524145415, + "grad_norm": 6.224648952484131, + "learning_rate": 7.041112786076471e-06, + "loss": 0.3835, + "step": 22342 + }, + { + "epoch": 1.10305208898535, + "grad_norm": 4.700028896331787, + "learning_rate": 7.040975743456215e-06, + "loss": 0.1774, + "step": 22343 + }, + { + "epoch": 1.1030656538252848, + "grad_norm": 5.541005611419678, + "learning_rate": 7.04083870083596e-06, + "loss": 0.2118, + "step": 22344 + }, + { + "epoch": 1.1030792186652199, + "grad_norm": 8.374430656433105, + "learning_rate": 7.040701658215706e-06, + "loss": 0.3797, + "step": 22345 + }, + { + "epoch": 1.1030927835051547, + "grad_norm": 5.357171535491943, + "learning_rate": 7.040564615595451e-06, + "loss": 0.2368, + "step": 22346 + }, + { + "epoch": 1.1031063483450896, + "grad_norm": 5.015856742858887, + "learning_rate": 7.040427572975196e-06, + "loss": 0.2442, + "step": 22347 + }, + { + "epoch": 1.1031199131850244, + "grad_norm": 5.216880798339844, + "learning_rate": 7.040290530354941e-06, + "loss": 0.2351, + "step": 22348 + }, + { + "epoch": 1.1031334780249593, + "grad_norm": 5.683778762817383, + "learning_rate": 7.040153487734685e-06, + "loss": 0.3504, + "step": 22349 + }, + { + "epoch": 1.1031470428648942, + "grad_norm": 6.199277877807617, + "learning_rate": 7.040016445114431e-06, + "loss": 0.2734, + "step": 22350 + }, + { + "epoch": 1.103160607704829, + "grad_norm": 6.80873441696167, + "learning_rate": 7.0398794024941766e-06, + "loss": 0.3369, + "step": 22351 + }, + { + "epoch": 1.103174172544764, + "grad_norm": 6.812969207763672, + "learning_rate": 7.039742359873921e-06, + "loss": 0.4636, + "step": 22352 + }, + { + "epoch": 1.1031877373846988, + "grad_norm": 5.841432571411133, + "learning_rate": 7.039605317253666e-06, + "loss": 0.2706, + "step": 22353 + }, + { + "epoch": 1.1032013022246336, + "grad_norm": 3.964914083480835, + "learning_rate": 7.039468274633412e-06, + "loss": 0.1384, + "step": 22354 + }, + { + "epoch": 1.1032148670645687, + "grad_norm": 5.427066326141357, + "learning_rate": 7.0393312320131564e-06, + "loss": 0.2037, + "step": 22355 + }, + { + "epoch": 1.1032284319045036, + "grad_norm": 5.591768741607666, + "learning_rate": 7.039194189392902e-06, + "loss": 0.2222, + "step": 22356 + }, + { + "epoch": 1.1032419967444385, + "grad_norm": 4.320595741271973, + "learning_rate": 7.039057146772647e-06, + "loss": 0.2179, + "step": 22357 + }, + { + "epoch": 1.1032555615843733, + "grad_norm": 6.377314567565918, + "learning_rate": 7.038920104152393e-06, + "loss": 0.3297, + "step": 22358 + }, + { + "epoch": 1.1032691264243082, + "grad_norm": 5.841329097747803, + "learning_rate": 7.038783061532137e-06, + "loss": 0.2616, + "step": 22359 + }, + { + "epoch": 1.103282691264243, + "grad_norm": 5.731619358062744, + "learning_rate": 7.038646018911882e-06, + "loss": 0.2662, + "step": 22360 + }, + { + "epoch": 1.103296256104178, + "grad_norm": 3.8737924098968506, + "learning_rate": 7.038508976291627e-06, + "loss": 0.1579, + "step": 22361 + }, + { + "epoch": 1.1033098209441128, + "grad_norm": 5.752287864685059, + "learning_rate": 7.038371933671372e-06, + "loss": 0.2362, + "step": 22362 + }, + { + "epoch": 1.1033233857840476, + "grad_norm": 5.384061813354492, + "learning_rate": 7.038234891051118e-06, + "loss": 0.2653, + "step": 22363 + }, + { + "epoch": 1.1033369506239827, + "grad_norm": 3.0860748291015625, + "learning_rate": 7.038097848430863e-06, + "loss": 0.1001, + "step": 22364 + }, + { + "epoch": 1.1033505154639176, + "grad_norm": 7.178163528442383, + "learning_rate": 7.037960805810607e-06, + "loss": 0.3224, + "step": 22365 + }, + { + "epoch": 1.1033640803038525, + "grad_norm": 5.254563331604004, + "learning_rate": 7.0378237631903525e-06, + "loss": 0.1979, + "step": 22366 + }, + { + "epoch": 1.1033776451437873, + "grad_norm": 4.722076892852783, + "learning_rate": 7.0376867205700986e-06, + "loss": 0.1653, + "step": 22367 + }, + { + "epoch": 1.1033912099837222, + "grad_norm": 5.157891750335693, + "learning_rate": 7.037549677949843e-06, + "loss": 0.2027, + "step": 22368 + }, + { + "epoch": 1.103404774823657, + "grad_norm": 5.367910385131836, + "learning_rate": 7.037412635329588e-06, + "loss": 0.2528, + "step": 22369 + }, + { + "epoch": 1.103418339663592, + "grad_norm": 5.887768268585205, + "learning_rate": 7.037275592709332e-06, + "loss": 0.2906, + "step": 22370 + }, + { + "epoch": 1.1034319045035268, + "grad_norm": 4.675546169281006, + "learning_rate": 7.0371385500890784e-06, + "loss": 0.2621, + "step": 22371 + }, + { + "epoch": 1.1034454693434617, + "grad_norm": 4.4461565017700195, + "learning_rate": 7.037001507468824e-06, + "loss": 0.1808, + "step": 22372 + }, + { + "epoch": 1.1034590341833965, + "grad_norm": 5.232916355133057, + "learning_rate": 7.036864464848569e-06, + "loss": 0.2483, + "step": 22373 + }, + { + "epoch": 1.1034725990233316, + "grad_norm": 6.260099411010742, + "learning_rate": 7.036727422228313e-06, + "loss": 0.2705, + "step": 22374 + }, + { + "epoch": 1.1034861638632665, + "grad_norm": 4.967316150665283, + "learning_rate": 7.036590379608058e-06, + "loss": 0.1662, + "step": 22375 + }, + { + "epoch": 1.1034997287032013, + "grad_norm": 5.984739780426025, + "learning_rate": 7.036453336987804e-06, + "loss": 0.2071, + "step": 22376 + }, + { + "epoch": 1.1035132935431362, + "grad_norm": 4.28877592086792, + "learning_rate": 7.036316294367549e-06, + "loss": 0.1246, + "step": 22377 + }, + { + "epoch": 1.103526858383071, + "grad_norm": 6.044601917266846, + "learning_rate": 7.036179251747294e-06, + "loss": 0.2219, + "step": 22378 + }, + { + "epoch": 1.103540423223006, + "grad_norm": 4.383967876434326, + "learning_rate": 7.036042209127039e-06, + "loss": 0.2819, + "step": 22379 + }, + { + "epoch": 1.1035539880629408, + "grad_norm": 5.459873199462891, + "learning_rate": 7.035905166506784e-06, + "loss": 0.2856, + "step": 22380 + }, + { + "epoch": 1.1035675529028757, + "grad_norm": 5.517407417297363, + "learning_rate": 7.035768123886529e-06, + "loss": 0.2739, + "step": 22381 + }, + { + "epoch": 1.1035811177428105, + "grad_norm": 8.0580415725708, + "learning_rate": 7.0356310812662746e-06, + "loss": 0.503, + "step": 22382 + }, + { + "epoch": 1.1035946825827456, + "grad_norm": 7.843400955200195, + "learning_rate": 7.035494038646019e-06, + "loss": 0.3421, + "step": 22383 + }, + { + "epoch": 1.1036082474226805, + "grad_norm": 6.424644470214844, + "learning_rate": 7.035356996025765e-06, + "loss": 0.3512, + "step": 22384 + }, + { + "epoch": 1.1036218122626154, + "grad_norm": 7.294981479644775, + "learning_rate": 7.03521995340551e-06, + "loss": 0.3459, + "step": 22385 + }, + { + "epoch": 1.1036353771025502, + "grad_norm": 6.391535758972168, + "learning_rate": 7.0350829107852544e-06, + "loss": 0.3389, + "step": 22386 + }, + { + "epoch": 1.103648941942485, + "grad_norm": 5.437525749206543, + "learning_rate": 7.034945868165e-06, + "loss": 0.2085, + "step": 22387 + }, + { + "epoch": 1.10366250678242, + "grad_norm": 9.168965339660645, + "learning_rate": 7.034808825544746e-06, + "loss": 0.315, + "step": 22388 + }, + { + "epoch": 1.1036760716223548, + "grad_norm": 5.977816581726074, + "learning_rate": 7.03467178292449e-06, + "loss": 0.2647, + "step": 22389 + }, + { + "epoch": 1.1036896364622897, + "grad_norm": 8.281085014343262, + "learning_rate": 7.034534740304235e-06, + "loss": 0.4172, + "step": 22390 + }, + { + "epoch": 1.1037032013022245, + "grad_norm": 5.663646697998047, + "learning_rate": 7.03439769768398e-06, + "loss": 0.3281, + "step": 22391 + }, + { + "epoch": 1.1037167661421594, + "grad_norm": 5.3496904373168945, + "learning_rate": 7.034260655063725e-06, + "loss": 0.2836, + "step": 22392 + }, + { + "epoch": 1.1037303309820945, + "grad_norm": 6.609522819519043, + "learning_rate": 7.034123612443471e-06, + "loss": 0.3586, + "step": 22393 + }, + { + "epoch": 1.1037438958220294, + "grad_norm": 6.069309234619141, + "learning_rate": 7.033986569823216e-06, + "loss": 0.2968, + "step": 22394 + }, + { + "epoch": 1.1037574606619642, + "grad_norm": 6.968229293823242, + "learning_rate": 7.03384952720296e-06, + "loss": 0.3354, + "step": 22395 + }, + { + "epoch": 1.103771025501899, + "grad_norm": 5.74547004699707, + "learning_rate": 7.033712484582705e-06, + "loss": 0.2253, + "step": 22396 + }, + { + "epoch": 1.103784590341834, + "grad_norm": 5.265850067138672, + "learning_rate": 7.033575441962451e-06, + "loss": 0.2474, + "step": 22397 + }, + { + "epoch": 1.1037981551817688, + "grad_norm": 4.620391845703125, + "learning_rate": 7.0334383993421966e-06, + "loss": 0.2761, + "step": 22398 + }, + { + "epoch": 1.1038117200217037, + "grad_norm": 5.109612464904785, + "learning_rate": 7.033301356721941e-06, + "loss": 0.2728, + "step": 22399 + }, + { + "epoch": 1.1038252848616386, + "grad_norm": 7.747342586517334, + "learning_rate": 7.033164314101686e-06, + "loss": 0.4509, + "step": 22400 + }, + { + "epoch": 1.1038388497015734, + "grad_norm": 5.569027900695801, + "learning_rate": 7.033027271481432e-06, + "loss": 0.304, + "step": 22401 + }, + { + "epoch": 1.1038524145415085, + "grad_norm": 4.241024017333984, + "learning_rate": 7.0328902288611764e-06, + "loss": 0.1796, + "step": 22402 + }, + { + "epoch": 1.1038659793814434, + "grad_norm": 4.587669849395752, + "learning_rate": 7.032753186240922e-06, + "loss": 0.2341, + "step": 22403 + }, + { + "epoch": 1.1038795442213782, + "grad_norm": 5.213477611541748, + "learning_rate": 7.032616143620666e-06, + "loss": 0.2653, + "step": 22404 + }, + { + "epoch": 1.103893109061313, + "grad_norm": 5.955301284790039, + "learning_rate": 7.032479101000411e-06, + "loss": 0.3518, + "step": 22405 + }, + { + "epoch": 1.103906673901248, + "grad_norm": 5.538241386413574, + "learning_rate": 7.032342058380157e-06, + "loss": 0.2991, + "step": 22406 + }, + { + "epoch": 1.1039202387411828, + "grad_norm": 5.614197254180908, + "learning_rate": 7.032205015759902e-06, + "loss": 0.2868, + "step": 22407 + }, + { + "epoch": 1.1039338035811177, + "grad_norm": 4.89081335067749, + "learning_rate": 7.032067973139647e-06, + "loss": 0.3111, + "step": 22408 + }, + { + "epoch": 1.1039473684210526, + "grad_norm": 5.323935508728027, + "learning_rate": 7.031930930519392e-06, + "loss": 0.2353, + "step": 22409 + }, + { + "epoch": 1.1039609332609874, + "grad_norm": 6.379582405090332, + "learning_rate": 7.031793887899138e-06, + "loss": 0.2668, + "step": 22410 + }, + { + "epoch": 1.1039744981009223, + "grad_norm": 3.722412586212158, + "learning_rate": 7.031656845278882e-06, + "loss": 0.1835, + "step": 22411 + }, + { + "epoch": 1.1039880629408574, + "grad_norm": 5.470399379730225, + "learning_rate": 7.031519802658627e-06, + "loss": 0.3022, + "step": 22412 + }, + { + "epoch": 1.1040016277807922, + "grad_norm": 13.233407974243164, + "learning_rate": 7.0313827600383726e-06, + "loss": 0.2038, + "step": 22413 + }, + { + "epoch": 1.1040151926207271, + "grad_norm": 5.618991374969482, + "learning_rate": 7.031245717418118e-06, + "loss": 0.2673, + "step": 22414 + }, + { + "epoch": 1.104028757460662, + "grad_norm": 7.013902187347412, + "learning_rate": 7.031108674797863e-06, + "loss": 0.3441, + "step": 22415 + }, + { + "epoch": 1.1040423223005968, + "grad_norm": 5.113840103149414, + "learning_rate": 7.030971632177608e-06, + "loss": 0.2913, + "step": 22416 + }, + { + "epoch": 1.1040558871405317, + "grad_norm": 5.501004219055176, + "learning_rate": 7.0308345895573524e-06, + "loss": 0.1899, + "step": 22417 + }, + { + "epoch": 1.1040694519804666, + "grad_norm": 3.9092836380004883, + "learning_rate": 7.030697546937098e-06, + "loss": 0.1867, + "step": 22418 + }, + { + "epoch": 1.1040830168204014, + "grad_norm": 4.046411991119385, + "learning_rate": 7.030560504316844e-06, + "loss": 0.2128, + "step": 22419 + }, + { + "epoch": 1.1040965816603363, + "grad_norm": 6.494785785675049, + "learning_rate": 7.030423461696588e-06, + "loss": 0.3388, + "step": 22420 + }, + { + "epoch": 1.1041101465002714, + "grad_norm": 3.4087984561920166, + "learning_rate": 7.030286419076333e-06, + "loss": 0.2032, + "step": 22421 + }, + { + "epoch": 1.1041237113402063, + "grad_norm": 5.8026933670043945, + "learning_rate": 7.030149376456078e-06, + "loss": 0.3111, + "step": 22422 + }, + { + "epoch": 1.1041372761801411, + "grad_norm": 4.639450550079346, + "learning_rate": 7.030012333835824e-06, + "loss": 0.2838, + "step": 22423 + }, + { + "epoch": 1.104150841020076, + "grad_norm": 5.0869550704956055, + "learning_rate": 7.029875291215569e-06, + "loss": 0.2738, + "step": 22424 + }, + { + "epoch": 1.1041644058600109, + "grad_norm": 5.7983503341674805, + "learning_rate": 7.029738248595314e-06, + "loss": 0.3255, + "step": 22425 + }, + { + "epoch": 1.1041779706999457, + "grad_norm": 7.7862348556518555, + "learning_rate": 7.029601205975058e-06, + "loss": 0.3221, + "step": 22426 + }, + { + "epoch": 1.1041915355398806, + "grad_norm": 4.484909534454346, + "learning_rate": 7.029464163354804e-06, + "loss": 0.2196, + "step": 22427 + }, + { + "epoch": 1.1042051003798155, + "grad_norm": 4.393172264099121, + "learning_rate": 7.029327120734549e-06, + "loss": 0.128, + "step": 22428 + }, + { + "epoch": 1.1042186652197503, + "grad_norm": 6.836173057556152, + "learning_rate": 7.029190078114294e-06, + "loss": 0.5295, + "step": 22429 + }, + { + "epoch": 1.1042322300596852, + "grad_norm": 4.580560684204102, + "learning_rate": 7.029053035494039e-06, + "loss": 0.1837, + "step": 22430 + }, + { + "epoch": 1.1042457948996203, + "grad_norm": 5.296928405761719, + "learning_rate": 7.028915992873784e-06, + "loss": 0.1836, + "step": 22431 + }, + { + "epoch": 1.1042593597395551, + "grad_norm": 5.114902019500732, + "learning_rate": 7.02877895025353e-06, + "loss": 0.2985, + "step": 22432 + }, + { + "epoch": 1.10427292457949, + "grad_norm": 4.770379066467285, + "learning_rate": 7.0286419076332744e-06, + "loss": 0.3725, + "step": 22433 + }, + { + "epoch": 1.1042864894194249, + "grad_norm": 3.743574619293213, + "learning_rate": 7.02850486501302e-06, + "loss": 0.1715, + "step": 22434 + }, + { + "epoch": 1.1043000542593597, + "grad_norm": 4.661348819732666, + "learning_rate": 7.028367822392764e-06, + "loss": 0.1747, + "step": 22435 + }, + { + "epoch": 1.1043136190992946, + "grad_norm": 6.692708969116211, + "learning_rate": 7.02823077977251e-06, + "loss": 0.3025, + "step": 22436 + }, + { + "epoch": 1.1043271839392295, + "grad_norm": 6.4442338943481445, + "learning_rate": 7.028093737152255e-06, + "loss": 0.2658, + "step": 22437 + }, + { + "epoch": 1.1043407487791643, + "grad_norm": 5.979098320007324, + "learning_rate": 7.0279566945319995e-06, + "loss": 0.3221, + "step": 22438 + }, + { + "epoch": 1.1043543136190992, + "grad_norm": 5.668351650238037, + "learning_rate": 7.027819651911745e-06, + "loss": 0.2043, + "step": 22439 + }, + { + "epoch": 1.1043678784590343, + "grad_norm": 6.126551628112793, + "learning_rate": 7.027682609291491e-06, + "loss": 0.2612, + "step": 22440 + }, + { + "epoch": 1.1043814432989691, + "grad_norm": 4.717061996459961, + "learning_rate": 7.027545566671236e-06, + "loss": 0.2881, + "step": 22441 + }, + { + "epoch": 1.104395008138904, + "grad_norm": 4.917538166046143, + "learning_rate": 7.02740852405098e-06, + "loss": 0.2357, + "step": 22442 + }, + { + "epoch": 1.1044085729788389, + "grad_norm": 4.93299674987793, + "learning_rate": 7.027271481430725e-06, + "loss": 0.2806, + "step": 22443 + }, + { + "epoch": 1.1044221378187737, + "grad_norm": 4.489259243011475, + "learning_rate": 7.02713443881047e-06, + "loss": 0.2823, + "step": 22444 + }, + { + "epoch": 1.1044357026587086, + "grad_norm": 3.4941673278808594, + "learning_rate": 7.026997396190216e-06, + "loss": 0.2007, + "step": 22445 + }, + { + "epoch": 1.1044492674986435, + "grad_norm": 7.43228816986084, + "learning_rate": 7.026860353569961e-06, + "loss": 0.3543, + "step": 22446 + }, + { + "epoch": 1.1044628323385783, + "grad_norm": 5.059804916381836, + "learning_rate": 7.026723310949706e-06, + "loss": 0.2666, + "step": 22447 + }, + { + "epoch": 1.1044763971785132, + "grad_norm": 4.330320358276367, + "learning_rate": 7.0265862683294504e-06, + "loss": 0.2155, + "step": 22448 + }, + { + "epoch": 1.104489962018448, + "grad_norm": 7.408594131469727, + "learning_rate": 7.0264492257091965e-06, + "loss": 0.4021, + "step": 22449 + }, + { + "epoch": 1.1045035268583832, + "grad_norm": 5.109420299530029, + "learning_rate": 7.026312183088942e-06, + "loss": 0.2916, + "step": 22450 + }, + { + "epoch": 1.104517091698318, + "grad_norm": 4.666754245758057, + "learning_rate": 7.026175140468686e-06, + "loss": 0.293, + "step": 22451 + }, + { + "epoch": 1.1045306565382529, + "grad_norm": 3.777374029159546, + "learning_rate": 7.026038097848431e-06, + "loss": 0.2147, + "step": 22452 + }, + { + "epoch": 1.1045442213781878, + "grad_norm": 5.002277374267578, + "learning_rate": 7.025901055228177e-06, + "loss": 0.2758, + "step": 22453 + }, + { + "epoch": 1.1045577862181226, + "grad_norm": 3.8081281185150146, + "learning_rate": 7.0257640126079215e-06, + "loss": 0.2653, + "step": 22454 + }, + { + "epoch": 1.1045713510580575, + "grad_norm": 3.4163897037506104, + "learning_rate": 7.025626969987667e-06, + "loss": 0.142, + "step": 22455 + }, + { + "epoch": 1.1045849158979923, + "grad_norm": 5.020887851715088, + "learning_rate": 7.025489927367412e-06, + "loss": 0.2448, + "step": 22456 + }, + { + "epoch": 1.1045984807379272, + "grad_norm": 4.96798038482666, + "learning_rate": 7.025352884747158e-06, + "loss": 0.2454, + "step": 22457 + }, + { + "epoch": 1.104612045577862, + "grad_norm": 3.3117401599884033, + "learning_rate": 7.025215842126902e-06, + "loss": 0.2159, + "step": 22458 + }, + { + "epoch": 1.1046256104177972, + "grad_norm": 6.215953826904297, + "learning_rate": 7.025078799506647e-06, + "loss": 0.3019, + "step": 22459 + }, + { + "epoch": 1.104639175257732, + "grad_norm": 5.9836506843566895, + "learning_rate": 7.024941756886392e-06, + "loss": 0.3127, + "step": 22460 + }, + { + "epoch": 1.104652740097667, + "grad_norm": 4.650848388671875, + "learning_rate": 7.024804714266137e-06, + "loss": 0.2664, + "step": 22461 + }, + { + "epoch": 1.1046663049376018, + "grad_norm": 5.2307305335998535, + "learning_rate": 7.024667671645883e-06, + "loss": 0.339, + "step": 22462 + }, + { + "epoch": 1.1046798697775366, + "grad_norm": 4.496047496795654, + "learning_rate": 7.024530629025627e-06, + "loss": 0.252, + "step": 22463 + }, + { + "epoch": 1.1046934346174715, + "grad_norm": 5.275538444519043, + "learning_rate": 7.0243935864053725e-06, + "loss": 0.2541, + "step": 22464 + }, + { + "epoch": 1.1047069994574064, + "grad_norm": 4.927874565124512, + "learning_rate": 7.024256543785118e-06, + "loss": 0.2866, + "step": 22465 + }, + { + "epoch": 1.1047205642973412, + "grad_norm": 5.769728183746338, + "learning_rate": 7.024119501164864e-06, + "loss": 0.4118, + "step": 22466 + }, + { + "epoch": 1.104734129137276, + "grad_norm": 4.26428747177124, + "learning_rate": 7.023982458544608e-06, + "loss": 0.2299, + "step": 22467 + }, + { + "epoch": 1.104747693977211, + "grad_norm": 4.376032829284668, + "learning_rate": 7.023845415924353e-06, + "loss": 0.2198, + "step": 22468 + }, + { + "epoch": 1.104761258817146, + "grad_norm": 4.0396552085876465, + "learning_rate": 7.0237083733040975e-06, + "loss": 0.2117, + "step": 22469 + }, + { + "epoch": 1.104774823657081, + "grad_norm": 3.3771183490753174, + "learning_rate": 7.0235713306838435e-06, + "loss": 0.1836, + "step": 22470 + }, + { + "epoch": 1.1047883884970158, + "grad_norm": 3.377312660217285, + "learning_rate": 7.023434288063589e-06, + "loss": 0.1529, + "step": 22471 + }, + { + "epoch": 1.1048019533369506, + "grad_norm": 5.270623207092285, + "learning_rate": 7.023297245443334e-06, + "loss": 0.2857, + "step": 22472 + }, + { + "epoch": 1.1048155181768855, + "grad_norm": 5.180190563201904, + "learning_rate": 7.023160202823078e-06, + "loss": 0.3416, + "step": 22473 + }, + { + "epoch": 1.1048290830168204, + "grad_norm": 5.846445560455322, + "learning_rate": 7.023023160202823e-06, + "loss": 0.3201, + "step": 22474 + }, + { + "epoch": 1.1048426478567552, + "grad_norm": 7.056778907775879, + "learning_rate": 7.022886117582569e-06, + "loss": 0.3649, + "step": 22475 + }, + { + "epoch": 1.10485621269669, + "grad_norm": 4.032756805419922, + "learning_rate": 7.022749074962314e-06, + "loss": 0.255, + "step": 22476 + }, + { + "epoch": 1.104869777536625, + "grad_norm": 4.18048620223999, + "learning_rate": 7.022612032342059e-06, + "loss": 0.1576, + "step": 22477 + }, + { + "epoch": 1.10488334237656, + "grad_norm": 3.8976221084594727, + "learning_rate": 7.022474989721803e-06, + "loss": 0.1855, + "step": 22478 + }, + { + "epoch": 1.104896907216495, + "grad_norm": 4.10650110244751, + "learning_rate": 7.022337947101549e-06, + "loss": 0.2864, + "step": 22479 + }, + { + "epoch": 1.1049104720564298, + "grad_norm": 3.5765292644500732, + "learning_rate": 7.0222009044812945e-06, + "loss": 0.2346, + "step": 22480 + }, + { + "epoch": 1.1049240368963646, + "grad_norm": 6.114706516265869, + "learning_rate": 7.02206386186104e-06, + "loss": 0.3063, + "step": 22481 + }, + { + "epoch": 1.1049376017362995, + "grad_norm": 4.263208866119385, + "learning_rate": 7.021926819240784e-06, + "loss": 0.2342, + "step": 22482 + }, + { + "epoch": 1.1049511665762344, + "grad_norm": 4.3415045738220215, + "learning_rate": 7.02178977662053e-06, + "loss": 0.3378, + "step": 22483 + }, + { + "epoch": 1.1049647314161692, + "grad_norm": 5.455894470214844, + "learning_rate": 7.021652734000275e-06, + "loss": 0.3278, + "step": 22484 + }, + { + "epoch": 1.104978296256104, + "grad_norm": 6.207569599151611, + "learning_rate": 7.0215156913800195e-06, + "loss": 0.3049, + "step": 22485 + }, + { + "epoch": 1.104991861096039, + "grad_norm": 5.745408058166504, + "learning_rate": 7.021378648759765e-06, + "loss": 0.2415, + "step": 22486 + }, + { + "epoch": 1.1050054259359738, + "grad_norm": 4.708537578582764, + "learning_rate": 7.021241606139509e-06, + "loss": 0.2208, + "step": 22487 + }, + { + "epoch": 1.105018990775909, + "grad_norm": 5.555124282836914, + "learning_rate": 7.021104563519255e-06, + "loss": 0.2582, + "step": 22488 + }, + { + "epoch": 1.1050325556158438, + "grad_norm": 3.419677495956421, + "learning_rate": 7.020967520899e-06, + "loss": 0.2121, + "step": 22489 + }, + { + "epoch": 1.1050461204557787, + "grad_norm": 5.157042980194092, + "learning_rate": 7.020830478278745e-06, + "loss": 0.1703, + "step": 22490 + }, + { + "epoch": 1.1050596852957135, + "grad_norm": 5.999730110168457, + "learning_rate": 7.02069343565849e-06, + "loss": 0.37, + "step": 22491 + }, + { + "epoch": 1.1050732501356484, + "grad_norm": 5.539285182952881, + "learning_rate": 7.020556393038236e-06, + "loss": 0.2762, + "step": 22492 + }, + { + "epoch": 1.1050868149755833, + "grad_norm": 5.201570510864258, + "learning_rate": 7.020419350417981e-06, + "loss": 0.2667, + "step": 22493 + }, + { + "epoch": 1.1051003798155181, + "grad_norm": 3.8716118335723877, + "learning_rate": 7.020282307797725e-06, + "loss": 0.2209, + "step": 22494 + }, + { + "epoch": 1.105113944655453, + "grad_norm": 4.102123737335205, + "learning_rate": 7.0201452651774705e-06, + "loss": 0.2264, + "step": 22495 + }, + { + "epoch": 1.105127509495388, + "grad_norm": 5.2577900886535645, + "learning_rate": 7.0200082225572165e-06, + "loss": 0.2034, + "step": 22496 + }, + { + "epoch": 1.105141074335323, + "grad_norm": 5.900293827056885, + "learning_rate": 7.019871179936961e-06, + "loss": 0.2928, + "step": 22497 + }, + { + "epoch": 1.1051546391752578, + "grad_norm": 5.506930828094482, + "learning_rate": 7.019734137316706e-06, + "loss": 0.3265, + "step": 22498 + }, + { + "epoch": 1.1051682040151927, + "grad_norm": 8.917940139770508, + "learning_rate": 7.019597094696451e-06, + "loss": 0.3815, + "step": 22499 + }, + { + "epoch": 1.1051817688551275, + "grad_norm": 5.460335731506348, + "learning_rate": 7.0194600520761955e-06, + "loss": 0.285, + "step": 22500 + }, + { + "epoch": 1.1051953336950624, + "grad_norm": 6.88640022277832, + "learning_rate": 7.0193230094559415e-06, + "loss": 0.2672, + "step": 22501 + }, + { + "epoch": 1.1052088985349973, + "grad_norm": 5.568302631378174, + "learning_rate": 7.019185966835687e-06, + "loss": 0.2688, + "step": 22502 + }, + { + "epoch": 1.1052224633749321, + "grad_norm": 4.436892509460449, + "learning_rate": 7.019048924215431e-06, + "loss": 0.2129, + "step": 22503 + }, + { + "epoch": 1.105236028214867, + "grad_norm": 5.996561527252197, + "learning_rate": 7.018911881595176e-06, + "loss": 0.2979, + "step": 22504 + }, + { + "epoch": 1.1052495930548019, + "grad_norm": 8.310043334960938, + "learning_rate": 7.018774838974922e-06, + "loss": 0.4602, + "step": 22505 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 4.961140155792236, + "learning_rate": 7.0186377963546674e-06, + "loss": 0.2809, + "step": 22506 + }, + { + "epoch": 1.1052767227346718, + "grad_norm": 4.993481159210205, + "learning_rate": 7.018500753734412e-06, + "loss": 0.1832, + "step": 22507 + }, + { + "epoch": 1.1052902875746067, + "grad_norm": 5.617534637451172, + "learning_rate": 7.018363711114157e-06, + "loss": 0.319, + "step": 22508 + }, + { + "epoch": 1.1053038524145415, + "grad_norm": 4.902162075042725, + "learning_rate": 7.018226668493903e-06, + "loss": 0.2732, + "step": 22509 + }, + { + "epoch": 1.1053174172544764, + "grad_norm": 4.6794114112854, + "learning_rate": 7.018089625873647e-06, + "loss": 0.208, + "step": 22510 + }, + { + "epoch": 1.1053309820944113, + "grad_norm": 5.783430099487305, + "learning_rate": 7.0179525832533925e-06, + "loss": 0.3759, + "step": 22511 + }, + { + "epoch": 1.1053445469343461, + "grad_norm": 5.64375114440918, + "learning_rate": 7.017815540633137e-06, + "loss": 0.2837, + "step": 22512 + }, + { + "epoch": 1.105358111774281, + "grad_norm": 5.232654571533203, + "learning_rate": 7.017678498012882e-06, + "loss": 0.377, + "step": 22513 + }, + { + "epoch": 1.1053716766142159, + "grad_norm": 6.121705532073975, + "learning_rate": 7.017541455392628e-06, + "loss": 0.3828, + "step": 22514 + }, + { + "epoch": 1.105385241454151, + "grad_norm": 6.667280673980713, + "learning_rate": 7.017404412772373e-06, + "loss": 0.3534, + "step": 22515 + }, + { + "epoch": 1.1053988062940858, + "grad_norm": 4.187800407409668, + "learning_rate": 7.0172673701521175e-06, + "loss": 0.2689, + "step": 22516 + }, + { + "epoch": 1.1054123711340207, + "grad_norm": 5.695520401000977, + "learning_rate": 7.017130327531863e-06, + "loss": 0.3032, + "step": 22517 + }, + { + "epoch": 1.1054259359739556, + "grad_norm": 6.892943382263184, + "learning_rate": 7.016993284911609e-06, + "loss": 0.3853, + "step": 22518 + }, + { + "epoch": 1.1054395008138904, + "grad_norm": 3.956395149230957, + "learning_rate": 7.016856242291353e-06, + "loss": 0.2172, + "step": 22519 + }, + { + "epoch": 1.1054530656538253, + "grad_norm": 6.074925422668457, + "learning_rate": 7.016719199671098e-06, + "loss": 0.4154, + "step": 22520 + }, + { + "epoch": 1.1054666304937601, + "grad_norm": 4.807430267333984, + "learning_rate": 7.016582157050843e-06, + "loss": 0.334, + "step": 22521 + }, + { + "epoch": 1.105480195333695, + "grad_norm": 5.376162528991699, + "learning_rate": 7.016445114430589e-06, + "loss": 0.2929, + "step": 22522 + }, + { + "epoch": 1.1054937601736299, + "grad_norm": 5.628761291503906, + "learning_rate": 7.016308071810334e-06, + "loss": 0.2959, + "step": 22523 + }, + { + "epoch": 1.1055073250135647, + "grad_norm": 4.138385772705078, + "learning_rate": 7.016171029190079e-06, + "loss": 0.2555, + "step": 22524 + }, + { + "epoch": 1.1055208898534996, + "grad_norm": 6.208786964416504, + "learning_rate": 7.016033986569823e-06, + "loss": 0.3911, + "step": 22525 + }, + { + "epoch": 1.1055344546934347, + "grad_norm": 4.94911003112793, + "learning_rate": 7.015896943949569e-06, + "loss": 0.3007, + "step": 22526 + }, + { + "epoch": 1.1055480195333696, + "grad_norm": 5.620578289031982, + "learning_rate": 7.0157599013293145e-06, + "loss": 0.2642, + "step": 22527 + }, + { + "epoch": 1.1055615843733044, + "grad_norm": 4.33248233795166, + "learning_rate": 7.015622858709059e-06, + "loss": 0.2848, + "step": 22528 + }, + { + "epoch": 1.1055751492132393, + "grad_norm": 3.77367901802063, + "learning_rate": 7.015485816088804e-06, + "loss": 0.2179, + "step": 22529 + }, + { + "epoch": 1.1055887140531742, + "grad_norm": 5.4688310623168945, + "learning_rate": 7.015348773468549e-06, + "loss": 0.3404, + "step": 22530 + }, + { + "epoch": 1.105602278893109, + "grad_norm": 4.376907825469971, + "learning_rate": 7.015211730848294e-06, + "loss": 0.1885, + "step": 22531 + }, + { + "epoch": 1.105615843733044, + "grad_norm": 6.1068243980407715, + "learning_rate": 7.0150746882280395e-06, + "loss": 0.3154, + "step": 22532 + }, + { + "epoch": 1.1056294085729788, + "grad_norm": 4.949313640594482, + "learning_rate": 7.014937645607785e-06, + "loss": 0.3193, + "step": 22533 + }, + { + "epoch": 1.1056429734129138, + "grad_norm": 5.851253032684326, + "learning_rate": 7.014800602987529e-06, + "loss": 0.2798, + "step": 22534 + }, + { + "epoch": 1.1056565382528487, + "grad_norm": 5.374857425689697, + "learning_rate": 7.014663560367275e-06, + "loss": 0.2248, + "step": 22535 + }, + { + "epoch": 1.1056701030927836, + "grad_norm": 5.2003045082092285, + "learning_rate": 7.01452651774702e-06, + "loss": 0.2994, + "step": 22536 + }, + { + "epoch": 1.1056836679327184, + "grad_norm": 4.279813289642334, + "learning_rate": 7.014389475126765e-06, + "loss": 0.2423, + "step": 22537 + }, + { + "epoch": 1.1056972327726533, + "grad_norm": 5.93353796005249, + "learning_rate": 7.01425243250651e-06, + "loss": 0.3219, + "step": 22538 + }, + { + "epoch": 1.1057107976125882, + "grad_norm": 7.433802604675293, + "learning_rate": 7.014115389886256e-06, + "loss": 0.2992, + "step": 22539 + }, + { + "epoch": 1.105724362452523, + "grad_norm": 7.146437168121338, + "learning_rate": 7.013978347266001e-06, + "loss": 0.3599, + "step": 22540 + }, + { + "epoch": 1.105737927292458, + "grad_norm": 8.533370018005371, + "learning_rate": 7.013841304645745e-06, + "loss": 0.324, + "step": 22541 + }, + { + "epoch": 1.1057514921323928, + "grad_norm": 5.19612455368042, + "learning_rate": 7.0137042620254905e-06, + "loss": 0.1926, + "step": 22542 + }, + { + "epoch": 1.1057650569723276, + "grad_norm": 6.509846210479736, + "learning_rate": 7.013567219405235e-06, + "loss": 0.2362, + "step": 22543 + }, + { + "epoch": 1.1057786218122627, + "grad_norm": 4.236358165740967, + "learning_rate": 7.013430176784981e-06, + "loss": 0.1559, + "step": 22544 + }, + { + "epoch": 1.1057921866521976, + "grad_norm": 3.4198720455169678, + "learning_rate": 7.013293134164726e-06, + "loss": 0.1648, + "step": 22545 + }, + { + "epoch": 1.1058057514921324, + "grad_norm": 4.514297008514404, + "learning_rate": 7.01315609154447e-06, + "loss": 0.1535, + "step": 22546 + }, + { + "epoch": 1.1058193163320673, + "grad_norm": 4.320485591888428, + "learning_rate": 7.0130190489242155e-06, + "loss": 0.2163, + "step": 22547 + }, + { + "epoch": 1.1058328811720022, + "grad_norm": 4.2579193115234375, + "learning_rate": 7.0128820063039616e-06, + "loss": 0.2145, + "step": 22548 + }, + { + "epoch": 1.105846446011937, + "grad_norm": 4.249957084655762, + "learning_rate": 7.012744963683707e-06, + "loss": 0.1543, + "step": 22549 + }, + { + "epoch": 1.105860010851872, + "grad_norm": 3.502023458480835, + "learning_rate": 7.012607921063451e-06, + "loss": 0.1723, + "step": 22550 + }, + { + "epoch": 1.1058735756918068, + "grad_norm": 5.719997406005859, + "learning_rate": 7.012470878443196e-06, + "loss": 0.1935, + "step": 22551 + }, + { + "epoch": 1.1058871405317416, + "grad_norm": 3.2677273750305176, + "learning_rate": 7.012333835822942e-06, + "loss": 0.158, + "step": 22552 + }, + { + "epoch": 1.1059007053716767, + "grad_norm": 3.888681411743164, + "learning_rate": 7.012196793202687e-06, + "loss": 0.1985, + "step": 22553 + }, + { + "epoch": 1.1059142702116116, + "grad_norm": 5.658121109008789, + "learning_rate": 7.012059750582432e-06, + "loss": 0.197, + "step": 22554 + }, + { + "epoch": 1.1059278350515465, + "grad_norm": 4.426000595092773, + "learning_rate": 7.011922707962177e-06, + "loss": 0.2924, + "step": 22555 + }, + { + "epoch": 1.1059413998914813, + "grad_norm": 4.704778671264648, + "learning_rate": 7.011785665341921e-06, + "loss": 0.1881, + "step": 22556 + }, + { + "epoch": 1.1059549647314162, + "grad_norm": 6.086453437805176, + "learning_rate": 7.011648622721667e-06, + "loss": 0.326, + "step": 22557 + }, + { + "epoch": 1.105968529571351, + "grad_norm": 6.196407794952393, + "learning_rate": 7.0115115801014125e-06, + "loss": 0.316, + "step": 22558 + }, + { + "epoch": 1.105982094411286, + "grad_norm": 3.7151060104370117, + "learning_rate": 7.011374537481157e-06, + "loss": 0.1759, + "step": 22559 + }, + { + "epoch": 1.1059956592512208, + "grad_norm": 5.817206859588623, + "learning_rate": 7.011237494860902e-06, + "loss": 0.3092, + "step": 22560 + }, + { + "epoch": 1.1060092240911557, + "grad_norm": 4.781109809875488, + "learning_rate": 7.011100452240648e-06, + "loss": 0.2994, + "step": 22561 + }, + { + "epoch": 1.1060227889310905, + "grad_norm": 7.672292232513428, + "learning_rate": 7.010963409620392e-06, + "loss": 0.3147, + "step": 22562 + }, + { + "epoch": 1.1060363537710256, + "grad_norm": 5.332932949066162, + "learning_rate": 7.0108263670001375e-06, + "loss": 0.2443, + "step": 22563 + }, + { + "epoch": 1.1060499186109605, + "grad_norm": 6.507438659667969, + "learning_rate": 7.010689324379883e-06, + "loss": 0.3177, + "step": 22564 + }, + { + "epoch": 1.1060634834508953, + "grad_norm": 3.9674720764160156, + "learning_rate": 7.010552281759628e-06, + "loss": 0.1736, + "step": 22565 + }, + { + "epoch": 1.1060770482908302, + "grad_norm": 5.76011848449707, + "learning_rate": 7.010415239139373e-06, + "loss": 0.3371, + "step": 22566 + }, + { + "epoch": 1.106090613130765, + "grad_norm": 5.546158790588379, + "learning_rate": 7.010278196519118e-06, + "loss": 0.2517, + "step": 22567 + }, + { + "epoch": 1.1061041779707, + "grad_norm": 4.981110095977783, + "learning_rate": 7.010141153898863e-06, + "loss": 0.2461, + "step": 22568 + }, + { + "epoch": 1.1061177428106348, + "grad_norm": 3.8365728855133057, + "learning_rate": 7.010004111278608e-06, + "loss": 0.158, + "step": 22569 + }, + { + "epoch": 1.1061313076505697, + "grad_norm": 5.534775733947754, + "learning_rate": 7.009867068658354e-06, + "loss": 0.326, + "step": 22570 + }, + { + "epoch": 1.1061448724905045, + "grad_norm": 4.054468631744385, + "learning_rate": 7.009730026038098e-06, + "loss": 0.0902, + "step": 22571 + }, + { + "epoch": 1.1061584373304396, + "grad_norm": 5.849312782287598, + "learning_rate": 7.009592983417843e-06, + "loss": 0.3043, + "step": 22572 + }, + { + "epoch": 1.1061720021703745, + "grad_norm": 3.9975028038024902, + "learning_rate": 7.0094559407975885e-06, + "loss": 0.1975, + "step": 22573 + }, + { + "epoch": 1.1061855670103093, + "grad_norm": 5.452641010284424, + "learning_rate": 7.0093188981773345e-06, + "loss": 0.1957, + "step": 22574 + }, + { + "epoch": 1.1061991318502442, + "grad_norm": 6.710830211639404, + "learning_rate": 7.009181855557079e-06, + "loss": 0.2654, + "step": 22575 + }, + { + "epoch": 1.106212696690179, + "grad_norm": 6.17555046081543, + "learning_rate": 7.009044812936824e-06, + "loss": 0.3644, + "step": 22576 + }, + { + "epoch": 1.106226261530114, + "grad_norm": 4.038115978240967, + "learning_rate": 7.008907770316568e-06, + "loss": 0.2489, + "step": 22577 + }, + { + "epoch": 1.1062398263700488, + "grad_norm": 4.4034504890441895, + "learning_rate": 7.008770727696314e-06, + "loss": 0.2689, + "step": 22578 + }, + { + "epoch": 1.1062533912099837, + "grad_norm": 6.6921868324279785, + "learning_rate": 7.0086336850760596e-06, + "loss": 0.4183, + "step": 22579 + }, + { + "epoch": 1.1062669560499185, + "grad_norm": 4.949176788330078, + "learning_rate": 7.008496642455804e-06, + "loss": 0.2503, + "step": 22580 + }, + { + "epoch": 1.1062805208898534, + "grad_norm": 5.789235591888428, + "learning_rate": 7.008359599835549e-06, + "loss": 0.3435, + "step": 22581 + }, + { + "epoch": 1.1062940857297885, + "grad_norm": 4.050034046173096, + "learning_rate": 7.008222557215294e-06, + "loss": 0.2934, + "step": 22582 + }, + { + "epoch": 1.1063076505697234, + "grad_norm": 7.342320442199707, + "learning_rate": 7.00808551459504e-06, + "loss": 0.3687, + "step": 22583 + }, + { + "epoch": 1.1063212154096582, + "grad_norm": 5.075028896331787, + "learning_rate": 7.007948471974785e-06, + "loss": 0.43, + "step": 22584 + }, + { + "epoch": 1.106334780249593, + "grad_norm": 4.058945655822754, + "learning_rate": 7.00781142935453e-06, + "loss": 0.2061, + "step": 22585 + }, + { + "epoch": 1.106348345089528, + "grad_norm": 6.892299652099609, + "learning_rate": 7.007674386734274e-06, + "loss": 0.362, + "step": 22586 + }, + { + "epoch": 1.1063619099294628, + "grad_norm": 5.239435195922852, + "learning_rate": 7.00753734411402e-06, + "loss": 0.351, + "step": 22587 + }, + { + "epoch": 1.1063754747693977, + "grad_norm": 5.62973690032959, + "learning_rate": 7.007400301493765e-06, + "loss": 0.3707, + "step": 22588 + }, + { + "epoch": 1.1063890396093325, + "grad_norm": 4.847741603851318, + "learning_rate": 7.0072632588735105e-06, + "loss": 0.2456, + "step": 22589 + }, + { + "epoch": 1.1064026044492674, + "grad_norm": 6.592720031738281, + "learning_rate": 7.007126216253255e-06, + "loss": 0.4632, + "step": 22590 + }, + { + "epoch": 1.1064161692892025, + "grad_norm": 5.254017353057861, + "learning_rate": 7.006989173633001e-06, + "loss": 0.3252, + "step": 22591 + }, + { + "epoch": 1.1064297341291374, + "grad_norm": 6.912263870239258, + "learning_rate": 7.006852131012746e-06, + "loss": 0.3566, + "step": 22592 + }, + { + "epoch": 1.1064432989690722, + "grad_norm": 6.006343841552734, + "learning_rate": 7.00671508839249e-06, + "loss": 0.2599, + "step": 22593 + }, + { + "epoch": 1.106456863809007, + "grad_norm": 5.395478248596191, + "learning_rate": 7.0065780457722355e-06, + "loss": 0.3154, + "step": 22594 + }, + { + "epoch": 1.106470428648942, + "grad_norm": 4.214148044586182, + "learning_rate": 7.0064410031519816e-06, + "loss": 0.2494, + "step": 22595 + }, + { + "epoch": 1.1064839934888768, + "grad_norm": 4.372823238372803, + "learning_rate": 7.006303960531726e-06, + "loss": 0.2548, + "step": 22596 + }, + { + "epoch": 1.1064975583288117, + "grad_norm": 5.30962610244751, + "learning_rate": 7.006166917911471e-06, + "loss": 0.2599, + "step": 22597 + }, + { + "epoch": 1.1065111231687466, + "grad_norm": 5.808645725250244, + "learning_rate": 7.006029875291216e-06, + "loss": 0.31, + "step": 22598 + }, + { + "epoch": 1.1065246880086814, + "grad_norm": 6.65463924407959, + "learning_rate": 7.005892832670961e-06, + "loss": 0.5103, + "step": 22599 + }, + { + "epoch": 1.1065382528486163, + "grad_norm": 6.8280839920043945, + "learning_rate": 7.005755790050707e-06, + "loss": 0.4057, + "step": 22600 + }, + { + "epoch": 1.1065518176885514, + "grad_norm": 5.669661521911621, + "learning_rate": 7.005618747430452e-06, + "loss": 0.3141, + "step": 22601 + }, + { + "epoch": 1.1065653825284862, + "grad_norm": 4.214789390563965, + "learning_rate": 7.005481704810196e-06, + "loss": 0.3229, + "step": 22602 + }, + { + "epoch": 1.106578947368421, + "grad_norm": 5.39571475982666, + "learning_rate": 7.005344662189941e-06, + "loss": 0.3131, + "step": 22603 + }, + { + "epoch": 1.106592512208356, + "grad_norm": 5.478862762451172, + "learning_rate": 7.005207619569687e-06, + "loss": 0.213, + "step": 22604 + }, + { + "epoch": 1.1066060770482908, + "grad_norm": 4.079977989196777, + "learning_rate": 7.005070576949432e-06, + "loss": 0.2414, + "step": 22605 + }, + { + "epoch": 1.1066196418882257, + "grad_norm": 5.5265398025512695, + "learning_rate": 7.004933534329177e-06, + "loss": 0.2317, + "step": 22606 + }, + { + "epoch": 1.1066332067281606, + "grad_norm": 4.8463544845581055, + "learning_rate": 7.004796491708922e-06, + "loss": 0.1964, + "step": 22607 + }, + { + "epoch": 1.1066467715680954, + "grad_norm": 4.342730522155762, + "learning_rate": 7.004659449088668e-06, + "loss": 0.1875, + "step": 22608 + }, + { + "epoch": 1.1066603364080303, + "grad_norm": 5.389781951904297, + "learning_rate": 7.004522406468412e-06, + "loss": 0.3218, + "step": 22609 + }, + { + "epoch": 1.1066739012479654, + "grad_norm": 6.39558219909668, + "learning_rate": 7.0043853638481576e-06, + "loss": 0.2412, + "step": 22610 + }, + { + "epoch": 1.1066874660879003, + "grad_norm": 5.529685020446777, + "learning_rate": 7.004248321227902e-06, + "loss": 0.3499, + "step": 22611 + }, + { + "epoch": 1.1067010309278351, + "grad_norm": 5.12635612487793, + "learning_rate": 7.004111278607647e-06, + "loss": 0.3121, + "step": 22612 + }, + { + "epoch": 1.10671459576777, + "grad_norm": 5.1075029373168945, + "learning_rate": 7.003974235987393e-06, + "loss": 0.3501, + "step": 22613 + }, + { + "epoch": 1.1067281606077048, + "grad_norm": 5.1229753494262695, + "learning_rate": 7.003837193367138e-06, + "loss": 0.4134, + "step": 22614 + }, + { + "epoch": 1.1067417254476397, + "grad_norm": 5.6424150466918945, + "learning_rate": 7.003700150746883e-06, + "loss": 0.2944, + "step": 22615 + }, + { + "epoch": 1.1067552902875746, + "grad_norm": 4.677929878234863, + "learning_rate": 7.003563108126628e-06, + "loss": 0.3589, + "step": 22616 + }, + { + "epoch": 1.1067688551275094, + "grad_norm": 5.272514820098877, + "learning_rate": 7.003426065506374e-06, + "loss": 0.3381, + "step": 22617 + }, + { + "epoch": 1.1067824199674443, + "grad_norm": 4.627695560455322, + "learning_rate": 7.003289022886118e-06, + "loss": 0.294, + "step": 22618 + }, + { + "epoch": 1.1067959848073792, + "grad_norm": 4.333365440368652, + "learning_rate": 7.003151980265863e-06, + "loss": 0.2067, + "step": 22619 + }, + { + "epoch": 1.1068095496473143, + "grad_norm": 5.754977703094482, + "learning_rate": 7.003014937645608e-06, + "loss": 0.368, + "step": 22620 + }, + { + "epoch": 1.1068231144872491, + "grad_norm": 6.493898391723633, + "learning_rate": 7.002877895025354e-06, + "loss": 0.2392, + "step": 22621 + }, + { + "epoch": 1.106836679327184, + "grad_norm": 3.945889472961426, + "learning_rate": 7.002740852405099e-06, + "loss": 0.2796, + "step": 22622 + }, + { + "epoch": 1.1068502441671189, + "grad_norm": 7.107431411743164, + "learning_rate": 7.002603809784844e-06, + "loss": 0.311, + "step": 22623 + }, + { + "epoch": 1.1068638090070537, + "grad_norm": 4.319440841674805, + "learning_rate": 7.002466767164588e-06, + "loss": 0.2193, + "step": 22624 + }, + { + "epoch": 1.1068773738469886, + "grad_norm": 6.809196949005127, + "learning_rate": 7.0023297245443335e-06, + "loss": 0.2839, + "step": 22625 + }, + { + "epoch": 1.1068909386869235, + "grad_norm": 6.224794864654541, + "learning_rate": 7.0021926819240796e-06, + "loss": 0.2262, + "step": 22626 + }, + { + "epoch": 1.1069045035268583, + "grad_norm": 6.1385345458984375, + "learning_rate": 7.002055639303824e-06, + "loss": 0.4241, + "step": 22627 + }, + { + "epoch": 1.1069180683667932, + "grad_norm": 5.931848526000977, + "learning_rate": 7.001918596683569e-06, + "loss": 0.3283, + "step": 22628 + }, + { + "epoch": 1.1069316332067283, + "grad_norm": 5.477395057678223, + "learning_rate": 7.001781554063313e-06, + "loss": 0.3423, + "step": 22629 + }, + { + "epoch": 1.1069451980466631, + "grad_norm": 5.820581912994385, + "learning_rate": 7.0016445114430594e-06, + "loss": 0.3096, + "step": 22630 + }, + { + "epoch": 1.106958762886598, + "grad_norm": 7.06076717376709, + "learning_rate": 7.001507468822805e-06, + "loss": 0.3435, + "step": 22631 + }, + { + "epoch": 1.1069723277265329, + "grad_norm": 5.547469615936279, + "learning_rate": 7.00137042620255e-06, + "loss": 0.3536, + "step": 22632 + }, + { + "epoch": 1.1069858925664677, + "grad_norm": 5.357351779937744, + "learning_rate": 7.001233383582294e-06, + "loss": 0.3274, + "step": 22633 + }, + { + "epoch": 1.1069994574064026, + "grad_norm": 6.889290809631348, + "learning_rate": 7.00109634096204e-06, + "loss": 0.4102, + "step": 22634 + }, + { + "epoch": 1.1070130222463375, + "grad_norm": 7.290041923522949, + "learning_rate": 7.000959298341785e-06, + "loss": 0.4017, + "step": 22635 + }, + { + "epoch": 1.1070265870862723, + "grad_norm": 5.388247489929199, + "learning_rate": 7.00082225572153e-06, + "loss": 0.2786, + "step": 22636 + }, + { + "epoch": 1.1070401519262072, + "grad_norm": 5.264631271362305, + "learning_rate": 7.000685213101275e-06, + "loss": 0.2613, + "step": 22637 + }, + { + "epoch": 1.107053716766142, + "grad_norm": 6.5252556800842285, + "learning_rate": 7.00054817048102e-06, + "loss": 0.3549, + "step": 22638 + }, + { + "epoch": 1.1070672816060771, + "grad_norm": 6.082377910614014, + "learning_rate": 7.000411127860765e-06, + "loss": 0.4367, + "step": 22639 + }, + { + "epoch": 1.107080846446012, + "grad_norm": 7.59814453125, + "learning_rate": 7.00027408524051e-06, + "loss": 0.3035, + "step": 22640 + }, + { + "epoch": 1.1070944112859469, + "grad_norm": 6.130840301513672, + "learning_rate": 7.0001370426202556e-06, + "loss": 0.3022, + "step": 22641 + }, + { + "epoch": 1.1071079761258817, + "grad_norm": 6.2124481201171875, + "learning_rate": 7e-06, + "loss": 0.3237, + "step": 22642 + }, + { + "epoch": 1.1071215409658166, + "grad_norm": 4.5731329917907715, + "learning_rate": 6.999862957379746e-06, + "loss": 0.3249, + "step": 22643 + }, + { + "epoch": 1.1071351058057515, + "grad_norm": 6.481771469116211, + "learning_rate": 6.999725914759491e-06, + "loss": 0.4215, + "step": 22644 + }, + { + "epoch": 1.1071486706456863, + "grad_norm": 8.498922348022461, + "learning_rate": 6.9995888721392354e-06, + "loss": 0.3431, + "step": 22645 + }, + { + "epoch": 1.1071622354856212, + "grad_norm": 4.519162178039551, + "learning_rate": 6.999451829518981e-06, + "loss": 0.2039, + "step": 22646 + }, + { + "epoch": 1.107175800325556, + "grad_norm": 5.054752826690674, + "learning_rate": 6.999314786898727e-06, + "loss": 0.263, + "step": 22647 + }, + { + "epoch": 1.1071893651654912, + "grad_norm": 5.571331024169922, + "learning_rate": 6.999177744278472e-06, + "loss": 0.2728, + "step": 22648 + }, + { + "epoch": 1.107202930005426, + "grad_norm": 5.749016284942627, + "learning_rate": 6.999040701658216e-06, + "loss": 0.1967, + "step": 22649 + }, + { + "epoch": 1.1072164948453609, + "grad_norm": 5.635374546051025, + "learning_rate": 6.998903659037961e-06, + "loss": 0.3829, + "step": 22650 + }, + { + "epoch": 1.1072300596852958, + "grad_norm": 4.976524353027344, + "learning_rate": 6.998766616417706e-06, + "loss": 0.35, + "step": 22651 + }, + { + "epoch": 1.1072436245252306, + "grad_norm": 4.567108631134033, + "learning_rate": 6.998629573797452e-06, + "loss": 0.1942, + "step": 22652 + }, + { + "epoch": 1.1072571893651655, + "grad_norm": 5.0537872314453125, + "learning_rate": 6.998492531177197e-06, + "loss": 0.2632, + "step": 22653 + }, + { + "epoch": 1.1072707542051003, + "grad_norm": 6.441614627838135, + "learning_rate": 6.998355488556941e-06, + "loss": 0.4372, + "step": 22654 + }, + { + "epoch": 1.1072843190450352, + "grad_norm": 3.651191473007202, + "learning_rate": 6.998218445936686e-06, + "loss": 0.1931, + "step": 22655 + }, + { + "epoch": 1.10729788388497, + "grad_norm": 5.059006690979004, + "learning_rate": 6.998081403316432e-06, + "loss": 0.2961, + "step": 22656 + }, + { + "epoch": 1.107311448724905, + "grad_norm": 5.354199409484863, + "learning_rate": 6.997944360696178e-06, + "loss": 0.3601, + "step": 22657 + }, + { + "epoch": 1.10732501356484, + "grad_norm": 4.8535308837890625, + "learning_rate": 6.997807318075922e-06, + "loss": 0.2237, + "step": 22658 + }, + { + "epoch": 1.107338578404775, + "grad_norm": 4.576841354370117, + "learning_rate": 6.997670275455667e-06, + "loss": 0.2431, + "step": 22659 + }, + { + "epoch": 1.1073521432447098, + "grad_norm": 5.200216770172119, + "learning_rate": 6.997533232835413e-06, + "loss": 0.2607, + "step": 22660 + }, + { + "epoch": 1.1073657080846446, + "grad_norm": 6.088696479797363, + "learning_rate": 6.9973961902151574e-06, + "loss": 0.2506, + "step": 22661 + }, + { + "epoch": 1.1073792729245795, + "grad_norm": 4.8592729568481445, + "learning_rate": 6.997259147594903e-06, + "loss": 0.192, + "step": 22662 + }, + { + "epoch": 1.1073928377645144, + "grad_norm": 3.7480151653289795, + "learning_rate": 6.997122104974648e-06, + "loss": 0.1722, + "step": 22663 + }, + { + "epoch": 1.1074064026044492, + "grad_norm": 5.025754928588867, + "learning_rate": 6.996985062354393e-06, + "loss": 0.2951, + "step": 22664 + }, + { + "epoch": 1.107419967444384, + "grad_norm": 3.9197793006896973, + "learning_rate": 6.996848019734138e-06, + "loss": 0.1343, + "step": 22665 + }, + { + "epoch": 1.107433532284319, + "grad_norm": 4.779850959777832, + "learning_rate": 6.996710977113883e-06, + "loss": 0.1977, + "step": 22666 + }, + { + "epoch": 1.107447097124254, + "grad_norm": 5.979096412658691, + "learning_rate": 6.996573934493628e-06, + "loss": 0.3404, + "step": 22667 + }, + { + "epoch": 1.107460661964189, + "grad_norm": 4.178238391876221, + "learning_rate": 6.996436891873373e-06, + "loss": 0.2483, + "step": 22668 + }, + { + "epoch": 1.1074742268041238, + "grad_norm": 6.800245761871338, + "learning_rate": 6.996299849253119e-06, + "loss": 0.3039, + "step": 22669 + }, + { + "epoch": 1.1074877916440586, + "grad_norm": 5.15866756439209, + "learning_rate": 6.996162806632863e-06, + "loss": 0.3512, + "step": 22670 + }, + { + "epoch": 1.1075013564839935, + "grad_norm": 5.837335586547852, + "learning_rate": 6.996025764012608e-06, + "loss": 0.2234, + "step": 22671 + }, + { + "epoch": 1.1075149213239284, + "grad_norm": 5.4099345207214355, + "learning_rate": 6.9958887213923536e-06, + "loss": 0.3328, + "step": 22672 + }, + { + "epoch": 1.1075284861638632, + "grad_norm": 7.11249303817749, + "learning_rate": 6.995751678772099e-06, + "loss": 0.2676, + "step": 22673 + }, + { + "epoch": 1.107542051003798, + "grad_norm": 5.858018398284912, + "learning_rate": 6.995614636151844e-06, + "loss": 0.237, + "step": 22674 + }, + { + "epoch": 1.107555615843733, + "grad_norm": 4.632818222045898, + "learning_rate": 6.995477593531589e-06, + "loss": 0.1942, + "step": 22675 + }, + { + "epoch": 1.1075691806836678, + "grad_norm": 6.171887397766113, + "learning_rate": 6.9953405509113334e-06, + "loss": 0.3037, + "step": 22676 + }, + { + "epoch": 1.107582745523603, + "grad_norm": 5.18367862701416, + "learning_rate": 6.9952035082910795e-06, + "loss": 0.2241, + "step": 22677 + }, + { + "epoch": 1.1075963103635378, + "grad_norm": 5.887596607208252, + "learning_rate": 6.995066465670825e-06, + "loss": 0.2379, + "step": 22678 + }, + { + "epoch": 1.1076098752034726, + "grad_norm": 4.134138107299805, + "learning_rate": 6.994929423050569e-06, + "loss": 0.2089, + "step": 22679 + }, + { + "epoch": 1.1076234400434075, + "grad_norm": 4.677131652832031, + "learning_rate": 6.994792380430314e-06, + "loss": 0.182, + "step": 22680 + }, + { + "epoch": 1.1076370048833424, + "grad_norm": 5.393225193023682, + "learning_rate": 6.994655337810059e-06, + "loss": 0.2668, + "step": 22681 + }, + { + "epoch": 1.1076505697232772, + "grad_norm": 3.2701621055603027, + "learning_rate": 6.994518295189805e-06, + "loss": 0.1313, + "step": 22682 + }, + { + "epoch": 1.107664134563212, + "grad_norm": 4.202881336212158, + "learning_rate": 6.99438125256955e-06, + "loss": 0.1626, + "step": 22683 + }, + { + "epoch": 1.107677699403147, + "grad_norm": 6.5511369705200195, + "learning_rate": 6.994244209949295e-06, + "loss": 0.3999, + "step": 22684 + }, + { + "epoch": 1.1076912642430818, + "grad_norm": 4.525315284729004, + "learning_rate": 6.994107167329039e-06, + "loss": 0.2346, + "step": 22685 + }, + { + "epoch": 1.107704829083017, + "grad_norm": 3.814727783203125, + "learning_rate": 6.993970124708785e-06, + "loss": 0.1578, + "step": 22686 + }, + { + "epoch": 1.1077183939229518, + "grad_norm": 3.949154853820801, + "learning_rate": 6.99383308208853e-06, + "loss": 0.108, + "step": 22687 + }, + { + "epoch": 1.1077319587628867, + "grad_norm": 4.787620544433594, + "learning_rate": 6.993696039468275e-06, + "loss": 0.211, + "step": 22688 + }, + { + "epoch": 1.1077455236028215, + "grad_norm": 5.230169773101807, + "learning_rate": 6.99355899684802e-06, + "loss": 0.2091, + "step": 22689 + }, + { + "epoch": 1.1077590884427564, + "grad_norm": 4.259893894195557, + "learning_rate": 6.993421954227766e-06, + "loss": 0.1777, + "step": 22690 + }, + { + "epoch": 1.1077726532826913, + "grad_norm": 3.985619068145752, + "learning_rate": 6.993284911607511e-06, + "loss": 0.1208, + "step": 22691 + }, + { + "epoch": 1.1077862181226261, + "grad_norm": 6.1059346199035645, + "learning_rate": 6.9931478689872555e-06, + "loss": 0.2695, + "step": 22692 + }, + { + "epoch": 1.107799782962561, + "grad_norm": 4.771346569061279, + "learning_rate": 6.993010826367001e-06, + "loss": 0.2433, + "step": 22693 + }, + { + "epoch": 1.1078133478024959, + "grad_norm": 4.429335117340088, + "learning_rate": 6.992873783746745e-06, + "loss": 0.2726, + "step": 22694 + }, + { + "epoch": 1.1078269126424307, + "grad_norm": 4.998783111572266, + "learning_rate": 6.992736741126491e-06, + "loss": 0.142, + "step": 22695 + }, + { + "epoch": 1.1078404774823658, + "grad_norm": 5.328427314758301, + "learning_rate": 6.992599698506236e-06, + "loss": 0.3397, + "step": 22696 + }, + { + "epoch": 1.1078540423223007, + "grad_norm": 4.484247207641602, + "learning_rate": 6.992462655885981e-06, + "loss": 0.247, + "step": 22697 + }, + { + "epoch": 1.1078676071622355, + "grad_norm": 6.344889163970947, + "learning_rate": 6.992325613265726e-06, + "loss": 0.2532, + "step": 22698 + }, + { + "epoch": 1.1078811720021704, + "grad_norm": 5.8375244140625, + "learning_rate": 6.992188570645472e-06, + "loss": 0.1425, + "step": 22699 + }, + { + "epoch": 1.1078947368421053, + "grad_norm": 6.483186721801758, + "learning_rate": 6.992051528025217e-06, + "loss": 0.3682, + "step": 22700 + }, + { + "epoch": 1.1079083016820401, + "grad_norm": 5.655389308929443, + "learning_rate": 6.991914485404961e-06, + "loss": 0.1694, + "step": 22701 + }, + { + "epoch": 1.107921866521975, + "grad_norm": 5.317896842956543, + "learning_rate": 6.991777442784706e-06, + "loss": 0.2154, + "step": 22702 + }, + { + "epoch": 1.1079354313619099, + "grad_norm": 6.352682113647461, + "learning_rate": 6.991640400164452e-06, + "loss": 0.1962, + "step": 22703 + }, + { + "epoch": 1.1079489962018447, + "grad_norm": 4.915862560272217, + "learning_rate": 6.991503357544197e-06, + "loss": 0.2103, + "step": 22704 + }, + { + "epoch": 1.1079625610417798, + "grad_norm": 7.630655288696289, + "learning_rate": 6.991366314923942e-06, + "loss": 0.2984, + "step": 22705 + }, + { + "epoch": 1.1079761258817147, + "grad_norm": 3.1474335193634033, + "learning_rate": 6.991229272303687e-06, + "loss": 0.094, + "step": 22706 + }, + { + "epoch": 1.1079896907216495, + "grad_norm": 3.8774821758270264, + "learning_rate": 6.9910922296834314e-06, + "loss": 0.1384, + "step": 22707 + }, + { + "epoch": 1.1080032555615844, + "grad_norm": 7.024176120758057, + "learning_rate": 6.9909551870631775e-06, + "loss": 0.2367, + "step": 22708 + }, + { + "epoch": 1.1080168204015193, + "grad_norm": 6.328351974487305, + "learning_rate": 6.990818144442923e-06, + "loss": 0.3075, + "step": 22709 + }, + { + "epoch": 1.1080303852414541, + "grad_norm": 5.134219646453857, + "learning_rate": 6.990681101822667e-06, + "loss": 0.2518, + "step": 22710 + }, + { + "epoch": 1.108043950081389, + "grad_norm": 5.57197904586792, + "learning_rate": 6.990544059202412e-06, + "loss": 0.3544, + "step": 22711 + }, + { + "epoch": 1.1080575149213239, + "grad_norm": 5.330165386199951, + "learning_rate": 6.990407016582158e-06, + "loss": 0.1526, + "step": 22712 + }, + { + "epoch": 1.1080710797612587, + "grad_norm": 3.8631556034088135, + "learning_rate": 6.9902699739619025e-06, + "loss": 0.1276, + "step": 22713 + }, + { + "epoch": 1.1080846446011936, + "grad_norm": 4.299891471862793, + "learning_rate": 6.990132931341648e-06, + "loss": 0.2844, + "step": 22714 + }, + { + "epoch": 1.1080982094411287, + "grad_norm": 7.385756492614746, + "learning_rate": 6.989995888721393e-06, + "loss": 0.3618, + "step": 22715 + }, + { + "epoch": 1.1081117742810636, + "grad_norm": 7.153119087219238, + "learning_rate": 6.989858846101139e-06, + "loss": 0.5611, + "step": 22716 + }, + { + "epoch": 1.1081253391209984, + "grad_norm": 5.3393049240112305, + "learning_rate": 6.989721803480883e-06, + "loss": 0.3284, + "step": 22717 + }, + { + "epoch": 1.1081389039609333, + "grad_norm": 5.357747554779053, + "learning_rate": 6.989584760860628e-06, + "loss": 0.1579, + "step": 22718 + }, + { + "epoch": 1.1081524688008682, + "grad_norm": 5.663236618041992, + "learning_rate": 6.989447718240373e-06, + "loss": 0.1722, + "step": 22719 + }, + { + "epoch": 1.108166033640803, + "grad_norm": 5.3793158531188965, + "learning_rate": 6.989310675620118e-06, + "loss": 0.278, + "step": 22720 + }, + { + "epoch": 1.1081795984807379, + "grad_norm": 6.1415486335754395, + "learning_rate": 6.989173632999864e-06, + "loss": 0.2456, + "step": 22721 + }, + { + "epoch": 1.1081931633206727, + "grad_norm": 4.720492839813232, + "learning_rate": 6.989036590379608e-06, + "loss": 0.2713, + "step": 22722 + }, + { + "epoch": 1.1082067281606076, + "grad_norm": 4.9350056648254395, + "learning_rate": 6.9888995477593535e-06, + "loss": 0.1426, + "step": 22723 + }, + { + "epoch": 1.1082202930005427, + "grad_norm": 6.77484130859375, + "learning_rate": 6.988762505139099e-06, + "loss": 0.3745, + "step": 22724 + }, + { + "epoch": 1.1082338578404776, + "grad_norm": 7.58177375793457, + "learning_rate": 6.988625462518845e-06, + "loss": 0.3354, + "step": 22725 + }, + { + "epoch": 1.1082474226804124, + "grad_norm": 6.2470622062683105, + "learning_rate": 6.988488419898589e-06, + "loss": 0.2681, + "step": 22726 + }, + { + "epoch": 1.1082609875203473, + "grad_norm": 6.9208269119262695, + "learning_rate": 6.988351377278334e-06, + "loss": 0.3049, + "step": 22727 + }, + { + "epoch": 1.1082745523602822, + "grad_norm": 5.606149673461914, + "learning_rate": 6.9882143346580785e-06, + "loss": 0.3838, + "step": 22728 + }, + { + "epoch": 1.108288117200217, + "grad_norm": 6.877389430999756, + "learning_rate": 6.9880772920378245e-06, + "loss": 0.3779, + "step": 22729 + }, + { + "epoch": 1.108301682040152, + "grad_norm": 7.491056442260742, + "learning_rate": 6.98794024941757e-06, + "loss": 0.4336, + "step": 22730 + }, + { + "epoch": 1.1083152468800868, + "grad_norm": 5.154420852661133, + "learning_rate": 6.987803206797315e-06, + "loss": 0.2017, + "step": 22731 + }, + { + "epoch": 1.1083288117200216, + "grad_norm": 5.8491435050964355, + "learning_rate": 6.987666164177059e-06, + "loss": 0.2379, + "step": 22732 + }, + { + "epoch": 1.1083423765599565, + "grad_norm": 5.396849632263184, + "learning_rate": 6.987529121556804e-06, + "loss": 0.2526, + "step": 22733 + }, + { + "epoch": 1.1083559413998916, + "grad_norm": 6.5935444831848145, + "learning_rate": 6.98739207893655e-06, + "loss": 0.231, + "step": 22734 + }, + { + "epoch": 1.1083695062398264, + "grad_norm": 6.1069746017456055, + "learning_rate": 6.987255036316295e-06, + "loss": 0.2439, + "step": 22735 + }, + { + "epoch": 1.1083830710797613, + "grad_norm": 8.437172889709473, + "learning_rate": 6.98711799369604e-06, + "loss": 0.4997, + "step": 22736 + }, + { + "epoch": 1.1083966359196962, + "grad_norm": 5.8102192878723145, + "learning_rate": 6.986980951075784e-06, + "loss": 0.2034, + "step": 22737 + }, + { + "epoch": 1.108410200759631, + "grad_norm": 5.650597095489502, + "learning_rate": 6.98684390845553e-06, + "loss": 0.2337, + "step": 22738 + }, + { + "epoch": 1.108423765599566, + "grad_norm": 6.980090141296387, + "learning_rate": 6.9867068658352755e-06, + "loss": 0.3217, + "step": 22739 + }, + { + "epoch": 1.1084373304395008, + "grad_norm": 5.976596355438232, + "learning_rate": 6.986569823215021e-06, + "loss": 0.2662, + "step": 22740 + }, + { + "epoch": 1.1084508952794356, + "grad_norm": 5.783294677734375, + "learning_rate": 6.986432780594765e-06, + "loss": 0.3245, + "step": 22741 + }, + { + "epoch": 1.1084644601193705, + "grad_norm": 7.172348976135254, + "learning_rate": 6.986295737974511e-06, + "loss": 0.2741, + "step": 22742 + }, + { + "epoch": 1.1084780249593056, + "grad_norm": 5.708014488220215, + "learning_rate": 6.986158695354256e-06, + "loss": 0.3462, + "step": 22743 + }, + { + "epoch": 1.1084915897992405, + "grad_norm": 6.042562961578369, + "learning_rate": 6.9860216527340005e-06, + "loss": 0.2899, + "step": 22744 + }, + { + "epoch": 1.1085051546391753, + "grad_norm": 7.070752143859863, + "learning_rate": 6.985884610113746e-06, + "loss": 0.3608, + "step": 22745 + }, + { + "epoch": 1.1085187194791102, + "grad_norm": 4.9791669845581055, + "learning_rate": 6.985747567493492e-06, + "loss": 0.2988, + "step": 22746 + }, + { + "epoch": 1.108532284319045, + "grad_norm": 8.706849098205566, + "learning_rate": 6.985610524873236e-06, + "loss": 0.4495, + "step": 22747 + }, + { + "epoch": 1.10854584915898, + "grad_norm": 5.791684150695801, + "learning_rate": 6.985473482252981e-06, + "loss": 0.3034, + "step": 22748 + }, + { + "epoch": 1.1085594139989148, + "grad_norm": 6.330395698547363, + "learning_rate": 6.985336439632726e-06, + "loss": 0.4492, + "step": 22749 + }, + { + "epoch": 1.1085729788388496, + "grad_norm": 7.094384670257568, + "learning_rate": 6.985199397012471e-06, + "loss": 0.401, + "step": 22750 + }, + { + "epoch": 1.1085865436787845, + "grad_norm": 4.31268835067749, + "learning_rate": 6.985062354392217e-06, + "loss": 0.2357, + "step": 22751 + }, + { + "epoch": 1.1086001085187194, + "grad_norm": 5.478602886199951, + "learning_rate": 6.984925311771962e-06, + "loss": 0.5012, + "step": 22752 + }, + { + "epoch": 1.1086136733586545, + "grad_norm": 7.073713302612305, + "learning_rate": 6.984788269151706e-06, + "loss": 0.3882, + "step": 22753 + }, + { + "epoch": 1.1086272381985893, + "grad_norm": 4.767673969268799, + "learning_rate": 6.9846512265314515e-06, + "loss": 0.2961, + "step": 22754 + }, + { + "epoch": 1.1086408030385242, + "grad_norm": 5.773034572601318, + "learning_rate": 6.9845141839111975e-06, + "loss": 0.2914, + "step": 22755 + }, + { + "epoch": 1.108654367878459, + "grad_norm": 4.392666816711426, + "learning_rate": 6.984377141290943e-06, + "loss": 0.289, + "step": 22756 + }, + { + "epoch": 1.108667932718394, + "grad_norm": 5.272488594055176, + "learning_rate": 6.984240098670687e-06, + "loss": 0.3599, + "step": 22757 + }, + { + "epoch": 1.1086814975583288, + "grad_norm": 5.399446487426758, + "learning_rate": 6.984103056050432e-06, + "loss": 0.2288, + "step": 22758 + }, + { + "epoch": 1.1086950623982637, + "grad_norm": 4.304511070251465, + "learning_rate": 6.983966013430178e-06, + "loss": 0.1898, + "step": 22759 + }, + { + "epoch": 1.1087086272381985, + "grad_norm": 6.123066425323486, + "learning_rate": 6.9838289708099225e-06, + "loss": 0.4088, + "step": 22760 + }, + { + "epoch": 1.1087221920781334, + "grad_norm": 6.565331935882568, + "learning_rate": 6.983691928189668e-06, + "loss": 0.4215, + "step": 22761 + }, + { + "epoch": 1.1087357569180685, + "grad_norm": 4.432467460632324, + "learning_rate": 6.983554885569412e-06, + "loss": 0.2862, + "step": 22762 + }, + { + "epoch": 1.1087493217580033, + "grad_norm": 6.468313217163086, + "learning_rate": 6.983417842949157e-06, + "loss": 0.4428, + "step": 22763 + }, + { + "epoch": 1.1087628865979382, + "grad_norm": 5.232034206390381, + "learning_rate": 6.983280800328903e-06, + "loss": 0.3149, + "step": 22764 + }, + { + "epoch": 1.108776451437873, + "grad_norm": 3.9333579540252686, + "learning_rate": 6.9831437577086484e-06, + "loss": 0.2675, + "step": 22765 + }, + { + "epoch": 1.108790016277808, + "grad_norm": 7.610720634460449, + "learning_rate": 6.983006715088393e-06, + "loss": 0.4247, + "step": 22766 + }, + { + "epoch": 1.1088035811177428, + "grad_norm": 4.74575662612915, + "learning_rate": 6.982869672468138e-06, + "loss": 0.2105, + "step": 22767 + }, + { + "epoch": 1.1088171459576777, + "grad_norm": 4.039060592651367, + "learning_rate": 6.982732629847884e-06, + "loss": 0.2435, + "step": 22768 + }, + { + "epoch": 1.1088307107976125, + "grad_norm": 6.364772796630859, + "learning_rate": 6.982595587227628e-06, + "loss": 0.2345, + "step": 22769 + }, + { + "epoch": 1.1088442756375474, + "grad_norm": 4.797909736633301, + "learning_rate": 6.9824585446073735e-06, + "loss": 0.3294, + "step": 22770 + }, + { + "epoch": 1.1088578404774823, + "grad_norm": 5.012589931488037, + "learning_rate": 6.982321501987118e-06, + "loss": 0.2492, + "step": 22771 + }, + { + "epoch": 1.1088714053174173, + "grad_norm": 4.678793907165527, + "learning_rate": 6.982184459366864e-06, + "loss": 0.2653, + "step": 22772 + }, + { + "epoch": 1.1088849701573522, + "grad_norm": 6.8580451011657715, + "learning_rate": 6.982047416746609e-06, + "loss": 0.4705, + "step": 22773 + }, + { + "epoch": 1.108898534997287, + "grad_norm": 6.888866901397705, + "learning_rate": 6.981910374126354e-06, + "loss": 0.4197, + "step": 22774 + }, + { + "epoch": 1.108912099837222, + "grad_norm": 3.7339510917663574, + "learning_rate": 6.9817733315060985e-06, + "loss": 0.2201, + "step": 22775 + }, + { + "epoch": 1.1089256646771568, + "grad_norm": 5.659679889678955, + "learning_rate": 6.981636288885844e-06, + "loss": 0.2522, + "step": 22776 + }, + { + "epoch": 1.1089392295170917, + "grad_norm": 6.281580924987793, + "learning_rate": 6.98149924626559e-06, + "loss": 0.3801, + "step": 22777 + }, + { + "epoch": 1.1089527943570265, + "grad_norm": 7.196741104125977, + "learning_rate": 6.981362203645334e-06, + "loss": 0.3894, + "step": 22778 + }, + { + "epoch": 1.1089663591969614, + "grad_norm": 5.759576797485352, + "learning_rate": 6.981225161025079e-06, + "loss": 0.2172, + "step": 22779 + }, + { + "epoch": 1.1089799240368963, + "grad_norm": 4.905777454376221, + "learning_rate": 6.981088118404824e-06, + "loss": 0.2898, + "step": 22780 + }, + { + "epoch": 1.1089934888768314, + "grad_norm": 7.724575996398926, + "learning_rate": 6.98095107578457e-06, + "loss": 0.4954, + "step": 22781 + }, + { + "epoch": 1.1090070537167662, + "grad_norm": 7.2171220779418945, + "learning_rate": 6.980814033164315e-06, + "loss": 0.4562, + "step": 22782 + }, + { + "epoch": 1.109020618556701, + "grad_norm": 7.0535478591918945, + "learning_rate": 6.98067699054406e-06, + "loss": 0.34, + "step": 22783 + }, + { + "epoch": 1.109034183396636, + "grad_norm": 4.411234378814697, + "learning_rate": 6.980539947923804e-06, + "loss": 0.2174, + "step": 22784 + }, + { + "epoch": 1.1090477482365708, + "grad_norm": 6.928651809692383, + "learning_rate": 6.98040290530355e-06, + "loss": 0.3755, + "step": 22785 + }, + { + "epoch": 1.1090613130765057, + "grad_norm": 5.969976902008057, + "learning_rate": 6.9802658626832955e-06, + "loss": 0.2998, + "step": 22786 + }, + { + "epoch": 1.1090748779164405, + "grad_norm": 6.733109951019287, + "learning_rate": 6.98012882006304e-06, + "loss": 0.4185, + "step": 22787 + }, + { + "epoch": 1.1090884427563754, + "grad_norm": 9.074143409729004, + "learning_rate": 6.979991777442785e-06, + "loss": 0.41, + "step": 22788 + }, + { + "epoch": 1.1091020075963103, + "grad_norm": 4.8429741859436035, + "learning_rate": 6.97985473482253e-06, + "loss": 0.1772, + "step": 22789 + }, + { + "epoch": 1.1091155724362451, + "grad_norm": 5.225551605224609, + "learning_rate": 6.979717692202276e-06, + "loss": 0.2189, + "step": 22790 + }, + { + "epoch": 1.1091291372761802, + "grad_norm": 5.153215408325195, + "learning_rate": 6.9795806495820205e-06, + "loss": 0.1898, + "step": 22791 + }, + { + "epoch": 1.109142702116115, + "grad_norm": 4.450112819671631, + "learning_rate": 6.979443606961766e-06, + "loss": 0.1782, + "step": 22792 + }, + { + "epoch": 1.10915626695605, + "grad_norm": 9.117573738098145, + "learning_rate": 6.97930656434151e-06, + "loss": 0.4183, + "step": 22793 + }, + { + "epoch": 1.1091698317959848, + "grad_norm": 5.773719787597656, + "learning_rate": 6.979169521721256e-06, + "loss": 0.2245, + "step": 22794 + }, + { + "epoch": 1.1091833966359197, + "grad_norm": 5.329211711883545, + "learning_rate": 6.979032479101001e-06, + "loss": 0.193, + "step": 22795 + }, + { + "epoch": 1.1091969614758546, + "grad_norm": 5.684747219085693, + "learning_rate": 6.978895436480746e-06, + "loss": 0.2867, + "step": 22796 + }, + { + "epoch": 1.1092105263157894, + "grad_norm": 6.583467960357666, + "learning_rate": 6.978758393860491e-06, + "loss": 0.3387, + "step": 22797 + }, + { + "epoch": 1.1092240911557243, + "grad_norm": 6.041236400604248, + "learning_rate": 6.978621351240237e-06, + "loss": 0.3778, + "step": 22798 + }, + { + "epoch": 1.1092376559956592, + "grad_norm": 6.145314693450928, + "learning_rate": 6.978484308619982e-06, + "loss": 0.2538, + "step": 22799 + }, + { + "epoch": 1.1092512208355942, + "grad_norm": 7.1996283531188965, + "learning_rate": 6.978347265999726e-06, + "loss": 0.3353, + "step": 22800 + }, + { + "epoch": 1.109264785675529, + "grad_norm": 4.463018417358398, + "learning_rate": 6.9782102233794715e-06, + "loss": 0.1911, + "step": 22801 + }, + { + "epoch": 1.109278350515464, + "grad_norm": 5.143045902252197, + "learning_rate": 6.978073180759216e-06, + "loss": 0.2656, + "step": 22802 + }, + { + "epoch": 1.1092919153553988, + "grad_norm": 5.941465377807617, + "learning_rate": 6.977936138138962e-06, + "loss": 0.4312, + "step": 22803 + }, + { + "epoch": 1.1093054801953337, + "grad_norm": 8.685303688049316, + "learning_rate": 6.977799095518707e-06, + "loss": 0.3141, + "step": 22804 + }, + { + "epoch": 1.1093190450352686, + "grad_norm": 4.399477958679199, + "learning_rate": 6.977662052898452e-06, + "loss": 0.235, + "step": 22805 + }, + { + "epoch": 1.1093326098752034, + "grad_norm": 6.828396797180176, + "learning_rate": 6.9775250102781965e-06, + "loss": 0.3086, + "step": 22806 + }, + { + "epoch": 1.1093461747151383, + "grad_norm": 5.253762722015381, + "learning_rate": 6.9773879676579426e-06, + "loss": 0.2955, + "step": 22807 + }, + { + "epoch": 1.1093597395550732, + "grad_norm": 3.8766565322875977, + "learning_rate": 6.977250925037688e-06, + "loss": 0.2417, + "step": 22808 + }, + { + "epoch": 1.109373304395008, + "grad_norm": 4.19877815246582, + "learning_rate": 6.977113882417432e-06, + "loss": 0.2103, + "step": 22809 + }, + { + "epoch": 1.1093868692349431, + "grad_norm": 4.026360034942627, + "learning_rate": 6.976976839797177e-06, + "loss": 0.2039, + "step": 22810 + }, + { + "epoch": 1.109400434074878, + "grad_norm": 6.102510452270508, + "learning_rate": 6.976839797176923e-06, + "loss": 0.2217, + "step": 22811 + }, + { + "epoch": 1.1094139989148128, + "grad_norm": 4.483859062194824, + "learning_rate": 6.976702754556668e-06, + "loss": 0.2576, + "step": 22812 + }, + { + "epoch": 1.1094275637547477, + "grad_norm": 4.592960834503174, + "learning_rate": 6.976565711936413e-06, + "loss": 0.1419, + "step": 22813 + }, + { + "epoch": 1.1094411285946826, + "grad_norm": 4.644551753997803, + "learning_rate": 6.976428669316158e-06, + "loss": 0.2323, + "step": 22814 + }, + { + "epoch": 1.1094546934346174, + "grad_norm": 5.227035999298096, + "learning_rate": 6.976291626695903e-06, + "loss": 0.2454, + "step": 22815 + }, + { + "epoch": 1.1094682582745523, + "grad_norm": 6.226024150848389, + "learning_rate": 6.976154584075648e-06, + "loss": 0.1919, + "step": 22816 + }, + { + "epoch": 1.1094818231144872, + "grad_norm": 7.334218502044678, + "learning_rate": 6.9760175414553935e-06, + "loss": 0.2698, + "step": 22817 + }, + { + "epoch": 1.109495387954422, + "grad_norm": 5.581320285797119, + "learning_rate": 6.975880498835138e-06, + "loss": 0.2189, + "step": 22818 + }, + { + "epoch": 1.1095089527943571, + "grad_norm": 4.47163200378418, + "learning_rate": 6.975743456214883e-06, + "loss": 0.1619, + "step": 22819 + }, + { + "epoch": 1.109522517634292, + "grad_norm": 3.338581085205078, + "learning_rate": 6.975606413594629e-06, + "loss": 0.1694, + "step": 22820 + }, + { + "epoch": 1.1095360824742269, + "grad_norm": 4.68473482131958, + "learning_rate": 6.975469370974373e-06, + "loss": 0.2311, + "step": 22821 + }, + { + "epoch": 1.1095496473141617, + "grad_norm": 3.7982492446899414, + "learning_rate": 6.9753323283541185e-06, + "loss": 0.1822, + "step": 22822 + }, + { + "epoch": 1.1095632121540966, + "grad_norm": 4.621574401855469, + "learning_rate": 6.975195285733864e-06, + "loss": 0.2998, + "step": 22823 + }, + { + "epoch": 1.1095767769940315, + "grad_norm": 6.68557071685791, + "learning_rate": 6.97505824311361e-06, + "loss": 0.2897, + "step": 22824 + }, + { + "epoch": 1.1095903418339663, + "grad_norm": 4.547281742095947, + "learning_rate": 6.974921200493354e-06, + "loss": 0.3451, + "step": 22825 + }, + { + "epoch": 1.1096039066739012, + "grad_norm": 6.270759582519531, + "learning_rate": 6.974784157873099e-06, + "loss": 0.2981, + "step": 22826 + }, + { + "epoch": 1.109617471513836, + "grad_norm": 3.998502016067505, + "learning_rate": 6.974647115252844e-06, + "loss": 0.1822, + "step": 22827 + }, + { + "epoch": 1.109631036353771, + "grad_norm": 4.853590965270996, + "learning_rate": 6.97451007263259e-06, + "loss": 0.2906, + "step": 22828 + }, + { + "epoch": 1.109644601193706, + "grad_norm": 4.887025356292725, + "learning_rate": 6.974373030012335e-06, + "loss": 0.2307, + "step": 22829 + }, + { + "epoch": 1.1096581660336409, + "grad_norm": 5.849153518676758, + "learning_rate": 6.974235987392079e-06, + "loss": 0.2619, + "step": 22830 + }, + { + "epoch": 1.1096717308735757, + "grad_norm": 4.32249116897583, + "learning_rate": 6.974098944771824e-06, + "loss": 0.2147, + "step": 22831 + }, + { + "epoch": 1.1096852957135106, + "grad_norm": 5.005806922912598, + "learning_rate": 6.9739619021515695e-06, + "loss": 0.2247, + "step": 22832 + }, + { + "epoch": 1.1096988605534455, + "grad_norm": 6.6021809577941895, + "learning_rate": 6.9738248595313155e-06, + "loss": 0.344, + "step": 22833 + }, + { + "epoch": 1.1097124253933803, + "grad_norm": 4.116652965545654, + "learning_rate": 6.97368781691106e-06, + "loss": 0.2447, + "step": 22834 + }, + { + "epoch": 1.1097259902333152, + "grad_norm": 5.857336044311523, + "learning_rate": 6.973550774290805e-06, + "loss": 0.2067, + "step": 22835 + }, + { + "epoch": 1.10973955507325, + "grad_norm": 4.332803249359131, + "learning_rate": 6.973413731670549e-06, + "loss": 0.2691, + "step": 22836 + }, + { + "epoch": 1.109753119913185, + "grad_norm": 5.170713424682617, + "learning_rate": 6.973276689050295e-06, + "loss": 0.3363, + "step": 22837 + }, + { + "epoch": 1.10976668475312, + "grad_norm": 4.739806175231934, + "learning_rate": 6.9731396464300406e-06, + "loss": 0.2418, + "step": 22838 + }, + { + "epoch": 1.1097802495930549, + "grad_norm": 4.862179279327393, + "learning_rate": 6.973002603809786e-06, + "loss": 0.3421, + "step": 22839 + }, + { + "epoch": 1.1097938144329897, + "grad_norm": 5.5190510749816895, + "learning_rate": 6.97286556118953e-06, + "loss": 0.1998, + "step": 22840 + }, + { + "epoch": 1.1098073792729246, + "grad_norm": 5.038976669311523, + "learning_rate": 6.972728518569276e-06, + "loss": 0.1958, + "step": 22841 + }, + { + "epoch": 1.1098209441128595, + "grad_norm": 5.597899436950684, + "learning_rate": 6.972591475949021e-06, + "loss": 0.2794, + "step": 22842 + }, + { + "epoch": 1.1098345089527943, + "grad_norm": 6.255173206329346, + "learning_rate": 6.972454433328766e-06, + "loss": 0.2893, + "step": 22843 + }, + { + "epoch": 1.1098480737927292, + "grad_norm": 4.983974456787109, + "learning_rate": 6.972317390708511e-06, + "loss": 0.3098, + "step": 22844 + }, + { + "epoch": 1.109861638632664, + "grad_norm": 4.853994846343994, + "learning_rate": 6.972180348088255e-06, + "loss": 0.2657, + "step": 22845 + }, + { + "epoch": 1.109875203472599, + "grad_norm": 5.160766124725342, + "learning_rate": 6.972043305468001e-06, + "loss": 0.2513, + "step": 22846 + }, + { + "epoch": 1.1098887683125338, + "grad_norm": 5.390402793884277, + "learning_rate": 6.971906262847746e-06, + "loss": 0.2516, + "step": 22847 + }, + { + "epoch": 1.109902333152469, + "grad_norm": 5.6806535720825195, + "learning_rate": 6.9717692202274915e-06, + "loss": 0.3775, + "step": 22848 + }, + { + "epoch": 1.1099158979924038, + "grad_norm": 4.534856796264648, + "learning_rate": 6.971632177607236e-06, + "loss": 0.2519, + "step": 22849 + }, + { + "epoch": 1.1099294628323386, + "grad_norm": 5.662178993225098, + "learning_rate": 6.971495134986982e-06, + "loss": 0.373, + "step": 22850 + }, + { + "epoch": 1.1099430276722735, + "grad_norm": 4.48183012008667, + "learning_rate": 6.971358092366727e-06, + "loss": 0.2229, + "step": 22851 + }, + { + "epoch": 1.1099565925122084, + "grad_norm": 4.528134346008301, + "learning_rate": 6.971221049746471e-06, + "loss": 0.2202, + "step": 22852 + }, + { + "epoch": 1.1099701573521432, + "grad_norm": 4.729140758514404, + "learning_rate": 6.9710840071262165e-06, + "loss": 0.2628, + "step": 22853 + }, + { + "epoch": 1.109983722192078, + "grad_norm": 4.7879509925842285, + "learning_rate": 6.9709469645059626e-06, + "loss": 0.3257, + "step": 22854 + }, + { + "epoch": 1.109997287032013, + "grad_norm": 3.610415458679199, + "learning_rate": 6.970809921885707e-06, + "loss": 0.1325, + "step": 22855 + }, + { + "epoch": 1.1100108518719478, + "grad_norm": 6.452696323394775, + "learning_rate": 6.970672879265452e-06, + "loss": 0.3625, + "step": 22856 + }, + { + "epoch": 1.110024416711883, + "grad_norm": 5.449762344360352, + "learning_rate": 6.970535836645197e-06, + "loss": 0.2454, + "step": 22857 + }, + { + "epoch": 1.1100379815518178, + "grad_norm": 4.987163543701172, + "learning_rate": 6.970398794024942e-06, + "loss": 0.2958, + "step": 22858 + }, + { + "epoch": 1.1100515463917526, + "grad_norm": 3.618593215942383, + "learning_rate": 6.970261751404688e-06, + "loss": 0.131, + "step": 22859 + }, + { + "epoch": 1.1100651112316875, + "grad_norm": 5.395090579986572, + "learning_rate": 6.970124708784433e-06, + "loss": 0.3247, + "step": 22860 + }, + { + "epoch": 1.1100786760716224, + "grad_norm": 4.720623970031738, + "learning_rate": 6.969987666164177e-06, + "loss": 0.2381, + "step": 22861 + }, + { + "epoch": 1.1100922409115572, + "grad_norm": 4.518732070922852, + "learning_rate": 6.969850623543922e-06, + "loss": 0.2677, + "step": 22862 + }, + { + "epoch": 1.110105805751492, + "grad_norm": 6.005540370941162, + "learning_rate": 6.969713580923668e-06, + "loss": 0.2267, + "step": 22863 + }, + { + "epoch": 1.110119370591427, + "grad_norm": 4.774409770965576, + "learning_rate": 6.969576538303413e-06, + "loss": 0.2227, + "step": 22864 + }, + { + "epoch": 1.1101329354313618, + "grad_norm": 5.745359420776367, + "learning_rate": 6.969439495683158e-06, + "loss": 0.3205, + "step": 22865 + }, + { + "epoch": 1.1101465002712967, + "grad_norm": 3.4435644149780273, + "learning_rate": 6.969302453062903e-06, + "loss": 0.1746, + "step": 22866 + }, + { + "epoch": 1.1101600651112318, + "grad_norm": 4.782933712005615, + "learning_rate": 6.969165410442649e-06, + "loss": 0.2215, + "step": 22867 + }, + { + "epoch": 1.1101736299511666, + "grad_norm": 3.7100231647491455, + "learning_rate": 6.969028367822393e-06, + "loss": 0.1592, + "step": 22868 + }, + { + "epoch": 1.1101871947911015, + "grad_norm": 4.467072486877441, + "learning_rate": 6.9688913252021386e-06, + "loss": 0.3066, + "step": 22869 + }, + { + "epoch": 1.1102007596310364, + "grad_norm": 5.04533052444458, + "learning_rate": 6.968754282581883e-06, + "loss": 0.2717, + "step": 22870 + }, + { + "epoch": 1.1102143244709712, + "grad_norm": 4.673060894012451, + "learning_rate": 6.968617239961628e-06, + "loss": 0.2424, + "step": 22871 + }, + { + "epoch": 1.110227889310906, + "grad_norm": 5.0406413078308105, + "learning_rate": 6.968480197341374e-06, + "loss": 0.2427, + "step": 22872 + }, + { + "epoch": 1.110241454150841, + "grad_norm": 5.629594802856445, + "learning_rate": 6.968343154721119e-06, + "loss": 0.2509, + "step": 22873 + }, + { + "epoch": 1.1102550189907758, + "grad_norm": 4.84302282333374, + "learning_rate": 6.968206112100864e-06, + "loss": 0.2047, + "step": 22874 + }, + { + "epoch": 1.1102685838307107, + "grad_norm": 4.649494171142578, + "learning_rate": 6.968069069480609e-06, + "loss": 0.2049, + "step": 22875 + }, + { + "epoch": 1.1102821486706458, + "grad_norm": 6.253509521484375, + "learning_rate": 6.967932026860355e-06, + "loss": 0.3352, + "step": 22876 + }, + { + "epoch": 1.1102957135105807, + "grad_norm": 5.58889627456665, + "learning_rate": 6.967794984240099e-06, + "loss": 0.2802, + "step": 22877 + }, + { + "epoch": 1.1103092783505155, + "grad_norm": 4.369587421417236, + "learning_rate": 6.967657941619844e-06, + "loss": 0.1694, + "step": 22878 + }, + { + "epoch": 1.1103228431904504, + "grad_norm": 4.080383777618408, + "learning_rate": 6.967520898999589e-06, + "loss": 0.1662, + "step": 22879 + }, + { + "epoch": 1.1103364080303852, + "grad_norm": 5.037593841552734, + "learning_rate": 6.967383856379335e-06, + "loss": 0.2302, + "step": 22880 + }, + { + "epoch": 1.1103499728703201, + "grad_norm": 3.3351893424987793, + "learning_rate": 6.96724681375908e-06, + "loss": 0.1052, + "step": 22881 + }, + { + "epoch": 1.110363537710255, + "grad_norm": 3.720449924468994, + "learning_rate": 6.967109771138825e-06, + "loss": 0.2441, + "step": 22882 + }, + { + "epoch": 1.1103771025501898, + "grad_norm": 5.715988636016846, + "learning_rate": 6.966972728518569e-06, + "loss": 0.3082, + "step": 22883 + }, + { + "epoch": 1.1103906673901247, + "grad_norm": 6.561865329742432, + "learning_rate": 6.966835685898315e-06, + "loss": 0.2675, + "step": 22884 + }, + { + "epoch": 1.1104042322300596, + "grad_norm": 5.480579376220703, + "learning_rate": 6.9666986432780606e-06, + "loss": 0.2488, + "step": 22885 + }, + { + "epoch": 1.1104177970699947, + "grad_norm": 3.6138103008270264, + "learning_rate": 6.966561600657805e-06, + "loss": 0.1811, + "step": 22886 + }, + { + "epoch": 1.1104313619099295, + "grad_norm": 5.380894660949707, + "learning_rate": 6.96642455803755e-06, + "loss": 0.2471, + "step": 22887 + }, + { + "epoch": 1.1104449267498644, + "grad_norm": 5.946662902832031, + "learning_rate": 6.966287515417295e-06, + "loss": 0.2832, + "step": 22888 + }, + { + "epoch": 1.1104584915897993, + "grad_norm": 3.630039691925049, + "learning_rate": 6.9661504727970404e-06, + "loss": 0.1896, + "step": 22889 + }, + { + "epoch": 1.1104720564297341, + "grad_norm": 3.209592819213867, + "learning_rate": 6.966013430176786e-06, + "loss": 0.146, + "step": 22890 + }, + { + "epoch": 1.110485621269669, + "grad_norm": 4.058208465576172, + "learning_rate": 6.965876387556531e-06, + "loss": 0.2062, + "step": 22891 + }, + { + "epoch": 1.1104991861096039, + "grad_norm": 5.178069591522217, + "learning_rate": 6.965739344936275e-06, + "loss": 0.1703, + "step": 22892 + }, + { + "epoch": 1.1105127509495387, + "grad_norm": 3.3894550800323486, + "learning_rate": 6.965602302316021e-06, + "loss": 0.1697, + "step": 22893 + }, + { + "epoch": 1.1105263157894736, + "grad_norm": 4.202260494232178, + "learning_rate": 6.965465259695766e-06, + "loss": 0.2117, + "step": 22894 + }, + { + "epoch": 1.1105398806294087, + "grad_norm": 4.430922985076904, + "learning_rate": 6.965328217075511e-06, + "loss": 0.1991, + "step": 22895 + }, + { + "epoch": 1.1105534454693435, + "grad_norm": 3.6390395164489746, + "learning_rate": 6.965191174455256e-06, + "loss": 0.1922, + "step": 22896 + }, + { + "epoch": 1.1105670103092784, + "grad_norm": 4.906397342681885, + "learning_rate": 6.965054131835002e-06, + "loss": 0.2341, + "step": 22897 + }, + { + "epoch": 1.1105805751492133, + "grad_norm": 5.686151504516602, + "learning_rate": 6.964917089214746e-06, + "loss": 0.3313, + "step": 22898 + }, + { + "epoch": 1.1105941399891481, + "grad_norm": 4.600445747375488, + "learning_rate": 6.964780046594491e-06, + "loss": 0.1761, + "step": 22899 + }, + { + "epoch": 1.110607704829083, + "grad_norm": 4.559515953063965, + "learning_rate": 6.9646430039742366e-06, + "loss": 0.2079, + "step": 22900 + }, + { + "epoch": 1.1106212696690179, + "grad_norm": 4.136188507080078, + "learning_rate": 6.964505961353981e-06, + "loss": 0.2071, + "step": 22901 + }, + { + "epoch": 1.1106348345089527, + "grad_norm": 3.960770845413208, + "learning_rate": 6.964368918733727e-06, + "loss": 0.1911, + "step": 22902 + }, + { + "epoch": 1.1106483993488876, + "grad_norm": 5.446661949157715, + "learning_rate": 6.964231876113472e-06, + "loss": 0.2524, + "step": 22903 + }, + { + "epoch": 1.1106619641888225, + "grad_norm": 3.531817674636841, + "learning_rate": 6.9640948334932164e-06, + "loss": 0.1882, + "step": 22904 + }, + { + "epoch": 1.1106755290287575, + "grad_norm": 2.6508493423461914, + "learning_rate": 6.963957790872962e-06, + "loss": 0.1362, + "step": 22905 + }, + { + "epoch": 1.1106890938686924, + "grad_norm": 4.152804851531982, + "learning_rate": 6.963820748252708e-06, + "loss": 0.1618, + "step": 22906 + }, + { + "epoch": 1.1107026587086273, + "grad_norm": 4.595122337341309, + "learning_rate": 6.963683705632453e-06, + "loss": 0.1931, + "step": 22907 + }, + { + "epoch": 1.1107162235485621, + "grad_norm": 4.108317852020264, + "learning_rate": 6.963546663012197e-06, + "loss": 0.2013, + "step": 22908 + }, + { + "epoch": 1.110729788388497, + "grad_norm": 8.294631958007812, + "learning_rate": 6.963409620391942e-06, + "loss": 0.297, + "step": 22909 + }, + { + "epoch": 1.1107433532284319, + "grad_norm": 4.260665416717529, + "learning_rate": 6.963272577771688e-06, + "loss": 0.2046, + "step": 22910 + }, + { + "epoch": 1.1107569180683667, + "grad_norm": 3.747943639755249, + "learning_rate": 6.963135535151433e-06, + "loss": 0.2018, + "step": 22911 + }, + { + "epoch": 1.1107704829083016, + "grad_norm": 5.886255264282227, + "learning_rate": 6.962998492531178e-06, + "loss": 0.2855, + "step": 22912 + }, + { + "epoch": 1.1107840477482365, + "grad_norm": 4.356388568878174, + "learning_rate": 6.962861449910922e-06, + "loss": 0.2496, + "step": 22913 + }, + { + "epoch": 1.1107976125881716, + "grad_norm": 5.773529529571533, + "learning_rate": 6.962724407290667e-06, + "loss": 0.2569, + "step": 22914 + }, + { + "epoch": 1.1108111774281064, + "grad_norm": 5.745650768280029, + "learning_rate": 6.962587364670413e-06, + "loss": 0.2184, + "step": 22915 + }, + { + "epoch": 1.1108247422680413, + "grad_norm": 6.364799499511719, + "learning_rate": 6.962450322050159e-06, + "loss": 0.2361, + "step": 22916 + }, + { + "epoch": 1.1108383071079762, + "grad_norm": 5.71970796585083, + "learning_rate": 6.962313279429903e-06, + "loss": 0.2106, + "step": 22917 + }, + { + "epoch": 1.110851871947911, + "grad_norm": 5.706324577331543, + "learning_rate": 6.962176236809648e-06, + "loss": 0.2562, + "step": 22918 + }, + { + "epoch": 1.1108654367878459, + "grad_norm": 6.127553462982178, + "learning_rate": 6.962039194189394e-06, + "loss": 0.3659, + "step": 22919 + }, + { + "epoch": 1.1108790016277807, + "grad_norm": 5.201579570770264, + "learning_rate": 6.9619021515691384e-06, + "loss": 0.2171, + "step": 22920 + }, + { + "epoch": 1.1108925664677156, + "grad_norm": 5.916485786437988, + "learning_rate": 6.961765108948884e-06, + "loss": 0.2299, + "step": 22921 + }, + { + "epoch": 1.1109061313076505, + "grad_norm": 5.847104549407959, + "learning_rate": 6.961628066328629e-06, + "loss": 0.2964, + "step": 22922 + }, + { + "epoch": 1.1109196961475853, + "grad_norm": 5.14381742477417, + "learning_rate": 6.961491023708374e-06, + "loss": 0.323, + "step": 22923 + }, + { + "epoch": 1.1109332609875204, + "grad_norm": 3.799589157104492, + "learning_rate": 6.961353981088119e-06, + "loss": 0.1648, + "step": 22924 + }, + { + "epoch": 1.1109468258274553, + "grad_norm": 5.0767059326171875, + "learning_rate": 6.961216938467864e-06, + "loss": 0.2839, + "step": 22925 + }, + { + "epoch": 1.1109603906673902, + "grad_norm": 5.6403584480285645, + "learning_rate": 6.961079895847609e-06, + "loss": 0.3524, + "step": 22926 + }, + { + "epoch": 1.110973955507325, + "grad_norm": 6.227156639099121, + "learning_rate": 6.960942853227354e-06, + "loss": 0.2601, + "step": 22927 + }, + { + "epoch": 1.11098752034726, + "grad_norm": 5.641472339630127, + "learning_rate": 6.9608058106071e-06, + "loss": 0.3354, + "step": 22928 + }, + { + "epoch": 1.1110010851871948, + "grad_norm": 6.338494777679443, + "learning_rate": 6.960668767986844e-06, + "loss": 0.3374, + "step": 22929 + }, + { + "epoch": 1.1110146500271296, + "grad_norm": 4.991176605224609, + "learning_rate": 6.960531725366589e-06, + "loss": 0.1951, + "step": 22930 + }, + { + "epoch": 1.1110282148670645, + "grad_norm": 5.85630989074707, + "learning_rate": 6.9603946827463346e-06, + "loss": 0.3322, + "step": 22931 + }, + { + "epoch": 1.1110417797069994, + "grad_norm": 5.278257369995117, + "learning_rate": 6.960257640126081e-06, + "loss": 0.2793, + "step": 22932 + }, + { + "epoch": 1.1110553445469344, + "grad_norm": 4.585445880889893, + "learning_rate": 6.960120597505825e-06, + "loss": 0.1734, + "step": 22933 + }, + { + "epoch": 1.1110689093868693, + "grad_norm": 6.426467418670654, + "learning_rate": 6.95998355488557e-06, + "loss": 0.3114, + "step": 22934 + }, + { + "epoch": 1.1110824742268042, + "grad_norm": 6.107964515686035, + "learning_rate": 6.9598465122653144e-06, + "loss": 0.3168, + "step": 22935 + }, + { + "epoch": 1.111096039066739, + "grad_norm": 5.3097052574157715, + "learning_rate": 6.9597094696450605e-06, + "loss": 0.319, + "step": 22936 + }, + { + "epoch": 1.111109603906674, + "grad_norm": 5.0472211837768555, + "learning_rate": 6.959572427024806e-06, + "loss": 0.2184, + "step": 22937 + }, + { + "epoch": 1.1111231687466088, + "grad_norm": 4.924739360809326, + "learning_rate": 6.95943538440455e-06, + "loss": 0.1937, + "step": 22938 + }, + { + "epoch": 1.1111367335865436, + "grad_norm": 5.599153995513916, + "learning_rate": 6.959298341784295e-06, + "loss": 0.2264, + "step": 22939 + }, + { + "epoch": 1.1111502984264785, + "grad_norm": 5.652915000915527, + "learning_rate": 6.95916129916404e-06, + "loss": 0.2812, + "step": 22940 + }, + { + "epoch": 1.1111638632664134, + "grad_norm": 4.049035549163818, + "learning_rate": 6.959024256543786e-06, + "loss": 0.1975, + "step": 22941 + }, + { + "epoch": 1.1111774281063482, + "grad_norm": 4.37050199508667, + "learning_rate": 6.958887213923531e-06, + "loss": 0.1842, + "step": 22942 + }, + { + "epoch": 1.1111909929462833, + "grad_norm": 5.678478717803955, + "learning_rate": 6.958750171303276e-06, + "loss": 0.3171, + "step": 22943 + }, + { + "epoch": 1.1112045577862182, + "grad_norm": 5.292447090148926, + "learning_rate": 6.95861312868302e-06, + "loss": 0.2302, + "step": 22944 + }, + { + "epoch": 1.111218122626153, + "grad_norm": 5.45457649230957, + "learning_rate": 6.958476086062766e-06, + "loss": 0.3065, + "step": 22945 + }, + { + "epoch": 1.111231687466088, + "grad_norm": 5.888800144195557, + "learning_rate": 6.958339043442511e-06, + "loss": 0.2668, + "step": 22946 + }, + { + "epoch": 1.1112452523060228, + "grad_norm": 5.341814994812012, + "learning_rate": 6.958202000822257e-06, + "loss": 0.2248, + "step": 22947 + }, + { + "epoch": 1.1112588171459576, + "grad_norm": 4.2014312744140625, + "learning_rate": 6.958064958202001e-06, + "loss": 0.2147, + "step": 22948 + }, + { + "epoch": 1.1112723819858925, + "grad_norm": 5.151885032653809, + "learning_rate": 6.957927915581747e-06, + "loss": 0.2693, + "step": 22949 + }, + { + "epoch": 1.1112859468258274, + "grad_norm": 4.814002513885498, + "learning_rate": 6.957790872961492e-06, + "loss": 0.2172, + "step": 22950 + }, + { + "epoch": 1.1112995116657622, + "grad_norm": 6.168903827667236, + "learning_rate": 6.9576538303412365e-06, + "loss": 0.2967, + "step": 22951 + }, + { + "epoch": 1.1113130765056973, + "grad_norm": 4.209187984466553, + "learning_rate": 6.957516787720982e-06, + "loss": 0.1876, + "step": 22952 + }, + { + "epoch": 1.1113266413456322, + "grad_norm": 4.269654273986816, + "learning_rate": 6.957379745100728e-06, + "loss": 0.2054, + "step": 22953 + }, + { + "epoch": 1.111340206185567, + "grad_norm": 5.764455318450928, + "learning_rate": 6.957242702480472e-06, + "loss": 0.3457, + "step": 22954 + }, + { + "epoch": 1.111353771025502, + "grad_norm": 5.198727130889893, + "learning_rate": 6.957105659860217e-06, + "loss": 0.2213, + "step": 22955 + }, + { + "epoch": 1.1113673358654368, + "grad_norm": 5.821234703063965, + "learning_rate": 6.956968617239962e-06, + "loss": 0.2782, + "step": 22956 + }, + { + "epoch": 1.1113809007053717, + "grad_norm": 6.868183612823486, + "learning_rate": 6.956831574619707e-06, + "loss": 0.2508, + "step": 22957 + }, + { + "epoch": 1.1113944655453065, + "grad_norm": 5.032874584197998, + "learning_rate": 6.956694531999453e-06, + "loss": 0.2339, + "step": 22958 + }, + { + "epoch": 1.1114080303852414, + "grad_norm": 4.131933212280273, + "learning_rate": 6.956557489379198e-06, + "loss": 0.221, + "step": 22959 + }, + { + "epoch": 1.1114215952251763, + "grad_norm": 5.295948505401611, + "learning_rate": 6.956420446758942e-06, + "loss": 0.2917, + "step": 22960 + }, + { + "epoch": 1.1114351600651111, + "grad_norm": 4.938212871551514, + "learning_rate": 6.956283404138687e-06, + "loss": 0.1973, + "step": 22961 + }, + { + "epoch": 1.1114487249050462, + "grad_norm": 5.655892372131348, + "learning_rate": 6.956146361518433e-06, + "loss": 0.3108, + "step": 22962 + }, + { + "epoch": 1.111462289744981, + "grad_norm": 5.249000072479248, + "learning_rate": 6.956009318898178e-06, + "loss": 0.2101, + "step": 22963 + }, + { + "epoch": 1.111475854584916, + "grad_norm": 4.251479625701904, + "learning_rate": 6.955872276277923e-06, + "loss": 0.212, + "step": 22964 + }, + { + "epoch": 1.1114894194248508, + "grad_norm": 6.999215126037598, + "learning_rate": 6.955735233657668e-06, + "loss": 0.2333, + "step": 22965 + }, + { + "epoch": 1.1115029842647857, + "grad_norm": 4.421515464782715, + "learning_rate": 6.955598191037414e-06, + "loss": 0.2321, + "step": 22966 + }, + { + "epoch": 1.1115165491047205, + "grad_norm": 7.634469509124756, + "learning_rate": 6.9554611484171585e-06, + "loss": 0.3148, + "step": 22967 + }, + { + "epoch": 1.1115301139446554, + "grad_norm": 3.9484784603118896, + "learning_rate": 6.955324105796904e-06, + "loss": 0.1796, + "step": 22968 + }, + { + "epoch": 1.1115436787845903, + "grad_norm": 6.139941215515137, + "learning_rate": 6.955187063176648e-06, + "loss": 0.1964, + "step": 22969 + }, + { + "epoch": 1.1115572436245253, + "grad_norm": 6.4422712326049805, + "learning_rate": 6.955050020556393e-06, + "loss": 0.3041, + "step": 22970 + }, + { + "epoch": 1.1115708084644602, + "grad_norm": 7.611272811889648, + "learning_rate": 6.954912977936139e-06, + "loss": 0.2682, + "step": 22971 + }, + { + "epoch": 1.111584373304395, + "grad_norm": 4.794768333435059, + "learning_rate": 6.9547759353158835e-06, + "loss": 0.2562, + "step": 22972 + }, + { + "epoch": 1.11159793814433, + "grad_norm": 4.535139560699463, + "learning_rate": 6.954638892695629e-06, + "loss": 0.1924, + "step": 22973 + }, + { + "epoch": 1.1116115029842648, + "grad_norm": 5.203799247741699, + "learning_rate": 6.954501850075374e-06, + "loss": 0.2273, + "step": 22974 + }, + { + "epoch": 1.1116250678241997, + "grad_norm": 4.267594814300537, + "learning_rate": 6.95436480745512e-06, + "loss": 0.1737, + "step": 22975 + }, + { + "epoch": 1.1116386326641345, + "grad_norm": 6.450855731964111, + "learning_rate": 6.954227764834864e-06, + "loss": 0.3228, + "step": 22976 + }, + { + "epoch": 1.1116521975040694, + "grad_norm": 5.715686321258545, + "learning_rate": 6.954090722214609e-06, + "loss": 0.3301, + "step": 22977 + }, + { + "epoch": 1.1116657623440043, + "grad_norm": 4.850867748260498, + "learning_rate": 6.953953679594354e-06, + "loss": 0.3177, + "step": 22978 + }, + { + "epoch": 1.1116793271839391, + "grad_norm": 7.367087364196777, + "learning_rate": 6.9538166369741e-06, + "loss": 0.3286, + "step": 22979 + }, + { + "epoch": 1.111692892023874, + "grad_norm": 4.857626438140869, + "learning_rate": 6.953679594353845e-06, + "loss": 0.2468, + "step": 22980 + }, + { + "epoch": 1.111706456863809, + "grad_norm": 7.254031658172607, + "learning_rate": 6.95354255173359e-06, + "loss": 0.3126, + "step": 22981 + }, + { + "epoch": 1.111720021703744, + "grad_norm": 4.040977954864502, + "learning_rate": 6.9534055091133345e-06, + "loss": 0.209, + "step": 22982 + }, + { + "epoch": 1.1117335865436788, + "grad_norm": 6.0856451988220215, + "learning_rate": 6.95326846649308e-06, + "loss": 0.341, + "step": 22983 + }, + { + "epoch": 1.1117471513836137, + "grad_norm": 5.020135402679443, + "learning_rate": 6.953131423872826e-06, + "loss": 0.1955, + "step": 22984 + }, + { + "epoch": 1.1117607162235486, + "grad_norm": 6.136765480041504, + "learning_rate": 6.95299438125257e-06, + "loss": 0.4465, + "step": 22985 + }, + { + "epoch": 1.1117742810634834, + "grad_norm": 5.0257744789123535, + "learning_rate": 6.952857338632315e-06, + "loss": 0.1975, + "step": 22986 + }, + { + "epoch": 1.1117878459034183, + "grad_norm": 5.314157962799072, + "learning_rate": 6.9527202960120595e-06, + "loss": 0.2131, + "step": 22987 + }, + { + "epoch": 1.1118014107433531, + "grad_norm": 6.086217880249023, + "learning_rate": 6.9525832533918055e-06, + "loss": 0.2662, + "step": 22988 + }, + { + "epoch": 1.1118149755832882, + "grad_norm": 5.356918811798096, + "learning_rate": 6.952446210771551e-06, + "loss": 0.1544, + "step": 22989 + }, + { + "epoch": 1.111828540423223, + "grad_norm": 6.991466999053955, + "learning_rate": 6.952309168151296e-06, + "loss": 0.3968, + "step": 22990 + }, + { + "epoch": 1.111842105263158, + "grad_norm": 4.712507724761963, + "learning_rate": 6.95217212553104e-06, + "loss": 0.1884, + "step": 22991 + }, + { + "epoch": 1.1118556701030928, + "grad_norm": 7.435226917266846, + "learning_rate": 6.952035082910786e-06, + "loss": 0.3672, + "step": 22992 + }, + { + "epoch": 1.1118692349430277, + "grad_norm": 5.255213737487793, + "learning_rate": 6.9518980402905314e-06, + "loss": 0.3436, + "step": 22993 + }, + { + "epoch": 1.1118827997829626, + "grad_norm": 5.698740482330322, + "learning_rate": 6.951760997670276e-06, + "loss": 0.2734, + "step": 22994 + }, + { + "epoch": 1.1118963646228974, + "grad_norm": 8.395408630371094, + "learning_rate": 6.951623955050021e-06, + "loss": 0.4235, + "step": 22995 + }, + { + "epoch": 1.1119099294628323, + "grad_norm": 6.054756164550781, + "learning_rate": 6.951486912429766e-06, + "loss": 0.2683, + "step": 22996 + }, + { + "epoch": 1.1119234943027672, + "grad_norm": 6.007283687591553, + "learning_rate": 6.951349869809511e-06, + "loss": 0.2798, + "step": 22997 + }, + { + "epoch": 1.111937059142702, + "grad_norm": 6.5916900634765625, + "learning_rate": 6.9512128271892565e-06, + "loss": 0.299, + "step": 22998 + }, + { + "epoch": 1.1119506239826369, + "grad_norm": 6.820772171020508, + "learning_rate": 6.951075784569002e-06, + "loss": 0.3416, + "step": 22999 + }, + { + "epoch": 1.111964188822572, + "grad_norm": 6.114834308624268, + "learning_rate": 6.950938741948746e-06, + "loss": 0.3545, + "step": 23000 + }, + { + "epoch": 1.1119777536625068, + "grad_norm": 5.554737091064453, + "learning_rate": 6.950801699328492e-06, + "loss": 0.2789, + "step": 23001 + }, + { + "epoch": 1.1119913185024417, + "grad_norm": 4.660008430480957, + "learning_rate": 6.950664656708237e-06, + "loss": 0.2598, + "step": 23002 + }, + { + "epoch": 1.1120048833423766, + "grad_norm": 6.067997455596924, + "learning_rate": 6.9505276140879815e-06, + "loss": 0.3127, + "step": 23003 + }, + { + "epoch": 1.1120184481823114, + "grad_norm": 7.260376930236816, + "learning_rate": 6.950390571467727e-06, + "loss": 0.2817, + "step": 23004 + }, + { + "epoch": 1.1120320130222463, + "grad_norm": 6.606838703155518, + "learning_rate": 6.950253528847473e-06, + "loss": 0.235, + "step": 23005 + }, + { + "epoch": 1.1120455778621812, + "grad_norm": 7.063260078430176, + "learning_rate": 6.950116486227217e-06, + "loss": 0.2907, + "step": 23006 + }, + { + "epoch": 1.112059142702116, + "grad_norm": 6.31390380859375, + "learning_rate": 6.949979443606962e-06, + "loss": 0.3366, + "step": 23007 + }, + { + "epoch": 1.1120727075420511, + "grad_norm": 6.041319847106934, + "learning_rate": 6.949842400986707e-06, + "loss": 0.2911, + "step": 23008 + }, + { + "epoch": 1.112086272381986, + "grad_norm": 6.3954362869262695, + "learning_rate": 6.949705358366452e-06, + "loss": 0.3104, + "step": 23009 + }, + { + "epoch": 1.1120998372219209, + "grad_norm": 5.5856781005859375, + "learning_rate": 6.949568315746198e-06, + "loss": 0.2535, + "step": 23010 + }, + { + "epoch": 1.1121134020618557, + "grad_norm": 5.0688300132751465, + "learning_rate": 6.949431273125943e-06, + "loss": 0.2753, + "step": 23011 + }, + { + "epoch": 1.1121269669017906, + "grad_norm": 5.633523464202881, + "learning_rate": 6.949294230505687e-06, + "loss": 0.1756, + "step": 23012 + }, + { + "epoch": 1.1121405317417254, + "grad_norm": 7.907382965087891, + "learning_rate": 6.9491571878854325e-06, + "loss": 0.4693, + "step": 23013 + }, + { + "epoch": 1.1121540965816603, + "grad_norm": 4.515288829803467, + "learning_rate": 6.9490201452651785e-06, + "loss": 0.2505, + "step": 23014 + }, + { + "epoch": 1.1121676614215952, + "grad_norm": 7.153779029846191, + "learning_rate": 6.948883102644924e-06, + "loss": 0.2231, + "step": 23015 + }, + { + "epoch": 1.11218122626153, + "grad_norm": 5.00907564163208, + "learning_rate": 6.948746060024668e-06, + "loss": 0.2924, + "step": 23016 + }, + { + "epoch": 1.112194791101465, + "grad_norm": 4.194714546203613, + "learning_rate": 6.948609017404413e-06, + "loss": 0.252, + "step": 23017 + }, + { + "epoch": 1.1122083559414, + "grad_norm": 4.403752326965332, + "learning_rate": 6.948471974784159e-06, + "loss": 0.2117, + "step": 23018 + }, + { + "epoch": 1.1122219207813349, + "grad_norm": 4.6118316650390625, + "learning_rate": 6.9483349321639035e-06, + "loss": 0.2157, + "step": 23019 + }, + { + "epoch": 1.1122354856212697, + "grad_norm": 8.42249584197998, + "learning_rate": 6.948197889543649e-06, + "loss": 0.4003, + "step": 23020 + }, + { + "epoch": 1.1122490504612046, + "grad_norm": 5.370117664337158, + "learning_rate": 6.948060846923393e-06, + "loss": 0.3026, + "step": 23021 + }, + { + "epoch": 1.1122626153011395, + "grad_norm": 4.945062160491943, + "learning_rate": 6.947923804303139e-06, + "loss": 0.2509, + "step": 23022 + }, + { + "epoch": 1.1122761801410743, + "grad_norm": 6.594160556793213, + "learning_rate": 6.947786761682884e-06, + "loss": 0.3964, + "step": 23023 + }, + { + "epoch": 1.1122897449810092, + "grad_norm": 4.6244964599609375, + "learning_rate": 6.9476497190626294e-06, + "loss": 0.3117, + "step": 23024 + }, + { + "epoch": 1.112303309820944, + "grad_norm": 4.710790634155273, + "learning_rate": 6.947512676442374e-06, + "loss": 0.3256, + "step": 23025 + }, + { + "epoch": 1.112316874660879, + "grad_norm": 7.549720764160156, + "learning_rate": 6.947375633822119e-06, + "loss": 0.5255, + "step": 23026 + }, + { + "epoch": 1.112330439500814, + "grad_norm": 4.530778884887695, + "learning_rate": 6.947238591201865e-06, + "loss": 0.3813, + "step": 23027 + }, + { + "epoch": 1.1123440043407489, + "grad_norm": 3.6515042781829834, + "learning_rate": 6.947101548581609e-06, + "loss": 0.3316, + "step": 23028 + }, + { + "epoch": 1.1123575691806837, + "grad_norm": 4.537652015686035, + "learning_rate": 6.9469645059613545e-06, + "loss": 0.1328, + "step": 23029 + }, + { + "epoch": 1.1123711340206186, + "grad_norm": 4.080348014831543, + "learning_rate": 6.9468274633411e-06, + "loss": 0.2677, + "step": 23030 + }, + { + "epoch": 1.1123846988605535, + "grad_norm": 5.44676399230957, + "learning_rate": 6.946690420720845e-06, + "loss": 0.4036, + "step": 23031 + }, + { + "epoch": 1.1123982637004883, + "grad_norm": 3.9977385997772217, + "learning_rate": 6.94655337810059e-06, + "loss": 0.2417, + "step": 23032 + }, + { + "epoch": 1.1124118285404232, + "grad_norm": 5.080376148223877, + "learning_rate": 6.946416335480335e-06, + "loss": 0.2553, + "step": 23033 + }, + { + "epoch": 1.112425393380358, + "grad_norm": 5.51986026763916, + "learning_rate": 6.9462792928600795e-06, + "loss": 0.2422, + "step": 23034 + }, + { + "epoch": 1.112438958220293, + "grad_norm": 5.7291669845581055, + "learning_rate": 6.9461422502398255e-06, + "loss": 0.2895, + "step": 23035 + }, + { + "epoch": 1.1124525230602278, + "grad_norm": 3.9668242931365967, + "learning_rate": 6.946005207619571e-06, + "loss": 0.2648, + "step": 23036 + }, + { + "epoch": 1.1124660879001629, + "grad_norm": 4.705016613006592, + "learning_rate": 6.945868164999315e-06, + "loss": 0.2787, + "step": 23037 + }, + { + "epoch": 1.1124796527400977, + "grad_norm": 4.3568878173828125, + "learning_rate": 6.94573112237906e-06, + "loss": 0.2124, + "step": 23038 + }, + { + "epoch": 1.1124932175800326, + "grad_norm": 6.174139022827148, + "learning_rate": 6.945594079758805e-06, + "loss": 0.2759, + "step": 23039 + }, + { + "epoch": 1.1125067824199675, + "grad_norm": 4.1379780769348145, + "learning_rate": 6.945457037138551e-06, + "loss": 0.2558, + "step": 23040 + }, + { + "epoch": 1.1125203472599023, + "grad_norm": 4.424392223358154, + "learning_rate": 6.945319994518296e-06, + "loss": 0.2216, + "step": 23041 + }, + { + "epoch": 1.1125339120998372, + "grad_norm": 3.432583808898926, + "learning_rate": 6.945182951898041e-06, + "loss": 0.2544, + "step": 23042 + }, + { + "epoch": 1.112547476939772, + "grad_norm": 5.376235485076904, + "learning_rate": 6.945045909277785e-06, + "loss": 0.2559, + "step": 23043 + }, + { + "epoch": 1.112561041779707, + "grad_norm": 6.301270484924316, + "learning_rate": 6.944908866657531e-06, + "loss": 0.2726, + "step": 23044 + }, + { + "epoch": 1.1125746066196418, + "grad_norm": 4.101805210113525, + "learning_rate": 6.9447718240372765e-06, + "loss": 0.2391, + "step": 23045 + }, + { + "epoch": 1.112588171459577, + "grad_norm": 5.420159816741943, + "learning_rate": 6.944634781417021e-06, + "loss": 0.2525, + "step": 23046 + }, + { + "epoch": 1.1126017362995118, + "grad_norm": 3.3764560222625732, + "learning_rate": 6.944497738796766e-06, + "loss": 0.1815, + "step": 23047 + }, + { + "epoch": 1.1126153011394466, + "grad_norm": 4.714451313018799, + "learning_rate": 6.944360696176512e-06, + "loss": 0.2782, + "step": 23048 + }, + { + "epoch": 1.1126288659793815, + "grad_norm": 5.691120624542236, + "learning_rate": 6.944223653556257e-06, + "loss": 0.3914, + "step": 23049 + }, + { + "epoch": 1.1126424308193164, + "grad_norm": 4.97498893737793, + "learning_rate": 6.9440866109360015e-06, + "loss": 0.2184, + "step": 23050 + }, + { + "epoch": 1.1126559956592512, + "grad_norm": 5.737167835235596, + "learning_rate": 6.943949568315747e-06, + "loss": 0.264, + "step": 23051 + }, + { + "epoch": 1.112669560499186, + "grad_norm": 4.981302738189697, + "learning_rate": 6.943812525695491e-06, + "loss": 0.2239, + "step": 23052 + }, + { + "epoch": 1.112683125339121, + "grad_norm": 4.615480899810791, + "learning_rate": 6.943675483075237e-06, + "loss": 0.2866, + "step": 23053 + }, + { + "epoch": 1.1126966901790558, + "grad_norm": 6.165054798126221, + "learning_rate": 6.943538440454982e-06, + "loss": 0.2853, + "step": 23054 + }, + { + "epoch": 1.1127102550189907, + "grad_norm": 4.407002925872803, + "learning_rate": 6.943401397834727e-06, + "loss": 0.2087, + "step": 23055 + }, + { + "epoch": 1.1127238198589258, + "grad_norm": 6.314635276794434, + "learning_rate": 6.943264355214472e-06, + "loss": 0.2668, + "step": 23056 + }, + { + "epoch": 1.1127373846988606, + "grad_norm": 4.7696661949157715, + "learning_rate": 6.943127312594218e-06, + "loss": 0.2532, + "step": 23057 + }, + { + "epoch": 1.1127509495387955, + "grad_norm": 5.437429904937744, + "learning_rate": 6.942990269973963e-06, + "loss": 0.261, + "step": 23058 + }, + { + "epoch": 1.1127645143787304, + "grad_norm": 4.8014607429504395, + "learning_rate": 6.942853227353707e-06, + "loss": 0.3213, + "step": 23059 + }, + { + "epoch": 1.1127780792186652, + "grad_norm": 5.752913475036621, + "learning_rate": 6.9427161847334525e-06, + "loss": 0.2063, + "step": 23060 + }, + { + "epoch": 1.1127916440586, + "grad_norm": 3.7992002964019775, + "learning_rate": 6.9425791421131985e-06, + "loss": 0.2132, + "step": 23061 + }, + { + "epoch": 1.112805208898535, + "grad_norm": 4.5421953201293945, + "learning_rate": 6.942442099492943e-06, + "loss": 0.2273, + "step": 23062 + }, + { + "epoch": 1.1128187737384698, + "grad_norm": 4.926238059997559, + "learning_rate": 6.942305056872688e-06, + "loss": 0.2056, + "step": 23063 + }, + { + "epoch": 1.1128323385784047, + "grad_norm": 5.1658549308776855, + "learning_rate": 6.942168014252433e-06, + "loss": 0.2187, + "step": 23064 + }, + { + "epoch": 1.1128459034183398, + "grad_norm": 4.149150848388672, + "learning_rate": 6.9420309716321775e-06, + "loss": 0.2254, + "step": 23065 + }, + { + "epoch": 1.1128594682582746, + "grad_norm": 6.588170528411865, + "learning_rate": 6.9418939290119236e-06, + "loss": 0.3988, + "step": 23066 + }, + { + "epoch": 1.1128730330982095, + "grad_norm": 5.4002604484558105, + "learning_rate": 6.941756886391669e-06, + "loss": 0.278, + "step": 23067 + }, + { + "epoch": 1.1128865979381444, + "grad_norm": 6.008469104766846, + "learning_rate": 6.941619843771413e-06, + "loss": 0.3271, + "step": 23068 + }, + { + "epoch": 1.1129001627780792, + "grad_norm": 5.998889446258545, + "learning_rate": 6.941482801151158e-06, + "loss": 0.2386, + "step": 23069 + }, + { + "epoch": 1.112913727618014, + "grad_norm": 4.867659568786621, + "learning_rate": 6.941345758530904e-06, + "loss": 0.2662, + "step": 23070 + }, + { + "epoch": 1.112927292457949, + "grad_norm": 4.541898250579834, + "learning_rate": 6.941208715910649e-06, + "loss": 0.2574, + "step": 23071 + }, + { + "epoch": 1.1129408572978838, + "grad_norm": 5.608706951141357, + "learning_rate": 6.941071673290394e-06, + "loss": 0.3293, + "step": 23072 + }, + { + "epoch": 1.1129544221378187, + "grad_norm": 3.8045198917388916, + "learning_rate": 6.940934630670139e-06, + "loss": 0.2413, + "step": 23073 + }, + { + "epoch": 1.1129679869777536, + "grad_norm": 3.213885545730591, + "learning_rate": 6.940797588049885e-06, + "loss": 0.1977, + "step": 23074 + }, + { + "epoch": 1.1129815518176887, + "grad_norm": 5.118931293487549, + "learning_rate": 6.940660545429629e-06, + "loss": 0.3465, + "step": 23075 + }, + { + "epoch": 1.1129951166576235, + "grad_norm": 4.268961429595947, + "learning_rate": 6.9405235028093745e-06, + "loss": 0.1929, + "step": 23076 + }, + { + "epoch": 1.1130086814975584, + "grad_norm": 5.927586555480957, + "learning_rate": 6.940386460189119e-06, + "loss": 0.2774, + "step": 23077 + }, + { + "epoch": 1.1130222463374932, + "grad_norm": 5.169384002685547, + "learning_rate": 6.940249417568864e-06, + "loss": 0.2257, + "step": 23078 + }, + { + "epoch": 1.1130358111774281, + "grad_norm": 4.8923773765563965, + "learning_rate": 6.94011237494861e-06, + "loss": 0.2813, + "step": 23079 + }, + { + "epoch": 1.113049376017363, + "grad_norm": 4.349088668823242, + "learning_rate": 6.939975332328354e-06, + "loss": 0.2104, + "step": 23080 + }, + { + "epoch": 1.1130629408572978, + "grad_norm": 6.390042781829834, + "learning_rate": 6.9398382897080995e-06, + "loss": 0.2262, + "step": 23081 + }, + { + "epoch": 1.1130765056972327, + "grad_norm": 3.614049196243286, + "learning_rate": 6.939701247087845e-06, + "loss": 0.2259, + "step": 23082 + }, + { + "epoch": 1.1130900705371676, + "grad_norm": 3.7143287658691406, + "learning_rate": 6.939564204467591e-06, + "loss": 0.1955, + "step": 23083 + }, + { + "epoch": 1.1131036353771027, + "grad_norm": 5.179670810699463, + "learning_rate": 6.939427161847335e-06, + "loss": 0.2802, + "step": 23084 + }, + { + "epoch": 1.1131172002170375, + "grad_norm": 4.634892463684082, + "learning_rate": 6.93929011922708e-06, + "loss": 0.2636, + "step": 23085 + }, + { + "epoch": 1.1131307650569724, + "grad_norm": 4.650501251220703, + "learning_rate": 6.939153076606825e-06, + "loss": 0.265, + "step": 23086 + }, + { + "epoch": 1.1131443298969073, + "grad_norm": 6.701895713806152, + "learning_rate": 6.939016033986571e-06, + "loss": 0.3621, + "step": 23087 + }, + { + "epoch": 1.1131578947368421, + "grad_norm": 6.452335357666016, + "learning_rate": 6.938878991366316e-06, + "loss": 0.2849, + "step": 23088 + }, + { + "epoch": 1.113171459576777, + "grad_norm": 4.18863582611084, + "learning_rate": 6.938741948746061e-06, + "loss": 0.129, + "step": 23089 + }, + { + "epoch": 1.1131850244167119, + "grad_norm": 3.8592262268066406, + "learning_rate": 6.938604906125805e-06, + "loss": 0.2426, + "step": 23090 + }, + { + "epoch": 1.1131985892566467, + "grad_norm": 4.210902214050293, + "learning_rate": 6.938467863505551e-06, + "loss": 0.2433, + "step": 23091 + }, + { + "epoch": 1.1132121540965816, + "grad_norm": 4.975151062011719, + "learning_rate": 6.9383308208852965e-06, + "loss": 0.3191, + "step": 23092 + }, + { + "epoch": 1.1132257189365165, + "grad_norm": 4.578901290893555, + "learning_rate": 6.938193778265041e-06, + "loss": 0.2055, + "step": 23093 + }, + { + "epoch": 1.1132392837764515, + "grad_norm": 4.605186462402344, + "learning_rate": 6.938056735644786e-06, + "loss": 0.3479, + "step": 23094 + }, + { + "epoch": 1.1132528486163864, + "grad_norm": 4.105040550231934, + "learning_rate": 6.93791969302453e-06, + "loss": 0.2366, + "step": 23095 + }, + { + "epoch": 1.1132664134563213, + "grad_norm": 5.670836925506592, + "learning_rate": 6.937782650404276e-06, + "loss": 0.2143, + "step": 23096 + }, + { + "epoch": 1.1132799782962561, + "grad_norm": 3.7164440155029297, + "learning_rate": 6.9376456077840216e-06, + "loss": 0.2103, + "step": 23097 + }, + { + "epoch": 1.113293543136191, + "grad_norm": 4.3505473136901855, + "learning_rate": 6.937508565163767e-06, + "loss": 0.1704, + "step": 23098 + }, + { + "epoch": 1.1133071079761259, + "grad_norm": 3.0504918098449707, + "learning_rate": 6.937371522543511e-06, + "loss": 0.1823, + "step": 23099 + }, + { + "epoch": 1.1133206728160607, + "grad_norm": 4.855188369750977, + "learning_rate": 6.937234479923257e-06, + "loss": 0.2351, + "step": 23100 + }, + { + "epoch": 1.1133342376559956, + "grad_norm": 4.475563049316406, + "learning_rate": 6.937097437303002e-06, + "loss": 0.216, + "step": 23101 + }, + { + "epoch": 1.1133478024959305, + "grad_norm": 6.062412738800049, + "learning_rate": 6.936960394682747e-06, + "loss": 0.3873, + "step": 23102 + }, + { + "epoch": 1.1133613673358655, + "grad_norm": 5.167839050292969, + "learning_rate": 6.936823352062492e-06, + "loss": 0.2921, + "step": 23103 + }, + { + "epoch": 1.1133749321758004, + "grad_norm": 5.826348781585693, + "learning_rate": 6.936686309442238e-06, + "loss": 0.2326, + "step": 23104 + }, + { + "epoch": 1.1133884970157353, + "grad_norm": 3.932021141052246, + "learning_rate": 6.936549266821982e-06, + "loss": 0.1867, + "step": 23105 + }, + { + "epoch": 1.1134020618556701, + "grad_norm": 4.235241889953613, + "learning_rate": 6.936412224201727e-06, + "loss": 0.2032, + "step": 23106 + }, + { + "epoch": 1.113415626695605, + "grad_norm": 4.234071254730225, + "learning_rate": 6.9362751815814725e-06, + "loss": 0.2314, + "step": 23107 + }, + { + "epoch": 1.1134291915355399, + "grad_norm": 3.475736141204834, + "learning_rate": 6.936138138961217e-06, + "loss": 0.2651, + "step": 23108 + }, + { + "epoch": 1.1134427563754747, + "grad_norm": 4.676278114318848, + "learning_rate": 6.936001096340963e-06, + "loss": 0.321, + "step": 23109 + }, + { + "epoch": 1.1134563212154096, + "grad_norm": 3.6842565536499023, + "learning_rate": 6.935864053720708e-06, + "loss": 0.1897, + "step": 23110 + }, + { + "epoch": 1.1134698860553445, + "grad_norm": 5.968270778656006, + "learning_rate": 6.935727011100452e-06, + "loss": 0.2964, + "step": 23111 + }, + { + "epoch": 1.1134834508952793, + "grad_norm": 6.305400371551514, + "learning_rate": 6.9355899684801975e-06, + "loss": 0.3107, + "step": 23112 + }, + { + "epoch": 1.1134970157352144, + "grad_norm": 4.613133430480957, + "learning_rate": 6.9354529258599436e-06, + "loss": 0.2488, + "step": 23113 + }, + { + "epoch": 1.1135105805751493, + "grad_norm": 5.227421760559082, + "learning_rate": 6.935315883239688e-06, + "loss": 0.3062, + "step": 23114 + }, + { + "epoch": 1.1135241454150842, + "grad_norm": 5.837420463562012, + "learning_rate": 6.935178840619433e-06, + "loss": 0.2195, + "step": 23115 + }, + { + "epoch": 1.113537710255019, + "grad_norm": 5.325902938842773, + "learning_rate": 6.935041797999178e-06, + "loss": 0.2923, + "step": 23116 + }, + { + "epoch": 1.1135512750949539, + "grad_norm": 5.36753511428833, + "learning_rate": 6.934904755378924e-06, + "loss": 0.2881, + "step": 23117 + }, + { + "epoch": 1.1135648399348888, + "grad_norm": 4.084394931793213, + "learning_rate": 6.934767712758669e-06, + "loss": 0.1797, + "step": 23118 + }, + { + "epoch": 1.1135784047748236, + "grad_norm": 5.888981342315674, + "learning_rate": 6.934630670138414e-06, + "loss": 0.3826, + "step": 23119 + }, + { + "epoch": 1.1135919696147585, + "grad_norm": 6.390832901000977, + "learning_rate": 6.934493627518158e-06, + "loss": 0.3264, + "step": 23120 + }, + { + "epoch": 1.1136055344546933, + "grad_norm": 5.4111714363098145, + "learning_rate": 6.934356584897903e-06, + "loss": 0.2687, + "step": 23121 + }, + { + "epoch": 1.1136190992946284, + "grad_norm": 6.341491222381592, + "learning_rate": 6.934219542277649e-06, + "loss": 0.2339, + "step": 23122 + }, + { + "epoch": 1.1136326641345633, + "grad_norm": 3.8930749893188477, + "learning_rate": 6.9340824996573945e-06, + "loss": 0.2185, + "step": 23123 + }, + { + "epoch": 1.1136462289744982, + "grad_norm": 7.226675033569336, + "learning_rate": 6.933945457037139e-06, + "loss": 0.3551, + "step": 23124 + }, + { + "epoch": 1.113659793814433, + "grad_norm": 4.024565696716309, + "learning_rate": 6.933808414416884e-06, + "loss": 0.1384, + "step": 23125 + }, + { + "epoch": 1.113673358654368, + "grad_norm": 5.047883987426758, + "learning_rate": 6.93367137179663e-06, + "loss": 0.3323, + "step": 23126 + }, + { + "epoch": 1.1136869234943028, + "grad_norm": 4.871946334838867, + "learning_rate": 6.933534329176374e-06, + "loss": 0.2973, + "step": 23127 + }, + { + "epoch": 1.1137004883342376, + "grad_norm": 5.640187740325928, + "learning_rate": 6.9333972865561196e-06, + "loss": 0.2189, + "step": 23128 + }, + { + "epoch": 1.1137140531741725, + "grad_norm": 5.394364356994629, + "learning_rate": 6.933260243935864e-06, + "loss": 0.3158, + "step": 23129 + }, + { + "epoch": 1.1137276180141074, + "grad_norm": 5.724880218505859, + "learning_rate": 6.93312320131561e-06, + "loss": 0.3198, + "step": 23130 + }, + { + "epoch": 1.1137411828540422, + "grad_norm": 5.283949851989746, + "learning_rate": 6.932986158695355e-06, + "loss": 0.2657, + "step": 23131 + }, + { + "epoch": 1.1137547476939773, + "grad_norm": 4.563751697540283, + "learning_rate": 6.9328491160751e-06, + "loss": 0.2563, + "step": 23132 + }, + { + "epoch": 1.1137683125339122, + "grad_norm": 4.830646514892578, + "learning_rate": 6.932712073454845e-06, + "loss": 0.2788, + "step": 23133 + }, + { + "epoch": 1.113781877373847, + "grad_norm": 5.605771064758301, + "learning_rate": 6.93257503083459e-06, + "loss": 0.2105, + "step": 23134 + }, + { + "epoch": 1.113795442213782, + "grad_norm": 6.2071404457092285, + "learning_rate": 6.932437988214336e-06, + "loss": 0.3042, + "step": 23135 + }, + { + "epoch": 1.1138090070537168, + "grad_norm": 7.299345016479492, + "learning_rate": 6.93230094559408e-06, + "loss": 0.4199, + "step": 23136 + }, + { + "epoch": 1.1138225718936516, + "grad_norm": 6.295603275299072, + "learning_rate": 6.932163902973825e-06, + "loss": 0.3877, + "step": 23137 + }, + { + "epoch": 1.1138361367335865, + "grad_norm": 5.772770881652832, + "learning_rate": 6.9320268603535705e-06, + "loss": 0.3325, + "step": 23138 + }, + { + "epoch": 1.1138497015735214, + "grad_norm": 4.118663787841797, + "learning_rate": 6.931889817733316e-06, + "loss": 0.1728, + "step": 23139 + }, + { + "epoch": 1.1138632664134562, + "grad_norm": 3.7487497329711914, + "learning_rate": 6.931752775113061e-06, + "loss": 0.1905, + "step": 23140 + }, + { + "epoch": 1.1138768312533913, + "grad_norm": 4.294993877410889, + "learning_rate": 6.931615732492806e-06, + "loss": 0.2801, + "step": 23141 + }, + { + "epoch": 1.1138903960933262, + "grad_norm": 8.97136116027832, + "learning_rate": 6.93147868987255e-06, + "loss": 0.407, + "step": 23142 + }, + { + "epoch": 1.113903960933261, + "grad_norm": 5.657415390014648, + "learning_rate": 6.931341647252296e-06, + "loss": 0.4589, + "step": 23143 + }, + { + "epoch": 1.113917525773196, + "grad_norm": 5.3259382247924805, + "learning_rate": 6.931204604632042e-06, + "loss": 0.3705, + "step": 23144 + }, + { + "epoch": 1.1139310906131308, + "grad_norm": 6.388289451599121, + "learning_rate": 6.931067562011786e-06, + "loss": 0.3937, + "step": 23145 + }, + { + "epoch": 1.1139446554530656, + "grad_norm": 5.325845241546631, + "learning_rate": 6.930930519391531e-06, + "loss": 0.2628, + "step": 23146 + }, + { + "epoch": 1.1139582202930005, + "grad_norm": 6.518589019775391, + "learning_rate": 6.930793476771276e-06, + "loss": 0.2462, + "step": 23147 + }, + { + "epoch": 1.1139717851329354, + "grad_norm": 5.269197463989258, + "learning_rate": 6.9306564341510214e-06, + "loss": 0.2146, + "step": 23148 + }, + { + "epoch": 1.1139853499728702, + "grad_norm": 5.250528812408447, + "learning_rate": 6.930519391530767e-06, + "loss": 0.3305, + "step": 23149 + }, + { + "epoch": 1.113998914812805, + "grad_norm": 5.646810054779053, + "learning_rate": 6.930382348910512e-06, + "loss": 0.3057, + "step": 23150 + }, + { + "epoch": 1.1140124796527402, + "grad_norm": 6.330715656280518, + "learning_rate": 6.930245306290256e-06, + "loss": 0.3073, + "step": 23151 + }, + { + "epoch": 1.114026044492675, + "grad_norm": 7.7860822677612305, + "learning_rate": 6.930108263670002e-06, + "loss": 0.4685, + "step": 23152 + }, + { + "epoch": 1.11403960933261, + "grad_norm": 5.565608978271484, + "learning_rate": 6.929971221049747e-06, + "loss": 0.2635, + "step": 23153 + }, + { + "epoch": 1.1140531741725448, + "grad_norm": 4.003292083740234, + "learning_rate": 6.929834178429492e-06, + "loss": 0.2111, + "step": 23154 + }, + { + "epoch": 1.1140667390124797, + "grad_norm": 4.52680778503418, + "learning_rate": 6.929697135809237e-06, + "loss": 0.2943, + "step": 23155 + }, + { + "epoch": 1.1140803038524145, + "grad_norm": 7.0707926750183105, + "learning_rate": 6.929560093188983e-06, + "loss": 0.3881, + "step": 23156 + }, + { + "epoch": 1.1140938686923494, + "grad_norm": 6.1202263832092285, + "learning_rate": 6.929423050568728e-06, + "loss": 0.4618, + "step": 23157 + }, + { + "epoch": 1.1141074335322843, + "grad_norm": 5.418120384216309, + "learning_rate": 6.929286007948472e-06, + "loss": 0.2628, + "step": 23158 + }, + { + "epoch": 1.1141209983722191, + "grad_norm": 5.443832874298096, + "learning_rate": 6.9291489653282176e-06, + "loss": 0.2775, + "step": 23159 + }, + { + "epoch": 1.1141345632121542, + "grad_norm": 4.469067096710205, + "learning_rate": 6.929011922707962e-06, + "loss": 0.267, + "step": 23160 + }, + { + "epoch": 1.114148128052089, + "grad_norm": 4.211479663848877, + "learning_rate": 6.928874880087708e-06, + "loss": 0.2555, + "step": 23161 + }, + { + "epoch": 1.114161692892024, + "grad_norm": 3.924952507019043, + "learning_rate": 6.928737837467453e-06, + "loss": 0.1569, + "step": 23162 + }, + { + "epoch": 1.1141752577319588, + "grad_norm": 5.835218906402588, + "learning_rate": 6.9286007948471974e-06, + "loss": 0.292, + "step": 23163 + }, + { + "epoch": 1.1141888225718937, + "grad_norm": 4.139908313751221, + "learning_rate": 6.928463752226943e-06, + "loss": 0.2341, + "step": 23164 + }, + { + "epoch": 1.1142023874118285, + "grad_norm": 3.447554349899292, + "learning_rate": 6.928326709606689e-06, + "loss": 0.1591, + "step": 23165 + }, + { + "epoch": 1.1142159522517634, + "grad_norm": 6.6417975425720215, + "learning_rate": 6.928189666986434e-06, + "loss": 0.3015, + "step": 23166 + }, + { + "epoch": 1.1142295170916983, + "grad_norm": 4.791257858276367, + "learning_rate": 6.928052624366178e-06, + "loss": 0.2346, + "step": 23167 + }, + { + "epoch": 1.1142430819316331, + "grad_norm": 6.668588638305664, + "learning_rate": 6.927915581745923e-06, + "loss": 0.3596, + "step": 23168 + }, + { + "epoch": 1.114256646771568, + "grad_norm": 4.7449259757995605, + "learning_rate": 6.927778539125669e-06, + "loss": 0.209, + "step": 23169 + }, + { + "epoch": 1.114270211611503, + "grad_norm": 5.207056045532227, + "learning_rate": 6.927641496505414e-06, + "loss": 0.3624, + "step": 23170 + }, + { + "epoch": 1.114283776451438, + "grad_norm": 6.420828819274902, + "learning_rate": 6.927504453885159e-06, + "loss": 0.2546, + "step": 23171 + }, + { + "epoch": 1.1142973412913728, + "grad_norm": 7.766866683959961, + "learning_rate": 6.927367411264904e-06, + "loss": 0.1982, + "step": 23172 + }, + { + "epoch": 1.1143109061313077, + "grad_norm": 6.6017842292785645, + "learning_rate": 6.927230368644649e-06, + "loss": 0.4308, + "step": 23173 + }, + { + "epoch": 1.1143244709712425, + "grad_norm": 4.459277153015137, + "learning_rate": 6.927093326024394e-06, + "loss": 0.3673, + "step": 23174 + }, + { + "epoch": 1.1143380358111774, + "grad_norm": 6.545133113861084, + "learning_rate": 6.92695628340414e-06, + "loss": 0.3885, + "step": 23175 + }, + { + "epoch": 1.1143516006511123, + "grad_norm": 6.614600658416748, + "learning_rate": 6.926819240783884e-06, + "loss": 0.2327, + "step": 23176 + }, + { + "epoch": 1.1143651654910471, + "grad_norm": 9.008966445922852, + "learning_rate": 6.926682198163629e-06, + "loss": 0.3605, + "step": 23177 + }, + { + "epoch": 1.114378730330982, + "grad_norm": 3.997821569442749, + "learning_rate": 6.926545155543375e-06, + "loss": 0.2628, + "step": 23178 + }, + { + "epoch": 1.114392295170917, + "grad_norm": 5.039969444274902, + "learning_rate": 6.9264081129231195e-06, + "loss": 0.3303, + "step": 23179 + }, + { + "epoch": 1.114405860010852, + "grad_norm": 8.612123489379883, + "learning_rate": 6.926271070302865e-06, + "loss": 0.4346, + "step": 23180 + }, + { + "epoch": 1.1144194248507868, + "grad_norm": 8.54804515838623, + "learning_rate": 6.92613402768261e-06, + "loss": 0.3808, + "step": 23181 + }, + { + "epoch": 1.1144329896907217, + "grad_norm": 9.017435073852539, + "learning_rate": 6.925996985062355e-06, + "loss": 0.5296, + "step": 23182 + }, + { + "epoch": 1.1144465545306566, + "grad_norm": 4.954653739929199, + "learning_rate": 6.9258599424421e-06, + "loss": 0.1692, + "step": 23183 + }, + { + "epoch": 1.1144601193705914, + "grad_norm": 4.9345383644104, + "learning_rate": 6.925722899821845e-06, + "loss": 0.2682, + "step": 23184 + }, + { + "epoch": 1.1144736842105263, + "grad_norm": 6.447023868560791, + "learning_rate": 6.92558585720159e-06, + "loss": 0.254, + "step": 23185 + }, + { + "epoch": 1.1144872490504611, + "grad_norm": 3.911358594894409, + "learning_rate": 6.925448814581336e-06, + "loss": 0.2323, + "step": 23186 + }, + { + "epoch": 1.114500813890396, + "grad_norm": 7.286473751068115, + "learning_rate": 6.925311771961081e-06, + "loss": 0.2778, + "step": 23187 + }, + { + "epoch": 1.1145143787303309, + "grad_norm": 8.044450759887695, + "learning_rate": 6.925174729340825e-06, + "loss": 0.3205, + "step": 23188 + }, + { + "epoch": 1.114527943570266, + "grad_norm": 6.159614562988281, + "learning_rate": 6.92503768672057e-06, + "loss": 0.2872, + "step": 23189 + }, + { + "epoch": 1.1145415084102008, + "grad_norm": 3.5339951515197754, + "learning_rate": 6.9249006441003156e-06, + "loss": 0.2068, + "step": 23190 + }, + { + "epoch": 1.1145550732501357, + "grad_norm": 6.060108661651611, + "learning_rate": 6.924763601480062e-06, + "loss": 0.4123, + "step": 23191 + }, + { + "epoch": 1.1145686380900706, + "grad_norm": 7.442878723144531, + "learning_rate": 6.924626558859806e-06, + "loss": 0.3665, + "step": 23192 + }, + { + "epoch": 1.1145822029300054, + "grad_norm": 6.533536911010742, + "learning_rate": 6.924489516239551e-06, + "loss": 0.3688, + "step": 23193 + }, + { + "epoch": 1.1145957677699403, + "grad_norm": 5.073164463043213, + "learning_rate": 6.9243524736192954e-06, + "loss": 0.2475, + "step": 23194 + }, + { + "epoch": 1.1146093326098752, + "grad_norm": 5.262049198150635, + "learning_rate": 6.9242154309990415e-06, + "loss": 0.1697, + "step": 23195 + }, + { + "epoch": 1.11462289744981, + "grad_norm": 3.891308069229126, + "learning_rate": 6.924078388378787e-06, + "loss": 0.1765, + "step": 23196 + }, + { + "epoch": 1.114636462289745, + "grad_norm": 4.250765323638916, + "learning_rate": 6.923941345758531e-06, + "loss": 0.1179, + "step": 23197 + }, + { + "epoch": 1.11465002712968, + "grad_norm": 6.763547420501709, + "learning_rate": 6.923804303138276e-06, + "loss": 0.306, + "step": 23198 + }, + { + "epoch": 1.1146635919696148, + "grad_norm": 5.9749345779418945, + "learning_rate": 6.923667260518022e-06, + "loss": 0.2685, + "step": 23199 + }, + { + "epoch": 1.1146771568095497, + "grad_norm": 7.6844635009765625, + "learning_rate": 6.923530217897767e-06, + "loss": 0.3931, + "step": 23200 + }, + { + "epoch": 1.1146907216494846, + "grad_norm": 6.210968971252441, + "learning_rate": 6.923393175277512e-06, + "loss": 0.2446, + "step": 23201 + }, + { + "epoch": 1.1147042864894194, + "grad_norm": 6.562623977661133, + "learning_rate": 6.923256132657257e-06, + "loss": 0.3633, + "step": 23202 + }, + { + "epoch": 1.1147178513293543, + "grad_norm": 7.73003625869751, + "learning_rate": 6.923119090037001e-06, + "loss": 0.4527, + "step": 23203 + }, + { + "epoch": 1.1147314161692892, + "grad_norm": 4.458917617797852, + "learning_rate": 6.922982047416747e-06, + "loss": 0.2715, + "step": 23204 + }, + { + "epoch": 1.114744981009224, + "grad_norm": 8.546530723571777, + "learning_rate": 6.922845004796492e-06, + "loss": 0.4373, + "step": 23205 + }, + { + "epoch": 1.114758545849159, + "grad_norm": 5.267751216888428, + "learning_rate": 6.922707962176238e-06, + "loss": 0.2767, + "step": 23206 + }, + { + "epoch": 1.1147721106890938, + "grad_norm": 5.5754499435424805, + "learning_rate": 6.922570919555982e-06, + "loss": 0.3039, + "step": 23207 + }, + { + "epoch": 1.1147856755290289, + "grad_norm": 6.096190452575684, + "learning_rate": 6.922433876935728e-06, + "loss": 0.2558, + "step": 23208 + }, + { + "epoch": 1.1147992403689637, + "grad_norm": 5.091282844543457, + "learning_rate": 6.922296834315473e-06, + "loss": 0.2949, + "step": 23209 + }, + { + "epoch": 1.1148128052088986, + "grad_norm": 6.066428184509277, + "learning_rate": 6.9221597916952175e-06, + "loss": 0.2444, + "step": 23210 + }, + { + "epoch": 1.1148263700488334, + "grad_norm": 5.852235317230225, + "learning_rate": 6.922022749074963e-06, + "loss": 0.2702, + "step": 23211 + }, + { + "epoch": 1.1148399348887683, + "grad_norm": 6.4445343017578125, + "learning_rate": 6.921885706454709e-06, + "loss": 0.3703, + "step": 23212 + }, + { + "epoch": 1.1148534997287032, + "grad_norm": 8.083159446716309, + "learning_rate": 6.921748663834453e-06, + "loss": 0.333, + "step": 23213 + }, + { + "epoch": 1.114867064568638, + "grad_norm": 8.15096664428711, + "learning_rate": 6.921611621214198e-06, + "loss": 0.3257, + "step": 23214 + }, + { + "epoch": 1.114880629408573, + "grad_norm": 7.905803203582764, + "learning_rate": 6.921474578593943e-06, + "loss": 0.3539, + "step": 23215 + }, + { + "epoch": 1.1148941942485078, + "grad_norm": 6.476252555847168, + "learning_rate": 6.921337535973688e-06, + "loss": 0.3029, + "step": 23216 + }, + { + "epoch": 1.1149077590884429, + "grad_norm": 7.380586624145508, + "learning_rate": 6.921200493353434e-06, + "loss": 0.2949, + "step": 23217 + }, + { + "epoch": 1.1149213239283777, + "grad_norm": 3.882274866104126, + "learning_rate": 6.921063450733179e-06, + "loss": 0.1781, + "step": 23218 + }, + { + "epoch": 1.1149348887683126, + "grad_norm": 5.012576580047607, + "learning_rate": 6.920926408112923e-06, + "loss": 0.3227, + "step": 23219 + }, + { + "epoch": 1.1149484536082475, + "grad_norm": 4.6388726234436035, + "learning_rate": 6.920789365492668e-06, + "loss": 0.2619, + "step": 23220 + }, + { + "epoch": 1.1149620184481823, + "grad_norm": 5.626216888427734, + "learning_rate": 6.920652322872414e-06, + "loss": 0.3645, + "step": 23221 + }, + { + "epoch": 1.1149755832881172, + "grad_norm": 5.50447416305542, + "learning_rate": 6.920515280252159e-06, + "loss": 0.3348, + "step": 23222 + }, + { + "epoch": 1.114989148128052, + "grad_norm": 6.39539909362793, + "learning_rate": 6.920378237631904e-06, + "loss": 0.4716, + "step": 23223 + }, + { + "epoch": 1.115002712967987, + "grad_norm": 5.60434103012085, + "learning_rate": 6.920241195011649e-06, + "loss": 0.3171, + "step": 23224 + }, + { + "epoch": 1.1150162778079218, + "grad_norm": 5.283192157745361, + "learning_rate": 6.920104152391395e-06, + "loss": 0.2416, + "step": 23225 + }, + { + "epoch": 1.1150298426478567, + "grad_norm": 6.804684162139893, + "learning_rate": 6.9199671097711395e-06, + "loss": 0.4974, + "step": 23226 + }, + { + "epoch": 1.1150434074877917, + "grad_norm": 4.608340740203857, + "learning_rate": 6.919830067150885e-06, + "loss": 0.1799, + "step": 23227 + }, + { + "epoch": 1.1150569723277266, + "grad_norm": 6.161699295043945, + "learning_rate": 6.919693024530629e-06, + "loss": 0.4282, + "step": 23228 + }, + { + "epoch": 1.1150705371676615, + "grad_norm": 7.674046516418457, + "learning_rate": 6.919555981910374e-06, + "loss": 0.3855, + "step": 23229 + }, + { + "epoch": 1.1150841020075963, + "grad_norm": 5.802331447601318, + "learning_rate": 6.91941893929012e-06, + "loss": 0.245, + "step": 23230 + }, + { + "epoch": 1.1150976668475312, + "grad_norm": 6.228753089904785, + "learning_rate": 6.9192818966698645e-06, + "loss": 0.3114, + "step": 23231 + }, + { + "epoch": 1.115111231687466, + "grad_norm": 6.55942440032959, + "learning_rate": 6.91914485404961e-06, + "loss": 0.256, + "step": 23232 + }, + { + "epoch": 1.115124796527401, + "grad_norm": 4.906179904937744, + "learning_rate": 6.919007811429355e-06, + "loss": 0.2162, + "step": 23233 + }, + { + "epoch": 1.1151383613673358, + "grad_norm": 8.121509552001953, + "learning_rate": 6.918870768809101e-06, + "loss": 0.472, + "step": 23234 + }, + { + "epoch": 1.1151519262072707, + "grad_norm": 5.104936122894287, + "learning_rate": 6.918733726188845e-06, + "loss": 0.2444, + "step": 23235 + }, + { + "epoch": 1.1151654910472057, + "grad_norm": 5.840775489807129, + "learning_rate": 6.91859668356859e-06, + "loss": 0.2599, + "step": 23236 + }, + { + "epoch": 1.1151790558871406, + "grad_norm": 7.79613733291626, + "learning_rate": 6.918459640948335e-06, + "loss": 0.3976, + "step": 23237 + }, + { + "epoch": 1.1151926207270755, + "grad_norm": 4.638314247131348, + "learning_rate": 6.918322598328081e-06, + "loss": 0.229, + "step": 23238 + }, + { + "epoch": 1.1152061855670103, + "grad_norm": 6.128139495849609, + "learning_rate": 6.918185555707826e-06, + "loss": 0.3386, + "step": 23239 + }, + { + "epoch": 1.1152197504069452, + "grad_norm": 5.336766719818115, + "learning_rate": 6.918048513087571e-06, + "loss": 0.2675, + "step": 23240 + }, + { + "epoch": 1.11523331524688, + "grad_norm": 7.1218719482421875, + "learning_rate": 6.9179114704673155e-06, + "loss": 0.2589, + "step": 23241 + }, + { + "epoch": 1.115246880086815, + "grad_norm": 5.198670864105225, + "learning_rate": 6.9177744278470615e-06, + "loss": 0.2331, + "step": 23242 + }, + { + "epoch": 1.1152604449267498, + "grad_norm": 8.27983570098877, + "learning_rate": 6.917637385226807e-06, + "loss": 0.4808, + "step": 23243 + }, + { + "epoch": 1.1152740097666847, + "grad_norm": 5.455864906311035, + "learning_rate": 6.917500342606551e-06, + "loss": 0.3291, + "step": 23244 + }, + { + "epoch": 1.1152875746066195, + "grad_norm": 5.886898994445801, + "learning_rate": 6.917363299986296e-06, + "loss": 0.3236, + "step": 23245 + }, + { + "epoch": 1.1153011394465546, + "grad_norm": 4.38972806930542, + "learning_rate": 6.9172262573660405e-06, + "loss": 0.3415, + "step": 23246 + }, + { + "epoch": 1.1153147042864895, + "grad_norm": 4.990207195281982, + "learning_rate": 6.9170892147457865e-06, + "loss": 0.1805, + "step": 23247 + }, + { + "epoch": 1.1153282691264244, + "grad_norm": 7.004771709442139, + "learning_rate": 6.916952172125532e-06, + "loss": 0.368, + "step": 23248 + }, + { + "epoch": 1.1153418339663592, + "grad_norm": 7.4962663650512695, + "learning_rate": 6.916815129505277e-06, + "loss": 0.3497, + "step": 23249 + }, + { + "epoch": 1.115355398806294, + "grad_norm": 5.759243011474609, + "learning_rate": 6.916678086885021e-06, + "loss": 0.2858, + "step": 23250 + }, + { + "epoch": 1.115368963646229, + "grad_norm": 5.026552200317383, + "learning_rate": 6.916541044264767e-06, + "loss": 0.2304, + "step": 23251 + }, + { + "epoch": 1.1153825284861638, + "grad_norm": 6.0212321281433105, + "learning_rate": 6.9164040016445124e-06, + "loss": 0.4238, + "step": 23252 + }, + { + "epoch": 1.1153960933260987, + "grad_norm": 7.305113792419434, + "learning_rate": 6.916266959024257e-06, + "loss": 0.3, + "step": 23253 + }, + { + "epoch": 1.1154096581660335, + "grad_norm": 6.938101291656494, + "learning_rate": 6.916129916404002e-06, + "loss": 0.2502, + "step": 23254 + }, + { + "epoch": 1.1154232230059686, + "grad_norm": 6.624907970428467, + "learning_rate": 6.915992873783748e-06, + "loss": 0.299, + "step": 23255 + }, + { + "epoch": 1.1154367878459035, + "grad_norm": 5.6385626792907715, + "learning_rate": 6.915855831163492e-06, + "loss": 0.2459, + "step": 23256 + }, + { + "epoch": 1.1154503526858384, + "grad_norm": 6.479669094085693, + "learning_rate": 6.9157187885432375e-06, + "loss": 0.2609, + "step": 23257 + }, + { + "epoch": 1.1154639175257732, + "grad_norm": 9.317736625671387, + "learning_rate": 6.915581745922983e-06, + "loss": 0.4192, + "step": 23258 + }, + { + "epoch": 1.115477482365708, + "grad_norm": 6.741611480712891, + "learning_rate": 6.915444703302727e-06, + "loss": 0.5169, + "step": 23259 + }, + { + "epoch": 1.115491047205643, + "grad_norm": 6.668393611907959, + "learning_rate": 6.915307660682473e-06, + "loss": 0.4717, + "step": 23260 + }, + { + "epoch": 1.1155046120455778, + "grad_norm": 6.065480709075928, + "learning_rate": 6.915170618062218e-06, + "loss": 0.4109, + "step": 23261 + }, + { + "epoch": 1.1155181768855127, + "grad_norm": 6.464702129364014, + "learning_rate": 6.9150335754419625e-06, + "loss": 0.308, + "step": 23262 + }, + { + "epoch": 1.1155317417254476, + "grad_norm": 6.045506954193115, + "learning_rate": 6.914896532821708e-06, + "loss": 0.2749, + "step": 23263 + }, + { + "epoch": 1.1155453065653824, + "grad_norm": 4.245692253112793, + "learning_rate": 6.914759490201454e-06, + "loss": 0.2496, + "step": 23264 + }, + { + "epoch": 1.1155588714053175, + "grad_norm": 5.082010269165039, + "learning_rate": 6.914622447581199e-06, + "loss": 0.2566, + "step": 23265 + }, + { + "epoch": 1.1155724362452524, + "grad_norm": 7.115625381469727, + "learning_rate": 6.914485404960943e-06, + "loss": 0.3156, + "step": 23266 + }, + { + "epoch": 1.1155860010851872, + "grad_norm": 5.82315731048584, + "learning_rate": 6.914348362340688e-06, + "loss": 0.2965, + "step": 23267 + }, + { + "epoch": 1.115599565925122, + "grad_norm": 7.15322208404541, + "learning_rate": 6.9142113197204344e-06, + "loss": 0.3651, + "step": 23268 + }, + { + "epoch": 1.115613130765057, + "grad_norm": 4.907443046569824, + "learning_rate": 6.914074277100179e-06, + "loss": 0.2158, + "step": 23269 + }, + { + "epoch": 1.1156266956049918, + "grad_norm": 7.1807122230529785, + "learning_rate": 6.913937234479924e-06, + "loss": 0.3788, + "step": 23270 + }, + { + "epoch": 1.1156402604449267, + "grad_norm": 4.908028602600098, + "learning_rate": 6.913800191859668e-06, + "loss": 0.2639, + "step": 23271 + }, + { + "epoch": 1.1156538252848616, + "grad_norm": 6.046335697174072, + "learning_rate": 6.9136631492394135e-06, + "loss": 0.3701, + "step": 23272 + }, + { + "epoch": 1.1156673901247964, + "grad_norm": 5.7288994789123535, + "learning_rate": 6.9135261066191595e-06, + "loss": 0.2751, + "step": 23273 + }, + { + "epoch": 1.1156809549647315, + "grad_norm": 5.295816421508789, + "learning_rate": 6.913389063998905e-06, + "loss": 0.2727, + "step": 23274 + }, + { + "epoch": 1.1156945198046664, + "grad_norm": 5.635602951049805, + "learning_rate": 6.913252021378649e-06, + "loss": 0.3312, + "step": 23275 + }, + { + "epoch": 1.1157080846446013, + "grad_norm": 4.91229248046875, + "learning_rate": 6.913114978758394e-06, + "loss": 0.2465, + "step": 23276 + }, + { + "epoch": 1.1157216494845361, + "grad_norm": 4.919198989868164, + "learning_rate": 6.91297793613814e-06, + "loss": 0.2721, + "step": 23277 + }, + { + "epoch": 1.115735214324471, + "grad_norm": 6.038699150085449, + "learning_rate": 6.9128408935178845e-06, + "loss": 0.3192, + "step": 23278 + }, + { + "epoch": 1.1157487791644058, + "grad_norm": 5.782299995422363, + "learning_rate": 6.91270385089763e-06, + "loss": 0.2855, + "step": 23279 + }, + { + "epoch": 1.1157623440043407, + "grad_norm": 5.608933448791504, + "learning_rate": 6.912566808277375e-06, + "loss": 0.2905, + "step": 23280 + }, + { + "epoch": 1.1157759088442756, + "grad_norm": 4.329858303070068, + "learning_rate": 6.91242976565712e-06, + "loss": 0.2016, + "step": 23281 + }, + { + "epoch": 1.1157894736842104, + "grad_norm": 5.52708101272583, + "learning_rate": 6.912292723036865e-06, + "loss": 0.214, + "step": 23282 + }, + { + "epoch": 1.1158030385241453, + "grad_norm": 6.792294502258301, + "learning_rate": 6.9121556804166104e-06, + "loss": 0.3444, + "step": 23283 + }, + { + "epoch": 1.1158166033640804, + "grad_norm": 8.081097602844238, + "learning_rate": 6.912018637796355e-06, + "loss": 0.4397, + "step": 23284 + }, + { + "epoch": 1.1158301682040153, + "grad_norm": 5.473763465881348, + "learning_rate": 6.9118815951761e-06, + "loss": 0.3371, + "step": 23285 + }, + { + "epoch": 1.1158437330439501, + "grad_norm": 6.483036041259766, + "learning_rate": 6.911744552555846e-06, + "loss": 0.3213, + "step": 23286 + }, + { + "epoch": 1.115857297883885, + "grad_norm": 6.804695129394531, + "learning_rate": 6.91160750993559e-06, + "loss": 0.3813, + "step": 23287 + }, + { + "epoch": 1.1158708627238199, + "grad_norm": 6.414846897125244, + "learning_rate": 6.9114704673153355e-06, + "loss": 0.4489, + "step": 23288 + }, + { + "epoch": 1.1158844275637547, + "grad_norm": 8.860976219177246, + "learning_rate": 6.911333424695081e-06, + "loss": 0.4669, + "step": 23289 + }, + { + "epoch": 1.1158979924036896, + "grad_norm": 4.808743953704834, + "learning_rate": 6.911196382074826e-06, + "loss": 0.2791, + "step": 23290 + }, + { + "epoch": 1.1159115572436245, + "grad_norm": 7.103560447692871, + "learning_rate": 6.911059339454571e-06, + "loss": 0.3226, + "step": 23291 + }, + { + "epoch": 1.1159251220835593, + "grad_norm": 4.958953857421875, + "learning_rate": 6.910922296834316e-06, + "loss": 0.2091, + "step": 23292 + }, + { + "epoch": 1.1159386869234944, + "grad_norm": 5.609713554382324, + "learning_rate": 6.9107852542140605e-06, + "loss": 0.3844, + "step": 23293 + }, + { + "epoch": 1.1159522517634293, + "grad_norm": 6.184062957763672, + "learning_rate": 6.9106482115938066e-06, + "loss": 0.3714, + "step": 23294 + }, + { + "epoch": 1.1159658166033641, + "grad_norm": 4.244038105010986, + "learning_rate": 6.910511168973552e-06, + "loss": 0.2626, + "step": 23295 + }, + { + "epoch": 1.115979381443299, + "grad_norm": 4.665873050689697, + "learning_rate": 6.910374126353296e-06, + "loss": 0.3142, + "step": 23296 + }, + { + "epoch": 1.1159929462832339, + "grad_norm": 5.350808143615723, + "learning_rate": 6.910237083733041e-06, + "loss": 0.2962, + "step": 23297 + }, + { + "epoch": 1.1160065111231687, + "grad_norm": 4.80080509185791, + "learning_rate": 6.910100041112786e-06, + "loss": 0.2162, + "step": 23298 + }, + { + "epoch": 1.1160200759631036, + "grad_norm": 5.422170162200928, + "learning_rate": 6.9099629984925324e-06, + "loss": 0.3697, + "step": 23299 + }, + { + "epoch": 1.1160336408030385, + "grad_norm": 4.642691612243652, + "learning_rate": 6.909825955872277e-06, + "loss": 0.3316, + "step": 23300 + }, + { + "epoch": 1.1160472056429733, + "grad_norm": 5.276962757110596, + "learning_rate": 6.909688913252022e-06, + "loss": 0.3145, + "step": 23301 + }, + { + "epoch": 1.1160607704829082, + "grad_norm": 4.904524803161621, + "learning_rate": 6.909551870631766e-06, + "loss": 0.2872, + "step": 23302 + }, + { + "epoch": 1.1160743353228433, + "grad_norm": 4.5509257316589355, + "learning_rate": 6.909414828011512e-06, + "loss": 0.2042, + "step": 23303 + }, + { + "epoch": 1.1160879001627781, + "grad_norm": 3.9647037982940674, + "learning_rate": 6.9092777853912575e-06, + "loss": 0.1738, + "step": 23304 + }, + { + "epoch": 1.116101465002713, + "grad_norm": 4.259614944458008, + "learning_rate": 6.909140742771002e-06, + "loss": 0.2207, + "step": 23305 + }, + { + "epoch": 1.1161150298426479, + "grad_norm": 6.136521816253662, + "learning_rate": 6.909003700150747e-06, + "loss": 0.4521, + "step": 23306 + }, + { + "epoch": 1.1161285946825827, + "grad_norm": 5.552475929260254, + "learning_rate": 6.908866657530493e-06, + "loss": 0.3064, + "step": 23307 + }, + { + "epoch": 1.1161421595225176, + "grad_norm": 4.72163724899292, + "learning_rate": 6.908729614910238e-06, + "loss": 0.1641, + "step": 23308 + }, + { + "epoch": 1.1161557243624525, + "grad_norm": 5.55991268157959, + "learning_rate": 6.9085925722899825e-06, + "loss": 0.3715, + "step": 23309 + }, + { + "epoch": 1.1161692892023873, + "grad_norm": 4.567304611206055, + "learning_rate": 6.908455529669728e-06, + "loss": 0.2236, + "step": 23310 + }, + { + "epoch": 1.1161828540423222, + "grad_norm": 5.081817150115967, + "learning_rate": 6.908318487049474e-06, + "loss": 0.3144, + "step": 23311 + }, + { + "epoch": 1.1161964188822573, + "grad_norm": 3.9347617626190186, + "learning_rate": 6.908181444429218e-06, + "loss": 0.2179, + "step": 23312 + }, + { + "epoch": 1.1162099837221922, + "grad_norm": 8.742237091064453, + "learning_rate": 6.908044401808963e-06, + "loss": 0.4687, + "step": 23313 + }, + { + "epoch": 1.116223548562127, + "grad_norm": 4.578315258026123, + "learning_rate": 6.9079073591887084e-06, + "loss": 0.3088, + "step": 23314 + }, + { + "epoch": 1.1162371134020619, + "grad_norm": 6.63024377822876, + "learning_rate": 6.907770316568453e-06, + "loss": 0.262, + "step": 23315 + }, + { + "epoch": 1.1162506782419968, + "grad_norm": 5.899549961090088, + "learning_rate": 6.907633273948199e-06, + "loss": 0.2237, + "step": 23316 + }, + { + "epoch": 1.1162642430819316, + "grad_norm": 5.193869113922119, + "learning_rate": 6.907496231327944e-06, + "loss": 0.227, + "step": 23317 + }, + { + "epoch": 1.1162778079218665, + "grad_norm": 4.471730709075928, + "learning_rate": 6.907359188707688e-06, + "loss": 0.2181, + "step": 23318 + }, + { + "epoch": 1.1162913727618013, + "grad_norm": 4.481544017791748, + "learning_rate": 6.9072221460874335e-06, + "loss": 0.1829, + "step": 23319 + }, + { + "epoch": 1.1163049376017362, + "grad_norm": 4.75444221496582, + "learning_rate": 6.9070851034671795e-06, + "loss": 0.2356, + "step": 23320 + }, + { + "epoch": 1.116318502441671, + "grad_norm": 5.608670234680176, + "learning_rate": 6.906948060846924e-06, + "loss": 0.2952, + "step": 23321 + }, + { + "epoch": 1.1163320672816062, + "grad_norm": 4.938267230987549, + "learning_rate": 6.906811018226669e-06, + "loss": 0.3797, + "step": 23322 + }, + { + "epoch": 1.116345632121541, + "grad_norm": 5.323523044586182, + "learning_rate": 6.906673975606414e-06, + "loss": 0.3902, + "step": 23323 + }, + { + "epoch": 1.116359196961476, + "grad_norm": 4.206367492675781, + "learning_rate": 6.906536932986159e-06, + "loss": 0.2335, + "step": 23324 + }, + { + "epoch": 1.1163727618014108, + "grad_norm": 5.92557954788208, + "learning_rate": 6.9063998903659046e-06, + "loss": 0.2547, + "step": 23325 + }, + { + "epoch": 1.1163863266413456, + "grad_norm": 7.014686584472656, + "learning_rate": 6.90626284774565e-06, + "loss": 0.3323, + "step": 23326 + }, + { + "epoch": 1.1163998914812805, + "grad_norm": 4.570472240447998, + "learning_rate": 6.906125805125394e-06, + "loss": 0.2835, + "step": 23327 + }, + { + "epoch": 1.1164134563212154, + "grad_norm": 5.115867614746094, + "learning_rate": 6.905988762505139e-06, + "loss": 0.2549, + "step": 23328 + }, + { + "epoch": 1.1164270211611502, + "grad_norm": 7.015902519226074, + "learning_rate": 6.905851719884885e-06, + "loss": 0.3311, + "step": 23329 + }, + { + "epoch": 1.116440586001085, + "grad_norm": 4.231430530548096, + "learning_rate": 6.90571467726463e-06, + "loss": 0.2981, + "step": 23330 + }, + { + "epoch": 1.1164541508410202, + "grad_norm": 5.6788716316223145, + "learning_rate": 6.905577634644375e-06, + "loss": 0.3403, + "step": 23331 + }, + { + "epoch": 1.116467715680955, + "grad_norm": 5.539273262023926, + "learning_rate": 6.90544059202412e-06, + "loss": 0.3487, + "step": 23332 + }, + { + "epoch": 1.11648128052089, + "grad_norm": 4.761500358581543, + "learning_rate": 6.905303549403866e-06, + "loss": 0.1963, + "step": 23333 + }, + { + "epoch": 1.1164948453608248, + "grad_norm": 4.780405044555664, + "learning_rate": 6.90516650678361e-06, + "loss": 0.221, + "step": 23334 + }, + { + "epoch": 1.1165084102007596, + "grad_norm": 4.218175411224365, + "learning_rate": 6.9050294641633555e-06, + "loss": 0.2744, + "step": 23335 + }, + { + "epoch": 1.1165219750406945, + "grad_norm": 5.548678398132324, + "learning_rate": 6.9048924215431e-06, + "loss": 0.2492, + "step": 23336 + }, + { + "epoch": 1.1165355398806294, + "grad_norm": 5.483444690704346, + "learning_rate": 6.904755378922846e-06, + "loss": 0.2741, + "step": 23337 + }, + { + "epoch": 1.1165491047205642, + "grad_norm": 4.016504764556885, + "learning_rate": 6.904618336302591e-06, + "loss": 0.2451, + "step": 23338 + }, + { + "epoch": 1.116562669560499, + "grad_norm": 6.6332197189331055, + "learning_rate": 6.904481293682335e-06, + "loss": 0.2805, + "step": 23339 + }, + { + "epoch": 1.116576234400434, + "grad_norm": 5.882369518280029, + "learning_rate": 6.9043442510620805e-06, + "loss": 0.307, + "step": 23340 + }, + { + "epoch": 1.116589799240369, + "grad_norm": 5.3247880935668945, + "learning_rate": 6.904207208441826e-06, + "loss": 0.3213, + "step": 23341 + }, + { + "epoch": 1.116603364080304, + "grad_norm": 6.231940746307373, + "learning_rate": 6.904070165821572e-06, + "loss": 0.2031, + "step": 23342 + }, + { + "epoch": 1.1166169289202388, + "grad_norm": 6.238447189331055, + "learning_rate": 6.903933123201316e-06, + "loss": 0.2909, + "step": 23343 + }, + { + "epoch": 1.1166304937601736, + "grad_norm": 5.146378040313721, + "learning_rate": 6.903796080581061e-06, + "loss": 0.2354, + "step": 23344 + }, + { + "epoch": 1.1166440586001085, + "grad_norm": 4.926377773284912, + "learning_rate": 6.903659037960806e-06, + "loss": 0.243, + "step": 23345 + }, + { + "epoch": 1.1166576234400434, + "grad_norm": 4.933050155639648, + "learning_rate": 6.903521995340552e-06, + "loss": 0.2748, + "step": 23346 + }, + { + "epoch": 1.1166711882799782, + "grad_norm": 5.275811195373535, + "learning_rate": 6.903384952720297e-06, + "loss": 0.2884, + "step": 23347 + }, + { + "epoch": 1.116684753119913, + "grad_norm": 4.368595600128174, + "learning_rate": 6.903247910100042e-06, + "loss": 0.2364, + "step": 23348 + }, + { + "epoch": 1.116698317959848, + "grad_norm": 5.854372978210449, + "learning_rate": 6.903110867479786e-06, + "loss": 0.2476, + "step": 23349 + }, + { + "epoch": 1.116711882799783, + "grad_norm": 4.275749683380127, + "learning_rate": 6.902973824859532e-06, + "loss": 0.2649, + "step": 23350 + }, + { + "epoch": 1.116725447639718, + "grad_norm": 4.2409844398498535, + "learning_rate": 6.9028367822392775e-06, + "loss": 0.2259, + "step": 23351 + }, + { + "epoch": 1.1167390124796528, + "grad_norm": 4.148910045623779, + "learning_rate": 6.902699739619022e-06, + "loss": 0.2586, + "step": 23352 + }, + { + "epoch": 1.1167525773195877, + "grad_norm": 3.5962634086608887, + "learning_rate": 6.902562696998767e-06, + "loss": 0.1801, + "step": 23353 + }, + { + "epoch": 1.1167661421595225, + "grad_norm": 4.808712005615234, + "learning_rate": 6.902425654378511e-06, + "loss": 0.225, + "step": 23354 + }, + { + "epoch": 1.1167797069994574, + "grad_norm": 4.0155029296875, + "learning_rate": 6.902288611758257e-06, + "loss": 0.2414, + "step": 23355 + }, + { + "epoch": 1.1167932718393923, + "grad_norm": 5.067657470703125, + "learning_rate": 6.9021515691380026e-06, + "loss": 0.3025, + "step": 23356 + }, + { + "epoch": 1.1168068366793271, + "grad_norm": 5.107393264770508, + "learning_rate": 6.902014526517748e-06, + "loss": 0.412, + "step": 23357 + }, + { + "epoch": 1.116820401519262, + "grad_norm": 5.870604991912842, + "learning_rate": 6.901877483897492e-06, + "loss": 0.3371, + "step": 23358 + }, + { + "epoch": 1.1168339663591969, + "grad_norm": 3.3403890132904053, + "learning_rate": 6.901740441277238e-06, + "loss": 0.2414, + "step": 23359 + }, + { + "epoch": 1.116847531199132, + "grad_norm": 3.9527997970581055, + "learning_rate": 6.901603398656983e-06, + "loss": 0.2476, + "step": 23360 + }, + { + "epoch": 1.1168610960390668, + "grad_norm": 4.564688205718994, + "learning_rate": 6.901466356036728e-06, + "loss": 0.2181, + "step": 23361 + }, + { + "epoch": 1.1168746608790017, + "grad_norm": 6.823402404785156, + "learning_rate": 6.901329313416473e-06, + "loss": 0.3223, + "step": 23362 + }, + { + "epoch": 1.1168882257189365, + "grad_norm": 4.096780300140381, + "learning_rate": 6.901192270796219e-06, + "loss": 0.1859, + "step": 23363 + }, + { + "epoch": 1.1169017905588714, + "grad_norm": 3.7893528938293457, + "learning_rate": 6.901055228175963e-06, + "loss": 0.1633, + "step": 23364 + }, + { + "epoch": 1.1169153553988063, + "grad_norm": 5.768177509307861, + "learning_rate": 6.900918185555708e-06, + "loss": 0.2224, + "step": 23365 + }, + { + "epoch": 1.1169289202387411, + "grad_norm": 5.5751214027404785, + "learning_rate": 6.9007811429354535e-06, + "loss": 0.2536, + "step": 23366 + }, + { + "epoch": 1.116942485078676, + "grad_norm": 7.049501419067383, + "learning_rate": 6.900644100315198e-06, + "loss": 0.2577, + "step": 23367 + }, + { + "epoch": 1.1169560499186109, + "grad_norm": 4.775802135467529, + "learning_rate": 6.900507057694944e-06, + "loss": 0.1903, + "step": 23368 + }, + { + "epoch": 1.116969614758546, + "grad_norm": 5.953396797180176, + "learning_rate": 6.900370015074689e-06, + "loss": 0.271, + "step": 23369 + }, + { + "epoch": 1.1169831795984808, + "grad_norm": 5.134799003601074, + "learning_rate": 6.900232972454433e-06, + "loss": 0.3189, + "step": 23370 + }, + { + "epoch": 1.1169967444384157, + "grad_norm": 5.150486946105957, + "learning_rate": 6.9000959298341785e-06, + "loss": 0.2805, + "step": 23371 + }, + { + "epoch": 1.1170103092783505, + "grad_norm": 4.089902877807617, + "learning_rate": 6.8999588872139246e-06, + "loss": 0.1949, + "step": 23372 + }, + { + "epoch": 1.1170238741182854, + "grad_norm": 4.731258392333984, + "learning_rate": 6.899821844593669e-06, + "loss": 0.2717, + "step": 23373 + }, + { + "epoch": 1.1170374389582203, + "grad_norm": 3.522649049758911, + "learning_rate": 6.899684801973414e-06, + "loss": 0.1365, + "step": 23374 + }, + { + "epoch": 1.1170510037981551, + "grad_norm": 4.0654449462890625, + "learning_rate": 6.899547759353159e-06, + "loss": 0.1808, + "step": 23375 + }, + { + "epoch": 1.11706456863809, + "grad_norm": 4.898305892944336, + "learning_rate": 6.899410716732905e-06, + "loss": 0.363, + "step": 23376 + }, + { + "epoch": 1.1170781334780249, + "grad_norm": 4.8837809562683105, + "learning_rate": 6.89927367411265e-06, + "loss": 0.2013, + "step": 23377 + }, + { + "epoch": 1.1170916983179597, + "grad_norm": 4.970839023590088, + "learning_rate": 6.899136631492395e-06, + "loss": 0.2139, + "step": 23378 + }, + { + "epoch": 1.1171052631578948, + "grad_norm": 5.20986795425415, + "learning_rate": 6.898999588872139e-06, + "loss": 0.2364, + "step": 23379 + }, + { + "epoch": 1.1171188279978297, + "grad_norm": 5.463230133056641, + "learning_rate": 6.898862546251885e-06, + "loss": 0.1782, + "step": 23380 + }, + { + "epoch": 1.1171323928377646, + "grad_norm": 5.076269149780273, + "learning_rate": 6.89872550363163e-06, + "loss": 0.2584, + "step": 23381 + }, + { + "epoch": 1.1171459576776994, + "grad_norm": 4.473201751708984, + "learning_rate": 6.8985884610113755e-06, + "loss": 0.1882, + "step": 23382 + }, + { + "epoch": 1.1171595225176343, + "grad_norm": 4.833846569061279, + "learning_rate": 6.89845141839112e-06, + "loss": 0.2344, + "step": 23383 + }, + { + "epoch": 1.1171730873575692, + "grad_norm": 3.8974921703338623, + "learning_rate": 6.898314375770865e-06, + "loss": 0.1652, + "step": 23384 + }, + { + "epoch": 1.117186652197504, + "grad_norm": 5.057437896728516, + "learning_rate": 6.898177333150611e-06, + "loss": 0.2694, + "step": 23385 + }, + { + "epoch": 1.1172002170374389, + "grad_norm": 5.596482753753662, + "learning_rate": 6.898040290530355e-06, + "loss": 0.1852, + "step": 23386 + }, + { + "epoch": 1.1172137818773737, + "grad_norm": 3.721444606781006, + "learning_rate": 6.8979032479101006e-06, + "loss": 0.1691, + "step": 23387 + }, + { + "epoch": 1.1172273467173088, + "grad_norm": 6.273631572723389, + "learning_rate": 6.897766205289845e-06, + "loss": 0.2746, + "step": 23388 + }, + { + "epoch": 1.1172409115572437, + "grad_norm": 3.7969775199890137, + "learning_rate": 6.897629162669591e-06, + "loss": 0.2252, + "step": 23389 + }, + { + "epoch": 1.1172544763971786, + "grad_norm": 3.736171007156372, + "learning_rate": 6.897492120049336e-06, + "loss": 0.2445, + "step": 23390 + }, + { + "epoch": 1.1172680412371134, + "grad_norm": 4.778975009918213, + "learning_rate": 6.897355077429081e-06, + "loss": 0.2059, + "step": 23391 + }, + { + "epoch": 1.1172816060770483, + "grad_norm": 3.9269208908081055, + "learning_rate": 6.897218034808826e-06, + "loss": 0.2146, + "step": 23392 + }, + { + "epoch": 1.1172951709169832, + "grad_norm": 5.856322288513184, + "learning_rate": 6.897080992188572e-06, + "loss": 0.3379, + "step": 23393 + }, + { + "epoch": 1.117308735756918, + "grad_norm": 4.819517612457275, + "learning_rate": 6.896943949568317e-06, + "loss": 0.2578, + "step": 23394 + }, + { + "epoch": 1.117322300596853, + "grad_norm": 3.4481263160705566, + "learning_rate": 6.896806906948061e-06, + "loss": 0.18, + "step": 23395 + }, + { + "epoch": 1.1173358654367878, + "grad_norm": 4.250572204589844, + "learning_rate": 6.896669864327806e-06, + "loss": 0.2696, + "step": 23396 + }, + { + "epoch": 1.1173494302767226, + "grad_norm": 5.001544952392578, + "learning_rate": 6.8965328217075515e-06, + "loss": 0.2478, + "step": 23397 + }, + { + "epoch": 1.1173629951166577, + "grad_norm": 4.896533012390137, + "learning_rate": 6.896395779087297e-06, + "loss": 0.2828, + "step": 23398 + }, + { + "epoch": 1.1173765599565926, + "grad_norm": 5.192833423614502, + "learning_rate": 6.896258736467042e-06, + "loss": 0.309, + "step": 23399 + }, + { + "epoch": 1.1173901247965274, + "grad_norm": 5.675708770751953, + "learning_rate": 6.896121693846787e-06, + "loss": 0.2396, + "step": 23400 + }, + { + "epoch": 1.1174036896364623, + "grad_norm": 6.432681560516357, + "learning_rate": 6.895984651226531e-06, + "loss": 0.328, + "step": 23401 + }, + { + "epoch": 1.1174172544763972, + "grad_norm": 5.921975135803223, + "learning_rate": 6.895847608606277e-06, + "loss": 0.2579, + "step": 23402 + }, + { + "epoch": 1.117430819316332, + "grad_norm": 3.9862115383148193, + "learning_rate": 6.895710565986023e-06, + "loss": 0.1944, + "step": 23403 + }, + { + "epoch": 1.117444384156267, + "grad_norm": 5.466165065765381, + "learning_rate": 6.895573523365767e-06, + "loss": 0.1549, + "step": 23404 + }, + { + "epoch": 1.1174579489962018, + "grad_norm": 4.168262481689453, + "learning_rate": 6.895436480745512e-06, + "loss": 0.164, + "step": 23405 + }, + { + "epoch": 1.1174715138361366, + "grad_norm": 4.511373996734619, + "learning_rate": 6.895299438125258e-06, + "loss": 0.1827, + "step": 23406 + }, + { + "epoch": 1.1174850786760717, + "grad_norm": 5.1046223640441895, + "learning_rate": 6.895162395505003e-06, + "loss": 0.2214, + "step": 23407 + }, + { + "epoch": 1.1174986435160066, + "grad_norm": 6.081261157989502, + "learning_rate": 6.895025352884748e-06, + "loss": 0.3373, + "step": 23408 + }, + { + "epoch": 1.1175122083559414, + "grad_norm": 3.797022581100464, + "learning_rate": 6.894888310264493e-06, + "loss": 0.1845, + "step": 23409 + }, + { + "epoch": 1.1175257731958763, + "grad_norm": 4.293992519378662, + "learning_rate": 6.894751267644237e-06, + "loss": 0.2407, + "step": 23410 + }, + { + "epoch": 1.1175393380358112, + "grad_norm": 4.104934215545654, + "learning_rate": 6.894614225023983e-06, + "loss": 0.1935, + "step": 23411 + }, + { + "epoch": 1.117552902875746, + "grad_norm": 3.730722665786743, + "learning_rate": 6.894477182403728e-06, + "loss": 0.1565, + "step": 23412 + }, + { + "epoch": 1.117566467715681, + "grad_norm": 3.7782633304595947, + "learning_rate": 6.894340139783473e-06, + "loss": 0.1477, + "step": 23413 + }, + { + "epoch": 1.1175800325556158, + "grad_norm": 3.920130729675293, + "learning_rate": 6.894203097163218e-06, + "loss": 0.2185, + "step": 23414 + }, + { + "epoch": 1.1175935973955506, + "grad_norm": 5.6596784591674805, + "learning_rate": 6.894066054542964e-06, + "loss": 0.2595, + "step": 23415 + }, + { + "epoch": 1.1176071622354855, + "grad_norm": 3.443387985229492, + "learning_rate": 6.893929011922709e-06, + "loss": 0.2702, + "step": 23416 + }, + { + "epoch": 1.1176207270754206, + "grad_norm": 6.426927089691162, + "learning_rate": 6.893791969302453e-06, + "loss": 0.2803, + "step": 23417 + }, + { + "epoch": 1.1176342919153555, + "grad_norm": 4.810742378234863, + "learning_rate": 6.8936549266821986e-06, + "loss": 0.2079, + "step": 23418 + }, + { + "epoch": 1.1176478567552903, + "grad_norm": 5.000942230224609, + "learning_rate": 6.893517884061945e-06, + "loss": 0.3118, + "step": 23419 + }, + { + "epoch": 1.1176614215952252, + "grad_norm": 5.029358386993408, + "learning_rate": 6.893380841441689e-06, + "loss": 0.3306, + "step": 23420 + }, + { + "epoch": 1.11767498643516, + "grad_norm": 5.194135665893555, + "learning_rate": 6.893243798821434e-06, + "loss": 0.3852, + "step": 23421 + }, + { + "epoch": 1.117688551275095, + "grad_norm": 5.167013645172119, + "learning_rate": 6.893106756201179e-06, + "loss": 0.2824, + "step": 23422 + }, + { + "epoch": 1.1177021161150298, + "grad_norm": 4.218753814697266, + "learning_rate": 6.892969713580924e-06, + "loss": 0.2279, + "step": 23423 + }, + { + "epoch": 1.1177156809549647, + "grad_norm": 4.5768656730651855, + "learning_rate": 6.89283267096067e-06, + "loss": 0.1696, + "step": 23424 + }, + { + "epoch": 1.1177292457948995, + "grad_norm": 4.98561429977417, + "learning_rate": 6.892695628340415e-06, + "loss": 0.4527, + "step": 23425 + }, + { + "epoch": 1.1177428106348346, + "grad_norm": 5.888601779937744, + "learning_rate": 6.892558585720159e-06, + "loss": 0.2856, + "step": 23426 + }, + { + "epoch": 1.1177563754747695, + "grad_norm": 4.296432971954346, + "learning_rate": 6.892421543099904e-06, + "loss": 0.3318, + "step": 23427 + }, + { + "epoch": 1.1177699403147043, + "grad_norm": 5.18203067779541, + "learning_rate": 6.89228450047965e-06, + "loss": 0.3587, + "step": 23428 + }, + { + "epoch": 1.1177835051546392, + "grad_norm": 3.9211084842681885, + "learning_rate": 6.892147457859395e-06, + "loss": 0.2823, + "step": 23429 + }, + { + "epoch": 1.117797069994574, + "grad_norm": 5.9391608238220215, + "learning_rate": 6.89201041523914e-06, + "loss": 0.3855, + "step": 23430 + }, + { + "epoch": 1.117810634834509, + "grad_norm": 3.7154171466827393, + "learning_rate": 6.891873372618885e-06, + "loss": 0.1978, + "step": 23431 + }, + { + "epoch": 1.1178241996744438, + "grad_norm": 4.413919448852539, + "learning_rate": 6.89173632999863e-06, + "loss": 0.2193, + "step": 23432 + }, + { + "epoch": 1.1178377645143787, + "grad_norm": 4.565439224243164, + "learning_rate": 6.891599287378375e-06, + "loss": 0.2273, + "step": 23433 + }, + { + "epoch": 1.1178513293543135, + "grad_norm": 4.923668384552002, + "learning_rate": 6.891462244758121e-06, + "loss": 0.2993, + "step": 23434 + }, + { + "epoch": 1.1178648941942484, + "grad_norm": 5.080567359924316, + "learning_rate": 6.891325202137865e-06, + "loss": 0.238, + "step": 23435 + }, + { + "epoch": 1.1178784590341835, + "grad_norm": 6.382706165313721, + "learning_rate": 6.89118815951761e-06, + "loss": 0.3832, + "step": 23436 + }, + { + "epoch": 1.1178920238741183, + "grad_norm": 5.002640247344971, + "learning_rate": 6.891051116897356e-06, + "loss": 0.2948, + "step": 23437 + }, + { + "epoch": 1.1179055887140532, + "grad_norm": 4.424151420593262, + "learning_rate": 6.8909140742771005e-06, + "loss": 0.1896, + "step": 23438 + }, + { + "epoch": 1.117919153553988, + "grad_norm": 7.0541839599609375, + "learning_rate": 6.890777031656846e-06, + "loss": 0.252, + "step": 23439 + }, + { + "epoch": 1.117932718393923, + "grad_norm": 4.551509380340576, + "learning_rate": 6.890639989036591e-06, + "loss": 0.2639, + "step": 23440 + }, + { + "epoch": 1.1179462832338578, + "grad_norm": 5.913083553314209, + "learning_rate": 6.890502946416337e-06, + "loss": 0.4053, + "step": 23441 + }, + { + "epoch": 1.1179598480737927, + "grad_norm": 6.997716426849365, + "learning_rate": 6.890365903796081e-06, + "loss": 0.4081, + "step": 23442 + }, + { + "epoch": 1.1179734129137275, + "grad_norm": 6.941787242889404, + "learning_rate": 6.890228861175826e-06, + "loss": 0.3278, + "step": 23443 + }, + { + "epoch": 1.1179869777536624, + "grad_norm": 5.074344158172607, + "learning_rate": 6.890091818555571e-06, + "loss": 0.2935, + "step": 23444 + }, + { + "epoch": 1.1180005425935975, + "grad_norm": 5.663430213928223, + "learning_rate": 6.889954775935317e-06, + "loss": 0.2832, + "step": 23445 + }, + { + "epoch": 1.1180141074335324, + "grad_norm": 4.332310676574707, + "learning_rate": 6.889817733315062e-06, + "loss": 0.2386, + "step": 23446 + }, + { + "epoch": 1.1180276722734672, + "grad_norm": 4.495638370513916, + "learning_rate": 6.889680690694806e-06, + "loss": 0.2171, + "step": 23447 + }, + { + "epoch": 1.118041237113402, + "grad_norm": 6.8799238204956055, + "learning_rate": 6.889543648074551e-06, + "loss": 0.38, + "step": 23448 + }, + { + "epoch": 1.118054801953337, + "grad_norm": 5.829887866973877, + "learning_rate": 6.889406605454297e-06, + "loss": 0.3495, + "step": 23449 + }, + { + "epoch": 1.1180683667932718, + "grad_norm": 4.526784420013428, + "learning_rate": 6.889269562834043e-06, + "loss": 0.2467, + "step": 23450 + }, + { + "epoch": 1.1180819316332067, + "grad_norm": 4.0284271240234375, + "learning_rate": 6.889132520213787e-06, + "loss": 0.2617, + "step": 23451 + }, + { + "epoch": 1.1180954964731415, + "grad_norm": 5.055687427520752, + "learning_rate": 6.888995477593532e-06, + "loss": 0.2621, + "step": 23452 + }, + { + "epoch": 1.1181090613130764, + "grad_norm": 6.496276378631592, + "learning_rate": 6.8888584349732764e-06, + "loss": 0.2416, + "step": 23453 + }, + { + "epoch": 1.1181226261530113, + "grad_norm": 5.95018196105957, + "learning_rate": 6.8887213923530225e-06, + "loss": 0.2557, + "step": 23454 + }, + { + "epoch": 1.1181361909929464, + "grad_norm": 6.60549783706665, + "learning_rate": 6.888584349732768e-06, + "loss": 0.3147, + "step": 23455 + }, + { + "epoch": 1.1181497558328812, + "grad_norm": 7.484637260437012, + "learning_rate": 6.888447307112513e-06, + "loss": 0.5447, + "step": 23456 + }, + { + "epoch": 1.118163320672816, + "grad_norm": 5.7696404457092285, + "learning_rate": 6.888310264492257e-06, + "loss": 0.2825, + "step": 23457 + }, + { + "epoch": 1.118176885512751, + "grad_norm": 5.134194850921631, + "learning_rate": 6.888173221872003e-06, + "loss": 0.2666, + "step": 23458 + }, + { + "epoch": 1.1181904503526858, + "grad_norm": 5.299474716186523, + "learning_rate": 6.888036179251748e-06, + "loss": 0.2493, + "step": 23459 + }, + { + "epoch": 1.1182040151926207, + "grad_norm": 5.840377330780029, + "learning_rate": 6.887899136631493e-06, + "loss": 0.2697, + "step": 23460 + }, + { + "epoch": 1.1182175800325556, + "grad_norm": 3.8217554092407227, + "learning_rate": 6.887762094011238e-06, + "loss": 0.2143, + "step": 23461 + }, + { + "epoch": 1.1182311448724904, + "grad_norm": 7.089626789093018, + "learning_rate": 6.887625051390984e-06, + "loss": 0.3639, + "step": 23462 + }, + { + "epoch": 1.1182447097124255, + "grad_norm": 4.507640361785889, + "learning_rate": 6.887488008770728e-06, + "loss": 0.2711, + "step": 23463 + }, + { + "epoch": 1.1182582745523604, + "grad_norm": 8.094962120056152, + "learning_rate": 6.887350966150473e-06, + "loss": 0.4871, + "step": 23464 + }, + { + "epoch": 1.1182718393922952, + "grad_norm": 5.016848087310791, + "learning_rate": 6.887213923530219e-06, + "loss": 0.2407, + "step": 23465 + }, + { + "epoch": 1.11828540423223, + "grad_norm": 7.5911760330200195, + "learning_rate": 6.887076880909963e-06, + "loss": 0.3972, + "step": 23466 + }, + { + "epoch": 1.118298969072165, + "grad_norm": 7.2583699226379395, + "learning_rate": 6.886939838289709e-06, + "loss": 0.3725, + "step": 23467 + }, + { + "epoch": 1.1183125339120998, + "grad_norm": 4.407951831817627, + "learning_rate": 6.886802795669454e-06, + "loss": 0.2521, + "step": 23468 + }, + { + "epoch": 1.1183260987520347, + "grad_norm": 6.385747909545898, + "learning_rate": 6.8866657530491985e-06, + "loss": 0.2384, + "step": 23469 + }, + { + "epoch": 1.1183396635919696, + "grad_norm": 6.176176071166992, + "learning_rate": 6.886528710428944e-06, + "loss": 0.391, + "step": 23470 + }, + { + "epoch": 1.1183532284319044, + "grad_norm": 5.096746444702148, + "learning_rate": 6.88639166780869e-06, + "loss": 0.2539, + "step": 23471 + }, + { + "epoch": 1.1183667932718393, + "grad_norm": 5.642829895019531, + "learning_rate": 6.886254625188434e-06, + "loss": 0.2682, + "step": 23472 + }, + { + "epoch": 1.1183803581117742, + "grad_norm": 5.031399726867676, + "learning_rate": 6.886117582568179e-06, + "loss": 0.171, + "step": 23473 + }, + { + "epoch": 1.1183939229517093, + "grad_norm": 4.437943458557129, + "learning_rate": 6.885980539947924e-06, + "loss": 0.2551, + "step": 23474 + }, + { + "epoch": 1.1184074877916441, + "grad_norm": 4.453542709350586, + "learning_rate": 6.88584349732767e-06, + "loss": 0.2655, + "step": 23475 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 6.294894695281982, + "learning_rate": 6.885706454707415e-06, + "loss": 0.2301, + "step": 23476 + }, + { + "epoch": 1.1184346174715138, + "grad_norm": 4.953829765319824, + "learning_rate": 6.88556941208716e-06, + "loss": 0.24, + "step": 23477 + }, + { + "epoch": 1.1184481823114487, + "grad_norm": 6.602060794830322, + "learning_rate": 6.885432369466904e-06, + "loss": 0.28, + "step": 23478 + }, + { + "epoch": 1.1184617471513836, + "grad_norm": 5.240103244781494, + "learning_rate": 6.885295326846649e-06, + "loss": 0.3529, + "step": 23479 + }, + { + "epoch": 1.1184753119913184, + "grad_norm": 4.233413219451904, + "learning_rate": 6.8851582842263954e-06, + "loss": 0.2397, + "step": 23480 + }, + { + "epoch": 1.1184888768312533, + "grad_norm": 5.521456241607666, + "learning_rate": 6.88502124160614e-06, + "loss": 0.2896, + "step": 23481 + }, + { + "epoch": 1.1185024416711884, + "grad_norm": 6.228299617767334, + "learning_rate": 6.884884198985885e-06, + "loss": 0.2945, + "step": 23482 + }, + { + "epoch": 1.1185160065111233, + "grad_norm": 3.7764687538146973, + "learning_rate": 6.88474715636563e-06, + "loss": 0.2066, + "step": 23483 + }, + { + "epoch": 1.1185295713510581, + "grad_norm": 6.377622127532959, + "learning_rate": 6.884610113745376e-06, + "loss": 0.3302, + "step": 23484 + }, + { + "epoch": 1.118543136190993, + "grad_norm": 5.328287124633789, + "learning_rate": 6.8844730711251205e-06, + "loss": 0.3644, + "step": 23485 + }, + { + "epoch": 1.1185567010309279, + "grad_norm": 4.428954601287842, + "learning_rate": 6.884336028504866e-06, + "loss": 0.3239, + "step": 23486 + }, + { + "epoch": 1.1185702658708627, + "grad_norm": 4.699563980102539, + "learning_rate": 6.88419898588461e-06, + "loss": 0.3123, + "step": 23487 + }, + { + "epoch": 1.1185838307107976, + "grad_norm": 3.812516450881958, + "learning_rate": 6.884061943264356e-06, + "loss": 0.2169, + "step": 23488 + }, + { + "epoch": 1.1185973955507325, + "grad_norm": 5.903986930847168, + "learning_rate": 6.883924900644101e-06, + "loss": 0.3119, + "step": 23489 + }, + { + "epoch": 1.1186109603906673, + "grad_norm": 6.031670570373535, + "learning_rate": 6.883787858023846e-06, + "loss": 0.2865, + "step": 23490 + }, + { + "epoch": 1.1186245252306022, + "grad_norm": 4.179952144622803, + "learning_rate": 6.883650815403591e-06, + "loss": 0.2494, + "step": 23491 + }, + { + "epoch": 1.118638090070537, + "grad_norm": 4.41813850402832, + "learning_rate": 6.883513772783336e-06, + "loss": 0.369, + "step": 23492 + }, + { + "epoch": 1.1186516549104721, + "grad_norm": 5.309790134429932, + "learning_rate": 6.883376730163082e-06, + "loss": 0.2076, + "step": 23493 + }, + { + "epoch": 1.118665219750407, + "grad_norm": 5.48852014541626, + "learning_rate": 6.883239687542826e-06, + "loss": 0.3665, + "step": 23494 + }, + { + "epoch": 1.1186787845903419, + "grad_norm": 4.895782947540283, + "learning_rate": 6.883102644922571e-06, + "loss": 0.2887, + "step": 23495 + }, + { + "epoch": 1.1186923494302767, + "grad_norm": 4.275325775146484, + "learning_rate": 6.882965602302316e-06, + "loss": 0.1825, + "step": 23496 + }, + { + "epoch": 1.1187059142702116, + "grad_norm": 5.941384315490723, + "learning_rate": 6.882828559682062e-06, + "loss": 0.4027, + "step": 23497 + }, + { + "epoch": 1.1187194791101465, + "grad_norm": 5.723450660705566, + "learning_rate": 6.882691517061807e-06, + "loss": 0.2641, + "step": 23498 + }, + { + "epoch": 1.1187330439500813, + "grad_norm": 3.397010326385498, + "learning_rate": 6.882554474441552e-06, + "loss": 0.1017, + "step": 23499 + }, + { + "epoch": 1.1187466087900162, + "grad_norm": 4.37986421585083, + "learning_rate": 6.8824174318212965e-06, + "loss": 0.2478, + "step": 23500 + }, + { + "epoch": 1.1187601736299513, + "grad_norm": 4.803161144256592, + "learning_rate": 6.8822803892010425e-06, + "loss": 0.1911, + "step": 23501 + }, + { + "epoch": 1.1187737384698861, + "grad_norm": 6.540513038635254, + "learning_rate": 6.882143346580788e-06, + "loss": 0.3283, + "step": 23502 + }, + { + "epoch": 1.118787303309821, + "grad_norm": 4.915061950683594, + "learning_rate": 6.882006303960532e-06, + "loss": 0.3124, + "step": 23503 + }, + { + "epoch": 1.1188008681497559, + "grad_norm": 4.320272445678711, + "learning_rate": 6.881869261340277e-06, + "loss": 0.2415, + "step": 23504 + }, + { + "epoch": 1.1188144329896907, + "grad_norm": 7.091968059539795, + "learning_rate": 6.881732218720022e-06, + "loss": 0.5323, + "step": 23505 + }, + { + "epoch": 1.1188279978296256, + "grad_norm": 5.462198734283447, + "learning_rate": 6.8815951760997675e-06, + "loss": 0.2727, + "step": 23506 + }, + { + "epoch": 1.1188415626695605, + "grad_norm": 4.1905975341796875, + "learning_rate": 6.881458133479513e-06, + "loss": 0.2474, + "step": 23507 + }, + { + "epoch": 1.1188551275094953, + "grad_norm": 6.362755298614502, + "learning_rate": 6.881321090859258e-06, + "loss": 0.3271, + "step": 23508 + }, + { + "epoch": 1.1188686923494302, + "grad_norm": 5.70510196685791, + "learning_rate": 6.881184048239002e-06, + "loss": 0.3801, + "step": 23509 + }, + { + "epoch": 1.118882257189365, + "grad_norm": 6.279869079589844, + "learning_rate": 6.881047005618748e-06, + "loss": 0.2565, + "step": 23510 + }, + { + "epoch": 1.1188958220293002, + "grad_norm": 5.786795139312744, + "learning_rate": 6.8809099629984934e-06, + "loss": 0.3392, + "step": 23511 + }, + { + "epoch": 1.118909386869235, + "grad_norm": 5.739893913269043, + "learning_rate": 6.880772920378238e-06, + "loss": 0.424, + "step": 23512 + }, + { + "epoch": 1.1189229517091699, + "grad_norm": 7.970986366271973, + "learning_rate": 6.880635877757983e-06, + "loss": 0.4378, + "step": 23513 + }, + { + "epoch": 1.1189365165491048, + "grad_norm": 6.258917808532715, + "learning_rate": 6.880498835137729e-06, + "loss": 0.382, + "step": 23514 + }, + { + "epoch": 1.1189500813890396, + "grad_norm": 6.825413227081299, + "learning_rate": 6.880361792517473e-06, + "loss": 0.3344, + "step": 23515 + }, + { + "epoch": 1.1189636462289745, + "grad_norm": 6.623174667358398, + "learning_rate": 6.8802247498972185e-06, + "loss": 0.3028, + "step": 23516 + }, + { + "epoch": 1.1189772110689093, + "grad_norm": 5.264596939086914, + "learning_rate": 6.880087707276964e-06, + "loss": 0.3432, + "step": 23517 + }, + { + "epoch": 1.1189907759088442, + "grad_norm": 7.259173393249512, + "learning_rate": 6.879950664656708e-06, + "loss": 0.4963, + "step": 23518 + }, + { + "epoch": 1.119004340748779, + "grad_norm": 6.517587661743164, + "learning_rate": 6.879813622036454e-06, + "loss": 0.4146, + "step": 23519 + }, + { + "epoch": 1.1190179055887142, + "grad_norm": 6.406238079071045, + "learning_rate": 6.879676579416199e-06, + "loss": 0.3107, + "step": 23520 + }, + { + "epoch": 1.119031470428649, + "grad_norm": 8.118877410888672, + "learning_rate": 6.8795395367959435e-06, + "loss": 0.4912, + "step": 23521 + }, + { + "epoch": 1.119045035268584, + "grad_norm": 7.415773391723633, + "learning_rate": 6.879402494175689e-06, + "loss": 0.5338, + "step": 23522 + }, + { + "epoch": 1.1190586001085188, + "grad_norm": 3.4061882495880127, + "learning_rate": 6.879265451555435e-06, + "loss": 0.1497, + "step": 23523 + }, + { + "epoch": 1.1190721649484536, + "grad_norm": 5.189601898193359, + "learning_rate": 6.87912840893518e-06, + "loss": 0.2473, + "step": 23524 + }, + { + "epoch": 1.1190857297883885, + "grad_norm": 3.855699300765991, + "learning_rate": 6.878991366314924e-06, + "loss": 0.2136, + "step": 23525 + }, + { + "epoch": 1.1190992946283234, + "grad_norm": 5.869174957275391, + "learning_rate": 6.878854323694669e-06, + "loss": 0.2647, + "step": 23526 + }, + { + "epoch": 1.1191128594682582, + "grad_norm": 6.261341094970703, + "learning_rate": 6.8787172810744154e-06, + "loss": 0.2718, + "step": 23527 + }, + { + "epoch": 1.119126424308193, + "grad_norm": 5.286736488342285, + "learning_rate": 6.87858023845416e-06, + "loss": 0.2701, + "step": 23528 + }, + { + "epoch": 1.119139989148128, + "grad_norm": 5.40152645111084, + "learning_rate": 6.878443195833905e-06, + "loss": 0.4023, + "step": 23529 + }, + { + "epoch": 1.119153553988063, + "grad_norm": 6.2417120933532715, + "learning_rate": 6.878306153213649e-06, + "loss": 0.3474, + "step": 23530 + }, + { + "epoch": 1.119167118827998, + "grad_norm": 4.001087665557861, + "learning_rate": 6.878169110593395e-06, + "loss": 0.2282, + "step": 23531 + }, + { + "epoch": 1.1191806836679328, + "grad_norm": 4.687391757965088, + "learning_rate": 6.8780320679731405e-06, + "loss": 0.1442, + "step": 23532 + }, + { + "epoch": 1.1191942485078676, + "grad_norm": 6.565377712249756, + "learning_rate": 6.877895025352886e-06, + "loss": 0.3773, + "step": 23533 + }, + { + "epoch": 1.1192078133478025, + "grad_norm": 6.394440650939941, + "learning_rate": 6.87775798273263e-06, + "loss": 0.3543, + "step": 23534 + }, + { + "epoch": 1.1192213781877374, + "grad_norm": 4.382815361022949, + "learning_rate": 6.877620940112375e-06, + "loss": 0.1436, + "step": 23535 + }, + { + "epoch": 1.1192349430276722, + "grad_norm": 4.546431064605713, + "learning_rate": 6.877483897492121e-06, + "loss": 0.2005, + "step": 23536 + }, + { + "epoch": 1.119248507867607, + "grad_norm": 7.115645885467529, + "learning_rate": 6.8773468548718655e-06, + "loss": 0.2008, + "step": 23537 + }, + { + "epoch": 1.119262072707542, + "grad_norm": 4.577189922332764, + "learning_rate": 6.877209812251611e-06, + "loss": 0.2563, + "step": 23538 + }, + { + "epoch": 1.119275637547477, + "grad_norm": 4.448916912078857, + "learning_rate": 6.877072769631356e-06, + "loss": 0.1768, + "step": 23539 + }, + { + "epoch": 1.119289202387412, + "grad_norm": 4.91115665435791, + "learning_rate": 6.876935727011101e-06, + "loss": 0.2743, + "step": 23540 + }, + { + "epoch": 1.1193027672273468, + "grad_norm": 5.8353071212768555, + "learning_rate": 6.876798684390846e-06, + "loss": 0.2877, + "step": 23541 + }, + { + "epoch": 1.1193163320672816, + "grad_norm": 4.9369635581970215, + "learning_rate": 6.8766616417705914e-06, + "loss": 0.2545, + "step": 23542 + }, + { + "epoch": 1.1193298969072165, + "grad_norm": 5.64812707901001, + "learning_rate": 6.876524599150336e-06, + "loss": 0.2425, + "step": 23543 + }, + { + "epoch": 1.1193434617471514, + "grad_norm": 5.052187919616699, + "learning_rate": 6.876387556530082e-06, + "loss": 0.2952, + "step": 23544 + }, + { + "epoch": 1.1193570265870862, + "grad_norm": 4.964157581329346, + "learning_rate": 6.876250513909827e-06, + "loss": 0.2792, + "step": 23545 + }, + { + "epoch": 1.1193705914270211, + "grad_norm": 3.758655309677124, + "learning_rate": 6.876113471289571e-06, + "loss": 0.1908, + "step": 23546 + }, + { + "epoch": 1.119384156266956, + "grad_norm": 5.936913013458252, + "learning_rate": 6.8759764286693165e-06, + "loss": 0.3027, + "step": 23547 + }, + { + "epoch": 1.1193977211068908, + "grad_norm": 6.724524021148682, + "learning_rate": 6.875839386049062e-06, + "loss": 0.3568, + "step": 23548 + }, + { + "epoch": 1.119411285946826, + "grad_norm": 6.7164082527160645, + "learning_rate": 6.875702343428808e-06, + "loss": 0.3094, + "step": 23549 + }, + { + "epoch": 1.1194248507867608, + "grad_norm": 6.716912269592285, + "learning_rate": 6.875565300808552e-06, + "loss": 0.4298, + "step": 23550 + }, + { + "epoch": 1.1194384156266957, + "grad_norm": 5.9697489738464355, + "learning_rate": 6.875428258188297e-06, + "loss": 0.3078, + "step": 23551 + }, + { + "epoch": 1.1194519804666305, + "grad_norm": 6.989843845367432, + "learning_rate": 6.8752912155680415e-06, + "loss": 0.4166, + "step": 23552 + }, + { + "epoch": 1.1194655453065654, + "grad_norm": 7.60758638381958, + "learning_rate": 6.8751541729477876e-06, + "loss": 0.3085, + "step": 23553 + }, + { + "epoch": 1.1194791101465003, + "grad_norm": 5.501676559448242, + "learning_rate": 6.875017130327533e-06, + "loss": 0.3225, + "step": 23554 + }, + { + "epoch": 1.1194926749864351, + "grad_norm": 6.334552764892578, + "learning_rate": 6.874880087707277e-06, + "loss": 0.3386, + "step": 23555 + }, + { + "epoch": 1.11950623982637, + "grad_norm": 5.64328670501709, + "learning_rate": 6.874743045087022e-06, + "loss": 0.3415, + "step": 23556 + }, + { + "epoch": 1.1195198046663049, + "grad_norm": 4.98033332824707, + "learning_rate": 6.874606002466768e-06, + "loss": 0.2676, + "step": 23557 + }, + { + "epoch": 1.11953336950624, + "grad_norm": 6.699828624725342, + "learning_rate": 6.8744689598465134e-06, + "loss": 0.2178, + "step": 23558 + }, + { + "epoch": 1.1195469343461748, + "grad_norm": 5.452314376831055, + "learning_rate": 6.874331917226258e-06, + "loss": 0.3297, + "step": 23559 + }, + { + "epoch": 1.1195604991861097, + "grad_norm": 6.317883014678955, + "learning_rate": 6.874194874606003e-06, + "loss": 0.2546, + "step": 23560 + }, + { + "epoch": 1.1195740640260445, + "grad_norm": 6.266432285308838, + "learning_rate": 6.874057831985747e-06, + "loss": 0.2573, + "step": 23561 + }, + { + "epoch": 1.1195876288659794, + "grad_norm": 5.21162223815918, + "learning_rate": 6.873920789365493e-06, + "loss": 0.2976, + "step": 23562 + }, + { + "epoch": 1.1196011937059143, + "grad_norm": 7.01287841796875, + "learning_rate": 6.8737837467452385e-06, + "loss": 0.3396, + "step": 23563 + }, + { + "epoch": 1.1196147585458491, + "grad_norm": 5.52204704284668, + "learning_rate": 6.873646704124983e-06, + "loss": 0.2048, + "step": 23564 + }, + { + "epoch": 1.119628323385784, + "grad_norm": 5.121930122375488, + "learning_rate": 6.873509661504728e-06, + "loss": 0.2168, + "step": 23565 + }, + { + "epoch": 1.1196418882257189, + "grad_norm": 5.6363205909729, + "learning_rate": 6.873372618884474e-06, + "loss": 0.3318, + "step": 23566 + }, + { + "epoch": 1.1196554530656537, + "grad_norm": 4.519961833953857, + "learning_rate": 6.873235576264219e-06, + "loss": 0.2503, + "step": 23567 + }, + { + "epoch": 1.1196690179055888, + "grad_norm": 6.928444862365723, + "learning_rate": 6.8730985336439635e-06, + "loss": 0.209, + "step": 23568 + }, + { + "epoch": 1.1196825827455237, + "grad_norm": 5.300983428955078, + "learning_rate": 6.872961491023709e-06, + "loss": 0.2432, + "step": 23569 + }, + { + "epoch": 1.1196961475854585, + "grad_norm": 7.989966869354248, + "learning_rate": 6.872824448403455e-06, + "loss": 0.4118, + "step": 23570 + }, + { + "epoch": 1.1197097124253934, + "grad_norm": 7.773796081542969, + "learning_rate": 6.872687405783199e-06, + "loss": 0.3665, + "step": 23571 + }, + { + "epoch": 1.1197232772653283, + "grad_norm": 7.5246968269348145, + "learning_rate": 6.872550363162944e-06, + "loss": 0.3666, + "step": 23572 + }, + { + "epoch": 1.1197368421052631, + "grad_norm": 7.205646991729736, + "learning_rate": 6.8724133205426894e-06, + "loss": 0.45, + "step": 23573 + }, + { + "epoch": 1.119750406945198, + "grad_norm": 7.604889869689941, + "learning_rate": 6.872276277922434e-06, + "loss": 0.4304, + "step": 23574 + }, + { + "epoch": 1.1197639717851329, + "grad_norm": 6.100979804992676, + "learning_rate": 6.87213923530218e-06, + "loss": 0.3275, + "step": 23575 + }, + { + "epoch": 1.1197775366250677, + "grad_norm": 7.013568878173828, + "learning_rate": 6.872002192681925e-06, + "loss": 0.3031, + "step": 23576 + }, + { + "epoch": 1.1197911014650028, + "grad_norm": 7.2137370109558105, + "learning_rate": 6.871865150061669e-06, + "loss": 0.3753, + "step": 23577 + }, + { + "epoch": 1.1198046663049377, + "grad_norm": 5.777961730957031, + "learning_rate": 6.8717281074414145e-06, + "loss": 0.2141, + "step": 23578 + }, + { + "epoch": 1.1198182311448726, + "grad_norm": 4.601924896240234, + "learning_rate": 6.8715910648211605e-06, + "loss": 0.332, + "step": 23579 + }, + { + "epoch": 1.1198317959848074, + "grad_norm": 4.009213447570801, + "learning_rate": 6.871454022200905e-06, + "loss": 0.1724, + "step": 23580 + }, + { + "epoch": 1.1198453608247423, + "grad_norm": 6.425876617431641, + "learning_rate": 6.87131697958065e-06, + "loss": 0.2878, + "step": 23581 + }, + { + "epoch": 1.1198589256646772, + "grad_norm": 6.474836349487305, + "learning_rate": 6.871179936960395e-06, + "loss": 0.2589, + "step": 23582 + }, + { + "epoch": 1.119872490504612, + "grad_norm": 5.868585109710693, + "learning_rate": 6.871042894340141e-06, + "loss": 0.4036, + "step": 23583 + }, + { + "epoch": 1.1198860553445469, + "grad_norm": 4.917370796203613, + "learning_rate": 6.8709058517198856e-06, + "loss": 0.2328, + "step": 23584 + }, + { + "epoch": 1.1198996201844817, + "grad_norm": 4.364004611968994, + "learning_rate": 6.870768809099631e-06, + "loss": 0.2555, + "step": 23585 + }, + { + "epoch": 1.1199131850244166, + "grad_norm": 5.810024261474609, + "learning_rate": 6.870631766479375e-06, + "loss": 0.2432, + "step": 23586 + }, + { + "epoch": 1.1199267498643517, + "grad_norm": 5.024466514587402, + "learning_rate": 6.87049472385912e-06, + "loss": 0.2651, + "step": 23587 + }, + { + "epoch": 1.1199403147042866, + "grad_norm": 4.386375427246094, + "learning_rate": 6.870357681238866e-06, + "loss": 0.2122, + "step": 23588 + }, + { + "epoch": 1.1199538795442214, + "grad_norm": 7.127308368682861, + "learning_rate": 6.870220638618611e-06, + "loss": 0.3565, + "step": 23589 + }, + { + "epoch": 1.1199674443841563, + "grad_norm": 4.57370662689209, + "learning_rate": 6.870083595998356e-06, + "loss": 0.2648, + "step": 23590 + }, + { + "epoch": 1.1199810092240912, + "grad_norm": 6.698800563812256, + "learning_rate": 6.869946553378101e-06, + "loss": 0.2678, + "step": 23591 + }, + { + "epoch": 1.119994574064026, + "grad_norm": 7.401066780090332, + "learning_rate": 6.869809510757847e-06, + "loss": 0.2787, + "step": 23592 + }, + { + "epoch": 1.120008138903961, + "grad_norm": 5.269652366638184, + "learning_rate": 6.869672468137591e-06, + "loss": 0.1794, + "step": 23593 + }, + { + "epoch": 1.1200217037438958, + "grad_norm": 4.975403308868408, + "learning_rate": 6.8695354255173365e-06, + "loss": 0.2921, + "step": 23594 + }, + { + "epoch": 1.1200352685838306, + "grad_norm": 6.773555278778076, + "learning_rate": 6.869398382897081e-06, + "loss": 0.2811, + "step": 23595 + }, + { + "epoch": 1.1200488334237657, + "grad_norm": 7.370513916015625, + "learning_rate": 6.869261340276827e-06, + "loss": 0.4697, + "step": 23596 + }, + { + "epoch": 1.1200623982637006, + "grad_norm": 6.260939121246338, + "learning_rate": 6.869124297656572e-06, + "loss": 0.4092, + "step": 23597 + }, + { + "epoch": 1.1200759631036354, + "grad_norm": 5.115663051605225, + "learning_rate": 6.868987255036317e-06, + "loss": 0.2523, + "step": 23598 + }, + { + "epoch": 1.1200895279435703, + "grad_norm": 6.673795700073242, + "learning_rate": 6.8688502124160615e-06, + "loss": 0.4351, + "step": 23599 + }, + { + "epoch": 1.1201030927835052, + "grad_norm": 6.027805328369141, + "learning_rate": 6.8687131697958076e-06, + "loss": 0.2985, + "step": 23600 + }, + { + "epoch": 1.12011665762344, + "grad_norm": 5.511733055114746, + "learning_rate": 6.868576127175553e-06, + "loss": 0.3091, + "step": 23601 + }, + { + "epoch": 1.120130222463375, + "grad_norm": 5.6584153175354, + "learning_rate": 6.868439084555297e-06, + "loss": 0.3049, + "step": 23602 + }, + { + "epoch": 1.1201437873033098, + "grad_norm": 6.361817836761475, + "learning_rate": 6.868302041935042e-06, + "loss": 0.4244, + "step": 23603 + }, + { + "epoch": 1.1201573521432446, + "grad_norm": 5.267536640167236, + "learning_rate": 6.868164999314787e-06, + "loss": 0.309, + "step": 23604 + }, + { + "epoch": 1.1201709169831795, + "grad_norm": 5.695298194885254, + "learning_rate": 6.868027956694533e-06, + "loss": 0.3616, + "step": 23605 + }, + { + "epoch": 1.1201844818231146, + "grad_norm": 3.882560968399048, + "learning_rate": 6.867890914074278e-06, + "loss": 0.2291, + "step": 23606 + }, + { + "epoch": 1.1201980466630495, + "grad_norm": 5.342747688293457, + "learning_rate": 6.867753871454023e-06, + "loss": 0.3283, + "step": 23607 + }, + { + "epoch": 1.1202116115029843, + "grad_norm": 4.070202350616455, + "learning_rate": 6.867616828833767e-06, + "loss": 0.2219, + "step": 23608 + }, + { + "epoch": 1.1202251763429192, + "grad_norm": 6.699118614196777, + "learning_rate": 6.867479786213513e-06, + "loss": 0.3381, + "step": 23609 + }, + { + "epoch": 1.120238741182854, + "grad_norm": 6.316876411437988, + "learning_rate": 6.8673427435932585e-06, + "loss": 0.446, + "step": 23610 + }, + { + "epoch": 1.120252306022789, + "grad_norm": 6.882813453674316, + "learning_rate": 6.867205700973003e-06, + "loss": 0.3645, + "step": 23611 + }, + { + "epoch": 1.1202658708627238, + "grad_norm": 3.9884960651397705, + "learning_rate": 6.867068658352748e-06, + "loss": 0.261, + "step": 23612 + }, + { + "epoch": 1.1202794357026586, + "grad_norm": 4.819033145904541, + "learning_rate": 6.866931615732494e-06, + "loss": 0.2824, + "step": 23613 + }, + { + "epoch": 1.1202930005425935, + "grad_norm": 5.48750114440918, + "learning_rate": 6.866794573112238e-06, + "loss": 0.3855, + "step": 23614 + }, + { + "epoch": 1.1203065653825286, + "grad_norm": 6.16082239151001, + "learning_rate": 6.8666575304919836e-06, + "loss": 0.295, + "step": 23615 + }, + { + "epoch": 1.1203201302224635, + "grad_norm": 6.025381565093994, + "learning_rate": 6.866520487871729e-06, + "loss": 0.3157, + "step": 23616 + }, + { + "epoch": 1.1203336950623983, + "grad_norm": 4.118963718414307, + "learning_rate": 6.866383445251473e-06, + "loss": 0.2051, + "step": 23617 + }, + { + "epoch": 1.1203472599023332, + "grad_norm": 5.395066738128662, + "learning_rate": 6.866246402631219e-06, + "loss": 0.1842, + "step": 23618 + }, + { + "epoch": 1.120360824742268, + "grad_norm": 5.665316581726074, + "learning_rate": 6.866109360010964e-06, + "loss": 0.3167, + "step": 23619 + }, + { + "epoch": 1.120374389582203, + "grad_norm": 3.9596750736236572, + "learning_rate": 6.865972317390709e-06, + "loss": 0.261, + "step": 23620 + }, + { + "epoch": 1.1203879544221378, + "grad_norm": 4.951292991638184, + "learning_rate": 6.865835274770454e-06, + "loss": 0.2695, + "step": 23621 + }, + { + "epoch": 1.1204015192620727, + "grad_norm": 6.483020305633545, + "learning_rate": 6.8656982321502e-06, + "loss": 0.4462, + "step": 23622 + }, + { + "epoch": 1.1204150841020075, + "grad_norm": 6.739603519439697, + "learning_rate": 6.865561189529944e-06, + "loss": 0.3977, + "step": 23623 + }, + { + "epoch": 1.1204286489419424, + "grad_norm": 5.577795505523682, + "learning_rate": 6.865424146909689e-06, + "loss": 0.2959, + "step": 23624 + }, + { + "epoch": 1.1204422137818775, + "grad_norm": 5.259073257446289, + "learning_rate": 6.8652871042894345e-06, + "loss": 0.238, + "step": 23625 + }, + { + "epoch": 1.1204557786218123, + "grad_norm": 4.312703609466553, + "learning_rate": 6.8651500616691805e-06, + "loss": 0.1907, + "step": 23626 + }, + { + "epoch": 1.1204693434617472, + "grad_norm": 5.205452919006348, + "learning_rate": 6.865013019048925e-06, + "loss": 0.2626, + "step": 23627 + }, + { + "epoch": 1.120482908301682, + "grad_norm": 6.249931335449219, + "learning_rate": 6.86487597642867e-06, + "loss": 0.2954, + "step": 23628 + }, + { + "epoch": 1.120496473141617, + "grad_norm": 4.002696514129639, + "learning_rate": 6.864738933808414e-06, + "loss": 0.1798, + "step": 23629 + }, + { + "epoch": 1.1205100379815518, + "grad_norm": 6.601558685302734, + "learning_rate": 6.8646018911881596e-06, + "loss": 0.3851, + "step": 23630 + }, + { + "epoch": 1.1205236028214867, + "grad_norm": 5.48677921295166, + "learning_rate": 6.864464848567906e-06, + "loss": 0.2778, + "step": 23631 + }, + { + "epoch": 1.1205371676614215, + "grad_norm": 3.4233899116516113, + "learning_rate": 6.864327805947651e-06, + "loss": 0.1269, + "step": 23632 + }, + { + "epoch": 1.1205507325013564, + "grad_norm": 5.963361740112305, + "learning_rate": 6.864190763327395e-06, + "loss": 0.2833, + "step": 23633 + }, + { + "epoch": 1.1205642973412915, + "grad_norm": 4.725876808166504, + "learning_rate": 6.86405372070714e-06, + "loss": 0.1921, + "step": 23634 + }, + { + "epoch": 1.1205778621812263, + "grad_norm": 3.740368366241455, + "learning_rate": 6.863916678086886e-06, + "loss": 0.1527, + "step": 23635 + }, + { + "epoch": 1.1205914270211612, + "grad_norm": 7.379114151000977, + "learning_rate": 6.863779635466631e-06, + "loss": 0.3547, + "step": 23636 + }, + { + "epoch": 1.120604991861096, + "grad_norm": 3.823378801345825, + "learning_rate": 6.863642592846376e-06, + "loss": 0.2011, + "step": 23637 + }, + { + "epoch": 1.120618556701031, + "grad_norm": 5.167407989501953, + "learning_rate": 6.86350555022612e-06, + "loss": 0.2616, + "step": 23638 + }, + { + "epoch": 1.1206321215409658, + "grad_norm": 4.465475082397461, + "learning_rate": 6.863368507605866e-06, + "loss": 0.2272, + "step": 23639 + }, + { + "epoch": 1.1206456863809007, + "grad_norm": 4.8619608879089355, + "learning_rate": 6.863231464985611e-06, + "loss": 0.2046, + "step": 23640 + }, + { + "epoch": 1.1206592512208355, + "grad_norm": 4.491587162017822, + "learning_rate": 6.8630944223653565e-06, + "loss": 0.1578, + "step": 23641 + }, + { + "epoch": 1.1206728160607704, + "grad_norm": 6.155954837799072, + "learning_rate": 6.862957379745101e-06, + "loss": 0.2168, + "step": 23642 + }, + { + "epoch": 1.1206863809007053, + "grad_norm": 4.302436351776123, + "learning_rate": 6.862820337124846e-06, + "loss": 0.1834, + "step": 23643 + }, + { + "epoch": 1.1206999457406404, + "grad_norm": 5.5119948387146, + "learning_rate": 6.862683294504592e-06, + "loss": 0.1897, + "step": 23644 + }, + { + "epoch": 1.1207135105805752, + "grad_norm": 4.687562465667725, + "learning_rate": 6.862546251884336e-06, + "loss": 0.2902, + "step": 23645 + }, + { + "epoch": 1.12072707542051, + "grad_norm": 4.422606945037842, + "learning_rate": 6.8624092092640816e-06, + "loss": 0.3097, + "step": 23646 + }, + { + "epoch": 1.120740640260445, + "grad_norm": 3.606346845626831, + "learning_rate": 6.862272166643827e-06, + "loss": 0.1096, + "step": 23647 + }, + { + "epoch": 1.1207542051003798, + "grad_norm": 7.182992935180664, + "learning_rate": 6.862135124023572e-06, + "loss": 0.3266, + "step": 23648 + }, + { + "epoch": 1.1207677699403147, + "grad_norm": 4.418886184692383, + "learning_rate": 6.861998081403317e-06, + "loss": 0.1838, + "step": 23649 + }, + { + "epoch": 1.1207813347802495, + "grad_norm": 5.966048717498779, + "learning_rate": 6.861861038783062e-06, + "loss": 0.2466, + "step": 23650 + }, + { + "epoch": 1.1207948996201844, + "grad_norm": 4.733202934265137, + "learning_rate": 6.861723996162807e-06, + "loss": 0.2171, + "step": 23651 + }, + { + "epoch": 1.1208084644601193, + "grad_norm": 6.003176689147949, + "learning_rate": 6.861586953542553e-06, + "loss": 0.3029, + "step": 23652 + }, + { + "epoch": 1.1208220293000544, + "grad_norm": 4.538427829742432, + "learning_rate": 6.861449910922298e-06, + "loss": 0.1454, + "step": 23653 + }, + { + "epoch": 1.1208355941399892, + "grad_norm": 5.329209804534912, + "learning_rate": 6.861312868302042e-06, + "loss": 0.3022, + "step": 23654 + }, + { + "epoch": 1.120849158979924, + "grad_norm": 7.217263698577881, + "learning_rate": 6.861175825681787e-06, + "loss": 0.2946, + "step": 23655 + }, + { + "epoch": 1.120862723819859, + "grad_norm": 5.566424369812012, + "learning_rate": 6.8610387830615325e-06, + "loss": 0.2803, + "step": 23656 + }, + { + "epoch": 1.1208762886597938, + "grad_norm": 5.638309001922607, + "learning_rate": 6.860901740441278e-06, + "loss": 0.1955, + "step": 23657 + }, + { + "epoch": 1.1208898534997287, + "grad_norm": 3.463449478149414, + "learning_rate": 6.860764697821023e-06, + "loss": 0.1702, + "step": 23658 + }, + { + "epoch": 1.1209034183396636, + "grad_norm": 6.413372993469238, + "learning_rate": 6.860627655200768e-06, + "loss": 0.3387, + "step": 23659 + }, + { + "epoch": 1.1209169831795984, + "grad_norm": 6.1274800300598145, + "learning_rate": 6.860490612580512e-06, + "loss": 0.3113, + "step": 23660 + }, + { + "epoch": 1.1209305480195333, + "grad_norm": 6.32970666885376, + "learning_rate": 6.860353569960258e-06, + "loss": 0.2526, + "step": 23661 + }, + { + "epoch": 1.1209441128594682, + "grad_norm": 6.124021053314209, + "learning_rate": 6.860216527340004e-06, + "loss": 0.2251, + "step": 23662 + }, + { + "epoch": 1.1209576776994032, + "grad_norm": 5.27263069152832, + "learning_rate": 6.860079484719748e-06, + "loss": 0.1804, + "step": 23663 + }, + { + "epoch": 1.120971242539338, + "grad_norm": 4.492073059082031, + "learning_rate": 6.859942442099493e-06, + "loss": 0.2582, + "step": 23664 + }, + { + "epoch": 1.120984807379273, + "grad_norm": 4.213636875152588, + "learning_rate": 6.859805399479239e-06, + "loss": 0.241, + "step": 23665 + }, + { + "epoch": 1.1209983722192078, + "grad_norm": 5.491451740264893, + "learning_rate": 6.859668356858984e-06, + "loss": 0.2342, + "step": 23666 + }, + { + "epoch": 1.1210119370591427, + "grad_norm": 6.372068405151367, + "learning_rate": 6.859531314238729e-06, + "loss": 0.293, + "step": 23667 + }, + { + "epoch": 1.1210255018990776, + "grad_norm": 3.9385721683502197, + "learning_rate": 6.859394271618474e-06, + "loss": 0.194, + "step": 23668 + }, + { + "epoch": 1.1210390667390124, + "grad_norm": 5.15919303894043, + "learning_rate": 6.85925722899822e-06, + "loss": 0.2012, + "step": 23669 + }, + { + "epoch": 1.1210526315789473, + "grad_norm": 5.913845062255859, + "learning_rate": 6.859120186377964e-06, + "loss": 0.2476, + "step": 23670 + }, + { + "epoch": 1.1210661964188822, + "grad_norm": 4.510428428649902, + "learning_rate": 6.858983143757709e-06, + "loss": 0.1705, + "step": 23671 + }, + { + "epoch": 1.1210797612588173, + "grad_norm": 5.946413993835449, + "learning_rate": 6.858846101137454e-06, + "loss": 0.4289, + "step": 23672 + }, + { + "epoch": 1.1210933260987521, + "grad_norm": 5.890031337738037, + "learning_rate": 6.858709058517199e-06, + "loss": 0.2765, + "step": 23673 + }, + { + "epoch": 1.121106890938687, + "grad_norm": 5.010260105133057, + "learning_rate": 6.858572015896945e-06, + "loss": 0.1653, + "step": 23674 + }, + { + "epoch": 1.1211204557786218, + "grad_norm": 5.027435779571533, + "learning_rate": 6.85843497327669e-06, + "loss": 0.1492, + "step": 23675 + }, + { + "epoch": 1.1211340206185567, + "grad_norm": 4.105423927307129, + "learning_rate": 6.858297930656434e-06, + "loss": 0.2268, + "step": 23676 + }, + { + "epoch": 1.1211475854584916, + "grad_norm": 4.542468547821045, + "learning_rate": 6.8581608880361796e-06, + "loss": 0.182, + "step": 23677 + }, + { + "epoch": 1.1211611502984264, + "grad_norm": 6.798110008239746, + "learning_rate": 6.858023845415926e-06, + "loss": 0.2788, + "step": 23678 + }, + { + "epoch": 1.1211747151383613, + "grad_norm": 5.105318069458008, + "learning_rate": 6.85788680279567e-06, + "loss": 0.2158, + "step": 23679 + }, + { + "epoch": 1.1211882799782962, + "grad_norm": 4.888563632965088, + "learning_rate": 6.857749760175415e-06, + "loss": 0.2942, + "step": 23680 + }, + { + "epoch": 1.121201844818231, + "grad_norm": 5.120891094207764, + "learning_rate": 6.85761271755516e-06, + "loss": 0.2378, + "step": 23681 + }, + { + "epoch": 1.1212154096581661, + "grad_norm": 3.6622743606567383, + "learning_rate": 6.8574756749349055e-06, + "loss": 0.1763, + "step": 23682 + }, + { + "epoch": 1.121228974498101, + "grad_norm": 4.506131649017334, + "learning_rate": 6.857338632314651e-06, + "loss": 0.1982, + "step": 23683 + }, + { + "epoch": 1.1212425393380359, + "grad_norm": 5.419876575469971, + "learning_rate": 6.857201589694396e-06, + "loss": 0.3221, + "step": 23684 + }, + { + "epoch": 1.1212561041779707, + "grad_norm": 6.114226341247559, + "learning_rate": 6.85706454707414e-06, + "loss": 0.2691, + "step": 23685 + }, + { + "epoch": 1.1212696690179056, + "grad_norm": 4.7499895095825195, + "learning_rate": 6.856927504453885e-06, + "loss": 0.1437, + "step": 23686 + }, + { + "epoch": 1.1212832338578405, + "grad_norm": 6.469411373138428, + "learning_rate": 6.856790461833631e-06, + "loss": 0.3327, + "step": 23687 + }, + { + "epoch": 1.1212967986977753, + "grad_norm": 5.490689277648926, + "learning_rate": 6.856653419213376e-06, + "loss": 0.3297, + "step": 23688 + }, + { + "epoch": 1.1213103635377102, + "grad_norm": 5.985573768615723, + "learning_rate": 6.856516376593121e-06, + "loss": 0.3102, + "step": 23689 + }, + { + "epoch": 1.121323928377645, + "grad_norm": 5.181952476501465, + "learning_rate": 6.856379333972866e-06, + "loss": 0.213, + "step": 23690 + }, + { + "epoch": 1.1213374932175801, + "grad_norm": 5.154639720916748, + "learning_rate": 6.856242291352612e-06, + "loss": 0.299, + "step": 23691 + }, + { + "epoch": 1.121351058057515, + "grad_norm": 8.387412071228027, + "learning_rate": 6.856105248732356e-06, + "loss": 0.3392, + "step": 23692 + }, + { + "epoch": 1.1213646228974499, + "grad_norm": 7.871403217315674, + "learning_rate": 6.855968206112102e-06, + "loss": 0.2746, + "step": 23693 + }, + { + "epoch": 1.1213781877373847, + "grad_norm": 5.380945682525635, + "learning_rate": 6.855831163491846e-06, + "loss": 0.2696, + "step": 23694 + }, + { + "epoch": 1.1213917525773196, + "grad_norm": 4.815368175506592, + "learning_rate": 6.855694120871592e-06, + "loss": 0.2521, + "step": 23695 + }, + { + "epoch": 1.1214053174172545, + "grad_norm": 5.407370567321777, + "learning_rate": 6.855557078251337e-06, + "loss": 0.2814, + "step": 23696 + }, + { + "epoch": 1.1214188822571893, + "grad_norm": 6.69503116607666, + "learning_rate": 6.8554200356310815e-06, + "loss": 0.3315, + "step": 23697 + }, + { + "epoch": 1.1214324470971242, + "grad_norm": 6.358486652374268, + "learning_rate": 6.855282993010827e-06, + "loss": 0.3134, + "step": 23698 + }, + { + "epoch": 1.121446011937059, + "grad_norm": 4.188605308532715, + "learning_rate": 6.855145950390572e-06, + "loss": 0.2043, + "step": 23699 + }, + { + "epoch": 1.121459576776994, + "grad_norm": 4.964808940887451, + "learning_rate": 6.855008907770318e-06, + "loss": 0.3636, + "step": 23700 + }, + { + "epoch": 1.121473141616929, + "grad_norm": 4.986566066741943, + "learning_rate": 6.854871865150062e-06, + "loss": 0.2541, + "step": 23701 + }, + { + "epoch": 1.1214867064568639, + "grad_norm": 5.170778751373291, + "learning_rate": 6.854734822529807e-06, + "loss": 0.2669, + "step": 23702 + }, + { + "epoch": 1.1215002712967987, + "grad_norm": 5.321707725524902, + "learning_rate": 6.854597779909552e-06, + "loss": 0.2267, + "step": 23703 + }, + { + "epoch": 1.1215138361367336, + "grad_norm": 6.171379566192627, + "learning_rate": 6.854460737289298e-06, + "loss": 0.3114, + "step": 23704 + }, + { + "epoch": 1.1215274009766685, + "grad_norm": 7.138700008392334, + "learning_rate": 6.854323694669043e-06, + "loss": 0.416, + "step": 23705 + }, + { + "epoch": 1.1215409658166033, + "grad_norm": 5.177974224090576, + "learning_rate": 6.854186652048787e-06, + "loss": 0.2979, + "step": 23706 + }, + { + "epoch": 1.1215545306565382, + "grad_norm": 4.779210567474365, + "learning_rate": 6.854049609428532e-06, + "loss": 0.2251, + "step": 23707 + }, + { + "epoch": 1.121568095496473, + "grad_norm": 7.006053447723389, + "learning_rate": 6.853912566808278e-06, + "loss": 0.3594, + "step": 23708 + }, + { + "epoch": 1.121581660336408, + "grad_norm": 8.406380653381348, + "learning_rate": 6.853775524188024e-06, + "loss": 0.4149, + "step": 23709 + }, + { + "epoch": 1.121595225176343, + "grad_norm": 4.974750518798828, + "learning_rate": 6.853638481567768e-06, + "loss": 0.2247, + "step": 23710 + }, + { + "epoch": 1.121608790016278, + "grad_norm": 4.822762966156006, + "learning_rate": 6.853501438947513e-06, + "loss": 0.2716, + "step": 23711 + }, + { + "epoch": 1.1216223548562128, + "grad_norm": 5.680087089538574, + "learning_rate": 6.8533643963272574e-06, + "loss": 0.2449, + "step": 23712 + }, + { + "epoch": 1.1216359196961476, + "grad_norm": 4.63127326965332, + "learning_rate": 6.8532273537070035e-06, + "loss": 0.2808, + "step": 23713 + }, + { + "epoch": 1.1216494845360825, + "grad_norm": 8.325471878051758, + "learning_rate": 6.853090311086749e-06, + "loss": 0.5211, + "step": 23714 + }, + { + "epoch": 1.1216630493760174, + "grad_norm": 4.880992889404297, + "learning_rate": 6.852953268466494e-06, + "loss": 0.2364, + "step": 23715 + }, + { + "epoch": 1.1216766142159522, + "grad_norm": 4.914573669433594, + "learning_rate": 6.852816225846238e-06, + "loss": 0.2244, + "step": 23716 + }, + { + "epoch": 1.121690179055887, + "grad_norm": 7.119693756103516, + "learning_rate": 6.852679183225984e-06, + "loss": 0.4754, + "step": 23717 + }, + { + "epoch": 1.121703743895822, + "grad_norm": 5.758190155029297, + "learning_rate": 6.852542140605729e-06, + "loss": 0.3659, + "step": 23718 + }, + { + "epoch": 1.1217173087357568, + "grad_norm": 5.020367622375488, + "learning_rate": 6.852405097985474e-06, + "loss": 0.2208, + "step": 23719 + }, + { + "epoch": 1.121730873575692, + "grad_norm": 6.950507640838623, + "learning_rate": 6.852268055365219e-06, + "loss": 0.4098, + "step": 23720 + }, + { + "epoch": 1.1217444384156268, + "grad_norm": 4.5161261558532715, + "learning_rate": 6.852131012744965e-06, + "loss": 0.1888, + "step": 23721 + }, + { + "epoch": 1.1217580032555616, + "grad_norm": 6.082958221435547, + "learning_rate": 6.851993970124709e-06, + "loss": 0.3038, + "step": 23722 + }, + { + "epoch": 1.1217715680954965, + "grad_norm": 5.214334011077881, + "learning_rate": 6.851856927504454e-06, + "loss": 0.354, + "step": 23723 + }, + { + "epoch": 1.1217851329354314, + "grad_norm": 5.5614166259765625, + "learning_rate": 6.8517198848842e-06, + "loss": 0.2136, + "step": 23724 + }, + { + "epoch": 1.1217986977753662, + "grad_norm": 5.2037577629089355, + "learning_rate": 6.851582842263944e-06, + "loss": 0.2371, + "step": 23725 + }, + { + "epoch": 1.121812262615301, + "grad_norm": 5.975783824920654, + "learning_rate": 6.85144579964369e-06, + "loss": 0.2817, + "step": 23726 + }, + { + "epoch": 1.121825827455236, + "grad_norm": 8.759634017944336, + "learning_rate": 6.851308757023435e-06, + "loss": 0.4476, + "step": 23727 + }, + { + "epoch": 1.1218393922951708, + "grad_norm": 6.324013710021973, + "learning_rate": 6.8511717144031795e-06, + "loss": 0.2946, + "step": 23728 + }, + { + "epoch": 1.121852957135106, + "grad_norm": 8.012055397033691, + "learning_rate": 6.851034671782925e-06, + "loss": 0.5033, + "step": 23729 + }, + { + "epoch": 1.1218665219750408, + "grad_norm": 7.113495826721191, + "learning_rate": 6.850897629162671e-06, + "loss": 0.4943, + "step": 23730 + }, + { + "epoch": 1.1218800868149756, + "grad_norm": 6.460733890533447, + "learning_rate": 6.850760586542415e-06, + "loss": 0.2652, + "step": 23731 + }, + { + "epoch": 1.1218936516549105, + "grad_norm": 6.008828163146973, + "learning_rate": 6.85062354392216e-06, + "loss": 0.4469, + "step": 23732 + }, + { + "epoch": 1.1219072164948454, + "grad_norm": 6.471225261688232, + "learning_rate": 6.850486501301905e-06, + "loss": 0.3279, + "step": 23733 + }, + { + "epoch": 1.1219207813347802, + "grad_norm": 4.969497203826904, + "learning_rate": 6.850349458681651e-06, + "loss": 0.2419, + "step": 23734 + }, + { + "epoch": 1.121934346174715, + "grad_norm": 6.853283405303955, + "learning_rate": 6.850212416061396e-06, + "loss": 0.3931, + "step": 23735 + }, + { + "epoch": 1.12194791101465, + "grad_norm": 5.958567142486572, + "learning_rate": 6.850075373441141e-06, + "loss": 0.301, + "step": 23736 + }, + { + "epoch": 1.1219614758545848, + "grad_norm": 6.497175693511963, + "learning_rate": 6.849938330820885e-06, + "loss": 0.3809, + "step": 23737 + }, + { + "epoch": 1.1219750406945197, + "grad_norm": 7.47532844543457, + "learning_rate": 6.849801288200631e-06, + "loss": 0.5297, + "step": 23738 + }, + { + "epoch": 1.1219886055344548, + "grad_norm": 4.835123538970947, + "learning_rate": 6.8496642455803764e-06, + "loss": 0.3425, + "step": 23739 + }, + { + "epoch": 1.1220021703743897, + "grad_norm": 5.9765706062316895, + "learning_rate": 6.849527202960122e-06, + "loss": 0.325, + "step": 23740 + }, + { + "epoch": 1.1220157352143245, + "grad_norm": 5.858352184295654, + "learning_rate": 6.849390160339866e-06, + "loss": 0.2995, + "step": 23741 + }, + { + "epoch": 1.1220293000542594, + "grad_norm": 3.9979588985443115, + "learning_rate": 6.849253117719611e-06, + "loss": 0.3019, + "step": 23742 + }, + { + "epoch": 1.1220428648941942, + "grad_norm": 6.198206901550293, + "learning_rate": 6.849116075099357e-06, + "loss": 0.2533, + "step": 23743 + }, + { + "epoch": 1.1220564297341291, + "grad_norm": 6.847112655639648, + "learning_rate": 6.8489790324791015e-06, + "loss": 0.4414, + "step": 23744 + }, + { + "epoch": 1.122069994574064, + "grad_norm": 6.781321048736572, + "learning_rate": 6.848841989858847e-06, + "loss": 0.4162, + "step": 23745 + }, + { + "epoch": 1.1220835594139988, + "grad_norm": 5.466981410980225, + "learning_rate": 6.848704947238591e-06, + "loss": 0.3271, + "step": 23746 + }, + { + "epoch": 1.1220971242539337, + "grad_norm": 6.628292560577393, + "learning_rate": 6.848567904618337e-06, + "loss": 0.575, + "step": 23747 + }, + { + "epoch": 1.1221106890938688, + "grad_norm": 5.9059367179870605, + "learning_rate": 6.848430861998082e-06, + "loss": 0.4216, + "step": 23748 + }, + { + "epoch": 1.1221242539338037, + "grad_norm": 4.425546169281006, + "learning_rate": 6.848293819377827e-06, + "loss": 0.1839, + "step": 23749 + }, + { + "epoch": 1.1221378187737385, + "grad_norm": 6.3992133140563965, + "learning_rate": 6.848156776757572e-06, + "loss": 0.3887, + "step": 23750 + }, + { + "epoch": 1.1221513836136734, + "grad_norm": 4.376638889312744, + "learning_rate": 6.848019734137318e-06, + "loss": 0.1972, + "step": 23751 + }, + { + "epoch": 1.1221649484536083, + "grad_norm": 7.186594009399414, + "learning_rate": 6.847882691517063e-06, + "loss": 0.4639, + "step": 23752 + }, + { + "epoch": 1.1221785132935431, + "grad_norm": 5.699093818664551, + "learning_rate": 6.847745648896807e-06, + "loss": 0.4013, + "step": 23753 + }, + { + "epoch": 1.122192078133478, + "grad_norm": 6.021383762359619, + "learning_rate": 6.847608606276552e-06, + "loss": 0.3624, + "step": 23754 + }, + { + "epoch": 1.1222056429734129, + "grad_norm": 6.518819332122803, + "learning_rate": 6.847471563656298e-06, + "loss": 0.4183, + "step": 23755 + }, + { + "epoch": 1.1222192078133477, + "grad_norm": 6.548716068267822, + "learning_rate": 6.847334521036043e-06, + "loss": 0.4113, + "step": 23756 + }, + { + "epoch": 1.1222327726532826, + "grad_norm": 6.949937343597412, + "learning_rate": 6.847197478415788e-06, + "loss": 0.3791, + "step": 23757 + }, + { + "epoch": 1.1222463374932177, + "grad_norm": 6.157729625701904, + "learning_rate": 6.847060435795533e-06, + "loss": 0.3849, + "step": 23758 + }, + { + "epoch": 1.1222599023331525, + "grad_norm": 5.059645652770996, + "learning_rate": 6.8469233931752775e-06, + "loss": 0.3836, + "step": 23759 + }, + { + "epoch": 1.1222734671730874, + "grad_norm": 5.653387069702148, + "learning_rate": 6.8467863505550235e-06, + "loss": 0.4171, + "step": 23760 + }, + { + "epoch": 1.1222870320130223, + "grad_norm": 5.809792995452881, + "learning_rate": 6.846649307934769e-06, + "loss": 0.3272, + "step": 23761 + }, + { + "epoch": 1.1223005968529571, + "grad_norm": 6.390780448913574, + "learning_rate": 6.846512265314513e-06, + "loss": 0.2541, + "step": 23762 + }, + { + "epoch": 1.122314161692892, + "grad_norm": 5.849400520324707, + "learning_rate": 6.846375222694258e-06, + "loss": 0.3853, + "step": 23763 + }, + { + "epoch": 1.1223277265328269, + "grad_norm": 6.41124153137207, + "learning_rate": 6.846238180074004e-06, + "loss": 0.5191, + "step": 23764 + }, + { + "epoch": 1.1223412913727617, + "grad_norm": 4.929773330688477, + "learning_rate": 6.8461011374537485e-06, + "loss": 0.3316, + "step": 23765 + }, + { + "epoch": 1.1223548562126966, + "grad_norm": 6.088529586791992, + "learning_rate": 6.845964094833494e-06, + "loss": 0.3468, + "step": 23766 + }, + { + "epoch": 1.1223684210526317, + "grad_norm": 5.206765651702881, + "learning_rate": 6.845827052213239e-06, + "loss": 0.381, + "step": 23767 + }, + { + "epoch": 1.1223819858925665, + "grad_norm": 5.344156742095947, + "learning_rate": 6.845690009592983e-06, + "loss": 0.3191, + "step": 23768 + }, + { + "epoch": 1.1223955507325014, + "grad_norm": 6.112555027008057, + "learning_rate": 6.845552966972729e-06, + "loss": 0.3447, + "step": 23769 + }, + { + "epoch": 1.1224091155724363, + "grad_norm": 6.486316680908203, + "learning_rate": 6.8454159243524744e-06, + "loss": 0.4951, + "step": 23770 + }, + { + "epoch": 1.1224226804123711, + "grad_norm": 5.22055196762085, + "learning_rate": 6.845278881732219e-06, + "loss": 0.322, + "step": 23771 + }, + { + "epoch": 1.122436245252306, + "grad_norm": 5.8712592124938965, + "learning_rate": 6.845141839111964e-06, + "loss": 0.3091, + "step": 23772 + }, + { + "epoch": 1.1224498100922409, + "grad_norm": 4.163758277893066, + "learning_rate": 6.84500479649171e-06, + "loss": 0.2196, + "step": 23773 + }, + { + "epoch": 1.1224633749321757, + "grad_norm": 5.88386344909668, + "learning_rate": 6.844867753871455e-06, + "loss": 0.4029, + "step": 23774 + }, + { + "epoch": 1.1224769397721106, + "grad_norm": 4.565727233886719, + "learning_rate": 6.8447307112511995e-06, + "loss": 0.2524, + "step": 23775 + }, + { + "epoch": 1.1224905046120455, + "grad_norm": 7.882680892944336, + "learning_rate": 6.844593668630945e-06, + "loss": 0.328, + "step": 23776 + }, + { + "epoch": 1.1225040694519806, + "grad_norm": 6.1574554443359375, + "learning_rate": 6.844456626010691e-06, + "loss": 0.305, + "step": 23777 + }, + { + "epoch": 1.1225176342919154, + "grad_norm": 4.840493679046631, + "learning_rate": 6.844319583390435e-06, + "loss": 0.3216, + "step": 23778 + }, + { + "epoch": 1.1225311991318503, + "grad_norm": 5.844607353210449, + "learning_rate": 6.84418254077018e-06, + "loss": 0.3434, + "step": 23779 + }, + { + "epoch": 1.1225447639717852, + "grad_norm": 5.781574726104736, + "learning_rate": 6.8440454981499245e-06, + "loss": 0.3489, + "step": 23780 + }, + { + "epoch": 1.12255832881172, + "grad_norm": 5.36311149597168, + "learning_rate": 6.84390845552967e-06, + "loss": 0.3956, + "step": 23781 + }, + { + "epoch": 1.1225718936516549, + "grad_norm": 5.184792518615723, + "learning_rate": 6.843771412909416e-06, + "loss": 0.2221, + "step": 23782 + }, + { + "epoch": 1.1225854584915897, + "grad_norm": 5.86206579208374, + "learning_rate": 6.843634370289161e-06, + "loss": 0.3333, + "step": 23783 + }, + { + "epoch": 1.1225990233315246, + "grad_norm": 6.324168682098389, + "learning_rate": 6.843497327668905e-06, + "loss": 0.4378, + "step": 23784 + }, + { + "epoch": 1.1226125881714595, + "grad_norm": 5.216318130493164, + "learning_rate": 6.84336028504865e-06, + "loss": 0.2827, + "step": 23785 + }, + { + "epoch": 1.1226261530113946, + "grad_norm": 5.276360988616943, + "learning_rate": 6.8432232424283964e-06, + "loss": 0.3248, + "step": 23786 + }, + { + "epoch": 1.1226397178513294, + "grad_norm": 6.291286945343018, + "learning_rate": 6.843086199808141e-06, + "loss": 0.3382, + "step": 23787 + }, + { + "epoch": 1.1226532826912643, + "grad_norm": 5.647930145263672, + "learning_rate": 6.842949157187886e-06, + "loss": 0.4377, + "step": 23788 + }, + { + "epoch": 1.1226668475311992, + "grad_norm": 7.361562728881836, + "learning_rate": 6.842812114567631e-06, + "loss": 0.383, + "step": 23789 + }, + { + "epoch": 1.122680412371134, + "grad_norm": 5.536763668060303, + "learning_rate": 6.842675071947376e-06, + "loss": 0.3937, + "step": 23790 + }, + { + "epoch": 1.122693977211069, + "grad_norm": 4.4562578201293945, + "learning_rate": 6.8425380293271215e-06, + "loss": 0.2217, + "step": 23791 + }, + { + "epoch": 1.1227075420510038, + "grad_norm": 4.539406776428223, + "learning_rate": 6.842400986706867e-06, + "loss": 0.2403, + "step": 23792 + }, + { + "epoch": 1.1227211068909386, + "grad_norm": 5.12476110458374, + "learning_rate": 6.842263944086611e-06, + "loss": 0.3676, + "step": 23793 + }, + { + "epoch": 1.1227346717308735, + "grad_norm": 7.030679225921631, + "learning_rate": 6.842126901466356e-06, + "loss": 0.3563, + "step": 23794 + }, + { + "epoch": 1.1227482365708084, + "grad_norm": 4.8098320960998535, + "learning_rate": 6.841989858846102e-06, + "loss": 0.2108, + "step": 23795 + }, + { + "epoch": 1.1227618014107434, + "grad_norm": 6.546154022216797, + "learning_rate": 6.8418528162258465e-06, + "loss": 0.4336, + "step": 23796 + }, + { + "epoch": 1.1227753662506783, + "grad_norm": 4.874143123626709, + "learning_rate": 6.841715773605592e-06, + "loss": 0.2568, + "step": 23797 + }, + { + "epoch": 1.1227889310906132, + "grad_norm": 4.9956955909729, + "learning_rate": 6.841578730985337e-06, + "loss": 0.1656, + "step": 23798 + }, + { + "epoch": 1.122802495930548, + "grad_norm": 5.750828266143799, + "learning_rate": 6.841441688365082e-06, + "loss": 0.2707, + "step": 23799 + }, + { + "epoch": 1.122816060770483, + "grad_norm": 6.402029514312744, + "learning_rate": 6.841304645744827e-06, + "loss": 0.4668, + "step": 23800 + }, + { + "epoch": 1.1228296256104178, + "grad_norm": 7.291773319244385, + "learning_rate": 6.8411676031245724e-06, + "loss": 0.4798, + "step": 23801 + }, + { + "epoch": 1.1228431904503526, + "grad_norm": 4.796763896942139, + "learning_rate": 6.841030560504317e-06, + "loss": 0.1995, + "step": 23802 + }, + { + "epoch": 1.1228567552902875, + "grad_norm": 4.386215686798096, + "learning_rate": 6.840893517884063e-06, + "loss": 0.2696, + "step": 23803 + }, + { + "epoch": 1.1228703201302224, + "grad_norm": 6.207660675048828, + "learning_rate": 6.840756475263808e-06, + "loss": 0.3547, + "step": 23804 + }, + { + "epoch": 1.1228838849701575, + "grad_norm": 5.5487751960754395, + "learning_rate": 6.840619432643552e-06, + "loss": 0.3042, + "step": 23805 + }, + { + "epoch": 1.1228974498100923, + "grad_norm": 6.597346305847168, + "learning_rate": 6.8404823900232975e-06, + "loss": 0.3923, + "step": 23806 + }, + { + "epoch": 1.1229110146500272, + "grad_norm": 5.301148414611816, + "learning_rate": 6.8403453474030435e-06, + "loss": 0.3448, + "step": 23807 + }, + { + "epoch": 1.122924579489962, + "grad_norm": 5.4037604331970215, + "learning_rate": 6.840208304782789e-06, + "loss": 0.2408, + "step": 23808 + }, + { + "epoch": 1.122938144329897, + "grad_norm": 4.629215717315674, + "learning_rate": 6.840071262162533e-06, + "loss": 0.2599, + "step": 23809 + }, + { + "epoch": 1.1229517091698318, + "grad_norm": 6.454537868499756, + "learning_rate": 6.839934219542278e-06, + "loss": 0.3099, + "step": 23810 + }, + { + "epoch": 1.1229652740097666, + "grad_norm": 3.5644893646240234, + "learning_rate": 6.8397971769220225e-06, + "loss": 0.2515, + "step": 23811 + }, + { + "epoch": 1.1229788388497015, + "grad_norm": 4.790698528289795, + "learning_rate": 6.8396601343017686e-06, + "loss": 0.4071, + "step": 23812 + }, + { + "epoch": 1.1229924036896364, + "grad_norm": 5.707278728485107, + "learning_rate": 6.839523091681514e-06, + "loss": 0.3592, + "step": 23813 + }, + { + "epoch": 1.1230059685295712, + "grad_norm": 5.755290508270264, + "learning_rate": 6.839386049061258e-06, + "loss": 0.3781, + "step": 23814 + }, + { + "epoch": 1.1230195333695063, + "grad_norm": 4.884204387664795, + "learning_rate": 6.839249006441003e-06, + "loss": 0.2308, + "step": 23815 + }, + { + "epoch": 1.1230330982094412, + "grad_norm": 5.391963005065918, + "learning_rate": 6.839111963820749e-06, + "loss": 0.3106, + "step": 23816 + }, + { + "epoch": 1.123046663049376, + "grad_norm": 5.759681224822998, + "learning_rate": 6.8389749212004945e-06, + "loss": 0.3194, + "step": 23817 + }, + { + "epoch": 1.123060227889311, + "grad_norm": 7.057823181152344, + "learning_rate": 6.838837878580239e-06, + "loss": 0.3185, + "step": 23818 + }, + { + "epoch": 1.1230737927292458, + "grad_norm": 9.438502311706543, + "learning_rate": 6.838700835959984e-06, + "loss": 0.5337, + "step": 23819 + }, + { + "epoch": 1.1230873575691807, + "grad_norm": 6.648925304412842, + "learning_rate": 6.83856379333973e-06, + "loss": 0.2964, + "step": 23820 + }, + { + "epoch": 1.1231009224091155, + "grad_norm": 5.1470627784729, + "learning_rate": 6.838426750719474e-06, + "loss": 0.2548, + "step": 23821 + }, + { + "epoch": 1.1231144872490504, + "grad_norm": 7.765620231628418, + "learning_rate": 6.8382897080992195e-06, + "loss": 0.3708, + "step": 23822 + }, + { + "epoch": 1.1231280520889853, + "grad_norm": 5.487292289733887, + "learning_rate": 6.838152665478965e-06, + "loss": 0.2964, + "step": 23823 + }, + { + "epoch": 1.1231416169289203, + "grad_norm": 9.171445846557617, + "learning_rate": 6.838015622858709e-06, + "loss": 0.4439, + "step": 23824 + }, + { + "epoch": 1.1231551817688552, + "grad_norm": 5.160400390625, + "learning_rate": 6.837878580238455e-06, + "loss": 0.2155, + "step": 23825 + }, + { + "epoch": 1.12316874660879, + "grad_norm": 6.578993797302246, + "learning_rate": 6.8377415376182e-06, + "loss": 0.2482, + "step": 23826 + }, + { + "epoch": 1.123182311448725, + "grad_norm": 5.500188827514648, + "learning_rate": 6.8376044949979445e-06, + "loss": 0.2429, + "step": 23827 + }, + { + "epoch": 1.1231958762886598, + "grad_norm": 4.871129512786865, + "learning_rate": 6.83746745237769e-06, + "loss": 0.2272, + "step": 23828 + }, + { + "epoch": 1.1232094411285947, + "grad_norm": 7.142117977142334, + "learning_rate": 6.837330409757436e-06, + "loss": 0.3962, + "step": 23829 + }, + { + "epoch": 1.1232230059685295, + "grad_norm": 5.929687023162842, + "learning_rate": 6.83719336713718e-06, + "loss": 0.1993, + "step": 23830 + }, + { + "epoch": 1.1232365708084644, + "grad_norm": 4.579277038574219, + "learning_rate": 6.837056324516925e-06, + "loss": 0.3008, + "step": 23831 + }, + { + "epoch": 1.1232501356483993, + "grad_norm": 5.260255336761475, + "learning_rate": 6.8369192818966704e-06, + "loss": 0.2182, + "step": 23832 + }, + { + "epoch": 1.1232637004883341, + "grad_norm": 5.601746559143066, + "learning_rate": 6.8367822392764165e-06, + "loss": 0.3534, + "step": 23833 + }, + { + "epoch": 1.1232772653282692, + "grad_norm": 5.867134094238281, + "learning_rate": 6.836645196656161e-06, + "loss": 0.2976, + "step": 23834 + }, + { + "epoch": 1.123290830168204, + "grad_norm": 4.7320404052734375, + "learning_rate": 6.836508154035906e-06, + "loss": 0.2217, + "step": 23835 + }, + { + "epoch": 1.123304395008139, + "grad_norm": 5.173327922821045, + "learning_rate": 6.83637111141565e-06, + "loss": 0.2194, + "step": 23836 + }, + { + "epoch": 1.1233179598480738, + "grad_norm": 8.374135971069336, + "learning_rate": 6.8362340687953955e-06, + "loss": 0.49, + "step": 23837 + }, + { + "epoch": 1.1233315246880087, + "grad_norm": 6.43446159362793, + "learning_rate": 6.8360970261751415e-06, + "loss": 0.2887, + "step": 23838 + }, + { + "epoch": 1.1233450895279435, + "grad_norm": 7.506188869476318, + "learning_rate": 6.835959983554886e-06, + "loss": 0.3327, + "step": 23839 + }, + { + "epoch": 1.1233586543678784, + "grad_norm": 6.203147888183594, + "learning_rate": 6.835822940934631e-06, + "loss": 0.2058, + "step": 23840 + }, + { + "epoch": 1.1233722192078133, + "grad_norm": 6.530320167541504, + "learning_rate": 6.835685898314376e-06, + "loss": 0.3796, + "step": 23841 + }, + { + "epoch": 1.1233857840477481, + "grad_norm": 6.9514641761779785, + "learning_rate": 6.835548855694122e-06, + "loss": 0.3549, + "step": 23842 + }, + { + "epoch": 1.1233993488876832, + "grad_norm": 5.126882076263428, + "learning_rate": 6.8354118130738666e-06, + "loss": 0.2661, + "step": 23843 + }, + { + "epoch": 1.123412913727618, + "grad_norm": 4.700063705444336, + "learning_rate": 6.835274770453612e-06, + "loss": 0.2677, + "step": 23844 + }, + { + "epoch": 1.123426478567553, + "grad_norm": 5.437479496002197, + "learning_rate": 6.835137727833356e-06, + "loss": 0.3066, + "step": 23845 + }, + { + "epoch": 1.1234400434074878, + "grad_norm": 6.634584903717041, + "learning_rate": 6.835000685213102e-06, + "loss": 0.2636, + "step": 23846 + }, + { + "epoch": 1.1234536082474227, + "grad_norm": 4.532954692840576, + "learning_rate": 6.834863642592847e-06, + "loss": 0.1991, + "step": 23847 + }, + { + "epoch": 1.1234671730873576, + "grad_norm": 5.214268207550049, + "learning_rate": 6.834726599972592e-06, + "loss": 0.2162, + "step": 23848 + }, + { + "epoch": 1.1234807379272924, + "grad_norm": 5.126158237457275, + "learning_rate": 6.834589557352337e-06, + "loss": 0.2812, + "step": 23849 + }, + { + "epoch": 1.1234943027672273, + "grad_norm": 5.0840654373168945, + "learning_rate": 6.834452514732082e-06, + "loss": 0.2516, + "step": 23850 + }, + { + "epoch": 1.1235078676071621, + "grad_norm": 5.027078628540039, + "learning_rate": 6.834315472111828e-06, + "loss": 0.362, + "step": 23851 + }, + { + "epoch": 1.123521432447097, + "grad_norm": 4.024796962738037, + "learning_rate": 6.834178429491572e-06, + "loss": 0.206, + "step": 23852 + }, + { + "epoch": 1.123534997287032, + "grad_norm": 5.0466461181640625, + "learning_rate": 6.8340413868713175e-06, + "loss": 0.1817, + "step": 23853 + }, + { + "epoch": 1.123548562126967, + "grad_norm": 6.018303871154785, + "learning_rate": 6.833904344251062e-06, + "loss": 0.4087, + "step": 23854 + }, + { + "epoch": 1.1235621269669018, + "grad_norm": 4.467734336853027, + "learning_rate": 6.833767301630808e-06, + "loss": 0.2275, + "step": 23855 + }, + { + "epoch": 1.1235756918068367, + "grad_norm": 5.3420186042785645, + "learning_rate": 6.833630259010553e-06, + "loss": 0.2648, + "step": 23856 + }, + { + "epoch": 1.1235892566467716, + "grad_norm": 5.8562235832214355, + "learning_rate": 6.833493216390298e-06, + "loss": 0.421, + "step": 23857 + }, + { + "epoch": 1.1236028214867064, + "grad_norm": 6.972843170166016, + "learning_rate": 6.8333561737700425e-06, + "loss": 0.3903, + "step": 23858 + }, + { + "epoch": 1.1236163863266413, + "grad_norm": 5.800692558288574, + "learning_rate": 6.8332191311497886e-06, + "loss": 0.2474, + "step": 23859 + }, + { + "epoch": 1.1236299511665762, + "grad_norm": 4.63075590133667, + "learning_rate": 6.833082088529534e-06, + "loss": 0.2515, + "step": 23860 + }, + { + "epoch": 1.123643516006511, + "grad_norm": 5.399903297424316, + "learning_rate": 6.832945045909278e-06, + "loss": 0.2781, + "step": 23861 + }, + { + "epoch": 1.123657080846446, + "grad_norm": 7.1394362449646, + "learning_rate": 6.832808003289023e-06, + "loss": 0.3766, + "step": 23862 + }, + { + "epoch": 1.123670645686381, + "grad_norm": 5.830821990966797, + "learning_rate": 6.832670960668768e-06, + "loss": 0.3, + "step": 23863 + }, + { + "epoch": 1.1236842105263158, + "grad_norm": 4.261871337890625, + "learning_rate": 6.832533918048514e-06, + "loss": 0.251, + "step": 23864 + }, + { + "epoch": 1.1236977753662507, + "grad_norm": 7.899229526519775, + "learning_rate": 6.832396875428259e-06, + "loss": 0.5785, + "step": 23865 + }, + { + "epoch": 1.1237113402061856, + "grad_norm": 7.278665065765381, + "learning_rate": 6.832259832808004e-06, + "loss": 0.4474, + "step": 23866 + }, + { + "epoch": 1.1237249050461204, + "grad_norm": 3.6581122875213623, + "learning_rate": 6.832122790187748e-06, + "loss": 0.2404, + "step": 23867 + }, + { + "epoch": 1.1237384698860553, + "grad_norm": 4.560868263244629, + "learning_rate": 6.831985747567494e-06, + "loss": 0.3031, + "step": 23868 + }, + { + "epoch": 1.1237520347259902, + "grad_norm": 5.034021377563477, + "learning_rate": 6.8318487049472395e-06, + "loss": 0.3254, + "step": 23869 + }, + { + "epoch": 1.123765599565925, + "grad_norm": 7.549855709075928, + "learning_rate": 6.831711662326984e-06, + "loss": 0.4958, + "step": 23870 + }, + { + "epoch": 1.12377916440586, + "grad_norm": 7.088437557220459, + "learning_rate": 6.831574619706729e-06, + "loss": 0.3928, + "step": 23871 + }, + { + "epoch": 1.123792729245795, + "grad_norm": 6.2634053230285645, + "learning_rate": 6.831437577086475e-06, + "loss": 0.3856, + "step": 23872 + }, + { + "epoch": 1.1238062940857299, + "grad_norm": 9.59735107421875, + "learning_rate": 6.831300534466219e-06, + "loss": 0.6582, + "step": 23873 + }, + { + "epoch": 1.1238198589256647, + "grad_norm": 6.503231525421143, + "learning_rate": 6.8311634918459646e-06, + "loss": 0.526, + "step": 23874 + }, + { + "epoch": 1.1238334237655996, + "grad_norm": 5.149479866027832, + "learning_rate": 6.83102644922571e-06, + "loss": 0.4075, + "step": 23875 + }, + { + "epoch": 1.1238469886055344, + "grad_norm": 9.05197525024414, + "learning_rate": 6.830889406605456e-06, + "loss": 0.5844, + "step": 23876 + }, + { + "epoch": 1.1238605534454693, + "grad_norm": 8.245757102966309, + "learning_rate": 6.8307523639852e-06, + "loss": 0.522, + "step": 23877 + }, + { + "epoch": 1.1238741182854042, + "grad_norm": 6.093125820159912, + "learning_rate": 6.830615321364945e-06, + "loss": 0.3293, + "step": 23878 + }, + { + "epoch": 1.123887683125339, + "grad_norm": 5.811999320983887, + "learning_rate": 6.83047827874469e-06, + "loss": 0.3317, + "step": 23879 + }, + { + "epoch": 1.123901247965274, + "grad_norm": 6.105108737945557, + "learning_rate": 6.830341236124435e-06, + "loss": 0.4111, + "step": 23880 + }, + { + "epoch": 1.123914812805209, + "grad_norm": 7.594334125518799, + "learning_rate": 6.830204193504181e-06, + "loss": 0.3596, + "step": 23881 + }, + { + "epoch": 1.1239283776451439, + "grad_norm": 8.698551177978516, + "learning_rate": 6.830067150883926e-06, + "loss": 0.6683, + "step": 23882 + }, + { + "epoch": 1.1239419424850787, + "grad_norm": 8.13941478729248, + "learning_rate": 6.82993010826367e-06, + "loss": 0.4996, + "step": 23883 + }, + { + "epoch": 1.1239555073250136, + "grad_norm": 7.210971355438232, + "learning_rate": 6.8297930656434155e-06, + "loss": 0.4377, + "step": 23884 + }, + { + "epoch": 1.1239690721649485, + "grad_norm": 4.658834457397461, + "learning_rate": 6.8296560230231615e-06, + "loss": 0.2883, + "step": 23885 + }, + { + "epoch": 1.1239826370048833, + "grad_norm": 9.203174591064453, + "learning_rate": 6.829518980402906e-06, + "loss": 0.6435, + "step": 23886 + }, + { + "epoch": 1.1239962018448182, + "grad_norm": 7.840362071990967, + "learning_rate": 6.829381937782651e-06, + "loss": 0.4002, + "step": 23887 + }, + { + "epoch": 1.124009766684753, + "grad_norm": 5.936525821685791, + "learning_rate": 6.829244895162395e-06, + "loss": 0.3006, + "step": 23888 + }, + { + "epoch": 1.124023331524688, + "grad_norm": 8.47031307220459, + "learning_rate": 6.829107852542141e-06, + "loss": 0.5362, + "step": 23889 + }, + { + "epoch": 1.1240368963646228, + "grad_norm": 7.607328414916992, + "learning_rate": 6.828970809921887e-06, + "loss": 0.4184, + "step": 23890 + }, + { + "epoch": 1.1240504612045579, + "grad_norm": 9.684466361999512, + "learning_rate": 6.828833767301632e-06, + "loss": 0.4879, + "step": 23891 + }, + { + "epoch": 1.1240640260444927, + "grad_norm": 6.07063102722168, + "learning_rate": 6.828696724681376e-06, + "loss": 0.3063, + "step": 23892 + }, + { + "epoch": 1.1240775908844276, + "grad_norm": 5.630934238433838, + "learning_rate": 6.828559682061121e-06, + "loss": 0.2805, + "step": 23893 + }, + { + "epoch": 1.1240911557243625, + "grad_norm": 5.132291793823242, + "learning_rate": 6.828422639440867e-06, + "loss": 0.3647, + "step": 23894 + }, + { + "epoch": 1.1241047205642973, + "grad_norm": 8.699060440063477, + "learning_rate": 6.828285596820612e-06, + "loss": 0.4051, + "step": 23895 + }, + { + "epoch": 1.1241182854042322, + "grad_norm": 8.140445709228516, + "learning_rate": 6.828148554200357e-06, + "loss": 0.4536, + "step": 23896 + }, + { + "epoch": 1.124131850244167, + "grad_norm": 6.267911911010742, + "learning_rate": 6.828011511580101e-06, + "loss": 0.4342, + "step": 23897 + }, + { + "epoch": 1.124145415084102, + "grad_norm": 7.053032398223877, + "learning_rate": 6.827874468959847e-06, + "loss": 0.4445, + "step": 23898 + }, + { + "epoch": 1.1241589799240368, + "grad_norm": 6.201353073120117, + "learning_rate": 6.827737426339592e-06, + "loss": 0.3535, + "step": 23899 + }, + { + "epoch": 1.1241725447639719, + "grad_norm": 6.9172539710998535, + "learning_rate": 6.8276003837193375e-06, + "loss": 0.3877, + "step": 23900 + }, + { + "epoch": 1.1241861096039067, + "grad_norm": 6.889996528625488, + "learning_rate": 6.827463341099082e-06, + "loss": 0.3117, + "step": 23901 + }, + { + "epoch": 1.1241996744438416, + "grad_norm": 5.151014804840088, + "learning_rate": 6.827326298478828e-06, + "loss": 0.3967, + "step": 23902 + }, + { + "epoch": 1.1242132392837765, + "grad_norm": 6.12156867980957, + "learning_rate": 6.827189255858573e-06, + "loss": 0.3388, + "step": 23903 + }, + { + "epoch": 1.1242268041237113, + "grad_norm": 5.820288181304932, + "learning_rate": 6.827052213238317e-06, + "loss": 0.3534, + "step": 23904 + }, + { + "epoch": 1.1242403689636462, + "grad_norm": 9.169063568115234, + "learning_rate": 6.8269151706180626e-06, + "loss": 0.6508, + "step": 23905 + }, + { + "epoch": 1.124253933803581, + "grad_norm": 8.900473594665527, + "learning_rate": 6.826778127997808e-06, + "loss": 0.4683, + "step": 23906 + }, + { + "epoch": 1.124267498643516, + "grad_norm": 6.333456039428711, + "learning_rate": 6.826641085377553e-06, + "loss": 0.3747, + "step": 23907 + }, + { + "epoch": 1.1242810634834508, + "grad_norm": 6.620575428009033, + "learning_rate": 6.826504042757298e-06, + "loss": 0.4677, + "step": 23908 + }, + { + "epoch": 1.1242946283233857, + "grad_norm": 3.920419454574585, + "learning_rate": 6.826367000137043e-06, + "loss": 0.2791, + "step": 23909 + }, + { + "epoch": 1.1243081931633208, + "grad_norm": 4.341580390930176, + "learning_rate": 6.826229957516788e-06, + "loss": 0.3426, + "step": 23910 + }, + { + "epoch": 1.1243217580032556, + "grad_norm": 6.664727687835693, + "learning_rate": 6.826092914896534e-06, + "loss": 0.3753, + "step": 23911 + }, + { + "epoch": 1.1243353228431905, + "grad_norm": 6.163218021392822, + "learning_rate": 6.825955872276279e-06, + "loss": 0.3757, + "step": 23912 + }, + { + "epoch": 1.1243488876831254, + "grad_norm": 5.452568531036377, + "learning_rate": 6.825818829656023e-06, + "loss": 0.3498, + "step": 23913 + }, + { + "epoch": 1.1243624525230602, + "grad_norm": 4.906445503234863, + "learning_rate": 6.825681787035768e-06, + "loss": 0.3677, + "step": 23914 + }, + { + "epoch": 1.124376017362995, + "grad_norm": 5.177765369415283, + "learning_rate": 6.825544744415514e-06, + "loss": 0.41, + "step": 23915 + }, + { + "epoch": 1.12438958220293, + "grad_norm": 7.229731559753418, + "learning_rate": 6.8254077017952595e-06, + "loss": 0.2875, + "step": 23916 + }, + { + "epoch": 1.1244031470428648, + "grad_norm": 6.679745197296143, + "learning_rate": 6.825270659175004e-06, + "loss": 0.4749, + "step": 23917 + }, + { + "epoch": 1.1244167118827997, + "grad_norm": 5.180741310119629, + "learning_rate": 6.825133616554749e-06, + "loss": 0.3009, + "step": 23918 + }, + { + "epoch": 1.1244302767227348, + "grad_norm": 7.23504638671875, + "learning_rate": 6.824996573934493e-06, + "loss": 0.4871, + "step": 23919 + }, + { + "epoch": 1.1244438415626696, + "grad_norm": 4.192681789398193, + "learning_rate": 6.824859531314239e-06, + "loss": 0.289, + "step": 23920 + }, + { + "epoch": 1.1244574064026045, + "grad_norm": 4.641148567199707, + "learning_rate": 6.824722488693985e-06, + "loss": 0.3296, + "step": 23921 + }, + { + "epoch": 1.1244709712425394, + "grad_norm": 6.1979289054870605, + "learning_rate": 6.824585446073729e-06, + "loss": 0.3317, + "step": 23922 + }, + { + "epoch": 1.1244845360824742, + "grad_norm": 5.507622718811035, + "learning_rate": 6.824448403453474e-06, + "loss": 0.4186, + "step": 23923 + }, + { + "epoch": 1.124498100922409, + "grad_norm": 4.350242614746094, + "learning_rate": 6.82431136083322e-06, + "loss": 0.3379, + "step": 23924 + }, + { + "epoch": 1.124511665762344, + "grad_norm": 5.405440807342529, + "learning_rate": 6.824174318212965e-06, + "loss": 0.2785, + "step": 23925 + }, + { + "epoch": 1.1245252306022788, + "grad_norm": 4.912943363189697, + "learning_rate": 6.82403727559271e-06, + "loss": 0.3271, + "step": 23926 + }, + { + "epoch": 1.1245387954422137, + "grad_norm": 5.4828596115112305, + "learning_rate": 6.823900232972455e-06, + "loss": 0.3452, + "step": 23927 + }, + { + "epoch": 1.1245523602821486, + "grad_norm": 6.190358638763428, + "learning_rate": 6.823763190352201e-06, + "loss": 0.3722, + "step": 23928 + }, + { + "epoch": 1.1245659251220836, + "grad_norm": 6.020147323608398, + "learning_rate": 6.823626147731945e-06, + "loss": 0.3335, + "step": 23929 + }, + { + "epoch": 1.1245794899620185, + "grad_norm": 4.251274585723877, + "learning_rate": 6.82348910511169e-06, + "loss": 0.2857, + "step": 23930 + }, + { + "epoch": 1.1245930548019534, + "grad_norm": 4.788951396942139, + "learning_rate": 6.8233520624914355e-06, + "loss": 0.3044, + "step": 23931 + }, + { + "epoch": 1.1246066196418882, + "grad_norm": 5.68598747253418, + "learning_rate": 6.82321501987118e-06, + "loss": 0.4006, + "step": 23932 + }, + { + "epoch": 1.124620184481823, + "grad_norm": 5.34420108795166, + "learning_rate": 6.823077977250926e-06, + "loss": 0.3067, + "step": 23933 + }, + { + "epoch": 1.124633749321758, + "grad_norm": 9.710805892944336, + "learning_rate": 6.822940934630671e-06, + "loss": 0.4771, + "step": 23934 + }, + { + "epoch": 1.1246473141616928, + "grad_norm": 6.951233863830566, + "learning_rate": 6.822803892010415e-06, + "loss": 0.3154, + "step": 23935 + }, + { + "epoch": 1.1246608790016277, + "grad_norm": 5.941916465759277, + "learning_rate": 6.8226668493901606e-06, + "loss": 0.3071, + "step": 23936 + }, + { + "epoch": 1.1246744438415626, + "grad_norm": 8.455886840820312, + "learning_rate": 6.822529806769907e-06, + "loss": 0.4097, + "step": 23937 + }, + { + "epoch": 1.1246880086814977, + "grad_norm": 8.170955657958984, + "learning_rate": 6.822392764149651e-06, + "loss": 0.3547, + "step": 23938 + }, + { + "epoch": 1.1247015735214325, + "grad_norm": 11.654317855834961, + "learning_rate": 6.822255721529396e-06, + "loss": 0.3734, + "step": 23939 + }, + { + "epoch": 1.1247151383613674, + "grad_norm": 5.169955253601074, + "learning_rate": 6.822118678909141e-06, + "loss": 0.2365, + "step": 23940 + }, + { + "epoch": 1.1247287032013022, + "grad_norm": 6.85722541809082, + "learning_rate": 6.8219816362888865e-06, + "loss": 0.4197, + "step": 23941 + }, + { + "epoch": 1.1247422680412371, + "grad_norm": 5.480307102203369, + "learning_rate": 6.821844593668632e-06, + "loss": 0.3186, + "step": 23942 + }, + { + "epoch": 1.124755832881172, + "grad_norm": 8.339632987976074, + "learning_rate": 6.821707551048377e-06, + "loss": 0.4294, + "step": 23943 + }, + { + "epoch": 1.1247693977211068, + "grad_norm": 9.167662620544434, + "learning_rate": 6.821570508428121e-06, + "loss": 0.483, + "step": 23944 + }, + { + "epoch": 1.1247829625610417, + "grad_norm": 7.124709129333496, + "learning_rate": 6.821433465807866e-06, + "loss": 0.3604, + "step": 23945 + }, + { + "epoch": 1.1247965274009766, + "grad_norm": 8.302637100219727, + "learning_rate": 6.821296423187612e-06, + "loss": 0.4504, + "step": 23946 + }, + { + "epoch": 1.1248100922409114, + "grad_norm": 4.797149181365967, + "learning_rate": 6.821159380567357e-06, + "loss": 0.2155, + "step": 23947 + }, + { + "epoch": 1.1248236570808465, + "grad_norm": 6.708404541015625, + "learning_rate": 6.821022337947102e-06, + "loss": 0.3202, + "step": 23948 + }, + { + "epoch": 1.1248372219207814, + "grad_norm": 5.1156158447265625, + "learning_rate": 6.820885295326847e-06, + "loss": 0.2138, + "step": 23949 + }, + { + "epoch": 1.1248507867607163, + "grad_norm": 7.445023059844971, + "learning_rate": 6.820748252706593e-06, + "loss": 0.4297, + "step": 23950 + }, + { + "epoch": 1.1248643516006511, + "grad_norm": 6.172558784484863, + "learning_rate": 6.820611210086337e-06, + "loss": 0.4186, + "step": 23951 + }, + { + "epoch": 1.124877916440586, + "grad_norm": 5.126917362213135, + "learning_rate": 6.820474167466083e-06, + "loss": 0.2717, + "step": 23952 + }, + { + "epoch": 1.1248914812805209, + "grad_norm": 5.439785003662109, + "learning_rate": 6.820337124845827e-06, + "loss": 0.455, + "step": 23953 + }, + { + "epoch": 1.1249050461204557, + "grad_norm": 5.009182929992676, + "learning_rate": 6.820200082225573e-06, + "loss": 0.34, + "step": 23954 + }, + { + "epoch": 1.1249186109603906, + "grad_norm": 4.806508541107178, + "learning_rate": 6.820063039605318e-06, + "loss": 0.2409, + "step": 23955 + }, + { + "epoch": 1.1249321758003257, + "grad_norm": 6.702024936676025, + "learning_rate": 6.8199259969850625e-06, + "loss": 0.3605, + "step": 23956 + }, + { + "epoch": 1.1249457406402605, + "grad_norm": 5.244622707366943, + "learning_rate": 6.819788954364808e-06, + "loss": 0.4111, + "step": 23957 + }, + { + "epoch": 1.1249593054801954, + "grad_norm": 4.432028293609619, + "learning_rate": 6.819651911744554e-06, + "loss": 0.2897, + "step": 23958 + }, + { + "epoch": 1.1249728703201303, + "grad_norm": 6.581399917602539, + "learning_rate": 6.819514869124299e-06, + "loss": 0.4176, + "step": 23959 + }, + { + "epoch": 1.1249864351600651, + "grad_norm": 6.991581439971924, + "learning_rate": 6.819377826504043e-06, + "loss": 0.3416, + "step": 23960 + }, + { + "epoch": 1.125, + "grad_norm": 4.7212395668029785, + "learning_rate": 6.819240783883788e-06, + "loss": 0.2973, + "step": 23961 + }, + { + "epoch": 1.1250135648399349, + "grad_norm": 6.145778656005859, + "learning_rate": 6.819103741263533e-06, + "loss": 0.4789, + "step": 23962 + }, + { + "epoch": 1.1250271296798697, + "grad_norm": 5.28273868560791, + "learning_rate": 6.818966698643279e-06, + "loss": 0.2634, + "step": 23963 + }, + { + "epoch": 1.1250406945198046, + "grad_norm": 6.697109222412109, + "learning_rate": 6.818829656023024e-06, + "loss": 0.5114, + "step": 23964 + }, + { + "epoch": 1.1250542593597395, + "grad_norm": 5.993160247802734, + "learning_rate": 6.818692613402769e-06, + "loss": 0.4077, + "step": 23965 + }, + { + "epoch": 1.1250678241996743, + "grad_norm": 4.728438377380371, + "learning_rate": 6.818555570782513e-06, + "loss": 0.1957, + "step": 23966 + }, + { + "epoch": 1.1250813890396094, + "grad_norm": 7.605262279510498, + "learning_rate": 6.8184185281622594e-06, + "loss": 0.3801, + "step": 23967 + }, + { + "epoch": 1.1250949538795443, + "grad_norm": 5.160488128662109, + "learning_rate": 6.818281485542005e-06, + "loss": 0.3567, + "step": 23968 + }, + { + "epoch": 1.1251085187194791, + "grad_norm": 7.3282952308654785, + "learning_rate": 6.818144442921749e-06, + "loss": 0.41, + "step": 23969 + }, + { + "epoch": 1.125122083559414, + "grad_norm": 5.618659496307373, + "learning_rate": 6.818007400301494e-06, + "loss": 0.4364, + "step": 23970 + }, + { + "epoch": 1.1251356483993489, + "grad_norm": 5.84668493270874, + "learning_rate": 6.81787035768124e-06, + "loss": 0.4118, + "step": 23971 + }, + { + "epoch": 1.1251492132392837, + "grad_norm": 4.605730056762695, + "learning_rate": 6.8177333150609845e-06, + "loss": 0.3581, + "step": 23972 + }, + { + "epoch": 1.1251627780792186, + "grad_norm": 5.713938236236572, + "learning_rate": 6.81759627244073e-06, + "loss": 0.2961, + "step": 23973 + }, + { + "epoch": 1.1251763429191535, + "grad_norm": 4.95175313949585, + "learning_rate": 6.817459229820475e-06, + "loss": 0.3081, + "step": 23974 + }, + { + "epoch": 1.1251899077590886, + "grad_norm": 8.577959060668945, + "learning_rate": 6.817322187200219e-06, + "loss": 0.615, + "step": 23975 + }, + { + "epoch": 1.1252034725990234, + "grad_norm": 6.347901344299316, + "learning_rate": 6.817185144579965e-06, + "loss": 0.391, + "step": 23976 + }, + { + "epoch": 1.1252170374389583, + "grad_norm": 5.544939041137695, + "learning_rate": 6.81704810195971e-06, + "loss": 0.3776, + "step": 23977 + }, + { + "epoch": 1.1252306022788932, + "grad_norm": 5.5868072509765625, + "learning_rate": 6.816911059339455e-06, + "loss": 0.5006, + "step": 23978 + }, + { + "epoch": 1.125244167118828, + "grad_norm": 5.418135166168213, + "learning_rate": 6.8167740167192e-06, + "loss": 0.1934, + "step": 23979 + }, + { + "epoch": 1.1252577319587629, + "grad_norm": 4.25268030166626, + "learning_rate": 6.816636974098946e-06, + "loss": 0.2725, + "step": 23980 + }, + { + "epoch": 1.1252712967986978, + "grad_norm": 6.472498416900635, + "learning_rate": 6.81649993147869e-06, + "loss": 0.4759, + "step": 23981 + }, + { + "epoch": 1.1252848616386326, + "grad_norm": 5.825035572052002, + "learning_rate": 6.816362888858435e-06, + "loss": 0.3356, + "step": 23982 + }, + { + "epoch": 1.1252984264785675, + "grad_norm": 6.924467086791992, + "learning_rate": 6.816225846238181e-06, + "loss": 0.3396, + "step": 23983 + }, + { + "epoch": 1.1253119913185023, + "grad_norm": 6.554255485534668, + "learning_rate": 6.816088803617927e-06, + "loss": 0.4038, + "step": 23984 + }, + { + "epoch": 1.1253255561584372, + "grad_norm": 5.855220317840576, + "learning_rate": 6.815951760997671e-06, + "loss": 0.3936, + "step": 23985 + }, + { + "epoch": 1.1253391209983723, + "grad_norm": 8.741875648498535, + "learning_rate": 6.815814718377416e-06, + "loss": 0.4504, + "step": 23986 + }, + { + "epoch": 1.1253526858383072, + "grad_norm": 6.611331939697266, + "learning_rate": 6.8156776757571605e-06, + "loss": 0.3395, + "step": 23987 + }, + { + "epoch": 1.125366250678242, + "grad_norm": 8.713025093078613, + "learning_rate": 6.815540633136906e-06, + "loss": 0.4166, + "step": 23988 + }, + { + "epoch": 1.125379815518177, + "grad_norm": 4.895071506500244, + "learning_rate": 6.815403590516652e-06, + "loss": 0.3227, + "step": 23989 + }, + { + "epoch": 1.1253933803581118, + "grad_norm": 7.7780842781066895, + "learning_rate": 6.815266547896396e-06, + "loss": 0.2835, + "step": 23990 + }, + { + "epoch": 1.1254069451980466, + "grad_norm": 6.115035533905029, + "learning_rate": 6.815129505276141e-06, + "loss": 0.3296, + "step": 23991 + }, + { + "epoch": 1.1254205100379815, + "grad_norm": 5.6035308837890625, + "learning_rate": 6.814992462655886e-06, + "loss": 0.2586, + "step": 23992 + }, + { + "epoch": 1.1254340748779164, + "grad_norm": 4.32926607131958, + "learning_rate": 6.814855420035632e-06, + "loss": 0.2964, + "step": 23993 + }, + { + "epoch": 1.1254476397178514, + "grad_norm": 5.124492168426514, + "learning_rate": 6.814718377415377e-06, + "loss": 0.3307, + "step": 23994 + }, + { + "epoch": 1.1254612045577863, + "grad_norm": 5.584092140197754, + "learning_rate": 6.814581334795122e-06, + "loss": 0.2546, + "step": 23995 + }, + { + "epoch": 1.1254747693977212, + "grad_norm": 5.995385646820068, + "learning_rate": 6.814444292174866e-06, + "loss": 0.2568, + "step": 23996 + }, + { + "epoch": 1.125488334237656, + "grad_norm": 5.117732524871826, + "learning_rate": 6.814307249554612e-06, + "loss": 0.3, + "step": 23997 + }, + { + "epoch": 1.125501899077591, + "grad_norm": 5.407106399536133, + "learning_rate": 6.8141702069343574e-06, + "loss": 0.3051, + "step": 23998 + }, + { + "epoch": 1.1255154639175258, + "grad_norm": 6.2087578773498535, + "learning_rate": 6.814033164314103e-06, + "loss": 0.2998, + "step": 23999 + }, + { + "epoch": 1.1255290287574606, + "grad_norm": 4.530264854431152, + "learning_rate": 6.813896121693847e-06, + "loss": 0.2281, + "step": 24000 + }, + { + "epoch": 1.1255425935973955, + "grad_norm": 5.931772708892822, + "learning_rate": 6.813759079073592e-06, + "loss": 0.2278, + "step": 24001 + }, + { + "epoch": 1.1255561584373304, + "grad_norm": 5.790008068084717, + "learning_rate": 6.813622036453338e-06, + "loss": 0.3893, + "step": 24002 + }, + { + "epoch": 1.1255697232772652, + "grad_norm": 5.293116092681885, + "learning_rate": 6.8134849938330825e-06, + "loss": 0.2501, + "step": 24003 + }, + { + "epoch": 1.1255832881172, + "grad_norm": 4.847490310668945, + "learning_rate": 6.813347951212828e-06, + "loss": 0.3256, + "step": 24004 + }, + { + "epoch": 1.1255968529571352, + "grad_norm": 4.0271525382995605, + "learning_rate": 6.813210908592572e-06, + "loss": 0.1633, + "step": 24005 + }, + { + "epoch": 1.12561041779707, + "grad_norm": 4.410797595977783, + "learning_rate": 6.813073865972318e-06, + "loss": 0.1657, + "step": 24006 + }, + { + "epoch": 1.125623982637005, + "grad_norm": 6.015814304351807, + "learning_rate": 6.812936823352063e-06, + "loss": 0.4266, + "step": 24007 + }, + { + "epoch": 1.1256375474769398, + "grad_norm": 4.993568420410156, + "learning_rate": 6.812799780731808e-06, + "loss": 0.2259, + "step": 24008 + }, + { + "epoch": 1.1256511123168746, + "grad_norm": 6.767361164093018, + "learning_rate": 6.812662738111553e-06, + "loss": 0.4798, + "step": 24009 + }, + { + "epoch": 1.1256646771568095, + "grad_norm": 7.6727375984191895, + "learning_rate": 6.812525695491299e-06, + "loss": 0.4467, + "step": 24010 + }, + { + "epoch": 1.1256782419967444, + "grad_norm": 5.187757968902588, + "learning_rate": 6.812388652871044e-06, + "loss": 0.3757, + "step": 24011 + }, + { + "epoch": 1.1256918068366792, + "grad_norm": 6.051368713378906, + "learning_rate": 6.812251610250788e-06, + "loss": 0.3419, + "step": 24012 + }, + { + "epoch": 1.1257053716766143, + "grad_norm": 7.469363689422607, + "learning_rate": 6.812114567630533e-06, + "loss": 0.4494, + "step": 24013 + }, + { + "epoch": 1.1257189365165492, + "grad_norm": 5.797708034515381, + "learning_rate": 6.811977525010279e-06, + "loss": 0.2571, + "step": 24014 + }, + { + "epoch": 1.125732501356484, + "grad_norm": 6.496171951293945, + "learning_rate": 6.811840482390024e-06, + "loss": 0.3183, + "step": 24015 + }, + { + "epoch": 1.125746066196419, + "grad_norm": 3.3874106407165527, + "learning_rate": 6.811703439769769e-06, + "loss": 0.1741, + "step": 24016 + }, + { + "epoch": 1.1257596310363538, + "grad_norm": 5.105086326599121, + "learning_rate": 6.811566397149514e-06, + "loss": 0.32, + "step": 24017 + }, + { + "epoch": 1.1257731958762887, + "grad_norm": 5.2904839515686035, + "learning_rate": 6.8114293545292585e-06, + "loss": 0.2931, + "step": 24018 + }, + { + "epoch": 1.1257867607162235, + "grad_norm": 6.066659450531006, + "learning_rate": 6.8112923119090045e-06, + "loss": 0.2734, + "step": 24019 + }, + { + "epoch": 1.1258003255561584, + "grad_norm": 7.037964344024658, + "learning_rate": 6.81115526928875e-06, + "loss": 0.434, + "step": 24020 + }, + { + "epoch": 1.1258138903960933, + "grad_norm": 8.632946014404297, + "learning_rate": 6.811018226668494e-06, + "loss": 0.4894, + "step": 24021 + }, + { + "epoch": 1.1258274552360281, + "grad_norm": 5.2057929039001465, + "learning_rate": 6.810881184048239e-06, + "loss": 0.4514, + "step": 24022 + }, + { + "epoch": 1.125841020075963, + "grad_norm": 4.37067174911499, + "learning_rate": 6.810744141427985e-06, + "loss": 0.2747, + "step": 24023 + }, + { + "epoch": 1.125854584915898, + "grad_norm": 5.343693733215332, + "learning_rate": 6.81060709880773e-06, + "loss": 0.264, + "step": 24024 + }, + { + "epoch": 1.125868149755833, + "grad_norm": 5.149224281311035, + "learning_rate": 6.810470056187475e-06, + "loss": 0.3291, + "step": 24025 + }, + { + "epoch": 1.1258817145957678, + "grad_norm": 6.20185661315918, + "learning_rate": 6.81033301356722e-06, + "loss": 0.3812, + "step": 24026 + }, + { + "epoch": 1.1258952794357027, + "grad_norm": 6.814356327056885, + "learning_rate": 6.810195970946966e-06, + "loss": 0.4471, + "step": 24027 + }, + { + "epoch": 1.1259088442756375, + "grad_norm": 7.154666900634766, + "learning_rate": 6.81005892832671e-06, + "loss": 0.4184, + "step": 24028 + }, + { + "epoch": 1.1259224091155724, + "grad_norm": 4.939698696136475, + "learning_rate": 6.8099218857064554e-06, + "loss": 0.3288, + "step": 24029 + }, + { + "epoch": 1.1259359739555073, + "grad_norm": 5.0026092529296875, + "learning_rate": 6.8097848430862e-06, + "loss": 0.2513, + "step": 24030 + }, + { + "epoch": 1.1259495387954421, + "grad_norm": 5.526499271392822, + "learning_rate": 6.809647800465945e-06, + "loss": 0.2932, + "step": 24031 + }, + { + "epoch": 1.1259631036353772, + "grad_norm": 4.579206943511963, + "learning_rate": 6.809510757845691e-06, + "loss": 0.2133, + "step": 24032 + }, + { + "epoch": 1.125976668475312, + "grad_norm": 4.210745334625244, + "learning_rate": 6.809373715225436e-06, + "loss": 0.2309, + "step": 24033 + }, + { + "epoch": 1.125990233315247, + "grad_norm": 7.387037754058838, + "learning_rate": 6.8092366726051805e-06, + "loss": 0.4167, + "step": 24034 + }, + { + "epoch": 1.1260037981551818, + "grad_norm": 4.629876136779785, + "learning_rate": 6.809099629984926e-06, + "loss": 0.2685, + "step": 24035 + }, + { + "epoch": 1.1260173629951167, + "grad_norm": 4.662050724029541, + "learning_rate": 6.808962587364672e-06, + "loss": 0.3299, + "step": 24036 + }, + { + "epoch": 1.1260309278350515, + "grad_norm": 5.963978290557861, + "learning_rate": 6.808825544744416e-06, + "loss": 0.3422, + "step": 24037 + }, + { + "epoch": 1.1260444926749864, + "grad_norm": 6.244719982147217, + "learning_rate": 6.808688502124161e-06, + "loss": 0.3832, + "step": 24038 + }, + { + "epoch": 1.1260580575149213, + "grad_norm": 5.199429512023926, + "learning_rate": 6.8085514595039055e-06, + "loss": 0.2679, + "step": 24039 + }, + { + "epoch": 1.1260716223548561, + "grad_norm": 8.362589836120605, + "learning_rate": 6.8084144168836516e-06, + "loss": 0.3816, + "step": 24040 + }, + { + "epoch": 1.126085187194791, + "grad_norm": 5.994693756103516, + "learning_rate": 6.808277374263397e-06, + "loss": 0.3265, + "step": 24041 + }, + { + "epoch": 1.1260987520347259, + "grad_norm": 4.680715560913086, + "learning_rate": 6.808140331643142e-06, + "loss": 0.2757, + "step": 24042 + }, + { + "epoch": 1.126112316874661, + "grad_norm": 6.835334300994873, + "learning_rate": 6.808003289022886e-06, + "loss": 0.3145, + "step": 24043 + }, + { + "epoch": 1.1261258817145958, + "grad_norm": 4.75342321395874, + "learning_rate": 6.807866246402631e-06, + "loss": 0.3238, + "step": 24044 + }, + { + "epoch": 1.1261394465545307, + "grad_norm": 6.782919406890869, + "learning_rate": 6.8077292037823774e-06, + "loss": 0.3035, + "step": 24045 + }, + { + "epoch": 1.1261530113944656, + "grad_norm": 8.093413352966309, + "learning_rate": 6.807592161162122e-06, + "loss": 0.4519, + "step": 24046 + }, + { + "epoch": 1.1261665762344004, + "grad_norm": 5.484843730926514, + "learning_rate": 6.807455118541867e-06, + "loss": 0.3672, + "step": 24047 + }, + { + "epoch": 1.1261801410743353, + "grad_norm": 6.114847660064697, + "learning_rate": 6.807318075921612e-06, + "loss": 0.3706, + "step": 24048 + }, + { + "epoch": 1.1261937059142701, + "grad_norm": 5.515469074249268, + "learning_rate": 6.807181033301357e-06, + "loss": 0.2436, + "step": 24049 + }, + { + "epoch": 1.126207270754205, + "grad_norm": 6.126840591430664, + "learning_rate": 6.8070439906811025e-06, + "loss": 0.4168, + "step": 24050 + }, + { + "epoch": 1.12622083559414, + "grad_norm": 6.30515718460083, + "learning_rate": 6.806906948060848e-06, + "loss": 0.307, + "step": 24051 + }, + { + "epoch": 1.126234400434075, + "grad_norm": 5.316649436950684, + "learning_rate": 6.806769905440592e-06, + "loss": 0.2902, + "step": 24052 + }, + { + "epoch": 1.1262479652740098, + "grad_norm": 4.8884429931640625, + "learning_rate": 6.806632862820338e-06, + "loss": 0.2841, + "step": 24053 + }, + { + "epoch": 1.1262615301139447, + "grad_norm": 5.534022331237793, + "learning_rate": 6.806495820200083e-06, + "loss": 0.2378, + "step": 24054 + }, + { + "epoch": 1.1262750949538796, + "grad_norm": 5.382201194763184, + "learning_rate": 6.8063587775798275e-06, + "loss": 0.344, + "step": 24055 + }, + { + "epoch": 1.1262886597938144, + "grad_norm": 6.793072700500488, + "learning_rate": 6.806221734959573e-06, + "loss": 0.2882, + "step": 24056 + }, + { + "epoch": 1.1263022246337493, + "grad_norm": 6.529806613922119, + "learning_rate": 6.806084692339318e-06, + "loss": 0.399, + "step": 24057 + }, + { + "epoch": 1.1263157894736842, + "grad_norm": 6.048456192016602, + "learning_rate": 6.805947649719064e-06, + "loss": 0.3884, + "step": 24058 + }, + { + "epoch": 1.126329354313619, + "grad_norm": 5.929568290710449, + "learning_rate": 6.805810607098808e-06, + "loss": 0.294, + "step": 24059 + }, + { + "epoch": 1.126342919153554, + "grad_norm": 5.1931891441345215, + "learning_rate": 6.8056735644785534e-06, + "loss": 0.2671, + "step": 24060 + }, + { + "epoch": 1.1263564839934888, + "grad_norm": 6.137012004852295, + "learning_rate": 6.805536521858298e-06, + "loss": 0.4143, + "step": 24061 + }, + { + "epoch": 1.1263700488334238, + "grad_norm": 3.8829054832458496, + "learning_rate": 6.805399479238044e-06, + "loss": 0.1671, + "step": 24062 + }, + { + "epoch": 1.1263836136733587, + "grad_norm": 6.391739845275879, + "learning_rate": 6.805262436617789e-06, + "loss": 0.3688, + "step": 24063 + }, + { + "epoch": 1.1263971785132936, + "grad_norm": 5.462869167327881, + "learning_rate": 6.805125393997533e-06, + "loss": 0.3052, + "step": 24064 + }, + { + "epoch": 1.1264107433532284, + "grad_norm": 6.110136032104492, + "learning_rate": 6.8049883513772785e-06, + "loss": 0.3016, + "step": 24065 + }, + { + "epoch": 1.1264243081931633, + "grad_norm": 4.212646961212158, + "learning_rate": 6.8048513087570245e-06, + "loss": 0.2115, + "step": 24066 + }, + { + "epoch": 1.1264378730330982, + "grad_norm": 5.151261806488037, + "learning_rate": 6.80471426613677e-06, + "loss": 0.2762, + "step": 24067 + }, + { + "epoch": 1.126451437873033, + "grad_norm": 6.223517417907715, + "learning_rate": 6.804577223516514e-06, + "loss": 0.2984, + "step": 24068 + }, + { + "epoch": 1.126465002712968, + "grad_norm": 5.1070404052734375, + "learning_rate": 6.804440180896259e-06, + "loss": 0.3084, + "step": 24069 + }, + { + "epoch": 1.126478567552903, + "grad_norm": 5.683332920074463, + "learning_rate": 6.8043031382760035e-06, + "loss": 0.3765, + "step": 24070 + }, + { + "epoch": 1.1264921323928379, + "grad_norm": 5.674259185791016, + "learning_rate": 6.8041660956557496e-06, + "loss": 0.3329, + "step": 24071 + }, + { + "epoch": 1.1265056972327727, + "grad_norm": 5.230401039123535, + "learning_rate": 6.804029053035495e-06, + "loss": 0.2673, + "step": 24072 + }, + { + "epoch": 1.1265192620727076, + "grad_norm": 5.358237266540527, + "learning_rate": 6.80389201041524e-06, + "loss": 0.3076, + "step": 24073 + }, + { + "epoch": 1.1265328269126424, + "grad_norm": 4.969832420349121, + "learning_rate": 6.803754967794984e-06, + "loss": 0.3053, + "step": 24074 + }, + { + "epoch": 1.1265463917525773, + "grad_norm": 4.725507736206055, + "learning_rate": 6.80361792517473e-06, + "loss": 0.2202, + "step": 24075 + }, + { + "epoch": 1.1265599565925122, + "grad_norm": 5.13160514831543, + "learning_rate": 6.8034808825544755e-06, + "loss": 0.2999, + "step": 24076 + }, + { + "epoch": 1.126573521432447, + "grad_norm": 4.563857078552246, + "learning_rate": 6.80334383993422e-06, + "loss": 0.2832, + "step": 24077 + }, + { + "epoch": 1.126587086272382, + "grad_norm": 6.054571151733398, + "learning_rate": 6.803206797313965e-06, + "loss": 0.2444, + "step": 24078 + }, + { + "epoch": 1.1266006511123168, + "grad_norm": 6.090960502624512, + "learning_rate": 6.803069754693711e-06, + "loss": 0.2436, + "step": 24079 + }, + { + "epoch": 1.1266142159522516, + "grad_norm": 7.336958885192871, + "learning_rate": 6.802932712073455e-06, + "loss": 0.3714, + "step": 24080 + }, + { + "epoch": 1.1266277807921867, + "grad_norm": 7.107936382293701, + "learning_rate": 6.8027956694532005e-06, + "loss": 0.372, + "step": 24081 + }, + { + "epoch": 1.1266413456321216, + "grad_norm": 7.372454643249512, + "learning_rate": 6.802658626832946e-06, + "loss": 0.385, + "step": 24082 + }, + { + "epoch": 1.1266549104720565, + "grad_norm": 4.069204330444336, + "learning_rate": 6.80252158421269e-06, + "loss": 0.2625, + "step": 24083 + }, + { + "epoch": 1.1266684753119913, + "grad_norm": 5.930985450744629, + "learning_rate": 6.802384541592436e-06, + "loss": 0.3782, + "step": 24084 + }, + { + "epoch": 1.1266820401519262, + "grad_norm": 5.208158016204834, + "learning_rate": 6.802247498972181e-06, + "loss": 0.2389, + "step": 24085 + }, + { + "epoch": 1.126695604991861, + "grad_norm": 4.834250450134277, + "learning_rate": 6.8021104563519255e-06, + "loss": 0.2798, + "step": 24086 + }, + { + "epoch": 1.126709169831796, + "grad_norm": 7.021073341369629, + "learning_rate": 6.801973413731671e-06, + "loss": 0.3384, + "step": 24087 + }, + { + "epoch": 1.1267227346717308, + "grad_norm": 6.288609504699707, + "learning_rate": 6.801836371111417e-06, + "loss": 0.3578, + "step": 24088 + }, + { + "epoch": 1.1267362995116659, + "grad_norm": 5.404551029205322, + "learning_rate": 6.801699328491161e-06, + "loss": 0.2746, + "step": 24089 + }, + { + "epoch": 1.1267498643516007, + "grad_norm": 6.786624908447266, + "learning_rate": 6.801562285870906e-06, + "loss": 0.368, + "step": 24090 + }, + { + "epoch": 1.1267634291915356, + "grad_norm": 5.508828639984131, + "learning_rate": 6.8014252432506514e-06, + "loss": 0.2407, + "step": 24091 + }, + { + "epoch": 1.1267769940314705, + "grad_norm": 5.8533854484558105, + "learning_rate": 6.8012882006303975e-06, + "loss": 0.281, + "step": 24092 + }, + { + "epoch": 1.1267905588714053, + "grad_norm": 6.514754772186279, + "learning_rate": 6.801151158010142e-06, + "loss": 0.4365, + "step": 24093 + }, + { + "epoch": 1.1268041237113402, + "grad_norm": 5.656343936920166, + "learning_rate": 6.801014115389887e-06, + "loss": 0.2776, + "step": 24094 + }, + { + "epoch": 1.126817688551275, + "grad_norm": 6.494290351867676, + "learning_rate": 6.800877072769631e-06, + "loss": 0.4719, + "step": 24095 + }, + { + "epoch": 1.12683125339121, + "grad_norm": 4.559521675109863, + "learning_rate": 6.800740030149377e-06, + "loss": 0.1752, + "step": 24096 + }, + { + "epoch": 1.1268448182311448, + "grad_norm": 5.181346416473389, + "learning_rate": 6.8006029875291225e-06, + "loss": 0.2173, + "step": 24097 + }, + { + "epoch": 1.1268583830710797, + "grad_norm": 5.254547595977783, + "learning_rate": 6.800465944908867e-06, + "loss": 0.3141, + "step": 24098 + }, + { + "epoch": 1.1268719479110145, + "grad_norm": 8.045077323913574, + "learning_rate": 6.800328902288612e-06, + "loss": 0.4472, + "step": 24099 + }, + { + "epoch": 1.1268855127509496, + "grad_norm": 6.888916969299316, + "learning_rate": 6.800191859668357e-06, + "loss": 0.3626, + "step": 24100 + }, + { + "epoch": 1.1268990775908845, + "grad_norm": 7.437878131866455, + "learning_rate": 6.800054817048103e-06, + "loss": 0.4031, + "step": 24101 + }, + { + "epoch": 1.1269126424308193, + "grad_norm": 6.572333335876465, + "learning_rate": 6.7999177744278476e-06, + "loss": 0.5319, + "step": 24102 + }, + { + "epoch": 1.1269262072707542, + "grad_norm": 5.2567362785339355, + "learning_rate": 6.799780731807593e-06, + "loss": 0.355, + "step": 24103 + }, + { + "epoch": 1.126939772110689, + "grad_norm": 6.007039546966553, + "learning_rate": 6.799643689187337e-06, + "loss": 0.3222, + "step": 24104 + }, + { + "epoch": 1.126953336950624, + "grad_norm": 6.347208499908447, + "learning_rate": 6.799506646567083e-06, + "loss": 0.342, + "step": 24105 + }, + { + "epoch": 1.1269669017905588, + "grad_norm": 6.507862091064453, + "learning_rate": 6.799369603946828e-06, + "loss": 0.4212, + "step": 24106 + }, + { + "epoch": 1.1269804666304937, + "grad_norm": 5.796528339385986, + "learning_rate": 6.7992325613265735e-06, + "loss": 0.2824, + "step": 24107 + }, + { + "epoch": 1.1269940314704288, + "grad_norm": 6.625929832458496, + "learning_rate": 6.799095518706318e-06, + "loss": 0.3846, + "step": 24108 + }, + { + "epoch": 1.1270075963103636, + "grad_norm": 5.7558770179748535, + "learning_rate": 6.798958476086064e-06, + "loss": 0.2144, + "step": 24109 + }, + { + "epoch": 1.1270211611502985, + "grad_norm": 6.123904705047607, + "learning_rate": 6.798821433465809e-06, + "loss": 0.3425, + "step": 24110 + }, + { + "epoch": 1.1270347259902334, + "grad_norm": 6.549344539642334, + "learning_rate": 6.798684390845553e-06, + "loss": 0.3082, + "step": 24111 + }, + { + "epoch": 1.1270482908301682, + "grad_norm": 4.646603107452393, + "learning_rate": 6.7985473482252985e-06, + "loss": 0.3051, + "step": 24112 + }, + { + "epoch": 1.127061855670103, + "grad_norm": 5.094473361968994, + "learning_rate": 6.798410305605043e-06, + "loss": 0.3152, + "step": 24113 + }, + { + "epoch": 1.127075420510038, + "grad_norm": 7.592445373535156, + "learning_rate": 6.798273262984789e-06, + "loss": 0.4518, + "step": 24114 + }, + { + "epoch": 1.1270889853499728, + "grad_norm": 6.145132064819336, + "learning_rate": 6.798136220364534e-06, + "loss": 0.2854, + "step": 24115 + }, + { + "epoch": 1.1271025501899077, + "grad_norm": 7.2447004318237305, + "learning_rate": 6.797999177744279e-06, + "loss": 0.411, + "step": 24116 + }, + { + "epoch": 1.1271161150298425, + "grad_norm": 7.878645420074463, + "learning_rate": 6.7978621351240236e-06, + "loss": 0.2568, + "step": 24117 + }, + { + "epoch": 1.1271296798697774, + "grad_norm": 5.918735027313232, + "learning_rate": 6.79772509250377e-06, + "loss": 0.4485, + "step": 24118 + }, + { + "epoch": 1.1271432447097125, + "grad_norm": 7.916383743286133, + "learning_rate": 6.797588049883515e-06, + "loss": 0.3624, + "step": 24119 + }, + { + "epoch": 1.1271568095496474, + "grad_norm": 7.657289981842041, + "learning_rate": 6.797451007263259e-06, + "loss": 0.4655, + "step": 24120 + }, + { + "epoch": 1.1271703743895822, + "grad_norm": 5.462906360626221, + "learning_rate": 6.797313964643004e-06, + "loss": 0.2616, + "step": 24121 + }, + { + "epoch": 1.127183939229517, + "grad_norm": 6.068243503570557, + "learning_rate": 6.79717692202275e-06, + "loss": 0.2763, + "step": 24122 + }, + { + "epoch": 1.127197504069452, + "grad_norm": 6.03140926361084, + "learning_rate": 6.797039879402495e-06, + "loss": 0.3135, + "step": 24123 + }, + { + "epoch": 1.1272110689093868, + "grad_norm": 6.5339179039001465, + "learning_rate": 6.79690283678224e-06, + "loss": 0.319, + "step": 24124 + }, + { + "epoch": 1.1272246337493217, + "grad_norm": 5.273947715759277, + "learning_rate": 6.796765794161985e-06, + "loss": 0.3266, + "step": 24125 + }, + { + "epoch": 1.1272381985892566, + "grad_norm": 5.736328125, + "learning_rate": 6.796628751541729e-06, + "loss": 0.4393, + "step": 24126 + }, + { + "epoch": 1.1272517634291916, + "grad_norm": 6.801814556121826, + "learning_rate": 6.796491708921475e-06, + "loss": 0.3298, + "step": 24127 + }, + { + "epoch": 1.1272653282691265, + "grad_norm": 5.926294803619385, + "learning_rate": 6.7963546663012205e-06, + "loss": 0.3217, + "step": 24128 + }, + { + "epoch": 1.1272788931090614, + "grad_norm": 7.203611850738525, + "learning_rate": 6.796217623680965e-06, + "loss": 0.3783, + "step": 24129 + }, + { + "epoch": 1.1272924579489962, + "grad_norm": 7.490443706512451, + "learning_rate": 6.79608058106071e-06, + "loss": 0.3646, + "step": 24130 + }, + { + "epoch": 1.127306022788931, + "grad_norm": 5.700395584106445, + "learning_rate": 6.795943538440456e-06, + "loss": 0.2832, + "step": 24131 + }, + { + "epoch": 1.127319587628866, + "grad_norm": 3.856541395187378, + "learning_rate": 6.7958064958202e-06, + "loss": 0.2106, + "step": 24132 + }, + { + "epoch": 1.1273331524688008, + "grad_norm": 6.654622554779053, + "learning_rate": 6.7956694531999456e-06, + "loss": 0.4843, + "step": 24133 + }, + { + "epoch": 1.1273467173087357, + "grad_norm": 4.968522548675537, + "learning_rate": 6.795532410579691e-06, + "loss": 0.2869, + "step": 24134 + }, + { + "epoch": 1.1273602821486706, + "grad_norm": 6.413270473480225, + "learning_rate": 6.795395367959437e-06, + "loss": 0.3571, + "step": 24135 + }, + { + "epoch": 1.1273738469886054, + "grad_norm": 4.69690465927124, + "learning_rate": 6.795258325339181e-06, + "loss": 0.1909, + "step": 24136 + }, + { + "epoch": 1.1273874118285405, + "grad_norm": 7.8927388191223145, + "learning_rate": 6.795121282718926e-06, + "loss": 0.3299, + "step": 24137 + }, + { + "epoch": 1.1274009766684754, + "grad_norm": 6.424910068511963, + "learning_rate": 6.794984240098671e-06, + "loss": 0.2912, + "step": 24138 + }, + { + "epoch": 1.1274145415084103, + "grad_norm": 5.248989582061768, + "learning_rate": 6.794847197478416e-06, + "loss": 0.2825, + "step": 24139 + }, + { + "epoch": 1.1274281063483451, + "grad_norm": 6.96895170211792, + "learning_rate": 6.794710154858162e-06, + "loss": 0.4708, + "step": 24140 + }, + { + "epoch": 1.12744167118828, + "grad_norm": 6.863440990447998, + "learning_rate": 6.794573112237907e-06, + "loss": 0.4028, + "step": 24141 + }, + { + "epoch": 1.1274552360282148, + "grad_norm": 7.6349616050720215, + "learning_rate": 6.794436069617651e-06, + "loss": 0.3643, + "step": 24142 + }, + { + "epoch": 1.1274688008681497, + "grad_norm": 5.355743885040283, + "learning_rate": 6.7942990269973965e-06, + "loss": 0.2733, + "step": 24143 + }, + { + "epoch": 1.1274823657080846, + "grad_norm": 6.82003927230835, + "learning_rate": 6.7941619843771425e-06, + "loss": 0.4143, + "step": 24144 + }, + { + "epoch": 1.1274959305480194, + "grad_norm": 7.100400924682617, + "learning_rate": 6.794024941756887e-06, + "loss": 0.4468, + "step": 24145 + }, + { + "epoch": 1.1275094953879545, + "grad_norm": 5.698207378387451, + "learning_rate": 6.793887899136632e-06, + "loss": 0.4126, + "step": 24146 + }, + { + "epoch": 1.1275230602278894, + "grad_norm": 8.451215744018555, + "learning_rate": 6.793750856516376e-06, + "loss": 0.5401, + "step": 24147 + }, + { + "epoch": 1.1275366250678243, + "grad_norm": 6.155975341796875, + "learning_rate": 6.793613813896122e-06, + "loss": 0.3615, + "step": 24148 + }, + { + "epoch": 1.1275501899077591, + "grad_norm": 7.306441307067871, + "learning_rate": 6.793476771275868e-06, + "loss": 0.4303, + "step": 24149 + }, + { + "epoch": 1.127563754747694, + "grad_norm": 7.143307685852051, + "learning_rate": 6.793339728655613e-06, + "loss": 0.5118, + "step": 24150 + }, + { + "epoch": 1.1275773195876289, + "grad_norm": 5.867368221282959, + "learning_rate": 6.793202686035357e-06, + "loss": 0.3242, + "step": 24151 + }, + { + "epoch": 1.1275908844275637, + "grad_norm": 6.330973148345947, + "learning_rate": 6.793065643415102e-06, + "loss": 0.3388, + "step": 24152 + }, + { + "epoch": 1.1276044492674986, + "grad_norm": 7.918400287628174, + "learning_rate": 6.792928600794848e-06, + "loss": 0.3335, + "step": 24153 + }, + { + "epoch": 1.1276180141074335, + "grad_norm": 8.275547981262207, + "learning_rate": 6.792791558174593e-06, + "loss": 0.3887, + "step": 24154 + }, + { + "epoch": 1.1276315789473683, + "grad_norm": 6.234898567199707, + "learning_rate": 6.792654515554338e-06, + "loss": 0.3677, + "step": 24155 + }, + { + "epoch": 1.1276451437873034, + "grad_norm": 9.656359672546387, + "learning_rate": 6.792517472934083e-06, + "loss": 0.5734, + "step": 24156 + }, + { + "epoch": 1.1276587086272383, + "grad_norm": 5.452658176422119, + "learning_rate": 6.792380430313828e-06, + "loss": 0.2749, + "step": 24157 + }, + { + "epoch": 1.1276722734671731, + "grad_norm": 7.766286849975586, + "learning_rate": 6.792243387693573e-06, + "loss": 0.5639, + "step": 24158 + }, + { + "epoch": 1.127685838307108, + "grad_norm": 7.155220985412598, + "learning_rate": 6.7921063450733185e-06, + "loss": 0.4368, + "step": 24159 + }, + { + "epoch": 1.1276994031470429, + "grad_norm": 8.10774040222168, + "learning_rate": 6.791969302453063e-06, + "loss": 0.4581, + "step": 24160 + }, + { + "epoch": 1.1277129679869777, + "grad_norm": 9.54722785949707, + "learning_rate": 6.791832259832809e-06, + "loss": 0.5378, + "step": 24161 + }, + { + "epoch": 1.1277265328269126, + "grad_norm": 5.226043224334717, + "learning_rate": 6.791695217212554e-06, + "loss": 0.3466, + "step": 24162 + }, + { + "epoch": 1.1277400976668475, + "grad_norm": 7.055328369140625, + "learning_rate": 6.791558174592298e-06, + "loss": 0.3287, + "step": 24163 + }, + { + "epoch": 1.1277536625067823, + "grad_norm": 7.456175804138184, + "learning_rate": 6.7914211319720436e-06, + "loss": 0.4819, + "step": 24164 + }, + { + "epoch": 1.1277672273467174, + "grad_norm": 7.740454196929932, + "learning_rate": 6.79128408935179e-06, + "loss": 0.5337, + "step": 24165 + }, + { + "epoch": 1.1277807921866523, + "grad_norm": 5.737825393676758, + "learning_rate": 6.791147046731535e-06, + "loss": 0.2357, + "step": 24166 + }, + { + "epoch": 1.1277943570265871, + "grad_norm": 5.809087753295898, + "learning_rate": 6.791010004111279e-06, + "loss": 0.5014, + "step": 24167 + }, + { + "epoch": 1.127807921866522, + "grad_norm": 6.667291164398193, + "learning_rate": 6.790872961491024e-06, + "loss": 0.2974, + "step": 24168 + }, + { + "epoch": 1.1278214867064569, + "grad_norm": 6.988913059234619, + "learning_rate": 6.790735918870769e-06, + "loss": 0.4033, + "step": 24169 + }, + { + "epoch": 1.1278350515463917, + "grad_norm": 6.909523010253906, + "learning_rate": 6.790598876250515e-06, + "loss": 0.4564, + "step": 24170 + }, + { + "epoch": 1.1278486163863266, + "grad_norm": 6.015419960021973, + "learning_rate": 6.79046183363026e-06, + "loss": 0.3018, + "step": 24171 + }, + { + "epoch": 1.1278621812262615, + "grad_norm": 7.392580509185791, + "learning_rate": 6.790324791010004e-06, + "loss": 0.4304, + "step": 24172 + }, + { + "epoch": 1.1278757460661963, + "grad_norm": 5.634634971618652, + "learning_rate": 6.790187748389749e-06, + "loss": 0.2238, + "step": 24173 + }, + { + "epoch": 1.1278893109061312, + "grad_norm": 7.2596025466918945, + "learning_rate": 6.790050705769495e-06, + "loss": 0.4123, + "step": 24174 + }, + { + "epoch": 1.1279028757460663, + "grad_norm": 5.442160606384277, + "learning_rate": 6.7899136631492405e-06, + "loss": 0.2773, + "step": 24175 + }, + { + "epoch": 1.1279164405860012, + "grad_norm": 5.607588291168213, + "learning_rate": 6.789776620528985e-06, + "loss": 0.3374, + "step": 24176 + }, + { + "epoch": 1.127930005425936, + "grad_norm": 5.881280899047852, + "learning_rate": 6.78963957790873e-06, + "loss": 0.2429, + "step": 24177 + }, + { + "epoch": 1.1279435702658709, + "grad_norm": 5.3997368812561035, + "learning_rate": 6.789502535288476e-06, + "loss": 0.2915, + "step": 24178 + }, + { + "epoch": 1.1279571351058058, + "grad_norm": 5.240189075469971, + "learning_rate": 6.78936549266822e-06, + "loss": 0.2037, + "step": 24179 + }, + { + "epoch": 1.1279706999457406, + "grad_norm": 5.354188919067383, + "learning_rate": 6.789228450047966e-06, + "loss": 0.298, + "step": 24180 + }, + { + "epoch": 1.1279842647856755, + "grad_norm": 3.814302444458008, + "learning_rate": 6.78909140742771e-06, + "loss": 0.2657, + "step": 24181 + }, + { + "epoch": 1.1279978296256103, + "grad_norm": 6.658013343811035, + "learning_rate": 6.788954364807455e-06, + "loss": 0.4803, + "step": 24182 + }, + { + "epoch": 1.1280113944655452, + "grad_norm": 6.653602123260498, + "learning_rate": 6.788817322187201e-06, + "loss": 0.298, + "step": 24183 + }, + { + "epoch": 1.1280249593054803, + "grad_norm": 6.292117118835449, + "learning_rate": 6.788680279566946e-06, + "loss": 0.2525, + "step": 24184 + }, + { + "epoch": 1.1280385241454152, + "grad_norm": 5.08930778503418, + "learning_rate": 6.788543236946691e-06, + "loss": 0.2586, + "step": 24185 + }, + { + "epoch": 1.12805208898535, + "grad_norm": 7.9239606857299805, + "learning_rate": 6.788406194326436e-06, + "loss": 0.4167, + "step": 24186 + }, + { + "epoch": 1.128065653825285, + "grad_norm": 7.2348856925964355, + "learning_rate": 6.788269151706182e-06, + "loss": 0.2962, + "step": 24187 + }, + { + "epoch": 1.1280792186652198, + "grad_norm": 6.5736775398254395, + "learning_rate": 6.788132109085926e-06, + "loss": 0.2668, + "step": 24188 + }, + { + "epoch": 1.1280927835051546, + "grad_norm": 5.002135753631592, + "learning_rate": 6.787995066465671e-06, + "loss": 0.2297, + "step": 24189 + }, + { + "epoch": 1.1281063483450895, + "grad_norm": 7.569725036621094, + "learning_rate": 6.7878580238454165e-06, + "loss": 0.3746, + "step": 24190 + }, + { + "epoch": 1.1281199131850244, + "grad_norm": 4.998769283294678, + "learning_rate": 6.787720981225162e-06, + "loss": 0.1832, + "step": 24191 + }, + { + "epoch": 1.1281334780249592, + "grad_norm": 7.251937389373779, + "learning_rate": 6.787583938604907e-06, + "loss": 0.2391, + "step": 24192 + }, + { + "epoch": 1.128147042864894, + "grad_norm": 5.767828941345215, + "learning_rate": 6.787446895984652e-06, + "loss": 0.3563, + "step": 24193 + }, + { + "epoch": 1.1281606077048292, + "grad_norm": 4.473402976989746, + "learning_rate": 6.787309853364396e-06, + "loss": 0.2128, + "step": 24194 + }, + { + "epoch": 1.128174172544764, + "grad_norm": 4.346348285675049, + "learning_rate": 6.7871728107441416e-06, + "loss": 0.1892, + "step": 24195 + }, + { + "epoch": 1.128187737384699, + "grad_norm": 4.527686595916748, + "learning_rate": 6.787035768123888e-06, + "loss": 0.1985, + "step": 24196 + }, + { + "epoch": 1.1282013022246338, + "grad_norm": 5.100139141082764, + "learning_rate": 6.786898725503632e-06, + "loss": 0.2266, + "step": 24197 + }, + { + "epoch": 1.1282148670645686, + "grad_norm": 6.394965648651123, + "learning_rate": 6.786761682883377e-06, + "loss": 0.3188, + "step": 24198 + }, + { + "epoch": 1.1282284319045035, + "grad_norm": 7.05148983001709, + "learning_rate": 6.786624640263122e-06, + "loss": 0.2918, + "step": 24199 + }, + { + "epoch": 1.1282419967444384, + "grad_norm": 6.847067356109619, + "learning_rate": 6.786487597642868e-06, + "loss": 0.3544, + "step": 24200 + }, + { + "epoch": 1.1282555615843732, + "grad_norm": 6.222250938415527, + "learning_rate": 6.786350555022613e-06, + "loss": 0.2813, + "step": 24201 + }, + { + "epoch": 1.128269126424308, + "grad_norm": 6.41273832321167, + "learning_rate": 6.786213512402358e-06, + "loss": 0.3853, + "step": 24202 + }, + { + "epoch": 1.1282826912642432, + "grad_norm": 4.637031555175781, + "learning_rate": 6.786076469782102e-06, + "loss": 0.1798, + "step": 24203 + }, + { + "epoch": 1.128296256104178, + "grad_norm": 4.811135292053223, + "learning_rate": 6.785939427161848e-06, + "loss": 0.2987, + "step": 24204 + }, + { + "epoch": 1.128309820944113, + "grad_norm": 6.012368679046631, + "learning_rate": 6.785802384541593e-06, + "loss": 0.3728, + "step": 24205 + }, + { + "epoch": 1.1283233857840478, + "grad_norm": 6.104377269744873, + "learning_rate": 6.785665341921338e-06, + "loss": 0.232, + "step": 24206 + }, + { + "epoch": 1.1283369506239826, + "grad_norm": 6.423555374145508, + "learning_rate": 6.785528299301083e-06, + "loss": 0.4064, + "step": 24207 + }, + { + "epoch": 1.1283505154639175, + "grad_norm": 5.251779556274414, + "learning_rate": 6.785391256680828e-06, + "loss": 0.2944, + "step": 24208 + }, + { + "epoch": 1.1283640803038524, + "grad_norm": 6.841080188751221, + "learning_rate": 6.785254214060574e-06, + "loss": 0.3394, + "step": 24209 + }, + { + "epoch": 1.1283776451437872, + "grad_norm": 5.2418036460876465, + "learning_rate": 6.785117171440318e-06, + "loss": 0.299, + "step": 24210 + }, + { + "epoch": 1.128391209983722, + "grad_norm": 4.563910484313965, + "learning_rate": 6.784980128820064e-06, + "loss": 0.2284, + "step": 24211 + }, + { + "epoch": 1.128404774823657, + "grad_norm": 5.732694625854492, + "learning_rate": 6.784843086199808e-06, + "loss": 0.3575, + "step": 24212 + }, + { + "epoch": 1.128418339663592, + "grad_norm": 5.1870317459106445, + "learning_rate": 6.784706043579554e-06, + "loss": 0.2682, + "step": 24213 + }, + { + "epoch": 1.128431904503527, + "grad_norm": 5.582636833190918, + "learning_rate": 6.784569000959299e-06, + "loss": 0.2894, + "step": 24214 + }, + { + "epoch": 1.1284454693434618, + "grad_norm": 6.544938087463379, + "learning_rate": 6.784431958339044e-06, + "loss": 0.3625, + "step": 24215 + }, + { + "epoch": 1.1284590341833967, + "grad_norm": 5.341604232788086, + "learning_rate": 6.784294915718789e-06, + "loss": 0.2315, + "step": 24216 + }, + { + "epoch": 1.1284725990233315, + "grad_norm": 5.887636184692383, + "learning_rate": 6.784157873098535e-06, + "loss": 0.241, + "step": 24217 + }, + { + "epoch": 1.1284861638632664, + "grad_norm": 6.972169399261475, + "learning_rate": 6.78402083047828e-06, + "loss": 0.323, + "step": 24218 + }, + { + "epoch": 1.1284997287032013, + "grad_norm": 4.699156761169434, + "learning_rate": 6.783883787858024e-06, + "loss": 0.3213, + "step": 24219 + }, + { + "epoch": 1.1285132935431361, + "grad_norm": 5.975732803344727, + "learning_rate": 6.783746745237769e-06, + "loss": 0.3246, + "step": 24220 + }, + { + "epoch": 1.128526858383071, + "grad_norm": 4.951615810394287, + "learning_rate": 6.783609702617514e-06, + "loss": 0.2113, + "step": 24221 + }, + { + "epoch": 1.128540423223006, + "grad_norm": 5.9358906745910645, + "learning_rate": 6.78347265999726e-06, + "loss": 0.28, + "step": 24222 + }, + { + "epoch": 1.128553988062941, + "grad_norm": 4.803870677947998, + "learning_rate": 6.783335617377005e-06, + "loss": 0.2104, + "step": 24223 + }, + { + "epoch": 1.1285675529028758, + "grad_norm": 6.244353294372559, + "learning_rate": 6.78319857475675e-06, + "loss": 0.1679, + "step": 24224 + }, + { + "epoch": 1.1285811177428107, + "grad_norm": 6.087045192718506, + "learning_rate": 6.783061532136494e-06, + "loss": 0.3564, + "step": 24225 + }, + { + "epoch": 1.1285946825827455, + "grad_norm": 7.194180011749268, + "learning_rate": 6.7829244895162404e-06, + "loss": 0.2406, + "step": 24226 + }, + { + "epoch": 1.1286082474226804, + "grad_norm": 5.399487018585205, + "learning_rate": 6.782787446895986e-06, + "loss": 0.3051, + "step": 24227 + }, + { + "epoch": 1.1286218122626153, + "grad_norm": 4.578001499176025, + "learning_rate": 6.78265040427573e-06, + "loss": 0.2407, + "step": 24228 + }, + { + "epoch": 1.1286353771025501, + "grad_norm": 5.033156871795654, + "learning_rate": 6.782513361655475e-06, + "loss": 0.2367, + "step": 24229 + }, + { + "epoch": 1.128648941942485, + "grad_norm": 4.542562007904053, + "learning_rate": 6.782376319035221e-06, + "loss": 0.2503, + "step": 24230 + }, + { + "epoch": 1.1286625067824199, + "grad_norm": 5.659770965576172, + "learning_rate": 6.7822392764149655e-06, + "loss": 0.3516, + "step": 24231 + }, + { + "epoch": 1.128676071622355, + "grad_norm": 5.070583343505859, + "learning_rate": 6.782102233794711e-06, + "loss": 0.3108, + "step": 24232 + }, + { + "epoch": 1.1286896364622898, + "grad_norm": 6.315374851226807, + "learning_rate": 6.781965191174456e-06, + "loss": 0.4419, + "step": 24233 + }, + { + "epoch": 1.1287032013022247, + "grad_norm": 4.9379472732543945, + "learning_rate": 6.781828148554202e-06, + "loss": 0.2937, + "step": 24234 + }, + { + "epoch": 1.1287167661421595, + "grad_norm": 5.287899017333984, + "learning_rate": 6.781691105933946e-06, + "loss": 0.4582, + "step": 24235 + }, + { + "epoch": 1.1287303309820944, + "grad_norm": 4.944368839263916, + "learning_rate": 6.781554063313691e-06, + "loss": 0.201, + "step": 24236 + }, + { + "epoch": 1.1287438958220293, + "grad_norm": 5.103094100952148, + "learning_rate": 6.781417020693436e-06, + "loss": 0.3585, + "step": 24237 + }, + { + "epoch": 1.1287574606619641, + "grad_norm": 5.131345748901367, + "learning_rate": 6.781279978073181e-06, + "loss": 0.2545, + "step": 24238 + }, + { + "epoch": 1.128771025501899, + "grad_norm": 6.194446086883545, + "learning_rate": 6.781142935452927e-06, + "loss": 0.3574, + "step": 24239 + }, + { + "epoch": 1.1287845903418339, + "grad_norm": 5.523387908935547, + "learning_rate": 6.781005892832671e-06, + "loss": 0.3475, + "step": 24240 + }, + { + "epoch": 1.128798155181769, + "grad_norm": 4.765074729919434, + "learning_rate": 6.780868850212416e-06, + "loss": 0.2819, + "step": 24241 + }, + { + "epoch": 1.1288117200217038, + "grad_norm": 5.528357982635498, + "learning_rate": 6.780731807592162e-06, + "loss": 0.4346, + "step": 24242 + }, + { + "epoch": 1.1288252848616387, + "grad_norm": 5.499887466430664, + "learning_rate": 6.780594764971908e-06, + "loss": 0.3464, + "step": 24243 + }, + { + "epoch": 1.1288388497015736, + "grad_norm": 6.93556547164917, + "learning_rate": 6.780457722351652e-06, + "loss": 0.3042, + "step": 24244 + }, + { + "epoch": 1.1288524145415084, + "grad_norm": 5.970491409301758, + "learning_rate": 6.780320679731397e-06, + "loss": 0.4124, + "step": 24245 + }, + { + "epoch": 1.1288659793814433, + "grad_norm": 5.052067756652832, + "learning_rate": 6.7801836371111415e-06, + "loss": 0.3093, + "step": 24246 + }, + { + "epoch": 1.1288795442213782, + "grad_norm": 4.607773303985596, + "learning_rate": 6.7800465944908875e-06, + "loss": 0.3521, + "step": 24247 + }, + { + "epoch": 1.128893109061313, + "grad_norm": 5.6896443367004395, + "learning_rate": 6.779909551870633e-06, + "loss": 0.3495, + "step": 24248 + }, + { + "epoch": 1.1289066739012479, + "grad_norm": 4.087335109710693, + "learning_rate": 6.779772509250378e-06, + "loss": 0.252, + "step": 24249 + }, + { + "epoch": 1.1289202387411827, + "grad_norm": 6.734157085418701, + "learning_rate": 6.779635466630122e-06, + "loss": 0.3812, + "step": 24250 + }, + { + "epoch": 1.1289338035811178, + "grad_norm": 6.978646278381348, + "learning_rate": 6.779498424009867e-06, + "loss": 0.3989, + "step": 24251 + }, + { + "epoch": 1.1289473684210527, + "grad_norm": 6.999091625213623, + "learning_rate": 6.779361381389613e-06, + "loss": 0.4616, + "step": 24252 + }, + { + "epoch": 1.1289609332609876, + "grad_norm": 5.887511730194092, + "learning_rate": 6.779224338769358e-06, + "loss": 0.3595, + "step": 24253 + }, + { + "epoch": 1.1289744981009224, + "grad_norm": 6.0729193687438965, + "learning_rate": 6.779087296149103e-06, + "loss": 0.4256, + "step": 24254 + }, + { + "epoch": 1.1289880629408573, + "grad_norm": 5.892158508300781, + "learning_rate": 6.778950253528847e-06, + "loss": 0.4121, + "step": 24255 + }, + { + "epoch": 1.1290016277807922, + "grad_norm": 6.311154842376709, + "learning_rate": 6.778813210908593e-06, + "loss": 0.3412, + "step": 24256 + }, + { + "epoch": 1.129015192620727, + "grad_norm": 6.898308753967285, + "learning_rate": 6.7786761682883384e-06, + "loss": 0.4611, + "step": 24257 + }, + { + "epoch": 1.129028757460662, + "grad_norm": 6.692549228668213, + "learning_rate": 6.778539125668084e-06, + "loss": 0.4034, + "step": 24258 + }, + { + "epoch": 1.1290423223005968, + "grad_norm": 6.492701053619385, + "learning_rate": 6.778402083047828e-06, + "loss": 0.4334, + "step": 24259 + }, + { + "epoch": 1.1290558871405318, + "grad_norm": 5.639881134033203, + "learning_rate": 6.778265040427574e-06, + "loss": 0.5327, + "step": 24260 + }, + { + "epoch": 1.1290694519804667, + "grad_norm": 6.479816913604736, + "learning_rate": 6.778127997807319e-06, + "loss": 0.3617, + "step": 24261 + }, + { + "epoch": 1.1290830168204016, + "grad_norm": 5.3191914558410645, + "learning_rate": 6.7779909551870635e-06, + "loss": 0.3955, + "step": 24262 + }, + { + "epoch": 1.1290965816603364, + "grad_norm": 5.319392204284668, + "learning_rate": 6.777853912566809e-06, + "loss": 0.3031, + "step": 24263 + }, + { + "epoch": 1.1291101465002713, + "grad_norm": 5.999613285064697, + "learning_rate": 6.777716869946554e-06, + "loss": 0.4734, + "step": 24264 + }, + { + "epoch": 1.1291237113402062, + "grad_norm": 6.443791389465332, + "learning_rate": 6.777579827326299e-06, + "loss": 0.4159, + "step": 24265 + }, + { + "epoch": 1.129137276180141, + "grad_norm": 5.249114513397217, + "learning_rate": 6.777442784706044e-06, + "loss": 0.3815, + "step": 24266 + }, + { + "epoch": 1.129150841020076, + "grad_norm": 5.416220664978027, + "learning_rate": 6.777305742085789e-06, + "loss": 0.3137, + "step": 24267 + }, + { + "epoch": 1.1291644058600108, + "grad_norm": 8.006317138671875, + "learning_rate": 6.777168699465534e-06, + "loss": 0.4956, + "step": 24268 + }, + { + "epoch": 1.1291779706999456, + "grad_norm": 4.488903045654297, + "learning_rate": 6.77703165684528e-06, + "loss": 0.2434, + "step": 24269 + }, + { + "epoch": 1.1291915355398807, + "grad_norm": 5.908208847045898, + "learning_rate": 6.776894614225025e-06, + "loss": 0.2464, + "step": 24270 + }, + { + "epoch": 1.1292051003798156, + "grad_norm": 4.327139377593994, + "learning_rate": 6.776757571604769e-06, + "loss": 0.2622, + "step": 24271 + }, + { + "epoch": 1.1292186652197505, + "grad_norm": 7.317424774169922, + "learning_rate": 6.776620528984514e-06, + "loss": 0.3621, + "step": 24272 + }, + { + "epoch": 1.1292322300596853, + "grad_norm": 6.653262138366699, + "learning_rate": 6.7764834863642604e-06, + "loss": 0.5676, + "step": 24273 + }, + { + "epoch": 1.1292457948996202, + "grad_norm": 6.602808952331543, + "learning_rate": 6.776346443744005e-06, + "loss": 0.3263, + "step": 24274 + }, + { + "epoch": 1.129259359739555, + "grad_norm": 6.1175103187561035, + "learning_rate": 6.77620940112375e-06, + "loss": 0.3351, + "step": 24275 + }, + { + "epoch": 1.12927292457949, + "grad_norm": 6.973115921020508, + "learning_rate": 6.776072358503495e-06, + "loss": 0.4299, + "step": 24276 + }, + { + "epoch": 1.1292864894194248, + "grad_norm": 5.9139275550842285, + "learning_rate": 6.7759353158832395e-06, + "loss": 0.4127, + "step": 24277 + }, + { + "epoch": 1.1293000542593596, + "grad_norm": 5.81771183013916, + "learning_rate": 6.7757982732629855e-06, + "loss": 0.3786, + "step": 24278 + }, + { + "epoch": 1.1293136190992947, + "grad_norm": 4.002924919128418, + "learning_rate": 6.775661230642731e-06, + "loss": 0.194, + "step": 24279 + }, + { + "epoch": 1.1293271839392296, + "grad_norm": 4.017138957977295, + "learning_rate": 6.775524188022475e-06, + "loss": 0.3089, + "step": 24280 + }, + { + "epoch": 1.1293407487791645, + "grad_norm": 6.397295951843262, + "learning_rate": 6.77538714540222e-06, + "loss": 0.4096, + "step": 24281 + }, + { + "epoch": 1.1293543136190993, + "grad_norm": 7.938085556030273, + "learning_rate": 6.775250102781966e-06, + "loss": 0.4523, + "step": 24282 + }, + { + "epoch": 1.1293678784590342, + "grad_norm": 6.44481897354126, + "learning_rate": 6.775113060161711e-06, + "loss": 0.4614, + "step": 24283 + }, + { + "epoch": 1.129381443298969, + "grad_norm": 6.082912445068359, + "learning_rate": 6.774976017541456e-06, + "loss": 0.3716, + "step": 24284 + }, + { + "epoch": 1.129395008138904, + "grad_norm": 6.350291728973389, + "learning_rate": 6.774838974921201e-06, + "loss": 0.3662, + "step": 24285 + }, + { + "epoch": 1.1294085729788388, + "grad_norm": 6.1486430168151855, + "learning_rate": 6.774701932300947e-06, + "loss": 0.3995, + "step": 24286 + }, + { + "epoch": 1.1294221378187737, + "grad_norm": 5.963742733001709, + "learning_rate": 6.774564889680691e-06, + "loss": 0.3705, + "step": 24287 + }, + { + "epoch": 1.1294357026587085, + "grad_norm": 5.69478178024292, + "learning_rate": 6.7744278470604364e-06, + "loss": 0.2834, + "step": 24288 + }, + { + "epoch": 1.1294492674986436, + "grad_norm": 5.291749000549316, + "learning_rate": 6.774290804440181e-06, + "loss": 0.3973, + "step": 24289 + }, + { + "epoch": 1.1294628323385785, + "grad_norm": 5.547550678253174, + "learning_rate": 6.774153761819926e-06, + "loss": 0.3268, + "step": 24290 + }, + { + "epoch": 1.1294763971785133, + "grad_norm": 6.8895487785339355, + "learning_rate": 6.774016719199672e-06, + "loss": 0.4548, + "step": 24291 + }, + { + "epoch": 1.1294899620184482, + "grad_norm": 5.904633522033691, + "learning_rate": 6.773879676579417e-06, + "loss": 0.3216, + "step": 24292 + }, + { + "epoch": 1.129503526858383, + "grad_norm": 6.432681083679199, + "learning_rate": 6.7737426339591615e-06, + "loss": 0.3472, + "step": 24293 + }, + { + "epoch": 1.129517091698318, + "grad_norm": 7.00593900680542, + "learning_rate": 6.773605591338907e-06, + "loss": 0.3602, + "step": 24294 + }, + { + "epoch": 1.1295306565382528, + "grad_norm": 5.202730655670166, + "learning_rate": 6.773468548718653e-06, + "loss": 0.3953, + "step": 24295 + }, + { + "epoch": 1.1295442213781877, + "grad_norm": 4.5453104972839355, + "learning_rate": 6.773331506098397e-06, + "loss": 0.3267, + "step": 24296 + }, + { + "epoch": 1.1295577862181228, + "grad_norm": 7.734062671661377, + "learning_rate": 6.773194463478142e-06, + "loss": 0.4235, + "step": 24297 + }, + { + "epoch": 1.1295713510580576, + "grad_norm": 6.625669479370117, + "learning_rate": 6.773057420857887e-06, + "loss": 0.3804, + "step": 24298 + }, + { + "epoch": 1.1295849158979925, + "grad_norm": 5.639411926269531, + "learning_rate": 6.7729203782376326e-06, + "loss": 0.3848, + "step": 24299 + }, + { + "epoch": 1.1295984807379273, + "grad_norm": 5.649816989898682, + "learning_rate": 6.772783335617378e-06, + "loss": 0.3976, + "step": 24300 + }, + { + "epoch": 1.1296120455778622, + "grad_norm": 7.905044078826904, + "learning_rate": 6.772646292997123e-06, + "loss": 0.6201, + "step": 24301 + }, + { + "epoch": 1.129625610417797, + "grad_norm": 5.828464508056641, + "learning_rate": 6.772509250376867e-06, + "loss": 0.1821, + "step": 24302 + }, + { + "epoch": 1.129639175257732, + "grad_norm": 6.797283172607422, + "learning_rate": 6.772372207756613e-06, + "loss": 0.4314, + "step": 24303 + }, + { + "epoch": 1.1296527400976668, + "grad_norm": 4.624302864074707, + "learning_rate": 6.7722351651363584e-06, + "loss": 0.3586, + "step": 24304 + }, + { + "epoch": 1.1296663049376017, + "grad_norm": 5.892974376678467, + "learning_rate": 6.772098122516103e-06, + "loss": 0.361, + "step": 24305 + }, + { + "epoch": 1.1296798697775365, + "grad_norm": 6.245858669281006, + "learning_rate": 6.771961079895848e-06, + "loss": 0.3777, + "step": 24306 + }, + { + "epoch": 1.1296934346174714, + "grad_norm": 5.899783611297607, + "learning_rate": 6.771824037275593e-06, + "loss": 0.3166, + "step": 24307 + }, + { + "epoch": 1.1297069994574065, + "grad_norm": 7.552853584289551, + "learning_rate": 6.771686994655338e-06, + "loss": 0.6161, + "step": 24308 + }, + { + "epoch": 1.1297205642973414, + "grad_norm": 5.264614582061768, + "learning_rate": 6.7715499520350835e-06, + "loss": 0.3376, + "step": 24309 + }, + { + "epoch": 1.1297341291372762, + "grad_norm": 4.417361736297607, + "learning_rate": 6.771412909414829e-06, + "loss": 0.2472, + "step": 24310 + }, + { + "epoch": 1.129747693977211, + "grad_norm": 6.630801200866699, + "learning_rate": 6.771275866794573e-06, + "loss": 0.4261, + "step": 24311 + }, + { + "epoch": 1.129761258817146, + "grad_norm": 7.030943393707275, + "learning_rate": 6.771138824174319e-06, + "loss": 0.4624, + "step": 24312 + }, + { + "epoch": 1.1297748236570808, + "grad_norm": 5.5747222900390625, + "learning_rate": 6.771001781554064e-06, + "loss": 0.3207, + "step": 24313 + }, + { + "epoch": 1.1297883884970157, + "grad_norm": 5.164085865020752, + "learning_rate": 6.7708647389338085e-06, + "loss": 0.3363, + "step": 24314 + }, + { + "epoch": 1.1298019533369505, + "grad_norm": 5.764841079711914, + "learning_rate": 6.770727696313554e-06, + "loss": 0.4006, + "step": 24315 + }, + { + "epoch": 1.1298155181768856, + "grad_norm": 5.611101150512695, + "learning_rate": 6.7705906536933e-06, + "loss": 0.3085, + "step": 24316 + }, + { + "epoch": 1.1298290830168205, + "grad_norm": 4.716275215148926, + "learning_rate": 6.770453611073045e-06, + "loss": 0.3648, + "step": 24317 + }, + { + "epoch": 1.1298426478567554, + "grad_norm": 6.269184112548828, + "learning_rate": 6.770316568452789e-06, + "loss": 0.3302, + "step": 24318 + }, + { + "epoch": 1.1298562126966902, + "grad_norm": 8.49659538269043, + "learning_rate": 6.7701795258325344e-06, + "loss": 0.4998, + "step": 24319 + }, + { + "epoch": 1.129869777536625, + "grad_norm": 5.104748249053955, + "learning_rate": 6.770042483212279e-06, + "loss": 0.4082, + "step": 24320 + }, + { + "epoch": 1.12988334237656, + "grad_norm": 4.161943435668945, + "learning_rate": 6.769905440592025e-06, + "loss": 0.3156, + "step": 24321 + }, + { + "epoch": 1.1298969072164948, + "grad_norm": 6.284742832183838, + "learning_rate": 6.76976839797177e-06, + "loss": 0.4305, + "step": 24322 + }, + { + "epoch": 1.1299104720564297, + "grad_norm": 5.870628356933594, + "learning_rate": 6.769631355351514e-06, + "loss": 0.4252, + "step": 24323 + }, + { + "epoch": 1.1299240368963646, + "grad_norm": 6.9199090003967285, + "learning_rate": 6.7694943127312595e-06, + "loss": 0.4674, + "step": 24324 + }, + { + "epoch": 1.1299376017362994, + "grad_norm": 5.337582111358643, + "learning_rate": 6.7693572701110055e-06, + "loss": 0.4298, + "step": 24325 + }, + { + "epoch": 1.1299511665762343, + "grad_norm": 4.109602451324463, + "learning_rate": 6.769220227490751e-06, + "loss": 0.257, + "step": 24326 + }, + { + "epoch": 1.1299647314161694, + "grad_norm": 6.878538608551025, + "learning_rate": 6.769083184870495e-06, + "loss": 0.4285, + "step": 24327 + }, + { + "epoch": 1.1299782962561042, + "grad_norm": 4.572272300720215, + "learning_rate": 6.76894614225024e-06, + "loss": 0.3899, + "step": 24328 + }, + { + "epoch": 1.129991861096039, + "grad_norm": 7.1469645500183105, + "learning_rate": 6.768809099629986e-06, + "loss": 0.4286, + "step": 24329 + }, + { + "epoch": 1.130005425935974, + "grad_norm": 7.1364850997924805, + "learning_rate": 6.7686720570097306e-06, + "loss": 0.5899, + "step": 24330 + }, + { + "epoch": 1.1300189907759088, + "grad_norm": 5.379729747772217, + "learning_rate": 6.768535014389476e-06, + "loss": 0.3733, + "step": 24331 + }, + { + "epoch": 1.1300325556158437, + "grad_norm": 6.982524871826172, + "learning_rate": 6.768397971769221e-06, + "loss": 0.3097, + "step": 24332 + }, + { + "epoch": 1.1300461204557786, + "grad_norm": 6.656182289123535, + "learning_rate": 6.768260929148965e-06, + "loss": 0.2971, + "step": 24333 + }, + { + "epoch": 1.1300596852957134, + "grad_norm": 6.257326602935791, + "learning_rate": 6.768123886528711e-06, + "loss": 0.448, + "step": 24334 + }, + { + "epoch": 1.1300732501356485, + "grad_norm": 7.161255359649658, + "learning_rate": 6.7679868439084565e-06, + "loss": 0.426, + "step": 24335 + }, + { + "epoch": 1.1300868149755834, + "grad_norm": 6.721004009246826, + "learning_rate": 6.767849801288201e-06, + "loss": 0.3194, + "step": 24336 + }, + { + "epoch": 1.1301003798155183, + "grad_norm": 6.010675430297852, + "learning_rate": 6.767712758667946e-06, + "loss": 0.4246, + "step": 24337 + }, + { + "epoch": 1.1301139446554531, + "grad_norm": 4.809029579162598, + "learning_rate": 6.767575716047692e-06, + "loss": 0.5072, + "step": 24338 + }, + { + "epoch": 1.130127509495388, + "grad_norm": 4.995367527008057, + "learning_rate": 6.767438673427436e-06, + "loss": 0.2441, + "step": 24339 + }, + { + "epoch": 1.1301410743353228, + "grad_norm": 9.85645866394043, + "learning_rate": 6.7673016308071815e-06, + "loss": 0.4304, + "step": 24340 + }, + { + "epoch": 1.1301546391752577, + "grad_norm": 6.06439733505249, + "learning_rate": 6.767164588186927e-06, + "loss": 0.3471, + "step": 24341 + }, + { + "epoch": 1.1301682040151926, + "grad_norm": 5.207339286804199, + "learning_rate": 6.767027545566673e-06, + "loss": 0.3321, + "step": 24342 + }, + { + "epoch": 1.1301817688551274, + "grad_norm": 7.3366007804870605, + "learning_rate": 6.766890502946417e-06, + "loss": 0.5759, + "step": 24343 + }, + { + "epoch": 1.1301953336950623, + "grad_norm": 4.969259738922119, + "learning_rate": 6.766753460326162e-06, + "loss": 0.3009, + "step": 24344 + }, + { + "epoch": 1.1302088985349972, + "grad_norm": 6.369575023651123, + "learning_rate": 6.7666164177059065e-06, + "loss": 0.4041, + "step": 24345 + }, + { + "epoch": 1.1302224633749323, + "grad_norm": 6.2938456535339355, + "learning_rate": 6.766479375085652e-06, + "loss": 0.4302, + "step": 24346 + }, + { + "epoch": 1.1302360282148671, + "grad_norm": 6.026637077331543, + "learning_rate": 6.766342332465398e-06, + "loss": 0.2682, + "step": 24347 + }, + { + "epoch": 1.130249593054802, + "grad_norm": 4.907187461853027, + "learning_rate": 6.766205289845142e-06, + "loss": 0.3377, + "step": 24348 + }, + { + "epoch": 1.1302631578947369, + "grad_norm": 5.320035934448242, + "learning_rate": 6.766068247224887e-06, + "loss": 0.3034, + "step": 24349 + }, + { + "epoch": 1.1302767227346717, + "grad_norm": 7.127901077270508, + "learning_rate": 6.7659312046046324e-06, + "loss": 0.4415, + "step": 24350 + }, + { + "epoch": 1.1302902875746066, + "grad_norm": 5.271817207336426, + "learning_rate": 6.7657941619843785e-06, + "loss": 0.3292, + "step": 24351 + }, + { + "epoch": 1.1303038524145415, + "grad_norm": 5.511385440826416, + "learning_rate": 6.765657119364123e-06, + "loss": 0.3659, + "step": 24352 + }, + { + "epoch": 1.1303174172544763, + "grad_norm": 8.2147798538208, + "learning_rate": 6.765520076743868e-06, + "loss": 0.4201, + "step": 24353 + }, + { + "epoch": 1.1303309820944114, + "grad_norm": 6.6377434730529785, + "learning_rate": 6.765383034123612e-06, + "loss": 0.3488, + "step": 24354 + }, + { + "epoch": 1.1303445469343463, + "grad_norm": 5.148021221160889, + "learning_rate": 6.765245991503358e-06, + "loss": 0.2783, + "step": 24355 + }, + { + "epoch": 1.1303581117742811, + "grad_norm": 6.405466556549072, + "learning_rate": 6.7651089488831035e-06, + "loss": 0.3434, + "step": 24356 + }, + { + "epoch": 1.130371676614216, + "grad_norm": 5.038686275482178, + "learning_rate": 6.764971906262849e-06, + "loss": 0.2515, + "step": 24357 + }, + { + "epoch": 1.1303852414541509, + "grad_norm": 5.761758327484131, + "learning_rate": 6.764834863642593e-06, + "loss": 0.2038, + "step": 24358 + }, + { + "epoch": 1.1303988062940857, + "grad_norm": 4.736514568328857, + "learning_rate": 6.764697821022338e-06, + "loss": 0.2657, + "step": 24359 + }, + { + "epoch": 1.1304123711340206, + "grad_norm": 6.107060432434082, + "learning_rate": 6.764560778402084e-06, + "loss": 0.3757, + "step": 24360 + }, + { + "epoch": 1.1304259359739555, + "grad_norm": 5.085555553436279, + "learning_rate": 6.7644237357818286e-06, + "loss": 0.2014, + "step": 24361 + }, + { + "epoch": 1.1304395008138903, + "grad_norm": 4.8974504470825195, + "learning_rate": 6.764286693161574e-06, + "loss": 0.2556, + "step": 24362 + }, + { + "epoch": 1.1304530656538252, + "grad_norm": 5.111299514770508, + "learning_rate": 6.764149650541318e-06, + "loss": 0.2797, + "step": 24363 + }, + { + "epoch": 1.13046663049376, + "grad_norm": 6.383337020874023, + "learning_rate": 6.764012607921064e-06, + "loss": 0.3683, + "step": 24364 + }, + { + "epoch": 1.1304801953336951, + "grad_norm": 5.17045259475708, + "learning_rate": 6.763875565300809e-06, + "loss": 0.1585, + "step": 24365 + }, + { + "epoch": 1.13049376017363, + "grad_norm": 4.975486755371094, + "learning_rate": 6.7637385226805545e-06, + "loss": 0.3493, + "step": 24366 + }, + { + "epoch": 1.1305073250135649, + "grad_norm": 5.813912391662598, + "learning_rate": 6.763601480060299e-06, + "loss": 0.3271, + "step": 24367 + }, + { + "epoch": 1.1305208898534997, + "grad_norm": 4.59065055847168, + "learning_rate": 6.763464437440045e-06, + "loss": 0.2714, + "step": 24368 + }, + { + "epoch": 1.1305344546934346, + "grad_norm": 5.117867469787598, + "learning_rate": 6.76332739481979e-06, + "loss": 0.2598, + "step": 24369 + }, + { + "epoch": 1.1305480195333695, + "grad_norm": 4.841554641723633, + "learning_rate": 6.763190352199534e-06, + "loss": 0.3457, + "step": 24370 + }, + { + "epoch": 1.1305615843733043, + "grad_norm": 5.4484968185424805, + "learning_rate": 6.7630533095792795e-06, + "loss": 0.2918, + "step": 24371 + }, + { + "epoch": 1.1305751492132392, + "grad_norm": 6.291034698486328, + "learning_rate": 6.762916266959024e-06, + "loss": 0.4631, + "step": 24372 + }, + { + "epoch": 1.1305887140531743, + "grad_norm": 4.283250331878662, + "learning_rate": 6.76277922433877e-06, + "loss": 0.2703, + "step": 24373 + }, + { + "epoch": 1.1306022788931092, + "grad_norm": 5.728269100189209, + "learning_rate": 6.762642181718515e-06, + "loss": 0.4499, + "step": 24374 + }, + { + "epoch": 1.130615843733044, + "grad_norm": 6.931220531463623, + "learning_rate": 6.76250513909826e-06, + "loss": 0.2663, + "step": 24375 + }, + { + "epoch": 1.1306294085729789, + "grad_norm": 5.263714790344238, + "learning_rate": 6.7623680964780046e-06, + "loss": 0.3738, + "step": 24376 + }, + { + "epoch": 1.1306429734129138, + "grad_norm": 4.656365871429443, + "learning_rate": 6.762231053857751e-06, + "loss": 0.2629, + "step": 24377 + }, + { + "epoch": 1.1306565382528486, + "grad_norm": 6.477415561676025, + "learning_rate": 6.762094011237496e-06, + "loss": 0.2527, + "step": 24378 + }, + { + "epoch": 1.1306701030927835, + "grad_norm": 5.204034805297852, + "learning_rate": 6.76195696861724e-06, + "loss": 0.3278, + "step": 24379 + }, + { + "epoch": 1.1306836679327184, + "grad_norm": 3.916776418685913, + "learning_rate": 6.761819925996985e-06, + "loss": 0.1996, + "step": 24380 + }, + { + "epoch": 1.1306972327726532, + "grad_norm": 5.93281888961792, + "learning_rate": 6.761682883376731e-06, + "loss": 0.2535, + "step": 24381 + }, + { + "epoch": 1.130710797612588, + "grad_norm": 4.659364223480225, + "learning_rate": 6.761545840756476e-06, + "loss": 0.3174, + "step": 24382 + }, + { + "epoch": 1.130724362452523, + "grad_norm": 4.659521579742432, + "learning_rate": 6.761408798136221e-06, + "loss": 0.3306, + "step": 24383 + }, + { + "epoch": 1.130737927292458, + "grad_norm": 5.679028511047363, + "learning_rate": 6.761271755515966e-06, + "loss": 0.2415, + "step": 24384 + }, + { + "epoch": 1.130751492132393, + "grad_norm": 5.762513160705566, + "learning_rate": 6.761134712895712e-06, + "loss": 0.3365, + "step": 24385 + }, + { + "epoch": 1.1307650569723278, + "grad_norm": 5.806663513183594, + "learning_rate": 6.760997670275456e-06, + "loss": 0.3285, + "step": 24386 + }, + { + "epoch": 1.1307786218122626, + "grad_norm": 5.550946235656738, + "learning_rate": 6.7608606276552015e-06, + "loss": 0.3086, + "step": 24387 + }, + { + "epoch": 1.1307921866521975, + "grad_norm": 6.528214931488037, + "learning_rate": 6.760723585034946e-06, + "loss": 0.3097, + "step": 24388 + }, + { + "epoch": 1.1308057514921324, + "grad_norm": 6.415832042694092, + "learning_rate": 6.760586542414691e-06, + "loss": 0.4408, + "step": 24389 + }, + { + "epoch": 1.1308193163320672, + "grad_norm": 5.866025924682617, + "learning_rate": 6.760449499794437e-06, + "loss": 0.4047, + "step": 24390 + }, + { + "epoch": 1.130832881172002, + "grad_norm": 5.322051525115967, + "learning_rate": 6.760312457174182e-06, + "loss": 0.3358, + "step": 24391 + }, + { + "epoch": 1.1308464460119372, + "grad_norm": 5.042395114898682, + "learning_rate": 6.7601754145539266e-06, + "loss": 0.3268, + "step": 24392 + }, + { + "epoch": 1.130860010851872, + "grad_norm": 4.754854202270508, + "learning_rate": 6.760038371933672e-06, + "loss": 0.2054, + "step": 24393 + }, + { + "epoch": 1.130873575691807, + "grad_norm": 4.171785831451416, + "learning_rate": 6.759901329313418e-06, + "loss": 0.2511, + "step": 24394 + }, + { + "epoch": 1.1308871405317418, + "grad_norm": 5.18533992767334, + "learning_rate": 6.759764286693162e-06, + "loss": 0.2462, + "step": 24395 + }, + { + "epoch": 1.1309007053716766, + "grad_norm": 3.2446374893188477, + "learning_rate": 6.759627244072907e-06, + "loss": 0.1199, + "step": 24396 + }, + { + "epoch": 1.1309142702116115, + "grad_norm": 5.095288276672363, + "learning_rate": 6.759490201452652e-06, + "loss": 0.4431, + "step": 24397 + }, + { + "epoch": 1.1309278350515464, + "grad_norm": 8.898003578186035, + "learning_rate": 6.759353158832398e-06, + "loss": 0.349, + "step": 24398 + }, + { + "epoch": 1.1309413998914812, + "grad_norm": 4.523752212524414, + "learning_rate": 6.759216116212143e-06, + "loss": 0.2548, + "step": 24399 + }, + { + "epoch": 1.130954964731416, + "grad_norm": 6.284238338470459, + "learning_rate": 6.759079073591888e-06, + "loss": 0.3172, + "step": 24400 + }, + { + "epoch": 1.130968529571351, + "grad_norm": 5.742313385009766, + "learning_rate": 6.758942030971632e-06, + "loss": 0.2281, + "step": 24401 + }, + { + "epoch": 1.1309820944112858, + "grad_norm": 5.688831329345703, + "learning_rate": 6.7588049883513775e-06, + "loss": 0.3948, + "step": 24402 + }, + { + "epoch": 1.130995659251221, + "grad_norm": 6.0107645988464355, + "learning_rate": 6.7586679457311235e-06, + "loss": 0.2258, + "step": 24403 + }, + { + "epoch": 1.1310092240911558, + "grad_norm": 4.9201340675354, + "learning_rate": 6.758530903110868e-06, + "loss": 0.2528, + "step": 24404 + }, + { + "epoch": 1.1310227889310907, + "grad_norm": 6.726574897766113, + "learning_rate": 6.758393860490613e-06, + "loss": 0.3684, + "step": 24405 + }, + { + "epoch": 1.1310363537710255, + "grad_norm": 6.3032989501953125, + "learning_rate": 6.758256817870358e-06, + "loss": 0.4458, + "step": 24406 + }, + { + "epoch": 1.1310499186109604, + "grad_norm": 6.873429775238037, + "learning_rate": 6.758119775250103e-06, + "loss": 0.3729, + "step": 24407 + }, + { + "epoch": 1.1310634834508952, + "grad_norm": 8.28691577911377, + "learning_rate": 6.757982732629849e-06, + "loss": 0.4585, + "step": 24408 + }, + { + "epoch": 1.1310770482908301, + "grad_norm": 6.5137786865234375, + "learning_rate": 6.757845690009594e-06, + "loss": 0.313, + "step": 24409 + }, + { + "epoch": 1.131090613130765, + "grad_norm": 6.044931888580322, + "learning_rate": 6.757708647389338e-06, + "loss": 0.3058, + "step": 24410 + }, + { + "epoch": 1.1311041779707, + "grad_norm": 4.172980785369873, + "learning_rate": 6.757571604769084e-06, + "loss": 0.2834, + "step": 24411 + }, + { + "epoch": 1.131117742810635, + "grad_norm": 6.917086124420166, + "learning_rate": 6.757434562148829e-06, + "loss": 0.3549, + "step": 24412 + }, + { + "epoch": 1.1311313076505698, + "grad_norm": 7.644728660583496, + "learning_rate": 6.757297519528574e-06, + "loss": 0.3491, + "step": 24413 + }, + { + "epoch": 1.1311448724905047, + "grad_norm": 6.117809772491455, + "learning_rate": 6.757160476908319e-06, + "loss": 0.446, + "step": 24414 + }, + { + "epoch": 1.1311584373304395, + "grad_norm": 5.76751708984375, + "learning_rate": 6.757023434288064e-06, + "loss": 0.2406, + "step": 24415 + }, + { + "epoch": 1.1311720021703744, + "grad_norm": 4.9755377769470215, + "learning_rate": 6.756886391667809e-06, + "loss": 0.2655, + "step": 24416 + }, + { + "epoch": 1.1311855670103093, + "grad_norm": 6.199105739593506, + "learning_rate": 6.756749349047554e-06, + "loss": 0.3834, + "step": 24417 + }, + { + "epoch": 1.1311991318502441, + "grad_norm": 5.222443580627441, + "learning_rate": 6.7566123064272995e-06, + "loss": 0.1992, + "step": 24418 + }, + { + "epoch": 1.131212696690179, + "grad_norm": 3.6899468898773193, + "learning_rate": 6.756475263807044e-06, + "loss": 0.2227, + "step": 24419 + }, + { + "epoch": 1.1312262615301139, + "grad_norm": 6.073080062866211, + "learning_rate": 6.75633822118679e-06, + "loss": 0.2958, + "step": 24420 + }, + { + "epoch": 1.1312398263700487, + "grad_norm": 6.396627902984619, + "learning_rate": 6.756201178566535e-06, + "loss": 0.402, + "step": 24421 + }, + { + "epoch": 1.1312533912099838, + "grad_norm": 4.991374969482422, + "learning_rate": 6.756064135946279e-06, + "loss": 0.2435, + "step": 24422 + }, + { + "epoch": 1.1312669560499187, + "grad_norm": 5.240983009338379, + "learning_rate": 6.7559270933260246e-06, + "loss": 0.1532, + "step": 24423 + }, + { + "epoch": 1.1312805208898535, + "grad_norm": 6.1948018074035645, + "learning_rate": 6.755790050705771e-06, + "loss": 0.2164, + "step": 24424 + }, + { + "epoch": 1.1312940857297884, + "grad_norm": 5.611320972442627, + "learning_rate": 6.755653008085516e-06, + "loss": 0.2165, + "step": 24425 + }, + { + "epoch": 1.1313076505697233, + "grad_norm": 6.093826770782471, + "learning_rate": 6.75551596546526e-06, + "loss": 0.2507, + "step": 24426 + }, + { + "epoch": 1.1313212154096581, + "grad_norm": 5.354732513427734, + "learning_rate": 6.755378922845005e-06, + "loss": 0.3131, + "step": 24427 + }, + { + "epoch": 1.131334780249593, + "grad_norm": 6.099598407745361, + "learning_rate": 6.75524188022475e-06, + "loss": 0.2416, + "step": 24428 + }, + { + "epoch": 1.1313483450895279, + "grad_norm": 6.96544885635376, + "learning_rate": 6.755104837604496e-06, + "loss": 0.3245, + "step": 24429 + }, + { + "epoch": 1.131361909929463, + "grad_norm": 5.643339157104492, + "learning_rate": 6.754967794984241e-06, + "loss": 0.162, + "step": 24430 + }, + { + "epoch": 1.1313754747693978, + "grad_norm": 5.105737686157227, + "learning_rate": 6.754830752363985e-06, + "loss": 0.3083, + "step": 24431 + }, + { + "epoch": 1.1313890396093327, + "grad_norm": 6.103496551513672, + "learning_rate": 6.75469370974373e-06, + "loss": 0.2337, + "step": 24432 + }, + { + "epoch": 1.1314026044492675, + "grad_norm": 5.601202964782715, + "learning_rate": 6.754556667123476e-06, + "loss": 0.2192, + "step": 24433 + }, + { + "epoch": 1.1314161692892024, + "grad_norm": 4.9395432472229, + "learning_rate": 6.7544196245032215e-06, + "loss": 0.281, + "step": 24434 + }, + { + "epoch": 1.1314297341291373, + "grad_norm": 7.034521102905273, + "learning_rate": 6.754282581882966e-06, + "loss": 0.4669, + "step": 24435 + }, + { + "epoch": 1.1314432989690721, + "grad_norm": 7.076839923858643, + "learning_rate": 6.754145539262711e-06, + "loss": 0.4296, + "step": 24436 + }, + { + "epoch": 1.131456863809007, + "grad_norm": 5.969358921051025, + "learning_rate": 6.754008496642457e-06, + "loss": 0.2698, + "step": 24437 + }, + { + "epoch": 1.1314704286489419, + "grad_norm": 4.950934410095215, + "learning_rate": 6.753871454022201e-06, + "loss": 0.2958, + "step": 24438 + }, + { + "epoch": 1.1314839934888767, + "grad_norm": 7.824224472045898, + "learning_rate": 6.753734411401947e-06, + "loss": 0.416, + "step": 24439 + }, + { + "epoch": 1.1314975583288116, + "grad_norm": 6.577728271484375, + "learning_rate": 6.753597368781692e-06, + "loss": 0.4386, + "step": 24440 + }, + { + "epoch": 1.1315111231687467, + "grad_norm": 6.790421485900879, + "learning_rate": 6.753460326161436e-06, + "loss": 0.5197, + "step": 24441 + }, + { + "epoch": 1.1315246880086816, + "grad_norm": 5.846837997436523, + "learning_rate": 6.753323283541182e-06, + "loss": 0.1956, + "step": 24442 + }, + { + "epoch": 1.1315382528486164, + "grad_norm": 8.236838340759277, + "learning_rate": 6.753186240920927e-06, + "loss": 0.4353, + "step": 24443 + }, + { + "epoch": 1.1315518176885513, + "grad_norm": 5.3294358253479, + "learning_rate": 6.753049198300672e-06, + "loss": 0.3308, + "step": 24444 + }, + { + "epoch": 1.1315653825284862, + "grad_norm": 6.2883405685424805, + "learning_rate": 6.752912155680417e-06, + "loss": 0.2469, + "step": 24445 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 5.425693035125732, + "learning_rate": 6.752775113060163e-06, + "loss": 0.2514, + "step": 24446 + }, + { + "epoch": 1.1315925122083559, + "grad_norm": 6.512344837188721, + "learning_rate": 6.752638070439907e-06, + "loss": 0.3241, + "step": 24447 + }, + { + "epoch": 1.1316060770482907, + "grad_norm": 6.0702805519104, + "learning_rate": 6.752501027819652e-06, + "loss": 0.4519, + "step": 24448 + }, + { + "epoch": 1.1316196418882258, + "grad_norm": 6.502789497375488, + "learning_rate": 6.7523639851993975e-06, + "loss": 0.3251, + "step": 24449 + }, + { + "epoch": 1.1316332067281607, + "grad_norm": 7.167512893676758, + "learning_rate": 6.752226942579143e-06, + "loss": 0.5797, + "step": 24450 + }, + { + "epoch": 1.1316467715680956, + "grad_norm": 6.55856466293335, + "learning_rate": 6.752089899958888e-06, + "loss": 0.4298, + "step": 24451 + }, + { + "epoch": 1.1316603364080304, + "grad_norm": 5.118815898895264, + "learning_rate": 6.751952857338633e-06, + "loss": 0.3153, + "step": 24452 + }, + { + "epoch": 1.1316739012479653, + "grad_norm": 6.55625057220459, + "learning_rate": 6.751815814718377e-06, + "loss": 0.3793, + "step": 24453 + }, + { + "epoch": 1.1316874660879002, + "grad_norm": 4.817440986633301, + "learning_rate": 6.7516787720981234e-06, + "loss": 0.3251, + "step": 24454 + }, + { + "epoch": 1.131701030927835, + "grad_norm": 8.353144645690918, + "learning_rate": 6.751541729477869e-06, + "loss": 0.4852, + "step": 24455 + }, + { + "epoch": 1.13171459576777, + "grad_norm": 7.08358097076416, + "learning_rate": 6.751404686857613e-06, + "loss": 0.44, + "step": 24456 + }, + { + "epoch": 1.1317281606077048, + "grad_norm": 5.011777877807617, + "learning_rate": 6.751267644237358e-06, + "loss": 0.313, + "step": 24457 + }, + { + "epoch": 1.1317417254476396, + "grad_norm": 6.194126129150391, + "learning_rate": 6.751130601617103e-06, + "loss": 0.3519, + "step": 24458 + }, + { + "epoch": 1.1317552902875745, + "grad_norm": 4.244690895080566, + "learning_rate": 6.750993558996849e-06, + "loss": 0.2082, + "step": 24459 + }, + { + "epoch": 1.1317688551275096, + "grad_norm": 5.469145774841309, + "learning_rate": 6.750856516376594e-06, + "loss": 0.3304, + "step": 24460 + }, + { + "epoch": 1.1317824199674444, + "grad_norm": 5.250150203704834, + "learning_rate": 6.750719473756339e-06, + "loss": 0.2052, + "step": 24461 + }, + { + "epoch": 1.1317959848073793, + "grad_norm": 6.497098922729492, + "learning_rate": 6.750582431136083e-06, + "loss": 0.4175, + "step": 24462 + }, + { + "epoch": 1.1318095496473142, + "grad_norm": 5.417626857757568, + "learning_rate": 6.750445388515829e-06, + "loss": 0.3972, + "step": 24463 + }, + { + "epoch": 1.131823114487249, + "grad_norm": 7.458164215087891, + "learning_rate": 6.750308345895574e-06, + "loss": 0.3054, + "step": 24464 + }, + { + "epoch": 1.131836679327184, + "grad_norm": 5.1748552322387695, + "learning_rate": 6.750171303275319e-06, + "loss": 0.3466, + "step": 24465 + }, + { + "epoch": 1.1318502441671188, + "grad_norm": 5.194237232208252, + "learning_rate": 6.750034260655064e-06, + "loss": 0.1514, + "step": 24466 + }, + { + "epoch": 1.1318638090070536, + "grad_norm": 5.916262149810791, + "learning_rate": 6.74989721803481e-06, + "loss": 0.3462, + "step": 24467 + }, + { + "epoch": 1.1318773738469887, + "grad_norm": 5.3660478591918945, + "learning_rate": 6.749760175414555e-06, + "loss": 0.2571, + "step": 24468 + }, + { + "epoch": 1.1318909386869236, + "grad_norm": 6.131705284118652, + "learning_rate": 6.749623132794299e-06, + "loss": 0.3034, + "step": 24469 + }, + { + "epoch": 1.1319045035268585, + "grad_norm": 7.070361614227295, + "learning_rate": 6.749486090174045e-06, + "loss": 0.3377, + "step": 24470 + }, + { + "epoch": 1.1319180683667933, + "grad_norm": 5.669404029846191, + "learning_rate": 6.749349047553789e-06, + "loss": 0.3519, + "step": 24471 + }, + { + "epoch": 1.1319316332067282, + "grad_norm": 3.617985725402832, + "learning_rate": 6.749212004933535e-06, + "loss": 0.1448, + "step": 24472 + }, + { + "epoch": 1.131945198046663, + "grad_norm": 5.934982776641846, + "learning_rate": 6.74907496231328e-06, + "loss": 0.2914, + "step": 24473 + }, + { + "epoch": 1.131958762886598, + "grad_norm": 4.971006870269775, + "learning_rate": 6.748937919693025e-06, + "loss": 0.2203, + "step": 24474 + }, + { + "epoch": 1.1319723277265328, + "grad_norm": 5.648033142089844, + "learning_rate": 6.74880087707277e-06, + "loss": 0.3062, + "step": 24475 + }, + { + "epoch": 1.1319858925664676, + "grad_norm": 4.449462890625, + "learning_rate": 6.748663834452516e-06, + "loss": 0.2632, + "step": 24476 + }, + { + "epoch": 1.1319994574064025, + "grad_norm": 5.2047905921936035, + "learning_rate": 6.748526791832261e-06, + "loss": 0.2424, + "step": 24477 + }, + { + "epoch": 1.1320130222463374, + "grad_norm": 5.069338321685791, + "learning_rate": 6.748389749212005e-06, + "loss": 0.2144, + "step": 24478 + }, + { + "epoch": 1.1320265870862725, + "grad_norm": 5.592239856719971, + "learning_rate": 6.74825270659175e-06, + "loss": 0.3261, + "step": 24479 + }, + { + "epoch": 1.1320401519262073, + "grad_norm": 6.393548011779785, + "learning_rate": 6.748115663971496e-06, + "loss": 0.3211, + "step": 24480 + }, + { + "epoch": 1.1320537167661422, + "grad_norm": 5.11320686340332, + "learning_rate": 6.747978621351241e-06, + "loss": 0.2247, + "step": 24481 + }, + { + "epoch": 1.132067281606077, + "grad_norm": 4.595663070678711, + "learning_rate": 6.747841578730986e-06, + "loss": 0.3147, + "step": 24482 + }, + { + "epoch": 1.132080846446012, + "grad_norm": 5.135383605957031, + "learning_rate": 6.747704536110731e-06, + "loss": 0.3128, + "step": 24483 + }, + { + "epoch": 1.1320944112859468, + "grad_norm": 6.756453990936279, + "learning_rate": 6.747567493490475e-06, + "loss": 0.383, + "step": 24484 + }, + { + "epoch": 1.1321079761258817, + "grad_norm": 5.309702396392822, + "learning_rate": 6.7474304508702214e-06, + "loss": 0.2764, + "step": 24485 + }, + { + "epoch": 1.1321215409658165, + "grad_norm": 5.646173477172852, + "learning_rate": 6.747293408249967e-06, + "loss": 0.2734, + "step": 24486 + }, + { + "epoch": 1.1321351058057516, + "grad_norm": 6.8869476318359375, + "learning_rate": 6.747156365629711e-06, + "loss": 0.3498, + "step": 24487 + }, + { + "epoch": 1.1321486706456865, + "grad_norm": 5.377603530883789, + "learning_rate": 6.747019323009456e-06, + "loss": 0.2646, + "step": 24488 + }, + { + "epoch": 1.1321622354856213, + "grad_norm": 5.016802787780762, + "learning_rate": 6.746882280389202e-06, + "loss": 0.2285, + "step": 24489 + }, + { + "epoch": 1.1321758003255562, + "grad_norm": 5.087960720062256, + "learning_rate": 6.7467452377689465e-06, + "loss": 0.2898, + "step": 24490 + }, + { + "epoch": 1.132189365165491, + "grad_norm": 4.950934410095215, + "learning_rate": 6.746608195148692e-06, + "loss": 0.2273, + "step": 24491 + }, + { + "epoch": 1.132202930005426, + "grad_norm": 3.529721975326538, + "learning_rate": 6.746471152528437e-06, + "loss": 0.1767, + "step": 24492 + }, + { + "epoch": 1.1322164948453608, + "grad_norm": 5.370273590087891, + "learning_rate": 6.746334109908183e-06, + "loss": 0.2785, + "step": 24493 + }, + { + "epoch": 1.1322300596852957, + "grad_norm": 4.78153657913208, + "learning_rate": 6.746197067287927e-06, + "loss": 0.1862, + "step": 24494 + }, + { + "epoch": 1.1322436245252305, + "grad_norm": 4.509667873382568, + "learning_rate": 6.746060024667672e-06, + "loss": 0.2162, + "step": 24495 + }, + { + "epoch": 1.1322571893651654, + "grad_norm": 5.162774562835693, + "learning_rate": 6.745922982047417e-06, + "loss": 0.2661, + "step": 24496 + }, + { + "epoch": 1.1322707542051003, + "grad_norm": 3.1697537899017334, + "learning_rate": 6.745785939427162e-06, + "loss": 0.184, + "step": 24497 + }, + { + "epoch": 1.1322843190450353, + "grad_norm": 3.5922322273254395, + "learning_rate": 6.745648896806908e-06, + "loss": 0.1278, + "step": 24498 + }, + { + "epoch": 1.1322978838849702, + "grad_norm": 6.965358734130859, + "learning_rate": 6.745511854186653e-06, + "loss": 0.2268, + "step": 24499 + }, + { + "epoch": 1.132311448724905, + "grad_norm": 3.9797942638397217, + "learning_rate": 6.745374811566397e-06, + "loss": 0.1492, + "step": 24500 + }, + { + "epoch": 1.13232501356484, + "grad_norm": 3.957204580307007, + "learning_rate": 6.745237768946143e-06, + "loss": 0.2345, + "step": 24501 + }, + { + "epoch": 1.1323385784047748, + "grad_norm": 4.866223335266113, + "learning_rate": 6.745100726325889e-06, + "loss": 0.2577, + "step": 24502 + }, + { + "epoch": 1.1323521432447097, + "grad_norm": 5.53207540512085, + "learning_rate": 6.744963683705633e-06, + "loss": 0.3356, + "step": 24503 + }, + { + "epoch": 1.1323657080846445, + "grad_norm": 5.304362773895264, + "learning_rate": 6.744826641085378e-06, + "loss": 0.3543, + "step": 24504 + }, + { + "epoch": 1.1323792729245794, + "grad_norm": 6.857142448425293, + "learning_rate": 6.7446895984651225e-06, + "loss": 0.3296, + "step": 24505 + }, + { + "epoch": 1.1323928377645145, + "grad_norm": 5.504063129425049, + "learning_rate": 6.7445525558448685e-06, + "loss": 0.2597, + "step": 24506 + }, + { + "epoch": 1.1324064026044494, + "grad_norm": 3.620853900909424, + "learning_rate": 6.744415513224614e-06, + "loss": 0.1638, + "step": 24507 + }, + { + "epoch": 1.1324199674443842, + "grad_norm": 4.98096227645874, + "learning_rate": 6.744278470604359e-06, + "loss": 0.3044, + "step": 24508 + }, + { + "epoch": 1.132433532284319, + "grad_norm": 6.002213954925537, + "learning_rate": 6.744141427984103e-06, + "loss": 0.2783, + "step": 24509 + }, + { + "epoch": 1.132447097124254, + "grad_norm": 7.7560858726501465, + "learning_rate": 6.744004385363848e-06, + "loss": 0.521, + "step": 24510 + }, + { + "epoch": 1.1324606619641888, + "grad_norm": 4.747119903564453, + "learning_rate": 6.743867342743594e-06, + "loss": 0.2324, + "step": 24511 + }, + { + "epoch": 1.1324742268041237, + "grad_norm": 4.486246585845947, + "learning_rate": 6.743730300123339e-06, + "loss": 0.2691, + "step": 24512 + }, + { + "epoch": 1.1324877916440586, + "grad_norm": 6.275935649871826, + "learning_rate": 6.743593257503084e-06, + "loss": 0.2185, + "step": 24513 + }, + { + "epoch": 1.1325013564839934, + "grad_norm": 6.722414970397949, + "learning_rate": 6.743456214882828e-06, + "loss": 0.3619, + "step": 24514 + }, + { + "epoch": 1.1325149213239283, + "grad_norm": 6.551377296447754, + "learning_rate": 6.743319172262574e-06, + "loss": 0.2317, + "step": 24515 + }, + { + "epoch": 1.1325284861638631, + "grad_norm": 3.6889922618865967, + "learning_rate": 6.7431821296423194e-06, + "loss": 0.1675, + "step": 24516 + }, + { + "epoch": 1.1325420510037982, + "grad_norm": 4.9398698806762695, + "learning_rate": 6.743045087022065e-06, + "loss": 0.2573, + "step": 24517 + }, + { + "epoch": 1.132555615843733, + "grad_norm": 6.03222131729126, + "learning_rate": 6.742908044401809e-06, + "loss": 0.2585, + "step": 24518 + }, + { + "epoch": 1.132569180683668, + "grad_norm": 5.316435813903809, + "learning_rate": 6.742771001781555e-06, + "loss": 0.1997, + "step": 24519 + }, + { + "epoch": 1.1325827455236028, + "grad_norm": 4.459231376647949, + "learning_rate": 6.7426339591613e-06, + "loss": 0.2709, + "step": 24520 + }, + { + "epoch": 1.1325963103635377, + "grad_norm": 4.977649688720703, + "learning_rate": 6.7424969165410445e-06, + "loss": 0.3625, + "step": 24521 + }, + { + "epoch": 1.1326098752034726, + "grad_norm": 7.268597602844238, + "learning_rate": 6.74235987392079e-06, + "loss": 0.2617, + "step": 24522 + }, + { + "epoch": 1.1326234400434074, + "grad_norm": 4.49696683883667, + "learning_rate": 6.742222831300536e-06, + "loss": 0.2167, + "step": 24523 + }, + { + "epoch": 1.1326370048833423, + "grad_norm": 5.5096564292907715, + "learning_rate": 6.74208578868028e-06, + "loss": 0.1877, + "step": 24524 + }, + { + "epoch": 1.1326505697232774, + "grad_norm": 5.818894863128662, + "learning_rate": 6.741948746060025e-06, + "loss": 0.2681, + "step": 24525 + }, + { + "epoch": 1.1326641345632122, + "grad_norm": 6.2260918617248535, + "learning_rate": 6.74181170343977e-06, + "loss": 0.2875, + "step": 24526 + }, + { + "epoch": 1.132677699403147, + "grad_norm": 5.79215669631958, + "learning_rate": 6.741674660819515e-06, + "loss": 0.2748, + "step": 24527 + }, + { + "epoch": 1.132691264243082, + "grad_norm": 5.198183536529541, + "learning_rate": 6.741537618199261e-06, + "loss": 0.3047, + "step": 24528 + }, + { + "epoch": 1.1327048290830168, + "grad_norm": 5.143947601318359, + "learning_rate": 6.741400575579006e-06, + "loss": 0.1968, + "step": 24529 + }, + { + "epoch": 1.1327183939229517, + "grad_norm": 7.109757423400879, + "learning_rate": 6.74126353295875e-06, + "loss": 0.364, + "step": 24530 + }, + { + "epoch": 1.1327319587628866, + "grad_norm": 5.895130634307861, + "learning_rate": 6.741126490338495e-06, + "loss": 0.292, + "step": 24531 + }, + { + "epoch": 1.1327455236028214, + "grad_norm": 6.544406890869141, + "learning_rate": 6.7409894477182414e-06, + "loss": 0.3266, + "step": 24532 + }, + { + "epoch": 1.1327590884427563, + "grad_norm": 4.672104835510254, + "learning_rate": 6.740852405097987e-06, + "loss": 0.3181, + "step": 24533 + }, + { + "epoch": 1.1327726532826912, + "grad_norm": 6.111632823944092, + "learning_rate": 6.740715362477731e-06, + "loss": 0.301, + "step": 24534 + }, + { + "epoch": 1.132786218122626, + "grad_norm": 4.832149028778076, + "learning_rate": 6.740578319857476e-06, + "loss": 0.2003, + "step": 24535 + }, + { + "epoch": 1.1327997829625611, + "grad_norm": 5.828361511230469, + "learning_rate": 6.740441277237222e-06, + "loss": 0.3341, + "step": 24536 + }, + { + "epoch": 1.132813347802496, + "grad_norm": 5.657273769378662, + "learning_rate": 6.7403042346169665e-06, + "loss": 0.1921, + "step": 24537 + }, + { + "epoch": 1.1328269126424308, + "grad_norm": 6.959346771240234, + "learning_rate": 6.740167191996712e-06, + "loss": 0.3084, + "step": 24538 + }, + { + "epoch": 1.1328404774823657, + "grad_norm": 6.743101596832275, + "learning_rate": 6.740030149376456e-06, + "loss": 0.3702, + "step": 24539 + }, + { + "epoch": 1.1328540423223006, + "grad_norm": 5.491772174835205, + "learning_rate": 6.739893106756201e-06, + "loss": 0.3293, + "step": 24540 + }, + { + "epoch": 1.1328676071622354, + "grad_norm": 5.705541133880615, + "learning_rate": 6.739756064135947e-06, + "loss": 0.2514, + "step": 24541 + }, + { + "epoch": 1.1328811720021703, + "grad_norm": 4.084222316741943, + "learning_rate": 6.739619021515692e-06, + "loss": 0.1574, + "step": 24542 + }, + { + "epoch": 1.1328947368421052, + "grad_norm": 3.814567804336548, + "learning_rate": 6.739481978895437e-06, + "loss": 0.1849, + "step": 24543 + }, + { + "epoch": 1.1329083016820403, + "grad_norm": 6.045793533325195, + "learning_rate": 6.739344936275182e-06, + "loss": 0.356, + "step": 24544 + }, + { + "epoch": 1.1329218665219751, + "grad_norm": 6.03546142578125, + "learning_rate": 6.739207893654928e-06, + "loss": 0.2149, + "step": 24545 + }, + { + "epoch": 1.13293543136191, + "grad_norm": 4.357661247253418, + "learning_rate": 6.739070851034672e-06, + "loss": 0.1681, + "step": 24546 + }, + { + "epoch": 1.1329489962018449, + "grad_norm": 4.659452438354492, + "learning_rate": 6.7389338084144174e-06, + "loss": 0.2274, + "step": 24547 + }, + { + "epoch": 1.1329625610417797, + "grad_norm": 6.015010356903076, + "learning_rate": 6.738796765794163e-06, + "loss": 0.3431, + "step": 24548 + }, + { + "epoch": 1.1329761258817146, + "grad_norm": 4.5239152908325195, + "learning_rate": 6.738659723173908e-06, + "loss": 0.1419, + "step": 24549 + }, + { + "epoch": 1.1329896907216495, + "grad_norm": 5.952988624572754, + "learning_rate": 6.738522680553653e-06, + "loss": 0.2379, + "step": 24550 + }, + { + "epoch": 1.1330032555615843, + "grad_norm": 4.469674587249756, + "learning_rate": 6.738385637933398e-06, + "loss": 0.1742, + "step": 24551 + }, + { + "epoch": 1.1330168204015192, + "grad_norm": 6.196333885192871, + "learning_rate": 6.7382485953131425e-06, + "loss": 0.244, + "step": 24552 + }, + { + "epoch": 1.133030385241454, + "grad_norm": 4.098333835601807, + "learning_rate": 6.738111552692888e-06, + "loss": 0.2284, + "step": 24553 + }, + { + "epoch": 1.133043950081389, + "grad_norm": 4.48177433013916, + "learning_rate": 6.737974510072634e-06, + "loss": 0.2029, + "step": 24554 + }, + { + "epoch": 1.133057514921324, + "grad_norm": 4.91025447845459, + "learning_rate": 6.737837467452378e-06, + "loss": 0.2162, + "step": 24555 + }, + { + "epoch": 1.1330710797612589, + "grad_norm": 6.152061462402344, + "learning_rate": 6.737700424832123e-06, + "loss": 0.3348, + "step": 24556 + }, + { + "epoch": 1.1330846446011937, + "grad_norm": 4.2050628662109375, + "learning_rate": 6.737563382211868e-06, + "loss": 0.3002, + "step": 24557 + }, + { + "epoch": 1.1330982094411286, + "grad_norm": 4.678585052490234, + "learning_rate": 6.7374263395916136e-06, + "loss": 0.2046, + "step": 24558 + }, + { + "epoch": 1.1331117742810635, + "grad_norm": 5.8331990242004395, + "learning_rate": 6.737289296971359e-06, + "loss": 0.3521, + "step": 24559 + }, + { + "epoch": 1.1331253391209983, + "grad_norm": 4.638736248016357, + "learning_rate": 6.737152254351104e-06, + "loss": 0.2641, + "step": 24560 + }, + { + "epoch": 1.1331389039609332, + "grad_norm": 8.206413269042969, + "learning_rate": 6.737015211730848e-06, + "loss": 0.3377, + "step": 24561 + }, + { + "epoch": 1.133152468800868, + "grad_norm": 6.122149467468262, + "learning_rate": 6.736878169110594e-06, + "loss": 0.224, + "step": 24562 + }, + { + "epoch": 1.1331660336408031, + "grad_norm": 6.01931619644165, + "learning_rate": 6.7367411264903395e-06, + "loss": 0.2543, + "step": 24563 + }, + { + "epoch": 1.133179598480738, + "grad_norm": 7.112368106842041, + "learning_rate": 6.736604083870084e-06, + "loss": 0.3019, + "step": 24564 + }, + { + "epoch": 1.1331931633206729, + "grad_norm": 3.787271738052368, + "learning_rate": 6.736467041249829e-06, + "loss": 0.1708, + "step": 24565 + }, + { + "epoch": 1.1332067281606077, + "grad_norm": 8.094949722290039, + "learning_rate": 6.736329998629574e-06, + "loss": 0.4565, + "step": 24566 + }, + { + "epoch": 1.1332202930005426, + "grad_norm": 4.849413871765137, + "learning_rate": 6.73619295600932e-06, + "loss": 0.1863, + "step": 24567 + }, + { + "epoch": 1.1332338578404775, + "grad_norm": 6.916666507720947, + "learning_rate": 6.7360559133890645e-06, + "loss": 0.3616, + "step": 24568 + }, + { + "epoch": 1.1332474226804123, + "grad_norm": 5.876986026763916, + "learning_rate": 6.73591887076881e-06, + "loss": 0.384, + "step": 24569 + }, + { + "epoch": 1.1332609875203472, + "grad_norm": 5.348492622375488, + "learning_rate": 6.735781828148554e-06, + "loss": 0.3176, + "step": 24570 + }, + { + "epoch": 1.133274552360282, + "grad_norm": 6.124715805053711, + "learning_rate": 6.7356447855283e-06, + "loss": 0.3441, + "step": 24571 + }, + { + "epoch": 1.133288117200217, + "grad_norm": 7.316563129425049, + "learning_rate": 6.735507742908045e-06, + "loss": 0.3076, + "step": 24572 + }, + { + "epoch": 1.1333016820401518, + "grad_norm": 5.24206018447876, + "learning_rate": 6.7353707002877895e-06, + "loss": 0.2445, + "step": 24573 + }, + { + "epoch": 1.133315246880087, + "grad_norm": 6.985286712646484, + "learning_rate": 6.735233657667535e-06, + "loss": 0.2836, + "step": 24574 + }, + { + "epoch": 1.1333288117200218, + "grad_norm": 7.342639446258545, + "learning_rate": 6.735096615047281e-06, + "loss": 0.3545, + "step": 24575 + }, + { + "epoch": 1.1333423765599566, + "grad_norm": 6.617759704589844, + "learning_rate": 6.734959572427026e-06, + "loss": 0.4195, + "step": 24576 + }, + { + "epoch": 1.1333559413998915, + "grad_norm": 3.9034488201141357, + "learning_rate": 6.73482252980677e-06, + "loss": 0.1792, + "step": 24577 + }, + { + "epoch": 1.1333695062398264, + "grad_norm": 5.159797668457031, + "learning_rate": 6.7346854871865154e-06, + "loss": 0.3151, + "step": 24578 + }, + { + "epoch": 1.1333830710797612, + "grad_norm": 4.137396812438965, + "learning_rate": 6.73454844456626e-06, + "loss": 0.1441, + "step": 24579 + }, + { + "epoch": 1.133396635919696, + "grad_norm": 5.174007892608643, + "learning_rate": 6.734411401946006e-06, + "loss": 0.2638, + "step": 24580 + }, + { + "epoch": 1.133410200759631, + "grad_norm": 5.6508588790893555, + "learning_rate": 6.734274359325751e-06, + "loss": 0.2916, + "step": 24581 + }, + { + "epoch": 1.133423765599566, + "grad_norm": 7.542194366455078, + "learning_rate": 6.734137316705496e-06, + "loss": 0.3437, + "step": 24582 + }, + { + "epoch": 1.133437330439501, + "grad_norm": 5.47007942199707, + "learning_rate": 6.7340002740852405e-06, + "loss": 0.2643, + "step": 24583 + }, + { + "epoch": 1.1334508952794358, + "grad_norm": 4.177867889404297, + "learning_rate": 6.7338632314649865e-06, + "loss": 0.1675, + "step": 24584 + }, + { + "epoch": 1.1334644601193706, + "grad_norm": 4.836440563201904, + "learning_rate": 6.733726188844732e-06, + "loss": 0.3517, + "step": 24585 + }, + { + "epoch": 1.1334780249593055, + "grad_norm": 3.9483957290649414, + "learning_rate": 6.733589146224476e-06, + "loss": 0.1768, + "step": 24586 + }, + { + "epoch": 1.1334915897992404, + "grad_norm": 5.136405944824219, + "learning_rate": 6.733452103604221e-06, + "loss": 0.1609, + "step": 24587 + }, + { + "epoch": 1.1335051546391752, + "grad_norm": 5.3154377937316895, + "learning_rate": 6.733315060983967e-06, + "loss": 0.2102, + "step": 24588 + }, + { + "epoch": 1.13351871947911, + "grad_norm": 4.081949710845947, + "learning_rate": 6.7331780183637116e-06, + "loss": 0.1405, + "step": 24589 + }, + { + "epoch": 1.133532284319045, + "grad_norm": 4.339193344116211, + "learning_rate": 6.733040975743457e-06, + "loss": 0.1987, + "step": 24590 + }, + { + "epoch": 1.1335458491589798, + "grad_norm": 4.660795211791992, + "learning_rate": 6.732903933123202e-06, + "loss": 0.1755, + "step": 24591 + }, + { + "epoch": 1.1335594139989147, + "grad_norm": 6.262632369995117, + "learning_rate": 6.732766890502947e-06, + "loss": 0.2725, + "step": 24592 + }, + { + "epoch": 1.1335729788388498, + "grad_norm": 4.671637058258057, + "learning_rate": 6.732629847882692e-06, + "loss": 0.1852, + "step": 24593 + }, + { + "epoch": 1.1335865436787846, + "grad_norm": 4.537082195281982, + "learning_rate": 6.7324928052624375e-06, + "loss": 0.2961, + "step": 24594 + }, + { + "epoch": 1.1336001085187195, + "grad_norm": 4.400409698486328, + "learning_rate": 6.732355762642182e-06, + "loss": 0.333, + "step": 24595 + }, + { + "epoch": 1.1336136733586544, + "grad_norm": 6.077645778656006, + "learning_rate": 6.732218720021927e-06, + "loss": 0.3359, + "step": 24596 + }, + { + "epoch": 1.1336272381985892, + "grad_norm": 3.563961982727051, + "learning_rate": 6.732081677401673e-06, + "loss": 0.1642, + "step": 24597 + }, + { + "epoch": 1.133640803038524, + "grad_norm": 5.93308687210083, + "learning_rate": 6.731944634781417e-06, + "loss": 0.3836, + "step": 24598 + }, + { + "epoch": 1.133654367878459, + "grad_norm": 4.256501197814941, + "learning_rate": 6.7318075921611625e-06, + "loss": 0.1637, + "step": 24599 + }, + { + "epoch": 1.1336679327183938, + "grad_norm": 4.835015773773193, + "learning_rate": 6.731670549540908e-06, + "loss": 0.327, + "step": 24600 + }, + { + "epoch": 1.133681497558329, + "grad_norm": 4.035826683044434, + "learning_rate": 6.731533506920654e-06, + "loss": 0.231, + "step": 24601 + }, + { + "epoch": 1.1336950623982638, + "grad_norm": 3.3450021743774414, + "learning_rate": 6.731396464300398e-06, + "loss": 0.2037, + "step": 24602 + }, + { + "epoch": 1.1337086272381987, + "grad_norm": 5.764560699462891, + "learning_rate": 6.731259421680143e-06, + "loss": 0.2256, + "step": 24603 + }, + { + "epoch": 1.1337221920781335, + "grad_norm": 3.937246084213257, + "learning_rate": 6.7311223790598876e-06, + "loss": 0.2387, + "step": 24604 + }, + { + "epoch": 1.1337357569180684, + "grad_norm": 4.35020112991333, + "learning_rate": 6.730985336439634e-06, + "loss": 0.1909, + "step": 24605 + }, + { + "epoch": 1.1337493217580032, + "grad_norm": 4.5412139892578125, + "learning_rate": 6.730848293819379e-06, + "loss": 0.2897, + "step": 24606 + }, + { + "epoch": 1.1337628865979381, + "grad_norm": 4.320355415344238, + "learning_rate": 6.730711251199123e-06, + "loss": 0.1942, + "step": 24607 + }, + { + "epoch": 1.133776451437873, + "grad_norm": 5.277780532836914, + "learning_rate": 6.730574208578868e-06, + "loss": 0.3843, + "step": 24608 + }, + { + "epoch": 1.1337900162778078, + "grad_norm": 4.835923671722412, + "learning_rate": 6.7304371659586134e-06, + "loss": 0.3117, + "step": 24609 + }, + { + "epoch": 1.1338035811177427, + "grad_norm": 6.433157444000244, + "learning_rate": 6.7303001233383595e-06, + "loss": 0.267, + "step": 24610 + }, + { + "epoch": 1.1338171459576776, + "grad_norm": 4.440753936767578, + "learning_rate": 6.730163080718104e-06, + "loss": 0.2261, + "step": 24611 + }, + { + "epoch": 1.1338307107976127, + "grad_norm": 4.527838230133057, + "learning_rate": 6.730026038097849e-06, + "loss": 0.3277, + "step": 24612 + }, + { + "epoch": 1.1338442756375475, + "grad_norm": 5.562602996826172, + "learning_rate": 6.729888995477593e-06, + "loss": 0.1995, + "step": 24613 + }, + { + "epoch": 1.1338578404774824, + "grad_norm": 4.318654537200928, + "learning_rate": 6.729751952857339e-06, + "loss": 0.2407, + "step": 24614 + }, + { + "epoch": 1.1338714053174173, + "grad_norm": 5.74950647354126, + "learning_rate": 6.7296149102370845e-06, + "loss": 0.3211, + "step": 24615 + }, + { + "epoch": 1.1338849701573521, + "grad_norm": 6.762021541595459, + "learning_rate": 6.72947786761683e-06, + "loss": 0.3072, + "step": 24616 + }, + { + "epoch": 1.133898534997287, + "grad_norm": 3.7003490924835205, + "learning_rate": 6.729340824996574e-06, + "loss": 0.3044, + "step": 24617 + }, + { + "epoch": 1.1339120998372219, + "grad_norm": 4.820958614349365, + "learning_rate": 6.72920378237632e-06, + "loss": 0.1866, + "step": 24618 + }, + { + "epoch": 1.1339256646771567, + "grad_norm": 7.282542705535889, + "learning_rate": 6.729066739756065e-06, + "loss": 0.2886, + "step": 24619 + }, + { + "epoch": 1.1339392295170918, + "grad_norm": 4.090280532836914, + "learning_rate": 6.7289296971358096e-06, + "loss": 0.2755, + "step": 24620 + }, + { + "epoch": 1.1339527943570267, + "grad_norm": 6.683995723724365, + "learning_rate": 6.728792654515555e-06, + "loss": 0.3518, + "step": 24621 + }, + { + "epoch": 1.1339663591969615, + "grad_norm": 5.366919994354248, + "learning_rate": 6.728655611895299e-06, + "loss": 0.2052, + "step": 24622 + }, + { + "epoch": 1.1339799240368964, + "grad_norm": 3.8666646480560303, + "learning_rate": 6.728518569275045e-06, + "loss": 0.1534, + "step": 24623 + }, + { + "epoch": 1.1339934888768313, + "grad_norm": 6.267816543579102, + "learning_rate": 6.72838152665479e-06, + "loss": 0.3701, + "step": 24624 + }, + { + "epoch": 1.1340070537167661, + "grad_norm": 5.063384056091309, + "learning_rate": 6.7282444840345355e-06, + "loss": 0.2195, + "step": 24625 + }, + { + "epoch": 1.134020618556701, + "grad_norm": 5.490209579467773, + "learning_rate": 6.72810744141428e-06, + "loss": 0.2708, + "step": 24626 + }, + { + "epoch": 1.1340341833966359, + "grad_norm": 5.207520484924316, + "learning_rate": 6.727970398794026e-06, + "loss": 0.2184, + "step": 24627 + }, + { + "epoch": 1.1340477482365707, + "grad_norm": 4.80913782119751, + "learning_rate": 6.727833356173771e-06, + "loss": 0.2294, + "step": 24628 + }, + { + "epoch": 1.1340613130765056, + "grad_norm": 5.933239936828613, + "learning_rate": 6.727696313553515e-06, + "loss": 0.3156, + "step": 24629 + }, + { + "epoch": 1.1340748779164407, + "grad_norm": 4.189415454864502, + "learning_rate": 6.7275592709332605e-06, + "loss": 0.2461, + "step": 24630 + }, + { + "epoch": 1.1340884427563755, + "grad_norm": 5.235921859741211, + "learning_rate": 6.7274222283130065e-06, + "loss": 0.2698, + "step": 24631 + }, + { + "epoch": 1.1341020075963104, + "grad_norm": 2.9224629402160645, + "learning_rate": 6.727285185692751e-06, + "loss": 0.1224, + "step": 24632 + }, + { + "epoch": 1.1341155724362453, + "grad_norm": 4.469677448272705, + "learning_rate": 6.727148143072496e-06, + "loss": 0.2449, + "step": 24633 + }, + { + "epoch": 1.1341291372761801, + "grad_norm": 5.020409107208252, + "learning_rate": 6.727011100452241e-06, + "loss": 0.2405, + "step": 24634 + }, + { + "epoch": 1.134142702116115, + "grad_norm": 4.224067211151123, + "learning_rate": 6.7268740578319856e-06, + "loss": 0.2165, + "step": 24635 + }, + { + "epoch": 1.1341562669560499, + "grad_norm": 3.7911269664764404, + "learning_rate": 6.726737015211732e-06, + "loss": 0.1984, + "step": 24636 + }, + { + "epoch": 1.1341698317959847, + "grad_norm": 6.318082332611084, + "learning_rate": 6.726599972591477e-06, + "loss": 0.3203, + "step": 24637 + }, + { + "epoch": 1.1341833966359196, + "grad_norm": 3.050462007522583, + "learning_rate": 6.726462929971221e-06, + "loss": 0.0872, + "step": 24638 + }, + { + "epoch": 1.1341969614758547, + "grad_norm": 6.510977268218994, + "learning_rate": 6.726325887350966e-06, + "loss": 0.3052, + "step": 24639 + }, + { + "epoch": 1.1342105263157896, + "grad_norm": 5.049871444702148, + "learning_rate": 6.726188844730712e-06, + "loss": 0.2067, + "step": 24640 + }, + { + "epoch": 1.1342240911557244, + "grad_norm": 5.590602397918701, + "learning_rate": 6.726051802110457e-06, + "loss": 0.268, + "step": 24641 + }, + { + "epoch": 1.1342376559956593, + "grad_norm": 4.650993347167969, + "learning_rate": 6.725914759490202e-06, + "loss": 0.1806, + "step": 24642 + }, + { + "epoch": 1.1342512208355942, + "grad_norm": 6.040164470672607, + "learning_rate": 6.725777716869947e-06, + "loss": 0.2619, + "step": 24643 + }, + { + "epoch": 1.134264785675529, + "grad_norm": 4.599800109863281, + "learning_rate": 6.725640674249693e-06, + "loss": 0.1991, + "step": 24644 + }, + { + "epoch": 1.1342783505154639, + "grad_norm": 3.4318971633911133, + "learning_rate": 6.725503631629437e-06, + "loss": 0.1988, + "step": 24645 + }, + { + "epoch": 1.1342919153553987, + "grad_norm": 4.989955425262451, + "learning_rate": 6.7253665890091825e-06, + "loss": 0.2456, + "step": 24646 + }, + { + "epoch": 1.1343054801953336, + "grad_norm": 4.748991966247559, + "learning_rate": 6.725229546388927e-06, + "loss": 0.1829, + "step": 24647 + }, + { + "epoch": 1.1343190450352685, + "grad_norm": 4.48203182220459, + "learning_rate": 6.725092503768672e-06, + "loss": 0.2145, + "step": 24648 + }, + { + "epoch": 1.1343326098752036, + "grad_norm": 4.348212718963623, + "learning_rate": 6.724955461148418e-06, + "loss": 0.1871, + "step": 24649 + }, + { + "epoch": 1.1343461747151384, + "grad_norm": 4.449604034423828, + "learning_rate": 6.724818418528163e-06, + "loss": 0.1743, + "step": 24650 + }, + { + "epoch": 1.1343597395550733, + "grad_norm": 3.8140041828155518, + "learning_rate": 6.7246813759079076e-06, + "loss": 0.1498, + "step": 24651 + }, + { + "epoch": 1.1343733043950082, + "grad_norm": 4.54962158203125, + "learning_rate": 6.724544333287653e-06, + "loss": 0.1449, + "step": 24652 + }, + { + "epoch": 1.134386869234943, + "grad_norm": 6.183387756347656, + "learning_rate": 6.724407290667399e-06, + "loss": 0.2735, + "step": 24653 + }, + { + "epoch": 1.134400434074878, + "grad_norm": 4.51862096786499, + "learning_rate": 6.724270248047143e-06, + "loss": 0.2287, + "step": 24654 + }, + { + "epoch": 1.1344139989148128, + "grad_norm": 3.5762925148010254, + "learning_rate": 6.724133205426888e-06, + "loss": 0.1205, + "step": 24655 + }, + { + "epoch": 1.1344275637547476, + "grad_norm": 5.931820392608643, + "learning_rate": 6.723996162806633e-06, + "loss": 0.2684, + "step": 24656 + }, + { + "epoch": 1.1344411285946825, + "grad_norm": 6.5512566566467285, + "learning_rate": 6.723859120186379e-06, + "loss": 0.2406, + "step": 24657 + }, + { + "epoch": 1.1344546934346176, + "grad_norm": 4.702846527099609, + "learning_rate": 6.723722077566124e-06, + "loss": 0.2061, + "step": 24658 + }, + { + "epoch": 1.1344682582745524, + "grad_norm": 3.9524505138397217, + "learning_rate": 6.723585034945869e-06, + "loss": 0.1509, + "step": 24659 + }, + { + "epoch": 1.1344818231144873, + "grad_norm": 3.2509360313415527, + "learning_rate": 6.723447992325613e-06, + "loss": 0.1662, + "step": 24660 + }, + { + "epoch": 1.1344953879544222, + "grad_norm": 6.423924446105957, + "learning_rate": 6.723310949705359e-06, + "loss": 0.2938, + "step": 24661 + }, + { + "epoch": 1.134508952794357, + "grad_norm": 4.309535026550293, + "learning_rate": 6.7231739070851045e-06, + "loss": 0.1366, + "step": 24662 + }, + { + "epoch": 1.134522517634292, + "grad_norm": 4.809399604797363, + "learning_rate": 6.723036864464849e-06, + "loss": 0.223, + "step": 24663 + }, + { + "epoch": 1.1345360824742268, + "grad_norm": 3.486143112182617, + "learning_rate": 6.722899821844594e-06, + "loss": 0.1621, + "step": 24664 + }, + { + "epoch": 1.1345496473141616, + "grad_norm": 3.2082512378692627, + "learning_rate": 6.722762779224339e-06, + "loss": 0.1486, + "step": 24665 + }, + { + "epoch": 1.1345632121540965, + "grad_norm": 4.2954487800598145, + "learning_rate": 6.722625736604084e-06, + "loss": 0.1908, + "step": 24666 + }, + { + "epoch": 1.1345767769940314, + "grad_norm": 4.3818254470825195, + "learning_rate": 6.72248869398383e-06, + "loss": 0.2028, + "step": 24667 + }, + { + "epoch": 1.1345903418339665, + "grad_norm": 3.6019909381866455, + "learning_rate": 6.722351651363575e-06, + "loss": 0.1397, + "step": 24668 + }, + { + "epoch": 1.1346039066739013, + "grad_norm": 3.4827771186828613, + "learning_rate": 6.722214608743319e-06, + "loss": 0.1763, + "step": 24669 + }, + { + "epoch": 1.1346174715138362, + "grad_norm": 3.5055794715881348, + "learning_rate": 6.722077566123065e-06, + "loss": 0.1414, + "step": 24670 + }, + { + "epoch": 1.134631036353771, + "grad_norm": 4.3936896324157715, + "learning_rate": 6.72194052350281e-06, + "loss": 0.1636, + "step": 24671 + }, + { + "epoch": 1.134644601193706, + "grad_norm": 4.2482523918151855, + "learning_rate": 6.721803480882555e-06, + "loss": 0.1691, + "step": 24672 + }, + { + "epoch": 1.1346581660336408, + "grad_norm": 3.990917205810547, + "learning_rate": 6.7216664382623e-06, + "loss": 0.1547, + "step": 24673 + }, + { + "epoch": 1.1346717308735756, + "grad_norm": 3.2957029342651367, + "learning_rate": 6.721529395642046e-06, + "loss": 0.1382, + "step": 24674 + }, + { + "epoch": 1.1346852957135105, + "grad_norm": 4.2944769859313965, + "learning_rate": 6.721392353021791e-06, + "loss": 0.2176, + "step": 24675 + }, + { + "epoch": 1.1346988605534454, + "grad_norm": 8.689652442932129, + "learning_rate": 6.721255310401535e-06, + "loss": 0.3685, + "step": 24676 + }, + { + "epoch": 1.1347124253933805, + "grad_norm": 7.103461265563965, + "learning_rate": 6.7211182677812805e-06, + "loss": 0.3357, + "step": 24677 + }, + { + "epoch": 1.1347259902333153, + "grad_norm": 4.370831489562988, + "learning_rate": 6.720981225161025e-06, + "loss": 0.2029, + "step": 24678 + }, + { + "epoch": 1.1347395550732502, + "grad_norm": 6.01340913772583, + "learning_rate": 6.720844182540771e-06, + "loss": 0.321, + "step": 24679 + }, + { + "epoch": 1.134753119913185, + "grad_norm": 3.9899749755859375, + "learning_rate": 6.720707139920516e-06, + "loss": 0.1451, + "step": 24680 + }, + { + "epoch": 1.13476668475312, + "grad_norm": 3.6083178520202637, + "learning_rate": 6.72057009730026e-06, + "loss": 0.2029, + "step": 24681 + }, + { + "epoch": 1.1347802495930548, + "grad_norm": 3.8127238750457764, + "learning_rate": 6.7204330546800056e-06, + "loss": 0.1732, + "step": 24682 + }, + { + "epoch": 1.1347938144329897, + "grad_norm": 5.359292507171631, + "learning_rate": 6.720296012059752e-06, + "loss": 0.1978, + "step": 24683 + }, + { + "epoch": 1.1348073792729245, + "grad_norm": 3.8899786472320557, + "learning_rate": 6.720158969439497e-06, + "loss": 0.216, + "step": 24684 + }, + { + "epoch": 1.1348209441128594, + "grad_norm": 8.008437156677246, + "learning_rate": 6.720021926819241e-06, + "loss": 0.2222, + "step": 24685 + }, + { + "epoch": 1.1348345089527943, + "grad_norm": 6.390561103820801, + "learning_rate": 6.719884884198986e-06, + "loss": 0.264, + "step": 24686 + }, + { + "epoch": 1.1348480737927293, + "grad_norm": 3.9189374446868896, + "learning_rate": 6.719747841578732e-06, + "loss": 0.1652, + "step": 24687 + }, + { + "epoch": 1.1348616386326642, + "grad_norm": 4.53183650970459, + "learning_rate": 6.719610798958477e-06, + "loss": 0.1878, + "step": 24688 + }, + { + "epoch": 1.134875203472599, + "grad_norm": 4.767566680908203, + "learning_rate": 6.719473756338222e-06, + "loss": 0.237, + "step": 24689 + }, + { + "epoch": 1.134888768312534, + "grad_norm": 5.756728649139404, + "learning_rate": 6.719336713717967e-06, + "loss": 0.2768, + "step": 24690 + }, + { + "epoch": 1.1349023331524688, + "grad_norm": 5.611140251159668, + "learning_rate": 6.719199671097711e-06, + "loss": 0.304, + "step": 24691 + }, + { + "epoch": 1.1349158979924037, + "grad_norm": 4.659070014953613, + "learning_rate": 6.719062628477457e-06, + "loss": 0.3059, + "step": 24692 + }, + { + "epoch": 1.1349294628323385, + "grad_norm": 3.6502974033355713, + "learning_rate": 6.7189255858572025e-06, + "loss": 0.2224, + "step": 24693 + }, + { + "epoch": 1.1349430276722734, + "grad_norm": 4.514873504638672, + "learning_rate": 6.718788543236947e-06, + "loss": 0.2268, + "step": 24694 + }, + { + "epoch": 1.1349565925122083, + "grad_norm": 5.765776634216309, + "learning_rate": 6.718651500616692e-06, + "loss": 0.22, + "step": 24695 + }, + { + "epoch": 1.1349701573521433, + "grad_norm": 5.4612908363342285, + "learning_rate": 6.718514457996438e-06, + "loss": 0.2657, + "step": 24696 + }, + { + "epoch": 1.1349837221920782, + "grad_norm": 5.182286262512207, + "learning_rate": 6.718377415376182e-06, + "loss": 0.1829, + "step": 24697 + }, + { + "epoch": 1.134997287032013, + "grad_norm": 6.338034152984619, + "learning_rate": 6.718240372755928e-06, + "loss": 0.2793, + "step": 24698 + }, + { + "epoch": 1.135010851871948, + "grad_norm": 6.079677581787109, + "learning_rate": 6.718103330135673e-06, + "loss": 0.27, + "step": 24699 + }, + { + "epoch": 1.1350244167118828, + "grad_norm": 6.230827331542969, + "learning_rate": 6.717966287515418e-06, + "loss": 0.477, + "step": 24700 + }, + { + "epoch": 1.1350379815518177, + "grad_norm": 5.728384971618652, + "learning_rate": 6.717829244895163e-06, + "loss": 0.2846, + "step": 24701 + }, + { + "epoch": 1.1350515463917525, + "grad_norm": 4.556469440460205, + "learning_rate": 6.717692202274908e-06, + "loss": 0.2593, + "step": 24702 + }, + { + "epoch": 1.1350651112316874, + "grad_norm": 6.149657249450684, + "learning_rate": 6.717555159654653e-06, + "loss": 0.3258, + "step": 24703 + }, + { + "epoch": 1.1350786760716223, + "grad_norm": 6.757744789123535, + "learning_rate": 6.717418117034398e-06, + "loss": 0.3604, + "step": 24704 + }, + { + "epoch": 1.1350922409115571, + "grad_norm": 4.696067810058594, + "learning_rate": 6.717281074414144e-06, + "loss": 0.2288, + "step": 24705 + }, + { + "epoch": 1.1351058057514922, + "grad_norm": 6.561645030975342, + "learning_rate": 6.717144031793888e-06, + "loss": 0.261, + "step": 24706 + }, + { + "epoch": 1.135119370591427, + "grad_norm": 5.602784156799316, + "learning_rate": 6.717006989173633e-06, + "loss": 0.3564, + "step": 24707 + }, + { + "epoch": 1.135132935431362, + "grad_norm": 5.790255069732666, + "learning_rate": 6.7168699465533785e-06, + "loss": 0.231, + "step": 24708 + }, + { + "epoch": 1.1351465002712968, + "grad_norm": 5.645658493041992, + "learning_rate": 6.7167329039331246e-06, + "loss": 0.4413, + "step": 24709 + }, + { + "epoch": 1.1351600651112317, + "grad_norm": 5.578558444976807, + "learning_rate": 6.716595861312869e-06, + "loss": 0.3077, + "step": 24710 + }, + { + "epoch": 1.1351736299511666, + "grad_norm": 5.762629508972168, + "learning_rate": 6.716458818692614e-06, + "loss": 0.2734, + "step": 24711 + }, + { + "epoch": 1.1351871947911014, + "grad_norm": 5.016432285308838, + "learning_rate": 6.716321776072358e-06, + "loss": 0.2673, + "step": 24712 + }, + { + "epoch": 1.1352007596310363, + "grad_norm": 5.382861614227295, + "learning_rate": 6.7161847334521044e-06, + "loss": 0.2512, + "step": 24713 + }, + { + "epoch": 1.1352143244709711, + "grad_norm": 7.8193135261535645, + "learning_rate": 6.71604769083185e-06, + "loss": 0.3305, + "step": 24714 + }, + { + "epoch": 1.1352278893109062, + "grad_norm": 6.067649841308594, + "learning_rate": 6.715910648211594e-06, + "loss": 0.3247, + "step": 24715 + }, + { + "epoch": 1.135241454150841, + "grad_norm": 6.6661295890808105, + "learning_rate": 6.715773605591339e-06, + "loss": 0.2308, + "step": 24716 + }, + { + "epoch": 1.135255018990776, + "grad_norm": 6.426192283630371, + "learning_rate": 6.715636562971084e-06, + "loss": 0.3232, + "step": 24717 + }, + { + "epoch": 1.1352685838307108, + "grad_norm": 6.427330017089844, + "learning_rate": 6.71549952035083e-06, + "loss": 0.2536, + "step": 24718 + }, + { + "epoch": 1.1352821486706457, + "grad_norm": 6.1537065505981445, + "learning_rate": 6.715362477730575e-06, + "loss": 0.4407, + "step": 24719 + }, + { + "epoch": 1.1352957135105806, + "grad_norm": 7.41411828994751, + "learning_rate": 6.71522543511032e-06, + "loss": 0.2541, + "step": 24720 + }, + { + "epoch": 1.1353092783505154, + "grad_norm": 4.713501453399658, + "learning_rate": 6.715088392490064e-06, + "loss": 0.2666, + "step": 24721 + }, + { + "epoch": 1.1353228431904503, + "grad_norm": 8.640312194824219, + "learning_rate": 6.71495134986981e-06, + "loss": 0.3822, + "step": 24722 + }, + { + "epoch": 1.1353364080303852, + "grad_norm": 4.363031387329102, + "learning_rate": 6.714814307249555e-06, + "loss": 0.2113, + "step": 24723 + }, + { + "epoch": 1.13534997287032, + "grad_norm": 7.088017463684082, + "learning_rate": 6.7146772646293005e-06, + "loss": 0.3341, + "step": 24724 + }, + { + "epoch": 1.135363537710255, + "grad_norm": 7.306586265563965, + "learning_rate": 6.714540222009045e-06, + "loss": 0.3616, + "step": 24725 + }, + { + "epoch": 1.13537710255019, + "grad_norm": 4.635222911834717, + "learning_rate": 6.714403179388791e-06, + "loss": 0.2126, + "step": 24726 + }, + { + "epoch": 1.1353906673901248, + "grad_norm": 5.142869472503662, + "learning_rate": 6.714266136768536e-06, + "loss": 0.2993, + "step": 24727 + }, + { + "epoch": 1.1354042322300597, + "grad_norm": 4.918220520019531, + "learning_rate": 6.71412909414828e-06, + "loss": 0.2062, + "step": 24728 + }, + { + "epoch": 1.1354177970699946, + "grad_norm": 5.948256969451904, + "learning_rate": 6.713992051528026e-06, + "loss": 0.3007, + "step": 24729 + }, + { + "epoch": 1.1354313619099294, + "grad_norm": 6.035825729370117, + "learning_rate": 6.713855008907772e-06, + "loss": 0.2932, + "step": 24730 + }, + { + "epoch": 1.1354449267498643, + "grad_norm": 4.8886260986328125, + "learning_rate": 6.713717966287516e-06, + "loss": 0.3364, + "step": 24731 + }, + { + "epoch": 1.1354584915897992, + "grad_norm": 5.799877166748047, + "learning_rate": 6.713580923667261e-06, + "loss": 0.2893, + "step": 24732 + }, + { + "epoch": 1.135472056429734, + "grad_norm": 6.409785270690918, + "learning_rate": 6.713443881047006e-06, + "loss": 0.4026, + "step": 24733 + }, + { + "epoch": 1.1354856212696691, + "grad_norm": 5.4215240478515625, + "learning_rate": 6.713306838426751e-06, + "loss": 0.2345, + "step": 24734 + }, + { + "epoch": 1.135499186109604, + "grad_norm": 4.268222808837891, + "learning_rate": 6.713169795806497e-06, + "loss": 0.171, + "step": 24735 + }, + { + "epoch": 1.1355127509495389, + "grad_norm": 6.936417579650879, + "learning_rate": 6.713032753186242e-06, + "loss": 0.3938, + "step": 24736 + }, + { + "epoch": 1.1355263157894737, + "grad_norm": 6.440553188323975, + "learning_rate": 6.712895710565986e-06, + "loss": 0.3032, + "step": 24737 + }, + { + "epoch": 1.1355398806294086, + "grad_norm": 6.935937404632568, + "learning_rate": 6.712758667945731e-06, + "loss": 0.4171, + "step": 24738 + }, + { + "epoch": 1.1355534454693434, + "grad_norm": 5.725078582763672, + "learning_rate": 6.712621625325477e-06, + "loss": 0.2477, + "step": 24739 + }, + { + "epoch": 1.1355670103092783, + "grad_norm": 5.811747074127197, + "learning_rate": 6.712484582705222e-06, + "loss": 0.4279, + "step": 24740 + }, + { + "epoch": 1.1355805751492132, + "grad_norm": 4.606144905090332, + "learning_rate": 6.712347540084967e-06, + "loss": 0.2494, + "step": 24741 + }, + { + "epoch": 1.135594139989148, + "grad_norm": 5.802875995635986, + "learning_rate": 6.712210497464712e-06, + "loss": 0.3344, + "step": 24742 + }, + { + "epoch": 1.135607704829083, + "grad_norm": 4.910090446472168, + "learning_rate": 6.712073454844458e-06, + "loss": 0.3235, + "step": 24743 + }, + { + "epoch": 1.135621269669018, + "grad_norm": 6.686620712280273, + "learning_rate": 6.7119364122242024e-06, + "loss": 0.3366, + "step": 24744 + }, + { + "epoch": 1.1356348345089529, + "grad_norm": 5.395505905151367, + "learning_rate": 6.711799369603948e-06, + "loss": 0.1971, + "step": 24745 + }, + { + "epoch": 1.1356483993488877, + "grad_norm": 5.667422294616699, + "learning_rate": 6.711662326983692e-06, + "loss": 0.2634, + "step": 24746 + }, + { + "epoch": 1.1356619641888226, + "grad_norm": 4.488865852355957, + "learning_rate": 6.711525284363437e-06, + "loss": 0.2202, + "step": 24747 + }, + { + "epoch": 1.1356755290287575, + "grad_norm": 5.1492791175842285, + "learning_rate": 6.711388241743183e-06, + "loss": 0.2842, + "step": 24748 + }, + { + "epoch": 1.1356890938686923, + "grad_norm": 6.944383144378662, + "learning_rate": 6.7112511991229275e-06, + "loss": 0.323, + "step": 24749 + }, + { + "epoch": 1.1357026587086272, + "grad_norm": 6.106441497802734, + "learning_rate": 6.711114156502673e-06, + "loss": 0.3055, + "step": 24750 + }, + { + "epoch": 1.135716223548562, + "grad_norm": 5.400524616241455, + "learning_rate": 6.710977113882418e-06, + "loss": 0.3209, + "step": 24751 + }, + { + "epoch": 1.135729788388497, + "grad_norm": 5.581520080566406, + "learning_rate": 6.710840071262164e-06, + "loss": 0.2552, + "step": 24752 + }, + { + "epoch": 1.135743353228432, + "grad_norm": 5.6642165184021, + "learning_rate": 6.710703028641908e-06, + "loss": 0.2428, + "step": 24753 + }, + { + "epoch": 1.1357569180683669, + "grad_norm": 5.705813407897949, + "learning_rate": 6.710565986021653e-06, + "loss": 0.3588, + "step": 24754 + }, + { + "epoch": 1.1357704829083017, + "grad_norm": 6.195585250854492, + "learning_rate": 6.710428943401398e-06, + "loss": 0.2995, + "step": 24755 + }, + { + "epoch": 1.1357840477482366, + "grad_norm": 5.801335334777832, + "learning_rate": 6.710291900781144e-06, + "loss": 0.2441, + "step": 24756 + }, + { + "epoch": 1.1357976125881715, + "grad_norm": 7.086206912994385, + "learning_rate": 6.710154858160889e-06, + "loss": 0.2608, + "step": 24757 + }, + { + "epoch": 1.1358111774281063, + "grad_norm": 5.780364513397217, + "learning_rate": 6.710017815540634e-06, + "loss": 0.2515, + "step": 24758 + }, + { + "epoch": 1.1358247422680412, + "grad_norm": 5.720424175262451, + "learning_rate": 6.709880772920378e-06, + "loss": 0.3255, + "step": 24759 + }, + { + "epoch": 1.135838307107976, + "grad_norm": 5.426787376403809, + "learning_rate": 6.709743730300124e-06, + "loss": 0.2875, + "step": 24760 + }, + { + "epoch": 1.135851871947911, + "grad_norm": 6.066822052001953, + "learning_rate": 6.70960668767987e-06, + "loss": 0.3982, + "step": 24761 + }, + { + "epoch": 1.1358654367878458, + "grad_norm": 7.076601505279541, + "learning_rate": 6.709469645059614e-06, + "loss": 0.3994, + "step": 24762 + }, + { + "epoch": 1.1358790016277809, + "grad_norm": 5.919098854064941, + "learning_rate": 6.709332602439359e-06, + "loss": 0.2533, + "step": 24763 + }, + { + "epoch": 1.1358925664677157, + "grad_norm": 4.9688401222229, + "learning_rate": 6.7091955598191035e-06, + "loss": 0.2372, + "step": 24764 + }, + { + "epoch": 1.1359061313076506, + "grad_norm": 5.268527984619141, + "learning_rate": 6.7090585171988495e-06, + "loss": 0.2579, + "step": 24765 + }, + { + "epoch": 1.1359196961475855, + "grad_norm": 7.326930522918701, + "learning_rate": 6.708921474578595e-06, + "loss": 0.3721, + "step": 24766 + }, + { + "epoch": 1.1359332609875203, + "grad_norm": 5.991774559020996, + "learning_rate": 6.70878443195834e-06, + "loss": 0.2546, + "step": 24767 + }, + { + "epoch": 1.1359468258274552, + "grad_norm": 6.37213134765625, + "learning_rate": 6.708647389338084e-06, + "loss": 0.2614, + "step": 24768 + }, + { + "epoch": 1.13596039066739, + "grad_norm": 5.797979354858398, + "learning_rate": 6.70851034671783e-06, + "loss": 0.1852, + "step": 24769 + }, + { + "epoch": 1.135973955507325, + "grad_norm": 4.263006210327148, + "learning_rate": 6.708373304097575e-06, + "loss": 0.1299, + "step": 24770 + }, + { + "epoch": 1.13598752034726, + "grad_norm": 6.438329696655273, + "learning_rate": 6.70823626147732e-06, + "loss": 0.1534, + "step": 24771 + }, + { + "epoch": 1.136001085187195, + "grad_norm": 4.979426383972168, + "learning_rate": 6.708099218857065e-06, + "loss": 0.226, + "step": 24772 + }, + { + "epoch": 1.1360146500271298, + "grad_norm": 5.524205207824707, + "learning_rate": 6.70796217623681e-06, + "loss": 0.2312, + "step": 24773 + }, + { + "epoch": 1.1360282148670646, + "grad_norm": 6.043880939483643, + "learning_rate": 6.707825133616555e-06, + "loss": 0.2674, + "step": 24774 + }, + { + "epoch": 1.1360417797069995, + "grad_norm": 5.497744560241699, + "learning_rate": 6.7076880909963004e-06, + "loss": 0.226, + "step": 24775 + }, + { + "epoch": 1.1360553445469344, + "grad_norm": 4.139581680297852, + "learning_rate": 6.707551048376046e-06, + "loss": 0.2187, + "step": 24776 + }, + { + "epoch": 1.1360689093868692, + "grad_norm": 5.096362113952637, + "learning_rate": 6.70741400575579e-06, + "loss": 0.3605, + "step": 24777 + }, + { + "epoch": 1.136082474226804, + "grad_norm": 6.119281768798828, + "learning_rate": 6.707276963135536e-06, + "loss": 0.3466, + "step": 24778 + }, + { + "epoch": 1.136096039066739, + "grad_norm": 5.552973747253418, + "learning_rate": 6.707139920515281e-06, + "loss": 0.2538, + "step": 24779 + }, + { + "epoch": 1.1361096039066738, + "grad_norm": 4.438815116882324, + "learning_rate": 6.7070028778950255e-06, + "loss": 0.1913, + "step": 24780 + }, + { + "epoch": 1.1361231687466087, + "grad_norm": 4.936628818511963, + "learning_rate": 6.706865835274771e-06, + "loss": 0.257, + "step": 24781 + }, + { + "epoch": 1.1361367335865438, + "grad_norm": 5.138803958892822, + "learning_rate": 6.706728792654517e-06, + "loss": 0.2742, + "step": 24782 + }, + { + "epoch": 1.1361502984264786, + "grad_norm": 6.858241558074951, + "learning_rate": 6.706591750034261e-06, + "loss": 0.3778, + "step": 24783 + }, + { + "epoch": 1.1361638632664135, + "grad_norm": 4.7276105880737305, + "learning_rate": 6.706454707414006e-06, + "loss": 0.1959, + "step": 24784 + }, + { + "epoch": 1.1361774281063484, + "grad_norm": 4.246609687805176, + "learning_rate": 6.706317664793751e-06, + "loss": 0.2309, + "step": 24785 + }, + { + "epoch": 1.1361909929462832, + "grad_norm": 4.225259304046631, + "learning_rate": 6.706180622173496e-06, + "loss": 0.1131, + "step": 24786 + }, + { + "epoch": 1.136204557786218, + "grad_norm": 3.806060314178467, + "learning_rate": 6.706043579553242e-06, + "loss": 0.1744, + "step": 24787 + }, + { + "epoch": 1.136218122626153, + "grad_norm": 4.494873523712158, + "learning_rate": 6.705906536932987e-06, + "loss": 0.1431, + "step": 24788 + }, + { + "epoch": 1.1362316874660878, + "grad_norm": 5.894221305847168, + "learning_rate": 6.705769494312731e-06, + "loss": 0.2562, + "step": 24789 + }, + { + "epoch": 1.136245252306023, + "grad_norm": 4.876832485198975, + "learning_rate": 6.705632451692476e-06, + "loss": 0.2056, + "step": 24790 + }, + { + "epoch": 1.1362588171459578, + "grad_norm": 4.4809675216674805, + "learning_rate": 6.7054954090722224e-06, + "loss": 0.2154, + "step": 24791 + }, + { + "epoch": 1.1362723819858926, + "grad_norm": 4.083518981933594, + "learning_rate": 6.705358366451968e-06, + "loss": 0.1185, + "step": 24792 + }, + { + "epoch": 1.1362859468258275, + "grad_norm": 5.680443286895752, + "learning_rate": 6.705221323831712e-06, + "loss": 0.24, + "step": 24793 + }, + { + "epoch": 1.1362995116657624, + "grad_norm": 6.003540992736816, + "learning_rate": 6.705084281211457e-06, + "loss": 0.2472, + "step": 24794 + }, + { + "epoch": 1.1363130765056972, + "grad_norm": 5.1045427322387695, + "learning_rate": 6.704947238591203e-06, + "loss": 0.2309, + "step": 24795 + }, + { + "epoch": 1.136326641345632, + "grad_norm": 5.1218485832214355, + "learning_rate": 6.7048101959709475e-06, + "loss": 0.1928, + "step": 24796 + }, + { + "epoch": 1.136340206185567, + "grad_norm": 4.386600494384766, + "learning_rate": 6.704673153350693e-06, + "loss": 0.1967, + "step": 24797 + }, + { + "epoch": 1.1363537710255018, + "grad_norm": 5.093369483947754, + "learning_rate": 6.704536110730437e-06, + "loss": 0.2538, + "step": 24798 + }, + { + "epoch": 1.1363673358654367, + "grad_norm": 5.531314373016357, + "learning_rate": 6.704399068110182e-06, + "loss": 0.2634, + "step": 24799 + }, + { + "epoch": 1.1363809007053716, + "grad_norm": 5.886290550231934, + "learning_rate": 6.704262025489928e-06, + "loss": 0.2482, + "step": 24800 + }, + { + "epoch": 1.1363944655453067, + "grad_norm": 6.6894450187683105, + "learning_rate": 6.704124982869673e-06, + "loss": 0.3301, + "step": 24801 + }, + { + "epoch": 1.1364080303852415, + "grad_norm": 4.561834812164307, + "learning_rate": 6.703987940249418e-06, + "loss": 0.2374, + "step": 24802 + }, + { + "epoch": 1.1364215952251764, + "grad_norm": 4.356874465942383, + "learning_rate": 6.703850897629163e-06, + "loss": 0.2157, + "step": 24803 + }, + { + "epoch": 1.1364351600651112, + "grad_norm": 4.570346832275391, + "learning_rate": 6.703713855008909e-06, + "loss": 0.2361, + "step": 24804 + }, + { + "epoch": 1.1364487249050461, + "grad_norm": 8.044197082519531, + "learning_rate": 6.703576812388653e-06, + "loss": 0.3321, + "step": 24805 + }, + { + "epoch": 1.136462289744981, + "grad_norm": 4.594090461730957, + "learning_rate": 6.7034397697683984e-06, + "loss": 0.1852, + "step": 24806 + }, + { + "epoch": 1.1364758545849158, + "grad_norm": 3.7479500770568848, + "learning_rate": 6.703302727148144e-06, + "loss": 0.1459, + "step": 24807 + }, + { + "epoch": 1.1364894194248507, + "grad_norm": 5.515156269073486, + "learning_rate": 6.703165684527889e-06, + "loss": 0.2623, + "step": 24808 + }, + { + "epoch": 1.1365029842647858, + "grad_norm": 7.008712291717529, + "learning_rate": 6.703028641907634e-06, + "loss": 0.2472, + "step": 24809 + }, + { + "epoch": 1.1365165491047207, + "grad_norm": 7.49857234954834, + "learning_rate": 6.702891599287379e-06, + "loss": 0.3853, + "step": 24810 + }, + { + "epoch": 1.1365301139446555, + "grad_norm": 7.436474800109863, + "learning_rate": 6.7027545566671235e-06, + "loss": 0.365, + "step": 24811 + }, + { + "epoch": 1.1365436787845904, + "grad_norm": 6.2684125900268555, + "learning_rate": 6.7026175140468695e-06, + "loss": 0.1741, + "step": 24812 + }, + { + "epoch": 1.1365572436245253, + "grad_norm": 11.12880802154541, + "learning_rate": 6.702480471426615e-06, + "loss": 0.4279, + "step": 24813 + }, + { + "epoch": 1.1365708084644601, + "grad_norm": 5.175714015960693, + "learning_rate": 6.702343428806359e-06, + "loss": 0.2781, + "step": 24814 + }, + { + "epoch": 1.136584373304395, + "grad_norm": 5.162895202636719, + "learning_rate": 6.702206386186104e-06, + "loss": 0.3058, + "step": 24815 + }, + { + "epoch": 1.1365979381443299, + "grad_norm": 8.447796821594238, + "learning_rate": 6.702069343565849e-06, + "loss": 0.3057, + "step": 24816 + }, + { + "epoch": 1.1366115029842647, + "grad_norm": 5.4068779945373535, + "learning_rate": 6.701932300945595e-06, + "loss": 0.2143, + "step": 24817 + }, + { + "epoch": 1.1366250678241996, + "grad_norm": 4.211639881134033, + "learning_rate": 6.70179525832534e-06, + "loss": 0.2337, + "step": 24818 + }, + { + "epoch": 1.1366386326641345, + "grad_norm": 5.798306941986084, + "learning_rate": 6.701658215705085e-06, + "loss": 0.3008, + "step": 24819 + }, + { + "epoch": 1.1366521975040695, + "grad_norm": 3.7052671909332275, + "learning_rate": 6.701521173084829e-06, + "loss": 0.1786, + "step": 24820 + }, + { + "epoch": 1.1366657623440044, + "grad_norm": 6.086941719055176, + "learning_rate": 6.701384130464575e-06, + "loss": 0.2639, + "step": 24821 + }, + { + "epoch": 1.1366793271839393, + "grad_norm": 4.818371772766113, + "learning_rate": 6.7012470878443205e-06, + "loss": 0.2278, + "step": 24822 + }, + { + "epoch": 1.1366928920238741, + "grad_norm": 3.71604585647583, + "learning_rate": 6.701110045224065e-06, + "loss": 0.1901, + "step": 24823 + }, + { + "epoch": 1.136706456863809, + "grad_norm": 3.5549163818359375, + "learning_rate": 6.70097300260381e-06, + "loss": 0.2235, + "step": 24824 + }, + { + "epoch": 1.1367200217037439, + "grad_norm": 8.086297988891602, + "learning_rate": 6.700835959983556e-06, + "loss": 0.2055, + "step": 24825 + }, + { + "epoch": 1.1367335865436787, + "grad_norm": 3.632143259048462, + "learning_rate": 6.700698917363301e-06, + "loss": 0.1727, + "step": 24826 + }, + { + "epoch": 1.1367471513836136, + "grad_norm": 6.486991882324219, + "learning_rate": 6.7005618747430455e-06, + "loss": 0.3142, + "step": 24827 + }, + { + "epoch": 1.1367607162235487, + "grad_norm": 5.8831634521484375, + "learning_rate": 6.700424832122791e-06, + "loss": 0.264, + "step": 24828 + }, + { + "epoch": 1.1367742810634835, + "grad_norm": 5.542427062988281, + "learning_rate": 6.700287789502535e-06, + "loss": 0.2261, + "step": 24829 + }, + { + "epoch": 1.1367878459034184, + "grad_norm": 4.617469310760498, + "learning_rate": 6.700150746882281e-06, + "loss": 0.3518, + "step": 24830 + }, + { + "epoch": 1.1368014107433533, + "grad_norm": 4.546195983886719, + "learning_rate": 6.700013704262026e-06, + "loss": 0.2267, + "step": 24831 + }, + { + "epoch": 1.1368149755832881, + "grad_norm": 3.9195406436920166, + "learning_rate": 6.699876661641771e-06, + "loss": 0.1631, + "step": 24832 + }, + { + "epoch": 1.136828540423223, + "grad_norm": 6.131750106811523, + "learning_rate": 6.699739619021516e-06, + "loss": 0.3332, + "step": 24833 + }, + { + "epoch": 1.1368421052631579, + "grad_norm": 4.3608880043029785, + "learning_rate": 6.699602576401262e-06, + "loss": 0.2422, + "step": 24834 + }, + { + "epoch": 1.1368556701030927, + "grad_norm": 4.145543098449707, + "learning_rate": 6.699465533781007e-06, + "loss": 0.2441, + "step": 24835 + }, + { + "epoch": 1.1368692349430276, + "grad_norm": 4.43846321105957, + "learning_rate": 6.699328491160751e-06, + "loss": 0.198, + "step": 24836 + }, + { + "epoch": 1.1368827997829625, + "grad_norm": 4.005801677703857, + "learning_rate": 6.6991914485404964e-06, + "loss": 0.2427, + "step": 24837 + }, + { + "epoch": 1.1368963646228973, + "grad_norm": 5.467833518981934, + "learning_rate": 6.6990544059202425e-06, + "loss": 0.2504, + "step": 24838 + }, + { + "epoch": 1.1369099294628324, + "grad_norm": 3.4094722270965576, + "learning_rate": 6.698917363299987e-06, + "loss": 0.1688, + "step": 24839 + }, + { + "epoch": 1.1369234943027673, + "grad_norm": 4.459115505218506, + "learning_rate": 6.698780320679732e-06, + "loss": 0.1683, + "step": 24840 + }, + { + "epoch": 1.1369370591427022, + "grad_norm": 4.93614387512207, + "learning_rate": 6.698643278059477e-06, + "loss": 0.1983, + "step": 24841 + }, + { + "epoch": 1.136950623982637, + "grad_norm": 4.7488555908203125, + "learning_rate": 6.6985062354392215e-06, + "loss": 0.2217, + "step": 24842 + }, + { + "epoch": 1.1369641888225719, + "grad_norm": 4.468287944793701, + "learning_rate": 6.6983691928189675e-06, + "loss": 0.1963, + "step": 24843 + }, + { + "epoch": 1.1369777536625068, + "grad_norm": 3.228445529937744, + "learning_rate": 6.698232150198713e-06, + "loss": 0.1492, + "step": 24844 + }, + { + "epoch": 1.1369913185024416, + "grad_norm": 4.693138122558594, + "learning_rate": 6.698095107578457e-06, + "loss": 0.1181, + "step": 24845 + }, + { + "epoch": 1.1370048833423765, + "grad_norm": 5.31215763092041, + "learning_rate": 6.697958064958202e-06, + "loss": 0.3374, + "step": 24846 + }, + { + "epoch": 1.1370184481823116, + "grad_norm": 4.347190856933594, + "learning_rate": 6.697821022337948e-06, + "loss": 0.2916, + "step": 24847 + }, + { + "epoch": 1.1370320130222464, + "grad_norm": 3.974090814590454, + "learning_rate": 6.6976839797176926e-06, + "loss": 0.1818, + "step": 24848 + }, + { + "epoch": 1.1370455778621813, + "grad_norm": 5.208784103393555, + "learning_rate": 6.697546937097438e-06, + "loss": 0.2514, + "step": 24849 + }, + { + "epoch": 1.1370591427021162, + "grad_norm": 4.335105895996094, + "learning_rate": 6.697409894477183e-06, + "loss": 0.2046, + "step": 24850 + }, + { + "epoch": 1.137072707542051, + "grad_norm": 5.125560283660889, + "learning_rate": 6.697272851856929e-06, + "loss": 0.203, + "step": 24851 + }, + { + "epoch": 1.137086272381986, + "grad_norm": 4.201898574829102, + "learning_rate": 6.697135809236673e-06, + "loss": 0.1907, + "step": 24852 + }, + { + "epoch": 1.1370998372219208, + "grad_norm": 4.783193588256836, + "learning_rate": 6.6969987666164185e-06, + "loss": 0.2469, + "step": 24853 + }, + { + "epoch": 1.1371134020618556, + "grad_norm": 5.892005920410156, + "learning_rate": 6.696861723996163e-06, + "loss": 0.2542, + "step": 24854 + }, + { + "epoch": 1.1371269669017905, + "grad_norm": 5.425517559051514, + "learning_rate": 6.696724681375908e-06, + "loss": 0.2222, + "step": 24855 + }, + { + "epoch": 1.1371405317417254, + "grad_norm": 6.768357753753662, + "learning_rate": 6.696587638755654e-06, + "loss": 0.2402, + "step": 24856 + }, + { + "epoch": 1.1371540965816602, + "grad_norm": 4.908622741699219, + "learning_rate": 6.696450596135398e-06, + "loss": 0.2706, + "step": 24857 + }, + { + "epoch": 1.1371676614215953, + "grad_norm": 5.8636651039123535, + "learning_rate": 6.6963135535151435e-06, + "loss": 0.2436, + "step": 24858 + }, + { + "epoch": 1.1371812262615302, + "grad_norm": 5.169277667999268, + "learning_rate": 6.696176510894889e-06, + "loss": 0.2252, + "step": 24859 + }, + { + "epoch": 1.137194791101465, + "grad_norm": 4.889118194580078, + "learning_rate": 6.696039468274635e-06, + "loss": 0.2219, + "step": 24860 + }, + { + "epoch": 1.1372083559414, + "grad_norm": 5.28305721282959, + "learning_rate": 6.695902425654379e-06, + "loss": 0.2057, + "step": 24861 + }, + { + "epoch": 1.1372219207813348, + "grad_norm": 4.098102569580078, + "learning_rate": 6.695765383034124e-06, + "loss": 0.1694, + "step": 24862 + }, + { + "epoch": 1.1372354856212696, + "grad_norm": 4.535760402679443, + "learning_rate": 6.6956283404138686e-06, + "loss": 0.1723, + "step": 24863 + }, + { + "epoch": 1.1372490504612045, + "grad_norm": 5.031314849853516, + "learning_rate": 6.695491297793615e-06, + "loss": 0.2248, + "step": 24864 + }, + { + "epoch": 1.1372626153011394, + "grad_norm": 5.423238277435303, + "learning_rate": 6.69535425517336e-06, + "loss": 0.3571, + "step": 24865 + }, + { + "epoch": 1.1372761801410745, + "grad_norm": 5.8426289558410645, + "learning_rate": 6.695217212553105e-06, + "loss": 0.2866, + "step": 24866 + }, + { + "epoch": 1.1372897449810093, + "grad_norm": 4.258948802947998, + "learning_rate": 6.695080169932849e-06, + "loss": 0.1929, + "step": 24867 + }, + { + "epoch": 1.1373033098209442, + "grad_norm": 4.512214183807373, + "learning_rate": 6.6949431273125944e-06, + "loss": 0.1814, + "step": 24868 + }, + { + "epoch": 1.137316874660879, + "grad_norm": 5.602160930633545, + "learning_rate": 6.6948060846923405e-06, + "loss": 0.3193, + "step": 24869 + }, + { + "epoch": 1.137330439500814, + "grad_norm": 4.866863250732422, + "learning_rate": 6.694669042072085e-06, + "loss": 0.1579, + "step": 24870 + }, + { + "epoch": 1.1373440043407488, + "grad_norm": 5.461337089538574, + "learning_rate": 6.69453199945183e-06, + "loss": 0.2414, + "step": 24871 + }, + { + "epoch": 1.1373575691806836, + "grad_norm": 4.601106643676758, + "learning_rate": 6.694394956831574e-06, + "loss": 0.2241, + "step": 24872 + }, + { + "epoch": 1.1373711340206185, + "grad_norm": 4.143078804016113, + "learning_rate": 6.69425791421132e-06, + "loss": 0.2138, + "step": 24873 + }, + { + "epoch": 1.1373846988605534, + "grad_norm": 3.8241026401519775, + "learning_rate": 6.6941208715910655e-06, + "loss": 0.1252, + "step": 24874 + }, + { + "epoch": 1.1373982637004882, + "grad_norm": 4.849249839782715, + "learning_rate": 6.693983828970811e-06, + "loss": 0.1776, + "step": 24875 + }, + { + "epoch": 1.137411828540423, + "grad_norm": 4.599403381347656, + "learning_rate": 6.693846786350555e-06, + "loss": 0.1815, + "step": 24876 + }, + { + "epoch": 1.1374253933803582, + "grad_norm": 6.277479648590088, + "learning_rate": 6.693709743730301e-06, + "loss": 0.25, + "step": 24877 + }, + { + "epoch": 1.137438958220293, + "grad_norm": 3.6960837841033936, + "learning_rate": 6.693572701110046e-06, + "loss": 0.1459, + "step": 24878 + }, + { + "epoch": 1.137452523060228, + "grad_norm": 6.73312520980835, + "learning_rate": 6.6934356584897906e-06, + "loss": 0.4107, + "step": 24879 + }, + { + "epoch": 1.1374660879001628, + "grad_norm": 5.552457332611084, + "learning_rate": 6.693298615869536e-06, + "loss": 0.2721, + "step": 24880 + }, + { + "epoch": 1.1374796527400977, + "grad_norm": 3.6308209896087646, + "learning_rate": 6.693161573249282e-06, + "loss": 0.1745, + "step": 24881 + }, + { + "epoch": 1.1374932175800325, + "grad_norm": 4.281924724578857, + "learning_rate": 6.693024530629026e-06, + "loss": 0.205, + "step": 24882 + }, + { + "epoch": 1.1375067824199674, + "grad_norm": 5.263824462890625, + "learning_rate": 6.692887488008771e-06, + "loss": 0.2754, + "step": 24883 + }, + { + "epoch": 1.1375203472599023, + "grad_norm": 4.935898780822754, + "learning_rate": 6.6927504453885165e-06, + "loss": 0.2087, + "step": 24884 + }, + { + "epoch": 1.1375339120998373, + "grad_norm": 9.58460521697998, + "learning_rate": 6.692613402768261e-06, + "loss": 0.3093, + "step": 24885 + }, + { + "epoch": 1.1375474769397722, + "grad_norm": 3.439577102661133, + "learning_rate": 6.692476360148007e-06, + "loss": 0.1115, + "step": 24886 + }, + { + "epoch": 1.137561041779707, + "grad_norm": 3.06807017326355, + "learning_rate": 6.692339317527752e-06, + "loss": 0.1848, + "step": 24887 + }, + { + "epoch": 1.137574606619642, + "grad_norm": 7.859187602996826, + "learning_rate": 6.692202274907496e-06, + "loss": 0.3775, + "step": 24888 + }, + { + "epoch": 1.1375881714595768, + "grad_norm": 3.980673313140869, + "learning_rate": 6.6920652322872415e-06, + "loss": 0.161, + "step": 24889 + }, + { + "epoch": 1.1376017362995117, + "grad_norm": 7.4334588050842285, + "learning_rate": 6.6919281896669875e-06, + "loss": 0.2558, + "step": 24890 + }, + { + "epoch": 1.1376153011394465, + "grad_norm": 4.8734025955200195, + "learning_rate": 6.691791147046732e-06, + "loss": 0.1858, + "step": 24891 + }, + { + "epoch": 1.1376288659793814, + "grad_norm": 3.7661967277526855, + "learning_rate": 6.691654104426477e-06, + "loss": 0.1581, + "step": 24892 + }, + { + "epoch": 1.1376424308193163, + "grad_norm": 5.694671630859375, + "learning_rate": 6.691517061806222e-06, + "loss": 0.2564, + "step": 24893 + }, + { + "epoch": 1.1376559956592511, + "grad_norm": 5.574660301208496, + "learning_rate": 6.691380019185968e-06, + "loss": 0.2204, + "step": 24894 + }, + { + "epoch": 1.137669560499186, + "grad_norm": 6.640030384063721, + "learning_rate": 6.691242976565713e-06, + "loss": 0.2775, + "step": 24895 + }, + { + "epoch": 1.137683125339121, + "grad_norm": 4.317025661468506, + "learning_rate": 6.691105933945458e-06, + "loss": 0.1922, + "step": 24896 + }, + { + "epoch": 1.137696690179056, + "grad_norm": 5.8909807205200195, + "learning_rate": 6.690968891325202e-06, + "loss": 0.2607, + "step": 24897 + }, + { + "epoch": 1.1377102550189908, + "grad_norm": 3.752358913421631, + "learning_rate": 6.690831848704947e-06, + "loss": 0.171, + "step": 24898 + }, + { + "epoch": 1.1377238198589257, + "grad_norm": 7.514520645141602, + "learning_rate": 6.690694806084693e-06, + "loss": 0.34, + "step": 24899 + }, + { + "epoch": 1.1377373846988605, + "grad_norm": 6.096351146697998, + "learning_rate": 6.6905577634644385e-06, + "loss": 0.232, + "step": 24900 + }, + { + "epoch": 1.1377509495387954, + "grad_norm": 5.510687828063965, + "learning_rate": 6.690420720844183e-06, + "loss": 0.1896, + "step": 24901 + }, + { + "epoch": 1.1377645143787303, + "grad_norm": 6.0174031257629395, + "learning_rate": 6.690283678223928e-06, + "loss": 0.2863, + "step": 24902 + }, + { + "epoch": 1.1377780792186651, + "grad_norm": 5.441205978393555, + "learning_rate": 6.690146635603674e-06, + "loss": 0.2617, + "step": 24903 + }, + { + "epoch": 1.1377916440586002, + "grad_norm": 5.338944435119629, + "learning_rate": 6.690009592983418e-06, + "loss": 0.2733, + "step": 24904 + }, + { + "epoch": 1.137805208898535, + "grad_norm": 6.267441272735596, + "learning_rate": 6.6898725503631635e-06, + "loss": 0.2614, + "step": 24905 + }, + { + "epoch": 1.13781877373847, + "grad_norm": 5.444217205047607, + "learning_rate": 6.689735507742908e-06, + "loss": 0.3178, + "step": 24906 + }, + { + "epoch": 1.1378323385784048, + "grad_norm": 6.333775997161865, + "learning_rate": 6.689598465122654e-06, + "loss": 0.2904, + "step": 24907 + }, + { + "epoch": 1.1378459034183397, + "grad_norm": 4.7991156578063965, + "learning_rate": 6.689461422502399e-06, + "loss": 0.1767, + "step": 24908 + }, + { + "epoch": 1.1378594682582746, + "grad_norm": 5.496982097625732, + "learning_rate": 6.689324379882144e-06, + "loss": 0.1321, + "step": 24909 + }, + { + "epoch": 1.1378730330982094, + "grad_norm": 4.975369930267334, + "learning_rate": 6.6891873372618886e-06, + "loss": 0.1555, + "step": 24910 + }, + { + "epoch": 1.1378865979381443, + "grad_norm": 4.894527435302734, + "learning_rate": 6.689050294641634e-06, + "loss": 0.1499, + "step": 24911 + }, + { + "epoch": 1.1379001627780791, + "grad_norm": 5.006767749786377, + "learning_rate": 6.68891325202138e-06, + "loss": 0.1592, + "step": 24912 + }, + { + "epoch": 1.137913727618014, + "grad_norm": 3.130051851272583, + "learning_rate": 6.688776209401124e-06, + "loss": 0.1435, + "step": 24913 + }, + { + "epoch": 1.1379272924579489, + "grad_norm": 4.794466018676758, + "learning_rate": 6.688639166780869e-06, + "loss": 0.1948, + "step": 24914 + }, + { + "epoch": 1.137940857297884, + "grad_norm": 4.586139678955078, + "learning_rate": 6.6885021241606145e-06, + "loss": 0.1745, + "step": 24915 + }, + { + "epoch": 1.1379544221378188, + "grad_norm": 6.705498695373535, + "learning_rate": 6.68836508154036e-06, + "loss": 0.323, + "step": 24916 + }, + { + "epoch": 1.1379679869777537, + "grad_norm": 7.776618480682373, + "learning_rate": 6.688228038920105e-06, + "loss": 0.2162, + "step": 24917 + }, + { + "epoch": 1.1379815518176886, + "grad_norm": 4.565197944641113, + "learning_rate": 6.68809099629985e-06, + "loss": 0.1969, + "step": 24918 + }, + { + "epoch": 1.1379951166576234, + "grad_norm": 5.242753982543945, + "learning_rate": 6.687953953679594e-06, + "loss": 0.1911, + "step": 24919 + }, + { + "epoch": 1.1380086814975583, + "grad_norm": 5.200182914733887, + "learning_rate": 6.68781691105934e-06, + "loss": 0.2964, + "step": 24920 + }, + { + "epoch": 1.1380222463374932, + "grad_norm": 5.558789253234863, + "learning_rate": 6.6876798684390855e-06, + "loss": 0.2082, + "step": 24921 + }, + { + "epoch": 1.138035811177428, + "grad_norm": 6.16582727432251, + "learning_rate": 6.68754282581883e-06, + "loss": 0.2611, + "step": 24922 + }, + { + "epoch": 1.1380493760173631, + "grad_norm": 6.9712233543396, + "learning_rate": 6.687405783198575e-06, + "loss": 0.2766, + "step": 24923 + }, + { + "epoch": 1.138062940857298, + "grad_norm": 6.4110589027404785, + "learning_rate": 6.68726874057832e-06, + "loss": 0.3261, + "step": 24924 + }, + { + "epoch": 1.1380765056972328, + "grad_norm": 6.995294094085693, + "learning_rate": 6.687131697958065e-06, + "loss": 0.4558, + "step": 24925 + }, + { + "epoch": 1.1380900705371677, + "grad_norm": 6.974069595336914, + "learning_rate": 6.686994655337811e-06, + "loss": 0.2487, + "step": 24926 + }, + { + "epoch": 1.1381036353771026, + "grad_norm": 5.520399570465088, + "learning_rate": 6.686857612717556e-06, + "loss": 0.2694, + "step": 24927 + }, + { + "epoch": 1.1381172002170374, + "grad_norm": 7.224693775177002, + "learning_rate": 6.6867205700973e-06, + "loss": 0.4354, + "step": 24928 + }, + { + "epoch": 1.1381307650569723, + "grad_norm": 7.9560346603393555, + "learning_rate": 6.686583527477046e-06, + "loss": 0.4953, + "step": 24929 + }, + { + "epoch": 1.1381443298969072, + "grad_norm": 5.803241729736328, + "learning_rate": 6.686446484856791e-06, + "loss": 0.3556, + "step": 24930 + }, + { + "epoch": 1.138157894736842, + "grad_norm": 4.44739294052124, + "learning_rate": 6.686309442236536e-06, + "loss": 0.2134, + "step": 24931 + }, + { + "epoch": 1.138171459576777, + "grad_norm": 8.091711044311523, + "learning_rate": 6.686172399616281e-06, + "loss": 0.4854, + "step": 24932 + }, + { + "epoch": 1.1381850244167118, + "grad_norm": 6.117409706115723, + "learning_rate": 6.686035356996027e-06, + "loss": 0.3139, + "step": 24933 + }, + { + "epoch": 1.1381985892566469, + "grad_norm": 5.4598283767700195, + "learning_rate": 6.685898314375772e-06, + "loss": 0.3195, + "step": 24934 + }, + { + "epoch": 1.1382121540965817, + "grad_norm": 5.0816450119018555, + "learning_rate": 6.685761271755516e-06, + "loss": 0.3322, + "step": 24935 + }, + { + "epoch": 1.1382257189365166, + "grad_norm": 6.724269390106201, + "learning_rate": 6.6856242291352615e-06, + "loss": 0.4492, + "step": 24936 + }, + { + "epoch": 1.1382392837764514, + "grad_norm": 6.276268482208252, + "learning_rate": 6.685487186515006e-06, + "loss": 0.2762, + "step": 24937 + }, + { + "epoch": 1.1382528486163863, + "grad_norm": 6.6365885734558105, + "learning_rate": 6.685350143894752e-06, + "loss": 0.3578, + "step": 24938 + }, + { + "epoch": 1.1382664134563212, + "grad_norm": 7.397587299346924, + "learning_rate": 6.685213101274497e-06, + "loss": 0.3441, + "step": 24939 + }, + { + "epoch": 1.138279978296256, + "grad_norm": 8.31375789642334, + "learning_rate": 6.685076058654241e-06, + "loss": 0.5163, + "step": 24940 + }, + { + "epoch": 1.138293543136191, + "grad_norm": 7.718229293823242, + "learning_rate": 6.6849390160339866e-06, + "loss": 0.4032, + "step": 24941 + }, + { + "epoch": 1.138307107976126, + "grad_norm": 7.363899230957031, + "learning_rate": 6.684801973413733e-06, + "loss": 0.3129, + "step": 24942 + }, + { + "epoch": 1.1383206728160609, + "grad_norm": 5.433279037475586, + "learning_rate": 6.684664930793478e-06, + "loss": 0.3143, + "step": 24943 + }, + { + "epoch": 1.1383342376559957, + "grad_norm": 8.653237342834473, + "learning_rate": 6.684527888173222e-06, + "loss": 0.3575, + "step": 24944 + }, + { + "epoch": 1.1383478024959306, + "grad_norm": 5.376644611358643, + "learning_rate": 6.684390845552967e-06, + "loss": 0.2027, + "step": 24945 + }, + { + "epoch": 1.1383613673358655, + "grad_norm": 7.177366256713867, + "learning_rate": 6.684253802932713e-06, + "loss": 0.3575, + "step": 24946 + }, + { + "epoch": 1.1383749321758003, + "grad_norm": 6.638662815093994, + "learning_rate": 6.684116760312458e-06, + "loss": 0.3318, + "step": 24947 + }, + { + "epoch": 1.1383884970157352, + "grad_norm": 6.946285247802734, + "learning_rate": 6.683979717692203e-06, + "loss": 0.4958, + "step": 24948 + }, + { + "epoch": 1.13840206185567, + "grad_norm": 6.159787654876709, + "learning_rate": 6.683842675071948e-06, + "loss": 0.4476, + "step": 24949 + }, + { + "epoch": 1.138415626695605, + "grad_norm": 4.5478057861328125, + "learning_rate": 6.683705632451693e-06, + "loss": 0.3489, + "step": 24950 + }, + { + "epoch": 1.1384291915355398, + "grad_norm": 7.035491943359375, + "learning_rate": 6.683568589831438e-06, + "loss": 0.403, + "step": 24951 + }, + { + "epoch": 1.1384427563754747, + "grad_norm": 7.033321380615234, + "learning_rate": 6.6834315472111835e-06, + "loss": 0.46, + "step": 24952 + }, + { + "epoch": 1.1384563212154097, + "grad_norm": 7.768707275390625, + "learning_rate": 6.683294504590928e-06, + "loss": 0.35, + "step": 24953 + }, + { + "epoch": 1.1384698860553446, + "grad_norm": 5.071011543273926, + "learning_rate": 6.683157461970673e-06, + "loss": 0.3081, + "step": 24954 + }, + { + "epoch": 1.1384834508952795, + "grad_norm": 6.815948963165283, + "learning_rate": 6.683020419350419e-06, + "loss": 0.2593, + "step": 24955 + }, + { + "epoch": 1.1384970157352143, + "grad_norm": 7.814328670501709, + "learning_rate": 6.682883376730163e-06, + "loss": 0.5611, + "step": 24956 + }, + { + "epoch": 1.1385105805751492, + "grad_norm": 8.45602035522461, + "learning_rate": 6.682746334109909e-06, + "loss": 0.4211, + "step": 24957 + }, + { + "epoch": 1.138524145415084, + "grad_norm": 4.907031536102295, + "learning_rate": 6.682609291489654e-06, + "loss": 0.1964, + "step": 24958 + }, + { + "epoch": 1.138537710255019, + "grad_norm": 4.631201267242432, + "learning_rate": 6.6824722488694e-06, + "loss": 0.1679, + "step": 24959 + }, + { + "epoch": 1.1385512750949538, + "grad_norm": 6.639462947845459, + "learning_rate": 6.682335206249144e-06, + "loss": 0.3886, + "step": 24960 + }, + { + "epoch": 1.1385648399348889, + "grad_norm": 7.192506313323975, + "learning_rate": 6.682198163628889e-06, + "loss": 0.36, + "step": 24961 + }, + { + "epoch": 1.1385784047748237, + "grad_norm": 6.774115085601807, + "learning_rate": 6.682061121008634e-06, + "loss": 0.3253, + "step": 24962 + }, + { + "epoch": 1.1385919696147586, + "grad_norm": 5.857845783233643, + "learning_rate": 6.68192407838838e-06, + "loss": 0.3195, + "step": 24963 + }, + { + "epoch": 1.1386055344546935, + "grad_norm": 6.6102070808410645, + "learning_rate": 6.681787035768125e-06, + "loss": 0.2421, + "step": 24964 + }, + { + "epoch": 1.1386190992946283, + "grad_norm": 5.256532669067383, + "learning_rate": 6.681649993147869e-06, + "loss": 0.1954, + "step": 24965 + }, + { + "epoch": 1.1386326641345632, + "grad_norm": 4.58199405670166, + "learning_rate": 6.681512950527614e-06, + "loss": 0.2813, + "step": 24966 + }, + { + "epoch": 1.138646228974498, + "grad_norm": 5.765466690063477, + "learning_rate": 6.6813759079073595e-06, + "loss": 0.3158, + "step": 24967 + }, + { + "epoch": 1.138659793814433, + "grad_norm": 6.392180919647217, + "learning_rate": 6.6812388652871056e-06, + "loss": 0.319, + "step": 24968 + }, + { + "epoch": 1.1386733586543678, + "grad_norm": 7.942780017852783, + "learning_rate": 6.68110182266685e-06, + "loss": 0.2767, + "step": 24969 + }, + { + "epoch": 1.1386869234943027, + "grad_norm": 4.644534111022949, + "learning_rate": 6.680964780046595e-06, + "loss": 0.2317, + "step": 24970 + }, + { + "epoch": 1.1387004883342375, + "grad_norm": 5.9509124755859375, + "learning_rate": 6.680827737426339e-06, + "loss": 0.3007, + "step": 24971 + }, + { + "epoch": 1.1387140531741726, + "grad_norm": 5.715335845947266, + "learning_rate": 6.6806906948060854e-06, + "loss": 0.2583, + "step": 24972 + }, + { + "epoch": 1.1387276180141075, + "grad_norm": 5.504538059234619, + "learning_rate": 6.680553652185831e-06, + "loss": 0.2773, + "step": 24973 + }, + { + "epoch": 1.1387411828540424, + "grad_norm": 5.979341983795166, + "learning_rate": 6.680416609565575e-06, + "loss": 0.2012, + "step": 24974 + }, + { + "epoch": 1.1387547476939772, + "grad_norm": 4.199277400970459, + "learning_rate": 6.68027956694532e-06, + "loss": 0.1699, + "step": 24975 + }, + { + "epoch": 1.138768312533912, + "grad_norm": 6.197634220123291, + "learning_rate": 6.680142524325066e-06, + "loss": 0.3297, + "step": 24976 + }, + { + "epoch": 1.138781877373847, + "grad_norm": 3.570730447769165, + "learning_rate": 6.680005481704811e-06, + "loss": 0.1855, + "step": 24977 + }, + { + "epoch": 1.1387954422137818, + "grad_norm": 5.247792720794678, + "learning_rate": 6.679868439084556e-06, + "loss": 0.244, + "step": 24978 + }, + { + "epoch": 1.1388090070537167, + "grad_norm": 5.967462062835693, + "learning_rate": 6.679731396464301e-06, + "loss": 0.4167, + "step": 24979 + }, + { + "epoch": 1.1388225718936518, + "grad_norm": 4.935985088348389, + "learning_rate": 6.679594353844045e-06, + "loss": 0.3293, + "step": 24980 + }, + { + "epoch": 1.1388361367335866, + "grad_norm": 5.787511348724365, + "learning_rate": 6.679457311223791e-06, + "loss": 0.3444, + "step": 24981 + }, + { + "epoch": 1.1388497015735215, + "grad_norm": 5.0605692863464355, + "learning_rate": 6.679320268603536e-06, + "loss": 0.2105, + "step": 24982 + }, + { + "epoch": 1.1388632664134564, + "grad_norm": 6.223255634307861, + "learning_rate": 6.6791832259832815e-06, + "loss": 0.339, + "step": 24983 + }, + { + "epoch": 1.1388768312533912, + "grad_norm": 6.987968921661377, + "learning_rate": 6.679046183363026e-06, + "loss": 0.2955, + "step": 24984 + }, + { + "epoch": 1.138890396093326, + "grad_norm": 5.284503936767578, + "learning_rate": 6.678909140742772e-06, + "loss": 0.216, + "step": 24985 + }, + { + "epoch": 1.138903960933261, + "grad_norm": 6.565377235412598, + "learning_rate": 6.678772098122517e-06, + "loss": 0.442, + "step": 24986 + }, + { + "epoch": 1.1389175257731958, + "grad_norm": 4.467360019683838, + "learning_rate": 6.678635055502261e-06, + "loss": 0.269, + "step": 24987 + }, + { + "epoch": 1.1389310906131307, + "grad_norm": 6.081229209899902, + "learning_rate": 6.678498012882007e-06, + "loss": 0.3238, + "step": 24988 + }, + { + "epoch": 1.1389446554530656, + "grad_norm": 4.657712936401367, + "learning_rate": 6.678360970261753e-06, + "loss": 0.2871, + "step": 24989 + }, + { + "epoch": 1.1389582202930004, + "grad_norm": 5.066798210144043, + "learning_rate": 6.678223927641497e-06, + "loss": 0.2826, + "step": 24990 + }, + { + "epoch": 1.1389717851329355, + "grad_norm": 6.15156888961792, + "learning_rate": 6.678086885021242e-06, + "loss": 0.3396, + "step": 24991 + }, + { + "epoch": 1.1389853499728704, + "grad_norm": 4.060020446777344, + "learning_rate": 6.677949842400987e-06, + "loss": 0.2002, + "step": 24992 + }, + { + "epoch": 1.1389989148128052, + "grad_norm": 5.367007255554199, + "learning_rate": 6.677812799780732e-06, + "loss": 0.2181, + "step": 24993 + }, + { + "epoch": 1.13901247965274, + "grad_norm": 4.460413932800293, + "learning_rate": 6.677675757160478e-06, + "loss": 0.2319, + "step": 24994 + }, + { + "epoch": 1.139026044492675, + "grad_norm": 6.310938358306885, + "learning_rate": 6.677538714540223e-06, + "loss": 0.2019, + "step": 24995 + }, + { + "epoch": 1.1390396093326098, + "grad_norm": 5.831290245056152, + "learning_rate": 6.677401671919967e-06, + "loss": 0.2928, + "step": 24996 + }, + { + "epoch": 1.1390531741725447, + "grad_norm": 5.30805778503418, + "learning_rate": 6.677264629299712e-06, + "loss": 0.315, + "step": 24997 + }, + { + "epoch": 1.1390667390124796, + "grad_norm": 5.157884120941162, + "learning_rate": 6.677127586679458e-06, + "loss": 0.2974, + "step": 24998 + }, + { + "epoch": 1.1390803038524147, + "grad_norm": 3.8335349559783936, + "learning_rate": 6.676990544059203e-06, + "loss": 0.1788, + "step": 24999 + }, + { + "epoch": 1.1390938686923495, + "grad_norm": 4.863603115081787, + "learning_rate": 6.676853501438948e-06, + "loss": 0.2036, + "step": 25000 + }, + { + "epoch": 1.1391074335322844, + "grad_norm": 6.166608810424805, + "learning_rate": 6.676716458818693e-06, + "loss": 0.2206, + "step": 25001 + }, + { + "epoch": 1.1391209983722193, + "grad_norm": 4.674095153808594, + "learning_rate": 6.676579416198439e-06, + "loss": 0.2615, + "step": 25002 + }, + { + "epoch": 1.1391345632121541, + "grad_norm": 5.863070964813232, + "learning_rate": 6.6764423735781834e-06, + "loss": 0.3398, + "step": 25003 + }, + { + "epoch": 1.139148128052089, + "grad_norm": 5.166396617889404, + "learning_rate": 6.676305330957929e-06, + "loss": 0.2036, + "step": 25004 + }, + { + "epoch": 1.1391616928920238, + "grad_norm": 5.119977951049805, + "learning_rate": 6.676168288337673e-06, + "loss": 0.2302, + "step": 25005 + }, + { + "epoch": 1.1391752577319587, + "grad_norm": 5.580619812011719, + "learning_rate": 6.676031245717418e-06, + "loss": 0.2747, + "step": 25006 + }, + { + "epoch": 1.1391888225718936, + "grad_norm": 5.0411224365234375, + "learning_rate": 6.675894203097164e-06, + "loss": 0.2176, + "step": 25007 + }, + { + "epoch": 1.1392023874118284, + "grad_norm": 5.473425388336182, + "learning_rate": 6.675757160476909e-06, + "loss": 0.2776, + "step": 25008 + }, + { + "epoch": 1.1392159522517633, + "grad_norm": 7.68508243560791, + "learning_rate": 6.675620117856654e-06, + "loss": 0.3983, + "step": 25009 + }, + { + "epoch": 1.1392295170916984, + "grad_norm": 5.692746639251709, + "learning_rate": 6.675483075236399e-06, + "loss": 0.2508, + "step": 25010 + }, + { + "epoch": 1.1392430819316333, + "grad_norm": 4.971034049987793, + "learning_rate": 6.675346032616145e-06, + "loss": 0.2053, + "step": 25011 + }, + { + "epoch": 1.1392566467715681, + "grad_norm": 4.838288307189941, + "learning_rate": 6.675208989995889e-06, + "loss": 0.2033, + "step": 25012 + }, + { + "epoch": 1.139270211611503, + "grad_norm": 4.427626132965088, + "learning_rate": 6.675071947375634e-06, + "loss": 0.2173, + "step": 25013 + }, + { + "epoch": 1.1392837764514379, + "grad_norm": 7.989931583404541, + "learning_rate": 6.674934904755379e-06, + "loss": 0.2922, + "step": 25014 + }, + { + "epoch": 1.1392973412913727, + "grad_norm": 4.150510787963867, + "learning_rate": 6.674797862135125e-06, + "loss": 0.1924, + "step": 25015 + }, + { + "epoch": 1.1393109061313076, + "grad_norm": 4.864771366119385, + "learning_rate": 6.67466081951487e-06, + "loss": 0.1899, + "step": 25016 + }, + { + "epoch": 1.1393244709712425, + "grad_norm": 7.6954216957092285, + "learning_rate": 6.674523776894615e-06, + "loss": 0.3362, + "step": 25017 + }, + { + "epoch": 1.1393380358111775, + "grad_norm": 3.71073579788208, + "learning_rate": 6.674386734274359e-06, + "loss": 0.2146, + "step": 25018 + }, + { + "epoch": 1.1393516006511124, + "grad_norm": 5.94108247756958, + "learning_rate": 6.6742496916541054e-06, + "loss": 0.28, + "step": 25019 + }, + { + "epoch": 1.1393651654910473, + "grad_norm": 4.803084373474121, + "learning_rate": 6.674112649033851e-06, + "loss": 0.2222, + "step": 25020 + }, + { + "epoch": 1.1393787303309821, + "grad_norm": 5.505316257476807, + "learning_rate": 6.673975606413595e-06, + "loss": 0.2739, + "step": 25021 + }, + { + "epoch": 1.139392295170917, + "grad_norm": 4.80969762802124, + "learning_rate": 6.67383856379334e-06, + "loss": 0.3268, + "step": 25022 + }, + { + "epoch": 1.1394058600108519, + "grad_norm": 5.737409591674805, + "learning_rate": 6.673701521173085e-06, + "loss": 0.2887, + "step": 25023 + }, + { + "epoch": 1.1394194248507867, + "grad_norm": 4.6941022872924805, + "learning_rate": 6.6735644785528305e-06, + "loss": 0.184, + "step": 25024 + }, + { + "epoch": 1.1394329896907216, + "grad_norm": 5.84567403793335, + "learning_rate": 6.673427435932576e-06, + "loss": 0.2364, + "step": 25025 + }, + { + "epoch": 1.1394465545306565, + "grad_norm": 6.290832996368408, + "learning_rate": 6.673290393312321e-06, + "loss": 0.2999, + "step": 25026 + }, + { + "epoch": 1.1394601193705913, + "grad_norm": 5.382156848907471, + "learning_rate": 6.673153350692065e-06, + "loss": 0.2428, + "step": 25027 + }, + { + "epoch": 1.1394736842105262, + "grad_norm": 5.364434719085693, + "learning_rate": 6.673016308071811e-06, + "loss": 0.2401, + "step": 25028 + }, + { + "epoch": 1.1394872490504613, + "grad_norm": 7.561607360839844, + "learning_rate": 6.672879265451556e-06, + "loss": 0.4653, + "step": 25029 + }, + { + "epoch": 1.1395008138903961, + "grad_norm": 6.676835060119629, + "learning_rate": 6.672742222831301e-06, + "loss": 0.3743, + "step": 25030 + }, + { + "epoch": 1.139514378730331, + "grad_norm": 5.643223285675049, + "learning_rate": 6.672605180211046e-06, + "loss": 0.3064, + "step": 25031 + }, + { + "epoch": 1.1395279435702659, + "grad_norm": 5.868358135223389, + "learning_rate": 6.672468137590792e-06, + "loss": 0.2399, + "step": 25032 + }, + { + "epoch": 1.1395415084102007, + "grad_norm": 5.818440914154053, + "learning_rate": 6.672331094970536e-06, + "loss": 0.266, + "step": 25033 + }, + { + "epoch": 1.1395550732501356, + "grad_norm": 5.5156569480896, + "learning_rate": 6.6721940523502814e-06, + "loss": 0.3483, + "step": 25034 + }, + { + "epoch": 1.1395686380900705, + "grad_norm": 7.747203350067139, + "learning_rate": 6.672057009730027e-06, + "loss": 0.2194, + "step": 25035 + }, + { + "epoch": 1.1395822029300053, + "grad_norm": 4.119243621826172, + "learning_rate": 6.671919967109771e-06, + "loss": 0.2017, + "step": 25036 + }, + { + "epoch": 1.1395957677699404, + "grad_norm": 5.010721683502197, + "learning_rate": 6.671782924489517e-06, + "loss": 0.2256, + "step": 25037 + }, + { + "epoch": 1.1396093326098753, + "grad_norm": 4.496058940887451, + "learning_rate": 6.671645881869262e-06, + "loss": 0.2671, + "step": 25038 + }, + { + "epoch": 1.1396228974498102, + "grad_norm": 6.491872310638428, + "learning_rate": 6.6715088392490065e-06, + "loss": 0.4622, + "step": 25039 + }, + { + "epoch": 1.139636462289745, + "grad_norm": 4.9005208015441895, + "learning_rate": 6.671371796628752e-06, + "loss": 0.2356, + "step": 25040 + }, + { + "epoch": 1.1396500271296799, + "grad_norm": 5.34556770324707, + "learning_rate": 6.671234754008498e-06, + "loss": 0.2955, + "step": 25041 + }, + { + "epoch": 1.1396635919696148, + "grad_norm": 5.057069301605225, + "learning_rate": 6.671097711388243e-06, + "loss": 0.2329, + "step": 25042 + }, + { + "epoch": 1.1396771568095496, + "grad_norm": 6.1105570793151855, + "learning_rate": 6.670960668767987e-06, + "loss": 0.2941, + "step": 25043 + }, + { + "epoch": 1.1396907216494845, + "grad_norm": 4.762988090515137, + "learning_rate": 6.670823626147732e-06, + "loss": 0.2052, + "step": 25044 + }, + { + "epoch": 1.1397042864894193, + "grad_norm": 5.323309898376465, + "learning_rate": 6.670686583527478e-06, + "loss": 0.2785, + "step": 25045 + }, + { + "epoch": 1.1397178513293542, + "grad_norm": 4.801548957824707, + "learning_rate": 6.670549540907223e-06, + "loss": 0.2037, + "step": 25046 + }, + { + "epoch": 1.139731416169289, + "grad_norm": 6.2269768714904785, + "learning_rate": 6.670412498286968e-06, + "loss": 0.3263, + "step": 25047 + }, + { + "epoch": 1.1397449810092242, + "grad_norm": 5.448215961456299, + "learning_rate": 6.670275455666712e-06, + "loss": 0.2446, + "step": 25048 + }, + { + "epoch": 1.139758545849159, + "grad_norm": 6.952003479003906, + "learning_rate": 6.6701384130464574e-06, + "loss": 0.3703, + "step": 25049 + }, + { + "epoch": 1.139772110689094, + "grad_norm": 3.6865599155426025, + "learning_rate": 6.6700013704262035e-06, + "loss": 0.1615, + "step": 25050 + }, + { + "epoch": 1.1397856755290288, + "grad_norm": 6.464969158172607, + "learning_rate": 6.669864327805949e-06, + "loss": 0.2931, + "step": 25051 + }, + { + "epoch": 1.1397992403689636, + "grad_norm": 6.962814807891846, + "learning_rate": 6.669727285185693e-06, + "loss": 0.2783, + "step": 25052 + }, + { + "epoch": 1.1398128052088985, + "grad_norm": 5.45904016494751, + "learning_rate": 6.669590242565438e-06, + "loss": 0.238, + "step": 25053 + }, + { + "epoch": 1.1398263700488334, + "grad_norm": 5.569969177246094, + "learning_rate": 6.669453199945184e-06, + "loss": 0.3445, + "step": 25054 + }, + { + "epoch": 1.1398399348887682, + "grad_norm": 6.290180683135986, + "learning_rate": 6.6693161573249285e-06, + "loss": 0.3721, + "step": 25055 + }, + { + "epoch": 1.1398534997287033, + "grad_norm": 6.171215534210205, + "learning_rate": 6.669179114704674e-06, + "loss": 0.3383, + "step": 25056 + }, + { + "epoch": 1.1398670645686382, + "grad_norm": 7.024298667907715, + "learning_rate": 6.669042072084419e-06, + "loss": 0.3032, + "step": 25057 + }, + { + "epoch": 1.139880629408573, + "grad_norm": 5.141828536987305, + "learning_rate": 6.668905029464164e-06, + "loss": 0.2761, + "step": 25058 + }, + { + "epoch": 1.139894194248508, + "grad_norm": 7.446351528167725, + "learning_rate": 6.668767986843909e-06, + "loss": 0.3505, + "step": 25059 + }, + { + "epoch": 1.1399077590884428, + "grad_norm": 5.892561435699463, + "learning_rate": 6.668630944223654e-06, + "loss": 0.3327, + "step": 25060 + }, + { + "epoch": 1.1399213239283776, + "grad_norm": 6.508265972137451, + "learning_rate": 6.668493901603399e-06, + "loss": 0.3555, + "step": 25061 + }, + { + "epoch": 1.1399348887683125, + "grad_norm": 4.771716117858887, + "learning_rate": 6.668356858983144e-06, + "loss": 0.3296, + "step": 25062 + }, + { + "epoch": 1.1399484536082474, + "grad_norm": 5.5695295333862305, + "learning_rate": 6.66821981636289e-06, + "loss": 0.2988, + "step": 25063 + }, + { + "epoch": 1.1399620184481822, + "grad_norm": 4.807321548461914, + "learning_rate": 6.668082773742634e-06, + "loss": 0.2755, + "step": 25064 + }, + { + "epoch": 1.139975583288117, + "grad_norm": 6.050497055053711, + "learning_rate": 6.6679457311223794e-06, + "loss": 0.3712, + "step": 25065 + }, + { + "epoch": 1.139989148128052, + "grad_norm": 5.1072258949279785, + "learning_rate": 6.667808688502125e-06, + "loss": 0.2634, + "step": 25066 + }, + { + "epoch": 1.140002712967987, + "grad_norm": 5.001743316650391, + "learning_rate": 6.66767164588187e-06, + "loss": 0.2501, + "step": 25067 + }, + { + "epoch": 1.140016277807922, + "grad_norm": 5.7403411865234375, + "learning_rate": 6.667534603261615e-06, + "loss": 0.3241, + "step": 25068 + }, + { + "epoch": 1.1400298426478568, + "grad_norm": 4.751776695251465, + "learning_rate": 6.66739756064136e-06, + "loss": 0.1658, + "step": 25069 + }, + { + "epoch": 1.1400434074877916, + "grad_norm": 6.807158946990967, + "learning_rate": 6.6672605180211045e-06, + "loss": 0.3811, + "step": 25070 + }, + { + "epoch": 1.1400569723277265, + "grad_norm": 6.75884485244751, + "learning_rate": 6.6671234754008505e-06, + "loss": 0.4663, + "step": 25071 + }, + { + "epoch": 1.1400705371676614, + "grad_norm": 5.502066135406494, + "learning_rate": 6.666986432780596e-06, + "loss": 0.3845, + "step": 25072 + }, + { + "epoch": 1.1400841020075962, + "grad_norm": 6.415555953979492, + "learning_rate": 6.66684939016034e-06, + "loss": 0.3848, + "step": 25073 + }, + { + "epoch": 1.140097666847531, + "grad_norm": 6.597604751586914, + "learning_rate": 6.666712347540085e-06, + "loss": 0.3904, + "step": 25074 + }, + { + "epoch": 1.1401112316874662, + "grad_norm": 4.578174591064453, + "learning_rate": 6.66657530491983e-06, + "loss": 0.2863, + "step": 25075 + }, + { + "epoch": 1.140124796527401, + "grad_norm": 6.81781005859375, + "learning_rate": 6.666438262299576e-06, + "loss": 0.3245, + "step": 25076 + }, + { + "epoch": 1.140138361367336, + "grad_norm": 6.592738151550293, + "learning_rate": 6.666301219679321e-06, + "loss": 0.3034, + "step": 25077 + }, + { + "epoch": 1.1401519262072708, + "grad_norm": 5.485284805297852, + "learning_rate": 6.666164177059066e-06, + "loss": 0.3488, + "step": 25078 + }, + { + "epoch": 1.1401654910472057, + "grad_norm": 5.888227939605713, + "learning_rate": 6.66602713443881e-06, + "loss": 0.3047, + "step": 25079 + }, + { + "epoch": 1.1401790558871405, + "grad_norm": 6.3875813484191895, + "learning_rate": 6.665890091818556e-06, + "loss": 0.3808, + "step": 25080 + }, + { + "epoch": 1.1401926207270754, + "grad_norm": 5.321523189544678, + "learning_rate": 6.6657530491983015e-06, + "loss": 0.333, + "step": 25081 + }, + { + "epoch": 1.1402061855670103, + "grad_norm": 3.9238860607147217, + "learning_rate": 6.665616006578046e-06, + "loss": 0.2796, + "step": 25082 + }, + { + "epoch": 1.1402197504069451, + "grad_norm": 7.349414348602295, + "learning_rate": 6.665478963957791e-06, + "loss": 0.39, + "step": 25083 + }, + { + "epoch": 1.14023331524688, + "grad_norm": 5.365331649780273, + "learning_rate": 6.665341921337537e-06, + "loss": 0.3416, + "step": 25084 + }, + { + "epoch": 1.1402468800868149, + "grad_norm": 4.956948757171631, + "learning_rate": 6.665204878717282e-06, + "loss": 0.2947, + "step": 25085 + }, + { + "epoch": 1.14026044492675, + "grad_norm": 5.265831470489502, + "learning_rate": 6.6650678360970265e-06, + "loss": 0.235, + "step": 25086 + }, + { + "epoch": 1.1402740097666848, + "grad_norm": 4.263955116271973, + "learning_rate": 6.664930793476772e-06, + "loss": 0.2967, + "step": 25087 + }, + { + "epoch": 1.1402875746066197, + "grad_norm": 7.935487270355225, + "learning_rate": 6.664793750856518e-06, + "loss": 0.5753, + "step": 25088 + }, + { + "epoch": 1.1403011394465545, + "grad_norm": 4.62083625793457, + "learning_rate": 6.664656708236262e-06, + "loss": 0.4983, + "step": 25089 + }, + { + "epoch": 1.1403147042864894, + "grad_norm": 5.83228063583374, + "learning_rate": 6.664519665616007e-06, + "loss": 0.3565, + "step": 25090 + }, + { + "epoch": 1.1403282691264243, + "grad_norm": 5.173681259155273, + "learning_rate": 6.664382622995752e-06, + "loss": 0.2658, + "step": 25091 + }, + { + "epoch": 1.1403418339663591, + "grad_norm": 6.602966785430908, + "learning_rate": 6.664245580375497e-06, + "loss": 0.3902, + "step": 25092 + }, + { + "epoch": 1.140355398806294, + "grad_norm": 5.669600009918213, + "learning_rate": 6.664108537755243e-06, + "loss": 0.3292, + "step": 25093 + }, + { + "epoch": 1.140368963646229, + "grad_norm": 6.020624160766602, + "learning_rate": 6.663971495134988e-06, + "loss": 0.3651, + "step": 25094 + }, + { + "epoch": 1.140382528486164, + "grad_norm": 4.645676136016846, + "learning_rate": 6.663834452514732e-06, + "loss": 0.2897, + "step": 25095 + }, + { + "epoch": 1.1403960933260988, + "grad_norm": 4.911983966827393, + "learning_rate": 6.6636974098944774e-06, + "loss": 0.2543, + "step": 25096 + }, + { + "epoch": 1.1404096581660337, + "grad_norm": 7.052407264709473, + "learning_rate": 6.6635603672742235e-06, + "loss": 0.5004, + "step": 25097 + }, + { + "epoch": 1.1404232230059685, + "grad_norm": 5.5470733642578125, + "learning_rate": 6.663423324653968e-06, + "loss": 0.3776, + "step": 25098 + }, + { + "epoch": 1.1404367878459034, + "grad_norm": 7.533116817474365, + "learning_rate": 6.663286282033713e-06, + "loss": 0.3508, + "step": 25099 + }, + { + "epoch": 1.1404503526858383, + "grad_norm": 5.561464786529541, + "learning_rate": 6.663149239413458e-06, + "loss": 0.3206, + "step": 25100 + }, + { + "epoch": 1.1404639175257731, + "grad_norm": 5.640242099761963, + "learning_rate": 6.663012196793204e-06, + "loss": 0.3539, + "step": 25101 + }, + { + "epoch": 1.140477482365708, + "grad_norm": 4.93075704574585, + "learning_rate": 6.6628751541729485e-06, + "loss": 0.3399, + "step": 25102 + }, + { + "epoch": 1.1404910472056429, + "grad_norm": 5.447000026702881, + "learning_rate": 6.662738111552694e-06, + "loss": 0.2497, + "step": 25103 + }, + { + "epoch": 1.140504612045578, + "grad_norm": 6.218041896820068, + "learning_rate": 6.662601068932438e-06, + "loss": 0.3243, + "step": 25104 + }, + { + "epoch": 1.1405181768855128, + "grad_norm": 5.682421684265137, + "learning_rate": 6.662464026312183e-06, + "loss": 0.3395, + "step": 25105 + }, + { + "epoch": 1.1405317417254477, + "grad_norm": 4.917265892028809, + "learning_rate": 6.662326983691929e-06, + "loss": 0.2635, + "step": 25106 + }, + { + "epoch": 1.1405453065653826, + "grad_norm": 5.784316539764404, + "learning_rate": 6.6621899410716736e-06, + "loss": 0.2325, + "step": 25107 + }, + { + "epoch": 1.1405588714053174, + "grad_norm": 4.604664325714111, + "learning_rate": 6.662052898451419e-06, + "loss": 0.2172, + "step": 25108 + }, + { + "epoch": 1.1405724362452523, + "grad_norm": 6.055941104888916, + "learning_rate": 6.661915855831164e-06, + "loss": 0.3221, + "step": 25109 + }, + { + "epoch": 1.1405860010851872, + "grad_norm": 5.511246204376221, + "learning_rate": 6.66177881321091e-06, + "loss": 0.3839, + "step": 25110 + }, + { + "epoch": 1.140599565925122, + "grad_norm": 4.944194793701172, + "learning_rate": 6.661641770590654e-06, + "loss": 0.3268, + "step": 25111 + }, + { + "epoch": 1.1406131307650569, + "grad_norm": 6.563167095184326, + "learning_rate": 6.6615047279703995e-06, + "loss": 0.3958, + "step": 25112 + }, + { + "epoch": 1.140626695604992, + "grad_norm": 5.533684253692627, + "learning_rate": 6.661367685350144e-06, + "loss": 0.2487, + "step": 25113 + }, + { + "epoch": 1.1406402604449268, + "grad_norm": 5.951749801635742, + "learning_rate": 6.66123064272989e-06, + "loss": 0.2576, + "step": 25114 + }, + { + "epoch": 1.1406538252848617, + "grad_norm": 5.894894123077393, + "learning_rate": 6.661093600109635e-06, + "loss": 0.2966, + "step": 25115 + }, + { + "epoch": 1.1406673901247966, + "grad_norm": 6.318056106567383, + "learning_rate": 6.660956557489379e-06, + "loss": 0.3815, + "step": 25116 + }, + { + "epoch": 1.1406809549647314, + "grad_norm": 4.253845691680908, + "learning_rate": 6.6608195148691245e-06, + "loss": 0.2348, + "step": 25117 + }, + { + "epoch": 1.1406945198046663, + "grad_norm": 3.732468366622925, + "learning_rate": 6.66068247224887e-06, + "loss": 0.169, + "step": 25118 + }, + { + "epoch": 1.1407080846446012, + "grad_norm": 4.290294647216797, + "learning_rate": 6.660545429628616e-06, + "loss": 0.1706, + "step": 25119 + }, + { + "epoch": 1.140721649484536, + "grad_norm": 4.908522129058838, + "learning_rate": 6.66040838700836e-06, + "loss": 0.2131, + "step": 25120 + }, + { + "epoch": 1.140735214324471, + "grad_norm": 5.250179290771484, + "learning_rate": 6.660271344388105e-06, + "loss": 0.2256, + "step": 25121 + }, + { + "epoch": 1.1407487791644058, + "grad_norm": 5.8920183181762695, + "learning_rate": 6.6601343017678496e-06, + "loss": 0.3936, + "step": 25122 + }, + { + "epoch": 1.1407623440043408, + "grad_norm": 5.635417461395264, + "learning_rate": 6.659997259147596e-06, + "loss": 0.2334, + "step": 25123 + }, + { + "epoch": 1.1407759088442757, + "grad_norm": 6.061125755310059, + "learning_rate": 6.659860216527341e-06, + "loss": 0.3577, + "step": 25124 + }, + { + "epoch": 1.1407894736842106, + "grad_norm": 4.657090663909912, + "learning_rate": 6.659723173907086e-06, + "loss": 0.2222, + "step": 25125 + }, + { + "epoch": 1.1408030385241454, + "grad_norm": 6.563113689422607, + "learning_rate": 6.65958613128683e-06, + "loss": 0.3517, + "step": 25126 + }, + { + "epoch": 1.1408166033640803, + "grad_norm": 5.36037540435791, + "learning_rate": 6.659449088666576e-06, + "loss": 0.2536, + "step": 25127 + }, + { + "epoch": 1.1408301682040152, + "grad_norm": 4.034061431884766, + "learning_rate": 6.6593120460463215e-06, + "loss": 0.2231, + "step": 25128 + }, + { + "epoch": 1.14084373304395, + "grad_norm": 5.674409866333008, + "learning_rate": 6.659175003426066e-06, + "loss": 0.268, + "step": 25129 + }, + { + "epoch": 1.140857297883885, + "grad_norm": 6.362708568572998, + "learning_rate": 6.659037960805811e-06, + "loss": 0.282, + "step": 25130 + }, + { + "epoch": 1.1408708627238198, + "grad_norm": 6.455906391143799, + "learning_rate": 6.658900918185555e-06, + "loss": 0.2706, + "step": 25131 + }, + { + "epoch": 1.1408844275637549, + "grad_norm": 4.858194828033447, + "learning_rate": 6.658763875565301e-06, + "loss": 0.225, + "step": 25132 + }, + { + "epoch": 1.1408979924036897, + "grad_norm": 5.199099540710449, + "learning_rate": 6.6586268329450465e-06, + "loss": 0.3236, + "step": 25133 + }, + { + "epoch": 1.1409115572436246, + "grad_norm": 5.059612274169922, + "learning_rate": 6.658489790324792e-06, + "loss": 0.2937, + "step": 25134 + }, + { + "epoch": 1.1409251220835595, + "grad_norm": 5.526127815246582, + "learning_rate": 6.658352747704536e-06, + "loss": 0.2249, + "step": 25135 + }, + { + "epoch": 1.1409386869234943, + "grad_norm": 5.040726184844971, + "learning_rate": 6.658215705084282e-06, + "loss": 0.248, + "step": 25136 + }, + { + "epoch": 1.1409522517634292, + "grad_norm": 6.494205951690674, + "learning_rate": 6.658078662464027e-06, + "loss": 0.3016, + "step": 25137 + }, + { + "epoch": 1.140965816603364, + "grad_norm": 4.972610950469971, + "learning_rate": 6.6579416198437716e-06, + "loss": 0.2118, + "step": 25138 + }, + { + "epoch": 1.140979381443299, + "grad_norm": 3.8743972778320312, + "learning_rate": 6.657804577223517e-06, + "loss": 0.1376, + "step": 25139 + }, + { + "epoch": 1.1409929462832338, + "grad_norm": 6.02208948135376, + "learning_rate": 6.657667534603263e-06, + "loss": 0.3448, + "step": 25140 + }, + { + "epoch": 1.1410065111231686, + "grad_norm": 7.252449035644531, + "learning_rate": 6.657530491983007e-06, + "loss": 0.3442, + "step": 25141 + }, + { + "epoch": 1.1410200759631037, + "grad_norm": 7.196688175201416, + "learning_rate": 6.657393449362752e-06, + "loss": 0.2849, + "step": 25142 + }, + { + "epoch": 1.1410336408030386, + "grad_norm": 4.16107702255249, + "learning_rate": 6.6572564067424975e-06, + "loss": 0.2061, + "step": 25143 + }, + { + "epoch": 1.1410472056429735, + "grad_norm": 4.435701847076416, + "learning_rate": 6.657119364122242e-06, + "loss": 0.1442, + "step": 25144 + }, + { + "epoch": 1.1410607704829083, + "grad_norm": 4.539188861846924, + "learning_rate": 6.656982321501988e-06, + "loss": 0.2259, + "step": 25145 + }, + { + "epoch": 1.1410743353228432, + "grad_norm": 6.134788513183594, + "learning_rate": 6.656845278881733e-06, + "loss": 0.2041, + "step": 25146 + }, + { + "epoch": 1.141087900162778, + "grad_norm": 5.75288724899292, + "learning_rate": 6.656708236261477e-06, + "loss": 0.2942, + "step": 25147 + }, + { + "epoch": 1.141101465002713, + "grad_norm": 7.485331058502197, + "learning_rate": 6.6565711936412225e-06, + "loss": 0.4191, + "step": 25148 + }, + { + "epoch": 1.1411150298426478, + "grad_norm": 5.544140338897705, + "learning_rate": 6.6564341510209685e-06, + "loss": 0.2937, + "step": 25149 + }, + { + "epoch": 1.1411285946825827, + "grad_norm": 8.271286010742188, + "learning_rate": 6.656297108400714e-06, + "loss": 0.2695, + "step": 25150 + }, + { + "epoch": 1.1411421595225177, + "grad_norm": 4.303177833557129, + "learning_rate": 6.656160065780458e-06, + "loss": 0.2076, + "step": 25151 + }, + { + "epoch": 1.1411557243624526, + "grad_norm": 6.098893642425537, + "learning_rate": 6.656023023160203e-06, + "loss": 0.3052, + "step": 25152 + }, + { + "epoch": 1.1411692892023875, + "grad_norm": 5.352764129638672, + "learning_rate": 6.655885980539949e-06, + "loss": 0.2687, + "step": 25153 + }, + { + "epoch": 1.1411828540423223, + "grad_norm": 5.9042744636535645, + "learning_rate": 6.655748937919694e-06, + "loss": 0.2447, + "step": 25154 + }, + { + "epoch": 1.1411964188822572, + "grad_norm": 6.18787145614624, + "learning_rate": 6.655611895299439e-06, + "loss": 0.4639, + "step": 25155 + }, + { + "epoch": 1.141209983722192, + "grad_norm": 5.626189231872559, + "learning_rate": 6.655474852679183e-06, + "loss": 0.2392, + "step": 25156 + }, + { + "epoch": 1.141223548562127, + "grad_norm": 6.632299423217773, + "learning_rate": 6.655337810058928e-06, + "loss": 0.3076, + "step": 25157 + }, + { + "epoch": 1.1412371134020618, + "grad_norm": 4.120667934417725, + "learning_rate": 6.655200767438674e-06, + "loss": 0.3198, + "step": 25158 + }, + { + "epoch": 1.1412506782419967, + "grad_norm": 5.506535053253174, + "learning_rate": 6.6550637248184195e-06, + "loss": 0.2883, + "step": 25159 + }, + { + "epoch": 1.1412642430819315, + "grad_norm": 7.659650802612305, + "learning_rate": 6.654926682198164e-06, + "loss": 0.4588, + "step": 25160 + }, + { + "epoch": 1.1412778079218666, + "grad_norm": 5.422214984893799, + "learning_rate": 6.654789639577909e-06, + "loss": 0.3043, + "step": 25161 + }, + { + "epoch": 1.1412913727618015, + "grad_norm": 6.300477504730225, + "learning_rate": 6.654652596957655e-06, + "loss": 0.3612, + "step": 25162 + }, + { + "epoch": 1.1413049376017363, + "grad_norm": 5.9104814529418945, + "learning_rate": 6.654515554337399e-06, + "loss": 0.4834, + "step": 25163 + }, + { + "epoch": 1.1413185024416712, + "grad_norm": 6.56032133102417, + "learning_rate": 6.6543785117171445e-06, + "loss": 0.4141, + "step": 25164 + }, + { + "epoch": 1.141332067281606, + "grad_norm": 4.359024524688721, + "learning_rate": 6.65424146909689e-06, + "loss": 0.2012, + "step": 25165 + }, + { + "epoch": 1.141345632121541, + "grad_norm": 4.329317092895508, + "learning_rate": 6.654104426476635e-06, + "loss": 0.2665, + "step": 25166 + }, + { + "epoch": 1.1413591969614758, + "grad_norm": 6.818474292755127, + "learning_rate": 6.65396738385638e-06, + "loss": 0.2511, + "step": 25167 + }, + { + "epoch": 1.1413727618014107, + "grad_norm": 6.973925590515137, + "learning_rate": 6.653830341236125e-06, + "loss": 0.3312, + "step": 25168 + }, + { + "epoch": 1.1413863266413455, + "grad_norm": 5.182579517364502, + "learning_rate": 6.6536932986158696e-06, + "loss": 0.3991, + "step": 25169 + }, + { + "epoch": 1.1413998914812806, + "grad_norm": 4.479367733001709, + "learning_rate": 6.653556255995616e-06, + "loss": 0.2753, + "step": 25170 + }, + { + "epoch": 1.1414134563212155, + "grad_norm": 7.406688213348389, + "learning_rate": 6.653419213375361e-06, + "loss": 0.3084, + "step": 25171 + }, + { + "epoch": 1.1414270211611504, + "grad_norm": 5.9835004806518555, + "learning_rate": 6.653282170755105e-06, + "loss": 0.3821, + "step": 25172 + }, + { + "epoch": 1.1414405860010852, + "grad_norm": 6.3234944343566895, + "learning_rate": 6.65314512813485e-06, + "loss": 0.3711, + "step": 25173 + }, + { + "epoch": 1.14145415084102, + "grad_norm": 3.8472561836242676, + "learning_rate": 6.6530080855145955e-06, + "loss": 0.2282, + "step": 25174 + }, + { + "epoch": 1.141467715680955, + "grad_norm": 6.5109710693359375, + "learning_rate": 6.652871042894341e-06, + "loss": 0.3609, + "step": 25175 + }, + { + "epoch": 1.1414812805208898, + "grad_norm": 6.418575286865234, + "learning_rate": 6.652734000274086e-06, + "loss": 0.371, + "step": 25176 + }, + { + "epoch": 1.1414948453608247, + "grad_norm": 4.796402454376221, + "learning_rate": 6.652596957653831e-06, + "loss": 0.2204, + "step": 25177 + }, + { + "epoch": 1.1415084102007595, + "grad_norm": 6.93895149230957, + "learning_rate": 6.652459915033575e-06, + "loss": 0.4398, + "step": 25178 + }, + { + "epoch": 1.1415219750406944, + "grad_norm": 5.131214141845703, + "learning_rate": 6.652322872413321e-06, + "loss": 0.2292, + "step": 25179 + }, + { + "epoch": 1.1415355398806295, + "grad_norm": 6.1075439453125, + "learning_rate": 6.6521858297930665e-06, + "loss": 0.3619, + "step": 25180 + }, + { + "epoch": 1.1415491047205644, + "grad_norm": 4.352618217468262, + "learning_rate": 6.652048787172811e-06, + "loss": 0.229, + "step": 25181 + }, + { + "epoch": 1.1415626695604992, + "grad_norm": 4.679874420166016, + "learning_rate": 6.651911744552556e-06, + "loss": 0.291, + "step": 25182 + }, + { + "epoch": 1.141576234400434, + "grad_norm": 4.417750358581543, + "learning_rate": 6.651774701932302e-06, + "loss": 0.3374, + "step": 25183 + }, + { + "epoch": 1.141589799240369, + "grad_norm": 5.876791954040527, + "learning_rate": 6.651637659312047e-06, + "loss": 0.3478, + "step": 25184 + }, + { + "epoch": 1.1416033640803038, + "grad_norm": 7.21941614151001, + "learning_rate": 6.651500616691792e-06, + "loss": 0.3796, + "step": 25185 + }, + { + "epoch": 1.1416169289202387, + "grad_norm": 4.303869247436523, + "learning_rate": 6.651363574071537e-06, + "loss": 0.2131, + "step": 25186 + }, + { + "epoch": 1.1416304937601736, + "grad_norm": 5.025246620178223, + "learning_rate": 6.651226531451281e-06, + "loss": 0.2882, + "step": 25187 + }, + { + "epoch": 1.1416440586001084, + "grad_norm": 3.6924729347229004, + "learning_rate": 6.651089488831027e-06, + "loss": 0.3254, + "step": 25188 + }, + { + "epoch": 1.1416576234400435, + "grad_norm": 7.718430995941162, + "learning_rate": 6.650952446210772e-06, + "loss": 0.4699, + "step": 25189 + }, + { + "epoch": 1.1416711882799784, + "grad_norm": 4.8360419273376465, + "learning_rate": 6.650815403590517e-06, + "loss": 0.3104, + "step": 25190 + }, + { + "epoch": 1.1416847531199132, + "grad_norm": 4.269972801208496, + "learning_rate": 6.650678360970262e-06, + "loss": 0.2868, + "step": 25191 + }, + { + "epoch": 1.141698317959848, + "grad_norm": 5.672112941741943, + "learning_rate": 6.650541318350008e-06, + "loss": 0.2888, + "step": 25192 + }, + { + "epoch": 1.141711882799783, + "grad_norm": 4.9354634284973145, + "learning_rate": 6.650404275729753e-06, + "loss": 0.2529, + "step": 25193 + }, + { + "epoch": 1.1417254476397178, + "grad_norm": 4.35902738571167, + "learning_rate": 6.650267233109497e-06, + "loss": 0.2934, + "step": 25194 + }, + { + "epoch": 1.1417390124796527, + "grad_norm": 4.214968681335449, + "learning_rate": 6.6501301904892425e-06, + "loss": 0.2894, + "step": 25195 + }, + { + "epoch": 1.1417525773195876, + "grad_norm": 4.314321517944336, + "learning_rate": 6.6499931478689886e-06, + "loss": 0.1886, + "step": 25196 + }, + { + "epoch": 1.1417661421595224, + "grad_norm": 3.396784543991089, + "learning_rate": 6.649856105248733e-06, + "loss": 0.2708, + "step": 25197 + }, + { + "epoch": 1.1417797069994573, + "grad_norm": 4.910678386688232, + "learning_rate": 6.649719062628478e-06, + "loss": 0.3375, + "step": 25198 + }, + { + "epoch": 1.1417932718393924, + "grad_norm": 4.8453450202941895, + "learning_rate": 6.649582020008223e-06, + "loss": 0.2996, + "step": 25199 + }, + { + "epoch": 1.1418068366793273, + "grad_norm": 6.73757791519165, + "learning_rate": 6.649444977387968e-06, + "loss": 0.5008, + "step": 25200 + }, + { + "epoch": 1.1418204015192621, + "grad_norm": 5.166407585144043, + "learning_rate": 6.649307934767714e-06, + "loss": 0.3005, + "step": 25201 + }, + { + "epoch": 1.141833966359197, + "grad_norm": 5.182255268096924, + "learning_rate": 6.649170892147459e-06, + "loss": 0.4234, + "step": 25202 + }, + { + "epoch": 1.1418475311991318, + "grad_norm": 5.001288890838623, + "learning_rate": 6.649033849527203e-06, + "loss": 0.2466, + "step": 25203 + }, + { + "epoch": 1.1418610960390667, + "grad_norm": 5.475833892822266, + "learning_rate": 6.648896806906948e-06, + "loss": 0.3139, + "step": 25204 + }, + { + "epoch": 1.1418746608790016, + "grad_norm": 5.184824466705322, + "learning_rate": 6.648759764286694e-06, + "loss": 0.2264, + "step": 25205 + }, + { + "epoch": 1.1418882257189364, + "grad_norm": 5.417463779449463, + "learning_rate": 6.648622721666439e-06, + "loss": 0.2823, + "step": 25206 + }, + { + "epoch": 1.1419017905588713, + "grad_norm": 5.596413612365723, + "learning_rate": 6.648485679046184e-06, + "loss": 0.3109, + "step": 25207 + }, + { + "epoch": 1.1419153553988064, + "grad_norm": 3.9837207794189453, + "learning_rate": 6.648348636425929e-06, + "loss": 0.2226, + "step": 25208 + }, + { + "epoch": 1.1419289202387413, + "grad_norm": 5.815385341644287, + "learning_rate": 6.648211593805674e-06, + "loss": 0.2963, + "step": 25209 + }, + { + "epoch": 1.1419424850786761, + "grad_norm": 4.70547342300415, + "learning_rate": 6.648074551185419e-06, + "loss": 0.2578, + "step": 25210 + }, + { + "epoch": 1.141956049918611, + "grad_norm": 6.251247882843018, + "learning_rate": 6.6479375085651645e-06, + "loss": 0.2751, + "step": 25211 + }, + { + "epoch": 1.1419696147585459, + "grad_norm": 8.400136947631836, + "learning_rate": 6.647800465944909e-06, + "loss": 0.4521, + "step": 25212 + }, + { + "epoch": 1.1419831795984807, + "grad_norm": 4.935842990875244, + "learning_rate": 6.647663423324654e-06, + "loss": 0.2757, + "step": 25213 + }, + { + "epoch": 1.1419967444384156, + "grad_norm": 6.59251070022583, + "learning_rate": 6.6475263807044e-06, + "loss": 0.3082, + "step": 25214 + }, + { + "epoch": 1.1420103092783505, + "grad_norm": 4.725632667541504, + "learning_rate": 6.647389338084144e-06, + "loss": 0.2996, + "step": 25215 + }, + { + "epoch": 1.1420238741182853, + "grad_norm": 5.995622158050537, + "learning_rate": 6.64725229546389e-06, + "loss": 0.3833, + "step": 25216 + }, + { + "epoch": 1.1420374389582202, + "grad_norm": 4.842037200927734, + "learning_rate": 6.647115252843635e-06, + "loss": 0.1994, + "step": 25217 + }, + { + "epoch": 1.1420510037981553, + "grad_norm": 5.209795951843262, + "learning_rate": 6.646978210223381e-06, + "loss": 0.3162, + "step": 25218 + }, + { + "epoch": 1.1420645686380901, + "grad_norm": 5.0216383934021, + "learning_rate": 6.646841167603125e-06, + "loss": 0.3361, + "step": 25219 + }, + { + "epoch": 1.142078133478025, + "grad_norm": 4.400358200073242, + "learning_rate": 6.64670412498287e-06, + "loss": 0.2683, + "step": 25220 + }, + { + "epoch": 1.1420916983179599, + "grad_norm": 4.062560081481934, + "learning_rate": 6.646567082362615e-06, + "loss": 0.2273, + "step": 25221 + }, + { + "epoch": 1.1421052631578947, + "grad_norm": 5.078243732452393, + "learning_rate": 6.646430039742361e-06, + "loss": 0.228, + "step": 25222 + }, + { + "epoch": 1.1421188279978296, + "grad_norm": 4.643471717834473, + "learning_rate": 6.646292997122106e-06, + "loss": 0.317, + "step": 25223 + }, + { + "epoch": 1.1421323928377645, + "grad_norm": 5.350218296051025, + "learning_rate": 6.64615595450185e-06, + "loss": 0.2512, + "step": 25224 + }, + { + "epoch": 1.1421459576776993, + "grad_norm": 6.162569522857666, + "learning_rate": 6.646018911881595e-06, + "loss": 0.3007, + "step": 25225 + }, + { + "epoch": 1.1421595225176342, + "grad_norm": 4.290244102478027, + "learning_rate": 6.6458818692613405e-06, + "loss": 0.1958, + "step": 25226 + }, + { + "epoch": 1.1421730873575693, + "grad_norm": 5.7526631355285645, + "learning_rate": 6.6457448266410866e-06, + "loss": 0.2724, + "step": 25227 + }, + { + "epoch": 1.1421866521975041, + "grad_norm": 5.078381061553955, + "learning_rate": 6.645607784020831e-06, + "loss": 0.3062, + "step": 25228 + }, + { + "epoch": 1.142200217037439, + "grad_norm": 4.014651775360107, + "learning_rate": 6.645470741400576e-06, + "loss": 0.2456, + "step": 25229 + }, + { + "epoch": 1.1422137818773739, + "grad_norm": 6.023547649383545, + "learning_rate": 6.64533369878032e-06, + "loss": 0.3231, + "step": 25230 + }, + { + "epoch": 1.1422273467173087, + "grad_norm": 5.448142051696777, + "learning_rate": 6.6451966561600664e-06, + "loss": 0.3434, + "step": 25231 + }, + { + "epoch": 1.1422409115572436, + "grad_norm": 5.107763767242432, + "learning_rate": 6.645059613539812e-06, + "loss": 0.3231, + "step": 25232 + }, + { + "epoch": 1.1422544763971785, + "grad_norm": 7.698372840881348, + "learning_rate": 6.644922570919557e-06, + "loss": 0.467, + "step": 25233 + }, + { + "epoch": 1.1422680412371133, + "grad_norm": 4.177438259124756, + "learning_rate": 6.644785528299301e-06, + "loss": 0.2228, + "step": 25234 + }, + { + "epoch": 1.1422816060770482, + "grad_norm": 6.51584529876709, + "learning_rate": 6.644648485679047e-06, + "loss": 0.2706, + "step": 25235 + }, + { + "epoch": 1.142295170916983, + "grad_norm": 6.069594383239746, + "learning_rate": 6.644511443058792e-06, + "loss": 0.3188, + "step": 25236 + }, + { + "epoch": 1.1423087357569182, + "grad_norm": 5.182290077209473, + "learning_rate": 6.644374400438537e-06, + "loss": 0.3339, + "step": 25237 + }, + { + "epoch": 1.142322300596853, + "grad_norm": 4.439778804779053, + "learning_rate": 6.644237357818282e-06, + "loss": 0.2371, + "step": 25238 + }, + { + "epoch": 1.142335865436788, + "grad_norm": 7.470168113708496, + "learning_rate": 6.644100315198028e-06, + "loss": 0.3679, + "step": 25239 + }, + { + "epoch": 1.1423494302767228, + "grad_norm": 6.112312316894531, + "learning_rate": 6.643963272577772e-06, + "loss": 0.3058, + "step": 25240 + }, + { + "epoch": 1.1423629951166576, + "grad_norm": 6.502436637878418, + "learning_rate": 6.643826229957517e-06, + "loss": 0.4071, + "step": 25241 + }, + { + "epoch": 1.1423765599565925, + "grad_norm": 4.158817291259766, + "learning_rate": 6.6436891873372626e-06, + "loss": 0.2212, + "step": 25242 + }, + { + "epoch": 1.1423901247965274, + "grad_norm": 6.373511791229248, + "learning_rate": 6.643552144717007e-06, + "loss": 0.2951, + "step": 25243 + }, + { + "epoch": 1.1424036896364622, + "grad_norm": 5.805833339691162, + "learning_rate": 6.643415102096753e-06, + "loss": 0.3322, + "step": 25244 + }, + { + "epoch": 1.142417254476397, + "grad_norm": 6.447514057159424, + "learning_rate": 6.643278059476498e-06, + "loss": 0.2291, + "step": 25245 + }, + { + "epoch": 1.1424308193163322, + "grad_norm": 5.677218914031982, + "learning_rate": 6.643141016856242e-06, + "loss": 0.2597, + "step": 25246 + }, + { + "epoch": 1.142444384156267, + "grad_norm": 7.277787685394287, + "learning_rate": 6.643003974235988e-06, + "loss": 0.536, + "step": 25247 + }, + { + "epoch": 1.142457948996202, + "grad_norm": 6.440932273864746, + "learning_rate": 6.642866931615734e-06, + "loss": 0.3221, + "step": 25248 + }, + { + "epoch": 1.1424715138361368, + "grad_norm": 6.623169898986816, + "learning_rate": 6.642729888995478e-06, + "loss": 0.5318, + "step": 25249 + }, + { + "epoch": 1.1424850786760716, + "grad_norm": 7.111020088195801, + "learning_rate": 6.642592846375223e-06, + "loss": 0.5177, + "step": 25250 + }, + { + "epoch": 1.1424986435160065, + "grad_norm": 7.10499382019043, + "learning_rate": 6.642455803754968e-06, + "loss": 0.3936, + "step": 25251 + }, + { + "epoch": 1.1425122083559414, + "grad_norm": 5.99381160736084, + "learning_rate": 6.642318761134714e-06, + "loss": 0.427, + "step": 25252 + }, + { + "epoch": 1.1425257731958762, + "grad_norm": 5.080030918121338, + "learning_rate": 6.642181718514459e-06, + "loss": 0.2182, + "step": 25253 + }, + { + "epoch": 1.142539338035811, + "grad_norm": 5.066084861755371, + "learning_rate": 6.642044675894204e-06, + "loss": 0.1832, + "step": 25254 + }, + { + "epoch": 1.142552902875746, + "grad_norm": 4.799014091491699, + "learning_rate": 6.641907633273948e-06, + "loss": 0.2801, + "step": 25255 + }, + { + "epoch": 1.142566467715681, + "grad_norm": 5.3895745277404785, + "learning_rate": 6.641770590653693e-06, + "loss": 0.2358, + "step": 25256 + }, + { + "epoch": 1.142580032555616, + "grad_norm": 5.712818622589111, + "learning_rate": 6.641633548033439e-06, + "loss": 0.2232, + "step": 25257 + }, + { + "epoch": 1.1425935973955508, + "grad_norm": 7.725897312164307, + "learning_rate": 6.641496505413184e-06, + "loss": 0.4816, + "step": 25258 + }, + { + "epoch": 1.1426071622354856, + "grad_norm": 6.697198867797852, + "learning_rate": 6.641359462792929e-06, + "loss": 0.3428, + "step": 25259 + }, + { + "epoch": 1.1426207270754205, + "grad_norm": 10.723960876464844, + "learning_rate": 6.641222420172674e-06, + "loss": 0.5486, + "step": 25260 + }, + { + "epoch": 1.1426342919153554, + "grad_norm": 6.085878372192383, + "learning_rate": 6.64108537755242e-06, + "loss": 0.3893, + "step": 25261 + }, + { + "epoch": 1.1426478567552902, + "grad_norm": 9.363350868225098, + "learning_rate": 6.6409483349321644e-06, + "loss": 0.5693, + "step": 25262 + }, + { + "epoch": 1.142661421595225, + "grad_norm": 6.816036224365234, + "learning_rate": 6.64081129231191e-06, + "loss": 0.3013, + "step": 25263 + }, + { + "epoch": 1.1426749864351602, + "grad_norm": 4.377247333526611, + "learning_rate": 6.640674249691654e-06, + "loss": 0.2695, + "step": 25264 + }, + { + "epoch": 1.142688551275095, + "grad_norm": 6.6461005210876465, + "learning_rate": 6.6405372070714e-06, + "loss": 0.4288, + "step": 25265 + }, + { + "epoch": 1.14270211611503, + "grad_norm": 6.943685054779053, + "learning_rate": 6.640400164451145e-06, + "loss": 0.3999, + "step": 25266 + }, + { + "epoch": 1.1427156809549648, + "grad_norm": 6.465770721435547, + "learning_rate": 6.64026312183089e-06, + "loss": 0.2921, + "step": 25267 + }, + { + "epoch": 1.1427292457948997, + "grad_norm": 6.087649822235107, + "learning_rate": 6.640126079210635e-06, + "loss": 0.3021, + "step": 25268 + }, + { + "epoch": 1.1427428106348345, + "grad_norm": 6.34279203414917, + "learning_rate": 6.63998903659038e-06, + "loss": 0.3088, + "step": 25269 + }, + { + "epoch": 1.1427563754747694, + "grad_norm": 5.782500267028809, + "learning_rate": 6.639851993970126e-06, + "loss": 0.2439, + "step": 25270 + }, + { + "epoch": 1.1427699403147042, + "grad_norm": 6.93156623840332, + "learning_rate": 6.63971495134987e-06, + "loss": 0.3896, + "step": 25271 + }, + { + "epoch": 1.1427835051546391, + "grad_norm": 10.703140258789062, + "learning_rate": 6.639577908729615e-06, + "loss": 0.5401, + "step": 25272 + }, + { + "epoch": 1.142797069994574, + "grad_norm": 4.574357986450195, + "learning_rate": 6.63944086610936e-06, + "loss": 0.2082, + "step": 25273 + }, + { + "epoch": 1.1428106348345088, + "grad_norm": 5.197231292724609, + "learning_rate": 6.639303823489106e-06, + "loss": 0.3394, + "step": 25274 + }, + { + "epoch": 1.142824199674444, + "grad_norm": 8.543045997619629, + "learning_rate": 6.639166780868851e-06, + "loss": 0.3515, + "step": 25275 + }, + { + "epoch": 1.1428377645143788, + "grad_norm": 5.332498073577881, + "learning_rate": 6.639029738248596e-06, + "loss": 0.1992, + "step": 25276 + }, + { + "epoch": 1.1428513293543137, + "grad_norm": 6.478830814361572, + "learning_rate": 6.63889269562834e-06, + "loss": 0.2902, + "step": 25277 + }, + { + "epoch": 1.1428648941942485, + "grad_norm": 5.069300174713135, + "learning_rate": 6.6387556530080864e-06, + "loss": 0.2131, + "step": 25278 + }, + { + "epoch": 1.1428784590341834, + "grad_norm": 5.568197727203369, + "learning_rate": 6.638618610387832e-06, + "loss": 0.2212, + "step": 25279 + }, + { + "epoch": 1.1428920238741183, + "grad_norm": 5.782262325286865, + "learning_rate": 6.638481567767576e-06, + "loss": 0.2527, + "step": 25280 + }, + { + "epoch": 1.1429055887140531, + "grad_norm": 6.504428386688232, + "learning_rate": 6.638344525147321e-06, + "loss": 0.3273, + "step": 25281 + }, + { + "epoch": 1.142919153553988, + "grad_norm": 7.662247180938721, + "learning_rate": 6.638207482527066e-06, + "loss": 0.2105, + "step": 25282 + }, + { + "epoch": 1.142932718393923, + "grad_norm": 5.90445613861084, + "learning_rate": 6.6380704399068115e-06, + "loss": 0.327, + "step": 25283 + }, + { + "epoch": 1.142946283233858, + "grad_norm": 8.152040481567383, + "learning_rate": 6.637933397286557e-06, + "loss": 0.3478, + "step": 25284 + }, + { + "epoch": 1.1429598480737928, + "grad_norm": 6.661522388458252, + "learning_rate": 6.637796354666302e-06, + "loss": 0.2948, + "step": 25285 + }, + { + "epoch": 1.1429734129137277, + "grad_norm": 6.009110927581787, + "learning_rate": 6.637659312046046e-06, + "loss": 0.2964, + "step": 25286 + }, + { + "epoch": 1.1429869777536625, + "grad_norm": 5.064132213592529, + "learning_rate": 6.637522269425792e-06, + "loss": 0.2751, + "step": 25287 + }, + { + "epoch": 1.1430005425935974, + "grad_norm": 5.377392768859863, + "learning_rate": 6.637385226805537e-06, + "loss": 0.2725, + "step": 25288 + }, + { + "epoch": 1.1430141074335323, + "grad_norm": 6.50890588760376, + "learning_rate": 6.637248184185282e-06, + "loss": 0.3272, + "step": 25289 + }, + { + "epoch": 1.1430276722734671, + "grad_norm": 7.297280788421631, + "learning_rate": 6.637111141565027e-06, + "loss": 0.4753, + "step": 25290 + }, + { + "epoch": 1.143041237113402, + "grad_norm": 4.8488335609436035, + "learning_rate": 6.636974098944773e-06, + "loss": 0.2488, + "step": 25291 + }, + { + "epoch": 1.1430548019533369, + "grad_norm": 6.698945045471191, + "learning_rate": 6.636837056324518e-06, + "loss": 0.2935, + "step": 25292 + }, + { + "epoch": 1.1430683667932717, + "grad_norm": 5.686116695404053, + "learning_rate": 6.6367000137042624e-06, + "loss": 0.216, + "step": 25293 + }, + { + "epoch": 1.1430819316332068, + "grad_norm": 5.278289318084717, + "learning_rate": 6.636562971084008e-06, + "loss": 0.3184, + "step": 25294 + }, + { + "epoch": 1.1430954964731417, + "grad_norm": 5.628218650817871, + "learning_rate": 6.636425928463752e-06, + "loss": 0.281, + "step": 25295 + }, + { + "epoch": 1.1431090613130765, + "grad_norm": 4.875443458557129, + "learning_rate": 6.636288885843498e-06, + "loss": 0.1806, + "step": 25296 + }, + { + "epoch": 1.1431226261530114, + "grad_norm": 7.454751014709473, + "learning_rate": 6.636151843223243e-06, + "loss": 0.3205, + "step": 25297 + }, + { + "epoch": 1.1431361909929463, + "grad_norm": 5.901757717132568, + "learning_rate": 6.6360148006029875e-06, + "loss": 0.2481, + "step": 25298 + }, + { + "epoch": 1.1431497558328811, + "grad_norm": 5.736059665679932, + "learning_rate": 6.635877757982733e-06, + "loss": 0.3548, + "step": 25299 + }, + { + "epoch": 1.143163320672816, + "grad_norm": 6.521646022796631, + "learning_rate": 6.635740715362479e-06, + "loss": 0.228, + "step": 25300 + }, + { + "epoch": 1.1431768855127509, + "grad_norm": 4.809585094451904, + "learning_rate": 6.635603672742224e-06, + "loss": 0.2078, + "step": 25301 + }, + { + "epoch": 1.143190450352686, + "grad_norm": 5.281405925750732, + "learning_rate": 6.635466630121968e-06, + "loss": 0.2702, + "step": 25302 + }, + { + "epoch": 1.1432040151926208, + "grad_norm": 4.706505298614502, + "learning_rate": 6.635329587501713e-06, + "loss": 0.2473, + "step": 25303 + }, + { + "epoch": 1.1432175800325557, + "grad_norm": 7.297521114349365, + "learning_rate": 6.635192544881459e-06, + "loss": 0.2851, + "step": 25304 + }, + { + "epoch": 1.1432311448724906, + "grad_norm": 5.0627570152282715, + "learning_rate": 6.635055502261204e-06, + "loss": 0.2405, + "step": 25305 + }, + { + "epoch": 1.1432447097124254, + "grad_norm": 5.16676664352417, + "learning_rate": 6.634918459640949e-06, + "loss": 0.2857, + "step": 25306 + }, + { + "epoch": 1.1432582745523603, + "grad_norm": 7.177587985992432, + "learning_rate": 6.634781417020693e-06, + "loss": 0.2892, + "step": 25307 + }, + { + "epoch": 1.1432718393922952, + "grad_norm": 5.370712757110596, + "learning_rate": 6.634644374400439e-06, + "loss": 0.2535, + "step": 25308 + }, + { + "epoch": 1.14328540423223, + "grad_norm": 4.606212139129639, + "learning_rate": 6.6345073317801845e-06, + "loss": 0.3051, + "step": 25309 + }, + { + "epoch": 1.1432989690721649, + "grad_norm": 5.436831951141357, + "learning_rate": 6.63437028915993e-06, + "loss": 0.237, + "step": 25310 + }, + { + "epoch": 1.1433125339120997, + "grad_norm": 6.584944725036621, + "learning_rate": 6.634233246539674e-06, + "loss": 0.2735, + "step": 25311 + }, + { + "epoch": 1.1433260987520346, + "grad_norm": 7.458462715148926, + "learning_rate": 6.634096203919419e-06, + "loss": 0.385, + "step": 25312 + }, + { + "epoch": 1.1433396635919697, + "grad_norm": 6.983741283416748, + "learning_rate": 6.633959161299165e-06, + "loss": 0.3604, + "step": 25313 + }, + { + "epoch": 1.1433532284319046, + "grad_norm": 5.131787300109863, + "learning_rate": 6.6338221186789095e-06, + "loss": 0.2199, + "step": 25314 + }, + { + "epoch": 1.1433667932718394, + "grad_norm": 4.809828281402588, + "learning_rate": 6.633685076058655e-06, + "loss": 0.3679, + "step": 25315 + }, + { + "epoch": 1.1433803581117743, + "grad_norm": 6.272400856018066, + "learning_rate": 6.6335480334384e-06, + "loss": 0.2489, + "step": 25316 + }, + { + "epoch": 1.1433939229517092, + "grad_norm": 5.816962242126465, + "learning_rate": 6.633410990818145e-06, + "loss": 0.3474, + "step": 25317 + }, + { + "epoch": 1.143407487791644, + "grad_norm": 4.965405464172363, + "learning_rate": 6.63327394819789e-06, + "loss": 0.3424, + "step": 25318 + }, + { + "epoch": 1.143421052631579, + "grad_norm": 4.473191261291504, + "learning_rate": 6.633136905577635e-06, + "loss": 0.198, + "step": 25319 + }, + { + "epoch": 1.1434346174715138, + "grad_norm": 5.711239337921143, + "learning_rate": 6.63299986295738e-06, + "loss": 0.2953, + "step": 25320 + }, + { + "epoch": 1.1434481823114488, + "grad_norm": 5.9869794845581055, + "learning_rate": 6.632862820337126e-06, + "loss": 0.3047, + "step": 25321 + }, + { + "epoch": 1.1434617471513837, + "grad_norm": 5.720226764678955, + "learning_rate": 6.632725777716871e-06, + "loss": 0.235, + "step": 25322 + }, + { + "epoch": 1.1434753119913186, + "grad_norm": 4.71164083480835, + "learning_rate": 6.632588735096615e-06, + "loss": 0.25, + "step": 25323 + }, + { + "epoch": 1.1434888768312534, + "grad_norm": 4.36987829208374, + "learning_rate": 6.6324516924763604e-06, + "loss": 0.1759, + "step": 25324 + }, + { + "epoch": 1.1435024416711883, + "grad_norm": 5.820033073425293, + "learning_rate": 6.632314649856106e-06, + "loss": 0.3828, + "step": 25325 + }, + { + "epoch": 1.1435160065111232, + "grad_norm": 4.619224548339844, + "learning_rate": 6.632177607235852e-06, + "loss": 0.1444, + "step": 25326 + }, + { + "epoch": 1.143529571351058, + "grad_norm": 4.03085470199585, + "learning_rate": 6.632040564615596e-06, + "loss": 0.2463, + "step": 25327 + }, + { + "epoch": 1.143543136190993, + "grad_norm": 7.122457027435303, + "learning_rate": 6.631903521995341e-06, + "loss": 0.3623, + "step": 25328 + }, + { + "epoch": 1.1435567010309278, + "grad_norm": 5.732610702514648, + "learning_rate": 6.6317664793750855e-06, + "loss": 0.3172, + "step": 25329 + }, + { + "epoch": 1.1435702658708626, + "grad_norm": 4.673118591308594, + "learning_rate": 6.6316294367548315e-06, + "loss": 0.2334, + "step": 25330 + }, + { + "epoch": 1.1435838307107975, + "grad_norm": 5.263732433319092, + "learning_rate": 6.631492394134577e-06, + "loss": 0.2972, + "step": 25331 + }, + { + "epoch": 1.1435973955507326, + "grad_norm": 7.783830165863037, + "learning_rate": 6.631355351514321e-06, + "loss": 0.3267, + "step": 25332 + }, + { + "epoch": 1.1436109603906675, + "grad_norm": 4.0013203620910645, + "learning_rate": 6.631218308894066e-06, + "loss": 0.2948, + "step": 25333 + }, + { + "epoch": 1.1436245252306023, + "grad_norm": 6.0192155838012695, + "learning_rate": 6.631081266273812e-06, + "loss": 0.2719, + "step": 25334 + }, + { + "epoch": 1.1436380900705372, + "grad_norm": 4.036622524261475, + "learning_rate": 6.630944223653557e-06, + "loss": 0.2198, + "step": 25335 + }, + { + "epoch": 1.143651654910472, + "grad_norm": 6.74009895324707, + "learning_rate": 6.630807181033302e-06, + "loss": 0.2879, + "step": 25336 + }, + { + "epoch": 1.143665219750407, + "grad_norm": 4.41327428817749, + "learning_rate": 6.630670138413047e-06, + "loss": 0.2582, + "step": 25337 + }, + { + "epoch": 1.1436787845903418, + "grad_norm": 5.397708415985107, + "learning_rate": 6.630533095792791e-06, + "loss": 0.2716, + "step": 25338 + }, + { + "epoch": 1.1436923494302766, + "grad_norm": 4.982072830200195, + "learning_rate": 6.630396053172537e-06, + "loss": 0.1737, + "step": 25339 + }, + { + "epoch": 1.1437059142702117, + "grad_norm": 5.863975524902344, + "learning_rate": 6.6302590105522825e-06, + "loss": 0.2677, + "step": 25340 + }, + { + "epoch": 1.1437194791101466, + "grad_norm": 7.102453708648682, + "learning_rate": 6.630121967932028e-06, + "loss": 0.2699, + "step": 25341 + }, + { + "epoch": 1.1437330439500815, + "grad_norm": 6.04456901550293, + "learning_rate": 6.629984925311772e-06, + "loss": 0.3318, + "step": 25342 + }, + { + "epoch": 1.1437466087900163, + "grad_norm": 4.85559606552124, + "learning_rate": 6.629847882691518e-06, + "loss": 0.2522, + "step": 25343 + }, + { + "epoch": 1.1437601736299512, + "grad_norm": 4.205737113952637, + "learning_rate": 6.629710840071263e-06, + "loss": 0.2799, + "step": 25344 + }, + { + "epoch": 1.143773738469886, + "grad_norm": 5.574788570404053, + "learning_rate": 6.6295737974510075e-06, + "loss": 0.2964, + "step": 25345 + }, + { + "epoch": 1.143787303309821, + "grad_norm": 4.261425018310547, + "learning_rate": 6.629436754830753e-06, + "loss": 0.1527, + "step": 25346 + }, + { + "epoch": 1.1438008681497558, + "grad_norm": 4.068176746368408, + "learning_rate": 6.629299712210499e-06, + "loss": 0.1899, + "step": 25347 + }, + { + "epoch": 1.1438144329896907, + "grad_norm": 6.137852668762207, + "learning_rate": 6.629162669590243e-06, + "loss": 0.2844, + "step": 25348 + }, + { + "epoch": 1.1438279978296255, + "grad_norm": 5.4469828605651855, + "learning_rate": 6.629025626969988e-06, + "loss": 0.2987, + "step": 25349 + }, + { + "epoch": 1.1438415626695604, + "grad_norm": 3.476969003677368, + "learning_rate": 6.628888584349733e-06, + "loss": 0.1258, + "step": 25350 + }, + { + "epoch": 1.1438551275094955, + "grad_norm": 5.102890491485596, + "learning_rate": 6.628751541729478e-06, + "loss": 0.2811, + "step": 25351 + }, + { + "epoch": 1.1438686923494303, + "grad_norm": 3.3426785469055176, + "learning_rate": 6.628614499109224e-06, + "loss": 0.1374, + "step": 25352 + }, + { + "epoch": 1.1438822571893652, + "grad_norm": 4.521740436553955, + "learning_rate": 6.628477456488969e-06, + "loss": 0.2121, + "step": 25353 + }, + { + "epoch": 1.1438958220293, + "grad_norm": 3.2801709175109863, + "learning_rate": 6.628340413868713e-06, + "loss": 0.1272, + "step": 25354 + }, + { + "epoch": 1.143909386869235, + "grad_norm": 5.399753093719482, + "learning_rate": 6.6282033712484584e-06, + "loss": 0.2313, + "step": 25355 + }, + { + "epoch": 1.1439229517091698, + "grad_norm": 7.439877986907959, + "learning_rate": 6.6280663286282045e-06, + "loss": 0.3774, + "step": 25356 + }, + { + "epoch": 1.1439365165491047, + "grad_norm": 4.73944616317749, + "learning_rate": 6.627929286007949e-06, + "loss": 0.2302, + "step": 25357 + }, + { + "epoch": 1.1439500813890395, + "grad_norm": 5.586196422576904, + "learning_rate": 6.627792243387694e-06, + "loss": 0.3632, + "step": 25358 + }, + { + "epoch": 1.1439636462289746, + "grad_norm": 3.529620409011841, + "learning_rate": 6.627655200767439e-06, + "loss": 0.1501, + "step": 25359 + }, + { + "epoch": 1.1439772110689095, + "grad_norm": 6.434521198272705, + "learning_rate": 6.627518158147185e-06, + "loss": 0.2092, + "step": 25360 + }, + { + "epoch": 1.1439907759088443, + "grad_norm": 4.9768524169921875, + "learning_rate": 6.6273811155269295e-06, + "loss": 0.2407, + "step": 25361 + }, + { + "epoch": 1.1440043407487792, + "grad_norm": 5.416655540466309, + "learning_rate": 6.627244072906675e-06, + "loss": 0.2261, + "step": 25362 + }, + { + "epoch": 1.144017905588714, + "grad_norm": 5.987112998962402, + "learning_rate": 6.627107030286419e-06, + "loss": 0.2412, + "step": 25363 + }, + { + "epoch": 1.144031470428649, + "grad_norm": 5.2496442794799805, + "learning_rate": 6.626969987666164e-06, + "loss": 0.2819, + "step": 25364 + }, + { + "epoch": 1.1440450352685838, + "grad_norm": 5.449400424957275, + "learning_rate": 6.62683294504591e-06, + "loss": 0.2764, + "step": 25365 + }, + { + "epoch": 1.1440586001085187, + "grad_norm": 5.370396137237549, + "learning_rate": 6.6266959024256546e-06, + "loss": 0.2761, + "step": 25366 + }, + { + "epoch": 1.1440721649484535, + "grad_norm": 5.372544765472412, + "learning_rate": 6.6265588598054e-06, + "loss": 0.1908, + "step": 25367 + }, + { + "epoch": 1.1440857297883884, + "grad_norm": 6.0014142990112305, + "learning_rate": 6.626421817185145e-06, + "loss": 0.2468, + "step": 25368 + }, + { + "epoch": 1.1440992946283233, + "grad_norm": 4.8805832862854, + "learning_rate": 6.626284774564891e-06, + "loss": 0.2575, + "step": 25369 + }, + { + "epoch": 1.1441128594682584, + "grad_norm": 6.848682403564453, + "learning_rate": 6.626147731944635e-06, + "loss": 0.3792, + "step": 25370 + }, + { + "epoch": 1.1441264243081932, + "grad_norm": 5.11293888092041, + "learning_rate": 6.6260106893243805e-06, + "loss": 0.2276, + "step": 25371 + }, + { + "epoch": 1.144139989148128, + "grad_norm": 5.418025016784668, + "learning_rate": 6.625873646704125e-06, + "loss": 0.2814, + "step": 25372 + }, + { + "epoch": 1.144153553988063, + "grad_norm": 4.974976062774658, + "learning_rate": 6.625736604083871e-06, + "loss": 0.2091, + "step": 25373 + }, + { + "epoch": 1.1441671188279978, + "grad_norm": 4.19736909866333, + "learning_rate": 6.625599561463616e-06, + "loss": 0.2468, + "step": 25374 + }, + { + "epoch": 1.1441806836679327, + "grad_norm": 6.688227653503418, + "learning_rate": 6.625462518843361e-06, + "loss": 0.352, + "step": 25375 + }, + { + "epoch": 1.1441942485078676, + "grad_norm": 3.1327667236328125, + "learning_rate": 6.6253254762231055e-06, + "loss": 0.09, + "step": 25376 + }, + { + "epoch": 1.1442078133478024, + "grad_norm": 4.764548301696777, + "learning_rate": 6.6251884336028515e-06, + "loss": 0.2101, + "step": 25377 + }, + { + "epoch": 1.1442213781877375, + "grad_norm": 4.041703701019287, + "learning_rate": 6.625051390982597e-06, + "loss": 0.1629, + "step": 25378 + }, + { + "epoch": 1.1442349430276724, + "grad_norm": 5.411355018615723, + "learning_rate": 6.624914348362341e-06, + "loss": 0.2734, + "step": 25379 + }, + { + "epoch": 1.1442485078676072, + "grad_norm": 5.541863918304443, + "learning_rate": 6.624777305742086e-06, + "loss": 0.2708, + "step": 25380 + }, + { + "epoch": 1.144262072707542, + "grad_norm": 4.652597904205322, + "learning_rate": 6.6246402631218306e-06, + "loss": 0.1664, + "step": 25381 + }, + { + "epoch": 1.144275637547477, + "grad_norm": 3.422266721725464, + "learning_rate": 6.624503220501577e-06, + "loss": 0.1485, + "step": 25382 + }, + { + "epoch": 1.1442892023874118, + "grad_norm": 4.701148986816406, + "learning_rate": 6.624366177881322e-06, + "loss": 0.2371, + "step": 25383 + }, + { + "epoch": 1.1443027672273467, + "grad_norm": 4.526277542114258, + "learning_rate": 6.624229135261067e-06, + "loss": 0.291, + "step": 25384 + }, + { + "epoch": 1.1443163320672816, + "grad_norm": 3.9125213623046875, + "learning_rate": 6.624092092640811e-06, + "loss": 0.2132, + "step": 25385 + }, + { + "epoch": 1.1443298969072164, + "grad_norm": 4.511502265930176, + "learning_rate": 6.623955050020557e-06, + "loss": 0.1925, + "step": 25386 + }, + { + "epoch": 1.1443434617471513, + "grad_norm": 4.0436625480651855, + "learning_rate": 6.6238180074003025e-06, + "loss": 0.2094, + "step": 25387 + }, + { + "epoch": 1.1443570265870862, + "grad_norm": 4.585638999938965, + "learning_rate": 6.623680964780047e-06, + "loss": 0.2008, + "step": 25388 + }, + { + "epoch": 1.1443705914270212, + "grad_norm": 4.544943809509277, + "learning_rate": 6.623543922159792e-06, + "loss": 0.2306, + "step": 25389 + }, + { + "epoch": 1.144384156266956, + "grad_norm": 4.934060096740723, + "learning_rate": 6.623406879539538e-06, + "loss": 0.1723, + "step": 25390 + }, + { + "epoch": 1.144397721106891, + "grad_norm": 7.562051773071289, + "learning_rate": 6.623269836919282e-06, + "loss": 0.2063, + "step": 25391 + }, + { + "epoch": 1.1444112859468258, + "grad_norm": 4.38260555267334, + "learning_rate": 6.6231327942990275e-06, + "loss": 0.2056, + "step": 25392 + }, + { + "epoch": 1.1444248507867607, + "grad_norm": 4.741880416870117, + "learning_rate": 6.622995751678773e-06, + "loss": 0.2555, + "step": 25393 + }, + { + "epoch": 1.1444384156266956, + "grad_norm": 5.549839019775391, + "learning_rate": 6.622858709058517e-06, + "loss": 0.2767, + "step": 25394 + }, + { + "epoch": 1.1444519804666304, + "grad_norm": 5.9111199378967285, + "learning_rate": 6.622721666438263e-06, + "loss": 0.2364, + "step": 25395 + }, + { + "epoch": 1.1444655453065653, + "grad_norm": 3.8358306884765625, + "learning_rate": 6.622584623818008e-06, + "loss": 0.1961, + "step": 25396 + }, + { + "epoch": 1.1444791101465004, + "grad_norm": 4.797799110412598, + "learning_rate": 6.6224475811977526e-06, + "loss": 0.1496, + "step": 25397 + }, + { + "epoch": 1.1444926749864353, + "grad_norm": 4.397269248962402, + "learning_rate": 6.622310538577498e-06, + "loss": 0.161, + "step": 25398 + }, + { + "epoch": 1.1445062398263701, + "grad_norm": 3.5264928340911865, + "learning_rate": 6.622173495957244e-06, + "loss": 0.1667, + "step": 25399 + }, + { + "epoch": 1.144519804666305, + "grad_norm": 3.729522466659546, + "learning_rate": 6.622036453336988e-06, + "loss": 0.1377, + "step": 25400 + }, + { + "epoch": 1.1445333695062399, + "grad_norm": 5.1054792404174805, + "learning_rate": 6.621899410716733e-06, + "loss": 0.2156, + "step": 25401 + }, + { + "epoch": 1.1445469343461747, + "grad_norm": 3.027829647064209, + "learning_rate": 6.6217623680964785e-06, + "loss": 0.1616, + "step": 25402 + }, + { + "epoch": 1.1445604991861096, + "grad_norm": 3.7174015045166016, + "learning_rate": 6.6216253254762245e-06, + "loss": 0.1934, + "step": 25403 + }, + { + "epoch": 1.1445740640260444, + "grad_norm": 5.070860385894775, + "learning_rate": 6.621488282855969e-06, + "loss": 0.207, + "step": 25404 + }, + { + "epoch": 1.1445876288659793, + "grad_norm": 6.4510698318481445, + "learning_rate": 6.621351240235714e-06, + "loss": 0.2245, + "step": 25405 + }, + { + "epoch": 1.1446011937059142, + "grad_norm": 6.166942596435547, + "learning_rate": 6.621214197615458e-06, + "loss": 0.4065, + "step": 25406 + }, + { + "epoch": 1.144614758545849, + "grad_norm": 6.021069526672363, + "learning_rate": 6.6210771549952035e-06, + "loss": 0.3099, + "step": 25407 + }, + { + "epoch": 1.1446283233857841, + "grad_norm": 4.474752426147461, + "learning_rate": 6.6209401123749495e-06, + "loss": 0.1993, + "step": 25408 + }, + { + "epoch": 1.144641888225719, + "grad_norm": 5.3810296058654785, + "learning_rate": 6.620803069754695e-06, + "loss": 0.3351, + "step": 25409 + }, + { + "epoch": 1.1446554530656539, + "grad_norm": 4.559036731719971, + "learning_rate": 6.620666027134439e-06, + "loss": 0.2702, + "step": 25410 + }, + { + "epoch": 1.1446690179055887, + "grad_norm": 5.091432094573975, + "learning_rate": 6.620528984514184e-06, + "loss": 0.234, + "step": 25411 + }, + { + "epoch": 1.1446825827455236, + "grad_norm": 6.149737358093262, + "learning_rate": 6.62039194189393e-06, + "loss": 0.3087, + "step": 25412 + }, + { + "epoch": 1.1446961475854585, + "grad_norm": 6.696720123291016, + "learning_rate": 6.620254899273675e-06, + "loss": 0.2803, + "step": 25413 + }, + { + "epoch": 1.1447097124253933, + "grad_norm": 5.16250467300415, + "learning_rate": 6.62011785665342e-06, + "loss": 0.2553, + "step": 25414 + }, + { + "epoch": 1.1447232772653282, + "grad_norm": 5.196850776672363, + "learning_rate": 6.619980814033164e-06, + "loss": 0.2296, + "step": 25415 + }, + { + "epoch": 1.1447368421052633, + "grad_norm": 4.486520290374756, + "learning_rate": 6.61984377141291e-06, + "loss": 0.3007, + "step": 25416 + }, + { + "epoch": 1.1447504069451981, + "grad_norm": 4.751697063446045, + "learning_rate": 6.619706728792655e-06, + "loss": 0.1994, + "step": 25417 + }, + { + "epoch": 1.144763971785133, + "grad_norm": 6.038359642028809, + "learning_rate": 6.6195696861724005e-06, + "loss": 0.3161, + "step": 25418 + }, + { + "epoch": 1.1447775366250679, + "grad_norm": 4.8168816566467285, + "learning_rate": 6.619432643552145e-06, + "loss": 0.2127, + "step": 25419 + }, + { + "epoch": 1.1447911014650027, + "grad_norm": 5.136532306671143, + "learning_rate": 6.61929560093189e-06, + "loss": 0.3252, + "step": 25420 + }, + { + "epoch": 1.1448046663049376, + "grad_norm": 5.661465644836426, + "learning_rate": 6.619158558311636e-06, + "loss": 0.2755, + "step": 25421 + }, + { + "epoch": 1.1448182311448725, + "grad_norm": 5.640971660614014, + "learning_rate": 6.61902151569138e-06, + "loss": 0.2943, + "step": 25422 + }, + { + "epoch": 1.1448317959848073, + "grad_norm": 7.106505870819092, + "learning_rate": 6.6188844730711255e-06, + "loss": 0.3109, + "step": 25423 + }, + { + "epoch": 1.1448453608247422, + "grad_norm": 6.437745571136475, + "learning_rate": 6.618747430450871e-06, + "loss": 0.3208, + "step": 25424 + }, + { + "epoch": 1.144858925664677, + "grad_norm": 6.181929588317871, + "learning_rate": 6.618610387830616e-06, + "loss": 0.4108, + "step": 25425 + }, + { + "epoch": 1.144872490504612, + "grad_norm": 5.23801326751709, + "learning_rate": 6.618473345210361e-06, + "loss": 0.3026, + "step": 25426 + }, + { + "epoch": 1.144886055344547, + "grad_norm": 4.973599910736084, + "learning_rate": 6.618336302590106e-06, + "loss": 0.1913, + "step": 25427 + }, + { + "epoch": 1.1448996201844819, + "grad_norm": 4.811647891998291, + "learning_rate": 6.6181992599698506e-06, + "loss": 0.275, + "step": 25428 + }, + { + "epoch": 1.1449131850244167, + "grad_norm": 5.297657489776611, + "learning_rate": 6.618062217349597e-06, + "loss": 0.31, + "step": 25429 + }, + { + "epoch": 1.1449267498643516, + "grad_norm": 5.3390045166015625, + "learning_rate": 6.617925174729342e-06, + "loss": 0.3154, + "step": 25430 + }, + { + "epoch": 1.1449403147042865, + "grad_norm": 4.425840854644775, + "learning_rate": 6.617788132109086e-06, + "loss": 0.2183, + "step": 25431 + }, + { + "epoch": 1.1449538795442213, + "grad_norm": 4.991321086883545, + "learning_rate": 6.617651089488831e-06, + "loss": 0.183, + "step": 25432 + }, + { + "epoch": 1.1449674443841562, + "grad_norm": 3.5829122066497803, + "learning_rate": 6.6175140468685765e-06, + "loss": 0.1979, + "step": 25433 + }, + { + "epoch": 1.144981009224091, + "grad_norm": 5.6648101806640625, + "learning_rate": 6.6173770042483225e-06, + "loss": 0.319, + "step": 25434 + }, + { + "epoch": 1.1449945740640262, + "grad_norm": 6.631747722625732, + "learning_rate": 6.617239961628067e-06, + "loss": 0.365, + "step": 25435 + }, + { + "epoch": 1.145008138903961, + "grad_norm": 7.758985996246338, + "learning_rate": 6.617102919007812e-06, + "loss": 0.4746, + "step": 25436 + }, + { + "epoch": 1.145021703743896, + "grad_norm": 5.16222620010376, + "learning_rate": 6.616965876387556e-06, + "loss": 0.3063, + "step": 25437 + }, + { + "epoch": 1.1450352685838308, + "grad_norm": 7.299548625946045, + "learning_rate": 6.616828833767302e-06, + "loss": 0.3777, + "step": 25438 + }, + { + "epoch": 1.1450488334237656, + "grad_norm": 4.232447624206543, + "learning_rate": 6.6166917911470475e-06, + "loss": 0.2802, + "step": 25439 + }, + { + "epoch": 1.1450623982637005, + "grad_norm": 5.6384077072143555, + "learning_rate": 6.616554748526792e-06, + "loss": 0.3942, + "step": 25440 + }, + { + "epoch": 1.1450759631036354, + "grad_norm": 4.279351711273193, + "learning_rate": 6.616417705906537e-06, + "loss": 0.2567, + "step": 25441 + }, + { + "epoch": 1.1450895279435702, + "grad_norm": 5.499423503875732, + "learning_rate": 6.616280663286283e-06, + "loss": 0.2833, + "step": 25442 + }, + { + "epoch": 1.145103092783505, + "grad_norm": 4.345173358917236, + "learning_rate": 6.616143620666028e-06, + "loss": 0.2758, + "step": 25443 + }, + { + "epoch": 1.14511665762344, + "grad_norm": 4.567367076873779, + "learning_rate": 6.616006578045773e-06, + "loss": 0.2971, + "step": 25444 + }, + { + "epoch": 1.1451302224633748, + "grad_norm": 4.689427852630615, + "learning_rate": 6.615869535425518e-06, + "loss": 0.3564, + "step": 25445 + }, + { + "epoch": 1.14514378730331, + "grad_norm": 4.398448944091797, + "learning_rate": 6.615732492805264e-06, + "loss": 0.2017, + "step": 25446 + }, + { + "epoch": 1.1451573521432448, + "grad_norm": 4.234980583190918, + "learning_rate": 6.615595450185008e-06, + "loss": 0.3171, + "step": 25447 + }, + { + "epoch": 1.1451709169831796, + "grad_norm": 5.924332618713379, + "learning_rate": 6.615458407564753e-06, + "loss": 0.3786, + "step": 25448 + }, + { + "epoch": 1.1451844818231145, + "grad_norm": 4.101608753204346, + "learning_rate": 6.615321364944498e-06, + "loss": 0.2548, + "step": 25449 + }, + { + "epoch": 1.1451980466630494, + "grad_norm": 5.284708499908447, + "learning_rate": 6.615184322324243e-06, + "loss": 0.3006, + "step": 25450 + }, + { + "epoch": 1.1452116115029842, + "grad_norm": 5.205947399139404, + "learning_rate": 6.615047279703989e-06, + "loss": 0.3341, + "step": 25451 + }, + { + "epoch": 1.145225176342919, + "grad_norm": 5.36533784866333, + "learning_rate": 6.614910237083734e-06, + "loss": 0.2571, + "step": 25452 + }, + { + "epoch": 1.145238741182854, + "grad_norm": 4.5644097328186035, + "learning_rate": 6.614773194463478e-06, + "loss": 0.2091, + "step": 25453 + }, + { + "epoch": 1.145252306022789, + "grad_norm": 5.159671783447266, + "learning_rate": 6.6146361518432235e-06, + "loss": 0.2602, + "step": 25454 + }, + { + "epoch": 1.145265870862724, + "grad_norm": 4.1621413230896, + "learning_rate": 6.6144991092229696e-06, + "loss": 0.1787, + "step": 25455 + }, + { + "epoch": 1.1452794357026588, + "grad_norm": 5.548057556152344, + "learning_rate": 6.614362066602714e-06, + "loss": 0.4479, + "step": 25456 + }, + { + "epoch": 1.1452930005425936, + "grad_norm": 7.091104507446289, + "learning_rate": 6.614225023982459e-06, + "loss": 0.3694, + "step": 25457 + }, + { + "epoch": 1.1453065653825285, + "grad_norm": 6.117274761199951, + "learning_rate": 6.614087981362204e-06, + "loss": 0.27, + "step": 25458 + }, + { + "epoch": 1.1453201302224634, + "grad_norm": 4.910948276519775, + "learning_rate": 6.6139509387419494e-06, + "loss": 0.2475, + "step": 25459 + }, + { + "epoch": 1.1453336950623982, + "grad_norm": 5.442081451416016, + "learning_rate": 6.613813896121695e-06, + "loss": 0.2536, + "step": 25460 + }, + { + "epoch": 1.145347259902333, + "grad_norm": 5.007435321807861, + "learning_rate": 6.61367685350144e-06, + "loss": 0.2695, + "step": 25461 + }, + { + "epoch": 1.145360824742268, + "grad_norm": 5.060303211212158, + "learning_rate": 6.613539810881184e-06, + "loss": 0.214, + "step": 25462 + }, + { + "epoch": 1.1453743895822028, + "grad_norm": 6.237643241882324, + "learning_rate": 6.613402768260929e-06, + "loss": 0.2857, + "step": 25463 + }, + { + "epoch": 1.1453879544221377, + "grad_norm": 4.803155899047852, + "learning_rate": 6.613265725640675e-06, + "loss": 0.2744, + "step": 25464 + }, + { + "epoch": 1.1454015192620728, + "grad_norm": 5.324122905731201, + "learning_rate": 6.61312868302042e-06, + "loss": 0.2919, + "step": 25465 + }, + { + "epoch": 1.1454150841020077, + "grad_norm": 3.697878122329712, + "learning_rate": 6.612991640400165e-06, + "loss": 0.1951, + "step": 25466 + }, + { + "epoch": 1.1454286489419425, + "grad_norm": 7.834198951721191, + "learning_rate": 6.61285459777991e-06, + "loss": 0.4555, + "step": 25467 + }, + { + "epoch": 1.1454422137818774, + "grad_norm": 5.378798484802246, + "learning_rate": 6.612717555159656e-06, + "loss": 0.3365, + "step": 25468 + }, + { + "epoch": 1.1454557786218122, + "grad_norm": 3.9930827617645264, + "learning_rate": 6.6125805125394e-06, + "loss": 0.2947, + "step": 25469 + }, + { + "epoch": 1.1454693434617471, + "grad_norm": 5.819803714752197, + "learning_rate": 6.6124434699191455e-06, + "loss": 0.3142, + "step": 25470 + }, + { + "epoch": 1.145482908301682, + "grad_norm": 5.182284832000732, + "learning_rate": 6.61230642729889e-06, + "loss": 0.2968, + "step": 25471 + }, + { + "epoch": 1.1454964731416168, + "grad_norm": 4.115514278411865, + "learning_rate": 6.612169384678636e-06, + "loss": 0.2543, + "step": 25472 + }, + { + "epoch": 1.145510037981552, + "grad_norm": 4.246429920196533, + "learning_rate": 6.612032342058381e-06, + "loss": 0.2704, + "step": 25473 + }, + { + "epoch": 1.1455236028214868, + "grad_norm": 5.047194480895996, + "learning_rate": 6.611895299438125e-06, + "loss": 0.2948, + "step": 25474 + }, + { + "epoch": 1.1455371676614217, + "grad_norm": 4.0963358879089355, + "learning_rate": 6.611758256817871e-06, + "loss": 0.1979, + "step": 25475 + }, + { + "epoch": 1.1455507325013565, + "grad_norm": 3.6201465129852295, + "learning_rate": 6.611621214197616e-06, + "loss": 0.2292, + "step": 25476 + }, + { + "epoch": 1.1455642973412914, + "grad_norm": 4.040622711181641, + "learning_rate": 6.611484171577362e-06, + "loss": 0.3156, + "step": 25477 + }, + { + "epoch": 1.1455778621812263, + "grad_norm": 4.60960054397583, + "learning_rate": 6.611347128957106e-06, + "loss": 0.2891, + "step": 25478 + }, + { + "epoch": 1.1455914270211611, + "grad_norm": 5.801400184631348, + "learning_rate": 6.611210086336851e-06, + "loss": 0.241, + "step": 25479 + }, + { + "epoch": 1.145604991861096, + "grad_norm": 4.332145690917969, + "learning_rate": 6.611073043716596e-06, + "loss": 0.2052, + "step": 25480 + }, + { + "epoch": 1.1456185567010309, + "grad_norm": 5.695178985595703, + "learning_rate": 6.610936001096342e-06, + "loss": 0.2457, + "step": 25481 + }, + { + "epoch": 1.1456321215409657, + "grad_norm": 4.517306804656982, + "learning_rate": 6.610798958476087e-06, + "loss": 0.2742, + "step": 25482 + }, + { + "epoch": 1.1456456863809006, + "grad_norm": 4.96690034866333, + "learning_rate": 6.610661915855832e-06, + "loss": 0.3176, + "step": 25483 + }, + { + "epoch": 1.1456592512208357, + "grad_norm": 7.933111667633057, + "learning_rate": 6.610524873235576e-06, + "loss": 0.3193, + "step": 25484 + }, + { + "epoch": 1.1456728160607705, + "grad_norm": 5.421787738800049, + "learning_rate": 6.610387830615322e-06, + "loss": 0.2463, + "step": 25485 + }, + { + "epoch": 1.1456863809007054, + "grad_norm": 3.4935102462768555, + "learning_rate": 6.6102507879950676e-06, + "loss": 0.1402, + "step": 25486 + }, + { + "epoch": 1.1456999457406403, + "grad_norm": 4.88854455947876, + "learning_rate": 6.610113745374812e-06, + "loss": 0.1605, + "step": 25487 + }, + { + "epoch": 1.1457135105805751, + "grad_norm": 4.797534465789795, + "learning_rate": 6.609976702754557e-06, + "loss": 0.3096, + "step": 25488 + }, + { + "epoch": 1.14572707542051, + "grad_norm": 5.412257671356201, + "learning_rate": 6.609839660134301e-06, + "loss": 0.2711, + "step": 25489 + }, + { + "epoch": 1.1457406402604449, + "grad_norm": 5.041074752807617, + "learning_rate": 6.6097026175140474e-06, + "loss": 0.2493, + "step": 25490 + }, + { + "epoch": 1.1457542051003797, + "grad_norm": 5.503734111785889, + "learning_rate": 6.609565574893793e-06, + "loss": 0.2406, + "step": 25491 + }, + { + "epoch": 1.1457677699403148, + "grad_norm": 4.557499408721924, + "learning_rate": 6.609428532273538e-06, + "loss": 0.2841, + "step": 25492 + }, + { + "epoch": 1.1457813347802497, + "grad_norm": 4.016343116760254, + "learning_rate": 6.609291489653282e-06, + "loss": 0.1523, + "step": 25493 + }, + { + "epoch": 1.1457948996201845, + "grad_norm": 5.357457160949707, + "learning_rate": 6.609154447033028e-06, + "loss": 0.2359, + "step": 25494 + }, + { + "epoch": 1.1458084644601194, + "grad_norm": 3.1820483207702637, + "learning_rate": 6.609017404412773e-06, + "loss": 0.1816, + "step": 25495 + }, + { + "epoch": 1.1458220293000543, + "grad_norm": 7.644023418426514, + "learning_rate": 6.608880361792518e-06, + "loss": 0.4146, + "step": 25496 + }, + { + "epoch": 1.1458355941399891, + "grad_norm": 5.36196231842041, + "learning_rate": 6.608743319172263e-06, + "loss": 0.2413, + "step": 25497 + }, + { + "epoch": 1.145849158979924, + "grad_norm": 5.228729248046875, + "learning_rate": 6.608606276552009e-06, + "loss": 0.1755, + "step": 25498 + }, + { + "epoch": 1.1458627238198589, + "grad_norm": 4.077974796295166, + "learning_rate": 6.608469233931753e-06, + "loss": 0.1861, + "step": 25499 + }, + { + "epoch": 1.1458762886597937, + "grad_norm": 4.0951104164123535, + "learning_rate": 6.608332191311498e-06, + "loss": 0.1879, + "step": 25500 + }, + { + "epoch": 1.1458898534997286, + "grad_norm": 5.705027103424072, + "learning_rate": 6.6081951486912436e-06, + "loss": 0.3308, + "step": 25501 + }, + { + "epoch": 1.1459034183396635, + "grad_norm": 4.155816555023193, + "learning_rate": 6.608058106070988e-06, + "loss": 0.1392, + "step": 25502 + }, + { + "epoch": 1.1459169831795986, + "grad_norm": 4.436150074005127, + "learning_rate": 6.607921063450734e-06, + "loss": 0.2122, + "step": 25503 + }, + { + "epoch": 1.1459305480195334, + "grad_norm": 5.469802379608154, + "learning_rate": 6.607784020830479e-06, + "loss": 0.3414, + "step": 25504 + }, + { + "epoch": 1.1459441128594683, + "grad_norm": 4.91152811050415, + "learning_rate": 6.607646978210223e-06, + "loss": 0.2703, + "step": 25505 + }, + { + "epoch": 1.1459576776994032, + "grad_norm": 5.5556230545043945, + "learning_rate": 6.607509935589969e-06, + "loss": 0.3489, + "step": 25506 + }, + { + "epoch": 1.145971242539338, + "grad_norm": 4.073869228363037, + "learning_rate": 6.607372892969715e-06, + "loss": 0.1597, + "step": 25507 + }, + { + "epoch": 1.1459848073792729, + "grad_norm": 8.082113265991211, + "learning_rate": 6.607235850349459e-06, + "loss": 0.2608, + "step": 25508 + }, + { + "epoch": 1.1459983722192078, + "grad_norm": 3.537837028503418, + "learning_rate": 6.607098807729204e-06, + "loss": 0.23, + "step": 25509 + }, + { + "epoch": 1.1460119370591426, + "grad_norm": 5.143096446990967, + "learning_rate": 6.606961765108949e-06, + "loss": 0.2417, + "step": 25510 + }, + { + "epoch": 1.1460255018990777, + "grad_norm": 4.647439479827881, + "learning_rate": 6.606824722488695e-06, + "loss": 0.1942, + "step": 25511 + }, + { + "epoch": 1.1460390667390126, + "grad_norm": 5.204136848449707, + "learning_rate": 6.60668767986844e-06, + "loss": 0.2092, + "step": 25512 + }, + { + "epoch": 1.1460526315789474, + "grad_norm": 3.882732391357422, + "learning_rate": 6.606550637248185e-06, + "loss": 0.2006, + "step": 25513 + }, + { + "epoch": 1.1460661964188823, + "grad_norm": 5.594210147857666, + "learning_rate": 6.606413594627929e-06, + "loss": 0.302, + "step": 25514 + }, + { + "epoch": 1.1460797612588172, + "grad_norm": 3.509021043777466, + "learning_rate": 6.606276552007675e-06, + "loss": 0.1791, + "step": 25515 + }, + { + "epoch": 1.146093326098752, + "grad_norm": 4.555437088012695, + "learning_rate": 6.60613950938742e-06, + "loss": 0.2357, + "step": 25516 + }, + { + "epoch": 1.146106890938687, + "grad_norm": 4.900329113006592, + "learning_rate": 6.6060024667671656e-06, + "loss": 0.3324, + "step": 25517 + }, + { + "epoch": 1.1461204557786218, + "grad_norm": 5.509345531463623, + "learning_rate": 6.60586542414691e-06, + "loss": 0.1998, + "step": 25518 + }, + { + "epoch": 1.1461340206185566, + "grad_norm": 5.148679733276367, + "learning_rate": 6.605728381526655e-06, + "loss": 0.2318, + "step": 25519 + }, + { + "epoch": 1.1461475854584915, + "grad_norm": 4.983109951019287, + "learning_rate": 6.605591338906401e-06, + "loss": 0.2383, + "step": 25520 + }, + { + "epoch": 1.1461611502984264, + "grad_norm": 4.265287399291992, + "learning_rate": 6.6054542962861454e-06, + "loss": 0.2683, + "step": 25521 + }, + { + "epoch": 1.1461747151383614, + "grad_norm": 5.310889720916748, + "learning_rate": 6.605317253665891e-06, + "loss": 0.2349, + "step": 25522 + }, + { + "epoch": 1.1461882799782963, + "grad_norm": 6.059336185455322, + "learning_rate": 6.605180211045635e-06, + "loss": 0.2155, + "step": 25523 + }, + { + "epoch": 1.1462018448182312, + "grad_norm": 5.178342342376709, + "learning_rate": 6.605043168425381e-06, + "loss": 0.2154, + "step": 25524 + }, + { + "epoch": 1.146215409658166, + "grad_norm": 4.8964619636535645, + "learning_rate": 6.604906125805126e-06, + "loss": 0.2899, + "step": 25525 + }, + { + "epoch": 1.146228974498101, + "grad_norm": 6.125366687774658, + "learning_rate": 6.604769083184871e-06, + "loss": 0.239, + "step": 25526 + }, + { + "epoch": 1.1462425393380358, + "grad_norm": 5.691176414489746, + "learning_rate": 6.604632040564616e-06, + "loss": 0.189, + "step": 25527 + }, + { + "epoch": 1.1462561041779706, + "grad_norm": 5.762553691864014, + "learning_rate": 6.604494997944362e-06, + "loss": 0.2336, + "step": 25528 + }, + { + "epoch": 1.1462696690179055, + "grad_norm": 4.102891445159912, + "learning_rate": 6.604357955324107e-06, + "loss": 0.241, + "step": 25529 + }, + { + "epoch": 1.1462832338578406, + "grad_norm": 6.464577674865723, + "learning_rate": 6.604220912703851e-06, + "loss": 0.3245, + "step": 25530 + }, + { + "epoch": 1.1462967986977755, + "grad_norm": 5.607354164123535, + "learning_rate": 6.604083870083596e-06, + "loss": 0.2197, + "step": 25531 + }, + { + "epoch": 1.1463103635377103, + "grad_norm": 5.283824443817139, + "learning_rate": 6.6039468274633416e-06, + "loss": 0.2656, + "step": 25532 + }, + { + "epoch": 1.1463239283776452, + "grad_norm": 6.815324783325195, + "learning_rate": 6.603809784843087e-06, + "loss": 0.3309, + "step": 25533 + }, + { + "epoch": 1.14633749321758, + "grad_norm": 7.305887699127197, + "learning_rate": 6.603672742222832e-06, + "loss": 0.4118, + "step": 25534 + }, + { + "epoch": 1.146351058057515, + "grad_norm": 4.453481197357178, + "learning_rate": 6.603535699602577e-06, + "loss": 0.1578, + "step": 25535 + }, + { + "epoch": 1.1463646228974498, + "grad_norm": 5.090311527252197, + "learning_rate": 6.6033986569823214e-06, + "loss": 0.2388, + "step": 25536 + }, + { + "epoch": 1.1463781877373846, + "grad_norm": 3.646505832672119, + "learning_rate": 6.6032616143620675e-06, + "loss": 0.2534, + "step": 25537 + }, + { + "epoch": 1.1463917525773195, + "grad_norm": 5.906156539916992, + "learning_rate": 6.603124571741813e-06, + "loss": 0.3378, + "step": 25538 + }, + { + "epoch": 1.1464053174172544, + "grad_norm": 3.9499573707580566, + "learning_rate": 6.602987529121557e-06, + "loss": 0.2506, + "step": 25539 + }, + { + "epoch": 1.1464188822571892, + "grad_norm": 4.584113121032715, + "learning_rate": 6.602850486501302e-06, + "loss": 0.2786, + "step": 25540 + }, + { + "epoch": 1.1464324470971243, + "grad_norm": 3.9768707752227783, + "learning_rate": 6.602713443881048e-06, + "loss": 0.1928, + "step": 25541 + }, + { + "epoch": 1.1464460119370592, + "grad_norm": 5.412789821624756, + "learning_rate": 6.6025764012607925e-06, + "loss": 0.2173, + "step": 25542 + }, + { + "epoch": 1.146459576776994, + "grad_norm": 4.740044116973877, + "learning_rate": 6.602439358640538e-06, + "loss": 0.2687, + "step": 25543 + }, + { + "epoch": 1.146473141616929, + "grad_norm": 4.701052665710449, + "learning_rate": 6.602302316020283e-06, + "loss": 0.2048, + "step": 25544 + }, + { + "epoch": 1.1464867064568638, + "grad_norm": 6.298023223876953, + "learning_rate": 6.602165273400027e-06, + "loss": 0.2968, + "step": 25545 + }, + { + "epoch": 1.1465002712967987, + "grad_norm": 5.654779434204102, + "learning_rate": 6.602028230779773e-06, + "loss": 0.2413, + "step": 25546 + }, + { + "epoch": 1.1465138361367335, + "grad_norm": 6.188808917999268, + "learning_rate": 6.601891188159518e-06, + "loss": 0.2493, + "step": 25547 + }, + { + "epoch": 1.1465274009766684, + "grad_norm": 6.199763774871826, + "learning_rate": 6.601754145539263e-06, + "loss": 0.2533, + "step": 25548 + }, + { + "epoch": 1.1465409658166035, + "grad_norm": 5.368503093719482, + "learning_rate": 6.601617102919008e-06, + "loss": 0.2092, + "step": 25549 + }, + { + "epoch": 1.1465545306565383, + "grad_norm": 4.670892238616943, + "learning_rate": 6.601480060298754e-06, + "loss": 0.2228, + "step": 25550 + }, + { + "epoch": 1.1465680954964732, + "grad_norm": 3.9962875843048096, + "learning_rate": 6.601343017678499e-06, + "loss": 0.2269, + "step": 25551 + }, + { + "epoch": 1.146581660336408, + "grad_norm": 7.164837837219238, + "learning_rate": 6.6012059750582434e-06, + "loss": 0.4289, + "step": 25552 + }, + { + "epoch": 1.146595225176343, + "grad_norm": 5.975026607513428, + "learning_rate": 6.601068932437989e-06, + "loss": 0.2663, + "step": 25553 + }, + { + "epoch": 1.1466087900162778, + "grad_norm": 4.828230857849121, + "learning_rate": 6.600931889817735e-06, + "loss": 0.2071, + "step": 25554 + }, + { + "epoch": 1.1466223548562127, + "grad_norm": 4.763265609741211, + "learning_rate": 6.600794847197479e-06, + "loss": 0.2338, + "step": 25555 + }, + { + "epoch": 1.1466359196961475, + "grad_norm": 4.710474491119385, + "learning_rate": 6.600657804577224e-06, + "loss": 0.2481, + "step": 25556 + }, + { + "epoch": 1.1466494845360824, + "grad_norm": 4.252895832061768, + "learning_rate": 6.6005207619569685e-06, + "loss": 0.1822, + "step": 25557 + }, + { + "epoch": 1.1466630493760173, + "grad_norm": 4.334090232849121, + "learning_rate": 6.600383719336714e-06, + "loss": 0.2642, + "step": 25558 + }, + { + "epoch": 1.1466766142159521, + "grad_norm": 4.8003010749816895, + "learning_rate": 6.60024667671646e-06, + "loss": 0.2054, + "step": 25559 + }, + { + "epoch": 1.1466901790558872, + "grad_norm": 5.8095622062683105, + "learning_rate": 6.600109634096205e-06, + "loss": 0.2665, + "step": 25560 + }, + { + "epoch": 1.146703743895822, + "grad_norm": 5.303353309631348, + "learning_rate": 6.599972591475949e-06, + "loss": 0.2354, + "step": 25561 + }, + { + "epoch": 1.146717308735757, + "grad_norm": 5.272759437561035, + "learning_rate": 6.599835548855694e-06, + "loss": 0.1983, + "step": 25562 + }, + { + "epoch": 1.1467308735756918, + "grad_norm": 5.311184883117676, + "learning_rate": 6.59969850623544e-06, + "loss": 0.2467, + "step": 25563 + }, + { + "epoch": 1.1467444384156267, + "grad_norm": 3.89477801322937, + "learning_rate": 6.599561463615185e-06, + "loss": 0.2417, + "step": 25564 + }, + { + "epoch": 1.1467580032555615, + "grad_norm": 4.858962059020996, + "learning_rate": 6.59942442099493e-06, + "loss": 0.2836, + "step": 25565 + }, + { + "epoch": 1.1467715680954964, + "grad_norm": 5.665237903594971, + "learning_rate": 6.599287378374675e-06, + "loss": 0.2887, + "step": 25566 + }, + { + "epoch": 1.1467851329354313, + "grad_norm": 4.654097080230713, + "learning_rate": 6.59915033575442e-06, + "loss": 0.2606, + "step": 25567 + }, + { + "epoch": 1.1467986977753664, + "grad_norm": 5.637823581695557, + "learning_rate": 6.5990132931341655e-06, + "loss": 0.203, + "step": 25568 + }, + { + "epoch": 1.1468122626153012, + "grad_norm": 6.659575939178467, + "learning_rate": 6.598876250513911e-06, + "loss": 0.3698, + "step": 25569 + }, + { + "epoch": 1.146825827455236, + "grad_norm": 4.502799034118652, + "learning_rate": 6.598739207893655e-06, + "loss": 0.1849, + "step": 25570 + }, + { + "epoch": 1.146839392295171, + "grad_norm": 4.735968112945557, + "learning_rate": 6.5986021652734e-06, + "loss": 0.2234, + "step": 25571 + }, + { + "epoch": 1.1468529571351058, + "grad_norm": 4.224954128265381, + "learning_rate": 6.598465122653146e-06, + "loss": 0.1979, + "step": 25572 + }, + { + "epoch": 1.1468665219750407, + "grad_norm": 3.8618147373199463, + "learning_rate": 6.5983280800328905e-06, + "loss": 0.226, + "step": 25573 + }, + { + "epoch": 1.1468800868149756, + "grad_norm": 5.139120101928711, + "learning_rate": 6.598191037412636e-06, + "loss": 0.3095, + "step": 25574 + }, + { + "epoch": 1.1468936516549104, + "grad_norm": 4.223271369934082, + "learning_rate": 6.598053994792381e-06, + "loss": 0.2573, + "step": 25575 + }, + { + "epoch": 1.1469072164948453, + "grad_norm": 4.026597023010254, + "learning_rate": 6.597916952172127e-06, + "loss": 0.2384, + "step": 25576 + }, + { + "epoch": 1.1469207813347801, + "grad_norm": 5.16265344619751, + "learning_rate": 6.597779909551871e-06, + "loss": 0.3107, + "step": 25577 + }, + { + "epoch": 1.146934346174715, + "grad_norm": 4.488692760467529, + "learning_rate": 6.597642866931616e-06, + "loss": 0.2869, + "step": 25578 + }, + { + "epoch": 1.14694791101465, + "grad_norm": 5.616240978240967, + "learning_rate": 6.597505824311361e-06, + "loss": 0.3103, + "step": 25579 + }, + { + "epoch": 1.146961475854585, + "grad_norm": 4.151998043060303, + "learning_rate": 6.597368781691107e-06, + "loss": 0.2172, + "step": 25580 + }, + { + "epoch": 1.1469750406945198, + "grad_norm": 5.666859149932861, + "learning_rate": 6.597231739070852e-06, + "loss": 0.2069, + "step": 25581 + }, + { + "epoch": 1.1469886055344547, + "grad_norm": 4.8844733238220215, + "learning_rate": 6.597094696450596e-06, + "loss": 0.2629, + "step": 25582 + }, + { + "epoch": 1.1470021703743896, + "grad_norm": 5.077119827270508, + "learning_rate": 6.5969576538303414e-06, + "loss": 0.395, + "step": 25583 + }, + { + "epoch": 1.1470157352143244, + "grad_norm": 5.706470489501953, + "learning_rate": 6.596820611210087e-06, + "loss": 0.2149, + "step": 25584 + }, + { + "epoch": 1.1470293000542593, + "grad_norm": 6.405302047729492, + "learning_rate": 6.596683568589833e-06, + "loss": 0.2796, + "step": 25585 + }, + { + "epoch": 1.1470428648941942, + "grad_norm": 4.312305450439453, + "learning_rate": 6.596546525969577e-06, + "loss": 0.2325, + "step": 25586 + }, + { + "epoch": 1.1470564297341292, + "grad_norm": 5.478294849395752, + "learning_rate": 6.596409483349322e-06, + "loss": 0.1996, + "step": 25587 + }, + { + "epoch": 1.1470699945740641, + "grad_norm": 4.503505229949951, + "learning_rate": 6.5962724407290665e-06, + "loss": 0.2023, + "step": 25588 + }, + { + "epoch": 1.147083559413999, + "grad_norm": 5.379439830780029, + "learning_rate": 6.5961353981088125e-06, + "loss": 0.2246, + "step": 25589 + }, + { + "epoch": 1.1470971242539338, + "grad_norm": 5.556167125701904, + "learning_rate": 6.595998355488558e-06, + "loss": 0.4079, + "step": 25590 + }, + { + "epoch": 1.1471106890938687, + "grad_norm": 5.02701997756958, + "learning_rate": 6.595861312868302e-06, + "loss": 0.2438, + "step": 25591 + }, + { + "epoch": 1.1471242539338036, + "grad_norm": 7.49232816696167, + "learning_rate": 6.595724270248047e-06, + "loss": 0.3295, + "step": 25592 + }, + { + "epoch": 1.1471378187737384, + "grad_norm": 4.351036071777344, + "learning_rate": 6.595587227627793e-06, + "loss": 0.1799, + "step": 25593 + }, + { + "epoch": 1.1471513836136733, + "grad_norm": 4.110342025756836, + "learning_rate": 6.595450185007538e-06, + "loss": 0.1714, + "step": 25594 + }, + { + "epoch": 1.1471649484536082, + "grad_norm": 4.281301498413086, + "learning_rate": 6.595313142387283e-06, + "loss": 0.2582, + "step": 25595 + }, + { + "epoch": 1.147178513293543, + "grad_norm": 4.3970184326171875, + "learning_rate": 6.595176099767028e-06, + "loss": 0.2596, + "step": 25596 + }, + { + "epoch": 1.1471920781334781, + "grad_norm": 3.9868571758270264, + "learning_rate": 6.595039057146774e-06, + "loss": 0.2389, + "step": 25597 + }, + { + "epoch": 1.147205642973413, + "grad_norm": 4.966675758361816, + "learning_rate": 6.594902014526518e-06, + "loss": 0.2826, + "step": 25598 + }, + { + "epoch": 1.1472192078133479, + "grad_norm": 4.277385711669922, + "learning_rate": 6.5947649719062635e-06, + "loss": 0.2255, + "step": 25599 + }, + { + "epoch": 1.1472327726532827, + "grad_norm": 5.713619232177734, + "learning_rate": 6.594627929286009e-06, + "loss": 0.1912, + "step": 25600 + }, + { + "epoch": 1.1472463374932176, + "grad_norm": 3.5601658821105957, + "learning_rate": 6.594490886665753e-06, + "loss": 0.2579, + "step": 25601 + }, + { + "epoch": 1.1472599023331524, + "grad_norm": 4.461913108825684, + "learning_rate": 6.594353844045499e-06, + "loss": 0.2423, + "step": 25602 + }, + { + "epoch": 1.1472734671730873, + "grad_norm": 3.79657244682312, + "learning_rate": 6.594216801425244e-06, + "loss": 0.1715, + "step": 25603 + }, + { + "epoch": 1.1472870320130222, + "grad_norm": 3.6402103900909424, + "learning_rate": 6.5940797588049885e-06, + "loss": 0.1941, + "step": 25604 + }, + { + "epoch": 1.147300596852957, + "grad_norm": 3.929643154144287, + "learning_rate": 6.593942716184734e-06, + "loss": 0.2198, + "step": 25605 + }, + { + "epoch": 1.1473141616928921, + "grad_norm": 4.578892707824707, + "learning_rate": 6.59380567356448e-06, + "loss": 0.2583, + "step": 25606 + }, + { + "epoch": 1.147327726532827, + "grad_norm": 4.291457653045654, + "learning_rate": 6.593668630944224e-06, + "loss": 0.1888, + "step": 25607 + }, + { + "epoch": 1.1473412913727619, + "grad_norm": 3.438326597213745, + "learning_rate": 6.593531588323969e-06, + "loss": 0.1122, + "step": 25608 + }, + { + "epoch": 1.1473548562126967, + "grad_norm": 5.214023113250732, + "learning_rate": 6.593394545703714e-06, + "loss": 0.1989, + "step": 25609 + }, + { + "epoch": 1.1473684210526316, + "grad_norm": 4.944604873657227, + "learning_rate": 6.5932575030834604e-06, + "loss": 0.2378, + "step": 25610 + }, + { + "epoch": 1.1473819858925665, + "grad_norm": 5.348909378051758, + "learning_rate": 6.593120460463205e-06, + "loss": 0.2171, + "step": 25611 + }, + { + "epoch": 1.1473955507325013, + "grad_norm": 3.7524051666259766, + "learning_rate": 6.59298341784295e-06, + "loss": 0.1397, + "step": 25612 + }, + { + "epoch": 1.1474091155724362, + "grad_norm": 3.134763240814209, + "learning_rate": 6.592846375222694e-06, + "loss": 0.148, + "step": 25613 + }, + { + "epoch": 1.147422680412371, + "grad_norm": 4.510805130004883, + "learning_rate": 6.5927093326024394e-06, + "loss": 0.2447, + "step": 25614 + }, + { + "epoch": 1.147436245252306, + "grad_norm": 2.8282406330108643, + "learning_rate": 6.5925722899821855e-06, + "loss": 0.1506, + "step": 25615 + }, + { + "epoch": 1.147449810092241, + "grad_norm": 4.046012878417969, + "learning_rate": 6.59243524736193e-06, + "loss": 0.2113, + "step": 25616 + }, + { + "epoch": 1.1474633749321759, + "grad_norm": 5.187235355377197, + "learning_rate": 6.592298204741675e-06, + "loss": 0.2291, + "step": 25617 + }, + { + "epoch": 1.1474769397721107, + "grad_norm": 5.712250709533691, + "learning_rate": 6.59216116212142e-06, + "loss": 0.244, + "step": 25618 + }, + { + "epoch": 1.1474905046120456, + "grad_norm": 4.0883002281188965, + "learning_rate": 6.592024119501166e-06, + "loss": 0.162, + "step": 25619 + }, + { + "epoch": 1.1475040694519805, + "grad_norm": 4.302430152893066, + "learning_rate": 6.5918870768809105e-06, + "loss": 0.1576, + "step": 25620 + }, + { + "epoch": 1.1475176342919153, + "grad_norm": 3.9932334423065186, + "learning_rate": 6.591750034260656e-06, + "loss": 0.1684, + "step": 25621 + }, + { + "epoch": 1.1475311991318502, + "grad_norm": 3.193985939025879, + "learning_rate": 6.5916129916404e-06, + "loss": 0.1616, + "step": 25622 + }, + { + "epoch": 1.147544763971785, + "grad_norm": 2.5158705711364746, + "learning_rate": 6.591475949020146e-06, + "loss": 0.0932, + "step": 25623 + }, + { + "epoch": 1.14755832881172, + "grad_norm": 4.9592719078063965, + "learning_rate": 6.591338906399891e-06, + "loss": 0.2182, + "step": 25624 + }, + { + "epoch": 1.147571893651655, + "grad_norm": 3.6293582916259766, + "learning_rate": 6.591201863779636e-06, + "loss": 0.1576, + "step": 25625 + }, + { + "epoch": 1.1475854584915899, + "grad_norm": 4.268277168273926, + "learning_rate": 6.591064821159381e-06, + "loss": 0.2305, + "step": 25626 + }, + { + "epoch": 1.1475990233315247, + "grad_norm": 4.186465740203857, + "learning_rate": 6.590927778539126e-06, + "loss": 0.191, + "step": 25627 + }, + { + "epoch": 1.1476125881714596, + "grad_norm": 4.935495853424072, + "learning_rate": 6.590790735918872e-06, + "loss": 0.2084, + "step": 25628 + }, + { + "epoch": 1.1476261530113945, + "grad_norm": 3.2147741317749023, + "learning_rate": 6.590653693298616e-06, + "loss": 0.1406, + "step": 25629 + }, + { + "epoch": 1.1476397178513293, + "grad_norm": 4.70916223526001, + "learning_rate": 6.5905166506783615e-06, + "loss": 0.2249, + "step": 25630 + }, + { + "epoch": 1.1476532826912642, + "grad_norm": 6.540712833404541, + "learning_rate": 6.590379608058106e-06, + "loss": 0.17, + "step": 25631 + }, + { + "epoch": 1.147666847531199, + "grad_norm": 4.086082458496094, + "learning_rate": 6.590242565437852e-06, + "loss": 0.16, + "step": 25632 + }, + { + "epoch": 1.147680412371134, + "grad_norm": 3.905876398086548, + "learning_rate": 6.590105522817597e-06, + "loss": 0.1447, + "step": 25633 + }, + { + "epoch": 1.1476939772110688, + "grad_norm": 4.112987995147705, + "learning_rate": 6.589968480197342e-06, + "loss": 0.129, + "step": 25634 + }, + { + "epoch": 1.147707542051004, + "grad_norm": 4.706186771392822, + "learning_rate": 6.5898314375770865e-06, + "loss": 0.2038, + "step": 25635 + }, + { + "epoch": 1.1477211068909388, + "grad_norm": 3.254164218902588, + "learning_rate": 6.5896943949568325e-06, + "loss": 0.1311, + "step": 25636 + }, + { + "epoch": 1.1477346717308736, + "grad_norm": 4.293368339538574, + "learning_rate": 6.589557352336578e-06, + "loss": 0.2086, + "step": 25637 + }, + { + "epoch": 1.1477482365708085, + "grad_norm": 3.9877688884735107, + "learning_rate": 6.589420309716322e-06, + "loss": 0.1896, + "step": 25638 + }, + { + "epoch": 1.1477618014107434, + "grad_norm": 5.013264179229736, + "learning_rate": 6.589283267096067e-06, + "loss": 0.1819, + "step": 25639 + }, + { + "epoch": 1.1477753662506782, + "grad_norm": 4.488943576812744, + "learning_rate": 6.5891462244758116e-06, + "loss": 0.2056, + "step": 25640 + }, + { + "epoch": 1.147788931090613, + "grad_norm": 4.323719024658203, + "learning_rate": 6.589009181855558e-06, + "loss": 0.1959, + "step": 25641 + }, + { + "epoch": 1.147802495930548, + "grad_norm": 3.720634698867798, + "learning_rate": 6.588872139235303e-06, + "loss": 0.2073, + "step": 25642 + }, + { + "epoch": 1.1478160607704828, + "grad_norm": 3.924591541290283, + "learning_rate": 6.588735096615048e-06, + "loss": 0.1808, + "step": 25643 + }, + { + "epoch": 1.147829625610418, + "grad_norm": 5.371158123016357, + "learning_rate": 6.588598053994792e-06, + "loss": 0.1431, + "step": 25644 + }, + { + "epoch": 1.1478431904503528, + "grad_norm": 5.489882469177246, + "learning_rate": 6.588461011374538e-06, + "loss": 0.215, + "step": 25645 + }, + { + "epoch": 1.1478567552902876, + "grad_norm": 4.412069320678711, + "learning_rate": 6.5883239687542835e-06, + "loss": 0.1607, + "step": 25646 + }, + { + "epoch": 1.1478703201302225, + "grad_norm": 3.4915125370025635, + "learning_rate": 6.588186926134028e-06, + "loss": 0.1372, + "step": 25647 + }, + { + "epoch": 1.1478838849701574, + "grad_norm": 3.8452961444854736, + "learning_rate": 6.588049883513773e-06, + "loss": 0.1572, + "step": 25648 + }, + { + "epoch": 1.1478974498100922, + "grad_norm": 7.9261698722839355, + "learning_rate": 6.587912840893519e-06, + "loss": 0.3752, + "step": 25649 + }, + { + "epoch": 1.147911014650027, + "grad_norm": 5.55902099609375, + "learning_rate": 6.587775798273263e-06, + "loss": 0.1993, + "step": 25650 + }, + { + "epoch": 1.147924579489962, + "grad_norm": 3.9707586765289307, + "learning_rate": 6.5876387556530085e-06, + "loss": 0.1996, + "step": 25651 + }, + { + "epoch": 1.1479381443298968, + "grad_norm": 4.8956828117370605, + "learning_rate": 6.587501713032754e-06, + "loss": 0.1922, + "step": 25652 + }, + { + "epoch": 1.1479517091698317, + "grad_norm": 4.915613651275635, + "learning_rate": 6.587364670412498e-06, + "loss": 0.2147, + "step": 25653 + }, + { + "epoch": 1.1479652740097668, + "grad_norm": 3.9911725521087646, + "learning_rate": 6.587227627792244e-06, + "loss": 0.1177, + "step": 25654 + }, + { + "epoch": 1.1479788388497016, + "grad_norm": 5.315305233001709, + "learning_rate": 6.587090585171989e-06, + "loss": 0.2064, + "step": 25655 + }, + { + "epoch": 1.1479924036896365, + "grad_norm": 4.100100040435791, + "learning_rate": 6.5869535425517336e-06, + "loss": 0.1563, + "step": 25656 + }, + { + "epoch": 1.1480059685295714, + "grad_norm": 3.705899238586426, + "learning_rate": 6.586816499931479e-06, + "loss": 0.1062, + "step": 25657 + }, + { + "epoch": 1.1480195333695062, + "grad_norm": 4.944397449493408, + "learning_rate": 6.586679457311225e-06, + "loss": 0.2279, + "step": 25658 + }, + { + "epoch": 1.148033098209441, + "grad_norm": 4.973833084106445, + "learning_rate": 6.58654241469097e-06, + "loss": 0.1882, + "step": 25659 + }, + { + "epoch": 1.148046663049376, + "grad_norm": 4.190924167633057, + "learning_rate": 6.586405372070714e-06, + "loss": 0.1484, + "step": 25660 + }, + { + "epoch": 1.1480602278893108, + "grad_norm": 5.377603054046631, + "learning_rate": 6.5862683294504595e-06, + "loss": 0.1845, + "step": 25661 + }, + { + "epoch": 1.1480737927292457, + "grad_norm": 5.003084659576416, + "learning_rate": 6.5861312868302055e-06, + "loss": 0.1678, + "step": 25662 + }, + { + "epoch": 1.1480873575691808, + "grad_norm": 4.116438388824463, + "learning_rate": 6.58599424420995e-06, + "loss": 0.1812, + "step": 25663 + }, + { + "epoch": 1.1481009224091157, + "grad_norm": 2.906247615814209, + "learning_rate": 6.585857201589695e-06, + "loss": 0.122, + "step": 25664 + }, + { + "epoch": 1.1481144872490505, + "grad_norm": 4.9698967933654785, + "learning_rate": 6.585720158969439e-06, + "loss": 0.1533, + "step": 25665 + }, + { + "epoch": 1.1481280520889854, + "grad_norm": 4.807638645172119, + "learning_rate": 6.585583116349185e-06, + "loss": 0.2193, + "step": 25666 + }, + { + "epoch": 1.1481416169289203, + "grad_norm": 6.3338541984558105, + "learning_rate": 6.5854460737289305e-06, + "loss": 0.2222, + "step": 25667 + }, + { + "epoch": 1.1481551817688551, + "grad_norm": 5.798806190490723, + "learning_rate": 6.585309031108676e-06, + "loss": 0.2242, + "step": 25668 + }, + { + "epoch": 1.14816874660879, + "grad_norm": 3.704049587249756, + "learning_rate": 6.58517198848842e-06, + "loss": 0.1301, + "step": 25669 + }, + { + "epoch": 1.1481823114487248, + "grad_norm": 4.1440558433532715, + "learning_rate": 6.585034945868165e-06, + "loss": 0.1404, + "step": 25670 + }, + { + "epoch": 1.1481958762886597, + "grad_norm": 3.5901787281036377, + "learning_rate": 6.584897903247911e-06, + "loss": 0.1614, + "step": 25671 + }, + { + "epoch": 1.1482094411285946, + "grad_norm": 4.833906173706055, + "learning_rate": 6.584760860627656e-06, + "loss": 0.1923, + "step": 25672 + }, + { + "epoch": 1.1482230059685297, + "grad_norm": 5.42853307723999, + "learning_rate": 6.584623818007401e-06, + "loss": 0.1941, + "step": 25673 + }, + { + "epoch": 1.1482365708084645, + "grad_norm": 6.266079902648926, + "learning_rate": 6.584486775387146e-06, + "loss": 0.3103, + "step": 25674 + }, + { + "epoch": 1.1482501356483994, + "grad_norm": 8.216459274291992, + "learning_rate": 6.584349732766891e-06, + "loss": 0.3109, + "step": 25675 + }, + { + "epoch": 1.1482637004883343, + "grad_norm": 5.332788467407227, + "learning_rate": 6.584212690146636e-06, + "loss": 0.267, + "step": 25676 + }, + { + "epoch": 1.1482772653282691, + "grad_norm": 4.8773698806762695, + "learning_rate": 6.5840756475263815e-06, + "loss": 0.2223, + "step": 25677 + }, + { + "epoch": 1.148290830168204, + "grad_norm": 4.856715202331543, + "learning_rate": 6.583938604906126e-06, + "loss": 0.2163, + "step": 25678 + }, + { + "epoch": 1.1483043950081389, + "grad_norm": 4.708834648132324, + "learning_rate": 6.583801562285872e-06, + "loss": 0.1673, + "step": 25679 + }, + { + "epoch": 1.1483179598480737, + "grad_norm": 5.529089450836182, + "learning_rate": 6.583664519665617e-06, + "loss": 0.153, + "step": 25680 + }, + { + "epoch": 1.1483315246880086, + "grad_norm": 5.946624279022217, + "learning_rate": 6.583527477045361e-06, + "loss": 0.2514, + "step": 25681 + }, + { + "epoch": 1.1483450895279437, + "grad_norm": 5.645781517028809, + "learning_rate": 6.5833904344251065e-06, + "loss": 0.2297, + "step": 25682 + }, + { + "epoch": 1.1483586543678785, + "grad_norm": 7.574763298034668, + "learning_rate": 6.583253391804852e-06, + "loss": 0.3517, + "step": 25683 + }, + { + "epoch": 1.1483722192078134, + "grad_norm": 6.959371566772461, + "learning_rate": 6.583116349184597e-06, + "loss": 0.3489, + "step": 25684 + }, + { + "epoch": 1.1483857840477483, + "grad_norm": 4.8575921058654785, + "learning_rate": 6.582979306564342e-06, + "loss": 0.254, + "step": 25685 + }, + { + "epoch": 1.1483993488876831, + "grad_norm": 7.617810249328613, + "learning_rate": 6.582842263944087e-06, + "loss": 0.2565, + "step": 25686 + }, + { + "epoch": 1.148412913727618, + "grad_norm": 4.634633541107178, + "learning_rate": 6.582705221323832e-06, + "loss": 0.1998, + "step": 25687 + }, + { + "epoch": 1.1484264785675529, + "grad_norm": 8.67138671875, + "learning_rate": 6.582568178703578e-06, + "loss": 0.3722, + "step": 25688 + }, + { + "epoch": 1.1484400434074877, + "grad_norm": 5.657286643981934, + "learning_rate": 6.582431136083323e-06, + "loss": 0.2524, + "step": 25689 + }, + { + "epoch": 1.1484536082474226, + "grad_norm": 6.622600078582764, + "learning_rate": 6.582294093463067e-06, + "loss": 0.2723, + "step": 25690 + }, + { + "epoch": 1.1484671730873575, + "grad_norm": 7.574042797088623, + "learning_rate": 6.582157050842812e-06, + "loss": 0.301, + "step": 25691 + }, + { + "epoch": 1.1484807379272925, + "grad_norm": 6.483291149139404, + "learning_rate": 6.582020008222558e-06, + "loss": 0.2608, + "step": 25692 + }, + { + "epoch": 1.1484943027672274, + "grad_norm": 7.095229148864746, + "learning_rate": 6.5818829656023035e-06, + "loss": 0.2867, + "step": 25693 + }, + { + "epoch": 1.1485078676071623, + "grad_norm": 5.169473648071289, + "learning_rate": 6.581745922982048e-06, + "loss": 0.263, + "step": 25694 + }, + { + "epoch": 1.1485214324470971, + "grad_norm": 5.640167713165283, + "learning_rate": 6.581608880361793e-06, + "loss": 0.2439, + "step": 25695 + }, + { + "epoch": 1.148534997287032, + "grad_norm": 4.993691444396973, + "learning_rate": 6.581471837741537e-06, + "loss": 0.1922, + "step": 25696 + }, + { + "epoch": 1.1485485621269669, + "grad_norm": 8.25411605834961, + "learning_rate": 6.581334795121283e-06, + "loss": 0.4593, + "step": 25697 + }, + { + "epoch": 1.1485621269669017, + "grad_norm": 5.324534893035889, + "learning_rate": 6.5811977525010285e-06, + "loss": 0.2637, + "step": 25698 + }, + { + "epoch": 1.1485756918068366, + "grad_norm": 4.8620805740356445, + "learning_rate": 6.581060709880773e-06, + "loss": 0.1938, + "step": 25699 + }, + { + "epoch": 1.1485892566467715, + "grad_norm": 4.363669395446777, + "learning_rate": 6.580923667260518e-06, + "loss": 0.207, + "step": 25700 + }, + { + "epoch": 1.1486028214867066, + "grad_norm": 4.488943576812744, + "learning_rate": 6.580786624640264e-06, + "loss": 0.2149, + "step": 25701 + }, + { + "epoch": 1.1486163863266414, + "grad_norm": 6.434810638427734, + "learning_rate": 6.580649582020009e-06, + "loss": 0.342, + "step": 25702 + }, + { + "epoch": 1.1486299511665763, + "grad_norm": 6.282656192779541, + "learning_rate": 6.580512539399754e-06, + "loss": 0.3252, + "step": 25703 + }, + { + "epoch": 1.1486435160065112, + "grad_norm": 4.644010066986084, + "learning_rate": 6.580375496779499e-06, + "loss": 0.3314, + "step": 25704 + }, + { + "epoch": 1.148657080846446, + "grad_norm": 5.347945213317871, + "learning_rate": 6.580238454159245e-06, + "loss": 0.1959, + "step": 25705 + }, + { + "epoch": 1.1486706456863809, + "grad_norm": 5.604955196380615, + "learning_rate": 6.580101411538989e-06, + "loss": 0.2519, + "step": 25706 + }, + { + "epoch": 1.1486842105263158, + "grad_norm": 5.993802547454834, + "learning_rate": 6.579964368918734e-06, + "loss": 0.2842, + "step": 25707 + }, + { + "epoch": 1.1486977753662506, + "grad_norm": 5.336165428161621, + "learning_rate": 6.5798273262984795e-06, + "loss": 0.4295, + "step": 25708 + }, + { + "epoch": 1.1487113402061855, + "grad_norm": 4.351897239685059, + "learning_rate": 6.579690283678224e-06, + "loss": 0.2248, + "step": 25709 + }, + { + "epoch": 1.1487249050461203, + "grad_norm": 6.02398157119751, + "learning_rate": 6.57955324105797e-06, + "loss": 0.4055, + "step": 25710 + }, + { + "epoch": 1.1487384698860554, + "grad_norm": 5.898756504058838, + "learning_rate": 6.579416198437715e-06, + "loss": 0.2889, + "step": 25711 + }, + { + "epoch": 1.1487520347259903, + "grad_norm": 6.247974872589111, + "learning_rate": 6.579279155817459e-06, + "loss": 0.3503, + "step": 25712 + }, + { + "epoch": 1.1487655995659252, + "grad_norm": 5.096131324768066, + "learning_rate": 6.5791421131972045e-06, + "loss": 0.2424, + "step": 25713 + }, + { + "epoch": 1.14877916440586, + "grad_norm": 5.878072738647461, + "learning_rate": 6.5790050705769506e-06, + "loss": 0.3969, + "step": 25714 + }, + { + "epoch": 1.148792729245795, + "grad_norm": 5.380260944366455, + "learning_rate": 6.578868027956695e-06, + "loss": 0.422, + "step": 25715 + }, + { + "epoch": 1.1488062940857298, + "grad_norm": 7.052535533905029, + "learning_rate": 6.57873098533644e-06, + "loss": 0.4102, + "step": 25716 + }, + { + "epoch": 1.1488198589256646, + "grad_norm": 7.980367660522461, + "learning_rate": 6.578593942716185e-06, + "loss": 0.4712, + "step": 25717 + }, + { + "epoch": 1.1488334237655995, + "grad_norm": 6.161096572875977, + "learning_rate": 6.578456900095931e-06, + "loss": 0.4886, + "step": 25718 + }, + { + "epoch": 1.1488469886055344, + "grad_norm": 5.759807109832764, + "learning_rate": 6.578319857475676e-06, + "loss": 0.2974, + "step": 25719 + }, + { + "epoch": 1.1488605534454694, + "grad_norm": 6.826361179351807, + "learning_rate": 6.578182814855421e-06, + "loss": 0.4714, + "step": 25720 + }, + { + "epoch": 1.1488741182854043, + "grad_norm": 6.510914325714111, + "learning_rate": 6.578045772235165e-06, + "loss": 0.3333, + "step": 25721 + }, + { + "epoch": 1.1488876831253392, + "grad_norm": 6.283134937286377, + "learning_rate": 6.57790872961491e-06, + "loss": 0.3493, + "step": 25722 + }, + { + "epoch": 1.148901247965274, + "grad_norm": 5.682454586029053, + "learning_rate": 6.577771686994656e-06, + "loss": 0.2156, + "step": 25723 + }, + { + "epoch": 1.148914812805209, + "grad_norm": 5.233019828796387, + "learning_rate": 6.577634644374401e-06, + "loss": 0.2946, + "step": 25724 + }, + { + "epoch": 1.1489283776451438, + "grad_norm": 4.9649224281311035, + "learning_rate": 6.577497601754146e-06, + "loss": 0.3303, + "step": 25725 + }, + { + "epoch": 1.1489419424850786, + "grad_norm": 5.01568603515625, + "learning_rate": 6.577360559133891e-06, + "loss": 0.2381, + "step": 25726 + }, + { + "epoch": 1.1489555073250135, + "grad_norm": 4.7240519523620605, + "learning_rate": 6.577223516513637e-06, + "loss": 0.3348, + "step": 25727 + }, + { + "epoch": 1.1489690721649484, + "grad_norm": 4.875454425811768, + "learning_rate": 6.577086473893381e-06, + "loss": 0.1832, + "step": 25728 + }, + { + "epoch": 1.1489826370048832, + "grad_norm": 5.276713848114014, + "learning_rate": 6.5769494312731265e-06, + "loss": 0.2719, + "step": 25729 + }, + { + "epoch": 1.1489962018448183, + "grad_norm": 7.02701473236084, + "learning_rate": 6.576812388652871e-06, + "loss": 0.3623, + "step": 25730 + }, + { + "epoch": 1.1490097666847532, + "grad_norm": 5.213370323181152, + "learning_rate": 6.576675346032617e-06, + "loss": 0.4403, + "step": 25731 + }, + { + "epoch": 1.149023331524688, + "grad_norm": 4.773055553436279, + "learning_rate": 6.576538303412362e-06, + "loss": 0.258, + "step": 25732 + }, + { + "epoch": 1.149036896364623, + "grad_norm": 6.542830944061279, + "learning_rate": 6.576401260792106e-06, + "loss": 0.3994, + "step": 25733 + }, + { + "epoch": 1.1490504612045578, + "grad_norm": 6.528240203857422, + "learning_rate": 6.576264218171852e-06, + "loss": 0.4505, + "step": 25734 + }, + { + "epoch": 1.1490640260444926, + "grad_norm": 6.104605674743652, + "learning_rate": 6.576127175551598e-06, + "loss": 0.3822, + "step": 25735 + }, + { + "epoch": 1.1490775908844275, + "grad_norm": 5.815581798553467, + "learning_rate": 6.575990132931343e-06, + "loss": 0.2683, + "step": 25736 + }, + { + "epoch": 1.1490911557243624, + "grad_norm": 5.021233558654785, + "learning_rate": 6.575853090311087e-06, + "loss": 0.2911, + "step": 25737 + }, + { + "epoch": 1.1491047205642975, + "grad_norm": 4.305905342102051, + "learning_rate": 6.575716047690832e-06, + "loss": 0.1711, + "step": 25738 + }, + { + "epoch": 1.1491182854042323, + "grad_norm": 4.117875576019287, + "learning_rate": 6.575579005070577e-06, + "loss": 0.3482, + "step": 25739 + }, + { + "epoch": 1.1491318502441672, + "grad_norm": 4.286398887634277, + "learning_rate": 6.575441962450323e-06, + "loss": 0.2827, + "step": 25740 + }, + { + "epoch": 1.149145415084102, + "grad_norm": 5.4038848876953125, + "learning_rate": 6.575304919830068e-06, + "loss": 0.2314, + "step": 25741 + }, + { + "epoch": 1.149158979924037, + "grad_norm": 5.344267845153809, + "learning_rate": 6.575167877209813e-06, + "loss": 0.3546, + "step": 25742 + }, + { + "epoch": 1.1491725447639718, + "grad_norm": 6.950006008148193, + "learning_rate": 6.575030834589557e-06, + "loss": 0.2978, + "step": 25743 + }, + { + "epoch": 1.1491861096039067, + "grad_norm": 4.52239465713501, + "learning_rate": 6.574893791969303e-06, + "loss": 0.3215, + "step": 25744 + }, + { + "epoch": 1.1491996744438415, + "grad_norm": 5.835341453552246, + "learning_rate": 6.5747567493490486e-06, + "loss": 0.2366, + "step": 25745 + }, + { + "epoch": 1.1492132392837764, + "grad_norm": 5.209780693054199, + "learning_rate": 6.574619706728793e-06, + "loss": 0.3216, + "step": 25746 + }, + { + "epoch": 1.1492268041237113, + "grad_norm": 5.814786911010742, + "learning_rate": 6.574482664108538e-06, + "loss": 0.3032, + "step": 25747 + }, + { + "epoch": 1.1492403689636461, + "grad_norm": 5.36794376373291, + "learning_rate": 6.574345621488284e-06, + "loss": 0.3913, + "step": 25748 + }, + { + "epoch": 1.1492539338035812, + "grad_norm": 6.3582987785339355, + "learning_rate": 6.5742085788680284e-06, + "loss": 0.3253, + "step": 25749 + }, + { + "epoch": 1.149267498643516, + "grad_norm": 5.690591812133789, + "learning_rate": 6.574071536247774e-06, + "loss": 0.2641, + "step": 25750 + }, + { + "epoch": 1.149281063483451, + "grad_norm": 6.204166889190674, + "learning_rate": 6.573934493627519e-06, + "loss": 0.3382, + "step": 25751 + }, + { + "epoch": 1.1492946283233858, + "grad_norm": 5.068366527557373, + "learning_rate": 6.573797451007263e-06, + "loss": 0.303, + "step": 25752 + }, + { + "epoch": 1.1493081931633207, + "grad_norm": 5.402190208435059, + "learning_rate": 6.573660408387009e-06, + "loss": 0.2577, + "step": 25753 + }, + { + "epoch": 1.1493217580032555, + "grad_norm": 5.189429759979248, + "learning_rate": 6.573523365766754e-06, + "loss": 0.2849, + "step": 25754 + }, + { + "epoch": 1.1493353228431904, + "grad_norm": 4.880134582519531, + "learning_rate": 6.573386323146499e-06, + "loss": 0.3171, + "step": 25755 + }, + { + "epoch": 1.1493488876831253, + "grad_norm": 5.059848785400391, + "learning_rate": 6.573249280526244e-06, + "loss": 0.3359, + "step": 25756 + }, + { + "epoch": 1.1493624525230604, + "grad_norm": 5.902370929718018, + "learning_rate": 6.57311223790599e-06, + "loss": 0.3104, + "step": 25757 + }, + { + "epoch": 1.1493760173629952, + "grad_norm": 4.057662010192871, + "learning_rate": 6.572975195285734e-06, + "loss": 0.2498, + "step": 25758 + }, + { + "epoch": 1.14938958220293, + "grad_norm": 5.61065673828125, + "learning_rate": 6.572838152665479e-06, + "loss": 0.3376, + "step": 25759 + }, + { + "epoch": 1.149403147042865, + "grad_norm": 5.182252407073975, + "learning_rate": 6.5727011100452246e-06, + "loss": 0.3894, + "step": 25760 + }, + { + "epoch": 1.1494167118827998, + "grad_norm": 5.323181629180908, + "learning_rate": 6.572564067424971e-06, + "loss": 0.2779, + "step": 25761 + }, + { + "epoch": 1.1494302767227347, + "grad_norm": 3.8974764347076416, + "learning_rate": 6.572427024804715e-06, + "loss": 0.2802, + "step": 25762 + }, + { + "epoch": 1.1494438415626695, + "grad_norm": 3.708829164505005, + "learning_rate": 6.57228998218446e-06, + "loss": 0.2704, + "step": 25763 + }, + { + "epoch": 1.1494574064026044, + "grad_norm": 4.545558452606201, + "learning_rate": 6.572152939564204e-06, + "loss": 0.2635, + "step": 25764 + }, + { + "epoch": 1.1494709712425393, + "grad_norm": 4.098285675048828, + "learning_rate": 6.57201589694395e-06, + "loss": 0.3077, + "step": 25765 + }, + { + "epoch": 1.1494845360824741, + "grad_norm": 4.84636926651001, + "learning_rate": 6.571878854323696e-06, + "loss": 0.2217, + "step": 25766 + }, + { + "epoch": 1.149498100922409, + "grad_norm": 4.120345592498779, + "learning_rate": 6.571741811703441e-06, + "loss": 0.2811, + "step": 25767 + }, + { + "epoch": 1.149511665762344, + "grad_norm": 5.216465950012207, + "learning_rate": 6.571604769083185e-06, + "loss": 0.2773, + "step": 25768 + }, + { + "epoch": 1.149525230602279, + "grad_norm": 4.826095104217529, + "learning_rate": 6.57146772646293e-06, + "loss": 0.3012, + "step": 25769 + }, + { + "epoch": 1.1495387954422138, + "grad_norm": 4.746626853942871, + "learning_rate": 6.571330683842676e-06, + "loss": 0.2821, + "step": 25770 + }, + { + "epoch": 1.1495523602821487, + "grad_norm": 4.5989813804626465, + "learning_rate": 6.571193641222421e-06, + "loss": 0.2476, + "step": 25771 + }, + { + "epoch": 1.1495659251220836, + "grad_norm": 5.010382652282715, + "learning_rate": 6.571056598602166e-06, + "loss": 0.3179, + "step": 25772 + }, + { + "epoch": 1.1495794899620184, + "grad_norm": 6.719583988189697, + "learning_rate": 6.57091955598191e-06, + "loss": 0.2963, + "step": 25773 + }, + { + "epoch": 1.1495930548019533, + "grad_norm": 4.901687145233154, + "learning_rate": 6.570782513361656e-06, + "loss": 0.2363, + "step": 25774 + }, + { + "epoch": 1.1496066196418881, + "grad_norm": 5.482527732849121, + "learning_rate": 6.570645470741401e-06, + "loss": 0.2674, + "step": 25775 + }, + { + "epoch": 1.1496201844818232, + "grad_norm": 5.0204315185546875, + "learning_rate": 6.5705084281211466e-06, + "loss": 0.2599, + "step": 25776 + }, + { + "epoch": 1.149633749321758, + "grad_norm": 5.1776652336120605, + "learning_rate": 6.570371385500891e-06, + "loss": 0.2496, + "step": 25777 + }, + { + "epoch": 1.149647314161693, + "grad_norm": 3.9309580326080322, + "learning_rate": 6.570234342880636e-06, + "loss": 0.191, + "step": 25778 + }, + { + "epoch": 1.1496608790016278, + "grad_norm": 5.152295112609863, + "learning_rate": 6.570097300260382e-06, + "loss": 0.2599, + "step": 25779 + }, + { + "epoch": 1.1496744438415627, + "grad_norm": 5.465709686279297, + "learning_rate": 6.5699602576401264e-06, + "loss": 0.244, + "step": 25780 + }, + { + "epoch": 1.1496880086814976, + "grad_norm": 3.554810047149658, + "learning_rate": 6.569823215019872e-06, + "loss": 0.1541, + "step": 25781 + }, + { + "epoch": 1.1497015735214324, + "grad_norm": 3.4011878967285156, + "learning_rate": 6.569686172399616e-06, + "loss": 0.1876, + "step": 25782 + }, + { + "epoch": 1.1497151383613673, + "grad_norm": 3.7776873111724854, + "learning_rate": 6.569549129779362e-06, + "loss": 0.1432, + "step": 25783 + }, + { + "epoch": 1.1497287032013022, + "grad_norm": 5.587017059326172, + "learning_rate": 6.569412087159107e-06, + "loss": 0.2185, + "step": 25784 + }, + { + "epoch": 1.149742268041237, + "grad_norm": 4.279048919677734, + "learning_rate": 6.569275044538852e-06, + "loss": 0.1802, + "step": 25785 + }, + { + "epoch": 1.149755832881172, + "grad_norm": 3.1735246181488037, + "learning_rate": 6.569138001918597e-06, + "loss": 0.1593, + "step": 25786 + }, + { + "epoch": 1.149769397721107, + "grad_norm": 4.249752044677734, + "learning_rate": 6.569000959298343e-06, + "loss": 0.1453, + "step": 25787 + }, + { + "epoch": 1.1497829625610418, + "grad_norm": 4.064919948577881, + "learning_rate": 6.568863916678088e-06, + "loss": 0.2642, + "step": 25788 + }, + { + "epoch": 1.1497965274009767, + "grad_norm": 4.938774585723877, + "learning_rate": 6.568726874057832e-06, + "loss": 0.2688, + "step": 25789 + }, + { + "epoch": 1.1498100922409116, + "grad_norm": 4.712850570678711, + "learning_rate": 6.568589831437577e-06, + "loss": 0.2015, + "step": 25790 + }, + { + "epoch": 1.1498236570808464, + "grad_norm": 4.613124847412109, + "learning_rate": 6.5684527888173226e-06, + "loss": 0.2879, + "step": 25791 + }, + { + "epoch": 1.1498372219207813, + "grad_norm": 7.174963474273682, + "learning_rate": 6.568315746197068e-06, + "loss": 0.2477, + "step": 25792 + }, + { + "epoch": 1.1498507867607162, + "grad_norm": 3.5005621910095215, + "learning_rate": 6.568178703576813e-06, + "loss": 0.1168, + "step": 25793 + }, + { + "epoch": 1.149864351600651, + "grad_norm": 4.561934947967529, + "learning_rate": 6.568041660956558e-06, + "loss": 0.2634, + "step": 25794 + }, + { + "epoch": 1.1498779164405861, + "grad_norm": 6.1538777351379395, + "learning_rate": 6.5679046183363024e-06, + "loss": 0.2224, + "step": 25795 + }, + { + "epoch": 1.149891481280521, + "grad_norm": 6.58358907699585, + "learning_rate": 6.5677675757160485e-06, + "loss": 0.2807, + "step": 25796 + }, + { + "epoch": 1.1499050461204559, + "grad_norm": 5.454270839691162, + "learning_rate": 6.567630533095794e-06, + "loss": 0.2489, + "step": 25797 + }, + { + "epoch": 1.1499186109603907, + "grad_norm": 3.955589771270752, + "learning_rate": 6.567493490475538e-06, + "loss": 0.1398, + "step": 25798 + }, + { + "epoch": 1.1499321758003256, + "grad_norm": 3.7039616107940674, + "learning_rate": 6.567356447855283e-06, + "loss": 0.1358, + "step": 25799 + }, + { + "epoch": 1.1499457406402604, + "grad_norm": 5.166628837585449, + "learning_rate": 6.567219405235029e-06, + "loss": 0.2217, + "step": 25800 + }, + { + "epoch": 1.1499593054801953, + "grad_norm": 4.3642706871032715, + "learning_rate": 6.567082362614774e-06, + "loss": 0.2155, + "step": 25801 + }, + { + "epoch": 1.1499728703201302, + "grad_norm": 4.0601396560668945, + "learning_rate": 6.566945319994519e-06, + "loss": 0.1508, + "step": 25802 + }, + { + "epoch": 1.1499728703201302, + "eval_loss": 0.32914718985557556, + "eval_noise_accuracy": NaN, + "eval_runtime": 4511.3255, + "eval_samples_per_second": 1.114, + "eval_steps_per_second": 0.07, + "eval_wer": 27.570974019956136, + "step": 25802 + }, + { + "epoch": 1.149986435160065, + "grad_norm": 5.902431011199951, + "learning_rate": 6.566808277374264e-06, + "loss": 0.3181, + "step": 25803 + }, + { + "epoch": 1.15, + "grad_norm": 4.422949314117432, + "learning_rate": 6.56667123475401e-06, + "loss": 0.2334, + "step": 25804 + }, + { + "epoch": 1.1500135648399348, + "grad_norm": 3.828413724899292, + "learning_rate": 6.566534192133754e-06, + "loss": 0.2666, + "step": 25805 + }, + { + "epoch": 1.1500271296798699, + "grad_norm": 3.9150538444519043, + "learning_rate": 6.566397149513499e-06, + "loss": 0.2023, + "step": 25806 + }, + { + "epoch": 1.1500406945198047, + "grad_norm": 6.451657772064209, + "learning_rate": 6.566260106893244e-06, + "loss": 0.3827, + "step": 25807 + }, + { + "epoch": 1.1500542593597396, + "grad_norm": 5.968225955963135, + "learning_rate": 6.566123064272989e-06, + "loss": 0.2674, + "step": 25808 + }, + { + "epoch": 1.1500678241996745, + "grad_norm": 4.363883972167969, + "learning_rate": 6.565986021652735e-06, + "loss": 0.167, + "step": 25809 + }, + { + "epoch": 1.1500813890396093, + "grad_norm": 3.728670358657837, + "learning_rate": 6.56584897903248e-06, + "loss": 0.1994, + "step": 25810 + }, + { + "epoch": 1.1500949538795442, + "grad_norm": 5.923831462860107, + "learning_rate": 6.5657119364122244e-06, + "loss": 0.3444, + "step": 25811 + }, + { + "epoch": 1.150108518719479, + "grad_norm": 7.592695236206055, + "learning_rate": 6.56557489379197e-06, + "loss": 0.2736, + "step": 25812 + }, + { + "epoch": 1.150122083559414, + "grad_norm": 7.411911487579346, + "learning_rate": 6.565437851171716e-06, + "loss": 0.4018, + "step": 25813 + }, + { + "epoch": 1.150135648399349, + "grad_norm": 5.134282112121582, + "learning_rate": 6.56530080855146e-06, + "loss": 0.3153, + "step": 25814 + }, + { + "epoch": 1.1501492132392839, + "grad_norm": 5.410610675811768, + "learning_rate": 6.565163765931205e-06, + "loss": 0.2248, + "step": 25815 + }, + { + "epoch": 1.1501627780792187, + "grad_norm": 5.427781581878662, + "learning_rate": 6.56502672331095e-06, + "loss": 0.3071, + "step": 25816 + }, + { + "epoch": 1.1501763429191536, + "grad_norm": 4.442541599273682, + "learning_rate": 6.5648896806906955e-06, + "loss": 0.241, + "step": 25817 + }, + { + "epoch": 1.1501899077590885, + "grad_norm": 6.00295877456665, + "learning_rate": 6.564752638070441e-06, + "loss": 0.3705, + "step": 25818 + }, + { + "epoch": 1.1502034725990233, + "grad_norm": 6.736677646636963, + "learning_rate": 6.564615595450186e-06, + "loss": 0.3778, + "step": 25819 + }, + { + "epoch": 1.1502170374389582, + "grad_norm": 6.245397090911865, + "learning_rate": 6.56447855282993e-06, + "loss": 0.2983, + "step": 25820 + }, + { + "epoch": 1.150230602278893, + "grad_norm": 4.906100749969482, + "learning_rate": 6.564341510209675e-06, + "loss": 0.3051, + "step": 25821 + }, + { + "epoch": 1.150244167118828, + "grad_norm": 5.172938823699951, + "learning_rate": 6.564204467589421e-06, + "loss": 0.346, + "step": 25822 + }, + { + "epoch": 1.1502577319587628, + "grad_norm": 4.45584774017334, + "learning_rate": 6.564067424969166e-06, + "loss": 0.2749, + "step": 25823 + }, + { + "epoch": 1.1502712967986977, + "grad_norm": 4.489532470703125, + "learning_rate": 6.563930382348911e-06, + "loss": 0.2062, + "step": 25824 + }, + { + "epoch": 1.1502848616386327, + "grad_norm": 5.654685020446777, + "learning_rate": 6.563793339728656e-06, + "loss": 0.2748, + "step": 25825 + }, + { + "epoch": 1.1502984264785676, + "grad_norm": 6.644068717956543, + "learning_rate": 6.563656297108401e-06, + "loss": 0.22, + "step": 25826 + }, + { + "epoch": 1.1503119913185025, + "grad_norm": 5.210389614105225, + "learning_rate": 6.5635192544881465e-06, + "loss": 0.2611, + "step": 25827 + }, + { + "epoch": 1.1503255561584373, + "grad_norm": 7.841374397277832, + "learning_rate": 6.563382211867892e-06, + "loss": 0.355, + "step": 25828 + }, + { + "epoch": 1.1503391209983722, + "grad_norm": 5.468071937561035, + "learning_rate": 6.563245169247636e-06, + "loss": 0.2766, + "step": 25829 + }, + { + "epoch": 1.150352685838307, + "grad_norm": 5.066667556762695, + "learning_rate": 6.563108126627382e-06, + "loss": 0.2291, + "step": 25830 + }, + { + "epoch": 1.150366250678242, + "grad_norm": 6.10299825668335, + "learning_rate": 6.562971084007127e-06, + "loss": 0.3191, + "step": 25831 + }, + { + "epoch": 1.1503798155181768, + "grad_norm": 4.786370277404785, + "learning_rate": 6.5628340413868715e-06, + "loss": 0.2716, + "step": 25832 + }, + { + "epoch": 1.150393380358112, + "grad_norm": 4.819178104400635, + "learning_rate": 6.562696998766617e-06, + "loss": 0.3852, + "step": 25833 + }, + { + "epoch": 1.1504069451980468, + "grad_norm": 5.704005241394043, + "learning_rate": 6.562559956146362e-06, + "loss": 0.3184, + "step": 25834 + }, + { + "epoch": 1.1504205100379816, + "grad_norm": 4.716818332672119, + "learning_rate": 6.562422913526108e-06, + "loss": 0.2244, + "step": 25835 + }, + { + "epoch": 1.1504340748779165, + "grad_norm": 6.196312427520752, + "learning_rate": 6.562285870905852e-06, + "loss": 0.3825, + "step": 25836 + }, + { + "epoch": 1.1504476397178514, + "grad_norm": 5.952178955078125, + "learning_rate": 6.562148828285597e-06, + "loss": 0.2688, + "step": 25837 + }, + { + "epoch": 1.1504612045577862, + "grad_norm": 4.567937850952148, + "learning_rate": 6.562011785665342e-06, + "loss": 0.2272, + "step": 25838 + }, + { + "epoch": 1.150474769397721, + "grad_norm": 5.630686283111572, + "learning_rate": 6.561874743045088e-06, + "loss": 0.3962, + "step": 25839 + }, + { + "epoch": 1.150488334237656, + "grad_norm": 5.371030807495117, + "learning_rate": 6.561737700424833e-06, + "loss": 0.2927, + "step": 25840 + }, + { + "epoch": 1.1505018990775908, + "grad_norm": 5.286807537078857, + "learning_rate": 6.561600657804577e-06, + "loss": 0.2123, + "step": 25841 + }, + { + "epoch": 1.1505154639175257, + "grad_norm": 6.21585750579834, + "learning_rate": 6.5614636151843224e-06, + "loss": 0.4003, + "step": 25842 + }, + { + "epoch": 1.1505290287574605, + "grad_norm": 4.540611743927002, + "learning_rate": 6.5613265725640685e-06, + "loss": 0.2644, + "step": 25843 + }, + { + "epoch": 1.1505425935973956, + "grad_norm": 5.548040390014648, + "learning_rate": 6.561189529943814e-06, + "loss": 0.3519, + "step": 25844 + }, + { + "epoch": 1.1505561584373305, + "grad_norm": 5.331971168518066, + "learning_rate": 6.561052487323558e-06, + "loss": 0.3425, + "step": 25845 + }, + { + "epoch": 1.1505697232772654, + "grad_norm": 4.946269989013672, + "learning_rate": 6.560915444703303e-06, + "loss": 0.2546, + "step": 25846 + }, + { + "epoch": 1.1505832881172002, + "grad_norm": 5.326456069946289, + "learning_rate": 6.5607784020830475e-06, + "loss": 0.2457, + "step": 25847 + }, + { + "epoch": 1.150596852957135, + "grad_norm": 4.59672737121582, + "learning_rate": 6.5606413594627935e-06, + "loss": 0.2667, + "step": 25848 + }, + { + "epoch": 1.15061041779707, + "grad_norm": 4.7671403884887695, + "learning_rate": 6.560504316842539e-06, + "loss": 0.2783, + "step": 25849 + }, + { + "epoch": 1.1506239826370048, + "grad_norm": 3.89325213432312, + "learning_rate": 6.560367274222284e-06, + "loss": 0.2377, + "step": 25850 + }, + { + "epoch": 1.1506375474769397, + "grad_norm": 5.033785343170166, + "learning_rate": 6.560230231602028e-06, + "loss": 0.2777, + "step": 25851 + }, + { + "epoch": 1.1506511123168748, + "grad_norm": 5.946422576904297, + "learning_rate": 6.560093188981774e-06, + "loss": 0.4128, + "step": 25852 + }, + { + "epoch": 1.1506646771568096, + "grad_norm": 4.771668910980225, + "learning_rate": 6.559956146361519e-06, + "loss": 0.1576, + "step": 25853 + }, + { + "epoch": 1.1506782419967445, + "grad_norm": 4.920942783355713, + "learning_rate": 6.559819103741264e-06, + "loss": 0.3001, + "step": 25854 + }, + { + "epoch": 1.1506918068366794, + "grad_norm": 5.093942642211914, + "learning_rate": 6.559682061121009e-06, + "loss": 0.2923, + "step": 25855 + }, + { + "epoch": 1.1507053716766142, + "grad_norm": 5.060456275939941, + "learning_rate": 6.559545018500755e-06, + "loss": 0.2724, + "step": 25856 + }, + { + "epoch": 1.150718936516549, + "grad_norm": 5.345669746398926, + "learning_rate": 6.559407975880499e-06, + "loss": 0.4429, + "step": 25857 + }, + { + "epoch": 1.150732501356484, + "grad_norm": 4.726004600524902, + "learning_rate": 6.5592709332602445e-06, + "loss": 0.2059, + "step": 25858 + }, + { + "epoch": 1.1507460661964188, + "grad_norm": 5.287384033203125, + "learning_rate": 6.55913389063999e-06, + "loss": 0.2664, + "step": 25859 + }, + { + "epoch": 1.1507596310363537, + "grad_norm": 5.185850143432617, + "learning_rate": 6.558996848019734e-06, + "loss": 0.2891, + "step": 25860 + }, + { + "epoch": 1.1507731958762886, + "grad_norm": 5.323157787322998, + "learning_rate": 6.55885980539948e-06, + "loss": 0.2982, + "step": 25861 + }, + { + "epoch": 1.1507867607162234, + "grad_norm": 6.242863178253174, + "learning_rate": 6.558722762779225e-06, + "loss": 0.3056, + "step": 25862 + }, + { + "epoch": 1.1508003255561585, + "grad_norm": 5.979129791259766, + "learning_rate": 6.5585857201589695e-06, + "loss": 0.2999, + "step": 25863 + }, + { + "epoch": 1.1508138903960934, + "grad_norm": 4.754273891448975, + "learning_rate": 6.558448677538715e-06, + "loss": 0.208, + "step": 25864 + }, + { + "epoch": 1.1508274552360283, + "grad_norm": 5.485204696655273, + "learning_rate": 6.558311634918461e-06, + "loss": 0.2342, + "step": 25865 + }, + { + "epoch": 1.1508410200759631, + "grad_norm": 6.404162883758545, + "learning_rate": 6.558174592298205e-06, + "loss": 0.3244, + "step": 25866 + }, + { + "epoch": 1.150854584915898, + "grad_norm": 6.392198085784912, + "learning_rate": 6.55803754967795e-06, + "loss": 0.27, + "step": 25867 + }, + { + "epoch": 1.1508681497558328, + "grad_norm": 6.235219478607178, + "learning_rate": 6.557900507057695e-06, + "loss": 0.3219, + "step": 25868 + }, + { + "epoch": 1.1508817145957677, + "grad_norm": 5.70378303527832, + "learning_rate": 6.5577634644374414e-06, + "loss": 0.287, + "step": 25869 + }, + { + "epoch": 1.1508952794357026, + "grad_norm": 5.2047648429870605, + "learning_rate": 6.557626421817186e-06, + "loss": 0.2318, + "step": 25870 + }, + { + "epoch": 1.1509088442756377, + "grad_norm": 5.107532501220703, + "learning_rate": 6.557489379196931e-06, + "loss": 0.3085, + "step": 25871 + }, + { + "epoch": 1.1509224091155725, + "grad_norm": 6.178940296173096, + "learning_rate": 6.557352336576675e-06, + "loss": 0.3395, + "step": 25872 + }, + { + "epoch": 1.1509359739555074, + "grad_norm": 5.231990814208984, + "learning_rate": 6.557215293956421e-06, + "loss": 0.2314, + "step": 25873 + }, + { + "epoch": 1.1509495387954423, + "grad_norm": 6.323351860046387, + "learning_rate": 6.5570782513361665e-06, + "loss": 0.3612, + "step": 25874 + }, + { + "epoch": 1.1509631036353771, + "grad_norm": 6.467837333679199, + "learning_rate": 6.556941208715911e-06, + "loss": 0.269, + "step": 25875 + }, + { + "epoch": 1.150976668475312, + "grad_norm": 4.64498233795166, + "learning_rate": 6.556804166095656e-06, + "loss": 0.259, + "step": 25876 + }, + { + "epoch": 1.1509902333152469, + "grad_norm": 4.038918972015381, + "learning_rate": 6.556667123475401e-06, + "loss": 0.2249, + "step": 25877 + }, + { + "epoch": 1.1510037981551817, + "grad_norm": 4.822621822357178, + "learning_rate": 6.556530080855147e-06, + "loss": 0.2798, + "step": 25878 + }, + { + "epoch": 1.1510173629951166, + "grad_norm": 6.8420186042785645, + "learning_rate": 6.5563930382348915e-06, + "loss": 0.33, + "step": 25879 + }, + { + "epoch": 1.1510309278350515, + "grad_norm": 5.194234848022461, + "learning_rate": 6.556255995614637e-06, + "loss": 0.2811, + "step": 25880 + }, + { + "epoch": 1.1510444926749863, + "grad_norm": 5.9866814613342285, + "learning_rate": 6.556118952994381e-06, + "loss": 0.2748, + "step": 25881 + }, + { + "epoch": 1.1510580575149214, + "grad_norm": 4.781889915466309, + "learning_rate": 6.555981910374127e-06, + "loss": 0.3487, + "step": 25882 + }, + { + "epoch": 1.1510716223548563, + "grad_norm": 6.014090061187744, + "learning_rate": 6.555844867753872e-06, + "loss": 0.3829, + "step": 25883 + }, + { + "epoch": 1.1510851871947911, + "grad_norm": 4.860822677612305, + "learning_rate": 6.555707825133617e-06, + "loss": 0.2187, + "step": 25884 + }, + { + "epoch": 1.151098752034726, + "grad_norm": 6.238775253295898, + "learning_rate": 6.555570782513362e-06, + "loss": 0.3129, + "step": 25885 + }, + { + "epoch": 1.1511123168746609, + "grad_norm": 5.099481105804443, + "learning_rate": 6.555433739893108e-06, + "loss": 0.3537, + "step": 25886 + }, + { + "epoch": 1.1511258817145957, + "grad_norm": 4.251713275909424, + "learning_rate": 6.555296697272853e-06, + "loss": 0.2415, + "step": 25887 + }, + { + "epoch": 1.1511394465545306, + "grad_norm": 6.462833881378174, + "learning_rate": 6.555159654652597e-06, + "loss": 0.3658, + "step": 25888 + }, + { + "epoch": 1.1511530113944655, + "grad_norm": 4.200446605682373, + "learning_rate": 6.5550226120323425e-06, + "loss": 0.2272, + "step": 25889 + }, + { + "epoch": 1.1511665762344006, + "grad_norm": 5.495953559875488, + "learning_rate": 6.554885569412087e-06, + "loss": 0.2969, + "step": 25890 + }, + { + "epoch": 1.1511801410743354, + "grad_norm": 6.160946369171143, + "learning_rate": 6.554748526791833e-06, + "loss": 0.2818, + "step": 25891 + }, + { + "epoch": 1.1511937059142703, + "grad_norm": 6.230053424835205, + "learning_rate": 6.554611484171578e-06, + "loss": 0.3371, + "step": 25892 + }, + { + "epoch": 1.1512072707542051, + "grad_norm": 5.383190155029297, + "learning_rate": 6.554474441551323e-06, + "loss": 0.3728, + "step": 25893 + }, + { + "epoch": 1.15122083559414, + "grad_norm": 7.99173641204834, + "learning_rate": 6.5543373989310675e-06, + "loss": 0.4167, + "step": 25894 + }, + { + "epoch": 1.1512344004340749, + "grad_norm": 4.837211608886719, + "learning_rate": 6.5542003563108135e-06, + "loss": 0.3064, + "step": 25895 + }, + { + "epoch": 1.1512479652740097, + "grad_norm": 8.27330207824707, + "learning_rate": 6.554063313690559e-06, + "loss": 0.4157, + "step": 25896 + }, + { + "epoch": 1.1512615301139446, + "grad_norm": 6.155192852020264, + "learning_rate": 6.553926271070303e-06, + "loss": 0.3079, + "step": 25897 + }, + { + "epoch": 1.1512750949538795, + "grad_norm": 6.524391174316406, + "learning_rate": 6.553789228450048e-06, + "loss": 0.497, + "step": 25898 + }, + { + "epoch": 1.1512886597938143, + "grad_norm": 6.625096321105957, + "learning_rate": 6.553652185829794e-06, + "loss": 0.3234, + "step": 25899 + }, + { + "epoch": 1.1513022246337492, + "grad_norm": 5.5676045417785645, + "learning_rate": 6.553515143209539e-06, + "loss": 0.1975, + "step": 25900 + }, + { + "epoch": 1.1513157894736843, + "grad_norm": 5.701913356781006, + "learning_rate": 6.553378100589284e-06, + "loss": 0.2586, + "step": 25901 + }, + { + "epoch": 1.1513293543136192, + "grad_norm": 5.856013774871826, + "learning_rate": 6.553241057969029e-06, + "loss": 0.3278, + "step": 25902 + }, + { + "epoch": 1.151342919153554, + "grad_norm": 5.225344181060791, + "learning_rate": 6.553104015348773e-06, + "loss": 0.3119, + "step": 25903 + }, + { + "epoch": 1.1513564839934889, + "grad_norm": 5.909900188446045, + "learning_rate": 6.552966972728519e-06, + "loss": 0.2749, + "step": 25904 + }, + { + "epoch": 1.1513700488334238, + "grad_norm": 5.130502223968506, + "learning_rate": 6.5528299301082645e-06, + "loss": 0.3423, + "step": 25905 + }, + { + "epoch": 1.1513836136733586, + "grad_norm": 4.725004196166992, + "learning_rate": 6.552692887488009e-06, + "loss": 0.318, + "step": 25906 + }, + { + "epoch": 1.1513971785132935, + "grad_norm": 5.811277866363525, + "learning_rate": 6.552555844867754e-06, + "loss": 0.3685, + "step": 25907 + }, + { + "epoch": 1.1514107433532283, + "grad_norm": 7.322560787200928, + "learning_rate": 6.5524188022475e-06, + "loss": 0.3116, + "step": 25908 + }, + { + "epoch": 1.1514243081931634, + "grad_norm": 5.116541862487793, + "learning_rate": 6.552281759627245e-06, + "loss": 0.3257, + "step": 25909 + }, + { + "epoch": 1.1514378730330983, + "grad_norm": 5.703474998474121, + "learning_rate": 6.5521447170069895e-06, + "loss": 0.2944, + "step": 25910 + }, + { + "epoch": 1.1514514378730332, + "grad_norm": 7.048820495605469, + "learning_rate": 6.552007674386735e-06, + "loss": 0.4064, + "step": 25911 + }, + { + "epoch": 1.151465002712968, + "grad_norm": 5.704469203948975, + "learning_rate": 6.551870631766481e-06, + "loss": 0.3506, + "step": 25912 + }, + { + "epoch": 1.151478567552903, + "grad_norm": 4.824703693389893, + "learning_rate": 6.551733589146225e-06, + "loss": 0.3151, + "step": 25913 + }, + { + "epoch": 1.1514921323928378, + "grad_norm": 5.117771148681641, + "learning_rate": 6.55159654652597e-06, + "loss": 0.3082, + "step": 25914 + }, + { + "epoch": 1.1515056972327726, + "grad_norm": 6.3084588050842285, + "learning_rate": 6.5514595039057146e-06, + "loss": 0.4038, + "step": 25915 + }, + { + "epoch": 1.1515192620727075, + "grad_norm": 5.743851184844971, + "learning_rate": 6.55132246128546e-06, + "loss": 0.3187, + "step": 25916 + }, + { + "epoch": 1.1515328269126424, + "grad_norm": 7.785281181335449, + "learning_rate": 6.551185418665206e-06, + "loss": 0.4417, + "step": 25917 + }, + { + "epoch": 1.1515463917525772, + "grad_norm": 5.329102039337158, + "learning_rate": 6.551048376044951e-06, + "loss": 0.2918, + "step": 25918 + }, + { + "epoch": 1.151559956592512, + "grad_norm": 6.113713264465332, + "learning_rate": 6.550911333424695e-06, + "loss": 0.3219, + "step": 25919 + }, + { + "epoch": 1.1515735214324472, + "grad_norm": 8.81824016571045, + "learning_rate": 6.5507742908044405e-06, + "loss": 0.444, + "step": 25920 + }, + { + "epoch": 1.151587086272382, + "grad_norm": 7.178874492645264, + "learning_rate": 6.5506372481841865e-06, + "loss": 0.4944, + "step": 25921 + }, + { + "epoch": 1.151600651112317, + "grad_norm": 4.733146667480469, + "learning_rate": 6.550500205563931e-06, + "loss": 0.2904, + "step": 25922 + }, + { + "epoch": 1.1516142159522518, + "grad_norm": 6.441807270050049, + "learning_rate": 6.550363162943676e-06, + "loss": 0.3943, + "step": 25923 + }, + { + "epoch": 1.1516277807921866, + "grad_norm": 3.8419349193573, + "learning_rate": 6.55022612032342e-06, + "loss": 0.2191, + "step": 25924 + }, + { + "epoch": 1.1516413456321215, + "grad_norm": 5.596087455749512, + "learning_rate": 6.550089077703166e-06, + "loss": 0.3378, + "step": 25925 + }, + { + "epoch": 1.1516549104720564, + "grad_norm": 5.648448944091797, + "learning_rate": 6.5499520350829115e-06, + "loss": 0.3625, + "step": 25926 + }, + { + "epoch": 1.1516684753119912, + "grad_norm": 4.382352352142334, + "learning_rate": 6.549814992462657e-06, + "loss": 0.2088, + "step": 25927 + }, + { + "epoch": 1.1516820401519263, + "grad_norm": 4.22632360458374, + "learning_rate": 6.549677949842401e-06, + "loss": 0.2553, + "step": 25928 + }, + { + "epoch": 1.1516956049918612, + "grad_norm": 4.885663986206055, + "learning_rate": 6.549540907222146e-06, + "loss": 0.2625, + "step": 25929 + }, + { + "epoch": 1.151709169831796, + "grad_norm": 3.7226791381835938, + "learning_rate": 6.549403864601892e-06, + "loss": 0.2366, + "step": 25930 + }, + { + "epoch": 1.151722734671731, + "grad_norm": 3.6701698303222656, + "learning_rate": 6.549266821981637e-06, + "loss": 0.1907, + "step": 25931 + }, + { + "epoch": 1.1517362995116658, + "grad_norm": 4.94935417175293, + "learning_rate": 6.549129779361382e-06, + "loss": 0.2843, + "step": 25932 + }, + { + "epoch": 1.1517498643516006, + "grad_norm": 5.046298980712891, + "learning_rate": 6.548992736741127e-06, + "loss": 0.2772, + "step": 25933 + }, + { + "epoch": 1.1517634291915355, + "grad_norm": 6.449023723602295, + "learning_rate": 6.548855694120872e-06, + "loss": 0.4689, + "step": 25934 + }, + { + "epoch": 1.1517769940314704, + "grad_norm": 3.793231248855591, + "learning_rate": 6.548718651500617e-06, + "loss": 0.2507, + "step": 25935 + }, + { + "epoch": 1.1517905588714052, + "grad_norm": 7.445249080657959, + "learning_rate": 6.5485816088803625e-06, + "loss": 0.3162, + "step": 25936 + }, + { + "epoch": 1.15180412371134, + "grad_norm": 4.561219692230225, + "learning_rate": 6.548444566260107e-06, + "loss": 0.2411, + "step": 25937 + }, + { + "epoch": 1.151817688551275, + "grad_norm": 5.693686485290527, + "learning_rate": 6.548307523639853e-06, + "loss": 0.3641, + "step": 25938 + }, + { + "epoch": 1.15183125339121, + "grad_norm": 4.841677188873291, + "learning_rate": 6.548170481019598e-06, + "loss": 0.2542, + "step": 25939 + }, + { + "epoch": 1.151844818231145, + "grad_norm": 3.520873785018921, + "learning_rate": 6.548033438399342e-06, + "loss": 0.2211, + "step": 25940 + }, + { + "epoch": 1.1518583830710798, + "grad_norm": 5.319004058837891, + "learning_rate": 6.5478963957790875e-06, + "loss": 0.3134, + "step": 25941 + }, + { + "epoch": 1.1518719479110147, + "grad_norm": 5.91429328918457, + "learning_rate": 6.5477593531588336e-06, + "loss": 0.3178, + "step": 25942 + }, + { + "epoch": 1.1518855127509495, + "grad_norm": 4.812812328338623, + "learning_rate": 6.547622310538579e-06, + "loss": 0.241, + "step": 25943 + }, + { + "epoch": 1.1518990775908844, + "grad_norm": 5.669105529785156, + "learning_rate": 6.547485267918323e-06, + "loss": 0.3127, + "step": 25944 + }, + { + "epoch": 1.1519126424308193, + "grad_norm": 5.742019176483154, + "learning_rate": 6.547348225298068e-06, + "loss": 0.2593, + "step": 25945 + }, + { + "epoch": 1.1519262072707541, + "grad_norm": 5.371435642242432, + "learning_rate": 6.547211182677813e-06, + "loss": 0.2176, + "step": 25946 + }, + { + "epoch": 1.1519397721106892, + "grad_norm": 4.519861221313477, + "learning_rate": 6.547074140057559e-06, + "loss": 0.2824, + "step": 25947 + }, + { + "epoch": 1.151953336950624, + "grad_norm": 6.040852069854736, + "learning_rate": 6.546937097437304e-06, + "loss": 0.2609, + "step": 25948 + }, + { + "epoch": 1.151966901790559, + "grad_norm": 5.309315204620361, + "learning_rate": 6.546800054817048e-06, + "loss": 0.2536, + "step": 25949 + }, + { + "epoch": 1.1519804666304938, + "grad_norm": 5.818056583404541, + "learning_rate": 6.546663012196793e-06, + "loss": 0.3068, + "step": 25950 + }, + { + "epoch": 1.1519940314704287, + "grad_norm": 4.987868309020996, + "learning_rate": 6.546525969576539e-06, + "loss": 0.2721, + "step": 25951 + }, + { + "epoch": 1.1520075963103635, + "grad_norm": 4.1876726150512695, + "learning_rate": 6.5463889269562845e-06, + "loss": 0.2092, + "step": 25952 + }, + { + "epoch": 1.1520211611502984, + "grad_norm": 5.6885504722595215, + "learning_rate": 6.546251884336029e-06, + "loss": 0.2662, + "step": 25953 + }, + { + "epoch": 1.1520347259902333, + "grad_norm": 8.56063461303711, + "learning_rate": 6.546114841715774e-06, + "loss": 0.4085, + "step": 25954 + }, + { + "epoch": 1.1520482908301681, + "grad_norm": 5.822873115539551, + "learning_rate": 6.54597779909552e-06, + "loss": 0.3713, + "step": 25955 + }, + { + "epoch": 1.152061855670103, + "grad_norm": 6.093814373016357, + "learning_rate": 6.545840756475264e-06, + "loss": 0.1962, + "step": 25956 + }, + { + "epoch": 1.1520754205100379, + "grad_norm": 6.844738006591797, + "learning_rate": 6.5457037138550095e-06, + "loss": 0.3497, + "step": 25957 + }, + { + "epoch": 1.152088985349973, + "grad_norm": 6.412425518035889, + "learning_rate": 6.545566671234755e-06, + "loss": 0.3919, + "step": 25958 + }, + { + "epoch": 1.1521025501899078, + "grad_norm": 6.1966071128845215, + "learning_rate": 6.545429628614499e-06, + "loss": 0.2931, + "step": 25959 + }, + { + "epoch": 1.1521161150298427, + "grad_norm": 5.81603479385376, + "learning_rate": 6.545292585994245e-06, + "loss": 0.2666, + "step": 25960 + }, + { + "epoch": 1.1521296798697775, + "grad_norm": 5.191432476043701, + "learning_rate": 6.54515554337399e-06, + "loss": 0.3435, + "step": 25961 + }, + { + "epoch": 1.1521432447097124, + "grad_norm": 3.918250799179077, + "learning_rate": 6.545018500753735e-06, + "loss": 0.2952, + "step": 25962 + }, + { + "epoch": 1.1521568095496473, + "grad_norm": 7.701519012451172, + "learning_rate": 6.54488145813348e-06, + "loss": 0.2542, + "step": 25963 + }, + { + "epoch": 1.1521703743895821, + "grad_norm": 6.8389410972595215, + "learning_rate": 6.544744415513226e-06, + "loss": 0.4106, + "step": 25964 + }, + { + "epoch": 1.152183939229517, + "grad_norm": 7.542948246002197, + "learning_rate": 6.54460737289297e-06, + "loss": 0.5406, + "step": 25965 + }, + { + "epoch": 1.152197504069452, + "grad_norm": 5.041433811187744, + "learning_rate": 6.544470330272715e-06, + "loss": 0.2277, + "step": 25966 + }, + { + "epoch": 1.152211068909387, + "grad_norm": 8.868804931640625, + "learning_rate": 6.5443332876524605e-06, + "loss": 0.416, + "step": 25967 + }, + { + "epoch": 1.1522246337493218, + "grad_norm": 7.377587795257568, + "learning_rate": 6.544196245032206e-06, + "loss": 0.3864, + "step": 25968 + }, + { + "epoch": 1.1522381985892567, + "grad_norm": 4.0073771476745605, + "learning_rate": 6.544059202411951e-06, + "loss": 0.1505, + "step": 25969 + }, + { + "epoch": 1.1522517634291916, + "grad_norm": 7.29974365234375, + "learning_rate": 6.543922159791696e-06, + "loss": 0.4129, + "step": 25970 + }, + { + "epoch": 1.1522653282691264, + "grad_norm": 6.219442367553711, + "learning_rate": 6.54378511717144e-06, + "loss": 0.441, + "step": 25971 + }, + { + "epoch": 1.1522788931090613, + "grad_norm": 6.767536640167236, + "learning_rate": 6.5436480745511855e-06, + "loss": 0.2891, + "step": 25972 + }, + { + "epoch": 1.1522924579489962, + "grad_norm": 3.727979898452759, + "learning_rate": 6.5435110319309316e-06, + "loss": 0.1855, + "step": 25973 + }, + { + "epoch": 1.152306022788931, + "grad_norm": 6.229664325714111, + "learning_rate": 6.543373989310676e-06, + "loss": 0.234, + "step": 25974 + }, + { + "epoch": 1.1523195876288659, + "grad_norm": 5.635422229766846, + "learning_rate": 6.543236946690421e-06, + "loss": 0.2463, + "step": 25975 + }, + { + "epoch": 1.1523331524688007, + "grad_norm": 5.302980422973633, + "learning_rate": 6.543099904070166e-06, + "loss": 0.272, + "step": 25976 + }, + { + "epoch": 1.1523467173087358, + "grad_norm": 5.515359878540039, + "learning_rate": 6.542962861449912e-06, + "loss": 0.2436, + "step": 25977 + }, + { + "epoch": 1.1523602821486707, + "grad_norm": 4.091405868530273, + "learning_rate": 6.542825818829657e-06, + "loss": 0.191, + "step": 25978 + }, + { + "epoch": 1.1523738469886056, + "grad_norm": 5.376645565032959, + "learning_rate": 6.542688776209402e-06, + "loss": 0.2449, + "step": 25979 + }, + { + "epoch": 1.1523874118285404, + "grad_norm": 4.061131000518799, + "learning_rate": 6.542551733589146e-06, + "loss": 0.1997, + "step": 25980 + }, + { + "epoch": 1.1524009766684753, + "grad_norm": 6.859203338623047, + "learning_rate": 6.542414690968892e-06, + "loss": 0.3586, + "step": 25981 + }, + { + "epoch": 1.1524145415084102, + "grad_norm": 4.557778835296631, + "learning_rate": 6.542277648348637e-06, + "loss": 0.2632, + "step": 25982 + }, + { + "epoch": 1.152428106348345, + "grad_norm": 4.552834987640381, + "learning_rate": 6.542140605728382e-06, + "loss": 0.3275, + "step": 25983 + }, + { + "epoch": 1.15244167118828, + "grad_norm": 5.309340953826904, + "learning_rate": 6.542003563108127e-06, + "loss": 0.3035, + "step": 25984 + }, + { + "epoch": 1.152455236028215, + "grad_norm": 6.345302581787109, + "learning_rate": 6.541866520487872e-06, + "loss": 0.3146, + "step": 25985 + }, + { + "epoch": 1.1524688008681498, + "grad_norm": 8.163683891296387, + "learning_rate": 6.541729477867618e-06, + "loss": 0.3603, + "step": 25986 + }, + { + "epoch": 1.1524823657080847, + "grad_norm": 8.417510986328125, + "learning_rate": 6.541592435247362e-06, + "loss": 0.4587, + "step": 25987 + }, + { + "epoch": 1.1524959305480196, + "grad_norm": 4.089807987213135, + "learning_rate": 6.5414553926271076e-06, + "loss": 0.1785, + "step": 25988 + }, + { + "epoch": 1.1525094953879544, + "grad_norm": 5.9341535568237305, + "learning_rate": 6.541318350006852e-06, + "loss": 0.3519, + "step": 25989 + }, + { + "epoch": 1.1525230602278893, + "grad_norm": 6.273176670074463, + "learning_rate": 6.541181307386598e-06, + "loss": 0.2786, + "step": 25990 + }, + { + "epoch": 1.1525366250678242, + "grad_norm": 5.51975679397583, + "learning_rate": 6.541044264766343e-06, + "loss": 0.2955, + "step": 25991 + }, + { + "epoch": 1.152550189907759, + "grad_norm": 6.3820648193359375, + "learning_rate": 6.540907222146088e-06, + "loss": 0.3474, + "step": 25992 + }, + { + "epoch": 1.152563754747694, + "grad_norm": 4.83761739730835, + "learning_rate": 6.540770179525833e-06, + "loss": 0.2633, + "step": 25993 + }, + { + "epoch": 1.1525773195876288, + "grad_norm": 5.966338157653809, + "learning_rate": 6.540633136905579e-06, + "loss": 0.2582, + "step": 25994 + }, + { + "epoch": 1.1525908844275636, + "grad_norm": 5.084648609161377, + "learning_rate": 6.540496094285324e-06, + "loss": 0.3228, + "step": 25995 + }, + { + "epoch": 1.1526044492674987, + "grad_norm": 4.586278438568115, + "learning_rate": 6.540359051665068e-06, + "loss": 0.2784, + "step": 25996 + }, + { + "epoch": 1.1526180141074336, + "grad_norm": 5.3583197593688965, + "learning_rate": 6.540222009044813e-06, + "loss": 0.2826, + "step": 25997 + }, + { + "epoch": 1.1526315789473685, + "grad_norm": 4.576810359954834, + "learning_rate": 6.540084966424558e-06, + "loss": 0.1905, + "step": 25998 + }, + { + "epoch": 1.1526451437873033, + "grad_norm": 5.1383867263793945, + "learning_rate": 6.539947923804304e-06, + "loss": 0.3024, + "step": 25999 + }, + { + "epoch": 1.1526587086272382, + "grad_norm": 5.865884780883789, + "learning_rate": 6.539810881184049e-06, + "loss": 0.3215, + "step": 26000 + }, + { + "epoch": 1.152672273467173, + "grad_norm": 3.572700023651123, + "learning_rate": 6.539673838563794e-06, + "loss": 0.2923, + "step": 26001 + }, + { + "epoch": 1.152685838307108, + "grad_norm": 4.980666637420654, + "learning_rate": 6.539536795943538e-06, + "loss": 0.2456, + "step": 26002 + }, + { + "epoch": 1.1526994031470428, + "grad_norm": 5.597752094268799, + "learning_rate": 6.539399753323284e-06, + "loss": 0.2562, + "step": 26003 + }, + { + "epoch": 1.1527129679869779, + "grad_norm": 6.1020917892456055, + "learning_rate": 6.5392627107030296e-06, + "loss": 0.3479, + "step": 26004 + }, + { + "epoch": 1.1527265328269127, + "grad_norm": 5.907587051391602, + "learning_rate": 6.539125668082774e-06, + "loss": 0.2823, + "step": 26005 + }, + { + "epoch": 1.1527400976668476, + "grad_norm": 4.38021183013916, + "learning_rate": 6.538988625462519e-06, + "loss": 0.1472, + "step": 26006 + }, + { + "epoch": 1.1527536625067825, + "grad_norm": 6.121217727661133, + "learning_rate": 6.538851582842265e-06, + "loss": 0.2772, + "step": 26007 + }, + { + "epoch": 1.1527672273467173, + "grad_norm": 5.779946804046631, + "learning_rate": 6.5387145402220094e-06, + "loss": 0.2493, + "step": 26008 + }, + { + "epoch": 1.1527807921866522, + "grad_norm": 4.698751449584961, + "learning_rate": 6.538577497601755e-06, + "loss": 0.2533, + "step": 26009 + }, + { + "epoch": 1.152794357026587, + "grad_norm": 6.836981296539307, + "learning_rate": 6.5384404549815e-06, + "loss": 0.2737, + "step": 26010 + }, + { + "epoch": 1.152807921866522, + "grad_norm": 6.175583362579346, + "learning_rate": 6.538303412361244e-06, + "loss": 0.2726, + "step": 26011 + }, + { + "epoch": 1.1528214867064568, + "grad_norm": 4.242509365081787, + "learning_rate": 6.53816636974099e-06, + "loss": 0.1987, + "step": 26012 + }, + { + "epoch": 1.1528350515463917, + "grad_norm": 4.841552257537842, + "learning_rate": 6.538029327120735e-06, + "loss": 0.1795, + "step": 26013 + }, + { + "epoch": 1.1528486163863265, + "grad_norm": 3.839092969894409, + "learning_rate": 6.53789228450048e-06, + "loss": 0.175, + "step": 26014 + }, + { + "epoch": 1.1528621812262616, + "grad_norm": 4.533263683319092, + "learning_rate": 6.537755241880225e-06, + "loss": 0.1328, + "step": 26015 + }, + { + "epoch": 1.1528757460661965, + "grad_norm": 4.470119953155518, + "learning_rate": 6.537618199259971e-06, + "loss": 0.1922, + "step": 26016 + }, + { + "epoch": 1.1528893109061313, + "grad_norm": 3.8168509006500244, + "learning_rate": 6.537481156639715e-06, + "loss": 0.1734, + "step": 26017 + }, + { + "epoch": 1.1529028757460662, + "grad_norm": 7.033513069152832, + "learning_rate": 6.53734411401946e-06, + "loss": 0.3402, + "step": 26018 + }, + { + "epoch": 1.152916440586001, + "grad_norm": 4.737688064575195, + "learning_rate": 6.5372070713992056e-06, + "loss": 0.2086, + "step": 26019 + }, + { + "epoch": 1.152930005425936, + "grad_norm": 5.403140068054199, + "learning_rate": 6.537070028778952e-06, + "loss": 0.2746, + "step": 26020 + }, + { + "epoch": 1.1529435702658708, + "grad_norm": 5.824860095977783, + "learning_rate": 6.536932986158696e-06, + "loss": 0.2736, + "step": 26021 + }, + { + "epoch": 1.1529571351058057, + "grad_norm": 9.231639862060547, + "learning_rate": 6.536795943538441e-06, + "loss": 0.2998, + "step": 26022 + }, + { + "epoch": 1.1529706999457408, + "grad_norm": 6.1622419357299805, + "learning_rate": 6.5366589009181854e-06, + "loss": 0.2, + "step": 26023 + }, + { + "epoch": 1.1529842647856756, + "grad_norm": 4.8077616691589355, + "learning_rate": 6.5365218582979315e-06, + "loss": 0.2273, + "step": 26024 + }, + { + "epoch": 1.1529978296256105, + "grad_norm": 4.758199691772461, + "learning_rate": 6.536384815677677e-06, + "loss": 0.2203, + "step": 26025 + }, + { + "epoch": 1.1530113944655453, + "grad_norm": 5.619009971618652, + "learning_rate": 6.536247773057422e-06, + "loss": 0.2857, + "step": 26026 + }, + { + "epoch": 1.1530249593054802, + "grad_norm": 4.135692596435547, + "learning_rate": 6.536110730437166e-06, + "loss": 0.2195, + "step": 26027 + }, + { + "epoch": 1.153038524145415, + "grad_norm": 4.12519645690918, + "learning_rate": 6.535973687816911e-06, + "loss": 0.2126, + "step": 26028 + }, + { + "epoch": 1.15305208898535, + "grad_norm": 4.5346903800964355, + "learning_rate": 6.535836645196657e-06, + "loss": 0.1462, + "step": 26029 + }, + { + "epoch": 1.1530656538252848, + "grad_norm": 4.104244709014893, + "learning_rate": 6.535699602576402e-06, + "loss": 0.194, + "step": 26030 + }, + { + "epoch": 1.1530792186652197, + "grad_norm": 6.138612747192383, + "learning_rate": 6.535562559956147e-06, + "loss": 0.3054, + "step": 26031 + }, + { + "epoch": 1.1530927835051545, + "grad_norm": 4.711562156677246, + "learning_rate": 6.535425517335891e-06, + "loss": 0.218, + "step": 26032 + }, + { + "epoch": 1.1531063483450894, + "grad_norm": 4.815580368041992, + "learning_rate": 6.535288474715637e-06, + "loss": 0.1952, + "step": 26033 + }, + { + "epoch": 1.1531199131850245, + "grad_norm": 3.7814502716064453, + "learning_rate": 6.535151432095382e-06, + "loss": 0.2187, + "step": 26034 + }, + { + "epoch": 1.1531334780249594, + "grad_norm": 4.987088680267334, + "learning_rate": 6.5350143894751276e-06, + "loss": 0.1971, + "step": 26035 + }, + { + "epoch": 1.1531470428648942, + "grad_norm": 5.081757068634033, + "learning_rate": 6.534877346854872e-06, + "loss": 0.2141, + "step": 26036 + }, + { + "epoch": 1.153160607704829, + "grad_norm": 5.515547752380371, + "learning_rate": 6.534740304234618e-06, + "loss": 0.2391, + "step": 26037 + }, + { + "epoch": 1.153174172544764, + "grad_norm": 5.366880416870117, + "learning_rate": 6.534603261614363e-06, + "loss": 0.1938, + "step": 26038 + }, + { + "epoch": 1.1531877373846988, + "grad_norm": 6.512271404266357, + "learning_rate": 6.5344662189941074e-06, + "loss": 0.3204, + "step": 26039 + }, + { + "epoch": 1.1532013022246337, + "grad_norm": 5.86553955078125, + "learning_rate": 6.534329176373853e-06, + "loss": 0.3101, + "step": 26040 + }, + { + "epoch": 1.1532148670645685, + "grad_norm": 3.9390552043914795, + "learning_rate": 6.534192133753598e-06, + "loss": 0.1836, + "step": 26041 + }, + { + "epoch": 1.1532284319045036, + "grad_norm": 4.664339542388916, + "learning_rate": 6.534055091133343e-06, + "loss": 0.261, + "step": 26042 + }, + { + "epoch": 1.1532419967444385, + "grad_norm": 4.408114433288574, + "learning_rate": 6.533918048513088e-06, + "loss": 0.1939, + "step": 26043 + }, + { + "epoch": 1.1532555615843734, + "grad_norm": 4.278182506561279, + "learning_rate": 6.533781005892833e-06, + "loss": 0.1978, + "step": 26044 + }, + { + "epoch": 1.1532691264243082, + "grad_norm": 8.153039932250977, + "learning_rate": 6.533643963272578e-06, + "loss": 0.4355, + "step": 26045 + }, + { + "epoch": 1.153282691264243, + "grad_norm": 4.2076873779296875, + "learning_rate": 6.533506920652324e-06, + "loss": 0.1106, + "step": 26046 + }, + { + "epoch": 1.153296256104178, + "grad_norm": 4.2508955001831055, + "learning_rate": 6.533369878032069e-06, + "loss": 0.188, + "step": 26047 + }, + { + "epoch": 1.1533098209441128, + "grad_norm": 5.862218856811523, + "learning_rate": 6.533232835411813e-06, + "loss": 0.3605, + "step": 26048 + }, + { + "epoch": 1.1533233857840477, + "grad_norm": 6.447361946105957, + "learning_rate": 6.533095792791558e-06, + "loss": 0.357, + "step": 26049 + }, + { + "epoch": 1.1533369506239826, + "grad_norm": 6.598565101623535, + "learning_rate": 6.532958750171304e-06, + "loss": 0.2351, + "step": 26050 + }, + { + "epoch": 1.1533505154639174, + "grad_norm": 5.029759883880615, + "learning_rate": 6.53282170755105e-06, + "loss": 0.2513, + "step": 26051 + }, + { + "epoch": 1.1533640803038523, + "grad_norm": 4.080417633056641, + "learning_rate": 6.532684664930794e-06, + "loss": 0.1666, + "step": 26052 + }, + { + "epoch": 1.1533776451437874, + "grad_norm": 7.393796920776367, + "learning_rate": 6.532547622310539e-06, + "loss": 0.4092, + "step": 26053 + }, + { + "epoch": 1.1533912099837222, + "grad_norm": 4.851700782775879, + "learning_rate": 6.5324105796902834e-06, + "loss": 0.2437, + "step": 26054 + }, + { + "epoch": 1.153404774823657, + "grad_norm": 4.639833927154541, + "learning_rate": 6.5322735370700295e-06, + "loss": 0.213, + "step": 26055 + }, + { + "epoch": 1.153418339663592, + "grad_norm": 5.91077995300293, + "learning_rate": 6.532136494449775e-06, + "loss": 0.2856, + "step": 26056 + }, + { + "epoch": 1.1534319045035268, + "grad_norm": 6.546747207641602, + "learning_rate": 6.531999451829519e-06, + "loss": 0.4142, + "step": 26057 + }, + { + "epoch": 1.1534454693434617, + "grad_norm": 3.7915735244750977, + "learning_rate": 6.531862409209264e-06, + "loss": 0.1573, + "step": 26058 + }, + { + "epoch": 1.1534590341833966, + "grad_norm": 3.991407871246338, + "learning_rate": 6.53172536658901e-06, + "loss": 0.2075, + "step": 26059 + }, + { + "epoch": 1.1534725990233314, + "grad_norm": 3.803816318511963, + "learning_rate": 6.531588323968755e-06, + "loss": 0.2287, + "step": 26060 + }, + { + "epoch": 1.1534861638632665, + "grad_norm": 7.296191215515137, + "learning_rate": 6.5314512813485e-06, + "loss": 0.5193, + "step": 26061 + }, + { + "epoch": 1.1534997287032014, + "grad_norm": 4.647146701812744, + "learning_rate": 6.531314238728245e-06, + "loss": 0.1985, + "step": 26062 + }, + { + "epoch": 1.1535132935431363, + "grad_norm": 5.639173984527588, + "learning_rate": 6.531177196107991e-06, + "loss": 0.2955, + "step": 26063 + }, + { + "epoch": 1.1535268583830711, + "grad_norm": 5.5449323654174805, + "learning_rate": 6.531040153487735e-06, + "loss": 0.3085, + "step": 26064 + }, + { + "epoch": 1.153540423223006, + "grad_norm": 9.628716468811035, + "learning_rate": 6.53090311086748e-06, + "loss": 0.5033, + "step": 26065 + }, + { + "epoch": 1.1535539880629408, + "grad_norm": 6.207576274871826, + "learning_rate": 6.530766068247225e-06, + "loss": 0.3858, + "step": 26066 + }, + { + "epoch": 1.1535675529028757, + "grad_norm": 3.758598566055298, + "learning_rate": 6.53062902562697e-06, + "loss": 0.3273, + "step": 26067 + }, + { + "epoch": 1.1535811177428106, + "grad_norm": 4.438844680786133, + "learning_rate": 6.530491983006716e-06, + "loss": 0.1796, + "step": 26068 + }, + { + "epoch": 1.1535946825827454, + "grad_norm": 5.803892135620117, + "learning_rate": 6.530354940386461e-06, + "loss": 0.3835, + "step": 26069 + }, + { + "epoch": 1.1536082474226803, + "grad_norm": 4.817952632904053, + "learning_rate": 6.5302178977662054e-06, + "loss": 0.3025, + "step": 26070 + }, + { + "epoch": 1.1536218122626154, + "grad_norm": 5.4888739585876465, + "learning_rate": 6.530080855145951e-06, + "loss": 0.3471, + "step": 26071 + }, + { + "epoch": 1.1536353771025503, + "grad_norm": 6.140187740325928, + "learning_rate": 6.529943812525697e-06, + "loss": 0.3851, + "step": 26072 + }, + { + "epoch": 1.1536489419424851, + "grad_norm": 6.206893444061279, + "learning_rate": 6.529806769905441e-06, + "loss": 0.2533, + "step": 26073 + }, + { + "epoch": 1.15366250678242, + "grad_norm": 4.875179767608643, + "learning_rate": 6.529669727285186e-06, + "loss": 0.2819, + "step": 26074 + }, + { + "epoch": 1.1536760716223549, + "grad_norm": 6.934584617614746, + "learning_rate": 6.529532684664931e-06, + "loss": 0.3307, + "step": 26075 + }, + { + "epoch": 1.1536896364622897, + "grad_norm": 6.242562770843506, + "learning_rate": 6.5293956420446765e-06, + "loss": 0.2195, + "step": 26076 + }, + { + "epoch": 1.1537032013022246, + "grad_norm": 6.132480621337891, + "learning_rate": 6.529258599424422e-06, + "loss": 0.4891, + "step": 26077 + }, + { + "epoch": 1.1537167661421595, + "grad_norm": 3.9881041049957275, + "learning_rate": 6.529121556804167e-06, + "loss": 0.3093, + "step": 26078 + }, + { + "epoch": 1.1537303309820943, + "grad_norm": 5.936184883117676, + "learning_rate": 6.528984514183911e-06, + "loss": 0.4477, + "step": 26079 + }, + { + "epoch": 1.1537438958220294, + "grad_norm": 6.637962341308594, + "learning_rate": 6.528847471563656e-06, + "loss": 0.3479, + "step": 26080 + }, + { + "epoch": 1.1537574606619643, + "grad_norm": 6.7512922286987305, + "learning_rate": 6.528710428943402e-06, + "loss": 0.4204, + "step": 26081 + }, + { + "epoch": 1.1537710255018991, + "grad_norm": 7.372506141662598, + "learning_rate": 6.528573386323147e-06, + "loss": 0.5553, + "step": 26082 + }, + { + "epoch": 1.153784590341834, + "grad_norm": 5.809085845947266, + "learning_rate": 6.528436343702892e-06, + "loss": 0.2876, + "step": 26083 + }, + { + "epoch": 1.1537981551817689, + "grad_norm": 4.95929479598999, + "learning_rate": 6.528299301082637e-06, + "loss": 0.2629, + "step": 26084 + }, + { + "epoch": 1.1538117200217037, + "grad_norm": 5.095600128173828, + "learning_rate": 6.528162258462383e-06, + "loss": 0.2976, + "step": 26085 + }, + { + "epoch": 1.1538252848616386, + "grad_norm": 6.317352771759033, + "learning_rate": 6.5280252158421275e-06, + "loss": 0.2609, + "step": 26086 + }, + { + "epoch": 1.1538388497015735, + "grad_norm": 5.688023090362549, + "learning_rate": 6.527888173221873e-06, + "loss": 0.2957, + "step": 26087 + }, + { + "epoch": 1.1538524145415083, + "grad_norm": 8.210197448730469, + "learning_rate": 6.527751130601617e-06, + "loss": 0.473, + "step": 26088 + }, + { + "epoch": 1.1538659793814432, + "grad_norm": 6.314684867858887, + "learning_rate": 6.527614087981363e-06, + "loss": 0.3925, + "step": 26089 + }, + { + "epoch": 1.1538795442213783, + "grad_norm": 5.214748382568359, + "learning_rate": 6.527477045361108e-06, + "loss": 0.2521, + "step": 26090 + }, + { + "epoch": 1.1538931090613131, + "grad_norm": 6.535393238067627, + "learning_rate": 6.5273400027408525e-06, + "loss": 0.3536, + "step": 26091 + }, + { + "epoch": 1.153906673901248, + "grad_norm": 5.762927532196045, + "learning_rate": 6.527202960120598e-06, + "loss": 0.3255, + "step": 26092 + }, + { + "epoch": 1.1539202387411829, + "grad_norm": 5.718538761138916, + "learning_rate": 6.527065917500344e-06, + "loss": 0.3667, + "step": 26093 + }, + { + "epoch": 1.1539338035811177, + "grad_norm": 5.7885847091674805, + "learning_rate": 6.526928874880089e-06, + "loss": 0.3235, + "step": 26094 + }, + { + "epoch": 1.1539473684210526, + "grad_norm": 6.1485676765441895, + "learning_rate": 6.526791832259833e-06, + "loss": 0.336, + "step": 26095 + }, + { + "epoch": 1.1539609332609875, + "grad_norm": 4.900198936462402, + "learning_rate": 6.526654789639578e-06, + "loss": 0.3263, + "step": 26096 + }, + { + "epoch": 1.1539744981009223, + "grad_norm": 4.982438087463379, + "learning_rate": 6.526517747019323e-06, + "loss": 0.23, + "step": 26097 + }, + { + "epoch": 1.1539880629408572, + "grad_norm": 7.574502468109131, + "learning_rate": 6.526380704399069e-06, + "loss": 0.4183, + "step": 26098 + }, + { + "epoch": 1.1540016277807923, + "grad_norm": 4.653055191040039, + "learning_rate": 6.526243661778814e-06, + "loss": 0.2032, + "step": 26099 + }, + { + "epoch": 1.1540151926207272, + "grad_norm": 6.697197914123535, + "learning_rate": 6.526106619158559e-06, + "loss": 0.3218, + "step": 26100 + }, + { + "epoch": 1.154028757460662, + "grad_norm": 6.662014484405518, + "learning_rate": 6.5259695765383034e-06, + "loss": 0.375, + "step": 26101 + }, + { + "epoch": 1.154042322300597, + "grad_norm": 4.85270881652832, + "learning_rate": 6.5258325339180495e-06, + "loss": 0.2631, + "step": 26102 + }, + { + "epoch": 1.1540558871405318, + "grad_norm": 6.729706764221191, + "learning_rate": 6.525695491297795e-06, + "loss": 0.2727, + "step": 26103 + }, + { + "epoch": 1.1540694519804666, + "grad_norm": 6.2035298347473145, + "learning_rate": 6.525558448677539e-06, + "loss": 0.3277, + "step": 26104 + }, + { + "epoch": 1.1540830168204015, + "grad_norm": 8.409625053405762, + "learning_rate": 6.525421406057284e-06, + "loss": 0.2874, + "step": 26105 + }, + { + "epoch": 1.1540965816603364, + "grad_norm": 5.047633171081543, + "learning_rate": 6.52528436343703e-06, + "loss": 0.2179, + "step": 26106 + }, + { + "epoch": 1.1541101465002712, + "grad_norm": 6.179032325744629, + "learning_rate": 6.5251473208167745e-06, + "loss": 0.3241, + "step": 26107 + }, + { + "epoch": 1.154123711340206, + "grad_norm": 7.317830562591553, + "learning_rate": 6.52501027819652e-06, + "loss": 0.3899, + "step": 26108 + }, + { + "epoch": 1.1541372761801412, + "grad_norm": 5.766955852508545, + "learning_rate": 6.524873235576265e-06, + "loss": 0.2242, + "step": 26109 + }, + { + "epoch": 1.154150841020076, + "grad_norm": 5.516059398651123, + "learning_rate": 6.524736192956009e-06, + "loss": 0.2743, + "step": 26110 + }, + { + "epoch": 1.154164405860011, + "grad_norm": 5.993807315826416, + "learning_rate": 6.524599150335755e-06, + "loss": 0.2488, + "step": 26111 + }, + { + "epoch": 1.1541779706999458, + "grad_norm": 5.7787184715271, + "learning_rate": 6.5244621077155e-06, + "loss": 0.3496, + "step": 26112 + }, + { + "epoch": 1.1541915355398806, + "grad_norm": 7.288054943084717, + "learning_rate": 6.524325065095245e-06, + "loss": 0.5287, + "step": 26113 + }, + { + "epoch": 1.1542051003798155, + "grad_norm": 3.903524398803711, + "learning_rate": 6.52418802247499e-06, + "loss": 0.217, + "step": 26114 + }, + { + "epoch": 1.1542186652197504, + "grad_norm": 5.263082981109619, + "learning_rate": 6.524050979854736e-06, + "loss": 0.2117, + "step": 26115 + }, + { + "epoch": 1.1542322300596852, + "grad_norm": 5.861460208892822, + "learning_rate": 6.52391393723448e-06, + "loss": 0.4292, + "step": 26116 + }, + { + "epoch": 1.15424579489962, + "grad_norm": 5.001262664794922, + "learning_rate": 6.5237768946142255e-06, + "loss": 0.3852, + "step": 26117 + }, + { + "epoch": 1.1542593597395552, + "grad_norm": 6.507617473602295, + "learning_rate": 6.523639851993971e-06, + "loss": 0.4125, + "step": 26118 + }, + { + "epoch": 1.15427292457949, + "grad_norm": 4.09765100479126, + "learning_rate": 6.523502809373717e-06, + "loss": 0.1986, + "step": 26119 + }, + { + "epoch": 1.154286489419425, + "grad_norm": 5.893460750579834, + "learning_rate": 6.523365766753461e-06, + "loss": 0.2438, + "step": 26120 + }, + { + "epoch": 1.1543000542593598, + "grad_norm": 4.038003444671631, + "learning_rate": 6.523228724133206e-06, + "loss": 0.2647, + "step": 26121 + }, + { + "epoch": 1.1543136190992946, + "grad_norm": 5.468647480010986, + "learning_rate": 6.5230916815129505e-06, + "loss": 0.3754, + "step": 26122 + }, + { + "epoch": 1.1543271839392295, + "grad_norm": 4.987828731536865, + "learning_rate": 6.522954638892696e-06, + "loss": 0.3014, + "step": 26123 + }, + { + "epoch": 1.1543407487791644, + "grad_norm": 4.641639232635498, + "learning_rate": 6.522817596272442e-06, + "loss": 0.2572, + "step": 26124 + }, + { + "epoch": 1.1543543136190992, + "grad_norm": 5.2729902267456055, + "learning_rate": 6.522680553652186e-06, + "loss": 0.3134, + "step": 26125 + }, + { + "epoch": 1.154367878459034, + "grad_norm": 6.499546051025391, + "learning_rate": 6.522543511031931e-06, + "loss": 0.2245, + "step": 26126 + }, + { + "epoch": 1.154381443298969, + "grad_norm": 5.05212926864624, + "learning_rate": 6.522406468411676e-06, + "loss": 0.3415, + "step": 26127 + }, + { + "epoch": 1.154395008138904, + "grad_norm": 5.59515905380249, + "learning_rate": 6.5222694257914224e-06, + "loss": 0.3447, + "step": 26128 + }, + { + "epoch": 1.154408572978839, + "grad_norm": 5.299412250518799, + "learning_rate": 6.522132383171167e-06, + "loss": 0.3843, + "step": 26129 + }, + { + "epoch": 1.1544221378187738, + "grad_norm": 5.193877696990967, + "learning_rate": 6.521995340550912e-06, + "loss": 0.2948, + "step": 26130 + }, + { + "epoch": 1.1544357026587087, + "grad_norm": 6.132879257202148, + "learning_rate": 6.521858297930656e-06, + "loss": 0.3886, + "step": 26131 + }, + { + "epoch": 1.1544492674986435, + "grad_norm": 4.135476112365723, + "learning_rate": 6.521721255310402e-06, + "loss": 0.2135, + "step": 26132 + }, + { + "epoch": 1.1544628323385784, + "grad_norm": 6.557122230529785, + "learning_rate": 6.5215842126901475e-06, + "loss": 0.2824, + "step": 26133 + }, + { + "epoch": 1.1544763971785132, + "grad_norm": 4.496933460235596, + "learning_rate": 6.521447170069893e-06, + "loss": 0.2731, + "step": 26134 + }, + { + "epoch": 1.1544899620184481, + "grad_norm": 6.63357400894165, + "learning_rate": 6.521310127449637e-06, + "loss": 0.4022, + "step": 26135 + }, + { + "epoch": 1.154503526858383, + "grad_norm": 6.2963480949401855, + "learning_rate": 6.521173084829382e-06, + "loss": 0.2458, + "step": 26136 + }, + { + "epoch": 1.154517091698318, + "grad_norm": 4.507595062255859, + "learning_rate": 6.521036042209128e-06, + "loss": 0.2909, + "step": 26137 + }, + { + "epoch": 1.154530656538253, + "grad_norm": 5.692119121551514, + "learning_rate": 6.5208989995888725e-06, + "loss": 0.3626, + "step": 26138 + }, + { + "epoch": 1.1545442213781878, + "grad_norm": 9.930768013000488, + "learning_rate": 6.520761956968618e-06, + "loss": 0.2421, + "step": 26139 + }, + { + "epoch": 1.1545577862181227, + "grad_norm": 6.113982200622559, + "learning_rate": 6.520624914348362e-06, + "loss": 0.2659, + "step": 26140 + }, + { + "epoch": 1.1545713510580575, + "grad_norm": 6.46525764465332, + "learning_rate": 6.520487871728108e-06, + "loss": 0.3515, + "step": 26141 + }, + { + "epoch": 1.1545849158979924, + "grad_norm": 4.765867710113525, + "learning_rate": 6.520350829107853e-06, + "loss": 0.2332, + "step": 26142 + }, + { + "epoch": 1.1545984807379273, + "grad_norm": 6.9466142654418945, + "learning_rate": 6.520213786487598e-06, + "loss": 0.2431, + "step": 26143 + }, + { + "epoch": 1.1546120455778621, + "grad_norm": 5.311005115509033, + "learning_rate": 6.520076743867343e-06, + "loss": 0.2616, + "step": 26144 + }, + { + "epoch": 1.154625610417797, + "grad_norm": 6.356371879577637, + "learning_rate": 6.519939701247089e-06, + "loss": 0.409, + "step": 26145 + }, + { + "epoch": 1.1546391752577319, + "grad_norm": 5.334234714508057, + "learning_rate": 6.519802658626834e-06, + "loss": 0.2435, + "step": 26146 + }, + { + "epoch": 1.154652740097667, + "grad_norm": 5.620334625244141, + "learning_rate": 6.519665616006578e-06, + "loss": 0.2413, + "step": 26147 + }, + { + "epoch": 1.1546663049376018, + "grad_norm": 6.5725579261779785, + "learning_rate": 6.5195285733863235e-06, + "loss": 0.249, + "step": 26148 + }, + { + "epoch": 1.1546798697775367, + "grad_norm": 5.106110095977783, + "learning_rate": 6.519391530766069e-06, + "loss": 0.2041, + "step": 26149 + }, + { + "epoch": 1.1546934346174715, + "grad_norm": 5.158022403717041, + "learning_rate": 6.519254488145814e-06, + "loss": 0.1679, + "step": 26150 + }, + { + "epoch": 1.1547069994574064, + "grad_norm": 7.191740989685059, + "learning_rate": 6.519117445525559e-06, + "loss": 0.4295, + "step": 26151 + }, + { + "epoch": 1.1547205642973413, + "grad_norm": 4.883424758911133, + "learning_rate": 6.518980402905304e-06, + "loss": 0.2391, + "step": 26152 + }, + { + "epoch": 1.1547341291372761, + "grad_norm": 6.50970983505249, + "learning_rate": 6.5188433602850485e-06, + "loss": 0.2386, + "step": 26153 + }, + { + "epoch": 1.154747693977211, + "grad_norm": 5.553566932678223, + "learning_rate": 6.5187063176647945e-06, + "loss": 0.2607, + "step": 26154 + }, + { + "epoch": 1.1547612588171459, + "grad_norm": 5.451371192932129, + "learning_rate": 6.51856927504454e-06, + "loss": 0.3078, + "step": 26155 + }, + { + "epoch": 1.154774823657081, + "grad_norm": 6.669186115264893, + "learning_rate": 6.518432232424284e-06, + "loss": 0.3236, + "step": 26156 + }, + { + "epoch": 1.1547883884970158, + "grad_norm": 5.840672492980957, + "learning_rate": 6.518295189804029e-06, + "loss": 0.3138, + "step": 26157 + }, + { + "epoch": 1.1548019533369507, + "grad_norm": 4.84374475479126, + "learning_rate": 6.518158147183775e-06, + "loss": 0.1488, + "step": 26158 + }, + { + "epoch": 1.1548155181768855, + "grad_norm": 6.842902660369873, + "learning_rate": 6.51802110456352e-06, + "loss": 0.3432, + "step": 26159 + }, + { + "epoch": 1.1548290830168204, + "grad_norm": 7.244910717010498, + "learning_rate": 6.517884061943265e-06, + "loss": 0.2884, + "step": 26160 + }, + { + "epoch": 1.1548426478567553, + "grad_norm": 5.174653053283691, + "learning_rate": 6.51774701932301e-06, + "loss": 0.3156, + "step": 26161 + }, + { + "epoch": 1.1548562126966901, + "grad_norm": 6.417924404144287, + "learning_rate": 6.517609976702756e-06, + "loss": 0.3336, + "step": 26162 + }, + { + "epoch": 1.154869777536625, + "grad_norm": 5.925288677215576, + "learning_rate": 6.5174729340825e-06, + "loss": 0.2584, + "step": 26163 + }, + { + "epoch": 1.1548833423765599, + "grad_norm": 6.253847599029541, + "learning_rate": 6.5173358914622455e-06, + "loss": 0.2682, + "step": 26164 + }, + { + "epoch": 1.1548969072164947, + "grad_norm": 5.579732894897461, + "learning_rate": 6.51719884884199e-06, + "loss": 0.3092, + "step": 26165 + }, + { + "epoch": 1.1549104720564298, + "grad_norm": 3.5690855979919434, + "learning_rate": 6.517061806221735e-06, + "loss": 0.1688, + "step": 26166 + }, + { + "epoch": 1.1549240368963647, + "grad_norm": 4.881473541259766, + "learning_rate": 6.516924763601481e-06, + "loss": 0.2355, + "step": 26167 + }, + { + "epoch": 1.1549376017362996, + "grad_norm": 6.956845760345459, + "learning_rate": 6.516787720981226e-06, + "loss": 0.352, + "step": 26168 + }, + { + "epoch": 1.1549511665762344, + "grad_norm": 5.663668632507324, + "learning_rate": 6.5166506783609705e-06, + "loss": 0.2765, + "step": 26169 + }, + { + "epoch": 1.1549647314161693, + "grad_norm": 4.497124671936035, + "learning_rate": 6.516513635740716e-06, + "loss": 0.163, + "step": 26170 + }, + { + "epoch": 1.1549782962561042, + "grad_norm": 6.250485420227051, + "learning_rate": 6.516376593120462e-06, + "loss": 0.2953, + "step": 26171 + }, + { + "epoch": 1.154991861096039, + "grad_norm": 4.473268508911133, + "learning_rate": 6.516239550500206e-06, + "loss": 0.2744, + "step": 26172 + }, + { + "epoch": 1.1550054259359739, + "grad_norm": 6.2523884773254395, + "learning_rate": 6.516102507879951e-06, + "loss": 0.2236, + "step": 26173 + }, + { + "epoch": 1.1550189907759087, + "grad_norm": 4.696493148803711, + "learning_rate": 6.515965465259696e-06, + "loss": 0.2047, + "step": 26174 + }, + { + "epoch": 1.1550325556158438, + "grad_norm": 5.2140398025512695, + "learning_rate": 6.515828422639442e-06, + "loss": 0.2713, + "step": 26175 + }, + { + "epoch": 1.1550461204557787, + "grad_norm": 4.789353847503662, + "learning_rate": 6.515691380019187e-06, + "loss": 0.2241, + "step": 26176 + }, + { + "epoch": 1.1550596852957136, + "grad_norm": 5.804317474365234, + "learning_rate": 6.515554337398932e-06, + "loss": 0.2535, + "step": 26177 + }, + { + "epoch": 1.1550732501356484, + "grad_norm": 5.260910511016846, + "learning_rate": 6.515417294778676e-06, + "loss": 0.2388, + "step": 26178 + }, + { + "epoch": 1.1550868149755833, + "grad_norm": 6.524652481079102, + "learning_rate": 6.5152802521584215e-06, + "loss": 0.3032, + "step": 26179 + }, + { + "epoch": 1.1551003798155182, + "grad_norm": 6.23508882522583, + "learning_rate": 6.5151432095381675e-06, + "loss": 0.2389, + "step": 26180 + }, + { + "epoch": 1.155113944655453, + "grad_norm": 5.395965576171875, + "learning_rate": 6.515006166917912e-06, + "loss": 0.2177, + "step": 26181 + }, + { + "epoch": 1.155127509495388, + "grad_norm": 5.737998962402344, + "learning_rate": 6.514869124297657e-06, + "loss": 0.2491, + "step": 26182 + }, + { + "epoch": 1.1551410743353228, + "grad_norm": 6.220072269439697, + "learning_rate": 6.514732081677402e-06, + "loss": 0.2614, + "step": 26183 + }, + { + "epoch": 1.1551546391752576, + "grad_norm": 5.165794849395752, + "learning_rate": 6.514595039057147e-06, + "loss": 0.2926, + "step": 26184 + }, + { + "epoch": 1.1551682040151927, + "grad_norm": 8.371711730957031, + "learning_rate": 6.5144579964368925e-06, + "loss": 0.351, + "step": 26185 + }, + { + "epoch": 1.1551817688551276, + "grad_norm": 3.9901771545410156, + "learning_rate": 6.514320953816638e-06, + "loss": 0.1892, + "step": 26186 + }, + { + "epoch": 1.1551953336950624, + "grad_norm": 4.3581976890563965, + "learning_rate": 6.514183911196382e-06, + "loss": 0.1836, + "step": 26187 + }, + { + "epoch": 1.1552088985349973, + "grad_norm": 4.488614559173584, + "learning_rate": 6.514046868576128e-06, + "loss": 0.1784, + "step": 26188 + }, + { + "epoch": 1.1552224633749322, + "grad_norm": 5.7267584800720215, + "learning_rate": 6.513909825955873e-06, + "loss": 0.2579, + "step": 26189 + }, + { + "epoch": 1.155236028214867, + "grad_norm": 3.7392451763153076, + "learning_rate": 6.513772783335618e-06, + "loss": 0.1711, + "step": 26190 + }, + { + "epoch": 1.155249593054802, + "grad_norm": 6.019224643707275, + "learning_rate": 6.513635740715363e-06, + "loss": 0.2876, + "step": 26191 + }, + { + "epoch": 1.1552631578947368, + "grad_norm": 7.004576206207275, + "learning_rate": 6.513498698095108e-06, + "loss": 0.3034, + "step": 26192 + }, + { + "epoch": 1.1552767227346716, + "grad_norm": 6.930333614349365, + "learning_rate": 6.513361655474853e-06, + "loss": 0.322, + "step": 26193 + }, + { + "epoch": 1.1552902875746067, + "grad_norm": 5.333330154418945, + "learning_rate": 6.513224612854598e-06, + "loss": 0.247, + "step": 26194 + }, + { + "epoch": 1.1553038524145416, + "grad_norm": 4.65306282043457, + "learning_rate": 6.5130875702343435e-06, + "loss": 0.2727, + "step": 26195 + }, + { + "epoch": 1.1553174172544765, + "grad_norm": 4.345666408538818, + "learning_rate": 6.512950527614088e-06, + "loss": 0.2993, + "step": 26196 + }, + { + "epoch": 1.1553309820944113, + "grad_norm": 6.319233417510986, + "learning_rate": 6.512813484993834e-06, + "loss": 0.3159, + "step": 26197 + }, + { + "epoch": 1.1553445469343462, + "grad_norm": 6.745505332946777, + "learning_rate": 6.512676442373579e-06, + "loss": 0.2848, + "step": 26198 + }, + { + "epoch": 1.155358111774281, + "grad_norm": 6.068150520324707, + "learning_rate": 6.512539399753323e-06, + "loss": 0.3907, + "step": 26199 + }, + { + "epoch": 1.155371676614216, + "grad_norm": 7.401545524597168, + "learning_rate": 6.5124023571330685e-06, + "loss": 0.3571, + "step": 26200 + }, + { + "epoch": 1.1553852414541508, + "grad_norm": 6.863710403442383, + "learning_rate": 6.5122653145128146e-06, + "loss": 0.2596, + "step": 26201 + }, + { + "epoch": 1.1553988062940856, + "grad_norm": 9.037757873535156, + "learning_rate": 6.51212827189256e-06, + "loss": 0.4062, + "step": 26202 + }, + { + "epoch": 1.1554123711340205, + "grad_norm": 6.773474216461182, + "learning_rate": 6.511991229272304e-06, + "loss": 0.3579, + "step": 26203 + }, + { + "epoch": 1.1554259359739556, + "grad_norm": 5.809441089630127, + "learning_rate": 6.511854186652049e-06, + "loss": 0.2651, + "step": 26204 + }, + { + "epoch": 1.1554395008138905, + "grad_norm": 6.490589141845703, + "learning_rate": 6.511717144031794e-06, + "loss": 0.2726, + "step": 26205 + }, + { + "epoch": 1.1554530656538253, + "grad_norm": 4.924217224121094, + "learning_rate": 6.51158010141154e-06, + "loss": 0.3096, + "step": 26206 + }, + { + "epoch": 1.1554666304937602, + "grad_norm": 7.123684406280518, + "learning_rate": 6.511443058791285e-06, + "loss": 0.3578, + "step": 26207 + }, + { + "epoch": 1.155480195333695, + "grad_norm": 6.00363826751709, + "learning_rate": 6.511306016171029e-06, + "loss": 0.2319, + "step": 26208 + }, + { + "epoch": 1.15549376017363, + "grad_norm": 5.741829872131348, + "learning_rate": 6.511168973550774e-06, + "loss": 0.3247, + "step": 26209 + }, + { + "epoch": 1.1555073250135648, + "grad_norm": 6.075077056884766, + "learning_rate": 6.51103193093052e-06, + "loss": 0.2823, + "step": 26210 + }, + { + "epoch": 1.1555208898534997, + "grad_norm": 5.5908918380737305, + "learning_rate": 6.5108948883102655e-06, + "loss": 0.235, + "step": 26211 + }, + { + "epoch": 1.1555344546934345, + "grad_norm": 6.412603855133057, + "learning_rate": 6.51075784569001e-06, + "loss": 0.3204, + "step": 26212 + }, + { + "epoch": 1.1555480195333696, + "grad_norm": 5.719968795776367, + "learning_rate": 6.510620803069755e-06, + "loss": 0.2544, + "step": 26213 + }, + { + "epoch": 1.1555615843733045, + "grad_norm": 6.3861260414123535, + "learning_rate": 6.510483760449501e-06, + "loss": 0.3806, + "step": 26214 + }, + { + "epoch": 1.1555751492132393, + "grad_norm": 5.525580406188965, + "learning_rate": 6.510346717829245e-06, + "loss": 0.2483, + "step": 26215 + }, + { + "epoch": 1.1555887140531742, + "grad_norm": 5.788393497467041, + "learning_rate": 6.5102096752089905e-06, + "loss": 0.4408, + "step": 26216 + }, + { + "epoch": 1.155602278893109, + "grad_norm": 5.059755802154541, + "learning_rate": 6.510072632588736e-06, + "loss": 0.3268, + "step": 26217 + }, + { + "epoch": 1.155615843733044, + "grad_norm": 6.834549903869629, + "learning_rate": 6.50993558996848e-06, + "loss": 0.4094, + "step": 26218 + }, + { + "epoch": 1.1556294085729788, + "grad_norm": 5.043808460235596, + "learning_rate": 6.509798547348226e-06, + "loss": 0.2156, + "step": 26219 + }, + { + "epoch": 1.1556429734129137, + "grad_norm": 7.266154766082764, + "learning_rate": 6.509661504727971e-06, + "loss": 0.4581, + "step": 26220 + }, + { + "epoch": 1.1556565382528485, + "grad_norm": 6.556759834289551, + "learning_rate": 6.509524462107716e-06, + "loss": 0.3064, + "step": 26221 + }, + { + "epoch": 1.1556701030927834, + "grad_norm": 5.063448429107666, + "learning_rate": 6.509387419487461e-06, + "loss": 0.3964, + "step": 26222 + }, + { + "epoch": 1.1556836679327185, + "grad_norm": 7.425429821014404, + "learning_rate": 6.509250376867207e-06, + "loss": 0.3906, + "step": 26223 + }, + { + "epoch": 1.1556972327726533, + "grad_norm": 4.827759265899658, + "learning_rate": 6.509113334246951e-06, + "loss": 0.1713, + "step": 26224 + }, + { + "epoch": 1.1557107976125882, + "grad_norm": 6.145545959472656, + "learning_rate": 6.508976291626696e-06, + "loss": 0.3595, + "step": 26225 + }, + { + "epoch": 1.155724362452523, + "grad_norm": 7.417929172515869, + "learning_rate": 6.5088392490064415e-06, + "loss": 0.4233, + "step": 26226 + }, + { + "epoch": 1.155737927292458, + "grad_norm": 6.2319488525390625, + "learning_rate": 6.5087022063861875e-06, + "loss": 0.3924, + "step": 26227 + }, + { + "epoch": 1.1557514921323928, + "grad_norm": 5.1284379959106445, + "learning_rate": 6.508565163765932e-06, + "loss": 0.3107, + "step": 26228 + }, + { + "epoch": 1.1557650569723277, + "grad_norm": 6.617476940155029, + "learning_rate": 6.508428121145677e-06, + "loss": 0.4016, + "step": 26229 + }, + { + "epoch": 1.1557786218122625, + "grad_norm": 5.685676097869873, + "learning_rate": 6.508291078525421e-06, + "loss": 0.3538, + "step": 26230 + }, + { + "epoch": 1.1557921866521976, + "grad_norm": 8.527231216430664, + "learning_rate": 6.508154035905167e-06, + "loss": 0.4886, + "step": 26231 + }, + { + "epoch": 1.1558057514921325, + "grad_norm": 6.234652042388916, + "learning_rate": 6.5080169932849126e-06, + "loss": 0.3358, + "step": 26232 + }, + { + "epoch": 1.1558193163320674, + "grad_norm": 4.5014495849609375, + "learning_rate": 6.507879950664657e-06, + "loss": 0.2699, + "step": 26233 + }, + { + "epoch": 1.1558328811720022, + "grad_norm": 7.728561878204346, + "learning_rate": 6.507742908044402e-06, + "loss": 0.356, + "step": 26234 + }, + { + "epoch": 1.155846446011937, + "grad_norm": 3.8552961349487305, + "learning_rate": 6.507605865424147e-06, + "loss": 0.2792, + "step": 26235 + }, + { + "epoch": 1.155860010851872, + "grad_norm": 8.719545364379883, + "learning_rate": 6.507468822803893e-06, + "loss": 0.3952, + "step": 26236 + }, + { + "epoch": 1.1558735756918068, + "grad_norm": 5.993190765380859, + "learning_rate": 6.507331780183638e-06, + "loss": 0.2381, + "step": 26237 + }, + { + "epoch": 1.1558871405317417, + "grad_norm": 3.420034170150757, + "learning_rate": 6.507194737563383e-06, + "loss": 0.1411, + "step": 26238 + }, + { + "epoch": 1.1559007053716766, + "grad_norm": 6.770304203033447, + "learning_rate": 6.507057694943127e-06, + "loss": 0.4086, + "step": 26239 + }, + { + "epoch": 1.1559142702116114, + "grad_norm": 6.838080406188965, + "learning_rate": 6.506920652322873e-06, + "loss": 0.3346, + "step": 26240 + }, + { + "epoch": 1.1559278350515463, + "grad_norm": 3.7029247283935547, + "learning_rate": 6.506783609702618e-06, + "loss": 0.2392, + "step": 26241 + }, + { + "epoch": 1.1559413998914814, + "grad_norm": 4.6470184326171875, + "learning_rate": 6.5066465670823635e-06, + "loss": 0.1572, + "step": 26242 + }, + { + "epoch": 1.1559549647314162, + "grad_norm": 6.050160884857178, + "learning_rate": 6.506509524462108e-06, + "loss": 0.3414, + "step": 26243 + }, + { + "epoch": 1.155968529571351, + "grad_norm": 3.6570639610290527, + "learning_rate": 6.506372481841854e-06, + "loss": 0.1953, + "step": 26244 + }, + { + "epoch": 1.155982094411286, + "grad_norm": 6.901186466217041, + "learning_rate": 6.506235439221599e-06, + "loss": 0.3171, + "step": 26245 + }, + { + "epoch": 1.1559956592512208, + "grad_norm": 5.449454307556152, + "learning_rate": 6.506098396601343e-06, + "loss": 0.1992, + "step": 26246 + }, + { + "epoch": 1.1560092240911557, + "grad_norm": 5.734130382537842, + "learning_rate": 6.5059613539810886e-06, + "loss": 0.2413, + "step": 26247 + }, + { + "epoch": 1.1560227889310906, + "grad_norm": 5.95329475402832, + "learning_rate": 6.505824311360833e-06, + "loss": 0.257, + "step": 26248 + }, + { + "epoch": 1.1560363537710254, + "grad_norm": 6.29089879989624, + "learning_rate": 6.505687268740579e-06, + "loss": 0.2052, + "step": 26249 + }, + { + "epoch": 1.1560499186109605, + "grad_norm": 5.633706569671631, + "learning_rate": 6.505550226120324e-06, + "loss": 0.2731, + "step": 26250 + }, + { + "epoch": 1.1560634834508954, + "grad_norm": 6.047572612762451, + "learning_rate": 6.505413183500069e-06, + "loss": 0.3159, + "step": 26251 + }, + { + "epoch": 1.1560770482908302, + "grad_norm": 4.623862266540527, + "learning_rate": 6.505276140879814e-06, + "loss": 0.157, + "step": 26252 + }, + { + "epoch": 1.156090613130765, + "grad_norm": 4.9558868408203125, + "learning_rate": 6.50513909825956e-06, + "loss": 0.238, + "step": 26253 + }, + { + "epoch": 1.1561041779707, + "grad_norm": 6.396154403686523, + "learning_rate": 6.505002055639305e-06, + "loss": 0.2727, + "step": 26254 + }, + { + "epoch": 1.1561177428106348, + "grad_norm": 5.309970855712891, + "learning_rate": 6.504865013019049e-06, + "loss": 0.2423, + "step": 26255 + }, + { + "epoch": 1.1561313076505697, + "grad_norm": 4.399226665496826, + "learning_rate": 6.504727970398794e-06, + "loss": 0.2448, + "step": 26256 + }, + { + "epoch": 1.1561448724905046, + "grad_norm": 3.7600157260894775, + "learning_rate": 6.50459092777854e-06, + "loss": 0.1923, + "step": 26257 + }, + { + "epoch": 1.1561584373304394, + "grad_norm": 6.273998260498047, + "learning_rate": 6.504453885158285e-06, + "loss": 0.2931, + "step": 26258 + }, + { + "epoch": 1.1561720021703743, + "grad_norm": 6.224231719970703, + "learning_rate": 6.50431684253803e-06, + "loss": 0.3595, + "step": 26259 + }, + { + "epoch": 1.1561855670103092, + "grad_norm": 5.050215721130371, + "learning_rate": 6.504179799917775e-06, + "loss": 0.2031, + "step": 26260 + }, + { + "epoch": 1.1561991318502443, + "grad_norm": 3.506574869155884, + "learning_rate": 6.504042757297519e-06, + "loss": 0.1134, + "step": 26261 + }, + { + "epoch": 1.1562126966901791, + "grad_norm": 5.890164375305176, + "learning_rate": 6.503905714677265e-06, + "loss": 0.1712, + "step": 26262 + }, + { + "epoch": 1.156226261530114, + "grad_norm": 5.315790176391602, + "learning_rate": 6.5037686720570106e-06, + "loss": 0.2858, + "step": 26263 + }, + { + "epoch": 1.1562398263700489, + "grad_norm": 7.367700576782227, + "learning_rate": 6.503631629436755e-06, + "loss": 0.3719, + "step": 26264 + }, + { + "epoch": 1.1562533912099837, + "grad_norm": 6.600450038909912, + "learning_rate": 6.5034945868165e-06, + "loss": 0.3053, + "step": 26265 + }, + { + "epoch": 1.1562669560499186, + "grad_norm": 5.760342597961426, + "learning_rate": 6.503357544196246e-06, + "loss": 0.2929, + "step": 26266 + }, + { + "epoch": 1.1562805208898534, + "grad_norm": 5.167067050933838, + "learning_rate": 6.5032205015759904e-06, + "loss": 0.2658, + "step": 26267 + }, + { + "epoch": 1.1562940857297883, + "grad_norm": 7.69013786315918, + "learning_rate": 6.503083458955736e-06, + "loss": 0.3299, + "step": 26268 + }, + { + "epoch": 1.1563076505697234, + "grad_norm": 5.7415971755981445, + "learning_rate": 6.502946416335481e-06, + "loss": 0.1698, + "step": 26269 + }, + { + "epoch": 1.1563212154096583, + "grad_norm": 3.512295961380005, + "learning_rate": 6.502809373715227e-06, + "loss": 0.1122, + "step": 26270 + }, + { + "epoch": 1.1563347802495931, + "grad_norm": 5.911592960357666, + "learning_rate": 6.502672331094971e-06, + "loss": 0.2761, + "step": 26271 + }, + { + "epoch": 1.156348345089528, + "grad_norm": 5.306793212890625, + "learning_rate": 6.502535288474716e-06, + "loss": 0.1904, + "step": 26272 + }, + { + "epoch": 1.1563619099294629, + "grad_norm": 4.120062351226807, + "learning_rate": 6.502398245854461e-06, + "loss": 0.1494, + "step": 26273 + }, + { + "epoch": 1.1563754747693977, + "grad_norm": 5.710892677307129, + "learning_rate": 6.502261203234206e-06, + "loss": 0.2413, + "step": 26274 + }, + { + "epoch": 1.1563890396093326, + "grad_norm": 3.947559356689453, + "learning_rate": 6.502124160613952e-06, + "loss": 0.2117, + "step": 26275 + }, + { + "epoch": 1.1564026044492675, + "grad_norm": 7.390677452087402, + "learning_rate": 6.501987117993697e-06, + "loss": 0.357, + "step": 26276 + }, + { + "epoch": 1.1564161692892023, + "grad_norm": 5.401144504547119, + "learning_rate": 6.501850075373441e-06, + "loss": 0.2248, + "step": 26277 + }, + { + "epoch": 1.1564297341291372, + "grad_norm": 5.120995044708252, + "learning_rate": 6.5017130327531866e-06, + "loss": 0.1588, + "step": 26278 + }, + { + "epoch": 1.156443298969072, + "grad_norm": 4.435126781463623, + "learning_rate": 6.501575990132933e-06, + "loss": 0.4032, + "step": 26279 + }, + { + "epoch": 1.1564568638090071, + "grad_norm": 6.06650447845459, + "learning_rate": 6.501438947512677e-06, + "loss": 0.2785, + "step": 26280 + }, + { + "epoch": 1.156470428648942, + "grad_norm": 4.419947624206543, + "learning_rate": 6.501301904892422e-06, + "loss": 0.2396, + "step": 26281 + }, + { + "epoch": 1.1564839934888769, + "grad_norm": 5.383718967437744, + "learning_rate": 6.5011648622721664e-06, + "loss": 0.2081, + "step": 26282 + }, + { + "epoch": 1.1564975583288117, + "grad_norm": 5.581084728240967, + "learning_rate": 6.5010278196519125e-06, + "loss": 0.3783, + "step": 26283 + }, + { + "epoch": 1.1565111231687466, + "grad_norm": 4.515496730804443, + "learning_rate": 6.500890777031658e-06, + "loss": 0.2336, + "step": 26284 + }, + { + "epoch": 1.1565246880086815, + "grad_norm": 4.845922946929932, + "learning_rate": 6.500753734411403e-06, + "loss": 0.1766, + "step": 26285 + }, + { + "epoch": 1.1565382528486163, + "grad_norm": 6.16365385055542, + "learning_rate": 6.500616691791147e-06, + "loss": 0.2283, + "step": 26286 + }, + { + "epoch": 1.1565518176885512, + "grad_norm": 5.413238048553467, + "learning_rate": 6.500479649170892e-06, + "loss": 0.2274, + "step": 26287 + }, + { + "epoch": 1.1565653825284863, + "grad_norm": 5.046722888946533, + "learning_rate": 6.500342606550638e-06, + "loss": 0.2222, + "step": 26288 + }, + { + "epoch": 1.1565789473684212, + "grad_norm": 5.378512859344482, + "learning_rate": 6.500205563930383e-06, + "loss": 0.2193, + "step": 26289 + }, + { + "epoch": 1.156592512208356, + "grad_norm": 5.137657642364502, + "learning_rate": 6.500068521310128e-06, + "loss": 0.2424, + "step": 26290 + }, + { + "epoch": 1.1566060770482909, + "grad_norm": 5.440792560577393, + "learning_rate": 6.499931478689873e-06, + "loss": 0.3634, + "step": 26291 + }, + { + "epoch": 1.1566196418882257, + "grad_norm": 5.4548726081848145, + "learning_rate": 6.499794436069618e-06, + "loss": 0.3199, + "step": 26292 + }, + { + "epoch": 1.1566332067281606, + "grad_norm": 4.840194225311279, + "learning_rate": 6.499657393449363e-06, + "loss": 0.2473, + "step": 26293 + }, + { + "epoch": 1.1566467715680955, + "grad_norm": 5.951321125030518, + "learning_rate": 6.4995203508291086e-06, + "loss": 0.2631, + "step": 26294 + }, + { + "epoch": 1.1566603364080303, + "grad_norm": 4.97758674621582, + "learning_rate": 6.499383308208853e-06, + "loss": 0.2302, + "step": 26295 + }, + { + "epoch": 1.1566739012479652, + "grad_norm": 4.865346908569336, + "learning_rate": 6.499246265588599e-06, + "loss": 0.3189, + "step": 26296 + }, + { + "epoch": 1.1566874660879, + "grad_norm": 5.0760016441345215, + "learning_rate": 6.499109222968344e-06, + "loss": 0.2584, + "step": 26297 + }, + { + "epoch": 1.156701030927835, + "grad_norm": 4.273650169372559, + "learning_rate": 6.4989721803480884e-06, + "loss": 0.1853, + "step": 26298 + }, + { + "epoch": 1.15671459576777, + "grad_norm": 5.467661380767822, + "learning_rate": 6.498835137727834e-06, + "loss": 0.2245, + "step": 26299 + }, + { + "epoch": 1.156728160607705, + "grad_norm": 5.055566310882568, + "learning_rate": 6.49869809510758e-06, + "loss": 0.3023, + "step": 26300 + }, + { + "epoch": 1.1567417254476398, + "grad_norm": 4.8244242668151855, + "learning_rate": 6.498561052487324e-06, + "loss": 0.2342, + "step": 26301 + }, + { + "epoch": 1.1567552902875746, + "grad_norm": 6.876680850982666, + "learning_rate": 6.498424009867069e-06, + "loss": 0.2364, + "step": 26302 + }, + { + "epoch": 1.1567688551275095, + "grad_norm": 3.9269394874572754, + "learning_rate": 6.498286967246814e-06, + "loss": 0.1656, + "step": 26303 + }, + { + "epoch": 1.1567824199674444, + "grad_norm": 4.711862564086914, + "learning_rate": 6.498149924626559e-06, + "loss": 0.2116, + "step": 26304 + }, + { + "epoch": 1.1567959848073792, + "grad_norm": 4.634757995605469, + "learning_rate": 6.498012882006305e-06, + "loss": 0.1744, + "step": 26305 + }, + { + "epoch": 1.156809549647314, + "grad_norm": 4.707540035247803, + "learning_rate": 6.49787583938605e-06, + "loss": 0.2737, + "step": 26306 + }, + { + "epoch": 1.1568231144872492, + "grad_norm": 5.683055877685547, + "learning_rate": 6.497738796765794e-06, + "loss": 0.2534, + "step": 26307 + }, + { + "epoch": 1.156836679327184, + "grad_norm": 3.135772228240967, + "learning_rate": 6.497601754145539e-06, + "loss": 0.1387, + "step": 26308 + }, + { + "epoch": 1.156850244167119, + "grad_norm": 5.333637714385986, + "learning_rate": 6.497464711525285e-06, + "loss": 0.1811, + "step": 26309 + }, + { + "epoch": 1.1568638090070538, + "grad_norm": 6.64893102645874, + "learning_rate": 6.497327668905031e-06, + "loss": 0.2647, + "step": 26310 + }, + { + "epoch": 1.1568773738469886, + "grad_norm": 3.455739736557007, + "learning_rate": 6.497190626284775e-06, + "loss": 0.1375, + "step": 26311 + }, + { + "epoch": 1.1568909386869235, + "grad_norm": 6.344089508056641, + "learning_rate": 6.49705358366452e-06, + "loss": 0.269, + "step": 26312 + }, + { + "epoch": 1.1569045035268584, + "grad_norm": 4.209100723266602, + "learning_rate": 6.496916541044266e-06, + "loss": 0.2532, + "step": 26313 + }, + { + "epoch": 1.1569180683667932, + "grad_norm": 4.371032238006592, + "learning_rate": 6.4967794984240105e-06, + "loss": 0.2207, + "step": 26314 + }, + { + "epoch": 1.156931633206728, + "grad_norm": 5.33875846862793, + "learning_rate": 6.496642455803756e-06, + "loss": 0.2714, + "step": 26315 + }, + { + "epoch": 1.156945198046663, + "grad_norm": 4.303895473480225, + "learning_rate": 6.4965054131835e-06, + "loss": 0.1514, + "step": 26316 + }, + { + "epoch": 1.1569587628865978, + "grad_norm": 4.174257755279541, + "learning_rate": 6.496368370563245e-06, + "loss": 0.1309, + "step": 26317 + }, + { + "epoch": 1.156972327726533, + "grad_norm": 4.683738708496094, + "learning_rate": 6.496231327942991e-06, + "loss": 0.1907, + "step": 26318 + }, + { + "epoch": 1.1569858925664678, + "grad_norm": 4.498854637145996, + "learning_rate": 6.496094285322736e-06, + "loss": 0.1867, + "step": 26319 + }, + { + "epoch": 1.1569994574064026, + "grad_norm": 5.386087894439697, + "learning_rate": 6.495957242702481e-06, + "loss": 0.267, + "step": 26320 + }, + { + "epoch": 1.1570130222463375, + "grad_norm": 3.8819308280944824, + "learning_rate": 6.495820200082226e-06, + "loss": 0.2052, + "step": 26321 + }, + { + "epoch": 1.1570265870862724, + "grad_norm": 5.1913371086120605, + "learning_rate": 6.495683157461972e-06, + "loss": 0.2561, + "step": 26322 + }, + { + "epoch": 1.1570401519262072, + "grad_norm": 3.39107084274292, + "learning_rate": 6.495546114841716e-06, + "loss": 0.1463, + "step": 26323 + }, + { + "epoch": 1.157053716766142, + "grad_norm": 4.038583755493164, + "learning_rate": 6.495409072221461e-06, + "loss": 0.1609, + "step": 26324 + }, + { + "epoch": 1.157067281606077, + "grad_norm": 4.818951606750488, + "learning_rate": 6.495272029601207e-06, + "loss": 0.2064, + "step": 26325 + }, + { + "epoch": 1.157080846446012, + "grad_norm": 3.248044967651367, + "learning_rate": 6.495134986980952e-06, + "loss": 0.1871, + "step": 26326 + }, + { + "epoch": 1.157094411285947, + "grad_norm": 6.3084282875061035, + "learning_rate": 6.494997944360697e-06, + "loss": 0.3296, + "step": 26327 + }, + { + "epoch": 1.1571079761258818, + "grad_norm": 3.963047504425049, + "learning_rate": 6.494860901740442e-06, + "loss": 0.1789, + "step": 26328 + }, + { + "epoch": 1.1571215409658167, + "grad_norm": 4.4773173332214355, + "learning_rate": 6.4947238591201864e-06, + "loss": 0.1891, + "step": 26329 + }, + { + "epoch": 1.1571351058057515, + "grad_norm": 4.729719161987305, + "learning_rate": 6.494586816499932e-06, + "loss": 0.2213, + "step": 26330 + }, + { + "epoch": 1.1571486706456864, + "grad_norm": 3.4262263774871826, + "learning_rate": 6.494449773879678e-06, + "loss": 0.1287, + "step": 26331 + }, + { + "epoch": 1.1571622354856212, + "grad_norm": 3.628326892852783, + "learning_rate": 6.494312731259422e-06, + "loss": 0.1763, + "step": 26332 + }, + { + "epoch": 1.1571758003255561, + "grad_norm": 3.7880616188049316, + "learning_rate": 6.494175688639167e-06, + "loss": 0.1272, + "step": 26333 + }, + { + "epoch": 1.157189365165491, + "grad_norm": 4.160061836242676, + "learning_rate": 6.494038646018912e-06, + "loss": 0.2542, + "step": 26334 + }, + { + "epoch": 1.1572029300054258, + "grad_norm": 3.7959208488464355, + "learning_rate": 6.4939016033986575e-06, + "loss": 0.1472, + "step": 26335 + }, + { + "epoch": 1.1572164948453607, + "grad_norm": 3.8813743591308594, + "learning_rate": 6.493764560778403e-06, + "loss": 0.2029, + "step": 26336 + }, + { + "epoch": 1.1572300596852958, + "grad_norm": 5.974152565002441, + "learning_rate": 6.493627518158148e-06, + "loss": 0.2289, + "step": 26337 + }, + { + "epoch": 1.1572436245252307, + "grad_norm": 2.4378912448883057, + "learning_rate": 6.493490475537892e-06, + "loss": 0.1262, + "step": 26338 + }, + { + "epoch": 1.1572571893651655, + "grad_norm": 5.339592456817627, + "learning_rate": 6.493353432917638e-06, + "loss": 0.1593, + "step": 26339 + }, + { + "epoch": 1.1572707542051004, + "grad_norm": 3.784358501434326, + "learning_rate": 6.493216390297383e-06, + "loss": 0.1791, + "step": 26340 + }, + { + "epoch": 1.1572843190450353, + "grad_norm": 3.508751630783081, + "learning_rate": 6.493079347677128e-06, + "loss": 0.1695, + "step": 26341 + }, + { + "epoch": 1.1572978838849701, + "grad_norm": 6.0669169425964355, + "learning_rate": 6.492942305056873e-06, + "loss": 0.283, + "step": 26342 + }, + { + "epoch": 1.157311448724905, + "grad_norm": 5.182064056396484, + "learning_rate": 6.492805262436618e-06, + "loss": 0.2015, + "step": 26343 + }, + { + "epoch": 1.1573250135648399, + "grad_norm": 5.371998310089111, + "learning_rate": 6.492668219816364e-06, + "loss": 0.2984, + "step": 26344 + }, + { + "epoch": 1.157338578404775, + "grad_norm": 7.6309614181518555, + "learning_rate": 6.4925311771961085e-06, + "loss": 0.2495, + "step": 26345 + }, + { + "epoch": 1.1573521432447098, + "grad_norm": 3.5726797580718994, + "learning_rate": 6.492394134575854e-06, + "loss": 0.1526, + "step": 26346 + }, + { + "epoch": 1.1573657080846447, + "grad_norm": 5.24809455871582, + "learning_rate": 6.492257091955598e-06, + "loss": 0.2554, + "step": 26347 + }, + { + "epoch": 1.1573792729245795, + "grad_norm": 3.513633966445923, + "learning_rate": 6.492120049335344e-06, + "loss": 0.1502, + "step": 26348 + }, + { + "epoch": 1.1573928377645144, + "grad_norm": 3.3135533332824707, + "learning_rate": 6.491983006715089e-06, + "loss": 0.1157, + "step": 26349 + }, + { + "epoch": 1.1574064026044493, + "grad_norm": 3.3934144973754883, + "learning_rate": 6.4918459640948335e-06, + "loss": 0.1135, + "step": 26350 + }, + { + "epoch": 1.1574199674443841, + "grad_norm": 6.57846212387085, + "learning_rate": 6.491708921474579e-06, + "loss": 0.2606, + "step": 26351 + }, + { + "epoch": 1.157433532284319, + "grad_norm": 3.0099518299102783, + "learning_rate": 6.491571878854325e-06, + "loss": 0.1427, + "step": 26352 + }, + { + "epoch": 1.1574470971242539, + "grad_norm": 5.214653015136719, + "learning_rate": 6.49143483623407e-06, + "loss": 0.2274, + "step": 26353 + }, + { + "epoch": 1.1574606619641887, + "grad_norm": 5.080480575561523, + "learning_rate": 6.491297793613814e-06, + "loss": 0.3026, + "step": 26354 + }, + { + "epoch": 1.1574742268041236, + "grad_norm": 4.172055244445801, + "learning_rate": 6.491160750993559e-06, + "loss": 0.208, + "step": 26355 + }, + { + "epoch": 1.1574877916440587, + "grad_norm": 5.881612300872803, + "learning_rate": 6.491023708373304e-06, + "loss": 0.2316, + "step": 26356 + }, + { + "epoch": 1.1575013564839935, + "grad_norm": 4.196534156799316, + "learning_rate": 6.49088666575305e-06, + "loss": 0.2147, + "step": 26357 + }, + { + "epoch": 1.1575149213239284, + "grad_norm": 5.065235137939453, + "learning_rate": 6.490749623132795e-06, + "loss": 0.1929, + "step": 26358 + }, + { + "epoch": 1.1575284861638633, + "grad_norm": 4.802478313446045, + "learning_rate": 6.49061258051254e-06, + "loss": 0.2043, + "step": 26359 + }, + { + "epoch": 1.1575420510037981, + "grad_norm": 6.003017425537109, + "learning_rate": 6.4904755378922845e-06, + "loss": 0.2357, + "step": 26360 + }, + { + "epoch": 1.157555615843733, + "grad_norm": 4.693924903869629, + "learning_rate": 6.4903384952720305e-06, + "loss": 0.2275, + "step": 26361 + }, + { + "epoch": 1.1575691806836679, + "grad_norm": 5.766275882720947, + "learning_rate": 6.490201452651776e-06, + "loss": 0.2741, + "step": 26362 + }, + { + "epoch": 1.1575827455236027, + "grad_norm": 5.235889911651611, + "learning_rate": 6.49006441003152e-06, + "loss": 0.2769, + "step": 26363 + }, + { + "epoch": 1.1575963103635378, + "grad_norm": 6.295970916748047, + "learning_rate": 6.489927367411265e-06, + "loss": 0.3646, + "step": 26364 + }, + { + "epoch": 1.1576098752034727, + "grad_norm": 5.369658470153809, + "learning_rate": 6.489790324791011e-06, + "loss": 0.3841, + "step": 26365 + }, + { + "epoch": 1.1576234400434076, + "grad_norm": 5.32515287399292, + "learning_rate": 6.4896532821707555e-06, + "loss": 0.2819, + "step": 26366 + }, + { + "epoch": 1.1576370048833424, + "grad_norm": 4.326720714569092, + "learning_rate": 6.489516239550501e-06, + "loss": 0.2403, + "step": 26367 + }, + { + "epoch": 1.1576505697232773, + "grad_norm": 4.467947483062744, + "learning_rate": 6.489379196930246e-06, + "loss": 0.1503, + "step": 26368 + }, + { + "epoch": 1.1576641345632122, + "grad_norm": 5.601129055023193, + "learning_rate": 6.489242154309992e-06, + "loss": 0.2919, + "step": 26369 + }, + { + "epoch": 1.157677699403147, + "grad_norm": 5.763806343078613, + "learning_rate": 6.489105111689736e-06, + "loss": 0.2421, + "step": 26370 + }, + { + "epoch": 1.1576912642430819, + "grad_norm": 5.615963935852051, + "learning_rate": 6.488968069069481e-06, + "loss": 0.2948, + "step": 26371 + }, + { + "epoch": 1.1577048290830168, + "grad_norm": 6.721786022186279, + "learning_rate": 6.488831026449226e-06, + "loss": 0.3056, + "step": 26372 + }, + { + "epoch": 1.1577183939229516, + "grad_norm": 5.476498603820801, + "learning_rate": 6.488693983828971e-06, + "loss": 0.2815, + "step": 26373 + }, + { + "epoch": 1.1577319587628865, + "grad_norm": 4.114063262939453, + "learning_rate": 6.488556941208717e-06, + "loss": 0.163, + "step": 26374 + }, + { + "epoch": 1.1577455236028216, + "grad_norm": 4.841851234436035, + "learning_rate": 6.488419898588461e-06, + "loss": 0.3624, + "step": 26375 + }, + { + "epoch": 1.1577590884427564, + "grad_norm": 4.604024887084961, + "learning_rate": 6.4882828559682065e-06, + "loss": 0.1462, + "step": 26376 + }, + { + "epoch": 1.1577726532826913, + "grad_norm": 4.841036796569824, + "learning_rate": 6.488145813347952e-06, + "loss": 0.2028, + "step": 26377 + }, + { + "epoch": 1.1577862181226262, + "grad_norm": 5.374486446380615, + "learning_rate": 6.488008770727698e-06, + "loss": 0.2738, + "step": 26378 + }, + { + "epoch": 1.157799782962561, + "grad_norm": 4.840789794921875, + "learning_rate": 6.487871728107442e-06, + "loss": 0.2464, + "step": 26379 + }, + { + "epoch": 1.157813347802496, + "grad_norm": 7.069685459136963, + "learning_rate": 6.487734685487187e-06, + "loss": 0.3858, + "step": 26380 + }, + { + "epoch": 1.1578269126424308, + "grad_norm": 4.607187747955322, + "learning_rate": 6.4875976428669315e-06, + "loss": 0.221, + "step": 26381 + }, + { + "epoch": 1.1578404774823656, + "grad_norm": 3.8754777908325195, + "learning_rate": 6.4874606002466775e-06, + "loss": 0.1109, + "step": 26382 + }, + { + "epoch": 1.1578540423223007, + "grad_norm": 4.699299335479736, + "learning_rate": 6.487323557626423e-06, + "loss": 0.2012, + "step": 26383 + }, + { + "epoch": 1.1578676071622356, + "grad_norm": 5.712747573852539, + "learning_rate": 6.487186515006168e-06, + "loss": 0.3658, + "step": 26384 + }, + { + "epoch": 1.1578811720021704, + "grad_norm": 5.139537811279297, + "learning_rate": 6.487049472385912e-06, + "loss": 0.2617, + "step": 26385 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 5.481921672821045, + "learning_rate": 6.486912429765657e-06, + "loss": 0.2474, + "step": 26386 + }, + { + "epoch": 1.1579083016820402, + "grad_norm": 5.671542644500732, + "learning_rate": 6.4867753871454034e-06, + "loss": 0.3022, + "step": 26387 + }, + { + "epoch": 1.157921866521975, + "grad_norm": 5.8210768699646, + "learning_rate": 6.486638344525148e-06, + "loss": 0.3472, + "step": 26388 + }, + { + "epoch": 1.15793543136191, + "grad_norm": 4.670846939086914, + "learning_rate": 6.486501301904893e-06, + "loss": 0.2535, + "step": 26389 + }, + { + "epoch": 1.1579489962018448, + "grad_norm": 5.266585350036621, + "learning_rate": 6.486364259284637e-06, + "loss": 0.3336, + "step": 26390 + }, + { + "epoch": 1.1579625610417796, + "grad_norm": 3.6665592193603516, + "learning_rate": 6.486227216664383e-06, + "loss": 0.1881, + "step": 26391 + }, + { + "epoch": 1.1579761258817145, + "grad_norm": 4.857743740081787, + "learning_rate": 6.4860901740441285e-06, + "loss": 0.2096, + "step": 26392 + }, + { + "epoch": 1.1579896907216494, + "grad_norm": 5.704068183898926, + "learning_rate": 6.485953131423874e-06, + "loss": 0.2328, + "step": 26393 + }, + { + "epoch": 1.1580032555615845, + "grad_norm": 5.196481227874756, + "learning_rate": 6.485816088803618e-06, + "loss": 0.2852, + "step": 26394 + }, + { + "epoch": 1.1580168204015193, + "grad_norm": 3.666574001312256, + "learning_rate": 6.485679046183364e-06, + "loss": 0.1787, + "step": 26395 + }, + { + "epoch": 1.1580303852414542, + "grad_norm": 5.235532760620117, + "learning_rate": 6.485542003563109e-06, + "loss": 0.2146, + "step": 26396 + }, + { + "epoch": 1.158043950081389, + "grad_norm": 5.454899787902832, + "learning_rate": 6.4854049609428535e-06, + "loss": 0.2612, + "step": 26397 + }, + { + "epoch": 1.158057514921324, + "grad_norm": 4.864773750305176, + "learning_rate": 6.485267918322599e-06, + "loss": 0.1748, + "step": 26398 + }, + { + "epoch": 1.1580710797612588, + "grad_norm": 4.681156158447266, + "learning_rate": 6.485130875702343e-06, + "loss": 0.2507, + "step": 26399 + }, + { + "epoch": 1.1580846446011936, + "grad_norm": 5.155354022979736, + "learning_rate": 6.484993833082089e-06, + "loss": 0.3469, + "step": 26400 + }, + { + "epoch": 1.1580982094411285, + "grad_norm": 6.907197952270508, + "learning_rate": 6.484856790461834e-06, + "loss": 0.3505, + "step": 26401 + }, + { + "epoch": 1.1581117742810636, + "grad_norm": 4.20051908493042, + "learning_rate": 6.484719747841579e-06, + "loss": 0.2111, + "step": 26402 + }, + { + "epoch": 1.1581253391209985, + "grad_norm": 4.425089359283447, + "learning_rate": 6.484582705221324e-06, + "loss": 0.1247, + "step": 26403 + }, + { + "epoch": 1.1581389039609333, + "grad_norm": 4.691534519195557, + "learning_rate": 6.48444566260107e-06, + "loss": 0.2626, + "step": 26404 + }, + { + "epoch": 1.1581524688008682, + "grad_norm": 6.572827339172363, + "learning_rate": 6.484308619980815e-06, + "loss": 0.2668, + "step": 26405 + }, + { + "epoch": 1.158166033640803, + "grad_norm": 4.560032367706299, + "learning_rate": 6.484171577360559e-06, + "loss": 0.3624, + "step": 26406 + }, + { + "epoch": 1.158179598480738, + "grad_norm": 5.3737592697143555, + "learning_rate": 6.4840345347403045e-06, + "loss": 0.2961, + "step": 26407 + }, + { + "epoch": 1.1581931633206728, + "grad_norm": 3.4198102951049805, + "learning_rate": 6.4838974921200505e-06, + "loss": 0.1561, + "step": 26408 + }, + { + "epoch": 1.1582067281606077, + "grad_norm": 5.158440589904785, + "learning_rate": 6.483760449499795e-06, + "loss": 0.2017, + "step": 26409 + }, + { + "epoch": 1.1582202930005425, + "grad_norm": 5.314281940460205, + "learning_rate": 6.48362340687954e-06, + "loss": 0.2792, + "step": 26410 + }, + { + "epoch": 1.1582338578404774, + "grad_norm": 5.308170795440674, + "learning_rate": 6.483486364259285e-06, + "loss": 0.2259, + "step": 26411 + }, + { + "epoch": 1.1582474226804123, + "grad_norm": 5.348576068878174, + "learning_rate": 6.4833493216390295e-06, + "loss": 0.251, + "step": 26412 + }, + { + "epoch": 1.1582609875203473, + "grad_norm": 4.434290409088135, + "learning_rate": 6.4832122790187755e-06, + "loss": 0.1898, + "step": 26413 + }, + { + "epoch": 1.1582745523602822, + "grad_norm": 4.160836696624756, + "learning_rate": 6.483075236398521e-06, + "loss": 0.2566, + "step": 26414 + }, + { + "epoch": 1.158288117200217, + "grad_norm": 5.449331760406494, + "learning_rate": 6.482938193778265e-06, + "loss": 0.2706, + "step": 26415 + }, + { + "epoch": 1.158301682040152, + "grad_norm": 5.333193778991699, + "learning_rate": 6.48280115115801e-06, + "loss": 0.2817, + "step": 26416 + }, + { + "epoch": 1.1583152468800868, + "grad_norm": 4.0817694664001465, + "learning_rate": 6.482664108537756e-06, + "loss": 0.1883, + "step": 26417 + }, + { + "epoch": 1.1583288117200217, + "grad_norm": 5.3784661293029785, + "learning_rate": 6.4825270659175014e-06, + "loss": 0.2025, + "step": 26418 + }, + { + "epoch": 1.1583423765599565, + "grad_norm": 6.363065242767334, + "learning_rate": 6.482390023297246e-06, + "loss": 0.312, + "step": 26419 + }, + { + "epoch": 1.1583559413998914, + "grad_norm": 5.968156337738037, + "learning_rate": 6.482252980676991e-06, + "loss": 0.2599, + "step": 26420 + }, + { + "epoch": 1.1583695062398265, + "grad_norm": 6.232687950134277, + "learning_rate": 6.482115938056737e-06, + "loss": 0.2549, + "step": 26421 + }, + { + "epoch": 1.1583830710797614, + "grad_norm": 6.381862640380859, + "learning_rate": 6.481978895436481e-06, + "loss": 0.3517, + "step": 26422 + }, + { + "epoch": 1.1583966359196962, + "grad_norm": 5.088897228240967, + "learning_rate": 6.4818418528162265e-06, + "loss": 0.2585, + "step": 26423 + }, + { + "epoch": 1.158410200759631, + "grad_norm": 4.017049789428711, + "learning_rate": 6.481704810195971e-06, + "loss": 0.2828, + "step": 26424 + }, + { + "epoch": 1.158423765599566, + "grad_norm": 5.20938777923584, + "learning_rate": 6.481567767575716e-06, + "loss": 0.2312, + "step": 26425 + }, + { + "epoch": 1.1584373304395008, + "grad_norm": 4.896275043487549, + "learning_rate": 6.481430724955462e-06, + "loss": 0.2748, + "step": 26426 + }, + { + "epoch": 1.1584508952794357, + "grad_norm": 5.256300449371338, + "learning_rate": 6.481293682335207e-06, + "loss": 0.233, + "step": 26427 + }, + { + "epoch": 1.1584644601193705, + "grad_norm": 5.4978814125061035, + "learning_rate": 6.4811566397149515e-06, + "loss": 0.4085, + "step": 26428 + }, + { + "epoch": 1.1584780249593054, + "grad_norm": 4.8792009353637695, + "learning_rate": 6.481019597094697e-06, + "loss": 0.2418, + "step": 26429 + }, + { + "epoch": 1.1584915897992403, + "grad_norm": 3.81121826171875, + "learning_rate": 6.480882554474443e-06, + "loss": 0.1639, + "step": 26430 + }, + { + "epoch": 1.1585051546391751, + "grad_norm": 6.448866844177246, + "learning_rate": 6.480745511854187e-06, + "loss": 0.2751, + "step": 26431 + }, + { + "epoch": 1.1585187194791102, + "grad_norm": 6.322570323944092, + "learning_rate": 6.480608469233932e-06, + "loss": 0.3664, + "step": 26432 + }, + { + "epoch": 1.158532284319045, + "grad_norm": 7.3791399002075195, + "learning_rate": 6.4804714266136774e-06, + "loss": 0.4717, + "step": 26433 + }, + { + "epoch": 1.15854584915898, + "grad_norm": 6.347257614135742, + "learning_rate": 6.480334383993423e-06, + "loss": 0.3813, + "step": 26434 + }, + { + "epoch": 1.1585594139989148, + "grad_norm": 5.97506046295166, + "learning_rate": 6.480197341373168e-06, + "loss": 0.3633, + "step": 26435 + }, + { + "epoch": 1.1585729788388497, + "grad_norm": 6.102876663208008, + "learning_rate": 6.480060298752913e-06, + "loss": 0.3257, + "step": 26436 + }, + { + "epoch": 1.1585865436787846, + "grad_norm": 4.35948371887207, + "learning_rate": 6.479923256132657e-06, + "loss": 0.2646, + "step": 26437 + }, + { + "epoch": 1.1586001085187194, + "grad_norm": 4.540266513824463, + "learning_rate": 6.4797862135124025e-06, + "loss": 0.231, + "step": 26438 + }, + { + "epoch": 1.1586136733586543, + "grad_norm": 4.538458824157715, + "learning_rate": 6.4796491708921485e-06, + "loss": 0.253, + "step": 26439 + }, + { + "epoch": 1.1586272381985894, + "grad_norm": 5.236278533935547, + "learning_rate": 6.479512128271893e-06, + "loss": 0.2438, + "step": 26440 + }, + { + "epoch": 1.1586408030385242, + "grad_norm": 4.846411228179932, + "learning_rate": 6.479375085651638e-06, + "loss": 0.2301, + "step": 26441 + }, + { + "epoch": 1.158654367878459, + "grad_norm": 4.8648271560668945, + "learning_rate": 6.479238043031383e-06, + "loss": 0.2891, + "step": 26442 + }, + { + "epoch": 1.158667932718394, + "grad_norm": 4.253600120544434, + "learning_rate": 6.479101000411128e-06, + "loss": 0.2, + "step": 26443 + }, + { + "epoch": 1.1586814975583288, + "grad_norm": 5.341228008270264, + "learning_rate": 6.4789639577908735e-06, + "loss": 0.3002, + "step": 26444 + }, + { + "epoch": 1.1586950623982637, + "grad_norm": 5.589136600494385, + "learning_rate": 6.478826915170619e-06, + "loss": 0.3174, + "step": 26445 + }, + { + "epoch": 1.1587086272381986, + "grad_norm": 5.77871036529541, + "learning_rate": 6.478689872550363e-06, + "loss": 0.2881, + "step": 26446 + }, + { + "epoch": 1.1587221920781334, + "grad_norm": 5.892817497253418, + "learning_rate": 6.478552829930109e-06, + "loss": 0.2679, + "step": 26447 + }, + { + "epoch": 1.1587357569180683, + "grad_norm": 4.627395153045654, + "learning_rate": 6.478415787309854e-06, + "loss": 0.2189, + "step": 26448 + }, + { + "epoch": 1.1587493217580032, + "grad_norm": 5.131798267364502, + "learning_rate": 6.478278744689599e-06, + "loss": 0.3037, + "step": 26449 + }, + { + "epoch": 1.158762886597938, + "grad_norm": 5.160421848297119, + "learning_rate": 6.478141702069344e-06, + "loss": 0.2324, + "step": 26450 + }, + { + "epoch": 1.1587764514378731, + "grad_norm": 6.8006062507629395, + "learning_rate": 6.47800465944909e-06, + "loss": 0.3087, + "step": 26451 + }, + { + "epoch": 1.158790016277808, + "grad_norm": 7.167434215545654, + "learning_rate": 6.477867616828835e-06, + "loss": 0.3897, + "step": 26452 + }, + { + "epoch": 1.1588035811177428, + "grad_norm": 6.529551029205322, + "learning_rate": 6.477730574208579e-06, + "loss": 0.364, + "step": 26453 + }, + { + "epoch": 1.1588171459576777, + "grad_norm": 6.640013694763184, + "learning_rate": 6.4775935315883245e-06, + "loss": 0.3247, + "step": 26454 + }, + { + "epoch": 1.1588307107976126, + "grad_norm": 4.0994720458984375, + "learning_rate": 6.477456488968069e-06, + "loss": 0.1561, + "step": 26455 + }, + { + "epoch": 1.1588442756375474, + "grad_norm": 5.217921257019043, + "learning_rate": 6.477319446347815e-06, + "loss": 0.2906, + "step": 26456 + }, + { + "epoch": 1.1588578404774823, + "grad_norm": 4.483427047729492, + "learning_rate": 6.47718240372756e-06, + "loss": 0.2738, + "step": 26457 + }, + { + "epoch": 1.1588714053174172, + "grad_norm": 5.211485862731934, + "learning_rate": 6.477045361107304e-06, + "loss": 0.2273, + "step": 26458 + }, + { + "epoch": 1.1588849701573523, + "grad_norm": 6.089508056640625, + "learning_rate": 6.4769083184870495e-06, + "loss": 0.3516, + "step": 26459 + }, + { + "epoch": 1.1588985349972871, + "grad_norm": 6.873412132263184, + "learning_rate": 6.4767712758667956e-06, + "loss": 0.4002, + "step": 26460 + }, + { + "epoch": 1.158912099837222, + "grad_norm": 8.563268661499023, + "learning_rate": 6.476634233246541e-06, + "loss": 0.445, + "step": 26461 + }, + { + "epoch": 1.1589256646771569, + "grad_norm": 5.396275520324707, + "learning_rate": 6.476497190626285e-06, + "loss": 0.3121, + "step": 26462 + }, + { + "epoch": 1.1589392295170917, + "grad_norm": 6.278045177459717, + "learning_rate": 6.47636014800603e-06, + "loss": 0.2777, + "step": 26463 + }, + { + "epoch": 1.1589527943570266, + "grad_norm": 5.761961936950684, + "learning_rate": 6.476223105385776e-06, + "loss": 0.2887, + "step": 26464 + }, + { + "epoch": 1.1589663591969614, + "grad_norm": 4.608126163482666, + "learning_rate": 6.476086062765521e-06, + "loss": 0.2723, + "step": 26465 + }, + { + "epoch": 1.1589799240368963, + "grad_norm": 4.1601104736328125, + "learning_rate": 6.475949020145266e-06, + "loss": 0.2315, + "step": 26466 + }, + { + "epoch": 1.1589934888768312, + "grad_norm": 4.9602861404418945, + "learning_rate": 6.475811977525011e-06, + "loss": 0.2923, + "step": 26467 + }, + { + "epoch": 1.159007053716766, + "grad_norm": 4.569617748260498, + "learning_rate": 6.475674934904755e-06, + "loss": 0.2792, + "step": 26468 + }, + { + "epoch": 1.159020618556701, + "grad_norm": 8.03428840637207, + "learning_rate": 6.475537892284501e-06, + "loss": 0.3851, + "step": 26469 + }, + { + "epoch": 1.159034183396636, + "grad_norm": 4.827330589294434, + "learning_rate": 6.4754008496642465e-06, + "loss": 0.2473, + "step": 26470 + }, + { + "epoch": 1.1590477482365709, + "grad_norm": 5.277720928192139, + "learning_rate": 6.475263807043991e-06, + "loss": 0.3635, + "step": 26471 + }, + { + "epoch": 1.1590613130765057, + "grad_norm": 5.517028331756592, + "learning_rate": 6.475126764423736e-06, + "loss": 0.299, + "step": 26472 + }, + { + "epoch": 1.1590748779164406, + "grad_norm": 4.213304042816162, + "learning_rate": 6.474989721803482e-06, + "loss": 0.2126, + "step": 26473 + }, + { + "epoch": 1.1590884427563755, + "grad_norm": 4.7889862060546875, + "learning_rate": 6.474852679183226e-06, + "loss": 0.1849, + "step": 26474 + }, + { + "epoch": 1.1591020075963103, + "grad_norm": 6.164093971252441, + "learning_rate": 6.4747156365629716e-06, + "loss": 0.409, + "step": 26475 + }, + { + "epoch": 1.1591155724362452, + "grad_norm": 7.948323726654053, + "learning_rate": 6.474578593942717e-06, + "loss": 0.3462, + "step": 26476 + }, + { + "epoch": 1.15912913727618, + "grad_norm": 4.73354959487915, + "learning_rate": 6.474441551322462e-06, + "loss": 0.2757, + "step": 26477 + }, + { + "epoch": 1.1591427021161151, + "grad_norm": 4.837556838989258, + "learning_rate": 6.474304508702207e-06, + "loss": 0.2806, + "step": 26478 + }, + { + "epoch": 1.15915626695605, + "grad_norm": 4.0563178062438965, + "learning_rate": 6.474167466081952e-06, + "loss": 0.2304, + "step": 26479 + }, + { + "epoch": 1.1591698317959849, + "grad_norm": 6.7306809425354, + "learning_rate": 6.474030423461697e-06, + "loss": 0.2701, + "step": 26480 + }, + { + "epoch": 1.1591833966359197, + "grad_norm": 4.919960021972656, + "learning_rate": 6.473893380841442e-06, + "loss": 0.2104, + "step": 26481 + }, + { + "epoch": 1.1591969614758546, + "grad_norm": 5.2033467292785645, + "learning_rate": 6.473756338221188e-06, + "loss": 0.3858, + "step": 26482 + }, + { + "epoch": 1.1592105263157895, + "grad_norm": 6.574219703674316, + "learning_rate": 6.473619295600932e-06, + "loss": 0.3262, + "step": 26483 + }, + { + "epoch": 1.1592240911557243, + "grad_norm": 7.618422031402588, + "learning_rate": 6.473482252980677e-06, + "loss": 0.564, + "step": 26484 + }, + { + "epoch": 1.1592376559956592, + "grad_norm": 5.325084209442139, + "learning_rate": 6.4733452103604225e-06, + "loss": 0.4186, + "step": 26485 + }, + { + "epoch": 1.159251220835594, + "grad_norm": 4.087434768676758, + "learning_rate": 6.4732081677401685e-06, + "loss": 0.2378, + "step": 26486 + }, + { + "epoch": 1.159264785675529, + "grad_norm": 3.924865484237671, + "learning_rate": 6.473071125119913e-06, + "loss": 0.2066, + "step": 26487 + }, + { + "epoch": 1.1592783505154638, + "grad_norm": 5.967556476593018, + "learning_rate": 6.472934082499658e-06, + "loss": 0.304, + "step": 26488 + }, + { + "epoch": 1.1592919153553989, + "grad_norm": 5.226205348968506, + "learning_rate": 6.472797039879402e-06, + "loss": 0.2801, + "step": 26489 + }, + { + "epoch": 1.1593054801953337, + "grad_norm": 3.873523235321045, + "learning_rate": 6.472659997259148e-06, + "loss": 0.1636, + "step": 26490 + }, + { + "epoch": 1.1593190450352686, + "grad_norm": 4.39631986618042, + "learning_rate": 6.4725229546388936e-06, + "loss": 0.1959, + "step": 26491 + }, + { + "epoch": 1.1593326098752035, + "grad_norm": 6.855146884918213, + "learning_rate": 6.472385912018638e-06, + "loss": 0.2242, + "step": 26492 + }, + { + "epoch": 1.1593461747151383, + "grad_norm": 4.926962375640869, + "learning_rate": 6.472248869398383e-06, + "loss": 0.2402, + "step": 26493 + }, + { + "epoch": 1.1593597395550732, + "grad_norm": 4.730936050415039, + "learning_rate": 6.472111826778128e-06, + "loss": 0.2309, + "step": 26494 + }, + { + "epoch": 1.159373304395008, + "grad_norm": 4.961919784545898, + "learning_rate": 6.471974784157874e-06, + "loss": 0.2739, + "step": 26495 + }, + { + "epoch": 1.159386869234943, + "grad_norm": 4.5780744552612305, + "learning_rate": 6.471837741537619e-06, + "loss": 0.1996, + "step": 26496 + }, + { + "epoch": 1.159400434074878, + "grad_norm": 3.969006299972534, + "learning_rate": 6.471700698917364e-06, + "loss": 0.2529, + "step": 26497 + }, + { + "epoch": 1.159413998914813, + "grad_norm": 4.025909423828125, + "learning_rate": 6.471563656297108e-06, + "loss": 0.2305, + "step": 26498 + }, + { + "epoch": 1.1594275637547478, + "grad_norm": 4.059040069580078, + "learning_rate": 6.471426613676854e-06, + "loss": 0.137, + "step": 26499 + }, + { + "epoch": 1.1594411285946826, + "grad_norm": 4.04511022567749, + "learning_rate": 6.471289571056599e-06, + "loss": 0.3318, + "step": 26500 + }, + { + "epoch": 1.1594546934346175, + "grad_norm": 5.806431770324707, + "learning_rate": 6.4711525284363445e-06, + "loss": 0.2396, + "step": 26501 + }, + { + "epoch": 1.1594682582745524, + "grad_norm": 6.596604824066162, + "learning_rate": 6.471015485816089e-06, + "loss": 0.4995, + "step": 26502 + }, + { + "epoch": 1.1594818231144872, + "grad_norm": 4.349501609802246, + "learning_rate": 6.470878443195835e-06, + "loss": 0.1745, + "step": 26503 + }, + { + "epoch": 1.159495387954422, + "grad_norm": 4.054901599884033, + "learning_rate": 6.47074140057558e-06, + "loss": 0.3024, + "step": 26504 + }, + { + "epoch": 1.159508952794357, + "grad_norm": 4.591311454772949, + "learning_rate": 6.470604357955324e-06, + "loss": 0.1754, + "step": 26505 + }, + { + "epoch": 1.1595225176342918, + "grad_norm": 2.9199655055999756, + "learning_rate": 6.4704673153350696e-06, + "loss": 0.0988, + "step": 26506 + }, + { + "epoch": 1.1595360824742267, + "grad_norm": 5.967354774475098, + "learning_rate": 6.470330272714814e-06, + "loss": 0.3642, + "step": 26507 + }, + { + "epoch": 1.1595496473141618, + "grad_norm": 4.847160816192627, + "learning_rate": 6.47019323009456e-06, + "loss": 0.2052, + "step": 26508 + }, + { + "epoch": 1.1595632121540966, + "grad_norm": 4.366321563720703, + "learning_rate": 6.470056187474305e-06, + "loss": 0.2368, + "step": 26509 + }, + { + "epoch": 1.1595767769940315, + "grad_norm": 5.387101173400879, + "learning_rate": 6.46991914485405e-06, + "loss": 0.3671, + "step": 26510 + }, + { + "epoch": 1.1595903418339664, + "grad_norm": 6.196613311767578, + "learning_rate": 6.469782102233795e-06, + "loss": 0.242, + "step": 26511 + }, + { + "epoch": 1.1596039066739012, + "grad_norm": 6.621003150939941, + "learning_rate": 6.469645059613541e-06, + "loss": 0.3084, + "step": 26512 + }, + { + "epoch": 1.159617471513836, + "grad_norm": 5.470314025878906, + "learning_rate": 6.469508016993286e-06, + "loss": 0.2898, + "step": 26513 + }, + { + "epoch": 1.159631036353771, + "grad_norm": 5.347609519958496, + "learning_rate": 6.46937097437303e-06, + "loss": 0.3275, + "step": 26514 + }, + { + "epoch": 1.1596446011937058, + "grad_norm": 5.6661577224731445, + "learning_rate": 6.469233931752775e-06, + "loss": 0.2533, + "step": 26515 + }, + { + "epoch": 1.159658166033641, + "grad_norm": 4.4678568840026855, + "learning_rate": 6.469096889132521e-06, + "loss": 0.2756, + "step": 26516 + }, + { + "epoch": 1.1596717308735758, + "grad_norm": 4.199823379516602, + "learning_rate": 6.468959846512266e-06, + "loss": 0.3181, + "step": 26517 + }, + { + "epoch": 1.1596852957135106, + "grad_norm": 6.63059139251709, + "learning_rate": 6.468822803892011e-06, + "loss": 0.2337, + "step": 26518 + }, + { + "epoch": 1.1596988605534455, + "grad_norm": 3.6089365482330322, + "learning_rate": 6.468685761271756e-06, + "loss": 0.2153, + "step": 26519 + }, + { + "epoch": 1.1597124253933804, + "grad_norm": 4.147305488586426, + "learning_rate": 6.468548718651502e-06, + "loss": 0.258, + "step": 26520 + }, + { + "epoch": 1.1597259902333152, + "grad_norm": 6.220536231994629, + "learning_rate": 6.468411676031246e-06, + "loss": 0.3353, + "step": 26521 + }, + { + "epoch": 1.15973955507325, + "grad_norm": 6.969104766845703, + "learning_rate": 6.4682746334109916e-06, + "loss": 0.5524, + "step": 26522 + }, + { + "epoch": 1.159753119913185, + "grad_norm": 5.7331109046936035, + "learning_rate": 6.468137590790736e-06, + "loss": 0.3425, + "step": 26523 + }, + { + "epoch": 1.1597666847531198, + "grad_norm": 5.060436248779297, + "learning_rate": 6.468000548170481e-06, + "loss": 0.364, + "step": 26524 + }, + { + "epoch": 1.1597802495930547, + "grad_norm": 6.070453643798828, + "learning_rate": 6.467863505550227e-06, + "loss": 0.2978, + "step": 26525 + }, + { + "epoch": 1.1597938144329896, + "grad_norm": 4.928144454956055, + "learning_rate": 6.4677264629299714e-06, + "loss": 0.2489, + "step": 26526 + }, + { + "epoch": 1.1598073792729247, + "grad_norm": 5.432713508605957, + "learning_rate": 6.467589420309717e-06, + "loss": 0.2226, + "step": 26527 + }, + { + "epoch": 1.1598209441128595, + "grad_norm": 5.358187675476074, + "learning_rate": 6.467452377689462e-06, + "loss": 0.3112, + "step": 26528 + }, + { + "epoch": 1.1598345089527944, + "grad_norm": 7.4022088050842285, + "learning_rate": 6.467315335069208e-06, + "loss": 0.2743, + "step": 26529 + }, + { + "epoch": 1.1598480737927293, + "grad_norm": 5.791869640350342, + "learning_rate": 6.467178292448952e-06, + "loss": 0.3255, + "step": 26530 + }, + { + "epoch": 1.1598616386326641, + "grad_norm": 4.468005657196045, + "learning_rate": 6.467041249828697e-06, + "loss": 0.3531, + "step": 26531 + }, + { + "epoch": 1.159875203472599, + "grad_norm": 4.764669418334961, + "learning_rate": 6.466904207208442e-06, + "loss": 0.287, + "step": 26532 + }, + { + "epoch": 1.1598887683125338, + "grad_norm": 4.65288782119751, + "learning_rate": 6.466767164588188e-06, + "loss": 0.3403, + "step": 26533 + }, + { + "epoch": 1.1599023331524687, + "grad_norm": 6.611842155456543, + "learning_rate": 6.466630121967933e-06, + "loss": 0.4197, + "step": 26534 + }, + { + "epoch": 1.1599158979924038, + "grad_norm": 3.6800646781921387, + "learning_rate": 6.466493079347678e-06, + "loss": 0.2469, + "step": 26535 + }, + { + "epoch": 1.1599294628323387, + "grad_norm": 5.285130023956299, + "learning_rate": 6.466356036727422e-06, + "loss": 0.2795, + "step": 26536 + }, + { + "epoch": 1.1599430276722735, + "grad_norm": 6.174549579620361, + "learning_rate": 6.4662189941071676e-06, + "loss": 0.2678, + "step": 26537 + }, + { + "epoch": 1.1599565925122084, + "grad_norm": 5.324508190155029, + "learning_rate": 6.466081951486914e-06, + "loss": 0.364, + "step": 26538 + }, + { + "epoch": 1.1599701573521433, + "grad_norm": 5.541968822479248, + "learning_rate": 6.465944908866658e-06, + "loss": 0.3177, + "step": 26539 + }, + { + "epoch": 1.1599837221920781, + "grad_norm": 4.281540870666504, + "learning_rate": 6.465807866246403e-06, + "loss": 0.2318, + "step": 26540 + }, + { + "epoch": 1.159997287032013, + "grad_norm": 7.436477184295654, + "learning_rate": 6.4656708236261474e-06, + "loss": 0.3539, + "step": 26541 + }, + { + "epoch": 1.1600108518719479, + "grad_norm": 5.202520847320557, + "learning_rate": 6.4655337810058935e-06, + "loss": 0.2954, + "step": 26542 + }, + { + "epoch": 1.1600244167118827, + "grad_norm": 5.804389953613281, + "learning_rate": 6.465396738385639e-06, + "loss": 0.2259, + "step": 26543 + }, + { + "epoch": 1.1600379815518176, + "grad_norm": 4.168212890625, + "learning_rate": 6.465259695765384e-06, + "loss": 0.3125, + "step": 26544 + }, + { + "epoch": 1.1600515463917525, + "grad_norm": 5.338689804077148, + "learning_rate": 6.465122653145128e-06, + "loss": 0.3245, + "step": 26545 + }, + { + "epoch": 1.1600651112316875, + "grad_norm": 4.980958461761475, + "learning_rate": 6.464985610524874e-06, + "loss": 0.2857, + "step": 26546 + }, + { + "epoch": 1.1600786760716224, + "grad_norm": 5.524653434753418, + "learning_rate": 6.464848567904619e-06, + "loss": 0.2559, + "step": 26547 + }, + { + "epoch": 1.1600922409115573, + "grad_norm": 4.620288372039795, + "learning_rate": 6.464711525284364e-06, + "loss": 0.2905, + "step": 26548 + }, + { + "epoch": 1.1601058057514921, + "grad_norm": 4.397787094116211, + "learning_rate": 6.464574482664109e-06, + "loss": 0.2035, + "step": 26549 + }, + { + "epoch": 1.160119370591427, + "grad_norm": 3.3587372303009033, + "learning_rate": 6.464437440043854e-06, + "loss": 0.2583, + "step": 26550 + }, + { + "epoch": 1.1601329354313619, + "grad_norm": 4.030181884765625, + "learning_rate": 6.464300397423599e-06, + "loss": 0.2409, + "step": 26551 + }, + { + "epoch": 1.1601465002712967, + "grad_norm": 4.424468517303467, + "learning_rate": 6.464163354803344e-06, + "loss": 0.2763, + "step": 26552 + }, + { + "epoch": 1.1601600651112316, + "grad_norm": 6.726080894470215, + "learning_rate": 6.4640263121830896e-06, + "loss": 0.3242, + "step": 26553 + }, + { + "epoch": 1.1601736299511667, + "grad_norm": 4.172199726104736, + "learning_rate": 6.463889269562834e-06, + "loss": 0.2145, + "step": 26554 + }, + { + "epoch": 1.1601871947911016, + "grad_norm": 6.055984020233154, + "learning_rate": 6.46375222694258e-06, + "loss": 0.3869, + "step": 26555 + }, + { + "epoch": 1.1602007596310364, + "grad_norm": 5.460982322692871, + "learning_rate": 6.463615184322325e-06, + "loss": 0.2749, + "step": 26556 + }, + { + "epoch": 1.1602143244709713, + "grad_norm": 5.199650764465332, + "learning_rate": 6.4634781417020694e-06, + "loss": 0.2871, + "step": 26557 + }, + { + "epoch": 1.1602278893109061, + "grad_norm": 4.745890140533447, + "learning_rate": 6.463341099081815e-06, + "loss": 0.1895, + "step": 26558 + }, + { + "epoch": 1.160241454150841, + "grad_norm": 4.024608135223389, + "learning_rate": 6.463204056461561e-06, + "loss": 0.273, + "step": 26559 + }, + { + "epoch": 1.1602550189907759, + "grad_norm": 7.228160858154297, + "learning_rate": 6.463067013841306e-06, + "loss": 0.3561, + "step": 26560 + }, + { + "epoch": 1.1602685838307107, + "grad_norm": 5.018609046936035, + "learning_rate": 6.46292997122105e-06, + "loss": 0.2731, + "step": 26561 + }, + { + "epoch": 1.1602821486706456, + "grad_norm": 4.6162428855896, + "learning_rate": 6.462792928600795e-06, + "loss": 0.2537, + "step": 26562 + }, + { + "epoch": 1.1602957135105805, + "grad_norm": 4.5616254806518555, + "learning_rate": 6.46265588598054e-06, + "loss": 0.183, + "step": 26563 + }, + { + "epoch": 1.1603092783505156, + "grad_norm": 4.44245719909668, + "learning_rate": 6.462518843360286e-06, + "loss": 0.2172, + "step": 26564 + }, + { + "epoch": 1.1603228431904504, + "grad_norm": 5.111133098602295, + "learning_rate": 6.462381800740031e-06, + "loss": 0.3502, + "step": 26565 + }, + { + "epoch": 1.1603364080303853, + "grad_norm": 4.098272323608398, + "learning_rate": 6.462244758119775e-06, + "loss": 0.237, + "step": 26566 + }, + { + "epoch": 1.1603499728703202, + "grad_norm": 4.313294410705566, + "learning_rate": 6.46210771549952e-06, + "loss": 0.2667, + "step": 26567 + }, + { + "epoch": 1.160363537710255, + "grad_norm": 4.930080890655518, + "learning_rate": 6.461970672879266e-06, + "loss": 0.3475, + "step": 26568 + }, + { + "epoch": 1.1603771025501899, + "grad_norm": 5.638019561767578, + "learning_rate": 6.461833630259012e-06, + "loss": 0.2659, + "step": 26569 + }, + { + "epoch": 1.1603906673901248, + "grad_norm": 4.935479640960693, + "learning_rate": 6.461696587638756e-06, + "loss": 0.2872, + "step": 26570 + }, + { + "epoch": 1.1604042322300596, + "grad_norm": 6.415863037109375, + "learning_rate": 6.461559545018501e-06, + "loss": 0.3475, + "step": 26571 + }, + { + "epoch": 1.1604177970699945, + "grad_norm": 7.2332963943481445, + "learning_rate": 6.461422502398247e-06, + "loss": 0.2515, + "step": 26572 + }, + { + "epoch": 1.1604313619099296, + "grad_norm": 4.126705169677734, + "learning_rate": 6.4612854597779915e-06, + "loss": 0.2437, + "step": 26573 + }, + { + "epoch": 1.1604449267498644, + "grad_norm": 6.872976303100586, + "learning_rate": 6.461148417157737e-06, + "loss": 0.3457, + "step": 26574 + }, + { + "epoch": 1.1604584915897993, + "grad_norm": 5.208394527435303, + "learning_rate": 6.461011374537482e-06, + "loss": 0.2112, + "step": 26575 + }, + { + "epoch": 1.1604720564297342, + "grad_norm": 4.039559364318848, + "learning_rate": 6.460874331917226e-06, + "loss": 0.1376, + "step": 26576 + }, + { + "epoch": 1.160485621269669, + "grad_norm": 7.413169860839844, + "learning_rate": 6.460737289296972e-06, + "loss": 0.3046, + "step": 26577 + }, + { + "epoch": 1.160499186109604, + "grad_norm": 5.27421760559082, + "learning_rate": 6.460600246676717e-06, + "loss": 0.3243, + "step": 26578 + }, + { + "epoch": 1.1605127509495388, + "grad_norm": 4.342187881469727, + "learning_rate": 6.460463204056462e-06, + "loss": 0.2347, + "step": 26579 + }, + { + "epoch": 1.1605263157894736, + "grad_norm": 6.280250549316406, + "learning_rate": 6.460326161436207e-06, + "loss": 0.2335, + "step": 26580 + }, + { + "epoch": 1.1605398806294085, + "grad_norm": 4.960427284240723, + "learning_rate": 6.460189118815953e-06, + "loss": 0.2263, + "step": 26581 + }, + { + "epoch": 1.1605534454693434, + "grad_norm": 4.329103946685791, + "learning_rate": 6.460052076195697e-06, + "loss": 0.2534, + "step": 26582 + }, + { + "epoch": 1.1605670103092784, + "grad_norm": 5.737080097198486, + "learning_rate": 6.459915033575442e-06, + "loss": 0.2616, + "step": 26583 + }, + { + "epoch": 1.1605805751492133, + "grad_norm": 4.591189861297607, + "learning_rate": 6.459777990955188e-06, + "loss": 0.2771, + "step": 26584 + }, + { + "epoch": 1.1605941399891482, + "grad_norm": 4.965343952178955, + "learning_rate": 6.459640948334933e-06, + "loss": 0.2897, + "step": 26585 + }, + { + "epoch": 1.160607704829083, + "grad_norm": 3.229037046432495, + "learning_rate": 6.459503905714678e-06, + "loss": 0.248, + "step": 26586 + }, + { + "epoch": 1.160621269669018, + "grad_norm": 5.328871726989746, + "learning_rate": 6.459366863094423e-06, + "loss": 0.3262, + "step": 26587 + }, + { + "epoch": 1.1606348345089528, + "grad_norm": 3.694277048110962, + "learning_rate": 6.4592298204741674e-06, + "loss": 0.1845, + "step": 26588 + }, + { + "epoch": 1.1606483993488876, + "grad_norm": 4.561151027679443, + "learning_rate": 6.4590927778539135e-06, + "loss": 0.2326, + "step": 26589 + }, + { + "epoch": 1.1606619641888225, + "grad_norm": 6.625703811645508, + "learning_rate": 6.458955735233659e-06, + "loss": 0.3537, + "step": 26590 + }, + { + "epoch": 1.1606755290287574, + "grad_norm": 5.266678333282471, + "learning_rate": 6.458818692613403e-06, + "loss": 0.1877, + "step": 26591 + }, + { + "epoch": 1.1606890938686925, + "grad_norm": 4.754741191864014, + "learning_rate": 6.458681649993148e-06, + "loss": 0.3275, + "step": 26592 + }, + { + "epoch": 1.1607026587086273, + "grad_norm": 4.59037971496582, + "learning_rate": 6.458544607372893e-06, + "loss": 0.3347, + "step": 26593 + }, + { + "epoch": 1.1607162235485622, + "grad_norm": 4.9239726066589355, + "learning_rate": 6.458407564752639e-06, + "loss": 0.2944, + "step": 26594 + }, + { + "epoch": 1.160729788388497, + "grad_norm": 6.687408924102783, + "learning_rate": 6.458270522132384e-06, + "loss": 0.267, + "step": 26595 + }, + { + "epoch": 1.160743353228432, + "grad_norm": 5.5285515785217285, + "learning_rate": 6.458133479512129e-06, + "loss": 0.2397, + "step": 26596 + }, + { + "epoch": 1.1607569180683668, + "grad_norm": 4.781025409698486, + "learning_rate": 6.457996436891873e-06, + "loss": 0.1961, + "step": 26597 + }, + { + "epoch": 1.1607704829083016, + "grad_norm": 4.481741905212402, + "learning_rate": 6.457859394271619e-06, + "loss": 0.1747, + "step": 26598 + }, + { + "epoch": 1.1607840477482365, + "grad_norm": 4.976408004760742, + "learning_rate": 6.457722351651364e-06, + "loss": 0.2484, + "step": 26599 + }, + { + "epoch": 1.1607976125881714, + "grad_norm": 5.055849075317383, + "learning_rate": 6.457585309031109e-06, + "loss": 0.2206, + "step": 26600 + }, + { + "epoch": 1.1608111774281062, + "grad_norm": 5.908002853393555, + "learning_rate": 6.457448266410854e-06, + "loss": 0.2963, + "step": 26601 + }, + { + "epoch": 1.1608247422680413, + "grad_norm": 4.292105197906494, + "learning_rate": 6.4573112237906e-06, + "loss": 0.2239, + "step": 26602 + }, + { + "epoch": 1.1608383071079762, + "grad_norm": 4.629607200622559, + "learning_rate": 6.457174181170345e-06, + "loss": 0.2857, + "step": 26603 + }, + { + "epoch": 1.160851871947911, + "grad_norm": 4.5136518478393555, + "learning_rate": 6.4570371385500895e-06, + "loss": 0.2081, + "step": 26604 + }, + { + "epoch": 1.160865436787846, + "grad_norm": 5.855478763580322, + "learning_rate": 6.456900095929835e-06, + "loss": 0.2317, + "step": 26605 + }, + { + "epoch": 1.1608790016277808, + "grad_norm": 4.417457580566406, + "learning_rate": 6.456763053309579e-06, + "loss": 0.2067, + "step": 26606 + }, + { + "epoch": 1.1608925664677157, + "grad_norm": 6.099433898925781, + "learning_rate": 6.456626010689325e-06, + "loss": 0.318, + "step": 26607 + }, + { + "epoch": 1.1609061313076505, + "grad_norm": 6.44681978225708, + "learning_rate": 6.45648896806907e-06, + "loss": 0.4366, + "step": 26608 + }, + { + "epoch": 1.1609196961475854, + "grad_norm": 4.845676422119141, + "learning_rate": 6.456351925448815e-06, + "loss": 0.291, + "step": 26609 + }, + { + "epoch": 1.1609332609875203, + "grad_norm": 6.188373565673828, + "learning_rate": 6.45621488282856e-06, + "loss": 0.1932, + "step": 26610 + }, + { + "epoch": 1.1609468258274553, + "grad_norm": 5.429425239562988, + "learning_rate": 6.456077840208306e-06, + "loss": 0.2601, + "step": 26611 + }, + { + "epoch": 1.1609603906673902, + "grad_norm": 6.238436698913574, + "learning_rate": 6.455940797588051e-06, + "loss": 0.3567, + "step": 26612 + }, + { + "epoch": 1.160973955507325, + "grad_norm": 6.480441570281982, + "learning_rate": 6.455803754967795e-06, + "loss": 0.3411, + "step": 26613 + }, + { + "epoch": 1.16098752034726, + "grad_norm": 4.4829182624816895, + "learning_rate": 6.45566671234754e-06, + "loss": 0.2064, + "step": 26614 + }, + { + "epoch": 1.1610010851871948, + "grad_norm": 4.2265448570251465, + "learning_rate": 6.4555296697272864e-06, + "loss": 0.1881, + "step": 26615 + }, + { + "epoch": 1.1610146500271297, + "grad_norm": 4.114313125610352, + "learning_rate": 6.455392627107031e-06, + "loss": 0.1826, + "step": 26616 + }, + { + "epoch": 1.1610282148670645, + "grad_norm": 5.480126857757568, + "learning_rate": 6.455255584486776e-06, + "loss": 0.2285, + "step": 26617 + }, + { + "epoch": 1.1610417797069994, + "grad_norm": 5.220996856689453, + "learning_rate": 6.455118541866521e-06, + "loss": 0.3174, + "step": 26618 + }, + { + "epoch": 1.1610553445469343, + "grad_norm": 5.50346040725708, + "learning_rate": 6.4549814992462655e-06, + "loss": 0.3101, + "step": 26619 + }, + { + "epoch": 1.1610689093868691, + "grad_norm": 4.4219255447387695, + "learning_rate": 6.4548444566260115e-06, + "loss": 0.1793, + "step": 26620 + }, + { + "epoch": 1.1610824742268042, + "grad_norm": 4.78615140914917, + "learning_rate": 6.454707414005757e-06, + "loss": 0.2033, + "step": 26621 + }, + { + "epoch": 1.161096039066739, + "grad_norm": 5.600213050842285, + "learning_rate": 6.454570371385501e-06, + "loss": 0.2402, + "step": 26622 + }, + { + "epoch": 1.161109603906674, + "grad_norm": 3.988151788711548, + "learning_rate": 6.454433328765246e-06, + "loss": 0.1611, + "step": 26623 + }, + { + "epoch": 1.1611231687466088, + "grad_norm": 6.308845043182373, + "learning_rate": 6.454296286144992e-06, + "loss": 0.2967, + "step": 26624 + }, + { + "epoch": 1.1611367335865437, + "grad_norm": 4.822734355926514, + "learning_rate": 6.4541592435247365e-06, + "loss": 0.1617, + "step": 26625 + }, + { + "epoch": 1.1611502984264785, + "grad_norm": 4.504858016967773, + "learning_rate": 6.454022200904482e-06, + "loss": 0.2388, + "step": 26626 + }, + { + "epoch": 1.1611638632664134, + "grad_norm": 8.907598495483398, + "learning_rate": 6.453885158284227e-06, + "loss": 0.2005, + "step": 26627 + }, + { + "epoch": 1.1611774281063483, + "grad_norm": 5.554660797119141, + "learning_rate": 6.453748115663973e-06, + "loss": 0.1335, + "step": 26628 + }, + { + "epoch": 1.1611909929462831, + "grad_norm": 6.083205223083496, + "learning_rate": 6.453611073043717e-06, + "loss": 0.322, + "step": 26629 + }, + { + "epoch": 1.1612045577862182, + "grad_norm": 5.610942840576172, + "learning_rate": 6.453474030423462e-06, + "loss": 0.154, + "step": 26630 + }, + { + "epoch": 1.161218122626153, + "grad_norm": 5.189150333404541, + "learning_rate": 6.453336987803207e-06, + "loss": 0.2326, + "step": 26631 + }, + { + "epoch": 1.161231687466088, + "grad_norm": 6.348261833190918, + "learning_rate": 6.453199945182952e-06, + "loss": 0.3406, + "step": 26632 + }, + { + "epoch": 1.1612452523060228, + "grad_norm": 4.713014602661133, + "learning_rate": 6.453062902562698e-06, + "loss": 0.2845, + "step": 26633 + }, + { + "epoch": 1.1612588171459577, + "grad_norm": 4.3843793869018555, + "learning_rate": 6.452925859942442e-06, + "loss": 0.1877, + "step": 26634 + }, + { + "epoch": 1.1612723819858926, + "grad_norm": 5.22843599319458, + "learning_rate": 6.4527888173221875e-06, + "loss": 0.3226, + "step": 26635 + }, + { + "epoch": 1.1612859468258274, + "grad_norm": 4.517546653747559, + "learning_rate": 6.452651774701933e-06, + "loss": 0.2115, + "step": 26636 + }, + { + "epoch": 1.1612995116657623, + "grad_norm": 4.488132953643799, + "learning_rate": 6.452514732081679e-06, + "loss": 0.262, + "step": 26637 + }, + { + "epoch": 1.1613130765056972, + "grad_norm": 5.5656208992004395, + "learning_rate": 6.452377689461423e-06, + "loss": 0.312, + "step": 26638 + }, + { + "epoch": 1.161326641345632, + "grad_norm": 6.100900173187256, + "learning_rate": 6.452240646841168e-06, + "loss": 0.3739, + "step": 26639 + }, + { + "epoch": 1.161340206185567, + "grad_norm": 4.8251495361328125, + "learning_rate": 6.4521036042209125e-06, + "loss": 0.401, + "step": 26640 + }, + { + "epoch": 1.161353771025502, + "grad_norm": 4.550465106964111, + "learning_rate": 6.4519665616006585e-06, + "loss": 0.2592, + "step": 26641 + }, + { + "epoch": 1.1613673358654368, + "grad_norm": 4.8556108474731445, + "learning_rate": 6.451829518980404e-06, + "loss": 0.3251, + "step": 26642 + }, + { + "epoch": 1.1613809007053717, + "grad_norm": 5.388271331787109, + "learning_rate": 6.451692476360149e-06, + "loss": 0.2551, + "step": 26643 + }, + { + "epoch": 1.1613944655453066, + "grad_norm": 4.731693744659424, + "learning_rate": 6.451555433739893e-06, + "loss": 0.2873, + "step": 26644 + }, + { + "epoch": 1.1614080303852414, + "grad_norm": 5.693291187286377, + "learning_rate": 6.451418391119638e-06, + "loss": 0.2268, + "step": 26645 + }, + { + "epoch": 1.1614215952251763, + "grad_norm": 5.061191082000732, + "learning_rate": 6.4512813484993844e-06, + "loss": 0.2348, + "step": 26646 + }, + { + "epoch": 1.1614351600651112, + "grad_norm": 5.320209980010986, + "learning_rate": 6.451144305879129e-06, + "loss": 0.2876, + "step": 26647 + }, + { + "epoch": 1.161448724905046, + "grad_norm": 5.070499897003174, + "learning_rate": 6.451007263258874e-06, + "loss": 0.3806, + "step": 26648 + }, + { + "epoch": 1.1614622897449811, + "grad_norm": 7.32764196395874, + "learning_rate": 6.450870220638618e-06, + "loss": 0.2493, + "step": 26649 + }, + { + "epoch": 1.161475854584916, + "grad_norm": 5.315155506134033, + "learning_rate": 6.450733178018364e-06, + "loss": 0.3749, + "step": 26650 + }, + { + "epoch": 1.1614894194248508, + "grad_norm": 4.740042209625244, + "learning_rate": 6.4505961353981095e-06, + "loss": 0.3887, + "step": 26651 + }, + { + "epoch": 1.1615029842647857, + "grad_norm": 5.824581146240234, + "learning_rate": 6.450459092777855e-06, + "loss": 0.4104, + "step": 26652 + }, + { + "epoch": 1.1615165491047206, + "grad_norm": 4.681586742401123, + "learning_rate": 6.450322050157599e-06, + "loss": 0.2661, + "step": 26653 + }, + { + "epoch": 1.1615301139446554, + "grad_norm": 4.12688684463501, + "learning_rate": 6.450185007537345e-06, + "loss": 0.2031, + "step": 26654 + }, + { + "epoch": 1.1615436787845903, + "grad_norm": 5.157706260681152, + "learning_rate": 6.45004796491709e-06, + "loss": 0.3546, + "step": 26655 + }, + { + "epoch": 1.1615572436245252, + "grad_norm": 4.308780193328857, + "learning_rate": 6.4499109222968345e-06, + "loss": 0.3124, + "step": 26656 + }, + { + "epoch": 1.16157080846446, + "grad_norm": 5.506314277648926, + "learning_rate": 6.44977387967658e-06, + "loss": 0.3975, + "step": 26657 + }, + { + "epoch": 1.161584373304395, + "grad_norm": 3.675394296646118, + "learning_rate": 6.449636837056326e-06, + "loss": 0.2021, + "step": 26658 + }, + { + "epoch": 1.16159793814433, + "grad_norm": 5.5051374435424805, + "learning_rate": 6.44949979443607e-06, + "loss": 0.3265, + "step": 26659 + }, + { + "epoch": 1.1616115029842649, + "grad_norm": 5.905326843261719, + "learning_rate": 6.449362751815815e-06, + "loss": 0.2786, + "step": 26660 + }, + { + "epoch": 1.1616250678241997, + "grad_norm": 6.253533363342285, + "learning_rate": 6.4492257091955604e-06, + "loss": 0.3873, + "step": 26661 + }, + { + "epoch": 1.1616386326641346, + "grad_norm": 4.735483646392822, + "learning_rate": 6.449088666575305e-06, + "loss": 0.3361, + "step": 26662 + }, + { + "epoch": 1.1616521975040695, + "grad_norm": 3.944807767868042, + "learning_rate": 6.448951623955051e-06, + "loss": 0.1518, + "step": 26663 + }, + { + "epoch": 1.1616657623440043, + "grad_norm": 4.699110984802246, + "learning_rate": 6.448814581334796e-06, + "loss": 0.2183, + "step": 26664 + }, + { + "epoch": 1.1616793271839392, + "grad_norm": 4.392584800720215, + "learning_rate": 6.44867753871454e-06, + "loss": 0.1905, + "step": 26665 + }, + { + "epoch": 1.161692892023874, + "grad_norm": 4.109904766082764, + "learning_rate": 6.4485404960942855e-06, + "loss": 0.2074, + "step": 26666 + }, + { + "epoch": 1.161706456863809, + "grad_norm": 5.238509178161621, + "learning_rate": 6.4484034534740315e-06, + "loss": 0.3648, + "step": 26667 + }, + { + "epoch": 1.161720021703744, + "grad_norm": 5.280984878540039, + "learning_rate": 6.448266410853776e-06, + "loss": 0.2324, + "step": 26668 + }, + { + "epoch": 1.1617335865436789, + "grad_norm": 3.9251492023468018, + "learning_rate": 6.448129368233521e-06, + "loss": 0.2153, + "step": 26669 + }, + { + "epoch": 1.1617471513836137, + "grad_norm": 4.512861728668213, + "learning_rate": 6.447992325613266e-06, + "loss": 0.218, + "step": 26670 + }, + { + "epoch": 1.1617607162235486, + "grad_norm": 3.788316249847412, + "learning_rate": 6.447855282993012e-06, + "loss": 0.0779, + "step": 26671 + }, + { + "epoch": 1.1617742810634835, + "grad_norm": 3.993293285369873, + "learning_rate": 6.4477182403727565e-06, + "loss": 0.135, + "step": 26672 + }, + { + "epoch": 1.1617878459034183, + "grad_norm": 5.525887966156006, + "learning_rate": 6.447581197752502e-06, + "loss": 0.2824, + "step": 26673 + }, + { + "epoch": 1.1618014107433532, + "grad_norm": 4.294285297393799, + "learning_rate": 6.447444155132246e-06, + "loss": 0.2345, + "step": 26674 + }, + { + "epoch": 1.161814975583288, + "grad_norm": 3.284820795059204, + "learning_rate": 6.447307112511991e-06, + "loss": 0.1692, + "step": 26675 + }, + { + "epoch": 1.161828540423223, + "grad_norm": 4.055654048919678, + "learning_rate": 6.447170069891737e-06, + "loss": 0.2027, + "step": 26676 + }, + { + "epoch": 1.1618421052631578, + "grad_norm": 3.523160934448242, + "learning_rate": 6.4470330272714824e-06, + "loss": 0.1577, + "step": 26677 + }, + { + "epoch": 1.1618556701030929, + "grad_norm": 4.947445392608643, + "learning_rate": 6.446895984651227e-06, + "loss": 0.2106, + "step": 26678 + }, + { + "epoch": 1.1618692349430277, + "grad_norm": 3.6818735599517822, + "learning_rate": 6.446758942030972e-06, + "loss": 0.1636, + "step": 26679 + }, + { + "epoch": 1.1618827997829626, + "grad_norm": 4.116724967956543, + "learning_rate": 6.446621899410718e-06, + "loss": 0.144, + "step": 26680 + }, + { + "epoch": 1.1618963646228975, + "grad_norm": 4.8690972328186035, + "learning_rate": 6.446484856790462e-06, + "loss": 0.1622, + "step": 26681 + }, + { + "epoch": 1.1619099294628323, + "grad_norm": 4.5271525382995605, + "learning_rate": 6.4463478141702075e-06, + "loss": 0.1741, + "step": 26682 + }, + { + "epoch": 1.1619234943027672, + "grad_norm": 3.580233097076416, + "learning_rate": 6.446210771549952e-06, + "loss": 0.1756, + "step": 26683 + }, + { + "epoch": 1.161937059142702, + "grad_norm": 4.505029678344727, + "learning_rate": 6.446073728929698e-06, + "loss": 0.2077, + "step": 26684 + }, + { + "epoch": 1.161950623982637, + "grad_norm": 3.752621650695801, + "learning_rate": 6.445936686309443e-06, + "loss": 0.1389, + "step": 26685 + }, + { + "epoch": 1.1619641888225718, + "grad_norm": 4.126976013183594, + "learning_rate": 6.445799643689188e-06, + "loss": 0.1357, + "step": 26686 + }, + { + "epoch": 1.1619777536625069, + "grad_norm": 3.9395110607147217, + "learning_rate": 6.4456626010689325e-06, + "loss": 0.1876, + "step": 26687 + }, + { + "epoch": 1.1619913185024418, + "grad_norm": 6.2385478019714355, + "learning_rate": 6.445525558448678e-06, + "loss": 0.2924, + "step": 26688 + }, + { + "epoch": 1.1620048833423766, + "grad_norm": 6.016560077667236, + "learning_rate": 6.445388515828424e-06, + "loss": 0.259, + "step": 26689 + }, + { + "epoch": 1.1620184481823115, + "grad_norm": 5.204007625579834, + "learning_rate": 6.445251473208168e-06, + "loss": 0.2487, + "step": 26690 + }, + { + "epoch": 1.1620320130222463, + "grad_norm": 4.466500282287598, + "learning_rate": 6.445114430587913e-06, + "loss": 0.2203, + "step": 26691 + }, + { + "epoch": 1.1620455778621812, + "grad_norm": 4.619798183441162, + "learning_rate": 6.4449773879676584e-06, + "loss": 0.1587, + "step": 26692 + }, + { + "epoch": 1.162059142702116, + "grad_norm": 4.194950103759766, + "learning_rate": 6.444840345347404e-06, + "loss": 0.2018, + "step": 26693 + }, + { + "epoch": 1.162072707542051, + "grad_norm": 5.596426010131836, + "learning_rate": 6.444703302727149e-06, + "loss": 0.2439, + "step": 26694 + }, + { + "epoch": 1.1620862723819858, + "grad_norm": 4.1594953536987305, + "learning_rate": 6.444566260106894e-06, + "loss": 0.2951, + "step": 26695 + }, + { + "epoch": 1.1620998372219207, + "grad_norm": 6.0276103019714355, + "learning_rate": 6.444429217486638e-06, + "loss": 0.2077, + "step": 26696 + }, + { + "epoch": 1.1621134020618558, + "grad_norm": 3.837730646133423, + "learning_rate": 6.444292174866384e-06, + "loss": 0.1766, + "step": 26697 + }, + { + "epoch": 1.1621269669017906, + "grad_norm": 5.333168983459473, + "learning_rate": 6.4441551322461295e-06, + "loss": 0.2298, + "step": 26698 + }, + { + "epoch": 1.1621405317417255, + "grad_norm": 3.3087360858917236, + "learning_rate": 6.444018089625874e-06, + "loss": 0.1093, + "step": 26699 + }, + { + "epoch": 1.1621540965816604, + "grad_norm": 5.631115913391113, + "learning_rate": 6.443881047005619e-06, + "loss": 0.1891, + "step": 26700 + }, + { + "epoch": 1.1621676614215952, + "grad_norm": 7.257446765899658, + "learning_rate": 6.443744004385364e-06, + "loss": 0.3983, + "step": 26701 + }, + { + "epoch": 1.16218122626153, + "grad_norm": 4.41478967666626, + "learning_rate": 6.44360696176511e-06, + "loss": 0.1723, + "step": 26702 + }, + { + "epoch": 1.162194791101465, + "grad_norm": 4.870867729187012, + "learning_rate": 6.4434699191448545e-06, + "loss": 0.1798, + "step": 26703 + }, + { + "epoch": 1.1622083559413998, + "grad_norm": 5.188623428344727, + "learning_rate": 6.4433328765246e-06, + "loss": 0.2404, + "step": 26704 + }, + { + "epoch": 1.162221920781335, + "grad_norm": 4.351418972015381, + "learning_rate": 6.443195833904344e-06, + "loss": 0.2089, + "step": 26705 + }, + { + "epoch": 1.1622354856212698, + "grad_norm": 4.264501094818115, + "learning_rate": 6.44305879128409e-06, + "loss": 0.1801, + "step": 26706 + }, + { + "epoch": 1.1622490504612046, + "grad_norm": 4.4104485511779785, + "learning_rate": 6.442921748663835e-06, + "loss": 0.239, + "step": 26707 + }, + { + "epoch": 1.1622626153011395, + "grad_norm": 5.118238925933838, + "learning_rate": 6.44278470604358e-06, + "loss": 0.2653, + "step": 26708 + }, + { + "epoch": 1.1622761801410744, + "grad_norm": 4.049816608428955, + "learning_rate": 6.442647663423325e-06, + "loss": 0.2151, + "step": 26709 + }, + { + "epoch": 1.1622897449810092, + "grad_norm": 6.87599515914917, + "learning_rate": 6.442510620803071e-06, + "loss": 0.2445, + "step": 26710 + }, + { + "epoch": 1.162303309820944, + "grad_norm": 4.68282413482666, + "learning_rate": 6.442373578182816e-06, + "loss": 0.1754, + "step": 26711 + }, + { + "epoch": 1.162316874660879, + "grad_norm": 4.526845455169678, + "learning_rate": 6.44223653556256e-06, + "loss": 0.1512, + "step": 26712 + }, + { + "epoch": 1.1623304395008138, + "grad_norm": 3.9035017490386963, + "learning_rate": 6.4420994929423055e-06, + "loss": 0.1901, + "step": 26713 + }, + { + "epoch": 1.1623440043407487, + "grad_norm": 5.427248954772949, + "learning_rate": 6.44196245032205e-06, + "loss": 0.2261, + "step": 26714 + }, + { + "epoch": 1.1623575691806836, + "grad_norm": 6.473933696746826, + "learning_rate": 6.441825407701796e-06, + "loss": 0.2993, + "step": 26715 + }, + { + "epoch": 1.1623711340206186, + "grad_norm": 6.053356647491455, + "learning_rate": 6.441688365081541e-06, + "loss": 0.2636, + "step": 26716 + }, + { + "epoch": 1.1623846988605535, + "grad_norm": 5.889915466308594, + "learning_rate": 6.441551322461286e-06, + "loss": 0.2313, + "step": 26717 + }, + { + "epoch": 1.1623982637004884, + "grad_norm": 5.863286972045898, + "learning_rate": 6.4414142798410305e-06, + "loss": 0.2388, + "step": 26718 + }, + { + "epoch": 1.1624118285404232, + "grad_norm": 5.7298502922058105, + "learning_rate": 6.4412772372207766e-06, + "loss": 0.2817, + "step": 26719 + }, + { + "epoch": 1.162425393380358, + "grad_norm": 4.481814861297607, + "learning_rate": 6.441140194600522e-06, + "loss": 0.2054, + "step": 26720 + }, + { + "epoch": 1.162438958220293, + "grad_norm": 3.7261502742767334, + "learning_rate": 6.441003151980266e-06, + "loss": 0.1187, + "step": 26721 + }, + { + "epoch": 1.1624525230602278, + "grad_norm": 4.0286478996276855, + "learning_rate": 6.440866109360011e-06, + "loss": 0.171, + "step": 26722 + }, + { + "epoch": 1.1624660879001627, + "grad_norm": 4.977497577667236, + "learning_rate": 6.440729066739757e-06, + "loss": 0.3058, + "step": 26723 + }, + { + "epoch": 1.1624796527400978, + "grad_norm": 5.6362152099609375, + "learning_rate": 6.440592024119502e-06, + "loss": 0.2517, + "step": 26724 + }, + { + "epoch": 1.1624932175800327, + "grad_norm": 4.985143184661865, + "learning_rate": 6.440454981499247e-06, + "loss": 0.3896, + "step": 26725 + }, + { + "epoch": 1.1625067824199675, + "grad_norm": 4.842857360839844, + "learning_rate": 6.440317938878992e-06, + "loss": 0.283, + "step": 26726 + }, + { + "epoch": 1.1625203472599024, + "grad_norm": 4.7794671058654785, + "learning_rate": 6.440180896258737e-06, + "loss": 0.2388, + "step": 26727 + }, + { + "epoch": 1.1625339120998373, + "grad_norm": 4.723153591156006, + "learning_rate": 6.440043853638482e-06, + "loss": 0.1887, + "step": 26728 + }, + { + "epoch": 1.1625474769397721, + "grad_norm": 6.1313934326171875, + "learning_rate": 6.4399068110182275e-06, + "loss": 0.3589, + "step": 26729 + }, + { + "epoch": 1.162561041779707, + "grad_norm": 6.311140537261963, + "learning_rate": 6.439769768397972e-06, + "loss": 0.2741, + "step": 26730 + }, + { + "epoch": 1.1625746066196418, + "grad_norm": 5.757721900939941, + "learning_rate": 6.439632725777717e-06, + "loss": 0.2941, + "step": 26731 + }, + { + "epoch": 1.1625881714595767, + "grad_norm": 5.556412220001221, + "learning_rate": 6.439495683157463e-06, + "loss": 0.391, + "step": 26732 + }, + { + "epoch": 1.1626017362995116, + "grad_norm": 5.770120143890381, + "learning_rate": 6.439358640537207e-06, + "loss": 0.31, + "step": 26733 + }, + { + "epoch": 1.1626153011394464, + "grad_norm": 6.503324031829834, + "learning_rate": 6.4392215979169526e-06, + "loss": 0.3572, + "step": 26734 + }, + { + "epoch": 1.1626288659793815, + "grad_norm": 4.3303608894348145, + "learning_rate": 6.439084555296698e-06, + "loss": 0.2613, + "step": 26735 + }, + { + "epoch": 1.1626424308193164, + "grad_norm": 6.539981842041016, + "learning_rate": 6.438947512676444e-06, + "loss": 0.3477, + "step": 26736 + }, + { + "epoch": 1.1626559956592513, + "grad_norm": 4.8746771812438965, + "learning_rate": 6.438810470056188e-06, + "loss": 0.2732, + "step": 26737 + }, + { + "epoch": 1.1626695604991861, + "grad_norm": 4.456396102905273, + "learning_rate": 6.438673427435933e-06, + "loss": 0.3118, + "step": 26738 + }, + { + "epoch": 1.162683125339121, + "grad_norm": 5.690558433532715, + "learning_rate": 6.438536384815678e-06, + "loss": 0.2186, + "step": 26739 + }, + { + "epoch": 1.1626966901790559, + "grad_norm": 5.305211067199707, + "learning_rate": 6.438399342195424e-06, + "loss": 0.2675, + "step": 26740 + }, + { + "epoch": 1.1627102550189907, + "grad_norm": 3.650641441345215, + "learning_rate": 6.438262299575169e-06, + "loss": 0.1588, + "step": 26741 + }, + { + "epoch": 1.1627238198589256, + "grad_norm": 7.13373327255249, + "learning_rate": 6.438125256954913e-06, + "loss": 0.3389, + "step": 26742 + }, + { + "epoch": 1.1627373846988607, + "grad_norm": 6.121219635009766, + "learning_rate": 6.437988214334658e-06, + "loss": 0.2731, + "step": 26743 + }, + { + "epoch": 1.1627509495387955, + "grad_norm": 4.820862770080566, + "learning_rate": 6.4378511717144035e-06, + "loss": 0.2697, + "step": 26744 + }, + { + "epoch": 1.1627645143787304, + "grad_norm": 6.314651966094971, + "learning_rate": 6.4377141290941495e-06, + "loss": 0.3314, + "step": 26745 + }, + { + "epoch": 1.1627780792186653, + "grad_norm": 6.569521903991699, + "learning_rate": 6.437577086473894e-06, + "loss": 0.328, + "step": 26746 + }, + { + "epoch": 1.1627916440586001, + "grad_norm": 5.798991680145264, + "learning_rate": 6.437440043853639e-06, + "loss": 0.3137, + "step": 26747 + }, + { + "epoch": 1.162805208898535, + "grad_norm": 5.369713306427002, + "learning_rate": 6.437303001233383e-06, + "loss": 0.3787, + "step": 26748 + }, + { + "epoch": 1.1628187737384699, + "grad_norm": 5.960739612579346, + "learning_rate": 6.437165958613129e-06, + "loss": 0.3542, + "step": 26749 + }, + { + "epoch": 1.1628323385784047, + "grad_norm": 6.690598964691162, + "learning_rate": 6.4370289159928746e-06, + "loss": 0.4518, + "step": 26750 + }, + { + "epoch": 1.1628459034183396, + "grad_norm": 4.208542823791504, + "learning_rate": 6.43689187337262e-06, + "loss": 0.1312, + "step": 26751 + }, + { + "epoch": 1.1628594682582745, + "grad_norm": 5.274190425872803, + "learning_rate": 6.436754830752364e-06, + "loss": 0.2538, + "step": 26752 + }, + { + "epoch": 1.1628730330982093, + "grad_norm": 4.8287529945373535, + "learning_rate": 6.43661778813211e-06, + "loss": 0.3076, + "step": 26753 + }, + { + "epoch": 1.1628865979381444, + "grad_norm": 6.079421520233154, + "learning_rate": 6.436480745511855e-06, + "loss": 0.2102, + "step": 26754 + }, + { + "epoch": 1.1629001627780793, + "grad_norm": 4.781713962554932, + "learning_rate": 6.4363437028916e-06, + "loss": 0.2056, + "step": 26755 + }, + { + "epoch": 1.1629137276180141, + "grad_norm": 6.526504993438721, + "learning_rate": 6.436206660271345e-06, + "loss": 0.2101, + "step": 26756 + }, + { + "epoch": 1.162927292457949, + "grad_norm": 5.181143760681152, + "learning_rate": 6.436069617651089e-06, + "loss": 0.2488, + "step": 26757 + }, + { + "epoch": 1.1629408572978839, + "grad_norm": 4.283501625061035, + "learning_rate": 6.435932575030835e-06, + "loss": 0.1701, + "step": 26758 + }, + { + "epoch": 1.1629544221378187, + "grad_norm": 4.1138200759887695, + "learning_rate": 6.43579553241058e-06, + "loss": 0.2174, + "step": 26759 + }, + { + "epoch": 1.1629679869777536, + "grad_norm": 2.937527894973755, + "learning_rate": 6.4356584897903255e-06, + "loss": 0.1773, + "step": 26760 + }, + { + "epoch": 1.1629815518176885, + "grad_norm": 7.379834175109863, + "learning_rate": 6.43552144717007e-06, + "loss": 0.5735, + "step": 26761 + }, + { + "epoch": 1.1629951166576236, + "grad_norm": 4.558834552764893, + "learning_rate": 6.435384404549816e-06, + "loss": 0.2076, + "step": 26762 + }, + { + "epoch": 1.1630086814975584, + "grad_norm": 7.052021503448486, + "learning_rate": 6.435247361929561e-06, + "loss": 0.4468, + "step": 26763 + }, + { + "epoch": 1.1630222463374933, + "grad_norm": 5.442676544189453, + "learning_rate": 6.435110319309305e-06, + "loss": 0.3459, + "step": 26764 + }, + { + "epoch": 1.1630358111774282, + "grad_norm": 5.351263523101807, + "learning_rate": 6.4349732766890506e-06, + "loss": 0.4004, + "step": 26765 + }, + { + "epoch": 1.163049376017363, + "grad_norm": 4.656716346740723, + "learning_rate": 6.434836234068797e-06, + "loss": 0.2194, + "step": 26766 + }, + { + "epoch": 1.1630629408572979, + "grad_norm": 8.02460765838623, + "learning_rate": 6.434699191448541e-06, + "loss": 0.4367, + "step": 26767 + }, + { + "epoch": 1.1630765056972328, + "grad_norm": 4.564416408538818, + "learning_rate": 6.434562148828286e-06, + "loss": 0.3658, + "step": 26768 + }, + { + "epoch": 1.1630900705371676, + "grad_norm": 7.232245922088623, + "learning_rate": 6.434425106208031e-06, + "loss": 0.5523, + "step": 26769 + }, + { + "epoch": 1.1631036353771025, + "grad_norm": 5.677000999450684, + "learning_rate": 6.434288063587776e-06, + "loss": 0.3261, + "step": 26770 + }, + { + "epoch": 1.1631172002170374, + "grad_norm": 5.392208099365234, + "learning_rate": 6.434151020967522e-06, + "loss": 0.3318, + "step": 26771 + }, + { + "epoch": 1.1631307650569722, + "grad_norm": 5.03181266784668, + "learning_rate": 6.434013978347267e-06, + "loss": 0.2825, + "step": 26772 + }, + { + "epoch": 1.1631443298969073, + "grad_norm": 4.70470666885376, + "learning_rate": 6.433876935727011e-06, + "loss": 0.2878, + "step": 26773 + }, + { + "epoch": 1.1631578947368422, + "grad_norm": 7.565602779388428, + "learning_rate": 6.433739893106756e-06, + "loss": 0.4406, + "step": 26774 + }, + { + "epoch": 1.163171459576777, + "grad_norm": 3.726602077484131, + "learning_rate": 6.433602850486502e-06, + "loss": 0.2322, + "step": 26775 + }, + { + "epoch": 1.163185024416712, + "grad_norm": 4.7981276512146, + "learning_rate": 6.433465807866247e-06, + "loss": 0.2429, + "step": 26776 + }, + { + "epoch": 1.1631985892566468, + "grad_norm": 6.510247230529785, + "learning_rate": 6.433328765245992e-06, + "loss": 0.2721, + "step": 26777 + }, + { + "epoch": 1.1632121540965816, + "grad_norm": 5.022595405578613, + "learning_rate": 6.433191722625737e-06, + "loss": 0.2691, + "step": 26778 + }, + { + "epoch": 1.1632257189365165, + "grad_norm": 6.201622009277344, + "learning_rate": 6.433054680005483e-06, + "loss": 0.4402, + "step": 26779 + }, + { + "epoch": 1.1632392837764514, + "grad_norm": 5.974625110626221, + "learning_rate": 6.432917637385227e-06, + "loss": 0.3823, + "step": 26780 + }, + { + "epoch": 1.1632528486163864, + "grad_norm": 4.288123607635498, + "learning_rate": 6.4327805947649726e-06, + "loss": 0.1635, + "step": 26781 + }, + { + "epoch": 1.1632664134563213, + "grad_norm": 7.8840107917785645, + "learning_rate": 6.432643552144717e-06, + "loss": 0.4722, + "step": 26782 + }, + { + "epoch": 1.1632799782962562, + "grad_norm": 4.359567642211914, + "learning_rate": 6.432506509524462e-06, + "loss": 0.2433, + "step": 26783 + }, + { + "epoch": 1.163293543136191, + "grad_norm": 6.462953567504883, + "learning_rate": 6.432369466904208e-06, + "loss": 0.4064, + "step": 26784 + }, + { + "epoch": 1.163307107976126, + "grad_norm": 6.281313419342041, + "learning_rate": 6.432232424283953e-06, + "loss": 0.3483, + "step": 26785 + }, + { + "epoch": 1.1633206728160608, + "grad_norm": 4.452561855316162, + "learning_rate": 6.432095381663698e-06, + "loss": 0.2038, + "step": 26786 + }, + { + "epoch": 1.1633342376559956, + "grad_norm": 4.155276298522949, + "learning_rate": 6.431958339043443e-06, + "loss": 0.1137, + "step": 26787 + }, + { + "epoch": 1.1633478024959305, + "grad_norm": 4.861182689666748, + "learning_rate": 6.431821296423189e-06, + "loss": 0.3956, + "step": 26788 + }, + { + "epoch": 1.1633613673358654, + "grad_norm": 4.211274147033691, + "learning_rate": 6.431684253802933e-06, + "loss": 0.2105, + "step": 26789 + }, + { + "epoch": 1.1633749321758002, + "grad_norm": 4.195667743682861, + "learning_rate": 6.431547211182678e-06, + "loss": 0.2599, + "step": 26790 + }, + { + "epoch": 1.163388497015735, + "grad_norm": 4.526377201080322, + "learning_rate": 6.431410168562423e-06, + "loss": 0.3125, + "step": 26791 + }, + { + "epoch": 1.1634020618556702, + "grad_norm": 4.02518367767334, + "learning_rate": 6.431273125942169e-06, + "loss": 0.2413, + "step": 26792 + }, + { + "epoch": 1.163415626695605, + "grad_norm": 5.5359296798706055, + "learning_rate": 6.431136083321914e-06, + "loss": 0.357, + "step": 26793 + }, + { + "epoch": 1.16342919153554, + "grad_norm": 5.518131732940674, + "learning_rate": 6.430999040701659e-06, + "loss": 0.2846, + "step": 26794 + }, + { + "epoch": 1.1634427563754748, + "grad_norm": 5.314362049102783, + "learning_rate": 6.430861998081403e-06, + "loss": 0.294, + "step": 26795 + }, + { + "epoch": 1.1634563212154097, + "grad_norm": 4.563444137573242, + "learning_rate": 6.4307249554611486e-06, + "loss": 0.3285, + "step": 26796 + }, + { + "epoch": 1.1634698860553445, + "grad_norm": 7.008380889892578, + "learning_rate": 6.430587912840895e-06, + "loss": 0.3412, + "step": 26797 + }, + { + "epoch": 1.1634834508952794, + "grad_norm": 6.904637336730957, + "learning_rate": 6.430450870220639e-06, + "loss": 0.3998, + "step": 26798 + }, + { + "epoch": 1.1634970157352142, + "grad_norm": 6.8329057693481445, + "learning_rate": 6.430313827600384e-06, + "loss": 0.2056, + "step": 26799 + }, + { + "epoch": 1.1635105805751493, + "grad_norm": 4.694461822509766, + "learning_rate": 6.430176784980129e-06, + "loss": 0.3013, + "step": 26800 + }, + { + "epoch": 1.1635241454150842, + "grad_norm": 4.859452247619629, + "learning_rate": 6.4300397423598745e-06, + "loss": 0.2176, + "step": 26801 + }, + { + "epoch": 1.163537710255019, + "grad_norm": 4.779811859130859, + "learning_rate": 6.42990269973962e-06, + "loss": 0.3298, + "step": 26802 + }, + { + "epoch": 1.163551275094954, + "grad_norm": 4.683691501617432, + "learning_rate": 6.429765657119365e-06, + "loss": 0.2705, + "step": 26803 + }, + { + "epoch": 1.1635648399348888, + "grad_norm": 4.780426502227783, + "learning_rate": 6.429628614499109e-06, + "loss": 0.2497, + "step": 26804 + }, + { + "epoch": 1.1635784047748237, + "grad_norm": 5.569149971008301, + "learning_rate": 6.429491571878855e-06, + "loss": 0.296, + "step": 26805 + }, + { + "epoch": 1.1635919696147585, + "grad_norm": 7.107337474822998, + "learning_rate": 6.4293545292586e-06, + "loss": 0.5272, + "step": 26806 + }, + { + "epoch": 1.1636055344546934, + "grad_norm": 6.0348429679870605, + "learning_rate": 6.429217486638345e-06, + "loss": 0.2797, + "step": 26807 + }, + { + "epoch": 1.1636190992946283, + "grad_norm": 5.870465278625488, + "learning_rate": 6.42908044401809e-06, + "loss": 0.4115, + "step": 26808 + }, + { + "epoch": 1.1636326641345631, + "grad_norm": 5.0096235275268555, + "learning_rate": 6.428943401397836e-06, + "loss": 0.2859, + "step": 26809 + }, + { + "epoch": 1.163646228974498, + "grad_norm": 5.551726341247559, + "learning_rate": 6.42880635877758e-06, + "loss": 0.2525, + "step": 26810 + }, + { + "epoch": 1.163659793814433, + "grad_norm": 5.827694892883301, + "learning_rate": 6.428669316157325e-06, + "loss": 0.2902, + "step": 26811 + }, + { + "epoch": 1.163673358654368, + "grad_norm": 8.953097343444824, + "learning_rate": 6.428532273537071e-06, + "loss": 0.6156, + "step": 26812 + }, + { + "epoch": 1.1636869234943028, + "grad_norm": 8.136966705322266, + "learning_rate": 6.428395230916815e-06, + "loss": 0.5015, + "step": 26813 + }, + { + "epoch": 1.1637004883342377, + "grad_norm": 6.091412544250488, + "learning_rate": 6.428258188296561e-06, + "loss": 0.3528, + "step": 26814 + }, + { + "epoch": 1.1637140531741725, + "grad_norm": 5.795506477355957, + "learning_rate": 6.428121145676306e-06, + "loss": 0.293, + "step": 26815 + }, + { + "epoch": 1.1637276180141074, + "grad_norm": 5.982911109924316, + "learning_rate": 6.4279841030560504e-06, + "loss": 0.3138, + "step": 26816 + }, + { + "epoch": 1.1637411828540423, + "grad_norm": 5.384082794189453, + "learning_rate": 6.427847060435796e-06, + "loss": 0.3605, + "step": 26817 + }, + { + "epoch": 1.1637547476939771, + "grad_norm": 8.366528511047363, + "learning_rate": 6.427710017815542e-06, + "loss": 0.5743, + "step": 26818 + }, + { + "epoch": 1.1637683125339122, + "grad_norm": 4.99639368057251, + "learning_rate": 6.427572975195287e-06, + "loss": 0.3429, + "step": 26819 + }, + { + "epoch": 1.163781877373847, + "grad_norm": 5.797994136810303, + "learning_rate": 6.427435932575031e-06, + "loss": 0.4002, + "step": 26820 + }, + { + "epoch": 1.163795442213782, + "grad_norm": 5.834357738494873, + "learning_rate": 6.427298889954776e-06, + "loss": 0.4993, + "step": 26821 + }, + { + "epoch": 1.1638090070537168, + "grad_norm": 5.987275123596191, + "learning_rate": 6.427161847334522e-06, + "loss": 0.2262, + "step": 26822 + }, + { + "epoch": 1.1638225718936517, + "grad_norm": 5.051316261291504, + "learning_rate": 6.427024804714267e-06, + "loss": 0.2693, + "step": 26823 + }, + { + "epoch": 1.1638361367335865, + "grad_norm": 5.428780555725098, + "learning_rate": 6.426887762094012e-06, + "loss": 0.3853, + "step": 26824 + }, + { + "epoch": 1.1638497015735214, + "grad_norm": 5.601297855377197, + "learning_rate": 6.426750719473756e-06, + "loss": 0.5022, + "step": 26825 + }, + { + "epoch": 1.1638632664134563, + "grad_norm": 5.773423194885254, + "learning_rate": 6.426613676853501e-06, + "loss": 0.3637, + "step": 26826 + }, + { + "epoch": 1.1638768312533911, + "grad_norm": 5.620026588439941, + "learning_rate": 6.426476634233247e-06, + "loss": 0.2478, + "step": 26827 + }, + { + "epoch": 1.163890396093326, + "grad_norm": 6.105801582336426, + "learning_rate": 6.426339591612993e-06, + "loss": 0.3717, + "step": 26828 + }, + { + "epoch": 1.1639039609332609, + "grad_norm": 6.031732559204102, + "learning_rate": 6.426202548992737e-06, + "loss": 0.4518, + "step": 26829 + }, + { + "epoch": 1.163917525773196, + "grad_norm": 4.918461799621582, + "learning_rate": 6.426065506372482e-06, + "loss": 0.3031, + "step": 26830 + }, + { + "epoch": 1.1639310906131308, + "grad_norm": 5.904663562774658, + "learning_rate": 6.425928463752228e-06, + "loss": 0.3464, + "step": 26831 + }, + { + "epoch": 1.1639446554530657, + "grad_norm": 7.107688903808594, + "learning_rate": 6.4257914211319725e-06, + "loss": 0.383, + "step": 26832 + }, + { + "epoch": 1.1639582202930006, + "grad_norm": 5.7092671394348145, + "learning_rate": 6.425654378511718e-06, + "loss": 0.3764, + "step": 26833 + }, + { + "epoch": 1.1639717851329354, + "grad_norm": 6.515254497528076, + "learning_rate": 6.425517335891463e-06, + "loss": 0.2659, + "step": 26834 + }, + { + "epoch": 1.1639853499728703, + "grad_norm": 5.848471641540527, + "learning_rate": 6.425380293271208e-06, + "loss": 0.2809, + "step": 26835 + }, + { + "epoch": 1.1639989148128052, + "grad_norm": 6.498802661895752, + "learning_rate": 6.425243250650953e-06, + "loss": 0.3044, + "step": 26836 + }, + { + "epoch": 1.16401247965274, + "grad_norm": 4.8305253982543945, + "learning_rate": 6.425106208030698e-06, + "loss": 0.2412, + "step": 26837 + }, + { + "epoch": 1.164026044492675, + "grad_norm": 4.247095108032227, + "learning_rate": 6.424969165410443e-06, + "loss": 0.2473, + "step": 26838 + }, + { + "epoch": 1.16403960933261, + "grad_norm": 5.017361640930176, + "learning_rate": 6.424832122790188e-06, + "loss": 0.1713, + "step": 26839 + }, + { + "epoch": 1.1640531741725448, + "grad_norm": 4.474168300628662, + "learning_rate": 6.424695080169934e-06, + "loss": 0.22, + "step": 26840 + }, + { + "epoch": 1.1640667390124797, + "grad_norm": 4.64225435256958, + "learning_rate": 6.424558037549678e-06, + "loss": 0.2929, + "step": 26841 + }, + { + "epoch": 1.1640803038524146, + "grad_norm": 5.503393173217773, + "learning_rate": 6.424420994929423e-06, + "loss": 0.3636, + "step": 26842 + }, + { + "epoch": 1.1640938686923494, + "grad_norm": 5.005801200866699, + "learning_rate": 6.424283952309169e-06, + "loss": 0.3464, + "step": 26843 + }, + { + "epoch": 1.1641074335322843, + "grad_norm": 4.74950647354126, + "learning_rate": 6.424146909688915e-06, + "loss": 0.2953, + "step": 26844 + }, + { + "epoch": 1.1641209983722192, + "grad_norm": 4.4505486488342285, + "learning_rate": 6.424009867068659e-06, + "loss": 0.2272, + "step": 26845 + }, + { + "epoch": 1.164134563212154, + "grad_norm": 3.707432270050049, + "learning_rate": 6.423872824448404e-06, + "loss": 0.1882, + "step": 26846 + }, + { + "epoch": 1.164148128052089, + "grad_norm": 3.9277989864349365, + "learning_rate": 6.4237357818281484e-06, + "loss": 0.337, + "step": 26847 + }, + { + "epoch": 1.1641616928920238, + "grad_norm": 4.537796974182129, + "learning_rate": 6.4235987392078945e-06, + "loss": 0.2308, + "step": 26848 + }, + { + "epoch": 1.1641752577319588, + "grad_norm": 5.4411420822143555, + "learning_rate": 6.42346169658764e-06, + "loss": 0.3006, + "step": 26849 + }, + { + "epoch": 1.1641888225718937, + "grad_norm": 5.26201868057251, + "learning_rate": 6.423324653967384e-06, + "loss": 0.2806, + "step": 26850 + }, + { + "epoch": 1.1642023874118286, + "grad_norm": 4.545683860778809, + "learning_rate": 6.423187611347129e-06, + "loss": 0.2636, + "step": 26851 + }, + { + "epoch": 1.1642159522517634, + "grad_norm": 5.871603012084961, + "learning_rate": 6.423050568726874e-06, + "loss": 0.3542, + "step": 26852 + }, + { + "epoch": 1.1642295170916983, + "grad_norm": 5.597859859466553, + "learning_rate": 6.42291352610662e-06, + "loss": 0.4518, + "step": 26853 + }, + { + "epoch": 1.1642430819316332, + "grad_norm": 6.024733543395996, + "learning_rate": 6.422776483486365e-06, + "loss": 0.3193, + "step": 26854 + }, + { + "epoch": 1.164256646771568, + "grad_norm": 6.770734786987305, + "learning_rate": 6.42263944086611e-06, + "loss": 0.4758, + "step": 26855 + }, + { + "epoch": 1.164270211611503, + "grad_norm": 6.297604560852051, + "learning_rate": 6.422502398245854e-06, + "loss": 0.535, + "step": 26856 + }, + { + "epoch": 1.164283776451438, + "grad_norm": 5.692812442779541, + "learning_rate": 6.4223653556256e-06, + "loss": 0.358, + "step": 26857 + }, + { + "epoch": 1.1642973412913729, + "grad_norm": 6.0868706703186035, + "learning_rate": 6.422228313005345e-06, + "loss": 0.384, + "step": 26858 + }, + { + "epoch": 1.1643109061313077, + "grad_norm": 3.3435206413269043, + "learning_rate": 6.42209127038509e-06, + "loss": 0.1857, + "step": 26859 + }, + { + "epoch": 1.1643244709712426, + "grad_norm": 6.860858917236328, + "learning_rate": 6.421954227764835e-06, + "loss": 0.4563, + "step": 26860 + }, + { + "epoch": 1.1643380358111775, + "grad_norm": 6.375263214111328, + "learning_rate": 6.421817185144581e-06, + "loss": 0.2784, + "step": 26861 + }, + { + "epoch": 1.1643516006511123, + "grad_norm": 4.151110649108887, + "learning_rate": 6.421680142524326e-06, + "loss": 0.1393, + "step": 26862 + }, + { + "epoch": 1.1643651654910472, + "grad_norm": 5.4000563621521, + "learning_rate": 6.4215430999040705e-06, + "loss": 0.3919, + "step": 26863 + }, + { + "epoch": 1.164378730330982, + "grad_norm": 5.6498870849609375, + "learning_rate": 6.421406057283816e-06, + "loss": 0.3152, + "step": 26864 + }, + { + "epoch": 1.164392295170917, + "grad_norm": 7.263367652893066, + "learning_rate": 6.42126901466356e-06, + "loss": 0.3862, + "step": 26865 + }, + { + "epoch": 1.1644058600108518, + "grad_norm": 5.289600849151611, + "learning_rate": 6.421131972043306e-06, + "loss": 0.3028, + "step": 26866 + }, + { + "epoch": 1.1644194248507866, + "grad_norm": 5.326425552368164, + "learning_rate": 6.420994929423051e-06, + "loss": 0.2292, + "step": 26867 + }, + { + "epoch": 1.1644329896907217, + "grad_norm": 3.9598898887634277, + "learning_rate": 6.420857886802796e-06, + "loss": 0.1788, + "step": 26868 + }, + { + "epoch": 1.1644465545306566, + "grad_norm": 5.810478687286377, + "learning_rate": 6.420720844182541e-06, + "loss": 0.2442, + "step": 26869 + }, + { + "epoch": 1.1644601193705915, + "grad_norm": 6.123910427093506, + "learning_rate": 6.420583801562287e-06, + "loss": 0.2907, + "step": 26870 + }, + { + "epoch": 1.1644736842105263, + "grad_norm": 5.592823028564453, + "learning_rate": 6.420446758942032e-06, + "loss": 0.2732, + "step": 26871 + }, + { + "epoch": 1.1644872490504612, + "grad_norm": 4.542867660522461, + "learning_rate": 6.420309716321776e-06, + "loss": 0.336, + "step": 26872 + }, + { + "epoch": 1.164500813890396, + "grad_norm": 7.079636096954346, + "learning_rate": 6.420172673701521e-06, + "loss": 0.2814, + "step": 26873 + }, + { + "epoch": 1.164514378730331, + "grad_norm": 3.5662786960601807, + "learning_rate": 6.4200356310812674e-06, + "loss": 0.2036, + "step": 26874 + }, + { + "epoch": 1.1645279435702658, + "grad_norm": 4.140964508056641, + "learning_rate": 6.419898588461012e-06, + "loss": 0.1864, + "step": 26875 + }, + { + "epoch": 1.1645415084102009, + "grad_norm": 5.595582962036133, + "learning_rate": 6.419761545840757e-06, + "loss": 0.2713, + "step": 26876 + }, + { + "epoch": 1.1645550732501357, + "grad_norm": 5.3814802169799805, + "learning_rate": 6.419624503220502e-06, + "loss": 0.2126, + "step": 26877 + }, + { + "epoch": 1.1645686380900706, + "grad_norm": 6.4130120277404785, + "learning_rate": 6.419487460600248e-06, + "loss": 0.2629, + "step": 26878 + }, + { + "epoch": 1.1645822029300055, + "grad_norm": 4.808178424835205, + "learning_rate": 6.4193504179799925e-06, + "loss": 0.174, + "step": 26879 + }, + { + "epoch": 1.1645957677699403, + "grad_norm": 5.081374645233154, + "learning_rate": 6.419213375359738e-06, + "loss": 0.2364, + "step": 26880 + }, + { + "epoch": 1.1646093326098752, + "grad_norm": 3.797034502029419, + "learning_rate": 6.419076332739482e-06, + "loss": 0.1449, + "step": 26881 + }, + { + "epoch": 1.16462289744981, + "grad_norm": 4.322726249694824, + "learning_rate": 6.418939290119227e-06, + "loss": 0.1541, + "step": 26882 + }, + { + "epoch": 1.164636462289745, + "grad_norm": 4.846935749053955, + "learning_rate": 6.418802247498973e-06, + "loss": 0.2487, + "step": 26883 + }, + { + "epoch": 1.1646500271296798, + "grad_norm": 3.550804615020752, + "learning_rate": 6.4186652048787175e-06, + "loss": 0.1573, + "step": 26884 + }, + { + "epoch": 1.1646635919696147, + "grad_norm": 4.796963691711426, + "learning_rate": 6.418528162258463e-06, + "loss": 0.2188, + "step": 26885 + }, + { + "epoch": 1.1646771568095495, + "grad_norm": 3.7324531078338623, + "learning_rate": 6.418391119638208e-06, + "loss": 0.2275, + "step": 26886 + }, + { + "epoch": 1.1646907216494846, + "grad_norm": 6.595055103302002, + "learning_rate": 6.418254077017954e-06, + "loss": 0.3893, + "step": 26887 + }, + { + "epoch": 1.1647042864894195, + "grad_norm": 4.696346282958984, + "learning_rate": 6.418117034397698e-06, + "loss": 0.2339, + "step": 26888 + }, + { + "epoch": 1.1647178513293543, + "grad_norm": 5.016582489013672, + "learning_rate": 6.417979991777443e-06, + "loss": 0.285, + "step": 26889 + }, + { + "epoch": 1.1647314161692892, + "grad_norm": 5.373964786529541, + "learning_rate": 6.417842949157188e-06, + "loss": 0.3009, + "step": 26890 + }, + { + "epoch": 1.164744981009224, + "grad_norm": 4.540542125701904, + "learning_rate": 6.417705906536934e-06, + "loss": 0.1445, + "step": 26891 + }, + { + "epoch": 1.164758545849159, + "grad_norm": 5.163053035736084, + "learning_rate": 6.417568863916679e-06, + "loss": 0.3072, + "step": 26892 + }, + { + "epoch": 1.1647721106890938, + "grad_norm": 4.323793411254883, + "learning_rate": 6.417431821296424e-06, + "loss": 0.1774, + "step": 26893 + }, + { + "epoch": 1.1647856755290287, + "grad_norm": 5.058680534362793, + "learning_rate": 6.4172947786761685e-06, + "loss": 0.1467, + "step": 26894 + }, + { + "epoch": 1.1647992403689638, + "grad_norm": 6.0671892166137695, + "learning_rate": 6.417157736055914e-06, + "loss": 0.357, + "step": 26895 + }, + { + "epoch": 1.1648128052088986, + "grad_norm": 5.283731460571289, + "learning_rate": 6.41702069343566e-06, + "loss": 0.2757, + "step": 26896 + }, + { + "epoch": 1.1648263700488335, + "grad_norm": 3.2605996131896973, + "learning_rate": 6.416883650815404e-06, + "loss": 0.1404, + "step": 26897 + }, + { + "epoch": 1.1648399348887684, + "grad_norm": 6.881748676300049, + "learning_rate": 6.416746608195149e-06, + "loss": 0.3163, + "step": 26898 + }, + { + "epoch": 1.1648534997287032, + "grad_norm": 5.838070869445801, + "learning_rate": 6.4166095655748935e-06, + "loss": 0.3856, + "step": 26899 + }, + { + "epoch": 1.164867064568638, + "grad_norm": 4.44316291809082, + "learning_rate": 6.4164725229546395e-06, + "loss": 0.2329, + "step": 26900 + }, + { + "epoch": 1.164880629408573, + "grad_norm": 3.984107494354248, + "learning_rate": 6.416335480334385e-06, + "loss": 0.2466, + "step": 26901 + }, + { + "epoch": 1.1648941942485078, + "grad_norm": 5.773255825042725, + "learning_rate": 6.41619843771413e-06, + "loss": 0.2675, + "step": 26902 + }, + { + "epoch": 1.1649077590884427, + "grad_norm": 4.195574760437012, + "learning_rate": 6.416061395093874e-06, + "loss": 0.2218, + "step": 26903 + }, + { + "epoch": 1.1649213239283776, + "grad_norm": 5.482888221740723, + "learning_rate": 6.41592435247362e-06, + "loss": 0.2365, + "step": 26904 + }, + { + "epoch": 1.1649348887683124, + "grad_norm": 4.511685848236084, + "learning_rate": 6.4157873098533654e-06, + "loss": 0.2653, + "step": 26905 + }, + { + "epoch": 1.1649484536082475, + "grad_norm": 5.914771556854248, + "learning_rate": 6.41565026723311e-06, + "loss": 0.3011, + "step": 26906 + }, + { + "epoch": 1.1649620184481824, + "grad_norm": 4.429378032684326, + "learning_rate": 6.415513224612855e-06, + "loss": 0.2046, + "step": 26907 + }, + { + "epoch": 1.1649755832881172, + "grad_norm": 4.688214302062988, + "learning_rate": 6.4153761819926e-06, + "loss": 0.223, + "step": 26908 + }, + { + "epoch": 1.164989148128052, + "grad_norm": 3.751932382583618, + "learning_rate": 6.415239139372345e-06, + "loss": 0.1442, + "step": 26909 + }, + { + "epoch": 1.165002712967987, + "grad_norm": 6.113059043884277, + "learning_rate": 6.4151020967520905e-06, + "loss": 0.2827, + "step": 26910 + }, + { + "epoch": 1.1650162778079218, + "grad_norm": 5.636770725250244, + "learning_rate": 6.414965054131836e-06, + "loss": 0.2799, + "step": 26911 + }, + { + "epoch": 1.1650298426478567, + "grad_norm": 8.969070434570312, + "learning_rate": 6.41482801151158e-06, + "loss": 0.1989, + "step": 26912 + }, + { + "epoch": 1.1650434074877916, + "grad_norm": 5.66188383102417, + "learning_rate": 6.414690968891326e-06, + "loss": 0.2174, + "step": 26913 + }, + { + "epoch": 1.1650569723277266, + "grad_norm": 4.817605018615723, + "learning_rate": 6.414553926271071e-06, + "loss": 0.2602, + "step": 26914 + }, + { + "epoch": 1.1650705371676615, + "grad_norm": 4.505696773529053, + "learning_rate": 6.4144168836508155e-06, + "loss": 0.1731, + "step": 26915 + }, + { + "epoch": 1.1650841020075964, + "grad_norm": 4.401054859161377, + "learning_rate": 6.414279841030561e-06, + "loss": 0.244, + "step": 26916 + }, + { + "epoch": 1.1650976668475312, + "grad_norm": 3.648846387863159, + "learning_rate": 6.414142798410307e-06, + "loss": 0.1932, + "step": 26917 + }, + { + "epoch": 1.165111231687466, + "grad_norm": 4.150310516357422, + "learning_rate": 6.414005755790051e-06, + "loss": 0.2177, + "step": 26918 + }, + { + "epoch": 1.165124796527401, + "grad_norm": 4.488412857055664, + "learning_rate": 6.413868713169796e-06, + "loss": 0.2182, + "step": 26919 + }, + { + "epoch": 1.1651383613673358, + "grad_norm": 5.862185001373291, + "learning_rate": 6.4137316705495414e-06, + "loss": 0.3408, + "step": 26920 + }, + { + "epoch": 1.1651519262072707, + "grad_norm": 4.351049423217773, + "learning_rate": 6.413594627929286e-06, + "loss": 0.1848, + "step": 26921 + }, + { + "epoch": 1.1651654910472056, + "grad_norm": 5.496407985687256, + "learning_rate": 6.413457585309032e-06, + "loss": 0.259, + "step": 26922 + }, + { + "epoch": 1.1651790558871404, + "grad_norm": 3.5242109298706055, + "learning_rate": 6.413320542688777e-06, + "loss": 0.145, + "step": 26923 + }, + { + "epoch": 1.1651926207270753, + "grad_norm": 6.343941688537598, + "learning_rate": 6.413183500068521e-06, + "loss": 0.3212, + "step": 26924 + }, + { + "epoch": 1.1652061855670104, + "grad_norm": 3.8706812858581543, + "learning_rate": 6.4130464574482665e-06, + "loss": 0.162, + "step": 26925 + }, + { + "epoch": 1.1652197504069453, + "grad_norm": 6.34695291519165, + "learning_rate": 6.4129094148280125e-06, + "loss": 0.2214, + "step": 26926 + }, + { + "epoch": 1.1652333152468801, + "grad_norm": 3.3642663955688477, + "learning_rate": 6.412772372207758e-06, + "loss": 0.1761, + "step": 26927 + }, + { + "epoch": 1.165246880086815, + "grad_norm": 4.165257930755615, + "learning_rate": 6.412635329587502e-06, + "loss": 0.2329, + "step": 26928 + }, + { + "epoch": 1.1652604449267498, + "grad_norm": 4.117417335510254, + "learning_rate": 6.412498286967247e-06, + "loss": 0.1351, + "step": 26929 + }, + { + "epoch": 1.1652740097666847, + "grad_norm": 4.719660758972168, + "learning_rate": 6.412361244346993e-06, + "loss": 0.2078, + "step": 26930 + }, + { + "epoch": 1.1652875746066196, + "grad_norm": 5.7895684242248535, + "learning_rate": 6.4122242017267375e-06, + "loss": 0.324, + "step": 26931 + }, + { + "epoch": 1.1653011394465544, + "grad_norm": 6.734835624694824, + "learning_rate": 6.412087159106483e-06, + "loss": 0.3375, + "step": 26932 + }, + { + "epoch": 1.1653147042864895, + "grad_norm": 3.5618155002593994, + "learning_rate": 6.411950116486227e-06, + "loss": 0.1305, + "step": 26933 + }, + { + "epoch": 1.1653282691264244, + "grad_norm": 4.498777389526367, + "learning_rate": 6.411813073865972e-06, + "loss": 0.1945, + "step": 26934 + }, + { + "epoch": 1.1653418339663593, + "grad_norm": 5.092092514038086, + "learning_rate": 6.411676031245718e-06, + "loss": 0.2628, + "step": 26935 + }, + { + "epoch": 1.1653553988062941, + "grad_norm": 6.329805374145508, + "learning_rate": 6.4115389886254634e-06, + "loss": 0.2269, + "step": 26936 + }, + { + "epoch": 1.165368963646229, + "grad_norm": 5.12142276763916, + "learning_rate": 6.411401946005208e-06, + "loss": 0.2325, + "step": 26937 + }, + { + "epoch": 1.1653825284861639, + "grad_norm": 4.696747779846191, + "learning_rate": 6.411264903384953e-06, + "loss": 0.1829, + "step": 26938 + }, + { + "epoch": 1.1653960933260987, + "grad_norm": 8.016425132751465, + "learning_rate": 6.411127860764699e-06, + "loss": 0.1955, + "step": 26939 + }, + { + "epoch": 1.1654096581660336, + "grad_norm": 3.5601296424865723, + "learning_rate": 6.410990818144443e-06, + "loss": 0.1725, + "step": 26940 + }, + { + "epoch": 1.1654232230059685, + "grad_norm": 3.623821496963501, + "learning_rate": 6.4108537755241885e-06, + "loss": 0.197, + "step": 26941 + }, + { + "epoch": 1.1654367878459033, + "grad_norm": 3.090599536895752, + "learning_rate": 6.410716732903934e-06, + "loss": 0.1406, + "step": 26942 + }, + { + "epoch": 1.1654503526858382, + "grad_norm": 3.913475513458252, + "learning_rate": 6.410579690283679e-06, + "loss": 0.212, + "step": 26943 + }, + { + "epoch": 1.1654639175257733, + "grad_norm": 3.455714225769043, + "learning_rate": 6.410442647663424e-06, + "loss": 0.1667, + "step": 26944 + }, + { + "epoch": 1.1654774823657081, + "grad_norm": 3.4106855392456055, + "learning_rate": 6.410305605043169e-06, + "loss": 0.1602, + "step": 26945 + }, + { + "epoch": 1.165491047205643, + "grad_norm": 4.519370079040527, + "learning_rate": 6.4101685624229135e-06, + "loss": 0.1952, + "step": 26946 + }, + { + "epoch": 1.1655046120455779, + "grad_norm": 5.0282511711120605, + "learning_rate": 6.4100315198026596e-06, + "loss": 0.2121, + "step": 26947 + }, + { + "epoch": 1.1655181768855127, + "grad_norm": 3.4071366786956787, + "learning_rate": 6.409894477182405e-06, + "loss": 0.1542, + "step": 26948 + }, + { + "epoch": 1.1655317417254476, + "grad_norm": 6.080219745635986, + "learning_rate": 6.409757434562149e-06, + "loss": 0.1931, + "step": 26949 + }, + { + "epoch": 1.1655453065653825, + "grad_norm": 4.6553120613098145, + "learning_rate": 6.409620391941894e-06, + "loss": 0.1805, + "step": 26950 + }, + { + "epoch": 1.1655588714053173, + "grad_norm": 6.277866840362549, + "learning_rate": 6.4094833493216394e-06, + "loss": 0.3023, + "step": 26951 + }, + { + "epoch": 1.1655724362452524, + "grad_norm": 4.034304141998291, + "learning_rate": 6.409346306701385e-06, + "loss": 0.194, + "step": 26952 + }, + { + "epoch": 1.1655860010851873, + "grad_norm": 3.25529146194458, + "learning_rate": 6.40920926408113e-06, + "loss": 0.1244, + "step": 26953 + }, + { + "epoch": 1.1655995659251221, + "grad_norm": 5.879905700683594, + "learning_rate": 6.409072221460875e-06, + "loss": 0.2842, + "step": 26954 + }, + { + "epoch": 1.165613130765057, + "grad_norm": 3.085268259048462, + "learning_rate": 6.408935178840619e-06, + "loss": 0.1305, + "step": 26955 + }, + { + "epoch": 1.1656266956049919, + "grad_norm": 6.120149612426758, + "learning_rate": 6.408798136220365e-06, + "loss": 0.2751, + "step": 26956 + }, + { + "epoch": 1.1656402604449267, + "grad_norm": 4.262354850769043, + "learning_rate": 6.4086610936001105e-06, + "loss": 0.2153, + "step": 26957 + }, + { + "epoch": 1.1656538252848616, + "grad_norm": 4.3665289878845215, + "learning_rate": 6.408524050979855e-06, + "loss": 0.2527, + "step": 26958 + }, + { + "epoch": 1.1656673901247965, + "grad_norm": 4.5562567710876465, + "learning_rate": 6.4083870083596e-06, + "loss": 0.225, + "step": 26959 + }, + { + "epoch": 1.1656809549647313, + "grad_norm": 4.958603858947754, + "learning_rate": 6.408249965739346e-06, + "loss": 0.1926, + "step": 26960 + }, + { + "epoch": 1.1656945198046662, + "grad_norm": 3.321284294128418, + "learning_rate": 6.408112923119091e-06, + "loss": 0.1658, + "step": 26961 + }, + { + "epoch": 1.165708084644601, + "grad_norm": 4.414191246032715, + "learning_rate": 6.4079758804988356e-06, + "loss": 0.2313, + "step": 26962 + }, + { + "epoch": 1.1657216494845362, + "grad_norm": 5.0662031173706055, + "learning_rate": 6.407838837878581e-06, + "loss": 0.255, + "step": 26963 + }, + { + "epoch": 1.165735214324471, + "grad_norm": 5.462818622589111, + "learning_rate": 6.407701795258325e-06, + "loss": 0.3143, + "step": 26964 + }, + { + "epoch": 1.165748779164406, + "grad_norm": 4.311593055725098, + "learning_rate": 6.407564752638071e-06, + "loss": 0.3016, + "step": 26965 + }, + { + "epoch": 1.1657623440043408, + "grad_norm": 4.4971137046813965, + "learning_rate": 6.407427710017816e-06, + "loss": 0.1795, + "step": 26966 + }, + { + "epoch": 1.1657759088442756, + "grad_norm": 4.2699408531188965, + "learning_rate": 6.407290667397561e-06, + "loss": 0.1449, + "step": 26967 + }, + { + "epoch": 1.1657894736842105, + "grad_norm": 3.166996479034424, + "learning_rate": 6.407153624777306e-06, + "loss": 0.1594, + "step": 26968 + }, + { + "epoch": 1.1658030385241454, + "grad_norm": 4.258760452270508, + "learning_rate": 6.407016582157052e-06, + "loss": 0.1878, + "step": 26969 + }, + { + "epoch": 1.1658166033640802, + "grad_norm": 4.057301044464111, + "learning_rate": 6.406879539536797e-06, + "loss": 0.2002, + "step": 26970 + }, + { + "epoch": 1.1658301682040153, + "grad_norm": 4.743370532989502, + "learning_rate": 6.406742496916541e-06, + "loss": 0.168, + "step": 26971 + }, + { + "epoch": 1.1658437330439502, + "grad_norm": 4.191791534423828, + "learning_rate": 6.4066054542962865e-06, + "loss": 0.2523, + "step": 26972 + }, + { + "epoch": 1.165857297883885, + "grad_norm": 4.4098005294799805, + "learning_rate": 6.4064684116760325e-06, + "loss": 0.1694, + "step": 26973 + }, + { + "epoch": 1.16587086272382, + "grad_norm": 6.834366798400879, + "learning_rate": 6.406331369055777e-06, + "loss": 0.4321, + "step": 26974 + }, + { + "epoch": 1.1658844275637548, + "grad_norm": 3.2818238735198975, + "learning_rate": 6.406194326435522e-06, + "loss": 0.2019, + "step": 26975 + }, + { + "epoch": 1.1658979924036896, + "grad_norm": 6.025644302368164, + "learning_rate": 6.406057283815267e-06, + "loss": 0.246, + "step": 26976 + }, + { + "epoch": 1.1659115572436245, + "grad_norm": 5.5677595138549805, + "learning_rate": 6.4059202411950115e-06, + "loss": 0.2007, + "step": 26977 + }, + { + "epoch": 1.1659251220835594, + "grad_norm": 4.333685874938965, + "learning_rate": 6.4057831985747576e-06, + "loss": 0.2759, + "step": 26978 + }, + { + "epoch": 1.1659386869234942, + "grad_norm": 4.121993541717529, + "learning_rate": 6.405646155954503e-06, + "loss": 0.2101, + "step": 26979 + }, + { + "epoch": 1.165952251763429, + "grad_norm": 6.398849010467529, + "learning_rate": 6.405509113334247e-06, + "loss": 0.2556, + "step": 26980 + }, + { + "epoch": 1.165965816603364, + "grad_norm": 5.850114345550537, + "learning_rate": 6.405372070713992e-06, + "loss": 0.1995, + "step": 26981 + }, + { + "epoch": 1.165979381443299, + "grad_norm": 4.333865642547607, + "learning_rate": 6.405235028093738e-06, + "loss": 0.2009, + "step": 26982 + }, + { + "epoch": 1.165992946283234, + "grad_norm": 6.074449062347412, + "learning_rate": 6.405097985473483e-06, + "loss": 0.373, + "step": 26983 + }, + { + "epoch": 1.1660065111231688, + "grad_norm": 4.6887102127075195, + "learning_rate": 6.404960942853228e-06, + "loss": 0.2902, + "step": 26984 + }, + { + "epoch": 1.1660200759631036, + "grad_norm": 4.228024959564209, + "learning_rate": 6.404823900232973e-06, + "loss": 0.1381, + "step": 26985 + }, + { + "epoch": 1.1660336408030385, + "grad_norm": 6.719982147216797, + "learning_rate": 6.404686857612719e-06, + "loss": 0.2681, + "step": 26986 + }, + { + "epoch": 1.1660472056429734, + "grad_norm": 7.605882167816162, + "learning_rate": 6.404549814992463e-06, + "loss": 0.3576, + "step": 26987 + }, + { + "epoch": 1.1660607704829082, + "grad_norm": 5.389388084411621, + "learning_rate": 6.4044127723722085e-06, + "loss": 0.2584, + "step": 26988 + }, + { + "epoch": 1.166074335322843, + "grad_norm": 4.90779447555542, + "learning_rate": 6.404275729751953e-06, + "loss": 0.2177, + "step": 26989 + }, + { + "epoch": 1.1660879001627782, + "grad_norm": 6.643498420715332, + "learning_rate": 6.404138687131698e-06, + "loss": 0.2711, + "step": 26990 + }, + { + "epoch": 1.166101465002713, + "grad_norm": 6.531632900238037, + "learning_rate": 6.404001644511444e-06, + "loss": 0.24, + "step": 26991 + }, + { + "epoch": 1.166115029842648, + "grad_norm": 4.100675106048584, + "learning_rate": 6.403864601891188e-06, + "loss": 0.1744, + "step": 26992 + }, + { + "epoch": 1.1661285946825828, + "grad_norm": 5.13504695892334, + "learning_rate": 6.4037275592709336e-06, + "loss": 0.1752, + "step": 26993 + }, + { + "epoch": 1.1661421595225177, + "grad_norm": 4.300119400024414, + "learning_rate": 6.403590516650679e-06, + "loss": 0.215, + "step": 26994 + }, + { + "epoch": 1.1661557243624525, + "grad_norm": 5.190310955047607, + "learning_rate": 6.403453474030425e-06, + "loss": 0.2198, + "step": 26995 + }, + { + "epoch": 1.1661692892023874, + "grad_norm": 5.963575839996338, + "learning_rate": 6.403316431410169e-06, + "loss": 0.2297, + "step": 26996 + }, + { + "epoch": 1.1661828540423222, + "grad_norm": 4.779250144958496, + "learning_rate": 6.403179388789914e-06, + "loss": 0.1358, + "step": 26997 + }, + { + "epoch": 1.1661964188822571, + "grad_norm": 4.930896759033203, + "learning_rate": 6.403042346169659e-06, + "loss": 0.2006, + "step": 26998 + }, + { + "epoch": 1.166209983722192, + "grad_norm": 5.402918338775635, + "learning_rate": 6.402905303549405e-06, + "loss": 0.2509, + "step": 26999 + }, + { + "epoch": 1.1662235485621268, + "grad_norm": 4.1718974113464355, + "learning_rate": 6.40276826092915e-06, + "loss": 0.177, + "step": 27000 + }, + { + "epoch": 1.166237113402062, + "grad_norm": 4.879330635070801, + "learning_rate": 6.402631218308894e-06, + "loss": 0.2233, + "step": 27001 + }, + { + "epoch": 1.1662506782419968, + "grad_norm": 4.922762870788574, + "learning_rate": 6.402494175688639e-06, + "loss": 0.1411, + "step": 27002 + }, + { + "epoch": 1.1662642430819317, + "grad_norm": 6.313879489898682, + "learning_rate": 6.4023571330683845e-06, + "loss": 0.2534, + "step": 27003 + }, + { + "epoch": 1.1662778079218665, + "grad_norm": 5.717152118682861, + "learning_rate": 6.4022200904481305e-06, + "loss": 0.2294, + "step": 27004 + }, + { + "epoch": 1.1662913727618014, + "grad_norm": 5.771038055419922, + "learning_rate": 6.402083047827875e-06, + "loss": 0.3227, + "step": 27005 + }, + { + "epoch": 1.1663049376017363, + "grad_norm": 6.1629838943481445, + "learning_rate": 6.40194600520762e-06, + "loss": 0.3014, + "step": 27006 + }, + { + "epoch": 1.1663185024416711, + "grad_norm": 5.534049987792969, + "learning_rate": 6.401808962587364e-06, + "loss": 0.2375, + "step": 27007 + }, + { + "epoch": 1.166332067281606, + "grad_norm": 5.431467533111572, + "learning_rate": 6.40167191996711e-06, + "loss": 0.2337, + "step": 27008 + }, + { + "epoch": 1.166345632121541, + "grad_norm": 6.151732444763184, + "learning_rate": 6.4015348773468556e-06, + "loss": 0.2625, + "step": 27009 + }, + { + "epoch": 1.166359196961476, + "grad_norm": 5.7048234939575195, + "learning_rate": 6.401397834726601e-06, + "loss": 0.2799, + "step": 27010 + }, + { + "epoch": 1.1663727618014108, + "grad_norm": 7.435125350952148, + "learning_rate": 6.401260792106345e-06, + "loss": 0.2424, + "step": 27011 + }, + { + "epoch": 1.1663863266413457, + "grad_norm": 5.7435150146484375, + "learning_rate": 6.401123749486091e-06, + "loss": 0.1903, + "step": 27012 + }, + { + "epoch": 1.1663998914812805, + "grad_norm": 4.590963363647461, + "learning_rate": 6.400986706865836e-06, + "loss": 0.2025, + "step": 27013 + }, + { + "epoch": 1.1664134563212154, + "grad_norm": 3.984365224838257, + "learning_rate": 6.400849664245581e-06, + "loss": 0.2035, + "step": 27014 + }, + { + "epoch": 1.1664270211611503, + "grad_norm": 4.562366962432861, + "learning_rate": 6.400712621625326e-06, + "loss": 0.1346, + "step": 27015 + }, + { + "epoch": 1.1664405860010851, + "grad_norm": 5.729389667510986, + "learning_rate": 6.400575579005072e-06, + "loss": 0.2439, + "step": 27016 + }, + { + "epoch": 1.16645415084102, + "grad_norm": 4.928016662597656, + "learning_rate": 6.400438536384816e-06, + "loss": 0.1595, + "step": 27017 + }, + { + "epoch": 1.1664677156809549, + "grad_norm": 4.379981994628906, + "learning_rate": 6.400301493764561e-06, + "loss": 0.1801, + "step": 27018 + }, + { + "epoch": 1.1664812805208897, + "grad_norm": 5.156681060791016, + "learning_rate": 6.4001644511443065e-06, + "loss": 0.2084, + "step": 27019 + }, + { + "epoch": 1.1664948453608248, + "grad_norm": 4.344117164611816, + "learning_rate": 6.400027408524051e-06, + "loss": 0.164, + "step": 27020 + }, + { + "epoch": 1.1665084102007597, + "grad_norm": 5.588780403137207, + "learning_rate": 6.399890365903797e-06, + "loss": 0.2902, + "step": 27021 + }, + { + "epoch": 1.1665219750406945, + "grad_norm": 4.604538917541504, + "learning_rate": 6.399753323283542e-06, + "loss": 0.1863, + "step": 27022 + }, + { + "epoch": 1.1665355398806294, + "grad_norm": 6.414647579193115, + "learning_rate": 6.399616280663286e-06, + "loss": 0.3324, + "step": 27023 + }, + { + "epoch": 1.1665491047205643, + "grad_norm": 5.680800437927246, + "learning_rate": 6.3994792380430316e-06, + "loss": 0.3002, + "step": 27024 + }, + { + "epoch": 1.1665626695604991, + "grad_norm": 6.096127510070801, + "learning_rate": 6.399342195422778e-06, + "loss": 0.2849, + "step": 27025 + }, + { + "epoch": 1.166576234400434, + "grad_norm": 5.817878723144531, + "learning_rate": 6.399205152802522e-06, + "loss": 0.286, + "step": 27026 + }, + { + "epoch": 1.1665897992403689, + "grad_norm": 4.54317045211792, + "learning_rate": 6.399068110182267e-06, + "loss": 0.2315, + "step": 27027 + }, + { + "epoch": 1.166603364080304, + "grad_norm": 6.4238176345825195, + "learning_rate": 6.398931067562012e-06, + "loss": 0.2529, + "step": 27028 + }, + { + "epoch": 1.1666169289202388, + "grad_norm": 5.231561660766602, + "learning_rate": 6.398794024941758e-06, + "loss": 0.2426, + "step": 27029 + }, + { + "epoch": 1.1666304937601737, + "grad_norm": 4.583170413970947, + "learning_rate": 6.398656982321503e-06, + "loss": 0.2741, + "step": 27030 + }, + { + "epoch": 1.1666440586001086, + "grad_norm": 4.8363823890686035, + "learning_rate": 6.398519939701248e-06, + "loss": 0.215, + "step": 27031 + }, + { + "epoch": 1.1666576234400434, + "grad_norm": 5.681809425354004, + "learning_rate": 6.398382897080992e-06, + "loss": 0.2619, + "step": 27032 + }, + { + "epoch": 1.1666711882799783, + "grad_norm": 5.040351390838623, + "learning_rate": 6.398245854460737e-06, + "loss": 0.1887, + "step": 27033 + }, + { + "epoch": 1.1666847531199132, + "grad_norm": 4.463360786437988, + "learning_rate": 6.398108811840483e-06, + "loss": 0.1787, + "step": 27034 + }, + { + "epoch": 1.166698317959848, + "grad_norm": 5.589696884155273, + "learning_rate": 6.3979717692202285e-06, + "loss": 0.3212, + "step": 27035 + }, + { + "epoch": 1.1667118827997829, + "grad_norm": 5.116990089416504, + "learning_rate": 6.397834726599973e-06, + "loss": 0.1894, + "step": 27036 + }, + { + "epoch": 1.1667254476397177, + "grad_norm": 5.2766947746276855, + "learning_rate": 6.397697683979718e-06, + "loss": 0.2093, + "step": 27037 + }, + { + "epoch": 1.1667390124796528, + "grad_norm": 3.965752363204956, + "learning_rate": 6.397560641359464e-06, + "loss": 0.1755, + "step": 27038 + }, + { + "epoch": 1.1667525773195877, + "grad_norm": 5.804240703582764, + "learning_rate": 6.397423598739208e-06, + "loss": 0.2512, + "step": 27039 + }, + { + "epoch": 1.1667661421595226, + "grad_norm": 5.985934734344482, + "learning_rate": 6.3972865561189536e-06, + "loss": 0.2323, + "step": 27040 + }, + { + "epoch": 1.1667797069994574, + "grad_norm": 5.439939975738525, + "learning_rate": 6.397149513498698e-06, + "loss": 0.2865, + "step": 27041 + }, + { + "epoch": 1.1667932718393923, + "grad_norm": 5.942718982696533, + "learning_rate": 6.397012470878444e-06, + "loss": 0.2923, + "step": 27042 + }, + { + "epoch": 1.1668068366793272, + "grad_norm": 5.444856643676758, + "learning_rate": 6.396875428258189e-06, + "loss": 0.179, + "step": 27043 + }, + { + "epoch": 1.166820401519262, + "grad_norm": 5.381787300109863, + "learning_rate": 6.396738385637934e-06, + "loss": 0.2256, + "step": 27044 + }, + { + "epoch": 1.166833966359197, + "grad_norm": 3.904953718185425, + "learning_rate": 6.396601343017679e-06, + "loss": 0.1945, + "step": 27045 + }, + { + "epoch": 1.1668475311991318, + "grad_norm": 3.7762482166290283, + "learning_rate": 6.396464300397424e-06, + "loss": 0.1774, + "step": 27046 + }, + { + "epoch": 1.1668610960390668, + "grad_norm": 5.671972751617432, + "learning_rate": 6.39632725777717e-06, + "loss": 0.224, + "step": 27047 + }, + { + "epoch": 1.1668746608790017, + "grad_norm": 4.476244926452637, + "learning_rate": 6.396190215156914e-06, + "loss": 0.2054, + "step": 27048 + }, + { + "epoch": 1.1668882257189366, + "grad_norm": 4.018475532531738, + "learning_rate": 6.396053172536659e-06, + "loss": 0.1546, + "step": 27049 + }, + { + "epoch": 1.1669017905588714, + "grad_norm": 7.349987506866455, + "learning_rate": 6.395916129916404e-06, + "loss": 0.2783, + "step": 27050 + }, + { + "epoch": 1.1669153553988063, + "grad_norm": 4.328851699829102, + "learning_rate": 6.39577908729615e-06, + "loss": 0.2051, + "step": 27051 + }, + { + "epoch": 1.1669289202387412, + "grad_norm": 5.05868673324585, + "learning_rate": 6.395642044675895e-06, + "loss": 0.2242, + "step": 27052 + }, + { + "epoch": 1.166942485078676, + "grad_norm": 4.458451747894287, + "learning_rate": 6.39550500205564e-06, + "loss": 0.2188, + "step": 27053 + }, + { + "epoch": 1.166956049918611, + "grad_norm": 4.482684135437012, + "learning_rate": 6.395367959435384e-06, + "loss": 0.1413, + "step": 27054 + }, + { + "epoch": 1.1669696147585458, + "grad_norm": 5.099179267883301, + "learning_rate": 6.39523091681513e-06, + "loss": 0.1902, + "step": 27055 + }, + { + "epoch": 1.1669831795984806, + "grad_norm": 6.776742458343506, + "learning_rate": 6.395093874194876e-06, + "loss": 0.2952, + "step": 27056 + }, + { + "epoch": 1.1669967444384157, + "grad_norm": 6.291133403778076, + "learning_rate": 6.39495683157462e-06, + "loss": 0.2777, + "step": 27057 + }, + { + "epoch": 1.1670103092783506, + "grad_norm": 6.411313056945801, + "learning_rate": 6.394819788954365e-06, + "loss": 0.2779, + "step": 27058 + }, + { + "epoch": 1.1670238741182855, + "grad_norm": 7.060147762298584, + "learning_rate": 6.39468274633411e-06, + "loss": 0.3211, + "step": 27059 + }, + { + "epoch": 1.1670374389582203, + "grad_norm": 4.469863414764404, + "learning_rate": 6.3945457037138555e-06, + "loss": 0.1845, + "step": 27060 + }, + { + "epoch": 1.1670510037981552, + "grad_norm": 5.217220783233643, + "learning_rate": 6.394408661093601e-06, + "loss": 0.2338, + "step": 27061 + }, + { + "epoch": 1.16706456863809, + "grad_norm": 6.1155877113342285, + "learning_rate": 6.394271618473346e-06, + "loss": 0.284, + "step": 27062 + }, + { + "epoch": 1.167078133478025, + "grad_norm": 5.447872161865234, + "learning_rate": 6.39413457585309e-06, + "loss": 0.2196, + "step": 27063 + }, + { + "epoch": 1.1670916983179598, + "grad_norm": 4.387903213500977, + "learning_rate": 6.393997533232836e-06, + "loss": 0.1883, + "step": 27064 + }, + { + "epoch": 1.1671052631578946, + "grad_norm": 5.659404277801514, + "learning_rate": 6.393860490612581e-06, + "loss": 0.2264, + "step": 27065 + }, + { + "epoch": 1.1671188279978297, + "grad_norm": 4.058804512023926, + "learning_rate": 6.393723447992326e-06, + "loss": 0.1783, + "step": 27066 + }, + { + "epoch": 1.1671323928377646, + "grad_norm": 4.707568168640137, + "learning_rate": 6.393586405372071e-06, + "loss": 0.2743, + "step": 27067 + }, + { + "epoch": 1.1671459576776995, + "grad_norm": 4.353756427764893, + "learning_rate": 6.393449362751817e-06, + "loss": 0.2521, + "step": 27068 + }, + { + "epoch": 1.1671595225176343, + "grad_norm": 5.660181045532227, + "learning_rate": 6.393312320131562e-06, + "loss": 0.2625, + "step": 27069 + }, + { + "epoch": 1.1671730873575692, + "grad_norm": 4.20753812789917, + "learning_rate": 6.393175277511306e-06, + "loss": 0.1687, + "step": 27070 + }, + { + "epoch": 1.167186652197504, + "grad_norm": 4.80764627456665, + "learning_rate": 6.393038234891052e-06, + "loss": 0.2271, + "step": 27071 + }, + { + "epoch": 1.167200217037439, + "grad_norm": 6.188815593719482, + "learning_rate": 6.392901192270796e-06, + "loss": 0.1873, + "step": 27072 + }, + { + "epoch": 1.1672137818773738, + "grad_norm": 4.759409427642822, + "learning_rate": 6.392764149650542e-06, + "loss": 0.2292, + "step": 27073 + }, + { + "epoch": 1.1672273467173087, + "grad_norm": 5.584510803222656, + "learning_rate": 6.392627107030287e-06, + "loss": 0.2096, + "step": 27074 + }, + { + "epoch": 1.1672409115572435, + "grad_norm": 5.6473469734191895, + "learning_rate": 6.3924900644100314e-06, + "loss": 0.2712, + "step": 27075 + }, + { + "epoch": 1.1672544763971786, + "grad_norm": 5.339561462402344, + "learning_rate": 6.392353021789777e-06, + "loss": 0.2013, + "step": 27076 + }, + { + "epoch": 1.1672680412371135, + "grad_norm": 5.9384589195251465, + "learning_rate": 6.392215979169523e-06, + "loss": 0.2241, + "step": 27077 + }, + { + "epoch": 1.1672816060770483, + "grad_norm": 5.450544357299805, + "learning_rate": 6.392078936549268e-06, + "loss": 0.1935, + "step": 27078 + }, + { + "epoch": 1.1672951709169832, + "grad_norm": 5.012320041656494, + "learning_rate": 6.391941893929012e-06, + "loss": 0.2311, + "step": 27079 + }, + { + "epoch": 1.167308735756918, + "grad_norm": 4.476219177246094, + "learning_rate": 6.391804851308757e-06, + "loss": 0.2081, + "step": 27080 + }, + { + "epoch": 1.167322300596853, + "grad_norm": 4.451771259307861, + "learning_rate": 6.391667808688503e-06, + "loss": 0.1569, + "step": 27081 + }, + { + "epoch": 1.1673358654367878, + "grad_norm": 5.5122857093811035, + "learning_rate": 6.391530766068248e-06, + "loss": 0.1852, + "step": 27082 + }, + { + "epoch": 1.1673494302767227, + "grad_norm": 3.843290090560913, + "learning_rate": 6.391393723447993e-06, + "loss": 0.2665, + "step": 27083 + }, + { + "epoch": 1.1673629951166575, + "grad_norm": 6.483068943023682, + "learning_rate": 6.391256680827738e-06, + "loss": 0.2247, + "step": 27084 + }, + { + "epoch": 1.1673765599565926, + "grad_norm": 4.481056213378906, + "learning_rate": 6.391119638207483e-06, + "loss": 0.1764, + "step": 27085 + }, + { + "epoch": 1.1673901247965275, + "grad_norm": 4.495226860046387, + "learning_rate": 6.390982595587228e-06, + "loss": 0.2247, + "step": 27086 + }, + { + "epoch": 1.1674036896364623, + "grad_norm": 8.306421279907227, + "learning_rate": 6.390845552966974e-06, + "loss": 0.2799, + "step": 27087 + }, + { + "epoch": 1.1674172544763972, + "grad_norm": 5.159994125366211, + "learning_rate": 6.390708510346718e-06, + "loss": 0.2194, + "step": 27088 + }, + { + "epoch": 1.167430819316332, + "grad_norm": 3.8441238403320312, + "learning_rate": 6.390571467726463e-06, + "loss": 0.1364, + "step": 27089 + }, + { + "epoch": 1.167444384156267, + "grad_norm": 3.829787015914917, + "learning_rate": 6.390434425106209e-06, + "loss": 0.1874, + "step": 27090 + }, + { + "epoch": 1.1674579489962018, + "grad_norm": 4.420652389526367, + "learning_rate": 6.3902973824859535e-06, + "loss": 0.2064, + "step": 27091 + }, + { + "epoch": 1.1674715138361367, + "grad_norm": 4.198364734649658, + "learning_rate": 6.390160339865699e-06, + "loss": 0.2054, + "step": 27092 + }, + { + "epoch": 1.1674850786760715, + "grad_norm": 4.388402462005615, + "learning_rate": 6.390023297245444e-06, + "loss": 0.255, + "step": 27093 + }, + { + "epoch": 1.1674986435160064, + "grad_norm": 3.7625558376312256, + "learning_rate": 6.389886254625189e-06, + "loss": 0.2169, + "step": 27094 + }, + { + "epoch": 1.1675122083559415, + "grad_norm": 4.3702569007873535, + "learning_rate": 6.389749212004934e-06, + "loss": 0.1974, + "step": 27095 + }, + { + "epoch": 1.1675257731958764, + "grad_norm": 2.721604585647583, + "learning_rate": 6.389612169384679e-06, + "loss": 0.1419, + "step": 27096 + }, + { + "epoch": 1.1675393380358112, + "grad_norm": 4.06288480758667, + "learning_rate": 6.389475126764424e-06, + "loss": 0.1791, + "step": 27097 + }, + { + "epoch": 1.167552902875746, + "grad_norm": 4.056270122528076, + "learning_rate": 6.38933808414417e-06, + "loss": 0.198, + "step": 27098 + }, + { + "epoch": 1.167566467715681, + "grad_norm": 5.761959075927734, + "learning_rate": 6.389201041523915e-06, + "loss": 0.3228, + "step": 27099 + }, + { + "epoch": 1.1675800325556158, + "grad_norm": 4.6339192390441895, + "learning_rate": 6.389063998903659e-06, + "loss": 0.2016, + "step": 27100 + }, + { + "epoch": 1.1675935973955507, + "grad_norm": 3.498286247253418, + "learning_rate": 6.388926956283404e-06, + "loss": 0.0761, + "step": 27101 + }, + { + "epoch": 1.1676071622354856, + "grad_norm": 3.217336416244507, + "learning_rate": 6.38878991366315e-06, + "loss": 0.1273, + "step": 27102 + }, + { + "epoch": 1.1676207270754204, + "grad_norm": 6.05366849899292, + "learning_rate": 6.388652871042896e-06, + "loss": 0.2392, + "step": 27103 + }, + { + "epoch": 1.1676342919153555, + "grad_norm": 6.313656806945801, + "learning_rate": 6.38851582842264e-06, + "loss": 0.3058, + "step": 27104 + }, + { + "epoch": 1.1676478567552904, + "grad_norm": 5.5437912940979, + "learning_rate": 6.388378785802385e-06, + "loss": 0.2376, + "step": 27105 + }, + { + "epoch": 1.1676614215952252, + "grad_norm": 3.8787524700164795, + "learning_rate": 6.3882417431821295e-06, + "loss": 0.1299, + "step": 27106 + }, + { + "epoch": 1.16767498643516, + "grad_norm": 5.333755970001221, + "learning_rate": 6.3881047005618755e-06, + "loss": 0.2439, + "step": 27107 + }, + { + "epoch": 1.167688551275095, + "grad_norm": 5.015227794647217, + "learning_rate": 6.387967657941621e-06, + "loss": 0.2734, + "step": 27108 + }, + { + "epoch": 1.1677021161150298, + "grad_norm": 5.583351135253906, + "learning_rate": 6.387830615321365e-06, + "loss": 0.273, + "step": 27109 + }, + { + "epoch": 1.1677156809549647, + "grad_norm": 5.983744144439697, + "learning_rate": 6.38769357270111e-06, + "loss": 0.3095, + "step": 27110 + }, + { + "epoch": 1.1677292457948996, + "grad_norm": 3.554643154144287, + "learning_rate": 6.387556530080856e-06, + "loss": 0.1359, + "step": 27111 + }, + { + "epoch": 1.1677428106348344, + "grad_norm": 4.741950988769531, + "learning_rate": 6.387419487460601e-06, + "loss": 0.2216, + "step": 27112 + }, + { + "epoch": 1.1677563754747693, + "grad_norm": 5.563854217529297, + "learning_rate": 6.387282444840346e-06, + "loss": 0.2354, + "step": 27113 + }, + { + "epoch": 1.1677699403147044, + "grad_norm": 4.405431270599365, + "learning_rate": 6.387145402220091e-06, + "loss": 0.2082, + "step": 27114 + }, + { + "epoch": 1.1677835051546392, + "grad_norm": 5.615592956542969, + "learning_rate": 6.387008359599835e-06, + "loss": 0.3117, + "step": 27115 + }, + { + "epoch": 1.167797069994574, + "grad_norm": 4.140743732452393, + "learning_rate": 6.386871316979581e-06, + "loss": 0.1861, + "step": 27116 + }, + { + "epoch": 1.167810634834509, + "grad_norm": 5.9497809410095215, + "learning_rate": 6.386734274359326e-06, + "loss": 0.2695, + "step": 27117 + }, + { + "epoch": 1.1678241996744438, + "grad_norm": 5.521514415740967, + "learning_rate": 6.386597231739072e-06, + "loss": 0.2904, + "step": 27118 + }, + { + "epoch": 1.1678377645143787, + "grad_norm": 5.221396446228027, + "learning_rate": 6.386460189118816e-06, + "loss": 0.3479, + "step": 27119 + }, + { + "epoch": 1.1678513293543136, + "grad_norm": 4.3716325759887695, + "learning_rate": 6.386323146498562e-06, + "loss": 0.2678, + "step": 27120 + }, + { + "epoch": 1.1678648941942484, + "grad_norm": 6.258844375610352, + "learning_rate": 6.386186103878307e-06, + "loss": 0.3634, + "step": 27121 + }, + { + "epoch": 1.1678784590341833, + "grad_norm": 4.065140724182129, + "learning_rate": 6.3860490612580515e-06, + "loss": 0.2535, + "step": 27122 + }, + { + "epoch": 1.1678920238741184, + "grad_norm": 4.623152732849121, + "learning_rate": 6.385912018637797e-06, + "loss": 0.2475, + "step": 27123 + }, + { + "epoch": 1.1679055887140533, + "grad_norm": 4.9386115074157715, + "learning_rate": 6.385774976017543e-06, + "loss": 0.2193, + "step": 27124 + }, + { + "epoch": 1.1679191535539881, + "grad_norm": 5.151637077331543, + "learning_rate": 6.385637933397287e-06, + "loss": 0.287, + "step": 27125 + }, + { + "epoch": 1.167932718393923, + "grad_norm": 6.681190013885498, + "learning_rate": 6.385500890777032e-06, + "loss": 0.348, + "step": 27126 + }, + { + "epoch": 1.1679462832338579, + "grad_norm": 3.930723190307617, + "learning_rate": 6.385363848156777e-06, + "loss": 0.2426, + "step": 27127 + }, + { + "epoch": 1.1679598480737927, + "grad_norm": 6.139942169189453, + "learning_rate": 6.385226805536522e-06, + "loss": 0.421, + "step": 27128 + }, + { + "epoch": 1.1679734129137276, + "grad_norm": 5.408462047576904, + "learning_rate": 6.385089762916268e-06, + "loss": 0.2774, + "step": 27129 + }, + { + "epoch": 1.1679869777536624, + "grad_norm": 3.991293430328369, + "learning_rate": 6.384952720296013e-06, + "loss": 0.2135, + "step": 27130 + }, + { + "epoch": 1.1680005425935973, + "grad_norm": 4.925406455993652, + "learning_rate": 6.384815677675757e-06, + "loss": 0.3162, + "step": 27131 + }, + { + "epoch": 1.1680141074335322, + "grad_norm": 6.24815034866333, + "learning_rate": 6.384678635055502e-06, + "loss": 0.2735, + "step": 27132 + }, + { + "epoch": 1.1680276722734673, + "grad_norm": 5.003416538238525, + "learning_rate": 6.3845415924352484e-06, + "loss": 0.2665, + "step": 27133 + }, + { + "epoch": 1.1680412371134021, + "grad_norm": 3.3893818855285645, + "learning_rate": 6.384404549814993e-06, + "loss": 0.2587, + "step": 27134 + }, + { + "epoch": 1.168054801953337, + "grad_norm": 4.79780912399292, + "learning_rate": 6.384267507194738e-06, + "loss": 0.2445, + "step": 27135 + }, + { + "epoch": 1.1680683667932719, + "grad_norm": 6.6113600730896, + "learning_rate": 6.384130464574483e-06, + "loss": 0.2148, + "step": 27136 + }, + { + "epoch": 1.1680819316332067, + "grad_norm": 5.406402111053467, + "learning_rate": 6.383993421954229e-06, + "loss": 0.2937, + "step": 27137 + }, + { + "epoch": 1.1680954964731416, + "grad_norm": 4.827141761779785, + "learning_rate": 6.3838563793339735e-06, + "loss": 0.1929, + "step": 27138 + }, + { + "epoch": 1.1681090613130765, + "grad_norm": 4.974833965301514, + "learning_rate": 6.383719336713719e-06, + "loss": 0.3351, + "step": 27139 + }, + { + "epoch": 1.1681226261530113, + "grad_norm": 4.188342094421387, + "learning_rate": 6.383582294093463e-06, + "loss": 0.2321, + "step": 27140 + }, + { + "epoch": 1.1681361909929462, + "grad_norm": 6.124734878540039, + "learning_rate": 6.383445251473208e-06, + "loss": 0.3704, + "step": 27141 + }, + { + "epoch": 1.1681497558328813, + "grad_norm": 3.91672420501709, + "learning_rate": 6.383308208852954e-06, + "loss": 0.2147, + "step": 27142 + }, + { + "epoch": 1.1681633206728161, + "grad_norm": 4.073964595794678, + "learning_rate": 6.3831711662326985e-06, + "loss": 0.2688, + "step": 27143 + }, + { + "epoch": 1.168176885512751, + "grad_norm": 4.631405353546143, + "learning_rate": 6.383034123612444e-06, + "loss": 0.2765, + "step": 27144 + }, + { + "epoch": 1.1681904503526859, + "grad_norm": 4.368960380554199, + "learning_rate": 6.382897080992189e-06, + "loss": 0.2194, + "step": 27145 + }, + { + "epoch": 1.1682040151926207, + "grad_norm": 4.945217132568359, + "learning_rate": 6.382760038371935e-06, + "loss": 0.2402, + "step": 27146 + }, + { + "epoch": 1.1682175800325556, + "grad_norm": 4.3132147789001465, + "learning_rate": 6.382622995751679e-06, + "loss": 0.362, + "step": 27147 + }, + { + "epoch": 1.1682311448724905, + "grad_norm": 4.629361152648926, + "learning_rate": 6.3824859531314244e-06, + "loss": 0.2267, + "step": 27148 + }, + { + "epoch": 1.1682447097124253, + "grad_norm": 5.321046829223633, + "learning_rate": 6.382348910511169e-06, + "loss": 0.2918, + "step": 27149 + }, + { + "epoch": 1.1682582745523602, + "grad_norm": 4.841041564941406, + "learning_rate": 6.382211867890915e-06, + "loss": 0.354, + "step": 27150 + }, + { + "epoch": 1.168271839392295, + "grad_norm": 4.923053741455078, + "learning_rate": 6.38207482527066e-06, + "loss": 0.2302, + "step": 27151 + }, + { + "epoch": 1.1682854042322302, + "grad_norm": 4.376769065856934, + "learning_rate": 6.381937782650405e-06, + "loss": 0.2049, + "step": 27152 + }, + { + "epoch": 1.168298969072165, + "grad_norm": 4.666741847991943, + "learning_rate": 6.3818007400301495e-06, + "loss": 0.3192, + "step": 27153 + }, + { + "epoch": 1.1683125339120999, + "grad_norm": 5.019449234008789, + "learning_rate": 6.3816636974098955e-06, + "loss": 0.2397, + "step": 27154 + }, + { + "epoch": 1.1683260987520347, + "grad_norm": 5.5138630867004395, + "learning_rate": 6.381526654789641e-06, + "loss": 0.3102, + "step": 27155 + }, + { + "epoch": 1.1683396635919696, + "grad_norm": 6.267951011657715, + "learning_rate": 6.381389612169385e-06, + "loss": 0.3214, + "step": 27156 + }, + { + "epoch": 1.1683532284319045, + "grad_norm": 4.383026599884033, + "learning_rate": 6.38125256954913e-06, + "loss": 0.2628, + "step": 27157 + }, + { + "epoch": 1.1683667932718393, + "grad_norm": 6.572393894195557, + "learning_rate": 6.3811155269288745e-06, + "loss": 0.333, + "step": 27158 + }, + { + "epoch": 1.1683803581117742, + "grad_norm": 5.125263214111328, + "learning_rate": 6.3809784843086205e-06, + "loss": 0.3627, + "step": 27159 + }, + { + "epoch": 1.168393922951709, + "grad_norm": 4.240665912628174, + "learning_rate": 6.380841441688366e-06, + "loss": 0.2294, + "step": 27160 + }, + { + "epoch": 1.1684074877916442, + "grad_norm": 5.301926136016846, + "learning_rate": 6.380704399068111e-06, + "loss": 0.2373, + "step": 27161 + }, + { + "epoch": 1.168421052631579, + "grad_norm": 5.73627233505249, + "learning_rate": 6.380567356447855e-06, + "loss": 0.233, + "step": 27162 + }, + { + "epoch": 1.168434617471514, + "grad_norm": 6.625670433044434, + "learning_rate": 6.380430313827601e-06, + "loss": 0.3824, + "step": 27163 + }, + { + "epoch": 1.1684481823114488, + "grad_norm": 5.066229343414307, + "learning_rate": 6.3802932712073464e-06, + "loss": 0.2091, + "step": 27164 + }, + { + "epoch": 1.1684617471513836, + "grad_norm": 5.769026756286621, + "learning_rate": 6.380156228587091e-06, + "loss": 0.3595, + "step": 27165 + }, + { + "epoch": 1.1684753119913185, + "grad_norm": 7.039530277252197, + "learning_rate": 6.380019185966836e-06, + "loss": 0.3144, + "step": 27166 + }, + { + "epoch": 1.1684888768312534, + "grad_norm": 5.043414115905762, + "learning_rate": 6.379882143346582e-06, + "loss": 0.2486, + "step": 27167 + }, + { + "epoch": 1.1685024416711882, + "grad_norm": 6.708012104034424, + "learning_rate": 6.379745100726326e-06, + "loss": 0.3664, + "step": 27168 + }, + { + "epoch": 1.168516006511123, + "grad_norm": 5.077921390533447, + "learning_rate": 6.3796080581060715e-06, + "loss": 0.2253, + "step": 27169 + }, + { + "epoch": 1.168529571351058, + "grad_norm": 6.229546546936035, + "learning_rate": 6.379471015485817e-06, + "loss": 0.376, + "step": 27170 + }, + { + "epoch": 1.168543136190993, + "grad_norm": 4.636343479156494, + "learning_rate": 6.379333972865561e-06, + "loss": 0.1743, + "step": 27171 + }, + { + "epoch": 1.168556701030928, + "grad_norm": 4.592771053314209, + "learning_rate": 6.379196930245307e-06, + "loss": 0.1911, + "step": 27172 + }, + { + "epoch": 1.1685702658708628, + "grad_norm": 6.176102638244629, + "learning_rate": 6.379059887625052e-06, + "loss": 0.2596, + "step": 27173 + }, + { + "epoch": 1.1685838307107976, + "grad_norm": 6.204665184020996, + "learning_rate": 6.3789228450047965e-06, + "loss": 0.3539, + "step": 27174 + }, + { + "epoch": 1.1685973955507325, + "grad_norm": 6.8531317710876465, + "learning_rate": 6.378785802384542e-06, + "loss": 0.4673, + "step": 27175 + }, + { + "epoch": 1.1686109603906674, + "grad_norm": 4.271562099456787, + "learning_rate": 6.378648759764288e-06, + "loss": 0.2314, + "step": 27176 + }, + { + "epoch": 1.1686245252306022, + "grad_norm": 4.550180435180664, + "learning_rate": 6.378511717144033e-06, + "loss": 0.2211, + "step": 27177 + }, + { + "epoch": 1.168638090070537, + "grad_norm": 5.019969940185547, + "learning_rate": 6.378374674523777e-06, + "loss": 0.2714, + "step": 27178 + }, + { + "epoch": 1.168651654910472, + "grad_norm": 6.476643085479736, + "learning_rate": 6.3782376319035224e-06, + "loss": 0.213, + "step": 27179 + }, + { + "epoch": 1.168665219750407, + "grad_norm": 5.861637115478516, + "learning_rate": 6.3781005892832685e-06, + "loss": 0.2795, + "step": 27180 + }, + { + "epoch": 1.168678784590342, + "grad_norm": 4.178145408630371, + "learning_rate": 6.377963546663013e-06, + "loss": 0.2594, + "step": 27181 + }, + { + "epoch": 1.1686923494302768, + "grad_norm": 5.505938529968262, + "learning_rate": 6.377826504042758e-06, + "loss": 0.2772, + "step": 27182 + }, + { + "epoch": 1.1687059142702116, + "grad_norm": 4.794198989868164, + "learning_rate": 6.377689461422502e-06, + "loss": 0.2217, + "step": 27183 + }, + { + "epoch": 1.1687194791101465, + "grad_norm": 6.003117084503174, + "learning_rate": 6.3775524188022475e-06, + "loss": 0.3501, + "step": 27184 + }, + { + "epoch": 1.1687330439500814, + "grad_norm": 4.787304401397705, + "learning_rate": 6.3774153761819935e-06, + "loss": 0.2663, + "step": 27185 + }, + { + "epoch": 1.1687466087900162, + "grad_norm": 5.029961585998535, + "learning_rate": 6.377278333561739e-06, + "loss": 0.2527, + "step": 27186 + }, + { + "epoch": 1.168760173629951, + "grad_norm": 10.492790222167969, + "learning_rate": 6.377141290941483e-06, + "loss": 0.3363, + "step": 27187 + }, + { + "epoch": 1.168773738469886, + "grad_norm": 4.530860424041748, + "learning_rate": 6.377004248321228e-06, + "loss": 0.1922, + "step": 27188 + }, + { + "epoch": 1.1687873033098208, + "grad_norm": 4.864706039428711, + "learning_rate": 6.376867205700974e-06, + "loss": 0.2539, + "step": 27189 + }, + { + "epoch": 1.168800868149756, + "grad_norm": 5.185309886932373, + "learning_rate": 6.3767301630807185e-06, + "loss": 0.2868, + "step": 27190 + }, + { + "epoch": 1.1688144329896908, + "grad_norm": 5.31168794631958, + "learning_rate": 6.376593120460464e-06, + "loss": 0.2696, + "step": 27191 + }, + { + "epoch": 1.1688279978296257, + "grad_norm": 3.5985729694366455, + "learning_rate": 6.376456077840208e-06, + "loss": 0.1308, + "step": 27192 + }, + { + "epoch": 1.1688415626695605, + "grad_norm": 4.4675517082214355, + "learning_rate": 6.376319035219954e-06, + "loss": 0.2515, + "step": 27193 + }, + { + "epoch": 1.1688551275094954, + "grad_norm": 5.338721752166748, + "learning_rate": 6.376181992599699e-06, + "loss": 0.3555, + "step": 27194 + }, + { + "epoch": 1.1688686923494302, + "grad_norm": 3.990335702896118, + "learning_rate": 6.3760449499794444e-06, + "loss": 0.2272, + "step": 27195 + }, + { + "epoch": 1.1688822571893651, + "grad_norm": 4.289222240447998, + "learning_rate": 6.375907907359189e-06, + "loss": 0.3116, + "step": 27196 + }, + { + "epoch": 1.1688958220293, + "grad_norm": 4.694025993347168, + "learning_rate": 6.375770864738934e-06, + "loss": 0.2784, + "step": 27197 + }, + { + "epoch": 1.168909386869235, + "grad_norm": 3.8850955963134766, + "learning_rate": 6.37563382211868e-06, + "loss": 0.1875, + "step": 27198 + }, + { + "epoch": 1.16892295170917, + "grad_norm": 4.177074909210205, + "learning_rate": 6.375496779498424e-06, + "loss": 0.2534, + "step": 27199 + }, + { + "epoch": 1.1689365165491048, + "grad_norm": 5.4181060791015625, + "learning_rate": 6.3753597368781695e-06, + "loss": 0.3478, + "step": 27200 + }, + { + "epoch": 1.1689500813890397, + "grad_norm": 3.5802175998687744, + "learning_rate": 6.375222694257915e-06, + "loss": 0.2025, + "step": 27201 + }, + { + "epoch": 1.1689636462289745, + "grad_norm": 5.420687198638916, + "learning_rate": 6.37508565163766e-06, + "loss": 0.3993, + "step": 27202 + }, + { + "epoch": 1.1689772110689094, + "grad_norm": 6.5339131355285645, + "learning_rate": 6.374948609017405e-06, + "loss": 0.3533, + "step": 27203 + }, + { + "epoch": 1.1689907759088443, + "grad_norm": 4.500320911407471, + "learning_rate": 6.37481156639715e-06, + "loss": 0.2938, + "step": 27204 + }, + { + "epoch": 1.1690043407487791, + "grad_norm": 6.448368072509766, + "learning_rate": 6.3746745237768945e-06, + "loss": 0.2796, + "step": 27205 + }, + { + "epoch": 1.169017905588714, + "grad_norm": 3.6608972549438477, + "learning_rate": 6.3745374811566406e-06, + "loss": 0.2018, + "step": 27206 + }, + { + "epoch": 1.1690314704286489, + "grad_norm": 6.245459079742432, + "learning_rate": 6.374400438536386e-06, + "loss": 0.4661, + "step": 27207 + }, + { + "epoch": 1.1690450352685837, + "grad_norm": 5.197712421417236, + "learning_rate": 6.37426339591613e-06, + "loss": 0.3122, + "step": 27208 + }, + { + "epoch": 1.1690586001085188, + "grad_norm": 5.593414306640625, + "learning_rate": 6.374126353295875e-06, + "loss": 0.2498, + "step": 27209 + }, + { + "epoch": 1.1690721649484537, + "grad_norm": 4.702756404876709, + "learning_rate": 6.3739893106756204e-06, + "loss": 0.1818, + "step": 27210 + }, + { + "epoch": 1.1690857297883885, + "grad_norm": 3.980032444000244, + "learning_rate": 6.3738522680553665e-06, + "loss": 0.2561, + "step": 27211 + }, + { + "epoch": 1.1690992946283234, + "grad_norm": 5.371585369110107, + "learning_rate": 6.373715225435111e-06, + "loss": 0.2983, + "step": 27212 + }, + { + "epoch": 1.1691128594682583, + "grad_norm": 6.742926597595215, + "learning_rate": 6.373578182814856e-06, + "loss": 0.3392, + "step": 27213 + }, + { + "epoch": 1.1691264243081931, + "grad_norm": 5.640157222747803, + "learning_rate": 6.3734411401946e-06, + "loss": 0.3908, + "step": 27214 + }, + { + "epoch": 1.169139989148128, + "grad_norm": 5.514692783355713, + "learning_rate": 6.373304097574346e-06, + "loss": 0.3252, + "step": 27215 + }, + { + "epoch": 1.1691535539880629, + "grad_norm": 5.758617877960205, + "learning_rate": 6.3731670549540915e-06, + "loss": 0.3706, + "step": 27216 + }, + { + "epoch": 1.169167118827998, + "grad_norm": 6.265464782714844, + "learning_rate": 6.373030012333836e-06, + "loss": 0.2548, + "step": 27217 + }, + { + "epoch": 1.1691806836679328, + "grad_norm": 11.202315330505371, + "learning_rate": 6.372892969713581e-06, + "loss": 0.3987, + "step": 27218 + }, + { + "epoch": 1.1691942485078677, + "grad_norm": 4.591494083404541, + "learning_rate": 6.372755927093327e-06, + "loss": 0.2431, + "step": 27219 + }, + { + "epoch": 1.1692078133478025, + "grad_norm": 4.713383674621582, + "learning_rate": 6.372618884473072e-06, + "loss": 0.2303, + "step": 27220 + }, + { + "epoch": 1.1692213781877374, + "grad_norm": 4.4240522384643555, + "learning_rate": 6.3724818418528166e-06, + "loss": 0.1314, + "step": 27221 + }, + { + "epoch": 1.1692349430276723, + "grad_norm": 5.879780292510986, + "learning_rate": 6.372344799232562e-06, + "loss": 0.2434, + "step": 27222 + }, + { + "epoch": 1.1692485078676071, + "grad_norm": 5.175812244415283, + "learning_rate": 6.372207756612306e-06, + "loss": 0.1853, + "step": 27223 + }, + { + "epoch": 1.169262072707542, + "grad_norm": 4.998814105987549, + "learning_rate": 6.372070713992052e-06, + "loss": 0.2362, + "step": 27224 + }, + { + "epoch": 1.1692756375474769, + "grad_norm": 6.366767883300781, + "learning_rate": 6.371933671371797e-06, + "loss": 0.2812, + "step": 27225 + }, + { + "epoch": 1.1692892023874117, + "grad_norm": 5.314089298248291, + "learning_rate": 6.3717966287515424e-06, + "loss": 0.2796, + "step": 27226 + }, + { + "epoch": 1.1693027672273466, + "grad_norm": 4.045765399932861, + "learning_rate": 6.371659586131287e-06, + "loss": 0.1502, + "step": 27227 + }, + { + "epoch": 1.1693163320672817, + "grad_norm": 6.358911991119385, + "learning_rate": 6.371522543511033e-06, + "loss": 0.45, + "step": 27228 + }, + { + "epoch": 1.1693298969072166, + "grad_norm": 3.6590194702148438, + "learning_rate": 6.371385500890778e-06, + "loss": 0.1894, + "step": 27229 + }, + { + "epoch": 1.1693434617471514, + "grad_norm": 3.925640821456909, + "learning_rate": 6.371248458270522e-06, + "loss": 0.2119, + "step": 27230 + }, + { + "epoch": 1.1693570265870863, + "grad_norm": 3.0997579097747803, + "learning_rate": 6.3711114156502675e-06, + "loss": 0.1738, + "step": 27231 + }, + { + "epoch": 1.1693705914270212, + "grad_norm": 6.374019622802734, + "learning_rate": 6.3709743730300135e-06, + "loss": 0.369, + "step": 27232 + }, + { + "epoch": 1.169384156266956, + "grad_norm": 4.4072699546813965, + "learning_rate": 6.370837330409758e-06, + "loss": 0.161, + "step": 27233 + }, + { + "epoch": 1.1693977211068909, + "grad_norm": 5.433767318725586, + "learning_rate": 6.370700287789503e-06, + "loss": 0.3435, + "step": 27234 + }, + { + "epoch": 1.1694112859468258, + "grad_norm": 5.774054050445557, + "learning_rate": 6.370563245169248e-06, + "loss": 0.2378, + "step": 27235 + }, + { + "epoch": 1.1694248507867608, + "grad_norm": 5.235677242279053, + "learning_rate": 6.370426202548993e-06, + "loss": 0.2566, + "step": 27236 + }, + { + "epoch": 1.1694384156266957, + "grad_norm": 5.330206394195557, + "learning_rate": 6.3702891599287386e-06, + "loss": 0.3072, + "step": 27237 + }, + { + "epoch": 1.1694519804666306, + "grad_norm": 5.298971176147461, + "learning_rate": 6.370152117308484e-06, + "loss": 0.3279, + "step": 27238 + }, + { + "epoch": 1.1694655453065654, + "grad_norm": 7.272006988525391, + "learning_rate": 6.370015074688228e-06, + "loss": 0.2807, + "step": 27239 + }, + { + "epoch": 1.1694791101465003, + "grad_norm": 4.429955959320068, + "learning_rate": 6.369878032067973e-06, + "loss": 0.2044, + "step": 27240 + }, + { + "epoch": 1.1694926749864352, + "grad_norm": 4.911583423614502, + "learning_rate": 6.369740989447719e-06, + "loss": 0.2337, + "step": 27241 + }, + { + "epoch": 1.16950623982637, + "grad_norm": 7.881688594818115, + "learning_rate": 6.369603946827464e-06, + "loss": 0.3113, + "step": 27242 + }, + { + "epoch": 1.169519804666305, + "grad_norm": 7.308877468109131, + "learning_rate": 6.369466904207209e-06, + "loss": 0.357, + "step": 27243 + }, + { + "epoch": 1.1695333695062398, + "grad_norm": 6.202722072601318, + "learning_rate": 6.369329861586954e-06, + "loss": 0.2922, + "step": 27244 + }, + { + "epoch": 1.1695469343461746, + "grad_norm": 8.32094955444336, + "learning_rate": 6.3691928189667e-06, + "loss": 0.4846, + "step": 27245 + }, + { + "epoch": 1.1695604991861095, + "grad_norm": 7.098175525665283, + "learning_rate": 6.369055776346444e-06, + "loss": 0.3237, + "step": 27246 + }, + { + "epoch": 1.1695740640260446, + "grad_norm": 4.985849857330322, + "learning_rate": 6.3689187337261895e-06, + "loss": 0.2419, + "step": 27247 + }, + { + "epoch": 1.1695876288659794, + "grad_norm": 4.5941619873046875, + "learning_rate": 6.368781691105934e-06, + "loss": 0.268, + "step": 27248 + }, + { + "epoch": 1.1696011937059143, + "grad_norm": 4.991425037384033, + "learning_rate": 6.36864464848568e-06, + "loss": 0.2433, + "step": 27249 + }, + { + "epoch": 1.1696147585458492, + "grad_norm": 3.8092639446258545, + "learning_rate": 6.368507605865425e-06, + "loss": 0.2006, + "step": 27250 + }, + { + "epoch": 1.169628323385784, + "grad_norm": 7.190627574920654, + "learning_rate": 6.368370563245169e-06, + "loss": 0.4123, + "step": 27251 + }, + { + "epoch": 1.169641888225719, + "grad_norm": 8.2925386428833, + "learning_rate": 6.3682335206249146e-06, + "loss": 0.3297, + "step": 27252 + }, + { + "epoch": 1.1696554530656538, + "grad_norm": 7.894213676452637, + "learning_rate": 6.36809647800466e-06, + "loss": 0.4217, + "step": 27253 + }, + { + "epoch": 1.1696690179055886, + "grad_norm": 5.485975742340088, + "learning_rate": 6.367959435384406e-06, + "loss": 0.2654, + "step": 27254 + }, + { + "epoch": 1.1696825827455237, + "grad_norm": 5.063533306121826, + "learning_rate": 6.36782239276415e-06, + "loss": 0.3459, + "step": 27255 + }, + { + "epoch": 1.1696961475854586, + "grad_norm": 5.462147235870361, + "learning_rate": 6.367685350143895e-06, + "loss": 0.3095, + "step": 27256 + }, + { + "epoch": 1.1697097124253935, + "grad_norm": 5.323068141937256, + "learning_rate": 6.36754830752364e-06, + "loss": 0.2707, + "step": 27257 + }, + { + "epoch": 1.1697232772653283, + "grad_norm": 5.353826999664307, + "learning_rate": 6.367411264903386e-06, + "loss": 0.3136, + "step": 27258 + }, + { + "epoch": 1.1697368421052632, + "grad_norm": 7.600103855133057, + "learning_rate": 6.367274222283131e-06, + "loss": 0.34, + "step": 27259 + }, + { + "epoch": 1.169750406945198, + "grad_norm": 5.334462642669678, + "learning_rate": 6.367137179662876e-06, + "loss": 0.1893, + "step": 27260 + }, + { + "epoch": 1.169763971785133, + "grad_norm": 4.556236743927002, + "learning_rate": 6.36700013704262e-06, + "loss": 0.2796, + "step": 27261 + }, + { + "epoch": 1.1697775366250678, + "grad_norm": 6.123413562774658, + "learning_rate": 6.366863094422366e-06, + "loss": 0.2795, + "step": 27262 + }, + { + "epoch": 1.1697911014650026, + "grad_norm": 4.649969100952148, + "learning_rate": 6.3667260518021115e-06, + "loss": 0.1977, + "step": 27263 + }, + { + "epoch": 1.1698046663049375, + "grad_norm": 6.024954319000244, + "learning_rate": 6.366589009181856e-06, + "loss": 0.2634, + "step": 27264 + }, + { + "epoch": 1.1698182311448724, + "grad_norm": 4.602001190185547, + "learning_rate": 6.366451966561601e-06, + "loss": 0.2393, + "step": 27265 + }, + { + "epoch": 1.1698317959848075, + "grad_norm": 5.107678413391113, + "learning_rate": 6.366314923941345e-06, + "loss": 0.3388, + "step": 27266 + }, + { + "epoch": 1.1698453608247423, + "grad_norm": 4.481967449188232, + "learning_rate": 6.366177881321091e-06, + "loss": 0.234, + "step": 27267 + }, + { + "epoch": 1.1698589256646772, + "grad_norm": 4.587310314178467, + "learning_rate": 6.3660408387008366e-06, + "loss": 0.2286, + "step": 27268 + }, + { + "epoch": 1.169872490504612, + "grad_norm": 5.0198564529418945, + "learning_rate": 6.365903796080582e-06, + "loss": 0.223, + "step": 27269 + }, + { + "epoch": 1.169886055344547, + "grad_norm": 5.572447776794434, + "learning_rate": 6.365766753460326e-06, + "loss": 0.3122, + "step": 27270 + }, + { + "epoch": 1.1698996201844818, + "grad_norm": 5.804095268249512, + "learning_rate": 6.365629710840072e-06, + "loss": 0.3409, + "step": 27271 + }, + { + "epoch": 1.1699131850244167, + "grad_norm": 6.677661895751953, + "learning_rate": 6.365492668219817e-06, + "loss": 0.3297, + "step": 27272 + }, + { + "epoch": 1.1699267498643515, + "grad_norm": 5.096293926239014, + "learning_rate": 6.365355625599562e-06, + "loss": 0.2562, + "step": 27273 + }, + { + "epoch": 1.1699403147042866, + "grad_norm": 4.735641002655029, + "learning_rate": 6.365218582979307e-06, + "loss": 0.2544, + "step": 27274 + }, + { + "epoch": 1.1699538795442215, + "grad_norm": 4.863320350646973, + "learning_rate": 6.365081540359053e-06, + "loss": 0.2866, + "step": 27275 + }, + { + "epoch": 1.1699674443841563, + "grad_norm": 4.5693135261535645, + "learning_rate": 6.364944497738797e-06, + "loss": 0.2751, + "step": 27276 + }, + { + "epoch": 1.1699810092240912, + "grad_norm": 3.8355789184570312, + "learning_rate": 6.364807455118542e-06, + "loss": 0.2683, + "step": 27277 + }, + { + "epoch": 1.169994574064026, + "grad_norm": 7.743844509124756, + "learning_rate": 6.3646704124982875e-06, + "loss": 0.3116, + "step": 27278 + }, + { + "epoch": 1.170008138903961, + "grad_norm": 5.958410263061523, + "learning_rate": 6.364533369878032e-06, + "loss": 0.3875, + "step": 27279 + }, + { + "epoch": 1.1700217037438958, + "grad_norm": 4.5528459548950195, + "learning_rate": 6.364396327257778e-06, + "loss": 0.2551, + "step": 27280 + }, + { + "epoch": 1.1700352685838307, + "grad_norm": 3.096376895904541, + "learning_rate": 6.364259284637523e-06, + "loss": 0.1211, + "step": 27281 + }, + { + "epoch": 1.1700488334237655, + "grad_norm": 4.066530704498291, + "learning_rate": 6.364122242017267e-06, + "loss": 0.2573, + "step": 27282 + }, + { + "epoch": 1.1700623982637004, + "grad_norm": 4.91740608215332, + "learning_rate": 6.3639851993970126e-06, + "loss": 0.2457, + "step": 27283 + }, + { + "epoch": 1.1700759631036353, + "grad_norm": 4.804369926452637, + "learning_rate": 6.363848156776759e-06, + "loss": 0.2973, + "step": 27284 + }, + { + "epoch": 1.1700895279435704, + "grad_norm": 4.581934452056885, + "learning_rate": 6.363711114156503e-06, + "loss": 0.2092, + "step": 27285 + }, + { + "epoch": 1.1701030927835052, + "grad_norm": 5.917612075805664, + "learning_rate": 6.363574071536248e-06, + "loss": 0.2554, + "step": 27286 + }, + { + "epoch": 1.17011665762344, + "grad_norm": 4.512558937072754, + "learning_rate": 6.363437028915993e-06, + "loss": 0.1761, + "step": 27287 + }, + { + "epoch": 1.170130222463375, + "grad_norm": 5.923390865325928, + "learning_rate": 6.363299986295739e-06, + "loss": 0.203, + "step": 27288 + }, + { + "epoch": 1.1701437873033098, + "grad_norm": 3.6415624618530273, + "learning_rate": 6.363162943675484e-06, + "loss": 0.1729, + "step": 27289 + }, + { + "epoch": 1.1701573521432447, + "grad_norm": 4.303586959838867, + "learning_rate": 6.363025901055229e-06, + "loss": 0.2461, + "step": 27290 + }, + { + "epoch": 1.1701709169831795, + "grad_norm": 7.139369487762451, + "learning_rate": 6.362888858434973e-06, + "loss": 0.4995, + "step": 27291 + }, + { + "epoch": 1.1701844818231144, + "grad_norm": 4.636500835418701, + "learning_rate": 6.362751815814718e-06, + "loss": 0.2416, + "step": 27292 + }, + { + "epoch": 1.1701980466630495, + "grad_norm": 5.618100643157959, + "learning_rate": 6.362614773194464e-06, + "loss": 0.2845, + "step": 27293 + }, + { + "epoch": 1.1702116115029844, + "grad_norm": 3.90529465675354, + "learning_rate": 6.3624777305742095e-06, + "loss": 0.1726, + "step": 27294 + }, + { + "epoch": 1.1702251763429192, + "grad_norm": 4.5141825675964355, + "learning_rate": 6.362340687953954e-06, + "loss": 0.2426, + "step": 27295 + }, + { + "epoch": 1.170238741182854, + "grad_norm": 5.566555500030518, + "learning_rate": 6.362203645333699e-06, + "loss": 0.2161, + "step": 27296 + }, + { + "epoch": 1.170252306022789, + "grad_norm": 4.2970757484436035, + "learning_rate": 6.362066602713445e-06, + "loss": 0.2531, + "step": 27297 + }, + { + "epoch": 1.1702658708627238, + "grad_norm": 5.449573993682861, + "learning_rate": 6.361929560093189e-06, + "loss": 0.2638, + "step": 27298 + }, + { + "epoch": 1.1702794357026587, + "grad_norm": 4.018828392028809, + "learning_rate": 6.361792517472935e-06, + "loss": 0.2205, + "step": 27299 + }, + { + "epoch": 1.1702930005425936, + "grad_norm": 6.647709846496582, + "learning_rate": 6.361655474852679e-06, + "loss": 0.3383, + "step": 27300 + }, + { + "epoch": 1.1703065653825284, + "grad_norm": 5.566272258758545, + "learning_rate": 6.361518432232425e-06, + "loss": 0.2907, + "step": 27301 + }, + { + "epoch": 1.1703201302224633, + "grad_norm": 4.0764689445495605, + "learning_rate": 6.36138138961217e-06, + "loss": 0.2502, + "step": 27302 + }, + { + "epoch": 1.1703336950623981, + "grad_norm": 4.197160243988037, + "learning_rate": 6.361244346991915e-06, + "loss": 0.1846, + "step": 27303 + }, + { + "epoch": 1.1703472599023332, + "grad_norm": 5.2469801902771, + "learning_rate": 6.36110730437166e-06, + "loss": 0.3023, + "step": 27304 + }, + { + "epoch": 1.170360824742268, + "grad_norm": 5.487964153289795, + "learning_rate": 6.360970261751406e-06, + "loss": 0.2424, + "step": 27305 + }, + { + "epoch": 1.170374389582203, + "grad_norm": 3.4959042072296143, + "learning_rate": 6.360833219131151e-06, + "loss": 0.1893, + "step": 27306 + }, + { + "epoch": 1.1703879544221378, + "grad_norm": 6.323132514953613, + "learning_rate": 6.360696176510895e-06, + "loss": 0.2805, + "step": 27307 + }, + { + "epoch": 1.1704015192620727, + "grad_norm": 5.26629638671875, + "learning_rate": 6.36055913389064e-06, + "loss": 0.2857, + "step": 27308 + }, + { + "epoch": 1.1704150841020076, + "grad_norm": 6.1745686531066895, + "learning_rate": 6.3604220912703855e-06, + "loss": 0.2765, + "step": 27309 + }, + { + "epoch": 1.1704286489419424, + "grad_norm": 5.2029876708984375, + "learning_rate": 6.360285048650131e-06, + "loss": 0.2823, + "step": 27310 + }, + { + "epoch": 1.1704422137818773, + "grad_norm": 4.403153419494629, + "learning_rate": 6.360148006029876e-06, + "loss": 0.1766, + "step": 27311 + }, + { + "epoch": 1.1704557786218124, + "grad_norm": 3.4184303283691406, + "learning_rate": 6.360010963409621e-06, + "loss": 0.2351, + "step": 27312 + }, + { + "epoch": 1.1704693434617472, + "grad_norm": 3.880544424057007, + "learning_rate": 6.359873920789365e-06, + "loss": 0.1228, + "step": 27313 + }, + { + "epoch": 1.1704829083016821, + "grad_norm": 3.086930513381958, + "learning_rate": 6.359736878169111e-06, + "loss": 0.155, + "step": 27314 + }, + { + "epoch": 1.170496473141617, + "grad_norm": 4.063314914703369, + "learning_rate": 6.359599835548857e-06, + "loss": 0.2459, + "step": 27315 + }, + { + "epoch": 1.1705100379815518, + "grad_norm": 3.392007350921631, + "learning_rate": 6.359462792928601e-06, + "loss": 0.1664, + "step": 27316 + }, + { + "epoch": 1.1705236028214867, + "grad_norm": 6.135862827301025, + "learning_rate": 6.359325750308346e-06, + "loss": 0.2035, + "step": 27317 + }, + { + "epoch": 1.1705371676614216, + "grad_norm": 5.110136032104492, + "learning_rate": 6.359188707688092e-06, + "loss": 0.2325, + "step": 27318 + }, + { + "epoch": 1.1705507325013564, + "grad_norm": 5.323570251464844, + "learning_rate": 6.359051665067837e-06, + "loss": 0.3941, + "step": 27319 + }, + { + "epoch": 1.1705642973412913, + "grad_norm": 5.6519317626953125, + "learning_rate": 6.358914622447582e-06, + "loss": 0.3633, + "step": 27320 + }, + { + "epoch": 1.1705778621812262, + "grad_norm": 4.2383809089660645, + "learning_rate": 6.358777579827327e-06, + "loss": 0.2442, + "step": 27321 + }, + { + "epoch": 1.170591427021161, + "grad_norm": 6.969879150390625, + "learning_rate": 6.358640537207071e-06, + "loss": 0.2626, + "step": 27322 + }, + { + "epoch": 1.1706049918610961, + "grad_norm": 5.100175380706787, + "learning_rate": 6.358503494586817e-06, + "loss": 0.2545, + "step": 27323 + }, + { + "epoch": 1.170618556701031, + "grad_norm": 5.332093238830566, + "learning_rate": 6.358366451966562e-06, + "loss": 0.3157, + "step": 27324 + }, + { + "epoch": 1.1706321215409659, + "grad_norm": 4.531729221343994, + "learning_rate": 6.358229409346307e-06, + "loss": 0.3167, + "step": 27325 + }, + { + "epoch": 1.1706456863809007, + "grad_norm": 5.174907684326172, + "learning_rate": 6.358092366726052e-06, + "loss": 0.294, + "step": 27326 + }, + { + "epoch": 1.1706592512208356, + "grad_norm": 5.485090732574463, + "learning_rate": 6.357955324105798e-06, + "loss": 0.3282, + "step": 27327 + }, + { + "epoch": 1.1706728160607704, + "grad_norm": 4.165440559387207, + "learning_rate": 6.357818281485543e-06, + "loss": 0.2128, + "step": 27328 + }, + { + "epoch": 1.1706863809007053, + "grad_norm": 5.698563575744629, + "learning_rate": 6.357681238865287e-06, + "loss": 0.258, + "step": 27329 + }, + { + "epoch": 1.1706999457406402, + "grad_norm": 4.353061676025391, + "learning_rate": 6.357544196245033e-06, + "loss": 0.2598, + "step": 27330 + }, + { + "epoch": 1.1707135105805753, + "grad_norm": 4.820577621459961, + "learning_rate": 6.357407153624779e-06, + "loss": 0.2592, + "step": 27331 + }, + { + "epoch": 1.1707270754205101, + "grad_norm": 6.700392723083496, + "learning_rate": 6.357270111004523e-06, + "loss": 0.396, + "step": 27332 + }, + { + "epoch": 1.170740640260445, + "grad_norm": 4.540407657623291, + "learning_rate": 6.357133068384268e-06, + "loss": 0.1613, + "step": 27333 + }, + { + "epoch": 1.1707542051003799, + "grad_norm": 4.823642253875732, + "learning_rate": 6.3569960257640124e-06, + "loss": 0.2979, + "step": 27334 + }, + { + "epoch": 1.1707677699403147, + "grad_norm": 5.963088035583496, + "learning_rate": 6.356858983143758e-06, + "loss": 0.1956, + "step": 27335 + }, + { + "epoch": 1.1707813347802496, + "grad_norm": 5.1947784423828125, + "learning_rate": 6.356721940523504e-06, + "loss": 0.303, + "step": 27336 + }, + { + "epoch": 1.1707948996201845, + "grad_norm": 6.016029357910156, + "learning_rate": 6.356584897903249e-06, + "loss": 0.3222, + "step": 27337 + }, + { + "epoch": 1.1708084644601193, + "grad_norm": 4.915159225463867, + "learning_rate": 6.356447855282993e-06, + "loss": 0.2637, + "step": 27338 + }, + { + "epoch": 1.1708220293000542, + "grad_norm": 4.299671649932861, + "learning_rate": 6.356310812662738e-06, + "loss": 0.2591, + "step": 27339 + }, + { + "epoch": 1.170835594139989, + "grad_norm": 5.0386528968811035, + "learning_rate": 6.356173770042484e-06, + "loss": 0.297, + "step": 27340 + }, + { + "epoch": 1.170849158979924, + "grad_norm": 4.008935451507568, + "learning_rate": 6.356036727422229e-06, + "loss": 0.2704, + "step": 27341 + }, + { + "epoch": 1.170862723819859, + "grad_norm": 3.80962872505188, + "learning_rate": 6.355899684801974e-06, + "loss": 0.2587, + "step": 27342 + }, + { + "epoch": 1.1708762886597939, + "grad_norm": 6.762929916381836, + "learning_rate": 6.355762642181719e-06, + "loss": 0.2424, + "step": 27343 + }, + { + "epoch": 1.1708898534997287, + "grad_norm": 4.95539665222168, + "learning_rate": 6.355625599561464e-06, + "loss": 0.2361, + "step": 27344 + }, + { + "epoch": 1.1709034183396636, + "grad_norm": 6.6804518699646, + "learning_rate": 6.355488556941209e-06, + "loss": 0.2745, + "step": 27345 + }, + { + "epoch": 1.1709169831795985, + "grad_norm": 3.417959690093994, + "learning_rate": 6.355351514320955e-06, + "loss": 0.2026, + "step": 27346 + }, + { + "epoch": 1.1709305480195333, + "grad_norm": 3.5632243156433105, + "learning_rate": 6.355214471700699e-06, + "loss": 0.2103, + "step": 27347 + }, + { + "epoch": 1.1709441128594682, + "grad_norm": 7.64098596572876, + "learning_rate": 6.355077429080444e-06, + "loss": 0.4837, + "step": 27348 + }, + { + "epoch": 1.170957677699403, + "grad_norm": 7.393587112426758, + "learning_rate": 6.35494038646019e-06, + "loss": 0.2957, + "step": 27349 + }, + { + "epoch": 1.1709712425393382, + "grad_norm": 3.9892382621765137, + "learning_rate": 6.3548033438399345e-06, + "loss": 0.2208, + "step": 27350 + }, + { + "epoch": 1.170984807379273, + "grad_norm": 5.46937894821167, + "learning_rate": 6.35466630121968e-06, + "loss": 0.3357, + "step": 27351 + }, + { + "epoch": 1.1709983722192079, + "grad_norm": 4.716602802276611, + "learning_rate": 6.354529258599425e-06, + "loss": 0.2722, + "step": 27352 + }, + { + "epoch": 1.1710119370591427, + "grad_norm": 5.904387474060059, + "learning_rate": 6.354392215979171e-06, + "loss": 0.2519, + "step": 27353 + }, + { + "epoch": 1.1710255018990776, + "grad_norm": 4.574397563934326, + "learning_rate": 6.354255173358915e-06, + "loss": 0.1581, + "step": 27354 + }, + { + "epoch": 1.1710390667390125, + "grad_norm": 6.734844207763672, + "learning_rate": 6.35411813073866e-06, + "loss": 0.3345, + "step": 27355 + }, + { + "epoch": 1.1710526315789473, + "grad_norm": 5.861093997955322, + "learning_rate": 6.353981088118405e-06, + "loss": 0.2577, + "step": 27356 + }, + { + "epoch": 1.1710661964188822, + "grad_norm": 9.227507591247559, + "learning_rate": 6.353844045498151e-06, + "loss": 0.3783, + "step": 27357 + }, + { + "epoch": 1.171079761258817, + "grad_norm": 5.120517730712891, + "learning_rate": 6.353707002877896e-06, + "loss": 0.2772, + "step": 27358 + }, + { + "epoch": 1.171093326098752, + "grad_norm": 5.2538628578186035, + "learning_rate": 6.35356996025764e-06, + "loss": 0.3064, + "step": 27359 + }, + { + "epoch": 1.1711068909386868, + "grad_norm": 6.312566757202148, + "learning_rate": 6.353432917637385e-06, + "loss": 0.2894, + "step": 27360 + }, + { + "epoch": 1.171120455778622, + "grad_norm": 7.600140571594238, + "learning_rate": 6.353295875017131e-06, + "loss": 0.4119, + "step": 27361 + }, + { + "epoch": 1.1711340206185568, + "grad_norm": 4.574902534484863, + "learning_rate": 6.353158832396877e-06, + "loss": 0.2615, + "step": 27362 + }, + { + "epoch": 1.1711475854584916, + "grad_norm": 6.175447463989258, + "learning_rate": 6.353021789776621e-06, + "loss": 0.2766, + "step": 27363 + }, + { + "epoch": 1.1711611502984265, + "grad_norm": 7.6446967124938965, + "learning_rate": 6.352884747156366e-06, + "loss": 0.3264, + "step": 27364 + }, + { + "epoch": 1.1711747151383614, + "grad_norm": 5.9831061363220215, + "learning_rate": 6.3527477045361105e-06, + "loss": 0.1564, + "step": 27365 + }, + { + "epoch": 1.1711882799782962, + "grad_norm": 5.032063961029053, + "learning_rate": 6.3526106619158565e-06, + "loss": 0.2523, + "step": 27366 + }, + { + "epoch": 1.171201844818231, + "grad_norm": 4.68427848815918, + "learning_rate": 6.352473619295602e-06, + "loss": 0.2036, + "step": 27367 + }, + { + "epoch": 1.171215409658166, + "grad_norm": 3.77730393409729, + "learning_rate": 6.352336576675347e-06, + "loss": 0.1669, + "step": 27368 + }, + { + "epoch": 1.171228974498101, + "grad_norm": 4.357596397399902, + "learning_rate": 6.352199534055091e-06, + "loss": 0.1599, + "step": 27369 + }, + { + "epoch": 1.171242539338036, + "grad_norm": 3.8809144496917725, + "learning_rate": 6.352062491434837e-06, + "loss": 0.1527, + "step": 27370 + }, + { + "epoch": 1.1712561041779708, + "grad_norm": 5.298372745513916, + "learning_rate": 6.351925448814582e-06, + "loss": 0.2126, + "step": 27371 + }, + { + "epoch": 1.1712696690179056, + "grad_norm": 5.253111839294434, + "learning_rate": 6.351788406194327e-06, + "loss": 0.2852, + "step": 27372 + }, + { + "epoch": 1.1712832338578405, + "grad_norm": 5.8457512855529785, + "learning_rate": 6.351651363574072e-06, + "loss": 0.3216, + "step": 27373 + }, + { + "epoch": 1.1712967986977754, + "grad_norm": 6.310036659240723, + "learning_rate": 6.351514320953818e-06, + "loss": 0.2409, + "step": 27374 + }, + { + "epoch": 1.1713103635377102, + "grad_norm": 5.923429012298584, + "learning_rate": 6.351377278333562e-06, + "loss": 0.2908, + "step": 27375 + }, + { + "epoch": 1.171323928377645, + "grad_norm": 6.356793403625488, + "learning_rate": 6.351240235713307e-06, + "loss": 0.2957, + "step": 27376 + }, + { + "epoch": 1.17133749321758, + "grad_norm": 4.852384090423584, + "learning_rate": 6.351103193093053e-06, + "loss": 0.2481, + "step": 27377 + }, + { + "epoch": 1.1713510580575148, + "grad_norm": 5.54755163192749, + "learning_rate": 6.350966150472797e-06, + "loss": 0.2417, + "step": 27378 + }, + { + "epoch": 1.1713646228974497, + "grad_norm": 5.949112892150879, + "learning_rate": 6.350829107852543e-06, + "loss": 0.3246, + "step": 27379 + }, + { + "epoch": 1.1713781877373848, + "grad_norm": 4.956762313842773, + "learning_rate": 6.350692065232288e-06, + "loss": 0.2721, + "step": 27380 + }, + { + "epoch": 1.1713917525773196, + "grad_norm": 5.260190486907959, + "learning_rate": 6.3505550226120325e-06, + "loss": 0.292, + "step": 27381 + }, + { + "epoch": 1.1714053174172545, + "grad_norm": 5.015275955200195, + "learning_rate": 6.350417979991778e-06, + "loss": 0.309, + "step": 27382 + }, + { + "epoch": 1.1714188822571894, + "grad_norm": 3.053818464279175, + "learning_rate": 6.350280937371524e-06, + "loss": 0.1581, + "step": 27383 + }, + { + "epoch": 1.1714324470971242, + "grad_norm": 5.694544792175293, + "learning_rate": 6.350143894751268e-06, + "loss": 0.2545, + "step": 27384 + }, + { + "epoch": 1.171446011937059, + "grad_norm": 5.409278869628906, + "learning_rate": 6.350006852131013e-06, + "loss": 0.2224, + "step": 27385 + }, + { + "epoch": 1.171459576776994, + "grad_norm": 4.747135639190674, + "learning_rate": 6.349869809510758e-06, + "loss": 0.2639, + "step": 27386 + }, + { + "epoch": 1.1714731416169288, + "grad_norm": 6.356350421905518, + "learning_rate": 6.349732766890504e-06, + "loss": 0.2455, + "step": 27387 + }, + { + "epoch": 1.171486706456864, + "grad_norm": 4.470664024353027, + "learning_rate": 6.349595724270249e-06, + "loss": 0.2279, + "step": 27388 + }, + { + "epoch": 1.1715002712967988, + "grad_norm": 4.759339809417725, + "learning_rate": 6.349458681649994e-06, + "loss": 0.1797, + "step": 27389 + }, + { + "epoch": 1.1715138361367337, + "grad_norm": 7.902670383453369, + "learning_rate": 6.349321639029738e-06, + "loss": 0.3776, + "step": 27390 + }, + { + "epoch": 1.1715274009766685, + "grad_norm": 4.859973430633545, + "learning_rate": 6.349184596409483e-06, + "loss": 0.2447, + "step": 27391 + }, + { + "epoch": 1.1715409658166034, + "grad_norm": 4.601556777954102, + "learning_rate": 6.3490475537892294e-06, + "loss": 0.2124, + "step": 27392 + }, + { + "epoch": 1.1715545306565383, + "grad_norm": 4.9049787521362305, + "learning_rate": 6.348910511168974e-06, + "loss": 0.2736, + "step": 27393 + }, + { + "epoch": 1.1715680954964731, + "grad_norm": 3.523106575012207, + "learning_rate": 6.348773468548719e-06, + "loss": 0.1527, + "step": 27394 + }, + { + "epoch": 1.171581660336408, + "grad_norm": 3.729074001312256, + "learning_rate": 6.348636425928464e-06, + "loss": 0.201, + "step": 27395 + }, + { + "epoch": 1.1715952251763428, + "grad_norm": 5.894126892089844, + "learning_rate": 6.34849938330821e-06, + "loss": 0.2537, + "step": 27396 + }, + { + "epoch": 1.1716087900162777, + "grad_norm": 4.172202110290527, + "learning_rate": 6.3483623406879545e-06, + "loss": 0.1953, + "step": 27397 + }, + { + "epoch": 1.1716223548562126, + "grad_norm": 4.475882530212402, + "learning_rate": 6.3482252980677e-06, + "loss": 0.2956, + "step": 27398 + }, + { + "epoch": 1.1716359196961477, + "grad_norm": 5.452741622924805, + "learning_rate": 6.348088255447444e-06, + "loss": 0.1929, + "step": 27399 + }, + { + "epoch": 1.1716494845360825, + "grad_norm": 5.148983478546143, + "learning_rate": 6.34795121282719e-06, + "loss": 0.2405, + "step": 27400 + }, + { + "epoch": 1.1716630493760174, + "grad_norm": 3.0957322120666504, + "learning_rate": 6.347814170206935e-06, + "loss": 0.1876, + "step": 27401 + }, + { + "epoch": 1.1716766142159523, + "grad_norm": 4.381274223327637, + "learning_rate": 6.34767712758668e-06, + "loss": 0.2146, + "step": 27402 + }, + { + "epoch": 1.1716901790558871, + "grad_norm": 3.9974722862243652, + "learning_rate": 6.347540084966425e-06, + "loss": 0.1733, + "step": 27403 + }, + { + "epoch": 1.171703743895822, + "grad_norm": 4.312180519104004, + "learning_rate": 6.34740304234617e-06, + "loss": 0.2028, + "step": 27404 + }, + { + "epoch": 1.1717173087357569, + "grad_norm": 5.658886432647705, + "learning_rate": 6.347265999725916e-06, + "loss": 0.2392, + "step": 27405 + }, + { + "epoch": 1.1717308735756917, + "grad_norm": 6.584883689880371, + "learning_rate": 6.34712895710566e-06, + "loss": 0.2308, + "step": 27406 + }, + { + "epoch": 1.1717444384156268, + "grad_norm": 4.592141151428223, + "learning_rate": 6.3469919144854054e-06, + "loss": 0.2636, + "step": 27407 + }, + { + "epoch": 1.1717580032555617, + "grad_norm": 3.218132734298706, + "learning_rate": 6.34685487186515e-06, + "loss": 0.1658, + "step": 27408 + }, + { + "epoch": 1.1717715680954965, + "grad_norm": 4.974670886993408, + "learning_rate": 6.346717829244896e-06, + "loss": 0.2658, + "step": 27409 + }, + { + "epoch": 1.1717851329354314, + "grad_norm": 5.569986343383789, + "learning_rate": 6.346580786624641e-06, + "loss": 0.2582, + "step": 27410 + }, + { + "epoch": 1.1717986977753663, + "grad_norm": 3.868439197540283, + "learning_rate": 6.346443744004386e-06, + "loss": 0.1456, + "step": 27411 + }, + { + "epoch": 1.1718122626153011, + "grad_norm": 3.595280170440674, + "learning_rate": 6.3463067013841305e-06, + "loss": 0.1565, + "step": 27412 + }, + { + "epoch": 1.171825827455236, + "grad_norm": 3.4894869327545166, + "learning_rate": 6.3461696587638765e-06, + "loss": 0.1739, + "step": 27413 + }, + { + "epoch": 1.1718393922951709, + "grad_norm": 3.6867034435272217, + "learning_rate": 6.346032616143622e-06, + "loss": 0.1501, + "step": 27414 + }, + { + "epoch": 1.1718529571351057, + "grad_norm": 4.052894115447998, + "learning_rate": 6.345895573523366e-06, + "loss": 0.2094, + "step": 27415 + }, + { + "epoch": 1.1718665219750406, + "grad_norm": 3.8053417205810547, + "learning_rate": 6.345758530903111e-06, + "loss": 0.1437, + "step": 27416 + }, + { + "epoch": 1.1718800868149755, + "grad_norm": 5.6184210777282715, + "learning_rate": 6.345621488282856e-06, + "loss": 0.3301, + "step": 27417 + }, + { + "epoch": 1.1718936516549106, + "grad_norm": 7.373027324676514, + "learning_rate": 6.3454844456626015e-06, + "loss": 0.3605, + "step": 27418 + }, + { + "epoch": 1.1719072164948454, + "grad_norm": 5.359293460845947, + "learning_rate": 6.345347403042347e-06, + "loss": 0.1823, + "step": 27419 + }, + { + "epoch": 1.1719207813347803, + "grad_norm": 4.562803268432617, + "learning_rate": 6.345210360422092e-06, + "loss": 0.2317, + "step": 27420 + }, + { + "epoch": 1.1719343461747151, + "grad_norm": 5.885967254638672, + "learning_rate": 6.345073317801836e-06, + "loss": 0.2835, + "step": 27421 + }, + { + "epoch": 1.17194791101465, + "grad_norm": 7.615659713745117, + "learning_rate": 6.344936275181582e-06, + "loss": 0.4163, + "step": 27422 + }, + { + "epoch": 1.1719614758545849, + "grad_norm": 4.625486373901367, + "learning_rate": 6.3447992325613274e-06, + "loss": 0.2307, + "step": 27423 + }, + { + "epoch": 1.1719750406945197, + "grad_norm": 5.562905788421631, + "learning_rate": 6.344662189941072e-06, + "loss": 0.3078, + "step": 27424 + }, + { + "epoch": 1.1719886055344546, + "grad_norm": 4.5573039054870605, + "learning_rate": 6.344525147320817e-06, + "loss": 0.3008, + "step": 27425 + }, + { + "epoch": 1.1720021703743897, + "grad_norm": 5.422597885131836, + "learning_rate": 6.344388104700563e-06, + "loss": 0.2711, + "step": 27426 + }, + { + "epoch": 1.1720157352143246, + "grad_norm": 5.903433799743652, + "learning_rate": 6.344251062080307e-06, + "loss": 0.2837, + "step": 27427 + }, + { + "epoch": 1.1720293000542594, + "grad_norm": 5.109277725219727, + "learning_rate": 6.3441140194600525e-06, + "loss": 0.2814, + "step": 27428 + }, + { + "epoch": 1.1720428648941943, + "grad_norm": 3.001091241836548, + "learning_rate": 6.343976976839798e-06, + "loss": 0.1973, + "step": 27429 + }, + { + "epoch": 1.1720564297341292, + "grad_norm": 5.292738437652588, + "learning_rate": 6.343839934219542e-06, + "loss": 0.255, + "step": 27430 + }, + { + "epoch": 1.172069994574064, + "grad_norm": 5.7313618659973145, + "learning_rate": 6.343702891599288e-06, + "loss": 0.3189, + "step": 27431 + }, + { + "epoch": 1.1720835594139989, + "grad_norm": 6.5610456466674805, + "learning_rate": 6.343565848979033e-06, + "loss": 0.3998, + "step": 27432 + }, + { + "epoch": 1.1720971242539338, + "grad_norm": 5.821651935577393, + "learning_rate": 6.3434288063587775e-06, + "loss": 0.361, + "step": 27433 + }, + { + "epoch": 1.1721106890938686, + "grad_norm": 4.468166351318359, + "learning_rate": 6.343291763738523e-06, + "loss": 0.1973, + "step": 27434 + }, + { + "epoch": 1.1721242539338035, + "grad_norm": 5.780888557434082, + "learning_rate": 6.343154721118269e-06, + "loss": 0.3021, + "step": 27435 + }, + { + "epoch": 1.1721378187737383, + "grad_norm": 4.86071252822876, + "learning_rate": 6.343017678498014e-06, + "loss": 0.3032, + "step": 27436 + }, + { + "epoch": 1.1721513836136734, + "grad_norm": 6.878544807434082, + "learning_rate": 6.342880635877758e-06, + "loss": 0.3431, + "step": 27437 + }, + { + "epoch": 1.1721649484536083, + "grad_norm": 6.9221086502075195, + "learning_rate": 6.3427435932575034e-06, + "loss": 0.2822, + "step": 27438 + }, + { + "epoch": 1.1721785132935432, + "grad_norm": 5.704535961151123, + "learning_rate": 6.3426065506372495e-06, + "loss": 0.2998, + "step": 27439 + }, + { + "epoch": 1.172192078133478, + "grad_norm": 4.091756820678711, + "learning_rate": 6.342469508016994e-06, + "loss": 0.2012, + "step": 27440 + }, + { + "epoch": 1.172205642973413, + "grad_norm": 5.968415260314941, + "learning_rate": 6.342332465396739e-06, + "loss": 0.3878, + "step": 27441 + }, + { + "epoch": 1.1722192078133478, + "grad_norm": 6.9137372970581055, + "learning_rate": 6.342195422776483e-06, + "loss": 0.3177, + "step": 27442 + }, + { + "epoch": 1.1722327726532826, + "grad_norm": 4.779388904571533, + "learning_rate": 6.342058380156229e-06, + "loss": 0.2479, + "step": 27443 + }, + { + "epoch": 1.1722463374932175, + "grad_norm": 5.3816423416137695, + "learning_rate": 6.3419213375359745e-06, + "loss": 0.3327, + "step": 27444 + }, + { + "epoch": 1.1722599023331526, + "grad_norm": 4.4479475021362305, + "learning_rate": 6.34178429491572e-06, + "loss": 0.2584, + "step": 27445 + }, + { + "epoch": 1.1722734671730874, + "grad_norm": 5.616413116455078, + "learning_rate": 6.341647252295464e-06, + "loss": 0.2519, + "step": 27446 + }, + { + "epoch": 1.1722870320130223, + "grad_norm": 4.050047397613525, + "learning_rate": 6.341510209675209e-06, + "loss": 0.2438, + "step": 27447 + }, + { + "epoch": 1.1723005968529572, + "grad_norm": 4.107092380523682, + "learning_rate": 6.341373167054955e-06, + "loss": 0.2016, + "step": 27448 + }, + { + "epoch": 1.172314161692892, + "grad_norm": 6.667969226837158, + "learning_rate": 6.3412361244346996e-06, + "loss": 0.4451, + "step": 27449 + }, + { + "epoch": 1.172327726532827, + "grad_norm": 3.3876845836639404, + "learning_rate": 6.341099081814445e-06, + "loss": 0.1879, + "step": 27450 + }, + { + "epoch": 1.1723412913727618, + "grad_norm": 4.42434024810791, + "learning_rate": 6.34096203919419e-06, + "loss": 0.287, + "step": 27451 + }, + { + "epoch": 1.1723548562126966, + "grad_norm": 5.852529525756836, + "learning_rate": 6.340824996573935e-06, + "loss": 0.2808, + "step": 27452 + }, + { + "epoch": 1.1723684210526315, + "grad_norm": 4.195557117462158, + "learning_rate": 6.34068795395368e-06, + "loss": 0.2224, + "step": 27453 + }, + { + "epoch": 1.1723819858925664, + "grad_norm": 8.031044960021973, + "learning_rate": 6.3405509113334254e-06, + "loss": 0.341, + "step": 27454 + }, + { + "epoch": 1.1723955507325012, + "grad_norm": 4.846845626831055, + "learning_rate": 6.34041386871317e-06, + "loss": 0.289, + "step": 27455 + }, + { + "epoch": 1.1724091155724363, + "grad_norm": 4.160233020782471, + "learning_rate": 6.340276826092916e-06, + "loss": 0.2317, + "step": 27456 + }, + { + "epoch": 1.1724226804123712, + "grad_norm": 5.009985446929932, + "learning_rate": 6.340139783472661e-06, + "loss": 0.1818, + "step": 27457 + }, + { + "epoch": 1.172436245252306, + "grad_norm": 6.106029510498047, + "learning_rate": 6.340002740852405e-06, + "loss": 0.3666, + "step": 27458 + }, + { + "epoch": 1.172449810092241, + "grad_norm": 4.842403888702393, + "learning_rate": 6.3398656982321505e-06, + "loss": 0.1598, + "step": 27459 + }, + { + "epoch": 1.1724633749321758, + "grad_norm": 5.297304630279541, + "learning_rate": 6.339728655611896e-06, + "loss": 0.2838, + "step": 27460 + }, + { + "epoch": 1.1724769397721106, + "grad_norm": 6.089682579040527, + "learning_rate": 6.339591612991642e-06, + "loss": 0.2874, + "step": 27461 + }, + { + "epoch": 1.1724905046120455, + "grad_norm": 5.233028888702393, + "learning_rate": 6.339454570371386e-06, + "loss": 0.2107, + "step": 27462 + }, + { + "epoch": 1.1725040694519804, + "grad_norm": 4.166158199310303, + "learning_rate": 6.339317527751131e-06, + "loss": 0.2238, + "step": 27463 + }, + { + "epoch": 1.1725176342919155, + "grad_norm": 5.937074184417725, + "learning_rate": 6.3391804851308755e-06, + "loss": 0.3536, + "step": 27464 + }, + { + "epoch": 1.1725311991318503, + "grad_norm": 5.8272480964660645, + "learning_rate": 6.3390434425106216e-06, + "loss": 0.2747, + "step": 27465 + }, + { + "epoch": 1.1725447639717852, + "grad_norm": 5.358968734741211, + "learning_rate": 6.338906399890367e-06, + "loss": 0.2505, + "step": 27466 + }, + { + "epoch": 1.17255832881172, + "grad_norm": 4.574519157409668, + "learning_rate": 6.338769357270111e-06, + "loss": 0.231, + "step": 27467 + }, + { + "epoch": 1.172571893651655, + "grad_norm": 3.570284128189087, + "learning_rate": 6.338632314649856e-06, + "loss": 0.2148, + "step": 27468 + }, + { + "epoch": 1.1725854584915898, + "grad_norm": 4.210622310638428, + "learning_rate": 6.338495272029602e-06, + "loss": 0.2832, + "step": 27469 + }, + { + "epoch": 1.1725990233315247, + "grad_norm": 4.460813999176025, + "learning_rate": 6.3383582294093475e-06, + "loss": 0.158, + "step": 27470 + }, + { + "epoch": 1.1726125881714595, + "grad_norm": 4.033995151519775, + "learning_rate": 6.338221186789092e-06, + "loss": 0.1695, + "step": 27471 + }, + { + "epoch": 1.1726261530113944, + "grad_norm": 7.617098808288574, + "learning_rate": 6.338084144168837e-06, + "loss": 0.3586, + "step": 27472 + }, + { + "epoch": 1.1726397178513293, + "grad_norm": 5.552868366241455, + "learning_rate": 6.337947101548581e-06, + "loss": 0.3074, + "step": 27473 + }, + { + "epoch": 1.1726532826912641, + "grad_norm": 3.6588339805603027, + "learning_rate": 6.337810058928327e-06, + "loss": 0.2187, + "step": 27474 + }, + { + "epoch": 1.1726668475311992, + "grad_norm": 4.369372367858887, + "learning_rate": 6.3376730163080725e-06, + "loss": 0.1993, + "step": 27475 + }, + { + "epoch": 1.172680412371134, + "grad_norm": 4.915067195892334, + "learning_rate": 6.337535973687817e-06, + "loss": 0.1961, + "step": 27476 + }, + { + "epoch": 1.172693977211069, + "grad_norm": 4.574782848358154, + "learning_rate": 6.337398931067562e-06, + "loss": 0.2678, + "step": 27477 + }, + { + "epoch": 1.1727075420510038, + "grad_norm": 3.7244131565093994, + "learning_rate": 6.337261888447308e-06, + "loss": 0.2342, + "step": 27478 + }, + { + "epoch": 1.1727211068909387, + "grad_norm": 7.32556676864624, + "learning_rate": 6.337124845827053e-06, + "loss": 0.2653, + "step": 27479 + }, + { + "epoch": 1.1727346717308735, + "grad_norm": 4.0629119873046875, + "learning_rate": 6.3369878032067976e-06, + "loss": 0.2294, + "step": 27480 + }, + { + "epoch": 1.1727482365708084, + "grad_norm": 4.693113327026367, + "learning_rate": 6.336850760586543e-06, + "loss": 0.267, + "step": 27481 + }, + { + "epoch": 1.1727618014107433, + "grad_norm": 4.328064918518066, + "learning_rate": 6.336713717966289e-06, + "loss": 0.2065, + "step": 27482 + }, + { + "epoch": 1.1727753662506784, + "grad_norm": 4.842586994171143, + "learning_rate": 6.336576675346033e-06, + "loss": 0.2168, + "step": 27483 + }, + { + "epoch": 1.1727889310906132, + "grad_norm": 5.051657199859619, + "learning_rate": 6.336439632725778e-06, + "loss": 0.2203, + "step": 27484 + }, + { + "epoch": 1.172802495930548, + "grad_norm": 5.427806854248047, + "learning_rate": 6.3363025901055234e-06, + "loss": 0.2379, + "step": 27485 + }, + { + "epoch": 1.172816060770483, + "grad_norm": 5.248932838439941, + "learning_rate": 6.336165547485268e-06, + "loss": 0.2519, + "step": 27486 + }, + { + "epoch": 1.1728296256104178, + "grad_norm": 5.158021926879883, + "learning_rate": 6.336028504865014e-06, + "loss": 0.1566, + "step": 27487 + }, + { + "epoch": 1.1728431904503527, + "grad_norm": 4.198915958404541, + "learning_rate": 6.335891462244759e-06, + "loss": 0.197, + "step": 27488 + }, + { + "epoch": 1.1728567552902875, + "grad_norm": 4.4465508460998535, + "learning_rate": 6.335754419624503e-06, + "loss": 0.206, + "step": 27489 + }, + { + "epoch": 1.1728703201302224, + "grad_norm": 4.502091407775879, + "learning_rate": 6.3356173770042485e-06, + "loss": 0.2695, + "step": 27490 + }, + { + "epoch": 1.1728838849701573, + "grad_norm": 4.958925724029541, + "learning_rate": 6.3354803343839945e-06, + "loss": 0.2821, + "step": 27491 + }, + { + "epoch": 1.1728974498100921, + "grad_norm": 5.529552459716797, + "learning_rate": 6.335343291763739e-06, + "loss": 0.2305, + "step": 27492 + }, + { + "epoch": 1.172911014650027, + "grad_norm": 4.197896480560303, + "learning_rate": 6.335206249143484e-06, + "loss": 0.172, + "step": 27493 + }, + { + "epoch": 1.172924579489962, + "grad_norm": 7.2985687255859375, + "learning_rate": 6.335069206523229e-06, + "loss": 0.3072, + "step": 27494 + }, + { + "epoch": 1.172938144329897, + "grad_norm": 3.3939621448516846, + "learning_rate": 6.334932163902975e-06, + "loss": 0.1497, + "step": 27495 + }, + { + "epoch": 1.1729517091698318, + "grad_norm": 4.192668437957764, + "learning_rate": 6.3347951212827196e-06, + "loss": 0.2209, + "step": 27496 + }, + { + "epoch": 1.1729652740097667, + "grad_norm": 4.886162281036377, + "learning_rate": 6.334658078662465e-06, + "loss": 0.33, + "step": 27497 + }, + { + "epoch": 1.1729788388497016, + "grad_norm": 4.154557704925537, + "learning_rate": 6.334521036042209e-06, + "loss": 0.1427, + "step": 27498 + }, + { + "epoch": 1.1729924036896364, + "grad_norm": 4.80125617980957, + "learning_rate": 6.334383993421954e-06, + "loss": 0.2503, + "step": 27499 + }, + { + "epoch": 1.1730059685295713, + "grad_norm": 3.8493950366973877, + "learning_rate": 6.3342469508017e-06, + "loss": 0.1924, + "step": 27500 + }, + { + "epoch": 1.1730195333695062, + "grad_norm": 6.007068157196045, + "learning_rate": 6.334109908181445e-06, + "loss": 0.3989, + "step": 27501 + }, + { + "epoch": 1.1730330982094412, + "grad_norm": 5.083155632019043, + "learning_rate": 6.33397286556119e-06, + "loss": 0.4091, + "step": 27502 + }, + { + "epoch": 1.173046663049376, + "grad_norm": 5.671224594116211, + "learning_rate": 6.333835822940935e-06, + "loss": 0.3164, + "step": 27503 + }, + { + "epoch": 1.173060227889311, + "grad_norm": 5.616375923156738, + "learning_rate": 6.333698780320681e-06, + "loss": 0.393, + "step": 27504 + }, + { + "epoch": 1.1730737927292458, + "grad_norm": 4.633961200714111, + "learning_rate": 6.333561737700425e-06, + "loss": 0.2982, + "step": 27505 + }, + { + "epoch": 1.1730873575691807, + "grad_norm": 5.033621788024902, + "learning_rate": 6.3334246950801705e-06, + "loss": 0.225, + "step": 27506 + }, + { + "epoch": 1.1731009224091156, + "grad_norm": 3.575855016708374, + "learning_rate": 6.333287652459915e-06, + "loss": 0.209, + "step": 27507 + }, + { + "epoch": 1.1731144872490504, + "grad_norm": 3.8935041427612305, + "learning_rate": 6.333150609839661e-06, + "loss": 0.2257, + "step": 27508 + }, + { + "epoch": 1.1731280520889853, + "grad_norm": 5.175912380218506, + "learning_rate": 6.333013567219406e-06, + "loss": 0.3308, + "step": 27509 + }, + { + "epoch": 1.1731416169289202, + "grad_norm": 5.742412567138672, + "learning_rate": 6.332876524599151e-06, + "loss": 0.3471, + "step": 27510 + }, + { + "epoch": 1.173155181768855, + "grad_norm": 4.58853006362915, + "learning_rate": 6.3327394819788956e-06, + "loss": 0.137, + "step": 27511 + }, + { + "epoch": 1.17316874660879, + "grad_norm": 6.522858142852783, + "learning_rate": 6.332602439358642e-06, + "loss": 0.439, + "step": 27512 + }, + { + "epoch": 1.173182311448725, + "grad_norm": 5.42319393157959, + "learning_rate": 6.332465396738387e-06, + "loss": 0.33, + "step": 27513 + }, + { + "epoch": 1.1731958762886598, + "grad_norm": 5.847990036010742, + "learning_rate": 6.332328354118131e-06, + "loss": 0.21, + "step": 27514 + }, + { + "epoch": 1.1732094411285947, + "grad_norm": 3.892472743988037, + "learning_rate": 6.332191311497876e-06, + "loss": 0.2422, + "step": 27515 + }, + { + "epoch": 1.1732230059685296, + "grad_norm": 5.039498329162598, + "learning_rate": 6.332054268877621e-06, + "loss": 0.203, + "step": 27516 + }, + { + "epoch": 1.1732365708084644, + "grad_norm": 5.255943298339844, + "learning_rate": 6.331917226257367e-06, + "loss": 0.3013, + "step": 27517 + }, + { + "epoch": 1.1732501356483993, + "grad_norm": 6.196012496948242, + "learning_rate": 6.331780183637112e-06, + "loss": 0.3372, + "step": 27518 + }, + { + "epoch": 1.1732637004883342, + "grad_norm": 5.486910820007324, + "learning_rate": 6.331643141016857e-06, + "loss": 0.2929, + "step": 27519 + }, + { + "epoch": 1.173277265328269, + "grad_norm": 4.102426528930664, + "learning_rate": 6.331506098396601e-06, + "loss": 0.1878, + "step": 27520 + }, + { + "epoch": 1.1732908301682041, + "grad_norm": 5.990221977233887, + "learning_rate": 6.331369055776347e-06, + "loss": 0.2093, + "step": 27521 + }, + { + "epoch": 1.173304395008139, + "grad_norm": 5.519440174102783, + "learning_rate": 6.3312320131560925e-06, + "loss": 0.2031, + "step": 27522 + }, + { + "epoch": 1.1733179598480739, + "grad_norm": 4.258707523345947, + "learning_rate": 6.331094970535837e-06, + "loss": 0.2965, + "step": 27523 + }, + { + "epoch": 1.1733315246880087, + "grad_norm": 5.3519511222839355, + "learning_rate": 6.330957927915582e-06, + "loss": 0.2832, + "step": 27524 + }, + { + "epoch": 1.1733450895279436, + "grad_norm": 4.08549690246582, + "learning_rate": 6.330820885295328e-06, + "loss": 0.1671, + "step": 27525 + }, + { + "epoch": 1.1733586543678785, + "grad_norm": 5.087831497192383, + "learning_rate": 6.330683842675072e-06, + "loss": 0.2139, + "step": 27526 + }, + { + "epoch": 1.1733722192078133, + "grad_norm": 4.259270668029785, + "learning_rate": 6.3305468000548176e-06, + "loss": 0.172, + "step": 27527 + }, + { + "epoch": 1.1733857840477482, + "grad_norm": 4.067171573638916, + "learning_rate": 6.330409757434563e-06, + "loss": 0.2526, + "step": 27528 + }, + { + "epoch": 1.173399348887683, + "grad_norm": 6.443938255310059, + "learning_rate": 6.330272714814307e-06, + "loss": 0.2451, + "step": 27529 + }, + { + "epoch": 1.173412913727618, + "grad_norm": 5.895576477050781, + "learning_rate": 6.330135672194053e-06, + "loss": 0.2263, + "step": 27530 + }, + { + "epoch": 1.173426478567553, + "grad_norm": 5.516103267669678, + "learning_rate": 6.329998629573798e-06, + "loss": 0.2536, + "step": 27531 + }, + { + "epoch": 1.1734400434074879, + "grad_norm": 4.524691104888916, + "learning_rate": 6.329861586953543e-06, + "loss": 0.1978, + "step": 27532 + }, + { + "epoch": 1.1734536082474227, + "grad_norm": 6.828866481781006, + "learning_rate": 6.329724544333288e-06, + "loss": 0.4001, + "step": 27533 + }, + { + "epoch": 1.1734671730873576, + "grad_norm": 4.450119495391846, + "learning_rate": 6.329587501713034e-06, + "loss": 0.271, + "step": 27534 + }, + { + "epoch": 1.1734807379272925, + "grad_norm": 5.560507774353027, + "learning_rate": 6.329450459092778e-06, + "loss": 0.3113, + "step": 27535 + }, + { + "epoch": 1.1734943027672273, + "grad_norm": 6.004535675048828, + "learning_rate": 6.329313416472523e-06, + "loss": 0.2354, + "step": 27536 + }, + { + "epoch": 1.1735078676071622, + "grad_norm": 5.764222621917725, + "learning_rate": 6.3291763738522685e-06, + "loss": 0.3774, + "step": 27537 + }, + { + "epoch": 1.173521432447097, + "grad_norm": 5.272138595581055, + "learning_rate": 6.3290393312320145e-06, + "loss": 0.2486, + "step": 27538 + }, + { + "epoch": 1.173534997287032, + "grad_norm": 4.704272270202637, + "learning_rate": 6.328902288611759e-06, + "loss": 0.3037, + "step": 27539 + }, + { + "epoch": 1.173548562126967, + "grad_norm": 5.220237731933594, + "learning_rate": 6.328765245991504e-06, + "loss": 0.3806, + "step": 27540 + }, + { + "epoch": 1.1735621269669019, + "grad_norm": 5.6793107986450195, + "learning_rate": 6.328628203371248e-06, + "loss": 0.3009, + "step": 27541 + }, + { + "epoch": 1.1735756918068367, + "grad_norm": 4.5490617752075195, + "learning_rate": 6.3284911607509936e-06, + "loss": 0.219, + "step": 27542 + }, + { + "epoch": 1.1735892566467716, + "grad_norm": 5.018702030181885, + "learning_rate": 6.32835411813074e-06, + "loss": 0.2551, + "step": 27543 + }, + { + "epoch": 1.1736028214867065, + "grad_norm": 5.808999061584473, + "learning_rate": 6.328217075510485e-06, + "loss": 0.3806, + "step": 27544 + }, + { + "epoch": 1.1736163863266413, + "grad_norm": 5.0996928215026855, + "learning_rate": 6.328080032890229e-06, + "loss": 0.4156, + "step": 27545 + }, + { + "epoch": 1.1736299511665762, + "grad_norm": 5.965424537658691, + "learning_rate": 6.327942990269974e-06, + "loss": 0.3629, + "step": 27546 + }, + { + "epoch": 1.173643516006511, + "grad_norm": 5.158895015716553, + "learning_rate": 6.32780594764972e-06, + "loss": 0.4343, + "step": 27547 + }, + { + "epoch": 1.173657080846446, + "grad_norm": 5.801958084106445, + "learning_rate": 6.327668905029465e-06, + "loss": 0.2301, + "step": 27548 + }, + { + "epoch": 1.1736706456863808, + "grad_norm": 5.398126125335693, + "learning_rate": 6.32753186240921e-06, + "loss": 0.2815, + "step": 27549 + }, + { + "epoch": 1.1736842105263159, + "grad_norm": 5.425329208374023, + "learning_rate": 6.327394819788954e-06, + "loss": 0.3873, + "step": 27550 + }, + { + "epoch": 1.1736977753662508, + "grad_norm": 7.040549278259277, + "learning_rate": 6.3272577771687e-06, + "loss": 0.2789, + "step": 27551 + }, + { + "epoch": 1.1737113402061856, + "grad_norm": 5.690598011016846, + "learning_rate": 6.327120734548445e-06, + "loss": 0.2694, + "step": 27552 + }, + { + "epoch": 1.1737249050461205, + "grad_norm": 3.184192419052124, + "learning_rate": 6.3269836919281905e-06, + "loss": 0.1365, + "step": 27553 + }, + { + "epoch": 1.1737384698860553, + "grad_norm": 8.384414672851562, + "learning_rate": 6.326846649307935e-06, + "loss": 0.5156, + "step": 27554 + }, + { + "epoch": 1.1737520347259902, + "grad_norm": 4.627336502075195, + "learning_rate": 6.32670960668768e-06, + "loss": 0.2985, + "step": 27555 + }, + { + "epoch": 1.173765599565925, + "grad_norm": 5.566972732543945, + "learning_rate": 6.326572564067426e-06, + "loss": 0.3133, + "step": 27556 + }, + { + "epoch": 1.17377916440586, + "grad_norm": 7.328411102294922, + "learning_rate": 6.32643552144717e-06, + "loss": 0.369, + "step": 27557 + }, + { + "epoch": 1.1737927292457948, + "grad_norm": 5.5795464515686035, + "learning_rate": 6.326298478826916e-06, + "loss": 0.3153, + "step": 27558 + }, + { + "epoch": 1.17380629408573, + "grad_norm": 4.57732629776001, + "learning_rate": 6.326161436206661e-06, + "loss": 0.2, + "step": 27559 + }, + { + "epoch": 1.1738198589256648, + "grad_norm": 7.191882133483887, + "learning_rate": 6.326024393586406e-06, + "loss": 0.362, + "step": 27560 + }, + { + "epoch": 1.1738334237655996, + "grad_norm": 5.015823841094971, + "learning_rate": 6.325887350966151e-06, + "loss": 0.2288, + "step": 27561 + }, + { + "epoch": 1.1738469886055345, + "grad_norm": 5.978337287902832, + "learning_rate": 6.325750308345896e-06, + "loss": 0.4325, + "step": 27562 + }, + { + "epoch": 1.1738605534454694, + "grad_norm": 3.0829501152038574, + "learning_rate": 6.325613265725641e-06, + "loss": 0.1382, + "step": 27563 + }, + { + "epoch": 1.1738741182854042, + "grad_norm": 5.488622665405273, + "learning_rate": 6.325476223105387e-06, + "loss": 0.3664, + "step": 27564 + }, + { + "epoch": 1.173887683125339, + "grad_norm": 5.256516456604004, + "learning_rate": 6.325339180485132e-06, + "loss": 0.3183, + "step": 27565 + }, + { + "epoch": 1.173901247965274, + "grad_norm": 5.4197163581848145, + "learning_rate": 6.325202137864876e-06, + "loss": 0.2509, + "step": 27566 + }, + { + "epoch": 1.1739148128052088, + "grad_norm": 5.091210842132568, + "learning_rate": 6.325065095244621e-06, + "loss": 0.2879, + "step": 27567 + }, + { + "epoch": 1.1739283776451437, + "grad_norm": 6.096868515014648, + "learning_rate": 6.3249280526243665e-06, + "loss": 0.3135, + "step": 27568 + }, + { + "epoch": 1.1739419424850788, + "grad_norm": 5.781301975250244, + "learning_rate": 6.324791010004112e-06, + "loss": 0.2475, + "step": 27569 + }, + { + "epoch": 1.1739555073250136, + "grad_norm": 5.680377960205078, + "learning_rate": 6.324653967383857e-06, + "loss": 0.4472, + "step": 27570 + }, + { + "epoch": 1.1739690721649485, + "grad_norm": 5.525253772735596, + "learning_rate": 6.324516924763602e-06, + "loss": 0.3252, + "step": 27571 + }, + { + "epoch": 1.1739826370048834, + "grad_norm": 4.44828987121582, + "learning_rate": 6.324379882143346e-06, + "loss": 0.2674, + "step": 27572 + }, + { + "epoch": 1.1739962018448182, + "grad_norm": 10.020668983459473, + "learning_rate": 6.324242839523092e-06, + "loss": 0.2872, + "step": 27573 + }, + { + "epoch": 1.174009766684753, + "grad_norm": 6.12516450881958, + "learning_rate": 6.324105796902838e-06, + "loss": 0.3606, + "step": 27574 + }, + { + "epoch": 1.174023331524688, + "grad_norm": 5.1594977378845215, + "learning_rate": 6.323968754282582e-06, + "loss": 0.2584, + "step": 27575 + }, + { + "epoch": 1.1740368963646228, + "grad_norm": 6.189988136291504, + "learning_rate": 6.323831711662327e-06, + "loss": 0.2563, + "step": 27576 + }, + { + "epoch": 1.1740504612045577, + "grad_norm": 5.566863059997559, + "learning_rate": 6.323694669042073e-06, + "loss": 0.1791, + "step": 27577 + }, + { + "epoch": 1.1740640260444928, + "grad_norm": 5.8380889892578125, + "learning_rate": 6.323557626421818e-06, + "loss": 0.4123, + "step": 27578 + }, + { + "epoch": 1.1740775908844276, + "grad_norm": 4.139888286590576, + "learning_rate": 6.323420583801563e-06, + "loss": 0.2051, + "step": 27579 + }, + { + "epoch": 1.1740911557243625, + "grad_norm": 5.754692554473877, + "learning_rate": 6.323283541181308e-06, + "loss": 0.418, + "step": 27580 + }, + { + "epoch": 1.1741047205642974, + "grad_norm": 5.033836841583252, + "learning_rate": 6.323146498561054e-06, + "loss": 0.2325, + "step": 27581 + }, + { + "epoch": 1.1741182854042322, + "grad_norm": 4.442112445831299, + "learning_rate": 6.323009455940798e-06, + "loss": 0.2387, + "step": 27582 + }, + { + "epoch": 1.174131850244167, + "grad_norm": 4.402824878692627, + "learning_rate": 6.322872413320543e-06, + "loss": 0.2434, + "step": 27583 + }, + { + "epoch": 1.174145415084102, + "grad_norm": 4.293615818023682, + "learning_rate": 6.322735370700288e-06, + "loss": 0.228, + "step": 27584 + }, + { + "epoch": 1.1741589799240368, + "grad_norm": 7.023985385894775, + "learning_rate": 6.322598328080033e-06, + "loss": 0.3576, + "step": 27585 + }, + { + "epoch": 1.1741725447639717, + "grad_norm": 4.563082695007324, + "learning_rate": 6.322461285459779e-06, + "loss": 0.352, + "step": 27586 + }, + { + "epoch": 1.1741861096039066, + "grad_norm": 5.3566741943359375, + "learning_rate": 6.322324242839524e-06, + "loss": 0.2386, + "step": 27587 + }, + { + "epoch": 1.1741996744438417, + "grad_norm": 5.803813934326172, + "learning_rate": 6.322187200219268e-06, + "loss": 0.3831, + "step": 27588 + }, + { + "epoch": 1.1742132392837765, + "grad_norm": 6.307969570159912, + "learning_rate": 6.322050157599014e-06, + "loss": 0.2981, + "step": 27589 + }, + { + "epoch": 1.1742268041237114, + "grad_norm": 3.9509365558624268, + "learning_rate": 6.32191311497876e-06, + "loss": 0.183, + "step": 27590 + }, + { + "epoch": 1.1742403689636463, + "grad_norm": 5.644538402557373, + "learning_rate": 6.321776072358504e-06, + "loss": 0.2791, + "step": 27591 + }, + { + "epoch": 1.1742539338035811, + "grad_norm": 4.574007511138916, + "learning_rate": 6.321639029738249e-06, + "loss": 0.3358, + "step": 27592 + }, + { + "epoch": 1.174267498643516, + "grad_norm": 5.1736273765563965, + "learning_rate": 6.321501987117994e-06, + "loss": 0.3123, + "step": 27593 + }, + { + "epoch": 1.1742810634834508, + "grad_norm": 5.686485290527344, + "learning_rate": 6.3213649444977395e-06, + "loss": 0.3169, + "step": 27594 + }, + { + "epoch": 1.1742946283233857, + "grad_norm": 4.324814796447754, + "learning_rate": 6.321227901877485e-06, + "loss": 0.2898, + "step": 27595 + }, + { + "epoch": 1.1743081931633206, + "grad_norm": 6.158565044403076, + "learning_rate": 6.32109085925723e-06, + "loss": 0.3017, + "step": 27596 + }, + { + "epoch": 1.1743217580032557, + "grad_norm": 4.493769645690918, + "learning_rate": 6.320953816636974e-06, + "loss": 0.3062, + "step": 27597 + }, + { + "epoch": 1.1743353228431905, + "grad_norm": 3.66471791267395, + "learning_rate": 6.320816774016719e-06, + "loss": 0.1972, + "step": 27598 + }, + { + "epoch": 1.1743488876831254, + "grad_norm": 4.257416248321533, + "learning_rate": 6.320679731396465e-06, + "loss": 0.2676, + "step": 27599 + }, + { + "epoch": 1.1743624525230603, + "grad_norm": 6.0491228103637695, + "learning_rate": 6.32054268877621e-06, + "loss": 0.3873, + "step": 27600 + }, + { + "epoch": 1.1743760173629951, + "grad_norm": 5.636769771575928, + "learning_rate": 6.320405646155955e-06, + "loss": 0.3288, + "step": 27601 + }, + { + "epoch": 1.17438958220293, + "grad_norm": 5.380587100982666, + "learning_rate": 6.3202686035357e-06, + "loss": 0.3681, + "step": 27602 + }, + { + "epoch": 1.1744031470428649, + "grad_norm": 4.946429252624512, + "learning_rate": 6.320131560915445e-06, + "loss": 0.3391, + "step": 27603 + }, + { + "epoch": 1.1744167118827997, + "grad_norm": 4.247523307800293, + "learning_rate": 6.31999451829519e-06, + "loss": 0.219, + "step": 27604 + }, + { + "epoch": 1.1744302767227346, + "grad_norm": 5.469661235809326, + "learning_rate": 6.319857475674936e-06, + "loss": 0.2338, + "step": 27605 + }, + { + "epoch": 1.1744438415626695, + "grad_norm": 6.350358009338379, + "learning_rate": 6.31972043305468e-06, + "loss": 0.3605, + "step": 27606 + }, + { + "epoch": 1.1744574064026045, + "grad_norm": 5.144103050231934, + "learning_rate": 6.319583390434426e-06, + "loss": 0.2076, + "step": 27607 + }, + { + "epoch": 1.1744709712425394, + "grad_norm": 6.712394714355469, + "learning_rate": 6.319446347814171e-06, + "loss": 0.3798, + "step": 27608 + }, + { + "epoch": 1.1744845360824743, + "grad_norm": 5.896434783935547, + "learning_rate": 6.3193093051939155e-06, + "loss": 0.3601, + "step": 27609 + }, + { + "epoch": 1.1744981009224091, + "grad_norm": 7.139273166656494, + "learning_rate": 6.319172262573661e-06, + "loss": 0.6205, + "step": 27610 + }, + { + "epoch": 1.174511665762344, + "grad_norm": 5.854272365570068, + "learning_rate": 6.319035219953406e-06, + "loss": 0.4078, + "step": 27611 + }, + { + "epoch": 1.1745252306022789, + "grad_norm": 4.618374347686768, + "learning_rate": 6.318898177333152e-06, + "loss": 0.2054, + "step": 27612 + }, + { + "epoch": 1.1745387954422137, + "grad_norm": 4.0362443923950195, + "learning_rate": 6.318761134712896e-06, + "loss": 0.2896, + "step": 27613 + }, + { + "epoch": 1.1745523602821486, + "grad_norm": 5.019088268280029, + "learning_rate": 6.318624092092641e-06, + "loss": 0.1775, + "step": 27614 + }, + { + "epoch": 1.1745659251220835, + "grad_norm": 4.868256092071533, + "learning_rate": 6.318487049472386e-06, + "loss": 0.2921, + "step": 27615 + }, + { + "epoch": 1.1745794899620186, + "grad_norm": 4.288237571716309, + "learning_rate": 6.318350006852132e-06, + "loss": 0.2849, + "step": 27616 + }, + { + "epoch": 1.1745930548019534, + "grad_norm": 3.6381425857543945, + "learning_rate": 6.318212964231877e-06, + "loss": 0.1898, + "step": 27617 + }, + { + "epoch": 1.1746066196418883, + "grad_norm": 4.477959156036377, + "learning_rate": 6.318075921611621e-06, + "loss": 0.1888, + "step": 27618 + }, + { + "epoch": 1.1746201844818231, + "grad_norm": 5.22491455078125, + "learning_rate": 6.317938878991366e-06, + "loss": 0.3047, + "step": 27619 + }, + { + "epoch": 1.174633749321758, + "grad_norm": 5.2854180335998535, + "learning_rate": 6.3178018363711124e-06, + "loss": 0.1966, + "step": 27620 + }, + { + "epoch": 1.1746473141616929, + "grad_norm": 5.087093353271484, + "learning_rate": 6.317664793750858e-06, + "loss": 0.3241, + "step": 27621 + }, + { + "epoch": 1.1746608790016277, + "grad_norm": 5.310677528381348, + "learning_rate": 6.317527751130602e-06, + "loss": 0.2539, + "step": 27622 + }, + { + "epoch": 1.1746744438415626, + "grad_norm": 4.086109638214111, + "learning_rate": 6.317390708510347e-06, + "loss": 0.1722, + "step": 27623 + }, + { + "epoch": 1.1746880086814975, + "grad_norm": 4.909712791442871, + "learning_rate": 6.3172536658900915e-06, + "loss": 0.2093, + "step": 27624 + }, + { + "epoch": 1.1747015735214323, + "grad_norm": 5.7347331047058105, + "learning_rate": 6.3171166232698375e-06, + "loss": 0.4011, + "step": 27625 + }, + { + "epoch": 1.1747151383613674, + "grad_norm": 4.352654933929443, + "learning_rate": 6.316979580649583e-06, + "loss": 0.3173, + "step": 27626 + }, + { + "epoch": 1.1747287032013023, + "grad_norm": 4.484870910644531, + "learning_rate": 6.316842538029328e-06, + "loss": 0.2279, + "step": 27627 + }, + { + "epoch": 1.1747422680412372, + "grad_norm": 4.805649757385254, + "learning_rate": 6.316705495409072e-06, + "loss": 0.3658, + "step": 27628 + }, + { + "epoch": 1.174755832881172, + "grad_norm": 4.339649200439453, + "learning_rate": 6.316568452788818e-06, + "loss": 0.3293, + "step": 27629 + }, + { + "epoch": 1.174769397721107, + "grad_norm": 3.5982117652893066, + "learning_rate": 6.316431410168563e-06, + "loss": 0.1299, + "step": 27630 + }, + { + "epoch": 1.1747829625610418, + "grad_norm": 4.950983047485352, + "learning_rate": 6.316294367548308e-06, + "loss": 0.2917, + "step": 27631 + }, + { + "epoch": 1.1747965274009766, + "grad_norm": 4.288205146789551, + "learning_rate": 6.316157324928053e-06, + "loss": 0.1544, + "step": 27632 + }, + { + "epoch": 1.1748100922409115, + "grad_norm": 4.549182891845703, + "learning_rate": 6.316020282307799e-06, + "loss": 0.3568, + "step": 27633 + }, + { + "epoch": 1.1748236570808464, + "grad_norm": 3.9870758056640625, + "learning_rate": 6.315883239687543e-06, + "loss": 0.2517, + "step": 27634 + }, + { + "epoch": 1.1748372219207814, + "grad_norm": 6.565128326416016, + "learning_rate": 6.315746197067288e-06, + "loss": 0.419, + "step": 27635 + }, + { + "epoch": 1.1748507867607163, + "grad_norm": 4.083813667297363, + "learning_rate": 6.315609154447034e-06, + "loss": 0.166, + "step": 27636 + }, + { + "epoch": 1.1748643516006512, + "grad_norm": 4.530096530914307, + "learning_rate": 6.315472111826778e-06, + "loss": 0.2215, + "step": 27637 + }, + { + "epoch": 1.174877916440586, + "grad_norm": 3.810502767562866, + "learning_rate": 6.315335069206524e-06, + "loss": 0.2094, + "step": 27638 + }, + { + "epoch": 1.174891481280521, + "grad_norm": 3.2574567794799805, + "learning_rate": 6.315198026586269e-06, + "loss": 0.1211, + "step": 27639 + }, + { + "epoch": 1.1749050461204558, + "grad_norm": 5.269933223724365, + "learning_rate": 6.3150609839660135e-06, + "loss": 0.271, + "step": 27640 + }, + { + "epoch": 1.1749186109603906, + "grad_norm": 4.2562971115112305, + "learning_rate": 6.314923941345759e-06, + "loss": 0.2261, + "step": 27641 + }, + { + "epoch": 1.1749321758003255, + "grad_norm": 3.946045398712158, + "learning_rate": 6.314786898725505e-06, + "loss": 0.2763, + "step": 27642 + }, + { + "epoch": 1.1749457406402604, + "grad_norm": 5.382015705108643, + "learning_rate": 6.314649856105249e-06, + "loss": 0.2678, + "step": 27643 + }, + { + "epoch": 1.1749593054801952, + "grad_norm": 6.141302585601807, + "learning_rate": 6.314512813484994e-06, + "loss": 0.3546, + "step": 27644 + }, + { + "epoch": 1.1749728703201303, + "grad_norm": 5.140569686889648, + "learning_rate": 6.314375770864739e-06, + "loss": 0.2366, + "step": 27645 + }, + { + "epoch": 1.1749864351600652, + "grad_norm": 4.043727874755859, + "learning_rate": 6.314238728244485e-06, + "loss": 0.1886, + "step": 27646 + }, + { + "epoch": 1.175, + "grad_norm": 3.937040328979492, + "learning_rate": 6.31410168562423e-06, + "loss": 0.1791, + "step": 27647 + }, + { + "epoch": 1.175013564839935, + "grad_norm": 3.7626562118530273, + "learning_rate": 6.313964643003975e-06, + "loss": 0.1978, + "step": 27648 + }, + { + "epoch": 1.1750271296798698, + "grad_norm": 4.235040187835693, + "learning_rate": 6.313827600383719e-06, + "loss": 0.2625, + "step": 27649 + }, + { + "epoch": 1.1750406945198046, + "grad_norm": 4.521298885345459, + "learning_rate": 6.313690557763464e-06, + "loss": 0.3028, + "step": 27650 + }, + { + "epoch": 1.1750542593597395, + "grad_norm": 4.1512675285339355, + "learning_rate": 6.3135535151432104e-06, + "loss": 0.1808, + "step": 27651 + }, + { + "epoch": 1.1750678241996744, + "grad_norm": 3.3291430473327637, + "learning_rate": 6.313416472522956e-06, + "loss": 0.1495, + "step": 27652 + }, + { + "epoch": 1.1750813890396092, + "grad_norm": 4.838092803955078, + "learning_rate": 6.3132794299027e-06, + "loss": 0.2651, + "step": 27653 + }, + { + "epoch": 1.1750949538795443, + "grad_norm": 4.636582374572754, + "learning_rate": 6.313142387282445e-06, + "loss": 0.1895, + "step": 27654 + }, + { + "epoch": 1.1751085187194792, + "grad_norm": 4.164950847625732, + "learning_rate": 6.313005344662191e-06, + "loss": 0.1925, + "step": 27655 + }, + { + "epoch": 1.175122083559414, + "grad_norm": 5.214596271514893, + "learning_rate": 6.3128683020419355e-06, + "loss": 0.2193, + "step": 27656 + }, + { + "epoch": 1.175135648399349, + "grad_norm": 5.62635612487793, + "learning_rate": 6.312731259421681e-06, + "loss": 0.3601, + "step": 27657 + }, + { + "epoch": 1.1751492132392838, + "grad_norm": 3.8780949115753174, + "learning_rate": 6.312594216801425e-06, + "loss": 0.202, + "step": 27658 + }, + { + "epoch": 1.1751627780792187, + "grad_norm": 3.3270263671875, + "learning_rate": 6.312457174181171e-06, + "loss": 0.1422, + "step": 27659 + }, + { + "epoch": 1.1751763429191535, + "grad_norm": 6.666824817657471, + "learning_rate": 6.312320131560916e-06, + "loss": 0.2115, + "step": 27660 + }, + { + "epoch": 1.1751899077590884, + "grad_norm": 6.312719345092773, + "learning_rate": 6.312183088940661e-06, + "loss": 0.2048, + "step": 27661 + }, + { + "epoch": 1.1752034725990232, + "grad_norm": 3.9787745475769043, + "learning_rate": 6.312046046320406e-06, + "loss": 0.154, + "step": 27662 + }, + { + "epoch": 1.1752170374389581, + "grad_norm": 5.9210944175720215, + "learning_rate": 6.311909003700152e-06, + "loss": 0.3498, + "step": 27663 + }, + { + "epoch": 1.1752306022788932, + "grad_norm": 4.14218807220459, + "learning_rate": 6.311771961079897e-06, + "loss": 0.1283, + "step": 27664 + }, + { + "epoch": 1.175244167118828, + "grad_norm": 4.937883377075195, + "learning_rate": 6.311634918459641e-06, + "loss": 0.2861, + "step": 27665 + }, + { + "epoch": 1.175257731958763, + "grad_norm": 4.699988842010498, + "learning_rate": 6.3114978758393864e-06, + "loss": 0.1739, + "step": 27666 + }, + { + "epoch": 1.1752712967986978, + "grad_norm": 4.27992057800293, + "learning_rate": 6.311360833219131e-06, + "loss": 0.2425, + "step": 27667 + }, + { + "epoch": 1.1752848616386327, + "grad_norm": 6.356965065002441, + "learning_rate": 6.311223790598877e-06, + "loss": 0.2429, + "step": 27668 + }, + { + "epoch": 1.1752984264785675, + "grad_norm": 5.663723468780518, + "learning_rate": 6.311086747978622e-06, + "loss": 0.2066, + "step": 27669 + }, + { + "epoch": 1.1753119913185024, + "grad_norm": 6.443272590637207, + "learning_rate": 6.310949705358367e-06, + "loss": 0.349, + "step": 27670 + }, + { + "epoch": 1.1753255561584373, + "grad_norm": 5.524223804473877, + "learning_rate": 6.3108126627381115e-06, + "loss": 0.2423, + "step": 27671 + }, + { + "epoch": 1.1753391209983721, + "grad_norm": 5.096346855163574, + "learning_rate": 6.3106756201178575e-06, + "loss": 0.2486, + "step": 27672 + }, + { + "epoch": 1.1753526858383072, + "grad_norm": 5.6894378662109375, + "learning_rate": 6.310538577497603e-06, + "loss": 0.3104, + "step": 27673 + }, + { + "epoch": 1.175366250678242, + "grad_norm": 5.084975242614746, + "learning_rate": 6.310401534877347e-06, + "loss": 0.1614, + "step": 27674 + }, + { + "epoch": 1.175379815518177, + "grad_norm": 6.756911277770996, + "learning_rate": 6.310264492257092e-06, + "loss": 0.2997, + "step": 27675 + }, + { + "epoch": 1.1753933803581118, + "grad_norm": 8.571399688720703, + "learning_rate": 6.310127449636838e-06, + "loss": 0.3864, + "step": 27676 + }, + { + "epoch": 1.1754069451980467, + "grad_norm": 4.234395503997803, + "learning_rate": 6.3099904070165825e-06, + "loss": 0.1439, + "step": 27677 + }, + { + "epoch": 1.1754205100379815, + "grad_norm": 4.515933513641357, + "learning_rate": 6.309853364396328e-06, + "loss": 0.225, + "step": 27678 + }, + { + "epoch": 1.1754340748779164, + "grad_norm": 3.9940648078918457, + "learning_rate": 6.309716321776073e-06, + "loss": 0.2016, + "step": 27679 + }, + { + "epoch": 1.1754476397178513, + "grad_norm": 5.665682792663574, + "learning_rate": 6.309579279155817e-06, + "loss": 0.273, + "step": 27680 + }, + { + "epoch": 1.1754612045577861, + "grad_norm": 5.5133771896362305, + "learning_rate": 6.309442236535563e-06, + "loss": 0.2257, + "step": 27681 + }, + { + "epoch": 1.175474769397721, + "grad_norm": 5.263038635253906, + "learning_rate": 6.3093051939153084e-06, + "loss": 0.3235, + "step": 27682 + }, + { + "epoch": 1.175488334237656, + "grad_norm": 6.470577716827393, + "learning_rate": 6.309168151295053e-06, + "loss": 0.2571, + "step": 27683 + }, + { + "epoch": 1.175501899077591, + "grad_norm": 4.460507869720459, + "learning_rate": 6.309031108674798e-06, + "loss": 0.2052, + "step": 27684 + }, + { + "epoch": 1.1755154639175258, + "grad_norm": 5.065907955169678, + "learning_rate": 6.308894066054544e-06, + "loss": 0.2024, + "step": 27685 + }, + { + "epoch": 1.1755290287574607, + "grad_norm": 5.694818019866943, + "learning_rate": 6.308757023434289e-06, + "loss": 0.2597, + "step": 27686 + }, + { + "epoch": 1.1755425935973955, + "grad_norm": 4.820868492126465, + "learning_rate": 6.3086199808140335e-06, + "loss": 0.2916, + "step": 27687 + }, + { + "epoch": 1.1755561584373304, + "grad_norm": 8.160469055175781, + "learning_rate": 6.308482938193779e-06, + "loss": 0.3305, + "step": 27688 + }, + { + "epoch": 1.1755697232772653, + "grad_norm": 4.211348533630371, + "learning_rate": 6.308345895573525e-06, + "loss": 0.1843, + "step": 27689 + }, + { + "epoch": 1.1755832881172001, + "grad_norm": 5.167481422424316, + "learning_rate": 6.308208852953269e-06, + "loss": 0.2684, + "step": 27690 + }, + { + "epoch": 1.1755968529571352, + "grad_norm": 5.169085502624512, + "learning_rate": 6.308071810333014e-06, + "loss": 0.3004, + "step": 27691 + }, + { + "epoch": 1.17561041779707, + "grad_norm": 5.335407733917236, + "learning_rate": 6.3079347677127585e-06, + "loss": 0.2076, + "step": 27692 + }, + { + "epoch": 1.175623982637005, + "grad_norm": 5.6252970695495605, + "learning_rate": 6.307797725092504e-06, + "loss": 0.3461, + "step": 27693 + }, + { + "epoch": 1.1756375474769398, + "grad_norm": 9.647411346435547, + "learning_rate": 6.30766068247225e-06, + "loss": 0.2134, + "step": 27694 + }, + { + "epoch": 1.1756511123168747, + "grad_norm": 5.384635925292969, + "learning_rate": 6.307523639851995e-06, + "loss": 0.2414, + "step": 27695 + }, + { + "epoch": 1.1756646771568096, + "grad_norm": 5.764918804168701, + "learning_rate": 6.307386597231739e-06, + "loss": 0.2499, + "step": 27696 + }, + { + "epoch": 1.1756782419967444, + "grad_norm": 6.208261013031006, + "learning_rate": 6.3072495546114844e-06, + "loss": 0.3096, + "step": 27697 + }, + { + "epoch": 1.1756918068366793, + "grad_norm": 5.243136405944824, + "learning_rate": 6.3071125119912305e-06, + "loss": 0.2148, + "step": 27698 + }, + { + "epoch": 1.1757053716766142, + "grad_norm": 4.3398027420043945, + "learning_rate": 6.306975469370975e-06, + "loss": 0.2602, + "step": 27699 + }, + { + "epoch": 1.175718936516549, + "grad_norm": 5.211288928985596, + "learning_rate": 6.30683842675072e-06, + "loss": 0.2416, + "step": 27700 + }, + { + "epoch": 1.1757325013564839, + "grad_norm": 5.563959121704102, + "learning_rate": 6.306701384130465e-06, + "loss": 0.2275, + "step": 27701 + }, + { + "epoch": 1.175746066196419, + "grad_norm": 6.754264831542969, + "learning_rate": 6.30656434151021e-06, + "loss": 0.3699, + "step": 27702 + }, + { + "epoch": 1.1757596310363538, + "grad_norm": 5.737220287322998, + "learning_rate": 6.3064272988899555e-06, + "loss": 0.2606, + "step": 27703 + }, + { + "epoch": 1.1757731958762887, + "grad_norm": 4.381746768951416, + "learning_rate": 6.306290256269701e-06, + "loss": 0.2631, + "step": 27704 + }, + { + "epoch": 1.1757867607162236, + "grad_norm": 5.20731258392334, + "learning_rate": 6.306153213649445e-06, + "loss": 0.2748, + "step": 27705 + }, + { + "epoch": 1.1758003255561584, + "grad_norm": 6.742835521697998, + "learning_rate": 6.30601617102919e-06, + "loss": 0.2832, + "step": 27706 + }, + { + "epoch": 1.1758138903960933, + "grad_norm": 4.452515125274658, + "learning_rate": 6.305879128408936e-06, + "loss": 0.1843, + "step": 27707 + }, + { + "epoch": 1.1758274552360282, + "grad_norm": 4.895564556121826, + "learning_rate": 6.3057420857886806e-06, + "loss": 0.203, + "step": 27708 + }, + { + "epoch": 1.175841020075963, + "grad_norm": 6.438490867614746, + "learning_rate": 6.305605043168426e-06, + "loss": 0.2829, + "step": 27709 + }, + { + "epoch": 1.1758545849158981, + "grad_norm": 8.068788528442383, + "learning_rate": 6.305468000548171e-06, + "loss": 0.3422, + "step": 27710 + }, + { + "epoch": 1.175868149755833, + "grad_norm": 6.187329292297363, + "learning_rate": 6.305330957927916e-06, + "loss": 0.3699, + "step": 27711 + }, + { + "epoch": 1.1758817145957678, + "grad_norm": 5.1404266357421875, + "learning_rate": 6.305193915307661e-06, + "loss": 0.2879, + "step": 27712 + }, + { + "epoch": 1.1758952794357027, + "grad_norm": 4.838244915008545, + "learning_rate": 6.3050568726874064e-06, + "loss": 0.2525, + "step": 27713 + }, + { + "epoch": 1.1759088442756376, + "grad_norm": 6.120906829833984, + "learning_rate": 6.304919830067151e-06, + "loss": 0.3357, + "step": 27714 + }, + { + "epoch": 1.1759224091155724, + "grad_norm": 5.631261825561523, + "learning_rate": 6.304782787446897e-06, + "loss": 0.3367, + "step": 27715 + }, + { + "epoch": 1.1759359739555073, + "grad_norm": 5.385598182678223, + "learning_rate": 6.304645744826642e-06, + "loss": 0.3862, + "step": 27716 + }, + { + "epoch": 1.1759495387954422, + "grad_norm": 6.519283771514893, + "learning_rate": 6.304508702206386e-06, + "loss": 0.3362, + "step": 27717 + }, + { + "epoch": 1.175963103635377, + "grad_norm": 6.477061748504639, + "learning_rate": 6.3043716595861315e-06, + "loss": 0.2993, + "step": 27718 + }, + { + "epoch": 1.175976668475312, + "grad_norm": 6.121142387390137, + "learning_rate": 6.304234616965877e-06, + "loss": 0.4149, + "step": 27719 + }, + { + "epoch": 1.1759902333152468, + "grad_norm": 7.823376178741455, + "learning_rate": 6.304097574345623e-06, + "loss": 0.452, + "step": 27720 + }, + { + "epoch": 1.1760037981551819, + "grad_norm": 5.799432754516602, + "learning_rate": 6.303960531725367e-06, + "loss": 0.4131, + "step": 27721 + }, + { + "epoch": 1.1760173629951167, + "grad_norm": 5.099063873291016, + "learning_rate": 6.303823489105112e-06, + "loss": 0.4164, + "step": 27722 + }, + { + "epoch": 1.1760309278350516, + "grad_norm": 6.026178359985352, + "learning_rate": 6.3036864464848565e-06, + "loss": 0.3832, + "step": 27723 + }, + { + "epoch": 1.1760444926749865, + "grad_norm": 6.763906002044678, + "learning_rate": 6.3035494038646026e-06, + "loss": 0.3113, + "step": 27724 + }, + { + "epoch": 1.1760580575149213, + "grad_norm": 5.788537502288818, + "learning_rate": 6.303412361244348e-06, + "loss": 0.3604, + "step": 27725 + }, + { + "epoch": 1.1760716223548562, + "grad_norm": 4.74816370010376, + "learning_rate": 6.303275318624092e-06, + "loss": 0.2511, + "step": 27726 + }, + { + "epoch": 1.176085187194791, + "grad_norm": 5.089039325714111, + "learning_rate": 6.303138276003837e-06, + "loss": 0.2513, + "step": 27727 + }, + { + "epoch": 1.176098752034726, + "grad_norm": 5.2444939613342285, + "learning_rate": 6.303001233383583e-06, + "loss": 0.4046, + "step": 27728 + }, + { + "epoch": 1.176112316874661, + "grad_norm": 4.786661148071289, + "learning_rate": 6.3028641907633285e-06, + "loss": 0.2129, + "step": 27729 + }, + { + "epoch": 1.1761258817145959, + "grad_norm": 7.0638604164123535, + "learning_rate": 6.302727148143073e-06, + "loss": 0.3954, + "step": 27730 + }, + { + "epoch": 1.1761394465545307, + "grad_norm": 8.628999710083008, + "learning_rate": 6.302590105522818e-06, + "loss": 0.4227, + "step": 27731 + }, + { + "epoch": 1.1761530113944656, + "grad_norm": 4.745523929595947, + "learning_rate": 6.302453062902564e-06, + "loss": 0.2862, + "step": 27732 + }, + { + "epoch": 1.1761665762344005, + "grad_norm": 5.099331378936768, + "learning_rate": 6.302316020282308e-06, + "loss": 0.3128, + "step": 27733 + }, + { + "epoch": 1.1761801410743353, + "grad_norm": 6.318864345550537, + "learning_rate": 6.3021789776620535e-06, + "loss": 0.3644, + "step": 27734 + }, + { + "epoch": 1.1761937059142702, + "grad_norm": 3.99448299407959, + "learning_rate": 6.302041935041799e-06, + "loss": 0.1606, + "step": 27735 + }, + { + "epoch": 1.176207270754205, + "grad_norm": 5.975213527679443, + "learning_rate": 6.301904892421543e-06, + "loss": 0.2837, + "step": 27736 + }, + { + "epoch": 1.17622083559414, + "grad_norm": 9.04631233215332, + "learning_rate": 6.301767849801289e-06, + "loss": 0.3674, + "step": 27737 + }, + { + "epoch": 1.1762344004340748, + "grad_norm": 5.17930793762207, + "learning_rate": 6.301630807181034e-06, + "loss": 0.4586, + "step": 27738 + }, + { + "epoch": 1.1762479652740097, + "grad_norm": 5.813176155090332, + "learning_rate": 6.3014937645607786e-06, + "loss": 0.2582, + "step": 27739 + }, + { + "epoch": 1.1762615301139447, + "grad_norm": 4.918407917022705, + "learning_rate": 6.301356721940524e-06, + "loss": 0.2621, + "step": 27740 + }, + { + "epoch": 1.1762750949538796, + "grad_norm": 6.327028751373291, + "learning_rate": 6.30121967932027e-06, + "loss": 0.2685, + "step": 27741 + }, + { + "epoch": 1.1762886597938145, + "grad_norm": 3.4173028469085693, + "learning_rate": 6.301082636700014e-06, + "loss": 0.2337, + "step": 27742 + }, + { + "epoch": 1.1763022246337493, + "grad_norm": 4.747611999511719, + "learning_rate": 6.300945594079759e-06, + "loss": 0.3033, + "step": 27743 + }, + { + "epoch": 1.1763157894736842, + "grad_norm": 5.090986728668213, + "learning_rate": 6.3008085514595045e-06, + "loss": 0.3288, + "step": 27744 + }, + { + "epoch": 1.176329354313619, + "grad_norm": 5.140952110290527, + "learning_rate": 6.30067150883925e-06, + "loss": 0.3714, + "step": 27745 + }, + { + "epoch": 1.176342919153554, + "grad_norm": 6.035993576049805, + "learning_rate": 6.300534466218995e-06, + "loss": 0.3242, + "step": 27746 + }, + { + "epoch": 1.1763564839934888, + "grad_norm": 6.24556827545166, + "learning_rate": 6.30039742359874e-06, + "loss": 0.2701, + "step": 27747 + }, + { + "epoch": 1.1763700488334239, + "grad_norm": 7.070750713348389, + "learning_rate": 6.300260380978484e-06, + "loss": 0.356, + "step": 27748 + }, + { + "epoch": 1.1763836136733588, + "grad_norm": 4.528331756591797, + "learning_rate": 6.3001233383582295e-06, + "loss": 0.2372, + "step": 27749 + }, + { + "epoch": 1.1763971785132936, + "grad_norm": 6.655482769012451, + "learning_rate": 6.2999862957379755e-06, + "loss": 0.3323, + "step": 27750 + }, + { + "epoch": 1.1764107433532285, + "grad_norm": 6.17323112487793, + "learning_rate": 6.29984925311772e-06, + "loss": 0.3627, + "step": 27751 + }, + { + "epoch": 1.1764243081931633, + "grad_norm": 6.649962902069092, + "learning_rate": 6.299712210497465e-06, + "loss": 0.5043, + "step": 27752 + }, + { + "epoch": 1.1764378730330982, + "grad_norm": 4.590428829193115, + "learning_rate": 6.29957516787721e-06, + "loss": 0.302, + "step": 27753 + }, + { + "epoch": 1.176451437873033, + "grad_norm": 3.827073574066162, + "learning_rate": 6.299438125256956e-06, + "loss": 0.1653, + "step": 27754 + }, + { + "epoch": 1.176465002712968, + "grad_norm": 6.000858306884766, + "learning_rate": 6.2993010826367006e-06, + "loss": 0.274, + "step": 27755 + }, + { + "epoch": 1.1764785675529028, + "grad_norm": 5.373006820678711, + "learning_rate": 6.299164040016446e-06, + "loss": 0.3187, + "step": 27756 + }, + { + "epoch": 1.1764921323928377, + "grad_norm": 4.829432487487793, + "learning_rate": 6.29902699739619e-06, + "loss": 0.315, + "step": 27757 + }, + { + "epoch": 1.1765056972327725, + "grad_norm": 6.426974296569824, + "learning_rate": 6.298889954775936e-06, + "loss": 0.3437, + "step": 27758 + }, + { + "epoch": 1.1765192620727076, + "grad_norm": 5.8695268630981445, + "learning_rate": 6.298752912155681e-06, + "loss": 0.3199, + "step": 27759 + }, + { + "epoch": 1.1765328269126425, + "grad_norm": 4.8676581382751465, + "learning_rate": 6.298615869535426e-06, + "loss": 0.2431, + "step": 27760 + }, + { + "epoch": 1.1765463917525774, + "grad_norm": 5.942086696624756, + "learning_rate": 6.298478826915171e-06, + "loss": 0.2828, + "step": 27761 + }, + { + "epoch": 1.1765599565925122, + "grad_norm": 8.50118637084961, + "learning_rate": 6.298341784294916e-06, + "loss": 0.4684, + "step": 27762 + }, + { + "epoch": 1.176573521432447, + "grad_norm": 4.440722465515137, + "learning_rate": 6.298204741674662e-06, + "loss": 0.2499, + "step": 27763 + }, + { + "epoch": 1.176587086272382, + "grad_norm": 4.123361587524414, + "learning_rate": 6.298067699054406e-06, + "loss": 0.219, + "step": 27764 + }, + { + "epoch": 1.1766006511123168, + "grad_norm": 4.272416591644287, + "learning_rate": 6.2979306564341515e-06, + "loss": 0.233, + "step": 27765 + }, + { + "epoch": 1.1766142159522517, + "grad_norm": 5.287966251373291, + "learning_rate": 6.297793613813896e-06, + "loss": 0.2954, + "step": 27766 + }, + { + "epoch": 1.1766277807921868, + "grad_norm": 5.0098981857299805, + "learning_rate": 6.297656571193642e-06, + "loss": 0.3232, + "step": 27767 + }, + { + "epoch": 1.1766413456321216, + "grad_norm": 5.086326599121094, + "learning_rate": 6.297519528573387e-06, + "loss": 0.366, + "step": 27768 + }, + { + "epoch": 1.1766549104720565, + "grad_norm": 6.2390313148498535, + "learning_rate": 6.297382485953132e-06, + "loss": 0.3578, + "step": 27769 + }, + { + "epoch": 1.1766684753119914, + "grad_norm": 6.869335651397705, + "learning_rate": 6.2972454433328766e-06, + "loss": 0.4107, + "step": 27770 + }, + { + "epoch": 1.1766820401519262, + "grad_norm": 6.814325332641602, + "learning_rate": 6.297108400712623e-06, + "loss": 0.328, + "step": 27771 + }, + { + "epoch": 1.176695604991861, + "grad_norm": 3.9926538467407227, + "learning_rate": 6.296971358092368e-06, + "loss": 0.2369, + "step": 27772 + }, + { + "epoch": 1.176709169831796, + "grad_norm": 4.630190372467041, + "learning_rate": 6.296834315472112e-06, + "loss": 0.2044, + "step": 27773 + }, + { + "epoch": 1.1767227346717308, + "grad_norm": 5.936595439910889, + "learning_rate": 6.296697272851857e-06, + "loss": 0.2502, + "step": 27774 + }, + { + "epoch": 1.1767362995116657, + "grad_norm": 4.617407321929932, + "learning_rate": 6.296560230231602e-06, + "loss": 0.2718, + "step": 27775 + }, + { + "epoch": 1.1767498643516006, + "grad_norm": 5.127028465270996, + "learning_rate": 6.296423187611348e-06, + "loss": 0.3069, + "step": 27776 + }, + { + "epoch": 1.1767634291915354, + "grad_norm": 5.19568395614624, + "learning_rate": 6.296286144991093e-06, + "loss": 0.3276, + "step": 27777 + }, + { + "epoch": 1.1767769940314705, + "grad_norm": 4.417776107788086, + "learning_rate": 6.296149102370838e-06, + "loss": 0.3011, + "step": 27778 + }, + { + "epoch": 1.1767905588714054, + "grad_norm": 6.697633743286133, + "learning_rate": 6.296012059750582e-06, + "loss": 0.2981, + "step": 27779 + }, + { + "epoch": 1.1768041237113402, + "grad_norm": 4.321652889251709, + "learning_rate": 6.295875017130328e-06, + "loss": 0.2096, + "step": 27780 + }, + { + "epoch": 1.176817688551275, + "grad_norm": 4.403038501739502, + "learning_rate": 6.2957379745100735e-06, + "loss": 0.2037, + "step": 27781 + }, + { + "epoch": 1.17683125339121, + "grad_norm": 3.6761887073516846, + "learning_rate": 6.295600931889818e-06, + "loss": 0.1795, + "step": 27782 + }, + { + "epoch": 1.1768448182311448, + "grad_norm": 7.880154132843018, + "learning_rate": 6.295463889269563e-06, + "loss": 0.3439, + "step": 27783 + }, + { + "epoch": 1.1768583830710797, + "grad_norm": 5.59135627746582, + "learning_rate": 6.295326846649309e-06, + "loss": 0.2732, + "step": 27784 + }, + { + "epoch": 1.1768719479110146, + "grad_norm": 5.774371147155762, + "learning_rate": 6.295189804029053e-06, + "loss": 0.2618, + "step": 27785 + }, + { + "epoch": 1.1768855127509497, + "grad_norm": 4.702739238739014, + "learning_rate": 6.2950527614087986e-06, + "loss": 0.2226, + "step": 27786 + }, + { + "epoch": 1.1768990775908845, + "grad_norm": 5.401529788970947, + "learning_rate": 6.294915718788544e-06, + "loss": 0.2416, + "step": 27787 + }, + { + "epoch": 1.1769126424308194, + "grad_norm": 5.478165149688721, + "learning_rate": 6.294778676168288e-06, + "loss": 0.1757, + "step": 27788 + }, + { + "epoch": 1.1769262072707543, + "grad_norm": 4.327924728393555, + "learning_rate": 6.294641633548034e-06, + "loss": 0.1994, + "step": 27789 + }, + { + "epoch": 1.1769397721106891, + "grad_norm": 4.508748531341553, + "learning_rate": 6.294504590927779e-06, + "loss": 0.2606, + "step": 27790 + }, + { + "epoch": 1.176953336950624, + "grad_norm": 4.488213539123535, + "learning_rate": 6.294367548307524e-06, + "loss": 0.1817, + "step": 27791 + }, + { + "epoch": 1.1769669017905589, + "grad_norm": 6.288574695587158, + "learning_rate": 6.294230505687269e-06, + "loss": 0.2949, + "step": 27792 + }, + { + "epoch": 1.1769804666304937, + "grad_norm": 5.5584187507629395, + "learning_rate": 6.294093463067015e-06, + "loss": 0.2376, + "step": 27793 + }, + { + "epoch": 1.1769940314704286, + "grad_norm": 6.359118461608887, + "learning_rate": 6.29395642044676e-06, + "loss": 0.3249, + "step": 27794 + }, + { + "epoch": 1.1770075963103634, + "grad_norm": 5.646134376525879, + "learning_rate": 6.293819377826504e-06, + "loss": 0.2352, + "step": 27795 + }, + { + "epoch": 1.1770211611502983, + "grad_norm": 4.538043022155762, + "learning_rate": 6.2936823352062495e-06, + "loss": 0.2456, + "step": 27796 + }, + { + "epoch": 1.1770347259902334, + "grad_norm": 3.4031906127929688, + "learning_rate": 6.2935452925859955e-06, + "loss": 0.2396, + "step": 27797 + }, + { + "epoch": 1.1770482908301683, + "grad_norm": 4.062641620635986, + "learning_rate": 6.29340824996574e-06, + "loss": 0.2351, + "step": 27798 + }, + { + "epoch": 1.1770618556701031, + "grad_norm": 5.6823930740356445, + "learning_rate": 6.293271207345485e-06, + "loss": 0.3107, + "step": 27799 + }, + { + "epoch": 1.177075420510038, + "grad_norm": 6.449098587036133, + "learning_rate": 6.293134164725229e-06, + "loss": 0.2966, + "step": 27800 + }, + { + "epoch": 1.1770889853499729, + "grad_norm": 6.035422325134277, + "learning_rate": 6.292997122104975e-06, + "loss": 0.3229, + "step": 27801 + }, + { + "epoch": 1.1771025501899077, + "grad_norm": 4.899925708770752, + "learning_rate": 6.292860079484721e-06, + "loss": 0.297, + "step": 27802 + }, + { + "epoch": 1.1771161150298426, + "grad_norm": 5.022427082061768, + "learning_rate": 6.292723036864466e-06, + "loss": 0.3361, + "step": 27803 + }, + { + "epoch": 1.1771296798697775, + "grad_norm": 5.512683868408203, + "learning_rate": 6.29258599424421e-06, + "loss": 0.2926, + "step": 27804 + }, + { + "epoch": 1.1771432447097125, + "grad_norm": 5.914462566375732, + "learning_rate": 6.292448951623955e-06, + "loss": 0.3542, + "step": 27805 + }, + { + "epoch": 1.1771568095496474, + "grad_norm": 4.573061466217041, + "learning_rate": 6.292311909003701e-06, + "loss": 0.3118, + "step": 27806 + }, + { + "epoch": 1.1771703743895823, + "grad_norm": 5.535287857055664, + "learning_rate": 6.292174866383446e-06, + "loss": 0.2823, + "step": 27807 + }, + { + "epoch": 1.1771839392295171, + "grad_norm": 3.416916847229004, + "learning_rate": 6.292037823763191e-06, + "loss": 0.1572, + "step": 27808 + }, + { + "epoch": 1.177197504069452, + "grad_norm": 4.01552152633667, + "learning_rate": 6.291900781142935e-06, + "loss": 0.2825, + "step": 27809 + }, + { + "epoch": 1.1772110689093869, + "grad_norm": 4.812724590301514, + "learning_rate": 6.291763738522681e-06, + "loss": 0.3008, + "step": 27810 + }, + { + "epoch": 1.1772246337493217, + "grad_norm": 7.016321182250977, + "learning_rate": 6.291626695902426e-06, + "loss": 0.3273, + "step": 27811 + }, + { + "epoch": 1.1772381985892566, + "grad_norm": 5.639989376068115, + "learning_rate": 6.2914896532821715e-06, + "loss": 0.2431, + "step": 27812 + }, + { + "epoch": 1.1772517634291915, + "grad_norm": 7.671236515045166, + "learning_rate": 6.291352610661916e-06, + "loss": 0.4664, + "step": 27813 + }, + { + "epoch": 1.1772653282691263, + "grad_norm": 3.3848447799682617, + "learning_rate": 6.291215568041662e-06, + "loss": 0.1693, + "step": 27814 + }, + { + "epoch": 1.1772788931090612, + "grad_norm": 4.523842811584473, + "learning_rate": 6.291078525421407e-06, + "loss": 0.2222, + "step": 27815 + }, + { + "epoch": 1.1772924579489963, + "grad_norm": 6.195002555847168, + "learning_rate": 6.290941482801151e-06, + "loss": 0.3448, + "step": 27816 + }, + { + "epoch": 1.1773060227889312, + "grad_norm": 4.587676048278809, + "learning_rate": 6.290804440180897e-06, + "loss": 0.2243, + "step": 27817 + }, + { + "epoch": 1.177319587628866, + "grad_norm": 6.040753364562988, + "learning_rate": 6.290667397560642e-06, + "loss": 0.2842, + "step": 27818 + }, + { + "epoch": 1.1773331524688009, + "grad_norm": 6.350332260131836, + "learning_rate": 6.290530354940387e-06, + "loss": 0.2697, + "step": 27819 + }, + { + "epoch": 1.1773467173087357, + "grad_norm": 4.434407711029053, + "learning_rate": 6.290393312320132e-06, + "loss": 0.2656, + "step": 27820 + }, + { + "epoch": 1.1773602821486706, + "grad_norm": 6.133892059326172, + "learning_rate": 6.290256269699877e-06, + "loss": 0.3158, + "step": 27821 + }, + { + "epoch": 1.1773738469886055, + "grad_norm": 6.978968620300293, + "learning_rate": 6.290119227079622e-06, + "loss": 0.406, + "step": 27822 + }, + { + "epoch": 1.1773874118285403, + "grad_norm": 4.662793159484863, + "learning_rate": 6.289982184459368e-06, + "loss": 0.213, + "step": 27823 + }, + { + "epoch": 1.1774009766684754, + "grad_norm": 5.058218002319336, + "learning_rate": 6.289845141839113e-06, + "loss": 0.2153, + "step": 27824 + }, + { + "epoch": 1.1774145415084103, + "grad_norm": 5.752424240112305, + "learning_rate": 6.289708099218857e-06, + "loss": 0.3108, + "step": 27825 + }, + { + "epoch": 1.1774281063483452, + "grad_norm": 3.3444771766662598, + "learning_rate": 6.289571056598602e-06, + "loss": 0.2357, + "step": 27826 + }, + { + "epoch": 1.17744167118828, + "grad_norm": 5.447652339935303, + "learning_rate": 6.289434013978348e-06, + "loss": 0.3217, + "step": 27827 + }, + { + "epoch": 1.177455236028215, + "grad_norm": 5.620075702667236, + "learning_rate": 6.2892969713580935e-06, + "loss": 0.2833, + "step": 27828 + }, + { + "epoch": 1.1774688008681498, + "grad_norm": 4.865479469299316, + "learning_rate": 6.289159928737838e-06, + "loss": 0.247, + "step": 27829 + }, + { + "epoch": 1.1774823657080846, + "grad_norm": 4.035037994384766, + "learning_rate": 6.289022886117583e-06, + "loss": 0.1504, + "step": 27830 + }, + { + "epoch": 1.1774959305480195, + "grad_norm": 5.2657084465026855, + "learning_rate": 6.288885843497327e-06, + "loss": 0.2618, + "step": 27831 + }, + { + "epoch": 1.1775094953879544, + "grad_norm": 4.757819652557373, + "learning_rate": 6.288748800877073e-06, + "loss": 0.2459, + "step": 27832 + }, + { + "epoch": 1.1775230602278892, + "grad_norm": 4.0044331550598145, + "learning_rate": 6.288611758256819e-06, + "loss": 0.2202, + "step": 27833 + }, + { + "epoch": 1.177536625067824, + "grad_norm": 5.861526966094971, + "learning_rate": 6.288474715636563e-06, + "loss": 0.3405, + "step": 27834 + }, + { + "epoch": 1.1775501899077592, + "grad_norm": 5.14126443862915, + "learning_rate": 6.288337673016308e-06, + "loss": 0.2424, + "step": 27835 + }, + { + "epoch": 1.177563754747694, + "grad_norm": 5.377950191497803, + "learning_rate": 6.288200630396054e-06, + "loss": 0.2216, + "step": 27836 + }, + { + "epoch": 1.177577319587629, + "grad_norm": 3.9163248538970947, + "learning_rate": 6.288063587775799e-06, + "loss": 0.1332, + "step": 27837 + }, + { + "epoch": 1.1775908844275638, + "grad_norm": 4.678149700164795, + "learning_rate": 6.287926545155544e-06, + "loss": 0.172, + "step": 27838 + }, + { + "epoch": 1.1776044492674986, + "grad_norm": 3.504777431488037, + "learning_rate": 6.287789502535289e-06, + "loss": 0.2501, + "step": 27839 + }, + { + "epoch": 1.1776180141074335, + "grad_norm": 4.196824550628662, + "learning_rate": 6.287652459915035e-06, + "loss": 0.2019, + "step": 27840 + }, + { + "epoch": 1.1776315789473684, + "grad_norm": 5.347752571105957, + "learning_rate": 6.287515417294779e-06, + "loss": 0.2758, + "step": 27841 + }, + { + "epoch": 1.1776451437873032, + "grad_norm": 3.537102699279785, + "learning_rate": 6.287378374674524e-06, + "loss": 0.1692, + "step": 27842 + }, + { + "epoch": 1.1776587086272383, + "grad_norm": 4.684804916381836, + "learning_rate": 6.2872413320542695e-06, + "loss": 0.264, + "step": 27843 + }, + { + "epoch": 1.1776722734671732, + "grad_norm": 3.7914445400238037, + "learning_rate": 6.287104289434014e-06, + "loss": 0.1704, + "step": 27844 + }, + { + "epoch": 1.177685838307108, + "grad_norm": 4.26907205581665, + "learning_rate": 6.28696724681376e-06, + "loss": 0.2434, + "step": 27845 + }, + { + "epoch": 1.177699403147043, + "grad_norm": 5.107205867767334, + "learning_rate": 6.286830204193505e-06, + "loss": 0.2846, + "step": 27846 + }, + { + "epoch": 1.1777129679869778, + "grad_norm": 6.017826080322266, + "learning_rate": 6.286693161573249e-06, + "loss": 0.2505, + "step": 27847 + }, + { + "epoch": 1.1777265328269126, + "grad_norm": 5.183236598968506, + "learning_rate": 6.286556118952995e-06, + "loss": 0.2957, + "step": 27848 + }, + { + "epoch": 1.1777400976668475, + "grad_norm": 6.120785236358643, + "learning_rate": 6.286419076332741e-06, + "loss": 0.257, + "step": 27849 + }, + { + "epoch": 1.1777536625067824, + "grad_norm": 5.292670249938965, + "learning_rate": 6.286282033712485e-06, + "loss": 0.2361, + "step": 27850 + }, + { + "epoch": 1.1777672273467172, + "grad_norm": 5.513284206390381, + "learning_rate": 6.28614499109223e-06, + "loss": 0.2832, + "step": 27851 + }, + { + "epoch": 1.177780792186652, + "grad_norm": 3.503100633621216, + "learning_rate": 6.286007948471975e-06, + "loss": 0.1998, + "step": 27852 + }, + { + "epoch": 1.177794357026587, + "grad_norm": 3.821225881576538, + "learning_rate": 6.2858709058517205e-06, + "loss": 0.2569, + "step": 27853 + }, + { + "epoch": 1.177807921866522, + "grad_norm": 3.8281004428863525, + "learning_rate": 6.285733863231466e-06, + "loss": 0.1374, + "step": 27854 + }, + { + "epoch": 1.177821486706457, + "grad_norm": 5.2923784255981445, + "learning_rate": 6.285596820611211e-06, + "loss": 0.2622, + "step": 27855 + }, + { + "epoch": 1.1778350515463918, + "grad_norm": 4.303332805633545, + "learning_rate": 6.285459777990955e-06, + "loss": 0.1934, + "step": 27856 + }, + { + "epoch": 1.1778486163863267, + "grad_norm": 4.2445268630981445, + "learning_rate": 6.2853227353707e-06, + "loss": 0.1966, + "step": 27857 + }, + { + "epoch": 1.1778621812262615, + "grad_norm": 4.3952741622924805, + "learning_rate": 6.285185692750446e-06, + "loss": 0.2371, + "step": 27858 + }, + { + "epoch": 1.1778757460661964, + "grad_norm": 4.384689807891846, + "learning_rate": 6.285048650130191e-06, + "loss": 0.237, + "step": 27859 + }, + { + "epoch": 1.1778893109061312, + "grad_norm": 4.391981601715088, + "learning_rate": 6.284911607509936e-06, + "loss": 0.1517, + "step": 27860 + }, + { + "epoch": 1.1779028757460661, + "grad_norm": 4.225353717803955, + "learning_rate": 6.284774564889681e-06, + "loss": 0.2768, + "step": 27861 + }, + { + "epoch": 1.1779164405860012, + "grad_norm": 5.096360683441162, + "learning_rate": 6.284637522269427e-06, + "loss": 0.2237, + "step": 27862 + }, + { + "epoch": 1.177930005425936, + "grad_norm": 4.3220977783203125, + "learning_rate": 6.284500479649171e-06, + "loss": 0.2392, + "step": 27863 + }, + { + "epoch": 1.177943570265871, + "grad_norm": 6.931098461151123, + "learning_rate": 6.284363437028917e-06, + "loss": 0.3412, + "step": 27864 + }, + { + "epoch": 1.1779571351058058, + "grad_norm": 5.895661354064941, + "learning_rate": 6.284226394408661e-06, + "loss": 0.2779, + "step": 27865 + }, + { + "epoch": 1.1779706999457407, + "grad_norm": 4.2477498054504395, + "learning_rate": 6.284089351788407e-06, + "loss": 0.1815, + "step": 27866 + }, + { + "epoch": 1.1779842647856755, + "grad_norm": 5.63570499420166, + "learning_rate": 6.283952309168152e-06, + "loss": 0.2756, + "step": 27867 + }, + { + "epoch": 1.1779978296256104, + "grad_norm": 4.861125946044922, + "learning_rate": 6.2838152665478965e-06, + "loss": 0.2211, + "step": 27868 + }, + { + "epoch": 1.1780113944655453, + "grad_norm": 5.60642671585083, + "learning_rate": 6.283678223927642e-06, + "loss": 0.1732, + "step": 27869 + }, + { + "epoch": 1.1780249593054801, + "grad_norm": 5.233935832977295, + "learning_rate": 6.283541181307388e-06, + "loss": 0.2771, + "step": 27870 + }, + { + "epoch": 1.178038524145415, + "grad_norm": 3.6209359169006348, + "learning_rate": 6.283404138687133e-06, + "loss": 0.2702, + "step": 27871 + }, + { + "epoch": 1.1780520889853499, + "grad_norm": 5.023906707763672, + "learning_rate": 6.283267096066877e-06, + "loss": 0.2202, + "step": 27872 + }, + { + "epoch": 1.178065653825285, + "grad_norm": 4.686152935028076, + "learning_rate": 6.283130053446622e-06, + "loss": 0.216, + "step": 27873 + }, + { + "epoch": 1.1780792186652198, + "grad_norm": 4.782308101654053, + "learning_rate": 6.282993010826367e-06, + "loss": 0.3532, + "step": 27874 + }, + { + "epoch": 1.1780927835051547, + "grad_norm": 4.427450656890869, + "learning_rate": 6.282855968206113e-06, + "loss": 0.2158, + "step": 27875 + }, + { + "epoch": 1.1781063483450895, + "grad_norm": 3.710195779800415, + "learning_rate": 6.282718925585858e-06, + "loss": 0.1555, + "step": 27876 + }, + { + "epoch": 1.1781199131850244, + "grad_norm": 6.004344463348389, + "learning_rate": 6.282581882965603e-06, + "loss": 0.3352, + "step": 27877 + }, + { + "epoch": 1.1781334780249593, + "grad_norm": 3.761639356613159, + "learning_rate": 6.282444840345347e-06, + "loss": 0.2246, + "step": 27878 + }, + { + "epoch": 1.1781470428648941, + "grad_norm": 6.7731523513793945, + "learning_rate": 6.2823077977250934e-06, + "loss": 0.5169, + "step": 27879 + }, + { + "epoch": 1.178160607704829, + "grad_norm": 7.522953510284424, + "learning_rate": 6.282170755104839e-06, + "loss": 0.3476, + "step": 27880 + }, + { + "epoch": 1.178174172544764, + "grad_norm": 6.521481990814209, + "learning_rate": 6.282033712484583e-06, + "loss": 0.4524, + "step": 27881 + }, + { + "epoch": 1.178187737384699, + "grad_norm": 5.642424583435059, + "learning_rate": 6.281896669864328e-06, + "loss": 0.3515, + "step": 27882 + }, + { + "epoch": 1.1782013022246338, + "grad_norm": 6.196506500244141, + "learning_rate": 6.281759627244074e-06, + "loss": 0.4244, + "step": 27883 + }, + { + "epoch": 1.1782148670645687, + "grad_norm": 3.712517023086548, + "learning_rate": 6.2816225846238185e-06, + "loss": 0.1922, + "step": 27884 + }, + { + "epoch": 1.1782284319045035, + "grad_norm": 3.7747926712036133, + "learning_rate": 6.281485542003564e-06, + "loss": 0.198, + "step": 27885 + }, + { + "epoch": 1.1782419967444384, + "grad_norm": 5.3062005043029785, + "learning_rate": 6.281348499383309e-06, + "loss": 0.1568, + "step": 27886 + }, + { + "epoch": 1.1782555615843733, + "grad_norm": 3.8096721172332764, + "learning_rate": 6.281211456763053e-06, + "loss": 0.1642, + "step": 27887 + }, + { + "epoch": 1.1782691264243081, + "grad_norm": 4.74214506149292, + "learning_rate": 6.281074414142799e-06, + "loss": 0.2184, + "step": 27888 + }, + { + "epoch": 1.178282691264243, + "grad_norm": 5.533556938171387, + "learning_rate": 6.280937371522544e-06, + "loss": 0.264, + "step": 27889 + }, + { + "epoch": 1.1782962561041779, + "grad_norm": 5.032910346984863, + "learning_rate": 6.280800328902289e-06, + "loss": 0.2038, + "step": 27890 + }, + { + "epoch": 1.1783098209441127, + "grad_norm": 5.231481552124023, + "learning_rate": 6.280663286282034e-06, + "loss": 0.2977, + "step": 27891 + }, + { + "epoch": 1.1783233857840478, + "grad_norm": 4.3951311111450195, + "learning_rate": 6.28052624366178e-06, + "loss": 0.2362, + "step": 27892 + }, + { + "epoch": 1.1783369506239827, + "grad_norm": 4.250859260559082, + "learning_rate": 6.280389201041524e-06, + "loss": 0.2397, + "step": 27893 + }, + { + "epoch": 1.1783505154639176, + "grad_norm": 5.452892780303955, + "learning_rate": 6.2802521584212694e-06, + "loss": 0.2389, + "step": 27894 + }, + { + "epoch": 1.1783640803038524, + "grad_norm": 5.775974750518799, + "learning_rate": 6.280115115801015e-06, + "loss": 0.3159, + "step": 27895 + }, + { + "epoch": 1.1783776451437873, + "grad_norm": 5.023256778717041, + "learning_rate": 6.279978073180761e-06, + "loss": 0.2921, + "step": 27896 + }, + { + "epoch": 1.1783912099837222, + "grad_norm": 5.014571666717529, + "learning_rate": 6.279841030560505e-06, + "loss": 0.3263, + "step": 27897 + }, + { + "epoch": 1.178404774823657, + "grad_norm": 5.571857452392578, + "learning_rate": 6.27970398794025e-06, + "loss": 0.2603, + "step": 27898 + }, + { + "epoch": 1.1784183396635919, + "grad_norm": 4.972564697265625, + "learning_rate": 6.2795669453199945e-06, + "loss": 0.2784, + "step": 27899 + }, + { + "epoch": 1.178431904503527, + "grad_norm": 4.766254901885986, + "learning_rate": 6.27942990269974e-06, + "loss": 0.2577, + "step": 27900 + }, + { + "epoch": 1.1784454693434618, + "grad_norm": 4.851777076721191, + "learning_rate": 6.279292860079486e-06, + "loss": 0.2265, + "step": 27901 + }, + { + "epoch": 1.1784590341833967, + "grad_norm": 6.147732257843018, + "learning_rate": 6.27915581745923e-06, + "loss": 0.328, + "step": 27902 + }, + { + "epoch": 1.1784725990233316, + "grad_norm": 4.744761943817139, + "learning_rate": 6.279018774838975e-06, + "loss": 0.2252, + "step": 27903 + }, + { + "epoch": 1.1784861638632664, + "grad_norm": 4.814756393432617, + "learning_rate": 6.27888173221872e-06, + "loss": 0.2195, + "step": 27904 + }, + { + "epoch": 1.1784997287032013, + "grad_norm": 5.759835720062256, + "learning_rate": 6.278744689598466e-06, + "loss": 0.2266, + "step": 27905 + }, + { + "epoch": 1.1785132935431362, + "grad_norm": 4.483206272125244, + "learning_rate": 6.278607646978211e-06, + "loss": 0.2144, + "step": 27906 + }, + { + "epoch": 1.178526858383071, + "grad_norm": 5.839992523193359, + "learning_rate": 6.278470604357956e-06, + "loss": 0.2748, + "step": 27907 + }, + { + "epoch": 1.178540423223006, + "grad_norm": 4.765368938446045, + "learning_rate": 6.2783335617377e-06, + "loss": 0.2621, + "step": 27908 + }, + { + "epoch": 1.1785539880629408, + "grad_norm": 4.878927707672119, + "learning_rate": 6.278196519117446e-06, + "loss": 0.2652, + "step": 27909 + }, + { + "epoch": 1.1785675529028756, + "grad_norm": 5.199618816375732, + "learning_rate": 6.2780594764971914e-06, + "loss": 0.1759, + "step": 27910 + }, + { + "epoch": 1.1785811177428107, + "grad_norm": 3.806542158126831, + "learning_rate": 6.277922433876937e-06, + "loss": 0.1515, + "step": 27911 + }, + { + "epoch": 1.1785946825827456, + "grad_norm": 4.140769004821777, + "learning_rate": 6.277785391256681e-06, + "loss": 0.2413, + "step": 27912 + }, + { + "epoch": 1.1786082474226804, + "grad_norm": 5.569029808044434, + "learning_rate": 6.277648348636426e-06, + "loss": 0.2617, + "step": 27913 + }, + { + "epoch": 1.1786218122626153, + "grad_norm": 3.7698252201080322, + "learning_rate": 6.277511306016172e-06, + "loss": 0.177, + "step": 27914 + }, + { + "epoch": 1.1786353771025502, + "grad_norm": 4.726841926574707, + "learning_rate": 6.2773742633959165e-06, + "loss": 0.3648, + "step": 27915 + }, + { + "epoch": 1.178648941942485, + "grad_norm": 4.434640407562256, + "learning_rate": 6.277237220775662e-06, + "loss": 0.1146, + "step": 27916 + }, + { + "epoch": 1.17866250678242, + "grad_norm": 5.15238618850708, + "learning_rate": 6.277100178155406e-06, + "loss": 0.2387, + "step": 27917 + }, + { + "epoch": 1.1786760716223548, + "grad_norm": 8.224774360656738, + "learning_rate": 6.276963135535152e-06, + "loss": 0.3064, + "step": 27918 + }, + { + "epoch": 1.1786896364622899, + "grad_norm": 6.310456275939941, + "learning_rate": 6.276826092914897e-06, + "loss": 0.3241, + "step": 27919 + }, + { + "epoch": 1.1787032013022247, + "grad_norm": 4.193384170532227, + "learning_rate": 6.276689050294642e-06, + "loss": 0.168, + "step": 27920 + }, + { + "epoch": 1.1787167661421596, + "grad_norm": 4.061114311218262, + "learning_rate": 6.276552007674387e-06, + "loss": 0.178, + "step": 27921 + }, + { + "epoch": 1.1787303309820945, + "grad_norm": 4.877568244934082, + "learning_rate": 6.276414965054133e-06, + "loss": 0.1856, + "step": 27922 + }, + { + "epoch": 1.1787438958220293, + "grad_norm": 5.707404136657715, + "learning_rate": 6.276277922433878e-06, + "loss": 0.1974, + "step": 27923 + }, + { + "epoch": 1.1787574606619642, + "grad_norm": 5.729170799255371, + "learning_rate": 6.276140879813622e-06, + "loss": 0.3229, + "step": 27924 + }, + { + "epoch": 1.178771025501899, + "grad_norm": 5.956460475921631, + "learning_rate": 6.2760038371933674e-06, + "loss": 0.4038, + "step": 27925 + }, + { + "epoch": 1.178784590341834, + "grad_norm": 5.354207515716553, + "learning_rate": 6.275866794573113e-06, + "loss": 0.1795, + "step": 27926 + }, + { + "epoch": 1.1787981551817688, + "grad_norm": 6.524247169494629, + "learning_rate": 6.275729751952858e-06, + "loss": 0.2698, + "step": 27927 + }, + { + "epoch": 1.1788117200217036, + "grad_norm": 5.414590835571289, + "learning_rate": 6.275592709332603e-06, + "loss": 0.2337, + "step": 27928 + }, + { + "epoch": 1.1788252848616385, + "grad_norm": 5.638967037200928, + "learning_rate": 6.275455666712348e-06, + "loss": 0.2732, + "step": 27929 + }, + { + "epoch": 1.1788388497015736, + "grad_norm": 5.8330464363098145, + "learning_rate": 6.2753186240920925e-06, + "loss": 0.2749, + "step": 27930 + }, + { + "epoch": 1.1788524145415085, + "grad_norm": 3.959735870361328, + "learning_rate": 6.2751815814718385e-06, + "loss": 0.2079, + "step": 27931 + }, + { + "epoch": 1.1788659793814433, + "grad_norm": 5.292872428894043, + "learning_rate": 6.275044538851584e-06, + "loss": 0.3138, + "step": 27932 + }, + { + "epoch": 1.1788795442213782, + "grad_norm": 4.8559041023254395, + "learning_rate": 6.274907496231328e-06, + "loss": 0.3188, + "step": 27933 + }, + { + "epoch": 1.178893109061313, + "grad_norm": 4.8203301429748535, + "learning_rate": 6.274770453611073e-06, + "loss": 0.2417, + "step": 27934 + }, + { + "epoch": 1.178906673901248, + "grad_norm": 5.806849002838135, + "learning_rate": 6.274633410990819e-06, + "loss": 0.2854, + "step": 27935 + }, + { + "epoch": 1.1789202387411828, + "grad_norm": 4.070178031921387, + "learning_rate": 6.2744963683705636e-06, + "loss": 0.235, + "step": 27936 + }, + { + "epoch": 1.1789338035811177, + "grad_norm": 6.112511157989502, + "learning_rate": 6.274359325750309e-06, + "loss": 0.3554, + "step": 27937 + }, + { + "epoch": 1.1789473684210527, + "grad_norm": 8.619967460632324, + "learning_rate": 6.274222283130054e-06, + "loss": 0.2656, + "step": 27938 + }, + { + "epoch": 1.1789609332609876, + "grad_norm": 5.403902530670166, + "learning_rate": 6.2740852405098e-06, + "loss": 0.274, + "step": 27939 + }, + { + "epoch": 1.1789744981009225, + "grad_norm": 7.290919780731201, + "learning_rate": 6.273948197889544e-06, + "loss": 0.3852, + "step": 27940 + }, + { + "epoch": 1.1789880629408573, + "grad_norm": 6.105174541473389, + "learning_rate": 6.2738111552692894e-06, + "loss": 0.2981, + "step": 27941 + }, + { + "epoch": 1.1790016277807922, + "grad_norm": 4.613891124725342, + "learning_rate": 6.273674112649034e-06, + "loss": 0.3074, + "step": 27942 + }, + { + "epoch": 1.179015192620727, + "grad_norm": 4.611202716827393, + "learning_rate": 6.273537070028779e-06, + "loss": 0.1833, + "step": 27943 + }, + { + "epoch": 1.179028757460662, + "grad_norm": 5.262197494506836, + "learning_rate": 6.273400027408525e-06, + "loss": 0.1822, + "step": 27944 + }, + { + "epoch": 1.1790423223005968, + "grad_norm": 5.117290019989014, + "learning_rate": 6.27326298478827e-06, + "loss": 0.2296, + "step": 27945 + }, + { + "epoch": 1.1790558871405317, + "grad_norm": 4.874688148498535, + "learning_rate": 6.2731259421680145e-06, + "loss": 0.2667, + "step": 27946 + }, + { + "epoch": 1.1790694519804665, + "grad_norm": 4.9799346923828125, + "learning_rate": 6.27298889954776e-06, + "loss": 0.3178, + "step": 27947 + }, + { + "epoch": 1.1790830168204014, + "grad_norm": 7.596078872680664, + "learning_rate": 6.272851856927506e-06, + "loss": 0.2786, + "step": 27948 + }, + { + "epoch": 1.1790965816603365, + "grad_norm": 5.779389381408691, + "learning_rate": 6.27271481430725e-06, + "loss": 0.2578, + "step": 27949 + }, + { + "epoch": 1.1791101465002713, + "grad_norm": 4.448807239532471, + "learning_rate": 6.272577771686995e-06, + "loss": 0.2643, + "step": 27950 + }, + { + "epoch": 1.1791237113402062, + "grad_norm": 4.043468952178955, + "learning_rate": 6.2724407290667395e-06, + "loss": 0.2368, + "step": 27951 + }, + { + "epoch": 1.179137276180141, + "grad_norm": 5.7756218910217285, + "learning_rate": 6.2723036864464856e-06, + "loss": 0.2085, + "step": 27952 + }, + { + "epoch": 1.179150841020076, + "grad_norm": 4.835773944854736, + "learning_rate": 6.272166643826231e-06, + "loss": 0.2889, + "step": 27953 + }, + { + "epoch": 1.1791644058600108, + "grad_norm": 4.441351413726807, + "learning_rate": 6.272029601205976e-06, + "loss": 0.1862, + "step": 27954 + }, + { + "epoch": 1.1791779706999457, + "grad_norm": 6.334599018096924, + "learning_rate": 6.27189255858572e-06, + "loss": 0.286, + "step": 27955 + }, + { + "epoch": 1.1791915355398805, + "grad_norm": 5.6975812911987305, + "learning_rate": 6.2717555159654654e-06, + "loss": 0.252, + "step": 27956 + }, + { + "epoch": 1.1792051003798156, + "grad_norm": 5.741541862487793, + "learning_rate": 6.2716184733452115e-06, + "loss": 0.2378, + "step": 27957 + }, + { + "epoch": 1.1792186652197505, + "grad_norm": 4.422055721282959, + "learning_rate": 6.271481430724956e-06, + "loss": 0.2999, + "step": 27958 + }, + { + "epoch": 1.1792322300596854, + "grad_norm": 5.091248989105225, + "learning_rate": 6.271344388104701e-06, + "loss": 0.2139, + "step": 27959 + }, + { + "epoch": 1.1792457948996202, + "grad_norm": 4.291145324707031, + "learning_rate": 6.271207345484446e-06, + "loss": 0.1824, + "step": 27960 + }, + { + "epoch": 1.179259359739555, + "grad_norm": 5.625090599060059, + "learning_rate": 6.271070302864191e-06, + "loss": 0.3176, + "step": 27961 + }, + { + "epoch": 1.17927292457949, + "grad_norm": 4.3872551918029785, + "learning_rate": 6.2709332602439365e-06, + "loss": 0.2282, + "step": 27962 + }, + { + "epoch": 1.1792864894194248, + "grad_norm": 4.624480247497559, + "learning_rate": 6.270796217623682e-06, + "loss": 0.1806, + "step": 27963 + }, + { + "epoch": 1.1793000542593597, + "grad_norm": 5.113215446472168, + "learning_rate": 6.270659175003426e-06, + "loss": 0.2931, + "step": 27964 + }, + { + "epoch": 1.1793136190992946, + "grad_norm": 6.644736289978027, + "learning_rate": 6.270522132383172e-06, + "loss": 0.3433, + "step": 27965 + }, + { + "epoch": 1.1793271839392294, + "grad_norm": 5.736347198486328, + "learning_rate": 6.270385089762917e-06, + "loss": 0.3584, + "step": 27966 + }, + { + "epoch": 1.1793407487791643, + "grad_norm": 3.335928440093994, + "learning_rate": 6.2702480471426616e-06, + "loss": 0.289, + "step": 27967 + }, + { + "epoch": 1.1793543136190994, + "grad_norm": 4.177717685699463, + "learning_rate": 6.270111004522407e-06, + "loss": 0.2142, + "step": 27968 + }, + { + "epoch": 1.1793678784590342, + "grad_norm": 5.551850318908691, + "learning_rate": 6.269973961902152e-06, + "loss": 0.2366, + "step": 27969 + }, + { + "epoch": 1.179381443298969, + "grad_norm": 5.910002708435059, + "learning_rate": 6.269836919281898e-06, + "loss": 0.2874, + "step": 27970 + }, + { + "epoch": 1.179395008138904, + "grad_norm": 6.171860694885254, + "learning_rate": 6.269699876661642e-06, + "loss": 0.3237, + "step": 27971 + }, + { + "epoch": 1.1794085729788388, + "grad_norm": 5.219543933868408, + "learning_rate": 6.2695628340413874e-06, + "loss": 0.2558, + "step": 27972 + }, + { + "epoch": 1.1794221378187737, + "grad_norm": 4.531976699829102, + "learning_rate": 6.269425791421132e-06, + "loss": 0.2357, + "step": 27973 + }, + { + "epoch": 1.1794357026587086, + "grad_norm": 6.01621675491333, + "learning_rate": 6.269288748800878e-06, + "loss": 0.3344, + "step": 27974 + }, + { + "epoch": 1.1794492674986434, + "grad_norm": 5.129848003387451, + "learning_rate": 6.269151706180623e-06, + "loss": 0.3432, + "step": 27975 + }, + { + "epoch": 1.1794628323385785, + "grad_norm": 4.689633369445801, + "learning_rate": 6.269014663560367e-06, + "loss": 0.2796, + "step": 27976 + }, + { + "epoch": 1.1794763971785134, + "grad_norm": 6.370443344116211, + "learning_rate": 6.2688776209401125e-06, + "loss": 0.3427, + "step": 27977 + }, + { + "epoch": 1.1794899620184482, + "grad_norm": 5.822595596313477, + "learning_rate": 6.2687405783198585e-06, + "loss": 0.2594, + "step": 27978 + }, + { + "epoch": 1.1795035268583831, + "grad_norm": 5.059378147125244, + "learning_rate": 6.268603535699604e-06, + "loss": 0.209, + "step": 27979 + }, + { + "epoch": 1.179517091698318, + "grad_norm": 5.107985973358154, + "learning_rate": 6.268466493079348e-06, + "loss": 0.2703, + "step": 27980 + }, + { + "epoch": 1.1795306565382528, + "grad_norm": 6.595765590667725, + "learning_rate": 6.268329450459093e-06, + "loss": 0.3158, + "step": 27981 + }, + { + "epoch": 1.1795442213781877, + "grad_norm": 7.202970027923584, + "learning_rate": 6.2681924078388375e-06, + "loss": 0.4721, + "step": 27982 + }, + { + "epoch": 1.1795577862181226, + "grad_norm": 5.710823059082031, + "learning_rate": 6.2680553652185836e-06, + "loss": 0.271, + "step": 27983 + }, + { + "epoch": 1.1795713510580574, + "grad_norm": 6.971832752227783, + "learning_rate": 6.267918322598329e-06, + "loss": 0.5584, + "step": 27984 + }, + { + "epoch": 1.1795849158979923, + "grad_norm": 6.592094421386719, + "learning_rate": 6.267781279978074e-06, + "loss": 0.3241, + "step": 27985 + }, + { + "epoch": 1.1795984807379272, + "grad_norm": 5.492500305175781, + "learning_rate": 6.267644237357818e-06, + "loss": 0.2122, + "step": 27986 + }, + { + "epoch": 1.1796120455778623, + "grad_norm": 5.254662990570068, + "learning_rate": 6.267507194737564e-06, + "loss": 0.2739, + "step": 27987 + }, + { + "epoch": 1.1796256104177971, + "grad_norm": 5.070627212524414, + "learning_rate": 6.2673701521173095e-06, + "loss": 0.2333, + "step": 27988 + }, + { + "epoch": 1.179639175257732, + "grad_norm": 5.286841869354248, + "learning_rate": 6.267233109497054e-06, + "loss": 0.3188, + "step": 27989 + }, + { + "epoch": 1.1796527400976669, + "grad_norm": 6.345398426055908, + "learning_rate": 6.267096066876799e-06, + "loss": 0.2021, + "step": 27990 + }, + { + "epoch": 1.1796663049376017, + "grad_norm": 6.557464599609375, + "learning_rate": 6.266959024256545e-06, + "loss": 0.3278, + "step": 27991 + }, + { + "epoch": 1.1796798697775366, + "grad_norm": 5.161532402038574, + "learning_rate": 6.266821981636289e-06, + "loss": 0.2888, + "step": 27992 + }, + { + "epoch": 1.1796934346174714, + "grad_norm": 6.1021318435668945, + "learning_rate": 6.2666849390160345e-06, + "loss": 0.3623, + "step": 27993 + }, + { + "epoch": 1.1797069994574063, + "grad_norm": 5.3594207763671875, + "learning_rate": 6.26654789639578e-06, + "loss": 0.2647, + "step": 27994 + }, + { + "epoch": 1.1797205642973414, + "grad_norm": 6.117381572723389, + "learning_rate": 6.266410853775524e-06, + "loss": 0.2618, + "step": 27995 + }, + { + "epoch": 1.1797341291372763, + "grad_norm": 5.331268310546875, + "learning_rate": 6.26627381115527e-06, + "loss": 0.1468, + "step": 27996 + }, + { + "epoch": 1.1797476939772111, + "grad_norm": 4.085937976837158, + "learning_rate": 6.266136768535015e-06, + "loss": 0.1886, + "step": 27997 + }, + { + "epoch": 1.179761258817146, + "grad_norm": 5.463886260986328, + "learning_rate": 6.2659997259147596e-06, + "loss": 0.2019, + "step": 27998 + }, + { + "epoch": 1.1797748236570809, + "grad_norm": 6.332595348358154, + "learning_rate": 6.265862683294505e-06, + "loss": 0.2522, + "step": 27999 + }, + { + "epoch": 1.1797883884970157, + "grad_norm": 4.13121223449707, + "learning_rate": 6.265725640674251e-06, + "loss": 0.176, + "step": 28000 + }, + { + "epoch": 1.1798019533369506, + "grad_norm": 4.653298377990723, + "learning_rate": 6.265588598053995e-06, + "loss": 0.1725, + "step": 28001 + }, + { + "epoch": 1.1798155181768855, + "grad_norm": 4.539926052093506, + "learning_rate": 6.26545155543374e-06, + "loss": 0.1942, + "step": 28002 + }, + { + "epoch": 1.1798290830168203, + "grad_norm": 5.8794941902160645, + "learning_rate": 6.2653145128134855e-06, + "loss": 0.3555, + "step": 28003 + }, + { + "epoch": 1.1798426478567552, + "grad_norm": 4.053892135620117, + "learning_rate": 6.2651774701932315e-06, + "loss": 0.1539, + "step": 28004 + }, + { + "epoch": 1.17985621269669, + "grad_norm": 4.716421604156494, + "learning_rate": 6.265040427572976e-06, + "loss": 0.2086, + "step": 28005 + }, + { + "epoch": 1.1798697775366251, + "grad_norm": 4.452615737915039, + "learning_rate": 6.264903384952721e-06, + "loss": 0.1841, + "step": 28006 + }, + { + "epoch": 1.17988334237656, + "grad_norm": 5.005966663360596, + "learning_rate": 6.264766342332465e-06, + "loss": 0.2921, + "step": 28007 + }, + { + "epoch": 1.1798969072164949, + "grad_norm": 2.6340126991271973, + "learning_rate": 6.264629299712211e-06, + "loss": 0.0836, + "step": 28008 + }, + { + "epoch": 1.1799104720564297, + "grad_norm": 4.434617519378662, + "learning_rate": 6.2644922570919565e-06, + "loss": 0.2132, + "step": 28009 + }, + { + "epoch": 1.1799240368963646, + "grad_norm": 5.773787021636963, + "learning_rate": 6.264355214471701e-06, + "loss": 0.2798, + "step": 28010 + }, + { + "epoch": 1.1799376017362995, + "grad_norm": 5.614292144775391, + "learning_rate": 6.264218171851446e-06, + "loss": 0.3107, + "step": 28011 + }, + { + "epoch": 1.1799511665762343, + "grad_norm": 4.596277713775635, + "learning_rate": 6.264081129231191e-06, + "loss": 0.2376, + "step": 28012 + }, + { + "epoch": 1.1799647314161692, + "grad_norm": 4.863966941833496, + "learning_rate": 6.263944086610937e-06, + "loss": 0.1893, + "step": 28013 + }, + { + "epoch": 1.1799782962561043, + "grad_norm": 4.88026762008667, + "learning_rate": 6.2638070439906816e-06, + "loss": 0.2302, + "step": 28014 + }, + { + "epoch": 1.1799918610960392, + "grad_norm": 5.953002452850342, + "learning_rate": 6.263670001370427e-06, + "loss": 0.3921, + "step": 28015 + }, + { + "epoch": 1.180005425935974, + "grad_norm": 5.887280464172363, + "learning_rate": 6.263532958750171e-06, + "loss": 0.3587, + "step": 28016 + }, + { + "epoch": 1.1800189907759089, + "grad_norm": 6.046491622924805, + "learning_rate": 6.263395916129917e-06, + "loss": 0.3847, + "step": 28017 + }, + { + "epoch": 1.1800325556158437, + "grad_norm": 6.637539863586426, + "learning_rate": 6.263258873509662e-06, + "loss": 0.2943, + "step": 28018 + }, + { + "epoch": 1.1800461204557786, + "grad_norm": 5.03754186630249, + "learning_rate": 6.2631218308894075e-06, + "loss": 0.3873, + "step": 28019 + }, + { + "epoch": 1.1800596852957135, + "grad_norm": 4.906796932220459, + "learning_rate": 6.262984788269152e-06, + "loss": 0.2609, + "step": 28020 + }, + { + "epoch": 1.1800732501356483, + "grad_norm": 5.20546817779541, + "learning_rate": 6.262847745648898e-06, + "loss": 0.406, + "step": 28021 + }, + { + "epoch": 1.1800868149755832, + "grad_norm": 4.943950653076172, + "learning_rate": 6.262710703028643e-06, + "loss": 0.1852, + "step": 28022 + }, + { + "epoch": 1.180100379815518, + "grad_norm": 5.7297868728637695, + "learning_rate": 6.262573660408387e-06, + "loss": 0.3001, + "step": 28023 + }, + { + "epoch": 1.1801139446554532, + "grad_norm": 6.130950927734375, + "learning_rate": 6.2624366177881325e-06, + "loss": 0.4459, + "step": 28024 + }, + { + "epoch": 1.180127509495388, + "grad_norm": 4.866851806640625, + "learning_rate": 6.262299575167877e-06, + "loss": 0.302, + "step": 28025 + }, + { + "epoch": 1.180141074335323, + "grad_norm": 5.876840591430664, + "learning_rate": 6.262162532547623e-06, + "loss": 0.3946, + "step": 28026 + }, + { + "epoch": 1.1801546391752578, + "grad_norm": 6.027993202209473, + "learning_rate": 6.262025489927368e-06, + "loss": 0.3767, + "step": 28027 + }, + { + "epoch": 1.1801682040151926, + "grad_norm": 5.885769367218018, + "learning_rate": 6.261888447307113e-06, + "loss": 0.2974, + "step": 28028 + }, + { + "epoch": 1.1801817688551275, + "grad_norm": 8.076480865478516, + "learning_rate": 6.2617514046868576e-06, + "loss": 0.4505, + "step": 28029 + }, + { + "epoch": 1.1801953336950624, + "grad_norm": 6.139158725738525, + "learning_rate": 6.261614362066604e-06, + "loss": 0.4875, + "step": 28030 + }, + { + "epoch": 1.1802088985349972, + "grad_norm": 3.7695834636688232, + "learning_rate": 6.261477319446349e-06, + "loss": 0.2536, + "step": 28031 + }, + { + "epoch": 1.180222463374932, + "grad_norm": 4.961698532104492, + "learning_rate": 6.261340276826093e-06, + "loss": 0.5243, + "step": 28032 + }, + { + "epoch": 1.1802360282148672, + "grad_norm": 4.87361478805542, + "learning_rate": 6.261203234205838e-06, + "loss": 0.3053, + "step": 28033 + }, + { + "epoch": 1.180249593054802, + "grad_norm": 6.299048900604248, + "learning_rate": 6.261066191585584e-06, + "loss": 0.419, + "step": 28034 + }, + { + "epoch": 1.180263157894737, + "grad_norm": 5.2159624099731445, + "learning_rate": 6.260929148965329e-06, + "loss": 0.3558, + "step": 28035 + }, + { + "epoch": 1.1802767227346718, + "grad_norm": 5.321614742279053, + "learning_rate": 6.260792106345074e-06, + "loss": 0.3864, + "step": 28036 + }, + { + "epoch": 1.1802902875746066, + "grad_norm": 5.34455680847168, + "learning_rate": 6.260655063724819e-06, + "loss": 0.3761, + "step": 28037 + }, + { + "epoch": 1.1803038524145415, + "grad_norm": 9.834115982055664, + "learning_rate": 6.260518021104563e-06, + "loss": 0.6173, + "step": 28038 + }, + { + "epoch": 1.1803174172544764, + "grad_norm": 5.427895545959473, + "learning_rate": 6.260380978484309e-06, + "loss": 0.2922, + "step": 28039 + }, + { + "epoch": 1.1803309820944112, + "grad_norm": 6.882497787475586, + "learning_rate": 6.2602439358640545e-06, + "loss": 0.4423, + "step": 28040 + }, + { + "epoch": 1.180344546934346, + "grad_norm": 6.185166835784912, + "learning_rate": 6.260106893243799e-06, + "loss": 0.3505, + "step": 28041 + }, + { + "epoch": 1.180358111774281, + "grad_norm": 7.7748541831970215, + "learning_rate": 6.259969850623544e-06, + "loss": 0.52, + "step": 28042 + }, + { + "epoch": 1.180371676614216, + "grad_norm": 6.870823860168457, + "learning_rate": 6.25983280800329e-06, + "loss": 0.4129, + "step": 28043 + }, + { + "epoch": 1.180385241454151, + "grad_norm": 5.807595252990723, + "learning_rate": 6.259695765383034e-06, + "loss": 0.3621, + "step": 28044 + }, + { + "epoch": 1.1803988062940858, + "grad_norm": 6.763804912567139, + "learning_rate": 6.25955872276278e-06, + "loss": 0.3074, + "step": 28045 + }, + { + "epoch": 1.1804123711340206, + "grad_norm": 8.138934135437012, + "learning_rate": 6.259421680142525e-06, + "loss": 0.4086, + "step": 28046 + }, + { + "epoch": 1.1804259359739555, + "grad_norm": 4.310797214508057, + "learning_rate": 6.259284637522271e-06, + "loss": 0.2152, + "step": 28047 + }, + { + "epoch": 1.1804395008138904, + "grad_norm": 7.416234970092773, + "learning_rate": 6.259147594902015e-06, + "loss": 0.6313, + "step": 28048 + }, + { + "epoch": 1.1804530656538252, + "grad_norm": 8.362975120544434, + "learning_rate": 6.25901055228176e-06, + "loss": 0.6337, + "step": 28049 + }, + { + "epoch": 1.18046663049376, + "grad_norm": 6.7645039558410645, + "learning_rate": 6.258873509661505e-06, + "loss": 0.4535, + "step": 28050 + }, + { + "epoch": 1.180480195333695, + "grad_norm": 5.555196762084961, + "learning_rate": 6.25873646704125e-06, + "loss": 0.3541, + "step": 28051 + }, + { + "epoch": 1.18049376017363, + "grad_norm": 6.892233371734619, + "learning_rate": 6.258599424420996e-06, + "loss": 0.4832, + "step": 28052 + }, + { + "epoch": 1.180507325013565, + "grad_norm": 6.157584190368652, + "learning_rate": 6.258462381800741e-06, + "loss": 0.2939, + "step": 28053 + }, + { + "epoch": 1.1805208898534998, + "grad_norm": 6.141429901123047, + "learning_rate": 6.258325339180485e-06, + "loss": 0.3404, + "step": 28054 + }, + { + "epoch": 1.1805344546934347, + "grad_norm": 5.7377119064331055, + "learning_rate": 6.2581882965602305e-06, + "loss": 0.3251, + "step": 28055 + }, + { + "epoch": 1.1805480195333695, + "grad_norm": 5.230452060699463, + "learning_rate": 6.2580512539399765e-06, + "loss": 0.2699, + "step": 28056 + }, + { + "epoch": 1.1805615843733044, + "grad_norm": 4.186845302581787, + "learning_rate": 6.257914211319721e-06, + "loss": 0.3171, + "step": 28057 + }, + { + "epoch": 1.1805751492132392, + "grad_norm": 6.537107467651367, + "learning_rate": 6.257777168699466e-06, + "loss": 0.3145, + "step": 28058 + }, + { + "epoch": 1.1805887140531741, + "grad_norm": 5.111566543579102, + "learning_rate": 6.25764012607921e-06, + "loss": 0.2746, + "step": 28059 + }, + { + "epoch": 1.180602278893109, + "grad_norm": 5.971938133239746, + "learning_rate": 6.257503083458956e-06, + "loss": 0.3024, + "step": 28060 + }, + { + "epoch": 1.1806158437330438, + "grad_norm": 4.8224382400512695, + "learning_rate": 6.257366040838702e-06, + "loss": 0.3347, + "step": 28061 + }, + { + "epoch": 1.180629408572979, + "grad_norm": 4.997513294219971, + "learning_rate": 6.257228998218447e-06, + "loss": 0.2402, + "step": 28062 + }, + { + "epoch": 1.1806429734129138, + "grad_norm": 5.600213050842285, + "learning_rate": 6.257091955598191e-06, + "loss": 0.3659, + "step": 28063 + }, + { + "epoch": 1.1806565382528487, + "grad_norm": 5.70601224899292, + "learning_rate": 6.256954912977936e-06, + "loss": 0.3214, + "step": 28064 + }, + { + "epoch": 1.1806701030927835, + "grad_norm": 5.652800559997559, + "learning_rate": 6.256817870357682e-06, + "loss": 0.2181, + "step": 28065 + }, + { + "epoch": 1.1806836679327184, + "grad_norm": 6.617313861846924, + "learning_rate": 6.256680827737427e-06, + "loss": 0.3326, + "step": 28066 + }, + { + "epoch": 1.1806972327726533, + "grad_norm": 7.182305335998535, + "learning_rate": 6.256543785117172e-06, + "loss": 0.337, + "step": 28067 + }, + { + "epoch": 1.1807107976125881, + "grad_norm": 7.39563512802124, + "learning_rate": 6.256406742496917e-06, + "loss": 0.3542, + "step": 28068 + }, + { + "epoch": 1.180724362452523, + "grad_norm": 5.473397254943848, + "learning_rate": 6.256269699876662e-06, + "loss": 0.2944, + "step": 28069 + }, + { + "epoch": 1.1807379272924579, + "grad_norm": 5.835431098937988, + "learning_rate": 6.256132657256407e-06, + "loss": 0.3016, + "step": 28070 + }, + { + "epoch": 1.180751492132393, + "grad_norm": 5.722198486328125, + "learning_rate": 6.2559956146361525e-06, + "loss": 0.2183, + "step": 28071 + }, + { + "epoch": 1.1807650569723278, + "grad_norm": 5.737222671508789, + "learning_rate": 6.255858572015897e-06, + "loss": 0.2456, + "step": 28072 + }, + { + "epoch": 1.1807786218122627, + "grad_norm": 4.558277130126953, + "learning_rate": 6.255721529395643e-06, + "loss": 0.2514, + "step": 28073 + }, + { + "epoch": 1.1807921866521975, + "grad_norm": 6.680436134338379, + "learning_rate": 6.255584486775388e-06, + "loss": 0.3689, + "step": 28074 + }, + { + "epoch": 1.1808057514921324, + "grad_norm": 6.000889301300049, + "learning_rate": 6.255447444155132e-06, + "loss": 0.3767, + "step": 28075 + }, + { + "epoch": 1.1808193163320673, + "grad_norm": 5.074882507324219, + "learning_rate": 6.255310401534878e-06, + "loss": 0.3985, + "step": 28076 + }, + { + "epoch": 1.1808328811720021, + "grad_norm": 6.237216472625732, + "learning_rate": 6.255173358914623e-06, + "loss": 0.2734, + "step": 28077 + }, + { + "epoch": 1.180846446011937, + "grad_norm": 5.8506975173950195, + "learning_rate": 6.255036316294368e-06, + "loss": 0.3644, + "step": 28078 + }, + { + "epoch": 1.1808600108518719, + "grad_norm": 7.615849494934082, + "learning_rate": 6.254899273674113e-06, + "loss": 0.4286, + "step": 28079 + }, + { + "epoch": 1.1808735756918067, + "grad_norm": 6.340744972229004, + "learning_rate": 6.254762231053858e-06, + "loss": 0.3054, + "step": 28080 + }, + { + "epoch": 1.1808871405317418, + "grad_norm": 5.417635917663574, + "learning_rate": 6.254625188433603e-06, + "loss": 0.2707, + "step": 28081 + }, + { + "epoch": 1.1809007053716767, + "grad_norm": 5.181328296661377, + "learning_rate": 6.254488145813349e-06, + "loss": 0.2757, + "step": 28082 + }, + { + "epoch": 1.1809142702116115, + "grad_norm": 4.584226608276367, + "learning_rate": 6.254351103193094e-06, + "loss": 0.2732, + "step": 28083 + }, + { + "epoch": 1.1809278350515464, + "grad_norm": 6.758202075958252, + "learning_rate": 6.254214060572838e-06, + "loss": 0.306, + "step": 28084 + }, + { + "epoch": 1.1809413998914813, + "grad_norm": 5.953095436096191, + "learning_rate": 6.254077017952583e-06, + "loss": 0.2077, + "step": 28085 + }, + { + "epoch": 1.1809549647314161, + "grad_norm": 7.4817795753479, + "learning_rate": 6.253939975332329e-06, + "loss": 0.3087, + "step": 28086 + }, + { + "epoch": 1.180968529571351, + "grad_norm": 7.871462821960449, + "learning_rate": 6.2538029327120746e-06, + "loss": 0.489, + "step": 28087 + }, + { + "epoch": 1.1809820944112859, + "grad_norm": 5.007901668548584, + "learning_rate": 6.253665890091819e-06, + "loss": 0.1795, + "step": 28088 + }, + { + "epoch": 1.1809956592512207, + "grad_norm": 4.482062816619873, + "learning_rate": 6.253528847471564e-06, + "loss": 0.2048, + "step": 28089 + }, + { + "epoch": 1.1810092240911558, + "grad_norm": 5.666423797607422, + "learning_rate": 6.25339180485131e-06, + "loss": 0.3331, + "step": 28090 + }, + { + "epoch": 1.1810227889310907, + "grad_norm": 5.578549861907959, + "learning_rate": 6.253254762231054e-06, + "loss": 0.2743, + "step": 28091 + }, + { + "epoch": 1.1810363537710256, + "grad_norm": 4.524394989013672, + "learning_rate": 6.2531177196108e-06, + "loss": 0.2429, + "step": 28092 + }, + { + "epoch": 1.1810499186109604, + "grad_norm": 6.503476142883301, + "learning_rate": 6.252980676990544e-06, + "loss": 0.3699, + "step": 28093 + }, + { + "epoch": 1.1810634834508953, + "grad_norm": 5.85567045211792, + "learning_rate": 6.252843634370289e-06, + "loss": 0.2827, + "step": 28094 + }, + { + "epoch": 1.1810770482908302, + "grad_norm": 6.361648082733154, + "learning_rate": 6.252706591750035e-06, + "loss": 0.3874, + "step": 28095 + }, + { + "epoch": 1.181090613130765, + "grad_norm": 8.285736083984375, + "learning_rate": 6.25256954912978e-06, + "loss": 0.4731, + "step": 28096 + }, + { + "epoch": 1.1811041779706999, + "grad_norm": 6.358407974243164, + "learning_rate": 6.252432506509525e-06, + "loss": 0.2718, + "step": 28097 + }, + { + "epoch": 1.1811177428106348, + "grad_norm": 4.293875217437744, + "learning_rate": 6.25229546388927e-06, + "loss": 0.2452, + "step": 28098 + }, + { + "epoch": 1.1811313076505696, + "grad_norm": 4.984266757965088, + "learning_rate": 6.252158421269016e-06, + "loss": 0.2962, + "step": 28099 + }, + { + "epoch": 1.1811448724905047, + "grad_norm": 6.4697747230529785, + "learning_rate": 6.25202137864876e-06, + "loss": 0.3948, + "step": 28100 + }, + { + "epoch": 1.1811584373304396, + "grad_norm": 10.090791702270508, + "learning_rate": 6.251884336028505e-06, + "loss": 0.5893, + "step": 28101 + }, + { + "epoch": 1.1811720021703744, + "grad_norm": 5.940415382385254, + "learning_rate": 6.2517472934082505e-06, + "loss": 0.3498, + "step": 28102 + }, + { + "epoch": 1.1811855670103093, + "grad_norm": 7.421281814575195, + "learning_rate": 6.251610250787996e-06, + "loss": 0.435, + "step": 28103 + }, + { + "epoch": 1.1811991318502442, + "grad_norm": 5.406671047210693, + "learning_rate": 6.251473208167741e-06, + "loss": 0.3288, + "step": 28104 + }, + { + "epoch": 1.181212696690179, + "grad_norm": 5.745080947875977, + "learning_rate": 6.251336165547486e-06, + "loss": 0.2615, + "step": 28105 + }, + { + "epoch": 1.181226261530114, + "grad_norm": 7.49940824508667, + "learning_rate": 6.25119912292723e-06, + "loss": 0.3287, + "step": 28106 + }, + { + "epoch": 1.1812398263700488, + "grad_norm": 6.002631664276123, + "learning_rate": 6.251062080306976e-06, + "loss": 0.3649, + "step": 28107 + }, + { + "epoch": 1.1812533912099836, + "grad_norm": 7.847999572753906, + "learning_rate": 6.250925037686722e-06, + "loss": 0.467, + "step": 28108 + }, + { + "epoch": 1.1812669560499187, + "grad_norm": 6.342557430267334, + "learning_rate": 6.250787995066466e-06, + "loss": 0.4551, + "step": 28109 + }, + { + "epoch": 1.1812805208898536, + "grad_norm": 4.78196382522583, + "learning_rate": 6.250650952446211e-06, + "loss": 0.2787, + "step": 28110 + }, + { + "epoch": 1.1812940857297884, + "grad_norm": 8.75070571899414, + "learning_rate": 6.250513909825956e-06, + "loss": 0.4353, + "step": 28111 + }, + { + "epoch": 1.1813076505697233, + "grad_norm": 7.334474563598633, + "learning_rate": 6.250376867205702e-06, + "loss": 0.3312, + "step": 28112 + }, + { + "epoch": 1.1813212154096582, + "grad_norm": 6.619359970092773, + "learning_rate": 6.250239824585447e-06, + "loss": 0.3857, + "step": 28113 + }, + { + "epoch": 1.181334780249593, + "grad_norm": 7.118873596191406, + "learning_rate": 6.250102781965192e-06, + "loss": 0.326, + "step": 28114 + }, + { + "epoch": 1.181348345089528, + "grad_norm": 6.162513732910156, + "learning_rate": 6.249965739344936e-06, + "loss": 0.3895, + "step": 28115 + }, + { + "epoch": 1.1813619099294628, + "grad_norm": 4.497071266174316, + "learning_rate": 6.249828696724682e-06, + "loss": 0.2738, + "step": 28116 + }, + { + "epoch": 1.1813754747693976, + "grad_norm": 6.335721492767334, + "learning_rate": 6.249691654104427e-06, + "loss": 0.3848, + "step": 28117 + }, + { + "epoch": 1.1813890396093325, + "grad_norm": 4.302047252655029, + "learning_rate": 6.249554611484172e-06, + "loss": 0.2278, + "step": 28118 + }, + { + "epoch": 1.1814026044492676, + "grad_norm": 5.731928825378418, + "learning_rate": 6.249417568863917e-06, + "loss": 0.2716, + "step": 28119 + }, + { + "epoch": 1.1814161692892025, + "grad_norm": 8.887102127075195, + "learning_rate": 6.249280526243662e-06, + "loss": 0.4847, + "step": 28120 + }, + { + "epoch": 1.1814297341291373, + "grad_norm": 5.452442169189453, + "learning_rate": 6.249143483623408e-06, + "loss": 0.2723, + "step": 28121 + }, + { + "epoch": 1.1814432989690722, + "grad_norm": 6.009444713592529, + "learning_rate": 6.249006441003152e-06, + "loss": 0.3072, + "step": 28122 + }, + { + "epoch": 1.181456863809007, + "grad_norm": 5.374151706695557, + "learning_rate": 6.248869398382898e-06, + "loss": 0.3401, + "step": 28123 + }, + { + "epoch": 1.181470428648942, + "grad_norm": 4.11728048324585, + "learning_rate": 6.248732355762642e-06, + "loss": 0.2315, + "step": 28124 + }, + { + "epoch": 1.1814839934888768, + "grad_norm": 8.189316749572754, + "learning_rate": 6.248595313142388e-06, + "loss": 0.5511, + "step": 28125 + }, + { + "epoch": 1.1814975583288116, + "grad_norm": 6.986126899719238, + "learning_rate": 6.248458270522133e-06, + "loss": 0.3868, + "step": 28126 + }, + { + "epoch": 1.1815111231687465, + "grad_norm": 5.968687534332275, + "learning_rate": 6.248321227901878e-06, + "loss": 0.3292, + "step": 28127 + }, + { + "epoch": 1.1815246880086816, + "grad_norm": 5.799838066101074, + "learning_rate": 6.248184185281623e-06, + "loss": 0.2867, + "step": 28128 + }, + { + "epoch": 1.1815382528486165, + "grad_norm": 5.245661735534668, + "learning_rate": 6.248047142661369e-06, + "loss": 0.3129, + "step": 28129 + }, + { + "epoch": 1.1815518176885513, + "grad_norm": 4.467957973480225, + "learning_rate": 6.247910100041114e-06, + "loss": 0.4111, + "step": 28130 + }, + { + "epoch": 1.1815653825284862, + "grad_norm": 2.453442096710205, + "learning_rate": 6.247773057420858e-06, + "loss": 0.0933, + "step": 28131 + }, + { + "epoch": 1.181578947368421, + "grad_norm": 4.785589694976807, + "learning_rate": 6.247636014800603e-06, + "loss": 0.2817, + "step": 28132 + }, + { + "epoch": 1.181592512208356, + "grad_norm": 5.215362548828125, + "learning_rate": 6.247498972180348e-06, + "loss": 0.3645, + "step": 28133 + }, + { + "epoch": 1.1816060770482908, + "grad_norm": 5.549493789672852, + "learning_rate": 6.247361929560094e-06, + "loss": 0.2646, + "step": 28134 + }, + { + "epoch": 1.1816196418882257, + "grad_norm": 4.154262065887451, + "learning_rate": 6.247224886939839e-06, + "loss": 0.2421, + "step": 28135 + }, + { + "epoch": 1.1816332067281605, + "grad_norm": 6.236972332000732, + "learning_rate": 6.247087844319584e-06, + "loss": 0.3097, + "step": 28136 + }, + { + "epoch": 1.1816467715680954, + "grad_norm": 7.449249267578125, + "learning_rate": 6.246950801699328e-06, + "loss": 0.4709, + "step": 28137 + }, + { + "epoch": 1.1816603364080305, + "grad_norm": 4.625776290893555, + "learning_rate": 6.2468137590790744e-06, + "loss": 0.2148, + "step": 28138 + }, + { + "epoch": 1.1816739012479653, + "grad_norm": 4.048327922821045, + "learning_rate": 6.24667671645882e-06, + "loss": 0.219, + "step": 28139 + }, + { + "epoch": 1.1816874660879002, + "grad_norm": 3.1265814304351807, + "learning_rate": 6.246539673838564e-06, + "loss": 0.165, + "step": 28140 + }, + { + "epoch": 1.181701030927835, + "grad_norm": 3.9826440811157227, + "learning_rate": 6.246402631218309e-06, + "loss": 0.2775, + "step": 28141 + }, + { + "epoch": 1.18171459576777, + "grad_norm": 4.587472438812256, + "learning_rate": 6.246265588598055e-06, + "loss": 0.256, + "step": 28142 + }, + { + "epoch": 1.1817281606077048, + "grad_norm": 4.183034420013428, + "learning_rate": 6.2461285459777995e-06, + "loss": 0.1919, + "step": 28143 + }, + { + "epoch": 1.1817417254476397, + "grad_norm": 4.3826985359191895, + "learning_rate": 6.245991503357545e-06, + "loss": 0.2041, + "step": 28144 + }, + { + "epoch": 1.1817552902875745, + "grad_norm": 4.151773929595947, + "learning_rate": 6.24585446073729e-06, + "loss": 0.1428, + "step": 28145 + }, + { + "epoch": 1.1817688551275094, + "grad_norm": 4.3639349937438965, + "learning_rate": 6.245717418117034e-06, + "loss": 0.2511, + "step": 28146 + }, + { + "epoch": 1.1817824199674445, + "grad_norm": 4.086172580718994, + "learning_rate": 6.24558037549678e-06, + "loss": 0.2116, + "step": 28147 + }, + { + "epoch": 1.1817959848073794, + "grad_norm": 3.8768463134765625, + "learning_rate": 6.245443332876525e-06, + "loss": 0.1916, + "step": 28148 + }, + { + "epoch": 1.1818095496473142, + "grad_norm": 3.5336427688598633, + "learning_rate": 6.24530629025627e-06, + "loss": 0.1764, + "step": 28149 + }, + { + "epoch": 1.181823114487249, + "grad_norm": 4.175853729248047, + "learning_rate": 6.245169247636015e-06, + "loss": 0.2188, + "step": 28150 + }, + { + "epoch": 1.181836679327184, + "grad_norm": 4.27568244934082, + "learning_rate": 6.245032205015761e-06, + "loss": 0.1781, + "step": 28151 + }, + { + "epoch": 1.1818502441671188, + "grad_norm": 4.896144866943359, + "learning_rate": 6.244895162395505e-06, + "loss": 0.2617, + "step": 28152 + }, + { + "epoch": 1.1818638090070537, + "grad_norm": 4.189817905426025, + "learning_rate": 6.2447581197752504e-06, + "loss": 0.1834, + "step": 28153 + }, + { + "epoch": 1.1818773738469885, + "grad_norm": 4.411780834197998, + "learning_rate": 6.244621077154996e-06, + "loss": 0.1856, + "step": 28154 + }, + { + "epoch": 1.1818909386869234, + "grad_norm": 4.463425636291504, + "learning_rate": 6.244484034534742e-06, + "loss": 0.2568, + "step": 28155 + }, + { + "epoch": 1.1819045035268583, + "grad_norm": 5.0218024253845215, + "learning_rate": 6.244346991914486e-06, + "loss": 0.2809, + "step": 28156 + }, + { + "epoch": 1.1819180683667934, + "grad_norm": 3.4429802894592285, + "learning_rate": 6.244209949294231e-06, + "loss": 0.2301, + "step": 28157 + }, + { + "epoch": 1.1819316332067282, + "grad_norm": 3.556222915649414, + "learning_rate": 6.2440729066739755e-06, + "loss": 0.1771, + "step": 28158 + }, + { + "epoch": 1.181945198046663, + "grad_norm": 3.811534881591797, + "learning_rate": 6.2439358640537215e-06, + "loss": 0.1942, + "step": 28159 + }, + { + "epoch": 1.181958762886598, + "grad_norm": 3.758176565170288, + "learning_rate": 6.243798821433467e-06, + "loss": 0.253, + "step": 28160 + }, + { + "epoch": 1.1819723277265328, + "grad_norm": 5.301749229431152, + "learning_rate": 6.243661778813212e-06, + "loss": 0.2681, + "step": 28161 + }, + { + "epoch": 1.1819858925664677, + "grad_norm": 4.307156085968018, + "learning_rate": 6.243524736192956e-06, + "loss": 0.1763, + "step": 28162 + }, + { + "epoch": 1.1819994574064026, + "grad_norm": 4.698807716369629, + "learning_rate": 6.243387693572701e-06, + "loss": 0.2109, + "step": 28163 + }, + { + "epoch": 1.1820130222463374, + "grad_norm": 3.627336025238037, + "learning_rate": 6.243250650952447e-06, + "loss": 0.2093, + "step": 28164 + }, + { + "epoch": 1.1820265870862725, + "grad_norm": 5.279500484466553, + "learning_rate": 6.243113608332192e-06, + "loss": 0.2486, + "step": 28165 + }, + { + "epoch": 1.1820401519262074, + "grad_norm": 7.052189350128174, + "learning_rate": 6.242976565711937e-06, + "loss": 0.2752, + "step": 28166 + }, + { + "epoch": 1.1820537167661422, + "grad_norm": 3.4232192039489746, + "learning_rate": 6.242839523091681e-06, + "loss": 0.1275, + "step": 28167 + }, + { + "epoch": 1.182067281606077, + "grad_norm": 5.759434700012207, + "learning_rate": 6.242702480471427e-06, + "loss": 0.2987, + "step": 28168 + }, + { + "epoch": 1.182080846446012, + "grad_norm": 6.361066818237305, + "learning_rate": 6.2425654378511724e-06, + "loss": 0.2347, + "step": 28169 + }, + { + "epoch": 1.1820944112859468, + "grad_norm": 5.025218486785889, + "learning_rate": 6.242428395230918e-06, + "loss": 0.3357, + "step": 28170 + }, + { + "epoch": 1.1821079761258817, + "grad_norm": 3.741607904434204, + "learning_rate": 6.242291352610662e-06, + "loss": 0.2238, + "step": 28171 + }, + { + "epoch": 1.1821215409658166, + "grad_norm": 4.64489221572876, + "learning_rate": 6.242154309990408e-06, + "loss": 0.2622, + "step": 28172 + }, + { + "epoch": 1.1821351058057514, + "grad_norm": 3.4002203941345215, + "learning_rate": 6.242017267370153e-06, + "loss": 0.174, + "step": 28173 + }, + { + "epoch": 1.1821486706456863, + "grad_norm": 4.53381872177124, + "learning_rate": 6.2418802247498975e-06, + "loss": 0.2676, + "step": 28174 + }, + { + "epoch": 1.1821622354856212, + "grad_norm": 3.3975353240966797, + "learning_rate": 6.241743182129643e-06, + "loss": 0.1884, + "step": 28175 + }, + { + "epoch": 1.1821758003255562, + "grad_norm": 4.022921085357666, + "learning_rate": 6.241606139509388e-06, + "loss": 0.2042, + "step": 28176 + }, + { + "epoch": 1.1821893651654911, + "grad_norm": 4.480217933654785, + "learning_rate": 6.241469096889133e-06, + "loss": 0.1657, + "step": 28177 + }, + { + "epoch": 1.182202930005426, + "grad_norm": 6.265221118927002, + "learning_rate": 6.241332054268878e-06, + "loss": 0.2693, + "step": 28178 + }, + { + "epoch": 1.1822164948453608, + "grad_norm": 4.659851551055908, + "learning_rate": 6.241195011648623e-06, + "loss": 0.1894, + "step": 28179 + }, + { + "epoch": 1.1822300596852957, + "grad_norm": 4.382359027862549, + "learning_rate": 6.241057969028368e-06, + "loss": 0.209, + "step": 28180 + }, + { + "epoch": 1.1822436245252306, + "grad_norm": 4.428689002990723, + "learning_rate": 6.240920926408114e-06, + "loss": 0.3046, + "step": 28181 + }, + { + "epoch": 1.1822571893651654, + "grad_norm": 4.6142802238464355, + "learning_rate": 6.240783883787859e-06, + "loss": 0.3301, + "step": 28182 + }, + { + "epoch": 1.1822707542051003, + "grad_norm": 5.677584171295166, + "learning_rate": 6.240646841167603e-06, + "loss": 0.2665, + "step": 28183 + }, + { + "epoch": 1.1822843190450354, + "grad_norm": 6.399709701538086, + "learning_rate": 6.2405097985473484e-06, + "loss": 0.3565, + "step": 28184 + }, + { + "epoch": 1.1822978838849703, + "grad_norm": 5.148944854736328, + "learning_rate": 6.2403727559270945e-06, + "loss": 0.2012, + "step": 28185 + }, + { + "epoch": 1.1823114487249051, + "grad_norm": 4.258717060089111, + "learning_rate": 6.240235713306839e-06, + "loss": 0.248, + "step": 28186 + }, + { + "epoch": 1.18232501356484, + "grad_norm": 4.170973777770996, + "learning_rate": 6.240098670686584e-06, + "loss": 0.2966, + "step": 28187 + }, + { + "epoch": 1.1823385784047749, + "grad_norm": 4.013266086578369, + "learning_rate": 6.239961628066329e-06, + "loss": 0.2164, + "step": 28188 + }, + { + "epoch": 1.1823521432447097, + "grad_norm": 5.705953121185303, + "learning_rate": 6.2398245854460735e-06, + "loss": 0.3172, + "step": 28189 + }, + { + "epoch": 1.1823657080846446, + "grad_norm": 3.0125277042388916, + "learning_rate": 6.2396875428258195e-06, + "loss": 0.2059, + "step": 28190 + }, + { + "epoch": 1.1823792729245794, + "grad_norm": 6.87726354598999, + "learning_rate": 6.239550500205565e-06, + "loss": 0.3102, + "step": 28191 + }, + { + "epoch": 1.1823928377645143, + "grad_norm": 9.867301940917969, + "learning_rate": 6.239413457585309e-06, + "loss": 0.2312, + "step": 28192 + }, + { + "epoch": 1.1824064026044492, + "grad_norm": 4.874615669250488, + "learning_rate": 6.239276414965054e-06, + "loss": 0.343, + "step": 28193 + }, + { + "epoch": 1.182419967444384, + "grad_norm": 4.9836602210998535, + "learning_rate": 6.2391393723448e-06, + "loss": 0.2686, + "step": 28194 + }, + { + "epoch": 1.1824335322843191, + "grad_norm": 5.125797271728516, + "learning_rate": 6.239002329724545e-06, + "loss": 0.2178, + "step": 28195 + }, + { + "epoch": 1.182447097124254, + "grad_norm": 5.804526329040527, + "learning_rate": 6.23886528710429e-06, + "loss": 0.3006, + "step": 28196 + }, + { + "epoch": 1.1824606619641889, + "grad_norm": 3.3303816318511963, + "learning_rate": 6.238728244484035e-06, + "loss": 0.1534, + "step": 28197 + }, + { + "epoch": 1.1824742268041237, + "grad_norm": 5.53572416305542, + "learning_rate": 6.238591201863781e-06, + "loss": 0.2715, + "step": 28198 + }, + { + "epoch": 1.1824877916440586, + "grad_norm": 7.00027322769165, + "learning_rate": 6.238454159243525e-06, + "loss": 0.4557, + "step": 28199 + }, + { + "epoch": 1.1825013564839935, + "grad_norm": 4.389031887054443, + "learning_rate": 6.2383171166232704e-06, + "loss": 0.2745, + "step": 28200 + }, + { + "epoch": 1.1825149213239283, + "grad_norm": 4.189424514770508, + "learning_rate": 6.238180074003015e-06, + "loss": 0.2236, + "step": 28201 + }, + { + "epoch": 1.1825284861638632, + "grad_norm": 4.217773914337158, + "learning_rate": 6.23804303138276e-06, + "loss": 0.2463, + "step": 28202 + }, + { + "epoch": 1.1825420510037983, + "grad_norm": 7.620140552520752, + "learning_rate": 6.237905988762506e-06, + "loss": 0.2814, + "step": 28203 + }, + { + "epoch": 1.1825556158437331, + "grad_norm": 3.726485252380371, + "learning_rate": 6.237768946142251e-06, + "loss": 0.1976, + "step": 28204 + }, + { + "epoch": 1.182569180683668, + "grad_norm": 6.122939109802246, + "learning_rate": 6.2376319035219955e-06, + "loss": 0.3243, + "step": 28205 + }, + { + "epoch": 1.1825827455236029, + "grad_norm": 5.844294548034668, + "learning_rate": 6.237494860901741e-06, + "loss": 0.3429, + "step": 28206 + }, + { + "epoch": 1.1825963103635377, + "grad_norm": 5.046809196472168, + "learning_rate": 6.237357818281487e-06, + "loss": 0.2779, + "step": 28207 + }, + { + "epoch": 1.1826098752034726, + "grad_norm": 6.004251480102539, + "learning_rate": 6.237220775661231e-06, + "loss": 0.4527, + "step": 28208 + }, + { + "epoch": 1.1826234400434075, + "grad_norm": 6.006711483001709, + "learning_rate": 6.237083733040976e-06, + "loss": 0.3076, + "step": 28209 + }, + { + "epoch": 1.1826370048833423, + "grad_norm": 5.868229866027832, + "learning_rate": 6.236946690420721e-06, + "loss": 0.2489, + "step": 28210 + }, + { + "epoch": 1.1826505697232772, + "grad_norm": 4.556729793548584, + "learning_rate": 6.2368096478004666e-06, + "loss": 0.1561, + "step": 28211 + }, + { + "epoch": 1.182664134563212, + "grad_norm": 6.0032429695129395, + "learning_rate": 6.236672605180212e-06, + "loss": 0.3299, + "step": 28212 + }, + { + "epoch": 1.182677699403147, + "grad_norm": 5.694411754608154, + "learning_rate": 6.236535562559957e-06, + "loss": 0.3422, + "step": 28213 + }, + { + "epoch": 1.182691264243082, + "grad_norm": 6.283468723297119, + "learning_rate": 6.236398519939701e-06, + "loss": 0.3266, + "step": 28214 + }, + { + "epoch": 1.1827048290830169, + "grad_norm": 5.60817289352417, + "learning_rate": 6.2362614773194464e-06, + "loss": 0.3483, + "step": 28215 + }, + { + "epoch": 1.1827183939229517, + "grad_norm": 6.481150150299072, + "learning_rate": 6.2361244346991925e-06, + "loss": 0.2676, + "step": 28216 + }, + { + "epoch": 1.1827319587628866, + "grad_norm": 6.357338905334473, + "learning_rate": 6.235987392078937e-06, + "loss": 0.2569, + "step": 28217 + }, + { + "epoch": 1.1827455236028215, + "grad_norm": 4.881000995635986, + "learning_rate": 6.235850349458682e-06, + "loss": 0.4021, + "step": 28218 + }, + { + "epoch": 1.1827590884427563, + "grad_norm": 5.871342658996582, + "learning_rate": 6.235713306838427e-06, + "loss": 0.2593, + "step": 28219 + }, + { + "epoch": 1.1827726532826912, + "grad_norm": 7.186363220214844, + "learning_rate": 6.235576264218172e-06, + "loss": 0.3538, + "step": 28220 + }, + { + "epoch": 1.182786218122626, + "grad_norm": 4.523905277252197, + "learning_rate": 6.2354392215979175e-06, + "loss": 0.3596, + "step": 28221 + }, + { + "epoch": 1.1827997829625612, + "grad_norm": 5.557064056396484, + "learning_rate": 6.235302178977663e-06, + "loss": 0.2968, + "step": 28222 + }, + { + "epoch": 1.182813347802496, + "grad_norm": 5.446160316467285, + "learning_rate": 6.235165136357407e-06, + "loss": 0.306, + "step": 28223 + }, + { + "epoch": 1.182826912642431, + "grad_norm": 5.155763149261475, + "learning_rate": 6.235028093737153e-06, + "loss": 0.3001, + "step": 28224 + }, + { + "epoch": 1.1828404774823658, + "grad_norm": 7.683920860290527, + "learning_rate": 6.234891051116898e-06, + "loss": 0.4638, + "step": 28225 + }, + { + "epoch": 1.1828540423223006, + "grad_norm": 7.350813388824463, + "learning_rate": 6.2347540084966426e-06, + "loss": 0.4122, + "step": 28226 + }, + { + "epoch": 1.1828676071622355, + "grad_norm": 7.031463146209717, + "learning_rate": 6.234616965876388e-06, + "loss": 0.5118, + "step": 28227 + }, + { + "epoch": 1.1828811720021704, + "grad_norm": 5.6539387702941895, + "learning_rate": 6.234479923256134e-06, + "loss": 0.2645, + "step": 28228 + }, + { + "epoch": 1.1828947368421052, + "grad_norm": 5.4231133460998535, + "learning_rate": 6.234342880635879e-06, + "loss": 0.322, + "step": 28229 + }, + { + "epoch": 1.18290830168204, + "grad_norm": 5.0849456787109375, + "learning_rate": 6.234205838015623e-06, + "loss": 0.2728, + "step": 28230 + }, + { + "epoch": 1.182921866521975, + "grad_norm": 5.442086219787598, + "learning_rate": 6.2340687953953685e-06, + "loss": 0.2309, + "step": 28231 + }, + { + "epoch": 1.1829354313619098, + "grad_norm": 5.336852550506592, + "learning_rate": 6.233931752775113e-06, + "loss": 0.3081, + "step": 28232 + }, + { + "epoch": 1.182948996201845, + "grad_norm": 5.17158317565918, + "learning_rate": 6.233794710154859e-06, + "loss": 0.2918, + "step": 28233 + }, + { + "epoch": 1.1829625610417798, + "grad_norm": 7.331072807312012, + "learning_rate": 6.233657667534604e-06, + "loss": 0.3597, + "step": 28234 + }, + { + "epoch": 1.1829761258817146, + "grad_norm": 5.864682674407959, + "learning_rate": 6.233520624914348e-06, + "loss": 0.2675, + "step": 28235 + }, + { + "epoch": 1.1829896907216495, + "grad_norm": 5.562289237976074, + "learning_rate": 6.2333835822940935e-06, + "loss": 0.2833, + "step": 28236 + }, + { + "epoch": 1.1830032555615844, + "grad_norm": 6.142395973205566, + "learning_rate": 6.2332465396738395e-06, + "loss": 0.296, + "step": 28237 + }, + { + "epoch": 1.1830168204015192, + "grad_norm": 5.923182487487793, + "learning_rate": 6.233109497053585e-06, + "loss": 0.2905, + "step": 28238 + }, + { + "epoch": 1.183030385241454, + "grad_norm": 5.711511135101318, + "learning_rate": 6.232972454433329e-06, + "loss": 0.2943, + "step": 28239 + }, + { + "epoch": 1.183043950081389, + "grad_norm": 6.297043800354004, + "learning_rate": 6.232835411813074e-06, + "loss": 0.3092, + "step": 28240 + }, + { + "epoch": 1.183057514921324, + "grad_norm": 6.234115123748779, + "learning_rate": 6.23269836919282e-06, + "loss": 0.4196, + "step": 28241 + }, + { + "epoch": 1.183071079761259, + "grad_norm": 5.131528854370117, + "learning_rate": 6.2325613265725646e-06, + "loss": 0.2747, + "step": 28242 + }, + { + "epoch": 1.1830846446011938, + "grad_norm": 7.908230304718018, + "learning_rate": 6.23242428395231e-06, + "loss": 0.3902, + "step": 28243 + }, + { + "epoch": 1.1830982094411286, + "grad_norm": 3.8679776191711426, + "learning_rate": 6.232287241332055e-06, + "loss": 0.2579, + "step": 28244 + }, + { + "epoch": 1.1831117742810635, + "grad_norm": 6.4628214836120605, + "learning_rate": 6.232150198711799e-06, + "loss": 0.3471, + "step": 28245 + }, + { + "epoch": 1.1831253391209984, + "grad_norm": 7.096031188964844, + "learning_rate": 6.232013156091545e-06, + "loss": 0.3974, + "step": 28246 + }, + { + "epoch": 1.1831389039609332, + "grad_norm": 6.1342854499816895, + "learning_rate": 6.2318761134712905e-06, + "loss": 0.3095, + "step": 28247 + }, + { + "epoch": 1.183152468800868, + "grad_norm": 4.891206741333008, + "learning_rate": 6.231739070851035e-06, + "loss": 0.313, + "step": 28248 + }, + { + "epoch": 1.183166033640803, + "grad_norm": 6.168277740478516, + "learning_rate": 6.23160202823078e-06, + "loss": 0.4183, + "step": 28249 + }, + { + "epoch": 1.1831795984807378, + "grad_norm": 4.673417568206787, + "learning_rate": 6.231464985610526e-06, + "loss": 0.3008, + "step": 28250 + }, + { + "epoch": 1.1831931633206727, + "grad_norm": 4.409352779388428, + "learning_rate": 6.23132794299027e-06, + "loss": 0.278, + "step": 28251 + }, + { + "epoch": 1.1832067281606078, + "grad_norm": 6.550973415374756, + "learning_rate": 6.2311909003700155e-06, + "loss": 0.2856, + "step": 28252 + }, + { + "epoch": 1.1832202930005427, + "grad_norm": 6.225629806518555, + "learning_rate": 6.231053857749761e-06, + "loss": 0.3592, + "step": 28253 + }, + { + "epoch": 1.1832338578404775, + "grad_norm": 6.085760116577148, + "learning_rate": 6.230916815129507e-06, + "loss": 0.3766, + "step": 28254 + }, + { + "epoch": 1.1832474226804124, + "grad_norm": 6.223269939422607, + "learning_rate": 6.230779772509251e-06, + "loss": 0.3484, + "step": 28255 + }, + { + "epoch": 1.1832609875203473, + "grad_norm": 5.73270320892334, + "learning_rate": 6.230642729888996e-06, + "loss": 0.2765, + "step": 28256 + }, + { + "epoch": 1.1832745523602821, + "grad_norm": 6.982554912567139, + "learning_rate": 6.2305056872687406e-06, + "loss": 0.3284, + "step": 28257 + }, + { + "epoch": 1.183288117200217, + "grad_norm": 4.341052532196045, + "learning_rate": 6.230368644648486e-06, + "loss": 0.2324, + "step": 28258 + }, + { + "epoch": 1.1833016820401518, + "grad_norm": 7.969029903411865, + "learning_rate": 6.230231602028232e-06, + "loss": 0.3351, + "step": 28259 + }, + { + "epoch": 1.183315246880087, + "grad_norm": 6.471618175506592, + "learning_rate": 6.230094559407976e-06, + "loss": 0.2626, + "step": 28260 + }, + { + "epoch": 1.1833288117200218, + "grad_norm": 5.303191184997559, + "learning_rate": 6.229957516787721e-06, + "loss": 0.403, + "step": 28261 + }, + { + "epoch": 1.1833423765599567, + "grad_norm": 6.121845245361328, + "learning_rate": 6.2298204741674665e-06, + "loss": 0.3692, + "step": 28262 + }, + { + "epoch": 1.1833559413998915, + "grad_norm": 6.319770336151123, + "learning_rate": 6.2296834315472125e-06, + "loss": 0.4352, + "step": 28263 + }, + { + "epoch": 1.1833695062398264, + "grad_norm": 7.320370197296143, + "learning_rate": 6.229546388926957e-06, + "loss": 0.4006, + "step": 28264 + }, + { + "epoch": 1.1833830710797613, + "grad_norm": 5.035038471221924, + "learning_rate": 6.229409346306702e-06, + "loss": 0.2948, + "step": 28265 + }, + { + "epoch": 1.1833966359196961, + "grad_norm": 4.637347221374512, + "learning_rate": 6.229272303686446e-06, + "loss": 0.3313, + "step": 28266 + }, + { + "epoch": 1.183410200759631, + "grad_norm": 5.490170001983643, + "learning_rate": 6.229135261066192e-06, + "loss": 0.289, + "step": 28267 + }, + { + "epoch": 1.1834237655995659, + "grad_norm": 5.669425010681152, + "learning_rate": 6.2289982184459375e-06, + "loss": 0.2374, + "step": 28268 + }, + { + "epoch": 1.1834373304395007, + "grad_norm": 7.946979999542236, + "learning_rate": 6.228861175825682e-06, + "loss": 0.4142, + "step": 28269 + }, + { + "epoch": 1.1834508952794356, + "grad_norm": 4.905590534210205, + "learning_rate": 6.228724133205427e-06, + "loss": 0.3734, + "step": 28270 + }, + { + "epoch": 1.1834644601193707, + "grad_norm": 5.496829032897949, + "learning_rate": 6.228587090585172e-06, + "loss": 0.3568, + "step": 28271 + }, + { + "epoch": 1.1834780249593055, + "grad_norm": 5.28912878036499, + "learning_rate": 6.228450047964918e-06, + "loss": 0.2916, + "step": 28272 + }, + { + "epoch": 1.1834915897992404, + "grad_norm": 5.10080623626709, + "learning_rate": 6.2283130053446626e-06, + "loss": 0.2681, + "step": 28273 + }, + { + "epoch": 1.1835051546391753, + "grad_norm": 5.907332420349121, + "learning_rate": 6.228175962724408e-06, + "loss": 0.3315, + "step": 28274 + }, + { + "epoch": 1.1835187194791101, + "grad_norm": 6.042215824127197, + "learning_rate": 6.228038920104152e-06, + "loss": 0.4782, + "step": 28275 + }, + { + "epoch": 1.183532284319045, + "grad_norm": 6.106248378753662, + "learning_rate": 6.227901877483898e-06, + "loss": 0.2982, + "step": 28276 + }, + { + "epoch": 1.1835458491589799, + "grad_norm": 5.189445972442627, + "learning_rate": 6.227764834863643e-06, + "loss": 0.4193, + "step": 28277 + }, + { + "epoch": 1.1835594139989147, + "grad_norm": 5.2396769523620605, + "learning_rate": 6.2276277922433885e-06, + "loss": 0.313, + "step": 28278 + }, + { + "epoch": 1.1835729788388498, + "grad_norm": 7.708981990814209, + "learning_rate": 6.227490749623133e-06, + "loss": 0.5543, + "step": 28279 + }, + { + "epoch": 1.1835865436787847, + "grad_norm": 7.723461151123047, + "learning_rate": 6.227353707002879e-06, + "loss": 0.564, + "step": 28280 + }, + { + "epoch": 1.1836001085187196, + "grad_norm": 7.935519218444824, + "learning_rate": 6.227216664382624e-06, + "loss": 0.4025, + "step": 28281 + }, + { + "epoch": 1.1836136733586544, + "grad_norm": 6.702826976776123, + "learning_rate": 6.227079621762368e-06, + "loss": 0.4489, + "step": 28282 + }, + { + "epoch": 1.1836272381985893, + "grad_norm": 7.480117321014404, + "learning_rate": 6.2269425791421135e-06, + "loss": 0.5613, + "step": 28283 + }, + { + "epoch": 1.1836408030385241, + "grad_norm": 4.423028469085693, + "learning_rate": 6.226805536521858e-06, + "loss": 0.2574, + "step": 28284 + }, + { + "epoch": 1.183654367878459, + "grad_norm": 6.039924621582031, + "learning_rate": 6.226668493901604e-06, + "loss": 0.5164, + "step": 28285 + }, + { + "epoch": 1.1836679327183939, + "grad_norm": 5.088134765625, + "learning_rate": 6.226531451281349e-06, + "loss": 0.1954, + "step": 28286 + }, + { + "epoch": 1.1836814975583287, + "grad_norm": 7.511084079742432, + "learning_rate": 6.226394408661094e-06, + "loss": 0.7477, + "step": 28287 + }, + { + "epoch": 1.1836950623982636, + "grad_norm": 6.718544960021973, + "learning_rate": 6.2262573660408386e-06, + "loss": 0.6376, + "step": 28288 + }, + { + "epoch": 1.1837086272381985, + "grad_norm": 8.472527503967285, + "learning_rate": 6.226120323420585e-06, + "loss": 0.4151, + "step": 28289 + }, + { + "epoch": 1.1837221920781336, + "grad_norm": 7.776481628417969, + "learning_rate": 6.22598328080033e-06, + "loss": 0.5656, + "step": 28290 + }, + { + "epoch": 1.1837357569180684, + "grad_norm": 6.69016170501709, + "learning_rate": 6.225846238180074e-06, + "loss": 0.3524, + "step": 28291 + }, + { + "epoch": 1.1837493217580033, + "grad_norm": 6.794927597045898, + "learning_rate": 6.225709195559819e-06, + "loss": 0.3299, + "step": 28292 + }, + { + "epoch": 1.1837628865979382, + "grad_norm": 6.525270938873291, + "learning_rate": 6.225572152939565e-06, + "loss": 0.6781, + "step": 28293 + }, + { + "epoch": 1.183776451437873, + "grad_norm": 5.543622970581055, + "learning_rate": 6.22543511031931e-06, + "loss": 0.3213, + "step": 28294 + }, + { + "epoch": 1.1837900162778079, + "grad_norm": 7.51439905166626, + "learning_rate": 6.225298067699055e-06, + "loss": 0.3289, + "step": 28295 + }, + { + "epoch": 1.1838035811177428, + "grad_norm": 5.154743194580078, + "learning_rate": 6.2251610250788e-06, + "loss": 0.4023, + "step": 28296 + }, + { + "epoch": 1.1838171459576776, + "grad_norm": 5.68240213394165, + "learning_rate": 6.225023982458546e-06, + "loss": 0.3164, + "step": 28297 + }, + { + "epoch": 1.1838307107976127, + "grad_norm": 7.398452281951904, + "learning_rate": 6.22488693983829e-06, + "loss": 0.4657, + "step": 28298 + }, + { + "epoch": 1.1838442756375476, + "grad_norm": 6.201918125152588, + "learning_rate": 6.2247498972180355e-06, + "loss": 0.3525, + "step": 28299 + }, + { + "epoch": 1.1838578404774824, + "grad_norm": 7.105139255523682, + "learning_rate": 6.22461285459778e-06, + "loss": 0.4707, + "step": 28300 + }, + { + "epoch": 1.1838714053174173, + "grad_norm": 9.062090873718262, + "learning_rate": 6.224475811977525e-06, + "loss": 0.7963, + "step": 28301 + }, + { + "epoch": 1.1838849701573522, + "grad_norm": 6.1227288246154785, + "learning_rate": 6.224338769357271e-06, + "loss": 0.3124, + "step": 28302 + }, + { + "epoch": 1.183898534997287, + "grad_norm": 4.455018043518066, + "learning_rate": 6.224201726737016e-06, + "loss": 0.2944, + "step": 28303 + }, + { + "epoch": 1.183912099837222, + "grad_norm": 5.417843341827393, + "learning_rate": 6.224064684116761e-06, + "loss": 0.2788, + "step": 28304 + }, + { + "epoch": 1.1839256646771568, + "grad_norm": 7.063292980194092, + "learning_rate": 6.223927641496506e-06, + "loss": 0.3278, + "step": 28305 + }, + { + "epoch": 1.1839392295170916, + "grad_norm": 6.792050361633301, + "learning_rate": 6.223790598876252e-06, + "loss": 0.2656, + "step": 28306 + }, + { + "epoch": 1.1839527943570265, + "grad_norm": 4.795830726623535, + "learning_rate": 6.223653556255996e-06, + "loss": 0.2597, + "step": 28307 + }, + { + "epoch": 1.1839663591969614, + "grad_norm": 4.411136150360107, + "learning_rate": 6.223516513635741e-06, + "loss": 0.1984, + "step": 28308 + }, + { + "epoch": 1.1839799240368964, + "grad_norm": 5.498684883117676, + "learning_rate": 6.223379471015486e-06, + "loss": 0.3093, + "step": 28309 + }, + { + "epoch": 1.1839934888768313, + "grad_norm": 5.615501880645752, + "learning_rate": 6.223242428395232e-06, + "loss": 0.3021, + "step": 28310 + }, + { + "epoch": 1.1840070537167662, + "grad_norm": 6.143216609954834, + "learning_rate": 6.223105385774977e-06, + "loss": 0.2658, + "step": 28311 + }, + { + "epoch": 1.184020618556701, + "grad_norm": 8.156095504760742, + "learning_rate": 6.222968343154722e-06, + "loss": 0.3987, + "step": 28312 + }, + { + "epoch": 1.184034183396636, + "grad_norm": 5.388477325439453, + "learning_rate": 6.222831300534466e-06, + "loss": 0.3537, + "step": 28313 + }, + { + "epoch": 1.1840477482365708, + "grad_norm": 6.311400890350342, + "learning_rate": 6.2226942579142115e-06, + "loss": 0.3121, + "step": 28314 + }, + { + "epoch": 1.1840613130765056, + "grad_norm": 4.550206661224365, + "learning_rate": 6.2225572152939575e-06, + "loss": 0.2029, + "step": 28315 + }, + { + "epoch": 1.1840748779164405, + "grad_norm": 6.66693639755249, + "learning_rate": 6.222420172673702e-06, + "loss": 0.4074, + "step": 28316 + }, + { + "epoch": 1.1840884427563756, + "grad_norm": 7.247567176818848, + "learning_rate": 6.222283130053447e-06, + "loss": 0.4749, + "step": 28317 + }, + { + "epoch": 1.1841020075963105, + "grad_norm": 6.199003219604492, + "learning_rate": 6.222146087433192e-06, + "loss": 0.4055, + "step": 28318 + }, + { + "epoch": 1.1841155724362453, + "grad_norm": 5.279077529907227, + "learning_rate": 6.222009044812937e-06, + "loss": 0.3185, + "step": 28319 + }, + { + "epoch": 1.1841291372761802, + "grad_norm": 5.626030921936035, + "learning_rate": 6.221872002192683e-06, + "loss": 0.2548, + "step": 28320 + }, + { + "epoch": 1.184142702116115, + "grad_norm": 7.677748203277588, + "learning_rate": 6.221734959572428e-06, + "loss": 0.5598, + "step": 28321 + }, + { + "epoch": 1.18415626695605, + "grad_norm": 6.547774791717529, + "learning_rate": 6.221597916952172e-06, + "loss": 0.3452, + "step": 28322 + }, + { + "epoch": 1.1841698317959848, + "grad_norm": 4.986783027648926, + "learning_rate": 6.221460874331918e-06, + "loss": 0.2065, + "step": 28323 + }, + { + "epoch": 1.1841833966359196, + "grad_norm": 4.1439409255981445, + "learning_rate": 6.221323831711663e-06, + "loss": 0.2641, + "step": 28324 + }, + { + "epoch": 1.1841969614758545, + "grad_norm": 5.280555725097656, + "learning_rate": 6.221186789091408e-06, + "loss": 0.3316, + "step": 28325 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 5.82144021987915, + "learning_rate": 6.221049746471153e-06, + "loss": 0.2568, + "step": 28326 + }, + { + "epoch": 1.1842240911557242, + "grad_norm": 5.463586807250977, + "learning_rate": 6.220912703850898e-06, + "loss": 0.247, + "step": 28327 + }, + { + "epoch": 1.1842376559956593, + "grad_norm": 6.511600017547607, + "learning_rate": 6.220775661230643e-06, + "loss": 0.2579, + "step": 28328 + }, + { + "epoch": 1.1842512208355942, + "grad_norm": 8.221904754638672, + "learning_rate": 6.220638618610388e-06, + "loss": 0.5372, + "step": 28329 + }, + { + "epoch": 1.184264785675529, + "grad_norm": 5.150643825531006, + "learning_rate": 6.2205015759901335e-06, + "loss": 0.3471, + "step": 28330 + }, + { + "epoch": 1.184278350515464, + "grad_norm": 4.402989864349365, + "learning_rate": 6.220364533369878e-06, + "loss": 0.28, + "step": 28331 + }, + { + "epoch": 1.1842919153553988, + "grad_norm": 7.571165084838867, + "learning_rate": 6.220227490749624e-06, + "loss": 0.4226, + "step": 28332 + }, + { + "epoch": 1.1843054801953337, + "grad_norm": 7.920655250549316, + "learning_rate": 6.220090448129369e-06, + "loss": 0.4671, + "step": 28333 + }, + { + "epoch": 1.1843190450352685, + "grad_norm": 5.596644401550293, + "learning_rate": 6.219953405509113e-06, + "loss": 0.3901, + "step": 28334 + }, + { + "epoch": 1.1843326098752034, + "grad_norm": 4.449042320251465, + "learning_rate": 6.219816362888859e-06, + "loss": 0.3337, + "step": 28335 + }, + { + "epoch": 1.1843461747151385, + "grad_norm": 5.16092586517334, + "learning_rate": 6.219679320268605e-06, + "loss": 0.2542, + "step": 28336 + }, + { + "epoch": 1.1843597395550733, + "grad_norm": 8.292262077331543, + "learning_rate": 6.21954227764835e-06, + "loss": 0.1619, + "step": 28337 + }, + { + "epoch": 1.1843733043950082, + "grad_norm": 4.176001071929932, + "learning_rate": 6.219405235028094e-06, + "loss": 0.2397, + "step": 28338 + }, + { + "epoch": 1.184386869234943, + "grad_norm": 5.953124046325684, + "learning_rate": 6.219268192407839e-06, + "loss": 0.3238, + "step": 28339 + }, + { + "epoch": 1.184400434074878, + "grad_norm": 5.596897602081299, + "learning_rate": 6.219131149787584e-06, + "loss": 0.2971, + "step": 28340 + }, + { + "epoch": 1.1844139989148128, + "grad_norm": 5.900373935699463, + "learning_rate": 6.21899410716733e-06, + "loss": 0.3527, + "step": 28341 + }, + { + "epoch": 1.1844275637547477, + "grad_norm": 5.390142440795898, + "learning_rate": 6.218857064547075e-06, + "loss": 0.2773, + "step": 28342 + }, + { + "epoch": 1.1844411285946825, + "grad_norm": 5.971894264221191, + "learning_rate": 6.218720021926819e-06, + "loss": 0.3246, + "step": 28343 + }, + { + "epoch": 1.1844546934346174, + "grad_norm": 6.043708324432373, + "learning_rate": 6.218582979306564e-06, + "loss": 0.3713, + "step": 28344 + }, + { + "epoch": 1.1844682582745523, + "grad_norm": 5.927027702331543, + "learning_rate": 6.21844593668631e-06, + "loss": 0.3886, + "step": 28345 + }, + { + "epoch": 1.1844818231144871, + "grad_norm": 6.29110860824585, + "learning_rate": 6.2183088940660556e-06, + "loss": 0.3417, + "step": 28346 + }, + { + "epoch": 1.1844953879544222, + "grad_norm": 6.903054237365723, + "learning_rate": 6.2181718514458e-06, + "loss": 0.3967, + "step": 28347 + }, + { + "epoch": 1.184508952794357, + "grad_norm": 5.7688727378845215, + "learning_rate": 6.218034808825545e-06, + "loss": 0.3188, + "step": 28348 + }, + { + "epoch": 1.184522517634292, + "grad_norm": 5.450108051300049, + "learning_rate": 6.217897766205291e-06, + "loss": 0.2913, + "step": 28349 + }, + { + "epoch": 1.1845360824742268, + "grad_norm": 3.5425233840942383, + "learning_rate": 6.217760723585035e-06, + "loss": 0.2425, + "step": 28350 + }, + { + "epoch": 1.1845496473141617, + "grad_norm": 5.414345741271973, + "learning_rate": 6.217623680964781e-06, + "loss": 0.3098, + "step": 28351 + }, + { + "epoch": 1.1845632121540965, + "grad_norm": 6.612019062042236, + "learning_rate": 6.217486638344526e-06, + "loss": 0.3339, + "step": 28352 + }, + { + "epoch": 1.1845767769940314, + "grad_norm": 4.786302089691162, + "learning_rate": 6.21734959572427e-06, + "loss": 0.2764, + "step": 28353 + }, + { + "epoch": 1.1845903418339663, + "grad_norm": 7.965022563934326, + "learning_rate": 6.217212553104016e-06, + "loss": 0.3916, + "step": 28354 + }, + { + "epoch": 1.1846039066739014, + "grad_norm": 5.272785186767578, + "learning_rate": 6.217075510483761e-06, + "loss": 0.2281, + "step": 28355 + }, + { + "epoch": 1.1846174715138362, + "grad_norm": 6.329564571380615, + "learning_rate": 6.216938467863506e-06, + "loss": 0.2981, + "step": 28356 + }, + { + "epoch": 1.184631036353771, + "grad_norm": 4.678710460662842, + "learning_rate": 6.216801425243251e-06, + "loss": 0.2928, + "step": 28357 + }, + { + "epoch": 1.184644601193706, + "grad_norm": 4.261331081390381, + "learning_rate": 6.216664382622997e-06, + "loss": 0.2667, + "step": 28358 + }, + { + "epoch": 1.1846581660336408, + "grad_norm": 4.412598133087158, + "learning_rate": 6.216527340002741e-06, + "loss": 0.3992, + "step": 28359 + }, + { + "epoch": 1.1846717308735757, + "grad_norm": 6.230301380157471, + "learning_rate": 6.216390297382486e-06, + "loss": 0.2645, + "step": 28360 + }, + { + "epoch": 1.1846852957135106, + "grad_norm": 6.992010116577148, + "learning_rate": 6.2162532547622315e-06, + "loss": 0.3999, + "step": 28361 + }, + { + "epoch": 1.1846988605534454, + "grad_norm": 6.252387046813965, + "learning_rate": 6.216116212141977e-06, + "loss": 0.3879, + "step": 28362 + }, + { + "epoch": 1.1847124253933803, + "grad_norm": 4.109602451324463, + "learning_rate": 6.215979169521722e-06, + "loss": 0.2461, + "step": 28363 + }, + { + "epoch": 1.1847259902333152, + "grad_norm": 7.526053428649902, + "learning_rate": 6.215842126901467e-06, + "loss": 0.3512, + "step": 28364 + }, + { + "epoch": 1.18473955507325, + "grad_norm": 6.328792095184326, + "learning_rate": 6.215705084281211e-06, + "loss": 0.4298, + "step": 28365 + }, + { + "epoch": 1.184753119913185, + "grad_norm": 8.47066593170166, + "learning_rate": 6.2155680416609574e-06, + "loss": 0.4475, + "step": 28366 + }, + { + "epoch": 1.18476668475312, + "grad_norm": 5.903276443481445, + "learning_rate": 6.215430999040703e-06, + "loss": 0.3666, + "step": 28367 + }, + { + "epoch": 1.1847802495930548, + "grad_norm": 8.650339126586914, + "learning_rate": 6.215293956420447e-06, + "loss": 0.3454, + "step": 28368 + }, + { + "epoch": 1.1847938144329897, + "grad_norm": 6.3961873054504395, + "learning_rate": 6.215156913800192e-06, + "loss": 0.4256, + "step": 28369 + }, + { + "epoch": 1.1848073792729246, + "grad_norm": 5.397269248962402, + "learning_rate": 6.215019871179937e-06, + "loss": 0.3484, + "step": 28370 + }, + { + "epoch": 1.1848209441128594, + "grad_norm": 5.6219987869262695, + "learning_rate": 6.214882828559683e-06, + "loss": 0.3652, + "step": 28371 + }, + { + "epoch": 1.1848345089527943, + "grad_norm": 8.634248733520508, + "learning_rate": 6.214745785939428e-06, + "loss": 0.4732, + "step": 28372 + }, + { + "epoch": 1.1848480737927292, + "grad_norm": 5.825688362121582, + "learning_rate": 6.214608743319173e-06, + "loss": 0.3561, + "step": 28373 + }, + { + "epoch": 1.1848616386326642, + "grad_norm": 6.88893985748291, + "learning_rate": 6.214471700698917e-06, + "loss": 0.3623, + "step": 28374 + }, + { + "epoch": 1.1848752034725991, + "grad_norm": 7.265125751495361, + "learning_rate": 6.214334658078663e-06, + "loss": 0.4428, + "step": 28375 + }, + { + "epoch": 1.184888768312534, + "grad_norm": 5.928593635559082, + "learning_rate": 6.214197615458408e-06, + "loss": 0.2202, + "step": 28376 + }, + { + "epoch": 1.1849023331524688, + "grad_norm": 6.7306976318359375, + "learning_rate": 6.214060572838153e-06, + "loss": 0.3459, + "step": 28377 + }, + { + "epoch": 1.1849158979924037, + "grad_norm": 7.418840408325195, + "learning_rate": 6.213923530217898e-06, + "loss": 0.4078, + "step": 28378 + }, + { + "epoch": 1.1849294628323386, + "grad_norm": 7.014773845672607, + "learning_rate": 6.213786487597644e-06, + "loss": 0.4596, + "step": 28379 + }, + { + "epoch": 1.1849430276722734, + "grad_norm": 5.17608642578125, + "learning_rate": 6.213649444977389e-06, + "loss": 0.2057, + "step": 28380 + }, + { + "epoch": 1.1849565925122083, + "grad_norm": 5.423193454742432, + "learning_rate": 6.2135124023571334e-06, + "loss": 0.2512, + "step": 28381 + }, + { + "epoch": 1.1849701573521432, + "grad_norm": 6.275533199310303, + "learning_rate": 6.213375359736879e-06, + "loss": 0.2784, + "step": 28382 + }, + { + "epoch": 1.184983722192078, + "grad_norm": 4.648232936859131, + "learning_rate": 6.213238317116623e-06, + "loss": 0.26, + "step": 28383 + }, + { + "epoch": 1.184997287032013, + "grad_norm": 6.854219913482666, + "learning_rate": 6.213101274496369e-06, + "loss": 0.2597, + "step": 28384 + }, + { + "epoch": 1.185010851871948, + "grad_norm": 5.257809638977051, + "learning_rate": 6.212964231876114e-06, + "loss": 0.2574, + "step": 28385 + }, + { + "epoch": 1.1850244167118829, + "grad_norm": 5.577162265777588, + "learning_rate": 6.212827189255859e-06, + "loss": 0.2548, + "step": 28386 + }, + { + "epoch": 1.1850379815518177, + "grad_norm": 7.3191819190979, + "learning_rate": 6.212690146635604e-06, + "loss": 0.443, + "step": 28387 + }, + { + "epoch": 1.1850515463917526, + "grad_norm": 6.070218086242676, + "learning_rate": 6.21255310401535e-06, + "loss": 0.2184, + "step": 28388 + }, + { + "epoch": 1.1850651112316875, + "grad_norm": 8.68527603149414, + "learning_rate": 6.212416061395095e-06, + "loss": 0.3729, + "step": 28389 + }, + { + "epoch": 1.1850786760716223, + "grad_norm": 6.014714241027832, + "learning_rate": 6.212279018774839e-06, + "loss": 0.4623, + "step": 28390 + }, + { + "epoch": 1.1850922409115572, + "grad_norm": 4.169238090515137, + "learning_rate": 6.212141976154584e-06, + "loss": 0.2881, + "step": 28391 + }, + { + "epoch": 1.185105805751492, + "grad_norm": 15.845216751098633, + "learning_rate": 6.21200493353433e-06, + "loss": 0.2261, + "step": 28392 + }, + { + "epoch": 1.1851193705914271, + "grad_norm": 4.80888557434082, + "learning_rate": 6.211867890914075e-06, + "loss": 0.344, + "step": 28393 + }, + { + "epoch": 1.185132935431362, + "grad_norm": 6.310265064239502, + "learning_rate": 6.21173084829382e-06, + "loss": 0.3041, + "step": 28394 + }, + { + "epoch": 1.1851465002712969, + "grad_norm": 5.171874046325684, + "learning_rate": 6.211593805673565e-06, + "loss": 0.2636, + "step": 28395 + }, + { + "epoch": 1.1851600651112317, + "grad_norm": 7.368221282958984, + "learning_rate": 6.211456763053309e-06, + "loss": 0.4614, + "step": 28396 + }, + { + "epoch": 1.1851736299511666, + "grad_norm": 4.074239730834961, + "learning_rate": 6.2113197204330554e-06, + "loss": 0.2107, + "step": 28397 + }, + { + "epoch": 1.1851871947911015, + "grad_norm": 6.91958475112915, + "learning_rate": 6.211182677812801e-06, + "loss": 0.2774, + "step": 28398 + }, + { + "epoch": 1.1852007596310363, + "grad_norm": 5.471890926361084, + "learning_rate": 6.211045635192545e-06, + "loss": 0.2502, + "step": 28399 + }, + { + "epoch": 1.1852143244709712, + "grad_norm": 5.926656723022461, + "learning_rate": 6.21090859257229e-06, + "loss": 0.2315, + "step": 28400 + }, + { + "epoch": 1.185227889310906, + "grad_norm": 6.09475040435791, + "learning_rate": 6.210771549952036e-06, + "loss": 0.3574, + "step": 28401 + }, + { + "epoch": 1.185241454150841, + "grad_norm": 5.44488525390625, + "learning_rate": 6.2106345073317805e-06, + "loss": 0.2901, + "step": 28402 + }, + { + "epoch": 1.1852550189907758, + "grad_norm": 6.672934532165527, + "learning_rate": 6.210497464711526e-06, + "loss": 0.2484, + "step": 28403 + }, + { + "epoch": 1.1852685838307109, + "grad_norm": 8.451732635498047, + "learning_rate": 6.210360422091271e-06, + "loss": 0.3122, + "step": 28404 + }, + { + "epoch": 1.1852821486706457, + "grad_norm": 5.982987403869629, + "learning_rate": 6.210223379471017e-06, + "loss": 0.2506, + "step": 28405 + }, + { + "epoch": 1.1852957135105806, + "grad_norm": 6.763962745666504, + "learning_rate": 6.210086336850761e-06, + "loss": 0.3169, + "step": 28406 + }, + { + "epoch": 1.1853092783505155, + "grad_norm": 6.903855800628662, + "learning_rate": 6.209949294230506e-06, + "loss": 0.5346, + "step": 28407 + }, + { + "epoch": 1.1853228431904503, + "grad_norm": 5.821254253387451, + "learning_rate": 6.209812251610251e-06, + "loss": 0.2334, + "step": 28408 + }, + { + "epoch": 1.1853364080303852, + "grad_norm": 6.33995246887207, + "learning_rate": 6.209675208989996e-06, + "loss": 0.2508, + "step": 28409 + }, + { + "epoch": 1.18534997287032, + "grad_norm": 9.49990177154541, + "learning_rate": 6.209538166369742e-06, + "loss": 0.3729, + "step": 28410 + }, + { + "epoch": 1.185363537710255, + "grad_norm": 9.200301170349121, + "learning_rate": 6.209401123749486e-06, + "loss": 0.488, + "step": 28411 + }, + { + "epoch": 1.18537710255019, + "grad_norm": 6.753642559051514, + "learning_rate": 6.2092640811292314e-06, + "loss": 0.3909, + "step": 28412 + }, + { + "epoch": 1.1853906673901249, + "grad_norm": 8.48990249633789, + "learning_rate": 6.209127038508977e-06, + "loss": 0.3308, + "step": 28413 + }, + { + "epoch": 1.1854042322300598, + "grad_norm": 7.2085065841674805, + "learning_rate": 6.208989995888723e-06, + "loss": 0.3518, + "step": 28414 + }, + { + "epoch": 1.1854177970699946, + "grad_norm": 5.607386112213135, + "learning_rate": 6.208852953268467e-06, + "loss": 0.406, + "step": 28415 + }, + { + "epoch": 1.1854313619099295, + "grad_norm": 5.896542072296143, + "learning_rate": 6.208715910648212e-06, + "loss": 0.1824, + "step": 28416 + }, + { + "epoch": 1.1854449267498643, + "grad_norm": 8.882600784301758, + "learning_rate": 6.2085788680279565e-06, + "loss": 0.3536, + "step": 28417 + }, + { + "epoch": 1.1854584915897992, + "grad_norm": 4.684752941131592, + "learning_rate": 6.2084418254077025e-06, + "loss": 0.188, + "step": 28418 + }, + { + "epoch": 1.185472056429734, + "grad_norm": 6.103949546813965, + "learning_rate": 6.208304782787448e-06, + "loss": 0.2118, + "step": 28419 + }, + { + "epoch": 1.185485621269669, + "grad_norm": 8.309111595153809, + "learning_rate": 6.208167740167193e-06, + "loss": 0.3502, + "step": 28420 + }, + { + "epoch": 1.1854991861096038, + "grad_norm": 4.732407569885254, + "learning_rate": 6.208030697546937e-06, + "loss": 0.2883, + "step": 28421 + }, + { + "epoch": 1.1855127509495387, + "grad_norm": 5.309390068054199, + "learning_rate": 6.207893654926682e-06, + "loss": 0.192, + "step": 28422 + }, + { + "epoch": 1.1855263157894738, + "grad_norm": 7.763061046600342, + "learning_rate": 6.207756612306428e-06, + "loss": 0.4102, + "step": 28423 + }, + { + "epoch": 1.1855398806294086, + "grad_norm": 7.05091667175293, + "learning_rate": 6.207619569686173e-06, + "loss": 0.3298, + "step": 28424 + }, + { + "epoch": 1.1855534454693435, + "grad_norm": 4.070611000061035, + "learning_rate": 6.207482527065918e-06, + "loss": 0.2462, + "step": 28425 + }, + { + "epoch": 1.1855670103092784, + "grad_norm": 6.925450325012207, + "learning_rate": 6.207345484445662e-06, + "loss": 0.5029, + "step": 28426 + }, + { + "epoch": 1.1855805751492132, + "grad_norm": 6.609673023223877, + "learning_rate": 6.207208441825408e-06, + "loss": 0.37, + "step": 28427 + }, + { + "epoch": 1.185594139989148, + "grad_norm": 4.840489387512207, + "learning_rate": 6.2070713992051534e-06, + "loss": 0.2289, + "step": 28428 + }, + { + "epoch": 1.185607704829083, + "grad_norm": 4.564587116241455, + "learning_rate": 6.206934356584899e-06, + "loss": 0.2162, + "step": 28429 + }, + { + "epoch": 1.1856212696690178, + "grad_norm": 4.6343607902526855, + "learning_rate": 6.206797313964643e-06, + "loss": 0.3357, + "step": 28430 + }, + { + "epoch": 1.185634834508953, + "grad_norm": 5.272946357727051, + "learning_rate": 6.206660271344389e-06, + "loss": 0.252, + "step": 28431 + }, + { + "epoch": 1.1856483993488878, + "grad_norm": 6.921078681945801, + "learning_rate": 6.206523228724134e-06, + "loss": 0.4367, + "step": 28432 + }, + { + "epoch": 1.1856619641888226, + "grad_norm": 6.955146312713623, + "learning_rate": 6.2063861861038785e-06, + "loss": 0.3511, + "step": 28433 + }, + { + "epoch": 1.1856755290287575, + "grad_norm": 6.261321544647217, + "learning_rate": 6.206249143483624e-06, + "loss": 0.4162, + "step": 28434 + }, + { + "epoch": 1.1856890938686924, + "grad_norm": 5.517294883728027, + "learning_rate": 6.206112100863369e-06, + "loss": 0.2194, + "step": 28435 + }, + { + "epoch": 1.1857026587086272, + "grad_norm": 5.1864423751831055, + "learning_rate": 6.205975058243114e-06, + "loss": 0.2257, + "step": 28436 + }, + { + "epoch": 1.185716223548562, + "grad_norm": 5.068427085876465, + "learning_rate": 6.205838015622859e-06, + "loss": 0.3193, + "step": 28437 + }, + { + "epoch": 1.185729788388497, + "grad_norm": 7.440979957580566, + "learning_rate": 6.205700973002604e-06, + "loss": 0.3744, + "step": 28438 + }, + { + "epoch": 1.1857433532284318, + "grad_norm": 4.765352725982666, + "learning_rate": 6.205563930382349e-06, + "loss": 0.2304, + "step": 28439 + }, + { + "epoch": 1.1857569180683667, + "grad_norm": 5.78010892868042, + "learning_rate": 6.205426887762095e-06, + "loss": 0.4511, + "step": 28440 + }, + { + "epoch": 1.1857704829083016, + "grad_norm": 6.855781078338623, + "learning_rate": 6.20528984514184e-06, + "loss": 0.2732, + "step": 28441 + }, + { + "epoch": 1.1857840477482366, + "grad_norm": 6.068266868591309, + "learning_rate": 6.205152802521584e-06, + "loss": 0.313, + "step": 28442 + }, + { + "epoch": 1.1857976125881715, + "grad_norm": 6.16610050201416, + "learning_rate": 6.2050157599013294e-06, + "loss": 0.3187, + "step": 28443 + }, + { + "epoch": 1.1858111774281064, + "grad_norm": 6.487165451049805, + "learning_rate": 6.2048787172810755e-06, + "loss": 0.3809, + "step": 28444 + }, + { + "epoch": 1.1858247422680412, + "grad_norm": 7.321776866912842, + "learning_rate": 6.204741674660821e-06, + "loss": 0.2881, + "step": 28445 + }, + { + "epoch": 1.185838307107976, + "grad_norm": 4.203503608703613, + "learning_rate": 6.204604632040565e-06, + "loss": 0.1885, + "step": 28446 + }, + { + "epoch": 1.185851871947911, + "grad_norm": 5.935147285461426, + "learning_rate": 6.20446758942031e-06, + "loss": 0.3258, + "step": 28447 + }, + { + "epoch": 1.1858654367878458, + "grad_norm": 7.582267761230469, + "learning_rate": 6.204330546800056e-06, + "loss": 0.4892, + "step": 28448 + }, + { + "epoch": 1.1858790016277807, + "grad_norm": 6.692717552185059, + "learning_rate": 6.2041935041798005e-06, + "loss": 0.3297, + "step": 28449 + }, + { + "epoch": 1.1858925664677158, + "grad_norm": 6.669566631317139, + "learning_rate": 6.204056461559546e-06, + "loss": 0.2828, + "step": 28450 + }, + { + "epoch": 1.1859061313076507, + "grad_norm": 4.874516010284424, + "learning_rate": 6.20391941893929e-06, + "loss": 0.1786, + "step": 28451 + }, + { + "epoch": 1.1859196961475855, + "grad_norm": 6.171983242034912, + "learning_rate": 6.203782376319035e-06, + "loss": 0.3807, + "step": 28452 + }, + { + "epoch": 1.1859332609875204, + "grad_norm": 6.503600597381592, + "learning_rate": 6.203645333698781e-06, + "loss": 0.422, + "step": 28453 + }, + { + "epoch": 1.1859468258274553, + "grad_norm": 4.572989463806152, + "learning_rate": 6.203508291078526e-06, + "loss": 0.1806, + "step": 28454 + }, + { + "epoch": 1.1859603906673901, + "grad_norm": 6.723608493804932, + "learning_rate": 6.203371248458271e-06, + "loss": 0.2984, + "step": 28455 + }, + { + "epoch": 1.185973955507325, + "grad_norm": 4.635809898376465, + "learning_rate": 6.203234205838016e-06, + "loss": 0.2064, + "step": 28456 + }, + { + "epoch": 1.1859875203472598, + "grad_norm": 6.741098880767822, + "learning_rate": 6.203097163217762e-06, + "loss": 0.2413, + "step": 28457 + }, + { + "epoch": 1.1860010851871947, + "grad_norm": 5.913419246673584, + "learning_rate": 6.202960120597506e-06, + "loss": 0.2549, + "step": 28458 + }, + { + "epoch": 1.1860146500271296, + "grad_norm": 6.287648677825928, + "learning_rate": 6.2028230779772514e-06, + "loss": 0.3492, + "step": 28459 + }, + { + "epoch": 1.1860282148670644, + "grad_norm": 6.289571285247803, + "learning_rate": 6.202686035356997e-06, + "loss": 0.2854, + "step": 28460 + }, + { + "epoch": 1.1860417797069995, + "grad_norm": 4.2131218910217285, + "learning_rate": 6.202548992736742e-06, + "loss": 0.1778, + "step": 28461 + }, + { + "epoch": 1.1860553445469344, + "grad_norm": 5.715005397796631, + "learning_rate": 6.202411950116487e-06, + "loss": 0.2647, + "step": 28462 + }, + { + "epoch": 1.1860689093868693, + "grad_norm": 6.338351726531982, + "learning_rate": 6.202274907496232e-06, + "loss": 0.2233, + "step": 28463 + }, + { + "epoch": 1.1860824742268041, + "grad_norm": 7.373296737670898, + "learning_rate": 6.2021378648759765e-06, + "loss": 0.3608, + "step": 28464 + }, + { + "epoch": 1.186096039066739, + "grad_norm": 5.3740925788879395, + "learning_rate": 6.202000822255722e-06, + "loss": 0.2827, + "step": 28465 + }, + { + "epoch": 1.1861096039066739, + "grad_norm": 5.382529258728027, + "learning_rate": 6.201863779635468e-06, + "loss": 0.2498, + "step": 28466 + }, + { + "epoch": 1.1861231687466087, + "grad_norm": 5.915447235107422, + "learning_rate": 6.201726737015212e-06, + "loss": 0.3938, + "step": 28467 + }, + { + "epoch": 1.1861367335865436, + "grad_norm": 4.1056904792785645, + "learning_rate": 6.201589694394957e-06, + "loss": 0.1767, + "step": 28468 + }, + { + "epoch": 1.1861502984264787, + "grad_norm": 6.474739074707031, + "learning_rate": 6.201452651774702e-06, + "loss": 0.3017, + "step": 28469 + }, + { + "epoch": 1.1861638632664135, + "grad_norm": 6.5055060386657715, + "learning_rate": 6.2013156091544476e-06, + "loss": 0.4013, + "step": 28470 + }, + { + "epoch": 1.1861774281063484, + "grad_norm": 4.412998676300049, + "learning_rate": 6.201178566534193e-06, + "loss": 0.2531, + "step": 28471 + }, + { + "epoch": 1.1861909929462833, + "grad_norm": 4.979118347167969, + "learning_rate": 6.201041523913938e-06, + "loss": 0.34, + "step": 28472 + }, + { + "epoch": 1.1862045577862181, + "grad_norm": 5.511168003082275, + "learning_rate": 6.200904481293682e-06, + "loss": 0.2697, + "step": 28473 + }, + { + "epoch": 1.186218122626153, + "grad_norm": 7.725805282592773, + "learning_rate": 6.200767438673428e-06, + "loss": 0.5335, + "step": 28474 + }, + { + "epoch": 1.1862316874660879, + "grad_norm": 6.619484901428223, + "learning_rate": 6.2006303960531735e-06, + "loss": 0.293, + "step": 28475 + }, + { + "epoch": 1.1862452523060227, + "grad_norm": 5.391592502593994, + "learning_rate": 6.200493353432918e-06, + "loss": 0.2424, + "step": 28476 + }, + { + "epoch": 1.1862588171459576, + "grad_norm": 4.6907243728637695, + "learning_rate": 6.200356310812663e-06, + "loss": 0.2641, + "step": 28477 + }, + { + "epoch": 1.1862723819858925, + "grad_norm": 4.4546709060668945, + "learning_rate": 6.200219268192408e-06, + "loss": 0.2096, + "step": 28478 + }, + { + "epoch": 1.1862859468258273, + "grad_norm": 5.5128397941589355, + "learning_rate": 6.200082225572154e-06, + "loss": 0.3952, + "step": 28479 + }, + { + "epoch": 1.1862995116657624, + "grad_norm": 5.201226711273193, + "learning_rate": 6.1999451829518985e-06, + "loss": 0.2394, + "step": 28480 + }, + { + "epoch": 1.1863130765056973, + "grad_norm": 6.728315830230713, + "learning_rate": 6.199808140331644e-06, + "loss": 0.3706, + "step": 28481 + }, + { + "epoch": 1.1863266413456321, + "grad_norm": 4.797784805297852, + "learning_rate": 6.199671097711388e-06, + "loss": 0.2805, + "step": 28482 + }, + { + "epoch": 1.186340206185567, + "grad_norm": 5.474194526672363, + "learning_rate": 6.199534055091134e-06, + "loss": 0.1559, + "step": 28483 + }, + { + "epoch": 1.1863537710255019, + "grad_norm": 5.030036926269531, + "learning_rate": 6.199397012470879e-06, + "loss": 0.247, + "step": 28484 + }, + { + "epoch": 1.1863673358654367, + "grad_norm": 5.266298770904541, + "learning_rate": 6.1992599698506236e-06, + "loss": 0.2472, + "step": 28485 + }, + { + "epoch": 1.1863809007053716, + "grad_norm": 5.197788238525391, + "learning_rate": 6.199122927230369e-06, + "loss": 0.2285, + "step": 28486 + }, + { + "epoch": 1.1863944655453065, + "grad_norm": 3.72278094291687, + "learning_rate": 6.198985884610115e-06, + "loss": 0.1818, + "step": 28487 + }, + { + "epoch": 1.1864080303852416, + "grad_norm": 4.061209678649902, + "learning_rate": 6.19884884198986e-06, + "loss": 0.1993, + "step": 28488 + }, + { + "epoch": 1.1864215952251764, + "grad_norm": 4.359179496765137, + "learning_rate": 6.198711799369604e-06, + "loss": 0.317, + "step": 28489 + }, + { + "epoch": 1.1864351600651113, + "grad_norm": 6.269919395446777, + "learning_rate": 6.1985747567493495e-06, + "loss": 0.2669, + "step": 28490 + }, + { + "epoch": 1.1864487249050462, + "grad_norm": 5.894814491271973, + "learning_rate": 6.198437714129094e-06, + "loss": 0.2348, + "step": 28491 + }, + { + "epoch": 1.186462289744981, + "grad_norm": 4.038005828857422, + "learning_rate": 6.19830067150884e-06, + "loss": 0.3012, + "step": 28492 + }, + { + "epoch": 1.186475854584916, + "grad_norm": 4.50924825668335, + "learning_rate": 6.198163628888585e-06, + "loss": 0.3497, + "step": 28493 + }, + { + "epoch": 1.1864894194248508, + "grad_norm": 3.3181769847869873, + "learning_rate": 6.19802658626833e-06, + "loss": 0.226, + "step": 28494 + }, + { + "epoch": 1.1865029842647856, + "grad_norm": 4.963696479797363, + "learning_rate": 6.1978895436480745e-06, + "loss": 0.3969, + "step": 28495 + }, + { + "epoch": 1.1865165491047205, + "grad_norm": 4.2602057456970215, + "learning_rate": 6.1977525010278205e-06, + "loss": 0.2051, + "step": 28496 + }, + { + "epoch": 1.1865301139446554, + "grad_norm": 4.516676425933838, + "learning_rate": 6.197615458407566e-06, + "loss": 0.2026, + "step": 28497 + }, + { + "epoch": 1.1865436787845904, + "grad_norm": 5.756451606750488, + "learning_rate": 6.19747841578731e-06, + "loss": 0.233, + "step": 28498 + }, + { + "epoch": 1.1865572436245253, + "grad_norm": 4.6103901863098145, + "learning_rate": 6.197341373167055e-06, + "loss": 0.3334, + "step": 28499 + }, + { + "epoch": 1.1865708084644602, + "grad_norm": 3.3753108978271484, + "learning_rate": 6.197204330546801e-06, + "loss": 0.1489, + "step": 28500 + }, + { + "epoch": 1.186584373304395, + "grad_norm": 5.294849395751953, + "learning_rate": 6.1970672879265456e-06, + "loss": 0.2045, + "step": 28501 + }, + { + "epoch": 1.18659793814433, + "grad_norm": 4.302120685577393, + "learning_rate": 6.196930245306291e-06, + "loss": 0.2425, + "step": 28502 + }, + { + "epoch": 1.1866115029842648, + "grad_norm": 4.733495712280273, + "learning_rate": 6.196793202686036e-06, + "loss": 0.2521, + "step": 28503 + }, + { + "epoch": 1.1866250678241996, + "grad_norm": 4.225207328796387, + "learning_rate": 6.19665616006578e-06, + "loss": 0.2589, + "step": 28504 + }, + { + "epoch": 1.1866386326641345, + "grad_norm": 3.9481093883514404, + "learning_rate": 6.196519117445526e-06, + "loss": 0.2856, + "step": 28505 + }, + { + "epoch": 1.1866521975040694, + "grad_norm": 3.620666265487671, + "learning_rate": 6.1963820748252715e-06, + "loss": 0.1643, + "step": 28506 + }, + { + "epoch": 1.1866657623440044, + "grad_norm": 4.705620288848877, + "learning_rate": 6.196245032205016e-06, + "loss": 0.2451, + "step": 28507 + }, + { + "epoch": 1.1866793271839393, + "grad_norm": 3.5636777877807617, + "learning_rate": 6.196107989584761e-06, + "loss": 0.1758, + "step": 28508 + }, + { + "epoch": 1.1866928920238742, + "grad_norm": 3.9101169109344482, + "learning_rate": 6.195970946964507e-06, + "loss": 0.2274, + "step": 28509 + }, + { + "epoch": 1.186706456863809, + "grad_norm": 3.4962644577026367, + "learning_rate": 6.195833904344251e-06, + "loss": 0.1308, + "step": 28510 + }, + { + "epoch": 1.186720021703744, + "grad_norm": 6.016679763793945, + "learning_rate": 6.1956968617239965e-06, + "loss": 0.3742, + "step": 28511 + }, + { + "epoch": 1.1867335865436788, + "grad_norm": 4.395196437835693, + "learning_rate": 6.195559819103742e-06, + "loss": 0.1651, + "step": 28512 + }, + { + "epoch": 1.1867471513836136, + "grad_norm": 4.181546211242676, + "learning_rate": 6.195422776483488e-06, + "loss": 0.213, + "step": 28513 + }, + { + "epoch": 1.1867607162235485, + "grad_norm": 4.06545352935791, + "learning_rate": 6.195285733863232e-06, + "loss": 0.1762, + "step": 28514 + }, + { + "epoch": 1.1867742810634834, + "grad_norm": 3.2196011543273926, + "learning_rate": 6.195148691242977e-06, + "loss": 0.1569, + "step": 28515 + }, + { + "epoch": 1.1867878459034182, + "grad_norm": 5.303373336791992, + "learning_rate": 6.1950116486227216e-06, + "loss": 0.208, + "step": 28516 + }, + { + "epoch": 1.1868014107433533, + "grad_norm": 5.239858150482178, + "learning_rate": 6.194874606002468e-06, + "loss": 0.3035, + "step": 28517 + }, + { + "epoch": 1.1868149755832882, + "grad_norm": 5.29013204574585, + "learning_rate": 6.194737563382213e-06, + "loss": 0.2428, + "step": 28518 + }, + { + "epoch": 1.186828540423223, + "grad_norm": 3.0013620853424072, + "learning_rate": 6.194600520761957e-06, + "loss": 0.1626, + "step": 28519 + }, + { + "epoch": 1.186842105263158, + "grad_norm": 5.234058380126953, + "learning_rate": 6.194463478141702e-06, + "loss": 0.2283, + "step": 28520 + }, + { + "epoch": 1.1868556701030928, + "grad_norm": 4.8106560707092285, + "learning_rate": 6.1943264355214475e-06, + "loss": 0.2149, + "step": 28521 + }, + { + "epoch": 1.1868692349430277, + "grad_norm": 4.8354597091674805, + "learning_rate": 6.1941893929011935e-06, + "loss": 0.2507, + "step": 28522 + }, + { + "epoch": 1.1868827997829625, + "grad_norm": 4.186209678649902, + "learning_rate": 6.194052350280938e-06, + "loss": 0.1975, + "step": 28523 + }, + { + "epoch": 1.1868963646228974, + "grad_norm": 3.5502610206604004, + "learning_rate": 6.193915307660683e-06, + "loss": 0.2588, + "step": 28524 + }, + { + "epoch": 1.1869099294628322, + "grad_norm": 3.6069796085357666, + "learning_rate": 6.193778265040427e-06, + "loss": 0.262, + "step": 28525 + }, + { + "epoch": 1.1869234943027673, + "grad_norm": 5.412714004516602, + "learning_rate": 6.193641222420173e-06, + "loss": 0.2204, + "step": 28526 + }, + { + "epoch": 1.1869370591427022, + "grad_norm": 4.681711673736572, + "learning_rate": 6.1935041797999185e-06, + "loss": 0.2499, + "step": 28527 + }, + { + "epoch": 1.186950623982637, + "grad_norm": 3.9246411323547363, + "learning_rate": 6.193367137179664e-06, + "loss": 0.1664, + "step": 28528 + }, + { + "epoch": 1.186964188822572, + "grad_norm": 4.497432708740234, + "learning_rate": 6.193230094559408e-06, + "loss": 0.2224, + "step": 28529 + }, + { + "epoch": 1.1869777536625068, + "grad_norm": 5.410967826843262, + "learning_rate": 6.193093051939154e-06, + "loss": 0.2141, + "step": 28530 + }, + { + "epoch": 1.1869913185024417, + "grad_norm": 4.740687370300293, + "learning_rate": 6.192956009318899e-06, + "loss": 0.2113, + "step": 28531 + }, + { + "epoch": 1.1870048833423765, + "grad_norm": 7.948010444641113, + "learning_rate": 6.192818966698644e-06, + "loss": 0.2749, + "step": 28532 + }, + { + "epoch": 1.1870184481823114, + "grad_norm": 4.251758575439453, + "learning_rate": 6.192681924078389e-06, + "loss": 0.2206, + "step": 28533 + }, + { + "epoch": 1.1870320130222463, + "grad_norm": 5.7825446128845215, + "learning_rate": 6.192544881458133e-06, + "loss": 0.2625, + "step": 28534 + }, + { + "epoch": 1.1870455778621811, + "grad_norm": 6.328247547149658, + "learning_rate": 6.192407838837879e-06, + "loss": 0.267, + "step": 28535 + }, + { + "epoch": 1.1870591427021162, + "grad_norm": 5.825782299041748, + "learning_rate": 6.192270796217624e-06, + "loss": 0.3429, + "step": 28536 + }, + { + "epoch": 1.187072707542051, + "grad_norm": 6.50437593460083, + "learning_rate": 6.1921337535973695e-06, + "loss": 0.4285, + "step": 28537 + }, + { + "epoch": 1.187086272381986, + "grad_norm": 4.48021125793457, + "learning_rate": 6.191996710977114e-06, + "loss": 0.2673, + "step": 28538 + }, + { + "epoch": 1.1870998372219208, + "grad_norm": 5.143357753753662, + "learning_rate": 6.19185966835686e-06, + "loss": 0.2511, + "step": 28539 + }, + { + "epoch": 1.1871134020618557, + "grad_norm": 4.614096641540527, + "learning_rate": 6.191722625736605e-06, + "loss": 0.3475, + "step": 28540 + }, + { + "epoch": 1.1871269669017905, + "grad_norm": 4.394341468811035, + "learning_rate": 6.191585583116349e-06, + "loss": 0.3491, + "step": 28541 + }, + { + "epoch": 1.1871405317417254, + "grad_norm": 4.671844959259033, + "learning_rate": 6.1914485404960945e-06, + "loss": 0.3444, + "step": 28542 + }, + { + "epoch": 1.1871540965816603, + "grad_norm": 5.163529396057129, + "learning_rate": 6.1913114978758405e-06, + "loss": 0.3547, + "step": 28543 + }, + { + "epoch": 1.1871676614215951, + "grad_norm": 6.461959362030029, + "learning_rate": 6.191174455255585e-06, + "loss": 0.3498, + "step": 28544 + }, + { + "epoch": 1.1871812262615302, + "grad_norm": 7.52573823928833, + "learning_rate": 6.19103741263533e-06, + "loss": 0.3425, + "step": 28545 + }, + { + "epoch": 1.187194791101465, + "grad_norm": 4.492635250091553, + "learning_rate": 6.190900370015075e-06, + "loss": 0.228, + "step": 28546 + }, + { + "epoch": 1.1872083559414, + "grad_norm": 4.284180641174316, + "learning_rate": 6.1907633273948196e-06, + "loss": 0.3059, + "step": 28547 + }, + { + "epoch": 1.1872219207813348, + "grad_norm": 4.423342227935791, + "learning_rate": 6.190626284774566e-06, + "loss": 0.2687, + "step": 28548 + }, + { + "epoch": 1.1872354856212697, + "grad_norm": 4.405217170715332, + "learning_rate": 6.190489242154311e-06, + "loss": 0.3478, + "step": 28549 + }, + { + "epoch": 1.1872490504612045, + "grad_norm": 5.535479545593262, + "learning_rate": 6.190352199534055e-06, + "loss": 0.4056, + "step": 28550 + }, + { + "epoch": 1.1872626153011394, + "grad_norm": 5.8874711990356445, + "learning_rate": 6.1902151569138e-06, + "loss": 0.1805, + "step": 28551 + }, + { + "epoch": 1.1872761801410743, + "grad_norm": 5.077761173248291, + "learning_rate": 6.190078114293546e-06, + "loss": 0.2762, + "step": 28552 + }, + { + "epoch": 1.1872897449810091, + "grad_norm": 4.48350715637207, + "learning_rate": 6.189941071673291e-06, + "loss": 0.3467, + "step": 28553 + }, + { + "epoch": 1.187303309820944, + "grad_norm": 5.587735176086426, + "learning_rate": 6.189804029053036e-06, + "loss": 0.4266, + "step": 28554 + }, + { + "epoch": 1.187316874660879, + "grad_norm": 6.581872940063477, + "learning_rate": 6.189666986432781e-06, + "loss": 0.3601, + "step": 28555 + }, + { + "epoch": 1.187330439500814, + "grad_norm": 4.9821977615356445, + "learning_rate": 6.189529943812527e-06, + "loss": 0.3334, + "step": 28556 + }, + { + "epoch": 1.1873440043407488, + "grad_norm": 5.037509918212891, + "learning_rate": 6.189392901192271e-06, + "loss": 0.3088, + "step": 28557 + }, + { + "epoch": 1.1873575691806837, + "grad_norm": 5.7692952156066895, + "learning_rate": 6.1892558585720165e-06, + "loss": 0.3547, + "step": 28558 + }, + { + "epoch": 1.1873711340206186, + "grad_norm": 4.767807483673096, + "learning_rate": 6.189118815951761e-06, + "loss": 0.3364, + "step": 28559 + }, + { + "epoch": 1.1873846988605534, + "grad_norm": 5.259003162384033, + "learning_rate": 6.188981773331506e-06, + "loss": 0.2776, + "step": 28560 + }, + { + "epoch": 1.1873982637004883, + "grad_norm": 6.01900577545166, + "learning_rate": 6.188844730711252e-06, + "loss": 0.4545, + "step": 28561 + }, + { + "epoch": 1.1874118285404232, + "grad_norm": 4.65612268447876, + "learning_rate": 6.188707688090997e-06, + "loss": 0.2426, + "step": 28562 + }, + { + "epoch": 1.187425393380358, + "grad_norm": 5.867589473724365, + "learning_rate": 6.188570645470742e-06, + "loss": 0.3905, + "step": 28563 + }, + { + "epoch": 1.187438958220293, + "grad_norm": 6.738822937011719, + "learning_rate": 6.188433602850487e-06, + "loss": 0.3348, + "step": 28564 + }, + { + "epoch": 1.187452523060228, + "grad_norm": 5.027095794677734, + "learning_rate": 6.188296560230233e-06, + "loss": 0.2608, + "step": 28565 + }, + { + "epoch": 1.1874660879001628, + "grad_norm": 6.773409843444824, + "learning_rate": 6.188159517609977e-06, + "loss": 0.3208, + "step": 28566 + }, + { + "epoch": 1.1874796527400977, + "grad_norm": 4.454805850982666, + "learning_rate": 6.188022474989722e-06, + "loss": 0.221, + "step": 28567 + }, + { + "epoch": 1.1874932175800326, + "grad_norm": 6.3704304695129395, + "learning_rate": 6.187885432369467e-06, + "loss": 0.409, + "step": 28568 + }, + { + "epoch": 1.1875067824199674, + "grad_norm": 5.179131031036377, + "learning_rate": 6.187748389749213e-06, + "loss": 0.237, + "step": 28569 + }, + { + "epoch": 1.1875203472599023, + "grad_norm": 6.355732440948486, + "learning_rate": 6.187611347128958e-06, + "loss": 0.4388, + "step": 28570 + }, + { + "epoch": 1.1875339120998372, + "grad_norm": 6.22226619720459, + "learning_rate": 6.187474304508703e-06, + "loss": 0.3232, + "step": 28571 + }, + { + "epoch": 1.187547476939772, + "grad_norm": 6.875695705413818, + "learning_rate": 6.187337261888447e-06, + "loss": 0.4335, + "step": 28572 + }, + { + "epoch": 1.187561041779707, + "grad_norm": 6.285234451293945, + "learning_rate": 6.1872002192681925e-06, + "loss": 0.4011, + "step": 28573 + }, + { + "epoch": 1.187574606619642, + "grad_norm": 7.762821197509766, + "learning_rate": 6.1870631766479385e-06, + "loss": 0.4161, + "step": 28574 + }, + { + "epoch": 1.1875881714595768, + "grad_norm": 5.241636276245117, + "learning_rate": 6.186926134027683e-06, + "loss": 0.3185, + "step": 28575 + }, + { + "epoch": 1.1876017362995117, + "grad_norm": 5.362857341766357, + "learning_rate": 6.186789091407428e-06, + "loss": 0.2554, + "step": 28576 + }, + { + "epoch": 1.1876153011394466, + "grad_norm": 4.627424716949463, + "learning_rate": 6.186652048787173e-06, + "loss": 0.3064, + "step": 28577 + }, + { + "epoch": 1.1876288659793814, + "grad_norm": 6.4341020584106445, + "learning_rate": 6.186515006166918e-06, + "loss": 0.33, + "step": 28578 + }, + { + "epoch": 1.1876424308193163, + "grad_norm": 6.480379104614258, + "learning_rate": 6.186377963546664e-06, + "loss": 0.2949, + "step": 28579 + }, + { + "epoch": 1.1876559956592512, + "grad_norm": 5.104166030883789, + "learning_rate": 6.186240920926409e-06, + "loss": 0.2219, + "step": 28580 + }, + { + "epoch": 1.187669560499186, + "grad_norm": 7.1401872634887695, + "learning_rate": 6.186103878306153e-06, + "loss": 0.3592, + "step": 28581 + }, + { + "epoch": 1.187683125339121, + "grad_norm": 7.089640140533447, + "learning_rate": 6.185966835685899e-06, + "loss": 0.4076, + "step": 28582 + }, + { + "epoch": 1.187696690179056, + "grad_norm": 5.513156890869141, + "learning_rate": 6.185829793065644e-06, + "loss": 0.2468, + "step": 28583 + }, + { + "epoch": 1.1877102550189909, + "grad_norm": 8.288480758666992, + "learning_rate": 6.185692750445389e-06, + "loss": 0.6023, + "step": 28584 + }, + { + "epoch": 1.1877238198589257, + "grad_norm": 6.103438377380371, + "learning_rate": 6.185555707825134e-06, + "loss": 0.4056, + "step": 28585 + }, + { + "epoch": 1.1877373846988606, + "grad_norm": 5.947842121124268, + "learning_rate": 6.18541866520488e-06, + "loss": 0.4196, + "step": 28586 + }, + { + "epoch": 1.1877509495387955, + "grad_norm": 4.573252201080322, + "learning_rate": 6.185281622584625e-06, + "loss": 0.2154, + "step": 28587 + }, + { + "epoch": 1.1877645143787303, + "grad_norm": 6.47760009765625, + "learning_rate": 6.185144579964369e-06, + "loss": 0.3809, + "step": 28588 + }, + { + "epoch": 1.1877780792186652, + "grad_norm": 6.685666084289551, + "learning_rate": 6.1850075373441145e-06, + "loss": 0.3753, + "step": 28589 + }, + { + "epoch": 1.1877916440586, + "grad_norm": 6.119844913482666, + "learning_rate": 6.184870494723859e-06, + "loss": 0.3595, + "step": 28590 + }, + { + "epoch": 1.187805208898535, + "grad_norm": 5.187047958374023, + "learning_rate": 6.184733452103605e-06, + "loss": 0.2861, + "step": 28591 + }, + { + "epoch": 1.1878187737384698, + "grad_norm": 5.515477180480957, + "learning_rate": 6.18459640948335e-06, + "loss": 0.2525, + "step": 28592 + }, + { + "epoch": 1.1878323385784049, + "grad_norm": 6.605901718139648, + "learning_rate": 6.184459366863094e-06, + "loss": 0.5213, + "step": 28593 + }, + { + "epoch": 1.1878459034183397, + "grad_norm": 5.487936019897461, + "learning_rate": 6.18432232424284e-06, + "loss": 0.2998, + "step": 28594 + }, + { + "epoch": 1.1878594682582746, + "grad_norm": 6.697423934936523, + "learning_rate": 6.184185281622586e-06, + "loss": 0.3666, + "step": 28595 + }, + { + "epoch": 1.1878730330982095, + "grad_norm": 6.091304302215576, + "learning_rate": 6.184048239002331e-06, + "loss": 0.3366, + "step": 28596 + }, + { + "epoch": 1.1878865979381443, + "grad_norm": 7.543860912322998, + "learning_rate": 6.183911196382075e-06, + "loss": 0.6645, + "step": 28597 + }, + { + "epoch": 1.1879001627780792, + "grad_norm": 5.559720993041992, + "learning_rate": 6.18377415376182e-06, + "loss": 0.3954, + "step": 28598 + }, + { + "epoch": 1.187913727618014, + "grad_norm": 5.981405258178711, + "learning_rate": 6.183637111141566e-06, + "loss": 0.3969, + "step": 28599 + }, + { + "epoch": 1.187927292457949, + "grad_norm": 6.856416702270508, + "learning_rate": 6.183500068521311e-06, + "loss": 0.3383, + "step": 28600 + }, + { + "epoch": 1.1879408572978838, + "grad_norm": 6.215513706207275, + "learning_rate": 6.183363025901056e-06, + "loss": 0.534, + "step": 28601 + }, + { + "epoch": 1.1879544221378189, + "grad_norm": 4.836192607879639, + "learning_rate": 6.1832259832808e-06, + "loss": 0.2743, + "step": 28602 + }, + { + "epoch": 1.1879679869777537, + "grad_norm": 3.1993939876556396, + "learning_rate": 6.183088940660545e-06, + "loss": 0.1654, + "step": 28603 + }, + { + "epoch": 1.1879815518176886, + "grad_norm": 5.775193691253662, + "learning_rate": 6.182951898040291e-06, + "loss": 0.3744, + "step": 28604 + }, + { + "epoch": 1.1879951166576235, + "grad_norm": 5.260907173156738, + "learning_rate": 6.1828148554200366e-06, + "loss": 0.3459, + "step": 28605 + }, + { + "epoch": 1.1880086814975583, + "grad_norm": 6.957780838012695, + "learning_rate": 6.182677812799781e-06, + "loss": 0.381, + "step": 28606 + }, + { + "epoch": 1.1880222463374932, + "grad_norm": 4.013978958129883, + "learning_rate": 6.182540770179526e-06, + "loss": 0.2284, + "step": 28607 + }, + { + "epoch": 1.188035811177428, + "grad_norm": 3.9088830947875977, + "learning_rate": 6.182403727559272e-06, + "loss": 0.1495, + "step": 28608 + }, + { + "epoch": 1.188049376017363, + "grad_norm": 5.9689507484436035, + "learning_rate": 6.182266684939016e-06, + "loss": 0.2802, + "step": 28609 + }, + { + "epoch": 1.1880629408572978, + "grad_norm": 6.1143317222595215, + "learning_rate": 6.182129642318762e-06, + "loss": 0.3497, + "step": 28610 + }, + { + "epoch": 1.1880765056972327, + "grad_norm": 6.571101665496826, + "learning_rate": 6.181992599698507e-06, + "loss": 0.4788, + "step": 28611 + }, + { + "epoch": 1.1880900705371678, + "grad_norm": 6.543920993804932, + "learning_rate": 6.181855557078252e-06, + "loss": 0.3655, + "step": 28612 + }, + { + "epoch": 1.1881036353771026, + "grad_norm": 5.8950910568237305, + "learning_rate": 6.181718514457997e-06, + "loss": 0.327, + "step": 28613 + }, + { + "epoch": 1.1881172002170375, + "grad_norm": 5.535221099853516, + "learning_rate": 6.181581471837742e-06, + "loss": 0.3374, + "step": 28614 + }, + { + "epoch": 1.1881307650569723, + "grad_norm": 4.658801555633545, + "learning_rate": 6.181444429217487e-06, + "loss": 0.261, + "step": 28615 + }, + { + "epoch": 1.1881443298969072, + "grad_norm": 5.492800235748291, + "learning_rate": 6.181307386597232e-06, + "loss": 0.2578, + "step": 28616 + }, + { + "epoch": 1.188157894736842, + "grad_norm": 5.404764652252197, + "learning_rate": 6.181170343976978e-06, + "loss": 0.3095, + "step": 28617 + }, + { + "epoch": 1.188171459576777, + "grad_norm": 5.557641983032227, + "learning_rate": 6.181033301356722e-06, + "loss": 0.2631, + "step": 28618 + }, + { + "epoch": 1.1881850244167118, + "grad_norm": 5.120761394500732, + "learning_rate": 6.180896258736467e-06, + "loss": 0.223, + "step": 28619 + }, + { + "epoch": 1.1881985892566467, + "grad_norm": 4.686125755310059, + "learning_rate": 6.1807592161162125e-06, + "loss": 0.2595, + "step": 28620 + }, + { + "epoch": 1.1882121540965818, + "grad_norm": 6.858528137207031, + "learning_rate": 6.1806221734959586e-06, + "loss": 0.3695, + "step": 28621 + }, + { + "epoch": 1.1882257189365166, + "grad_norm": 7.007960796356201, + "learning_rate": 6.180485130875703e-06, + "loss": 0.452, + "step": 28622 + }, + { + "epoch": 1.1882392837764515, + "grad_norm": 6.250173568725586, + "learning_rate": 6.180348088255448e-06, + "loss": 0.2976, + "step": 28623 + }, + { + "epoch": 1.1882528486163864, + "grad_norm": 4.692554473876953, + "learning_rate": 6.180211045635192e-06, + "loss": 0.2402, + "step": 28624 + }, + { + "epoch": 1.1882664134563212, + "grad_norm": 5.762017250061035, + "learning_rate": 6.1800740030149384e-06, + "loss": 0.3082, + "step": 28625 + }, + { + "epoch": 1.188279978296256, + "grad_norm": 5.2303972244262695, + "learning_rate": 6.179936960394684e-06, + "loss": 0.2113, + "step": 28626 + }, + { + "epoch": 1.188293543136191, + "grad_norm": 5.442984580993652, + "learning_rate": 6.179799917774428e-06, + "loss": 0.3571, + "step": 28627 + }, + { + "epoch": 1.1883071079761258, + "grad_norm": 6.11572265625, + "learning_rate": 6.179662875154173e-06, + "loss": 0.3414, + "step": 28628 + }, + { + "epoch": 1.1883206728160607, + "grad_norm": 6.777576446533203, + "learning_rate": 6.179525832533918e-06, + "loss": 0.3863, + "step": 28629 + }, + { + "epoch": 1.1883342376559956, + "grad_norm": 4.55021858215332, + "learning_rate": 6.179388789913664e-06, + "loss": 0.1797, + "step": 28630 + }, + { + "epoch": 1.1883478024959306, + "grad_norm": 4.635959148406982, + "learning_rate": 6.179251747293409e-06, + "loss": 0.2231, + "step": 28631 + }, + { + "epoch": 1.1883613673358655, + "grad_norm": 6.7694549560546875, + "learning_rate": 6.179114704673154e-06, + "loss": 0.3039, + "step": 28632 + }, + { + "epoch": 1.1883749321758004, + "grad_norm": 5.188144683837891, + "learning_rate": 6.178977662052898e-06, + "loss": 0.2901, + "step": 28633 + }, + { + "epoch": 1.1883884970157352, + "grad_norm": 7.843433380126953, + "learning_rate": 6.178840619432644e-06, + "loss": 0.3112, + "step": 28634 + }, + { + "epoch": 1.18840206185567, + "grad_norm": 4.85883092880249, + "learning_rate": 6.178703576812389e-06, + "loss": 0.2428, + "step": 28635 + }, + { + "epoch": 1.188415626695605, + "grad_norm": 5.074324131011963, + "learning_rate": 6.1785665341921346e-06, + "loss": 0.2584, + "step": 28636 + }, + { + "epoch": 1.1884291915355398, + "grad_norm": 5.160990238189697, + "learning_rate": 6.178429491571879e-06, + "loss": 0.2919, + "step": 28637 + }, + { + "epoch": 1.1884427563754747, + "grad_norm": 4.526954174041748, + "learning_rate": 6.178292448951625e-06, + "loss": 0.1651, + "step": 28638 + }, + { + "epoch": 1.1884563212154096, + "grad_norm": 5.529280185699463, + "learning_rate": 6.17815540633137e-06, + "loss": 0.3271, + "step": 28639 + }, + { + "epoch": 1.1884698860553446, + "grad_norm": 6.153957366943359, + "learning_rate": 6.1780183637111144e-06, + "loss": 0.4417, + "step": 28640 + }, + { + "epoch": 1.1884834508952795, + "grad_norm": 7.180099964141846, + "learning_rate": 6.17788132109086e-06, + "loss": 0.4143, + "step": 28641 + }, + { + "epoch": 1.1884970157352144, + "grad_norm": 5.704042911529541, + "learning_rate": 6.177744278470604e-06, + "loss": 0.3831, + "step": 28642 + }, + { + "epoch": 1.1885105805751492, + "grad_norm": 6.796253204345703, + "learning_rate": 6.17760723585035e-06, + "loss": 0.2974, + "step": 28643 + }, + { + "epoch": 1.188524145415084, + "grad_norm": 5.37118673324585, + "learning_rate": 6.177470193230095e-06, + "loss": 0.2218, + "step": 28644 + }, + { + "epoch": 1.188537710255019, + "grad_norm": 6.754150390625, + "learning_rate": 6.17733315060984e-06, + "loss": 0.2711, + "step": 28645 + }, + { + "epoch": 1.1885512750949538, + "grad_norm": 5.079777240753174, + "learning_rate": 6.177196107989585e-06, + "loss": 0.2642, + "step": 28646 + }, + { + "epoch": 1.1885648399348887, + "grad_norm": 6.3530097007751465, + "learning_rate": 6.177059065369331e-06, + "loss": 0.2974, + "step": 28647 + }, + { + "epoch": 1.1885784047748236, + "grad_norm": 5.02938985824585, + "learning_rate": 6.176922022749076e-06, + "loss": 0.4056, + "step": 28648 + }, + { + "epoch": 1.1885919696147584, + "grad_norm": 6.840432167053223, + "learning_rate": 6.17678498012882e-06, + "loss": 0.4343, + "step": 28649 + }, + { + "epoch": 1.1886055344546935, + "grad_norm": 3.9889464378356934, + "learning_rate": 6.176647937508565e-06, + "loss": 0.2302, + "step": 28650 + }, + { + "epoch": 1.1886190992946284, + "grad_norm": 5.714286804199219, + "learning_rate": 6.176510894888311e-06, + "loss": 0.3593, + "step": 28651 + }, + { + "epoch": 1.1886326641345633, + "grad_norm": 5.1857123374938965, + "learning_rate": 6.176373852268056e-06, + "loss": 0.3417, + "step": 28652 + }, + { + "epoch": 1.1886462289744981, + "grad_norm": 6.95072603225708, + "learning_rate": 6.176236809647801e-06, + "loss": 0.3069, + "step": 28653 + }, + { + "epoch": 1.188659793814433, + "grad_norm": 8.041520118713379, + "learning_rate": 6.176099767027546e-06, + "loss": 0.3394, + "step": 28654 + }, + { + "epoch": 1.1886733586543679, + "grad_norm": 7.689925193786621, + "learning_rate": 6.175962724407292e-06, + "loss": 0.3355, + "step": 28655 + }, + { + "epoch": 1.1886869234943027, + "grad_norm": 7.564487457275391, + "learning_rate": 6.1758256817870364e-06, + "loss": 0.3956, + "step": 28656 + }, + { + "epoch": 1.1887004883342376, + "grad_norm": 6.090713977813721, + "learning_rate": 6.175688639166782e-06, + "loss": 0.402, + "step": 28657 + }, + { + "epoch": 1.1887140531741727, + "grad_norm": 4.889334201812744, + "learning_rate": 6.175551596546526e-06, + "loss": 0.324, + "step": 28658 + }, + { + "epoch": 1.1887276180141075, + "grad_norm": 6.032786846160889, + "learning_rate": 6.175414553926271e-06, + "loss": 0.2983, + "step": 28659 + }, + { + "epoch": 1.1887411828540424, + "grad_norm": 6.536579132080078, + "learning_rate": 6.175277511306017e-06, + "loss": 0.2966, + "step": 28660 + }, + { + "epoch": 1.1887547476939773, + "grad_norm": 4.120591163635254, + "learning_rate": 6.1751404686857615e-06, + "loss": 0.2439, + "step": 28661 + }, + { + "epoch": 1.1887683125339121, + "grad_norm": 5.620045185089111, + "learning_rate": 6.175003426065507e-06, + "loss": 0.4192, + "step": 28662 + }, + { + "epoch": 1.188781877373847, + "grad_norm": 5.5372138023376465, + "learning_rate": 6.174866383445252e-06, + "loss": 0.2864, + "step": 28663 + }, + { + "epoch": 1.1887954422137819, + "grad_norm": 5.336600303649902, + "learning_rate": 6.174729340824998e-06, + "loss": 0.1702, + "step": 28664 + }, + { + "epoch": 1.1888090070537167, + "grad_norm": 6.237483978271484, + "learning_rate": 6.174592298204742e-06, + "loss": 0.2699, + "step": 28665 + }, + { + "epoch": 1.1888225718936516, + "grad_norm": 7.284326076507568, + "learning_rate": 6.174455255584487e-06, + "loss": 0.4846, + "step": 28666 + }, + { + "epoch": 1.1888361367335865, + "grad_norm": 8.231573104858398, + "learning_rate": 6.174318212964232e-06, + "loss": 0.3901, + "step": 28667 + }, + { + "epoch": 1.1888497015735213, + "grad_norm": 4.756435871124268, + "learning_rate": 6.174181170343978e-06, + "loss": 0.2449, + "step": 28668 + }, + { + "epoch": 1.1888632664134564, + "grad_norm": 5.747459411621094, + "learning_rate": 6.174044127723723e-06, + "loss": 0.2238, + "step": 28669 + }, + { + "epoch": 1.1888768312533913, + "grad_norm": 8.273528099060059, + "learning_rate": 6.173907085103468e-06, + "loss": 0.4283, + "step": 28670 + }, + { + "epoch": 1.1888903960933261, + "grad_norm": 5.204527854919434, + "learning_rate": 6.1737700424832124e-06, + "loss": 0.2203, + "step": 28671 + }, + { + "epoch": 1.188903960933261, + "grad_norm": 5.831607818603516, + "learning_rate": 6.173632999862958e-06, + "loss": 0.1976, + "step": 28672 + }, + { + "epoch": 1.1889175257731959, + "grad_norm": 14.942849159240723, + "learning_rate": 6.173495957242704e-06, + "loss": 0.3122, + "step": 28673 + }, + { + "epoch": 1.1889310906131307, + "grad_norm": 4.439630508422852, + "learning_rate": 6.173358914622448e-06, + "loss": 0.2583, + "step": 28674 + }, + { + "epoch": 1.1889446554530656, + "grad_norm": 7.658847332000732, + "learning_rate": 6.173221872002193e-06, + "loss": 0.248, + "step": 28675 + }, + { + "epoch": 1.1889582202930005, + "grad_norm": 5.566850185394287, + "learning_rate": 6.1730848293819375e-06, + "loss": 0.244, + "step": 28676 + }, + { + "epoch": 1.1889717851329356, + "grad_norm": 7.021132469177246, + "learning_rate": 6.1729477867616835e-06, + "loss": 0.265, + "step": 28677 + }, + { + "epoch": 1.1889853499728704, + "grad_norm": 5.623444080352783, + "learning_rate": 6.172810744141429e-06, + "loss": 0.3233, + "step": 28678 + }, + { + "epoch": 1.1889989148128053, + "grad_norm": 6.756374835968018, + "learning_rate": 6.172673701521174e-06, + "loss": 0.3471, + "step": 28679 + }, + { + "epoch": 1.1890124796527402, + "grad_norm": 4.757625102996826, + "learning_rate": 6.172536658900918e-06, + "loss": 0.155, + "step": 28680 + }, + { + "epoch": 1.189026044492675, + "grad_norm": 3.9269516468048096, + "learning_rate": 6.172399616280664e-06, + "loss": 0.1332, + "step": 28681 + }, + { + "epoch": 1.1890396093326099, + "grad_norm": 4.196984767913818, + "learning_rate": 6.172262573660409e-06, + "loss": 0.2237, + "step": 28682 + }, + { + "epoch": 1.1890531741725447, + "grad_norm": 8.435465812683105, + "learning_rate": 6.172125531040154e-06, + "loss": 0.4028, + "step": 28683 + }, + { + "epoch": 1.1890667390124796, + "grad_norm": 6.1162309646606445, + "learning_rate": 6.171988488419899e-06, + "loss": 0.4144, + "step": 28684 + }, + { + "epoch": 1.1890803038524145, + "grad_norm": 7.735028266906738, + "learning_rate": 6.171851445799644e-06, + "loss": 0.3125, + "step": 28685 + }, + { + "epoch": 1.1890938686923493, + "grad_norm": 5.168452739715576, + "learning_rate": 6.171714403179389e-06, + "loss": 0.3727, + "step": 28686 + }, + { + "epoch": 1.1891074335322842, + "grad_norm": 6.597168922424316, + "learning_rate": 6.1715773605591344e-06, + "loss": 0.2694, + "step": 28687 + }, + { + "epoch": 1.1891209983722193, + "grad_norm": 6.187490940093994, + "learning_rate": 6.17144031793888e-06, + "loss": 0.2448, + "step": 28688 + }, + { + "epoch": 1.1891345632121542, + "grad_norm": 4.241481781005859, + "learning_rate": 6.171303275318624e-06, + "loss": 0.1769, + "step": 28689 + }, + { + "epoch": 1.189148128052089, + "grad_norm": 6.281609058380127, + "learning_rate": 6.17116623269837e-06, + "loss": 0.3227, + "step": 28690 + }, + { + "epoch": 1.189161692892024, + "grad_norm": 7.155547618865967, + "learning_rate": 6.171029190078115e-06, + "loss": 0.3723, + "step": 28691 + }, + { + "epoch": 1.1891752577319588, + "grad_norm": 6.260705947875977, + "learning_rate": 6.1708921474578595e-06, + "loss": 0.3966, + "step": 28692 + }, + { + "epoch": 1.1891888225718936, + "grad_norm": 6.629338264465332, + "learning_rate": 6.170755104837605e-06, + "loss": 0.3102, + "step": 28693 + }, + { + "epoch": 1.1892023874118285, + "grad_norm": 5.729138374328613, + "learning_rate": 6.170618062217351e-06, + "loss": 0.3673, + "step": 28694 + }, + { + "epoch": 1.1892159522517634, + "grad_norm": 6.967848777770996, + "learning_rate": 6.170481019597095e-06, + "loss": 0.3093, + "step": 28695 + }, + { + "epoch": 1.1892295170916984, + "grad_norm": 5.888991355895996, + "learning_rate": 6.17034397697684e-06, + "loss": 0.3217, + "step": 28696 + }, + { + "epoch": 1.1892430819316333, + "grad_norm": 5.413425922393799, + "learning_rate": 6.170206934356585e-06, + "loss": 0.2745, + "step": 28697 + }, + { + "epoch": 1.1892566467715682, + "grad_norm": 4.453486442565918, + "learning_rate": 6.17006989173633e-06, + "loss": 0.265, + "step": 28698 + }, + { + "epoch": 1.189270211611503, + "grad_norm": 5.295039653778076, + "learning_rate": 6.169932849116076e-06, + "loss": 0.211, + "step": 28699 + }, + { + "epoch": 1.189283776451438, + "grad_norm": 6.037790298461914, + "learning_rate": 6.169795806495821e-06, + "loss": 0.3564, + "step": 28700 + }, + { + "epoch": 1.1892973412913728, + "grad_norm": 4.215536117553711, + "learning_rate": 6.169658763875565e-06, + "loss": 0.3128, + "step": 28701 + }, + { + "epoch": 1.1893109061313076, + "grad_norm": 10.328190803527832, + "learning_rate": 6.1695217212553104e-06, + "loss": 0.3969, + "step": 28702 + }, + { + "epoch": 1.1893244709712425, + "grad_norm": 5.447604179382324, + "learning_rate": 6.1693846786350565e-06, + "loss": 0.2783, + "step": 28703 + }, + { + "epoch": 1.1893380358111774, + "grad_norm": 5.644315719604492, + "learning_rate": 6.169247636014802e-06, + "loss": 0.319, + "step": 28704 + }, + { + "epoch": 1.1893516006511122, + "grad_norm": 7.254546642303467, + "learning_rate": 6.169110593394546e-06, + "loss": 0.3952, + "step": 28705 + }, + { + "epoch": 1.189365165491047, + "grad_norm": 5.277140140533447, + "learning_rate": 6.168973550774291e-06, + "loss": 0.2172, + "step": 28706 + }, + { + "epoch": 1.1893787303309822, + "grad_norm": 6.842103004455566, + "learning_rate": 6.168836508154037e-06, + "loss": 0.3283, + "step": 28707 + }, + { + "epoch": 1.189392295170917, + "grad_norm": 5.085458278656006, + "learning_rate": 6.1686994655337815e-06, + "loss": 0.3219, + "step": 28708 + }, + { + "epoch": 1.189405860010852, + "grad_norm": 5.712339878082275, + "learning_rate": 6.168562422913527e-06, + "loss": 0.2197, + "step": 28709 + }, + { + "epoch": 1.1894194248507868, + "grad_norm": 5.91460657119751, + "learning_rate": 6.168425380293271e-06, + "loss": 0.2934, + "step": 28710 + }, + { + "epoch": 1.1894329896907216, + "grad_norm": 6.666463851928711, + "learning_rate": 6.168288337673016e-06, + "loss": 0.2702, + "step": 28711 + }, + { + "epoch": 1.1894465545306565, + "grad_norm": 3.1037368774414062, + "learning_rate": 6.168151295052762e-06, + "loss": 0.0881, + "step": 28712 + }, + { + "epoch": 1.1894601193705914, + "grad_norm": 5.72466516494751, + "learning_rate": 6.168014252432507e-06, + "loss": 0.2291, + "step": 28713 + }, + { + "epoch": 1.1894736842105262, + "grad_norm": 5.723904609680176, + "learning_rate": 6.167877209812252e-06, + "loss": 0.2568, + "step": 28714 + }, + { + "epoch": 1.1894872490504613, + "grad_norm": 3.938593864440918, + "learning_rate": 6.167740167191997e-06, + "loss": 0.2361, + "step": 28715 + }, + { + "epoch": 1.1895008138903962, + "grad_norm": 6.041304111480713, + "learning_rate": 6.167603124571743e-06, + "loss": 0.2598, + "step": 28716 + }, + { + "epoch": 1.189514378730331, + "grad_norm": 6.147245407104492, + "learning_rate": 6.167466081951487e-06, + "loss": 0.3679, + "step": 28717 + }, + { + "epoch": 1.189527943570266, + "grad_norm": 5.7978644371032715, + "learning_rate": 6.1673290393312325e-06, + "loss": 0.2687, + "step": 28718 + }, + { + "epoch": 1.1895415084102008, + "grad_norm": 4.65205717086792, + "learning_rate": 6.167191996710978e-06, + "loss": 0.3098, + "step": 28719 + }, + { + "epoch": 1.1895550732501357, + "grad_norm": 6.151169776916504, + "learning_rate": 6.167054954090723e-06, + "loss": 0.2236, + "step": 28720 + }, + { + "epoch": 1.1895686380900705, + "grad_norm": 5.79147481918335, + "learning_rate": 6.166917911470468e-06, + "loss": 0.3732, + "step": 28721 + }, + { + "epoch": 1.1895822029300054, + "grad_norm": 6.134860515594482, + "learning_rate": 6.166780868850213e-06, + "loss": 0.2875, + "step": 28722 + }, + { + "epoch": 1.1895957677699402, + "grad_norm": 4.895954132080078, + "learning_rate": 6.1666438262299575e-06, + "loss": 0.2482, + "step": 28723 + }, + { + "epoch": 1.1896093326098751, + "grad_norm": 5.040493011474609, + "learning_rate": 6.1665067836097035e-06, + "loss": 0.202, + "step": 28724 + }, + { + "epoch": 1.18962289744981, + "grad_norm": 6.900740623474121, + "learning_rate": 6.166369740989449e-06, + "loss": 0.3229, + "step": 28725 + }, + { + "epoch": 1.189636462289745, + "grad_norm": 4.212784290313721, + "learning_rate": 6.166232698369193e-06, + "loss": 0.1619, + "step": 28726 + }, + { + "epoch": 1.18965002712968, + "grad_norm": 7.640531063079834, + "learning_rate": 6.166095655748938e-06, + "loss": 0.3109, + "step": 28727 + }, + { + "epoch": 1.1896635919696148, + "grad_norm": 7.666197776794434, + "learning_rate": 6.165958613128683e-06, + "loss": 0.3107, + "step": 28728 + }, + { + "epoch": 1.1896771568095497, + "grad_norm": 4.374649524688721, + "learning_rate": 6.165821570508429e-06, + "loss": 0.1569, + "step": 28729 + }, + { + "epoch": 1.1896907216494845, + "grad_norm": 3.4587252140045166, + "learning_rate": 6.165684527888174e-06, + "loss": 0.1172, + "step": 28730 + }, + { + "epoch": 1.1897042864894194, + "grad_norm": 5.802964687347412, + "learning_rate": 6.165547485267919e-06, + "loss": 0.2276, + "step": 28731 + }, + { + "epoch": 1.1897178513293543, + "grad_norm": 4.638496398925781, + "learning_rate": 6.165410442647663e-06, + "loss": 0.261, + "step": 28732 + }, + { + "epoch": 1.1897314161692891, + "grad_norm": 5.600608825683594, + "learning_rate": 6.165273400027409e-06, + "loss": 0.2098, + "step": 28733 + }, + { + "epoch": 1.1897449810092242, + "grad_norm": 3.50107741355896, + "learning_rate": 6.1651363574071545e-06, + "loss": 0.1196, + "step": 28734 + }, + { + "epoch": 1.189758545849159, + "grad_norm": 4.712131023406982, + "learning_rate": 6.164999314786899e-06, + "loss": 0.1968, + "step": 28735 + }, + { + "epoch": 1.189772110689094, + "grad_norm": 5.499820232391357, + "learning_rate": 6.164862272166644e-06, + "loss": 0.2231, + "step": 28736 + }, + { + "epoch": 1.1897856755290288, + "grad_norm": 5.903183460235596, + "learning_rate": 6.16472522954639e-06, + "loss": 0.2496, + "step": 28737 + }, + { + "epoch": 1.1897992403689637, + "grad_norm": 6.173363208770752, + "learning_rate": 6.164588186926135e-06, + "loss": 0.2158, + "step": 28738 + }, + { + "epoch": 1.1898128052088985, + "grad_norm": 6.398208141326904, + "learning_rate": 6.1644511443058795e-06, + "loss": 0.3308, + "step": 28739 + }, + { + "epoch": 1.1898263700488334, + "grad_norm": 6.344508647918701, + "learning_rate": 6.164314101685625e-06, + "loss": 0.3513, + "step": 28740 + }, + { + "epoch": 1.1898399348887683, + "grad_norm": 4.9189910888671875, + "learning_rate": 6.164177059065369e-06, + "loss": 0.2573, + "step": 28741 + }, + { + "epoch": 1.1898534997287031, + "grad_norm": 5.112584590911865, + "learning_rate": 6.164040016445115e-06, + "loss": 0.211, + "step": 28742 + }, + { + "epoch": 1.189867064568638, + "grad_norm": 4.105168342590332, + "learning_rate": 6.16390297382486e-06, + "loss": 0.1319, + "step": 28743 + }, + { + "epoch": 1.1898806294085729, + "grad_norm": 4.627345561981201, + "learning_rate": 6.1637659312046046e-06, + "loss": 0.188, + "step": 28744 + }, + { + "epoch": 1.189894194248508, + "grad_norm": 5.300770282745361, + "learning_rate": 6.16362888858435e-06, + "loss": 0.2256, + "step": 28745 + }, + { + "epoch": 1.1899077590884428, + "grad_norm": 4.99685525894165, + "learning_rate": 6.163491845964096e-06, + "loss": 0.1761, + "step": 28746 + }, + { + "epoch": 1.1899213239283777, + "grad_norm": 5.871298313140869, + "learning_rate": 6.163354803343841e-06, + "loss": 0.2292, + "step": 28747 + }, + { + "epoch": 1.1899348887683125, + "grad_norm": 5.307163715362549, + "learning_rate": 6.163217760723585e-06, + "loss": 0.2657, + "step": 28748 + }, + { + "epoch": 1.1899484536082474, + "grad_norm": 6.332579612731934, + "learning_rate": 6.1630807181033305e-06, + "loss": 0.2295, + "step": 28749 + }, + { + "epoch": 1.1899620184481823, + "grad_norm": 5.775089263916016, + "learning_rate": 6.1629436754830765e-06, + "loss": 0.1627, + "step": 28750 + }, + { + "epoch": 1.1899755832881171, + "grad_norm": 4.836669445037842, + "learning_rate": 6.162806632862821e-06, + "loss": 0.2428, + "step": 28751 + }, + { + "epoch": 1.189989148128052, + "grad_norm": 6.187076568603516, + "learning_rate": 6.162669590242566e-06, + "loss": 0.2655, + "step": 28752 + }, + { + "epoch": 1.190002712967987, + "grad_norm": 5.090274333953857, + "learning_rate": 6.162532547622311e-06, + "loss": 0.1444, + "step": 28753 + }, + { + "epoch": 1.190016277807922, + "grad_norm": 4.700002193450928, + "learning_rate": 6.1623955050020555e-06, + "loss": 0.2363, + "step": 28754 + }, + { + "epoch": 1.1900298426478568, + "grad_norm": 4.486874580383301, + "learning_rate": 6.1622584623818015e-06, + "loss": 0.264, + "step": 28755 + }, + { + "epoch": 1.1900434074877917, + "grad_norm": 5.403441905975342, + "learning_rate": 6.162121419761547e-06, + "loss": 0.2581, + "step": 28756 + }, + { + "epoch": 1.1900569723277266, + "grad_norm": 5.295580863952637, + "learning_rate": 6.161984377141291e-06, + "loss": 0.1747, + "step": 28757 + }, + { + "epoch": 1.1900705371676614, + "grad_norm": 6.025216102600098, + "learning_rate": 6.161847334521036e-06, + "loss": 0.2815, + "step": 28758 + }, + { + "epoch": 1.1900841020075963, + "grad_norm": 4.274361610412598, + "learning_rate": 6.161710291900782e-06, + "loss": 0.1278, + "step": 28759 + }, + { + "epoch": 1.1900976668475312, + "grad_norm": 5.345637321472168, + "learning_rate": 6.1615732492805266e-06, + "loss": 0.2311, + "step": 28760 + }, + { + "epoch": 1.190111231687466, + "grad_norm": 3.8883490562438965, + "learning_rate": 6.161436206660272e-06, + "loss": 0.2308, + "step": 28761 + }, + { + "epoch": 1.1901247965274009, + "grad_norm": 5.888689041137695, + "learning_rate": 6.161299164040017e-06, + "loss": 0.4459, + "step": 28762 + }, + { + "epoch": 1.1901383613673358, + "grad_norm": 5.395107269287109, + "learning_rate": 6.161162121419763e-06, + "loss": 0.251, + "step": 28763 + }, + { + "epoch": 1.1901519262072708, + "grad_norm": 4.670958995819092, + "learning_rate": 6.161025078799507e-06, + "loss": 0.243, + "step": 28764 + }, + { + "epoch": 1.1901654910472057, + "grad_norm": 6.069555759429932, + "learning_rate": 6.1608880361792525e-06, + "loss": 0.1788, + "step": 28765 + }, + { + "epoch": 1.1901790558871406, + "grad_norm": 5.203128337860107, + "learning_rate": 6.160750993558997e-06, + "loss": 0.3325, + "step": 28766 + }, + { + "epoch": 1.1901926207270754, + "grad_norm": 5.258502960205078, + "learning_rate": 6.160613950938742e-06, + "loss": 0.2967, + "step": 28767 + }, + { + "epoch": 1.1902061855670103, + "grad_norm": 4.925634384155273, + "learning_rate": 6.160476908318488e-06, + "loss": 0.2251, + "step": 28768 + }, + { + "epoch": 1.1902197504069452, + "grad_norm": 5.302141189575195, + "learning_rate": 6.160339865698232e-06, + "loss": 0.2937, + "step": 28769 + }, + { + "epoch": 1.19023331524688, + "grad_norm": 5.339837074279785, + "learning_rate": 6.1602028230779775e-06, + "loss": 0.247, + "step": 28770 + }, + { + "epoch": 1.190246880086815, + "grad_norm": 5.806437969207764, + "learning_rate": 6.160065780457723e-06, + "loss": 0.2961, + "step": 28771 + }, + { + "epoch": 1.19026044492675, + "grad_norm": 4.034998416900635, + "learning_rate": 6.159928737837469e-06, + "loss": 0.2572, + "step": 28772 + }, + { + "epoch": 1.1902740097666848, + "grad_norm": 4.46354866027832, + "learning_rate": 6.159791695217213e-06, + "loss": 0.2591, + "step": 28773 + }, + { + "epoch": 1.1902875746066197, + "grad_norm": 5.225803852081299, + "learning_rate": 6.159654652596958e-06, + "loss": 0.2698, + "step": 28774 + }, + { + "epoch": 1.1903011394465546, + "grad_norm": 4.701201915740967, + "learning_rate": 6.1595176099767026e-06, + "loss": 0.3275, + "step": 28775 + }, + { + "epoch": 1.1903147042864894, + "grad_norm": 6.077515602111816, + "learning_rate": 6.159380567356449e-06, + "loss": 0.2263, + "step": 28776 + }, + { + "epoch": 1.1903282691264243, + "grad_norm": 5.174983024597168, + "learning_rate": 6.159243524736194e-06, + "loss": 0.2679, + "step": 28777 + }, + { + "epoch": 1.1903418339663592, + "grad_norm": 6.841537952423096, + "learning_rate": 6.159106482115939e-06, + "loss": 0.2824, + "step": 28778 + }, + { + "epoch": 1.190355398806294, + "grad_norm": 4.211066722869873, + "learning_rate": 6.158969439495683e-06, + "loss": 0.2498, + "step": 28779 + }, + { + "epoch": 1.190368963646229, + "grad_norm": 3.3633806705474854, + "learning_rate": 6.1588323968754285e-06, + "loss": 0.1613, + "step": 28780 + }, + { + "epoch": 1.1903825284861638, + "grad_norm": 5.756792068481445, + "learning_rate": 6.1586953542551745e-06, + "loss": 0.2503, + "step": 28781 + }, + { + "epoch": 1.1903960933260986, + "grad_norm": 4.106358051300049, + "learning_rate": 6.158558311634919e-06, + "loss": 0.2202, + "step": 28782 + }, + { + "epoch": 1.1904096581660337, + "grad_norm": 5.993332862854004, + "learning_rate": 6.158421269014664e-06, + "loss": 0.3052, + "step": 28783 + }, + { + "epoch": 1.1904232230059686, + "grad_norm": 5.858652591705322, + "learning_rate": 6.158284226394408e-06, + "loss": 0.2952, + "step": 28784 + }, + { + "epoch": 1.1904367878459035, + "grad_norm": 4.763323783874512, + "learning_rate": 6.158147183774154e-06, + "loss": 0.2562, + "step": 28785 + }, + { + "epoch": 1.1904503526858383, + "grad_norm": 5.299948692321777, + "learning_rate": 6.1580101411538995e-06, + "loss": 0.3116, + "step": 28786 + }, + { + "epoch": 1.1904639175257732, + "grad_norm": 4.128816604614258, + "learning_rate": 6.157873098533645e-06, + "loss": 0.1832, + "step": 28787 + }, + { + "epoch": 1.190477482365708, + "grad_norm": 5.934619903564453, + "learning_rate": 6.157736055913389e-06, + "loss": 0.2444, + "step": 28788 + }, + { + "epoch": 1.190491047205643, + "grad_norm": 7.037363529205322, + "learning_rate": 6.157599013293135e-06, + "loss": 0.4033, + "step": 28789 + }, + { + "epoch": 1.1905046120455778, + "grad_norm": 5.406345367431641, + "learning_rate": 6.15746197067288e-06, + "loss": 0.2935, + "step": 28790 + }, + { + "epoch": 1.1905181768855129, + "grad_norm": 7.014753818511963, + "learning_rate": 6.157324928052625e-06, + "loss": 0.308, + "step": 28791 + }, + { + "epoch": 1.1905317417254477, + "grad_norm": 5.744534015655518, + "learning_rate": 6.15718788543237e-06, + "loss": 0.2877, + "step": 28792 + }, + { + "epoch": 1.1905453065653826, + "grad_norm": 6.295556545257568, + "learning_rate": 6.157050842812116e-06, + "loss": 0.3917, + "step": 28793 + }, + { + "epoch": 1.1905588714053175, + "grad_norm": 6.315849304199219, + "learning_rate": 6.15691380019186e-06, + "loss": 0.275, + "step": 28794 + }, + { + "epoch": 1.1905724362452523, + "grad_norm": 5.479904651641846, + "learning_rate": 6.156776757571605e-06, + "loss": 0.3168, + "step": 28795 + }, + { + "epoch": 1.1905860010851872, + "grad_norm": 4.5417799949646, + "learning_rate": 6.1566397149513505e-06, + "loss": 0.2347, + "step": 28796 + }, + { + "epoch": 1.190599565925122, + "grad_norm": 3.910324811935425, + "learning_rate": 6.156502672331095e-06, + "loss": 0.1951, + "step": 28797 + }, + { + "epoch": 1.190613130765057, + "grad_norm": 5.020628929138184, + "learning_rate": 6.156365629710841e-06, + "loss": 0.1482, + "step": 28798 + }, + { + "epoch": 1.1906266956049918, + "grad_norm": 6.26593542098999, + "learning_rate": 6.156228587090586e-06, + "loss": 0.3914, + "step": 28799 + }, + { + "epoch": 1.1906402604449267, + "grad_norm": 6.592925071716309, + "learning_rate": 6.15609154447033e-06, + "loss": 0.2535, + "step": 28800 + }, + { + "epoch": 1.1906538252848615, + "grad_norm": 5.096033573150635, + "learning_rate": 6.1559545018500755e-06, + "loss": 0.3148, + "step": 28801 + }, + { + "epoch": 1.1906673901247966, + "grad_norm": 5.459571838378906, + "learning_rate": 6.1558174592298215e-06, + "loss": 0.3824, + "step": 28802 + }, + { + "epoch": 1.1906809549647315, + "grad_norm": 4.437593936920166, + "learning_rate": 6.155680416609566e-06, + "loss": 0.2982, + "step": 28803 + }, + { + "epoch": 1.1906945198046663, + "grad_norm": 3.941476821899414, + "learning_rate": 6.155543373989311e-06, + "loss": 0.1739, + "step": 28804 + }, + { + "epoch": 1.1907080846446012, + "grad_norm": 4.01726770401001, + "learning_rate": 6.155406331369056e-06, + "loss": 0.2117, + "step": 28805 + }, + { + "epoch": 1.190721649484536, + "grad_norm": 7.446018695831299, + "learning_rate": 6.155269288748802e-06, + "loss": 0.394, + "step": 28806 + }, + { + "epoch": 1.190735214324471, + "grad_norm": 5.847209453582764, + "learning_rate": 6.155132246128547e-06, + "loss": 0.2296, + "step": 28807 + }, + { + "epoch": 1.1907487791644058, + "grad_norm": 5.497203826904297, + "learning_rate": 6.154995203508292e-06, + "loss": 0.3549, + "step": 28808 + }, + { + "epoch": 1.1907623440043407, + "grad_norm": 5.278249740600586, + "learning_rate": 6.154858160888036e-06, + "loss": 0.2893, + "step": 28809 + }, + { + "epoch": 1.1907759088442758, + "grad_norm": 5.857492923736572, + "learning_rate": 6.154721118267781e-06, + "loss": 0.3911, + "step": 28810 + }, + { + "epoch": 1.1907894736842106, + "grad_norm": 5.729299545288086, + "learning_rate": 6.154584075647527e-06, + "loss": 0.2968, + "step": 28811 + }, + { + "epoch": 1.1908030385241455, + "grad_norm": 3.8854165077209473, + "learning_rate": 6.1544470330272725e-06, + "loss": 0.2346, + "step": 28812 + }, + { + "epoch": 1.1908166033640804, + "grad_norm": 4.5161237716674805, + "learning_rate": 6.154309990407017e-06, + "loss": 0.2247, + "step": 28813 + }, + { + "epoch": 1.1908301682040152, + "grad_norm": 5.507789134979248, + "learning_rate": 6.154172947786762e-06, + "loss": 0.3865, + "step": 28814 + }, + { + "epoch": 1.19084373304395, + "grad_norm": 4.4302287101745605, + "learning_rate": 6.154035905166508e-06, + "loss": 0.1728, + "step": 28815 + }, + { + "epoch": 1.190857297883885, + "grad_norm": 5.818873882293701, + "learning_rate": 6.153898862546252e-06, + "loss": 0.2357, + "step": 28816 + }, + { + "epoch": 1.1908708627238198, + "grad_norm": 4.158796310424805, + "learning_rate": 6.1537618199259975e-06, + "loss": 0.1849, + "step": 28817 + }, + { + "epoch": 1.1908844275637547, + "grad_norm": 5.178306579589844, + "learning_rate": 6.153624777305742e-06, + "loss": 0.3014, + "step": 28818 + }, + { + "epoch": 1.1908979924036895, + "grad_norm": 6.230889320373535, + "learning_rate": 6.153487734685488e-06, + "loss": 0.257, + "step": 28819 + }, + { + "epoch": 1.1909115572436244, + "grad_norm": 6.5704545974731445, + "learning_rate": 6.153350692065233e-06, + "loss": 0.233, + "step": 28820 + }, + { + "epoch": 1.1909251220835595, + "grad_norm": 3.389627456665039, + "learning_rate": 6.153213649444978e-06, + "loss": 0.1407, + "step": 28821 + }, + { + "epoch": 1.1909386869234944, + "grad_norm": 5.219782829284668, + "learning_rate": 6.153076606824723e-06, + "loss": 0.2183, + "step": 28822 + }, + { + "epoch": 1.1909522517634292, + "grad_norm": 4.800002574920654, + "learning_rate": 6.152939564204468e-06, + "loss": 0.1702, + "step": 28823 + }, + { + "epoch": 1.190965816603364, + "grad_norm": 4.991024494171143, + "learning_rate": 6.152802521584214e-06, + "loss": 0.3809, + "step": 28824 + }, + { + "epoch": 1.190979381443299, + "grad_norm": 6.32275915145874, + "learning_rate": 6.152665478963958e-06, + "loss": 0.3768, + "step": 28825 + }, + { + "epoch": 1.1909929462832338, + "grad_norm": 6.140286445617676, + "learning_rate": 6.152528436343703e-06, + "loss": 0.223, + "step": 28826 + }, + { + "epoch": 1.1910065111231687, + "grad_norm": 5.790253639221191, + "learning_rate": 6.1523913937234485e-06, + "loss": 0.2601, + "step": 28827 + }, + { + "epoch": 1.1910200759631036, + "grad_norm": 4.653525352478027, + "learning_rate": 6.152254351103194e-06, + "loss": 0.1514, + "step": 28828 + }, + { + "epoch": 1.1910336408030386, + "grad_norm": 5.403162956237793, + "learning_rate": 6.152117308482939e-06, + "loss": 0.2768, + "step": 28829 + }, + { + "epoch": 1.1910472056429735, + "grad_norm": 4.232517242431641, + "learning_rate": 6.151980265862684e-06, + "loss": 0.1645, + "step": 28830 + }, + { + "epoch": 1.1910607704829084, + "grad_norm": 5.807642936706543, + "learning_rate": 6.151843223242428e-06, + "loss": 0.1743, + "step": 28831 + }, + { + "epoch": 1.1910743353228432, + "grad_norm": 3.4403798580169678, + "learning_rate": 6.151706180622174e-06, + "loss": 0.1395, + "step": 28832 + }, + { + "epoch": 1.191087900162778, + "grad_norm": 3.9564993381500244, + "learning_rate": 6.1515691380019196e-06, + "loss": 0.1743, + "step": 28833 + }, + { + "epoch": 1.191101465002713, + "grad_norm": 5.43894624710083, + "learning_rate": 6.151432095381664e-06, + "loss": 0.2773, + "step": 28834 + }, + { + "epoch": 1.1911150298426478, + "grad_norm": 4.118300437927246, + "learning_rate": 6.151295052761409e-06, + "loss": 0.2067, + "step": 28835 + }, + { + "epoch": 1.1911285946825827, + "grad_norm": 5.731271266937256, + "learning_rate": 6.151158010141154e-06, + "loss": 0.2229, + "step": 28836 + }, + { + "epoch": 1.1911421595225176, + "grad_norm": 5.995532035827637, + "learning_rate": 6.151020967520899e-06, + "loss": 0.2199, + "step": 28837 + }, + { + "epoch": 1.1911557243624524, + "grad_norm": 6.413832187652588, + "learning_rate": 6.150883924900645e-06, + "loss": 0.3591, + "step": 28838 + }, + { + "epoch": 1.1911692892023873, + "grad_norm": 4.834931373596191, + "learning_rate": 6.15074688228039e-06, + "loss": 0.2534, + "step": 28839 + }, + { + "epoch": 1.1911828540423224, + "grad_norm": 5.5523600578308105, + "learning_rate": 6.150609839660134e-06, + "loss": 0.2199, + "step": 28840 + }, + { + "epoch": 1.1911964188822572, + "grad_norm": 5.307421684265137, + "learning_rate": 6.15047279703988e-06, + "loss": 0.1726, + "step": 28841 + }, + { + "epoch": 1.1912099837221921, + "grad_norm": 5.1027727127075195, + "learning_rate": 6.150335754419625e-06, + "loss": 0.1686, + "step": 28842 + }, + { + "epoch": 1.191223548562127, + "grad_norm": 6.076318264007568, + "learning_rate": 6.15019871179937e-06, + "loss": 0.2345, + "step": 28843 + }, + { + "epoch": 1.1912371134020618, + "grad_norm": 5.208882808685303, + "learning_rate": 6.150061669179115e-06, + "loss": 0.2518, + "step": 28844 + }, + { + "epoch": 1.1912506782419967, + "grad_norm": 5.937344074249268, + "learning_rate": 6.149924626558861e-06, + "loss": 0.235, + "step": 28845 + }, + { + "epoch": 1.1912642430819316, + "grad_norm": 5.886946201324463, + "learning_rate": 6.149787583938606e-06, + "loss": 0.2752, + "step": 28846 + }, + { + "epoch": 1.1912778079218664, + "grad_norm": 5.6078596115112305, + "learning_rate": 6.14965054131835e-06, + "loss": 0.206, + "step": 28847 + }, + { + "epoch": 1.1912913727618015, + "grad_norm": 4.337634563446045, + "learning_rate": 6.1495134986980955e-06, + "loss": 0.1809, + "step": 28848 + }, + { + "epoch": 1.1913049376017364, + "grad_norm": 6.095215797424316, + "learning_rate": 6.14937645607784e-06, + "loss": 0.2648, + "step": 28849 + }, + { + "epoch": 1.1913185024416713, + "grad_norm": 5.654581069946289, + "learning_rate": 6.149239413457586e-06, + "loss": 0.1854, + "step": 28850 + }, + { + "epoch": 1.1913320672816061, + "grad_norm": 7.31391716003418, + "learning_rate": 6.149102370837331e-06, + "loss": 0.385, + "step": 28851 + }, + { + "epoch": 1.191345632121541, + "grad_norm": 5.392666339874268, + "learning_rate": 6.148965328217075e-06, + "loss": 0.2666, + "step": 28852 + }, + { + "epoch": 1.1913591969614759, + "grad_norm": 4.445328235626221, + "learning_rate": 6.148828285596821e-06, + "loss": 0.2201, + "step": 28853 + }, + { + "epoch": 1.1913727618014107, + "grad_norm": 5.876420497894287, + "learning_rate": 6.148691242976567e-06, + "loss": 0.2191, + "step": 28854 + }, + { + "epoch": 1.1913863266413456, + "grad_norm": 4.590814590454102, + "learning_rate": 6.148554200356312e-06, + "loss": 0.1885, + "step": 28855 + }, + { + "epoch": 1.1913998914812804, + "grad_norm": 5.331863880157471, + "learning_rate": 6.148417157736056e-06, + "loss": 0.1846, + "step": 28856 + }, + { + "epoch": 1.1914134563212153, + "grad_norm": 4.524691104888916, + "learning_rate": 6.148280115115801e-06, + "loss": 0.2213, + "step": 28857 + }, + { + "epoch": 1.1914270211611502, + "grad_norm": 5.920454025268555, + "learning_rate": 6.148143072495547e-06, + "loss": 0.261, + "step": 28858 + }, + { + "epoch": 1.1914405860010853, + "grad_norm": 6.658926486968994, + "learning_rate": 6.148006029875292e-06, + "loss": 0.3399, + "step": 28859 + }, + { + "epoch": 1.1914541508410201, + "grad_norm": 8.876755714416504, + "learning_rate": 6.147868987255037e-06, + "loss": 0.3259, + "step": 28860 + }, + { + "epoch": 1.191467715680955, + "grad_norm": 6.181185722351074, + "learning_rate": 6.147731944634782e-06, + "loss": 0.2776, + "step": 28861 + }, + { + "epoch": 1.1914812805208899, + "grad_norm": 7.60447359085083, + "learning_rate": 6.147594902014526e-06, + "loss": 0.4043, + "step": 28862 + }, + { + "epoch": 1.1914948453608247, + "grad_norm": 5.7088236808776855, + "learning_rate": 6.147457859394272e-06, + "loss": 0.2918, + "step": 28863 + }, + { + "epoch": 1.1915084102007596, + "grad_norm": 4.994305610656738, + "learning_rate": 6.1473208167740176e-06, + "loss": 0.2579, + "step": 28864 + }, + { + "epoch": 1.1915219750406945, + "grad_norm": 5.652531623840332, + "learning_rate": 6.147183774153762e-06, + "loss": 0.3207, + "step": 28865 + }, + { + "epoch": 1.1915355398806293, + "grad_norm": 4.644229888916016, + "learning_rate": 6.147046731533507e-06, + "loss": 0.2271, + "step": 28866 + }, + { + "epoch": 1.1915491047205644, + "grad_norm": 3.934870719909668, + "learning_rate": 6.146909688913253e-06, + "loss": 0.2826, + "step": 28867 + }, + { + "epoch": 1.1915626695604993, + "grad_norm": 6.672372341156006, + "learning_rate": 6.1467726462929974e-06, + "loss": 0.3164, + "step": 28868 + }, + { + "epoch": 1.1915762344004341, + "grad_norm": 6.168491840362549, + "learning_rate": 6.146635603672743e-06, + "loss": 0.2328, + "step": 28869 + }, + { + "epoch": 1.191589799240369, + "grad_norm": 4.485607147216797, + "learning_rate": 6.146498561052488e-06, + "loss": 0.214, + "step": 28870 + }, + { + "epoch": 1.1916033640803039, + "grad_norm": 5.74706506729126, + "learning_rate": 6.146361518432234e-06, + "loss": 0.2747, + "step": 28871 + }, + { + "epoch": 1.1916169289202387, + "grad_norm": 4.996333599090576, + "learning_rate": 6.146224475811978e-06, + "loss": 0.2666, + "step": 28872 + }, + { + "epoch": 1.1916304937601736, + "grad_norm": 6.339268684387207, + "learning_rate": 6.146087433191723e-06, + "loss": 0.3915, + "step": 28873 + }, + { + "epoch": 1.1916440586001085, + "grad_norm": 5.323904991149902, + "learning_rate": 6.145950390571468e-06, + "loss": 0.2442, + "step": 28874 + }, + { + "epoch": 1.1916576234400433, + "grad_norm": 5.767351150512695, + "learning_rate": 6.145813347951214e-06, + "loss": 0.2982, + "step": 28875 + }, + { + "epoch": 1.1916711882799782, + "grad_norm": 6.934707164764404, + "learning_rate": 6.145676305330959e-06, + "loss": 0.4298, + "step": 28876 + }, + { + "epoch": 1.191684753119913, + "grad_norm": 5.108276844024658, + "learning_rate": 6.145539262710703e-06, + "loss": 0.3262, + "step": 28877 + }, + { + "epoch": 1.1916983179598482, + "grad_norm": 6.204653263092041, + "learning_rate": 6.145402220090448e-06, + "loss": 0.3997, + "step": 28878 + }, + { + "epoch": 1.191711882799783, + "grad_norm": 4.790892601013184, + "learning_rate": 6.1452651774701935e-06, + "loss": 0.2698, + "step": 28879 + }, + { + "epoch": 1.1917254476397179, + "grad_norm": 4.380870342254639, + "learning_rate": 6.1451281348499396e-06, + "loss": 0.2331, + "step": 28880 + }, + { + "epoch": 1.1917390124796527, + "grad_norm": 6.477596282958984, + "learning_rate": 6.144991092229684e-06, + "loss": 0.2772, + "step": 28881 + }, + { + "epoch": 1.1917525773195876, + "grad_norm": 6.817770004272461, + "learning_rate": 6.144854049609429e-06, + "loss": 0.2759, + "step": 28882 + }, + { + "epoch": 1.1917661421595225, + "grad_norm": 4.688663959503174, + "learning_rate": 6.144717006989173e-06, + "loss": 0.2495, + "step": 28883 + }, + { + "epoch": 1.1917797069994573, + "grad_norm": 6.440329074859619, + "learning_rate": 6.1445799643689194e-06, + "loss": 0.2648, + "step": 28884 + }, + { + "epoch": 1.1917932718393922, + "grad_norm": 5.968048095703125, + "learning_rate": 6.144442921748665e-06, + "loss": 0.3123, + "step": 28885 + }, + { + "epoch": 1.1918068366793273, + "grad_norm": 4.0633134841918945, + "learning_rate": 6.144305879128409e-06, + "loss": 0.1289, + "step": 28886 + }, + { + "epoch": 1.1918204015192622, + "grad_norm": 5.680691242218018, + "learning_rate": 6.144168836508154e-06, + "loss": 0.2693, + "step": 28887 + }, + { + "epoch": 1.191833966359197, + "grad_norm": 4.82562255859375, + "learning_rate": 6.1440317938879e-06, + "loss": 0.165, + "step": 28888 + }, + { + "epoch": 1.191847531199132, + "grad_norm": 5.244272708892822, + "learning_rate": 6.143894751267645e-06, + "loss": 0.3004, + "step": 28889 + }, + { + "epoch": 1.1918610960390668, + "grad_norm": 5.957800388336182, + "learning_rate": 6.14375770864739e-06, + "loss": 0.3496, + "step": 28890 + }, + { + "epoch": 1.1918746608790016, + "grad_norm": 4.369710445404053, + "learning_rate": 6.143620666027135e-06, + "loss": 0.2202, + "step": 28891 + }, + { + "epoch": 1.1918882257189365, + "grad_norm": 5.520050048828125, + "learning_rate": 6.143483623406879e-06, + "loss": 0.2344, + "step": 28892 + }, + { + "epoch": 1.1919017905588714, + "grad_norm": 4.641012191772461, + "learning_rate": 6.143346580786625e-06, + "loss": 0.2316, + "step": 28893 + }, + { + "epoch": 1.1919153553988062, + "grad_norm": 5.818510055541992, + "learning_rate": 6.14320953816637e-06, + "loss": 0.2304, + "step": 28894 + }, + { + "epoch": 1.191928920238741, + "grad_norm": 4.328723907470703, + "learning_rate": 6.1430724955461156e-06, + "loss": 0.2681, + "step": 28895 + }, + { + "epoch": 1.191942485078676, + "grad_norm": 4.547598361968994, + "learning_rate": 6.14293545292586e-06, + "loss": 0.1864, + "step": 28896 + }, + { + "epoch": 1.191956049918611, + "grad_norm": 4.013489246368408, + "learning_rate": 6.142798410305606e-06, + "loss": 0.2139, + "step": 28897 + }, + { + "epoch": 1.191969614758546, + "grad_norm": 7.854605674743652, + "learning_rate": 6.142661367685351e-06, + "loss": 0.3237, + "step": 28898 + }, + { + "epoch": 1.1919831795984808, + "grad_norm": 4.97974157333374, + "learning_rate": 6.1425243250650954e-06, + "loss": 0.2647, + "step": 28899 + }, + { + "epoch": 1.1919967444384156, + "grad_norm": 5.906293869018555, + "learning_rate": 6.142387282444841e-06, + "loss": 0.2568, + "step": 28900 + }, + { + "epoch": 1.1920103092783505, + "grad_norm": 4.348628520965576, + "learning_rate": 6.142250239824587e-06, + "loss": 0.2064, + "step": 28901 + }, + { + "epoch": 1.1920238741182854, + "grad_norm": 5.59250545501709, + "learning_rate": 6.142113197204331e-06, + "loss": 0.2355, + "step": 28902 + }, + { + "epoch": 1.1920374389582202, + "grad_norm": 4.148808479309082, + "learning_rate": 6.141976154584076e-06, + "loss": 0.2142, + "step": 28903 + }, + { + "epoch": 1.192051003798155, + "grad_norm": 3.7562410831451416, + "learning_rate": 6.141839111963821e-06, + "loss": 0.2326, + "step": 28904 + }, + { + "epoch": 1.1920645686380902, + "grad_norm": 5.939432144165039, + "learning_rate": 6.141702069343566e-06, + "loss": 0.2676, + "step": 28905 + }, + { + "epoch": 1.192078133478025, + "grad_norm": 4.6792216300964355, + "learning_rate": 6.141565026723312e-06, + "loss": 0.2303, + "step": 28906 + }, + { + "epoch": 1.19209169831796, + "grad_norm": 5.061247825622559, + "learning_rate": 6.141427984103057e-06, + "loss": 0.2129, + "step": 28907 + }, + { + "epoch": 1.1921052631578948, + "grad_norm": 5.238762855529785, + "learning_rate": 6.141290941482801e-06, + "loss": 0.3433, + "step": 28908 + }, + { + "epoch": 1.1921188279978296, + "grad_norm": 4.979771137237549, + "learning_rate": 6.141153898862546e-06, + "loss": 0.2765, + "step": 28909 + }, + { + "epoch": 1.1921323928377645, + "grad_norm": 4.877206325531006, + "learning_rate": 6.141016856242292e-06, + "loss": 0.28, + "step": 28910 + }, + { + "epoch": 1.1921459576776994, + "grad_norm": 4.369208335876465, + "learning_rate": 6.140879813622037e-06, + "loss": 0.2122, + "step": 28911 + }, + { + "epoch": 1.1921595225176342, + "grad_norm": 4.772840976715088, + "learning_rate": 6.140742771001782e-06, + "loss": 0.2662, + "step": 28912 + }, + { + "epoch": 1.192173087357569, + "grad_norm": 3.8507602214813232, + "learning_rate": 6.140605728381527e-06, + "loss": 0.1955, + "step": 28913 + }, + { + "epoch": 1.192186652197504, + "grad_norm": 4.1000447273254395, + "learning_rate": 6.140468685761273e-06, + "loss": 0.201, + "step": 28914 + }, + { + "epoch": 1.1922002170374388, + "grad_norm": 5.4740447998046875, + "learning_rate": 6.1403316431410174e-06, + "loss": 0.2438, + "step": 28915 + }, + { + "epoch": 1.192213781877374, + "grad_norm": 4.284925937652588, + "learning_rate": 6.140194600520763e-06, + "loss": 0.2534, + "step": 28916 + }, + { + "epoch": 1.1922273467173088, + "grad_norm": 4.599987983703613, + "learning_rate": 6.140057557900507e-06, + "loss": 0.1866, + "step": 28917 + }, + { + "epoch": 1.1922409115572437, + "grad_norm": 4.8943095207214355, + "learning_rate": 6.139920515280252e-06, + "loss": 0.2249, + "step": 28918 + }, + { + "epoch": 1.1922544763971785, + "grad_norm": 3.751934289932251, + "learning_rate": 6.139783472659998e-06, + "loss": 0.1608, + "step": 28919 + }, + { + "epoch": 1.1922680412371134, + "grad_norm": 6.461059093475342, + "learning_rate": 6.139646430039743e-06, + "loss": 0.2496, + "step": 28920 + }, + { + "epoch": 1.1922816060770483, + "grad_norm": 4.601910591125488, + "learning_rate": 6.139509387419488e-06, + "loss": 0.2215, + "step": 28921 + }, + { + "epoch": 1.1922951709169831, + "grad_norm": 5.878423690795898, + "learning_rate": 6.139372344799233e-06, + "loss": 0.2374, + "step": 28922 + }, + { + "epoch": 1.192308735756918, + "grad_norm": 4.627579212188721, + "learning_rate": 6.139235302178979e-06, + "loss": 0.154, + "step": 28923 + }, + { + "epoch": 1.192322300596853, + "grad_norm": 4.998785972595215, + "learning_rate": 6.139098259558723e-06, + "loss": 0.1903, + "step": 28924 + }, + { + "epoch": 1.192335865436788, + "grad_norm": 5.4736175537109375, + "learning_rate": 6.138961216938468e-06, + "loss": 0.2753, + "step": 28925 + }, + { + "epoch": 1.1923494302767228, + "grad_norm": 4.601052761077881, + "learning_rate": 6.138824174318213e-06, + "loss": 0.231, + "step": 28926 + }, + { + "epoch": 1.1923629951166577, + "grad_norm": 4.698885917663574, + "learning_rate": 6.138687131697959e-06, + "loss": 0.1719, + "step": 28927 + }, + { + "epoch": 1.1923765599565925, + "grad_norm": 3.925996780395508, + "learning_rate": 6.138550089077704e-06, + "loss": 0.1831, + "step": 28928 + }, + { + "epoch": 1.1923901247965274, + "grad_norm": 4.734602928161621, + "learning_rate": 6.138413046457449e-06, + "loss": 0.2471, + "step": 28929 + }, + { + "epoch": 1.1924036896364623, + "grad_norm": 5.7130513191223145, + "learning_rate": 6.1382760038371934e-06, + "loss": 0.2914, + "step": 28930 + }, + { + "epoch": 1.1924172544763971, + "grad_norm": 5.41434383392334, + "learning_rate": 6.138138961216939e-06, + "loss": 0.1586, + "step": 28931 + }, + { + "epoch": 1.192430819316332, + "grad_norm": 6.228926181793213, + "learning_rate": 6.138001918596685e-06, + "loss": 0.3155, + "step": 28932 + }, + { + "epoch": 1.1924443841562669, + "grad_norm": 4.482178211212158, + "learning_rate": 6.137864875976429e-06, + "loss": 0.2648, + "step": 28933 + }, + { + "epoch": 1.1924579489962017, + "grad_norm": 5.865010738372803, + "learning_rate": 6.137727833356174e-06, + "loss": 0.2164, + "step": 28934 + }, + { + "epoch": 1.1924715138361368, + "grad_norm": 5.179736614227295, + "learning_rate": 6.1375907907359185e-06, + "loss": 0.1717, + "step": 28935 + }, + { + "epoch": 1.1924850786760717, + "grad_norm": 4.629314422607422, + "learning_rate": 6.1374537481156645e-06, + "loss": 0.2025, + "step": 28936 + }, + { + "epoch": 1.1924986435160065, + "grad_norm": 4.8802490234375, + "learning_rate": 6.13731670549541e-06, + "loss": 0.2066, + "step": 28937 + }, + { + "epoch": 1.1925122083559414, + "grad_norm": 6.529094219207764, + "learning_rate": 6.137179662875155e-06, + "loss": 0.2135, + "step": 28938 + }, + { + "epoch": 1.1925257731958763, + "grad_norm": 4.695415496826172, + "learning_rate": 6.137042620254899e-06, + "loss": 0.1829, + "step": 28939 + }, + { + "epoch": 1.1925393380358111, + "grad_norm": 3.88242769241333, + "learning_rate": 6.136905577634645e-06, + "loss": 0.1515, + "step": 28940 + }, + { + "epoch": 1.192552902875746, + "grad_norm": 4.806201457977295, + "learning_rate": 6.13676853501439e-06, + "loss": 0.3211, + "step": 28941 + }, + { + "epoch": 1.1925664677156809, + "grad_norm": 5.0420823097229, + "learning_rate": 6.136631492394135e-06, + "loss": 0.1584, + "step": 28942 + }, + { + "epoch": 1.192580032555616, + "grad_norm": 5.971208095550537, + "learning_rate": 6.13649444977388e-06, + "loss": 0.2313, + "step": 28943 + }, + { + "epoch": 1.1925935973955508, + "grad_norm": 6.64444637298584, + "learning_rate": 6.136357407153626e-06, + "loss": 0.266, + "step": 28944 + }, + { + "epoch": 1.1926071622354857, + "grad_norm": 4.1221771240234375, + "learning_rate": 6.13622036453337e-06, + "loss": 0.0833, + "step": 28945 + }, + { + "epoch": 1.1926207270754206, + "grad_norm": 5.366844177246094, + "learning_rate": 6.1360833219131154e-06, + "loss": 0.2124, + "step": 28946 + }, + { + "epoch": 1.1926342919153554, + "grad_norm": 5.427058696746826, + "learning_rate": 6.135946279292861e-06, + "loss": 0.1997, + "step": 28947 + }, + { + "epoch": 1.1926478567552903, + "grad_norm": 5.1125407218933105, + "learning_rate": 6.135809236672605e-06, + "loss": 0.1569, + "step": 28948 + }, + { + "epoch": 1.1926614215952251, + "grad_norm": 5.633157730102539, + "learning_rate": 6.135672194052351e-06, + "loss": 0.1725, + "step": 28949 + }, + { + "epoch": 1.19267498643516, + "grad_norm": 5.245049476623535, + "learning_rate": 6.135535151432096e-06, + "loss": 0.1668, + "step": 28950 + }, + { + "epoch": 1.1926885512750949, + "grad_norm": 4.8030009269714355, + "learning_rate": 6.1353981088118405e-06, + "loss": 0.3167, + "step": 28951 + }, + { + "epoch": 1.1927021161150297, + "grad_norm": 4.547921180725098, + "learning_rate": 6.135261066191586e-06, + "loss": 0.2444, + "step": 28952 + }, + { + "epoch": 1.1927156809549646, + "grad_norm": 7.02052116394043, + "learning_rate": 6.135124023571332e-06, + "loss": 0.2198, + "step": 28953 + }, + { + "epoch": 1.1927292457948997, + "grad_norm": 4.2458367347717285, + "learning_rate": 6.134986980951077e-06, + "loss": 0.2627, + "step": 28954 + }, + { + "epoch": 1.1927428106348346, + "grad_norm": 3.8175384998321533, + "learning_rate": 6.134849938330821e-06, + "loss": 0.1526, + "step": 28955 + }, + { + "epoch": 1.1927563754747694, + "grad_norm": 4.281859874725342, + "learning_rate": 6.134712895710566e-06, + "loss": 0.1473, + "step": 28956 + }, + { + "epoch": 1.1927699403147043, + "grad_norm": 5.45219087600708, + "learning_rate": 6.134575853090312e-06, + "loss": 0.2451, + "step": 28957 + }, + { + "epoch": 1.1927835051546392, + "grad_norm": 5.825793266296387, + "learning_rate": 6.134438810470057e-06, + "loss": 0.2374, + "step": 28958 + }, + { + "epoch": 1.192797069994574, + "grad_norm": 5.622601509094238, + "learning_rate": 6.134301767849802e-06, + "loss": 0.1916, + "step": 28959 + }, + { + "epoch": 1.1928106348345089, + "grad_norm": 4.694264888763428, + "learning_rate": 6.134164725229546e-06, + "loss": 0.1517, + "step": 28960 + }, + { + "epoch": 1.1928241996744438, + "grad_norm": 4.8016252517700195, + "learning_rate": 6.1340276826092914e-06, + "loss": 0.1617, + "step": 28961 + }, + { + "epoch": 1.1928377645143788, + "grad_norm": 4.343303680419922, + "learning_rate": 6.1338906399890375e-06, + "loss": 0.1503, + "step": 28962 + }, + { + "epoch": 1.1928513293543137, + "grad_norm": 4.917147159576416, + "learning_rate": 6.133753597368783e-06, + "loss": 0.1543, + "step": 28963 + }, + { + "epoch": 1.1928648941942486, + "grad_norm": 7.302289009094238, + "learning_rate": 6.133616554748527e-06, + "loss": 0.3206, + "step": 28964 + }, + { + "epoch": 1.1928784590341834, + "grad_norm": 5.942391872406006, + "learning_rate": 6.133479512128272e-06, + "loss": 0.2047, + "step": 28965 + }, + { + "epoch": 1.1928920238741183, + "grad_norm": 4.858501434326172, + "learning_rate": 6.133342469508018e-06, + "loss": 0.195, + "step": 28966 + }, + { + "epoch": 1.1929055887140532, + "grad_norm": 6.732553005218506, + "learning_rate": 6.1332054268877625e-06, + "loss": 0.1991, + "step": 28967 + }, + { + "epoch": 1.192919153553988, + "grad_norm": 4.415129661560059, + "learning_rate": 6.133068384267508e-06, + "loss": 0.242, + "step": 28968 + }, + { + "epoch": 1.192932718393923, + "grad_norm": 5.283682823181152, + "learning_rate": 6.132931341647253e-06, + "loss": 0.2217, + "step": 28969 + }, + { + "epoch": 1.1929462832338578, + "grad_norm": 4.175083160400391, + "learning_rate": 6.132794299026998e-06, + "loss": 0.2426, + "step": 28970 + }, + { + "epoch": 1.1929598480737926, + "grad_norm": 6.6274871826171875, + "learning_rate": 6.132657256406743e-06, + "loss": 0.3314, + "step": 28971 + }, + { + "epoch": 1.1929734129137275, + "grad_norm": 5.121971607208252, + "learning_rate": 6.132520213786488e-06, + "loss": 0.2088, + "step": 28972 + }, + { + "epoch": 1.1929869777536626, + "grad_norm": 5.613219261169434, + "learning_rate": 6.132383171166233e-06, + "loss": 0.187, + "step": 28973 + }, + { + "epoch": 1.1930005425935974, + "grad_norm": 6.508055210113525, + "learning_rate": 6.132246128545978e-06, + "loss": 0.2537, + "step": 28974 + }, + { + "epoch": 1.1930141074335323, + "grad_norm": 4.759511947631836, + "learning_rate": 6.132109085925724e-06, + "loss": 0.1579, + "step": 28975 + }, + { + "epoch": 1.1930276722734672, + "grad_norm": 4.361155986785889, + "learning_rate": 6.131972043305468e-06, + "loss": 0.1351, + "step": 28976 + }, + { + "epoch": 1.193041237113402, + "grad_norm": 5.664331912994385, + "learning_rate": 6.1318350006852135e-06, + "loss": 0.1638, + "step": 28977 + }, + { + "epoch": 1.193054801953337, + "grad_norm": 4.487883567810059, + "learning_rate": 6.131697958064959e-06, + "loss": 0.1732, + "step": 28978 + }, + { + "epoch": 1.1930683667932718, + "grad_norm": 4.408966541290283, + "learning_rate": 6.131560915444704e-06, + "loss": 0.1876, + "step": 28979 + }, + { + "epoch": 1.1930819316332066, + "grad_norm": 4.557348251342773, + "learning_rate": 6.131423872824449e-06, + "loss": 0.2071, + "step": 28980 + }, + { + "epoch": 1.1930954964731417, + "grad_norm": 8.322650909423828, + "learning_rate": 6.131286830204194e-06, + "loss": 0.4631, + "step": 28981 + }, + { + "epoch": 1.1931090613130766, + "grad_norm": 6.189538478851318, + "learning_rate": 6.1311497875839385e-06, + "loss": 0.3084, + "step": 28982 + }, + { + "epoch": 1.1931226261530115, + "grad_norm": 5.402520179748535, + "learning_rate": 6.1310127449636845e-06, + "loss": 0.3105, + "step": 28983 + }, + { + "epoch": 1.1931361909929463, + "grad_norm": 4.848709583282471, + "learning_rate": 6.13087570234343e-06, + "loss": 0.2342, + "step": 28984 + }, + { + "epoch": 1.1931497558328812, + "grad_norm": 5.786057472229004, + "learning_rate": 6.130738659723174e-06, + "loss": 0.3149, + "step": 28985 + }, + { + "epoch": 1.193163320672816, + "grad_norm": 5.273150444030762, + "learning_rate": 6.130601617102919e-06, + "loss": 0.1677, + "step": 28986 + }, + { + "epoch": 1.193176885512751, + "grad_norm": 4.682434558868408, + "learning_rate": 6.130464574482664e-06, + "loss": 0.2582, + "step": 28987 + }, + { + "epoch": 1.1931904503526858, + "grad_norm": 6.010147571563721, + "learning_rate": 6.13032753186241e-06, + "loss": 0.288, + "step": 28988 + }, + { + "epoch": 1.1932040151926206, + "grad_norm": 8.89334774017334, + "learning_rate": 6.130190489242155e-06, + "loss": 0.4153, + "step": 28989 + }, + { + "epoch": 1.1932175800325555, + "grad_norm": 5.696282863616943, + "learning_rate": 6.1300534466219e-06, + "loss": 0.3982, + "step": 28990 + }, + { + "epoch": 1.1932311448724906, + "grad_norm": 6.387485980987549, + "learning_rate": 6.129916404001644e-06, + "loss": 0.2954, + "step": 28991 + }, + { + "epoch": 1.1932447097124255, + "grad_norm": 5.326652526855469, + "learning_rate": 6.12977936138139e-06, + "loss": 0.1784, + "step": 28992 + }, + { + "epoch": 1.1932582745523603, + "grad_norm": 6.110314846038818, + "learning_rate": 6.1296423187611355e-06, + "loss": 0.3226, + "step": 28993 + }, + { + "epoch": 1.1932718393922952, + "grad_norm": 5.490855693817139, + "learning_rate": 6.12950527614088e-06, + "loss": 0.2593, + "step": 28994 + }, + { + "epoch": 1.19328540423223, + "grad_norm": 4.482979774475098, + "learning_rate": 6.129368233520625e-06, + "loss": 0.1934, + "step": 28995 + }, + { + "epoch": 1.193298969072165, + "grad_norm": 6.206386089324951, + "learning_rate": 6.129231190900371e-06, + "loss": 0.3191, + "step": 28996 + }, + { + "epoch": 1.1933125339120998, + "grad_norm": 6.6321120262146, + "learning_rate": 6.129094148280116e-06, + "loss": 0.3295, + "step": 28997 + }, + { + "epoch": 1.1933260987520347, + "grad_norm": 6.6372833251953125, + "learning_rate": 6.1289571056598605e-06, + "loss": 0.3232, + "step": 28998 + }, + { + "epoch": 1.1933396635919695, + "grad_norm": 5.757080554962158, + "learning_rate": 6.128820063039606e-06, + "loss": 0.3345, + "step": 28999 + }, + { + "epoch": 1.1933532284319046, + "grad_norm": 6.386155605316162, + "learning_rate": 6.12868302041935e-06, + "loss": 0.2592, + "step": 29000 + }, + { + "epoch": 1.1933667932718395, + "grad_norm": 6.032389163970947, + "learning_rate": 6.128545977799096e-06, + "loss": 0.2791, + "step": 29001 + }, + { + "epoch": 1.1933803581117743, + "grad_norm": 7.087648868560791, + "learning_rate": 6.128408935178841e-06, + "loss": 0.3515, + "step": 29002 + }, + { + "epoch": 1.1933939229517092, + "grad_norm": 4.418646812438965, + "learning_rate": 6.128271892558586e-06, + "loss": 0.2162, + "step": 29003 + }, + { + "epoch": 1.193407487791644, + "grad_norm": 7.276529312133789, + "learning_rate": 6.128134849938331e-06, + "loss": 0.3124, + "step": 29004 + }, + { + "epoch": 1.193421052631579, + "grad_norm": 8.828003883361816, + "learning_rate": 6.127997807318077e-06, + "loss": 0.3101, + "step": 29005 + }, + { + "epoch": 1.1934346174715138, + "grad_norm": 6.140713691711426, + "learning_rate": 6.127860764697822e-06, + "loss": 0.2385, + "step": 29006 + }, + { + "epoch": 1.1934481823114487, + "grad_norm": 6.4368577003479, + "learning_rate": 6.127723722077566e-06, + "loss": 0.2656, + "step": 29007 + }, + { + "epoch": 1.1934617471513835, + "grad_norm": 7.493887901306152, + "learning_rate": 6.1275866794573115e-06, + "loss": 0.2554, + "step": 29008 + }, + { + "epoch": 1.1934753119913184, + "grad_norm": 5.981090545654297, + "learning_rate": 6.1274496368370575e-06, + "loss": 0.2444, + "step": 29009 + }, + { + "epoch": 1.1934888768312535, + "grad_norm": 6.263730525970459, + "learning_rate": 6.127312594216802e-06, + "loss": 0.3767, + "step": 29010 + }, + { + "epoch": 1.1935024416711884, + "grad_norm": 6.057425498962402, + "learning_rate": 6.127175551596547e-06, + "loss": 0.2714, + "step": 29011 + }, + { + "epoch": 1.1935160065111232, + "grad_norm": 5.404515743255615, + "learning_rate": 6.127038508976292e-06, + "loss": 0.2193, + "step": 29012 + }, + { + "epoch": 1.193529571351058, + "grad_norm": 8.864517211914062, + "learning_rate": 6.126901466356038e-06, + "loss": 0.5383, + "step": 29013 + }, + { + "epoch": 1.193543136190993, + "grad_norm": 6.364664554595947, + "learning_rate": 6.1267644237357825e-06, + "loss": 0.2712, + "step": 29014 + }, + { + "epoch": 1.1935567010309278, + "grad_norm": 4.2063093185424805, + "learning_rate": 6.126627381115528e-06, + "loss": 0.1928, + "step": 29015 + }, + { + "epoch": 1.1935702658708627, + "grad_norm": 4.03534460067749, + "learning_rate": 6.126490338495272e-06, + "loss": 0.1468, + "step": 29016 + }, + { + "epoch": 1.1935838307107975, + "grad_norm": 6.135470867156982, + "learning_rate": 6.126353295875017e-06, + "loss": 0.1894, + "step": 29017 + }, + { + "epoch": 1.1935973955507324, + "grad_norm": 4.900063514709473, + "learning_rate": 6.126216253254763e-06, + "loss": 0.167, + "step": 29018 + }, + { + "epoch": 1.1936109603906675, + "grad_norm": 5.748156547546387, + "learning_rate": 6.126079210634508e-06, + "loss": 0.2018, + "step": 29019 + }, + { + "epoch": 1.1936245252306024, + "grad_norm": 4.227119445800781, + "learning_rate": 6.125942168014253e-06, + "loss": 0.2251, + "step": 29020 + }, + { + "epoch": 1.1936380900705372, + "grad_norm": 4.326196193695068, + "learning_rate": 6.125805125393998e-06, + "loss": 0.1762, + "step": 29021 + }, + { + "epoch": 1.193651654910472, + "grad_norm": 5.245850563049316, + "learning_rate": 6.125668082773744e-06, + "loss": 0.2284, + "step": 29022 + }, + { + "epoch": 1.193665219750407, + "grad_norm": 5.464799880981445, + "learning_rate": 6.125531040153488e-06, + "loss": 0.2169, + "step": 29023 + }, + { + "epoch": 1.1936787845903418, + "grad_norm": 3.2910146713256836, + "learning_rate": 6.1253939975332335e-06, + "loss": 0.0884, + "step": 29024 + }, + { + "epoch": 1.1936923494302767, + "grad_norm": 4.466131210327148, + "learning_rate": 6.125256954912978e-06, + "loss": 0.1587, + "step": 29025 + }, + { + "epoch": 1.1937059142702116, + "grad_norm": 6.263229846954346, + "learning_rate": 6.125119912292724e-06, + "loss": 0.2835, + "step": 29026 + }, + { + "epoch": 1.1937194791101464, + "grad_norm": 5.810818672180176, + "learning_rate": 6.124982869672469e-06, + "loss": 0.2896, + "step": 29027 + }, + { + "epoch": 1.1937330439500813, + "grad_norm": 2.703803777694702, + "learning_rate": 6.124845827052213e-06, + "loss": 0.0867, + "step": 29028 + }, + { + "epoch": 1.1937466087900164, + "grad_norm": 6.138451099395752, + "learning_rate": 6.1247087844319585e-06, + "loss": 0.3721, + "step": 29029 + }, + { + "epoch": 1.1937601736299512, + "grad_norm": 5.587165355682373, + "learning_rate": 6.124571741811704e-06, + "loss": 0.1575, + "step": 29030 + }, + { + "epoch": 1.193773738469886, + "grad_norm": 5.389708995819092, + "learning_rate": 6.12443469919145e-06, + "loss": 0.2531, + "step": 29031 + }, + { + "epoch": 1.193787303309821, + "grad_norm": 5.5196099281311035, + "learning_rate": 6.124297656571194e-06, + "loss": 0.2502, + "step": 29032 + }, + { + "epoch": 1.1938008681497558, + "grad_norm": 3.5849404335021973, + "learning_rate": 6.124160613950939e-06, + "loss": 0.1584, + "step": 29033 + }, + { + "epoch": 1.1938144329896907, + "grad_norm": 4.985674858093262, + "learning_rate": 6.1240235713306836e-06, + "loss": 0.2377, + "step": 29034 + }, + { + "epoch": 1.1938279978296256, + "grad_norm": 4.888916969299316, + "learning_rate": 6.12388652871043e-06, + "loss": 0.1747, + "step": 29035 + }, + { + "epoch": 1.1938415626695604, + "grad_norm": 5.294637680053711, + "learning_rate": 6.123749486090175e-06, + "loss": 0.265, + "step": 29036 + }, + { + "epoch": 1.1938551275094953, + "grad_norm": 4.44131326675415, + "learning_rate": 6.12361244346992e-06, + "loss": 0.2358, + "step": 29037 + }, + { + "epoch": 1.1938686923494304, + "grad_norm": 6.836719989776611, + "learning_rate": 6.123475400849664e-06, + "loss": 0.5099, + "step": 29038 + }, + { + "epoch": 1.1938822571893652, + "grad_norm": 5.547824382781982, + "learning_rate": 6.12333835822941e-06, + "loss": 0.2654, + "step": 29039 + }, + { + "epoch": 1.1938958220293001, + "grad_norm": 4.2987565994262695, + "learning_rate": 6.1232013156091555e-06, + "loss": 0.1702, + "step": 29040 + }, + { + "epoch": 1.193909386869235, + "grad_norm": 3.4361932277679443, + "learning_rate": 6.1230642729889e-06, + "loss": 0.208, + "step": 29041 + }, + { + "epoch": 1.1939229517091698, + "grad_norm": 3.910654306411743, + "learning_rate": 6.122927230368645e-06, + "loss": 0.2957, + "step": 29042 + }, + { + "epoch": 1.1939365165491047, + "grad_norm": 5.124967575073242, + "learning_rate": 6.122790187748389e-06, + "loss": 0.2285, + "step": 29043 + }, + { + "epoch": 1.1939500813890396, + "grad_norm": 6.1961989402771, + "learning_rate": 6.122653145128135e-06, + "loss": 0.3048, + "step": 29044 + }, + { + "epoch": 1.1939636462289744, + "grad_norm": 5.892141342163086, + "learning_rate": 6.1225161025078805e-06, + "loss": 0.2166, + "step": 29045 + }, + { + "epoch": 1.1939772110689093, + "grad_norm": 5.728940963745117, + "learning_rate": 6.122379059887626e-06, + "loss": 0.3501, + "step": 29046 + }, + { + "epoch": 1.1939907759088442, + "grad_norm": 4.249785900115967, + "learning_rate": 6.12224201726737e-06, + "loss": 0.1635, + "step": 29047 + }, + { + "epoch": 1.1940043407487793, + "grad_norm": 5.328415870666504, + "learning_rate": 6.122104974647116e-06, + "loss": 0.185, + "step": 29048 + }, + { + "epoch": 1.1940179055887141, + "grad_norm": 6.3109002113342285, + "learning_rate": 6.121967932026861e-06, + "loss": 0.3213, + "step": 29049 + }, + { + "epoch": 1.194031470428649, + "grad_norm": 5.201697826385498, + "learning_rate": 6.121830889406606e-06, + "loss": 0.2233, + "step": 29050 + }, + { + "epoch": 1.1940450352685839, + "grad_norm": 6.009829044342041, + "learning_rate": 6.121693846786351e-06, + "loss": 0.2167, + "step": 29051 + }, + { + "epoch": 1.1940586001085187, + "grad_norm": 5.25322961807251, + "learning_rate": 6.121556804166097e-06, + "loss": 0.2822, + "step": 29052 + }, + { + "epoch": 1.1940721649484536, + "grad_norm": 3.0586845874786377, + "learning_rate": 6.121419761545841e-06, + "loss": 0.1228, + "step": 29053 + }, + { + "epoch": 1.1940857297883885, + "grad_norm": 3.5860061645507812, + "learning_rate": 6.121282718925586e-06, + "loss": 0.1384, + "step": 29054 + }, + { + "epoch": 1.1940992946283233, + "grad_norm": 4.340599536895752, + "learning_rate": 6.1211456763053315e-06, + "loss": 0.1678, + "step": 29055 + }, + { + "epoch": 1.1941128594682582, + "grad_norm": 4.401878833770752, + "learning_rate": 6.121008633685076e-06, + "loss": 0.1976, + "step": 29056 + }, + { + "epoch": 1.1941264243081933, + "grad_norm": 4.462413787841797, + "learning_rate": 6.120871591064822e-06, + "loss": 0.2463, + "step": 29057 + }, + { + "epoch": 1.1941399891481281, + "grad_norm": 4.445476055145264, + "learning_rate": 6.120734548444567e-06, + "loss": 0.2265, + "step": 29058 + }, + { + "epoch": 1.194153553988063, + "grad_norm": 5.3956685066223145, + "learning_rate": 6.120597505824311e-06, + "loss": 0.2219, + "step": 29059 + }, + { + "epoch": 1.1941671188279979, + "grad_norm": 4.287038326263428, + "learning_rate": 6.1204604632040565e-06, + "loss": 0.2232, + "step": 29060 + }, + { + "epoch": 1.1941806836679327, + "grad_norm": 3.772594451904297, + "learning_rate": 6.1203234205838025e-06, + "loss": 0.1663, + "step": 29061 + }, + { + "epoch": 1.1941942485078676, + "grad_norm": 6.870316982269287, + "learning_rate": 6.120186377963548e-06, + "loss": 0.296, + "step": 29062 + }, + { + "epoch": 1.1942078133478025, + "grad_norm": 3.14390230178833, + "learning_rate": 6.120049335343292e-06, + "loss": 0.1687, + "step": 29063 + }, + { + "epoch": 1.1942213781877373, + "grad_norm": 6.70708703994751, + "learning_rate": 6.119912292723037e-06, + "loss": 0.2447, + "step": 29064 + }, + { + "epoch": 1.1942349430276722, + "grad_norm": 4.621710777282715, + "learning_rate": 6.119775250102783e-06, + "loss": 0.2664, + "step": 29065 + }, + { + "epoch": 1.194248507867607, + "grad_norm": 5.8078508377075195, + "learning_rate": 6.119638207482528e-06, + "loss": 0.2887, + "step": 29066 + }, + { + "epoch": 1.1942620727075421, + "grad_norm": 5.802167892456055, + "learning_rate": 6.119501164862273e-06, + "loss": 0.2154, + "step": 29067 + }, + { + "epoch": 1.194275637547477, + "grad_norm": 4.327387809753418, + "learning_rate": 6.119364122242017e-06, + "loss": 0.2116, + "step": 29068 + }, + { + "epoch": 1.1942892023874119, + "grad_norm": 6.713402271270752, + "learning_rate": 6.119227079621762e-06, + "loss": 0.3208, + "step": 29069 + }, + { + "epoch": 1.1943027672273467, + "grad_norm": 5.606779098510742, + "learning_rate": 6.119090037001508e-06, + "loss": 0.2414, + "step": 29070 + }, + { + "epoch": 1.1943163320672816, + "grad_norm": 6.247385501861572, + "learning_rate": 6.1189529943812535e-06, + "loss": 0.303, + "step": 29071 + }, + { + "epoch": 1.1943298969072165, + "grad_norm": 6.710386276245117, + "learning_rate": 6.118815951760998e-06, + "loss": 0.3753, + "step": 29072 + }, + { + "epoch": 1.1943434617471513, + "grad_norm": 3.3963677883148193, + "learning_rate": 6.118678909140743e-06, + "loss": 0.1482, + "step": 29073 + }, + { + "epoch": 1.1943570265870862, + "grad_norm": 7.854901313781738, + "learning_rate": 6.118541866520489e-06, + "loss": 0.3782, + "step": 29074 + }, + { + "epoch": 1.194370591427021, + "grad_norm": 3.6394901275634766, + "learning_rate": 6.118404823900233e-06, + "loss": 0.1455, + "step": 29075 + }, + { + "epoch": 1.1943841562669562, + "grad_norm": 4.9490532875061035, + "learning_rate": 6.1182677812799785e-06, + "loss": 0.222, + "step": 29076 + }, + { + "epoch": 1.194397721106891, + "grad_norm": 4.457577705383301, + "learning_rate": 6.118130738659723e-06, + "loss": 0.196, + "step": 29077 + }, + { + "epoch": 1.1944112859468259, + "grad_norm": 7.864894866943359, + "learning_rate": 6.117993696039469e-06, + "loss": 0.3691, + "step": 29078 + }, + { + "epoch": 1.1944248507867608, + "grad_norm": 7.57906436920166, + "learning_rate": 6.117856653419214e-06, + "loss": 0.2636, + "step": 29079 + }, + { + "epoch": 1.1944384156266956, + "grad_norm": 5.1966423988342285, + "learning_rate": 6.117719610798959e-06, + "loss": 0.1917, + "step": 29080 + }, + { + "epoch": 1.1944519804666305, + "grad_norm": 4.90685510635376, + "learning_rate": 6.117582568178704e-06, + "loss": 0.1734, + "step": 29081 + }, + { + "epoch": 1.1944655453065653, + "grad_norm": 4.360256671905518, + "learning_rate": 6.11744552555845e-06, + "loss": 0.2055, + "step": 29082 + }, + { + "epoch": 1.1944791101465002, + "grad_norm": 4.456181526184082, + "learning_rate": 6.117308482938195e-06, + "loss": 0.2019, + "step": 29083 + }, + { + "epoch": 1.194492674986435, + "grad_norm": 5.740438938140869, + "learning_rate": 6.117171440317939e-06, + "loss": 0.2674, + "step": 29084 + }, + { + "epoch": 1.19450623982637, + "grad_norm": 4.805330753326416, + "learning_rate": 6.117034397697684e-06, + "loss": 0.232, + "step": 29085 + }, + { + "epoch": 1.194519804666305, + "grad_norm": 4.711282253265381, + "learning_rate": 6.1168973550774295e-06, + "loss": 0.1302, + "step": 29086 + }, + { + "epoch": 1.19453336950624, + "grad_norm": 5.59427547454834, + "learning_rate": 6.116760312457175e-06, + "loss": 0.2399, + "step": 29087 + }, + { + "epoch": 1.1945469343461748, + "grad_norm": 5.1096320152282715, + "learning_rate": 6.11662326983692e-06, + "loss": 0.253, + "step": 29088 + }, + { + "epoch": 1.1945604991861096, + "grad_norm": 3.865811824798584, + "learning_rate": 6.116486227216665e-06, + "loss": 0.2145, + "step": 29089 + }, + { + "epoch": 1.1945740640260445, + "grad_norm": 4.759492874145508, + "learning_rate": 6.116349184596409e-06, + "loss": 0.222, + "step": 29090 + }, + { + "epoch": 1.1945876288659794, + "grad_norm": 4.853538513183594, + "learning_rate": 6.116212141976155e-06, + "loss": 0.1502, + "step": 29091 + }, + { + "epoch": 1.1946011937059142, + "grad_norm": 7.3465576171875, + "learning_rate": 6.1160750993559006e-06, + "loss": 0.3143, + "step": 29092 + }, + { + "epoch": 1.194614758545849, + "grad_norm": 6.548439979553223, + "learning_rate": 6.115938056735645e-06, + "loss": 0.324, + "step": 29093 + }, + { + "epoch": 1.194628323385784, + "grad_norm": 4.976332187652588, + "learning_rate": 6.11580101411539e-06, + "loss": 0.2383, + "step": 29094 + }, + { + "epoch": 1.194641888225719, + "grad_norm": 6.1133341789245605, + "learning_rate": 6.115663971495136e-06, + "loss": 0.2128, + "step": 29095 + }, + { + "epoch": 1.194655453065654, + "grad_norm": 6.1652727127075195, + "learning_rate": 6.115526928874881e-06, + "loss": 0.271, + "step": 29096 + }, + { + "epoch": 1.1946690179055888, + "grad_norm": 4.994441986083984, + "learning_rate": 6.115389886254626e-06, + "loss": 0.2203, + "step": 29097 + }, + { + "epoch": 1.1946825827455236, + "grad_norm": 6.4080729484558105, + "learning_rate": 6.115252843634371e-06, + "loss": 0.2785, + "step": 29098 + }, + { + "epoch": 1.1946961475854585, + "grad_norm": 5.7927680015563965, + "learning_rate": 6.115115801014115e-06, + "loss": 0.3542, + "step": 29099 + }, + { + "epoch": 1.1947097124253934, + "grad_norm": 3.2178585529327393, + "learning_rate": 6.114978758393861e-06, + "loss": 0.1142, + "step": 29100 + }, + { + "epoch": 1.1947232772653282, + "grad_norm": 3.3767051696777344, + "learning_rate": 6.114841715773606e-06, + "loss": 0.1983, + "step": 29101 + }, + { + "epoch": 1.194736842105263, + "grad_norm": 4.091147422790527, + "learning_rate": 6.114704673153351e-06, + "loss": 0.1799, + "step": 29102 + }, + { + "epoch": 1.194750406945198, + "grad_norm": 7.311334609985352, + "learning_rate": 6.114567630533096e-06, + "loss": 0.3269, + "step": 29103 + }, + { + "epoch": 1.1947639717851328, + "grad_norm": 4.79865026473999, + "learning_rate": 6.114430587912842e-06, + "loss": 0.2075, + "step": 29104 + }, + { + "epoch": 1.194777536625068, + "grad_norm": 3.658376693725586, + "learning_rate": 6.114293545292587e-06, + "loss": 0.1654, + "step": 29105 + }, + { + "epoch": 1.1947911014650028, + "grad_norm": 5.194636821746826, + "learning_rate": 6.114156502672331e-06, + "loss": 0.1928, + "step": 29106 + }, + { + "epoch": 1.1948046663049376, + "grad_norm": 4.487626552581787, + "learning_rate": 6.1140194600520765e-06, + "loss": 0.2278, + "step": 29107 + }, + { + "epoch": 1.1948182311448725, + "grad_norm": 5.526892185211182, + "learning_rate": 6.1138824174318226e-06, + "loss": 0.1911, + "step": 29108 + }, + { + "epoch": 1.1948317959848074, + "grad_norm": 5.6831889152526855, + "learning_rate": 6.113745374811567e-06, + "loss": 0.2099, + "step": 29109 + }, + { + "epoch": 1.1948453608247422, + "grad_norm": 5.661334991455078, + "learning_rate": 6.113608332191312e-06, + "loss": 0.2095, + "step": 29110 + }, + { + "epoch": 1.194858925664677, + "grad_norm": 5.241004467010498, + "learning_rate": 6.113471289571057e-06, + "loss": 0.2049, + "step": 29111 + }, + { + "epoch": 1.194872490504612, + "grad_norm": 4.554258823394775, + "learning_rate": 6.113334246950802e-06, + "loss": 0.1753, + "step": 29112 + }, + { + "epoch": 1.1948860553445468, + "grad_norm": 4.3927388191223145, + "learning_rate": 6.113197204330548e-06, + "loss": 0.155, + "step": 29113 + }, + { + "epoch": 1.194899620184482, + "grad_norm": 4.956627368927002, + "learning_rate": 6.113060161710293e-06, + "loss": 0.2063, + "step": 29114 + }, + { + "epoch": 1.1949131850244168, + "grad_norm": 4.170094013214111, + "learning_rate": 6.112923119090037e-06, + "loss": 0.2443, + "step": 29115 + }, + { + "epoch": 1.1949267498643517, + "grad_norm": 4.464968681335449, + "learning_rate": 6.112786076469782e-06, + "loss": 0.2667, + "step": 29116 + }, + { + "epoch": 1.1949403147042865, + "grad_norm": 6.377896308898926, + "learning_rate": 6.112649033849528e-06, + "loss": 0.2869, + "step": 29117 + }, + { + "epoch": 1.1949538795442214, + "grad_norm": 4.3497796058654785, + "learning_rate": 6.112511991229273e-06, + "loss": 0.1633, + "step": 29118 + }, + { + "epoch": 1.1949674443841563, + "grad_norm": 6.3892741203308105, + "learning_rate": 6.112374948609018e-06, + "loss": 0.2448, + "step": 29119 + }, + { + "epoch": 1.1949810092240911, + "grad_norm": 4.011580944061279, + "learning_rate": 6.112237905988763e-06, + "loss": 0.202, + "step": 29120 + }, + { + "epoch": 1.194994574064026, + "grad_norm": 4.306175231933594, + "learning_rate": 6.112100863368508e-06, + "loss": 0.2402, + "step": 29121 + }, + { + "epoch": 1.1950081389039608, + "grad_norm": 4.652029514312744, + "learning_rate": 6.111963820748253e-06, + "loss": 0.2083, + "step": 29122 + }, + { + "epoch": 1.1950217037438957, + "grad_norm": 4.734904766082764, + "learning_rate": 6.1118267781279986e-06, + "loss": 0.2419, + "step": 29123 + }, + { + "epoch": 1.1950352685838308, + "grad_norm": 7.5212907791137695, + "learning_rate": 6.111689735507743e-06, + "loss": 0.3595, + "step": 29124 + }, + { + "epoch": 1.1950488334237657, + "grad_norm": 5.593688488006592, + "learning_rate": 6.111552692887488e-06, + "loss": 0.2683, + "step": 29125 + }, + { + "epoch": 1.1950623982637005, + "grad_norm": 7.671948432922363, + "learning_rate": 6.111415650267234e-06, + "loss": 0.3277, + "step": 29126 + }, + { + "epoch": 1.1950759631036354, + "grad_norm": 4.353267192840576, + "learning_rate": 6.1112786076469784e-06, + "loss": 0.1524, + "step": 29127 + }, + { + "epoch": 1.1950895279435703, + "grad_norm": 5.204283237457275, + "learning_rate": 6.111141565026724e-06, + "loss": 0.231, + "step": 29128 + }, + { + "epoch": 1.1951030927835051, + "grad_norm": 7.633509159088135, + "learning_rate": 6.111004522406469e-06, + "loss": 0.2792, + "step": 29129 + }, + { + "epoch": 1.19511665762344, + "grad_norm": 7.534749507904053, + "learning_rate": 6.110867479786215e-06, + "loss": 0.3029, + "step": 29130 + }, + { + "epoch": 1.1951302224633749, + "grad_norm": 6.05913782119751, + "learning_rate": 6.110730437165959e-06, + "loss": 0.2079, + "step": 29131 + }, + { + "epoch": 1.19514378730331, + "grad_norm": 6.477247714996338, + "learning_rate": 6.110593394545704e-06, + "loss": 0.274, + "step": 29132 + }, + { + "epoch": 1.1951573521432448, + "grad_norm": 5.418745994567871, + "learning_rate": 6.110456351925449e-06, + "loss": 0.3027, + "step": 29133 + }, + { + "epoch": 1.1951709169831797, + "grad_norm": 5.8746747970581055, + "learning_rate": 6.110319309305195e-06, + "loss": 0.2857, + "step": 29134 + }, + { + "epoch": 1.1951844818231145, + "grad_norm": 5.612428188323975, + "learning_rate": 6.11018226668494e-06, + "loss": 0.2115, + "step": 29135 + }, + { + "epoch": 1.1951980466630494, + "grad_norm": 6.773231029510498, + "learning_rate": 6.110045224064684e-06, + "loss": 0.3102, + "step": 29136 + }, + { + "epoch": 1.1952116115029843, + "grad_norm": 6.462397575378418, + "learning_rate": 6.109908181444429e-06, + "loss": 0.3236, + "step": 29137 + }, + { + "epoch": 1.1952251763429191, + "grad_norm": 5.984546184539795, + "learning_rate": 6.1097711388241745e-06, + "loss": 0.2894, + "step": 29138 + }, + { + "epoch": 1.195238741182854, + "grad_norm": 5.227367877960205, + "learning_rate": 6.1096340962039206e-06, + "loss": 0.2431, + "step": 29139 + }, + { + "epoch": 1.1952523060227889, + "grad_norm": 3.9186391830444336, + "learning_rate": 6.109497053583665e-06, + "loss": 0.2239, + "step": 29140 + }, + { + "epoch": 1.1952658708627237, + "grad_norm": 5.519989490509033, + "learning_rate": 6.10936001096341e-06, + "loss": 0.1492, + "step": 29141 + }, + { + "epoch": 1.1952794357026586, + "grad_norm": 5.870161056518555, + "learning_rate": 6.109222968343154e-06, + "loss": 0.2474, + "step": 29142 + }, + { + "epoch": 1.1952930005425937, + "grad_norm": 5.974062442779541, + "learning_rate": 6.1090859257229004e-06, + "loss": 0.3116, + "step": 29143 + }, + { + "epoch": 1.1953065653825286, + "grad_norm": 5.68678617477417, + "learning_rate": 6.108948883102646e-06, + "loss": 0.3273, + "step": 29144 + }, + { + "epoch": 1.1953201302224634, + "grad_norm": 6.457253932952881, + "learning_rate": 6.108811840482391e-06, + "loss": 0.2331, + "step": 29145 + }, + { + "epoch": 1.1953336950623983, + "grad_norm": 7.377357006072998, + "learning_rate": 6.108674797862135e-06, + "loss": 0.2768, + "step": 29146 + }, + { + "epoch": 1.1953472599023331, + "grad_norm": 5.65972375869751, + "learning_rate": 6.108537755241881e-06, + "loss": 0.2219, + "step": 29147 + }, + { + "epoch": 1.195360824742268, + "grad_norm": 5.653735637664795, + "learning_rate": 6.108400712621626e-06, + "loss": 0.2147, + "step": 29148 + }, + { + "epoch": 1.1953743895822029, + "grad_norm": 7.090018272399902, + "learning_rate": 6.108263670001371e-06, + "loss": 0.4368, + "step": 29149 + }, + { + "epoch": 1.1953879544221377, + "grad_norm": 5.825945854187012, + "learning_rate": 6.108126627381116e-06, + "loss": 0.3379, + "step": 29150 + }, + { + "epoch": 1.1954015192620728, + "grad_norm": 6.500669002532959, + "learning_rate": 6.107989584760862e-06, + "loss": 0.3099, + "step": 29151 + }, + { + "epoch": 1.1954150841020077, + "grad_norm": 7.4767022132873535, + "learning_rate": 6.107852542140606e-06, + "loss": 0.368, + "step": 29152 + }, + { + "epoch": 1.1954286489419426, + "grad_norm": 4.925818920135498, + "learning_rate": 6.107715499520351e-06, + "loss": 0.1447, + "step": 29153 + }, + { + "epoch": 1.1954422137818774, + "grad_norm": 6.093046188354492, + "learning_rate": 6.1075784569000966e-06, + "loss": 0.3898, + "step": 29154 + }, + { + "epoch": 1.1954557786218123, + "grad_norm": 5.028097152709961, + "learning_rate": 6.107441414279841e-06, + "loss": 0.2681, + "step": 29155 + }, + { + "epoch": 1.1954693434617472, + "grad_norm": 5.508334159851074, + "learning_rate": 6.107304371659587e-06, + "loss": 0.3559, + "step": 29156 + }, + { + "epoch": 1.195482908301682, + "grad_norm": 6.757209777832031, + "learning_rate": 6.107167329039332e-06, + "loss": 0.3612, + "step": 29157 + }, + { + "epoch": 1.1954964731416169, + "grad_norm": 5.66149377822876, + "learning_rate": 6.1070302864190764e-06, + "loss": 0.2614, + "step": 29158 + }, + { + "epoch": 1.1955100379815518, + "grad_norm": 4.714325428009033, + "learning_rate": 6.106893243798822e-06, + "loss": 0.1735, + "step": 29159 + }, + { + "epoch": 1.1955236028214866, + "grad_norm": 5.631072044372559, + "learning_rate": 6.106756201178568e-06, + "loss": 0.3532, + "step": 29160 + }, + { + "epoch": 1.1955371676614215, + "grad_norm": 5.934891700744629, + "learning_rate": 6.106619158558312e-06, + "loss": 0.3704, + "step": 29161 + }, + { + "epoch": 1.1955507325013566, + "grad_norm": 5.223016738891602, + "learning_rate": 6.106482115938057e-06, + "loss": 0.2695, + "step": 29162 + }, + { + "epoch": 1.1955642973412914, + "grad_norm": 4.9650702476501465, + "learning_rate": 6.106345073317802e-06, + "loss": 0.3169, + "step": 29163 + }, + { + "epoch": 1.1955778621812263, + "grad_norm": 4.537567615509033, + "learning_rate": 6.106208030697548e-06, + "loss": 0.2544, + "step": 29164 + }, + { + "epoch": 1.1955914270211612, + "grad_norm": 5.025857448577881, + "learning_rate": 6.106070988077293e-06, + "loss": 0.2956, + "step": 29165 + }, + { + "epoch": 1.195604991861096, + "grad_norm": 4.989935398101807, + "learning_rate": 6.105933945457038e-06, + "loss": 0.357, + "step": 29166 + }, + { + "epoch": 1.195618556701031, + "grad_norm": 6.9662275314331055, + "learning_rate": 6.105796902836782e-06, + "loss": 0.3896, + "step": 29167 + }, + { + "epoch": 1.1956321215409658, + "grad_norm": 5.790435314178467, + "learning_rate": 6.105659860216527e-06, + "loss": 0.3169, + "step": 29168 + }, + { + "epoch": 1.1956456863809006, + "grad_norm": 5.671999454498291, + "learning_rate": 6.105522817596273e-06, + "loss": 0.2521, + "step": 29169 + }, + { + "epoch": 1.1956592512208357, + "grad_norm": 4.644941329956055, + "learning_rate": 6.105385774976018e-06, + "loss": 0.1891, + "step": 29170 + }, + { + "epoch": 1.1956728160607706, + "grad_norm": 5.569098472595215, + "learning_rate": 6.105248732355763e-06, + "loss": 0.2493, + "step": 29171 + }, + { + "epoch": 1.1956863809007054, + "grad_norm": 6.912055492401123, + "learning_rate": 6.105111689735508e-06, + "loss": 0.4359, + "step": 29172 + }, + { + "epoch": 1.1956999457406403, + "grad_norm": 5.972219944000244, + "learning_rate": 6.104974647115254e-06, + "loss": 0.2791, + "step": 29173 + }, + { + "epoch": 1.1957135105805752, + "grad_norm": 6.3707966804504395, + "learning_rate": 6.1048376044949984e-06, + "loss": 0.285, + "step": 29174 + }, + { + "epoch": 1.19572707542051, + "grad_norm": 4.909082889556885, + "learning_rate": 6.104700561874744e-06, + "loss": 0.2604, + "step": 29175 + }, + { + "epoch": 1.195740640260445, + "grad_norm": 4.028067588806152, + "learning_rate": 6.104563519254488e-06, + "loss": 0.2082, + "step": 29176 + }, + { + "epoch": 1.1957542051003798, + "grad_norm": 4.750537872314453, + "learning_rate": 6.104426476634234e-06, + "loss": 0.2102, + "step": 29177 + }, + { + "epoch": 1.1957677699403146, + "grad_norm": 5.538916110992432, + "learning_rate": 6.104289434013979e-06, + "loss": 0.28, + "step": 29178 + }, + { + "epoch": 1.1957813347802495, + "grad_norm": 3.712062120437622, + "learning_rate": 6.104152391393724e-06, + "loss": 0.1957, + "step": 29179 + }, + { + "epoch": 1.1957948996201844, + "grad_norm": 3.652604341506958, + "learning_rate": 6.104015348773469e-06, + "loss": 0.1743, + "step": 29180 + }, + { + "epoch": 1.1958084644601195, + "grad_norm": 3.859400749206543, + "learning_rate": 6.103878306153214e-06, + "loss": 0.2044, + "step": 29181 + }, + { + "epoch": 1.1958220293000543, + "grad_norm": 4.34329891204834, + "learning_rate": 6.10374126353296e-06, + "loss": 0.2719, + "step": 29182 + }, + { + "epoch": 1.1958355941399892, + "grad_norm": 4.672139644622803, + "learning_rate": 6.103604220912704e-06, + "loss": 0.2939, + "step": 29183 + }, + { + "epoch": 1.195849158979924, + "grad_norm": 4.181500434875488, + "learning_rate": 6.103467178292449e-06, + "loss": 0.163, + "step": 29184 + }, + { + "epoch": 1.195862723819859, + "grad_norm": 4.3231425285339355, + "learning_rate": 6.103330135672194e-06, + "loss": 0.1764, + "step": 29185 + }, + { + "epoch": 1.1958762886597938, + "grad_norm": 6.598895072937012, + "learning_rate": 6.10319309305194e-06, + "loss": 0.3954, + "step": 29186 + }, + { + "epoch": 1.1958898534997287, + "grad_norm": 4.679350852966309, + "learning_rate": 6.103056050431685e-06, + "loss": 0.1696, + "step": 29187 + }, + { + "epoch": 1.1959034183396635, + "grad_norm": 3.3579823970794678, + "learning_rate": 6.10291900781143e-06, + "loss": 0.1761, + "step": 29188 + }, + { + "epoch": 1.1959169831795986, + "grad_norm": 4.324631690979004, + "learning_rate": 6.1027819651911744e-06, + "loss": 0.1799, + "step": 29189 + }, + { + "epoch": 1.1959305480195335, + "grad_norm": 6.590025424957275, + "learning_rate": 6.1026449225709205e-06, + "loss": 0.2784, + "step": 29190 + }, + { + "epoch": 1.1959441128594683, + "grad_norm": 4.052825450897217, + "learning_rate": 6.102507879950666e-06, + "loss": 0.2055, + "step": 29191 + }, + { + "epoch": 1.1959576776994032, + "grad_norm": 5.708737850189209, + "learning_rate": 6.10237083733041e-06, + "loss": 0.2355, + "step": 29192 + }, + { + "epoch": 1.195971242539338, + "grad_norm": 6.392261505126953, + "learning_rate": 6.102233794710155e-06, + "loss": 0.2197, + "step": 29193 + }, + { + "epoch": 1.195984807379273, + "grad_norm": 5.241166114807129, + "learning_rate": 6.1020967520899e-06, + "loss": 0.2226, + "step": 29194 + }, + { + "epoch": 1.1959983722192078, + "grad_norm": 6.942201137542725, + "learning_rate": 6.1019597094696455e-06, + "loss": 0.2525, + "step": 29195 + }, + { + "epoch": 1.1960119370591427, + "grad_norm": 4.13767671585083, + "learning_rate": 6.101822666849391e-06, + "loss": 0.2121, + "step": 29196 + }, + { + "epoch": 1.1960255018990775, + "grad_norm": 4.28750467300415, + "learning_rate": 6.101685624229136e-06, + "loss": 0.2835, + "step": 29197 + }, + { + "epoch": 1.1960390667390124, + "grad_norm": 7.24882173538208, + "learning_rate": 6.10154858160888e-06, + "loss": 0.4169, + "step": 29198 + }, + { + "epoch": 1.1960526315789473, + "grad_norm": 4.472668170928955, + "learning_rate": 6.101411538988626e-06, + "loss": 0.2328, + "step": 29199 + }, + { + "epoch": 1.1960661964188823, + "grad_norm": 2.8501474857330322, + "learning_rate": 6.101274496368371e-06, + "loss": 0.1382, + "step": 29200 + }, + { + "epoch": 1.1960797612588172, + "grad_norm": 5.216028213500977, + "learning_rate": 6.101137453748116e-06, + "loss": 0.2288, + "step": 29201 + }, + { + "epoch": 1.196093326098752, + "grad_norm": 5.243200778961182, + "learning_rate": 6.101000411127861e-06, + "loss": 0.2394, + "step": 29202 + }, + { + "epoch": 1.196106890938687, + "grad_norm": 3.501267910003662, + "learning_rate": 6.100863368507607e-06, + "loss": 0.1573, + "step": 29203 + }, + { + "epoch": 1.1961204557786218, + "grad_norm": 4.312822341918945, + "learning_rate": 6.100726325887352e-06, + "loss": 0.1498, + "step": 29204 + }, + { + "epoch": 1.1961340206185567, + "grad_norm": 3.5658938884735107, + "learning_rate": 6.1005892832670965e-06, + "loss": 0.2299, + "step": 29205 + }, + { + "epoch": 1.1961475854584915, + "grad_norm": 3.916867971420288, + "learning_rate": 6.100452240646842e-06, + "loss": 0.2085, + "step": 29206 + }, + { + "epoch": 1.1961611502984264, + "grad_norm": 3.7247071266174316, + "learning_rate": 6.100315198026586e-06, + "loss": 0.1549, + "step": 29207 + }, + { + "epoch": 1.1961747151383615, + "grad_norm": 4.729541778564453, + "learning_rate": 6.100178155406332e-06, + "loss": 0.1776, + "step": 29208 + }, + { + "epoch": 1.1961882799782964, + "grad_norm": 4.46266508102417, + "learning_rate": 6.100041112786077e-06, + "loss": 0.1651, + "step": 29209 + }, + { + "epoch": 1.1962018448182312, + "grad_norm": 5.033540725708008, + "learning_rate": 6.0999040701658215e-06, + "loss": 0.219, + "step": 29210 + }, + { + "epoch": 1.196215409658166, + "grad_norm": 3.795933723449707, + "learning_rate": 6.099767027545567e-06, + "loss": 0.1882, + "step": 29211 + }, + { + "epoch": 1.196228974498101, + "grad_norm": 3.2873189449310303, + "learning_rate": 6.099629984925313e-06, + "loss": 0.1162, + "step": 29212 + }, + { + "epoch": 1.1962425393380358, + "grad_norm": 4.414279460906982, + "learning_rate": 6.099492942305058e-06, + "loss": 0.2099, + "step": 29213 + }, + { + "epoch": 1.1962561041779707, + "grad_norm": 4.942863941192627, + "learning_rate": 6.099355899684802e-06, + "loss": 0.2297, + "step": 29214 + }, + { + "epoch": 1.1962696690179055, + "grad_norm": 4.87052583694458, + "learning_rate": 6.099218857064547e-06, + "loss": 0.1976, + "step": 29215 + }, + { + "epoch": 1.1962832338578404, + "grad_norm": 3.679453134536743, + "learning_rate": 6.099081814444293e-06, + "loss": 0.1493, + "step": 29216 + }, + { + "epoch": 1.1962967986977753, + "grad_norm": 3.869050979614258, + "learning_rate": 6.098944771824038e-06, + "loss": 0.1842, + "step": 29217 + }, + { + "epoch": 1.1963103635377101, + "grad_norm": 3.2234649658203125, + "learning_rate": 6.098807729203783e-06, + "loss": 0.1608, + "step": 29218 + }, + { + "epoch": 1.1963239283776452, + "grad_norm": 3.192849636077881, + "learning_rate": 6.098670686583527e-06, + "loss": 0.1506, + "step": 29219 + }, + { + "epoch": 1.19633749321758, + "grad_norm": 5.0665459632873535, + "learning_rate": 6.098533643963273e-06, + "loss": 0.271, + "step": 29220 + }, + { + "epoch": 1.196351058057515, + "grad_norm": 4.511570453643799, + "learning_rate": 6.0983966013430185e-06, + "loss": 0.1285, + "step": 29221 + }, + { + "epoch": 1.1963646228974498, + "grad_norm": 3.9807190895080566, + "learning_rate": 6.098259558722764e-06, + "loss": 0.1819, + "step": 29222 + }, + { + "epoch": 1.1963781877373847, + "grad_norm": 7.099253177642822, + "learning_rate": 6.098122516102508e-06, + "loss": 0.2918, + "step": 29223 + }, + { + "epoch": 1.1963917525773196, + "grad_norm": 3.6821324825286865, + "learning_rate": 6.097985473482253e-06, + "loss": 0.1767, + "step": 29224 + }, + { + "epoch": 1.1964053174172544, + "grad_norm": 3.6139309406280518, + "learning_rate": 6.097848430861999e-06, + "loss": 0.1677, + "step": 29225 + }, + { + "epoch": 1.1964188822571893, + "grad_norm": 5.843629360198975, + "learning_rate": 6.0977113882417435e-06, + "loss": 0.2549, + "step": 29226 + }, + { + "epoch": 1.1964324470971244, + "grad_norm": 2.8775131702423096, + "learning_rate": 6.097574345621489e-06, + "loss": 0.1416, + "step": 29227 + }, + { + "epoch": 1.1964460119370592, + "grad_norm": 3.709381580352783, + "learning_rate": 6.097437303001234e-06, + "loss": 0.187, + "step": 29228 + }, + { + "epoch": 1.196459576776994, + "grad_norm": 5.4629597663879395, + "learning_rate": 6.097300260380979e-06, + "loss": 0.2967, + "step": 29229 + }, + { + "epoch": 1.196473141616929, + "grad_norm": 5.568920612335205, + "learning_rate": 6.097163217760724e-06, + "loss": 0.2485, + "step": 29230 + }, + { + "epoch": 1.1964867064568638, + "grad_norm": 4.575098991394043, + "learning_rate": 6.097026175140469e-06, + "loss": 0.2055, + "step": 29231 + }, + { + "epoch": 1.1965002712967987, + "grad_norm": 3.691499710083008, + "learning_rate": 6.096889132520214e-06, + "loss": 0.1577, + "step": 29232 + }, + { + "epoch": 1.1965138361367336, + "grad_norm": 3.4168272018432617, + "learning_rate": 6.09675208989996e-06, + "loss": 0.1426, + "step": 29233 + }, + { + "epoch": 1.1965274009766684, + "grad_norm": 5.051673412322998, + "learning_rate": 6.096615047279705e-06, + "loss": 0.2398, + "step": 29234 + }, + { + "epoch": 1.1965409658166033, + "grad_norm": 3.124957323074341, + "learning_rate": 6.096478004659449e-06, + "loss": 0.121, + "step": 29235 + }, + { + "epoch": 1.1965545306565382, + "grad_norm": 4.529147148132324, + "learning_rate": 6.0963409620391945e-06, + "loss": 0.155, + "step": 29236 + }, + { + "epoch": 1.196568095496473, + "grad_norm": 3.7111034393310547, + "learning_rate": 6.09620391941894e-06, + "loss": 0.1876, + "step": 29237 + }, + { + "epoch": 1.1965816603364081, + "grad_norm": 4.193472862243652, + "learning_rate": 6.096066876798686e-06, + "loss": 0.1382, + "step": 29238 + }, + { + "epoch": 1.196595225176343, + "grad_norm": 5.914418697357178, + "learning_rate": 6.09592983417843e-06, + "loss": 0.2368, + "step": 29239 + }, + { + "epoch": 1.1966087900162778, + "grad_norm": 3.5159075260162354, + "learning_rate": 6.095792791558175e-06, + "loss": 0.1731, + "step": 29240 + }, + { + "epoch": 1.1966223548562127, + "grad_norm": 3.8119499683380127, + "learning_rate": 6.0956557489379195e-06, + "loss": 0.1407, + "step": 29241 + }, + { + "epoch": 1.1966359196961476, + "grad_norm": 4.392148971557617, + "learning_rate": 6.0955187063176655e-06, + "loss": 0.1926, + "step": 29242 + }, + { + "epoch": 1.1966494845360824, + "grad_norm": 4.148070335388184, + "learning_rate": 6.095381663697411e-06, + "loss": 0.2011, + "step": 29243 + }, + { + "epoch": 1.1966630493760173, + "grad_norm": 4.730804920196533, + "learning_rate": 6.095244621077155e-06, + "loss": 0.2009, + "step": 29244 + }, + { + "epoch": 1.1966766142159522, + "grad_norm": 5.452323913574219, + "learning_rate": 6.0951075784569e-06, + "loss": 0.2205, + "step": 29245 + }, + { + "epoch": 1.1966901790558873, + "grad_norm": 9.806530952453613, + "learning_rate": 6.094970535836646e-06, + "loss": 0.216, + "step": 29246 + }, + { + "epoch": 1.1967037438958221, + "grad_norm": 4.22998571395874, + "learning_rate": 6.094833493216391e-06, + "loss": 0.1702, + "step": 29247 + }, + { + "epoch": 1.196717308735757, + "grad_norm": 4.537391662597656, + "learning_rate": 6.094696450596136e-06, + "loss": 0.2129, + "step": 29248 + }, + { + "epoch": 1.1967308735756919, + "grad_norm": 3.886667251586914, + "learning_rate": 6.094559407975881e-06, + "loss": 0.2036, + "step": 29249 + }, + { + "epoch": 1.1967444384156267, + "grad_norm": 3.6081252098083496, + "learning_rate": 6.094422365355625e-06, + "loss": 0.121, + "step": 29250 + }, + { + "epoch": 1.1967580032555616, + "grad_norm": 4.740240097045898, + "learning_rate": 6.094285322735371e-06, + "loss": 0.1569, + "step": 29251 + }, + { + "epoch": 1.1967715680954965, + "grad_norm": 3.5385236740112305, + "learning_rate": 6.0941482801151165e-06, + "loss": 0.1738, + "step": 29252 + }, + { + "epoch": 1.1967851329354313, + "grad_norm": 4.24751615524292, + "learning_rate": 6.094011237494862e-06, + "loss": 0.1061, + "step": 29253 + }, + { + "epoch": 1.1967986977753662, + "grad_norm": 5.052559852600098, + "learning_rate": 6.093874194874606e-06, + "loss": 0.2439, + "step": 29254 + }, + { + "epoch": 1.196812262615301, + "grad_norm": 5.887508392333984, + "learning_rate": 6.093737152254352e-06, + "loss": 0.3419, + "step": 29255 + }, + { + "epoch": 1.196825827455236, + "grad_norm": 64.30597686767578, + "learning_rate": 6.093600109634097e-06, + "loss": 0.2345, + "step": 29256 + }, + { + "epoch": 1.196839392295171, + "grad_norm": 4.975040435791016, + "learning_rate": 6.0934630670138415e-06, + "loss": 0.1636, + "step": 29257 + }, + { + "epoch": 1.1968529571351059, + "grad_norm": 6.0172576904296875, + "learning_rate": 6.093326024393587e-06, + "loss": 0.2019, + "step": 29258 + }, + { + "epoch": 1.1968665219750407, + "grad_norm": 6.702269077301025, + "learning_rate": 6.093188981773333e-06, + "loss": 0.3748, + "step": 29259 + }, + { + "epoch": 1.1968800868149756, + "grad_norm": 4.277676582336426, + "learning_rate": 6.093051939153077e-06, + "loss": 0.2852, + "step": 29260 + }, + { + "epoch": 1.1968936516549105, + "grad_norm": 5.083630561828613, + "learning_rate": 6.092914896532822e-06, + "loss": 0.1983, + "step": 29261 + }, + { + "epoch": 1.1969072164948453, + "grad_norm": 4.054041862487793, + "learning_rate": 6.092777853912567e-06, + "loss": 0.2016, + "step": 29262 + }, + { + "epoch": 1.1969207813347802, + "grad_norm": 4.139208793640137, + "learning_rate": 6.092640811292312e-06, + "loss": 0.2052, + "step": 29263 + }, + { + "epoch": 1.196934346174715, + "grad_norm": 4.688540935516357, + "learning_rate": 6.092503768672058e-06, + "loss": 0.1861, + "step": 29264 + }, + { + "epoch": 1.1969479110146501, + "grad_norm": 3.8437695503234863, + "learning_rate": 6.092366726051803e-06, + "loss": 0.1964, + "step": 29265 + }, + { + "epoch": 1.196961475854585, + "grad_norm": 6.837349891662598, + "learning_rate": 6.092229683431547e-06, + "loss": 0.2881, + "step": 29266 + }, + { + "epoch": 1.1969750406945199, + "grad_norm": 5.0223541259765625, + "learning_rate": 6.0920926408112925e-06, + "loss": 0.2555, + "step": 29267 + }, + { + "epoch": 1.1969886055344547, + "grad_norm": 3.9551661014556885, + "learning_rate": 6.0919555981910385e-06, + "loss": 0.1805, + "step": 29268 + }, + { + "epoch": 1.1970021703743896, + "grad_norm": 4.424444675445557, + "learning_rate": 6.091818555570783e-06, + "loss": 0.2289, + "step": 29269 + }, + { + "epoch": 1.1970157352143245, + "grad_norm": 3.768125534057617, + "learning_rate": 6.091681512950528e-06, + "loss": 0.0898, + "step": 29270 + }, + { + "epoch": 1.1970293000542593, + "grad_norm": 6.635931968688965, + "learning_rate": 6.091544470330273e-06, + "loss": 0.2742, + "step": 29271 + }, + { + "epoch": 1.1970428648941942, + "grad_norm": 5.1169867515563965, + "learning_rate": 6.091407427710019e-06, + "loss": 0.2604, + "step": 29272 + }, + { + "epoch": 1.197056429734129, + "grad_norm": 6.526943206787109, + "learning_rate": 6.0912703850897635e-06, + "loss": 0.4521, + "step": 29273 + }, + { + "epoch": 1.197069994574064, + "grad_norm": 5.029794216156006, + "learning_rate": 6.091133342469509e-06, + "loss": 0.28, + "step": 29274 + }, + { + "epoch": 1.1970835594139988, + "grad_norm": 4.642775058746338, + "learning_rate": 6.090996299849253e-06, + "loss": 0.2079, + "step": 29275 + }, + { + "epoch": 1.1970971242539339, + "grad_norm": 4.077212333679199, + "learning_rate": 6.090859257228998e-06, + "loss": 0.171, + "step": 29276 + }, + { + "epoch": 1.1971106890938688, + "grad_norm": 4.992940902709961, + "learning_rate": 6.090722214608744e-06, + "loss": 0.2445, + "step": 29277 + }, + { + "epoch": 1.1971242539338036, + "grad_norm": 4.124855995178223, + "learning_rate": 6.090585171988489e-06, + "loss": 0.1796, + "step": 29278 + }, + { + "epoch": 1.1971378187737385, + "grad_norm": 4.593504905700684, + "learning_rate": 6.090448129368234e-06, + "loss": 0.1845, + "step": 29279 + }, + { + "epoch": 1.1971513836136733, + "grad_norm": 2.3926782608032227, + "learning_rate": 6.090311086747979e-06, + "loss": 0.0908, + "step": 29280 + }, + { + "epoch": 1.1971649484536082, + "grad_norm": 4.290074825286865, + "learning_rate": 6.090174044127725e-06, + "loss": 0.3207, + "step": 29281 + }, + { + "epoch": 1.197178513293543, + "grad_norm": 5.181358814239502, + "learning_rate": 6.090037001507469e-06, + "loss": 0.2493, + "step": 29282 + }, + { + "epoch": 1.197192078133478, + "grad_norm": 4.101006031036377, + "learning_rate": 6.0898999588872145e-06, + "loss": 0.1739, + "step": 29283 + }, + { + "epoch": 1.197205642973413, + "grad_norm": 7.8854169845581055, + "learning_rate": 6.089762916266959e-06, + "loss": 0.2599, + "step": 29284 + }, + { + "epoch": 1.197219207813348, + "grad_norm": 3.686038017272949, + "learning_rate": 6.089625873646705e-06, + "loss": 0.1503, + "step": 29285 + }, + { + "epoch": 1.1972327726532828, + "grad_norm": 2.9156196117401123, + "learning_rate": 6.08948883102645e-06, + "loss": 0.1974, + "step": 29286 + }, + { + "epoch": 1.1972463374932176, + "grad_norm": 4.440371990203857, + "learning_rate": 6.089351788406195e-06, + "loss": 0.2119, + "step": 29287 + }, + { + "epoch": 1.1972599023331525, + "grad_norm": 4.208444118499756, + "learning_rate": 6.0892147457859395e-06, + "loss": 0.2185, + "step": 29288 + }, + { + "epoch": 1.1972734671730874, + "grad_norm": 3.6103434562683105, + "learning_rate": 6.089077703165685e-06, + "loss": 0.2656, + "step": 29289 + }, + { + "epoch": 1.1972870320130222, + "grad_norm": 4.884446144104004, + "learning_rate": 6.088940660545431e-06, + "loss": 0.3852, + "step": 29290 + }, + { + "epoch": 1.197300596852957, + "grad_norm": 2.9172778129577637, + "learning_rate": 6.088803617925175e-06, + "loss": 0.2271, + "step": 29291 + }, + { + "epoch": 1.197314161692892, + "grad_norm": 5.0159125328063965, + "learning_rate": 6.08866657530492e-06, + "loss": 0.3411, + "step": 29292 + }, + { + "epoch": 1.1973277265328268, + "grad_norm": 5.047353744506836, + "learning_rate": 6.0885295326846646e-06, + "loss": 0.27, + "step": 29293 + }, + { + "epoch": 1.1973412913727617, + "grad_norm": 5.206892967224121, + "learning_rate": 6.088392490064411e-06, + "loss": 0.2194, + "step": 29294 + }, + { + "epoch": 1.1973548562126968, + "grad_norm": 3.3073244094848633, + "learning_rate": 6.088255447444156e-06, + "loss": 0.1836, + "step": 29295 + }, + { + "epoch": 1.1973684210526316, + "grad_norm": 3.396711587905884, + "learning_rate": 6.088118404823901e-06, + "loss": 0.1528, + "step": 29296 + }, + { + "epoch": 1.1973819858925665, + "grad_norm": 4.247379302978516, + "learning_rate": 6.087981362203645e-06, + "loss": 0.2191, + "step": 29297 + }, + { + "epoch": 1.1973955507325014, + "grad_norm": 4.624016284942627, + "learning_rate": 6.087844319583391e-06, + "loss": 0.2174, + "step": 29298 + }, + { + "epoch": 1.1974091155724362, + "grad_norm": 4.003233432769775, + "learning_rate": 6.0877072769631365e-06, + "loss": 0.1645, + "step": 29299 + }, + { + "epoch": 1.197422680412371, + "grad_norm": 4.435753345489502, + "learning_rate": 6.087570234342881e-06, + "loss": 0.2572, + "step": 29300 + }, + { + "epoch": 1.197436245252306, + "grad_norm": 4.797935962677002, + "learning_rate": 6.087433191722626e-06, + "loss": 0.1692, + "step": 29301 + }, + { + "epoch": 1.1974498100922408, + "grad_norm": 3.5967206954956055, + "learning_rate": 6.087296149102372e-06, + "loss": 0.1656, + "step": 29302 + }, + { + "epoch": 1.197463374932176, + "grad_norm": 3.9465582370758057, + "learning_rate": 6.087159106482116e-06, + "loss": 0.1939, + "step": 29303 + }, + { + "epoch": 1.1974769397721108, + "grad_norm": 5.458287239074707, + "learning_rate": 6.0870220638618615e-06, + "loss": 0.1978, + "step": 29304 + }, + { + "epoch": 1.1974905046120456, + "grad_norm": 3.1240732669830322, + "learning_rate": 6.086885021241607e-06, + "loss": 0.1493, + "step": 29305 + }, + { + "epoch": 1.1975040694519805, + "grad_norm": 3.423496961593628, + "learning_rate": 6.086747978621351e-06, + "loss": 0.2326, + "step": 29306 + }, + { + "epoch": 1.1975176342919154, + "grad_norm": 2.7757482528686523, + "learning_rate": 6.086610936001097e-06, + "loss": 0.1689, + "step": 29307 + }, + { + "epoch": 1.1975311991318502, + "grad_norm": 5.937081336975098, + "learning_rate": 6.086473893380842e-06, + "loss": 0.2873, + "step": 29308 + }, + { + "epoch": 1.197544763971785, + "grad_norm": 4.8502373695373535, + "learning_rate": 6.086336850760587e-06, + "loss": 0.2185, + "step": 29309 + }, + { + "epoch": 1.19755832881172, + "grad_norm": 3.664787530899048, + "learning_rate": 6.086199808140332e-06, + "loss": 0.1697, + "step": 29310 + }, + { + "epoch": 1.1975718936516548, + "grad_norm": 3.3686647415161133, + "learning_rate": 6.086062765520078e-06, + "loss": 0.1683, + "step": 29311 + }, + { + "epoch": 1.1975854584915897, + "grad_norm": 5.383915424346924, + "learning_rate": 6.085925722899822e-06, + "loss": 0.1677, + "step": 29312 + }, + { + "epoch": 1.1975990233315246, + "grad_norm": 4.112597465515137, + "learning_rate": 6.085788680279567e-06, + "loss": 0.1887, + "step": 29313 + }, + { + "epoch": 1.1976125881714597, + "grad_norm": 4.019228458404541, + "learning_rate": 6.0856516376593125e-06, + "loss": 0.2337, + "step": 29314 + }, + { + "epoch": 1.1976261530113945, + "grad_norm": 4.302664756774902, + "learning_rate": 6.0855145950390585e-06, + "loss": 0.2259, + "step": 29315 + }, + { + "epoch": 1.1976397178513294, + "grad_norm": 5.0976881980896, + "learning_rate": 6.085377552418803e-06, + "loss": 0.1784, + "step": 29316 + }, + { + "epoch": 1.1976532826912643, + "grad_norm": 3.922603130340576, + "learning_rate": 6.085240509798548e-06, + "loss": 0.1969, + "step": 29317 + }, + { + "epoch": 1.1976668475311991, + "grad_norm": 5.234149932861328, + "learning_rate": 6.085103467178292e-06, + "loss": 0.2538, + "step": 29318 + }, + { + "epoch": 1.197680412371134, + "grad_norm": 3.660475730895996, + "learning_rate": 6.0849664245580375e-06, + "loss": 0.1582, + "step": 29319 + }, + { + "epoch": 1.1976939772110688, + "grad_norm": 7.394401550292969, + "learning_rate": 6.0848293819377836e-06, + "loss": 0.2971, + "step": 29320 + }, + { + "epoch": 1.1977075420510037, + "grad_norm": 5.656851768493652, + "learning_rate": 6.084692339317529e-06, + "loss": 0.2318, + "step": 29321 + }, + { + "epoch": 1.1977211068909388, + "grad_norm": 5.749536037445068, + "learning_rate": 6.084555296697273e-06, + "loss": 0.3221, + "step": 29322 + }, + { + "epoch": 1.1977346717308737, + "grad_norm": 3.238823890686035, + "learning_rate": 6.084418254077018e-06, + "loss": 0.1795, + "step": 29323 + }, + { + "epoch": 1.1977482365708085, + "grad_norm": 4.143205165863037, + "learning_rate": 6.084281211456764e-06, + "loss": 0.2231, + "step": 29324 + }, + { + "epoch": 1.1977618014107434, + "grad_norm": 3.3329880237579346, + "learning_rate": 6.084144168836509e-06, + "loss": 0.1582, + "step": 29325 + }, + { + "epoch": 1.1977753662506783, + "grad_norm": 4.434657573699951, + "learning_rate": 6.084007126216254e-06, + "loss": 0.2419, + "step": 29326 + }, + { + "epoch": 1.1977889310906131, + "grad_norm": 4.661877155303955, + "learning_rate": 6.083870083595998e-06, + "loss": 0.2122, + "step": 29327 + }, + { + "epoch": 1.197802495930548, + "grad_norm": 3.8135647773742676, + "learning_rate": 6.083733040975744e-06, + "loss": 0.1674, + "step": 29328 + }, + { + "epoch": 1.1978160607704829, + "grad_norm": 6.4793901443481445, + "learning_rate": 6.083595998355489e-06, + "loss": 0.3482, + "step": 29329 + }, + { + "epoch": 1.1978296256104177, + "grad_norm": 4.085458755493164, + "learning_rate": 6.0834589557352345e-06, + "loss": 0.211, + "step": 29330 + }, + { + "epoch": 1.1978431904503526, + "grad_norm": 4.869283199310303, + "learning_rate": 6.083321913114979e-06, + "loss": 0.3025, + "step": 29331 + }, + { + "epoch": 1.1978567552902875, + "grad_norm": 5.75457239151001, + "learning_rate": 6.083184870494724e-06, + "loss": 0.2575, + "step": 29332 + }, + { + "epoch": 1.1978703201302225, + "grad_norm": 4.607999324798584, + "learning_rate": 6.08304782787447e-06, + "loss": 0.1774, + "step": 29333 + }, + { + "epoch": 1.1978838849701574, + "grad_norm": 6.289374828338623, + "learning_rate": 6.082910785254214e-06, + "loss": 0.3727, + "step": 29334 + }, + { + "epoch": 1.1978974498100923, + "grad_norm": 3.601396322250366, + "learning_rate": 6.0827737426339595e-06, + "loss": 0.221, + "step": 29335 + }, + { + "epoch": 1.1979110146500271, + "grad_norm": 4.283364772796631, + "learning_rate": 6.082636700013705e-06, + "loss": 0.1789, + "step": 29336 + }, + { + "epoch": 1.197924579489962, + "grad_norm": 5.72291898727417, + "learning_rate": 6.08249965739345e-06, + "loss": 0.2679, + "step": 29337 + }, + { + "epoch": 1.1979381443298969, + "grad_norm": 6.294159889221191, + "learning_rate": 6.082362614773195e-06, + "loss": 0.2969, + "step": 29338 + }, + { + "epoch": 1.1979517091698317, + "grad_norm": 6.497221946716309, + "learning_rate": 6.08222557215294e-06, + "loss": 0.3624, + "step": 29339 + }, + { + "epoch": 1.1979652740097666, + "grad_norm": 4.360152721405029, + "learning_rate": 6.082088529532685e-06, + "loss": 0.2292, + "step": 29340 + }, + { + "epoch": 1.1979788388497017, + "grad_norm": 5.111702919006348, + "learning_rate": 6.081951486912431e-06, + "loss": 0.2782, + "step": 29341 + }, + { + "epoch": 1.1979924036896366, + "grad_norm": 3.6038947105407715, + "learning_rate": 6.081814444292176e-06, + "loss": 0.143, + "step": 29342 + }, + { + "epoch": 1.1980059685295714, + "grad_norm": 4.845264434814453, + "learning_rate": 6.08167740167192e-06, + "loss": 0.2801, + "step": 29343 + }, + { + "epoch": 1.1980195333695063, + "grad_norm": 5.668172836303711, + "learning_rate": 6.081540359051665e-06, + "loss": 0.2263, + "step": 29344 + }, + { + "epoch": 1.1980330982094411, + "grad_norm": 6.862621307373047, + "learning_rate": 6.0814033164314105e-06, + "loss": 0.4747, + "step": 29345 + }, + { + "epoch": 1.198046663049376, + "grad_norm": 4.558234691619873, + "learning_rate": 6.081266273811156e-06, + "loss": 0.2969, + "step": 29346 + }, + { + "epoch": 1.1980602278893109, + "grad_norm": 7.201395034790039, + "learning_rate": 6.081129231190901e-06, + "loss": 0.2613, + "step": 29347 + }, + { + "epoch": 1.1980737927292457, + "grad_norm": 5.176681041717529, + "learning_rate": 6.080992188570646e-06, + "loss": 0.2679, + "step": 29348 + }, + { + "epoch": 1.1980873575691806, + "grad_norm": 4.963736534118652, + "learning_rate": 6.08085514595039e-06, + "loss": 0.3095, + "step": 29349 + }, + { + "epoch": 1.1981009224091155, + "grad_norm": 4.102190017700195, + "learning_rate": 6.080718103330136e-06, + "loss": 0.2225, + "step": 29350 + }, + { + "epoch": 1.1981144872490503, + "grad_norm": 6.145539283752441, + "learning_rate": 6.0805810607098816e-06, + "loss": 0.2534, + "step": 29351 + }, + { + "epoch": 1.1981280520889854, + "grad_norm": 7.696703910827637, + "learning_rate": 6.080444018089626e-06, + "loss": 0.3072, + "step": 29352 + }, + { + "epoch": 1.1981416169289203, + "grad_norm": 5.1630988121032715, + "learning_rate": 6.080306975469371e-06, + "loss": 0.2528, + "step": 29353 + }, + { + "epoch": 1.1981551817688552, + "grad_norm": 4.2745842933654785, + "learning_rate": 6.080169932849117e-06, + "loss": 0.2166, + "step": 29354 + }, + { + "epoch": 1.19816874660879, + "grad_norm": 4.1376633644104, + "learning_rate": 6.080032890228862e-06, + "loss": 0.1852, + "step": 29355 + }, + { + "epoch": 1.198182311448725, + "grad_norm": 5.401645660400391, + "learning_rate": 6.079895847608607e-06, + "loss": 0.2678, + "step": 29356 + }, + { + "epoch": 1.1981958762886598, + "grad_norm": 4.882465839385986, + "learning_rate": 6.079758804988352e-06, + "loss": 0.2193, + "step": 29357 + }, + { + "epoch": 1.1982094411285946, + "grad_norm": 5.751667499542236, + "learning_rate": 6.079621762368096e-06, + "loss": 0.3055, + "step": 29358 + }, + { + "epoch": 1.1982230059685295, + "grad_norm": 6.312779903411865, + "learning_rate": 6.079484719747842e-06, + "loss": 0.3683, + "step": 29359 + }, + { + "epoch": 1.1982365708084646, + "grad_norm": 4.601779460906982, + "learning_rate": 6.079347677127587e-06, + "loss": 0.2346, + "step": 29360 + }, + { + "epoch": 1.1982501356483994, + "grad_norm": 6.331817150115967, + "learning_rate": 6.079210634507332e-06, + "loss": 0.2738, + "step": 29361 + }, + { + "epoch": 1.1982637004883343, + "grad_norm": 4.042582988739014, + "learning_rate": 6.079073591887077e-06, + "loss": 0.3001, + "step": 29362 + }, + { + "epoch": 1.1982772653282692, + "grad_norm": 6.654684543609619, + "learning_rate": 6.078936549266823e-06, + "loss": 0.4822, + "step": 29363 + }, + { + "epoch": 1.198290830168204, + "grad_norm": 5.257929801940918, + "learning_rate": 6.078799506646568e-06, + "loss": 0.2187, + "step": 29364 + }, + { + "epoch": 1.198304395008139, + "grad_norm": 7.154142379760742, + "learning_rate": 6.078662464026312e-06, + "loss": 0.3891, + "step": 29365 + }, + { + "epoch": 1.1983179598480738, + "grad_norm": 5.632797718048096, + "learning_rate": 6.0785254214060575e-06, + "loss": 0.2506, + "step": 29366 + }, + { + "epoch": 1.1983315246880086, + "grad_norm": 7.432693958282471, + "learning_rate": 6.0783883787858036e-06, + "loss": 0.5165, + "step": 29367 + }, + { + "epoch": 1.1983450895279435, + "grad_norm": 5.710155487060547, + "learning_rate": 6.078251336165548e-06, + "loss": 0.402, + "step": 29368 + }, + { + "epoch": 1.1983586543678784, + "grad_norm": 4.052069187164307, + "learning_rate": 6.078114293545293e-06, + "loss": 0.188, + "step": 29369 + }, + { + "epoch": 1.1983722192078132, + "grad_norm": 6.207756996154785, + "learning_rate": 6.077977250925038e-06, + "loss": 0.243, + "step": 29370 + }, + { + "epoch": 1.1983857840477483, + "grad_norm": 9.263059616088867, + "learning_rate": 6.0778402083047834e-06, + "loss": 0.6173, + "step": 29371 + }, + { + "epoch": 1.1983993488876832, + "grad_norm": 4.952548503875732, + "learning_rate": 6.077703165684529e-06, + "loss": 0.3033, + "step": 29372 + }, + { + "epoch": 1.198412913727618, + "grad_norm": 6.832855701446533, + "learning_rate": 6.077566123064274e-06, + "loss": 0.339, + "step": 29373 + }, + { + "epoch": 1.198426478567553, + "grad_norm": 5.910215854644775, + "learning_rate": 6.077429080444018e-06, + "loss": 0.2623, + "step": 29374 + }, + { + "epoch": 1.1984400434074878, + "grad_norm": 5.533792018890381, + "learning_rate": 6.077292037823763e-06, + "loss": 0.2821, + "step": 29375 + }, + { + "epoch": 1.1984536082474226, + "grad_norm": 9.111169815063477, + "learning_rate": 6.077154995203509e-06, + "loss": 0.5072, + "step": 29376 + }, + { + "epoch": 1.1984671730873575, + "grad_norm": 5.404979228973389, + "learning_rate": 6.077017952583254e-06, + "loss": 0.2209, + "step": 29377 + }, + { + "epoch": 1.1984807379272924, + "grad_norm": 6.693869590759277, + "learning_rate": 6.076880909962999e-06, + "loss": 0.3751, + "step": 29378 + }, + { + "epoch": 1.1984943027672275, + "grad_norm": 7.026740074157715, + "learning_rate": 6.076743867342744e-06, + "loss": 0.5243, + "step": 29379 + }, + { + "epoch": 1.1985078676071623, + "grad_norm": 7.154550075531006, + "learning_rate": 6.07660682472249e-06, + "loss": 0.3173, + "step": 29380 + }, + { + "epoch": 1.1985214324470972, + "grad_norm": 7.713930130004883, + "learning_rate": 6.076469782102234e-06, + "loss": 0.4582, + "step": 29381 + }, + { + "epoch": 1.198534997287032, + "grad_norm": 5.43427848815918, + "learning_rate": 6.0763327394819796e-06, + "loss": 0.285, + "step": 29382 + }, + { + "epoch": 1.198548562126967, + "grad_norm": 7.112875938415527, + "learning_rate": 6.076195696861724e-06, + "loss": 0.373, + "step": 29383 + }, + { + "epoch": 1.1985621269669018, + "grad_norm": 7.367411136627197, + "learning_rate": 6.07605865424147e-06, + "loss": 0.3979, + "step": 29384 + }, + { + "epoch": 1.1985756918068367, + "grad_norm": 6.695449352264404, + "learning_rate": 6.075921611621215e-06, + "loss": 0.3047, + "step": 29385 + }, + { + "epoch": 1.1985892566467715, + "grad_norm": 5.735617637634277, + "learning_rate": 6.0757845690009594e-06, + "loss": 0.304, + "step": 29386 + }, + { + "epoch": 1.1986028214867064, + "grad_norm": 4.752694606781006, + "learning_rate": 6.075647526380705e-06, + "loss": 0.2695, + "step": 29387 + }, + { + "epoch": 1.1986163863266412, + "grad_norm": 6.618912696838379, + "learning_rate": 6.07551048376045e-06, + "loss": 0.399, + "step": 29388 + }, + { + "epoch": 1.1986299511665761, + "grad_norm": 5.581808090209961, + "learning_rate": 6.075373441140196e-06, + "loss": 0.2716, + "step": 29389 + }, + { + "epoch": 1.1986435160065112, + "grad_norm": 6.474546909332275, + "learning_rate": 6.07523639851994e-06, + "loss": 0.4216, + "step": 29390 + }, + { + "epoch": 1.198657080846446, + "grad_norm": 4.134642124176025, + "learning_rate": 6.075099355899685e-06, + "loss": 0.1675, + "step": 29391 + }, + { + "epoch": 1.198670645686381, + "grad_norm": 6.104239463806152, + "learning_rate": 6.07496231327943e-06, + "loss": 0.2929, + "step": 29392 + }, + { + "epoch": 1.1986842105263158, + "grad_norm": 6.3188862800598145, + "learning_rate": 6.074825270659176e-06, + "loss": 0.3125, + "step": 29393 + }, + { + "epoch": 1.1986977753662507, + "grad_norm": 5.347156524658203, + "learning_rate": 6.074688228038921e-06, + "loss": 0.2588, + "step": 29394 + }, + { + "epoch": 1.1987113402061855, + "grad_norm": 4.937253952026367, + "learning_rate": 6.074551185418666e-06, + "loss": 0.2792, + "step": 29395 + }, + { + "epoch": 1.1987249050461204, + "grad_norm": 5.222975730895996, + "learning_rate": 6.07441414279841e-06, + "loss": 0.2132, + "step": 29396 + }, + { + "epoch": 1.1987384698860553, + "grad_norm": 6.064265727996826, + "learning_rate": 6.074277100178156e-06, + "loss": 0.298, + "step": 29397 + }, + { + "epoch": 1.1987520347259903, + "grad_norm": 7.073333263397217, + "learning_rate": 6.0741400575579016e-06, + "loss": 0.3191, + "step": 29398 + }, + { + "epoch": 1.1987655995659252, + "grad_norm": 6.742733001708984, + "learning_rate": 6.074003014937646e-06, + "loss": 0.3505, + "step": 29399 + }, + { + "epoch": 1.19877916440586, + "grad_norm": 5.932519435882568, + "learning_rate": 6.073865972317391e-06, + "loss": 0.2905, + "step": 29400 + }, + { + "epoch": 1.198792729245795, + "grad_norm": 5.527735233306885, + "learning_rate": 6.073728929697135e-06, + "loss": 0.3146, + "step": 29401 + }, + { + "epoch": 1.1988062940857298, + "grad_norm": 5.955005645751953, + "learning_rate": 6.0735918870768814e-06, + "loss": 0.3674, + "step": 29402 + }, + { + "epoch": 1.1988198589256647, + "grad_norm": 6.459188938140869, + "learning_rate": 6.073454844456627e-06, + "loss": 0.346, + "step": 29403 + }, + { + "epoch": 1.1988334237655995, + "grad_norm": 6.913725852966309, + "learning_rate": 6.073317801836372e-06, + "loss": 0.3189, + "step": 29404 + }, + { + "epoch": 1.1988469886055344, + "grad_norm": 5.726381301879883, + "learning_rate": 6.073180759216116e-06, + "loss": 0.2382, + "step": 29405 + }, + { + "epoch": 1.1988605534454693, + "grad_norm": 4.895872592926025, + "learning_rate": 6.073043716595862e-06, + "loss": 0.3683, + "step": 29406 + }, + { + "epoch": 1.1988741182854041, + "grad_norm": 6.260135650634766, + "learning_rate": 6.072906673975607e-06, + "loss": 0.3766, + "step": 29407 + }, + { + "epoch": 1.198887683125339, + "grad_norm": 6.511036396026611, + "learning_rate": 6.072769631355352e-06, + "loss": 0.436, + "step": 29408 + }, + { + "epoch": 1.198901247965274, + "grad_norm": 6.8310160636901855, + "learning_rate": 6.072632588735097e-06, + "loss": 0.3302, + "step": 29409 + }, + { + "epoch": 1.198914812805209, + "grad_norm": 6.876121520996094, + "learning_rate": 6.072495546114843e-06, + "loss": 0.2902, + "step": 29410 + }, + { + "epoch": 1.1989283776451438, + "grad_norm": 11.633605003356934, + "learning_rate": 6.072358503494587e-06, + "loss": 0.3057, + "step": 29411 + }, + { + "epoch": 1.1989419424850787, + "grad_norm": 5.995682239532471, + "learning_rate": 6.072221460874332e-06, + "loss": 0.3055, + "step": 29412 + }, + { + "epoch": 1.1989555073250135, + "grad_norm": 6.367959022521973, + "learning_rate": 6.0720844182540776e-06, + "loss": 0.2972, + "step": 29413 + }, + { + "epoch": 1.1989690721649484, + "grad_norm": 8.26877498626709, + "learning_rate": 6.071947375633822e-06, + "loss": 0.4468, + "step": 29414 + }, + { + "epoch": 1.1989826370048833, + "grad_norm": 5.622145652770996, + "learning_rate": 6.071810333013568e-06, + "loss": 0.367, + "step": 29415 + }, + { + "epoch": 1.1989962018448181, + "grad_norm": 5.351986408233643, + "learning_rate": 6.071673290393313e-06, + "loss": 0.2983, + "step": 29416 + }, + { + "epoch": 1.1990097666847532, + "grad_norm": 5.168118000030518, + "learning_rate": 6.0715362477730574e-06, + "loss": 0.2358, + "step": 29417 + }, + { + "epoch": 1.199023331524688, + "grad_norm": 6.159500598907471, + "learning_rate": 6.071399205152803e-06, + "loss": 0.2404, + "step": 29418 + }, + { + "epoch": 1.199036896364623, + "grad_norm": 4.144064426422119, + "learning_rate": 6.071262162532549e-06, + "loss": 0.2505, + "step": 29419 + }, + { + "epoch": 1.1990504612045578, + "grad_norm": 8.683600425720215, + "learning_rate": 6.071125119912293e-06, + "loss": 0.3519, + "step": 29420 + }, + { + "epoch": 1.1990640260444927, + "grad_norm": 4.9206461906433105, + "learning_rate": 6.070988077292038e-06, + "loss": 0.1811, + "step": 29421 + }, + { + "epoch": 1.1990775908844276, + "grad_norm": 6.545164585113525, + "learning_rate": 6.070851034671783e-06, + "loss": 0.2734, + "step": 29422 + }, + { + "epoch": 1.1990911557243624, + "grad_norm": 5.062915802001953, + "learning_rate": 6.070713992051529e-06, + "loss": 0.2166, + "step": 29423 + }, + { + "epoch": 1.1991047205642973, + "grad_norm": 5.629680156707764, + "learning_rate": 6.070576949431274e-06, + "loss": 0.1869, + "step": 29424 + }, + { + "epoch": 1.1991182854042322, + "grad_norm": 6.770866870880127, + "learning_rate": 6.070439906811019e-06, + "loss": 0.3954, + "step": 29425 + }, + { + "epoch": 1.199131850244167, + "grad_norm": 7.959651470184326, + "learning_rate": 6.070302864190763e-06, + "loss": 0.3823, + "step": 29426 + }, + { + "epoch": 1.1991454150841019, + "grad_norm": 5.582370281219482, + "learning_rate": 6.070165821570508e-06, + "loss": 0.3395, + "step": 29427 + }, + { + "epoch": 1.199158979924037, + "grad_norm": 5.816582202911377, + "learning_rate": 6.070028778950254e-06, + "loss": 0.3224, + "step": 29428 + }, + { + "epoch": 1.1991725447639718, + "grad_norm": 5.5800371170043945, + "learning_rate": 6.06989173633e-06, + "loss": 0.2662, + "step": 29429 + }, + { + "epoch": 1.1991861096039067, + "grad_norm": 4.123271465301514, + "learning_rate": 6.069754693709744e-06, + "loss": 0.2114, + "step": 29430 + }, + { + "epoch": 1.1991996744438416, + "grad_norm": 6.21649169921875, + "learning_rate": 6.069617651089489e-06, + "loss": 0.345, + "step": 29431 + }, + { + "epoch": 1.1992132392837764, + "grad_norm": 4.0399675369262695, + "learning_rate": 6.069480608469235e-06, + "loss": 0.2019, + "step": 29432 + }, + { + "epoch": 1.1992268041237113, + "grad_norm": 4.32054328918457, + "learning_rate": 6.0693435658489794e-06, + "loss": 0.246, + "step": 29433 + }, + { + "epoch": 1.1992403689636462, + "grad_norm": 4.543469429016113, + "learning_rate": 6.069206523228725e-06, + "loss": 0.1931, + "step": 29434 + }, + { + "epoch": 1.199253933803581, + "grad_norm": 5.382787227630615, + "learning_rate": 6.069069480608469e-06, + "loss": 0.1897, + "step": 29435 + }, + { + "epoch": 1.1992674986435161, + "grad_norm": 3.873339891433716, + "learning_rate": 6.068932437988215e-06, + "loss": 0.144, + "step": 29436 + }, + { + "epoch": 1.199281063483451, + "grad_norm": 6.150580883026123, + "learning_rate": 6.06879539536796e-06, + "loss": 0.2846, + "step": 29437 + }, + { + "epoch": 1.1992946283233858, + "grad_norm": 4.183403968811035, + "learning_rate": 6.068658352747705e-06, + "loss": 0.1562, + "step": 29438 + }, + { + "epoch": 1.1993081931633207, + "grad_norm": 4.317538261413574, + "learning_rate": 6.06852131012745e-06, + "loss": 0.1602, + "step": 29439 + }, + { + "epoch": 1.1993217580032556, + "grad_norm": 5.286999702453613, + "learning_rate": 6.068384267507196e-06, + "loss": 0.1903, + "step": 29440 + }, + { + "epoch": 1.1993353228431904, + "grad_norm": 5.887821197509766, + "learning_rate": 6.068247224886941e-06, + "loss": 0.3614, + "step": 29441 + }, + { + "epoch": 1.1993488876831253, + "grad_norm": 4.054635047912598, + "learning_rate": 6.068110182266685e-06, + "loss": 0.1704, + "step": 29442 + }, + { + "epoch": 1.1993624525230602, + "grad_norm": 5.464253902435303, + "learning_rate": 6.06797313964643e-06, + "loss": 0.2234, + "step": 29443 + }, + { + "epoch": 1.199376017362995, + "grad_norm": 8.094425201416016, + "learning_rate": 6.0678360970261756e-06, + "loss": 0.3483, + "step": 29444 + }, + { + "epoch": 1.19938958220293, + "grad_norm": 3.769881010055542, + "learning_rate": 6.067699054405921e-06, + "loss": 0.2154, + "step": 29445 + }, + { + "epoch": 1.1994031470428648, + "grad_norm": 6.162215709686279, + "learning_rate": 6.067562011785666e-06, + "loss": 0.2542, + "step": 29446 + }, + { + "epoch": 1.1994167118827999, + "grad_norm": 4.598709583282471, + "learning_rate": 6.067424969165411e-06, + "loss": 0.2786, + "step": 29447 + }, + { + "epoch": 1.1994302767227347, + "grad_norm": 5.437886714935303, + "learning_rate": 6.0672879265451554e-06, + "loss": 0.2806, + "step": 29448 + }, + { + "epoch": 1.1994438415626696, + "grad_norm": 5.208433151245117, + "learning_rate": 6.0671508839249015e-06, + "loss": 0.2189, + "step": 29449 + }, + { + "epoch": 1.1994574064026045, + "grad_norm": 3.8426647186279297, + "learning_rate": 6.067013841304647e-06, + "loss": 0.2409, + "step": 29450 + }, + { + "epoch": 1.1994709712425393, + "grad_norm": 6.994232654571533, + "learning_rate": 6.066876798684391e-06, + "loss": 0.2259, + "step": 29451 + }, + { + "epoch": 1.1994845360824742, + "grad_norm": 4.596844673156738, + "learning_rate": 6.066739756064136e-06, + "loss": 0.2596, + "step": 29452 + }, + { + "epoch": 1.199498100922409, + "grad_norm": 5.343226432800293, + "learning_rate": 6.066602713443882e-06, + "loss": 0.252, + "step": 29453 + }, + { + "epoch": 1.199511665762344, + "grad_norm": 4.330999374389648, + "learning_rate": 6.0664656708236265e-06, + "loss": 0.2214, + "step": 29454 + }, + { + "epoch": 1.199525230602279, + "grad_norm": 6.043239593505859, + "learning_rate": 6.066328628203372e-06, + "loss": 0.3366, + "step": 29455 + }, + { + "epoch": 1.1995387954422139, + "grad_norm": 6.119742393493652, + "learning_rate": 6.066191585583117e-06, + "loss": 0.3846, + "step": 29456 + }, + { + "epoch": 1.1995523602821487, + "grad_norm": 6.771426200866699, + "learning_rate": 6.066054542962861e-06, + "loss": 0.4715, + "step": 29457 + }, + { + "epoch": 1.1995659251220836, + "grad_norm": 6.908957004547119, + "learning_rate": 6.065917500342607e-06, + "loss": 0.2298, + "step": 29458 + }, + { + "epoch": 1.1995794899620185, + "grad_norm": 7.114611625671387, + "learning_rate": 6.065780457722352e-06, + "loss": 0.3402, + "step": 29459 + }, + { + "epoch": 1.1995930548019533, + "grad_norm": 5.174566745758057, + "learning_rate": 6.065643415102097e-06, + "loss": 0.3928, + "step": 29460 + }, + { + "epoch": 1.1996066196418882, + "grad_norm": 5.3904643058776855, + "learning_rate": 6.065506372481842e-06, + "loss": 0.2268, + "step": 29461 + }, + { + "epoch": 1.199620184481823, + "grad_norm": 3.2831549644470215, + "learning_rate": 6.065369329861588e-06, + "loss": 0.1398, + "step": 29462 + }, + { + "epoch": 1.199633749321758, + "grad_norm": 4.415506362915039, + "learning_rate": 6.065232287241333e-06, + "loss": 0.2052, + "step": 29463 + }, + { + "epoch": 1.1996473141616928, + "grad_norm": 5.223123550415039, + "learning_rate": 6.0650952446210775e-06, + "loss": 0.2285, + "step": 29464 + }, + { + "epoch": 1.1996608790016279, + "grad_norm": 5.681556224822998, + "learning_rate": 6.064958202000823e-06, + "loss": 0.2825, + "step": 29465 + }, + { + "epoch": 1.1996744438415627, + "grad_norm": 5.270027160644531, + "learning_rate": 6.064821159380569e-06, + "loss": 0.28, + "step": 29466 + }, + { + "epoch": 1.1996880086814976, + "grad_norm": 4.9480767250061035, + "learning_rate": 6.064684116760313e-06, + "loss": 0.314, + "step": 29467 + }, + { + "epoch": 1.1997015735214325, + "grad_norm": 5.6923909187316895, + "learning_rate": 6.064547074140058e-06, + "loss": 0.3843, + "step": 29468 + }, + { + "epoch": 1.1997151383613673, + "grad_norm": 4.093871116638184, + "learning_rate": 6.0644100315198025e-06, + "loss": 0.2725, + "step": 29469 + }, + { + "epoch": 1.1997287032013022, + "grad_norm": 6.03287410736084, + "learning_rate": 6.064272988899548e-06, + "loss": 0.3072, + "step": 29470 + }, + { + "epoch": 1.199742268041237, + "grad_norm": 5.044521808624268, + "learning_rate": 6.064135946279294e-06, + "loss": 0.2293, + "step": 29471 + }, + { + "epoch": 1.199755832881172, + "grad_norm": 7.543575763702393, + "learning_rate": 6.063998903659039e-06, + "loss": 0.4417, + "step": 29472 + }, + { + "epoch": 1.1997693977211068, + "grad_norm": 6.596044063568115, + "learning_rate": 6.063861861038783e-06, + "loss": 0.415, + "step": 29473 + }, + { + "epoch": 1.1997829625610419, + "grad_norm": 5.742938995361328, + "learning_rate": 6.063724818418528e-06, + "loss": 0.3196, + "step": 29474 + }, + { + "epoch": 1.1997965274009768, + "grad_norm": 5.6911702156066895, + "learning_rate": 6.063587775798274e-06, + "loss": 0.3352, + "step": 29475 + }, + { + "epoch": 1.1998100922409116, + "grad_norm": 6.43945837020874, + "learning_rate": 6.063450733178019e-06, + "loss": 0.5066, + "step": 29476 + }, + { + "epoch": 1.1998236570808465, + "grad_norm": 5.346480846405029, + "learning_rate": 6.063313690557764e-06, + "loss": 0.1681, + "step": 29477 + }, + { + "epoch": 1.1998372219207813, + "grad_norm": 5.15872859954834, + "learning_rate": 6.063176647937509e-06, + "loss": 0.253, + "step": 29478 + }, + { + "epoch": 1.1998507867607162, + "grad_norm": 6.3206377029418945, + "learning_rate": 6.063039605317254e-06, + "loss": 0.3653, + "step": 29479 + }, + { + "epoch": 1.199864351600651, + "grad_norm": 7.8550028800964355, + "learning_rate": 6.0629025626969995e-06, + "loss": 0.3702, + "step": 29480 + }, + { + "epoch": 1.199877916440586, + "grad_norm": 6.878612518310547, + "learning_rate": 6.062765520076745e-06, + "loss": 0.3302, + "step": 29481 + }, + { + "epoch": 1.1998914812805208, + "grad_norm": 4.281429767608643, + "learning_rate": 6.062628477456489e-06, + "loss": 0.2795, + "step": 29482 + }, + { + "epoch": 1.1999050461204557, + "grad_norm": 5.040634632110596, + "learning_rate": 6.062491434836234e-06, + "loss": 0.1899, + "step": 29483 + }, + { + "epoch": 1.1999186109603908, + "grad_norm": 4.836827278137207, + "learning_rate": 6.06235439221598e-06, + "loss": 0.2452, + "step": 29484 + }, + { + "epoch": 1.1999321758003256, + "grad_norm": 6.556644439697266, + "learning_rate": 6.0622173495957245e-06, + "loss": 0.3962, + "step": 29485 + }, + { + "epoch": 1.1999457406402605, + "grad_norm": 4.628382682800293, + "learning_rate": 6.06208030697547e-06, + "loss": 0.2518, + "step": 29486 + }, + { + "epoch": 1.1999593054801954, + "grad_norm": 6.6590986251831055, + "learning_rate": 6.061943264355215e-06, + "loss": 0.4161, + "step": 29487 + }, + { + "epoch": 1.1999728703201302, + "grad_norm": 6.1248860359191895, + "learning_rate": 6.06180622173496e-06, + "loss": 0.4218, + "step": 29488 + }, + { + "epoch": 1.1999728703201302, + "eval_loss": 0.32295089960098267, + "eval_noise_accuracy": NaN, + "eval_runtime": 4555.0804, + "eval_samples_per_second": 1.103, + "eval_steps_per_second": 0.069, + "eval_wer": 27.617311791418242, + "step": 29488 + }, + { + "epoch": 1.199986435160065, + "grad_norm": 7.855661392211914, + "learning_rate": 6.061669179114705e-06, + "loss": 0.323, + "step": 29489 + }, + { + "epoch": 1.2, + "grad_norm": 5.166101455688477, + "learning_rate": 6.06153213649445e-06, + "loss": 0.2193, + "step": 29490 + }, + { + "epoch": 1.2000135648399348, + "grad_norm": 5.9514288902282715, + "learning_rate": 6.061395093874195e-06, + "loss": 0.3612, + "step": 29491 + }, + { + "epoch": 1.2000271296798697, + "grad_norm": 4.880106449127197, + "learning_rate": 6.061258051253941e-06, + "loss": 0.255, + "step": 29492 + }, + { + "epoch": 2.000013564839935, + "grad_norm": 6.410866737365723, + "learning_rate": 6.061121008633686e-06, + "loss": 0.4722, + "step": 29493 + }, + { + "epoch": 2.0000271296798697, + "grad_norm": 5.79226541519165, + "learning_rate": 6.06098396601343e-06, + "loss": 0.2599, + "step": 29494 + }, + { + "epoch": 2.0000406945198046, + "grad_norm": 5.540696144104004, + "learning_rate": 6.0608469233931755e-06, + "loss": 0.2865, + "step": 29495 + }, + { + "epoch": 2.0000542593597395, + "grad_norm": 4.788917541503906, + "learning_rate": 6.060709880772921e-06, + "loss": 0.2075, + "step": 29496 + }, + { + "epoch": 2.0000678241996743, + "grad_norm": 4.746397972106934, + "learning_rate": 6.060572838152667e-06, + "loss": 0.2257, + "step": 29497 + }, + { + "epoch": 2.000081389039609, + "grad_norm": 5.113203525543213, + "learning_rate": 6.060435795532411e-06, + "loss": 0.1432, + "step": 29498 + }, + { + "epoch": 2.000094953879544, + "grad_norm": 5.383908748626709, + "learning_rate": 6.060298752912156e-06, + "loss": 0.2772, + "step": 29499 + }, + { + "epoch": 2.000108518719479, + "grad_norm": 6.5310869216918945, + "learning_rate": 6.0601617102919005e-06, + "loss": 0.3011, + "step": 29500 + }, + { + "epoch": 2.000122083559414, + "grad_norm": 4.594729900360107, + "learning_rate": 6.0600246676716465e-06, + "loss": 0.2459, + "step": 29501 + }, + { + "epoch": 2.000135648399349, + "grad_norm": 5.437864780426025, + "learning_rate": 6.059887625051392e-06, + "loss": 0.2249, + "step": 29502 + }, + { + "epoch": 2.000149213239284, + "grad_norm": 5.379997730255127, + "learning_rate": 6.059750582431136e-06, + "loss": 0.2732, + "step": 29503 + }, + { + "epoch": 2.000162778079219, + "grad_norm": 6.2502546310424805, + "learning_rate": 6.059613539810881e-06, + "loss": 0.3334, + "step": 29504 + }, + { + "epoch": 2.0001763429191537, + "grad_norm": 5.943936347961426, + "learning_rate": 6.059476497190627e-06, + "loss": 0.2732, + "step": 29505 + }, + { + "epoch": 2.0001899077590886, + "grad_norm": 6.431733131408691, + "learning_rate": 6.0593394545703724e-06, + "loss": 0.3494, + "step": 29506 + }, + { + "epoch": 2.0002034725990234, + "grad_norm": 6.923510551452637, + "learning_rate": 6.059202411950117e-06, + "loss": 0.36, + "step": 29507 + }, + { + "epoch": 2.0002170374389583, + "grad_norm": 4.289192199707031, + "learning_rate": 6.059065369329862e-06, + "loss": 0.2464, + "step": 29508 + }, + { + "epoch": 2.000230602278893, + "grad_norm": 5.852446556091309, + "learning_rate": 6.058928326709608e-06, + "loss": 0.276, + "step": 29509 + }, + { + "epoch": 2.000244167118828, + "grad_norm": 5.387768268585205, + "learning_rate": 6.058791284089352e-06, + "loss": 0.3209, + "step": 29510 + }, + { + "epoch": 2.000257731958763, + "grad_norm": 5.284151554107666, + "learning_rate": 6.0586542414690975e-06, + "loss": 0.1337, + "step": 29511 + }, + { + "epoch": 2.0002712967986978, + "grad_norm": 4.862300395965576, + "learning_rate": 6.058517198848843e-06, + "loss": 0.2405, + "step": 29512 + }, + { + "epoch": 2.0002848616386326, + "grad_norm": 4.358911514282227, + "learning_rate": 6.058380156228587e-06, + "loss": 0.1795, + "step": 29513 + }, + { + "epoch": 2.0002984264785675, + "grad_norm": 4.366707801818848, + "learning_rate": 6.058243113608333e-06, + "loss": 0.2046, + "step": 29514 + }, + { + "epoch": 2.0003119913185023, + "grad_norm": 4.712066650390625, + "learning_rate": 6.058106070988078e-06, + "loss": 0.1393, + "step": 29515 + }, + { + "epoch": 2.000325556158437, + "grad_norm": 4.8091654777526855, + "learning_rate": 6.0579690283678225e-06, + "loss": 0.1942, + "step": 29516 + }, + { + "epoch": 2.000339120998372, + "grad_norm": 5.2159318923950195, + "learning_rate": 6.057831985747568e-06, + "loss": 0.2203, + "step": 29517 + }, + { + "epoch": 2.000352685838307, + "grad_norm": 4.97781229019165, + "learning_rate": 6.057694943127314e-06, + "loss": 0.1472, + "step": 29518 + }, + { + "epoch": 2.000366250678242, + "grad_norm": 6.097066402435303, + "learning_rate": 6.057557900507058e-06, + "loss": 0.1755, + "step": 29519 + }, + { + "epoch": 2.0003798155181767, + "grad_norm": 6.17605447769165, + "learning_rate": 6.057420857886803e-06, + "loss": 0.2189, + "step": 29520 + }, + { + "epoch": 2.000393380358112, + "grad_norm": 3.8523991107940674, + "learning_rate": 6.057283815266548e-06, + "loss": 0.2136, + "step": 29521 + }, + { + "epoch": 2.000406945198047, + "grad_norm": 3.720515727996826, + "learning_rate": 6.0571467726462944e-06, + "loss": 0.1504, + "step": 29522 + }, + { + "epoch": 2.0004205100379817, + "grad_norm": 4.085292339324951, + "learning_rate": 6.057009730026039e-06, + "loss": 0.1184, + "step": 29523 + }, + { + "epoch": 2.0004340748779166, + "grad_norm": 3.9032299518585205, + "learning_rate": 6.056872687405784e-06, + "loss": 0.1472, + "step": 29524 + }, + { + "epoch": 2.0004476397178514, + "grad_norm": 4.471344947814941, + "learning_rate": 6.056735644785528e-06, + "loss": 0.2413, + "step": 29525 + }, + { + "epoch": 2.0004612045577863, + "grad_norm": 5.191699504852295, + "learning_rate": 6.0565986021652735e-06, + "loss": 0.341, + "step": 29526 + }, + { + "epoch": 2.000474769397721, + "grad_norm": 4.1197075843811035, + "learning_rate": 6.0564615595450195e-06, + "loss": 0.1882, + "step": 29527 + }, + { + "epoch": 2.000488334237656, + "grad_norm": 5.319610118865967, + "learning_rate": 6.056324516924764e-06, + "loss": 0.2861, + "step": 29528 + }, + { + "epoch": 2.000501899077591, + "grad_norm": 4.988667964935303, + "learning_rate": 6.056187474304509e-06, + "loss": 0.2271, + "step": 29529 + }, + { + "epoch": 2.0005154639175258, + "grad_norm": 4.518866062164307, + "learning_rate": 6.056050431684254e-06, + "loss": 0.1465, + "step": 29530 + }, + { + "epoch": 2.0005290287574606, + "grad_norm": 5.073981761932373, + "learning_rate": 6.055913389064e-06, + "loss": 0.2259, + "step": 29531 + }, + { + "epoch": 2.0005425935973955, + "grad_norm": 4.894001483917236, + "learning_rate": 6.0557763464437445e-06, + "loss": 0.2869, + "step": 29532 + }, + { + "epoch": 2.0005561584373304, + "grad_norm": 7.125443935394287, + "learning_rate": 6.05563930382349e-06, + "loss": 0.3192, + "step": 29533 + }, + { + "epoch": 2.0005697232772652, + "grad_norm": 4.884639263153076, + "learning_rate": 6.055502261203234e-06, + "loss": 0.2668, + "step": 29534 + }, + { + "epoch": 2.0005832881172, + "grad_norm": 4.512053489685059, + "learning_rate": 6.05536521858298e-06, + "loss": 0.2275, + "step": 29535 + }, + { + "epoch": 2.000596852957135, + "grad_norm": 4.473408222198486, + "learning_rate": 6.055228175962725e-06, + "loss": 0.1819, + "step": 29536 + }, + { + "epoch": 2.00061041779707, + "grad_norm": 5.181739807128906, + "learning_rate": 6.0550911333424704e-06, + "loss": 0.2755, + "step": 29537 + }, + { + "epoch": 2.0006239826370047, + "grad_norm": 5.528360366821289, + "learning_rate": 6.054954090722215e-06, + "loss": 0.3089, + "step": 29538 + }, + { + "epoch": 2.0006375474769396, + "grad_norm": 5.049745082855225, + "learning_rate": 6.05481704810196e-06, + "loss": 0.2509, + "step": 29539 + }, + { + "epoch": 2.000651112316875, + "grad_norm": 4.663800239562988, + "learning_rate": 6.054680005481706e-06, + "loss": 0.191, + "step": 29540 + }, + { + "epoch": 2.0006646771568097, + "grad_norm": 5.286008834838867, + "learning_rate": 6.05454296286145e-06, + "loss": 0.299, + "step": 29541 + }, + { + "epoch": 2.0006782419967446, + "grad_norm": 5.208630561828613, + "learning_rate": 6.0544059202411955e-06, + "loss": 0.2481, + "step": 29542 + }, + { + "epoch": 2.0006918068366795, + "grad_norm": 5.644317626953125, + "learning_rate": 6.05426887762094e-06, + "loss": 0.2675, + "step": 29543 + }, + { + "epoch": 2.0007053716766143, + "grad_norm": 3.7513427734375, + "learning_rate": 6.054131835000686e-06, + "loss": 0.1442, + "step": 29544 + }, + { + "epoch": 2.000718936516549, + "grad_norm": 3.8540244102478027, + "learning_rate": 6.053994792380431e-06, + "loss": 0.1861, + "step": 29545 + }, + { + "epoch": 2.000732501356484, + "grad_norm": 6.571068286895752, + "learning_rate": 6.053857749760176e-06, + "loss": 0.2594, + "step": 29546 + }, + { + "epoch": 2.000746066196419, + "grad_norm": 5.634305953979492, + "learning_rate": 6.0537207071399205e-06, + "loss": 0.2477, + "step": 29547 + }, + { + "epoch": 2.000759631036354, + "grad_norm": 5.461812496185303, + "learning_rate": 6.0535836645196665e-06, + "loss": 0.2992, + "step": 29548 + }, + { + "epoch": 2.0007731958762887, + "grad_norm": 4.327268600463867, + "learning_rate": 6.053446621899412e-06, + "loss": 0.238, + "step": 29549 + }, + { + "epoch": 2.0007867607162235, + "grad_norm": 6.276987075805664, + "learning_rate": 6.053309579279156e-06, + "loss": 0.2029, + "step": 29550 + }, + { + "epoch": 2.0008003255561584, + "grad_norm": 4.6780571937561035, + "learning_rate": 6.053172536658901e-06, + "loss": 0.308, + "step": 29551 + }, + { + "epoch": 2.0008138903960933, + "grad_norm": 6.104717254638672, + "learning_rate": 6.0530354940386456e-06, + "loss": 0.1581, + "step": 29552 + }, + { + "epoch": 2.000827455236028, + "grad_norm": 4.572674751281738, + "learning_rate": 6.052898451418392e-06, + "loss": 0.228, + "step": 29553 + }, + { + "epoch": 2.000841020075963, + "grad_norm": 4.058246612548828, + "learning_rate": 6.052761408798137e-06, + "loss": 0.2214, + "step": 29554 + }, + { + "epoch": 2.000854584915898, + "grad_norm": 5.978023052215576, + "learning_rate": 6.052624366177882e-06, + "loss": 0.2766, + "step": 29555 + }, + { + "epoch": 2.0008681497558327, + "grad_norm": 5.616140842437744, + "learning_rate": 6.052487323557626e-06, + "loss": 0.2102, + "step": 29556 + }, + { + "epoch": 2.0008817145957676, + "grad_norm": 5.329655170440674, + "learning_rate": 6.052350280937372e-06, + "loss": 0.1947, + "step": 29557 + }, + { + "epoch": 2.0008952794357024, + "grad_norm": 5.93106746673584, + "learning_rate": 6.0522132383171175e-06, + "loss": 0.2772, + "step": 29558 + }, + { + "epoch": 2.0009088442756378, + "grad_norm": 6.455223560333252, + "learning_rate": 6.052076195696862e-06, + "loss": 0.2676, + "step": 29559 + }, + { + "epoch": 2.0009224091155726, + "grad_norm": 4.699410915374756, + "learning_rate": 6.051939153076607e-06, + "loss": 0.1623, + "step": 29560 + }, + { + "epoch": 2.0009359739555075, + "grad_norm": 4.360458850860596, + "learning_rate": 6.051802110456353e-06, + "loss": 0.2498, + "step": 29561 + }, + { + "epoch": 2.0009495387954424, + "grad_norm": 5.292986869812012, + "learning_rate": 6.051665067836097e-06, + "loss": 0.3203, + "step": 29562 + }, + { + "epoch": 2.000963103635377, + "grad_norm": 5.689291477203369, + "learning_rate": 6.0515280252158425e-06, + "loss": 0.2207, + "step": 29563 + }, + { + "epoch": 2.000976668475312, + "grad_norm": 6.315918922424316, + "learning_rate": 6.051390982595588e-06, + "loss": 0.2941, + "step": 29564 + }, + { + "epoch": 2.000990233315247, + "grad_norm": 7.146295547485352, + "learning_rate": 6.051253939975332e-06, + "loss": 0.2672, + "step": 29565 + }, + { + "epoch": 2.001003798155182, + "grad_norm": 5.3021440505981445, + "learning_rate": 6.051116897355078e-06, + "loss": 0.2512, + "step": 29566 + }, + { + "epoch": 2.0010173629951167, + "grad_norm": 5.44648551940918, + "learning_rate": 6.050979854734823e-06, + "loss": 0.2293, + "step": 29567 + }, + { + "epoch": 2.0010309278350515, + "grad_norm": 4.977643013000488, + "learning_rate": 6.050842812114568e-06, + "loss": 0.2037, + "step": 29568 + }, + { + "epoch": 2.0010444926749864, + "grad_norm": 5.236270904541016, + "learning_rate": 6.050705769494313e-06, + "loss": 0.2918, + "step": 29569 + }, + { + "epoch": 2.0010580575149213, + "grad_norm": 3.928919553756714, + "learning_rate": 6.050568726874059e-06, + "loss": 0.2199, + "step": 29570 + }, + { + "epoch": 2.001071622354856, + "grad_norm": 4.397491455078125, + "learning_rate": 6.050431684253804e-06, + "loss": 0.2235, + "step": 29571 + }, + { + "epoch": 2.001085187194791, + "grad_norm": 8.961160659790039, + "learning_rate": 6.050294641633548e-06, + "loss": 0.2419, + "step": 29572 + }, + { + "epoch": 2.001098752034726, + "grad_norm": 8.210709571838379, + "learning_rate": 6.0501575990132935e-06, + "loss": 0.2959, + "step": 29573 + }, + { + "epoch": 2.0011123168746607, + "grad_norm": 4.278981685638428, + "learning_rate": 6.0500205563930395e-06, + "loss": 0.1152, + "step": 29574 + }, + { + "epoch": 2.0011258817145956, + "grad_norm": 6.123210906982422, + "learning_rate": 6.049883513772784e-06, + "loss": 0.2794, + "step": 29575 + }, + { + "epoch": 2.0011394465545305, + "grad_norm": 4.432503700256348, + "learning_rate": 6.049746471152529e-06, + "loss": 0.1662, + "step": 29576 + }, + { + "epoch": 2.0011530113944653, + "grad_norm": 5.791211128234863, + "learning_rate": 6.049609428532273e-06, + "loss": 0.1889, + "step": 29577 + }, + { + "epoch": 2.0011665762344006, + "grad_norm": 4.7437849044799805, + "learning_rate": 6.049472385912019e-06, + "loss": 0.2266, + "step": 29578 + }, + { + "epoch": 2.0011801410743355, + "grad_norm": 5.198010444641113, + "learning_rate": 6.0493353432917646e-06, + "loss": 0.301, + "step": 29579 + }, + { + "epoch": 2.0011937059142704, + "grad_norm": 5.436578750610352, + "learning_rate": 6.04919830067151e-06, + "loss": 0.2656, + "step": 29580 + }, + { + "epoch": 2.0012072707542052, + "grad_norm": 4.581027984619141, + "learning_rate": 6.049061258051254e-06, + "loss": 0.2892, + "step": 29581 + }, + { + "epoch": 2.00122083559414, + "grad_norm": 4.8054022789001465, + "learning_rate": 6.048924215430999e-06, + "loss": 0.2574, + "step": 29582 + }, + { + "epoch": 2.001234400434075, + "grad_norm": 6.4369096755981445, + "learning_rate": 6.048787172810745e-06, + "loss": 0.2312, + "step": 29583 + }, + { + "epoch": 2.00124796527401, + "grad_norm": 7.627025127410889, + "learning_rate": 6.04865013019049e-06, + "loss": 0.2331, + "step": 29584 + }, + { + "epoch": 2.0012615301139447, + "grad_norm": 5.229847431182861, + "learning_rate": 6.048513087570235e-06, + "loss": 0.1529, + "step": 29585 + }, + { + "epoch": 2.0012750949538796, + "grad_norm": 3.6267130374908447, + "learning_rate": 6.04837604494998e-06, + "loss": 0.1272, + "step": 29586 + }, + { + "epoch": 2.0012886597938144, + "grad_norm": 4.7611165046691895, + "learning_rate": 6.048239002329725e-06, + "loss": 0.168, + "step": 29587 + }, + { + "epoch": 2.0013022246337493, + "grad_norm": 4.671642303466797, + "learning_rate": 6.04810195970947e-06, + "loss": 0.1601, + "step": 29588 + }, + { + "epoch": 2.001315789473684, + "grad_norm": 5.141688823699951, + "learning_rate": 6.0479649170892155e-06, + "loss": 0.179, + "step": 29589 + }, + { + "epoch": 2.001329354313619, + "grad_norm": 5.313141822814941, + "learning_rate": 6.04782787446896e-06, + "loss": 0.2804, + "step": 29590 + }, + { + "epoch": 2.001342919153554, + "grad_norm": 6.013298988342285, + "learning_rate": 6.047690831848706e-06, + "loss": 0.2509, + "step": 29591 + }, + { + "epoch": 2.0013564839934888, + "grad_norm": 3.866436243057251, + "learning_rate": 6.047553789228451e-06, + "loss": 0.1768, + "step": 29592 + }, + { + "epoch": 2.0013700488334236, + "grad_norm": 5.710753440856934, + "learning_rate": 6.047416746608195e-06, + "loss": 0.3332, + "step": 29593 + }, + { + "epoch": 2.0013836136733585, + "grad_norm": 4.798130512237549, + "learning_rate": 6.0472797039879405e-06, + "loss": 0.173, + "step": 29594 + }, + { + "epoch": 2.0013971785132934, + "grad_norm": 6.884631633758545, + "learning_rate": 6.047142661367686e-06, + "loss": 0.2562, + "step": 29595 + }, + { + "epoch": 2.001410743353228, + "grad_norm": 3.6686441898345947, + "learning_rate": 6.047005618747431e-06, + "loss": 0.16, + "step": 29596 + }, + { + "epoch": 2.0014243081931635, + "grad_norm": 6.917757034301758, + "learning_rate": 6.046868576127176e-06, + "loss": 0.2552, + "step": 29597 + }, + { + "epoch": 2.0014378730330984, + "grad_norm": 7.92914342880249, + "learning_rate": 6.046731533506921e-06, + "loss": 0.3278, + "step": 29598 + }, + { + "epoch": 2.0014514378730333, + "grad_norm": 6.230526447296143, + "learning_rate": 6.046594490886666e-06, + "loss": 0.1906, + "step": 29599 + }, + { + "epoch": 2.001465002712968, + "grad_norm": 4.209604740142822, + "learning_rate": 6.046457448266412e-06, + "loss": 0.1913, + "step": 29600 + }, + { + "epoch": 2.001478567552903, + "grad_norm": 4.508684158325195, + "learning_rate": 6.046320405646157e-06, + "loss": 0.2282, + "step": 29601 + }, + { + "epoch": 2.001492132392838, + "grad_norm": 4.895576477050781, + "learning_rate": 6.046183363025901e-06, + "loss": 0.2556, + "step": 29602 + }, + { + "epoch": 2.0015056972327727, + "grad_norm": 4.651477813720703, + "learning_rate": 6.046046320405646e-06, + "loss": 0.298, + "step": 29603 + }, + { + "epoch": 2.0015192620727076, + "grad_norm": 3.898029327392578, + "learning_rate": 6.045909277785392e-06, + "loss": 0.1621, + "step": 29604 + }, + { + "epoch": 2.0015328269126424, + "grad_norm": 4.323267459869385, + "learning_rate": 6.0457722351651375e-06, + "loss": 0.2299, + "step": 29605 + }, + { + "epoch": 2.0015463917525773, + "grad_norm": 6.018947601318359, + "learning_rate": 6.045635192544882e-06, + "loss": 0.1959, + "step": 29606 + }, + { + "epoch": 2.001559956592512, + "grad_norm": 3.7513606548309326, + "learning_rate": 6.045498149924627e-06, + "loss": 0.2844, + "step": 29607 + }, + { + "epoch": 2.001573521432447, + "grad_norm": 4.106677532196045, + "learning_rate": 6.045361107304371e-06, + "loss": 0.2409, + "step": 29608 + }, + { + "epoch": 2.001587086272382, + "grad_norm": 5.413166522979736, + "learning_rate": 6.045224064684117e-06, + "loss": 0.294, + "step": 29609 + }, + { + "epoch": 2.0016006511123168, + "grad_norm": 5.882267951965332, + "learning_rate": 6.0450870220638626e-06, + "loss": 0.3321, + "step": 29610 + }, + { + "epoch": 2.0016142159522516, + "grad_norm": 4.157297611236572, + "learning_rate": 6.044949979443607e-06, + "loss": 0.222, + "step": 29611 + }, + { + "epoch": 2.0016277807921865, + "grad_norm": 3.1001923084259033, + "learning_rate": 6.044812936823352e-06, + "loss": 0.1373, + "step": 29612 + }, + { + "epoch": 2.0016413456321214, + "grad_norm": 4.021347999572754, + "learning_rate": 6.044675894203098e-06, + "loss": 0.1316, + "step": 29613 + }, + { + "epoch": 2.0016549104720562, + "grad_norm": 3.42928409576416, + "learning_rate": 6.044538851582843e-06, + "loss": 0.1681, + "step": 29614 + }, + { + "epoch": 2.001668475311991, + "grad_norm": 5.92982816696167, + "learning_rate": 6.044401808962588e-06, + "loss": 0.2899, + "step": 29615 + }, + { + "epoch": 2.0016820401519264, + "grad_norm": 3.957064151763916, + "learning_rate": 6.044264766342333e-06, + "loss": 0.2085, + "step": 29616 + }, + { + "epoch": 2.0016956049918613, + "grad_norm": 3.997643232345581, + "learning_rate": 6.044127723722079e-06, + "loss": 0.1833, + "step": 29617 + }, + { + "epoch": 2.001709169831796, + "grad_norm": 3.7190310955047607, + "learning_rate": 6.043990681101823e-06, + "loss": 0.1754, + "step": 29618 + }, + { + "epoch": 2.001722734671731, + "grad_norm": 4.438190937042236, + "learning_rate": 6.043853638481568e-06, + "loss": 0.1789, + "step": 29619 + }, + { + "epoch": 2.001736299511666, + "grad_norm": 4.248712062835693, + "learning_rate": 6.0437165958613135e-06, + "loss": 0.254, + "step": 29620 + }, + { + "epoch": 2.0017498643516007, + "grad_norm": 4.13180685043335, + "learning_rate": 6.043579553241058e-06, + "loss": 0.2066, + "step": 29621 + }, + { + "epoch": 2.0017634291915356, + "grad_norm": 4.112487316131592, + "learning_rate": 6.043442510620804e-06, + "loss": 0.214, + "step": 29622 + }, + { + "epoch": 2.0017769940314705, + "grad_norm": 4.016573429107666, + "learning_rate": 6.043305468000549e-06, + "loss": 0.2203, + "step": 29623 + }, + { + "epoch": 2.0017905588714053, + "grad_norm": 4.671843528747559, + "learning_rate": 6.043168425380293e-06, + "loss": 0.2196, + "step": 29624 + }, + { + "epoch": 2.00180412371134, + "grad_norm": 6.644144535064697, + "learning_rate": 6.0430313827600385e-06, + "loss": 0.218, + "step": 29625 + }, + { + "epoch": 2.001817688551275, + "grad_norm": 5.724316596984863, + "learning_rate": 6.0428943401397846e-06, + "loss": 0.283, + "step": 29626 + }, + { + "epoch": 2.00183125339121, + "grad_norm": 5.495025157928467, + "learning_rate": 6.042757297519529e-06, + "loss": 0.2117, + "step": 29627 + }, + { + "epoch": 2.001844818231145, + "grad_norm": 4.585480213165283, + "learning_rate": 6.042620254899274e-06, + "loss": 0.1727, + "step": 29628 + }, + { + "epoch": 2.0018583830710797, + "grad_norm": 4.200197696685791, + "learning_rate": 6.042483212279019e-06, + "loss": 0.2807, + "step": 29629 + }, + { + "epoch": 2.0018719479110145, + "grad_norm": 6.325399875640869, + "learning_rate": 6.0423461696587644e-06, + "loss": 0.3053, + "step": 29630 + }, + { + "epoch": 2.0018855127509494, + "grad_norm": 4.73615837097168, + "learning_rate": 6.04220912703851e-06, + "loss": 0.232, + "step": 29631 + }, + { + "epoch": 2.0018990775908843, + "grad_norm": 3.7068915367126465, + "learning_rate": 6.042072084418255e-06, + "loss": 0.1998, + "step": 29632 + }, + { + "epoch": 2.001912642430819, + "grad_norm": 4.509664535522461, + "learning_rate": 6.041935041797999e-06, + "loss": 0.2441, + "step": 29633 + }, + { + "epoch": 2.001926207270754, + "grad_norm": 3.9324352741241455, + "learning_rate": 6.041797999177744e-06, + "loss": 0.2076, + "step": 29634 + }, + { + "epoch": 2.0019397721106893, + "grad_norm": 4.004769802093506, + "learning_rate": 6.04166095655749e-06, + "loss": 0.1642, + "step": 29635 + }, + { + "epoch": 2.001953336950624, + "grad_norm": 7.526399612426758, + "learning_rate": 6.041523913937235e-06, + "loss": 0.2998, + "step": 29636 + }, + { + "epoch": 2.001966901790559, + "grad_norm": 3.3809351921081543, + "learning_rate": 6.04138687131698e-06, + "loss": 0.0906, + "step": 29637 + }, + { + "epoch": 2.001980466630494, + "grad_norm": 4.630045413970947, + "learning_rate": 6.041249828696725e-06, + "loss": 0.2108, + "step": 29638 + }, + { + "epoch": 2.0019940314704288, + "grad_norm": 5.495885848999023, + "learning_rate": 6.041112786076471e-06, + "loss": 0.2263, + "step": 29639 + }, + { + "epoch": 2.0020075963103636, + "grad_norm": 4.470329761505127, + "learning_rate": 6.040975743456215e-06, + "loss": 0.2213, + "step": 29640 + }, + { + "epoch": 2.0020211611502985, + "grad_norm": 4.237164497375488, + "learning_rate": 6.0408387008359606e-06, + "loss": 0.1853, + "step": 29641 + }, + { + "epoch": 2.0020347259902334, + "grad_norm": 4.311321258544922, + "learning_rate": 6.040701658215705e-06, + "loss": 0.2027, + "step": 29642 + }, + { + "epoch": 2.002048290830168, + "grad_norm": 6.874748229980469, + "learning_rate": 6.040564615595451e-06, + "loss": 0.3484, + "step": 29643 + }, + { + "epoch": 2.002061855670103, + "grad_norm": 3.264878511428833, + "learning_rate": 6.040427572975196e-06, + "loss": 0.1051, + "step": 29644 + }, + { + "epoch": 2.002075420510038, + "grad_norm": 3.2820098400115967, + "learning_rate": 6.0402905303549404e-06, + "loss": 0.1395, + "step": 29645 + }, + { + "epoch": 2.002088985349973, + "grad_norm": 4.985538482666016, + "learning_rate": 6.040153487734686e-06, + "loss": 0.2789, + "step": 29646 + }, + { + "epoch": 2.0021025501899077, + "grad_norm": 4.676082134246826, + "learning_rate": 6.040016445114432e-06, + "loss": 0.2096, + "step": 29647 + }, + { + "epoch": 2.0021161150298425, + "grad_norm": 4.9660234451293945, + "learning_rate": 6.039879402494177e-06, + "loss": 0.2265, + "step": 29648 + }, + { + "epoch": 2.0021296798697774, + "grad_norm": 5.199625015258789, + "learning_rate": 6.039742359873921e-06, + "loss": 0.2214, + "step": 29649 + }, + { + "epoch": 2.0021432447097123, + "grad_norm": 5.478246688842773, + "learning_rate": 6.039605317253666e-06, + "loss": 0.3256, + "step": 29650 + }, + { + "epoch": 2.002156809549647, + "grad_norm": 4.5969696044921875, + "learning_rate": 6.039468274633411e-06, + "loss": 0.1724, + "step": 29651 + }, + { + "epoch": 2.002170374389582, + "grad_norm": 5.772426605224609, + "learning_rate": 6.039331232013157e-06, + "loss": 0.1734, + "step": 29652 + }, + { + "epoch": 2.002183939229517, + "grad_norm": 6.504081726074219, + "learning_rate": 6.039194189392902e-06, + "loss": 0.2898, + "step": 29653 + }, + { + "epoch": 2.002197504069452, + "grad_norm": 5.906357765197754, + "learning_rate": 6.039057146772647e-06, + "loss": 0.2966, + "step": 29654 + }, + { + "epoch": 2.002211068909387, + "grad_norm": 3.8405799865722656, + "learning_rate": 6.038920104152391e-06, + "loss": 0.146, + "step": 29655 + }, + { + "epoch": 2.002224633749322, + "grad_norm": 4.145849227905273, + "learning_rate": 6.038783061532137e-06, + "loss": 0.2177, + "step": 29656 + }, + { + "epoch": 2.002238198589257, + "grad_norm": 5.921053409576416, + "learning_rate": 6.038646018911883e-06, + "loss": 0.4399, + "step": 29657 + }, + { + "epoch": 2.0022517634291916, + "grad_norm": 4.456338405609131, + "learning_rate": 6.038508976291627e-06, + "loss": 0.1346, + "step": 29658 + }, + { + "epoch": 2.0022653282691265, + "grad_norm": 4.697864055633545, + "learning_rate": 6.038371933671372e-06, + "loss": 0.2441, + "step": 29659 + }, + { + "epoch": 2.0022788931090614, + "grad_norm": 3.419205904006958, + "learning_rate": 6.038234891051118e-06, + "loss": 0.2152, + "step": 29660 + }, + { + "epoch": 2.0022924579489962, + "grad_norm": 4.338498592376709, + "learning_rate": 6.0380978484308624e-06, + "loss": 0.1724, + "step": 29661 + }, + { + "epoch": 2.002306022788931, + "grad_norm": 5.310248851776123, + "learning_rate": 6.037960805810608e-06, + "loss": 0.2682, + "step": 29662 + }, + { + "epoch": 2.002319587628866, + "grad_norm": 6.886302947998047, + "learning_rate": 6.037823763190353e-06, + "loss": 0.3297, + "step": 29663 + }, + { + "epoch": 2.002333152468801, + "grad_norm": 5.434284687042236, + "learning_rate": 6.037686720570097e-06, + "loss": 0.2251, + "step": 29664 + }, + { + "epoch": 2.0023467173087357, + "grad_norm": 4.141078472137451, + "learning_rate": 6.037549677949843e-06, + "loss": 0.1084, + "step": 29665 + }, + { + "epoch": 2.0023602821486706, + "grad_norm": 4.149473667144775, + "learning_rate": 6.037412635329588e-06, + "loss": 0.1568, + "step": 29666 + }, + { + "epoch": 2.0023738469886054, + "grad_norm": 5.064810752868652, + "learning_rate": 6.037275592709333e-06, + "loss": 0.2305, + "step": 29667 + }, + { + "epoch": 2.0023874118285403, + "grad_norm": 4.794678211212158, + "learning_rate": 6.037138550089078e-06, + "loss": 0.2241, + "step": 29668 + }, + { + "epoch": 2.002400976668475, + "grad_norm": 3.668865919113159, + "learning_rate": 6.037001507468824e-06, + "loss": 0.1685, + "step": 29669 + }, + { + "epoch": 2.00241454150841, + "grad_norm": 4.965047359466553, + "learning_rate": 6.036864464848568e-06, + "loss": 0.188, + "step": 29670 + }, + { + "epoch": 2.002428106348345, + "grad_norm": 4.399731159210205, + "learning_rate": 6.036727422228313e-06, + "loss": 0.2546, + "step": 29671 + }, + { + "epoch": 2.0024416711882798, + "grad_norm": 3.8079028129577637, + "learning_rate": 6.0365903796080586e-06, + "loss": 0.1453, + "step": 29672 + }, + { + "epoch": 2.002455236028215, + "grad_norm": 4.634005069732666, + "learning_rate": 6.036453336987805e-06, + "loss": 0.2906, + "step": 29673 + }, + { + "epoch": 2.00246880086815, + "grad_norm": 6.595572471618652, + "learning_rate": 6.036316294367549e-06, + "loss": 0.2501, + "step": 29674 + }, + { + "epoch": 2.002482365708085, + "grad_norm": 4.117857456207275, + "learning_rate": 6.036179251747294e-06, + "loss": 0.1737, + "step": 29675 + }, + { + "epoch": 2.0024959305480197, + "grad_norm": 5.083346366882324, + "learning_rate": 6.0360422091270384e-06, + "loss": 0.2656, + "step": 29676 + }, + { + "epoch": 2.0025094953879545, + "grad_norm": 5.941793918609619, + "learning_rate": 6.035905166506784e-06, + "loss": 0.1488, + "step": 29677 + }, + { + "epoch": 2.0025230602278894, + "grad_norm": 4.312050819396973, + "learning_rate": 6.03576812388653e-06, + "loss": 0.1309, + "step": 29678 + }, + { + "epoch": 2.0025366250678243, + "grad_norm": 3.7455713748931885, + "learning_rate": 6.035631081266274e-06, + "loss": 0.1982, + "step": 29679 + }, + { + "epoch": 2.002550189907759, + "grad_norm": 3.1636860370635986, + "learning_rate": 6.035494038646019e-06, + "loss": 0.1058, + "step": 29680 + }, + { + "epoch": 2.002563754747694, + "grad_norm": 7.065624713897705, + "learning_rate": 6.035356996025764e-06, + "loss": 0.2785, + "step": 29681 + }, + { + "epoch": 2.002577319587629, + "grad_norm": 4.035253047943115, + "learning_rate": 6.03521995340551e-06, + "loss": 0.1606, + "step": 29682 + }, + { + "epoch": 2.0025908844275637, + "grad_norm": 5.379906177520752, + "learning_rate": 6.035082910785255e-06, + "loss": 0.2405, + "step": 29683 + }, + { + "epoch": 2.0026044492674986, + "grad_norm": 4.4467363357543945, + "learning_rate": 6.034945868165e-06, + "loss": 0.2655, + "step": 29684 + }, + { + "epoch": 2.0026180141074335, + "grad_norm": 3.7238683700561523, + "learning_rate": 6.034808825544744e-06, + "loss": 0.1213, + "step": 29685 + }, + { + "epoch": 2.0026315789473683, + "grad_norm": 4.577859401702881, + "learning_rate": 6.03467178292449e-06, + "loss": 0.169, + "step": 29686 + }, + { + "epoch": 2.002645143787303, + "grad_norm": 5.342857360839844, + "learning_rate": 6.034534740304235e-06, + "loss": 0.2793, + "step": 29687 + }, + { + "epoch": 2.002658708627238, + "grad_norm": 4.318839073181152, + "learning_rate": 6.034397697683981e-06, + "loss": 0.1578, + "step": 29688 + }, + { + "epoch": 2.002672273467173, + "grad_norm": 3.7426388263702393, + "learning_rate": 6.034260655063725e-06, + "loss": 0.1485, + "step": 29689 + }, + { + "epoch": 2.002685838307108, + "grad_norm": 4.863033771514893, + "learning_rate": 6.03412361244347e-06, + "loss": 0.1231, + "step": 29690 + }, + { + "epoch": 2.0026994031470426, + "grad_norm": 4.142323017120361, + "learning_rate": 6.033986569823216e-06, + "loss": 0.1694, + "step": 29691 + }, + { + "epoch": 2.002712967986978, + "grad_norm": 3.047004461288452, + "learning_rate": 6.0338495272029604e-06, + "loss": 0.1219, + "step": 29692 + }, + { + "epoch": 2.002726532826913, + "grad_norm": 4.997838497161865, + "learning_rate": 6.033712484582706e-06, + "loss": 0.1854, + "step": 29693 + }, + { + "epoch": 2.0027400976668477, + "grad_norm": 4.196404457092285, + "learning_rate": 6.03357544196245e-06, + "loss": 0.1307, + "step": 29694 + }, + { + "epoch": 2.0027536625067826, + "grad_norm": 4.937524318695068, + "learning_rate": 6.033438399342196e-06, + "loss": 0.181, + "step": 29695 + }, + { + "epoch": 2.0027672273467174, + "grad_norm": 5.2490034103393555, + "learning_rate": 6.033301356721941e-06, + "loss": 0.2324, + "step": 29696 + }, + { + "epoch": 2.0027807921866523, + "grad_norm": 3.198141098022461, + "learning_rate": 6.033164314101686e-06, + "loss": 0.1056, + "step": 29697 + }, + { + "epoch": 2.002794357026587, + "grad_norm": 5.001834392547607, + "learning_rate": 6.033027271481431e-06, + "loss": 0.1453, + "step": 29698 + }, + { + "epoch": 2.002807921866522, + "grad_norm": 4.367541313171387, + "learning_rate": 6.032890228861177e-06, + "loss": 0.1032, + "step": 29699 + }, + { + "epoch": 2.002821486706457, + "grad_norm": 2.7478461265563965, + "learning_rate": 6.032753186240922e-06, + "loss": 0.0863, + "step": 29700 + }, + { + "epoch": 2.0028350515463917, + "grad_norm": 3.7197911739349365, + "learning_rate": 6.032616143620666e-06, + "loss": 0.1395, + "step": 29701 + }, + { + "epoch": 2.0028486163863266, + "grad_norm": 3.768584728240967, + "learning_rate": 6.032479101000411e-06, + "loss": 0.0877, + "step": 29702 + }, + { + "epoch": 2.0028621812262615, + "grad_norm": 4.175812721252441, + "learning_rate": 6.0323420583801566e-06, + "loss": 0.1399, + "step": 29703 + }, + { + "epoch": 2.0028757460661963, + "grad_norm": 4.623100280761719, + "learning_rate": 6.032205015759902e-06, + "loss": 0.1947, + "step": 29704 + }, + { + "epoch": 2.002889310906131, + "grad_norm": 5.020755767822266, + "learning_rate": 6.032067973139647e-06, + "loss": 0.1523, + "step": 29705 + }, + { + "epoch": 2.002902875746066, + "grad_norm": 3.8376190662384033, + "learning_rate": 6.031930930519392e-06, + "loss": 0.109, + "step": 29706 + }, + { + "epoch": 2.002916440586001, + "grad_norm": 4.729328632354736, + "learning_rate": 6.0317938878991364e-06, + "loss": 0.1201, + "step": 29707 + }, + { + "epoch": 2.002930005425936, + "grad_norm": 3.6307528018951416, + "learning_rate": 6.0316568452788825e-06, + "loss": 0.0967, + "step": 29708 + }, + { + "epoch": 2.0029435702658707, + "grad_norm": 3.4684982299804688, + "learning_rate": 6.031519802658628e-06, + "loss": 0.1673, + "step": 29709 + }, + { + "epoch": 2.0029571351058055, + "grad_norm": 4.56478214263916, + "learning_rate": 6.031382760038372e-06, + "loss": 0.1367, + "step": 29710 + }, + { + "epoch": 2.002970699945741, + "grad_norm": 3.7056941986083984, + "learning_rate": 6.031245717418117e-06, + "loss": 0.1808, + "step": 29711 + }, + { + "epoch": 2.0029842647856757, + "grad_norm": 4.128059387207031, + "learning_rate": 6.031108674797863e-06, + "loss": 0.1338, + "step": 29712 + }, + { + "epoch": 2.0029978296256106, + "grad_norm": 5.4561991691589355, + "learning_rate": 6.030971632177608e-06, + "loss": 0.2356, + "step": 29713 + }, + { + "epoch": 2.0030113944655454, + "grad_norm": 4.314090728759766, + "learning_rate": 6.030834589557353e-06, + "loss": 0.1593, + "step": 29714 + }, + { + "epoch": 2.0030249593054803, + "grad_norm": 3.4012961387634277, + "learning_rate": 6.030697546937098e-06, + "loss": 0.1678, + "step": 29715 + }, + { + "epoch": 2.003038524145415, + "grad_norm": 5.51699686050415, + "learning_rate": 6.030560504316842e-06, + "loss": 0.3143, + "step": 29716 + }, + { + "epoch": 2.00305208898535, + "grad_norm": 4.731903553009033, + "learning_rate": 6.030423461696588e-06, + "loss": 0.2348, + "step": 29717 + }, + { + "epoch": 2.003065653825285, + "grad_norm": 3.4341108798980713, + "learning_rate": 6.030286419076333e-06, + "loss": 0.0895, + "step": 29718 + }, + { + "epoch": 2.0030792186652198, + "grad_norm": 3.5952746868133545, + "learning_rate": 6.030149376456078e-06, + "loss": 0.1478, + "step": 29719 + }, + { + "epoch": 2.0030927835051546, + "grad_norm": 3.2644975185394287, + "learning_rate": 6.030012333835823e-06, + "loss": 0.1277, + "step": 29720 + }, + { + "epoch": 2.0031063483450895, + "grad_norm": 3.9152097702026367, + "learning_rate": 6.029875291215569e-06, + "loss": 0.1452, + "step": 29721 + }, + { + "epoch": 2.0031199131850244, + "grad_norm": 4.052661418914795, + "learning_rate": 6.029738248595314e-06, + "loss": 0.0877, + "step": 29722 + }, + { + "epoch": 2.0031334780249592, + "grad_norm": 4.396867275238037, + "learning_rate": 6.0296012059750585e-06, + "loss": 0.1766, + "step": 29723 + }, + { + "epoch": 2.003147042864894, + "grad_norm": 4.542559623718262, + "learning_rate": 6.029464163354804e-06, + "loss": 0.1715, + "step": 29724 + }, + { + "epoch": 2.003160607704829, + "grad_norm": 3.139810800552368, + "learning_rate": 6.02932712073455e-06, + "loss": 0.1351, + "step": 29725 + }, + { + "epoch": 2.003174172544764, + "grad_norm": 5.446795463562012, + "learning_rate": 6.029190078114294e-06, + "loss": 0.1818, + "step": 29726 + }, + { + "epoch": 2.0031877373846987, + "grad_norm": 4.2256693840026855, + "learning_rate": 6.029053035494039e-06, + "loss": 0.1724, + "step": 29727 + }, + { + "epoch": 2.0032013022246336, + "grad_norm": 3.1122610569000244, + "learning_rate": 6.028915992873784e-06, + "loss": 0.084, + "step": 29728 + }, + { + "epoch": 2.0032148670645684, + "grad_norm": 5.126117706298828, + "learning_rate": 6.0287789502535295e-06, + "loss": 0.2585, + "step": 29729 + }, + { + "epoch": 2.0032284319045037, + "grad_norm": 4.908526420593262, + "learning_rate": 6.028641907633275e-06, + "loss": 0.1886, + "step": 29730 + }, + { + "epoch": 2.0032419967444386, + "grad_norm": 3.5914862155914307, + "learning_rate": 6.02850486501302e-06, + "loss": 0.1328, + "step": 29731 + }, + { + "epoch": 2.0032555615843735, + "grad_norm": 5.346248626708984, + "learning_rate": 6.028367822392764e-06, + "loss": 0.1665, + "step": 29732 + }, + { + "epoch": 2.0032691264243083, + "grad_norm": 9.406003952026367, + "learning_rate": 6.028230779772509e-06, + "loss": 0.2051, + "step": 29733 + }, + { + "epoch": 2.003282691264243, + "grad_norm": 4.777210235595703, + "learning_rate": 6.028093737152255e-06, + "loss": 0.1243, + "step": 29734 + }, + { + "epoch": 2.003296256104178, + "grad_norm": 3.80745267868042, + "learning_rate": 6.027956694532e-06, + "loss": 0.1854, + "step": 29735 + }, + { + "epoch": 2.003309820944113, + "grad_norm": 5.336659908294678, + "learning_rate": 6.027819651911745e-06, + "loss": 0.2771, + "step": 29736 + }, + { + "epoch": 2.003323385784048, + "grad_norm": 4.065623760223389, + "learning_rate": 6.02768260929149e-06, + "loss": 0.1204, + "step": 29737 + }, + { + "epoch": 2.0033369506239826, + "grad_norm": 3.9736053943634033, + "learning_rate": 6.027545566671235e-06, + "loss": 0.1746, + "step": 29738 + }, + { + "epoch": 2.0033505154639175, + "grad_norm": 4.96009635925293, + "learning_rate": 6.0274085240509805e-06, + "loss": 0.1718, + "step": 29739 + }, + { + "epoch": 2.0033640803038524, + "grad_norm": 2.9002976417541504, + "learning_rate": 6.027271481430726e-06, + "loss": 0.0699, + "step": 29740 + }, + { + "epoch": 2.0033776451437872, + "grad_norm": 4.557712078094482, + "learning_rate": 6.02713443881047e-06, + "loss": 0.2028, + "step": 29741 + }, + { + "epoch": 2.003391209983722, + "grad_norm": 3.8540422916412354, + "learning_rate": 6.026997396190216e-06, + "loss": 0.1523, + "step": 29742 + }, + { + "epoch": 2.003404774823657, + "grad_norm": 4.726232051849365, + "learning_rate": 6.026860353569961e-06, + "loss": 0.1704, + "step": 29743 + }, + { + "epoch": 2.003418339663592, + "grad_norm": 3.2681260108947754, + "learning_rate": 6.0267233109497055e-06, + "loss": 0.1031, + "step": 29744 + }, + { + "epoch": 2.0034319045035267, + "grad_norm": 4.526285648345947, + "learning_rate": 6.026586268329451e-06, + "loss": 0.1466, + "step": 29745 + }, + { + "epoch": 2.0034454693434616, + "grad_norm": 3.000088930130005, + "learning_rate": 6.026449225709196e-06, + "loss": 0.1037, + "step": 29746 + }, + { + "epoch": 2.0034590341833964, + "grad_norm": 5.856100082397461, + "learning_rate": 6.026312183088942e-06, + "loss": 0.3339, + "step": 29747 + }, + { + "epoch": 2.0034725990233317, + "grad_norm": 2.7399041652679443, + "learning_rate": 6.026175140468686e-06, + "loss": 0.0769, + "step": 29748 + }, + { + "epoch": 2.0034861638632666, + "grad_norm": 3.508047342300415, + "learning_rate": 6.026038097848431e-06, + "loss": 0.1512, + "step": 29749 + }, + { + "epoch": 2.0034997287032015, + "grad_norm": 5.154852867126465, + "learning_rate": 6.025901055228176e-06, + "loss": 0.1237, + "step": 29750 + }, + { + "epoch": 2.0035132935431363, + "grad_norm": 3.8956797122955322, + "learning_rate": 6.025764012607922e-06, + "loss": 0.1834, + "step": 29751 + }, + { + "epoch": 2.003526858383071, + "grad_norm": 3.656111717224121, + "learning_rate": 6.025626969987667e-06, + "loss": 0.1166, + "step": 29752 + }, + { + "epoch": 2.003540423223006, + "grad_norm": 4.57454776763916, + "learning_rate": 6.025489927367411e-06, + "loss": 0.1954, + "step": 29753 + }, + { + "epoch": 2.003553988062941, + "grad_norm": 4.634127140045166, + "learning_rate": 6.0253528847471565e-06, + "loss": 0.1468, + "step": 29754 + }, + { + "epoch": 2.003567552902876, + "grad_norm": 4.438699245452881, + "learning_rate": 6.0252158421269025e-06, + "loss": 0.1508, + "step": 29755 + }, + { + "epoch": 2.0035811177428107, + "grad_norm": 3.117489814758301, + "learning_rate": 6.025078799506648e-06, + "loss": 0.0787, + "step": 29756 + }, + { + "epoch": 2.0035946825827455, + "grad_norm": 3.4326772689819336, + "learning_rate": 6.024941756886392e-06, + "loss": 0.1269, + "step": 29757 + }, + { + "epoch": 2.0036082474226804, + "grad_norm": 5.695893287658691, + "learning_rate": 6.024804714266137e-06, + "loss": 0.2093, + "step": 29758 + }, + { + "epoch": 2.0036218122626153, + "grad_norm": 5.824776649475098, + "learning_rate": 6.0246676716458815e-06, + "loss": 0.1529, + "step": 29759 + }, + { + "epoch": 2.00363537710255, + "grad_norm": 5.158665657043457, + "learning_rate": 6.0245306290256275e-06, + "loss": 0.1578, + "step": 29760 + }, + { + "epoch": 2.003648941942485, + "grad_norm": 4.18060302734375, + "learning_rate": 6.024393586405373e-06, + "loss": 0.1301, + "step": 29761 + }, + { + "epoch": 2.00366250678242, + "grad_norm": 2.318995237350464, + "learning_rate": 6.024256543785118e-06, + "loss": 0.0747, + "step": 29762 + }, + { + "epoch": 2.0036760716223547, + "grad_norm": 4.284292221069336, + "learning_rate": 6.024119501164862e-06, + "loss": 0.1867, + "step": 29763 + }, + { + "epoch": 2.0036896364622896, + "grad_norm": 4.571227550506592, + "learning_rate": 6.023982458544608e-06, + "loss": 0.1497, + "step": 29764 + }, + { + "epoch": 2.0037032013022245, + "grad_norm": 3.9214529991149902, + "learning_rate": 6.0238454159243534e-06, + "loss": 0.1358, + "step": 29765 + }, + { + "epoch": 2.0037167661421593, + "grad_norm": 4.906215667724609, + "learning_rate": 6.023708373304098e-06, + "loss": 0.1703, + "step": 29766 + }, + { + "epoch": 2.0037303309820946, + "grad_norm": 3.0134122371673584, + "learning_rate": 6.023571330683843e-06, + "loss": 0.0863, + "step": 29767 + }, + { + "epoch": 2.0037438958220295, + "grad_norm": 4.72182035446167, + "learning_rate": 6.023434288063589e-06, + "loss": 0.3464, + "step": 29768 + }, + { + "epoch": 2.0037574606619644, + "grad_norm": 3.8070948123931885, + "learning_rate": 6.023297245443333e-06, + "loss": 0.1343, + "step": 29769 + }, + { + "epoch": 2.0037710255018992, + "grad_norm": 2.747476816177368, + "learning_rate": 6.0231602028230785e-06, + "loss": 0.127, + "step": 29770 + }, + { + "epoch": 2.003784590341834, + "grad_norm": 5.8372039794921875, + "learning_rate": 6.023023160202824e-06, + "loss": 0.2221, + "step": 29771 + }, + { + "epoch": 2.003798155181769, + "grad_norm": 6.022825717926025, + "learning_rate": 6.022886117582568e-06, + "loss": 0.246, + "step": 29772 + }, + { + "epoch": 2.003811720021704, + "grad_norm": 6.345205307006836, + "learning_rate": 6.022749074962314e-06, + "loss": 0.2819, + "step": 29773 + }, + { + "epoch": 2.0038252848616387, + "grad_norm": 4.030877590179443, + "learning_rate": 6.022612032342059e-06, + "loss": 0.1822, + "step": 29774 + }, + { + "epoch": 2.0038388497015736, + "grad_norm": 4.653899192810059, + "learning_rate": 6.0224749897218035e-06, + "loss": 0.1969, + "step": 29775 + }, + { + "epoch": 2.0038524145415084, + "grad_norm": 3.5589044094085693, + "learning_rate": 6.022337947101549e-06, + "loss": 0.1515, + "step": 29776 + }, + { + "epoch": 2.0038659793814433, + "grad_norm": 5.84842586517334, + "learning_rate": 6.022200904481295e-06, + "loss": 0.1786, + "step": 29777 + }, + { + "epoch": 2.003879544221378, + "grad_norm": 4.094016075134277, + "learning_rate": 6.022063861861039e-06, + "loss": 0.1723, + "step": 29778 + }, + { + "epoch": 2.003893109061313, + "grad_norm": 5.7938690185546875, + "learning_rate": 6.021926819240784e-06, + "loss": 0.1634, + "step": 29779 + }, + { + "epoch": 2.003906673901248, + "grad_norm": 3.7942698001861572, + "learning_rate": 6.021789776620529e-06, + "loss": 0.1219, + "step": 29780 + }, + { + "epoch": 2.0039202387411827, + "grad_norm": 5.125576496124268, + "learning_rate": 6.0216527340002754e-06, + "loss": 0.2909, + "step": 29781 + }, + { + "epoch": 2.0039338035811176, + "grad_norm": 3.9814400672912598, + "learning_rate": 6.02151569138002e-06, + "loss": 0.2208, + "step": 29782 + }, + { + "epoch": 2.0039473684210525, + "grad_norm": 5.272357940673828, + "learning_rate": 6.021378648759765e-06, + "loss": 0.2525, + "step": 29783 + }, + { + "epoch": 2.0039609332609873, + "grad_norm": 4.505438327789307, + "learning_rate": 6.021241606139509e-06, + "loss": 0.1694, + "step": 29784 + }, + { + "epoch": 2.003974498100922, + "grad_norm": 7.226220607757568, + "learning_rate": 6.0211045635192545e-06, + "loss": 0.2283, + "step": 29785 + }, + { + "epoch": 2.0039880629408575, + "grad_norm": 6.16316556930542, + "learning_rate": 6.0209675208990005e-06, + "loss": 0.1003, + "step": 29786 + }, + { + "epoch": 2.0040016277807924, + "grad_norm": 5.504739284515381, + "learning_rate": 6.020830478278745e-06, + "loss": 0.216, + "step": 29787 + }, + { + "epoch": 2.0040151926207272, + "grad_norm": 4.470648288726807, + "learning_rate": 6.02069343565849e-06, + "loss": 0.114, + "step": 29788 + }, + { + "epoch": 2.004028757460662, + "grad_norm": 4.997382640838623, + "learning_rate": 6.020556393038235e-06, + "loss": 0.1587, + "step": 29789 + }, + { + "epoch": 2.004042322300597, + "grad_norm": 5.215266227722168, + "learning_rate": 6.020419350417981e-06, + "loss": 0.1861, + "step": 29790 + }, + { + "epoch": 2.004055887140532, + "grad_norm": 5.071870803833008, + "learning_rate": 6.0202823077977255e-06, + "loss": 0.2021, + "step": 29791 + }, + { + "epoch": 2.0040694519804667, + "grad_norm": 3.9103598594665527, + "learning_rate": 6.020145265177471e-06, + "loss": 0.139, + "step": 29792 + }, + { + "epoch": 2.0040830168204016, + "grad_norm": 3.2754664421081543, + "learning_rate": 6.020008222557215e-06, + "loss": 0.0771, + "step": 29793 + }, + { + "epoch": 2.0040965816603364, + "grad_norm": 3.3425588607788086, + "learning_rate": 6.019871179936961e-06, + "loss": 0.0864, + "step": 29794 + }, + { + "epoch": 2.0041101465002713, + "grad_norm": 4.674820423126221, + "learning_rate": 6.019734137316706e-06, + "loss": 0.1745, + "step": 29795 + }, + { + "epoch": 2.004123711340206, + "grad_norm": 4.435101509094238, + "learning_rate": 6.0195970946964514e-06, + "loss": 0.2557, + "step": 29796 + }, + { + "epoch": 2.004137276180141, + "grad_norm": 2.4137301445007324, + "learning_rate": 6.019460052076196e-06, + "loss": 0.1184, + "step": 29797 + }, + { + "epoch": 2.004150841020076, + "grad_norm": 5.84463357925415, + "learning_rate": 6.019323009455942e-06, + "loss": 0.1803, + "step": 29798 + }, + { + "epoch": 2.0041644058600108, + "grad_norm": 4.576058387756348, + "learning_rate": 6.019185966835687e-06, + "loss": 0.2273, + "step": 29799 + }, + { + "epoch": 2.0041779706999456, + "grad_norm": 3.8836417198181152, + "learning_rate": 6.019048924215431e-06, + "loss": 0.1241, + "step": 29800 + }, + { + "epoch": 2.0041915355398805, + "grad_norm": 4.895035743713379, + "learning_rate": 6.0189118815951765e-06, + "loss": 0.2081, + "step": 29801 + }, + { + "epoch": 2.0042051003798154, + "grad_norm": 6.935303688049316, + "learning_rate": 6.018774838974921e-06, + "loss": 0.2415, + "step": 29802 + }, + { + "epoch": 2.0042186652197502, + "grad_norm": 3.168222427368164, + "learning_rate": 6.018637796354667e-06, + "loss": 0.1205, + "step": 29803 + }, + { + "epoch": 2.004232230059685, + "grad_norm": 5.608975410461426, + "learning_rate": 6.018500753734412e-06, + "loss": 0.1982, + "step": 29804 + }, + { + "epoch": 2.0042457948996204, + "grad_norm": 6.084989547729492, + "learning_rate": 6.018363711114157e-06, + "loss": 0.2787, + "step": 29805 + }, + { + "epoch": 2.0042593597395553, + "grad_norm": 4.937464237213135, + "learning_rate": 6.0182266684939015e-06, + "loss": 0.0985, + "step": 29806 + }, + { + "epoch": 2.00427292457949, + "grad_norm": 7.101826190948486, + "learning_rate": 6.0180896258736476e-06, + "loss": 0.3255, + "step": 29807 + }, + { + "epoch": 2.004286489419425, + "grad_norm": 3.552216053009033, + "learning_rate": 6.017952583253393e-06, + "loss": 0.1819, + "step": 29808 + }, + { + "epoch": 2.00430005425936, + "grad_norm": 3.30248761177063, + "learning_rate": 6.017815540633137e-06, + "loss": 0.1552, + "step": 29809 + }, + { + "epoch": 2.0043136190992947, + "grad_norm": 5.0783185958862305, + "learning_rate": 6.017678498012882e-06, + "loss": 0.2231, + "step": 29810 + }, + { + "epoch": 2.0043271839392296, + "grad_norm": 4.779599189758301, + "learning_rate": 6.017541455392628e-06, + "loss": 0.1188, + "step": 29811 + }, + { + "epoch": 2.0043407487791645, + "grad_norm": 4.616952896118164, + "learning_rate": 6.017404412772373e-06, + "loss": 0.1745, + "step": 29812 + }, + { + "epoch": 2.0043543136190993, + "grad_norm": 3.708003520965576, + "learning_rate": 6.017267370152118e-06, + "loss": 0.1441, + "step": 29813 + }, + { + "epoch": 2.004367878459034, + "grad_norm": 3.1447322368621826, + "learning_rate": 6.017130327531863e-06, + "loss": 0.0855, + "step": 29814 + }, + { + "epoch": 2.004381443298969, + "grad_norm": 5.479887962341309, + "learning_rate": 6.016993284911607e-06, + "loss": 0.196, + "step": 29815 + }, + { + "epoch": 2.004395008138904, + "grad_norm": 5.17919921875, + "learning_rate": 6.016856242291353e-06, + "loss": 0.181, + "step": 29816 + }, + { + "epoch": 2.004408572978839, + "grad_norm": 4.4521918296813965, + "learning_rate": 6.0167191996710985e-06, + "loss": 0.1434, + "step": 29817 + }, + { + "epoch": 2.0044221378187737, + "grad_norm": 5.582705020904541, + "learning_rate": 6.016582157050843e-06, + "loss": 0.174, + "step": 29818 + }, + { + "epoch": 2.0044357026587085, + "grad_norm": 5.5927605628967285, + "learning_rate": 6.016445114430588e-06, + "loss": 0.2193, + "step": 29819 + }, + { + "epoch": 2.0044492674986434, + "grad_norm": 5.204585075378418, + "learning_rate": 6.016308071810334e-06, + "loss": 0.1718, + "step": 29820 + }, + { + "epoch": 2.0044628323385782, + "grad_norm": 3.5108771324157715, + "learning_rate": 6.016171029190078e-06, + "loss": 0.124, + "step": 29821 + }, + { + "epoch": 2.004476397178513, + "grad_norm": 5.5004353523254395, + "learning_rate": 6.0160339865698235e-06, + "loss": 0.2099, + "step": 29822 + }, + { + "epoch": 2.004489962018448, + "grad_norm": 3.5207035541534424, + "learning_rate": 6.015896943949569e-06, + "loss": 0.0595, + "step": 29823 + }, + { + "epoch": 2.0045035268583833, + "grad_norm": 4.873053073883057, + "learning_rate": 6.015759901329315e-06, + "loss": 0.1893, + "step": 29824 + }, + { + "epoch": 2.004517091698318, + "grad_norm": 5.896127700805664, + "learning_rate": 6.015622858709059e-06, + "loss": 0.2014, + "step": 29825 + }, + { + "epoch": 2.004530656538253, + "grad_norm": 3.734933614730835, + "learning_rate": 6.015485816088804e-06, + "loss": 0.1752, + "step": 29826 + }, + { + "epoch": 2.004544221378188, + "grad_norm": 5.320511817932129, + "learning_rate": 6.015348773468549e-06, + "loss": 0.1644, + "step": 29827 + }, + { + "epoch": 2.0045577862181228, + "grad_norm": 5.148853778839111, + "learning_rate": 6.015211730848294e-06, + "loss": 0.2727, + "step": 29828 + }, + { + "epoch": 2.0045713510580576, + "grad_norm": 4.785950660705566, + "learning_rate": 6.01507468822804e-06, + "loss": 0.1971, + "step": 29829 + }, + { + "epoch": 2.0045849158979925, + "grad_norm": 5.337391376495361, + "learning_rate": 6.014937645607785e-06, + "loss": 0.2236, + "step": 29830 + }, + { + "epoch": 2.0045984807379273, + "grad_norm": 3.7810444831848145, + "learning_rate": 6.014800602987529e-06, + "loss": 0.1167, + "step": 29831 + }, + { + "epoch": 2.004612045577862, + "grad_norm": 6.529689311981201, + "learning_rate": 6.0146635603672745e-06, + "loss": 0.2014, + "step": 29832 + }, + { + "epoch": 2.004625610417797, + "grad_norm": 3.6590092182159424, + "learning_rate": 6.0145265177470205e-06, + "loss": 0.1271, + "step": 29833 + }, + { + "epoch": 2.004639175257732, + "grad_norm": 3.586481809616089, + "learning_rate": 6.014389475126765e-06, + "loss": 0.1253, + "step": 29834 + }, + { + "epoch": 2.004652740097667, + "grad_norm": 5.030137538909912, + "learning_rate": 6.01425243250651e-06, + "loss": 0.1871, + "step": 29835 + }, + { + "epoch": 2.0046663049376017, + "grad_norm": 4.597630500793457, + "learning_rate": 6.014115389886254e-06, + "loss": 0.1176, + "step": 29836 + }, + { + "epoch": 2.0046798697775365, + "grad_norm": 4.180872440338135, + "learning_rate": 6.013978347266e-06, + "loss": 0.1732, + "step": 29837 + }, + { + "epoch": 2.0046934346174714, + "grad_norm": 6.493716716766357, + "learning_rate": 6.0138413046457456e-06, + "loss": 0.3231, + "step": 29838 + }, + { + "epoch": 2.0047069994574063, + "grad_norm": 6.057530403137207, + "learning_rate": 6.013704262025491e-06, + "loss": 0.2808, + "step": 29839 + }, + { + "epoch": 2.004720564297341, + "grad_norm": 4.9205145835876465, + "learning_rate": 6.013567219405235e-06, + "loss": 0.1217, + "step": 29840 + }, + { + "epoch": 2.004734129137276, + "grad_norm": 4.168666839599609, + "learning_rate": 6.01343017678498e-06, + "loss": 0.1898, + "step": 29841 + }, + { + "epoch": 2.004747693977211, + "grad_norm": 3.623704671859741, + "learning_rate": 6.013293134164726e-06, + "loss": 0.1456, + "step": 29842 + }, + { + "epoch": 2.004761258817146, + "grad_norm": 5.5805559158325195, + "learning_rate": 6.013156091544471e-06, + "loss": 0.1644, + "step": 29843 + }, + { + "epoch": 2.004774823657081, + "grad_norm": 4.222209453582764, + "learning_rate": 6.013019048924216e-06, + "loss": 0.1002, + "step": 29844 + }, + { + "epoch": 2.004788388497016, + "grad_norm": 5.196708679199219, + "learning_rate": 6.012882006303961e-06, + "loss": 0.1846, + "step": 29845 + }, + { + "epoch": 2.0048019533369508, + "grad_norm": 5.309334754943848, + "learning_rate": 6.012744963683706e-06, + "loss": 0.1441, + "step": 29846 + }, + { + "epoch": 2.0048155181768856, + "grad_norm": 5.409106731414795, + "learning_rate": 6.012607921063451e-06, + "loss": 0.244, + "step": 29847 + }, + { + "epoch": 2.0048290830168205, + "grad_norm": 5.747700214385986, + "learning_rate": 6.0124708784431965e-06, + "loss": 0.2246, + "step": 29848 + }, + { + "epoch": 2.0048426478567554, + "grad_norm": 4.5885701179504395, + "learning_rate": 6.012333835822941e-06, + "loss": 0.1133, + "step": 29849 + }, + { + "epoch": 2.0048562126966902, + "grad_norm": 5.5200629234313965, + "learning_rate": 6.012196793202687e-06, + "loss": 0.1724, + "step": 29850 + }, + { + "epoch": 2.004869777536625, + "grad_norm": 3.752311944961548, + "learning_rate": 6.012059750582432e-06, + "loss": 0.1427, + "step": 29851 + }, + { + "epoch": 2.00488334237656, + "grad_norm": 3.4885964393615723, + "learning_rate": 6.011922707962176e-06, + "loss": 0.1215, + "step": 29852 + }, + { + "epoch": 2.004896907216495, + "grad_norm": 2.7354846000671387, + "learning_rate": 6.0117856653419215e-06, + "loss": 0.0634, + "step": 29853 + }, + { + "epoch": 2.0049104720564297, + "grad_norm": 4.1649885177612305, + "learning_rate": 6.011648622721667e-06, + "loss": 0.1792, + "step": 29854 + }, + { + "epoch": 2.0049240368963646, + "grad_norm": 2.915454626083374, + "learning_rate": 6.011511580101413e-06, + "loss": 0.0807, + "step": 29855 + }, + { + "epoch": 2.0049376017362994, + "grad_norm": 4.301723003387451, + "learning_rate": 6.011374537481157e-06, + "loss": 0.2154, + "step": 29856 + }, + { + "epoch": 2.0049511665762343, + "grad_norm": 3.7574589252471924, + "learning_rate": 6.011237494860902e-06, + "loss": 0.1053, + "step": 29857 + }, + { + "epoch": 2.004964731416169, + "grad_norm": 4.602481842041016, + "learning_rate": 6.011100452240647e-06, + "loss": 0.2011, + "step": 29858 + }, + { + "epoch": 2.004978296256104, + "grad_norm": 4.491512775421143, + "learning_rate": 6.010963409620393e-06, + "loss": 0.1594, + "step": 29859 + }, + { + "epoch": 2.004991861096039, + "grad_norm": 4.977077007293701, + "learning_rate": 6.010826367000138e-06, + "loss": 0.1902, + "step": 29860 + }, + { + "epoch": 2.0050054259359738, + "grad_norm": 3.462808847427368, + "learning_rate": 6.010689324379882e-06, + "loss": 0.0958, + "step": 29861 + }, + { + "epoch": 2.005018990775909, + "grad_norm": 3.590704917907715, + "learning_rate": 6.010552281759627e-06, + "loss": 0.171, + "step": 29862 + }, + { + "epoch": 2.005032555615844, + "grad_norm": 5.691359519958496, + "learning_rate": 6.010415239139373e-06, + "loss": 0.2173, + "step": 29863 + }, + { + "epoch": 2.005046120455779, + "grad_norm": 7.004158020019531, + "learning_rate": 6.0102781965191185e-06, + "loss": 0.2476, + "step": 29864 + }, + { + "epoch": 2.0050596852957137, + "grad_norm": 5.7188639640808105, + "learning_rate": 6.010141153898863e-06, + "loss": 0.1812, + "step": 29865 + }, + { + "epoch": 2.0050732501356485, + "grad_norm": 6.626070499420166, + "learning_rate": 6.010004111278608e-06, + "loss": 0.2251, + "step": 29866 + }, + { + "epoch": 2.0050868149755834, + "grad_norm": 5.402113437652588, + "learning_rate": 6.009867068658354e-06, + "loss": 0.1491, + "step": 29867 + }, + { + "epoch": 2.0051003798155183, + "grad_norm": 4.799886226654053, + "learning_rate": 6.009730026038098e-06, + "loss": 0.259, + "step": 29868 + }, + { + "epoch": 2.005113944655453, + "grad_norm": 4.418784141540527, + "learning_rate": 6.0095929834178436e-06, + "loss": 0.2132, + "step": 29869 + }, + { + "epoch": 2.005127509495388, + "grad_norm": 4.149484157562256, + "learning_rate": 6.009455940797589e-06, + "loss": 0.1685, + "step": 29870 + }, + { + "epoch": 2.005141074335323, + "grad_norm": 4.820040225982666, + "learning_rate": 6.009318898177333e-06, + "loss": 0.2029, + "step": 29871 + }, + { + "epoch": 2.0051546391752577, + "grad_norm": 5.312498569488525, + "learning_rate": 6.009181855557079e-06, + "loss": 0.1776, + "step": 29872 + }, + { + "epoch": 2.0051682040151926, + "grad_norm": 4.5421037673950195, + "learning_rate": 6.009044812936824e-06, + "loss": 0.2438, + "step": 29873 + }, + { + "epoch": 2.0051817688551274, + "grad_norm": 4.028698444366455, + "learning_rate": 6.008907770316569e-06, + "loss": 0.155, + "step": 29874 + }, + { + "epoch": 2.0051953336950623, + "grad_norm": 3.5881736278533936, + "learning_rate": 6.008770727696314e-06, + "loss": 0.0953, + "step": 29875 + }, + { + "epoch": 2.005208898534997, + "grad_norm": 4.002417087554932, + "learning_rate": 6.00863368507606e-06, + "loss": 0.1528, + "step": 29876 + }, + { + "epoch": 2.005222463374932, + "grad_norm": 4.76049280166626, + "learning_rate": 6.008496642455804e-06, + "loss": 0.214, + "step": 29877 + }, + { + "epoch": 2.005236028214867, + "grad_norm": 5.036396503448486, + "learning_rate": 6.008359599835549e-06, + "loss": 0.2089, + "step": 29878 + }, + { + "epoch": 2.0052495930548018, + "grad_norm": 6.613214492797852, + "learning_rate": 6.0082225572152945e-06, + "loss": 0.1882, + "step": 29879 + }, + { + "epoch": 2.0052631578947366, + "grad_norm": 4.747036457061768, + "learning_rate": 6.00808551459504e-06, + "loss": 0.1774, + "step": 29880 + }, + { + "epoch": 2.005276722734672, + "grad_norm": 3.3063414096832275, + "learning_rate": 6.007948471974785e-06, + "loss": 0.1581, + "step": 29881 + }, + { + "epoch": 2.005290287574607, + "grad_norm": 4.939749717712402, + "learning_rate": 6.00781142935453e-06, + "loss": 0.1519, + "step": 29882 + }, + { + "epoch": 2.0053038524145417, + "grad_norm": 3.7599005699157715, + "learning_rate": 6.007674386734274e-06, + "loss": 0.1743, + "step": 29883 + }, + { + "epoch": 2.0053174172544765, + "grad_norm": 4.816522598266602, + "learning_rate": 6.0075373441140195e-06, + "loss": 0.2014, + "step": 29884 + }, + { + "epoch": 2.0053309820944114, + "grad_norm": 4.251832962036133, + "learning_rate": 6.0074003014937656e-06, + "loss": 0.128, + "step": 29885 + }, + { + "epoch": 2.0053445469343463, + "grad_norm": 6.306318283081055, + "learning_rate": 6.00726325887351e-06, + "loss": 0.2274, + "step": 29886 + }, + { + "epoch": 2.005358111774281, + "grad_norm": 3.9492926597595215, + "learning_rate": 6.007126216253255e-06, + "loss": 0.1355, + "step": 29887 + }, + { + "epoch": 2.005371676614216, + "grad_norm": 3.5866081714630127, + "learning_rate": 6.006989173633e-06, + "loss": 0.1463, + "step": 29888 + }, + { + "epoch": 2.005385241454151, + "grad_norm": 3.766571283340454, + "learning_rate": 6.006852131012746e-06, + "loss": 0.1339, + "step": 29889 + }, + { + "epoch": 2.0053988062940857, + "grad_norm": 4.670563697814941, + "learning_rate": 6.006715088392491e-06, + "loss": 0.1754, + "step": 29890 + }, + { + "epoch": 2.0054123711340206, + "grad_norm": 4.641321659088135, + "learning_rate": 6.006578045772236e-06, + "loss": 0.0915, + "step": 29891 + }, + { + "epoch": 2.0054259359739555, + "grad_norm": 4.739593505859375, + "learning_rate": 6.00644100315198e-06, + "loss": 0.1463, + "step": 29892 + }, + { + "epoch": 2.0054395008138903, + "grad_norm": 4.666049957275391, + "learning_rate": 6.006303960531726e-06, + "loss": 0.183, + "step": 29893 + }, + { + "epoch": 2.005453065653825, + "grad_norm": 4.263364315032959, + "learning_rate": 6.006166917911471e-06, + "loss": 0.1381, + "step": 29894 + }, + { + "epoch": 2.00546663049376, + "grad_norm": 4.626394748687744, + "learning_rate": 6.006029875291216e-06, + "loss": 0.1366, + "step": 29895 + }, + { + "epoch": 2.005480195333695, + "grad_norm": 2.8113105297088623, + "learning_rate": 6.005892832670961e-06, + "loss": 0.1368, + "step": 29896 + }, + { + "epoch": 2.00549376017363, + "grad_norm": 4.147366046905518, + "learning_rate": 6.005755790050706e-06, + "loss": 0.1678, + "step": 29897 + }, + { + "epoch": 2.0055073250135647, + "grad_norm": 3.3657121658325195, + "learning_rate": 6.005618747430452e-06, + "loss": 0.119, + "step": 29898 + }, + { + "epoch": 2.0055208898534995, + "grad_norm": 3.362090826034546, + "learning_rate": 6.005481704810196e-06, + "loss": 0.1247, + "step": 29899 + }, + { + "epoch": 2.005534454693435, + "grad_norm": 3.6831657886505127, + "learning_rate": 6.0053446621899416e-06, + "loss": 0.1067, + "step": 29900 + }, + { + "epoch": 2.0055480195333697, + "grad_norm": 4.800037860870361, + "learning_rate": 6.005207619569686e-06, + "loss": 0.1332, + "step": 29901 + }, + { + "epoch": 2.0055615843733046, + "grad_norm": 4.463134765625, + "learning_rate": 6.005070576949432e-06, + "loss": 0.1832, + "step": 29902 + }, + { + "epoch": 2.0055751492132394, + "grad_norm": 3.591128349304199, + "learning_rate": 6.004933534329177e-06, + "loss": 0.1263, + "step": 29903 + }, + { + "epoch": 2.0055887140531743, + "grad_norm": 6.071329593658447, + "learning_rate": 6.004796491708922e-06, + "loss": 0.2735, + "step": 29904 + }, + { + "epoch": 2.005602278893109, + "grad_norm": 6.470883846282959, + "learning_rate": 6.004659449088667e-06, + "loss": 0.1487, + "step": 29905 + }, + { + "epoch": 2.005615843733044, + "grad_norm": 6.315400123596191, + "learning_rate": 6.004522406468413e-06, + "loss": 0.2653, + "step": 29906 + }, + { + "epoch": 2.005629408572979, + "grad_norm": 3.1490695476531982, + "learning_rate": 6.004385363848158e-06, + "loss": 0.0939, + "step": 29907 + }, + { + "epoch": 2.0056429734129138, + "grad_norm": 3.526594638824463, + "learning_rate": 6.004248321227902e-06, + "loss": 0.1694, + "step": 29908 + }, + { + "epoch": 2.0056565382528486, + "grad_norm": 4.174339771270752, + "learning_rate": 6.004111278607647e-06, + "loss": 0.19, + "step": 29909 + }, + { + "epoch": 2.0056701030927835, + "grad_norm": 5.9642534255981445, + "learning_rate": 6.003974235987392e-06, + "loss": 0.2854, + "step": 29910 + }, + { + "epoch": 2.0056836679327184, + "grad_norm": 4.126433849334717, + "learning_rate": 6.003837193367138e-06, + "loss": 0.1123, + "step": 29911 + }, + { + "epoch": 2.005697232772653, + "grad_norm": 4.742685317993164, + "learning_rate": 6.003700150746883e-06, + "loss": 0.1641, + "step": 29912 + }, + { + "epoch": 2.005710797612588, + "grad_norm": 2.7634904384613037, + "learning_rate": 6.003563108126628e-06, + "loss": 0.0653, + "step": 29913 + }, + { + "epoch": 2.005724362452523, + "grad_norm": 4.151203155517578, + "learning_rate": 6.003426065506372e-06, + "loss": 0.2278, + "step": 29914 + }, + { + "epoch": 2.005737927292458, + "grad_norm": 4.985335826873779, + "learning_rate": 6.003289022886118e-06, + "loss": 0.1812, + "step": 29915 + }, + { + "epoch": 2.0057514921323927, + "grad_norm": 2.915992498397827, + "learning_rate": 6.003151980265864e-06, + "loss": 0.0711, + "step": 29916 + }, + { + "epoch": 2.0057650569723275, + "grad_norm": 3.634657144546509, + "learning_rate": 6.003014937645608e-06, + "loss": 0.0897, + "step": 29917 + }, + { + "epoch": 2.0057786218122624, + "grad_norm": 3.522714853286743, + "learning_rate": 6.002877895025353e-06, + "loss": 0.1164, + "step": 29918 + }, + { + "epoch": 2.0057921866521977, + "grad_norm": 4.9160590171813965, + "learning_rate": 6.002740852405099e-06, + "loss": 0.2063, + "step": 29919 + }, + { + "epoch": 2.0058057514921326, + "grad_norm": 3.9600472450256348, + "learning_rate": 6.0026038097848434e-06, + "loss": 0.1315, + "step": 29920 + }, + { + "epoch": 2.0058193163320674, + "grad_norm": 3.2367706298828125, + "learning_rate": 6.002466767164589e-06, + "loss": 0.124, + "step": 29921 + }, + { + "epoch": 2.0058328811720023, + "grad_norm": 3.3307316303253174, + "learning_rate": 6.002329724544334e-06, + "loss": 0.1094, + "step": 29922 + }, + { + "epoch": 2.005846446011937, + "grad_norm": 3.1403329372406006, + "learning_rate": 6.002192681924078e-06, + "loss": 0.1159, + "step": 29923 + }, + { + "epoch": 2.005860010851872, + "grad_norm": 3.7025036811828613, + "learning_rate": 6.002055639303824e-06, + "loss": 0.0936, + "step": 29924 + }, + { + "epoch": 2.005873575691807, + "grad_norm": 3.6466898918151855, + "learning_rate": 6.001918596683569e-06, + "loss": 0.0796, + "step": 29925 + }, + { + "epoch": 2.0058871405317418, + "grad_norm": 4.8885817527771, + "learning_rate": 6.001781554063314e-06, + "loss": 0.1647, + "step": 29926 + }, + { + "epoch": 2.0059007053716766, + "grad_norm": 4.661799907684326, + "learning_rate": 6.001644511443059e-06, + "loss": 0.1413, + "step": 29927 + }, + { + "epoch": 2.0059142702116115, + "grad_norm": 5.5381269454956055, + "learning_rate": 6.001507468822805e-06, + "loss": 0.202, + "step": 29928 + }, + { + "epoch": 2.0059278350515464, + "grad_norm": 3.258791446685791, + "learning_rate": 6.001370426202549e-06, + "loss": 0.0928, + "step": 29929 + }, + { + "epoch": 2.0059413998914812, + "grad_norm": 5.799801826477051, + "learning_rate": 6.001233383582294e-06, + "loss": 0.1657, + "step": 29930 + }, + { + "epoch": 2.005954964731416, + "grad_norm": 4.624266624450684, + "learning_rate": 6.0010963409620396e-06, + "loss": 0.1859, + "step": 29931 + }, + { + "epoch": 2.005968529571351, + "grad_norm": 6.599963188171387, + "learning_rate": 6.000959298341786e-06, + "loss": 0.2731, + "step": 29932 + }, + { + "epoch": 2.005982094411286, + "grad_norm": 5.326626777648926, + "learning_rate": 6.00082225572153e-06, + "loss": 0.1558, + "step": 29933 + }, + { + "epoch": 2.0059956592512207, + "grad_norm": 3.8339242935180664, + "learning_rate": 6.000685213101275e-06, + "loss": 0.1611, + "step": 29934 + }, + { + "epoch": 2.0060092240911556, + "grad_norm": 5.169824600219727, + "learning_rate": 6.0005481704810194e-06, + "loss": 0.154, + "step": 29935 + }, + { + "epoch": 2.0060227889310904, + "grad_norm": 4.408021926879883, + "learning_rate": 6.0004111278607655e-06, + "loss": 0.2352, + "step": 29936 + }, + { + "epoch": 2.0060363537710253, + "grad_norm": 5.890031814575195, + "learning_rate": 6.000274085240511e-06, + "loss": 0.2362, + "step": 29937 + }, + { + "epoch": 2.0060499186109606, + "grad_norm": 3.5055344104766846, + "learning_rate": 6.000137042620256e-06, + "loss": 0.1277, + "step": 29938 + }, + { + "epoch": 2.0060634834508955, + "grad_norm": 4.974165916442871, + "learning_rate": 6e-06, + "loss": 0.1479, + "step": 29939 + }, + { + "epoch": 2.0060770482908303, + "grad_norm": 4.789752960205078, + "learning_rate": 5.999862957379745e-06, + "loss": 0.2026, + "step": 29940 + }, + { + "epoch": 2.006090613130765, + "grad_norm": 5.545248031616211, + "learning_rate": 5.999725914759491e-06, + "loss": 0.2612, + "step": 29941 + }, + { + "epoch": 2.0061041779707, + "grad_norm": 5.539586544036865, + "learning_rate": 5.999588872139236e-06, + "loss": 0.2153, + "step": 29942 + }, + { + "epoch": 2.006117742810635, + "grad_norm": 5.406344413757324, + "learning_rate": 5.999451829518981e-06, + "loss": 0.1905, + "step": 29943 + }, + { + "epoch": 2.00613130765057, + "grad_norm": 5.0338969230651855, + "learning_rate": 5.999314786898725e-06, + "loss": 0.2013, + "step": 29944 + }, + { + "epoch": 2.0061448724905047, + "grad_norm": 5.298381328582764, + "learning_rate": 5.999177744278471e-06, + "loss": 0.1761, + "step": 29945 + }, + { + "epoch": 2.0061584373304395, + "grad_norm": 5.240994930267334, + "learning_rate": 5.999040701658216e-06, + "loss": 0.1488, + "step": 29946 + }, + { + "epoch": 2.0061720021703744, + "grad_norm": 4.072123050689697, + "learning_rate": 5.998903659037962e-06, + "loss": 0.1509, + "step": 29947 + }, + { + "epoch": 2.0061855670103093, + "grad_norm": 4.834268569946289, + "learning_rate": 5.998766616417706e-06, + "loss": 0.1566, + "step": 29948 + }, + { + "epoch": 2.006199131850244, + "grad_norm": 3.4377100467681885, + "learning_rate": 5.998629573797452e-06, + "loss": 0.1595, + "step": 29949 + }, + { + "epoch": 2.006212696690179, + "grad_norm": 3.7786879539489746, + "learning_rate": 5.998492531177197e-06, + "loss": 0.165, + "step": 29950 + }, + { + "epoch": 2.006226261530114, + "grad_norm": 4.079684734344482, + "learning_rate": 5.9983554885569415e-06, + "loss": 0.2601, + "step": 29951 + }, + { + "epoch": 2.0062398263700487, + "grad_norm": 5.65918493270874, + "learning_rate": 5.998218445936687e-06, + "loss": 0.2204, + "step": 29952 + }, + { + "epoch": 2.0062533912099836, + "grad_norm": 4.418201446533203, + "learning_rate": 5.998081403316432e-06, + "loss": 0.2212, + "step": 29953 + }, + { + "epoch": 2.0062669560499184, + "grad_norm": 5.919445514678955, + "learning_rate": 5.997944360696177e-06, + "loss": 0.2098, + "step": 29954 + }, + { + "epoch": 2.0062805208898533, + "grad_norm": 3.9207441806793213, + "learning_rate": 5.997807318075922e-06, + "loss": 0.2125, + "step": 29955 + }, + { + "epoch": 2.006294085729788, + "grad_norm": 4.50789213180542, + "learning_rate": 5.997670275455667e-06, + "loss": 0.1989, + "step": 29956 + }, + { + "epoch": 2.0063076505697235, + "grad_norm": 4.878339767456055, + "learning_rate": 5.997533232835412e-06, + "loss": 0.2962, + "step": 29957 + }, + { + "epoch": 2.0063212154096584, + "grad_norm": 3.9870684146881104, + "learning_rate": 5.997396190215158e-06, + "loss": 0.1734, + "step": 29958 + }, + { + "epoch": 2.006334780249593, + "grad_norm": 4.982840061187744, + "learning_rate": 5.997259147594903e-06, + "loss": 0.2048, + "step": 29959 + }, + { + "epoch": 2.006348345089528, + "grad_norm": 4.971100330352783, + "learning_rate": 5.997122104974647e-06, + "loss": 0.1641, + "step": 29960 + }, + { + "epoch": 2.006361909929463, + "grad_norm": 5.723292350769043, + "learning_rate": 5.996985062354392e-06, + "loss": 0.2288, + "step": 29961 + }, + { + "epoch": 2.006375474769398, + "grad_norm": 4.383686065673828, + "learning_rate": 5.996848019734138e-06, + "loss": 0.1674, + "step": 29962 + }, + { + "epoch": 2.0063890396093327, + "grad_norm": 4.864990711212158, + "learning_rate": 5.996710977113883e-06, + "loss": 0.2356, + "step": 29963 + }, + { + "epoch": 2.0064026044492675, + "grad_norm": 3.901275157928467, + "learning_rate": 5.996573934493628e-06, + "loss": 0.2497, + "step": 29964 + }, + { + "epoch": 2.0064161692892024, + "grad_norm": 4.862034797668457, + "learning_rate": 5.996436891873373e-06, + "loss": 0.2277, + "step": 29965 + }, + { + "epoch": 2.0064297341291373, + "grad_norm": 5.46804666519165, + "learning_rate": 5.9962998492531174e-06, + "loss": 0.2916, + "step": 29966 + }, + { + "epoch": 2.006443298969072, + "grad_norm": 5.399448394775391, + "learning_rate": 5.9961628066328635e-06, + "loss": 0.3184, + "step": 29967 + }, + { + "epoch": 2.006456863809007, + "grad_norm": 4.198899745941162, + "learning_rate": 5.996025764012609e-06, + "loss": 0.2371, + "step": 29968 + }, + { + "epoch": 2.006470428648942, + "grad_norm": 5.171520709991455, + "learning_rate": 5.995888721392353e-06, + "loss": 0.2542, + "step": 29969 + }, + { + "epoch": 2.0064839934888767, + "grad_norm": 5.208345413208008, + "learning_rate": 5.995751678772098e-06, + "loss": 0.3097, + "step": 29970 + }, + { + "epoch": 2.0064975583288116, + "grad_norm": 5.211331844329834, + "learning_rate": 5.995614636151844e-06, + "loss": 0.3462, + "step": 29971 + }, + { + "epoch": 2.0065111231687465, + "grad_norm": 6.401289463043213, + "learning_rate": 5.995477593531589e-06, + "loss": 0.2847, + "step": 29972 + }, + { + "epoch": 2.0065246880086813, + "grad_norm": 5.270621299743652, + "learning_rate": 5.995340550911334e-06, + "loss": 0.1961, + "step": 29973 + }, + { + "epoch": 2.006538252848616, + "grad_norm": 5.801931858062744, + "learning_rate": 5.995203508291079e-06, + "loss": 0.3046, + "step": 29974 + }, + { + "epoch": 2.006551817688551, + "grad_norm": 5.158419609069824, + "learning_rate": 5.995066465670825e-06, + "loss": 0.2546, + "step": 29975 + }, + { + "epoch": 2.0065653825284864, + "grad_norm": 4.580848693847656, + "learning_rate": 5.994929423050569e-06, + "loss": 0.2949, + "step": 29976 + }, + { + "epoch": 2.0065789473684212, + "grad_norm": 5.25888204574585, + "learning_rate": 5.994792380430314e-06, + "loss": 0.3039, + "step": 29977 + }, + { + "epoch": 2.006592512208356, + "grad_norm": 4.280442714691162, + "learning_rate": 5.994655337810059e-06, + "loss": 0.2855, + "step": 29978 + }, + { + "epoch": 2.006606077048291, + "grad_norm": 4.79707670211792, + "learning_rate": 5.994518295189804e-06, + "loss": 0.3447, + "step": 29979 + }, + { + "epoch": 2.006619641888226, + "grad_norm": 5.512500286102295, + "learning_rate": 5.99438125256955e-06, + "loss": 0.2117, + "step": 29980 + }, + { + "epoch": 2.0066332067281607, + "grad_norm": 8.488304138183594, + "learning_rate": 5.994244209949295e-06, + "loss": 0.3537, + "step": 29981 + }, + { + "epoch": 2.0066467715680956, + "grad_norm": 5.589847087860107, + "learning_rate": 5.9941071673290395e-06, + "loss": 0.2522, + "step": 29982 + }, + { + "epoch": 2.0066603364080304, + "grad_norm": 4.36855411529541, + "learning_rate": 5.993970124708785e-06, + "loss": 0.1822, + "step": 29983 + }, + { + "epoch": 2.0066739012479653, + "grad_norm": 4.915555477142334, + "learning_rate": 5.993833082088531e-06, + "loss": 0.2572, + "step": 29984 + }, + { + "epoch": 2.0066874660879, + "grad_norm": 7.883239269256592, + "learning_rate": 5.993696039468275e-06, + "loss": 0.235, + "step": 29985 + }, + { + "epoch": 2.006701030927835, + "grad_norm": 4.271549224853516, + "learning_rate": 5.99355899684802e-06, + "loss": 0.2718, + "step": 29986 + }, + { + "epoch": 2.00671459576777, + "grad_norm": 5.291656970977783, + "learning_rate": 5.993421954227765e-06, + "loss": 0.2331, + "step": 29987 + }, + { + "epoch": 2.0067281606077048, + "grad_norm": 5.967001438140869, + "learning_rate": 5.9932849116075105e-06, + "loss": 0.3293, + "step": 29988 + }, + { + "epoch": 2.0067417254476396, + "grad_norm": 5.7714715003967285, + "learning_rate": 5.993147868987256e-06, + "loss": 0.2646, + "step": 29989 + }, + { + "epoch": 2.0067552902875745, + "grad_norm": 4.3095574378967285, + "learning_rate": 5.993010826367001e-06, + "loss": 0.2421, + "step": 29990 + }, + { + "epoch": 2.0067688551275094, + "grad_norm": 4.096252918243408, + "learning_rate": 5.992873783746745e-06, + "loss": 0.1984, + "step": 29991 + }, + { + "epoch": 2.006782419967444, + "grad_norm": 5.7565765380859375, + "learning_rate": 5.99273674112649e-06, + "loss": 0.2975, + "step": 29992 + }, + { + "epoch": 2.006795984807379, + "grad_norm": 3.9778501987457275, + "learning_rate": 5.9925996985062364e-06, + "loss": 0.2273, + "step": 29993 + }, + { + "epoch": 2.006809549647314, + "grad_norm": 3.864804744720459, + "learning_rate": 5.992462655885981e-06, + "loss": 0.2097, + "step": 29994 + }, + { + "epoch": 2.0068231144872493, + "grad_norm": 5.200274467468262, + "learning_rate": 5.992325613265726e-06, + "loss": 0.3286, + "step": 29995 + }, + { + "epoch": 2.006836679327184, + "grad_norm": 4.032801628112793, + "learning_rate": 5.992188570645471e-06, + "loss": 0.1921, + "step": 29996 + }, + { + "epoch": 2.006850244167119, + "grad_norm": 4.247580528259277, + "learning_rate": 5.992051528025217e-06, + "loss": 0.1412, + "step": 29997 + }, + { + "epoch": 2.006863809007054, + "grad_norm": 4.203069686889648, + "learning_rate": 5.9919144854049615e-06, + "loss": 0.251, + "step": 29998 + }, + { + "epoch": 2.0068773738469887, + "grad_norm": 4.48220157623291, + "learning_rate": 5.991777442784707e-06, + "loss": 0.1973, + "step": 29999 + }, + { + "epoch": 2.0068909386869236, + "grad_norm": 6.765216827392578, + "learning_rate": 5.991640400164451e-06, + "loss": 0.2805, + "step": 30000 + }, + { + "epoch": 2.0069045035268585, + "grad_norm": 4.79746675491333, + "learning_rate": 5.991503357544197e-06, + "loss": 0.1799, + "step": 30001 + }, + { + "epoch": 2.0069180683667933, + "grad_norm": 4.280520439147949, + "learning_rate": 5.991366314923942e-06, + "loss": 0.1877, + "step": 30002 + }, + { + "epoch": 2.006931633206728, + "grad_norm": 4.262504577636719, + "learning_rate": 5.9912292723036865e-06, + "loss": 0.2346, + "step": 30003 + }, + { + "epoch": 2.006945198046663, + "grad_norm": 4.453530311584473, + "learning_rate": 5.991092229683432e-06, + "loss": 0.2256, + "step": 30004 + }, + { + "epoch": 2.006958762886598, + "grad_norm": 4.6171040534973145, + "learning_rate": 5.990955187063178e-06, + "loss": 0.2594, + "step": 30005 + }, + { + "epoch": 2.006972327726533, + "grad_norm": 4.889510154724121, + "learning_rate": 5.990818144442923e-06, + "loss": 0.2127, + "step": 30006 + }, + { + "epoch": 2.0069858925664676, + "grad_norm": 5.963386535644531, + "learning_rate": 5.990681101822667e-06, + "loss": 0.1975, + "step": 30007 + }, + { + "epoch": 2.0069994574064025, + "grad_norm": 5.692573547363281, + "learning_rate": 5.990544059202412e-06, + "loss": 0.2834, + "step": 30008 + }, + { + "epoch": 2.0070130222463374, + "grad_norm": 6.273273468017578, + "learning_rate": 5.990407016582157e-06, + "loss": 0.2333, + "step": 30009 + }, + { + "epoch": 2.0070265870862722, + "grad_norm": 5.659169673919678, + "learning_rate": 5.990269973961903e-06, + "loss": 0.2697, + "step": 30010 + }, + { + "epoch": 2.007040151926207, + "grad_norm": 6.39816951751709, + "learning_rate": 5.990132931341648e-06, + "loss": 0.3278, + "step": 30011 + }, + { + "epoch": 2.007053716766142, + "grad_norm": 4.985381126403809, + "learning_rate": 5.989995888721392e-06, + "loss": 0.1782, + "step": 30012 + }, + { + "epoch": 2.007067281606077, + "grad_norm": 4.734172344207764, + "learning_rate": 5.9898588461011375e-06, + "loss": 0.2892, + "step": 30013 + }, + { + "epoch": 2.007080846446012, + "grad_norm": 6.352209091186523, + "learning_rate": 5.9897218034808835e-06, + "loss": 0.3012, + "step": 30014 + }, + { + "epoch": 2.007094411285947, + "grad_norm": 4.103928565979004, + "learning_rate": 5.989584760860629e-06, + "loss": 0.1845, + "step": 30015 + }, + { + "epoch": 2.007107976125882, + "grad_norm": 5.3845295906066895, + "learning_rate": 5.989447718240373e-06, + "loss": 0.2362, + "step": 30016 + }, + { + "epoch": 2.0071215409658167, + "grad_norm": 3.8717713356018066, + "learning_rate": 5.989310675620118e-06, + "loss": 0.208, + "step": 30017 + }, + { + "epoch": 2.0071351058057516, + "grad_norm": 5.124373435974121, + "learning_rate": 5.989173632999864e-06, + "loss": 0.1734, + "step": 30018 + }, + { + "epoch": 2.0071486706456865, + "grad_norm": 6.31597375869751, + "learning_rate": 5.9890365903796085e-06, + "loss": 0.3423, + "step": 30019 + }, + { + "epoch": 2.0071622354856213, + "grad_norm": 5.928891658782959, + "learning_rate": 5.988899547759354e-06, + "loss": 0.2497, + "step": 30020 + }, + { + "epoch": 2.007175800325556, + "grad_norm": 7.568945407867432, + "learning_rate": 5.988762505139099e-06, + "loss": 0.6148, + "step": 30021 + }, + { + "epoch": 2.007189365165491, + "grad_norm": 4.329026699066162, + "learning_rate": 5.988625462518843e-06, + "loss": 0.1813, + "step": 30022 + }, + { + "epoch": 2.007202930005426, + "grad_norm": 5.908293724060059, + "learning_rate": 5.988488419898589e-06, + "loss": 0.1477, + "step": 30023 + }, + { + "epoch": 2.007216494845361, + "grad_norm": 4.696905136108398, + "learning_rate": 5.9883513772783344e-06, + "loss": 0.1215, + "step": 30024 + }, + { + "epoch": 2.0072300596852957, + "grad_norm": 14.365432739257812, + "learning_rate": 5.988214334658079e-06, + "loss": 0.2455, + "step": 30025 + }, + { + "epoch": 2.0072436245252305, + "grad_norm": 4.516969680786133, + "learning_rate": 5.988077292037824e-06, + "loss": 0.1672, + "step": 30026 + }, + { + "epoch": 2.0072571893651654, + "grad_norm": 7.319301128387451, + "learning_rate": 5.98794024941757e-06, + "loss": 0.2622, + "step": 30027 + }, + { + "epoch": 2.0072707542051003, + "grad_norm": 5.150638580322266, + "learning_rate": 5.987803206797314e-06, + "loss": 0.2375, + "step": 30028 + }, + { + "epoch": 2.007284319045035, + "grad_norm": 5.6539306640625, + "learning_rate": 5.9876661641770595e-06, + "loss": 0.217, + "step": 30029 + }, + { + "epoch": 2.00729788388497, + "grad_norm": 5.440591335296631, + "learning_rate": 5.987529121556805e-06, + "loss": 0.1819, + "step": 30030 + }, + { + "epoch": 2.007311448724905, + "grad_norm": 5.623834133148193, + "learning_rate": 5.987392078936551e-06, + "loss": 0.285, + "step": 30031 + }, + { + "epoch": 2.0073250135648397, + "grad_norm": 6.154253005981445, + "learning_rate": 5.987255036316295e-06, + "loss": 0.2528, + "step": 30032 + }, + { + "epoch": 2.007338578404775, + "grad_norm": 6.573709487915039, + "learning_rate": 5.98711799369604e-06, + "loss": 0.1755, + "step": 30033 + }, + { + "epoch": 2.00735214324471, + "grad_norm": 4.152193069458008, + "learning_rate": 5.9869809510757845e-06, + "loss": 0.1392, + "step": 30034 + }, + { + "epoch": 2.0073657080846448, + "grad_norm": 4.521434783935547, + "learning_rate": 5.98684390845553e-06, + "loss": 0.1401, + "step": 30035 + }, + { + "epoch": 2.0073792729245796, + "grad_norm": 4.747629165649414, + "learning_rate": 5.986706865835276e-06, + "loss": 0.2684, + "step": 30036 + }, + { + "epoch": 2.0073928377645145, + "grad_norm": 8.117989540100098, + "learning_rate": 5.98656982321502e-06, + "loss": 0.242, + "step": 30037 + }, + { + "epoch": 2.0074064026044494, + "grad_norm": 5.15924072265625, + "learning_rate": 5.986432780594765e-06, + "loss": 0.1439, + "step": 30038 + }, + { + "epoch": 2.0074199674443842, + "grad_norm": 6.183912754058838, + "learning_rate": 5.98629573797451e-06, + "loss": 0.2536, + "step": 30039 + }, + { + "epoch": 2.007433532284319, + "grad_norm": 4.951907634735107, + "learning_rate": 5.9861586953542564e-06, + "loss": 0.2538, + "step": 30040 + }, + { + "epoch": 2.007447097124254, + "grad_norm": 8.238279342651367, + "learning_rate": 5.986021652734001e-06, + "loss": 0.2723, + "step": 30041 + }, + { + "epoch": 2.007460661964189, + "grad_norm": 4.0808892250061035, + "learning_rate": 5.985884610113746e-06, + "loss": 0.1685, + "step": 30042 + }, + { + "epoch": 2.0074742268041237, + "grad_norm": 4.893246650695801, + "learning_rate": 5.98574756749349e-06, + "loss": 0.2309, + "step": 30043 + }, + { + "epoch": 2.0074877916440586, + "grad_norm": 4.319149494171143, + "learning_rate": 5.985610524873236e-06, + "loss": 0.1796, + "step": 30044 + }, + { + "epoch": 2.0075013564839934, + "grad_norm": 4.6858906745910645, + "learning_rate": 5.9854734822529815e-06, + "loss": 0.3146, + "step": 30045 + }, + { + "epoch": 2.0075149213239283, + "grad_norm": 5.459374904632568, + "learning_rate": 5.985336439632727e-06, + "loss": 0.2408, + "step": 30046 + }, + { + "epoch": 2.007528486163863, + "grad_norm": 5.098420143127441, + "learning_rate": 5.985199397012471e-06, + "loss": 0.2401, + "step": 30047 + }, + { + "epoch": 2.007542051003798, + "grad_norm": 4.789816856384277, + "learning_rate": 5.985062354392216e-06, + "loss": 0.208, + "step": 30048 + }, + { + "epoch": 2.007555615843733, + "grad_norm": 5.297523498535156, + "learning_rate": 5.984925311771962e-06, + "loss": 0.2791, + "step": 30049 + }, + { + "epoch": 2.0075691806836677, + "grad_norm": 4.8545637130737305, + "learning_rate": 5.9847882691517065e-06, + "loss": 0.2692, + "step": 30050 + }, + { + "epoch": 2.0075827455236026, + "grad_norm": 4.21722936630249, + "learning_rate": 5.984651226531452e-06, + "loss": 0.193, + "step": 30051 + }, + { + "epoch": 2.007596310363538, + "grad_norm": 4.071099758148193, + "learning_rate": 5.984514183911196e-06, + "loss": 0.179, + "step": 30052 + }, + { + "epoch": 2.007609875203473, + "grad_norm": 4.524535179138184, + "learning_rate": 5.984377141290942e-06, + "loss": 0.2037, + "step": 30053 + }, + { + "epoch": 2.0076234400434076, + "grad_norm": 6.156968116760254, + "learning_rate": 5.984240098670687e-06, + "loss": 0.3064, + "step": 30054 + }, + { + "epoch": 2.0076370048833425, + "grad_norm": 3.794083595275879, + "learning_rate": 5.9841030560504324e-06, + "loss": 0.0674, + "step": 30055 + }, + { + "epoch": 2.0076505697232774, + "grad_norm": 4.746605396270752, + "learning_rate": 5.983966013430177e-06, + "loss": 0.1733, + "step": 30056 + }, + { + "epoch": 2.0076641345632122, + "grad_norm": 5.79870080947876, + "learning_rate": 5.983828970809923e-06, + "loss": 0.2328, + "step": 30057 + }, + { + "epoch": 2.007677699403147, + "grad_norm": 4.07304048538208, + "learning_rate": 5.983691928189668e-06, + "loss": 0.1892, + "step": 30058 + }, + { + "epoch": 2.007691264243082, + "grad_norm": 3.4627721309661865, + "learning_rate": 5.983554885569412e-06, + "loss": 0.1478, + "step": 30059 + }, + { + "epoch": 2.007704829083017, + "grad_norm": 4.468811511993408, + "learning_rate": 5.9834178429491575e-06, + "loss": 0.1264, + "step": 30060 + }, + { + "epoch": 2.0077183939229517, + "grad_norm": 4.680801868438721, + "learning_rate": 5.983280800328903e-06, + "loss": 0.1955, + "step": 30061 + }, + { + "epoch": 2.0077319587628866, + "grad_norm": 3.6545238494873047, + "learning_rate": 5.983143757708648e-06, + "loss": 0.1243, + "step": 30062 + }, + { + "epoch": 2.0077455236028214, + "grad_norm": 2.3782002925872803, + "learning_rate": 5.983006715088393e-06, + "loss": 0.1063, + "step": 30063 + }, + { + "epoch": 2.0077590884427563, + "grad_norm": 5.6069722175598145, + "learning_rate": 5.982869672468138e-06, + "loss": 0.2555, + "step": 30064 + }, + { + "epoch": 2.007772653282691, + "grad_norm": 6.108743190765381, + "learning_rate": 5.9827326298478825e-06, + "loss": 0.1789, + "step": 30065 + }, + { + "epoch": 2.007786218122626, + "grad_norm": 4.024709224700928, + "learning_rate": 5.9825955872276286e-06, + "loss": 0.1821, + "step": 30066 + }, + { + "epoch": 2.007799782962561, + "grad_norm": 4.934719562530518, + "learning_rate": 5.982458544607374e-06, + "loss": 0.1975, + "step": 30067 + }, + { + "epoch": 2.0078133478024958, + "grad_norm": 4.781680583953857, + "learning_rate": 5.982321501987118e-06, + "loss": 0.2075, + "step": 30068 + }, + { + "epoch": 2.0078269126424306, + "grad_norm": 4.05891227722168, + "learning_rate": 5.982184459366863e-06, + "loss": 0.175, + "step": 30069 + }, + { + "epoch": 2.0078404774823655, + "grad_norm": 3.6871187686920166, + "learning_rate": 5.982047416746609e-06, + "loss": 0.1979, + "step": 30070 + }, + { + "epoch": 2.007854042322301, + "grad_norm": 3.4533331394195557, + "learning_rate": 5.981910374126354e-06, + "loss": 0.1312, + "step": 30071 + }, + { + "epoch": 2.0078676071622357, + "grad_norm": 3.9613540172576904, + "learning_rate": 5.981773331506099e-06, + "loss": 0.162, + "step": 30072 + }, + { + "epoch": 2.0078811720021705, + "grad_norm": 4.908271312713623, + "learning_rate": 5.981636288885844e-06, + "loss": 0.2122, + "step": 30073 + }, + { + "epoch": 2.0078947368421054, + "grad_norm": 4.992568492889404, + "learning_rate": 5.981499246265588e-06, + "loss": 0.2517, + "step": 30074 + }, + { + "epoch": 2.0079083016820403, + "grad_norm": 4.386870861053467, + "learning_rate": 5.981362203645334e-06, + "loss": 0.189, + "step": 30075 + }, + { + "epoch": 2.007921866521975, + "grad_norm": 3.999464750289917, + "learning_rate": 5.9812251610250795e-06, + "loss": 0.2162, + "step": 30076 + }, + { + "epoch": 2.00793543136191, + "grad_norm": 7.634922981262207, + "learning_rate": 5.981088118404824e-06, + "loss": 0.1738, + "step": 30077 + }, + { + "epoch": 2.007948996201845, + "grad_norm": 2.880750894546509, + "learning_rate": 5.980951075784569e-06, + "loss": 0.1601, + "step": 30078 + }, + { + "epoch": 2.0079625610417797, + "grad_norm": 6.402960777282715, + "learning_rate": 5.980814033164315e-06, + "loss": 0.2236, + "step": 30079 + }, + { + "epoch": 2.0079761258817146, + "grad_norm": 4.24351692199707, + "learning_rate": 5.98067699054406e-06, + "loss": 0.0902, + "step": 30080 + }, + { + "epoch": 2.0079896907216495, + "grad_norm": 6.094954490661621, + "learning_rate": 5.9805399479238045e-06, + "loss": 0.1726, + "step": 30081 + }, + { + "epoch": 2.0080032555615843, + "grad_norm": 4.156350135803223, + "learning_rate": 5.98040290530355e-06, + "loss": 0.2509, + "step": 30082 + }, + { + "epoch": 2.008016820401519, + "grad_norm": 4.126699447631836, + "learning_rate": 5.980265862683296e-06, + "loss": 0.1448, + "step": 30083 + }, + { + "epoch": 2.008030385241454, + "grad_norm": 2.9566242694854736, + "learning_rate": 5.98012882006304e-06, + "loss": 0.1022, + "step": 30084 + }, + { + "epoch": 2.008043950081389, + "grad_norm": 4.9863176345825195, + "learning_rate": 5.979991777442785e-06, + "loss": 0.2757, + "step": 30085 + }, + { + "epoch": 2.008057514921324, + "grad_norm": 3.486755847930908, + "learning_rate": 5.97985473482253e-06, + "loss": 0.1748, + "step": 30086 + }, + { + "epoch": 2.0080710797612586, + "grad_norm": 4.013486385345459, + "learning_rate": 5.979717692202276e-06, + "loss": 0.1983, + "step": 30087 + }, + { + "epoch": 2.0080846446011935, + "grad_norm": 3.300194025039673, + "learning_rate": 5.979580649582021e-06, + "loss": 0.167, + "step": 30088 + }, + { + "epoch": 2.0080982094411284, + "grad_norm": 4.360215187072754, + "learning_rate": 5.979443606961766e-06, + "loss": 0.1775, + "step": 30089 + }, + { + "epoch": 2.0081117742810637, + "grad_norm": 4.637024402618408, + "learning_rate": 5.97930656434151e-06, + "loss": 0.202, + "step": 30090 + }, + { + "epoch": 2.0081253391209986, + "grad_norm": 4.669125080108643, + "learning_rate": 5.9791695217212555e-06, + "loss": 0.1711, + "step": 30091 + }, + { + "epoch": 2.0081389039609334, + "grad_norm": 5.623705863952637, + "learning_rate": 5.9790324791010015e-06, + "loss": 0.2894, + "step": 30092 + }, + { + "epoch": 2.0081524688008683, + "grad_norm": 6.037446975708008, + "learning_rate": 5.978895436480746e-06, + "loss": 0.2978, + "step": 30093 + }, + { + "epoch": 2.008166033640803, + "grad_norm": 5.080705642700195, + "learning_rate": 5.978758393860491e-06, + "loss": 0.1765, + "step": 30094 + }, + { + "epoch": 2.008179598480738, + "grad_norm": 6.2432475090026855, + "learning_rate": 5.978621351240236e-06, + "loss": 0.3036, + "step": 30095 + }, + { + "epoch": 2.008193163320673, + "grad_norm": 6.520114898681641, + "learning_rate": 5.978484308619981e-06, + "loss": 0.2557, + "step": 30096 + }, + { + "epoch": 2.0082067281606077, + "grad_norm": 6.980972766876221, + "learning_rate": 5.9783472659997266e-06, + "loss": 0.2272, + "step": 30097 + }, + { + "epoch": 2.0082202930005426, + "grad_norm": 5.142707347869873, + "learning_rate": 5.978210223379472e-06, + "loss": 0.1668, + "step": 30098 + }, + { + "epoch": 2.0082338578404775, + "grad_norm": 4.104881286621094, + "learning_rate": 5.978073180759216e-06, + "loss": 0.1762, + "step": 30099 + }, + { + "epoch": 2.0082474226804123, + "grad_norm": 4.178413391113281, + "learning_rate": 5.977936138138962e-06, + "loss": 0.2542, + "step": 30100 + }, + { + "epoch": 2.008260987520347, + "grad_norm": 4.807538032531738, + "learning_rate": 5.977799095518707e-06, + "loss": 0.2306, + "step": 30101 + }, + { + "epoch": 2.008274552360282, + "grad_norm": 5.233449935913086, + "learning_rate": 5.977662052898452e-06, + "loss": 0.2702, + "step": 30102 + }, + { + "epoch": 2.008288117200217, + "grad_norm": 5.384355545043945, + "learning_rate": 5.977525010278197e-06, + "loss": 0.2343, + "step": 30103 + }, + { + "epoch": 2.008301682040152, + "grad_norm": 4.109414100646973, + "learning_rate": 5.977387967657942e-06, + "loss": 0.1146, + "step": 30104 + }, + { + "epoch": 2.0083152468800867, + "grad_norm": 7.450100898742676, + "learning_rate": 5.977250925037687e-06, + "loss": 0.2919, + "step": 30105 + }, + { + "epoch": 2.0083288117200215, + "grad_norm": 5.649921417236328, + "learning_rate": 5.977113882417432e-06, + "loss": 0.2177, + "step": 30106 + }, + { + "epoch": 2.0083423765599564, + "grad_norm": 5.348848819732666, + "learning_rate": 5.9769768397971775e-06, + "loss": 0.1558, + "step": 30107 + }, + { + "epoch": 2.0083559413998913, + "grad_norm": 3.793738603591919, + "learning_rate": 5.976839797176922e-06, + "loss": 0.2254, + "step": 30108 + }, + { + "epoch": 2.0083695062398266, + "grad_norm": 4.347303867340088, + "learning_rate": 5.976702754556668e-06, + "loss": 0.1838, + "step": 30109 + }, + { + "epoch": 2.0083830710797614, + "grad_norm": 4.3540940284729, + "learning_rate": 5.976565711936413e-06, + "loss": 0.271, + "step": 30110 + }, + { + "epoch": 2.0083966359196963, + "grad_norm": 3.983259916305542, + "learning_rate": 5.976428669316157e-06, + "loss": 0.1948, + "step": 30111 + }, + { + "epoch": 2.008410200759631, + "grad_norm": 4.408810138702393, + "learning_rate": 5.9762916266959025e-06, + "loss": 0.2902, + "step": 30112 + }, + { + "epoch": 2.008423765599566, + "grad_norm": 4.2894744873046875, + "learning_rate": 5.9761545840756486e-06, + "loss": 0.197, + "step": 30113 + }, + { + "epoch": 2.008437330439501, + "grad_norm": 7.146726131439209, + "learning_rate": 5.976017541455394e-06, + "loss": 0.3281, + "step": 30114 + }, + { + "epoch": 2.0084508952794358, + "grad_norm": 4.110563278198242, + "learning_rate": 5.975880498835138e-06, + "loss": 0.1605, + "step": 30115 + }, + { + "epoch": 2.0084644601193706, + "grad_norm": 6.341118335723877, + "learning_rate": 5.975743456214883e-06, + "loss": 0.2652, + "step": 30116 + }, + { + "epoch": 2.0084780249593055, + "grad_norm": 3.789991617202759, + "learning_rate": 5.975606413594628e-06, + "loss": 0.1435, + "step": 30117 + }, + { + "epoch": 2.0084915897992404, + "grad_norm": 5.49022102355957, + "learning_rate": 5.975469370974374e-06, + "loss": 0.2439, + "step": 30118 + }, + { + "epoch": 2.0085051546391752, + "grad_norm": 5.7368879318237305, + "learning_rate": 5.975332328354119e-06, + "loss": 0.2593, + "step": 30119 + }, + { + "epoch": 2.00851871947911, + "grad_norm": 4.703343391418457, + "learning_rate": 5.975195285733863e-06, + "loss": 0.198, + "step": 30120 + }, + { + "epoch": 2.008532284319045, + "grad_norm": 4.034531116485596, + "learning_rate": 5.975058243113608e-06, + "loss": 0.1337, + "step": 30121 + }, + { + "epoch": 2.00854584915898, + "grad_norm": 5.286866664886475, + "learning_rate": 5.974921200493354e-06, + "loss": 0.2924, + "step": 30122 + }, + { + "epoch": 2.0085594139989147, + "grad_norm": 5.535855770111084, + "learning_rate": 5.9747841578730995e-06, + "loss": 0.3669, + "step": 30123 + }, + { + "epoch": 2.0085729788388496, + "grad_norm": 4.204042434692383, + "learning_rate": 5.974647115252844e-06, + "loss": 0.1773, + "step": 30124 + }, + { + "epoch": 2.0085865436787844, + "grad_norm": 6.996128559112549, + "learning_rate": 5.974510072632589e-06, + "loss": 0.3556, + "step": 30125 + }, + { + "epoch": 2.0086001085187193, + "grad_norm": 3.6038665771484375, + "learning_rate": 5.974373030012335e-06, + "loss": 0.2033, + "step": 30126 + }, + { + "epoch": 2.008613673358654, + "grad_norm": 4.306238174438477, + "learning_rate": 5.974235987392079e-06, + "loss": 0.1891, + "step": 30127 + }, + { + "epoch": 2.0086272381985895, + "grad_norm": 4.541459083557129, + "learning_rate": 5.9740989447718246e-06, + "loss": 0.261, + "step": 30128 + }, + { + "epoch": 2.0086408030385243, + "grad_norm": 4.59633207321167, + "learning_rate": 5.97396190215157e-06, + "loss": 0.1795, + "step": 30129 + }, + { + "epoch": 2.008654367878459, + "grad_norm": 4.201813697814941, + "learning_rate": 5.973824859531314e-06, + "loss": 0.1634, + "step": 30130 + }, + { + "epoch": 2.008667932718394, + "grad_norm": 5.7313361167907715, + "learning_rate": 5.97368781691106e-06, + "loss": 0.2426, + "step": 30131 + }, + { + "epoch": 2.008681497558329, + "grad_norm": 6.348270416259766, + "learning_rate": 5.973550774290805e-06, + "loss": 0.34, + "step": 30132 + }, + { + "epoch": 2.008695062398264, + "grad_norm": 3.5479977130889893, + "learning_rate": 5.97341373167055e-06, + "loss": 0.158, + "step": 30133 + }, + { + "epoch": 2.0087086272381987, + "grad_norm": 5.154605865478516, + "learning_rate": 5.973276689050295e-06, + "loss": 0.192, + "step": 30134 + }, + { + "epoch": 2.0087221920781335, + "grad_norm": 4.66636323928833, + "learning_rate": 5.973139646430041e-06, + "loss": 0.2369, + "step": 30135 + }, + { + "epoch": 2.0087357569180684, + "grad_norm": 4.5623250007629395, + "learning_rate": 5.973002603809785e-06, + "loss": 0.2723, + "step": 30136 + }, + { + "epoch": 2.0087493217580032, + "grad_norm": 3.2180166244506836, + "learning_rate": 5.97286556118953e-06, + "loss": 0.1712, + "step": 30137 + }, + { + "epoch": 2.008762886597938, + "grad_norm": 3.8522796630859375, + "learning_rate": 5.9727285185692755e-06, + "loss": 0.2065, + "step": 30138 + }, + { + "epoch": 2.008776451437873, + "grad_norm": 4.320857048034668, + "learning_rate": 5.9725914759490215e-06, + "loss": 0.1629, + "step": 30139 + }, + { + "epoch": 2.008790016277808, + "grad_norm": 4.87664794921875, + "learning_rate": 5.972454433328766e-06, + "loss": 0.272, + "step": 30140 + }, + { + "epoch": 2.0088035811177427, + "grad_norm": 4.778518199920654, + "learning_rate": 5.972317390708511e-06, + "loss": 0.2334, + "step": 30141 + }, + { + "epoch": 2.0088171459576776, + "grad_norm": 4.493358135223389, + "learning_rate": 5.972180348088255e-06, + "loss": 0.1309, + "step": 30142 + }, + { + "epoch": 2.0088307107976124, + "grad_norm": 5.072476387023926, + "learning_rate": 5.9720433054680006e-06, + "loss": 0.2942, + "step": 30143 + }, + { + "epoch": 2.0088442756375473, + "grad_norm": 3.7498202323913574, + "learning_rate": 5.9719062628477466e-06, + "loss": 0.2666, + "step": 30144 + }, + { + "epoch": 2.008857840477482, + "grad_norm": 4.756328582763672, + "learning_rate": 5.971769220227491e-06, + "loss": 0.2065, + "step": 30145 + }, + { + "epoch": 2.008871405317417, + "grad_norm": 4.776732444763184, + "learning_rate": 5.971632177607236e-06, + "loss": 0.2195, + "step": 30146 + }, + { + "epoch": 2.0088849701573523, + "grad_norm": 4.4739789962768555, + "learning_rate": 5.971495134986981e-06, + "loss": 0.2048, + "step": 30147 + }, + { + "epoch": 2.008898534997287, + "grad_norm": 5.456634044647217, + "learning_rate": 5.971358092366727e-06, + "loss": 0.2464, + "step": 30148 + }, + { + "epoch": 2.008912099837222, + "grad_norm": 4.267604351043701, + "learning_rate": 5.971221049746472e-06, + "loss": 0.1545, + "step": 30149 + }, + { + "epoch": 2.008925664677157, + "grad_norm": 3.9055464267730713, + "learning_rate": 5.971084007126217e-06, + "loss": 0.1505, + "step": 30150 + }, + { + "epoch": 2.008939229517092, + "grad_norm": 2.715775966644287, + "learning_rate": 5.970946964505961e-06, + "loss": 0.1113, + "step": 30151 + }, + { + "epoch": 2.0089527943570267, + "grad_norm": 5.727941036224365, + "learning_rate": 5.970809921885707e-06, + "loss": 0.219, + "step": 30152 + }, + { + "epoch": 2.0089663591969615, + "grad_norm": 4.301347732543945, + "learning_rate": 5.970672879265452e-06, + "loss": 0.2451, + "step": 30153 + }, + { + "epoch": 2.0089799240368964, + "grad_norm": 2.7883071899414062, + "learning_rate": 5.970535836645197e-06, + "loss": 0.0959, + "step": 30154 + }, + { + "epoch": 2.0089934888768313, + "grad_norm": 4.106019496917725, + "learning_rate": 5.970398794024942e-06, + "loss": 0.2009, + "step": 30155 + }, + { + "epoch": 2.009007053716766, + "grad_norm": 3.062331199645996, + "learning_rate": 5.970261751404688e-06, + "loss": 0.111, + "step": 30156 + }, + { + "epoch": 2.009020618556701, + "grad_norm": 2.958333730697632, + "learning_rate": 5.970124708784433e-06, + "loss": 0.0697, + "step": 30157 + }, + { + "epoch": 2.009034183396636, + "grad_norm": 3.1998291015625, + "learning_rate": 5.969987666164177e-06, + "loss": 0.1261, + "step": 30158 + }, + { + "epoch": 2.0090477482365707, + "grad_norm": 4.4423394203186035, + "learning_rate": 5.9698506235439226e-06, + "loss": 0.1831, + "step": 30159 + }, + { + "epoch": 2.0090613130765056, + "grad_norm": 3.9204258918762207, + "learning_rate": 5.969713580923667e-06, + "loss": 0.1223, + "step": 30160 + }, + { + "epoch": 2.0090748779164405, + "grad_norm": 5.245474815368652, + "learning_rate": 5.969576538303413e-06, + "loss": 0.1152, + "step": 30161 + }, + { + "epoch": 2.0090884427563753, + "grad_norm": 4.006704807281494, + "learning_rate": 5.969439495683158e-06, + "loss": 0.1626, + "step": 30162 + }, + { + "epoch": 2.00910200759631, + "grad_norm": 2.9223227500915527, + "learning_rate": 5.969302453062903e-06, + "loss": 0.1548, + "step": 30163 + }, + { + "epoch": 2.009115572436245, + "grad_norm": 3.5789411067962646, + "learning_rate": 5.969165410442648e-06, + "loss": 0.1357, + "step": 30164 + }, + { + "epoch": 2.00912913727618, + "grad_norm": 3.8481688499450684, + "learning_rate": 5.969028367822394e-06, + "loss": 0.1858, + "step": 30165 + }, + { + "epoch": 2.0091427021161152, + "grad_norm": 2.9038968086242676, + "learning_rate": 5.968891325202139e-06, + "loss": 0.1251, + "step": 30166 + }, + { + "epoch": 2.00915626695605, + "grad_norm": 5.101450443267822, + "learning_rate": 5.968754282581883e-06, + "loss": 0.1987, + "step": 30167 + }, + { + "epoch": 2.009169831795985, + "grad_norm": 3.8437821865081787, + "learning_rate": 5.968617239961628e-06, + "loss": 0.1677, + "step": 30168 + }, + { + "epoch": 2.00918339663592, + "grad_norm": 2.8522586822509766, + "learning_rate": 5.968480197341374e-06, + "loss": 0.0985, + "step": 30169 + }, + { + "epoch": 2.0091969614758547, + "grad_norm": 3.3569111824035645, + "learning_rate": 5.968343154721119e-06, + "loss": 0.1295, + "step": 30170 + }, + { + "epoch": 2.0092105263157896, + "grad_norm": 2.9323065280914307, + "learning_rate": 5.968206112100864e-06, + "loss": 0.1156, + "step": 30171 + }, + { + "epoch": 2.0092240911557244, + "grad_norm": 6.405881881713867, + "learning_rate": 5.968069069480609e-06, + "loss": 0.2738, + "step": 30172 + }, + { + "epoch": 2.0092376559956593, + "grad_norm": 3.297238349914551, + "learning_rate": 5.967932026860353e-06, + "loss": 0.1112, + "step": 30173 + }, + { + "epoch": 2.009251220835594, + "grad_norm": 4.204854488372803, + "learning_rate": 5.967794984240099e-06, + "loss": 0.1514, + "step": 30174 + }, + { + "epoch": 2.009264785675529, + "grad_norm": 3.885345220565796, + "learning_rate": 5.967657941619845e-06, + "loss": 0.1981, + "step": 30175 + }, + { + "epoch": 2.009278350515464, + "grad_norm": 5.877448558807373, + "learning_rate": 5.967520898999589e-06, + "loss": 0.2161, + "step": 30176 + }, + { + "epoch": 2.0092919153553987, + "grad_norm": 4.46683931350708, + "learning_rate": 5.967383856379334e-06, + "loss": 0.2245, + "step": 30177 + }, + { + "epoch": 2.0093054801953336, + "grad_norm": 3.929213285446167, + "learning_rate": 5.96724681375908e-06, + "loss": 0.1988, + "step": 30178 + }, + { + "epoch": 2.0093190450352685, + "grad_norm": 4.422571182250977, + "learning_rate": 5.9671097711388244e-06, + "loss": 0.1906, + "step": 30179 + }, + { + "epoch": 2.0093326098752033, + "grad_norm": 4.484927177429199, + "learning_rate": 5.96697272851857e-06, + "loss": 0.2251, + "step": 30180 + }, + { + "epoch": 2.009346174715138, + "grad_norm": 3.948806047439575, + "learning_rate": 5.966835685898315e-06, + "loss": 0.2225, + "step": 30181 + }, + { + "epoch": 2.009359739555073, + "grad_norm": 6.037754535675049, + "learning_rate": 5.966698643278061e-06, + "loss": 0.2669, + "step": 30182 + }, + { + "epoch": 2.009373304395008, + "grad_norm": 5.223752498626709, + "learning_rate": 5.966561600657805e-06, + "loss": 0.2974, + "step": 30183 + }, + { + "epoch": 2.009386869234943, + "grad_norm": 3.5804524421691895, + "learning_rate": 5.96642455803755e-06, + "loss": 0.1265, + "step": 30184 + }, + { + "epoch": 2.009400434074878, + "grad_norm": 3.2156903743743896, + "learning_rate": 5.966287515417295e-06, + "loss": 0.1284, + "step": 30185 + }, + { + "epoch": 2.009413998914813, + "grad_norm": 4.6980509757995605, + "learning_rate": 5.96615047279704e-06, + "loss": 0.2753, + "step": 30186 + }, + { + "epoch": 2.009427563754748, + "grad_norm": 4.612116813659668, + "learning_rate": 5.966013430176786e-06, + "loss": 0.1891, + "step": 30187 + }, + { + "epoch": 2.0094411285946827, + "grad_norm": 5.871756553649902, + "learning_rate": 5.965876387556531e-06, + "loss": 0.1964, + "step": 30188 + }, + { + "epoch": 2.0094546934346176, + "grad_norm": 4.063449382781982, + "learning_rate": 5.965739344936275e-06, + "loss": 0.2729, + "step": 30189 + }, + { + "epoch": 2.0094682582745524, + "grad_norm": 4.450484275817871, + "learning_rate": 5.9656023023160206e-06, + "loss": 0.1992, + "step": 30190 + }, + { + "epoch": 2.0094818231144873, + "grad_norm": 6.2781782150268555, + "learning_rate": 5.965465259695767e-06, + "loss": 0.3233, + "step": 30191 + }, + { + "epoch": 2.009495387954422, + "grad_norm": 4.550093173980713, + "learning_rate": 5.965328217075511e-06, + "loss": 0.2186, + "step": 30192 + }, + { + "epoch": 2.009508952794357, + "grad_norm": 4.935550689697266, + "learning_rate": 5.965191174455256e-06, + "loss": 0.2688, + "step": 30193 + }, + { + "epoch": 2.009522517634292, + "grad_norm": 3.318408489227295, + "learning_rate": 5.9650541318350004e-06, + "loss": 0.1489, + "step": 30194 + }, + { + "epoch": 2.0095360824742268, + "grad_norm": 3.412912607192993, + "learning_rate": 5.9649170892147465e-06, + "loss": 0.14, + "step": 30195 + }, + { + "epoch": 2.0095496473141616, + "grad_norm": 3.861816644668579, + "learning_rate": 5.964780046594492e-06, + "loss": 0.2393, + "step": 30196 + }, + { + "epoch": 2.0095632121540965, + "grad_norm": 3.4779725074768066, + "learning_rate": 5.964643003974237e-06, + "loss": 0.1289, + "step": 30197 + }, + { + "epoch": 2.0095767769940314, + "grad_norm": 3.8018314838409424, + "learning_rate": 5.964505961353981e-06, + "loss": 0.1528, + "step": 30198 + }, + { + "epoch": 2.0095903418339662, + "grad_norm": 3.5759589672088623, + "learning_rate": 5.964368918733726e-06, + "loss": 0.1419, + "step": 30199 + }, + { + "epoch": 2.009603906673901, + "grad_norm": 4.502577304840088, + "learning_rate": 5.964231876113472e-06, + "loss": 0.138, + "step": 30200 + }, + { + "epoch": 2.009617471513836, + "grad_norm": 4.124410629272461, + "learning_rate": 5.964094833493217e-06, + "loss": 0.1422, + "step": 30201 + }, + { + "epoch": 2.009631036353771, + "grad_norm": 3.9637112617492676, + "learning_rate": 5.963957790872962e-06, + "loss": 0.1507, + "step": 30202 + }, + { + "epoch": 2.009644601193706, + "grad_norm": 3.809453010559082, + "learning_rate": 5.963820748252707e-06, + "loss": 0.1669, + "step": 30203 + }, + { + "epoch": 2.009658166033641, + "grad_norm": 4.120179653167725, + "learning_rate": 5.963683705632452e-06, + "loss": 0.181, + "step": 30204 + }, + { + "epoch": 2.009671730873576, + "grad_norm": 3.0351860523223877, + "learning_rate": 5.963546663012197e-06, + "loss": 0.1353, + "step": 30205 + }, + { + "epoch": 2.0096852957135107, + "grad_norm": 4.946230888366699, + "learning_rate": 5.963409620391943e-06, + "loss": 0.2804, + "step": 30206 + }, + { + "epoch": 2.0096988605534456, + "grad_norm": 3.627692461013794, + "learning_rate": 5.963272577771687e-06, + "loss": 0.1296, + "step": 30207 + }, + { + "epoch": 2.0097124253933805, + "grad_norm": 5.814447402954102, + "learning_rate": 5.963135535151433e-06, + "loss": 0.2946, + "step": 30208 + }, + { + "epoch": 2.0097259902333153, + "grad_norm": 3.584123373031616, + "learning_rate": 5.962998492531178e-06, + "loss": 0.1204, + "step": 30209 + }, + { + "epoch": 2.00973955507325, + "grad_norm": 3.6118404865264893, + "learning_rate": 5.9628614499109225e-06, + "loss": 0.1351, + "step": 30210 + }, + { + "epoch": 2.009753119913185, + "grad_norm": 5.45561408996582, + "learning_rate": 5.962724407290668e-06, + "loss": 0.185, + "step": 30211 + }, + { + "epoch": 2.00976668475312, + "grad_norm": 4.234549045562744, + "learning_rate": 5.962587364670413e-06, + "loss": 0.1475, + "step": 30212 + }, + { + "epoch": 2.009780249593055, + "grad_norm": 4.860503196716309, + "learning_rate": 5.962450322050158e-06, + "loss": 0.2768, + "step": 30213 + }, + { + "epoch": 2.0097938144329897, + "grad_norm": 5.098595142364502, + "learning_rate": 5.962313279429903e-06, + "loss": 0.1903, + "step": 30214 + }, + { + "epoch": 2.0098073792729245, + "grad_norm": 4.818135738372803, + "learning_rate": 5.962176236809648e-06, + "loss": 0.2581, + "step": 30215 + }, + { + "epoch": 2.0098209441128594, + "grad_norm": 4.342344284057617, + "learning_rate": 5.962039194189393e-06, + "loss": 0.2409, + "step": 30216 + }, + { + "epoch": 2.0098345089527943, + "grad_norm": 3.8264269828796387, + "learning_rate": 5.961902151569139e-06, + "loss": 0.1176, + "step": 30217 + }, + { + "epoch": 2.009848073792729, + "grad_norm": 4.3613386154174805, + "learning_rate": 5.961765108948884e-06, + "loss": 0.2224, + "step": 30218 + }, + { + "epoch": 2.009861638632664, + "grad_norm": 2.9278275966644287, + "learning_rate": 5.961628066328628e-06, + "loss": 0.0745, + "step": 30219 + }, + { + "epoch": 2.009875203472599, + "grad_norm": 5.566346645355225, + "learning_rate": 5.961491023708373e-06, + "loss": 0.2711, + "step": 30220 + }, + { + "epoch": 2.0098887683125337, + "grad_norm": 4.800873279571533, + "learning_rate": 5.961353981088119e-06, + "loss": 0.2366, + "step": 30221 + }, + { + "epoch": 2.0099023331524686, + "grad_norm": 5.487566947937012, + "learning_rate": 5.961216938467865e-06, + "loss": 0.2752, + "step": 30222 + }, + { + "epoch": 2.009915897992404, + "grad_norm": 4.908525466918945, + "learning_rate": 5.961079895847609e-06, + "loss": 0.2079, + "step": 30223 + }, + { + "epoch": 2.0099294628323388, + "grad_norm": 6.048881530761719, + "learning_rate": 5.960942853227354e-06, + "loss": 0.2193, + "step": 30224 + }, + { + "epoch": 2.0099430276722736, + "grad_norm": 3.554456949234009, + "learning_rate": 5.9608058106071e-06, + "loss": 0.1489, + "step": 30225 + }, + { + "epoch": 2.0099565925122085, + "grad_norm": 3.9439613819122314, + "learning_rate": 5.9606687679868445e-06, + "loss": 0.1908, + "step": 30226 + }, + { + "epoch": 2.0099701573521433, + "grad_norm": 5.055774688720703, + "learning_rate": 5.96053172536659e-06, + "loss": 0.2126, + "step": 30227 + }, + { + "epoch": 2.009983722192078, + "grad_norm": 5.7781853675842285, + "learning_rate": 5.960394682746334e-06, + "loss": 0.1939, + "step": 30228 + }, + { + "epoch": 2.009997287032013, + "grad_norm": 4.590102195739746, + "learning_rate": 5.960257640126079e-06, + "loss": 0.2454, + "step": 30229 + }, + { + "epoch": 2.010010851871948, + "grad_norm": 3.920771837234497, + "learning_rate": 5.960120597505825e-06, + "loss": 0.1386, + "step": 30230 + }, + { + "epoch": 2.010024416711883, + "grad_norm": 4.208185195922852, + "learning_rate": 5.95998355488557e-06, + "loss": 0.183, + "step": 30231 + }, + { + "epoch": 2.0100379815518177, + "grad_norm": 4.206214904785156, + "learning_rate": 5.959846512265315e-06, + "loss": 0.1325, + "step": 30232 + }, + { + "epoch": 2.0100515463917525, + "grad_norm": 5.950942516326904, + "learning_rate": 5.95970946964506e-06, + "loss": 0.2844, + "step": 30233 + }, + { + "epoch": 2.0100651112316874, + "grad_norm": 3.270899534225464, + "learning_rate": 5.959572427024806e-06, + "loss": 0.088, + "step": 30234 + }, + { + "epoch": 2.0100786760716223, + "grad_norm": 5.713623523712158, + "learning_rate": 5.95943538440455e-06, + "loss": 0.2442, + "step": 30235 + }, + { + "epoch": 2.010092240911557, + "grad_norm": 3.873171091079712, + "learning_rate": 5.959298341784295e-06, + "loss": 0.1217, + "step": 30236 + }, + { + "epoch": 2.010105805751492, + "grad_norm": 4.510666847229004, + "learning_rate": 5.959161299164041e-06, + "loss": 0.1837, + "step": 30237 + }, + { + "epoch": 2.010119370591427, + "grad_norm": 5.302369117736816, + "learning_rate": 5.959024256543786e-06, + "loss": 0.2593, + "step": 30238 + }, + { + "epoch": 2.0101329354313617, + "grad_norm": 3.5772817134857178, + "learning_rate": 5.958887213923531e-06, + "loss": 0.127, + "step": 30239 + }, + { + "epoch": 2.0101465002712966, + "grad_norm": 5.575503826141357, + "learning_rate": 5.958750171303276e-06, + "loss": 0.168, + "step": 30240 + }, + { + "epoch": 2.010160065111232, + "grad_norm": 4.456669807434082, + "learning_rate": 5.9586131286830205e-06, + "loss": 0.2104, + "step": 30241 + }, + { + "epoch": 2.0101736299511668, + "grad_norm": 5.937808513641357, + "learning_rate": 5.958476086062766e-06, + "loss": 0.2588, + "step": 30242 + }, + { + "epoch": 2.0101871947911016, + "grad_norm": 5.888652324676514, + "learning_rate": 5.958339043442512e-06, + "loss": 0.428, + "step": 30243 + }, + { + "epoch": 2.0102007596310365, + "grad_norm": 6.323023796081543, + "learning_rate": 5.958202000822256e-06, + "loss": 0.3109, + "step": 30244 + }, + { + "epoch": 2.0102143244709714, + "grad_norm": 3.801297426223755, + "learning_rate": 5.958064958202001e-06, + "loss": 0.1826, + "step": 30245 + }, + { + "epoch": 2.0102278893109062, + "grad_norm": 4.885457515716553, + "learning_rate": 5.957927915581746e-06, + "loss": 0.1676, + "step": 30246 + }, + { + "epoch": 2.010241454150841, + "grad_norm": 6.250980377197266, + "learning_rate": 5.9577908729614915e-06, + "loss": 0.3741, + "step": 30247 + }, + { + "epoch": 2.010255018990776, + "grad_norm": 4.916316986083984, + "learning_rate": 5.957653830341237e-06, + "loss": 0.1584, + "step": 30248 + }, + { + "epoch": 2.010268583830711, + "grad_norm": 4.427427768707275, + "learning_rate": 5.957516787720982e-06, + "loss": 0.2208, + "step": 30249 + }, + { + "epoch": 2.0102821486706457, + "grad_norm": 6.257414817810059, + "learning_rate": 5.957379745100726e-06, + "loss": 0.2967, + "step": 30250 + }, + { + "epoch": 2.0102957135105806, + "grad_norm": 4.832929611206055, + "learning_rate": 5.957242702480472e-06, + "loss": 0.1732, + "step": 30251 + }, + { + "epoch": 2.0103092783505154, + "grad_norm": 5.627852439880371, + "learning_rate": 5.9571056598602174e-06, + "loss": 0.1818, + "step": 30252 + }, + { + "epoch": 2.0103228431904503, + "grad_norm": 5.017886638641357, + "learning_rate": 5.956968617239962e-06, + "loss": 0.2014, + "step": 30253 + }, + { + "epoch": 2.010336408030385, + "grad_norm": 6.152318477630615, + "learning_rate": 5.956831574619707e-06, + "loss": 0.2282, + "step": 30254 + }, + { + "epoch": 2.01034997287032, + "grad_norm": 4.732523441314697, + "learning_rate": 5.956694531999452e-06, + "loss": 0.1487, + "step": 30255 + }, + { + "epoch": 2.010363537710255, + "grad_norm": 5.523727893829346, + "learning_rate": 5.956557489379198e-06, + "loss": 0.187, + "step": 30256 + }, + { + "epoch": 2.0103771025501898, + "grad_norm": 4.309514999389648, + "learning_rate": 5.9564204467589425e-06, + "loss": 0.1734, + "step": 30257 + }, + { + "epoch": 2.0103906673901246, + "grad_norm": 3.4623279571533203, + "learning_rate": 5.956283404138688e-06, + "loss": 0.1509, + "step": 30258 + }, + { + "epoch": 2.0104042322300595, + "grad_norm": 3.777257204055786, + "learning_rate": 5.956146361518432e-06, + "loss": 0.1674, + "step": 30259 + }, + { + "epoch": 2.010417797069995, + "grad_norm": 5.16766881942749, + "learning_rate": 5.956009318898178e-06, + "loss": 0.1546, + "step": 30260 + }, + { + "epoch": 2.0104313619099297, + "grad_norm": 4.051912307739258, + "learning_rate": 5.955872276277923e-06, + "loss": 0.1678, + "step": 30261 + }, + { + "epoch": 2.0104449267498645, + "grad_norm": 4.7394490242004395, + "learning_rate": 5.9557352336576675e-06, + "loss": 0.1303, + "step": 30262 + }, + { + "epoch": 2.0104584915897994, + "grad_norm": 4.561591625213623, + "learning_rate": 5.955598191037413e-06, + "loss": 0.1739, + "step": 30263 + }, + { + "epoch": 2.0104720564297343, + "grad_norm": 3.7404563426971436, + "learning_rate": 5.955461148417159e-06, + "loss": 0.2199, + "step": 30264 + }, + { + "epoch": 2.010485621269669, + "grad_norm": 6.670025825500488, + "learning_rate": 5.955324105796904e-06, + "loss": 0.2514, + "step": 30265 + }, + { + "epoch": 2.010499186109604, + "grad_norm": 3.6405255794525146, + "learning_rate": 5.955187063176648e-06, + "loss": 0.0846, + "step": 30266 + }, + { + "epoch": 2.010512750949539, + "grad_norm": 7.650672435760498, + "learning_rate": 5.955050020556393e-06, + "loss": 0.2783, + "step": 30267 + }, + { + "epoch": 2.0105263157894737, + "grad_norm": 4.530752658843994, + "learning_rate": 5.954912977936138e-06, + "loss": 0.1815, + "step": 30268 + }, + { + "epoch": 2.0105398806294086, + "grad_norm": 6.780417442321777, + "learning_rate": 5.954775935315884e-06, + "loss": 0.2485, + "step": 30269 + }, + { + "epoch": 2.0105534454693434, + "grad_norm": 7.794325828552246, + "learning_rate": 5.954638892695629e-06, + "loss": 0.238, + "step": 30270 + }, + { + "epoch": 2.0105670103092783, + "grad_norm": 4.733854293823242, + "learning_rate": 5.954501850075374e-06, + "loss": 0.1442, + "step": 30271 + }, + { + "epoch": 2.010580575149213, + "grad_norm": 3.455176830291748, + "learning_rate": 5.9543648074551185e-06, + "loss": 0.1519, + "step": 30272 + }, + { + "epoch": 2.010594139989148, + "grad_norm": 4.471992492675781, + "learning_rate": 5.9542277648348645e-06, + "loss": 0.1228, + "step": 30273 + }, + { + "epoch": 2.010607704829083, + "grad_norm": 5.561471939086914, + "learning_rate": 5.95409072221461e-06, + "loss": 0.2271, + "step": 30274 + }, + { + "epoch": 2.0106212696690178, + "grad_norm": 4.063155174255371, + "learning_rate": 5.953953679594354e-06, + "loss": 0.1826, + "step": 30275 + }, + { + "epoch": 2.0106348345089526, + "grad_norm": 6.2412543296813965, + "learning_rate": 5.953816636974099e-06, + "loss": 0.2986, + "step": 30276 + }, + { + "epoch": 2.0106483993488875, + "grad_norm": 4.857354164123535, + "learning_rate": 5.953679594353845e-06, + "loss": 0.1825, + "step": 30277 + }, + { + "epoch": 2.0106619641888224, + "grad_norm": 6.657655239105225, + "learning_rate": 5.9535425517335895e-06, + "loss": 0.2722, + "step": 30278 + }, + { + "epoch": 2.0106755290287577, + "grad_norm": 4.80628776550293, + "learning_rate": 5.953405509113335e-06, + "loss": 0.2152, + "step": 30279 + }, + { + "epoch": 2.0106890938686925, + "grad_norm": 4.790548801422119, + "learning_rate": 5.95326846649308e-06, + "loss": 0.1349, + "step": 30280 + }, + { + "epoch": 2.0107026587086274, + "grad_norm": 5.718246936798096, + "learning_rate": 5.953131423872824e-06, + "loss": 0.2224, + "step": 30281 + }, + { + "epoch": 2.0107162235485623, + "grad_norm": 4.366886615753174, + "learning_rate": 5.95299438125257e-06, + "loss": 0.1862, + "step": 30282 + }, + { + "epoch": 2.010729788388497, + "grad_norm": 5.321536064147949, + "learning_rate": 5.9528573386323154e-06, + "loss": 0.2505, + "step": 30283 + }, + { + "epoch": 2.010743353228432, + "grad_norm": 4.596107006072998, + "learning_rate": 5.95272029601206e-06, + "loss": 0.2361, + "step": 30284 + }, + { + "epoch": 2.010756918068367, + "grad_norm": 6.577988147735596, + "learning_rate": 5.952583253391805e-06, + "loss": 0.2889, + "step": 30285 + }, + { + "epoch": 2.0107704829083017, + "grad_norm": 6.637228965759277, + "learning_rate": 5.952446210771551e-06, + "loss": 0.2245, + "step": 30286 + }, + { + "epoch": 2.0107840477482366, + "grad_norm": 8.255231857299805, + "learning_rate": 5.952309168151295e-06, + "loss": 0.2336, + "step": 30287 + }, + { + "epoch": 2.0107976125881715, + "grad_norm": 5.280693054199219, + "learning_rate": 5.9521721255310405e-06, + "loss": 0.2656, + "step": 30288 + }, + { + "epoch": 2.0108111774281063, + "grad_norm": 4.164672374725342, + "learning_rate": 5.952035082910786e-06, + "loss": 0.1594, + "step": 30289 + }, + { + "epoch": 2.010824742268041, + "grad_norm": 6.552535533905029, + "learning_rate": 5.951898040290532e-06, + "loss": 0.2576, + "step": 30290 + }, + { + "epoch": 2.010838307107976, + "grad_norm": 5.272776126861572, + "learning_rate": 5.951760997670276e-06, + "loss": 0.2677, + "step": 30291 + }, + { + "epoch": 2.010851871947911, + "grad_norm": 4.144861221313477, + "learning_rate": 5.951623955050021e-06, + "loss": 0.1941, + "step": 30292 + }, + { + "epoch": 2.010865436787846, + "grad_norm": 6.192998886108398, + "learning_rate": 5.9514869124297655e-06, + "loss": 0.2547, + "step": 30293 + }, + { + "epoch": 2.0108790016277807, + "grad_norm": 4.384244918823242, + "learning_rate": 5.9513498698095116e-06, + "loss": 0.2288, + "step": 30294 + }, + { + "epoch": 2.0108925664677155, + "grad_norm": 5.69662618637085, + "learning_rate": 5.951212827189257e-06, + "loss": 0.2325, + "step": 30295 + }, + { + "epoch": 2.0109061313076504, + "grad_norm": 5.256923675537109, + "learning_rate": 5.951075784569001e-06, + "loss": 0.2371, + "step": 30296 + }, + { + "epoch": 2.0109196961475853, + "grad_norm": 5.762890338897705, + "learning_rate": 5.950938741948746e-06, + "loss": 0.2678, + "step": 30297 + }, + { + "epoch": 2.0109332609875206, + "grad_norm": 6.423457145690918, + "learning_rate": 5.950801699328491e-06, + "loss": 0.3063, + "step": 30298 + }, + { + "epoch": 2.0109468258274554, + "grad_norm": 5.357529640197754, + "learning_rate": 5.9506646567082374e-06, + "loss": 0.2782, + "step": 30299 + }, + { + "epoch": 2.0109603906673903, + "grad_norm": 3.6495094299316406, + "learning_rate": 5.950527614087982e-06, + "loss": 0.1287, + "step": 30300 + }, + { + "epoch": 2.010973955507325, + "grad_norm": 4.616799831390381, + "learning_rate": 5.950390571467727e-06, + "loss": 0.2324, + "step": 30301 + }, + { + "epoch": 2.01098752034726, + "grad_norm": 6.205652713775635, + "learning_rate": 5.950253528847471e-06, + "loss": 0.1936, + "step": 30302 + }, + { + "epoch": 2.011001085187195, + "grad_norm": 5.356813907623291, + "learning_rate": 5.950116486227217e-06, + "loss": 0.2862, + "step": 30303 + }, + { + "epoch": 2.0110146500271298, + "grad_norm": 5.116394519805908, + "learning_rate": 5.9499794436069625e-06, + "loss": 0.1721, + "step": 30304 + }, + { + "epoch": 2.0110282148670646, + "grad_norm": 4.1697564125061035, + "learning_rate": 5.949842400986708e-06, + "loss": 0.1876, + "step": 30305 + }, + { + "epoch": 2.0110417797069995, + "grad_norm": 4.148760795593262, + "learning_rate": 5.949705358366452e-06, + "loss": 0.1993, + "step": 30306 + }, + { + "epoch": 2.0110553445469344, + "grad_norm": 3.396646738052368, + "learning_rate": 5.949568315746198e-06, + "loss": 0.1105, + "step": 30307 + }, + { + "epoch": 2.011068909386869, + "grad_norm": 4.186799049377441, + "learning_rate": 5.949431273125943e-06, + "loss": 0.1369, + "step": 30308 + }, + { + "epoch": 2.011082474226804, + "grad_norm": 5.593893527984619, + "learning_rate": 5.9492942305056875e-06, + "loss": 0.2623, + "step": 30309 + }, + { + "epoch": 2.011096039066739, + "grad_norm": 4.585619926452637, + "learning_rate": 5.949157187885433e-06, + "loss": 0.1959, + "step": 30310 + }, + { + "epoch": 2.011109603906674, + "grad_norm": 4.261785984039307, + "learning_rate": 5.949020145265177e-06, + "loss": 0.1604, + "step": 30311 + }, + { + "epoch": 2.0111231687466087, + "grad_norm": 4.487710952758789, + "learning_rate": 5.948883102644923e-06, + "loss": 0.1204, + "step": 30312 + }, + { + "epoch": 2.0111367335865435, + "grad_norm": 5.102488994598389, + "learning_rate": 5.948746060024668e-06, + "loss": 0.1554, + "step": 30313 + }, + { + "epoch": 2.0111502984264784, + "grad_norm": 5.555622100830078, + "learning_rate": 5.9486090174044134e-06, + "loss": 0.1848, + "step": 30314 + }, + { + "epoch": 2.0111638632664133, + "grad_norm": 6.397450923919678, + "learning_rate": 5.948471974784158e-06, + "loss": 0.1978, + "step": 30315 + }, + { + "epoch": 2.011177428106348, + "grad_norm": 6.438563346862793, + "learning_rate": 5.948334932163904e-06, + "loss": 0.2014, + "step": 30316 + }, + { + "epoch": 2.0111909929462835, + "grad_norm": 5.204226970672607, + "learning_rate": 5.948197889543649e-06, + "loss": 0.2267, + "step": 30317 + }, + { + "epoch": 2.0112045577862183, + "grad_norm": 5.25266695022583, + "learning_rate": 5.948060846923393e-06, + "loss": 0.1822, + "step": 30318 + }, + { + "epoch": 2.011218122626153, + "grad_norm": 4.829931735992432, + "learning_rate": 5.9479238043031385e-06, + "loss": 0.1888, + "step": 30319 + }, + { + "epoch": 2.011231687466088, + "grad_norm": 5.238044261932373, + "learning_rate": 5.9477867616828845e-06, + "loss": 0.1988, + "step": 30320 + }, + { + "epoch": 2.011245252306023, + "grad_norm": 5.342546463012695, + "learning_rate": 5.947649719062629e-06, + "loss": 0.2191, + "step": 30321 + }, + { + "epoch": 2.0112588171459578, + "grad_norm": 6.150433540344238, + "learning_rate": 5.947512676442374e-06, + "loss": 0.2248, + "step": 30322 + }, + { + "epoch": 2.0112723819858926, + "grad_norm": 3.719088315963745, + "learning_rate": 5.947375633822119e-06, + "loss": 0.1475, + "step": 30323 + }, + { + "epoch": 2.0112859468258275, + "grad_norm": 5.074422359466553, + "learning_rate": 5.9472385912018635e-06, + "loss": 0.277, + "step": 30324 + }, + { + "epoch": 2.0112995116657624, + "grad_norm": 6.12474250793457, + "learning_rate": 5.9471015485816096e-06, + "loss": 0.2405, + "step": 30325 + }, + { + "epoch": 2.0113130765056972, + "grad_norm": 3.6984212398529053, + "learning_rate": 5.946964505961355e-06, + "loss": 0.1814, + "step": 30326 + }, + { + "epoch": 2.011326641345632, + "grad_norm": 4.865243434906006, + "learning_rate": 5.946827463341099e-06, + "loss": 0.1938, + "step": 30327 + }, + { + "epoch": 2.011340206185567, + "grad_norm": 5.31882905960083, + "learning_rate": 5.946690420720844e-06, + "loss": 0.2556, + "step": 30328 + }, + { + "epoch": 2.011353771025502, + "grad_norm": 5.444392204284668, + "learning_rate": 5.94655337810059e-06, + "loss": 0.195, + "step": 30329 + }, + { + "epoch": 2.0113673358654367, + "grad_norm": 5.380152225494385, + "learning_rate": 5.9464163354803354e-06, + "loss": 0.2141, + "step": 30330 + }, + { + "epoch": 2.0113809007053716, + "grad_norm": 5.334604740142822, + "learning_rate": 5.94627929286008e-06, + "loss": 0.2702, + "step": 30331 + }, + { + "epoch": 2.0113944655453064, + "grad_norm": 4.391207695007324, + "learning_rate": 5.946142250239825e-06, + "loss": 0.1666, + "step": 30332 + }, + { + "epoch": 2.0114080303852413, + "grad_norm": 10.101068496704102, + "learning_rate": 5.946005207619571e-06, + "loss": 0.344, + "step": 30333 + }, + { + "epoch": 2.011421595225176, + "grad_norm": 4.6111226081848145, + "learning_rate": 5.945868164999315e-06, + "loss": 0.1921, + "step": 30334 + }, + { + "epoch": 2.011435160065111, + "grad_norm": 6.106762409210205, + "learning_rate": 5.9457311223790605e-06, + "loss": 0.24, + "step": 30335 + }, + { + "epoch": 2.0114487249050463, + "grad_norm": 6.1475830078125, + "learning_rate": 5.945594079758805e-06, + "loss": 0.2626, + "step": 30336 + }, + { + "epoch": 2.011462289744981, + "grad_norm": 4.177346229553223, + "learning_rate": 5.94545703713855e-06, + "loss": 0.1867, + "step": 30337 + }, + { + "epoch": 2.011475854584916, + "grad_norm": 4.281813621520996, + "learning_rate": 5.945319994518296e-06, + "loss": 0.1662, + "step": 30338 + }, + { + "epoch": 2.011489419424851, + "grad_norm": 4.533009052276611, + "learning_rate": 5.945182951898041e-06, + "loss": 0.1418, + "step": 30339 + }, + { + "epoch": 2.011502984264786, + "grad_norm": 3.5977742671966553, + "learning_rate": 5.9450459092777855e-06, + "loss": 0.083, + "step": 30340 + }, + { + "epoch": 2.0115165491047207, + "grad_norm": 3.7357382774353027, + "learning_rate": 5.944908866657531e-06, + "loss": 0.1047, + "step": 30341 + }, + { + "epoch": 2.0115301139446555, + "grad_norm": 5.572368144989014, + "learning_rate": 5.944771824037277e-06, + "loss": 0.2298, + "step": 30342 + }, + { + "epoch": 2.0115436787845904, + "grad_norm": 4.7054643630981445, + "learning_rate": 5.944634781417021e-06, + "loss": 0.1175, + "step": 30343 + }, + { + "epoch": 2.0115572436245253, + "grad_norm": 4.053924560546875, + "learning_rate": 5.944497738796766e-06, + "loss": 0.1619, + "step": 30344 + }, + { + "epoch": 2.01157080846446, + "grad_norm": 4.4670562744140625, + "learning_rate": 5.944360696176511e-06, + "loss": 0.1447, + "step": 30345 + }, + { + "epoch": 2.011584373304395, + "grad_norm": 3.5875537395477295, + "learning_rate": 5.944223653556257e-06, + "loss": 0.0943, + "step": 30346 + }, + { + "epoch": 2.01159793814433, + "grad_norm": 3.2664411067962646, + "learning_rate": 5.944086610936002e-06, + "loss": 0.1126, + "step": 30347 + }, + { + "epoch": 2.0116115029842647, + "grad_norm": 3.4979121685028076, + "learning_rate": 5.943949568315747e-06, + "loss": 0.1293, + "step": 30348 + }, + { + "epoch": 2.0116250678241996, + "grad_norm": 2.505605936050415, + "learning_rate": 5.943812525695491e-06, + "loss": 0.1079, + "step": 30349 + }, + { + "epoch": 2.0116386326641345, + "grad_norm": 4.640312194824219, + "learning_rate": 5.9436754830752365e-06, + "loss": 0.1746, + "step": 30350 + }, + { + "epoch": 2.0116521975040693, + "grad_norm": 5.672585964202881, + "learning_rate": 5.9435384404549825e-06, + "loss": 0.1854, + "step": 30351 + }, + { + "epoch": 2.011665762344004, + "grad_norm": 6.614630699157715, + "learning_rate": 5.943401397834727e-06, + "loss": 0.2656, + "step": 30352 + }, + { + "epoch": 2.011679327183939, + "grad_norm": 4.830597877502441, + "learning_rate": 5.943264355214472e-06, + "loss": 0.1522, + "step": 30353 + }, + { + "epoch": 2.011692892023874, + "grad_norm": 4.876213073730469, + "learning_rate": 5.943127312594217e-06, + "loss": 0.2189, + "step": 30354 + }, + { + "epoch": 2.0117064568638092, + "grad_norm": 3.6997339725494385, + "learning_rate": 5.942990269973962e-06, + "loss": 0.0933, + "step": 30355 + }, + { + "epoch": 2.011720021703744, + "grad_norm": 5.344045162200928, + "learning_rate": 5.9428532273537076e-06, + "loss": 0.2039, + "step": 30356 + }, + { + "epoch": 2.011733586543679, + "grad_norm": 4.174921035766602, + "learning_rate": 5.942716184733453e-06, + "loss": 0.1383, + "step": 30357 + }, + { + "epoch": 2.011747151383614, + "grad_norm": 3.528573751449585, + "learning_rate": 5.942579142113197e-06, + "loss": 0.0729, + "step": 30358 + }, + { + "epoch": 2.0117607162235487, + "grad_norm": 3.784407615661621, + "learning_rate": 5.942442099492943e-06, + "loss": 0.1681, + "step": 30359 + }, + { + "epoch": 2.0117742810634835, + "grad_norm": 3.3082902431488037, + "learning_rate": 5.942305056872688e-06, + "loss": 0.0976, + "step": 30360 + }, + { + "epoch": 2.0117878459034184, + "grad_norm": 4.6900434494018555, + "learning_rate": 5.942168014252433e-06, + "loss": 0.1792, + "step": 30361 + }, + { + "epoch": 2.0118014107433533, + "grad_norm": 4.214311599731445, + "learning_rate": 5.942030971632178e-06, + "loss": 0.1363, + "step": 30362 + }, + { + "epoch": 2.011814975583288, + "grad_norm": 3.9966907501220703, + "learning_rate": 5.941893929011924e-06, + "loss": 0.1343, + "step": 30363 + }, + { + "epoch": 2.011828540423223, + "grad_norm": 4.559352397918701, + "learning_rate": 5.941756886391669e-06, + "loss": 0.1584, + "step": 30364 + }, + { + "epoch": 2.011842105263158, + "grad_norm": 5.131619930267334, + "learning_rate": 5.941619843771413e-06, + "loss": 0.1567, + "step": 30365 + }, + { + "epoch": 2.0118556701030927, + "grad_norm": 5.022639751434326, + "learning_rate": 5.9414828011511585e-06, + "loss": 0.1855, + "step": 30366 + }, + { + "epoch": 2.0118692349430276, + "grad_norm": 5.2429914474487305, + "learning_rate": 5.941345758530903e-06, + "loss": 0.1616, + "step": 30367 + }, + { + "epoch": 2.0118827997829625, + "grad_norm": 3.97117280960083, + "learning_rate": 5.941208715910649e-06, + "loss": 0.1002, + "step": 30368 + }, + { + "epoch": 2.0118963646228973, + "grad_norm": 4.056297302246094, + "learning_rate": 5.941071673290394e-06, + "loss": 0.1062, + "step": 30369 + }, + { + "epoch": 2.011909929462832, + "grad_norm": 4.363439559936523, + "learning_rate": 5.940934630670138e-06, + "loss": 0.1639, + "step": 30370 + }, + { + "epoch": 2.011923494302767, + "grad_norm": 3.841460943222046, + "learning_rate": 5.9407975880498835e-06, + "loss": 0.1584, + "step": 30371 + }, + { + "epoch": 2.011937059142702, + "grad_norm": 2.7852721214294434, + "learning_rate": 5.9406605454296296e-06, + "loss": 0.1149, + "step": 30372 + }, + { + "epoch": 2.011950623982637, + "grad_norm": 5.221619129180908, + "learning_rate": 5.940523502809375e-06, + "loss": 0.1717, + "step": 30373 + }, + { + "epoch": 2.011964188822572, + "grad_norm": 4.6351094245910645, + "learning_rate": 5.940386460189119e-06, + "loss": 0.1241, + "step": 30374 + }, + { + "epoch": 2.011977753662507, + "grad_norm": 8.930910110473633, + "learning_rate": 5.940249417568864e-06, + "loss": 0.2222, + "step": 30375 + }, + { + "epoch": 2.011991318502442, + "grad_norm": 5.4226250648498535, + "learning_rate": 5.94011237494861e-06, + "loss": 0.1592, + "step": 30376 + }, + { + "epoch": 2.0120048833423767, + "grad_norm": 7.386364936828613, + "learning_rate": 5.939975332328355e-06, + "loss": 0.1812, + "step": 30377 + }, + { + "epoch": 2.0120184481823116, + "grad_norm": 4.229894161224365, + "learning_rate": 5.9398382897081e-06, + "loss": 0.1832, + "step": 30378 + }, + { + "epoch": 2.0120320130222464, + "grad_norm": 3.6525299549102783, + "learning_rate": 5.939701247087845e-06, + "loss": 0.2165, + "step": 30379 + }, + { + "epoch": 2.0120455778621813, + "grad_norm": 4.900630950927734, + "learning_rate": 5.939564204467589e-06, + "loss": 0.2334, + "step": 30380 + }, + { + "epoch": 2.012059142702116, + "grad_norm": 4.292429447174072, + "learning_rate": 5.939427161847335e-06, + "loss": 0.1144, + "step": 30381 + }, + { + "epoch": 2.012072707542051, + "grad_norm": 4.9781084060668945, + "learning_rate": 5.9392901192270805e-06, + "loss": 0.1668, + "step": 30382 + }, + { + "epoch": 2.012086272381986, + "grad_norm": 5.835954666137695, + "learning_rate": 5.939153076606825e-06, + "loss": 0.2084, + "step": 30383 + }, + { + "epoch": 2.0120998372219208, + "grad_norm": 4.14034366607666, + "learning_rate": 5.93901603398657e-06, + "loss": 0.1755, + "step": 30384 + }, + { + "epoch": 2.0121134020618556, + "grad_norm": 5.048938274383545, + "learning_rate": 5.938878991366316e-06, + "loss": 0.1716, + "step": 30385 + }, + { + "epoch": 2.0121269669017905, + "grad_norm": 4.56303596496582, + "learning_rate": 5.93874194874606e-06, + "loss": 0.2478, + "step": 30386 + }, + { + "epoch": 2.0121405317417254, + "grad_norm": 2.907210350036621, + "learning_rate": 5.9386049061258056e-06, + "loss": 0.1077, + "step": 30387 + }, + { + "epoch": 2.0121540965816602, + "grad_norm": 3.8207223415374756, + "learning_rate": 5.938467863505551e-06, + "loss": 0.1696, + "step": 30388 + }, + { + "epoch": 2.012167661421595, + "grad_norm": 4.633561134338379, + "learning_rate": 5.938330820885296e-06, + "loss": 0.1891, + "step": 30389 + }, + { + "epoch": 2.01218122626153, + "grad_norm": 5.3386454582214355, + "learning_rate": 5.938193778265041e-06, + "loss": 0.1872, + "step": 30390 + }, + { + "epoch": 2.012194791101465, + "grad_norm": 3.465552568435669, + "learning_rate": 5.938056735644786e-06, + "loss": 0.1228, + "step": 30391 + }, + { + "epoch": 2.0122083559413997, + "grad_norm": 3.9389865398406982, + "learning_rate": 5.937919693024531e-06, + "loss": 0.1626, + "step": 30392 + }, + { + "epoch": 2.012221920781335, + "grad_norm": 4.833376884460449, + "learning_rate": 5.937782650404276e-06, + "loss": 0.1683, + "step": 30393 + }, + { + "epoch": 2.01223548562127, + "grad_norm": 4.488217353820801, + "learning_rate": 5.937645607784022e-06, + "loss": 0.1684, + "step": 30394 + }, + { + "epoch": 2.0122490504612047, + "grad_norm": 4.655168056488037, + "learning_rate": 5.937508565163766e-06, + "loss": 0.2262, + "step": 30395 + }, + { + "epoch": 2.0122626153011396, + "grad_norm": 4.1672515869140625, + "learning_rate": 5.937371522543511e-06, + "loss": 0.0901, + "step": 30396 + }, + { + "epoch": 2.0122761801410745, + "grad_norm": 5.165213584899902, + "learning_rate": 5.9372344799232565e-06, + "loss": 0.2086, + "step": 30397 + }, + { + "epoch": 2.0122897449810093, + "grad_norm": 4.06265926361084, + "learning_rate": 5.9370974373030025e-06, + "loss": 0.129, + "step": 30398 + }, + { + "epoch": 2.012303309820944, + "grad_norm": 4.354295253753662, + "learning_rate": 5.936960394682747e-06, + "loss": 0.2043, + "step": 30399 + }, + { + "epoch": 2.012316874660879, + "grad_norm": 7.362504005432129, + "learning_rate": 5.936823352062492e-06, + "loss": 0.2987, + "step": 30400 + }, + { + "epoch": 2.012330439500814, + "grad_norm": 4.584404468536377, + "learning_rate": 5.936686309442236e-06, + "loss": 0.2351, + "step": 30401 + }, + { + "epoch": 2.012344004340749, + "grad_norm": 5.426820278167725, + "learning_rate": 5.936549266821982e-06, + "loss": 0.1304, + "step": 30402 + }, + { + "epoch": 2.0123575691806836, + "grad_norm": 3.725020170211792, + "learning_rate": 5.936412224201728e-06, + "loss": 0.1206, + "step": 30403 + }, + { + "epoch": 2.0123711340206185, + "grad_norm": 4.652640342712402, + "learning_rate": 5.936275181581472e-06, + "loss": 0.141, + "step": 30404 + }, + { + "epoch": 2.0123846988605534, + "grad_norm": 4.421252250671387, + "learning_rate": 5.936138138961217e-06, + "loss": 0.1625, + "step": 30405 + }, + { + "epoch": 2.0123982637004882, + "grad_norm": 6.413271903991699, + "learning_rate": 5.936001096340962e-06, + "loss": 0.3227, + "step": 30406 + }, + { + "epoch": 2.012411828540423, + "grad_norm": 6.213190078735352, + "learning_rate": 5.935864053720708e-06, + "loss": 0.2235, + "step": 30407 + }, + { + "epoch": 2.012425393380358, + "grad_norm": 5.459728717803955, + "learning_rate": 5.935727011100453e-06, + "loss": 0.3514, + "step": 30408 + }, + { + "epoch": 2.012438958220293, + "grad_norm": 8.359888076782227, + "learning_rate": 5.935589968480198e-06, + "loss": 0.1986, + "step": 30409 + }, + { + "epoch": 2.0124525230602277, + "grad_norm": 4.1654744148254395, + "learning_rate": 5.935452925859942e-06, + "loss": 0.1644, + "step": 30410 + }, + { + "epoch": 2.0124660879001626, + "grad_norm": 6.006695747375488, + "learning_rate": 5.935315883239688e-06, + "loss": 0.2508, + "step": 30411 + }, + { + "epoch": 2.012479652740098, + "grad_norm": 9.41033935546875, + "learning_rate": 5.935178840619433e-06, + "loss": 0.4713, + "step": 30412 + }, + { + "epoch": 2.0124932175800327, + "grad_norm": 4.657042026519775, + "learning_rate": 5.9350417979991785e-06, + "loss": 0.2228, + "step": 30413 + }, + { + "epoch": 2.0125067824199676, + "grad_norm": 5.2867841720581055, + "learning_rate": 5.934904755378923e-06, + "loss": 0.253, + "step": 30414 + }, + { + "epoch": 2.0125203472599025, + "grad_norm": 8.111017227172852, + "learning_rate": 5.934767712758669e-06, + "loss": 0.3318, + "step": 30415 + }, + { + "epoch": 2.0125339120998373, + "grad_norm": 4.5786004066467285, + "learning_rate": 5.934630670138414e-06, + "loss": 0.1461, + "step": 30416 + }, + { + "epoch": 2.012547476939772, + "grad_norm": 6.714792251586914, + "learning_rate": 5.934493627518158e-06, + "loss": 0.2245, + "step": 30417 + }, + { + "epoch": 2.012561041779707, + "grad_norm": 5.340067386627197, + "learning_rate": 5.9343565848979036e-06, + "loss": 0.1967, + "step": 30418 + }, + { + "epoch": 2.012574606619642, + "grad_norm": 5.684030532836914, + "learning_rate": 5.934219542277648e-06, + "loss": 0.2722, + "step": 30419 + }, + { + "epoch": 2.012588171459577, + "grad_norm": 5.27370023727417, + "learning_rate": 5.934082499657394e-06, + "loss": 0.2837, + "step": 30420 + }, + { + "epoch": 2.0126017362995117, + "grad_norm": 5.200896263122559, + "learning_rate": 5.933945457037139e-06, + "loss": 0.2286, + "step": 30421 + }, + { + "epoch": 2.0126153011394465, + "grad_norm": 6.033214569091797, + "learning_rate": 5.933808414416884e-06, + "loss": 0.2995, + "step": 30422 + }, + { + "epoch": 2.0126288659793814, + "grad_norm": 5.848839282989502, + "learning_rate": 5.933671371796629e-06, + "loss": 0.2737, + "step": 30423 + }, + { + "epoch": 2.0126424308193163, + "grad_norm": 5.96304178237915, + "learning_rate": 5.933534329176375e-06, + "loss": 0.2866, + "step": 30424 + }, + { + "epoch": 2.012655995659251, + "grad_norm": 6.440776824951172, + "learning_rate": 5.93339728655612e-06, + "loss": 0.2523, + "step": 30425 + }, + { + "epoch": 2.012669560499186, + "grad_norm": 7.374543190002441, + "learning_rate": 5.933260243935864e-06, + "loss": 0.3662, + "step": 30426 + }, + { + "epoch": 2.012683125339121, + "grad_norm": 4.691711902618408, + "learning_rate": 5.933123201315609e-06, + "loss": 0.2909, + "step": 30427 + }, + { + "epoch": 2.0126966901790557, + "grad_norm": 5.23933219909668, + "learning_rate": 5.932986158695355e-06, + "loss": 0.2984, + "step": 30428 + }, + { + "epoch": 2.0127102550189906, + "grad_norm": 6.744339466094971, + "learning_rate": 5.9328491160751e-06, + "loss": 0.3744, + "step": 30429 + }, + { + "epoch": 2.0127238198589255, + "grad_norm": 5.2272162437438965, + "learning_rate": 5.932712073454845e-06, + "loss": 0.1669, + "step": 30430 + }, + { + "epoch": 2.0127373846988608, + "grad_norm": 5.12821102142334, + "learning_rate": 5.93257503083459e-06, + "loss": 0.2545, + "step": 30431 + }, + { + "epoch": 2.0127509495387956, + "grad_norm": 5.587438106536865, + "learning_rate": 5.932437988214336e-06, + "loss": 0.2287, + "step": 30432 + }, + { + "epoch": 2.0127645143787305, + "grad_norm": 5.853616714477539, + "learning_rate": 5.93230094559408e-06, + "loss": 0.2018, + "step": 30433 + }, + { + "epoch": 2.0127780792186654, + "grad_norm": 6.902920246124268, + "learning_rate": 5.932163902973826e-06, + "loss": 0.3838, + "step": 30434 + }, + { + "epoch": 2.0127916440586002, + "grad_norm": 5.749354362487793, + "learning_rate": 5.93202686035357e-06, + "loss": 0.1425, + "step": 30435 + }, + { + "epoch": 2.012805208898535, + "grad_norm": 4.621260166168213, + "learning_rate": 5.931889817733315e-06, + "loss": 0.2891, + "step": 30436 + }, + { + "epoch": 2.01281877373847, + "grad_norm": 5.306709289550781, + "learning_rate": 5.931752775113061e-06, + "loss": 0.2357, + "step": 30437 + }, + { + "epoch": 2.012832338578405, + "grad_norm": 5.255603790283203, + "learning_rate": 5.9316157324928055e-06, + "loss": 0.246, + "step": 30438 + }, + { + "epoch": 2.0128459034183397, + "grad_norm": 4.870081901550293, + "learning_rate": 5.931478689872551e-06, + "loss": 0.2979, + "step": 30439 + }, + { + "epoch": 2.0128594682582746, + "grad_norm": 4.523792266845703, + "learning_rate": 5.931341647252296e-06, + "loss": 0.1755, + "step": 30440 + }, + { + "epoch": 2.0128730330982094, + "grad_norm": 5.247000217437744, + "learning_rate": 5.931204604632042e-06, + "loss": 0.4378, + "step": 30441 + }, + { + "epoch": 2.0128865979381443, + "grad_norm": 7.827233791351318, + "learning_rate": 5.931067562011786e-06, + "loss": 0.5354, + "step": 30442 + }, + { + "epoch": 2.012900162778079, + "grad_norm": 4.429061412811279, + "learning_rate": 5.930930519391531e-06, + "loss": 0.2552, + "step": 30443 + }, + { + "epoch": 2.012913727618014, + "grad_norm": 5.665137767791748, + "learning_rate": 5.930793476771276e-06, + "loss": 0.2272, + "step": 30444 + }, + { + "epoch": 2.012927292457949, + "grad_norm": 3.5016820430755615, + "learning_rate": 5.930656434151022e-06, + "loss": 0.1525, + "step": 30445 + }, + { + "epoch": 2.0129408572978837, + "grad_norm": 4.592020511627197, + "learning_rate": 5.930519391530767e-06, + "loss": 0.2184, + "step": 30446 + }, + { + "epoch": 2.0129544221378186, + "grad_norm": 5.172698497772217, + "learning_rate": 5.930382348910512e-06, + "loss": 0.2614, + "step": 30447 + }, + { + "epoch": 2.0129679869777535, + "grad_norm": 5.643458366394043, + "learning_rate": 5.930245306290256e-06, + "loss": 0.32, + "step": 30448 + }, + { + "epoch": 2.0129815518176883, + "grad_norm": 5.332181930541992, + "learning_rate": 5.9301082636700016e-06, + "loss": 0.2784, + "step": 30449 + }, + { + "epoch": 2.0129951166576237, + "grad_norm": 3.1794676780700684, + "learning_rate": 5.929971221049748e-06, + "loss": 0.1515, + "step": 30450 + }, + { + "epoch": 2.0130086814975585, + "grad_norm": 5.591188907623291, + "learning_rate": 5.929834178429492e-06, + "loss": 0.3249, + "step": 30451 + }, + { + "epoch": 2.0130222463374934, + "grad_norm": 6.846940994262695, + "learning_rate": 5.929697135809237e-06, + "loss": 0.2563, + "step": 30452 + }, + { + "epoch": 2.0130358111774282, + "grad_norm": 5.626097202301025, + "learning_rate": 5.9295600931889814e-06, + "loss": 0.2443, + "step": 30453 + }, + { + "epoch": 2.013049376017363, + "grad_norm": 5.7369513511657715, + "learning_rate": 5.9294230505687275e-06, + "loss": 0.271, + "step": 30454 + }, + { + "epoch": 2.013062940857298, + "grad_norm": 3.807809352874756, + "learning_rate": 5.929286007948473e-06, + "loss": 0.1759, + "step": 30455 + }, + { + "epoch": 2.013076505697233, + "grad_norm": 5.378807067871094, + "learning_rate": 5.929148965328218e-06, + "loss": 0.271, + "step": 30456 + }, + { + "epoch": 2.0130900705371677, + "grad_norm": 5.154965877532959, + "learning_rate": 5.929011922707962e-06, + "loss": 0.2925, + "step": 30457 + }, + { + "epoch": 2.0131036353771026, + "grad_norm": 5.348361015319824, + "learning_rate": 5.928874880087708e-06, + "loss": 0.3096, + "step": 30458 + }, + { + "epoch": 2.0131172002170374, + "grad_norm": 3.966015100479126, + "learning_rate": 5.928737837467453e-06, + "loss": 0.1593, + "step": 30459 + }, + { + "epoch": 2.0131307650569723, + "grad_norm": 4.4679341316223145, + "learning_rate": 5.928600794847198e-06, + "loss": 0.281, + "step": 30460 + }, + { + "epoch": 2.013144329896907, + "grad_norm": 4.801743030548096, + "learning_rate": 5.928463752226943e-06, + "loss": 0.2479, + "step": 30461 + }, + { + "epoch": 2.013157894736842, + "grad_norm": 6.548966884613037, + "learning_rate": 5.928326709606688e-06, + "loss": 0.4688, + "step": 30462 + }, + { + "epoch": 2.013171459576777, + "grad_norm": 4.793659210205078, + "learning_rate": 5.928189666986433e-06, + "loss": 0.1918, + "step": 30463 + }, + { + "epoch": 2.0131850244167118, + "grad_norm": 5.487215042114258, + "learning_rate": 5.928052624366178e-06, + "loss": 0.2513, + "step": 30464 + }, + { + "epoch": 2.0131985892566466, + "grad_norm": 6.0180230140686035, + "learning_rate": 5.927915581745924e-06, + "loss": 0.2978, + "step": 30465 + }, + { + "epoch": 2.0132121540965815, + "grad_norm": 3.237586498260498, + "learning_rate": 5.927778539125668e-06, + "loss": 0.1814, + "step": 30466 + }, + { + "epoch": 2.0132257189365164, + "grad_norm": 5.6551971435546875, + "learning_rate": 5.927641496505414e-06, + "loss": 0.2671, + "step": 30467 + }, + { + "epoch": 2.0132392837764512, + "grad_norm": 4.043097972869873, + "learning_rate": 5.927504453885159e-06, + "loss": 0.1034, + "step": 30468 + }, + { + "epoch": 2.0132528486163865, + "grad_norm": 4.216641426086426, + "learning_rate": 5.9273674112649035e-06, + "loss": 0.1513, + "step": 30469 + }, + { + "epoch": 2.0132664134563214, + "grad_norm": 4.1919660568237305, + "learning_rate": 5.927230368644649e-06, + "loss": 0.1707, + "step": 30470 + }, + { + "epoch": 2.0132799782962563, + "grad_norm": 4.472083568572998, + "learning_rate": 5.927093326024395e-06, + "loss": 0.2433, + "step": 30471 + }, + { + "epoch": 2.013293543136191, + "grad_norm": 4.032868385314941, + "learning_rate": 5.92695628340414e-06, + "loss": 0.1905, + "step": 30472 + }, + { + "epoch": 2.013307107976126, + "grad_norm": 5.076817512512207, + "learning_rate": 5.926819240783884e-06, + "loss": 0.202, + "step": 30473 + }, + { + "epoch": 2.013320672816061, + "grad_norm": 5.145644664764404, + "learning_rate": 5.926682198163629e-06, + "loss": 0.2684, + "step": 30474 + }, + { + "epoch": 2.0133342376559957, + "grad_norm": 3.6491446495056152, + "learning_rate": 5.926545155543374e-06, + "loss": 0.1613, + "step": 30475 + }, + { + "epoch": 2.0133478024959306, + "grad_norm": 4.739253520965576, + "learning_rate": 5.92640811292312e-06, + "loss": 0.2864, + "step": 30476 + }, + { + "epoch": 2.0133613673358655, + "grad_norm": 4.0821452140808105, + "learning_rate": 5.926271070302865e-06, + "loss": 0.1818, + "step": 30477 + }, + { + "epoch": 2.0133749321758003, + "grad_norm": 4.6917314529418945, + "learning_rate": 5.926134027682609e-06, + "loss": 0.256, + "step": 30478 + }, + { + "epoch": 2.013388497015735, + "grad_norm": 6.856494426727295, + "learning_rate": 5.925996985062354e-06, + "loss": 0.2744, + "step": 30479 + }, + { + "epoch": 2.01340206185567, + "grad_norm": 3.593416213989258, + "learning_rate": 5.9258599424421e-06, + "loss": 0.146, + "step": 30480 + }, + { + "epoch": 2.013415626695605, + "grad_norm": 5.268973350524902, + "learning_rate": 5.925722899821846e-06, + "loss": 0.1486, + "step": 30481 + }, + { + "epoch": 2.01342919153554, + "grad_norm": 4.609306812286377, + "learning_rate": 5.92558585720159e-06, + "loss": 0.2763, + "step": 30482 + }, + { + "epoch": 2.0134427563754747, + "grad_norm": 6.124961853027344, + "learning_rate": 5.925448814581335e-06, + "loss": 0.2878, + "step": 30483 + }, + { + "epoch": 2.0134563212154095, + "grad_norm": 7.252035140991211, + "learning_rate": 5.925311771961081e-06, + "loss": 0.3278, + "step": 30484 + }, + { + "epoch": 2.0134698860553444, + "grad_norm": 4.8185715675354, + "learning_rate": 5.9251747293408255e-06, + "loss": 0.1423, + "step": 30485 + }, + { + "epoch": 2.0134834508952792, + "grad_norm": 3.702810287475586, + "learning_rate": 5.925037686720571e-06, + "loss": 0.1323, + "step": 30486 + }, + { + "epoch": 2.013497015735214, + "grad_norm": 5.015443325042725, + "learning_rate": 5.924900644100315e-06, + "loss": 0.1922, + "step": 30487 + }, + { + "epoch": 2.0135105805751494, + "grad_norm": 4.141050338745117, + "learning_rate": 5.92476360148006e-06, + "loss": 0.1706, + "step": 30488 + }, + { + "epoch": 2.0135241454150843, + "grad_norm": 3.9626975059509277, + "learning_rate": 5.924626558859806e-06, + "loss": 0.1302, + "step": 30489 + }, + { + "epoch": 2.013537710255019, + "grad_norm": 4.648900508880615, + "learning_rate": 5.924489516239551e-06, + "loss": 0.1723, + "step": 30490 + }, + { + "epoch": 2.013551275094954, + "grad_norm": 5.652100563049316, + "learning_rate": 5.924352473619296e-06, + "loss": 0.1766, + "step": 30491 + }, + { + "epoch": 2.013564839934889, + "grad_norm": 4.494685173034668, + "learning_rate": 5.924215430999041e-06, + "loss": 0.1458, + "step": 30492 + }, + { + "epoch": 2.0135784047748237, + "grad_norm": 4.673364639282227, + "learning_rate": 5.924078388378787e-06, + "loss": 0.1743, + "step": 30493 + }, + { + "epoch": 2.0135919696147586, + "grad_norm": 4.0644989013671875, + "learning_rate": 5.923941345758531e-06, + "loss": 0.1283, + "step": 30494 + }, + { + "epoch": 2.0136055344546935, + "grad_norm": 3.6887638568878174, + "learning_rate": 5.923804303138276e-06, + "loss": 0.2122, + "step": 30495 + }, + { + "epoch": 2.0136190992946283, + "grad_norm": 4.850529670715332, + "learning_rate": 5.923667260518022e-06, + "loss": 0.3268, + "step": 30496 + }, + { + "epoch": 2.013632664134563, + "grad_norm": 5.600224018096924, + "learning_rate": 5.923530217897767e-06, + "loss": 0.2938, + "step": 30497 + }, + { + "epoch": 2.013646228974498, + "grad_norm": 3.3480982780456543, + "learning_rate": 5.923393175277512e-06, + "loss": 0.1224, + "step": 30498 + }, + { + "epoch": 2.013659793814433, + "grad_norm": 5.13189697265625, + "learning_rate": 5.923256132657257e-06, + "loss": 0.2566, + "step": 30499 + }, + { + "epoch": 2.013673358654368, + "grad_norm": 3.463913917541504, + "learning_rate": 5.9231190900370015e-06, + "loss": 0.1827, + "step": 30500 + }, + { + "epoch": 2.0136869234943027, + "grad_norm": 4.530714511871338, + "learning_rate": 5.922982047416747e-06, + "loss": 0.2179, + "step": 30501 + }, + { + "epoch": 2.0137004883342375, + "grad_norm": 3.946903705596924, + "learning_rate": 5.922845004796493e-06, + "loss": 0.1462, + "step": 30502 + }, + { + "epoch": 2.0137140531741724, + "grad_norm": 6.801116466522217, + "learning_rate": 5.922707962176237e-06, + "loss": 0.2425, + "step": 30503 + }, + { + "epoch": 2.0137276180141073, + "grad_norm": 4.898852825164795, + "learning_rate": 5.922570919555982e-06, + "loss": 0.271, + "step": 30504 + }, + { + "epoch": 2.013741182854042, + "grad_norm": 5.0796051025390625, + "learning_rate": 5.922433876935727e-06, + "loss": 0.254, + "step": 30505 + }, + { + "epoch": 2.013754747693977, + "grad_norm": 5.536642074584961, + "learning_rate": 5.922296834315473e-06, + "loss": 0.2113, + "step": 30506 + }, + { + "epoch": 2.0137683125339123, + "grad_norm": 5.266396522521973, + "learning_rate": 5.922159791695218e-06, + "loss": 0.2121, + "step": 30507 + }, + { + "epoch": 2.013781877373847, + "grad_norm": 5.23099946975708, + "learning_rate": 5.922022749074963e-06, + "loss": 0.2978, + "step": 30508 + }, + { + "epoch": 2.013795442213782, + "grad_norm": 6.221008777618408, + "learning_rate": 5.921885706454707e-06, + "loss": 0.2712, + "step": 30509 + }, + { + "epoch": 2.013809007053717, + "grad_norm": 6.398380756378174, + "learning_rate": 5.921748663834453e-06, + "loss": 0.2904, + "step": 30510 + }, + { + "epoch": 2.0138225718936518, + "grad_norm": 4.371461391448975, + "learning_rate": 5.9216116212141984e-06, + "loss": 0.31, + "step": 30511 + }, + { + "epoch": 2.0138361367335866, + "grad_norm": 6.227139949798584, + "learning_rate": 5.921474578593943e-06, + "loss": 0.289, + "step": 30512 + }, + { + "epoch": 2.0138497015735215, + "grad_norm": 5.059231281280518, + "learning_rate": 5.921337535973688e-06, + "loss": 0.1858, + "step": 30513 + }, + { + "epoch": 2.0138632664134564, + "grad_norm": 2.933312177658081, + "learning_rate": 5.921200493353434e-06, + "loss": 0.1449, + "step": 30514 + }, + { + "epoch": 2.0138768312533912, + "grad_norm": 4.854264259338379, + "learning_rate": 5.921063450733179e-06, + "loss": 0.2184, + "step": 30515 + }, + { + "epoch": 2.013890396093326, + "grad_norm": 4.03229284286499, + "learning_rate": 5.9209264081129235e-06, + "loss": 0.2462, + "step": 30516 + }, + { + "epoch": 2.013903960933261, + "grad_norm": 4.854499340057373, + "learning_rate": 5.920789365492669e-06, + "loss": 0.1615, + "step": 30517 + }, + { + "epoch": 2.013917525773196, + "grad_norm": 3.427004337310791, + "learning_rate": 5.920652322872413e-06, + "loss": 0.1353, + "step": 30518 + }, + { + "epoch": 2.0139310906131307, + "grad_norm": 5.840498447418213, + "learning_rate": 5.920515280252159e-06, + "loss": 0.1907, + "step": 30519 + }, + { + "epoch": 2.0139446554530656, + "grad_norm": 5.349653720855713, + "learning_rate": 5.920378237631904e-06, + "loss": 0.2516, + "step": 30520 + }, + { + "epoch": 2.0139582202930004, + "grad_norm": 5.1999688148498535, + "learning_rate": 5.920241195011649e-06, + "loss": 0.2493, + "step": 30521 + }, + { + "epoch": 2.0139717851329353, + "grad_norm": 3.5700831413269043, + "learning_rate": 5.920104152391394e-06, + "loss": 0.1264, + "step": 30522 + }, + { + "epoch": 2.01398534997287, + "grad_norm": 4.899703502655029, + "learning_rate": 5.91996710977114e-06, + "loss": 0.1544, + "step": 30523 + }, + { + "epoch": 2.013998914812805, + "grad_norm": 4.3880534172058105, + "learning_rate": 5.919830067150885e-06, + "loss": 0.1953, + "step": 30524 + }, + { + "epoch": 2.01401247965274, + "grad_norm": 4.375804901123047, + "learning_rate": 5.919693024530629e-06, + "loss": 0.1529, + "step": 30525 + }, + { + "epoch": 2.014026044492675, + "grad_norm": 2.9786736965179443, + "learning_rate": 5.919555981910374e-06, + "loss": 0.098, + "step": 30526 + }, + { + "epoch": 2.01403960933261, + "grad_norm": 5.737345218658447, + "learning_rate": 5.9194189392901204e-06, + "loss": 0.3304, + "step": 30527 + }, + { + "epoch": 2.014053174172545, + "grad_norm": 4.296929836273193, + "learning_rate": 5.919281896669865e-06, + "loss": 0.225, + "step": 30528 + }, + { + "epoch": 2.01406673901248, + "grad_norm": 3.849978446960449, + "learning_rate": 5.91914485404961e-06, + "loss": 0.187, + "step": 30529 + }, + { + "epoch": 2.0140803038524147, + "grad_norm": 4.1622772216796875, + "learning_rate": 5.919007811429355e-06, + "loss": 0.171, + "step": 30530 + }, + { + "epoch": 2.0140938686923495, + "grad_norm": 4.731345176696777, + "learning_rate": 5.9188707688090995e-06, + "loss": 0.2203, + "step": 30531 + }, + { + "epoch": 2.0141074335322844, + "grad_norm": 4.0557379722595215, + "learning_rate": 5.9187337261888455e-06, + "loss": 0.1392, + "step": 30532 + }, + { + "epoch": 2.0141209983722193, + "grad_norm": 4.326881408691406, + "learning_rate": 5.918596683568591e-06, + "loss": 0.1951, + "step": 30533 + }, + { + "epoch": 2.014134563212154, + "grad_norm": 5.06155252456665, + "learning_rate": 5.918459640948335e-06, + "loss": 0.1769, + "step": 30534 + }, + { + "epoch": 2.014148128052089, + "grad_norm": 3.6136813163757324, + "learning_rate": 5.91832259832808e-06, + "loss": 0.1396, + "step": 30535 + }, + { + "epoch": 2.014161692892024, + "grad_norm": 5.777255058288574, + "learning_rate": 5.918185555707826e-06, + "loss": 0.2356, + "step": 30536 + }, + { + "epoch": 2.0141752577319587, + "grad_norm": 4.707388401031494, + "learning_rate": 5.9180485130875705e-06, + "loss": 0.2339, + "step": 30537 + }, + { + "epoch": 2.0141888225718936, + "grad_norm": 2.809431791305542, + "learning_rate": 5.917911470467316e-06, + "loss": 0.1238, + "step": 30538 + }, + { + "epoch": 2.0142023874118284, + "grad_norm": 3.595625877380371, + "learning_rate": 5.917774427847061e-06, + "loss": 0.2205, + "step": 30539 + }, + { + "epoch": 2.0142159522517633, + "grad_norm": 4.11424446105957, + "learning_rate": 5.917637385226807e-06, + "loss": 0.1103, + "step": 30540 + }, + { + "epoch": 2.014229517091698, + "grad_norm": 2.446331024169922, + "learning_rate": 5.917500342606551e-06, + "loss": 0.076, + "step": 30541 + }, + { + "epoch": 2.014243081931633, + "grad_norm": 3.695423126220703, + "learning_rate": 5.9173632999862964e-06, + "loss": 0.1741, + "step": 30542 + }, + { + "epoch": 2.014256646771568, + "grad_norm": 4.968713760375977, + "learning_rate": 5.917226257366041e-06, + "loss": 0.2171, + "step": 30543 + }, + { + "epoch": 2.0142702116115028, + "grad_norm": 3.371802568435669, + "learning_rate": 5.917089214745786e-06, + "loss": 0.1015, + "step": 30544 + }, + { + "epoch": 2.014283776451438, + "grad_norm": 3.7407588958740234, + "learning_rate": 5.916952172125532e-06, + "loss": 0.1564, + "step": 30545 + }, + { + "epoch": 2.014297341291373, + "grad_norm": 6.626188278198242, + "learning_rate": 5.916815129505276e-06, + "loss": 0.2557, + "step": 30546 + }, + { + "epoch": 2.014310906131308, + "grad_norm": 3.270540714263916, + "learning_rate": 5.9166780868850215e-06, + "loss": 0.1272, + "step": 30547 + }, + { + "epoch": 2.0143244709712427, + "grad_norm": 4.75962495803833, + "learning_rate": 5.916541044264767e-06, + "loss": 0.181, + "step": 30548 + }, + { + "epoch": 2.0143380358111775, + "grad_norm": 7.3506035804748535, + "learning_rate": 5.916404001644513e-06, + "loss": 0.2722, + "step": 30549 + }, + { + "epoch": 2.0143516006511124, + "grad_norm": 4.1674604415893555, + "learning_rate": 5.916266959024257e-06, + "loss": 0.1113, + "step": 30550 + }, + { + "epoch": 2.0143651654910473, + "grad_norm": 3.1652324199676514, + "learning_rate": 5.916129916404002e-06, + "loss": 0.1, + "step": 30551 + }, + { + "epoch": 2.014378730330982, + "grad_norm": 3.0117645263671875, + "learning_rate": 5.9159928737837465e-06, + "loss": 0.1471, + "step": 30552 + }, + { + "epoch": 2.014392295170917, + "grad_norm": 3.2907967567443848, + "learning_rate": 5.9158558311634926e-06, + "loss": 0.1088, + "step": 30553 + }, + { + "epoch": 2.014405860010852, + "grad_norm": 4.140408515930176, + "learning_rate": 5.915718788543238e-06, + "loss": 0.1766, + "step": 30554 + }, + { + "epoch": 2.0144194248507867, + "grad_norm": 4.204504489898682, + "learning_rate": 5.915581745922983e-06, + "loss": 0.2323, + "step": 30555 + }, + { + "epoch": 2.0144329896907216, + "grad_norm": 4.724156856536865, + "learning_rate": 5.915444703302727e-06, + "loss": 0.2051, + "step": 30556 + }, + { + "epoch": 2.0144465545306565, + "grad_norm": 5.0811028480529785, + "learning_rate": 5.915307660682472e-06, + "loss": 0.1803, + "step": 30557 + }, + { + "epoch": 2.0144601193705913, + "grad_norm": 3.91611385345459, + "learning_rate": 5.9151706180622184e-06, + "loss": 0.1635, + "step": 30558 + }, + { + "epoch": 2.014473684210526, + "grad_norm": 3.493941068649292, + "learning_rate": 5.915033575441963e-06, + "loss": 0.1749, + "step": 30559 + }, + { + "epoch": 2.014487249050461, + "grad_norm": 6.062831401824951, + "learning_rate": 5.914896532821708e-06, + "loss": 0.2438, + "step": 30560 + }, + { + "epoch": 2.014500813890396, + "grad_norm": 5.442829608917236, + "learning_rate": 5.914759490201452e-06, + "loss": 0.2459, + "step": 30561 + }, + { + "epoch": 2.014514378730331, + "grad_norm": 3.039156913757324, + "learning_rate": 5.914622447581198e-06, + "loss": 0.1418, + "step": 30562 + }, + { + "epoch": 2.0145279435702657, + "grad_norm": 3.1246581077575684, + "learning_rate": 5.9144854049609435e-06, + "loss": 0.1375, + "step": 30563 + }, + { + "epoch": 2.014541508410201, + "grad_norm": 6.200985908508301, + "learning_rate": 5.914348362340689e-06, + "loss": 0.213, + "step": 30564 + }, + { + "epoch": 2.014555073250136, + "grad_norm": 3.315603017807007, + "learning_rate": 5.914211319720433e-06, + "loss": 0.1222, + "step": 30565 + }, + { + "epoch": 2.0145686380900707, + "grad_norm": 5.904848575592041, + "learning_rate": 5.914074277100179e-06, + "loss": 0.1529, + "step": 30566 + }, + { + "epoch": 2.0145822029300056, + "grad_norm": 3.9585301876068115, + "learning_rate": 5.913937234479924e-06, + "loss": 0.2089, + "step": 30567 + }, + { + "epoch": 2.0145957677699404, + "grad_norm": 3.751526355743408, + "learning_rate": 5.9138001918596685e-06, + "loss": 0.1596, + "step": 30568 + }, + { + "epoch": 2.0146093326098753, + "grad_norm": 5.131175994873047, + "learning_rate": 5.913663149239414e-06, + "loss": 0.1675, + "step": 30569 + }, + { + "epoch": 2.01462289744981, + "grad_norm": 3.7444374561309814, + "learning_rate": 5.913526106619159e-06, + "loss": 0.2119, + "step": 30570 + }, + { + "epoch": 2.014636462289745, + "grad_norm": 3.160663366317749, + "learning_rate": 5.913389063998904e-06, + "loss": 0.1074, + "step": 30571 + }, + { + "epoch": 2.01465002712968, + "grad_norm": 4.497218608856201, + "learning_rate": 5.913252021378649e-06, + "loss": 0.2419, + "step": 30572 + }, + { + "epoch": 2.0146635919696148, + "grad_norm": 5.3682661056518555, + "learning_rate": 5.9131149787583944e-06, + "loss": 0.307, + "step": 30573 + }, + { + "epoch": 2.0146771568095496, + "grad_norm": 5.626722812652588, + "learning_rate": 5.912977936138139e-06, + "loss": 0.33, + "step": 30574 + }, + { + "epoch": 2.0146907216494845, + "grad_norm": 6.812227249145508, + "learning_rate": 5.912840893517885e-06, + "loss": 0.2659, + "step": 30575 + }, + { + "epoch": 2.0147042864894193, + "grad_norm": 6.414539337158203, + "learning_rate": 5.91270385089763e-06, + "loss": 0.3934, + "step": 30576 + }, + { + "epoch": 2.014717851329354, + "grad_norm": 4.633365154266357, + "learning_rate": 5.912566808277374e-06, + "loss": 0.1975, + "step": 30577 + }, + { + "epoch": 2.014731416169289, + "grad_norm": 3.5247578620910645, + "learning_rate": 5.9124297656571195e-06, + "loss": 0.1282, + "step": 30578 + }, + { + "epoch": 2.014744981009224, + "grad_norm": 4.899960041046143, + "learning_rate": 5.9122927230368655e-06, + "loss": 0.2588, + "step": 30579 + }, + { + "epoch": 2.014758545849159, + "grad_norm": 4.593648910522461, + "learning_rate": 5.91215568041661e-06, + "loss": 0.2198, + "step": 30580 + }, + { + "epoch": 2.0147721106890937, + "grad_norm": 3.4389865398406982, + "learning_rate": 5.912018637796355e-06, + "loss": 0.1199, + "step": 30581 + }, + { + "epoch": 2.0147856755290285, + "grad_norm": 4.388910293579102, + "learning_rate": 5.9118815951761e-06, + "loss": 0.1438, + "step": 30582 + }, + { + "epoch": 2.014799240368964, + "grad_norm": 5.101113796234131, + "learning_rate": 5.911744552555846e-06, + "loss": 0.3852, + "step": 30583 + }, + { + "epoch": 2.0148128052088987, + "grad_norm": 4.707088947296143, + "learning_rate": 5.9116075099355906e-06, + "loss": 0.1816, + "step": 30584 + }, + { + "epoch": 2.0148263700488336, + "grad_norm": 3.2904927730560303, + "learning_rate": 5.911470467315336e-06, + "loss": 0.1191, + "step": 30585 + }, + { + "epoch": 2.0148399348887684, + "grad_norm": 3.563647747039795, + "learning_rate": 5.91133342469508e-06, + "loss": 0.1502, + "step": 30586 + }, + { + "epoch": 2.0148534997287033, + "grad_norm": 5.732647895812988, + "learning_rate": 5.911196382074825e-06, + "loss": 0.23, + "step": 30587 + }, + { + "epoch": 2.014867064568638, + "grad_norm": 3.549497127532959, + "learning_rate": 5.911059339454571e-06, + "loss": 0.1861, + "step": 30588 + }, + { + "epoch": 2.014880629408573, + "grad_norm": 5.8173508644104, + "learning_rate": 5.9109222968343165e-06, + "loss": 0.2447, + "step": 30589 + }, + { + "epoch": 2.014894194248508, + "grad_norm": 5.310005187988281, + "learning_rate": 5.910785254214061e-06, + "loss": 0.247, + "step": 30590 + }, + { + "epoch": 2.0149077590884428, + "grad_norm": 6.025533199310303, + "learning_rate": 5.910648211593806e-06, + "loss": 0.238, + "step": 30591 + }, + { + "epoch": 2.0149213239283776, + "grad_norm": 4.722930431365967, + "learning_rate": 5.910511168973552e-06, + "loss": 0.1681, + "step": 30592 + }, + { + "epoch": 2.0149348887683125, + "grad_norm": 4.578681945800781, + "learning_rate": 5.910374126353296e-06, + "loss": 0.2004, + "step": 30593 + }, + { + "epoch": 2.0149484536082474, + "grad_norm": 4.343827724456787, + "learning_rate": 5.9102370837330415e-06, + "loss": 0.1836, + "step": 30594 + }, + { + "epoch": 2.0149620184481822, + "grad_norm": 5.202144622802734, + "learning_rate": 5.910100041112786e-06, + "loss": 0.2601, + "step": 30595 + }, + { + "epoch": 2.014975583288117, + "grad_norm": 5.931861400604248, + "learning_rate": 5.909962998492532e-06, + "loss": 0.2044, + "step": 30596 + }, + { + "epoch": 2.014989148128052, + "grad_norm": 4.701220989227295, + "learning_rate": 5.909825955872277e-06, + "loss": 0.2977, + "step": 30597 + }, + { + "epoch": 2.015002712967987, + "grad_norm": 3.457746982574463, + "learning_rate": 5.909688913252022e-06, + "loss": 0.094, + "step": 30598 + }, + { + "epoch": 2.0150162778079217, + "grad_norm": 4.678277015686035, + "learning_rate": 5.9095518706317665e-06, + "loss": 0.1467, + "step": 30599 + }, + { + "epoch": 2.0150298426478566, + "grad_norm": 3.5949416160583496, + "learning_rate": 5.909414828011512e-06, + "loss": 0.1508, + "step": 30600 + }, + { + "epoch": 2.0150434074877914, + "grad_norm": 6.789238452911377, + "learning_rate": 5.909277785391258e-06, + "loss": 0.2745, + "step": 30601 + }, + { + "epoch": 2.0150569723277267, + "grad_norm": 4.85530948638916, + "learning_rate": 5.909140742771002e-06, + "loss": 0.1847, + "step": 30602 + }, + { + "epoch": 2.0150705371676616, + "grad_norm": 3.5702297687530518, + "learning_rate": 5.909003700150747e-06, + "loss": 0.1688, + "step": 30603 + }, + { + "epoch": 2.0150841020075965, + "grad_norm": 4.186743259429932, + "learning_rate": 5.9088666575304924e-06, + "loss": 0.2143, + "step": 30604 + }, + { + "epoch": 2.0150976668475313, + "grad_norm": 5.632944107055664, + "learning_rate": 5.908729614910238e-06, + "loss": 0.1895, + "step": 30605 + }, + { + "epoch": 2.015111231687466, + "grad_norm": 5.396726608276367, + "learning_rate": 5.908592572289983e-06, + "loss": 0.2614, + "step": 30606 + }, + { + "epoch": 2.015124796527401, + "grad_norm": 5.405796051025391, + "learning_rate": 5.908455529669728e-06, + "loss": 0.2323, + "step": 30607 + }, + { + "epoch": 2.015138361367336, + "grad_norm": 5.643486499786377, + "learning_rate": 5.908318487049472e-06, + "loss": 0.233, + "step": 30608 + }, + { + "epoch": 2.015151926207271, + "grad_norm": 4.689497470855713, + "learning_rate": 5.908181444429218e-06, + "loss": 0.2133, + "step": 30609 + }, + { + "epoch": 2.0151654910472057, + "grad_norm": 6.596088409423828, + "learning_rate": 5.9080444018089635e-06, + "loss": 0.1672, + "step": 30610 + }, + { + "epoch": 2.0151790558871405, + "grad_norm": 4.461429119110107, + "learning_rate": 5.907907359188708e-06, + "loss": 0.2119, + "step": 30611 + }, + { + "epoch": 2.0151926207270754, + "grad_norm": 5.967731952667236, + "learning_rate": 5.907770316568453e-06, + "loss": 0.2408, + "step": 30612 + }, + { + "epoch": 2.0152061855670103, + "grad_norm": 4.099944591522217, + "learning_rate": 5.907633273948198e-06, + "loss": 0.178, + "step": 30613 + }, + { + "epoch": 2.015219750406945, + "grad_norm": 4.640238285064697, + "learning_rate": 5.907496231327944e-06, + "loss": 0.1627, + "step": 30614 + }, + { + "epoch": 2.01523331524688, + "grad_norm": 4.888449668884277, + "learning_rate": 5.9073591887076886e-06, + "loss": 0.2114, + "step": 30615 + }, + { + "epoch": 2.015246880086815, + "grad_norm": 3.9536471366882324, + "learning_rate": 5.907222146087434e-06, + "loss": 0.1062, + "step": 30616 + }, + { + "epoch": 2.0152604449267497, + "grad_norm": 6.0182976722717285, + "learning_rate": 5.907085103467178e-06, + "loss": 0.1835, + "step": 30617 + }, + { + "epoch": 2.0152740097666846, + "grad_norm": 3.9777987003326416, + "learning_rate": 5.906948060846924e-06, + "loss": 0.2269, + "step": 30618 + }, + { + "epoch": 2.0152875746066194, + "grad_norm": 4.414687633514404, + "learning_rate": 5.906811018226669e-06, + "loss": 0.2341, + "step": 30619 + }, + { + "epoch": 2.0153011394465543, + "grad_norm": 5.637482166290283, + "learning_rate": 5.906673975606414e-06, + "loss": 0.2092, + "step": 30620 + }, + { + "epoch": 2.0153147042864896, + "grad_norm": 3.2401387691497803, + "learning_rate": 5.906536932986159e-06, + "loss": 0.1068, + "step": 30621 + }, + { + "epoch": 2.0153282691264245, + "grad_norm": 3.682384490966797, + "learning_rate": 5.906399890365905e-06, + "loss": 0.1445, + "step": 30622 + }, + { + "epoch": 2.0153418339663594, + "grad_norm": 5.150812149047852, + "learning_rate": 5.90626284774565e-06, + "loss": 0.158, + "step": 30623 + }, + { + "epoch": 2.015355398806294, + "grad_norm": 4.233219146728516, + "learning_rate": 5.906125805125394e-06, + "loss": 0.2045, + "step": 30624 + }, + { + "epoch": 2.015368963646229, + "grad_norm": 4.548326015472412, + "learning_rate": 5.9059887625051395e-06, + "loss": 0.2021, + "step": 30625 + }, + { + "epoch": 2.015382528486164, + "grad_norm": 4.8538970947265625, + "learning_rate": 5.905851719884884e-06, + "loss": 0.1547, + "step": 30626 + }, + { + "epoch": 2.015396093326099, + "grad_norm": 6.199810028076172, + "learning_rate": 5.90571467726463e-06, + "loss": 0.2449, + "step": 30627 + }, + { + "epoch": 2.0154096581660337, + "grad_norm": 3.818826198577881, + "learning_rate": 5.905577634644375e-06, + "loss": 0.1558, + "step": 30628 + }, + { + "epoch": 2.0154232230059685, + "grad_norm": 3.918907403945923, + "learning_rate": 5.905440592024119e-06, + "loss": 0.1882, + "step": 30629 + }, + { + "epoch": 2.0154367878459034, + "grad_norm": 5.273148536682129, + "learning_rate": 5.9053035494038646e-06, + "loss": 0.3021, + "step": 30630 + }, + { + "epoch": 2.0154503526858383, + "grad_norm": 4.010735034942627, + "learning_rate": 5.9051665067836106e-06, + "loss": 0.1611, + "step": 30631 + }, + { + "epoch": 2.015463917525773, + "grad_norm": 5.321109294891357, + "learning_rate": 5.905029464163356e-06, + "loss": 0.1718, + "step": 30632 + }, + { + "epoch": 2.015477482365708, + "grad_norm": 4.765657424926758, + "learning_rate": 5.9048924215431e-06, + "loss": 0.1861, + "step": 30633 + }, + { + "epoch": 2.015491047205643, + "grad_norm": 4.430872440338135, + "learning_rate": 5.904755378922845e-06, + "loss": 0.2668, + "step": 30634 + }, + { + "epoch": 2.0155046120455777, + "grad_norm": 3.688713788986206, + "learning_rate": 5.904618336302591e-06, + "loss": 0.1632, + "step": 30635 + }, + { + "epoch": 2.0155181768855126, + "grad_norm": 4.568917274475098, + "learning_rate": 5.904481293682336e-06, + "loss": 0.1941, + "step": 30636 + }, + { + "epoch": 2.0155317417254475, + "grad_norm": 5.6310715675354, + "learning_rate": 5.904344251062081e-06, + "loss": 0.261, + "step": 30637 + }, + { + "epoch": 2.0155453065653823, + "grad_norm": 4.914051055908203, + "learning_rate": 5.904207208441826e-06, + "loss": 0.1818, + "step": 30638 + }, + { + "epoch": 2.015558871405317, + "grad_norm": 5.4559245109558105, + "learning_rate": 5.90407016582157e-06, + "loss": 0.2745, + "step": 30639 + }, + { + "epoch": 2.0155724362452525, + "grad_norm": 4.160536289215088, + "learning_rate": 5.903933123201316e-06, + "loss": 0.1377, + "step": 30640 + }, + { + "epoch": 2.0155860010851874, + "grad_norm": 3.423996686935425, + "learning_rate": 5.9037960805810615e-06, + "loss": 0.1385, + "step": 30641 + }, + { + "epoch": 2.0155995659251222, + "grad_norm": 3.414640426635742, + "learning_rate": 5.903659037960806e-06, + "loss": 0.1235, + "step": 30642 + }, + { + "epoch": 2.015613130765057, + "grad_norm": 4.33316707611084, + "learning_rate": 5.903521995340551e-06, + "loss": 0.1552, + "step": 30643 + }, + { + "epoch": 2.015626695604992, + "grad_norm": 3.5305702686309814, + "learning_rate": 5.903384952720297e-06, + "loss": 0.0884, + "step": 30644 + }, + { + "epoch": 2.015640260444927, + "grad_norm": 5.69609260559082, + "learning_rate": 5.903247910100041e-06, + "loss": 0.1904, + "step": 30645 + }, + { + "epoch": 2.0156538252848617, + "grad_norm": 4.514864444732666, + "learning_rate": 5.9031108674797866e-06, + "loss": 0.2976, + "step": 30646 + }, + { + "epoch": 2.0156673901247966, + "grad_norm": 4.325079441070557, + "learning_rate": 5.902973824859532e-06, + "loss": 0.1584, + "step": 30647 + }, + { + "epoch": 2.0156809549647314, + "grad_norm": 4.486607551574707, + "learning_rate": 5.902836782239278e-06, + "loss": 0.1638, + "step": 30648 + }, + { + "epoch": 2.0156945198046663, + "grad_norm": 3.2098605632781982, + "learning_rate": 5.902699739619022e-06, + "loss": 0.0822, + "step": 30649 + }, + { + "epoch": 2.015708084644601, + "grad_norm": 5.52536153793335, + "learning_rate": 5.902562696998767e-06, + "loss": 0.2107, + "step": 30650 + }, + { + "epoch": 2.015721649484536, + "grad_norm": 4.620817184448242, + "learning_rate": 5.902425654378512e-06, + "loss": 0.1713, + "step": 30651 + }, + { + "epoch": 2.015735214324471, + "grad_norm": 4.428233623504639, + "learning_rate": 5.902288611758258e-06, + "loss": 0.1754, + "step": 30652 + }, + { + "epoch": 2.0157487791644058, + "grad_norm": 5.872151851654053, + "learning_rate": 5.902151569138003e-06, + "loss": 0.405, + "step": 30653 + }, + { + "epoch": 2.0157623440043406, + "grad_norm": 3.3356778621673584, + "learning_rate": 5.902014526517747e-06, + "loss": 0.0864, + "step": 30654 + }, + { + "epoch": 2.0157759088442755, + "grad_norm": 3.390016794204712, + "learning_rate": 5.901877483897492e-06, + "loss": 0.1281, + "step": 30655 + }, + { + "epoch": 2.0157894736842104, + "grad_norm": 3.989238739013672, + "learning_rate": 5.9017404412772375e-06, + "loss": 0.1767, + "step": 30656 + }, + { + "epoch": 2.015803038524145, + "grad_norm": 3.732896327972412, + "learning_rate": 5.9016033986569835e-06, + "loss": 0.148, + "step": 30657 + }, + { + "epoch": 2.01581660336408, + "grad_norm": 3.990600824356079, + "learning_rate": 5.901466356036728e-06, + "loss": 0.1133, + "step": 30658 + }, + { + "epoch": 2.0158301682040154, + "grad_norm": 5.995012283325195, + "learning_rate": 5.901329313416473e-06, + "loss": 0.2493, + "step": 30659 + }, + { + "epoch": 2.0158437330439503, + "grad_norm": 4.412806987762451, + "learning_rate": 5.901192270796217e-06, + "loss": 0.2079, + "step": 30660 + }, + { + "epoch": 2.015857297883885, + "grad_norm": 4.794227600097656, + "learning_rate": 5.901055228175963e-06, + "loss": 0.1601, + "step": 30661 + }, + { + "epoch": 2.01587086272382, + "grad_norm": 4.594535827636719, + "learning_rate": 5.900918185555709e-06, + "loss": 0.175, + "step": 30662 + }, + { + "epoch": 2.015884427563755, + "grad_norm": 4.079092502593994, + "learning_rate": 5.900781142935454e-06, + "loss": 0.135, + "step": 30663 + }, + { + "epoch": 2.0158979924036897, + "grad_norm": 5.058148384094238, + "learning_rate": 5.900644100315198e-06, + "loss": 0.1997, + "step": 30664 + }, + { + "epoch": 2.0159115572436246, + "grad_norm": 4.051612854003906, + "learning_rate": 5.900507057694944e-06, + "loss": 0.3026, + "step": 30665 + }, + { + "epoch": 2.0159251220835595, + "grad_norm": 4.514309406280518, + "learning_rate": 5.900370015074689e-06, + "loss": 0.1813, + "step": 30666 + }, + { + "epoch": 2.0159386869234943, + "grad_norm": 6.039506435394287, + "learning_rate": 5.900232972454434e-06, + "loss": 0.3432, + "step": 30667 + }, + { + "epoch": 2.015952251763429, + "grad_norm": 4.291135311126709, + "learning_rate": 5.900095929834179e-06, + "loss": 0.1075, + "step": 30668 + }, + { + "epoch": 2.015965816603364, + "grad_norm": 6.255913734436035, + "learning_rate": 5.899958887213923e-06, + "loss": 0.2353, + "step": 30669 + }, + { + "epoch": 2.015979381443299, + "grad_norm": 4.492503643035889, + "learning_rate": 5.899821844593669e-06, + "loss": 0.1653, + "step": 30670 + }, + { + "epoch": 2.0159929462832338, + "grad_norm": 3.4031431674957275, + "learning_rate": 5.899684801973414e-06, + "loss": 0.181, + "step": 30671 + }, + { + "epoch": 2.0160065111231686, + "grad_norm": 4.149576663970947, + "learning_rate": 5.8995477593531595e-06, + "loss": 0.2009, + "step": 30672 + }, + { + "epoch": 2.0160200759631035, + "grad_norm": 4.312473297119141, + "learning_rate": 5.899410716732904e-06, + "loss": 0.1703, + "step": 30673 + }, + { + "epoch": 2.0160336408030384, + "grad_norm": 4.395203590393066, + "learning_rate": 5.89927367411265e-06, + "loss": 0.1829, + "step": 30674 + }, + { + "epoch": 2.0160472056429732, + "grad_norm": 4.016585826873779, + "learning_rate": 5.899136631492395e-06, + "loss": 0.166, + "step": 30675 + }, + { + "epoch": 2.016060770482908, + "grad_norm": 5.036203384399414, + "learning_rate": 5.898999588872139e-06, + "loss": 0.156, + "step": 30676 + }, + { + "epoch": 2.016074335322843, + "grad_norm": 3.927668809890747, + "learning_rate": 5.8988625462518846e-06, + "loss": 0.254, + "step": 30677 + }, + { + "epoch": 2.0160879001627783, + "grad_norm": 3.7394957542419434, + "learning_rate": 5.898725503631631e-06, + "loss": 0.1447, + "step": 30678 + }, + { + "epoch": 2.016101465002713, + "grad_norm": 4.4299540519714355, + "learning_rate": 5.898588461011375e-06, + "loss": 0.1634, + "step": 30679 + }, + { + "epoch": 2.016115029842648, + "grad_norm": 4.4288411140441895, + "learning_rate": 5.89845141839112e-06, + "loss": 0.1561, + "step": 30680 + }, + { + "epoch": 2.016128594682583, + "grad_norm": 3.9128048419952393, + "learning_rate": 5.898314375770865e-06, + "loss": 0.1945, + "step": 30681 + }, + { + "epoch": 2.0161421595225177, + "grad_norm": 3.252234935760498, + "learning_rate": 5.89817733315061e-06, + "loss": 0.1555, + "step": 30682 + }, + { + "epoch": 2.0161557243624526, + "grad_norm": 4.739649772644043, + "learning_rate": 5.898040290530356e-06, + "loss": 0.206, + "step": 30683 + }, + { + "epoch": 2.0161692892023875, + "grad_norm": 6.3711066246032715, + "learning_rate": 5.897903247910101e-06, + "loss": 0.3116, + "step": 30684 + }, + { + "epoch": 2.0161828540423223, + "grad_norm": 4.9584174156188965, + "learning_rate": 5.897766205289845e-06, + "loss": 0.245, + "step": 30685 + }, + { + "epoch": 2.016196418882257, + "grad_norm": 5.811217308044434, + "learning_rate": 5.89762916266959e-06, + "loss": 0.2763, + "step": 30686 + }, + { + "epoch": 2.016209983722192, + "grad_norm": 4.072588920593262, + "learning_rate": 5.897492120049336e-06, + "loss": 0.264, + "step": 30687 + }, + { + "epoch": 2.016223548562127, + "grad_norm": 4.946502208709717, + "learning_rate": 5.897355077429081e-06, + "loss": 0.1822, + "step": 30688 + }, + { + "epoch": 2.016237113402062, + "grad_norm": 5.106288909912109, + "learning_rate": 5.897218034808826e-06, + "loss": 0.294, + "step": 30689 + }, + { + "epoch": 2.0162506782419967, + "grad_norm": 4.2648091316223145, + "learning_rate": 5.897080992188571e-06, + "loss": 0.2051, + "step": 30690 + }, + { + "epoch": 2.0162642430819315, + "grad_norm": 3.947946548461914, + "learning_rate": 5.896943949568317e-06, + "loss": 0.1747, + "step": 30691 + }, + { + "epoch": 2.0162778079218664, + "grad_norm": 4.305627346038818, + "learning_rate": 5.896806906948061e-06, + "loss": 0.2091, + "step": 30692 + }, + { + "epoch": 2.0162913727618013, + "grad_norm": 4.446785926818848, + "learning_rate": 5.896669864327807e-06, + "loss": 0.2045, + "step": 30693 + }, + { + "epoch": 2.016304937601736, + "grad_norm": 4.9722981452941895, + "learning_rate": 5.896532821707551e-06, + "loss": 0.3697, + "step": 30694 + }, + { + "epoch": 2.016318502441671, + "grad_norm": 5.522793292999268, + "learning_rate": 5.896395779087296e-06, + "loss": 0.1901, + "step": 30695 + }, + { + "epoch": 2.0163320672816063, + "grad_norm": 4.993606090545654, + "learning_rate": 5.896258736467042e-06, + "loss": 0.1624, + "step": 30696 + }, + { + "epoch": 2.016345632121541, + "grad_norm": 4.8599042892456055, + "learning_rate": 5.896121693846787e-06, + "loss": 0.2016, + "step": 30697 + }, + { + "epoch": 2.016359196961476, + "grad_norm": 5.181332111358643, + "learning_rate": 5.895984651226532e-06, + "loss": 0.2339, + "step": 30698 + }, + { + "epoch": 2.016372761801411, + "grad_norm": 4.370525360107422, + "learning_rate": 5.895847608606277e-06, + "loss": 0.1664, + "step": 30699 + }, + { + "epoch": 2.0163863266413458, + "grad_norm": 6.4928789138793945, + "learning_rate": 5.895710565986023e-06, + "loss": 0.308, + "step": 30700 + }, + { + "epoch": 2.0163998914812806, + "grad_norm": 4.103401184082031, + "learning_rate": 5.895573523365767e-06, + "loss": 0.1003, + "step": 30701 + }, + { + "epoch": 2.0164134563212155, + "grad_norm": 5.382745265960693, + "learning_rate": 5.895436480745512e-06, + "loss": 0.1245, + "step": 30702 + }, + { + "epoch": 2.0164270211611504, + "grad_norm": 6.357028484344482, + "learning_rate": 5.895299438125257e-06, + "loss": 0.3568, + "step": 30703 + }, + { + "epoch": 2.0164405860010852, + "grad_norm": 4.076702117919922, + "learning_rate": 5.895162395505003e-06, + "loss": 0.1387, + "step": 30704 + }, + { + "epoch": 2.01645415084102, + "grad_norm": 5.655196666717529, + "learning_rate": 5.895025352884748e-06, + "loss": 0.2445, + "step": 30705 + }, + { + "epoch": 2.016467715680955, + "grad_norm": 3.3468523025512695, + "learning_rate": 5.894888310264493e-06, + "loss": 0.1589, + "step": 30706 + }, + { + "epoch": 2.01648128052089, + "grad_norm": 3.3440539836883545, + "learning_rate": 5.894751267644237e-06, + "loss": 0.0914, + "step": 30707 + }, + { + "epoch": 2.0164948453608247, + "grad_norm": 5.272005081176758, + "learning_rate": 5.8946142250239826e-06, + "loss": 0.263, + "step": 30708 + }, + { + "epoch": 2.0165084102007595, + "grad_norm": 4.648811340332031, + "learning_rate": 5.894477182403729e-06, + "loss": 0.1302, + "step": 30709 + }, + { + "epoch": 2.0165219750406944, + "grad_norm": 4.993647575378418, + "learning_rate": 5.894340139783473e-06, + "loss": 0.2537, + "step": 30710 + }, + { + "epoch": 2.0165355398806293, + "grad_norm": 5.123989582061768, + "learning_rate": 5.894203097163218e-06, + "loss": 0.2165, + "step": 30711 + }, + { + "epoch": 2.016549104720564, + "grad_norm": 4.640966892242432, + "learning_rate": 5.894066054542963e-06, + "loss": 0.2247, + "step": 30712 + }, + { + "epoch": 2.016562669560499, + "grad_norm": 5.314429759979248, + "learning_rate": 5.8939290119227085e-06, + "loss": 0.222, + "step": 30713 + }, + { + "epoch": 2.016576234400434, + "grad_norm": 6.118106842041016, + "learning_rate": 5.893791969302454e-06, + "loss": 0.2672, + "step": 30714 + }, + { + "epoch": 2.0165897992403687, + "grad_norm": 4.4792656898498535, + "learning_rate": 5.893654926682199e-06, + "loss": 0.257, + "step": 30715 + }, + { + "epoch": 2.016603364080304, + "grad_norm": 4.2516889572143555, + "learning_rate": 5.893517884061943e-06, + "loss": 0.1826, + "step": 30716 + }, + { + "epoch": 2.016616928920239, + "grad_norm": 5.198099613189697, + "learning_rate": 5.893380841441689e-06, + "loss": 0.2354, + "step": 30717 + }, + { + "epoch": 2.016630493760174, + "grad_norm": 5.226161003112793, + "learning_rate": 5.893243798821434e-06, + "loss": 0.2788, + "step": 30718 + }, + { + "epoch": 2.0166440586001086, + "grad_norm": 3.8314061164855957, + "learning_rate": 5.893106756201179e-06, + "loss": 0.2108, + "step": 30719 + }, + { + "epoch": 2.0166576234400435, + "grad_norm": 6.871849536895752, + "learning_rate": 5.892969713580924e-06, + "loss": 0.2443, + "step": 30720 + }, + { + "epoch": 2.0166711882799784, + "grad_norm": 4.299870014190674, + "learning_rate": 5.89283267096067e-06, + "loss": 0.0946, + "step": 30721 + }, + { + "epoch": 2.0166847531199132, + "grad_norm": 4.972745895385742, + "learning_rate": 5.892695628340414e-06, + "loss": 0.2246, + "step": 30722 + }, + { + "epoch": 2.016698317959848, + "grad_norm": 6.176463603973389, + "learning_rate": 5.892558585720159e-06, + "loss": 0.1313, + "step": 30723 + }, + { + "epoch": 2.016711882799783, + "grad_norm": 4.0144782066345215, + "learning_rate": 5.892421543099905e-06, + "loss": 0.173, + "step": 30724 + }, + { + "epoch": 2.016725447639718, + "grad_norm": 5.58394193649292, + "learning_rate": 5.892284500479649e-06, + "loss": 0.1949, + "step": 30725 + }, + { + "epoch": 2.0167390124796527, + "grad_norm": 4.613594055175781, + "learning_rate": 5.892147457859395e-06, + "loss": 0.144, + "step": 30726 + }, + { + "epoch": 2.0167525773195876, + "grad_norm": 4.767554759979248, + "learning_rate": 5.89201041523914e-06, + "loss": 0.1772, + "step": 30727 + }, + { + "epoch": 2.0167661421595224, + "grad_norm": 4.4163641929626465, + "learning_rate": 5.8918733726188845e-06, + "loss": 0.2346, + "step": 30728 + }, + { + "epoch": 2.0167797069994573, + "grad_norm": 3.9029653072357178, + "learning_rate": 5.89173632999863e-06, + "loss": 0.1759, + "step": 30729 + }, + { + "epoch": 2.016793271839392, + "grad_norm": 4.828647136688232, + "learning_rate": 5.891599287378376e-06, + "loss": 0.2443, + "step": 30730 + }, + { + "epoch": 2.016806836679327, + "grad_norm": 6.423611164093018, + "learning_rate": 5.891462244758121e-06, + "loss": 0.268, + "step": 30731 + }, + { + "epoch": 2.016820401519262, + "grad_norm": 4.91892671585083, + "learning_rate": 5.891325202137865e-06, + "loss": 0.1739, + "step": 30732 + }, + { + "epoch": 2.0168339663591968, + "grad_norm": 4.116065979003906, + "learning_rate": 5.89118815951761e-06, + "loss": 0.169, + "step": 30733 + }, + { + "epoch": 2.016847531199132, + "grad_norm": 5.942781925201416, + "learning_rate": 5.891051116897356e-06, + "loss": 0.23, + "step": 30734 + }, + { + "epoch": 2.016861096039067, + "grad_norm": 5.274904727935791, + "learning_rate": 5.890914074277101e-06, + "loss": 0.1754, + "step": 30735 + }, + { + "epoch": 2.016874660879002, + "grad_norm": 3.9287984371185303, + "learning_rate": 5.890777031656846e-06, + "loss": 0.1893, + "step": 30736 + }, + { + "epoch": 2.0168882257189367, + "grad_norm": 4.4800214767456055, + "learning_rate": 5.89063998903659e-06, + "loss": 0.2248, + "step": 30737 + }, + { + "epoch": 2.0169017905588715, + "grad_norm": 5.542880058288574, + "learning_rate": 5.890502946416335e-06, + "loss": 0.2202, + "step": 30738 + }, + { + "epoch": 2.0169153553988064, + "grad_norm": 5.372951507568359, + "learning_rate": 5.8903659037960814e-06, + "loss": 0.3618, + "step": 30739 + }, + { + "epoch": 2.0169289202387413, + "grad_norm": 4.788368225097656, + "learning_rate": 5.890228861175827e-06, + "loss": 0.1583, + "step": 30740 + }, + { + "epoch": 2.016942485078676, + "grad_norm": 2.848891496658325, + "learning_rate": 5.890091818555571e-06, + "loss": 0.0892, + "step": 30741 + }, + { + "epoch": 2.016956049918611, + "grad_norm": 3.284820318222046, + "learning_rate": 5.889954775935316e-06, + "loss": 0.1821, + "step": 30742 + }, + { + "epoch": 2.016969614758546, + "grad_norm": 4.554249286651611, + "learning_rate": 5.889817733315062e-06, + "loss": 0.218, + "step": 30743 + }, + { + "epoch": 2.0169831795984807, + "grad_norm": 4.673208236694336, + "learning_rate": 5.8896806906948065e-06, + "loss": 0.235, + "step": 30744 + }, + { + "epoch": 2.0169967444384156, + "grad_norm": 6.575447082519531, + "learning_rate": 5.889543648074552e-06, + "loss": 0.1653, + "step": 30745 + }, + { + "epoch": 2.0170103092783505, + "grad_norm": 6.2098774909973145, + "learning_rate": 5.889406605454297e-06, + "loss": 0.3149, + "step": 30746 + }, + { + "epoch": 2.0170238741182853, + "grad_norm": 3.7838821411132812, + "learning_rate": 5.889269562834042e-06, + "loss": 0.1675, + "step": 30747 + }, + { + "epoch": 2.01703743895822, + "grad_norm": 4.352366924285889, + "learning_rate": 5.889132520213787e-06, + "loss": 0.1989, + "step": 30748 + }, + { + "epoch": 2.017051003798155, + "grad_norm": 3.9077985286712646, + "learning_rate": 5.888995477593532e-06, + "loss": 0.1393, + "step": 30749 + }, + { + "epoch": 2.01706456863809, + "grad_norm": 7.421656608581543, + "learning_rate": 5.888858434973277e-06, + "loss": 0.2482, + "step": 30750 + }, + { + "epoch": 2.017078133478025, + "grad_norm": 3.928628444671631, + "learning_rate": 5.888721392353022e-06, + "loss": 0.1592, + "step": 30751 + }, + { + "epoch": 2.0170916983179596, + "grad_norm": 3.3251795768737793, + "learning_rate": 5.888584349732768e-06, + "loss": 0.0826, + "step": 30752 + }, + { + "epoch": 2.017105263157895, + "grad_norm": 4.541581630706787, + "learning_rate": 5.888447307112512e-06, + "loss": 0.2191, + "step": 30753 + }, + { + "epoch": 2.01711882799783, + "grad_norm": 5.147043228149414, + "learning_rate": 5.888310264492257e-06, + "loss": 0.1725, + "step": 30754 + }, + { + "epoch": 2.0171323928377647, + "grad_norm": 5.017758369445801, + "learning_rate": 5.888173221872003e-06, + "loss": 0.242, + "step": 30755 + }, + { + "epoch": 2.0171459576776996, + "grad_norm": 3.120680809020996, + "learning_rate": 5.888036179251749e-06, + "loss": 0.0894, + "step": 30756 + }, + { + "epoch": 2.0171595225176344, + "grad_norm": 4.449270725250244, + "learning_rate": 5.887899136631493e-06, + "loss": 0.1839, + "step": 30757 + }, + { + "epoch": 2.0171730873575693, + "grad_norm": 6.020421981811523, + "learning_rate": 5.887762094011238e-06, + "loss": 0.2678, + "step": 30758 + }, + { + "epoch": 2.017186652197504, + "grad_norm": 3.692340850830078, + "learning_rate": 5.8876250513909825e-06, + "loss": 0.1249, + "step": 30759 + }, + { + "epoch": 2.017200217037439, + "grad_norm": 4.160720348358154, + "learning_rate": 5.8874880087707285e-06, + "loss": 0.0865, + "step": 30760 + }, + { + "epoch": 2.017213781877374, + "grad_norm": 5.16252326965332, + "learning_rate": 5.887350966150474e-06, + "loss": 0.1642, + "step": 30761 + }, + { + "epoch": 2.0172273467173087, + "grad_norm": 5.814200401306152, + "learning_rate": 5.887213923530218e-06, + "loss": 0.1506, + "step": 30762 + }, + { + "epoch": 2.0172409115572436, + "grad_norm": 3.7057344913482666, + "learning_rate": 5.887076880909963e-06, + "loss": 0.128, + "step": 30763 + }, + { + "epoch": 2.0172544763971785, + "grad_norm": 6.1141839027404785, + "learning_rate": 5.886939838289708e-06, + "loss": 0.1446, + "step": 30764 + }, + { + "epoch": 2.0172680412371133, + "grad_norm": 3.111128807067871, + "learning_rate": 5.886802795669454e-06, + "loss": 0.1233, + "step": 30765 + }, + { + "epoch": 2.017281606077048, + "grad_norm": 4.70784854888916, + "learning_rate": 5.886665753049199e-06, + "loss": 0.1465, + "step": 30766 + }, + { + "epoch": 2.017295170916983, + "grad_norm": 3.5253024101257324, + "learning_rate": 5.886528710428944e-06, + "loss": 0.1588, + "step": 30767 + }, + { + "epoch": 2.017308735756918, + "grad_norm": 4.505819320678711, + "learning_rate": 5.886391667808688e-06, + "loss": 0.1629, + "step": 30768 + }, + { + "epoch": 2.017322300596853, + "grad_norm": 3.6505351066589355, + "learning_rate": 5.886254625188434e-06, + "loss": 0.1133, + "step": 30769 + }, + { + "epoch": 2.0173358654367877, + "grad_norm": 3.597270965576172, + "learning_rate": 5.8861175825681794e-06, + "loss": 0.1828, + "step": 30770 + }, + { + "epoch": 2.0173494302767225, + "grad_norm": 4.452805519104004, + "learning_rate": 5.885980539947924e-06, + "loss": 0.0805, + "step": 30771 + }, + { + "epoch": 2.017362995116658, + "grad_norm": 5.139366626739502, + "learning_rate": 5.885843497327669e-06, + "loss": 0.1931, + "step": 30772 + }, + { + "epoch": 2.0173765599565927, + "grad_norm": 3.940269947052002, + "learning_rate": 5.885706454707415e-06, + "loss": 0.1538, + "step": 30773 + }, + { + "epoch": 2.0173901247965276, + "grad_norm": 4.366747856140137, + "learning_rate": 5.88556941208716e-06, + "loss": 0.1467, + "step": 30774 + }, + { + "epoch": 2.0174036896364624, + "grad_norm": 4.04473352432251, + "learning_rate": 5.8854323694669045e-06, + "loss": 0.1648, + "step": 30775 + }, + { + "epoch": 2.0174172544763973, + "grad_norm": 4.75742244720459, + "learning_rate": 5.88529532684665e-06, + "loss": 0.1082, + "step": 30776 + }, + { + "epoch": 2.017430819316332, + "grad_norm": 3.0585734844207764, + "learning_rate": 5.885158284226394e-06, + "loss": 0.0784, + "step": 30777 + }, + { + "epoch": 2.017444384156267, + "grad_norm": 3.0956342220306396, + "learning_rate": 5.88502124160614e-06, + "loss": 0.1437, + "step": 30778 + }, + { + "epoch": 2.017457948996202, + "grad_norm": 3.3169009685516357, + "learning_rate": 5.884884198985885e-06, + "loss": 0.172, + "step": 30779 + }, + { + "epoch": 2.0174715138361368, + "grad_norm": 2.4861466884613037, + "learning_rate": 5.88474715636563e-06, + "loss": 0.0845, + "step": 30780 + }, + { + "epoch": 2.0174850786760716, + "grad_norm": 4.813458442687988, + "learning_rate": 5.884610113745375e-06, + "loss": 0.1652, + "step": 30781 + }, + { + "epoch": 2.0174986435160065, + "grad_norm": 3.278961181640625, + "learning_rate": 5.884473071125121e-06, + "loss": 0.1123, + "step": 30782 + }, + { + "epoch": 2.0175122083559414, + "grad_norm": 7.949279308319092, + "learning_rate": 5.884336028504866e-06, + "loss": 0.3056, + "step": 30783 + }, + { + "epoch": 2.0175257731958762, + "grad_norm": 3.0579986572265625, + "learning_rate": 5.88419898588461e-06, + "loss": 0.1133, + "step": 30784 + }, + { + "epoch": 2.017539338035811, + "grad_norm": 4.7575531005859375, + "learning_rate": 5.884061943264355e-06, + "loss": 0.2148, + "step": 30785 + }, + { + "epoch": 2.017552902875746, + "grad_norm": 3.346102714538574, + "learning_rate": 5.8839249006441014e-06, + "loss": 0.0955, + "step": 30786 + }, + { + "epoch": 2.017566467715681, + "grad_norm": 4.556130409240723, + "learning_rate": 5.883787858023846e-06, + "loss": 0.1474, + "step": 30787 + }, + { + "epoch": 2.0175800325556157, + "grad_norm": 5.227485656738281, + "learning_rate": 5.883650815403591e-06, + "loss": 0.2367, + "step": 30788 + }, + { + "epoch": 2.0175935973955506, + "grad_norm": 4.900545120239258, + "learning_rate": 5.883513772783336e-06, + "loss": 0.145, + "step": 30789 + }, + { + "epoch": 2.0176071622354854, + "grad_norm": 6.115124702453613, + "learning_rate": 5.883376730163082e-06, + "loss": 0.1222, + "step": 30790 + }, + { + "epoch": 2.0176207270754207, + "grad_norm": 4.7916579246521, + "learning_rate": 5.8832396875428265e-06, + "loss": 0.237, + "step": 30791 + }, + { + "epoch": 2.0176342919153556, + "grad_norm": 2.613585948944092, + "learning_rate": 5.883102644922572e-06, + "loss": 0.122, + "step": 30792 + }, + { + "epoch": 2.0176478567552905, + "grad_norm": 3.698129653930664, + "learning_rate": 5.882965602302316e-06, + "loss": 0.1291, + "step": 30793 + }, + { + "epoch": 2.0176614215952253, + "grad_norm": 4.452915668487549, + "learning_rate": 5.882828559682061e-06, + "loss": 0.1256, + "step": 30794 + }, + { + "epoch": 2.01767498643516, + "grad_norm": 3.776010513305664, + "learning_rate": 5.882691517061807e-06, + "loss": 0.1026, + "step": 30795 + }, + { + "epoch": 2.017688551275095, + "grad_norm": 3.588395595550537, + "learning_rate": 5.8825544744415515e-06, + "loss": 0.1248, + "step": 30796 + }, + { + "epoch": 2.01770211611503, + "grad_norm": 3.658815383911133, + "learning_rate": 5.882417431821297e-06, + "loss": 0.1161, + "step": 30797 + }, + { + "epoch": 2.017715680954965, + "grad_norm": 3.527853012084961, + "learning_rate": 5.882280389201042e-06, + "loss": 0.0876, + "step": 30798 + }, + { + "epoch": 2.0177292457948997, + "grad_norm": 4.003257751464844, + "learning_rate": 5.882143346580788e-06, + "loss": 0.2333, + "step": 30799 + }, + { + "epoch": 2.0177428106348345, + "grad_norm": 4.964365482330322, + "learning_rate": 5.882006303960532e-06, + "loss": 0.1906, + "step": 30800 + }, + { + "epoch": 2.0177563754747694, + "grad_norm": 4.915555477142334, + "learning_rate": 5.8818692613402774e-06, + "loss": 0.196, + "step": 30801 + }, + { + "epoch": 2.0177699403147042, + "grad_norm": 3.9236271381378174, + "learning_rate": 5.881732218720022e-06, + "loss": 0.1946, + "step": 30802 + }, + { + "epoch": 2.017783505154639, + "grad_norm": 4.060488700866699, + "learning_rate": 5.881595176099768e-06, + "loss": 0.2043, + "step": 30803 + }, + { + "epoch": 2.017797069994574, + "grad_norm": 4.783073425292969, + "learning_rate": 5.881458133479513e-06, + "loss": 0.2162, + "step": 30804 + }, + { + "epoch": 2.017810634834509, + "grad_norm": 5.009195327758789, + "learning_rate": 5.881321090859258e-06, + "loss": 0.1384, + "step": 30805 + }, + { + "epoch": 2.0178241996744437, + "grad_norm": 4.824466228485107, + "learning_rate": 5.8811840482390025e-06, + "loss": 0.1631, + "step": 30806 + }, + { + "epoch": 2.0178377645143786, + "grad_norm": 4.384683609008789, + "learning_rate": 5.881047005618748e-06, + "loss": 0.1479, + "step": 30807 + }, + { + "epoch": 2.0178513293543134, + "grad_norm": 3.987990379333496, + "learning_rate": 5.880909962998494e-06, + "loss": 0.1093, + "step": 30808 + }, + { + "epoch": 2.0178648941942483, + "grad_norm": 4.589195728302002, + "learning_rate": 5.880772920378238e-06, + "loss": 0.1322, + "step": 30809 + }, + { + "epoch": 2.0178784590341836, + "grad_norm": 2.7425174713134766, + "learning_rate": 5.880635877757983e-06, + "loss": 0.0574, + "step": 30810 + }, + { + "epoch": 2.0178920238741185, + "grad_norm": 4.998401641845703, + "learning_rate": 5.8804988351377275e-06, + "loss": 0.1876, + "step": 30811 + }, + { + "epoch": 2.0179055887140533, + "grad_norm": 6.898106098175049, + "learning_rate": 5.8803617925174736e-06, + "loss": 0.2487, + "step": 30812 + }, + { + "epoch": 2.017919153553988, + "grad_norm": 4.754002094268799, + "learning_rate": 5.880224749897219e-06, + "loss": 0.122, + "step": 30813 + }, + { + "epoch": 2.017932718393923, + "grad_norm": 5.819657802581787, + "learning_rate": 5.880087707276964e-06, + "loss": 0.237, + "step": 30814 + }, + { + "epoch": 2.017946283233858, + "grad_norm": 5.001923561096191, + "learning_rate": 5.879950664656708e-06, + "loss": 0.1675, + "step": 30815 + }, + { + "epoch": 2.017959848073793, + "grad_norm": 4.233462333679199, + "learning_rate": 5.879813622036454e-06, + "loss": 0.1595, + "step": 30816 + }, + { + "epoch": 2.0179734129137277, + "grad_norm": 4.856497764587402, + "learning_rate": 5.8796765794161994e-06, + "loss": 0.1408, + "step": 30817 + }, + { + "epoch": 2.0179869777536625, + "grad_norm": 4.291774272918701, + "learning_rate": 5.879539536795944e-06, + "loss": 0.1482, + "step": 30818 + }, + { + "epoch": 2.0180005425935974, + "grad_norm": 4.908650875091553, + "learning_rate": 5.879402494175689e-06, + "loss": 0.1639, + "step": 30819 + }, + { + "epoch": 2.0180141074335323, + "grad_norm": 3.957218647003174, + "learning_rate": 5.879265451555433e-06, + "loss": 0.1668, + "step": 30820 + }, + { + "epoch": 2.018027672273467, + "grad_norm": 5.5497636795043945, + "learning_rate": 5.879128408935179e-06, + "loss": 0.2683, + "step": 30821 + }, + { + "epoch": 2.018041237113402, + "grad_norm": 3.787371873855591, + "learning_rate": 5.8789913663149245e-06, + "loss": 0.1497, + "step": 30822 + }, + { + "epoch": 2.018054801953337, + "grad_norm": 6.841041564941406, + "learning_rate": 5.87885432369467e-06, + "loss": 0.2925, + "step": 30823 + }, + { + "epoch": 2.0180683667932717, + "grad_norm": 4.7358551025390625, + "learning_rate": 5.878717281074414e-06, + "loss": 0.1834, + "step": 30824 + }, + { + "epoch": 2.0180819316332066, + "grad_norm": 3.958857774734497, + "learning_rate": 5.87858023845416e-06, + "loss": 0.1873, + "step": 30825 + }, + { + "epoch": 2.0180954964731415, + "grad_norm": 4.221246719360352, + "learning_rate": 5.878443195833905e-06, + "loss": 0.1837, + "step": 30826 + }, + { + "epoch": 2.0181090613130763, + "grad_norm": 6.174000263214111, + "learning_rate": 5.8783061532136495e-06, + "loss": 0.2529, + "step": 30827 + }, + { + "epoch": 2.018122626153011, + "grad_norm": 5.480151653289795, + "learning_rate": 5.878169110593395e-06, + "loss": 0.2801, + "step": 30828 + }, + { + "epoch": 2.0181361909929465, + "grad_norm": 3.7995588779449463, + "learning_rate": 5.878032067973141e-06, + "loss": 0.1648, + "step": 30829 + }, + { + "epoch": 2.0181497558328814, + "grad_norm": 5.068328380584717, + "learning_rate": 5.877895025352885e-06, + "loss": 0.1961, + "step": 30830 + }, + { + "epoch": 2.0181633206728162, + "grad_norm": 7.613152027130127, + "learning_rate": 5.87775798273263e-06, + "loss": 0.2743, + "step": 30831 + }, + { + "epoch": 2.018176885512751, + "grad_norm": 5.154718399047852, + "learning_rate": 5.8776209401123754e-06, + "loss": 0.21, + "step": 30832 + }, + { + "epoch": 2.018190450352686, + "grad_norm": 4.326369762420654, + "learning_rate": 5.87748389749212e-06, + "loss": 0.2133, + "step": 30833 + }, + { + "epoch": 2.018204015192621, + "grad_norm": 4.092097282409668, + "learning_rate": 5.877346854871866e-06, + "loss": 0.2387, + "step": 30834 + }, + { + "epoch": 2.0182175800325557, + "grad_norm": 4.989639759063721, + "learning_rate": 5.877209812251611e-06, + "loss": 0.2157, + "step": 30835 + }, + { + "epoch": 2.0182311448724906, + "grad_norm": 4.678386688232422, + "learning_rate": 5.877072769631355e-06, + "loss": 0.2356, + "step": 30836 + }, + { + "epoch": 2.0182447097124254, + "grad_norm": 4.110081672668457, + "learning_rate": 5.8769357270111005e-06, + "loss": 0.2147, + "step": 30837 + }, + { + "epoch": 2.0182582745523603, + "grad_norm": 4.518900394439697, + "learning_rate": 5.8767986843908465e-06, + "loss": 0.2257, + "step": 30838 + }, + { + "epoch": 2.018271839392295, + "grad_norm": 4.180613994598389, + "learning_rate": 5.876661641770592e-06, + "loss": 0.2747, + "step": 30839 + }, + { + "epoch": 2.01828540423223, + "grad_norm": 6.254437446594238, + "learning_rate": 5.876524599150336e-06, + "loss": 0.2408, + "step": 30840 + }, + { + "epoch": 2.018298969072165, + "grad_norm": 4.265171051025391, + "learning_rate": 5.876387556530081e-06, + "loss": 0.2289, + "step": 30841 + }, + { + "epoch": 2.0183125339120997, + "grad_norm": 4.740447044372559, + "learning_rate": 5.876250513909827e-06, + "loss": 0.2459, + "step": 30842 + }, + { + "epoch": 2.0183260987520346, + "grad_norm": 4.981945514678955, + "learning_rate": 5.8761134712895716e-06, + "loss": 0.2672, + "step": 30843 + }, + { + "epoch": 2.0183396635919695, + "grad_norm": 3.773635149002075, + "learning_rate": 5.875976428669317e-06, + "loss": 0.2186, + "step": 30844 + }, + { + "epoch": 2.0183532284319043, + "grad_norm": 3.3657147884368896, + "learning_rate": 5.875839386049061e-06, + "loss": 0.1959, + "step": 30845 + }, + { + "epoch": 2.018366793271839, + "grad_norm": 4.767660140991211, + "learning_rate": 5.875702343428806e-06, + "loss": 0.1486, + "step": 30846 + }, + { + "epoch": 2.018380358111774, + "grad_norm": 4.227700710296631, + "learning_rate": 5.875565300808552e-06, + "loss": 0.1578, + "step": 30847 + }, + { + "epoch": 2.0183939229517094, + "grad_norm": 3.559237480163574, + "learning_rate": 5.8754282581882975e-06, + "loss": 0.1245, + "step": 30848 + }, + { + "epoch": 2.0184074877916443, + "grad_norm": 5.023123741149902, + "learning_rate": 5.875291215568042e-06, + "loss": 0.0953, + "step": 30849 + }, + { + "epoch": 2.018421052631579, + "grad_norm": 4.684626579284668, + "learning_rate": 5.875154172947787e-06, + "loss": 0.2199, + "step": 30850 + }, + { + "epoch": 2.018434617471514, + "grad_norm": 4.146646022796631, + "learning_rate": 5.875017130327533e-06, + "loss": 0.1282, + "step": 30851 + }, + { + "epoch": 2.018448182311449, + "grad_norm": 5.835871696472168, + "learning_rate": 5.874880087707277e-06, + "loss": 0.2371, + "step": 30852 + }, + { + "epoch": 2.0184617471513837, + "grad_norm": 4.421996593475342, + "learning_rate": 5.8747430450870225e-06, + "loss": 0.1629, + "step": 30853 + }, + { + "epoch": 2.0184753119913186, + "grad_norm": 3.916428327560425, + "learning_rate": 5.874606002466768e-06, + "loss": 0.1365, + "step": 30854 + }, + { + "epoch": 2.0184888768312534, + "grad_norm": 4.036525726318359, + "learning_rate": 5.874468959846513e-06, + "loss": 0.1594, + "step": 30855 + }, + { + "epoch": 2.0185024416711883, + "grad_norm": 4.276673316955566, + "learning_rate": 5.874331917226258e-06, + "loss": 0.1827, + "step": 30856 + }, + { + "epoch": 2.018516006511123, + "grad_norm": 3.9706833362579346, + "learning_rate": 5.874194874606003e-06, + "loss": 0.1097, + "step": 30857 + }, + { + "epoch": 2.018529571351058, + "grad_norm": 4.43280553817749, + "learning_rate": 5.8740578319857475e-06, + "loss": 0.1114, + "step": 30858 + }, + { + "epoch": 2.018543136190993, + "grad_norm": 4.2948479652404785, + "learning_rate": 5.8739207893654936e-06, + "loss": 0.1842, + "step": 30859 + }, + { + "epoch": 2.0185567010309278, + "grad_norm": 4.290488243103027, + "learning_rate": 5.873783746745239e-06, + "loss": 0.142, + "step": 30860 + }, + { + "epoch": 2.0185702658708626, + "grad_norm": 5.3692240715026855, + "learning_rate": 5.873646704124983e-06, + "loss": 0.2491, + "step": 30861 + }, + { + "epoch": 2.0185838307107975, + "grad_norm": 5.251304626464844, + "learning_rate": 5.873509661504728e-06, + "loss": 0.2354, + "step": 30862 + }, + { + "epoch": 2.0185973955507324, + "grad_norm": 4.181405544281006, + "learning_rate": 5.8733726188844734e-06, + "loss": 0.1095, + "step": 30863 + }, + { + "epoch": 2.0186109603906672, + "grad_norm": 5.042109966278076, + "learning_rate": 5.873235576264219e-06, + "loss": 0.1567, + "step": 30864 + }, + { + "epoch": 2.018624525230602, + "grad_norm": 3.8166112899780273, + "learning_rate": 5.873098533643964e-06, + "loss": 0.1291, + "step": 30865 + }, + { + "epoch": 2.018638090070537, + "grad_norm": 4.937655925750732, + "learning_rate": 5.872961491023709e-06, + "loss": 0.1612, + "step": 30866 + }, + { + "epoch": 2.0186516549104723, + "grad_norm": 5.4108195304870605, + "learning_rate": 5.872824448403453e-06, + "loss": 0.2317, + "step": 30867 + }, + { + "epoch": 2.018665219750407, + "grad_norm": 3.516404867172241, + "learning_rate": 5.872687405783199e-06, + "loss": 0.1084, + "step": 30868 + }, + { + "epoch": 2.018678784590342, + "grad_norm": 3.997939109802246, + "learning_rate": 5.8725503631629445e-06, + "loss": 0.1634, + "step": 30869 + }, + { + "epoch": 2.018692349430277, + "grad_norm": 3.288879156112671, + "learning_rate": 5.872413320542689e-06, + "loss": 0.0872, + "step": 30870 + }, + { + "epoch": 2.0187059142702117, + "grad_norm": 4.597202777862549, + "learning_rate": 5.872276277922434e-06, + "loss": 0.1496, + "step": 30871 + }, + { + "epoch": 2.0187194791101466, + "grad_norm": 4.356744766235352, + "learning_rate": 5.87213923530218e-06, + "loss": 0.1583, + "step": 30872 + }, + { + "epoch": 2.0187330439500815, + "grad_norm": 4.611505031585693, + "learning_rate": 5.872002192681925e-06, + "loss": 0.2063, + "step": 30873 + }, + { + "epoch": 2.0187466087900163, + "grad_norm": 4.952306270599365, + "learning_rate": 5.8718651500616696e-06, + "loss": 0.1923, + "step": 30874 + }, + { + "epoch": 2.018760173629951, + "grad_norm": 4.049211502075195, + "learning_rate": 5.871728107441415e-06, + "loss": 0.2006, + "step": 30875 + }, + { + "epoch": 2.018773738469886, + "grad_norm": 4.889492511749268, + "learning_rate": 5.871591064821159e-06, + "loss": 0.212, + "step": 30876 + }, + { + "epoch": 2.018787303309821, + "grad_norm": 5.599410533905029, + "learning_rate": 5.871454022200905e-06, + "loss": 0.1736, + "step": 30877 + }, + { + "epoch": 2.018800868149756, + "grad_norm": 5.07775354385376, + "learning_rate": 5.87131697958065e-06, + "loss": 0.1633, + "step": 30878 + }, + { + "epoch": 2.0188144329896907, + "grad_norm": 5.4379425048828125, + "learning_rate": 5.871179936960395e-06, + "loss": 0.1622, + "step": 30879 + }, + { + "epoch": 2.0188279978296255, + "grad_norm": 5.0634026527404785, + "learning_rate": 5.87104289434014e-06, + "loss": 0.1649, + "step": 30880 + }, + { + "epoch": 2.0188415626695604, + "grad_norm": 3.208920955657959, + "learning_rate": 5.870905851719886e-06, + "loss": 0.1041, + "step": 30881 + }, + { + "epoch": 2.0188551275094953, + "grad_norm": 6.09941291809082, + "learning_rate": 5.870768809099631e-06, + "loss": 0.2705, + "step": 30882 + }, + { + "epoch": 2.01886869234943, + "grad_norm": 3.691051721572876, + "learning_rate": 5.870631766479375e-06, + "loss": 0.1293, + "step": 30883 + }, + { + "epoch": 2.018882257189365, + "grad_norm": 4.8897905349731445, + "learning_rate": 5.8704947238591205e-06, + "loss": 0.148, + "step": 30884 + }, + { + "epoch": 2.0188958220293, + "grad_norm": 5.435028076171875, + "learning_rate": 5.8703576812388665e-06, + "loss": 0.23, + "step": 30885 + }, + { + "epoch": 2.018909386869235, + "grad_norm": 4.065162658691406, + "learning_rate": 5.870220638618611e-06, + "loss": 0.1806, + "step": 30886 + }, + { + "epoch": 2.01892295170917, + "grad_norm": 6.4562554359436035, + "learning_rate": 5.870083595998356e-06, + "loss": 0.1805, + "step": 30887 + }, + { + "epoch": 2.018936516549105, + "grad_norm": 3.5764219760894775, + "learning_rate": 5.869946553378101e-06, + "loss": 0.1105, + "step": 30888 + }, + { + "epoch": 2.0189500813890398, + "grad_norm": 4.047962188720703, + "learning_rate": 5.8698095107578456e-06, + "loss": 0.1649, + "step": 30889 + }, + { + "epoch": 2.0189636462289746, + "grad_norm": 5.146194934844971, + "learning_rate": 5.869672468137592e-06, + "loss": 0.1303, + "step": 30890 + }, + { + "epoch": 2.0189772110689095, + "grad_norm": 4.785486698150635, + "learning_rate": 5.869535425517337e-06, + "loss": 0.2078, + "step": 30891 + }, + { + "epoch": 2.0189907759088443, + "grad_norm": 5.21315860748291, + "learning_rate": 5.869398382897081e-06, + "loss": 0.2084, + "step": 30892 + }, + { + "epoch": 2.019004340748779, + "grad_norm": 5.039401054382324, + "learning_rate": 5.869261340276826e-06, + "loss": 0.1631, + "step": 30893 + }, + { + "epoch": 2.019017905588714, + "grad_norm": 4.0078606605529785, + "learning_rate": 5.869124297656572e-06, + "loss": 0.1434, + "step": 30894 + }, + { + "epoch": 2.019031470428649, + "grad_norm": 5.565270900726318, + "learning_rate": 5.868987255036317e-06, + "loss": 0.1869, + "step": 30895 + }, + { + "epoch": 2.019045035268584, + "grad_norm": 3.7864127159118652, + "learning_rate": 5.868850212416062e-06, + "loss": 0.139, + "step": 30896 + }, + { + "epoch": 2.0190586001085187, + "grad_norm": 4.788288116455078, + "learning_rate": 5.868713169795807e-06, + "loss": 0.1922, + "step": 30897 + }, + { + "epoch": 2.0190721649484535, + "grad_norm": 4.70335054397583, + "learning_rate": 5.868576127175552e-06, + "loss": 0.1137, + "step": 30898 + }, + { + "epoch": 2.0190857297883884, + "grad_norm": 4.59484338760376, + "learning_rate": 5.868439084555297e-06, + "loss": 0.135, + "step": 30899 + }, + { + "epoch": 2.0190992946283233, + "grad_norm": 3.7446582317352295, + "learning_rate": 5.8683020419350425e-06, + "loss": 0.22, + "step": 30900 + }, + { + "epoch": 2.019112859468258, + "grad_norm": 5.485884189605713, + "learning_rate": 5.868164999314787e-06, + "loss": 0.2926, + "step": 30901 + }, + { + "epoch": 2.019126424308193, + "grad_norm": 4.126550197601318, + "learning_rate": 5.868027956694532e-06, + "loss": 0.1245, + "step": 30902 + }, + { + "epoch": 2.019139989148128, + "grad_norm": 5.219831466674805, + "learning_rate": 5.867890914074278e-06, + "loss": 0.0981, + "step": 30903 + }, + { + "epoch": 2.0191535539880627, + "grad_norm": 4.540748596191406, + "learning_rate": 5.867753871454022e-06, + "loss": 0.2782, + "step": 30904 + }, + { + "epoch": 2.019167118827998, + "grad_norm": 6.626127243041992, + "learning_rate": 5.8676168288337676e-06, + "loss": 0.2547, + "step": 30905 + }, + { + "epoch": 2.019180683667933, + "grad_norm": 4.481944561004639, + "learning_rate": 5.867479786213513e-06, + "loss": 0.2011, + "step": 30906 + }, + { + "epoch": 2.0191942485078678, + "grad_norm": 5.1320319175720215, + "learning_rate": 5.867342743593259e-06, + "loss": 0.2302, + "step": 30907 + }, + { + "epoch": 2.0192078133478026, + "grad_norm": 4.377249717712402, + "learning_rate": 5.867205700973003e-06, + "loss": 0.163, + "step": 30908 + }, + { + "epoch": 2.0192213781877375, + "grad_norm": 3.149984836578369, + "learning_rate": 5.867068658352748e-06, + "loss": 0.1488, + "step": 30909 + }, + { + "epoch": 2.0192349430276724, + "grad_norm": 5.611385345458984, + "learning_rate": 5.866931615732493e-06, + "loss": 0.1589, + "step": 30910 + }, + { + "epoch": 2.0192485078676072, + "grad_norm": 4.470381736755371, + "learning_rate": 5.866794573112239e-06, + "loss": 0.21, + "step": 30911 + }, + { + "epoch": 2.019262072707542, + "grad_norm": 3.921349287033081, + "learning_rate": 5.866657530491984e-06, + "loss": 0.1371, + "step": 30912 + }, + { + "epoch": 2.019275637547477, + "grad_norm": 6.293389320373535, + "learning_rate": 5.866520487871728e-06, + "loss": 0.2167, + "step": 30913 + }, + { + "epoch": 2.019289202387412, + "grad_norm": 4.463344097137451, + "learning_rate": 5.866383445251473e-06, + "loss": 0.2099, + "step": 30914 + }, + { + "epoch": 2.0193027672273467, + "grad_norm": 3.022740602493286, + "learning_rate": 5.8662464026312185e-06, + "loss": 0.1532, + "step": 30915 + }, + { + "epoch": 2.0193163320672816, + "grad_norm": 5.01224946975708, + "learning_rate": 5.8661093600109645e-06, + "loss": 0.1789, + "step": 30916 + }, + { + "epoch": 2.0193298969072164, + "grad_norm": 3.570497512817383, + "learning_rate": 5.865972317390709e-06, + "loss": 0.1365, + "step": 30917 + }, + { + "epoch": 2.0193434617471513, + "grad_norm": 4.440344333648682, + "learning_rate": 5.865835274770454e-06, + "loss": 0.176, + "step": 30918 + }, + { + "epoch": 2.019357026587086, + "grad_norm": 4.518100261688232, + "learning_rate": 5.865698232150198e-06, + "loss": 0.1835, + "step": 30919 + }, + { + "epoch": 2.019370591427021, + "grad_norm": 3.4251859188079834, + "learning_rate": 5.865561189529944e-06, + "loss": 0.156, + "step": 30920 + }, + { + "epoch": 2.019384156266956, + "grad_norm": 4.184248447418213, + "learning_rate": 5.86542414690969e-06, + "loss": 0.2217, + "step": 30921 + }, + { + "epoch": 2.0193977211068908, + "grad_norm": 4.03118896484375, + "learning_rate": 5.865287104289435e-06, + "loss": 0.1703, + "step": 30922 + }, + { + "epoch": 2.0194112859468256, + "grad_norm": 5.626711368560791, + "learning_rate": 5.865150061669179e-06, + "loss": 0.1885, + "step": 30923 + }, + { + "epoch": 2.019424850786761, + "grad_norm": 3.9697558879852295, + "learning_rate": 5.865013019048925e-06, + "loss": 0.1498, + "step": 30924 + }, + { + "epoch": 2.019438415626696, + "grad_norm": 4.246633529663086, + "learning_rate": 5.86487597642867e-06, + "loss": 0.1702, + "step": 30925 + }, + { + "epoch": 2.0194519804666307, + "grad_norm": 3.3335866928100586, + "learning_rate": 5.864738933808415e-06, + "loss": 0.1233, + "step": 30926 + }, + { + "epoch": 2.0194655453065655, + "grad_norm": 3.257819652557373, + "learning_rate": 5.86460189118816e-06, + "loss": 0.1779, + "step": 30927 + }, + { + "epoch": 2.0194791101465004, + "grad_norm": 4.907714366912842, + "learning_rate": 5.864464848567904e-06, + "loss": 0.1981, + "step": 30928 + }, + { + "epoch": 2.0194926749864353, + "grad_norm": 4.66825532913208, + "learning_rate": 5.86432780594765e-06, + "loss": 0.2357, + "step": 30929 + }, + { + "epoch": 2.01950623982637, + "grad_norm": 3.906329393386841, + "learning_rate": 5.864190763327395e-06, + "loss": 0.1306, + "step": 30930 + }, + { + "epoch": 2.019519804666305, + "grad_norm": 3.804900646209717, + "learning_rate": 5.8640537207071405e-06, + "loss": 0.1189, + "step": 30931 + }, + { + "epoch": 2.01953336950624, + "grad_norm": 6.536111831665039, + "learning_rate": 5.863916678086885e-06, + "loss": 0.1926, + "step": 30932 + }, + { + "epoch": 2.0195469343461747, + "grad_norm": 4.728719711303711, + "learning_rate": 5.863779635466631e-06, + "loss": 0.1288, + "step": 30933 + }, + { + "epoch": 2.0195604991861096, + "grad_norm": 5.513079643249512, + "learning_rate": 5.863642592846376e-06, + "loss": 0.2479, + "step": 30934 + }, + { + "epoch": 2.0195740640260444, + "grad_norm": 3.7596805095672607, + "learning_rate": 5.86350555022612e-06, + "loss": 0.1887, + "step": 30935 + }, + { + "epoch": 2.0195876288659793, + "grad_norm": 5.839684963226318, + "learning_rate": 5.8633685076058656e-06, + "loss": 0.3088, + "step": 30936 + }, + { + "epoch": 2.019601193705914, + "grad_norm": 5.237293243408203, + "learning_rate": 5.863231464985612e-06, + "loss": 0.3365, + "step": 30937 + }, + { + "epoch": 2.019614758545849, + "grad_norm": 3.9089601039886475, + "learning_rate": 5.863094422365356e-06, + "loss": 0.2307, + "step": 30938 + }, + { + "epoch": 2.019628323385784, + "grad_norm": 5.133913040161133, + "learning_rate": 5.862957379745101e-06, + "loss": 0.2236, + "step": 30939 + }, + { + "epoch": 2.0196418882257188, + "grad_norm": 19.053916931152344, + "learning_rate": 5.862820337124846e-06, + "loss": 0.1319, + "step": 30940 + }, + { + "epoch": 2.0196554530656536, + "grad_norm": 6.27778959274292, + "learning_rate": 5.862683294504592e-06, + "loss": 0.2767, + "step": 30941 + }, + { + "epoch": 2.0196690179055885, + "grad_norm": 3.411313533782959, + "learning_rate": 5.862546251884337e-06, + "loss": 0.1389, + "step": 30942 + }, + { + "epoch": 2.019682582745524, + "grad_norm": 4.059286117553711, + "learning_rate": 5.862409209264082e-06, + "loss": 0.1912, + "step": 30943 + }, + { + "epoch": 2.0196961475854587, + "grad_norm": 5.035428047180176, + "learning_rate": 5.862272166643826e-06, + "loss": 0.1791, + "step": 30944 + }, + { + "epoch": 2.0197097124253935, + "grad_norm": 5.496766567230225, + "learning_rate": 5.862135124023571e-06, + "loss": 0.1743, + "step": 30945 + }, + { + "epoch": 2.0197232772653284, + "grad_norm": 4.564632892608643, + "learning_rate": 5.861998081403317e-06, + "loss": 0.1988, + "step": 30946 + }, + { + "epoch": 2.0197368421052633, + "grad_norm": 4.061696529388428, + "learning_rate": 5.8618610387830625e-06, + "loss": 0.1504, + "step": 30947 + }, + { + "epoch": 2.019750406945198, + "grad_norm": 4.970775604248047, + "learning_rate": 5.861723996162807e-06, + "loss": 0.2661, + "step": 30948 + }, + { + "epoch": 2.019763971785133, + "grad_norm": 5.182441711425781, + "learning_rate": 5.861586953542552e-06, + "loss": 0.139, + "step": 30949 + }, + { + "epoch": 2.019777536625068, + "grad_norm": 4.084550857543945, + "learning_rate": 5.861449910922298e-06, + "loss": 0.1753, + "step": 30950 + }, + { + "epoch": 2.0197911014650027, + "grad_norm": 4.2389349937438965, + "learning_rate": 5.861312868302042e-06, + "loss": 0.1721, + "step": 30951 + }, + { + "epoch": 2.0198046663049376, + "grad_norm": 3.0539164543151855, + "learning_rate": 5.861175825681788e-06, + "loss": 0.1546, + "step": 30952 + }, + { + "epoch": 2.0198182311448725, + "grad_norm": 5.481414794921875, + "learning_rate": 5.861038783061532e-06, + "loss": 0.2988, + "step": 30953 + }, + { + "epoch": 2.0198317959848073, + "grad_norm": 3.926724672317505, + "learning_rate": 5.860901740441278e-06, + "loss": 0.1615, + "step": 30954 + }, + { + "epoch": 2.019845360824742, + "grad_norm": 5.550044059753418, + "learning_rate": 5.860764697821023e-06, + "loss": 0.1888, + "step": 30955 + }, + { + "epoch": 2.019858925664677, + "grad_norm": 5.159952163696289, + "learning_rate": 5.860627655200768e-06, + "loss": 0.1913, + "step": 30956 + }, + { + "epoch": 2.019872490504612, + "grad_norm": 4.352792739868164, + "learning_rate": 5.860490612580513e-06, + "loss": 0.2534, + "step": 30957 + }, + { + "epoch": 2.019886055344547, + "grad_norm": 5.4258880615234375, + "learning_rate": 5.860353569960258e-06, + "loss": 0.2583, + "step": 30958 + }, + { + "epoch": 2.0198996201844817, + "grad_norm": 5.741855621337891, + "learning_rate": 5.860216527340004e-06, + "loss": 0.2688, + "step": 30959 + }, + { + "epoch": 2.0199131850244165, + "grad_norm": 4.778398036956787, + "learning_rate": 5.860079484719748e-06, + "loss": 0.2872, + "step": 30960 + }, + { + "epoch": 2.0199267498643514, + "grad_norm": 4.182025909423828, + "learning_rate": 5.859942442099493e-06, + "loss": 0.178, + "step": 30961 + }, + { + "epoch": 2.0199403147042867, + "grad_norm": 5.294769763946533, + "learning_rate": 5.859805399479238e-06, + "loss": 0.3327, + "step": 30962 + }, + { + "epoch": 2.0199538795442216, + "grad_norm": 5.172272682189941, + "learning_rate": 5.859668356858984e-06, + "loss": 0.2801, + "step": 30963 + }, + { + "epoch": 2.0199674443841564, + "grad_norm": 6.755577087402344, + "learning_rate": 5.859531314238729e-06, + "loss": 0.3249, + "step": 30964 + }, + { + "epoch": 2.0199810092240913, + "grad_norm": 4.60114049911499, + "learning_rate": 5.859394271618474e-06, + "loss": 0.2185, + "step": 30965 + }, + { + "epoch": 2.019994574064026, + "grad_norm": 6.180255889892578, + "learning_rate": 5.859257228998218e-06, + "loss": 0.3724, + "step": 30966 + }, + { + "epoch": 2.020008138903961, + "grad_norm": 4.927112579345703, + "learning_rate": 5.859120186377964e-06, + "loss": 0.2731, + "step": 30967 + }, + { + "epoch": 2.020021703743896, + "grad_norm": 5.556604862213135, + "learning_rate": 5.85898314375771e-06, + "loss": 0.3822, + "step": 30968 + }, + { + "epoch": 2.0200352685838308, + "grad_norm": 3.0730249881744385, + "learning_rate": 5.858846101137454e-06, + "loss": 0.1474, + "step": 30969 + }, + { + "epoch": 2.0200488334237656, + "grad_norm": 5.008271217346191, + "learning_rate": 5.858709058517199e-06, + "loss": 0.1472, + "step": 30970 + }, + { + "epoch": 2.0200623982637005, + "grad_norm": 4.659600734710693, + "learning_rate": 5.858572015896944e-06, + "loss": 0.2235, + "step": 30971 + }, + { + "epoch": 2.0200759631036354, + "grad_norm": 5.676185607910156, + "learning_rate": 5.8584349732766895e-06, + "loss": 0.1976, + "step": 30972 + }, + { + "epoch": 2.02008952794357, + "grad_norm": 4.631758689880371, + "learning_rate": 5.858297930656435e-06, + "loss": 0.2051, + "step": 30973 + }, + { + "epoch": 2.020103092783505, + "grad_norm": 4.342037677764893, + "learning_rate": 5.85816088803618e-06, + "loss": 0.1316, + "step": 30974 + }, + { + "epoch": 2.02011665762344, + "grad_norm": 7.063383102416992, + "learning_rate": 5.858023845415924e-06, + "loss": 0.4231, + "step": 30975 + }, + { + "epoch": 2.020130222463375, + "grad_norm": 6.12737512588501, + "learning_rate": 5.85788680279567e-06, + "loss": 0.2718, + "step": 30976 + }, + { + "epoch": 2.0201437873033097, + "grad_norm": 4.569084167480469, + "learning_rate": 5.857749760175415e-06, + "loss": 0.1353, + "step": 30977 + }, + { + "epoch": 2.0201573521432445, + "grad_norm": 5.847105503082275, + "learning_rate": 5.85761271755516e-06, + "loss": 0.2149, + "step": 30978 + }, + { + "epoch": 2.0201709169831794, + "grad_norm": 3.294919967651367, + "learning_rate": 5.857475674934905e-06, + "loss": 0.1449, + "step": 30979 + }, + { + "epoch": 2.0201844818231143, + "grad_norm": 6.083104133605957, + "learning_rate": 5.857338632314651e-06, + "loss": 0.156, + "step": 30980 + }, + { + "epoch": 2.0201980466630496, + "grad_norm": 5.943918704986572, + "learning_rate": 5.857201589694396e-06, + "loss": 0.2492, + "step": 30981 + }, + { + "epoch": 2.0202116115029845, + "grad_norm": 5.499664783477783, + "learning_rate": 5.85706454707414e-06, + "loss": 0.2059, + "step": 30982 + }, + { + "epoch": 2.0202251763429193, + "grad_norm": 7.986443519592285, + "learning_rate": 5.856927504453886e-06, + "loss": 0.1773, + "step": 30983 + }, + { + "epoch": 2.020238741182854, + "grad_norm": 4.737986087799072, + "learning_rate": 5.85679046183363e-06, + "loss": 0.22, + "step": 30984 + }, + { + "epoch": 2.020252306022789, + "grad_norm": 7.291651725769043, + "learning_rate": 5.856653419213376e-06, + "loss": 0.2737, + "step": 30985 + }, + { + "epoch": 2.020265870862724, + "grad_norm": 7.941144943237305, + "learning_rate": 5.856516376593121e-06, + "loss": 0.3808, + "step": 30986 + }, + { + "epoch": 2.0202794357026588, + "grad_norm": 4.561113357543945, + "learning_rate": 5.8563793339728655e-06, + "loss": 0.1031, + "step": 30987 + }, + { + "epoch": 2.0202930005425936, + "grad_norm": 4.289868354797363, + "learning_rate": 5.856242291352611e-06, + "loss": 0.1352, + "step": 30988 + }, + { + "epoch": 2.0203065653825285, + "grad_norm": 5.552104473114014, + "learning_rate": 5.856105248732357e-06, + "loss": 0.265, + "step": 30989 + }, + { + "epoch": 2.0203201302224634, + "grad_norm": 4.8366379737854, + "learning_rate": 5.855968206112102e-06, + "loss": 0.1457, + "step": 30990 + }, + { + "epoch": 2.0203336950623982, + "grad_norm": 4.706340789794922, + "learning_rate": 5.855831163491846e-06, + "loss": 0.1778, + "step": 30991 + }, + { + "epoch": 2.020347259902333, + "grad_norm": 5.366743564605713, + "learning_rate": 5.855694120871591e-06, + "loss": 0.1481, + "step": 30992 + }, + { + "epoch": 2.020360824742268, + "grad_norm": 4.511369228363037, + "learning_rate": 5.855557078251337e-06, + "loss": 0.1992, + "step": 30993 + }, + { + "epoch": 2.020374389582203, + "grad_norm": 3.6358284950256348, + "learning_rate": 5.855420035631082e-06, + "loss": 0.114, + "step": 30994 + }, + { + "epoch": 2.0203879544221377, + "grad_norm": 4.521365642547607, + "learning_rate": 5.855282993010827e-06, + "loss": 0.179, + "step": 30995 + }, + { + "epoch": 2.0204015192620726, + "grad_norm": 4.631608009338379, + "learning_rate": 5.855145950390572e-06, + "loss": 0.1242, + "step": 30996 + }, + { + "epoch": 2.0204150841020074, + "grad_norm": 3.697467803955078, + "learning_rate": 5.855008907770316e-06, + "loss": 0.1156, + "step": 30997 + }, + { + "epoch": 2.0204286489419423, + "grad_norm": 4.859679222106934, + "learning_rate": 5.8548718651500624e-06, + "loss": 0.2029, + "step": 30998 + }, + { + "epoch": 2.020442213781877, + "grad_norm": 6.210973262786865, + "learning_rate": 5.854734822529808e-06, + "loss": 0.1782, + "step": 30999 + }, + { + "epoch": 2.0204557786218125, + "grad_norm": 5.455559253692627, + "learning_rate": 5.854597779909552e-06, + "loss": 0.16, + "step": 31000 + }, + { + "epoch": 2.0204693434617473, + "grad_norm": 5.3699049949646, + "learning_rate": 5.854460737289297e-06, + "loss": 0.1613, + "step": 31001 + }, + { + "epoch": 2.020482908301682, + "grad_norm": 3.0930910110473633, + "learning_rate": 5.854323694669043e-06, + "loss": 0.089, + "step": 31002 + }, + { + "epoch": 2.020496473141617, + "grad_norm": 5.933002471923828, + "learning_rate": 5.8541866520487875e-06, + "loss": 0.195, + "step": 31003 + }, + { + "epoch": 2.020510037981552, + "grad_norm": 4.745878219604492, + "learning_rate": 5.854049609428533e-06, + "loss": 0.1749, + "step": 31004 + }, + { + "epoch": 2.020523602821487, + "grad_norm": 4.338488578796387, + "learning_rate": 5.853912566808278e-06, + "loss": 0.11, + "step": 31005 + }, + { + "epoch": 2.0205371676614217, + "grad_norm": 5.196321487426758, + "learning_rate": 5.853775524188023e-06, + "loss": 0.1754, + "step": 31006 + }, + { + "epoch": 2.0205507325013565, + "grad_norm": 4.270659446716309, + "learning_rate": 5.853638481567768e-06, + "loss": 0.1783, + "step": 31007 + }, + { + "epoch": 2.0205642973412914, + "grad_norm": 5.551525115966797, + "learning_rate": 5.853501438947513e-06, + "loss": 0.2163, + "step": 31008 + }, + { + "epoch": 2.0205778621812263, + "grad_norm": 5.24199104309082, + "learning_rate": 5.853364396327258e-06, + "loss": 0.1824, + "step": 31009 + }, + { + "epoch": 2.020591427021161, + "grad_norm": 3.6915769577026367, + "learning_rate": 5.853227353707004e-06, + "loss": 0.1088, + "step": 31010 + }, + { + "epoch": 2.020604991861096, + "grad_norm": 4.0255632400512695, + "learning_rate": 5.853090311086749e-06, + "loss": 0.1155, + "step": 31011 + }, + { + "epoch": 2.020618556701031, + "grad_norm": 3.148956775665283, + "learning_rate": 5.852953268466493e-06, + "loss": 0.1598, + "step": 31012 + }, + { + "epoch": 2.0206321215409657, + "grad_norm": 4.635887145996094, + "learning_rate": 5.852816225846238e-06, + "loss": 0.1813, + "step": 31013 + }, + { + "epoch": 2.0206456863809006, + "grad_norm": 4.3969292640686035, + "learning_rate": 5.852679183225984e-06, + "loss": 0.1754, + "step": 31014 + }, + { + "epoch": 2.0206592512208355, + "grad_norm": 5.216739654541016, + "learning_rate": 5.85254214060573e-06, + "loss": 0.2065, + "step": 31015 + }, + { + "epoch": 2.0206728160607703, + "grad_norm": 4.04443359375, + "learning_rate": 5.852405097985474e-06, + "loss": 0.1822, + "step": 31016 + }, + { + "epoch": 2.020686380900705, + "grad_norm": 4.300306797027588, + "learning_rate": 5.852268055365219e-06, + "loss": 0.1652, + "step": 31017 + }, + { + "epoch": 2.02069994574064, + "grad_norm": 5.926276683807373, + "learning_rate": 5.8521310127449635e-06, + "loss": 0.3325, + "step": 31018 + }, + { + "epoch": 2.0207135105805754, + "grad_norm": 4.148251533508301, + "learning_rate": 5.8519939701247095e-06, + "loss": 0.2069, + "step": 31019 + }, + { + "epoch": 2.02072707542051, + "grad_norm": 4.3310866355896, + "learning_rate": 5.851856927504455e-06, + "loss": 0.1958, + "step": 31020 + }, + { + "epoch": 2.020740640260445, + "grad_norm": 4.365510940551758, + "learning_rate": 5.851719884884199e-06, + "loss": 0.1408, + "step": 31021 + }, + { + "epoch": 2.02075420510038, + "grad_norm": 5.014245986938477, + "learning_rate": 5.851582842263944e-06, + "loss": 0.1847, + "step": 31022 + }, + { + "epoch": 2.020767769940315, + "grad_norm": 3.6722779273986816, + "learning_rate": 5.85144579964369e-06, + "loss": 0.0885, + "step": 31023 + }, + { + "epoch": 2.0207813347802497, + "grad_norm": 4.786022663116455, + "learning_rate": 5.851308757023435e-06, + "loss": 0.0963, + "step": 31024 + }, + { + "epoch": 2.0207948996201845, + "grad_norm": 4.488465785980225, + "learning_rate": 5.85117171440318e-06, + "loss": 0.1144, + "step": 31025 + }, + { + "epoch": 2.0208084644601194, + "grad_norm": 4.152493000030518, + "learning_rate": 5.851034671782925e-06, + "loss": 0.1389, + "step": 31026 + }, + { + "epoch": 2.0208220293000543, + "grad_norm": 5.472561359405518, + "learning_rate": 5.850897629162669e-06, + "loss": 0.1898, + "step": 31027 + }, + { + "epoch": 2.020835594139989, + "grad_norm": 5.8625898361206055, + "learning_rate": 5.850760586542415e-06, + "loss": 0.1477, + "step": 31028 + }, + { + "epoch": 2.020849158979924, + "grad_norm": 4.007721900939941, + "learning_rate": 5.8506235439221604e-06, + "loss": 0.185, + "step": 31029 + }, + { + "epoch": 2.020862723819859, + "grad_norm": 6.025591850280762, + "learning_rate": 5.850486501301906e-06, + "loss": 0.2378, + "step": 31030 + }, + { + "epoch": 2.0208762886597937, + "grad_norm": 5.508298873901367, + "learning_rate": 5.85034945868165e-06, + "loss": 0.2532, + "step": 31031 + }, + { + "epoch": 2.0208898534997286, + "grad_norm": 5.06314754486084, + "learning_rate": 5.850212416061396e-06, + "loss": 0.1865, + "step": 31032 + }, + { + "epoch": 2.0209034183396635, + "grad_norm": 4.496896266937256, + "learning_rate": 5.850075373441141e-06, + "loss": 0.2069, + "step": 31033 + }, + { + "epoch": 2.0209169831795983, + "grad_norm": 5.162569999694824, + "learning_rate": 5.8499383308208855e-06, + "loss": 0.1702, + "step": 31034 + }, + { + "epoch": 2.020930548019533, + "grad_norm": 5.615498065948486, + "learning_rate": 5.849801288200631e-06, + "loss": 0.1455, + "step": 31035 + }, + { + "epoch": 2.020944112859468, + "grad_norm": 4.845937728881836, + "learning_rate": 5.849664245580377e-06, + "loss": 0.1997, + "step": 31036 + }, + { + "epoch": 2.020957677699403, + "grad_norm": 5.243080139160156, + "learning_rate": 5.849527202960121e-06, + "loss": 0.1917, + "step": 31037 + }, + { + "epoch": 2.0209712425393382, + "grad_norm": 6.916054725646973, + "learning_rate": 5.849390160339866e-06, + "loss": 0.323, + "step": 31038 + }, + { + "epoch": 2.020984807379273, + "grad_norm": 9.407909393310547, + "learning_rate": 5.849253117719611e-06, + "loss": 0.2057, + "step": 31039 + }, + { + "epoch": 2.020998372219208, + "grad_norm": 5.213211536407471, + "learning_rate": 5.849116075099356e-06, + "loss": 0.167, + "step": 31040 + }, + { + "epoch": 2.021011937059143, + "grad_norm": 6.000476360321045, + "learning_rate": 5.848979032479102e-06, + "loss": 0.2097, + "step": 31041 + }, + { + "epoch": 2.0210255018990777, + "grad_norm": 5.528698444366455, + "learning_rate": 5.848841989858847e-06, + "loss": 0.1847, + "step": 31042 + }, + { + "epoch": 2.0210390667390126, + "grad_norm": 5.192267417907715, + "learning_rate": 5.848704947238591e-06, + "loss": 0.204, + "step": 31043 + }, + { + "epoch": 2.0210526315789474, + "grad_norm": 6.042995452880859, + "learning_rate": 5.848567904618336e-06, + "loss": 0.2688, + "step": 31044 + }, + { + "epoch": 2.0210661964188823, + "grad_norm": 8.061670303344727, + "learning_rate": 5.8484308619980824e-06, + "loss": 0.2441, + "step": 31045 + }, + { + "epoch": 2.021079761258817, + "grad_norm": 3.8027548789978027, + "learning_rate": 5.848293819377827e-06, + "loss": 0.1658, + "step": 31046 + }, + { + "epoch": 2.021093326098752, + "grad_norm": 3.8549258708953857, + "learning_rate": 5.848156776757572e-06, + "loss": 0.1131, + "step": 31047 + }, + { + "epoch": 2.021106890938687, + "grad_norm": 5.732485294342041, + "learning_rate": 5.848019734137317e-06, + "loss": 0.205, + "step": 31048 + }, + { + "epoch": 2.0211204557786218, + "grad_norm": 5.323009967803955, + "learning_rate": 5.847882691517063e-06, + "loss": 0.2586, + "step": 31049 + }, + { + "epoch": 2.0211340206185566, + "grad_norm": 3.773162841796875, + "learning_rate": 5.8477456488968075e-06, + "loss": 0.1274, + "step": 31050 + }, + { + "epoch": 2.0211475854584915, + "grad_norm": 4.013357639312744, + "learning_rate": 5.847608606276553e-06, + "loss": 0.197, + "step": 31051 + }, + { + "epoch": 2.0211611502984264, + "grad_norm": 4.833342552185059, + "learning_rate": 5.847471563656297e-06, + "loss": 0.1984, + "step": 31052 + }, + { + "epoch": 2.021174715138361, + "grad_norm": 7.3271589279174805, + "learning_rate": 5.847334521036042e-06, + "loss": 0.214, + "step": 31053 + }, + { + "epoch": 2.021188279978296, + "grad_norm": 5.676037788391113, + "learning_rate": 5.847197478415788e-06, + "loss": 0.2148, + "step": 31054 + }, + { + "epoch": 2.021201844818231, + "grad_norm": 6.69470739364624, + "learning_rate": 5.8470604357955325e-06, + "loss": 0.2801, + "step": 31055 + }, + { + "epoch": 2.021215409658166, + "grad_norm": 4.5855889320373535, + "learning_rate": 5.846923393175278e-06, + "loss": 0.0972, + "step": 31056 + }, + { + "epoch": 2.021228974498101, + "grad_norm": 8.397534370422363, + "learning_rate": 5.846786350555023e-06, + "loss": 0.3095, + "step": 31057 + }, + { + "epoch": 2.021242539338036, + "grad_norm": 5.324253559112549, + "learning_rate": 5.846649307934769e-06, + "loss": 0.1408, + "step": 31058 + }, + { + "epoch": 2.021256104177971, + "grad_norm": 4.9323225021362305, + "learning_rate": 5.846512265314513e-06, + "loss": 0.1745, + "step": 31059 + }, + { + "epoch": 2.0212696690179057, + "grad_norm": 5.493448257446289, + "learning_rate": 5.8463752226942584e-06, + "loss": 0.1536, + "step": 31060 + }, + { + "epoch": 2.0212832338578406, + "grad_norm": 6.956564426422119, + "learning_rate": 5.846238180074003e-06, + "loss": 0.2209, + "step": 31061 + }, + { + "epoch": 2.0212967986977755, + "grad_norm": 4.750028133392334, + "learning_rate": 5.846101137453749e-06, + "loss": 0.1488, + "step": 31062 + }, + { + "epoch": 2.0213103635377103, + "grad_norm": 5.074346542358398, + "learning_rate": 5.845964094833494e-06, + "loss": 0.1552, + "step": 31063 + }, + { + "epoch": 2.021323928377645, + "grad_norm": 3.264849901199341, + "learning_rate": 5.845827052213239e-06, + "loss": 0.1128, + "step": 31064 + }, + { + "epoch": 2.02133749321758, + "grad_norm": 4.611337184906006, + "learning_rate": 5.8456900095929835e-06, + "loss": 0.1499, + "step": 31065 + }, + { + "epoch": 2.021351058057515, + "grad_norm": 3.0380163192749023, + "learning_rate": 5.845552966972729e-06, + "loss": 0.1199, + "step": 31066 + }, + { + "epoch": 2.02136462289745, + "grad_norm": 4.597949981689453, + "learning_rate": 5.845415924352475e-06, + "loss": 0.226, + "step": 31067 + }, + { + "epoch": 2.0213781877373846, + "grad_norm": 8.103657722473145, + "learning_rate": 5.845278881732219e-06, + "loss": 0.2149, + "step": 31068 + }, + { + "epoch": 2.0213917525773195, + "grad_norm": 3.665786027908325, + "learning_rate": 5.845141839111964e-06, + "loss": 0.1193, + "step": 31069 + }, + { + "epoch": 2.0214053174172544, + "grad_norm": 5.285228252410889, + "learning_rate": 5.8450047964917085e-06, + "loss": 0.1405, + "step": 31070 + }, + { + "epoch": 2.0214188822571892, + "grad_norm": 3.798673629760742, + "learning_rate": 5.8448677538714546e-06, + "loss": 0.1271, + "step": 31071 + }, + { + "epoch": 2.021432447097124, + "grad_norm": 4.196320056915283, + "learning_rate": 5.8447307112512e-06, + "loss": 0.1464, + "step": 31072 + }, + { + "epoch": 2.021446011937059, + "grad_norm": 4.3711934089660645, + "learning_rate": 5.844593668630945e-06, + "loss": 0.1321, + "step": 31073 + }, + { + "epoch": 2.021459576776994, + "grad_norm": 5.325603008270264, + "learning_rate": 5.844456626010689e-06, + "loss": 0.196, + "step": 31074 + }, + { + "epoch": 2.0214731416169287, + "grad_norm": 4.340768337249756, + "learning_rate": 5.844319583390435e-06, + "loss": 0.1672, + "step": 31075 + }, + { + "epoch": 2.021486706456864, + "grad_norm": 4.362648010253906, + "learning_rate": 5.8441825407701805e-06, + "loss": 0.1949, + "step": 31076 + }, + { + "epoch": 2.021500271296799, + "grad_norm": 7.05803108215332, + "learning_rate": 5.844045498149925e-06, + "loss": 0.2209, + "step": 31077 + }, + { + "epoch": 2.0215138361367337, + "grad_norm": 3.6378257274627686, + "learning_rate": 5.84390845552967e-06, + "loss": 0.1139, + "step": 31078 + }, + { + "epoch": 2.0215274009766686, + "grad_norm": 4.344342231750488, + "learning_rate": 5.843771412909416e-06, + "loss": 0.1519, + "step": 31079 + }, + { + "epoch": 2.0215409658166035, + "grad_norm": 2.6591994762420654, + "learning_rate": 5.84363437028916e-06, + "loss": 0.0723, + "step": 31080 + }, + { + "epoch": 2.0215545306565383, + "grad_norm": 4.267821311950684, + "learning_rate": 5.8434973276689055e-06, + "loss": 0.1307, + "step": 31081 + }, + { + "epoch": 2.021568095496473, + "grad_norm": 6.254342555999756, + "learning_rate": 5.843360285048651e-06, + "loss": 0.2058, + "step": 31082 + }, + { + "epoch": 2.021581660336408, + "grad_norm": 4.936707496643066, + "learning_rate": 5.843223242428395e-06, + "loss": 0.2188, + "step": 31083 + }, + { + "epoch": 2.021595225176343, + "grad_norm": 5.645618438720703, + "learning_rate": 5.843086199808141e-06, + "loss": 0.2273, + "step": 31084 + }, + { + "epoch": 2.021608790016278, + "grad_norm": 5.00984525680542, + "learning_rate": 5.842949157187886e-06, + "loss": 0.1754, + "step": 31085 + }, + { + "epoch": 2.0216223548562127, + "grad_norm": 2.310758113861084, + "learning_rate": 5.8428121145676305e-06, + "loss": 0.1288, + "step": 31086 + }, + { + "epoch": 2.0216359196961475, + "grad_norm": 6.653704643249512, + "learning_rate": 5.842675071947376e-06, + "loss": 0.1996, + "step": 31087 + }, + { + "epoch": 2.0216494845360824, + "grad_norm": 2.679579496383667, + "learning_rate": 5.842538029327122e-06, + "loss": 0.0876, + "step": 31088 + }, + { + "epoch": 2.0216630493760173, + "grad_norm": 4.2173542976379395, + "learning_rate": 5.842400986706867e-06, + "loss": 0.1702, + "step": 31089 + }, + { + "epoch": 2.021676614215952, + "grad_norm": 4.106582164764404, + "learning_rate": 5.842263944086611e-06, + "loss": 0.1633, + "step": 31090 + }, + { + "epoch": 2.021690179055887, + "grad_norm": 4.169125080108643, + "learning_rate": 5.8421269014663564e-06, + "loss": 0.136, + "step": 31091 + }, + { + "epoch": 2.021703743895822, + "grad_norm": 3.8659720420837402, + "learning_rate": 5.8419898588461025e-06, + "loss": 0.1367, + "step": 31092 + }, + { + "epoch": 2.0217173087357567, + "grad_norm": 4.796975135803223, + "learning_rate": 5.841852816225847e-06, + "loss": 0.1797, + "step": 31093 + }, + { + "epoch": 2.0217308735756916, + "grad_norm": 2.931601047515869, + "learning_rate": 5.841715773605592e-06, + "loss": 0.0761, + "step": 31094 + }, + { + "epoch": 2.021744438415627, + "grad_norm": 5.758009433746338, + "learning_rate": 5.841578730985336e-06, + "loss": 0.184, + "step": 31095 + }, + { + "epoch": 2.0217580032555618, + "grad_norm": 4.054744243621826, + "learning_rate": 5.8414416883650815e-06, + "loss": 0.1273, + "step": 31096 + }, + { + "epoch": 2.0217715680954966, + "grad_norm": 4.426259994506836, + "learning_rate": 5.8413046457448275e-06, + "loss": 0.1431, + "step": 31097 + }, + { + "epoch": 2.0217851329354315, + "grad_norm": 4.760029315948486, + "learning_rate": 5.841167603124573e-06, + "loss": 0.1312, + "step": 31098 + }, + { + "epoch": 2.0217986977753664, + "grad_norm": 4.417428016662598, + "learning_rate": 5.841030560504317e-06, + "loss": 0.1179, + "step": 31099 + }, + { + "epoch": 2.0218122626153012, + "grad_norm": 4.167212009429932, + "learning_rate": 5.840893517884062e-06, + "loss": 0.1229, + "step": 31100 + }, + { + "epoch": 2.021825827455236, + "grad_norm": 3.365257501602173, + "learning_rate": 5.840756475263808e-06, + "loss": 0.1134, + "step": 31101 + }, + { + "epoch": 2.021839392295171, + "grad_norm": 3.944272518157959, + "learning_rate": 5.8406194326435526e-06, + "loss": 0.1326, + "step": 31102 + }, + { + "epoch": 2.021852957135106, + "grad_norm": 4.203176498413086, + "learning_rate": 5.840482390023298e-06, + "loss": 0.132, + "step": 31103 + }, + { + "epoch": 2.0218665219750407, + "grad_norm": 4.885890960693359, + "learning_rate": 5.840345347403042e-06, + "loss": 0.1991, + "step": 31104 + }, + { + "epoch": 2.0218800868149756, + "grad_norm": 4.156216144561768, + "learning_rate": 5.840208304782788e-06, + "loss": 0.1155, + "step": 31105 + }, + { + "epoch": 2.0218936516549104, + "grad_norm": 4.109571933746338, + "learning_rate": 5.840071262162533e-06, + "loss": 0.1641, + "step": 31106 + }, + { + "epoch": 2.0219072164948453, + "grad_norm": 3.299156427383423, + "learning_rate": 5.8399342195422785e-06, + "loss": 0.1065, + "step": 31107 + }, + { + "epoch": 2.02192078133478, + "grad_norm": 4.481093406677246, + "learning_rate": 5.839797176922023e-06, + "loss": 0.1877, + "step": 31108 + }, + { + "epoch": 2.021934346174715, + "grad_norm": 5.559284687042236, + "learning_rate": 5.839660134301768e-06, + "loss": 0.1735, + "step": 31109 + }, + { + "epoch": 2.02194791101465, + "grad_norm": 5.363767147064209, + "learning_rate": 5.839523091681514e-06, + "loss": 0.1547, + "step": 31110 + }, + { + "epoch": 2.0219614758545847, + "grad_norm": 4.661280632019043, + "learning_rate": 5.839386049061258e-06, + "loss": 0.1498, + "step": 31111 + }, + { + "epoch": 2.0219750406945196, + "grad_norm": 5.096444606781006, + "learning_rate": 5.8392490064410035e-06, + "loss": 0.2118, + "step": 31112 + }, + { + "epoch": 2.0219886055344545, + "grad_norm": 3.293816089630127, + "learning_rate": 5.839111963820749e-06, + "loss": 0.1422, + "step": 31113 + }, + { + "epoch": 2.02200217037439, + "grad_norm": 4.40207576751709, + "learning_rate": 5.838974921200494e-06, + "loss": 0.1179, + "step": 31114 + }, + { + "epoch": 2.0220157352143246, + "grad_norm": 4.071784496307373, + "learning_rate": 5.838837878580239e-06, + "loss": 0.0995, + "step": 31115 + }, + { + "epoch": 2.0220293000542595, + "grad_norm": 3.8254714012145996, + "learning_rate": 5.838700835959984e-06, + "loss": 0.1342, + "step": 31116 + }, + { + "epoch": 2.0220428648941944, + "grad_norm": 3.2614285945892334, + "learning_rate": 5.8385637933397285e-06, + "loss": 0.1002, + "step": 31117 + }, + { + "epoch": 2.0220564297341292, + "grad_norm": 6.692364692687988, + "learning_rate": 5.8384267507194746e-06, + "loss": 0.1922, + "step": 31118 + }, + { + "epoch": 2.022069994574064, + "grad_norm": 4.1238508224487305, + "learning_rate": 5.83828970809922e-06, + "loss": 0.1552, + "step": 31119 + }, + { + "epoch": 2.022083559413999, + "grad_norm": 4.111976623535156, + "learning_rate": 5.838152665478964e-06, + "loss": 0.1394, + "step": 31120 + }, + { + "epoch": 2.022097124253934, + "grad_norm": 5.868329048156738, + "learning_rate": 5.838015622858709e-06, + "loss": 0.1381, + "step": 31121 + }, + { + "epoch": 2.0221106890938687, + "grad_norm": 5.338443756103516, + "learning_rate": 5.8378785802384544e-06, + "loss": 0.1694, + "step": 31122 + }, + { + "epoch": 2.0221242539338036, + "grad_norm": 2.8829376697540283, + "learning_rate": 5.8377415376182005e-06, + "loss": 0.0858, + "step": 31123 + }, + { + "epoch": 2.0221378187737384, + "grad_norm": 3.4045066833496094, + "learning_rate": 5.837604494997945e-06, + "loss": 0.0921, + "step": 31124 + }, + { + "epoch": 2.0221513836136733, + "grad_norm": 3.658881187438965, + "learning_rate": 5.83746745237769e-06, + "loss": 0.1237, + "step": 31125 + }, + { + "epoch": 2.022164948453608, + "grad_norm": 3.3545756340026855, + "learning_rate": 5.837330409757434e-06, + "loss": 0.1357, + "step": 31126 + }, + { + "epoch": 2.022178513293543, + "grad_norm": 5.1104536056518555, + "learning_rate": 5.83719336713718e-06, + "loss": 0.1573, + "step": 31127 + }, + { + "epoch": 2.022192078133478, + "grad_norm": 4.6406354904174805, + "learning_rate": 5.8370563245169255e-06, + "loss": 0.0716, + "step": 31128 + }, + { + "epoch": 2.0222056429734128, + "grad_norm": 3.308919668197632, + "learning_rate": 5.83691928189667e-06, + "loss": 0.0962, + "step": 31129 + }, + { + "epoch": 2.0222192078133476, + "grad_norm": 4.6097893714904785, + "learning_rate": 5.836782239276415e-06, + "loss": 0.1125, + "step": 31130 + }, + { + "epoch": 2.0222327726532825, + "grad_norm": 3.154958724975586, + "learning_rate": 5.836645196656161e-06, + "loss": 0.1135, + "step": 31131 + }, + { + "epoch": 2.0222463374932174, + "grad_norm": 3.4776370525360107, + "learning_rate": 5.836508154035906e-06, + "loss": 0.1065, + "step": 31132 + }, + { + "epoch": 2.0222599023331527, + "grad_norm": 3.5602023601531982, + "learning_rate": 5.8363711114156506e-06, + "loss": 0.0639, + "step": 31133 + }, + { + "epoch": 2.0222734671730875, + "grad_norm": 3.7270140647888184, + "learning_rate": 5.836234068795396e-06, + "loss": 0.1227, + "step": 31134 + }, + { + "epoch": 2.0222870320130224, + "grad_norm": 4.1353888511657715, + "learning_rate": 5.83609702617514e-06, + "loss": 0.1506, + "step": 31135 + }, + { + "epoch": 2.0223005968529573, + "grad_norm": 6.891726493835449, + "learning_rate": 5.835959983554886e-06, + "loss": 0.2396, + "step": 31136 + }, + { + "epoch": 2.022314161692892, + "grad_norm": 4.474689960479736, + "learning_rate": 5.835822940934631e-06, + "loss": 0.1222, + "step": 31137 + }, + { + "epoch": 2.022327726532827, + "grad_norm": 3.8583555221557617, + "learning_rate": 5.8356858983143765e-06, + "loss": 0.1375, + "step": 31138 + }, + { + "epoch": 2.022341291372762, + "grad_norm": 4.903691291809082, + "learning_rate": 5.835548855694121e-06, + "loss": 0.2402, + "step": 31139 + }, + { + "epoch": 2.0223548562126967, + "grad_norm": 4.0061116218566895, + "learning_rate": 5.835411813073867e-06, + "loss": 0.143, + "step": 31140 + }, + { + "epoch": 2.0223684210526316, + "grad_norm": 3.3304786682128906, + "learning_rate": 5.835274770453612e-06, + "loss": 0.0854, + "step": 31141 + }, + { + "epoch": 2.0223819858925665, + "grad_norm": 4.489682197570801, + "learning_rate": 5.835137727833356e-06, + "loss": 0.1391, + "step": 31142 + }, + { + "epoch": 2.0223955507325013, + "grad_norm": 4.996711254119873, + "learning_rate": 5.8350006852131015e-06, + "loss": 0.123, + "step": 31143 + }, + { + "epoch": 2.022409115572436, + "grad_norm": 3.811446189880371, + "learning_rate": 5.8348636425928475e-06, + "loss": 0.0901, + "step": 31144 + }, + { + "epoch": 2.022422680412371, + "grad_norm": 4.669033050537109, + "learning_rate": 5.834726599972592e-06, + "loss": 0.1048, + "step": 31145 + }, + { + "epoch": 2.022436245252306, + "grad_norm": 3.5633530616760254, + "learning_rate": 5.834589557352337e-06, + "loss": 0.0855, + "step": 31146 + }, + { + "epoch": 2.022449810092241, + "grad_norm": 4.449235916137695, + "learning_rate": 5.834452514732082e-06, + "loss": 0.1411, + "step": 31147 + }, + { + "epoch": 2.0224633749321757, + "grad_norm": 4.365769863128662, + "learning_rate": 5.834315472111827e-06, + "loss": 0.1631, + "step": 31148 + }, + { + "epoch": 2.0224769397721105, + "grad_norm": 4.806119918823242, + "learning_rate": 5.834178429491573e-06, + "loss": 0.1355, + "step": 31149 + }, + { + "epoch": 2.0224905046120454, + "grad_norm": 4.376874923706055, + "learning_rate": 5.834041386871318e-06, + "loss": 0.1581, + "step": 31150 + }, + { + "epoch": 2.0225040694519802, + "grad_norm": 3.613661766052246, + "learning_rate": 5.833904344251062e-06, + "loss": 0.0923, + "step": 31151 + }, + { + "epoch": 2.0225176342919156, + "grad_norm": 4.8994975090026855, + "learning_rate": 5.833767301630807e-06, + "loss": 0.2127, + "step": 31152 + }, + { + "epoch": 2.0225311991318504, + "grad_norm": 5.182204723358154, + "learning_rate": 5.833630259010553e-06, + "loss": 0.1639, + "step": 31153 + }, + { + "epoch": 2.0225447639717853, + "grad_norm": 4.613064765930176, + "learning_rate": 5.833493216390298e-06, + "loss": 0.1413, + "step": 31154 + }, + { + "epoch": 2.02255832881172, + "grad_norm": 4.718308925628662, + "learning_rate": 5.833356173770043e-06, + "loss": 0.2423, + "step": 31155 + }, + { + "epoch": 2.022571893651655, + "grad_norm": 2.9470701217651367, + "learning_rate": 5.833219131149788e-06, + "loss": 0.1208, + "step": 31156 + }, + { + "epoch": 2.02258545849159, + "grad_norm": 5.980273246765137, + "learning_rate": 5.833082088529534e-06, + "loss": 0.2685, + "step": 31157 + }, + { + "epoch": 2.0225990233315247, + "grad_norm": 4.434802532196045, + "learning_rate": 5.832945045909278e-06, + "loss": 0.1619, + "step": 31158 + }, + { + "epoch": 2.0226125881714596, + "grad_norm": 4.85379695892334, + "learning_rate": 5.8328080032890235e-06, + "loss": 0.1467, + "step": 31159 + }, + { + "epoch": 2.0226261530113945, + "grad_norm": 4.475793838500977, + "learning_rate": 5.832670960668768e-06, + "loss": 0.1325, + "step": 31160 + }, + { + "epoch": 2.0226397178513293, + "grad_norm": 4.441945552825928, + "learning_rate": 5.832533918048514e-06, + "loss": 0.1394, + "step": 31161 + }, + { + "epoch": 2.022653282691264, + "grad_norm": 4.105896949768066, + "learning_rate": 5.832396875428259e-06, + "loss": 0.1167, + "step": 31162 + }, + { + "epoch": 2.022666847531199, + "grad_norm": 3.7268476486206055, + "learning_rate": 5.832259832808003e-06, + "loss": 0.1342, + "step": 31163 + }, + { + "epoch": 2.022680412371134, + "grad_norm": 3.5267717838287354, + "learning_rate": 5.8321227901877486e-06, + "loss": 0.1175, + "step": 31164 + }, + { + "epoch": 2.022693977211069, + "grad_norm": 5.791805744171143, + "learning_rate": 5.831985747567494e-06, + "loss": 0.2054, + "step": 31165 + }, + { + "epoch": 2.0227075420510037, + "grad_norm": 6.419836521148682, + "learning_rate": 5.83184870494724e-06, + "loss": 0.1962, + "step": 31166 + }, + { + "epoch": 2.0227211068909385, + "grad_norm": 5.400291919708252, + "learning_rate": 5.831711662326984e-06, + "loss": 0.2428, + "step": 31167 + }, + { + "epoch": 2.0227346717308734, + "grad_norm": 4.378450393676758, + "learning_rate": 5.831574619706729e-06, + "loss": 0.112, + "step": 31168 + }, + { + "epoch": 2.0227482365708083, + "grad_norm": 5.246620178222656, + "learning_rate": 5.831437577086474e-06, + "loss": 0.2813, + "step": 31169 + }, + { + "epoch": 2.022761801410743, + "grad_norm": 5.331899166107178, + "learning_rate": 5.83130053446622e-06, + "loss": 0.2346, + "step": 31170 + }, + { + "epoch": 2.0227753662506784, + "grad_norm": 5.796200275421143, + "learning_rate": 5.831163491845965e-06, + "loss": 0.2079, + "step": 31171 + }, + { + "epoch": 2.0227889310906133, + "grad_norm": 5.739612102508545, + "learning_rate": 5.83102644922571e-06, + "loss": 0.2519, + "step": 31172 + }, + { + "epoch": 2.022802495930548, + "grad_norm": 5.001238822937012, + "learning_rate": 5.830889406605454e-06, + "loss": 0.315, + "step": 31173 + }, + { + "epoch": 2.022816060770483, + "grad_norm": 5.088479518890381, + "learning_rate": 5.8307523639852e-06, + "loss": 0.1705, + "step": 31174 + }, + { + "epoch": 2.022829625610418, + "grad_norm": 4.275674343109131, + "learning_rate": 5.8306153213649455e-06, + "loss": 0.1845, + "step": 31175 + }, + { + "epoch": 2.0228431904503528, + "grad_norm": 6.189159393310547, + "learning_rate": 5.83047827874469e-06, + "loss": 0.236, + "step": 31176 + }, + { + "epoch": 2.0228567552902876, + "grad_norm": 7.7211785316467285, + "learning_rate": 5.830341236124435e-06, + "loss": 0.3759, + "step": 31177 + }, + { + "epoch": 2.0228703201302225, + "grad_norm": 4.987462520599365, + "learning_rate": 5.830204193504179e-06, + "loss": 0.222, + "step": 31178 + }, + { + "epoch": 2.0228838849701574, + "grad_norm": 4.115234375, + "learning_rate": 5.830067150883925e-06, + "loss": 0.1655, + "step": 31179 + }, + { + "epoch": 2.0228974498100922, + "grad_norm": 4.134896755218506, + "learning_rate": 5.829930108263671e-06, + "loss": 0.2241, + "step": 31180 + }, + { + "epoch": 2.022911014650027, + "grad_norm": 5.439570903778076, + "learning_rate": 5.829793065643416e-06, + "loss": 0.2262, + "step": 31181 + }, + { + "epoch": 2.022924579489962, + "grad_norm": 5.918800354003906, + "learning_rate": 5.82965602302316e-06, + "loss": 0.2376, + "step": 31182 + }, + { + "epoch": 2.022938144329897, + "grad_norm": 5.1591596603393555, + "learning_rate": 5.829518980402906e-06, + "loss": 0.1692, + "step": 31183 + }, + { + "epoch": 2.0229517091698317, + "grad_norm": 4.6430253982543945, + "learning_rate": 5.829381937782651e-06, + "loss": 0.1277, + "step": 31184 + }, + { + "epoch": 2.0229652740097666, + "grad_norm": 6.145174503326416, + "learning_rate": 5.829244895162396e-06, + "loss": 0.1762, + "step": 31185 + }, + { + "epoch": 2.0229788388497014, + "grad_norm": 5.496021270751953, + "learning_rate": 5.829107852542141e-06, + "loss": 0.207, + "step": 31186 + }, + { + "epoch": 2.0229924036896363, + "grad_norm": 5.320046424865723, + "learning_rate": 5.828970809921887e-06, + "loss": 0.1868, + "step": 31187 + }, + { + "epoch": 2.023005968529571, + "grad_norm": 5.897588729858398, + "learning_rate": 5.828833767301631e-06, + "loss": 0.1594, + "step": 31188 + }, + { + "epoch": 2.0230195333695065, + "grad_norm": 5.969021797180176, + "learning_rate": 5.828696724681376e-06, + "loss": 0.2647, + "step": 31189 + }, + { + "epoch": 2.0230330982094413, + "grad_norm": 5.0086140632629395, + "learning_rate": 5.8285596820611215e-06, + "loss": 0.2145, + "step": 31190 + }, + { + "epoch": 2.023046663049376, + "grad_norm": 5.413150310516357, + "learning_rate": 5.828422639440866e-06, + "loss": 0.233, + "step": 31191 + }, + { + "epoch": 2.023060227889311, + "grad_norm": 5.717957973480225, + "learning_rate": 5.828285596820612e-06, + "loss": 0.2322, + "step": 31192 + }, + { + "epoch": 2.023073792729246, + "grad_norm": 4.97625732421875, + "learning_rate": 5.828148554200357e-06, + "loss": 0.1672, + "step": 31193 + }, + { + "epoch": 2.023087357569181, + "grad_norm": 5.1810302734375, + "learning_rate": 5.828011511580101e-06, + "loss": 0.2467, + "step": 31194 + }, + { + "epoch": 2.0231009224091157, + "grad_norm": 6.619444847106934, + "learning_rate": 5.8278744689598466e-06, + "loss": 0.219, + "step": 31195 + }, + { + "epoch": 2.0231144872490505, + "grad_norm": 3.7823357582092285, + "learning_rate": 5.827737426339593e-06, + "loss": 0.1364, + "step": 31196 + }, + { + "epoch": 2.0231280520889854, + "grad_norm": 5.942790508270264, + "learning_rate": 5.827600383719337e-06, + "loss": 0.2979, + "step": 31197 + }, + { + "epoch": 2.0231416169289203, + "grad_norm": 4.9449381828308105, + "learning_rate": 5.827463341099082e-06, + "loss": 0.2529, + "step": 31198 + }, + { + "epoch": 2.023155181768855, + "grad_norm": 4.525881290435791, + "learning_rate": 5.827326298478827e-06, + "loss": 0.2113, + "step": 31199 + }, + { + "epoch": 2.02316874660879, + "grad_norm": 4.776209354400635, + "learning_rate": 5.827189255858573e-06, + "loss": 0.2209, + "step": 31200 + }, + { + "epoch": 2.023182311448725, + "grad_norm": 6.154207706451416, + "learning_rate": 5.827052213238318e-06, + "loss": 0.2399, + "step": 31201 + }, + { + "epoch": 2.0231958762886597, + "grad_norm": 5.527010917663574, + "learning_rate": 5.826915170618063e-06, + "loss": 0.1765, + "step": 31202 + }, + { + "epoch": 2.0232094411285946, + "grad_norm": 3.4693009853363037, + "learning_rate": 5.826778127997807e-06, + "loss": 0.1941, + "step": 31203 + }, + { + "epoch": 2.0232230059685294, + "grad_norm": 5.473498821258545, + "learning_rate": 5.826641085377552e-06, + "loss": 0.1512, + "step": 31204 + }, + { + "epoch": 2.0232365708084643, + "grad_norm": 7.227364540100098, + "learning_rate": 5.826504042757298e-06, + "loss": 0.2997, + "step": 31205 + }, + { + "epoch": 2.023250135648399, + "grad_norm": 5.471198558807373, + "learning_rate": 5.8263670001370435e-06, + "loss": 0.3107, + "step": 31206 + }, + { + "epoch": 2.023263700488334, + "grad_norm": 6.181941509246826, + "learning_rate": 5.826229957516788e-06, + "loss": 0.274, + "step": 31207 + }, + { + "epoch": 2.023277265328269, + "grad_norm": 3.7519803047180176, + "learning_rate": 5.826092914896533e-06, + "loss": 0.2548, + "step": 31208 + }, + { + "epoch": 2.023290830168204, + "grad_norm": 5.571971416473389, + "learning_rate": 5.825955872276279e-06, + "loss": 0.1939, + "step": 31209 + }, + { + "epoch": 2.023304395008139, + "grad_norm": 5.440896987915039, + "learning_rate": 5.825818829656023e-06, + "loss": 0.2084, + "step": 31210 + }, + { + "epoch": 2.023317959848074, + "grad_norm": 6.271611213684082, + "learning_rate": 5.825681787035769e-06, + "loss": 0.3039, + "step": 31211 + }, + { + "epoch": 2.023331524688009, + "grad_norm": 8.024931907653809, + "learning_rate": 5.825544744415513e-06, + "loss": 0.3859, + "step": 31212 + }, + { + "epoch": 2.0233450895279437, + "grad_norm": 6.109992504119873, + "learning_rate": 5.825407701795259e-06, + "loss": 0.1819, + "step": 31213 + }, + { + "epoch": 2.0233586543678785, + "grad_norm": 5.863861083984375, + "learning_rate": 5.825270659175004e-06, + "loss": 0.2325, + "step": 31214 + }, + { + "epoch": 2.0233722192078134, + "grad_norm": 5.34328556060791, + "learning_rate": 5.825133616554749e-06, + "loss": 0.2189, + "step": 31215 + }, + { + "epoch": 2.0233857840477483, + "grad_norm": 6.6442952156066895, + "learning_rate": 5.824996573934494e-06, + "loss": 0.2724, + "step": 31216 + }, + { + "epoch": 2.023399348887683, + "grad_norm": 5.531362533569336, + "learning_rate": 5.82485953131424e-06, + "loss": 0.3032, + "step": 31217 + }, + { + "epoch": 2.023412913727618, + "grad_norm": 5.356703281402588, + "learning_rate": 5.824722488693985e-06, + "loss": 0.2794, + "step": 31218 + }, + { + "epoch": 2.023426478567553, + "grad_norm": 5.3925580978393555, + "learning_rate": 5.824585446073729e-06, + "loss": 0.2509, + "step": 31219 + }, + { + "epoch": 2.0234400434074877, + "grad_norm": 8.652953147888184, + "learning_rate": 5.824448403453474e-06, + "loss": 0.2506, + "step": 31220 + }, + { + "epoch": 2.0234536082474226, + "grad_norm": 6.3571577072143555, + "learning_rate": 5.8243113608332195e-06, + "loss": 0.2538, + "step": 31221 + }, + { + "epoch": 2.0234671730873575, + "grad_norm": 5.4173583984375, + "learning_rate": 5.824174318212965e-06, + "loss": 0.2241, + "step": 31222 + }, + { + "epoch": 2.0234807379272923, + "grad_norm": 7.306674480438232, + "learning_rate": 5.82403727559271e-06, + "loss": 0.3217, + "step": 31223 + }, + { + "epoch": 2.023494302767227, + "grad_norm": 4.642022609710693, + "learning_rate": 5.823900232972455e-06, + "loss": 0.2335, + "step": 31224 + }, + { + "epoch": 2.023507867607162, + "grad_norm": 5.571737766265869, + "learning_rate": 5.823763190352199e-06, + "loss": 0.1926, + "step": 31225 + }, + { + "epoch": 2.023521432447097, + "grad_norm": 4.661173343658447, + "learning_rate": 5.8236261477319454e-06, + "loss": 0.1766, + "step": 31226 + }, + { + "epoch": 2.0235349972870322, + "grad_norm": 4.689609050750732, + "learning_rate": 5.823489105111691e-06, + "loss": 0.1446, + "step": 31227 + }, + { + "epoch": 2.023548562126967, + "grad_norm": 5.379673957824707, + "learning_rate": 5.823352062491435e-06, + "loss": 0.2424, + "step": 31228 + }, + { + "epoch": 2.023562126966902, + "grad_norm": 5.159482479095459, + "learning_rate": 5.82321501987118e-06, + "loss": 0.3201, + "step": 31229 + }, + { + "epoch": 2.023575691806837, + "grad_norm": 7.978439807891846, + "learning_rate": 5.823077977250926e-06, + "loss": 0.358, + "step": 31230 + }, + { + "epoch": 2.0235892566467717, + "grad_norm": 6.016668796539307, + "learning_rate": 5.8229409346306705e-06, + "loss": 0.1935, + "step": 31231 + }, + { + "epoch": 2.0236028214867066, + "grad_norm": 4.10191535949707, + "learning_rate": 5.822803892010416e-06, + "loss": 0.218, + "step": 31232 + }, + { + "epoch": 2.0236163863266414, + "grad_norm": 4.8612141609191895, + "learning_rate": 5.822666849390161e-06, + "loss": 0.1857, + "step": 31233 + }, + { + "epoch": 2.0236299511665763, + "grad_norm": 4.361344814300537, + "learning_rate": 5.822529806769905e-06, + "loss": 0.2535, + "step": 31234 + }, + { + "epoch": 2.023643516006511, + "grad_norm": 6.650040149688721, + "learning_rate": 5.822392764149651e-06, + "loss": 0.2269, + "step": 31235 + }, + { + "epoch": 2.023657080846446, + "grad_norm": 5.53926420211792, + "learning_rate": 5.822255721529396e-06, + "loss": 0.265, + "step": 31236 + }, + { + "epoch": 2.023670645686381, + "grad_norm": 5.794167995452881, + "learning_rate": 5.822118678909141e-06, + "loss": 0.2676, + "step": 31237 + }, + { + "epoch": 2.0236842105263158, + "grad_norm": 5.394037246704102, + "learning_rate": 5.821981636288886e-06, + "loss": 0.223, + "step": 31238 + }, + { + "epoch": 2.0236977753662506, + "grad_norm": 4.273834228515625, + "learning_rate": 5.821844593668632e-06, + "loss": 0.1965, + "step": 31239 + }, + { + "epoch": 2.0237113402061855, + "grad_norm": 5.6998419761657715, + "learning_rate": 5.821707551048377e-06, + "loss": 0.3346, + "step": 31240 + }, + { + "epoch": 2.0237249050461203, + "grad_norm": 5.3105058670043945, + "learning_rate": 5.821570508428121e-06, + "loss": 0.1796, + "step": 31241 + }, + { + "epoch": 2.023738469886055, + "grad_norm": 5.053627014160156, + "learning_rate": 5.821433465807867e-06, + "loss": 0.2207, + "step": 31242 + }, + { + "epoch": 2.02375203472599, + "grad_norm": 6.187284469604492, + "learning_rate": 5.821296423187613e-06, + "loss": 0.186, + "step": 31243 + }, + { + "epoch": 2.023765599565925, + "grad_norm": 3.590053081512451, + "learning_rate": 5.821159380567357e-06, + "loss": 0.1583, + "step": 31244 + }, + { + "epoch": 2.02377916440586, + "grad_norm": 5.257990837097168, + "learning_rate": 5.821022337947102e-06, + "loss": 0.2684, + "step": 31245 + }, + { + "epoch": 2.023792729245795, + "grad_norm": 5.71854305267334, + "learning_rate": 5.8208852953268465e-06, + "loss": 0.2551, + "step": 31246 + }, + { + "epoch": 2.02380629408573, + "grad_norm": 3.9305777549743652, + "learning_rate": 5.820748252706592e-06, + "loss": 0.1423, + "step": 31247 + }, + { + "epoch": 2.023819858925665, + "grad_norm": 5.904084205627441, + "learning_rate": 5.820611210086338e-06, + "loss": 0.1605, + "step": 31248 + }, + { + "epoch": 2.0238334237655997, + "grad_norm": 3.9022886753082275, + "learning_rate": 5.820474167466083e-06, + "loss": 0.2587, + "step": 31249 + }, + { + "epoch": 2.0238469886055346, + "grad_norm": 4.837749481201172, + "learning_rate": 5.820337124845827e-06, + "loss": 0.2069, + "step": 31250 + }, + { + "epoch": 2.0238605534454694, + "grad_norm": 4.829869747161865, + "learning_rate": 5.820200082225572e-06, + "loss": 0.2503, + "step": 31251 + }, + { + "epoch": 2.0238741182854043, + "grad_norm": 6.562459468841553, + "learning_rate": 5.820063039605318e-06, + "loss": 0.2394, + "step": 31252 + }, + { + "epoch": 2.023887683125339, + "grad_norm": 4.983363151550293, + "learning_rate": 5.819925996985063e-06, + "loss": 0.1976, + "step": 31253 + }, + { + "epoch": 2.023901247965274, + "grad_norm": 4.2694993019104, + "learning_rate": 5.819788954364808e-06, + "loss": 0.1835, + "step": 31254 + }, + { + "epoch": 2.023914812805209, + "grad_norm": 5.044859409332275, + "learning_rate": 5.819651911744553e-06, + "loss": 0.3034, + "step": 31255 + }, + { + "epoch": 2.0239283776451438, + "grad_norm": 5.005366802215576, + "learning_rate": 5.819514869124298e-06, + "loss": 0.1833, + "step": 31256 + }, + { + "epoch": 2.0239419424850786, + "grad_norm": 5.140117168426514, + "learning_rate": 5.8193778265040434e-06, + "loss": 0.2344, + "step": 31257 + }, + { + "epoch": 2.0239555073250135, + "grad_norm": 6.80615234375, + "learning_rate": 5.819240783883789e-06, + "loss": 0.3821, + "step": 31258 + }, + { + "epoch": 2.0239690721649484, + "grad_norm": 4.351129055023193, + "learning_rate": 5.819103741263533e-06, + "loss": 0.167, + "step": 31259 + }, + { + "epoch": 2.0239826370048832, + "grad_norm": 4.761237621307373, + "learning_rate": 5.818966698643278e-06, + "loss": 0.2365, + "step": 31260 + }, + { + "epoch": 2.023996201844818, + "grad_norm": 3.433201313018799, + "learning_rate": 5.818829656023024e-06, + "loss": 0.1729, + "step": 31261 + }, + { + "epoch": 2.024009766684753, + "grad_norm": 4.913248062133789, + "learning_rate": 5.8186926134027685e-06, + "loss": 0.1962, + "step": 31262 + }, + { + "epoch": 2.024023331524688, + "grad_norm": 3.191256046295166, + "learning_rate": 5.818555570782514e-06, + "loss": 0.1462, + "step": 31263 + }, + { + "epoch": 2.0240368963646227, + "grad_norm": 3.2230663299560547, + "learning_rate": 5.818418528162259e-06, + "loss": 0.0968, + "step": 31264 + }, + { + "epoch": 2.024050461204558, + "grad_norm": 4.910565376281738, + "learning_rate": 5.818281485542005e-06, + "loss": 0.1949, + "step": 31265 + }, + { + "epoch": 2.024064026044493, + "grad_norm": 3.7014753818511963, + "learning_rate": 5.818144442921749e-06, + "loss": 0.1666, + "step": 31266 + }, + { + "epoch": 2.0240775908844277, + "grad_norm": 4.309627532958984, + "learning_rate": 5.818007400301494e-06, + "loss": 0.1526, + "step": 31267 + }, + { + "epoch": 2.0240911557243626, + "grad_norm": 3.889798879623413, + "learning_rate": 5.817870357681239e-06, + "loss": 0.2123, + "step": 31268 + }, + { + "epoch": 2.0241047205642975, + "grad_norm": 4.312389850616455, + "learning_rate": 5.817733315060985e-06, + "loss": 0.1841, + "step": 31269 + }, + { + "epoch": 2.0241182854042323, + "grad_norm": 4.656946182250977, + "learning_rate": 5.81759627244073e-06, + "loss": 0.2429, + "step": 31270 + }, + { + "epoch": 2.024131850244167, + "grad_norm": 3.2850818634033203, + "learning_rate": 5.817459229820474e-06, + "loss": 0.0875, + "step": 31271 + }, + { + "epoch": 2.024145415084102, + "grad_norm": 3.140580415725708, + "learning_rate": 5.817322187200219e-06, + "loss": 0.1215, + "step": 31272 + }, + { + "epoch": 2.024158979924037, + "grad_norm": 4.5448737144470215, + "learning_rate": 5.817185144579965e-06, + "loss": 0.2085, + "step": 31273 + }, + { + "epoch": 2.024172544763972, + "grad_norm": 4.516096591949463, + "learning_rate": 5.817048101959711e-06, + "loss": 0.1593, + "step": 31274 + }, + { + "epoch": 2.0241861096039067, + "grad_norm": 4.38353967666626, + "learning_rate": 5.816911059339455e-06, + "loss": 0.2503, + "step": 31275 + }, + { + "epoch": 2.0241996744438415, + "grad_norm": 7.1581926345825195, + "learning_rate": 5.8167740167192e-06, + "loss": 0.2247, + "step": 31276 + }, + { + "epoch": 2.0242132392837764, + "grad_norm": 4.532074451446533, + "learning_rate": 5.8166369740989445e-06, + "loss": 0.1615, + "step": 31277 + }, + { + "epoch": 2.0242268041237113, + "grad_norm": 4.121785640716553, + "learning_rate": 5.8164999314786905e-06, + "loss": 0.1664, + "step": 31278 + }, + { + "epoch": 2.024240368963646, + "grad_norm": 3.9479451179504395, + "learning_rate": 5.816362888858436e-06, + "loss": 0.1521, + "step": 31279 + }, + { + "epoch": 2.024253933803581, + "grad_norm": 4.511180400848389, + "learning_rate": 5.816225846238181e-06, + "loss": 0.1164, + "step": 31280 + }, + { + "epoch": 2.024267498643516, + "grad_norm": 3.005190849304199, + "learning_rate": 5.816088803617925e-06, + "loss": 0.1072, + "step": 31281 + }, + { + "epoch": 2.0242810634834507, + "grad_norm": 3.600245714187622, + "learning_rate": 5.815951760997671e-06, + "loss": 0.0998, + "step": 31282 + }, + { + "epoch": 2.0242946283233856, + "grad_norm": 4.058021545410156, + "learning_rate": 5.815814718377416e-06, + "loss": 0.1733, + "step": 31283 + }, + { + "epoch": 2.024308193163321, + "grad_norm": 4.9384918212890625, + "learning_rate": 5.815677675757161e-06, + "loss": 0.2233, + "step": 31284 + }, + { + "epoch": 2.0243217580032558, + "grad_norm": 5.210160732269287, + "learning_rate": 5.815540633136906e-06, + "loss": 0.2481, + "step": 31285 + }, + { + "epoch": 2.0243353228431906, + "grad_norm": 4.7175679206848145, + "learning_rate": 5.815403590516652e-06, + "loss": 0.095, + "step": 31286 + }, + { + "epoch": 2.0243488876831255, + "grad_norm": 4.253451347351074, + "learning_rate": 5.815266547896396e-06, + "loss": 0.1425, + "step": 31287 + }, + { + "epoch": 2.0243624525230604, + "grad_norm": 3.074932336807251, + "learning_rate": 5.8151295052761414e-06, + "loss": 0.1901, + "step": 31288 + }, + { + "epoch": 2.024376017362995, + "grad_norm": 4.217238426208496, + "learning_rate": 5.814992462655887e-06, + "loss": 0.1344, + "step": 31289 + }, + { + "epoch": 2.02438958220293, + "grad_norm": 7.545583248138428, + "learning_rate": 5.814855420035631e-06, + "loss": 0.2154, + "step": 31290 + }, + { + "epoch": 2.024403147042865, + "grad_norm": 3.499725341796875, + "learning_rate": 5.814718377415377e-06, + "loss": 0.1419, + "step": 31291 + }, + { + "epoch": 2.0244167118828, + "grad_norm": 4.176524639129639, + "learning_rate": 5.814581334795122e-06, + "loss": 0.2443, + "step": 31292 + }, + { + "epoch": 2.0244302767227347, + "grad_norm": 5.243805408477783, + "learning_rate": 5.8144442921748665e-06, + "loss": 0.2093, + "step": 31293 + }, + { + "epoch": 2.0244438415626695, + "grad_norm": 3.3474528789520264, + "learning_rate": 5.814307249554612e-06, + "loss": 0.1299, + "step": 31294 + }, + { + "epoch": 2.0244574064026044, + "grad_norm": 4.219059467315674, + "learning_rate": 5.814170206934358e-06, + "loss": 0.1781, + "step": 31295 + }, + { + "epoch": 2.0244709712425393, + "grad_norm": 4.296371936798096, + "learning_rate": 5.814033164314102e-06, + "loss": 0.2519, + "step": 31296 + }, + { + "epoch": 2.024484536082474, + "grad_norm": 4.680388927459717, + "learning_rate": 5.813896121693847e-06, + "loss": 0.1701, + "step": 31297 + }, + { + "epoch": 2.024498100922409, + "grad_norm": 5.687288284301758, + "learning_rate": 5.813759079073592e-06, + "loss": 0.1968, + "step": 31298 + }, + { + "epoch": 2.024511665762344, + "grad_norm": 6.7397780418396, + "learning_rate": 5.813622036453338e-06, + "loss": 0.1987, + "step": 31299 + }, + { + "epoch": 2.0245252306022787, + "grad_norm": 4.623495101928711, + "learning_rate": 5.813484993833083e-06, + "loss": 0.1459, + "step": 31300 + }, + { + "epoch": 2.0245387954422136, + "grad_norm": 4.278435707092285, + "learning_rate": 5.813347951212828e-06, + "loss": 0.1664, + "step": 31301 + }, + { + "epoch": 2.0245523602821485, + "grad_norm": 4.827096939086914, + "learning_rate": 5.813210908592572e-06, + "loss": 0.2045, + "step": 31302 + }, + { + "epoch": 2.0245659251220838, + "grad_norm": 3.224224090576172, + "learning_rate": 5.813073865972317e-06, + "loss": 0.0944, + "step": 31303 + }, + { + "epoch": 2.0245794899620186, + "grad_norm": 4.400850772857666, + "learning_rate": 5.8129368233520634e-06, + "loss": 0.1585, + "step": 31304 + }, + { + "epoch": 2.0245930548019535, + "grad_norm": 3.6426401138305664, + "learning_rate": 5.812799780731808e-06, + "loss": 0.1488, + "step": 31305 + }, + { + "epoch": 2.0246066196418884, + "grad_norm": 4.941890239715576, + "learning_rate": 5.812662738111553e-06, + "loss": 0.1697, + "step": 31306 + }, + { + "epoch": 2.0246201844818232, + "grad_norm": 3.954190492630005, + "learning_rate": 5.812525695491298e-06, + "loss": 0.1311, + "step": 31307 + }, + { + "epoch": 2.024633749321758, + "grad_norm": 6.230375289916992, + "learning_rate": 5.812388652871044e-06, + "loss": 0.2025, + "step": 31308 + }, + { + "epoch": 2.024647314161693, + "grad_norm": 3.9482438564300537, + "learning_rate": 5.8122516102507885e-06, + "loss": 0.155, + "step": 31309 + }, + { + "epoch": 2.024660879001628, + "grad_norm": 3.752040147781372, + "learning_rate": 5.812114567630534e-06, + "loss": 0.1804, + "step": 31310 + }, + { + "epoch": 2.0246744438415627, + "grad_norm": 5.930881023406982, + "learning_rate": 5.811977525010278e-06, + "loss": 0.2228, + "step": 31311 + }, + { + "epoch": 2.0246880086814976, + "grad_norm": 4.514124870300293, + "learning_rate": 5.811840482390024e-06, + "loss": 0.2022, + "step": 31312 + }, + { + "epoch": 2.0247015735214324, + "grad_norm": 6.391369819641113, + "learning_rate": 5.811703439769769e-06, + "loss": 0.2991, + "step": 31313 + }, + { + "epoch": 2.0247151383613673, + "grad_norm": 4.460489273071289, + "learning_rate": 5.811566397149514e-06, + "loss": 0.2288, + "step": 31314 + }, + { + "epoch": 2.024728703201302, + "grad_norm": 5.006066799163818, + "learning_rate": 5.811429354529259e-06, + "loss": 0.1924, + "step": 31315 + }, + { + "epoch": 2.024742268041237, + "grad_norm": 6.19888973236084, + "learning_rate": 5.811292311909004e-06, + "loss": 0.2754, + "step": 31316 + }, + { + "epoch": 2.024755832881172, + "grad_norm": 4.86897087097168, + "learning_rate": 5.81115526928875e-06, + "loss": 0.1889, + "step": 31317 + }, + { + "epoch": 2.0247693977211068, + "grad_norm": 4.757304668426514, + "learning_rate": 5.811018226668494e-06, + "loss": 0.1935, + "step": 31318 + }, + { + "epoch": 2.0247829625610416, + "grad_norm": 4.350261688232422, + "learning_rate": 5.8108811840482394e-06, + "loss": 0.2241, + "step": 31319 + }, + { + "epoch": 2.0247965274009765, + "grad_norm": 4.852260589599609, + "learning_rate": 5.810744141427984e-06, + "loss": 0.236, + "step": 31320 + }, + { + "epoch": 2.0248100922409114, + "grad_norm": 5.014920234680176, + "learning_rate": 5.81060709880773e-06, + "loss": 0.1765, + "step": 31321 + }, + { + "epoch": 2.0248236570808467, + "grad_norm": 4.1462531089782715, + "learning_rate": 5.810470056187475e-06, + "loss": 0.1298, + "step": 31322 + }, + { + "epoch": 2.0248372219207815, + "grad_norm": 5.644700527191162, + "learning_rate": 5.81033301356722e-06, + "loss": 0.2679, + "step": 31323 + }, + { + "epoch": 2.0248507867607164, + "grad_norm": 6.083492755889893, + "learning_rate": 5.8101959709469645e-06, + "loss": 0.1454, + "step": 31324 + }, + { + "epoch": 2.0248643516006513, + "grad_norm": 5.044363021850586, + "learning_rate": 5.8100589283267105e-06, + "loss": 0.2223, + "step": 31325 + }, + { + "epoch": 2.024877916440586, + "grad_norm": 4.2057623863220215, + "learning_rate": 5.809921885706456e-06, + "loss": 0.1649, + "step": 31326 + }, + { + "epoch": 2.024891481280521, + "grad_norm": 7.2442097663879395, + "learning_rate": 5.8097848430862e-06, + "loss": 0.2238, + "step": 31327 + }, + { + "epoch": 2.024905046120456, + "grad_norm": 5.194863796234131, + "learning_rate": 5.809647800465945e-06, + "loss": 0.1998, + "step": 31328 + }, + { + "epoch": 2.0249186109603907, + "grad_norm": 5.200730323791504, + "learning_rate": 5.80951075784569e-06, + "loss": 0.154, + "step": 31329 + }, + { + "epoch": 2.0249321758003256, + "grad_norm": 4.837625980377197, + "learning_rate": 5.8093737152254356e-06, + "loss": 0.1449, + "step": 31330 + }, + { + "epoch": 2.0249457406402604, + "grad_norm": 5.8117594718933105, + "learning_rate": 5.809236672605181e-06, + "loss": 0.1634, + "step": 31331 + }, + { + "epoch": 2.0249593054801953, + "grad_norm": 3.9391350746154785, + "learning_rate": 5.809099629984926e-06, + "loss": 0.1632, + "step": 31332 + }, + { + "epoch": 2.02497287032013, + "grad_norm": 4.185323238372803, + "learning_rate": 5.80896258736467e-06, + "loss": 0.2341, + "step": 31333 + }, + { + "epoch": 2.024986435160065, + "grad_norm": 5.148209095001221, + "learning_rate": 5.808825544744416e-06, + "loss": 0.2437, + "step": 31334 + }, + { + "epoch": 2.025, + "grad_norm": 4.560213565826416, + "learning_rate": 5.8086885021241615e-06, + "loss": 0.2449, + "step": 31335 + }, + { + "epoch": 2.0250135648399348, + "grad_norm": 4.900280952453613, + "learning_rate": 5.808551459503906e-06, + "loss": 0.1307, + "step": 31336 + }, + { + "epoch": 2.0250271296798696, + "grad_norm": 6.039163589477539, + "learning_rate": 5.808414416883651e-06, + "loss": 0.1884, + "step": 31337 + }, + { + "epoch": 2.0250406945198045, + "grad_norm": 5.012422561645508, + "learning_rate": 5.808277374263397e-06, + "loss": 0.1877, + "step": 31338 + }, + { + "epoch": 2.0250542593597394, + "grad_norm": 6.7476701736450195, + "learning_rate": 5.808140331643141e-06, + "loss": 0.1554, + "step": 31339 + }, + { + "epoch": 2.0250678241996742, + "grad_norm": 6.911814212799072, + "learning_rate": 5.8080032890228865e-06, + "loss": 0.2366, + "step": 31340 + }, + { + "epoch": 2.0250813890396095, + "grad_norm": 5.619748592376709, + "learning_rate": 5.807866246402632e-06, + "loss": 0.2212, + "step": 31341 + }, + { + "epoch": 2.0250949538795444, + "grad_norm": 4.835917949676514, + "learning_rate": 5.807729203782376e-06, + "loss": 0.1994, + "step": 31342 + }, + { + "epoch": 2.0251085187194793, + "grad_norm": 4.378597736358643, + "learning_rate": 5.807592161162122e-06, + "loss": 0.2291, + "step": 31343 + }, + { + "epoch": 2.025122083559414, + "grad_norm": 4.622643947601318, + "learning_rate": 5.807455118541867e-06, + "loss": 0.1856, + "step": 31344 + }, + { + "epoch": 2.025135648399349, + "grad_norm": 5.493952751159668, + "learning_rate": 5.8073180759216115e-06, + "loss": 0.2201, + "step": 31345 + }, + { + "epoch": 2.025149213239284, + "grad_norm": 4.8331499099731445, + "learning_rate": 5.807181033301357e-06, + "loss": 0.0817, + "step": 31346 + }, + { + "epoch": 2.0251627780792187, + "grad_norm": 4.0506744384765625, + "learning_rate": 5.807043990681103e-06, + "loss": 0.1906, + "step": 31347 + }, + { + "epoch": 2.0251763429191536, + "grad_norm": 4.171058177947998, + "learning_rate": 5.806906948060848e-06, + "loss": 0.2397, + "step": 31348 + }, + { + "epoch": 2.0251899077590885, + "grad_norm": 5.951988220214844, + "learning_rate": 5.806769905440592e-06, + "loss": 0.2992, + "step": 31349 + }, + { + "epoch": 2.0252034725990233, + "grad_norm": 6.9445576667785645, + "learning_rate": 5.8066328628203374e-06, + "loss": 0.4167, + "step": 31350 + }, + { + "epoch": 2.025217037438958, + "grad_norm": 6.920068264007568, + "learning_rate": 5.8064958202000835e-06, + "loss": 0.2717, + "step": 31351 + }, + { + "epoch": 2.025230602278893, + "grad_norm": 5.582949638366699, + "learning_rate": 5.806358777579828e-06, + "loss": 0.2289, + "step": 31352 + }, + { + "epoch": 2.025244167118828, + "grad_norm": 5.222111701965332, + "learning_rate": 5.806221734959573e-06, + "loss": 0.2976, + "step": 31353 + }, + { + "epoch": 2.025257731958763, + "grad_norm": 4.873632907867432, + "learning_rate": 5.806084692339317e-06, + "loss": 0.2387, + "step": 31354 + }, + { + "epoch": 2.0252712967986977, + "grad_norm": 5.321532726287842, + "learning_rate": 5.8059476497190625e-06, + "loss": 0.2234, + "step": 31355 + }, + { + "epoch": 2.0252848616386325, + "grad_norm": 5.645258903503418, + "learning_rate": 5.8058106070988085e-06, + "loss": 0.426, + "step": 31356 + }, + { + "epoch": 2.0252984264785674, + "grad_norm": 6.205279350280762, + "learning_rate": 5.805673564478554e-06, + "loss": 0.3277, + "step": 31357 + }, + { + "epoch": 2.0253119913185023, + "grad_norm": 5.376227855682373, + "learning_rate": 5.805536521858298e-06, + "loss": 0.223, + "step": 31358 + }, + { + "epoch": 2.025325556158437, + "grad_norm": 5.407473087310791, + "learning_rate": 5.805399479238043e-06, + "loss": 0.208, + "step": 31359 + }, + { + "epoch": 2.0253391209983724, + "grad_norm": 6.033572196960449, + "learning_rate": 5.805262436617789e-06, + "loss": 0.3053, + "step": 31360 + }, + { + "epoch": 2.0253526858383073, + "grad_norm": 4.2847771644592285, + "learning_rate": 5.8051253939975336e-06, + "loss": 0.2448, + "step": 31361 + }, + { + "epoch": 2.025366250678242, + "grad_norm": 4.255252838134766, + "learning_rate": 5.804988351377279e-06, + "loss": 0.1675, + "step": 31362 + }, + { + "epoch": 2.025379815518177, + "grad_norm": 7.74020528793335, + "learning_rate": 5.804851308757024e-06, + "loss": 0.4216, + "step": 31363 + }, + { + "epoch": 2.025393380358112, + "grad_norm": 4.615942478179932, + "learning_rate": 5.804714266136769e-06, + "loss": 0.2512, + "step": 31364 + }, + { + "epoch": 2.0254069451980468, + "grad_norm": 4.841658592224121, + "learning_rate": 5.804577223516514e-06, + "loss": 0.2935, + "step": 31365 + }, + { + "epoch": 2.0254205100379816, + "grad_norm": 8.30505657196045, + "learning_rate": 5.8044401808962595e-06, + "loss": 0.2468, + "step": 31366 + }, + { + "epoch": 2.0254340748779165, + "grad_norm": 4.532249450683594, + "learning_rate": 5.804303138276004e-06, + "loss": 0.2166, + "step": 31367 + }, + { + "epoch": 2.0254476397178514, + "grad_norm": 6.663468837738037, + "learning_rate": 5.80416609565575e-06, + "loss": 0.2384, + "step": 31368 + }, + { + "epoch": 2.025461204557786, + "grad_norm": 4.750587463378906, + "learning_rate": 5.804029053035495e-06, + "loss": 0.2141, + "step": 31369 + }, + { + "epoch": 2.025474769397721, + "grad_norm": 5.371639728546143, + "learning_rate": 5.803892010415239e-06, + "loss": 0.2123, + "step": 31370 + }, + { + "epoch": 2.025488334237656, + "grad_norm": 4.330106258392334, + "learning_rate": 5.8037549677949845e-06, + "loss": 0.0993, + "step": 31371 + }, + { + "epoch": 2.025501899077591, + "grad_norm": 5.952037334442139, + "learning_rate": 5.80361792517473e-06, + "loss": 0.2956, + "step": 31372 + }, + { + "epoch": 2.0255154639175257, + "grad_norm": 6.3840413093566895, + "learning_rate": 5.803480882554475e-06, + "loss": 0.1789, + "step": 31373 + }, + { + "epoch": 2.0255290287574605, + "grad_norm": 4.967148780822754, + "learning_rate": 5.80334383993422e-06, + "loss": 0.209, + "step": 31374 + }, + { + "epoch": 2.0255425935973954, + "grad_norm": 7.302680969238281, + "learning_rate": 5.803206797313965e-06, + "loss": 0.2472, + "step": 31375 + }, + { + "epoch": 2.0255561584373303, + "grad_norm": 6.9639081954956055, + "learning_rate": 5.8030697546937096e-06, + "loss": 0.2191, + "step": 31376 + }, + { + "epoch": 2.025569723277265, + "grad_norm": 6.145883083343506, + "learning_rate": 5.802932712073456e-06, + "loss": 0.2522, + "step": 31377 + }, + { + "epoch": 2.0255832881172, + "grad_norm": 6.316090106964111, + "learning_rate": 5.802795669453201e-06, + "loss": 0.2383, + "step": 31378 + }, + { + "epoch": 2.0255968529571353, + "grad_norm": 4.977849960327148, + "learning_rate": 5.802658626832945e-06, + "loss": 0.2797, + "step": 31379 + }, + { + "epoch": 2.02561041779707, + "grad_norm": 4.549228668212891, + "learning_rate": 5.80252158421269e-06, + "loss": 0.2268, + "step": 31380 + }, + { + "epoch": 2.025623982637005, + "grad_norm": 4.0584893226623535, + "learning_rate": 5.802384541592436e-06, + "loss": 0.1777, + "step": 31381 + }, + { + "epoch": 2.02563754747694, + "grad_norm": 4.026088714599609, + "learning_rate": 5.8022474989721815e-06, + "loss": 0.1807, + "step": 31382 + }, + { + "epoch": 2.025651112316875, + "grad_norm": 5.37759256362915, + "learning_rate": 5.802110456351926e-06, + "loss": 0.3075, + "step": 31383 + }, + { + "epoch": 2.0256646771568096, + "grad_norm": 3.7221269607543945, + "learning_rate": 5.801973413731671e-06, + "loss": 0.1844, + "step": 31384 + }, + { + "epoch": 2.0256782419967445, + "grad_norm": 7.159755706787109, + "learning_rate": 5.801836371111415e-06, + "loss": 0.3408, + "step": 31385 + }, + { + "epoch": 2.0256918068366794, + "grad_norm": 4.856331825256348, + "learning_rate": 5.801699328491161e-06, + "loss": 0.1439, + "step": 31386 + }, + { + "epoch": 2.0257053716766142, + "grad_norm": 4.160035610198975, + "learning_rate": 5.8015622858709065e-06, + "loss": 0.1462, + "step": 31387 + }, + { + "epoch": 2.025718936516549, + "grad_norm": 5.60374116897583, + "learning_rate": 5.801425243250651e-06, + "loss": 0.2843, + "step": 31388 + }, + { + "epoch": 2.025732501356484, + "grad_norm": 7.1189727783203125, + "learning_rate": 5.801288200630396e-06, + "loss": 0.1864, + "step": 31389 + }, + { + "epoch": 2.025746066196419, + "grad_norm": 6.916689395904541, + "learning_rate": 5.801151158010142e-06, + "loss": 0.3529, + "step": 31390 + }, + { + "epoch": 2.0257596310363537, + "grad_norm": 4.700474739074707, + "learning_rate": 5.801014115389887e-06, + "loss": 0.1888, + "step": 31391 + }, + { + "epoch": 2.0257731958762886, + "grad_norm": 6.465152263641357, + "learning_rate": 5.8008770727696316e-06, + "loss": 0.2976, + "step": 31392 + }, + { + "epoch": 2.0257867607162234, + "grad_norm": 5.39270544052124, + "learning_rate": 5.800740030149377e-06, + "loss": 0.2752, + "step": 31393 + }, + { + "epoch": 2.0258003255561583, + "grad_norm": 6.286915302276611, + "learning_rate": 5.800602987529123e-06, + "loss": 0.2684, + "step": 31394 + }, + { + "epoch": 2.025813890396093, + "grad_norm": 4.972365856170654, + "learning_rate": 5.800465944908867e-06, + "loss": 0.1995, + "step": 31395 + }, + { + "epoch": 2.025827455236028, + "grad_norm": 4.910745143890381, + "learning_rate": 5.800328902288612e-06, + "loss": 0.2385, + "step": 31396 + }, + { + "epoch": 2.025841020075963, + "grad_norm": 5.590447902679443, + "learning_rate": 5.8001918596683575e-06, + "loss": 0.1892, + "step": 31397 + }, + { + "epoch": 2.025854584915898, + "grad_norm": 5.594264984130859, + "learning_rate": 5.800054817048102e-06, + "loss": 0.2092, + "step": 31398 + }, + { + "epoch": 2.025868149755833, + "grad_norm": 5.53828239440918, + "learning_rate": 5.799917774427848e-06, + "loss": 0.3454, + "step": 31399 + }, + { + "epoch": 2.025881714595768, + "grad_norm": 5.671660900115967, + "learning_rate": 5.799780731807593e-06, + "loss": 0.2631, + "step": 31400 + }, + { + "epoch": 2.025895279435703, + "grad_norm": 5.753937721252441, + "learning_rate": 5.799643689187337e-06, + "loss": 0.178, + "step": 31401 + }, + { + "epoch": 2.0259088442756377, + "grad_norm": 5.543538570404053, + "learning_rate": 5.7995066465670825e-06, + "loss": 0.27, + "step": 31402 + }, + { + "epoch": 2.0259224091155725, + "grad_norm": 4.7463274002075195, + "learning_rate": 5.7993696039468285e-06, + "loss": 0.2122, + "step": 31403 + }, + { + "epoch": 2.0259359739555074, + "grad_norm": 4.905918598175049, + "learning_rate": 5.799232561326573e-06, + "loss": 0.2013, + "step": 31404 + }, + { + "epoch": 2.0259495387954423, + "grad_norm": 4.757081508636475, + "learning_rate": 5.799095518706318e-06, + "loss": 0.2305, + "step": 31405 + }, + { + "epoch": 2.025963103635377, + "grad_norm": 6.1530914306640625, + "learning_rate": 5.798958476086063e-06, + "loss": 0.1912, + "step": 31406 + }, + { + "epoch": 2.025976668475312, + "grad_norm": 4.46172571182251, + "learning_rate": 5.798821433465809e-06, + "loss": 0.1651, + "step": 31407 + }, + { + "epoch": 2.025990233315247, + "grad_norm": 6.211423873901367, + "learning_rate": 5.798684390845554e-06, + "loss": 0.221, + "step": 31408 + }, + { + "epoch": 2.0260037981551817, + "grad_norm": 3.8149056434631348, + "learning_rate": 5.798547348225299e-06, + "loss": 0.1323, + "step": 31409 + }, + { + "epoch": 2.0260173629951166, + "grad_norm": 5.4137773513793945, + "learning_rate": 5.798410305605043e-06, + "loss": 0.2543, + "step": 31410 + }, + { + "epoch": 2.0260309278350515, + "grad_norm": 6.483639717102051, + "learning_rate": 5.798273262984788e-06, + "loss": 0.2805, + "step": 31411 + }, + { + "epoch": 2.0260444926749863, + "grad_norm": 4.5773701667785645, + "learning_rate": 5.798136220364534e-06, + "loss": 0.2181, + "step": 31412 + }, + { + "epoch": 2.026058057514921, + "grad_norm": 4.79464864730835, + "learning_rate": 5.797999177744279e-06, + "loss": 0.2195, + "step": 31413 + }, + { + "epoch": 2.026071622354856, + "grad_norm": 5.044744491577148, + "learning_rate": 5.797862135124024e-06, + "loss": 0.2077, + "step": 31414 + }, + { + "epoch": 2.026085187194791, + "grad_norm": 4.277337074279785, + "learning_rate": 5.797725092503769e-06, + "loss": 0.1627, + "step": 31415 + }, + { + "epoch": 2.026098752034726, + "grad_norm": 5.666258335113525, + "learning_rate": 5.797588049883515e-06, + "loss": 0.2492, + "step": 31416 + }, + { + "epoch": 2.026112316874661, + "grad_norm": 4.434303283691406, + "learning_rate": 5.797451007263259e-06, + "loss": 0.167, + "step": 31417 + }, + { + "epoch": 2.026125881714596, + "grad_norm": 5.02217960357666, + "learning_rate": 5.7973139646430045e-06, + "loss": 0.1682, + "step": 31418 + }, + { + "epoch": 2.026139446554531, + "grad_norm": 5.123186111450195, + "learning_rate": 5.797176922022749e-06, + "loss": 0.2919, + "step": 31419 + }, + { + "epoch": 2.0261530113944657, + "grad_norm": 4.152202606201172, + "learning_rate": 5.797039879402495e-06, + "loss": 0.1564, + "step": 31420 + }, + { + "epoch": 2.0261665762344006, + "grad_norm": 4.344786643981934, + "learning_rate": 5.79690283678224e-06, + "loss": 0.1235, + "step": 31421 + }, + { + "epoch": 2.0261801410743354, + "grad_norm": 4.11253547668457, + "learning_rate": 5.796765794161985e-06, + "loss": 0.2073, + "step": 31422 + }, + { + "epoch": 2.0261937059142703, + "grad_norm": 4.2136735916137695, + "learning_rate": 5.7966287515417296e-06, + "loss": 0.1165, + "step": 31423 + }, + { + "epoch": 2.026207270754205, + "grad_norm": 5.921762466430664, + "learning_rate": 5.796491708921475e-06, + "loss": 0.2079, + "step": 31424 + }, + { + "epoch": 2.02622083559414, + "grad_norm": 4.471375465393066, + "learning_rate": 5.796354666301221e-06, + "loss": 0.1659, + "step": 31425 + }, + { + "epoch": 2.026234400434075, + "grad_norm": 4.094939231872559, + "learning_rate": 5.796217623680965e-06, + "loss": 0.1248, + "step": 31426 + }, + { + "epoch": 2.0262479652740097, + "grad_norm": 5.285923957824707, + "learning_rate": 5.79608058106071e-06, + "loss": 0.1685, + "step": 31427 + }, + { + "epoch": 2.0262615301139446, + "grad_norm": 4.983057022094727, + "learning_rate": 5.795943538440455e-06, + "loss": 0.1471, + "step": 31428 + }, + { + "epoch": 2.0262750949538795, + "grad_norm": 4.082727909088135, + "learning_rate": 5.795806495820201e-06, + "loss": 0.148, + "step": 31429 + }, + { + "epoch": 2.0262886597938143, + "grad_norm": 4.252615928649902, + "learning_rate": 5.795669453199946e-06, + "loss": 0.2064, + "step": 31430 + }, + { + "epoch": 2.026302224633749, + "grad_norm": 3.583144187927246, + "learning_rate": 5.795532410579691e-06, + "loss": 0.0545, + "step": 31431 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 4.044727325439453, + "learning_rate": 5.795395367959435e-06, + "loss": 0.1253, + "step": 31432 + }, + { + "epoch": 2.026329354313619, + "grad_norm": 6.194811820983887, + "learning_rate": 5.795258325339181e-06, + "loss": 0.2044, + "step": 31433 + }, + { + "epoch": 2.026342919153554, + "grad_norm": 3.50566029548645, + "learning_rate": 5.7951212827189265e-06, + "loss": 0.1581, + "step": 31434 + }, + { + "epoch": 2.0263564839934887, + "grad_norm": 4.059770107269287, + "learning_rate": 5.794984240098671e-06, + "loss": 0.1483, + "step": 31435 + }, + { + "epoch": 2.026370048833424, + "grad_norm": 3.5859601497650146, + "learning_rate": 5.794847197478416e-06, + "loss": 0.0636, + "step": 31436 + }, + { + "epoch": 2.026383613673359, + "grad_norm": 4.099684715270996, + "learning_rate": 5.794710154858162e-06, + "loss": 0.1629, + "step": 31437 + }, + { + "epoch": 2.0263971785132937, + "grad_norm": 4.670354843139648, + "learning_rate": 5.794573112237906e-06, + "loss": 0.1013, + "step": 31438 + }, + { + "epoch": 2.0264107433532286, + "grad_norm": 5.811747074127197, + "learning_rate": 5.794436069617652e-06, + "loss": 0.1376, + "step": 31439 + }, + { + "epoch": 2.0264243081931634, + "grad_norm": 7.729996681213379, + "learning_rate": 5.794299026997397e-06, + "loss": 0.2054, + "step": 31440 + }, + { + "epoch": 2.0264378730330983, + "grad_norm": 5.640031814575195, + "learning_rate": 5.794161984377141e-06, + "loss": 0.2302, + "step": 31441 + }, + { + "epoch": 2.026451437873033, + "grad_norm": 4.968193531036377, + "learning_rate": 5.794024941756887e-06, + "loss": 0.1322, + "step": 31442 + }, + { + "epoch": 2.026465002712968, + "grad_norm": 4.950216293334961, + "learning_rate": 5.793887899136632e-06, + "loss": 0.21, + "step": 31443 + }, + { + "epoch": 2.026478567552903, + "grad_norm": 4.713672637939453, + "learning_rate": 5.793750856516377e-06, + "loss": 0.1784, + "step": 31444 + }, + { + "epoch": 2.0264921323928378, + "grad_norm": 5.32790470123291, + "learning_rate": 5.793613813896122e-06, + "loss": 0.2522, + "step": 31445 + }, + { + "epoch": 2.0265056972327726, + "grad_norm": 5.391395568847656, + "learning_rate": 5.793476771275868e-06, + "loss": 0.1728, + "step": 31446 + }, + { + "epoch": 2.0265192620727075, + "grad_norm": 5.467123985290527, + "learning_rate": 5.793339728655612e-06, + "loss": 0.2333, + "step": 31447 + }, + { + "epoch": 2.0265328269126424, + "grad_norm": 5.380981922149658, + "learning_rate": 5.793202686035357e-06, + "loss": 0.1798, + "step": 31448 + }, + { + "epoch": 2.0265463917525772, + "grad_norm": 5.128340721130371, + "learning_rate": 5.7930656434151025e-06, + "loss": 0.209, + "step": 31449 + }, + { + "epoch": 2.026559956592512, + "grad_norm": 6.222262859344482, + "learning_rate": 5.7929286007948486e-06, + "loss": 0.3186, + "step": 31450 + }, + { + "epoch": 2.026573521432447, + "grad_norm": 5.804595470428467, + "learning_rate": 5.792791558174593e-06, + "loss": 0.183, + "step": 31451 + }, + { + "epoch": 2.026587086272382, + "grad_norm": 5.981701374053955, + "learning_rate": 5.792654515554338e-06, + "loss": 0.1885, + "step": 31452 + }, + { + "epoch": 2.0266006511123167, + "grad_norm": 8.422140121459961, + "learning_rate": 5.792517472934082e-06, + "loss": 0.328, + "step": 31453 + }, + { + "epoch": 2.0266142159522516, + "grad_norm": 6.9052534103393555, + "learning_rate": 5.7923804303138276e-06, + "loss": 0.1956, + "step": 31454 + }, + { + "epoch": 2.026627780792187, + "grad_norm": 4.403593063354492, + "learning_rate": 5.792243387693574e-06, + "loss": 0.1537, + "step": 31455 + }, + { + "epoch": 2.0266413456321217, + "grad_norm": 6.050261974334717, + "learning_rate": 5.792106345073319e-06, + "loss": 0.2107, + "step": 31456 + }, + { + "epoch": 2.0266549104720566, + "grad_norm": 3.9885449409484863, + "learning_rate": 5.791969302453063e-06, + "loss": 0.1652, + "step": 31457 + }, + { + "epoch": 2.0266684753119915, + "grad_norm": 5.003262519836426, + "learning_rate": 5.791832259832808e-06, + "loss": 0.1615, + "step": 31458 + }, + { + "epoch": 2.0266820401519263, + "grad_norm": 6.300954818725586, + "learning_rate": 5.791695217212554e-06, + "loss": 0.2572, + "step": 31459 + }, + { + "epoch": 2.026695604991861, + "grad_norm": 3.929525136947632, + "learning_rate": 5.791558174592299e-06, + "loss": 0.13, + "step": 31460 + }, + { + "epoch": 2.026709169831796, + "grad_norm": 5.185161113739014, + "learning_rate": 5.791421131972044e-06, + "loss": 0.1823, + "step": 31461 + }, + { + "epoch": 2.026722734671731, + "grad_norm": 4.086474418640137, + "learning_rate": 5.791284089351788e-06, + "loss": 0.0824, + "step": 31462 + }, + { + "epoch": 2.026736299511666, + "grad_norm": 5.441516399383545, + "learning_rate": 5.791147046731534e-06, + "loss": 0.1909, + "step": 31463 + }, + { + "epoch": 2.0267498643516006, + "grad_norm": 4.155700206756592, + "learning_rate": 5.791010004111279e-06, + "loss": 0.1568, + "step": 31464 + }, + { + "epoch": 2.0267634291915355, + "grad_norm": 3.5283875465393066, + "learning_rate": 5.7908729614910245e-06, + "loss": 0.1002, + "step": 31465 + }, + { + "epoch": 2.0267769940314704, + "grad_norm": 3.9608466625213623, + "learning_rate": 5.790735918870769e-06, + "loss": 0.0945, + "step": 31466 + }, + { + "epoch": 2.0267905588714052, + "grad_norm": 4.7070393562316895, + "learning_rate": 5.790598876250514e-06, + "loss": 0.1349, + "step": 31467 + }, + { + "epoch": 2.02680412371134, + "grad_norm": 6.5021538734436035, + "learning_rate": 5.79046183363026e-06, + "loss": 0.2025, + "step": 31468 + }, + { + "epoch": 2.026817688551275, + "grad_norm": 5.279685020446777, + "learning_rate": 5.790324791010004e-06, + "loss": 0.2849, + "step": 31469 + }, + { + "epoch": 2.02683125339121, + "grad_norm": 3.784398317337036, + "learning_rate": 5.79018774838975e-06, + "loss": 0.1237, + "step": 31470 + }, + { + "epoch": 2.0268448182311447, + "grad_norm": 3.491746664047241, + "learning_rate": 5.790050705769495e-06, + "loss": 0.0974, + "step": 31471 + }, + { + "epoch": 2.0268583830710796, + "grad_norm": 5.446599006652832, + "learning_rate": 5.78991366314924e-06, + "loss": 0.1606, + "step": 31472 + }, + { + "epoch": 2.0268719479110144, + "grad_norm": 4.483397960662842, + "learning_rate": 5.789776620528985e-06, + "loss": 0.1512, + "step": 31473 + }, + { + "epoch": 2.0268855127509497, + "grad_norm": 5.593812942504883, + "learning_rate": 5.78963957790873e-06, + "loss": 0.1588, + "step": 31474 + }, + { + "epoch": 2.0268990775908846, + "grad_norm": 4.770936489105225, + "learning_rate": 5.789502535288475e-06, + "loss": 0.1995, + "step": 31475 + }, + { + "epoch": 2.0269126424308195, + "grad_norm": 4.802027702331543, + "learning_rate": 5.789365492668221e-06, + "loss": 0.1558, + "step": 31476 + }, + { + "epoch": 2.0269262072707543, + "grad_norm": 5.51239013671875, + "learning_rate": 5.789228450047966e-06, + "loss": 0.2376, + "step": 31477 + }, + { + "epoch": 2.026939772110689, + "grad_norm": 3.769002914428711, + "learning_rate": 5.78909140742771e-06, + "loss": 0.088, + "step": 31478 + }, + { + "epoch": 2.026953336950624, + "grad_norm": 5.3722147941589355, + "learning_rate": 5.788954364807455e-06, + "loss": 0.1285, + "step": 31479 + }, + { + "epoch": 2.026966901790559, + "grad_norm": 6.235334873199463, + "learning_rate": 5.7888173221872005e-06, + "loss": 0.1628, + "step": 31480 + }, + { + "epoch": 2.026980466630494, + "grad_norm": 5.740928649902344, + "learning_rate": 5.788680279566946e-06, + "loss": 0.1689, + "step": 31481 + }, + { + "epoch": 2.0269940314704287, + "grad_norm": 4.877124309539795, + "learning_rate": 5.788543236946691e-06, + "loss": 0.126, + "step": 31482 + }, + { + "epoch": 2.0270075963103635, + "grad_norm": 5.5720930099487305, + "learning_rate": 5.788406194326436e-06, + "loss": 0.2109, + "step": 31483 + }, + { + "epoch": 2.0270211611502984, + "grad_norm": 4.924638748168945, + "learning_rate": 5.78826915170618e-06, + "loss": 0.1615, + "step": 31484 + }, + { + "epoch": 2.0270347259902333, + "grad_norm": 4.8334197998046875, + "learning_rate": 5.7881321090859264e-06, + "loss": 0.1011, + "step": 31485 + }, + { + "epoch": 2.027048290830168, + "grad_norm": 4.806172847747803, + "learning_rate": 5.787995066465672e-06, + "loss": 0.1626, + "step": 31486 + }, + { + "epoch": 2.027061855670103, + "grad_norm": 5.344363689422607, + "learning_rate": 5.787858023845416e-06, + "loss": 0.2576, + "step": 31487 + }, + { + "epoch": 2.027075420510038, + "grad_norm": 7.540692329406738, + "learning_rate": 5.787720981225161e-06, + "loss": 0.2607, + "step": 31488 + }, + { + "epoch": 2.0270889853499727, + "grad_norm": 7.210381984710693, + "learning_rate": 5.787583938604907e-06, + "loss": 0.25, + "step": 31489 + }, + { + "epoch": 2.0271025501899076, + "grad_norm": 5.502627372741699, + "learning_rate": 5.787446895984652e-06, + "loss": 0.2185, + "step": 31490 + }, + { + "epoch": 2.0271161150298425, + "grad_norm": 3.5693440437316895, + "learning_rate": 5.787309853364397e-06, + "loss": 0.173, + "step": 31491 + }, + { + "epoch": 2.0271296798697773, + "grad_norm": 5.021147727966309, + "learning_rate": 5.787172810744142e-06, + "loss": 0.155, + "step": 31492 + }, + { + "epoch": 2.0271432447097126, + "grad_norm": 3.6279077529907227, + "learning_rate": 5.787035768123886e-06, + "loss": 0.1382, + "step": 31493 + }, + { + "epoch": 2.0271568095496475, + "grad_norm": 3.8906667232513428, + "learning_rate": 5.786898725503632e-06, + "loss": 0.1727, + "step": 31494 + }, + { + "epoch": 2.0271703743895824, + "grad_norm": 3.5612213611602783, + "learning_rate": 5.786761682883377e-06, + "loss": 0.1277, + "step": 31495 + }, + { + "epoch": 2.0271839392295172, + "grad_norm": 4.6381120681762695, + "learning_rate": 5.786624640263122e-06, + "loss": 0.2161, + "step": 31496 + }, + { + "epoch": 2.027197504069452, + "grad_norm": 6.338496208190918, + "learning_rate": 5.786487597642867e-06, + "loss": 0.2407, + "step": 31497 + }, + { + "epoch": 2.027211068909387, + "grad_norm": 4.488436698913574, + "learning_rate": 5.786350555022613e-06, + "loss": 0.1919, + "step": 31498 + }, + { + "epoch": 2.027224633749322, + "grad_norm": 5.571244716644287, + "learning_rate": 5.786213512402358e-06, + "loss": 0.1828, + "step": 31499 + }, + { + "epoch": 2.0272381985892567, + "grad_norm": 4.227355003356934, + "learning_rate": 5.786076469782102e-06, + "loss": 0.159, + "step": 31500 + }, + { + "epoch": 2.0272517634291916, + "grad_norm": 4.1418914794921875, + "learning_rate": 5.785939427161848e-06, + "loss": 0.1151, + "step": 31501 + }, + { + "epoch": 2.0272653282691264, + "grad_norm": 5.605856895446777, + "learning_rate": 5.785802384541594e-06, + "loss": 0.2036, + "step": 31502 + }, + { + "epoch": 2.0272788931090613, + "grad_norm": 4.634557723999023, + "learning_rate": 5.785665341921338e-06, + "loss": 0.1523, + "step": 31503 + }, + { + "epoch": 2.027292457948996, + "grad_norm": 5.7658371925354, + "learning_rate": 5.785528299301083e-06, + "loss": 0.1578, + "step": 31504 + }, + { + "epoch": 2.027306022788931, + "grad_norm": 5.839292049407959, + "learning_rate": 5.785391256680828e-06, + "loss": 0.1944, + "step": 31505 + }, + { + "epoch": 2.027319587628866, + "grad_norm": 4.0016021728515625, + "learning_rate": 5.7852542140605735e-06, + "loss": 0.1285, + "step": 31506 + }, + { + "epoch": 2.0273331524688007, + "grad_norm": 5.583497524261475, + "learning_rate": 5.785117171440319e-06, + "loss": 0.1751, + "step": 31507 + }, + { + "epoch": 2.0273467173087356, + "grad_norm": 4.405056953430176, + "learning_rate": 5.784980128820064e-06, + "loss": 0.1152, + "step": 31508 + }, + { + "epoch": 2.0273602821486705, + "grad_norm": 5.4269304275512695, + "learning_rate": 5.784843086199808e-06, + "loss": 0.2187, + "step": 31509 + }, + { + "epoch": 2.0273738469886053, + "grad_norm": 5.659458160400391, + "learning_rate": 5.784706043579553e-06, + "loss": 0.1653, + "step": 31510 + }, + { + "epoch": 2.02738741182854, + "grad_norm": 4.94170618057251, + "learning_rate": 5.784569000959299e-06, + "loss": 0.17, + "step": 31511 + }, + { + "epoch": 2.0274009766684755, + "grad_norm": 5.1508870124816895, + "learning_rate": 5.784431958339044e-06, + "loss": 0.1904, + "step": 31512 + }, + { + "epoch": 2.0274145415084104, + "grad_norm": 5.03253173828125, + "learning_rate": 5.784294915718789e-06, + "loss": 0.1491, + "step": 31513 + }, + { + "epoch": 2.0274281063483452, + "grad_norm": 4.741971969604492, + "learning_rate": 5.784157873098534e-06, + "loss": 0.1891, + "step": 31514 + }, + { + "epoch": 2.02744167118828, + "grad_norm": 4.094430923461914, + "learning_rate": 5.784020830478279e-06, + "loss": 0.1314, + "step": 31515 + }, + { + "epoch": 2.027455236028215, + "grad_norm": 5.262353420257568, + "learning_rate": 5.7838837878580244e-06, + "loss": 0.1317, + "step": 31516 + }, + { + "epoch": 2.02746880086815, + "grad_norm": 5.1377973556518555, + "learning_rate": 5.78374674523777e-06, + "loss": 0.2158, + "step": 31517 + }, + { + "epoch": 2.0274823657080847, + "grad_norm": 5.832840442657471, + "learning_rate": 5.783609702617514e-06, + "loss": 0.2319, + "step": 31518 + }, + { + "epoch": 2.0274959305480196, + "grad_norm": 3.9508683681488037, + "learning_rate": 5.78347265999726e-06, + "loss": 0.1135, + "step": 31519 + }, + { + "epoch": 2.0275094953879544, + "grad_norm": 5.765097618103027, + "learning_rate": 5.783335617377005e-06, + "loss": 0.2158, + "step": 31520 + }, + { + "epoch": 2.0275230602278893, + "grad_norm": 3.4597744941711426, + "learning_rate": 5.7831985747567495e-06, + "loss": 0.1333, + "step": 31521 + }, + { + "epoch": 2.027536625067824, + "grad_norm": 3.3483142852783203, + "learning_rate": 5.783061532136495e-06, + "loss": 0.0863, + "step": 31522 + }, + { + "epoch": 2.027550189907759, + "grad_norm": 4.6092529296875, + "learning_rate": 5.78292448951624e-06, + "loss": 0.1497, + "step": 31523 + }, + { + "epoch": 2.027563754747694, + "grad_norm": 4.3127241134643555, + "learning_rate": 5.782787446895986e-06, + "loss": 0.2134, + "step": 31524 + }, + { + "epoch": 2.0275773195876288, + "grad_norm": 3.2938618659973145, + "learning_rate": 5.78265040427573e-06, + "loss": 0.1543, + "step": 31525 + }, + { + "epoch": 2.0275908844275636, + "grad_norm": 6.54311466217041, + "learning_rate": 5.782513361655475e-06, + "loss": 0.2469, + "step": 31526 + }, + { + "epoch": 2.0276044492674985, + "grad_norm": 4.193729877471924, + "learning_rate": 5.78237631903522e-06, + "loss": 0.1816, + "step": 31527 + }, + { + "epoch": 2.0276180141074334, + "grad_norm": 4.533850193023682, + "learning_rate": 5.782239276414966e-06, + "loss": 0.2853, + "step": 31528 + }, + { + "epoch": 2.0276315789473682, + "grad_norm": 3.590667963027954, + "learning_rate": 5.782102233794711e-06, + "loss": 0.0715, + "step": 31529 + }, + { + "epoch": 2.027645143787303, + "grad_norm": 4.943844318389893, + "learning_rate": 5.781965191174455e-06, + "loss": 0.1788, + "step": 31530 + }, + { + "epoch": 2.0276587086272384, + "grad_norm": 5.109727382659912, + "learning_rate": 5.7818281485542e-06, + "loss": 0.2151, + "step": 31531 + }, + { + "epoch": 2.0276722734671733, + "grad_norm": 3.9349474906921387, + "learning_rate": 5.7816911059339464e-06, + "loss": 0.0912, + "step": 31532 + }, + { + "epoch": 2.027685838307108, + "grad_norm": 6.5413994789123535, + "learning_rate": 5.781554063313692e-06, + "loss": 0.1946, + "step": 31533 + }, + { + "epoch": 2.027699403147043, + "grad_norm": 5.970999717712402, + "learning_rate": 5.781417020693436e-06, + "loss": 0.1424, + "step": 31534 + }, + { + "epoch": 2.027712967986978, + "grad_norm": 4.963487148284912, + "learning_rate": 5.781279978073181e-06, + "loss": 0.2476, + "step": 31535 + }, + { + "epoch": 2.0277265328269127, + "grad_norm": 4.446384906768799, + "learning_rate": 5.7811429354529255e-06, + "loss": 0.124, + "step": 31536 + }, + { + "epoch": 2.0277400976668476, + "grad_norm": 4.9000139236450195, + "learning_rate": 5.7810058928326715e-06, + "loss": 0.1983, + "step": 31537 + }, + { + "epoch": 2.0277536625067825, + "grad_norm": 4.15995979309082, + "learning_rate": 5.780868850212417e-06, + "loss": 0.2033, + "step": 31538 + }, + { + "epoch": 2.0277672273467173, + "grad_norm": 3.768678665161133, + "learning_rate": 5.780731807592162e-06, + "loss": 0.1086, + "step": 31539 + }, + { + "epoch": 2.027780792186652, + "grad_norm": 3.5017290115356445, + "learning_rate": 5.780594764971906e-06, + "loss": 0.1285, + "step": 31540 + }, + { + "epoch": 2.027794357026587, + "grad_norm": 4.100923538208008, + "learning_rate": 5.780457722351652e-06, + "loss": 0.1608, + "step": 31541 + }, + { + "epoch": 2.027807921866522, + "grad_norm": 3.465271234512329, + "learning_rate": 5.780320679731397e-06, + "loss": 0.0953, + "step": 31542 + }, + { + "epoch": 2.027821486706457, + "grad_norm": 4.295408725738525, + "learning_rate": 5.780183637111142e-06, + "loss": 0.1041, + "step": 31543 + }, + { + "epoch": 2.0278350515463917, + "grad_norm": 5.100844383239746, + "learning_rate": 5.780046594490887e-06, + "loss": 0.2067, + "step": 31544 + }, + { + "epoch": 2.0278486163863265, + "grad_norm": 3.6791629791259766, + "learning_rate": 5.779909551870633e-06, + "loss": 0.1368, + "step": 31545 + }, + { + "epoch": 2.0278621812262614, + "grad_norm": 3.5362355709075928, + "learning_rate": 5.779772509250377e-06, + "loss": 0.1472, + "step": 31546 + }, + { + "epoch": 2.0278757460661962, + "grad_norm": 5.915068626403809, + "learning_rate": 5.7796354666301224e-06, + "loss": 0.1998, + "step": 31547 + }, + { + "epoch": 2.027889310906131, + "grad_norm": 4.742856979370117, + "learning_rate": 5.779498424009868e-06, + "loss": 0.1366, + "step": 31548 + }, + { + "epoch": 2.027902875746066, + "grad_norm": 2.9920384883880615, + "learning_rate": 5.779361381389612e-06, + "loss": 0.0902, + "step": 31549 + }, + { + "epoch": 2.0279164405860013, + "grad_norm": 3.3043434619903564, + "learning_rate": 5.779224338769358e-06, + "loss": 0.0878, + "step": 31550 + }, + { + "epoch": 2.027930005425936, + "grad_norm": 3.2578184604644775, + "learning_rate": 5.779087296149103e-06, + "loss": 0.1294, + "step": 31551 + }, + { + "epoch": 2.027943570265871, + "grad_norm": 4.2963666915893555, + "learning_rate": 5.7789502535288475e-06, + "loss": 0.2333, + "step": 31552 + }, + { + "epoch": 2.027957135105806, + "grad_norm": 5.71350622177124, + "learning_rate": 5.778813210908593e-06, + "loss": 0.1603, + "step": 31553 + }, + { + "epoch": 2.0279706999457408, + "grad_norm": 4.263877868652344, + "learning_rate": 5.778676168288339e-06, + "loss": 0.1437, + "step": 31554 + }, + { + "epoch": 2.0279842647856756, + "grad_norm": 4.189605236053467, + "learning_rate": 5.778539125668083e-06, + "loss": 0.1333, + "step": 31555 + }, + { + "epoch": 2.0279978296256105, + "grad_norm": 3.892713785171509, + "learning_rate": 5.778402083047828e-06, + "loss": 0.1594, + "step": 31556 + }, + { + "epoch": 2.0280113944655453, + "grad_norm": 5.013333320617676, + "learning_rate": 5.778265040427573e-06, + "loss": 0.1715, + "step": 31557 + }, + { + "epoch": 2.02802495930548, + "grad_norm": 3.233823299407959, + "learning_rate": 5.778127997807319e-06, + "loss": 0.1169, + "step": 31558 + }, + { + "epoch": 2.028038524145415, + "grad_norm": 4.096220016479492, + "learning_rate": 5.777990955187064e-06, + "loss": 0.1582, + "step": 31559 + }, + { + "epoch": 2.02805208898535, + "grad_norm": 3.471675395965576, + "learning_rate": 5.777853912566809e-06, + "loss": 0.1249, + "step": 31560 + }, + { + "epoch": 2.028065653825285, + "grad_norm": 3.210261344909668, + "learning_rate": 5.777716869946553e-06, + "loss": 0.1129, + "step": 31561 + }, + { + "epoch": 2.0280792186652197, + "grad_norm": 3.76820707321167, + "learning_rate": 5.7775798273262984e-06, + "loss": 0.1017, + "step": 31562 + }, + { + "epoch": 2.0280927835051545, + "grad_norm": 3.4925918579101562, + "learning_rate": 5.7774427847060445e-06, + "loss": 0.134, + "step": 31563 + }, + { + "epoch": 2.0281063483450894, + "grad_norm": 4.918438911437988, + "learning_rate": 5.777305742085789e-06, + "loss": 0.1403, + "step": 31564 + }, + { + "epoch": 2.0281199131850243, + "grad_norm": 5.912224769592285, + "learning_rate": 5.777168699465534e-06, + "loss": 0.1485, + "step": 31565 + }, + { + "epoch": 2.028133478024959, + "grad_norm": 3.7365736961364746, + "learning_rate": 5.777031656845279e-06, + "loss": 0.0793, + "step": 31566 + }, + { + "epoch": 2.028147042864894, + "grad_norm": 3.1571600437164307, + "learning_rate": 5.776894614225025e-06, + "loss": 0.0957, + "step": 31567 + }, + { + "epoch": 2.028160607704829, + "grad_norm": 3.6271603107452393, + "learning_rate": 5.7767575716047695e-06, + "loss": 0.1359, + "step": 31568 + }, + { + "epoch": 2.028174172544764, + "grad_norm": 3.3102588653564453, + "learning_rate": 5.776620528984515e-06, + "loss": 0.0943, + "step": 31569 + }, + { + "epoch": 2.028187737384699, + "grad_norm": 4.176542282104492, + "learning_rate": 5.776483486364259e-06, + "loss": 0.1177, + "step": 31570 + }, + { + "epoch": 2.028201302224634, + "grad_norm": 3.672832727432251, + "learning_rate": 5.776346443744005e-06, + "loss": 0.1282, + "step": 31571 + }, + { + "epoch": 2.0282148670645688, + "grad_norm": 3.8596863746643066, + "learning_rate": 5.77620940112375e-06, + "loss": 0.1218, + "step": 31572 + }, + { + "epoch": 2.0282284319045036, + "grad_norm": 6.065972805023193, + "learning_rate": 5.776072358503495e-06, + "loss": 0.1864, + "step": 31573 + }, + { + "epoch": 2.0282419967444385, + "grad_norm": 3.0847370624542236, + "learning_rate": 5.77593531588324e-06, + "loss": 0.0774, + "step": 31574 + }, + { + "epoch": 2.0282555615843734, + "grad_norm": 3.3161914348602295, + "learning_rate": 5.775798273262986e-06, + "loss": 0.1121, + "step": 31575 + }, + { + "epoch": 2.0282691264243082, + "grad_norm": 4.695056915283203, + "learning_rate": 5.775661230642731e-06, + "loss": 0.1274, + "step": 31576 + }, + { + "epoch": 2.028282691264243, + "grad_norm": 3.2647390365600586, + "learning_rate": 5.775524188022475e-06, + "loss": 0.1088, + "step": 31577 + }, + { + "epoch": 2.028296256104178, + "grad_norm": 4.033751964569092, + "learning_rate": 5.7753871454022204e-06, + "loss": 0.156, + "step": 31578 + }, + { + "epoch": 2.028309820944113, + "grad_norm": 4.184975624084473, + "learning_rate": 5.775250102781965e-06, + "loss": 0.1201, + "step": 31579 + }, + { + "epoch": 2.0283233857840477, + "grad_norm": 3.7809956073760986, + "learning_rate": 5.775113060161711e-06, + "loss": 0.1027, + "step": 31580 + }, + { + "epoch": 2.0283369506239826, + "grad_norm": 3.6416845321655273, + "learning_rate": 5.774976017541456e-06, + "loss": 0.0878, + "step": 31581 + }, + { + "epoch": 2.0283505154639174, + "grad_norm": 5.693507194519043, + "learning_rate": 5.774838974921201e-06, + "loss": 0.1416, + "step": 31582 + }, + { + "epoch": 2.0283640803038523, + "grad_norm": 3.1713016033172607, + "learning_rate": 5.7747019323009455e-06, + "loss": 0.1584, + "step": 31583 + }, + { + "epoch": 2.028377645143787, + "grad_norm": 3.9515156745910645, + "learning_rate": 5.7745648896806915e-06, + "loss": 0.1353, + "step": 31584 + }, + { + "epoch": 2.028391209983722, + "grad_norm": 5.011046409606934, + "learning_rate": 5.774427847060437e-06, + "loss": 0.1766, + "step": 31585 + }, + { + "epoch": 2.028404774823657, + "grad_norm": 3.0883734226226807, + "learning_rate": 5.774290804440181e-06, + "loss": 0.1569, + "step": 31586 + }, + { + "epoch": 2.0284183396635918, + "grad_norm": 4.5545973777771, + "learning_rate": 5.774153761819926e-06, + "loss": 0.1248, + "step": 31587 + }, + { + "epoch": 2.028431904503527, + "grad_norm": 3.6934425830841064, + "learning_rate": 5.774016719199672e-06, + "loss": 0.1216, + "step": 31588 + }, + { + "epoch": 2.028445469343462, + "grad_norm": 4.27810525894165, + "learning_rate": 5.7738796765794166e-06, + "loss": 0.129, + "step": 31589 + }, + { + "epoch": 2.028459034183397, + "grad_norm": 3.3823840618133545, + "learning_rate": 5.773742633959162e-06, + "loss": 0.1152, + "step": 31590 + }, + { + "epoch": 2.0284725990233317, + "grad_norm": 3.8428118228912354, + "learning_rate": 5.773605591338907e-06, + "loss": 0.1439, + "step": 31591 + }, + { + "epoch": 2.0284861638632665, + "grad_norm": 6.097501754760742, + "learning_rate": 5.773468548718651e-06, + "loss": 0.168, + "step": 31592 + }, + { + "epoch": 2.0284997287032014, + "grad_norm": 5.093976974487305, + "learning_rate": 5.773331506098397e-06, + "loss": 0.2145, + "step": 31593 + }, + { + "epoch": 2.0285132935431363, + "grad_norm": 4.3788862228393555, + "learning_rate": 5.7731944634781425e-06, + "loss": 0.1973, + "step": 31594 + }, + { + "epoch": 2.028526858383071, + "grad_norm": 4.2923665046691895, + "learning_rate": 5.773057420857887e-06, + "loss": 0.2023, + "step": 31595 + }, + { + "epoch": 2.028540423223006, + "grad_norm": 3.531604766845703, + "learning_rate": 5.772920378237632e-06, + "loss": 0.1362, + "step": 31596 + }, + { + "epoch": 2.028553988062941, + "grad_norm": 5.4839186668396, + "learning_rate": 5.772783335617378e-06, + "loss": 0.126, + "step": 31597 + }, + { + "epoch": 2.0285675529028757, + "grad_norm": 3.9846889972686768, + "learning_rate": 5.772646292997123e-06, + "loss": 0.1474, + "step": 31598 + }, + { + "epoch": 2.0285811177428106, + "grad_norm": 2.4321506023406982, + "learning_rate": 5.7725092503768675e-06, + "loss": 0.0804, + "step": 31599 + }, + { + "epoch": 2.0285946825827454, + "grad_norm": 2.9482228755950928, + "learning_rate": 5.772372207756613e-06, + "loss": 0.1394, + "step": 31600 + }, + { + "epoch": 2.0286082474226803, + "grad_norm": 3.8831329345703125, + "learning_rate": 5.772235165136359e-06, + "loss": 0.1243, + "step": 31601 + }, + { + "epoch": 2.028621812262615, + "grad_norm": 4.618125915527344, + "learning_rate": 5.772098122516103e-06, + "loss": 0.1592, + "step": 31602 + }, + { + "epoch": 2.02863537710255, + "grad_norm": 3.1953125, + "learning_rate": 5.771961079895848e-06, + "loss": 0.0888, + "step": 31603 + }, + { + "epoch": 2.028648941942485, + "grad_norm": 4.73395299911499, + "learning_rate": 5.7718240372755925e-06, + "loss": 0.1553, + "step": 31604 + }, + { + "epoch": 2.0286625067824198, + "grad_norm": 4.2447943687438965, + "learning_rate": 5.771686994655338e-06, + "loss": 0.1719, + "step": 31605 + }, + { + "epoch": 2.0286760716223546, + "grad_norm": 4.276226043701172, + "learning_rate": 5.771549952035084e-06, + "loss": 0.1719, + "step": 31606 + }, + { + "epoch": 2.02868963646229, + "grad_norm": 5.753256320953369, + "learning_rate": 5.771412909414829e-06, + "loss": 0.1499, + "step": 31607 + }, + { + "epoch": 2.028703201302225, + "grad_norm": 5.274227142333984, + "learning_rate": 5.771275866794573e-06, + "loss": 0.1665, + "step": 31608 + }, + { + "epoch": 2.0287167661421597, + "grad_norm": 4.4235453605651855, + "learning_rate": 5.7711388241743184e-06, + "loss": 0.1987, + "step": 31609 + }, + { + "epoch": 2.0287303309820945, + "grad_norm": 4.903316497802734, + "learning_rate": 5.7710017815540645e-06, + "loss": 0.2157, + "step": 31610 + }, + { + "epoch": 2.0287438958220294, + "grad_norm": 3.6951403617858887, + "learning_rate": 5.770864738933809e-06, + "loss": 0.1083, + "step": 31611 + }, + { + "epoch": 2.0287574606619643, + "grad_norm": 5.405227184295654, + "learning_rate": 5.770727696313554e-06, + "loss": 0.2031, + "step": 31612 + }, + { + "epoch": 2.028771025501899, + "grad_norm": 4.3129425048828125, + "learning_rate": 5.770590653693299e-06, + "loss": 0.1915, + "step": 31613 + }, + { + "epoch": 2.028784590341834, + "grad_norm": 4.11384916305542, + "learning_rate": 5.770453611073044e-06, + "loss": 0.1816, + "step": 31614 + }, + { + "epoch": 2.028798155181769, + "grad_norm": 5.935000419616699, + "learning_rate": 5.7703165684527895e-06, + "loss": 0.1911, + "step": 31615 + }, + { + "epoch": 2.0288117200217037, + "grad_norm": 4.091300010681152, + "learning_rate": 5.770179525832535e-06, + "loss": 0.1908, + "step": 31616 + }, + { + "epoch": 2.0288252848616386, + "grad_norm": 4.617484092712402, + "learning_rate": 5.770042483212279e-06, + "loss": 0.1326, + "step": 31617 + }, + { + "epoch": 2.0288388497015735, + "grad_norm": 4.396921157836914, + "learning_rate": 5.769905440592024e-06, + "loss": 0.1918, + "step": 31618 + }, + { + "epoch": 2.0288524145415083, + "grad_norm": 3.9553496837615967, + "learning_rate": 5.76976839797177e-06, + "loss": 0.1871, + "step": 31619 + }, + { + "epoch": 2.028865979381443, + "grad_norm": 6.833406448364258, + "learning_rate": 5.7696313553515146e-06, + "loss": 0.2233, + "step": 31620 + }, + { + "epoch": 2.028879544221378, + "grad_norm": 3.1913788318634033, + "learning_rate": 5.76949431273126e-06, + "loss": 0.1054, + "step": 31621 + }, + { + "epoch": 2.028893109061313, + "grad_norm": 4.412611484527588, + "learning_rate": 5.769357270111005e-06, + "loss": 0.1632, + "step": 31622 + }, + { + "epoch": 2.028906673901248, + "grad_norm": 3.3622586727142334, + "learning_rate": 5.76922022749075e-06, + "loss": 0.1251, + "step": 31623 + }, + { + "epoch": 2.0289202387411827, + "grad_norm": 5.2361578941345215, + "learning_rate": 5.769083184870495e-06, + "loss": 0.1908, + "step": 31624 + }, + { + "epoch": 2.0289338035811175, + "grad_norm": 4.0283379554748535, + "learning_rate": 5.7689461422502405e-06, + "loss": 0.1537, + "step": 31625 + }, + { + "epoch": 2.028947368421053, + "grad_norm": 4.911263942718506, + "learning_rate": 5.768809099629985e-06, + "loss": 0.2046, + "step": 31626 + }, + { + "epoch": 2.0289609332609877, + "grad_norm": 4.010214328765869, + "learning_rate": 5.768672057009731e-06, + "loss": 0.2262, + "step": 31627 + }, + { + "epoch": 2.0289744981009226, + "grad_norm": 4.609050750732422, + "learning_rate": 5.768535014389476e-06, + "loss": 0.1169, + "step": 31628 + }, + { + "epoch": 2.0289880629408574, + "grad_norm": 4.0335612297058105, + "learning_rate": 5.76839797176922e-06, + "loss": 0.1105, + "step": 31629 + }, + { + "epoch": 2.0290016277807923, + "grad_norm": 4.093088626861572, + "learning_rate": 5.7682609291489655e-06, + "loss": 0.1803, + "step": 31630 + }, + { + "epoch": 2.029015192620727, + "grad_norm": 5.299415111541748, + "learning_rate": 5.768123886528711e-06, + "loss": 0.2215, + "step": 31631 + }, + { + "epoch": 2.029028757460662, + "grad_norm": 4.905751705169678, + "learning_rate": 5.767986843908457e-06, + "loss": 0.2184, + "step": 31632 + }, + { + "epoch": 2.029042322300597, + "grad_norm": 6.055450439453125, + "learning_rate": 5.767849801288201e-06, + "loss": 0.2079, + "step": 31633 + }, + { + "epoch": 2.0290558871405318, + "grad_norm": 4.227892875671387, + "learning_rate": 5.767712758667946e-06, + "loss": 0.1397, + "step": 31634 + }, + { + "epoch": 2.0290694519804666, + "grad_norm": 5.265262126922607, + "learning_rate": 5.7675757160476906e-06, + "loss": 0.2441, + "step": 31635 + }, + { + "epoch": 2.0290830168204015, + "grad_norm": 3.960721254348755, + "learning_rate": 5.767438673427437e-06, + "loss": 0.1879, + "step": 31636 + }, + { + "epoch": 2.0290965816603364, + "grad_norm": 4.204654693603516, + "learning_rate": 5.767301630807182e-06, + "loss": 0.1404, + "step": 31637 + }, + { + "epoch": 2.029110146500271, + "grad_norm": 6.7209696769714355, + "learning_rate": 5.767164588186926e-06, + "loss": 0.2853, + "step": 31638 + }, + { + "epoch": 2.029123711340206, + "grad_norm": 4.161157131195068, + "learning_rate": 5.767027545566671e-06, + "loss": 0.1584, + "step": 31639 + }, + { + "epoch": 2.029137276180141, + "grad_norm": 4.415778636932373, + "learning_rate": 5.766890502946417e-06, + "loss": 0.2228, + "step": 31640 + }, + { + "epoch": 2.029150841020076, + "grad_norm": 4.665091037750244, + "learning_rate": 5.7667534603261625e-06, + "loss": 0.1465, + "step": 31641 + }, + { + "epoch": 2.0291644058600107, + "grad_norm": 3.1061344146728516, + "learning_rate": 5.766616417705907e-06, + "loss": 0.0961, + "step": 31642 + }, + { + "epoch": 2.0291779706999455, + "grad_norm": 5.3483567237854, + "learning_rate": 5.766479375085652e-06, + "loss": 0.2343, + "step": 31643 + }, + { + "epoch": 2.029191535539881, + "grad_norm": 7.088043212890625, + "learning_rate": 5.766342332465398e-06, + "loss": 0.2567, + "step": 31644 + }, + { + "epoch": 2.0292051003798157, + "grad_norm": 3.456261396408081, + "learning_rate": 5.766205289845142e-06, + "loss": 0.1806, + "step": 31645 + }, + { + "epoch": 2.0292186652197506, + "grad_norm": 4.330183506011963, + "learning_rate": 5.7660682472248875e-06, + "loss": 0.1539, + "step": 31646 + }, + { + "epoch": 2.0292322300596854, + "grad_norm": 3.931474208831787, + "learning_rate": 5.765931204604633e-06, + "loss": 0.1673, + "step": 31647 + }, + { + "epoch": 2.0292457948996203, + "grad_norm": 3.8597259521484375, + "learning_rate": 5.765794161984377e-06, + "loss": 0.1251, + "step": 31648 + }, + { + "epoch": 2.029259359739555, + "grad_norm": 6.492249965667725, + "learning_rate": 5.765657119364123e-06, + "loss": 0.2582, + "step": 31649 + }, + { + "epoch": 2.02927292457949, + "grad_norm": 4.203916549682617, + "learning_rate": 5.765520076743868e-06, + "loss": 0.1469, + "step": 31650 + }, + { + "epoch": 2.029286489419425, + "grad_norm": 5.169037342071533, + "learning_rate": 5.7653830341236126e-06, + "loss": 0.1812, + "step": 31651 + }, + { + "epoch": 2.0293000542593598, + "grad_norm": 4.565700531005859, + "learning_rate": 5.765245991503358e-06, + "loss": 0.1859, + "step": 31652 + }, + { + "epoch": 2.0293136190992946, + "grad_norm": 6.831933975219727, + "learning_rate": 5.765108948883104e-06, + "loss": 0.2645, + "step": 31653 + }, + { + "epoch": 2.0293271839392295, + "grad_norm": 4.471279621124268, + "learning_rate": 5.764971906262848e-06, + "loss": 0.1432, + "step": 31654 + }, + { + "epoch": 2.0293407487791644, + "grad_norm": 3.6399996280670166, + "learning_rate": 5.764834863642593e-06, + "loss": 0.1172, + "step": 31655 + }, + { + "epoch": 2.0293543136190992, + "grad_norm": 5.534780502319336, + "learning_rate": 5.7646978210223385e-06, + "loss": 0.2793, + "step": 31656 + }, + { + "epoch": 2.029367878459034, + "grad_norm": 5.18507719039917, + "learning_rate": 5.764560778402084e-06, + "loss": 0.1557, + "step": 31657 + }, + { + "epoch": 2.029381443298969, + "grad_norm": 4.583239555358887, + "learning_rate": 5.764423735781829e-06, + "loss": 0.1876, + "step": 31658 + }, + { + "epoch": 2.029395008138904, + "grad_norm": 4.894259929656982, + "learning_rate": 5.764286693161574e-06, + "loss": 0.1471, + "step": 31659 + }, + { + "epoch": 2.0294085729788387, + "grad_norm": 3.72617506980896, + "learning_rate": 5.764149650541318e-06, + "loss": 0.1506, + "step": 31660 + }, + { + "epoch": 2.0294221378187736, + "grad_norm": 3.1133670806884766, + "learning_rate": 5.7640126079210635e-06, + "loss": 0.1017, + "step": 31661 + }, + { + "epoch": 2.0294357026587084, + "grad_norm": 4.244518280029297, + "learning_rate": 5.7638755653008095e-06, + "loss": 0.152, + "step": 31662 + }, + { + "epoch": 2.0294492674986433, + "grad_norm": 5.556978225708008, + "learning_rate": 5.763738522680554e-06, + "loss": 0.1051, + "step": 31663 + }, + { + "epoch": 2.0294628323385786, + "grad_norm": 4.954058647155762, + "learning_rate": 5.763601480060299e-06, + "loss": 0.1397, + "step": 31664 + }, + { + "epoch": 2.0294763971785135, + "grad_norm": 2.6729178428649902, + "learning_rate": 5.763464437440044e-06, + "loss": 0.0913, + "step": 31665 + }, + { + "epoch": 2.0294899620184483, + "grad_norm": 3.388908624649048, + "learning_rate": 5.76332739481979e-06, + "loss": 0.1604, + "step": 31666 + }, + { + "epoch": 2.029503526858383, + "grad_norm": 3.530059814453125, + "learning_rate": 5.763190352199535e-06, + "loss": 0.1181, + "step": 31667 + }, + { + "epoch": 2.029517091698318, + "grad_norm": 4.105801582336426, + "learning_rate": 5.76305330957928e-06, + "loss": 0.1338, + "step": 31668 + }, + { + "epoch": 2.029530656538253, + "grad_norm": 3.7837798595428467, + "learning_rate": 5.762916266959024e-06, + "loss": 0.1311, + "step": 31669 + }, + { + "epoch": 2.029544221378188, + "grad_norm": 5.693995475769043, + "learning_rate": 5.76277922433877e-06, + "loss": 0.1932, + "step": 31670 + }, + { + "epoch": 2.0295577862181227, + "grad_norm": 4.933228015899658, + "learning_rate": 5.762642181718515e-06, + "loss": 0.1457, + "step": 31671 + }, + { + "epoch": 2.0295713510580575, + "grad_norm": 3.5007824897766113, + "learning_rate": 5.76250513909826e-06, + "loss": 0.1585, + "step": 31672 + }, + { + "epoch": 2.0295849158979924, + "grad_norm": 4.362252235412598, + "learning_rate": 5.762368096478005e-06, + "loss": 0.1433, + "step": 31673 + }, + { + "epoch": 2.0295984807379273, + "grad_norm": 3.512108564376831, + "learning_rate": 5.76223105385775e-06, + "loss": 0.2109, + "step": 31674 + }, + { + "epoch": 2.029612045577862, + "grad_norm": 4.25073766708374, + "learning_rate": 5.762094011237496e-06, + "loss": 0.1247, + "step": 31675 + }, + { + "epoch": 2.029625610417797, + "grad_norm": 4.6630754470825195, + "learning_rate": 5.76195696861724e-06, + "loss": 0.1991, + "step": 31676 + }, + { + "epoch": 2.029639175257732, + "grad_norm": 4.492167949676514, + "learning_rate": 5.7618199259969855e-06, + "loss": 0.1452, + "step": 31677 + }, + { + "epoch": 2.0296527400976667, + "grad_norm": 3.7627248764038086, + "learning_rate": 5.76168288337673e-06, + "loss": 0.1242, + "step": 31678 + }, + { + "epoch": 2.0296663049376016, + "grad_norm": 5.1095805168151855, + "learning_rate": 5.761545840756476e-06, + "loss": 0.166, + "step": 31679 + }, + { + "epoch": 2.0296798697775364, + "grad_norm": 2.7446537017822266, + "learning_rate": 5.761408798136221e-06, + "loss": 0.1061, + "step": 31680 + }, + { + "epoch": 2.0296934346174713, + "grad_norm": 6.108964920043945, + "learning_rate": 5.761271755515966e-06, + "loss": 0.1394, + "step": 31681 + }, + { + "epoch": 2.0297069994574066, + "grad_norm": 3.7653727531433105, + "learning_rate": 5.7611347128957106e-06, + "loss": 0.1344, + "step": 31682 + }, + { + "epoch": 2.0297205642973415, + "grad_norm": 5.399042129516602, + "learning_rate": 5.760997670275457e-06, + "loss": 0.2321, + "step": 31683 + }, + { + "epoch": 2.0297341291372764, + "grad_norm": 3.7408077716827393, + "learning_rate": 5.760860627655202e-06, + "loss": 0.1448, + "step": 31684 + }, + { + "epoch": 2.029747693977211, + "grad_norm": 4.911413669586182, + "learning_rate": 5.760723585034946e-06, + "loss": 0.2061, + "step": 31685 + }, + { + "epoch": 2.029761258817146, + "grad_norm": 5.674678325653076, + "learning_rate": 5.760586542414691e-06, + "loss": 0.2029, + "step": 31686 + }, + { + "epoch": 2.029774823657081, + "grad_norm": 4.2939605712890625, + "learning_rate": 5.760449499794436e-06, + "loss": 0.1976, + "step": 31687 + }, + { + "epoch": 2.029788388497016, + "grad_norm": 3.997326135635376, + "learning_rate": 5.760312457174182e-06, + "loss": 0.1635, + "step": 31688 + }, + { + "epoch": 2.0298019533369507, + "grad_norm": 3.747154951095581, + "learning_rate": 5.760175414553927e-06, + "loss": 0.0888, + "step": 31689 + }, + { + "epoch": 2.0298155181768855, + "grad_norm": 4.031787872314453, + "learning_rate": 5.760038371933672e-06, + "loss": 0.1609, + "step": 31690 + }, + { + "epoch": 2.0298290830168204, + "grad_norm": 4.822635650634766, + "learning_rate": 5.759901329313416e-06, + "loss": 0.1735, + "step": 31691 + }, + { + "epoch": 2.0298426478567553, + "grad_norm": 4.385833263397217, + "learning_rate": 5.759764286693162e-06, + "loss": 0.2159, + "step": 31692 + }, + { + "epoch": 2.02985621269669, + "grad_norm": 4.228362083435059, + "learning_rate": 5.7596272440729075e-06, + "loss": 0.1404, + "step": 31693 + }, + { + "epoch": 2.029869777536625, + "grad_norm": 5.055871486663818, + "learning_rate": 5.759490201452652e-06, + "loss": 0.224, + "step": 31694 + }, + { + "epoch": 2.02988334237656, + "grad_norm": 4.217179298400879, + "learning_rate": 5.759353158832397e-06, + "loss": 0.1495, + "step": 31695 + }, + { + "epoch": 2.0298969072164947, + "grad_norm": 3.150271415710449, + "learning_rate": 5.759216116212143e-06, + "loss": 0.0944, + "step": 31696 + }, + { + "epoch": 2.0299104720564296, + "grad_norm": 4.6272053718566895, + "learning_rate": 5.759079073591887e-06, + "loss": 0.269, + "step": 31697 + }, + { + "epoch": 2.0299240368963645, + "grad_norm": 3.865476608276367, + "learning_rate": 5.758942030971633e-06, + "loss": 0.1353, + "step": 31698 + }, + { + "epoch": 2.0299376017362993, + "grad_norm": 4.3640336990356445, + "learning_rate": 5.758804988351378e-06, + "loss": 0.2636, + "step": 31699 + }, + { + "epoch": 2.029951166576234, + "grad_norm": 4.43472957611084, + "learning_rate": 5.758667945731122e-06, + "loss": 0.171, + "step": 31700 + }, + { + "epoch": 2.029964731416169, + "grad_norm": 4.377333641052246, + "learning_rate": 5.758530903110868e-06, + "loss": 0.1935, + "step": 31701 + }, + { + "epoch": 2.0299782962561044, + "grad_norm": 3.5576391220092773, + "learning_rate": 5.758393860490613e-06, + "loss": 0.1751, + "step": 31702 + }, + { + "epoch": 2.0299918610960392, + "grad_norm": 3.8369109630584717, + "learning_rate": 5.758256817870358e-06, + "loss": 0.1346, + "step": 31703 + }, + { + "epoch": 2.030005425935974, + "grad_norm": 4.026220321655273, + "learning_rate": 5.758119775250103e-06, + "loss": 0.1455, + "step": 31704 + }, + { + "epoch": 2.030018990775909, + "grad_norm": 3.1903440952301025, + "learning_rate": 5.757982732629849e-06, + "loss": 0.1396, + "step": 31705 + }, + { + "epoch": 2.030032555615844, + "grad_norm": 5.935842990875244, + "learning_rate": 5.757845690009593e-06, + "loss": 0.2509, + "step": 31706 + }, + { + "epoch": 2.0300461204557787, + "grad_norm": 5.3327226638793945, + "learning_rate": 5.757708647389338e-06, + "loss": 0.1979, + "step": 31707 + }, + { + "epoch": 2.0300596852957136, + "grad_norm": 5.370906829833984, + "learning_rate": 5.7575716047690835e-06, + "loss": 0.1908, + "step": 31708 + }, + { + "epoch": 2.0300732501356484, + "grad_norm": 4.578307628631592, + "learning_rate": 5.7574345621488296e-06, + "loss": 0.1785, + "step": 31709 + }, + { + "epoch": 2.0300868149755833, + "grad_norm": 7.167757511138916, + "learning_rate": 5.757297519528574e-06, + "loss": 0.1813, + "step": 31710 + }, + { + "epoch": 2.030100379815518, + "grad_norm": 4.04452657699585, + "learning_rate": 5.757160476908319e-06, + "loss": 0.1386, + "step": 31711 + }, + { + "epoch": 2.030113944655453, + "grad_norm": 5.841156482696533, + "learning_rate": 5.757023434288063e-06, + "loss": 0.2725, + "step": 31712 + }, + { + "epoch": 2.030127509495388, + "grad_norm": 3.342460870742798, + "learning_rate": 5.756886391667809e-06, + "loss": 0.107, + "step": 31713 + }, + { + "epoch": 2.0301410743353228, + "grad_norm": 6.731874465942383, + "learning_rate": 5.756749349047555e-06, + "loss": 0.2623, + "step": 31714 + }, + { + "epoch": 2.0301546391752576, + "grad_norm": 4.349486827850342, + "learning_rate": 5.7566123064273e-06, + "loss": 0.182, + "step": 31715 + }, + { + "epoch": 2.0301682040151925, + "grad_norm": 3.254124402999878, + "learning_rate": 5.756475263807044e-06, + "loss": 0.1313, + "step": 31716 + }, + { + "epoch": 2.0301817688551274, + "grad_norm": 3.762855052947998, + "learning_rate": 5.756338221186789e-06, + "loss": 0.1554, + "step": 31717 + }, + { + "epoch": 2.030195333695062, + "grad_norm": 4.237764835357666, + "learning_rate": 5.756201178566535e-06, + "loss": 0.175, + "step": 31718 + }, + { + "epoch": 2.030208898534997, + "grad_norm": 4.681094169616699, + "learning_rate": 5.75606413594628e-06, + "loss": 0.1745, + "step": 31719 + }, + { + "epoch": 2.0302224633749324, + "grad_norm": 5.195992946624756, + "learning_rate": 5.755927093326025e-06, + "loss": 0.1438, + "step": 31720 + }, + { + "epoch": 2.0302360282148673, + "grad_norm": 7.150989532470703, + "learning_rate": 5.755790050705769e-06, + "loss": 0.2377, + "step": 31721 + }, + { + "epoch": 2.030249593054802, + "grad_norm": 5.5333428382873535, + "learning_rate": 5.755653008085515e-06, + "loss": 0.2454, + "step": 31722 + }, + { + "epoch": 2.030263157894737, + "grad_norm": 4.205320358276367, + "learning_rate": 5.75551596546526e-06, + "loss": 0.1296, + "step": 31723 + }, + { + "epoch": 2.030276722734672, + "grad_norm": 3.647887945175171, + "learning_rate": 5.7553789228450055e-06, + "loss": 0.1286, + "step": 31724 + }, + { + "epoch": 2.0302902875746067, + "grad_norm": 3.7088730335235596, + "learning_rate": 5.75524188022475e-06, + "loss": 0.1264, + "step": 31725 + }, + { + "epoch": 2.0303038524145416, + "grad_norm": 4.293959140777588, + "learning_rate": 5.755104837604496e-06, + "loss": 0.1563, + "step": 31726 + }, + { + "epoch": 2.0303174172544765, + "grad_norm": 4.16005277633667, + "learning_rate": 5.754967794984241e-06, + "loss": 0.0932, + "step": 31727 + }, + { + "epoch": 2.0303309820944113, + "grad_norm": 6.74511194229126, + "learning_rate": 5.754830752363985e-06, + "loss": 0.2447, + "step": 31728 + }, + { + "epoch": 2.030344546934346, + "grad_norm": 4.4063944816589355, + "learning_rate": 5.754693709743731e-06, + "loss": 0.0934, + "step": 31729 + }, + { + "epoch": 2.030358111774281, + "grad_norm": 3.9594202041625977, + "learning_rate": 5.754556667123476e-06, + "loss": 0.1417, + "step": 31730 + }, + { + "epoch": 2.030371676614216, + "grad_norm": 4.983125686645508, + "learning_rate": 5.754419624503221e-06, + "loss": 0.1828, + "step": 31731 + }, + { + "epoch": 2.030385241454151, + "grad_norm": 3.8817334175109863, + "learning_rate": 5.754282581882966e-06, + "loss": 0.1955, + "step": 31732 + }, + { + "epoch": 2.0303988062940856, + "grad_norm": 4.391325950622559, + "learning_rate": 5.754145539262711e-06, + "loss": 0.2115, + "step": 31733 + }, + { + "epoch": 2.0304123711340205, + "grad_norm": 4.628381252288818, + "learning_rate": 5.754008496642456e-06, + "loss": 0.1499, + "step": 31734 + }, + { + "epoch": 2.0304259359739554, + "grad_norm": 4.82995080947876, + "learning_rate": 5.753871454022202e-06, + "loss": 0.1999, + "step": 31735 + }, + { + "epoch": 2.0304395008138902, + "grad_norm": 5.117644786834717, + "learning_rate": 5.753734411401947e-06, + "loss": 0.2339, + "step": 31736 + }, + { + "epoch": 2.030453065653825, + "grad_norm": 4.197762489318848, + "learning_rate": 5.753597368781691e-06, + "loss": 0.1163, + "step": 31737 + }, + { + "epoch": 2.03046663049376, + "grad_norm": 3.8786516189575195, + "learning_rate": 5.753460326161436e-06, + "loss": 0.1302, + "step": 31738 + }, + { + "epoch": 2.0304801953336953, + "grad_norm": 5.957799434661865, + "learning_rate": 5.753323283541182e-06, + "loss": 0.2249, + "step": 31739 + }, + { + "epoch": 2.03049376017363, + "grad_norm": 6.096660614013672, + "learning_rate": 5.7531862409209276e-06, + "loss": 0.2389, + "step": 31740 + }, + { + "epoch": 2.030507325013565, + "grad_norm": 4.36637020111084, + "learning_rate": 5.753049198300672e-06, + "loss": 0.1663, + "step": 31741 + }, + { + "epoch": 2.0305208898535, + "grad_norm": 5.353189468383789, + "learning_rate": 5.752912155680417e-06, + "loss": 0.2003, + "step": 31742 + }, + { + "epoch": 2.0305344546934347, + "grad_norm": 4.679619312286377, + "learning_rate": 5.752775113060161e-06, + "loss": 0.2322, + "step": 31743 + }, + { + "epoch": 2.0305480195333696, + "grad_norm": 4.668236255645752, + "learning_rate": 5.7526380704399074e-06, + "loss": 0.1543, + "step": 31744 + }, + { + "epoch": 2.0305615843733045, + "grad_norm": 6.432284355163574, + "learning_rate": 5.752501027819653e-06, + "loss": 0.2153, + "step": 31745 + }, + { + "epoch": 2.0305751492132393, + "grad_norm": 5.061105728149414, + "learning_rate": 5.752363985199397e-06, + "loss": 0.2436, + "step": 31746 + }, + { + "epoch": 2.030588714053174, + "grad_norm": 5.845552444458008, + "learning_rate": 5.752226942579142e-06, + "loss": 0.2202, + "step": 31747 + }, + { + "epoch": 2.030602278893109, + "grad_norm": 5.975447177886963, + "learning_rate": 5.752089899958888e-06, + "loss": 0.2359, + "step": 31748 + }, + { + "epoch": 2.030615843733044, + "grad_norm": 5.243342399597168, + "learning_rate": 5.751952857338633e-06, + "loss": 0.2878, + "step": 31749 + }, + { + "epoch": 2.030629408572979, + "grad_norm": 5.857909202575684, + "learning_rate": 5.751815814718378e-06, + "loss": 0.2609, + "step": 31750 + }, + { + "epoch": 2.0306429734129137, + "grad_norm": 4.948077201843262, + "learning_rate": 5.751678772098123e-06, + "loss": 0.1674, + "step": 31751 + }, + { + "epoch": 2.0306565382528485, + "grad_norm": 4.354673385620117, + "learning_rate": 5.751541729477869e-06, + "loss": 0.207, + "step": 31752 + }, + { + "epoch": 2.0306701030927834, + "grad_norm": 4.309595108032227, + "learning_rate": 5.751404686857613e-06, + "loss": 0.1547, + "step": 31753 + }, + { + "epoch": 2.0306836679327183, + "grad_norm": 4.185052394866943, + "learning_rate": 5.751267644237358e-06, + "loss": 0.2005, + "step": 31754 + }, + { + "epoch": 2.030697232772653, + "grad_norm": 6.645596981048584, + "learning_rate": 5.7511306016171035e-06, + "loss": 0.3359, + "step": 31755 + }, + { + "epoch": 2.030710797612588, + "grad_norm": 7.028080463409424, + "learning_rate": 5.750993558996848e-06, + "loss": 0.2967, + "step": 31756 + }, + { + "epoch": 2.030724362452523, + "grad_norm": 5.426482677459717, + "learning_rate": 5.750856516376594e-06, + "loss": 0.2587, + "step": 31757 + }, + { + "epoch": 2.030737927292458, + "grad_norm": 5.024044513702393, + "learning_rate": 5.750719473756339e-06, + "loss": 0.2546, + "step": 31758 + }, + { + "epoch": 2.030751492132393, + "grad_norm": 6.957773685455322, + "learning_rate": 5.750582431136083e-06, + "loss": 0.2488, + "step": 31759 + }, + { + "epoch": 2.030765056972328, + "grad_norm": 4.955602645874023, + "learning_rate": 5.750445388515829e-06, + "loss": 0.2091, + "step": 31760 + }, + { + "epoch": 2.0307786218122628, + "grad_norm": 5.395803928375244, + "learning_rate": 5.750308345895575e-06, + "loss": 0.2199, + "step": 31761 + }, + { + "epoch": 2.0307921866521976, + "grad_norm": 6.6037211418151855, + "learning_rate": 5.750171303275319e-06, + "loss": 0.2572, + "step": 31762 + }, + { + "epoch": 2.0308057514921325, + "grad_norm": 5.674611568450928, + "learning_rate": 5.750034260655064e-06, + "loss": 0.2828, + "step": 31763 + }, + { + "epoch": 2.0308193163320674, + "grad_norm": 4.405578136444092, + "learning_rate": 5.749897218034809e-06, + "loss": 0.2305, + "step": 31764 + }, + { + "epoch": 2.0308328811720022, + "grad_norm": 6.045233726501465, + "learning_rate": 5.7497601754145545e-06, + "loss": 0.2495, + "step": 31765 + }, + { + "epoch": 2.030846446011937, + "grad_norm": 6.673331260681152, + "learning_rate": 5.7496231327943e-06, + "loss": 0.2792, + "step": 31766 + }, + { + "epoch": 2.030860010851872, + "grad_norm": 5.3525710105896, + "learning_rate": 5.749486090174045e-06, + "loss": 0.2842, + "step": 31767 + }, + { + "epoch": 2.030873575691807, + "grad_norm": 4.462241172790527, + "learning_rate": 5.749349047553789e-06, + "loss": 0.1404, + "step": 31768 + }, + { + "epoch": 2.0308871405317417, + "grad_norm": 7.291294097900391, + "learning_rate": 5.749212004933534e-06, + "loss": 0.3921, + "step": 31769 + }, + { + "epoch": 2.0309007053716766, + "grad_norm": 7.407866954803467, + "learning_rate": 5.74907496231328e-06, + "loss": 0.3071, + "step": 31770 + }, + { + "epoch": 2.0309142702116114, + "grad_norm": 3.9263858795166016, + "learning_rate": 5.748937919693025e-06, + "loss": 0.1545, + "step": 31771 + }, + { + "epoch": 2.0309278350515463, + "grad_norm": 6.015365123748779, + "learning_rate": 5.74880087707277e-06, + "loss": 0.3072, + "step": 31772 + }, + { + "epoch": 2.030941399891481, + "grad_norm": 7.163460731506348, + "learning_rate": 5.748663834452515e-06, + "loss": 0.3142, + "step": 31773 + }, + { + "epoch": 2.030954964731416, + "grad_norm": 3.995657205581665, + "learning_rate": 5.748526791832261e-06, + "loss": 0.2276, + "step": 31774 + }, + { + "epoch": 2.030968529571351, + "grad_norm": 6.34889030456543, + "learning_rate": 5.7483897492120054e-06, + "loss": 0.2852, + "step": 31775 + }, + { + "epoch": 2.0309820944112857, + "grad_norm": 4.780775547027588, + "learning_rate": 5.748252706591751e-06, + "loss": 0.1948, + "step": 31776 + }, + { + "epoch": 2.030995659251221, + "grad_norm": 5.687309741973877, + "learning_rate": 5.748115663971495e-06, + "loss": 0.362, + "step": 31777 + }, + { + "epoch": 2.031009224091156, + "grad_norm": 3.806891679763794, + "learning_rate": 5.747978621351241e-06, + "loss": 0.226, + "step": 31778 + }, + { + "epoch": 2.031022788931091, + "grad_norm": 4.98725700378418, + "learning_rate": 5.747841578730986e-06, + "loss": 0.1665, + "step": 31779 + }, + { + "epoch": 2.0310363537710256, + "grad_norm": 4.566564083099365, + "learning_rate": 5.7477045361107305e-06, + "loss": 0.252, + "step": 31780 + }, + { + "epoch": 2.0310499186109605, + "grad_norm": 5.0261430740356445, + "learning_rate": 5.747567493490476e-06, + "loss": 0.2555, + "step": 31781 + }, + { + "epoch": 2.0310634834508954, + "grad_norm": 5.310913562774658, + "learning_rate": 5.747430450870221e-06, + "loss": 0.2215, + "step": 31782 + }, + { + "epoch": 2.0310770482908302, + "grad_norm": 4.55078649520874, + "learning_rate": 5.747293408249967e-06, + "loss": 0.2201, + "step": 31783 + }, + { + "epoch": 2.031090613130765, + "grad_norm": 4.5765461921691895, + "learning_rate": 5.747156365629711e-06, + "loss": 0.2772, + "step": 31784 + }, + { + "epoch": 2.0311041779707, + "grad_norm": 8.094525337219238, + "learning_rate": 5.747019323009456e-06, + "loss": 0.2725, + "step": 31785 + }, + { + "epoch": 2.031117742810635, + "grad_norm": 6.657487392425537, + "learning_rate": 5.746882280389201e-06, + "loss": 0.3395, + "step": 31786 + }, + { + "epoch": 2.0311313076505697, + "grad_norm": 4.363317966461182, + "learning_rate": 5.746745237768947e-06, + "loss": 0.1451, + "step": 31787 + }, + { + "epoch": 2.0311448724905046, + "grad_norm": 4.796170711517334, + "learning_rate": 5.746608195148692e-06, + "loss": 0.1979, + "step": 31788 + }, + { + "epoch": 2.0311584373304394, + "grad_norm": 4.316288471221924, + "learning_rate": 5.746471152528437e-06, + "loss": 0.2289, + "step": 31789 + }, + { + "epoch": 2.0311720021703743, + "grad_norm": 5.997834205627441, + "learning_rate": 5.746334109908181e-06, + "loss": 0.1941, + "step": 31790 + }, + { + "epoch": 2.031185567010309, + "grad_norm": 4.065699100494385, + "learning_rate": 5.7461970672879274e-06, + "loss": 0.1616, + "step": 31791 + }, + { + "epoch": 2.031199131850244, + "grad_norm": 8.296380996704102, + "learning_rate": 5.746060024667673e-06, + "loss": 0.3548, + "step": 31792 + }, + { + "epoch": 2.031212696690179, + "grad_norm": 4.289844512939453, + "learning_rate": 5.745922982047417e-06, + "loss": 0.1451, + "step": 31793 + }, + { + "epoch": 2.0312262615301138, + "grad_norm": 4.3921074867248535, + "learning_rate": 5.745785939427162e-06, + "loss": 0.2167, + "step": 31794 + }, + { + "epoch": 2.0312398263700486, + "grad_norm": 5.5453314781188965, + "learning_rate": 5.745648896806908e-06, + "loss": 0.2008, + "step": 31795 + }, + { + "epoch": 2.031253391209984, + "grad_norm": 4.593565940856934, + "learning_rate": 5.7455118541866525e-06, + "loss": 0.2206, + "step": 31796 + }, + { + "epoch": 2.031266956049919, + "grad_norm": 5.443597793579102, + "learning_rate": 5.745374811566398e-06, + "loss": 0.2337, + "step": 31797 + }, + { + "epoch": 2.0312805208898537, + "grad_norm": 5.379974842071533, + "learning_rate": 5.745237768946143e-06, + "loss": 0.2621, + "step": 31798 + }, + { + "epoch": 2.0312940857297885, + "grad_norm": 6.361632347106934, + "learning_rate": 5.745100726325887e-06, + "loss": 0.2296, + "step": 31799 + }, + { + "epoch": 2.0313076505697234, + "grad_norm": 4.252471923828125, + "learning_rate": 5.744963683705633e-06, + "loss": 0.1859, + "step": 31800 + }, + { + "epoch": 2.0313212154096583, + "grad_norm": 3.9429144859313965, + "learning_rate": 5.744826641085378e-06, + "loss": 0.1605, + "step": 31801 + }, + { + "epoch": 2.031334780249593, + "grad_norm": 6.310182094573975, + "learning_rate": 5.744689598465123e-06, + "loss": 0.2193, + "step": 31802 + }, + { + "epoch": 2.031348345089528, + "grad_norm": 3.1201894283294678, + "learning_rate": 5.744552555844868e-06, + "loss": 0.1179, + "step": 31803 + }, + { + "epoch": 2.031361909929463, + "grad_norm": 5.719117164611816, + "learning_rate": 5.744415513224614e-06, + "loss": 0.1729, + "step": 31804 + }, + { + "epoch": 2.0313754747693977, + "grad_norm": 4.441357612609863, + "learning_rate": 5.744278470604358e-06, + "loss": 0.2188, + "step": 31805 + }, + { + "epoch": 2.0313890396093326, + "grad_norm": 5.0582733154296875, + "learning_rate": 5.7441414279841034e-06, + "loss": 0.2282, + "step": 31806 + }, + { + "epoch": 2.0314026044492675, + "grad_norm": 5.995356559753418, + "learning_rate": 5.744004385363849e-06, + "loss": 0.2771, + "step": 31807 + }, + { + "epoch": 2.0314161692892023, + "grad_norm": 4.788510322570801, + "learning_rate": 5.743867342743595e-06, + "loss": 0.1791, + "step": 31808 + }, + { + "epoch": 2.031429734129137, + "grad_norm": 4.372063159942627, + "learning_rate": 5.743730300123339e-06, + "loss": 0.1554, + "step": 31809 + }, + { + "epoch": 2.031443298969072, + "grad_norm": 6.765610694885254, + "learning_rate": 5.743593257503084e-06, + "loss": 0.2006, + "step": 31810 + }, + { + "epoch": 2.031456863809007, + "grad_norm": 4.034022808074951, + "learning_rate": 5.7434562148828285e-06, + "loss": 0.1553, + "step": 31811 + }, + { + "epoch": 2.031470428648942, + "grad_norm": 3.8857147693634033, + "learning_rate": 5.743319172262574e-06, + "loss": 0.145, + "step": 31812 + }, + { + "epoch": 2.0314839934888766, + "grad_norm": 4.528906345367432, + "learning_rate": 5.74318212964232e-06, + "loss": 0.1972, + "step": 31813 + }, + { + "epoch": 2.0314975583288115, + "grad_norm": 4.459639072418213, + "learning_rate": 5.743045087022064e-06, + "loss": 0.159, + "step": 31814 + }, + { + "epoch": 2.031511123168747, + "grad_norm": 5.47945499420166, + "learning_rate": 5.742908044401809e-06, + "loss": 0.2201, + "step": 31815 + }, + { + "epoch": 2.0315246880086817, + "grad_norm": 5.017501354217529, + "learning_rate": 5.742771001781554e-06, + "loss": 0.1466, + "step": 31816 + }, + { + "epoch": 2.0315382528486166, + "grad_norm": 5.602023124694824, + "learning_rate": 5.7426339591613e-06, + "loss": 0.1469, + "step": 31817 + }, + { + "epoch": 2.0315518176885514, + "grad_norm": 6.903644561767578, + "learning_rate": 5.742496916541045e-06, + "loss": 0.2568, + "step": 31818 + }, + { + "epoch": 2.0315653825284863, + "grad_norm": 5.307804584503174, + "learning_rate": 5.74235987392079e-06, + "loss": 0.2302, + "step": 31819 + }, + { + "epoch": 2.031578947368421, + "grad_norm": 4.6174821853637695, + "learning_rate": 5.742222831300534e-06, + "loss": 0.2686, + "step": 31820 + }, + { + "epoch": 2.031592512208356, + "grad_norm": 5.886061191558838, + "learning_rate": 5.74208578868028e-06, + "loss": 0.2449, + "step": 31821 + }, + { + "epoch": 2.031606077048291, + "grad_norm": 6.467228412628174, + "learning_rate": 5.7419487460600255e-06, + "loss": 0.2439, + "step": 31822 + }, + { + "epoch": 2.0316196418882257, + "grad_norm": 5.765330791473389, + "learning_rate": 5.741811703439771e-06, + "loss": 0.2041, + "step": 31823 + }, + { + "epoch": 2.0316332067281606, + "grad_norm": 3.6815035343170166, + "learning_rate": 5.741674660819515e-06, + "loss": 0.1676, + "step": 31824 + }, + { + "epoch": 2.0316467715680955, + "grad_norm": 6.658992767333984, + "learning_rate": 5.74153761819926e-06, + "loss": 0.2809, + "step": 31825 + }, + { + "epoch": 2.0316603364080303, + "grad_norm": 5.13250207901001, + "learning_rate": 5.741400575579006e-06, + "loss": 0.2343, + "step": 31826 + }, + { + "epoch": 2.031673901247965, + "grad_norm": 6.325632572174072, + "learning_rate": 5.7412635329587505e-06, + "loss": 0.2467, + "step": 31827 + }, + { + "epoch": 2.0316874660879, + "grad_norm": 5.320633888244629, + "learning_rate": 5.741126490338496e-06, + "loss": 0.1887, + "step": 31828 + }, + { + "epoch": 2.031701030927835, + "grad_norm": 7.734386920928955, + "learning_rate": 5.74098944771824e-06, + "loss": 0.2673, + "step": 31829 + }, + { + "epoch": 2.03171459576777, + "grad_norm": 6.405468940734863, + "learning_rate": 5.740852405097986e-06, + "loss": 0.1906, + "step": 31830 + }, + { + "epoch": 2.0317281606077047, + "grad_norm": 3.8486311435699463, + "learning_rate": 5.740715362477731e-06, + "loss": 0.0892, + "step": 31831 + }, + { + "epoch": 2.0317417254476395, + "grad_norm": 7.5706562995910645, + "learning_rate": 5.740578319857476e-06, + "loss": 0.1724, + "step": 31832 + }, + { + "epoch": 2.0317552902875744, + "grad_norm": 6.317013740539551, + "learning_rate": 5.740441277237221e-06, + "loss": 0.2926, + "step": 31833 + }, + { + "epoch": 2.0317688551275097, + "grad_norm": 4.272587776184082, + "learning_rate": 5.740304234616967e-06, + "loss": 0.137, + "step": 31834 + }, + { + "epoch": 2.0317824199674446, + "grad_norm": 5.605556488037109, + "learning_rate": 5.740167191996712e-06, + "loss": 0.1893, + "step": 31835 + }, + { + "epoch": 2.0317959848073794, + "grad_norm": 4.662060260772705, + "learning_rate": 5.740030149376456e-06, + "loss": 0.0994, + "step": 31836 + }, + { + "epoch": 2.0318095496473143, + "grad_norm": 4.925424575805664, + "learning_rate": 5.7398931067562014e-06, + "loss": 0.2376, + "step": 31837 + }, + { + "epoch": 2.031823114487249, + "grad_norm": 6.681009292602539, + "learning_rate": 5.739756064135947e-06, + "loss": 0.2919, + "step": 31838 + }, + { + "epoch": 2.031836679327184, + "grad_norm": 3.896124839782715, + "learning_rate": 5.739619021515692e-06, + "loss": 0.1416, + "step": 31839 + }, + { + "epoch": 2.031850244167119, + "grad_norm": 4.029694557189941, + "learning_rate": 5.739481978895437e-06, + "loss": 0.1774, + "step": 31840 + }, + { + "epoch": 2.0318638090070538, + "grad_norm": 5.093973636627197, + "learning_rate": 5.739344936275182e-06, + "loss": 0.2787, + "step": 31841 + }, + { + "epoch": 2.0318773738469886, + "grad_norm": 5.055113315582275, + "learning_rate": 5.7392078936549265e-06, + "loss": 0.2061, + "step": 31842 + }, + { + "epoch": 2.0318909386869235, + "grad_norm": 5.675424575805664, + "learning_rate": 5.7390708510346725e-06, + "loss": 0.1962, + "step": 31843 + }, + { + "epoch": 2.0319045035268584, + "grad_norm": 4.915670394897461, + "learning_rate": 5.738933808414418e-06, + "loss": 0.1651, + "step": 31844 + }, + { + "epoch": 2.0319180683667932, + "grad_norm": 5.955011367797852, + "learning_rate": 5.738796765794162e-06, + "loss": 0.2594, + "step": 31845 + }, + { + "epoch": 2.031931633206728, + "grad_norm": 5.487809658050537, + "learning_rate": 5.738659723173907e-06, + "loss": 0.1434, + "step": 31846 + }, + { + "epoch": 2.031945198046663, + "grad_norm": 5.632306098937988, + "learning_rate": 5.738522680553653e-06, + "loss": 0.2669, + "step": 31847 + }, + { + "epoch": 2.031958762886598, + "grad_norm": 4.464147090911865, + "learning_rate": 5.7383856379333976e-06, + "loss": 0.218, + "step": 31848 + }, + { + "epoch": 2.0319723277265327, + "grad_norm": 4.878547668457031, + "learning_rate": 5.738248595313143e-06, + "loss": 0.314, + "step": 31849 + }, + { + "epoch": 2.0319858925664676, + "grad_norm": 5.0821919441223145, + "learning_rate": 5.738111552692888e-06, + "loss": 0.1613, + "step": 31850 + }, + { + "epoch": 2.0319994574064024, + "grad_norm": 6.091895580291748, + "learning_rate": 5.737974510072632e-06, + "loss": 0.3, + "step": 31851 + }, + { + "epoch": 2.0320130222463373, + "grad_norm": 5.696297645568848, + "learning_rate": 5.737837467452378e-06, + "loss": 0.2614, + "step": 31852 + }, + { + "epoch": 2.0320265870862726, + "grad_norm": 4.454263687133789, + "learning_rate": 5.7377004248321235e-06, + "loss": 0.178, + "step": 31853 + }, + { + "epoch": 2.0320401519262075, + "grad_norm": 4.0239081382751465, + "learning_rate": 5.737563382211868e-06, + "loss": 0.1921, + "step": 31854 + }, + { + "epoch": 2.0320537167661423, + "grad_norm": 3.857775926589966, + "learning_rate": 5.737426339591613e-06, + "loss": 0.1816, + "step": 31855 + }, + { + "epoch": 2.032067281606077, + "grad_norm": 4.028318881988525, + "learning_rate": 5.737289296971359e-06, + "loss": 0.2005, + "step": 31856 + }, + { + "epoch": 2.032080846446012, + "grad_norm": 4.851174831390381, + "learning_rate": 5.737152254351104e-06, + "loss": 0.2046, + "step": 31857 + }, + { + "epoch": 2.032094411285947, + "grad_norm": 4.497766971588135, + "learning_rate": 5.7370152117308485e-06, + "loss": 0.1469, + "step": 31858 + }, + { + "epoch": 2.032107976125882, + "grad_norm": 4.582972049713135, + "learning_rate": 5.736878169110594e-06, + "loss": 0.2079, + "step": 31859 + }, + { + "epoch": 2.0321215409658167, + "grad_norm": 4.660776138305664, + "learning_rate": 5.73674112649034e-06, + "loss": 0.2452, + "step": 31860 + }, + { + "epoch": 2.0321351058057515, + "grad_norm": 5.454090595245361, + "learning_rate": 5.736604083870084e-06, + "loss": 0.2703, + "step": 31861 + }, + { + "epoch": 2.0321486706456864, + "grad_norm": 5.496408462524414, + "learning_rate": 5.736467041249829e-06, + "loss": 0.2267, + "step": 31862 + }, + { + "epoch": 2.0321622354856212, + "grad_norm": 3.5350799560546875, + "learning_rate": 5.7363299986295736e-06, + "loss": 0.1905, + "step": 31863 + }, + { + "epoch": 2.032175800325556, + "grad_norm": 4.736698627471924, + "learning_rate": 5.73619295600932e-06, + "loss": 0.2379, + "step": 31864 + }, + { + "epoch": 2.032189365165491, + "grad_norm": 5.236162185668945, + "learning_rate": 5.736055913389065e-06, + "loss": 0.2283, + "step": 31865 + }, + { + "epoch": 2.032202930005426, + "grad_norm": 3.8239083290100098, + "learning_rate": 5.73591887076881e-06, + "loss": 0.1467, + "step": 31866 + }, + { + "epoch": 2.0322164948453607, + "grad_norm": 6.057971477508545, + "learning_rate": 5.735781828148554e-06, + "loss": 0.2943, + "step": 31867 + }, + { + "epoch": 2.0322300596852956, + "grad_norm": 4.396544933319092, + "learning_rate": 5.7356447855282994e-06, + "loss": 0.1763, + "step": 31868 + }, + { + "epoch": 2.0322436245252304, + "grad_norm": 5.136081695556641, + "learning_rate": 5.7355077429080455e-06, + "loss": 0.2989, + "step": 31869 + }, + { + "epoch": 2.0322571893651653, + "grad_norm": 5.199326992034912, + "learning_rate": 5.73537070028779e-06, + "loss": 0.2635, + "step": 31870 + }, + { + "epoch": 2.0322707542051, + "grad_norm": 4.356503486633301, + "learning_rate": 5.735233657667535e-06, + "loss": 0.138, + "step": 31871 + }, + { + "epoch": 2.0322843190450355, + "grad_norm": 5.034225940704346, + "learning_rate": 5.73509661504728e-06, + "loss": 0.2255, + "step": 31872 + }, + { + "epoch": 2.0322978838849703, + "grad_norm": 4.5842108726501465, + "learning_rate": 5.734959572427025e-06, + "loss": 0.2785, + "step": 31873 + }, + { + "epoch": 2.032311448724905, + "grad_norm": 5.921799659729004, + "learning_rate": 5.7348225298067705e-06, + "loss": 0.3108, + "step": 31874 + }, + { + "epoch": 2.03232501356484, + "grad_norm": 5.423091888427734, + "learning_rate": 5.734685487186516e-06, + "loss": 0.3611, + "step": 31875 + }, + { + "epoch": 2.032338578404775, + "grad_norm": 4.3994832038879395, + "learning_rate": 5.73454844456626e-06, + "loss": 0.1845, + "step": 31876 + }, + { + "epoch": 2.03235214324471, + "grad_norm": 5.24705696105957, + "learning_rate": 5.734411401946006e-06, + "loss": 0.2609, + "step": 31877 + }, + { + "epoch": 2.0323657080846447, + "grad_norm": 6.011734485626221, + "learning_rate": 5.734274359325751e-06, + "loss": 0.3549, + "step": 31878 + }, + { + "epoch": 2.0323792729245795, + "grad_norm": 4.795345306396484, + "learning_rate": 5.7341373167054956e-06, + "loss": 0.2836, + "step": 31879 + }, + { + "epoch": 2.0323928377645144, + "grad_norm": 4.58482551574707, + "learning_rate": 5.734000274085241e-06, + "loss": 0.2109, + "step": 31880 + }, + { + "epoch": 2.0324064026044493, + "grad_norm": 5.553371429443359, + "learning_rate": 5.733863231464986e-06, + "loss": 0.2681, + "step": 31881 + }, + { + "epoch": 2.032419967444384, + "grad_norm": 5.16260290145874, + "learning_rate": 5.733726188844732e-06, + "loss": 0.2847, + "step": 31882 + }, + { + "epoch": 2.032433532284319, + "grad_norm": 5.57948637008667, + "learning_rate": 5.733589146224476e-06, + "loss": 0.3476, + "step": 31883 + }, + { + "epoch": 2.032447097124254, + "grad_norm": 5.943261623382568, + "learning_rate": 5.7334521036042215e-06, + "loss": 0.2687, + "step": 31884 + }, + { + "epoch": 2.0324606619641887, + "grad_norm": 4.548216819763184, + "learning_rate": 5.733315060983966e-06, + "loss": 0.3228, + "step": 31885 + }, + { + "epoch": 2.0324742268041236, + "grad_norm": 4.4572834968566895, + "learning_rate": 5.733178018363712e-06, + "loss": 0.3034, + "step": 31886 + }, + { + "epoch": 2.0324877916440585, + "grad_norm": 4.632081508636475, + "learning_rate": 5.733040975743457e-06, + "loss": 0.2047, + "step": 31887 + }, + { + "epoch": 2.0325013564839933, + "grad_norm": 4.733822345733643, + "learning_rate": 5.732903933123201e-06, + "loss": 0.1409, + "step": 31888 + }, + { + "epoch": 2.032514921323928, + "grad_norm": 3.5731022357940674, + "learning_rate": 5.7327668905029465e-06, + "loss": 0.1908, + "step": 31889 + }, + { + "epoch": 2.032528486163863, + "grad_norm": 4.3692307472229, + "learning_rate": 5.7326298478826925e-06, + "loss": 0.2199, + "step": 31890 + }, + { + "epoch": 2.0325420510037984, + "grad_norm": 5.622495651245117, + "learning_rate": 5.732492805262438e-06, + "loss": 0.2683, + "step": 31891 + }, + { + "epoch": 2.0325556158437332, + "grad_norm": 3.7910678386688232, + "learning_rate": 5.732355762642182e-06, + "loss": 0.202, + "step": 31892 + }, + { + "epoch": 2.032569180683668, + "grad_norm": 4.944097995758057, + "learning_rate": 5.732218720021927e-06, + "loss": 0.2338, + "step": 31893 + }, + { + "epoch": 2.032582745523603, + "grad_norm": 5.518423557281494, + "learning_rate": 5.7320816774016716e-06, + "loss": 0.2429, + "step": 31894 + }, + { + "epoch": 2.032596310363538, + "grad_norm": 3.3327698707580566, + "learning_rate": 5.731944634781418e-06, + "loss": 0.1481, + "step": 31895 + }, + { + "epoch": 2.0326098752034727, + "grad_norm": 4.972209453582764, + "learning_rate": 5.731807592161163e-06, + "loss": 0.1894, + "step": 31896 + }, + { + "epoch": 2.0326234400434076, + "grad_norm": 5.184420585632324, + "learning_rate": 5.731670549540907e-06, + "loss": 0.1664, + "step": 31897 + }, + { + "epoch": 2.0326370048833424, + "grad_norm": 3.1446373462677, + "learning_rate": 5.731533506920652e-06, + "loss": 0.134, + "step": 31898 + }, + { + "epoch": 2.0326505697232773, + "grad_norm": 5.359377861022949, + "learning_rate": 5.731396464300398e-06, + "loss": 0.1876, + "step": 31899 + }, + { + "epoch": 2.032664134563212, + "grad_norm": 4.327010154724121, + "learning_rate": 5.7312594216801435e-06, + "loss": 0.1874, + "step": 31900 + }, + { + "epoch": 2.032677699403147, + "grad_norm": 5.6206440925598145, + "learning_rate": 5.731122379059888e-06, + "loss": 0.2965, + "step": 31901 + }, + { + "epoch": 2.032691264243082, + "grad_norm": 4.182478904724121, + "learning_rate": 5.730985336439633e-06, + "loss": 0.1736, + "step": 31902 + }, + { + "epoch": 2.0327048290830168, + "grad_norm": 6.062556743621826, + "learning_rate": 5.730848293819379e-06, + "loss": 0.2856, + "step": 31903 + }, + { + "epoch": 2.0327183939229516, + "grad_norm": 6.244141578674316, + "learning_rate": 5.730711251199123e-06, + "loss": 0.2274, + "step": 31904 + }, + { + "epoch": 2.0327319587628865, + "grad_norm": 4.045329570770264, + "learning_rate": 5.7305742085788685e-06, + "loss": 0.303, + "step": 31905 + }, + { + "epoch": 2.0327455236028213, + "grad_norm": 4.34418249130249, + "learning_rate": 5.730437165958614e-06, + "loss": 0.2278, + "step": 31906 + }, + { + "epoch": 2.032759088442756, + "grad_norm": 4.151523590087891, + "learning_rate": 5.730300123338358e-06, + "loss": 0.2486, + "step": 31907 + }, + { + "epoch": 2.032772653282691, + "grad_norm": 4.970081806182861, + "learning_rate": 5.730163080718104e-06, + "loss": 0.2338, + "step": 31908 + }, + { + "epoch": 2.032786218122626, + "grad_norm": 4.30971622467041, + "learning_rate": 5.730026038097849e-06, + "loss": 0.2295, + "step": 31909 + }, + { + "epoch": 2.0327997829625613, + "grad_norm": 4.699008464813232, + "learning_rate": 5.7298889954775936e-06, + "loss": 0.1958, + "step": 31910 + }, + { + "epoch": 2.032813347802496, + "grad_norm": 5.787978649139404, + "learning_rate": 5.729751952857339e-06, + "loss": 0.2961, + "step": 31911 + }, + { + "epoch": 2.032826912642431, + "grad_norm": 4.246174335479736, + "learning_rate": 5.729614910237085e-06, + "loss": 0.1461, + "step": 31912 + }, + { + "epoch": 2.032840477482366, + "grad_norm": 5.397308349609375, + "learning_rate": 5.729477867616829e-06, + "loss": 0.2182, + "step": 31913 + }, + { + "epoch": 2.0328540423223007, + "grad_norm": 4.1059088706970215, + "learning_rate": 5.729340824996574e-06, + "loss": 0.2256, + "step": 31914 + }, + { + "epoch": 2.0328676071622356, + "grad_norm": 5.555905818939209, + "learning_rate": 5.7292037823763195e-06, + "loss": 0.2238, + "step": 31915 + }, + { + "epoch": 2.0328811720021704, + "grad_norm": 3.946152448654175, + "learning_rate": 5.7290667397560655e-06, + "loss": 0.2243, + "step": 31916 + }, + { + "epoch": 2.0328947368421053, + "grad_norm": 3.19840145111084, + "learning_rate": 5.72892969713581e-06, + "loss": 0.1776, + "step": 31917 + }, + { + "epoch": 2.03290830168204, + "grad_norm": 3.873232126235962, + "learning_rate": 5.728792654515555e-06, + "loss": 0.1615, + "step": 31918 + }, + { + "epoch": 2.032921866521975, + "grad_norm": 5.409035682678223, + "learning_rate": 5.728655611895299e-06, + "loss": 0.2588, + "step": 31919 + }, + { + "epoch": 2.03293543136191, + "grad_norm": 4.613316059112549, + "learning_rate": 5.7285185692750445e-06, + "loss": 0.2391, + "step": 31920 + }, + { + "epoch": 2.0329489962018448, + "grad_norm": 3.671980381011963, + "learning_rate": 5.7283815266547905e-06, + "loss": 0.0994, + "step": 31921 + }, + { + "epoch": 2.0329625610417796, + "grad_norm": 4.7916154861450195, + "learning_rate": 5.728244484034535e-06, + "loss": 0.2086, + "step": 31922 + }, + { + "epoch": 2.0329761258817145, + "grad_norm": 3.1671431064605713, + "learning_rate": 5.72810744141428e-06, + "loss": 0.1317, + "step": 31923 + }, + { + "epoch": 2.0329896907216494, + "grad_norm": 3.03275203704834, + "learning_rate": 5.727970398794025e-06, + "loss": 0.1228, + "step": 31924 + }, + { + "epoch": 2.0330032555615842, + "grad_norm": 4.93811559677124, + "learning_rate": 5.727833356173771e-06, + "loss": 0.2381, + "step": 31925 + }, + { + "epoch": 2.033016820401519, + "grad_norm": 4.93950891494751, + "learning_rate": 5.727696313553516e-06, + "loss": 0.1546, + "step": 31926 + }, + { + "epoch": 2.033030385241454, + "grad_norm": 4.816161155700684, + "learning_rate": 5.727559270933261e-06, + "loss": 0.17, + "step": 31927 + }, + { + "epoch": 2.033043950081389, + "grad_norm": 5.1309356689453125, + "learning_rate": 5.727422228313005e-06, + "loss": 0.1099, + "step": 31928 + }, + { + "epoch": 2.033057514921324, + "grad_norm": 5.539281368255615, + "learning_rate": 5.727285185692751e-06, + "loss": 0.2672, + "step": 31929 + }, + { + "epoch": 2.033071079761259, + "grad_norm": 4.29469108581543, + "learning_rate": 5.727148143072496e-06, + "loss": 0.14, + "step": 31930 + }, + { + "epoch": 2.033084644601194, + "grad_norm": 4.233517169952393, + "learning_rate": 5.7270111004522415e-06, + "loss": 0.1624, + "step": 31931 + }, + { + "epoch": 2.0330982094411287, + "grad_norm": 3.499173402786255, + "learning_rate": 5.726874057831986e-06, + "loss": 0.204, + "step": 31932 + }, + { + "epoch": 2.0331117742810636, + "grad_norm": 4.59381628036499, + "learning_rate": 5.726737015211732e-06, + "loss": 0.206, + "step": 31933 + }, + { + "epoch": 2.0331253391209985, + "grad_norm": 4.809260845184326, + "learning_rate": 5.726599972591477e-06, + "loss": 0.1708, + "step": 31934 + }, + { + "epoch": 2.0331389039609333, + "grad_norm": 4.23709774017334, + "learning_rate": 5.726462929971221e-06, + "loss": 0.1512, + "step": 31935 + }, + { + "epoch": 2.033152468800868, + "grad_norm": 3.470217227935791, + "learning_rate": 5.7263258873509665e-06, + "loss": 0.1209, + "step": 31936 + }, + { + "epoch": 2.033166033640803, + "grad_norm": 3.176964282989502, + "learning_rate": 5.726188844730711e-06, + "loss": 0.1045, + "step": 31937 + }, + { + "epoch": 2.033179598480738, + "grad_norm": 4.451381206512451, + "learning_rate": 5.726051802110457e-06, + "loss": 0.2187, + "step": 31938 + }, + { + "epoch": 2.033193163320673, + "grad_norm": 5.405400276184082, + "learning_rate": 5.725914759490202e-06, + "loss": 0.1604, + "step": 31939 + }, + { + "epoch": 2.0332067281606077, + "grad_norm": 4.074153423309326, + "learning_rate": 5.725777716869947e-06, + "loss": 0.1826, + "step": 31940 + }, + { + "epoch": 2.0332202930005425, + "grad_norm": 6.491714000701904, + "learning_rate": 5.7256406742496916e-06, + "loss": 0.2485, + "step": 31941 + }, + { + "epoch": 2.0332338578404774, + "grad_norm": 4.872002124786377, + "learning_rate": 5.725503631629438e-06, + "loss": 0.1884, + "step": 31942 + }, + { + "epoch": 2.0332474226804123, + "grad_norm": 4.2131524085998535, + "learning_rate": 5.725366589009183e-06, + "loss": 0.141, + "step": 31943 + }, + { + "epoch": 2.033260987520347, + "grad_norm": 4.39371395111084, + "learning_rate": 5.725229546388927e-06, + "loss": 0.1856, + "step": 31944 + }, + { + "epoch": 2.033274552360282, + "grad_norm": 4.68526029586792, + "learning_rate": 5.725092503768672e-06, + "loss": 0.1821, + "step": 31945 + }, + { + "epoch": 2.033288117200217, + "grad_norm": 4.730166912078857, + "learning_rate": 5.724955461148418e-06, + "loss": 0.1898, + "step": 31946 + }, + { + "epoch": 2.0333016820401517, + "grad_norm": 4.556856155395508, + "learning_rate": 5.724818418528163e-06, + "loss": 0.1578, + "step": 31947 + }, + { + "epoch": 2.033315246880087, + "grad_norm": 3.8804593086242676, + "learning_rate": 5.724681375907908e-06, + "loss": 0.2094, + "step": 31948 + }, + { + "epoch": 2.033328811720022, + "grad_norm": 3.0365793704986572, + "learning_rate": 5.724544333287653e-06, + "loss": 0.1093, + "step": 31949 + }, + { + "epoch": 2.0333423765599568, + "grad_norm": 3.0820677280426025, + "learning_rate": 5.724407290667397e-06, + "loss": 0.0925, + "step": 31950 + }, + { + "epoch": 2.0333559413998916, + "grad_norm": 5.224712371826172, + "learning_rate": 5.724270248047143e-06, + "loss": 0.2303, + "step": 31951 + }, + { + "epoch": 2.0333695062398265, + "grad_norm": 3.6915218830108643, + "learning_rate": 5.7241332054268885e-06, + "loss": 0.1473, + "step": 31952 + }, + { + "epoch": 2.0333830710797614, + "grad_norm": 3.6524288654327393, + "learning_rate": 5.723996162806633e-06, + "loss": 0.1915, + "step": 31953 + }, + { + "epoch": 2.033396635919696, + "grad_norm": 5.096993923187256, + "learning_rate": 5.723859120186378e-06, + "loss": 0.2657, + "step": 31954 + }, + { + "epoch": 2.033410200759631, + "grad_norm": 5.517510890960693, + "learning_rate": 5.723722077566124e-06, + "loss": 0.1989, + "step": 31955 + }, + { + "epoch": 2.033423765599566, + "grad_norm": 3.7651331424713135, + "learning_rate": 5.723585034945868e-06, + "loss": 0.1796, + "step": 31956 + }, + { + "epoch": 2.033437330439501, + "grad_norm": 5.233283042907715, + "learning_rate": 5.723447992325614e-06, + "loss": 0.2032, + "step": 31957 + }, + { + "epoch": 2.0334508952794357, + "grad_norm": 3.8387227058410645, + "learning_rate": 5.723310949705359e-06, + "loss": 0.0981, + "step": 31958 + }, + { + "epoch": 2.0334644601193705, + "grad_norm": 4.2304887771606445, + "learning_rate": 5.723173907085105e-06, + "loss": 0.2321, + "step": 31959 + }, + { + "epoch": 2.0334780249593054, + "grad_norm": 3.0908327102661133, + "learning_rate": 5.723036864464849e-06, + "loss": 0.1693, + "step": 31960 + }, + { + "epoch": 2.0334915897992403, + "grad_norm": 4.207859516143799, + "learning_rate": 5.722899821844594e-06, + "loss": 0.1574, + "step": 31961 + }, + { + "epoch": 2.033505154639175, + "grad_norm": 5.296657562255859, + "learning_rate": 5.722762779224339e-06, + "loss": 0.2596, + "step": 31962 + }, + { + "epoch": 2.03351871947911, + "grad_norm": 5.785730361938477, + "learning_rate": 5.722625736604084e-06, + "loss": 0.3534, + "step": 31963 + }, + { + "epoch": 2.033532284319045, + "grad_norm": 4.020279884338379, + "learning_rate": 5.72248869398383e-06, + "loss": 0.2046, + "step": 31964 + }, + { + "epoch": 2.0335458491589797, + "grad_norm": 3.9536993503570557, + "learning_rate": 5.722351651363575e-06, + "loss": 0.1666, + "step": 31965 + }, + { + "epoch": 2.0335594139989146, + "grad_norm": 3.2600269317626953, + "learning_rate": 5.722214608743319e-06, + "loss": 0.0788, + "step": 31966 + }, + { + "epoch": 2.03357297883885, + "grad_norm": 3.0280988216400146, + "learning_rate": 5.7220775661230645e-06, + "loss": 0.0909, + "step": 31967 + }, + { + "epoch": 2.0335865436787848, + "grad_norm": 4.144165992736816, + "learning_rate": 5.7219405235028106e-06, + "loss": 0.231, + "step": 31968 + }, + { + "epoch": 2.0336001085187196, + "grad_norm": 2.986746072769165, + "learning_rate": 5.721803480882555e-06, + "loss": 0.1705, + "step": 31969 + }, + { + "epoch": 2.0336136733586545, + "grad_norm": 5.444370269775391, + "learning_rate": 5.7216664382623e-06, + "loss": 0.2659, + "step": 31970 + }, + { + "epoch": 2.0336272381985894, + "grad_norm": 3.5857906341552734, + "learning_rate": 5.721529395642044e-06, + "loss": 0.119, + "step": 31971 + }, + { + "epoch": 2.0336408030385242, + "grad_norm": 3.161893844604492, + "learning_rate": 5.7213923530217904e-06, + "loss": 0.1824, + "step": 31972 + }, + { + "epoch": 2.033654367878459, + "grad_norm": 3.542637586593628, + "learning_rate": 5.721255310401536e-06, + "loss": 0.1045, + "step": 31973 + }, + { + "epoch": 2.033667932718394, + "grad_norm": 4.460687637329102, + "learning_rate": 5.721118267781281e-06, + "loss": 0.1942, + "step": 31974 + }, + { + "epoch": 2.033681497558329, + "grad_norm": 3.080228328704834, + "learning_rate": 5.720981225161025e-06, + "loss": 0.1474, + "step": 31975 + }, + { + "epoch": 2.0336950623982637, + "grad_norm": 4.304510116577148, + "learning_rate": 5.72084418254077e-06, + "loss": 0.2246, + "step": 31976 + }, + { + "epoch": 2.0337086272381986, + "grad_norm": 3.0913846492767334, + "learning_rate": 5.720707139920516e-06, + "loss": 0.1049, + "step": 31977 + }, + { + "epoch": 2.0337221920781334, + "grad_norm": 3.759103775024414, + "learning_rate": 5.720570097300261e-06, + "loss": 0.1527, + "step": 31978 + }, + { + "epoch": 2.0337357569180683, + "grad_norm": 4.108724594116211, + "learning_rate": 5.720433054680006e-06, + "loss": 0.1555, + "step": 31979 + }, + { + "epoch": 2.033749321758003, + "grad_norm": 3.0339672565460205, + "learning_rate": 5.720296012059751e-06, + "loss": 0.0986, + "step": 31980 + }, + { + "epoch": 2.033762886597938, + "grad_norm": 5.648107528686523, + "learning_rate": 5.720158969439496e-06, + "loss": 0.1662, + "step": 31981 + }, + { + "epoch": 2.033776451437873, + "grad_norm": 3.7392373085021973, + "learning_rate": 5.720021926819241e-06, + "loss": 0.1366, + "step": 31982 + }, + { + "epoch": 2.0337900162778078, + "grad_norm": 2.747939348220825, + "learning_rate": 5.7198848841989865e-06, + "loss": 0.089, + "step": 31983 + }, + { + "epoch": 2.0338035811177426, + "grad_norm": 3.152662515640259, + "learning_rate": 5.719747841578731e-06, + "loss": 0.1324, + "step": 31984 + }, + { + "epoch": 2.0338171459576775, + "grad_norm": 5.100496292114258, + "learning_rate": 5.719610798958477e-06, + "loss": 0.1914, + "step": 31985 + }, + { + "epoch": 2.033830710797613, + "grad_norm": 2.998603105545044, + "learning_rate": 5.719473756338222e-06, + "loss": 0.0834, + "step": 31986 + }, + { + "epoch": 2.0338442756375477, + "grad_norm": 3.5951788425445557, + "learning_rate": 5.719336713717966e-06, + "loss": 0.1899, + "step": 31987 + }, + { + "epoch": 2.0338578404774825, + "grad_norm": 3.216712474822998, + "learning_rate": 5.719199671097712e-06, + "loss": 0.12, + "step": 31988 + }, + { + "epoch": 2.0338714053174174, + "grad_norm": 2.9409730434417725, + "learning_rate": 5.719062628477457e-06, + "loss": 0.0727, + "step": 31989 + }, + { + "epoch": 2.0338849701573523, + "grad_norm": 4.249246597290039, + "learning_rate": 5.718925585857202e-06, + "loss": 0.1492, + "step": 31990 + }, + { + "epoch": 2.033898534997287, + "grad_norm": 3.7757821083068848, + "learning_rate": 5.718788543236947e-06, + "loss": 0.1903, + "step": 31991 + }, + { + "epoch": 2.033912099837222, + "grad_norm": 2.921482801437378, + "learning_rate": 5.718651500616692e-06, + "loss": 0.1085, + "step": 31992 + }, + { + "epoch": 2.033925664677157, + "grad_norm": 3.8712801933288574, + "learning_rate": 5.718514457996437e-06, + "loss": 0.1634, + "step": 31993 + }, + { + "epoch": 2.0339392295170917, + "grad_norm": 3.4453823566436768, + "learning_rate": 5.718377415376183e-06, + "loss": 0.1024, + "step": 31994 + }, + { + "epoch": 2.0339527943570266, + "grad_norm": 2.5257365703582764, + "learning_rate": 5.718240372755928e-06, + "loss": 0.0614, + "step": 31995 + }, + { + "epoch": 2.0339663591969614, + "grad_norm": 3.1776671409606934, + "learning_rate": 5.718103330135672e-06, + "loss": 0.1204, + "step": 31996 + }, + { + "epoch": 2.0339799240368963, + "grad_norm": 6.0096049308776855, + "learning_rate": 5.717966287515417e-06, + "loss": 0.1794, + "step": 31997 + }, + { + "epoch": 2.033993488876831, + "grad_norm": 3.8065993785858154, + "learning_rate": 5.717829244895163e-06, + "loss": 0.0967, + "step": 31998 + }, + { + "epoch": 2.034007053716766, + "grad_norm": 4.007357597351074, + "learning_rate": 5.7176922022749086e-06, + "loss": 0.16, + "step": 31999 + }, + { + "epoch": 2.034020618556701, + "grad_norm": 3.073357343673706, + "learning_rate": 5.717555159654653e-06, + "loss": 0.1038, + "step": 32000 + }, + { + "epoch": 2.0340341833966358, + "grad_norm": 3.2175045013427734, + "learning_rate": 5.717418117034398e-06, + "loss": 0.0878, + "step": 32001 + }, + { + "epoch": 2.0340477482365706, + "grad_norm": 4.155307292938232, + "learning_rate": 5.717281074414144e-06, + "loss": 0.156, + "step": 32002 + }, + { + "epoch": 2.0340613130765055, + "grad_norm": 7.196442127227783, + "learning_rate": 5.7171440317938884e-06, + "loss": 0.2234, + "step": 32003 + }, + { + "epoch": 2.0340748779164404, + "grad_norm": 3.8274426460266113, + "learning_rate": 5.717006989173634e-06, + "loss": 0.1496, + "step": 32004 + }, + { + "epoch": 2.0340884427563757, + "grad_norm": 4.114119052886963, + "learning_rate": 5.716869946553378e-06, + "loss": 0.1671, + "step": 32005 + }, + { + "epoch": 2.0341020075963105, + "grad_norm": 3.744579553604126, + "learning_rate": 5.716732903933123e-06, + "loss": 0.1556, + "step": 32006 + }, + { + "epoch": 2.0341155724362454, + "grad_norm": 4.446450710296631, + "learning_rate": 5.716595861312869e-06, + "loss": 0.1188, + "step": 32007 + }, + { + "epoch": 2.0341291372761803, + "grad_norm": 3.4205331802368164, + "learning_rate": 5.716458818692614e-06, + "loss": 0.149, + "step": 32008 + }, + { + "epoch": 2.034142702116115, + "grad_norm": 4.326399803161621, + "learning_rate": 5.716321776072359e-06, + "loss": 0.1308, + "step": 32009 + }, + { + "epoch": 2.03415626695605, + "grad_norm": 2.8504209518432617, + "learning_rate": 5.716184733452104e-06, + "loss": 0.1008, + "step": 32010 + }, + { + "epoch": 2.034169831795985, + "grad_norm": 3.3753764629364014, + "learning_rate": 5.71604769083185e-06, + "loss": 0.0711, + "step": 32011 + }, + { + "epoch": 2.0341833966359197, + "grad_norm": 6.360939025878906, + "learning_rate": 5.715910648211594e-06, + "loss": 0.1846, + "step": 32012 + }, + { + "epoch": 2.0341969614758546, + "grad_norm": 4.704305171966553, + "learning_rate": 5.715773605591339e-06, + "loss": 0.1412, + "step": 32013 + }, + { + "epoch": 2.0342105263157895, + "grad_norm": 4.316980838775635, + "learning_rate": 5.7156365629710846e-06, + "loss": 0.1312, + "step": 32014 + }, + { + "epoch": 2.0342240911557243, + "grad_norm": 3.963491678237915, + "learning_rate": 5.71549952035083e-06, + "loss": 0.142, + "step": 32015 + }, + { + "epoch": 2.034237655995659, + "grad_norm": 5.255434513092041, + "learning_rate": 5.715362477730575e-06, + "loss": 0.1207, + "step": 32016 + }, + { + "epoch": 2.034251220835594, + "grad_norm": 4.023892402648926, + "learning_rate": 5.71522543511032e-06, + "loss": 0.1186, + "step": 32017 + }, + { + "epoch": 2.034264785675529, + "grad_norm": 3.9989383220672607, + "learning_rate": 5.715088392490064e-06, + "loss": 0.135, + "step": 32018 + }, + { + "epoch": 2.034278350515464, + "grad_norm": 2.990302324295044, + "learning_rate": 5.71495134986981e-06, + "loss": 0.0556, + "step": 32019 + }, + { + "epoch": 2.0342919153553987, + "grad_norm": 6.5439372062683105, + "learning_rate": 5.714814307249556e-06, + "loss": 0.2222, + "step": 32020 + }, + { + "epoch": 2.0343054801953335, + "grad_norm": 3.9681544303894043, + "learning_rate": 5.7146772646293e-06, + "loss": 0.1627, + "step": 32021 + }, + { + "epoch": 2.0343190450352684, + "grad_norm": 6.037377834320068, + "learning_rate": 5.714540222009045e-06, + "loss": 0.2551, + "step": 32022 + }, + { + "epoch": 2.0343326098752033, + "grad_norm": 6.82733154296875, + "learning_rate": 5.71440317938879e-06, + "loss": 0.1724, + "step": 32023 + }, + { + "epoch": 2.0343461747151386, + "grad_norm": 7.578234672546387, + "learning_rate": 5.714266136768536e-06, + "loss": 0.202, + "step": 32024 + }, + { + "epoch": 2.0343597395550734, + "grad_norm": 5.274173736572266, + "learning_rate": 5.714129094148281e-06, + "loss": 0.1907, + "step": 32025 + }, + { + "epoch": 2.0343733043950083, + "grad_norm": 5.794957160949707, + "learning_rate": 5.713992051528026e-06, + "loss": 0.2147, + "step": 32026 + }, + { + "epoch": 2.034386869234943, + "grad_norm": 5.349926471710205, + "learning_rate": 5.71385500890777e-06, + "loss": 0.236, + "step": 32027 + }, + { + "epoch": 2.034400434074878, + "grad_norm": 3.9517316818237305, + "learning_rate": 5.713717966287516e-06, + "loss": 0.2009, + "step": 32028 + }, + { + "epoch": 2.034413998914813, + "grad_norm": 3.6604931354522705, + "learning_rate": 5.713580923667261e-06, + "loss": 0.1498, + "step": 32029 + }, + { + "epoch": 2.0344275637547478, + "grad_norm": 3.5202410221099854, + "learning_rate": 5.713443881047006e-06, + "loss": 0.1153, + "step": 32030 + }, + { + "epoch": 2.0344411285946826, + "grad_norm": 3.7913129329681396, + "learning_rate": 5.713306838426751e-06, + "loss": 0.1789, + "step": 32031 + }, + { + "epoch": 2.0344546934346175, + "grad_norm": 5.145929336547852, + "learning_rate": 5.713169795806496e-06, + "loss": 0.1817, + "step": 32032 + }, + { + "epoch": 2.0344682582745524, + "grad_norm": 4.0001935958862305, + "learning_rate": 5.713032753186242e-06, + "loss": 0.2001, + "step": 32033 + }, + { + "epoch": 2.034481823114487, + "grad_norm": 5.084697723388672, + "learning_rate": 5.7128957105659864e-06, + "loss": 0.2387, + "step": 32034 + }, + { + "epoch": 2.034495387954422, + "grad_norm": 4.279709815979004, + "learning_rate": 5.712758667945732e-06, + "loss": 0.1827, + "step": 32035 + }, + { + "epoch": 2.034508952794357, + "grad_norm": 4.797248363494873, + "learning_rate": 5.712621625325476e-06, + "loss": 0.2235, + "step": 32036 + }, + { + "epoch": 2.034522517634292, + "grad_norm": 5.224442958831787, + "learning_rate": 5.712484582705222e-06, + "loss": 0.2835, + "step": 32037 + }, + { + "epoch": 2.0345360824742267, + "grad_norm": 4.9646077156066895, + "learning_rate": 5.712347540084967e-06, + "loss": 0.2337, + "step": 32038 + }, + { + "epoch": 2.0345496473141615, + "grad_norm": 5.205994606018066, + "learning_rate": 5.7122104974647115e-06, + "loss": 0.1877, + "step": 32039 + }, + { + "epoch": 2.0345632121540964, + "grad_norm": 5.141986846923828, + "learning_rate": 5.712073454844457e-06, + "loss": 0.2456, + "step": 32040 + }, + { + "epoch": 2.0345767769940313, + "grad_norm": 5.716540813446045, + "learning_rate": 5.711936412224203e-06, + "loss": 0.2498, + "step": 32041 + }, + { + "epoch": 2.034590341833966, + "grad_norm": 5.83416748046875, + "learning_rate": 5.711799369603948e-06, + "loss": 0.208, + "step": 32042 + }, + { + "epoch": 2.0346039066739015, + "grad_norm": 4.685983180999756, + "learning_rate": 5.711662326983692e-06, + "loss": 0.1476, + "step": 32043 + }, + { + "epoch": 2.0346174715138363, + "grad_norm": 4.0362162590026855, + "learning_rate": 5.711525284363437e-06, + "loss": 0.1523, + "step": 32044 + }, + { + "epoch": 2.034631036353771, + "grad_norm": 6.395218849182129, + "learning_rate": 5.711388241743182e-06, + "loss": 0.2933, + "step": 32045 + }, + { + "epoch": 2.034644601193706, + "grad_norm": 4.315359115600586, + "learning_rate": 5.711251199122928e-06, + "loss": 0.1237, + "step": 32046 + }, + { + "epoch": 2.034658166033641, + "grad_norm": 4.069464206695557, + "learning_rate": 5.711114156502673e-06, + "loss": 0.1744, + "step": 32047 + }, + { + "epoch": 2.034671730873576, + "grad_norm": 4.3514227867126465, + "learning_rate": 5.710977113882418e-06, + "loss": 0.1744, + "step": 32048 + }, + { + "epoch": 2.0346852957135106, + "grad_norm": 7.168612957000732, + "learning_rate": 5.7108400712621624e-06, + "loss": 0.2264, + "step": 32049 + }, + { + "epoch": 2.0346988605534455, + "grad_norm": 4.113030910491943, + "learning_rate": 5.7107030286419084e-06, + "loss": 0.0987, + "step": 32050 + }, + { + "epoch": 2.0347124253933804, + "grad_norm": 6.844817161560059, + "learning_rate": 5.710565986021654e-06, + "loss": 0.2338, + "step": 32051 + }, + { + "epoch": 2.0347259902333152, + "grad_norm": 4.743610858917236, + "learning_rate": 5.710428943401398e-06, + "loss": 0.1444, + "step": 32052 + }, + { + "epoch": 2.03473955507325, + "grad_norm": 4.326112270355225, + "learning_rate": 5.710291900781143e-06, + "loss": 0.1609, + "step": 32053 + }, + { + "epoch": 2.034753119913185, + "grad_norm": 5.357155799865723, + "learning_rate": 5.710154858160889e-06, + "loss": 0.1876, + "step": 32054 + }, + { + "epoch": 2.03476668475312, + "grad_norm": 7.048205852508545, + "learning_rate": 5.7100178155406335e-06, + "loss": 0.282, + "step": 32055 + }, + { + "epoch": 2.0347802495930547, + "grad_norm": 3.6365270614624023, + "learning_rate": 5.709880772920379e-06, + "loss": 0.1715, + "step": 32056 + }, + { + "epoch": 2.0347938144329896, + "grad_norm": 4.981573581695557, + "learning_rate": 5.709743730300124e-06, + "loss": 0.2029, + "step": 32057 + }, + { + "epoch": 2.0348073792729244, + "grad_norm": 4.645832538604736, + "learning_rate": 5.709606687679868e-06, + "loss": 0.1348, + "step": 32058 + }, + { + "epoch": 2.0348209441128593, + "grad_norm": 5.576443672180176, + "learning_rate": 5.709469645059614e-06, + "loss": 0.2975, + "step": 32059 + }, + { + "epoch": 2.034834508952794, + "grad_norm": 3.774151086807251, + "learning_rate": 5.709332602439359e-06, + "loss": 0.1227, + "step": 32060 + }, + { + "epoch": 2.034848073792729, + "grad_norm": 4.1826581954956055, + "learning_rate": 5.709195559819104e-06, + "loss": 0.1621, + "step": 32061 + }, + { + "epoch": 2.0348616386326643, + "grad_norm": 4.891364097595215, + "learning_rate": 5.709058517198849e-06, + "loss": 0.2057, + "step": 32062 + }, + { + "epoch": 2.034875203472599, + "grad_norm": 4.492170333862305, + "learning_rate": 5.708921474578595e-06, + "loss": 0.1476, + "step": 32063 + }, + { + "epoch": 2.034888768312534, + "grad_norm": 4.530392169952393, + "learning_rate": 5.708784431958339e-06, + "loss": 0.1957, + "step": 32064 + }, + { + "epoch": 2.034902333152469, + "grad_norm": 5.16109561920166, + "learning_rate": 5.7086473893380844e-06, + "loss": 0.1684, + "step": 32065 + }, + { + "epoch": 2.034915897992404, + "grad_norm": 4.8901143074035645, + "learning_rate": 5.70851034671783e-06, + "loss": 0.1585, + "step": 32066 + }, + { + "epoch": 2.0349294628323387, + "grad_norm": 4.782639026641846, + "learning_rate": 5.708373304097576e-06, + "loss": 0.1686, + "step": 32067 + }, + { + "epoch": 2.0349430276722735, + "grad_norm": 4.142115592956543, + "learning_rate": 5.70823626147732e-06, + "loss": 0.1811, + "step": 32068 + }, + { + "epoch": 2.0349565925122084, + "grad_norm": 6.490285396575928, + "learning_rate": 5.708099218857065e-06, + "loss": 0.2575, + "step": 32069 + }, + { + "epoch": 2.0349701573521433, + "grad_norm": 4.313798427581787, + "learning_rate": 5.7079621762368095e-06, + "loss": 0.117, + "step": 32070 + }, + { + "epoch": 2.034983722192078, + "grad_norm": 3.832382917404175, + "learning_rate": 5.7078251336165555e-06, + "loss": 0.13, + "step": 32071 + }, + { + "epoch": 2.034997287032013, + "grad_norm": 4.953077793121338, + "learning_rate": 5.707688090996301e-06, + "loss": 0.1446, + "step": 32072 + }, + { + "epoch": 2.035010851871948, + "grad_norm": 3.9699559211730957, + "learning_rate": 5.707551048376046e-06, + "loss": 0.1911, + "step": 32073 + }, + { + "epoch": 2.0350244167118827, + "grad_norm": 4.273946762084961, + "learning_rate": 5.70741400575579e-06, + "loss": 0.1254, + "step": 32074 + }, + { + "epoch": 2.0350379815518176, + "grad_norm": 4.136703014373779, + "learning_rate": 5.707276963135535e-06, + "loss": 0.2246, + "step": 32075 + }, + { + "epoch": 2.0350515463917525, + "grad_norm": 4.688154220581055, + "learning_rate": 5.707139920515281e-06, + "loss": 0.1566, + "step": 32076 + }, + { + "epoch": 2.0350651112316873, + "grad_norm": 3.679445266723633, + "learning_rate": 5.707002877895026e-06, + "loss": 0.1019, + "step": 32077 + }, + { + "epoch": 2.035078676071622, + "grad_norm": 3.853912353515625, + "learning_rate": 5.706865835274771e-06, + "loss": 0.1112, + "step": 32078 + }, + { + "epoch": 2.035092240911557, + "grad_norm": 3.593003511428833, + "learning_rate": 5.706728792654515e-06, + "loss": 0.1329, + "step": 32079 + }, + { + "epoch": 2.035105805751492, + "grad_norm": 4.860957145690918, + "learning_rate": 5.706591750034261e-06, + "loss": 0.179, + "step": 32080 + }, + { + "epoch": 2.0351193705914272, + "grad_norm": 5.790056228637695, + "learning_rate": 5.7064547074140065e-06, + "loss": 0.1783, + "step": 32081 + }, + { + "epoch": 2.035132935431362, + "grad_norm": 4.1868438720703125, + "learning_rate": 5.706317664793752e-06, + "loss": 0.1291, + "step": 32082 + }, + { + "epoch": 2.035146500271297, + "grad_norm": 4.56041145324707, + "learning_rate": 5.706180622173496e-06, + "loss": 0.183, + "step": 32083 + }, + { + "epoch": 2.035160065111232, + "grad_norm": 4.016542911529541, + "learning_rate": 5.706043579553242e-06, + "loss": 0.1376, + "step": 32084 + }, + { + "epoch": 2.0351736299511667, + "grad_norm": 3.290771245956421, + "learning_rate": 5.705906536932987e-06, + "loss": 0.125, + "step": 32085 + }, + { + "epoch": 2.0351871947911016, + "grad_norm": 6.340839385986328, + "learning_rate": 5.7057694943127315e-06, + "loss": 0.2714, + "step": 32086 + }, + { + "epoch": 2.0352007596310364, + "grad_norm": 5.277726173400879, + "learning_rate": 5.705632451692477e-06, + "loss": 0.2091, + "step": 32087 + }, + { + "epoch": 2.0352143244709713, + "grad_norm": 5.115804672241211, + "learning_rate": 5.705495409072222e-06, + "loss": 0.2307, + "step": 32088 + }, + { + "epoch": 2.035227889310906, + "grad_norm": 4.338479995727539, + "learning_rate": 5.705358366451967e-06, + "loss": 0.1132, + "step": 32089 + }, + { + "epoch": 2.035241454150841, + "grad_norm": 3.6445207595825195, + "learning_rate": 5.705221323831712e-06, + "loss": 0.1346, + "step": 32090 + }, + { + "epoch": 2.035255018990776, + "grad_norm": 4.659468173980713, + "learning_rate": 5.705084281211457e-06, + "loss": 0.2056, + "step": 32091 + }, + { + "epoch": 2.0352685838307107, + "grad_norm": 4.520008563995361, + "learning_rate": 5.704947238591202e-06, + "loss": 0.1601, + "step": 32092 + }, + { + "epoch": 2.0352821486706456, + "grad_norm": 3.639458179473877, + "learning_rate": 5.704810195970948e-06, + "loss": 0.1093, + "step": 32093 + }, + { + "epoch": 2.0352957135105805, + "grad_norm": 4.66680908203125, + "learning_rate": 5.704673153350693e-06, + "loss": 0.1824, + "step": 32094 + }, + { + "epoch": 2.0353092783505153, + "grad_norm": 4.112683296203613, + "learning_rate": 5.704536110730437e-06, + "loss": 0.148, + "step": 32095 + }, + { + "epoch": 2.03532284319045, + "grad_norm": 5.174907207489014, + "learning_rate": 5.7043990681101824e-06, + "loss": 0.2127, + "step": 32096 + }, + { + "epoch": 2.035336408030385, + "grad_norm": 4.465386867523193, + "learning_rate": 5.7042620254899285e-06, + "loss": 0.2151, + "step": 32097 + }, + { + "epoch": 2.03534997287032, + "grad_norm": 4.427164077758789, + "learning_rate": 5.704124982869673e-06, + "loss": 0.1606, + "step": 32098 + }, + { + "epoch": 2.0353635377102552, + "grad_norm": 5.457852363586426, + "learning_rate": 5.703987940249418e-06, + "loss": 0.2149, + "step": 32099 + }, + { + "epoch": 2.03537710255019, + "grad_norm": 4.212426662445068, + "learning_rate": 5.703850897629163e-06, + "loss": 0.1564, + "step": 32100 + }, + { + "epoch": 2.035390667390125, + "grad_norm": 3.447335720062256, + "learning_rate": 5.7037138550089075e-06, + "loss": 0.1047, + "step": 32101 + }, + { + "epoch": 2.03540423223006, + "grad_norm": 4.021501064300537, + "learning_rate": 5.7035768123886535e-06, + "loss": 0.1934, + "step": 32102 + }, + { + "epoch": 2.0354177970699947, + "grad_norm": 7.279144287109375, + "learning_rate": 5.703439769768399e-06, + "loss": 0.3436, + "step": 32103 + }, + { + "epoch": 2.0354313619099296, + "grad_norm": 3.1919808387756348, + "learning_rate": 5.703302727148143e-06, + "loss": 0.1271, + "step": 32104 + }, + { + "epoch": 2.0354449267498644, + "grad_norm": 4.490861415863037, + "learning_rate": 5.703165684527888e-06, + "loss": 0.2018, + "step": 32105 + }, + { + "epoch": 2.0354584915897993, + "grad_norm": 5.255126953125, + "learning_rate": 5.703028641907634e-06, + "loss": 0.2054, + "step": 32106 + }, + { + "epoch": 2.035472056429734, + "grad_norm": 5.056951999664307, + "learning_rate": 5.702891599287379e-06, + "loss": 0.2096, + "step": 32107 + }, + { + "epoch": 2.035485621269669, + "grad_norm": 3.5048329830169678, + "learning_rate": 5.702754556667124e-06, + "loss": 0.1976, + "step": 32108 + }, + { + "epoch": 2.035499186109604, + "grad_norm": 3.805506944656372, + "learning_rate": 5.702617514046869e-06, + "loss": 0.1339, + "step": 32109 + }, + { + "epoch": 2.0355127509495388, + "grad_norm": 3.7373220920562744, + "learning_rate": 5.702480471426615e-06, + "loss": 0.1001, + "step": 32110 + }, + { + "epoch": 2.0355263157894736, + "grad_norm": 5.588230133056641, + "learning_rate": 5.702343428806359e-06, + "loss": 0.2074, + "step": 32111 + }, + { + "epoch": 2.0355398806294085, + "grad_norm": 6.018347263336182, + "learning_rate": 5.7022063861861045e-06, + "loss": 0.3015, + "step": 32112 + }, + { + "epoch": 2.0355534454693434, + "grad_norm": 5.116769313812256, + "learning_rate": 5.702069343565849e-06, + "loss": 0.2327, + "step": 32113 + }, + { + "epoch": 2.0355670103092782, + "grad_norm": 5.754708766937256, + "learning_rate": 5.701932300945594e-06, + "loss": 0.3438, + "step": 32114 + }, + { + "epoch": 2.035580575149213, + "grad_norm": 3.590080499649048, + "learning_rate": 5.70179525832534e-06, + "loss": 0.1379, + "step": 32115 + }, + { + "epoch": 2.035594139989148, + "grad_norm": 4.533808708190918, + "learning_rate": 5.701658215705085e-06, + "loss": 0.1489, + "step": 32116 + }, + { + "epoch": 2.035607704829083, + "grad_norm": 4.337202548980713, + "learning_rate": 5.7015211730848295e-06, + "loss": 0.144, + "step": 32117 + }, + { + "epoch": 2.0356212696690177, + "grad_norm": 4.017757892608643, + "learning_rate": 5.701384130464575e-06, + "loss": 0.1522, + "step": 32118 + }, + { + "epoch": 2.035634834508953, + "grad_norm": 4.803829193115234, + "learning_rate": 5.701247087844321e-06, + "loss": 0.1779, + "step": 32119 + }, + { + "epoch": 2.035648399348888, + "grad_norm": 3.703119993209839, + "learning_rate": 5.701110045224065e-06, + "loss": 0.2223, + "step": 32120 + }, + { + "epoch": 2.0356619641888227, + "grad_norm": 5.831074237823486, + "learning_rate": 5.70097300260381e-06, + "loss": 0.2611, + "step": 32121 + }, + { + "epoch": 2.0356755290287576, + "grad_norm": 5.556247234344482, + "learning_rate": 5.700835959983555e-06, + "loss": 0.2002, + "step": 32122 + }, + { + "epoch": 2.0356890938686925, + "grad_norm": 4.65195369720459, + "learning_rate": 5.700698917363301e-06, + "loss": 0.2305, + "step": 32123 + }, + { + "epoch": 2.0357026587086273, + "grad_norm": 6.60451078414917, + "learning_rate": 5.700561874743046e-06, + "loss": 0.2428, + "step": 32124 + }, + { + "epoch": 2.035716223548562, + "grad_norm": 5.131845951080322, + "learning_rate": 5.700424832122791e-06, + "loss": 0.1719, + "step": 32125 + }, + { + "epoch": 2.035729788388497, + "grad_norm": 5.6437811851501465, + "learning_rate": 5.700287789502535e-06, + "loss": 0.1916, + "step": 32126 + }, + { + "epoch": 2.035743353228432, + "grad_norm": 4.128879547119141, + "learning_rate": 5.7001507468822804e-06, + "loss": 0.1517, + "step": 32127 + }, + { + "epoch": 2.035756918068367, + "grad_norm": 5.291683197021484, + "learning_rate": 5.7000137042620265e-06, + "loss": 0.1993, + "step": 32128 + }, + { + "epoch": 2.0357704829083016, + "grad_norm": 6.008725643157959, + "learning_rate": 5.699876661641771e-06, + "loss": 0.3541, + "step": 32129 + }, + { + "epoch": 2.0357840477482365, + "grad_norm": 5.458795547485352, + "learning_rate": 5.699739619021516e-06, + "loss": 0.2358, + "step": 32130 + }, + { + "epoch": 2.0357976125881714, + "grad_norm": 3.365602493286133, + "learning_rate": 5.699602576401261e-06, + "loss": 0.1423, + "step": 32131 + }, + { + "epoch": 2.0358111774281062, + "grad_norm": 5.282225608825684, + "learning_rate": 5.699465533781006e-06, + "loss": 0.2171, + "step": 32132 + }, + { + "epoch": 2.035824742268041, + "grad_norm": 7.585700511932373, + "learning_rate": 5.6993284911607515e-06, + "loss": 0.2317, + "step": 32133 + }, + { + "epoch": 2.035838307107976, + "grad_norm": 4.517817497253418, + "learning_rate": 5.699191448540497e-06, + "loss": 0.0957, + "step": 32134 + }, + { + "epoch": 2.035851871947911, + "grad_norm": 5.891587257385254, + "learning_rate": 5.699054405920241e-06, + "loss": 0.2064, + "step": 32135 + }, + { + "epoch": 2.0358654367878457, + "grad_norm": 3.9401988983154297, + "learning_rate": 5.698917363299987e-06, + "loss": 0.1532, + "step": 32136 + }, + { + "epoch": 2.035879001627781, + "grad_norm": 4.735526084899902, + "learning_rate": 5.698780320679732e-06, + "loss": 0.1779, + "step": 32137 + }, + { + "epoch": 2.035892566467716, + "grad_norm": 5.561591148376465, + "learning_rate": 5.6986432780594766e-06, + "loss": 0.3308, + "step": 32138 + }, + { + "epoch": 2.0359061313076507, + "grad_norm": 6.910447597503662, + "learning_rate": 5.698506235439222e-06, + "loss": 0.3047, + "step": 32139 + }, + { + "epoch": 2.0359196961475856, + "grad_norm": 4.449025630950928, + "learning_rate": 5.698369192818967e-06, + "loss": 0.2009, + "step": 32140 + }, + { + "epoch": 2.0359332609875205, + "grad_norm": 4.134695053100586, + "learning_rate": 5.698232150198713e-06, + "loss": 0.1771, + "step": 32141 + }, + { + "epoch": 2.0359468258274553, + "grad_norm": 4.315830230712891, + "learning_rate": 5.698095107578457e-06, + "loss": 0.1879, + "step": 32142 + }, + { + "epoch": 2.03596039066739, + "grad_norm": 5.627399921417236, + "learning_rate": 5.6979580649582025e-06, + "loss": 0.2551, + "step": 32143 + }, + { + "epoch": 2.035973955507325, + "grad_norm": 4.9318647384643555, + "learning_rate": 5.697821022337947e-06, + "loss": 0.115, + "step": 32144 + }, + { + "epoch": 2.03598752034726, + "grad_norm": 4.723263263702393, + "learning_rate": 5.697683979717693e-06, + "loss": 0.2704, + "step": 32145 + }, + { + "epoch": 2.036001085187195, + "grad_norm": 5.002097129821777, + "learning_rate": 5.697546937097438e-06, + "loss": 0.192, + "step": 32146 + }, + { + "epoch": 2.0360146500271297, + "grad_norm": 3.863194704055786, + "learning_rate": 5.697409894477182e-06, + "loss": 0.1342, + "step": 32147 + }, + { + "epoch": 2.0360282148670645, + "grad_norm": 5.03292179107666, + "learning_rate": 5.6972728518569275e-06, + "loss": 0.1719, + "step": 32148 + }, + { + "epoch": 2.0360417797069994, + "grad_norm": 5.480430603027344, + "learning_rate": 5.6971358092366735e-06, + "loss": 0.2419, + "step": 32149 + }, + { + "epoch": 2.0360553445469343, + "grad_norm": 5.5309224128723145, + "learning_rate": 5.696998766616419e-06, + "loss": 0.3288, + "step": 32150 + }, + { + "epoch": 2.036068909386869, + "grad_norm": 5.277033805847168, + "learning_rate": 5.696861723996163e-06, + "loss": 0.1966, + "step": 32151 + }, + { + "epoch": 2.036082474226804, + "grad_norm": 6.080153942108154, + "learning_rate": 5.696724681375908e-06, + "loss": 0.1787, + "step": 32152 + }, + { + "epoch": 2.036096039066739, + "grad_norm": 5.387202262878418, + "learning_rate": 5.696587638755654e-06, + "loss": 0.1768, + "step": 32153 + }, + { + "epoch": 2.0361096039066737, + "grad_norm": 6.205844402313232, + "learning_rate": 5.696450596135399e-06, + "loss": 0.214, + "step": 32154 + }, + { + "epoch": 2.0361231687466086, + "grad_norm": 5.592677116394043, + "learning_rate": 5.696313553515144e-06, + "loss": 0.1989, + "step": 32155 + }, + { + "epoch": 2.0361367335865435, + "grad_norm": 4.80960750579834, + "learning_rate": 5.696176510894889e-06, + "loss": 0.2197, + "step": 32156 + }, + { + "epoch": 2.0361502984264788, + "grad_norm": 3.888180732727051, + "learning_rate": 5.696039468274633e-06, + "loss": 0.1963, + "step": 32157 + }, + { + "epoch": 2.0361638632664136, + "grad_norm": 5.769931793212891, + "learning_rate": 5.695902425654379e-06, + "loss": 0.2066, + "step": 32158 + }, + { + "epoch": 2.0361774281063485, + "grad_norm": 5.560713768005371, + "learning_rate": 5.6957653830341245e-06, + "loss": 0.2364, + "step": 32159 + }, + { + "epoch": 2.0361909929462834, + "grad_norm": 4.744348049163818, + "learning_rate": 5.695628340413869e-06, + "loss": 0.1812, + "step": 32160 + }, + { + "epoch": 2.0362045577862182, + "grad_norm": 4.042556285858154, + "learning_rate": 5.695491297793614e-06, + "loss": 0.1329, + "step": 32161 + }, + { + "epoch": 2.036218122626153, + "grad_norm": 4.692816734313965, + "learning_rate": 5.69535425517336e-06, + "loss": 0.2131, + "step": 32162 + }, + { + "epoch": 2.036231687466088, + "grad_norm": 6.010852336883545, + "learning_rate": 5.695217212553104e-06, + "loss": 0.2457, + "step": 32163 + }, + { + "epoch": 2.036245252306023, + "grad_norm": 4.479952335357666, + "learning_rate": 5.6950801699328495e-06, + "loss": 0.1728, + "step": 32164 + }, + { + "epoch": 2.0362588171459577, + "grad_norm": 4.304853439331055, + "learning_rate": 5.694943127312595e-06, + "loss": 0.2208, + "step": 32165 + }, + { + "epoch": 2.0362723819858926, + "grad_norm": 4.210478782653809, + "learning_rate": 5.694806084692341e-06, + "loss": 0.1376, + "step": 32166 + }, + { + "epoch": 2.0362859468258274, + "grad_norm": 5.0697855949401855, + "learning_rate": 5.694669042072085e-06, + "loss": 0.2158, + "step": 32167 + }, + { + "epoch": 2.0362995116657623, + "grad_norm": 4.558598041534424, + "learning_rate": 5.69453199945183e-06, + "loss": 0.2657, + "step": 32168 + }, + { + "epoch": 2.036313076505697, + "grad_norm": 3.4767093658447266, + "learning_rate": 5.6943949568315746e-06, + "loss": 0.1805, + "step": 32169 + }, + { + "epoch": 2.036326641345632, + "grad_norm": 3.7856550216674805, + "learning_rate": 5.69425791421132e-06, + "loss": 0.2059, + "step": 32170 + }, + { + "epoch": 2.036340206185567, + "grad_norm": 5.426961898803711, + "learning_rate": 5.694120871591066e-06, + "loss": 0.258, + "step": 32171 + }, + { + "epoch": 2.0363537710255017, + "grad_norm": 5.798883438110352, + "learning_rate": 5.69398382897081e-06, + "loss": 0.2505, + "step": 32172 + }, + { + "epoch": 2.0363673358654366, + "grad_norm": 4.655064582824707, + "learning_rate": 5.693846786350555e-06, + "loss": 0.1507, + "step": 32173 + }, + { + "epoch": 2.0363809007053715, + "grad_norm": 4.2273335456848145, + "learning_rate": 5.6937097437303005e-06, + "loss": 0.1812, + "step": 32174 + }, + { + "epoch": 2.036394465545307, + "grad_norm": 4.701847076416016, + "learning_rate": 5.6935727011100465e-06, + "loss": 0.244, + "step": 32175 + }, + { + "epoch": 2.0364080303852417, + "grad_norm": 5.285560607910156, + "learning_rate": 5.693435658489791e-06, + "loss": 0.2331, + "step": 32176 + }, + { + "epoch": 2.0364215952251765, + "grad_norm": 3.7215800285339355, + "learning_rate": 5.693298615869536e-06, + "loss": 0.198, + "step": 32177 + }, + { + "epoch": 2.0364351600651114, + "grad_norm": 4.311730861663818, + "learning_rate": 5.69316157324928e-06, + "loss": 0.1513, + "step": 32178 + }, + { + "epoch": 2.0364487249050462, + "grad_norm": 3.7995994091033936, + "learning_rate": 5.693024530629026e-06, + "loss": 0.1722, + "step": 32179 + }, + { + "epoch": 2.036462289744981, + "grad_norm": 4.909282207489014, + "learning_rate": 5.6928874880087715e-06, + "loss": 0.1367, + "step": 32180 + }, + { + "epoch": 2.036475854584916, + "grad_norm": 5.104074478149414, + "learning_rate": 5.692750445388516e-06, + "loss": 0.1856, + "step": 32181 + }, + { + "epoch": 2.036489419424851, + "grad_norm": 4.983562469482422, + "learning_rate": 5.692613402768261e-06, + "loss": 0.2005, + "step": 32182 + }, + { + "epoch": 2.0365029842647857, + "grad_norm": 3.8215010166168213, + "learning_rate": 5.692476360148006e-06, + "loss": 0.1685, + "step": 32183 + }, + { + "epoch": 2.0365165491047206, + "grad_norm": 4.975784778594971, + "learning_rate": 5.692339317527752e-06, + "loss": 0.1952, + "step": 32184 + }, + { + "epoch": 2.0365301139446554, + "grad_norm": 3.7766382694244385, + "learning_rate": 5.692202274907497e-06, + "loss": 0.1549, + "step": 32185 + }, + { + "epoch": 2.0365436787845903, + "grad_norm": 4.713341236114502, + "learning_rate": 5.692065232287242e-06, + "loss": 0.2338, + "step": 32186 + }, + { + "epoch": 2.036557243624525, + "grad_norm": 3.9982290267944336, + "learning_rate": 5.691928189666986e-06, + "loss": 0.2089, + "step": 32187 + }, + { + "epoch": 2.03657080846446, + "grad_norm": 6.803133964538574, + "learning_rate": 5.691791147046732e-06, + "loss": 0.2328, + "step": 32188 + }, + { + "epoch": 2.036584373304395, + "grad_norm": 5.444317817687988, + "learning_rate": 5.691654104426477e-06, + "loss": 0.2309, + "step": 32189 + }, + { + "epoch": 2.0365979381443298, + "grad_norm": 3.5882012844085693, + "learning_rate": 5.6915170618062225e-06, + "loss": 0.1129, + "step": 32190 + }, + { + "epoch": 2.0366115029842646, + "grad_norm": 4.17536735534668, + "learning_rate": 5.691380019185967e-06, + "loss": 0.1745, + "step": 32191 + }, + { + "epoch": 2.0366250678241995, + "grad_norm": 3.2330753803253174, + "learning_rate": 5.691242976565713e-06, + "loss": 0.208, + "step": 32192 + }, + { + "epoch": 2.0366386326641344, + "grad_norm": 3.9525351524353027, + "learning_rate": 5.691105933945458e-06, + "loss": 0.1581, + "step": 32193 + }, + { + "epoch": 2.0366521975040692, + "grad_norm": 3.328669786453247, + "learning_rate": 5.690968891325202e-06, + "loss": 0.0891, + "step": 32194 + }, + { + "epoch": 2.0366657623440045, + "grad_norm": 4.90842866897583, + "learning_rate": 5.6908318487049475e-06, + "loss": 0.1694, + "step": 32195 + }, + { + "epoch": 2.0366793271839394, + "grad_norm": 4.9007887840271, + "learning_rate": 5.690694806084692e-06, + "loss": 0.1657, + "step": 32196 + }, + { + "epoch": 2.0366928920238743, + "grad_norm": 4.422636032104492, + "learning_rate": 5.690557763464438e-06, + "loss": 0.1818, + "step": 32197 + }, + { + "epoch": 2.036706456863809, + "grad_norm": 7.260521411895752, + "learning_rate": 5.690420720844183e-06, + "loss": 0.2732, + "step": 32198 + }, + { + "epoch": 2.036720021703744, + "grad_norm": 6.2785491943359375, + "learning_rate": 5.690283678223928e-06, + "loss": 0.2622, + "step": 32199 + }, + { + "epoch": 2.036733586543679, + "grad_norm": 3.7345688343048096, + "learning_rate": 5.690146635603673e-06, + "loss": 0.1993, + "step": 32200 + }, + { + "epoch": 2.0367471513836137, + "grad_norm": 4.886543273925781, + "learning_rate": 5.690009592983419e-06, + "loss": 0.1769, + "step": 32201 + }, + { + "epoch": 2.0367607162235486, + "grad_norm": 3.9901747703552246, + "learning_rate": 5.689872550363164e-06, + "loss": 0.1509, + "step": 32202 + }, + { + "epoch": 2.0367742810634835, + "grad_norm": 4.158050060272217, + "learning_rate": 5.689735507742908e-06, + "loss": 0.2129, + "step": 32203 + }, + { + "epoch": 2.0367878459034183, + "grad_norm": 6.391851425170898, + "learning_rate": 5.689598465122653e-06, + "loss": 0.2267, + "step": 32204 + }, + { + "epoch": 2.036801410743353, + "grad_norm": 3.3140065670013428, + "learning_rate": 5.689461422502399e-06, + "loss": 0.1305, + "step": 32205 + }, + { + "epoch": 2.036814975583288, + "grad_norm": 4.976333141326904, + "learning_rate": 5.689324379882144e-06, + "loss": 0.2207, + "step": 32206 + }, + { + "epoch": 2.036828540423223, + "grad_norm": 4.048966884613037, + "learning_rate": 5.689187337261889e-06, + "loss": 0.2139, + "step": 32207 + }, + { + "epoch": 2.036842105263158, + "grad_norm": 4.740118503570557, + "learning_rate": 5.689050294641634e-06, + "loss": 0.2023, + "step": 32208 + }, + { + "epoch": 2.0368556701030927, + "grad_norm": 3.53865909576416, + "learning_rate": 5.688913252021378e-06, + "loss": 0.1872, + "step": 32209 + }, + { + "epoch": 2.0368692349430275, + "grad_norm": 3.425753355026245, + "learning_rate": 5.688776209401124e-06, + "loss": 0.1659, + "step": 32210 + }, + { + "epoch": 2.0368827997829624, + "grad_norm": 4.7194037437438965, + "learning_rate": 5.6886391667808695e-06, + "loss": 0.2193, + "step": 32211 + }, + { + "epoch": 2.0368963646228972, + "grad_norm": 4.519185543060303, + "learning_rate": 5.688502124160614e-06, + "loss": 0.221, + "step": 32212 + }, + { + "epoch": 2.0369099294628326, + "grad_norm": 3.270848035812378, + "learning_rate": 5.688365081540359e-06, + "loss": 0.178, + "step": 32213 + }, + { + "epoch": 2.0369234943027674, + "grad_norm": 4.024291038513184, + "learning_rate": 5.688228038920105e-06, + "loss": 0.1388, + "step": 32214 + }, + { + "epoch": 2.0369370591427023, + "grad_norm": 4.421721935272217, + "learning_rate": 5.68809099629985e-06, + "loss": 0.2157, + "step": 32215 + }, + { + "epoch": 2.036950623982637, + "grad_norm": 4.516293525695801, + "learning_rate": 5.687953953679595e-06, + "loss": 0.2428, + "step": 32216 + }, + { + "epoch": 2.036964188822572, + "grad_norm": 4.34290885925293, + "learning_rate": 5.68781691105934e-06, + "loss": 0.2062, + "step": 32217 + }, + { + "epoch": 2.036977753662507, + "grad_norm": 4.450679302215576, + "learning_rate": 5.687679868439086e-06, + "loss": 0.1844, + "step": 32218 + }, + { + "epoch": 2.0369913185024418, + "grad_norm": 4.802562236785889, + "learning_rate": 5.68754282581883e-06, + "loss": 0.2346, + "step": 32219 + }, + { + "epoch": 2.0370048833423766, + "grad_norm": 5.184760093688965, + "learning_rate": 5.687405783198575e-06, + "loss": 0.2125, + "step": 32220 + }, + { + "epoch": 2.0370184481823115, + "grad_norm": 3.6629629135131836, + "learning_rate": 5.68726874057832e-06, + "loss": 0.1419, + "step": 32221 + }, + { + "epoch": 2.0370320130222463, + "grad_norm": 3.8088927268981934, + "learning_rate": 5.687131697958066e-06, + "loss": 0.1759, + "step": 32222 + }, + { + "epoch": 2.037045577862181, + "grad_norm": 3.6159675121307373, + "learning_rate": 5.686994655337811e-06, + "loss": 0.108, + "step": 32223 + }, + { + "epoch": 2.037059142702116, + "grad_norm": 4.923719882965088, + "learning_rate": 5.686857612717556e-06, + "loss": 0.2321, + "step": 32224 + }, + { + "epoch": 2.037072707542051, + "grad_norm": 5.7115373611450195, + "learning_rate": 5.6867205700973e-06, + "loss": 0.2479, + "step": 32225 + }, + { + "epoch": 2.037086272381986, + "grad_norm": 3.182872772216797, + "learning_rate": 5.6865835274770455e-06, + "loss": 0.1389, + "step": 32226 + }, + { + "epoch": 2.0370998372219207, + "grad_norm": 5.7203192710876465, + "learning_rate": 5.6864464848567916e-06, + "loss": 0.1688, + "step": 32227 + }, + { + "epoch": 2.0371134020618555, + "grad_norm": 3.8890936374664307, + "learning_rate": 5.686309442236536e-06, + "loss": 0.102, + "step": 32228 + }, + { + "epoch": 2.0371269669017904, + "grad_norm": 4.54682731628418, + "learning_rate": 5.686172399616281e-06, + "loss": 0.2316, + "step": 32229 + }, + { + "epoch": 2.0371405317417253, + "grad_norm": 5.377321720123291, + "learning_rate": 5.686035356996025e-06, + "loss": 0.2291, + "step": 32230 + }, + { + "epoch": 2.03715409658166, + "grad_norm": 5.2304582595825195, + "learning_rate": 5.6858983143757714e-06, + "loss": 0.1918, + "step": 32231 + }, + { + "epoch": 2.0371676614215954, + "grad_norm": 5.215002536773682, + "learning_rate": 5.685761271755517e-06, + "loss": 0.2094, + "step": 32232 + }, + { + "epoch": 2.0371812262615303, + "grad_norm": 4.569885730743408, + "learning_rate": 5.685624229135262e-06, + "loss": 0.2457, + "step": 32233 + }, + { + "epoch": 2.037194791101465, + "grad_norm": 3.621128559112549, + "learning_rate": 5.685487186515006e-06, + "loss": 0.0978, + "step": 32234 + }, + { + "epoch": 2.0372083559414, + "grad_norm": 3.3367812633514404, + "learning_rate": 5.685350143894752e-06, + "loss": 0.0808, + "step": 32235 + }, + { + "epoch": 2.037221920781335, + "grad_norm": 4.140963554382324, + "learning_rate": 5.685213101274497e-06, + "loss": 0.1846, + "step": 32236 + }, + { + "epoch": 2.0372354856212698, + "grad_norm": 5.7018842697143555, + "learning_rate": 5.685076058654242e-06, + "loss": 0.1638, + "step": 32237 + }, + { + "epoch": 2.0372490504612046, + "grad_norm": 4.9272918701171875, + "learning_rate": 5.684939016033987e-06, + "loss": 0.2244, + "step": 32238 + }, + { + "epoch": 2.0372626153011395, + "grad_norm": 3.7948851585388184, + "learning_rate": 5.684801973413732e-06, + "loss": 0.2058, + "step": 32239 + }, + { + "epoch": 2.0372761801410744, + "grad_norm": 3.9062252044677734, + "learning_rate": 5.684664930793477e-06, + "loss": 0.1461, + "step": 32240 + }, + { + "epoch": 2.0372897449810092, + "grad_norm": 4.493876934051514, + "learning_rate": 5.684527888173222e-06, + "loss": 0.204, + "step": 32241 + }, + { + "epoch": 2.037303309820944, + "grad_norm": 4.891174793243408, + "learning_rate": 5.6843908455529675e-06, + "loss": 0.1725, + "step": 32242 + }, + { + "epoch": 2.037316874660879, + "grad_norm": 4.914257526397705, + "learning_rate": 5.684253802932712e-06, + "loss": 0.1797, + "step": 32243 + }, + { + "epoch": 2.037330439500814, + "grad_norm": 3.987082004547119, + "learning_rate": 5.684116760312458e-06, + "loss": 0.1461, + "step": 32244 + }, + { + "epoch": 2.0373440043407487, + "grad_norm": 3.9321935176849365, + "learning_rate": 5.683979717692203e-06, + "loss": 0.1952, + "step": 32245 + }, + { + "epoch": 2.0373575691806836, + "grad_norm": 4.755430698394775, + "learning_rate": 5.683842675071947e-06, + "loss": 0.1664, + "step": 32246 + }, + { + "epoch": 2.0373711340206184, + "grad_norm": 3.873556137084961, + "learning_rate": 5.683705632451693e-06, + "loss": 0.1225, + "step": 32247 + }, + { + "epoch": 2.0373846988605533, + "grad_norm": 4.806483268737793, + "learning_rate": 5.683568589831439e-06, + "loss": 0.1466, + "step": 32248 + }, + { + "epoch": 2.037398263700488, + "grad_norm": 3.489565372467041, + "learning_rate": 5.683431547211184e-06, + "loss": 0.0968, + "step": 32249 + }, + { + "epoch": 2.037411828540423, + "grad_norm": 7.494976997375488, + "learning_rate": 5.683294504590928e-06, + "loss": 0.3047, + "step": 32250 + }, + { + "epoch": 2.0374253933803583, + "grad_norm": 3.342339515686035, + "learning_rate": 5.683157461970673e-06, + "loss": 0.136, + "step": 32251 + }, + { + "epoch": 2.037438958220293, + "grad_norm": 6.40392541885376, + "learning_rate": 5.683020419350418e-06, + "loss": 0.2847, + "step": 32252 + }, + { + "epoch": 2.037452523060228, + "grad_norm": 3.3637747764587402, + "learning_rate": 5.682883376730164e-06, + "loss": 0.111, + "step": 32253 + }, + { + "epoch": 2.037466087900163, + "grad_norm": 5.657938480377197, + "learning_rate": 5.682746334109909e-06, + "loss": 0.1335, + "step": 32254 + }, + { + "epoch": 2.037479652740098, + "grad_norm": 4.360304355621338, + "learning_rate": 5.682609291489653e-06, + "loss": 0.1466, + "step": 32255 + }, + { + "epoch": 2.0374932175800327, + "grad_norm": 4.994993209838867, + "learning_rate": 5.682472248869398e-06, + "loss": 0.1931, + "step": 32256 + }, + { + "epoch": 2.0375067824199675, + "grad_norm": 7.264434814453125, + "learning_rate": 5.682335206249144e-06, + "loss": 0.2652, + "step": 32257 + }, + { + "epoch": 2.0375203472599024, + "grad_norm": 6.104292869567871, + "learning_rate": 5.6821981636288896e-06, + "loss": 0.273, + "step": 32258 + }, + { + "epoch": 2.0375339120998373, + "grad_norm": 4.902286529541016, + "learning_rate": 5.682061121008634e-06, + "loss": 0.1815, + "step": 32259 + }, + { + "epoch": 2.037547476939772, + "grad_norm": 4.947573661804199, + "learning_rate": 5.681924078388379e-06, + "loss": 0.1439, + "step": 32260 + }, + { + "epoch": 2.037561041779707, + "grad_norm": 4.831869602203369, + "learning_rate": 5.681787035768125e-06, + "loss": 0.1467, + "step": 32261 + }, + { + "epoch": 2.037574606619642, + "grad_norm": 3.6556074619293213, + "learning_rate": 5.6816499931478694e-06, + "loss": 0.1057, + "step": 32262 + }, + { + "epoch": 2.0375881714595767, + "grad_norm": 4.891834259033203, + "learning_rate": 5.681512950527615e-06, + "loss": 0.1287, + "step": 32263 + }, + { + "epoch": 2.0376017362995116, + "grad_norm": 5.537383079528809, + "learning_rate": 5.68137590790736e-06, + "loss": 0.2148, + "step": 32264 + }, + { + "epoch": 2.0376153011394464, + "grad_norm": 4.179056167602539, + "learning_rate": 5.681238865287104e-06, + "loss": 0.1572, + "step": 32265 + }, + { + "epoch": 2.0376288659793813, + "grad_norm": 3.823326587677002, + "learning_rate": 5.68110182266685e-06, + "loss": 0.1607, + "step": 32266 + }, + { + "epoch": 2.037642430819316, + "grad_norm": 22.76288414001465, + "learning_rate": 5.680964780046595e-06, + "loss": 0.1459, + "step": 32267 + }, + { + "epoch": 2.037655995659251, + "grad_norm": 2.8490724563598633, + "learning_rate": 5.68082773742634e-06, + "loss": 0.1412, + "step": 32268 + }, + { + "epoch": 2.037669560499186, + "grad_norm": 4.14873743057251, + "learning_rate": 5.680690694806085e-06, + "loss": 0.1615, + "step": 32269 + }, + { + "epoch": 2.037683125339121, + "grad_norm": 3.9008948802948, + "learning_rate": 5.680553652185831e-06, + "loss": 0.1569, + "step": 32270 + }, + { + "epoch": 2.037696690179056, + "grad_norm": 6.205657005310059, + "learning_rate": 5.680416609565575e-06, + "loss": 0.3708, + "step": 32271 + }, + { + "epoch": 2.037710255018991, + "grad_norm": 4.248453140258789, + "learning_rate": 5.68027956694532e-06, + "loss": 0.1664, + "step": 32272 + }, + { + "epoch": 2.037723819858926, + "grad_norm": 6.824501037597656, + "learning_rate": 5.6801425243250656e-06, + "loss": 0.2617, + "step": 32273 + }, + { + "epoch": 2.0377373846988607, + "grad_norm": 6.017183303833008, + "learning_rate": 5.680005481704811e-06, + "loss": 0.2883, + "step": 32274 + }, + { + "epoch": 2.0377509495387955, + "grad_norm": 4.428195476531982, + "learning_rate": 5.679868439084556e-06, + "loss": 0.1273, + "step": 32275 + }, + { + "epoch": 2.0377645143787304, + "grad_norm": 4.005402088165283, + "learning_rate": 5.679731396464301e-06, + "loss": 0.1547, + "step": 32276 + }, + { + "epoch": 2.0377780792186653, + "grad_norm": 3.643556594848633, + "learning_rate": 5.679594353844045e-06, + "loss": 0.1733, + "step": 32277 + }, + { + "epoch": 2.0377916440586, + "grad_norm": 4.814225673675537, + "learning_rate": 5.679457311223791e-06, + "loss": 0.2118, + "step": 32278 + }, + { + "epoch": 2.037805208898535, + "grad_norm": 4.668099880218506, + "learning_rate": 5.679320268603537e-06, + "loss": 0.263, + "step": 32279 + }, + { + "epoch": 2.03781877373847, + "grad_norm": 5.247705459594727, + "learning_rate": 5.679183225983281e-06, + "loss": 0.1642, + "step": 32280 + }, + { + "epoch": 2.0378323385784047, + "grad_norm": 4.616891860961914, + "learning_rate": 5.679046183363026e-06, + "loss": 0.2404, + "step": 32281 + }, + { + "epoch": 2.0378459034183396, + "grad_norm": 4.834565162658691, + "learning_rate": 5.678909140742771e-06, + "loss": 0.1312, + "step": 32282 + }, + { + "epoch": 2.0378594682582745, + "grad_norm": 4.604322910308838, + "learning_rate": 5.678772098122517e-06, + "loss": 0.2334, + "step": 32283 + }, + { + "epoch": 2.0378730330982093, + "grad_norm": 3.884709596633911, + "learning_rate": 5.678635055502262e-06, + "loss": 0.1382, + "step": 32284 + }, + { + "epoch": 2.037886597938144, + "grad_norm": 4.826648712158203, + "learning_rate": 5.678498012882007e-06, + "loss": 0.1636, + "step": 32285 + }, + { + "epoch": 2.037900162778079, + "grad_norm": 3.5830154418945312, + "learning_rate": 5.678360970261751e-06, + "loss": 0.0982, + "step": 32286 + }, + { + "epoch": 2.037913727618014, + "grad_norm": 5.73073673248291, + "learning_rate": 5.678223927641497e-06, + "loss": 0.3024, + "step": 32287 + }, + { + "epoch": 2.037927292457949, + "grad_norm": 3.367213726043701, + "learning_rate": 5.678086885021242e-06, + "loss": 0.1342, + "step": 32288 + }, + { + "epoch": 2.037940857297884, + "grad_norm": 4.001850128173828, + "learning_rate": 5.677949842400987e-06, + "loss": 0.1456, + "step": 32289 + }, + { + "epoch": 2.037954422137819, + "grad_norm": 7.363819599151611, + "learning_rate": 5.677812799780732e-06, + "loss": 0.1621, + "step": 32290 + }, + { + "epoch": 2.037967986977754, + "grad_norm": 4.68760871887207, + "learning_rate": 5.677675757160478e-06, + "loss": 0.1159, + "step": 32291 + }, + { + "epoch": 2.0379815518176887, + "grad_norm": 3.753704786300659, + "learning_rate": 5.677538714540223e-06, + "loss": 0.122, + "step": 32292 + }, + { + "epoch": 2.0379951166576236, + "grad_norm": 4.824158191680908, + "learning_rate": 5.6774016719199674e-06, + "loss": 0.2443, + "step": 32293 + }, + { + "epoch": 2.0380086814975584, + "grad_norm": 5.381950378417969, + "learning_rate": 5.677264629299713e-06, + "loss": 0.1531, + "step": 32294 + }, + { + "epoch": 2.0380222463374933, + "grad_norm": 5.146902084350586, + "learning_rate": 5.677127586679457e-06, + "loss": 0.2373, + "step": 32295 + }, + { + "epoch": 2.038035811177428, + "grad_norm": 5.142261028289795, + "learning_rate": 5.676990544059203e-06, + "loss": 0.1921, + "step": 32296 + }, + { + "epoch": 2.038049376017363, + "grad_norm": 5.344785690307617, + "learning_rate": 5.676853501438948e-06, + "loss": 0.19, + "step": 32297 + }, + { + "epoch": 2.038062940857298, + "grad_norm": 4.156361103057861, + "learning_rate": 5.676716458818693e-06, + "loss": 0.202, + "step": 32298 + }, + { + "epoch": 2.0380765056972328, + "grad_norm": 4.3514251708984375, + "learning_rate": 5.676579416198438e-06, + "loss": 0.1757, + "step": 32299 + }, + { + "epoch": 2.0380900705371676, + "grad_norm": 5.238565921783447, + "learning_rate": 5.676442373578184e-06, + "loss": 0.2398, + "step": 32300 + }, + { + "epoch": 2.0381036353771025, + "grad_norm": 4.950132369995117, + "learning_rate": 5.676305330957929e-06, + "loss": 0.1977, + "step": 32301 + }, + { + "epoch": 2.0381172002170374, + "grad_norm": 5.432149410247803, + "learning_rate": 5.676168288337673e-06, + "loss": 0.2395, + "step": 32302 + }, + { + "epoch": 2.038130765056972, + "grad_norm": 5.789926528930664, + "learning_rate": 5.676031245717418e-06, + "loss": 0.2155, + "step": 32303 + }, + { + "epoch": 2.038144329896907, + "grad_norm": 3.3182213306427, + "learning_rate": 5.675894203097164e-06, + "loss": 0.0894, + "step": 32304 + }, + { + "epoch": 2.038157894736842, + "grad_norm": 4.4028778076171875, + "learning_rate": 5.675757160476909e-06, + "loss": 0.0935, + "step": 32305 + }, + { + "epoch": 2.038171459576777, + "grad_norm": 3.5103423595428467, + "learning_rate": 5.675620117856654e-06, + "loss": 0.1619, + "step": 32306 + }, + { + "epoch": 2.0381850244167117, + "grad_norm": 3.8782405853271484, + "learning_rate": 5.675483075236399e-06, + "loss": 0.1269, + "step": 32307 + }, + { + "epoch": 2.038198589256647, + "grad_norm": 2.9716103076934814, + "learning_rate": 5.6753460326161434e-06, + "loss": 0.0993, + "step": 32308 + }, + { + "epoch": 2.038212154096582, + "grad_norm": 4.601158618927002, + "learning_rate": 5.6752089899958895e-06, + "loss": 0.2327, + "step": 32309 + }, + { + "epoch": 2.0382257189365167, + "grad_norm": 4.232269287109375, + "learning_rate": 5.675071947375635e-06, + "loss": 0.1517, + "step": 32310 + }, + { + "epoch": 2.0382392837764516, + "grad_norm": 3.096996784210205, + "learning_rate": 5.674934904755379e-06, + "loss": 0.1064, + "step": 32311 + }, + { + "epoch": 2.0382528486163864, + "grad_norm": 4.575033664703369, + "learning_rate": 5.674797862135124e-06, + "loss": 0.1155, + "step": 32312 + }, + { + "epoch": 2.0382664134563213, + "grad_norm": 3.971292495727539, + "learning_rate": 5.67466081951487e-06, + "loss": 0.1497, + "step": 32313 + }, + { + "epoch": 2.038279978296256, + "grad_norm": 3.3220138549804688, + "learning_rate": 5.6745237768946145e-06, + "loss": 0.1117, + "step": 32314 + }, + { + "epoch": 2.038293543136191, + "grad_norm": 5.714237689971924, + "learning_rate": 5.67438673427436e-06, + "loss": 0.2462, + "step": 32315 + }, + { + "epoch": 2.038307107976126, + "grad_norm": 3.4461748600006104, + "learning_rate": 5.674249691654105e-06, + "loss": 0.1073, + "step": 32316 + }, + { + "epoch": 2.0383206728160608, + "grad_norm": 3.5144309997558594, + "learning_rate": 5.674112649033851e-06, + "loss": 0.1807, + "step": 32317 + }, + { + "epoch": 2.0383342376559956, + "grad_norm": 4.308674335479736, + "learning_rate": 5.673975606413595e-06, + "loss": 0.1929, + "step": 32318 + }, + { + "epoch": 2.0383478024959305, + "grad_norm": 5.166889667510986, + "learning_rate": 5.67383856379334e-06, + "loss": 0.1184, + "step": 32319 + }, + { + "epoch": 2.0383613673358654, + "grad_norm": 3.669563055038452, + "learning_rate": 5.673701521173085e-06, + "loss": 0.1878, + "step": 32320 + }, + { + "epoch": 2.0383749321758002, + "grad_norm": 3.9374027252197266, + "learning_rate": 5.67356447855283e-06, + "loss": 0.1458, + "step": 32321 + }, + { + "epoch": 2.038388497015735, + "grad_norm": 5.000288963317871, + "learning_rate": 5.673427435932576e-06, + "loss": 0.1621, + "step": 32322 + }, + { + "epoch": 2.03840206185567, + "grad_norm": 4.961819171905518, + "learning_rate": 5.67329039331232e-06, + "loss": 0.2217, + "step": 32323 + }, + { + "epoch": 2.038415626695605, + "grad_norm": 4.380270481109619, + "learning_rate": 5.6731533506920654e-06, + "loss": 0.1315, + "step": 32324 + }, + { + "epoch": 2.0384291915355397, + "grad_norm": 5.129506587982178, + "learning_rate": 5.673016308071811e-06, + "loss": 0.2173, + "step": 32325 + }, + { + "epoch": 2.0384427563754746, + "grad_norm": 5.67062520980835, + "learning_rate": 5.672879265451557e-06, + "loss": 0.25, + "step": 32326 + }, + { + "epoch": 2.03845632121541, + "grad_norm": 3.529277801513672, + "learning_rate": 5.672742222831301e-06, + "loss": 0.1471, + "step": 32327 + }, + { + "epoch": 2.0384698860553447, + "grad_norm": 3.6277992725372314, + "learning_rate": 5.672605180211046e-06, + "loss": 0.2032, + "step": 32328 + }, + { + "epoch": 2.0384834508952796, + "grad_norm": 4.422636032104492, + "learning_rate": 5.6724681375907905e-06, + "loss": 0.2819, + "step": 32329 + }, + { + "epoch": 2.0384970157352145, + "grad_norm": 4.63619327545166, + "learning_rate": 5.6723310949705365e-06, + "loss": 0.1827, + "step": 32330 + }, + { + "epoch": 2.0385105805751493, + "grad_norm": 4.418618679046631, + "learning_rate": 5.672194052350282e-06, + "loss": 0.1911, + "step": 32331 + }, + { + "epoch": 2.038524145415084, + "grad_norm": 4.1857147216796875, + "learning_rate": 5.672057009730027e-06, + "loss": 0.1329, + "step": 32332 + }, + { + "epoch": 2.038537710255019, + "grad_norm": 5.219493389129639, + "learning_rate": 5.671919967109771e-06, + "loss": 0.3083, + "step": 32333 + }, + { + "epoch": 2.038551275094954, + "grad_norm": 3.9808316230773926, + "learning_rate": 5.671782924489516e-06, + "loss": 0.1365, + "step": 32334 + }, + { + "epoch": 2.038564839934889, + "grad_norm": 5.381138324737549, + "learning_rate": 5.671645881869262e-06, + "loss": 0.2578, + "step": 32335 + }, + { + "epoch": 2.0385784047748237, + "grad_norm": 6.225030899047852, + "learning_rate": 5.671508839249007e-06, + "loss": 0.2668, + "step": 32336 + }, + { + "epoch": 2.0385919696147585, + "grad_norm": 5.45007848739624, + "learning_rate": 5.671371796628752e-06, + "loss": 0.3089, + "step": 32337 + }, + { + "epoch": 2.0386055344546934, + "grad_norm": 4.426843643188477, + "learning_rate": 5.671234754008496e-06, + "loss": 0.1532, + "step": 32338 + }, + { + "epoch": 2.0386190992946283, + "grad_norm": 5.9257402420043945, + "learning_rate": 5.671097711388242e-06, + "loss": 0.3228, + "step": 32339 + }, + { + "epoch": 2.038632664134563, + "grad_norm": 4.3961310386657715, + "learning_rate": 5.6709606687679875e-06, + "loss": 0.197, + "step": 32340 + }, + { + "epoch": 2.038646228974498, + "grad_norm": 4.682528972625732, + "learning_rate": 5.670823626147733e-06, + "loss": 0.1492, + "step": 32341 + }, + { + "epoch": 2.038659793814433, + "grad_norm": 5.086734771728516, + "learning_rate": 5.670686583527477e-06, + "loss": 0.2183, + "step": 32342 + }, + { + "epoch": 2.0386733586543677, + "grad_norm": 5.972494602203369, + "learning_rate": 5.670549540907223e-06, + "loss": 0.2116, + "step": 32343 + }, + { + "epoch": 2.0386869234943026, + "grad_norm": 5.373525142669678, + "learning_rate": 5.670412498286968e-06, + "loss": 0.3066, + "step": 32344 + }, + { + "epoch": 2.0387004883342374, + "grad_norm": 4.358631610870361, + "learning_rate": 5.6702754556667125e-06, + "loss": 0.1898, + "step": 32345 + }, + { + "epoch": 2.0387140531741728, + "grad_norm": 4.895732879638672, + "learning_rate": 5.670138413046458e-06, + "loss": 0.2208, + "step": 32346 + }, + { + "epoch": 2.0387276180141076, + "grad_norm": 4.687086582183838, + "learning_rate": 5.670001370426203e-06, + "loss": 0.1654, + "step": 32347 + }, + { + "epoch": 2.0387411828540425, + "grad_norm": 9.231689453125, + "learning_rate": 5.669864327805948e-06, + "loss": 0.3267, + "step": 32348 + }, + { + "epoch": 2.0387547476939774, + "grad_norm": 5.698723316192627, + "learning_rate": 5.669727285185693e-06, + "loss": 0.1463, + "step": 32349 + }, + { + "epoch": 2.038768312533912, + "grad_norm": 4.428232669830322, + "learning_rate": 5.669590242565438e-06, + "loss": 0.1472, + "step": 32350 + }, + { + "epoch": 2.038781877373847, + "grad_norm": 6.187435150146484, + "learning_rate": 5.669453199945183e-06, + "loss": 0.3128, + "step": 32351 + }, + { + "epoch": 2.038795442213782, + "grad_norm": 5.467541217803955, + "learning_rate": 5.669316157324929e-06, + "loss": 0.1716, + "step": 32352 + }, + { + "epoch": 2.038809007053717, + "grad_norm": 4.722482204437256, + "learning_rate": 5.669179114704674e-06, + "loss": 0.2851, + "step": 32353 + }, + { + "epoch": 2.0388225718936517, + "grad_norm": 6.480011463165283, + "learning_rate": 5.669042072084418e-06, + "loss": 0.2638, + "step": 32354 + }, + { + "epoch": 2.0388361367335865, + "grad_norm": 5.225229263305664, + "learning_rate": 5.6689050294641634e-06, + "loss": 0.2221, + "step": 32355 + }, + { + "epoch": 2.0388497015735214, + "grad_norm": 6.079147815704346, + "learning_rate": 5.6687679868439095e-06, + "loss": 0.2257, + "step": 32356 + }, + { + "epoch": 2.0388632664134563, + "grad_norm": 4.778081893920898, + "learning_rate": 5.668630944223655e-06, + "loss": 0.1728, + "step": 32357 + }, + { + "epoch": 2.038876831253391, + "grad_norm": 5.2093353271484375, + "learning_rate": 5.668493901603399e-06, + "loss": 0.2214, + "step": 32358 + }, + { + "epoch": 2.038890396093326, + "grad_norm": 5.2478742599487305, + "learning_rate": 5.668356858983144e-06, + "loss": 0.1882, + "step": 32359 + }, + { + "epoch": 2.038903960933261, + "grad_norm": 4.6642842292785645, + "learning_rate": 5.66821981636289e-06, + "loss": 0.1635, + "step": 32360 + }, + { + "epoch": 2.0389175257731957, + "grad_norm": 5.9003682136535645, + "learning_rate": 5.6680827737426345e-06, + "loss": 0.2195, + "step": 32361 + }, + { + "epoch": 2.0389310906131306, + "grad_norm": 5.254138946533203, + "learning_rate": 5.66794573112238e-06, + "loss": 0.2499, + "step": 32362 + }, + { + "epoch": 2.0389446554530655, + "grad_norm": 6.182652473449707, + "learning_rate": 5.667808688502124e-06, + "loss": 0.2793, + "step": 32363 + }, + { + "epoch": 2.0389582202930003, + "grad_norm": 4.693573474884033, + "learning_rate": 5.667671645881869e-06, + "loss": 0.1383, + "step": 32364 + }, + { + "epoch": 2.0389717851329356, + "grad_norm": 4.991238594055176, + "learning_rate": 5.667534603261615e-06, + "loss": 0.1791, + "step": 32365 + }, + { + "epoch": 2.0389853499728705, + "grad_norm": 5.080939292907715, + "learning_rate": 5.66739756064136e-06, + "loss": 0.1919, + "step": 32366 + }, + { + "epoch": 2.0389989148128054, + "grad_norm": 7.732697010040283, + "learning_rate": 5.667260518021105e-06, + "loss": 0.3009, + "step": 32367 + }, + { + "epoch": 2.0390124796527402, + "grad_norm": 6.699514389038086, + "learning_rate": 5.66712347540085e-06, + "loss": 0.229, + "step": 32368 + }, + { + "epoch": 2.039026044492675, + "grad_norm": 5.743411064147949, + "learning_rate": 5.666986432780596e-06, + "loss": 0.2395, + "step": 32369 + }, + { + "epoch": 2.03903960933261, + "grad_norm": 4.991340160369873, + "learning_rate": 5.66684939016034e-06, + "loss": 0.2733, + "step": 32370 + }, + { + "epoch": 2.039053174172545, + "grad_norm": 7.2909464836120605, + "learning_rate": 5.6667123475400855e-06, + "loss": 0.3453, + "step": 32371 + }, + { + "epoch": 2.0390667390124797, + "grad_norm": 5.853425025939941, + "learning_rate": 5.66657530491983e-06, + "loss": 0.2874, + "step": 32372 + }, + { + "epoch": 2.0390803038524146, + "grad_norm": 5.210921764373779, + "learning_rate": 5.666438262299576e-06, + "loss": 0.2601, + "step": 32373 + }, + { + "epoch": 2.0390938686923494, + "grad_norm": 5.48492956161499, + "learning_rate": 5.666301219679321e-06, + "loss": 0.2172, + "step": 32374 + }, + { + "epoch": 2.0391074335322843, + "grad_norm": 5.996910095214844, + "learning_rate": 5.666164177059066e-06, + "loss": 0.3127, + "step": 32375 + }, + { + "epoch": 2.039120998372219, + "grad_norm": 6.413768768310547, + "learning_rate": 5.6660271344388105e-06, + "loss": 0.2771, + "step": 32376 + }, + { + "epoch": 2.039134563212154, + "grad_norm": 5.4397358894348145, + "learning_rate": 5.665890091818556e-06, + "loss": 0.2319, + "step": 32377 + }, + { + "epoch": 2.039148128052089, + "grad_norm": 5.02218770980835, + "learning_rate": 5.665753049198302e-06, + "loss": 0.2989, + "step": 32378 + }, + { + "epoch": 2.0391616928920238, + "grad_norm": 4.301563739776611, + "learning_rate": 5.665616006578046e-06, + "loss": 0.2197, + "step": 32379 + }, + { + "epoch": 2.0391752577319586, + "grad_norm": 4.961670398712158, + "learning_rate": 5.665478963957791e-06, + "loss": 0.1666, + "step": 32380 + }, + { + "epoch": 2.0391888225718935, + "grad_norm": 5.5245361328125, + "learning_rate": 5.665341921337536e-06, + "loss": 0.2381, + "step": 32381 + }, + { + "epoch": 2.0392023874118284, + "grad_norm": 4.5219316482543945, + "learning_rate": 5.665204878717282e-06, + "loss": 0.2168, + "step": 32382 + }, + { + "epoch": 2.039215952251763, + "grad_norm": 5.991391181945801, + "learning_rate": 5.665067836097027e-06, + "loss": 0.2993, + "step": 32383 + }, + { + "epoch": 2.0392295170916985, + "grad_norm": 5.707866191864014, + "learning_rate": 5.664930793476772e-06, + "loss": 0.2357, + "step": 32384 + }, + { + "epoch": 2.0392430819316334, + "grad_norm": 7.3611979484558105, + "learning_rate": 5.664793750856516e-06, + "loss": 0.299, + "step": 32385 + }, + { + "epoch": 2.0392566467715683, + "grad_norm": 4.531004428863525, + "learning_rate": 5.664656708236262e-06, + "loss": 0.2325, + "step": 32386 + }, + { + "epoch": 2.039270211611503, + "grad_norm": 5.6882710456848145, + "learning_rate": 5.6645196656160075e-06, + "loss": 0.28, + "step": 32387 + }, + { + "epoch": 2.039283776451438, + "grad_norm": 8.724127769470215, + "learning_rate": 5.664382622995752e-06, + "loss": 0.3695, + "step": 32388 + }, + { + "epoch": 2.039297341291373, + "grad_norm": 6.128524303436279, + "learning_rate": 5.664245580375497e-06, + "loss": 0.2438, + "step": 32389 + }, + { + "epoch": 2.0393109061313077, + "grad_norm": 7.7389302253723145, + "learning_rate": 5.664108537755242e-06, + "loss": 0.5713, + "step": 32390 + }, + { + "epoch": 2.0393244709712426, + "grad_norm": 5.8696608543396, + "learning_rate": 5.663971495134988e-06, + "loss": 0.3487, + "step": 32391 + }, + { + "epoch": 2.0393380358111775, + "grad_norm": 6.5715718269348145, + "learning_rate": 5.6638344525147325e-06, + "loss": 0.3545, + "step": 32392 + }, + { + "epoch": 2.0393516006511123, + "grad_norm": 4.524681091308594, + "learning_rate": 5.663697409894478e-06, + "loss": 0.15, + "step": 32393 + }, + { + "epoch": 2.039365165491047, + "grad_norm": 5.249607563018799, + "learning_rate": 5.663560367274222e-06, + "loss": 0.3579, + "step": 32394 + }, + { + "epoch": 2.039378730330982, + "grad_norm": 5.705023765563965, + "learning_rate": 5.663423324653968e-06, + "loss": 0.2209, + "step": 32395 + }, + { + "epoch": 2.039392295170917, + "grad_norm": 4.5674214363098145, + "learning_rate": 5.663286282033713e-06, + "loss": 0.1663, + "step": 32396 + }, + { + "epoch": 2.039405860010852, + "grad_norm": 5.1195855140686035, + "learning_rate": 5.6631492394134576e-06, + "loss": 0.277, + "step": 32397 + }, + { + "epoch": 2.0394194248507866, + "grad_norm": 5.811316967010498, + "learning_rate": 5.663012196793203e-06, + "loss": 0.2651, + "step": 32398 + }, + { + "epoch": 2.0394329896907215, + "grad_norm": 6.08833646774292, + "learning_rate": 5.662875154172949e-06, + "loss": 0.3476, + "step": 32399 + }, + { + "epoch": 2.0394465545306564, + "grad_norm": 4.302377223968506, + "learning_rate": 5.662738111552694e-06, + "loss": 0.1656, + "step": 32400 + }, + { + "epoch": 2.0394601193705912, + "grad_norm": 5.990259647369385, + "learning_rate": 5.662601068932438e-06, + "loss": 0.2101, + "step": 32401 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 4.700876235961914, + "learning_rate": 5.6624640263121835e-06, + "loss": 0.2632, + "step": 32402 + }, + { + "epoch": 2.0394872490504614, + "grad_norm": 4.263350009918213, + "learning_rate": 5.662326983691928e-06, + "loss": 0.2257, + "step": 32403 + }, + { + "epoch": 2.0395008138903963, + "grad_norm": 5.335392475128174, + "learning_rate": 5.662189941071674e-06, + "loss": 0.2336, + "step": 32404 + }, + { + "epoch": 2.039514378730331, + "grad_norm": 6.1341729164123535, + "learning_rate": 5.662052898451419e-06, + "loss": 0.3563, + "step": 32405 + }, + { + "epoch": 2.039527943570266, + "grad_norm": 5.973769187927246, + "learning_rate": 5.661915855831164e-06, + "loss": 0.2399, + "step": 32406 + }, + { + "epoch": 2.039541508410201, + "grad_norm": 4.720031261444092, + "learning_rate": 5.6617788132109085e-06, + "loss": 0.1876, + "step": 32407 + }, + { + "epoch": 2.0395550732501357, + "grad_norm": 6.691627025604248, + "learning_rate": 5.6616417705906545e-06, + "loss": 0.3065, + "step": 32408 + }, + { + "epoch": 2.0395686380900706, + "grad_norm": 6.261849880218506, + "learning_rate": 5.6615047279704e-06, + "loss": 0.2544, + "step": 32409 + }, + { + "epoch": 2.0395822029300055, + "grad_norm": 5.402215003967285, + "learning_rate": 5.661367685350144e-06, + "loss": 0.2378, + "step": 32410 + }, + { + "epoch": 2.0395957677699403, + "grad_norm": 5.826996326446533, + "learning_rate": 5.661230642729889e-06, + "loss": 0.3015, + "step": 32411 + }, + { + "epoch": 2.039609332609875, + "grad_norm": 5.62440299987793, + "learning_rate": 5.661093600109635e-06, + "loss": 0.2666, + "step": 32412 + }, + { + "epoch": 2.03962289744981, + "grad_norm": 5.742226600646973, + "learning_rate": 5.66095655748938e-06, + "loss": 0.2541, + "step": 32413 + }, + { + "epoch": 2.039636462289745, + "grad_norm": 5.891969680786133, + "learning_rate": 5.660819514869125e-06, + "loss": 0.2316, + "step": 32414 + }, + { + "epoch": 2.03965002712968, + "grad_norm": 6.587793827056885, + "learning_rate": 5.66068247224887e-06, + "loss": 0.2661, + "step": 32415 + }, + { + "epoch": 2.0396635919696147, + "grad_norm": 6.269620895385742, + "learning_rate": 5.660545429628614e-06, + "loss": 0.2692, + "step": 32416 + }, + { + "epoch": 2.0396771568095495, + "grad_norm": 5.551718711853027, + "learning_rate": 5.66040838700836e-06, + "loss": 0.3248, + "step": 32417 + }, + { + "epoch": 2.0396907216494844, + "grad_norm": 4.776015281677246, + "learning_rate": 5.6602713443881055e-06, + "loss": 0.2021, + "step": 32418 + }, + { + "epoch": 2.0397042864894193, + "grad_norm": 7.3381781578063965, + "learning_rate": 5.66013430176785e-06, + "loss": 0.3258, + "step": 32419 + }, + { + "epoch": 2.039717851329354, + "grad_norm": 5.598896026611328, + "learning_rate": 5.659997259147595e-06, + "loss": 0.2288, + "step": 32420 + }, + { + "epoch": 2.039731416169289, + "grad_norm": 5.09476375579834, + "learning_rate": 5.659860216527341e-06, + "loss": 0.1877, + "step": 32421 + }, + { + "epoch": 2.0397449810092243, + "grad_norm": 4.5612664222717285, + "learning_rate": 5.659723173907085e-06, + "loss": 0.2619, + "step": 32422 + }, + { + "epoch": 2.039758545849159, + "grad_norm": 7.717566967010498, + "learning_rate": 5.6595861312868305e-06, + "loss": 0.333, + "step": 32423 + }, + { + "epoch": 2.039772110689094, + "grad_norm": 5.037364959716797, + "learning_rate": 5.659449088666576e-06, + "loss": 0.2326, + "step": 32424 + }, + { + "epoch": 2.039785675529029, + "grad_norm": 5.2324371337890625, + "learning_rate": 5.659312046046322e-06, + "loss": 0.2374, + "step": 32425 + }, + { + "epoch": 2.0397992403689638, + "grad_norm": 5.794745922088623, + "learning_rate": 5.659175003426066e-06, + "loss": 0.2055, + "step": 32426 + }, + { + "epoch": 2.0398128052088986, + "grad_norm": 5.965963363647461, + "learning_rate": 5.659037960805811e-06, + "loss": 0.2527, + "step": 32427 + }, + { + "epoch": 2.0398263700488335, + "grad_norm": 4.587182998657227, + "learning_rate": 5.6589009181855556e-06, + "loss": 0.253, + "step": 32428 + }, + { + "epoch": 2.0398399348887684, + "grad_norm": 4.767476558685303, + "learning_rate": 5.658763875565302e-06, + "loss": 0.1914, + "step": 32429 + }, + { + "epoch": 2.0398534997287032, + "grad_norm": 5.1199493408203125, + "learning_rate": 5.658626832945047e-06, + "loss": 0.218, + "step": 32430 + }, + { + "epoch": 2.039867064568638, + "grad_norm": 5.362854480743408, + "learning_rate": 5.658489790324791e-06, + "loss": 0.1941, + "step": 32431 + }, + { + "epoch": 2.039880629408573, + "grad_norm": 4.976467609405518, + "learning_rate": 5.658352747704536e-06, + "loss": 0.1745, + "step": 32432 + }, + { + "epoch": 2.039894194248508, + "grad_norm": 5.6883039474487305, + "learning_rate": 5.6582157050842815e-06, + "loss": 0.2468, + "step": 32433 + }, + { + "epoch": 2.0399077590884427, + "grad_norm": 5.162442684173584, + "learning_rate": 5.6580786624640275e-06, + "loss": 0.1864, + "step": 32434 + }, + { + "epoch": 2.0399213239283776, + "grad_norm": 3.3588929176330566, + "learning_rate": 5.657941619843772e-06, + "loss": 0.1582, + "step": 32435 + }, + { + "epoch": 2.0399348887683124, + "grad_norm": 4.272243499755859, + "learning_rate": 5.657804577223517e-06, + "loss": 0.1654, + "step": 32436 + }, + { + "epoch": 2.0399484536082473, + "grad_norm": 5.57037878036499, + "learning_rate": 5.657667534603261e-06, + "loss": 0.1629, + "step": 32437 + }, + { + "epoch": 2.039962018448182, + "grad_norm": 4.982306003570557, + "learning_rate": 5.657530491983007e-06, + "loss": 0.1752, + "step": 32438 + }, + { + "epoch": 2.039975583288117, + "grad_norm": 3.972607135772705, + "learning_rate": 5.6573934493627525e-06, + "loss": 0.1282, + "step": 32439 + }, + { + "epoch": 2.039989148128052, + "grad_norm": 3.227064371109009, + "learning_rate": 5.657256406742498e-06, + "loss": 0.1456, + "step": 32440 + }, + { + "epoch": 2.040002712967987, + "grad_norm": 2.9508137702941895, + "learning_rate": 5.657119364122242e-06, + "loss": 0.1339, + "step": 32441 + }, + { + "epoch": 2.040016277807922, + "grad_norm": 3.8936643600463867, + "learning_rate": 5.656982321501988e-06, + "loss": 0.1553, + "step": 32442 + }, + { + "epoch": 2.040029842647857, + "grad_norm": 5.799884796142578, + "learning_rate": 5.656845278881733e-06, + "loss": 0.1794, + "step": 32443 + }, + { + "epoch": 2.040043407487792, + "grad_norm": 5.506021022796631, + "learning_rate": 5.656708236261478e-06, + "loss": 0.2424, + "step": 32444 + }, + { + "epoch": 2.0400569723277266, + "grad_norm": 6.6384358406066895, + "learning_rate": 5.656571193641223e-06, + "loss": 0.21, + "step": 32445 + }, + { + "epoch": 2.0400705371676615, + "grad_norm": 4.986966609954834, + "learning_rate": 5.656434151020967e-06, + "loss": 0.1845, + "step": 32446 + }, + { + "epoch": 2.0400841020075964, + "grad_norm": 4.149304389953613, + "learning_rate": 5.656297108400713e-06, + "loss": 0.193, + "step": 32447 + }, + { + "epoch": 2.0400976668475312, + "grad_norm": 3.92282772064209, + "learning_rate": 5.656160065780458e-06, + "loss": 0.1075, + "step": 32448 + }, + { + "epoch": 2.040111231687466, + "grad_norm": 3.817222833633423, + "learning_rate": 5.6560230231602035e-06, + "loss": 0.1709, + "step": 32449 + }, + { + "epoch": 2.040124796527401, + "grad_norm": 3.587953805923462, + "learning_rate": 5.655885980539948e-06, + "loss": 0.1126, + "step": 32450 + }, + { + "epoch": 2.040138361367336, + "grad_norm": 5.6892619132995605, + "learning_rate": 5.655748937919694e-06, + "loss": 0.238, + "step": 32451 + }, + { + "epoch": 2.0401519262072707, + "grad_norm": 3.2018563747406006, + "learning_rate": 5.655611895299439e-06, + "loss": 0.1463, + "step": 32452 + }, + { + "epoch": 2.0401654910472056, + "grad_norm": 3.636640787124634, + "learning_rate": 5.655474852679183e-06, + "loss": 0.1668, + "step": 32453 + }, + { + "epoch": 2.0401790558871404, + "grad_norm": 2.6494085788726807, + "learning_rate": 5.6553378100589285e-06, + "loss": 0.1096, + "step": 32454 + }, + { + "epoch": 2.0401926207270753, + "grad_norm": 3.1046676635742188, + "learning_rate": 5.6552007674386746e-06, + "loss": 0.095, + "step": 32455 + }, + { + "epoch": 2.04020618556701, + "grad_norm": 5.269025802612305, + "learning_rate": 5.655063724818419e-06, + "loss": 0.3033, + "step": 32456 + }, + { + "epoch": 2.040219750406945, + "grad_norm": 4.7994866371154785, + "learning_rate": 5.654926682198164e-06, + "loss": 0.234, + "step": 32457 + }, + { + "epoch": 2.04023331524688, + "grad_norm": 4.431975841522217, + "learning_rate": 5.654789639577909e-06, + "loss": 0.1572, + "step": 32458 + }, + { + "epoch": 2.0402468800868148, + "grad_norm": 3.723555326461792, + "learning_rate": 5.654652596957654e-06, + "loss": 0.0918, + "step": 32459 + }, + { + "epoch": 2.04026044492675, + "grad_norm": 4.381360054016113, + "learning_rate": 5.6545155543374e-06, + "loss": 0.1959, + "step": 32460 + }, + { + "epoch": 2.040274009766685, + "grad_norm": 4.9470038414001465, + "learning_rate": 5.654378511717145e-06, + "loss": 0.1902, + "step": 32461 + }, + { + "epoch": 2.04028757460662, + "grad_norm": 3.9684488773345947, + "learning_rate": 5.654241469096889e-06, + "loss": 0.1135, + "step": 32462 + }, + { + "epoch": 2.0403011394465547, + "grad_norm": 4.024957656860352, + "learning_rate": 5.654104426476634e-06, + "loss": 0.1892, + "step": 32463 + }, + { + "epoch": 2.0403147042864895, + "grad_norm": 5.211918354034424, + "learning_rate": 5.65396738385638e-06, + "loss": 0.2323, + "step": 32464 + }, + { + "epoch": 2.0403282691264244, + "grad_norm": 5.198262691497803, + "learning_rate": 5.653830341236125e-06, + "loss": 0.2471, + "step": 32465 + }, + { + "epoch": 2.0403418339663593, + "grad_norm": 5.53044319152832, + "learning_rate": 5.65369329861587e-06, + "loss": 0.2008, + "step": 32466 + }, + { + "epoch": 2.040355398806294, + "grad_norm": 4.430172920227051, + "learning_rate": 5.653556255995615e-06, + "loss": 0.1736, + "step": 32467 + }, + { + "epoch": 2.040368963646229, + "grad_norm": 5.057955741882324, + "learning_rate": 5.653419213375361e-06, + "loss": 0.2512, + "step": 32468 + }, + { + "epoch": 2.040382528486164, + "grad_norm": 5.404523849487305, + "learning_rate": 5.653282170755105e-06, + "loss": 0.2068, + "step": 32469 + }, + { + "epoch": 2.0403960933260987, + "grad_norm": 4.129746437072754, + "learning_rate": 5.6531451281348505e-06, + "loss": 0.1673, + "step": 32470 + }, + { + "epoch": 2.0404096581660336, + "grad_norm": 3.7575690746307373, + "learning_rate": 5.653008085514595e-06, + "loss": 0.146, + "step": 32471 + }, + { + "epoch": 2.0404232230059685, + "grad_norm": 4.817957878112793, + "learning_rate": 5.65287104289434e-06, + "loss": 0.1524, + "step": 32472 + }, + { + "epoch": 2.0404367878459033, + "grad_norm": 5.7854695320129395, + "learning_rate": 5.652734000274086e-06, + "loss": 0.2445, + "step": 32473 + }, + { + "epoch": 2.040450352685838, + "grad_norm": 4.225490093231201, + "learning_rate": 5.652596957653831e-06, + "loss": 0.229, + "step": 32474 + }, + { + "epoch": 2.040463917525773, + "grad_norm": 3.7821812629699707, + "learning_rate": 5.652459915033576e-06, + "loss": 0.0841, + "step": 32475 + }, + { + "epoch": 2.040477482365708, + "grad_norm": 4.399118900299072, + "learning_rate": 5.652322872413321e-06, + "loss": 0.1111, + "step": 32476 + }, + { + "epoch": 2.040491047205643, + "grad_norm": 5.796506404876709, + "learning_rate": 5.652185829793067e-06, + "loss": 0.2573, + "step": 32477 + }, + { + "epoch": 2.0405046120455776, + "grad_norm": 3.8067033290863037, + "learning_rate": 5.652048787172811e-06, + "loss": 0.1033, + "step": 32478 + }, + { + "epoch": 2.040518176885513, + "grad_norm": 4.403622627258301, + "learning_rate": 5.651911744552556e-06, + "loss": 0.2378, + "step": 32479 + }, + { + "epoch": 2.040531741725448, + "grad_norm": 4.7881574630737305, + "learning_rate": 5.651774701932301e-06, + "loss": 0.1503, + "step": 32480 + }, + { + "epoch": 2.0405453065653827, + "grad_norm": 4.681417465209961, + "learning_rate": 5.651637659312047e-06, + "loss": 0.1777, + "step": 32481 + }, + { + "epoch": 2.0405588714053176, + "grad_norm": 4.815636157989502, + "learning_rate": 5.651500616691792e-06, + "loss": 0.1464, + "step": 32482 + }, + { + "epoch": 2.0405724362452524, + "grad_norm": 4.860743522644043, + "learning_rate": 5.651363574071537e-06, + "loss": 0.3116, + "step": 32483 + }, + { + "epoch": 2.0405860010851873, + "grad_norm": 5.676449298858643, + "learning_rate": 5.651226531451281e-06, + "loss": 0.2699, + "step": 32484 + }, + { + "epoch": 2.040599565925122, + "grad_norm": 5.860196113586426, + "learning_rate": 5.6510894888310265e-06, + "loss": 0.2047, + "step": 32485 + }, + { + "epoch": 2.040613130765057, + "grad_norm": 4.900198936462402, + "learning_rate": 5.6509524462107726e-06, + "loss": 0.212, + "step": 32486 + }, + { + "epoch": 2.040626695604992, + "grad_norm": 3.9448318481445312, + "learning_rate": 5.650815403590517e-06, + "loss": 0.1842, + "step": 32487 + }, + { + "epoch": 2.0406402604449267, + "grad_norm": 4.823146343231201, + "learning_rate": 5.650678360970262e-06, + "loss": 0.2284, + "step": 32488 + }, + { + "epoch": 2.0406538252848616, + "grad_norm": 4.706414699554443, + "learning_rate": 5.650541318350007e-06, + "loss": 0.1799, + "step": 32489 + }, + { + "epoch": 2.0406673901247965, + "grad_norm": 5.177079200744629, + "learning_rate": 5.6504042757297524e-06, + "loss": 0.1617, + "step": 32490 + }, + { + "epoch": 2.0406809549647313, + "grad_norm": 5.053128242492676, + "learning_rate": 5.650267233109498e-06, + "loss": 0.1816, + "step": 32491 + }, + { + "epoch": 2.040694519804666, + "grad_norm": 4.578268051147461, + "learning_rate": 5.650130190489243e-06, + "loss": 0.124, + "step": 32492 + }, + { + "epoch": 2.040708084644601, + "grad_norm": 3.8734965324401855, + "learning_rate": 5.649993147868987e-06, + "loss": 0.0918, + "step": 32493 + }, + { + "epoch": 2.040721649484536, + "grad_norm": 6.471956253051758, + "learning_rate": 5.649856105248733e-06, + "loss": 0.2347, + "step": 32494 + }, + { + "epoch": 2.040735214324471, + "grad_norm": 3.6926448345184326, + "learning_rate": 5.649719062628478e-06, + "loss": 0.1523, + "step": 32495 + }, + { + "epoch": 2.0407487791644057, + "grad_norm": 6.134160995483398, + "learning_rate": 5.649582020008223e-06, + "loss": 0.1868, + "step": 32496 + }, + { + "epoch": 2.0407623440043405, + "grad_norm": 3.0105092525482178, + "learning_rate": 5.649444977387968e-06, + "loss": 0.1057, + "step": 32497 + }, + { + "epoch": 2.040775908844276, + "grad_norm": 3.3747072219848633, + "learning_rate": 5.649307934767714e-06, + "loss": 0.1651, + "step": 32498 + }, + { + "epoch": 2.0407894736842107, + "grad_norm": 5.2335991859436035, + "learning_rate": 5.649170892147459e-06, + "loss": 0.1702, + "step": 32499 + }, + { + "epoch": 2.0408030385241456, + "grad_norm": 3.8128271102905273, + "learning_rate": 5.649033849527203e-06, + "loss": 0.0811, + "step": 32500 + }, + { + "epoch": 2.0408166033640804, + "grad_norm": 6.306234359741211, + "learning_rate": 5.6488968069069486e-06, + "loss": 0.1333, + "step": 32501 + }, + { + "epoch": 2.0408301682040153, + "grad_norm": 3.8421249389648438, + "learning_rate": 5.648759764286693e-06, + "loss": 0.1604, + "step": 32502 + }, + { + "epoch": 2.04084373304395, + "grad_norm": 5.672192573547363, + "learning_rate": 5.648622721666439e-06, + "loss": 0.1343, + "step": 32503 + }, + { + "epoch": 2.040857297883885, + "grad_norm": 4.153378486633301, + "learning_rate": 5.648485679046184e-06, + "loss": 0.149, + "step": 32504 + }, + { + "epoch": 2.04087086272382, + "grad_norm": 4.329651832580566, + "learning_rate": 5.648348636425928e-06, + "loss": 0.1257, + "step": 32505 + }, + { + "epoch": 2.0408844275637548, + "grad_norm": 4.027733325958252, + "learning_rate": 5.648211593805674e-06, + "loss": 0.1648, + "step": 32506 + }, + { + "epoch": 2.0408979924036896, + "grad_norm": 7.447219371795654, + "learning_rate": 5.64807455118542e-06, + "loss": 0.1977, + "step": 32507 + }, + { + "epoch": 2.0409115572436245, + "grad_norm": 4.383652210235596, + "learning_rate": 5.647937508565165e-06, + "loss": 0.1941, + "step": 32508 + }, + { + "epoch": 2.0409251220835594, + "grad_norm": 4.532651901245117, + "learning_rate": 5.647800465944909e-06, + "loss": 0.1382, + "step": 32509 + }, + { + "epoch": 2.0409386869234942, + "grad_norm": 5.2210516929626465, + "learning_rate": 5.647663423324654e-06, + "loss": 0.2418, + "step": 32510 + }, + { + "epoch": 2.040952251763429, + "grad_norm": 3.2359066009521484, + "learning_rate": 5.6475263807044e-06, + "loss": 0.1043, + "step": 32511 + }, + { + "epoch": 2.040965816603364, + "grad_norm": 7.687182426452637, + "learning_rate": 5.647389338084145e-06, + "loss": 0.3009, + "step": 32512 + }, + { + "epoch": 2.040979381443299, + "grad_norm": 5.32936954498291, + "learning_rate": 5.64725229546389e-06, + "loss": 0.1643, + "step": 32513 + }, + { + "epoch": 2.0409929462832337, + "grad_norm": 7.677581787109375, + "learning_rate": 5.647115252843634e-06, + "loss": 0.3644, + "step": 32514 + }, + { + "epoch": 2.0410065111231686, + "grad_norm": 6.065434455871582, + "learning_rate": 5.646978210223379e-06, + "loss": 0.2461, + "step": 32515 + }, + { + "epoch": 2.0410200759631034, + "grad_norm": 7.874722480773926, + "learning_rate": 5.646841167603125e-06, + "loss": 0.2357, + "step": 32516 + }, + { + "epoch": 2.0410336408030387, + "grad_norm": 6.6565656661987305, + "learning_rate": 5.6467041249828706e-06, + "loss": 0.2196, + "step": 32517 + }, + { + "epoch": 2.0410472056429736, + "grad_norm": 3.74237322807312, + "learning_rate": 5.646567082362615e-06, + "loss": 0.0934, + "step": 32518 + }, + { + "epoch": 2.0410607704829085, + "grad_norm": 3.9245266914367676, + "learning_rate": 5.64643003974236e-06, + "loss": 0.193, + "step": 32519 + }, + { + "epoch": 2.0410743353228433, + "grad_norm": 5.850906848907471, + "learning_rate": 5.646292997122106e-06, + "loss": 0.1972, + "step": 32520 + }, + { + "epoch": 2.041087900162778, + "grad_norm": 5.76419734954834, + "learning_rate": 5.6461559545018504e-06, + "loss": 0.199, + "step": 32521 + }, + { + "epoch": 2.041101465002713, + "grad_norm": 6.634578704833984, + "learning_rate": 5.646018911881596e-06, + "loss": 0.1791, + "step": 32522 + }, + { + "epoch": 2.041115029842648, + "grad_norm": 4.046746253967285, + "learning_rate": 5.645881869261341e-06, + "loss": 0.1398, + "step": 32523 + }, + { + "epoch": 2.041128594682583, + "grad_norm": 6.963071346282959, + "learning_rate": 5.645744826641086e-06, + "loss": 0.2724, + "step": 32524 + }, + { + "epoch": 2.0411421595225177, + "grad_norm": 5.2947258949279785, + "learning_rate": 5.645607784020831e-06, + "loss": 0.1706, + "step": 32525 + }, + { + "epoch": 2.0411557243624525, + "grad_norm": 3.983232021331787, + "learning_rate": 5.645470741400576e-06, + "loss": 0.1564, + "step": 32526 + }, + { + "epoch": 2.0411692892023874, + "grad_norm": 4.570990562438965, + "learning_rate": 5.645333698780321e-06, + "loss": 0.1698, + "step": 32527 + }, + { + "epoch": 2.0411828540423222, + "grad_norm": 6.176641464233398, + "learning_rate": 5.645196656160066e-06, + "loss": 0.2942, + "step": 32528 + }, + { + "epoch": 2.041196418882257, + "grad_norm": 5.375547885894775, + "learning_rate": 5.645059613539812e-06, + "loss": 0.2267, + "step": 32529 + }, + { + "epoch": 2.041209983722192, + "grad_norm": 5.883249759674072, + "learning_rate": 5.644922570919556e-06, + "loss": 0.184, + "step": 32530 + }, + { + "epoch": 2.041223548562127, + "grad_norm": 4.6298394203186035, + "learning_rate": 5.644785528299301e-06, + "loss": 0.1689, + "step": 32531 + }, + { + "epoch": 2.0412371134020617, + "grad_norm": 5.197902202606201, + "learning_rate": 5.6446484856790466e-06, + "loss": 0.18, + "step": 32532 + }, + { + "epoch": 2.0412506782419966, + "grad_norm": 4.822007656097412, + "learning_rate": 5.644511443058793e-06, + "loss": 0.2407, + "step": 32533 + }, + { + "epoch": 2.0412642430819314, + "grad_norm": 6.225181579589844, + "learning_rate": 5.644374400438537e-06, + "loss": 0.1969, + "step": 32534 + }, + { + "epoch": 2.0412778079218663, + "grad_norm": 4.9029412269592285, + "learning_rate": 5.644237357818282e-06, + "loss": 0.1665, + "step": 32535 + }, + { + "epoch": 2.0412913727618016, + "grad_norm": 5.345344066619873, + "learning_rate": 5.6441003151980264e-06, + "loss": 0.1649, + "step": 32536 + }, + { + "epoch": 2.0413049376017365, + "grad_norm": 4.933811187744141, + "learning_rate": 5.6439632725777724e-06, + "loss": 0.1598, + "step": 32537 + }, + { + "epoch": 2.0413185024416713, + "grad_norm": 4.443840503692627, + "learning_rate": 5.643826229957518e-06, + "loss": 0.1553, + "step": 32538 + }, + { + "epoch": 2.041332067281606, + "grad_norm": 6.342072010040283, + "learning_rate": 5.643689187337262e-06, + "loss": 0.242, + "step": 32539 + }, + { + "epoch": 2.041345632121541, + "grad_norm": 3.77348256111145, + "learning_rate": 5.643552144717007e-06, + "loss": 0.131, + "step": 32540 + }, + { + "epoch": 2.041359196961476, + "grad_norm": 5.327807903289795, + "learning_rate": 5.643415102096752e-06, + "loss": 0.2262, + "step": 32541 + }, + { + "epoch": 2.041372761801411, + "grad_norm": 6.100937366485596, + "learning_rate": 5.643278059476498e-06, + "loss": 0.2365, + "step": 32542 + }, + { + "epoch": 2.0413863266413457, + "grad_norm": 4.136590480804443, + "learning_rate": 5.643141016856243e-06, + "loss": 0.1868, + "step": 32543 + }, + { + "epoch": 2.0413998914812805, + "grad_norm": 5.123090744018555, + "learning_rate": 5.643003974235988e-06, + "loss": 0.2419, + "step": 32544 + }, + { + "epoch": 2.0414134563212154, + "grad_norm": 5.694998741149902, + "learning_rate": 5.642866931615732e-06, + "loss": 0.1929, + "step": 32545 + }, + { + "epoch": 2.0414270211611503, + "grad_norm": 4.03552770614624, + "learning_rate": 5.642729888995478e-06, + "loss": 0.1162, + "step": 32546 + }, + { + "epoch": 2.041440586001085, + "grad_norm": 3.478698968887329, + "learning_rate": 5.642592846375223e-06, + "loss": 0.1163, + "step": 32547 + }, + { + "epoch": 2.04145415084102, + "grad_norm": 5.374188423156738, + "learning_rate": 5.6424558037549686e-06, + "loss": 0.1208, + "step": 32548 + }, + { + "epoch": 2.041467715680955, + "grad_norm": 5.474344253540039, + "learning_rate": 5.642318761134713e-06, + "loss": 0.1663, + "step": 32549 + }, + { + "epoch": 2.0414812805208897, + "grad_norm": 4.519346237182617, + "learning_rate": 5.642181718514459e-06, + "loss": 0.1607, + "step": 32550 + }, + { + "epoch": 2.0414948453608246, + "grad_norm": 5.150018215179443, + "learning_rate": 5.642044675894204e-06, + "loss": 0.1714, + "step": 32551 + }, + { + "epoch": 2.0415084102007595, + "grad_norm": 29.464719772338867, + "learning_rate": 5.6419076332739484e-06, + "loss": 0.2238, + "step": 32552 + }, + { + "epoch": 2.0415219750406943, + "grad_norm": 4.909587383270264, + "learning_rate": 5.641770590653694e-06, + "loss": 0.1632, + "step": 32553 + }, + { + "epoch": 2.041535539880629, + "grad_norm": 5.381070613861084, + "learning_rate": 5.641633548033438e-06, + "loss": 0.2214, + "step": 32554 + }, + { + "epoch": 2.0415491047205645, + "grad_norm": 4.824828624725342, + "learning_rate": 5.641496505413184e-06, + "loss": 0.181, + "step": 32555 + }, + { + "epoch": 2.0415626695604994, + "grad_norm": 3.4344654083251953, + "learning_rate": 5.641359462792929e-06, + "loss": 0.1115, + "step": 32556 + }, + { + "epoch": 2.0415762344004342, + "grad_norm": 3.6172616481781006, + "learning_rate": 5.641222420172674e-06, + "loss": 0.1156, + "step": 32557 + }, + { + "epoch": 2.041589799240369, + "grad_norm": 4.33974027633667, + "learning_rate": 5.641085377552419e-06, + "loss": 0.1831, + "step": 32558 + }, + { + "epoch": 2.041603364080304, + "grad_norm": 4.522791385650635, + "learning_rate": 5.640948334932165e-06, + "loss": 0.17, + "step": 32559 + }, + { + "epoch": 2.041616928920239, + "grad_norm": 4.365846157073975, + "learning_rate": 5.64081129231191e-06, + "loss": 0.1262, + "step": 32560 + }, + { + "epoch": 2.0416304937601737, + "grad_norm": 6.729569911956787, + "learning_rate": 5.640674249691654e-06, + "loss": 0.3176, + "step": 32561 + }, + { + "epoch": 2.0416440586001086, + "grad_norm": 6.3760457038879395, + "learning_rate": 5.640537207071399e-06, + "loss": 0.3144, + "step": 32562 + }, + { + "epoch": 2.0416576234400434, + "grad_norm": 4.127978324890137, + "learning_rate": 5.640400164451145e-06, + "loss": 0.1331, + "step": 32563 + }, + { + "epoch": 2.0416711882799783, + "grad_norm": 3.693619966506958, + "learning_rate": 5.64026312183089e-06, + "loss": 0.1612, + "step": 32564 + }, + { + "epoch": 2.041684753119913, + "grad_norm": 3.729210376739502, + "learning_rate": 5.640126079210635e-06, + "loss": 0.1134, + "step": 32565 + }, + { + "epoch": 2.041698317959848, + "grad_norm": 5.043074131011963, + "learning_rate": 5.63998903659038e-06, + "loss": 0.2147, + "step": 32566 + }, + { + "epoch": 2.041711882799783, + "grad_norm": 7.6137285232543945, + "learning_rate": 5.6398519939701244e-06, + "loss": 0.1828, + "step": 32567 + }, + { + "epoch": 2.0417254476397177, + "grad_norm": 5.115918159484863, + "learning_rate": 5.6397149513498705e-06, + "loss": 0.2511, + "step": 32568 + }, + { + "epoch": 2.0417390124796526, + "grad_norm": 6.4170451164245605, + "learning_rate": 5.639577908729616e-06, + "loss": 0.2112, + "step": 32569 + }, + { + "epoch": 2.0417525773195875, + "grad_norm": 2.4113643169403076, + "learning_rate": 5.63944086610936e-06, + "loss": 0.0877, + "step": 32570 + }, + { + "epoch": 2.0417661421595223, + "grad_norm": 3.837008237838745, + "learning_rate": 5.639303823489105e-06, + "loss": 0.1451, + "step": 32571 + }, + { + "epoch": 2.041779706999457, + "grad_norm": 5.452906131744385, + "learning_rate": 5.639166780868851e-06, + "loss": 0.1889, + "step": 32572 + }, + { + "epoch": 2.041793271839392, + "grad_norm": 6.806614875793457, + "learning_rate": 5.6390297382485955e-06, + "loss": 0.1252, + "step": 32573 + }, + { + "epoch": 2.0418068366793274, + "grad_norm": 5.046072483062744, + "learning_rate": 5.638892695628341e-06, + "loss": 0.2957, + "step": 32574 + }, + { + "epoch": 2.0418204015192623, + "grad_norm": 4.5843987464904785, + "learning_rate": 5.638755653008086e-06, + "loss": 0.2001, + "step": 32575 + }, + { + "epoch": 2.041833966359197, + "grad_norm": 4.621218204498291, + "learning_rate": 5.638618610387832e-06, + "loss": 0.1617, + "step": 32576 + }, + { + "epoch": 2.041847531199132, + "grad_norm": 4.763766288757324, + "learning_rate": 5.638481567767576e-06, + "loss": 0.2332, + "step": 32577 + }, + { + "epoch": 2.041861096039067, + "grad_norm": 4.677460193634033, + "learning_rate": 5.638344525147321e-06, + "loss": 0.161, + "step": 32578 + }, + { + "epoch": 2.0418746608790017, + "grad_norm": 5.215254783630371, + "learning_rate": 5.638207482527066e-06, + "loss": 0.2527, + "step": 32579 + }, + { + "epoch": 2.0418882257189366, + "grad_norm": 6.761467456817627, + "learning_rate": 5.638070439906812e-06, + "loss": 0.208, + "step": 32580 + }, + { + "epoch": 2.0419017905588714, + "grad_norm": 5.85029935836792, + "learning_rate": 5.637933397286557e-06, + "loss": 0.1956, + "step": 32581 + }, + { + "epoch": 2.0419153553988063, + "grad_norm": 5.325582504272461, + "learning_rate": 5.637796354666302e-06, + "loss": 0.2305, + "step": 32582 + }, + { + "epoch": 2.041928920238741, + "grad_norm": 4.921667098999023, + "learning_rate": 5.6376593120460464e-06, + "loss": 0.1353, + "step": 32583 + }, + { + "epoch": 2.041942485078676, + "grad_norm": 4.173671245574951, + "learning_rate": 5.637522269425792e-06, + "loss": 0.1395, + "step": 32584 + }, + { + "epoch": 2.041956049918611, + "grad_norm": 4.612457275390625, + "learning_rate": 5.637385226805538e-06, + "loss": 0.2387, + "step": 32585 + }, + { + "epoch": 2.0419696147585458, + "grad_norm": 4.987951278686523, + "learning_rate": 5.637248184185282e-06, + "loss": 0.1903, + "step": 32586 + }, + { + "epoch": 2.0419831795984806, + "grad_norm": 5.830971717834473, + "learning_rate": 5.637111141565027e-06, + "loss": 0.1615, + "step": 32587 + }, + { + "epoch": 2.0419967444384155, + "grad_norm": 5.599053859710693, + "learning_rate": 5.6369740989447715e-06, + "loss": 0.1555, + "step": 32588 + }, + { + "epoch": 2.0420103092783504, + "grad_norm": 5.452754020690918, + "learning_rate": 5.6368370563245175e-06, + "loss": 0.1776, + "step": 32589 + }, + { + "epoch": 2.0420238741182852, + "grad_norm": 4.416078567504883, + "learning_rate": 5.636700013704263e-06, + "loss": 0.1393, + "step": 32590 + }, + { + "epoch": 2.04203743895822, + "grad_norm": 3.502934694290161, + "learning_rate": 5.636562971084008e-06, + "loss": 0.0974, + "step": 32591 + }, + { + "epoch": 2.0420510037981554, + "grad_norm": 6.507702350616455, + "learning_rate": 5.636425928463752e-06, + "loss": 0.2094, + "step": 32592 + }, + { + "epoch": 2.0420645686380903, + "grad_norm": 4.6553955078125, + "learning_rate": 5.636288885843498e-06, + "loss": 0.1887, + "step": 32593 + }, + { + "epoch": 2.042078133478025, + "grad_norm": 5.205216884613037, + "learning_rate": 5.636151843223243e-06, + "loss": 0.1418, + "step": 32594 + }, + { + "epoch": 2.04209169831796, + "grad_norm": 3.8409087657928467, + "learning_rate": 5.636014800602988e-06, + "loss": 0.1691, + "step": 32595 + }, + { + "epoch": 2.042105263157895, + "grad_norm": 5.362593650817871, + "learning_rate": 5.635877757982733e-06, + "loss": 0.2241, + "step": 32596 + }, + { + "epoch": 2.0421188279978297, + "grad_norm": 5.408133506774902, + "learning_rate": 5.635740715362478e-06, + "loss": 0.1959, + "step": 32597 + }, + { + "epoch": 2.0421323928377646, + "grad_norm": 5.553140163421631, + "learning_rate": 5.635603672742223e-06, + "loss": 0.1848, + "step": 32598 + }, + { + "epoch": 2.0421459576776995, + "grad_norm": 5.004478454589844, + "learning_rate": 5.6354666301219685e-06, + "loss": 0.2196, + "step": 32599 + }, + { + "epoch": 2.0421595225176343, + "grad_norm": 4.797797203063965, + "learning_rate": 5.635329587501714e-06, + "loss": 0.1796, + "step": 32600 + }, + { + "epoch": 2.042173087357569, + "grad_norm": 5.979886531829834, + "learning_rate": 5.635192544881458e-06, + "loss": 0.1843, + "step": 32601 + }, + { + "epoch": 2.042186652197504, + "grad_norm": 4.532436370849609, + "learning_rate": 5.635055502261204e-06, + "loss": 0.1656, + "step": 32602 + }, + { + "epoch": 2.042200217037439, + "grad_norm": 5.075762748718262, + "learning_rate": 5.634918459640949e-06, + "loss": 0.2634, + "step": 32603 + }, + { + "epoch": 2.042213781877374, + "grad_norm": 4.539929389953613, + "learning_rate": 5.6347814170206935e-06, + "loss": 0.1833, + "step": 32604 + }, + { + "epoch": 2.0422273467173087, + "grad_norm": 6.213540077209473, + "learning_rate": 5.634644374400439e-06, + "loss": 0.2978, + "step": 32605 + }, + { + "epoch": 2.0422409115572435, + "grad_norm": 4.652551174163818, + "learning_rate": 5.634507331780185e-06, + "loss": 0.1573, + "step": 32606 + }, + { + "epoch": 2.0422544763971784, + "grad_norm": 5.096548080444336, + "learning_rate": 5.634370289159929e-06, + "loss": 0.1854, + "step": 32607 + }, + { + "epoch": 2.0422680412371133, + "grad_norm": 4.193049907684326, + "learning_rate": 5.634233246539674e-06, + "loss": 0.2086, + "step": 32608 + }, + { + "epoch": 2.042281606077048, + "grad_norm": 4.186485767364502, + "learning_rate": 5.634096203919419e-06, + "loss": 0.2339, + "step": 32609 + }, + { + "epoch": 2.042295170916983, + "grad_norm": 3.3605387210845947, + "learning_rate": 5.633959161299164e-06, + "loss": 0.1623, + "step": 32610 + }, + { + "epoch": 2.042308735756918, + "grad_norm": 3.6996703147888184, + "learning_rate": 5.63382211867891e-06, + "loss": 0.1499, + "step": 32611 + }, + { + "epoch": 2.042322300596853, + "grad_norm": 3.958646059036255, + "learning_rate": 5.633685076058655e-06, + "loss": 0.2135, + "step": 32612 + }, + { + "epoch": 2.042335865436788, + "grad_norm": 5.909546375274658, + "learning_rate": 5.633548033438399e-06, + "loss": 0.2379, + "step": 32613 + }, + { + "epoch": 2.042349430276723, + "grad_norm": 3.5171608924865723, + "learning_rate": 5.6334109908181444e-06, + "loss": 0.1326, + "step": 32614 + }, + { + "epoch": 2.0423629951166578, + "grad_norm": 5.546108722686768, + "learning_rate": 5.6332739481978905e-06, + "loss": 0.2187, + "step": 32615 + }, + { + "epoch": 2.0423765599565926, + "grad_norm": 4.399496555328369, + "learning_rate": 5.633136905577636e-06, + "loss": 0.1637, + "step": 32616 + }, + { + "epoch": 2.0423901247965275, + "grad_norm": 4.09601354598999, + "learning_rate": 5.63299986295738e-06, + "loss": 0.1754, + "step": 32617 + }, + { + "epoch": 2.0424036896364623, + "grad_norm": 6.0099568367004395, + "learning_rate": 5.632862820337125e-06, + "loss": 0.2562, + "step": 32618 + }, + { + "epoch": 2.042417254476397, + "grad_norm": 3.795867919921875, + "learning_rate": 5.632725777716871e-06, + "loss": 0.2233, + "step": 32619 + }, + { + "epoch": 2.042430819316332, + "grad_norm": 4.616788864135742, + "learning_rate": 5.6325887350966155e-06, + "loss": 0.1779, + "step": 32620 + }, + { + "epoch": 2.042444384156267, + "grad_norm": 4.522929668426514, + "learning_rate": 5.632451692476361e-06, + "loss": 0.1986, + "step": 32621 + }, + { + "epoch": 2.042457948996202, + "grad_norm": 4.82993745803833, + "learning_rate": 5.632314649856105e-06, + "loss": 0.2332, + "step": 32622 + }, + { + "epoch": 2.0424715138361367, + "grad_norm": 4.1155805587768555, + "learning_rate": 5.63217760723585e-06, + "loss": 0.111, + "step": 32623 + }, + { + "epoch": 2.0424850786760715, + "grad_norm": 3.5986199378967285, + "learning_rate": 5.632040564615596e-06, + "loss": 0.1324, + "step": 32624 + }, + { + "epoch": 2.0424986435160064, + "grad_norm": 3.6148905754089355, + "learning_rate": 5.631903521995341e-06, + "loss": 0.1771, + "step": 32625 + }, + { + "epoch": 2.0425122083559413, + "grad_norm": 6.171442985534668, + "learning_rate": 5.631766479375086e-06, + "loss": 0.2167, + "step": 32626 + }, + { + "epoch": 2.042525773195876, + "grad_norm": 5.933838844299316, + "learning_rate": 5.631629436754831e-06, + "loss": 0.2398, + "step": 32627 + }, + { + "epoch": 2.042539338035811, + "grad_norm": 4.277800559997559, + "learning_rate": 5.631492394134577e-06, + "loss": 0.1195, + "step": 32628 + }, + { + "epoch": 2.042552902875746, + "grad_norm": 5.553558349609375, + "learning_rate": 5.631355351514321e-06, + "loss": 0.2396, + "step": 32629 + }, + { + "epoch": 2.042566467715681, + "grad_norm": 5.092428684234619, + "learning_rate": 5.6312183088940665e-06, + "loss": 0.1879, + "step": 32630 + }, + { + "epoch": 2.042580032555616, + "grad_norm": 4.250147342681885, + "learning_rate": 5.631081266273812e-06, + "loss": 0.1578, + "step": 32631 + }, + { + "epoch": 2.042593597395551, + "grad_norm": 3.7079198360443115, + "learning_rate": 5.630944223653557e-06, + "loss": 0.0931, + "step": 32632 + }, + { + "epoch": 2.0426071622354858, + "grad_norm": 4.511800289154053, + "learning_rate": 5.630807181033302e-06, + "loss": 0.1644, + "step": 32633 + }, + { + "epoch": 2.0426207270754206, + "grad_norm": 5.472817420959473, + "learning_rate": 5.630670138413047e-06, + "loss": 0.155, + "step": 32634 + }, + { + "epoch": 2.0426342919153555, + "grad_norm": 5.93713903427124, + "learning_rate": 5.6305330957927915e-06, + "loss": 0.2166, + "step": 32635 + }, + { + "epoch": 2.0426478567552904, + "grad_norm": 4.736853122711182, + "learning_rate": 5.630396053172537e-06, + "loss": 0.1723, + "step": 32636 + }, + { + "epoch": 2.0426614215952252, + "grad_norm": 4.4769392013549805, + "learning_rate": 5.630259010552283e-06, + "loss": 0.1337, + "step": 32637 + }, + { + "epoch": 2.04267498643516, + "grad_norm": 5.131325721740723, + "learning_rate": 5.630121967932027e-06, + "loss": 0.1071, + "step": 32638 + }, + { + "epoch": 2.042688551275095, + "grad_norm": 5.991976261138916, + "learning_rate": 5.629984925311772e-06, + "loss": 0.2583, + "step": 32639 + }, + { + "epoch": 2.04270211611503, + "grad_norm": 5.389906883239746, + "learning_rate": 5.629847882691517e-06, + "loss": 0.2428, + "step": 32640 + }, + { + "epoch": 2.0427156809549647, + "grad_norm": 5.710029125213623, + "learning_rate": 5.629710840071263e-06, + "loss": 0.3019, + "step": 32641 + }, + { + "epoch": 2.0427292457948996, + "grad_norm": 5.853984355926514, + "learning_rate": 5.629573797451008e-06, + "loss": 0.1912, + "step": 32642 + }, + { + "epoch": 2.0427428106348344, + "grad_norm": 3.7186994552612305, + "learning_rate": 5.629436754830753e-06, + "loss": 0.1205, + "step": 32643 + }, + { + "epoch": 2.0427563754747693, + "grad_norm": 4.7696075439453125, + "learning_rate": 5.629299712210497e-06, + "loss": 0.1858, + "step": 32644 + }, + { + "epoch": 2.042769940314704, + "grad_norm": 4.234038829803467, + "learning_rate": 5.629162669590243e-06, + "loss": 0.1537, + "step": 32645 + }, + { + "epoch": 2.042783505154639, + "grad_norm": 6.948106288909912, + "learning_rate": 5.6290256269699885e-06, + "loss": 0.2879, + "step": 32646 + }, + { + "epoch": 2.042797069994574, + "grad_norm": 5.466169834136963, + "learning_rate": 5.628888584349733e-06, + "loss": 0.27, + "step": 32647 + }, + { + "epoch": 2.0428106348345088, + "grad_norm": 4.3861002922058105, + "learning_rate": 5.628751541729478e-06, + "loss": 0.1403, + "step": 32648 + }, + { + "epoch": 2.0428241996744436, + "grad_norm": 6.609713554382324, + "learning_rate": 5.628614499109224e-06, + "loss": 0.2037, + "step": 32649 + }, + { + "epoch": 2.042837764514379, + "grad_norm": 3.6390790939331055, + "learning_rate": 5.628477456488969e-06, + "loss": 0.1519, + "step": 32650 + }, + { + "epoch": 2.042851329354314, + "grad_norm": 3.9947640895843506, + "learning_rate": 5.6283404138687135e-06, + "loss": 0.2012, + "step": 32651 + }, + { + "epoch": 2.0428648941942487, + "grad_norm": 4.787960529327393, + "learning_rate": 5.628203371248459e-06, + "loss": 0.1891, + "step": 32652 + }, + { + "epoch": 2.0428784590341835, + "grad_norm": 4.249922752380371, + "learning_rate": 5.628066328628203e-06, + "loss": 0.1775, + "step": 32653 + }, + { + "epoch": 2.0428920238741184, + "grad_norm": 5.531542778015137, + "learning_rate": 5.627929286007949e-06, + "loss": 0.2176, + "step": 32654 + }, + { + "epoch": 2.0429055887140533, + "grad_norm": 5.785735607147217, + "learning_rate": 5.627792243387694e-06, + "loss": 0.2532, + "step": 32655 + }, + { + "epoch": 2.042919153553988, + "grad_norm": 5.0334296226501465, + "learning_rate": 5.6276552007674386e-06, + "loss": 0.1742, + "step": 32656 + }, + { + "epoch": 2.042932718393923, + "grad_norm": 3.534984588623047, + "learning_rate": 5.627518158147184e-06, + "loss": 0.1397, + "step": 32657 + }, + { + "epoch": 2.042946283233858, + "grad_norm": 4.426806449890137, + "learning_rate": 5.62738111552693e-06, + "loss": 0.2336, + "step": 32658 + }, + { + "epoch": 2.0429598480737927, + "grad_norm": 5.2934441566467285, + "learning_rate": 5.627244072906675e-06, + "loss": 0.1582, + "step": 32659 + }, + { + "epoch": 2.0429734129137276, + "grad_norm": 3.7051727771759033, + "learning_rate": 5.627107030286419e-06, + "loss": 0.1501, + "step": 32660 + }, + { + "epoch": 2.0429869777536624, + "grad_norm": 6.5399394035339355, + "learning_rate": 5.6269699876661645e-06, + "loss": 0.26, + "step": 32661 + }, + { + "epoch": 2.0430005425935973, + "grad_norm": 6.666992664337158, + "learning_rate": 5.6268329450459105e-06, + "loss": 0.2484, + "step": 32662 + }, + { + "epoch": 2.043014107433532, + "grad_norm": 5.511443614959717, + "learning_rate": 5.626695902425655e-06, + "loss": 0.2039, + "step": 32663 + }, + { + "epoch": 2.043027672273467, + "grad_norm": 4.305146217346191, + "learning_rate": 5.6265588598054e-06, + "loss": 0.0932, + "step": 32664 + }, + { + "epoch": 2.043041237113402, + "grad_norm": 3.777104139328003, + "learning_rate": 5.626421817185145e-06, + "loss": 0.1114, + "step": 32665 + }, + { + "epoch": 2.0430548019533368, + "grad_norm": 3.9464006423950195, + "learning_rate": 5.6262847745648895e-06, + "loss": 0.1626, + "step": 32666 + }, + { + "epoch": 2.0430683667932716, + "grad_norm": 3.599047899246216, + "learning_rate": 5.6261477319446355e-06, + "loss": 0.1414, + "step": 32667 + }, + { + "epoch": 2.043081931633207, + "grad_norm": 4.47625207901001, + "learning_rate": 5.626010689324381e-06, + "loss": 0.1239, + "step": 32668 + }, + { + "epoch": 2.043095496473142, + "grad_norm": 4.760026931762695, + "learning_rate": 5.625873646704125e-06, + "loss": 0.1427, + "step": 32669 + }, + { + "epoch": 2.0431090613130767, + "grad_norm": 5.958448886871338, + "learning_rate": 5.62573660408387e-06, + "loss": 0.192, + "step": 32670 + }, + { + "epoch": 2.0431226261530115, + "grad_norm": 6.173208236694336, + "learning_rate": 5.625599561463616e-06, + "loss": 0.1914, + "step": 32671 + }, + { + "epoch": 2.0431361909929464, + "grad_norm": 4.594109535217285, + "learning_rate": 5.625462518843361e-06, + "loss": 0.162, + "step": 32672 + }, + { + "epoch": 2.0431497558328813, + "grad_norm": 3.4965689182281494, + "learning_rate": 5.625325476223106e-06, + "loss": 0.1006, + "step": 32673 + }, + { + "epoch": 2.043163320672816, + "grad_norm": 2.9325878620147705, + "learning_rate": 5.625188433602851e-06, + "loss": 0.0596, + "step": 32674 + }, + { + "epoch": 2.043176885512751, + "grad_norm": 4.577995300292969, + "learning_rate": 5.625051390982597e-06, + "loss": 0.1282, + "step": 32675 + }, + { + "epoch": 2.043190450352686, + "grad_norm": 4.2212700843811035, + "learning_rate": 5.624914348362341e-06, + "loss": 0.09, + "step": 32676 + }, + { + "epoch": 2.0432040151926207, + "grad_norm": 1.9559414386749268, + "learning_rate": 5.6247773057420865e-06, + "loss": 0.0289, + "step": 32677 + }, + { + "epoch": 2.0432175800325556, + "grad_norm": 4.410648345947266, + "learning_rate": 5.624640263121831e-06, + "loss": 0.1607, + "step": 32678 + }, + { + "epoch": 2.0432311448724905, + "grad_norm": 3.6815035343170166, + "learning_rate": 5.624503220501576e-06, + "loss": 0.1418, + "step": 32679 + }, + { + "epoch": 2.0432447097124253, + "grad_norm": 4.202114105224609, + "learning_rate": 5.624366177881322e-06, + "loss": 0.127, + "step": 32680 + }, + { + "epoch": 2.04325827455236, + "grad_norm": 4.127376556396484, + "learning_rate": 5.624229135261066e-06, + "loss": 0.1016, + "step": 32681 + }, + { + "epoch": 2.043271839392295, + "grad_norm": 5.658267021179199, + "learning_rate": 5.6240920926408115e-06, + "loss": 0.1148, + "step": 32682 + }, + { + "epoch": 2.04328540423223, + "grad_norm": 5.609280109405518, + "learning_rate": 5.623955050020557e-06, + "loss": 0.2348, + "step": 32683 + }, + { + "epoch": 2.043298969072165, + "grad_norm": 3.8721582889556885, + "learning_rate": 5.623818007400303e-06, + "loss": 0.1413, + "step": 32684 + }, + { + "epoch": 2.0433125339120997, + "grad_norm": 4.859395503997803, + "learning_rate": 5.623680964780047e-06, + "loss": 0.2436, + "step": 32685 + }, + { + "epoch": 2.0433260987520345, + "grad_norm": 4.454765796661377, + "learning_rate": 5.623543922159792e-06, + "loss": 0.2424, + "step": 32686 + }, + { + "epoch": 2.0433396635919694, + "grad_norm": 3.6685874462127686, + "learning_rate": 5.6234068795395366e-06, + "loss": 0.092, + "step": 32687 + }, + { + "epoch": 2.0433532284319047, + "grad_norm": 2.988088369369507, + "learning_rate": 5.623269836919283e-06, + "loss": 0.081, + "step": 32688 + }, + { + "epoch": 2.0433667932718396, + "grad_norm": 6.63841438293457, + "learning_rate": 5.623132794299028e-06, + "loss": 0.1799, + "step": 32689 + }, + { + "epoch": 2.0433803581117744, + "grad_norm": 4.875881671905518, + "learning_rate": 5.622995751678773e-06, + "loss": 0.2206, + "step": 32690 + }, + { + "epoch": 2.0433939229517093, + "grad_norm": 4.670040130615234, + "learning_rate": 5.622858709058517e-06, + "loss": 0.2076, + "step": 32691 + }, + { + "epoch": 2.043407487791644, + "grad_norm": 3.787731409072876, + "learning_rate": 5.6227216664382625e-06, + "loss": 0.1311, + "step": 32692 + }, + { + "epoch": 2.043421052631579, + "grad_norm": 7.313857555389404, + "learning_rate": 5.6225846238180085e-06, + "loss": 0.2874, + "step": 32693 + }, + { + "epoch": 2.043434617471514, + "grad_norm": 4.292141437530518, + "learning_rate": 5.622447581197753e-06, + "loss": 0.2636, + "step": 32694 + }, + { + "epoch": 2.0434481823114488, + "grad_norm": 5.479879856109619, + "learning_rate": 5.622310538577498e-06, + "loss": 0.199, + "step": 32695 + }, + { + "epoch": 2.0434617471513836, + "grad_norm": 6.2607927322387695, + "learning_rate": 5.622173495957242e-06, + "loss": 0.1801, + "step": 32696 + }, + { + "epoch": 2.0434753119913185, + "grad_norm": 5.7076897621154785, + "learning_rate": 5.622036453336988e-06, + "loss": 0.2543, + "step": 32697 + }, + { + "epoch": 2.0434888768312534, + "grad_norm": 5.14288854598999, + "learning_rate": 5.6218994107167335e-06, + "loss": 0.2401, + "step": 32698 + }, + { + "epoch": 2.043502441671188, + "grad_norm": 4.393619060516357, + "learning_rate": 5.621762368096479e-06, + "loss": 0.1511, + "step": 32699 + }, + { + "epoch": 2.043516006511123, + "grad_norm": 5.322328567504883, + "learning_rate": 5.621625325476223e-06, + "loss": 0.0971, + "step": 32700 + }, + { + "epoch": 2.043529571351058, + "grad_norm": 5.49933385848999, + "learning_rate": 5.621488282855969e-06, + "loss": 0.1605, + "step": 32701 + }, + { + "epoch": 2.043543136190993, + "grad_norm": 4.206383228302002, + "learning_rate": 5.621351240235714e-06, + "loss": 0.1272, + "step": 32702 + }, + { + "epoch": 2.0435567010309277, + "grad_norm": 5.288899898529053, + "learning_rate": 5.621214197615459e-06, + "loss": 0.1807, + "step": 32703 + }, + { + "epoch": 2.0435702658708625, + "grad_norm": 4.538654327392578, + "learning_rate": 5.621077154995204e-06, + "loss": 0.1695, + "step": 32704 + }, + { + "epoch": 2.0435838307107974, + "grad_norm": 5.628879070281982, + "learning_rate": 5.620940112374948e-06, + "loss": 0.2769, + "step": 32705 + }, + { + "epoch": 2.0435973955507327, + "grad_norm": 6.430584907531738, + "learning_rate": 5.620803069754694e-06, + "loss": 0.2627, + "step": 32706 + }, + { + "epoch": 2.0436109603906676, + "grad_norm": 3.774904251098633, + "learning_rate": 5.620666027134439e-06, + "loss": 0.0979, + "step": 32707 + }, + { + "epoch": 2.0436245252306025, + "grad_norm": 4.056736469268799, + "learning_rate": 5.6205289845141845e-06, + "loss": 0.1303, + "step": 32708 + }, + { + "epoch": 2.0436380900705373, + "grad_norm": 4.77683162689209, + "learning_rate": 5.620391941893929e-06, + "loss": 0.2184, + "step": 32709 + }, + { + "epoch": 2.043651654910472, + "grad_norm": 4.3137335777282715, + "learning_rate": 5.620254899273675e-06, + "loss": 0.1127, + "step": 32710 + }, + { + "epoch": 2.043665219750407, + "grad_norm": 6.469101905822754, + "learning_rate": 5.62011785665342e-06, + "loss": 0.1725, + "step": 32711 + }, + { + "epoch": 2.043678784590342, + "grad_norm": 5.575201034545898, + "learning_rate": 5.619980814033164e-06, + "loss": 0.1123, + "step": 32712 + }, + { + "epoch": 2.0436923494302768, + "grad_norm": 5.8350934982299805, + "learning_rate": 5.6198437714129095e-06, + "loss": 0.1226, + "step": 32713 + }, + { + "epoch": 2.0437059142702116, + "grad_norm": 4.593674182891846, + "learning_rate": 5.6197067287926556e-06, + "loss": 0.1043, + "step": 32714 + }, + { + "epoch": 2.0437194791101465, + "grad_norm": 4.5326828956604, + "learning_rate": 5.6195696861724e-06, + "loss": 0.161, + "step": 32715 + }, + { + "epoch": 2.0437330439500814, + "grad_norm": 4.572452068328857, + "learning_rate": 5.619432643552145e-06, + "loss": 0.1473, + "step": 32716 + }, + { + "epoch": 2.0437466087900162, + "grad_norm": 4.143370151519775, + "learning_rate": 5.61929560093189e-06, + "loss": 0.1525, + "step": 32717 + }, + { + "epoch": 2.043760173629951, + "grad_norm": 6.499404430389404, + "learning_rate": 5.619158558311636e-06, + "loss": 0.2062, + "step": 32718 + }, + { + "epoch": 2.043773738469886, + "grad_norm": 4.422947406768799, + "learning_rate": 5.619021515691381e-06, + "loss": 0.1919, + "step": 32719 + }, + { + "epoch": 2.043787303309821, + "grad_norm": 3.2877049446105957, + "learning_rate": 5.618884473071126e-06, + "loss": 0.1169, + "step": 32720 + }, + { + "epoch": 2.0438008681497557, + "grad_norm": 6.119497776031494, + "learning_rate": 5.61874743045087e-06, + "loss": 0.2323, + "step": 32721 + }, + { + "epoch": 2.0438144329896906, + "grad_norm": 5.420662879943848, + "learning_rate": 5.618610387830615e-06, + "loss": 0.1891, + "step": 32722 + }, + { + "epoch": 2.0438279978296254, + "grad_norm": 6.780368804931641, + "learning_rate": 5.618473345210361e-06, + "loss": 0.187, + "step": 32723 + }, + { + "epoch": 2.0438415626695603, + "grad_norm": 6.175833225250244, + "learning_rate": 5.6183363025901065e-06, + "loss": 0.1848, + "step": 32724 + }, + { + "epoch": 2.0438551275094956, + "grad_norm": 6.669198513031006, + "learning_rate": 5.618199259969851e-06, + "loss": 0.2541, + "step": 32725 + }, + { + "epoch": 2.0438686923494305, + "grad_norm": 5.3174543380737305, + "learning_rate": 5.618062217349596e-06, + "loss": 0.1675, + "step": 32726 + }, + { + "epoch": 2.0438822571893653, + "grad_norm": 5.328845977783203, + "learning_rate": 5.617925174729342e-06, + "loss": 0.2052, + "step": 32727 + }, + { + "epoch": 2.0438958220293, + "grad_norm": 5.317482948303223, + "learning_rate": 5.617788132109086e-06, + "loss": 0.2652, + "step": 32728 + }, + { + "epoch": 2.043909386869235, + "grad_norm": 5.330844402313232, + "learning_rate": 5.6176510894888315e-06, + "loss": 0.2122, + "step": 32729 + }, + { + "epoch": 2.04392295170917, + "grad_norm": 4.82316780090332, + "learning_rate": 5.617514046868576e-06, + "loss": 0.1896, + "step": 32730 + }, + { + "epoch": 2.043936516549105, + "grad_norm": 4.526586055755615, + "learning_rate": 5.617377004248322e-06, + "loss": 0.101, + "step": 32731 + }, + { + "epoch": 2.0439500813890397, + "grad_norm": 7.763976573944092, + "learning_rate": 5.617239961628067e-06, + "loss": 0.3627, + "step": 32732 + }, + { + "epoch": 2.0439636462289745, + "grad_norm": 4.828673362731934, + "learning_rate": 5.617102919007812e-06, + "loss": 0.1253, + "step": 32733 + }, + { + "epoch": 2.0439772110689094, + "grad_norm": 4.272846698760986, + "learning_rate": 5.616965876387557e-06, + "loss": 0.1518, + "step": 32734 + }, + { + "epoch": 2.0439907759088443, + "grad_norm": 7.2624616622924805, + "learning_rate": 5.616828833767302e-06, + "loss": 0.2682, + "step": 32735 + }, + { + "epoch": 2.044004340748779, + "grad_norm": 4.9224677085876465, + "learning_rate": 5.616691791147048e-06, + "loss": 0.2135, + "step": 32736 + }, + { + "epoch": 2.044017905588714, + "grad_norm": 5.40220308303833, + "learning_rate": 5.616554748526792e-06, + "loss": 0.1414, + "step": 32737 + }, + { + "epoch": 2.044031470428649, + "grad_norm": 3.8442795276641846, + "learning_rate": 5.616417705906537e-06, + "loss": 0.1803, + "step": 32738 + }, + { + "epoch": 2.0440450352685837, + "grad_norm": 5.055732727050781, + "learning_rate": 5.6162806632862825e-06, + "loss": 0.2326, + "step": 32739 + }, + { + "epoch": 2.0440586001085186, + "grad_norm": 6.7244954109191895, + "learning_rate": 5.616143620666028e-06, + "loss": 0.1474, + "step": 32740 + }, + { + "epoch": 2.0440721649484535, + "grad_norm": 6.005195617675781, + "learning_rate": 5.616006578045773e-06, + "loss": 0.2662, + "step": 32741 + }, + { + "epoch": 2.0440857297883883, + "grad_norm": 4.986042022705078, + "learning_rate": 5.615869535425518e-06, + "loss": 0.2823, + "step": 32742 + }, + { + "epoch": 2.044099294628323, + "grad_norm": 5.188518047332764, + "learning_rate": 5.615732492805262e-06, + "loss": 0.204, + "step": 32743 + }, + { + "epoch": 2.0441128594682585, + "grad_norm": 5.51822566986084, + "learning_rate": 5.615595450185008e-06, + "loss": 0.3234, + "step": 32744 + }, + { + "epoch": 2.0441264243081934, + "grad_norm": 5.376275539398193, + "learning_rate": 5.6154584075647536e-06, + "loss": 0.2073, + "step": 32745 + }, + { + "epoch": 2.0441399891481282, + "grad_norm": 5.8808135986328125, + "learning_rate": 5.615321364944498e-06, + "loss": 0.2817, + "step": 32746 + }, + { + "epoch": 2.044153553988063, + "grad_norm": 5.2303385734558105, + "learning_rate": 5.615184322324243e-06, + "loss": 0.2094, + "step": 32747 + }, + { + "epoch": 2.044167118827998, + "grad_norm": 4.748104095458984, + "learning_rate": 5.615047279703988e-06, + "loss": 0.1463, + "step": 32748 + }, + { + "epoch": 2.044180683667933, + "grad_norm": 5.939907073974609, + "learning_rate": 5.6149102370837334e-06, + "loss": 0.2405, + "step": 32749 + }, + { + "epoch": 2.0441942485078677, + "grad_norm": 4.457977771759033, + "learning_rate": 5.614773194463479e-06, + "loss": 0.2118, + "step": 32750 + }, + { + "epoch": 2.0442078133478025, + "grad_norm": 7.009493827819824, + "learning_rate": 5.614636151843224e-06, + "loss": 0.2482, + "step": 32751 + }, + { + "epoch": 2.0442213781877374, + "grad_norm": 4.30074405670166, + "learning_rate": 5.614499109222968e-06, + "loss": 0.1475, + "step": 32752 + }, + { + "epoch": 2.0442349430276723, + "grad_norm": 4.952131748199463, + "learning_rate": 5.614362066602714e-06, + "loss": 0.1998, + "step": 32753 + }, + { + "epoch": 2.044248507867607, + "grad_norm": 4.4058146476745605, + "learning_rate": 5.614225023982459e-06, + "loss": 0.1615, + "step": 32754 + }, + { + "epoch": 2.044262072707542, + "grad_norm": 5.871364593505859, + "learning_rate": 5.614087981362204e-06, + "loss": 0.2264, + "step": 32755 + }, + { + "epoch": 2.044275637547477, + "grad_norm": 8.892424583435059, + "learning_rate": 5.613950938741949e-06, + "loss": 0.2255, + "step": 32756 + }, + { + "epoch": 2.0442892023874117, + "grad_norm": 10.372607231140137, + "learning_rate": 5.613813896121695e-06, + "loss": 0.2794, + "step": 32757 + }, + { + "epoch": 2.0443027672273466, + "grad_norm": 5.446860313415527, + "learning_rate": 5.61367685350144e-06, + "loss": 0.2865, + "step": 32758 + }, + { + "epoch": 2.0443163320672815, + "grad_norm": 5.6349711418151855, + "learning_rate": 5.613539810881184e-06, + "loss": 0.168, + "step": 32759 + }, + { + "epoch": 2.0443298969072163, + "grad_norm": 5.620394706726074, + "learning_rate": 5.6134027682609296e-06, + "loss": 0.2383, + "step": 32760 + }, + { + "epoch": 2.044343461747151, + "grad_norm": 7.367404460906982, + "learning_rate": 5.613265725640674e-06, + "loss": 0.136, + "step": 32761 + }, + { + "epoch": 2.044357026587086, + "grad_norm": 3.1415131092071533, + "learning_rate": 5.61312868302042e-06, + "loss": 0.1239, + "step": 32762 + }, + { + "epoch": 2.0443705914270214, + "grad_norm": 7.018084526062012, + "learning_rate": 5.612991640400165e-06, + "loss": 0.3251, + "step": 32763 + }, + { + "epoch": 2.0443841562669562, + "grad_norm": 4.763927459716797, + "learning_rate": 5.612854597779909e-06, + "loss": 0.1505, + "step": 32764 + }, + { + "epoch": 2.044397721106891, + "grad_norm": 3.6976277828216553, + "learning_rate": 5.612717555159655e-06, + "loss": 0.1737, + "step": 32765 + }, + { + "epoch": 2.044411285946826, + "grad_norm": 4.679534912109375, + "learning_rate": 5.612580512539401e-06, + "loss": 0.1922, + "step": 32766 + }, + { + "epoch": 2.044424850786761, + "grad_norm": 4.527793884277344, + "learning_rate": 5.612443469919146e-06, + "loss": 0.1095, + "step": 32767 + }, + { + "epoch": 2.0444384156266957, + "grad_norm": 5.7051591873168945, + "learning_rate": 5.61230642729889e-06, + "loss": 0.2863, + "step": 32768 + }, + { + "epoch": 2.0444519804666306, + "grad_norm": 4.826326847076416, + "learning_rate": 5.612169384678635e-06, + "loss": 0.1365, + "step": 32769 + }, + { + "epoch": 2.0444655453065654, + "grad_norm": 5.13153076171875, + "learning_rate": 5.612032342058381e-06, + "loss": 0.2286, + "step": 32770 + }, + { + "epoch": 2.0444791101465003, + "grad_norm": 5.642795085906982, + "learning_rate": 5.611895299438126e-06, + "loss": 0.2509, + "step": 32771 + }, + { + "epoch": 2.044492674986435, + "grad_norm": 5.483695983886719, + "learning_rate": 5.611758256817871e-06, + "loss": 0.2041, + "step": 32772 + }, + { + "epoch": 2.04450623982637, + "grad_norm": 5.441693305969238, + "learning_rate": 5.611621214197616e-06, + "loss": 0.1817, + "step": 32773 + }, + { + "epoch": 2.044519804666305, + "grad_norm": 5.467276096343994, + "learning_rate": 5.61148417157736e-06, + "loss": 0.2359, + "step": 32774 + }, + { + "epoch": 2.0445333695062398, + "grad_norm": 4.35488224029541, + "learning_rate": 5.611347128957106e-06, + "loss": 0.1733, + "step": 32775 + }, + { + "epoch": 2.0445469343461746, + "grad_norm": 4.632789611816406, + "learning_rate": 5.6112100863368516e-06, + "loss": 0.2134, + "step": 32776 + }, + { + "epoch": 2.0445604991861095, + "grad_norm": 6.101511478424072, + "learning_rate": 5.611073043716596e-06, + "loss": 0.3057, + "step": 32777 + }, + { + "epoch": 2.0445740640260444, + "grad_norm": 6.875174045562744, + "learning_rate": 5.610936001096341e-06, + "loss": 0.284, + "step": 32778 + }, + { + "epoch": 2.0445876288659792, + "grad_norm": 4.232342720031738, + "learning_rate": 5.610798958476087e-06, + "loss": 0.1872, + "step": 32779 + }, + { + "epoch": 2.044601193705914, + "grad_norm": 4.972719192504883, + "learning_rate": 5.6106619158558314e-06, + "loss": 0.188, + "step": 32780 + }, + { + "epoch": 2.044614758545849, + "grad_norm": 4.829664707183838, + "learning_rate": 5.610524873235577e-06, + "loss": 0.2716, + "step": 32781 + }, + { + "epoch": 2.0446283233857843, + "grad_norm": 4.804851055145264, + "learning_rate": 5.610387830615322e-06, + "loss": 0.1696, + "step": 32782 + }, + { + "epoch": 2.044641888225719, + "grad_norm": 5.689901351928711, + "learning_rate": 5.610250787995067e-06, + "loss": 0.2473, + "step": 32783 + }, + { + "epoch": 2.044655453065654, + "grad_norm": 4.5896687507629395, + "learning_rate": 5.610113745374812e-06, + "loss": 0.1383, + "step": 32784 + }, + { + "epoch": 2.044669017905589, + "grad_norm": 5.68281364440918, + "learning_rate": 5.609976702754557e-06, + "loss": 0.3093, + "step": 32785 + }, + { + "epoch": 2.0446825827455237, + "grad_norm": 3.625684976577759, + "learning_rate": 5.609839660134302e-06, + "loss": 0.1437, + "step": 32786 + }, + { + "epoch": 2.0446961475854586, + "grad_norm": 5.3828911781311035, + "learning_rate": 5.609702617514048e-06, + "loss": 0.1911, + "step": 32787 + }, + { + "epoch": 2.0447097124253935, + "grad_norm": 5.236733913421631, + "learning_rate": 5.609565574893793e-06, + "loss": 0.2272, + "step": 32788 + }, + { + "epoch": 2.0447232772653283, + "grad_norm": 4.7129716873168945, + "learning_rate": 5.609428532273537e-06, + "loss": 0.1085, + "step": 32789 + }, + { + "epoch": 2.044736842105263, + "grad_norm": 4.931076526641846, + "learning_rate": 5.609291489653282e-06, + "loss": 0.2, + "step": 32790 + }, + { + "epoch": 2.044750406945198, + "grad_norm": 5.18082857131958, + "learning_rate": 5.6091544470330276e-06, + "loss": 0.235, + "step": 32791 + }, + { + "epoch": 2.044763971785133, + "grad_norm": 3.221776008605957, + "learning_rate": 5.609017404412774e-06, + "loss": 0.1323, + "step": 32792 + }, + { + "epoch": 2.044777536625068, + "grad_norm": 5.144106864929199, + "learning_rate": 5.608880361792518e-06, + "loss": 0.1964, + "step": 32793 + }, + { + "epoch": 2.0447911014650026, + "grad_norm": 4.9487433433532715, + "learning_rate": 5.608743319172263e-06, + "loss": 0.2085, + "step": 32794 + }, + { + "epoch": 2.0448046663049375, + "grad_norm": 8.592700958251953, + "learning_rate": 5.6086062765520074e-06, + "loss": 0.273, + "step": 32795 + }, + { + "epoch": 2.0448182311448724, + "grad_norm": 6.585714340209961, + "learning_rate": 5.6084692339317535e-06, + "loss": 0.3819, + "step": 32796 + }, + { + "epoch": 2.0448317959848072, + "grad_norm": 4.87030553817749, + "learning_rate": 5.608332191311499e-06, + "loss": 0.2707, + "step": 32797 + }, + { + "epoch": 2.044845360824742, + "grad_norm": 4.409985542297363, + "learning_rate": 5.608195148691243e-06, + "loss": 0.1512, + "step": 32798 + }, + { + "epoch": 2.044858925664677, + "grad_norm": 3.196582555770874, + "learning_rate": 5.608058106070988e-06, + "loss": 0.1382, + "step": 32799 + }, + { + "epoch": 2.044872490504612, + "grad_norm": 5.384702682495117, + "learning_rate": 5.607921063450734e-06, + "loss": 0.2679, + "step": 32800 + }, + { + "epoch": 2.044886055344547, + "grad_norm": 4.0029425621032715, + "learning_rate": 5.607784020830479e-06, + "loss": 0.1431, + "step": 32801 + }, + { + "epoch": 2.044899620184482, + "grad_norm": 4.2313232421875, + "learning_rate": 5.607646978210224e-06, + "loss": 0.2129, + "step": 32802 + }, + { + "epoch": 2.044913185024417, + "grad_norm": 6.499772071838379, + "learning_rate": 5.607509935589969e-06, + "loss": 0.3155, + "step": 32803 + }, + { + "epoch": 2.0449267498643517, + "grad_norm": 5.806670188903809, + "learning_rate": 5.607372892969713e-06, + "loss": 0.1516, + "step": 32804 + }, + { + "epoch": 2.0449403147042866, + "grad_norm": 4.183772087097168, + "learning_rate": 5.607235850349459e-06, + "loss": 0.2163, + "step": 32805 + }, + { + "epoch": 2.0449538795442215, + "grad_norm": 4.028794288635254, + "learning_rate": 5.607098807729204e-06, + "loss": 0.1878, + "step": 32806 + }, + { + "epoch": 2.0449674443841563, + "grad_norm": 4.779233932495117, + "learning_rate": 5.6069617651089496e-06, + "loss": 0.1414, + "step": 32807 + }, + { + "epoch": 2.044981009224091, + "grad_norm": 4.158600330352783, + "learning_rate": 5.606824722488694e-06, + "loss": 0.1752, + "step": 32808 + }, + { + "epoch": 2.044994574064026, + "grad_norm": 4.058454990386963, + "learning_rate": 5.60668767986844e-06, + "loss": 0.0996, + "step": 32809 + }, + { + "epoch": 2.045008138903961, + "grad_norm": 3.797218084335327, + "learning_rate": 5.606550637248185e-06, + "loss": 0.1168, + "step": 32810 + }, + { + "epoch": 2.045021703743896, + "grad_norm": 3.7192142009735107, + "learning_rate": 5.6064135946279294e-06, + "loss": 0.1546, + "step": 32811 + }, + { + "epoch": 2.0450352685838307, + "grad_norm": 3.6294755935668945, + "learning_rate": 5.606276552007675e-06, + "loss": 0.152, + "step": 32812 + }, + { + "epoch": 2.0450488334237655, + "grad_norm": 4.623921871185303, + "learning_rate": 5.606139509387421e-06, + "loss": 0.1181, + "step": 32813 + }, + { + "epoch": 2.0450623982637004, + "grad_norm": 4.96267032623291, + "learning_rate": 5.606002466767165e-06, + "loss": 0.139, + "step": 32814 + }, + { + "epoch": 2.0450759631036353, + "grad_norm": 4.199245452880859, + "learning_rate": 5.60586542414691e-06, + "loss": 0.0863, + "step": 32815 + }, + { + "epoch": 2.04508952794357, + "grad_norm": 4.246528148651123, + "learning_rate": 5.605728381526655e-06, + "loss": 0.1746, + "step": 32816 + }, + { + "epoch": 2.045103092783505, + "grad_norm": 5.420455455780029, + "learning_rate": 5.6055913389064e-06, + "loss": 0.2072, + "step": 32817 + }, + { + "epoch": 2.04511665762344, + "grad_norm": 4.474845886230469, + "learning_rate": 5.605454296286146e-06, + "loss": 0.1801, + "step": 32818 + }, + { + "epoch": 2.0451302224633747, + "grad_norm": 4.324436664581299, + "learning_rate": 5.605317253665891e-06, + "loss": 0.1085, + "step": 32819 + }, + { + "epoch": 2.04514378730331, + "grad_norm": 3.9547154903411865, + "learning_rate": 5.605180211045635e-06, + "loss": 0.2897, + "step": 32820 + }, + { + "epoch": 2.045157352143245, + "grad_norm": 6.965083122253418, + "learning_rate": 5.60504316842538e-06, + "loss": 0.1998, + "step": 32821 + }, + { + "epoch": 2.0451709169831798, + "grad_norm": 5.0389275550842285, + "learning_rate": 5.604906125805126e-06, + "loss": 0.1592, + "step": 32822 + }, + { + "epoch": 2.0451844818231146, + "grad_norm": 6.1566548347473145, + "learning_rate": 5.604769083184871e-06, + "loss": 0.1715, + "step": 32823 + }, + { + "epoch": 2.0451980466630495, + "grad_norm": 4.911836624145508, + "learning_rate": 5.604632040564616e-06, + "loss": 0.1765, + "step": 32824 + }, + { + "epoch": 2.0452116115029844, + "grad_norm": 3.278221607208252, + "learning_rate": 5.604494997944361e-06, + "loss": 0.066, + "step": 32825 + }, + { + "epoch": 2.0452251763429192, + "grad_norm": 5.617729187011719, + "learning_rate": 5.604357955324107e-06, + "loss": 0.1554, + "step": 32826 + }, + { + "epoch": 2.045238741182854, + "grad_norm": 3.6926167011260986, + "learning_rate": 5.6042209127038515e-06, + "loss": 0.1575, + "step": 32827 + }, + { + "epoch": 2.045252306022789, + "grad_norm": 3.179910659790039, + "learning_rate": 5.604083870083597e-06, + "loss": 0.082, + "step": 32828 + }, + { + "epoch": 2.045265870862724, + "grad_norm": 4.693575382232666, + "learning_rate": 5.603946827463341e-06, + "loss": 0.2377, + "step": 32829 + }, + { + "epoch": 2.0452794357026587, + "grad_norm": 4.486457824707031, + "learning_rate": 5.603809784843086e-06, + "loss": 0.151, + "step": 32830 + }, + { + "epoch": 2.0452930005425936, + "grad_norm": 3.881511926651001, + "learning_rate": 5.603672742222832e-06, + "loss": 0.1618, + "step": 32831 + }, + { + "epoch": 2.0453065653825284, + "grad_norm": 4.057583332061768, + "learning_rate": 5.603535699602577e-06, + "loss": 0.2024, + "step": 32832 + }, + { + "epoch": 2.0453201302224633, + "grad_norm": 4.090681552886963, + "learning_rate": 5.603398656982322e-06, + "loss": 0.1309, + "step": 32833 + }, + { + "epoch": 2.045333695062398, + "grad_norm": 4.049610614776611, + "learning_rate": 5.603261614362067e-06, + "loss": 0.1338, + "step": 32834 + }, + { + "epoch": 2.045347259902333, + "grad_norm": 3.387489080429077, + "learning_rate": 5.603124571741813e-06, + "loss": 0.0947, + "step": 32835 + }, + { + "epoch": 2.045360824742268, + "grad_norm": 4.657264232635498, + "learning_rate": 5.602987529121557e-06, + "loss": 0.2352, + "step": 32836 + }, + { + "epoch": 2.0453743895822027, + "grad_norm": 3.66504168510437, + "learning_rate": 5.602850486501302e-06, + "loss": 0.1178, + "step": 32837 + }, + { + "epoch": 2.0453879544221376, + "grad_norm": 4.313072204589844, + "learning_rate": 5.602713443881047e-06, + "loss": 0.1659, + "step": 32838 + }, + { + "epoch": 2.045401519262073, + "grad_norm": 4.522352695465088, + "learning_rate": 5.602576401260793e-06, + "loss": 0.2015, + "step": 32839 + }, + { + "epoch": 2.045415084102008, + "grad_norm": 4.418980598449707, + "learning_rate": 5.602439358640538e-06, + "loss": 0.1993, + "step": 32840 + }, + { + "epoch": 2.0454286489419427, + "grad_norm": 5.229631423950195, + "learning_rate": 5.602302316020283e-06, + "loss": 0.175, + "step": 32841 + }, + { + "epoch": 2.0454422137818775, + "grad_norm": 4.086696624755859, + "learning_rate": 5.6021652734000274e-06, + "loss": 0.1759, + "step": 32842 + }, + { + "epoch": 2.0454557786218124, + "grad_norm": 5.038846969604492, + "learning_rate": 5.602028230779773e-06, + "loss": 0.2444, + "step": 32843 + }, + { + "epoch": 2.0454693434617472, + "grad_norm": 4.863763332366943, + "learning_rate": 5.601891188159519e-06, + "loss": 0.1685, + "step": 32844 + }, + { + "epoch": 2.045482908301682, + "grad_norm": 5.499582767486572, + "learning_rate": 5.601754145539263e-06, + "loss": 0.2345, + "step": 32845 + }, + { + "epoch": 2.045496473141617, + "grad_norm": 6.1757731437683105, + "learning_rate": 5.601617102919008e-06, + "loss": 0.2798, + "step": 32846 + }, + { + "epoch": 2.045510037981552, + "grad_norm": 4.966898441314697, + "learning_rate": 5.6014800602987525e-06, + "loss": 0.2187, + "step": 32847 + }, + { + "epoch": 2.0455236028214867, + "grad_norm": 4.122717380523682, + "learning_rate": 5.6013430176784985e-06, + "loss": 0.2095, + "step": 32848 + }, + { + "epoch": 2.0455371676614216, + "grad_norm": 3.9985365867614746, + "learning_rate": 5.601205975058244e-06, + "loss": 0.168, + "step": 32849 + }, + { + "epoch": 2.0455507325013564, + "grad_norm": 5.783458232879639, + "learning_rate": 5.601068932437989e-06, + "loss": 0.2414, + "step": 32850 + }, + { + "epoch": 2.0455642973412913, + "grad_norm": 4.03044319152832, + "learning_rate": 5.600931889817733e-06, + "loss": 0.2134, + "step": 32851 + }, + { + "epoch": 2.045577862181226, + "grad_norm": 5.20336389541626, + "learning_rate": 5.600794847197479e-06, + "loss": 0.2449, + "step": 32852 + }, + { + "epoch": 2.045591427021161, + "grad_norm": 3.7117297649383545, + "learning_rate": 5.600657804577224e-06, + "loss": 0.1881, + "step": 32853 + }, + { + "epoch": 2.045604991861096, + "grad_norm": 4.726012706756592, + "learning_rate": 5.600520761956969e-06, + "loss": 0.2262, + "step": 32854 + }, + { + "epoch": 2.0456185567010308, + "grad_norm": 4.876471996307373, + "learning_rate": 5.600383719336714e-06, + "loss": 0.192, + "step": 32855 + }, + { + "epoch": 2.0456321215409656, + "grad_norm": 4.13856840133667, + "learning_rate": 5.60024667671646e-06, + "loss": 0.1174, + "step": 32856 + }, + { + "epoch": 2.0456456863809005, + "grad_norm": 4.804093360900879, + "learning_rate": 5.600109634096204e-06, + "loss": 0.2063, + "step": 32857 + }, + { + "epoch": 2.045659251220836, + "grad_norm": 4.591667175292969, + "learning_rate": 5.5999725914759495e-06, + "loss": 0.189, + "step": 32858 + }, + { + "epoch": 2.0456728160607707, + "grad_norm": 4.413074016571045, + "learning_rate": 5.599835548855695e-06, + "loss": 0.1192, + "step": 32859 + }, + { + "epoch": 2.0456863809007055, + "grad_norm": 3.4298524856567383, + "learning_rate": 5.599698506235439e-06, + "loss": 0.1973, + "step": 32860 + }, + { + "epoch": 2.0456999457406404, + "grad_norm": 4.835302829742432, + "learning_rate": 5.599561463615185e-06, + "loss": 0.226, + "step": 32861 + }, + { + "epoch": 2.0457135105805753, + "grad_norm": 3.7438488006591797, + "learning_rate": 5.59942442099493e-06, + "loss": 0.1243, + "step": 32862 + }, + { + "epoch": 2.04572707542051, + "grad_norm": 3.9875147342681885, + "learning_rate": 5.5992873783746745e-06, + "loss": 0.1413, + "step": 32863 + }, + { + "epoch": 2.045740640260445, + "grad_norm": 3.3347461223602295, + "learning_rate": 5.59915033575442e-06, + "loss": 0.1807, + "step": 32864 + }, + { + "epoch": 2.04575420510038, + "grad_norm": 3.2232706546783447, + "learning_rate": 5.599013293134166e-06, + "loss": 0.1382, + "step": 32865 + }, + { + "epoch": 2.0457677699403147, + "grad_norm": 4.526812553405762, + "learning_rate": 5.598876250513911e-06, + "loss": 0.2062, + "step": 32866 + }, + { + "epoch": 2.0457813347802496, + "grad_norm": 4.230914115905762, + "learning_rate": 5.598739207893655e-06, + "loss": 0.2369, + "step": 32867 + }, + { + "epoch": 2.0457948996201845, + "grad_norm": 4.20550012588501, + "learning_rate": 5.5986021652734e-06, + "loss": 0.1847, + "step": 32868 + }, + { + "epoch": 2.0458084644601193, + "grad_norm": 3.4662532806396484, + "learning_rate": 5.5984651226531464e-06, + "loss": 0.1797, + "step": 32869 + }, + { + "epoch": 2.045822029300054, + "grad_norm": 4.187618732452393, + "learning_rate": 5.598328080032891e-06, + "loss": 0.1468, + "step": 32870 + }, + { + "epoch": 2.045835594139989, + "grad_norm": 4.969371318817139, + "learning_rate": 5.598191037412636e-06, + "loss": 0.2005, + "step": 32871 + }, + { + "epoch": 2.045849158979924, + "grad_norm": 5.145946025848389, + "learning_rate": 5.59805399479238e-06, + "loss": 0.2556, + "step": 32872 + }, + { + "epoch": 2.045862723819859, + "grad_norm": 4.593642234802246, + "learning_rate": 5.5979169521721254e-06, + "loss": 0.1761, + "step": 32873 + }, + { + "epoch": 2.0458762886597937, + "grad_norm": 5.804953098297119, + "learning_rate": 5.5977799095518715e-06, + "loss": 0.246, + "step": 32874 + }, + { + "epoch": 2.0458898534997285, + "grad_norm": 4.481743335723877, + "learning_rate": 5.597642866931617e-06, + "loss": 0.1421, + "step": 32875 + }, + { + "epoch": 2.0459034183396634, + "grad_norm": 3.9222025871276855, + "learning_rate": 5.597505824311361e-06, + "loss": 0.104, + "step": 32876 + }, + { + "epoch": 2.0459169831795987, + "grad_norm": 3.868697166442871, + "learning_rate": 5.597368781691106e-06, + "loss": 0.1321, + "step": 32877 + }, + { + "epoch": 2.0459305480195336, + "grad_norm": 4.10301399230957, + "learning_rate": 5.597231739070852e-06, + "loss": 0.1769, + "step": 32878 + }, + { + "epoch": 2.0459441128594684, + "grad_norm": 5.537866592407227, + "learning_rate": 5.5970946964505965e-06, + "loss": 0.1733, + "step": 32879 + }, + { + "epoch": 2.0459576776994033, + "grad_norm": 3.803673028945923, + "learning_rate": 5.596957653830342e-06, + "loss": 0.1406, + "step": 32880 + }, + { + "epoch": 2.045971242539338, + "grad_norm": 4.521854877471924, + "learning_rate": 5.596820611210087e-06, + "loss": 0.235, + "step": 32881 + }, + { + "epoch": 2.045984807379273, + "grad_norm": 5.04767370223999, + "learning_rate": 5.596683568589832e-06, + "loss": 0.2217, + "step": 32882 + }, + { + "epoch": 2.045998372219208, + "grad_norm": 3.490513324737549, + "learning_rate": 5.596546525969577e-06, + "loss": 0.1251, + "step": 32883 + }, + { + "epoch": 2.0460119370591427, + "grad_norm": 5.110513210296631, + "learning_rate": 5.596409483349322e-06, + "loss": 0.1932, + "step": 32884 + }, + { + "epoch": 2.0460255018990776, + "grad_norm": 4.347048759460449, + "learning_rate": 5.596272440729067e-06, + "loss": 0.1333, + "step": 32885 + }, + { + "epoch": 2.0460390667390125, + "grad_norm": 6.318069934844971, + "learning_rate": 5.596135398108812e-06, + "loss": 0.1911, + "step": 32886 + }, + { + "epoch": 2.0460526315789473, + "grad_norm": 3.359143018722534, + "learning_rate": 5.595998355488558e-06, + "loss": 0.0995, + "step": 32887 + }, + { + "epoch": 2.046066196418882, + "grad_norm": 4.566503524780273, + "learning_rate": 5.595861312868302e-06, + "loss": 0.1722, + "step": 32888 + }, + { + "epoch": 2.046079761258817, + "grad_norm": 3.8348305225372314, + "learning_rate": 5.5957242702480475e-06, + "loss": 0.1061, + "step": 32889 + }, + { + "epoch": 2.046093326098752, + "grad_norm": 4.1485595703125, + "learning_rate": 5.595587227627793e-06, + "loss": 0.0923, + "step": 32890 + }, + { + "epoch": 2.046106890938687, + "grad_norm": 4.700128078460693, + "learning_rate": 5.595450185007538e-06, + "loss": 0.1545, + "step": 32891 + }, + { + "epoch": 2.0461204557786217, + "grad_norm": 4.511589527130127, + "learning_rate": 5.595313142387283e-06, + "loss": 0.1723, + "step": 32892 + }, + { + "epoch": 2.0461340206185565, + "grad_norm": 3.2439610958099365, + "learning_rate": 5.595176099767028e-06, + "loss": 0.1036, + "step": 32893 + }, + { + "epoch": 2.0461475854584914, + "grad_norm": 4.103337287902832, + "learning_rate": 5.5950390571467725e-06, + "loss": 0.1239, + "step": 32894 + }, + { + "epoch": 2.0461611502984263, + "grad_norm": 4.674832820892334, + "learning_rate": 5.5949020145265185e-06, + "loss": 0.165, + "step": 32895 + }, + { + "epoch": 2.0461747151383616, + "grad_norm": 3.586796998977661, + "learning_rate": 5.594764971906264e-06, + "loss": 0.1383, + "step": 32896 + }, + { + "epoch": 2.0461882799782964, + "grad_norm": 5.749266147613525, + "learning_rate": 5.594627929286008e-06, + "loss": 0.2258, + "step": 32897 + }, + { + "epoch": 2.0462018448182313, + "grad_norm": 4.518834590911865, + "learning_rate": 5.594490886665753e-06, + "loss": 0.1779, + "step": 32898 + }, + { + "epoch": 2.046215409658166, + "grad_norm": 4.145565509796143, + "learning_rate": 5.594353844045498e-06, + "loss": 0.2089, + "step": 32899 + }, + { + "epoch": 2.046228974498101, + "grad_norm": 5.411869525909424, + "learning_rate": 5.5942168014252444e-06, + "loss": 0.272, + "step": 32900 + }, + { + "epoch": 2.046242539338036, + "grad_norm": 4.32988166809082, + "learning_rate": 5.594079758804989e-06, + "loss": 0.1365, + "step": 32901 + }, + { + "epoch": 2.0462561041779708, + "grad_norm": 5.119082927703857, + "learning_rate": 5.593942716184734e-06, + "loss": 0.1222, + "step": 32902 + }, + { + "epoch": 2.0462696690179056, + "grad_norm": 4.940980911254883, + "learning_rate": 5.593805673564478e-06, + "loss": 0.1059, + "step": 32903 + }, + { + "epoch": 2.0462832338578405, + "grad_norm": 5.501920223236084, + "learning_rate": 5.593668630944224e-06, + "loss": 0.2628, + "step": 32904 + }, + { + "epoch": 2.0462967986977754, + "grad_norm": 4.626515865325928, + "learning_rate": 5.5935315883239695e-06, + "loss": 0.1534, + "step": 32905 + }, + { + "epoch": 2.0463103635377102, + "grad_norm": 3.4522430896759033, + "learning_rate": 5.593394545703714e-06, + "loss": 0.1269, + "step": 32906 + }, + { + "epoch": 2.046323928377645, + "grad_norm": 4.239388942718506, + "learning_rate": 5.593257503083459e-06, + "loss": 0.136, + "step": 32907 + }, + { + "epoch": 2.04633749321758, + "grad_norm": 3.3872008323669434, + "learning_rate": 5.593120460463205e-06, + "loss": 0.1817, + "step": 32908 + }, + { + "epoch": 2.046351058057515, + "grad_norm": 3.6599557399749756, + "learning_rate": 5.59298341784295e-06, + "loss": 0.1473, + "step": 32909 + }, + { + "epoch": 2.0463646228974497, + "grad_norm": 4.624715328216553, + "learning_rate": 5.5928463752226945e-06, + "loss": 0.2084, + "step": 32910 + }, + { + "epoch": 2.0463781877373846, + "grad_norm": 3.975999355316162, + "learning_rate": 5.59270933260244e-06, + "loss": 0.0984, + "step": 32911 + }, + { + "epoch": 2.0463917525773194, + "grad_norm": 3.7945048809051514, + "learning_rate": 5.592572289982184e-06, + "loss": 0.1114, + "step": 32912 + }, + { + "epoch": 2.0464053174172543, + "grad_norm": 4.8448944091796875, + "learning_rate": 5.59243524736193e-06, + "loss": 0.2312, + "step": 32913 + }, + { + "epoch": 2.046418882257189, + "grad_norm": 4.557831764221191, + "learning_rate": 5.592298204741675e-06, + "loss": 0.1291, + "step": 32914 + }, + { + "epoch": 2.0464324470971245, + "grad_norm": 4.563733100891113, + "learning_rate": 5.59216116212142e-06, + "loss": 0.1515, + "step": 32915 + }, + { + "epoch": 2.0464460119370593, + "grad_norm": 5.009810924530029, + "learning_rate": 5.592024119501165e-06, + "loss": 0.2536, + "step": 32916 + }, + { + "epoch": 2.046459576776994, + "grad_norm": 4.637267112731934, + "learning_rate": 5.591887076880911e-06, + "loss": 0.1878, + "step": 32917 + }, + { + "epoch": 2.046473141616929, + "grad_norm": 4.4129958152771, + "learning_rate": 5.591750034260656e-06, + "loss": 0.1895, + "step": 32918 + }, + { + "epoch": 2.046486706456864, + "grad_norm": 4.351622104644775, + "learning_rate": 5.5916129916404e-06, + "loss": 0.1321, + "step": 32919 + }, + { + "epoch": 2.046500271296799, + "grad_norm": 3.0621285438537598, + "learning_rate": 5.5914759490201455e-06, + "loss": 0.155, + "step": 32920 + }, + { + "epoch": 2.0465138361367337, + "grad_norm": 4.52423620223999, + "learning_rate": 5.5913389063998915e-06, + "loss": 0.1792, + "step": 32921 + }, + { + "epoch": 2.0465274009766685, + "grad_norm": 4.396218776702881, + "learning_rate": 5.591201863779636e-06, + "loss": 0.1771, + "step": 32922 + }, + { + "epoch": 2.0465409658166034, + "grad_norm": 3.875803232192993, + "learning_rate": 5.591064821159381e-06, + "loss": 0.1537, + "step": 32923 + }, + { + "epoch": 2.0465545306565383, + "grad_norm": 4.095250606536865, + "learning_rate": 5.590927778539126e-06, + "loss": 0.144, + "step": 32924 + }, + { + "epoch": 2.046568095496473, + "grad_norm": 3.2487518787384033, + "learning_rate": 5.590790735918871e-06, + "loss": 0.1223, + "step": 32925 + }, + { + "epoch": 2.046581660336408, + "grad_norm": 2.716613292694092, + "learning_rate": 5.5906536932986165e-06, + "loss": 0.1173, + "step": 32926 + }, + { + "epoch": 2.046595225176343, + "grad_norm": 4.2016096115112305, + "learning_rate": 5.590516650678362e-06, + "loss": 0.1161, + "step": 32927 + }, + { + "epoch": 2.0466087900162777, + "grad_norm": 6.4496541023254395, + "learning_rate": 5.590379608058106e-06, + "loss": 0.2827, + "step": 32928 + }, + { + "epoch": 2.0466223548562126, + "grad_norm": 4.409578323364258, + "learning_rate": 5.590242565437851e-06, + "loss": 0.2568, + "step": 32929 + }, + { + "epoch": 2.0466359196961474, + "grad_norm": 6.068375587463379, + "learning_rate": 5.590105522817597e-06, + "loss": 0.2926, + "step": 32930 + }, + { + "epoch": 2.0466494845360823, + "grad_norm": 4.9799089431762695, + "learning_rate": 5.589968480197342e-06, + "loss": 0.1856, + "step": 32931 + }, + { + "epoch": 2.046663049376017, + "grad_norm": 4.878389835357666, + "learning_rate": 5.589831437577087e-06, + "loss": 0.2227, + "step": 32932 + }, + { + "epoch": 2.046676614215952, + "grad_norm": 3.507451057434082, + "learning_rate": 5.589694394956832e-06, + "loss": 0.1295, + "step": 32933 + }, + { + "epoch": 2.0466901790558873, + "grad_norm": 5.083434581756592, + "learning_rate": 5.589557352336578e-06, + "loss": 0.1644, + "step": 32934 + }, + { + "epoch": 2.046703743895822, + "grad_norm": 4.777708053588867, + "learning_rate": 5.589420309716322e-06, + "loss": 0.161, + "step": 32935 + }, + { + "epoch": 2.046717308735757, + "grad_norm": 5.309625625610352, + "learning_rate": 5.5892832670960675e-06, + "loss": 0.2852, + "step": 32936 + }, + { + "epoch": 2.046730873575692, + "grad_norm": 4.03338623046875, + "learning_rate": 5.589146224475812e-06, + "loss": 0.197, + "step": 32937 + }, + { + "epoch": 2.046744438415627, + "grad_norm": 4.86077356338501, + "learning_rate": 5.589009181855558e-06, + "loss": 0.2242, + "step": 32938 + }, + { + "epoch": 2.0467580032555617, + "grad_norm": 6.125213623046875, + "learning_rate": 5.588872139235303e-06, + "loss": 0.2365, + "step": 32939 + }, + { + "epoch": 2.0467715680954965, + "grad_norm": 5.616026401519775, + "learning_rate": 5.588735096615047e-06, + "loss": 0.151, + "step": 32940 + }, + { + "epoch": 2.0467851329354314, + "grad_norm": 4.158665180206299, + "learning_rate": 5.5885980539947925e-06, + "loss": 0.1269, + "step": 32941 + }, + { + "epoch": 2.0467986977753663, + "grad_norm": 4.206359386444092, + "learning_rate": 5.588461011374538e-06, + "loss": 0.1662, + "step": 32942 + }, + { + "epoch": 2.046812262615301, + "grad_norm": 3.025838613510132, + "learning_rate": 5.588323968754284e-06, + "loss": 0.0969, + "step": 32943 + }, + { + "epoch": 2.046825827455236, + "grad_norm": 4.400938034057617, + "learning_rate": 5.588186926134028e-06, + "loss": 0.1457, + "step": 32944 + }, + { + "epoch": 2.046839392295171, + "grad_norm": 5.63422966003418, + "learning_rate": 5.588049883513773e-06, + "loss": 0.2168, + "step": 32945 + }, + { + "epoch": 2.0468529571351057, + "grad_norm": 3.0477702617645264, + "learning_rate": 5.587912840893518e-06, + "loss": 0.1953, + "step": 32946 + }, + { + "epoch": 2.0468665219750406, + "grad_norm": 3.3454699516296387, + "learning_rate": 5.587775798273264e-06, + "loss": 0.1289, + "step": 32947 + }, + { + "epoch": 2.0468800868149755, + "grad_norm": 3.1268529891967773, + "learning_rate": 5.587638755653009e-06, + "loss": 0.1147, + "step": 32948 + }, + { + "epoch": 2.0468936516549103, + "grad_norm": 4.093272686004639, + "learning_rate": 5.587501713032754e-06, + "loss": 0.1957, + "step": 32949 + }, + { + "epoch": 2.046907216494845, + "grad_norm": 3.890491008758545, + "learning_rate": 5.587364670412498e-06, + "loss": 0.1335, + "step": 32950 + }, + { + "epoch": 2.04692078133478, + "grad_norm": 3.6187102794647217, + "learning_rate": 5.587227627792244e-06, + "loss": 0.1017, + "step": 32951 + }, + { + "epoch": 2.046934346174715, + "grad_norm": 3.971541404724121, + "learning_rate": 5.5870905851719895e-06, + "loss": 0.147, + "step": 32952 + }, + { + "epoch": 2.0469479110146502, + "grad_norm": 3.560558557510376, + "learning_rate": 5.586953542551734e-06, + "loss": 0.2176, + "step": 32953 + }, + { + "epoch": 2.046961475854585, + "grad_norm": 5.05259895324707, + "learning_rate": 5.586816499931479e-06, + "loss": 0.1389, + "step": 32954 + }, + { + "epoch": 2.04697504069452, + "grad_norm": 2.658003568649292, + "learning_rate": 5.586679457311223e-06, + "loss": 0.1141, + "step": 32955 + }, + { + "epoch": 2.046988605534455, + "grad_norm": 3.4495015144348145, + "learning_rate": 5.586542414690969e-06, + "loss": 0.1425, + "step": 32956 + }, + { + "epoch": 2.0470021703743897, + "grad_norm": 5.187408924102783, + "learning_rate": 5.5864053720707145e-06, + "loss": 0.1453, + "step": 32957 + }, + { + "epoch": 2.0470157352143246, + "grad_norm": 3.552807331085205, + "learning_rate": 5.58626832945046e-06, + "loss": 0.0938, + "step": 32958 + }, + { + "epoch": 2.0470293000542594, + "grad_norm": 3.4052746295928955, + "learning_rate": 5.586131286830204e-06, + "loss": 0.1417, + "step": 32959 + }, + { + "epoch": 2.0470428648941943, + "grad_norm": 3.7795515060424805, + "learning_rate": 5.58599424420995e-06, + "loss": 0.1362, + "step": 32960 + }, + { + "epoch": 2.047056429734129, + "grad_norm": 4.921939373016357, + "learning_rate": 5.585857201589695e-06, + "loss": 0.1516, + "step": 32961 + }, + { + "epoch": 2.047069994574064, + "grad_norm": 4.855523109436035, + "learning_rate": 5.58572015896944e-06, + "loss": 0.2514, + "step": 32962 + }, + { + "epoch": 2.047083559413999, + "grad_norm": 4.612720489501953, + "learning_rate": 5.585583116349185e-06, + "loss": 0.1963, + "step": 32963 + }, + { + "epoch": 2.0470971242539338, + "grad_norm": 3.568134069442749, + "learning_rate": 5.585446073728931e-06, + "loss": 0.1645, + "step": 32964 + }, + { + "epoch": 2.0471106890938686, + "grad_norm": 3.298542022705078, + "learning_rate": 5.585309031108675e-06, + "loss": 0.1066, + "step": 32965 + }, + { + "epoch": 2.0471242539338035, + "grad_norm": 5.507323741912842, + "learning_rate": 5.58517198848842e-06, + "loss": 0.1712, + "step": 32966 + }, + { + "epoch": 2.0471378187737383, + "grad_norm": 4.380934715270996, + "learning_rate": 5.5850349458681655e-06, + "loss": 0.1634, + "step": 32967 + }, + { + "epoch": 2.047151383613673, + "grad_norm": 3.5985054969787598, + "learning_rate": 5.58489790324791e-06, + "loss": 0.1367, + "step": 32968 + }, + { + "epoch": 2.047164948453608, + "grad_norm": 3.3980777263641357, + "learning_rate": 5.584760860627656e-06, + "loss": 0.1481, + "step": 32969 + }, + { + "epoch": 2.047178513293543, + "grad_norm": 5.02021598815918, + "learning_rate": 5.584623818007401e-06, + "loss": 0.173, + "step": 32970 + }, + { + "epoch": 2.047192078133478, + "grad_norm": 3.7994112968444824, + "learning_rate": 5.584486775387145e-06, + "loss": 0.1218, + "step": 32971 + }, + { + "epoch": 2.047205642973413, + "grad_norm": 2.5936458110809326, + "learning_rate": 5.5843497327668905e-06, + "loss": 0.0838, + "step": 32972 + }, + { + "epoch": 2.047219207813348, + "grad_norm": 3.921419620513916, + "learning_rate": 5.5842126901466366e-06, + "loss": 0.1641, + "step": 32973 + }, + { + "epoch": 2.047232772653283, + "grad_norm": 2.7970499992370605, + "learning_rate": 5.584075647526381e-06, + "loss": 0.1143, + "step": 32974 + }, + { + "epoch": 2.0472463374932177, + "grad_norm": 3.856116771697998, + "learning_rate": 5.583938604906126e-06, + "loss": 0.1388, + "step": 32975 + }, + { + "epoch": 2.0472599023331526, + "grad_norm": 4.553953170776367, + "learning_rate": 5.583801562285871e-06, + "loss": 0.1656, + "step": 32976 + }, + { + "epoch": 2.0472734671730874, + "grad_norm": 2.932493209838867, + "learning_rate": 5.583664519665617e-06, + "loss": 0.0808, + "step": 32977 + }, + { + "epoch": 2.0472870320130223, + "grad_norm": 5.810069561004639, + "learning_rate": 5.583527477045362e-06, + "loss": 0.2897, + "step": 32978 + }, + { + "epoch": 2.047300596852957, + "grad_norm": 4.05963659286499, + "learning_rate": 5.583390434425107e-06, + "loss": 0.119, + "step": 32979 + }, + { + "epoch": 2.047314161692892, + "grad_norm": 2.6970224380493164, + "learning_rate": 5.583253391804851e-06, + "loss": 0.105, + "step": 32980 + }, + { + "epoch": 2.047327726532827, + "grad_norm": 3.64640736579895, + "learning_rate": 5.583116349184596e-06, + "loss": 0.1088, + "step": 32981 + }, + { + "epoch": 2.0473412913727618, + "grad_norm": 3.802746534347534, + "learning_rate": 5.582979306564342e-06, + "loss": 0.1437, + "step": 32982 + }, + { + "epoch": 2.0473548562126966, + "grad_norm": 3.406094551086426, + "learning_rate": 5.5828422639440875e-06, + "loss": 0.1225, + "step": 32983 + }, + { + "epoch": 2.0473684210526315, + "grad_norm": 3.6698050498962402, + "learning_rate": 5.582705221323832e-06, + "loss": 0.1752, + "step": 32984 + }, + { + "epoch": 2.0473819858925664, + "grad_norm": 4.832592010498047, + "learning_rate": 5.582568178703577e-06, + "loss": 0.1153, + "step": 32985 + }, + { + "epoch": 2.0473955507325012, + "grad_norm": 5.600584030151367, + "learning_rate": 5.582431136083323e-06, + "loss": 0.2041, + "step": 32986 + }, + { + "epoch": 2.047409115572436, + "grad_norm": 5.940456390380859, + "learning_rate": 5.582294093463067e-06, + "loss": 0.2133, + "step": 32987 + }, + { + "epoch": 2.047422680412371, + "grad_norm": 4.214011192321777, + "learning_rate": 5.5821570508428126e-06, + "loss": 0.1595, + "step": 32988 + }, + { + "epoch": 2.047436245252306, + "grad_norm": 2.8575384616851807, + "learning_rate": 5.582020008222557e-06, + "loss": 0.1026, + "step": 32989 + }, + { + "epoch": 2.0474498100922407, + "grad_norm": 3.5283124446868896, + "learning_rate": 5.581882965602303e-06, + "loss": 0.1373, + "step": 32990 + }, + { + "epoch": 2.047463374932176, + "grad_norm": 4.473843097686768, + "learning_rate": 5.581745922982048e-06, + "loss": 0.2286, + "step": 32991 + }, + { + "epoch": 2.047476939772111, + "grad_norm": 4.786129951477051, + "learning_rate": 5.581608880361793e-06, + "loss": 0.1443, + "step": 32992 + }, + { + "epoch": 2.0474905046120457, + "grad_norm": 3.977264165878296, + "learning_rate": 5.581471837741538e-06, + "loss": 0.1119, + "step": 32993 + }, + { + "epoch": 2.0475040694519806, + "grad_norm": 4.625195026397705, + "learning_rate": 5.581334795121283e-06, + "loss": 0.1456, + "step": 32994 + }, + { + "epoch": 2.0475176342919155, + "grad_norm": 5.298923969268799, + "learning_rate": 5.581197752501029e-06, + "loss": 0.2098, + "step": 32995 + }, + { + "epoch": 2.0475311991318503, + "grad_norm": 4.686511993408203, + "learning_rate": 5.581060709880773e-06, + "loss": 0.127, + "step": 32996 + }, + { + "epoch": 2.047544763971785, + "grad_norm": 4.107317924499512, + "learning_rate": 5.580923667260518e-06, + "loss": 0.1382, + "step": 32997 + }, + { + "epoch": 2.04755832881172, + "grad_norm": 2.5715484619140625, + "learning_rate": 5.5807866246402635e-06, + "loss": 0.1165, + "step": 32998 + }, + { + "epoch": 2.047571893651655, + "grad_norm": 5.230008602142334, + "learning_rate": 5.580649582020009e-06, + "loss": 0.2746, + "step": 32999 + }, + { + "epoch": 2.04758545849159, + "grad_norm": 5.247121334075928, + "learning_rate": 5.580512539399754e-06, + "loss": 0.1682, + "step": 33000 + }, + { + "epoch": 2.0475990233315247, + "grad_norm": 5.347707271575928, + "learning_rate": 5.580375496779499e-06, + "loss": 0.1146, + "step": 33001 + }, + { + "epoch": 2.0476125881714595, + "grad_norm": 4.433112621307373, + "learning_rate": 5.580238454159243e-06, + "loss": 0.2466, + "step": 33002 + }, + { + "epoch": 2.0476261530113944, + "grad_norm": 5.760982036590576, + "learning_rate": 5.580101411538989e-06, + "loss": 0.2283, + "step": 33003 + }, + { + "epoch": 2.0476397178513293, + "grad_norm": 4.3037919998168945, + "learning_rate": 5.5799643689187346e-06, + "loss": 0.1611, + "step": 33004 + }, + { + "epoch": 2.047653282691264, + "grad_norm": 4.67963981628418, + "learning_rate": 5.579827326298479e-06, + "loss": 0.1912, + "step": 33005 + }, + { + "epoch": 2.047666847531199, + "grad_norm": 3.882648229598999, + "learning_rate": 5.579690283678224e-06, + "loss": 0.1652, + "step": 33006 + }, + { + "epoch": 2.047680412371134, + "grad_norm": 4.24918270111084, + "learning_rate": 5.57955324105797e-06, + "loss": 0.1418, + "step": 33007 + }, + { + "epoch": 2.0476939772110687, + "grad_norm": 5.448393821716309, + "learning_rate": 5.579416198437715e-06, + "loss": 0.1451, + "step": 33008 + }, + { + "epoch": 2.0477075420510036, + "grad_norm": 3.7962543964385986, + "learning_rate": 5.57927915581746e-06, + "loss": 0.1213, + "step": 33009 + }, + { + "epoch": 2.047721106890939, + "grad_norm": 3.9320690631866455, + "learning_rate": 5.579142113197205e-06, + "loss": 0.1751, + "step": 33010 + }, + { + "epoch": 2.0477346717308738, + "grad_norm": 4.128190994262695, + "learning_rate": 5.579005070576949e-06, + "loss": 0.1766, + "step": 33011 + }, + { + "epoch": 2.0477482365708086, + "grad_norm": 6.0205535888671875, + "learning_rate": 5.578868027956695e-06, + "loss": 0.2057, + "step": 33012 + }, + { + "epoch": 2.0477618014107435, + "grad_norm": 5.884095191955566, + "learning_rate": 5.57873098533644e-06, + "loss": 0.2401, + "step": 33013 + }, + { + "epoch": 2.0477753662506784, + "grad_norm": 6.834526538848877, + "learning_rate": 5.578593942716185e-06, + "loss": 0.2117, + "step": 33014 + }, + { + "epoch": 2.047788931090613, + "grad_norm": 4.085616111755371, + "learning_rate": 5.57845690009593e-06, + "loss": 0.1672, + "step": 33015 + }, + { + "epoch": 2.047802495930548, + "grad_norm": 4.647893905639648, + "learning_rate": 5.578319857475676e-06, + "loss": 0.174, + "step": 33016 + }, + { + "epoch": 2.047816060770483, + "grad_norm": 7.798650741577148, + "learning_rate": 5.578182814855421e-06, + "loss": 0.2622, + "step": 33017 + }, + { + "epoch": 2.047829625610418, + "grad_norm": 5.145317554473877, + "learning_rate": 5.578045772235165e-06, + "loss": 0.1783, + "step": 33018 + }, + { + "epoch": 2.0478431904503527, + "grad_norm": 4.243060111999512, + "learning_rate": 5.5779087296149106e-06, + "loss": 0.161, + "step": 33019 + }, + { + "epoch": 2.0478567552902875, + "grad_norm": 3.790048837661743, + "learning_rate": 5.577771686994657e-06, + "loss": 0.0931, + "step": 33020 + }, + { + "epoch": 2.0478703201302224, + "grad_norm": 3.7864577770233154, + "learning_rate": 5.577634644374401e-06, + "loss": 0.1394, + "step": 33021 + }, + { + "epoch": 2.0478838849701573, + "grad_norm": 2.7692689895629883, + "learning_rate": 5.577497601754146e-06, + "loss": 0.1206, + "step": 33022 + }, + { + "epoch": 2.047897449810092, + "grad_norm": 4.515834808349609, + "learning_rate": 5.577360559133891e-06, + "loss": 0.1609, + "step": 33023 + }, + { + "epoch": 2.047911014650027, + "grad_norm": 5.076347351074219, + "learning_rate": 5.577223516513636e-06, + "loss": 0.1925, + "step": 33024 + }, + { + "epoch": 2.047924579489962, + "grad_norm": 3.8520596027374268, + "learning_rate": 5.577086473893382e-06, + "loss": 0.1872, + "step": 33025 + }, + { + "epoch": 2.0479381443298967, + "grad_norm": 4.321295261383057, + "learning_rate": 5.576949431273127e-06, + "loss": 0.1279, + "step": 33026 + }, + { + "epoch": 2.0479517091698316, + "grad_norm": 4.22285270690918, + "learning_rate": 5.576812388652871e-06, + "loss": 0.1795, + "step": 33027 + }, + { + "epoch": 2.0479652740097665, + "grad_norm": 3.7831430435180664, + "learning_rate": 5.576675346032616e-06, + "loss": 0.1473, + "step": 33028 + }, + { + "epoch": 2.0479788388497018, + "grad_norm": 5.676750183105469, + "learning_rate": 5.576538303412362e-06, + "loss": 0.244, + "step": 33029 + }, + { + "epoch": 2.0479924036896366, + "grad_norm": 4.573310852050781, + "learning_rate": 5.576401260792107e-06, + "loss": 0.164, + "step": 33030 + }, + { + "epoch": 2.0480059685295715, + "grad_norm": 3.5345020294189453, + "learning_rate": 5.576264218171852e-06, + "loss": 0.1073, + "step": 33031 + }, + { + "epoch": 2.0480195333695064, + "grad_norm": 4.171061992645264, + "learning_rate": 5.576127175551597e-06, + "loss": 0.2433, + "step": 33032 + }, + { + "epoch": 2.0480330982094412, + "grad_norm": 2.8054862022399902, + "learning_rate": 5.575990132931342e-06, + "loss": 0.1205, + "step": 33033 + }, + { + "epoch": 2.048046663049376, + "grad_norm": 3.2251412868499756, + "learning_rate": 5.575853090311087e-06, + "loss": 0.1438, + "step": 33034 + }, + { + "epoch": 2.048060227889311, + "grad_norm": 3.928225040435791, + "learning_rate": 5.5757160476908326e-06, + "loss": 0.156, + "step": 33035 + }, + { + "epoch": 2.048073792729246, + "grad_norm": 3.5185165405273438, + "learning_rate": 5.575579005070577e-06, + "loss": 0.1179, + "step": 33036 + }, + { + "epoch": 2.0480873575691807, + "grad_norm": 3.18510103225708, + "learning_rate": 5.575441962450322e-06, + "loss": 0.1074, + "step": 33037 + }, + { + "epoch": 2.0481009224091156, + "grad_norm": 3.4787631034851074, + "learning_rate": 5.575304919830068e-06, + "loss": 0.1204, + "step": 33038 + }, + { + "epoch": 2.0481144872490504, + "grad_norm": 4.277920246124268, + "learning_rate": 5.5751678772098124e-06, + "loss": 0.1478, + "step": 33039 + }, + { + "epoch": 2.0481280520889853, + "grad_norm": 3.279654026031494, + "learning_rate": 5.575030834589558e-06, + "loss": 0.1514, + "step": 33040 + }, + { + "epoch": 2.04814161692892, + "grad_norm": 4.472829818725586, + "learning_rate": 5.574893791969303e-06, + "loss": 0.1613, + "step": 33041 + }, + { + "epoch": 2.048155181768855, + "grad_norm": 3.1246776580810547, + "learning_rate": 5.574756749349049e-06, + "loss": 0.1107, + "step": 33042 + }, + { + "epoch": 2.04816874660879, + "grad_norm": 4.758264064788818, + "learning_rate": 5.574619706728793e-06, + "loss": 0.1421, + "step": 33043 + }, + { + "epoch": 2.0481823114487248, + "grad_norm": 5.1512451171875, + "learning_rate": 5.574482664108538e-06, + "loss": 0.2003, + "step": 33044 + }, + { + "epoch": 2.0481958762886596, + "grad_norm": 4.408531665802002, + "learning_rate": 5.574345621488283e-06, + "loss": 0.1836, + "step": 33045 + }, + { + "epoch": 2.0482094411285945, + "grad_norm": 4.783205986022949, + "learning_rate": 5.574208578868029e-06, + "loss": 0.2201, + "step": 33046 + }, + { + "epoch": 2.0482230059685294, + "grad_norm": 6.362489223480225, + "learning_rate": 5.574071536247774e-06, + "loss": 0.0891, + "step": 33047 + }, + { + "epoch": 2.0482365708084647, + "grad_norm": 3.981760263442993, + "learning_rate": 5.573934493627518e-06, + "loss": 0.1239, + "step": 33048 + }, + { + "epoch": 2.0482501356483995, + "grad_norm": 6.806806564331055, + "learning_rate": 5.573797451007263e-06, + "loss": 0.291, + "step": 33049 + }, + { + "epoch": 2.0482637004883344, + "grad_norm": 4.020619869232178, + "learning_rate": 5.5736604083870086e-06, + "loss": 0.1451, + "step": 33050 + }, + { + "epoch": 2.0482772653282693, + "grad_norm": 4.244359016418457, + "learning_rate": 5.573523365766755e-06, + "loss": 0.1592, + "step": 33051 + }, + { + "epoch": 2.048290830168204, + "grad_norm": 3.661750316619873, + "learning_rate": 5.573386323146499e-06, + "loss": 0.107, + "step": 33052 + }, + { + "epoch": 2.048304395008139, + "grad_norm": 6.9925923347473145, + "learning_rate": 5.573249280526244e-06, + "loss": 0.3053, + "step": 33053 + }, + { + "epoch": 2.048317959848074, + "grad_norm": 4.123327732086182, + "learning_rate": 5.5731122379059884e-06, + "loss": 0.1327, + "step": 33054 + }, + { + "epoch": 2.0483315246880087, + "grad_norm": 3.9155802726745605, + "learning_rate": 5.5729751952857345e-06, + "loss": 0.1648, + "step": 33055 + }, + { + "epoch": 2.0483450895279436, + "grad_norm": 6.139108180999756, + "learning_rate": 5.57283815266548e-06, + "loss": 0.1622, + "step": 33056 + }, + { + "epoch": 2.0483586543678785, + "grad_norm": 3.2734286785125732, + "learning_rate": 5.572701110045225e-06, + "loss": 0.1153, + "step": 33057 + }, + { + "epoch": 2.0483722192078133, + "grad_norm": 4.438693046569824, + "learning_rate": 5.572564067424969e-06, + "loss": 0.1532, + "step": 33058 + }, + { + "epoch": 2.048385784047748, + "grad_norm": 5.637927532196045, + "learning_rate": 5.572427024804715e-06, + "loss": 0.1994, + "step": 33059 + }, + { + "epoch": 2.048399348887683, + "grad_norm": 4.429859638214111, + "learning_rate": 5.57228998218446e-06, + "loss": 0.1294, + "step": 33060 + }, + { + "epoch": 2.048412913727618, + "grad_norm": 4.860764980316162, + "learning_rate": 5.572152939564205e-06, + "loss": 0.184, + "step": 33061 + }, + { + "epoch": 2.0484264785675528, + "grad_norm": 5.053316593170166, + "learning_rate": 5.57201589694395e-06, + "loss": 0.2341, + "step": 33062 + }, + { + "epoch": 2.0484400434074876, + "grad_norm": 4.3942179679870605, + "learning_rate": 5.571878854323694e-06, + "loss": 0.1372, + "step": 33063 + }, + { + "epoch": 2.0484536082474225, + "grad_norm": 4.396140098571777, + "learning_rate": 5.57174181170344e-06, + "loss": 0.2459, + "step": 33064 + }, + { + "epoch": 2.0484671730873574, + "grad_norm": 3.7972447872161865, + "learning_rate": 5.571604769083185e-06, + "loss": 0.1613, + "step": 33065 + }, + { + "epoch": 2.0484807379272922, + "grad_norm": 5.0385894775390625, + "learning_rate": 5.5714677264629306e-06, + "loss": 0.1786, + "step": 33066 + }, + { + "epoch": 2.0484943027672275, + "grad_norm": 6.632956504821777, + "learning_rate": 5.571330683842675e-06, + "loss": 0.2186, + "step": 33067 + }, + { + "epoch": 2.0485078676071624, + "grad_norm": 3.559121608734131, + "learning_rate": 5.571193641222421e-06, + "loss": 0.1398, + "step": 33068 + }, + { + "epoch": 2.0485214324470973, + "grad_norm": 3.9643473625183105, + "learning_rate": 5.571056598602166e-06, + "loss": 0.1269, + "step": 33069 + }, + { + "epoch": 2.048534997287032, + "grad_norm": 5.1956095695495605, + "learning_rate": 5.5709195559819104e-06, + "loss": 0.1741, + "step": 33070 + }, + { + "epoch": 2.048548562126967, + "grad_norm": 4.455199718475342, + "learning_rate": 5.570782513361656e-06, + "loss": 0.1776, + "step": 33071 + }, + { + "epoch": 2.048562126966902, + "grad_norm": 5.814861297607422, + "learning_rate": 5.570645470741402e-06, + "loss": 0.2613, + "step": 33072 + }, + { + "epoch": 2.0485756918068367, + "grad_norm": 3.773555278778076, + "learning_rate": 5.570508428121146e-06, + "loss": 0.1397, + "step": 33073 + }, + { + "epoch": 2.0485892566467716, + "grad_norm": 3.8424909114837646, + "learning_rate": 5.570371385500891e-06, + "loss": 0.1408, + "step": 33074 + }, + { + "epoch": 2.0486028214867065, + "grad_norm": 5.148669242858887, + "learning_rate": 5.570234342880636e-06, + "loss": 0.2009, + "step": 33075 + }, + { + "epoch": 2.0486163863266413, + "grad_norm": 4.599322319030762, + "learning_rate": 5.570097300260382e-06, + "loss": 0.1395, + "step": 33076 + }, + { + "epoch": 2.048629951166576, + "grad_norm": 3.703364610671997, + "learning_rate": 5.569960257640127e-06, + "loss": 0.0951, + "step": 33077 + }, + { + "epoch": 2.048643516006511, + "grad_norm": 3.622965097427368, + "learning_rate": 5.569823215019872e-06, + "loss": 0.1095, + "step": 33078 + }, + { + "epoch": 2.048657080846446, + "grad_norm": 4.920285224914551, + "learning_rate": 5.569686172399616e-06, + "loss": 0.1668, + "step": 33079 + }, + { + "epoch": 2.048670645686381, + "grad_norm": 4.65264892578125, + "learning_rate": 5.569549129779361e-06, + "loss": 0.1925, + "step": 33080 + }, + { + "epoch": 2.0486842105263157, + "grad_norm": 2.6812543869018555, + "learning_rate": 5.569412087159107e-06, + "loss": 0.061, + "step": 33081 + }, + { + "epoch": 2.0486977753662505, + "grad_norm": 4.916552543640137, + "learning_rate": 5.569275044538852e-06, + "loss": 0.1335, + "step": 33082 + }, + { + "epoch": 2.0487113402061854, + "grad_norm": 5.18197774887085, + "learning_rate": 5.569138001918597e-06, + "loss": 0.174, + "step": 33083 + }, + { + "epoch": 2.0487249050461203, + "grad_norm": 5.100924491882324, + "learning_rate": 5.569000959298342e-06, + "loss": 0.1838, + "step": 33084 + }, + { + "epoch": 2.0487384698860556, + "grad_norm": 6.27457332611084, + "learning_rate": 5.568863916678088e-06, + "loss": 0.1951, + "step": 33085 + }, + { + "epoch": 2.0487520347259904, + "grad_norm": 4.743306636810303, + "learning_rate": 5.5687268740578325e-06, + "loss": 0.159, + "step": 33086 + }, + { + "epoch": 2.0487655995659253, + "grad_norm": 6.09374475479126, + "learning_rate": 5.568589831437578e-06, + "loss": 0.2677, + "step": 33087 + }, + { + "epoch": 2.04877916440586, + "grad_norm": 3.658863067626953, + "learning_rate": 5.568452788817322e-06, + "loss": 0.1599, + "step": 33088 + }, + { + "epoch": 2.048792729245795, + "grad_norm": 4.671378135681152, + "learning_rate": 5.568315746197068e-06, + "loss": 0.1966, + "step": 33089 + }, + { + "epoch": 2.04880629408573, + "grad_norm": 5.276984691619873, + "learning_rate": 5.568178703576813e-06, + "loss": 0.1838, + "step": 33090 + }, + { + "epoch": 2.0488198589256648, + "grad_norm": 4.936059474945068, + "learning_rate": 5.568041660956558e-06, + "loss": 0.1747, + "step": 33091 + }, + { + "epoch": 2.0488334237655996, + "grad_norm": 3.3588244915008545, + "learning_rate": 5.567904618336303e-06, + "loss": 0.071, + "step": 33092 + }, + { + "epoch": 2.0488469886055345, + "grad_norm": 5.747990608215332, + "learning_rate": 5.567767575716048e-06, + "loss": 0.1927, + "step": 33093 + }, + { + "epoch": 2.0488605534454694, + "grad_norm": 4.933157920837402, + "learning_rate": 5.567630533095794e-06, + "loss": 0.1615, + "step": 33094 + }, + { + "epoch": 2.0488741182854042, + "grad_norm": 3.9819259643554688, + "learning_rate": 5.567493490475538e-06, + "loss": 0.1233, + "step": 33095 + }, + { + "epoch": 2.048887683125339, + "grad_norm": 5.239864349365234, + "learning_rate": 5.567356447855283e-06, + "loss": 0.1639, + "step": 33096 + }, + { + "epoch": 2.048901247965274, + "grad_norm": 3.473872423171997, + "learning_rate": 5.567219405235028e-06, + "loss": 0.0954, + "step": 33097 + }, + { + "epoch": 2.048914812805209, + "grad_norm": 5.068465709686279, + "learning_rate": 5.567082362614774e-06, + "loss": 0.1595, + "step": 33098 + }, + { + "epoch": 2.0489283776451437, + "grad_norm": 3.9933693408966064, + "learning_rate": 5.566945319994519e-06, + "loss": 0.133, + "step": 33099 + }, + { + "epoch": 2.0489419424850785, + "grad_norm": 5.454643726348877, + "learning_rate": 5.566808277374264e-06, + "loss": 0.2174, + "step": 33100 + }, + { + "epoch": 2.0489555073250134, + "grad_norm": 5.684656620025635, + "learning_rate": 5.5666712347540084e-06, + "loss": 0.171, + "step": 33101 + }, + { + "epoch": 2.0489690721649483, + "grad_norm": 3.454561233520508, + "learning_rate": 5.5665341921337545e-06, + "loss": 0.0986, + "step": 33102 + }, + { + "epoch": 2.048982637004883, + "grad_norm": 3.5508639812469482, + "learning_rate": 5.5663971495135e-06, + "loss": 0.1021, + "step": 33103 + }, + { + "epoch": 2.048996201844818, + "grad_norm": 4.155135631561279, + "learning_rate": 5.566260106893244e-06, + "loss": 0.1308, + "step": 33104 + }, + { + "epoch": 2.0490097666847533, + "grad_norm": 5.81545877456665, + "learning_rate": 5.566123064272989e-06, + "loss": 0.2791, + "step": 33105 + }, + { + "epoch": 2.049023331524688, + "grad_norm": 5.133398056030273, + "learning_rate": 5.565986021652734e-06, + "loss": 0.1466, + "step": 33106 + }, + { + "epoch": 2.049036896364623, + "grad_norm": 3.985818386077881, + "learning_rate": 5.5658489790324795e-06, + "loss": 0.1356, + "step": 33107 + }, + { + "epoch": 2.049050461204558, + "grad_norm": 3.4236977100372314, + "learning_rate": 5.565711936412225e-06, + "loss": 0.1222, + "step": 33108 + }, + { + "epoch": 2.049064026044493, + "grad_norm": 4.544102191925049, + "learning_rate": 5.56557489379197e-06, + "loss": 0.1088, + "step": 33109 + }, + { + "epoch": 2.0490775908844276, + "grad_norm": 4.894236087799072, + "learning_rate": 5.565437851171714e-06, + "loss": 0.2601, + "step": 33110 + }, + { + "epoch": 2.0490911557243625, + "grad_norm": 5.54179573059082, + "learning_rate": 5.56530080855146e-06, + "loss": 0.2371, + "step": 33111 + }, + { + "epoch": 2.0491047205642974, + "grad_norm": 4.084501266479492, + "learning_rate": 5.565163765931205e-06, + "loss": 0.0869, + "step": 33112 + }, + { + "epoch": 2.0491182854042322, + "grad_norm": 4.798716068267822, + "learning_rate": 5.56502672331095e-06, + "loss": 0.2029, + "step": 33113 + }, + { + "epoch": 2.049131850244167, + "grad_norm": 4.846728801727295, + "learning_rate": 5.564889680690695e-06, + "loss": 0.187, + "step": 33114 + }, + { + "epoch": 2.049145415084102, + "grad_norm": 6.197231292724609, + "learning_rate": 5.564752638070441e-06, + "loss": 0.194, + "step": 33115 + }, + { + "epoch": 2.049158979924037, + "grad_norm": 4.039585113525391, + "learning_rate": 5.564615595450185e-06, + "loss": 0.2092, + "step": 33116 + }, + { + "epoch": 2.0491725447639717, + "grad_norm": 3.922487735748291, + "learning_rate": 5.5644785528299305e-06, + "loss": 0.1216, + "step": 33117 + }, + { + "epoch": 2.0491861096039066, + "grad_norm": 3.794856071472168, + "learning_rate": 5.564341510209676e-06, + "loss": 0.1414, + "step": 33118 + }, + { + "epoch": 2.0491996744438414, + "grad_norm": 4.75031042098999, + "learning_rate": 5.56420446758942e-06, + "loss": 0.3454, + "step": 33119 + }, + { + "epoch": 2.0492132392837763, + "grad_norm": 4.258359909057617, + "learning_rate": 5.564067424969166e-06, + "loss": 0.2032, + "step": 33120 + }, + { + "epoch": 2.049226804123711, + "grad_norm": 5.453441619873047, + "learning_rate": 5.563930382348911e-06, + "loss": 0.2177, + "step": 33121 + }, + { + "epoch": 2.049240368963646, + "grad_norm": 6.655379772186279, + "learning_rate": 5.5637933397286555e-06, + "loss": 0.1936, + "step": 33122 + }, + { + "epoch": 2.0492539338035813, + "grad_norm": 5.155655860900879, + "learning_rate": 5.563656297108401e-06, + "loss": 0.2142, + "step": 33123 + }, + { + "epoch": 2.049267498643516, + "grad_norm": 5.155247211456299, + "learning_rate": 5.563519254488147e-06, + "loss": 0.2149, + "step": 33124 + }, + { + "epoch": 2.049281063483451, + "grad_norm": 3.552694797515869, + "learning_rate": 5.563382211867892e-06, + "loss": 0.1237, + "step": 33125 + }, + { + "epoch": 2.049294628323386, + "grad_norm": 5.9429850578308105, + "learning_rate": 5.563245169247636e-06, + "loss": 0.2752, + "step": 33126 + }, + { + "epoch": 2.049308193163321, + "grad_norm": 4.766165256500244, + "learning_rate": 5.563108126627381e-06, + "loss": 0.2015, + "step": 33127 + }, + { + "epoch": 2.0493217580032557, + "grad_norm": 4.852773189544678, + "learning_rate": 5.5629710840071274e-06, + "loss": 0.2465, + "step": 33128 + }, + { + "epoch": 2.0493353228431905, + "grad_norm": 4.906679153442383, + "learning_rate": 5.562834041386872e-06, + "loss": 0.1718, + "step": 33129 + }, + { + "epoch": 2.0493488876831254, + "grad_norm": 4.010128498077393, + "learning_rate": 5.562696998766617e-06, + "loss": 0.1743, + "step": 33130 + }, + { + "epoch": 2.0493624525230603, + "grad_norm": 5.575150012969971, + "learning_rate": 5.562559956146361e-06, + "loss": 0.1526, + "step": 33131 + }, + { + "epoch": 2.049376017362995, + "grad_norm": 5.0735626220703125, + "learning_rate": 5.5624229135261065e-06, + "loss": 0.1888, + "step": 33132 + }, + { + "epoch": 2.04938958220293, + "grad_norm": 5.439300060272217, + "learning_rate": 5.5622858709058525e-06, + "loss": 0.1645, + "step": 33133 + }, + { + "epoch": 2.049403147042865, + "grad_norm": 5.457746505737305, + "learning_rate": 5.562148828285598e-06, + "loss": 0.2399, + "step": 33134 + }, + { + "epoch": 2.0494167118827997, + "grad_norm": 4.628756046295166, + "learning_rate": 5.562011785665342e-06, + "loss": 0.2226, + "step": 33135 + }, + { + "epoch": 2.0494302767227346, + "grad_norm": 4.382485866546631, + "learning_rate": 5.561874743045087e-06, + "loss": 0.2418, + "step": 33136 + }, + { + "epoch": 2.0494438415626695, + "grad_norm": 4.719852924346924, + "learning_rate": 5.561737700424833e-06, + "loss": 0.2094, + "step": 33137 + }, + { + "epoch": 2.0494574064026043, + "grad_norm": 3.9989020824432373, + "learning_rate": 5.5616006578045775e-06, + "loss": 0.1769, + "step": 33138 + }, + { + "epoch": 2.049470971242539, + "grad_norm": 8.821228981018066, + "learning_rate": 5.561463615184323e-06, + "loss": 0.3367, + "step": 33139 + }, + { + "epoch": 2.049484536082474, + "grad_norm": 4.95918083190918, + "learning_rate": 5.561326572564068e-06, + "loss": 0.236, + "step": 33140 + }, + { + "epoch": 2.049498100922409, + "grad_norm": 4.653881549835205, + "learning_rate": 5.561189529943813e-06, + "loss": 0.3675, + "step": 33141 + }, + { + "epoch": 2.049511665762344, + "grad_norm": 4.180106163024902, + "learning_rate": 5.561052487323558e-06, + "loss": 0.1488, + "step": 33142 + }, + { + "epoch": 2.049525230602279, + "grad_norm": 4.891503810882568, + "learning_rate": 5.560915444703303e-06, + "loss": 0.2176, + "step": 33143 + }, + { + "epoch": 2.049538795442214, + "grad_norm": 4.823364734649658, + "learning_rate": 5.560778402083048e-06, + "loss": 0.2459, + "step": 33144 + }, + { + "epoch": 2.049552360282149, + "grad_norm": 6.008081912994385, + "learning_rate": 5.560641359462794e-06, + "loss": 0.2816, + "step": 33145 + }, + { + "epoch": 2.0495659251220837, + "grad_norm": 4.746716499328613, + "learning_rate": 5.560504316842539e-06, + "loss": 0.203, + "step": 33146 + }, + { + "epoch": 2.0495794899620186, + "grad_norm": 4.538222789764404, + "learning_rate": 5.560367274222283e-06, + "loss": 0.1556, + "step": 33147 + }, + { + "epoch": 2.0495930548019534, + "grad_norm": 5.134883880615234, + "learning_rate": 5.5602302316020285e-06, + "loss": 0.2594, + "step": 33148 + }, + { + "epoch": 2.0496066196418883, + "grad_norm": 4.073458194732666, + "learning_rate": 5.560093188981774e-06, + "loss": 0.1201, + "step": 33149 + }, + { + "epoch": 2.049620184481823, + "grad_norm": 7.994776248931885, + "learning_rate": 5.55995614636152e-06, + "loss": 0.237, + "step": 33150 + }, + { + "epoch": 2.049633749321758, + "grad_norm": 5.491556167602539, + "learning_rate": 5.559819103741264e-06, + "loss": 0.2077, + "step": 33151 + }, + { + "epoch": 2.049647314161693, + "grad_norm": 3.9893455505371094, + "learning_rate": 5.559682061121009e-06, + "loss": 0.1981, + "step": 33152 + }, + { + "epoch": 2.0496608790016277, + "grad_norm": 4.93526554107666, + "learning_rate": 5.5595450185007535e-06, + "loss": 0.2095, + "step": 33153 + }, + { + "epoch": 2.0496744438415626, + "grad_norm": 6.962979316711426, + "learning_rate": 5.5594079758804995e-06, + "loss": 0.2963, + "step": 33154 + }, + { + "epoch": 2.0496880086814975, + "grad_norm": 4.167845726013184, + "learning_rate": 5.559270933260245e-06, + "loss": 0.2139, + "step": 33155 + }, + { + "epoch": 2.0497015735214323, + "grad_norm": 4.357701778411865, + "learning_rate": 5.559133890639989e-06, + "loss": 0.1957, + "step": 33156 + }, + { + "epoch": 2.049715138361367, + "grad_norm": 4.050886154174805, + "learning_rate": 5.558996848019734e-06, + "loss": 0.1176, + "step": 33157 + }, + { + "epoch": 2.049728703201302, + "grad_norm": 5.76212739944458, + "learning_rate": 5.55885980539948e-06, + "loss": 0.2508, + "step": 33158 + }, + { + "epoch": 2.049742268041237, + "grad_norm": 3.019077777862549, + "learning_rate": 5.5587227627792254e-06, + "loss": 0.1126, + "step": 33159 + }, + { + "epoch": 2.049755832881172, + "grad_norm": 4.201850414276123, + "learning_rate": 5.55858572015897e-06, + "loss": 0.1531, + "step": 33160 + }, + { + "epoch": 2.049769397721107, + "grad_norm": 5.1134748458862305, + "learning_rate": 5.558448677538715e-06, + "loss": 0.2299, + "step": 33161 + }, + { + "epoch": 2.049782962561042, + "grad_norm": 4.040091037750244, + "learning_rate": 5.558311634918459e-06, + "loss": 0.192, + "step": 33162 + }, + { + "epoch": 2.049796527400977, + "grad_norm": 6.1570143699646, + "learning_rate": 5.558174592298205e-06, + "loss": 0.1705, + "step": 33163 + }, + { + "epoch": 2.0498100922409117, + "grad_norm": 3.029541015625, + "learning_rate": 5.5580375496779505e-06, + "loss": 0.1182, + "step": 33164 + }, + { + "epoch": 2.0498236570808466, + "grad_norm": 4.219806671142578, + "learning_rate": 5.557900507057696e-06, + "loss": 0.1488, + "step": 33165 + }, + { + "epoch": 2.0498372219207814, + "grad_norm": 5.222376823425293, + "learning_rate": 5.55776346443744e-06, + "loss": 0.1471, + "step": 33166 + }, + { + "epoch": 2.0498507867607163, + "grad_norm": 7.276233673095703, + "learning_rate": 5.557626421817186e-06, + "loss": 0.2342, + "step": 33167 + }, + { + "epoch": 2.049864351600651, + "grad_norm": 5.564558029174805, + "learning_rate": 5.557489379196931e-06, + "loss": 0.1945, + "step": 33168 + }, + { + "epoch": 2.049877916440586, + "grad_norm": 7.326227188110352, + "learning_rate": 5.5573523365766755e-06, + "loss": 0.3333, + "step": 33169 + }, + { + "epoch": 2.049891481280521, + "grad_norm": 4.149921417236328, + "learning_rate": 5.557215293956421e-06, + "loss": 0.1194, + "step": 33170 + }, + { + "epoch": 2.0499050461204558, + "grad_norm": 5.450437545776367, + "learning_rate": 5.557078251336167e-06, + "loss": 0.2894, + "step": 33171 + }, + { + "epoch": 2.0499186109603906, + "grad_norm": 4.639832496643066, + "learning_rate": 5.556941208715911e-06, + "loss": 0.12, + "step": 33172 + }, + { + "epoch": 2.0499321758003255, + "grad_norm": 5.3227434158325195, + "learning_rate": 5.556804166095656e-06, + "loss": 0.1924, + "step": 33173 + }, + { + "epoch": 2.0499457406402604, + "grad_norm": 3.771475076675415, + "learning_rate": 5.556667123475401e-06, + "loss": 0.1012, + "step": 33174 + }, + { + "epoch": 2.0499457406402604, + "eval_loss": 0.3416061997413635, + "eval_noise_accuracy": NaN, + "eval_runtime": 4541.5387, + "eval_samples_per_second": 1.106, + "eval_steps_per_second": 0.069, + "eval_wer": 26.148404436069324, + "step": 33174 + }, + { + "epoch": 2.0499593054801952, + "grad_norm": 4.707533359527588, + "learning_rate": 5.556530080855146e-06, + "loss": 0.2071, + "step": 33175 + }, + { + "epoch": 2.04997287032013, + "grad_norm": 7.428777694702148, + "learning_rate": 5.556393038234892e-06, + "loss": 0.2131, + "step": 33176 + }, + { + "epoch": 2.049986435160065, + "grad_norm": 3.157947301864624, + "learning_rate": 5.556255995614637e-06, + "loss": 0.1171, + "step": 33177 + }, + { + "epoch": 2.05, + "grad_norm": 4.067929744720459, + "learning_rate": 5.556118952994381e-06, + "loss": 0.1531, + "step": 33178 + }, + { + "epoch": 2.0500135648399347, + "grad_norm": 6.103881359100342, + "learning_rate": 5.5559819103741265e-06, + "loss": 0.23, + "step": 33179 + }, + { + "epoch": 2.0500271296798696, + "grad_norm": 5.145921230316162, + "learning_rate": 5.5558448677538725e-06, + "loss": 0.1745, + "step": 33180 + }, + { + "epoch": 2.050040694519805, + "grad_norm": 5.882827281951904, + "learning_rate": 5.555707825133617e-06, + "loss": 0.2197, + "step": 33181 + }, + { + "epoch": 2.0500542593597397, + "grad_norm": 3.8992960453033447, + "learning_rate": 5.555570782513362e-06, + "loss": 0.1605, + "step": 33182 + }, + { + "epoch": 2.0500678241996746, + "grad_norm": 3.6276090145111084, + "learning_rate": 5.555433739893107e-06, + "loss": 0.1527, + "step": 33183 + }, + { + "epoch": 2.0500813890396095, + "grad_norm": 4.428309440612793, + "learning_rate": 5.555296697272853e-06, + "loss": 0.16, + "step": 33184 + }, + { + "epoch": 2.0500949538795443, + "grad_norm": 5.630728244781494, + "learning_rate": 5.5551596546525975e-06, + "loss": 0.3248, + "step": 33185 + }, + { + "epoch": 2.050108518719479, + "grad_norm": 4.04826545715332, + "learning_rate": 5.555022612032343e-06, + "loss": 0.2212, + "step": 33186 + }, + { + "epoch": 2.050122083559414, + "grad_norm": 4.4914937019348145, + "learning_rate": 5.554885569412087e-06, + "loss": 0.1513, + "step": 33187 + }, + { + "epoch": 2.050135648399349, + "grad_norm": 6.68916130065918, + "learning_rate": 5.554748526791832e-06, + "loss": 0.3369, + "step": 33188 + }, + { + "epoch": 2.050149213239284, + "grad_norm": 4.489754676818848, + "learning_rate": 5.554611484171578e-06, + "loss": 0.1843, + "step": 33189 + }, + { + "epoch": 2.0501627780792187, + "grad_norm": 4.6938676834106445, + "learning_rate": 5.554474441551323e-06, + "loss": 0.1911, + "step": 33190 + }, + { + "epoch": 2.0501763429191535, + "grad_norm": 3.820284843444824, + "learning_rate": 5.554337398931068e-06, + "loss": 0.1878, + "step": 33191 + }, + { + "epoch": 2.0501899077590884, + "grad_norm": 4.825601100921631, + "learning_rate": 5.554200356310813e-06, + "loss": 0.2251, + "step": 33192 + }, + { + "epoch": 2.0502034725990232, + "grad_norm": 4.787213325500488, + "learning_rate": 5.554063313690559e-06, + "loss": 0.2591, + "step": 33193 + }, + { + "epoch": 2.050217037438958, + "grad_norm": 5.2866597175598145, + "learning_rate": 5.553926271070303e-06, + "loss": 0.3147, + "step": 33194 + }, + { + "epoch": 2.050230602278893, + "grad_norm": 4.848115921020508, + "learning_rate": 5.5537892284500485e-06, + "loss": 0.3142, + "step": 33195 + }, + { + "epoch": 2.050244167118828, + "grad_norm": 4.580886363983154, + "learning_rate": 5.553652185829793e-06, + "loss": 0.2733, + "step": 33196 + }, + { + "epoch": 2.0502577319587627, + "grad_norm": 5.588908672332764, + "learning_rate": 5.553515143209539e-06, + "loss": 0.1521, + "step": 33197 + }, + { + "epoch": 2.0502712967986976, + "grad_norm": 4.653001308441162, + "learning_rate": 5.553378100589284e-06, + "loss": 0.1666, + "step": 33198 + }, + { + "epoch": 2.050284861638633, + "grad_norm": 3.862400531768799, + "learning_rate": 5.553241057969029e-06, + "loss": 0.1607, + "step": 33199 + }, + { + "epoch": 2.0502984264785677, + "grad_norm": 4.658569812774658, + "learning_rate": 5.5531040153487735e-06, + "loss": 0.2128, + "step": 33200 + }, + { + "epoch": 2.0503119913185026, + "grad_norm": 5.402076244354248, + "learning_rate": 5.552966972728519e-06, + "loss": 0.1899, + "step": 33201 + }, + { + "epoch": 2.0503255561584375, + "grad_norm": 4.362599849700928, + "learning_rate": 5.552829930108265e-06, + "loss": 0.1777, + "step": 33202 + }, + { + "epoch": 2.0503391209983723, + "grad_norm": 2.9104080200195312, + "learning_rate": 5.552692887488009e-06, + "loss": 0.116, + "step": 33203 + }, + { + "epoch": 2.050352685838307, + "grad_norm": 4.962741851806641, + "learning_rate": 5.552555844867754e-06, + "loss": 0.1711, + "step": 33204 + }, + { + "epoch": 2.050366250678242, + "grad_norm": 3.5382771492004395, + "learning_rate": 5.552418802247499e-06, + "loss": 0.111, + "step": 33205 + }, + { + "epoch": 2.050379815518177, + "grad_norm": 4.445010185241699, + "learning_rate": 5.552281759627245e-06, + "loss": 0.1624, + "step": 33206 + }, + { + "epoch": 2.050393380358112, + "grad_norm": 3.417166233062744, + "learning_rate": 5.55214471700699e-06, + "loss": 0.1459, + "step": 33207 + }, + { + "epoch": 2.0504069451980467, + "grad_norm": 3.735478639602661, + "learning_rate": 5.552007674386735e-06, + "loss": 0.1735, + "step": 33208 + }, + { + "epoch": 2.0504205100379815, + "grad_norm": 5.012730598449707, + "learning_rate": 5.551870631766479e-06, + "loss": 0.2063, + "step": 33209 + }, + { + "epoch": 2.0504340748779164, + "grad_norm": 5.028580188751221, + "learning_rate": 5.551733589146225e-06, + "loss": 0.1511, + "step": 33210 + }, + { + "epoch": 2.0504476397178513, + "grad_norm": 4.857041358947754, + "learning_rate": 5.5515965465259705e-06, + "loss": 0.2633, + "step": 33211 + }, + { + "epoch": 2.050461204557786, + "grad_norm": 4.109816074371338, + "learning_rate": 5.551459503905715e-06, + "loss": 0.121, + "step": 33212 + }, + { + "epoch": 2.050474769397721, + "grad_norm": 3.152343273162842, + "learning_rate": 5.55132246128546e-06, + "loss": 0.0718, + "step": 33213 + }, + { + "epoch": 2.050488334237656, + "grad_norm": 5.086043357849121, + "learning_rate": 5.551185418665206e-06, + "loss": 0.1381, + "step": 33214 + }, + { + "epoch": 2.0505018990775907, + "grad_norm": 4.78048038482666, + "learning_rate": 5.55104837604495e-06, + "loss": 0.1956, + "step": 33215 + }, + { + "epoch": 2.0505154639175256, + "grad_norm": 3.7194323539733887, + "learning_rate": 5.5509113334246955e-06, + "loss": 0.145, + "step": 33216 + }, + { + "epoch": 2.0505290287574605, + "grad_norm": 3.6656150817871094, + "learning_rate": 5.550774290804441e-06, + "loss": 0.1549, + "step": 33217 + }, + { + "epoch": 2.0505425935973958, + "grad_norm": 5.2986369132995605, + "learning_rate": 5.550637248184185e-06, + "loss": 0.206, + "step": 33218 + }, + { + "epoch": 2.0505561584373306, + "grad_norm": 4.330502510070801, + "learning_rate": 5.550500205563931e-06, + "loss": 0.1722, + "step": 33219 + }, + { + "epoch": 2.0505697232772655, + "grad_norm": 7.022960662841797, + "learning_rate": 5.550363162943676e-06, + "loss": 0.2071, + "step": 33220 + }, + { + "epoch": 2.0505832881172004, + "grad_norm": 3.780062675476074, + "learning_rate": 5.550226120323421e-06, + "loss": 0.1655, + "step": 33221 + }, + { + "epoch": 2.0505968529571352, + "grad_norm": 5.2509260177612305, + "learning_rate": 5.550089077703166e-06, + "loss": 0.2137, + "step": 33222 + }, + { + "epoch": 2.05061041779707, + "grad_norm": 5.171702861785889, + "learning_rate": 5.549952035082912e-06, + "loss": 0.2028, + "step": 33223 + }, + { + "epoch": 2.050623982637005, + "grad_norm": 3.4186623096466064, + "learning_rate": 5.549814992462656e-06, + "loss": 0.1058, + "step": 33224 + }, + { + "epoch": 2.05063754747694, + "grad_norm": 5.401627540588379, + "learning_rate": 5.549677949842401e-06, + "loss": 0.1791, + "step": 33225 + }, + { + "epoch": 2.0506511123168747, + "grad_norm": 7.112062931060791, + "learning_rate": 5.5495409072221465e-06, + "loss": 0.2391, + "step": 33226 + }, + { + "epoch": 2.0506646771568096, + "grad_norm": 4.151144981384277, + "learning_rate": 5.5494038646018925e-06, + "loss": 0.1296, + "step": 33227 + }, + { + "epoch": 2.0506782419967444, + "grad_norm": 5.242328643798828, + "learning_rate": 5.549266821981637e-06, + "loss": 0.2255, + "step": 33228 + }, + { + "epoch": 2.0506918068366793, + "grad_norm": 4.2408294677734375, + "learning_rate": 5.549129779361382e-06, + "loss": 0.1396, + "step": 33229 + }, + { + "epoch": 2.050705371676614, + "grad_norm": 5.622593879699707, + "learning_rate": 5.548992736741126e-06, + "loss": 0.2469, + "step": 33230 + }, + { + "epoch": 2.050718936516549, + "grad_norm": 5.665379047393799, + "learning_rate": 5.5488556941208715e-06, + "loss": 0.1772, + "step": 33231 + }, + { + "epoch": 2.050732501356484, + "grad_norm": 4.710973262786865, + "learning_rate": 5.5487186515006176e-06, + "loss": 0.1621, + "step": 33232 + }, + { + "epoch": 2.0507460661964187, + "grad_norm": 4.740562915802002, + "learning_rate": 5.548581608880363e-06, + "loss": 0.1533, + "step": 33233 + }, + { + "epoch": 2.0507596310363536, + "grad_norm": 4.272677898406982, + "learning_rate": 5.548444566260107e-06, + "loss": 0.1536, + "step": 33234 + }, + { + "epoch": 2.0507731958762885, + "grad_norm": 2.8397037982940674, + "learning_rate": 5.548307523639852e-06, + "loss": 0.0946, + "step": 33235 + }, + { + "epoch": 2.0507867607162233, + "grad_norm": 5.584941387176514, + "learning_rate": 5.548170481019598e-06, + "loss": 0.1908, + "step": 33236 + }, + { + "epoch": 2.0508003255561587, + "grad_norm": 4.448690891265869, + "learning_rate": 5.548033438399343e-06, + "loss": 0.118, + "step": 33237 + }, + { + "epoch": 2.0508138903960935, + "grad_norm": 4.51439905166626, + "learning_rate": 5.547896395779088e-06, + "loss": 0.0792, + "step": 33238 + }, + { + "epoch": 2.0508274552360284, + "grad_norm": 4.79908561706543, + "learning_rate": 5.547759353158832e-06, + "loss": 0.1156, + "step": 33239 + }, + { + "epoch": 2.0508410200759633, + "grad_norm": 5.821778774261475, + "learning_rate": 5.547622310538578e-06, + "loss": 0.2087, + "step": 33240 + }, + { + "epoch": 2.050854584915898, + "grad_norm": 3.4450180530548096, + "learning_rate": 5.547485267918323e-06, + "loss": 0.1405, + "step": 33241 + }, + { + "epoch": 2.050868149755833, + "grad_norm": 3.816880702972412, + "learning_rate": 5.5473482252980685e-06, + "loss": 0.1139, + "step": 33242 + }, + { + "epoch": 2.050881714595768, + "grad_norm": 4.821766376495361, + "learning_rate": 5.547211182677813e-06, + "loss": 0.17, + "step": 33243 + }, + { + "epoch": 2.0508952794357027, + "grad_norm": 4.0759077072143555, + "learning_rate": 5.547074140057558e-06, + "loss": 0.13, + "step": 33244 + }, + { + "epoch": 2.0509088442756376, + "grad_norm": 5.467583656311035, + "learning_rate": 5.546937097437304e-06, + "loss": 0.165, + "step": 33245 + }, + { + "epoch": 2.0509224091155724, + "grad_norm": 4.283092498779297, + "learning_rate": 5.546800054817048e-06, + "loss": 0.1761, + "step": 33246 + }, + { + "epoch": 2.0509359739555073, + "grad_norm": 3.195798635482788, + "learning_rate": 5.5466630121967936e-06, + "loss": 0.1, + "step": 33247 + }, + { + "epoch": 2.050949538795442, + "grad_norm": 5.573493003845215, + "learning_rate": 5.546525969576539e-06, + "loss": 0.1596, + "step": 33248 + }, + { + "epoch": 2.050963103635377, + "grad_norm": 6.384302139282227, + "learning_rate": 5.546388926956284e-06, + "loss": 0.2038, + "step": 33249 + }, + { + "epoch": 2.050976668475312, + "grad_norm": 5.2941484451293945, + "learning_rate": 5.546251884336029e-06, + "loss": 0.1645, + "step": 33250 + }, + { + "epoch": 2.0509902333152468, + "grad_norm": 3.346994400024414, + "learning_rate": 5.546114841715774e-06, + "loss": 0.107, + "step": 33251 + }, + { + "epoch": 2.0510037981551816, + "grad_norm": 4.551461219787598, + "learning_rate": 5.545977799095519e-06, + "loss": 0.2247, + "step": 33252 + }, + { + "epoch": 2.0510173629951165, + "grad_norm": 4.351534366607666, + "learning_rate": 5.545840756475265e-06, + "loss": 0.227, + "step": 33253 + }, + { + "epoch": 2.0510309278350514, + "grad_norm": 3.2313573360443115, + "learning_rate": 5.54570371385501e-06, + "loss": 0.1471, + "step": 33254 + }, + { + "epoch": 2.0510444926749862, + "grad_norm": 2.9792957305908203, + "learning_rate": 5.545566671234754e-06, + "loss": 0.096, + "step": 33255 + }, + { + "epoch": 2.0510580575149215, + "grad_norm": 4.993274211883545, + "learning_rate": 5.545429628614499e-06, + "loss": 0.2336, + "step": 33256 + }, + { + "epoch": 2.0510716223548564, + "grad_norm": 4.793374538421631, + "learning_rate": 5.5452925859942445e-06, + "loss": 0.1615, + "step": 33257 + }, + { + "epoch": 2.0510851871947913, + "grad_norm": 4.310244083404541, + "learning_rate": 5.54515554337399e-06, + "loss": 0.1892, + "step": 33258 + }, + { + "epoch": 2.051098752034726, + "grad_norm": 5.625546932220459, + "learning_rate": 5.545018500753735e-06, + "loss": 0.2559, + "step": 33259 + }, + { + "epoch": 2.051112316874661, + "grad_norm": 3.096468925476074, + "learning_rate": 5.54488145813348e-06, + "loss": 0.0849, + "step": 33260 + }, + { + "epoch": 2.051125881714596, + "grad_norm": 3.4214353561401367, + "learning_rate": 5.544744415513224e-06, + "loss": 0.1129, + "step": 33261 + }, + { + "epoch": 2.0511394465545307, + "grad_norm": 6.0023112297058105, + "learning_rate": 5.54460737289297e-06, + "loss": 0.3206, + "step": 33262 + }, + { + "epoch": 2.0511530113944656, + "grad_norm": 6.734333038330078, + "learning_rate": 5.5444703302727156e-06, + "loss": 0.2914, + "step": 33263 + }, + { + "epoch": 2.0511665762344005, + "grad_norm": 5.04567289352417, + "learning_rate": 5.54433328765246e-06, + "loss": 0.3702, + "step": 33264 + }, + { + "epoch": 2.0511801410743353, + "grad_norm": 4.18075704574585, + "learning_rate": 5.544196245032205e-06, + "loss": 0.1018, + "step": 33265 + }, + { + "epoch": 2.05119370591427, + "grad_norm": 4.411576747894287, + "learning_rate": 5.544059202411951e-06, + "loss": 0.1199, + "step": 33266 + }, + { + "epoch": 2.051207270754205, + "grad_norm": 3.058769464492798, + "learning_rate": 5.543922159791696e-06, + "loss": 0.1194, + "step": 33267 + }, + { + "epoch": 2.05122083559414, + "grad_norm": 5.096037864685059, + "learning_rate": 5.543785117171441e-06, + "loss": 0.2144, + "step": 33268 + }, + { + "epoch": 2.051234400434075, + "grad_norm": 4.942098140716553, + "learning_rate": 5.543648074551186e-06, + "loss": 0.2355, + "step": 33269 + }, + { + "epoch": 2.0512479652740097, + "grad_norm": 4.971188545227051, + "learning_rate": 5.54351103193093e-06, + "loss": 0.232, + "step": 33270 + }, + { + "epoch": 2.0512615301139445, + "grad_norm": 3.7733352184295654, + "learning_rate": 5.543373989310676e-06, + "loss": 0.1883, + "step": 33271 + }, + { + "epoch": 2.0512750949538794, + "grad_norm": 5.549583911895752, + "learning_rate": 5.543236946690421e-06, + "loss": 0.172, + "step": 33272 + }, + { + "epoch": 2.0512886597938143, + "grad_norm": 4.615512847900391, + "learning_rate": 5.543099904070166e-06, + "loss": 0.2262, + "step": 33273 + }, + { + "epoch": 2.051302224633749, + "grad_norm": 4.55172061920166, + "learning_rate": 5.542962861449911e-06, + "loss": 0.1969, + "step": 33274 + }, + { + "epoch": 2.0513157894736844, + "grad_norm": 5.565757751464844, + "learning_rate": 5.542825818829657e-06, + "loss": 0.1589, + "step": 33275 + }, + { + "epoch": 2.0513293543136193, + "grad_norm": 4.471691608428955, + "learning_rate": 5.542688776209402e-06, + "loss": 0.1693, + "step": 33276 + }, + { + "epoch": 2.051342919153554, + "grad_norm": 4.201033592224121, + "learning_rate": 5.542551733589146e-06, + "loss": 0.1672, + "step": 33277 + }, + { + "epoch": 2.051356483993489, + "grad_norm": 4.751383304595947, + "learning_rate": 5.5424146909688916e-06, + "loss": 0.2847, + "step": 33278 + }, + { + "epoch": 2.051370048833424, + "grad_norm": 5.233029842376709, + "learning_rate": 5.542277648348638e-06, + "loss": 0.2434, + "step": 33279 + }, + { + "epoch": 2.0513836136733588, + "grad_norm": 5.716373920440674, + "learning_rate": 5.542140605728382e-06, + "loss": 0.1518, + "step": 33280 + }, + { + "epoch": 2.0513971785132936, + "grad_norm": 5.251829147338867, + "learning_rate": 5.542003563108127e-06, + "loss": 0.3617, + "step": 33281 + }, + { + "epoch": 2.0514107433532285, + "grad_norm": 4.0266571044921875, + "learning_rate": 5.541866520487872e-06, + "loss": 0.1367, + "step": 33282 + }, + { + "epoch": 2.0514243081931633, + "grad_norm": 5.625945568084717, + "learning_rate": 5.5417294778676175e-06, + "loss": 0.2582, + "step": 33283 + }, + { + "epoch": 2.051437873033098, + "grad_norm": 5.496835231781006, + "learning_rate": 5.541592435247363e-06, + "loss": 0.242, + "step": 33284 + }, + { + "epoch": 2.051451437873033, + "grad_norm": 5.531557083129883, + "learning_rate": 5.541455392627108e-06, + "loss": 0.2305, + "step": 33285 + }, + { + "epoch": 2.051465002712968, + "grad_norm": 5.133620738983154, + "learning_rate": 5.541318350006852e-06, + "loss": 0.1852, + "step": 33286 + }, + { + "epoch": 2.051478567552903, + "grad_norm": 6.06619930267334, + "learning_rate": 5.541181307386597e-06, + "loss": 0.3032, + "step": 33287 + }, + { + "epoch": 2.0514921323928377, + "grad_norm": 4.425962448120117, + "learning_rate": 5.541044264766343e-06, + "loss": 0.1059, + "step": 33288 + }, + { + "epoch": 2.0515056972327725, + "grad_norm": 4.612727642059326, + "learning_rate": 5.540907222146088e-06, + "loss": 0.1222, + "step": 33289 + }, + { + "epoch": 2.0515192620727074, + "grad_norm": 4.492236137390137, + "learning_rate": 5.540770179525833e-06, + "loss": 0.1823, + "step": 33290 + }, + { + "epoch": 2.0515328269126423, + "grad_norm": 4.23216438293457, + "learning_rate": 5.540633136905578e-06, + "loss": 0.1799, + "step": 33291 + }, + { + "epoch": 2.051546391752577, + "grad_norm": 4.56997013092041, + "learning_rate": 5.540496094285324e-06, + "loss": 0.1719, + "step": 33292 + }, + { + "epoch": 2.051559956592512, + "grad_norm": 3.963526964187622, + "learning_rate": 5.540359051665068e-06, + "loss": 0.1737, + "step": 33293 + }, + { + "epoch": 2.0515735214324473, + "grad_norm": 4.855352878570557, + "learning_rate": 5.5402220090448136e-06, + "loss": 0.2082, + "step": 33294 + }, + { + "epoch": 2.051587086272382, + "grad_norm": 5.349179744720459, + "learning_rate": 5.540084966424558e-06, + "loss": 0.2306, + "step": 33295 + }, + { + "epoch": 2.051600651112317, + "grad_norm": 5.0006561279296875, + "learning_rate": 5.539947923804304e-06, + "loss": 0.1305, + "step": 33296 + }, + { + "epoch": 2.051614215952252, + "grad_norm": 4.217953681945801, + "learning_rate": 5.539810881184049e-06, + "loss": 0.1721, + "step": 33297 + }, + { + "epoch": 2.0516277807921868, + "grad_norm": 3.1592631340026855, + "learning_rate": 5.5396738385637934e-06, + "loss": 0.1093, + "step": 33298 + }, + { + "epoch": 2.0516413456321216, + "grad_norm": 4.434487342834473, + "learning_rate": 5.539536795943539e-06, + "loss": 0.1239, + "step": 33299 + }, + { + "epoch": 2.0516549104720565, + "grad_norm": 4.049318790435791, + "learning_rate": 5.539399753323284e-06, + "loss": 0.1706, + "step": 33300 + }, + { + "epoch": 2.0516684753119914, + "grad_norm": 6.724670886993408, + "learning_rate": 5.53926271070303e-06, + "loss": 0.0924, + "step": 33301 + }, + { + "epoch": 2.0516820401519262, + "grad_norm": 3.3208303451538086, + "learning_rate": 5.539125668082774e-06, + "loss": 0.0804, + "step": 33302 + }, + { + "epoch": 2.051695604991861, + "grad_norm": 4.970296382904053, + "learning_rate": 5.538988625462519e-06, + "loss": 0.1263, + "step": 33303 + }, + { + "epoch": 2.051709169831796, + "grad_norm": 3.867257595062256, + "learning_rate": 5.538851582842264e-06, + "loss": 0.1063, + "step": 33304 + }, + { + "epoch": 2.051722734671731, + "grad_norm": 4.018773555755615, + "learning_rate": 5.53871454022201e-06, + "loss": 0.1358, + "step": 33305 + }, + { + "epoch": 2.0517362995116657, + "grad_norm": 4.972194194793701, + "learning_rate": 5.538577497601755e-06, + "loss": 0.1918, + "step": 33306 + }, + { + "epoch": 2.0517498643516006, + "grad_norm": 4.618951797485352, + "learning_rate": 5.538440454981499e-06, + "loss": 0.2713, + "step": 33307 + }, + { + "epoch": 2.0517634291915354, + "grad_norm": 6.812234401702881, + "learning_rate": 5.538303412361244e-06, + "loss": 0.2247, + "step": 33308 + }, + { + "epoch": 2.0517769940314703, + "grad_norm": 5.489130020141602, + "learning_rate": 5.53816636974099e-06, + "loss": 0.2033, + "step": 33309 + }, + { + "epoch": 2.051790558871405, + "grad_norm": 4.966981410980225, + "learning_rate": 5.538029327120736e-06, + "loss": 0.1591, + "step": 33310 + }, + { + "epoch": 2.05180412371134, + "grad_norm": 5.041135311126709, + "learning_rate": 5.53789228450048e-06, + "loss": 0.1812, + "step": 33311 + }, + { + "epoch": 2.051817688551275, + "grad_norm": 5.783110618591309, + "learning_rate": 5.537755241880225e-06, + "loss": 0.1896, + "step": 33312 + }, + { + "epoch": 2.05183125339121, + "grad_norm": 5.79330587387085, + "learning_rate": 5.5376181992599694e-06, + "loss": 0.2199, + "step": 33313 + }, + { + "epoch": 2.051844818231145, + "grad_norm": 5.208004474639893, + "learning_rate": 5.5374811566397155e-06, + "loss": 0.2355, + "step": 33314 + }, + { + "epoch": 2.05185838307108, + "grad_norm": 3.399092674255371, + "learning_rate": 5.537344114019461e-06, + "loss": 0.1495, + "step": 33315 + }, + { + "epoch": 2.051871947911015, + "grad_norm": 5.6616339683532715, + "learning_rate": 5.537207071399206e-06, + "loss": 0.1962, + "step": 33316 + }, + { + "epoch": 2.0518855127509497, + "grad_norm": 4.742101192474365, + "learning_rate": 5.53707002877895e-06, + "loss": 0.2422, + "step": 33317 + }, + { + "epoch": 2.0518990775908845, + "grad_norm": 3.617354154586792, + "learning_rate": 5.536932986158696e-06, + "loss": 0.1017, + "step": 33318 + }, + { + "epoch": 2.0519126424308194, + "grad_norm": 4.954315185546875, + "learning_rate": 5.536795943538441e-06, + "loss": 0.1359, + "step": 33319 + }, + { + "epoch": 2.0519262072707543, + "grad_norm": 4.976500034332275, + "learning_rate": 5.536658900918186e-06, + "loss": 0.1401, + "step": 33320 + }, + { + "epoch": 2.051939772110689, + "grad_norm": 4.10707950592041, + "learning_rate": 5.536521858297931e-06, + "loss": 0.2529, + "step": 33321 + }, + { + "epoch": 2.051953336950624, + "grad_norm": 4.079710483551025, + "learning_rate": 5.536384815677677e-06, + "loss": 0.1708, + "step": 33322 + }, + { + "epoch": 2.051966901790559, + "grad_norm": 6.288585186004639, + "learning_rate": 5.536247773057421e-06, + "loss": 0.2601, + "step": 33323 + }, + { + "epoch": 2.0519804666304937, + "grad_norm": 4.977166652679443, + "learning_rate": 5.536110730437166e-06, + "loss": 0.1904, + "step": 33324 + }, + { + "epoch": 2.0519940314704286, + "grad_norm": 4.964511394500732, + "learning_rate": 5.5359736878169116e-06, + "loss": 0.1699, + "step": 33325 + }, + { + "epoch": 2.0520075963103634, + "grad_norm": 6.956914901733398, + "learning_rate": 5.535836645196656e-06, + "loss": 0.3544, + "step": 33326 + }, + { + "epoch": 2.0520211611502983, + "grad_norm": 4.329329490661621, + "learning_rate": 5.535699602576402e-06, + "loss": 0.1801, + "step": 33327 + }, + { + "epoch": 2.052034725990233, + "grad_norm": 4.008821964263916, + "learning_rate": 5.535562559956147e-06, + "loss": 0.1506, + "step": 33328 + }, + { + "epoch": 2.052048290830168, + "grad_norm": 5.269648551940918, + "learning_rate": 5.5354255173358914e-06, + "loss": 0.2048, + "step": 33329 + }, + { + "epoch": 2.052061855670103, + "grad_norm": 5.632940292358398, + "learning_rate": 5.535288474715637e-06, + "loss": 0.2596, + "step": 33330 + }, + { + "epoch": 2.0520754205100378, + "grad_norm": 5.2613844871521, + "learning_rate": 5.535151432095383e-06, + "loss": 0.1951, + "step": 33331 + }, + { + "epoch": 2.052088985349973, + "grad_norm": 4.535618305206299, + "learning_rate": 5.535014389475127e-06, + "loss": 0.1375, + "step": 33332 + }, + { + "epoch": 2.052102550189908, + "grad_norm": 4.411693572998047, + "learning_rate": 5.534877346854872e-06, + "loss": 0.1254, + "step": 33333 + }, + { + "epoch": 2.052116115029843, + "grad_norm": 5.108825206756592, + "learning_rate": 5.534740304234617e-06, + "loss": 0.1781, + "step": 33334 + }, + { + "epoch": 2.0521296798697777, + "grad_norm": 4.661789894104004, + "learning_rate": 5.534603261614363e-06, + "loss": 0.1552, + "step": 33335 + }, + { + "epoch": 2.0521432447097125, + "grad_norm": 4.792236804962158, + "learning_rate": 5.534466218994108e-06, + "loss": 0.1337, + "step": 33336 + }, + { + "epoch": 2.0521568095496474, + "grad_norm": 3.807481527328491, + "learning_rate": 5.534329176373853e-06, + "loss": 0.1448, + "step": 33337 + }, + { + "epoch": 2.0521703743895823, + "grad_norm": 5.541726112365723, + "learning_rate": 5.534192133753597e-06, + "loss": 0.2146, + "step": 33338 + }, + { + "epoch": 2.052183939229517, + "grad_norm": 5.359046459197998, + "learning_rate": 5.534055091133342e-06, + "loss": 0.2709, + "step": 33339 + }, + { + "epoch": 2.052197504069452, + "grad_norm": 5.5069661140441895, + "learning_rate": 5.533918048513088e-06, + "loss": 0.2236, + "step": 33340 + }, + { + "epoch": 2.052211068909387, + "grad_norm": 5.741988658905029, + "learning_rate": 5.533781005892834e-06, + "loss": 0.207, + "step": 33341 + }, + { + "epoch": 2.0522246337493217, + "grad_norm": 5.6948161125183105, + "learning_rate": 5.533643963272578e-06, + "loss": 0.1857, + "step": 33342 + }, + { + "epoch": 2.0522381985892566, + "grad_norm": 5.147307872772217, + "learning_rate": 5.533506920652323e-06, + "loss": 0.1972, + "step": 33343 + }, + { + "epoch": 2.0522517634291915, + "grad_norm": 5.149298191070557, + "learning_rate": 5.533369878032069e-06, + "loss": 0.1303, + "step": 33344 + }, + { + "epoch": 2.0522653282691263, + "grad_norm": 3.9263668060302734, + "learning_rate": 5.5332328354118135e-06, + "loss": 0.1707, + "step": 33345 + }, + { + "epoch": 2.052278893109061, + "grad_norm": 4.767831325531006, + "learning_rate": 5.533095792791559e-06, + "loss": 0.1978, + "step": 33346 + }, + { + "epoch": 2.052292457948996, + "grad_norm": 5.904316425323486, + "learning_rate": 5.532958750171303e-06, + "loss": 0.313, + "step": 33347 + }, + { + "epoch": 2.052306022788931, + "grad_norm": 4.97991418838501, + "learning_rate": 5.532821707551049e-06, + "loss": 0.2646, + "step": 33348 + }, + { + "epoch": 2.052319587628866, + "grad_norm": 4.851743698120117, + "learning_rate": 5.532684664930794e-06, + "loss": 0.2004, + "step": 33349 + }, + { + "epoch": 2.0523331524688007, + "grad_norm": 3.6461551189422607, + "learning_rate": 5.532547622310539e-06, + "loss": 0.1894, + "step": 33350 + }, + { + "epoch": 2.052346717308736, + "grad_norm": 3.6148300170898438, + "learning_rate": 5.532410579690284e-06, + "loss": 0.1092, + "step": 33351 + }, + { + "epoch": 2.052360282148671, + "grad_norm": 5.895342826843262, + "learning_rate": 5.532273537070029e-06, + "loss": 0.3295, + "step": 33352 + }, + { + "epoch": 2.0523738469886057, + "grad_norm": 6.263014793395996, + "learning_rate": 5.532136494449775e-06, + "loss": 0.269, + "step": 33353 + }, + { + "epoch": 2.0523874118285406, + "grad_norm": 5.632965087890625, + "learning_rate": 5.531999451829519e-06, + "loss": 0.1468, + "step": 33354 + }, + { + "epoch": 2.0524009766684754, + "grad_norm": 5.446651935577393, + "learning_rate": 5.531862409209264e-06, + "loss": 0.2403, + "step": 33355 + }, + { + "epoch": 2.0524145415084103, + "grad_norm": 5.123668193817139, + "learning_rate": 5.53172536658901e-06, + "loss": 0.1859, + "step": 33356 + }, + { + "epoch": 2.052428106348345, + "grad_norm": 3.9975521564483643, + "learning_rate": 5.531588323968755e-06, + "loss": 0.2398, + "step": 33357 + }, + { + "epoch": 2.05244167118828, + "grad_norm": 4.793484210968018, + "learning_rate": 5.5314512813485e-06, + "loss": 0.1845, + "step": 33358 + }, + { + "epoch": 2.052455236028215, + "grad_norm": 5.333451747894287, + "learning_rate": 5.531314238728245e-06, + "loss": 0.2847, + "step": 33359 + }, + { + "epoch": 2.0524688008681498, + "grad_norm": 5.052577018737793, + "learning_rate": 5.5311771961079894e-06, + "loss": 0.2327, + "step": 33360 + }, + { + "epoch": 2.0524823657080846, + "grad_norm": 4.364218711853027, + "learning_rate": 5.5310401534877355e-06, + "loss": 0.1437, + "step": 33361 + }, + { + "epoch": 2.0524959305480195, + "grad_norm": 5.365538597106934, + "learning_rate": 5.530903110867481e-06, + "loss": 0.2033, + "step": 33362 + }, + { + "epoch": 2.0525094953879544, + "grad_norm": 6.083786964416504, + "learning_rate": 5.530766068247225e-06, + "loss": 0.1873, + "step": 33363 + }, + { + "epoch": 2.052523060227889, + "grad_norm": 6.667262554168701, + "learning_rate": 5.53062902562697e-06, + "loss": 0.3373, + "step": 33364 + }, + { + "epoch": 2.052536625067824, + "grad_norm": 4.31280517578125, + "learning_rate": 5.530491983006716e-06, + "loss": 0.1496, + "step": 33365 + }, + { + "epoch": 2.052550189907759, + "grad_norm": 5.610807418823242, + "learning_rate": 5.5303549403864605e-06, + "loss": 0.2281, + "step": 33366 + }, + { + "epoch": 2.052563754747694, + "grad_norm": 4.866436958312988, + "learning_rate": 5.530217897766206e-06, + "loss": 0.1332, + "step": 33367 + }, + { + "epoch": 2.0525773195876287, + "grad_norm": 6.4173407554626465, + "learning_rate": 5.530080855145951e-06, + "loss": 0.2533, + "step": 33368 + }, + { + "epoch": 2.0525908844275635, + "grad_norm": 3.1247522830963135, + "learning_rate": 5.529943812525695e-06, + "loss": 0.085, + "step": 33369 + }, + { + "epoch": 2.052604449267499, + "grad_norm": 5.619094371795654, + "learning_rate": 5.529806769905441e-06, + "loss": 0.2104, + "step": 33370 + }, + { + "epoch": 2.0526180141074337, + "grad_norm": 8.361300468444824, + "learning_rate": 5.529669727285186e-06, + "loss": 0.2521, + "step": 33371 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 4.899557113647461, + "learning_rate": 5.529532684664931e-06, + "loss": 0.1719, + "step": 33372 + }, + { + "epoch": 2.0526451437873035, + "grad_norm": 5.729744911193848, + "learning_rate": 5.529395642044676e-06, + "loss": 0.284, + "step": 33373 + }, + { + "epoch": 2.0526587086272383, + "grad_norm": 3.8272719383239746, + "learning_rate": 5.529258599424422e-06, + "loss": 0.1664, + "step": 33374 + }, + { + "epoch": 2.052672273467173, + "grad_norm": 5.608619213104248, + "learning_rate": 5.529121556804167e-06, + "loss": 0.2347, + "step": 33375 + }, + { + "epoch": 2.052685838307108, + "grad_norm": 4.444719314575195, + "learning_rate": 5.5289845141839115e-06, + "loss": 0.2202, + "step": 33376 + }, + { + "epoch": 2.052699403147043, + "grad_norm": 5.621547698974609, + "learning_rate": 5.528847471563657e-06, + "loss": 0.2656, + "step": 33377 + }, + { + "epoch": 2.0527129679869778, + "grad_norm": 4.248170852661133, + "learning_rate": 5.528710428943403e-06, + "loss": 0.1977, + "step": 33378 + }, + { + "epoch": 2.0527265328269126, + "grad_norm": 5.766707897186279, + "learning_rate": 5.528573386323147e-06, + "loss": 0.2853, + "step": 33379 + }, + { + "epoch": 2.0527400976668475, + "grad_norm": 3.9579808712005615, + "learning_rate": 5.528436343702892e-06, + "loss": 0.1054, + "step": 33380 + }, + { + "epoch": 2.0527536625067824, + "grad_norm": 3.774909734725952, + "learning_rate": 5.5282993010826365e-06, + "loss": 0.2137, + "step": 33381 + }, + { + "epoch": 2.0527672273467172, + "grad_norm": 4.533005237579346, + "learning_rate": 5.528162258462382e-06, + "loss": 0.1393, + "step": 33382 + }, + { + "epoch": 2.052780792186652, + "grad_norm": 5.147291660308838, + "learning_rate": 5.528025215842128e-06, + "loss": 0.2521, + "step": 33383 + }, + { + "epoch": 2.052794357026587, + "grad_norm": 4.272508144378662, + "learning_rate": 5.527888173221873e-06, + "loss": 0.1789, + "step": 33384 + }, + { + "epoch": 2.052807921866522, + "grad_norm": 4.418466567993164, + "learning_rate": 5.527751130601617e-06, + "loss": 0.2145, + "step": 33385 + }, + { + "epoch": 2.0528214867064567, + "grad_norm": 4.76779317855835, + "learning_rate": 5.527614087981362e-06, + "loss": 0.1539, + "step": 33386 + }, + { + "epoch": 2.0528350515463916, + "grad_norm": 4.335226535797119, + "learning_rate": 5.5274770453611084e-06, + "loss": 0.1543, + "step": 33387 + }, + { + "epoch": 2.0528486163863264, + "grad_norm": 5.087762832641602, + "learning_rate": 5.527340002740853e-06, + "loss": 0.1796, + "step": 33388 + }, + { + "epoch": 2.0528621812262617, + "grad_norm": 5.232700824737549, + "learning_rate": 5.527202960120598e-06, + "loss": 0.213, + "step": 33389 + }, + { + "epoch": 2.0528757460661966, + "grad_norm": 5.174870014190674, + "learning_rate": 5.527065917500343e-06, + "loss": 0.2235, + "step": 33390 + }, + { + "epoch": 2.0528893109061315, + "grad_norm": 5.4483160972595215, + "learning_rate": 5.526928874880088e-06, + "loss": 0.2322, + "step": 33391 + }, + { + "epoch": 2.0529028757460663, + "grad_norm": 7.211057662963867, + "learning_rate": 5.5267918322598335e-06, + "loss": 0.2637, + "step": 33392 + }, + { + "epoch": 2.052916440586001, + "grad_norm": 5.936369895935059, + "learning_rate": 5.526654789639579e-06, + "loss": 0.2486, + "step": 33393 + }, + { + "epoch": 2.052930005425936, + "grad_norm": 5.718989372253418, + "learning_rate": 5.526517747019323e-06, + "loss": 0.2784, + "step": 33394 + }, + { + "epoch": 2.052943570265871, + "grad_norm": 6.077357292175293, + "learning_rate": 5.526380704399068e-06, + "loss": 0.1755, + "step": 33395 + }, + { + "epoch": 2.052957135105806, + "grad_norm": 4.557773590087891, + "learning_rate": 5.526243661778814e-06, + "loss": 0.1512, + "step": 33396 + }, + { + "epoch": 2.0529706999457407, + "grad_norm": 5.255412578582764, + "learning_rate": 5.5261066191585585e-06, + "loss": 0.2902, + "step": 33397 + }, + { + "epoch": 2.0529842647856755, + "grad_norm": 5.829819202423096, + "learning_rate": 5.525969576538304e-06, + "loss": 0.28, + "step": 33398 + }, + { + "epoch": 2.0529978296256104, + "grad_norm": 6.398519992828369, + "learning_rate": 5.525832533918049e-06, + "loss": 0.246, + "step": 33399 + }, + { + "epoch": 2.0530113944655453, + "grad_norm": 5.932423114776611, + "learning_rate": 5.525695491297794e-06, + "loss": 0.2544, + "step": 33400 + }, + { + "epoch": 2.05302495930548, + "grad_norm": 4.601344108581543, + "learning_rate": 5.525558448677539e-06, + "loss": 0.2456, + "step": 33401 + }, + { + "epoch": 2.053038524145415, + "grad_norm": 4.566141605377197, + "learning_rate": 5.525421406057284e-06, + "loss": 0.1555, + "step": 33402 + }, + { + "epoch": 2.05305208898535, + "grad_norm": 4.962438106536865, + "learning_rate": 5.525284363437029e-06, + "loss": 0.2335, + "step": 33403 + }, + { + "epoch": 2.0530656538252847, + "grad_norm": 4.834848880767822, + "learning_rate": 5.525147320816775e-06, + "loss": 0.163, + "step": 33404 + }, + { + "epoch": 2.0530792186652196, + "grad_norm": 6.0189008712768555, + "learning_rate": 5.52501027819652e-06, + "loss": 0.2442, + "step": 33405 + }, + { + "epoch": 2.0530927835051545, + "grad_norm": 5.333268642425537, + "learning_rate": 5.524873235576264e-06, + "loss": 0.3281, + "step": 33406 + }, + { + "epoch": 2.0531063483450893, + "grad_norm": 5.131180763244629, + "learning_rate": 5.5247361929560095e-06, + "loss": 0.1916, + "step": 33407 + }, + { + "epoch": 2.0531199131850246, + "grad_norm": 5.929292678833008, + "learning_rate": 5.524599150335755e-06, + "loss": 0.2379, + "step": 33408 + }, + { + "epoch": 2.0531334780249595, + "grad_norm": 4.736342430114746, + "learning_rate": 5.524462107715501e-06, + "loss": 0.2137, + "step": 33409 + }, + { + "epoch": 2.0531470428648944, + "grad_norm": 4.1488823890686035, + "learning_rate": 5.524325065095245e-06, + "loss": 0.1294, + "step": 33410 + }, + { + "epoch": 2.053160607704829, + "grad_norm": 5.966083526611328, + "learning_rate": 5.52418802247499e-06, + "loss": 0.2253, + "step": 33411 + }, + { + "epoch": 2.053174172544764, + "grad_norm": 6.1801981925964355, + "learning_rate": 5.5240509798547345e-06, + "loss": 0.3156, + "step": 33412 + }, + { + "epoch": 2.053187737384699, + "grad_norm": 5.650005340576172, + "learning_rate": 5.5239139372344805e-06, + "loss": 0.2267, + "step": 33413 + }, + { + "epoch": 2.053201302224634, + "grad_norm": 3.945850372314453, + "learning_rate": 5.523776894614226e-06, + "loss": 0.1561, + "step": 33414 + }, + { + "epoch": 2.0532148670645687, + "grad_norm": 5.176631927490234, + "learning_rate": 5.52363985199397e-06, + "loss": 0.2409, + "step": 33415 + }, + { + "epoch": 2.0532284319045035, + "grad_norm": 6.117358207702637, + "learning_rate": 5.523502809373715e-06, + "loss": 0.3, + "step": 33416 + }, + { + "epoch": 2.0532419967444384, + "grad_norm": 5.828317165374756, + "learning_rate": 5.523365766753461e-06, + "loss": 0.1788, + "step": 33417 + }, + { + "epoch": 2.0532555615843733, + "grad_norm": 4.943636894226074, + "learning_rate": 5.5232287241332064e-06, + "loss": 0.1858, + "step": 33418 + }, + { + "epoch": 2.053269126424308, + "grad_norm": 4.019440174102783, + "learning_rate": 5.523091681512951e-06, + "loss": 0.2232, + "step": 33419 + }, + { + "epoch": 2.053282691264243, + "grad_norm": 5.3923845291137695, + "learning_rate": 5.522954638892696e-06, + "loss": 0.3052, + "step": 33420 + }, + { + "epoch": 2.053296256104178, + "grad_norm": 4.85563850402832, + "learning_rate": 5.52281759627244e-06, + "loss": 0.1396, + "step": 33421 + }, + { + "epoch": 2.0533098209441127, + "grad_norm": 4.8881731033325195, + "learning_rate": 5.522680553652186e-06, + "loss": 0.181, + "step": 33422 + }, + { + "epoch": 2.0533233857840476, + "grad_norm": 5.3464274406433105, + "learning_rate": 5.5225435110319315e-06, + "loss": 0.1994, + "step": 33423 + }, + { + "epoch": 2.0533369506239825, + "grad_norm": 4.567008018493652, + "learning_rate": 5.522406468411677e-06, + "loss": 0.2378, + "step": 33424 + }, + { + "epoch": 2.0533505154639173, + "grad_norm": 3.728572368621826, + "learning_rate": 5.522269425791421e-06, + "loss": 0.1431, + "step": 33425 + }, + { + "epoch": 2.053364080303852, + "grad_norm": 5.753464221954346, + "learning_rate": 5.522132383171167e-06, + "loss": 0.357, + "step": 33426 + }, + { + "epoch": 2.0533776451437875, + "grad_norm": 5.589348793029785, + "learning_rate": 5.521995340550912e-06, + "loss": 0.1885, + "step": 33427 + }, + { + "epoch": 2.0533912099837224, + "grad_norm": 5.633077621459961, + "learning_rate": 5.5218582979306565e-06, + "loss": 0.3825, + "step": 33428 + }, + { + "epoch": 2.0534047748236572, + "grad_norm": 4.866176128387451, + "learning_rate": 5.521721255310402e-06, + "loss": 0.1748, + "step": 33429 + }, + { + "epoch": 2.053418339663592, + "grad_norm": 4.663951873779297, + "learning_rate": 5.521584212690148e-06, + "loss": 0.2243, + "step": 33430 + }, + { + "epoch": 2.053431904503527, + "grad_norm": 4.283304691314697, + "learning_rate": 5.521447170069892e-06, + "loss": 0.206, + "step": 33431 + }, + { + "epoch": 2.053445469343462, + "grad_norm": 6.195065975189209, + "learning_rate": 5.521310127449637e-06, + "loss": 0.2638, + "step": 33432 + }, + { + "epoch": 2.0534590341833967, + "grad_norm": 5.383176326751709, + "learning_rate": 5.5211730848293824e-06, + "loss": 0.2478, + "step": 33433 + }, + { + "epoch": 2.0534725990233316, + "grad_norm": 3.7081894874572754, + "learning_rate": 5.5210360422091285e-06, + "loss": 0.1381, + "step": 33434 + }, + { + "epoch": 2.0534861638632664, + "grad_norm": 3.8488426208496094, + "learning_rate": 5.520898999588873e-06, + "loss": 0.2025, + "step": 33435 + }, + { + "epoch": 2.0534997287032013, + "grad_norm": 5.657495975494385, + "learning_rate": 5.520761956968618e-06, + "loss": 0.2883, + "step": 33436 + }, + { + "epoch": 2.053513293543136, + "grad_norm": 4.795625686645508, + "learning_rate": 5.520624914348362e-06, + "loss": 0.1801, + "step": 33437 + }, + { + "epoch": 2.053526858383071, + "grad_norm": 4.942617893218994, + "learning_rate": 5.5204878717281075e-06, + "loss": 0.147, + "step": 33438 + }, + { + "epoch": 2.053540423223006, + "grad_norm": 4.538021087646484, + "learning_rate": 5.5203508291078535e-06, + "loss": 0.1818, + "step": 33439 + }, + { + "epoch": 2.0535539880629408, + "grad_norm": 4.25412654876709, + "learning_rate": 5.520213786487598e-06, + "loss": 0.1569, + "step": 33440 + }, + { + "epoch": 2.0535675529028756, + "grad_norm": 3.517789125442505, + "learning_rate": 5.520076743867343e-06, + "loss": 0.1589, + "step": 33441 + }, + { + "epoch": 2.0535811177428105, + "grad_norm": 4.025458812713623, + "learning_rate": 5.519939701247088e-06, + "loss": 0.2379, + "step": 33442 + }, + { + "epoch": 2.0535946825827454, + "grad_norm": 3.2711899280548096, + "learning_rate": 5.519802658626834e-06, + "loss": 0.1397, + "step": 33443 + }, + { + "epoch": 2.05360824742268, + "grad_norm": 5.49528169631958, + "learning_rate": 5.5196656160065785e-06, + "loss": 0.1783, + "step": 33444 + }, + { + "epoch": 2.053621812262615, + "grad_norm": 4.478174209594727, + "learning_rate": 5.519528573386324e-06, + "loss": 0.1651, + "step": 33445 + }, + { + "epoch": 2.0536353771025504, + "grad_norm": 3.29879093170166, + "learning_rate": 5.519391530766068e-06, + "loss": 0.1099, + "step": 33446 + }, + { + "epoch": 2.0536489419424853, + "grad_norm": 4.7621541023254395, + "learning_rate": 5.519254488145814e-06, + "loss": 0.18, + "step": 33447 + }, + { + "epoch": 2.05366250678242, + "grad_norm": 3.997861385345459, + "learning_rate": 5.519117445525559e-06, + "loss": 0.1227, + "step": 33448 + }, + { + "epoch": 2.053676071622355, + "grad_norm": 4.732400894165039, + "learning_rate": 5.518980402905304e-06, + "loss": 0.2011, + "step": 33449 + }, + { + "epoch": 2.05368963646229, + "grad_norm": 4.274107456207275, + "learning_rate": 5.518843360285049e-06, + "loss": 0.1694, + "step": 33450 + }, + { + "epoch": 2.0537032013022247, + "grad_norm": 4.678411483764648, + "learning_rate": 5.518706317664794e-06, + "loss": 0.2012, + "step": 33451 + }, + { + "epoch": 2.0537167661421596, + "grad_norm": 3.8211734294891357, + "learning_rate": 5.51856927504454e-06, + "loss": 0.2079, + "step": 33452 + }, + { + "epoch": 2.0537303309820945, + "grad_norm": 7.403665542602539, + "learning_rate": 5.518432232424284e-06, + "loss": 0.2023, + "step": 33453 + }, + { + "epoch": 2.0537438958220293, + "grad_norm": 7.410224914550781, + "learning_rate": 5.5182951898040295e-06, + "loss": 0.2857, + "step": 33454 + }, + { + "epoch": 2.053757460661964, + "grad_norm": 6.227719306945801, + "learning_rate": 5.518158147183774e-06, + "loss": 0.2104, + "step": 33455 + }, + { + "epoch": 2.053771025501899, + "grad_norm": 3.807023286819458, + "learning_rate": 5.51802110456352e-06, + "loss": 0.2006, + "step": 33456 + }, + { + "epoch": 2.053784590341834, + "grad_norm": 5.386016368865967, + "learning_rate": 5.517884061943265e-06, + "loss": 0.2223, + "step": 33457 + }, + { + "epoch": 2.053798155181769, + "grad_norm": 4.218685626983643, + "learning_rate": 5.51774701932301e-06, + "loss": 0.2147, + "step": 33458 + }, + { + "epoch": 2.0538117200217036, + "grad_norm": 4.553362846374512, + "learning_rate": 5.5176099767027545e-06, + "loss": 0.2108, + "step": 33459 + }, + { + "epoch": 2.0538252848616385, + "grad_norm": 4.631407260894775, + "learning_rate": 5.5174729340825006e-06, + "loss": 0.1576, + "step": 33460 + }, + { + "epoch": 2.0538388497015734, + "grad_norm": 6.137426853179932, + "learning_rate": 5.517335891462246e-06, + "loss": 0.251, + "step": 33461 + }, + { + "epoch": 2.0538524145415082, + "grad_norm": 4.455629348754883, + "learning_rate": 5.51719884884199e-06, + "loss": 0.2135, + "step": 33462 + }, + { + "epoch": 2.053865979381443, + "grad_norm": 4.454300403594971, + "learning_rate": 5.517061806221735e-06, + "loss": 0.1677, + "step": 33463 + }, + { + "epoch": 2.053879544221378, + "grad_norm": 6.14044713973999, + "learning_rate": 5.51692476360148e-06, + "loss": 0.3127, + "step": 33464 + }, + { + "epoch": 2.0538931090613133, + "grad_norm": 6.587625026702881, + "learning_rate": 5.516787720981226e-06, + "loss": 0.2066, + "step": 33465 + }, + { + "epoch": 2.053906673901248, + "grad_norm": 5.164535045623779, + "learning_rate": 5.516650678360971e-06, + "loss": 0.1655, + "step": 33466 + }, + { + "epoch": 2.053920238741183, + "grad_norm": 3.747483253479004, + "learning_rate": 5.516513635740716e-06, + "loss": 0.1729, + "step": 33467 + }, + { + "epoch": 2.053933803581118, + "grad_norm": 4.499764919281006, + "learning_rate": 5.51637659312046e-06, + "loss": 0.1312, + "step": 33468 + }, + { + "epoch": 2.0539473684210527, + "grad_norm": 6.033027172088623, + "learning_rate": 5.516239550500206e-06, + "loss": 0.2418, + "step": 33469 + }, + { + "epoch": 2.0539609332609876, + "grad_norm": 5.424554824829102, + "learning_rate": 5.5161025078799515e-06, + "loss": 0.1501, + "step": 33470 + }, + { + "epoch": 2.0539744981009225, + "grad_norm": 9.764275550842285, + "learning_rate": 5.515965465259696e-06, + "loss": 0.2666, + "step": 33471 + }, + { + "epoch": 2.0539880629408573, + "grad_norm": 5.484443664550781, + "learning_rate": 5.515828422639441e-06, + "loss": 0.2006, + "step": 33472 + }, + { + "epoch": 2.054001627780792, + "grad_norm": 5.962772846221924, + "learning_rate": 5.515691380019187e-06, + "loss": 0.2979, + "step": 33473 + }, + { + "epoch": 2.054015192620727, + "grad_norm": 5.106655597686768, + "learning_rate": 5.515554337398931e-06, + "loss": 0.1598, + "step": 33474 + }, + { + "epoch": 2.054028757460662, + "grad_norm": 5.612268447875977, + "learning_rate": 5.5154172947786766e-06, + "loss": 0.2269, + "step": 33475 + }, + { + "epoch": 2.054042322300597, + "grad_norm": 4.553926944732666, + "learning_rate": 5.515280252158422e-06, + "loss": 0.1263, + "step": 33476 + }, + { + "epoch": 2.0540558871405317, + "grad_norm": 6.148923397064209, + "learning_rate": 5.515143209538166e-06, + "loss": 0.2006, + "step": 33477 + }, + { + "epoch": 2.0540694519804665, + "grad_norm": 5.507701873779297, + "learning_rate": 5.515006166917912e-06, + "loss": 0.2119, + "step": 33478 + }, + { + "epoch": 2.0540830168204014, + "grad_norm": 4.685178756713867, + "learning_rate": 5.514869124297657e-06, + "loss": 0.1744, + "step": 33479 + }, + { + "epoch": 2.0540965816603363, + "grad_norm": 5.4740095138549805, + "learning_rate": 5.514732081677402e-06, + "loss": 0.1874, + "step": 33480 + }, + { + "epoch": 2.054110146500271, + "grad_norm": 3.8643627166748047, + "learning_rate": 5.514595039057147e-06, + "loss": 0.1623, + "step": 33481 + }, + { + "epoch": 2.054123711340206, + "grad_norm": 3.3863332271575928, + "learning_rate": 5.514457996436893e-06, + "loss": 0.1494, + "step": 33482 + }, + { + "epoch": 2.054137276180141, + "grad_norm": 5.101263999938965, + "learning_rate": 5.514320953816638e-06, + "loss": 0.1991, + "step": 33483 + }, + { + "epoch": 2.054150841020076, + "grad_norm": 3.874163866043091, + "learning_rate": 5.514183911196382e-06, + "loss": 0.2044, + "step": 33484 + }, + { + "epoch": 2.054164405860011, + "grad_norm": 4.557651519775391, + "learning_rate": 5.5140468685761275e-06, + "loss": 0.1421, + "step": 33485 + }, + { + "epoch": 2.054177970699946, + "grad_norm": 7.050649166107178, + "learning_rate": 5.5139098259558735e-06, + "loss": 0.2062, + "step": 33486 + }, + { + "epoch": 2.0541915355398808, + "grad_norm": 3.702029228210449, + "learning_rate": 5.513772783335618e-06, + "loss": 0.1619, + "step": 33487 + }, + { + "epoch": 2.0542051003798156, + "grad_norm": 5.922512531280518, + "learning_rate": 5.513635740715363e-06, + "loss": 0.2268, + "step": 33488 + }, + { + "epoch": 2.0542186652197505, + "grad_norm": 4.318526268005371, + "learning_rate": 5.513498698095107e-06, + "loss": 0.1489, + "step": 33489 + }, + { + "epoch": 2.0542322300596854, + "grad_norm": 9.133536338806152, + "learning_rate": 5.5133616554748525e-06, + "loss": 0.1593, + "step": 33490 + }, + { + "epoch": 2.0542457948996202, + "grad_norm": 6.300415515899658, + "learning_rate": 5.5132246128545986e-06, + "loss": 0.2203, + "step": 33491 + }, + { + "epoch": 2.054259359739555, + "grad_norm": 7.715260982513428, + "learning_rate": 5.513087570234344e-06, + "loss": 0.3635, + "step": 33492 + }, + { + "epoch": 2.05427292457949, + "grad_norm": 3.8826045989990234, + "learning_rate": 5.512950527614088e-06, + "loss": 0.1578, + "step": 33493 + }, + { + "epoch": 2.054286489419425, + "grad_norm": 5.77102518081665, + "learning_rate": 5.512813484993833e-06, + "loss": 0.1952, + "step": 33494 + }, + { + "epoch": 2.0543000542593597, + "grad_norm": 4.460284233093262, + "learning_rate": 5.512676442373579e-06, + "loss": 0.1499, + "step": 33495 + }, + { + "epoch": 2.0543136190992946, + "grad_norm": 6.007505416870117, + "learning_rate": 5.512539399753324e-06, + "loss": 0.2548, + "step": 33496 + }, + { + "epoch": 2.0543271839392294, + "grad_norm": 5.617042541503906, + "learning_rate": 5.512402357133069e-06, + "loss": 0.1316, + "step": 33497 + }, + { + "epoch": 2.0543407487791643, + "grad_norm": 5.389925003051758, + "learning_rate": 5.512265314512814e-06, + "loss": 0.1922, + "step": 33498 + }, + { + "epoch": 2.054354313619099, + "grad_norm": 3.819502592086792, + "learning_rate": 5.512128271892559e-06, + "loss": 0.1603, + "step": 33499 + }, + { + "epoch": 2.054367878459034, + "grad_norm": 3.1344590187072754, + "learning_rate": 5.511991229272304e-06, + "loss": 0.0859, + "step": 33500 + }, + { + "epoch": 2.054381443298969, + "grad_norm": 6.50056266784668, + "learning_rate": 5.5118541866520495e-06, + "loss": 0.179, + "step": 33501 + }, + { + "epoch": 2.0543950081389037, + "grad_norm": 5.0982584953308105, + "learning_rate": 5.511717144031794e-06, + "loss": 0.2187, + "step": 33502 + }, + { + "epoch": 2.054408572978839, + "grad_norm": 4.194479465484619, + "learning_rate": 5.51158010141154e-06, + "loss": 0.133, + "step": 33503 + }, + { + "epoch": 2.054422137818774, + "grad_norm": 4.879585266113281, + "learning_rate": 5.511443058791285e-06, + "loss": 0.1865, + "step": 33504 + }, + { + "epoch": 2.054435702658709, + "grad_norm": 3.349639654159546, + "learning_rate": 5.511306016171029e-06, + "loss": 0.1365, + "step": 33505 + }, + { + "epoch": 2.0544492674986436, + "grad_norm": 4.486438751220703, + "learning_rate": 5.5111689735507746e-06, + "loss": 0.137, + "step": 33506 + }, + { + "epoch": 2.0544628323385785, + "grad_norm": 4.589223384857178, + "learning_rate": 5.51103193093052e-06, + "loss": 0.1632, + "step": 33507 + }, + { + "epoch": 2.0544763971785134, + "grad_norm": 5.624855995178223, + "learning_rate": 5.510894888310265e-06, + "loss": 0.1923, + "step": 33508 + }, + { + "epoch": 2.0544899620184482, + "grad_norm": 4.723420143127441, + "learning_rate": 5.51075784569001e-06, + "loss": 0.1475, + "step": 33509 + }, + { + "epoch": 2.054503526858383, + "grad_norm": 4.093896865844727, + "learning_rate": 5.510620803069755e-06, + "loss": 0.1558, + "step": 33510 + }, + { + "epoch": 2.054517091698318, + "grad_norm": 4.5695109367370605, + "learning_rate": 5.5104837604495e-06, + "loss": 0.1244, + "step": 33511 + }, + { + "epoch": 2.054530656538253, + "grad_norm": 4.083562850952148, + "learning_rate": 5.510346717829246e-06, + "loss": 0.1603, + "step": 33512 + }, + { + "epoch": 2.0545442213781877, + "grad_norm": 4.405839920043945, + "learning_rate": 5.510209675208991e-06, + "loss": 0.1553, + "step": 33513 + }, + { + "epoch": 2.0545577862181226, + "grad_norm": 4.7124176025390625, + "learning_rate": 5.510072632588735e-06, + "loss": 0.1645, + "step": 33514 + }, + { + "epoch": 2.0545713510580574, + "grad_norm": 3.1045191287994385, + "learning_rate": 5.50993558996848e-06, + "loss": 0.127, + "step": 33515 + }, + { + "epoch": 2.0545849158979923, + "grad_norm": 3.3259501457214355, + "learning_rate": 5.509798547348226e-06, + "loss": 0.1468, + "step": 33516 + }, + { + "epoch": 2.054598480737927, + "grad_norm": 3.153756618499756, + "learning_rate": 5.5096615047279715e-06, + "loss": 0.1401, + "step": 33517 + }, + { + "epoch": 2.054612045577862, + "grad_norm": 5.040887355804443, + "learning_rate": 5.509524462107716e-06, + "loss": 0.1741, + "step": 33518 + }, + { + "epoch": 2.054625610417797, + "grad_norm": 4.098245620727539, + "learning_rate": 5.509387419487461e-06, + "loss": 0.1366, + "step": 33519 + }, + { + "epoch": 2.0546391752577318, + "grad_norm": 3.966357707977295, + "learning_rate": 5.509250376867205e-06, + "loss": 0.1623, + "step": 33520 + }, + { + "epoch": 2.0546527400976666, + "grad_norm": 4.761327266693115, + "learning_rate": 5.509113334246951e-06, + "loss": 0.2227, + "step": 33521 + }, + { + "epoch": 2.054666304937602, + "grad_norm": 3.4294471740722656, + "learning_rate": 5.5089762916266966e-06, + "loss": 0.1797, + "step": 33522 + }, + { + "epoch": 2.054679869777537, + "grad_norm": 4.588423252105713, + "learning_rate": 5.508839249006441e-06, + "loss": 0.162, + "step": 33523 + }, + { + "epoch": 2.0546934346174717, + "grad_norm": 4.7456841468811035, + "learning_rate": 5.508702206386186e-06, + "loss": 0.1529, + "step": 33524 + }, + { + "epoch": 2.0547069994574065, + "grad_norm": 4.835416793823242, + "learning_rate": 5.508565163765932e-06, + "loss": 0.1832, + "step": 33525 + }, + { + "epoch": 2.0547205642973414, + "grad_norm": 3.5419540405273438, + "learning_rate": 5.508428121145677e-06, + "loss": 0.084, + "step": 33526 + }, + { + "epoch": 2.0547341291372763, + "grad_norm": 2.7940263748168945, + "learning_rate": 5.508291078525422e-06, + "loss": 0.1019, + "step": 33527 + }, + { + "epoch": 2.054747693977211, + "grad_norm": 2.6677663326263428, + "learning_rate": 5.508154035905167e-06, + "loss": 0.058, + "step": 33528 + }, + { + "epoch": 2.054761258817146, + "grad_norm": 4.350683689117432, + "learning_rate": 5.508016993284913e-06, + "loss": 0.1025, + "step": 33529 + }, + { + "epoch": 2.054774823657081, + "grad_norm": 3.3605597019195557, + "learning_rate": 5.507879950664657e-06, + "loss": 0.1075, + "step": 33530 + }, + { + "epoch": 2.0547883884970157, + "grad_norm": 3.2940940856933594, + "learning_rate": 5.507742908044402e-06, + "loss": 0.1251, + "step": 33531 + }, + { + "epoch": 2.0548019533369506, + "grad_norm": 3.949941396713257, + "learning_rate": 5.5076058654241475e-06, + "loss": 0.1595, + "step": 33532 + }, + { + "epoch": 2.0548155181768855, + "grad_norm": 5.06728982925415, + "learning_rate": 5.507468822803892e-06, + "loss": 0.2222, + "step": 33533 + }, + { + "epoch": 2.0548290830168203, + "grad_norm": 3.5369820594787598, + "learning_rate": 5.507331780183638e-06, + "loss": 0.091, + "step": 33534 + }, + { + "epoch": 2.054842647856755, + "grad_norm": 3.6550564765930176, + "learning_rate": 5.507194737563383e-06, + "loss": 0.1142, + "step": 33535 + }, + { + "epoch": 2.05485621269669, + "grad_norm": 4.674771785736084, + "learning_rate": 5.507057694943127e-06, + "loss": 0.1848, + "step": 33536 + }, + { + "epoch": 2.054869777536625, + "grad_norm": 3.6171188354492188, + "learning_rate": 5.5069206523228726e-06, + "loss": 0.1141, + "step": 33537 + }, + { + "epoch": 2.05488334237656, + "grad_norm": 4.845623016357422, + "learning_rate": 5.506783609702619e-06, + "loss": 0.2983, + "step": 33538 + }, + { + "epoch": 2.0548969072164947, + "grad_norm": 3.728046417236328, + "learning_rate": 5.506646567082363e-06, + "loss": 0.1565, + "step": 33539 + }, + { + "epoch": 2.0549104720564295, + "grad_norm": 4.315160274505615, + "learning_rate": 5.506509524462108e-06, + "loss": 0.1323, + "step": 33540 + }, + { + "epoch": 2.054924036896365, + "grad_norm": 4.32929801940918, + "learning_rate": 5.506372481841853e-06, + "loss": 0.1134, + "step": 33541 + }, + { + "epoch": 2.0549376017362997, + "grad_norm": 3.866760730743408, + "learning_rate": 5.5062354392215985e-06, + "loss": 0.1377, + "step": 33542 + }, + { + "epoch": 2.0549511665762346, + "grad_norm": 4.577235221862793, + "learning_rate": 5.506098396601344e-06, + "loss": 0.2033, + "step": 33543 + }, + { + "epoch": 2.0549647314161694, + "grad_norm": 5.89492654800415, + "learning_rate": 5.505961353981089e-06, + "loss": 0.2306, + "step": 33544 + }, + { + "epoch": 2.0549782962561043, + "grad_norm": 5.4703569412231445, + "learning_rate": 5.505824311360833e-06, + "loss": 0.169, + "step": 33545 + }, + { + "epoch": 2.054991861096039, + "grad_norm": 3.1730527877807617, + "learning_rate": 5.505687268740578e-06, + "loss": 0.1038, + "step": 33546 + }, + { + "epoch": 2.055005425935974, + "grad_norm": 4.3327765464782715, + "learning_rate": 5.505550226120324e-06, + "loss": 0.1306, + "step": 33547 + }, + { + "epoch": 2.055018990775909, + "grad_norm": 3.8115108013153076, + "learning_rate": 5.505413183500069e-06, + "loss": 0.0815, + "step": 33548 + }, + { + "epoch": 2.0550325556158437, + "grad_norm": 5.832677364349365, + "learning_rate": 5.505276140879814e-06, + "loss": 0.2049, + "step": 33549 + }, + { + "epoch": 2.0550461204557786, + "grad_norm": 5.021646976470947, + "learning_rate": 5.505139098259559e-06, + "loss": 0.1534, + "step": 33550 + }, + { + "epoch": 2.0550596852957135, + "grad_norm": 3.9635040760040283, + "learning_rate": 5.505002055639305e-06, + "loss": 0.1895, + "step": 33551 + }, + { + "epoch": 2.0550732501356483, + "grad_norm": 3.9256629943847656, + "learning_rate": 5.504865013019049e-06, + "loss": 0.1175, + "step": 33552 + }, + { + "epoch": 2.055086814975583, + "grad_norm": 5.223761081695557, + "learning_rate": 5.5047279703987946e-06, + "loss": 0.2241, + "step": 33553 + }, + { + "epoch": 2.055100379815518, + "grad_norm": 3.7742700576782227, + "learning_rate": 5.504590927778539e-06, + "loss": 0.1634, + "step": 33554 + }, + { + "epoch": 2.055113944655453, + "grad_norm": 4.842248916625977, + "learning_rate": 5.504453885158285e-06, + "loss": 0.18, + "step": 33555 + }, + { + "epoch": 2.055127509495388, + "grad_norm": 5.336091041564941, + "learning_rate": 5.50431684253803e-06, + "loss": 0.2135, + "step": 33556 + }, + { + "epoch": 2.0551410743353227, + "grad_norm": 4.68034553527832, + "learning_rate": 5.5041797999177744e-06, + "loss": 0.2262, + "step": 33557 + }, + { + "epoch": 2.0551546391752575, + "grad_norm": 3.5617833137512207, + "learning_rate": 5.50404275729752e-06, + "loss": 0.1948, + "step": 33558 + }, + { + "epoch": 2.0551682040151924, + "grad_norm": 5.393107891082764, + "learning_rate": 5.503905714677265e-06, + "loss": 0.1813, + "step": 33559 + }, + { + "epoch": 2.0551817688551277, + "grad_norm": 6.056443691253662, + "learning_rate": 5.503768672057011e-06, + "loss": 0.274, + "step": 33560 + }, + { + "epoch": 2.0551953336950626, + "grad_norm": 5.138908863067627, + "learning_rate": 5.503631629436755e-06, + "loss": 0.1168, + "step": 33561 + }, + { + "epoch": 2.0552088985349974, + "grad_norm": 4.159504413604736, + "learning_rate": 5.5034945868165e-06, + "loss": 0.1813, + "step": 33562 + }, + { + "epoch": 2.0552224633749323, + "grad_norm": 5.731973171234131, + "learning_rate": 5.503357544196245e-06, + "loss": 0.2354, + "step": 33563 + }, + { + "epoch": 2.055236028214867, + "grad_norm": 6.461435794830322, + "learning_rate": 5.503220501575991e-06, + "loss": 0.1987, + "step": 33564 + }, + { + "epoch": 2.055249593054802, + "grad_norm": 6.791018962860107, + "learning_rate": 5.503083458955736e-06, + "loss": 0.2897, + "step": 33565 + }, + { + "epoch": 2.055263157894737, + "grad_norm": 5.2806854248046875, + "learning_rate": 5.502946416335481e-06, + "loss": 0.1723, + "step": 33566 + }, + { + "epoch": 2.0552767227346718, + "grad_norm": 4.4496660232543945, + "learning_rate": 5.502809373715225e-06, + "loss": 0.208, + "step": 33567 + }, + { + "epoch": 2.0552902875746066, + "grad_norm": 4.80120849609375, + "learning_rate": 5.502672331094971e-06, + "loss": 0.133, + "step": 33568 + }, + { + "epoch": 2.0553038524145415, + "grad_norm": 6.472076892852783, + "learning_rate": 5.502535288474717e-06, + "loss": 0.2506, + "step": 33569 + }, + { + "epoch": 2.0553174172544764, + "grad_norm": 8.224011421203613, + "learning_rate": 5.502398245854461e-06, + "loss": 0.3349, + "step": 33570 + }, + { + "epoch": 2.0553309820944112, + "grad_norm": 3.93546986579895, + "learning_rate": 5.502261203234206e-06, + "loss": 0.1682, + "step": 33571 + }, + { + "epoch": 2.055344546934346, + "grad_norm": 5.194952011108398, + "learning_rate": 5.502124160613952e-06, + "loss": 0.2186, + "step": 33572 + }, + { + "epoch": 2.055358111774281, + "grad_norm": 6.6549859046936035, + "learning_rate": 5.5019871179936965e-06, + "loss": 0.2491, + "step": 33573 + }, + { + "epoch": 2.055371676614216, + "grad_norm": 4.882343769073486, + "learning_rate": 5.501850075373442e-06, + "loss": 0.1876, + "step": 33574 + }, + { + "epoch": 2.0553852414541507, + "grad_norm": 4.0251359939575195, + "learning_rate": 5.501713032753187e-06, + "loss": 0.208, + "step": 33575 + }, + { + "epoch": 2.0553988062940856, + "grad_norm": 5.049236297607422, + "learning_rate": 5.501575990132931e-06, + "loss": 0.2301, + "step": 33576 + }, + { + "epoch": 2.0554123711340204, + "grad_norm": 4.420140266418457, + "learning_rate": 5.501438947512677e-06, + "loss": 0.1686, + "step": 33577 + }, + { + "epoch": 2.0554259359739557, + "grad_norm": 5.487096309661865, + "learning_rate": 5.501301904892422e-06, + "loss": 0.2706, + "step": 33578 + }, + { + "epoch": 2.0554395008138906, + "grad_norm": 6.047562599182129, + "learning_rate": 5.501164862272167e-06, + "loss": 0.2032, + "step": 33579 + }, + { + "epoch": 2.0554530656538255, + "grad_norm": 5.996899604797363, + "learning_rate": 5.501027819651912e-06, + "loss": 0.2382, + "step": 33580 + }, + { + "epoch": 2.0554666304937603, + "grad_norm": 4.394115924835205, + "learning_rate": 5.500890777031658e-06, + "loss": 0.1624, + "step": 33581 + }, + { + "epoch": 2.055480195333695, + "grad_norm": 5.949937343597412, + "learning_rate": 5.500753734411402e-06, + "loss": 0.2101, + "step": 33582 + }, + { + "epoch": 2.05549376017363, + "grad_norm": 5.825270175933838, + "learning_rate": 5.500616691791147e-06, + "loss": 0.1972, + "step": 33583 + }, + { + "epoch": 2.055507325013565, + "grad_norm": 5.931243896484375, + "learning_rate": 5.500479649170893e-06, + "loss": 0.2154, + "step": 33584 + }, + { + "epoch": 2.0555208898535, + "grad_norm": 3.984922170639038, + "learning_rate": 5.500342606550639e-06, + "loss": 0.1779, + "step": 33585 + }, + { + "epoch": 2.0555344546934347, + "grad_norm": 6.9263787269592285, + "learning_rate": 5.500205563930383e-06, + "loss": 0.288, + "step": 33586 + }, + { + "epoch": 2.0555480195333695, + "grad_norm": 4.79423713684082, + "learning_rate": 5.500068521310128e-06, + "loss": 0.1192, + "step": 33587 + }, + { + "epoch": 2.0555615843733044, + "grad_norm": 6.412978172302246, + "learning_rate": 5.4999314786898724e-06, + "loss": 0.2588, + "step": 33588 + }, + { + "epoch": 2.0555751492132392, + "grad_norm": 5.273432731628418, + "learning_rate": 5.499794436069618e-06, + "loss": 0.1893, + "step": 33589 + }, + { + "epoch": 2.055588714053174, + "grad_norm": 4.101857662200928, + "learning_rate": 5.499657393449364e-06, + "loss": 0.1133, + "step": 33590 + }, + { + "epoch": 2.055602278893109, + "grad_norm": 6.113105773925781, + "learning_rate": 5.499520350829108e-06, + "loss": 0.2953, + "step": 33591 + }, + { + "epoch": 2.055615843733044, + "grad_norm": 5.334524154663086, + "learning_rate": 5.499383308208853e-06, + "loss": 0.2051, + "step": 33592 + }, + { + "epoch": 2.0556294085729787, + "grad_norm": 8.370508193969727, + "learning_rate": 5.499246265588598e-06, + "loss": 0.4066, + "step": 33593 + }, + { + "epoch": 2.0556429734129136, + "grad_norm": 4.603095054626465, + "learning_rate": 5.499109222968344e-06, + "loss": 0.1264, + "step": 33594 + }, + { + "epoch": 2.0556565382528484, + "grad_norm": 5.364534378051758, + "learning_rate": 5.498972180348089e-06, + "loss": 0.1736, + "step": 33595 + }, + { + "epoch": 2.0556701030927833, + "grad_norm": 5.244779586791992, + "learning_rate": 5.498835137727834e-06, + "loss": 0.1484, + "step": 33596 + }, + { + "epoch": 2.055683667932718, + "grad_norm": 7.718408584594727, + "learning_rate": 5.498698095107578e-06, + "loss": 0.3037, + "step": 33597 + }, + { + "epoch": 2.0556972327726535, + "grad_norm": 6.065162181854248, + "learning_rate": 5.498561052487324e-06, + "loss": 0.2297, + "step": 33598 + }, + { + "epoch": 2.0557107976125883, + "grad_norm": 5.900818347930908, + "learning_rate": 5.498424009867069e-06, + "loss": 0.2365, + "step": 33599 + }, + { + "epoch": 2.055724362452523, + "grad_norm": 4.450569152832031, + "learning_rate": 5.498286967246815e-06, + "loss": 0.185, + "step": 33600 + }, + { + "epoch": 2.055737927292458, + "grad_norm": 4.867260932922363, + "learning_rate": 5.498149924626559e-06, + "loss": 0.1147, + "step": 33601 + }, + { + "epoch": 2.055751492132393, + "grad_norm": 3.4181253910064697, + "learning_rate": 5.498012882006304e-06, + "loss": 0.0952, + "step": 33602 + }, + { + "epoch": 2.055765056972328, + "grad_norm": 6.720203399658203, + "learning_rate": 5.49787583938605e-06, + "loss": 0.2221, + "step": 33603 + }, + { + "epoch": 2.0557786218122627, + "grad_norm": 4.427046298980713, + "learning_rate": 5.4977387967657945e-06, + "loss": 0.0914, + "step": 33604 + }, + { + "epoch": 2.0557921866521975, + "grad_norm": 5.955300807952881, + "learning_rate": 5.49760175414554e-06, + "loss": 0.164, + "step": 33605 + }, + { + "epoch": 2.0558057514921324, + "grad_norm": 3.3530476093292236, + "learning_rate": 5.497464711525284e-06, + "loss": 0.0928, + "step": 33606 + }, + { + "epoch": 2.0558193163320673, + "grad_norm": 10.035697937011719, + "learning_rate": 5.49732766890503e-06, + "loss": 0.4536, + "step": 33607 + }, + { + "epoch": 2.055832881172002, + "grad_norm": 7.232962608337402, + "learning_rate": 5.497190626284775e-06, + "loss": 0.3472, + "step": 33608 + }, + { + "epoch": 2.055846446011937, + "grad_norm": 5.748767375946045, + "learning_rate": 5.49705358366452e-06, + "loss": 0.2278, + "step": 33609 + }, + { + "epoch": 2.055860010851872, + "grad_norm": 7.7230224609375, + "learning_rate": 5.496916541044265e-06, + "loss": 0.5294, + "step": 33610 + }, + { + "epoch": 2.0558735756918067, + "grad_norm": 7.902023792266846, + "learning_rate": 5.496779498424011e-06, + "loss": 0.5052, + "step": 33611 + }, + { + "epoch": 2.0558871405317416, + "grad_norm": 4.261447906494141, + "learning_rate": 5.496642455803756e-06, + "loss": 0.1714, + "step": 33612 + }, + { + "epoch": 2.0559007053716765, + "grad_norm": 4.055497646331787, + "learning_rate": 5.4965054131835e-06, + "loss": 0.2531, + "step": 33613 + }, + { + "epoch": 2.0559142702116113, + "grad_norm": 6.015761852264404, + "learning_rate": 5.496368370563245e-06, + "loss": 0.2963, + "step": 33614 + }, + { + "epoch": 2.055927835051546, + "grad_norm": 7.659693717956543, + "learning_rate": 5.496231327942991e-06, + "loss": 0.4189, + "step": 33615 + }, + { + "epoch": 2.0559413998914815, + "grad_norm": 5.379951477050781, + "learning_rate": 5.496094285322736e-06, + "loss": 0.1572, + "step": 33616 + }, + { + "epoch": 2.0559549647314164, + "grad_norm": 10.652728080749512, + "learning_rate": 5.495957242702481e-06, + "loss": 0.4453, + "step": 33617 + }, + { + "epoch": 2.0559685295713512, + "grad_norm": 6.4153852462768555, + "learning_rate": 5.495820200082226e-06, + "loss": 0.2822, + "step": 33618 + }, + { + "epoch": 2.055982094411286, + "grad_norm": 8.028077125549316, + "learning_rate": 5.4956831574619705e-06, + "loss": 0.3475, + "step": 33619 + }, + { + "epoch": 2.055995659251221, + "grad_norm": 9.285174369812012, + "learning_rate": 5.4955461148417165e-06, + "loss": 0.3193, + "step": 33620 + }, + { + "epoch": 2.056009224091156, + "grad_norm": 5.870519161224365, + "learning_rate": 5.495409072221462e-06, + "loss": 0.1869, + "step": 33621 + }, + { + "epoch": 2.0560227889310907, + "grad_norm": 6.821006774902344, + "learning_rate": 5.495272029601206e-06, + "loss": 0.243, + "step": 33622 + }, + { + "epoch": 2.0560363537710256, + "grad_norm": 8.08555793762207, + "learning_rate": 5.495134986980951e-06, + "loss": 0.2632, + "step": 33623 + }, + { + "epoch": 2.0560499186109604, + "grad_norm": 5.266432285308838, + "learning_rate": 5.494997944360697e-06, + "loss": 0.2398, + "step": 33624 + }, + { + "epoch": 2.0560634834508953, + "grad_norm": 4.862651348114014, + "learning_rate": 5.494860901740442e-06, + "loss": 0.1775, + "step": 33625 + }, + { + "epoch": 2.05607704829083, + "grad_norm": 6.396923542022705, + "learning_rate": 5.494723859120187e-06, + "loss": 0.2262, + "step": 33626 + }, + { + "epoch": 2.056090613130765, + "grad_norm": 7.504505157470703, + "learning_rate": 5.494586816499932e-06, + "loss": 0.3094, + "step": 33627 + }, + { + "epoch": 2.0561041779707, + "grad_norm": 5.366990089416504, + "learning_rate": 5.494449773879676e-06, + "loss": 0.1365, + "step": 33628 + }, + { + "epoch": 2.0561177428106348, + "grad_norm": 4.0203704833984375, + "learning_rate": 5.494312731259422e-06, + "loss": 0.192, + "step": 33629 + }, + { + "epoch": 2.0561313076505696, + "grad_norm": 5.739722728729248, + "learning_rate": 5.494175688639167e-06, + "loss": 0.2832, + "step": 33630 + }, + { + "epoch": 2.0561448724905045, + "grad_norm": 7.2415313720703125, + "learning_rate": 5.494038646018912e-06, + "loss": 0.2357, + "step": 33631 + }, + { + "epoch": 2.0561584373304393, + "grad_norm": 6.2955827713012695, + "learning_rate": 5.493901603398657e-06, + "loss": 0.2404, + "step": 33632 + }, + { + "epoch": 2.056172002170374, + "grad_norm": 5.62396764755249, + "learning_rate": 5.493764560778403e-06, + "loss": 0.3213, + "step": 33633 + }, + { + "epoch": 2.056185567010309, + "grad_norm": 6.827342987060547, + "learning_rate": 5.493627518158148e-06, + "loss": 0.3197, + "step": 33634 + }, + { + "epoch": 2.056199131850244, + "grad_norm": 6.206226825714111, + "learning_rate": 5.4934904755378925e-06, + "loss": 0.2513, + "step": 33635 + }, + { + "epoch": 2.0562126966901793, + "grad_norm": 5.7361836433410645, + "learning_rate": 5.493353432917638e-06, + "loss": 0.3281, + "step": 33636 + }, + { + "epoch": 2.056226261530114, + "grad_norm": 4.8996477127075195, + "learning_rate": 5.493216390297384e-06, + "loss": 0.1601, + "step": 33637 + }, + { + "epoch": 2.056239826370049, + "grad_norm": 5.232273101806641, + "learning_rate": 5.493079347677128e-06, + "loss": 0.2298, + "step": 33638 + }, + { + "epoch": 2.056253391209984, + "grad_norm": 4.143950462341309, + "learning_rate": 5.492942305056873e-06, + "loss": 0.147, + "step": 33639 + }, + { + "epoch": 2.0562669560499187, + "grad_norm": 6.139623641967773, + "learning_rate": 5.4928052624366175e-06, + "loss": 0.1886, + "step": 33640 + }, + { + "epoch": 2.0562805208898536, + "grad_norm": 6.426245212554932, + "learning_rate": 5.4926682198163635e-06, + "loss": 0.3038, + "step": 33641 + }, + { + "epoch": 2.0562940857297884, + "grad_norm": 5.829150676727295, + "learning_rate": 5.492531177196109e-06, + "loss": 0.2265, + "step": 33642 + }, + { + "epoch": 2.0563076505697233, + "grad_norm": 4.791210174560547, + "learning_rate": 5.492394134575854e-06, + "loss": 0.1875, + "step": 33643 + }, + { + "epoch": 2.056321215409658, + "grad_norm": 4.850093841552734, + "learning_rate": 5.492257091955598e-06, + "loss": 0.2592, + "step": 33644 + }, + { + "epoch": 2.056334780249593, + "grad_norm": 6.472709655761719, + "learning_rate": 5.492120049335343e-06, + "loss": 0.223, + "step": 33645 + }, + { + "epoch": 2.056348345089528, + "grad_norm": 7.060451030731201, + "learning_rate": 5.4919830067150894e-06, + "loss": 0.3471, + "step": 33646 + }, + { + "epoch": 2.0563619099294628, + "grad_norm": 4.583021640777588, + "learning_rate": 5.491845964094834e-06, + "loss": 0.1678, + "step": 33647 + }, + { + "epoch": 2.0563754747693976, + "grad_norm": 4.917828559875488, + "learning_rate": 5.491708921474579e-06, + "loss": 0.1526, + "step": 33648 + }, + { + "epoch": 2.0563890396093325, + "grad_norm": 3.7710936069488525, + "learning_rate": 5.491571878854324e-06, + "loss": 0.1119, + "step": 33649 + }, + { + "epoch": 2.0564026044492674, + "grad_norm": 3.444751739501953, + "learning_rate": 5.491434836234069e-06, + "loss": 0.0983, + "step": 33650 + }, + { + "epoch": 2.0564161692892022, + "grad_norm": 3.717442750930786, + "learning_rate": 5.4912977936138145e-06, + "loss": 0.1532, + "step": 33651 + }, + { + "epoch": 2.056429734129137, + "grad_norm": 7.117782115936279, + "learning_rate": 5.49116075099356e-06, + "loss": 0.2304, + "step": 33652 + }, + { + "epoch": 2.056443298969072, + "grad_norm": 5.165327548980713, + "learning_rate": 5.491023708373304e-06, + "loss": 0.2158, + "step": 33653 + }, + { + "epoch": 2.0564568638090073, + "grad_norm": 6.820663928985596, + "learning_rate": 5.49088666575305e-06, + "loss": 0.2299, + "step": 33654 + }, + { + "epoch": 2.056470428648942, + "grad_norm": 3.6256229877471924, + "learning_rate": 5.490749623132795e-06, + "loss": 0.0761, + "step": 33655 + }, + { + "epoch": 2.056483993488877, + "grad_norm": 4.509796619415283, + "learning_rate": 5.4906125805125395e-06, + "loss": 0.1605, + "step": 33656 + }, + { + "epoch": 2.056497558328812, + "grad_norm": 3.656301736831665, + "learning_rate": 5.490475537892285e-06, + "loss": 0.1568, + "step": 33657 + }, + { + "epoch": 2.0565111231687467, + "grad_norm": 5.285406112670898, + "learning_rate": 5.49033849527203e-06, + "loss": 0.1732, + "step": 33658 + }, + { + "epoch": 2.0565246880086816, + "grad_norm": 4.935068607330322, + "learning_rate": 5.490201452651776e-06, + "loss": 0.1598, + "step": 33659 + }, + { + "epoch": 2.0565382528486165, + "grad_norm": 5.515816688537598, + "learning_rate": 5.49006441003152e-06, + "loss": 0.1771, + "step": 33660 + }, + { + "epoch": 2.0565518176885513, + "grad_norm": 5.508336544036865, + "learning_rate": 5.489927367411265e-06, + "loss": 0.1919, + "step": 33661 + }, + { + "epoch": 2.056565382528486, + "grad_norm": 4.51301383972168, + "learning_rate": 5.48979032479101e-06, + "loss": 0.1474, + "step": 33662 + }, + { + "epoch": 2.056578947368421, + "grad_norm": 4.7196269035339355, + "learning_rate": 5.489653282170756e-06, + "loss": 0.1653, + "step": 33663 + }, + { + "epoch": 2.056592512208356, + "grad_norm": 4.373140811920166, + "learning_rate": 5.489516239550501e-06, + "loss": 0.2049, + "step": 33664 + }, + { + "epoch": 2.056606077048291, + "grad_norm": 4.829638481140137, + "learning_rate": 5.489379196930245e-06, + "loss": 0.1931, + "step": 33665 + }, + { + "epoch": 2.0566196418882257, + "grad_norm": 6.377479076385498, + "learning_rate": 5.4892421543099905e-06, + "loss": 0.179, + "step": 33666 + }, + { + "epoch": 2.0566332067281605, + "grad_norm": 5.341516971588135, + "learning_rate": 5.4891051116897365e-06, + "loss": 0.3889, + "step": 33667 + }, + { + "epoch": 2.0566467715680954, + "grad_norm": 3.8880274295806885, + "learning_rate": 5.488968069069482e-06, + "loss": 0.1719, + "step": 33668 + }, + { + "epoch": 2.0566603364080303, + "grad_norm": 4.6288251876831055, + "learning_rate": 5.488831026449226e-06, + "loss": 0.1328, + "step": 33669 + }, + { + "epoch": 2.056673901247965, + "grad_norm": 4.594690322875977, + "learning_rate": 5.488693983828971e-06, + "loss": 0.2034, + "step": 33670 + }, + { + "epoch": 2.0566874660879, + "grad_norm": 4.643803596496582, + "learning_rate": 5.4885569412087155e-06, + "loss": 0.1409, + "step": 33671 + }, + { + "epoch": 2.056701030927835, + "grad_norm": 3.648676872253418, + "learning_rate": 5.4884198985884615e-06, + "loss": 0.1297, + "step": 33672 + }, + { + "epoch": 2.0567145957677697, + "grad_norm": 5.145866870880127, + "learning_rate": 5.488282855968207e-06, + "loss": 0.1437, + "step": 33673 + }, + { + "epoch": 2.056728160607705, + "grad_norm": 5.854969501495361, + "learning_rate": 5.488145813347952e-06, + "loss": 0.2272, + "step": 33674 + }, + { + "epoch": 2.05674172544764, + "grad_norm": 6.613326072692871, + "learning_rate": 5.488008770727696e-06, + "loss": 0.1827, + "step": 33675 + }, + { + "epoch": 2.0567552902875748, + "grad_norm": 4.129372596740723, + "learning_rate": 5.487871728107442e-06, + "loss": 0.119, + "step": 33676 + }, + { + "epoch": 2.0567688551275096, + "grad_norm": 4.46673059463501, + "learning_rate": 5.4877346854871874e-06, + "loss": 0.1502, + "step": 33677 + }, + { + "epoch": 2.0567824199674445, + "grad_norm": 5.765194892883301, + "learning_rate": 5.487597642866932e-06, + "loss": 0.2391, + "step": 33678 + }, + { + "epoch": 2.0567959848073794, + "grad_norm": 5.58146333694458, + "learning_rate": 5.487460600246677e-06, + "loss": 0.2822, + "step": 33679 + }, + { + "epoch": 2.056809549647314, + "grad_norm": 4.0490641593933105, + "learning_rate": 5.487323557626423e-06, + "loss": 0.2064, + "step": 33680 + }, + { + "epoch": 2.056823114487249, + "grad_norm": 4.949092388153076, + "learning_rate": 5.487186515006167e-06, + "loss": 0.2232, + "step": 33681 + }, + { + "epoch": 2.056836679327184, + "grad_norm": 2.950695276260376, + "learning_rate": 5.4870494723859125e-06, + "loss": 0.1008, + "step": 33682 + }, + { + "epoch": 2.056850244167119, + "grad_norm": 4.816627502441406, + "learning_rate": 5.486912429765658e-06, + "loss": 0.2024, + "step": 33683 + }, + { + "epoch": 2.0568638090070537, + "grad_norm": 4.490283966064453, + "learning_rate": 5.486775387145402e-06, + "loss": 0.1553, + "step": 33684 + }, + { + "epoch": 2.0568773738469885, + "grad_norm": 3.8685176372528076, + "learning_rate": 5.486638344525148e-06, + "loss": 0.1402, + "step": 33685 + }, + { + "epoch": 2.0568909386869234, + "grad_norm": 4.975740909576416, + "learning_rate": 5.486501301904893e-06, + "loss": 0.1605, + "step": 33686 + }, + { + "epoch": 2.0569045035268583, + "grad_norm": 5.5699286460876465, + "learning_rate": 5.4863642592846375e-06, + "loss": 0.2738, + "step": 33687 + }, + { + "epoch": 2.056918068366793, + "grad_norm": 3.6685283184051514, + "learning_rate": 5.486227216664383e-06, + "loss": 0.1509, + "step": 33688 + }, + { + "epoch": 2.056931633206728, + "grad_norm": 2.949721336364746, + "learning_rate": 5.486090174044129e-06, + "loss": 0.1201, + "step": 33689 + }, + { + "epoch": 2.056945198046663, + "grad_norm": 4.532892227172852, + "learning_rate": 5.485953131423873e-06, + "loss": 0.1406, + "step": 33690 + }, + { + "epoch": 2.0569587628865977, + "grad_norm": 3.9970998764038086, + "learning_rate": 5.485816088803618e-06, + "loss": 0.1101, + "step": 33691 + }, + { + "epoch": 2.056972327726533, + "grad_norm": 5.158461570739746, + "learning_rate": 5.4856790461833634e-06, + "loss": 0.1738, + "step": 33692 + }, + { + "epoch": 2.056985892566468, + "grad_norm": 4.388282299041748, + "learning_rate": 5.4855420035631095e-06, + "loss": 0.1781, + "step": 33693 + }, + { + "epoch": 2.0569994574064028, + "grad_norm": 3.9925460815429688, + "learning_rate": 5.485404960942854e-06, + "loss": 0.1091, + "step": 33694 + }, + { + "epoch": 2.0570130222463376, + "grad_norm": 4.75469446182251, + "learning_rate": 5.485267918322599e-06, + "loss": 0.2183, + "step": 33695 + }, + { + "epoch": 2.0570265870862725, + "grad_norm": 5.531163692474365, + "learning_rate": 5.485130875702343e-06, + "loss": 0.217, + "step": 33696 + }, + { + "epoch": 2.0570401519262074, + "grad_norm": 5.3330559730529785, + "learning_rate": 5.4849938330820885e-06, + "loss": 0.2116, + "step": 33697 + }, + { + "epoch": 2.0570537167661422, + "grad_norm": 5.554696083068848, + "learning_rate": 5.4848567904618345e-06, + "loss": 0.1958, + "step": 33698 + }, + { + "epoch": 2.057067281606077, + "grad_norm": 4.934061527252197, + "learning_rate": 5.484719747841579e-06, + "loss": 0.1216, + "step": 33699 + }, + { + "epoch": 2.057080846446012, + "grad_norm": 6.167428970336914, + "learning_rate": 5.484582705221324e-06, + "loss": 0.267, + "step": 33700 + }, + { + "epoch": 2.057094411285947, + "grad_norm": 4.80098295211792, + "learning_rate": 5.484445662601069e-06, + "loss": 0.1725, + "step": 33701 + }, + { + "epoch": 2.0571079761258817, + "grad_norm": 5.432327747344971, + "learning_rate": 5.484308619980815e-06, + "loss": 0.1681, + "step": 33702 + }, + { + "epoch": 2.0571215409658166, + "grad_norm": 3.7219302654266357, + "learning_rate": 5.4841715773605595e-06, + "loss": 0.1007, + "step": 33703 + }, + { + "epoch": 2.0571351058057514, + "grad_norm": 4.5920538902282715, + "learning_rate": 5.484034534740305e-06, + "loss": 0.1648, + "step": 33704 + }, + { + "epoch": 2.0571486706456863, + "grad_norm": 5.47546911239624, + "learning_rate": 5.483897492120049e-06, + "loss": 0.1416, + "step": 33705 + }, + { + "epoch": 2.057162235485621, + "grad_norm": 5.74537467956543, + "learning_rate": 5.483760449499795e-06, + "loss": 0.1546, + "step": 33706 + }, + { + "epoch": 2.057175800325556, + "grad_norm": 4.2869720458984375, + "learning_rate": 5.48362340687954e-06, + "loss": 0.1154, + "step": 33707 + }, + { + "epoch": 2.057189365165491, + "grad_norm": 5.843245506286621, + "learning_rate": 5.4834863642592854e-06, + "loss": 0.1839, + "step": 33708 + }, + { + "epoch": 2.0572029300054258, + "grad_norm": 3.3884024620056152, + "learning_rate": 5.48334932163903e-06, + "loss": 0.108, + "step": 33709 + }, + { + "epoch": 2.0572164948453606, + "grad_norm": 5.794863700866699, + "learning_rate": 5.483212279018776e-06, + "loss": 0.2014, + "step": 33710 + }, + { + "epoch": 2.057230059685296, + "grad_norm": 4.3220672607421875, + "learning_rate": 5.483075236398521e-06, + "loss": 0.1115, + "step": 33711 + }, + { + "epoch": 2.057243624525231, + "grad_norm": 6.81009578704834, + "learning_rate": 5.482938193778265e-06, + "loss": 0.4227, + "step": 33712 + }, + { + "epoch": 2.0572571893651657, + "grad_norm": 3.8302812576293945, + "learning_rate": 5.4828011511580105e-06, + "loss": 0.1474, + "step": 33713 + }, + { + "epoch": 2.0572707542051005, + "grad_norm": 5.995415210723877, + "learning_rate": 5.482664108537755e-06, + "loss": 0.1811, + "step": 33714 + }, + { + "epoch": 2.0572843190450354, + "grad_norm": 4.762346267700195, + "learning_rate": 5.482527065917501e-06, + "loss": 0.1478, + "step": 33715 + }, + { + "epoch": 2.0572978838849703, + "grad_norm": 6.812037944793701, + "learning_rate": 5.482390023297246e-06, + "loss": 0.1396, + "step": 33716 + }, + { + "epoch": 2.057311448724905, + "grad_norm": 4.860437393188477, + "learning_rate": 5.482252980676991e-06, + "loss": 0.174, + "step": 33717 + }, + { + "epoch": 2.05732501356484, + "grad_norm": 4.226644992828369, + "learning_rate": 5.4821159380567355e-06, + "loss": 0.1508, + "step": 33718 + }, + { + "epoch": 2.057338578404775, + "grad_norm": 5.63773250579834, + "learning_rate": 5.4819788954364816e-06, + "loss": 0.1852, + "step": 33719 + }, + { + "epoch": 2.0573521432447097, + "grad_norm": 4.241535186767578, + "learning_rate": 5.481841852816227e-06, + "loss": 0.0932, + "step": 33720 + }, + { + "epoch": 2.0573657080846446, + "grad_norm": 5.2595953941345215, + "learning_rate": 5.481704810195971e-06, + "loss": 0.1042, + "step": 33721 + }, + { + "epoch": 2.0573792729245794, + "grad_norm": 4.595753192901611, + "learning_rate": 5.481567767575716e-06, + "loss": 0.1687, + "step": 33722 + }, + { + "epoch": 2.0573928377645143, + "grad_norm": 3.4599742889404297, + "learning_rate": 5.481430724955462e-06, + "loss": 0.082, + "step": 33723 + }, + { + "epoch": 2.057406402604449, + "grad_norm": 7.855424880981445, + "learning_rate": 5.481293682335207e-06, + "loss": 0.1989, + "step": 33724 + }, + { + "epoch": 2.057419967444384, + "grad_norm": 8.107216835021973, + "learning_rate": 5.481156639714952e-06, + "loss": 0.1913, + "step": 33725 + }, + { + "epoch": 2.057433532284319, + "grad_norm": 5.387030124664307, + "learning_rate": 5.481019597094697e-06, + "loss": 0.2182, + "step": 33726 + }, + { + "epoch": 2.0574470971242538, + "grad_norm": 5.897358417510986, + "learning_rate": 5.480882554474441e-06, + "loss": 0.2549, + "step": 33727 + }, + { + "epoch": 2.0574606619641886, + "grad_norm": 5.8339409828186035, + "learning_rate": 5.480745511854187e-06, + "loss": 0.3015, + "step": 33728 + }, + { + "epoch": 2.0574742268041235, + "grad_norm": 5.108001232147217, + "learning_rate": 5.4806084692339325e-06, + "loss": 0.1366, + "step": 33729 + }, + { + "epoch": 2.057487791644059, + "grad_norm": 3.7473082542419434, + "learning_rate": 5.480471426613677e-06, + "loss": 0.0878, + "step": 33730 + }, + { + "epoch": 2.0575013564839937, + "grad_norm": 5.663012504577637, + "learning_rate": 5.480334383993422e-06, + "loss": 0.2245, + "step": 33731 + }, + { + "epoch": 2.0575149213239285, + "grad_norm": 4.897394180297852, + "learning_rate": 5.480197341373168e-06, + "loss": 0.1807, + "step": 33732 + }, + { + "epoch": 2.0575284861638634, + "grad_norm": 5.140928268432617, + "learning_rate": 5.480060298752912e-06, + "loss": 0.1484, + "step": 33733 + }, + { + "epoch": 2.0575420510037983, + "grad_norm": 5.293062686920166, + "learning_rate": 5.4799232561326576e-06, + "loss": 0.2676, + "step": 33734 + }, + { + "epoch": 2.057555615843733, + "grad_norm": 4.124567031860352, + "learning_rate": 5.479786213512403e-06, + "loss": 0.1187, + "step": 33735 + }, + { + "epoch": 2.057569180683668, + "grad_norm": 6.050452709197998, + "learning_rate": 5.479649170892149e-06, + "loss": 0.1618, + "step": 33736 + }, + { + "epoch": 2.057582745523603, + "grad_norm": 5.741689682006836, + "learning_rate": 5.479512128271893e-06, + "loss": 0.1965, + "step": 33737 + }, + { + "epoch": 2.0575963103635377, + "grad_norm": 6.499661922454834, + "learning_rate": 5.479375085651638e-06, + "loss": 0.3176, + "step": 33738 + }, + { + "epoch": 2.0576098752034726, + "grad_norm": 4.978801250457764, + "learning_rate": 5.479238043031383e-06, + "loss": 0.1992, + "step": 33739 + }, + { + "epoch": 2.0576234400434075, + "grad_norm": 6.181230545043945, + "learning_rate": 5.479101000411128e-06, + "loss": 0.3396, + "step": 33740 + }, + { + "epoch": 2.0576370048833423, + "grad_norm": 5.527645111083984, + "learning_rate": 5.478963957790874e-06, + "loss": 0.313, + "step": 33741 + }, + { + "epoch": 2.057650569723277, + "grad_norm": 7.063731670379639, + "learning_rate": 5.478826915170619e-06, + "loss": 0.2829, + "step": 33742 + }, + { + "epoch": 2.057664134563212, + "grad_norm": 4.454232215881348, + "learning_rate": 5.478689872550363e-06, + "loss": 0.172, + "step": 33743 + }, + { + "epoch": 2.057677699403147, + "grad_norm": 5.573300361633301, + "learning_rate": 5.4785528299301085e-06, + "loss": 0.2664, + "step": 33744 + }, + { + "epoch": 2.057691264243082, + "grad_norm": 6.084722995758057, + "learning_rate": 5.4784157873098545e-06, + "loss": 0.2409, + "step": 33745 + }, + { + "epoch": 2.0577048290830167, + "grad_norm": 3.8094191551208496, + "learning_rate": 5.478278744689599e-06, + "loss": 0.1392, + "step": 33746 + }, + { + "epoch": 2.0577183939229515, + "grad_norm": 4.4214067459106445, + "learning_rate": 5.478141702069344e-06, + "loss": 0.1193, + "step": 33747 + }, + { + "epoch": 2.0577319587628864, + "grad_norm": 4.703001499176025, + "learning_rate": 5.478004659449088e-06, + "loss": 0.1481, + "step": 33748 + }, + { + "epoch": 2.0577455236028217, + "grad_norm": 4.025692939758301, + "learning_rate": 5.477867616828834e-06, + "loss": 0.1676, + "step": 33749 + }, + { + "epoch": 2.0577590884427566, + "grad_norm": 4.82866096496582, + "learning_rate": 5.4777305742085796e-06, + "loss": 0.1929, + "step": 33750 + }, + { + "epoch": 2.0577726532826914, + "grad_norm": 5.241706371307373, + "learning_rate": 5.477593531588325e-06, + "loss": 0.1893, + "step": 33751 + }, + { + "epoch": 2.0577862181226263, + "grad_norm": 4.293602466583252, + "learning_rate": 5.477456488968069e-06, + "loss": 0.1539, + "step": 33752 + }, + { + "epoch": 2.057799782962561, + "grad_norm": 5.545947551727295, + "learning_rate": 5.477319446347814e-06, + "loss": 0.2198, + "step": 33753 + }, + { + "epoch": 2.057813347802496, + "grad_norm": 3.730036497116089, + "learning_rate": 5.47718240372756e-06, + "loss": 0.1308, + "step": 33754 + }, + { + "epoch": 2.057826912642431, + "grad_norm": 5.196306228637695, + "learning_rate": 5.477045361107305e-06, + "loss": 0.1949, + "step": 33755 + }, + { + "epoch": 2.0578404774823658, + "grad_norm": 8.82504653930664, + "learning_rate": 5.47690831848705e-06, + "loss": 0.3504, + "step": 33756 + }, + { + "epoch": 2.0578540423223006, + "grad_norm": 3.3898658752441406, + "learning_rate": 5.476771275866795e-06, + "loss": 0.0902, + "step": 33757 + }, + { + "epoch": 2.0578676071622355, + "grad_norm": 4.141444206237793, + "learning_rate": 5.47663423324654e-06, + "loss": 0.2347, + "step": 33758 + }, + { + "epoch": 2.0578811720021704, + "grad_norm": 7.688693523406982, + "learning_rate": 5.476497190626285e-06, + "loss": 0.2407, + "step": 33759 + }, + { + "epoch": 2.057894736842105, + "grad_norm": 3.685981512069702, + "learning_rate": 5.4763601480060305e-06, + "loss": 0.1781, + "step": 33760 + }, + { + "epoch": 2.05790830168204, + "grad_norm": 4.503770351409912, + "learning_rate": 5.476223105385775e-06, + "loss": 0.17, + "step": 33761 + }, + { + "epoch": 2.057921866521975, + "grad_norm": 3.6773221492767334, + "learning_rate": 5.476086062765521e-06, + "loss": 0.1833, + "step": 33762 + }, + { + "epoch": 2.05793543136191, + "grad_norm": 4.863100051879883, + "learning_rate": 5.475949020145266e-06, + "loss": 0.1954, + "step": 33763 + }, + { + "epoch": 2.0579489962018447, + "grad_norm": 3.9904685020446777, + "learning_rate": 5.47581197752501e-06, + "loss": 0.2215, + "step": 33764 + }, + { + "epoch": 2.0579625610417795, + "grad_norm": 4.398934841156006, + "learning_rate": 5.4756749349047556e-06, + "loss": 0.1431, + "step": 33765 + }, + { + "epoch": 2.0579761258817144, + "grad_norm": 3.6644532680511475, + "learning_rate": 5.475537892284501e-06, + "loss": 0.2104, + "step": 33766 + }, + { + "epoch": 2.0579896907216493, + "grad_norm": 5.249566078186035, + "learning_rate": 5.475400849664247e-06, + "loss": 0.1104, + "step": 33767 + }, + { + "epoch": 2.0580032555615846, + "grad_norm": 6.540065765380859, + "learning_rate": 5.475263807043991e-06, + "loss": 0.3501, + "step": 33768 + }, + { + "epoch": 2.0580168204015195, + "grad_norm": 3.365386724472046, + "learning_rate": 5.475126764423736e-06, + "loss": 0.093, + "step": 33769 + }, + { + "epoch": 2.0580303852414543, + "grad_norm": 3.9969425201416016, + "learning_rate": 5.474989721803481e-06, + "loss": 0.2127, + "step": 33770 + }, + { + "epoch": 2.058043950081389, + "grad_norm": 4.2322869300842285, + "learning_rate": 5.474852679183227e-06, + "loss": 0.1822, + "step": 33771 + }, + { + "epoch": 2.058057514921324, + "grad_norm": 3.7349376678466797, + "learning_rate": 5.474715636562972e-06, + "loss": 0.1937, + "step": 33772 + }, + { + "epoch": 2.058071079761259, + "grad_norm": 4.609387397766113, + "learning_rate": 5.474578593942716e-06, + "loss": 0.1839, + "step": 33773 + }, + { + "epoch": 2.058084644601194, + "grad_norm": 3.95291805267334, + "learning_rate": 5.474441551322461e-06, + "loss": 0.1299, + "step": 33774 + }, + { + "epoch": 2.0580982094411286, + "grad_norm": 3.845163583755493, + "learning_rate": 5.474304508702207e-06, + "loss": 0.2062, + "step": 33775 + }, + { + "epoch": 2.0581117742810635, + "grad_norm": 5.527761936187744, + "learning_rate": 5.4741674660819525e-06, + "loss": 0.2114, + "step": 33776 + }, + { + "epoch": 2.0581253391209984, + "grad_norm": 5.782878398895264, + "learning_rate": 5.474030423461697e-06, + "loss": 0.2242, + "step": 33777 + }, + { + "epoch": 2.0581389039609332, + "grad_norm": 7.516814708709717, + "learning_rate": 5.473893380841442e-06, + "loss": 0.3938, + "step": 33778 + }, + { + "epoch": 2.058152468800868, + "grad_norm": 3.6250877380371094, + "learning_rate": 5.473756338221186e-06, + "loss": 0.1152, + "step": 33779 + }, + { + "epoch": 2.058166033640803, + "grad_norm": 5.189467906951904, + "learning_rate": 5.473619295600932e-06, + "loss": 0.2067, + "step": 33780 + }, + { + "epoch": 2.058179598480738, + "grad_norm": 4.101466655731201, + "learning_rate": 5.4734822529806776e-06, + "loss": 0.1284, + "step": 33781 + }, + { + "epoch": 2.0581931633206727, + "grad_norm": 3.585542678833008, + "learning_rate": 5.473345210360422e-06, + "loss": 0.1794, + "step": 33782 + }, + { + "epoch": 2.0582067281606076, + "grad_norm": 3.96616268157959, + "learning_rate": 5.473208167740167e-06, + "loss": 0.1447, + "step": 33783 + }, + { + "epoch": 2.0582202930005424, + "grad_norm": 5.835402011871338, + "learning_rate": 5.473071125119913e-06, + "loss": 0.2145, + "step": 33784 + }, + { + "epoch": 2.0582338578404773, + "grad_norm": 5.749636650085449, + "learning_rate": 5.472934082499658e-06, + "loss": 0.2654, + "step": 33785 + }, + { + "epoch": 2.058247422680412, + "grad_norm": 3.196621894836426, + "learning_rate": 5.472797039879403e-06, + "loss": 0.0958, + "step": 33786 + }, + { + "epoch": 2.0582609875203475, + "grad_norm": 4.320978164672852, + "learning_rate": 5.472659997259148e-06, + "loss": 0.2083, + "step": 33787 + }, + { + "epoch": 2.0582745523602823, + "grad_norm": 8.58769416809082, + "learning_rate": 5.472522954638894e-06, + "loss": 0.5461, + "step": 33788 + }, + { + "epoch": 2.058288117200217, + "grad_norm": 4.714343070983887, + "learning_rate": 5.472385912018638e-06, + "loss": 0.1778, + "step": 33789 + }, + { + "epoch": 2.058301682040152, + "grad_norm": 4.527885913848877, + "learning_rate": 5.472248869398383e-06, + "loss": 0.2524, + "step": 33790 + }, + { + "epoch": 2.058315246880087, + "grad_norm": 6.453401565551758, + "learning_rate": 5.4721118267781285e-06, + "loss": 0.2856, + "step": 33791 + }, + { + "epoch": 2.058328811720022, + "grad_norm": 4.077685832977295, + "learning_rate": 5.471974784157874e-06, + "loss": 0.1516, + "step": 33792 + }, + { + "epoch": 2.0583423765599567, + "grad_norm": 4.257521629333496, + "learning_rate": 5.471837741537619e-06, + "loss": 0.1857, + "step": 33793 + }, + { + "epoch": 2.0583559413998915, + "grad_norm": 4.114638805389404, + "learning_rate": 5.471700698917364e-06, + "loss": 0.1081, + "step": 33794 + }, + { + "epoch": 2.0583695062398264, + "grad_norm": 4.9287214279174805, + "learning_rate": 5.471563656297108e-06, + "loss": 0.1513, + "step": 33795 + }, + { + "epoch": 2.0583830710797613, + "grad_norm": 4.277852535247803, + "learning_rate": 5.4714266136768536e-06, + "loss": 0.2045, + "step": 33796 + }, + { + "epoch": 2.058396635919696, + "grad_norm": 3.9964280128479004, + "learning_rate": 5.4712895710566e-06, + "loss": 0.1897, + "step": 33797 + }, + { + "epoch": 2.058410200759631, + "grad_norm": 7.935815334320068, + "learning_rate": 5.471152528436344e-06, + "loss": 0.3981, + "step": 33798 + }, + { + "epoch": 2.058423765599566, + "grad_norm": 5.386159420013428, + "learning_rate": 5.471015485816089e-06, + "loss": 0.2325, + "step": 33799 + }, + { + "epoch": 2.0584373304395007, + "grad_norm": 6.276516914367676, + "learning_rate": 5.470878443195834e-06, + "loss": 0.3285, + "step": 33800 + }, + { + "epoch": 2.0584508952794356, + "grad_norm": 4.317415714263916, + "learning_rate": 5.47074140057558e-06, + "loss": 0.1316, + "step": 33801 + }, + { + "epoch": 2.0584644601193705, + "grad_norm": 6.143228054046631, + "learning_rate": 5.470604357955325e-06, + "loss": 0.2234, + "step": 33802 + }, + { + "epoch": 2.0584780249593053, + "grad_norm": 5.619141101837158, + "learning_rate": 5.47046731533507e-06, + "loss": 0.2032, + "step": 33803 + }, + { + "epoch": 2.05849158979924, + "grad_norm": 4.267284393310547, + "learning_rate": 5.470330272714814e-06, + "loss": 0.2186, + "step": 33804 + }, + { + "epoch": 2.058505154639175, + "grad_norm": 4.999780178070068, + "learning_rate": 5.47019323009456e-06, + "loss": 0.2516, + "step": 33805 + }, + { + "epoch": 2.0585187194791104, + "grad_norm": 4.211101055145264, + "learning_rate": 5.470056187474305e-06, + "loss": 0.1596, + "step": 33806 + }, + { + "epoch": 2.0585322843190452, + "grad_norm": 4.985128402709961, + "learning_rate": 5.46991914485405e-06, + "loss": 0.1575, + "step": 33807 + }, + { + "epoch": 2.05854584915898, + "grad_norm": 6.766435146331787, + "learning_rate": 5.469782102233795e-06, + "loss": 0.3809, + "step": 33808 + }, + { + "epoch": 2.058559413998915, + "grad_norm": 4.210143089294434, + "learning_rate": 5.46964505961354e-06, + "loss": 0.1367, + "step": 33809 + }, + { + "epoch": 2.05857297883885, + "grad_norm": 4.702183246612549, + "learning_rate": 5.469508016993286e-06, + "loss": 0.2103, + "step": 33810 + }, + { + "epoch": 2.0585865436787847, + "grad_norm": 4.973165988922119, + "learning_rate": 5.46937097437303e-06, + "loss": 0.2261, + "step": 33811 + }, + { + "epoch": 2.0586001085187196, + "grad_norm": 3.857940196990967, + "learning_rate": 5.4692339317527756e-06, + "loss": 0.1788, + "step": 33812 + }, + { + "epoch": 2.0586136733586544, + "grad_norm": 7.587179183959961, + "learning_rate": 5.46909688913252e-06, + "loss": 0.3827, + "step": 33813 + }, + { + "epoch": 2.0586272381985893, + "grad_norm": 4.62351655960083, + "learning_rate": 5.468959846512266e-06, + "loss": 0.1522, + "step": 33814 + }, + { + "epoch": 2.058640803038524, + "grad_norm": 4.913422584533691, + "learning_rate": 5.468822803892011e-06, + "loss": 0.1323, + "step": 33815 + }, + { + "epoch": 2.058654367878459, + "grad_norm": 6.356014728546143, + "learning_rate": 5.468685761271756e-06, + "loss": 0.3773, + "step": 33816 + }, + { + "epoch": 2.058667932718394, + "grad_norm": 3.624660015106201, + "learning_rate": 5.468548718651501e-06, + "loss": 0.1185, + "step": 33817 + }, + { + "epoch": 2.0586814975583287, + "grad_norm": 4.897812366485596, + "learning_rate": 5.468411676031247e-06, + "loss": 0.1428, + "step": 33818 + }, + { + "epoch": 2.0586950623982636, + "grad_norm": 4.551772117614746, + "learning_rate": 5.468274633410992e-06, + "loss": 0.1344, + "step": 33819 + }, + { + "epoch": 2.0587086272381985, + "grad_norm": 4.170984745025635, + "learning_rate": 5.468137590790736e-06, + "loss": 0.1661, + "step": 33820 + }, + { + "epoch": 2.0587221920781333, + "grad_norm": 5.930610179901123, + "learning_rate": 5.468000548170481e-06, + "loss": 0.228, + "step": 33821 + }, + { + "epoch": 2.058735756918068, + "grad_norm": 3.3585689067840576, + "learning_rate": 5.467863505550226e-06, + "loss": 0.1343, + "step": 33822 + }, + { + "epoch": 2.058749321758003, + "grad_norm": 3.904294729232788, + "learning_rate": 5.467726462929972e-06, + "loss": 0.111, + "step": 33823 + }, + { + "epoch": 2.058762886597938, + "grad_norm": 3.0978245735168457, + "learning_rate": 5.467589420309717e-06, + "loss": 0.0915, + "step": 33824 + }, + { + "epoch": 2.0587764514378732, + "grad_norm": 4.52126407623291, + "learning_rate": 5.467452377689462e-06, + "loss": 0.1813, + "step": 33825 + }, + { + "epoch": 2.058790016277808, + "grad_norm": 3.396106719970703, + "learning_rate": 5.467315335069206e-06, + "loss": 0.1617, + "step": 33826 + }, + { + "epoch": 2.058803581117743, + "grad_norm": 3.31318736076355, + "learning_rate": 5.467178292448952e-06, + "loss": 0.0724, + "step": 33827 + }, + { + "epoch": 2.058817145957678, + "grad_norm": 3.109483242034912, + "learning_rate": 5.467041249828698e-06, + "loss": 0.1173, + "step": 33828 + }, + { + "epoch": 2.0588307107976127, + "grad_norm": 4.190613746643066, + "learning_rate": 5.466904207208442e-06, + "loss": 0.1607, + "step": 33829 + }, + { + "epoch": 2.0588442756375476, + "grad_norm": 3.2200632095336914, + "learning_rate": 5.466767164588187e-06, + "loss": 0.0929, + "step": 33830 + }, + { + "epoch": 2.0588578404774824, + "grad_norm": 5.001763343811035, + "learning_rate": 5.466630121967933e-06, + "loss": 0.162, + "step": 33831 + }, + { + "epoch": 2.0588714053174173, + "grad_norm": 4.782763957977295, + "learning_rate": 5.4664930793476775e-06, + "loss": 0.1848, + "step": 33832 + }, + { + "epoch": 2.058884970157352, + "grad_norm": 2.8915247917175293, + "learning_rate": 5.466356036727423e-06, + "loss": 0.1083, + "step": 33833 + }, + { + "epoch": 2.058898534997287, + "grad_norm": 3.1431281566619873, + "learning_rate": 5.466218994107168e-06, + "loss": 0.1438, + "step": 33834 + }, + { + "epoch": 2.058912099837222, + "grad_norm": 5.834199905395508, + "learning_rate": 5.466081951486912e-06, + "loss": 0.3501, + "step": 33835 + }, + { + "epoch": 2.0589256646771568, + "grad_norm": 5.6226396560668945, + "learning_rate": 5.465944908866658e-06, + "loss": 0.2036, + "step": 33836 + }, + { + "epoch": 2.0589392295170916, + "grad_norm": 4.490776062011719, + "learning_rate": 5.465807866246403e-06, + "loss": 0.1725, + "step": 33837 + }, + { + "epoch": 2.0589527943570265, + "grad_norm": 6.1423773765563965, + "learning_rate": 5.465670823626148e-06, + "loss": 0.1519, + "step": 33838 + }, + { + "epoch": 2.0589663591969614, + "grad_norm": 3.8975253105163574, + "learning_rate": 5.465533781005893e-06, + "loss": 0.14, + "step": 33839 + }, + { + "epoch": 2.0589799240368962, + "grad_norm": 3.938436269760132, + "learning_rate": 5.465396738385639e-06, + "loss": 0.1285, + "step": 33840 + }, + { + "epoch": 2.058993488876831, + "grad_norm": 5.730786323547363, + "learning_rate": 5.465259695765383e-06, + "loss": 0.1603, + "step": 33841 + }, + { + "epoch": 2.059007053716766, + "grad_norm": 3.507065773010254, + "learning_rate": 5.465122653145128e-06, + "loss": 0.1318, + "step": 33842 + }, + { + "epoch": 2.059020618556701, + "grad_norm": 5.035525798797607, + "learning_rate": 5.464985610524874e-06, + "loss": 0.1766, + "step": 33843 + }, + { + "epoch": 2.059034183396636, + "grad_norm": 3.87064528465271, + "learning_rate": 5.46484856790462e-06, + "loss": 0.1023, + "step": 33844 + }, + { + "epoch": 2.059047748236571, + "grad_norm": 4.718033790588379, + "learning_rate": 5.464711525284364e-06, + "loss": 0.1032, + "step": 33845 + }, + { + "epoch": 2.059061313076506, + "grad_norm": 3.813000440597534, + "learning_rate": 5.464574482664109e-06, + "loss": 0.1693, + "step": 33846 + }, + { + "epoch": 2.0590748779164407, + "grad_norm": 3.9040474891662598, + "learning_rate": 5.4644374400438534e-06, + "loss": 0.124, + "step": 33847 + }, + { + "epoch": 2.0590884427563756, + "grad_norm": 2.5552167892456055, + "learning_rate": 5.464300397423599e-06, + "loss": 0.0529, + "step": 33848 + }, + { + "epoch": 2.0591020075963105, + "grad_norm": 5.264678001403809, + "learning_rate": 5.464163354803345e-06, + "loss": 0.1551, + "step": 33849 + }, + { + "epoch": 2.0591155724362453, + "grad_norm": 5.045903205871582, + "learning_rate": 5.46402631218309e-06, + "loss": 0.1688, + "step": 33850 + }, + { + "epoch": 2.05912913727618, + "grad_norm": 5.318442344665527, + "learning_rate": 5.463889269562834e-06, + "loss": 0.2558, + "step": 33851 + }, + { + "epoch": 2.059142702116115, + "grad_norm": 4.184573173522949, + "learning_rate": 5.463752226942579e-06, + "loss": 0.1927, + "step": 33852 + }, + { + "epoch": 2.05915626695605, + "grad_norm": 4.146298408508301, + "learning_rate": 5.463615184322325e-06, + "loss": 0.1512, + "step": 33853 + }, + { + "epoch": 2.059169831795985, + "grad_norm": 6.932418346405029, + "learning_rate": 5.46347814170207e-06, + "loss": 0.2368, + "step": 33854 + }, + { + "epoch": 2.0591833966359196, + "grad_norm": 3.618116855621338, + "learning_rate": 5.463341099081815e-06, + "loss": 0.1524, + "step": 33855 + }, + { + "epoch": 2.0591969614758545, + "grad_norm": 5.488018989562988, + "learning_rate": 5.463204056461559e-06, + "loss": 0.203, + "step": 33856 + }, + { + "epoch": 2.0592105263157894, + "grad_norm": 5.0043206214904785, + "learning_rate": 5.463067013841305e-06, + "loss": 0.2086, + "step": 33857 + }, + { + "epoch": 2.0592240911557242, + "grad_norm": 5.336464881896973, + "learning_rate": 5.46292997122105e-06, + "loss": 0.2795, + "step": 33858 + }, + { + "epoch": 2.059237655995659, + "grad_norm": 4.068552017211914, + "learning_rate": 5.462792928600796e-06, + "loss": 0.1757, + "step": 33859 + }, + { + "epoch": 2.059251220835594, + "grad_norm": 5.346880912780762, + "learning_rate": 5.46265588598054e-06, + "loss": 0.1599, + "step": 33860 + }, + { + "epoch": 2.059264785675529, + "grad_norm": 3.461611270904541, + "learning_rate": 5.462518843360286e-06, + "loss": 0.1506, + "step": 33861 + }, + { + "epoch": 2.0592783505154637, + "grad_norm": 3.925213575363159, + "learning_rate": 5.462381800740031e-06, + "loss": 0.1704, + "step": 33862 + }, + { + "epoch": 2.059291915355399, + "grad_norm": 4.132728576660156, + "learning_rate": 5.4622447581197755e-06, + "loss": 0.0893, + "step": 33863 + }, + { + "epoch": 2.059305480195334, + "grad_norm": 3.234558582305908, + "learning_rate": 5.462107715499521e-06, + "loss": 0.1242, + "step": 33864 + }, + { + "epoch": 2.0593190450352687, + "grad_norm": 6.149785041809082, + "learning_rate": 5.461970672879266e-06, + "loss": 0.2819, + "step": 33865 + }, + { + "epoch": 2.0593326098752036, + "grad_norm": 5.055850028991699, + "learning_rate": 5.461833630259011e-06, + "loss": 0.1928, + "step": 33866 + }, + { + "epoch": 2.0593461747151385, + "grad_norm": 4.88685417175293, + "learning_rate": 5.461696587638756e-06, + "loss": 0.1662, + "step": 33867 + }, + { + "epoch": 2.0593597395550733, + "grad_norm": 4.2096662521362305, + "learning_rate": 5.461559545018501e-06, + "loss": 0.1581, + "step": 33868 + }, + { + "epoch": 2.059373304395008, + "grad_norm": 4.182452201843262, + "learning_rate": 5.461422502398246e-06, + "loss": 0.2104, + "step": 33869 + }, + { + "epoch": 2.059386869234943, + "grad_norm": 3.7196435928344727, + "learning_rate": 5.461285459777992e-06, + "loss": 0.153, + "step": 33870 + }, + { + "epoch": 2.059400434074878, + "grad_norm": 3.565053939819336, + "learning_rate": 5.461148417157737e-06, + "loss": 0.1065, + "step": 33871 + }, + { + "epoch": 2.059413998914813, + "grad_norm": 2.9819915294647217, + "learning_rate": 5.461011374537481e-06, + "loss": 0.0888, + "step": 33872 + }, + { + "epoch": 2.0594275637547477, + "grad_norm": 4.294938087463379, + "learning_rate": 5.460874331917226e-06, + "loss": 0.1423, + "step": 33873 + }, + { + "epoch": 2.0594411285946825, + "grad_norm": 5.494981288909912, + "learning_rate": 5.4607372892969724e-06, + "loss": 0.2202, + "step": 33874 + }, + { + "epoch": 2.0594546934346174, + "grad_norm": 4.608980178833008, + "learning_rate": 5.460600246676717e-06, + "loss": 0.1584, + "step": 33875 + }, + { + "epoch": 2.0594682582745523, + "grad_norm": 4.340164661407471, + "learning_rate": 5.460463204056462e-06, + "loss": 0.1577, + "step": 33876 + }, + { + "epoch": 2.059481823114487, + "grad_norm": 4.024872779846191, + "learning_rate": 5.460326161436207e-06, + "loss": 0.164, + "step": 33877 + }, + { + "epoch": 2.059495387954422, + "grad_norm": 6.2760467529296875, + "learning_rate": 5.4601891188159515e-06, + "loss": 0.2366, + "step": 33878 + }, + { + "epoch": 2.059508952794357, + "grad_norm": 6.36717414855957, + "learning_rate": 5.4600520761956975e-06, + "loss": 0.2365, + "step": 33879 + }, + { + "epoch": 2.0595225176342917, + "grad_norm": 5.4529290199279785, + "learning_rate": 5.459915033575443e-06, + "loss": 0.1759, + "step": 33880 + }, + { + "epoch": 2.0595360824742266, + "grad_norm": 4.111979007720947, + "learning_rate": 5.459777990955187e-06, + "loss": 0.1875, + "step": 33881 + }, + { + "epoch": 2.059549647314162, + "grad_norm": 5.596325874328613, + "learning_rate": 5.459640948334932e-06, + "loss": 0.1871, + "step": 33882 + }, + { + "epoch": 2.0595632121540968, + "grad_norm": 3.9410319328308105, + "learning_rate": 5.459503905714678e-06, + "loss": 0.1731, + "step": 33883 + }, + { + "epoch": 2.0595767769940316, + "grad_norm": 5.909302234649658, + "learning_rate": 5.459366863094423e-06, + "loss": 0.2887, + "step": 33884 + }, + { + "epoch": 2.0595903418339665, + "grad_norm": 4.553335666656494, + "learning_rate": 5.459229820474168e-06, + "loss": 0.2753, + "step": 33885 + }, + { + "epoch": 2.0596039066739014, + "grad_norm": 7.14124059677124, + "learning_rate": 5.459092777853913e-06, + "loss": 0.232, + "step": 33886 + }, + { + "epoch": 2.0596174715138362, + "grad_norm": 3.724217414855957, + "learning_rate": 5.458955735233659e-06, + "loss": 0.1836, + "step": 33887 + }, + { + "epoch": 2.059631036353771, + "grad_norm": 5.957900047302246, + "learning_rate": 5.458818692613403e-06, + "loss": 0.263, + "step": 33888 + }, + { + "epoch": 2.059644601193706, + "grad_norm": 4.628172397613525, + "learning_rate": 5.458681649993148e-06, + "loss": 0.236, + "step": 33889 + }, + { + "epoch": 2.059658166033641, + "grad_norm": 5.854728698730469, + "learning_rate": 5.458544607372893e-06, + "loss": 0.2852, + "step": 33890 + }, + { + "epoch": 2.0596717308735757, + "grad_norm": 3.752115488052368, + "learning_rate": 5.458407564752638e-06, + "loss": 0.1975, + "step": 33891 + }, + { + "epoch": 2.0596852957135106, + "grad_norm": 4.535750389099121, + "learning_rate": 5.458270522132384e-06, + "loss": 0.2288, + "step": 33892 + }, + { + "epoch": 2.0596988605534454, + "grad_norm": 4.975393295288086, + "learning_rate": 5.458133479512129e-06, + "loss": 0.2654, + "step": 33893 + }, + { + "epoch": 2.0597124253933803, + "grad_norm": 4.734886169433594, + "learning_rate": 5.4579964368918735e-06, + "loss": 0.2126, + "step": 33894 + }, + { + "epoch": 2.059725990233315, + "grad_norm": 4.734899997711182, + "learning_rate": 5.457859394271619e-06, + "loss": 0.2081, + "step": 33895 + }, + { + "epoch": 2.05973955507325, + "grad_norm": 4.382165431976318, + "learning_rate": 5.457722351651365e-06, + "loss": 0.2368, + "step": 33896 + }, + { + "epoch": 2.059753119913185, + "grad_norm": 5.35761833190918, + "learning_rate": 5.457585309031109e-06, + "loss": 0.1953, + "step": 33897 + }, + { + "epoch": 2.0597666847531197, + "grad_norm": 4.959895610809326, + "learning_rate": 5.457448266410854e-06, + "loss": 0.2143, + "step": 33898 + }, + { + "epoch": 2.0597802495930546, + "grad_norm": 5.949836254119873, + "learning_rate": 5.457311223790599e-06, + "loss": 0.2135, + "step": 33899 + }, + { + "epoch": 2.0597938144329895, + "grad_norm": 4.344404220581055, + "learning_rate": 5.4571741811703445e-06, + "loss": 0.179, + "step": 33900 + }, + { + "epoch": 2.059807379272925, + "grad_norm": 5.713665962219238, + "learning_rate": 5.45703713855009e-06, + "loss": 0.286, + "step": 33901 + }, + { + "epoch": 2.0598209441128597, + "grad_norm": 6.135073661804199, + "learning_rate": 5.456900095929835e-06, + "loss": 0.1983, + "step": 33902 + }, + { + "epoch": 2.0598345089527945, + "grad_norm": 6.82203483581543, + "learning_rate": 5.456763053309579e-06, + "loss": 0.2488, + "step": 33903 + }, + { + "epoch": 2.0598480737927294, + "grad_norm": 4.480790138244629, + "learning_rate": 5.456626010689324e-06, + "loss": 0.1355, + "step": 33904 + }, + { + "epoch": 2.0598616386326642, + "grad_norm": 5.559930801391602, + "learning_rate": 5.4564889680690704e-06, + "loss": 0.1986, + "step": 33905 + }, + { + "epoch": 2.059875203472599, + "grad_norm": 3.7476680278778076, + "learning_rate": 5.456351925448815e-06, + "loss": 0.1457, + "step": 33906 + }, + { + "epoch": 2.059888768312534, + "grad_norm": 4.275031566619873, + "learning_rate": 5.45621488282856e-06, + "loss": 0.1904, + "step": 33907 + }, + { + "epoch": 2.059902333152469, + "grad_norm": 4.720082759857178, + "learning_rate": 5.456077840208305e-06, + "loss": 0.2163, + "step": 33908 + }, + { + "epoch": 2.0599158979924037, + "grad_norm": 5.243856906890869, + "learning_rate": 5.455940797588051e-06, + "loss": 0.234, + "step": 33909 + }, + { + "epoch": 2.0599294628323386, + "grad_norm": 3.5188772678375244, + "learning_rate": 5.4558037549677955e-06, + "loss": 0.1972, + "step": 33910 + }, + { + "epoch": 2.0599430276722734, + "grad_norm": 4.300881862640381, + "learning_rate": 5.455666712347541e-06, + "loss": 0.2573, + "step": 33911 + }, + { + "epoch": 2.0599565925122083, + "grad_norm": 5.827674865722656, + "learning_rate": 5.455529669727285e-06, + "loss": 0.2844, + "step": 33912 + }, + { + "epoch": 2.059970157352143, + "grad_norm": 4.66912841796875, + "learning_rate": 5.455392627107031e-06, + "loss": 0.1909, + "step": 33913 + }, + { + "epoch": 2.059983722192078, + "grad_norm": 3.9637253284454346, + "learning_rate": 5.455255584486776e-06, + "loss": 0.1583, + "step": 33914 + }, + { + "epoch": 2.059997287032013, + "grad_norm": 5.766948223114014, + "learning_rate": 5.4551185418665205e-06, + "loss": 0.1991, + "step": 33915 + }, + { + "epoch": 2.0600108518719478, + "grad_norm": 3.2579658031463623, + "learning_rate": 5.454981499246266e-06, + "loss": 0.1212, + "step": 33916 + }, + { + "epoch": 2.0600244167118826, + "grad_norm": 6.056964874267578, + "learning_rate": 5.454844456626011e-06, + "loss": 0.2306, + "step": 33917 + }, + { + "epoch": 2.0600379815518175, + "grad_norm": 7.247814178466797, + "learning_rate": 5.454707414005757e-06, + "loss": 0.2224, + "step": 33918 + }, + { + "epoch": 2.0600515463917524, + "grad_norm": 3.713405132293701, + "learning_rate": 5.454570371385501e-06, + "loss": 0.1671, + "step": 33919 + }, + { + "epoch": 2.0600651112316877, + "grad_norm": 4.670582294464111, + "learning_rate": 5.4544333287652464e-06, + "loss": 0.1803, + "step": 33920 + }, + { + "epoch": 2.0600786760716225, + "grad_norm": 4.937886714935303, + "learning_rate": 5.454296286144991e-06, + "loss": 0.2033, + "step": 33921 + }, + { + "epoch": 2.0600922409115574, + "grad_norm": 3.9254984855651855, + "learning_rate": 5.454159243524737e-06, + "loss": 0.0888, + "step": 33922 + }, + { + "epoch": 2.0601058057514923, + "grad_norm": 3.9663407802581787, + "learning_rate": 5.454022200904482e-06, + "loss": 0.1108, + "step": 33923 + }, + { + "epoch": 2.060119370591427, + "grad_norm": 3.749582052230835, + "learning_rate": 5.453885158284226e-06, + "loss": 0.1521, + "step": 33924 + }, + { + "epoch": 2.060132935431362, + "grad_norm": 3.838613510131836, + "learning_rate": 5.4537481156639715e-06, + "loss": 0.1597, + "step": 33925 + }, + { + "epoch": 2.060146500271297, + "grad_norm": 3.145094633102417, + "learning_rate": 5.4536110730437175e-06, + "loss": 0.1287, + "step": 33926 + }, + { + "epoch": 2.0601600651112317, + "grad_norm": 4.095972061157227, + "learning_rate": 5.453474030423463e-06, + "loss": 0.1471, + "step": 33927 + }, + { + "epoch": 2.0601736299511666, + "grad_norm": 4.242530822753906, + "learning_rate": 5.453336987803207e-06, + "loss": 0.1898, + "step": 33928 + }, + { + "epoch": 2.0601871947911015, + "grad_norm": 5.193459510803223, + "learning_rate": 5.453199945182952e-06, + "loss": 0.1795, + "step": 33929 + }, + { + "epoch": 2.0602007596310363, + "grad_norm": 4.567845821380615, + "learning_rate": 5.453062902562698e-06, + "loss": 0.1396, + "step": 33930 + }, + { + "epoch": 2.060214324470971, + "grad_norm": 4.648841857910156, + "learning_rate": 5.4529258599424425e-06, + "loss": 0.1819, + "step": 33931 + }, + { + "epoch": 2.060227889310906, + "grad_norm": 3.372044086456299, + "learning_rate": 5.452788817322188e-06, + "loss": 0.1622, + "step": 33932 + }, + { + "epoch": 2.060241454150841, + "grad_norm": 4.422704219818115, + "learning_rate": 5.452651774701933e-06, + "loss": 0.1721, + "step": 33933 + }, + { + "epoch": 2.060255018990776, + "grad_norm": 4.369091987609863, + "learning_rate": 5.452514732081677e-06, + "loss": 0.1595, + "step": 33934 + }, + { + "epoch": 2.0602685838307107, + "grad_norm": 3.6640589237213135, + "learning_rate": 5.452377689461423e-06, + "loss": 0.0959, + "step": 33935 + }, + { + "epoch": 2.0602821486706455, + "grad_norm": 6.072648525238037, + "learning_rate": 5.4522406468411684e-06, + "loss": 0.1475, + "step": 33936 + }, + { + "epoch": 2.0602957135105804, + "grad_norm": 4.981650352478027, + "learning_rate": 5.452103604220913e-06, + "loss": 0.1774, + "step": 33937 + }, + { + "epoch": 2.0603092783505152, + "grad_norm": 3.8319544792175293, + "learning_rate": 5.451966561600658e-06, + "loss": 0.1399, + "step": 33938 + }, + { + "epoch": 2.0603228431904506, + "grad_norm": 4.455573558807373, + "learning_rate": 5.451829518980404e-06, + "loss": 0.174, + "step": 33939 + }, + { + "epoch": 2.0603364080303854, + "grad_norm": 3.8695931434631348, + "learning_rate": 5.451692476360148e-06, + "loss": 0.1782, + "step": 33940 + }, + { + "epoch": 2.0603499728703203, + "grad_norm": 5.30676794052124, + "learning_rate": 5.4515554337398935e-06, + "loss": 0.31, + "step": 33941 + }, + { + "epoch": 2.060363537710255, + "grad_norm": 6.004730701446533, + "learning_rate": 5.451418391119639e-06, + "loss": 0.1854, + "step": 33942 + }, + { + "epoch": 2.06037710255019, + "grad_norm": 6.129815101623535, + "learning_rate": 5.451281348499385e-06, + "loss": 0.2508, + "step": 33943 + }, + { + "epoch": 2.060390667390125, + "grad_norm": 7.22970724105835, + "learning_rate": 5.451144305879129e-06, + "loss": 0.1725, + "step": 33944 + }, + { + "epoch": 2.0604042322300598, + "grad_norm": 3.577397108078003, + "learning_rate": 5.451007263258874e-06, + "loss": 0.1238, + "step": 33945 + }, + { + "epoch": 2.0604177970699946, + "grad_norm": 6.523820877075195, + "learning_rate": 5.4508702206386185e-06, + "loss": 0.192, + "step": 33946 + }, + { + "epoch": 2.0604313619099295, + "grad_norm": 3.3921101093292236, + "learning_rate": 5.450733178018364e-06, + "loss": 0.0997, + "step": 33947 + }, + { + "epoch": 2.0604449267498643, + "grad_norm": 5.5833048820495605, + "learning_rate": 5.45059613539811e-06, + "loss": 0.212, + "step": 33948 + }, + { + "epoch": 2.060458491589799, + "grad_norm": 3.130944013595581, + "learning_rate": 5.450459092777854e-06, + "loss": 0.0954, + "step": 33949 + }, + { + "epoch": 2.060472056429734, + "grad_norm": 4.164292812347412, + "learning_rate": 5.450322050157599e-06, + "loss": 0.2485, + "step": 33950 + }, + { + "epoch": 2.060485621269669, + "grad_norm": 6.572721004486084, + "learning_rate": 5.4501850075373444e-06, + "loss": 0.1994, + "step": 33951 + }, + { + "epoch": 2.060499186109604, + "grad_norm": 3.141012668609619, + "learning_rate": 5.4500479649170905e-06, + "loss": 0.1111, + "step": 33952 + }, + { + "epoch": 2.0605127509495387, + "grad_norm": 3.9272592067718506, + "learning_rate": 5.449910922296835e-06, + "loss": 0.1442, + "step": 33953 + }, + { + "epoch": 2.0605263157894735, + "grad_norm": 4.443992614746094, + "learning_rate": 5.44977387967658e-06, + "loss": 0.1656, + "step": 33954 + }, + { + "epoch": 2.0605398806294084, + "grad_norm": 4.109272003173828, + "learning_rate": 5.449636837056324e-06, + "loss": 0.1958, + "step": 33955 + }, + { + "epoch": 2.0605534454693433, + "grad_norm": 5.348515510559082, + "learning_rate": 5.44949979443607e-06, + "loss": 0.2164, + "step": 33956 + }, + { + "epoch": 2.060567010309278, + "grad_norm": 4.915592193603516, + "learning_rate": 5.4493627518158155e-06, + "loss": 0.149, + "step": 33957 + }, + { + "epoch": 2.0605805751492134, + "grad_norm": 4.999640464782715, + "learning_rate": 5.449225709195561e-06, + "loss": 0.1972, + "step": 33958 + }, + { + "epoch": 2.0605941399891483, + "grad_norm": 4.440965175628662, + "learning_rate": 5.449088666575305e-06, + "loss": 0.1349, + "step": 33959 + }, + { + "epoch": 2.060607704829083, + "grad_norm": 3.9905388355255127, + "learning_rate": 5.44895162395505e-06, + "loss": 0.1198, + "step": 33960 + }, + { + "epoch": 2.060621269669018, + "grad_norm": 5.647791385650635, + "learning_rate": 5.448814581334796e-06, + "loss": 0.2744, + "step": 33961 + }, + { + "epoch": 2.060634834508953, + "grad_norm": 6.815630912780762, + "learning_rate": 5.4486775387145405e-06, + "loss": 0.2122, + "step": 33962 + }, + { + "epoch": 2.0606483993488878, + "grad_norm": 4.5577802658081055, + "learning_rate": 5.448540496094286e-06, + "loss": 0.1154, + "step": 33963 + }, + { + "epoch": 2.0606619641888226, + "grad_norm": 8.457215309143066, + "learning_rate": 5.44840345347403e-06, + "loss": 0.2805, + "step": 33964 + }, + { + "epoch": 2.0606755290287575, + "grad_norm": 3.3961756229400635, + "learning_rate": 5.448266410853776e-06, + "loss": 0.0809, + "step": 33965 + }, + { + "epoch": 2.0606890938686924, + "grad_norm": 4.5467634201049805, + "learning_rate": 5.448129368233521e-06, + "loss": 0.1198, + "step": 33966 + }, + { + "epoch": 2.0607026587086272, + "grad_norm": 5.618119716644287, + "learning_rate": 5.4479923256132664e-06, + "loss": 0.2117, + "step": 33967 + }, + { + "epoch": 2.060716223548562, + "grad_norm": 4.723643779754639, + "learning_rate": 5.447855282993011e-06, + "loss": 0.2525, + "step": 33968 + }, + { + "epoch": 2.060729788388497, + "grad_norm": 4.553889751434326, + "learning_rate": 5.447718240372757e-06, + "loss": 0.1554, + "step": 33969 + }, + { + "epoch": 2.060743353228432, + "grad_norm": 6.654090404510498, + "learning_rate": 5.447581197752502e-06, + "loss": 0.2841, + "step": 33970 + }, + { + "epoch": 2.0607569180683667, + "grad_norm": 5.401650905609131, + "learning_rate": 5.447444155132246e-06, + "loss": 0.2796, + "step": 33971 + }, + { + "epoch": 2.0607704829083016, + "grad_norm": 5.1516008377075195, + "learning_rate": 5.4473071125119915e-06, + "loss": 0.2057, + "step": 33972 + }, + { + "epoch": 2.0607840477482364, + "grad_norm": 6.71877908706665, + "learning_rate": 5.447170069891736e-06, + "loss": 0.3431, + "step": 33973 + }, + { + "epoch": 2.0607976125881713, + "grad_norm": 3.434299945831299, + "learning_rate": 5.447033027271482e-06, + "loss": 0.1163, + "step": 33974 + }, + { + "epoch": 2.060811177428106, + "grad_norm": 5.108819007873535, + "learning_rate": 5.446895984651227e-06, + "loss": 0.2004, + "step": 33975 + }, + { + "epoch": 2.060824742268041, + "grad_norm": 4.420967102050781, + "learning_rate": 5.446758942030972e-06, + "loss": 0.1976, + "step": 33976 + }, + { + "epoch": 2.0608383071079763, + "grad_norm": 4.128447532653809, + "learning_rate": 5.4466218994107165e-06, + "loss": 0.1469, + "step": 33977 + }, + { + "epoch": 2.060851871947911, + "grad_norm": 4.047733306884766, + "learning_rate": 5.4464848567904626e-06, + "loss": 0.188, + "step": 33978 + }, + { + "epoch": 2.060865436787846, + "grad_norm": 4.804057598114014, + "learning_rate": 5.446347814170208e-06, + "loss": 0.1846, + "step": 33979 + }, + { + "epoch": 2.060879001627781, + "grad_norm": 5.231438636779785, + "learning_rate": 5.446210771549952e-06, + "loss": 0.2423, + "step": 33980 + }, + { + "epoch": 2.060892566467716, + "grad_norm": 4.982701778411865, + "learning_rate": 5.446073728929697e-06, + "loss": 0.1471, + "step": 33981 + }, + { + "epoch": 2.0609061313076507, + "grad_norm": 4.630263328552246, + "learning_rate": 5.445936686309443e-06, + "loss": 0.1441, + "step": 33982 + }, + { + "epoch": 2.0609196961475855, + "grad_norm": 5.615734577178955, + "learning_rate": 5.445799643689188e-06, + "loss": 0.1974, + "step": 33983 + }, + { + "epoch": 2.0609332609875204, + "grad_norm": 5.867135047912598, + "learning_rate": 5.445662601068933e-06, + "loss": 0.1649, + "step": 33984 + }, + { + "epoch": 2.0609468258274553, + "grad_norm": 5.049881935119629, + "learning_rate": 5.445525558448678e-06, + "loss": 0.1845, + "step": 33985 + }, + { + "epoch": 2.06096039066739, + "grad_norm": 6.060460567474365, + "learning_rate": 5.445388515828422e-06, + "loss": 0.2614, + "step": 33986 + }, + { + "epoch": 2.060973955507325, + "grad_norm": 6.8391432762146, + "learning_rate": 5.445251473208168e-06, + "loss": 0.2148, + "step": 33987 + }, + { + "epoch": 2.06098752034726, + "grad_norm": 4.5074992179870605, + "learning_rate": 5.4451144305879135e-06, + "loss": 0.2334, + "step": 33988 + }, + { + "epoch": 2.0610010851871947, + "grad_norm": 7.065555095672607, + "learning_rate": 5.444977387967658e-06, + "loss": 0.2618, + "step": 33989 + }, + { + "epoch": 2.0610146500271296, + "grad_norm": 5.0398454666137695, + "learning_rate": 5.444840345347403e-06, + "loss": 0.2394, + "step": 33990 + }, + { + "epoch": 2.0610282148670644, + "grad_norm": 4.275350570678711, + "learning_rate": 5.444703302727149e-06, + "loss": 0.2099, + "step": 33991 + }, + { + "epoch": 2.0610417797069993, + "grad_norm": 5.739682197570801, + "learning_rate": 5.444566260106894e-06, + "loss": 0.3203, + "step": 33992 + }, + { + "epoch": 2.061055344546934, + "grad_norm": 4.605108261108398, + "learning_rate": 5.4444292174866386e-06, + "loss": 0.2165, + "step": 33993 + }, + { + "epoch": 2.061068909386869, + "grad_norm": 4.534181594848633, + "learning_rate": 5.444292174866384e-06, + "loss": 0.2259, + "step": 33994 + }, + { + "epoch": 2.061082474226804, + "grad_norm": 4.534286022186279, + "learning_rate": 5.44415513224613e-06, + "loss": 0.1749, + "step": 33995 + }, + { + "epoch": 2.061096039066739, + "grad_norm": 3.6834990978240967, + "learning_rate": 5.444018089625874e-06, + "loss": 0.1327, + "step": 33996 + }, + { + "epoch": 2.061109603906674, + "grad_norm": 6.710902214050293, + "learning_rate": 5.443881047005619e-06, + "loss": 0.3074, + "step": 33997 + }, + { + "epoch": 2.061123168746609, + "grad_norm": 7.429980754852295, + "learning_rate": 5.443744004385364e-06, + "loss": 0.3541, + "step": 33998 + }, + { + "epoch": 2.061136733586544, + "grad_norm": 6.261305809020996, + "learning_rate": 5.44360696176511e-06, + "loss": 0.2493, + "step": 33999 + }, + { + "epoch": 2.0611502984264787, + "grad_norm": 6.1797332763671875, + "learning_rate": 5.443469919144855e-06, + "loss": 0.2301, + "step": 34000 + }, + { + "epoch": 2.0611638632664135, + "grad_norm": 6.502707481384277, + "learning_rate": 5.4433328765246e-06, + "loss": 0.3004, + "step": 34001 + }, + { + "epoch": 2.0611774281063484, + "grad_norm": 4.6404194831848145, + "learning_rate": 5.443195833904344e-06, + "loss": 0.2169, + "step": 34002 + }, + { + "epoch": 2.0611909929462833, + "grad_norm": 4.46113920211792, + "learning_rate": 5.4430587912840895e-06, + "loss": 0.1668, + "step": 34003 + }, + { + "epoch": 2.061204557786218, + "grad_norm": 5.416807174682617, + "learning_rate": 5.4429217486638355e-06, + "loss": 0.174, + "step": 34004 + }, + { + "epoch": 2.061218122626153, + "grad_norm": 8.044841766357422, + "learning_rate": 5.44278470604358e-06, + "loss": 0.3147, + "step": 34005 + }, + { + "epoch": 2.061231687466088, + "grad_norm": 5.908016681671143, + "learning_rate": 5.442647663423325e-06, + "loss": 0.2882, + "step": 34006 + }, + { + "epoch": 2.0612452523060227, + "grad_norm": 4.739662170410156, + "learning_rate": 5.44251062080307e-06, + "loss": 0.2561, + "step": 34007 + }, + { + "epoch": 2.0612588171459576, + "grad_norm": 4.770229816436768, + "learning_rate": 5.442373578182815e-06, + "loss": 0.1271, + "step": 34008 + }, + { + "epoch": 2.0612723819858925, + "grad_norm": 6.091816425323486, + "learning_rate": 5.4422365355625606e-06, + "loss": 0.2055, + "step": 34009 + }, + { + "epoch": 2.0612859468258273, + "grad_norm": 5.700109004974365, + "learning_rate": 5.442099492942306e-06, + "loss": 0.3211, + "step": 34010 + }, + { + "epoch": 2.061299511665762, + "grad_norm": 6.113646507263184, + "learning_rate": 5.44196245032205e-06, + "loss": 0.2735, + "step": 34011 + }, + { + "epoch": 2.061313076505697, + "grad_norm": 3.4141578674316406, + "learning_rate": 5.441825407701796e-06, + "loss": 0.1857, + "step": 34012 + }, + { + "epoch": 2.061326641345632, + "grad_norm": 7.162111282348633, + "learning_rate": 5.441688365081541e-06, + "loss": 0.3759, + "step": 34013 + }, + { + "epoch": 2.061340206185567, + "grad_norm": 5.806449890136719, + "learning_rate": 5.441551322461286e-06, + "loss": 0.2133, + "step": 34014 + }, + { + "epoch": 2.061353771025502, + "grad_norm": 3.7335658073425293, + "learning_rate": 5.441414279841031e-06, + "loss": 0.1878, + "step": 34015 + }, + { + "epoch": 2.061367335865437, + "grad_norm": 4.744592666625977, + "learning_rate": 5.441277237220776e-06, + "loss": 0.1766, + "step": 34016 + }, + { + "epoch": 2.061380900705372, + "grad_norm": 4.298946380615234, + "learning_rate": 5.441140194600521e-06, + "loss": 0.1831, + "step": 34017 + }, + { + "epoch": 2.0613944655453067, + "grad_norm": 5.089208126068115, + "learning_rate": 5.441003151980266e-06, + "loss": 0.2, + "step": 34018 + }, + { + "epoch": 2.0614080303852416, + "grad_norm": 5.600697994232178, + "learning_rate": 5.4408661093600115e-06, + "loss": 0.224, + "step": 34019 + }, + { + "epoch": 2.0614215952251764, + "grad_norm": 5.6633687019348145, + "learning_rate": 5.440729066739756e-06, + "loss": 0.2602, + "step": 34020 + }, + { + "epoch": 2.0614351600651113, + "grad_norm": 5.073419570922852, + "learning_rate": 5.440592024119502e-06, + "loss": 0.204, + "step": 34021 + }, + { + "epoch": 2.061448724905046, + "grad_norm": 6.520572662353516, + "learning_rate": 5.440454981499247e-06, + "loss": 0.2348, + "step": 34022 + }, + { + "epoch": 2.061462289744981, + "grad_norm": 4.923872470855713, + "learning_rate": 5.440317938878991e-06, + "loss": 0.1546, + "step": 34023 + }, + { + "epoch": 2.061475854584916, + "grad_norm": 5.895728588104248, + "learning_rate": 5.4401808962587366e-06, + "loss": 0.2055, + "step": 34024 + }, + { + "epoch": 2.0614894194248508, + "grad_norm": 4.35606050491333, + "learning_rate": 5.440043853638483e-06, + "loss": 0.1688, + "step": 34025 + }, + { + "epoch": 2.0615029842647856, + "grad_norm": 6.228154182434082, + "learning_rate": 5.439906811018228e-06, + "loss": 0.3133, + "step": 34026 + }, + { + "epoch": 2.0615165491047205, + "grad_norm": 4.345833778381348, + "learning_rate": 5.439769768397972e-06, + "loss": 0.1451, + "step": 34027 + }, + { + "epoch": 2.0615301139446554, + "grad_norm": 3.8297064304351807, + "learning_rate": 5.439632725777717e-06, + "loss": 0.151, + "step": 34028 + }, + { + "epoch": 2.06154367878459, + "grad_norm": 4.125931739807129, + "learning_rate": 5.439495683157462e-06, + "loss": 0.1309, + "step": 34029 + }, + { + "epoch": 2.061557243624525, + "grad_norm": 6.245725631713867, + "learning_rate": 5.439358640537208e-06, + "loss": 0.269, + "step": 34030 + }, + { + "epoch": 2.06157080846446, + "grad_norm": 4.464747905731201, + "learning_rate": 5.439221597916953e-06, + "loss": 0.1788, + "step": 34031 + }, + { + "epoch": 2.061584373304395, + "grad_norm": 6.809254169464111, + "learning_rate": 5.439084555296697e-06, + "loss": 0.2922, + "step": 34032 + }, + { + "epoch": 2.06159793814433, + "grad_norm": 4.753357410430908, + "learning_rate": 5.438947512676442e-06, + "loss": 0.296, + "step": 34033 + }, + { + "epoch": 2.061611502984265, + "grad_norm": 6.413049221038818, + "learning_rate": 5.438810470056188e-06, + "loss": 0.3321, + "step": 34034 + }, + { + "epoch": 2.0616250678242, + "grad_norm": 3.992079734802246, + "learning_rate": 5.4386734274359335e-06, + "loss": 0.1417, + "step": 34035 + }, + { + "epoch": 2.0616386326641347, + "grad_norm": 5.237245559692383, + "learning_rate": 5.438536384815678e-06, + "loss": 0.2693, + "step": 34036 + }, + { + "epoch": 2.0616521975040696, + "grad_norm": 4.127764701843262, + "learning_rate": 5.438399342195423e-06, + "loss": 0.1423, + "step": 34037 + }, + { + "epoch": 2.0616657623440044, + "grad_norm": 6.061254024505615, + "learning_rate": 5.438262299575169e-06, + "loss": 0.2285, + "step": 34038 + }, + { + "epoch": 2.0616793271839393, + "grad_norm": 6.2141547203063965, + "learning_rate": 5.438125256954913e-06, + "loss": 0.2153, + "step": 34039 + }, + { + "epoch": 2.061692892023874, + "grad_norm": 8.780482292175293, + "learning_rate": 5.4379882143346586e-06, + "loss": 0.4103, + "step": 34040 + }, + { + "epoch": 2.061706456863809, + "grad_norm": 6.229882717132568, + "learning_rate": 5.437851171714404e-06, + "loss": 0.2867, + "step": 34041 + }, + { + "epoch": 2.061720021703744, + "grad_norm": 4.422661781311035, + "learning_rate": 5.437714129094148e-06, + "loss": 0.1391, + "step": 34042 + }, + { + "epoch": 2.0617335865436788, + "grad_norm": 6.391876697540283, + "learning_rate": 5.437577086473894e-06, + "loss": 0.2582, + "step": 34043 + }, + { + "epoch": 2.0617471513836136, + "grad_norm": 5.220160484313965, + "learning_rate": 5.437440043853639e-06, + "loss": 0.1749, + "step": 34044 + }, + { + "epoch": 2.0617607162235485, + "grad_norm": 4.054870128631592, + "learning_rate": 5.437303001233384e-06, + "loss": 0.3076, + "step": 34045 + }, + { + "epoch": 2.0617742810634834, + "grad_norm": 3.0314061641693115, + "learning_rate": 5.437165958613129e-06, + "loss": 0.0847, + "step": 34046 + }, + { + "epoch": 2.0617878459034182, + "grad_norm": 4.246902942657471, + "learning_rate": 5.437028915992875e-06, + "loss": 0.2597, + "step": 34047 + }, + { + "epoch": 2.061801410743353, + "grad_norm": 5.253680229187012, + "learning_rate": 5.436891873372619e-06, + "loss": 0.2533, + "step": 34048 + }, + { + "epoch": 2.061814975583288, + "grad_norm": 6.7599005699157715, + "learning_rate": 5.436754830752364e-06, + "loss": 0.352, + "step": 34049 + }, + { + "epoch": 2.061828540423223, + "grad_norm": 7.103600978851318, + "learning_rate": 5.4366177881321095e-06, + "loss": 0.2836, + "step": 34050 + }, + { + "epoch": 2.0618421052631577, + "grad_norm": 6.429677486419678, + "learning_rate": 5.4364807455118555e-06, + "loss": 0.3043, + "step": 34051 + }, + { + "epoch": 2.0618556701030926, + "grad_norm": 5.321510314941406, + "learning_rate": 5.4363437028916e-06, + "loss": 0.1434, + "step": 34052 + }, + { + "epoch": 2.061869234943028, + "grad_norm": 7.835411071777344, + "learning_rate": 5.436206660271345e-06, + "loss": 0.3554, + "step": 34053 + }, + { + "epoch": 2.0618827997829627, + "grad_norm": 7.000866413116455, + "learning_rate": 5.436069617651089e-06, + "loss": 0.2896, + "step": 34054 + }, + { + "epoch": 2.0618963646228976, + "grad_norm": 6.59138822555542, + "learning_rate": 5.4359325750308346e-06, + "loss": 0.1912, + "step": 34055 + }, + { + "epoch": 2.0619099294628325, + "grad_norm": 3.5481040477752686, + "learning_rate": 5.435795532410581e-06, + "loss": 0.1105, + "step": 34056 + }, + { + "epoch": 2.0619234943027673, + "grad_norm": 5.905213832855225, + "learning_rate": 5.435658489790325e-06, + "loss": 0.2321, + "step": 34057 + }, + { + "epoch": 2.061937059142702, + "grad_norm": 5.7562031745910645, + "learning_rate": 5.43552144717007e-06, + "loss": 0.1464, + "step": 34058 + }, + { + "epoch": 2.061950623982637, + "grad_norm": 7.149603843688965, + "learning_rate": 5.435384404549815e-06, + "loss": 0.3189, + "step": 34059 + }, + { + "epoch": 2.061964188822572, + "grad_norm": 5.2208709716796875, + "learning_rate": 5.435247361929561e-06, + "loss": 0.2156, + "step": 34060 + }, + { + "epoch": 2.061977753662507, + "grad_norm": 5.5084638595581055, + "learning_rate": 5.435110319309306e-06, + "loss": 0.2459, + "step": 34061 + }, + { + "epoch": 2.0619913185024417, + "grad_norm": 4.263581275939941, + "learning_rate": 5.434973276689051e-06, + "loss": 0.1692, + "step": 34062 + }, + { + "epoch": 2.0620048833423765, + "grad_norm": 5.952816963195801, + "learning_rate": 5.434836234068795e-06, + "loss": 0.3455, + "step": 34063 + }, + { + "epoch": 2.0620184481823114, + "grad_norm": 5.025806903839111, + "learning_rate": 5.434699191448541e-06, + "loss": 0.2583, + "step": 34064 + }, + { + "epoch": 2.0620320130222463, + "grad_norm": 6.078362941741943, + "learning_rate": 5.434562148828286e-06, + "loss": 0.3741, + "step": 34065 + }, + { + "epoch": 2.062045577862181, + "grad_norm": 4.08030891418457, + "learning_rate": 5.434425106208031e-06, + "loss": 0.2181, + "step": 34066 + }, + { + "epoch": 2.062059142702116, + "grad_norm": 7.890849590301514, + "learning_rate": 5.434288063587776e-06, + "loss": 0.3966, + "step": 34067 + }, + { + "epoch": 2.062072707542051, + "grad_norm": 4.390295505523682, + "learning_rate": 5.434151020967522e-06, + "loss": 0.1604, + "step": 34068 + }, + { + "epoch": 2.0620862723819857, + "grad_norm": 5.411861419677734, + "learning_rate": 5.434013978347267e-06, + "loss": 0.1832, + "step": 34069 + }, + { + "epoch": 2.0620998372219206, + "grad_norm": 6.1900153160095215, + "learning_rate": 5.433876935727011e-06, + "loss": 0.2708, + "step": 34070 + }, + { + "epoch": 2.062113402061856, + "grad_norm": 5.414394855499268, + "learning_rate": 5.433739893106757e-06, + "loss": 0.2436, + "step": 34071 + }, + { + "epoch": 2.0621269669017908, + "grad_norm": 5.6031975746154785, + "learning_rate": 5.433602850486501e-06, + "loss": 0.2527, + "step": 34072 + }, + { + "epoch": 2.0621405317417256, + "grad_norm": 5.044384002685547, + "learning_rate": 5.433465807866247e-06, + "loss": 0.1378, + "step": 34073 + }, + { + "epoch": 2.0621540965816605, + "grad_norm": 5.582083225250244, + "learning_rate": 5.433328765245992e-06, + "loss": 0.1745, + "step": 34074 + }, + { + "epoch": 2.0621676614215954, + "grad_norm": 5.891573429107666, + "learning_rate": 5.433191722625737e-06, + "loss": 0.1968, + "step": 34075 + }, + { + "epoch": 2.06218122626153, + "grad_norm": 6.833204746246338, + "learning_rate": 5.433054680005482e-06, + "loss": 0.1957, + "step": 34076 + }, + { + "epoch": 2.062194791101465, + "grad_norm": 3.501055955886841, + "learning_rate": 5.432917637385228e-06, + "loss": 0.0755, + "step": 34077 + }, + { + "epoch": 2.0622083559414, + "grad_norm": 5.424310684204102, + "learning_rate": 5.432780594764973e-06, + "loss": 0.2263, + "step": 34078 + }, + { + "epoch": 2.062221920781335, + "grad_norm": 5.213437080383301, + "learning_rate": 5.432643552144717e-06, + "loss": 0.1967, + "step": 34079 + }, + { + "epoch": 2.0622354856212697, + "grad_norm": 5.397943019866943, + "learning_rate": 5.432506509524462e-06, + "loss": 0.2241, + "step": 34080 + }, + { + "epoch": 2.0622490504612045, + "grad_norm": 4.187997817993164, + "learning_rate": 5.432369466904208e-06, + "loss": 0.1037, + "step": 34081 + }, + { + "epoch": 2.0622626153011394, + "grad_norm": 5.39340877532959, + "learning_rate": 5.432232424283953e-06, + "loss": 0.2051, + "step": 34082 + }, + { + "epoch": 2.0622761801410743, + "grad_norm": 4.692936420440674, + "learning_rate": 5.432095381663698e-06, + "loss": 0.2391, + "step": 34083 + }, + { + "epoch": 2.062289744981009, + "grad_norm": 5.436929225921631, + "learning_rate": 5.431958339043443e-06, + "loss": 0.2235, + "step": 34084 + }, + { + "epoch": 2.062303309820944, + "grad_norm": 5.347605228424072, + "learning_rate": 5.431821296423187e-06, + "loss": 0.2037, + "step": 34085 + }, + { + "epoch": 2.062316874660879, + "grad_norm": 5.196797847747803, + "learning_rate": 5.431684253802933e-06, + "loss": 0.26, + "step": 34086 + }, + { + "epoch": 2.0623304395008137, + "grad_norm": 3.7242045402526855, + "learning_rate": 5.431547211182679e-06, + "loss": 0.1582, + "step": 34087 + }, + { + "epoch": 2.0623440043407486, + "grad_norm": 5.624413967132568, + "learning_rate": 5.431410168562423e-06, + "loss": 0.1718, + "step": 34088 + }, + { + "epoch": 2.0623575691806835, + "grad_norm": 4.867279529571533, + "learning_rate": 5.431273125942168e-06, + "loss": 0.1751, + "step": 34089 + }, + { + "epoch": 2.0623711340206183, + "grad_norm": 3.771383762359619, + "learning_rate": 5.431136083321914e-06, + "loss": 0.1549, + "step": 34090 + }, + { + "epoch": 2.0623846988605536, + "grad_norm": 5.530686378479004, + "learning_rate": 5.4309990407016585e-06, + "loss": 0.2747, + "step": 34091 + }, + { + "epoch": 2.0623982637004885, + "grad_norm": 3.9202821254730225, + "learning_rate": 5.430861998081404e-06, + "loss": 0.1076, + "step": 34092 + }, + { + "epoch": 2.0624118285404234, + "grad_norm": 5.070395469665527, + "learning_rate": 5.430724955461149e-06, + "loss": 0.0866, + "step": 34093 + }, + { + "epoch": 2.0624253933803582, + "grad_norm": 5.175241947174072, + "learning_rate": 5.430587912840895e-06, + "loss": 0.2345, + "step": 34094 + }, + { + "epoch": 2.062438958220293, + "grad_norm": 5.752140522003174, + "learning_rate": 5.430450870220639e-06, + "loss": 0.1773, + "step": 34095 + }, + { + "epoch": 2.062452523060228, + "grad_norm": 4.179776668548584, + "learning_rate": 5.430313827600384e-06, + "loss": 0.1338, + "step": 34096 + }, + { + "epoch": 2.062466087900163, + "grad_norm": 3.4384143352508545, + "learning_rate": 5.430176784980129e-06, + "loss": 0.1066, + "step": 34097 + }, + { + "epoch": 2.0624796527400977, + "grad_norm": 5.385684013366699, + "learning_rate": 5.430039742359874e-06, + "loss": 0.1392, + "step": 34098 + }, + { + "epoch": 2.0624932175800326, + "grad_norm": 6.273384094238281, + "learning_rate": 5.42990269973962e-06, + "loss": 0.1604, + "step": 34099 + }, + { + "epoch": 2.0625067824199674, + "grad_norm": 3.2945947647094727, + "learning_rate": 5.429765657119365e-06, + "loss": 0.1023, + "step": 34100 + }, + { + "epoch": 2.0625203472599023, + "grad_norm": 7.378097057342529, + "learning_rate": 5.429628614499109e-06, + "loss": 0.202, + "step": 34101 + }, + { + "epoch": 2.062533912099837, + "grad_norm": 5.138009548187256, + "learning_rate": 5.429491571878855e-06, + "loss": 0.107, + "step": 34102 + }, + { + "epoch": 2.062547476939772, + "grad_norm": 6.035910129547119, + "learning_rate": 5.429354529258601e-06, + "loss": 0.1392, + "step": 34103 + }, + { + "epoch": 2.062561041779707, + "grad_norm": 5.874853134155273, + "learning_rate": 5.429217486638345e-06, + "loss": 0.131, + "step": 34104 + }, + { + "epoch": 2.0625746066196418, + "grad_norm": 3.224402666091919, + "learning_rate": 5.42908044401809e-06, + "loss": 0.0788, + "step": 34105 + }, + { + "epoch": 2.0625881714595766, + "grad_norm": 5.267356872558594, + "learning_rate": 5.4289434013978345e-06, + "loss": 0.1586, + "step": 34106 + }, + { + "epoch": 2.0626017362995115, + "grad_norm": 4.484120845794678, + "learning_rate": 5.4288063587775805e-06, + "loss": 0.1942, + "step": 34107 + }, + { + "epoch": 2.0626153011394464, + "grad_norm": 4.3946123123168945, + "learning_rate": 5.428669316157326e-06, + "loss": 0.1346, + "step": 34108 + }, + { + "epoch": 2.0626288659793817, + "grad_norm": 3.7679660320281982, + "learning_rate": 5.428532273537071e-06, + "loss": 0.1261, + "step": 34109 + }, + { + "epoch": 2.0626424308193165, + "grad_norm": 5.24008846282959, + "learning_rate": 5.428395230916815e-06, + "loss": 0.2079, + "step": 34110 + }, + { + "epoch": 2.0626559956592514, + "grad_norm": 5.328325271606445, + "learning_rate": 5.42825818829656e-06, + "loss": 0.1315, + "step": 34111 + }, + { + "epoch": 2.0626695604991863, + "grad_norm": 4.494898319244385, + "learning_rate": 5.428121145676306e-06, + "loss": 0.1176, + "step": 34112 + }, + { + "epoch": 2.062683125339121, + "grad_norm": 4.455499172210693, + "learning_rate": 5.427984103056051e-06, + "loss": 0.1274, + "step": 34113 + }, + { + "epoch": 2.062696690179056, + "grad_norm": 2.8945960998535156, + "learning_rate": 5.427847060435796e-06, + "loss": 0.0968, + "step": 34114 + }, + { + "epoch": 2.062710255018991, + "grad_norm": 4.314733982086182, + "learning_rate": 5.42771001781554e-06, + "loss": 0.1255, + "step": 34115 + }, + { + "epoch": 2.0627238198589257, + "grad_norm": 4.98146915435791, + "learning_rate": 5.427572975195286e-06, + "loss": 0.2318, + "step": 34116 + }, + { + "epoch": 2.0627373846988606, + "grad_norm": 5.517872333526611, + "learning_rate": 5.427435932575031e-06, + "loss": 0.215, + "step": 34117 + }, + { + "epoch": 2.0627509495387955, + "grad_norm": 5.624122619628906, + "learning_rate": 5.427298889954777e-06, + "loss": 0.171, + "step": 34118 + }, + { + "epoch": 2.0627645143787303, + "grad_norm": 4.756476402282715, + "learning_rate": 5.427161847334521e-06, + "loss": 0.1381, + "step": 34119 + }, + { + "epoch": 2.062778079218665, + "grad_norm": 3.8042869567871094, + "learning_rate": 5.427024804714267e-06, + "loss": 0.1463, + "step": 34120 + }, + { + "epoch": 2.0627916440586, + "grad_norm": 4.733521938323975, + "learning_rate": 5.426887762094012e-06, + "loss": 0.1504, + "step": 34121 + }, + { + "epoch": 2.062805208898535, + "grad_norm": 4.617213726043701, + "learning_rate": 5.4267507194737565e-06, + "loss": 0.1528, + "step": 34122 + }, + { + "epoch": 2.06281877373847, + "grad_norm": 6.211338996887207, + "learning_rate": 5.426613676853502e-06, + "loss": 0.2378, + "step": 34123 + }, + { + "epoch": 2.0628323385784046, + "grad_norm": 4.546331882476807, + "learning_rate": 5.426476634233247e-06, + "loss": 0.214, + "step": 34124 + }, + { + "epoch": 2.0628459034183395, + "grad_norm": 6.206206321716309, + "learning_rate": 5.426339591612992e-06, + "loss": 0.2788, + "step": 34125 + }, + { + "epoch": 2.0628594682582744, + "grad_norm": 4.923035621643066, + "learning_rate": 5.426202548992737e-06, + "loss": 0.1903, + "step": 34126 + }, + { + "epoch": 2.0628730330982092, + "grad_norm": 4.354254245758057, + "learning_rate": 5.426065506372482e-06, + "loss": 0.1238, + "step": 34127 + }, + { + "epoch": 2.062886597938144, + "grad_norm": 5.654518127441406, + "learning_rate": 5.425928463752227e-06, + "loss": 0.407, + "step": 34128 + }, + { + "epoch": 2.0629001627780794, + "grad_norm": 3.916083812713623, + "learning_rate": 5.425791421131973e-06, + "loss": 0.1332, + "step": 34129 + }, + { + "epoch": 2.0629137276180143, + "grad_norm": 3.7851755619049072, + "learning_rate": 5.425654378511718e-06, + "loss": 0.1709, + "step": 34130 + }, + { + "epoch": 2.062927292457949, + "grad_norm": 6.179845333099365, + "learning_rate": 5.425517335891462e-06, + "loss": 0.1588, + "step": 34131 + }, + { + "epoch": 2.062940857297884, + "grad_norm": 5.189854145050049, + "learning_rate": 5.425380293271207e-06, + "loss": 0.1545, + "step": 34132 + }, + { + "epoch": 2.062954422137819, + "grad_norm": 4.145925998687744, + "learning_rate": 5.4252432506509534e-06, + "loss": 0.1134, + "step": 34133 + }, + { + "epoch": 2.0629679869777537, + "grad_norm": 5.999450206756592, + "learning_rate": 5.425106208030699e-06, + "loss": 0.2229, + "step": 34134 + }, + { + "epoch": 2.0629815518176886, + "grad_norm": 5.337032318115234, + "learning_rate": 5.424969165410443e-06, + "loss": 0.2277, + "step": 34135 + }, + { + "epoch": 2.0629951166576235, + "grad_norm": 7.145895481109619, + "learning_rate": 5.424832122790188e-06, + "loss": 0.1628, + "step": 34136 + }, + { + "epoch": 2.0630086814975583, + "grad_norm": 5.317572593688965, + "learning_rate": 5.424695080169934e-06, + "loss": 0.1626, + "step": 34137 + }, + { + "epoch": 2.063022246337493, + "grad_norm": 5.353126049041748, + "learning_rate": 5.4245580375496785e-06, + "loss": 0.2564, + "step": 34138 + }, + { + "epoch": 2.063035811177428, + "grad_norm": 3.711270570755005, + "learning_rate": 5.424420994929424e-06, + "loss": 0.0872, + "step": 34139 + }, + { + "epoch": 2.063049376017363, + "grad_norm": 4.856014728546143, + "learning_rate": 5.424283952309168e-06, + "loss": 0.1516, + "step": 34140 + }, + { + "epoch": 2.063062940857298, + "grad_norm": 6.0086164474487305, + "learning_rate": 5.424146909688913e-06, + "loss": 0.183, + "step": 34141 + }, + { + "epoch": 2.0630765056972327, + "grad_norm": 3.2363712787628174, + "learning_rate": 5.424009867068659e-06, + "loss": 0.1482, + "step": 34142 + }, + { + "epoch": 2.0630900705371675, + "grad_norm": 4.663849830627441, + "learning_rate": 5.423872824448404e-06, + "loss": 0.0905, + "step": 34143 + }, + { + "epoch": 2.0631036353771024, + "grad_norm": 4.738953590393066, + "learning_rate": 5.423735781828149e-06, + "loss": 0.1975, + "step": 34144 + }, + { + "epoch": 2.0631172002170373, + "grad_norm": 5.715179920196533, + "learning_rate": 5.423598739207894e-06, + "loss": 0.1512, + "step": 34145 + }, + { + "epoch": 2.063130765056972, + "grad_norm": 5.179936408996582, + "learning_rate": 5.42346169658764e-06, + "loss": 0.1653, + "step": 34146 + }, + { + "epoch": 2.0631443298969074, + "grad_norm": 6.350955486297607, + "learning_rate": 5.423324653967384e-06, + "loss": 0.1429, + "step": 34147 + }, + { + "epoch": 2.0631578947368423, + "grad_norm": 3.8496389389038086, + "learning_rate": 5.423187611347129e-06, + "loss": 0.1433, + "step": 34148 + }, + { + "epoch": 2.063171459576777, + "grad_norm": 5.93748664855957, + "learning_rate": 5.423050568726875e-06, + "loss": 0.3526, + "step": 34149 + }, + { + "epoch": 2.063185024416712, + "grad_norm": 6.132879734039307, + "learning_rate": 5.42291352610662e-06, + "loss": 0.1674, + "step": 34150 + }, + { + "epoch": 2.063198589256647, + "grad_norm": 4.946229934692383, + "learning_rate": 5.422776483486365e-06, + "loss": 0.1817, + "step": 34151 + }, + { + "epoch": 2.0632121540965818, + "grad_norm": 6.3963422775268555, + "learning_rate": 5.42263944086611e-06, + "loss": 0.2347, + "step": 34152 + }, + { + "epoch": 2.0632257189365166, + "grad_norm": 4.240288257598877, + "learning_rate": 5.4225023982458545e-06, + "loss": 0.1766, + "step": 34153 + }, + { + "epoch": 2.0632392837764515, + "grad_norm": 6.318517684936523, + "learning_rate": 5.4223653556256e-06, + "loss": 0.2092, + "step": 34154 + }, + { + "epoch": 2.0632528486163864, + "grad_norm": 8.036035537719727, + "learning_rate": 5.422228313005346e-06, + "loss": 0.3037, + "step": 34155 + }, + { + "epoch": 2.0632664134563212, + "grad_norm": 4.531861305236816, + "learning_rate": 5.42209127038509e-06, + "loss": 0.1679, + "step": 34156 + }, + { + "epoch": 2.063279978296256, + "grad_norm": 3.7862777709960938, + "learning_rate": 5.421954227764835e-06, + "loss": 0.1523, + "step": 34157 + }, + { + "epoch": 2.063293543136191, + "grad_norm": 6.945790767669678, + "learning_rate": 5.42181718514458e-06, + "loss": 0.2919, + "step": 34158 + }, + { + "epoch": 2.063307107976126, + "grad_norm": 6.0970001220703125, + "learning_rate": 5.4216801425243255e-06, + "loss": 0.2998, + "step": 34159 + }, + { + "epoch": 2.0633206728160607, + "grad_norm": 5.338636875152588, + "learning_rate": 5.421543099904071e-06, + "loss": 0.218, + "step": 34160 + }, + { + "epoch": 2.0633342376559956, + "grad_norm": 7.635647773742676, + "learning_rate": 5.421406057283816e-06, + "loss": 0.2465, + "step": 34161 + }, + { + "epoch": 2.0633478024959304, + "grad_norm": 4.908900260925293, + "learning_rate": 5.42126901466356e-06, + "loss": 0.1781, + "step": 34162 + }, + { + "epoch": 2.0633613673358653, + "grad_norm": 4.955967426300049, + "learning_rate": 5.421131972043306e-06, + "loss": 0.0955, + "step": 34163 + }, + { + "epoch": 2.0633749321758, + "grad_norm": 6.333974361419678, + "learning_rate": 5.4209949294230514e-06, + "loss": 0.2565, + "step": 34164 + }, + { + "epoch": 2.063388497015735, + "grad_norm": 5.07388973236084, + "learning_rate": 5.420857886802796e-06, + "loss": 0.1952, + "step": 34165 + }, + { + "epoch": 2.06340206185567, + "grad_norm": 5.879273414611816, + "learning_rate": 5.420720844182541e-06, + "loss": 0.2037, + "step": 34166 + }, + { + "epoch": 2.063415626695605, + "grad_norm": 4.730052947998047, + "learning_rate": 5.420583801562286e-06, + "loss": 0.1655, + "step": 34167 + }, + { + "epoch": 2.06342919153554, + "grad_norm": 7.174006462097168, + "learning_rate": 5.420446758942032e-06, + "loss": 0.257, + "step": 34168 + }, + { + "epoch": 2.063442756375475, + "grad_norm": 6.596229553222656, + "learning_rate": 5.4203097163217765e-06, + "loss": 0.2552, + "step": 34169 + }, + { + "epoch": 2.06345632121541, + "grad_norm": 6.146685600280762, + "learning_rate": 5.420172673701522e-06, + "loss": 0.2607, + "step": 34170 + }, + { + "epoch": 2.0634698860553446, + "grad_norm": 5.701332092285156, + "learning_rate": 5.420035631081266e-06, + "loss": 0.2397, + "step": 34171 + }, + { + "epoch": 2.0634834508952795, + "grad_norm": 5.16707181930542, + "learning_rate": 5.419898588461012e-06, + "loss": 0.2173, + "step": 34172 + }, + { + "epoch": 2.0634970157352144, + "grad_norm": 6.3348259925842285, + "learning_rate": 5.419761545840757e-06, + "loss": 0.1917, + "step": 34173 + }, + { + "epoch": 2.0635105805751492, + "grad_norm": 11.368685722351074, + "learning_rate": 5.4196245032205015e-06, + "loss": 0.4184, + "step": 34174 + }, + { + "epoch": 2.063524145415084, + "grad_norm": 6.478106498718262, + "learning_rate": 5.419487460600247e-06, + "loss": 0.1854, + "step": 34175 + }, + { + "epoch": 2.063537710255019, + "grad_norm": 5.572831153869629, + "learning_rate": 5.419350417979993e-06, + "loss": 0.2284, + "step": 34176 + }, + { + "epoch": 2.063551275094954, + "grad_norm": 6.099785327911377, + "learning_rate": 5.419213375359738e-06, + "loss": 0.2533, + "step": 34177 + }, + { + "epoch": 2.0635648399348887, + "grad_norm": 7.574753284454346, + "learning_rate": 5.419076332739482e-06, + "loss": 0.2909, + "step": 34178 + }, + { + "epoch": 2.0635784047748236, + "grad_norm": 5.978358745574951, + "learning_rate": 5.4189392901192274e-06, + "loss": 0.1659, + "step": 34179 + }, + { + "epoch": 2.0635919696147584, + "grad_norm": 7.600856304168701, + "learning_rate": 5.418802247498972e-06, + "loss": 0.3496, + "step": 34180 + }, + { + "epoch": 2.0636055344546933, + "grad_norm": 10.010787963867188, + "learning_rate": 5.418665204878718e-06, + "loss": 0.4496, + "step": 34181 + }, + { + "epoch": 2.063619099294628, + "grad_norm": 6.899075031280518, + "learning_rate": 5.418528162258463e-06, + "loss": 0.2683, + "step": 34182 + }, + { + "epoch": 2.063632664134563, + "grad_norm": 6.7348151206970215, + "learning_rate": 5.418391119638208e-06, + "loss": 0.2641, + "step": 34183 + }, + { + "epoch": 2.063646228974498, + "grad_norm": 6.042293071746826, + "learning_rate": 5.4182540770179525e-06, + "loss": 0.1507, + "step": 34184 + }, + { + "epoch": 2.063659793814433, + "grad_norm": 21.74418067932129, + "learning_rate": 5.4181170343976985e-06, + "loss": 0.3026, + "step": 34185 + }, + { + "epoch": 2.063673358654368, + "grad_norm": 6.809963703155518, + "learning_rate": 5.417979991777444e-06, + "loss": 0.2297, + "step": 34186 + }, + { + "epoch": 2.063686923494303, + "grad_norm": 6.447926044464111, + "learning_rate": 5.417842949157188e-06, + "loss": 0.2409, + "step": 34187 + }, + { + "epoch": 2.063700488334238, + "grad_norm": 5.498472690582275, + "learning_rate": 5.417705906536933e-06, + "loss": 0.1822, + "step": 34188 + }, + { + "epoch": 2.0637140531741727, + "grad_norm": 7.47065544128418, + "learning_rate": 5.417568863916679e-06, + "loss": 0.3142, + "step": 34189 + }, + { + "epoch": 2.0637276180141075, + "grad_norm": 10.335919380187988, + "learning_rate": 5.4174318212964235e-06, + "loss": 0.5883, + "step": 34190 + }, + { + "epoch": 2.0637411828540424, + "grad_norm": 9.93298053741455, + "learning_rate": 5.417294778676169e-06, + "loss": 0.3663, + "step": 34191 + }, + { + "epoch": 2.0637547476939773, + "grad_norm": 9.80853271484375, + "learning_rate": 5.417157736055914e-06, + "loss": 0.2712, + "step": 34192 + }, + { + "epoch": 2.063768312533912, + "grad_norm": 5.074231147766113, + "learning_rate": 5.417020693435658e-06, + "loss": 0.1852, + "step": 34193 + }, + { + "epoch": 2.063781877373847, + "grad_norm": 6.618776321411133, + "learning_rate": 5.416883650815404e-06, + "loss": 0.231, + "step": 34194 + }, + { + "epoch": 2.063795442213782, + "grad_norm": 8.350410461425781, + "learning_rate": 5.4167466081951494e-06, + "loss": 0.3337, + "step": 34195 + }, + { + "epoch": 2.0638090070537167, + "grad_norm": 3.696930170059204, + "learning_rate": 5.416609565574894e-06, + "loss": 0.1227, + "step": 34196 + }, + { + "epoch": 2.0638225718936516, + "grad_norm": 6.644956111907959, + "learning_rate": 5.416472522954639e-06, + "loss": 0.2163, + "step": 34197 + }, + { + "epoch": 2.0638361367335865, + "grad_norm": 6.835323810577393, + "learning_rate": 5.416335480334385e-06, + "loss": 0.1791, + "step": 34198 + }, + { + "epoch": 2.0638497015735213, + "grad_norm": 5.8827996253967285, + "learning_rate": 5.416198437714129e-06, + "loss": 0.2385, + "step": 34199 + }, + { + "epoch": 2.063863266413456, + "grad_norm": 6.047597408294678, + "learning_rate": 5.4160613950938745e-06, + "loss": 0.1807, + "step": 34200 + }, + { + "epoch": 2.063876831253391, + "grad_norm": 7.020841121673584, + "learning_rate": 5.41592435247362e-06, + "loss": 0.2925, + "step": 34201 + }, + { + "epoch": 2.063890396093326, + "grad_norm": 5.0922675132751465, + "learning_rate": 5.415787309853366e-06, + "loss": 0.1705, + "step": 34202 + }, + { + "epoch": 2.063903960933261, + "grad_norm": 7.088716506958008, + "learning_rate": 5.41565026723311e-06, + "loss": 0.3364, + "step": 34203 + }, + { + "epoch": 2.0639175257731956, + "grad_norm": 4.456423282623291, + "learning_rate": 5.415513224612855e-06, + "loss": 0.1708, + "step": 34204 + }, + { + "epoch": 2.063931090613131, + "grad_norm": 6.36506462097168, + "learning_rate": 5.4153761819925995e-06, + "loss": 0.2915, + "step": 34205 + }, + { + "epoch": 2.063944655453066, + "grad_norm": 5.681338310241699, + "learning_rate": 5.415239139372345e-06, + "loss": 0.214, + "step": 34206 + }, + { + "epoch": 2.0639582202930007, + "grad_norm": 8.83919906616211, + "learning_rate": 5.415102096752091e-06, + "loss": 0.3555, + "step": 34207 + }, + { + "epoch": 2.0639717851329356, + "grad_norm": 6.944702625274658, + "learning_rate": 5.414965054131835e-06, + "loss": 0.3567, + "step": 34208 + }, + { + "epoch": 2.0639853499728704, + "grad_norm": 4.752420902252197, + "learning_rate": 5.41482801151158e-06, + "loss": 0.1317, + "step": 34209 + }, + { + "epoch": 2.0639989148128053, + "grad_norm": 4.576679706573486, + "learning_rate": 5.4146909688913254e-06, + "loss": 0.1524, + "step": 34210 + }, + { + "epoch": 2.06401247965274, + "grad_norm": 5.190742492675781, + "learning_rate": 5.4145539262710715e-06, + "loss": 0.1948, + "step": 34211 + }, + { + "epoch": 2.064026044492675, + "grad_norm": 4.710108757019043, + "learning_rate": 5.414416883650816e-06, + "loss": 0.1582, + "step": 34212 + }, + { + "epoch": 2.06403960933261, + "grad_norm": 4.8591084480285645, + "learning_rate": 5.414279841030561e-06, + "loss": 0.1755, + "step": 34213 + }, + { + "epoch": 2.0640531741725447, + "grad_norm": 8.259966850280762, + "learning_rate": 5.414142798410305e-06, + "loss": 0.3072, + "step": 34214 + }, + { + "epoch": 2.0640667390124796, + "grad_norm": 4.952193737030029, + "learning_rate": 5.414005755790051e-06, + "loss": 0.2072, + "step": 34215 + }, + { + "epoch": 2.0640803038524145, + "grad_norm": 4.414349555969238, + "learning_rate": 5.4138687131697965e-06, + "loss": 0.1171, + "step": 34216 + }, + { + "epoch": 2.0640938686923493, + "grad_norm": 4.530846118927002, + "learning_rate": 5.413731670549542e-06, + "loss": 0.1597, + "step": 34217 + }, + { + "epoch": 2.064107433532284, + "grad_norm": 7.0824456214904785, + "learning_rate": 5.413594627929286e-06, + "loss": 0.3123, + "step": 34218 + }, + { + "epoch": 2.064120998372219, + "grad_norm": 3.026634454727173, + "learning_rate": 5.413457585309032e-06, + "loss": 0.0981, + "step": 34219 + }, + { + "epoch": 2.064134563212154, + "grad_norm": 5.566258907318115, + "learning_rate": 5.413320542688777e-06, + "loss": 0.1859, + "step": 34220 + }, + { + "epoch": 2.064148128052089, + "grad_norm": 5.073446750640869, + "learning_rate": 5.4131835000685216e-06, + "loss": 0.2495, + "step": 34221 + }, + { + "epoch": 2.0641616928920237, + "grad_norm": 3.1218936443328857, + "learning_rate": 5.413046457448267e-06, + "loss": 0.1359, + "step": 34222 + }, + { + "epoch": 2.064175257731959, + "grad_norm": 4.107949733734131, + "learning_rate": 5.412909414828011e-06, + "loss": 0.1443, + "step": 34223 + }, + { + "epoch": 2.064188822571894, + "grad_norm": 6.175339221954346, + "learning_rate": 5.412772372207757e-06, + "loss": 0.2948, + "step": 34224 + }, + { + "epoch": 2.0642023874118287, + "grad_norm": 3.8271842002868652, + "learning_rate": 5.412635329587502e-06, + "loss": 0.1503, + "step": 34225 + }, + { + "epoch": 2.0642159522517636, + "grad_norm": 5.000904560089111, + "learning_rate": 5.4124982869672474e-06, + "loss": 0.1998, + "step": 34226 + }, + { + "epoch": 2.0642295170916984, + "grad_norm": 6.0764312744140625, + "learning_rate": 5.412361244346992e-06, + "loss": 0.248, + "step": 34227 + }, + { + "epoch": 2.0642430819316333, + "grad_norm": 4.538567066192627, + "learning_rate": 5.412224201726738e-06, + "loss": 0.1819, + "step": 34228 + }, + { + "epoch": 2.064256646771568, + "grad_norm": 5.297530174255371, + "learning_rate": 5.412087159106483e-06, + "loss": 0.2267, + "step": 34229 + }, + { + "epoch": 2.064270211611503, + "grad_norm": 4.9320969581604, + "learning_rate": 5.411950116486227e-06, + "loss": 0.1438, + "step": 34230 + }, + { + "epoch": 2.064283776451438, + "grad_norm": 4.8057541847229, + "learning_rate": 5.4118130738659725e-06, + "loss": 0.1803, + "step": 34231 + }, + { + "epoch": 2.0642973412913728, + "grad_norm": 4.696059226989746, + "learning_rate": 5.4116760312457185e-06, + "loss": 0.172, + "step": 34232 + }, + { + "epoch": 2.0643109061313076, + "grad_norm": 5.2392401695251465, + "learning_rate": 5.411538988625463e-06, + "loss": 0.2859, + "step": 34233 + }, + { + "epoch": 2.0643244709712425, + "grad_norm": 4.029743671417236, + "learning_rate": 5.411401946005208e-06, + "loss": 0.1736, + "step": 34234 + }, + { + "epoch": 2.0643380358111774, + "grad_norm": 6.229303359985352, + "learning_rate": 5.411264903384953e-06, + "loss": 0.332, + "step": 34235 + }, + { + "epoch": 2.0643516006511122, + "grad_norm": 6.66051721572876, + "learning_rate": 5.4111278607646975e-06, + "loss": 0.241, + "step": 34236 + }, + { + "epoch": 2.064365165491047, + "grad_norm": 3.601274013519287, + "learning_rate": 5.4109908181444436e-06, + "loss": 0.1421, + "step": 34237 + }, + { + "epoch": 2.064378730330982, + "grad_norm": 4.7361063957214355, + "learning_rate": 5.410853775524189e-06, + "loss": 0.2902, + "step": 34238 + }, + { + "epoch": 2.064392295170917, + "grad_norm": 3.4289746284484863, + "learning_rate": 5.410716732903933e-06, + "loss": 0.1422, + "step": 34239 + }, + { + "epoch": 2.0644058600108517, + "grad_norm": 4.588127613067627, + "learning_rate": 5.410579690283678e-06, + "loss": 0.1671, + "step": 34240 + }, + { + "epoch": 2.0644194248507866, + "grad_norm": 3.655606508255005, + "learning_rate": 5.410442647663424e-06, + "loss": 0.1318, + "step": 34241 + }, + { + "epoch": 2.0644329896907214, + "grad_norm": 4.737013339996338, + "learning_rate": 5.4103056050431695e-06, + "loss": 0.1919, + "step": 34242 + }, + { + "epoch": 2.0644465545306567, + "grad_norm": 4.862591743469238, + "learning_rate": 5.410168562422914e-06, + "loss": 0.2057, + "step": 34243 + }, + { + "epoch": 2.0644601193705916, + "grad_norm": 7.60087251663208, + "learning_rate": 5.410031519802659e-06, + "loss": 0.4794, + "step": 34244 + }, + { + "epoch": 2.0644736842105265, + "grad_norm": 4.449604034423828, + "learning_rate": 5.409894477182405e-06, + "loss": 0.1469, + "step": 34245 + }, + { + "epoch": 2.0644872490504613, + "grad_norm": 4.385066032409668, + "learning_rate": 5.409757434562149e-06, + "loss": 0.1841, + "step": 34246 + }, + { + "epoch": 2.064500813890396, + "grad_norm": 4.472084999084473, + "learning_rate": 5.4096203919418945e-06, + "loss": 0.1108, + "step": 34247 + }, + { + "epoch": 2.064514378730331, + "grad_norm": 6.894242763519287, + "learning_rate": 5.409483349321639e-06, + "loss": 0.2806, + "step": 34248 + }, + { + "epoch": 2.064527943570266, + "grad_norm": 5.035432815551758, + "learning_rate": 5.409346306701384e-06, + "loss": 0.1996, + "step": 34249 + }, + { + "epoch": 2.064541508410201, + "grad_norm": 4.184558391571045, + "learning_rate": 5.40920926408113e-06, + "loss": 0.2197, + "step": 34250 + }, + { + "epoch": 2.0645550732501357, + "grad_norm": 4.564599990844727, + "learning_rate": 5.409072221460875e-06, + "loss": 0.2, + "step": 34251 + }, + { + "epoch": 2.0645686380900705, + "grad_norm": 4.917469024658203, + "learning_rate": 5.4089351788406196e-06, + "loss": 0.1934, + "step": 34252 + }, + { + "epoch": 2.0645822029300054, + "grad_norm": 5.699681758880615, + "learning_rate": 5.408798136220365e-06, + "loss": 0.1924, + "step": 34253 + }, + { + "epoch": 2.0645957677699402, + "grad_norm": 4.171458721160889, + "learning_rate": 5.408661093600111e-06, + "loss": 0.1357, + "step": 34254 + }, + { + "epoch": 2.064609332609875, + "grad_norm": 4.257033348083496, + "learning_rate": 5.408524050979855e-06, + "loss": 0.2685, + "step": 34255 + }, + { + "epoch": 2.06462289744981, + "grad_norm": 4.183890342712402, + "learning_rate": 5.4083870083596e-06, + "loss": 0.2456, + "step": 34256 + }, + { + "epoch": 2.064636462289745, + "grad_norm": 5.150819301605225, + "learning_rate": 5.408249965739345e-06, + "loss": 0.1812, + "step": 34257 + }, + { + "epoch": 2.0646500271296797, + "grad_norm": 4.48192834854126, + "learning_rate": 5.408112923119091e-06, + "loss": 0.1857, + "step": 34258 + }, + { + "epoch": 2.0646635919696146, + "grad_norm": 4.779526233673096, + "learning_rate": 5.407975880498836e-06, + "loss": 0.2085, + "step": 34259 + }, + { + "epoch": 2.0646771568095494, + "grad_norm": 4.785874843597412, + "learning_rate": 5.407838837878581e-06, + "loss": 0.2409, + "step": 34260 + }, + { + "epoch": 2.0646907216494848, + "grad_norm": 4.327748775482178, + "learning_rate": 5.407701795258325e-06, + "loss": 0.2174, + "step": 34261 + }, + { + "epoch": 2.0647042864894196, + "grad_norm": 5.782824993133545, + "learning_rate": 5.4075647526380705e-06, + "loss": 0.3455, + "step": 34262 + }, + { + "epoch": 2.0647178513293545, + "grad_norm": 4.219358921051025, + "learning_rate": 5.4074277100178165e-06, + "loss": 0.2173, + "step": 34263 + }, + { + "epoch": 2.0647314161692893, + "grad_norm": 8.123499870300293, + "learning_rate": 5.407290667397561e-06, + "loss": 0.4246, + "step": 34264 + }, + { + "epoch": 2.064744981009224, + "grad_norm": 3.946892261505127, + "learning_rate": 5.407153624777306e-06, + "loss": 0.1171, + "step": 34265 + }, + { + "epoch": 2.064758545849159, + "grad_norm": 5.048271179199219, + "learning_rate": 5.407016582157051e-06, + "loss": 0.1799, + "step": 34266 + }, + { + "epoch": 2.064772110689094, + "grad_norm": 3.8663530349731445, + "learning_rate": 5.406879539536796e-06, + "loss": 0.133, + "step": 34267 + }, + { + "epoch": 2.064785675529029, + "grad_norm": 2.58598256111145, + "learning_rate": 5.4067424969165416e-06, + "loss": 0.0944, + "step": 34268 + }, + { + "epoch": 2.0647992403689637, + "grad_norm": 2.9154281616210938, + "learning_rate": 5.406605454296287e-06, + "loss": 0.0964, + "step": 34269 + }, + { + "epoch": 2.0648128052088985, + "grad_norm": 4.976294994354248, + "learning_rate": 5.406468411676031e-06, + "loss": 0.2362, + "step": 34270 + }, + { + "epoch": 2.0648263700488334, + "grad_norm": 5.061795234680176, + "learning_rate": 5.406331369055777e-06, + "loss": 0.2469, + "step": 34271 + }, + { + "epoch": 2.0648399348887683, + "grad_norm": 6.012362957000732, + "learning_rate": 5.406194326435522e-06, + "loss": 0.2648, + "step": 34272 + }, + { + "epoch": 2.064853499728703, + "grad_norm": 4.644646167755127, + "learning_rate": 5.406057283815267e-06, + "loss": 0.2012, + "step": 34273 + }, + { + "epoch": 2.064867064568638, + "grad_norm": 5.405529499053955, + "learning_rate": 5.405920241195012e-06, + "loss": 0.2074, + "step": 34274 + }, + { + "epoch": 2.064880629408573, + "grad_norm": 6.01418924331665, + "learning_rate": 5.405783198574757e-06, + "loss": 0.2462, + "step": 34275 + }, + { + "epoch": 2.0648941942485077, + "grad_norm": 4.310466289520264, + "learning_rate": 5.405646155954503e-06, + "loss": 0.1336, + "step": 34276 + }, + { + "epoch": 2.0649077590884426, + "grad_norm": 2.6654388904571533, + "learning_rate": 5.405509113334247e-06, + "loss": 0.0943, + "step": 34277 + }, + { + "epoch": 2.0649213239283775, + "grad_norm": 3.8594112396240234, + "learning_rate": 5.4053720707139925e-06, + "loss": 0.1769, + "step": 34278 + }, + { + "epoch": 2.0649348887683123, + "grad_norm": 3.6586620807647705, + "learning_rate": 5.405235028093737e-06, + "loss": 0.167, + "step": 34279 + }, + { + "epoch": 2.0649484536082476, + "grad_norm": 5.771500587463379, + "learning_rate": 5.405097985473483e-06, + "loss": 0.2248, + "step": 34280 + }, + { + "epoch": 2.0649620184481825, + "grad_norm": 4.18577241897583, + "learning_rate": 5.404960942853228e-06, + "loss": 0.173, + "step": 34281 + }, + { + "epoch": 2.0649755832881174, + "grad_norm": 4.826218128204346, + "learning_rate": 5.404823900232972e-06, + "loss": 0.1761, + "step": 34282 + }, + { + "epoch": 2.0649891481280522, + "grad_norm": 4.010910987854004, + "learning_rate": 5.4046868576127176e-06, + "loss": 0.2022, + "step": 34283 + }, + { + "epoch": 2.065002712967987, + "grad_norm": 2.9552159309387207, + "learning_rate": 5.404549814992464e-06, + "loss": 0.1112, + "step": 34284 + }, + { + "epoch": 2.065016277807922, + "grad_norm": 5.774412631988525, + "learning_rate": 5.404412772372209e-06, + "loss": 0.314, + "step": 34285 + }, + { + "epoch": 2.065029842647857, + "grad_norm": 5.9584221839904785, + "learning_rate": 5.404275729751953e-06, + "loss": 0.2171, + "step": 34286 + }, + { + "epoch": 2.0650434074877917, + "grad_norm": 5.189080715179443, + "learning_rate": 5.404138687131698e-06, + "loss": 0.1257, + "step": 34287 + }, + { + "epoch": 2.0650569723277266, + "grad_norm": 3.275738477706909, + "learning_rate": 5.404001644511444e-06, + "loss": 0.1835, + "step": 34288 + }, + { + "epoch": 2.0650705371676614, + "grad_norm": 4.1285810470581055, + "learning_rate": 5.403864601891189e-06, + "loss": 0.1746, + "step": 34289 + }, + { + "epoch": 2.0650841020075963, + "grad_norm": 3.4618163108825684, + "learning_rate": 5.403727559270934e-06, + "loss": 0.1175, + "step": 34290 + }, + { + "epoch": 2.065097666847531, + "grad_norm": 4.715808391571045, + "learning_rate": 5.403590516650679e-06, + "loss": 0.1708, + "step": 34291 + }, + { + "epoch": 2.065111231687466, + "grad_norm": 5.264575004577637, + "learning_rate": 5.403453474030423e-06, + "loss": 0.2334, + "step": 34292 + }, + { + "epoch": 2.065124796527401, + "grad_norm": 3.2113144397735596, + "learning_rate": 5.403316431410169e-06, + "loss": 0.1583, + "step": 34293 + }, + { + "epoch": 2.0651383613673358, + "grad_norm": 4.388155460357666, + "learning_rate": 5.4031793887899145e-06, + "loss": 0.137, + "step": 34294 + }, + { + "epoch": 2.0651519262072706, + "grad_norm": 4.666842460632324, + "learning_rate": 5.403042346169659e-06, + "loss": 0.099, + "step": 34295 + }, + { + "epoch": 2.0651654910472055, + "grad_norm": 6.476246356964111, + "learning_rate": 5.402905303549404e-06, + "loss": 0.1975, + "step": 34296 + }, + { + "epoch": 2.0651790558871403, + "grad_norm": 5.3652238845825195, + "learning_rate": 5.40276826092915e-06, + "loss": 0.2461, + "step": 34297 + }, + { + "epoch": 2.065192620727075, + "grad_norm": 4.734585285186768, + "learning_rate": 5.402631218308894e-06, + "loss": 0.15, + "step": 34298 + }, + { + "epoch": 2.0652061855670105, + "grad_norm": 4.582381725311279, + "learning_rate": 5.4024941756886396e-06, + "loss": 0.1818, + "step": 34299 + }, + { + "epoch": 2.0652197504069454, + "grad_norm": 4.824341297149658, + "learning_rate": 5.402357133068385e-06, + "loss": 0.1404, + "step": 34300 + }, + { + "epoch": 2.0652333152468803, + "grad_norm": 5.16810941696167, + "learning_rate": 5.40222009044813e-06, + "loss": 0.2825, + "step": 34301 + }, + { + "epoch": 2.065246880086815, + "grad_norm": 5.789796352386475, + "learning_rate": 5.402083047827875e-06, + "loss": 0.2771, + "step": 34302 + }, + { + "epoch": 2.06526044492675, + "grad_norm": 4.67024564743042, + "learning_rate": 5.40194600520762e-06, + "loss": 0.2218, + "step": 34303 + }, + { + "epoch": 2.065274009766685, + "grad_norm": 4.439709186553955, + "learning_rate": 5.401808962587365e-06, + "loss": 0.1025, + "step": 34304 + }, + { + "epoch": 2.0652875746066197, + "grad_norm": 4.6850666999816895, + "learning_rate": 5.40167191996711e-06, + "loss": 0.161, + "step": 34305 + }, + { + "epoch": 2.0653011394465546, + "grad_norm": 6.853418827056885, + "learning_rate": 5.401534877346856e-06, + "loss": 0.1824, + "step": 34306 + }, + { + "epoch": 2.0653147042864894, + "grad_norm": 4.346058368682861, + "learning_rate": 5.4013978347266e-06, + "loss": 0.1498, + "step": 34307 + }, + { + "epoch": 2.0653282691264243, + "grad_norm": 5.5308427810668945, + "learning_rate": 5.401260792106345e-06, + "loss": 0.2333, + "step": 34308 + }, + { + "epoch": 2.065341833966359, + "grad_norm": 4.351638317108154, + "learning_rate": 5.4011237494860905e-06, + "loss": 0.1347, + "step": 34309 + }, + { + "epoch": 2.065355398806294, + "grad_norm": 6.387172698974609, + "learning_rate": 5.4009867068658365e-06, + "loss": 0.2071, + "step": 34310 + }, + { + "epoch": 2.065368963646229, + "grad_norm": 4.168952941894531, + "learning_rate": 5.400849664245581e-06, + "loss": 0.2189, + "step": 34311 + }, + { + "epoch": 2.0653825284861638, + "grad_norm": 4.3341522216796875, + "learning_rate": 5.400712621625326e-06, + "loss": 0.2243, + "step": 34312 + }, + { + "epoch": 2.0653960933260986, + "grad_norm": 5.790652751922607, + "learning_rate": 5.40057557900507e-06, + "loss": 0.1358, + "step": 34313 + }, + { + "epoch": 2.0654096581660335, + "grad_norm": 5.150426864624023, + "learning_rate": 5.400438536384816e-06, + "loss": 0.1952, + "step": 34314 + }, + { + "epoch": 2.0654232230059684, + "grad_norm": 4.470137596130371, + "learning_rate": 5.400301493764562e-06, + "loss": 0.201, + "step": 34315 + }, + { + "epoch": 2.0654367878459032, + "grad_norm": 5.688676357269287, + "learning_rate": 5.400164451144306e-06, + "loss": 0.1988, + "step": 34316 + }, + { + "epoch": 2.065450352685838, + "grad_norm": 5.501699924468994, + "learning_rate": 5.400027408524051e-06, + "loss": 0.2034, + "step": 34317 + }, + { + "epoch": 2.0654639175257734, + "grad_norm": 8.925294876098633, + "learning_rate": 5.399890365903796e-06, + "loss": 0.3011, + "step": 34318 + }, + { + "epoch": 2.0654774823657083, + "grad_norm": 5.145762920379639, + "learning_rate": 5.399753323283542e-06, + "loss": 0.1966, + "step": 34319 + }, + { + "epoch": 2.065491047205643, + "grad_norm": 5.104071140289307, + "learning_rate": 5.399616280663287e-06, + "loss": 0.1453, + "step": 34320 + }, + { + "epoch": 2.065504612045578, + "grad_norm": 5.767024993896484, + "learning_rate": 5.399479238043032e-06, + "loss": 0.2636, + "step": 34321 + }, + { + "epoch": 2.065518176885513, + "grad_norm": 4.3627471923828125, + "learning_rate": 5.399342195422776e-06, + "loss": 0.2041, + "step": 34322 + }, + { + "epoch": 2.0655317417254477, + "grad_norm": 5.249500751495361, + "learning_rate": 5.399205152802522e-06, + "loss": 0.2697, + "step": 34323 + }, + { + "epoch": 2.0655453065653826, + "grad_norm": 4.018186569213867, + "learning_rate": 5.399068110182267e-06, + "loss": 0.1928, + "step": 34324 + }, + { + "epoch": 2.0655588714053175, + "grad_norm": 5.347404479980469, + "learning_rate": 5.3989310675620125e-06, + "loss": 0.2027, + "step": 34325 + }, + { + "epoch": 2.0655724362452523, + "grad_norm": 5.434020042419434, + "learning_rate": 5.398794024941757e-06, + "loss": 0.1661, + "step": 34326 + }, + { + "epoch": 2.065586001085187, + "grad_norm": 4.513852119445801, + "learning_rate": 5.398656982321503e-06, + "loss": 0.157, + "step": 34327 + }, + { + "epoch": 2.065599565925122, + "grad_norm": 4.691534042358398, + "learning_rate": 5.398519939701248e-06, + "loss": 0.1871, + "step": 34328 + }, + { + "epoch": 2.065613130765057, + "grad_norm": 5.722592353820801, + "learning_rate": 5.398382897080992e-06, + "loss": 0.1494, + "step": 34329 + }, + { + "epoch": 2.065626695604992, + "grad_norm": 4.937067985534668, + "learning_rate": 5.398245854460738e-06, + "loss": 0.1833, + "step": 34330 + }, + { + "epoch": 2.0656402604449267, + "grad_norm": 4.4701151847839355, + "learning_rate": 5.398108811840482e-06, + "loss": 0.1813, + "step": 34331 + }, + { + "epoch": 2.0656538252848615, + "grad_norm": 4.444278717041016, + "learning_rate": 5.397971769220228e-06, + "loss": 0.1476, + "step": 34332 + }, + { + "epoch": 2.0656673901247964, + "grad_norm": 5.080759525299072, + "learning_rate": 5.397834726599973e-06, + "loss": 0.1733, + "step": 34333 + }, + { + "epoch": 2.0656809549647313, + "grad_norm": 5.653689861297607, + "learning_rate": 5.397697683979718e-06, + "loss": 0.2549, + "step": 34334 + }, + { + "epoch": 2.065694519804666, + "grad_norm": 4.615601062774658, + "learning_rate": 5.397560641359463e-06, + "loss": 0.1879, + "step": 34335 + }, + { + "epoch": 2.065708084644601, + "grad_norm": 6.5646071434021, + "learning_rate": 5.397423598739209e-06, + "loss": 0.315, + "step": 34336 + }, + { + "epoch": 2.0657216494845363, + "grad_norm": 5.2486348152160645, + "learning_rate": 5.397286556118954e-06, + "loss": 0.1743, + "step": 34337 + }, + { + "epoch": 2.065735214324471, + "grad_norm": 4.310727596282959, + "learning_rate": 5.397149513498698e-06, + "loss": 0.1244, + "step": 34338 + }, + { + "epoch": 2.065748779164406, + "grad_norm": 4.617703437805176, + "learning_rate": 5.397012470878443e-06, + "loss": 0.1331, + "step": 34339 + }, + { + "epoch": 2.065762344004341, + "grad_norm": 4.579964637756348, + "learning_rate": 5.396875428258189e-06, + "loss": 0.2413, + "step": 34340 + }, + { + "epoch": 2.0657759088442758, + "grad_norm": 5.035261631011963, + "learning_rate": 5.396738385637934e-06, + "loss": 0.2152, + "step": 34341 + }, + { + "epoch": 2.0657894736842106, + "grad_norm": 4.515773773193359, + "learning_rate": 5.396601343017679e-06, + "loss": 0.1962, + "step": 34342 + }, + { + "epoch": 2.0658030385241455, + "grad_norm": 5.889891147613525, + "learning_rate": 5.396464300397424e-06, + "loss": 0.1667, + "step": 34343 + }, + { + "epoch": 2.0658166033640804, + "grad_norm": 5.970041275024414, + "learning_rate": 5.396327257777168e-06, + "loss": 0.2298, + "step": 34344 + }, + { + "epoch": 2.065830168204015, + "grad_norm": 6.0844502449035645, + "learning_rate": 5.396190215156914e-06, + "loss": 0.1962, + "step": 34345 + }, + { + "epoch": 2.06584373304395, + "grad_norm": 7.502601146697998, + "learning_rate": 5.39605317253666e-06, + "loss": 0.2955, + "step": 34346 + }, + { + "epoch": 2.065857297883885, + "grad_norm": 4.321715354919434, + "learning_rate": 5.395916129916404e-06, + "loss": 0.1193, + "step": 34347 + }, + { + "epoch": 2.06587086272382, + "grad_norm": 5.3379364013671875, + "learning_rate": 5.395779087296149e-06, + "loss": 0.2149, + "step": 34348 + }, + { + "epoch": 2.0658844275637547, + "grad_norm": 5.034964561462402, + "learning_rate": 5.395642044675895e-06, + "loss": 0.1891, + "step": 34349 + }, + { + "epoch": 2.0658979924036895, + "grad_norm": 4.787804126739502, + "learning_rate": 5.3955050020556395e-06, + "loss": 0.1341, + "step": 34350 + }, + { + "epoch": 2.0659115572436244, + "grad_norm": 6.292237758636475, + "learning_rate": 5.395367959435385e-06, + "loss": 0.2023, + "step": 34351 + }, + { + "epoch": 2.0659251220835593, + "grad_norm": 4.161075592041016, + "learning_rate": 5.39523091681513e-06, + "loss": 0.2199, + "step": 34352 + }, + { + "epoch": 2.065938686923494, + "grad_norm": 6.372089862823486, + "learning_rate": 5.395093874194876e-06, + "loss": 0.2125, + "step": 34353 + }, + { + "epoch": 2.065952251763429, + "grad_norm": 3.9818572998046875, + "learning_rate": 5.39495683157462e-06, + "loss": 0.1447, + "step": 34354 + }, + { + "epoch": 2.065965816603364, + "grad_norm": 6.422176361083984, + "learning_rate": 5.394819788954365e-06, + "loss": 0.2404, + "step": 34355 + }, + { + "epoch": 2.065979381443299, + "grad_norm": 5.592154502868652, + "learning_rate": 5.39468274633411e-06, + "loss": 0.2703, + "step": 34356 + }, + { + "epoch": 2.065992946283234, + "grad_norm": 5.0129313468933105, + "learning_rate": 5.394545703713856e-06, + "loss": 0.1684, + "step": 34357 + }, + { + "epoch": 2.066006511123169, + "grad_norm": 5.8049139976501465, + "learning_rate": 5.394408661093601e-06, + "loss": 0.2386, + "step": 34358 + }, + { + "epoch": 2.0660200759631038, + "grad_norm": 4.723591327667236, + "learning_rate": 5.394271618473346e-06, + "loss": 0.1965, + "step": 34359 + }, + { + "epoch": 2.0660336408030386, + "grad_norm": 4.319504261016846, + "learning_rate": 5.39413457585309e-06, + "loss": 0.1486, + "step": 34360 + }, + { + "epoch": 2.0660472056429735, + "grad_norm": 6.817748069763184, + "learning_rate": 5.393997533232836e-06, + "loss": 0.3235, + "step": 34361 + }, + { + "epoch": 2.0660607704829084, + "grad_norm": 6.42543888092041, + "learning_rate": 5.393860490612582e-06, + "loss": 0.282, + "step": 34362 + }, + { + "epoch": 2.0660743353228432, + "grad_norm": 5.733143329620361, + "learning_rate": 5.393723447992326e-06, + "loss": 0.2204, + "step": 34363 + }, + { + "epoch": 2.066087900162778, + "grad_norm": 4.15709924697876, + "learning_rate": 5.393586405372071e-06, + "loss": 0.166, + "step": 34364 + }, + { + "epoch": 2.066101465002713, + "grad_norm": 4.2759552001953125, + "learning_rate": 5.3934493627518155e-06, + "loss": 0.0932, + "step": 34365 + }, + { + "epoch": 2.066115029842648, + "grad_norm": 6.9747233390808105, + "learning_rate": 5.3933123201315615e-06, + "loss": 0.283, + "step": 34366 + }, + { + "epoch": 2.0661285946825827, + "grad_norm": 6.5950164794921875, + "learning_rate": 5.393175277511307e-06, + "loss": 0.3279, + "step": 34367 + }, + { + "epoch": 2.0661421595225176, + "grad_norm": 5.594324111938477, + "learning_rate": 5.393038234891052e-06, + "loss": 0.1863, + "step": 34368 + }, + { + "epoch": 2.0661557243624524, + "grad_norm": 6.24147367477417, + "learning_rate": 5.392901192270796e-06, + "loss": 0.2929, + "step": 34369 + }, + { + "epoch": 2.0661692892023873, + "grad_norm": 8.281856536865234, + "learning_rate": 5.392764149650542e-06, + "loss": 0.3104, + "step": 34370 + }, + { + "epoch": 2.066182854042322, + "grad_norm": 5.9553542137146, + "learning_rate": 5.392627107030287e-06, + "loss": 0.2596, + "step": 34371 + }, + { + "epoch": 2.066196418882257, + "grad_norm": 5.521484851837158, + "learning_rate": 5.392490064410032e-06, + "loss": 0.2776, + "step": 34372 + }, + { + "epoch": 2.066209983722192, + "grad_norm": 4.064058780670166, + "learning_rate": 5.392353021789777e-06, + "loss": 0.1175, + "step": 34373 + }, + { + "epoch": 2.0662235485621268, + "grad_norm": 5.256862163543701, + "learning_rate": 5.392215979169522e-06, + "loss": 0.2058, + "step": 34374 + }, + { + "epoch": 2.066237113402062, + "grad_norm": 4.808828830718994, + "learning_rate": 5.392078936549267e-06, + "loss": 0.2004, + "step": 34375 + }, + { + "epoch": 2.066250678241997, + "grad_norm": 5.067358016967773, + "learning_rate": 5.391941893929012e-06, + "loss": 0.1896, + "step": 34376 + }, + { + "epoch": 2.066264243081932, + "grad_norm": 5.901841640472412, + "learning_rate": 5.391804851308758e-06, + "loss": 0.2785, + "step": 34377 + }, + { + "epoch": 2.0662778079218667, + "grad_norm": 5.396363258361816, + "learning_rate": 5.391667808688502e-06, + "loss": 0.3568, + "step": 34378 + }, + { + "epoch": 2.0662913727618015, + "grad_norm": 5.2391886711120605, + "learning_rate": 5.391530766068248e-06, + "loss": 0.2109, + "step": 34379 + }, + { + "epoch": 2.0663049376017364, + "grad_norm": 4.702127456665039, + "learning_rate": 5.391393723447993e-06, + "loss": 0.1826, + "step": 34380 + }, + { + "epoch": 2.0663185024416713, + "grad_norm": 7.946695327758789, + "learning_rate": 5.3912566808277375e-06, + "loss": 0.3113, + "step": 34381 + }, + { + "epoch": 2.066332067281606, + "grad_norm": 5.327291965484619, + "learning_rate": 5.391119638207483e-06, + "loss": 0.2446, + "step": 34382 + }, + { + "epoch": 2.066345632121541, + "grad_norm": 5.031408309936523, + "learning_rate": 5.390982595587229e-06, + "loss": 0.2437, + "step": 34383 + }, + { + "epoch": 2.066359196961476, + "grad_norm": 6.1595940589904785, + "learning_rate": 5.390845552966974e-06, + "loss": 0.2251, + "step": 34384 + }, + { + "epoch": 2.0663727618014107, + "grad_norm": 5.423579692840576, + "learning_rate": 5.390708510346718e-06, + "loss": 0.1692, + "step": 34385 + }, + { + "epoch": 2.0663863266413456, + "grad_norm": 4.141510009765625, + "learning_rate": 5.390571467726463e-06, + "loss": 0.1393, + "step": 34386 + }, + { + "epoch": 2.0663998914812804, + "grad_norm": 6.050014495849609, + "learning_rate": 5.390434425106208e-06, + "loss": 0.2255, + "step": 34387 + }, + { + "epoch": 2.0664134563212153, + "grad_norm": 5.65231466293335, + "learning_rate": 5.390297382485954e-06, + "loss": 0.2446, + "step": 34388 + }, + { + "epoch": 2.06642702116115, + "grad_norm": 4.966948509216309, + "learning_rate": 5.390160339865699e-06, + "loss": 0.1088, + "step": 34389 + }, + { + "epoch": 2.066440586001085, + "grad_norm": 4.92078971862793, + "learning_rate": 5.390023297245443e-06, + "loss": 0.1766, + "step": 34390 + }, + { + "epoch": 2.06645415084102, + "grad_norm": 4.953845977783203, + "learning_rate": 5.389886254625188e-06, + "loss": 0.2134, + "step": 34391 + }, + { + "epoch": 2.0664677156809548, + "grad_norm": 6.290046691894531, + "learning_rate": 5.3897492120049344e-06, + "loss": 0.1714, + "step": 34392 + }, + { + "epoch": 2.0664812805208896, + "grad_norm": 5.640913009643555, + "learning_rate": 5.38961216938468e-06, + "loss": 0.2045, + "step": 34393 + }, + { + "epoch": 2.066494845360825, + "grad_norm": 3.929431438446045, + "learning_rate": 5.389475126764424e-06, + "loss": 0.1197, + "step": 34394 + }, + { + "epoch": 2.06650841020076, + "grad_norm": 7.372042179107666, + "learning_rate": 5.389338084144169e-06, + "loss": 0.3113, + "step": 34395 + }, + { + "epoch": 2.0665219750406947, + "grad_norm": 9.070904731750488, + "learning_rate": 5.389201041523915e-06, + "loss": 0.2916, + "step": 34396 + }, + { + "epoch": 2.0665355398806295, + "grad_norm": 5.559075355529785, + "learning_rate": 5.3890639989036595e-06, + "loss": 0.2375, + "step": 34397 + }, + { + "epoch": 2.0665491047205644, + "grad_norm": 5.67906379699707, + "learning_rate": 5.388926956283405e-06, + "loss": 0.3223, + "step": 34398 + }, + { + "epoch": 2.0665626695604993, + "grad_norm": 10.009143829345703, + "learning_rate": 5.388789913663149e-06, + "loss": 0.4474, + "step": 34399 + }, + { + "epoch": 2.066576234400434, + "grad_norm": 5.265479564666748, + "learning_rate": 5.388652871042894e-06, + "loss": 0.142, + "step": 34400 + }, + { + "epoch": 2.066589799240369, + "grad_norm": 4.917382717132568, + "learning_rate": 5.38851582842264e-06, + "loss": 0.1625, + "step": 34401 + }, + { + "epoch": 2.066603364080304, + "grad_norm": 4.751510143280029, + "learning_rate": 5.388378785802385e-06, + "loss": 0.1598, + "step": 34402 + }, + { + "epoch": 2.0666169289202387, + "grad_norm": 4.275659561157227, + "learning_rate": 5.38824174318213e-06, + "loss": 0.1874, + "step": 34403 + }, + { + "epoch": 2.0666304937601736, + "grad_norm": 5.383002281188965, + "learning_rate": 5.388104700561875e-06, + "loss": 0.2105, + "step": 34404 + }, + { + "epoch": 2.0666440586001085, + "grad_norm": 5.23295783996582, + "learning_rate": 5.387967657941621e-06, + "loss": 0.256, + "step": 34405 + }, + { + "epoch": 2.0666576234400433, + "grad_norm": 5.824202537536621, + "learning_rate": 5.387830615321365e-06, + "loss": 0.2582, + "step": 34406 + }, + { + "epoch": 2.066671188279978, + "grad_norm": 4.536264419555664, + "learning_rate": 5.3876935727011104e-06, + "loss": 0.16, + "step": 34407 + }, + { + "epoch": 2.066684753119913, + "grad_norm": 4.361211776733398, + "learning_rate": 5.387556530080856e-06, + "loss": 0.169, + "step": 34408 + }, + { + "epoch": 2.066698317959848, + "grad_norm": 5.602545261383057, + "learning_rate": 5.387419487460601e-06, + "loss": 0.2805, + "step": 34409 + }, + { + "epoch": 2.066711882799783, + "grad_norm": 6.491081714630127, + "learning_rate": 5.387282444840346e-06, + "loss": 0.2296, + "step": 34410 + }, + { + "epoch": 2.0667254476397177, + "grad_norm": 3.4964170455932617, + "learning_rate": 5.387145402220091e-06, + "loss": 0.1249, + "step": 34411 + }, + { + "epoch": 2.0667390124796525, + "grad_norm": 7.263920783996582, + "learning_rate": 5.3870083595998355e-06, + "loss": 0.2909, + "step": 34412 + }, + { + "epoch": 2.066752577319588, + "grad_norm": 5.617500305175781, + "learning_rate": 5.386871316979581e-06, + "loss": 0.2741, + "step": 34413 + }, + { + "epoch": 2.0667661421595227, + "grad_norm": 4.736079692840576, + "learning_rate": 5.386734274359327e-06, + "loss": 0.2199, + "step": 34414 + }, + { + "epoch": 2.0667797069994576, + "grad_norm": 5.563385486602783, + "learning_rate": 5.386597231739071e-06, + "loss": 0.2226, + "step": 34415 + }, + { + "epoch": 2.0667932718393924, + "grad_norm": 4.6216936111450195, + "learning_rate": 5.386460189118816e-06, + "loss": 0.1682, + "step": 34416 + }, + { + "epoch": 2.0668068366793273, + "grad_norm": 6.470797061920166, + "learning_rate": 5.386323146498561e-06, + "loss": 0.142, + "step": 34417 + }, + { + "epoch": 2.066820401519262, + "grad_norm": 4.29946231842041, + "learning_rate": 5.386186103878307e-06, + "loss": 0.2036, + "step": 34418 + }, + { + "epoch": 2.066833966359197, + "grad_norm": 4.881328582763672, + "learning_rate": 5.386049061258052e-06, + "loss": 0.2067, + "step": 34419 + }, + { + "epoch": 2.066847531199132, + "grad_norm": 4.7474775314331055, + "learning_rate": 5.385912018637797e-06, + "loss": 0.1538, + "step": 34420 + }, + { + "epoch": 2.0668610960390668, + "grad_norm": 4.081918716430664, + "learning_rate": 5.385774976017541e-06, + "loss": 0.1508, + "step": 34421 + }, + { + "epoch": 2.0668746608790016, + "grad_norm": 4.398732662200928, + "learning_rate": 5.385637933397287e-06, + "loss": 0.2598, + "step": 34422 + }, + { + "epoch": 2.0668882257189365, + "grad_norm": 4.787651062011719, + "learning_rate": 5.3855008907770324e-06, + "loss": 0.1576, + "step": 34423 + }, + { + "epoch": 2.0669017905588714, + "grad_norm": 4.56805419921875, + "learning_rate": 5.385363848156777e-06, + "loss": 0.2312, + "step": 34424 + }, + { + "epoch": 2.066915355398806, + "grad_norm": 4.975305080413818, + "learning_rate": 5.385226805536522e-06, + "loss": 0.2055, + "step": 34425 + }, + { + "epoch": 2.066928920238741, + "grad_norm": 3.834807872772217, + "learning_rate": 5.385089762916268e-06, + "loss": 0.1891, + "step": 34426 + }, + { + "epoch": 2.066942485078676, + "grad_norm": 3.6790106296539307, + "learning_rate": 5.384952720296013e-06, + "loss": 0.1623, + "step": 34427 + }, + { + "epoch": 2.066956049918611, + "grad_norm": 4.69525146484375, + "learning_rate": 5.3848156776757575e-06, + "loss": 0.19, + "step": 34428 + }, + { + "epoch": 2.0669696147585457, + "grad_norm": 7.0505290031433105, + "learning_rate": 5.384678635055503e-06, + "loss": 0.1751, + "step": 34429 + }, + { + "epoch": 2.0669831795984805, + "grad_norm": 4.726398944854736, + "learning_rate": 5.384541592435247e-06, + "loss": 0.2021, + "step": 34430 + }, + { + "epoch": 2.0669967444384154, + "grad_norm": 3.9938457012176514, + "learning_rate": 5.384404549814993e-06, + "loss": 0.1332, + "step": 34431 + }, + { + "epoch": 2.0670103092783507, + "grad_norm": 4.611945152282715, + "learning_rate": 5.384267507194738e-06, + "loss": 0.2088, + "step": 34432 + }, + { + "epoch": 2.0670238741182856, + "grad_norm": 2.904320240020752, + "learning_rate": 5.384130464574483e-06, + "loss": 0.1041, + "step": 34433 + }, + { + "epoch": 2.0670374389582205, + "grad_norm": 5.084370136260986, + "learning_rate": 5.383993421954228e-06, + "loss": 0.184, + "step": 34434 + }, + { + "epoch": 2.0670510037981553, + "grad_norm": 3.315699577331543, + "learning_rate": 5.383856379333974e-06, + "loss": 0.1273, + "step": 34435 + }, + { + "epoch": 2.06706456863809, + "grad_norm": 4.476407051086426, + "learning_rate": 5.383719336713719e-06, + "loss": 0.1795, + "step": 34436 + }, + { + "epoch": 2.067078133478025, + "grad_norm": 4.35988187789917, + "learning_rate": 5.383582294093463e-06, + "loss": 0.212, + "step": 34437 + }, + { + "epoch": 2.06709169831796, + "grad_norm": 3.2640042304992676, + "learning_rate": 5.3834452514732084e-06, + "loss": 0.1372, + "step": 34438 + }, + { + "epoch": 2.067105263157895, + "grad_norm": 4.083929061889648, + "learning_rate": 5.3833082088529545e-06, + "loss": 0.1152, + "step": 34439 + }, + { + "epoch": 2.0671188279978296, + "grad_norm": 4.626319885253906, + "learning_rate": 5.383171166232699e-06, + "loss": 0.1254, + "step": 34440 + }, + { + "epoch": 2.0671323928377645, + "grad_norm": 2.519070863723755, + "learning_rate": 5.383034123612444e-06, + "loss": 0.1024, + "step": 34441 + }, + { + "epoch": 2.0671459576776994, + "grad_norm": 4.9218831062316895, + "learning_rate": 5.382897080992189e-06, + "loss": 0.1465, + "step": 34442 + }, + { + "epoch": 2.0671595225176342, + "grad_norm": 5.072726249694824, + "learning_rate": 5.3827600383719335e-06, + "loss": 0.1954, + "step": 34443 + }, + { + "epoch": 2.067173087357569, + "grad_norm": 4.398864269256592, + "learning_rate": 5.3826229957516795e-06, + "loss": 0.1638, + "step": 34444 + }, + { + "epoch": 2.067186652197504, + "grad_norm": 3.501537561416626, + "learning_rate": 5.382485953131425e-06, + "loss": 0.1022, + "step": 34445 + }, + { + "epoch": 2.067200217037439, + "grad_norm": 3.503833055496216, + "learning_rate": 5.382348910511169e-06, + "loss": 0.1485, + "step": 34446 + }, + { + "epoch": 2.0672137818773737, + "grad_norm": 5.356082916259766, + "learning_rate": 5.382211867890914e-06, + "loss": 0.1947, + "step": 34447 + }, + { + "epoch": 2.0672273467173086, + "grad_norm": 5.151503086090088, + "learning_rate": 5.38207482527066e-06, + "loss": 0.2012, + "step": 34448 + }, + { + "epoch": 2.0672409115572434, + "grad_norm": 3.3340494632720947, + "learning_rate": 5.3819377826504045e-06, + "loss": 0.1335, + "step": 34449 + }, + { + "epoch": 2.0672544763971787, + "grad_norm": 3.7473316192626953, + "learning_rate": 5.38180074003015e-06, + "loss": 0.1467, + "step": 34450 + }, + { + "epoch": 2.0672680412371136, + "grad_norm": 3.2695858478546143, + "learning_rate": 5.381663697409895e-06, + "loss": 0.0885, + "step": 34451 + }, + { + "epoch": 2.0672816060770485, + "grad_norm": 3.2334797382354736, + "learning_rate": 5.381526654789641e-06, + "loss": 0.1305, + "step": 34452 + }, + { + "epoch": 2.0672951709169833, + "grad_norm": 3.30827260017395, + "learning_rate": 5.381389612169385e-06, + "loss": 0.1205, + "step": 34453 + }, + { + "epoch": 2.067308735756918, + "grad_norm": 4.125016689300537, + "learning_rate": 5.3812525695491304e-06, + "loss": 0.2135, + "step": 34454 + }, + { + "epoch": 2.067322300596853, + "grad_norm": 5.079186916351318, + "learning_rate": 5.381115526928875e-06, + "loss": 0.2506, + "step": 34455 + }, + { + "epoch": 2.067335865436788, + "grad_norm": 4.081688404083252, + "learning_rate": 5.38097848430862e-06, + "loss": 0.1576, + "step": 34456 + }, + { + "epoch": 2.067349430276723, + "grad_norm": 3.1632513999938965, + "learning_rate": 5.380841441688366e-06, + "loss": 0.0912, + "step": 34457 + }, + { + "epoch": 2.0673629951166577, + "grad_norm": 3.6978933811187744, + "learning_rate": 5.38070439906811e-06, + "loss": 0.1744, + "step": 34458 + }, + { + "epoch": 2.0673765599565925, + "grad_norm": 3.6359994411468506, + "learning_rate": 5.3805673564478555e-06, + "loss": 0.1168, + "step": 34459 + }, + { + "epoch": 2.0673901247965274, + "grad_norm": 4.172922611236572, + "learning_rate": 5.380430313827601e-06, + "loss": 0.1715, + "step": 34460 + }, + { + "epoch": 2.0674036896364623, + "grad_norm": 5.40273904800415, + "learning_rate": 5.380293271207347e-06, + "loss": 0.1928, + "step": 34461 + }, + { + "epoch": 2.067417254476397, + "grad_norm": 3.869067907333374, + "learning_rate": 5.380156228587091e-06, + "loss": 0.1263, + "step": 34462 + }, + { + "epoch": 2.067430819316332, + "grad_norm": 3.2170612812042236, + "learning_rate": 5.380019185966836e-06, + "loss": 0.1098, + "step": 34463 + }, + { + "epoch": 2.067444384156267, + "grad_norm": 3.0317447185516357, + "learning_rate": 5.3798821433465805e-06, + "loss": 0.1065, + "step": 34464 + }, + { + "epoch": 2.0674579489962017, + "grad_norm": 3.2317323684692383, + "learning_rate": 5.3797451007263266e-06, + "loss": 0.1202, + "step": 34465 + }, + { + "epoch": 2.0674715138361366, + "grad_norm": 4.344921588897705, + "learning_rate": 5.379608058106072e-06, + "loss": 0.1919, + "step": 34466 + }, + { + "epoch": 2.0674850786760715, + "grad_norm": 3.923208475112915, + "learning_rate": 5.379471015485817e-06, + "loss": 0.148, + "step": 34467 + }, + { + "epoch": 2.0674986435160063, + "grad_norm": 3.3770220279693604, + "learning_rate": 5.379333972865561e-06, + "loss": 0.1787, + "step": 34468 + }, + { + "epoch": 2.067512208355941, + "grad_norm": 3.474271297454834, + "learning_rate": 5.3791969302453064e-06, + "loss": 0.1643, + "step": 34469 + }, + { + "epoch": 2.0675257731958765, + "grad_norm": 3.6647956371307373, + "learning_rate": 5.3790598876250525e-06, + "loss": 0.1889, + "step": 34470 + }, + { + "epoch": 2.0675393380358114, + "grad_norm": 3.7841291427612305, + "learning_rate": 5.378922845004797e-06, + "loss": 0.1338, + "step": 34471 + }, + { + "epoch": 2.0675529028757462, + "grad_norm": 3.9206082820892334, + "learning_rate": 5.378785802384542e-06, + "loss": 0.145, + "step": 34472 + }, + { + "epoch": 2.067566467715681, + "grad_norm": 4.441991329193115, + "learning_rate": 5.378648759764286e-06, + "loss": 0.2296, + "step": 34473 + }, + { + "epoch": 2.067580032555616, + "grad_norm": 3.5168771743774414, + "learning_rate": 5.378511717144032e-06, + "loss": 0.138, + "step": 34474 + }, + { + "epoch": 2.067593597395551, + "grad_norm": 4.052290439605713, + "learning_rate": 5.3783746745237775e-06, + "loss": 0.1747, + "step": 34475 + }, + { + "epoch": 2.0676071622354857, + "grad_norm": 4.40687370300293, + "learning_rate": 5.378237631903523e-06, + "loss": 0.1491, + "step": 34476 + }, + { + "epoch": 2.0676207270754206, + "grad_norm": 3.760544776916504, + "learning_rate": 5.378100589283267e-06, + "loss": 0.0896, + "step": 34477 + }, + { + "epoch": 2.0676342919153554, + "grad_norm": 5.369268417358398, + "learning_rate": 5.377963546663013e-06, + "loss": 0.1779, + "step": 34478 + }, + { + "epoch": 2.0676478567552903, + "grad_norm": 4.455672264099121, + "learning_rate": 5.377826504042758e-06, + "loss": 0.1054, + "step": 34479 + }, + { + "epoch": 2.067661421595225, + "grad_norm": 4.110386371612549, + "learning_rate": 5.3776894614225026e-06, + "loss": 0.0973, + "step": 34480 + }, + { + "epoch": 2.06767498643516, + "grad_norm": 7.028196811676025, + "learning_rate": 5.377552418802248e-06, + "loss": 0.2317, + "step": 34481 + }, + { + "epoch": 2.067688551275095, + "grad_norm": 4.2619309425354, + "learning_rate": 5.377415376181993e-06, + "loss": 0.1585, + "step": 34482 + }, + { + "epoch": 2.0677021161150297, + "grad_norm": 4.655306339263916, + "learning_rate": 5.377278333561738e-06, + "loss": 0.1923, + "step": 34483 + }, + { + "epoch": 2.0677156809549646, + "grad_norm": 5.564991474151611, + "learning_rate": 5.377141290941483e-06, + "loss": 0.2299, + "step": 34484 + }, + { + "epoch": 2.0677292457948995, + "grad_norm": 7.228907585144043, + "learning_rate": 5.3770042483212284e-06, + "loss": 0.2604, + "step": 34485 + }, + { + "epoch": 2.0677428106348343, + "grad_norm": 6.138908386230469, + "learning_rate": 5.376867205700973e-06, + "loss": 0.2154, + "step": 34486 + }, + { + "epoch": 2.067756375474769, + "grad_norm": 6.940118312835693, + "learning_rate": 5.376730163080719e-06, + "loss": 0.2434, + "step": 34487 + }, + { + "epoch": 2.0677699403147045, + "grad_norm": 3.8773975372314453, + "learning_rate": 5.376593120460464e-06, + "loss": 0.2427, + "step": 34488 + }, + { + "epoch": 2.0677835051546394, + "grad_norm": 5.144530773162842, + "learning_rate": 5.376456077840208e-06, + "loss": 0.1414, + "step": 34489 + }, + { + "epoch": 2.0677970699945742, + "grad_norm": 5.882584095001221, + "learning_rate": 5.3763190352199535e-06, + "loss": 0.0773, + "step": 34490 + }, + { + "epoch": 2.067810634834509, + "grad_norm": 4.8093953132629395, + "learning_rate": 5.3761819925996995e-06, + "loss": 0.1493, + "step": 34491 + }, + { + "epoch": 2.067824199674444, + "grad_norm": 4.731720924377441, + "learning_rate": 5.376044949979444e-06, + "loss": 0.1037, + "step": 34492 + }, + { + "epoch": 2.067837764514379, + "grad_norm": 3.449272632598877, + "learning_rate": 5.375907907359189e-06, + "loss": 0.1092, + "step": 34493 + }, + { + "epoch": 2.0678513293543137, + "grad_norm": 3.697857141494751, + "learning_rate": 5.375770864738934e-06, + "loss": 0.1797, + "step": 34494 + }, + { + "epoch": 2.0678648941942486, + "grad_norm": 7.1820526123046875, + "learning_rate": 5.37563382211868e-06, + "loss": 0.2357, + "step": 34495 + }, + { + "epoch": 2.0678784590341834, + "grad_norm": 4.1340508460998535, + "learning_rate": 5.3754967794984246e-06, + "loss": 0.1203, + "step": 34496 + }, + { + "epoch": 2.0678920238741183, + "grad_norm": 3.6504340171813965, + "learning_rate": 5.37535973687817e-06, + "loss": 0.1569, + "step": 34497 + }, + { + "epoch": 2.067905588714053, + "grad_norm": 6.187094211578369, + "learning_rate": 5.375222694257914e-06, + "loss": 0.2639, + "step": 34498 + }, + { + "epoch": 2.067919153553988, + "grad_norm": 4.823352813720703, + "learning_rate": 5.375085651637659e-06, + "loss": 0.1198, + "step": 34499 + }, + { + "epoch": 2.067932718393923, + "grad_norm": 5.831884860992432, + "learning_rate": 5.374948609017405e-06, + "loss": 0.1982, + "step": 34500 + }, + { + "epoch": 2.0679462832338578, + "grad_norm": 4.125052452087402, + "learning_rate": 5.3748115663971505e-06, + "loss": 0.1296, + "step": 34501 + }, + { + "epoch": 2.0679598480737926, + "grad_norm": 4.606639385223389, + "learning_rate": 5.374674523776895e-06, + "loss": 0.1073, + "step": 34502 + }, + { + "epoch": 2.0679734129137275, + "grad_norm": 13.167760848999023, + "learning_rate": 5.37453748115664e-06, + "loss": 0.1414, + "step": 34503 + }, + { + "epoch": 2.0679869777536624, + "grad_norm": 5.749434947967529, + "learning_rate": 5.374400438536386e-06, + "loss": 0.2007, + "step": 34504 + }, + { + "epoch": 2.0680005425935972, + "grad_norm": 4.785974979400635, + "learning_rate": 5.37426339591613e-06, + "loss": 0.1594, + "step": 34505 + }, + { + "epoch": 2.068014107433532, + "grad_norm": 5.10278844833374, + "learning_rate": 5.3741263532958755e-06, + "loss": 0.2073, + "step": 34506 + }, + { + "epoch": 2.068027672273467, + "grad_norm": 6.363340377807617, + "learning_rate": 5.37398931067562e-06, + "loss": 0.1999, + "step": 34507 + }, + { + "epoch": 2.0680412371134023, + "grad_norm": 4.5751166343688965, + "learning_rate": 5.373852268055366e-06, + "loss": 0.1281, + "step": 34508 + }, + { + "epoch": 2.068054801953337, + "grad_norm": 5.2388200759887695, + "learning_rate": 5.373715225435111e-06, + "loss": 0.1976, + "step": 34509 + }, + { + "epoch": 2.068068366793272, + "grad_norm": 4.054666519165039, + "learning_rate": 5.373578182814856e-06, + "loss": 0.142, + "step": 34510 + }, + { + "epoch": 2.068081931633207, + "grad_norm": 6.101775169372559, + "learning_rate": 5.3734411401946006e-06, + "loss": 0.2485, + "step": 34511 + }, + { + "epoch": 2.0680954964731417, + "grad_norm": 4.953958988189697, + "learning_rate": 5.373304097574346e-06, + "loss": 0.1575, + "step": 34512 + }, + { + "epoch": 2.0681090613130766, + "grad_norm": 4.299771308898926, + "learning_rate": 5.373167054954092e-06, + "loss": 0.1707, + "step": 34513 + }, + { + "epoch": 2.0681226261530115, + "grad_norm": 5.305327892303467, + "learning_rate": 5.373030012333836e-06, + "loss": 0.1544, + "step": 34514 + }, + { + "epoch": 2.0681361909929463, + "grad_norm": 6.9015913009643555, + "learning_rate": 5.372892969713581e-06, + "loss": 0.1646, + "step": 34515 + }, + { + "epoch": 2.068149755832881, + "grad_norm": 4.604848384857178, + "learning_rate": 5.3727559270933265e-06, + "loss": 0.1504, + "step": 34516 + }, + { + "epoch": 2.068163320672816, + "grad_norm": 4.984081268310547, + "learning_rate": 5.372618884473072e-06, + "loss": 0.1547, + "step": 34517 + }, + { + "epoch": 2.068176885512751, + "grad_norm": 7.346175670623779, + "learning_rate": 5.372481841852817e-06, + "loss": 0.2514, + "step": 34518 + }, + { + "epoch": 2.068190450352686, + "grad_norm": 4.665338039398193, + "learning_rate": 5.372344799232562e-06, + "loss": 0.14, + "step": 34519 + }, + { + "epoch": 2.0682040151926206, + "grad_norm": 6.214216232299805, + "learning_rate": 5.372207756612306e-06, + "loss": 0.2243, + "step": 34520 + }, + { + "epoch": 2.0682175800325555, + "grad_norm": 5.421309471130371, + "learning_rate": 5.372070713992052e-06, + "loss": 0.1613, + "step": 34521 + }, + { + "epoch": 2.0682311448724904, + "grad_norm": 4.830777168273926, + "learning_rate": 5.3719336713717975e-06, + "loss": 0.1184, + "step": 34522 + }, + { + "epoch": 2.0682447097124252, + "grad_norm": 4.619707107543945, + "learning_rate": 5.371796628751542e-06, + "loss": 0.0934, + "step": 34523 + }, + { + "epoch": 2.06825827455236, + "grad_norm": 4.646872043609619, + "learning_rate": 5.371659586131287e-06, + "loss": 0.1997, + "step": 34524 + }, + { + "epoch": 2.068271839392295, + "grad_norm": 5.194339275360107, + "learning_rate": 5.371522543511032e-06, + "loss": 0.1934, + "step": 34525 + }, + { + "epoch": 2.0682854042322303, + "grad_norm": 4.740140438079834, + "learning_rate": 5.371385500890777e-06, + "loss": 0.2695, + "step": 34526 + }, + { + "epoch": 2.068298969072165, + "grad_norm": 5.01839017868042, + "learning_rate": 5.3712484582705226e-06, + "loss": 0.2173, + "step": 34527 + }, + { + "epoch": 2.0683125339121, + "grad_norm": 4.097059726715088, + "learning_rate": 5.371111415650268e-06, + "loss": 0.1092, + "step": 34528 + }, + { + "epoch": 2.068326098752035, + "grad_norm": 4.62028694152832, + "learning_rate": 5.370974373030012e-06, + "loss": 0.3018, + "step": 34529 + }, + { + "epoch": 2.0683396635919697, + "grad_norm": 3.9730679988861084, + "learning_rate": 5.370837330409758e-06, + "loss": 0.1502, + "step": 34530 + }, + { + "epoch": 2.0683532284319046, + "grad_norm": 6.15358304977417, + "learning_rate": 5.370700287789503e-06, + "loss": 0.3107, + "step": 34531 + }, + { + "epoch": 2.0683667932718395, + "grad_norm": 4.875705242156982, + "learning_rate": 5.370563245169248e-06, + "loss": 0.1912, + "step": 34532 + }, + { + "epoch": 2.0683803581117743, + "grad_norm": 5.925015926361084, + "learning_rate": 5.370426202548993e-06, + "loss": 0.3995, + "step": 34533 + }, + { + "epoch": 2.068393922951709, + "grad_norm": 6.0501203536987305, + "learning_rate": 5.370289159928739e-06, + "loss": 0.1925, + "step": 34534 + }, + { + "epoch": 2.068407487791644, + "grad_norm": 3.7768943309783936, + "learning_rate": 5.370152117308484e-06, + "loss": 0.1022, + "step": 34535 + }, + { + "epoch": 2.068421052631579, + "grad_norm": 5.617530822753906, + "learning_rate": 5.370015074688228e-06, + "loss": 0.2564, + "step": 34536 + }, + { + "epoch": 2.068434617471514, + "grad_norm": 4.956742286682129, + "learning_rate": 5.3698780320679735e-06, + "loss": 0.2097, + "step": 34537 + }, + { + "epoch": 2.0684481823114487, + "grad_norm": 4.716979026794434, + "learning_rate": 5.369740989447718e-06, + "loss": 0.1436, + "step": 34538 + }, + { + "epoch": 2.0684617471513835, + "grad_norm": 4.449690341949463, + "learning_rate": 5.369603946827464e-06, + "loss": 0.1668, + "step": 34539 + }, + { + "epoch": 2.0684753119913184, + "grad_norm": 4.336256504058838, + "learning_rate": 5.369466904207209e-06, + "loss": 0.171, + "step": 34540 + }, + { + "epoch": 2.0684888768312533, + "grad_norm": 6.101827144622803, + "learning_rate": 5.369329861586953e-06, + "loss": 0.2159, + "step": 34541 + }, + { + "epoch": 2.068502441671188, + "grad_norm": 4.602423667907715, + "learning_rate": 5.3691928189666986e-06, + "loss": 0.2613, + "step": 34542 + }, + { + "epoch": 2.068516006511123, + "grad_norm": 5.7813191413879395, + "learning_rate": 5.369055776346445e-06, + "loss": 0.1577, + "step": 34543 + }, + { + "epoch": 2.068529571351058, + "grad_norm": 4.814667224884033, + "learning_rate": 5.36891873372619e-06, + "loss": 0.1981, + "step": 34544 + }, + { + "epoch": 2.0685431361909927, + "grad_norm": 6.516980171203613, + "learning_rate": 5.368781691105934e-06, + "loss": 0.1674, + "step": 34545 + }, + { + "epoch": 2.068556701030928, + "grad_norm": 7.961434841156006, + "learning_rate": 5.368644648485679e-06, + "loss": 0.2475, + "step": 34546 + }, + { + "epoch": 2.068570265870863, + "grad_norm": 4.551132678985596, + "learning_rate": 5.368507605865425e-06, + "loss": 0.1665, + "step": 34547 + }, + { + "epoch": 2.0685838307107978, + "grad_norm": 8.634883880615234, + "learning_rate": 5.36837056324517e-06, + "loss": 0.245, + "step": 34548 + }, + { + "epoch": 2.0685973955507326, + "grad_norm": 8.02469253540039, + "learning_rate": 5.368233520624915e-06, + "loss": 0.1685, + "step": 34549 + }, + { + "epoch": 2.0686109603906675, + "grad_norm": 7.822572231292725, + "learning_rate": 5.36809647800466e-06, + "loss": 0.2503, + "step": 34550 + }, + { + "epoch": 2.0686245252306024, + "grad_norm": 3.6956546306610107, + "learning_rate": 5.367959435384404e-06, + "loss": 0.1243, + "step": 34551 + }, + { + "epoch": 2.0686380900705372, + "grad_norm": 7.287557601928711, + "learning_rate": 5.36782239276415e-06, + "loss": 0.1747, + "step": 34552 + }, + { + "epoch": 2.068651654910472, + "grad_norm": 6.054903507232666, + "learning_rate": 5.3676853501438955e-06, + "loss": 0.1544, + "step": 34553 + }, + { + "epoch": 2.068665219750407, + "grad_norm": 7.199602127075195, + "learning_rate": 5.36754830752364e-06, + "loss": 0.2967, + "step": 34554 + }, + { + "epoch": 2.068678784590342, + "grad_norm": 6.706830024719238, + "learning_rate": 5.367411264903385e-06, + "loss": 0.2181, + "step": 34555 + }, + { + "epoch": 2.0686923494302767, + "grad_norm": 5.3744215965271, + "learning_rate": 5.367274222283131e-06, + "loss": 0.1951, + "step": 34556 + }, + { + "epoch": 2.0687059142702116, + "grad_norm": 6.109210968017578, + "learning_rate": 5.367137179662875e-06, + "loss": 0.3618, + "step": 34557 + }, + { + "epoch": 2.0687194791101464, + "grad_norm": 6.358506202697754, + "learning_rate": 5.367000137042621e-06, + "loss": 0.2078, + "step": 34558 + }, + { + "epoch": 2.0687330439500813, + "grad_norm": 5.062975883483887, + "learning_rate": 5.366863094422366e-06, + "loss": 0.1744, + "step": 34559 + }, + { + "epoch": 2.068746608790016, + "grad_norm": 4.709887504577637, + "learning_rate": 5.366726051802112e-06, + "loss": 0.1191, + "step": 34560 + }, + { + "epoch": 2.068760173629951, + "grad_norm": 4.621922969818115, + "learning_rate": 5.366589009181856e-06, + "loss": 0.1592, + "step": 34561 + }, + { + "epoch": 2.068773738469886, + "grad_norm": 5.396744251251221, + "learning_rate": 5.366451966561601e-06, + "loss": 0.214, + "step": 34562 + }, + { + "epoch": 2.0687873033098207, + "grad_norm": 3.909463882446289, + "learning_rate": 5.366314923941346e-06, + "loss": 0.1551, + "step": 34563 + }, + { + "epoch": 2.068800868149756, + "grad_norm": 8.140369415283203, + "learning_rate": 5.366177881321092e-06, + "loss": 0.1916, + "step": 34564 + }, + { + "epoch": 2.068814432989691, + "grad_norm": 4.619136810302734, + "learning_rate": 5.366040838700837e-06, + "loss": 0.1234, + "step": 34565 + }, + { + "epoch": 2.068827997829626, + "grad_norm": 5.287317276000977, + "learning_rate": 5.365903796080581e-06, + "loss": 0.1542, + "step": 34566 + }, + { + "epoch": 2.0688415626695607, + "grad_norm": 4.914394855499268, + "learning_rate": 5.365766753460326e-06, + "loss": 0.1979, + "step": 34567 + }, + { + "epoch": 2.0688551275094955, + "grad_norm": 4.534660339355469, + "learning_rate": 5.3656297108400715e-06, + "loss": 0.0741, + "step": 34568 + }, + { + "epoch": 2.0688686923494304, + "grad_norm": 6.459794998168945, + "learning_rate": 5.3654926682198175e-06, + "loss": 0.1933, + "step": 34569 + }, + { + "epoch": 2.0688822571893652, + "grad_norm": 5.976544380187988, + "learning_rate": 5.365355625599562e-06, + "loss": 0.2315, + "step": 34570 + }, + { + "epoch": 2.0688958220293, + "grad_norm": 4.014500141143799, + "learning_rate": 5.365218582979307e-06, + "loss": 0.112, + "step": 34571 + }, + { + "epoch": 2.068909386869235, + "grad_norm": 4.677563667297363, + "learning_rate": 5.365081540359051e-06, + "loss": 0.1788, + "step": 34572 + }, + { + "epoch": 2.06892295170917, + "grad_norm": 4.420586109161377, + "learning_rate": 5.364944497738797e-06, + "loss": 0.1178, + "step": 34573 + }, + { + "epoch": 2.0689365165491047, + "grad_norm": 7.298883438110352, + "learning_rate": 5.364807455118543e-06, + "loss": 0.2331, + "step": 34574 + }, + { + "epoch": 2.0689500813890396, + "grad_norm": 5.07058572769165, + "learning_rate": 5.364670412498288e-06, + "loss": 0.1649, + "step": 34575 + }, + { + "epoch": 2.0689636462289744, + "grad_norm": 6.130166530609131, + "learning_rate": 5.364533369878032e-06, + "loss": 0.1455, + "step": 34576 + }, + { + "epoch": 2.0689772110689093, + "grad_norm": 5.324884414672852, + "learning_rate": 5.364396327257778e-06, + "loss": 0.1374, + "step": 34577 + }, + { + "epoch": 2.068990775908844, + "grad_norm": 4.4195780754089355, + "learning_rate": 5.364259284637523e-06, + "loss": 0.2185, + "step": 34578 + }, + { + "epoch": 2.069004340748779, + "grad_norm": 4.289610862731934, + "learning_rate": 5.364122242017268e-06, + "loss": 0.1788, + "step": 34579 + }, + { + "epoch": 2.069017905588714, + "grad_norm": 4.966606616973877, + "learning_rate": 5.363985199397013e-06, + "loss": 0.1815, + "step": 34580 + }, + { + "epoch": 2.0690314704286488, + "grad_norm": 5.694653511047363, + "learning_rate": 5.363848156776757e-06, + "loss": 0.2414, + "step": 34581 + }, + { + "epoch": 2.0690450352685836, + "grad_norm": 4.254513740539551, + "learning_rate": 5.363711114156503e-06, + "loss": 0.1345, + "step": 34582 + }, + { + "epoch": 2.0690586001085185, + "grad_norm": 4.30728816986084, + "learning_rate": 5.363574071536248e-06, + "loss": 0.1358, + "step": 34583 + }, + { + "epoch": 2.069072164948454, + "grad_norm": 4.395829200744629, + "learning_rate": 5.3634370289159935e-06, + "loss": 0.1825, + "step": 34584 + }, + { + "epoch": 2.0690857297883887, + "grad_norm": 4.415181636810303, + "learning_rate": 5.363299986295738e-06, + "loss": 0.1753, + "step": 34585 + }, + { + "epoch": 2.0690992946283235, + "grad_norm": 3.550602674484253, + "learning_rate": 5.363162943675484e-06, + "loss": 0.1185, + "step": 34586 + }, + { + "epoch": 2.0691128594682584, + "grad_norm": 5.198605537414551, + "learning_rate": 5.363025901055229e-06, + "loss": 0.2434, + "step": 34587 + }, + { + "epoch": 2.0691264243081933, + "grad_norm": 4.973571300506592, + "learning_rate": 5.362888858434973e-06, + "loss": 0.2261, + "step": 34588 + }, + { + "epoch": 2.069139989148128, + "grad_norm": 4.98569393157959, + "learning_rate": 5.362751815814719e-06, + "loss": 0.1487, + "step": 34589 + }, + { + "epoch": 2.069153553988063, + "grad_norm": 5.190793514251709, + "learning_rate": 5.362614773194465e-06, + "loss": 0.1973, + "step": 34590 + }, + { + "epoch": 2.069167118827998, + "grad_norm": 4.584632396697998, + "learning_rate": 5.362477730574209e-06, + "loss": 0.1854, + "step": 34591 + }, + { + "epoch": 2.0691806836679327, + "grad_norm": 4.668850898742676, + "learning_rate": 5.362340687953954e-06, + "loss": 0.1711, + "step": 34592 + }, + { + "epoch": 2.0691942485078676, + "grad_norm": 3.7652337551116943, + "learning_rate": 5.362203645333699e-06, + "loss": 0.1652, + "step": 34593 + }, + { + "epoch": 2.0692078133478025, + "grad_norm": 4.136931896209717, + "learning_rate": 5.362066602713444e-06, + "loss": 0.1037, + "step": 34594 + }, + { + "epoch": 2.0692213781877373, + "grad_norm": 4.006331920623779, + "learning_rate": 5.36192956009319e-06, + "loss": 0.093, + "step": 34595 + }, + { + "epoch": 2.069234943027672, + "grad_norm": 3.9778504371643066, + "learning_rate": 5.361792517472935e-06, + "loss": 0.1246, + "step": 34596 + }, + { + "epoch": 2.069248507867607, + "grad_norm": 5.648600101470947, + "learning_rate": 5.361655474852679e-06, + "loss": 0.1744, + "step": 34597 + }, + { + "epoch": 2.069262072707542, + "grad_norm": 4.1213698387146, + "learning_rate": 5.361518432232424e-06, + "loss": 0.2054, + "step": 34598 + }, + { + "epoch": 2.069275637547477, + "grad_norm": 4.606891632080078, + "learning_rate": 5.36138138961217e-06, + "loss": 0.1751, + "step": 34599 + }, + { + "epoch": 2.0692892023874117, + "grad_norm": 4.461347579956055, + "learning_rate": 5.361244346991915e-06, + "loss": 0.1436, + "step": 34600 + }, + { + "epoch": 2.0693027672273465, + "grad_norm": 3.628009796142578, + "learning_rate": 5.36110730437166e-06, + "loss": 0.1484, + "step": 34601 + }, + { + "epoch": 2.069316332067282, + "grad_norm": 3.578928232192993, + "learning_rate": 5.360970261751405e-06, + "loss": 0.1193, + "step": 34602 + }, + { + "epoch": 2.0693298969072167, + "grad_norm": 9.378863334655762, + "learning_rate": 5.360833219131151e-06, + "loss": 0.1943, + "step": 34603 + }, + { + "epoch": 2.0693434617471516, + "grad_norm": 3.54185152053833, + "learning_rate": 5.360696176510895e-06, + "loss": 0.1767, + "step": 34604 + }, + { + "epoch": 2.0693570265870864, + "grad_norm": 3.9529004096984863, + "learning_rate": 5.360559133890641e-06, + "loss": 0.1297, + "step": 34605 + }, + { + "epoch": 2.0693705914270213, + "grad_norm": 5.354590892791748, + "learning_rate": 5.360422091270385e-06, + "loss": 0.1948, + "step": 34606 + }, + { + "epoch": 2.069384156266956, + "grad_norm": 4.999314308166504, + "learning_rate": 5.36028504865013e-06, + "loss": 0.1481, + "step": 34607 + }, + { + "epoch": 2.069397721106891, + "grad_norm": 5.2728376388549805, + "learning_rate": 5.360148006029876e-06, + "loss": 0.22, + "step": 34608 + }, + { + "epoch": 2.069411285946826, + "grad_norm": 3.1713929176330566, + "learning_rate": 5.360010963409621e-06, + "loss": 0.1576, + "step": 34609 + }, + { + "epoch": 2.0694248507867608, + "grad_norm": 4.181657791137695, + "learning_rate": 5.359873920789366e-06, + "loss": 0.1692, + "step": 34610 + }, + { + "epoch": 2.0694384156266956, + "grad_norm": 5.3925065994262695, + "learning_rate": 5.359736878169111e-06, + "loss": 0.2482, + "step": 34611 + }, + { + "epoch": 2.0694519804666305, + "grad_norm": 4.661276817321777, + "learning_rate": 5.359599835548857e-06, + "loss": 0.1892, + "step": 34612 + }, + { + "epoch": 2.0694655453065653, + "grad_norm": 3.035104990005493, + "learning_rate": 5.359462792928601e-06, + "loss": 0.1159, + "step": 34613 + }, + { + "epoch": 2.0694791101465, + "grad_norm": 3.8035359382629395, + "learning_rate": 5.359325750308346e-06, + "loss": 0.1146, + "step": 34614 + }, + { + "epoch": 2.069492674986435, + "grad_norm": 4.474504470825195, + "learning_rate": 5.359188707688091e-06, + "loss": 0.1677, + "step": 34615 + }, + { + "epoch": 2.06950623982637, + "grad_norm": 4.481441974639893, + "learning_rate": 5.359051665067837e-06, + "loss": 0.1535, + "step": 34616 + }, + { + "epoch": 2.069519804666305, + "grad_norm": 3.945077419281006, + "learning_rate": 5.358914622447582e-06, + "loss": 0.1997, + "step": 34617 + }, + { + "epoch": 2.0695333695062397, + "grad_norm": 4.190158367156982, + "learning_rate": 5.358777579827327e-06, + "loss": 0.2297, + "step": 34618 + }, + { + "epoch": 2.0695469343461745, + "grad_norm": 4.252965927124023, + "learning_rate": 5.358640537207071e-06, + "loss": 0.1492, + "step": 34619 + }, + { + "epoch": 2.0695604991861094, + "grad_norm": 6.191282272338867, + "learning_rate": 5.358503494586817e-06, + "loss": 0.1571, + "step": 34620 + }, + { + "epoch": 2.0695740640260443, + "grad_norm": 2.7049508094787598, + "learning_rate": 5.358366451966563e-06, + "loss": 0.0713, + "step": 34621 + }, + { + "epoch": 2.0695876288659796, + "grad_norm": 4.287086009979248, + "learning_rate": 5.358229409346307e-06, + "loss": 0.1701, + "step": 34622 + }, + { + "epoch": 2.0696011937059144, + "grad_norm": 4.056237697601318, + "learning_rate": 5.358092366726052e-06, + "loss": 0.0913, + "step": 34623 + }, + { + "epoch": 2.0696147585458493, + "grad_norm": 5.8159990310668945, + "learning_rate": 5.357955324105797e-06, + "loss": 0.1589, + "step": 34624 + }, + { + "epoch": 2.069628323385784, + "grad_norm": 3.3938748836517334, + "learning_rate": 5.3578182814855425e-06, + "loss": 0.1888, + "step": 34625 + }, + { + "epoch": 2.069641888225719, + "grad_norm": 3.9773004055023193, + "learning_rate": 5.357681238865288e-06, + "loss": 0.1672, + "step": 34626 + }, + { + "epoch": 2.069655453065654, + "grad_norm": 4.904401779174805, + "learning_rate": 5.357544196245033e-06, + "loss": 0.1602, + "step": 34627 + }, + { + "epoch": 2.0696690179055888, + "grad_norm": 4.396978855133057, + "learning_rate": 5.357407153624777e-06, + "loss": 0.1417, + "step": 34628 + }, + { + "epoch": 2.0696825827455236, + "grad_norm": 4.894436359405518, + "learning_rate": 5.357270111004523e-06, + "loss": 0.2141, + "step": 34629 + }, + { + "epoch": 2.0696961475854585, + "grad_norm": 4.315196514129639, + "learning_rate": 5.357133068384268e-06, + "loss": 0.1769, + "step": 34630 + }, + { + "epoch": 2.0697097124253934, + "grad_norm": 5.138433456420898, + "learning_rate": 5.356996025764013e-06, + "loss": 0.1288, + "step": 34631 + }, + { + "epoch": 2.0697232772653282, + "grad_norm": 5.600983619689941, + "learning_rate": 5.356858983143758e-06, + "loss": 0.1925, + "step": 34632 + }, + { + "epoch": 2.069736842105263, + "grad_norm": 4.311887264251709, + "learning_rate": 5.356721940523503e-06, + "loss": 0.0984, + "step": 34633 + }, + { + "epoch": 2.069750406945198, + "grad_norm": 3.768549680709839, + "learning_rate": 5.356584897903248e-06, + "loss": 0.1071, + "step": 34634 + }, + { + "epoch": 2.069763971785133, + "grad_norm": 4.255873203277588, + "learning_rate": 5.356447855282993e-06, + "loss": 0.1912, + "step": 34635 + }, + { + "epoch": 2.0697775366250677, + "grad_norm": 5.336487770080566, + "learning_rate": 5.356310812662739e-06, + "loss": 0.2101, + "step": 34636 + }, + { + "epoch": 2.0697911014650026, + "grad_norm": 3.935905933380127, + "learning_rate": 5.356173770042483e-06, + "loss": 0.111, + "step": 34637 + }, + { + "epoch": 2.0698046663049374, + "grad_norm": 4.798677921295166, + "learning_rate": 5.356036727422229e-06, + "loss": 0.2103, + "step": 34638 + }, + { + "epoch": 2.0698182311448723, + "grad_norm": 5.605565071105957, + "learning_rate": 5.355899684801974e-06, + "loss": 0.1918, + "step": 34639 + }, + { + "epoch": 2.0698317959848076, + "grad_norm": 5.040698051452637, + "learning_rate": 5.3557626421817185e-06, + "loss": 0.1219, + "step": 34640 + }, + { + "epoch": 2.0698453608247425, + "grad_norm": 4.731406211853027, + "learning_rate": 5.355625599561464e-06, + "loss": 0.1422, + "step": 34641 + }, + { + "epoch": 2.0698589256646773, + "grad_norm": 5.486025333404541, + "learning_rate": 5.35548855694121e-06, + "loss": 0.1478, + "step": 34642 + }, + { + "epoch": 2.069872490504612, + "grad_norm": 4.302469730377197, + "learning_rate": 5.355351514320955e-06, + "loss": 0.1554, + "step": 34643 + }, + { + "epoch": 2.069886055344547, + "grad_norm": 4.8472466468811035, + "learning_rate": 5.355214471700699e-06, + "loss": 0.198, + "step": 34644 + }, + { + "epoch": 2.069899620184482, + "grad_norm": 3.507330894470215, + "learning_rate": 5.355077429080444e-06, + "loss": 0.0705, + "step": 34645 + }, + { + "epoch": 2.069913185024417, + "grad_norm": 5.259732246398926, + "learning_rate": 5.35494038646019e-06, + "loss": 0.1707, + "step": 34646 + }, + { + "epoch": 2.0699267498643517, + "grad_norm": 4.245731353759766, + "learning_rate": 5.354803343839935e-06, + "loss": 0.1667, + "step": 34647 + }, + { + "epoch": 2.0699403147042865, + "grad_norm": 3.1054599285125732, + "learning_rate": 5.35466630121968e-06, + "loss": 0.091, + "step": 34648 + }, + { + "epoch": 2.0699538795442214, + "grad_norm": 3.495004892349243, + "learning_rate": 5.354529258599424e-06, + "loss": 0.1358, + "step": 34649 + }, + { + "epoch": 2.0699674443841563, + "grad_norm": 3.743849515914917, + "learning_rate": 5.354392215979169e-06, + "loss": 0.1721, + "step": 34650 + }, + { + "epoch": 2.069981009224091, + "grad_norm": 4.746275424957275, + "learning_rate": 5.3542551733589154e-06, + "loss": 0.1372, + "step": 34651 + }, + { + "epoch": 2.069994574064026, + "grad_norm": 4.186279773712158, + "learning_rate": 5.354118130738661e-06, + "loss": 0.1681, + "step": 34652 + }, + { + "epoch": 2.070008138903961, + "grad_norm": 4.095898151397705, + "learning_rate": 5.353981088118405e-06, + "loss": 0.1169, + "step": 34653 + }, + { + "epoch": 2.0700217037438957, + "grad_norm": 4.04675817489624, + "learning_rate": 5.35384404549815e-06, + "loss": 0.1397, + "step": 34654 + }, + { + "epoch": 2.0700352685838306, + "grad_norm": 4.393401622772217, + "learning_rate": 5.353707002877896e-06, + "loss": 0.1319, + "step": 34655 + }, + { + "epoch": 2.0700488334237654, + "grad_norm": 7.370962619781494, + "learning_rate": 5.3535699602576405e-06, + "loss": 0.2306, + "step": 34656 + }, + { + "epoch": 2.0700623982637003, + "grad_norm": 5.205973148345947, + "learning_rate": 5.353432917637386e-06, + "loss": 0.1613, + "step": 34657 + }, + { + "epoch": 2.070075963103635, + "grad_norm": 5.820201873779297, + "learning_rate": 5.353295875017131e-06, + "loss": 0.1717, + "step": 34658 + }, + { + "epoch": 2.07008952794357, + "grad_norm": 5.561651706695557, + "learning_rate": 5.353158832396876e-06, + "loss": 0.1824, + "step": 34659 + }, + { + "epoch": 2.0701030927835053, + "grad_norm": 4.609227180480957, + "learning_rate": 5.353021789776621e-06, + "loss": 0.1531, + "step": 34660 + }, + { + "epoch": 2.07011665762344, + "grad_norm": 4.775851249694824, + "learning_rate": 5.352884747156366e-06, + "loss": 0.173, + "step": 34661 + }, + { + "epoch": 2.070130222463375, + "grad_norm": 3.364673614501953, + "learning_rate": 5.352747704536111e-06, + "loss": 0.0776, + "step": 34662 + }, + { + "epoch": 2.07014378730331, + "grad_norm": 4.675586700439453, + "learning_rate": 5.352610661915856e-06, + "loss": 0.1113, + "step": 34663 + }, + { + "epoch": 2.070157352143245, + "grad_norm": 3.87069034576416, + "learning_rate": 5.352473619295602e-06, + "loss": 0.1169, + "step": 34664 + }, + { + "epoch": 2.0701709169831797, + "grad_norm": 4.292645454406738, + "learning_rate": 5.352336576675346e-06, + "loss": 0.0887, + "step": 34665 + }, + { + "epoch": 2.0701844818231145, + "grad_norm": 5.571885585784912, + "learning_rate": 5.3521995340550914e-06, + "loss": 0.1987, + "step": 34666 + }, + { + "epoch": 2.0701980466630494, + "grad_norm": 4.277320861816406, + "learning_rate": 5.352062491434837e-06, + "loss": 0.0958, + "step": 34667 + }, + { + "epoch": 2.0702116115029843, + "grad_norm": 3.815871238708496, + "learning_rate": 5.351925448814582e-06, + "loss": 0.1495, + "step": 34668 + }, + { + "epoch": 2.070225176342919, + "grad_norm": 6.103312015533447, + "learning_rate": 5.351788406194327e-06, + "loss": 0.1746, + "step": 34669 + }, + { + "epoch": 2.070238741182854, + "grad_norm": 6.227212429046631, + "learning_rate": 5.351651363574072e-06, + "loss": 0.2097, + "step": 34670 + }, + { + "epoch": 2.070252306022789, + "grad_norm": 3.3349192142486572, + "learning_rate": 5.3515143209538165e-06, + "loss": 0.1035, + "step": 34671 + }, + { + "epoch": 2.0702658708627237, + "grad_norm": 5.407984256744385, + "learning_rate": 5.3513772783335625e-06, + "loss": 0.2086, + "step": 34672 + }, + { + "epoch": 2.0702794357026586, + "grad_norm": 4.598270416259766, + "learning_rate": 5.351240235713308e-06, + "loss": 0.1404, + "step": 34673 + }, + { + "epoch": 2.0702930005425935, + "grad_norm": 5.077341079711914, + "learning_rate": 5.351103193093052e-06, + "loss": 0.1554, + "step": 34674 + }, + { + "epoch": 2.0703065653825283, + "grad_norm": 4.241244316101074, + "learning_rate": 5.350966150472797e-06, + "loss": 0.1467, + "step": 34675 + }, + { + "epoch": 2.070320130222463, + "grad_norm": 5.979525089263916, + "learning_rate": 5.350829107852542e-06, + "loss": 0.2078, + "step": 34676 + }, + { + "epoch": 2.070333695062398, + "grad_norm": 4.212604999542236, + "learning_rate": 5.350692065232288e-06, + "loss": 0.1087, + "step": 34677 + }, + { + "epoch": 2.0703472599023334, + "grad_norm": 4.74302864074707, + "learning_rate": 5.350555022612033e-06, + "loss": 0.1249, + "step": 34678 + }, + { + "epoch": 2.0703608247422682, + "grad_norm": 5.382836818695068, + "learning_rate": 5.350417979991778e-06, + "loss": 0.151, + "step": 34679 + }, + { + "epoch": 2.070374389582203, + "grad_norm": 4.534398555755615, + "learning_rate": 5.350280937371522e-06, + "loss": 0.1184, + "step": 34680 + }, + { + "epoch": 2.070387954422138, + "grad_norm": 4.133510589599609, + "learning_rate": 5.350143894751268e-06, + "loss": 0.118, + "step": 34681 + }, + { + "epoch": 2.070401519262073, + "grad_norm": 2.9940848350524902, + "learning_rate": 5.3500068521310134e-06, + "loss": 0.1258, + "step": 34682 + }, + { + "epoch": 2.0704150841020077, + "grad_norm": 5.171895980834961, + "learning_rate": 5.349869809510758e-06, + "loss": 0.1605, + "step": 34683 + }, + { + "epoch": 2.0704286489419426, + "grad_norm": 4.040356636047363, + "learning_rate": 5.349732766890503e-06, + "loss": 0.1325, + "step": 34684 + }, + { + "epoch": 2.0704422137818774, + "grad_norm": 4.377152442932129, + "learning_rate": 5.349595724270249e-06, + "loss": 0.1234, + "step": 34685 + }, + { + "epoch": 2.0704557786218123, + "grad_norm": 4.852046012878418, + "learning_rate": 5.349458681649994e-06, + "loss": 0.2121, + "step": 34686 + }, + { + "epoch": 2.070469343461747, + "grad_norm": 4.330448627471924, + "learning_rate": 5.3493216390297385e-06, + "loss": 0.213, + "step": 34687 + }, + { + "epoch": 2.070482908301682, + "grad_norm": 3.90527081489563, + "learning_rate": 5.349184596409484e-06, + "loss": 0.1229, + "step": 34688 + }, + { + "epoch": 2.070496473141617, + "grad_norm": 4.642719268798828, + "learning_rate": 5.349047553789228e-06, + "loss": 0.158, + "step": 34689 + }, + { + "epoch": 2.0705100379815518, + "grad_norm": 3.666802167892456, + "learning_rate": 5.348910511168974e-06, + "loss": 0.142, + "step": 34690 + }, + { + "epoch": 2.0705236028214866, + "grad_norm": 4.110407829284668, + "learning_rate": 5.348773468548719e-06, + "loss": 0.1537, + "step": 34691 + }, + { + "epoch": 2.0705371676614215, + "grad_norm": 4.080847263336182, + "learning_rate": 5.348636425928464e-06, + "loss": 0.1713, + "step": 34692 + }, + { + "epoch": 2.0705507325013564, + "grad_norm": 4.861835956573486, + "learning_rate": 5.348499383308209e-06, + "loss": 0.2162, + "step": 34693 + }, + { + "epoch": 2.070564297341291, + "grad_norm": 3.093949556350708, + "learning_rate": 5.348362340687955e-06, + "loss": 0.1064, + "step": 34694 + }, + { + "epoch": 2.070577862181226, + "grad_norm": 3.093496561050415, + "learning_rate": 5.3482252980677e-06, + "loss": 0.1187, + "step": 34695 + }, + { + "epoch": 2.070591427021161, + "grad_norm": 3.363556385040283, + "learning_rate": 5.348088255447444e-06, + "loss": 0.1532, + "step": 34696 + }, + { + "epoch": 2.070604991861096, + "grad_norm": 4.9194416999816895, + "learning_rate": 5.3479512128271894e-06, + "loss": 0.119, + "step": 34697 + }, + { + "epoch": 2.070618556701031, + "grad_norm": 4.7180938720703125, + "learning_rate": 5.3478141702069355e-06, + "loss": 0.1691, + "step": 34698 + }, + { + "epoch": 2.070632121540966, + "grad_norm": 5.328714370727539, + "learning_rate": 5.34767712758668e-06, + "loss": 0.1672, + "step": 34699 + }, + { + "epoch": 2.070645686380901, + "grad_norm": 4.29316520690918, + "learning_rate": 5.347540084966425e-06, + "loss": 0.1725, + "step": 34700 + }, + { + "epoch": 2.0706592512208357, + "grad_norm": 3.797569751739502, + "learning_rate": 5.34740304234617e-06, + "loss": 0.1403, + "step": 34701 + }, + { + "epoch": 2.0706728160607706, + "grad_norm": 6.330019474029541, + "learning_rate": 5.3472659997259145e-06, + "loss": 0.2755, + "step": 34702 + }, + { + "epoch": 2.0706863809007054, + "grad_norm": 3.7965261936187744, + "learning_rate": 5.3471289571056605e-06, + "loss": 0.1232, + "step": 34703 + }, + { + "epoch": 2.0706999457406403, + "grad_norm": 5.693422794342041, + "learning_rate": 5.346991914485406e-06, + "loss": 0.1948, + "step": 34704 + }, + { + "epoch": 2.070713510580575, + "grad_norm": 4.379137992858887, + "learning_rate": 5.34685487186515e-06, + "loss": 0.1572, + "step": 34705 + }, + { + "epoch": 2.07072707542051, + "grad_norm": 5.410006523132324, + "learning_rate": 5.346717829244895e-06, + "loss": 0.2135, + "step": 34706 + }, + { + "epoch": 2.070740640260445, + "grad_norm": 4.574869155883789, + "learning_rate": 5.346580786624641e-06, + "loss": 0.1673, + "step": 34707 + }, + { + "epoch": 2.0707542051003798, + "grad_norm": 3.4833526611328125, + "learning_rate": 5.3464437440043856e-06, + "loss": 0.1267, + "step": 34708 + }, + { + "epoch": 2.0707677699403146, + "grad_norm": 4.713863372802734, + "learning_rate": 5.346306701384131e-06, + "loss": 0.1563, + "step": 34709 + }, + { + "epoch": 2.0707813347802495, + "grad_norm": 4.040092468261719, + "learning_rate": 5.346169658763876e-06, + "loss": 0.1196, + "step": 34710 + }, + { + "epoch": 2.0707948996201844, + "grad_norm": 4.455559730529785, + "learning_rate": 5.346032616143622e-06, + "loss": 0.1786, + "step": 34711 + }, + { + "epoch": 2.0708084644601192, + "grad_norm": 3.307123899459839, + "learning_rate": 5.345895573523366e-06, + "loss": 0.1078, + "step": 34712 + }, + { + "epoch": 2.070822029300054, + "grad_norm": 4.680637836456299, + "learning_rate": 5.3457585309031114e-06, + "loss": 0.1768, + "step": 34713 + }, + { + "epoch": 2.070835594139989, + "grad_norm": 4.9213714599609375, + "learning_rate": 5.345621488282856e-06, + "loss": 0.1792, + "step": 34714 + }, + { + "epoch": 2.070849158979924, + "grad_norm": 4.148021697998047, + "learning_rate": 5.345484445662602e-06, + "loss": 0.1509, + "step": 34715 + }, + { + "epoch": 2.070862723819859, + "grad_norm": 5.294439315795898, + "learning_rate": 5.345347403042347e-06, + "loss": 0.2317, + "step": 34716 + }, + { + "epoch": 2.070876288659794, + "grad_norm": 4.314024925231934, + "learning_rate": 5.345210360422091e-06, + "loss": 0.2046, + "step": 34717 + }, + { + "epoch": 2.070889853499729, + "grad_norm": 4.588984489440918, + "learning_rate": 5.3450733178018365e-06, + "loss": 0.2013, + "step": 34718 + }, + { + "epoch": 2.0709034183396637, + "grad_norm": 3.7125890254974365, + "learning_rate": 5.344936275181582e-06, + "loss": 0.1724, + "step": 34719 + }, + { + "epoch": 2.0709169831795986, + "grad_norm": 5.15078067779541, + "learning_rate": 5.344799232561328e-06, + "loss": 0.2172, + "step": 34720 + }, + { + "epoch": 2.0709305480195335, + "grad_norm": 4.098113536834717, + "learning_rate": 5.344662189941072e-06, + "loss": 0.1173, + "step": 34721 + }, + { + "epoch": 2.0709441128594683, + "grad_norm": 4.258269786834717, + "learning_rate": 5.344525147320817e-06, + "loss": 0.1924, + "step": 34722 + }, + { + "epoch": 2.070957677699403, + "grad_norm": 5.7515387535095215, + "learning_rate": 5.3443881047005615e-06, + "loss": 0.1977, + "step": 34723 + }, + { + "epoch": 2.070971242539338, + "grad_norm": 4.086321830749512, + "learning_rate": 5.3442510620803076e-06, + "loss": 0.171, + "step": 34724 + }, + { + "epoch": 2.070984807379273, + "grad_norm": 4.230347633361816, + "learning_rate": 5.344114019460053e-06, + "loss": 0.1657, + "step": 34725 + }, + { + "epoch": 2.070998372219208, + "grad_norm": 4.331300258636475, + "learning_rate": 5.343976976839798e-06, + "loss": 0.1766, + "step": 34726 + }, + { + "epoch": 2.0710119370591427, + "grad_norm": 4.203355312347412, + "learning_rate": 5.343839934219542e-06, + "loss": 0.1491, + "step": 34727 + }, + { + "epoch": 2.0710255018990775, + "grad_norm": 4.345555305480957, + "learning_rate": 5.343702891599288e-06, + "loss": 0.1338, + "step": 34728 + }, + { + "epoch": 2.0710390667390124, + "grad_norm": 3.7774107456207275, + "learning_rate": 5.3435658489790335e-06, + "loss": 0.1871, + "step": 34729 + }, + { + "epoch": 2.0710526315789473, + "grad_norm": 3.4892473220825195, + "learning_rate": 5.343428806358778e-06, + "loss": 0.0954, + "step": 34730 + }, + { + "epoch": 2.071066196418882, + "grad_norm": 3.7112550735473633, + "learning_rate": 5.343291763738523e-06, + "loss": 0.1658, + "step": 34731 + }, + { + "epoch": 2.071079761258817, + "grad_norm": 3.394439697265625, + "learning_rate": 5.343154721118267e-06, + "loss": 0.1166, + "step": 34732 + }, + { + "epoch": 2.071093326098752, + "grad_norm": 3.9401309490203857, + "learning_rate": 5.343017678498013e-06, + "loss": 0.1363, + "step": 34733 + }, + { + "epoch": 2.0711068909386867, + "grad_norm": 4.306032657623291, + "learning_rate": 5.3428806358777585e-06, + "loss": 0.1802, + "step": 34734 + }, + { + "epoch": 2.071120455778622, + "grad_norm": 4.670659065246582, + "learning_rate": 5.342743593257504e-06, + "loss": 0.1704, + "step": 34735 + }, + { + "epoch": 2.071134020618557, + "grad_norm": 4.724315166473389, + "learning_rate": 5.342606550637248e-06, + "loss": 0.2708, + "step": 34736 + }, + { + "epoch": 2.0711475854584918, + "grad_norm": 3.110546112060547, + "learning_rate": 5.342469508016994e-06, + "loss": 0.1492, + "step": 34737 + }, + { + "epoch": 2.0711611502984266, + "grad_norm": 3.6488187313079834, + "learning_rate": 5.342332465396739e-06, + "loss": 0.1491, + "step": 34738 + }, + { + "epoch": 2.0711747151383615, + "grad_norm": 4.329780101776123, + "learning_rate": 5.3421954227764836e-06, + "loss": 0.1591, + "step": 34739 + }, + { + "epoch": 2.0711882799782964, + "grad_norm": 3.9284708499908447, + "learning_rate": 5.342058380156229e-06, + "loss": 0.1275, + "step": 34740 + }, + { + "epoch": 2.071201844818231, + "grad_norm": 4.115987300872803, + "learning_rate": 5.341921337535975e-06, + "loss": 0.1794, + "step": 34741 + }, + { + "epoch": 2.071215409658166, + "grad_norm": 6.455690383911133, + "learning_rate": 5.341784294915719e-06, + "loss": 0.1846, + "step": 34742 + }, + { + "epoch": 2.071228974498101, + "grad_norm": 4.957980632781982, + "learning_rate": 5.341647252295464e-06, + "loss": 0.1757, + "step": 34743 + }, + { + "epoch": 2.071242539338036, + "grad_norm": 7.934102535247803, + "learning_rate": 5.3415102096752094e-06, + "loss": 0.1866, + "step": 34744 + }, + { + "epoch": 2.0712561041779707, + "grad_norm": 4.651479244232178, + "learning_rate": 5.341373167054954e-06, + "loss": 0.2013, + "step": 34745 + }, + { + "epoch": 2.0712696690179055, + "grad_norm": 4.512563228607178, + "learning_rate": 5.3412361244347e-06, + "loss": 0.142, + "step": 34746 + }, + { + "epoch": 2.0712832338578404, + "grad_norm": 4.933764457702637, + "learning_rate": 5.341099081814445e-06, + "loss": 0.1373, + "step": 34747 + }, + { + "epoch": 2.0712967986977753, + "grad_norm": 4.745753288269043, + "learning_rate": 5.340962039194189e-06, + "loss": 0.1158, + "step": 34748 + }, + { + "epoch": 2.07131036353771, + "grad_norm": 4.847140312194824, + "learning_rate": 5.3408249965739345e-06, + "loss": 0.1303, + "step": 34749 + }, + { + "epoch": 2.071323928377645, + "grad_norm": 5.664739608764648, + "learning_rate": 5.3406879539536805e-06, + "loss": 0.1645, + "step": 34750 + }, + { + "epoch": 2.07133749321758, + "grad_norm": 4.929425239562988, + "learning_rate": 5.340550911333426e-06, + "loss": 0.238, + "step": 34751 + }, + { + "epoch": 2.0713510580575147, + "grad_norm": 5.353986740112305, + "learning_rate": 5.34041386871317e-06, + "loss": 0.2486, + "step": 34752 + }, + { + "epoch": 2.0713646228974496, + "grad_norm": 6.006464958190918, + "learning_rate": 5.340276826092915e-06, + "loss": 0.2242, + "step": 34753 + }, + { + "epoch": 2.071378187737385, + "grad_norm": 3.06673264503479, + "learning_rate": 5.340139783472661e-06, + "loss": 0.0892, + "step": 34754 + }, + { + "epoch": 2.07139175257732, + "grad_norm": 4.544408321380615, + "learning_rate": 5.3400027408524056e-06, + "loss": 0.1742, + "step": 34755 + }, + { + "epoch": 2.0714053174172546, + "grad_norm": 5.229532241821289, + "learning_rate": 5.339865698232151e-06, + "loss": 0.1337, + "step": 34756 + }, + { + "epoch": 2.0714188822571895, + "grad_norm": 3.9333720207214355, + "learning_rate": 5.339728655611895e-06, + "loss": 0.1466, + "step": 34757 + }, + { + "epoch": 2.0714324470971244, + "grad_norm": 3.1296019554138184, + "learning_rate": 5.33959161299164e-06, + "loss": 0.1321, + "step": 34758 + }, + { + "epoch": 2.0714460119370592, + "grad_norm": 6.08425235748291, + "learning_rate": 5.339454570371386e-06, + "loss": 0.1914, + "step": 34759 + }, + { + "epoch": 2.071459576776994, + "grad_norm": 5.934654235839844, + "learning_rate": 5.3393175277511315e-06, + "loss": 0.1875, + "step": 34760 + }, + { + "epoch": 2.071473141616929, + "grad_norm": 4.617459297180176, + "learning_rate": 5.339180485130876e-06, + "loss": 0.1561, + "step": 34761 + }, + { + "epoch": 2.071486706456864, + "grad_norm": 3.5906169414520264, + "learning_rate": 5.339043442510621e-06, + "loss": 0.154, + "step": 34762 + }, + { + "epoch": 2.0715002712967987, + "grad_norm": 3.223958730697632, + "learning_rate": 5.338906399890367e-06, + "loss": 0.0919, + "step": 34763 + }, + { + "epoch": 2.0715138361367336, + "grad_norm": 5.372805118560791, + "learning_rate": 5.338769357270111e-06, + "loss": 0.217, + "step": 34764 + }, + { + "epoch": 2.0715274009766684, + "grad_norm": 4.0696187019348145, + "learning_rate": 5.3386323146498565e-06, + "loss": 0.1215, + "step": 34765 + }, + { + "epoch": 2.0715409658166033, + "grad_norm": 5.2248125076293945, + "learning_rate": 5.338495272029602e-06, + "loss": 0.1598, + "step": 34766 + }, + { + "epoch": 2.071554530656538, + "grad_norm": 4.692464828491211, + "learning_rate": 5.338358229409347e-06, + "loss": 0.1702, + "step": 34767 + }, + { + "epoch": 2.071568095496473, + "grad_norm": 5.493951797485352, + "learning_rate": 5.338221186789092e-06, + "loss": 0.1693, + "step": 34768 + }, + { + "epoch": 2.071581660336408, + "grad_norm": 4.196450710296631, + "learning_rate": 5.338084144168837e-06, + "loss": 0.1573, + "step": 34769 + }, + { + "epoch": 2.0715952251763428, + "grad_norm": 3.5699565410614014, + "learning_rate": 5.3379471015485816e-06, + "loss": 0.1933, + "step": 34770 + }, + { + "epoch": 2.0716087900162776, + "grad_norm": 3.651336669921875, + "learning_rate": 5.337810058928327e-06, + "loss": 0.1272, + "step": 34771 + }, + { + "epoch": 2.0716223548562125, + "grad_norm": 3.4576497077941895, + "learning_rate": 5.337673016308073e-06, + "loss": 0.1103, + "step": 34772 + }, + { + "epoch": 2.071635919696148, + "grad_norm": 4.496460914611816, + "learning_rate": 5.337535973687817e-06, + "loss": 0.1426, + "step": 34773 + }, + { + "epoch": 2.0716494845360827, + "grad_norm": 6.473145008087158, + "learning_rate": 5.337398931067562e-06, + "loss": 0.1944, + "step": 34774 + }, + { + "epoch": 2.0716630493760175, + "grad_norm": 5.063573360443115, + "learning_rate": 5.3372618884473075e-06, + "loss": 0.1886, + "step": 34775 + }, + { + "epoch": 2.0716766142159524, + "grad_norm": 4.294363498687744, + "learning_rate": 5.337124845827053e-06, + "loss": 0.1514, + "step": 34776 + }, + { + "epoch": 2.0716901790558873, + "grad_norm": 6.052708148956299, + "learning_rate": 5.336987803206798e-06, + "loss": 0.2865, + "step": 34777 + }, + { + "epoch": 2.071703743895822, + "grad_norm": 4.709859371185303, + "learning_rate": 5.336850760586543e-06, + "loss": 0.3363, + "step": 34778 + }, + { + "epoch": 2.071717308735757, + "grad_norm": 3.6808841228485107, + "learning_rate": 5.336713717966287e-06, + "loss": 0.1595, + "step": 34779 + }, + { + "epoch": 2.071730873575692, + "grad_norm": 3.809803009033203, + "learning_rate": 5.336576675346033e-06, + "loss": 0.1522, + "step": 34780 + }, + { + "epoch": 2.0717444384156267, + "grad_norm": 5.235830783843994, + "learning_rate": 5.3364396327257785e-06, + "loss": 0.1949, + "step": 34781 + }, + { + "epoch": 2.0717580032555616, + "grad_norm": 5.939822196960449, + "learning_rate": 5.336302590105523e-06, + "loss": 0.2232, + "step": 34782 + }, + { + "epoch": 2.0717715680954965, + "grad_norm": 4.030106544494629, + "learning_rate": 5.336165547485268e-06, + "loss": 0.1713, + "step": 34783 + }, + { + "epoch": 2.0717851329354313, + "grad_norm": 6.434859752655029, + "learning_rate": 5.336028504865014e-06, + "loss": 0.2359, + "step": 34784 + }, + { + "epoch": 2.071798697775366, + "grad_norm": 3.608957290649414, + "learning_rate": 5.335891462244759e-06, + "loss": 0.1514, + "step": 34785 + }, + { + "epoch": 2.071812262615301, + "grad_norm": 3.3514490127563477, + "learning_rate": 5.3357544196245036e-06, + "loss": 0.1532, + "step": 34786 + }, + { + "epoch": 2.071825827455236, + "grad_norm": 6.178299427032471, + "learning_rate": 5.335617377004249e-06, + "loss": 0.2337, + "step": 34787 + }, + { + "epoch": 2.071839392295171, + "grad_norm": 5.841312408447266, + "learning_rate": 5.335480334383993e-06, + "loss": 0.1647, + "step": 34788 + }, + { + "epoch": 2.0718529571351056, + "grad_norm": 4.606271266937256, + "learning_rate": 5.335343291763739e-06, + "loss": 0.2251, + "step": 34789 + }, + { + "epoch": 2.0718665219750405, + "grad_norm": 4.535763740539551, + "learning_rate": 5.335206249143484e-06, + "loss": 0.1646, + "step": 34790 + }, + { + "epoch": 2.0718800868149754, + "grad_norm": 5.889235973358154, + "learning_rate": 5.335069206523229e-06, + "loss": 0.2034, + "step": 34791 + }, + { + "epoch": 2.0718936516549107, + "grad_norm": 4.9317402839660645, + "learning_rate": 5.334932163902974e-06, + "loss": 0.1602, + "step": 34792 + }, + { + "epoch": 2.0719072164948455, + "grad_norm": 5.973465442657471, + "learning_rate": 5.33479512128272e-06, + "loss": 0.2324, + "step": 34793 + }, + { + "epoch": 2.0719207813347804, + "grad_norm": 4.718503952026367, + "learning_rate": 5.334658078662465e-06, + "loss": 0.3697, + "step": 34794 + }, + { + "epoch": 2.0719343461747153, + "grad_norm": 4.4208855628967285, + "learning_rate": 5.334521036042209e-06, + "loss": 0.2228, + "step": 34795 + }, + { + "epoch": 2.07194791101465, + "grad_norm": 4.5349650382995605, + "learning_rate": 5.3343839934219545e-06, + "loss": 0.2207, + "step": 34796 + }, + { + "epoch": 2.071961475854585, + "grad_norm": 3.8726227283477783, + "learning_rate": 5.3342469508017005e-06, + "loss": 0.1838, + "step": 34797 + }, + { + "epoch": 2.07197504069452, + "grad_norm": 4.076454162597656, + "learning_rate": 5.334109908181445e-06, + "loss": 0.1563, + "step": 34798 + }, + { + "epoch": 2.0719886055344547, + "grad_norm": 4.606115818023682, + "learning_rate": 5.33397286556119e-06, + "loss": 0.152, + "step": 34799 + }, + { + "epoch": 2.0720021703743896, + "grad_norm": 4.766351222991943, + "learning_rate": 5.333835822940935e-06, + "loss": 0.1962, + "step": 34800 + }, + { + "epoch": 2.0720157352143245, + "grad_norm": 3.3953402042388916, + "learning_rate": 5.3336987803206796e-06, + "loss": 0.1257, + "step": 34801 + }, + { + "epoch": 2.0720293000542593, + "grad_norm": 3.5807809829711914, + "learning_rate": 5.333561737700426e-06, + "loss": 0.1423, + "step": 34802 + }, + { + "epoch": 2.072042864894194, + "grad_norm": 3.8959741592407227, + "learning_rate": 5.333424695080171e-06, + "loss": 0.174, + "step": 34803 + }, + { + "epoch": 2.072056429734129, + "grad_norm": 5.180213451385498, + "learning_rate": 5.333287652459915e-06, + "loss": 0.1816, + "step": 34804 + }, + { + "epoch": 2.072069994574064, + "grad_norm": 4.763204574584961, + "learning_rate": 5.33315060983966e-06, + "loss": 0.1468, + "step": 34805 + }, + { + "epoch": 2.072083559413999, + "grad_norm": 4.966068744659424, + "learning_rate": 5.333013567219406e-06, + "loss": 0.2581, + "step": 34806 + }, + { + "epoch": 2.0720971242539337, + "grad_norm": 5.437462329864502, + "learning_rate": 5.332876524599151e-06, + "loss": 0.2962, + "step": 34807 + }, + { + "epoch": 2.0721106890938685, + "grad_norm": 4.463862895965576, + "learning_rate": 5.332739481978896e-06, + "loss": 0.1656, + "step": 34808 + }, + { + "epoch": 2.0721242539338034, + "grad_norm": 4.240190505981445, + "learning_rate": 5.332602439358641e-06, + "loss": 0.1502, + "step": 34809 + }, + { + "epoch": 2.0721378187737383, + "grad_norm": 4.933957099914551, + "learning_rate": 5.332465396738386e-06, + "loss": 0.2265, + "step": 34810 + }, + { + "epoch": 2.0721513836136736, + "grad_norm": 4.653257846832275, + "learning_rate": 5.332328354118131e-06, + "loss": 0.2113, + "step": 34811 + }, + { + "epoch": 2.0721649484536084, + "grad_norm": 4.417355537414551, + "learning_rate": 5.3321913114978765e-06, + "loss": 0.1573, + "step": 34812 + }, + { + "epoch": 2.0721785132935433, + "grad_norm": 3.822782039642334, + "learning_rate": 5.332054268877621e-06, + "loss": 0.1596, + "step": 34813 + }, + { + "epoch": 2.072192078133478, + "grad_norm": 3.834489107131958, + "learning_rate": 5.331917226257366e-06, + "loss": 0.1063, + "step": 34814 + }, + { + "epoch": 2.072205642973413, + "grad_norm": 5.441555023193359, + "learning_rate": 5.331780183637112e-06, + "loss": 0.2401, + "step": 34815 + }, + { + "epoch": 2.072219207813348, + "grad_norm": 5.38484525680542, + "learning_rate": 5.331643141016856e-06, + "loss": 0.2282, + "step": 34816 + }, + { + "epoch": 2.0722327726532828, + "grad_norm": 4.621110916137695, + "learning_rate": 5.331506098396602e-06, + "loss": 0.3977, + "step": 34817 + }, + { + "epoch": 2.0722463374932176, + "grad_norm": 4.476343631744385, + "learning_rate": 5.331369055776347e-06, + "loss": 0.1673, + "step": 34818 + }, + { + "epoch": 2.0722599023331525, + "grad_norm": 6.129164695739746, + "learning_rate": 5.331232013156093e-06, + "loss": 0.2862, + "step": 34819 + }, + { + "epoch": 2.0722734671730874, + "grad_norm": 3.326035737991333, + "learning_rate": 5.331094970535837e-06, + "loss": 0.1329, + "step": 34820 + }, + { + "epoch": 2.0722870320130222, + "grad_norm": 6.974586486816406, + "learning_rate": 5.330957927915582e-06, + "loss": 0.2439, + "step": 34821 + }, + { + "epoch": 2.072300596852957, + "grad_norm": 4.4412312507629395, + "learning_rate": 5.330820885295327e-06, + "loss": 0.2027, + "step": 34822 + }, + { + "epoch": 2.072314161692892, + "grad_norm": 3.7742207050323486, + "learning_rate": 5.330683842675073e-06, + "loss": 0.1252, + "step": 34823 + }, + { + "epoch": 2.072327726532827, + "grad_norm": 4.281974792480469, + "learning_rate": 5.330546800054818e-06, + "loss": 0.1793, + "step": 34824 + }, + { + "epoch": 2.0723412913727617, + "grad_norm": 5.1305365562438965, + "learning_rate": 5.330409757434562e-06, + "loss": 0.2694, + "step": 34825 + }, + { + "epoch": 2.0723548562126965, + "grad_norm": 4.255083084106445, + "learning_rate": 5.330272714814307e-06, + "loss": 0.2357, + "step": 34826 + }, + { + "epoch": 2.0723684210526314, + "grad_norm": 4.0927886962890625, + "learning_rate": 5.3301356721940525e-06, + "loss": 0.1515, + "step": 34827 + }, + { + "epoch": 2.0723819858925663, + "grad_norm": 4.906242847442627, + "learning_rate": 5.3299986295737985e-06, + "loss": 0.2614, + "step": 34828 + }, + { + "epoch": 2.072395550732501, + "grad_norm": 5.337129592895508, + "learning_rate": 5.329861586953543e-06, + "loss": 0.2495, + "step": 34829 + }, + { + "epoch": 2.0724091155724365, + "grad_norm": 4.339972496032715, + "learning_rate": 5.329724544333288e-06, + "loss": 0.1768, + "step": 34830 + }, + { + "epoch": 2.0724226804123713, + "grad_norm": 5.212459564208984, + "learning_rate": 5.329587501713032e-06, + "loss": 0.1843, + "step": 34831 + }, + { + "epoch": 2.072436245252306, + "grad_norm": 5.552265644073486, + "learning_rate": 5.329450459092778e-06, + "loss": 0.2281, + "step": 34832 + }, + { + "epoch": 2.072449810092241, + "grad_norm": 3.848606586456299, + "learning_rate": 5.329313416472524e-06, + "loss": 0.1666, + "step": 34833 + }, + { + "epoch": 2.072463374932176, + "grad_norm": 5.611324787139893, + "learning_rate": 5.329176373852269e-06, + "loss": 0.2518, + "step": 34834 + }, + { + "epoch": 2.072476939772111, + "grad_norm": 7.2232985496521, + "learning_rate": 5.329039331232013e-06, + "loss": 0.2705, + "step": 34835 + }, + { + "epoch": 2.0724905046120456, + "grad_norm": 4.662771701812744, + "learning_rate": 5.328902288611759e-06, + "loss": 0.1297, + "step": 34836 + }, + { + "epoch": 2.0725040694519805, + "grad_norm": 6.052548408508301, + "learning_rate": 5.328765245991504e-06, + "loss": 0.3003, + "step": 34837 + }, + { + "epoch": 2.0725176342919154, + "grad_norm": 2.9692111015319824, + "learning_rate": 5.328628203371249e-06, + "loss": 0.1284, + "step": 34838 + }, + { + "epoch": 2.0725311991318502, + "grad_norm": 4.170711994171143, + "learning_rate": 5.328491160750994e-06, + "loss": 0.1643, + "step": 34839 + }, + { + "epoch": 2.072544763971785, + "grad_norm": 7.524317264556885, + "learning_rate": 5.328354118130738e-06, + "loss": 0.3137, + "step": 34840 + }, + { + "epoch": 2.07255832881172, + "grad_norm": 6.269019603729248, + "learning_rate": 5.328217075510484e-06, + "loss": 0.267, + "step": 34841 + }, + { + "epoch": 2.072571893651655, + "grad_norm": 3.911083698272705, + "learning_rate": 5.328080032890229e-06, + "loss": 0.2204, + "step": 34842 + }, + { + "epoch": 2.0725854584915897, + "grad_norm": 3.800936460494995, + "learning_rate": 5.3279429902699745e-06, + "loss": 0.141, + "step": 34843 + }, + { + "epoch": 2.0725990233315246, + "grad_norm": 5.279613494873047, + "learning_rate": 5.327805947649719e-06, + "loss": 0.1918, + "step": 34844 + }, + { + "epoch": 2.0726125881714594, + "grad_norm": 6.516115188598633, + "learning_rate": 5.327668905029465e-06, + "loss": 0.2565, + "step": 34845 + }, + { + "epoch": 2.0726261530113943, + "grad_norm": 5.174530982971191, + "learning_rate": 5.32753186240921e-06, + "loss": 0.2247, + "step": 34846 + }, + { + "epoch": 2.072639717851329, + "grad_norm": 3.9618654251098633, + "learning_rate": 5.327394819788954e-06, + "loss": 0.1578, + "step": 34847 + }, + { + "epoch": 2.072653282691264, + "grad_norm": 4.608860492706299, + "learning_rate": 5.3272577771687e-06, + "loss": 0.1466, + "step": 34848 + }, + { + "epoch": 2.0726668475311993, + "grad_norm": 5.961297988891602, + "learning_rate": 5.327120734548446e-06, + "loss": 0.2071, + "step": 34849 + }, + { + "epoch": 2.072680412371134, + "grad_norm": 4.812244892120361, + "learning_rate": 5.32698369192819e-06, + "loss": 0.1848, + "step": 34850 + }, + { + "epoch": 2.072693977211069, + "grad_norm": 6.9489216804504395, + "learning_rate": 5.326846649307935e-06, + "loss": 0.3526, + "step": 34851 + }, + { + "epoch": 2.072707542051004, + "grad_norm": 4.781084060668945, + "learning_rate": 5.32670960668768e-06, + "loss": 0.2006, + "step": 34852 + }, + { + "epoch": 2.072721106890939, + "grad_norm": 3.9538068771362305, + "learning_rate": 5.326572564067426e-06, + "loss": 0.1941, + "step": 34853 + }, + { + "epoch": 2.0727346717308737, + "grad_norm": 7.572839736938477, + "learning_rate": 5.326435521447171e-06, + "loss": 0.4721, + "step": 34854 + }, + { + "epoch": 2.0727482365708085, + "grad_norm": 4.386140823364258, + "learning_rate": 5.326298478826916e-06, + "loss": 0.3008, + "step": 34855 + }, + { + "epoch": 2.0727618014107434, + "grad_norm": 4.82869815826416, + "learning_rate": 5.32616143620666e-06, + "loss": 0.2089, + "step": 34856 + }, + { + "epoch": 2.0727753662506783, + "grad_norm": 4.448416233062744, + "learning_rate": 5.326024393586405e-06, + "loss": 0.2298, + "step": 34857 + }, + { + "epoch": 2.072788931090613, + "grad_norm": 4.891351699829102, + "learning_rate": 5.325887350966151e-06, + "loss": 0.2127, + "step": 34858 + }, + { + "epoch": 2.072802495930548, + "grad_norm": 5.486021995544434, + "learning_rate": 5.325750308345896e-06, + "loss": 0.2771, + "step": 34859 + }, + { + "epoch": 2.072816060770483, + "grad_norm": 7.084805965423584, + "learning_rate": 5.325613265725641e-06, + "loss": 0.4774, + "step": 34860 + }, + { + "epoch": 2.0728296256104177, + "grad_norm": 4.199751377105713, + "learning_rate": 5.325476223105386e-06, + "loss": 0.2093, + "step": 34861 + }, + { + "epoch": 2.0728431904503526, + "grad_norm": 4.44636869430542, + "learning_rate": 5.325339180485132e-06, + "loss": 0.181, + "step": 34862 + }, + { + "epoch": 2.0728567552902875, + "grad_norm": 4.683556079864502, + "learning_rate": 5.325202137864876e-06, + "loss": 0.2644, + "step": 34863 + }, + { + "epoch": 2.0728703201302223, + "grad_norm": 4.0002899169921875, + "learning_rate": 5.325065095244622e-06, + "loss": 0.2138, + "step": 34864 + }, + { + "epoch": 2.072883884970157, + "grad_norm": 5.7647857666015625, + "learning_rate": 5.324928052624366e-06, + "loss": 0.3232, + "step": 34865 + }, + { + "epoch": 2.072897449810092, + "grad_norm": 5.574615478515625, + "learning_rate": 5.324791010004112e-06, + "loss": 0.1717, + "step": 34866 + }, + { + "epoch": 2.072911014650027, + "grad_norm": 4.772979736328125, + "learning_rate": 5.324653967383857e-06, + "loss": 0.2059, + "step": 34867 + }, + { + "epoch": 2.0729245794899622, + "grad_norm": 5.150420665740967, + "learning_rate": 5.324516924763602e-06, + "loss": 0.3225, + "step": 34868 + }, + { + "epoch": 2.072938144329897, + "grad_norm": 4.202169895172119, + "learning_rate": 5.324379882143347e-06, + "loss": 0.1635, + "step": 34869 + }, + { + "epoch": 2.072951709169832, + "grad_norm": 5.453418731689453, + "learning_rate": 5.324242839523092e-06, + "loss": 0.2602, + "step": 34870 + }, + { + "epoch": 2.072965274009767, + "grad_norm": 4.568401336669922, + "learning_rate": 5.324105796902838e-06, + "loss": 0.142, + "step": 34871 + }, + { + "epoch": 2.0729788388497017, + "grad_norm": 7.579629421234131, + "learning_rate": 5.323968754282582e-06, + "loss": 0.3326, + "step": 34872 + }, + { + "epoch": 2.0729924036896366, + "grad_norm": 4.213592529296875, + "learning_rate": 5.323831711662327e-06, + "loss": 0.2257, + "step": 34873 + }, + { + "epoch": 2.0730059685295714, + "grad_norm": 4.462466239929199, + "learning_rate": 5.323694669042072e-06, + "loss": 0.2115, + "step": 34874 + }, + { + "epoch": 2.0730195333695063, + "grad_norm": 6.647542476654053, + "learning_rate": 5.323557626421818e-06, + "loss": 0.3723, + "step": 34875 + }, + { + "epoch": 2.073033098209441, + "grad_norm": 6.430266857147217, + "learning_rate": 5.323420583801563e-06, + "loss": 0.2399, + "step": 34876 + }, + { + "epoch": 2.073046663049376, + "grad_norm": 6.058624744415283, + "learning_rate": 5.323283541181308e-06, + "loss": 0.2149, + "step": 34877 + }, + { + "epoch": 2.073060227889311, + "grad_norm": 5.280872344970703, + "learning_rate": 5.323146498561052e-06, + "loss": 0.2375, + "step": 34878 + }, + { + "epoch": 2.0730737927292457, + "grad_norm": 4.037337303161621, + "learning_rate": 5.3230094559407984e-06, + "loss": 0.211, + "step": 34879 + }, + { + "epoch": 2.0730873575691806, + "grad_norm": 5.242565155029297, + "learning_rate": 5.322872413320544e-06, + "loss": 0.3354, + "step": 34880 + }, + { + "epoch": 2.0731009224091155, + "grad_norm": 6.011491298675537, + "learning_rate": 5.322735370700288e-06, + "loss": 0.3335, + "step": 34881 + }, + { + "epoch": 2.0731144872490503, + "grad_norm": 5.266612529754639, + "learning_rate": 5.322598328080033e-06, + "loss": 0.3233, + "step": 34882 + }, + { + "epoch": 2.073128052088985, + "grad_norm": 4.497291564941406, + "learning_rate": 5.322461285459778e-06, + "loss": 0.2857, + "step": 34883 + }, + { + "epoch": 2.07314161692892, + "grad_norm": 4.172420501708984, + "learning_rate": 5.3223242428395235e-06, + "loss": 0.1971, + "step": 34884 + }, + { + "epoch": 2.073155181768855, + "grad_norm": 4.349915981292725, + "learning_rate": 5.322187200219269e-06, + "loss": 0.1243, + "step": 34885 + }, + { + "epoch": 2.07316874660879, + "grad_norm": 5.935385704040527, + "learning_rate": 5.322050157599014e-06, + "loss": 0.1865, + "step": 34886 + }, + { + "epoch": 2.073182311448725, + "grad_norm": 4.4957804679870605, + "learning_rate": 5.321913114978758e-06, + "loss": 0.1372, + "step": 34887 + }, + { + "epoch": 2.07319587628866, + "grad_norm": 3.882589101791382, + "learning_rate": 5.321776072358504e-06, + "loss": 0.1735, + "step": 34888 + }, + { + "epoch": 2.073209441128595, + "grad_norm": 4.4493184089660645, + "learning_rate": 5.321639029738249e-06, + "loss": 0.2087, + "step": 34889 + }, + { + "epoch": 2.0732230059685297, + "grad_norm": 6.07926607131958, + "learning_rate": 5.321501987117994e-06, + "loss": 0.3192, + "step": 34890 + }, + { + "epoch": 2.0732365708084646, + "grad_norm": 5.4420318603515625, + "learning_rate": 5.321364944497739e-06, + "loss": 0.4061, + "step": 34891 + }, + { + "epoch": 2.0732501356483994, + "grad_norm": 2.9847888946533203, + "learning_rate": 5.321227901877485e-06, + "loss": 0.1294, + "step": 34892 + }, + { + "epoch": 2.0732637004883343, + "grad_norm": 5.020192623138428, + "learning_rate": 5.32109085925723e-06, + "loss": 0.1776, + "step": 34893 + }, + { + "epoch": 2.073277265328269, + "grad_norm": 6.353851318359375, + "learning_rate": 5.3209538166369744e-06, + "loss": 0.3699, + "step": 34894 + }, + { + "epoch": 2.073290830168204, + "grad_norm": 5.2399492263793945, + "learning_rate": 5.32081677401672e-06, + "loss": 0.368, + "step": 34895 + }, + { + "epoch": 2.073304395008139, + "grad_norm": 4.941635608673096, + "learning_rate": 5.320679731396464e-06, + "loss": 0.2281, + "step": 34896 + }, + { + "epoch": 2.0733179598480738, + "grad_norm": 7.003410816192627, + "learning_rate": 5.32054268877621e-06, + "loss": 0.2448, + "step": 34897 + }, + { + "epoch": 2.0733315246880086, + "grad_norm": 4.766111373901367, + "learning_rate": 5.320405646155955e-06, + "loss": 0.2391, + "step": 34898 + }, + { + "epoch": 2.0733450895279435, + "grad_norm": 4.318088531494141, + "learning_rate": 5.3202686035356995e-06, + "loss": 0.1383, + "step": 34899 + }, + { + "epoch": 2.0733586543678784, + "grad_norm": 5.4318013191223145, + "learning_rate": 5.320131560915445e-06, + "loss": 0.1516, + "step": 34900 + }, + { + "epoch": 2.0733722192078132, + "grad_norm": 4.268290996551514, + "learning_rate": 5.319994518295191e-06, + "loss": 0.1481, + "step": 34901 + }, + { + "epoch": 2.073385784047748, + "grad_norm": 4.022075653076172, + "learning_rate": 5.319857475674936e-06, + "loss": 0.2233, + "step": 34902 + }, + { + "epoch": 2.073399348887683, + "grad_norm": 4.56826114654541, + "learning_rate": 5.31972043305468e-06, + "loss": 0.1892, + "step": 34903 + }, + { + "epoch": 2.073412913727618, + "grad_norm": 4.141087055206299, + "learning_rate": 5.319583390434425e-06, + "loss": 0.1728, + "step": 34904 + }, + { + "epoch": 2.0734264785675527, + "grad_norm": 5.57880163192749, + "learning_rate": 5.319446347814171e-06, + "loss": 0.2559, + "step": 34905 + }, + { + "epoch": 2.073440043407488, + "grad_norm": 3.7800722122192383, + "learning_rate": 5.319309305193916e-06, + "loss": 0.2089, + "step": 34906 + }, + { + "epoch": 2.073453608247423, + "grad_norm": 6.316837310791016, + "learning_rate": 5.319172262573661e-06, + "loss": 0.2551, + "step": 34907 + }, + { + "epoch": 2.0734671730873577, + "grad_norm": 4.297363758087158, + "learning_rate": 5.319035219953406e-06, + "loss": 0.1999, + "step": 34908 + }, + { + "epoch": 2.0734807379272926, + "grad_norm": 4.900580406188965, + "learning_rate": 5.31889817733315e-06, + "loss": 0.2708, + "step": 34909 + }, + { + "epoch": 2.0734943027672275, + "grad_norm": 4.498155117034912, + "learning_rate": 5.3187611347128964e-06, + "loss": 0.173, + "step": 34910 + }, + { + "epoch": 2.0735078676071623, + "grad_norm": 4.885806083679199, + "learning_rate": 5.318624092092642e-06, + "loss": 0.1721, + "step": 34911 + }, + { + "epoch": 2.073521432447097, + "grad_norm": 4.699012279510498, + "learning_rate": 5.318487049472386e-06, + "loss": 0.1873, + "step": 34912 + }, + { + "epoch": 2.073534997287032, + "grad_norm": 5.8119940757751465, + "learning_rate": 5.318350006852131e-06, + "loss": 0.2184, + "step": 34913 + }, + { + "epoch": 2.073548562126967, + "grad_norm": 5.519835948944092, + "learning_rate": 5.318212964231877e-06, + "loss": 0.2142, + "step": 34914 + }, + { + "epoch": 2.073562126966902, + "grad_norm": 4.438675403594971, + "learning_rate": 5.3180759216116215e-06, + "loss": 0.2251, + "step": 34915 + }, + { + "epoch": 2.0735756918068367, + "grad_norm": 3.310025691986084, + "learning_rate": 5.317938878991367e-06, + "loss": 0.1331, + "step": 34916 + }, + { + "epoch": 2.0735892566467715, + "grad_norm": 5.420127868652344, + "learning_rate": 5.317801836371112e-06, + "loss": 0.2481, + "step": 34917 + }, + { + "epoch": 2.0736028214867064, + "grad_norm": 4.589391708374023, + "learning_rate": 5.317664793750857e-06, + "loss": 0.1828, + "step": 34918 + }, + { + "epoch": 2.0736163863266412, + "grad_norm": 4.770117282867432, + "learning_rate": 5.317527751130602e-06, + "loss": 0.1944, + "step": 34919 + }, + { + "epoch": 2.073629951166576, + "grad_norm": 4.474983215332031, + "learning_rate": 5.317390708510347e-06, + "loss": 0.1745, + "step": 34920 + }, + { + "epoch": 2.073643516006511, + "grad_norm": 3.509171485900879, + "learning_rate": 5.317253665890092e-06, + "loss": 0.1381, + "step": 34921 + }, + { + "epoch": 2.073657080846446, + "grad_norm": 3.564872980117798, + "learning_rate": 5.317116623269838e-06, + "loss": 0.1431, + "step": 34922 + }, + { + "epoch": 2.0736706456863807, + "grad_norm": 4.950373649597168, + "learning_rate": 5.316979580649583e-06, + "loss": 0.3176, + "step": 34923 + }, + { + "epoch": 2.0736842105263156, + "grad_norm": 6.017450332641602, + "learning_rate": 5.316842538029327e-06, + "loss": 0.2165, + "step": 34924 + }, + { + "epoch": 2.073697775366251, + "grad_norm": 4.102934837341309, + "learning_rate": 5.3167054954090724e-06, + "loss": 0.2338, + "step": 34925 + }, + { + "epoch": 2.0737113402061857, + "grad_norm": 6.652548789978027, + "learning_rate": 5.316568452788818e-06, + "loss": 0.3482, + "step": 34926 + }, + { + "epoch": 2.0737249050461206, + "grad_norm": 4.152266025543213, + "learning_rate": 5.316431410168564e-06, + "loss": 0.3565, + "step": 34927 + }, + { + "epoch": 2.0737384698860555, + "grad_norm": 4.2773756980896, + "learning_rate": 5.316294367548308e-06, + "loss": 0.1555, + "step": 34928 + }, + { + "epoch": 2.0737520347259903, + "grad_norm": 4.113269329071045, + "learning_rate": 5.316157324928053e-06, + "loss": 0.2098, + "step": 34929 + }, + { + "epoch": 2.073765599565925, + "grad_norm": 4.742950916290283, + "learning_rate": 5.3160202823077975e-06, + "loss": 0.2445, + "step": 34930 + }, + { + "epoch": 2.07377916440586, + "grad_norm": 3.2364776134490967, + "learning_rate": 5.3158832396875435e-06, + "loss": 0.1195, + "step": 34931 + }, + { + "epoch": 2.073792729245795, + "grad_norm": 4.248806953430176, + "learning_rate": 5.315746197067289e-06, + "loss": 0.1557, + "step": 34932 + }, + { + "epoch": 2.07380629408573, + "grad_norm": 3.3680472373962402, + "learning_rate": 5.315609154447033e-06, + "loss": 0.2578, + "step": 34933 + }, + { + "epoch": 2.0738198589256647, + "grad_norm": 6.166984558105469, + "learning_rate": 5.315472111826778e-06, + "loss": 0.2051, + "step": 34934 + }, + { + "epoch": 2.0738334237655995, + "grad_norm": 3.3668618202209473, + "learning_rate": 5.315335069206524e-06, + "loss": 0.1551, + "step": 34935 + }, + { + "epoch": 2.0738469886055344, + "grad_norm": 4.011549472808838, + "learning_rate": 5.315198026586269e-06, + "loss": 0.204, + "step": 34936 + }, + { + "epoch": 2.0738605534454693, + "grad_norm": 5.19244384765625, + "learning_rate": 5.315060983966014e-06, + "loss": 0.1412, + "step": 34937 + }, + { + "epoch": 2.073874118285404, + "grad_norm": 4.4606547355651855, + "learning_rate": 5.314923941345759e-06, + "loss": 0.3399, + "step": 34938 + }, + { + "epoch": 2.073887683125339, + "grad_norm": 6.014185905456543, + "learning_rate": 5.314786898725503e-06, + "loss": 0.2327, + "step": 34939 + }, + { + "epoch": 2.073901247965274, + "grad_norm": 5.0526862144470215, + "learning_rate": 5.314649856105249e-06, + "loss": 0.1938, + "step": 34940 + }, + { + "epoch": 2.0739148128052087, + "grad_norm": 3.957261562347412, + "learning_rate": 5.3145128134849944e-06, + "loss": 0.1343, + "step": 34941 + }, + { + "epoch": 2.0739283776451436, + "grad_norm": 6.383306503295898, + "learning_rate": 5.31437577086474e-06, + "loss": 0.2293, + "step": 34942 + }, + { + "epoch": 2.073941942485079, + "grad_norm": 3.403320550918579, + "learning_rate": 5.314238728244484e-06, + "loss": 0.0913, + "step": 34943 + }, + { + "epoch": 2.0739555073250138, + "grad_norm": 4.782614707946777, + "learning_rate": 5.31410168562423e-06, + "loss": 0.1599, + "step": 34944 + }, + { + "epoch": 2.0739690721649486, + "grad_norm": 4.402663707733154, + "learning_rate": 5.313964643003975e-06, + "loss": 0.191, + "step": 34945 + }, + { + "epoch": 2.0739826370048835, + "grad_norm": 4.761346817016602, + "learning_rate": 5.3138276003837195e-06, + "loss": 0.2136, + "step": 34946 + }, + { + "epoch": 2.0739962018448184, + "grad_norm": 5.670767307281494, + "learning_rate": 5.313690557763465e-06, + "loss": 0.1676, + "step": 34947 + }, + { + "epoch": 2.0740097666847532, + "grad_norm": 5.7455973625183105, + "learning_rate": 5.313553515143211e-06, + "loss": 0.2207, + "step": 34948 + }, + { + "epoch": 2.074023331524688, + "grad_norm": 4.016399383544922, + "learning_rate": 5.313416472522955e-06, + "loss": 0.1346, + "step": 34949 + }, + { + "epoch": 2.074036896364623, + "grad_norm": 3.4658379554748535, + "learning_rate": 5.3132794299027e-06, + "loss": 0.1614, + "step": 34950 + }, + { + "epoch": 2.074050461204558, + "grad_norm": 4.453837871551514, + "learning_rate": 5.313142387282445e-06, + "loss": 0.1456, + "step": 34951 + }, + { + "epoch": 2.0740640260444927, + "grad_norm": 4.861586570739746, + "learning_rate": 5.31300534466219e-06, + "loss": 0.139, + "step": 34952 + }, + { + "epoch": 2.0740775908844276, + "grad_norm": 4.449578762054443, + "learning_rate": 5.312868302041936e-06, + "loss": 0.1571, + "step": 34953 + }, + { + "epoch": 2.0740911557243624, + "grad_norm": 3.2317745685577393, + "learning_rate": 5.312731259421681e-06, + "loss": 0.0989, + "step": 34954 + }, + { + "epoch": 2.0741047205642973, + "grad_norm": 3.9194273948669434, + "learning_rate": 5.312594216801425e-06, + "loss": 0.1599, + "step": 34955 + }, + { + "epoch": 2.074118285404232, + "grad_norm": 5.401934623718262, + "learning_rate": 5.3124571741811704e-06, + "loss": 0.2168, + "step": 34956 + }, + { + "epoch": 2.074131850244167, + "grad_norm": 3.118394613265991, + "learning_rate": 5.3123201315609165e-06, + "loss": 0.0935, + "step": 34957 + }, + { + "epoch": 2.074145415084102, + "grad_norm": 3.059072732925415, + "learning_rate": 5.312183088940661e-06, + "loss": 0.0852, + "step": 34958 + }, + { + "epoch": 2.0741589799240367, + "grad_norm": 4.874688625335693, + "learning_rate": 5.312046046320406e-06, + "loss": 0.1498, + "step": 34959 + }, + { + "epoch": 2.0741725447639716, + "grad_norm": 6.331234931945801, + "learning_rate": 5.311909003700151e-06, + "loss": 0.2567, + "step": 34960 + }, + { + "epoch": 2.0741861096039065, + "grad_norm": 5.01832389831543, + "learning_rate": 5.311771961079897e-06, + "loss": 0.1525, + "step": 34961 + }, + { + "epoch": 2.0741996744438413, + "grad_norm": 4.80035400390625, + "learning_rate": 5.3116349184596415e-06, + "loss": 0.1985, + "step": 34962 + }, + { + "epoch": 2.0742132392837767, + "grad_norm": 5.325833797454834, + "learning_rate": 5.311497875839387e-06, + "loss": 0.1575, + "step": 34963 + }, + { + "epoch": 2.0742268041237115, + "grad_norm": 3.3755812644958496, + "learning_rate": 5.311360833219131e-06, + "loss": 0.1195, + "step": 34964 + }, + { + "epoch": 2.0742403689636464, + "grad_norm": 3.9325997829437256, + "learning_rate": 5.311223790598876e-06, + "loss": 0.1223, + "step": 34965 + }, + { + "epoch": 2.0742539338035813, + "grad_norm": 7.263195037841797, + "learning_rate": 5.311086747978622e-06, + "loss": 0.3415, + "step": 34966 + }, + { + "epoch": 2.074267498643516, + "grad_norm": 5.2036004066467285, + "learning_rate": 5.3109497053583666e-06, + "loss": 0.2421, + "step": 34967 + }, + { + "epoch": 2.074281063483451, + "grad_norm": 4.732908725738525, + "learning_rate": 5.310812662738112e-06, + "loss": 0.1409, + "step": 34968 + }, + { + "epoch": 2.074294628323386, + "grad_norm": 4.194632053375244, + "learning_rate": 5.310675620117857e-06, + "loss": 0.1694, + "step": 34969 + }, + { + "epoch": 2.0743081931633207, + "grad_norm": 4.476887226104736, + "learning_rate": 5.310538577497603e-06, + "loss": 0.2029, + "step": 34970 + }, + { + "epoch": 2.0743217580032556, + "grad_norm": 7.919378757476807, + "learning_rate": 5.310401534877347e-06, + "loss": 0.1687, + "step": 34971 + }, + { + "epoch": 2.0743353228431904, + "grad_norm": 4.3235039710998535, + "learning_rate": 5.3102644922570924e-06, + "loss": 0.0905, + "step": 34972 + }, + { + "epoch": 2.0743488876831253, + "grad_norm": 3.6860389709472656, + "learning_rate": 5.310127449636837e-06, + "loss": 0.1446, + "step": 34973 + }, + { + "epoch": 2.07436245252306, + "grad_norm": 3.1409993171691895, + "learning_rate": 5.309990407016583e-06, + "loss": 0.1147, + "step": 34974 + }, + { + "epoch": 2.074376017362995, + "grad_norm": 4.504542827606201, + "learning_rate": 5.309853364396328e-06, + "loss": 0.156, + "step": 34975 + }, + { + "epoch": 2.07438958220293, + "grad_norm": 4.914617538452148, + "learning_rate": 5.309716321776073e-06, + "loss": 0.1938, + "step": 34976 + }, + { + "epoch": 2.0744031470428648, + "grad_norm": 3.8628151416778564, + "learning_rate": 5.3095792791558175e-06, + "loss": 0.1644, + "step": 34977 + }, + { + "epoch": 2.0744167118827996, + "grad_norm": 5.910713195800781, + "learning_rate": 5.309442236535563e-06, + "loss": 0.193, + "step": 34978 + }, + { + "epoch": 2.0744302767227345, + "grad_norm": 5.729589462280273, + "learning_rate": 5.309305193915309e-06, + "loss": 0.3423, + "step": 34979 + }, + { + "epoch": 2.0744438415626694, + "grad_norm": 5.291414260864258, + "learning_rate": 5.309168151295053e-06, + "loss": 0.1273, + "step": 34980 + }, + { + "epoch": 2.0744574064026047, + "grad_norm": 6.7675957679748535, + "learning_rate": 5.309031108674798e-06, + "loss": 0.2185, + "step": 34981 + }, + { + "epoch": 2.0744709712425395, + "grad_norm": 4.836896896362305, + "learning_rate": 5.3088940660545425e-06, + "loss": 0.151, + "step": 34982 + }, + { + "epoch": 2.0744845360824744, + "grad_norm": 5.335206508636475, + "learning_rate": 5.3087570234342886e-06, + "loss": 0.1755, + "step": 34983 + }, + { + "epoch": 2.0744981009224093, + "grad_norm": 5.5586838722229, + "learning_rate": 5.308619980814034e-06, + "loss": 0.1419, + "step": 34984 + }, + { + "epoch": 2.074511665762344, + "grad_norm": 5.252753257751465, + "learning_rate": 5.308482938193779e-06, + "loss": 0.3226, + "step": 34985 + }, + { + "epoch": 2.074525230602279, + "grad_norm": 4.811952590942383, + "learning_rate": 5.308345895573523e-06, + "loss": 0.1282, + "step": 34986 + }, + { + "epoch": 2.074538795442214, + "grad_norm": 4.854787349700928, + "learning_rate": 5.308208852953269e-06, + "loss": 0.1973, + "step": 34987 + }, + { + "epoch": 2.0745523602821487, + "grad_norm": 5.6599578857421875, + "learning_rate": 5.3080718103330145e-06, + "loss": 0.2131, + "step": 34988 + }, + { + "epoch": 2.0745659251220836, + "grad_norm": 4.087532043457031, + "learning_rate": 5.307934767712759e-06, + "loss": 0.1055, + "step": 34989 + }, + { + "epoch": 2.0745794899620185, + "grad_norm": 5.528558254241943, + "learning_rate": 5.307797725092504e-06, + "loss": 0.1867, + "step": 34990 + }, + { + "epoch": 2.0745930548019533, + "grad_norm": 3.849374294281006, + "learning_rate": 5.307660682472249e-06, + "loss": 0.1155, + "step": 34991 + }, + { + "epoch": 2.074606619641888, + "grad_norm": 4.90092658996582, + "learning_rate": 5.307523639851994e-06, + "loss": 0.1339, + "step": 34992 + }, + { + "epoch": 2.074620184481823, + "grad_norm": 3.614163637161255, + "learning_rate": 5.3073865972317395e-06, + "loss": 0.0948, + "step": 34993 + }, + { + "epoch": 2.074633749321758, + "grad_norm": 4.117270469665527, + "learning_rate": 5.307249554611485e-06, + "loss": 0.1331, + "step": 34994 + }, + { + "epoch": 2.074647314161693, + "grad_norm": 3.2563464641571045, + "learning_rate": 5.307112511991229e-06, + "loss": 0.1363, + "step": 34995 + }, + { + "epoch": 2.0746608790016277, + "grad_norm": 6.200369834899902, + "learning_rate": 5.306975469370975e-06, + "loss": 0.1872, + "step": 34996 + }, + { + "epoch": 2.0746744438415625, + "grad_norm": 4.505550384521484, + "learning_rate": 5.30683842675072e-06, + "loss": 0.1872, + "step": 34997 + }, + { + "epoch": 2.0746880086814974, + "grad_norm": 5.010730266571045, + "learning_rate": 5.3067013841304646e-06, + "loss": 0.1624, + "step": 34998 + }, + { + "epoch": 2.0747015735214323, + "grad_norm": 4.525468826293945, + "learning_rate": 5.30656434151021e-06, + "loss": 0.1208, + "step": 34999 + }, + { + "epoch": 2.074715138361367, + "grad_norm": 2.71012806892395, + "learning_rate": 5.306427298889956e-06, + "loss": 0.0622, + "step": 35000 + }, + { + "epoch": 2.0747287032013024, + "grad_norm": 5.552031993865967, + "learning_rate": 5.3062902562697e-06, + "loss": 0.2356, + "step": 35001 + }, + { + "epoch": 2.0747422680412373, + "grad_norm": 4.415078639984131, + "learning_rate": 5.306153213649445e-06, + "loss": 0.2663, + "step": 35002 + }, + { + "epoch": 2.074755832881172, + "grad_norm": 5.188309669494629, + "learning_rate": 5.3060161710291905e-06, + "loss": 0.1998, + "step": 35003 + }, + { + "epoch": 2.074769397721107, + "grad_norm": 3.3642160892486572, + "learning_rate": 5.3058791284089365e-06, + "loss": 0.1282, + "step": 35004 + }, + { + "epoch": 2.074782962561042, + "grad_norm": 3.8607895374298096, + "learning_rate": 5.305742085788681e-06, + "loss": 0.1458, + "step": 35005 + }, + { + "epoch": 2.0747965274009768, + "grad_norm": 7.396337509155273, + "learning_rate": 5.305605043168426e-06, + "loss": 0.1754, + "step": 35006 + }, + { + "epoch": 2.0748100922409116, + "grad_norm": 3.6327691078186035, + "learning_rate": 5.30546800054817e-06, + "loss": 0.1137, + "step": 35007 + }, + { + "epoch": 2.0748236570808465, + "grad_norm": 3.239849805831909, + "learning_rate": 5.3053309579279155e-06, + "loss": 0.0877, + "step": 35008 + }, + { + "epoch": 2.0748372219207813, + "grad_norm": 3.608307361602783, + "learning_rate": 5.3051939153076615e-06, + "loss": 0.1327, + "step": 35009 + }, + { + "epoch": 2.074850786760716, + "grad_norm": 3.6452951431274414, + "learning_rate": 5.305056872687407e-06, + "loss": 0.1238, + "step": 35010 + }, + { + "epoch": 2.074864351600651, + "grad_norm": 5.402791976928711, + "learning_rate": 5.304919830067151e-06, + "loss": 0.1489, + "step": 35011 + }, + { + "epoch": 2.074877916440586, + "grad_norm": 3.602440357208252, + "learning_rate": 5.304782787446896e-06, + "loss": 0.0889, + "step": 35012 + }, + { + "epoch": 2.074891481280521, + "grad_norm": 2.795243263244629, + "learning_rate": 5.304645744826642e-06, + "loss": 0.0991, + "step": 35013 + }, + { + "epoch": 2.0749050461204557, + "grad_norm": 6.009124279022217, + "learning_rate": 5.3045087022063866e-06, + "loss": 0.199, + "step": 35014 + }, + { + "epoch": 2.0749186109603905, + "grad_norm": 4.802966117858887, + "learning_rate": 5.304371659586132e-06, + "loss": 0.1314, + "step": 35015 + }, + { + "epoch": 2.0749321758003254, + "grad_norm": 3.8676278591156006, + "learning_rate": 5.304234616965876e-06, + "loss": 0.1328, + "step": 35016 + }, + { + "epoch": 2.0749457406402603, + "grad_norm": 5.070106506347656, + "learning_rate": 5.304097574345622e-06, + "loss": 0.1566, + "step": 35017 + }, + { + "epoch": 2.074959305480195, + "grad_norm": 4.942478656768799, + "learning_rate": 5.303960531725367e-06, + "loss": 0.2435, + "step": 35018 + }, + { + "epoch": 2.0749728703201304, + "grad_norm": 3.8086910247802734, + "learning_rate": 5.3038234891051125e-06, + "loss": 0.2204, + "step": 35019 + }, + { + "epoch": 2.0749864351600653, + "grad_norm": 4.135288715362549, + "learning_rate": 5.303686446484857e-06, + "loss": 0.1813, + "step": 35020 + }, + { + "epoch": 2.075, + "grad_norm": 4.950632095336914, + "learning_rate": 5.303549403864602e-06, + "loss": 0.1619, + "step": 35021 + }, + { + "epoch": 2.075013564839935, + "grad_norm": 4.323246955871582, + "learning_rate": 5.303412361244348e-06, + "loss": 0.1643, + "step": 35022 + }, + { + "epoch": 2.07502712967987, + "grad_norm": 4.838996410369873, + "learning_rate": 5.303275318624092e-06, + "loss": 0.2319, + "step": 35023 + }, + { + "epoch": 2.0750406945198048, + "grad_norm": 5.379822254180908, + "learning_rate": 5.3031382760038375e-06, + "loss": 0.1809, + "step": 35024 + }, + { + "epoch": 2.0750542593597396, + "grad_norm": 3.9858694076538086, + "learning_rate": 5.303001233383583e-06, + "loss": 0.1414, + "step": 35025 + }, + { + "epoch": 2.0750678241996745, + "grad_norm": 4.369178295135498, + "learning_rate": 5.302864190763328e-06, + "loss": 0.1751, + "step": 35026 + }, + { + "epoch": 2.0750813890396094, + "grad_norm": 6.039318084716797, + "learning_rate": 5.302727148143073e-06, + "loss": 0.2268, + "step": 35027 + }, + { + "epoch": 2.0750949538795442, + "grad_norm": 4.594400405883789, + "learning_rate": 5.302590105522818e-06, + "loss": 0.1724, + "step": 35028 + }, + { + "epoch": 2.075108518719479, + "grad_norm": 6.014914035797119, + "learning_rate": 5.3024530629025626e-06, + "loss": 0.2131, + "step": 35029 + }, + { + "epoch": 2.075122083559414, + "grad_norm": 5.29360818862915, + "learning_rate": 5.302316020282309e-06, + "loss": 0.1681, + "step": 35030 + }, + { + "epoch": 2.075135648399349, + "grad_norm": 5.722127437591553, + "learning_rate": 5.302178977662054e-06, + "loss": 0.2499, + "step": 35031 + }, + { + "epoch": 2.0751492132392837, + "grad_norm": 7.115274906158447, + "learning_rate": 5.302041935041798e-06, + "loss": 0.3437, + "step": 35032 + }, + { + "epoch": 2.0751627780792186, + "grad_norm": 7.645002365112305, + "learning_rate": 5.301904892421543e-06, + "loss": 0.2965, + "step": 35033 + }, + { + "epoch": 2.0751763429191534, + "grad_norm": 3.4153294563293457, + "learning_rate": 5.3017678498012885e-06, + "loss": 0.146, + "step": 35034 + }, + { + "epoch": 2.0751899077590883, + "grad_norm": 4.228055000305176, + "learning_rate": 5.3016308071810345e-06, + "loss": 0.1905, + "step": 35035 + }, + { + "epoch": 2.075203472599023, + "grad_norm": 5.677211761474609, + "learning_rate": 5.301493764560779e-06, + "loss": 0.212, + "step": 35036 + }, + { + "epoch": 2.075217037438958, + "grad_norm": 4.528668403625488, + "learning_rate": 5.301356721940524e-06, + "loss": 0.2075, + "step": 35037 + }, + { + "epoch": 2.075230602278893, + "grad_norm": 6.271589279174805, + "learning_rate": 5.301219679320268e-06, + "loss": 0.2012, + "step": 35038 + }, + { + "epoch": 2.075244167118828, + "grad_norm": 4.539023399353027, + "learning_rate": 5.301082636700014e-06, + "loss": 0.2354, + "step": 35039 + }, + { + "epoch": 2.075257731958763, + "grad_norm": 3.8847124576568604, + "learning_rate": 5.3009455940797595e-06, + "loss": 0.1974, + "step": 35040 + }, + { + "epoch": 2.075271296798698, + "grad_norm": 4.409023761749268, + "learning_rate": 5.300808551459504e-06, + "loss": 0.1981, + "step": 35041 + }, + { + "epoch": 2.075284861638633, + "grad_norm": 3.629652738571167, + "learning_rate": 5.300671508839249e-06, + "loss": 0.1686, + "step": 35042 + }, + { + "epoch": 2.0752984264785677, + "grad_norm": 4.872387409210205, + "learning_rate": 5.300534466218995e-06, + "loss": 0.1793, + "step": 35043 + }, + { + "epoch": 2.0753119913185025, + "grad_norm": 4.4771294593811035, + "learning_rate": 5.30039742359874e-06, + "loss": 0.1556, + "step": 35044 + }, + { + "epoch": 2.0753255561584374, + "grad_norm": 5.271425724029541, + "learning_rate": 5.300260380978485e-06, + "loss": 0.1936, + "step": 35045 + }, + { + "epoch": 2.0753391209983723, + "grad_norm": 4.353413105010986, + "learning_rate": 5.30012333835823e-06, + "loss": 0.1978, + "step": 35046 + }, + { + "epoch": 2.075352685838307, + "grad_norm": 4.0849761962890625, + "learning_rate": 5.299986295737974e-06, + "loss": 0.1481, + "step": 35047 + }, + { + "epoch": 2.075366250678242, + "grad_norm": 3.9280595779418945, + "learning_rate": 5.29984925311772e-06, + "loss": 0.1788, + "step": 35048 + }, + { + "epoch": 2.075379815518177, + "grad_norm": 5.170573711395264, + "learning_rate": 5.299712210497465e-06, + "loss": 0.1508, + "step": 35049 + }, + { + "epoch": 2.0753933803581117, + "grad_norm": 4.732450008392334, + "learning_rate": 5.29957516787721e-06, + "loss": 0.1322, + "step": 35050 + }, + { + "epoch": 2.0754069451980466, + "grad_norm": 5.765316009521484, + "learning_rate": 5.299438125256955e-06, + "loss": 0.1951, + "step": 35051 + }, + { + "epoch": 2.0754205100379814, + "grad_norm": 3.910445213317871, + "learning_rate": 5.299301082636701e-06, + "loss": 0.1634, + "step": 35052 + }, + { + "epoch": 2.0754340748779163, + "grad_norm": 3.409168243408203, + "learning_rate": 5.299164040016446e-06, + "loss": 0.1523, + "step": 35053 + }, + { + "epoch": 2.075447639717851, + "grad_norm": 4.417617321014404, + "learning_rate": 5.29902699739619e-06, + "loss": 0.1898, + "step": 35054 + }, + { + "epoch": 2.075461204557786, + "grad_norm": 4.543375015258789, + "learning_rate": 5.2988899547759355e-06, + "loss": 0.2925, + "step": 35055 + }, + { + "epoch": 2.075474769397721, + "grad_norm": 4.401989459991455, + "learning_rate": 5.2987529121556815e-06, + "loss": 0.1375, + "step": 35056 + }, + { + "epoch": 2.075488334237656, + "grad_norm": 4.261926174163818, + "learning_rate": 5.298615869535426e-06, + "loss": 0.2313, + "step": 35057 + }, + { + "epoch": 2.075501899077591, + "grad_norm": 6.594056129455566, + "learning_rate": 5.298478826915171e-06, + "loss": 0.199, + "step": 35058 + }, + { + "epoch": 2.075515463917526, + "grad_norm": 4.332762241363525, + "learning_rate": 5.298341784294916e-06, + "loss": 0.1564, + "step": 35059 + }, + { + "epoch": 2.075529028757461, + "grad_norm": 3.98518705368042, + "learning_rate": 5.2982047416746606e-06, + "loss": 0.1703, + "step": 35060 + }, + { + "epoch": 2.0755425935973957, + "grad_norm": 4.430374622344971, + "learning_rate": 5.298067699054407e-06, + "loss": 0.1381, + "step": 35061 + }, + { + "epoch": 2.0755561584373305, + "grad_norm": 3.646399974822998, + "learning_rate": 5.297930656434152e-06, + "loss": 0.1606, + "step": 35062 + }, + { + "epoch": 2.0755697232772654, + "grad_norm": 3.8547308444976807, + "learning_rate": 5.297793613813896e-06, + "loss": 0.1555, + "step": 35063 + }, + { + "epoch": 2.0755832881172003, + "grad_norm": 5.721790790557861, + "learning_rate": 5.297656571193641e-06, + "loss": 0.2429, + "step": 35064 + }, + { + "epoch": 2.075596852957135, + "grad_norm": 5.731585502624512, + "learning_rate": 5.297519528573387e-06, + "loss": 0.2348, + "step": 35065 + }, + { + "epoch": 2.07561041779707, + "grad_norm": 3.9717650413513184, + "learning_rate": 5.297382485953132e-06, + "loss": 0.1347, + "step": 35066 + }, + { + "epoch": 2.075623982637005, + "grad_norm": 5.699509143829346, + "learning_rate": 5.297245443332877e-06, + "loss": 0.2412, + "step": 35067 + }, + { + "epoch": 2.0756375474769397, + "grad_norm": 5.43875789642334, + "learning_rate": 5.297108400712622e-06, + "loss": 0.2741, + "step": 35068 + }, + { + "epoch": 2.0756511123168746, + "grad_norm": 4.228780269622803, + "learning_rate": 5.296971358092368e-06, + "loss": 0.1868, + "step": 35069 + }, + { + "epoch": 2.0756646771568095, + "grad_norm": 4.209415435791016, + "learning_rate": 5.296834315472112e-06, + "loss": 0.1927, + "step": 35070 + }, + { + "epoch": 2.0756782419967443, + "grad_norm": 5.195568561553955, + "learning_rate": 5.2966972728518575e-06, + "loss": 0.1888, + "step": 35071 + }, + { + "epoch": 2.075691806836679, + "grad_norm": 5.489865779876709, + "learning_rate": 5.296560230231602e-06, + "loss": 0.1182, + "step": 35072 + }, + { + "epoch": 2.075705371676614, + "grad_norm": 4.053184509277344, + "learning_rate": 5.296423187611348e-06, + "loss": 0.2162, + "step": 35073 + }, + { + "epoch": 2.075718936516549, + "grad_norm": 7.6452436447143555, + "learning_rate": 5.296286144991093e-06, + "loss": 0.3028, + "step": 35074 + }, + { + "epoch": 2.075732501356484, + "grad_norm": 5.0528950691223145, + "learning_rate": 5.296149102370837e-06, + "loss": 0.1745, + "step": 35075 + }, + { + "epoch": 2.0757460661964187, + "grad_norm": 3.8158555030822754, + "learning_rate": 5.296012059750583e-06, + "loss": 0.1505, + "step": 35076 + }, + { + "epoch": 2.075759631036354, + "grad_norm": 4.834350109100342, + "learning_rate": 5.295875017130328e-06, + "loss": 0.2947, + "step": 35077 + }, + { + "epoch": 2.075773195876289, + "grad_norm": 6.198107719421387, + "learning_rate": 5.295737974510074e-06, + "loss": 0.2109, + "step": 35078 + }, + { + "epoch": 2.0757867607162237, + "grad_norm": 5.268849849700928, + "learning_rate": 5.295600931889818e-06, + "loss": 0.1442, + "step": 35079 + }, + { + "epoch": 2.0758003255561586, + "grad_norm": 4.532546043395996, + "learning_rate": 5.295463889269563e-06, + "loss": 0.1871, + "step": 35080 + }, + { + "epoch": 2.0758138903960934, + "grad_norm": 5.713054656982422, + "learning_rate": 5.295326846649308e-06, + "loss": 0.2651, + "step": 35081 + }, + { + "epoch": 2.0758274552360283, + "grad_norm": 4.816193580627441, + "learning_rate": 5.295189804029054e-06, + "loss": 0.2437, + "step": 35082 + }, + { + "epoch": 2.075841020075963, + "grad_norm": 7.551194190979004, + "learning_rate": 5.295052761408799e-06, + "loss": 0.2829, + "step": 35083 + }, + { + "epoch": 2.075854584915898, + "grad_norm": 5.072015762329102, + "learning_rate": 5.294915718788544e-06, + "loss": 0.2185, + "step": 35084 + }, + { + "epoch": 2.075868149755833, + "grad_norm": 3.487774133682251, + "learning_rate": 5.294778676168288e-06, + "loss": 0.1942, + "step": 35085 + }, + { + "epoch": 2.0758817145957678, + "grad_norm": 5.534746170043945, + "learning_rate": 5.294641633548034e-06, + "loss": 0.2001, + "step": 35086 + }, + { + "epoch": 2.0758952794357026, + "grad_norm": 4.845167636871338, + "learning_rate": 5.2945045909277795e-06, + "loss": 0.1644, + "step": 35087 + }, + { + "epoch": 2.0759088442756375, + "grad_norm": 4.506134033203125, + "learning_rate": 5.294367548307524e-06, + "loss": 0.1657, + "step": 35088 + }, + { + "epoch": 2.0759224091155724, + "grad_norm": 4.122008323669434, + "learning_rate": 5.294230505687269e-06, + "loss": 0.1783, + "step": 35089 + }, + { + "epoch": 2.075935973955507, + "grad_norm": 6.110777378082275, + "learning_rate": 5.294093463067013e-06, + "loss": 0.2427, + "step": 35090 + }, + { + "epoch": 2.075949538795442, + "grad_norm": 8.032179832458496, + "learning_rate": 5.293956420446759e-06, + "loss": 0.3116, + "step": 35091 + }, + { + "epoch": 2.075963103635377, + "grad_norm": 4.018044471740723, + "learning_rate": 5.293819377826505e-06, + "loss": 0.1131, + "step": 35092 + }, + { + "epoch": 2.075976668475312, + "grad_norm": 6.5023698806762695, + "learning_rate": 5.29368233520625e-06, + "loss": 0.2043, + "step": 35093 + }, + { + "epoch": 2.0759902333152467, + "grad_norm": 4.58357572555542, + "learning_rate": 5.293545292585994e-06, + "loss": 0.236, + "step": 35094 + }, + { + "epoch": 2.076003798155182, + "grad_norm": 3.3943538665771484, + "learning_rate": 5.29340824996574e-06, + "loss": 0.124, + "step": 35095 + }, + { + "epoch": 2.076017362995117, + "grad_norm": 7.432614803314209, + "learning_rate": 5.293271207345485e-06, + "loss": 0.287, + "step": 35096 + }, + { + "epoch": 2.0760309278350517, + "grad_norm": 4.7771124839782715, + "learning_rate": 5.29313416472523e-06, + "loss": 0.2043, + "step": 35097 + }, + { + "epoch": 2.0760444926749866, + "grad_norm": 3.9132766723632812, + "learning_rate": 5.292997122104975e-06, + "loss": 0.1535, + "step": 35098 + }, + { + "epoch": 2.0760580575149215, + "grad_norm": 5.426444053649902, + "learning_rate": 5.292860079484721e-06, + "loss": 0.2287, + "step": 35099 + }, + { + "epoch": 2.0760716223548563, + "grad_norm": 5.87410306930542, + "learning_rate": 5.292723036864465e-06, + "loss": 0.2797, + "step": 35100 + }, + { + "epoch": 2.076085187194791, + "grad_norm": 4.533654689788818, + "learning_rate": 5.29258599424421e-06, + "loss": 0.1185, + "step": 35101 + }, + { + "epoch": 2.076098752034726, + "grad_norm": 3.744058847427368, + "learning_rate": 5.2924489516239555e-06, + "loss": 0.1439, + "step": 35102 + }, + { + "epoch": 2.076112316874661, + "grad_norm": 4.373595714569092, + "learning_rate": 5.2923119090037e-06, + "loss": 0.1195, + "step": 35103 + }, + { + "epoch": 2.0761258817145958, + "grad_norm": 8.943634033203125, + "learning_rate": 5.292174866383446e-06, + "loss": 0.1562, + "step": 35104 + }, + { + "epoch": 2.0761394465545306, + "grad_norm": 4.923007488250732, + "learning_rate": 5.292037823763191e-06, + "loss": 0.1553, + "step": 35105 + }, + { + "epoch": 2.0761530113944655, + "grad_norm": 4.6917924880981445, + "learning_rate": 5.291900781142935e-06, + "loss": 0.173, + "step": 35106 + }, + { + "epoch": 2.0761665762344004, + "grad_norm": 4.236485481262207, + "learning_rate": 5.291763738522681e-06, + "loss": 0.172, + "step": 35107 + }, + { + "epoch": 2.0761801410743352, + "grad_norm": 3.67706298828125, + "learning_rate": 5.291626695902427e-06, + "loss": 0.0847, + "step": 35108 + }, + { + "epoch": 2.07619370591427, + "grad_norm": 4.862210750579834, + "learning_rate": 5.291489653282171e-06, + "loss": 0.1618, + "step": 35109 + }, + { + "epoch": 2.076207270754205, + "grad_norm": 6.463556289672852, + "learning_rate": 5.291352610661916e-06, + "loss": 0.2331, + "step": 35110 + }, + { + "epoch": 2.07622083559414, + "grad_norm": 4.761146068572998, + "learning_rate": 5.291215568041661e-06, + "loss": 0.19, + "step": 35111 + }, + { + "epoch": 2.0762344004340747, + "grad_norm": 4.824360370635986, + "learning_rate": 5.291078525421407e-06, + "loss": 0.1443, + "step": 35112 + }, + { + "epoch": 2.0762479652740096, + "grad_norm": 4.728107452392578, + "learning_rate": 5.290941482801152e-06, + "loss": 0.2103, + "step": 35113 + }, + { + "epoch": 2.0762615301139444, + "grad_norm": 6.564864635467529, + "learning_rate": 5.290804440180897e-06, + "loss": 0.2574, + "step": 35114 + }, + { + "epoch": 2.0762750949538797, + "grad_norm": 5.1702494621276855, + "learning_rate": 5.290667397560641e-06, + "loss": 0.2443, + "step": 35115 + }, + { + "epoch": 2.0762886597938146, + "grad_norm": 4.958520889282227, + "learning_rate": 5.290530354940386e-06, + "loss": 0.2427, + "step": 35116 + }, + { + "epoch": 2.0763022246337495, + "grad_norm": 5.84328031539917, + "learning_rate": 5.290393312320132e-06, + "loss": 0.2195, + "step": 35117 + }, + { + "epoch": 2.0763157894736843, + "grad_norm": 4.693010330200195, + "learning_rate": 5.2902562696998776e-06, + "loss": 0.1635, + "step": 35118 + }, + { + "epoch": 2.076329354313619, + "grad_norm": 4.651347637176514, + "learning_rate": 5.290119227079622e-06, + "loss": 0.2617, + "step": 35119 + }, + { + "epoch": 2.076342919153554, + "grad_norm": 4.773622035980225, + "learning_rate": 5.289982184459367e-06, + "loss": 0.0966, + "step": 35120 + }, + { + "epoch": 2.076356483993489, + "grad_norm": 6.271479606628418, + "learning_rate": 5.289845141839113e-06, + "loss": 0.2979, + "step": 35121 + }, + { + "epoch": 2.076370048833424, + "grad_norm": 3.706207036972046, + "learning_rate": 5.289708099218857e-06, + "loss": 0.1896, + "step": 35122 + }, + { + "epoch": 2.0763836136733587, + "grad_norm": 4.198701858520508, + "learning_rate": 5.289571056598603e-06, + "loss": 0.1276, + "step": 35123 + }, + { + "epoch": 2.0763971785132935, + "grad_norm": 4.0205206871032715, + "learning_rate": 5.289434013978347e-06, + "loss": 0.1437, + "step": 35124 + }, + { + "epoch": 2.0764107433532284, + "grad_norm": 5.03259801864624, + "learning_rate": 5.289296971358093e-06, + "loss": 0.2147, + "step": 35125 + }, + { + "epoch": 2.0764243081931633, + "grad_norm": 5.381930351257324, + "learning_rate": 5.289159928737838e-06, + "loss": 0.2087, + "step": 35126 + }, + { + "epoch": 2.076437873033098, + "grad_norm": 6.780990123748779, + "learning_rate": 5.289022886117583e-06, + "loss": 0.1998, + "step": 35127 + }, + { + "epoch": 2.076451437873033, + "grad_norm": 4.22955846786499, + "learning_rate": 5.288885843497328e-06, + "loss": 0.1983, + "step": 35128 + }, + { + "epoch": 2.076465002712968, + "grad_norm": 8.15951919555664, + "learning_rate": 5.288748800877073e-06, + "loss": 0.2102, + "step": 35129 + }, + { + "epoch": 2.0764785675529027, + "grad_norm": 4.474189281463623, + "learning_rate": 5.288611758256819e-06, + "loss": 0.222, + "step": 35130 + }, + { + "epoch": 2.0764921323928376, + "grad_norm": 5.550292015075684, + "learning_rate": 5.288474715636563e-06, + "loss": 0.2584, + "step": 35131 + }, + { + "epoch": 2.0765056972327725, + "grad_norm": 5.268360614776611, + "learning_rate": 5.288337673016308e-06, + "loss": 0.2297, + "step": 35132 + }, + { + "epoch": 2.0765192620727078, + "grad_norm": 5.95271635055542, + "learning_rate": 5.2882006303960535e-06, + "loss": 0.1819, + "step": 35133 + }, + { + "epoch": 2.0765328269126426, + "grad_norm": 4.756574630737305, + "learning_rate": 5.288063587775799e-06, + "loss": 0.1474, + "step": 35134 + }, + { + "epoch": 2.0765463917525775, + "grad_norm": 5.023174285888672, + "learning_rate": 5.287926545155544e-06, + "loss": 0.1576, + "step": 35135 + }, + { + "epoch": 2.0765599565925124, + "grad_norm": 4.33878231048584, + "learning_rate": 5.287789502535289e-06, + "loss": 0.1941, + "step": 35136 + }, + { + "epoch": 2.0765735214324472, + "grad_norm": 5.058975696563721, + "learning_rate": 5.287652459915033e-06, + "loss": 0.1475, + "step": 35137 + }, + { + "epoch": 2.076587086272382, + "grad_norm": 4.1715850830078125, + "learning_rate": 5.2875154172947794e-06, + "loss": 0.1263, + "step": 35138 + }, + { + "epoch": 2.076600651112317, + "grad_norm": 4.211369037628174, + "learning_rate": 5.287378374674525e-06, + "loss": 0.2559, + "step": 35139 + }, + { + "epoch": 2.076614215952252, + "grad_norm": 7.169167995452881, + "learning_rate": 5.287241332054269e-06, + "loss": 0.282, + "step": 35140 + }, + { + "epoch": 2.0766277807921867, + "grad_norm": 6.085692405700684, + "learning_rate": 5.287104289434014e-06, + "loss": 0.1598, + "step": 35141 + }, + { + "epoch": 2.0766413456321215, + "grad_norm": 4.786525249481201, + "learning_rate": 5.28696724681376e-06, + "loss": 0.1802, + "step": 35142 + }, + { + "epoch": 2.0766549104720564, + "grad_norm": 5.102776050567627, + "learning_rate": 5.2868302041935045e-06, + "loss": 0.175, + "step": 35143 + }, + { + "epoch": 2.0766684753119913, + "grad_norm": 5.451902389526367, + "learning_rate": 5.28669316157325e-06, + "loss": 0.1504, + "step": 35144 + }, + { + "epoch": 2.076682040151926, + "grad_norm": 5.495208263397217, + "learning_rate": 5.286556118952995e-06, + "loss": 0.2293, + "step": 35145 + }, + { + "epoch": 2.076695604991861, + "grad_norm": 6.198517322540283, + "learning_rate": 5.286419076332739e-06, + "loss": 0.2302, + "step": 35146 + }, + { + "epoch": 2.076709169831796, + "grad_norm": 3.777219295501709, + "learning_rate": 5.286282033712485e-06, + "loss": 0.1732, + "step": 35147 + }, + { + "epoch": 2.0767227346717307, + "grad_norm": 3.444357395172119, + "learning_rate": 5.28614499109223e-06, + "loss": 0.116, + "step": 35148 + }, + { + "epoch": 2.0767362995116656, + "grad_norm": 4.021606922149658, + "learning_rate": 5.286007948471975e-06, + "loss": 0.1596, + "step": 35149 + }, + { + "epoch": 2.0767498643516005, + "grad_norm": 7.1934685707092285, + "learning_rate": 5.28587090585172e-06, + "loss": 0.3226, + "step": 35150 + }, + { + "epoch": 2.0767634291915353, + "grad_norm": 6.7377400398254395, + "learning_rate": 5.285733863231466e-06, + "loss": 0.2371, + "step": 35151 + }, + { + "epoch": 2.07677699403147, + "grad_norm": 5.100495338439941, + "learning_rate": 5.285596820611211e-06, + "loss": 0.2508, + "step": 35152 + }, + { + "epoch": 2.0767905588714055, + "grad_norm": 4.6714324951171875, + "learning_rate": 5.2854597779909554e-06, + "loss": 0.1552, + "step": 35153 + }, + { + "epoch": 2.0768041237113404, + "grad_norm": 5.276425361633301, + "learning_rate": 5.285322735370701e-06, + "loss": 0.2474, + "step": 35154 + }, + { + "epoch": 2.0768176885512752, + "grad_norm": 4.306646347045898, + "learning_rate": 5.285185692750447e-06, + "loss": 0.1254, + "step": 35155 + }, + { + "epoch": 2.07683125339121, + "grad_norm": 4.8818039894104, + "learning_rate": 5.285048650130191e-06, + "loss": 0.1881, + "step": 35156 + }, + { + "epoch": 2.076844818231145, + "grad_norm": 4.517587184906006, + "learning_rate": 5.284911607509936e-06, + "loss": 0.1942, + "step": 35157 + }, + { + "epoch": 2.07685838307108, + "grad_norm": 3.9114365577697754, + "learning_rate": 5.2847745648896805e-06, + "loss": 0.1979, + "step": 35158 + }, + { + "epoch": 2.0768719479110147, + "grad_norm": 4.598385810852051, + "learning_rate": 5.284637522269426e-06, + "loss": 0.1635, + "step": 35159 + }, + { + "epoch": 2.0768855127509496, + "grad_norm": 5.7647624015808105, + "learning_rate": 5.284500479649172e-06, + "loss": 0.168, + "step": 35160 + }, + { + "epoch": 2.0768990775908844, + "grad_norm": 5.288087368011475, + "learning_rate": 5.284363437028917e-06, + "loss": 0.2354, + "step": 35161 + }, + { + "epoch": 2.0769126424308193, + "grad_norm": 5.796029567718506, + "learning_rate": 5.284226394408661e-06, + "loss": 0.3381, + "step": 35162 + }, + { + "epoch": 2.076926207270754, + "grad_norm": 6.7286458015441895, + "learning_rate": 5.284089351788406e-06, + "loss": 0.1686, + "step": 35163 + }, + { + "epoch": 2.076939772110689, + "grad_norm": 5.673914432525635, + "learning_rate": 5.283952309168152e-06, + "loss": 0.2092, + "step": 35164 + }, + { + "epoch": 2.076953336950624, + "grad_norm": 4.184517860412598, + "learning_rate": 5.283815266547897e-06, + "loss": 0.159, + "step": 35165 + }, + { + "epoch": 2.0769669017905588, + "grad_norm": 3.5021018981933594, + "learning_rate": 5.283678223927642e-06, + "loss": 0.2036, + "step": 35166 + }, + { + "epoch": 2.0769804666304936, + "grad_norm": 3.067650318145752, + "learning_rate": 5.283541181307387e-06, + "loss": 0.132, + "step": 35167 + }, + { + "epoch": 2.0769940314704285, + "grad_norm": 3.7405874729156494, + "learning_rate": 5.283404138687132e-06, + "loss": 0.1925, + "step": 35168 + }, + { + "epoch": 2.0770075963103634, + "grad_norm": 5.8935770988464355, + "learning_rate": 5.2832670960668774e-06, + "loss": 0.1769, + "step": 35169 + }, + { + "epoch": 2.0770211611502982, + "grad_norm": 4.239914417266846, + "learning_rate": 5.283130053446623e-06, + "loss": 0.1369, + "step": 35170 + }, + { + "epoch": 2.0770347259902335, + "grad_norm": 5.3827595710754395, + "learning_rate": 5.282993010826367e-06, + "loss": 0.1946, + "step": 35171 + }, + { + "epoch": 2.0770482908301684, + "grad_norm": 5.307985305786133, + "learning_rate": 5.282855968206112e-06, + "loss": 0.2002, + "step": 35172 + }, + { + "epoch": 2.0770618556701033, + "grad_norm": 4.84160041809082, + "learning_rate": 5.282718925585858e-06, + "loss": 0.146, + "step": 35173 + }, + { + "epoch": 2.077075420510038, + "grad_norm": 7.477349758148193, + "learning_rate": 5.2825818829656025e-06, + "loss": 0.2588, + "step": 35174 + }, + { + "epoch": 2.077088985349973, + "grad_norm": 2.9126338958740234, + "learning_rate": 5.282444840345348e-06, + "loss": 0.0669, + "step": 35175 + }, + { + "epoch": 2.077102550189908, + "grad_norm": 5.7296552658081055, + "learning_rate": 5.282307797725093e-06, + "loss": 0.1555, + "step": 35176 + }, + { + "epoch": 2.0771161150298427, + "grad_norm": 5.867940425872803, + "learning_rate": 5.282170755104839e-06, + "loss": 0.1653, + "step": 35177 + }, + { + "epoch": 2.0771296798697776, + "grad_norm": 4.624780654907227, + "learning_rate": 5.282033712484583e-06, + "loss": 0.134, + "step": 35178 + }, + { + "epoch": 2.0771432447097125, + "grad_norm": 4.433721542358398, + "learning_rate": 5.281896669864328e-06, + "loss": 0.1831, + "step": 35179 + }, + { + "epoch": 2.0771568095496473, + "grad_norm": 5.37082052230835, + "learning_rate": 5.281759627244073e-06, + "loss": 0.206, + "step": 35180 + }, + { + "epoch": 2.077170374389582, + "grad_norm": 4.014220714569092, + "learning_rate": 5.281622584623819e-06, + "loss": 0.1229, + "step": 35181 + }, + { + "epoch": 2.077183939229517, + "grad_norm": 7.02965784072876, + "learning_rate": 5.281485542003564e-06, + "loss": 0.2261, + "step": 35182 + }, + { + "epoch": 2.077197504069452, + "grad_norm": 3.4972832202911377, + "learning_rate": 5.281348499383308e-06, + "loss": 0.1116, + "step": 35183 + }, + { + "epoch": 2.077211068909387, + "grad_norm": 3.671414852142334, + "learning_rate": 5.2812114567630534e-06, + "loss": 0.1472, + "step": 35184 + }, + { + "epoch": 2.0772246337493216, + "grad_norm": 4.874673366546631, + "learning_rate": 5.281074414142799e-06, + "loss": 0.1845, + "step": 35185 + }, + { + "epoch": 2.0772381985892565, + "grad_norm": 4.500556945800781, + "learning_rate": 5.280937371522545e-06, + "loss": 0.1027, + "step": 35186 + }, + { + "epoch": 2.0772517634291914, + "grad_norm": 3.7367069721221924, + "learning_rate": 5.280800328902289e-06, + "loss": 0.0799, + "step": 35187 + }, + { + "epoch": 2.0772653282691262, + "grad_norm": 4.850644111633301, + "learning_rate": 5.280663286282034e-06, + "loss": 0.2027, + "step": 35188 + }, + { + "epoch": 2.077278893109061, + "grad_norm": 4.516453266143799, + "learning_rate": 5.2805262436617785e-06, + "loss": 0.1861, + "step": 35189 + }, + { + "epoch": 2.077292457948996, + "grad_norm": 5.381181240081787, + "learning_rate": 5.2803892010415245e-06, + "loss": 0.1591, + "step": 35190 + }, + { + "epoch": 2.0773060227889313, + "grad_norm": 3.7077884674072266, + "learning_rate": 5.28025215842127e-06, + "loss": 0.0962, + "step": 35191 + }, + { + "epoch": 2.077319587628866, + "grad_norm": 5.256110191345215, + "learning_rate": 5.280115115801014e-06, + "loss": 0.1809, + "step": 35192 + }, + { + "epoch": 2.077333152468801, + "grad_norm": 5.541477203369141, + "learning_rate": 5.279978073180759e-06, + "loss": 0.1404, + "step": 35193 + }, + { + "epoch": 2.077346717308736, + "grad_norm": 3.837444305419922, + "learning_rate": 5.279841030560505e-06, + "loss": 0.1135, + "step": 35194 + }, + { + "epoch": 2.0773602821486707, + "grad_norm": 3.5853800773620605, + "learning_rate": 5.27970398794025e-06, + "loss": 0.1145, + "step": 35195 + }, + { + "epoch": 2.0773738469886056, + "grad_norm": 3.8228960037231445, + "learning_rate": 5.279566945319995e-06, + "loss": 0.0929, + "step": 35196 + }, + { + "epoch": 2.0773874118285405, + "grad_norm": 4.781336784362793, + "learning_rate": 5.27942990269974e-06, + "loss": 0.192, + "step": 35197 + }, + { + "epoch": 2.0774009766684753, + "grad_norm": 3.89373779296875, + "learning_rate": 5.279292860079484e-06, + "loss": 0.128, + "step": 35198 + }, + { + "epoch": 2.07741454150841, + "grad_norm": 2.9413902759552, + "learning_rate": 5.27915581745923e-06, + "loss": 0.0785, + "step": 35199 + }, + { + "epoch": 2.077428106348345, + "grad_norm": 7.582605361938477, + "learning_rate": 5.2790187748389754e-06, + "loss": 0.3066, + "step": 35200 + }, + { + "epoch": 2.07744167118828, + "grad_norm": 4.953664302825928, + "learning_rate": 5.278881732218721e-06, + "loss": 0.1466, + "step": 35201 + }, + { + "epoch": 2.077455236028215, + "grad_norm": 5.358097076416016, + "learning_rate": 5.278744689598465e-06, + "loss": 0.2093, + "step": 35202 + }, + { + "epoch": 2.0774688008681497, + "grad_norm": 4.049144744873047, + "learning_rate": 5.278607646978211e-06, + "loss": 0.1324, + "step": 35203 + }, + { + "epoch": 2.0774823657080845, + "grad_norm": 4.416859149932861, + "learning_rate": 5.278470604357956e-06, + "loss": 0.1308, + "step": 35204 + }, + { + "epoch": 2.0774959305480194, + "grad_norm": 4.176764011383057, + "learning_rate": 5.2783335617377005e-06, + "loss": 0.1467, + "step": 35205 + }, + { + "epoch": 2.0775094953879543, + "grad_norm": 5.168213844299316, + "learning_rate": 5.278196519117446e-06, + "loss": 0.1517, + "step": 35206 + }, + { + "epoch": 2.077523060227889, + "grad_norm": 3.691324234008789, + "learning_rate": 5.278059476497192e-06, + "loss": 0.0846, + "step": 35207 + }, + { + "epoch": 2.077536625067824, + "grad_norm": 5.066051959991455, + "learning_rate": 5.277922433876936e-06, + "loss": 0.1522, + "step": 35208 + }, + { + "epoch": 2.0775501899077593, + "grad_norm": 7.575168609619141, + "learning_rate": 5.277785391256681e-06, + "loss": 0.3204, + "step": 35209 + }, + { + "epoch": 2.077563754747694, + "grad_norm": 4.6430511474609375, + "learning_rate": 5.277648348636426e-06, + "loss": 0.1334, + "step": 35210 + }, + { + "epoch": 2.077577319587629, + "grad_norm": 6.7991228103637695, + "learning_rate": 5.277511306016172e-06, + "loss": 0.1925, + "step": 35211 + }, + { + "epoch": 2.077590884427564, + "grad_norm": 4.191086769104004, + "learning_rate": 5.277374263395917e-06, + "loss": 0.0999, + "step": 35212 + }, + { + "epoch": 2.0776044492674988, + "grad_norm": 4.3263702392578125, + "learning_rate": 5.277237220775662e-06, + "loss": 0.1322, + "step": 35213 + }, + { + "epoch": 2.0776180141074336, + "grad_norm": 4.120864391326904, + "learning_rate": 5.277100178155406e-06, + "loss": 0.1179, + "step": 35214 + }, + { + "epoch": 2.0776315789473685, + "grad_norm": 3.8470633029937744, + "learning_rate": 5.2769631355351514e-06, + "loss": 0.1704, + "step": 35215 + }, + { + "epoch": 2.0776451437873034, + "grad_norm": 5.3886542320251465, + "learning_rate": 5.2768260929148975e-06, + "loss": 0.1178, + "step": 35216 + }, + { + "epoch": 2.0776587086272382, + "grad_norm": 5.269715785980225, + "learning_rate": 5.276689050294642e-06, + "loss": 0.2453, + "step": 35217 + }, + { + "epoch": 2.077672273467173, + "grad_norm": 3.827652931213379, + "learning_rate": 5.276552007674387e-06, + "loss": 0.1239, + "step": 35218 + }, + { + "epoch": 2.077685838307108, + "grad_norm": 5.277419567108154, + "learning_rate": 5.276414965054132e-06, + "loss": 0.1508, + "step": 35219 + }, + { + "epoch": 2.077699403147043, + "grad_norm": 5.240535736083984, + "learning_rate": 5.276277922433878e-06, + "loss": 0.133, + "step": 35220 + }, + { + "epoch": 2.0777129679869777, + "grad_norm": 4.523005485534668, + "learning_rate": 5.2761408798136225e-06, + "loss": 0.1616, + "step": 35221 + }, + { + "epoch": 2.0777265328269126, + "grad_norm": 4.228806972503662, + "learning_rate": 5.276003837193368e-06, + "loss": 0.158, + "step": 35222 + }, + { + "epoch": 2.0777400976668474, + "grad_norm": 6.295123100280762, + "learning_rate": 5.275866794573112e-06, + "loss": 0.3239, + "step": 35223 + }, + { + "epoch": 2.0777536625067823, + "grad_norm": 6.62786340713501, + "learning_rate": 5.275729751952858e-06, + "loss": 0.1389, + "step": 35224 + }, + { + "epoch": 2.077767227346717, + "grad_norm": 3.462994337081909, + "learning_rate": 5.275592709332603e-06, + "loss": 0.135, + "step": 35225 + }, + { + "epoch": 2.077780792186652, + "grad_norm": 6.266153812408447, + "learning_rate": 5.275455666712348e-06, + "loss": 0.1921, + "step": 35226 + }, + { + "epoch": 2.077794357026587, + "grad_norm": 4.667854309082031, + "learning_rate": 5.275318624092093e-06, + "loss": 0.1356, + "step": 35227 + }, + { + "epoch": 2.077807921866522, + "grad_norm": 6.068342685699463, + "learning_rate": 5.275181581471838e-06, + "loss": 0.2662, + "step": 35228 + }, + { + "epoch": 2.077821486706457, + "grad_norm": 4.07343864440918, + "learning_rate": 5.275044538851584e-06, + "loss": 0.137, + "step": 35229 + }, + { + "epoch": 2.077835051546392, + "grad_norm": 5.924271583557129, + "learning_rate": 5.274907496231328e-06, + "loss": 0.1949, + "step": 35230 + }, + { + "epoch": 2.077848616386327, + "grad_norm": 3.4266693592071533, + "learning_rate": 5.2747704536110734e-06, + "loss": 0.1194, + "step": 35231 + }, + { + "epoch": 2.0778621812262617, + "grad_norm": 5.618600845336914, + "learning_rate": 5.274633410990818e-06, + "loss": 0.2024, + "step": 35232 + }, + { + "epoch": 2.0778757460661965, + "grad_norm": 3.9852378368377686, + "learning_rate": 5.274496368370564e-06, + "loss": 0.1519, + "step": 35233 + }, + { + "epoch": 2.0778893109061314, + "grad_norm": 5.346179008483887, + "learning_rate": 5.274359325750309e-06, + "loss": 0.2014, + "step": 35234 + }, + { + "epoch": 2.0779028757460662, + "grad_norm": 4.247946262359619, + "learning_rate": 5.274222283130054e-06, + "loss": 0.1375, + "step": 35235 + }, + { + "epoch": 2.077916440586001, + "grad_norm": 3.5022048950195312, + "learning_rate": 5.2740852405097985e-06, + "loss": 0.1337, + "step": 35236 + }, + { + "epoch": 2.077930005425936, + "grad_norm": 5.039401531219482, + "learning_rate": 5.2739481978895445e-06, + "loss": 0.1554, + "step": 35237 + }, + { + "epoch": 2.077943570265871, + "grad_norm": 4.627959728240967, + "learning_rate": 5.27381115526929e-06, + "loss": 0.2383, + "step": 35238 + }, + { + "epoch": 2.0779571351058057, + "grad_norm": 5.361251354217529, + "learning_rate": 5.273674112649034e-06, + "loss": 0.1722, + "step": 35239 + }, + { + "epoch": 2.0779706999457406, + "grad_norm": 5.128270626068115, + "learning_rate": 5.273537070028779e-06, + "loss": 0.2077, + "step": 35240 + }, + { + "epoch": 2.0779842647856754, + "grad_norm": 6.283564567565918, + "learning_rate": 5.273400027408524e-06, + "loss": 0.1967, + "step": 35241 + }, + { + "epoch": 2.0779978296256103, + "grad_norm": 5.517698287963867, + "learning_rate": 5.2732629847882696e-06, + "loss": 0.2699, + "step": 35242 + }, + { + "epoch": 2.078011394465545, + "grad_norm": 5.982024192810059, + "learning_rate": 5.273125942168015e-06, + "loss": 0.1852, + "step": 35243 + }, + { + "epoch": 2.07802495930548, + "grad_norm": 5.41074800491333, + "learning_rate": 5.27298889954776e-06, + "loss": 0.2159, + "step": 35244 + }, + { + "epoch": 2.078038524145415, + "grad_norm": 5.497872829437256, + "learning_rate": 5.272851856927504e-06, + "loss": 0.2715, + "step": 35245 + }, + { + "epoch": 2.0780520889853498, + "grad_norm": 4.2490234375, + "learning_rate": 5.27271481430725e-06, + "loss": 0.2349, + "step": 35246 + }, + { + "epoch": 2.078065653825285, + "grad_norm": 4.8103437423706055, + "learning_rate": 5.2725777716869955e-06, + "loss": 0.1504, + "step": 35247 + }, + { + "epoch": 2.07807921866522, + "grad_norm": 3.364522695541382, + "learning_rate": 5.27244072906674e-06, + "loss": 0.1308, + "step": 35248 + }, + { + "epoch": 2.078092783505155, + "grad_norm": 4.769736289978027, + "learning_rate": 5.272303686446485e-06, + "loss": 0.1636, + "step": 35249 + }, + { + "epoch": 2.0781063483450897, + "grad_norm": 4.927892208099365, + "learning_rate": 5.272166643826231e-06, + "loss": 0.1333, + "step": 35250 + }, + { + "epoch": 2.0781199131850245, + "grad_norm": 4.74432373046875, + "learning_rate": 5.272029601205975e-06, + "loss": 0.1851, + "step": 35251 + }, + { + "epoch": 2.0781334780249594, + "grad_norm": 4.297420024871826, + "learning_rate": 5.2718925585857205e-06, + "loss": 0.1765, + "step": 35252 + }, + { + "epoch": 2.0781470428648943, + "grad_norm": 6.045319080352783, + "learning_rate": 5.271755515965466e-06, + "loss": 0.1808, + "step": 35253 + }, + { + "epoch": 2.078160607704829, + "grad_norm": 4.842016696929932, + "learning_rate": 5.27161847334521e-06, + "loss": 0.1567, + "step": 35254 + }, + { + "epoch": 2.078174172544764, + "grad_norm": 4.441745281219482, + "learning_rate": 5.271481430724956e-06, + "loss": 0.1426, + "step": 35255 + }, + { + "epoch": 2.078187737384699, + "grad_norm": 5.524448394775391, + "learning_rate": 5.271344388104701e-06, + "loss": 0.1646, + "step": 35256 + }, + { + "epoch": 2.0782013022246337, + "grad_norm": 4.094393253326416, + "learning_rate": 5.2712073454844456e-06, + "loss": 0.2303, + "step": 35257 + }, + { + "epoch": 2.0782148670645686, + "grad_norm": 4.362449645996094, + "learning_rate": 5.271070302864191e-06, + "loss": 0.1476, + "step": 35258 + }, + { + "epoch": 2.0782284319045035, + "grad_norm": 4.86173152923584, + "learning_rate": 5.270933260243937e-06, + "loss": 0.372, + "step": 35259 + }, + { + "epoch": 2.0782419967444383, + "grad_norm": 6.299611568450928, + "learning_rate": 5.270796217623682e-06, + "loss": 0.2021, + "step": 35260 + }, + { + "epoch": 2.078255561584373, + "grad_norm": 8.13729476928711, + "learning_rate": 5.270659175003426e-06, + "loss": 0.1553, + "step": 35261 + }, + { + "epoch": 2.078269126424308, + "grad_norm": 5.272372722625732, + "learning_rate": 5.2705221323831715e-06, + "loss": 0.2659, + "step": 35262 + }, + { + "epoch": 2.078282691264243, + "grad_norm": 4.885499477386475, + "learning_rate": 5.2703850897629175e-06, + "loss": 0.2738, + "step": 35263 + }, + { + "epoch": 2.078296256104178, + "grad_norm": 6.488142013549805, + "learning_rate": 5.270248047142662e-06, + "loss": 0.2777, + "step": 35264 + }, + { + "epoch": 2.0783098209441127, + "grad_norm": 4.722879409790039, + "learning_rate": 5.270111004522407e-06, + "loss": 0.2162, + "step": 35265 + }, + { + "epoch": 2.078323385784048, + "grad_norm": 7.037403106689453, + "learning_rate": 5.269973961902151e-06, + "loss": 0.2401, + "step": 35266 + }, + { + "epoch": 2.078336950623983, + "grad_norm": 6.250558853149414, + "learning_rate": 5.2698369192818965e-06, + "loss": 0.3936, + "step": 35267 + }, + { + "epoch": 2.0783505154639177, + "grad_norm": 8.35130786895752, + "learning_rate": 5.2696998766616425e-06, + "loss": 0.342, + "step": 35268 + }, + { + "epoch": 2.0783640803038526, + "grad_norm": 6.779307842254639, + "learning_rate": 5.269562834041388e-06, + "loss": 0.3204, + "step": 35269 + }, + { + "epoch": 2.0783776451437874, + "grad_norm": 4.121673583984375, + "learning_rate": 5.269425791421132e-06, + "loss": 0.2152, + "step": 35270 + }, + { + "epoch": 2.0783912099837223, + "grad_norm": 5.806743621826172, + "learning_rate": 5.269288748800877e-06, + "loss": 0.2938, + "step": 35271 + }, + { + "epoch": 2.078404774823657, + "grad_norm": 5.672455787658691, + "learning_rate": 5.269151706180623e-06, + "loss": 0.2206, + "step": 35272 + }, + { + "epoch": 2.078418339663592, + "grad_norm": 4.959291458129883, + "learning_rate": 5.2690146635603676e-06, + "loss": 0.1998, + "step": 35273 + }, + { + "epoch": 2.078431904503527, + "grad_norm": 3.6110918521881104, + "learning_rate": 5.268877620940113e-06, + "loss": 0.2153, + "step": 35274 + }, + { + "epoch": 2.0784454693434617, + "grad_norm": 6.317676067352295, + "learning_rate": 5.268740578319858e-06, + "loss": 0.2435, + "step": 35275 + }, + { + "epoch": 2.0784590341833966, + "grad_norm": 6.7095441818237305, + "learning_rate": 5.268603535699603e-06, + "loss": 0.3387, + "step": 35276 + }, + { + "epoch": 2.0784725990233315, + "grad_norm": 5.93740177154541, + "learning_rate": 5.268466493079348e-06, + "loss": 0.206, + "step": 35277 + }, + { + "epoch": 2.0784861638632663, + "grad_norm": 5.904688835144043, + "learning_rate": 5.2683294504590935e-06, + "loss": 0.2497, + "step": 35278 + }, + { + "epoch": 2.078499728703201, + "grad_norm": 5.7155961990356445, + "learning_rate": 5.268192407838838e-06, + "loss": 0.2739, + "step": 35279 + }, + { + "epoch": 2.078513293543136, + "grad_norm": 5.156871318817139, + "learning_rate": 5.268055365218584e-06, + "loss": 0.2562, + "step": 35280 + }, + { + "epoch": 2.078526858383071, + "grad_norm": 4.994926929473877, + "learning_rate": 5.267918322598329e-06, + "loss": 0.2169, + "step": 35281 + }, + { + "epoch": 2.078540423223006, + "grad_norm": 5.9125566482543945, + "learning_rate": 5.267781279978073e-06, + "loss": 0.2425, + "step": 35282 + }, + { + "epoch": 2.0785539880629407, + "grad_norm": 5.421830654144287, + "learning_rate": 5.2676442373578185e-06, + "loss": 0.1952, + "step": 35283 + }, + { + "epoch": 2.0785675529028755, + "grad_norm": 5.673482418060303, + "learning_rate": 5.267507194737564e-06, + "loss": 0.223, + "step": 35284 + }, + { + "epoch": 2.078581117742811, + "grad_norm": 5.184985637664795, + "learning_rate": 5.267370152117309e-06, + "loss": 0.3126, + "step": 35285 + }, + { + "epoch": 2.0785946825827457, + "grad_norm": 6.630436897277832, + "learning_rate": 5.267233109497054e-06, + "loss": 0.3372, + "step": 35286 + }, + { + "epoch": 2.0786082474226806, + "grad_norm": 5.352460861206055, + "learning_rate": 5.267096066876799e-06, + "loss": 0.2326, + "step": 35287 + }, + { + "epoch": 2.0786218122626154, + "grad_norm": 3.796384811401367, + "learning_rate": 5.2669590242565436e-06, + "loss": 0.1424, + "step": 35288 + }, + { + "epoch": 2.0786353771025503, + "grad_norm": 5.74296760559082, + "learning_rate": 5.26682198163629e-06, + "loss": 0.2181, + "step": 35289 + }, + { + "epoch": 2.078648941942485, + "grad_norm": 5.981072425842285, + "learning_rate": 5.266684939016035e-06, + "loss": 0.138, + "step": 35290 + }, + { + "epoch": 2.07866250678242, + "grad_norm": 5.665091037750244, + "learning_rate": 5.266547896395779e-06, + "loss": 0.2148, + "step": 35291 + }, + { + "epoch": 2.078676071622355, + "grad_norm": 5.076591968536377, + "learning_rate": 5.266410853775524e-06, + "loss": 0.2268, + "step": 35292 + }, + { + "epoch": 2.0786896364622898, + "grad_norm": 4.20680570602417, + "learning_rate": 5.26627381115527e-06, + "loss": 0.0996, + "step": 35293 + }, + { + "epoch": 2.0787032013022246, + "grad_norm": 7.630396366119385, + "learning_rate": 5.2661367685350155e-06, + "loss": 0.2656, + "step": 35294 + }, + { + "epoch": 2.0787167661421595, + "grad_norm": 4.613272190093994, + "learning_rate": 5.26599972591476e-06, + "loss": 0.1395, + "step": 35295 + }, + { + "epoch": 2.0787303309820944, + "grad_norm": 5.137233734130859, + "learning_rate": 5.265862683294505e-06, + "loss": 0.2522, + "step": 35296 + }, + { + "epoch": 2.0787438958220292, + "grad_norm": 3.5982534885406494, + "learning_rate": 5.265725640674249e-06, + "loss": 0.1068, + "step": 35297 + }, + { + "epoch": 2.078757460661964, + "grad_norm": 3.7288172245025635, + "learning_rate": 5.265588598053995e-06, + "loss": 0.1438, + "step": 35298 + }, + { + "epoch": 2.078771025501899, + "grad_norm": 3.6030006408691406, + "learning_rate": 5.2654515554337405e-06, + "loss": 0.0903, + "step": 35299 + }, + { + "epoch": 2.078784590341834, + "grad_norm": 3.8916778564453125, + "learning_rate": 5.265314512813485e-06, + "loss": 0.1657, + "step": 35300 + }, + { + "epoch": 2.0787981551817687, + "grad_norm": 5.836126327514648, + "learning_rate": 5.26517747019323e-06, + "loss": 0.2361, + "step": 35301 + }, + { + "epoch": 2.0788117200217036, + "grad_norm": 4.919701099395752, + "learning_rate": 5.265040427572976e-06, + "loss": 0.1611, + "step": 35302 + }, + { + "epoch": 2.0788252848616384, + "grad_norm": 6.811140060424805, + "learning_rate": 5.264903384952721e-06, + "loss": 0.2847, + "step": 35303 + }, + { + "epoch": 2.0788388497015737, + "grad_norm": 4.343784332275391, + "learning_rate": 5.264766342332466e-06, + "loss": 0.2559, + "step": 35304 + }, + { + "epoch": 2.0788524145415086, + "grad_norm": 5.953258991241455, + "learning_rate": 5.264629299712211e-06, + "loss": 0.1387, + "step": 35305 + }, + { + "epoch": 2.0788659793814435, + "grad_norm": 5.267901420593262, + "learning_rate": 5.264492257091957e-06, + "loss": 0.2378, + "step": 35306 + }, + { + "epoch": 2.0788795442213783, + "grad_norm": 3.997457504272461, + "learning_rate": 5.264355214471701e-06, + "loss": 0.1016, + "step": 35307 + }, + { + "epoch": 2.078893109061313, + "grad_norm": 5.498113632202148, + "learning_rate": 5.264218171851446e-06, + "loss": 0.1478, + "step": 35308 + }, + { + "epoch": 2.078906673901248, + "grad_norm": 3.9357032775878906, + "learning_rate": 5.2640811292311915e-06, + "loss": 0.1324, + "step": 35309 + }, + { + "epoch": 2.078920238741183, + "grad_norm": 3.297715425491333, + "learning_rate": 5.263944086610936e-06, + "loss": 0.1441, + "step": 35310 + }, + { + "epoch": 2.078933803581118, + "grad_norm": 3.88189435005188, + "learning_rate": 5.263807043990682e-06, + "loss": 0.1036, + "step": 35311 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 5.025567531585693, + "learning_rate": 5.263670001370427e-06, + "loss": 0.1493, + "step": 35312 + }, + { + "epoch": 2.0789609332609875, + "grad_norm": 3.8721330165863037, + "learning_rate": 5.263532958750171e-06, + "loss": 0.1234, + "step": 35313 + }, + { + "epoch": 2.0789744981009224, + "grad_norm": 2.9525160789489746, + "learning_rate": 5.2633959161299165e-06, + "loss": 0.1081, + "step": 35314 + }, + { + "epoch": 2.0789880629408573, + "grad_norm": 4.154388904571533, + "learning_rate": 5.2632588735096625e-06, + "loss": 0.0951, + "step": 35315 + }, + { + "epoch": 2.079001627780792, + "grad_norm": 3.430318832397461, + "learning_rate": 5.263121830889407e-06, + "loss": 0.1459, + "step": 35316 + }, + { + "epoch": 2.079015192620727, + "grad_norm": 4.0105743408203125, + "learning_rate": 5.262984788269152e-06, + "loss": 0.1494, + "step": 35317 + }, + { + "epoch": 2.079028757460662, + "grad_norm": 3.5875730514526367, + "learning_rate": 5.262847745648897e-06, + "loss": 0.1434, + "step": 35318 + }, + { + "epoch": 2.0790423223005967, + "grad_norm": 5.958414077758789, + "learning_rate": 5.262710703028643e-06, + "loss": 0.2332, + "step": 35319 + }, + { + "epoch": 2.0790558871405316, + "grad_norm": 5.39689302444458, + "learning_rate": 5.262573660408388e-06, + "loss": 0.2036, + "step": 35320 + }, + { + "epoch": 2.0790694519804664, + "grad_norm": 3.636322021484375, + "learning_rate": 5.262436617788133e-06, + "loss": 0.1179, + "step": 35321 + }, + { + "epoch": 2.0790830168204013, + "grad_norm": 3.1801514625549316, + "learning_rate": 5.262299575167877e-06, + "loss": 0.0894, + "step": 35322 + }, + { + "epoch": 2.0790965816603366, + "grad_norm": 3.3692374229431152, + "learning_rate": 5.262162532547622e-06, + "loss": 0.1176, + "step": 35323 + }, + { + "epoch": 2.0791101465002715, + "grad_norm": 4.8680219650268555, + "learning_rate": 5.262025489927368e-06, + "loss": 0.1382, + "step": 35324 + }, + { + "epoch": 2.0791237113402063, + "grad_norm": 5.0551557540893555, + "learning_rate": 5.261888447307113e-06, + "loss": 0.2009, + "step": 35325 + }, + { + "epoch": 2.079137276180141, + "grad_norm": 3.7506086826324463, + "learning_rate": 5.261751404686858e-06, + "loss": 0.1612, + "step": 35326 + }, + { + "epoch": 2.079150841020076, + "grad_norm": 3.4500248432159424, + "learning_rate": 5.261614362066603e-06, + "loss": 0.2611, + "step": 35327 + }, + { + "epoch": 2.079164405860011, + "grad_norm": 2.948124647140503, + "learning_rate": 5.261477319446349e-06, + "loss": 0.103, + "step": 35328 + }, + { + "epoch": 2.079177970699946, + "grad_norm": 4.271480560302734, + "learning_rate": 5.261340276826093e-06, + "loss": 0.1456, + "step": 35329 + }, + { + "epoch": 2.0791915355398807, + "grad_norm": 3.9214353561401367, + "learning_rate": 5.2612032342058385e-06, + "loss": 0.1449, + "step": 35330 + }, + { + "epoch": 2.0792051003798155, + "grad_norm": 4.065163612365723, + "learning_rate": 5.261066191585583e-06, + "loss": 0.1053, + "step": 35331 + }, + { + "epoch": 2.0792186652197504, + "grad_norm": 3.671492099761963, + "learning_rate": 5.260929148965329e-06, + "loss": 0.1377, + "step": 35332 + }, + { + "epoch": 2.0792322300596853, + "grad_norm": 3.7636196613311768, + "learning_rate": 5.260792106345074e-06, + "loss": 0.1281, + "step": 35333 + }, + { + "epoch": 2.07924579489962, + "grad_norm": 4.377630710601807, + "learning_rate": 5.260655063724818e-06, + "loss": 0.1148, + "step": 35334 + }, + { + "epoch": 2.079259359739555, + "grad_norm": 3.3939340114593506, + "learning_rate": 5.260518021104564e-06, + "loss": 0.1863, + "step": 35335 + }, + { + "epoch": 2.07927292457949, + "grad_norm": 3.1815128326416016, + "learning_rate": 5.260380978484309e-06, + "loss": 0.1302, + "step": 35336 + }, + { + "epoch": 2.0792864894194247, + "grad_norm": 4.499813079833984, + "learning_rate": 5.260243935864055e-06, + "loss": 0.2743, + "step": 35337 + }, + { + "epoch": 2.0793000542593596, + "grad_norm": 3.478930950164795, + "learning_rate": 5.260106893243799e-06, + "loss": 0.1418, + "step": 35338 + }, + { + "epoch": 2.0793136190992945, + "grad_norm": 5.047633647918701, + "learning_rate": 5.259969850623544e-06, + "loss": 0.1615, + "step": 35339 + }, + { + "epoch": 2.0793271839392293, + "grad_norm": 3.67333984375, + "learning_rate": 5.259832808003289e-06, + "loss": 0.0959, + "step": 35340 + }, + { + "epoch": 2.079340748779164, + "grad_norm": 4.7674174308776855, + "learning_rate": 5.259695765383035e-06, + "loss": 0.2504, + "step": 35341 + }, + { + "epoch": 2.0793543136190995, + "grad_norm": 4.109875202178955, + "learning_rate": 5.25955872276278e-06, + "loss": 0.187, + "step": 35342 + }, + { + "epoch": 2.0793678784590344, + "grad_norm": 3.078195571899414, + "learning_rate": 5.259421680142525e-06, + "loss": 0.1166, + "step": 35343 + }, + { + "epoch": 2.0793814432989692, + "grad_norm": 4.196523666381836, + "learning_rate": 5.259284637522269e-06, + "loss": 0.1312, + "step": 35344 + }, + { + "epoch": 2.079395008138904, + "grad_norm": 4.581688404083252, + "learning_rate": 5.259147594902015e-06, + "loss": 0.1189, + "step": 35345 + }, + { + "epoch": 2.079408572978839, + "grad_norm": 3.7779877185821533, + "learning_rate": 5.2590105522817606e-06, + "loss": 0.1231, + "step": 35346 + }, + { + "epoch": 2.079422137818774, + "grad_norm": 3.6293745040893555, + "learning_rate": 5.258873509661505e-06, + "loss": 0.1359, + "step": 35347 + }, + { + "epoch": 2.0794357026587087, + "grad_norm": 3.6191229820251465, + "learning_rate": 5.25873646704125e-06, + "loss": 0.127, + "step": 35348 + }, + { + "epoch": 2.0794492674986436, + "grad_norm": 4.024201393127441, + "learning_rate": 5.258599424420996e-06, + "loss": 0.17, + "step": 35349 + }, + { + "epoch": 2.0794628323385784, + "grad_norm": 4.573602199554443, + "learning_rate": 5.25846238180074e-06, + "loss": 0.1997, + "step": 35350 + }, + { + "epoch": 2.0794763971785133, + "grad_norm": 3.2919583320617676, + "learning_rate": 5.258325339180486e-06, + "loss": 0.1509, + "step": 35351 + }, + { + "epoch": 2.079489962018448, + "grad_norm": 3.468761920928955, + "learning_rate": 5.258188296560231e-06, + "loss": 0.1409, + "step": 35352 + }, + { + "epoch": 2.079503526858383, + "grad_norm": 4.221287727355957, + "learning_rate": 5.258051253939975e-06, + "loss": 0.1517, + "step": 35353 + }, + { + "epoch": 2.079517091698318, + "grad_norm": 2.493269443511963, + "learning_rate": 5.257914211319721e-06, + "loss": 0.1016, + "step": 35354 + }, + { + "epoch": 2.0795306565382528, + "grad_norm": 3.933868646621704, + "learning_rate": 5.257777168699466e-06, + "loss": 0.1593, + "step": 35355 + }, + { + "epoch": 2.0795442213781876, + "grad_norm": 3.7394909858703613, + "learning_rate": 5.257640126079211e-06, + "loss": 0.1087, + "step": 35356 + }, + { + "epoch": 2.0795577862181225, + "grad_norm": 5.016623020172119, + "learning_rate": 5.257503083458956e-06, + "loss": 0.1728, + "step": 35357 + }, + { + "epoch": 2.0795713510580573, + "grad_norm": 4.8870086669921875, + "learning_rate": 5.257366040838702e-06, + "loss": 0.1632, + "step": 35358 + }, + { + "epoch": 2.079584915897992, + "grad_norm": 8.522310256958008, + "learning_rate": 5.257228998218446e-06, + "loss": 0.3415, + "step": 35359 + }, + { + "epoch": 2.079598480737927, + "grad_norm": 3.4501214027404785, + "learning_rate": 5.257091955598191e-06, + "loss": 0.1273, + "step": 35360 + }, + { + "epoch": 2.0796120455778624, + "grad_norm": 4.244034290313721, + "learning_rate": 5.2569549129779365e-06, + "loss": 0.2248, + "step": 35361 + }, + { + "epoch": 2.0796256104177973, + "grad_norm": 5.048824310302734, + "learning_rate": 5.2568178703576826e-06, + "loss": 0.2738, + "step": 35362 + }, + { + "epoch": 2.079639175257732, + "grad_norm": 4.63247013092041, + "learning_rate": 5.256680827737427e-06, + "loss": 0.1677, + "step": 35363 + }, + { + "epoch": 2.079652740097667, + "grad_norm": 4.730788230895996, + "learning_rate": 5.256543785117172e-06, + "loss": 0.3324, + "step": 35364 + }, + { + "epoch": 2.079666304937602, + "grad_norm": 4.881850719451904, + "learning_rate": 5.256406742496916e-06, + "loss": 0.2474, + "step": 35365 + }, + { + "epoch": 2.0796798697775367, + "grad_norm": 4.950562000274658, + "learning_rate": 5.256269699876662e-06, + "loss": 0.2216, + "step": 35366 + }, + { + "epoch": 2.0796934346174716, + "grad_norm": 5.8226518630981445, + "learning_rate": 5.256132657256408e-06, + "loss": 0.2088, + "step": 35367 + }, + { + "epoch": 2.0797069994574064, + "grad_norm": 4.249869346618652, + "learning_rate": 5.255995614636153e-06, + "loss": 0.0813, + "step": 35368 + }, + { + "epoch": 2.0797205642973413, + "grad_norm": 5.484267711639404, + "learning_rate": 5.255858572015897e-06, + "loss": 0.1585, + "step": 35369 + }, + { + "epoch": 2.079734129137276, + "grad_norm": 4.7791218757629395, + "learning_rate": 5.255721529395642e-06, + "loss": 0.1178, + "step": 35370 + }, + { + "epoch": 2.079747693977211, + "grad_norm": 5.825482368469238, + "learning_rate": 5.255584486775388e-06, + "loss": 0.2003, + "step": 35371 + }, + { + "epoch": 2.079761258817146, + "grad_norm": 4.919283866882324, + "learning_rate": 5.255447444155133e-06, + "loss": 0.1475, + "step": 35372 + }, + { + "epoch": 2.0797748236570808, + "grad_norm": 4.661211967468262, + "learning_rate": 5.255310401534878e-06, + "loss": 0.1419, + "step": 35373 + }, + { + "epoch": 2.0797883884970156, + "grad_norm": 3.275007724761963, + "learning_rate": 5.255173358914622e-06, + "loss": 0.1427, + "step": 35374 + }, + { + "epoch": 2.0798019533369505, + "grad_norm": 5.0241379737854, + "learning_rate": 5.255036316294368e-06, + "loss": 0.1452, + "step": 35375 + }, + { + "epoch": 2.0798155181768854, + "grad_norm": 5.0696611404418945, + "learning_rate": 5.254899273674113e-06, + "loss": 0.1118, + "step": 35376 + }, + { + "epoch": 2.0798290830168202, + "grad_norm": 3.480168342590332, + "learning_rate": 5.2547622310538586e-06, + "loss": 0.1394, + "step": 35377 + }, + { + "epoch": 2.079842647856755, + "grad_norm": 4.636454105377197, + "learning_rate": 5.254625188433603e-06, + "loss": 0.1476, + "step": 35378 + }, + { + "epoch": 2.07985621269669, + "grad_norm": 4.461330890655518, + "learning_rate": 5.254488145813348e-06, + "loss": 0.224, + "step": 35379 + }, + { + "epoch": 2.0798697775366253, + "grad_norm": 4.334012031555176, + "learning_rate": 5.254351103193094e-06, + "loss": 0.1269, + "step": 35380 + }, + { + "epoch": 2.07988334237656, + "grad_norm": 3.0751473903656006, + "learning_rate": 5.254214060572838e-06, + "loss": 0.0649, + "step": 35381 + }, + { + "epoch": 2.079896907216495, + "grad_norm": 3.753565549850464, + "learning_rate": 5.254077017952584e-06, + "loss": 0.1066, + "step": 35382 + }, + { + "epoch": 2.07991047205643, + "grad_norm": 5.745123386383057, + "learning_rate": 5.253939975332328e-06, + "loss": 0.2911, + "step": 35383 + }, + { + "epoch": 2.0799240368963647, + "grad_norm": 4.749727725982666, + "learning_rate": 5.253802932712074e-06, + "loss": 0.154, + "step": 35384 + }, + { + "epoch": 2.0799376017362996, + "grad_norm": 6.436411380767822, + "learning_rate": 5.253665890091819e-06, + "loss": 0.224, + "step": 35385 + }, + { + "epoch": 2.0799511665762345, + "grad_norm": 6.560678482055664, + "learning_rate": 5.253528847471564e-06, + "loss": 0.2865, + "step": 35386 + }, + { + "epoch": 2.0799647314161693, + "grad_norm": 5.291139602661133, + "learning_rate": 5.253391804851309e-06, + "loss": 0.2152, + "step": 35387 + }, + { + "epoch": 2.079978296256104, + "grad_norm": 5.332456588745117, + "learning_rate": 5.253254762231055e-06, + "loss": 0.1484, + "step": 35388 + }, + { + "epoch": 2.079991861096039, + "grad_norm": 5.013857364654541, + "learning_rate": 5.2531177196108e-06, + "loss": 0.1904, + "step": 35389 + }, + { + "epoch": 2.080005425935974, + "grad_norm": 4.620364189147949, + "learning_rate": 5.252980676990544e-06, + "loss": 0.1035, + "step": 35390 + }, + { + "epoch": 2.080018990775909, + "grad_norm": 6.581412315368652, + "learning_rate": 5.252843634370289e-06, + "loss": 0.4816, + "step": 35391 + }, + { + "epoch": 2.0800325556158437, + "grad_norm": 4.9114766120910645, + "learning_rate": 5.2527065917500345e-06, + "loss": 0.1899, + "step": 35392 + }, + { + "epoch": 2.0800461204557785, + "grad_norm": 6.01371955871582, + "learning_rate": 5.25256954912978e-06, + "loss": 0.2885, + "step": 35393 + }, + { + "epoch": 2.0800596852957134, + "grad_norm": 3.9066333770751953, + "learning_rate": 5.252432506509525e-06, + "loss": 0.1527, + "step": 35394 + }, + { + "epoch": 2.0800732501356483, + "grad_norm": 4.867191791534424, + "learning_rate": 5.25229546388927e-06, + "loss": 0.1882, + "step": 35395 + }, + { + "epoch": 2.080086814975583, + "grad_norm": 7.631230354309082, + "learning_rate": 5.252158421269014e-06, + "loss": 0.3087, + "step": 35396 + }, + { + "epoch": 2.080100379815518, + "grad_norm": 3.8930299282073975, + "learning_rate": 5.2520213786487604e-06, + "loss": 0.1053, + "step": 35397 + }, + { + "epoch": 2.080113944655453, + "grad_norm": 4.231515884399414, + "learning_rate": 5.251884336028506e-06, + "loss": 0.1378, + "step": 35398 + }, + { + "epoch": 2.080127509495388, + "grad_norm": 4.258678436279297, + "learning_rate": 5.25174729340825e-06, + "loss": 0.1587, + "step": 35399 + }, + { + "epoch": 2.080141074335323, + "grad_norm": 5.199521541595459, + "learning_rate": 5.251610250787995e-06, + "loss": 0.1639, + "step": 35400 + }, + { + "epoch": 2.080154639175258, + "grad_norm": 3.7839508056640625, + "learning_rate": 5.251473208167741e-06, + "loss": 0.1977, + "step": 35401 + }, + { + "epoch": 2.0801682040151928, + "grad_norm": 3.582987070083618, + "learning_rate": 5.251336165547486e-06, + "loss": 0.1358, + "step": 35402 + }, + { + "epoch": 2.0801817688551276, + "grad_norm": 3.1554887294769287, + "learning_rate": 5.251199122927231e-06, + "loss": 0.1351, + "step": 35403 + }, + { + "epoch": 2.0801953336950625, + "grad_norm": 4.671252727508545, + "learning_rate": 5.251062080306976e-06, + "loss": 0.2735, + "step": 35404 + }, + { + "epoch": 2.0802088985349974, + "grad_norm": 3.242233991622925, + "learning_rate": 5.25092503768672e-06, + "loss": 0.1271, + "step": 35405 + }, + { + "epoch": 2.080222463374932, + "grad_norm": 6.367852687835693, + "learning_rate": 5.250787995066466e-06, + "loss": 0.2532, + "step": 35406 + }, + { + "epoch": 2.080236028214867, + "grad_norm": 4.957093238830566, + "learning_rate": 5.250650952446211e-06, + "loss": 0.1651, + "step": 35407 + }, + { + "epoch": 2.080249593054802, + "grad_norm": 3.6532490253448486, + "learning_rate": 5.250513909825956e-06, + "loss": 0.0688, + "step": 35408 + }, + { + "epoch": 2.080263157894737, + "grad_norm": 5.487337589263916, + "learning_rate": 5.250376867205701e-06, + "loss": 0.2434, + "step": 35409 + }, + { + "epoch": 2.0802767227346717, + "grad_norm": 5.315445899963379, + "learning_rate": 5.250239824585447e-06, + "loss": 0.1942, + "step": 35410 + }, + { + "epoch": 2.0802902875746065, + "grad_norm": 4.790143013000488, + "learning_rate": 5.250102781965192e-06, + "loss": 0.3211, + "step": 35411 + }, + { + "epoch": 2.0803038524145414, + "grad_norm": 4.550577163696289, + "learning_rate": 5.2499657393449364e-06, + "loss": 0.1789, + "step": 35412 + }, + { + "epoch": 2.0803174172544763, + "grad_norm": 3.3840839862823486, + "learning_rate": 5.249828696724682e-06, + "loss": 0.1225, + "step": 35413 + }, + { + "epoch": 2.080330982094411, + "grad_norm": 6.474860668182373, + "learning_rate": 5.249691654104428e-06, + "loss": 0.2198, + "step": 35414 + }, + { + "epoch": 2.080344546934346, + "grad_norm": 4.806397438049316, + "learning_rate": 5.249554611484172e-06, + "loss": 0.2043, + "step": 35415 + }, + { + "epoch": 2.080358111774281, + "grad_norm": 3.366230010986328, + "learning_rate": 5.249417568863917e-06, + "loss": 0.1891, + "step": 35416 + }, + { + "epoch": 2.0803716766142157, + "grad_norm": 3.7983524799346924, + "learning_rate": 5.249280526243662e-06, + "loss": 0.1343, + "step": 35417 + }, + { + "epoch": 2.080385241454151, + "grad_norm": 7.176701545715332, + "learning_rate": 5.249143483623407e-06, + "loss": 0.3702, + "step": 35418 + }, + { + "epoch": 2.080398806294086, + "grad_norm": 4.092810153961182, + "learning_rate": 5.249006441003153e-06, + "loss": 0.2303, + "step": 35419 + }, + { + "epoch": 2.0804123711340208, + "grad_norm": 5.142918586730957, + "learning_rate": 5.248869398382898e-06, + "loss": 0.1764, + "step": 35420 + }, + { + "epoch": 2.0804259359739556, + "grad_norm": 5.281996726989746, + "learning_rate": 5.248732355762642e-06, + "loss": 0.2092, + "step": 35421 + }, + { + "epoch": 2.0804395008138905, + "grad_norm": 6.12044620513916, + "learning_rate": 5.248595313142387e-06, + "loss": 0.1768, + "step": 35422 + }, + { + "epoch": 2.0804530656538254, + "grad_norm": 4.801672458648682, + "learning_rate": 5.248458270522133e-06, + "loss": 0.2783, + "step": 35423 + }, + { + "epoch": 2.0804666304937602, + "grad_norm": 2.941183090209961, + "learning_rate": 5.248321227901878e-06, + "loss": 0.1124, + "step": 35424 + }, + { + "epoch": 2.080480195333695, + "grad_norm": 5.5192341804504395, + "learning_rate": 5.248184185281623e-06, + "loss": 0.4559, + "step": 35425 + }, + { + "epoch": 2.08049376017363, + "grad_norm": 5.064696311950684, + "learning_rate": 5.248047142661368e-06, + "loss": 0.3282, + "step": 35426 + }, + { + "epoch": 2.080507325013565, + "grad_norm": 5.427356243133545, + "learning_rate": 5.247910100041113e-06, + "loss": 0.245, + "step": 35427 + }, + { + "epoch": 2.0805208898534997, + "grad_norm": 8.512049674987793, + "learning_rate": 5.2477730574208584e-06, + "loss": 0.4257, + "step": 35428 + }, + { + "epoch": 2.0805344546934346, + "grad_norm": 5.070436477661133, + "learning_rate": 5.247636014800604e-06, + "loss": 0.2302, + "step": 35429 + }, + { + "epoch": 2.0805480195333694, + "grad_norm": 5.9714860916137695, + "learning_rate": 5.247498972180348e-06, + "loss": 0.2688, + "step": 35430 + }, + { + "epoch": 2.0805615843733043, + "grad_norm": 5.152323246002197, + "learning_rate": 5.247361929560094e-06, + "loss": 0.1409, + "step": 35431 + }, + { + "epoch": 2.080575149213239, + "grad_norm": 6.100936412811279, + "learning_rate": 5.247224886939839e-06, + "loss": 0.3292, + "step": 35432 + }, + { + "epoch": 2.080588714053174, + "grad_norm": 4.294587135314941, + "learning_rate": 5.2470878443195835e-06, + "loss": 0.2026, + "step": 35433 + }, + { + "epoch": 2.080602278893109, + "grad_norm": 4.37125301361084, + "learning_rate": 5.246950801699329e-06, + "loss": 0.1428, + "step": 35434 + }, + { + "epoch": 2.0806158437330438, + "grad_norm": 4.122230052947998, + "learning_rate": 5.246813759079074e-06, + "loss": 0.1234, + "step": 35435 + }, + { + "epoch": 2.080629408572979, + "grad_norm": 5.071965217590332, + "learning_rate": 5.24667671645882e-06, + "loss": 0.2651, + "step": 35436 + }, + { + "epoch": 2.080642973412914, + "grad_norm": 4.933481216430664, + "learning_rate": 5.246539673838564e-06, + "loss": 0.185, + "step": 35437 + }, + { + "epoch": 2.080656538252849, + "grad_norm": 5.874873161315918, + "learning_rate": 5.246402631218309e-06, + "loss": 0.2299, + "step": 35438 + }, + { + "epoch": 2.0806701030927837, + "grad_norm": 5.123067378997803, + "learning_rate": 5.246265588598054e-06, + "loss": 0.2034, + "step": 35439 + }, + { + "epoch": 2.0806836679327185, + "grad_norm": 6.365821838378906, + "learning_rate": 5.2461285459778e-06, + "loss": 0.1524, + "step": 35440 + }, + { + "epoch": 2.0806972327726534, + "grad_norm": 4.702725410461426, + "learning_rate": 5.245991503357545e-06, + "loss": 0.2731, + "step": 35441 + }, + { + "epoch": 2.0807107976125883, + "grad_norm": 5.689424514770508, + "learning_rate": 5.245854460737289e-06, + "loss": 0.2486, + "step": 35442 + }, + { + "epoch": 2.080724362452523, + "grad_norm": 5.641607761383057, + "learning_rate": 5.2457174181170344e-06, + "loss": 0.3233, + "step": 35443 + }, + { + "epoch": 2.080737927292458, + "grad_norm": 4.271235466003418, + "learning_rate": 5.2455803754967805e-06, + "loss": 0.1615, + "step": 35444 + }, + { + "epoch": 2.080751492132393, + "grad_norm": 3.93465256690979, + "learning_rate": 5.245443332876526e-06, + "loss": 0.1255, + "step": 35445 + }, + { + "epoch": 2.0807650569723277, + "grad_norm": 5.898784637451172, + "learning_rate": 5.24530629025627e-06, + "loss": 0.3154, + "step": 35446 + }, + { + "epoch": 2.0807786218122626, + "grad_norm": 4.676525115966797, + "learning_rate": 5.245169247636015e-06, + "loss": 0.1954, + "step": 35447 + }, + { + "epoch": 2.0807921866521975, + "grad_norm": 5.146273136138916, + "learning_rate": 5.2450322050157595e-06, + "loss": 0.1869, + "step": 35448 + }, + { + "epoch": 2.0808057514921323, + "grad_norm": 3.5226261615753174, + "learning_rate": 5.2448951623955055e-06, + "loss": 0.153, + "step": 35449 + }, + { + "epoch": 2.080819316332067, + "grad_norm": 6.123178005218506, + "learning_rate": 5.244758119775251e-06, + "loss": 0.2763, + "step": 35450 + }, + { + "epoch": 2.080832881172002, + "grad_norm": 4.293102264404297, + "learning_rate": 5.244621077154996e-06, + "loss": 0.2221, + "step": 35451 + }, + { + "epoch": 2.080846446011937, + "grad_norm": 4.453196048736572, + "learning_rate": 5.24448403453474e-06, + "loss": 0.1787, + "step": 35452 + }, + { + "epoch": 2.0808600108518718, + "grad_norm": 3.2859456539154053, + "learning_rate": 5.244346991914486e-06, + "loss": 0.1021, + "step": 35453 + }, + { + "epoch": 2.0808735756918066, + "grad_norm": 3.7964541912078857, + "learning_rate": 5.244209949294231e-06, + "loss": 0.1329, + "step": 35454 + }, + { + "epoch": 2.0808871405317415, + "grad_norm": 4.526364803314209, + "learning_rate": 5.244072906673976e-06, + "loss": 0.1495, + "step": 35455 + }, + { + "epoch": 2.080900705371677, + "grad_norm": 4.881514072418213, + "learning_rate": 5.243935864053721e-06, + "loss": 0.1862, + "step": 35456 + }, + { + "epoch": 2.0809142702116117, + "grad_norm": 3.2075746059417725, + "learning_rate": 5.243798821433467e-06, + "loss": 0.1154, + "step": 35457 + }, + { + "epoch": 2.0809278350515465, + "grad_norm": 4.056614875793457, + "learning_rate": 5.243661778813211e-06, + "loss": 0.1603, + "step": 35458 + }, + { + "epoch": 2.0809413998914814, + "grad_norm": 5.569624423980713, + "learning_rate": 5.2435247361929564e-06, + "loss": 0.2561, + "step": 35459 + }, + { + "epoch": 2.0809549647314163, + "grad_norm": 3.743770122528076, + "learning_rate": 5.243387693572702e-06, + "loss": 0.1511, + "step": 35460 + }, + { + "epoch": 2.080968529571351, + "grad_norm": 4.675075531005859, + "learning_rate": 5.243250650952446e-06, + "loss": 0.2957, + "step": 35461 + }, + { + "epoch": 2.080982094411286, + "grad_norm": 5.110232353210449, + "learning_rate": 5.243113608332192e-06, + "loss": 0.2117, + "step": 35462 + }, + { + "epoch": 2.080995659251221, + "grad_norm": 3.3642873764038086, + "learning_rate": 5.242976565711937e-06, + "loss": 0.2064, + "step": 35463 + }, + { + "epoch": 2.0810092240911557, + "grad_norm": 3.9227938652038574, + "learning_rate": 5.2428395230916815e-06, + "loss": 0.1851, + "step": 35464 + }, + { + "epoch": 2.0810227889310906, + "grad_norm": 3.7647671699523926, + "learning_rate": 5.242702480471427e-06, + "loss": 0.1357, + "step": 35465 + }, + { + "epoch": 2.0810363537710255, + "grad_norm": 4.578083515167236, + "learning_rate": 5.242565437851173e-06, + "loss": 0.0959, + "step": 35466 + }, + { + "epoch": 2.0810499186109603, + "grad_norm": 3.758106231689453, + "learning_rate": 5.242428395230917e-06, + "loss": 0.1241, + "step": 35467 + }, + { + "epoch": 2.081063483450895, + "grad_norm": 3.2671356201171875, + "learning_rate": 5.242291352610662e-06, + "loss": 0.0924, + "step": 35468 + }, + { + "epoch": 2.08107704829083, + "grad_norm": 5.286655902862549, + "learning_rate": 5.242154309990407e-06, + "loss": 0.1627, + "step": 35469 + }, + { + "epoch": 2.081090613130765, + "grad_norm": 4.178537368774414, + "learning_rate": 5.242017267370153e-06, + "loss": 0.242, + "step": 35470 + }, + { + "epoch": 2.0811041779707, + "grad_norm": 6.344712257385254, + "learning_rate": 5.241880224749898e-06, + "loss": 0.2041, + "step": 35471 + }, + { + "epoch": 2.0811177428106347, + "grad_norm": 3.563415765762329, + "learning_rate": 5.241743182129643e-06, + "loss": 0.0833, + "step": 35472 + }, + { + "epoch": 2.0811313076505695, + "grad_norm": 4.311857223510742, + "learning_rate": 5.241606139509387e-06, + "loss": 0.1341, + "step": 35473 + }, + { + "epoch": 2.081144872490505, + "grad_norm": 4.1041789054870605, + "learning_rate": 5.2414690968891324e-06, + "loss": 0.2284, + "step": 35474 + }, + { + "epoch": 2.0811584373304397, + "grad_norm": 3.4080119132995605, + "learning_rate": 5.2413320542688785e-06, + "loss": 0.1303, + "step": 35475 + }, + { + "epoch": 2.0811720021703746, + "grad_norm": 3.4662187099456787, + "learning_rate": 5.241195011648623e-06, + "loss": 0.097, + "step": 35476 + }, + { + "epoch": 2.0811855670103094, + "grad_norm": 3.3777425289154053, + "learning_rate": 5.241057969028368e-06, + "loss": 0.1577, + "step": 35477 + }, + { + "epoch": 2.0811991318502443, + "grad_norm": 5.087578773498535, + "learning_rate": 5.240920926408113e-06, + "loss": 0.1473, + "step": 35478 + }, + { + "epoch": 2.081212696690179, + "grad_norm": 3.5240848064422607, + "learning_rate": 5.240783883787859e-06, + "loss": 0.1364, + "step": 35479 + }, + { + "epoch": 2.081226261530114, + "grad_norm": 3.744234323501587, + "learning_rate": 5.2406468411676035e-06, + "loss": 0.1815, + "step": 35480 + }, + { + "epoch": 2.081239826370049, + "grad_norm": 3.9225926399230957, + "learning_rate": 5.240509798547349e-06, + "loss": 0.1342, + "step": 35481 + }, + { + "epoch": 2.0812533912099838, + "grad_norm": 5.804020881652832, + "learning_rate": 5.240372755927093e-06, + "loss": 0.1383, + "step": 35482 + }, + { + "epoch": 2.0812669560499186, + "grad_norm": 3.2828996181488037, + "learning_rate": 5.240235713306839e-06, + "loss": 0.1561, + "step": 35483 + }, + { + "epoch": 2.0812805208898535, + "grad_norm": 4.224976539611816, + "learning_rate": 5.240098670686584e-06, + "loss": 0.1468, + "step": 35484 + }, + { + "epoch": 2.0812940857297884, + "grad_norm": 5.228097438812256, + "learning_rate": 5.239961628066329e-06, + "loss": 0.2606, + "step": 35485 + }, + { + "epoch": 2.0813076505697232, + "grad_norm": 3.35142183303833, + "learning_rate": 5.239824585446074e-06, + "loss": 0.1029, + "step": 35486 + }, + { + "epoch": 2.081321215409658, + "grad_norm": 4.785384178161621, + "learning_rate": 5.239687542825819e-06, + "loss": 0.1676, + "step": 35487 + }, + { + "epoch": 2.081334780249593, + "grad_norm": 3.6022024154663086, + "learning_rate": 5.239550500205565e-06, + "loss": 0.123, + "step": 35488 + }, + { + "epoch": 2.081348345089528, + "grad_norm": 4.87741756439209, + "learning_rate": 5.239413457585309e-06, + "loss": 0.1853, + "step": 35489 + }, + { + "epoch": 2.0813619099294627, + "grad_norm": 2.789808750152588, + "learning_rate": 5.2392764149650545e-06, + "loss": 0.0949, + "step": 35490 + }, + { + "epoch": 2.0813754747693975, + "grad_norm": 3.9651846885681152, + "learning_rate": 5.239139372344799e-06, + "loss": 0.103, + "step": 35491 + }, + { + "epoch": 2.0813890396093324, + "grad_norm": 4.565555572509766, + "learning_rate": 5.239002329724545e-06, + "loss": 0.2364, + "step": 35492 + }, + { + "epoch": 2.0814026044492673, + "grad_norm": 4.019651889801025, + "learning_rate": 5.23886528710429e-06, + "loss": 0.0993, + "step": 35493 + }, + { + "epoch": 2.0814161692892026, + "grad_norm": 3.4163875579833984, + "learning_rate": 5.238728244484035e-06, + "loss": 0.1134, + "step": 35494 + }, + { + "epoch": 2.0814297341291375, + "grad_norm": 4.7816691398620605, + "learning_rate": 5.2385912018637795e-06, + "loss": 0.1565, + "step": 35495 + }, + { + "epoch": 2.0814432989690723, + "grad_norm": 5.857966423034668, + "learning_rate": 5.2384541592435255e-06, + "loss": 0.2165, + "step": 35496 + }, + { + "epoch": 2.081456863809007, + "grad_norm": 6.576647758483887, + "learning_rate": 5.238317116623271e-06, + "loss": 0.3054, + "step": 35497 + }, + { + "epoch": 2.081470428648942, + "grad_norm": 4.9427809715271, + "learning_rate": 5.238180074003015e-06, + "loss": 0.149, + "step": 35498 + }, + { + "epoch": 2.081483993488877, + "grad_norm": 4.531035900115967, + "learning_rate": 5.23804303138276e-06, + "loss": 0.1303, + "step": 35499 + }, + { + "epoch": 2.081497558328812, + "grad_norm": 3.3720836639404297, + "learning_rate": 5.237905988762506e-06, + "loss": 0.0765, + "step": 35500 + }, + { + "epoch": 2.0815111231687466, + "grad_norm": 4.155003547668457, + "learning_rate": 5.2377689461422506e-06, + "loss": 0.1456, + "step": 35501 + }, + { + "epoch": 2.0815246880086815, + "grad_norm": 6.388220310211182, + "learning_rate": 5.237631903521996e-06, + "loss": 0.2219, + "step": 35502 + }, + { + "epoch": 2.0815382528486164, + "grad_norm": 5.187641620635986, + "learning_rate": 5.237494860901741e-06, + "loss": 0.1249, + "step": 35503 + }, + { + "epoch": 2.0815518176885512, + "grad_norm": 4.7116217613220215, + "learning_rate": 5.237357818281485e-06, + "loss": 0.1927, + "step": 35504 + }, + { + "epoch": 2.081565382528486, + "grad_norm": 3.8227078914642334, + "learning_rate": 5.237220775661231e-06, + "loss": 0.1466, + "step": 35505 + }, + { + "epoch": 2.081578947368421, + "grad_norm": 4.15797758102417, + "learning_rate": 5.2370837330409765e-06, + "loss": 0.1858, + "step": 35506 + }, + { + "epoch": 2.081592512208356, + "grad_norm": 4.052672386169434, + "learning_rate": 5.236946690420721e-06, + "loss": 0.1611, + "step": 35507 + }, + { + "epoch": 2.0816060770482907, + "grad_norm": 4.295933723449707, + "learning_rate": 5.236809647800466e-06, + "loss": 0.0857, + "step": 35508 + }, + { + "epoch": 2.0816196418882256, + "grad_norm": 4.178001403808594, + "learning_rate": 5.236672605180212e-06, + "loss": 0.1777, + "step": 35509 + }, + { + "epoch": 2.0816332067281604, + "grad_norm": 5.446472644805908, + "learning_rate": 5.236535562559957e-06, + "loss": 0.1405, + "step": 35510 + }, + { + "epoch": 2.0816467715680953, + "grad_norm": 4.270878314971924, + "learning_rate": 5.2363985199397015e-06, + "loss": 0.1433, + "step": 35511 + }, + { + "epoch": 2.0816603364080306, + "grad_norm": 4.416099548339844, + "learning_rate": 5.236261477319447e-06, + "loss": 0.1382, + "step": 35512 + }, + { + "epoch": 2.0816739012479655, + "grad_norm": 3.3960893154144287, + "learning_rate": 5.236124434699193e-06, + "loss": 0.196, + "step": 35513 + }, + { + "epoch": 2.0816874660879003, + "grad_norm": 4.267235279083252, + "learning_rate": 5.235987392078937e-06, + "loss": 0.2084, + "step": 35514 + }, + { + "epoch": 2.081701030927835, + "grad_norm": 4.880862712860107, + "learning_rate": 5.235850349458682e-06, + "loss": 0.1787, + "step": 35515 + }, + { + "epoch": 2.08171459576777, + "grad_norm": 4.977239608764648, + "learning_rate": 5.2357133068384266e-06, + "loss": 0.1612, + "step": 35516 + }, + { + "epoch": 2.081728160607705, + "grad_norm": 6.215895652770996, + "learning_rate": 5.235576264218172e-06, + "loss": 0.276, + "step": 35517 + }, + { + "epoch": 2.08174172544764, + "grad_norm": 4.740032196044922, + "learning_rate": 5.235439221597918e-06, + "loss": 0.1815, + "step": 35518 + }, + { + "epoch": 2.0817552902875747, + "grad_norm": 6.0713114738464355, + "learning_rate": 5.235302178977663e-06, + "loss": 0.252, + "step": 35519 + }, + { + "epoch": 2.0817688551275095, + "grad_norm": 6.142726421356201, + "learning_rate": 5.235165136357407e-06, + "loss": 0.2849, + "step": 35520 + }, + { + "epoch": 2.0817824199674444, + "grad_norm": 5.773497581481934, + "learning_rate": 5.2350280937371525e-06, + "loss": 0.2747, + "step": 35521 + }, + { + "epoch": 2.0817959848073793, + "grad_norm": 3.7809102535247803, + "learning_rate": 5.2348910511168985e-06, + "loss": 0.2023, + "step": 35522 + }, + { + "epoch": 2.081809549647314, + "grad_norm": 5.822170734405518, + "learning_rate": 5.234754008496643e-06, + "loss": 0.3076, + "step": 35523 + }, + { + "epoch": 2.081823114487249, + "grad_norm": 4.733989238739014, + "learning_rate": 5.234616965876388e-06, + "loss": 0.2533, + "step": 35524 + }, + { + "epoch": 2.081836679327184, + "grad_norm": 2.6681032180786133, + "learning_rate": 5.234479923256132e-06, + "loss": 0.1039, + "step": 35525 + }, + { + "epoch": 2.0818502441671187, + "grad_norm": 4.6901469230651855, + "learning_rate": 5.234342880635878e-06, + "loss": 0.2322, + "step": 35526 + }, + { + "epoch": 2.0818638090070536, + "grad_norm": 4.431936264038086, + "learning_rate": 5.2342058380156235e-06, + "loss": 0.1942, + "step": 35527 + }, + { + "epoch": 2.0818773738469885, + "grad_norm": 4.735555648803711, + "learning_rate": 5.234068795395369e-06, + "loss": 0.2766, + "step": 35528 + }, + { + "epoch": 2.0818909386869233, + "grad_norm": 6.467983722686768, + "learning_rate": 5.233931752775113e-06, + "loss": 0.2273, + "step": 35529 + }, + { + "epoch": 2.081904503526858, + "grad_norm": 3.258758068084717, + "learning_rate": 5.233794710154858e-06, + "loss": 0.1746, + "step": 35530 + }, + { + "epoch": 2.081918068366793, + "grad_norm": 4.996499061584473, + "learning_rate": 5.233657667534604e-06, + "loss": 0.2025, + "step": 35531 + }, + { + "epoch": 2.0819316332067284, + "grad_norm": 5.278348445892334, + "learning_rate": 5.2335206249143486e-06, + "loss": 0.1927, + "step": 35532 + }, + { + "epoch": 2.0819451980466632, + "grad_norm": 5.090909004211426, + "learning_rate": 5.233383582294094e-06, + "loss": 0.2005, + "step": 35533 + }, + { + "epoch": 2.081958762886598, + "grad_norm": 4.641794681549072, + "learning_rate": 5.233246539673839e-06, + "loss": 0.1784, + "step": 35534 + }, + { + "epoch": 2.081972327726533, + "grad_norm": 5.4807610511779785, + "learning_rate": 5.233109497053584e-06, + "loss": 0.2559, + "step": 35535 + }, + { + "epoch": 2.081985892566468, + "grad_norm": 3.331108331680298, + "learning_rate": 5.232972454433329e-06, + "loss": 0.1657, + "step": 35536 + }, + { + "epoch": 2.0819994574064027, + "grad_norm": 6.299142837524414, + "learning_rate": 5.2328354118130745e-06, + "loss": 0.336, + "step": 35537 + }, + { + "epoch": 2.0820130222463376, + "grad_norm": 4.9928765296936035, + "learning_rate": 5.232698369192819e-06, + "loss": 0.2292, + "step": 35538 + }, + { + "epoch": 2.0820265870862724, + "grad_norm": 5.546536445617676, + "learning_rate": 5.232561326572565e-06, + "loss": 0.1671, + "step": 35539 + }, + { + "epoch": 2.0820401519262073, + "grad_norm": 4.616925239562988, + "learning_rate": 5.23242428395231e-06, + "loss": 0.2506, + "step": 35540 + }, + { + "epoch": 2.082053716766142, + "grad_norm": 4.58898401260376, + "learning_rate": 5.232287241332054e-06, + "loss": 0.1895, + "step": 35541 + }, + { + "epoch": 2.082067281606077, + "grad_norm": 3.3813421726226807, + "learning_rate": 5.2321501987117995e-06, + "loss": 0.1489, + "step": 35542 + }, + { + "epoch": 2.082080846446012, + "grad_norm": 5.112771034240723, + "learning_rate": 5.232013156091545e-06, + "loss": 0.1602, + "step": 35543 + }, + { + "epoch": 2.0820944112859467, + "grad_norm": 6.045651435852051, + "learning_rate": 5.231876113471291e-06, + "loss": 0.2023, + "step": 35544 + }, + { + "epoch": 2.0821079761258816, + "grad_norm": 4.412164688110352, + "learning_rate": 5.231739070851035e-06, + "loss": 0.1574, + "step": 35545 + }, + { + "epoch": 2.0821215409658165, + "grad_norm": 3.002063512802124, + "learning_rate": 5.23160202823078e-06, + "loss": 0.1075, + "step": 35546 + }, + { + "epoch": 2.0821351058057513, + "grad_norm": 7.0752434730529785, + "learning_rate": 5.2314649856105246e-06, + "loss": 0.1793, + "step": 35547 + }, + { + "epoch": 2.082148670645686, + "grad_norm": 6.547384262084961, + "learning_rate": 5.231327942990271e-06, + "loss": 0.2777, + "step": 35548 + }, + { + "epoch": 2.082162235485621, + "grad_norm": 5.073884010314941, + "learning_rate": 5.231190900370016e-06, + "loss": 0.1602, + "step": 35549 + }, + { + "epoch": 2.0821758003255564, + "grad_norm": 4.110706806182861, + "learning_rate": 5.23105385774976e-06, + "loss": 0.1357, + "step": 35550 + }, + { + "epoch": 2.0821893651654912, + "grad_norm": 5.415646076202393, + "learning_rate": 5.230916815129505e-06, + "loss": 0.2192, + "step": 35551 + }, + { + "epoch": 2.082202930005426, + "grad_norm": 4.21345329284668, + "learning_rate": 5.230779772509251e-06, + "loss": 0.1693, + "step": 35552 + }, + { + "epoch": 2.082216494845361, + "grad_norm": 5.6414384841918945, + "learning_rate": 5.2306427298889965e-06, + "loss": 0.2384, + "step": 35553 + }, + { + "epoch": 2.082230059685296, + "grad_norm": 3.4509823322296143, + "learning_rate": 5.230505687268741e-06, + "loss": 0.1837, + "step": 35554 + }, + { + "epoch": 2.0822436245252307, + "grad_norm": 5.002497673034668, + "learning_rate": 5.230368644648486e-06, + "loss": 0.2397, + "step": 35555 + }, + { + "epoch": 2.0822571893651656, + "grad_norm": 3.90804123878479, + "learning_rate": 5.23023160202823e-06, + "loss": 0.2571, + "step": 35556 + }, + { + "epoch": 2.0822707542051004, + "grad_norm": 4.312190532684326, + "learning_rate": 5.230094559407976e-06, + "loss": 0.2833, + "step": 35557 + }, + { + "epoch": 2.0822843190450353, + "grad_norm": 4.674388408660889, + "learning_rate": 5.2299575167877215e-06, + "loss": 0.1993, + "step": 35558 + }, + { + "epoch": 2.08229788388497, + "grad_norm": 3.7636117935180664, + "learning_rate": 5.229820474167467e-06, + "loss": 0.1286, + "step": 35559 + }, + { + "epoch": 2.082311448724905, + "grad_norm": 3.025144338607788, + "learning_rate": 5.229683431547211e-06, + "loss": 0.1645, + "step": 35560 + }, + { + "epoch": 2.08232501356484, + "grad_norm": 3.8621087074279785, + "learning_rate": 5.229546388926957e-06, + "loss": 0.1978, + "step": 35561 + }, + { + "epoch": 2.0823385784047748, + "grad_norm": 4.338473320007324, + "learning_rate": 5.229409346306702e-06, + "loss": 0.1996, + "step": 35562 + }, + { + "epoch": 2.0823521432447096, + "grad_norm": 5.20556640625, + "learning_rate": 5.229272303686447e-06, + "loss": 0.2028, + "step": 35563 + }, + { + "epoch": 2.0823657080846445, + "grad_norm": 4.102988243103027, + "learning_rate": 5.229135261066192e-06, + "loss": 0.1915, + "step": 35564 + }, + { + "epoch": 2.0823792729245794, + "grad_norm": 4.313281059265137, + "learning_rate": 5.228998218445938e-06, + "loss": 0.2294, + "step": 35565 + }, + { + "epoch": 2.0823928377645142, + "grad_norm": 4.5020928382873535, + "learning_rate": 5.228861175825682e-06, + "loss": 0.1964, + "step": 35566 + }, + { + "epoch": 2.082406402604449, + "grad_norm": 3.1687965393066406, + "learning_rate": 5.228724133205427e-06, + "loss": 0.1256, + "step": 35567 + }, + { + "epoch": 2.082419967444384, + "grad_norm": 6.081828594207764, + "learning_rate": 5.2285870905851725e-06, + "loss": 0.2577, + "step": 35568 + }, + { + "epoch": 2.082433532284319, + "grad_norm": 5.097936153411865, + "learning_rate": 5.228450047964918e-06, + "loss": 0.1132, + "step": 35569 + }, + { + "epoch": 2.082447097124254, + "grad_norm": 3.401240587234497, + "learning_rate": 5.228313005344663e-06, + "loss": 0.1485, + "step": 35570 + }, + { + "epoch": 2.082460661964189, + "grad_norm": 3.899057626724243, + "learning_rate": 5.228175962724408e-06, + "loss": 0.1621, + "step": 35571 + }, + { + "epoch": 2.082474226804124, + "grad_norm": 4.329565525054932, + "learning_rate": 5.228038920104152e-06, + "loss": 0.1736, + "step": 35572 + }, + { + "epoch": 2.0824877916440587, + "grad_norm": 5.1747589111328125, + "learning_rate": 5.2279018774838975e-06, + "loss": 0.2192, + "step": 35573 + }, + { + "epoch": 2.0825013564839936, + "grad_norm": 3.5346336364746094, + "learning_rate": 5.2277648348636435e-06, + "loss": 0.1406, + "step": 35574 + }, + { + "epoch": 2.0825149213239285, + "grad_norm": 4.454044342041016, + "learning_rate": 5.227627792243388e-06, + "loss": 0.206, + "step": 35575 + }, + { + "epoch": 2.0825284861638633, + "grad_norm": 4.028907299041748, + "learning_rate": 5.227490749623133e-06, + "loss": 0.1593, + "step": 35576 + }, + { + "epoch": 2.082542051003798, + "grad_norm": 5.829555511474609, + "learning_rate": 5.227353707002878e-06, + "loss": 0.1827, + "step": 35577 + }, + { + "epoch": 2.082555615843733, + "grad_norm": 5.562022686004639, + "learning_rate": 5.227216664382624e-06, + "loss": 0.1796, + "step": 35578 + }, + { + "epoch": 2.082569180683668, + "grad_norm": 4.716475009918213, + "learning_rate": 5.227079621762369e-06, + "loss": 0.1492, + "step": 35579 + }, + { + "epoch": 2.082582745523603, + "grad_norm": 5.3135271072387695, + "learning_rate": 5.226942579142114e-06, + "loss": 0.2316, + "step": 35580 + }, + { + "epoch": 2.0825963103635377, + "grad_norm": 5.483519077301025, + "learning_rate": 5.226805536521858e-06, + "loss": 0.2034, + "step": 35581 + }, + { + "epoch": 2.0826098752034725, + "grad_norm": 4.360384941101074, + "learning_rate": 5.226668493901604e-06, + "loss": 0.1609, + "step": 35582 + }, + { + "epoch": 2.0826234400434074, + "grad_norm": 5.249281406402588, + "learning_rate": 5.226531451281349e-06, + "loss": 0.1741, + "step": 35583 + }, + { + "epoch": 2.0826370048833422, + "grad_norm": 4.286457061767578, + "learning_rate": 5.226394408661094e-06, + "loss": 0.2542, + "step": 35584 + }, + { + "epoch": 2.082650569723277, + "grad_norm": 5.2043328285217285, + "learning_rate": 5.226257366040839e-06, + "loss": 0.1651, + "step": 35585 + }, + { + "epoch": 2.082664134563212, + "grad_norm": 6.454636573791504, + "learning_rate": 5.226120323420584e-06, + "loss": 0.2656, + "step": 35586 + }, + { + "epoch": 2.082677699403147, + "grad_norm": 4.820414066314697, + "learning_rate": 5.22598328080033e-06, + "loss": 0.2054, + "step": 35587 + }, + { + "epoch": 2.082691264243082, + "grad_norm": 4.664513111114502, + "learning_rate": 5.225846238180074e-06, + "loss": 0.144, + "step": 35588 + }, + { + "epoch": 2.082704829083017, + "grad_norm": 4.952854633331299, + "learning_rate": 5.2257091955598195e-06, + "loss": 0.169, + "step": 35589 + }, + { + "epoch": 2.082718393922952, + "grad_norm": 6.069165229797363, + "learning_rate": 5.225572152939564e-06, + "loss": 0.2385, + "step": 35590 + }, + { + "epoch": 2.0827319587628867, + "grad_norm": 6.416177749633789, + "learning_rate": 5.22543511031931e-06, + "loss": 0.2395, + "step": 35591 + }, + { + "epoch": 2.0827455236028216, + "grad_norm": 3.7731399536132812, + "learning_rate": 5.225298067699055e-06, + "loss": 0.1607, + "step": 35592 + }, + { + "epoch": 2.0827590884427565, + "grad_norm": 5.908918380737305, + "learning_rate": 5.2251610250788e-06, + "loss": 0.303, + "step": 35593 + }, + { + "epoch": 2.0827726532826913, + "grad_norm": 5.174413681030273, + "learning_rate": 5.225023982458545e-06, + "loss": 0.1956, + "step": 35594 + }, + { + "epoch": 2.082786218122626, + "grad_norm": 5.61961555480957, + "learning_rate": 5.224886939838291e-06, + "loss": 0.1792, + "step": 35595 + }, + { + "epoch": 2.082799782962561, + "grad_norm": 4.247034549713135, + "learning_rate": 5.224749897218036e-06, + "loss": 0.1688, + "step": 35596 + }, + { + "epoch": 2.082813347802496, + "grad_norm": 5.354546070098877, + "learning_rate": 5.22461285459778e-06, + "loss": 0.2344, + "step": 35597 + }, + { + "epoch": 2.082826912642431, + "grad_norm": 3.993905544281006, + "learning_rate": 5.224475811977525e-06, + "loss": 0.1389, + "step": 35598 + }, + { + "epoch": 2.0828404774823657, + "grad_norm": 3.766507148742676, + "learning_rate": 5.22433876935727e-06, + "loss": 0.1359, + "step": 35599 + }, + { + "epoch": 2.0828540423223005, + "grad_norm": 4.087651252746582, + "learning_rate": 5.224201726737016e-06, + "loss": 0.1921, + "step": 35600 + }, + { + "epoch": 2.0828676071622354, + "grad_norm": 6.170285224914551, + "learning_rate": 5.224064684116761e-06, + "loss": 0.1597, + "step": 35601 + }, + { + "epoch": 2.0828811720021703, + "grad_norm": 5.239121437072754, + "learning_rate": 5.223927641496506e-06, + "loss": 0.1817, + "step": 35602 + }, + { + "epoch": 2.082894736842105, + "grad_norm": 6.258025646209717, + "learning_rate": 5.22379059887625e-06, + "loss": 0.2001, + "step": 35603 + }, + { + "epoch": 2.08290830168204, + "grad_norm": 3.8367457389831543, + "learning_rate": 5.223653556255996e-06, + "loss": 0.1348, + "step": 35604 + }, + { + "epoch": 2.082921866521975, + "grad_norm": 6.293825149536133, + "learning_rate": 5.2235165136357416e-06, + "loss": 0.304, + "step": 35605 + }, + { + "epoch": 2.0829354313619097, + "grad_norm": 5.22102689743042, + "learning_rate": 5.223379471015486e-06, + "loss": 0.1509, + "step": 35606 + }, + { + "epoch": 2.0829489962018446, + "grad_norm": 4.371652603149414, + "learning_rate": 5.223242428395231e-06, + "loss": 0.1309, + "step": 35607 + }, + { + "epoch": 2.08296256104178, + "grad_norm": 4.5904541015625, + "learning_rate": 5.223105385774977e-06, + "loss": 0.1507, + "step": 35608 + }, + { + "epoch": 2.0829761258817148, + "grad_norm": 5.878068447113037, + "learning_rate": 5.222968343154721e-06, + "loss": 0.1461, + "step": 35609 + }, + { + "epoch": 2.0829896907216496, + "grad_norm": 5.36635160446167, + "learning_rate": 5.222831300534467e-06, + "loss": 0.168, + "step": 35610 + }, + { + "epoch": 2.0830032555615845, + "grad_norm": 5.505244255065918, + "learning_rate": 5.222694257914212e-06, + "loss": 0.182, + "step": 35611 + }, + { + "epoch": 2.0830168204015194, + "grad_norm": 3.6922528743743896, + "learning_rate": 5.222557215293956e-06, + "loss": 0.1008, + "step": 35612 + }, + { + "epoch": 2.0830303852414542, + "grad_norm": 3.328009843826294, + "learning_rate": 5.222420172673702e-06, + "loss": 0.1307, + "step": 35613 + }, + { + "epoch": 2.083043950081389, + "grad_norm": 6.611368656158447, + "learning_rate": 5.222283130053447e-06, + "loss": 0.175, + "step": 35614 + }, + { + "epoch": 2.083057514921324, + "grad_norm": 4.762660503387451, + "learning_rate": 5.222146087433192e-06, + "loss": 0.1747, + "step": 35615 + }, + { + "epoch": 2.083071079761259, + "grad_norm": 4.620240688323975, + "learning_rate": 5.222009044812937e-06, + "loss": 0.1393, + "step": 35616 + }, + { + "epoch": 2.0830846446011937, + "grad_norm": 5.333117961883545, + "learning_rate": 5.221872002192683e-06, + "loss": 0.1475, + "step": 35617 + }, + { + "epoch": 2.0830982094411286, + "grad_norm": 4.792811870574951, + "learning_rate": 5.221734959572427e-06, + "loss": 0.1627, + "step": 35618 + }, + { + "epoch": 2.0831117742810634, + "grad_norm": 6.429009437561035, + "learning_rate": 5.221597916952172e-06, + "loss": 0.1667, + "step": 35619 + }, + { + "epoch": 2.0831253391209983, + "grad_norm": 4.914426326751709, + "learning_rate": 5.2214608743319175e-06, + "loss": 0.1857, + "step": 35620 + }, + { + "epoch": 2.083138903960933, + "grad_norm": 4.738076686859131, + "learning_rate": 5.2213238317116636e-06, + "loss": 0.1907, + "step": 35621 + }, + { + "epoch": 2.083152468800868, + "grad_norm": 4.556074619293213, + "learning_rate": 5.221186789091408e-06, + "loss": 0.1119, + "step": 35622 + }, + { + "epoch": 2.083166033640803, + "grad_norm": 4.5977864265441895, + "learning_rate": 5.221049746471153e-06, + "loss": 0.1395, + "step": 35623 + }, + { + "epoch": 2.0831795984807377, + "grad_norm": 4.192423343658447, + "learning_rate": 5.220912703850897e-06, + "loss": 0.138, + "step": 35624 + }, + { + "epoch": 2.0831931633206726, + "grad_norm": 4.9181365966796875, + "learning_rate": 5.220775661230643e-06, + "loss": 0.1471, + "step": 35625 + }, + { + "epoch": 2.083206728160608, + "grad_norm": 3.047837018966675, + "learning_rate": 5.220638618610389e-06, + "loss": 0.0934, + "step": 35626 + }, + { + "epoch": 2.083220293000543, + "grad_norm": 4.84074068069458, + "learning_rate": 5.220501575990134e-06, + "loss": 0.1844, + "step": 35627 + }, + { + "epoch": 2.0832338578404777, + "grad_norm": 5.050709247589111, + "learning_rate": 5.220364533369878e-06, + "loss": 0.1334, + "step": 35628 + }, + { + "epoch": 2.0832474226804125, + "grad_norm": 4.66599702835083, + "learning_rate": 5.220227490749623e-06, + "loss": 0.2281, + "step": 35629 + }, + { + "epoch": 2.0832609875203474, + "grad_norm": 5.413567066192627, + "learning_rate": 5.220090448129369e-06, + "loss": 0.1976, + "step": 35630 + }, + { + "epoch": 2.0832745523602823, + "grad_norm": 4.945500373840332, + "learning_rate": 5.219953405509114e-06, + "loss": 0.1816, + "step": 35631 + }, + { + "epoch": 2.083288117200217, + "grad_norm": 4.904531002044678, + "learning_rate": 5.219816362888859e-06, + "loss": 0.1903, + "step": 35632 + }, + { + "epoch": 2.083301682040152, + "grad_norm": 4.837728977203369, + "learning_rate": 5.219679320268603e-06, + "loss": 0.1868, + "step": 35633 + }, + { + "epoch": 2.083315246880087, + "grad_norm": 4.4003777503967285, + "learning_rate": 5.219542277648349e-06, + "loss": 0.1285, + "step": 35634 + }, + { + "epoch": 2.0833288117200217, + "grad_norm": 5.577904224395752, + "learning_rate": 5.219405235028094e-06, + "loss": 0.2299, + "step": 35635 + }, + { + "epoch": 2.0833423765599566, + "grad_norm": 6.4318647384643555, + "learning_rate": 5.2192681924078396e-06, + "loss": 0.2139, + "step": 35636 + }, + { + "epoch": 2.0833559413998914, + "grad_norm": 6.7392120361328125, + "learning_rate": 5.219131149787584e-06, + "loss": 0.2028, + "step": 35637 + }, + { + "epoch": 2.0833695062398263, + "grad_norm": 4.129542350769043, + "learning_rate": 5.21899410716733e-06, + "loss": 0.1706, + "step": 35638 + }, + { + "epoch": 2.083383071079761, + "grad_norm": 4.7846293449401855, + "learning_rate": 5.218857064547075e-06, + "loss": 0.1721, + "step": 35639 + }, + { + "epoch": 2.083396635919696, + "grad_norm": 3.350036382675171, + "learning_rate": 5.2187200219268194e-06, + "loss": 0.1564, + "step": 35640 + }, + { + "epoch": 2.083410200759631, + "grad_norm": 3.8868722915649414, + "learning_rate": 5.218582979306565e-06, + "loss": 0.1731, + "step": 35641 + }, + { + "epoch": 2.0834237655995658, + "grad_norm": 3.605339288711548, + "learning_rate": 5.21844593668631e-06, + "loss": 0.1028, + "step": 35642 + }, + { + "epoch": 2.0834373304395006, + "grad_norm": 4.51558256149292, + "learning_rate": 5.218308894066055e-06, + "loss": 0.2002, + "step": 35643 + }, + { + "epoch": 2.0834508952794355, + "grad_norm": 6.647059917449951, + "learning_rate": 5.2181718514458e-06, + "loss": 0.2863, + "step": 35644 + }, + { + "epoch": 2.0834644601193704, + "grad_norm": 4.268841743469238, + "learning_rate": 5.218034808825545e-06, + "loss": 0.1629, + "step": 35645 + }, + { + "epoch": 2.0834780249593057, + "grad_norm": 3.3173675537109375, + "learning_rate": 5.21789776620529e-06, + "loss": 0.1189, + "step": 35646 + }, + { + "epoch": 2.0834915897992405, + "grad_norm": 4.795090198516846, + "learning_rate": 5.217760723585036e-06, + "loss": 0.1993, + "step": 35647 + }, + { + "epoch": 2.0835051546391754, + "grad_norm": 4.703158855438232, + "learning_rate": 5.217623680964781e-06, + "loss": 0.222, + "step": 35648 + }, + { + "epoch": 2.0835187194791103, + "grad_norm": 4.49915885925293, + "learning_rate": 5.217486638344525e-06, + "loss": 0.1994, + "step": 35649 + }, + { + "epoch": 2.083532284319045, + "grad_norm": 3.3028712272644043, + "learning_rate": 5.21734959572427e-06, + "loss": 0.108, + "step": 35650 + }, + { + "epoch": 2.08354584915898, + "grad_norm": 4.21524715423584, + "learning_rate": 5.217212553104016e-06, + "loss": 0.1496, + "step": 35651 + }, + { + "epoch": 2.083559413998915, + "grad_norm": 4.622231960296631, + "learning_rate": 5.2170755104837616e-06, + "loss": 0.1515, + "step": 35652 + }, + { + "epoch": 2.0835729788388497, + "grad_norm": 4.494816780090332, + "learning_rate": 5.216938467863506e-06, + "loss": 0.1967, + "step": 35653 + }, + { + "epoch": 2.0835865436787846, + "grad_norm": 5.73393440246582, + "learning_rate": 5.216801425243251e-06, + "loss": 0.1616, + "step": 35654 + }, + { + "epoch": 2.0836001085187195, + "grad_norm": 3.8149054050445557, + "learning_rate": 5.216664382622995e-06, + "loss": 0.1003, + "step": 35655 + }, + { + "epoch": 2.0836136733586543, + "grad_norm": 4.959342002868652, + "learning_rate": 5.2165273400027414e-06, + "loss": 0.1684, + "step": 35656 + }, + { + "epoch": 2.083627238198589, + "grad_norm": 4.041841983795166, + "learning_rate": 5.216390297382487e-06, + "loss": 0.1444, + "step": 35657 + }, + { + "epoch": 2.083640803038524, + "grad_norm": 4.919468879699707, + "learning_rate": 5.216253254762231e-06, + "loss": 0.24, + "step": 35658 + }, + { + "epoch": 2.083654367878459, + "grad_norm": 3.6548941135406494, + "learning_rate": 5.216116212141976e-06, + "loss": 0.1514, + "step": 35659 + }, + { + "epoch": 2.083667932718394, + "grad_norm": 3.50541090965271, + "learning_rate": 5.215979169521722e-06, + "loss": 0.1082, + "step": 35660 + }, + { + "epoch": 2.0836814975583287, + "grad_norm": 4.67173957824707, + "learning_rate": 5.215842126901467e-06, + "loss": 0.1596, + "step": 35661 + }, + { + "epoch": 2.0836950623982635, + "grad_norm": 6.097330093383789, + "learning_rate": 5.215705084281212e-06, + "loss": 0.1821, + "step": 35662 + }, + { + "epoch": 2.0837086272381984, + "grad_norm": 4.7225751876831055, + "learning_rate": 5.215568041660957e-06, + "loss": 0.0913, + "step": 35663 + }, + { + "epoch": 2.0837221920781337, + "grad_norm": 3.830441474914551, + "learning_rate": 5.215430999040703e-06, + "loss": 0.1385, + "step": 35664 + }, + { + "epoch": 2.0837357569180686, + "grad_norm": 4.168881893157959, + "learning_rate": 5.215293956420447e-06, + "loss": 0.1701, + "step": 35665 + }, + { + "epoch": 2.0837493217580034, + "grad_norm": 6.587944030761719, + "learning_rate": 5.215156913800192e-06, + "loss": 0.1841, + "step": 35666 + }, + { + "epoch": 2.0837628865979383, + "grad_norm": 3.7932958602905273, + "learning_rate": 5.215019871179937e-06, + "loss": 0.1538, + "step": 35667 + }, + { + "epoch": 2.083776451437873, + "grad_norm": 3.8752026557922363, + "learning_rate": 5.214882828559682e-06, + "loss": 0.1765, + "step": 35668 + }, + { + "epoch": 2.083790016277808, + "grad_norm": 5.6168999671936035, + "learning_rate": 5.214745785939428e-06, + "loss": 0.2362, + "step": 35669 + }, + { + "epoch": 2.083803581117743, + "grad_norm": 4.837762832641602, + "learning_rate": 5.214608743319173e-06, + "loss": 0.1762, + "step": 35670 + }, + { + "epoch": 2.0838171459576778, + "grad_norm": 4.3760457038879395, + "learning_rate": 5.2144717006989174e-06, + "loss": 0.1837, + "step": 35671 + }, + { + "epoch": 2.0838307107976126, + "grad_norm": 3.826094627380371, + "learning_rate": 5.214334658078663e-06, + "loss": 0.1139, + "step": 35672 + }, + { + "epoch": 2.0838442756375475, + "grad_norm": 4.356306552886963, + "learning_rate": 5.214197615458409e-06, + "loss": 0.1155, + "step": 35673 + }, + { + "epoch": 2.0838578404774823, + "grad_norm": 4.461370468139648, + "learning_rate": 5.214060572838153e-06, + "loss": 0.2741, + "step": 35674 + }, + { + "epoch": 2.083871405317417, + "grad_norm": 5.614892482757568, + "learning_rate": 5.213923530217898e-06, + "loss": 0.2244, + "step": 35675 + }, + { + "epoch": 2.083884970157352, + "grad_norm": 4.963206768035889, + "learning_rate": 5.213786487597643e-06, + "loss": 0.1268, + "step": 35676 + }, + { + "epoch": 2.083898534997287, + "grad_norm": 5.1568098068237305, + "learning_rate": 5.2136494449773885e-06, + "loss": 0.2636, + "step": 35677 + }, + { + "epoch": 2.083912099837222, + "grad_norm": 5.676686763763428, + "learning_rate": 5.213512402357134e-06, + "loss": 0.2179, + "step": 35678 + }, + { + "epoch": 2.0839256646771567, + "grad_norm": 5.204169750213623, + "learning_rate": 5.213375359736879e-06, + "loss": 0.2961, + "step": 35679 + }, + { + "epoch": 2.0839392295170915, + "grad_norm": 3.872328281402588, + "learning_rate": 5.213238317116623e-06, + "loss": 0.2057, + "step": 35680 + }, + { + "epoch": 2.0839527943570264, + "grad_norm": 5.43017053604126, + "learning_rate": 5.213101274496368e-06, + "loss": 0.2247, + "step": 35681 + }, + { + "epoch": 2.0839663591969613, + "grad_norm": 4.583033561706543, + "learning_rate": 5.212964231876114e-06, + "loss": 0.1219, + "step": 35682 + }, + { + "epoch": 2.083979924036896, + "grad_norm": 3.663691520690918, + "learning_rate": 5.212827189255859e-06, + "loss": 0.0996, + "step": 35683 + }, + { + "epoch": 2.0839934888768314, + "grad_norm": 5.636842727661133, + "learning_rate": 5.212690146635604e-06, + "loss": 0.1668, + "step": 35684 + }, + { + "epoch": 2.0840070537167663, + "grad_norm": 3.13466477394104, + "learning_rate": 5.212553104015349e-06, + "loss": 0.1295, + "step": 35685 + }, + { + "epoch": 2.084020618556701, + "grad_norm": 2.896209955215454, + "learning_rate": 5.212416061395095e-06, + "loss": 0.075, + "step": 35686 + }, + { + "epoch": 2.084034183396636, + "grad_norm": 3.6999146938323975, + "learning_rate": 5.2122790187748394e-06, + "loss": 0.1367, + "step": 35687 + }, + { + "epoch": 2.084047748236571, + "grad_norm": 2.93961763381958, + "learning_rate": 5.212141976154585e-06, + "loss": 0.1092, + "step": 35688 + }, + { + "epoch": 2.0840613130765058, + "grad_norm": 4.367901802062988, + "learning_rate": 5.212004933534329e-06, + "loss": 0.1609, + "step": 35689 + }, + { + "epoch": 2.0840748779164406, + "grad_norm": 3.636017322540283, + "learning_rate": 5.211867890914075e-06, + "loss": 0.1606, + "step": 35690 + }, + { + "epoch": 2.0840884427563755, + "grad_norm": 5.032949447631836, + "learning_rate": 5.21173084829382e-06, + "loss": 0.1562, + "step": 35691 + }, + { + "epoch": 2.0841020075963104, + "grad_norm": 4.4746928215026855, + "learning_rate": 5.2115938056735645e-06, + "loss": 0.1287, + "step": 35692 + }, + { + "epoch": 2.0841155724362452, + "grad_norm": 5.1640143394470215, + "learning_rate": 5.21145676305331e-06, + "loss": 0.2607, + "step": 35693 + }, + { + "epoch": 2.08412913727618, + "grad_norm": 4.607021808624268, + "learning_rate": 5.211319720433055e-06, + "loss": 0.1558, + "step": 35694 + }, + { + "epoch": 2.084142702116115, + "grad_norm": 3.249567985534668, + "learning_rate": 5.211182677812801e-06, + "loss": 0.1248, + "step": 35695 + }, + { + "epoch": 2.08415626695605, + "grad_norm": 3.372237205505371, + "learning_rate": 5.211045635192545e-06, + "loss": 0.0894, + "step": 35696 + }, + { + "epoch": 2.0841698317959847, + "grad_norm": 3.121217966079712, + "learning_rate": 5.21090859257229e-06, + "loss": 0.0823, + "step": 35697 + }, + { + "epoch": 2.0841833966359196, + "grad_norm": 5.418755054473877, + "learning_rate": 5.210771549952035e-06, + "loss": 0.2259, + "step": 35698 + }, + { + "epoch": 2.0841969614758544, + "grad_norm": 6.678102970123291, + "learning_rate": 5.210634507331781e-06, + "loss": 0.2559, + "step": 35699 + }, + { + "epoch": 2.0842105263157893, + "grad_norm": 3.5021281242370605, + "learning_rate": 5.210497464711526e-06, + "loss": 0.1099, + "step": 35700 + }, + { + "epoch": 2.084224091155724, + "grad_norm": 3.96679425239563, + "learning_rate": 5.210360422091271e-06, + "loss": 0.18, + "step": 35701 + }, + { + "epoch": 2.0842376559956595, + "grad_norm": 4.269003391265869, + "learning_rate": 5.2102233794710154e-06, + "loss": 0.1648, + "step": 35702 + }, + { + "epoch": 2.0842512208355943, + "grad_norm": 4.815479755401611, + "learning_rate": 5.2100863368507615e-06, + "loss": 0.1672, + "step": 35703 + }, + { + "epoch": 2.084264785675529, + "grad_norm": 4.779609203338623, + "learning_rate": 5.209949294230507e-06, + "loss": 0.1558, + "step": 35704 + }, + { + "epoch": 2.084278350515464, + "grad_norm": 3.7263407707214355, + "learning_rate": 5.209812251610251e-06, + "loss": 0.1072, + "step": 35705 + }, + { + "epoch": 2.084291915355399, + "grad_norm": 4.818544864654541, + "learning_rate": 5.209675208989996e-06, + "loss": 0.2081, + "step": 35706 + }, + { + "epoch": 2.084305480195334, + "grad_norm": 4.23234748840332, + "learning_rate": 5.209538166369742e-06, + "loss": 0.1499, + "step": 35707 + }, + { + "epoch": 2.0843190450352687, + "grad_norm": 6.385865688323975, + "learning_rate": 5.2094011237494865e-06, + "loss": 0.2957, + "step": 35708 + }, + { + "epoch": 2.0843326098752035, + "grad_norm": 3.2367606163024902, + "learning_rate": 5.209264081129232e-06, + "loss": 0.1245, + "step": 35709 + }, + { + "epoch": 2.0843461747151384, + "grad_norm": 5.404021739959717, + "learning_rate": 5.209127038508977e-06, + "loss": 0.1768, + "step": 35710 + }, + { + "epoch": 2.0843597395550733, + "grad_norm": 3.5535836219787598, + "learning_rate": 5.208989995888721e-06, + "loss": 0.1452, + "step": 35711 + }, + { + "epoch": 2.084373304395008, + "grad_norm": 3.3689301013946533, + "learning_rate": 5.208852953268467e-06, + "loss": 0.106, + "step": 35712 + }, + { + "epoch": 2.084386869234943, + "grad_norm": 4.75955867767334, + "learning_rate": 5.208715910648212e-06, + "loss": 0.2524, + "step": 35713 + }, + { + "epoch": 2.084400434074878, + "grad_norm": 5.578322410583496, + "learning_rate": 5.208578868027957e-06, + "loss": 0.1742, + "step": 35714 + }, + { + "epoch": 2.0844139989148127, + "grad_norm": 3.403937816619873, + "learning_rate": 5.208441825407702e-06, + "loss": 0.0916, + "step": 35715 + }, + { + "epoch": 2.0844275637547476, + "grad_norm": 4.394507884979248, + "learning_rate": 5.208304782787448e-06, + "loss": 0.1479, + "step": 35716 + }, + { + "epoch": 2.0844411285946824, + "grad_norm": 3.3544747829437256, + "learning_rate": 5.208167740167192e-06, + "loss": 0.13, + "step": 35717 + }, + { + "epoch": 2.0844546934346173, + "grad_norm": 2.5617916584014893, + "learning_rate": 5.2080306975469374e-06, + "loss": 0.0743, + "step": 35718 + }, + { + "epoch": 2.084468258274552, + "grad_norm": 4.090343952178955, + "learning_rate": 5.207893654926683e-06, + "loss": 0.1282, + "step": 35719 + }, + { + "epoch": 2.084481823114487, + "grad_norm": 2.603949546813965, + "learning_rate": 5.207756612306429e-06, + "loss": 0.1016, + "step": 35720 + }, + { + "epoch": 2.0844953879544224, + "grad_norm": 4.598490238189697, + "learning_rate": 5.207619569686173e-06, + "loss": 0.2167, + "step": 35721 + }, + { + "epoch": 2.084508952794357, + "grad_norm": 3.8523669242858887, + "learning_rate": 5.207482527065918e-06, + "loss": 0.1207, + "step": 35722 + }, + { + "epoch": 2.084522517634292, + "grad_norm": 4.73619270324707, + "learning_rate": 5.2073454844456625e-06, + "loss": 0.1161, + "step": 35723 + }, + { + "epoch": 2.084536082474227, + "grad_norm": 3.49393367767334, + "learning_rate": 5.207208441825408e-06, + "loss": 0.1518, + "step": 35724 + }, + { + "epoch": 2.084549647314162, + "grad_norm": 4.341973304748535, + "learning_rate": 5.207071399205154e-06, + "loss": 0.1231, + "step": 35725 + }, + { + "epoch": 2.0845632121540967, + "grad_norm": 3.6109108924865723, + "learning_rate": 5.206934356584898e-06, + "loss": 0.087, + "step": 35726 + }, + { + "epoch": 2.0845767769940315, + "grad_norm": 5.166426181793213, + "learning_rate": 5.206797313964643e-06, + "loss": 0.1629, + "step": 35727 + }, + { + "epoch": 2.0845903418339664, + "grad_norm": 4.954616069793701, + "learning_rate": 5.206660271344388e-06, + "loss": 0.1549, + "step": 35728 + }, + { + "epoch": 2.0846039066739013, + "grad_norm": 3.4192328453063965, + "learning_rate": 5.206523228724134e-06, + "loss": 0.1278, + "step": 35729 + }, + { + "epoch": 2.084617471513836, + "grad_norm": 3.6095306873321533, + "learning_rate": 5.206386186103879e-06, + "loss": 0.1235, + "step": 35730 + }, + { + "epoch": 2.084631036353771, + "grad_norm": 3.0677671432495117, + "learning_rate": 5.206249143483624e-06, + "loss": 0.0871, + "step": 35731 + }, + { + "epoch": 2.084644601193706, + "grad_norm": 5.259298801422119, + "learning_rate": 5.206112100863368e-06, + "loss": 0.1843, + "step": 35732 + }, + { + "epoch": 2.0846581660336407, + "grad_norm": 4.488002300262451, + "learning_rate": 5.205975058243114e-06, + "loss": 0.2077, + "step": 35733 + }, + { + "epoch": 2.0846717308735756, + "grad_norm": 3.7687249183654785, + "learning_rate": 5.2058380156228595e-06, + "loss": 0.1435, + "step": 35734 + }, + { + "epoch": 2.0846852957135105, + "grad_norm": 4.266343116760254, + "learning_rate": 5.205700973002605e-06, + "loss": 0.1454, + "step": 35735 + }, + { + "epoch": 2.0846988605534453, + "grad_norm": 4.486297607421875, + "learning_rate": 5.205563930382349e-06, + "loss": 0.1832, + "step": 35736 + }, + { + "epoch": 2.08471242539338, + "grad_norm": 2.923098087310791, + "learning_rate": 5.205426887762094e-06, + "loss": 0.0779, + "step": 35737 + }, + { + "epoch": 2.084725990233315, + "grad_norm": 4.462001800537109, + "learning_rate": 5.20528984514184e-06, + "loss": 0.0843, + "step": 35738 + }, + { + "epoch": 2.08473955507325, + "grad_norm": 2.7718052864074707, + "learning_rate": 5.2051528025215845e-06, + "loss": 0.0991, + "step": 35739 + }, + { + "epoch": 2.0847531199131852, + "grad_norm": 4.262258529663086, + "learning_rate": 5.20501575990133e-06, + "loss": 0.158, + "step": 35740 + }, + { + "epoch": 2.08476668475312, + "grad_norm": 5.733902454376221, + "learning_rate": 5.204878717281074e-06, + "loss": 0.1599, + "step": 35741 + }, + { + "epoch": 2.084780249593055, + "grad_norm": 3.033794641494751, + "learning_rate": 5.20474167466082e-06, + "loss": 0.0879, + "step": 35742 + }, + { + "epoch": 2.08479381443299, + "grad_norm": 5.361350059509277, + "learning_rate": 5.204604632040565e-06, + "loss": 0.1819, + "step": 35743 + }, + { + "epoch": 2.0848073792729247, + "grad_norm": 5.425729751586914, + "learning_rate": 5.20446758942031e-06, + "loss": 0.171, + "step": 35744 + }, + { + "epoch": 2.0848209441128596, + "grad_norm": 4.537123203277588, + "learning_rate": 5.204330546800055e-06, + "loss": 0.1394, + "step": 35745 + }, + { + "epoch": 2.0848345089527944, + "grad_norm": 4.66334342956543, + "learning_rate": 5.204193504179801e-06, + "loss": 0.185, + "step": 35746 + }, + { + "epoch": 2.0848480737927293, + "grad_norm": 3.5036041736602783, + "learning_rate": 5.204056461559546e-06, + "loss": 0.1378, + "step": 35747 + }, + { + "epoch": 2.084861638632664, + "grad_norm": 3.8807532787323, + "learning_rate": 5.20391941893929e-06, + "loss": 0.1256, + "step": 35748 + }, + { + "epoch": 2.084875203472599, + "grad_norm": 3.1865923404693604, + "learning_rate": 5.2037823763190355e-06, + "loss": 0.0868, + "step": 35749 + }, + { + "epoch": 2.084888768312534, + "grad_norm": 3.410083055496216, + "learning_rate": 5.203645333698781e-06, + "loss": 0.094, + "step": 35750 + }, + { + "epoch": 2.0849023331524688, + "grad_norm": 4.437623023986816, + "learning_rate": 5.203508291078526e-06, + "loss": 0.1513, + "step": 35751 + }, + { + "epoch": 2.0849158979924036, + "grad_norm": 3.168712615966797, + "learning_rate": 5.203371248458271e-06, + "loss": 0.0732, + "step": 35752 + }, + { + "epoch": 2.0849294628323385, + "grad_norm": 3.3815383911132812, + "learning_rate": 5.203234205838016e-06, + "loss": 0.1054, + "step": 35753 + }, + { + "epoch": 2.0849430276722734, + "grad_norm": 3.7374141216278076, + "learning_rate": 5.2030971632177605e-06, + "loss": 0.1139, + "step": 35754 + }, + { + "epoch": 2.084956592512208, + "grad_norm": 3.8327274322509766, + "learning_rate": 5.2029601205975065e-06, + "loss": 0.1341, + "step": 35755 + }, + { + "epoch": 2.084970157352143, + "grad_norm": 4.131344795227051, + "learning_rate": 5.202823077977252e-06, + "loss": 0.1225, + "step": 35756 + }, + { + "epoch": 2.084983722192078, + "grad_norm": 4.972899436950684, + "learning_rate": 5.202686035356996e-06, + "loss": 0.1362, + "step": 35757 + }, + { + "epoch": 2.084997287032013, + "grad_norm": 9.193486213684082, + "learning_rate": 5.202548992736741e-06, + "loss": 0.2116, + "step": 35758 + }, + { + "epoch": 2.085010851871948, + "grad_norm": 2.70843243598938, + "learning_rate": 5.202411950116487e-06, + "loss": 0.074, + "step": 35759 + }, + { + "epoch": 2.085024416711883, + "grad_norm": 2.9455645084381104, + "learning_rate": 5.2022749074962316e-06, + "loss": 0.067, + "step": 35760 + }, + { + "epoch": 2.085037981551818, + "grad_norm": 4.004158020019531, + "learning_rate": 5.202137864875977e-06, + "loss": 0.1206, + "step": 35761 + }, + { + "epoch": 2.0850515463917527, + "grad_norm": 4.596392631530762, + "learning_rate": 5.202000822255722e-06, + "loss": 0.122, + "step": 35762 + }, + { + "epoch": 2.0850651112316876, + "grad_norm": 4.588801383972168, + "learning_rate": 5.201863779635466e-06, + "loss": 0.1678, + "step": 35763 + }, + { + "epoch": 2.0850786760716224, + "grad_norm": 5.828886985778809, + "learning_rate": 5.201726737015212e-06, + "loss": 0.1641, + "step": 35764 + }, + { + "epoch": 2.0850922409115573, + "grad_norm": 4.164643287658691, + "learning_rate": 5.2015896943949575e-06, + "loss": 0.0979, + "step": 35765 + }, + { + "epoch": 2.085105805751492, + "grad_norm": 3.359731435775757, + "learning_rate": 5.201452651774702e-06, + "loss": 0.0742, + "step": 35766 + }, + { + "epoch": 2.085119370591427, + "grad_norm": 4.952978610992432, + "learning_rate": 5.201315609154447e-06, + "loss": 0.2146, + "step": 35767 + }, + { + "epoch": 2.085132935431362, + "grad_norm": 4.662286281585693, + "learning_rate": 5.201178566534193e-06, + "loss": 0.1304, + "step": 35768 + }, + { + "epoch": 2.0851465002712968, + "grad_norm": 3.268451452255249, + "learning_rate": 5.201041523913938e-06, + "loss": 0.0907, + "step": 35769 + }, + { + "epoch": 2.0851600651112316, + "grad_norm": 4.254659175872803, + "learning_rate": 5.2009044812936825e-06, + "loss": 0.1463, + "step": 35770 + }, + { + "epoch": 2.0851736299511665, + "grad_norm": 3.646631956100464, + "learning_rate": 5.200767438673428e-06, + "loss": 0.0868, + "step": 35771 + }, + { + "epoch": 2.0851871947911014, + "grad_norm": 3.072089910507202, + "learning_rate": 5.200630396053174e-06, + "loss": 0.078, + "step": 35772 + }, + { + "epoch": 2.0852007596310362, + "grad_norm": 4.382805824279785, + "learning_rate": 5.200493353432918e-06, + "loss": 0.0772, + "step": 35773 + }, + { + "epoch": 2.085214324470971, + "grad_norm": 5.388833522796631, + "learning_rate": 5.200356310812663e-06, + "loss": 0.1196, + "step": 35774 + }, + { + "epoch": 2.085227889310906, + "grad_norm": 4.9579997062683105, + "learning_rate": 5.2002192681924076e-06, + "loss": 0.2103, + "step": 35775 + }, + { + "epoch": 2.085241454150841, + "grad_norm": 6.008780479431152, + "learning_rate": 5.200082225572154e-06, + "loss": 0.1735, + "step": 35776 + }, + { + "epoch": 2.0852550189907757, + "grad_norm": 3.850029945373535, + "learning_rate": 5.199945182951899e-06, + "loss": 0.099, + "step": 35777 + }, + { + "epoch": 2.085268583830711, + "grad_norm": 3.3202919960021973, + "learning_rate": 5.199808140331644e-06, + "loss": 0.094, + "step": 35778 + }, + { + "epoch": 2.085282148670646, + "grad_norm": 3.658257246017456, + "learning_rate": 5.199671097711388e-06, + "loss": 0.1041, + "step": 35779 + }, + { + "epoch": 2.0852957135105807, + "grad_norm": 3.165053606033325, + "learning_rate": 5.1995340550911335e-06, + "loss": 0.0828, + "step": 35780 + }, + { + "epoch": 2.0853092783505156, + "grad_norm": 4.155436038970947, + "learning_rate": 5.1993970124708795e-06, + "loss": 0.1098, + "step": 35781 + }, + { + "epoch": 2.0853228431904505, + "grad_norm": 3.423192262649536, + "learning_rate": 5.199259969850624e-06, + "loss": 0.0661, + "step": 35782 + }, + { + "epoch": 2.0853364080303853, + "grad_norm": 4.135579586029053, + "learning_rate": 5.199122927230369e-06, + "loss": 0.1473, + "step": 35783 + }, + { + "epoch": 2.08534997287032, + "grad_norm": 4.059831142425537, + "learning_rate": 5.198985884610114e-06, + "loss": 0.1222, + "step": 35784 + }, + { + "epoch": 2.085363537710255, + "grad_norm": 3.656791925430298, + "learning_rate": 5.198848841989859e-06, + "loss": 0.1297, + "step": 35785 + }, + { + "epoch": 2.08537710255019, + "grad_norm": 5.370920181274414, + "learning_rate": 5.1987117993696045e-06, + "loss": 0.1601, + "step": 35786 + }, + { + "epoch": 2.085390667390125, + "grad_norm": 4.039222240447998, + "learning_rate": 5.19857475674935e-06, + "loss": 0.1805, + "step": 35787 + }, + { + "epoch": 2.0854042322300597, + "grad_norm": 4.236637592315674, + "learning_rate": 5.198437714129094e-06, + "loss": 0.153, + "step": 35788 + }, + { + "epoch": 2.0854177970699945, + "grad_norm": 3.3627660274505615, + "learning_rate": 5.19830067150884e-06, + "loss": 0.1346, + "step": 35789 + }, + { + "epoch": 2.0854313619099294, + "grad_norm": 4.506913185119629, + "learning_rate": 5.198163628888585e-06, + "loss": 0.2532, + "step": 35790 + }, + { + "epoch": 2.0854449267498643, + "grad_norm": 4.992980003356934, + "learning_rate": 5.19802658626833e-06, + "loss": 0.2166, + "step": 35791 + }, + { + "epoch": 2.085458491589799, + "grad_norm": 3.731651544570923, + "learning_rate": 5.197889543648075e-06, + "loss": 0.1598, + "step": 35792 + }, + { + "epoch": 2.085472056429734, + "grad_norm": 5.197535991668701, + "learning_rate": 5.19775250102782e-06, + "loss": 0.2033, + "step": 35793 + }, + { + "epoch": 2.085485621269669, + "grad_norm": 3.789846420288086, + "learning_rate": 5.197615458407566e-06, + "loss": 0.1228, + "step": 35794 + }, + { + "epoch": 2.0854991861096037, + "grad_norm": 4.236766338348389, + "learning_rate": 5.19747841578731e-06, + "loss": 0.1128, + "step": 35795 + }, + { + "epoch": 2.0855127509495386, + "grad_norm": 4.665505409240723, + "learning_rate": 5.1973413731670555e-06, + "loss": 0.2538, + "step": 35796 + }, + { + "epoch": 2.085526315789474, + "grad_norm": 4.94050931930542, + "learning_rate": 5.1972043305468e-06, + "loss": 0.1692, + "step": 35797 + }, + { + "epoch": 2.0855398806294088, + "grad_norm": 6.154919624328613, + "learning_rate": 5.197067287926546e-06, + "loss": 0.245, + "step": 35798 + }, + { + "epoch": 2.0855534454693436, + "grad_norm": 4.263175964355469, + "learning_rate": 5.196930245306291e-06, + "loss": 0.1678, + "step": 35799 + }, + { + "epoch": 2.0855670103092785, + "grad_norm": 5.168676376342773, + "learning_rate": 5.196793202686035e-06, + "loss": 0.2667, + "step": 35800 + }, + { + "epoch": 2.0855805751492134, + "grad_norm": 4.650435924530029, + "learning_rate": 5.1966561600657805e-06, + "loss": 0.1934, + "step": 35801 + }, + { + "epoch": 2.085594139989148, + "grad_norm": 4.200222969055176, + "learning_rate": 5.1965191174455265e-06, + "loss": 0.1241, + "step": 35802 + }, + { + "epoch": 2.085607704829083, + "grad_norm": 3.802759885787964, + "learning_rate": 5.196382074825272e-06, + "loss": 0.1472, + "step": 35803 + }, + { + "epoch": 2.085621269669018, + "grad_norm": 4.955031394958496, + "learning_rate": 5.196245032205016e-06, + "loss": 0.2404, + "step": 35804 + }, + { + "epoch": 2.085634834508953, + "grad_norm": 5.200767993927002, + "learning_rate": 5.196107989584761e-06, + "loss": 0.2307, + "step": 35805 + }, + { + "epoch": 2.0856483993488877, + "grad_norm": 4.194596767425537, + "learning_rate": 5.1959709469645056e-06, + "loss": 0.1917, + "step": 35806 + }, + { + "epoch": 2.0856619641888225, + "grad_norm": 3.9429056644439697, + "learning_rate": 5.195833904344252e-06, + "loss": 0.1763, + "step": 35807 + }, + { + "epoch": 2.0856755290287574, + "grad_norm": 4.679740905761719, + "learning_rate": 5.195696861723997e-06, + "loss": 0.1294, + "step": 35808 + }, + { + "epoch": 2.0856890938686923, + "grad_norm": 3.892758846282959, + "learning_rate": 5.195559819103741e-06, + "loss": 0.2131, + "step": 35809 + }, + { + "epoch": 2.085702658708627, + "grad_norm": 4.664053916931152, + "learning_rate": 5.195422776483486e-06, + "loss": 0.1749, + "step": 35810 + }, + { + "epoch": 2.085716223548562, + "grad_norm": 5.020132541656494, + "learning_rate": 5.195285733863232e-06, + "loss": 0.2809, + "step": 35811 + }, + { + "epoch": 2.085729788388497, + "grad_norm": 4.93833065032959, + "learning_rate": 5.1951486912429775e-06, + "loss": 0.2444, + "step": 35812 + }, + { + "epoch": 2.0857433532284317, + "grad_norm": 4.687741756439209, + "learning_rate": 5.195011648622722e-06, + "loss": 0.1273, + "step": 35813 + }, + { + "epoch": 2.0857569180683666, + "grad_norm": 3.9695687294006348, + "learning_rate": 5.194874606002467e-06, + "loss": 0.238, + "step": 35814 + }, + { + "epoch": 2.0857704829083015, + "grad_norm": 4.5233473777771, + "learning_rate": 5.194737563382213e-06, + "loss": 0.1342, + "step": 35815 + }, + { + "epoch": 2.085784047748237, + "grad_norm": 4.591284275054932, + "learning_rate": 5.194600520761957e-06, + "loss": 0.2303, + "step": 35816 + }, + { + "epoch": 2.0857976125881716, + "grad_norm": 4.76379919052124, + "learning_rate": 5.1944634781417025e-06, + "loss": 0.2257, + "step": 35817 + }, + { + "epoch": 2.0858111774281065, + "grad_norm": 4.762755870819092, + "learning_rate": 5.194326435521448e-06, + "loss": 0.2379, + "step": 35818 + }, + { + "epoch": 2.0858247422680414, + "grad_norm": 4.068609237670898, + "learning_rate": 5.194189392901192e-06, + "loss": 0.1951, + "step": 35819 + }, + { + "epoch": 2.0858383071079762, + "grad_norm": 7.247941493988037, + "learning_rate": 5.194052350280938e-06, + "loss": 0.3628, + "step": 35820 + }, + { + "epoch": 2.085851871947911, + "grad_norm": 5.019738674163818, + "learning_rate": 5.193915307660683e-06, + "loss": 0.2181, + "step": 35821 + }, + { + "epoch": 2.085865436787846, + "grad_norm": 4.636410713195801, + "learning_rate": 5.193778265040428e-06, + "loss": 0.1906, + "step": 35822 + }, + { + "epoch": 2.085879001627781, + "grad_norm": 5.2201738357543945, + "learning_rate": 5.193641222420173e-06, + "loss": 0.2344, + "step": 35823 + }, + { + "epoch": 2.0858925664677157, + "grad_norm": 5.757354736328125, + "learning_rate": 5.193504179799919e-06, + "loss": 0.2218, + "step": 35824 + }, + { + "epoch": 2.0859061313076506, + "grad_norm": 5.1880645751953125, + "learning_rate": 5.193367137179663e-06, + "loss": 0.2159, + "step": 35825 + }, + { + "epoch": 2.0859196961475854, + "grad_norm": 6.079439640045166, + "learning_rate": 5.193230094559408e-06, + "loss": 0.2524, + "step": 35826 + }, + { + "epoch": 2.0859332609875203, + "grad_norm": 4.361489295959473, + "learning_rate": 5.1930930519391535e-06, + "loss": 0.3275, + "step": 35827 + }, + { + "epoch": 2.085946825827455, + "grad_norm": 6.207448959350586, + "learning_rate": 5.1929560093188995e-06, + "loss": 0.2996, + "step": 35828 + }, + { + "epoch": 2.08596039066739, + "grad_norm": 4.9373345375061035, + "learning_rate": 5.192818966698644e-06, + "loss": 0.1744, + "step": 35829 + }, + { + "epoch": 2.085973955507325, + "grad_norm": 4.962740898132324, + "learning_rate": 5.192681924078389e-06, + "loss": 0.1564, + "step": 35830 + }, + { + "epoch": 2.0859875203472598, + "grad_norm": 6.31044864654541, + "learning_rate": 5.192544881458133e-06, + "loss": 0.194, + "step": 35831 + }, + { + "epoch": 2.0860010851871946, + "grad_norm": 5.813060283660889, + "learning_rate": 5.1924078388378785e-06, + "loss": 0.2614, + "step": 35832 + }, + { + "epoch": 2.0860146500271295, + "grad_norm": 5.311504364013672, + "learning_rate": 5.1922707962176246e-06, + "loss": 0.2978, + "step": 35833 + }, + { + "epoch": 2.0860282148670644, + "grad_norm": 7.588066577911377, + "learning_rate": 5.192133753597369e-06, + "loss": 0.1934, + "step": 35834 + }, + { + "epoch": 2.0860417797069997, + "grad_norm": 4.774542808532715, + "learning_rate": 5.191996710977114e-06, + "loss": 0.2265, + "step": 35835 + }, + { + "epoch": 2.0860553445469345, + "grad_norm": 6.3907790184021, + "learning_rate": 5.191859668356859e-06, + "loss": 0.2884, + "step": 35836 + }, + { + "epoch": 2.0860689093868694, + "grad_norm": 4.910195350646973, + "learning_rate": 5.191722625736605e-06, + "loss": 0.3118, + "step": 35837 + }, + { + "epoch": 2.0860824742268043, + "grad_norm": 5.899437427520752, + "learning_rate": 5.19158558311635e-06, + "loss": 0.292, + "step": 35838 + }, + { + "epoch": 2.086096039066739, + "grad_norm": 4.653271675109863, + "learning_rate": 5.191448540496095e-06, + "loss": 0.2889, + "step": 35839 + }, + { + "epoch": 2.086109603906674, + "grad_norm": 5.444576263427734, + "learning_rate": 5.191311497875839e-06, + "loss": 0.3358, + "step": 35840 + }, + { + "epoch": 2.086123168746609, + "grad_norm": 6.377725601196289, + "learning_rate": 5.191174455255585e-06, + "loss": 0.2923, + "step": 35841 + }, + { + "epoch": 2.0861367335865437, + "grad_norm": 7.155869007110596, + "learning_rate": 5.19103741263533e-06, + "loss": 0.3101, + "step": 35842 + }, + { + "epoch": 2.0861502984264786, + "grad_norm": 4.029052734375, + "learning_rate": 5.1909003700150755e-06, + "loss": 0.1159, + "step": 35843 + }, + { + "epoch": 2.0861638632664135, + "grad_norm": 6.227407932281494, + "learning_rate": 5.19076332739482e-06, + "loss": 0.2548, + "step": 35844 + }, + { + "epoch": 2.0861774281063483, + "grad_norm": 6.965211391448975, + "learning_rate": 5.190626284774565e-06, + "loss": 0.2977, + "step": 35845 + }, + { + "epoch": 2.086190992946283, + "grad_norm": 5.032188415527344, + "learning_rate": 5.190489242154311e-06, + "loss": 0.2144, + "step": 35846 + }, + { + "epoch": 2.086204557786218, + "grad_norm": 5.18914794921875, + "learning_rate": 5.190352199534055e-06, + "loss": 0.3418, + "step": 35847 + }, + { + "epoch": 2.086218122626153, + "grad_norm": 5.493496894836426, + "learning_rate": 5.1902151569138005e-06, + "loss": 0.1795, + "step": 35848 + }, + { + "epoch": 2.086231687466088, + "grad_norm": 5.652132034301758, + "learning_rate": 5.190078114293545e-06, + "loss": 0.1598, + "step": 35849 + }, + { + "epoch": 2.0862452523060226, + "grad_norm": 4.5337114334106445, + "learning_rate": 5.189941071673291e-06, + "loss": 0.1612, + "step": 35850 + }, + { + "epoch": 2.0862588171459575, + "grad_norm": 4.708499431610107, + "learning_rate": 5.189804029053036e-06, + "loss": 0.2112, + "step": 35851 + }, + { + "epoch": 2.0862723819858924, + "grad_norm": 3.922252893447876, + "learning_rate": 5.189666986432781e-06, + "loss": 0.1771, + "step": 35852 + }, + { + "epoch": 2.0862859468258272, + "grad_norm": 6.0946946144104, + "learning_rate": 5.189529943812526e-06, + "loss": 0.332, + "step": 35853 + }, + { + "epoch": 2.0862995116657626, + "grad_norm": 4.833128452301025, + "learning_rate": 5.189392901192272e-06, + "loss": 0.2254, + "step": 35854 + }, + { + "epoch": 2.0863130765056974, + "grad_norm": 6.910426616668701, + "learning_rate": 5.189255858572017e-06, + "loss": 0.1828, + "step": 35855 + }, + { + "epoch": 2.0863266413456323, + "grad_norm": 6.5335798263549805, + "learning_rate": 5.189118815951761e-06, + "loss": 0.267, + "step": 35856 + }, + { + "epoch": 2.086340206185567, + "grad_norm": 4.59364652633667, + "learning_rate": 5.188981773331506e-06, + "loss": 0.1465, + "step": 35857 + }, + { + "epoch": 2.086353771025502, + "grad_norm": 6.963953495025635, + "learning_rate": 5.188844730711252e-06, + "loss": 0.259, + "step": 35858 + }, + { + "epoch": 2.086367335865437, + "grad_norm": 5.629866123199463, + "learning_rate": 5.188707688090997e-06, + "loss": 0.2605, + "step": 35859 + }, + { + "epoch": 2.0863809007053717, + "grad_norm": 4.827028274536133, + "learning_rate": 5.188570645470742e-06, + "loss": 0.2371, + "step": 35860 + }, + { + "epoch": 2.0863944655453066, + "grad_norm": 3.6209027767181396, + "learning_rate": 5.188433602850487e-06, + "loss": 0.221, + "step": 35861 + }, + { + "epoch": 2.0864080303852415, + "grad_norm": 5.884127616882324, + "learning_rate": 5.188296560230231e-06, + "loss": 0.3302, + "step": 35862 + }, + { + "epoch": 2.0864215952251763, + "grad_norm": 5.025123596191406, + "learning_rate": 5.188159517609977e-06, + "loss": 0.2241, + "step": 35863 + }, + { + "epoch": 2.086435160065111, + "grad_norm": 4.955124378204346, + "learning_rate": 5.1880224749897226e-06, + "loss": 0.2069, + "step": 35864 + }, + { + "epoch": 2.086448724905046, + "grad_norm": 5.131430625915527, + "learning_rate": 5.187885432369467e-06, + "loss": 0.146, + "step": 35865 + }, + { + "epoch": 2.086462289744981, + "grad_norm": 5.672430992126465, + "learning_rate": 5.187748389749212e-06, + "loss": 0.2649, + "step": 35866 + }, + { + "epoch": 2.086475854584916, + "grad_norm": 4.671714782714844, + "learning_rate": 5.187611347128958e-06, + "loss": 0.2329, + "step": 35867 + }, + { + "epoch": 2.0864894194248507, + "grad_norm": 4.863974094390869, + "learning_rate": 5.187474304508702e-06, + "loss": 0.1851, + "step": 35868 + }, + { + "epoch": 2.0865029842647855, + "grad_norm": 8.51910400390625, + "learning_rate": 5.187337261888448e-06, + "loss": 0.2763, + "step": 35869 + }, + { + "epoch": 2.0865165491047204, + "grad_norm": 4.588523864746094, + "learning_rate": 5.187200219268193e-06, + "loss": 0.1862, + "step": 35870 + }, + { + "epoch": 2.0865301139446553, + "grad_norm": 5.663214206695557, + "learning_rate": 5.187063176647939e-06, + "loss": 0.266, + "step": 35871 + }, + { + "epoch": 2.08654367878459, + "grad_norm": 5.305416107177734, + "learning_rate": 5.186926134027683e-06, + "loss": 0.2832, + "step": 35872 + }, + { + "epoch": 2.0865572436245254, + "grad_norm": 4.345667839050293, + "learning_rate": 5.186789091407428e-06, + "loss": 0.2209, + "step": 35873 + }, + { + "epoch": 2.0865708084644603, + "grad_norm": 4.500986576080322, + "learning_rate": 5.186652048787173e-06, + "loss": 0.2071, + "step": 35874 + }, + { + "epoch": 2.086584373304395, + "grad_norm": 4.260486602783203, + "learning_rate": 5.186515006166918e-06, + "loss": 0.2598, + "step": 35875 + }, + { + "epoch": 2.08659793814433, + "grad_norm": 4.623676776885986, + "learning_rate": 5.186377963546664e-06, + "loss": 0.1348, + "step": 35876 + }, + { + "epoch": 2.086611502984265, + "grad_norm": 5.564742565155029, + "learning_rate": 5.186240920926409e-06, + "loss": 0.3055, + "step": 35877 + }, + { + "epoch": 2.0866250678241998, + "grad_norm": 5.358513355255127, + "learning_rate": 5.186103878306153e-06, + "loss": 0.1606, + "step": 35878 + }, + { + "epoch": 2.0866386326641346, + "grad_norm": 4.683084011077881, + "learning_rate": 5.1859668356858985e-06, + "loss": 0.3148, + "step": 35879 + }, + { + "epoch": 2.0866521975040695, + "grad_norm": 4.797120094299316, + "learning_rate": 5.1858297930656446e-06, + "loss": 0.1948, + "step": 35880 + }, + { + "epoch": 2.0866657623440044, + "grad_norm": 5.224207878112793, + "learning_rate": 5.185692750445389e-06, + "loss": 0.2663, + "step": 35881 + }, + { + "epoch": 2.0866793271839392, + "grad_norm": 3.007418155670166, + "learning_rate": 5.185555707825134e-06, + "loss": 0.1413, + "step": 35882 + }, + { + "epoch": 2.086692892023874, + "grad_norm": 3.867640256881714, + "learning_rate": 5.185418665204878e-06, + "loss": 0.1528, + "step": 35883 + }, + { + "epoch": 2.086706456863809, + "grad_norm": 5.346765995025635, + "learning_rate": 5.1852816225846244e-06, + "loss": 0.3986, + "step": 35884 + }, + { + "epoch": 2.086720021703744, + "grad_norm": 4.094478130340576, + "learning_rate": 5.18514457996437e-06, + "loss": 0.1773, + "step": 35885 + }, + { + "epoch": 2.0867335865436787, + "grad_norm": 4.122087001800537, + "learning_rate": 5.185007537344115e-06, + "loss": 0.2232, + "step": 35886 + }, + { + "epoch": 2.0867471513836136, + "grad_norm": 4.420017242431641, + "learning_rate": 5.184870494723859e-06, + "loss": 0.2129, + "step": 35887 + }, + { + "epoch": 2.0867607162235484, + "grad_norm": 3.9563426971435547, + "learning_rate": 5.184733452103604e-06, + "loss": 0.1387, + "step": 35888 + }, + { + "epoch": 2.0867742810634833, + "grad_norm": 3.9076602458953857, + "learning_rate": 5.18459640948335e-06, + "loss": 0.2173, + "step": 35889 + }, + { + "epoch": 2.086787845903418, + "grad_norm": 4.50921106338501, + "learning_rate": 5.184459366863095e-06, + "loss": 0.2085, + "step": 35890 + }, + { + "epoch": 2.086801410743353, + "grad_norm": 4.355838298797607, + "learning_rate": 5.18432232424284e-06, + "loss": 0.168, + "step": 35891 + }, + { + "epoch": 2.0868149755832883, + "grad_norm": 4.789572715759277, + "learning_rate": 5.184185281622585e-06, + "loss": 0.2066, + "step": 35892 + }, + { + "epoch": 2.086828540423223, + "grad_norm": 4.544978141784668, + "learning_rate": 5.18404823900233e-06, + "loss": 0.2667, + "step": 35893 + }, + { + "epoch": 2.086842105263158, + "grad_norm": 4.087651252746582, + "learning_rate": 5.183911196382075e-06, + "loss": 0.1498, + "step": 35894 + }, + { + "epoch": 2.086855670103093, + "grad_norm": 4.215954303741455, + "learning_rate": 5.1837741537618206e-06, + "loss": 0.1326, + "step": 35895 + }, + { + "epoch": 2.086869234943028, + "grad_norm": 3.98793888092041, + "learning_rate": 5.183637111141565e-06, + "loss": 0.2418, + "step": 35896 + }, + { + "epoch": 2.0868827997829626, + "grad_norm": 5.4251837730407715, + "learning_rate": 5.183500068521311e-06, + "loss": 0.2416, + "step": 35897 + }, + { + "epoch": 2.0868963646228975, + "grad_norm": 4.363173961639404, + "learning_rate": 5.183363025901056e-06, + "loss": 0.2439, + "step": 35898 + }, + { + "epoch": 2.0869099294628324, + "grad_norm": 6.496006965637207, + "learning_rate": 5.1832259832808004e-06, + "loss": 0.3968, + "step": 35899 + }, + { + "epoch": 2.0869234943027672, + "grad_norm": 5.556194305419922, + "learning_rate": 5.183088940660546e-06, + "loss": 0.2043, + "step": 35900 + }, + { + "epoch": 2.086937059142702, + "grad_norm": 6.503538608551025, + "learning_rate": 5.182951898040291e-06, + "loss": 0.3777, + "step": 35901 + }, + { + "epoch": 2.086950623982637, + "grad_norm": 6.210550785064697, + "learning_rate": 5.182814855420036e-06, + "loss": 0.1968, + "step": 35902 + }, + { + "epoch": 2.086964188822572, + "grad_norm": 4.780006408691406, + "learning_rate": 5.182677812799781e-06, + "loss": 0.1667, + "step": 35903 + }, + { + "epoch": 2.0869777536625067, + "grad_norm": 3.6777963638305664, + "learning_rate": 5.182540770179526e-06, + "loss": 0.0783, + "step": 35904 + }, + { + "epoch": 2.0869913185024416, + "grad_norm": 6.482008457183838, + "learning_rate": 5.182403727559271e-06, + "loss": 0.2608, + "step": 35905 + }, + { + "epoch": 2.0870048833423764, + "grad_norm": 4.78435754776001, + "learning_rate": 5.182266684939017e-06, + "loss": 0.3004, + "step": 35906 + }, + { + "epoch": 2.0870184481823113, + "grad_norm": 3.4401204586029053, + "learning_rate": 5.182129642318762e-06, + "loss": 0.1016, + "step": 35907 + }, + { + "epoch": 2.087032013022246, + "grad_norm": 3.5409469604492188, + "learning_rate": 5.181992599698506e-06, + "loss": 0.1783, + "step": 35908 + }, + { + "epoch": 2.087045577862181, + "grad_norm": 4.676644325256348, + "learning_rate": 5.181855557078251e-06, + "loss": 0.1611, + "step": 35909 + }, + { + "epoch": 2.087059142702116, + "grad_norm": 4.864742279052734, + "learning_rate": 5.181718514457997e-06, + "loss": 0.2413, + "step": 35910 + }, + { + "epoch": 2.087072707542051, + "grad_norm": 4.171813488006592, + "learning_rate": 5.1815814718377426e-06, + "loss": 0.1489, + "step": 35911 + }, + { + "epoch": 2.087086272381986, + "grad_norm": 7.937798500061035, + "learning_rate": 5.181444429217487e-06, + "loss": 0.2454, + "step": 35912 + }, + { + "epoch": 2.087099837221921, + "grad_norm": 3.799881935119629, + "learning_rate": 5.181307386597232e-06, + "loss": 0.1032, + "step": 35913 + }, + { + "epoch": 2.087113402061856, + "grad_norm": 4.742914199829102, + "learning_rate": 5.181170343976976e-06, + "loss": 0.2038, + "step": 35914 + }, + { + "epoch": 2.0871269669017907, + "grad_norm": 4.3717241287231445, + "learning_rate": 5.1810333013567224e-06, + "loss": 0.2404, + "step": 35915 + }, + { + "epoch": 2.0871405317417255, + "grad_norm": 4.268477916717529, + "learning_rate": 5.180896258736468e-06, + "loss": 0.1558, + "step": 35916 + }, + { + "epoch": 2.0871540965816604, + "grad_norm": 4.477766513824463, + "learning_rate": 5.180759216116212e-06, + "loss": 0.1908, + "step": 35917 + }, + { + "epoch": 2.0871676614215953, + "grad_norm": 5.168200492858887, + "learning_rate": 5.180622173495957e-06, + "loss": 0.1363, + "step": 35918 + }, + { + "epoch": 2.08718122626153, + "grad_norm": 5.017099380493164, + "learning_rate": 5.180485130875703e-06, + "loss": 0.1911, + "step": 35919 + }, + { + "epoch": 2.087194791101465, + "grad_norm": 5.676417827606201, + "learning_rate": 5.180348088255448e-06, + "loss": 0.2599, + "step": 35920 + }, + { + "epoch": 2.0872083559414, + "grad_norm": 4.217838764190674, + "learning_rate": 5.180211045635193e-06, + "loss": 0.1237, + "step": 35921 + }, + { + "epoch": 2.0872219207813347, + "grad_norm": 4.215515613555908, + "learning_rate": 5.180074003014938e-06, + "loss": 0.1244, + "step": 35922 + }, + { + "epoch": 2.0872354856212696, + "grad_norm": 5.985095500946045, + "learning_rate": 5.179936960394684e-06, + "loss": 0.1628, + "step": 35923 + }, + { + "epoch": 2.0872490504612045, + "grad_norm": 5.3373637199401855, + "learning_rate": 5.179799917774428e-06, + "loss": 0.2058, + "step": 35924 + }, + { + "epoch": 2.0872626153011393, + "grad_norm": 4.484455585479736, + "learning_rate": 5.179662875154173e-06, + "loss": 0.169, + "step": 35925 + }, + { + "epoch": 2.087276180141074, + "grad_norm": 7.34168815612793, + "learning_rate": 5.1795258325339186e-06, + "loss": 0.3165, + "step": 35926 + }, + { + "epoch": 2.087289744981009, + "grad_norm": 4.769913673400879, + "learning_rate": 5.179388789913664e-06, + "loss": 0.1818, + "step": 35927 + }, + { + "epoch": 2.087303309820944, + "grad_norm": 6.229904651641846, + "learning_rate": 5.179251747293409e-06, + "loss": 0.1942, + "step": 35928 + }, + { + "epoch": 2.0873168746608792, + "grad_norm": 6.25451135635376, + "learning_rate": 5.179114704673154e-06, + "loss": 0.1749, + "step": 35929 + }, + { + "epoch": 2.087330439500814, + "grad_norm": 4.520456790924072, + "learning_rate": 5.1789776620528984e-06, + "loss": 0.1769, + "step": 35930 + }, + { + "epoch": 2.087344004340749, + "grad_norm": 3.386192798614502, + "learning_rate": 5.178840619432644e-06, + "loss": 0.1228, + "step": 35931 + }, + { + "epoch": 2.087357569180684, + "grad_norm": 4.959575653076172, + "learning_rate": 5.17870357681239e-06, + "loss": 0.1102, + "step": 35932 + }, + { + "epoch": 2.0873711340206187, + "grad_norm": 5.278842449188232, + "learning_rate": 5.178566534192134e-06, + "loss": 0.2154, + "step": 35933 + }, + { + "epoch": 2.0873846988605536, + "grad_norm": 4.325601577758789, + "learning_rate": 5.178429491571879e-06, + "loss": 0.115, + "step": 35934 + }, + { + "epoch": 2.0873982637004884, + "grad_norm": 6.855091571807861, + "learning_rate": 5.178292448951624e-06, + "loss": 0.298, + "step": 35935 + }, + { + "epoch": 2.0874118285404233, + "grad_norm": 6.5024590492248535, + "learning_rate": 5.1781554063313695e-06, + "loss": 0.2449, + "step": 35936 + }, + { + "epoch": 2.087425393380358, + "grad_norm": 4.5343403816223145, + "learning_rate": 5.178018363711115e-06, + "loss": 0.1548, + "step": 35937 + }, + { + "epoch": 2.087438958220293, + "grad_norm": 5.223387718200684, + "learning_rate": 5.17788132109086e-06, + "loss": 0.1891, + "step": 35938 + }, + { + "epoch": 2.087452523060228, + "grad_norm": 3.934831142425537, + "learning_rate": 5.177744278470604e-06, + "loss": 0.1263, + "step": 35939 + }, + { + "epoch": 2.0874660879001627, + "grad_norm": 3.939122438430786, + "learning_rate": 5.17760723585035e-06, + "loss": 0.1619, + "step": 35940 + }, + { + "epoch": 2.0874796527400976, + "grad_norm": 3.1142938137054443, + "learning_rate": 5.177470193230095e-06, + "loss": 0.1384, + "step": 35941 + }, + { + "epoch": 2.0874932175800325, + "grad_norm": 4.045409679412842, + "learning_rate": 5.17733315060984e-06, + "loss": 0.1102, + "step": 35942 + }, + { + "epoch": 2.0875067824199673, + "grad_norm": 5.086850166320801, + "learning_rate": 5.177196107989585e-06, + "loss": 0.1493, + "step": 35943 + }, + { + "epoch": 2.087520347259902, + "grad_norm": 6.010527610778809, + "learning_rate": 5.17705906536933e-06, + "loss": 0.314, + "step": 35944 + }, + { + "epoch": 2.087533912099837, + "grad_norm": 7.3993306159973145, + "learning_rate": 5.176922022749076e-06, + "loss": 0.0956, + "step": 35945 + }, + { + "epoch": 2.087547476939772, + "grad_norm": 4.907071113586426, + "learning_rate": 5.1767849801288204e-06, + "loss": 0.1624, + "step": 35946 + }, + { + "epoch": 2.087561041779707, + "grad_norm": 4.219353199005127, + "learning_rate": 5.176647937508566e-06, + "loss": 0.1819, + "step": 35947 + }, + { + "epoch": 2.0875746066196417, + "grad_norm": 4.988004207611084, + "learning_rate": 5.17651089488831e-06, + "loss": 0.1372, + "step": 35948 + }, + { + "epoch": 2.087588171459577, + "grad_norm": 3.877223491668701, + "learning_rate": 5.176373852268056e-06, + "loss": 0.1257, + "step": 35949 + }, + { + "epoch": 2.087601736299512, + "grad_norm": 2.8017210960388184, + "learning_rate": 5.176236809647801e-06, + "loss": 0.0501, + "step": 35950 + }, + { + "epoch": 2.0876153011394467, + "grad_norm": 6.951383113861084, + "learning_rate": 5.1760997670275455e-06, + "loss": 0.234, + "step": 35951 + }, + { + "epoch": 2.0876288659793816, + "grad_norm": 4.909226417541504, + "learning_rate": 5.175962724407291e-06, + "loss": 0.1055, + "step": 35952 + }, + { + "epoch": 2.0876424308193164, + "grad_norm": 5.139492034912109, + "learning_rate": 5.175825681787037e-06, + "loss": 0.1895, + "step": 35953 + }, + { + "epoch": 2.0876559956592513, + "grad_norm": 3.7353734970092773, + "learning_rate": 5.175688639166782e-06, + "loss": 0.1511, + "step": 35954 + }, + { + "epoch": 2.087669560499186, + "grad_norm": 3.794373035430908, + "learning_rate": 5.175551596546526e-06, + "loss": 0.1028, + "step": 35955 + }, + { + "epoch": 2.087683125339121, + "grad_norm": 5.302633285522461, + "learning_rate": 5.175414553926271e-06, + "loss": 0.1938, + "step": 35956 + }, + { + "epoch": 2.087696690179056, + "grad_norm": 4.975041389465332, + "learning_rate": 5.175277511306016e-06, + "loss": 0.1354, + "step": 35957 + }, + { + "epoch": 2.0877102550189908, + "grad_norm": 4.322301864624023, + "learning_rate": 5.175140468685762e-06, + "loss": 0.1386, + "step": 35958 + }, + { + "epoch": 2.0877238198589256, + "grad_norm": 4.753016471862793, + "learning_rate": 5.175003426065507e-06, + "loss": 0.1534, + "step": 35959 + }, + { + "epoch": 2.0877373846988605, + "grad_norm": 5.408968448638916, + "learning_rate": 5.174866383445252e-06, + "loss": 0.1705, + "step": 35960 + }, + { + "epoch": 2.0877509495387954, + "grad_norm": 4.897381782531738, + "learning_rate": 5.1747293408249964e-06, + "loss": 0.2136, + "step": 35961 + }, + { + "epoch": 2.0877645143787302, + "grad_norm": 3.895082950592041, + "learning_rate": 5.1745922982047425e-06, + "loss": 0.1722, + "step": 35962 + }, + { + "epoch": 2.087778079218665, + "grad_norm": 4.391253471374512, + "learning_rate": 5.174455255584488e-06, + "loss": 0.1331, + "step": 35963 + }, + { + "epoch": 2.0877916440586, + "grad_norm": 4.762655735015869, + "learning_rate": 5.174318212964232e-06, + "loss": 0.1447, + "step": 35964 + }, + { + "epoch": 2.087805208898535, + "grad_norm": 4.679595470428467, + "learning_rate": 5.174181170343977e-06, + "loss": 0.1514, + "step": 35965 + }, + { + "epoch": 2.0878187737384697, + "grad_norm": 4.651141166687012, + "learning_rate": 5.174044127723723e-06, + "loss": 0.1375, + "step": 35966 + }, + { + "epoch": 2.087832338578405, + "grad_norm": 4.136259078979492, + "learning_rate": 5.1739070851034675e-06, + "loss": 0.134, + "step": 35967 + }, + { + "epoch": 2.08784590341834, + "grad_norm": 3.785308837890625, + "learning_rate": 5.173770042483213e-06, + "loss": 0.1546, + "step": 35968 + }, + { + "epoch": 2.0878594682582747, + "grad_norm": 5.697822093963623, + "learning_rate": 5.173632999862958e-06, + "loss": 0.1717, + "step": 35969 + }, + { + "epoch": 2.0878730330982096, + "grad_norm": 3.143278121948242, + "learning_rate": 5.173495957242702e-06, + "loss": 0.0804, + "step": 35970 + }, + { + "epoch": 2.0878865979381445, + "grad_norm": 5.2727556228637695, + "learning_rate": 5.173358914622448e-06, + "loss": 0.2342, + "step": 35971 + }, + { + "epoch": 2.0879001627780793, + "grad_norm": 4.1313652992248535, + "learning_rate": 5.173221872002193e-06, + "loss": 0.1113, + "step": 35972 + }, + { + "epoch": 2.087913727618014, + "grad_norm": 4.9610819816589355, + "learning_rate": 5.173084829381938e-06, + "loss": 0.2389, + "step": 35973 + }, + { + "epoch": 2.087927292457949, + "grad_norm": 7.5309014320373535, + "learning_rate": 5.172947786761683e-06, + "loss": 0.2555, + "step": 35974 + }, + { + "epoch": 2.087940857297884, + "grad_norm": 5.228592872619629, + "learning_rate": 5.172810744141429e-06, + "loss": 0.2165, + "step": 35975 + }, + { + "epoch": 2.087954422137819, + "grad_norm": 6.348066329956055, + "learning_rate": 5.172673701521173e-06, + "loss": 0.2149, + "step": 35976 + }, + { + "epoch": 2.0879679869777537, + "grad_norm": 4.920747756958008, + "learning_rate": 5.1725366589009185e-06, + "loss": 0.1749, + "step": 35977 + }, + { + "epoch": 2.0879815518176885, + "grad_norm": 4.642980575561523, + "learning_rate": 5.172399616280664e-06, + "loss": 0.1218, + "step": 35978 + }, + { + "epoch": 2.0879951166576234, + "grad_norm": 4.40752649307251, + "learning_rate": 5.17226257366041e-06, + "loss": 0.1373, + "step": 35979 + }, + { + "epoch": 2.0880086814975582, + "grad_norm": 4.977158069610596, + "learning_rate": 5.172125531040154e-06, + "loss": 0.1948, + "step": 35980 + }, + { + "epoch": 2.088022246337493, + "grad_norm": 4.044840335845947, + "learning_rate": 5.171988488419899e-06, + "loss": 0.0591, + "step": 35981 + }, + { + "epoch": 2.088035811177428, + "grad_norm": 5.242351055145264, + "learning_rate": 5.1718514457996435e-06, + "loss": 0.1709, + "step": 35982 + }, + { + "epoch": 2.088049376017363, + "grad_norm": 5.8302106857299805, + "learning_rate": 5.171714403179389e-06, + "loss": 0.1674, + "step": 35983 + }, + { + "epoch": 2.0880629408572977, + "grad_norm": 4.760416030883789, + "learning_rate": 5.171577360559135e-06, + "loss": 0.1731, + "step": 35984 + }, + { + "epoch": 2.0880765056972326, + "grad_norm": 5.075459957122803, + "learning_rate": 5.17144031793888e-06, + "loss": 0.1351, + "step": 35985 + }, + { + "epoch": 2.0880900705371674, + "grad_norm": 5.938236236572266, + "learning_rate": 5.171303275318624e-06, + "loss": 0.1072, + "step": 35986 + }, + { + "epoch": 2.0881036353771028, + "grad_norm": 3.855011463165283, + "learning_rate": 5.171166232698369e-06, + "loss": 0.1404, + "step": 35987 + }, + { + "epoch": 2.0881172002170376, + "grad_norm": 5.271301746368408, + "learning_rate": 5.171029190078115e-06, + "loss": 0.1606, + "step": 35988 + }, + { + "epoch": 2.0881307650569725, + "grad_norm": 6.198547840118408, + "learning_rate": 5.17089214745786e-06, + "loss": 0.2334, + "step": 35989 + }, + { + "epoch": 2.0881443298969073, + "grad_norm": 4.803575038909912, + "learning_rate": 5.170755104837605e-06, + "loss": 0.1526, + "step": 35990 + }, + { + "epoch": 2.088157894736842, + "grad_norm": 5.576166152954102, + "learning_rate": 5.170618062217349e-06, + "loss": 0.1589, + "step": 35991 + }, + { + "epoch": 2.088171459576777, + "grad_norm": 3.8081531524658203, + "learning_rate": 5.170481019597095e-06, + "loss": 0.1048, + "step": 35992 + }, + { + "epoch": 2.088185024416712, + "grad_norm": 5.926699638366699, + "learning_rate": 5.1703439769768405e-06, + "loss": 0.1309, + "step": 35993 + }, + { + "epoch": 2.088198589256647, + "grad_norm": 4.912655353546143, + "learning_rate": 5.170206934356586e-06, + "loss": 0.1952, + "step": 35994 + }, + { + "epoch": 2.0882121540965817, + "grad_norm": 4.557026386260986, + "learning_rate": 5.17006989173633e-06, + "loss": 0.1082, + "step": 35995 + }, + { + "epoch": 2.0882257189365165, + "grad_norm": 5.026486873626709, + "learning_rate": 5.169932849116076e-06, + "loss": 0.2251, + "step": 35996 + }, + { + "epoch": 2.0882392837764514, + "grad_norm": 6.509409427642822, + "learning_rate": 5.169795806495821e-06, + "loss": 0.2201, + "step": 35997 + }, + { + "epoch": 2.0882528486163863, + "grad_norm": 3.8678574562072754, + "learning_rate": 5.1696587638755655e-06, + "loss": 0.0892, + "step": 35998 + }, + { + "epoch": 2.088266413456321, + "grad_norm": 6.212740898132324, + "learning_rate": 5.169521721255311e-06, + "loss": 0.2454, + "step": 35999 + }, + { + "epoch": 2.088279978296256, + "grad_norm": 4.9311652183532715, + "learning_rate": 5.169384678635055e-06, + "loss": 0.2396, + "step": 36000 + }, + { + "epoch": 2.088293543136191, + "grad_norm": 3.729966163635254, + "learning_rate": 5.169247636014801e-06, + "loss": 0.1179, + "step": 36001 + }, + { + "epoch": 2.0883071079761257, + "grad_norm": 5.219093322753906, + "learning_rate": 5.169110593394546e-06, + "loss": 0.1704, + "step": 36002 + }, + { + "epoch": 2.0883206728160606, + "grad_norm": 7.9911603927612305, + "learning_rate": 5.168973550774291e-06, + "loss": 0.1301, + "step": 36003 + }, + { + "epoch": 2.0883342376559955, + "grad_norm": 3.835200309753418, + "learning_rate": 5.168836508154036e-06, + "loss": 0.1015, + "step": 36004 + }, + { + "epoch": 2.0883478024959308, + "grad_norm": 3.902932643890381, + "learning_rate": 5.168699465533782e-06, + "loss": 0.1343, + "step": 36005 + }, + { + "epoch": 2.0883613673358656, + "grad_norm": 3.422316074371338, + "learning_rate": 5.168562422913527e-06, + "loss": 0.0989, + "step": 36006 + }, + { + "epoch": 2.0883749321758005, + "grad_norm": 4.856903076171875, + "learning_rate": 5.168425380293271e-06, + "loss": 0.1404, + "step": 36007 + }, + { + "epoch": 2.0883884970157354, + "grad_norm": 2.6355416774749756, + "learning_rate": 5.1682883376730165e-06, + "loss": 0.0775, + "step": 36008 + }, + { + "epoch": 2.0884020618556702, + "grad_norm": 4.499763488769531, + "learning_rate": 5.1681512950527625e-06, + "loss": 0.1698, + "step": 36009 + }, + { + "epoch": 2.088415626695605, + "grad_norm": 5.775925636291504, + "learning_rate": 5.168014252432507e-06, + "loss": 0.1357, + "step": 36010 + }, + { + "epoch": 2.08842919153554, + "grad_norm": 3.5804662704467773, + "learning_rate": 5.167877209812252e-06, + "loss": 0.1607, + "step": 36011 + }, + { + "epoch": 2.088442756375475, + "grad_norm": 5.706169128417969, + "learning_rate": 5.167740167191997e-06, + "loss": 0.1823, + "step": 36012 + }, + { + "epoch": 2.0884563212154097, + "grad_norm": 5.478936672210693, + "learning_rate": 5.1676031245717415e-06, + "loss": 0.1771, + "step": 36013 + }, + { + "epoch": 2.0884698860553446, + "grad_norm": 5.082883358001709, + "learning_rate": 5.1674660819514875e-06, + "loss": 0.164, + "step": 36014 + }, + { + "epoch": 2.0884834508952794, + "grad_norm": 5.567978382110596, + "learning_rate": 5.167329039331233e-06, + "loss": 0.1283, + "step": 36015 + }, + { + "epoch": 2.0884970157352143, + "grad_norm": 4.487649917602539, + "learning_rate": 5.167191996710977e-06, + "loss": 0.1307, + "step": 36016 + }, + { + "epoch": 2.088510580575149, + "grad_norm": 5.078252792358398, + "learning_rate": 5.167054954090722e-06, + "loss": 0.1829, + "step": 36017 + }, + { + "epoch": 2.088524145415084, + "grad_norm": 3.6634819507598877, + "learning_rate": 5.166917911470468e-06, + "loss": 0.0838, + "step": 36018 + }, + { + "epoch": 2.088537710255019, + "grad_norm": 4.821308612823486, + "learning_rate": 5.166780868850213e-06, + "loss": 0.1875, + "step": 36019 + }, + { + "epoch": 2.0885512750949538, + "grad_norm": 4.322057247161865, + "learning_rate": 5.166643826229958e-06, + "loss": 0.1383, + "step": 36020 + }, + { + "epoch": 2.0885648399348886, + "grad_norm": 5.911044120788574, + "learning_rate": 5.166506783609703e-06, + "loss": 0.1731, + "step": 36021 + }, + { + "epoch": 2.0885784047748235, + "grad_norm": 3.322667121887207, + "learning_rate": 5.166369740989449e-06, + "loss": 0.0888, + "step": 36022 + }, + { + "epoch": 2.0885919696147583, + "grad_norm": 5.621752738952637, + "learning_rate": 5.166232698369193e-06, + "loss": 0.2237, + "step": 36023 + }, + { + "epoch": 2.088605534454693, + "grad_norm": 4.342380523681641, + "learning_rate": 5.1660956557489385e-06, + "loss": 0.1455, + "step": 36024 + }, + { + "epoch": 2.0886190992946285, + "grad_norm": 6.248378276824951, + "learning_rate": 5.165958613128683e-06, + "loss": 0.1878, + "step": 36025 + }, + { + "epoch": 2.0886326641345634, + "grad_norm": 4.381879806518555, + "learning_rate": 5.165821570508428e-06, + "loss": 0.1571, + "step": 36026 + }, + { + "epoch": 2.0886462289744983, + "grad_norm": 4.400645732879639, + "learning_rate": 5.165684527888174e-06, + "loss": 0.132, + "step": 36027 + }, + { + "epoch": 2.088659793814433, + "grad_norm": 6.255349159240723, + "learning_rate": 5.165547485267919e-06, + "loss": 0.2029, + "step": 36028 + }, + { + "epoch": 2.088673358654368, + "grad_norm": 6.069897651672363, + "learning_rate": 5.1654104426476635e-06, + "loss": 0.1963, + "step": 36029 + }, + { + "epoch": 2.088686923494303, + "grad_norm": 5.650758743286133, + "learning_rate": 5.165273400027409e-06, + "loss": 0.2186, + "step": 36030 + }, + { + "epoch": 2.0887004883342377, + "grad_norm": 4.86760950088501, + "learning_rate": 5.165136357407155e-06, + "loss": 0.1612, + "step": 36031 + }, + { + "epoch": 2.0887140531741726, + "grad_norm": 5.366583824157715, + "learning_rate": 5.164999314786899e-06, + "loss": 0.1577, + "step": 36032 + }, + { + "epoch": 2.0887276180141074, + "grad_norm": 5.686246871948242, + "learning_rate": 5.164862272166644e-06, + "loss": 0.222, + "step": 36033 + }, + { + "epoch": 2.0887411828540423, + "grad_norm": 6.031483173370361, + "learning_rate": 5.164725229546389e-06, + "loss": 0.1832, + "step": 36034 + }, + { + "epoch": 2.088754747693977, + "grad_norm": 5.05930233001709, + "learning_rate": 5.164588186926135e-06, + "loss": 0.1556, + "step": 36035 + }, + { + "epoch": 2.088768312533912, + "grad_norm": 3.4546730518341064, + "learning_rate": 5.16445114430588e-06, + "loss": 0.183, + "step": 36036 + }, + { + "epoch": 2.088781877373847, + "grad_norm": 4.909605979919434, + "learning_rate": 5.164314101685625e-06, + "loss": 0.1217, + "step": 36037 + }, + { + "epoch": 2.0887954422137818, + "grad_norm": 3.5907764434814453, + "learning_rate": 5.164177059065369e-06, + "loss": 0.0887, + "step": 36038 + }, + { + "epoch": 2.0888090070537166, + "grad_norm": 3.240626335144043, + "learning_rate": 5.1640400164451145e-06, + "loss": 0.0922, + "step": 36039 + }, + { + "epoch": 2.0888225718936515, + "grad_norm": 6.175863265991211, + "learning_rate": 5.1639029738248605e-06, + "loss": 0.1914, + "step": 36040 + }, + { + "epoch": 2.0888361367335864, + "grad_norm": 4.990594387054443, + "learning_rate": 5.163765931204605e-06, + "loss": 0.1677, + "step": 36041 + }, + { + "epoch": 2.0888497015735212, + "grad_norm": 4.214107990264893, + "learning_rate": 5.16362888858435e-06, + "loss": 0.1383, + "step": 36042 + }, + { + "epoch": 2.0888632664134565, + "grad_norm": 6.1616668701171875, + "learning_rate": 5.163491845964095e-06, + "loss": 0.2149, + "step": 36043 + }, + { + "epoch": 2.0888768312533914, + "grad_norm": 4.774511337280273, + "learning_rate": 5.16335480334384e-06, + "loss": 0.1749, + "step": 36044 + }, + { + "epoch": 2.0888903960933263, + "grad_norm": 4.9272003173828125, + "learning_rate": 5.1632177607235855e-06, + "loss": 0.1477, + "step": 36045 + }, + { + "epoch": 2.088903960933261, + "grad_norm": 4.7873101234436035, + "learning_rate": 5.163080718103331e-06, + "loss": 0.1478, + "step": 36046 + }, + { + "epoch": 2.088917525773196, + "grad_norm": 3.8805160522460938, + "learning_rate": 5.162943675483075e-06, + "loss": 0.1046, + "step": 36047 + }, + { + "epoch": 2.088931090613131, + "grad_norm": 3.990142822265625, + "learning_rate": 5.162806632862821e-06, + "loss": 0.1575, + "step": 36048 + }, + { + "epoch": 2.0889446554530657, + "grad_norm": 3.3533265590667725, + "learning_rate": 5.162669590242566e-06, + "loss": 0.1702, + "step": 36049 + }, + { + "epoch": 2.0889582202930006, + "grad_norm": 3.09456729888916, + "learning_rate": 5.162532547622311e-06, + "loss": 0.1757, + "step": 36050 + }, + { + "epoch": 2.0889717851329355, + "grad_norm": 3.615111827850342, + "learning_rate": 5.162395505002056e-06, + "loss": 0.1538, + "step": 36051 + }, + { + "epoch": 2.0889853499728703, + "grad_norm": 4.213534355163574, + "learning_rate": 5.162258462381801e-06, + "loss": 0.169, + "step": 36052 + }, + { + "epoch": 2.088998914812805, + "grad_norm": 4.459965705871582, + "learning_rate": 5.162121419761547e-06, + "loss": 0.145, + "step": 36053 + }, + { + "epoch": 2.08901247965274, + "grad_norm": 4.166052341461182, + "learning_rate": 5.161984377141291e-06, + "loss": 0.2238, + "step": 36054 + }, + { + "epoch": 2.089026044492675, + "grad_norm": 3.8322248458862305, + "learning_rate": 5.1618473345210365e-06, + "loss": 0.1291, + "step": 36055 + }, + { + "epoch": 2.08903960933261, + "grad_norm": 3.447813034057617, + "learning_rate": 5.161710291900781e-06, + "loss": 0.1291, + "step": 36056 + }, + { + "epoch": 2.0890531741725447, + "grad_norm": 6.048730850219727, + "learning_rate": 5.161573249280527e-06, + "loss": 0.1752, + "step": 36057 + }, + { + "epoch": 2.0890667390124795, + "grad_norm": 7.362112045288086, + "learning_rate": 5.161436206660272e-06, + "loss": 0.2624, + "step": 36058 + }, + { + "epoch": 2.0890803038524144, + "grad_norm": 3.7965025901794434, + "learning_rate": 5.161299164040016e-06, + "loss": 0.1184, + "step": 36059 + }, + { + "epoch": 2.0890938686923493, + "grad_norm": 5.323660373687744, + "learning_rate": 5.1611621214197615e-06, + "loss": 0.1737, + "step": 36060 + }, + { + "epoch": 2.089107433532284, + "grad_norm": 7.508100509643555, + "learning_rate": 5.1610250787995075e-06, + "loss": 0.2292, + "step": 36061 + }, + { + "epoch": 2.089120998372219, + "grad_norm": 4.349948406219482, + "learning_rate": 5.160888036179253e-06, + "loss": 0.1678, + "step": 36062 + }, + { + "epoch": 2.0891345632121543, + "grad_norm": 5.234578609466553, + "learning_rate": 5.160750993558997e-06, + "loss": 0.1389, + "step": 36063 + }, + { + "epoch": 2.089148128052089, + "grad_norm": 5.019532680511475, + "learning_rate": 5.160613950938742e-06, + "loss": 0.1346, + "step": 36064 + }, + { + "epoch": 2.089161692892024, + "grad_norm": 4.776162147521973, + "learning_rate": 5.160476908318488e-06, + "loss": 0.1959, + "step": 36065 + }, + { + "epoch": 2.089175257731959, + "grad_norm": 4.666951656341553, + "learning_rate": 5.160339865698233e-06, + "loss": 0.1907, + "step": 36066 + }, + { + "epoch": 2.0891888225718938, + "grad_norm": 5.7279157638549805, + "learning_rate": 5.160202823077978e-06, + "loss": 0.1922, + "step": 36067 + }, + { + "epoch": 2.0892023874118286, + "grad_norm": 5.1067609786987305, + "learning_rate": 5.160065780457723e-06, + "loss": 0.1476, + "step": 36068 + }, + { + "epoch": 2.0892159522517635, + "grad_norm": 5.093376159667969, + "learning_rate": 5.159928737837467e-06, + "loss": 0.1658, + "step": 36069 + }, + { + "epoch": 2.0892295170916984, + "grad_norm": 4.408427715301514, + "learning_rate": 5.159791695217213e-06, + "loss": 0.1562, + "step": 36070 + }, + { + "epoch": 2.089243081931633, + "grad_norm": 4.27933931350708, + "learning_rate": 5.1596546525969585e-06, + "loss": 0.1326, + "step": 36071 + }, + { + "epoch": 2.089256646771568, + "grad_norm": 5.2088303565979, + "learning_rate": 5.159517609976703e-06, + "loss": 0.2085, + "step": 36072 + }, + { + "epoch": 2.089270211611503, + "grad_norm": 5.78525447845459, + "learning_rate": 5.159380567356448e-06, + "loss": 0.1769, + "step": 36073 + }, + { + "epoch": 2.089283776451438, + "grad_norm": 4.295950889587402, + "learning_rate": 5.159243524736194e-06, + "loss": 0.1417, + "step": 36074 + }, + { + "epoch": 2.0892973412913727, + "grad_norm": 5.488327980041504, + "learning_rate": 5.159106482115938e-06, + "loss": 0.2094, + "step": 36075 + }, + { + "epoch": 2.0893109061313075, + "grad_norm": 4.134729862213135, + "learning_rate": 5.1589694394956835e-06, + "loss": 0.2014, + "step": 36076 + }, + { + "epoch": 2.0893244709712424, + "grad_norm": 5.847999572753906, + "learning_rate": 5.158832396875429e-06, + "loss": 0.1523, + "step": 36077 + }, + { + "epoch": 2.0893380358111773, + "grad_norm": 5.262691974639893, + "learning_rate": 5.158695354255174e-06, + "loss": 0.1342, + "step": 36078 + }, + { + "epoch": 2.089351600651112, + "grad_norm": 6.233669281005859, + "learning_rate": 5.158558311634919e-06, + "loss": 0.1959, + "step": 36079 + }, + { + "epoch": 2.089365165491047, + "grad_norm": 6.351256370544434, + "learning_rate": 5.158421269014664e-06, + "loss": 0.1918, + "step": 36080 + }, + { + "epoch": 2.0893787303309823, + "grad_norm": 7.435949802398682, + "learning_rate": 5.158284226394409e-06, + "loss": 0.2367, + "step": 36081 + }, + { + "epoch": 2.089392295170917, + "grad_norm": 3.826596260070801, + "learning_rate": 5.158147183774154e-06, + "loss": 0.1954, + "step": 36082 + }, + { + "epoch": 2.089405860010852, + "grad_norm": 4.636504173278809, + "learning_rate": 5.1580101411539e-06, + "loss": 0.1529, + "step": 36083 + }, + { + "epoch": 2.089419424850787, + "grad_norm": 3.775822401046753, + "learning_rate": 5.157873098533644e-06, + "loss": 0.1504, + "step": 36084 + }, + { + "epoch": 2.0894329896907218, + "grad_norm": 4.702217102050781, + "learning_rate": 5.157736055913389e-06, + "loss": 0.1852, + "step": 36085 + }, + { + "epoch": 2.0894465545306566, + "grad_norm": 5.62492561340332, + "learning_rate": 5.1575990132931345e-06, + "loss": 0.2709, + "step": 36086 + }, + { + "epoch": 2.0894601193705915, + "grad_norm": 5.863659858703613, + "learning_rate": 5.1574619706728805e-06, + "loss": 0.2116, + "step": 36087 + }, + { + "epoch": 2.0894736842105264, + "grad_norm": 5.893734455108643, + "learning_rate": 5.157324928052625e-06, + "loss": 0.1983, + "step": 36088 + }, + { + "epoch": 2.0894872490504612, + "grad_norm": 4.417194366455078, + "learning_rate": 5.15718788543237e-06, + "loss": 0.1729, + "step": 36089 + }, + { + "epoch": 2.089500813890396, + "grad_norm": 5.299117088317871, + "learning_rate": 5.157050842812114e-06, + "loss": 0.1841, + "step": 36090 + }, + { + "epoch": 2.089514378730331, + "grad_norm": 4.871049880981445, + "learning_rate": 5.15691380019186e-06, + "loss": 0.1677, + "step": 36091 + }, + { + "epoch": 2.089527943570266, + "grad_norm": 3.8058323860168457, + "learning_rate": 5.1567767575716056e-06, + "loss": 0.126, + "step": 36092 + }, + { + "epoch": 2.0895415084102007, + "grad_norm": 6.150278091430664, + "learning_rate": 5.15663971495135e-06, + "loss": 0.2624, + "step": 36093 + }, + { + "epoch": 2.0895550732501356, + "grad_norm": 8.078882217407227, + "learning_rate": 5.156502672331095e-06, + "loss": 0.1924, + "step": 36094 + }, + { + "epoch": 2.0895686380900704, + "grad_norm": 4.792164325714111, + "learning_rate": 5.15636562971084e-06, + "loss": 0.1553, + "step": 36095 + }, + { + "epoch": 2.0895822029300053, + "grad_norm": 6.425355434417725, + "learning_rate": 5.156228587090586e-06, + "loss": 0.1981, + "step": 36096 + }, + { + "epoch": 2.08959576776994, + "grad_norm": 4.909770488739014, + "learning_rate": 5.156091544470331e-06, + "loss": 0.2606, + "step": 36097 + }, + { + "epoch": 2.089609332609875, + "grad_norm": 4.931803226470947, + "learning_rate": 5.155954501850076e-06, + "loss": 0.1338, + "step": 36098 + }, + { + "epoch": 2.08962289744981, + "grad_norm": 5.414323329925537, + "learning_rate": 5.15581745922982e-06, + "loss": 0.1913, + "step": 36099 + }, + { + "epoch": 2.0896364622897448, + "grad_norm": 4.335480213165283, + "learning_rate": 5.155680416609566e-06, + "loss": 0.1116, + "step": 36100 + }, + { + "epoch": 2.08965002712968, + "grad_norm": 5.558242321014404, + "learning_rate": 5.155543373989311e-06, + "loss": 0.1775, + "step": 36101 + }, + { + "epoch": 2.089663591969615, + "grad_norm": 4.806772708892822, + "learning_rate": 5.1554063313690565e-06, + "loss": 0.1488, + "step": 36102 + }, + { + "epoch": 2.08967715680955, + "grad_norm": 5.818003177642822, + "learning_rate": 5.155269288748801e-06, + "loss": 0.1297, + "step": 36103 + }, + { + "epoch": 2.0896907216494847, + "grad_norm": 6.466949462890625, + "learning_rate": 5.155132246128547e-06, + "loss": 0.2112, + "step": 36104 + }, + { + "epoch": 2.0897042864894195, + "grad_norm": 3.6537604331970215, + "learning_rate": 5.154995203508292e-06, + "loss": 0.1471, + "step": 36105 + }, + { + "epoch": 2.0897178513293544, + "grad_norm": 4.889575004577637, + "learning_rate": 5.154858160888036e-06, + "loss": 0.1978, + "step": 36106 + }, + { + "epoch": 2.0897314161692893, + "grad_norm": 4.254573345184326, + "learning_rate": 5.1547211182677815e-06, + "loss": 0.1857, + "step": 36107 + }, + { + "epoch": 2.089744981009224, + "grad_norm": 5.661263942718506, + "learning_rate": 5.154584075647526e-06, + "loss": 0.2286, + "step": 36108 + }, + { + "epoch": 2.089758545849159, + "grad_norm": 5.396842956542969, + "learning_rate": 5.154447033027272e-06, + "loss": 0.1934, + "step": 36109 + }, + { + "epoch": 2.089772110689094, + "grad_norm": 4.597019195556641, + "learning_rate": 5.154309990407017e-06, + "loss": 0.1544, + "step": 36110 + }, + { + "epoch": 2.0897856755290287, + "grad_norm": 4.3076300621032715, + "learning_rate": 5.154172947786762e-06, + "loss": 0.1752, + "step": 36111 + }, + { + "epoch": 2.0897992403689636, + "grad_norm": 4.139642238616943, + "learning_rate": 5.154035905166507e-06, + "loss": 0.1104, + "step": 36112 + }, + { + "epoch": 2.0898128052088984, + "grad_norm": 3.948453903198242, + "learning_rate": 5.153898862546253e-06, + "loss": 0.0962, + "step": 36113 + }, + { + "epoch": 2.0898263700488333, + "grad_norm": 4.96976900100708, + "learning_rate": 5.153761819925998e-06, + "loss": 0.1951, + "step": 36114 + }, + { + "epoch": 2.089839934888768, + "grad_norm": 6.976902484893799, + "learning_rate": 5.153624777305742e-06, + "loss": 0.182, + "step": 36115 + }, + { + "epoch": 2.089853499728703, + "grad_norm": 4.102454662322998, + "learning_rate": 5.153487734685487e-06, + "loss": 0.1488, + "step": 36116 + }, + { + "epoch": 2.089867064568638, + "grad_norm": 4.64992618560791, + "learning_rate": 5.153350692065233e-06, + "loss": 0.1453, + "step": 36117 + }, + { + "epoch": 2.0898806294085728, + "grad_norm": 4.613925457000732, + "learning_rate": 5.153213649444978e-06, + "loss": 0.1815, + "step": 36118 + }, + { + "epoch": 2.089894194248508, + "grad_norm": 7.333338737487793, + "learning_rate": 5.153076606824723e-06, + "loss": 0.2168, + "step": 36119 + }, + { + "epoch": 2.089907759088443, + "grad_norm": 4.05126428604126, + "learning_rate": 5.152939564204468e-06, + "loss": 0.1423, + "step": 36120 + }, + { + "epoch": 2.089921323928378, + "grad_norm": 3.84488582611084, + "learning_rate": 5.152802521584212e-06, + "loss": 0.2134, + "step": 36121 + }, + { + "epoch": 2.0899348887683127, + "grad_norm": 5.797738075256348, + "learning_rate": 5.152665478963958e-06, + "loss": 0.1475, + "step": 36122 + }, + { + "epoch": 2.0899484536082475, + "grad_norm": 3.79612398147583, + "learning_rate": 5.1525284363437036e-06, + "loss": 0.1264, + "step": 36123 + }, + { + "epoch": 2.0899620184481824, + "grad_norm": 4.300917148590088, + "learning_rate": 5.152391393723448e-06, + "loss": 0.0873, + "step": 36124 + }, + { + "epoch": 2.0899755832881173, + "grad_norm": 6.93009614944458, + "learning_rate": 5.152254351103193e-06, + "loss": 0.3194, + "step": 36125 + }, + { + "epoch": 2.089989148128052, + "grad_norm": 5.390421390533447, + "learning_rate": 5.152117308482939e-06, + "loss": 0.1871, + "step": 36126 + }, + { + "epoch": 2.090002712967987, + "grad_norm": 4.2538676261901855, + "learning_rate": 5.151980265862684e-06, + "loss": 0.1339, + "step": 36127 + }, + { + "epoch": 2.090016277807922, + "grad_norm": 5.833746433258057, + "learning_rate": 5.151843223242429e-06, + "loss": 0.2342, + "step": 36128 + }, + { + "epoch": 2.0900298426478567, + "grad_norm": 4.865384578704834, + "learning_rate": 5.151706180622174e-06, + "loss": 0.1907, + "step": 36129 + }, + { + "epoch": 2.0900434074877916, + "grad_norm": 4.684786319732666, + "learning_rate": 5.15156913800192e-06, + "loss": 0.1506, + "step": 36130 + }, + { + "epoch": 2.0900569723277265, + "grad_norm": 5.1234450340271, + "learning_rate": 5.151432095381664e-06, + "loss": 0.2544, + "step": 36131 + }, + { + "epoch": 2.0900705371676613, + "grad_norm": 4.535607814788818, + "learning_rate": 5.151295052761409e-06, + "loss": 0.1542, + "step": 36132 + }, + { + "epoch": 2.090084102007596, + "grad_norm": 5.724211692810059, + "learning_rate": 5.151158010141154e-06, + "loss": 0.2408, + "step": 36133 + }, + { + "epoch": 2.090097666847531, + "grad_norm": 5.342179298400879, + "learning_rate": 5.1510209675209e-06, + "loss": 0.2556, + "step": 36134 + }, + { + "epoch": 2.090111231687466, + "grad_norm": 4.166306972503662, + "learning_rate": 5.150883924900645e-06, + "loss": 0.1399, + "step": 36135 + }, + { + "epoch": 2.090124796527401, + "grad_norm": 4.887415885925293, + "learning_rate": 5.15074688228039e-06, + "loss": 0.1965, + "step": 36136 + }, + { + "epoch": 2.0901383613673357, + "grad_norm": 5.412460803985596, + "learning_rate": 5.150609839660134e-06, + "loss": 0.1583, + "step": 36137 + }, + { + "epoch": 2.0901519262072705, + "grad_norm": 5.733548164367676, + "learning_rate": 5.1504727970398795e-06, + "loss": 0.2199, + "step": 36138 + }, + { + "epoch": 2.090165491047206, + "grad_norm": 5.557891368865967, + "learning_rate": 5.1503357544196256e-06, + "loss": 0.2525, + "step": 36139 + }, + { + "epoch": 2.0901790558871407, + "grad_norm": 4.294386863708496, + "learning_rate": 5.15019871179937e-06, + "loss": 0.1685, + "step": 36140 + }, + { + "epoch": 2.0901926207270756, + "grad_norm": 3.975102663040161, + "learning_rate": 5.150061669179115e-06, + "loss": 0.1501, + "step": 36141 + }, + { + "epoch": 2.0902061855670104, + "grad_norm": 5.529491424560547, + "learning_rate": 5.149924626558859e-06, + "loss": 0.1809, + "step": 36142 + }, + { + "epoch": 2.0902197504069453, + "grad_norm": 3.9603207111358643, + "learning_rate": 5.1497875839386054e-06, + "loss": 0.1351, + "step": 36143 + }, + { + "epoch": 2.09023331524688, + "grad_norm": 5.29759407043457, + "learning_rate": 5.149650541318351e-06, + "loss": 0.2437, + "step": 36144 + }, + { + "epoch": 2.090246880086815, + "grad_norm": 3.88696026802063, + "learning_rate": 5.149513498698096e-06, + "loss": 0.1774, + "step": 36145 + }, + { + "epoch": 2.09026044492675, + "grad_norm": 4.864508628845215, + "learning_rate": 5.14937645607784e-06, + "loss": 0.1865, + "step": 36146 + }, + { + "epoch": 2.0902740097666848, + "grad_norm": 4.954718589782715, + "learning_rate": 5.149239413457586e-06, + "loss": 0.1488, + "step": 36147 + }, + { + "epoch": 2.0902875746066196, + "grad_norm": 4.099288463592529, + "learning_rate": 5.149102370837331e-06, + "loss": 0.1799, + "step": 36148 + }, + { + "epoch": 2.0903011394465545, + "grad_norm": 5.571205139160156, + "learning_rate": 5.148965328217076e-06, + "loss": 0.2509, + "step": 36149 + }, + { + "epoch": 2.0903147042864894, + "grad_norm": 6.32422399520874, + "learning_rate": 5.148828285596821e-06, + "loss": 0.276, + "step": 36150 + }, + { + "epoch": 2.090328269126424, + "grad_norm": 4.801826000213623, + "learning_rate": 5.148691242976566e-06, + "loss": 0.1489, + "step": 36151 + }, + { + "epoch": 2.090341833966359, + "grad_norm": 5.901898384094238, + "learning_rate": 5.148554200356311e-06, + "loss": 0.1782, + "step": 36152 + }, + { + "epoch": 2.090355398806294, + "grad_norm": 4.696988105773926, + "learning_rate": 5.148417157736056e-06, + "loss": 0.1341, + "step": 36153 + }, + { + "epoch": 2.090368963646229, + "grad_norm": 4.5988264083862305, + "learning_rate": 5.1482801151158016e-06, + "loss": 0.1794, + "step": 36154 + }, + { + "epoch": 2.0903825284861637, + "grad_norm": 4.592135906219482, + "learning_rate": 5.148143072495546e-06, + "loss": 0.1581, + "step": 36155 + }, + { + "epoch": 2.0903960933260985, + "grad_norm": 5.859033107757568, + "learning_rate": 5.148006029875292e-06, + "loss": 0.1371, + "step": 36156 + }, + { + "epoch": 2.090409658166034, + "grad_norm": 5.597508907318115, + "learning_rate": 5.147868987255037e-06, + "loss": 0.2494, + "step": 36157 + }, + { + "epoch": 2.0904232230059687, + "grad_norm": 3.458406925201416, + "learning_rate": 5.1477319446347814e-06, + "loss": 0.1245, + "step": 36158 + }, + { + "epoch": 2.0904367878459036, + "grad_norm": 5.593756198883057, + "learning_rate": 5.147594902014527e-06, + "loss": 0.2311, + "step": 36159 + }, + { + "epoch": 2.0904503526858385, + "grad_norm": 4.4842753410339355, + "learning_rate": 5.147457859394273e-06, + "loss": 0.1698, + "step": 36160 + }, + { + "epoch": 2.0904639175257733, + "grad_norm": 3.968214988708496, + "learning_rate": 5.147320816774018e-06, + "loss": 0.1676, + "step": 36161 + }, + { + "epoch": 2.090477482365708, + "grad_norm": 4.603355884552002, + "learning_rate": 5.147183774153762e-06, + "loss": 0.179, + "step": 36162 + }, + { + "epoch": 2.090491047205643, + "grad_norm": 3.4919333457946777, + "learning_rate": 5.147046731533507e-06, + "loss": 0.1246, + "step": 36163 + }, + { + "epoch": 2.090504612045578, + "grad_norm": 4.928685665130615, + "learning_rate": 5.146909688913252e-06, + "loss": 0.1492, + "step": 36164 + }, + { + "epoch": 2.090518176885513, + "grad_norm": 7.346469402313232, + "learning_rate": 5.146772646292998e-06, + "loss": 0.2951, + "step": 36165 + }, + { + "epoch": 2.0905317417254476, + "grad_norm": 4.2604780197143555, + "learning_rate": 5.146635603672743e-06, + "loss": 0.2003, + "step": 36166 + }, + { + "epoch": 2.0905453065653825, + "grad_norm": 4.323750972747803, + "learning_rate": 5.146498561052487e-06, + "loss": 0.1914, + "step": 36167 + }, + { + "epoch": 2.0905588714053174, + "grad_norm": 4.12168025970459, + "learning_rate": 5.146361518432232e-06, + "loss": 0.15, + "step": 36168 + }, + { + "epoch": 2.0905724362452522, + "grad_norm": 4.2403154373168945, + "learning_rate": 5.146224475811978e-06, + "loss": 0.2341, + "step": 36169 + }, + { + "epoch": 2.090586001085187, + "grad_norm": 6.265125751495361, + "learning_rate": 5.1460874331917236e-06, + "loss": 0.2418, + "step": 36170 + }, + { + "epoch": 2.090599565925122, + "grad_norm": 6.495774745941162, + "learning_rate": 5.145950390571468e-06, + "loss": 0.2479, + "step": 36171 + }, + { + "epoch": 2.090613130765057, + "grad_norm": 5.320396900177002, + "learning_rate": 5.145813347951213e-06, + "loss": 0.2599, + "step": 36172 + }, + { + "epoch": 2.0906266956049917, + "grad_norm": 5.2375102043151855, + "learning_rate": 5.145676305330959e-06, + "loss": 0.2747, + "step": 36173 + }, + { + "epoch": 2.0906402604449266, + "grad_norm": 4.969486236572266, + "learning_rate": 5.1455392627107034e-06, + "loss": 0.2434, + "step": 36174 + }, + { + "epoch": 2.0906538252848614, + "grad_norm": 3.4999241828918457, + "learning_rate": 5.145402220090449e-06, + "loss": 0.1462, + "step": 36175 + }, + { + "epoch": 2.0906673901247963, + "grad_norm": 4.580533504486084, + "learning_rate": 5.145265177470194e-06, + "loss": 0.198, + "step": 36176 + }, + { + "epoch": 2.0906809549647316, + "grad_norm": 4.670506477355957, + "learning_rate": 5.145128134849938e-06, + "loss": 0.1779, + "step": 36177 + }, + { + "epoch": 2.0906945198046665, + "grad_norm": 5.638258934020996, + "learning_rate": 5.144991092229684e-06, + "loss": 0.1817, + "step": 36178 + }, + { + "epoch": 2.0907080846446013, + "grad_norm": 4.383211135864258, + "learning_rate": 5.144854049609429e-06, + "loss": 0.1557, + "step": 36179 + }, + { + "epoch": 2.090721649484536, + "grad_norm": 3.455686330795288, + "learning_rate": 5.144717006989174e-06, + "loss": 0.1641, + "step": 36180 + }, + { + "epoch": 2.090735214324471, + "grad_norm": 5.689219951629639, + "learning_rate": 5.144579964368919e-06, + "loss": 0.1544, + "step": 36181 + }, + { + "epoch": 2.090748779164406, + "grad_norm": 5.564864158630371, + "learning_rate": 5.144442921748665e-06, + "loss": 0.2055, + "step": 36182 + }, + { + "epoch": 2.090762344004341, + "grad_norm": 4.387528419494629, + "learning_rate": 5.144305879128409e-06, + "loss": 0.148, + "step": 36183 + }, + { + "epoch": 2.0907759088442757, + "grad_norm": 4.253730297088623, + "learning_rate": 5.144168836508154e-06, + "loss": 0.1359, + "step": 36184 + }, + { + "epoch": 2.0907894736842105, + "grad_norm": 4.194333553314209, + "learning_rate": 5.1440317938878996e-06, + "loss": 0.131, + "step": 36185 + }, + { + "epoch": 2.0908030385241454, + "grad_norm": 4.629617691040039, + "learning_rate": 5.143894751267645e-06, + "loss": 0.1558, + "step": 36186 + }, + { + "epoch": 2.0908166033640803, + "grad_norm": 4.4035210609436035, + "learning_rate": 5.14375770864739e-06, + "loss": 0.1381, + "step": 36187 + }, + { + "epoch": 2.090830168204015, + "grad_norm": 4.689391136169434, + "learning_rate": 5.143620666027135e-06, + "loss": 0.1501, + "step": 36188 + }, + { + "epoch": 2.09084373304395, + "grad_norm": 4.3781046867370605, + "learning_rate": 5.1434836234068794e-06, + "loss": 0.1576, + "step": 36189 + }, + { + "epoch": 2.090857297883885, + "grad_norm": 4.5156025886535645, + "learning_rate": 5.143346580786625e-06, + "loss": 0.2109, + "step": 36190 + }, + { + "epoch": 2.0908708627238197, + "grad_norm": 5.723244667053223, + "learning_rate": 5.143209538166371e-06, + "loss": 0.1559, + "step": 36191 + }, + { + "epoch": 2.0908844275637546, + "grad_norm": 3.256723642349243, + "learning_rate": 5.143072495546115e-06, + "loss": 0.1426, + "step": 36192 + }, + { + "epoch": 2.0908979924036895, + "grad_norm": 4.170260906219482, + "learning_rate": 5.14293545292586e-06, + "loss": 0.1256, + "step": 36193 + }, + { + "epoch": 2.0909115572436243, + "grad_norm": 4.8149943351745605, + "learning_rate": 5.142798410305605e-06, + "loss": 0.1644, + "step": 36194 + }, + { + "epoch": 2.0909251220835596, + "grad_norm": 3.3997673988342285, + "learning_rate": 5.142661367685351e-06, + "loss": 0.1166, + "step": 36195 + }, + { + "epoch": 2.0909386869234945, + "grad_norm": 4.18201208114624, + "learning_rate": 5.142524325065096e-06, + "loss": 0.1224, + "step": 36196 + }, + { + "epoch": 2.0909522517634294, + "grad_norm": 3.838536500930786, + "learning_rate": 5.142387282444841e-06, + "loss": 0.1254, + "step": 36197 + }, + { + "epoch": 2.0909658166033642, + "grad_norm": 5.298476219177246, + "learning_rate": 5.142250239824585e-06, + "loss": 0.2213, + "step": 36198 + }, + { + "epoch": 2.090979381443299, + "grad_norm": 6.7248077392578125, + "learning_rate": 5.142113197204331e-06, + "loss": 0.2564, + "step": 36199 + }, + { + "epoch": 2.090992946283234, + "grad_norm": 7.224491119384766, + "learning_rate": 5.141976154584076e-06, + "loss": 0.2928, + "step": 36200 + }, + { + "epoch": 2.091006511123169, + "grad_norm": 6.401351451873779, + "learning_rate": 5.141839111963821e-06, + "loss": 0.2302, + "step": 36201 + }, + { + "epoch": 2.0910200759631037, + "grad_norm": 3.206108570098877, + "learning_rate": 5.141702069343566e-06, + "loss": 0.1573, + "step": 36202 + }, + { + "epoch": 2.0910336408030386, + "grad_norm": 4.638542175292969, + "learning_rate": 5.141565026723312e-06, + "loss": 0.1163, + "step": 36203 + }, + { + "epoch": 2.0910472056429734, + "grad_norm": 3.1815202236175537, + "learning_rate": 5.141427984103057e-06, + "loss": 0.1578, + "step": 36204 + }, + { + "epoch": 2.0910607704829083, + "grad_norm": 4.272199630737305, + "learning_rate": 5.1412909414828014e-06, + "loss": 0.1445, + "step": 36205 + }, + { + "epoch": 2.091074335322843, + "grad_norm": 4.999129772186279, + "learning_rate": 5.141153898862547e-06, + "loss": 0.1996, + "step": 36206 + }, + { + "epoch": 2.091087900162778, + "grad_norm": 4.062748432159424, + "learning_rate": 5.141016856242291e-06, + "loss": 0.1282, + "step": 36207 + }, + { + "epoch": 2.091101465002713, + "grad_norm": 5.027868270874023, + "learning_rate": 5.140879813622037e-06, + "loss": 0.1596, + "step": 36208 + }, + { + "epoch": 2.0911150298426477, + "grad_norm": 3.8725950717926025, + "learning_rate": 5.140742771001782e-06, + "loss": 0.1109, + "step": 36209 + }, + { + "epoch": 2.0911285946825826, + "grad_norm": 3.8211021423339844, + "learning_rate": 5.140605728381527e-06, + "loss": 0.1329, + "step": 36210 + }, + { + "epoch": 2.0911421595225175, + "grad_norm": 4.074338912963867, + "learning_rate": 5.140468685761272e-06, + "loss": 0.1201, + "step": 36211 + }, + { + "epoch": 2.0911557243624523, + "grad_norm": 5.99242639541626, + "learning_rate": 5.140331643141018e-06, + "loss": 0.1948, + "step": 36212 + }, + { + "epoch": 2.091169289202387, + "grad_norm": 5.615684509277344, + "learning_rate": 5.140194600520763e-06, + "loss": 0.2396, + "step": 36213 + }, + { + "epoch": 2.0911828540423225, + "grad_norm": 5.567915916442871, + "learning_rate": 5.140057557900507e-06, + "loss": 0.1182, + "step": 36214 + }, + { + "epoch": 2.0911964188822574, + "grad_norm": 5.303440093994141, + "learning_rate": 5.139920515280252e-06, + "loss": 0.1608, + "step": 36215 + }, + { + "epoch": 2.0912099837221922, + "grad_norm": 5.644383430480957, + "learning_rate": 5.139783472659998e-06, + "loss": 0.1748, + "step": 36216 + }, + { + "epoch": 2.091223548562127, + "grad_norm": 4.907453536987305, + "learning_rate": 5.139646430039743e-06, + "loss": 0.1147, + "step": 36217 + }, + { + "epoch": 2.091237113402062, + "grad_norm": 4.79291296005249, + "learning_rate": 5.139509387419488e-06, + "loss": 0.1398, + "step": 36218 + }, + { + "epoch": 2.091250678241997, + "grad_norm": 6.772900581359863, + "learning_rate": 5.139372344799233e-06, + "loss": 0.119, + "step": 36219 + }, + { + "epoch": 2.0912642430819317, + "grad_norm": 6.368528366088867, + "learning_rate": 5.1392353021789774e-06, + "loss": 0.1873, + "step": 36220 + }, + { + "epoch": 2.0912778079218666, + "grad_norm": 5.3483686447143555, + "learning_rate": 5.1390982595587235e-06, + "loss": 0.1975, + "step": 36221 + }, + { + "epoch": 2.0912913727618014, + "grad_norm": 6.443546295166016, + "learning_rate": 5.138961216938469e-06, + "loss": 0.1868, + "step": 36222 + }, + { + "epoch": 2.0913049376017363, + "grad_norm": 4.879418849945068, + "learning_rate": 5.138824174318213e-06, + "loss": 0.1693, + "step": 36223 + }, + { + "epoch": 2.091318502441671, + "grad_norm": 3.4761712551116943, + "learning_rate": 5.138687131697958e-06, + "loss": 0.1466, + "step": 36224 + }, + { + "epoch": 2.091332067281606, + "grad_norm": 4.233182430267334, + "learning_rate": 5.138550089077704e-06, + "loss": 0.1396, + "step": 36225 + }, + { + "epoch": 2.091345632121541, + "grad_norm": 4.781218528747559, + "learning_rate": 5.1384130464574485e-06, + "loss": 0.1581, + "step": 36226 + }, + { + "epoch": 2.0913591969614758, + "grad_norm": 6.63283634185791, + "learning_rate": 5.138276003837194e-06, + "loss": 0.2417, + "step": 36227 + }, + { + "epoch": 2.0913727618014106, + "grad_norm": 4.931253910064697, + "learning_rate": 5.138138961216939e-06, + "loss": 0.1831, + "step": 36228 + }, + { + "epoch": 2.0913863266413455, + "grad_norm": 4.018011569976807, + "learning_rate": 5.138001918596685e-06, + "loss": 0.1578, + "step": 36229 + }, + { + "epoch": 2.0913998914812804, + "grad_norm": 4.457418918609619, + "learning_rate": 5.137864875976429e-06, + "loss": 0.2244, + "step": 36230 + }, + { + "epoch": 2.0914134563212152, + "grad_norm": 7.3360514640808105, + "learning_rate": 5.137727833356174e-06, + "loss": 0.2024, + "step": 36231 + }, + { + "epoch": 2.09142702116115, + "grad_norm": 4.6426191329956055, + "learning_rate": 5.137590790735919e-06, + "loss": 0.1616, + "step": 36232 + }, + { + "epoch": 2.0914405860010854, + "grad_norm": 6.037620544433594, + "learning_rate": 5.137453748115664e-06, + "loss": 0.2449, + "step": 36233 + }, + { + "epoch": 2.0914541508410203, + "grad_norm": 9.337223052978516, + "learning_rate": 5.13731670549541e-06, + "loss": 0.4239, + "step": 36234 + }, + { + "epoch": 2.091467715680955, + "grad_norm": 4.309842586517334, + "learning_rate": 5.137179662875154e-06, + "loss": 0.1647, + "step": 36235 + }, + { + "epoch": 2.09148128052089, + "grad_norm": 4.052709102630615, + "learning_rate": 5.1370426202548995e-06, + "loss": 0.1226, + "step": 36236 + }, + { + "epoch": 2.091494845360825, + "grad_norm": 6.1731109619140625, + "learning_rate": 5.136905577634645e-06, + "loss": 0.2728, + "step": 36237 + }, + { + "epoch": 2.0915084102007597, + "grad_norm": 3.4212112426757812, + "learning_rate": 5.136768535014391e-06, + "loss": 0.1145, + "step": 36238 + }, + { + "epoch": 2.0915219750406946, + "grad_norm": 4.43424129486084, + "learning_rate": 5.136631492394135e-06, + "loss": 0.1436, + "step": 36239 + }, + { + "epoch": 2.0915355398806295, + "grad_norm": 5.52958345413208, + "learning_rate": 5.13649444977388e-06, + "loss": 0.1436, + "step": 36240 + }, + { + "epoch": 2.0915491047205643, + "grad_norm": 5.639009952545166, + "learning_rate": 5.1363574071536245e-06, + "loss": 0.1912, + "step": 36241 + }, + { + "epoch": 2.091562669560499, + "grad_norm": 4.723384380340576, + "learning_rate": 5.1362203645333705e-06, + "loss": 0.1655, + "step": 36242 + }, + { + "epoch": 2.091576234400434, + "grad_norm": 4.37864875793457, + "learning_rate": 5.136083321913116e-06, + "loss": 0.2008, + "step": 36243 + }, + { + "epoch": 2.091589799240369, + "grad_norm": 5.388842582702637, + "learning_rate": 5.135946279292861e-06, + "loss": 0.1892, + "step": 36244 + }, + { + "epoch": 2.091603364080304, + "grad_norm": 3.710939645767212, + "learning_rate": 5.135809236672605e-06, + "loss": 0.0754, + "step": 36245 + }, + { + "epoch": 2.0916169289202386, + "grad_norm": 5.163658618927002, + "learning_rate": 5.13567219405235e-06, + "loss": 0.2111, + "step": 36246 + }, + { + "epoch": 2.0916304937601735, + "grad_norm": 4.331665992736816, + "learning_rate": 5.135535151432096e-06, + "loss": 0.097, + "step": 36247 + }, + { + "epoch": 2.0916440586001084, + "grad_norm": 4.859874248504639, + "learning_rate": 5.135398108811841e-06, + "loss": 0.1547, + "step": 36248 + }, + { + "epoch": 2.0916576234400432, + "grad_norm": 5.658341407775879, + "learning_rate": 5.135261066191586e-06, + "loss": 0.2726, + "step": 36249 + }, + { + "epoch": 2.091671188279978, + "grad_norm": 5.015299320220947, + "learning_rate": 5.13512402357133e-06, + "loss": 0.1405, + "step": 36250 + }, + { + "epoch": 2.091684753119913, + "grad_norm": 5.713726043701172, + "learning_rate": 5.134986980951076e-06, + "loss": 0.1748, + "step": 36251 + }, + { + "epoch": 2.0916983179598483, + "grad_norm": 4.137688636779785, + "learning_rate": 5.1348499383308215e-06, + "loss": 0.1114, + "step": 36252 + }, + { + "epoch": 2.091711882799783, + "grad_norm": 6.633403301239014, + "learning_rate": 5.134712895710567e-06, + "loss": 0.2403, + "step": 36253 + }, + { + "epoch": 2.091725447639718, + "grad_norm": 4.3549370765686035, + "learning_rate": 5.134575853090311e-06, + "loss": 0.3435, + "step": 36254 + }, + { + "epoch": 2.091739012479653, + "grad_norm": 6.135636806488037, + "learning_rate": 5.134438810470057e-06, + "loss": 0.1773, + "step": 36255 + }, + { + "epoch": 2.0917525773195877, + "grad_norm": 4.450905799865723, + "learning_rate": 5.134301767849802e-06, + "loss": 0.1893, + "step": 36256 + }, + { + "epoch": 2.0917661421595226, + "grad_norm": 8.178553581237793, + "learning_rate": 5.1341647252295465e-06, + "loss": 0.3792, + "step": 36257 + }, + { + "epoch": 2.0917797069994575, + "grad_norm": 4.882638454437256, + "learning_rate": 5.134027682609292e-06, + "loss": 0.29, + "step": 36258 + }, + { + "epoch": 2.0917932718393923, + "grad_norm": 3.799315929412842, + "learning_rate": 5.133890639989037e-06, + "loss": 0.1364, + "step": 36259 + }, + { + "epoch": 2.091806836679327, + "grad_norm": 7.088410377502441, + "learning_rate": 5.133753597368782e-06, + "loss": 0.2426, + "step": 36260 + }, + { + "epoch": 2.091820401519262, + "grad_norm": 4.045424938201904, + "learning_rate": 5.133616554748527e-06, + "loss": 0.2039, + "step": 36261 + }, + { + "epoch": 2.091833966359197, + "grad_norm": 5.211124897003174, + "learning_rate": 5.133479512128272e-06, + "loss": 0.2044, + "step": 36262 + }, + { + "epoch": 2.091847531199132, + "grad_norm": 6.5461931228637695, + "learning_rate": 5.133342469508017e-06, + "loss": 0.2584, + "step": 36263 + }, + { + "epoch": 2.0918610960390667, + "grad_norm": 6.286808967590332, + "learning_rate": 5.133205426887763e-06, + "loss": 0.1944, + "step": 36264 + }, + { + "epoch": 2.0918746608790015, + "grad_norm": 5.107754230499268, + "learning_rate": 5.133068384267508e-06, + "loss": 0.2089, + "step": 36265 + }, + { + "epoch": 2.0918882257189364, + "grad_norm": 6.134947776794434, + "learning_rate": 5.132931341647252e-06, + "loss": 0.2601, + "step": 36266 + }, + { + "epoch": 2.0919017905588713, + "grad_norm": 5.790166854858398, + "learning_rate": 5.1327942990269975e-06, + "loss": 0.3093, + "step": 36267 + }, + { + "epoch": 2.091915355398806, + "grad_norm": 4.618276119232178, + "learning_rate": 5.1326572564067435e-06, + "loss": 0.239, + "step": 36268 + }, + { + "epoch": 2.091928920238741, + "grad_norm": 5.757810592651367, + "learning_rate": 5.132520213786488e-06, + "loss": 0.2737, + "step": 36269 + }, + { + "epoch": 2.091942485078676, + "grad_norm": 4.627043724060059, + "learning_rate": 5.132383171166233e-06, + "loss": 0.1583, + "step": 36270 + }, + { + "epoch": 2.091956049918611, + "grad_norm": 5.452010154724121, + "learning_rate": 5.132246128545978e-06, + "loss": 0.3135, + "step": 36271 + }, + { + "epoch": 2.091969614758546, + "grad_norm": 4.450401306152344, + "learning_rate": 5.1321090859257225e-06, + "loss": 0.2764, + "step": 36272 + }, + { + "epoch": 2.091983179598481, + "grad_norm": 5.128136157989502, + "learning_rate": 5.1319720433054685e-06, + "loss": 0.2189, + "step": 36273 + }, + { + "epoch": 2.0919967444384158, + "grad_norm": 5.320921897888184, + "learning_rate": 5.131835000685214e-06, + "loss": 0.2412, + "step": 36274 + }, + { + "epoch": 2.0920103092783506, + "grad_norm": 6.5768890380859375, + "learning_rate": 5.131697958064958e-06, + "loss": 0.3525, + "step": 36275 + }, + { + "epoch": 2.0920238741182855, + "grad_norm": 6.285091400146484, + "learning_rate": 5.131560915444703e-06, + "loss": 0.287, + "step": 36276 + }, + { + "epoch": 2.0920374389582204, + "grad_norm": 5.466391563415527, + "learning_rate": 5.131423872824449e-06, + "loss": 0.245, + "step": 36277 + }, + { + "epoch": 2.0920510037981552, + "grad_norm": 5.140247821807861, + "learning_rate": 5.1312868302041944e-06, + "loss": 0.2417, + "step": 36278 + }, + { + "epoch": 2.09206456863809, + "grad_norm": 6.363702297210693, + "learning_rate": 5.131149787583939e-06, + "loss": 0.2314, + "step": 36279 + }, + { + "epoch": 2.092078133478025, + "grad_norm": 4.489377975463867, + "learning_rate": 5.131012744963684e-06, + "loss": 0.2078, + "step": 36280 + }, + { + "epoch": 2.09209169831796, + "grad_norm": 5.304409027099609, + "learning_rate": 5.13087570234343e-06, + "loss": 0.2402, + "step": 36281 + }, + { + "epoch": 2.0921052631578947, + "grad_norm": 5.695414066314697, + "learning_rate": 5.130738659723174e-06, + "loss": 0.1862, + "step": 36282 + }, + { + "epoch": 2.0921188279978296, + "grad_norm": 4.855548858642578, + "learning_rate": 5.1306016171029195e-06, + "loss": 0.1593, + "step": 36283 + }, + { + "epoch": 2.0921323928377644, + "grad_norm": 5.864386558532715, + "learning_rate": 5.130464574482664e-06, + "loss": 0.2064, + "step": 36284 + }, + { + "epoch": 2.0921459576776993, + "grad_norm": 4.383917808532715, + "learning_rate": 5.13032753186241e-06, + "loss": 0.1846, + "step": 36285 + }, + { + "epoch": 2.092159522517634, + "grad_norm": 5.351215362548828, + "learning_rate": 5.130190489242155e-06, + "loss": 0.1738, + "step": 36286 + }, + { + "epoch": 2.092173087357569, + "grad_norm": 5.798489570617676, + "learning_rate": 5.1300534466219e-06, + "loss": 0.2405, + "step": 36287 + }, + { + "epoch": 2.092186652197504, + "grad_norm": 6.49294900894165, + "learning_rate": 5.1299164040016445e-06, + "loss": 0.2081, + "step": 36288 + }, + { + "epoch": 2.0922002170374387, + "grad_norm": 5.217217922210693, + "learning_rate": 5.12977936138139e-06, + "loss": 0.1558, + "step": 36289 + }, + { + "epoch": 2.092213781877374, + "grad_norm": 5.499239444732666, + "learning_rate": 5.129642318761136e-06, + "loss": 0.1557, + "step": 36290 + }, + { + "epoch": 2.092227346717309, + "grad_norm": 5.374154567718506, + "learning_rate": 5.12950527614088e-06, + "loss": 0.2846, + "step": 36291 + }, + { + "epoch": 2.092240911557244, + "grad_norm": 3.9423418045043945, + "learning_rate": 5.129368233520625e-06, + "loss": 0.1292, + "step": 36292 + }, + { + "epoch": 2.0922544763971787, + "grad_norm": 5.958962440490723, + "learning_rate": 5.12923119090037e-06, + "loss": 0.202, + "step": 36293 + }, + { + "epoch": 2.0922680412371135, + "grad_norm": 5.478047847747803, + "learning_rate": 5.129094148280116e-06, + "loss": 0.2154, + "step": 36294 + }, + { + "epoch": 2.0922816060770484, + "grad_norm": 5.4836320877075195, + "learning_rate": 5.128957105659861e-06, + "loss": 0.206, + "step": 36295 + }, + { + "epoch": 2.0922951709169832, + "grad_norm": 5.175171852111816, + "learning_rate": 5.128820063039606e-06, + "loss": 0.1979, + "step": 36296 + }, + { + "epoch": 2.092308735756918, + "grad_norm": 5.2806196212768555, + "learning_rate": 5.12868302041935e-06, + "loss": 0.167, + "step": 36297 + }, + { + "epoch": 2.092322300596853, + "grad_norm": 6.025948524475098, + "learning_rate": 5.128545977799096e-06, + "loss": 0.2755, + "step": 36298 + }, + { + "epoch": 2.092335865436788, + "grad_norm": 5.120134353637695, + "learning_rate": 5.1284089351788415e-06, + "loss": 0.1616, + "step": 36299 + }, + { + "epoch": 2.0923494302767227, + "grad_norm": 4.457945823669434, + "learning_rate": 5.128271892558586e-06, + "loss": 0.1597, + "step": 36300 + }, + { + "epoch": 2.0923629951166576, + "grad_norm": 5.3262224197387695, + "learning_rate": 5.128134849938331e-06, + "loss": 0.1937, + "step": 36301 + }, + { + "epoch": 2.0923765599565924, + "grad_norm": 5.327727317810059, + "learning_rate": 5.127997807318076e-06, + "loss": 0.1856, + "step": 36302 + }, + { + "epoch": 2.0923901247965273, + "grad_norm": 4.121345043182373, + "learning_rate": 5.127860764697822e-06, + "loss": 0.1273, + "step": 36303 + }, + { + "epoch": 2.092403689636462, + "grad_norm": 6.158629894256592, + "learning_rate": 5.1277237220775665e-06, + "loss": 0.1276, + "step": 36304 + }, + { + "epoch": 2.092417254476397, + "grad_norm": 6.1601104736328125, + "learning_rate": 5.127586679457312e-06, + "loss": 0.2871, + "step": 36305 + }, + { + "epoch": 2.092430819316332, + "grad_norm": 3.8391478061676025, + "learning_rate": 5.127449636837056e-06, + "loss": 0.1206, + "step": 36306 + }, + { + "epoch": 2.0924443841562668, + "grad_norm": 4.0344953536987305, + "learning_rate": 5.127312594216802e-06, + "loss": 0.2301, + "step": 36307 + }, + { + "epoch": 2.0924579489962016, + "grad_norm": 5.0181660652160645, + "learning_rate": 5.127175551596547e-06, + "loss": 0.2531, + "step": 36308 + }, + { + "epoch": 2.092471513836137, + "grad_norm": 5.8512468338012695, + "learning_rate": 5.127038508976292e-06, + "loss": 0.1793, + "step": 36309 + }, + { + "epoch": 2.092485078676072, + "grad_norm": 4.962325572967529, + "learning_rate": 5.126901466356037e-06, + "loss": 0.1153, + "step": 36310 + }, + { + "epoch": 2.0924986435160067, + "grad_norm": 5.184467792510986, + "learning_rate": 5.126764423735783e-06, + "loss": 0.2992, + "step": 36311 + }, + { + "epoch": 2.0925122083559415, + "grad_norm": 4.075189590454102, + "learning_rate": 5.126627381115528e-06, + "loss": 0.1768, + "step": 36312 + }, + { + "epoch": 2.0925257731958764, + "grad_norm": 3.142207145690918, + "learning_rate": 5.126490338495272e-06, + "loss": 0.1042, + "step": 36313 + }, + { + "epoch": 2.0925393380358113, + "grad_norm": 4.467170238494873, + "learning_rate": 5.1263532958750175e-06, + "loss": 0.1585, + "step": 36314 + }, + { + "epoch": 2.092552902875746, + "grad_norm": 3.7577977180480957, + "learning_rate": 5.126216253254762e-06, + "loss": 0.1225, + "step": 36315 + }, + { + "epoch": 2.092566467715681, + "grad_norm": 4.840517520904541, + "learning_rate": 5.126079210634508e-06, + "loss": 0.2091, + "step": 36316 + }, + { + "epoch": 2.092580032555616, + "grad_norm": 6.413314342498779, + "learning_rate": 5.125942168014253e-06, + "loss": 0.1756, + "step": 36317 + }, + { + "epoch": 2.0925935973955507, + "grad_norm": 5.2848896980285645, + "learning_rate": 5.125805125393998e-06, + "loss": 0.2088, + "step": 36318 + }, + { + "epoch": 2.0926071622354856, + "grad_norm": 4.420228481292725, + "learning_rate": 5.1256680827737425e-06, + "loss": 0.1587, + "step": 36319 + }, + { + "epoch": 2.0926207270754205, + "grad_norm": 5.232393741607666, + "learning_rate": 5.1255310401534885e-06, + "loss": 0.151, + "step": 36320 + }, + { + "epoch": 2.0926342919153553, + "grad_norm": 6.650683879852295, + "learning_rate": 5.125393997533234e-06, + "loss": 0.191, + "step": 36321 + }, + { + "epoch": 2.09264785675529, + "grad_norm": 7.6021623611450195, + "learning_rate": 5.125256954912978e-06, + "loss": 0.23, + "step": 36322 + }, + { + "epoch": 2.092661421595225, + "grad_norm": 5.252205848693848, + "learning_rate": 5.125119912292723e-06, + "loss": 0.1703, + "step": 36323 + }, + { + "epoch": 2.09267498643516, + "grad_norm": 5.4203643798828125, + "learning_rate": 5.124982869672469e-06, + "loss": 0.2428, + "step": 36324 + }, + { + "epoch": 2.092688551275095, + "grad_norm": 4.93168830871582, + "learning_rate": 5.124845827052214e-06, + "loss": 0.1682, + "step": 36325 + }, + { + "epoch": 2.0927021161150297, + "grad_norm": 5.680068492889404, + "learning_rate": 5.124708784431959e-06, + "loss": 0.2849, + "step": 36326 + }, + { + "epoch": 2.0927156809549645, + "grad_norm": 5.59302282333374, + "learning_rate": 5.124571741811704e-06, + "loss": 0.1792, + "step": 36327 + }, + { + "epoch": 2.0927292457949, + "grad_norm": 4.851034641265869, + "learning_rate": 5.124434699191448e-06, + "loss": 0.1767, + "step": 36328 + }, + { + "epoch": 2.0927428106348347, + "grad_norm": 7.0747599601745605, + "learning_rate": 5.124297656571194e-06, + "loss": 0.1906, + "step": 36329 + }, + { + "epoch": 2.0927563754747696, + "grad_norm": 4.771688461303711, + "learning_rate": 5.1241606139509395e-06, + "loss": 0.186, + "step": 36330 + }, + { + "epoch": 2.0927699403147044, + "grad_norm": 4.885480880737305, + "learning_rate": 5.124023571330684e-06, + "loss": 0.1695, + "step": 36331 + }, + { + "epoch": 2.0927835051546393, + "grad_norm": 4.451913833618164, + "learning_rate": 5.123886528710429e-06, + "loss": 0.1768, + "step": 36332 + }, + { + "epoch": 2.092797069994574, + "grad_norm": 4.416203498840332, + "learning_rate": 5.123749486090175e-06, + "loss": 0.1803, + "step": 36333 + }, + { + "epoch": 2.092810634834509, + "grad_norm": 5.4892168045043945, + "learning_rate": 5.123612443469919e-06, + "loss": 0.2993, + "step": 36334 + }, + { + "epoch": 2.092824199674444, + "grad_norm": 5.017806053161621, + "learning_rate": 5.1234754008496645e-06, + "loss": 0.2042, + "step": 36335 + }, + { + "epoch": 2.0928377645143788, + "grad_norm": 8.09714412689209, + "learning_rate": 5.12333835822941e-06, + "loss": 0.2895, + "step": 36336 + }, + { + "epoch": 2.0928513293543136, + "grad_norm": 3.8152191638946533, + "learning_rate": 5.123201315609156e-06, + "loss": 0.0918, + "step": 36337 + }, + { + "epoch": 2.0928648941942485, + "grad_norm": 4.390310287475586, + "learning_rate": 5.1230642729889e-06, + "loss": 0.2256, + "step": 36338 + }, + { + "epoch": 2.0928784590341833, + "grad_norm": 3.8957574367523193, + "learning_rate": 5.122927230368645e-06, + "loss": 0.1629, + "step": 36339 + }, + { + "epoch": 2.092892023874118, + "grad_norm": 4.667967796325684, + "learning_rate": 5.12279018774839e-06, + "loss": 0.1811, + "step": 36340 + }, + { + "epoch": 2.092905588714053, + "grad_norm": 3.147054433822632, + "learning_rate": 5.122653145128135e-06, + "loss": 0.0977, + "step": 36341 + }, + { + "epoch": 2.092919153553988, + "grad_norm": 5.552811622619629, + "learning_rate": 5.122516102507881e-06, + "loss": 0.2371, + "step": 36342 + }, + { + "epoch": 2.092932718393923, + "grad_norm": 5.251802921295166, + "learning_rate": 5.122379059887625e-06, + "loss": 0.1792, + "step": 36343 + }, + { + "epoch": 2.0929462832338577, + "grad_norm": 5.7299652099609375, + "learning_rate": 5.12224201726737e-06, + "loss": 0.1725, + "step": 36344 + }, + { + "epoch": 2.0929598480737925, + "grad_norm": 4.291919231414795, + "learning_rate": 5.1221049746471155e-06, + "loss": 0.2278, + "step": 36345 + }, + { + "epoch": 2.0929734129137274, + "grad_norm": 4.7223639488220215, + "learning_rate": 5.1219679320268615e-06, + "loss": 0.1925, + "step": 36346 + }, + { + "epoch": 2.0929869777536627, + "grad_norm": 7.129335403442383, + "learning_rate": 5.121830889406606e-06, + "loss": 0.2135, + "step": 36347 + }, + { + "epoch": 2.0930005425935976, + "grad_norm": 5.741975784301758, + "learning_rate": 5.121693846786351e-06, + "loss": 0.2784, + "step": 36348 + }, + { + "epoch": 2.0930141074335324, + "grad_norm": 4.617368698120117, + "learning_rate": 5.121556804166095e-06, + "loss": 0.2483, + "step": 36349 + }, + { + "epoch": 2.0930276722734673, + "grad_norm": 6.561962127685547, + "learning_rate": 5.121419761545841e-06, + "loss": 0.3276, + "step": 36350 + }, + { + "epoch": 2.093041237113402, + "grad_norm": 6.13238525390625, + "learning_rate": 5.1212827189255866e-06, + "loss": 0.2952, + "step": 36351 + }, + { + "epoch": 2.093054801953337, + "grad_norm": 5.728820323944092, + "learning_rate": 5.121145676305332e-06, + "loss": 0.2801, + "step": 36352 + }, + { + "epoch": 2.093068366793272, + "grad_norm": 5.7817792892456055, + "learning_rate": 5.121008633685076e-06, + "loss": 0.2537, + "step": 36353 + }, + { + "epoch": 2.0930819316332068, + "grad_norm": 6.432260513305664, + "learning_rate": 5.120871591064822e-06, + "loss": 0.3188, + "step": 36354 + }, + { + "epoch": 2.0930954964731416, + "grad_norm": 4.8390913009643555, + "learning_rate": 5.120734548444567e-06, + "loss": 0.1789, + "step": 36355 + }, + { + "epoch": 2.0931090613130765, + "grad_norm": 6.032914638519287, + "learning_rate": 5.120597505824312e-06, + "loss": 0.245, + "step": 36356 + }, + { + "epoch": 2.0931226261530114, + "grad_norm": 6.433894157409668, + "learning_rate": 5.120460463204057e-06, + "loss": 0.2467, + "step": 36357 + }, + { + "epoch": 2.0931361909929462, + "grad_norm": 5.131703853607178, + "learning_rate": 5.120323420583801e-06, + "loss": 0.2151, + "step": 36358 + }, + { + "epoch": 2.093149755832881, + "grad_norm": 5.471070289611816, + "learning_rate": 5.120186377963547e-06, + "loss": 0.2771, + "step": 36359 + }, + { + "epoch": 2.093163320672816, + "grad_norm": 6.0554327964782715, + "learning_rate": 5.120049335343292e-06, + "loss": 0.1754, + "step": 36360 + }, + { + "epoch": 2.093176885512751, + "grad_norm": 6.930372714996338, + "learning_rate": 5.1199122927230375e-06, + "loss": 0.3133, + "step": 36361 + }, + { + "epoch": 2.0931904503526857, + "grad_norm": 7.3701324462890625, + "learning_rate": 5.119775250102782e-06, + "loss": 0.3235, + "step": 36362 + }, + { + "epoch": 2.0932040151926206, + "grad_norm": 3.2813708782196045, + "learning_rate": 5.119638207482528e-06, + "loss": 0.1632, + "step": 36363 + }, + { + "epoch": 2.0932175800325554, + "grad_norm": 5.4922308921813965, + "learning_rate": 5.119501164862273e-06, + "loss": 0.2347, + "step": 36364 + }, + { + "epoch": 2.0932311448724903, + "grad_norm": 4.608667373657227, + "learning_rate": 5.119364122242017e-06, + "loss": 0.2276, + "step": 36365 + }, + { + "epoch": 2.0932447097124256, + "grad_norm": 6.587433338165283, + "learning_rate": 5.1192270796217625e-06, + "loss": 0.3113, + "step": 36366 + }, + { + "epoch": 2.0932582745523605, + "grad_norm": 3.227418899536133, + "learning_rate": 5.1190900370015086e-06, + "loss": 0.1515, + "step": 36367 + }, + { + "epoch": 2.0932718393922953, + "grad_norm": 5.331503391265869, + "learning_rate": 5.118952994381253e-06, + "loss": 0.3233, + "step": 36368 + }, + { + "epoch": 2.09328540423223, + "grad_norm": 6.930660247802734, + "learning_rate": 5.118815951760998e-06, + "loss": 0.4187, + "step": 36369 + }, + { + "epoch": 2.093298969072165, + "grad_norm": 7.0789923667907715, + "learning_rate": 5.118678909140743e-06, + "loss": 0.3368, + "step": 36370 + }, + { + "epoch": 2.0933125339121, + "grad_norm": 5.300539493560791, + "learning_rate": 5.118541866520488e-06, + "loss": 0.3529, + "step": 36371 + }, + { + "epoch": 2.093326098752035, + "grad_norm": 5.184312343597412, + "learning_rate": 5.118404823900234e-06, + "loss": 0.1413, + "step": 36372 + }, + { + "epoch": 2.0933396635919697, + "grad_norm": 4.775303363800049, + "learning_rate": 5.118267781279979e-06, + "loss": 0.2489, + "step": 36373 + }, + { + "epoch": 2.0933532284319045, + "grad_norm": 4.390860557556152, + "learning_rate": 5.118130738659723e-06, + "loss": 0.2111, + "step": 36374 + }, + { + "epoch": 2.0933667932718394, + "grad_norm": 6.764459609985352, + "learning_rate": 5.117993696039468e-06, + "loss": 0.207, + "step": 36375 + }, + { + "epoch": 2.0933803581117743, + "grad_norm": 3.9513847827911377, + "learning_rate": 5.117856653419214e-06, + "loss": 0.2142, + "step": 36376 + }, + { + "epoch": 2.093393922951709, + "grad_norm": 5.495826721191406, + "learning_rate": 5.117719610798959e-06, + "loss": 0.2547, + "step": 36377 + }, + { + "epoch": 2.093407487791644, + "grad_norm": 5.0547194480896, + "learning_rate": 5.117582568178704e-06, + "loss": 0.2633, + "step": 36378 + }, + { + "epoch": 2.093421052631579, + "grad_norm": 6.034143447875977, + "learning_rate": 5.117445525558449e-06, + "loss": 0.2373, + "step": 36379 + }, + { + "epoch": 2.0934346174715137, + "grad_norm": 6.301210880279541, + "learning_rate": 5.117308482938195e-06, + "loss": 0.3967, + "step": 36380 + }, + { + "epoch": 2.0934481823114486, + "grad_norm": 3.8746962547302246, + "learning_rate": 5.117171440317939e-06, + "loss": 0.1276, + "step": 36381 + }, + { + "epoch": 2.0934617471513834, + "grad_norm": 5.300254821777344, + "learning_rate": 5.1170343976976846e-06, + "loss": 0.3028, + "step": 36382 + }, + { + "epoch": 2.0934753119913183, + "grad_norm": 5.433785438537598, + "learning_rate": 5.116897355077429e-06, + "loss": 0.298, + "step": 36383 + }, + { + "epoch": 2.0934888768312536, + "grad_norm": 4.096948623657227, + "learning_rate": 5.116760312457174e-06, + "loss": 0.1617, + "step": 36384 + }, + { + "epoch": 2.0935024416711885, + "grad_norm": 4.361498832702637, + "learning_rate": 5.11662326983692e-06, + "loss": 0.2121, + "step": 36385 + }, + { + "epoch": 2.0935160065111234, + "grad_norm": 4.288690567016602, + "learning_rate": 5.116486227216665e-06, + "loss": 0.1414, + "step": 36386 + }, + { + "epoch": 2.093529571351058, + "grad_norm": 6.395524501800537, + "learning_rate": 5.11634918459641e-06, + "loss": 0.1575, + "step": 36387 + }, + { + "epoch": 2.093543136190993, + "grad_norm": 5.237772464752197, + "learning_rate": 5.116212141976155e-06, + "loss": 0.1796, + "step": 36388 + }, + { + "epoch": 2.093556701030928, + "grad_norm": 5.567746162414551, + "learning_rate": 5.116075099355901e-06, + "loss": 0.1806, + "step": 36389 + }, + { + "epoch": 2.093570265870863, + "grad_norm": 4.748943328857422, + "learning_rate": 5.115938056735645e-06, + "loss": 0.2145, + "step": 36390 + }, + { + "epoch": 2.0935838307107977, + "grad_norm": 5.364665985107422, + "learning_rate": 5.11580101411539e-06, + "loss": 0.286, + "step": 36391 + }, + { + "epoch": 2.0935973955507325, + "grad_norm": 5.615705490112305, + "learning_rate": 5.115663971495135e-06, + "loss": 0.3098, + "step": 36392 + }, + { + "epoch": 2.0936109603906674, + "grad_norm": 5.043832778930664, + "learning_rate": 5.115526928874881e-06, + "loss": 0.2645, + "step": 36393 + }, + { + "epoch": 2.0936245252306023, + "grad_norm": 4.518285751342773, + "learning_rate": 5.115389886254626e-06, + "loss": 0.298, + "step": 36394 + }, + { + "epoch": 2.093638090070537, + "grad_norm": 4.161536693572998, + "learning_rate": 5.115252843634371e-06, + "loss": 0.183, + "step": 36395 + }, + { + "epoch": 2.093651654910472, + "grad_norm": 3.2932496070861816, + "learning_rate": 5.115115801014115e-06, + "loss": 0.1952, + "step": 36396 + }, + { + "epoch": 2.093665219750407, + "grad_norm": 5.9283552169799805, + "learning_rate": 5.1149787583938605e-06, + "loss": 0.3185, + "step": 36397 + }, + { + "epoch": 2.0936787845903417, + "grad_norm": 3.6715219020843506, + "learning_rate": 5.1148417157736066e-06, + "loss": 0.1539, + "step": 36398 + }, + { + "epoch": 2.0936923494302766, + "grad_norm": 4.109351634979248, + "learning_rate": 5.114704673153351e-06, + "loss": 0.2038, + "step": 36399 + }, + { + "epoch": 2.0937059142702115, + "grad_norm": 7.497345924377441, + "learning_rate": 5.114567630533096e-06, + "loss": 0.2283, + "step": 36400 + }, + { + "epoch": 2.0937194791101463, + "grad_norm": 6.082578182220459, + "learning_rate": 5.114430587912841e-06, + "loss": 0.1988, + "step": 36401 + }, + { + "epoch": 2.093733043950081, + "grad_norm": 4.758437156677246, + "learning_rate": 5.1142935452925864e-06, + "loss": 0.1945, + "step": 36402 + }, + { + "epoch": 2.093746608790016, + "grad_norm": 5.858137130737305, + "learning_rate": 5.114156502672332e-06, + "loss": 0.2489, + "step": 36403 + }, + { + "epoch": 2.0937601736299514, + "grad_norm": 6.130527019500732, + "learning_rate": 5.114019460052077e-06, + "loss": 0.2326, + "step": 36404 + }, + { + "epoch": 2.0937737384698862, + "grad_norm": 4.057469367980957, + "learning_rate": 5.113882417431821e-06, + "loss": 0.2238, + "step": 36405 + }, + { + "epoch": 2.093787303309821, + "grad_norm": 6.773895740509033, + "learning_rate": 5.113745374811567e-06, + "loss": 0.2903, + "step": 36406 + }, + { + "epoch": 2.093800868149756, + "grad_norm": 4.426364421844482, + "learning_rate": 5.113608332191312e-06, + "loss": 0.2274, + "step": 36407 + }, + { + "epoch": 2.093814432989691, + "grad_norm": 3.990548849105835, + "learning_rate": 5.113471289571057e-06, + "loss": 0.1833, + "step": 36408 + }, + { + "epoch": 2.0938279978296257, + "grad_norm": 5.370026588439941, + "learning_rate": 5.113334246950802e-06, + "loss": 0.2014, + "step": 36409 + }, + { + "epoch": 2.0938415626695606, + "grad_norm": 5.216211318969727, + "learning_rate": 5.113197204330547e-06, + "loss": 0.2087, + "step": 36410 + }, + { + "epoch": 2.0938551275094954, + "grad_norm": 6.981297492980957, + "learning_rate": 5.113060161710292e-06, + "loss": 0.3455, + "step": 36411 + }, + { + "epoch": 2.0938686923494303, + "grad_norm": 5.783673286437988, + "learning_rate": 5.112923119090037e-06, + "loss": 0.327, + "step": 36412 + }, + { + "epoch": 2.093882257189365, + "grad_norm": 4.923280715942383, + "learning_rate": 5.1127860764697826e-06, + "loss": 0.2033, + "step": 36413 + }, + { + "epoch": 2.0938958220293, + "grad_norm": 4.393435478210449, + "learning_rate": 5.112649033849527e-06, + "loss": 0.1635, + "step": 36414 + }, + { + "epoch": 2.093909386869235, + "grad_norm": 6.386752128601074, + "learning_rate": 5.112511991229273e-06, + "loss": 0.2091, + "step": 36415 + }, + { + "epoch": 2.0939229517091698, + "grad_norm": 4.1265482902526855, + "learning_rate": 5.112374948609018e-06, + "loss": 0.1471, + "step": 36416 + }, + { + "epoch": 2.0939365165491046, + "grad_norm": 4.788018703460693, + "learning_rate": 5.1122379059887624e-06, + "loss": 0.1752, + "step": 36417 + }, + { + "epoch": 2.0939500813890395, + "grad_norm": 5.459280014038086, + "learning_rate": 5.112100863368508e-06, + "loss": 0.1557, + "step": 36418 + }, + { + "epoch": 2.0939636462289744, + "grad_norm": 6.440643787384033, + "learning_rate": 5.111963820748254e-06, + "loss": 0.2591, + "step": 36419 + }, + { + "epoch": 2.093977211068909, + "grad_norm": 5.671428203582764, + "learning_rate": 5.111826778127999e-06, + "loss": 0.2279, + "step": 36420 + }, + { + "epoch": 2.093990775908844, + "grad_norm": 5.57115364074707, + "learning_rate": 5.111689735507743e-06, + "loss": 0.2254, + "step": 36421 + }, + { + "epoch": 2.0940043407487794, + "grad_norm": 4.335434436798096, + "learning_rate": 5.111552692887488e-06, + "loss": 0.2647, + "step": 36422 + }, + { + "epoch": 2.0940179055887143, + "grad_norm": 4.177915096282959, + "learning_rate": 5.111415650267234e-06, + "loss": 0.3055, + "step": 36423 + }, + { + "epoch": 2.094031470428649, + "grad_norm": 5.118659019470215, + "learning_rate": 5.111278607646979e-06, + "loss": 0.1761, + "step": 36424 + }, + { + "epoch": 2.094045035268584, + "grad_norm": 4.896010875701904, + "learning_rate": 5.111141565026724e-06, + "loss": 0.2897, + "step": 36425 + }, + { + "epoch": 2.094058600108519, + "grad_norm": 4.195600986480713, + "learning_rate": 5.111004522406468e-06, + "loss": 0.1268, + "step": 36426 + }, + { + "epoch": 2.0940721649484537, + "grad_norm": 5.133702754974365, + "learning_rate": 5.110867479786213e-06, + "loss": 0.1988, + "step": 36427 + }, + { + "epoch": 2.0940857297883886, + "grad_norm": 3.8379874229431152, + "learning_rate": 5.110730437165959e-06, + "loss": 0.1144, + "step": 36428 + }, + { + "epoch": 2.0940992946283234, + "grad_norm": 4.637453556060791, + "learning_rate": 5.110593394545705e-06, + "loss": 0.1783, + "step": 36429 + }, + { + "epoch": 2.0941128594682583, + "grad_norm": 4.521214962005615, + "learning_rate": 5.110456351925449e-06, + "loss": 0.2198, + "step": 36430 + }, + { + "epoch": 2.094126424308193, + "grad_norm": 5.277384281158447, + "learning_rate": 5.110319309305194e-06, + "loss": 0.2964, + "step": 36431 + }, + { + "epoch": 2.094139989148128, + "grad_norm": 4.197881698608398, + "learning_rate": 5.11018226668494e-06, + "loss": 0.2236, + "step": 36432 + }, + { + "epoch": 2.094153553988063, + "grad_norm": 4.013148784637451, + "learning_rate": 5.1100452240646844e-06, + "loss": 0.184, + "step": 36433 + }, + { + "epoch": 2.0941671188279978, + "grad_norm": 5.200738906860352, + "learning_rate": 5.10990818144443e-06, + "loss": 0.2262, + "step": 36434 + }, + { + "epoch": 2.0941806836679326, + "grad_norm": 5.053796291351318, + "learning_rate": 5.109771138824175e-06, + "loss": 0.2314, + "step": 36435 + }, + { + "epoch": 2.0941942485078675, + "grad_norm": 5.836983680725098, + "learning_rate": 5.10963409620392e-06, + "loss": 0.255, + "step": 36436 + }, + { + "epoch": 2.0942078133478024, + "grad_norm": 3.822495937347412, + "learning_rate": 5.109497053583665e-06, + "loss": 0.1178, + "step": 36437 + }, + { + "epoch": 2.0942213781877372, + "grad_norm": 5.328802108764648, + "learning_rate": 5.10936001096341e-06, + "loss": 0.2061, + "step": 36438 + }, + { + "epoch": 2.094234943027672, + "grad_norm": 5.718576431274414, + "learning_rate": 5.109222968343155e-06, + "loss": 0.2607, + "step": 36439 + }, + { + "epoch": 2.094248507867607, + "grad_norm": 4.389863967895508, + "learning_rate": 5.1090859257229e-06, + "loss": 0.1702, + "step": 36440 + }, + { + "epoch": 2.094262072707542, + "grad_norm": 6.702631950378418, + "learning_rate": 5.108948883102646e-06, + "loss": 0.2448, + "step": 36441 + }, + { + "epoch": 2.094275637547477, + "grad_norm": 4.892337799072266, + "learning_rate": 5.10881184048239e-06, + "loss": 0.223, + "step": 36442 + }, + { + "epoch": 2.094289202387412, + "grad_norm": 4.711414337158203, + "learning_rate": 5.108674797862135e-06, + "loss": 0.1387, + "step": 36443 + }, + { + "epoch": 2.094302767227347, + "grad_norm": 3.918738842010498, + "learning_rate": 5.1085377552418806e-06, + "loss": 0.1246, + "step": 36444 + }, + { + "epoch": 2.0943163320672817, + "grad_norm": 4.532251834869385, + "learning_rate": 5.108400712621627e-06, + "loss": 0.1603, + "step": 36445 + }, + { + "epoch": 2.0943298969072166, + "grad_norm": 6.772305011749268, + "learning_rate": 5.108263670001371e-06, + "loss": 0.2475, + "step": 36446 + }, + { + "epoch": 2.0943434617471515, + "grad_norm": 3.6827304363250732, + "learning_rate": 5.108126627381116e-06, + "loss": 0.2054, + "step": 36447 + }, + { + "epoch": 2.0943570265870863, + "grad_norm": 5.07503604888916, + "learning_rate": 5.1079895847608604e-06, + "loss": 0.1754, + "step": 36448 + }, + { + "epoch": 2.094370591427021, + "grad_norm": 5.293680667877197, + "learning_rate": 5.1078525421406065e-06, + "loss": 0.1583, + "step": 36449 + }, + { + "epoch": 2.094384156266956, + "grad_norm": 4.444522857666016, + "learning_rate": 5.107715499520352e-06, + "loss": 0.1872, + "step": 36450 + }, + { + "epoch": 2.094397721106891, + "grad_norm": 5.736577033996582, + "learning_rate": 5.107578456900096e-06, + "loss": 0.1824, + "step": 36451 + }, + { + "epoch": 2.094411285946826, + "grad_norm": 4.884171009063721, + "learning_rate": 5.107441414279841e-06, + "loss": 0.2192, + "step": 36452 + }, + { + "epoch": 2.0944248507867607, + "grad_norm": 4.791146755218506, + "learning_rate": 5.107304371659586e-06, + "loss": 0.1242, + "step": 36453 + }, + { + "epoch": 2.0944384156266955, + "grad_norm": 6.023681640625, + "learning_rate": 5.107167329039332e-06, + "loss": 0.2294, + "step": 36454 + }, + { + "epoch": 2.0944519804666304, + "grad_norm": 4.62440299987793, + "learning_rate": 5.107030286419077e-06, + "loss": 0.1853, + "step": 36455 + }, + { + "epoch": 2.0944655453065653, + "grad_norm": 5.524590015411377, + "learning_rate": 5.106893243798822e-06, + "loss": 0.1766, + "step": 36456 + }, + { + "epoch": 2.0944791101465, + "grad_norm": 6.791815280914307, + "learning_rate": 5.106756201178566e-06, + "loss": 0.3617, + "step": 36457 + }, + { + "epoch": 2.094492674986435, + "grad_norm": 4.977296352386475, + "learning_rate": 5.106619158558312e-06, + "loss": 0.2154, + "step": 36458 + }, + { + "epoch": 2.09450623982637, + "grad_norm": 4.0940446853637695, + "learning_rate": 5.106482115938057e-06, + "loss": 0.1702, + "step": 36459 + }, + { + "epoch": 2.094519804666305, + "grad_norm": 5.956483364105225, + "learning_rate": 5.106345073317803e-06, + "loss": 0.181, + "step": 36460 + }, + { + "epoch": 2.09453336950624, + "grad_norm": 3.5797643661499023, + "learning_rate": 5.106208030697547e-06, + "loss": 0.1011, + "step": 36461 + }, + { + "epoch": 2.094546934346175, + "grad_norm": 4.428463935852051, + "learning_rate": 5.106070988077293e-06, + "loss": 0.2241, + "step": 36462 + }, + { + "epoch": 2.0945604991861098, + "grad_norm": 5.762874126434326, + "learning_rate": 5.105933945457038e-06, + "loss": 0.2555, + "step": 36463 + }, + { + "epoch": 2.0945740640260446, + "grad_norm": 5.5073394775390625, + "learning_rate": 5.1057969028367825e-06, + "loss": 0.2427, + "step": 36464 + }, + { + "epoch": 2.0945876288659795, + "grad_norm": 5.041779041290283, + "learning_rate": 5.105659860216528e-06, + "loss": 0.2549, + "step": 36465 + }, + { + "epoch": 2.0946011937059144, + "grad_norm": 5.313963890075684, + "learning_rate": 5.105522817596272e-06, + "loss": 0.2633, + "step": 36466 + }, + { + "epoch": 2.094614758545849, + "grad_norm": 3.457404375076294, + "learning_rate": 5.105385774976018e-06, + "loss": 0.0994, + "step": 36467 + }, + { + "epoch": 2.094628323385784, + "grad_norm": 4.65662145614624, + "learning_rate": 5.105248732355763e-06, + "loss": 0.1545, + "step": 36468 + }, + { + "epoch": 2.094641888225719, + "grad_norm": 4.499513626098633, + "learning_rate": 5.105111689735508e-06, + "loss": 0.256, + "step": 36469 + }, + { + "epoch": 2.094655453065654, + "grad_norm": 3.9437057971954346, + "learning_rate": 5.104974647115253e-06, + "loss": 0.1193, + "step": 36470 + }, + { + "epoch": 2.0946690179055887, + "grad_norm": 5.6096601486206055, + "learning_rate": 5.104837604494999e-06, + "loss": 0.2576, + "step": 36471 + }, + { + "epoch": 2.0946825827455235, + "grad_norm": 5.410391330718994, + "learning_rate": 5.104700561874744e-06, + "loss": 0.2173, + "step": 36472 + }, + { + "epoch": 2.0946961475854584, + "grad_norm": 4.271038055419922, + "learning_rate": 5.104563519254488e-06, + "loss": 0.1556, + "step": 36473 + }, + { + "epoch": 2.0947097124253933, + "grad_norm": 4.600406169891357, + "learning_rate": 5.104426476634233e-06, + "loss": 0.1597, + "step": 36474 + }, + { + "epoch": 2.094723277265328, + "grad_norm": 4.587945461273193, + "learning_rate": 5.104289434013979e-06, + "loss": 0.2393, + "step": 36475 + }, + { + "epoch": 2.094736842105263, + "grad_norm": 4.068012237548828, + "learning_rate": 5.104152391393724e-06, + "loss": 0.1564, + "step": 36476 + }, + { + "epoch": 2.094750406945198, + "grad_norm": 5.213968276977539, + "learning_rate": 5.104015348773469e-06, + "loss": 0.2148, + "step": 36477 + }, + { + "epoch": 2.0947639717851327, + "grad_norm": 4.116811275482178, + "learning_rate": 5.103878306153214e-06, + "loss": 0.1184, + "step": 36478 + }, + { + "epoch": 2.0947775366250676, + "grad_norm": 3.9340147972106934, + "learning_rate": 5.1037412635329584e-06, + "loss": 0.1361, + "step": 36479 + }, + { + "epoch": 2.094791101465003, + "grad_norm": 3.6926777362823486, + "learning_rate": 5.1036042209127045e-06, + "loss": 0.0978, + "step": 36480 + }, + { + "epoch": 2.094804666304938, + "grad_norm": 4.325023174285889, + "learning_rate": 5.10346717829245e-06, + "loss": 0.2364, + "step": 36481 + }, + { + "epoch": 2.0948182311448726, + "grad_norm": 3.1754372119903564, + "learning_rate": 5.103330135672194e-06, + "loss": 0.1543, + "step": 36482 + }, + { + "epoch": 2.0948317959848075, + "grad_norm": 5.317481517791748, + "learning_rate": 5.103193093051939e-06, + "loss": 0.1567, + "step": 36483 + }, + { + "epoch": 2.0948453608247424, + "grad_norm": 4.08096170425415, + "learning_rate": 5.103056050431685e-06, + "loss": 0.1343, + "step": 36484 + }, + { + "epoch": 2.0948589256646772, + "grad_norm": 5.268133640289307, + "learning_rate": 5.1029190078114295e-06, + "loss": 0.2422, + "step": 36485 + }, + { + "epoch": 2.094872490504612, + "grad_norm": 5.910648822784424, + "learning_rate": 5.102781965191175e-06, + "loss": 0.1954, + "step": 36486 + }, + { + "epoch": 2.094886055344547, + "grad_norm": 5.907496452331543, + "learning_rate": 5.10264492257092e-06, + "loss": 0.1714, + "step": 36487 + }, + { + "epoch": 2.094899620184482, + "grad_norm": 5.353236675262451, + "learning_rate": 5.102507879950666e-06, + "loss": 0.2149, + "step": 36488 + }, + { + "epoch": 2.0949131850244167, + "grad_norm": 4.690246105194092, + "learning_rate": 5.10237083733041e-06, + "loss": 0.1342, + "step": 36489 + }, + { + "epoch": 2.0949267498643516, + "grad_norm": 4.405946731567383, + "learning_rate": 5.102233794710155e-06, + "loss": 0.1661, + "step": 36490 + }, + { + "epoch": 2.0949403147042864, + "grad_norm": 4.213315963745117, + "learning_rate": 5.1020967520899e-06, + "loss": 0.1371, + "step": 36491 + }, + { + "epoch": 2.0949538795442213, + "grad_norm": 4.1248602867126465, + "learning_rate": 5.101959709469646e-06, + "loss": 0.1836, + "step": 36492 + }, + { + "epoch": 2.094967444384156, + "grad_norm": 4.150331020355225, + "learning_rate": 5.101822666849391e-06, + "loss": 0.1971, + "step": 36493 + }, + { + "epoch": 2.094981009224091, + "grad_norm": 4.9573822021484375, + "learning_rate": 5.101685624229136e-06, + "loss": 0.1167, + "step": 36494 + }, + { + "epoch": 2.094994574064026, + "grad_norm": 4.665413856506348, + "learning_rate": 5.1015485816088805e-06, + "loss": 0.2231, + "step": 36495 + }, + { + "epoch": 2.0950081389039608, + "grad_norm": 3.6579999923706055, + "learning_rate": 5.101411538988626e-06, + "loss": 0.1724, + "step": 36496 + }, + { + "epoch": 2.0950217037438956, + "grad_norm": 6.173638343811035, + "learning_rate": 5.101274496368372e-06, + "loss": 0.1627, + "step": 36497 + }, + { + "epoch": 2.095035268583831, + "grad_norm": 4.326790809631348, + "learning_rate": 5.101137453748116e-06, + "loss": 0.1817, + "step": 36498 + }, + { + "epoch": 2.095048833423766, + "grad_norm": 6.06634521484375, + "learning_rate": 5.101000411127861e-06, + "loss": 0.252, + "step": 36499 + }, + { + "epoch": 2.0950623982637007, + "grad_norm": 3.6297388076782227, + "learning_rate": 5.1008633685076055e-06, + "loss": 0.0913, + "step": 36500 + }, + { + "epoch": 2.0950759631036355, + "grad_norm": 3.857154369354248, + "learning_rate": 5.1007263258873515e-06, + "loss": 0.105, + "step": 36501 + }, + { + "epoch": 2.0950895279435704, + "grad_norm": 4.190481662750244, + "learning_rate": 5.100589283267097e-06, + "loss": 0.1553, + "step": 36502 + }, + { + "epoch": 2.0951030927835053, + "grad_norm": 6.606911659240723, + "learning_rate": 5.100452240646842e-06, + "loss": 0.2285, + "step": 36503 + }, + { + "epoch": 2.09511665762344, + "grad_norm": 4.786945819854736, + "learning_rate": 5.100315198026586e-06, + "loss": 0.0742, + "step": 36504 + }, + { + "epoch": 2.095130222463375, + "grad_norm": 3.80010724067688, + "learning_rate": 5.100178155406332e-06, + "loss": 0.1343, + "step": 36505 + }, + { + "epoch": 2.09514378730331, + "grad_norm": 5.676063060760498, + "learning_rate": 5.100041112786077e-06, + "loss": 0.1992, + "step": 36506 + }, + { + "epoch": 2.0951573521432447, + "grad_norm": 5.130107402801514, + "learning_rate": 5.099904070165822e-06, + "loss": 0.291, + "step": 36507 + }, + { + "epoch": 2.0951709169831796, + "grad_norm": 4.461615562438965, + "learning_rate": 5.099767027545567e-06, + "loss": 0.2001, + "step": 36508 + }, + { + "epoch": 2.0951844818231145, + "grad_norm": 5.519401550292969, + "learning_rate": 5.099629984925312e-06, + "loss": 0.2696, + "step": 36509 + }, + { + "epoch": 2.0951980466630493, + "grad_norm": 4.556848049163818, + "learning_rate": 5.099492942305057e-06, + "loss": 0.1152, + "step": 36510 + }, + { + "epoch": 2.095211611502984, + "grad_norm": 4.498539924621582, + "learning_rate": 5.0993558996848025e-06, + "loss": 0.2017, + "step": 36511 + }, + { + "epoch": 2.095225176342919, + "grad_norm": 4.09421443939209, + "learning_rate": 5.099218857064548e-06, + "loss": 0.1754, + "step": 36512 + }, + { + "epoch": 2.095238741182854, + "grad_norm": 4.015405654907227, + "learning_rate": 5.099081814444292e-06, + "loss": 0.1195, + "step": 36513 + }, + { + "epoch": 2.095252306022789, + "grad_norm": 4.352156162261963, + "learning_rate": 5.098944771824038e-06, + "loss": 0.248, + "step": 36514 + }, + { + "epoch": 2.0952658708627236, + "grad_norm": 3.8991549015045166, + "learning_rate": 5.098807729203783e-06, + "loss": 0.1344, + "step": 36515 + }, + { + "epoch": 2.0952794357026585, + "grad_norm": 4.8872199058532715, + "learning_rate": 5.0986706865835275e-06, + "loss": 0.1628, + "step": 36516 + }, + { + "epoch": 2.0952930005425934, + "grad_norm": 3.8255770206451416, + "learning_rate": 5.098533643963273e-06, + "loss": 0.0973, + "step": 36517 + }, + { + "epoch": 2.0953065653825287, + "grad_norm": 4.820638179779053, + "learning_rate": 5.098396601343019e-06, + "loss": 0.165, + "step": 36518 + }, + { + "epoch": 2.0953201302224636, + "grad_norm": 6.168221473693848, + "learning_rate": 5.098259558722763e-06, + "loss": 0.2656, + "step": 36519 + }, + { + "epoch": 2.0953336950623984, + "grad_norm": 3.403229236602783, + "learning_rate": 5.098122516102508e-06, + "loss": 0.1763, + "step": 36520 + }, + { + "epoch": 2.0953472599023333, + "grad_norm": 4.411746978759766, + "learning_rate": 5.097985473482253e-06, + "loss": 0.1442, + "step": 36521 + }, + { + "epoch": 2.095360824742268, + "grad_norm": 4.782623291015625, + "learning_rate": 5.097848430861998e-06, + "loss": 0.1813, + "step": 36522 + }, + { + "epoch": 2.095374389582203, + "grad_norm": 5.142375469207764, + "learning_rate": 5.097711388241744e-06, + "loss": 0.2154, + "step": 36523 + }, + { + "epoch": 2.095387954422138, + "grad_norm": 4.230426788330078, + "learning_rate": 5.097574345621489e-06, + "loss": 0.1933, + "step": 36524 + }, + { + "epoch": 2.0954015192620727, + "grad_norm": 3.6093406677246094, + "learning_rate": 5.097437303001233e-06, + "loss": 0.1508, + "step": 36525 + }, + { + "epoch": 2.0954150841020076, + "grad_norm": 4.321255207061768, + "learning_rate": 5.0973002603809785e-06, + "loss": 0.141, + "step": 36526 + }, + { + "epoch": 2.0954286489419425, + "grad_norm": 5.012269020080566, + "learning_rate": 5.0971632177607245e-06, + "loss": 0.2215, + "step": 36527 + }, + { + "epoch": 2.0954422137818773, + "grad_norm": 4.773229598999023, + "learning_rate": 5.09702617514047e-06, + "loss": 0.2049, + "step": 36528 + }, + { + "epoch": 2.095455778621812, + "grad_norm": 5.227787494659424, + "learning_rate": 5.096889132520214e-06, + "loss": 0.2147, + "step": 36529 + }, + { + "epoch": 2.095469343461747, + "grad_norm": 5.169530391693115, + "learning_rate": 5.096752089899959e-06, + "loss": 0.1333, + "step": 36530 + }, + { + "epoch": 2.095482908301682, + "grad_norm": 4.004429340362549, + "learning_rate": 5.096615047279705e-06, + "loss": 0.1775, + "step": 36531 + }, + { + "epoch": 2.095496473141617, + "grad_norm": 6.2724223136901855, + "learning_rate": 5.0964780046594495e-06, + "loss": 0.2884, + "step": 36532 + }, + { + "epoch": 2.0955100379815517, + "grad_norm": 4.624467372894287, + "learning_rate": 5.096340962039195e-06, + "loss": 0.2019, + "step": 36533 + }, + { + "epoch": 2.0955236028214865, + "grad_norm": 4.937800884246826, + "learning_rate": 5.096203919418939e-06, + "loss": 0.1505, + "step": 36534 + }, + { + "epoch": 2.0955371676614214, + "grad_norm": 4.395092964172363, + "learning_rate": 5.096066876798684e-06, + "loss": 0.1711, + "step": 36535 + }, + { + "epoch": 2.0955507325013567, + "grad_norm": 6.32219123840332, + "learning_rate": 5.09592983417843e-06, + "loss": 0.2023, + "step": 36536 + }, + { + "epoch": 2.0955642973412916, + "grad_norm": 4.719010829925537, + "learning_rate": 5.0957927915581754e-06, + "loss": 0.2343, + "step": 36537 + }, + { + "epoch": 2.0955778621812264, + "grad_norm": 6.61615514755249, + "learning_rate": 5.09565574893792e-06, + "loss": 0.2153, + "step": 36538 + }, + { + "epoch": 2.0955914270211613, + "grad_norm": 7.559816360473633, + "learning_rate": 5.095518706317665e-06, + "loss": 0.1474, + "step": 36539 + }, + { + "epoch": 2.095604991861096, + "grad_norm": 4.942541599273682, + "learning_rate": 5.095381663697411e-06, + "loss": 0.1867, + "step": 36540 + }, + { + "epoch": 2.095618556701031, + "grad_norm": 5.787927150726318, + "learning_rate": 5.095244621077155e-06, + "loss": 0.321, + "step": 36541 + }, + { + "epoch": 2.095632121540966, + "grad_norm": 4.618238925933838, + "learning_rate": 5.0951075784569005e-06, + "loss": 0.1553, + "step": 36542 + }, + { + "epoch": 2.0956456863809008, + "grad_norm": 4.19481897354126, + "learning_rate": 5.094970535836646e-06, + "loss": 0.1267, + "step": 36543 + }, + { + "epoch": 2.0956592512208356, + "grad_norm": 6.239037990570068, + "learning_rate": 5.094833493216391e-06, + "loss": 0.2619, + "step": 36544 + }, + { + "epoch": 2.0956728160607705, + "grad_norm": 5.946385860443115, + "learning_rate": 5.094696450596136e-06, + "loss": 0.1987, + "step": 36545 + }, + { + "epoch": 2.0956863809007054, + "grad_norm": 5.345183849334717, + "learning_rate": 5.094559407975881e-06, + "loss": 0.1987, + "step": 36546 + }, + { + "epoch": 2.0956999457406402, + "grad_norm": 5.375628471374512, + "learning_rate": 5.0944223653556255e-06, + "loss": 0.2197, + "step": 36547 + }, + { + "epoch": 2.095713510580575, + "grad_norm": 3.725847005844116, + "learning_rate": 5.094285322735371e-06, + "loss": 0.1716, + "step": 36548 + }, + { + "epoch": 2.09572707542051, + "grad_norm": 6.280081748962402, + "learning_rate": 5.094148280115117e-06, + "loss": 0.244, + "step": 36549 + }, + { + "epoch": 2.095740640260445, + "grad_norm": 5.2712578773498535, + "learning_rate": 5.094011237494861e-06, + "loss": 0.3657, + "step": 36550 + }, + { + "epoch": 2.0957542051003797, + "grad_norm": 4.509661674499512, + "learning_rate": 5.093874194874606e-06, + "loss": 0.2267, + "step": 36551 + }, + { + "epoch": 2.0957677699403146, + "grad_norm": 5.030352592468262, + "learning_rate": 5.093737152254351e-06, + "loss": 0.2149, + "step": 36552 + }, + { + "epoch": 2.0957813347802494, + "grad_norm": 6.449043273925781, + "learning_rate": 5.093600109634097e-06, + "loss": 0.242, + "step": 36553 + }, + { + "epoch": 2.0957948996201843, + "grad_norm": 5.324206352233887, + "learning_rate": 5.093463067013842e-06, + "loss": 0.3137, + "step": 36554 + }, + { + "epoch": 2.095808464460119, + "grad_norm": 6.203188896179199, + "learning_rate": 5.093326024393587e-06, + "loss": 0.2217, + "step": 36555 + }, + { + "epoch": 2.0958220293000545, + "grad_norm": 5.30579948425293, + "learning_rate": 5.093188981773331e-06, + "loss": 0.2739, + "step": 36556 + }, + { + "epoch": 2.0958355941399893, + "grad_norm": 6.565138339996338, + "learning_rate": 5.093051939153077e-06, + "loss": 0.3166, + "step": 36557 + }, + { + "epoch": 2.095849158979924, + "grad_norm": 6.589541435241699, + "learning_rate": 5.0929148965328225e-06, + "loss": 0.4237, + "step": 36558 + }, + { + "epoch": 2.095862723819859, + "grad_norm": 4.360292434692383, + "learning_rate": 5.092777853912567e-06, + "loss": 0.2349, + "step": 36559 + }, + { + "epoch": 2.095876288659794, + "grad_norm": 3.7301025390625, + "learning_rate": 5.092640811292312e-06, + "loss": 0.2046, + "step": 36560 + }, + { + "epoch": 2.095889853499729, + "grad_norm": 6.146653652191162, + "learning_rate": 5.092503768672058e-06, + "loss": 0.3674, + "step": 36561 + }, + { + "epoch": 2.0959034183396636, + "grad_norm": 7.75732421875, + "learning_rate": 5.092366726051803e-06, + "loss": 0.261, + "step": 36562 + }, + { + "epoch": 2.0959169831795985, + "grad_norm": 5.191887378692627, + "learning_rate": 5.0922296834315475e-06, + "loss": 0.2203, + "step": 36563 + }, + { + "epoch": 2.0959305480195334, + "grad_norm": 3.3837838172912598, + "learning_rate": 5.092092640811293e-06, + "loss": 0.1666, + "step": 36564 + }, + { + "epoch": 2.0959441128594682, + "grad_norm": 4.910731792449951, + "learning_rate": 5.091955598191037e-06, + "loss": 0.1774, + "step": 36565 + }, + { + "epoch": 2.095957677699403, + "grad_norm": 4.114241600036621, + "learning_rate": 5.091818555570783e-06, + "loss": 0.2028, + "step": 36566 + }, + { + "epoch": 2.095971242539338, + "grad_norm": 5.689295768737793, + "learning_rate": 5.091681512950528e-06, + "loss": 0.2508, + "step": 36567 + }, + { + "epoch": 2.095984807379273, + "grad_norm": 6.619846820831299, + "learning_rate": 5.091544470330273e-06, + "loss": 0.2668, + "step": 36568 + }, + { + "epoch": 2.0959983722192077, + "grad_norm": 5.502187252044678, + "learning_rate": 5.091407427710018e-06, + "loss": 0.1869, + "step": 36569 + }, + { + "epoch": 2.0960119370591426, + "grad_norm": 5.642695426940918, + "learning_rate": 5.091270385089764e-06, + "loss": 0.2664, + "step": 36570 + }, + { + "epoch": 2.0960255018990774, + "grad_norm": 4.986104488372803, + "learning_rate": 5.091133342469509e-06, + "loss": 0.2603, + "step": 36571 + }, + { + "epoch": 2.0960390667390123, + "grad_norm": 4.46726131439209, + "learning_rate": 5.090996299849253e-06, + "loss": 0.1965, + "step": 36572 + }, + { + "epoch": 2.096052631578947, + "grad_norm": 6.164486408233643, + "learning_rate": 5.0908592572289985e-06, + "loss": 0.2719, + "step": 36573 + }, + { + "epoch": 2.0960661964188825, + "grad_norm": 6.027204990386963, + "learning_rate": 5.0907222146087445e-06, + "loss": 0.2571, + "step": 36574 + }, + { + "epoch": 2.0960797612588173, + "grad_norm": 5.750868797302246, + "learning_rate": 5.090585171988489e-06, + "loss": 0.2415, + "step": 36575 + }, + { + "epoch": 2.096093326098752, + "grad_norm": 6.719381809234619, + "learning_rate": 5.090448129368234e-06, + "loss": 0.2289, + "step": 36576 + }, + { + "epoch": 2.096106890938687, + "grad_norm": 6.887788772583008, + "learning_rate": 5.090311086747979e-06, + "loss": 0.3162, + "step": 36577 + }, + { + "epoch": 2.096120455778622, + "grad_norm": 4.413045883178711, + "learning_rate": 5.0901740441277235e-06, + "loss": 0.1607, + "step": 36578 + }, + { + "epoch": 2.096134020618557, + "grad_norm": 5.839818954467773, + "learning_rate": 5.0900370015074696e-06, + "loss": 0.2311, + "step": 36579 + }, + { + "epoch": 2.0961475854584917, + "grad_norm": 4.960274696350098, + "learning_rate": 5.089899958887215e-06, + "loss": 0.202, + "step": 36580 + }, + { + "epoch": 2.0961611502984265, + "grad_norm": 3.9569613933563232, + "learning_rate": 5.089762916266959e-06, + "loss": 0.1223, + "step": 36581 + }, + { + "epoch": 2.0961747151383614, + "grad_norm": 3.887059211730957, + "learning_rate": 5.089625873646704e-06, + "loss": 0.0978, + "step": 36582 + }, + { + "epoch": 2.0961882799782963, + "grad_norm": 5.758914947509766, + "learning_rate": 5.08948883102645e-06, + "loss": 0.2429, + "step": 36583 + }, + { + "epoch": 2.096201844818231, + "grad_norm": 5.166321754455566, + "learning_rate": 5.089351788406195e-06, + "loss": 0.3115, + "step": 36584 + }, + { + "epoch": 2.096215409658166, + "grad_norm": 5.358804702758789, + "learning_rate": 5.08921474578594e-06, + "loss": 0.2794, + "step": 36585 + }, + { + "epoch": 2.096228974498101, + "grad_norm": 6.38132381439209, + "learning_rate": 5.089077703165685e-06, + "loss": 0.2328, + "step": 36586 + }, + { + "epoch": 2.0962425393380357, + "grad_norm": 8.06579875946045, + "learning_rate": 5.088940660545431e-06, + "loss": 0.4593, + "step": 36587 + }, + { + "epoch": 2.0962561041779706, + "grad_norm": 5.279659271240234, + "learning_rate": 5.088803617925175e-06, + "loss": 0.279, + "step": 36588 + }, + { + "epoch": 2.0962696690179055, + "grad_norm": 3.790684938430786, + "learning_rate": 5.0886665753049205e-06, + "loss": 0.174, + "step": 36589 + }, + { + "epoch": 2.0962832338578403, + "grad_norm": 9.145484924316406, + "learning_rate": 5.088529532684665e-06, + "loss": 0.3167, + "step": 36590 + }, + { + "epoch": 2.096296798697775, + "grad_norm": 5.00024938583374, + "learning_rate": 5.08839249006441e-06, + "loss": 0.2029, + "step": 36591 + }, + { + "epoch": 2.09631036353771, + "grad_norm": 7.234602451324463, + "learning_rate": 5.088255447444156e-06, + "loss": 0.2415, + "step": 36592 + }, + { + "epoch": 2.096323928377645, + "grad_norm": 3.9511637687683105, + "learning_rate": 5.0881184048239e-06, + "loss": 0.1877, + "step": 36593 + }, + { + "epoch": 2.0963374932175802, + "grad_norm": 5.95682430267334, + "learning_rate": 5.0879813622036455e-06, + "loss": 0.2571, + "step": 36594 + }, + { + "epoch": 2.096351058057515, + "grad_norm": 6.05704402923584, + "learning_rate": 5.087844319583391e-06, + "loss": 0.2735, + "step": 36595 + }, + { + "epoch": 2.09636462289745, + "grad_norm": 4.692957878112793, + "learning_rate": 5.087707276963137e-06, + "loss": 0.2004, + "step": 36596 + }, + { + "epoch": 2.096378187737385, + "grad_norm": 5.3634138107299805, + "learning_rate": 5.087570234342881e-06, + "loss": 0.2328, + "step": 36597 + }, + { + "epoch": 2.0963917525773197, + "grad_norm": 5.792215824127197, + "learning_rate": 5.087433191722626e-06, + "loss": 0.2731, + "step": 36598 + }, + { + "epoch": 2.0964053174172546, + "grad_norm": 5.7519049644470215, + "learning_rate": 5.087296149102371e-06, + "loss": 0.292, + "step": 36599 + }, + { + "epoch": 2.0964188822571894, + "grad_norm": 4.341899394989014, + "learning_rate": 5.087159106482117e-06, + "loss": 0.1965, + "step": 36600 + }, + { + "epoch": 2.0964324470971243, + "grad_norm": 5.240265369415283, + "learning_rate": 5.087022063861862e-06, + "loss": 0.2719, + "step": 36601 + }, + { + "epoch": 2.096446011937059, + "grad_norm": 4.714715003967285, + "learning_rate": 5.086885021241606e-06, + "loss": 0.1656, + "step": 36602 + }, + { + "epoch": 2.096459576776994, + "grad_norm": 5.984236240386963, + "learning_rate": 5.086747978621351e-06, + "loss": 0.1659, + "step": 36603 + }, + { + "epoch": 2.096473141616929, + "grad_norm": 5.7546820640563965, + "learning_rate": 5.0866109360010965e-06, + "loss": 0.2141, + "step": 36604 + }, + { + "epoch": 2.0964867064568637, + "grad_norm": 5.192488193511963, + "learning_rate": 5.0864738933808425e-06, + "loss": 0.1638, + "step": 36605 + }, + { + "epoch": 2.0965002712967986, + "grad_norm": 6.296707630157471, + "learning_rate": 5.086336850760587e-06, + "loss": 0.2191, + "step": 36606 + }, + { + "epoch": 2.0965138361367335, + "grad_norm": 3.6206116676330566, + "learning_rate": 5.086199808140332e-06, + "loss": 0.1023, + "step": 36607 + }, + { + "epoch": 2.0965274009766683, + "grad_norm": 5.3770527839660645, + "learning_rate": 5.086062765520076e-06, + "loss": 0.1333, + "step": 36608 + }, + { + "epoch": 2.096540965816603, + "grad_norm": 7.1700005531311035, + "learning_rate": 5.085925722899822e-06, + "loss": 0.2461, + "step": 36609 + }, + { + "epoch": 2.096554530656538, + "grad_norm": 4.809903144836426, + "learning_rate": 5.0857886802795676e-06, + "loss": 0.1743, + "step": 36610 + }, + { + "epoch": 2.096568095496473, + "grad_norm": 7.438170909881592, + "learning_rate": 5.085651637659313e-06, + "loss": 0.2561, + "step": 36611 + }, + { + "epoch": 2.0965816603364082, + "grad_norm": 3.9662458896636963, + "learning_rate": 5.085514595039057e-06, + "loss": 0.1336, + "step": 36612 + }, + { + "epoch": 2.096595225176343, + "grad_norm": 4.659099102020264, + "learning_rate": 5.085377552418803e-06, + "loss": 0.1768, + "step": 36613 + }, + { + "epoch": 2.096608790016278, + "grad_norm": 3.776959180831909, + "learning_rate": 5.085240509798548e-06, + "loss": 0.1551, + "step": 36614 + }, + { + "epoch": 2.096622354856213, + "grad_norm": 4.874373435974121, + "learning_rate": 5.085103467178293e-06, + "loss": 0.1328, + "step": 36615 + }, + { + "epoch": 2.0966359196961477, + "grad_norm": 4.283676624298096, + "learning_rate": 5.084966424558038e-06, + "loss": 0.1098, + "step": 36616 + }, + { + "epoch": 2.0966494845360826, + "grad_norm": 5.666899681091309, + "learning_rate": 5.084829381937782e-06, + "loss": 0.23, + "step": 36617 + }, + { + "epoch": 2.0966630493760174, + "grad_norm": 5.265195369720459, + "learning_rate": 5.084692339317528e-06, + "loss": 0.2469, + "step": 36618 + }, + { + "epoch": 2.0966766142159523, + "grad_norm": 3.7525200843811035, + "learning_rate": 5.084555296697273e-06, + "loss": 0.1023, + "step": 36619 + }, + { + "epoch": 2.096690179055887, + "grad_norm": 3.971503496170044, + "learning_rate": 5.0844182540770185e-06, + "loss": 0.1045, + "step": 36620 + }, + { + "epoch": 2.096703743895822, + "grad_norm": 3.7308576107025146, + "learning_rate": 5.084281211456763e-06, + "loss": 0.1302, + "step": 36621 + }, + { + "epoch": 2.096717308735757, + "grad_norm": 5.863087177276611, + "learning_rate": 5.084144168836509e-06, + "loss": 0.231, + "step": 36622 + }, + { + "epoch": 2.0967308735756918, + "grad_norm": 4.554514408111572, + "learning_rate": 5.084007126216254e-06, + "loss": 0.1595, + "step": 36623 + }, + { + "epoch": 2.0967444384156266, + "grad_norm": 3.7351303100585938, + "learning_rate": 5.083870083595998e-06, + "loss": 0.1338, + "step": 36624 + }, + { + "epoch": 2.0967580032555615, + "grad_norm": 5.714691638946533, + "learning_rate": 5.0837330409757435e-06, + "loss": 0.2159, + "step": 36625 + }, + { + "epoch": 2.0967715680954964, + "grad_norm": 5.149048805236816, + "learning_rate": 5.0835959983554896e-06, + "loss": 0.187, + "step": 36626 + }, + { + "epoch": 2.0967851329354312, + "grad_norm": 3.7371718883514404, + "learning_rate": 5.083458955735234e-06, + "loss": 0.1284, + "step": 36627 + }, + { + "epoch": 2.096798697775366, + "grad_norm": 7.886058807373047, + "learning_rate": 5.083321913114979e-06, + "loss": 0.2825, + "step": 36628 + }, + { + "epoch": 2.096812262615301, + "grad_norm": 3.0419929027557373, + "learning_rate": 5.083184870494724e-06, + "loss": 0.0864, + "step": 36629 + }, + { + "epoch": 2.096825827455236, + "grad_norm": 4.807602405548096, + "learning_rate": 5.083047827874469e-06, + "loss": 0.1754, + "step": 36630 + }, + { + "epoch": 2.0968393922951707, + "grad_norm": 4.770853519439697, + "learning_rate": 5.082910785254215e-06, + "loss": 0.1546, + "step": 36631 + }, + { + "epoch": 2.096852957135106, + "grad_norm": 5.789196968078613, + "learning_rate": 5.08277374263396e-06, + "loss": 0.1396, + "step": 36632 + }, + { + "epoch": 2.096866521975041, + "grad_norm": 7.155068874359131, + "learning_rate": 5.082636700013704e-06, + "loss": 0.0983, + "step": 36633 + }, + { + "epoch": 2.0968800868149757, + "grad_norm": 4.276484489440918, + "learning_rate": 5.082499657393449e-06, + "loss": 0.1695, + "step": 36634 + }, + { + "epoch": 2.0968936516549106, + "grad_norm": 4.575873374938965, + "learning_rate": 5.082362614773195e-06, + "loss": 0.1593, + "step": 36635 + }, + { + "epoch": 2.0969072164948455, + "grad_norm": 6.280921936035156, + "learning_rate": 5.0822255721529405e-06, + "loss": 0.1631, + "step": 36636 + }, + { + "epoch": 2.0969207813347803, + "grad_norm": 3.1592588424682617, + "learning_rate": 5.082088529532685e-06, + "loss": 0.0769, + "step": 36637 + }, + { + "epoch": 2.096934346174715, + "grad_norm": 4.080670356750488, + "learning_rate": 5.08195148691243e-06, + "loss": 0.0997, + "step": 36638 + }, + { + "epoch": 2.09694791101465, + "grad_norm": 3.564563035964966, + "learning_rate": 5.081814444292176e-06, + "loss": 0.1438, + "step": 36639 + }, + { + "epoch": 2.096961475854585, + "grad_norm": 4.625853538513184, + "learning_rate": 5.08167740167192e-06, + "loss": 0.1517, + "step": 36640 + }, + { + "epoch": 2.09697504069452, + "grad_norm": 3.49957013130188, + "learning_rate": 5.0815403590516656e-06, + "loss": 0.164, + "step": 36641 + }, + { + "epoch": 2.0969886055344547, + "grad_norm": 5.433933258056641, + "learning_rate": 5.08140331643141e-06, + "loss": 0.2619, + "step": 36642 + }, + { + "epoch": 2.0970021703743895, + "grad_norm": 6.099010944366455, + "learning_rate": 5.081266273811156e-06, + "loss": 0.2374, + "step": 36643 + }, + { + "epoch": 2.0970157352143244, + "grad_norm": 4.667161464691162, + "learning_rate": 5.081129231190901e-06, + "loss": 0.1792, + "step": 36644 + }, + { + "epoch": 2.0970293000542592, + "grad_norm": 2.987870216369629, + "learning_rate": 5.080992188570646e-06, + "loss": 0.1043, + "step": 36645 + }, + { + "epoch": 2.097042864894194, + "grad_norm": 4.313905239105225, + "learning_rate": 5.080855145950391e-06, + "loss": 0.1535, + "step": 36646 + }, + { + "epoch": 2.097056429734129, + "grad_norm": 3.0059306621551514, + "learning_rate": 5.080718103330136e-06, + "loss": 0.1059, + "step": 36647 + }, + { + "epoch": 2.097069994574064, + "grad_norm": 3.2661898136138916, + "learning_rate": 5.080581060709882e-06, + "loss": 0.0928, + "step": 36648 + }, + { + "epoch": 2.0970835594139987, + "grad_norm": 3.9500668048858643, + "learning_rate": 5.080444018089626e-06, + "loss": 0.1331, + "step": 36649 + }, + { + "epoch": 2.097097124253934, + "grad_norm": 2.80419921875, + "learning_rate": 5.080306975469371e-06, + "loss": 0.0813, + "step": 36650 + }, + { + "epoch": 2.097110689093869, + "grad_norm": 5.092185020446777, + "learning_rate": 5.0801699328491165e-06, + "loss": 0.1949, + "step": 36651 + }, + { + "epoch": 2.0971242539338038, + "grad_norm": 4.361963272094727, + "learning_rate": 5.080032890228862e-06, + "loss": 0.1221, + "step": 36652 + }, + { + "epoch": 2.0971378187737386, + "grad_norm": 3.5077295303344727, + "learning_rate": 5.079895847608607e-06, + "loss": 0.1021, + "step": 36653 + }, + { + "epoch": 2.0971513836136735, + "grad_norm": 4.183321475982666, + "learning_rate": 5.079758804988352e-06, + "loss": 0.1509, + "step": 36654 + }, + { + "epoch": 2.0971649484536083, + "grad_norm": 2.239692449569702, + "learning_rate": 5.079621762368096e-06, + "loss": 0.0661, + "step": 36655 + }, + { + "epoch": 2.097178513293543, + "grad_norm": 2.914494037628174, + "learning_rate": 5.079484719747842e-06, + "loss": 0.0853, + "step": 36656 + }, + { + "epoch": 2.097192078133478, + "grad_norm": 2.588012218475342, + "learning_rate": 5.0793476771275876e-06, + "loss": 0.104, + "step": 36657 + }, + { + "epoch": 2.097205642973413, + "grad_norm": 5.968690872192383, + "learning_rate": 5.079210634507332e-06, + "loss": 0.2257, + "step": 36658 + }, + { + "epoch": 2.097219207813348, + "grad_norm": 3.7710087299346924, + "learning_rate": 5.079073591887077e-06, + "loss": 0.153, + "step": 36659 + }, + { + "epoch": 2.0972327726532827, + "grad_norm": 3.4113006591796875, + "learning_rate": 5.078936549266822e-06, + "loss": 0.0741, + "step": 36660 + }, + { + "epoch": 2.0972463374932175, + "grad_norm": 3.2214667797088623, + "learning_rate": 5.0787995066465674e-06, + "loss": 0.1344, + "step": 36661 + }, + { + "epoch": 2.0972599023331524, + "grad_norm": 3.3261702060699463, + "learning_rate": 5.078662464026313e-06, + "loss": 0.0966, + "step": 36662 + }, + { + "epoch": 2.0972734671730873, + "grad_norm": 2.8583431243896484, + "learning_rate": 5.078525421406058e-06, + "loss": 0.0856, + "step": 36663 + }, + { + "epoch": 2.097287032013022, + "grad_norm": 3.9325952529907227, + "learning_rate": 5.078388378785802e-06, + "loss": 0.1163, + "step": 36664 + }, + { + "epoch": 2.097300596852957, + "grad_norm": 2.5410568714141846, + "learning_rate": 5.078251336165548e-06, + "loss": 0.0843, + "step": 36665 + }, + { + "epoch": 2.097314161692892, + "grad_norm": 3.405366897583008, + "learning_rate": 5.078114293545293e-06, + "loss": 0.0738, + "step": 36666 + }, + { + "epoch": 2.0973277265328267, + "grad_norm": 3.423713445663452, + "learning_rate": 5.077977250925038e-06, + "loss": 0.1087, + "step": 36667 + }, + { + "epoch": 2.0973412913727616, + "grad_norm": 4.9619526863098145, + "learning_rate": 5.077840208304783e-06, + "loss": 0.1217, + "step": 36668 + }, + { + "epoch": 2.097354856212697, + "grad_norm": 2.891756296157837, + "learning_rate": 5.077703165684529e-06, + "loss": 0.0924, + "step": 36669 + }, + { + "epoch": 2.0973684210526318, + "grad_norm": 5.614386081695557, + "learning_rate": 5.077566123064274e-06, + "loss": 0.1803, + "step": 36670 + }, + { + "epoch": 2.0973819858925666, + "grad_norm": 3.7602837085723877, + "learning_rate": 5.077429080444018e-06, + "loss": 0.1224, + "step": 36671 + }, + { + "epoch": 2.0973955507325015, + "grad_norm": 3.5642645359039307, + "learning_rate": 5.0772920378237636e-06, + "loss": 0.1378, + "step": 36672 + }, + { + "epoch": 2.0974091155724364, + "grad_norm": 3.926866292953491, + "learning_rate": 5.077154995203508e-06, + "loss": 0.1898, + "step": 36673 + }, + { + "epoch": 2.0974226804123712, + "grad_norm": 3.266855239868164, + "learning_rate": 5.077017952583254e-06, + "loss": 0.1231, + "step": 36674 + }, + { + "epoch": 2.097436245252306, + "grad_norm": 3.3649239540100098, + "learning_rate": 5.076880909962999e-06, + "loss": 0.0972, + "step": 36675 + }, + { + "epoch": 2.097449810092241, + "grad_norm": 3.3899967670440674, + "learning_rate": 5.0767438673427434e-06, + "loss": 0.1195, + "step": 36676 + }, + { + "epoch": 2.097463374932176, + "grad_norm": 5.545123100280762, + "learning_rate": 5.076606824722489e-06, + "loss": 0.2182, + "step": 36677 + }, + { + "epoch": 2.0974769397721107, + "grad_norm": 4.391219615936279, + "learning_rate": 5.076469782102235e-06, + "loss": 0.1932, + "step": 36678 + }, + { + "epoch": 2.0974905046120456, + "grad_norm": 3.9148154258728027, + "learning_rate": 5.07633273948198e-06, + "loss": 0.1592, + "step": 36679 + }, + { + "epoch": 2.0975040694519804, + "grad_norm": 3.4254024028778076, + "learning_rate": 5.076195696861724e-06, + "loss": 0.1735, + "step": 36680 + }, + { + "epoch": 2.0975176342919153, + "grad_norm": 4.8573784828186035, + "learning_rate": 5.076058654241469e-06, + "loss": 0.1346, + "step": 36681 + }, + { + "epoch": 2.09753119913185, + "grad_norm": 5.3275604248046875, + "learning_rate": 5.075921611621215e-06, + "loss": 0.1448, + "step": 36682 + }, + { + "epoch": 2.097544763971785, + "grad_norm": 3.5047295093536377, + "learning_rate": 5.07578456900096e-06, + "loss": 0.1614, + "step": 36683 + }, + { + "epoch": 2.09755832881172, + "grad_norm": 4.567100524902344, + "learning_rate": 5.075647526380705e-06, + "loss": 0.2401, + "step": 36684 + }, + { + "epoch": 2.0975718936516548, + "grad_norm": 3.8510758876800537, + "learning_rate": 5.07551048376045e-06, + "loss": 0.1554, + "step": 36685 + }, + { + "epoch": 2.0975854584915896, + "grad_norm": 5.241659164428711, + "learning_rate": 5.075373441140194e-06, + "loss": 0.1397, + "step": 36686 + }, + { + "epoch": 2.0975990233315245, + "grad_norm": 5.137263774871826, + "learning_rate": 5.07523639851994e-06, + "loss": 0.2356, + "step": 36687 + }, + { + "epoch": 2.09761258817146, + "grad_norm": 4.787866115570068, + "learning_rate": 5.075099355899686e-06, + "loss": 0.239, + "step": 36688 + }, + { + "epoch": 2.0976261530113947, + "grad_norm": 5.083878040313721, + "learning_rate": 5.07496231327943e-06, + "loss": 0.1702, + "step": 36689 + }, + { + "epoch": 2.0976397178513295, + "grad_norm": 4.082061290740967, + "learning_rate": 5.074825270659175e-06, + "loss": 0.1195, + "step": 36690 + }, + { + "epoch": 2.0976532826912644, + "grad_norm": 3.5668327808380127, + "learning_rate": 5.074688228038921e-06, + "loss": 0.1712, + "step": 36691 + }, + { + "epoch": 2.0976668475311993, + "grad_norm": 5.702536106109619, + "learning_rate": 5.0745511854186654e-06, + "loss": 0.2423, + "step": 36692 + }, + { + "epoch": 2.097680412371134, + "grad_norm": 4.4890666007995605, + "learning_rate": 5.074414142798411e-06, + "loss": 0.1488, + "step": 36693 + }, + { + "epoch": 2.097693977211069, + "grad_norm": 4.0073771476745605, + "learning_rate": 5.074277100178156e-06, + "loss": 0.2558, + "step": 36694 + }, + { + "epoch": 2.097707542051004, + "grad_norm": 4.029855251312256, + "learning_rate": 5.074140057557901e-06, + "loss": 0.1438, + "step": 36695 + }, + { + "epoch": 2.0977211068909387, + "grad_norm": 3.963984489440918, + "learning_rate": 5.074003014937646e-06, + "loss": 0.1671, + "step": 36696 + }, + { + "epoch": 2.0977346717308736, + "grad_norm": 3.954616069793701, + "learning_rate": 5.073865972317391e-06, + "loss": 0.1396, + "step": 36697 + }, + { + "epoch": 2.0977482365708084, + "grad_norm": 3.878357410430908, + "learning_rate": 5.073728929697136e-06, + "loss": 0.1998, + "step": 36698 + }, + { + "epoch": 2.0977618014107433, + "grad_norm": 3.0694286823272705, + "learning_rate": 5.073591887076881e-06, + "loss": 0.0936, + "step": 36699 + }, + { + "epoch": 2.097775366250678, + "grad_norm": 7.6222920417785645, + "learning_rate": 5.073454844456627e-06, + "loss": 0.3743, + "step": 36700 + }, + { + "epoch": 2.097788931090613, + "grad_norm": 3.5228097438812256, + "learning_rate": 5.073317801836371e-06, + "loss": 0.1255, + "step": 36701 + }, + { + "epoch": 2.097802495930548, + "grad_norm": 5.832801818847656, + "learning_rate": 5.073180759216116e-06, + "loss": 0.2319, + "step": 36702 + }, + { + "epoch": 2.0978160607704828, + "grad_norm": 3.2622435092926025, + "learning_rate": 5.0730437165958616e-06, + "loss": 0.1805, + "step": 36703 + }, + { + "epoch": 2.0978296256104176, + "grad_norm": 6.102672576904297, + "learning_rate": 5.072906673975608e-06, + "loss": 0.2783, + "step": 36704 + }, + { + "epoch": 2.0978431904503525, + "grad_norm": 7.223011016845703, + "learning_rate": 5.072769631355352e-06, + "loss": 0.2146, + "step": 36705 + }, + { + "epoch": 2.0978567552902874, + "grad_norm": 6.52828311920166, + "learning_rate": 5.072632588735097e-06, + "loss": 0.2185, + "step": 36706 + }, + { + "epoch": 2.0978703201302227, + "grad_norm": 4.427388668060303, + "learning_rate": 5.0724955461148414e-06, + "loss": 0.171, + "step": 36707 + }, + { + "epoch": 2.0978838849701575, + "grad_norm": 5.264225482940674, + "learning_rate": 5.0723585034945875e-06, + "loss": 0.2959, + "step": 36708 + }, + { + "epoch": 2.0978974498100924, + "grad_norm": 3.682321548461914, + "learning_rate": 5.072221460874333e-06, + "loss": 0.2067, + "step": 36709 + }, + { + "epoch": 2.0979110146500273, + "grad_norm": 4.942027568817139, + "learning_rate": 5.072084418254077e-06, + "loss": 0.2749, + "step": 36710 + }, + { + "epoch": 2.097924579489962, + "grad_norm": 4.713546276092529, + "learning_rate": 5.071947375633822e-06, + "loss": 0.1959, + "step": 36711 + }, + { + "epoch": 2.097938144329897, + "grad_norm": 3.8974809646606445, + "learning_rate": 5.071810333013568e-06, + "loss": 0.1329, + "step": 36712 + }, + { + "epoch": 2.097951709169832, + "grad_norm": 5.751064777374268, + "learning_rate": 5.071673290393313e-06, + "loss": 0.2327, + "step": 36713 + }, + { + "epoch": 2.0979652740097667, + "grad_norm": 3.330505132675171, + "learning_rate": 5.071536247773058e-06, + "loss": 0.1244, + "step": 36714 + }, + { + "epoch": 2.0979788388497016, + "grad_norm": 5.155090808868408, + "learning_rate": 5.071399205152803e-06, + "loss": 0.2196, + "step": 36715 + }, + { + "epoch": 2.0979924036896365, + "grad_norm": 4.776763439178467, + "learning_rate": 5.071262162532547e-06, + "loss": 0.1804, + "step": 36716 + }, + { + "epoch": 2.0980059685295713, + "grad_norm": 4.866196632385254, + "learning_rate": 5.071125119912293e-06, + "loss": 0.228, + "step": 36717 + }, + { + "epoch": 2.098019533369506, + "grad_norm": 3.7036776542663574, + "learning_rate": 5.070988077292038e-06, + "loss": 0.1278, + "step": 36718 + }, + { + "epoch": 2.098033098209441, + "grad_norm": 3.4759721755981445, + "learning_rate": 5.070851034671784e-06, + "loss": 0.1226, + "step": 36719 + }, + { + "epoch": 2.098046663049376, + "grad_norm": 3.445866584777832, + "learning_rate": 5.070713992051528e-06, + "loss": 0.1014, + "step": 36720 + }, + { + "epoch": 2.098060227889311, + "grad_norm": 3.585214853286743, + "learning_rate": 5.070576949431274e-06, + "loss": 0.1829, + "step": 36721 + }, + { + "epoch": 2.0980737927292457, + "grad_norm": 3.639744997024536, + "learning_rate": 5.070439906811019e-06, + "loss": 0.1617, + "step": 36722 + }, + { + "epoch": 2.0980873575691805, + "grad_norm": 4.863242149353027, + "learning_rate": 5.0703028641907635e-06, + "loss": 0.113, + "step": 36723 + }, + { + "epoch": 2.0981009224091154, + "grad_norm": 4.179619312286377, + "learning_rate": 5.070165821570509e-06, + "loss": 0.1414, + "step": 36724 + }, + { + "epoch": 2.0981144872490503, + "grad_norm": 3.7592411041259766, + "learning_rate": 5.070028778950255e-06, + "loss": 0.1126, + "step": 36725 + }, + { + "epoch": 2.0981280520889856, + "grad_norm": 5.111708641052246, + "learning_rate": 5.069891736329999e-06, + "loss": 0.2189, + "step": 36726 + }, + { + "epoch": 2.0981416169289204, + "grad_norm": 2.9414479732513428, + "learning_rate": 5.069754693709744e-06, + "loss": 0.1145, + "step": 36727 + }, + { + "epoch": 2.0981551817688553, + "grad_norm": 4.171831130981445, + "learning_rate": 5.069617651089489e-06, + "loss": 0.1213, + "step": 36728 + }, + { + "epoch": 2.09816874660879, + "grad_norm": 3.0876357555389404, + "learning_rate": 5.069480608469234e-06, + "loss": 0.1527, + "step": 36729 + }, + { + "epoch": 2.098182311448725, + "grad_norm": 3.333207368850708, + "learning_rate": 5.06934356584898e-06, + "loss": 0.146, + "step": 36730 + }, + { + "epoch": 2.09819587628866, + "grad_norm": 3.8347702026367188, + "learning_rate": 5.069206523228725e-06, + "loss": 0.129, + "step": 36731 + }, + { + "epoch": 2.0982094411285948, + "grad_norm": 4.559117794036865, + "learning_rate": 5.069069480608469e-06, + "loss": 0.1714, + "step": 36732 + }, + { + "epoch": 2.0982230059685296, + "grad_norm": 3.83476185798645, + "learning_rate": 5.068932437988214e-06, + "loss": 0.1699, + "step": 36733 + }, + { + "epoch": 2.0982365708084645, + "grad_norm": 3.413574457168579, + "learning_rate": 5.06879539536796e-06, + "loss": 0.0984, + "step": 36734 + }, + { + "epoch": 2.0982501356483994, + "grad_norm": 4.601879596710205, + "learning_rate": 5.068658352747705e-06, + "loss": 0.1165, + "step": 36735 + }, + { + "epoch": 2.098263700488334, + "grad_norm": 5.255918025970459, + "learning_rate": 5.06852131012745e-06, + "loss": 0.1573, + "step": 36736 + }, + { + "epoch": 2.098277265328269, + "grad_norm": 4.7344970703125, + "learning_rate": 5.068384267507195e-06, + "loss": 0.1411, + "step": 36737 + }, + { + "epoch": 2.098290830168204, + "grad_norm": 3.7827093601226807, + "learning_rate": 5.068247224886941e-06, + "loss": 0.1239, + "step": 36738 + }, + { + "epoch": 2.098304395008139, + "grad_norm": 4.800731182098389, + "learning_rate": 5.0681101822666855e-06, + "loss": 0.092, + "step": 36739 + }, + { + "epoch": 2.0983179598480737, + "grad_norm": 5.181897163391113, + "learning_rate": 5.067973139646431e-06, + "loss": 0.1862, + "step": 36740 + }, + { + "epoch": 2.0983315246880085, + "grad_norm": 4.399363994598389, + "learning_rate": 5.067836097026175e-06, + "loss": 0.133, + "step": 36741 + }, + { + "epoch": 2.0983450895279434, + "grad_norm": 2.584301471710205, + "learning_rate": 5.06769905440592e-06, + "loss": 0.0887, + "step": 36742 + }, + { + "epoch": 2.0983586543678783, + "grad_norm": 4.456638813018799, + "learning_rate": 5.067562011785666e-06, + "loss": 0.1764, + "step": 36743 + }, + { + "epoch": 2.098372219207813, + "grad_norm": 3.5604312419891357, + "learning_rate": 5.0674249691654105e-06, + "loss": 0.0978, + "step": 36744 + }, + { + "epoch": 2.0983857840477484, + "grad_norm": 3.6959121227264404, + "learning_rate": 5.067287926545156e-06, + "loss": 0.1772, + "step": 36745 + }, + { + "epoch": 2.0983993488876833, + "grad_norm": 3.7144012451171875, + "learning_rate": 5.067150883924901e-06, + "loss": 0.1709, + "step": 36746 + }, + { + "epoch": 2.098412913727618, + "grad_norm": 3.32633638381958, + "learning_rate": 5.067013841304647e-06, + "loss": 0.144, + "step": 36747 + }, + { + "epoch": 2.098426478567553, + "grad_norm": 3.725954055786133, + "learning_rate": 5.066876798684391e-06, + "loss": 0.1485, + "step": 36748 + }, + { + "epoch": 2.098440043407488, + "grad_norm": 4.398526668548584, + "learning_rate": 5.066739756064136e-06, + "loss": 0.1549, + "step": 36749 + }, + { + "epoch": 2.0984536082474228, + "grad_norm": 4.078920841217041, + "learning_rate": 5.066602713443881e-06, + "loss": 0.129, + "step": 36750 + }, + { + "epoch": 2.0984671730873576, + "grad_norm": 4.296572208404541, + "learning_rate": 5.066465670823627e-06, + "loss": 0.2888, + "step": 36751 + }, + { + "epoch": 2.0984807379272925, + "grad_norm": 3.9168992042541504, + "learning_rate": 5.066328628203372e-06, + "loss": 0.0808, + "step": 36752 + }, + { + "epoch": 2.0984943027672274, + "grad_norm": 2.8162877559661865, + "learning_rate": 5.066191585583117e-06, + "loss": 0.1265, + "step": 36753 + }, + { + "epoch": 2.0985078676071622, + "grad_norm": 3.8926291465759277, + "learning_rate": 5.0660545429628615e-06, + "loss": 0.1361, + "step": 36754 + }, + { + "epoch": 2.098521432447097, + "grad_norm": 4.105612277984619, + "learning_rate": 5.065917500342607e-06, + "loss": 0.1575, + "step": 36755 + }, + { + "epoch": 2.098534997287032, + "grad_norm": 5.829082489013672, + "learning_rate": 5.065780457722353e-06, + "loss": 0.17, + "step": 36756 + }, + { + "epoch": 2.098548562126967, + "grad_norm": 4.628836631774902, + "learning_rate": 5.065643415102097e-06, + "loss": 0.2745, + "step": 36757 + }, + { + "epoch": 2.0985621269669017, + "grad_norm": 5.227781295776367, + "learning_rate": 5.065506372481842e-06, + "loss": 0.2134, + "step": 36758 + }, + { + "epoch": 2.0985756918068366, + "grad_norm": 4.6222243309021, + "learning_rate": 5.0653693298615865e-06, + "loss": 0.2122, + "step": 36759 + }, + { + "epoch": 2.0985892566467714, + "grad_norm": 4.129080772399902, + "learning_rate": 5.0652322872413325e-06, + "loss": 0.153, + "step": 36760 + }, + { + "epoch": 2.0986028214867063, + "grad_norm": 3.4953243732452393, + "learning_rate": 5.065095244621078e-06, + "loss": 0.1759, + "step": 36761 + }, + { + "epoch": 2.098616386326641, + "grad_norm": 5.381767272949219, + "learning_rate": 5.064958202000823e-06, + "loss": 0.2692, + "step": 36762 + }, + { + "epoch": 2.098629951166576, + "grad_norm": 5.314648151397705, + "learning_rate": 5.064821159380567e-06, + "loss": 0.2811, + "step": 36763 + }, + { + "epoch": 2.0986435160065113, + "grad_norm": 3.6919782161712646, + "learning_rate": 5.064684116760313e-06, + "loss": 0.1968, + "step": 36764 + }, + { + "epoch": 2.098657080846446, + "grad_norm": 3.89687180519104, + "learning_rate": 5.0645470741400584e-06, + "loss": 0.1776, + "step": 36765 + }, + { + "epoch": 2.098670645686381, + "grad_norm": 5.890658855438232, + "learning_rate": 5.064410031519803e-06, + "loss": 0.176, + "step": 36766 + }, + { + "epoch": 2.098684210526316, + "grad_norm": 3.4998281002044678, + "learning_rate": 5.064272988899548e-06, + "loss": 0.1569, + "step": 36767 + }, + { + "epoch": 2.098697775366251, + "grad_norm": 3.334329128265381, + "learning_rate": 5.064135946279293e-06, + "loss": 0.1457, + "step": 36768 + }, + { + "epoch": 2.0987113402061857, + "grad_norm": 5.522091388702393, + "learning_rate": 5.063998903659038e-06, + "loss": 0.2725, + "step": 36769 + }, + { + "epoch": 2.0987249050461205, + "grad_norm": 4.837398052215576, + "learning_rate": 5.0638618610387835e-06, + "loss": 0.1794, + "step": 36770 + }, + { + "epoch": 2.0987384698860554, + "grad_norm": 5.093581676483154, + "learning_rate": 5.063724818418529e-06, + "loss": 0.2184, + "step": 36771 + }, + { + "epoch": 2.0987520347259903, + "grad_norm": 4.613124370574951, + "learning_rate": 5.063587775798273e-06, + "loss": 0.1589, + "step": 36772 + }, + { + "epoch": 2.098765599565925, + "grad_norm": 4.13287353515625, + "learning_rate": 5.063450733178019e-06, + "loss": 0.2088, + "step": 36773 + }, + { + "epoch": 2.09877916440586, + "grad_norm": 3.1417043209075928, + "learning_rate": 5.063313690557764e-06, + "loss": 0.1433, + "step": 36774 + }, + { + "epoch": 2.098792729245795, + "grad_norm": 2.7169926166534424, + "learning_rate": 5.0631766479375085e-06, + "loss": 0.0861, + "step": 36775 + }, + { + "epoch": 2.0988062940857297, + "grad_norm": 4.372298717498779, + "learning_rate": 5.063039605317254e-06, + "loss": 0.1902, + "step": 36776 + }, + { + "epoch": 2.0988198589256646, + "grad_norm": 2.5271639823913574, + "learning_rate": 5.062902562697e-06, + "loss": 0.1111, + "step": 36777 + }, + { + "epoch": 2.0988334237655994, + "grad_norm": 6.250350475311279, + "learning_rate": 5.062765520076745e-06, + "loss": 0.2658, + "step": 36778 + }, + { + "epoch": 2.0988469886055343, + "grad_norm": 3.976935863494873, + "learning_rate": 5.062628477456489e-06, + "loss": 0.1517, + "step": 36779 + }, + { + "epoch": 2.098860553445469, + "grad_norm": 5.930964946746826, + "learning_rate": 5.062491434836234e-06, + "loss": 0.2613, + "step": 36780 + }, + { + "epoch": 2.098874118285404, + "grad_norm": 3.2578651905059814, + "learning_rate": 5.0623543922159804e-06, + "loss": 0.1257, + "step": 36781 + }, + { + "epoch": 2.098887683125339, + "grad_norm": 4.474981784820557, + "learning_rate": 5.062217349595725e-06, + "loss": 0.2162, + "step": 36782 + }, + { + "epoch": 2.098901247965274, + "grad_norm": 4.335660457611084, + "learning_rate": 5.06208030697547e-06, + "loss": 0.1387, + "step": 36783 + }, + { + "epoch": 2.098914812805209, + "grad_norm": 5.078575611114502, + "learning_rate": 5.061943264355214e-06, + "loss": 0.1693, + "step": 36784 + }, + { + "epoch": 2.098928377645144, + "grad_norm": 3.995701313018799, + "learning_rate": 5.0618062217349595e-06, + "loss": 0.1289, + "step": 36785 + }, + { + "epoch": 2.098941942485079, + "grad_norm": 4.907779693603516, + "learning_rate": 5.0616691791147055e-06, + "loss": 0.1481, + "step": 36786 + }, + { + "epoch": 2.0989555073250137, + "grad_norm": 4.256848335266113, + "learning_rate": 5.061532136494451e-06, + "loss": 0.1944, + "step": 36787 + }, + { + "epoch": 2.0989690721649485, + "grad_norm": 5.64000129699707, + "learning_rate": 5.061395093874195e-06, + "loss": 0.1563, + "step": 36788 + }, + { + "epoch": 2.0989826370048834, + "grad_norm": 4.768791198730469, + "learning_rate": 5.06125805125394e-06, + "loss": 0.1505, + "step": 36789 + }, + { + "epoch": 2.0989962018448183, + "grad_norm": 5.479790210723877, + "learning_rate": 5.061121008633686e-06, + "loss": 0.137, + "step": 36790 + }, + { + "epoch": 2.099009766684753, + "grad_norm": 7.122424125671387, + "learning_rate": 5.0609839660134305e-06, + "loss": 0.333, + "step": 36791 + }, + { + "epoch": 2.099023331524688, + "grad_norm": 5.044108867645264, + "learning_rate": 5.060846923393176e-06, + "loss": 0.1494, + "step": 36792 + }, + { + "epoch": 2.099036896364623, + "grad_norm": 4.873672008514404, + "learning_rate": 5.060709880772921e-06, + "loss": 0.1328, + "step": 36793 + }, + { + "epoch": 2.0990504612045577, + "grad_norm": 3.9440112113952637, + "learning_rate": 5.060572838152666e-06, + "loss": 0.1076, + "step": 36794 + }, + { + "epoch": 2.0990640260444926, + "grad_norm": 7.417835235595703, + "learning_rate": 5.060435795532411e-06, + "loss": 0.3138, + "step": 36795 + }, + { + "epoch": 2.0990775908844275, + "grad_norm": 21.86701202392578, + "learning_rate": 5.0602987529121564e-06, + "loss": 0.1453, + "step": 36796 + }, + { + "epoch": 2.0990911557243623, + "grad_norm": 3.7544779777526855, + "learning_rate": 5.060161710291901e-06, + "loss": 0.1587, + "step": 36797 + }, + { + "epoch": 2.099104720564297, + "grad_norm": 4.834535121917725, + "learning_rate": 5.060024667671646e-06, + "loss": 0.1928, + "step": 36798 + }, + { + "epoch": 2.099118285404232, + "grad_norm": 6.362491130828857, + "learning_rate": 5.059887625051392e-06, + "loss": 0.2242, + "step": 36799 + }, + { + "epoch": 2.099131850244167, + "grad_norm": 4.434542655944824, + "learning_rate": 5.059750582431136e-06, + "loss": 0.1928, + "step": 36800 + }, + { + "epoch": 2.099145415084102, + "grad_norm": 4.474409103393555, + "learning_rate": 5.0596135398108815e-06, + "loss": 0.1405, + "step": 36801 + }, + { + "epoch": 2.099158979924037, + "grad_norm": 4.7800726890563965, + "learning_rate": 5.059476497190627e-06, + "loss": 0.1317, + "step": 36802 + }, + { + "epoch": 2.099172544763972, + "grad_norm": 5.11089563369751, + "learning_rate": 5.059339454570372e-06, + "loss": 0.1929, + "step": 36803 + }, + { + "epoch": 2.099186109603907, + "grad_norm": 4.799040794372559, + "learning_rate": 5.059202411950117e-06, + "loss": 0.2069, + "step": 36804 + }, + { + "epoch": 2.0991996744438417, + "grad_norm": 4.817936420440674, + "learning_rate": 5.059065369329862e-06, + "loss": 0.2117, + "step": 36805 + }, + { + "epoch": 2.0992132392837766, + "grad_norm": 4.137979984283447, + "learning_rate": 5.0589283267096065e-06, + "loss": 0.1257, + "step": 36806 + }, + { + "epoch": 2.0992268041237114, + "grad_norm": 3.689718008041382, + "learning_rate": 5.0587912840893525e-06, + "loss": 0.1245, + "step": 36807 + }, + { + "epoch": 2.0992403689636463, + "grad_norm": 4.364894866943359, + "learning_rate": 5.058654241469098e-06, + "loss": 0.1678, + "step": 36808 + }, + { + "epoch": 2.099253933803581, + "grad_norm": 5.50692892074585, + "learning_rate": 5.058517198848842e-06, + "loss": 0.3307, + "step": 36809 + }, + { + "epoch": 2.099267498643516, + "grad_norm": 5.543346405029297, + "learning_rate": 5.058380156228587e-06, + "loss": 0.2415, + "step": 36810 + }, + { + "epoch": 2.099281063483451, + "grad_norm": 4.569676876068115, + "learning_rate": 5.058243113608332e-06, + "loss": 0.1446, + "step": 36811 + }, + { + "epoch": 2.0992946283233858, + "grad_norm": 5.8200602531433105, + "learning_rate": 5.0581060709880784e-06, + "loss": 0.1687, + "step": 36812 + }, + { + "epoch": 2.0993081931633206, + "grad_norm": 5.286948204040527, + "learning_rate": 5.057969028367823e-06, + "loss": 0.2278, + "step": 36813 + }, + { + "epoch": 2.0993217580032555, + "grad_norm": 4.195122718811035, + "learning_rate": 5.057831985747568e-06, + "loss": 0.166, + "step": 36814 + }, + { + "epoch": 2.0993353228431904, + "grad_norm": 4.916154384613037, + "learning_rate": 5.057694943127312e-06, + "loss": 0.2271, + "step": 36815 + }, + { + "epoch": 2.099348887683125, + "grad_norm": 3.615539789199829, + "learning_rate": 5.057557900507058e-06, + "loss": 0.1213, + "step": 36816 + }, + { + "epoch": 2.09936245252306, + "grad_norm": 3.7262840270996094, + "learning_rate": 5.0574208578868035e-06, + "loss": 0.1829, + "step": 36817 + }, + { + "epoch": 2.099376017362995, + "grad_norm": 5.938318252563477, + "learning_rate": 5.057283815266548e-06, + "loss": 0.2237, + "step": 36818 + }, + { + "epoch": 2.09938958220293, + "grad_norm": 5.312605381011963, + "learning_rate": 5.057146772646293e-06, + "loss": 0.2939, + "step": 36819 + }, + { + "epoch": 2.0994031470428647, + "grad_norm": 5.533188343048096, + "learning_rate": 5.057009730026039e-06, + "loss": 0.2622, + "step": 36820 + }, + { + "epoch": 2.0994167118828, + "grad_norm": 3.94742488861084, + "learning_rate": 5.056872687405784e-06, + "loss": 0.1113, + "step": 36821 + }, + { + "epoch": 2.099430276722735, + "grad_norm": 4.49992036819458, + "learning_rate": 5.0567356447855285e-06, + "loss": 0.1999, + "step": 36822 + }, + { + "epoch": 2.0994438415626697, + "grad_norm": 5.495753288269043, + "learning_rate": 5.056598602165274e-06, + "loss": 0.2227, + "step": 36823 + }, + { + "epoch": 2.0994574064026046, + "grad_norm": 6.844241142272949, + "learning_rate": 5.056461559545018e-06, + "loss": 0.1991, + "step": 36824 + }, + { + "epoch": 2.0994709712425395, + "grad_norm": 3.817617654800415, + "learning_rate": 5.056324516924764e-06, + "loss": 0.1327, + "step": 36825 + }, + { + "epoch": 2.0994845360824743, + "grad_norm": 3.7740471363067627, + "learning_rate": 5.056187474304509e-06, + "loss": 0.2147, + "step": 36826 + }, + { + "epoch": 2.099498100922409, + "grad_norm": 3.2492945194244385, + "learning_rate": 5.0560504316842544e-06, + "loss": 0.0978, + "step": 36827 + }, + { + "epoch": 2.099511665762344, + "grad_norm": 5.429851055145264, + "learning_rate": 5.055913389063999e-06, + "loss": 0.3081, + "step": 36828 + }, + { + "epoch": 2.099525230602279, + "grad_norm": 3.938924789428711, + "learning_rate": 5.055776346443745e-06, + "loss": 0.1689, + "step": 36829 + }, + { + "epoch": 2.099538795442214, + "grad_norm": 6.394139766693115, + "learning_rate": 5.05563930382349e-06, + "loss": 0.2217, + "step": 36830 + }, + { + "epoch": 2.0995523602821486, + "grad_norm": 4.720000267028809, + "learning_rate": 5.055502261203234e-06, + "loss": 0.1532, + "step": 36831 + }, + { + "epoch": 2.0995659251220835, + "grad_norm": 4.759544849395752, + "learning_rate": 5.0553652185829795e-06, + "loss": 0.2479, + "step": 36832 + }, + { + "epoch": 2.0995794899620184, + "grad_norm": 3.716621160507202, + "learning_rate": 5.0552281759627255e-06, + "loss": 0.135, + "step": 36833 + }, + { + "epoch": 2.0995930548019532, + "grad_norm": 3.6646275520324707, + "learning_rate": 5.05509113334247e-06, + "loss": 0.1209, + "step": 36834 + }, + { + "epoch": 2.099606619641888, + "grad_norm": 4.69378137588501, + "learning_rate": 5.054954090722215e-06, + "loss": 0.2288, + "step": 36835 + }, + { + "epoch": 2.099620184481823, + "grad_norm": 4.400119781494141, + "learning_rate": 5.05481704810196e-06, + "loss": 0.1568, + "step": 36836 + }, + { + "epoch": 2.099633749321758, + "grad_norm": 4.233384132385254, + "learning_rate": 5.0546800054817045e-06, + "loss": 0.2844, + "step": 36837 + }, + { + "epoch": 2.0996473141616927, + "grad_norm": 6.076700687408447, + "learning_rate": 5.0545429628614506e-06, + "loss": 0.2883, + "step": 36838 + }, + { + "epoch": 2.0996608790016276, + "grad_norm": 5.108829975128174, + "learning_rate": 5.054405920241196e-06, + "loss": 0.2376, + "step": 36839 + }, + { + "epoch": 2.099674443841563, + "grad_norm": 4.169896602630615, + "learning_rate": 5.05426887762094e-06, + "loss": 0.1409, + "step": 36840 + }, + { + "epoch": 2.0996880086814977, + "grad_norm": 4.664840221405029, + "learning_rate": 5.054131835000685e-06, + "loss": 0.1414, + "step": 36841 + }, + { + "epoch": 2.0997015735214326, + "grad_norm": 5.7798967361450195, + "learning_rate": 5.053994792380431e-06, + "loss": 0.3174, + "step": 36842 + }, + { + "epoch": 2.0997151383613675, + "grad_norm": 6.728352069854736, + "learning_rate": 5.053857749760176e-06, + "loss": 0.2243, + "step": 36843 + }, + { + "epoch": 2.0997287032013023, + "grad_norm": 4.398952007293701, + "learning_rate": 5.053720707139921e-06, + "loss": 0.2925, + "step": 36844 + }, + { + "epoch": 2.099742268041237, + "grad_norm": 4.761440277099609, + "learning_rate": 5.053583664519666e-06, + "loss": 0.2229, + "step": 36845 + }, + { + "epoch": 2.099755832881172, + "grad_norm": 4.538337707519531, + "learning_rate": 5.053446621899412e-06, + "loss": 0.1846, + "step": 36846 + }, + { + "epoch": 2.099769397721107, + "grad_norm": 4.031926155090332, + "learning_rate": 5.053309579279156e-06, + "loss": 0.1484, + "step": 36847 + }, + { + "epoch": 2.099782962561042, + "grad_norm": 5.567375183105469, + "learning_rate": 5.0531725366589015e-06, + "loss": 0.2459, + "step": 36848 + }, + { + "epoch": 2.0997965274009767, + "grad_norm": 5.234550476074219, + "learning_rate": 5.053035494038646e-06, + "loss": 0.197, + "step": 36849 + }, + { + "epoch": 2.0998100922409115, + "grad_norm": 5.377106189727783, + "learning_rate": 5.052898451418392e-06, + "loss": 0.2757, + "step": 36850 + }, + { + "epoch": 2.0998236570808464, + "grad_norm": 5.045158386230469, + "learning_rate": 5.052761408798137e-06, + "loss": 0.1332, + "step": 36851 + }, + { + "epoch": 2.0998372219207813, + "grad_norm": 2.966315507888794, + "learning_rate": 5.052624366177881e-06, + "loss": 0.1416, + "step": 36852 + }, + { + "epoch": 2.099850786760716, + "grad_norm": 4.714857578277588, + "learning_rate": 5.0524873235576265e-06, + "loss": 0.1962, + "step": 36853 + }, + { + "epoch": 2.099864351600651, + "grad_norm": 3.7790937423706055, + "learning_rate": 5.052350280937372e-06, + "loss": 0.1826, + "step": 36854 + }, + { + "epoch": 2.099877916440586, + "grad_norm": 6.076509475708008, + "learning_rate": 5.052213238317118e-06, + "loss": 0.2585, + "step": 36855 + }, + { + "epoch": 2.0998914812805207, + "grad_norm": 4.967735290527344, + "learning_rate": 5.052076195696862e-06, + "loss": 0.2792, + "step": 36856 + }, + { + "epoch": 2.0999050461204556, + "grad_norm": 4.73546028137207, + "learning_rate": 5.051939153076607e-06, + "loss": 0.2726, + "step": 36857 + }, + { + "epoch": 2.0999186109603905, + "grad_norm": 5.080722808837891, + "learning_rate": 5.051802110456352e-06, + "loss": 0.2031, + "step": 36858 + }, + { + "epoch": 2.0999321758003258, + "grad_norm": 4.755847454071045, + "learning_rate": 5.051665067836098e-06, + "loss": 0.1709, + "step": 36859 + }, + { + "epoch": 2.0999457406402606, + "grad_norm": 7.171204090118408, + "learning_rate": 5.051528025215843e-06, + "loss": 0.3515, + "step": 36860 + }, + { + "epoch": 2.0999457406402606, + "eval_loss": 0.3394649028778076, + "eval_noise_accuracy": NaN, + "eval_runtime": 4519.5374, + "eval_samples_per_second": 1.112, + "eval_steps_per_second": 0.069, + "eval_wer": 26.200920577059716, + "step": 36860 + }, + { + "epoch": 2.0999593054801955, + "grad_norm": 5.346063613891602, + "learning_rate": 5.051390982595588e-06, + "loss": 0.2414, + "step": 36861 + }, + { + "epoch": 2.0999728703201304, + "grad_norm": 4.996189594268799, + "learning_rate": 5.051253939975332e-06, + "loss": 0.1915, + "step": 36862 + }, + { + "epoch": 2.0999864351600652, + "grad_norm": 6.387147426605225, + "learning_rate": 5.051116897355078e-06, + "loss": 0.2444, + "step": 36863 + }, + { + "epoch": 2.1, + "grad_norm": 6.993805885314941, + "learning_rate": 5.0509798547348235e-06, + "loss": 0.2963, + "step": 36864 + }, + { + "epoch": 2.100013564839935, + "grad_norm": 4.922557353973389, + "learning_rate": 5.050842812114568e-06, + "loss": 0.2374, + "step": 36865 + }, + { + "epoch": 2.10002712967987, + "grad_norm": 4.854891777038574, + "learning_rate": 5.050705769494313e-06, + "loss": 0.2296, + "step": 36866 + }, + { + "epoch": 2.1000406945198047, + "grad_norm": 4.231067180633545, + "learning_rate": 5.050568726874057e-06, + "loss": 0.1611, + "step": 36867 + }, + { + "epoch": 2.1000542593597396, + "grad_norm": 3.5737199783325195, + "learning_rate": 5.050431684253803e-06, + "loss": 0.1739, + "step": 36868 + }, + { + "epoch": 2.1000678241996744, + "grad_norm": 5.31974458694458, + "learning_rate": 5.0502946416335486e-06, + "loss": 0.1898, + "step": 36869 + }, + { + "epoch": 2.1000813890396093, + "grad_norm": 5.010523319244385, + "learning_rate": 5.050157599013294e-06, + "loss": 0.2288, + "step": 36870 + }, + { + "epoch": 2.100094953879544, + "grad_norm": 7.961256504058838, + "learning_rate": 5.050020556393038e-06, + "loss": 0.3212, + "step": 36871 + }, + { + "epoch": 2.100108518719479, + "grad_norm": 6.117156505584717, + "learning_rate": 5.049883513772784e-06, + "loss": 0.2728, + "step": 36872 + }, + { + "epoch": 2.100122083559414, + "grad_norm": 5.375687122344971, + "learning_rate": 5.049746471152529e-06, + "loss": 0.2055, + "step": 36873 + }, + { + "epoch": 2.1001356483993487, + "grad_norm": 4.884711742401123, + "learning_rate": 5.049609428532274e-06, + "loss": 0.1728, + "step": 36874 + }, + { + "epoch": 2.1001492132392836, + "grad_norm": 4.800637245178223, + "learning_rate": 5.049472385912019e-06, + "loss": 0.2429, + "step": 36875 + }, + { + "epoch": 2.1001627780792185, + "grad_norm": 3.7505578994750977, + "learning_rate": 5.049335343291765e-06, + "loss": 0.1946, + "step": 36876 + }, + { + "epoch": 2.100176342919154, + "grad_norm": 5.26515531539917, + "learning_rate": 5.049198300671509e-06, + "loss": 0.1786, + "step": 36877 + }, + { + "epoch": 2.1001899077590886, + "grad_norm": 4.124671936035156, + "learning_rate": 5.049061258051254e-06, + "loss": 0.1632, + "step": 36878 + }, + { + "epoch": 2.1002034725990235, + "grad_norm": 4.852006435394287, + "learning_rate": 5.0489242154309995e-06, + "loss": 0.1665, + "step": 36879 + }, + { + "epoch": 2.1002170374389584, + "grad_norm": 4.391028881072998, + "learning_rate": 5.048787172810744e-06, + "loss": 0.2376, + "step": 36880 + }, + { + "epoch": 2.1002306022788932, + "grad_norm": 5.27459192276001, + "learning_rate": 5.04865013019049e-06, + "loss": 0.2674, + "step": 36881 + }, + { + "epoch": 2.100244167118828, + "grad_norm": 3.637066602706909, + "learning_rate": 5.048513087570235e-06, + "loss": 0.1921, + "step": 36882 + }, + { + "epoch": 2.100257731958763, + "grad_norm": 7.450692653656006, + "learning_rate": 5.048376044949979e-06, + "loss": 0.2231, + "step": 36883 + }, + { + "epoch": 2.100271296798698, + "grad_norm": 5.426553249359131, + "learning_rate": 5.0482390023297245e-06, + "loss": 0.2402, + "step": 36884 + }, + { + "epoch": 2.1002848616386327, + "grad_norm": 5.217471599578857, + "learning_rate": 5.0481019597094706e-06, + "loss": 0.2835, + "step": 36885 + }, + { + "epoch": 2.1002984264785676, + "grad_norm": 4.759101867675781, + "learning_rate": 5.047964917089215e-06, + "loss": 0.2368, + "step": 36886 + }, + { + "epoch": 2.1003119913185024, + "grad_norm": 4.97844123840332, + "learning_rate": 5.04782787446896e-06, + "loss": 0.218, + "step": 36887 + }, + { + "epoch": 2.1003255561584373, + "grad_norm": 4.673793315887451, + "learning_rate": 5.047690831848705e-06, + "loss": 0.1547, + "step": 36888 + }, + { + "epoch": 2.100339120998372, + "grad_norm": 6.403059482574463, + "learning_rate": 5.047553789228451e-06, + "loss": 0.2688, + "step": 36889 + }, + { + "epoch": 2.100352685838307, + "grad_norm": 4.24210262298584, + "learning_rate": 5.047416746608196e-06, + "loss": 0.2318, + "step": 36890 + }, + { + "epoch": 2.100366250678242, + "grad_norm": 6.222787380218506, + "learning_rate": 5.047279703987941e-06, + "loss": 0.2594, + "step": 36891 + }, + { + "epoch": 2.1003798155181768, + "grad_norm": 6.409961223602295, + "learning_rate": 5.047142661367685e-06, + "loss": 0.2641, + "step": 36892 + }, + { + "epoch": 2.1003933803581116, + "grad_norm": 6.807676792144775, + "learning_rate": 5.04700561874743e-06, + "loss": 0.291, + "step": 36893 + }, + { + "epoch": 2.1004069451980465, + "grad_norm": 6.416013717651367, + "learning_rate": 5.046868576127176e-06, + "loss": 0.3757, + "step": 36894 + }, + { + "epoch": 2.1004205100379814, + "grad_norm": 4.36906099319458, + "learning_rate": 5.0467315335069215e-06, + "loss": 0.2408, + "step": 36895 + }, + { + "epoch": 2.1004340748779162, + "grad_norm": 4.410221099853516, + "learning_rate": 5.046594490886666e-06, + "loss": 0.1693, + "step": 36896 + }, + { + "epoch": 2.1004476397178515, + "grad_norm": 5.280663967132568, + "learning_rate": 5.046457448266411e-06, + "loss": 0.2051, + "step": 36897 + }, + { + "epoch": 2.1004612045577864, + "grad_norm": 5.133880138397217, + "learning_rate": 5.046320405646157e-06, + "loss": 0.2081, + "step": 36898 + }, + { + "epoch": 2.1004747693977213, + "grad_norm": 4.876520156860352, + "learning_rate": 5.046183363025901e-06, + "loss": 0.1759, + "step": 36899 + }, + { + "epoch": 2.100488334237656, + "grad_norm": 4.389840602874756, + "learning_rate": 5.0460463204056466e-06, + "loss": 0.1031, + "step": 36900 + }, + { + "epoch": 2.100501899077591, + "grad_norm": 4.295661926269531, + "learning_rate": 5.045909277785391e-06, + "loss": 0.279, + "step": 36901 + }, + { + "epoch": 2.100515463917526, + "grad_norm": 4.977407932281494, + "learning_rate": 5.045772235165137e-06, + "loss": 0.2219, + "step": 36902 + }, + { + "epoch": 2.1005290287574607, + "grad_norm": 5.506997585296631, + "learning_rate": 5.045635192544882e-06, + "loss": 0.285, + "step": 36903 + }, + { + "epoch": 2.1005425935973956, + "grad_norm": 5.060914516448975, + "learning_rate": 5.045498149924627e-06, + "loss": 0.1575, + "step": 36904 + }, + { + "epoch": 2.1005561584373305, + "grad_norm": 4.939321994781494, + "learning_rate": 5.045361107304372e-06, + "loss": 0.2546, + "step": 36905 + }, + { + "epoch": 2.1005697232772653, + "grad_norm": 5.489306449890137, + "learning_rate": 5.045224064684117e-06, + "loss": 0.1677, + "step": 36906 + }, + { + "epoch": 2.1005832881172, + "grad_norm": 4.648562908172607, + "learning_rate": 5.045087022063863e-06, + "loss": 0.2353, + "step": 36907 + }, + { + "epoch": 2.100596852957135, + "grad_norm": 4.521800994873047, + "learning_rate": 5.044949979443607e-06, + "loss": 0.0922, + "step": 36908 + }, + { + "epoch": 2.10061041779707, + "grad_norm": 5.418447017669678, + "learning_rate": 5.044812936823352e-06, + "loss": 0.1597, + "step": 36909 + }, + { + "epoch": 2.100623982637005, + "grad_norm": 4.805333614349365, + "learning_rate": 5.0446758942030975e-06, + "loss": 0.2085, + "step": 36910 + }, + { + "epoch": 2.1006375474769396, + "grad_norm": 6.438055992126465, + "learning_rate": 5.044538851582843e-06, + "loss": 0.3049, + "step": 36911 + }, + { + "epoch": 2.1006511123168745, + "grad_norm": 3.6353724002838135, + "learning_rate": 5.044401808962588e-06, + "loss": 0.1189, + "step": 36912 + }, + { + "epoch": 2.1006646771568094, + "grad_norm": 4.8640313148498535, + "learning_rate": 5.044264766342333e-06, + "loss": 0.2743, + "step": 36913 + }, + { + "epoch": 2.1006782419967442, + "grad_norm": 3.583022356033325, + "learning_rate": 5.044127723722077e-06, + "loss": 0.1251, + "step": 36914 + }, + { + "epoch": 2.1006918068366796, + "grad_norm": 5.6116862297058105, + "learning_rate": 5.043990681101823e-06, + "loss": 0.1758, + "step": 36915 + }, + { + "epoch": 2.1007053716766144, + "grad_norm": 3.9540326595306396, + "learning_rate": 5.043853638481569e-06, + "loss": 0.1018, + "step": 36916 + }, + { + "epoch": 2.1007189365165493, + "grad_norm": 4.3567423820495605, + "learning_rate": 5.043716595861313e-06, + "loss": 0.2926, + "step": 36917 + }, + { + "epoch": 2.100732501356484, + "grad_norm": 5.963047504425049, + "learning_rate": 5.043579553241058e-06, + "loss": 0.2539, + "step": 36918 + }, + { + "epoch": 2.100746066196419, + "grad_norm": 3.1377055644989014, + "learning_rate": 5.043442510620804e-06, + "loss": 0.0903, + "step": 36919 + }, + { + "epoch": 2.100759631036354, + "grad_norm": 5.544557571411133, + "learning_rate": 5.043305468000549e-06, + "loss": 0.2031, + "step": 36920 + }, + { + "epoch": 2.1007731958762887, + "grad_norm": 3.9714436531066895, + "learning_rate": 5.043168425380294e-06, + "loss": 0.1507, + "step": 36921 + }, + { + "epoch": 2.1007867607162236, + "grad_norm": 4.426733016967773, + "learning_rate": 5.043031382760039e-06, + "loss": 0.2684, + "step": 36922 + }, + { + "epoch": 2.1008003255561585, + "grad_norm": 5.5143232345581055, + "learning_rate": 5.042894340139783e-06, + "loss": 0.2552, + "step": 36923 + }, + { + "epoch": 2.1008138903960933, + "grad_norm": 3.529776096343994, + "learning_rate": 5.042757297519529e-06, + "loss": 0.086, + "step": 36924 + }, + { + "epoch": 2.100827455236028, + "grad_norm": 4.6873369216918945, + "learning_rate": 5.042620254899274e-06, + "loss": 0.2054, + "step": 36925 + }, + { + "epoch": 2.100841020075963, + "grad_norm": 5.085905075073242, + "learning_rate": 5.042483212279019e-06, + "loss": 0.1919, + "step": 36926 + }, + { + "epoch": 2.100854584915898, + "grad_norm": 5.562769889831543, + "learning_rate": 5.042346169658764e-06, + "loss": 0.192, + "step": 36927 + }, + { + "epoch": 2.100868149755833, + "grad_norm": 5.555851459503174, + "learning_rate": 5.04220912703851e-06, + "loss": 0.2161, + "step": 36928 + }, + { + "epoch": 2.1008817145957677, + "grad_norm": 5.976626873016357, + "learning_rate": 5.042072084418255e-06, + "loss": 0.3078, + "step": 36929 + }, + { + "epoch": 2.1008952794357025, + "grad_norm": 5.2277607917785645, + "learning_rate": 5.041935041797999e-06, + "loss": 0.1615, + "step": 36930 + }, + { + "epoch": 2.1009088442756374, + "grad_norm": 4.757242202758789, + "learning_rate": 5.0417979991777446e-06, + "loss": 0.2044, + "step": 36931 + }, + { + "epoch": 2.1009224091155723, + "grad_norm": 3.553546905517578, + "learning_rate": 5.041660956557491e-06, + "loss": 0.1569, + "step": 36932 + }, + { + "epoch": 2.100935973955507, + "grad_norm": 5.797561168670654, + "learning_rate": 5.041523913937235e-06, + "loss": 0.2905, + "step": 36933 + }, + { + "epoch": 2.100949538795442, + "grad_norm": 3.3358943462371826, + "learning_rate": 5.04138687131698e-06, + "loss": 0.1122, + "step": 36934 + }, + { + "epoch": 2.1009631036353773, + "grad_norm": 4.80289888381958, + "learning_rate": 5.0412498286967244e-06, + "loss": 0.2122, + "step": 36935 + }, + { + "epoch": 2.100976668475312, + "grad_norm": 4.35129451751709, + "learning_rate": 5.04111278607647e-06, + "loss": 0.1776, + "step": 36936 + }, + { + "epoch": 2.100990233315247, + "grad_norm": 6.382969856262207, + "learning_rate": 5.040975743456216e-06, + "loss": 0.173, + "step": 36937 + }, + { + "epoch": 2.101003798155182, + "grad_norm": 6.863016128540039, + "learning_rate": 5.040838700835961e-06, + "loss": 0.2361, + "step": 36938 + }, + { + "epoch": 2.1010173629951168, + "grad_norm": 4.752105236053467, + "learning_rate": 5.040701658215705e-06, + "loss": 0.1984, + "step": 36939 + }, + { + "epoch": 2.1010309278350516, + "grad_norm": 4.6311187744140625, + "learning_rate": 5.04056461559545e-06, + "loss": 0.2461, + "step": 36940 + }, + { + "epoch": 2.1010444926749865, + "grad_norm": 4.806771755218506, + "learning_rate": 5.040427572975196e-06, + "loss": 0.1635, + "step": 36941 + }, + { + "epoch": 2.1010580575149214, + "grad_norm": 3.9772374629974365, + "learning_rate": 5.040290530354941e-06, + "loss": 0.159, + "step": 36942 + }, + { + "epoch": 2.1010716223548562, + "grad_norm": 3.7978599071502686, + "learning_rate": 5.040153487734686e-06, + "loss": 0.1515, + "step": 36943 + }, + { + "epoch": 2.101085187194791, + "grad_norm": 4.868009567260742, + "learning_rate": 5.040016445114431e-06, + "loss": 0.247, + "step": 36944 + }, + { + "epoch": 2.101098752034726, + "grad_norm": 4.016318321228027, + "learning_rate": 5.039879402494176e-06, + "loss": 0.1817, + "step": 36945 + }, + { + "epoch": 2.101112316874661, + "grad_norm": 3.9113380908966064, + "learning_rate": 5.039742359873921e-06, + "loss": 0.1163, + "step": 36946 + }, + { + "epoch": 2.1011258817145957, + "grad_norm": 5.647721290588379, + "learning_rate": 5.039605317253667e-06, + "loss": 0.2057, + "step": 36947 + }, + { + "epoch": 2.1011394465545306, + "grad_norm": 5.703680038452148, + "learning_rate": 5.039468274633411e-06, + "loss": 0.1666, + "step": 36948 + }, + { + "epoch": 2.1011530113944654, + "grad_norm": 6.600372314453125, + "learning_rate": 5.039331232013156e-06, + "loss": 0.2938, + "step": 36949 + }, + { + "epoch": 2.1011665762344003, + "grad_norm": 8.350854873657227, + "learning_rate": 5.039194189392902e-06, + "loss": 0.3448, + "step": 36950 + }, + { + "epoch": 2.101180141074335, + "grad_norm": 6.842883586883545, + "learning_rate": 5.0390571467726465e-06, + "loss": 0.2062, + "step": 36951 + }, + { + "epoch": 2.10119370591427, + "grad_norm": 5.488662242889404, + "learning_rate": 5.038920104152392e-06, + "loss": 0.2435, + "step": 36952 + }, + { + "epoch": 2.1012072707542053, + "grad_norm": 5.0133233070373535, + "learning_rate": 5.038783061532137e-06, + "loss": 0.2245, + "step": 36953 + }, + { + "epoch": 2.10122083559414, + "grad_norm": 4.140139102935791, + "learning_rate": 5.038646018911883e-06, + "loss": 0.1, + "step": 36954 + }, + { + "epoch": 2.101234400434075, + "grad_norm": 3.799055576324463, + "learning_rate": 5.038508976291627e-06, + "loss": 0.1154, + "step": 36955 + }, + { + "epoch": 2.10124796527401, + "grad_norm": 5.15091609954834, + "learning_rate": 5.038371933671372e-06, + "loss": 0.2246, + "step": 36956 + }, + { + "epoch": 2.101261530113945, + "grad_norm": 5.682432651519775, + "learning_rate": 5.038234891051117e-06, + "loss": 0.1469, + "step": 36957 + }, + { + "epoch": 2.1012750949538797, + "grad_norm": 5.474177360534668, + "learning_rate": 5.038097848430863e-06, + "loss": 0.1833, + "step": 36958 + }, + { + "epoch": 2.1012886597938145, + "grad_norm": 4.719005584716797, + "learning_rate": 5.037960805810608e-06, + "loss": 0.2327, + "step": 36959 + }, + { + "epoch": 2.1013022246337494, + "grad_norm": 4.100346088409424, + "learning_rate": 5.037823763190352e-06, + "loss": 0.1812, + "step": 36960 + }, + { + "epoch": 2.1013157894736842, + "grad_norm": 6.801455974578857, + "learning_rate": 5.037686720570097e-06, + "loss": 0.2609, + "step": 36961 + }, + { + "epoch": 2.101329354313619, + "grad_norm": 6.909151554107666, + "learning_rate": 5.0375496779498426e-06, + "loss": 0.3197, + "step": 36962 + }, + { + "epoch": 2.101342919153554, + "grad_norm": 3.763767957687378, + "learning_rate": 5.037412635329589e-06, + "loss": 0.1483, + "step": 36963 + }, + { + "epoch": 2.101356483993489, + "grad_norm": 6.850638389587402, + "learning_rate": 5.037275592709333e-06, + "loss": 0.2846, + "step": 36964 + }, + { + "epoch": 2.1013700488334237, + "grad_norm": 4.975355625152588, + "learning_rate": 5.037138550089078e-06, + "loss": 0.1787, + "step": 36965 + }, + { + "epoch": 2.1013836136733586, + "grad_norm": 5.887415885925293, + "learning_rate": 5.0370015074688224e-06, + "loss": 0.2853, + "step": 36966 + }, + { + "epoch": 2.1013971785132934, + "grad_norm": 5.4920783042907715, + "learning_rate": 5.0368644648485685e-06, + "loss": 0.219, + "step": 36967 + }, + { + "epoch": 2.1014107433532283, + "grad_norm": 5.517894744873047, + "learning_rate": 5.036727422228314e-06, + "loss": 0.1841, + "step": 36968 + }, + { + "epoch": 2.101424308193163, + "grad_norm": 4.517158031463623, + "learning_rate": 5.036590379608059e-06, + "loss": 0.1622, + "step": 36969 + }, + { + "epoch": 2.101437873033098, + "grad_norm": 5.461539268493652, + "learning_rate": 5.036453336987803e-06, + "loss": 0.2891, + "step": 36970 + }, + { + "epoch": 2.101451437873033, + "grad_norm": 6.828488826751709, + "learning_rate": 5.036316294367549e-06, + "loss": 0.1736, + "step": 36971 + }, + { + "epoch": 2.1014650027129678, + "grad_norm": 4.835423469543457, + "learning_rate": 5.036179251747294e-06, + "loss": 0.2019, + "step": 36972 + }, + { + "epoch": 2.101478567552903, + "grad_norm": 6.12782096862793, + "learning_rate": 5.036042209127039e-06, + "loss": 0.2507, + "step": 36973 + }, + { + "epoch": 2.101492132392838, + "grad_norm": 6.5770769119262695, + "learning_rate": 5.035905166506784e-06, + "loss": 0.2871, + "step": 36974 + }, + { + "epoch": 2.101505697232773, + "grad_norm": 4.635656833648682, + "learning_rate": 5.035768123886528e-06, + "loss": 0.2106, + "step": 36975 + }, + { + "epoch": 2.1015192620727077, + "grad_norm": 5.046820640563965, + "learning_rate": 5.035631081266274e-06, + "loss": 0.182, + "step": 36976 + }, + { + "epoch": 2.1015328269126425, + "grad_norm": 5.83230447769165, + "learning_rate": 5.035494038646019e-06, + "loss": 0.2454, + "step": 36977 + }, + { + "epoch": 2.1015463917525774, + "grad_norm": 5.511341571807861, + "learning_rate": 5.035356996025765e-06, + "loss": 0.1526, + "step": 36978 + }, + { + "epoch": 2.1015599565925123, + "grad_norm": 7.583707809448242, + "learning_rate": 5.035219953405509e-06, + "loss": 0.3116, + "step": 36979 + }, + { + "epoch": 2.101573521432447, + "grad_norm": 3.8969995975494385, + "learning_rate": 5.035082910785255e-06, + "loss": 0.1402, + "step": 36980 + }, + { + "epoch": 2.101587086272382, + "grad_norm": 5.0854291915893555, + "learning_rate": 5.034945868165e-06, + "loss": 0.2133, + "step": 36981 + }, + { + "epoch": 2.101600651112317, + "grad_norm": 5.358312129974365, + "learning_rate": 5.0348088255447445e-06, + "loss": 0.2577, + "step": 36982 + }, + { + "epoch": 2.1016142159522517, + "grad_norm": 4.209126949310303, + "learning_rate": 5.03467178292449e-06, + "loss": 0.1626, + "step": 36983 + }, + { + "epoch": 2.1016277807921866, + "grad_norm": 3.896557331085205, + "learning_rate": 5.034534740304236e-06, + "loss": 0.0856, + "step": 36984 + }, + { + "epoch": 2.1016413456321215, + "grad_norm": 6.393720626831055, + "learning_rate": 5.03439769768398e-06, + "loss": 0.2579, + "step": 36985 + }, + { + "epoch": 2.1016549104720563, + "grad_norm": 4.045780181884766, + "learning_rate": 5.034260655063725e-06, + "loss": 0.1756, + "step": 36986 + }, + { + "epoch": 2.101668475311991, + "grad_norm": 6.736215114593506, + "learning_rate": 5.03412361244347e-06, + "loss": 0.2342, + "step": 36987 + }, + { + "epoch": 2.101682040151926, + "grad_norm": 6.118187427520752, + "learning_rate": 5.033986569823216e-06, + "loss": 0.2929, + "step": 36988 + }, + { + "epoch": 2.101695604991861, + "grad_norm": 4.308638095855713, + "learning_rate": 5.033849527202961e-06, + "loss": 0.2014, + "step": 36989 + }, + { + "epoch": 2.101709169831796, + "grad_norm": 5.4456610679626465, + "learning_rate": 5.033712484582706e-06, + "loss": 0.1704, + "step": 36990 + }, + { + "epoch": 2.101722734671731, + "grad_norm": 4.703659534454346, + "learning_rate": 5.03357544196245e-06, + "loss": 0.1383, + "step": 36991 + }, + { + "epoch": 2.101736299511666, + "grad_norm": 4.741276741027832, + "learning_rate": 5.033438399342195e-06, + "loss": 0.1163, + "step": 36992 + }, + { + "epoch": 2.101749864351601, + "grad_norm": 4.864923477172852, + "learning_rate": 5.033301356721941e-06, + "loss": 0.1287, + "step": 36993 + }, + { + "epoch": 2.1017634291915357, + "grad_norm": 5.072958946228027, + "learning_rate": 5.033164314101686e-06, + "loss": 0.1541, + "step": 36994 + }, + { + "epoch": 2.1017769940314706, + "grad_norm": 5.7260966300964355, + "learning_rate": 5.033027271481431e-06, + "loss": 0.1976, + "step": 36995 + }, + { + "epoch": 2.1017905588714054, + "grad_norm": 4.293222427368164, + "learning_rate": 5.032890228861176e-06, + "loss": 0.1614, + "step": 36996 + }, + { + "epoch": 2.1018041237113403, + "grad_norm": 4.424979209899902, + "learning_rate": 5.032753186240922e-06, + "loss": 0.1537, + "step": 36997 + }, + { + "epoch": 2.101817688551275, + "grad_norm": 5.421058177947998, + "learning_rate": 5.0326161436206665e-06, + "loss": 0.1947, + "step": 36998 + }, + { + "epoch": 2.10183125339121, + "grad_norm": 4.618035316467285, + "learning_rate": 5.032479101000412e-06, + "loss": 0.1346, + "step": 36999 + }, + { + "epoch": 2.101844818231145, + "grad_norm": 4.971240520477295, + "learning_rate": 5.032342058380156e-06, + "loss": 0.1637, + "step": 37000 + }, + { + "epoch": 2.1018583830710797, + "grad_norm": 6.136168479919434, + "learning_rate": 5.032205015759902e-06, + "loss": 0.259, + "step": 37001 + }, + { + "epoch": 2.1018719479110146, + "grad_norm": 5.528060436248779, + "learning_rate": 5.032067973139647e-06, + "loss": 0.1944, + "step": 37002 + }, + { + "epoch": 2.1018855127509495, + "grad_norm": 5.093918800354004, + "learning_rate": 5.031930930519392e-06, + "loss": 0.219, + "step": 37003 + }, + { + "epoch": 2.1018990775908843, + "grad_norm": 6.3865742683410645, + "learning_rate": 5.031793887899137e-06, + "loss": 0.1957, + "step": 37004 + }, + { + "epoch": 2.101912642430819, + "grad_norm": 4.297328472137451, + "learning_rate": 5.031656845278882e-06, + "loss": 0.1569, + "step": 37005 + }, + { + "epoch": 2.101926207270754, + "grad_norm": 4.20407247543335, + "learning_rate": 5.031519802658628e-06, + "loss": 0.1626, + "step": 37006 + }, + { + "epoch": 2.101939772110689, + "grad_norm": 3.183650016784668, + "learning_rate": 5.031382760038372e-06, + "loss": 0.109, + "step": 37007 + }, + { + "epoch": 2.101953336950624, + "grad_norm": 5.58060359954834, + "learning_rate": 5.031245717418117e-06, + "loss": 0.2077, + "step": 37008 + }, + { + "epoch": 2.1019669017905587, + "grad_norm": 4.804312229156494, + "learning_rate": 5.031108674797862e-06, + "loss": 0.1329, + "step": 37009 + }, + { + "epoch": 2.1019804666304935, + "grad_norm": 4.806100845336914, + "learning_rate": 5.030971632177608e-06, + "loss": 0.2223, + "step": 37010 + }, + { + "epoch": 2.101994031470429, + "grad_norm": 5.370398998260498, + "learning_rate": 5.030834589557353e-06, + "loss": 0.2168, + "step": 37011 + }, + { + "epoch": 2.1020075963103637, + "grad_norm": 5.142247676849365, + "learning_rate": 5.030697546937098e-06, + "loss": 0.1545, + "step": 37012 + }, + { + "epoch": 2.1020211611502986, + "grad_norm": 4.145064353942871, + "learning_rate": 5.0305605043168425e-06, + "loss": 0.1273, + "step": 37013 + }, + { + "epoch": 2.1020347259902334, + "grad_norm": 6.883955955505371, + "learning_rate": 5.0304234616965885e-06, + "loss": 0.2421, + "step": 37014 + }, + { + "epoch": 2.1020482908301683, + "grad_norm": 6.353143215179443, + "learning_rate": 5.030286419076334e-06, + "loss": 0.3281, + "step": 37015 + }, + { + "epoch": 2.102061855670103, + "grad_norm": 4.486026287078857, + "learning_rate": 5.030149376456078e-06, + "loss": 0.2697, + "step": 37016 + }, + { + "epoch": 2.102075420510038, + "grad_norm": 8.957527160644531, + "learning_rate": 5.030012333835823e-06, + "loss": 0.4622, + "step": 37017 + }, + { + "epoch": 2.102088985349973, + "grad_norm": 6.324873924255371, + "learning_rate": 5.029875291215568e-06, + "loss": 0.2422, + "step": 37018 + }, + { + "epoch": 2.1021025501899078, + "grad_norm": 4.628686428070068, + "learning_rate": 5.0297382485953135e-06, + "loss": 0.2182, + "step": 37019 + }, + { + "epoch": 2.1021161150298426, + "grad_norm": 5.440537929534912, + "learning_rate": 5.029601205975059e-06, + "loss": 0.1302, + "step": 37020 + }, + { + "epoch": 2.1021296798697775, + "grad_norm": 4.958572864532471, + "learning_rate": 5.029464163354804e-06, + "loss": 0.1703, + "step": 37021 + }, + { + "epoch": 2.1021432447097124, + "grad_norm": 4.7703166007995605, + "learning_rate": 5.029327120734548e-06, + "loss": 0.1605, + "step": 37022 + }, + { + "epoch": 2.1021568095496472, + "grad_norm": 6.491521835327148, + "learning_rate": 5.029190078114294e-06, + "loss": 0.2343, + "step": 37023 + }, + { + "epoch": 2.102170374389582, + "grad_norm": 6.52086067199707, + "learning_rate": 5.0290530354940394e-06, + "loss": 0.2474, + "step": 37024 + }, + { + "epoch": 2.102183939229517, + "grad_norm": 3.9863200187683105, + "learning_rate": 5.028915992873784e-06, + "loss": 0.1831, + "step": 37025 + }, + { + "epoch": 2.102197504069452, + "grad_norm": 4.244513034820557, + "learning_rate": 5.028778950253529e-06, + "loss": 0.1847, + "step": 37026 + }, + { + "epoch": 2.1022110689093867, + "grad_norm": 3.5254533290863037, + "learning_rate": 5.028641907633275e-06, + "loss": 0.1948, + "step": 37027 + }, + { + "epoch": 2.1022246337493216, + "grad_norm": 4.47584867477417, + "learning_rate": 5.028504865013019e-06, + "loss": 0.1735, + "step": 37028 + }, + { + "epoch": 2.102238198589257, + "grad_norm": 5.5960798263549805, + "learning_rate": 5.0283678223927645e-06, + "loss": 0.1996, + "step": 37029 + }, + { + "epoch": 2.1022517634291917, + "grad_norm": 6.0770392417907715, + "learning_rate": 5.02823077977251e-06, + "loss": 0.2557, + "step": 37030 + }, + { + "epoch": 2.1022653282691266, + "grad_norm": 3.772822380065918, + "learning_rate": 5.028093737152254e-06, + "loss": 0.1618, + "step": 37031 + }, + { + "epoch": 2.1022788931090615, + "grad_norm": 5.5280632972717285, + "learning_rate": 5.027956694532e-06, + "loss": 0.204, + "step": 37032 + }, + { + "epoch": 2.1022924579489963, + "grad_norm": 5.335318565368652, + "learning_rate": 5.027819651911745e-06, + "loss": 0.2493, + "step": 37033 + }, + { + "epoch": 2.102306022788931, + "grad_norm": 4.248979091644287, + "learning_rate": 5.0276826092914895e-06, + "loss": 0.2455, + "step": 37034 + }, + { + "epoch": 2.102319587628866, + "grad_norm": 5.179954528808594, + "learning_rate": 5.027545566671235e-06, + "loss": 0.1331, + "step": 37035 + }, + { + "epoch": 2.102333152468801, + "grad_norm": 3.766929864883423, + "learning_rate": 5.027408524050981e-06, + "loss": 0.2008, + "step": 37036 + }, + { + "epoch": 2.102346717308736, + "grad_norm": 6.5330657958984375, + "learning_rate": 5.027271481430726e-06, + "loss": 0.2184, + "step": 37037 + }, + { + "epoch": 2.1023602821486707, + "grad_norm": 4.147812366485596, + "learning_rate": 5.02713443881047e-06, + "loss": 0.1864, + "step": 37038 + }, + { + "epoch": 2.1023738469886055, + "grad_norm": 6.043140411376953, + "learning_rate": 5.026997396190215e-06, + "loss": 0.2101, + "step": 37039 + }, + { + "epoch": 2.1023874118285404, + "grad_norm": 3.904219388961792, + "learning_rate": 5.0268603535699614e-06, + "loss": 0.1864, + "step": 37040 + }, + { + "epoch": 2.1024009766684753, + "grad_norm": 4.9677605628967285, + "learning_rate": 5.026723310949706e-06, + "loss": 0.1727, + "step": 37041 + }, + { + "epoch": 2.10241454150841, + "grad_norm": 6.289180278778076, + "learning_rate": 5.026586268329451e-06, + "loss": 0.2985, + "step": 37042 + }, + { + "epoch": 2.102428106348345, + "grad_norm": 5.2880730628967285, + "learning_rate": 5.026449225709195e-06, + "loss": 0.1846, + "step": 37043 + }, + { + "epoch": 2.10244167118828, + "grad_norm": 3.1811885833740234, + "learning_rate": 5.0263121830889405e-06, + "loss": 0.1186, + "step": 37044 + }, + { + "epoch": 2.1024552360282147, + "grad_norm": 5.458782196044922, + "learning_rate": 5.0261751404686865e-06, + "loss": 0.2569, + "step": 37045 + }, + { + "epoch": 2.1024688008681496, + "grad_norm": 3.297853946685791, + "learning_rate": 5.026038097848432e-06, + "loss": 0.1385, + "step": 37046 + }, + { + "epoch": 2.1024823657080844, + "grad_norm": 5.220884323120117, + "learning_rate": 5.025901055228176e-06, + "loss": 0.2041, + "step": 37047 + }, + { + "epoch": 2.1024959305480193, + "grad_norm": 4.992417812347412, + "learning_rate": 5.025764012607921e-06, + "loss": 0.1901, + "step": 37048 + }, + { + "epoch": 2.1025094953879546, + "grad_norm": 6.814432621002197, + "learning_rate": 5.025626969987667e-06, + "loss": 0.264, + "step": 37049 + }, + { + "epoch": 2.1025230602278895, + "grad_norm": 4.6888203620910645, + "learning_rate": 5.0254899273674115e-06, + "loss": 0.1545, + "step": 37050 + }, + { + "epoch": 2.1025366250678243, + "grad_norm": 5.50662088394165, + "learning_rate": 5.025352884747157e-06, + "loss": 0.2057, + "step": 37051 + }, + { + "epoch": 2.102550189907759, + "grad_norm": 5.7204108238220215, + "learning_rate": 5.025215842126902e-06, + "loss": 0.3441, + "step": 37052 + }, + { + "epoch": 2.102563754747694, + "grad_norm": 3.7433252334594727, + "learning_rate": 5.025078799506647e-06, + "loss": 0.2073, + "step": 37053 + }, + { + "epoch": 2.102577319587629, + "grad_norm": 3.4023592472076416, + "learning_rate": 5.024941756886392e-06, + "loss": 0.1173, + "step": 37054 + }, + { + "epoch": 2.102590884427564, + "grad_norm": 4.849945068359375, + "learning_rate": 5.0248047142661374e-06, + "loss": 0.2186, + "step": 37055 + }, + { + "epoch": 2.1026044492674987, + "grad_norm": 3.7460484504699707, + "learning_rate": 5.024667671645882e-06, + "loss": 0.1133, + "step": 37056 + }, + { + "epoch": 2.1026180141074335, + "grad_norm": 4.65609884262085, + "learning_rate": 5.024530629025627e-06, + "loss": 0.1849, + "step": 37057 + }, + { + "epoch": 2.1026315789473684, + "grad_norm": 4.588845729827881, + "learning_rate": 5.024393586405373e-06, + "loss": 0.1569, + "step": 37058 + }, + { + "epoch": 2.1026451437873033, + "grad_norm": 3.6011199951171875, + "learning_rate": 5.024256543785117e-06, + "loss": 0.1932, + "step": 37059 + }, + { + "epoch": 2.102658708627238, + "grad_norm": 5.041926860809326, + "learning_rate": 5.0241195011648625e-06, + "loss": 0.2221, + "step": 37060 + }, + { + "epoch": 2.102672273467173, + "grad_norm": 3.706219434738159, + "learning_rate": 5.023982458544608e-06, + "loss": 0.1448, + "step": 37061 + }, + { + "epoch": 2.102685838307108, + "grad_norm": 3.9352004528045654, + "learning_rate": 5.023845415924354e-06, + "loss": 0.1505, + "step": 37062 + }, + { + "epoch": 2.1026994031470427, + "grad_norm": 6.314170837402344, + "learning_rate": 5.023708373304098e-06, + "loss": 0.2174, + "step": 37063 + }, + { + "epoch": 2.1027129679869776, + "grad_norm": 5.534008979797363, + "learning_rate": 5.023571330683843e-06, + "loss": 0.1786, + "step": 37064 + }, + { + "epoch": 2.1027265328269125, + "grad_norm": 4.945911884307861, + "learning_rate": 5.0234342880635875e-06, + "loss": 0.136, + "step": 37065 + }, + { + "epoch": 2.1027400976668473, + "grad_norm": 5.742223262786865, + "learning_rate": 5.0232972454433336e-06, + "loss": 0.2197, + "step": 37066 + }, + { + "epoch": 2.1027536625067826, + "grad_norm": 5.145753860473633, + "learning_rate": 5.023160202823079e-06, + "loss": 0.2599, + "step": 37067 + }, + { + "epoch": 2.1027672273467175, + "grad_norm": 4.6969218254089355, + "learning_rate": 5.023023160202823e-06, + "loss": 0.1761, + "step": 37068 + }, + { + "epoch": 2.1027807921866524, + "grad_norm": 3.9448530673980713, + "learning_rate": 5.022886117582568e-06, + "loss": 0.1682, + "step": 37069 + }, + { + "epoch": 2.1027943570265872, + "grad_norm": 5.3478569984436035, + "learning_rate": 5.022749074962314e-06, + "loss": 0.2421, + "step": 37070 + }, + { + "epoch": 2.102807921866522, + "grad_norm": 3.2177650928497314, + "learning_rate": 5.0226120323420594e-06, + "loss": 0.1378, + "step": 37071 + }, + { + "epoch": 2.102821486706457, + "grad_norm": 3.896092176437378, + "learning_rate": 5.022474989721804e-06, + "loss": 0.1833, + "step": 37072 + }, + { + "epoch": 2.102835051546392, + "grad_norm": 3.86185622215271, + "learning_rate": 5.022337947101549e-06, + "loss": 0.1464, + "step": 37073 + }, + { + "epoch": 2.1028486163863267, + "grad_norm": 3.0810468196868896, + "learning_rate": 5.022200904481293e-06, + "loss": 0.0929, + "step": 37074 + }, + { + "epoch": 2.1028621812262616, + "grad_norm": 3.5447418689727783, + "learning_rate": 5.022063861861039e-06, + "loss": 0.1751, + "step": 37075 + }, + { + "epoch": 2.1028757460661964, + "grad_norm": 3.0918641090393066, + "learning_rate": 5.0219268192407845e-06, + "loss": 0.1292, + "step": 37076 + }, + { + "epoch": 2.1028893109061313, + "grad_norm": 6.166425704956055, + "learning_rate": 5.021789776620529e-06, + "loss": 0.3106, + "step": 37077 + }, + { + "epoch": 2.102902875746066, + "grad_norm": 4.244475364685059, + "learning_rate": 5.021652734000274e-06, + "loss": 0.2305, + "step": 37078 + }, + { + "epoch": 2.102916440586001, + "grad_norm": 4.489232063293457, + "learning_rate": 5.02151569138002e-06, + "loss": 0.2064, + "step": 37079 + }, + { + "epoch": 2.102930005425936, + "grad_norm": 5.147386074066162, + "learning_rate": 5.021378648759765e-06, + "loss": 0.199, + "step": 37080 + }, + { + "epoch": 2.1029435702658708, + "grad_norm": 3.905008316040039, + "learning_rate": 5.0212416061395095e-06, + "loss": 0.1765, + "step": 37081 + }, + { + "epoch": 2.1029571351058056, + "grad_norm": 5.00277042388916, + "learning_rate": 5.021104563519255e-06, + "loss": 0.3215, + "step": 37082 + }, + { + "epoch": 2.1029706999457405, + "grad_norm": 5.445255279541016, + "learning_rate": 5.020967520899001e-06, + "loss": 0.2279, + "step": 37083 + }, + { + "epoch": 2.1029842647856754, + "grad_norm": 4.162213325500488, + "learning_rate": 5.020830478278745e-06, + "loss": 0.1068, + "step": 37084 + }, + { + "epoch": 2.10299782962561, + "grad_norm": 4.0487799644470215, + "learning_rate": 5.02069343565849e-06, + "loss": 0.1689, + "step": 37085 + }, + { + "epoch": 2.103011394465545, + "grad_norm": 4.487557411193848, + "learning_rate": 5.0205563930382354e-06, + "loss": 0.158, + "step": 37086 + }, + { + "epoch": 2.1030249593054804, + "grad_norm": 4.2369866371154785, + "learning_rate": 5.02041935041798e-06, + "loss": 0.1843, + "step": 37087 + }, + { + "epoch": 2.1030385241454153, + "grad_norm": 5.738308429718018, + "learning_rate": 5.020282307797726e-06, + "loss": 0.1951, + "step": 37088 + }, + { + "epoch": 2.10305208898535, + "grad_norm": 5.286615371704102, + "learning_rate": 5.020145265177471e-06, + "loss": 0.2062, + "step": 37089 + }, + { + "epoch": 2.103065653825285, + "grad_norm": 4.624053001403809, + "learning_rate": 5.020008222557215e-06, + "loss": 0.2399, + "step": 37090 + }, + { + "epoch": 2.10307921866522, + "grad_norm": 4.381683349609375, + "learning_rate": 5.0198711799369605e-06, + "loss": 0.22, + "step": 37091 + }, + { + "epoch": 2.1030927835051547, + "grad_norm": 5.060644149780273, + "learning_rate": 5.0197341373167065e-06, + "loss": 0.1703, + "step": 37092 + }, + { + "epoch": 2.1031063483450896, + "grad_norm": 4.296213626861572, + "learning_rate": 5.019597094696451e-06, + "loss": 0.1619, + "step": 37093 + }, + { + "epoch": 2.1031199131850244, + "grad_norm": 4.5592756271362305, + "learning_rate": 5.019460052076196e-06, + "loss": 0.2447, + "step": 37094 + }, + { + "epoch": 2.1031334780249593, + "grad_norm": 5.637767314910889, + "learning_rate": 5.019323009455941e-06, + "loss": 0.196, + "step": 37095 + }, + { + "epoch": 2.103147042864894, + "grad_norm": 6.715153217315674, + "learning_rate": 5.019185966835687e-06, + "loss": 0.3334, + "step": 37096 + }, + { + "epoch": 2.103160607704829, + "grad_norm": 4.556453704833984, + "learning_rate": 5.0190489242154316e-06, + "loss": 0.1648, + "step": 37097 + }, + { + "epoch": 2.103174172544764, + "grad_norm": 5.2844438552856445, + "learning_rate": 5.018911881595177e-06, + "loss": 0.3284, + "step": 37098 + }, + { + "epoch": 2.1031877373846988, + "grad_norm": 4.323697566986084, + "learning_rate": 5.018774838974921e-06, + "loss": 0.1628, + "step": 37099 + }, + { + "epoch": 2.1032013022246336, + "grad_norm": 5.0334014892578125, + "learning_rate": 5.018637796354666e-06, + "loss": 0.1909, + "step": 37100 + }, + { + "epoch": 2.1032148670645685, + "grad_norm": 4.176586627960205, + "learning_rate": 5.018500753734412e-06, + "loss": 0.1855, + "step": 37101 + }, + { + "epoch": 2.1032284319045034, + "grad_norm": 5.383432865142822, + "learning_rate": 5.018363711114157e-06, + "loss": 0.1988, + "step": 37102 + }, + { + "epoch": 2.1032419967444382, + "grad_norm": 5.814565658569336, + "learning_rate": 5.018226668493902e-06, + "loss": 0.1778, + "step": 37103 + }, + { + "epoch": 2.103255561584373, + "grad_norm": 6.282428741455078, + "learning_rate": 5.018089625873647e-06, + "loss": 0.3197, + "step": 37104 + }, + { + "epoch": 2.1032691264243084, + "grad_norm": 4.2852935791015625, + "learning_rate": 5.017952583253393e-06, + "loss": 0.1737, + "step": 37105 + }, + { + "epoch": 2.1032826912642433, + "grad_norm": 4.80528450012207, + "learning_rate": 5.017815540633137e-06, + "loss": 0.2754, + "step": 37106 + }, + { + "epoch": 2.103296256104178, + "grad_norm": 4.865468502044678, + "learning_rate": 5.0176784980128825e-06, + "loss": 0.179, + "step": 37107 + }, + { + "epoch": 2.103309820944113, + "grad_norm": 6.120302677154541, + "learning_rate": 5.017541455392627e-06, + "loss": 0.194, + "step": 37108 + }, + { + "epoch": 2.103323385784048, + "grad_norm": 4.86842679977417, + "learning_rate": 5.017404412772373e-06, + "loss": 0.1959, + "step": 37109 + }, + { + "epoch": 2.1033369506239827, + "grad_norm": 4.844985485076904, + "learning_rate": 5.017267370152118e-06, + "loss": 0.1818, + "step": 37110 + }, + { + "epoch": 2.1033505154639176, + "grad_norm": 4.300119400024414, + "learning_rate": 5.017130327531863e-06, + "loss": 0.2012, + "step": 37111 + }, + { + "epoch": 2.1033640803038525, + "grad_norm": 4.395332336425781, + "learning_rate": 5.0169932849116075e-06, + "loss": 0.2604, + "step": 37112 + }, + { + "epoch": 2.1033776451437873, + "grad_norm": 3.753394842147827, + "learning_rate": 5.016856242291353e-06, + "loss": 0.1444, + "step": 37113 + }, + { + "epoch": 2.103391209983722, + "grad_norm": 4.918655872344971, + "learning_rate": 5.016719199671099e-06, + "loss": 0.1882, + "step": 37114 + }, + { + "epoch": 2.103404774823657, + "grad_norm": 5.386929035186768, + "learning_rate": 5.016582157050843e-06, + "loss": 0.3084, + "step": 37115 + }, + { + "epoch": 2.103418339663592, + "grad_norm": 3.7244086265563965, + "learning_rate": 5.016445114430588e-06, + "loss": 0.1597, + "step": 37116 + }, + { + "epoch": 2.103431904503527, + "grad_norm": 3.4294378757476807, + "learning_rate": 5.016308071810333e-06, + "loss": 0.1243, + "step": 37117 + }, + { + "epoch": 2.1034454693434617, + "grad_norm": 4.317474365234375, + "learning_rate": 5.016171029190079e-06, + "loss": 0.2357, + "step": 37118 + }, + { + "epoch": 2.1034590341833965, + "grad_norm": 4.4902424812316895, + "learning_rate": 5.016033986569824e-06, + "loss": 0.2042, + "step": 37119 + }, + { + "epoch": 2.1034725990233314, + "grad_norm": 7.056674957275391, + "learning_rate": 5.015896943949569e-06, + "loss": 0.263, + "step": 37120 + }, + { + "epoch": 2.1034861638632663, + "grad_norm": 4.058080196380615, + "learning_rate": 5.015759901329313e-06, + "loss": 0.2003, + "step": 37121 + }, + { + "epoch": 2.103499728703201, + "grad_norm": 5.946634769439697, + "learning_rate": 5.015622858709059e-06, + "loss": 0.2332, + "step": 37122 + }, + { + "epoch": 2.103513293543136, + "grad_norm": 5.154284477233887, + "learning_rate": 5.0154858160888045e-06, + "loss": 0.1538, + "step": 37123 + }, + { + "epoch": 2.103526858383071, + "grad_norm": 4.131481647491455, + "learning_rate": 5.015348773468549e-06, + "loss": 0.1667, + "step": 37124 + }, + { + "epoch": 2.103540423223006, + "grad_norm": 4.04300594329834, + "learning_rate": 5.015211730848294e-06, + "loss": 0.225, + "step": 37125 + }, + { + "epoch": 2.103553988062941, + "grad_norm": 5.171632289886475, + "learning_rate": 5.015074688228039e-06, + "loss": 0.207, + "step": 37126 + }, + { + "epoch": 2.103567552902876, + "grad_norm": 3.1570253372192383, + "learning_rate": 5.014937645607784e-06, + "loss": 0.1081, + "step": 37127 + }, + { + "epoch": 2.1035811177428108, + "grad_norm": 5.892799377441406, + "learning_rate": 5.0148006029875296e-06, + "loss": 0.1685, + "step": 37128 + }, + { + "epoch": 2.1035946825827456, + "grad_norm": 3.878448247909546, + "learning_rate": 5.014663560367275e-06, + "loss": 0.1396, + "step": 37129 + }, + { + "epoch": 2.1036082474226805, + "grad_norm": 4.92470121383667, + "learning_rate": 5.014526517747019e-06, + "loss": 0.3052, + "step": 37130 + }, + { + "epoch": 2.1036218122626154, + "grad_norm": 4.563099384307861, + "learning_rate": 5.014389475126765e-06, + "loss": 0.1477, + "step": 37131 + }, + { + "epoch": 2.10363537710255, + "grad_norm": 4.128721714019775, + "learning_rate": 5.01425243250651e-06, + "loss": 0.1184, + "step": 37132 + }, + { + "epoch": 2.103648941942485, + "grad_norm": 6.220264434814453, + "learning_rate": 5.014115389886255e-06, + "loss": 0.3352, + "step": 37133 + }, + { + "epoch": 2.10366250678242, + "grad_norm": 3.384918689727783, + "learning_rate": 5.013978347266e-06, + "loss": 0.14, + "step": 37134 + }, + { + "epoch": 2.103676071622355, + "grad_norm": 3.5587873458862305, + "learning_rate": 5.013841304645746e-06, + "loss": 0.18, + "step": 37135 + }, + { + "epoch": 2.1036896364622897, + "grad_norm": 4.72081995010376, + "learning_rate": 5.01370426202549e-06, + "loss": 0.1436, + "step": 37136 + }, + { + "epoch": 2.1037032013022245, + "grad_norm": 4.489575386047363, + "learning_rate": 5.013567219405235e-06, + "loss": 0.1395, + "step": 37137 + }, + { + "epoch": 2.1037167661421594, + "grad_norm": 3.545001268386841, + "learning_rate": 5.0134301767849805e-06, + "loss": 0.1543, + "step": 37138 + }, + { + "epoch": 2.1037303309820943, + "grad_norm": 4.405797004699707, + "learning_rate": 5.0132931341647265e-06, + "loss": 0.1704, + "step": 37139 + }, + { + "epoch": 2.103743895822029, + "grad_norm": 4.154317855834961, + "learning_rate": 5.013156091544471e-06, + "loss": 0.1632, + "step": 37140 + }, + { + "epoch": 2.103757460661964, + "grad_norm": 5.247410774230957, + "learning_rate": 5.013019048924216e-06, + "loss": 0.1534, + "step": 37141 + }, + { + "epoch": 2.103771025501899, + "grad_norm": 3.69507098197937, + "learning_rate": 5.01288200630396e-06, + "loss": 0.113, + "step": 37142 + }, + { + "epoch": 2.103784590341834, + "grad_norm": 2.732212781906128, + "learning_rate": 5.0127449636837055e-06, + "loss": 0.0828, + "step": 37143 + }, + { + "epoch": 2.103798155181769, + "grad_norm": 3.0284738540649414, + "learning_rate": 5.0126079210634516e-06, + "loss": 0.0893, + "step": 37144 + }, + { + "epoch": 2.103811720021704, + "grad_norm": 4.646831035614014, + "learning_rate": 5.012470878443197e-06, + "loss": 0.1722, + "step": 37145 + }, + { + "epoch": 2.103825284861639, + "grad_norm": 5.265676975250244, + "learning_rate": 5.012333835822941e-06, + "loss": 0.2119, + "step": 37146 + }, + { + "epoch": 2.1038388497015736, + "grad_norm": 4.520508766174316, + "learning_rate": 5.012196793202686e-06, + "loss": 0.1625, + "step": 37147 + }, + { + "epoch": 2.1038524145415085, + "grad_norm": 2.6853668689727783, + "learning_rate": 5.012059750582432e-06, + "loss": 0.0988, + "step": 37148 + }, + { + "epoch": 2.1038659793814434, + "grad_norm": 3.4141945838928223, + "learning_rate": 5.011922707962177e-06, + "loss": 0.1282, + "step": 37149 + }, + { + "epoch": 2.1038795442213782, + "grad_norm": 2.1324007511138916, + "learning_rate": 5.011785665341922e-06, + "loss": 0.0939, + "step": 37150 + }, + { + "epoch": 2.103893109061313, + "grad_norm": 3.143000602722168, + "learning_rate": 5.011648622721666e-06, + "loss": 0.1461, + "step": 37151 + }, + { + "epoch": 2.103906673901248, + "grad_norm": 3.893709182739258, + "learning_rate": 5.011511580101412e-06, + "loss": 0.2014, + "step": 37152 + }, + { + "epoch": 2.103920238741183, + "grad_norm": 2.695331335067749, + "learning_rate": 5.011374537481157e-06, + "loss": 0.0712, + "step": 37153 + }, + { + "epoch": 2.1039338035811177, + "grad_norm": 3.2709717750549316, + "learning_rate": 5.0112374948609025e-06, + "loss": 0.1301, + "step": 37154 + }, + { + "epoch": 2.1039473684210526, + "grad_norm": 3.8641366958618164, + "learning_rate": 5.011100452240647e-06, + "loss": 0.0988, + "step": 37155 + }, + { + "epoch": 2.1039609332609874, + "grad_norm": 2.925017833709717, + "learning_rate": 5.010963409620392e-06, + "loss": 0.104, + "step": 37156 + }, + { + "epoch": 2.1039744981009223, + "grad_norm": 2.886766195297241, + "learning_rate": 5.010826367000138e-06, + "loss": 0.0985, + "step": 37157 + }, + { + "epoch": 2.103988062940857, + "grad_norm": 3.5749387741088867, + "learning_rate": 5.010689324379882e-06, + "loss": 0.0778, + "step": 37158 + }, + { + "epoch": 2.104001627780792, + "grad_norm": 5.330498218536377, + "learning_rate": 5.0105522817596276e-06, + "loss": 0.0878, + "step": 37159 + }, + { + "epoch": 2.104015192620727, + "grad_norm": 3.7678070068359375, + "learning_rate": 5.010415239139373e-06, + "loss": 0.2041, + "step": 37160 + }, + { + "epoch": 2.1040287574606618, + "grad_norm": 3.7430968284606934, + "learning_rate": 5.010278196519118e-06, + "loss": 0.0913, + "step": 37161 + }, + { + "epoch": 2.104042322300597, + "grad_norm": 6.419101238250732, + "learning_rate": 5.010141153898863e-06, + "loss": 0.1158, + "step": 37162 + }, + { + "epoch": 2.104055887140532, + "grad_norm": 4.033359050750732, + "learning_rate": 5.010004111278608e-06, + "loss": 0.1947, + "step": 37163 + }, + { + "epoch": 2.104069451980467, + "grad_norm": 3.2028965950012207, + "learning_rate": 5.009867068658353e-06, + "loss": 0.1285, + "step": 37164 + }, + { + "epoch": 2.1040830168204017, + "grad_norm": 2.795118808746338, + "learning_rate": 5.009730026038099e-06, + "loss": 0.09, + "step": 37165 + }, + { + "epoch": 2.1040965816603365, + "grad_norm": 3.9081954956054688, + "learning_rate": 5.009592983417844e-06, + "loss": 0.1314, + "step": 37166 + }, + { + "epoch": 2.1041101465002714, + "grad_norm": 5.357539176940918, + "learning_rate": 5.009455940797588e-06, + "loss": 0.2584, + "step": 37167 + }, + { + "epoch": 2.1041237113402063, + "grad_norm": 4.079217433929443, + "learning_rate": 5.009318898177333e-06, + "loss": 0.1572, + "step": 37168 + }, + { + "epoch": 2.104137276180141, + "grad_norm": 4.696761608123779, + "learning_rate": 5.0091818555570785e-06, + "loss": 0.1495, + "step": 37169 + }, + { + "epoch": 2.104150841020076, + "grad_norm": 4.995138645172119, + "learning_rate": 5.009044812936824e-06, + "loss": 0.1994, + "step": 37170 + }, + { + "epoch": 2.104164405860011, + "grad_norm": 5.014369487762451, + "learning_rate": 5.008907770316569e-06, + "loss": 0.1648, + "step": 37171 + }, + { + "epoch": 2.1041779706999457, + "grad_norm": 4.285702228546143, + "learning_rate": 5.008770727696314e-06, + "loss": 0.145, + "step": 37172 + }, + { + "epoch": 2.1041915355398806, + "grad_norm": 4.552685737609863, + "learning_rate": 5.008633685076058e-06, + "loss": 0.1734, + "step": 37173 + }, + { + "epoch": 2.1042051003798155, + "grad_norm": 3.7324676513671875, + "learning_rate": 5.008496642455804e-06, + "loss": 0.1093, + "step": 37174 + }, + { + "epoch": 2.1042186652197503, + "grad_norm": 3.5408554077148438, + "learning_rate": 5.00835959983555e-06, + "loss": 0.1647, + "step": 37175 + }, + { + "epoch": 2.104232230059685, + "grad_norm": 5.473789215087891, + "learning_rate": 5.008222557215294e-06, + "loss": 0.2255, + "step": 37176 + }, + { + "epoch": 2.10424579489962, + "grad_norm": 4.660139083862305, + "learning_rate": 5.008085514595039e-06, + "loss": 0.1363, + "step": 37177 + }, + { + "epoch": 2.104259359739555, + "grad_norm": 6.874318599700928, + "learning_rate": 5.007948471974785e-06, + "loss": 0.16, + "step": 37178 + }, + { + "epoch": 2.10427292457949, + "grad_norm": 6.027631759643555, + "learning_rate": 5.00781142935453e-06, + "loss": 0.2029, + "step": 37179 + }, + { + "epoch": 2.1042864894194246, + "grad_norm": 2.655914545059204, + "learning_rate": 5.007674386734275e-06, + "loss": 0.1078, + "step": 37180 + }, + { + "epoch": 2.10430005425936, + "grad_norm": 5.317319393157959, + "learning_rate": 5.00753734411402e-06, + "loss": 0.1629, + "step": 37181 + }, + { + "epoch": 2.104313619099295, + "grad_norm": 3.305710554122925, + "learning_rate": 5.007400301493764e-06, + "loss": 0.1036, + "step": 37182 + }, + { + "epoch": 2.1043271839392297, + "grad_norm": 4.717237949371338, + "learning_rate": 5.00726325887351e-06, + "loss": 0.1471, + "step": 37183 + }, + { + "epoch": 2.1043407487791645, + "grad_norm": 3.6453263759613037, + "learning_rate": 5.007126216253255e-06, + "loss": 0.1481, + "step": 37184 + }, + { + "epoch": 2.1043543136190994, + "grad_norm": 3.9376306533813477, + "learning_rate": 5.006989173633e-06, + "loss": 0.1096, + "step": 37185 + }, + { + "epoch": 2.1043678784590343, + "grad_norm": 2.728855848312378, + "learning_rate": 5.006852131012745e-06, + "loss": 0.1238, + "step": 37186 + }, + { + "epoch": 2.104381443298969, + "grad_norm": 6.158775329589844, + "learning_rate": 5.006715088392491e-06, + "loss": 0.2055, + "step": 37187 + }, + { + "epoch": 2.104395008138904, + "grad_norm": 3.983360528945923, + "learning_rate": 5.006578045772236e-06, + "loss": 0.0837, + "step": 37188 + }, + { + "epoch": 2.104408572978839, + "grad_norm": 2.8268606662750244, + "learning_rate": 5.00644100315198e-06, + "loss": 0.1715, + "step": 37189 + }, + { + "epoch": 2.1044221378187737, + "grad_norm": 5.6997270584106445, + "learning_rate": 5.0063039605317256e-06, + "loss": 0.2156, + "step": 37190 + }, + { + "epoch": 2.1044357026587086, + "grad_norm": 3.8454792499542236, + "learning_rate": 5.006166917911472e-06, + "loss": 0.2586, + "step": 37191 + }, + { + "epoch": 2.1044492674986435, + "grad_norm": 5.773959159851074, + "learning_rate": 5.006029875291216e-06, + "loss": 0.1742, + "step": 37192 + }, + { + "epoch": 2.1044628323385783, + "grad_norm": 3.57767391204834, + "learning_rate": 5.005892832670961e-06, + "loss": 0.1303, + "step": 37193 + }, + { + "epoch": 2.104476397178513, + "grad_norm": 4.341502666473389, + "learning_rate": 5.005755790050706e-06, + "loss": 0.1697, + "step": 37194 + }, + { + "epoch": 2.104489962018448, + "grad_norm": 4.79642391204834, + "learning_rate": 5.005618747430451e-06, + "loss": 0.2791, + "step": 37195 + }, + { + "epoch": 2.104503526858383, + "grad_norm": 5.242081642150879, + "learning_rate": 5.005481704810197e-06, + "loss": 0.1917, + "step": 37196 + }, + { + "epoch": 2.104517091698318, + "grad_norm": 5.527791500091553, + "learning_rate": 5.005344662189942e-06, + "loss": 0.1945, + "step": 37197 + }, + { + "epoch": 2.1045306565382527, + "grad_norm": 5.576564311981201, + "learning_rate": 5.005207619569686e-06, + "loss": 0.2314, + "step": 37198 + }, + { + "epoch": 2.1045442213781875, + "grad_norm": 5.072925090789795, + "learning_rate": 5.005070576949431e-06, + "loss": 0.2705, + "step": 37199 + }, + { + "epoch": 2.104557786218123, + "grad_norm": 3.7548482418060303, + "learning_rate": 5.004933534329177e-06, + "loss": 0.0845, + "step": 37200 + }, + { + "epoch": 2.1045713510580577, + "grad_norm": 5.23036527633667, + "learning_rate": 5.004796491708922e-06, + "loss": 0.14, + "step": 37201 + }, + { + "epoch": 2.1045849158979926, + "grad_norm": 4.721410751342773, + "learning_rate": 5.004659449088667e-06, + "loss": 0.2203, + "step": 37202 + }, + { + "epoch": 2.1045984807379274, + "grad_norm": 4.95338773727417, + "learning_rate": 5.004522406468412e-06, + "loss": 0.1664, + "step": 37203 + }, + { + "epoch": 2.1046120455778623, + "grad_norm": 5.4539618492126465, + "learning_rate": 5.004385363848158e-06, + "loss": 0.1807, + "step": 37204 + }, + { + "epoch": 2.104625610417797, + "grad_norm": 3.974773406982422, + "learning_rate": 5.004248321227902e-06, + "loss": 0.1616, + "step": 37205 + }, + { + "epoch": 2.104639175257732, + "grad_norm": 6.012622833251953, + "learning_rate": 5.004111278607648e-06, + "loss": 0.2085, + "step": 37206 + }, + { + "epoch": 2.104652740097667, + "grad_norm": 4.18576192855835, + "learning_rate": 5.003974235987392e-06, + "loss": 0.1416, + "step": 37207 + }, + { + "epoch": 2.1046663049376018, + "grad_norm": 3.544329881668091, + "learning_rate": 5.003837193367138e-06, + "loss": 0.1482, + "step": 37208 + }, + { + "epoch": 2.1046798697775366, + "grad_norm": 7.243969917297363, + "learning_rate": 5.003700150746883e-06, + "loss": 0.2651, + "step": 37209 + }, + { + "epoch": 2.1046934346174715, + "grad_norm": 4.086756229400635, + "learning_rate": 5.0035631081266275e-06, + "loss": 0.0888, + "step": 37210 + }, + { + "epoch": 2.1047069994574064, + "grad_norm": 3.3627212047576904, + "learning_rate": 5.003426065506373e-06, + "loss": 0.14, + "step": 37211 + }, + { + "epoch": 2.1047205642973412, + "grad_norm": 5.531743049621582, + "learning_rate": 5.003289022886118e-06, + "loss": 0.2626, + "step": 37212 + }, + { + "epoch": 2.104734129137276, + "grad_norm": 4.883323669433594, + "learning_rate": 5.003151980265864e-06, + "loss": 0.2164, + "step": 37213 + }, + { + "epoch": 2.104747693977211, + "grad_norm": 5.6204118728637695, + "learning_rate": 5.003014937645608e-06, + "loss": 0.1744, + "step": 37214 + }, + { + "epoch": 2.104761258817146, + "grad_norm": 6.305934906005859, + "learning_rate": 5.002877895025353e-06, + "loss": 0.2466, + "step": 37215 + }, + { + "epoch": 2.1047748236570807, + "grad_norm": 4.340088844299316, + "learning_rate": 5.002740852405098e-06, + "loss": 0.1661, + "step": 37216 + }, + { + "epoch": 2.1047883884970155, + "grad_norm": 5.1743597984313965, + "learning_rate": 5.002603809784844e-06, + "loss": 0.1949, + "step": 37217 + }, + { + "epoch": 2.1048019533369504, + "grad_norm": 4.45379114151001, + "learning_rate": 5.002466767164589e-06, + "loss": 0.1507, + "step": 37218 + }, + { + "epoch": 2.1048155181768857, + "grad_norm": 4.167552947998047, + "learning_rate": 5.002329724544333e-06, + "loss": 0.1014, + "step": 37219 + }, + { + "epoch": 2.1048290830168206, + "grad_norm": 3.247035026550293, + "learning_rate": 5.002192681924078e-06, + "loss": 0.0907, + "step": 37220 + }, + { + "epoch": 2.1048426478567555, + "grad_norm": 6.002133369445801, + "learning_rate": 5.002055639303824e-06, + "loss": 0.1807, + "step": 37221 + }, + { + "epoch": 2.1048562126966903, + "grad_norm": 3.5015668869018555, + "learning_rate": 5.00191859668357e-06, + "loss": 0.1254, + "step": 37222 + }, + { + "epoch": 2.104869777536625, + "grad_norm": 4.756872177124023, + "learning_rate": 5.001781554063314e-06, + "loss": 0.1287, + "step": 37223 + }, + { + "epoch": 2.10488334237656, + "grad_norm": 3.3344948291778564, + "learning_rate": 5.001644511443059e-06, + "loss": 0.1037, + "step": 37224 + }, + { + "epoch": 2.104896907216495, + "grad_norm": 4.6815667152404785, + "learning_rate": 5.0015074688228034e-06, + "loss": 0.1525, + "step": 37225 + }, + { + "epoch": 2.10491047205643, + "grad_norm": 5.918367862701416, + "learning_rate": 5.0013704262025495e-06, + "loss": 0.2016, + "step": 37226 + }, + { + "epoch": 2.1049240368963646, + "grad_norm": 4.48985481262207, + "learning_rate": 5.001233383582295e-06, + "loss": 0.181, + "step": 37227 + }, + { + "epoch": 2.1049376017362995, + "grad_norm": 5.186934947967529, + "learning_rate": 5.00109634096204e-06, + "loss": 0.2044, + "step": 37228 + }, + { + "epoch": 2.1049511665762344, + "grad_norm": 3.8549797534942627, + "learning_rate": 5.000959298341784e-06, + "loss": 0.1178, + "step": 37229 + }, + { + "epoch": 2.1049647314161692, + "grad_norm": 4.147420406341553, + "learning_rate": 5.00082225572153e-06, + "loss": 0.1214, + "step": 37230 + }, + { + "epoch": 2.104978296256104, + "grad_norm": 4.018648147583008, + "learning_rate": 5.000685213101275e-06, + "loss": 0.1526, + "step": 37231 + }, + { + "epoch": 2.104991861096039, + "grad_norm": 5.594878673553467, + "learning_rate": 5.00054817048102e-06, + "loss": 0.2676, + "step": 37232 + }, + { + "epoch": 2.105005425935974, + "grad_norm": 3.999462604522705, + "learning_rate": 5.000411127860765e-06, + "loss": 0.175, + "step": 37233 + }, + { + "epoch": 2.1050189907759087, + "grad_norm": 3.607365608215332, + "learning_rate": 5.000274085240511e-06, + "loss": 0.1265, + "step": 37234 + }, + { + "epoch": 2.1050325556158436, + "grad_norm": 4.280890941619873, + "learning_rate": 5.000137042620255e-06, + "loss": 0.1455, + "step": 37235 + }, + { + "epoch": 2.1050461204557784, + "grad_norm": 5.890313625335693, + "learning_rate": 5e-06, + "loss": 0.1654, + "step": 37236 + }, + { + "epoch": 2.1050596852957133, + "grad_norm": 4.874685287475586, + "learning_rate": 4.999862957379746e-06, + "loss": 0.1994, + "step": 37237 + }, + { + "epoch": 2.1050732501356486, + "grad_norm": 5.441013813018799, + "learning_rate": 4.999725914759491e-06, + "loss": 0.1798, + "step": 37238 + }, + { + "epoch": 2.1050868149755835, + "grad_norm": 5.163740158081055, + "learning_rate": 4.999588872139235e-06, + "loss": 0.1772, + "step": 37239 + }, + { + "epoch": 2.1051003798155183, + "grad_norm": 4.845993518829346, + "learning_rate": 4.999451829518981e-06, + "loss": 0.1661, + "step": 37240 + }, + { + "epoch": 2.105113944655453, + "grad_norm": 8.191343307495117, + "learning_rate": 4.9993147868987255e-06, + "loss": 0.272, + "step": 37241 + }, + { + "epoch": 2.105127509495388, + "grad_norm": 4.341683864593506, + "learning_rate": 4.9991777442784715e-06, + "loss": 0.1706, + "step": 37242 + }, + { + "epoch": 2.105141074335323, + "grad_norm": 3.1598153114318848, + "learning_rate": 4.999040701658216e-06, + "loss": 0.0941, + "step": 37243 + }, + { + "epoch": 2.105154639175258, + "grad_norm": 4.249356746673584, + "learning_rate": 4.998903659037961e-06, + "loss": 0.1664, + "step": 37244 + }, + { + "epoch": 2.1051682040151927, + "grad_norm": 4.106667518615723, + "learning_rate": 4.998766616417706e-06, + "loss": 0.114, + "step": 37245 + }, + { + "epoch": 2.1051817688551275, + "grad_norm": 4.348952770233154, + "learning_rate": 4.998629573797451e-06, + "loss": 0.1624, + "step": 37246 + }, + { + "epoch": 2.1051953336950624, + "grad_norm": 5.0187602043151855, + "learning_rate": 4.9984925311771965e-06, + "loss": 0.2726, + "step": 37247 + }, + { + "epoch": 2.1052088985349973, + "grad_norm": 5.892847537994385, + "learning_rate": 4.998355488556942e-06, + "loss": 0.1097, + "step": 37248 + }, + { + "epoch": 2.105222463374932, + "grad_norm": 4.205574989318848, + "learning_rate": 4.998218445936687e-06, + "loss": 0.1435, + "step": 37249 + }, + { + "epoch": 2.105236028214867, + "grad_norm": 3.886566162109375, + "learning_rate": 4.998081403316431e-06, + "loss": 0.1523, + "step": 37250 + }, + { + "epoch": 2.105249593054802, + "grad_norm": 4.778016090393066, + "learning_rate": 4.997944360696177e-06, + "loss": 0.2098, + "step": 37251 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 5.01401948928833, + "learning_rate": 4.997807318075922e-06, + "loss": 0.1779, + "step": 37252 + }, + { + "epoch": 2.1052767227346716, + "grad_norm": 4.67655611038208, + "learning_rate": 4.997670275455668e-06, + "loss": 0.1728, + "step": 37253 + }, + { + "epoch": 2.1052902875746065, + "grad_norm": 4.945975303649902, + "learning_rate": 4.997533232835412e-06, + "loss": 0.1591, + "step": 37254 + }, + { + "epoch": 2.1053038524145413, + "grad_norm": 4.285203456878662, + "learning_rate": 4.997396190215157e-06, + "loss": 0.1232, + "step": 37255 + }, + { + "epoch": 2.105317417254476, + "grad_norm": 4.191242218017578, + "learning_rate": 4.997259147594902e-06, + "loss": 0.194, + "step": 37256 + }, + { + "epoch": 2.1053309820944115, + "grad_norm": 6.654203414916992, + "learning_rate": 4.9971221049746475e-06, + "loss": 0.2422, + "step": 37257 + }, + { + "epoch": 2.1053445469343464, + "grad_norm": 4.257664680480957, + "learning_rate": 4.996985062354393e-06, + "loss": 0.1293, + "step": 37258 + }, + { + "epoch": 2.1053581117742812, + "grad_norm": 3.9090540409088135, + "learning_rate": 4.996848019734138e-06, + "loss": 0.1457, + "step": 37259 + }, + { + "epoch": 2.105371676614216, + "grad_norm": 4.403857231140137, + "learning_rate": 4.996710977113883e-06, + "loss": 0.1972, + "step": 37260 + }, + { + "epoch": 2.105385241454151, + "grad_norm": 2.9729628562927246, + "learning_rate": 4.996573934493628e-06, + "loss": 0.1179, + "step": 37261 + }, + { + "epoch": 2.105398806294086, + "grad_norm": 4.809093952178955, + "learning_rate": 4.996436891873373e-06, + "loss": 0.2024, + "step": 37262 + }, + { + "epoch": 2.1054123711340207, + "grad_norm": 4.676050662994385, + "learning_rate": 4.9962998492531185e-06, + "loss": 0.2722, + "step": 37263 + }, + { + "epoch": 2.1054259359739556, + "grad_norm": 4.275721549987793, + "learning_rate": 4.996162806632863e-06, + "loss": 0.1291, + "step": 37264 + }, + { + "epoch": 2.1054395008138904, + "grad_norm": 4.215418338775635, + "learning_rate": 4.996025764012608e-06, + "loss": 0.2086, + "step": 37265 + }, + { + "epoch": 2.1054530656538253, + "grad_norm": 3.5867667198181152, + "learning_rate": 4.995888721392353e-06, + "loss": 0.117, + "step": 37266 + }, + { + "epoch": 2.10546663049376, + "grad_norm": 4.6287665367126465, + "learning_rate": 4.995751678772098e-06, + "loss": 0.1453, + "step": 37267 + }, + { + "epoch": 2.105480195333695, + "grad_norm": 5.177972316741943, + "learning_rate": 4.995614636151844e-06, + "loss": 0.1939, + "step": 37268 + }, + { + "epoch": 2.10549376017363, + "grad_norm": 4.5849504470825195, + "learning_rate": 4.995477593531589e-06, + "loss": 0.1829, + "step": 37269 + }, + { + "epoch": 2.1055073250135647, + "grad_norm": 3.6757216453552246, + "learning_rate": 4.995340550911334e-06, + "loss": 0.1377, + "step": 37270 + }, + { + "epoch": 2.1055208898534996, + "grad_norm": 5.1381306648254395, + "learning_rate": 4.995203508291079e-06, + "loss": 0.1514, + "step": 37271 + }, + { + "epoch": 2.1055344546934345, + "grad_norm": 7.303917407989502, + "learning_rate": 4.995066465670824e-06, + "loss": 0.2061, + "step": 37272 + }, + { + "epoch": 2.1055480195333693, + "grad_norm": 4.755018711090088, + "learning_rate": 4.994929423050569e-06, + "loss": 0.1808, + "step": 37273 + }, + { + "epoch": 2.105561584373304, + "grad_norm": 3.603846788406372, + "learning_rate": 4.994792380430315e-06, + "loss": 0.1467, + "step": 37274 + }, + { + "epoch": 2.105575149213239, + "grad_norm": 5.504025936126709, + "learning_rate": 4.994655337810059e-06, + "loss": 0.2094, + "step": 37275 + }, + { + "epoch": 2.1055887140531744, + "grad_norm": 4.4124436378479, + "learning_rate": 4.994518295189805e-06, + "loss": 0.2372, + "step": 37276 + }, + { + "epoch": 2.1056022788931092, + "grad_norm": 4.7273359298706055, + "learning_rate": 4.994381252569549e-06, + "loss": 0.185, + "step": 37277 + }, + { + "epoch": 2.105615843733044, + "grad_norm": 5.431962013244629, + "learning_rate": 4.9942442099492945e-06, + "loss": 0.1677, + "step": 37278 + }, + { + "epoch": 2.105629408572979, + "grad_norm": 3.7706735134124756, + "learning_rate": 4.99410716732904e-06, + "loss": 0.1482, + "step": 37279 + }, + { + "epoch": 2.105642973412914, + "grad_norm": 4.914137363433838, + "learning_rate": 4.993970124708785e-06, + "loss": 0.1688, + "step": 37280 + }, + { + "epoch": 2.1056565382528487, + "grad_norm": 5.567780494689941, + "learning_rate": 4.99383308208853e-06, + "loss": 0.1832, + "step": 37281 + }, + { + "epoch": 2.1056701030927836, + "grad_norm": 5.723496913909912, + "learning_rate": 4.993696039468275e-06, + "loss": 0.1496, + "step": 37282 + }, + { + "epoch": 2.1056836679327184, + "grad_norm": 4.8240766525268555, + "learning_rate": 4.9935589968480204e-06, + "loss": 0.1577, + "step": 37283 + }, + { + "epoch": 2.1056972327726533, + "grad_norm": 5.315557479858398, + "learning_rate": 4.993421954227765e-06, + "loss": 0.2268, + "step": 37284 + }, + { + "epoch": 2.105710797612588, + "grad_norm": 4.412738800048828, + "learning_rate": 4.993284911607511e-06, + "loss": 0.152, + "step": 37285 + }, + { + "epoch": 2.105724362452523, + "grad_norm": 2.9475064277648926, + "learning_rate": 4.993147868987255e-06, + "loss": 0.082, + "step": 37286 + }, + { + "epoch": 2.105737927292458, + "grad_norm": 3.889786720275879, + "learning_rate": 4.993010826367001e-06, + "loss": 0.1843, + "step": 37287 + }, + { + "epoch": 2.1057514921323928, + "grad_norm": 3.704502582550049, + "learning_rate": 4.9928737837467455e-06, + "loss": 0.1724, + "step": 37288 + }, + { + "epoch": 2.1057650569723276, + "grad_norm": 3.030674934387207, + "learning_rate": 4.992736741126491e-06, + "loss": 0.1113, + "step": 37289 + }, + { + "epoch": 2.1057786218122625, + "grad_norm": 4.619530200958252, + "learning_rate": 4.992599698506236e-06, + "loss": 0.22, + "step": 37290 + }, + { + "epoch": 2.1057921866521974, + "grad_norm": 5.727408409118652, + "learning_rate": 4.992462655885981e-06, + "loss": 0.1735, + "step": 37291 + }, + { + "epoch": 2.1058057514921322, + "grad_norm": 5.839106559753418, + "learning_rate": 4.992325613265726e-06, + "loss": 0.1611, + "step": 37292 + }, + { + "epoch": 2.105819316332067, + "grad_norm": 5.336245059967041, + "learning_rate": 4.9921885706454705e-06, + "loss": 0.1981, + "step": 37293 + }, + { + "epoch": 2.105832881172002, + "grad_norm": 4.991760730743408, + "learning_rate": 4.9920515280252165e-06, + "loss": 0.1257, + "step": 37294 + }, + { + "epoch": 2.1058464460119373, + "grad_norm": 4.375823497772217, + "learning_rate": 4.991914485404961e-06, + "loss": 0.1444, + "step": 37295 + }, + { + "epoch": 2.105860010851872, + "grad_norm": 5.369782447814941, + "learning_rate": 4.991777442784707e-06, + "loss": 0.1699, + "step": 37296 + }, + { + "epoch": 2.105873575691807, + "grad_norm": 5.2191481590271, + "learning_rate": 4.991640400164451e-06, + "loss": 0.1797, + "step": 37297 + }, + { + "epoch": 2.105887140531742, + "grad_norm": 6.020334720611572, + "learning_rate": 4.991503357544196e-06, + "loss": 0.1837, + "step": 37298 + }, + { + "epoch": 2.1059007053716767, + "grad_norm": 4.1541948318481445, + "learning_rate": 4.991366314923942e-06, + "loss": 0.1475, + "step": 37299 + }, + { + "epoch": 2.1059142702116116, + "grad_norm": 3.710108518600464, + "learning_rate": 4.991229272303687e-06, + "loss": 0.1488, + "step": 37300 + }, + { + "epoch": 2.1059278350515465, + "grad_norm": 4.696605682373047, + "learning_rate": 4.991092229683432e-06, + "loss": 0.1303, + "step": 37301 + }, + { + "epoch": 2.1059413998914813, + "grad_norm": 4.428075313568115, + "learning_rate": 4.990955187063177e-06, + "loss": 0.1066, + "step": 37302 + }, + { + "epoch": 2.105954964731416, + "grad_norm": 4.306586265563965, + "learning_rate": 4.990818144442922e-06, + "loss": 0.1634, + "step": 37303 + }, + { + "epoch": 2.105968529571351, + "grad_norm": 3.2507972717285156, + "learning_rate": 4.9906811018226675e-06, + "loss": 0.1444, + "step": 37304 + }, + { + "epoch": 2.105982094411286, + "grad_norm": 3.4340169429779053, + "learning_rate": 4.990544059202413e-06, + "loss": 0.0837, + "step": 37305 + }, + { + "epoch": 2.105995659251221, + "grad_norm": 4.048287868499756, + "learning_rate": 4.990407016582157e-06, + "loss": 0.1724, + "step": 37306 + }, + { + "epoch": 2.1060092240911557, + "grad_norm": 4.493576526641846, + "learning_rate": 4.990269973961902e-06, + "loss": 0.1314, + "step": 37307 + }, + { + "epoch": 2.1060227889310905, + "grad_norm": 4.084855079650879, + "learning_rate": 4.990132931341647e-06, + "loss": 0.0855, + "step": 37308 + }, + { + "epoch": 2.1060363537710254, + "grad_norm": 4.306626796722412, + "learning_rate": 4.9899958887213925e-06, + "loss": 0.1651, + "step": 37309 + }, + { + "epoch": 2.1060499186109602, + "grad_norm": 3.4450647830963135, + "learning_rate": 4.989858846101138e-06, + "loss": 0.1731, + "step": 37310 + }, + { + "epoch": 2.106063483450895, + "grad_norm": 3.799795389175415, + "learning_rate": 4.989721803480883e-06, + "loss": 0.1791, + "step": 37311 + }, + { + "epoch": 2.10607704829083, + "grad_norm": 4.197301864624023, + "learning_rate": 4.989584760860628e-06, + "loss": 0.216, + "step": 37312 + }, + { + "epoch": 2.106090613130765, + "grad_norm": 4.914029121398926, + "learning_rate": 4.989447718240373e-06, + "loss": 0.1362, + "step": 37313 + }, + { + "epoch": 2.1061041779707, + "grad_norm": 4.135285377502441, + "learning_rate": 4.9893106756201184e-06, + "loss": 0.1108, + "step": 37314 + }, + { + "epoch": 2.106117742810635, + "grad_norm": 3.1419529914855957, + "learning_rate": 4.989173632999864e-06, + "loss": 0.079, + "step": 37315 + }, + { + "epoch": 2.10613130765057, + "grad_norm": 3.558403968811035, + "learning_rate": 4.989036590379609e-06, + "loss": 0.1116, + "step": 37316 + }, + { + "epoch": 2.1061448724905047, + "grad_norm": 4.301039695739746, + "learning_rate": 4.988899547759354e-06, + "loss": 0.1625, + "step": 37317 + }, + { + "epoch": 2.1061584373304396, + "grad_norm": 3.1661384105682373, + "learning_rate": 4.988762505139098e-06, + "loss": 0.1053, + "step": 37318 + }, + { + "epoch": 2.1061720021703745, + "grad_norm": 5.224211692810059, + "learning_rate": 4.9886254625188435e-06, + "loss": 0.2322, + "step": 37319 + }, + { + "epoch": 2.1061855670103093, + "grad_norm": 5.178080081939697, + "learning_rate": 4.988488419898589e-06, + "loss": 0.2115, + "step": 37320 + }, + { + "epoch": 2.106199131850244, + "grad_norm": 3.705824851989746, + "learning_rate": 4.988351377278334e-06, + "loss": 0.0816, + "step": 37321 + }, + { + "epoch": 2.106212696690179, + "grad_norm": 3.662212371826172, + "learning_rate": 4.988214334658079e-06, + "loss": 0.1585, + "step": 37322 + }, + { + "epoch": 2.106226261530114, + "grad_norm": 6.7458906173706055, + "learning_rate": 4.988077292037824e-06, + "loss": 0.345, + "step": 37323 + }, + { + "epoch": 2.106239826370049, + "grad_norm": 5.462011337280273, + "learning_rate": 4.987940249417569e-06, + "loss": 0.2319, + "step": 37324 + }, + { + "epoch": 2.1062533912099837, + "grad_norm": 4.740096569061279, + "learning_rate": 4.9878032067973146e-06, + "loss": 0.1903, + "step": 37325 + }, + { + "epoch": 2.1062669560499185, + "grad_norm": 3.2292027473449707, + "learning_rate": 4.98766616417706e-06, + "loss": 0.1276, + "step": 37326 + }, + { + "epoch": 2.1062805208898534, + "grad_norm": 4.137899875640869, + "learning_rate": 4.987529121556804e-06, + "loss": 0.1351, + "step": 37327 + }, + { + "epoch": 2.1062940857297883, + "grad_norm": 3.719139814376831, + "learning_rate": 4.98739207893655e-06, + "loss": 0.1302, + "step": 37328 + }, + { + "epoch": 2.106307650569723, + "grad_norm": 2.3964779376983643, + "learning_rate": 4.987255036316294e-06, + "loss": 0.1037, + "step": 37329 + }, + { + "epoch": 2.106321215409658, + "grad_norm": 4.814018726348877, + "learning_rate": 4.9871179936960404e-06, + "loss": 0.1417, + "step": 37330 + }, + { + "epoch": 2.106334780249593, + "grad_norm": 5.4188032150268555, + "learning_rate": 4.986980951075785e-06, + "loss": 0.1797, + "step": 37331 + }, + { + "epoch": 2.1063483450895277, + "grad_norm": 4.07574987411499, + "learning_rate": 4.98684390845553e-06, + "loss": 0.2596, + "step": 37332 + }, + { + "epoch": 2.106361909929463, + "grad_norm": 5.226585865020752, + "learning_rate": 4.986706865835275e-06, + "loss": 0.2079, + "step": 37333 + }, + { + "epoch": 2.106375474769398, + "grad_norm": 4.732209205627441, + "learning_rate": 4.98656982321502e-06, + "loss": 0.2026, + "step": 37334 + }, + { + "epoch": 2.1063890396093328, + "grad_norm": 4.996542453765869, + "learning_rate": 4.9864327805947655e-06, + "loss": 0.119, + "step": 37335 + }, + { + "epoch": 2.1064026044492676, + "grad_norm": 5.1878252029418945, + "learning_rate": 4.986295737974511e-06, + "loss": 0.2012, + "step": 37336 + }, + { + "epoch": 2.1064161692892025, + "grad_norm": 3.4295384883880615, + "learning_rate": 4.986158695354256e-06, + "loss": 0.1557, + "step": 37337 + }, + { + "epoch": 2.1064297341291374, + "grad_norm": 3.7997591495513916, + "learning_rate": 4.986021652734e-06, + "loss": 0.1564, + "step": 37338 + }, + { + "epoch": 2.1064432989690722, + "grad_norm": 3.9616611003875732, + "learning_rate": 4.985884610113746e-06, + "loss": 0.2502, + "step": 37339 + }, + { + "epoch": 2.106456863809007, + "grad_norm": 3.553895950317383, + "learning_rate": 4.9857475674934905e-06, + "loss": 0.1007, + "step": 37340 + }, + { + "epoch": 2.106470428648942, + "grad_norm": 3.8071348667144775, + "learning_rate": 4.9856105248732366e-06, + "loss": 0.1056, + "step": 37341 + }, + { + "epoch": 2.106483993488877, + "grad_norm": 3.748314142227173, + "learning_rate": 4.985473482252981e-06, + "loss": 0.1034, + "step": 37342 + }, + { + "epoch": 2.1064975583288117, + "grad_norm": 3.955512523651123, + "learning_rate": 4.985336439632726e-06, + "loss": 0.1218, + "step": 37343 + }, + { + "epoch": 2.1065111231687466, + "grad_norm": 3.5796234607696533, + "learning_rate": 4.985199397012471e-06, + "loss": 0.1085, + "step": 37344 + }, + { + "epoch": 2.1065246880086814, + "grad_norm": 4.842869281768799, + "learning_rate": 4.9850623543922164e-06, + "loss": 0.1457, + "step": 37345 + }, + { + "epoch": 2.1065382528486163, + "grad_norm": 3.5152029991149902, + "learning_rate": 4.984925311771962e-06, + "loss": 0.1236, + "step": 37346 + }, + { + "epoch": 2.106551817688551, + "grad_norm": 4.050786972045898, + "learning_rate": 4.984788269151706e-06, + "loss": 0.1988, + "step": 37347 + }, + { + "epoch": 2.106565382528486, + "grad_norm": 6.645169734954834, + "learning_rate": 4.984651226531452e-06, + "loss": 0.1777, + "step": 37348 + }, + { + "epoch": 2.106578947368421, + "grad_norm": 4.255162715911865, + "learning_rate": 4.984514183911196e-06, + "loss": 0.1484, + "step": 37349 + }, + { + "epoch": 2.1065925122083557, + "grad_norm": 4.000788688659668, + "learning_rate": 4.984377141290942e-06, + "loss": 0.2038, + "step": 37350 + }, + { + "epoch": 2.1066060770482906, + "grad_norm": 4.154021739959717, + "learning_rate": 4.984240098670687e-06, + "loss": 0.1192, + "step": 37351 + }, + { + "epoch": 2.106619641888226, + "grad_norm": 3.71893048286438, + "learning_rate": 4.984103056050432e-06, + "loss": 0.1268, + "step": 37352 + }, + { + "epoch": 2.106633206728161, + "grad_norm": 4.509829521179199, + "learning_rate": 4.983966013430177e-06, + "loss": 0.1857, + "step": 37353 + }, + { + "epoch": 2.1066467715680957, + "grad_norm": 3.791823387145996, + "learning_rate": 4.983828970809922e-06, + "loss": 0.0835, + "step": 37354 + }, + { + "epoch": 2.1066603364080305, + "grad_norm": 6.4755353927612305, + "learning_rate": 4.983691928189667e-06, + "loss": 0.2849, + "step": 37355 + }, + { + "epoch": 2.1066739012479654, + "grad_norm": 4.366964340209961, + "learning_rate": 4.9835548855694126e-06, + "loss": 0.1633, + "step": 37356 + }, + { + "epoch": 2.1066874660879003, + "grad_norm": 3.7787699699401855, + "learning_rate": 4.983417842949158e-06, + "loss": 0.1991, + "step": 37357 + }, + { + "epoch": 2.106701030927835, + "grad_norm": 5.788957118988037, + "learning_rate": 4.983280800328903e-06, + "loss": 0.1988, + "step": 37358 + }, + { + "epoch": 2.10671459576777, + "grad_norm": 5.054245948791504, + "learning_rate": 4.983143757708648e-06, + "loss": 0.1341, + "step": 37359 + }, + { + "epoch": 2.106728160607705, + "grad_norm": 4.076462745666504, + "learning_rate": 4.9830067150883924e-06, + "loss": 0.2158, + "step": 37360 + }, + { + "epoch": 2.1067417254476397, + "grad_norm": 4.888559341430664, + "learning_rate": 4.982869672468138e-06, + "loss": 0.2036, + "step": 37361 + }, + { + "epoch": 2.1067552902875746, + "grad_norm": 5.4567155838012695, + "learning_rate": 4.982732629847883e-06, + "loss": 0.2151, + "step": 37362 + }, + { + "epoch": 2.1067688551275094, + "grad_norm": 3.9760870933532715, + "learning_rate": 4.982595587227628e-06, + "loss": 0.1814, + "step": 37363 + }, + { + "epoch": 2.1067824199674443, + "grad_norm": 4.833907604217529, + "learning_rate": 4.982458544607373e-06, + "loss": 0.218, + "step": 37364 + }, + { + "epoch": 2.106795984807379, + "grad_norm": 5.172830104827881, + "learning_rate": 4.982321501987118e-06, + "loss": 0.2235, + "step": 37365 + }, + { + "epoch": 2.106809549647314, + "grad_norm": 4.08350133895874, + "learning_rate": 4.9821844593668635e-06, + "loss": 0.1629, + "step": 37366 + }, + { + "epoch": 2.106823114487249, + "grad_norm": 3.1144933700561523, + "learning_rate": 4.982047416746609e-06, + "loss": 0.1161, + "step": 37367 + }, + { + "epoch": 2.1068366793271838, + "grad_norm": 5.7128376960754395, + "learning_rate": 4.981910374126354e-06, + "loss": 0.2675, + "step": 37368 + }, + { + "epoch": 2.1068502441671186, + "grad_norm": 5.62742805480957, + "learning_rate": 4.981773331506099e-06, + "loss": 0.2799, + "step": 37369 + }, + { + "epoch": 2.106863809007054, + "grad_norm": 3.6747210025787354, + "learning_rate": 4.981636288885844e-06, + "loss": 0.1714, + "step": 37370 + }, + { + "epoch": 2.106877373846989, + "grad_norm": 7.156214714050293, + "learning_rate": 4.981499246265589e-06, + "loss": 0.3336, + "step": 37371 + }, + { + "epoch": 2.1068909386869237, + "grad_norm": 5.354926586151123, + "learning_rate": 4.981362203645334e-06, + "loss": 0.2411, + "step": 37372 + }, + { + "epoch": 2.1069045035268585, + "grad_norm": 4.752800941467285, + "learning_rate": 4.98122516102508e-06, + "loss": 0.1436, + "step": 37373 + }, + { + "epoch": 2.1069180683667934, + "grad_norm": 6.319612503051758, + "learning_rate": 4.981088118404824e-06, + "loss": 0.2785, + "step": 37374 + }, + { + "epoch": 2.1069316332067283, + "grad_norm": 5.114192962646484, + "learning_rate": 4.980951075784569e-06, + "loss": 0.2291, + "step": 37375 + }, + { + "epoch": 2.106945198046663, + "grad_norm": 5.890740394592285, + "learning_rate": 4.9808140331643144e-06, + "loss": 0.295, + "step": 37376 + }, + { + "epoch": 2.106958762886598, + "grad_norm": 3.7006070613861084, + "learning_rate": 4.98067699054406e-06, + "loss": 0.1932, + "step": 37377 + }, + { + "epoch": 2.106972327726533, + "grad_norm": 5.8203582763671875, + "learning_rate": 4.980539947923805e-06, + "loss": 0.185, + "step": 37378 + }, + { + "epoch": 2.1069858925664677, + "grad_norm": 5.476778030395508, + "learning_rate": 4.98040290530355e-06, + "loss": 0.2574, + "step": 37379 + }, + { + "epoch": 2.1069994574064026, + "grad_norm": 6.856842994689941, + "learning_rate": 4.980265862683295e-06, + "loss": 0.3416, + "step": 37380 + }, + { + "epoch": 2.1070130222463375, + "grad_norm": 5.893267631530762, + "learning_rate": 4.9801288200630395e-06, + "loss": 0.199, + "step": 37381 + }, + { + "epoch": 2.1070265870862723, + "grad_norm": 4.572781562805176, + "learning_rate": 4.9799917774427855e-06, + "loss": 0.1555, + "step": 37382 + }, + { + "epoch": 2.107040151926207, + "grad_norm": 5.609434604644775, + "learning_rate": 4.97985473482253e-06, + "loss": 0.1613, + "step": 37383 + }, + { + "epoch": 2.107053716766142, + "grad_norm": 6.06780481338501, + "learning_rate": 4.979717692202276e-06, + "loss": 0.2893, + "step": 37384 + }, + { + "epoch": 2.107067281606077, + "grad_norm": 5.587108612060547, + "learning_rate": 4.97958064958202e-06, + "loss": 0.2904, + "step": 37385 + }, + { + "epoch": 2.107080846446012, + "grad_norm": 6.111056804656982, + "learning_rate": 4.979443606961765e-06, + "loss": 0.1822, + "step": 37386 + }, + { + "epoch": 2.1070944112859467, + "grad_norm": 4.248708248138428, + "learning_rate": 4.9793065643415106e-06, + "loss": 0.1615, + "step": 37387 + }, + { + "epoch": 2.1071079761258815, + "grad_norm": 7.50846004486084, + "learning_rate": 4.979169521721256e-06, + "loss": 0.2809, + "step": 37388 + }, + { + "epoch": 2.1071215409658164, + "grad_norm": 5.787415504455566, + "learning_rate": 4.979032479101001e-06, + "loss": 0.2337, + "step": 37389 + }, + { + "epoch": 2.1071351058057517, + "grad_norm": 5.60810661315918, + "learning_rate": 4.978895436480746e-06, + "loss": 0.2035, + "step": 37390 + }, + { + "epoch": 2.1071486706456866, + "grad_norm": 4.565834045410156, + "learning_rate": 4.978758393860491e-06, + "loss": 0.1732, + "step": 37391 + }, + { + "epoch": 2.1071622354856214, + "grad_norm": 4.966722011566162, + "learning_rate": 4.978621351240236e-06, + "loss": 0.1682, + "step": 37392 + }, + { + "epoch": 2.1071758003255563, + "grad_norm": 4.612664222717285, + "learning_rate": 4.978484308619982e-06, + "loss": 0.2113, + "step": 37393 + }, + { + "epoch": 2.107189365165491, + "grad_norm": 6.6631340980529785, + "learning_rate": 4.978347265999726e-06, + "loss": 0.3697, + "step": 37394 + }, + { + "epoch": 2.107202930005426, + "grad_norm": 7.272415637969971, + "learning_rate": 4.978210223379472e-06, + "loss": 0.2636, + "step": 37395 + }, + { + "epoch": 2.107216494845361, + "grad_norm": 5.919912338256836, + "learning_rate": 4.978073180759216e-06, + "loss": 0.2367, + "step": 37396 + }, + { + "epoch": 2.1072300596852958, + "grad_norm": 7.594437122344971, + "learning_rate": 4.9779361381389615e-06, + "loss": 0.3388, + "step": 37397 + }, + { + "epoch": 2.1072436245252306, + "grad_norm": 5.44914436340332, + "learning_rate": 4.977799095518707e-06, + "loss": 0.1843, + "step": 37398 + }, + { + "epoch": 2.1072571893651655, + "grad_norm": 4.5736589431762695, + "learning_rate": 4.977662052898452e-06, + "loss": 0.1679, + "step": 37399 + }, + { + "epoch": 2.1072707542051003, + "grad_norm": 4.499279975891113, + "learning_rate": 4.977525010278197e-06, + "loss": 0.1816, + "step": 37400 + }, + { + "epoch": 2.107284319045035, + "grad_norm": 4.025246620178223, + "learning_rate": 4.977387967657942e-06, + "loss": 0.1253, + "step": 37401 + }, + { + "epoch": 2.10729788388497, + "grad_norm": 6.015604496002197, + "learning_rate": 4.977250925037687e-06, + "loss": 0.2561, + "step": 37402 + }, + { + "epoch": 2.107311448724905, + "grad_norm": 5.215950965881348, + "learning_rate": 4.977113882417432e-06, + "loss": 0.2429, + "step": 37403 + }, + { + "epoch": 2.10732501356484, + "grad_norm": 6.927908897399902, + "learning_rate": 4.976976839797178e-06, + "loss": 0.2398, + "step": 37404 + }, + { + "epoch": 2.1073385784047747, + "grad_norm": 5.728813171386719, + "learning_rate": 4.976839797176922e-06, + "loss": 0.2565, + "step": 37405 + }, + { + "epoch": 2.1073521432447095, + "grad_norm": 5.0588765144348145, + "learning_rate": 4.976702754556667e-06, + "loss": 0.1888, + "step": 37406 + }, + { + "epoch": 2.1073657080846444, + "grad_norm": 7.654257297515869, + "learning_rate": 4.9765657119364124e-06, + "loss": 0.3825, + "step": 37407 + }, + { + "epoch": 2.1073792729245797, + "grad_norm": 4.946113586425781, + "learning_rate": 4.976428669316158e-06, + "loss": 0.1751, + "step": 37408 + }, + { + "epoch": 2.1073928377645146, + "grad_norm": 4.869851589202881, + "learning_rate": 4.976291626695903e-06, + "loss": 0.1651, + "step": 37409 + }, + { + "epoch": 2.1074064026044494, + "grad_norm": 5.964179992675781, + "learning_rate": 4.976154584075648e-06, + "loss": 0.2289, + "step": 37410 + }, + { + "epoch": 2.1074199674443843, + "grad_norm": 5.351656436920166, + "learning_rate": 4.976017541455393e-06, + "loss": 0.2395, + "step": 37411 + }, + { + "epoch": 2.107433532284319, + "grad_norm": 5.258713245391846, + "learning_rate": 4.975880498835138e-06, + "loss": 0.2026, + "step": 37412 + }, + { + "epoch": 2.107447097124254, + "grad_norm": 5.163981914520264, + "learning_rate": 4.9757434562148835e-06, + "loss": 0.2736, + "step": 37413 + }, + { + "epoch": 2.107460661964189, + "grad_norm": 6.3115553855896, + "learning_rate": 4.975606413594629e-06, + "loss": 0.29, + "step": 37414 + }, + { + "epoch": 2.1074742268041238, + "grad_norm": 4.754356861114502, + "learning_rate": 4.975469370974373e-06, + "loss": 0.1948, + "step": 37415 + }, + { + "epoch": 2.1074877916440586, + "grad_norm": 6.506290435791016, + "learning_rate": 4.975332328354118e-06, + "loss": 0.3105, + "step": 37416 + }, + { + "epoch": 2.1075013564839935, + "grad_norm": 6.913153171539307, + "learning_rate": 4.975195285733863e-06, + "loss": 0.3137, + "step": 37417 + }, + { + "epoch": 2.1075149213239284, + "grad_norm": 4.8443603515625, + "learning_rate": 4.9750582431136086e-06, + "loss": 0.1685, + "step": 37418 + }, + { + "epoch": 2.1075284861638632, + "grad_norm": 6.224743843078613, + "learning_rate": 4.974921200493354e-06, + "loss": 0.2107, + "step": 37419 + }, + { + "epoch": 2.107542051003798, + "grad_norm": 5.058409214019775, + "learning_rate": 4.974784157873099e-06, + "loss": 0.1684, + "step": 37420 + }, + { + "epoch": 2.107555615843733, + "grad_norm": 3.874009370803833, + "learning_rate": 4.974647115252844e-06, + "loss": 0.1628, + "step": 37421 + }, + { + "epoch": 2.107569180683668, + "grad_norm": 4.969799518585205, + "learning_rate": 4.974510072632589e-06, + "loss": 0.2656, + "step": 37422 + }, + { + "epoch": 2.1075827455236027, + "grad_norm": 3.3501365184783936, + "learning_rate": 4.9743730300123345e-06, + "loss": 0.0816, + "step": 37423 + }, + { + "epoch": 2.1075963103635376, + "grad_norm": 6.3407301902771, + "learning_rate": 4.97423598739208e-06, + "loss": 0.2546, + "step": 37424 + }, + { + "epoch": 2.1076098752034724, + "grad_norm": 4.058142185211182, + "learning_rate": 4.974098944771825e-06, + "loss": 0.1799, + "step": 37425 + }, + { + "epoch": 2.1076234400434073, + "grad_norm": 4.2136712074279785, + "learning_rate": 4.973961902151569e-06, + "loss": 0.1421, + "step": 37426 + }, + { + "epoch": 2.107637004883342, + "grad_norm": 5.434636116027832, + "learning_rate": 4.973824859531315e-06, + "loss": 0.1089, + "step": 37427 + }, + { + "epoch": 2.1076505697232775, + "grad_norm": 5.614259243011475, + "learning_rate": 4.9736878169110595e-06, + "loss": 0.2743, + "step": 37428 + }, + { + "epoch": 2.1076641345632123, + "grad_norm": 6.1045756340026855, + "learning_rate": 4.973550774290805e-06, + "loss": 0.1899, + "step": 37429 + }, + { + "epoch": 2.107677699403147, + "grad_norm": 3.3248090744018555, + "learning_rate": 4.97341373167055e-06, + "loss": 0.1168, + "step": 37430 + }, + { + "epoch": 2.107691264243082, + "grad_norm": 4.891628742218018, + "learning_rate": 4.973276689050295e-06, + "loss": 0.1821, + "step": 37431 + }, + { + "epoch": 2.107704829083017, + "grad_norm": 4.025465488433838, + "learning_rate": 4.97313964643004e-06, + "loss": 0.1865, + "step": 37432 + }, + { + "epoch": 2.107718393922952, + "grad_norm": 4.406876564025879, + "learning_rate": 4.973002603809785e-06, + "loss": 0.1605, + "step": 37433 + }, + { + "epoch": 2.1077319587628867, + "grad_norm": 5.152411937713623, + "learning_rate": 4.972865561189531e-06, + "loss": 0.1336, + "step": 37434 + }, + { + "epoch": 2.1077455236028215, + "grad_norm": 4.4397382736206055, + "learning_rate": 4.972728518569275e-06, + "loss": 0.2273, + "step": 37435 + }, + { + "epoch": 2.1077590884427564, + "grad_norm": 4.77239990234375, + "learning_rate": 4.972591475949021e-06, + "loss": 0.125, + "step": 37436 + }, + { + "epoch": 2.1077726532826913, + "grad_norm": 3.887213945388794, + "learning_rate": 4.972454433328765e-06, + "loss": 0.1113, + "step": 37437 + }, + { + "epoch": 2.107786218122626, + "grad_norm": 4.773576259613037, + "learning_rate": 4.972317390708511e-06, + "loss": 0.1732, + "step": 37438 + }, + { + "epoch": 2.107799782962561, + "grad_norm": 4.213334083557129, + "learning_rate": 4.972180348088256e-06, + "loss": 0.2282, + "step": 37439 + }, + { + "epoch": 2.107813347802496, + "grad_norm": 2.599804401397705, + "learning_rate": 4.972043305468001e-06, + "loss": 0.0837, + "step": 37440 + }, + { + "epoch": 2.1078269126424307, + "grad_norm": 4.47505521774292, + "learning_rate": 4.971906262847746e-06, + "loss": 0.2025, + "step": 37441 + }, + { + "epoch": 2.1078404774823656, + "grad_norm": 4.661273956298828, + "learning_rate": 4.971769220227491e-06, + "loss": 0.2056, + "step": 37442 + }, + { + "epoch": 2.1078540423223004, + "grad_norm": 4.125754356384277, + "learning_rate": 4.971632177607236e-06, + "loss": 0.1695, + "step": 37443 + }, + { + "epoch": 2.1078676071622353, + "grad_norm": 3.5423061847686768, + "learning_rate": 4.9714951349869815e-06, + "loss": 0.132, + "step": 37444 + }, + { + "epoch": 2.10788117200217, + "grad_norm": 5.531236171722412, + "learning_rate": 4.971358092366727e-06, + "loss": 0.1648, + "step": 37445 + }, + { + "epoch": 2.1078947368421055, + "grad_norm": 5.024869918823242, + "learning_rate": 4.971221049746471e-06, + "loss": 0.1327, + "step": 37446 + }, + { + "epoch": 2.1079083016820404, + "grad_norm": 4.805945873260498, + "learning_rate": 4.971084007126217e-06, + "loss": 0.2682, + "step": 37447 + }, + { + "epoch": 2.107921866521975, + "grad_norm": 3.048600196838379, + "learning_rate": 4.970946964505961e-06, + "loss": 0.0849, + "step": 37448 + }, + { + "epoch": 2.10793543136191, + "grad_norm": 4.750275611877441, + "learning_rate": 4.9708099218857066e-06, + "loss": 0.1086, + "step": 37449 + }, + { + "epoch": 2.107948996201845, + "grad_norm": 4.119388103485107, + "learning_rate": 4.970672879265452e-06, + "loss": 0.1122, + "step": 37450 + }, + { + "epoch": 2.10796256104178, + "grad_norm": 3.8488922119140625, + "learning_rate": 4.970535836645197e-06, + "loss": 0.13, + "step": 37451 + }, + { + "epoch": 2.1079761258817147, + "grad_norm": 4.926059246063232, + "learning_rate": 4.970398794024942e-06, + "loss": 0.2083, + "step": 37452 + }, + { + "epoch": 2.1079896907216495, + "grad_norm": 3.8343687057495117, + "learning_rate": 4.970261751404687e-06, + "loss": 0.1624, + "step": 37453 + }, + { + "epoch": 2.1080032555615844, + "grad_norm": 4.545895099639893, + "learning_rate": 4.9701247087844325e-06, + "loss": 0.304, + "step": 37454 + }, + { + "epoch": 2.1080168204015193, + "grad_norm": 5.337911128997803, + "learning_rate": 4.969987666164178e-06, + "loss": 0.259, + "step": 37455 + }, + { + "epoch": 2.108030385241454, + "grad_norm": 4.781266212463379, + "learning_rate": 4.969850623543923e-06, + "loss": 0.2937, + "step": 37456 + }, + { + "epoch": 2.108043950081389, + "grad_norm": 4.179374694824219, + "learning_rate": 4.969713580923667e-06, + "loss": 0.1996, + "step": 37457 + }, + { + "epoch": 2.108057514921324, + "grad_norm": 4.193159580230713, + "learning_rate": 4.969576538303413e-06, + "loss": 0.1763, + "step": 37458 + }, + { + "epoch": 2.1080710797612587, + "grad_norm": 5.146814823150635, + "learning_rate": 4.9694394956831575e-06, + "loss": 0.184, + "step": 37459 + }, + { + "epoch": 2.1080846446011936, + "grad_norm": 3.8145487308502197, + "learning_rate": 4.969302453062903e-06, + "loss": 0.1996, + "step": 37460 + }, + { + "epoch": 2.1080982094411285, + "grad_norm": 5.71408224105835, + "learning_rate": 4.969165410442648e-06, + "loss": 0.1563, + "step": 37461 + }, + { + "epoch": 2.1081117742810633, + "grad_norm": 4.0837202072143555, + "learning_rate": 4.969028367822393e-06, + "loss": 0.1668, + "step": 37462 + }, + { + "epoch": 2.108125339120998, + "grad_norm": 3.6630942821502686, + "learning_rate": 4.968891325202138e-06, + "loss": 0.1123, + "step": 37463 + }, + { + "epoch": 2.108138903960933, + "grad_norm": 3.766641616821289, + "learning_rate": 4.968754282581883e-06, + "loss": 0.1957, + "step": 37464 + }, + { + "epoch": 2.108152468800868, + "grad_norm": 4.6258544921875, + "learning_rate": 4.968617239961629e-06, + "loss": 0.1543, + "step": 37465 + }, + { + "epoch": 2.1081660336408032, + "grad_norm": 4.476648330688477, + "learning_rate": 4.968480197341374e-06, + "loss": 0.1441, + "step": 37466 + }, + { + "epoch": 2.108179598480738, + "grad_norm": 5.1674418449401855, + "learning_rate": 4.968343154721119e-06, + "loss": 0.201, + "step": 37467 + }, + { + "epoch": 2.108193163320673, + "grad_norm": 9.002358436584473, + "learning_rate": 4.968206112100864e-06, + "loss": 0.283, + "step": 37468 + }, + { + "epoch": 2.108206728160608, + "grad_norm": 3.20100474357605, + "learning_rate": 4.9680690694806085e-06, + "loss": 0.1083, + "step": 37469 + }, + { + "epoch": 2.1082202930005427, + "grad_norm": 5.425304412841797, + "learning_rate": 4.9679320268603545e-06, + "loss": 0.2002, + "step": 37470 + }, + { + "epoch": 2.1082338578404776, + "grad_norm": 4.441858768463135, + "learning_rate": 4.967794984240099e-06, + "loss": 0.1596, + "step": 37471 + }, + { + "epoch": 2.1082474226804124, + "grad_norm": 4.5028228759765625, + "learning_rate": 4.967657941619844e-06, + "loss": 0.1674, + "step": 37472 + }, + { + "epoch": 2.1082609875203473, + "grad_norm": 6.123804092407227, + "learning_rate": 4.967520898999589e-06, + "loss": 0.1972, + "step": 37473 + }, + { + "epoch": 2.108274552360282, + "grad_norm": 5.414285182952881, + "learning_rate": 4.967383856379334e-06, + "loss": 0.3166, + "step": 37474 + }, + { + "epoch": 2.108288117200217, + "grad_norm": 5.455643653869629, + "learning_rate": 4.9672468137590795e-06, + "loss": 0.179, + "step": 37475 + }, + { + "epoch": 2.108301682040152, + "grad_norm": 5.555728912353516, + "learning_rate": 4.967109771138825e-06, + "loss": 0.344, + "step": 37476 + }, + { + "epoch": 2.1083152468800868, + "grad_norm": 4.8552093505859375, + "learning_rate": 4.96697272851857e-06, + "loss": 0.2056, + "step": 37477 + }, + { + "epoch": 2.1083288117200216, + "grad_norm": 4.893070220947266, + "learning_rate": 4.966835685898315e-06, + "loss": 0.3025, + "step": 37478 + }, + { + "epoch": 2.1083423765599565, + "grad_norm": 5.908266544342041, + "learning_rate": 4.96669864327806e-06, + "loss": 0.1963, + "step": 37479 + }, + { + "epoch": 2.1083559413998914, + "grad_norm": 5.086868762969971, + "learning_rate": 4.9665616006578046e-06, + "loss": 0.1444, + "step": 37480 + }, + { + "epoch": 2.108369506239826, + "grad_norm": 5.662474155426025, + "learning_rate": 4.966424558037551e-06, + "loss": 0.2196, + "step": 37481 + }, + { + "epoch": 2.108383071079761, + "grad_norm": 4.347929000854492, + "learning_rate": 4.966287515417295e-06, + "loss": 0.2493, + "step": 37482 + }, + { + "epoch": 2.108396635919696, + "grad_norm": 5.106480121612549, + "learning_rate": 4.966150472797041e-06, + "loss": 0.2915, + "step": 37483 + }, + { + "epoch": 2.1084102007596313, + "grad_norm": 4.443521022796631, + "learning_rate": 4.966013430176785e-06, + "loss": 0.1858, + "step": 37484 + }, + { + "epoch": 2.108423765599566, + "grad_norm": 5.195148944854736, + "learning_rate": 4.9658763875565305e-06, + "loss": 0.179, + "step": 37485 + }, + { + "epoch": 2.108437330439501, + "grad_norm": 5.238844871520996, + "learning_rate": 4.965739344936276e-06, + "loss": 0.3146, + "step": 37486 + }, + { + "epoch": 2.108450895279436, + "grad_norm": 6.539799690246582, + "learning_rate": 4.965602302316021e-06, + "loss": 0.2076, + "step": 37487 + }, + { + "epoch": 2.1084644601193707, + "grad_norm": 4.448632717132568, + "learning_rate": 4.965465259695766e-06, + "loss": 0.1518, + "step": 37488 + }, + { + "epoch": 2.1084780249593056, + "grad_norm": 6.54038143157959, + "learning_rate": 4.96532821707551e-06, + "loss": 0.2353, + "step": 37489 + }, + { + "epoch": 2.1084915897992405, + "grad_norm": 4.941913604736328, + "learning_rate": 4.965191174455256e-06, + "loss": 0.1016, + "step": 37490 + }, + { + "epoch": 2.1085051546391753, + "grad_norm": 3.6441667079925537, + "learning_rate": 4.965054131835001e-06, + "loss": 0.1365, + "step": 37491 + }, + { + "epoch": 2.10851871947911, + "grad_norm": 3.8226113319396973, + "learning_rate": 4.964917089214747e-06, + "loss": 0.1643, + "step": 37492 + }, + { + "epoch": 2.108532284319045, + "grad_norm": 4.220802307128906, + "learning_rate": 4.964780046594491e-06, + "loss": 0.2155, + "step": 37493 + }, + { + "epoch": 2.10854584915898, + "grad_norm": 5.651036739349365, + "learning_rate": 4.964643003974236e-06, + "loss": 0.2124, + "step": 37494 + }, + { + "epoch": 2.1085594139989148, + "grad_norm": 4.634835243225098, + "learning_rate": 4.964505961353981e-06, + "loss": 0.2496, + "step": 37495 + }, + { + "epoch": 2.1085729788388496, + "grad_norm": 5.147171497344971, + "learning_rate": 4.964368918733727e-06, + "loss": 0.1309, + "step": 37496 + }, + { + "epoch": 2.1085865436787845, + "grad_norm": 4.412699222564697, + "learning_rate": 4.964231876113472e-06, + "loss": 0.1827, + "step": 37497 + }, + { + "epoch": 2.1086001085187194, + "grad_norm": 5.538992881774902, + "learning_rate": 4.964094833493217e-06, + "loss": 0.2332, + "step": 37498 + }, + { + "epoch": 2.1086136733586542, + "grad_norm": 5.809177398681641, + "learning_rate": 4.963957790872962e-06, + "loss": 0.2253, + "step": 37499 + }, + { + "epoch": 2.108627238198589, + "grad_norm": 5.610894203186035, + "learning_rate": 4.9638207482527065e-06, + "loss": 0.214, + "step": 37500 + }, + { + "epoch": 2.108640803038524, + "grad_norm": 4.927340507507324, + "learning_rate": 4.9636837056324525e-06, + "loss": 0.111, + "step": 37501 + }, + { + "epoch": 2.108654367878459, + "grad_norm": 5.01945686340332, + "learning_rate": 4.963546663012197e-06, + "loss": 0.1244, + "step": 37502 + }, + { + "epoch": 2.1086679327183937, + "grad_norm": 4.8362579345703125, + "learning_rate": 4.963409620391942e-06, + "loss": 0.1809, + "step": 37503 + }, + { + "epoch": 2.108681497558329, + "grad_norm": 5.7761945724487305, + "learning_rate": 4.963272577771687e-06, + "loss": 0.2529, + "step": 37504 + }, + { + "epoch": 2.108695062398264, + "grad_norm": 4.1917195320129395, + "learning_rate": 4.963135535151432e-06, + "loss": 0.1526, + "step": 37505 + }, + { + "epoch": 2.1087086272381987, + "grad_norm": 4.070080280303955, + "learning_rate": 4.9629984925311775e-06, + "loss": 0.1206, + "step": 37506 + }, + { + "epoch": 2.1087221920781336, + "grad_norm": 4.032481670379639, + "learning_rate": 4.962861449910923e-06, + "loss": 0.1031, + "step": 37507 + }, + { + "epoch": 2.1087357569180685, + "grad_norm": 5.8328657150268555, + "learning_rate": 4.962724407290668e-06, + "loss": 0.2506, + "step": 37508 + }, + { + "epoch": 2.1087493217580033, + "grad_norm": 3.3653440475463867, + "learning_rate": 4.962587364670413e-06, + "loss": 0.1138, + "step": 37509 + }, + { + "epoch": 2.108762886597938, + "grad_norm": 3.890238046646118, + "learning_rate": 4.962450322050158e-06, + "loss": 0.1368, + "step": 37510 + }, + { + "epoch": 2.108776451437873, + "grad_norm": 6.552274227142334, + "learning_rate": 4.9623132794299034e-06, + "loss": 0.1604, + "step": 37511 + }, + { + "epoch": 2.108790016277808, + "grad_norm": 5.140017032623291, + "learning_rate": 4.962176236809649e-06, + "loss": 0.1899, + "step": 37512 + }, + { + "epoch": 2.108803581117743, + "grad_norm": 5.081916332244873, + "learning_rate": 4.962039194189393e-06, + "loss": 0.2531, + "step": 37513 + }, + { + "epoch": 2.1088171459576777, + "grad_norm": 7.059625148773193, + "learning_rate": 4.961902151569138e-06, + "loss": 0.3105, + "step": 37514 + }, + { + "epoch": 2.1088307107976125, + "grad_norm": 4.303487300872803, + "learning_rate": 4.961765108948883e-06, + "loss": 0.1388, + "step": 37515 + }, + { + "epoch": 2.1088442756375474, + "grad_norm": 5.186850070953369, + "learning_rate": 4.9616280663286285e-06, + "loss": 0.1624, + "step": 37516 + }, + { + "epoch": 2.1088578404774823, + "grad_norm": 4.326850414276123, + "learning_rate": 4.961491023708374e-06, + "loss": 0.1346, + "step": 37517 + }, + { + "epoch": 2.108871405317417, + "grad_norm": 4.531720161437988, + "learning_rate": 4.961353981088119e-06, + "loss": 0.1441, + "step": 37518 + }, + { + "epoch": 2.108884970157352, + "grad_norm": 5.442174434661865, + "learning_rate": 4.961216938467864e-06, + "loss": 0.2635, + "step": 37519 + }, + { + "epoch": 2.108898534997287, + "grad_norm": 4.042089939117432, + "learning_rate": 4.961079895847609e-06, + "loss": 0.1512, + "step": 37520 + }, + { + "epoch": 2.1089120998372217, + "grad_norm": 3.801222085952759, + "learning_rate": 4.960942853227354e-06, + "loss": 0.1535, + "step": 37521 + }, + { + "epoch": 2.108925664677157, + "grad_norm": 5.038852214813232, + "learning_rate": 4.9608058106070995e-06, + "loss": 0.1684, + "step": 37522 + }, + { + "epoch": 2.108939229517092, + "grad_norm": 7.13576078414917, + "learning_rate": 4.960668767986844e-06, + "loss": 0.304, + "step": 37523 + }, + { + "epoch": 2.1089527943570268, + "grad_norm": 7.069732189178467, + "learning_rate": 4.96053172536659e-06, + "loss": 0.2023, + "step": 37524 + }, + { + "epoch": 2.1089663591969616, + "grad_norm": 5.0657501220703125, + "learning_rate": 4.960394682746334e-06, + "loss": 0.2073, + "step": 37525 + }, + { + "epoch": 2.1089799240368965, + "grad_norm": 5.0322184562683105, + "learning_rate": 4.960257640126079e-06, + "loss": 0.1133, + "step": 37526 + }, + { + "epoch": 2.1089934888768314, + "grad_norm": 4.111887454986572, + "learning_rate": 4.960120597505825e-06, + "loss": 0.091, + "step": 37527 + }, + { + "epoch": 2.1090070537167662, + "grad_norm": 8.388757705688477, + "learning_rate": 4.95998355488557e-06, + "loss": 0.2112, + "step": 37528 + }, + { + "epoch": 2.109020618556701, + "grad_norm": 3.8427629470825195, + "learning_rate": 4.959846512265315e-06, + "loss": 0.1085, + "step": 37529 + }, + { + "epoch": 2.109034183396636, + "grad_norm": 4.472696781158447, + "learning_rate": 4.95970946964506e-06, + "loss": 0.1075, + "step": 37530 + }, + { + "epoch": 2.109047748236571, + "grad_norm": 4.073357105255127, + "learning_rate": 4.959572427024805e-06, + "loss": 0.085, + "step": 37531 + }, + { + "epoch": 2.1090613130765057, + "grad_norm": 5.51647424697876, + "learning_rate": 4.9594353844045505e-06, + "loss": 0.186, + "step": 37532 + }, + { + "epoch": 2.1090748779164405, + "grad_norm": 4.608328342437744, + "learning_rate": 4.959298341784296e-06, + "loss": 0.1538, + "step": 37533 + }, + { + "epoch": 2.1090884427563754, + "grad_norm": 4.1309099197387695, + "learning_rate": 4.95916129916404e-06, + "loss": 0.1571, + "step": 37534 + }, + { + "epoch": 2.1091020075963103, + "grad_norm": 2.695528507232666, + "learning_rate": 4.959024256543786e-06, + "loss": 0.0673, + "step": 37535 + }, + { + "epoch": 2.109115572436245, + "grad_norm": 3.5576815605163574, + "learning_rate": 4.95888721392353e-06, + "loss": 0.0924, + "step": 37536 + }, + { + "epoch": 2.10912913727618, + "grad_norm": 3.6999456882476807, + "learning_rate": 4.958750171303276e-06, + "loss": 0.1185, + "step": 37537 + }, + { + "epoch": 2.109142702116115, + "grad_norm": 6.726930618286133, + "learning_rate": 4.958613128683021e-06, + "loss": 0.2934, + "step": 37538 + }, + { + "epoch": 2.1091562669560497, + "grad_norm": 4.030592918395996, + "learning_rate": 4.958476086062766e-06, + "loss": 0.2059, + "step": 37539 + }, + { + "epoch": 2.1091698317959846, + "grad_norm": 6.258602619171143, + "learning_rate": 4.958339043442511e-06, + "loss": 0.2422, + "step": 37540 + }, + { + "epoch": 2.1091833966359195, + "grad_norm": 3.3163228034973145, + "learning_rate": 4.958202000822256e-06, + "loss": 0.0692, + "step": 37541 + }, + { + "epoch": 2.109196961475855, + "grad_norm": 5.492267608642578, + "learning_rate": 4.9580649582020014e-06, + "loss": 0.237, + "step": 37542 + }, + { + "epoch": 2.1092105263157896, + "grad_norm": 3.976905584335327, + "learning_rate": 4.957927915581746e-06, + "loss": 0.1369, + "step": 37543 + }, + { + "epoch": 2.1092240911557245, + "grad_norm": 4.243357181549072, + "learning_rate": 4.957790872961492e-06, + "loss": 0.0798, + "step": 37544 + }, + { + "epoch": 2.1092376559956594, + "grad_norm": 4.630911827087402, + "learning_rate": 4.957653830341236e-06, + "loss": 0.1133, + "step": 37545 + }, + { + "epoch": 2.1092512208355942, + "grad_norm": 4.134326934814453, + "learning_rate": 4.957516787720982e-06, + "loss": 0.1177, + "step": 37546 + }, + { + "epoch": 2.109264785675529, + "grad_norm": 3.728797435760498, + "learning_rate": 4.9573797451007265e-06, + "loss": 0.0925, + "step": 37547 + }, + { + "epoch": 2.109278350515464, + "grad_norm": 4.36843729019165, + "learning_rate": 4.957242702480472e-06, + "loss": 0.0952, + "step": 37548 + }, + { + "epoch": 2.109291915355399, + "grad_norm": 5.488646030426025, + "learning_rate": 4.957105659860217e-06, + "loss": 0.1293, + "step": 37549 + }, + { + "epoch": 2.1093054801953337, + "grad_norm": 5.087798595428467, + "learning_rate": 4.956968617239962e-06, + "loss": 0.1554, + "step": 37550 + }, + { + "epoch": 2.1093190450352686, + "grad_norm": 5.031553745269775, + "learning_rate": 4.956831574619707e-06, + "loss": 0.1543, + "step": 37551 + }, + { + "epoch": 2.1093326098752034, + "grad_norm": 5.730477333068848, + "learning_rate": 4.956694531999452e-06, + "loss": 0.1683, + "step": 37552 + }, + { + "epoch": 2.1093461747151383, + "grad_norm": 5.126269340515137, + "learning_rate": 4.9565574893791976e-06, + "loss": 0.2332, + "step": 37553 + }, + { + "epoch": 2.109359739555073, + "grad_norm": 4.065330505371094, + "learning_rate": 4.956420446758942e-06, + "loss": 0.0885, + "step": 37554 + }, + { + "epoch": 2.109373304395008, + "grad_norm": 5.153535842895508, + "learning_rate": 4.956283404138688e-06, + "loss": 0.1651, + "step": 37555 + }, + { + "epoch": 2.109386869234943, + "grad_norm": 4.329144477844238, + "learning_rate": 4.956146361518432e-06, + "loss": 0.1539, + "step": 37556 + }, + { + "epoch": 2.1094004340748778, + "grad_norm": 5.990606784820557, + "learning_rate": 4.956009318898177e-06, + "loss": 0.1725, + "step": 37557 + }, + { + "epoch": 2.1094139989148126, + "grad_norm": 3.8815577030181885, + "learning_rate": 4.955872276277923e-06, + "loss": 0.147, + "step": 37558 + }, + { + "epoch": 2.1094275637547475, + "grad_norm": 5.906256675720215, + "learning_rate": 4.955735233657668e-06, + "loss": 0.2084, + "step": 37559 + }, + { + "epoch": 2.109441128594683, + "grad_norm": 4.0993804931640625, + "learning_rate": 4.955598191037413e-06, + "loss": 0.1254, + "step": 37560 + }, + { + "epoch": 2.1094546934346177, + "grad_norm": 4.383791923522949, + "learning_rate": 4.955461148417158e-06, + "loss": 0.0933, + "step": 37561 + }, + { + "epoch": 2.1094682582745525, + "grad_norm": 4.835148811340332, + "learning_rate": 4.955324105796903e-06, + "loss": 0.2488, + "step": 37562 + }, + { + "epoch": 2.1094818231144874, + "grad_norm": 5.174984455108643, + "learning_rate": 4.9551870631766485e-06, + "loss": 0.1892, + "step": 37563 + }, + { + "epoch": 2.1094953879544223, + "grad_norm": 3.6739909648895264, + "learning_rate": 4.955050020556394e-06, + "loss": 0.1322, + "step": 37564 + }, + { + "epoch": 2.109508952794357, + "grad_norm": 3.672560930252075, + "learning_rate": 4.954912977936139e-06, + "loss": 0.1347, + "step": 37565 + }, + { + "epoch": 2.109522517634292, + "grad_norm": 5.321358680725098, + "learning_rate": 4.954775935315884e-06, + "loss": 0.1541, + "step": 37566 + }, + { + "epoch": 2.109536082474227, + "grad_norm": 5.417652606964111, + "learning_rate": 4.954638892695628e-06, + "loss": 0.1887, + "step": 37567 + }, + { + "epoch": 2.1095496473141617, + "grad_norm": 5.746029853820801, + "learning_rate": 4.9545018500753735e-06, + "loss": 0.2371, + "step": 37568 + }, + { + "epoch": 2.1095632121540966, + "grad_norm": 4.581528663635254, + "learning_rate": 4.954364807455119e-06, + "loss": 0.1691, + "step": 37569 + }, + { + "epoch": 2.1095767769940315, + "grad_norm": 4.406888484954834, + "learning_rate": 4.954227764834864e-06, + "loss": 0.1706, + "step": 37570 + }, + { + "epoch": 2.1095903418339663, + "grad_norm": 5.588423252105713, + "learning_rate": 4.954090722214609e-06, + "loss": 0.2047, + "step": 37571 + }, + { + "epoch": 2.109603906673901, + "grad_norm": 4.79322624206543, + "learning_rate": 4.953953679594354e-06, + "loss": 0.1211, + "step": 37572 + }, + { + "epoch": 2.109617471513836, + "grad_norm": 4.036471366882324, + "learning_rate": 4.9538166369740994e-06, + "loss": 0.1509, + "step": 37573 + }, + { + "epoch": 2.109631036353771, + "grad_norm": 5.326806545257568, + "learning_rate": 4.953679594353845e-06, + "loss": 0.1952, + "step": 37574 + }, + { + "epoch": 2.109644601193706, + "grad_norm": 4.909534454345703, + "learning_rate": 4.95354255173359e-06, + "loss": 0.1412, + "step": 37575 + }, + { + "epoch": 2.1096581660336406, + "grad_norm": 6.270799160003662, + "learning_rate": 4.953405509113335e-06, + "loss": 0.1363, + "step": 37576 + }, + { + "epoch": 2.1096717308735755, + "grad_norm": 6.605591297149658, + "learning_rate": 4.953268466493079e-06, + "loss": 0.2042, + "step": 37577 + }, + { + "epoch": 2.1096852957135104, + "grad_norm": 6.215043067932129, + "learning_rate": 4.953131423872825e-06, + "loss": 0.1821, + "step": 37578 + }, + { + "epoch": 2.1096988605534452, + "grad_norm": 4.454365253448486, + "learning_rate": 4.95299438125257e-06, + "loss": 0.1164, + "step": 37579 + }, + { + "epoch": 2.1097124253933806, + "grad_norm": 3.5424282550811768, + "learning_rate": 4.952857338632316e-06, + "loss": 0.0887, + "step": 37580 + }, + { + "epoch": 2.1097259902333154, + "grad_norm": 6.128200531005859, + "learning_rate": 4.95272029601206e-06, + "loss": 0.2122, + "step": 37581 + }, + { + "epoch": 2.1097395550732503, + "grad_norm": 5.015523910522461, + "learning_rate": 4.952583253391805e-06, + "loss": 0.2507, + "step": 37582 + }, + { + "epoch": 2.109753119913185, + "grad_norm": 4.234519958496094, + "learning_rate": 4.95244621077155e-06, + "loss": 0.1255, + "step": 37583 + }, + { + "epoch": 2.10976668475312, + "grad_norm": 3.6799609661102295, + "learning_rate": 4.9523091681512956e-06, + "loss": 0.1095, + "step": 37584 + }, + { + "epoch": 2.109780249593055, + "grad_norm": 5.419877529144287, + "learning_rate": 4.952172125531041e-06, + "loss": 0.2899, + "step": 37585 + }, + { + "epoch": 2.1097938144329897, + "grad_norm": 6.071089744567871, + "learning_rate": 4.952035082910786e-06, + "loss": 0.1904, + "step": 37586 + }, + { + "epoch": 2.1098073792729246, + "grad_norm": 4.251120090484619, + "learning_rate": 4.951898040290531e-06, + "loss": 0.0997, + "step": 37587 + }, + { + "epoch": 2.1098209441128595, + "grad_norm": 6.901736736297607, + "learning_rate": 4.9517609976702754e-06, + "loss": 0.1882, + "step": 37588 + }, + { + "epoch": 2.1098345089527943, + "grad_norm": 4.045631408691406, + "learning_rate": 4.9516239550500214e-06, + "loss": 0.1489, + "step": 37589 + }, + { + "epoch": 2.109848073792729, + "grad_norm": 3.424626350402832, + "learning_rate": 4.951486912429766e-06, + "loss": 0.1088, + "step": 37590 + }, + { + "epoch": 2.109861638632664, + "grad_norm": 5.274782657623291, + "learning_rate": 4.951349869809511e-06, + "loss": 0.1744, + "step": 37591 + }, + { + "epoch": 2.109875203472599, + "grad_norm": 4.455362319946289, + "learning_rate": 4.951212827189256e-06, + "loss": 0.1623, + "step": 37592 + }, + { + "epoch": 2.109888768312534, + "grad_norm": 6.911591529846191, + "learning_rate": 4.951075784569001e-06, + "loss": 0.2413, + "step": 37593 + }, + { + "epoch": 2.1099023331524687, + "grad_norm": 6.524144649505615, + "learning_rate": 4.9509387419487465e-06, + "loss": 0.297, + "step": 37594 + }, + { + "epoch": 2.1099158979924035, + "grad_norm": 6.7433881759643555, + "learning_rate": 4.950801699328492e-06, + "loss": 0.2972, + "step": 37595 + }, + { + "epoch": 2.1099294628323384, + "grad_norm": 4.395590782165527, + "learning_rate": 4.950664656708237e-06, + "loss": 0.2208, + "step": 37596 + }, + { + "epoch": 2.1099430276722733, + "grad_norm": 6.635767459869385, + "learning_rate": 4.950527614087981e-06, + "loss": 0.2365, + "step": 37597 + }, + { + "epoch": 2.1099565925122086, + "grad_norm": 6.265383720397949, + "learning_rate": 4.950390571467727e-06, + "loss": 0.2176, + "step": 37598 + }, + { + "epoch": 2.1099701573521434, + "grad_norm": 5.005163192749023, + "learning_rate": 4.9502535288474715e-06, + "loss": 0.2728, + "step": 37599 + }, + { + "epoch": 2.1099837221920783, + "grad_norm": 6.397558212280273, + "learning_rate": 4.9501164862272176e-06, + "loss": 0.1737, + "step": 37600 + }, + { + "epoch": 2.109997287032013, + "grad_norm": 7.002110958099365, + "learning_rate": 4.949979443606962e-06, + "loss": 0.2377, + "step": 37601 + }, + { + "epoch": 2.110010851871948, + "grad_norm": 4.717994689941406, + "learning_rate": 4.949842400986707e-06, + "loss": 0.1802, + "step": 37602 + }, + { + "epoch": 2.110024416711883, + "grad_norm": 6.518162727355957, + "learning_rate": 4.949705358366452e-06, + "loss": 0.2806, + "step": 37603 + }, + { + "epoch": 2.1100379815518178, + "grad_norm": 4.641628265380859, + "learning_rate": 4.9495683157461974e-06, + "loss": 0.1785, + "step": 37604 + }, + { + "epoch": 2.1100515463917526, + "grad_norm": 6.308604717254639, + "learning_rate": 4.949431273125943e-06, + "loss": 0.3311, + "step": 37605 + }, + { + "epoch": 2.1100651112316875, + "grad_norm": 5.4130539894104, + "learning_rate": 4.949294230505688e-06, + "loss": 0.2379, + "step": 37606 + }, + { + "epoch": 2.1100786760716224, + "grad_norm": 5.882821559906006, + "learning_rate": 4.949157187885433e-06, + "loss": 0.3459, + "step": 37607 + }, + { + "epoch": 2.1100922409115572, + "grad_norm": 4.960631370544434, + "learning_rate": 4.949020145265177e-06, + "loss": 0.1627, + "step": 37608 + }, + { + "epoch": 2.110105805751492, + "grad_norm": 4.246293544769287, + "learning_rate": 4.948883102644923e-06, + "loss": 0.2858, + "step": 37609 + }, + { + "epoch": 2.110119370591427, + "grad_norm": 5.087319374084473, + "learning_rate": 4.948746060024668e-06, + "loss": 0.2544, + "step": 37610 + }, + { + "epoch": 2.110132935431362, + "grad_norm": 5.630465984344482, + "learning_rate": 4.948609017404413e-06, + "loss": 0.3118, + "step": 37611 + }, + { + "epoch": 2.1101465002712967, + "grad_norm": 3.4766523838043213, + "learning_rate": 4.948471974784158e-06, + "loss": 0.1959, + "step": 37612 + }, + { + "epoch": 2.1101600651112316, + "grad_norm": 5.882561206817627, + "learning_rate": 4.948334932163903e-06, + "loss": 0.2852, + "step": 37613 + }, + { + "epoch": 2.1101736299511664, + "grad_norm": 4.5567708015441895, + "learning_rate": 4.948197889543648e-06, + "loss": 0.1754, + "step": 37614 + }, + { + "epoch": 2.1101871947911013, + "grad_norm": 4.347555160522461, + "learning_rate": 4.9480608469233936e-06, + "loss": 0.1547, + "step": 37615 + }, + { + "epoch": 2.110200759631036, + "grad_norm": 3.8992297649383545, + "learning_rate": 4.947923804303139e-06, + "loss": 0.184, + "step": 37616 + }, + { + "epoch": 2.110214324470971, + "grad_norm": 5.703546524047852, + "learning_rate": 4.947786761682884e-06, + "loss": 0.2693, + "step": 37617 + }, + { + "epoch": 2.1102278893109063, + "grad_norm": 5.101001739501953, + "learning_rate": 4.947649719062629e-06, + "loss": 0.2661, + "step": 37618 + }, + { + "epoch": 2.110241454150841, + "grad_norm": 4.881816387176514, + "learning_rate": 4.947512676442374e-06, + "loss": 0.2133, + "step": 37619 + }, + { + "epoch": 2.110255018990776, + "grad_norm": 5.681136608123779, + "learning_rate": 4.9473756338221195e-06, + "loss": 0.4664, + "step": 37620 + }, + { + "epoch": 2.110268583830711, + "grad_norm": 9.249739646911621, + "learning_rate": 4.947238591201865e-06, + "loss": 0.2059, + "step": 37621 + }, + { + "epoch": 2.110282148670646, + "grad_norm": 5.20902681350708, + "learning_rate": 4.947101548581609e-06, + "loss": 0.2162, + "step": 37622 + }, + { + "epoch": 2.1102957135105807, + "grad_norm": 4.223804950714111, + "learning_rate": 4.946964505961354e-06, + "loss": 0.166, + "step": 37623 + }, + { + "epoch": 2.1103092783505155, + "grad_norm": 5.890469074249268, + "learning_rate": 4.946827463341099e-06, + "loss": 0.3, + "step": 37624 + }, + { + "epoch": 2.1103228431904504, + "grad_norm": 4.404855728149414, + "learning_rate": 4.9466904207208445e-06, + "loss": 0.1787, + "step": 37625 + }, + { + "epoch": 2.1103364080303852, + "grad_norm": 3.9114115238189697, + "learning_rate": 4.94655337810059e-06, + "loss": 0.1253, + "step": 37626 + }, + { + "epoch": 2.11034997287032, + "grad_norm": 4.025645732879639, + "learning_rate": 4.946416335480335e-06, + "loss": 0.1615, + "step": 37627 + }, + { + "epoch": 2.110363537710255, + "grad_norm": 5.1182780265808105, + "learning_rate": 4.94627929286008e-06, + "loss": 0.1921, + "step": 37628 + }, + { + "epoch": 2.11037710255019, + "grad_norm": 4.401126861572266, + "learning_rate": 4.946142250239825e-06, + "loss": 0.2038, + "step": 37629 + }, + { + "epoch": 2.1103906673901247, + "grad_norm": 4.592215061187744, + "learning_rate": 4.94600520761957e-06, + "loss": 0.203, + "step": 37630 + }, + { + "epoch": 2.1104042322300596, + "grad_norm": 7.176284313201904, + "learning_rate": 4.945868164999315e-06, + "loss": 0.2001, + "step": 37631 + }, + { + "epoch": 2.1104177970699944, + "grad_norm": 5.58917236328125, + "learning_rate": 4.945731122379061e-06, + "loss": 0.1703, + "step": 37632 + }, + { + "epoch": 2.1104313619099293, + "grad_norm": 5.489295959472656, + "learning_rate": 4.945594079758805e-06, + "loss": 0.1773, + "step": 37633 + }, + { + "epoch": 2.110444926749864, + "grad_norm": 4.333493709564209, + "learning_rate": 4.945457037138551e-06, + "loss": 0.1819, + "step": 37634 + }, + { + "epoch": 2.110458491589799, + "grad_norm": 5.223306179046631, + "learning_rate": 4.9453199945182954e-06, + "loss": 0.2467, + "step": 37635 + }, + { + "epoch": 2.1104720564297343, + "grad_norm": 4.112163543701172, + "learning_rate": 4.945182951898041e-06, + "loss": 0.146, + "step": 37636 + }, + { + "epoch": 2.110485621269669, + "grad_norm": 5.07403039932251, + "learning_rate": 4.945045909277786e-06, + "loss": 0.2921, + "step": 37637 + }, + { + "epoch": 2.110499186109604, + "grad_norm": 3.713792562484741, + "learning_rate": 4.944908866657531e-06, + "loss": 0.1212, + "step": 37638 + }, + { + "epoch": 2.110512750949539, + "grad_norm": 6.8120503425598145, + "learning_rate": 4.944771824037276e-06, + "loss": 0.319, + "step": 37639 + }, + { + "epoch": 2.110526315789474, + "grad_norm": 5.8921990394592285, + "learning_rate": 4.9446347814170205e-06, + "loss": 0.2108, + "step": 37640 + }, + { + "epoch": 2.1105398806294087, + "grad_norm": 4.089254856109619, + "learning_rate": 4.9444977387967665e-06, + "loss": 0.2756, + "step": 37641 + }, + { + "epoch": 2.1105534454693435, + "grad_norm": 6.60243034362793, + "learning_rate": 4.944360696176511e-06, + "loss": 0.3198, + "step": 37642 + }, + { + "epoch": 2.1105670103092784, + "grad_norm": 5.732228755950928, + "learning_rate": 4.944223653556257e-06, + "loss": 0.2753, + "step": 37643 + }, + { + "epoch": 2.1105805751492133, + "grad_norm": 5.00722599029541, + "learning_rate": 4.944086610936001e-06, + "loss": 0.2369, + "step": 37644 + }, + { + "epoch": 2.110594139989148, + "grad_norm": 5.391229629516602, + "learning_rate": 4.943949568315746e-06, + "loss": 0.2537, + "step": 37645 + }, + { + "epoch": 2.110607704829083, + "grad_norm": 5.522195816040039, + "learning_rate": 4.9438125256954916e-06, + "loss": 0.2346, + "step": 37646 + }, + { + "epoch": 2.110621269669018, + "grad_norm": 4.536427021026611, + "learning_rate": 4.943675483075237e-06, + "loss": 0.2083, + "step": 37647 + }, + { + "epoch": 2.1106348345089527, + "grad_norm": 6.961038112640381, + "learning_rate": 4.943538440454982e-06, + "loss": 0.2692, + "step": 37648 + }, + { + "epoch": 2.1106483993488876, + "grad_norm": 5.03800106048584, + "learning_rate": 4.943401397834727e-06, + "loss": 0.253, + "step": 37649 + }, + { + "epoch": 2.1106619641888225, + "grad_norm": 5.621884822845459, + "learning_rate": 4.943264355214472e-06, + "loss": 0.2751, + "step": 37650 + }, + { + "epoch": 2.1106755290287573, + "grad_norm": 4.806560516357422, + "learning_rate": 4.943127312594217e-06, + "loss": 0.1973, + "step": 37651 + }, + { + "epoch": 2.110689093868692, + "grad_norm": 3.9752044677734375, + "learning_rate": 4.942990269973963e-06, + "loss": 0.2253, + "step": 37652 + }, + { + "epoch": 2.110702658708627, + "grad_norm": 5.756505966186523, + "learning_rate": 4.942853227353707e-06, + "loss": 0.3332, + "step": 37653 + }, + { + "epoch": 2.110716223548562, + "grad_norm": 3.2526590824127197, + "learning_rate": 4.942716184733453e-06, + "loss": 0.13, + "step": 37654 + }, + { + "epoch": 2.1107297883884972, + "grad_norm": 4.16675329208374, + "learning_rate": 4.942579142113197e-06, + "loss": 0.1935, + "step": 37655 + }, + { + "epoch": 2.110743353228432, + "grad_norm": 4.480915069580078, + "learning_rate": 4.9424420994929425e-06, + "loss": 0.2768, + "step": 37656 + }, + { + "epoch": 2.110756918068367, + "grad_norm": 4.137725830078125, + "learning_rate": 4.942305056872688e-06, + "loss": 0.2479, + "step": 37657 + }, + { + "epoch": 2.110770482908302, + "grad_norm": 4.492196559906006, + "learning_rate": 4.942168014252433e-06, + "loss": 0.2211, + "step": 37658 + }, + { + "epoch": 2.1107840477482367, + "grad_norm": 6.234377384185791, + "learning_rate": 4.942030971632178e-06, + "loss": 0.204, + "step": 37659 + }, + { + "epoch": 2.1107976125881716, + "grad_norm": 4.964364528656006, + "learning_rate": 4.941893929011923e-06, + "loss": 0.1893, + "step": 37660 + }, + { + "epoch": 2.1108111774281064, + "grad_norm": 5.110334873199463, + "learning_rate": 4.941756886391668e-06, + "loss": 0.2038, + "step": 37661 + }, + { + "epoch": 2.1108247422680413, + "grad_norm": 6.063808441162109, + "learning_rate": 4.941619843771414e-06, + "loss": 0.2675, + "step": 37662 + }, + { + "epoch": 2.110838307107976, + "grad_norm": 4.883305549621582, + "learning_rate": 4.941482801151159e-06, + "loss": 0.189, + "step": 37663 + }, + { + "epoch": 2.110851871947911, + "grad_norm": 4.279688835144043, + "learning_rate": 4.941345758530903e-06, + "loss": 0.2149, + "step": 37664 + }, + { + "epoch": 2.110865436787846, + "grad_norm": 4.227415561676025, + "learning_rate": 4.941208715910648e-06, + "loss": 0.2004, + "step": 37665 + }, + { + "epoch": 2.1108790016277807, + "grad_norm": 3.502713203430176, + "learning_rate": 4.9410716732903934e-06, + "loss": 0.1869, + "step": 37666 + }, + { + "epoch": 2.1108925664677156, + "grad_norm": 5.661661148071289, + "learning_rate": 4.940934630670139e-06, + "loss": 0.2288, + "step": 37667 + }, + { + "epoch": 2.1109061313076505, + "grad_norm": 4.448362350463867, + "learning_rate": 4.940797588049884e-06, + "loss": 0.1667, + "step": 37668 + }, + { + "epoch": 2.1109196961475853, + "grad_norm": 4.824066638946533, + "learning_rate": 4.940660545429629e-06, + "loss": 0.2021, + "step": 37669 + }, + { + "epoch": 2.11093326098752, + "grad_norm": 6.26292610168457, + "learning_rate": 4.940523502809374e-06, + "loss": 0.3392, + "step": 37670 + }, + { + "epoch": 2.110946825827455, + "grad_norm": 4.2924580574035645, + "learning_rate": 4.940386460189119e-06, + "loss": 0.1902, + "step": 37671 + }, + { + "epoch": 2.11096039066739, + "grad_norm": 3.6060423851013184, + "learning_rate": 4.9402494175688645e-06, + "loss": 0.2099, + "step": 37672 + }, + { + "epoch": 2.110973955507325, + "grad_norm": 4.39959716796875, + "learning_rate": 4.94011237494861e-06, + "loss": 0.2312, + "step": 37673 + }, + { + "epoch": 2.11098752034726, + "grad_norm": 4.1344780921936035, + "learning_rate": 4.939975332328355e-06, + "loss": 0.1916, + "step": 37674 + }, + { + "epoch": 2.111001085187195, + "grad_norm": 4.666558265686035, + "learning_rate": 4.9398382897081e-06, + "loss": 0.1454, + "step": 37675 + }, + { + "epoch": 2.11101465002713, + "grad_norm": 4.004388809204102, + "learning_rate": 4.939701247087844e-06, + "loss": 0.1251, + "step": 37676 + }, + { + "epoch": 2.1110282148670647, + "grad_norm": 4.38934850692749, + "learning_rate": 4.9395642044675896e-06, + "loss": 0.1477, + "step": 37677 + }, + { + "epoch": 2.1110417797069996, + "grad_norm": 4.35979700088501, + "learning_rate": 4.939427161847335e-06, + "loss": 0.2683, + "step": 37678 + }, + { + "epoch": 2.1110553445469344, + "grad_norm": 6.237381458282471, + "learning_rate": 4.93929011922708e-06, + "loss": 0.1286, + "step": 37679 + }, + { + "epoch": 2.1110689093868693, + "grad_norm": 4.216734886169434, + "learning_rate": 4.939153076606825e-06, + "loss": 0.182, + "step": 37680 + }, + { + "epoch": 2.111082474226804, + "grad_norm": 4.7925262451171875, + "learning_rate": 4.93901603398657e-06, + "loss": 0.1851, + "step": 37681 + }, + { + "epoch": 2.111096039066739, + "grad_norm": 4.89057731628418, + "learning_rate": 4.9388789913663155e-06, + "loss": 0.1833, + "step": 37682 + }, + { + "epoch": 2.111109603906674, + "grad_norm": 5.2857489585876465, + "learning_rate": 4.938741948746061e-06, + "loss": 0.1969, + "step": 37683 + }, + { + "epoch": 2.1111231687466088, + "grad_norm": 4.939233303070068, + "learning_rate": 4.938604906125806e-06, + "loss": 0.2428, + "step": 37684 + }, + { + "epoch": 2.1111367335865436, + "grad_norm": 4.790319442749023, + "learning_rate": 4.93846786350555e-06, + "loss": 0.1737, + "step": 37685 + }, + { + "epoch": 2.1111502984264785, + "grad_norm": 5.752976894378662, + "learning_rate": 4.938330820885296e-06, + "loss": 0.2503, + "step": 37686 + }, + { + "epoch": 2.1111638632664134, + "grad_norm": 6.666606426239014, + "learning_rate": 4.9381937782650405e-06, + "loss": 0.4029, + "step": 37687 + }, + { + "epoch": 2.1111774281063482, + "grad_norm": 4.452284336090088, + "learning_rate": 4.9380567356447865e-06, + "loss": 0.2341, + "step": 37688 + }, + { + "epoch": 2.111190992946283, + "grad_norm": 6.284824848175049, + "learning_rate": 4.937919693024531e-06, + "loss": 0.284, + "step": 37689 + }, + { + "epoch": 2.111204557786218, + "grad_norm": 5.092901229858398, + "learning_rate": 4.937782650404276e-06, + "loss": 0.2566, + "step": 37690 + }, + { + "epoch": 2.111218122626153, + "grad_norm": 5.186883926391602, + "learning_rate": 4.937645607784021e-06, + "loss": 0.2457, + "step": 37691 + }, + { + "epoch": 2.1112316874660877, + "grad_norm": 5.037536144256592, + "learning_rate": 4.937508565163766e-06, + "loss": 0.2226, + "step": 37692 + }, + { + "epoch": 2.111245252306023, + "grad_norm": 5.13674783706665, + "learning_rate": 4.937371522543512e-06, + "loss": 0.2213, + "step": 37693 + }, + { + "epoch": 2.111258817145958, + "grad_norm": 4.563555717468262, + "learning_rate": 4.937234479923256e-06, + "loss": 0.2762, + "step": 37694 + }, + { + "epoch": 2.1112723819858927, + "grad_norm": 4.473300457000732, + "learning_rate": 4.937097437303002e-06, + "loss": 0.1825, + "step": 37695 + }, + { + "epoch": 2.1112859468258276, + "grad_norm": 4.464677333831787, + "learning_rate": 4.936960394682746e-06, + "loss": 0.1713, + "step": 37696 + }, + { + "epoch": 2.1112995116657625, + "grad_norm": 5.621427536010742, + "learning_rate": 4.936823352062492e-06, + "loss": 0.2821, + "step": 37697 + }, + { + "epoch": 2.1113130765056973, + "grad_norm": 5.650838851928711, + "learning_rate": 4.936686309442237e-06, + "loss": 0.1693, + "step": 37698 + }, + { + "epoch": 2.111326641345632, + "grad_norm": 5.4070539474487305, + "learning_rate": 4.936549266821982e-06, + "loss": 0.2163, + "step": 37699 + }, + { + "epoch": 2.111340206185567, + "grad_norm": 4.156387805938721, + "learning_rate": 4.936412224201727e-06, + "loss": 0.1544, + "step": 37700 + }, + { + "epoch": 2.111353771025502, + "grad_norm": 5.099246025085449, + "learning_rate": 4.936275181581472e-06, + "loss": 0.306, + "step": 37701 + }, + { + "epoch": 2.111367335865437, + "grad_norm": 4.510982036590576, + "learning_rate": 4.936138138961217e-06, + "loss": 0.1861, + "step": 37702 + }, + { + "epoch": 2.1113809007053717, + "grad_norm": 5.136892795562744, + "learning_rate": 4.9360010963409625e-06, + "loss": 0.1976, + "step": 37703 + }, + { + "epoch": 2.1113944655453065, + "grad_norm": 6.009306907653809, + "learning_rate": 4.935864053720708e-06, + "loss": 0.2655, + "step": 37704 + }, + { + "epoch": 2.1114080303852414, + "grad_norm": 4.353601455688477, + "learning_rate": 4.935727011100452e-06, + "loss": 0.0932, + "step": 37705 + }, + { + "epoch": 2.1114215952251763, + "grad_norm": 6.491856098175049, + "learning_rate": 4.935589968480198e-06, + "loss": 0.2889, + "step": 37706 + }, + { + "epoch": 2.111435160065111, + "grad_norm": 5.28895902633667, + "learning_rate": 4.935452925859942e-06, + "loss": 0.1708, + "step": 37707 + }, + { + "epoch": 2.111448724905046, + "grad_norm": 5.819134712219238, + "learning_rate": 4.935315883239688e-06, + "loss": 0.2055, + "step": 37708 + }, + { + "epoch": 2.111462289744981, + "grad_norm": 3.646719217300415, + "learning_rate": 4.935178840619433e-06, + "loss": 0.1397, + "step": 37709 + }, + { + "epoch": 2.1114758545849157, + "grad_norm": 5.845798969268799, + "learning_rate": 4.935041797999178e-06, + "loss": 0.2493, + "step": 37710 + }, + { + "epoch": 2.1114894194248506, + "grad_norm": 4.967955112457275, + "learning_rate": 4.934904755378923e-06, + "loss": 0.1278, + "step": 37711 + }, + { + "epoch": 2.111502984264786, + "grad_norm": 6.400629997253418, + "learning_rate": 4.934767712758668e-06, + "loss": 0.2315, + "step": 37712 + }, + { + "epoch": 2.1115165491047208, + "grad_norm": 4.891623497009277, + "learning_rate": 4.9346306701384135e-06, + "loss": 0.1907, + "step": 37713 + }, + { + "epoch": 2.1115301139446556, + "grad_norm": 4.8029327392578125, + "learning_rate": 4.934493627518159e-06, + "loss": 0.1455, + "step": 37714 + }, + { + "epoch": 2.1115436787845905, + "grad_norm": 4.596168518066406, + "learning_rate": 4.934356584897904e-06, + "loss": 0.1912, + "step": 37715 + }, + { + "epoch": 2.1115572436245253, + "grad_norm": 3.981907844543457, + "learning_rate": 4.934219542277649e-06, + "loss": 0.1189, + "step": 37716 + }, + { + "epoch": 2.11157080846446, + "grad_norm": 4.403814792633057, + "learning_rate": 4.934082499657394e-06, + "loss": 0.1116, + "step": 37717 + }, + { + "epoch": 2.111584373304395, + "grad_norm": 4.532073020935059, + "learning_rate": 4.933945457037139e-06, + "loss": 0.1468, + "step": 37718 + }, + { + "epoch": 2.11159793814433, + "grad_norm": 4.535727024078369, + "learning_rate": 4.933808414416884e-06, + "loss": 0.197, + "step": 37719 + }, + { + "epoch": 2.111611502984265, + "grad_norm": 5.7979888916015625, + "learning_rate": 4.933671371796629e-06, + "loss": 0.2215, + "step": 37720 + }, + { + "epoch": 2.1116250678241997, + "grad_norm": 5.271831512451172, + "learning_rate": 4.933534329176374e-06, + "loss": 0.3157, + "step": 37721 + }, + { + "epoch": 2.1116386326641345, + "grad_norm": 4.183428764343262, + "learning_rate": 4.933397286556119e-06, + "loss": 0.2017, + "step": 37722 + }, + { + "epoch": 2.1116521975040694, + "grad_norm": 4.331623077392578, + "learning_rate": 4.933260243935864e-06, + "loss": 0.1768, + "step": 37723 + }, + { + "epoch": 2.1116657623440043, + "grad_norm": 3.900657892227173, + "learning_rate": 4.93312320131561e-06, + "loss": 0.0893, + "step": 37724 + }, + { + "epoch": 2.111679327183939, + "grad_norm": 7.174276828765869, + "learning_rate": 4.932986158695355e-06, + "loss": 0.1565, + "step": 37725 + }, + { + "epoch": 2.111692892023874, + "grad_norm": 5.500100612640381, + "learning_rate": 4.9328491160751e-06, + "loss": 0.1885, + "step": 37726 + }, + { + "epoch": 2.111706456863809, + "grad_norm": 3.8427417278289795, + "learning_rate": 4.932712073454845e-06, + "loss": 0.2049, + "step": 37727 + }, + { + "epoch": 2.1117200217037437, + "grad_norm": 3.444106340408325, + "learning_rate": 4.93257503083459e-06, + "loss": 0.1346, + "step": 37728 + }, + { + "epoch": 2.1117335865436786, + "grad_norm": 4.996918678283691, + "learning_rate": 4.9324379882143355e-06, + "loss": 0.141, + "step": 37729 + }, + { + "epoch": 2.1117471513836135, + "grad_norm": 4.001639366149902, + "learning_rate": 4.93230094559408e-06, + "loss": 0.1864, + "step": 37730 + }, + { + "epoch": 2.1117607162235488, + "grad_norm": 4.29253625869751, + "learning_rate": 4.932163902973826e-06, + "loss": 0.1222, + "step": 37731 + }, + { + "epoch": 2.1117742810634836, + "grad_norm": 4.590970993041992, + "learning_rate": 4.93202686035357e-06, + "loss": 0.145, + "step": 37732 + }, + { + "epoch": 2.1117878459034185, + "grad_norm": 3.7476799488067627, + "learning_rate": 4.931889817733315e-06, + "loss": 0.1979, + "step": 37733 + }, + { + "epoch": 2.1118014107433534, + "grad_norm": 3.7966740131378174, + "learning_rate": 4.9317527751130605e-06, + "loss": 0.1333, + "step": 37734 + }, + { + "epoch": 2.1118149755832882, + "grad_norm": 3.9642577171325684, + "learning_rate": 4.931615732492806e-06, + "loss": 0.1406, + "step": 37735 + }, + { + "epoch": 2.111828540423223, + "grad_norm": 4.6384148597717285, + "learning_rate": 4.931478689872551e-06, + "loss": 0.129, + "step": 37736 + }, + { + "epoch": 2.111842105263158, + "grad_norm": 5.1043171882629395, + "learning_rate": 4.931341647252296e-06, + "loss": 0.2686, + "step": 37737 + }, + { + "epoch": 2.111855670103093, + "grad_norm": 3.305436611175537, + "learning_rate": 4.931204604632041e-06, + "loss": 0.1173, + "step": 37738 + }, + { + "epoch": 2.1118692349430277, + "grad_norm": 4.531826019287109, + "learning_rate": 4.931067562011786e-06, + "loss": 0.1302, + "step": 37739 + }, + { + "epoch": 2.1118827997829626, + "grad_norm": 4.415868759155273, + "learning_rate": 4.930930519391532e-06, + "loss": 0.2225, + "step": 37740 + }, + { + "epoch": 2.1118963646228974, + "grad_norm": 5.545820236206055, + "learning_rate": 4.930793476771276e-06, + "loss": 0.1623, + "step": 37741 + }, + { + "epoch": 2.1119099294628323, + "grad_norm": 5.3035969734191895, + "learning_rate": 4.930656434151022e-06, + "loss": 0.1401, + "step": 37742 + }, + { + "epoch": 2.111923494302767, + "grad_norm": 3.8761963844299316, + "learning_rate": 4.930519391530766e-06, + "loss": 0.213, + "step": 37743 + }, + { + "epoch": 2.111937059142702, + "grad_norm": 4.544070720672607, + "learning_rate": 4.9303823489105115e-06, + "loss": 0.1723, + "step": 37744 + }, + { + "epoch": 2.111950623982637, + "grad_norm": 3.5737733840942383, + "learning_rate": 4.930245306290257e-06, + "loss": 0.1216, + "step": 37745 + }, + { + "epoch": 2.1119641888225718, + "grad_norm": 5.213139533996582, + "learning_rate": 4.930108263670002e-06, + "loss": 0.3468, + "step": 37746 + }, + { + "epoch": 2.1119777536625066, + "grad_norm": 3.895815372467041, + "learning_rate": 4.929971221049747e-06, + "loss": 0.0868, + "step": 37747 + }, + { + "epoch": 2.1119913185024415, + "grad_norm": 4.102388858795166, + "learning_rate": 4.929834178429491e-06, + "loss": 0.1814, + "step": 37748 + }, + { + "epoch": 2.1120048833423763, + "grad_norm": 6.261706829071045, + "learning_rate": 4.929697135809237e-06, + "loss": 0.2379, + "step": 37749 + }, + { + "epoch": 2.1120184481823117, + "grad_norm": 3.7160484790802, + "learning_rate": 4.929560093188982e-06, + "loss": 0.1546, + "step": 37750 + }, + { + "epoch": 2.1120320130222465, + "grad_norm": 4.3593010902404785, + "learning_rate": 4.929423050568728e-06, + "loss": 0.1841, + "step": 37751 + }, + { + "epoch": 2.1120455778621814, + "grad_norm": 4.9685893058776855, + "learning_rate": 4.929286007948472e-06, + "loss": 0.1619, + "step": 37752 + }, + { + "epoch": 2.1120591427021163, + "grad_norm": 5.422763347625732, + "learning_rate": 4.929148965328217e-06, + "loss": 0.2465, + "step": 37753 + }, + { + "epoch": 2.112072707542051, + "grad_norm": 4.954564571380615, + "learning_rate": 4.929011922707962e-06, + "loss": 0.2315, + "step": 37754 + }, + { + "epoch": 2.112086272381986, + "grad_norm": 6.481478691101074, + "learning_rate": 4.928874880087708e-06, + "loss": 0.2461, + "step": 37755 + }, + { + "epoch": 2.112099837221921, + "grad_norm": 6.070411205291748, + "learning_rate": 4.928737837467453e-06, + "loss": 0.2047, + "step": 37756 + }, + { + "epoch": 2.1121134020618557, + "grad_norm": 4.415811061859131, + "learning_rate": 4.928600794847198e-06, + "loss": 0.2006, + "step": 37757 + }, + { + "epoch": 2.1121269669017906, + "grad_norm": 4.825860023498535, + "learning_rate": 4.928463752226943e-06, + "loss": 0.1739, + "step": 37758 + }, + { + "epoch": 2.1121405317417254, + "grad_norm": 4.611213684082031, + "learning_rate": 4.928326709606688e-06, + "loss": 0.1506, + "step": 37759 + }, + { + "epoch": 2.1121540965816603, + "grad_norm": 4.530659198760986, + "learning_rate": 4.9281896669864335e-06, + "loss": 0.1237, + "step": 37760 + }, + { + "epoch": 2.112167661421595, + "grad_norm": 5.918539047241211, + "learning_rate": 4.928052624366178e-06, + "loss": 0.1209, + "step": 37761 + }, + { + "epoch": 2.11218122626153, + "grad_norm": 4.68961238861084, + "learning_rate": 4.927915581745924e-06, + "loss": 0.2109, + "step": 37762 + }, + { + "epoch": 2.112194791101465, + "grad_norm": 5.150729656219482, + "learning_rate": 4.927778539125668e-06, + "loss": 0.1925, + "step": 37763 + }, + { + "epoch": 2.1122083559413998, + "grad_norm": 3.740337371826172, + "learning_rate": 4.927641496505413e-06, + "loss": 0.2277, + "step": 37764 + }, + { + "epoch": 2.1122219207813346, + "grad_norm": 3.574321746826172, + "learning_rate": 4.9275044538851585e-06, + "loss": 0.1721, + "step": 37765 + }, + { + "epoch": 2.1122354856212695, + "grad_norm": 3.8229167461395264, + "learning_rate": 4.927367411264904e-06, + "loss": 0.1314, + "step": 37766 + }, + { + "epoch": 2.1122490504612044, + "grad_norm": 6.407866954803467, + "learning_rate": 4.927230368644649e-06, + "loss": 0.2362, + "step": 37767 + }, + { + "epoch": 2.1122626153011392, + "grad_norm": 5.678332805633545, + "learning_rate": 4.927093326024394e-06, + "loss": 0.2342, + "step": 37768 + }, + { + "epoch": 2.1122761801410745, + "grad_norm": 4.0571441650390625, + "learning_rate": 4.926956283404139e-06, + "loss": 0.1109, + "step": 37769 + }, + { + "epoch": 2.1122897449810094, + "grad_norm": 4.168806552886963, + "learning_rate": 4.9268192407838844e-06, + "loss": 0.157, + "step": 37770 + }, + { + "epoch": 2.1123033098209443, + "grad_norm": 4.031968593597412, + "learning_rate": 4.92668219816363e-06, + "loss": 0.1746, + "step": 37771 + }, + { + "epoch": 2.112316874660879, + "grad_norm": 4.926746845245361, + "learning_rate": 4.926545155543375e-06, + "loss": 0.2518, + "step": 37772 + }, + { + "epoch": 2.112330439500814, + "grad_norm": 4.580276966094971, + "learning_rate": 4.926408112923119e-06, + "loss": 0.1646, + "step": 37773 + }, + { + "epoch": 2.112344004340749, + "grad_norm": 3.068592071533203, + "learning_rate": 4.926271070302864e-06, + "loss": 0.1081, + "step": 37774 + }, + { + "epoch": 2.1123575691806837, + "grad_norm": 4.3321943283081055, + "learning_rate": 4.9261340276826095e-06, + "loss": 0.1866, + "step": 37775 + }, + { + "epoch": 2.1123711340206186, + "grad_norm": 4.689193248748779, + "learning_rate": 4.925996985062355e-06, + "loss": 0.2012, + "step": 37776 + }, + { + "epoch": 2.1123846988605535, + "grad_norm": 4.5480217933654785, + "learning_rate": 4.9258599424421e-06, + "loss": 0.1219, + "step": 37777 + }, + { + "epoch": 2.1123982637004883, + "grad_norm": 4.587195873260498, + "learning_rate": 4.925722899821845e-06, + "loss": 0.1649, + "step": 37778 + }, + { + "epoch": 2.112411828540423, + "grad_norm": 5.192200660705566, + "learning_rate": 4.92558585720159e-06, + "loss": 0.1215, + "step": 37779 + }, + { + "epoch": 2.112425393380358, + "grad_norm": 4.627276420593262, + "learning_rate": 4.925448814581335e-06, + "loss": 0.1859, + "step": 37780 + }, + { + "epoch": 2.112438958220293, + "grad_norm": 4.260687828063965, + "learning_rate": 4.9253117719610805e-06, + "loss": 0.2896, + "step": 37781 + }, + { + "epoch": 2.112452523060228, + "grad_norm": 5.291443824768066, + "learning_rate": 4.925174729340825e-06, + "loss": 0.2017, + "step": 37782 + }, + { + "epoch": 2.1124660879001627, + "grad_norm": 5.150305271148682, + "learning_rate": 4.925037686720571e-06, + "loss": 0.2513, + "step": 37783 + }, + { + "epoch": 2.1124796527400975, + "grad_norm": 5.193885803222656, + "learning_rate": 4.924900644100315e-06, + "loss": 0.2218, + "step": 37784 + }, + { + "epoch": 2.1124932175800324, + "grad_norm": 5.369775772094727, + "learning_rate": 4.924763601480061e-06, + "loss": 0.1789, + "step": 37785 + }, + { + "epoch": 2.1125067824199673, + "grad_norm": 6.31469202041626, + "learning_rate": 4.924626558859806e-06, + "loss": 0.2473, + "step": 37786 + }, + { + "epoch": 2.112520347259902, + "grad_norm": 5.647921085357666, + "learning_rate": 4.924489516239551e-06, + "loss": 0.2061, + "step": 37787 + }, + { + "epoch": 2.1125339120998374, + "grad_norm": 4.707167148590088, + "learning_rate": 4.924352473619296e-06, + "loss": 0.1368, + "step": 37788 + }, + { + "epoch": 2.1125474769397723, + "grad_norm": 4.756472587585449, + "learning_rate": 4.924215430999041e-06, + "loss": 0.255, + "step": 37789 + }, + { + "epoch": 2.112561041779707, + "grad_norm": 4.824155330657959, + "learning_rate": 4.924078388378786e-06, + "loss": 0.1955, + "step": 37790 + }, + { + "epoch": 2.112574606619642, + "grad_norm": 4.5679779052734375, + "learning_rate": 4.9239413457585315e-06, + "loss": 0.2122, + "step": 37791 + }, + { + "epoch": 2.112588171459577, + "grad_norm": 4.948588848114014, + "learning_rate": 4.923804303138277e-06, + "loss": 0.1841, + "step": 37792 + }, + { + "epoch": 2.1126017362995118, + "grad_norm": 3.532665491104126, + "learning_rate": 4.923667260518021e-06, + "loss": 0.1811, + "step": 37793 + }, + { + "epoch": 2.1126153011394466, + "grad_norm": 5.206410884857178, + "learning_rate": 4.923530217897767e-06, + "loss": 0.1415, + "step": 37794 + }, + { + "epoch": 2.1126288659793815, + "grad_norm": 5.787376880645752, + "learning_rate": 4.923393175277511e-06, + "loss": 0.2174, + "step": 37795 + }, + { + "epoch": 2.1126424308193164, + "grad_norm": 5.10504150390625, + "learning_rate": 4.923256132657257e-06, + "loss": 0.218, + "step": 37796 + }, + { + "epoch": 2.112655995659251, + "grad_norm": 4.056045055389404, + "learning_rate": 4.923119090037002e-06, + "loss": 0.2224, + "step": 37797 + }, + { + "epoch": 2.112669560499186, + "grad_norm": 3.413429021835327, + "learning_rate": 4.922982047416747e-06, + "loss": 0.1239, + "step": 37798 + }, + { + "epoch": 2.112683125339121, + "grad_norm": 4.576292514801025, + "learning_rate": 4.922845004796492e-06, + "loss": 0.164, + "step": 37799 + }, + { + "epoch": 2.112696690179056, + "grad_norm": 6.413699626922607, + "learning_rate": 4.922707962176237e-06, + "loss": 0.1072, + "step": 37800 + }, + { + "epoch": 2.1127102550189907, + "grad_norm": 4.712959289550781, + "learning_rate": 4.9225709195559824e-06, + "loss": 0.1867, + "step": 37801 + }, + { + "epoch": 2.1127238198589255, + "grad_norm": 6.070496082305908, + "learning_rate": 4.922433876935727e-06, + "loss": 0.1974, + "step": 37802 + }, + { + "epoch": 2.1127373846988604, + "grad_norm": 5.599503517150879, + "learning_rate": 4.922296834315473e-06, + "loss": 0.1847, + "step": 37803 + }, + { + "epoch": 2.1127509495387953, + "grad_norm": 5.875993251800537, + "learning_rate": 4.922159791695217e-06, + "loss": 0.1691, + "step": 37804 + }, + { + "epoch": 2.11276451437873, + "grad_norm": 4.270632266998291, + "learning_rate": 4.922022749074963e-06, + "loss": 0.1054, + "step": 37805 + }, + { + "epoch": 2.112778079218665, + "grad_norm": 4.667281627655029, + "learning_rate": 4.9218857064547075e-06, + "loss": 0.1526, + "step": 37806 + }, + { + "epoch": 2.1127916440586003, + "grad_norm": 5.312440395355225, + "learning_rate": 4.921748663834453e-06, + "loss": 0.1733, + "step": 37807 + }, + { + "epoch": 2.112805208898535, + "grad_norm": 5.407674312591553, + "learning_rate": 4.921611621214198e-06, + "loss": 0.1846, + "step": 37808 + }, + { + "epoch": 2.11281877373847, + "grad_norm": 4.775298118591309, + "learning_rate": 4.921474578593943e-06, + "loss": 0.2062, + "step": 37809 + }, + { + "epoch": 2.112832338578405, + "grad_norm": 5.30326509475708, + "learning_rate": 4.921337535973688e-06, + "loss": 0.2636, + "step": 37810 + }, + { + "epoch": 2.1128459034183398, + "grad_norm": 4.91317081451416, + "learning_rate": 4.921200493353433e-06, + "loss": 0.2396, + "step": 37811 + }, + { + "epoch": 2.1128594682582746, + "grad_norm": 3.2112925052642822, + "learning_rate": 4.9210634507331786e-06, + "loss": 0.1295, + "step": 37812 + }, + { + "epoch": 2.1128730330982095, + "grad_norm": 5.028449058532715, + "learning_rate": 4.920926408112924e-06, + "loss": 0.2332, + "step": 37813 + }, + { + "epoch": 2.1128865979381444, + "grad_norm": 5.073397159576416, + "learning_rate": 4.920789365492669e-06, + "loss": 0.1541, + "step": 37814 + }, + { + "epoch": 2.1129001627780792, + "grad_norm": 6.009879112243652, + "learning_rate": 4.920652322872413e-06, + "loss": 0.1905, + "step": 37815 + }, + { + "epoch": 2.112913727618014, + "grad_norm": 6.239137172698975, + "learning_rate": 4.920515280252159e-06, + "loss": 0.168, + "step": 37816 + }, + { + "epoch": 2.112927292457949, + "grad_norm": 3.938253879547119, + "learning_rate": 4.920378237631904e-06, + "loss": 0.1623, + "step": 37817 + }, + { + "epoch": 2.112940857297884, + "grad_norm": 3.537191152572632, + "learning_rate": 4.920241195011649e-06, + "loss": 0.1246, + "step": 37818 + }, + { + "epoch": 2.1129544221378187, + "grad_norm": 3.567603588104248, + "learning_rate": 4.920104152391394e-06, + "loss": 0.0916, + "step": 37819 + }, + { + "epoch": 2.1129679869777536, + "grad_norm": 3.765374183654785, + "learning_rate": 4.919967109771139e-06, + "loss": 0.1726, + "step": 37820 + }, + { + "epoch": 2.1129815518176884, + "grad_norm": 5.450263977050781, + "learning_rate": 4.919830067150884e-06, + "loss": 0.1386, + "step": 37821 + }, + { + "epoch": 2.1129951166576233, + "grad_norm": 4.3349223136901855, + "learning_rate": 4.9196930245306295e-06, + "loss": 0.1292, + "step": 37822 + }, + { + "epoch": 2.113008681497558, + "grad_norm": 3.3628878593444824, + "learning_rate": 4.919555981910375e-06, + "loss": 0.1409, + "step": 37823 + }, + { + "epoch": 2.113022246337493, + "grad_norm": 3.6393392086029053, + "learning_rate": 4.91941893929012e-06, + "loss": 0.1209, + "step": 37824 + }, + { + "epoch": 2.113035811177428, + "grad_norm": 5.663567543029785, + "learning_rate": 4.919281896669865e-06, + "loss": 0.1696, + "step": 37825 + }, + { + "epoch": 2.113049376017363, + "grad_norm": 5.251705646514893, + "learning_rate": 4.91914485404961e-06, + "loss": 0.2245, + "step": 37826 + }, + { + "epoch": 2.113062940857298, + "grad_norm": 4.480950355529785, + "learning_rate": 4.9190078114293545e-06, + "loss": 0.1642, + "step": 37827 + }, + { + "epoch": 2.113076505697233, + "grad_norm": 5.097380638122559, + "learning_rate": 4.9188707688091006e-06, + "loss": 0.269, + "step": 37828 + }, + { + "epoch": 2.113090070537168, + "grad_norm": 4.302520275115967, + "learning_rate": 4.918733726188845e-06, + "loss": 0.1165, + "step": 37829 + }, + { + "epoch": 2.1131036353771027, + "grad_norm": 3.7043278217315674, + "learning_rate": 4.91859668356859e-06, + "loss": 0.147, + "step": 37830 + }, + { + "epoch": 2.1131172002170375, + "grad_norm": 4.435162544250488, + "learning_rate": 4.918459640948335e-06, + "loss": 0.1488, + "step": 37831 + }, + { + "epoch": 2.1131307650569724, + "grad_norm": 5.52271032333374, + "learning_rate": 4.9183225983280804e-06, + "loss": 0.1636, + "step": 37832 + }, + { + "epoch": 2.1131443298969073, + "grad_norm": 3.7283284664154053, + "learning_rate": 4.918185555707826e-06, + "loss": 0.1306, + "step": 37833 + }, + { + "epoch": 2.113157894736842, + "grad_norm": 4.06548547744751, + "learning_rate": 4.918048513087571e-06, + "loss": 0.2036, + "step": 37834 + }, + { + "epoch": 2.113171459576777, + "grad_norm": 3.516301393508911, + "learning_rate": 4.917911470467316e-06, + "loss": 0.2209, + "step": 37835 + }, + { + "epoch": 2.113185024416712, + "grad_norm": 5.070723056793213, + "learning_rate": 4.91777442784706e-06, + "loss": 0.1641, + "step": 37836 + }, + { + "epoch": 2.1131985892566467, + "grad_norm": 3.9167416095733643, + "learning_rate": 4.917637385226806e-06, + "loss": 0.1658, + "step": 37837 + }, + { + "epoch": 2.1132121540965816, + "grad_norm": 5.01719856262207, + "learning_rate": 4.917500342606551e-06, + "loss": 0.2816, + "step": 37838 + }, + { + "epoch": 2.1132257189365165, + "grad_norm": 3.5853981971740723, + "learning_rate": 4.917363299986297e-06, + "loss": 0.1298, + "step": 37839 + }, + { + "epoch": 2.1132392837764513, + "grad_norm": 4.508688926696777, + "learning_rate": 4.917226257366041e-06, + "loss": 0.2438, + "step": 37840 + }, + { + "epoch": 2.113252848616386, + "grad_norm": 3.9817166328430176, + "learning_rate": 4.917089214745786e-06, + "loss": 0.1232, + "step": 37841 + }, + { + "epoch": 2.113266413456321, + "grad_norm": 6.7022624015808105, + "learning_rate": 4.916952172125531e-06, + "loss": 0.2689, + "step": 37842 + }, + { + "epoch": 2.113279978296256, + "grad_norm": 3.8649239540100098, + "learning_rate": 4.9168151295052766e-06, + "loss": 0.2217, + "step": 37843 + }, + { + "epoch": 2.1132935431361908, + "grad_norm": 5.761922359466553, + "learning_rate": 4.916678086885022e-06, + "loss": 0.2356, + "step": 37844 + }, + { + "epoch": 2.113307107976126, + "grad_norm": 7.640268325805664, + "learning_rate": 4.916541044264767e-06, + "loss": 0.3143, + "step": 37845 + }, + { + "epoch": 2.113320672816061, + "grad_norm": 4.825284957885742, + "learning_rate": 4.916404001644512e-06, + "loss": 0.2337, + "step": 37846 + }, + { + "epoch": 2.113334237655996, + "grad_norm": 4.249501705169678, + "learning_rate": 4.9162669590242564e-06, + "loss": 0.2709, + "step": 37847 + }, + { + "epoch": 2.1133478024959307, + "grad_norm": 5.552248477935791, + "learning_rate": 4.9161299164040025e-06, + "loss": 0.1893, + "step": 37848 + }, + { + "epoch": 2.1133613673358655, + "grad_norm": 5.643508434295654, + "learning_rate": 4.915992873783747e-06, + "loss": 0.2485, + "step": 37849 + }, + { + "epoch": 2.1133749321758004, + "grad_norm": 5.190353870391846, + "learning_rate": 4.915855831163493e-06, + "loss": 0.2319, + "step": 37850 + }, + { + "epoch": 2.1133884970157353, + "grad_norm": 7.557851791381836, + "learning_rate": 4.915718788543237e-06, + "loss": 0.2822, + "step": 37851 + }, + { + "epoch": 2.11340206185567, + "grad_norm": 5.0717082023620605, + "learning_rate": 4.915581745922982e-06, + "loss": 0.1531, + "step": 37852 + }, + { + "epoch": 2.113415626695605, + "grad_norm": 6.06540060043335, + "learning_rate": 4.9154447033027275e-06, + "loss": 0.2331, + "step": 37853 + }, + { + "epoch": 2.11342919153554, + "grad_norm": 6.441834926605225, + "learning_rate": 4.915307660682473e-06, + "loss": 0.2885, + "step": 37854 + }, + { + "epoch": 2.1134427563754747, + "grad_norm": 6.427072525024414, + "learning_rate": 4.915170618062218e-06, + "loss": 0.2862, + "step": 37855 + }, + { + "epoch": 2.1134563212154096, + "grad_norm": 7.959908485412598, + "learning_rate": 4.915033575441962e-06, + "loss": 0.3938, + "step": 37856 + }, + { + "epoch": 2.1134698860553445, + "grad_norm": 4.526071548461914, + "learning_rate": 4.914896532821708e-06, + "loss": 0.208, + "step": 37857 + }, + { + "epoch": 2.1134834508952793, + "grad_norm": 4.075110912322998, + "learning_rate": 4.9147594902014525e-06, + "loss": 0.135, + "step": 37858 + }, + { + "epoch": 2.113497015735214, + "grad_norm": 4.549718856811523, + "learning_rate": 4.9146224475811986e-06, + "loss": 0.1696, + "step": 37859 + }, + { + "epoch": 2.113510580575149, + "grad_norm": 3.560373306274414, + "learning_rate": 4.914485404960943e-06, + "loss": 0.1168, + "step": 37860 + }, + { + "epoch": 2.113524145415084, + "grad_norm": 3.5948386192321777, + "learning_rate": 4.914348362340688e-06, + "loss": 0.1719, + "step": 37861 + }, + { + "epoch": 2.113537710255019, + "grad_norm": 4.176479816436768, + "learning_rate": 4.914211319720433e-06, + "loss": 0.1552, + "step": 37862 + }, + { + "epoch": 2.113551275094954, + "grad_norm": 5.426889419555664, + "learning_rate": 4.9140742771001784e-06, + "loss": 0.2017, + "step": 37863 + }, + { + "epoch": 2.113564839934889, + "grad_norm": 3.850922107696533, + "learning_rate": 4.913937234479924e-06, + "loss": 0.1338, + "step": 37864 + }, + { + "epoch": 2.113578404774824, + "grad_norm": 5.486979961395264, + "learning_rate": 4.913800191859669e-06, + "loss": 0.2371, + "step": 37865 + }, + { + "epoch": 2.1135919696147587, + "grad_norm": 2.881709098815918, + "learning_rate": 4.913663149239414e-06, + "loss": 0.116, + "step": 37866 + }, + { + "epoch": 2.1136055344546936, + "grad_norm": 4.872896194458008, + "learning_rate": 4.913526106619159e-06, + "loss": 0.1285, + "step": 37867 + }, + { + "epoch": 2.1136190992946284, + "grad_norm": 8.140386581420898, + "learning_rate": 4.913389063998904e-06, + "loss": 0.287, + "step": 37868 + }, + { + "epoch": 2.1136326641345633, + "grad_norm": 6.036877155303955, + "learning_rate": 4.9132520213786495e-06, + "loss": 0.2283, + "step": 37869 + }, + { + "epoch": 2.113646228974498, + "grad_norm": 4.989403247833252, + "learning_rate": 4.913114978758395e-06, + "loss": 0.195, + "step": 37870 + }, + { + "epoch": 2.113659793814433, + "grad_norm": 4.6221771240234375, + "learning_rate": 4.912977936138139e-06, + "loss": 0.2409, + "step": 37871 + }, + { + "epoch": 2.113673358654368, + "grad_norm": 3.7604897022247314, + "learning_rate": 4.912840893517884e-06, + "loss": 0.142, + "step": 37872 + }, + { + "epoch": 2.1136869234943028, + "grad_norm": 4.409125804901123, + "learning_rate": 4.912703850897629e-06, + "loss": 0.1235, + "step": 37873 + }, + { + "epoch": 2.1137004883342376, + "grad_norm": 3.9075076580047607, + "learning_rate": 4.9125668082773746e-06, + "loss": 0.1799, + "step": 37874 + }, + { + "epoch": 2.1137140531741725, + "grad_norm": 4.11862325668335, + "learning_rate": 4.91242976565712e-06, + "loss": 0.1633, + "step": 37875 + }, + { + "epoch": 2.1137276180141074, + "grad_norm": 4.231854438781738, + "learning_rate": 4.912292723036865e-06, + "loss": 0.1795, + "step": 37876 + }, + { + "epoch": 2.1137411828540422, + "grad_norm": 3.7362399101257324, + "learning_rate": 4.91215568041661e-06, + "loss": 0.117, + "step": 37877 + }, + { + "epoch": 2.113754747693977, + "grad_norm": 4.635146617889404, + "learning_rate": 4.912018637796355e-06, + "loss": 0.1644, + "step": 37878 + }, + { + "epoch": 2.113768312533912, + "grad_norm": 4.060244560241699, + "learning_rate": 4.9118815951761005e-06, + "loss": 0.2036, + "step": 37879 + }, + { + "epoch": 2.113781877373847, + "grad_norm": 4.748641014099121, + "learning_rate": 4.911744552555846e-06, + "loss": 0.2181, + "step": 37880 + }, + { + "epoch": 2.1137954422137817, + "grad_norm": 6.068209648132324, + "learning_rate": 4.91160750993559e-06, + "loss": 0.3161, + "step": 37881 + }, + { + "epoch": 2.1138090070537165, + "grad_norm": 6.3428826332092285, + "learning_rate": 4.911470467315336e-06, + "loss": 0.2752, + "step": 37882 + }, + { + "epoch": 2.113822571893652, + "grad_norm": 3.843733549118042, + "learning_rate": 4.91133342469508e-06, + "loss": 0.1734, + "step": 37883 + }, + { + "epoch": 2.1138361367335867, + "grad_norm": 5.048338413238525, + "learning_rate": 4.9111963820748255e-06, + "loss": 0.2613, + "step": 37884 + }, + { + "epoch": 2.1138497015735216, + "grad_norm": 4.717692852020264, + "learning_rate": 4.911059339454571e-06, + "loss": 0.1536, + "step": 37885 + }, + { + "epoch": 2.1138632664134565, + "grad_norm": 5.787415027618408, + "learning_rate": 4.910922296834316e-06, + "loss": 0.1667, + "step": 37886 + }, + { + "epoch": 2.1138768312533913, + "grad_norm": 5.040714263916016, + "learning_rate": 4.910785254214061e-06, + "loss": 0.2061, + "step": 37887 + }, + { + "epoch": 2.113890396093326, + "grad_norm": 7.704279899597168, + "learning_rate": 4.910648211593806e-06, + "loss": 0.2308, + "step": 37888 + }, + { + "epoch": 2.113903960933261, + "grad_norm": 5.063169479370117, + "learning_rate": 4.910511168973551e-06, + "loss": 0.2952, + "step": 37889 + }, + { + "epoch": 2.113917525773196, + "grad_norm": 6.622520446777344, + "learning_rate": 4.910374126353296e-06, + "loss": 0.2879, + "step": 37890 + }, + { + "epoch": 2.113931090613131, + "grad_norm": 5.160475254058838, + "learning_rate": 4.910237083733042e-06, + "loss": 0.2565, + "step": 37891 + }, + { + "epoch": 2.1139446554530656, + "grad_norm": 3.6630282402038574, + "learning_rate": 4.910100041112786e-06, + "loss": 0.2631, + "step": 37892 + }, + { + "epoch": 2.1139582202930005, + "grad_norm": 5.6491475105285645, + "learning_rate": 4.909962998492532e-06, + "loss": 0.3129, + "step": 37893 + }, + { + "epoch": 2.1139717851329354, + "grad_norm": 5.864149570465088, + "learning_rate": 4.9098259558722764e-06, + "loss": 0.303, + "step": 37894 + }, + { + "epoch": 2.1139853499728702, + "grad_norm": 4.088162422180176, + "learning_rate": 4.909688913252022e-06, + "loss": 0.181, + "step": 37895 + }, + { + "epoch": 2.113998914812805, + "grad_norm": 6.795029640197754, + "learning_rate": 4.909551870631767e-06, + "loss": 0.4004, + "step": 37896 + }, + { + "epoch": 2.11401247965274, + "grad_norm": 4.3716936111450195, + "learning_rate": 4.909414828011512e-06, + "loss": 0.2506, + "step": 37897 + }, + { + "epoch": 2.114026044492675, + "grad_norm": 6.769200325012207, + "learning_rate": 4.909277785391257e-06, + "loss": 0.2414, + "step": 37898 + }, + { + "epoch": 2.1140396093326097, + "grad_norm": 5.195227146148682, + "learning_rate": 4.909140742771002e-06, + "loss": 0.1693, + "step": 37899 + }, + { + "epoch": 2.1140531741725446, + "grad_norm": 5.092493534088135, + "learning_rate": 4.9090037001507475e-06, + "loss": 0.2103, + "step": 37900 + }, + { + "epoch": 2.11406673901248, + "grad_norm": 4.054653644561768, + "learning_rate": 4.908866657530492e-06, + "loss": 0.1974, + "step": 37901 + }, + { + "epoch": 2.1140803038524147, + "grad_norm": 8.11949348449707, + "learning_rate": 4.908729614910238e-06, + "loss": 0.3343, + "step": 37902 + }, + { + "epoch": 2.1140938686923496, + "grad_norm": 9.388958930969238, + "learning_rate": 4.908592572289982e-06, + "loss": 0.3486, + "step": 37903 + }, + { + "epoch": 2.1141074335322845, + "grad_norm": 5.62890625, + "learning_rate": 4.908455529669728e-06, + "loss": 0.2714, + "step": 37904 + }, + { + "epoch": 2.1141209983722193, + "grad_norm": 4.417288303375244, + "learning_rate": 4.9083184870494726e-06, + "loss": 0.1875, + "step": 37905 + }, + { + "epoch": 2.114134563212154, + "grad_norm": 3.1859982013702393, + "learning_rate": 4.908181444429218e-06, + "loss": 0.1323, + "step": 37906 + }, + { + "epoch": 2.114148128052089, + "grad_norm": 6.923009872436523, + "learning_rate": 4.908044401808963e-06, + "loss": 0.3079, + "step": 37907 + }, + { + "epoch": 2.114161692892024, + "grad_norm": 5.806600093841553, + "learning_rate": 4.907907359188708e-06, + "loss": 0.2018, + "step": 37908 + }, + { + "epoch": 2.114175257731959, + "grad_norm": 7.3398356437683105, + "learning_rate": 4.907770316568453e-06, + "loss": 0.2686, + "step": 37909 + }, + { + "epoch": 2.1141888225718937, + "grad_norm": 5.823063373565674, + "learning_rate": 4.9076332739481985e-06, + "loss": 0.3108, + "step": 37910 + }, + { + "epoch": 2.1142023874118285, + "grad_norm": 5.431607723236084, + "learning_rate": 4.907496231327944e-06, + "loss": 0.2125, + "step": 37911 + }, + { + "epoch": 2.1142159522517634, + "grad_norm": 4.7529377937316895, + "learning_rate": 4.907359188707688e-06, + "loss": 0.1556, + "step": 37912 + }, + { + "epoch": 2.1142295170916983, + "grad_norm": 5.506214141845703, + "learning_rate": 4.907222146087434e-06, + "loss": 0.2862, + "step": 37913 + }, + { + "epoch": 2.114243081931633, + "grad_norm": 5.182661056518555, + "learning_rate": 4.907085103467178e-06, + "loss": 0.2181, + "step": 37914 + }, + { + "epoch": 2.114256646771568, + "grad_norm": 5.633523941040039, + "learning_rate": 4.9069480608469235e-06, + "loss": 0.2064, + "step": 37915 + }, + { + "epoch": 2.114270211611503, + "grad_norm": 7.46161413192749, + "learning_rate": 4.906811018226669e-06, + "loss": 0.2459, + "step": 37916 + }, + { + "epoch": 2.1142837764514377, + "grad_norm": 6.577004909515381, + "learning_rate": 4.906673975606414e-06, + "loss": 0.2379, + "step": 37917 + }, + { + "epoch": 2.1142973412913726, + "grad_norm": 5.5394606590271, + "learning_rate": 4.906536932986159e-06, + "loss": 0.3188, + "step": 37918 + }, + { + "epoch": 2.1143109061313075, + "grad_norm": 7.300769329071045, + "learning_rate": 4.906399890365904e-06, + "loss": 0.215, + "step": 37919 + }, + { + "epoch": 2.1143244709712423, + "grad_norm": 5.656517028808594, + "learning_rate": 4.906262847745649e-06, + "loss": 0.2348, + "step": 37920 + }, + { + "epoch": 2.1143380358111776, + "grad_norm": 4.1623687744140625, + "learning_rate": 4.906125805125395e-06, + "loss": 0.1486, + "step": 37921 + }, + { + "epoch": 2.1143516006511125, + "grad_norm": 6.200606822967529, + "learning_rate": 4.90598876250514e-06, + "loss": 0.2186, + "step": 37922 + }, + { + "epoch": 2.1143651654910474, + "grad_norm": 4.452912330627441, + "learning_rate": 4.905851719884885e-06, + "loss": 0.1623, + "step": 37923 + }, + { + "epoch": 2.1143787303309822, + "grad_norm": 6.225486755371094, + "learning_rate": 4.905714677264629e-06, + "loss": 0.2442, + "step": 37924 + }, + { + "epoch": 2.114392295170917, + "grad_norm": 4.111649513244629, + "learning_rate": 4.9055776346443744e-06, + "loss": 0.1608, + "step": 37925 + }, + { + "epoch": 2.114405860010852, + "grad_norm": 6.664304256439209, + "learning_rate": 4.90544059202412e-06, + "loss": 0.2801, + "step": 37926 + }, + { + "epoch": 2.114419424850787, + "grad_norm": 4.991153240203857, + "learning_rate": 4.905303549403865e-06, + "loss": 0.2134, + "step": 37927 + }, + { + "epoch": 2.1144329896907217, + "grad_norm": 5.525742053985596, + "learning_rate": 4.90516650678361e-06, + "loss": 0.1732, + "step": 37928 + }, + { + "epoch": 2.1144465545306566, + "grad_norm": 6.021524429321289, + "learning_rate": 4.905029464163355e-06, + "loss": 0.2106, + "step": 37929 + }, + { + "epoch": 2.1144601193705914, + "grad_norm": 6.097970962524414, + "learning_rate": 4.9048924215431e-06, + "loss": 0.1697, + "step": 37930 + }, + { + "epoch": 2.1144736842105263, + "grad_norm": 5.423789978027344, + "learning_rate": 4.9047553789228455e-06, + "loss": 0.2756, + "step": 37931 + }, + { + "epoch": 2.114487249050461, + "grad_norm": 5.593977928161621, + "learning_rate": 4.904618336302591e-06, + "loss": 0.2394, + "step": 37932 + }, + { + "epoch": 2.114500813890396, + "grad_norm": 3.9881763458251953, + "learning_rate": 4.904481293682336e-06, + "loss": 0.1507, + "step": 37933 + }, + { + "epoch": 2.114514378730331, + "grad_norm": 5.845048904418945, + "learning_rate": 4.904344251062081e-06, + "loss": 0.2586, + "step": 37934 + }, + { + "epoch": 2.1145279435702657, + "grad_norm": 5.224837779998779, + "learning_rate": 4.904207208441825e-06, + "loss": 0.1946, + "step": 37935 + }, + { + "epoch": 2.1145415084102006, + "grad_norm": 6.3664093017578125, + "learning_rate": 4.904070165821571e-06, + "loss": 0.2066, + "step": 37936 + }, + { + "epoch": 2.1145550732501355, + "grad_norm": 5.279378890991211, + "learning_rate": 4.903933123201316e-06, + "loss": 0.1605, + "step": 37937 + }, + { + "epoch": 2.1145686380900703, + "grad_norm": 6.802852630615234, + "learning_rate": 4.903796080581062e-06, + "loss": 0.2708, + "step": 37938 + }, + { + "epoch": 2.1145822029300056, + "grad_norm": 5.918600082397461, + "learning_rate": 4.903659037960806e-06, + "loss": 0.2595, + "step": 37939 + }, + { + "epoch": 2.1145957677699405, + "grad_norm": 4.7760748863220215, + "learning_rate": 4.903521995340551e-06, + "loss": 0.2343, + "step": 37940 + }, + { + "epoch": 2.1146093326098754, + "grad_norm": 6.232849597930908, + "learning_rate": 4.9033849527202965e-06, + "loss": 0.2064, + "step": 37941 + }, + { + "epoch": 2.1146228974498102, + "grad_norm": 5.0183186531066895, + "learning_rate": 4.903247910100042e-06, + "loss": 0.1726, + "step": 37942 + }, + { + "epoch": 2.114636462289745, + "grad_norm": 5.582856178283691, + "learning_rate": 4.903110867479787e-06, + "loss": 0.1808, + "step": 37943 + }, + { + "epoch": 2.11465002712968, + "grad_norm": 6.395777225494385, + "learning_rate": 4.902973824859531e-06, + "loss": 0.1829, + "step": 37944 + }, + { + "epoch": 2.114663591969615, + "grad_norm": 5.927727699279785, + "learning_rate": 4.902836782239277e-06, + "loss": 0.3401, + "step": 37945 + }, + { + "epoch": 2.1146771568095497, + "grad_norm": 5.902190208435059, + "learning_rate": 4.9026997396190215e-06, + "loss": 0.248, + "step": 37946 + }, + { + "epoch": 2.1146907216494846, + "grad_norm": 5.499579906463623, + "learning_rate": 4.9025626969987675e-06, + "loss": 0.195, + "step": 37947 + }, + { + "epoch": 2.1147042864894194, + "grad_norm": 3.6814322471618652, + "learning_rate": 4.902425654378512e-06, + "loss": 0.0727, + "step": 37948 + }, + { + "epoch": 2.1147178513293543, + "grad_norm": 6.603621482849121, + "learning_rate": 4.902288611758257e-06, + "loss": 0.2335, + "step": 37949 + }, + { + "epoch": 2.114731416169289, + "grad_norm": 5.391689300537109, + "learning_rate": 4.902151569138002e-06, + "loss": 0.1948, + "step": 37950 + }, + { + "epoch": 2.114744981009224, + "grad_norm": 5.487659931182861, + "learning_rate": 4.902014526517747e-06, + "loss": 0.2228, + "step": 37951 + }, + { + "epoch": 2.114758545849159, + "grad_norm": 6.559010028839111, + "learning_rate": 4.901877483897493e-06, + "loss": 0.1972, + "step": 37952 + }, + { + "epoch": 2.1147721106890938, + "grad_norm": 4.999364852905273, + "learning_rate": 4.901740441277238e-06, + "loss": 0.1659, + "step": 37953 + }, + { + "epoch": 2.1147856755290286, + "grad_norm": 4.217598915100098, + "learning_rate": 4.901603398656983e-06, + "loss": 0.1412, + "step": 37954 + }, + { + "epoch": 2.1147992403689635, + "grad_norm": 5.216146945953369, + "learning_rate": 4.901466356036727e-06, + "loss": 0.1339, + "step": 37955 + }, + { + "epoch": 2.1148128052088984, + "grad_norm": 5.176929950714111, + "learning_rate": 4.901329313416473e-06, + "loss": 0.264, + "step": 37956 + }, + { + "epoch": 2.1148263700488332, + "grad_norm": 6.142651081085205, + "learning_rate": 4.901192270796218e-06, + "loss": 0.2922, + "step": 37957 + }, + { + "epoch": 2.114839934888768, + "grad_norm": 4.292889595031738, + "learning_rate": 4.901055228175964e-06, + "loss": 0.1312, + "step": 37958 + }, + { + "epoch": 2.1148534997287034, + "grad_norm": 7.681642532348633, + "learning_rate": 4.900918185555708e-06, + "loss": 0.2737, + "step": 37959 + }, + { + "epoch": 2.1148670645686383, + "grad_norm": 6.0333733558654785, + "learning_rate": 4.900781142935453e-06, + "loss": 0.1623, + "step": 37960 + }, + { + "epoch": 2.114880629408573, + "grad_norm": 4.233589172363281, + "learning_rate": 4.900644100315198e-06, + "loss": 0.1131, + "step": 37961 + }, + { + "epoch": 2.114894194248508, + "grad_norm": 6.909358024597168, + "learning_rate": 4.9005070576949435e-06, + "loss": 0.225, + "step": 37962 + }, + { + "epoch": 2.114907759088443, + "grad_norm": 3.20880389213562, + "learning_rate": 4.900370015074689e-06, + "loss": 0.0616, + "step": 37963 + }, + { + "epoch": 2.1149213239283777, + "grad_norm": 5.741124629974365, + "learning_rate": 4.900232972454434e-06, + "loss": 0.1476, + "step": 37964 + }, + { + "epoch": 2.1149348887683126, + "grad_norm": 4.384615421295166, + "learning_rate": 4.900095929834179e-06, + "loss": 0.1885, + "step": 37965 + }, + { + "epoch": 2.1149484536082475, + "grad_norm": 6.610482692718506, + "learning_rate": 4.899958887213923e-06, + "loss": 0.1855, + "step": 37966 + }, + { + "epoch": 2.1149620184481823, + "grad_norm": 4.431482791900635, + "learning_rate": 4.899821844593669e-06, + "loss": 0.2303, + "step": 37967 + }, + { + "epoch": 2.114975583288117, + "grad_norm": 6.5341339111328125, + "learning_rate": 4.899684801973414e-06, + "loss": 0.2202, + "step": 37968 + }, + { + "epoch": 2.114989148128052, + "grad_norm": 3.996951103210449, + "learning_rate": 4.899547759353159e-06, + "loss": 0.1309, + "step": 37969 + }, + { + "epoch": 2.115002712967987, + "grad_norm": 5.99315881729126, + "learning_rate": 4.899410716732904e-06, + "loss": 0.2348, + "step": 37970 + }, + { + "epoch": 2.115016277807922, + "grad_norm": 5.709745407104492, + "learning_rate": 4.899273674112649e-06, + "loss": 0.2889, + "step": 37971 + }, + { + "epoch": 2.1150298426478567, + "grad_norm": 4.263908863067627, + "learning_rate": 4.8991366314923945e-06, + "loss": 0.1966, + "step": 37972 + }, + { + "epoch": 2.1150434074877915, + "grad_norm": 4.753966808319092, + "learning_rate": 4.89899958887214e-06, + "loss": 0.243, + "step": 37973 + }, + { + "epoch": 2.1150569723277264, + "grad_norm": 4.853216648101807, + "learning_rate": 4.898862546251885e-06, + "loss": 0.2951, + "step": 37974 + }, + { + "epoch": 2.1150705371676612, + "grad_norm": 3.431535482406616, + "learning_rate": 4.89872550363163e-06, + "loss": 0.0674, + "step": 37975 + }, + { + "epoch": 2.115084102007596, + "grad_norm": 5.6595778465271, + "learning_rate": 4.898588461011375e-06, + "loss": 0.2616, + "step": 37976 + }, + { + "epoch": 2.1150976668475314, + "grad_norm": 4.385922908782959, + "learning_rate": 4.89845141839112e-06, + "loss": 0.1251, + "step": 37977 + }, + { + "epoch": 2.1151112316874663, + "grad_norm": 5.123707294464111, + "learning_rate": 4.898314375770865e-06, + "loss": 0.2638, + "step": 37978 + }, + { + "epoch": 2.115124796527401, + "grad_norm": 6.23225212097168, + "learning_rate": 4.898177333150611e-06, + "loss": 0.2645, + "step": 37979 + }, + { + "epoch": 2.115138361367336, + "grad_norm": 5.335419178009033, + "learning_rate": 4.898040290530355e-06, + "loss": 0.1545, + "step": 37980 + }, + { + "epoch": 2.115151926207271, + "grad_norm": 4.011857986450195, + "learning_rate": 4.8979032479101e-06, + "loss": 0.1139, + "step": 37981 + }, + { + "epoch": 2.1151654910472057, + "grad_norm": 4.605039596557617, + "learning_rate": 4.897766205289845e-06, + "loss": 0.1273, + "step": 37982 + }, + { + "epoch": 2.1151790558871406, + "grad_norm": 6.406795978546143, + "learning_rate": 4.897629162669591e-06, + "loss": 0.2549, + "step": 37983 + }, + { + "epoch": 2.1151926207270755, + "grad_norm": 5.722355842590332, + "learning_rate": 4.897492120049336e-06, + "loss": 0.2346, + "step": 37984 + }, + { + "epoch": 2.1152061855670103, + "grad_norm": 3.773555040359497, + "learning_rate": 4.897355077429081e-06, + "loss": 0.1634, + "step": 37985 + }, + { + "epoch": 2.115219750406945, + "grad_norm": 4.4173903465271, + "learning_rate": 4.897218034808826e-06, + "loss": 0.2236, + "step": 37986 + }, + { + "epoch": 2.11523331524688, + "grad_norm": 4.332160472869873, + "learning_rate": 4.897080992188571e-06, + "loss": 0.1514, + "step": 37987 + }, + { + "epoch": 2.115246880086815, + "grad_norm": 4.5176262855529785, + "learning_rate": 4.8969439495683165e-06, + "loss": 0.2301, + "step": 37988 + }, + { + "epoch": 2.11526044492675, + "grad_norm": 4.918111324310303, + "learning_rate": 4.896806906948061e-06, + "loss": 0.2003, + "step": 37989 + }, + { + "epoch": 2.1152740097666847, + "grad_norm": 4.9633049964904785, + "learning_rate": 4.896669864327807e-06, + "loss": 0.3025, + "step": 37990 + }, + { + "epoch": 2.1152875746066195, + "grad_norm": 5.1172895431518555, + "learning_rate": 4.896532821707551e-06, + "loss": 0.2319, + "step": 37991 + }, + { + "epoch": 2.1153011394465544, + "grad_norm": 4.34553861618042, + "learning_rate": 4.896395779087297e-06, + "loss": 0.1969, + "step": 37992 + }, + { + "epoch": 2.1153147042864893, + "grad_norm": 6.391091346740723, + "learning_rate": 4.8962587364670415e-06, + "loss": 0.4188, + "step": 37993 + }, + { + "epoch": 2.115328269126424, + "grad_norm": 6.356151580810547, + "learning_rate": 4.896121693846787e-06, + "loss": 0.3937, + "step": 37994 + }, + { + "epoch": 2.115341833966359, + "grad_norm": 3.5302891731262207, + "learning_rate": 4.895984651226532e-06, + "loss": 0.1092, + "step": 37995 + }, + { + "epoch": 2.115355398806294, + "grad_norm": 3.9951932430267334, + "learning_rate": 4.895847608606277e-06, + "loss": 0.1633, + "step": 37996 + }, + { + "epoch": 2.115368963646229, + "grad_norm": 5.340160369873047, + "learning_rate": 4.895710565986022e-06, + "loss": 0.3392, + "step": 37997 + }, + { + "epoch": 2.115382528486164, + "grad_norm": 5.653497219085693, + "learning_rate": 4.895573523365767e-06, + "loss": 0.222, + "step": 37998 + }, + { + "epoch": 2.115396093326099, + "grad_norm": 4.2356767654418945, + "learning_rate": 4.895436480745513e-06, + "loss": 0.1658, + "step": 37999 + }, + { + "epoch": 2.1154096581660338, + "grad_norm": 4.086590766906738, + "learning_rate": 4.895299438125257e-06, + "loss": 0.1236, + "step": 38000 + }, + { + "epoch": 2.1154232230059686, + "grad_norm": 5.258297920227051, + "learning_rate": 4.895162395505003e-06, + "loss": 0.225, + "step": 38001 + }, + { + "epoch": 2.1154367878459035, + "grad_norm": 5.888895034790039, + "learning_rate": 4.895025352884747e-06, + "loss": 0.2832, + "step": 38002 + }, + { + "epoch": 2.1154503526858384, + "grad_norm": 5.4150848388671875, + "learning_rate": 4.8948883102644925e-06, + "loss": 0.1828, + "step": 38003 + }, + { + "epoch": 2.1154639175257732, + "grad_norm": 4.114983558654785, + "learning_rate": 4.894751267644238e-06, + "loss": 0.1482, + "step": 38004 + }, + { + "epoch": 2.115477482365708, + "grad_norm": 4.660737037658691, + "learning_rate": 4.894614225023983e-06, + "loss": 0.2429, + "step": 38005 + }, + { + "epoch": 2.115491047205643, + "grad_norm": 3.4425294399261475, + "learning_rate": 4.894477182403728e-06, + "loss": 0.144, + "step": 38006 + }, + { + "epoch": 2.115504612045578, + "grad_norm": 3.8164167404174805, + "learning_rate": 4.894340139783473e-06, + "loss": 0.1414, + "step": 38007 + }, + { + "epoch": 2.1155181768855127, + "grad_norm": 4.7993316650390625, + "learning_rate": 4.894203097163218e-06, + "loss": 0.2159, + "step": 38008 + }, + { + "epoch": 2.1155317417254476, + "grad_norm": 4.4205498695373535, + "learning_rate": 4.894066054542963e-06, + "loss": 0.2237, + "step": 38009 + }, + { + "epoch": 2.1155453065653824, + "grad_norm": 3.845923900604248, + "learning_rate": 4.893929011922709e-06, + "loss": 0.1643, + "step": 38010 + }, + { + "epoch": 2.1155588714053173, + "grad_norm": 6.59098482131958, + "learning_rate": 4.893791969302453e-06, + "loss": 0.3048, + "step": 38011 + }, + { + "epoch": 2.115572436245252, + "grad_norm": 4.518064022064209, + "learning_rate": 4.893654926682198e-06, + "loss": 0.2271, + "step": 38012 + }, + { + "epoch": 2.115586001085187, + "grad_norm": 3.749345302581787, + "learning_rate": 4.893517884061943e-06, + "loss": 0.2769, + "step": 38013 + }, + { + "epoch": 2.115599565925122, + "grad_norm": 4.147832870483398, + "learning_rate": 4.893380841441689e-06, + "loss": 0.2495, + "step": 38014 + }, + { + "epoch": 2.115613130765057, + "grad_norm": 5.939830303192139, + "learning_rate": 4.893243798821434e-06, + "loss": 0.206, + "step": 38015 + }, + { + "epoch": 2.115626695604992, + "grad_norm": 7.053264141082764, + "learning_rate": 4.893106756201179e-06, + "loss": 0.3803, + "step": 38016 + }, + { + "epoch": 2.115640260444927, + "grad_norm": 4.095452308654785, + "learning_rate": 4.892969713580924e-06, + "loss": 0.1409, + "step": 38017 + }, + { + "epoch": 2.115653825284862, + "grad_norm": 5.3285231590271, + "learning_rate": 4.892832670960669e-06, + "loss": 0.2889, + "step": 38018 + }, + { + "epoch": 2.1156673901247967, + "grad_norm": 3.5567727088928223, + "learning_rate": 4.8926956283404145e-06, + "loss": 0.1504, + "step": 38019 + }, + { + "epoch": 2.1156809549647315, + "grad_norm": 5.746053695678711, + "learning_rate": 4.89255858572016e-06, + "loss": 0.1505, + "step": 38020 + }, + { + "epoch": 2.1156945198046664, + "grad_norm": 5.001447677612305, + "learning_rate": 4.892421543099905e-06, + "loss": 0.2209, + "step": 38021 + }, + { + "epoch": 2.1157080846446013, + "grad_norm": 2.8373749256134033, + "learning_rate": 4.892284500479649e-06, + "loss": 0.1072, + "step": 38022 + }, + { + "epoch": 2.115721649484536, + "grad_norm": 4.396749019622803, + "learning_rate": 4.892147457859394e-06, + "loss": 0.2664, + "step": 38023 + }, + { + "epoch": 2.115735214324471, + "grad_norm": 4.219638347625732, + "learning_rate": 4.8920104152391395e-06, + "loss": 0.171, + "step": 38024 + }, + { + "epoch": 2.115748779164406, + "grad_norm": 5.520439624786377, + "learning_rate": 4.891873372618885e-06, + "loss": 0.1877, + "step": 38025 + }, + { + "epoch": 2.1157623440043407, + "grad_norm": 3.4929189682006836, + "learning_rate": 4.89173632999863e-06, + "loss": 0.1969, + "step": 38026 + }, + { + "epoch": 2.1157759088442756, + "grad_norm": 5.612005710601807, + "learning_rate": 4.891599287378375e-06, + "loss": 0.2483, + "step": 38027 + }, + { + "epoch": 2.1157894736842104, + "grad_norm": 3.7936103343963623, + "learning_rate": 4.89146224475812e-06, + "loss": 0.1798, + "step": 38028 + }, + { + "epoch": 2.1158030385241453, + "grad_norm": 4.1101484298706055, + "learning_rate": 4.8913252021378654e-06, + "loss": 0.2488, + "step": 38029 + }, + { + "epoch": 2.11581660336408, + "grad_norm": 4.666894912719727, + "learning_rate": 4.891188159517611e-06, + "loss": 0.2247, + "step": 38030 + }, + { + "epoch": 2.115830168204015, + "grad_norm": 3.175872325897217, + "learning_rate": 4.891051116897356e-06, + "loss": 0.0885, + "step": 38031 + }, + { + "epoch": 2.11584373304395, + "grad_norm": 3.371950149536133, + "learning_rate": 4.8909140742771e-06, + "loss": 0.0961, + "step": 38032 + }, + { + "epoch": 2.1158572978838848, + "grad_norm": 5.64055871963501, + "learning_rate": 4.890777031656846e-06, + "loss": 0.2717, + "step": 38033 + }, + { + "epoch": 2.1158708627238196, + "grad_norm": 2.4103288650512695, + "learning_rate": 4.8906399890365905e-06, + "loss": 0.0885, + "step": 38034 + }, + { + "epoch": 2.115884427563755, + "grad_norm": 4.310950756072998, + "learning_rate": 4.890502946416336e-06, + "loss": 0.2231, + "step": 38035 + }, + { + "epoch": 2.11589799240369, + "grad_norm": 2.9753456115722656, + "learning_rate": 4.890365903796081e-06, + "loss": 0.1387, + "step": 38036 + }, + { + "epoch": 2.1159115572436247, + "grad_norm": 3.792447805404663, + "learning_rate": 4.890228861175826e-06, + "loss": 0.1227, + "step": 38037 + }, + { + "epoch": 2.1159251220835595, + "grad_norm": 4.752485752105713, + "learning_rate": 4.890091818555571e-06, + "loss": 0.193, + "step": 38038 + }, + { + "epoch": 2.1159386869234944, + "grad_norm": 3.52620530128479, + "learning_rate": 4.889954775935316e-06, + "loss": 0.1397, + "step": 38039 + }, + { + "epoch": 2.1159522517634293, + "grad_norm": 2.7511813640594482, + "learning_rate": 4.8898177333150616e-06, + "loss": 0.1236, + "step": 38040 + }, + { + "epoch": 2.115965816603364, + "grad_norm": 4.112495422363281, + "learning_rate": 4.889680690694807e-06, + "loss": 0.2202, + "step": 38041 + }, + { + "epoch": 2.115979381443299, + "grad_norm": 2.5034401416778564, + "learning_rate": 4.889543648074552e-06, + "loss": 0.1058, + "step": 38042 + }, + { + "epoch": 2.115992946283234, + "grad_norm": 3.156637668609619, + "learning_rate": 4.889406605454296e-06, + "loss": 0.1441, + "step": 38043 + }, + { + "epoch": 2.1160065111231687, + "grad_norm": 4.413824081420898, + "learning_rate": 4.889269562834042e-06, + "loss": 0.0958, + "step": 38044 + }, + { + "epoch": 2.1160200759631036, + "grad_norm": 3.0385515689849854, + "learning_rate": 4.889132520213787e-06, + "loss": 0.1227, + "step": 38045 + }, + { + "epoch": 2.1160336408030385, + "grad_norm": 2.2256081104278564, + "learning_rate": 4.888995477593533e-06, + "loss": 0.0559, + "step": 38046 + }, + { + "epoch": 2.1160472056429733, + "grad_norm": 2.4565138816833496, + "learning_rate": 4.888858434973277e-06, + "loss": 0.1015, + "step": 38047 + }, + { + "epoch": 2.116060770482908, + "grad_norm": 4.39996337890625, + "learning_rate": 4.888721392353022e-06, + "loss": 0.228, + "step": 38048 + }, + { + "epoch": 2.116074335322843, + "grad_norm": 5.215075492858887, + "learning_rate": 4.888584349732767e-06, + "loss": 0.3032, + "step": 38049 + }, + { + "epoch": 2.116087900162778, + "grad_norm": 3.7997922897338867, + "learning_rate": 4.8884473071125125e-06, + "loss": 0.1748, + "step": 38050 + }, + { + "epoch": 2.116101465002713, + "grad_norm": 4.4988884925842285, + "learning_rate": 4.888310264492258e-06, + "loss": 0.144, + "step": 38051 + }, + { + "epoch": 2.1161150298426477, + "grad_norm": 3.6181232929229736, + "learning_rate": 4.888173221872002e-06, + "loss": 0.1939, + "step": 38052 + }, + { + "epoch": 2.116128594682583, + "grad_norm": 3.0772440433502197, + "learning_rate": 4.888036179251748e-06, + "loss": 0.1299, + "step": 38053 + }, + { + "epoch": 2.116142159522518, + "grad_norm": 3.763333559036255, + "learning_rate": 4.887899136631492e-06, + "loss": 0.142, + "step": 38054 + }, + { + "epoch": 2.1161557243624527, + "grad_norm": 2.651226282119751, + "learning_rate": 4.887762094011238e-06, + "loss": 0.1538, + "step": 38055 + }, + { + "epoch": 2.1161692892023876, + "grad_norm": 5.3312668800354, + "learning_rate": 4.887625051390983e-06, + "loss": 0.2787, + "step": 38056 + }, + { + "epoch": 2.1161828540423224, + "grad_norm": 5.166918754577637, + "learning_rate": 4.887488008770728e-06, + "loss": 0.2047, + "step": 38057 + }, + { + "epoch": 2.1161964188822573, + "grad_norm": 3.828981637954712, + "learning_rate": 4.887350966150473e-06, + "loss": 0.1955, + "step": 38058 + }, + { + "epoch": 2.116209983722192, + "grad_norm": 3.1433160305023193, + "learning_rate": 4.887213923530218e-06, + "loss": 0.1251, + "step": 38059 + }, + { + "epoch": 2.116223548562127, + "grad_norm": 3.1988072395324707, + "learning_rate": 4.8870768809099634e-06, + "loss": 0.1556, + "step": 38060 + }, + { + "epoch": 2.116237113402062, + "grad_norm": 2.733103036880493, + "learning_rate": 4.886939838289709e-06, + "loss": 0.0817, + "step": 38061 + }, + { + "epoch": 2.1162506782419968, + "grad_norm": 3.3607969284057617, + "learning_rate": 4.886802795669454e-06, + "loss": 0.2024, + "step": 38062 + }, + { + "epoch": 2.1162642430819316, + "grad_norm": 4.134974956512451, + "learning_rate": 4.886665753049198e-06, + "loss": 0.1547, + "step": 38063 + }, + { + "epoch": 2.1162778079218665, + "grad_norm": 3.6599037647247314, + "learning_rate": 4.886528710428944e-06, + "loss": 0.1408, + "step": 38064 + }, + { + "epoch": 2.1162913727618013, + "grad_norm": 2.6374459266662598, + "learning_rate": 4.8863916678086885e-06, + "loss": 0.0809, + "step": 38065 + }, + { + "epoch": 2.116304937601736, + "grad_norm": 4.524782657623291, + "learning_rate": 4.886254625188434e-06, + "loss": 0.2236, + "step": 38066 + }, + { + "epoch": 2.116318502441671, + "grad_norm": 4.021321773529053, + "learning_rate": 4.886117582568179e-06, + "loss": 0.1761, + "step": 38067 + }, + { + "epoch": 2.116332067281606, + "grad_norm": 3.478288412094116, + "learning_rate": 4.885980539947924e-06, + "loss": 0.1617, + "step": 38068 + }, + { + "epoch": 2.116345632121541, + "grad_norm": 4.695715427398682, + "learning_rate": 4.885843497327669e-06, + "loss": 0.18, + "step": 38069 + }, + { + "epoch": 2.1163591969614757, + "grad_norm": 4.2208051681518555, + "learning_rate": 4.885706454707414e-06, + "loss": 0.1623, + "step": 38070 + }, + { + "epoch": 2.1163727618014105, + "grad_norm": 4.719554901123047, + "learning_rate": 4.8855694120871596e-06, + "loss": 0.1878, + "step": 38071 + }, + { + "epoch": 2.1163863266413454, + "grad_norm": 4.512644290924072, + "learning_rate": 4.885432369466905e-06, + "loss": 0.174, + "step": 38072 + }, + { + "epoch": 2.1163998914812807, + "grad_norm": 4.057526111602783, + "learning_rate": 4.88529532684665e-06, + "loss": 0.1474, + "step": 38073 + }, + { + "epoch": 2.1164134563212156, + "grad_norm": 5.9501752853393555, + "learning_rate": 4.885158284226395e-06, + "loss": 0.2213, + "step": 38074 + }, + { + "epoch": 2.1164270211611504, + "grad_norm": 4.124257564544678, + "learning_rate": 4.88502124160614e-06, + "loss": 0.1832, + "step": 38075 + }, + { + "epoch": 2.1164405860010853, + "grad_norm": 3.3707711696624756, + "learning_rate": 4.8848841989858854e-06, + "loss": 0.1738, + "step": 38076 + }, + { + "epoch": 2.11645415084102, + "grad_norm": 4.416244029998779, + "learning_rate": 4.88474715636563e-06, + "loss": 0.1575, + "step": 38077 + }, + { + "epoch": 2.116467715680955, + "grad_norm": 4.365594387054443, + "learning_rate": 4.884610113745375e-06, + "loss": 0.1522, + "step": 38078 + }, + { + "epoch": 2.11648128052089, + "grad_norm": 5.447072982788086, + "learning_rate": 4.88447307112512e-06, + "loss": 0.1622, + "step": 38079 + }, + { + "epoch": 2.1164948453608248, + "grad_norm": 4.733750820159912, + "learning_rate": 4.884336028504865e-06, + "loss": 0.1672, + "step": 38080 + }, + { + "epoch": 2.1165084102007596, + "grad_norm": 3.4730610847473145, + "learning_rate": 4.8841989858846105e-06, + "loss": 0.1378, + "step": 38081 + }, + { + "epoch": 2.1165219750406945, + "grad_norm": 5.408134460449219, + "learning_rate": 4.884061943264356e-06, + "loss": 0.2357, + "step": 38082 + }, + { + "epoch": 2.1165355398806294, + "grad_norm": 3.8479361534118652, + "learning_rate": 4.883924900644101e-06, + "loss": 0.1683, + "step": 38083 + }, + { + "epoch": 2.1165491047205642, + "grad_norm": 3.5323054790496826, + "learning_rate": 4.883787858023846e-06, + "loss": 0.093, + "step": 38084 + }, + { + "epoch": 2.116562669560499, + "grad_norm": 4.113174915313721, + "learning_rate": 4.883650815403591e-06, + "loss": 0.1004, + "step": 38085 + }, + { + "epoch": 2.116576234400434, + "grad_norm": 5.719544410705566, + "learning_rate": 4.8835137727833355e-06, + "loss": 0.2196, + "step": 38086 + }, + { + "epoch": 2.116589799240369, + "grad_norm": 5.119551181793213, + "learning_rate": 4.8833767301630816e-06, + "loss": 0.1, + "step": 38087 + }, + { + "epoch": 2.1166033640803037, + "grad_norm": 5.368305206298828, + "learning_rate": 4.883239687542826e-06, + "loss": 0.182, + "step": 38088 + }, + { + "epoch": 2.1166169289202386, + "grad_norm": 4.788884162902832, + "learning_rate": 4.883102644922572e-06, + "loss": 0.1888, + "step": 38089 + }, + { + "epoch": 2.1166304937601734, + "grad_norm": 4.330326557159424, + "learning_rate": 4.882965602302316e-06, + "loss": 0.1392, + "step": 38090 + }, + { + "epoch": 2.1166440586001087, + "grad_norm": 3.223367929458618, + "learning_rate": 4.8828285596820614e-06, + "loss": 0.0767, + "step": 38091 + }, + { + "epoch": 2.1166576234400436, + "grad_norm": 5.263343334197998, + "learning_rate": 4.882691517061807e-06, + "loss": 0.2433, + "step": 38092 + }, + { + "epoch": 2.1166711882799785, + "grad_norm": 4.260280609130859, + "learning_rate": 4.882554474441552e-06, + "loss": 0.1206, + "step": 38093 + }, + { + "epoch": 2.1166847531199133, + "grad_norm": 4.223134994506836, + "learning_rate": 4.882417431821297e-06, + "loss": 0.1002, + "step": 38094 + }, + { + "epoch": 2.116698317959848, + "grad_norm": 5.1334028244018555, + "learning_rate": 4.882280389201042e-06, + "loss": 0.1036, + "step": 38095 + }, + { + "epoch": 2.116711882799783, + "grad_norm": 4.034974575042725, + "learning_rate": 4.882143346580787e-06, + "loss": 0.1346, + "step": 38096 + }, + { + "epoch": 2.116725447639718, + "grad_norm": 3.7558610439300537, + "learning_rate": 4.882006303960532e-06, + "loss": 0.1222, + "step": 38097 + }, + { + "epoch": 2.116739012479653, + "grad_norm": 3.9320483207702637, + "learning_rate": 4.881869261340278e-06, + "loss": 0.0974, + "step": 38098 + }, + { + "epoch": 2.1167525773195877, + "grad_norm": 4.91774845123291, + "learning_rate": 4.881732218720022e-06, + "loss": 0.1332, + "step": 38099 + }, + { + "epoch": 2.1167661421595225, + "grad_norm": 5.841246128082275, + "learning_rate": 4.881595176099768e-06, + "loss": 0.1542, + "step": 38100 + }, + { + "epoch": 2.1167797069994574, + "grad_norm": 5.361678123474121, + "learning_rate": 4.881458133479512e-06, + "loss": 0.2557, + "step": 38101 + }, + { + "epoch": 2.1167932718393923, + "grad_norm": 4.455821990966797, + "learning_rate": 4.8813210908592576e-06, + "loss": 0.1289, + "step": 38102 + }, + { + "epoch": 2.116806836679327, + "grad_norm": 6.6789021492004395, + "learning_rate": 4.881184048239003e-06, + "loss": 0.2527, + "step": 38103 + }, + { + "epoch": 2.116820401519262, + "grad_norm": 5.241738796234131, + "learning_rate": 4.881047005618748e-06, + "loss": 0.1792, + "step": 38104 + }, + { + "epoch": 2.116833966359197, + "grad_norm": 5.879714488983154, + "learning_rate": 4.880909962998493e-06, + "loss": 0.3047, + "step": 38105 + }, + { + "epoch": 2.1168475311991317, + "grad_norm": 5.464931488037109, + "learning_rate": 4.8807729203782374e-06, + "loss": 0.2322, + "step": 38106 + }, + { + "epoch": 2.1168610960390666, + "grad_norm": 4.922351837158203, + "learning_rate": 4.8806358777579835e-06, + "loss": 0.2137, + "step": 38107 + }, + { + "epoch": 2.1168746608790014, + "grad_norm": 5.109568119049072, + "learning_rate": 4.880498835137728e-06, + "loss": 0.1688, + "step": 38108 + }, + { + "epoch": 2.1168882257189363, + "grad_norm": 5.361091613769531, + "learning_rate": 4.880361792517474e-06, + "loss": 0.2404, + "step": 38109 + }, + { + "epoch": 2.116901790558871, + "grad_norm": 3.064852476119995, + "learning_rate": 4.880224749897218e-06, + "loss": 0.1002, + "step": 38110 + }, + { + "epoch": 2.1169153553988065, + "grad_norm": 4.496077537536621, + "learning_rate": 4.880087707276963e-06, + "loss": 0.179, + "step": 38111 + }, + { + "epoch": 2.1169289202387414, + "grad_norm": 6.364594459533691, + "learning_rate": 4.8799506646567085e-06, + "loss": 0.2991, + "step": 38112 + }, + { + "epoch": 2.116942485078676, + "grad_norm": 4.465275764465332, + "learning_rate": 4.879813622036454e-06, + "loss": 0.0771, + "step": 38113 + }, + { + "epoch": 2.116956049918611, + "grad_norm": 4.855903625488281, + "learning_rate": 4.879676579416199e-06, + "loss": 0.1001, + "step": 38114 + }, + { + "epoch": 2.116969614758546, + "grad_norm": 5.372677803039551, + "learning_rate": 4.879539536795944e-06, + "loss": 0.2218, + "step": 38115 + }, + { + "epoch": 2.116983179598481, + "grad_norm": 5.180665969848633, + "learning_rate": 4.879402494175689e-06, + "loss": 0.2271, + "step": 38116 + }, + { + "epoch": 2.1169967444384157, + "grad_norm": 6.253826141357422, + "learning_rate": 4.879265451555434e-06, + "loss": 0.1938, + "step": 38117 + }, + { + "epoch": 2.1170103092783505, + "grad_norm": 5.099809169769287, + "learning_rate": 4.8791284089351796e-06, + "loss": 0.1288, + "step": 38118 + }, + { + "epoch": 2.1170238741182854, + "grad_norm": 4.826717853546143, + "learning_rate": 4.878991366314924e-06, + "loss": 0.1739, + "step": 38119 + }, + { + "epoch": 2.1170374389582203, + "grad_norm": 5.5720391273498535, + "learning_rate": 4.878854323694669e-06, + "loss": 0.1416, + "step": 38120 + }, + { + "epoch": 2.117051003798155, + "grad_norm": 5.123623371124268, + "learning_rate": 4.878717281074414e-06, + "loss": 0.1787, + "step": 38121 + }, + { + "epoch": 2.11706456863809, + "grad_norm": 4.267484188079834, + "learning_rate": 4.8785802384541594e-06, + "loss": 0.1148, + "step": 38122 + }, + { + "epoch": 2.117078133478025, + "grad_norm": 5.360883712768555, + "learning_rate": 4.878443195833905e-06, + "loss": 0.3063, + "step": 38123 + }, + { + "epoch": 2.1170916983179597, + "grad_norm": 3.6877152919769287, + "learning_rate": 4.87830615321365e-06, + "loss": 0.1584, + "step": 38124 + }, + { + "epoch": 2.1171052631578946, + "grad_norm": 4.315601825714111, + "learning_rate": 4.878169110593395e-06, + "loss": 0.186, + "step": 38125 + }, + { + "epoch": 2.1171188279978295, + "grad_norm": 8.410405158996582, + "learning_rate": 4.87803206797314e-06, + "loss": 0.3726, + "step": 38126 + }, + { + "epoch": 2.1171323928377643, + "grad_norm": 3.001213550567627, + "learning_rate": 4.877895025352885e-06, + "loss": 0.1088, + "step": 38127 + }, + { + "epoch": 2.117145957677699, + "grad_norm": 4.943282604217529, + "learning_rate": 4.8777579827326305e-06, + "loss": 0.218, + "step": 38128 + }, + { + "epoch": 2.1171595225176345, + "grad_norm": 4.842109680175781, + "learning_rate": 4.877620940112376e-06, + "loss": 0.1661, + "step": 38129 + }, + { + "epoch": 2.1171730873575694, + "grad_norm": 3.7219040393829346, + "learning_rate": 4.877483897492121e-06, + "loss": 0.116, + "step": 38130 + }, + { + "epoch": 2.1171866521975042, + "grad_norm": 4.9894304275512695, + "learning_rate": 4.877346854871865e-06, + "loss": 0.3366, + "step": 38131 + }, + { + "epoch": 2.117200217037439, + "grad_norm": 4.262874603271484, + "learning_rate": 4.87720981225161e-06, + "loss": 0.1478, + "step": 38132 + }, + { + "epoch": 2.117213781877374, + "grad_norm": 6.154002666473389, + "learning_rate": 4.8770727696313556e-06, + "loss": 0.2251, + "step": 38133 + }, + { + "epoch": 2.117227346717309, + "grad_norm": 3.5959901809692383, + "learning_rate": 4.876935727011101e-06, + "loss": 0.1454, + "step": 38134 + }, + { + "epoch": 2.1172409115572437, + "grad_norm": 4.826418876647949, + "learning_rate": 4.876798684390846e-06, + "loss": 0.2656, + "step": 38135 + }, + { + "epoch": 2.1172544763971786, + "grad_norm": 6.125154495239258, + "learning_rate": 4.876661641770591e-06, + "loss": 0.2371, + "step": 38136 + }, + { + "epoch": 2.1172680412371134, + "grad_norm": 5.479516983032227, + "learning_rate": 4.876524599150336e-06, + "loss": 0.1862, + "step": 38137 + }, + { + "epoch": 2.1172816060770483, + "grad_norm": 6.289843559265137, + "learning_rate": 4.8763875565300815e-06, + "loss": 0.107, + "step": 38138 + }, + { + "epoch": 2.117295170916983, + "grad_norm": 6.2843122482299805, + "learning_rate": 4.876250513909827e-06, + "loss": 0.2032, + "step": 38139 + }, + { + "epoch": 2.117308735756918, + "grad_norm": 4.7432122230529785, + "learning_rate": 4.876113471289571e-06, + "loss": 0.2607, + "step": 38140 + }, + { + "epoch": 2.117322300596853, + "grad_norm": 6.21451473236084, + "learning_rate": 4.875976428669317e-06, + "loss": 0.3132, + "step": 38141 + }, + { + "epoch": 2.1173358654367878, + "grad_norm": 4.642719268798828, + "learning_rate": 4.875839386049061e-06, + "loss": 0.135, + "step": 38142 + }, + { + "epoch": 2.1173494302767226, + "grad_norm": 4.011285781860352, + "learning_rate": 4.875702343428807e-06, + "loss": 0.1212, + "step": 38143 + }, + { + "epoch": 2.1173629951166575, + "grad_norm": 4.48731803894043, + "learning_rate": 4.875565300808552e-06, + "loss": 0.1699, + "step": 38144 + }, + { + "epoch": 2.1173765599565924, + "grad_norm": 4.714205265045166, + "learning_rate": 4.875428258188297e-06, + "loss": 0.2072, + "step": 38145 + }, + { + "epoch": 2.117390124796527, + "grad_norm": 4.104607582092285, + "learning_rate": 4.875291215568042e-06, + "loss": 0.2427, + "step": 38146 + }, + { + "epoch": 2.117403689636462, + "grad_norm": 5.084217548370361, + "learning_rate": 4.875154172947787e-06, + "loss": 0.2841, + "step": 38147 + }, + { + "epoch": 2.1174172544763974, + "grad_norm": 4.61679744720459, + "learning_rate": 4.875017130327532e-06, + "loss": 0.1872, + "step": 38148 + }, + { + "epoch": 2.1174308193163323, + "grad_norm": 5.794646739959717, + "learning_rate": 4.874880087707278e-06, + "loss": 0.2847, + "step": 38149 + }, + { + "epoch": 2.117444384156267, + "grad_norm": 8.248492240905762, + "learning_rate": 4.874743045087023e-06, + "loss": 0.3927, + "step": 38150 + }, + { + "epoch": 2.117457948996202, + "grad_norm": 5.808778762817383, + "learning_rate": 4.874606002466767e-06, + "loss": 0.1888, + "step": 38151 + }, + { + "epoch": 2.117471513836137, + "grad_norm": 5.510549068450928, + "learning_rate": 4.874468959846513e-06, + "loss": 0.2983, + "step": 38152 + }, + { + "epoch": 2.1174850786760717, + "grad_norm": 6.782939434051514, + "learning_rate": 4.8743319172262574e-06, + "loss": 0.1442, + "step": 38153 + }, + { + "epoch": 2.1174986435160066, + "grad_norm": 4.254834175109863, + "learning_rate": 4.874194874606003e-06, + "loss": 0.1585, + "step": 38154 + }, + { + "epoch": 2.1175122083559414, + "grad_norm": 5.462084770202637, + "learning_rate": 4.874057831985748e-06, + "loss": 0.1728, + "step": 38155 + }, + { + "epoch": 2.1175257731958763, + "grad_norm": 5.662916660308838, + "learning_rate": 4.873920789365493e-06, + "loss": 0.2258, + "step": 38156 + }, + { + "epoch": 2.117539338035811, + "grad_norm": 4.993857383728027, + "learning_rate": 4.873783746745238e-06, + "loss": 0.2164, + "step": 38157 + }, + { + "epoch": 2.117552902875746, + "grad_norm": 5.13858699798584, + "learning_rate": 4.873646704124983e-06, + "loss": 0.1386, + "step": 38158 + }, + { + "epoch": 2.117566467715681, + "grad_norm": 3.6000118255615234, + "learning_rate": 4.8735096615047285e-06, + "loss": 0.1296, + "step": 38159 + }, + { + "epoch": 2.1175800325556158, + "grad_norm": 5.0459208488464355, + "learning_rate": 4.873372618884473e-06, + "loss": 0.2952, + "step": 38160 + }, + { + "epoch": 2.1175935973955506, + "grad_norm": 6.2562150955200195, + "learning_rate": 4.873235576264219e-06, + "loss": 0.2326, + "step": 38161 + }, + { + "epoch": 2.1176071622354855, + "grad_norm": 5.266112327575684, + "learning_rate": 4.873098533643963e-06, + "loss": 0.2343, + "step": 38162 + }, + { + "epoch": 2.1176207270754204, + "grad_norm": 6.146487236022949, + "learning_rate": 4.872961491023709e-06, + "loss": 0.2292, + "step": 38163 + }, + { + "epoch": 2.1176342919153552, + "grad_norm": 5.026271820068359, + "learning_rate": 4.8728244484034536e-06, + "loss": 0.1938, + "step": 38164 + }, + { + "epoch": 2.11764785675529, + "grad_norm": 3.8834195137023926, + "learning_rate": 4.872687405783199e-06, + "loss": 0.1609, + "step": 38165 + }, + { + "epoch": 2.117661421595225, + "grad_norm": 5.539112567901611, + "learning_rate": 4.872550363162944e-06, + "loss": 0.209, + "step": 38166 + }, + { + "epoch": 2.1176749864351603, + "grad_norm": 6.633530616760254, + "learning_rate": 4.872413320542689e-06, + "loss": 0.3248, + "step": 38167 + }, + { + "epoch": 2.117688551275095, + "grad_norm": 5.500662803649902, + "learning_rate": 4.872276277922434e-06, + "loss": 0.1449, + "step": 38168 + }, + { + "epoch": 2.11770211611503, + "grad_norm": 5.105485439300537, + "learning_rate": 4.8721392353021795e-06, + "loss": 0.2328, + "step": 38169 + }, + { + "epoch": 2.117715680954965, + "grad_norm": 5.204995155334473, + "learning_rate": 4.872002192681925e-06, + "loss": 0.236, + "step": 38170 + }, + { + "epoch": 2.1177292457948997, + "grad_norm": 6.104000568389893, + "learning_rate": 4.87186515006167e-06, + "loss": 0.2913, + "step": 38171 + }, + { + "epoch": 2.1177428106348346, + "grad_norm": 4.345973491668701, + "learning_rate": 4.871728107441415e-06, + "loss": 0.2133, + "step": 38172 + }, + { + "epoch": 2.1177563754747695, + "grad_norm": 5.695334434509277, + "learning_rate": 4.871591064821159e-06, + "loss": 0.3811, + "step": 38173 + }, + { + "epoch": 2.1177699403147043, + "grad_norm": 5.785604000091553, + "learning_rate": 4.8714540222009045e-06, + "loss": 0.2997, + "step": 38174 + }, + { + "epoch": 2.117783505154639, + "grad_norm": 5.246399879455566, + "learning_rate": 4.87131697958065e-06, + "loss": 0.2526, + "step": 38175 + }, + { + "epoch": 2.117797069994574, + "grad_norm": 4.239151477813721, + "learning_rate": 4.871179936960395e-06, + "loss": 0.1545, + "step": 38176 + }, + { + "epoch": 2.117810634834509, + "grad_norm": 7.558828353881836, + "learning_rate": 4.87104289434014e-06, + "loss": 0.2282, + "step": 38177 + }, + { + "epoch": 2.117824199674444, + "grad_norm": 5.451466083526611, + "learning_rate": 4.870905851719885e-06, + "loss": 0.2438, + "step": 38178 + }, + { + "epoch": 2.1178377645143787, + "grad_norm": 5.103581428527832, + "learning_rate": 4.87076880909963e-06, + "loss": 0.198, + "step": 38179 + }, + { + "epoch": 2.1178513293543135, + "grad_norm": 5.087442874908447, + "learning_rate": 4.870631766479376e-06, + "loss": 0.229, + "step": 38180 + }, + { + "epoch": 2.1178648941942484, + "grad_norm": 4.4121832847595215, + "learning_rate": 4.870494723859121e-06, + "loss": 0.2843, + "step": 38181 + }, + { + "epoch": 2.1178784590341833, + "grad_norm": 5.306347846984863, + "learning_rate": 4.870357681238866e-06, + "loss": 0.3483, + "step": 38182 + }, + { + "epoch": 2.117892023874118, + "grad_norm": 5.473597049713135, + "learning_rate": 4.870220638618611e-06, + "loss": 0.3087, + "step": 38183 + }, + { + "epoch": 2.117905588714053, + "grad_norm": 3.965608835220337, + "learning_rate": 4.870083595998356e-06, + "loss": 0.1816, + "step": 38184 + }, + { + "epoch": 2.117919153553988, + "grad_norm": 5.745582580566406, + "learning_rate": 4.869946553378101e-06, + "loss": 0.3305, + "step": 38185 + }, + { + "epoch": 2.117932718393923, + "grad_norm": 4.3159027099609375, + "learning_rate": 4.869809510757847e-06, + "loss": 0.2063, + "step": 38186 + }, + { + "epoch": 2.117946283233858, + "grad_norm": 5.552708148956299, + "learning_rate": 4.869672468137591e-06, + "loss": 0.2214, + "step": 38187 + }, + { + "epoch": 2.117959848073793, + "grad_norm": 4.7883501052856445, + "learning_rate": 4.869535425517336e-06, + "loss": 0.1991, + "step": 38188 + }, + { + "epoch": 2.1179734129137278, + "grad_norm": 6.205864906311035, + "learning_rate": 4.869398382897081e-06, + "loss": 0.186, + "step": 38189 + }, + { + "epoch": 2.1179869777536626, + "grad_norm": 4.498876094818115, + "learning_rate": 4.8692613402768265e-06, + "loss": 0.2398, + "step": 38190 + }, + { + "epoch": 2.1180005425935975, + "grad_norm": 4.573760032653809, + "learning_rate": 4.869124297656572e-06, + "loss": 0.1595, + "step": 38191 + }, + { + "epoch": 2.1180141074335324, + "grad_norm": 5.9832987785339355, + "learning_rate": 4.868987255036317e-06, + "loss": 0.24, + "step": 38192 + }, + { + "epoch": 2.118027672273467, + "grad_norm": 6.2003092765808105, + "learning_rate": 4.868850212416062e-06, + "loss": 0.3029, + "step": 38193 + }, + { + "epoch": 2.118041237113402, + "grad_norm": 6.337939262390137, + "learning_rate": 4.868713169795806e-06, + "loss": 0.252, + "step": 38194 + }, + { + "epoch": 2.118054801953337, + "grad_norm": 3.565330743789673, + "learning_rate": 4.868576127175552e-06, + "loss": 0.1947, + "step": 38195 + }, + { + "epoch": 2.118068366793272, + "grad_norm": 3.575284004211426, + "learning_rate": 4.868439084555297e-06, + "loss": 0.1406, + "step": 38196 + }, + { + "epoch": 2.1180819316332067, + "grad_norm": 4.821125507354736, + "learning_rate": 4.868302041935043e-06, + "loss": 0.2496, + "step": 38197 + }, + { + "epoch": 2.1180954964731415, + "grad_norm": 6.570946216583252, + "learning_rate": 4.868164999314787e-06, + "loss": 0.2431, + "step": 38198 + }, + { + "epoch": 2.1181090613130764, + "grad_norm": 4.9164228439331055, + "learning_rate": 4.868027956694532e-06, + "loss": 0.2292, + "step": 38199 + }, + { + "epoch": 2.1181226261530113, + "grad_norm": 5.6993327140808105, + "learning_rate": 4.8678909140742775e-06, + "loss": 0.3043, + "step": 38200 + }, + { + "epoch": 2.118136190992946, + "grad_norm": 5.223666667938232, + "learning_rate": 4.867753871454023e-06, + "loss": 0.1995, + "step": 38201 + }, + { + "epoch": 2.118149755832881, + "grad_norm": 5.053431034088135, + "learning_rate": 4.867616828833768e-06, + "loss": 0.2172, + "step": 38202 + }, + { + "epoch": 2.118163320672816, + "grad_norm": 5.309780120849609, + "learning_rate": 4.867479786213513e-06, + "loss": 0.2285, + "step": 38203 + }, + { + "epoch": 2.1181768855127507, + "grad_norm": 3.980851411819458, + "learning_rate": 4.867342743593258e-06, + "loss": 0.2223, + "step": 38204 + }, + { + "epoch": 2.118190450352686, + "grad_norm": 4.529550552368164, + "learning_rate": 4.8672057009730025e-06, + "loss": 0.2356, + "step": 38205 + }, + { + "epoch": 2.118204015192621, + "grad_norm": 7.263951301574707, + "learning_rate": 4.8670686583527485e-06, + "loss": 0.2242, + "step": 38206 + }, + { + "epoch": 2.118217580032556, + "grad_norm": 3.9670286178588867, + "learning_rate": 4.866931615732493e-06, + "loss": 0.1047, + "step": 38207 + }, + { + "epoch": 2.1182311448724906, + "grad_norm": 4.582658290863037, + "learning_rate": 4.866794573112238e-06, + "loss": 0.168, + "step": 38208 + }, + { + "epoch": 2.1182447097124255, + "grad_norm": 4.422564506530762, + "learning_rate": 4.866657530491983e-06, + "loss": 0.2016, + "step": 38209 + }, + { + "epoch": 2.1182582745523604, + "grad_norm": 5.906453609466553, + "learning_rate": 4.866520487871728e-06, + "loss": 0.2488, + "step": 38210 + }, + { + "epoch": 2.1182718393922952, + "grad_norm": 5.545660972595215, + "learning_rate": 4.866383445251474e-06, + "loss": 0.2596, + "step": 38211 + }, + { + "epoch": 2.11828540423223, + "grad_norm": 6.06660795211792, + "learning_rate": 4.866246402631219e-06, + "loss": 0.2097, + "step": 38212 + }, + { + "epoch": 2.118298969072165, + "grad_norm": 6.573777675628662, + "learning_rate": 4.866109360010964e-06, + "loss": 0.3259, + "step": 38213 + }, + { + "epoch": 2.1183125339121, + "grad_norm": 5.974889755249023, + "learning_rate": 4.865972317390708e-06, + "loss": 0.3394, + "step": 38214 + }, + { + "epoch": 2.1183260987520347, + "grad_norm": 5.986969470977783, + "learning_rate": 4.865835274770454e-06, + "loss": 0.3062, + "step": 38215 + }, + { + "epoch": 2.1183396635919696, + "grad_norm": 7.9282684326171875, + "learning_rate": 4.865698232150199e-06, + "loss": 0.3828, + "step": 38216 + }, + { + "epoch": 2.1183532284319044, + "grad_norm": 6.007266521453857, + "learning_rate": 4.865561189529945e-06, + "loss": 0.3335, + "step": 38217 + }, + { + "epoch": 2.1183667932718393, + "grad_norm": 5.590966701507568, + "learning_rate": 4.865424146909689e-06, + "loss": 0.1976, + "step": 38218 + }, + { + "epoch": 2.118380358111774, + "grad_norm": 5.873636245727539, + "learning_rate": 4.865287104289434e-06, + "loss": 0.3356, + "step": 38219 + }, + { + "epoch": 2.118393922951709, + "grad_norm": 6.225541591644287, + "learning_rate": 4.865150061669179e-06, + "loss": 0.335, + "step": 38220 + }, + { + "epoch": 2.118407487791644, + "grad_norm": 4.043988227844238, + "learning_rate": 4.8650130190489245e-06, + "loss": 0.1639, + "step": 38221 + }, + { + "epoch": 2.1184210526315788, + "grad_norm": 4.77955961227417, + "learning_rate": 4.86487597642867e-06, + "loss": 0.2856, + "step": 38222 + }, + { + "epoch": 2.1184346174715136, + "grad_norm": 5.218646049499512, + "learning_rate": 4.864738933808415e-06, + "loss": 0.3, + "step": 38223 + }, + { + "epoch": 2.118448182311449, + "grad_norm": 7.869358062744141, + "learning_rate": 4.86460189118816e-06, + "loss": 0.3749, + "step": 38224 + }, + { + "epoch": 2.118461747151384, + "grad_norm": 6.725947856903076, + "learning_rate": 4.864464848567905e-06, + "loss": 0.3008, + "step": 38225 + }, + { + "epoch": 2.1184753119913187, + "grad_norm": 5.738832950592041, + "learning_rate": 4.86432780594765e-06, + "loss": 0.2659, + "step": 38226 + }, + { + "epoch": 2.1184888768312535, + "grad_norm": 6.276448726654053, + "learning_rate": 4.864190763327396e-06, + "loss": 0.5156, + "step": 38227 + }, + { + "epoch": 2.1185024416711884, + "grad_norm": 7.734469890594482, + "learning_rate": 4.86405372070714e-06, + "loss": 0.2581, + "step": 38228 + }, + { + "epoch": 2.1185160065111233, + "grad_norm": 5.452076435089111, + "learning_rate": 4.863916678086885e-06, + "loss": 0.219, + "step": 38229 + }, + { + "epoch": 2.118529571351058, + "grad_norm": 6.191189289093018, + "learning_rate": 4.86377963546663e-06, + "loss": 0.333, + "step": 38230 + }, + { + "epoch": 2.118543136190993, + "grad_norm": 5.610475063323975, + "learning_rate": 4.8636425928463755e-06, + "loss": 0.3295, + "step": 38231 + }, + { + "epoch": 2.118556701030928, + "grad_norm": 5.894343376159668, + "learning_rate": 4.863505550226121e-06, + "loss": 0.1727, + "step": 38232 + }, + { + "epoch": 2.1185702658708627, + "grad_norm": 4.576267719268799, + "learning_rate": 4.863368507605866e-06, + "loss": 0.1983, + "step": 38233 + }, + { + "epoch": 2.1185838307107976, + "grad_norm": 7.4322309494018555, + "learning_rate": 4.863231464985611e-06, + "loss": 0.2789, + "step": 38234 + }, + { + "epoch": 2.1185973955507325, + "grad_norm": 4.6229424476623535, + "learning_rate": 4.863094422365356e-06, + "loss": 0.2272, + "step": 38235 + }, + { + "epoch": 2.1186109603906673, + "grad_norm": 6.373045921325684, + "learning_rate": 4.862957379745101e-06, + "loss": 0.2521, + "step": 38236 + }, + { + "epoch": 2.118624525230602, + "grad_norm": 6.546846866607666, + "learning_rate": 4.8628203371248465e-06, + "loss": 0.32, + "step": 38237 + }, + { + "epoch": 2.118638090070537, + "grad_norm": 6.622132301330566, + "learning_rate": 4.862683294504592e-06, + "loss": 0.3191, + "step": 38238 + }, + { + "epoch": 2.118651654910472, + "grad_norm": 5.666070938110352, + "learning_rate": 4.862546251884336e-06, + "loss": 0.2569, + "step": 38239 + }, + { + "epoch": 2.118665219750407, + "grad_norm": 6.3058857917785645, + "learning_rate": 4.862409209264082e-06, + "loss": 0.2375, + "step": 38240 + }, + { + "epoch": 2.1186787845903416, + "grad_norm": 5.337404251098633, + "learning_rate": 4.862272166643826e-06, + "loss": 0.2819, + "step": 38241 + }, + { + "epoch": 2.1186923494302765, + "grad_norm": 5.021304607391357, + "learning_rate": 4.862135124023572e-06, + "loss": 0.2759, + "step": 38242 + }, + { + "epoch": 2.118705914270212, + "grad_norm": 5.241852283477783, + "learning_rate": 4.861998081403317e-06, + "loss": 0.2611, + "step": 38243 + }, + { + "epoch": 2.1187194791101467, + "grad_norm": 6.039509296417236, + "learning_rate": 4.861861038783062e-06, + "loss": 0.2507, + "step": 38244 + }, + { + "epoch": 2.1187330439500816, + "grad_norm": 5.31232213973999, + "learning_rate": 4.861723996162807e-06, + "loss": 0.274, + "step": 38245 + }, + { + "epoch": 2.1187466087900164, + "grad_norm": 5.009944438934326, + "learning_rate": 4.861586953542552e-06, + "loss": 0.2226, + "step": 38246 + }, + { + "epoch": 2.1187601736299513, + "grad_norm": 5.620429515838623, + "learning_rate": 4.8614499109222975e-06, + "loss": 0.2664, + "step": 38247 + }, + { + "epoch": 2.118773738469886, + "grad_norm": 5.3240861892700195, + "learning_rate": 4.861312868302042e-06, + "loss": 0.3867, + "step": 38248 + }, + { + "epoch": 2.118787303309821, + "grad_norm": 5.047102451324463, + "learning_rate": 4.861175825681788e-06, + "loss": 0.1816, + "step": 38249 + }, + { + "epoch": 2.118800868149756, + "grad_norm": 4.810622692108154, + "learning_rate": 4.861038783061532e-06, + "loss": 0.1858, + "step": 38250 + }, + { + "epoch": 2.1188144329896907, + "grad_norm": 4.099294185638428, + "learning_rate": 4.860901740441278e-06, + "loss": 0.1715, + "step": 38251 + }, + { + "epoch": 2.1188279978296256, + "grad_norm": 5.615232944488525, + "learning_rate": 4.8607646978210225e-06, + "loss": 0.3921, + "step": 38252 + }, + { + "epoch": 2.1188415626695605, + "grad_norm": 4.86767578125, + "learning_rate": 4.860627655200768e-06, + "loss": 0.2673, + "step": 38253 + }, + { + "epoch": 2.1188551275094953, + "grad_norm": 3.687061309814453, + "learning_rate": 4.860490612580513e-06, + "loss": 0.1466, + "step": 38254 + }, + { + "epoch": 2.11886869234943, + "grad_norm": 4.633785247802734, + "learning_rate": 4.860353569960258e-06, + "loss": 0.1137, + "step": 38255 + }, + { + "epoch": 2.118882257189365, + "grad_norm": 6.571352958679199, + "learning_rate": 4.860216527340003e-06, + "loss": 0.2527, + "step": 38256 + }, + { + "epoch": 2.1188958220293, + "grad_norm": 4.662614822387695, + "learning_rate": 4.860079484719748e-06, + "loss": 0.1533, + "step": 38257 + }, + { + "epoch": 2.118909386869235, + "grad_norm": 4.203762531280518, + "learning_rate": 4.859942442099494e-06, + "loss": 0.1415, + "step": 38258 + }, + { + "epoch": 2.1189229517091697, + "grad_norm": 6.745728969573975, + "learning_rate": 4.859805399479238e-06, + "loss": 0.1904, + "step": 38259 + }, + { + "epoch": 2.1189365165491045, + "grad_norm": 5.52054500579834, + "learning_rate": 4.859668356858984e-06, + "loss": 0.1612, + "step": 38260 + }, + { + "epoch": 2.1189500813890394, + "grad_norm": 3.481192111968994, + "learning_rate": 4.859531314238728e-06, + "loss": 0.1044, + "step": 38261 + }, + { + "epoch": 2.1189636462289747, + "grad_norm": 6.521943092346191, + "learning_rate": 4.8593942716184735e-06, + "loss": 0.2674, + "step": 38262 + }, + { + "epoch": 2.1189772110689096, + "grad_norm": 5.489834308624268, + "learning_rate": 4.859257228998219e-06, + "loss": 0.2205, + "step": 38263 + }, + { + "epoch": 2.1189907759088444, + "grad_norm": 6.727524757385254, + "learning_rate": 4.859120186377964e-06, + "loss": 0.1938, + "step": 38264 + }, + { + "epoch": 2.1190043407487793, + "grad_norm": 5.647843360900879, + "learning_rate": 4.858983143757709e-06, + "loss": 0.2539, + "step": 38265 + }, + { + "epoch": 2.119017905588714, + "grad_norm": 6.482769966125488, + "learning_rate": 4.858846101137454e-06, + "loss": 0.3121, + "step": 38266 + }, + { + "epoch": 2.119031470428649, + "grad_norm": 6.368213653564453, + "learning_rate": 4.858709058517199e-06, + "loss": 0.2169, + "step": 38267 + }, + { + "epoch": 2.119045035268584, + "grad_norm": 3.8095972537994385, + "learning_rate": 4.8585720158969445e-06, + "loss": 0.1603, + "step": 38268 + }, + { + "epoch": 2.1190586001085188, + "grad_norm": 5.292489528656006, + "learning_rate": 4.85843497327669e-06, + "loss": 0.1974, + "step": 38269 + }, + { + "epoch": 2.1190721649484536, + "grad_norm": 4.818967342376709, + "learning_rate": 4.858297930656434e-06, + "loss": 0.1845, + "step": 38270 + }, + { + "epoch": 2.1190857297883885, + "grad_norm": 5.01568603515625, + "learning_rate": 4.85816088803618e-06, + "loss": 0.1666, + "step": 38271 + }, + { + "epoch": 2.1190992946283234, + "grad_norm": 6.009840965270996, + "learning_rate": 4.858023845415924e-06, + "loss": 0.2421, + "step": 38272 + }, + { + "epoch": 2.1191128594682582, + "grad_norm": 5.933858394622803, + "learning_rate": 4.85788680279567e-06, + "loss": 0.1811, + "step": 38273 + }, + { + "epoch": 2.119126424308193, + "grad_norm": 6.949031352996826, + "learning_rate": 4.857749760175415e-06, + "loss": 0.2777, + "step": 38274 + }, + { + "epoch": 2.119139989148128, + "grad_norm": 4.038187026977539, + "learning_rate": 4.85761271755516e-06, + "loss": 0.1289, + "step": 38275 + }, + { + "epoch": 2.119153553988063, + "grad_norm": 5.730057239532471, + "learning_rate": 4.857475674934905e-06, + "loss": 0.1703, + "step": 38276 + }, + { + "epoch": 2.1191671188279977, + "grad_norm": 4.914816379547119, + "learning_rate": 4.85733863231465e-06, + "loss": 0.1373, + "step": 38277 + }, + { + "epoch": 2.1191806836679326, + "grad_norm": 5.6435699462890625, + "learning_rate": 4.8572015896943955e-06, + "loss": 0.2666, + "step": 38278 + }, + { + "epoch": 2.1191942485078674, + "grad_norm": 5.203526020050049, + "learning_rate": 4.857064547074141e-06, + "loss": 0.1594, + "step": 38279 + }, + { + "epoch": 2.1192078133478023, + "grad_norm": 5.915379524230957, + "learning_rate": 4.856927504453886e-06, + "loss": 0.1931, + "step": 38280 + }, + { + "epoch": 2.1192213781877376, + "grad_norm": 5.262880802154541, + "learning_rate": 4.856790461833631e-06, + "loss": 0.1537, + "step": 38281 + }, + { + "epoch": 2.1192349430276725, + "grad_norm": 7.436219215393066, + "learning_rate": 4.856653419213375e-06, + "loss": 0.224, + "step": 38282 + }, + { + "epoch": 2.1192485078676073, + "grad_norm": 7.284661293029785, + "learning_rate": 4.8565163765931205e-06, + "loss": 0.1641, + "step": 38283 + }, + { + "epoch": 2.119262072707542, + "grad_norm": 3.857879400253296, + "learning_rate": 4.856379333972866e-06, + "loss": 0.1705, + "step": 38284 + }, + { + "epoch": 2.119275637547477, + "grad_norm": 4.792239189147949, + "learning_rate": 4.856242291352611e-06, + "loss": 0.1223, + "step": 38285 + }, + { + "epoch": 2.119289202387412, + "grad_norm": 4.767630577087402, + "learning_rate": 4.856105248732356e-06, + "loss": 0.1196, + "step": 38286 + }, + { + "epoch": 2.119302767227347, + "grad_norm": 7.585509777069092, + "learning_rate": 4.855968206112101e-06, + "loss": 0.3003, + "step": 38287 + }, + { + "epoch": 2.1193163320672816, + "grad_norm": 3.495321750640869, + "learning_rate": 4.8558311634918464e-06, + "loss": 0.117, + "step": 38288 + }, + { + "epoch": 2.1193298969072165, + "grad_norm": 5.0895233154296875, + "learning_rate": 4.855694120871592e-06, + "loss": 0.1734, + "step": 38289 + }, + { + "epoch": 2.1193434617471514, + "grad_norm": 4.534762859344482, + "learning_rate": 4.855557078251337e-06, + "loss": 0.1313, + "step": 38290 + }, + { + "epoch": 2.1193570265870862, + "grad_norm": 5.400264739990234, + "learning_rate": 4.855420035631082e-06, + "loss": 0.2386, + "step": 38291 + }, + { + "epoch": 2.119370591427021, + "grad_norm": 4.2201337814331055, + "learning_rate": 4.855282993010827e-06, + "loss": 0.1597, + "step": 38292 + }, + { + "epoch": 2.119384156266956, + "grad_norm": 4.798543930053711, + "learning_rate": 4.8551459503905715e-06, + "loss": 0.1375, + "step": 38293 + }, + { + "epoch": 2.119397721106891, + "grad_norm": 3.1897695064544678, + "learning_rate": 4.8550089077703175e-06, + "loss": 0.0831, + "step": 38294 + }, + { + "epoch": 2.1194112859468257, + "grad_norm": 3.8704662322998047, + "learning_rate": 4.854871865150062e-06, + "loss": 0.1053, + "step": 38295 + }, + { + "epoch": 2.1194248507867606, + "grad_norm": 5.577070713043213, + "learning_rate": 4.854734822529807e-06, + "loss": 0.2598, + "step": 38296 + }, + { + "epoch": 2.1194384156266954, + "grad_norm": 5.894707202911377, + "learning_rate": 4.854597779909552e-06, + "loss": 0.1963, + "step": 38297 + }, + { + "epoch": 2.1194519804666303, + "grad_norm": 4.61434268951416, + "learning_rate": 4.854460737289297e-06, + "loss": 0.1514, + "step": 38298 + }, + { + "epoch": 2.119465545306565, + "grad_norm": 3.8470897674560547, + "learning_rate": 4.8543236946690426e-06, + "loss": 0.1149, + "step": 38299 + }, + { + "epoch": 2.1194791101465005, + "grad_norm": 4.107395648956299, + "learning_rate": 4.854186652048788e-06, + "loss": 0.1441, + "step": 38300 + }, + { + "epoch": 2.1194926749864353, + "grad_norm": 3.5260658264160156, + "learning_rate": 4.854049609428533e-06, + "loss": 0.128, + "step": 38301 + }, + { + "epoch": 2.11950623982637, + "grad_norm": 3.5019612312316895, + "learning_rate": 4.853912566808277e-06, + "loss": 0.1374, + "step": 38302 + }, + { + "epoch": 2.119519804666305, + "grad_norm": 3.4128620624542236, + "learning_rate": 4.853775524188023e-06, + "loss": 0.1072, + "step": 38303 + }, + { + "epoch": 2.11953336950624, + "grad_norm": 3.524049997329712, + "learning_rate": 4.853638481567768e-06, + "loss": 0.1309, + "step": 38304 + }, + { + "epoch": 2.119546934346175, + "grad_norm": 3.841672897338867, + "learning_rate": 4.853501438947514e-06, + "loss": 0.1193, + "step": 38305 + }, + { + "epoch": 2.1195604991861097, + "grad_norm": 7.5490899085998535, + "learning_rate": 4.853364396327258e-06, + "loss": 0.2074, + "step": 38306 + }, + { + "epoch": 2.1195740640260445, + "grad_norm": 3.630988836288452, + "learning_rate": 4.853227353707003e-06, + "loss": 0.1093, + "step": 38307 + }, + { + "epoch": 2.1195876288659794, + "grad_norm": 4.572481632232666, + "learning_rate": 4.853090311086748e-06, + "loss": 0.144, + "step": 38308 + }, + { + "epoch": 2.1196011937059143, + "grad_norm": 4.178348064422607, + "learning_rate": 4.8529532684664935e-06, + "loss": 0.1074, + "step": 38309 + }, + { + "epoch": 2.119614758545849, + "grad_norm": 3.606773853302002, + "learning_rate": 4.852816225846239e-06, + "loss": 0.0919, + "step": 38310 + }, + { + "epoch": 2.119628323385784, + "grad_norm": 4.169139385223389, + "learning_rate": 4.852679183225983e-06, + "loss": 0.1783, + "step": 38311 + }, + { + "epoch": 2.119641888225719, + "grad_norm": 3.86608624458313, + "learning_rate": 4.852542140605729e-06, + "loss": 0.0975, + "step": 38312 + }, + { + "epoch": 2.1196554530656537, + "grad_norm": 4.365728855133057, + "learning_rate": 4.852405097985473e-06, + "loss": 0.1829, + "step": 38313 + }, + { + "epoch": 2.1196690179055886, + "grad_norm": 4.7131829261779785, + "learning_rate": 4.852268055365219e-06, + "loss": 0.0875, + "step": 38314 + }, + { + "epoch": 2.1196825827455235, + "grad_norm": 4.417270660400391, + "learning_rate": 4.852131012744964e-06, + "loss": 0.1281, + "step": 38315 + }, + { + "epoch": 2.1196961475854583, + "grad_norm": 7.189777374267578, + "learning_rate": 4.851993970124709e-06, + "loss": 0.218, + "step": 38316 + }, + { + "epoch": 2.119709712425393, + "grad_norm": 3.117246627807617, + "learning_rate": 4.851856927504454e-06, + "loss": 0.0947, + "step": 38317 + }, + { + "epoch": 2.119723277265328, + "grad_norm": 3.567688465118408, + "learning_rate": 4.851719884884199e-06, + "loss": 0.1196, + "step": 38318 + }, + { + "epoch": 2.1197368421052634, + "grad_norm": 4.117586612701416, + "learning_rate": 4.8515828422639444e-06, + "loss": 0.1039, + "step": 38319 + }, + { + "epoch": 2.1197504069451982, + "grad_norm": 5.894524574279785, + "learning_rate": 4.85144579964369e-06, + "loss": 0.2283, + "step": 38320 + }, + { + "epoch": 2.119763971785133, + "grad_norm": 5.4495015144348145, + "learning_rate": 4.851308757023435e-06, + "loss": 0.1804, + "step": 38321 + }, + { + "epoch": 2.119777536625068, + "grad_norm": 3.430379629135132, + "learning_rate": 4.85117171440318e-06, + "loss": 0.0738, + "step": 38322 + }, + { + "epoch": 2.119791101465003, + "grad_norm": 3.6334590911865234, + "learning_rate": 4.851034671782925e-06, + "loss": 0.137, + "step": 38323 + }, + { + "epoch": 2.1198046663049377, + "grad_norm": 4.586292266845703, + "learning_rate": 4.85089762916267e-06, + "loss": 0.106, + "step": 38324 + }, + { + "epoch": 2.1198182311448726, + "grad_norm": 4.176207065582275, + "learning_rate": 4.8507605865424155e-06, + "loss": 0.0819, + "step": 38325 + }, + { + "epoch": 2.1198317959848074, + "grad_norm": 4.878733158111572, + "learning_rate": 4.85062354392216e-06, + "loss": 0.1487, + "step": 38326 + }, + { + "epoch": 2.1198453608247423, + "grad_norm": 3.691107749938965, + "learning_rate": 4.850486501301905e-06, + "loss": 0.1329, + "step": 38327 + }, + { + "epoch": 2.119858925664677, + "grad_norm": 4.9363627433776855, + "learning_rate": 4.85034945868165e-06, + "loss": 0.2197, + "step": 38328 + }, + { + "epoch": 2.119872490504612, + "grad_norm": 3.99200177192688, + "learning_rate": 4.850212416061395e-06, + "loss": 0.1077, + "step": 38329 + }, + { + "epoch": 2.119886055344547, + "grad_norm": 5.00649356842041, + "learning_rate": 4.8500753734411406e-06, + "loss": 0.169, + "step": 38330 + }, + { + "epoch": 2.1198996201844817, + "grad_norm": 4.601771831512451, + "learning_rate": 4.849938330820886e-06, + "loss": 0.0799, + "step": 38331 + }, + { + "epoch": 2.1199131850244166, + "grad_norm": 4.912171840667725, + "learning_rate": 4.849801288200631e-06, + "loss": 0.158, + "step": 38332 + }, + { + "epoch": 2.1199267498643515, + "grad_norm": 3.2651267051696777, + "learning_rate": 4.849664245580376e-06, + "loss": 0.0889, + "step": 38333 + }, + { + "epoch": 2.1199403147042863, + "grad_norm": 3.686880111694336, + "learning_rate": 4.849527202960121e-06, + "loss": 0.1642, + "step": 38334 + }, + { + "epoch": 2.119953879544221, + "grad_norm": 3.4887094497680664, + "learning_rate": 4.8493901603398665e-06, + "loss": 0.1494, + "step": 38335 + }, + { + "epoch": 2.119967444384156, + "grad_norm": 5.2624921798706055, + "learning_rate": 4.849253117719611e-06, + "loss": 0.2119, + "step": 38336 + }, + { + "epoch": 2.119981009224091, + "grad_norm": 4.646086692810059, + "learning_rate": 4.849116075099357e-06, + "loss": 0.1728, + "step": 38337 + }, + { + "epoch": 2.1199945740640262, + "grad_norm": 3.434708833694458, + "learning_rate": 4.848979032479101e-06, + "loss": 0.1309, + "step": 38338 + }, + { + "epoch": 2.120008138903961, + "grad_norm": 3.3307061195373535, + "learning_rate": 4.848841989858846e-06, + "loss": 0.1827, + "step": 38339 + }, + { + "epoch": 2.120021703743896, + "grad_norm": 5.519669532775879, + "learning_rate": 4.8487049472385915e-06, + "loss": 0.1919, + "step": 38340 + }, + { + "epoch": 2.120035268583831, + "grad_norm": 3.8456740379333496, + "learning_rate": 4.848567904618337e-06, + "loss": 0.1095, + "step": 38341 + }, + { + "epoch": 2.1200488334237657, + "grad_norm": 4.006491184234619, + "learning_rate": 4.848430861998082e-06, + "loss": 0.1311, + "step": 38342 + }, + { + "epoch": 2.1200623982637006, + "grad_norm": 3.1249136924743652, + "learning_rate": 4.848293819377827e-06, + "loss": 0.1259, + "step": 38343 + }, + { + "epoch": 2.1200759631036354, + "grad_norm": 6.798045635223389, + "learning_rate": 4.848156776757572e-06, + "loss": 0.2446, + "step": 38344 + }, + { + "epoch": 2.1200895279435703, + "grad_norm": 4.735002517700195, + "learning_rate": 4.8480197341373165e-06, + "loss": 0.1998, + "step": 38345 + }, + { + "epoch": 2.120103092783505, + "grad_norm": 5.844849109649658, + "learning_rate": 4.8478826915170626e-06, + "loss": 0.2147, + "step": 38346 + }, + { + "epoch": 2.12011665762344, + "grad_norm": 3.2699060440063477, + "learning_rate": 4.847745648896807e-06, + "loss": 0.1082, + "step": 38347 + }, + { + "epoch": 2.120130222463375, + "grad_norm": 4.458327293395996, + "learning_rate": 4.847608606276553e-06, + "loss": 0.1777, + "step": 38348 + }, + { + "epoch": 2.1201437873033098, + "grad_norm": 4.027411460876465, + "learning_rate": 4.847471563656297e-06, + "loss": 0.115, + "step": 38349 + }, + { + "epoch": 2.1201573521432446, + "grad_norm": 3.1679563522338867, + "learning_rate": 4.8473345210360424e-06, + "loss": 0.0754, + "step": 38350 + }, + { + "epoch": 2.1201709169831795, + "grad_norm": 5.887576103210449, + "learning_rate": 4.847197478415788e-06, + "loss": 0.2358, + "step": 38351 + }, + { + "epoch": 2.1201844818231144, + "grad_norm": 5.11474609375, + "learning_rate": 4.847060435795533e-06, + "loss": 0.1577, + "step": 38352 + }, + { + "epoch": 2.1201980466630492, + "grad_norm": 4.199692726135254, + "learning_rate": 4.846923393175278e-06, + "loss": 0.2406, + "step": 38353 + }, + { + "epoch": 2.120211611502984, + "grad_norm": 4.013174533843994, + "learning_rate": 4.846786350555023e-06, + "loss": 0.1536, + "step": 38354 + }, + { + "epoch": 2.120225176342919, + "grad_norm": 4.727816581726074, + "learning_rate": 4.846649307934768e-06, + "loss": 0.1408, + "step": 38355 + }, + { + "epoch": 2.1202387411828543, + "grad_norm": 5.8749470710754395, + "learning_rate": 4.846512265314513e-06, + "loss": 0.2351, + "step": 38356 + }, + { + "epoch": 2.120252306022789, + "grad_norm": 5.293092250823975, + "learning_rate": 4.846375222694259e-06, + "loss": 0.1429, + "step": 38357 + }, + { + "epoch": 2.120265870862724, + "grad_norm": 6.616144180297852, + "learning_rate": 4.846238180074003e-06, + "loss": 0.2549, + "step": 38358 + }, + { + "epoch": 2.120279435702659, + "grad_norm": 6.716892719268799, + "learning_rate": 4.846101137453749e-06, + "loss": 0.2822, + "step": 38359 + }, + { + "epoch": 2.1202930005425937, + "grad_norm": 4.657563209533691, + "learning_rate": 4.845964094833493e-06, + "loss": 0.1211, + "step": 38360 + }, + { + "epoch": 2.1203065653825286, + "grad_norm": 7.401418685913086, + "learning_rate": 4.8458270522132386e-06, + "loss": 0.1733, + "step": 38361 + }, + { + "epoch": 2.1203201302224635, + "grad_norm": 4.608692646026611, + "learning_rate": 4.845690009592984e-06, + "loss": 0.1999, + "step": 38362 + }, + { + "epoch": 2.1203336950623983, + "grad_norm": 7.0568156242370605, + "learning_rate": 4.845552966972729e-06, + "loss": 0.2232, + "step": 38363 + }, + { + "epoch": 2.120347259902333, + "grad_norm": 4.302216529846191, + "learning_rate": 4.845415924352474e-06, + "loss": 0.1854, + "step": 38364 + }, + { + "epoch": 2.120360824742268, + "grad_norm": 6.7424445152282715, + "learning_rate": 4.845278881732219e-06, + "loss": 0.2004, + "step": 38365 + }, + { + "epoch": 2.120374389582203, + "grad_norm": 6.3713788986206055, + "learning_rate": 4.8451418391119645e-06, + "loss": 0.196, + "step": 38366 + }, + { + "epoch": 2.120387954422138, + "grad_norm": 4.863402366638184, + "learning_rate": 4.845004796491709e-06, + "loss": 0.1582, + "step": 38367 + }, + { + "epoch": 2.1204015192620727, + "grad_norm": 5.906815528869629, + "learning_rate": 4.844867753871455e-06, + "loss": 0.1897, + "step": 38368 + }, + { + "epoch": 2.1204150841020075, + "grad_norm": 5.1746416091918945, + "learning_rate": 4.844730711251199e-06, + "loss": 0.1349, + "step": 38369 + }, + { + "epoch": 2.1204286489419424, + "grad_norm": 4.141191482543945, + "learning_rate": 4.844593668630944e-06, + "loss": 0.0974, + "step": 38370 + }, + { + "epoch": 2.1204422137818772, + "grad_norm": 4.729357719421387, + "learning_rate": 4.8444566260106895e-06, + "loss": 0.1946, + "step": 38371 + }, + { + "epoch": 2.120455778621812, + "grad_norm": 6.449577331542969, + "learning_rate": 4.844319583390435e-06, + "loss": 0.2066, + "step": 38372 + }, + { + "epoch": 2.120469343461747, + "grad_norm": 4.423739910125732, + "learning_rate": 4.84418254077018e-06, + "loss": 0.1525, + "step": 38373 + }, + { + "epoch": 2.120482908301682, + "grad_norm": 5.878715991973877, + "learning_rate": 4.844045498149925e-06, + "loss": 0.2395, + "step": 38374 + }, + { + "epoch": 2.1204964731416167, + "grad_norm": 4.282615661621094, + "learning_rate": 4.84390845552967e-06, + "loss": 0.2054, + "step": 38375 + }, + { + "epoch": 2.120510037981552, + "grad_norm": 6.126754283905029, + "learning_rate": 4.843771412909415e-06, + "loss": 0.174, + "step": 38376 + }, + { + "epoch": 2.120523602821487, + "grad_norm": 5.441467761993408, + "learning_rate": 4.8436343702891606e-06, + "loss": 0.2226, + "step": 38377 + }, + { + "epoch": 2.1205371676614218, + "grad_norm": 4.118372917175293, + "learning_rate": 4.843497327668906e-06, + "loss": 0.1392, + "step": 38378 + }, + { + "epoch": 2.1205507325013566, + "grad_norm": 5.152260780334473, + "learning_rate": 4.843360285048651e-06, + "loss": 0.0905, + "step": 38379 + }, + { + "epoch": 2.1205642973412915, + "grad_norm": 5.542933464050293, + "learning_rate": 4.843223242428395e-06, + "loss": 0.1817, + "step": 38380 + }, + { + "epoch": 2.1205778621812263, + "grad_norm": 4.6371846199035645, + "learning_rate": 4.8430861998081404e-06, + "loss": 0.1372, + "step": 38381 + }, + { + "epoch": 2.120591427021161, + "grad_norm": 4.653492450714111, + "learning_rate": 4.842949157187886e-06, + "loss": 0.2452, + "step": 38382 + }, + { + "epoch": 2.120604991861096, + "grad_norm": 4.004563808441162, + "learning_rate": 4.842812114567631e-06, + "loss": 0.1798, + "step": 38383 + }, + { + "epoch": 2.120618556701031, + "grad_norm": 5.010835647583008, + "learning_rate": 4.842675071947376e-06, + "loss": 0.2054, + "step": 38384 + }, + { + "epoch": 2.120632121540966, + "grad_norm": 6.039752006530762, + "learning_rate": 4.842538029327121e-06, + "loss": 0.3262, + "step": 38385 + }, + { + "epoch": 2.1206456863809007, + "grad_norm": 4.457266807556152, + "learning_rate": 4.842400986706866e-06, + "loss": 0.1704, + "step": 38386 + }, + { + "epoch": 2.1206592512208355, + "grad_norm": 4.084266662597656, + "learning_rate": 4.8422639440866115e-06, + "loss": 0.1605, + "step": 38387 + }, + { + "epoch": 2.1206728160607704, + "grad_norm": 4.149411678314209, + "learning_rate": 4.842126901466357e-06, + "loss": 0.1544, + "step": 38388 + }, + { + "epoch": 2.1206863809007053, + "grad_norm": 5.197302341461182, + "learning_rate": 4.841989858846102e-06, + "loss": 0.1637, + "step": 38389 + }, + { + "epoch": 2.12069994574064, + "grad_norm": 5.337989330291748, + "learning_rate": 4.841852816225846e-06, + "loss": 0.2351, + "step": 38390 + }, + { + "epoch": 2.120713510580575, + "grad_norm": 3.8845579624176025, + "learning_rate": 4.841715773605592e-06, + "loss": 0.1541, + "step": 38391 + }, + { + "epoch": 2.12072707542051, + "grad_norm": 4.820804595947266, + "learning_rate": 4.8415787309853366e-06, + "loss": 0.1402, + "step": 38392 + }, + { + "epoch": 2.1207406402604447, + "grad_norm": 3.9381282329559326, + "learning_rate": 4.841441688365082e-06, + "loss": 0.113, + "step": 38393 + }, + { + "epoch": 2.12075420510038, + "grad_norm": 5.307742595672607, + "learning_rate": 4.841304645744827e-06, + "loss": 0.2091, + "step": 38394 + }, + { + "epoch": 2.120767769940315, + "grad_norm": 4.77200984954834, + "learning_rate": 4.841167603124572e-06, + "loss": 0.1521, + "step": 38395 + }, + { + "epoch": 2.1207813347802498, + "grad_norm": 5.467351913452148, + "learning_rate": 4.841030560504317e-06, + "loss": 0.1911, + "step": 38396 + }, + { + "epoch": 2.1207948996201846, + "grad_norm": 5.649357795715332, + "learning_rate": 4.8408935178840625e-06, + "loss": 0.1625, + "step": 38397 + }, + { + "epoch": 2.1208084644601195, + "grad_norm": 5.716867446899414, + "learning_rate": 4.840756475263808e-06, + "loss": 0.1377, + "step": 38398 + }, + { + "epoch": 2.1208220293000544, + "grad_norm": 2.791987895965576, + "learning_rate": 4.840619432643552e-06, + "loss": 0.1112, + "step": 38399 + }, + { + "epoch": 2.1208355941399892, + "grad_norm": 4.88864278793335, + "learning_rate": 4.840482390023298e-06, + "loss": 0.1871, + "step": 38400 + }, + { + "epoch": 2.120849158979924, + "grad_norm": 5.205704689025879, + "learning_rate": 4.840345347403042e-06, + "loss": 0.1493, + "step": 38401 + }, + { + "epoch": 2.120862723819859, + "grad_norm": 4.748361587524414, + "learning_rate": 4.840208304782788e-06, + "loss": 0.1735, + "step": 38402 + }, + { + "epoch": 2.120876288659794, + "grad_norm": 4.086081504821777, + "learning_rate": 4.840071262162533e-06, + "loss": 0.1688, + "step": 38403 + }, + { + "epoch": 2.1208898534997287, + "grad_norm": 5.9488935470581055, + "learning_rate": 4.839934219542278e-06, + "loss": 0.2376, + "step": 38404 + }, + { + "epoch": 2.1209034183396636, + "grad_norm": 3.802065134048462, + "learning_rate": 4.839797176922023e-06, + "loss": 0.1636, + "step": 38405 + }, + { + "epoch": 2.1209169831795984, + "grad_norm": 7.07735538482666, + "learning_rate": 4.839660134301768e-06, + "loss": 0.225, + "step": 38406 + }, + { + "epoch": 2.1209305480195333, + "grad_norm": 5.678722858428955, + "learning_rate": 4.839523091681513e-06, + "loss": 0.1509, + "step": 38407 + }, + { + "epoch": 2.120944112859468, + "grad_norm": 5.384393215179443, + "learning_rate": 4.839386049061259e-06, + "loss": 0.1863, + "step": 38408 + }, + { + "epoch": 2.120957677699403, + "grad_norm": 5.564447402954102, + "learning_rate": 4.839249006441004e-06, + "loss": 0.1843, + "step": 38409 + }, + { + "epoch": 2.120971242539338, + "grad_norm": 4.81939697265625, + "learning_rate": 4.839111963820748e-06, + "loss": 0.163, + "step": 38410 + }, + { + "epoch": 2.1209848073792728, + "grad_norm": 6.57435417175293, + "learning_rate": 4.838974921200494e-06, + "loss": 0.2766, + "step": 38411 + }, + { + "epoch": 2.1209983722192076, + "grad_norm": 9.362244606018066, + "learning_rate": 4.8388378785802384e-06, + "loss": 0.2845, + "step": 38412 + }, + { + "epoch": 2.1210119370591425, + "grad_norm": 5.614478588104248, + "learning_rate": 4.8387008359599845e-06, + "loss": 0.2077, + "step": 38413 + }, + { + "epoch": 2.121025501899078, + "grad_norm": 5.054571151733398, + "learning_rate": 4.838563793339729e-06, + "loss": 0.2606, + "step": 38414 + }, + { + "epoch": 2.1210390667390127, + "grad_norm": 5.640186309814453, + "learning_rate": 4.838426750719474e-06, + "loss": 0.2094, + "step": 38415 + }, + { + "epoch": 2.1210526315789475, + "grad_norm": 3.880197048187256, + "learning_rate": 4.838289708099219e-06, + "loss": 0.1513, + "step": 38416 + }, + { + "epoch": 2.1210661964188824, + "grad_norm": 5.976685523986816, + "learning_rate": 4.838152665478964e-06, + "loss": 0.306, + "step": 38417 + }, + { + "epoch": 2.1210797612588173, + "grad_norm": 4.7164788246154785, + "learning_rate": 4.8380156228587095e-06, + "loss": 0.2599, + "step": 38418 + }, + { + "epoch": 2.121093326098752, + "grad_norm": 8.217769622802734, + "learning_rate": 4.837878580238455e-06, + "loss": 0.2546, + "step": 38419 + }, + { + "epoch": 2.121106890938687, + "grad_norm": 7.183389186859131, + "learning_rate": 4.8377415376182e-06, + "loss": 0.3358, + "step": 38420 + }, + { + "epoch": 2.121120455778622, + "grad_norm": 4.075717449188232, + "learning_rate": 4.837604494997944e-06, + "loss": 0.2078, + "step": 38421 + }, + { + "epoch": 2.1211340206185567, + "grad_norm": 5.211243152618408, + "learning_rate": 4.83746745237769e-06, + "loss": 0.2114, + "step": 38422 + }, + { + "epoch": 2.1211475854584916, + "grad_norm": 5.388209819793701, + "learning_rate": 4.8373304097574346e-06, + "loss": 0.2587, + "step": 38423 + }, + { + "epoch": 2.1211611502984264, + "grad_norm": 6.4714579582214355, + "learning_rate": 4.83719336713718e-06, + "loss": 0.2485, + "step": 38424 + }, + { + "epoch": 2.1211747151383613, + "grad_norm": 6.524555683135986, + "learning_rate": 4.837056324516925e-06, + "loss": 0.2479, + "step": 38425 + }, + { + "epoch": 2.121188279978296, + "grad_norm": 5.654632091522217, + "learning_rate": 4.83691928189667e-06, + "loss": 0.2638, + "step": 38426 + }, + { + "epoch": 2.121201844818231, + "grad_norm": 6.054002285003662, + "learning_rate": 4.836782239276415e-06, + "loss": 0.2443, + "step": 38427 + }, + { + "epoch": 2.121215409658166, + "grad_norm": 4.185038089752197, + "learning_rate": 4.8366451966561605e-06, + "loss": 0.1883, + "step": 38428 + }, + { + "epoch": 2.1212289744981008, + "grad_norm": 5.682998180389404, + "learning_rate": 4.836508154035906e-06, + "loss": 0.198, + "step": 38429 + }, + { + "epoch": 2.1212425393380356, + "grad_norm": 4.657747745513916, + "learning_rate": 4.836371111415651e-06, + "loss": 0.2321, + "step": 38430 + }, + { + "epoch": 2.1212561041779705, + "grad_norm": 4.090828895568848, + "learning_rate": 4.836234068795396e-06, + "loss": 0.1609, + "step": 38431 + }, + { + "epoch": 2.121269669017906, + "grad_norm": 3.8468384742736816, + "learning_rate": 4.836097026175141e-06, + "loss": 0.1734, + "step": 38432 + }, + { + "epoch": 2.1212832338578407, + "grad_norm": 4.3257341384887695, + "learning_rate": 4.835959983554886e-06, + "loss": 0.1385, + "step": 38433 + }, + { + "epoch": 2.1212967986977755, + "grad_norm": 5.372816562652588, + "learning_rate": 4.8358229409346315e-06, + "loss": 0.2532, + "step": 38434 + }, + { + "epoch": 2.1213103635377104, + "grad_norm": 4.67399787902832, + "learning_rate": 4.835685898314376e-06, + "loss": 0.229, + "step": 38435 + }, + { + "epoch": 2.1213239283776453, + "grad_norm": 3.47110652923584, + "learning_rate": 4.835548855694121e-06, + "loss": 0.1467, + "step": 38436 + }, + { + "epoch": 2.12133749321758, + "grad_norm": 5.096695899963379, + "learning_rate": 4.835411813073866e-06, + "loss": 0.1776, + "step": 38437 + }, + { + "epoch": 2.121351058057515, + "grad_norm": 6.010454177856445, + "learning_rate": 4.835274770453611e-06, + "loss": 0.141, + "step": 38438 + }, + { + "epoch": 2.12136462289745, + "grad_norm": 4.69008731842041, + "learning_rate": 4.835137727833357e-06, + "loss": 0.2279, + "step": 38439 + }, + { + "epoch": 2.1213781877373847, + "grad_norm": 4.811089515686035, + "learning_rate": 4.835000685213102e-06, + "loss": 0.1686, + "step": 38440 + }, + { + "epoch": 2.1213917525773196, + "grad_norm": 5.397887706756592, + "learning_rate": 4.834863642592847e-06, + "loss": 0.2526, + "step": 38441 + }, + { + "epoch": 2.1214053174172545, + "grad_norm": 5.589671611785889, + "learning_rate": 4.834726599972592e-06, + "loss": 0.2289, + "step": 38442 + }, + { + "epoch": 2.1214188822571893, + "grad_norm": 5.34511137008667, + "learning_rate": 4.834589557352337e-06, + "loss": 0.2305, + "step": 38443 + }, + { + "epoch": 2.121432447097124, + "grad_norm": 5.53705358505249, + "learning_rate": 4.834452514732082e-06, + "loss": 0.2876, + "step": 38444 + }, + { + "epoch": 2.121446011937059, + "grad_norm": 4.926916599273682, + "learning_rate": 4.834315472111828e-06, + "loss": 0.1605, + "step": 38445 + }, + { + "epoch": 2.121459576776994, + "grad_norm": 5.246581554412842, + "learning_rate": 4.834178429491572e-06, + "loss": 0.1759, + "step": 38446 + }, + { + "epoch": 2.121473141616929, + "grad_norm": 6.03526496887207, + "learning_rate": 4.834041386871318e-06, + "loss": 0.2479, + "step": 38447 + }, + { + "epoch": 2.1214867064568637, + "grad_norm": 4.06975793838501, + "learning_rate": 4.833904344251062e-06, + "loss": 0.1325, + "step": 38448 + }, + { + "epoch": 2.1215002712967985, + "grad_norm": 4.034987926483154, + "learning_rate": 4.8337673016308075e-06, + "loss": 0.17, + "step": 38449 + }, + { + "epoch": 2.1215138361367334, + "grad_norm": 3.5007669925689697, + "learning_rate": 4.833630259010553e-06, + "loss": 0.1587, + "step": 38450 + }, + { + "epoch": 2.1215274009766683, + "grad_norm": 3.9849400520324707, + "learning_rate": 4.833493216390298e-06, + "loss": 0.1402, + "step": 38451 + }, + { + "epoch": 2.1215409658166036, + "grad_norm": 4.676619529724121, + "learning_rate": 4.833356173770043e-06, + "loss": 0.1191, + "step": 38452 + }, + { + "epoch": 2.1215545306565384, + "grad_norm": 5.981693744659424, + "learning_rate": 4.833219131149787e-06, + "loss": 0.183, + "step": 38453 + }, + { + "epoch": 2.1215680954964733, + "grad_norm": 5.511632919311523, + "learning_rate": 4.833082088529533e-06, + "loss": 0.1389, + "step": 38454 + }, + { + "epoch": 2.121581660336408, + "grad_norm": 4.114802360534668, + "learning_rate": 4.832945045909278e-06, + "loss": 0.1283, + "step": 38455 + }, + { + "epoch": 2.121595225176343, + "grad_norm": 6.583785533905029, + "learning_rate": 4.832808003289024e-06, + "loss": 0.3039, + "step": 38456 + }, + { + "epoch": 2.121608790016278, + "grad_norm": 5.765059471130371, + "learning_rate": 4.832670960668768e-06, + "loss": 0.2673, + "step": 38457 + }, + { + "epoch": 2.1216223548562128, + "grad_norm": 5.434948921203613, + "learning_rate": 4.832533918048513e-06, + "loss": 0.2611, + "step": 38458 + }, + { + "epoch": 2.1216359196961476, + "grad_norm": 4.057236671447754, + "learning_rate": 4.8323968754282585e-06, + "loss": 0.1533, + "step": 38459 + }, + { + "epoch": 2.1216494845360825, + "grad_norm": 4.492869853973389, + "learning_rate": 4.832259832808004e-06, + "loss": 0.2057, + "step": 38460 + }, + { + "epoch": 2.1216630493760174, + "grad_norm": 3.9036169052124023, + "learning_rate": 4.832122790187749e-06, + "loss": 0.138, + "step": 38461 + }, + { + "epoch": 2.121676614215952, + "grad_norm": 7.554219722747803, + "learning_rate": 4.831985747567494e-06, + "loss": 0.2098, + "step": 38462 + }, + { + "epoch": 2.121690179055887, + "grad_norm": 4.4604716300964355, + "learning_rate": 4.831848704947239e-06, + "loss": 0.185, + "step": 38463 + }, + { + "epoch": 2.121703743895822, + "grad_norm": 5.517338275909424, + "learning_rate": 4.8317116623269835e-06, + "loss": 0.4099, + "step": 38464 + }, + { + "epoch": 2.121717308735757, + "grad_norm": 4.213395595550537, + "learning_rate": 4.8315746197067295e-06, + "loss": 0.1977, + "step": 38465 + }, + { + "epoch": 2.1217308735756917, + "grad_norm": 4.284689426422119, + "learning_rate": 4.831437577086474e-06, + "loss": 0.1384, + "step": 38466 + }, + { + "epoch": 2.1217444384156265, + "grad_norm": 4.514042377471924, + "learning_rate": 4.83130053446622e-06, + "loss": 0.1704, + "step": 38467 + }, + { + "epoch": 2.1217580032555614, + "grad_norm": 3.7447268962860107, + "learning_rate": 4.831163491845964e-06, + "loss": 0.2207, + "step": 38468 + }, + { + "epoch": 2.1217715680954963, + "grad_norm": 4.684281826019287, + "learning_rate": 4.831026449225709e-06, + "loss": 0.2522, + "step": 38469 + }, + { + "epoch": 2.1217851329354316, + "grad_norm": 4.31045389175415, + "learning_rate": 4.830889406605455e-06, + "loss": 0.1418, + "step": 38470 + }, + { + "epoch": 2.1217986977753664, + "grad_norm": 5.452924728393555, + "learning_rate": 4.8307523639852e-06, + "loss": 0.1977, + "step": 38471 + }, + { + "epoch": 2.1218122626153013, + "grad_norm": 5.64726448059082, + "learning_rate": 4.830615321364945e-06, + "loss": 0.2823, + "step": 38472 + }, + { + "epoch": 2.121825827455236, + "grad_norm": 5.222045421600342, + "learning_rate": 4.83047827874469e-06, + "loss": 0.2571, + "step": 38473 + }, + { + "epoch": 2.121839392295171, + "grad_norm": 6.769890308380127, + "learning_rate": 4.830341236124435e-06, + "loss": 0.2863, + "step": 38474 + }, + { + "epoch": 2.121852957135106, + "grad_norm": 5.089825630187988, + "learning_rate": 4.8302041935041805e-06, + "loss": 0.3136, + "step": 38475 + }, + { + "epoch": 2.1218665219750408, + "grad_norm": 5.55278205871582, + "learning_rate": 4.830067150883926e-06, + "loss": 0.344, + "step": 38476 + }, + { + "epoch": 2.1218800868149756, + "grad_norm": 6.82029914855957, + "learning_rate": 4.82993010826367e-06, + "loss": 0.3023, + "step": 38477 + }, + { + "epoch": 2.1218936516549105, + "grad_norm": 5.167213439941406, + "learning_rate": 4.829793065643415e-06, + "loss": 0.2969, + "step": 38478 + }, + { + "epoch": 2.1219072164948454, + "grad_norm": 4.6816935539245605, + "learning_rate": 4.82965602302316e-06, + "loss": 0.238, + "step": 38479 + }, + { + "epoch": 2.1219207813347802, + "grad_norm": 6.035610198974609, + "learning_rate": 4.8295189804029055e-06, + "loss": 0.3307, + "step": 38480 + }, + { + "epoch": 2.121934346174715, + "grad_norm": 5.1686296463012695, + "learning_rate": 4.829381937782651e-06, + "loss": 0.203, + "step": 38481 + }, + { + "epoch": 2.12194791101465, + "grad_norm": 5.390745162963867, + "learning_rate": 4.829244895162396e-06, + "loss": 0.2062, + "step": 38482 + }, + { + "epoch": 2.121961475854585, + "grad_norm": 5.493684768676758, + "learning_rate": 4.829107852542141e-06, + "loss": 0.2341, + "step": 38483 + }, + { + "epoch": 2.1219750406945197, + "grad_norm": 6.283891677856445, + "learning_rate": 4.828970809921886e-06, + "loss": 0.3427, + "step": 38484 + }, + { + "epoch": 2.1219886055344546, + "grad_norm": 6.759603023529053, + "learning_rate": 4.8288337673016314e-06, + "loss": 0.2542, + "step": 38485 + }, + { + "epoch": 2.1220021703743894, + "grad_norm": 4.919315814971924, + "learning_rate": 4.828696724681377e-06, + "loss": 0.3428, + "step": 38486 + }, + { + "epoch": 2.1220157352143243, + "grad_norm": 3.4882781505584717, + "learning_rate": 4.828559682061121e-06, + "loss": 0.1883, + "step": 38487 + }, + { + "epoch": 2.122029300054259, + "grad_norm": 4.111656188964844, + "learning_rate": 4.828422639440867e-06, + "loss": 0.1684, + "step": 38488 + }, + { + "epoch": 2.122042864894194, + "grad_norm": 5.823415756225586, + "learning_rate": 4.828285596820611e-06, + "loss": 0.2615, + "step": 38489 + }, + { + "epoch": 2.1220564297341293, + "grad_norm": 3.4977686405181885, + "learning_rate": 4.8281485542003565e-06, + "loss": 0.1828, + "step": 38490 + }, + { + "epoch": 2.122069994574064, + "grad_norm": 4.279115200042725, + "learning_rate": 4.828011511580102e-06, + "loss": 0.2582, + "step": 38491 + }, + { + "epoch": 2.122083559413999, + "grad_norm": 4.517462253570557, + "learning_rate": 4.827874468959847e-06, + "loss": 0.2396, + "step": 38492 + }, + { + "epoch": 2.122097124253934, + "grad_norm": 4.3696675300598145, + "learning_rate": 4.827737426339592e-06, + "loss": 0.1878, + "step": 38493 + }, + { + "epoch": 2.122110689093869, + "grad_norm": 3.7020111083984375, + "learning_rate": 4.827600383719337e-06, + "loss": 0.1696, + "step": 38494 + }, + { + "epoch": 2.1221242539338037, + "grad_norm": 4.081884860992432, + "learning_rate": 4.827463341099082e-06, + "loss": 0.1328, + "step": 38495 + }, + { + "epoch": 2.1221378187737385, + "grad_norm": 4.339351654052734, + "learning_rate": 4.8273262984788275e-06, + "loss": 0.2254, + "step": 38496 + }, + { + "epoch": 2.1221513836136734, + "grad_norm": 4.357648849487305, + "learning_rate": 4.827189255858573e-06, + "loss": 0.2381, + "step": 38497 + }, + { + "epoch": 2.1221649484536083, + "grad_norm": 4.475072860717773, + "learning_rate": 4.827052213238317e-06, + "loss": 0.1998, + "step": 38498 + }, + { + "epoch": 2.122178513293543, + "grad_norm": 4.0075531005859375, + "learning_rate": 4.826915170618063e-06, + "loss": 0.1508, + "step": 38499 + }, + { + "epoch": 2.122192078133478, + "grad_norm": 6.355903148651123, + "learning_rate": 4.826778127997807e-06, + "loss": 0.2336, + "step": 38500 + }, + { + "epoch": 2.122205642973413, + "grad_norm": 4.840726852416992, + "learning_rate": 4.8266410853775534e-06, + "loss": 0.208, + "step": 38501 + }, + { + "epoch": 2.1222192078133477, + "grad_norm": 5.876436233520508, + "learning_rate": 4.826504042757298e-06, + "loss": 0.3947, + "step": 38502 + }, + { + "epoch": 2.1222327726532826, + "grad_norm": 6.304915428161621, + "learning_rate": 4.826367000137043e-06, + "loss": 0.2259, + "step": 38503 + }, + { + "epoch": 2.1222463374932174, + "grad_norm": 5.032739162445068, + "learning_rate": 4.826229957516788e-06, + "loss": 0.1418, + "step": 38504 + }, + { + "epoch": 2.1222599023331523, + "grad_norm": 5.97961950302124, + "learning_rate": 4.826092914896533e-06, + "loss": 0.2931, + "step": 38505 + }, + { + "epoch": 2.122273467173087, + "grad_norm": 4.179996490478516, + "learning_rate": 4.8259558722762785e-06, + "loss": 0.2815, + "step": 38506 + }, + { + "epoch": 2.122287032013022, + "grad_norm": 4.801240921020508, + "learning_rate": 4.825818829656023e-06, + "loss": 0.2635, + "step": 38507 + }, + { + "epoch": 2.1223005968529574, + "grad_norm": 3.667701005935669, + "learning_rate": 4.825681787035769e-06, + "loss": 0.1167, + "step": 38508 + }, + { + "epoch": 2.122314161692892, + "grad_norm": 5.101428985595703, + "learning_rate": 4.825544744415513e-06, + "loss": 0.276, + "step": 38509 + }, + { + "epoch": 2.122327726532827, + "grad_norm": 5.780508995056152, + "learning_rate": 4.825407701795259e-06, + "loss": 0.2997, + "step": 38510 + }, + { + "epoch": 2.122341291372762, + "grad_norm": 4.496563911437988, + "learning_rate": 4.8252706591750035e-06, + "loss": 0.1842, + "step": 38511 + }, + { + "epoch": 2.122354856212697, + "grad_norm": 4.531282901763916, + "learning_rate": 4.825133616554749e-06, + "loss": 0.1495, + "step": 38512 + }, + { + "epoch": 2.1223684210526317, + "grad_norm": 4.190202236175537, + "learning_rate": 4.824996573934494e-06, + "loss": 0.217, + "step": 38513 + }, + { + "epoch": 2.1223819858925665, + "grad_norm": 4.08519983291626, + "learning_rate": 4.824859531314239e-06, + "loss": 0.1281, + "step": 38514 + }, + { + "epoch": 2.1223955507325014, + "grad_norm": 5.147055149078369, + "learning_rate": 4.824722488693984e-06, + "loss": 0.2706, + "step": 38515 + }, + { + "epoch": 2.1224091155724363, + "grad_norm": 7.108630180358887, + "learning_rate": 4.8245854460737294e-06, + "loss": 0.2522, + "step": 38516 + }, + { + "epoch": 2.122422680412371, + "grad_norm": 4.681534290313721, + "learning_rate": 4.824448403453475e-06, + "loss": 0.1531, + "step": 38517 + }, + { + "epoch": 2.122436245252306, + "grad_norm": 4.213144302368164, + "learning_rate": 4.824311360833219e-06, + "loss": 0.161, + "step": 38518 + }, + { + "epoch": 2.122449810092241, + "grad_norm": 5.9690937995910645, + "learning_rate": 4.824174318212965e-06, + "loss": 0.2161, + "step": 38519 + }, + { + "epoch": 2.1224633749321757, + "grad_norm": 3.963229179382324, + "learning_rate": 4.824037275592709e-06, + "loss": 0.1305, + "step": 38520 + }, + { + "epoch": 2.1224769397721106, + "grad_norm": 6.611966609954834, + "learning_rate": 4.823900232972455e-06, + "loss": 0.2132, + "step": 38521 + }, + { + "epoch": 2.1224905046120455, + "grad_norm": 6.303510665893555, + "learning_rate": 4.8237631903522e-06, + "loss": 0.2208, + "step": 38522 + }, + { + "epoch": 2.1225040694519803, + "grad_norm": 5.379961967468262, + "learning_rate": 4.823626147731945e-06, + "loss": 0.2436, + "step": 38523 + }, + { + "epoch": 2.122517634291915, + "grad_norm": 4.238004684448242, + "learning_rate": 4.82348910511169e-06, + "loss": 0.169, + "step": 38524 + }, + { + "epoch": 2.12253119913185, + "grad_norm": 4.970907211303711, + "learning_rate": 4.823352062491435e-06, + "loss": 0.1532, + "step": 38525 + }, + { + "epoch": 2.122544763971785, + "grad_norm": 4.019947052001953, + "learning_rate": 4.82321501987118e-06, + "loss": 0.1597, + "step": 38526 + }, + { + "epoch": 2.12255832881172, + "grad_norm": 4.658174514770508, + "learning_rate": 4.8230779772509256e-06, + "loss": 0.2331, + "step": 38527 + }, + { + "epoch": 2.122571893651655, + "grad_norm": 4.583615303039551, + "learning_rate": 4.822940934630671e-06, + "loss": 0.2756, + "step": 38528 + }, + { + "epoch": 2.12258545849159, + "grad_norm": 6.081179141998291, + "learning_rate": 4.822803892010416e-06, + "loss": 0.2471, + "step": 38529 + }, + { + "epoch": 2.122599023331525, + "grad_norm": 5.639899253845215, + "learning_rate": 4.822666849390161e-06, + "loss": 0.2925, + "step": 38530 + }, + { + "epoch": 2.1226125881714597, + "grad_norm": 5.700550556182861, + "learning_rate": 4.822529806769905e-06, + "loss": 0.2076, + "step": 38531 + }, + { + "epoch": 2.1226261530113946, + "grad_norm": 3.119981050491333, + "learning_rate": 4.822392764149651e-06, + "loss": 0.0793, + "step": 38532 + }, + { + "epoch": 2.1226397178513294, + "grad_norm": 5.364095687866211, + "learning_rate": 4.822255721529396e-06, + "loss": 0.1646, + "step": 38533 + }, + { + "epoch": 2.1226532826912643, + "grad_norm": 5.886381149291992, + "learning_rate": 4.822118678909141e-06, + "loss": 0.2019, + "step": 38534 + }, + { + "epoch": 2.122666847531199, + "grad_norm": 4.933221817016602, + "learning_rate": 4.821981636288886e-06, + "loss": 0.2021, + "step": 38535 + }, + { + "epoch": 2.122680412371134, + "grad_norm": 5.252888202667236, + "learning_rate": 4.821844593668631e-06, + "loss": 0.1338, + "step": 38536 + }, + { + "epoch": 2.122693977211069, + "grad_norm": 3.5401439666748047, + "learning_rate": 4.8217075510483765e-06, + "loss": 0.1209, + "step": 38537 + }, + { + "epoch": 2.1227075420510038, + "grad_norm": 3.496612548828125, + "learning_rate": 4.821570508428122e-06, + "loss": 0.1275, + "step": 38538 + }, + { + "epoch": 2.1227211068909386, + "grad_norm": 4.774394989013672, + "learning_rate": 4.821433465807867e-06, + "loss": 0.1562, + "step": 38539 + }, + { + "epoch": 2.1227346717308735, + "grad_norm": 4.219128131866455, + "learning_rate": 4.821296423187612e-06, + "loss": 0.2259, + "step": 38540 + }, + { + "epoch": 2.1227482365708084, + "grad_norm": 6.092848300933838, + "learning_rate": 4.821159380567356e-06, + "loss": 0.2289, + "step": 38541 + }, + { + "epoch": 2.122761801410743, + "grad_norm": 5.868314743041992, + "learning_rate": 4.821022337947102e-06, + "loss": 0.1952, + "step": 38542 + }, + { + "epoch": 2.122775366250678, + "grad_norm": 4.886345863342285, + "learning_rate": 4.820885295326847e-06, + "loss": 0.1469, + "step": 38543 + }, + { + "epoch": 2.122788931090613, + "grad_norm": 3.8383262157440186, + "learning_rate": 4.820748252706593e-06, + "loss": 0.1555, + "step": 38544 + }, + { + "epoch": 2.122802495930548, + "grad_norm": 5.520276069641113, + "learning_rate": 4.820611210086337e-06, + "loss": 0.2743, + "step": 38545 + }, + { + "epoch": 2.122816060770483, + "grad_norm": 3.475229501724243, + "learning_rate": 4.820474167466082e-06, + "loss": 0.1309, + "step": 38546 + }, + { + "epoch": 2.122829625610418, + "grad_norm": 5.830836772918701, + "learning_rate": 4.8203371248458274e-06, + "loss": 0.1093, + "step": 38547 + }, + { + "epoch": 2.122843190450353, + "grad_norm": 4.917609214782715, + "learning_rate": 4.820200082225573e-06, + "loss": 0.2159, + "step": 38548 + }, + { + "epoch": 2.1228567552902877, + "grad_norm": 9.383581161499023, + "learning_rate": 4.820063039605318e-06, + "loss": 0.1392, + "step": 38549 + }, + { + "epoch": 2.1228703201302226, + "grad_norm": 4.80062198638916, + "learning_rate": 4.819925996985063e-06, + "loss": 0.1779, + "step": 38550 + }, + { + "epoch": 2.1228838849701575, + "grad_norm": 6.218019485473633, + "learning_rate": 4.819788954364808e-06, + "loss": 0.2233, + "step": 38551 + }, + { + "epoch": 2.1228974498100923, + "grad_norm": 4.609155178070068, + "learning_rate": 4.8196519117445525e-06, + "loss": 0.1478, + "step": 38552 + }, + { + "epoch": 2.122911014650027, + "grad_norm": 4.328676700592041, + "learning_rate": 4.8195148691242985e-06, + "loss": 0.1911, + "step": 38553 + }, + { + "epoch": 2.122924579489962, + "grad_norm": 5.188510894775391, + "learning_rate": 4.819377826504043e-06, + "loss": 0.2339, + "step": 38554 + }, + { + "epoch": 2.122938144329897, + "grad_norm": 5.82826042175293, + "learning_rate": 4.819240783883789e-06, + "loss": 0.2048, + "step": 38555 + }, + { + "epoch": 2.122951709169832, + "grad_norm": 5.733758449554443, + "learning_rate": 4.819103741263533e-06, + "loss": 0.1632, + "step": 38556 + }, + { + "epoch": 2.1229652740097666, + "grad_norm": 4.551557540893555, + "learning_rate": 4.818966698643278e-06, + "loss": 0.1614, + "step": 38557 + }, + { + "epoch": 2.1229788388497015, + "grad_norm": 4.453614711761475, + "learning_rate": 4.8188296560230236e-06, + "loss": 0.1948, + "step": 38558 + }, + { + "epoch": 2.1229924036896364, + "grad_norm": 6.92141580581665, + "learning_rate": 4.818692613402769e-06, + "loss": 0.2833, + "step": 38559 + }, + { + "epoch": 2.1230059685295712, + "grad_norm": 4.587485313415527, + "learning_rate": 4.818555570782514e-06, + "loss": 0.1381, + "step": 38560 + }, + { + "epoch": 2.123019533369506, + "grad_norm": 6.061525821685791, + "learning_rate": 4.818418528162258e-06, + "loss": 0.2259, + "step": 38561 + }, + { + "epoch": 2.123033098209441, + "grad_norm": 5.509962558746338, + "learning_rate": 4.818281485542004e-06, + "loss": 0.1378, + "step": 38562 + }, + { + "epoch": 2.123046663049376, + "grad_norm": 4.586260795593262, + "learning_rate": 4.818144442921749e-06, + "loss": 0.1686, + "step": 38563 + }, + { + "epoch": 2.1230602278893107, + "grad_norm": 3.9002580642700195, + "learning_rate": 4.818007400301495e-06, + "loss": 0.1595, + "step": 38564 + }, + { + "epoch": 2.1230737927292456, + "grad_norm": 4.664724349975586, + "learning_rate": 4.817870357681239e-06, + "loss": 0.1726, + "step": 38565 + }, + { + "epoch": 2.123087357569181, + "grad_norm": 5.054211616516113, + "learning_rate": 4.817733315060984e-06, + "loss": 0.2087, + "step": 38566 + }, + { + "epoch": 2.1231009224091157, + "grad_norm": 5.616432189941406, + "learning_rate": 4.817596272440729e-06, + "loss": 0.172, + "step": 38567 + }, + { + "epoch": 2.1231144872490506, + "grad_norm": 8.426955223083496, + "learning_rate": 4.8174592298204745e-06, + "loss": 0.2418, + "step": 38568 + }, + { + "epoch": 2.1231280520889855, + "grad_norm": 5.516958236694336, + "learning_rate": 4.81732218720022e-06, + "loss": 0.1339, + "step": 38569 + }, + { + "epoch": 2.1231416169289203, + "grad_norm": 6.349319934844971, + "learning_rate": 4.817185144579965e-06, + "loss": 0.2351, + "step": 38570 + }, + { + "epoch": 2.123155181768855, + "grad_norm": 6.037003993988037, + "learning_rate": 4.81704810195971e-06, + "loss": 0.215, + "step": 38571 + }, + { + "epoch": 2.12316874660879, + "grad_norm": 6.543456077575684, + "learning_rate": 4.816911059339454e-06, + "loss": 0.2011, + "step": 38572 + }, + { + "epoch": 2.123182311448725, + "grad_norm": 4.691971302032471, + "learning_rate": 4.8167740167192e-06, + "loss": 0.1607, + "step": 38573 + }, + { + "epoch": 2.12319587628866, + "grad_norm": 5.174056053161621, + "learning_rate": 4.816636974098945e-06, + "loss": 0.1735, + "step": 38574 + }, + { + "epoch": 2.1232094411285947, + "grad_norm": 5.098902225494385, + "learning_rate": 4.816499931478691e-06, + "loss": 0.2095, + "step": 38575 + }, + { + "epoch": 2.1232230059685295, + "grad_norm": 5.379402160644531, + "learning_rate": 4.816362888858435e-06, + "loss": 0.1905, + "step": 38576 + }, + { + "epoch": 2.1232365708084644, + "grad_norm": 8.436312675476074, + "learning_rate": 4.81622584623818e-06, + "loss": 0.3627, + "step": 38577 + }, + { + "epoch": 2.1232501356483993, + "grad_norm": 6.102072238922119, + "learning_rate": 4.8160888036179254e-06, + "loss": 0.2538, + "step": 38578 + }, + { + "epoch": 2.123263700488334, + "grad_norm": 4.600434303283691, + "learning_rate": 4.815951760997671e-06, + "loss": 0.1229, + "step": 38579 + }, + { + "epoch": 2.123277265328269, + "grad_norm": 6.559769630432129, + "learning_rate": 4.815814718377416e-06, + "loss": 0.2498, + "step": 38580 + }, + { + "epoch": 2.123290830168204, + "grad_norm": 5.317172050476074, + "learning_rate": 4.815677675757161e-06, + "loss": 0.1414, + "step": 38581 + }, + { + "epoch": 2.1233043950081387, + "grad_norm": 5.067509651184082, + "learning_rate": 4.815540633136906e-06, + "loss": 0.1929, + "step": 38582 + }, + { + "epoch": 2.1233179598480736, + "grad_norm": 4.910228729248047, + "learning_rate": 4.815403590516651e-06, + "loss": 0.1652, + "step": 38583 + }, + { + "epoch": 2.123331524688009, + "grad_norm": 5.65821647644043, + "learning_rate": 4.8152665478963965e-06, + "loss": 0.1952, + "step": 38584 + }, + { + "epoch": 2.1233450895279438, + "grad_norm": 5.341721534729004, + "learning_rate": 4.815129505276142e-06, + "loss": 0.1863, + "step": 38585 + }, + { + "epoch": 2.1233586543678786, + "grad_norm": 6.777492523193359, + "learning_rate": 4.814992462655886e-06, + "loss": 0.185, + "step": 38586 + }, + { + "epoch": 2.1233722192078135, + "grad_norm": 5.980885028839111, + "learning_rate": 4.814855420035631e-06, + "loss": 0.184, + "step": 38587 + }, + { + "epoch": 2.1233857840477484, + "grad_norm": 6.7814531326293945, + "learning_rate": 4.814718377415376e-06, + "loss": 0.1926, + "step": 38588 + }, + { + "epoch": 2.1233993488876832, + "grad_norm": 6.003931522369385, + "learning_rate": 4.8145813347951216e-06, + "loss": 0.17, + "step": 38589 + }, + { + "epoch": 2.123412913727618, + "grad_norm": 5.800844192504883, + "learning_rate": 4.814444292174867e-06, + "loss": 0.1972, + "step": 38590 + }, + { + "epoch": 2.123426478567553, + "grad_norm": 6.976144313812256, + "learning_rate": 4.814307249554612e-06, + "loss": 0.2625, + "step": 38591 + }, + { + "epoch": 2.123440043407488, + "grad_norm": 4.862860202789307, + "learning_rate": 4.814170206934357e-06, + "loss": 0.2018, + "step": 38592 + }, + { + "epoch": 2.1234536082474227, + "grad_norm": 6.740907192230225, + "learning_rate": 4.814033164314102e-06, + "loss": 0.2719, + "step": 38593 + }, + { + "epoch": 2.1234671730873576, + "grad_norm": 5.2718682289123535, + "learning_rate": 4.8138961216938475e-06, + "loss": 0.2026, + "step": 38594 + }, + { + "epoch": 2.1234807379272924, + "grad_norm": 5.262733459472656, + "learning_rate": 4.813759079073592e-06, + "loss": 0.1973, + "step": 38595 + }, + { + "epoch": 2.1234943027672273, + "grad_norm": 5.203236103057861, + "learning_rate": 4.813622036453338e-06, + "loss": 0.1728, + "step": 38596 + }, + { + "epoch": 2.123507867607162, + "grad_norm": 5.568318843841553, + "learning_rate": 4.813484993833082e-06, + "loss": 0.1335, + "step": 38597 + }, + { + "epoch": 2.123521432447097, + "grad_norm": 5.7043561935424805, + "learning_rate": 4.813347951212828e-06, + "loss": 0.179, + "step": 38598 + }, + { + "epoch": 2.123534997287032, + "grad_norm": 5.0820393562316895, + "learning_rate": 4.8132109085925725e-06, + "loss": 0.1665, + "step": 38599 + }, + { + "epoch": 2.1235485621269667, + "grad_norm": 6.689797878265381, + "learning_rate": 4.813073865972318e-06, + "loss": 0.2748, + "step": 38600 + }, + { + "epoch": 2.1235621269669016, + "grad_norm": 4.251958847045898, + "learning_rate": 4.812936823352063e-06, + "loss": 0.1838, + "step": 38601 + }, + { + "epoch": 2.1235756918068365, + "grad_norm": 5.472357749938965, + "learning_rate": 4.812799780731808e-06, + "loss": 0.1869, + "step": 38602 + }, + { + "epoch": 2.1235892566467713, + "grad_norm": 6.952911376953125, + "learning_rate": 4.812662738111553e-06, + "loss": 0.19, + "step": 38603 + }, + { + "epoch": 2.1236028214867066, + "grad_norm": 4.4832305908203125, + "learning_rate": 4.812525695491298e-06, + "loss": 0.1726, + "step": 38604 + }, + { + "epoch": 2.1236163863266415, + "grad_norm": 3.4418482780456543, + "learning_rate": 4.8123886528710436e-06, + "loss": 0.1054, + "step": 38605 + }, + { + "epoch": 2.1236299511665764, + "grad_norm": 5.772141933441162, + "learning_rate": 4.812251610250788e-06, + "loss": 0.2303, + "step": 38606 + }, + { + "epoch": 2.1236435160065112, + "grad_norm": 4.356330871582031, + "learning_rate": 4.812114567630534e-06, + "loss": 0.1791, + "step": 38607 + }, + { + "epoch": 2.123657080846446, + "grad_norm": 5.032657146453857, + "learning_rate": 4.811977525010278e-06, + "loss": 0.1166, + "step": 38608 + }, + { + "epoch": 2.123670645686381, + "grad_norm": 5.543009281158447, + "learning_rate": 4.811840482390024e-06, + "loss": 0.2067, + "step": 38609 + }, + { + "epoch": 2.123684210526316, + "grad_norm": 4.27304220199585, + "learning_rate": 4.811703439769769e-06, + "loss": 0.1516, + "step": 38610 + }, + { + "epoch": 2.1236977753662507, + "grad_norm": 6.2254815101623535, + "learning_rate": 4.811566397149514e-06, + "loss": 0.2215, + "step": 38611 + }, + { + "epoch": 2.1237113402061856, + "grad_norm": 3.3561758995056152, + "learning_rate": 4.811429354529259e-06, + "loss": 0.1404, + "step": 38612 + }, + { + "epoch": 2.1237249050461204, + "grad_norm": 4.447394847869873, + "learning_rate": 4.811292311909004e-06, + "loss": 0.1541, + "step": 38613 + }, + { + "epoch": 2.1237384698860553, + "grad_norm": 6.052786827087402, + "learning_rate": 4.811155269288749e-06, + "loss": 0.2673, + "step": 38614 + }, + { + "epoch": 2.12375203472599, + "grad_norm": 5.084473133087158, + "learning_rate": 4.811018226668494e-06, + "loss": 0.1668, + "step": 38615 + }, + { + "epoch": 2.123765599565925, + "grad_norm": 4.471351623535156, + "learning_rate": 4.81088118404824e-06, + "loss": 0.1903, + "step": 38616 + }, + { + "epoch": 2.12377916440586, + "grad_norm": 6.376526355743408, + "learning_rate": 4.810744141427984e-06, + "loss": 0.1537, + "step": 38617 + }, + { + "epoch": 2.1237927292457948, + "grad_norm": 3.875532627105713, + "learning_rate": 4.81060709880773e-06, + "loss": 0.1077, + "step": 38618 + }, + { + "epoch": 2.1238062940857296, + "grad_norm": 5.728725910186768, + "learning_rate": 4.810470056187474e-06, + "loss": 0.161, + "step": 38619 + }, + { + "epoch": 2.1238198589256645, + "grad_norm": 6.218297004699707, + "learning_rate": 4.8103330135672196e-06, + "loss": 0.1994, + "step": 38620 + }, + { + "epoch": 2.1238334237655994, + "grad_norm": 4.777394771575928, + "learning_rate": 4.810195970946965e-06, + "loss": 0.1389, + "step": 38621 + }, + { + "epoch": 2.1238469886055347, + "grad_norm": 5.559463977813721, + "learning_rate": 4.81005892832671e-06, + "loss": 0.1976, + "step": 38622 + }, + { + "epoch": 2.1238605534454695, + "grad_norm": 3.5856730937957764, + "learning_rate": 4.809921885706455e-06, + "loss": 0.0808, + "step": 38623 + }, + { + "epoch": 2.1238741182854044, + "grad_norm": 3.887362003326416, + "learning_rate": 4.8097848430862e-06, + "loss": 0.1061, + "step": 38624 + }, + { + "epoch": 2.1238876831253393, + "grad_norm": 5.380065441131592, + "learning_rate": 4.8096478004659455e-06, + "loss": 0.2257, + "step": 38625 + }, + { + "epoch": 2.123901247965274, + "grad_norm": 5.737995624542236, + "learning_rate": 4.809510757845691e-06, + "loss": 0.234, + "step": 38626 + }, + { + "epoch": 2.123914812805209, + "grad_norm": 5.6709771156311035, + "learning_rate": 4.809373715225436e-06, + "loss": 0.196, + "step": 38627 + }, + { + "epoch": 2.123928377645144, + "grad_norm": 4.408980369567871, + "learning_rate": 4.80923667260518e-06, + "loss": 0.1214, + "step": 38628 + }, + { + "epoch": 2.1239419424850787, + "grad_norm": 6.323074817657471, + "learning_rate": 4.809099629984925e-06, + "loss": 0.2268, + "step": 38629 + }, + { + "epoch": 2.1239555073250136, + "grad_norm": 7.983553886413574, + "learning_rate": 4.8089625873646705e-06, + "loss": 0.2362, + "step": 38630 + }, + { + "epoch": 2.1239690721649485, + "grad_norm": 5.065009117126465, + "learning_rate": 4.808825544744416e-06, + "loss": 0.1659, + "step": 38631 + }, + { + "epoch": 2.1239826370048833, + "grad_norm": 5.006319522857666, + "learning_rate": 4.808688502124161e-06, + "loss": 0.1657, + "step": 38632 + }, + { + "epoch": 2.123996201844818, + "grad_norm": 3.0220465660095215, + "learning_rate": 4.808551459503906e-06, + "loss": 0.1353, + "step": 38633 + }, + { + "epoch": 2.124009766684753, + "grad_norm": 5.722053527832031, + "learning_rate": 4.808414416883651e-06, + "loss": 0.1914, + "step": 38634 + }, + { + "epoch": 2.124023331524688, + "grad_norm": 5.2380194664001465, + "learning_rate": 4.808277374263396e-06, + "loss": 0.1965, + "step": 38635 + }, + { + "epoch": 2.124036896364623, + "grad_norm": 3.705317974090576, + "learning_rate": 4.808140331643142e-06, + "loss": 0.1223, + "step": 38636 + }, + { + "epoch": 2.1240504612045576, + "grad_norm": 6.407452583312988, + "learning_rate": 4.808003289022887e-06, + "loss": 0.1619, + "step": 38637 + }, + { + "epoch": 2.1240640260444925, + "grad_norm": 5.933191776275635, + "learning_rate": 4.807866246402632e-06, + "loss": 0.1975, + "step": 38638 + }, + { + "epoch": 2.1240775908844274, + "grad_norm": 6.452818870544434, + "learning_rate": 4.807729203782377e-06, + "loss": 0.2396, + "step": 38639 + }, + { + "epoch": 2.1240911557243622, + "grad_norm": 9.037408828735352, + "learning_rate": 4.8075921611621214e-06, + "loss": 0.3141, + "step": 38640 + }, + { + "epoch": 2.1241047205642976, + "grad_norm": 5.5458807945251465, + "learning_rate": 4.807455118541867e-06, + "loss": 0.2179, + "step": 38641 + }, + { + "epoch": 2.1241182854042324, + "grad_norm": 4.289215087890625, + "learning_rate": 4.807318075921612e-06, + "loss": 0.1596, + "step": 38642 + }, + { + "epoch": 2.1241318502441673, + "grad_norm": 6.858643054962158, + "learning_rate": 4.807181033301357e-06, + "loss": 0.2774, + "step": 38643 + }, + { + "epoch": 2.124145415084102, + "grad_norm": 4.438443183898926, + "learning_rate": 4.807043990681102e-06, + "loss": 0.1568, + "step": 38644 + }, + { + "epoch": 2.124158979924037, + "grad_norm": 5.6823577880859375, + "learning_rate": 4.806906948060847e-06, + "loss": 0.1721, + "step": 38645 + }, + { + "epoch": 2.124172544763972, + "grad_norm": 5.430360317230225, + "learning_rate": 4.8067699054405925e-06, + "loss": 0.1364, + "step": 38646 + }, + { + "epoch": 2.1241861096039067, + "grad_norm": 4.586727142333984, + "learning_rate": 4.806632862820338e-06, + "loss": 0.1659, + "step": 38647 + }, + { + "epoch": 2.1241996744438416, + "grad_norm": 4.662207126617432, + "learning_rate": 4.806495820200083e-06, + "loss": 0.1403, + "step": 38648 + }, + { + "epoch": 2.1242132392837765, + "grad_norm": 7.0973968505859375, + "learning_rate": 4.806358777579827e-06, + "loss": 0.2094, + "step": 38649 + }, + { + "epoch": 2.1242268041237113, + "grad_norm": 6.949914932250977, + "learning_rate": 4.806221734959573e-06, + "loss": 0.2109, + "step": 38650 + }, + { + "epoch": 2.124240368963646, + "grad_norm": 4.520650386810303, + "learning_rate": 4.8060846923393176e-06, + "loss": 0.1616, + "step": 38651 + }, + { + "epoch": 2.124253933803581, + "grad_norm": 4.651691913604736, + "learning_rate": 4.805947649719064e-06, + "loss": 0.1398, + "step": 38652 + }, + { + "epoch": 2.124267498643516, + "grad_norm": 6.113833427429199, + "learning_rate": 4.805810607098808e-06, + "loss": 0.1791, + "step": 38653 + }, + { + "epoch": 2.124281063483451, + "grad_norm": 5.719991683959961, + "learning_rate": 4.805673564478553e-06, + "loss": 0.2411, + "step": 38654 + }, + { + "epoch": 2.1242946283233857, + "grad_norm": 5.919278621673584, + "learning_rate": 4.805536521858298e-06, + "loss": 0.1862, + "step": 38655 + }, + { + "epoch": 2.1243081931633205, + "grad_norm": 7.177728652954102, + "learning_rate": 4.8053994792380435e-06, + "loss": 0.2677, + "step": 38656 + }, + { + "epoch": 2.1243217580032554, + "grad_norm": 4.838059425354004, + "learning_rate": 4.805262436617789e-06, + "loss": 0.117, + "step": 38657 + }, + { + "epoch": 2.1243353228431903, + "grad_norm": 6.578509330749512, + "learning_rate": 4.805125393997534e-06, + "loss": 0.222, + "step": 38658 + }, + { + "epoch": 2.124348887683125, + "grad_norm": 4.671193599700928, + "learning_rate": 4.804988351377279e-06, + "loss": 0.1886, + "step": 38659 + }, + { + "epoch": 2.1243624525230604, + "grad_norm": 6.446725845336914, + "learning_rate": 4.804851308757023e-06, + "loss": 0.2356, + "step": 38660 + }, + { + "epoch": 2.1243760173629953, + "grad_norm": 5.117269515991211, + "learning_rate": 4.804714266136769e-06, + "loss": 0.2697, + "step": 38661 + }, + { + "epoch": 2.12438958220293, + "grad_norm": 5.935332298278809, + "learning_rate": 4.804577223516514e-06, + "loss": 0.2748, + "step": 38662 + }, + { + "epoch": 2.124403147042865, + "grad_norm": 5.632731914520264, + "learning_rate": 4.80444018089626e-06, + "loss": 0.1699, + "step": 38663 + }, + { + "epoch": 2.1244167118828, + "grad_norm": 7.445952415466309, + "learning_rate": 4.804303138276004e-06, + "loss": 0.271, + "step": 38664 + }, + { + "epoch": 2.1244302767227348, + "grad_norm": 5.7694010734558105, + "learning_rate": 4.804166095655749e-06, + "loss": 0.2125, + "step": 38665 + }, + { + "epoch": 2.1244438415626696, + "grad_norm": 5.530646800994873, + "learning_rate": 4.804029053035494e-06, + "loss": 0.223, + "step": 38666 + }, + { + "epoch": 2.1244574064026045, + "grad_norm": 4.534673690795898, + "learning_rate": 4.80389201041524e-06, + "loss": 0.1734, + "step": 38667 + }, + { + "epoch": 2.1244709712425394, + "grad_norm": 4.243316650390625, + "learning_rate": 4.803754967794985e-06, + "loss": 0.1896, + "step": 38668 + }, + { + "epoch": 2.1244845360824742, + "grad_norm": 5.50630521774292, + "learning_rate": 4.803617925174729e-06, + "loss": 0.2118, + "step": 38669 + }, + { + "epoch": 2.124498100922409, + "grad_norm": 5.051016330718994, + "learning_rate": 4.803480882554475e-06, + "loss": 0.1901, + "step": 38670 + }, + { + "epoch": 2.124511665762344, + "grad_norm": 5.743578910827637, + "learning_rate": 4.8033438399342195e-06, + "loss": 0.2142, + "step": 38671 + }, + { + "epoch": 2.124525230602279, + "grad_norm": 4.756266117095947, + "learning_rate": 4.8032067973139655e-06, + "loss": 0.1659, + "step": 38672 + }, + { + "epoch": 2.1245387954422137, + "grad_norm": 5.997903347015381, + "learning_rate": 4.80306975469371e-06, + "loss": 0.2136, + "step": 38673 + }, + { + "epoch": 2.1245523602821486, + "grad_norm": 4.886926174163818, + "learning_rate": 4.802932712073455e-06, + "loss": 0.2367, + "step": 38674 + }, + { + "epoch": 2.1245659251220834, + "grad_norm": 4.384330749511719, + "learning_rate": 4.8027956694532e-06, + "loss": 0.1853, + "step": 38675 + }, + { + "epoch": 2.1245794899620183, + "grad_norm": 5.196650505065918, + "learning_rate": 4.802658626832945e-06, + "loss": 0.2114, + "step": 38676 + }, + { + "epoch": 2.124593054801953, + "grad_norm": 4.422690391540527, + "learning_rate": 4.8025215842126905e-06, + "loss": 0.189, + "step": 38677 + }, + { + "epoch": 2.124606619641888, + "grad_norm": 4.5844550132751465, + "learning_rate": 4.802384541592436e-06, + "loss": 0.1513, + "step": 38678 + }, + { + "epoch": 2.1246201844818233, + "grad_norm": 4.431817054748535, + "learning_rate": 4.802247498972181e-06, + "loss": 0.1363, + "step": 38679 + }, + { + "epoch": 2.124633749321758, + "grad_norm": 4.177393436431885, + "learning_rate": 4.802110456351926e-06, + "loss": 0.1486, + "step": 38680 + }, + { + "epoch": 2.124647314161693, + "grad_norm": 5.780587673187256, + "learning_rate": 4.801973413731671e-06, + "loss": 0.2191, + "step": 38681 + }, + { + "epoch": 2.124660879001628, + "grad_norm": 4.381385326385498, + "learning_rate": 4.801836371111416e-06, + "loss": 0.1245, + "step": 38682 + }, + { + "epoch": 2.124674443841563, + "grad_norm": 5.422560214996338, + "learning_rate": 4.801699328491161e-06, + "loss": 0.1244, + "step": 38683 + }, + { + "epoch": 2.1246880086814977, + "grad_norm": 5.353389263153076, + "learning_rate": 4.801562285870906e-06, + "loss": 0.2311, + "step": 38684 + }, + { + "epoch": 2.1247015735214325, + "grad_norm": 4.776432991027832, + "learning_rate": 4.801425243250651e-06, + "loss": 0.1841, + "step": 38685 + }, + { + "epoch": 2.1247151383613674, + "grad_norm": 4.662440299987793, + "learning_rate": 4.801288200630396e-06, + "loss": 0.1141, + "step": 38686 + }, + { + "epoch": 2.1247287032013022, + "grad_norm": 4.84208345413208, + "learning_rate": 4.8011511580101415e-06, + "loss": 0.2204, + "step": 38687 + }, + { + "epoch": 2.124742268041237, + "grad_norm": 3.782383680343628, + "learning_rate": 4.801014115389887e-06, + "loss": 0.168, + "step": 38688 + }, + { + "epoch": 2.124755832881172, + "grad_norm": 3.1923859119415283, + "learning_rate": 4.800877072769632e-06, + "loss": 0.078, + "step": 38689 + }, + { + "epoch": 2.124769397721107, + "grad_norm": 3.939589262008667, + "learning_rate": 4.800740030149377e-06, + "loss": 0.0995, + "step": 38690 + }, + { + "epoch": 2.1247829625610417, + "grad_norm": 4.484866619110107, + "learning_rate": 4.800602987529122e-06, + "loss": 0.2163, + "step": 38691 + }, + { + "epoch": 2.1247965274009766, + "grad_norm": 5.243659973144531, + "learning_rate": 4.800465944908867e-06, + "loss": 0.1974, + "step": 38692 + }, + { + "epoch": 2.1248100922409114, + "grad_norm": 6.389243125915527, + "learning_rate": 4.8003289022886125e-06, + "loss": 0.1702, + "step": 38693 + }, + { + "epoch": 2.1248236570808463, + "grad_norm": 3.8368301391601562, + "learning_rate": 4.800191859668357e-06, + "loss": 0.2122, + "step": 38694 + }, + { + "epoch": 2.124837221920781, + "grad_norm": 3.5476417541503906, + "learning_rate": 4.800054817048103e-06, + "loss": 0.1263, + "step": 38695 + }, + { + "epoch": 2.124850786760716, + "grad_norm": 4.644011497497559, + "learning_rate": 4.799917774427847e-06, + "loss": 0.1798, + "step": 38696 + }, + { + "epoch": 2.124864351600651, + "grad_norm": 4.742171764373779, + "learning_rate": 4.799780731807592e-06, + "loss": 0.1625, + "step": 38697 + }, + { + "epoch": 2.124877916440586, + "grad_norm": 4.824566841125488, + "learning_rate": 4.799643689187338e-06, + "loss": 0.2195, + "step": 38698 + }, + { + "epoch": 2.124891481280521, + "grad_norm": 5.016665935516357, + "learning_rate": 4.799506646567083e-06, + "loss": 0.1736, + "step": 38699 + }, + { + "epoch": 2.124905046120456, + "grad_norm": 3.0699989795684814, + "learning_rate": 4.799369603946828e-06, + "loss": 0.1243, + "step": 38700 + }, + { + "epoch": 2.124918610960391, + "grad_norm": 3.6505937576293945, + "learning_rate": 4.799232561326573e-06, + "loss": 0.133, + "step": 38701 + }, + { + "epoch": 2.1249321758003257, + "grad_norm": 4.388928413391113, + "learning_rate": 4.799095518706318e-06, + "loss": 0.2591, + "step": 38702 + }, + { + "epoch": 2.1249457406402605, + "grad_norm": 5.285510063171387, + "learning_rate": 4.798958476086063e-06, + "loss": 0.1823, + "step": 38703 + }, + { + "epoch": 2.1249593054801954, + "grad_norm": 3.437591791152954, + "learning_rate": 4.798821433465809e-06, + "loss": 0.1748, + "step": 38704 + }, + { + "epoch": 2.1249728703201303, + "grad_norm": 4.2798662185668945, + "learning_rate": 4.798684390845553e-06, + "loss": 0.2806, + "step": 38705 + }, + { + "epoch": 2.124986435160065, + "grad_norm": 3.232501745223999, + "learning_rate": 4.798547348225299e-06, + "loss": 0.1283, + "step": 38706 + }, + { + "epoch": 2.125, + "grad_norm": 3.663666248321533, + "learning_rate": 4.798410305605043e-06, + "loss": 0.1546, + "step": 38707 + }, + { + "epoch": 2.125013564839935, + "grad_norm": 6.217221736907959, + "learning_rate": 4.7982732629847885e-06, + "loss": 0.316, + "step": 38708 + }, + { + "epoch": 2.1250271296798697, + "grad_norm": 3.770510673522949, + "learning_rate": 4.798136220364534e-06, + "loss": 0.2363, + "step": 38709 + }, + { + "epoch": 2.1250406945198046, + "grad_norm": 4.349020957946777, + "learning_rate": 4.797999177744279e-06, + "loss": 0.2003, + "step": 38710 + }, + { + "epoch": 2.1250542593597395, + "grad_norm": 4.513748645782471, + "learning_rate": 4.797862135124024e-06, + "loss": 0.2985, + "step": 38711 + }, + { + "epoch": 2.1250678241996743, + "grad_norm": 5.954442024230957, + "learning_rate": 4.797725092503769e-06, + "loss": 0.2258, + "step": 38712 + }, + { + "epoch": 2.125081389039609, + "grad_norm": 5.637747287750244, + "learning_rate": 4.797588049883514e-06, + "loss": 0.3032, + "step": 38713 + }, + { + "epoch": 2.125094953879544, + "grad_norm": 3.876518964767456, + "learning_rate": 4.797451007263259e-06, + "loss": 0.1447, + "step": 38714 + }, + { + "epoch": 2.125108518719479, + "grad_norm": 4.7333784103393555, + "learning_rate": 4.797313964643005e-06, + "loss": 0.1435, + "step": 38715 + }, + { + "epoch": 2.125122083559414, + "grad_norm": 4.063120365142822, + "learning_rate": 4.797176922022749e-06, + "loss": 0.1661, + "step": 38716 + }, + { + "epoch": 2.1251356483993487, + "grad_norm": 3.61450457572937, + "learning_rate": 4.797039879402494e-06, + "loss": 0.1368, + "step": 38717 + }, + { + "epoch": 2.125149213239284, + "grad_norm": 3.741032123565674, + "learning_rate": 4.7969028367822395e-06, + "loss": 0.199, + "step": 38718 + }, + { + "epoch": 2.125162778079219, + "grad_norm": 4.283563613891602, + "learning_rate": 4.796765794161985e-06, + "loss": 0.1835, + "step": 38719 + }, + { + "epoch": 2.1251763429191537, + "grad_norm": 4.693050384521484, + "learning_rate": 4.79662875154173e-06, + "loss": 0.2653, + "step": 38720 + }, + { + "epoch": 2.1251899077590886, + "grad_norm": 4.251765727996826, + "learning_rate": 4.796491708921475e-06, + "loss": 0.2169, + "step": 38721 + }, + { + "epoch": 2.1252034725990234, + "grad_norm": 5.058056354522705, + "learning_rate": 4.79635466630122e-06, + "loss": 0.2719, + "step": 38722 + }, + { + "epoch": 2.1252170374389583, + "grad_norm": 3.4901821613311768, + "learning_rate": 4.796217623680965e-06, + "loss": 0.1162, + "step": 38723 + }, + { + "epoch": 2.125230602278893, + "grad_norm": 7.440179824829102, + "learning_rate": 4.7960805810607105e-06, + "loss": 0.1815, + "step": 38724 + }, + { + "epoch": 2.125244167118828, + "grad_norm": 4.340233325958252, + "learning_rate": 4.795943538440455e-06, + "loss": 0.1674, + "step": 38725 + }, + { + "epoch": 2.125257731958763, + "grad_norm": 5.080837249755859, + "learning_rate": 4.795806495820201e-06, + "loss": 0.2345, + "step": 38726 + }, + { + "epoch": 2.1252712967986978, + "grad_norm": 5.978022575378418, + "learning_rate": 4.795669453199945e-06, + "loss": 0.1887, + "step": 38727 + }, + { + "epoch": 2.1252848616386326, + "grad_norm": 4.965583324432373, + "learning_rate": 4.79553241057969e-06, + "loss": 0.1886, + "step": 38728 + }, + { + "epoch": 2.1252984264785675, + "grad_norm": 7.279821872711182, + "learning_rate": 4.795395367959436e-06, + "loss": 0.3729, + "step": 38729 + }, + { + "epoch": 2.1253119913185023, + "grad_norm": 6.517977714538574, + "learning_rate": 4.795258325339181e-06, + "loss": 0.2436, + "step": 38730 + }, + { + "epoch": 2.125325556158437, + "grad_norm": 4.504541873931885, + "learning_rate": 4.795121282718926e-06, + "loss": 0.1498, + "step": 38731 + }, + { + "epoch": 2.125339120998372, + "grad_norm": 3.183732509613037, + "learning_rate": 4.794984240098671e-06, + "loss": 0.0971, + "step": 38732 + }, + { + "epoch": 2.125352685838307, + "grad_norm": 3.2156310081481934, + "learning_rate": 4.794847197478416e-06, + "loss": 0.1444, + "step": 38733 + }, + { + "epoch": 2.125366250678242, + "grad_norm": 4.84113073348999, + "learning_rate": 4.7947101548581615e-06, + "loss": 0.1156, + "step": 38734 + }, + { + "epoch": 2.125379815518177, + "grad_norm": 3.6373775005340576, + "learning_rate": 4.794573112237907e-06, + "loss": 0.1509, + "step": 38735 + }, + { + "epoch": 2.125393380358112, + "grad_norm": 7.570252418518066, + "learning_rate": 4.794436069617652e-06, + "loss": 0.3937, + "step": 38736 + }, + { + "epoch": 2.125406945198047, + "grad_norm": 5.226118564605713, + "learning_rate": 4.794299026997396e-06, + "loss": 0.203, + "step": 38737 + }, + { + "epoch": 2.1254205100379817, + "grad_norm": 3.4776113033294678, + "learning_rate": 4.794161984377141e-06, + "loss": 0.1654, + "step": 38738 + }, + { + "epoch": 2.1254340748779166, + "grad_norm": 3.9699907302856445, + "learning_rate": 4.7940249417568865e-06, + "loss": 0.2154, + "step": 38739 + }, + { + "epoch": 2.1254476397178514, + "grad_norm": 5.2591776847839355, + "learning_rate": 4.793887899136632e-06, + "loss": 0.165, + "step": 38740 + }, + { + "epoch": 2.1254612045577863, + "grad_norm": 5.266297817230225, + "learning_rate": 4.793750856516377e-06, + "loss": 0.1882, + "step": 38741 + }, + { + "epoch": 2.125474769397721, + "grad_norm": 3.2107441425323486, + "learning_rate": 4.793613813896122e-06, + "loss": 0.0824, + "step": 38742 + }, + { + "epoch": 2.125488334237656, + "grad_norm": 4.034885406494141, + "learning_rate": 4.793476771275867e-06, + "loss": 0.1817, + "step": 38743 + }, + { + "epoch": 2.125501899077591, + "grad_norm": 5.017650127410889, + "learning_rate": 4.7933397286556124e-06, + "loss": 0.1494, + "step": 38744 + }, + { + "epoch": 2.1255154639175258, + "grad_norm": 3.4729623794555664, + "learning_rate": 4.793202686035358e-06, + "loss": 0.1333, + "step": 38745 + }, + { + "epoch": 2.1255290287574606, + "grad_norm": 7.100619316101074, + "learning_rate": 4.793065643415103e-06, + "loss": 0.2536, + "step": 38746 + }, + { + "epoch": 2.1255425935973955, + "grad_norm": 3.879702091217041, + "learning_rate": 4.792928600794848e-06, + "loss": 0.1724, + "step": 38747 + }, + { + "epoch": 2.1255561584373304, + "grad_norm": 4.27683687210083, + "learning_rate": 4.792791558174592e-06, + "loss": 0.1604, + "step": 38748 + }, + { + "epoch": 2.1255697232772652, + "grad_norm": 5.05581521987915, + "learning_rate": 4.792654515554338e-06, + "loss": 0.2419, + "step": 38749 + }, + { + "epoch": 2.1255832881172, + "grad_norm": 4.539937496185303, + "learning_rate": 4.792517472934083e-06, + "loss": 0.1678, + "step": 38750 + }, + { + "epoch": 2.125596852957135, + "grad_norm": 4.539522171020508, + "learning_rate": 4.792380430313829e-06, + "loss": 0.1314, + "step": 38751 + }, + { + "epoch": 2.12561041779707, + "grad_norm": 6.434573650360107, + "learning_rate": 4.792243387693573e-06, + "loss": 0.1921, + "step": 38752 + }, + { + "epoch": 2.1256239826370047, + "grad_norm": 5.601485729217529, + "learning_rate": 4.792106345073318e-06, + "loss": 0.2373, + "step": 38753 + }, + { + "epoch": 2.1256375474769396, + "grad_norm": 4.115344524383545, + "learning_rate": 4.791969302453063e-06, + "loss": 0.1635, + "step": 38754 + }, + { + "epoch": 2.125651112316875, + "grad_norm": 3.611938714981079, + "learning_rate": 4.7918322598328085e-06, + "loss": 0.1479, + "step": 38755 + }, + { + "epoch": 2.1256646771568097, + "grad_norm": 4.490554332733154, + "learning_rate": 4.791695217212554e-06, + "loss": 0.1203, + "step": 38756 + }, + { + "epoch": 2.1256782419967446, + "grad_norm": 4.792787551879883, + "learning_rate": 4.791558174592298e-06, + "loss": 0.1674, + "step": 38757 + }, + { + "epoch": 2.1256918068366795, + "grad_norm": 4.431629657745361, + "learning_rate": 4.791421131972044e-06, + "loss": 0.1322, + "step": 38758 + }, + { + "epoch": 2.1257053716766143, + "grad_norm": 4.461884021759033, + "learning_rate": 4.791284089351788e-06, + "loss": 0.1866, + "step": 38759 + }, + { + "epoch": 2.125718936516549, + "grad_norm": 5.647424221038818, + "learning_rate": 4.7911470467315344e-06, + "loss": 0.1947, + "step": 38760 + }, + { + "epoch": 2.125732501356484, + "grad_norm": 3.833498954772949, + "learning_rate": 4.791010004111279e-06, + "loss": 0.1109, + "step": 38761 + }, + { + "epoch": 2.125746066196419, + "grad_norm": 5.093328475952148, + "learning_rate": 4.790872961491024e-06, + "loss": 0.2088, + "step": 38762 + }, + { + "epoch": 2.125759631036354, + "grad_norm": 4.628137588500977, + "learning_rate": 4.790735918870769e-06, + "loss": 0.2479, + "step": 38763 + }, + { + "epoch": 2.1257731958762887, + "grad_norm": 6.319946765899658, + "learning_rate": 4.790598876250514e-06, + "loss": 0.2455, + "step": 38764 + }, + { + "epoch": 2.1257867607162235, + "grad_norm": 5.9343953132629395, + "learning_rate": 4.7904618336302595e-06, + "loss": 0.1334, + "step": 38765 + }, + { + "epoch": 2.1258003255561584, + "grad_norm": 6.450669765472412, + "learning_rate": 4.790324791010005e-06, + "loss": 0.2319, + "step": 38766 + }, + { + "epoch": 2.1258138903960933, + "grad_norm": 4.982086658477783, + "learning_rate": 4.79018774838975e-06, + "loss": 0.1655, + "step": 38767 + }, + { + "epoch": 2.125827455236028, + "grad_norm": 6.836452960968018, + "learning_rate": 4.790050705769494e-06, + "loss": 0.1754, + "step": 38768 + }, + { + "epoch": 2.125841020075963, + "grad_norm": 4.78948974609375, + "learning_rate": 4.78991366314924e-06, + "loss": 0.1579, + "step": 38769 + }, + { + "epoch": 2.125854584915898, + "grad_norm": 9.386752128601074, + "learning_rate": 4.7897766205289845e-06, + "loss": 0.402, + "step": 38770 + }, + { + "epoch": 2.1258681497558327, + "grad_norm": 9.952093124389648, + "learning_rate": 4.78963957790873e-06, + "loss": 0.3265, + "step": 38771 + }, + { + "epoch": 2.1258817145957676, + "grad_norm": 4.900814533233643, + "learning_rate": 4.789502535288475e-06, + "loss": 0.2081, + "step": 38772 + }, + { + "epoch": 2.125895279435703, + "grad_norm": 5.3580451011657715, + "learning_rate": 4.78936549266822e-06, + "loss": 0.1787, + "step": 38773 + }, + { + "epoch": 2.1259088442756378, + "grad_norm": 5.501873970031738, + "learning_rate": 4.789228450047965e-06, + "loss": 0.2693, + "step": 38774 + }, + { + "epoch": 2.1259224091155726, + "grad_norm": 6.901230812072754, + "learning_rate": 4.7890914074277104e-06, + "loss": 0.2371, + "step": 38775 + }, + { + "epoch": 2.1259359739555075, + "grad_norm": 4.772690773010254, + "learning_rate": 4.788954364807456e-06, + "loss": 0.2423, + "step": 38776 + }, + { + "epoch": 2.1259495387954424, + "grad_norm": 5.170041561126709, + "learning_rate": 4.788817322187201e-06, + "loss": 0.1798, + "step": 38777 + }, + { + "epoch": 2.125963103635377, + "grad_norm": 3.632549285888672, + "learning_rate": 4.788680279566946e-06, + "loss": 0.1715, + "step": 38778 + }, + { + "epoch": 2.125976668475312, + "grad_norm": 5.851208209991455, + "learning_rate": 4.78854323694669e-06, + "loss": 0.2472, + "step": 38779 + }, + { + "epoch": 2.125990233315247, + "grad_norm": 4.322566032409668, + "learning_rate": 4.788406194326436e-06, + "loss": 0.178, + "step": 38780 + }, + { + "epoch": 2.126003798155182, + "grad_norm": 6.076322078704834, + "learning_rate": 4.788269151706181e-06, + "loss": 0.22, + "step": 38781 + }, + { + "epoch": 2.1260173629951167, + "grad_norm": 6.071642875671387, + "learning_rate": 4.788132109085926e-06, + "loss": 0.3284, + "step": 38782 + }, + { + "epoch": 2.1260309278350515, + "grad_norm": 6.523460388183594, + "learning_rate": 4.787995066465671e-06, + "loss": 0.2082, + "step": 38783 + }, + { + "epoch": 2.1260444926749864, + "grad_norm": 5.077988624572754, + "learning_rate": 4.787858023845416e-06, + "loss": 0.1922, + "step": 38784 + }, + { + "epoch": 2.1260580575149213, + "grad_norm": 4.684075355529785, + "learning_rate": 4.787720981225161e-06, + "loss": 0.189, + "step": 38785 + }, + { + "epoch": 2.126071622354856, + "grad_norm": 6.21740198135376, + "learning_rate": 4.7875839386049066e-06, + "loss": 0.2404, + "step": 38786 + }, + { + "epoch": 2.126085187194791, + "grad_norm": 6.684408187866211, + "learning_rate": 4.787446895984652e-06, + "loss": 0.2139, + "step": 38787 + }, + { + "epoch": 2.126098752034726, + "grad_norm": 3.889791965484619, + "learning_rate": 4.787309853364397e-06, + "loss": 0.1922, + "step": 38788 + }, + { + "epoch": 2.1261123168746607, + "grad_norm": 9.425581932067871, + "learning_rate": 4.787172810744142e-06, + "loss": 0.4523, + "step": 38789 + }, + { + "epoch": 2.1261258817145956, + "grad_norm": 5.727185249328613, + "learning_rate": 4.787035768123887e-06, + "loss": 0.1607, + "step": 38790 + }, + { + "epoch": 2.1261394465545305, + "grad_norm": 7.763619899749756, + "learning_rate": 4.786898725503632e-06, + "loss": 0.2336, + "step": 38791 + }, + { + "epoch": 2.1261530113944653, + "grad_norm": 6.543951988220215, + "learning_rate": 4.786761682883378e-06, + "loss": 0.255, + "step": 38792 + }, + { + "epoch": 2.1261665762344006, + "grad_norm": 4.620030403137207, + "learning_rate": 4.786624640263122e-06, + "loss": 0.1667, + "step": 38793 + }, + { + "epoch": 2.1261801410743355, + "grad_norm": 7.149778842926025, + "learning_rate": 4.786487597642867e-06, + "loss": 0.3679, + "step": 38794 + }, + { + "epoch": 2.1261937059142704, + "grad_norm": 5.371326923370361, + "learning_rate": 4.786350555022612e-06, + "loss": 0.2125, + "step": 38795 + }, + { + "epoch": 2.1262072707542052, + "grad_norm": 5.610365390777588, + "learning_rate": 4.7862135124023575e-06, + "loss": 0.1468, + "step": 38796 + }, + { + "epoch": 2.12622083559414, + "grad_norm": 5.298338890075684, + "learning_rate": 4.786076469782103e-06, + "loss": 0.2307, + "step": 38797 + }, + { + "epoch": 2.126234400434075, + "grad_norm": 5.218170642852783, + "learning_rate": 4.785939427161848e-06, + "loss": 0.1493, + "step": 38798 + }, + { + "epoch": 2.12624796527401, + "grad_norm": 5.425074577331543, + "learning_rate": 4.785802384541593e-06, + "loss": 0.1762, + "step": 38799 + }, + { + "epoch": 2.1262615301139447, + "grad_norm": 7.050001621246338, + "learning_rate": 4.785665341921338e-06, + "loss": 0.3375, + "step": 38800 + }, + { + "epoch": 2.1262750949538796, + "grad_norm": 5.514482021331787, + "learning_rate": 4.785528299301083e-06, + "loss": 0.2227, + "step": 38801 + }, + { + "epoch": 2.1262886597938144, + "grad_norm": 4.723064422607422, + "learning_rate": 4.785391256680828e-06, + "loss": 0.1482, + "step": 38802 + }, + { + "epoch": 2.1263022246337493, + "grad_norm": 4.085906982421875, + "learning_rate": 4.785254214060574e-06, + "loss": 0.1497, + "step": 38803 + }, + { + "epoch": 2.126315789473684, + "grad_norm": 4.624694347381592, + "learning_rate": 4.785117171440318e-06, + "loss": 0.1652, + "step": 38804 + }, + { + "epoch": 2.126329354313619, + "grad_norm": 4.535265922546387, + "learning_rate": 4.784980128820064e-06, + "loss": 0.1461, + "step": 38805 + }, + { + "epoch": 2.126342919153554, + "grad_norm": 4.813616752624512, + "learning_rate": 4.7848430861998084e-06, + "loss": 0.1726, + "step": 38806 + }, + { + "epoch": 2.1263564839934888, + "grad_norm": 4.946940898895264, + "learning_rate": 4.784706043579554e-06, + "loss": 0.1795, + "step": 38807 + }, + { + "epoch": 2.1263700488334236, + "grad_norm": 5.545337677001953, + "learning_rate": 4.784569000959299e-06, + "loss": 0.226, + "step": 38808 + }, + { + "epoch": 2.1263836136733585, + "grad_norm": 5.468016624450684, + "learning_rate": 4.784431958339044e-06, + "loss": 0.1929, + "step": 38809 + }, + { + "epoch": 2.1263971785132934, + "grad_norm": 7.495020389556885, + "learning_rate": 4.784294915718789e-06, + "loss": 0.1975, + "step": 38810 + }, + { + "epoch": 2.1264107433532287, + "grad_norm": 6.8090362548828125, + "learning_rate": 4.7841578730985335e-06, + "loss": 0.1554, + "step": 38811 + }, + { + "epoch": 2.1264243081931635, + "grad_norm": 5.364293098449707, + "learning_rate": 4.7840208304782795e-06, + "loss": 0.1954, + "step": 38812 + }, + { + "epoch": 2.1264378730330984, + "grad_norm": 4.032032489776611, + "learning_rate": 4.783883787858024e-06, + "loss": 0.1803, + "step": 38813 + }, + { + "epoch": 2.1264514378730333, + "grad_norm": 4.166001796722412, + "learning_rate": 4.78374674523777e-06, + "loss": 0.1987, + "step": 38814 + }, + { + "epoch": 2.126465002712968, + "grad_norm": 5.503312110900879, + "learning_rate": 4.783609702617514e-06, + "loss": 0.1632, + "step": 38815 + }, + { + "epoch": 2.126478567552903, + "grad_norm": 5.046719074249268, + "learning_rate": 4.783472659997259e-06, + "loss": 0.1643, + "step": 38816 + }, + { + "epoch": 2.126492132392838, + "grad_norm": 5.53665828704834, + "learning_rate": 4.7833356173770046e-06, + "loss": 0.2614, + "step": 38817 + }, + { + "epoch": 2.1265056972327727, + "grad_norm": 3.384753465652466, + "learning_rate": 4.78319857475675e-06, + "loss": 0.1364, + "step": 38818 + }, + { + "epoch": 2.1265192620727076, + "grad_norm": 5.48280143737793, + "learning_rate": 4.783061532136495e-06, + "loss": 0.1612, + "step": 38819 + }, + { + "epoch": 2.1265328269126424, + "grad_norm": 3.7747955322265625, + "learning_rate": 4.782924489516239e-06, + "loss": 0.1612, + "step": 38820 + }, + { + "epoch": 2.1265463917525773, + "grad_norm": 5.807760715484619, + "learning_rate": 4.782787446895985e-06, + "loss": 0.1644, + "step": 38821 + }, + { + "epoch": 2.126559956592512, + "grad_norm": 3.9438982009887695, + "learning_rate": 4.78265040427573e-06, + "loss": 0.1677, + "step": 38822 + }, + { + "epoch": 2.126573521432447, + "grad_norm": 3.9789764881134033, + "learning_rate": 4.782513361655476e-06, + "loss": 0.2097, + "step": 38823 + }, + { + "epoch": 2.126587086272382, + "grad_norm": 3.958176612854004, + "learning_rate": 4.78237631903522e-06, + "loss": 0.1734, + "step": 38824 + }, + { + "epoch": 2.1266006511123168, + "grad_norm": 3.9534406661987305, + "learning_rate": 4.782239276414965e-06, + "loss": 0.1596, + "step": 38825 + }, + { + "epoch": 2.1266142159522516, + "grad_norm": 4.89647912979126, + "learning_rate": 4.78210223379471e-06, + "loss": 0.2145, + "step": 38826 + }, + { + "epoch": 2.1266277807921865, + "grad_norm": 4.6651787757873535, + "learning_rate": 4.7819651911744555e-06, + "loss": 0.1184, + "step": 38827 + }, + { + "epoch": 2.1266413456321214, + "grad_norm": 4.402554512023926, + "learning_rate": 4.781828148554201e-06, + "loss": 0.135, + "step": 38828 + }, + { + "epoch": 2.1266549104720562, + "grad_norm": 3.273036241531372, + "learning_rate": 4.781691105933946e-06, + "loss": 0.1066, + "step": 38829 + }, + { + "epoch": 2.126668475311991, + "grad_norm": 4.784929275512695, + "learning_rate": 4.781554063313691e-06, + "loss": 0.1936, + "step": 38830 + }, + { + "epoch": 2.1266820401519264, + "grad_norm": 4.7530951499938965, + "learning_rate": 4.781417020693436e-06, + "loss": 0.2966, + "step": 38831 + }, + { + "epoch": 2.1266956049918613, + "grad_norm": 4.1606340408325195, + "learning_rate": 4.781279978073181e-06, + "loss": 0.1322, + "step": 38832 + }, + { + "epoch": 2.126709169831796, + "grad_norm": 4.897899150848389, + "learning_rate": 4.7811429354529266e-06, + "loss": 0.1414, + "step": 38833 + }, + { + "epoch": 2.126722734671731, + "grad_norm": 4.932185173034668, + "learning_rate": 4.781005892832672e-06, + "loss": 0.1556, + "step": 38834 + }, + { + "epoch": 2.126736299511666, + "grad_norm": 4.909510612487793, + "learning_rate": 4.780868850212416e-06, + "loss": 0.1296, + "step": 38835 + }, + { + "epoch": 2.1267498643516007, + "grad_norm": 4.350966930389404, + "learning_rate": 4.780731807592161e-06, + "loss": 0.1289, + "step": 38836 + }, + { + "epoch": 2.1267634291915356, + "grad_norm": 6.646024703979492, + "learning_rate": 4.7805947649719064e-06, + "loss": 0.2246, + "step": 38837 + }, + { + "epoch": 2.1267769940314705, + "grad_norm": 5.537571430206299, + "learning_rate": 4.780457722351652e-06, + "loss": 0.2247, + "step": 38838 + }, + { + "epoch": 2.1267905588714053, + "grad_norm": 3.968006134033203, + "learning_rate": 4.780320679731397e-06, + "loss": 0.1406, + "step": 38839 + }, + { + "epoch": 2.12680412371134, + "grad_norm": 4.2959794998168945, + "learning_rate": 4.780183637111142e-06, + "loss": 0.1574, + "step": 38840 + }, + { + "epoch": 2.126817688551275, + "grad_norm": 4.928970813751221, + "learning_rate": 4.780046594490887e-06, + "loss": 0.1549, + "step": 38841 + }, + { + "epoch": 2.12683125339121, + "grad_norm": 5.0666680335998535, + "learning_rate": 4.779909551870632e-06, + "loss": 0.2519, + "step": 38842 + }, + { + "epoch": 2.126844818231145, + "grad_norm": 4.187611103057861, + "learning_rate": 4.7797725092503775e-06, + "loss": 0.1477, + "step": 38843 + }, + { + "epoch": 2.1268583830710797, + "grad_norm": 5.007503032684326, + "learning_rate": 4.779635466630123e-06, + "loss": 0.2545, + "step": 38844 + }, + { + "epoch": 2.1268719479110145, + "grad_norm": 5.985466957092285, + "learning_rate": 4.779498424009867e-06, + "loss": 0.2695, + "step": 38845 + }, + { + "epoch": 2.1268855127509494, + "grad_norm": 5.233872413635254, + "learning_rate": 4.779361381389613e-06, + "loss": 0.2027, + "step": 38846 + }, + { + "epoch": 2.1268990775908843, + "grad_norm": 5.686382293701172, + "learning_rate": 4.779224338769357e-06, + "loss": 0.2399, + "step": 38847 + }, + { + "epoch": 2.126912642430819, + "grad_norm": 4.6742939949035645, + "learning_rate": 4.7790872961491026e-06, + "loss": 0.2052, + "step": 38848 + }, + { + "epoch": 2.1269262072707544, + "grad_norm": 4.685502052307129, + "learning_rate": 4.778950253528848e-06, + "loss": 0.1288, + "step": 38849 + }, + { + "epoch": 2.1269397721106893, + "grad_norm": 5.626537799835205, + "learning_rate": 4.778813210908593e-06, + "loss": 0.2028, + "step": 38850 + }, + { + "epoch": 2.126953336950624, + "grad_norm": 5.396045684814453, + "learning_rate": 4.778676168288338e-06, + "loss": 0.2237, + "step": 38851 + }, + { + "epoch": 2.126966901790559, + "grad_norm": 6.581001281738281, + "learning_rate": 4.778539125668083e-06, + "loss": 0.152, + "step": 38852 + }, + { + "epoch": 2.126980466630494, + "grad_norm": 6.356593608856201, + "learning_rate": 4.7784020830478285e-06, + "loss": 0.2662, + "step": 38853 + }, + { + "epoch": 2.1269940314704288, + "grad_norm": 5.472052574157715, + "learning_rate": 4.778265040427574e-06, + "loss": 0.2052, + "step": 38854 + }, + { + "epoch": 2.1270075963103636, + "grad_norm": 6.037502288818359, + "learning_rate": 4.778127997807319e-06, + "loss": 0.2333, + "step": 38855 + }, + { + "epoch": 2.1270211611502985, + "grad_norm": 4.743722438812256, + "learning_rate": 4.777990955187063e-06, + "loss": 0.144, + "step": 38856 + }, + { + "epoch": 2.1270347259902334, + "grad_norm": 6.955173492431641, + "learning_rate": 4.777853912566809e-06, + "loss": 0.2462, + "step": 38857 + }, + { + "epoch": 2.127048290830168, + "grad_norm": 4.472882270812988, + "learning_rate": 4.7777168699465535e-06, + "loss": 0.1743, + "step": 38858 + }, + { + "epoch": 2.127061855670103, + "grad_norm": 4.2273969650268555, + "learning_rate": 4.777579827326299e-06, + "loss": 0.1644, + "step": 38859 + }, + { + "epoch": 2.127075420510038, + "grad_norm": 6.579747200012207, + "learning_rate": 4.777442784706044e-06, + "loss": 0.1887, + "step": 38860 + }, + { + "epoch": 2.127088985349973, + "grad_norm": 4.789218902587891, + "learning_rate": 4.777305742085789e-06, + "loss": 0.1752, + "step": 38861 + }, + { + "epoch": 2.1271025501899077, + "grad_norm": 5.753910064697266, + "learning_rate": 4.777168699465534e-06, + "loss": 0.1633, + "step": 38862 + }, + { + "epoch": 2.1271161150298425, + "grad_norm": 6.329397201538086, + "learning_rate": 4.777031656845279e-06, + "loss": 0.1979, + "step": 38863 + }, + { + "epoch": 2.1271296798697774, + "grad_norm": 4.560678005218506, + "learning_rate": 4.7768946142250246e-06, + "loss": 0.1894, + "step": 38864 + }, + { + "epoch": 2.1271432447097123, + "grad_norm": 5.844564914703369, + "learning_rate": 4.776757571604769e-06, + "loss": 0.1818, + "step": 38865 + }, + { + "epoch": 2.127156809549647, + "grad_norm": 4.686642646789551, + "learning_rate": 4.776620528984515e-06, + "loss": 0.1694, + "step": 38866 + }, + { + "epoch": 2.127170374389582, + "grad_norm": 5.81831693649292, + "learning_rate": 4.776483486364259e-06, + "loss": 0.2258, + "step": 38867 + }, + { + "epoch": 2.127183939229517, + "grad_norm": 5.465958595275879, + "learning_rate": 4.776346443744005e-06, + "loss": 0.196, + "step": 38868 + }, + { + "epoch": 2.127197504069452, + "grad_norm": 6.3384480476379395, + "learning_rate": 4.77620940112375e-06, + "loss": 0.2172, + "step": 38869 + }, + { + "epoch": 2.127211068909387, + "grad_norm": 3.6968977451324463, + "learning_rate": 4.776072358503495e-06, + "loss": 0.1412, + "step": 38870 + }, + { + "epoch": 2.127224633749322, + "grad_norm": 4.352905750274658, + "learning_rate": 4.77593531588324e-06, + "loss": 0.2056, + "step": 38871 + }, + { + "epoch": 2.127238198589257, + "grad_norm": 5.55800199508667, + "learning_rate": 4.775798273262985e-06, + "loss": 0.233, + "step": 38872 + }, + { + "epoch": 2.1272517634291916, + "grad_norm": 6.44683837890625, + "learning_rate": 4.77566123064273e-06, + "loss": 0.3091, + "step": 38873 + }, + { + "epoch": 2.1272653282691265, + "grad_norm": 4.5399603843688965, + "learning_rate": 4.7755241880224755e-06, + "loss": 0.1542, + "step": 38874 + }, + { + "epoch": 2.1272788931090614, + "grad_norm": 4.834455490112305, + "learning_rate": 4.775387145402221e-06, + "loss": 0.202, + "step": 38875 + }, + { + "epoch": 2.1272924579489962, + "grad_norm": 4.34701681137085, + "learning_rate": 4.775250102781965e-06, + "loss": 0.0868, + "step": 38876 + }, + { + "epoch": 2.127306022788931, + "grad_norm": 5.1494550704956055, + "learning_rate": 4.775113060161711e-06, + "loss": 0.1545, + "step": 38877 + }, + { + "epoch": 2.127319587628866, + "grad_norm": 6.353351593017578, + "learning_rate": 4.774976017541455e-06, + "loss": 0.1989, + "step": 38878 + }, + { + "epoch": 2.127333152468801, + "grad_norm": 4.341064929962158, + "learning_rate": 4.7748389749212006e-06, + "loss": 0.1431, + "step": 38879 + }, + { + "epoch": 2.1273467173087357, + "grad_norm": 5.251977920532227, + "learning_rate": 4.774701932300946e-06, + "loss": 0.182, + "step": 38880 + }, + { + "epoch": 2.1273602821486706, + "grad_norm": 7.442902565002441, + "learning_rate": 4.774564889680691e-06, + "loss": 0.273, + "step": 38881 + }, + { + "epoch": 2.1273738469886054, + "grad_norm": 5.212769508361816, + "learning_rate": 4.774427847060436e-06, + "loss": 0.1063, + "step": 38882 + }, + { + "epoch": 2.1273874118285403, + "grad_norm": 4.940099239349365, + "learning_rate": 4.774290804440181e-06, + "loss": 0.2137, + "step": 38883 + }, + { + "epoch": 2.127400976668475, + "grad_norm": 3.627307176589966, + "learning_rate": 4.7741537618199265e-06, + "loss": 0.084, + "step": 38884 + }, + { + "epoch": 2.12741454150841, + "grad_norm": 4.818981647491455, + "learning_rate": 4.774016719199672e-06, + "loss": 0.2223, + "step": 38885 + }, + { + "epoch": 2.127428106348345, + "grad_norm": 3.7336645126342773, + "learning_rate": 4.773879676579417e-06, + "loss": 0.067, + "step": 38886 + }, + { + "epoch": 2.12744167118828, + "grad_norm": 5.008294582366943, + "learning_rate": 4.773742633959162e-06, + "loss": 0.155, + "step": 38887 + }, + { + "epoch": 2.127455236028215, + "grad_norm": 5.626481056213379, + "learning_rate": 4.773605591338907e-06, + "loss": 0.1749, + "step": 38888 + }, + { + "epoch": 2.12746880086815, + "grad_norm": 5.056950569152832, + "learning_rate": 4.7734685487186515e-06, + "loss": 0.1331, + "step": 38889 + }, + { + "epoch": 2.127482365708085, + "grad_norm": 4.864157676696777, + "learning_rate": 4.773331506098397e-06, + "loss": 0.1784, + "step": 38890 + }, + { + "epoch": 2.1274959305480197, + "grad_norm": 6.879688262939453, + "learning_rate": 4.773194463478142e-06, + "loss": 0.1848, + "step": 38891 + }, + { + "epoch": 2.1275094953879545, + "grad_norm": 5.20040225982666, + "learning_rate": 4.773057420857887e-06, + "loss": 0.2598, + "step": 38892 + }, + { + "epoch": 2.1275230602278894, + "grad_norm": 4.4905242919921875, + "learning_rate": 4.772920378237632e-06, + "loss": 0.1989, + "step": 38893 + }, + { + "epoch": 2.1275366250678243, + "grad_norm": 6.3958587646484375, + "learning_rate": 4.772783335617377e-06, + "loss": 0.2356, + "step": 38894 + }, + { + "epoch": 2.127550189907759, + "grad_norm": 6.753513336181641, + "learning_rate": 4.772646292997123e-06, + "loss": 0.3644, + "step": 38895 + }, + { + "epoch": 2.127563754747694, + "grad_norm": 5.422353744506836, + "learning_rate": 4.772509250376868e-06, + "loss": 0.1482, + "step": 38896 + }, + { + "epoch": 2.127577319587629, + "grad_norm": 7.189062595367432, + "learning_rate": 4.772372207756613e-06, + "loss": 0.255, + "step": 38897 + }, + { + "epoch": 2.1275908844275637, + "grad_norm": 5.980993270874023, + "learning_rate": 4.772235165136358e-06, + "loss": 0.2193, + "step": 38898 + }, + { + "epoch": 2.1276044492674986, + "grad_norm": 6.308477878570557, + "learning_rate": 4.7720981225161024e-06, + "loss": 0.282, + "step": 38899 + }, + { + "epoch": 2.1276180141074335, + "grad_norm": 4.880856513977051, + "learning_rate": 4.7719610798958485e-06, + "loss": 0.1273, + "step": 38900 + }, + { + "epoch": 2.1276315789473683, + "grad_norm": 5.858835697174072, + "learning_rate": 4.771824037275593e-06, + "loss": 0.3256, + "step": 38901 + }, + { + "epoch": 2.127645143787303, + "grad_norm": 5.352255821228027, + "learning_rate": 4.771686994655339e-06, + "loss": 0.2317, + "step": 38902 + }, + { + "epoch": 2.127658708627238, + "grad_norm": 4.387622356414795, + "learning_rate": 4.771549952035083e-06, + "loss": 0.1364, + "step": 38903 + }, + { + "epoch": 2.127672273467173, + "grad_norm": 4.579862117767334, + "learning_rate": 4.771412909414828e-06, + "loss": 0.2101, + "step": 38904 + }, + { + "epoch": 2.127685838307108, + "grad_norm": 5.313000202178955, + "learning_rate": 4.7712758667945735e-06, + "loss": 0.227, + "step": 38905 + }, + { + "epoch": 2.1276994031470426, + "grad_norm": 6.920672416687012, + "learning_rate": 4.771138824174319e-06, + "loss": 0.2671, + "step": 38906 + }, + { + "epoch": 2.127712967986978, + "grad_norm": 4.364471435546875, + "learning_rate": 4.771001781554064e-06, + "loss": 0.1253, + "step": 38907 + }, + { + "epoch": 2.127726532826913, + "grad_norm": 6.110966205596924, + "learning_rate": 4.770864738933809e-06, + "loss": 0.224, + "step": 38908 + }, + { + "epoch": 2.1277400976668477, + "grad_norm": 5.000998020172119, + "learning_rate": 4.770727696313554e-06, + "loss": 0.1714, + "step": 38909 + }, + { + "epoch": 2.1277536625067826, + "grad_norm": 3.4151225090026855, + "learning_rate": 4.7705906536932986e-06, + "loss": 0.1635, + "step": 38910 + }, + { + "epoch": 2.1277672273467174, + "grad_norm": 4.681960105895996, + "learning_rate": 4.770453611073045e-06, + "loss": 0.1847, + "step": 38911 + }, + { + "epoch": 2.1277807921866523, + "grad_norm": 4.929730415344238, + "learning_rate": 4.770316568452789e-06, + "loss": 0.1455, + "step": 38912 + }, + { + "epoch": 2.127794357026587, + "grad_norm": 4.908743381500244, + "learning_rate": 4.770179525832534e-06, + "loss": 0.245, + "step": 38913 + }, + { + "epoch": 2.127807921866522, + "grad_norm": 4.046169281005859, + "learning_rate": 4.770042483212279e-06, + "loss": 0.1587, + "step": 38914 + }, + { + "epoch": 2.127821486706457, + "grad_norm": 6.173666477203369, + "learning_rate": 4.7699054405920245e-06, + "loss": 0.3663, + "step": 38915 + }, + { + "epoch": 2.1278350515463917, + "grad_norm": 4.357619285583496, + "learning_rate": 4.76976839797177e-06, + "loss": 0.2013, + "step": 38916 + }, + { + "epoch": 2.1278486163863266, + "grad_norm": 7.638547897338867, + "learning_rate": 4.769631355351515e-06, + "loss": 0.221, + "step": 38917 + }, + { + "epoch": 2.1278621812262615, + "grad_norm": 4.326268196105957, + "learning_rate": 4.76949431273126e-06, + "loss": 0.1926, + "step": 38918 + }, + { + "epoch": 2.1278757460661963, + "grad_norm": 3.6474719047546387, + "learning_rate": 4.769357270111004e-06, + "loss": 0.1466, + "step": 38919 + }, + { + "epoch": 2.127889310906131, + "grad_norm": 8.756609916687012, + "learning_rate": 4.76922022749075e-06, + "loss": 0.31, + "step": 38920 + }, + { + "epoch": 2.127902875746066, + "grad_norm": 5.3469390869140625, + "learning_rate": 4.769083184870495e-06, + "loss": 0.3471, + "step": 38921 + }, + { + "epoch": 2.127916440586001, + "grad_norm": 4.428266525268555, + "learning_rate": 4.768946142250241e-06, + "loss": 0.1667, + "step": 38922 + }, + { + "epoch": 2.127930005425936, + "grad_norm": 5.266048431396484, + "learning_rate": 4.768809099629985e-06, + "loss": 0.2452, + "step": 38923 + }, + { + "epoch": 2.1279435702658707, + "grad_norm": 6.422361850738525, + "learning_rate": 4.76867205700973e-06, + "loss": 0.2168, + "step": 38924 + }, + { + "epoch": 2.127957135105806, + "grad_norm": 5.590585708618164, + "learning_rate": 4.768535014389475e-06, + "loss": 0.2346, + "step": 38925 + }, + { + "epoch": 2.127970699945741, + "grad_norm": 6.214874267578125, + "learning_rate": 4.768397971769221e-06, + "loss": 0.2661, + "step": 38926 + }, + { + "epoch": 2.1279842647856757, + "grad_norm": 4.933439254760742, + "learning_rate": 4.768260929148966e-06, + "loss": 0.2058, + "step": 38927 + }, + { + "epoch": 2.1279978296256106, + "grad_norm": 4.99701452255249, + "learning_rate": 4.768123886528711e-06, + "loss": 0.2584, + "step": 38928 + }, + { + "epoch": 2.1280113944655454, + "grad_norm": 4.681225776672363, + "learning_rate": 4.767986843908456e-06, + "loss": 0.2349, + "step": 38929 + }, + { + "epoch": 2.1280249593054803, + "grad_norm": 3.3522567749023438, + "learning_rate": 4.767849801288201e-06, + "loss": 0.1028, + "step": 38930 + }, + { + "epoch": 2.128038524145415, + "grad_norm": 4.759815216064453, + "learning_rate": 4.7677127586679465e-06, + "loss": 0.2237, + "step": 38931 + }, + { + "epoch": 2.12805208898535, + "grad_norm": 7.0932183265686035, + "learning_rate": 4.767575716047691e-06, + "loss": 0.3961, + "step": 38932 + }, + { + "epoch": 2.128065653825285, + "grad_norm": 4.074620723724365, + "learning_rate": 4.767438673427436e-06, + "loss": 0.174, + "step": 38933 + }, + { + "epoch": 2.1280792186652198, + "grad_norm": 4.246501922607422, + "learning_rate": 4.767301630807181e-06, + "loss": 0.178, + "step": 38934 + }, + { + "epoch": 2.1280927835051546, + "grad_norm": 4.275600910186768, + "learning_rate": 4.767164588186926e-06, + "loss": 0.2243, + "step": 38935 + }, + { + "epoch": 2.1281063483450895, + "grad_norm": 5.4550065994262695, + "learning_rate": 4.7670275455666715e-06, + "loss": 0.2248, + "step": 38936 + }, + { + "epoch": 2.1281199131850244, + "grad_norm": 3.0851054191589355, + "learning_rate": 4.766890502946417e-06, + "loss": 0.1717, + "step": 38937 + }, + { + "epoch": 2.1281334780249592, + "grad_norm": 4.6584882736206055, + "learning_rate": 4.766753460326162e-06, + "loss": 0.3189, + "step": 38938 + }, + { + "epoch": 2.128147042864894, + "grad_norm": 3.850517988204956, + "learning_rate": 4.766616417705907e-06, + "loss": 0.1521, + "step": 38939 + }, + { + "epoch": 2.128160607704829, + "grad_norm": 5.564879894256592, + "learning_rate": 4.766479375085652e-06, + "loss": 0.1479, + "step": 38940 + }, + { + "epoch": 2.128174172544764, + "grad_norm": 5.006930828094482, + "learning_rate": 4.766342332465397e-06, + "loss": 0.2136, + "step": 38941 + }, + { + "epoch": 2.1281877373846987, + "grad_norm": 4.3368144035339355, + "learning_rate": 4.766205289845143e-06, + "loss": 0.1948, + "step": 38942 + }, + { + "epoch": 2.1282013022246336, + "grad_norm": 5.517311096191406, + "learning_rate": 4.766068247224888e-06, + "loss": 0.2356, + "step": 38943 + }, + { + "epoch": 2.1282148670645684, + "grad_norm": 5.428351879119873, + "learning_rate": 4.765931204604632e-06, + "loss": 0.2703, + "step": 38944 + }, + { + "epoch": 2.1282284319045037, + "grad_norm": 4.72736930847168, + "learning_rate": 4.765794161984377e-06, + "loss": 0.1997, + "step": 38945 + }, + { + "epoch": 2.1282419967444386, + "grad_norm": 4.770442008972168, + "learning_rate": 4.7656571193641225e-06, + "loss": 0.2391, + "step": 38946 + }, + { + "epoch": 2.1282555615843735, + "grad_norm": 4.410652160644531, + "learning_rate": 4.765520076743868e-06, + "loss": 0.195, + "step": 38947 + }, + { + "epoch": 2.1282691264243083, + "grad_norm": 4.193578243255615, + "learning_rate": 4.765383034123613e-06, + "loss": 0.1555, + "step": 38948 + }, + { + "epoch": 2.128282691264243, + "grad_norm": 5.451732635498047, + "learning_rate": 4.765245991503358e-06, + "loss": 0.2128, + "step": 38949 + }, + { + "epoch": 2.128296256104178, + "grad_norm": 4.565086841583252, + "learning_rate": 4.765108948883103e-06, + "loss": 0.1851, + "step": 38950 + }, + { + "epoch": 2.128309820944113, + "grad_norm": 5.271655559539795, + "learning_rate": 4.764971906262848e-06, + "loss": 0.2986, + "step": 38951 + }, + { + "epoch": 2.128323385784048, + "grad_norm": 5.000069618225098, + "learning_rate": 4.7648348636425935e-06, + "loss": 0.2196, + "step": 38952 + }, + { + "epoch": 2.1283369506239826, + "grad_norm": 5.2978835105896, + "learning_rate": 4.764697821022338e-06, + "loss": 0.2002, + "step": 38953 + }, + { + "epoch": 2.1283505154639175, + "grad_norm": 6.013311862945557, + "learning_rate": 4.764560778402084e-06, + "loss": 0.2713, + "step": 38954 + }, + { + "epoch": 2.1283640803038524, + "grad_norm": 4.837613105773926, + "learning_rate": 4.764423735781828e-06, + "loss": 0.1445, + "step": 38955 + }, + { + "epoch": 2.1283776451437872, + "grad_norm": 6.960646629333496, + "learning_rate": 4.764286693161574e-06, + "loss": 0.3548, + "step": 38956 + }, + { + "epoch": 2.128391209983722, + "grad_norm": 5.13598108291626, + "learning_rate": 4.764149650541319e-06, + "loss": 0.1405, + "step": 38957 + }, + { + "epoch": 2.128404774823657, + "grad_norm": 4.720665454864502, + "learning_rate": 4.764012607921064e-06, + "loss": 0.1934, + "step": 38958 + }, + { + "epoch": 2.128418339663592, + "grad_norm": 5.537196159362793, + "learning_rate": 4.763875565300809e-06, + "loss": 0.1893, + "step": 38959 + }, + { + "epoch": 2.1284319045035267, + "grad_norm": 4.179873943328857, + "learning_rate": 4.763738522680554e-06, + "loss": 0.1769, + "step": 38960 + }, + { + "epoch": 2.1284454693434616, + "grad_norm": 4.306227684020996, + "learning_rate": 4.763601480060299e-06, + "loss": 0.1996, + "step": 38961 + }, + { + "epoch": 2.1284590341833964, + "grad_norm": 4.251958847045898, + "learning_rate": 4.763464437440044e-06, + "loss": 0.1789, + "step": 38962 + }, + { + "epoch": 2.1284725990233317, + "grad_norm": 4.403754711151123, + "learning_rate": 4.76332739481979e-06, + "loss": 0.2062, + "step": 38963 + }, + { + "epoch": 2.1284861638632666, + "grad_norm": 4.970818519592285, + "learning_rate": 4.763190352199534e-06, + "loss": 0.2331, + "step": 38964 + }, + { + "epoch": 2.1284997287032015, + "grad_norm": 6.002591609954834, + "learning_rate": 4.76305330957928e-06, + "loss": 0.2873, + "step": 38965 + }, + { + "epoch": 2.1285132935431363, + "grad_norm": 4.2208123207092285, + "learning_rate": 4.762916266959024e-06, + "loss": 0.1805, + "step": 38966 + }, + { + "epoch": 2.128526858383071, + "grad_norm": 7.5799031257629395, + "learning_rate": 4.7627792243387695e-06, + "loss": 0.2824, + "step": 38967 + }, + { + "epoch": 2.128540423223006, + "grad_norm": 3.7999868392944336, + "learning_rate": 4.762642181718515e-06, + "loss": 0.1395, + "step": 38968 + }, + { + "epoch": 2.128553988062941, + "grad_norm": 5.805166721343994, + "learning_rate": 4.76250513909826e-06, + "loss": 0.2185, + "step": 38969 + }, + { + "epoch": 2.128567552902876, + "grad_norm": 4.744485855102539, + "learning_rate": 4.762368096478005e-06, + "loss": 0.1997, + "step": 38970 + }, + { + "epoch": 2.1285811177428107, + "grad_norm": 5.35811185836792, + "learning_rate": 4.76223105385775e-06, + "loss": 0.1646, + "step": 38971 + }, + { + "epoch": 2.1285946825827455, + "grad_norm": 5.74029541015625, + "learning_rate": 4.7620940112374954e-06, + "loss": 0.1931, + "step": 38972 + }, + { + "epoch": 2.1286082474226804, + "grad_norm": 4.771696090698242, + "learning_rate": 4.76195696861724e-06, + "loss": 0.1747, + "step": 38973 + }, + { + "epoch": 2.1286218122626153, + "grad_norm": 4.466475963592529, + "learning_rate": 4.761819925996986e-06, + "loss": 0.1302, + "step": 38974 + }, + { + "epoch": 2.12863537710255, + "grad_norm": 6.115350246429443, + "learning_rate": 4.76168288337673e-06, + "loss": 0.211, + "step": 38975 + }, + { + "epoch": 2.128648941942485, + "grad_norm": 5.464465618133545, + "learning_rate": 4.761545840756476e-06, + "loss": 0.2391, + "step": 38976 + }, + { + "epoch": 2.12866250678242, + "grad_norm": 4.447498321533203, + "learning_rate": 4.7614087981362205e-06, + "loss": 0.1549, + "step": 38977 + }, + { + "epoch": 2.1286760716223547, + "grad_norm": 4.953394412994385, + "learning_rate": 4.761271755515966e-06, + "loss": 0.2254, + "step": 38978 + }, + { + "epoch": 2.1286896364622896, + "grad_norm": 7.051719665527344, + "learning_rate": 4.761134712895711e-06, + "loss": 0.3545, + "step": 38979 + }, + { + "epoch": 2.1287032013022245, + "grad_norm": 6.65915060043335, + "learning_rate": 4.760997670275456e-06, + "loss": 0.25, + "step": 38980 + }, + { + "epoch": 2.1287167661421593, + "grad_norm": 5.42241907119751, + "learning_rate": 4.760860627655201e-06, + "loss": 0.1964, + "step": 38981 + }, + { + "epoch": 2.128730330982094, + "grad_norm": 4.783738613128662, + "learning_rate": 4.760723585034946e-06, + "loss": 0.1648, + "step": 38982 + }, + { + "epoch": 2.1287438958220295, + "grad_norm": 5.594949722290039, + "learning_rate": 4.7605865424146915e-06, + "loss": 0.1725, + "step": 38983 + }, + { + "epoch": 2.1287574606619644, + "grad_norm": 5.816532611846924, + "learning_rate": 4.760449499794437e-06, + "loss": 0.1745, + "step": 38984 + }, + { + "epoch": 2.1287710255018992, + "grad_norm": 5.643131256103516, + "learning_rate": 4.760312457174182e-06, + "loss": 0.1534, + "step": 38985 + }, + { + "epoch": 2.128784590341834, + "grad_norm": 5.573972225189209, + "learning_rate": 4.760175414553926e-06, + "loss": 0.1962, + "step": 38986 + }, + { + "epoch": 2.128798155181769, + "grad_norm": 5.390729904174805, + "learning_rate": 4.760038371933671e-06, + "loss": 0.2505, + "step": 38987 + }, + { + "epoch": 2.128811720021704, + "grad_norm": 6.4673027992248535, + "learning_rate": 4.759901329313417e-06, + "loss": 0.2222, + "step": 38988 + }, + { + "epoch": 2.1288252848616387, + "grad_norm": 4.9054179191589355, + "learning_rate": 4.759764286693162e-06, + "loss": 0.1435, + "step": 38989 + }, + { + "epoch": 2.1288388497015736, + "grad_norm": 6.608640670776367, + "learning_rate": 4.759627244072907e-06, + "loss": 0.1971, + "step": 38990 + }, + { + "epoch": 2.1288524145415084, + "grad_norm": 4.086178302764893, + "learning_rate": 4.759490201452652e-06, + "loss": 0.242, + "step": 38991 + }, + { + "epoch": 2.1288659793814433, + "grad_norm": 4.8339056968688965, + "learning_rate": 4.759353158832397e-06, + "loss": 0.1501, + "step": 38992 + }, + { + "epoch": 2.128879544221378, + "grad_norm": 4.2569580078125, + "learning_rate": 4.7592161162121425e-06, + "loss": 0.219, + "step": 38993 + }, + { + "epoch": 2.128893109061313, + "grad_norm": 4.286553382873535, + "learning_rate": 4.759079073591888e-06, + "loss": 0.2551, + "step": 38994 + }, + { + "epoch": 2.128906673901248, + "grad_norm": 4.250699043273926, + "learning_rate": 4.758942030971633e-06, + "loss": 0.1338, + "step": 38995 + }, + { + "epoch": 2.1289202387411827, + "grad_norm": 5.1575236320495605, + "learning_rate": 4.758804988351378e-06, + "loss": 0.1967, + "step": 38996 + }, + { + "epoch": 2.1289338035811176, + "grad_norm": 5.381028652191162, + "learning_rate": 4.758667945731123e-06, + "loss": 0.2482, + "step": 38997 + }, + { + "epoch": 2.1289473684210525, + "grad_norm": 5.112403869628906, + "learning_rate": 4.7585309031108675e-06, + "loss": 0.1862, + "step": 38998 + }, + { + "epoch": 2.1289609332609873, + "grad_norm": 3.464484930038452, + "learning_rate": 4.758393860490613e-06, + "loss": 0.1873, + "step": 38999 + }, + { + "epoch": 2.128974498100922, + "grad_norm": 5.98884391784668, + "learning_rate": 4.758256817870358e-06, + "loss": 0.2041, + "step": 39000 + }, + { + "epoch": 2.1289880629408575, + "grad_norm": 4.954890727996826, + "learning_rate": 4.758119775250103e-06, + "loss": 0.1901, + "step": 39001 + }, + { + "epoch": 2.1290016277807924, + "grad_norm": 4.671130180358887, + "learning_rate": 4.757982732629848e-06, + "loss": 0.1421, + "step": 39002 + }, + { + "epoch": 2.1290151926207272, + "grad_norm": 8.364615440368652, + "learning_rate": 4.7578456900095934e-06, + "loss": 0.2188, + "step": 39003 + }, + { + "epoch": 2.129028757460662, + "grad_norm": 6.4931840896606445, + "learning_rate": 4.757708647389339e-06, + "loss": 0.262, + "step": 39004 + }, + { + "epoch": 2.129042322300597, + "grad_norm": 4.726604461669922, + "learning_rate": 4.757571604769084e-06, + "loss": 0.1512, + "step": 39005 + }, + { + "epoch": 2.129055887140532, + "grad_norm": 5.088418006896973, + "learning_rate": 4.757434562148829e-06, + "loss": 0.2569, + "step": 39006 + }, + { + "epoch": 2.1290694519804667, + "grad_norm": 4.221303939819336, + "learning_rate": 4.757297519528573e-06, + "loss": 0.1872, + "step": 39007 + }, + { + "epoch": 2.1290830168204016, + "grad_norm": 4.606356143951416, + "learning_rate": 4.757160476908319e-06, + "loss": 0.1685, + "step": 39008 + }, + { + "epoch": 2.1290965816603364, + "grad_norm": 4.267007827758789, + "learning_rate": 4.757023434288064e-06, + "loss": 0.1975, + "step": 39009 + }, + { + "epoch": 2.1291101465002713, + "grad_norm": 4.230945587158203, + "learning_rate": 4.75688639166781e-06, + "loss": 0.1216, + "step": 39010 + }, + { + "epoch": 2.129123711340206, + "grad_norm": 5.842548847198486, + "learning_rate": 4.756749349047554e-06, + "loss": 0.2318, + "step": 39011 + }, + { + "epoch": 2.129137276180141, + "grad_norm": 6.908728122711182, + "learning_rate": 4.756612306427299e-06, + "loss": 0.2693, + "step": 39012 + }, + { + "epoch": 2.129150841020076, + "grad_norm": 5.738339900970459, + "learning_rate": 4.756475263807044e-06, + "loss": 0.3294, + "step": 39013 + }, + { + "epoch": 2.1291644058600108, + "grad_norm": 6.149979114532471, + "learning_rate": 4.7563382211867895e-06, + "loss": 0.2198, + "step": 39014 + }, + { + "epoch": 2.1291779706999456, + "grad_norm": 5.846415042877197, + "learning_rate": 4.756201178566535e-06, + "loss": 0.23, + "step": 39015 + }, + { + "epoch": 2.1291915355398805, + "grad_norm": 5.50961446762085, + "learning_rate": 4.756064135946279e-06, + "loss": 0.1897, + "step": 39016 + }, + { + "epoch": 2.1292051003798154, + "grad_norm": 4.4170942306518555, + "learning_rate": 4.755927093326025e-06, + "loss": 0.1442, + "step": 39017 + }, + { + "epoch": 2.1292186652197502, + "grad_norm": 4.322582244873047, + "learning_rate": 4.755790050705769e-06, + "loss": 0.1531, + "step": 39018 + }, + { + "epoch": 2.129232230059685, + "grad_norm": 4.305905342102051, + "learning_rate": 4.7556530080855154e-06, + "loss": 0.2572, + "step": 39019 + }, + { + "epoch": 2.12924579489962, + "grad_norm": 5.487154960632324, + "learning_rate": 4.75551596546526e-06, + "loss": 0.1699, + "step": 39020 + }, + { + "epoch": 2.1292593597395553, + "grad_norm": 4.995138168334961, + "learning_rate": 4.755378922845005e-06, + "loss": 0.2405, + "step": 39021 + }, + { + "epoch": 2.12927292457949, + "grad_norm": 5.085203170776367, + "learning_rate": 4.75524188022475e-06, + "loss": 0.2612, + "step": 39022 + }, + { + "epoch": 2.129286489419425, + "grad_norm": 5.447712421417236, + "learning_rate": 4.755104837604495e-06, + "loss": 0.1911, + "step": 39023 + }, + { + "epoch": 2.12930005425936, + "grad_norm": 5.444380283355713, + "learning_rate": 4.7549677949842405e-06, + "loss": 0.2927, + "step": 39024 + }, + { + "epoch": 2.1293136190992947, + "grad_norm": 5.523067951202393, + "learning_rate": 4.754830752363986e-06, + "loss": 0.3053, + "step": 39025 + }, + { + "epoch": 2.1293271839392296, + "grad_norm": 4.751882553100586, + "learning_rate": 4.754693709743731e-06, + "loss": 0.1363, + "step": 39026 + }, + { + "epoch": 2.1293407487791645, + "grad_norm": 6.8940863609313965, + "learning_rate": 4.754556667123475e-06, + "loss": 0.3857, + "step": 39027 + }, + { + "epoch": 2.1293543136190993, + "grad_norm": 5.070270538330078, + "learning_rate": 4.754419624503221e-06, + "loss": 0.202, + "step": 39028 + }, + { + "epoch": 2.129367878459034, + "grad_norm": 8.119279861450195, + "learning_rate": 4.7542825818829655e-06, + "loss": 0.4396, + "step": 39029 + }, + { + "epoch": 2.129381443298969, + "grad_norm": 4.9407758712768555, + "learning_rate": 4.7541455392627116e-06, + "loss": 0.241, + "step": 39030 + }, + { + "epoch": 2.129395008138904, + "grad_norm": 4.992949485778809, + "learning_rate": 4.754008496642456e-06, + "loss": 0.2891, + "step": 39031 + }, + { + "epoch": 2.129408572978839, + "grad_norm": 5.923847675323486, + "learning_rate": 4.753871454022201e-06, + "loss": 0.363, + "step": 39032 + }, + { + "epoch": 2.1294221378187737, + "grad_norm": 6.2088189125061035, + "learning_rate": 4.753734411401946e-06, + "loss": 0.2669, + "step": 39033 + }, + { + "epoch": 2.1294357026587085, + "grad_norm": 8.785604476928711, + "learning_rate": 4.7535973687816914e-06, + "loss": 0.2641, + "step": 39034 + }, + { + "epoch": 2.1294492674986434, + "grad_norm": 4.385933876037598, + "learning_rate": 4.753460326161437e-06, + "loss": 0.1878, + "step": 39035 + }, + { + "epoch": 2.1294628323385782, + "grad_norm": 5.821471214294434, + "learning_rate": 4.753323283541182e-06, + "loss": 0.2489, + "step": 39036 + }, + { + "epoch": 2.129476397178513, + "grad_norm": 4.978909015655518, + "learning_rate": 4.753186240920927e-06, + "loss": 0.1849, + "step": 39037 + }, + { + "epoch": 2.129489962018448, + "grad_norm": 5.758255481719971, + "learning_rate": 4.753049198300672e-06, + "loss": 0.2473, + "step": 39038 + }, + { + "epoch": 2.1295035268583833, + "grad_norm": 5.8345046043396, + "learning_rate": 4.752912155680417e-06, + "loss": 0.3076, + "step": 39039 + }, + { + "epoch": 2.129517091698318, + "grad_norm": 4.564668655395508, + "learning_rate": 4.7527751130601625e-06, + "loss": 0.2227, + "step": 39040 + }, + { + "epoch": 2.129530656538253, + "grad_norm": 5.281843185424805, + "learning_rate": 4.752638070439907e-06, + "loss": 0.2738, + "step": 39041 + }, + { + "epoch": 2.129544221378188, + "grad_norm": 4.2430195808410645, + "learning_rate": 4.752501027819652e-06, + "loss": 0.2056, + "step": 39042 + }, + { + "epoch": 2.1295577862181228, + "grad_norm": 5.477120876312256, + "learning_rate": 4.752363985199397e-06, + "loss": 0.2791, + "step": 39043 + }, + { + "epoch": 2.1295713510580576, + "grad_norm": 5.512454986572266, + "learning_rate": 4.752226942579142e-06, + "loss": 0.2546, + "step": 39044 + }, + { + "epoch": 2.1295849158979925, + "grad_norm": 4.939752101898193, + "learning_rate": 4.7520898999588876e-06, + "loss": 0.1336, + "step": 39045 + }, + { + "epoch": 2.1295984807379273, + "grad_norm": 4.3234710693359375, + "learning_rate": 4.751952857338633e-06, + "loss": 0.1959, + "step": 39046 + }, + { + "epoch": 2.129612045577862, + "grad_norm": 4.38012170791626, + "learning_rate": 4.751815814718378e-06, + "loss": 0.1069, + "step": 39047 + }, + { + "epoch": 2.129625610417797, + "grad_norm": 5.122222900390625, + "learning_rate": 4.751678772098123e-06, + "loss": 0.2333, + "step": 39048 + }, + { + "epoch": 2.129639175257732, + "grad_norm": 3.92818284034729, + "learning_rate": 4.751541729477868e-06, + "loss": 0.1191, + "step": 39049 + }, + { + "epoch": 2.129652740097667, + "grad_norm": 4.678618907928467, + "learning_rate": 4.751404686857613e-06, + "loss": 0.2117, + "step": 39050 + }, + { + "epoch": 2.1296663049376017, + "grad_norm": 4.7252631187438965, + "learning_rate": 4.751267644237359e-06, + "loss": 0.1807, + "step": 39051 + }, + { + "epoch": 2.1296798697775365, + "grad_norm": 4.890259742736816, + "learning_rate": 4.751130601617103e-06, + "loss": 0.1802, + "step": 39052 + }, + { + "epoch": 2.1296934346174714, + "grad_norm": 4.950329303741455, + "learning_rate": 4.750993558996849e-06, + "loss": 0.2044, + "step": 39053 + }, + { + "epoch": 2.1297069994574063, + "grad_norm": 3.298523426055908, + "learning_rate": 4.750856516376593e-06, + "loss": 0.1609, + "step": 39054 + }, + { + "epoch": 2.129720564297341, + "grad_norm": 3.73919677734375, + "learning_rate": 4.7507194737563385e-06, + "loss": 0.124, + "step": 39055 + }, + { + "epoch": 2.129734129137276, + "grad_norm": 3.1600351333618164, + "learning_rate": 4.750582431136084e-06, + "loss": 0.1033, + "step": 39056 + }, + { + "epoch": 2.129747693977211, + "grad_norm": 6.300355434417725, + "learning_rate": 4.750445388515829e-06, + "loss": 0.1776, + "step": 39057 + }, + { + "epoch": 2.1297612588171457, + "grad_norm": 4.497783660888672, + "learning_rate": 4.750308345895574e-06, + "loss": 0.2154, + "step": 39058 + }, + { + "epoch": 2.129774823657081, + "grad_norm": 3.073882818222046, + "learning_rate": 4.750171303275319e-06, + "loss": 0.1062, + "step": 39059 + }, + { + "epoch": 2.129788388497016, + "grad_norm": 4.204539775848389, + "learning_rate": 4.750034260655064e-06, + "loss": 0.1362, + "step": 39060 + }, + { + "epoch": 2.1298019533369508, + "grad_norm": 4.53772497177124, + "learning_rate": 4.749897218034809e-06, + "loss": 0.214, + "step": 39061 + }, + { + "epoch": 2.1298155181768856, + "grad_norm": 3.846781015396118, + "learning_rate": 4.749760175414555e-06, + "loss": 0.1546, + "step": 39062 + }, + { + "epoch": 2.1298290830168205, + "grad_norm": 4.181769371032715, + "learning_rate": 4.749623132794299e-06, + "loss": 0.2109, + "step": 39063 + }, + { + "epoch": 2.1298426478567554, + "grad_norm": 5.190621852874756, + "learning_rate": 4.749486090174045e-06, + "loss": 0.2436, + "step": 39064 + }, + { + "epoch": 2.1298562126966902, + "grad_norm": 2.9237794876098633, + "learning_rate": 4.7493490475537894e-06, + "loss": 0.1108, + "step": 39065 + }, + { + "epoch": 2.129869777536625, + "grad_norm": 3.795403003692627, + "learning_rate": 4.749212004933535e-06, + "loss": 0.1338, + "step": 39066 + }, + { + "epoch": 2.12988334237656, + "grad_norm": 3.927872896194458, + "learning_rate": 4.74907496231328e-06, + "loss": 0.1792, + "step": 39067 + }, + { + "epoch": 2.129896907216495, + "grad_norm": 2.7674643993377686, + "learning_rate": 4.748937919693025e-06, + "loss": 0.0976, + "step": 39068 + }, + { + "epoch": 2.1299104720564297, + "grad_norm": 5.100415229797363, + "learning_rate": 4.74880087707277e-06, + "loss": 0.187, + "step": 39069 + }, + { + "epoch": 2.1299240368963646, + "grad_norm": 3.9491801261901855, + "learning_rate": 4.7486638344525145e-06, + "loss": 0.1444, + "step": 39070 + }, + { + "epoch": 2.1299376017362994, + "grad_norm": 3.3517282009124756, + "learning_rate": 4.7485267918322605e-06, + "loss": 0.1141, + "step": 39071 + }, + { + "epoch": 2.1299511665762343, + "grad_norm": 2.2592074871063232, + "learning_rate": 4.748389749212005e-06, + "loss": 0.0803, + "step": 39072 + }, + { + "epoch": 2.129964731416169, + "grad_norm": 4.729709148406982, + "learning_rate": 4.748252706591751e-06, + "loss": 0.1753, + "step": 39073 + }, + { + "epoch": 2.129978296256104, + "grad_norm": 3.6487438678741455, + "learning_rate": 4.748115663971495e-06, + "loss": 0.1487, + "step": 39074 + }, + { + "epoch": 2.129991861096039, + "grad_norm": 4.897768974304199, + "learning_rate": 4.74797862135124e-06, + "loss": 0.1637, + "step": 39075 + }, + { + "epoch": 2.1300054259359738, + "grad_norm": 4.3054304122924805, + "learning_rate": 4.7478415787309856e-06, + "loss": 0.1146, + "step": 39076 + }, + { + "epoch": 2.130018990775909, + "grad_norm": 2.9473297595977783, + "learning_rate": 4.747704536110731e-06, + "loss": 0.1038, + "step": 39077 + }, + { + "epoch": 2.130032555615844, + "grad_norm": 4.524110317230225, + "learning_rate": 4.747567493490476e-06, + "loss": 0.1366, + "step": 39078 + }, + { + "epoch": 2.130046120455779, + "grad_norm": 3.7078068256378174, + "learning_rate": 4.747430450870221e-06, + "loss": 0.1005, + "step": 39079 + }, + { + "epoch": 2.1300596852957137, + "grad_norm": 4.546614646911621, + "learning_rate": 4.747293408249966e-06, + "loss": 0.1059, + "step": 39080 + }, + { + "epoch": 2.1300732501356485, + "grad_norm": 3.793562173843384, + "learning_rate": 4.7471563656297115e-06, + "loss": 0.1312, + "step": 39081 + }, + { + "epoch": 2.1300868149755834, + "grad_norm": 3.8908448219299316, + "learning_rate": 4.747019323009457e-06, + "loss": 0.1387, + "step": 39082 + }, + { + "epoch": 2.1301003798155183, + "grad_norm": 4.632841110229492, + "learning_rate": 4.746882280389201e-06, + "loss": 0.1005, + "step": 39083 + }, + { + "epoch": 2.130113944655453, + "grad_norm": 3.99371075630188, + "learning_rate": 4.746745237768947e-06, + "loss": 0.0893, + "step": 39084 + }, + { + "epoch": 2.130127509495388, + "grad_norm": 3.9552247524261475, + "learning_rate": 4.746608195148691e-06, + "loss": 0.1174, + "step": 39085 + }, + { + "epoch": 2.130141074335323, + "grad_norm": 4.713423728942871, + "learning_rate": 4.7464711525284365e-06, + "loss": 0.1203, + "step": 39086 + }, + { + "epoch": 2.1301546391752577, + "grad_norm": 3.2231407165527344, + "learning_rate": 4.746334109908182e-06, + "loss": 0.106, + "step": 39087 + }, + { + "epoch": 2.1301682040151926, + "grad_norm": 5.184538841247559, + "learning_rate": 4.746197067287927e-06, + "loss": 0.2061, + "step": 39088 + }, + { + "epoch": 2.1301817688551274, + "grad_norm": 4.764270782470703, + "learning_rate": 4.746060024667672e-06, + "loss": 0.1648, + "step": 39089 + }, + { + "epoch": 2.1301953336950623, + "grad_norm": 4.287672519683838, + "learning_rate": 4.745922982047417e-06, + "loss": 0.1824, + "step": 39090 + }, + { + "epoch": 2.130208898534997, + "grad_norm": 4.99644136428833, + "learning_rate": 4.745785939427162e-06, + "loss": 0.215, + "step": 39091 + }, + { + "epoch": 2.130222463374932, + "grad_norm": 4.853711128234863, + "learning_rate": 4.7456488968069076e-06, + "loss": 0.1662, + "step": 39092 + }, + { + "epoch": 2.130236028214867, + "grad_norm": 4.702127456665039, + "learning_rate": 4.745511854186653e-06, + "loss": 0.1461, + "step": 39093 + }, + { + "epoch": 2.1302495930548018, + "grad_norm": 3.8339016437530518, + "learning_rate": 4.745374811566398e-06, + "loss": 0.1117, + "step": 39094 + }, + { + "epoch": 2.1302631578947366, + "grad_norm": 3.9097468852996826, + "learning_rate": 4.745237768946142e-06, + "loss": 0.1155, + "step": 39095 + }, + { + "epoch": 2.1302767227346715, + "grad_norm": 5.79128885269165, + "learning_rate": 4.7451007263258874e-06, + "loss": 0.1728, + "step": 39096 + }, + { + "epoch": 2.130290287574607, + "grad_norm": 4.601155757904053, + "learning_rate": 4.744963683705633e-06, + "loss": 0.1549, + "step": 39097 + }, + { + "epoch": 2.1303038524145417, + "grad_norm": 4.05879020690918, + "learning_rate": 4.744826641085378e-06, + "loss": 0.1228, + "step": 39098 + }, + { + "epoch": 2.1303174172544765, + "grad_norm": 4.528907299041748, + "learning_rate": 4.744689598465123e-06, + "loss": 0.063, + "step": 39099 + }, + { + "epoch": 2.1303309820944114, + "grad_norm": 5.257382392883301, + "learning_rate": 4.744552555844868e-06, + "loss": 0.2645, + "step": 39100 + }, + { + "epoch": 2.1303445469343463, + "grad_norm": 5.427699565887451, + "learning_rate": 4.744415513224613e-06, + "loss": 0.2457, + "step": 39101 + }, + { + "epoch": 2.130358111774281, + "grad_norm": 4.655945777893066, + "learning_rate": 4.7442784706043585e-06, + "loss": 0.1711, + "step": 39102 + }, + { + "epoch": 2.130371676614216, + "grad_norm": 6.225466251373291, + "learning_rate": 4.744141427984104e-06, + "loss": 0.2343, + "step": 39103 + }, + { + "epoch": 2.130385241454151, + "grad_norm": 4.341459274291992, + "learning_rate": 4.744004385363848e-06, + "loss": 0.1523, + "step": 39104 + }, + { + "epoch": 2.1303988062940857, + "grad_norm": 4.854366779327393, + "learning_rate": 4.743867342743594e-06, + "loss": 0.2066, + "step": 39105 + }, + { + "epoch": 2.1304123711340206, + "grad_norm": 4.795454978942871, + "learning_rate": 4.743730300123338e-06, + "loss": 0.2206, + "step": 39106 + }, + { + "epoch": 2.1304259359739555, + "grad_norm": 4.91249418258667, + "learning_rate": 4.743593257503084e-06, + "loss": 0.2005, + "step": 39107 + }, + { + "epoch": 2.1304395008138903, + "grad_norm": 5.614875316619873, + "learning_rate": 4.743456214882829e-06, + "loss": 0.2082, + "step": 39108 + }, + { + "epoch": 2.130453065653825, + "grad_norm": 4.823931694030762, + "learning_rate": 4.743319172262574e-06, + "loss": 0.1778, + "step": 39109 + }, + { + "epoch": 2.13046663049376, + "grad_norm": 4.091268062591553, + "learning_rate": 4.743182129642319e-06, + "loss": 0.0997, + "step": 39110 + }, + { + "epoch": 2.130480195333695, + "grad_norm": 3.9646494388580322, + "learning_rate": 4.743045087022064e-06, + "loss": 0.1028, + "step": 39111 + }, + { + "epoch": 2.13049376017363, + "grad_norm": 4.632138252258301, + "learning_rate": 4.7429080444018095e-06, + "loss": 0.1478, + "step": 39112 + }, + { + "epoch": 2.1305073250135647, + "grad_norm": 5.996420860290527, + "learning_rate": 4.742771001781555e-06, + "loss": 0.1978, + "step": 39113 + }, + { + "epoch": 2.1305208898534995, + "grad_norm": 5.602828502655029, + "learning_rate": 4.7426339591613e-06, + "loss": 0.1899, + "step": 39114 + }, + { + "epoch": 2.130534454693435, + "grad_norm": 5.396197319030762, + "learning_rate": 4.742496916541044e-06, + "loss": 0.1842, + "step": 39115 + }, + { + "epoch": 2.1305480195333697, + "grad_norm": 7.224946975708008, + "learning_rate": 4.74235987392079e-06, + "loss": 0.2438, + "step": 39116 + }, + { + "epoch": 2.1305615843733046, + "grad_norm": 3.6583564281463623, + "learning_rate": 4.7422228313005345e-06, + "loss": 0.1119, + "step": 39117 + }, + { + "epoch": 2.1305751492132394, + "grad_norm": 3.99290132522583, + "learning_rate": 4.7420857886802805e-06, + "loss": 0.0833, + "step": 39118 + }, + { + "epoch": 2.1305887140531743, + "grad_norm": 4.419265270233154, + "learning_rate": 4.741948746060025e-06, + "loss": 0.1405, + "step": 39119 + }, + { + "epoch": 2.130602278893109, + "grad_norm": 4.409165859222412, + "learning_rate": 4.74181170343977e-06, + "loss": 0.1293, + "step": 39120 + }, + { + "epoch": 2.130615843733044, + "grad_norm": 4.824589729309082, + "learning_rate": 4.741674660819515e-06, + "loss": 0.101, + "step": 39121 + }, + { + "epoch": 2.130629408572979, + "grad_norm": 6.153580665588379, + "learning_rate": 4.74153761819926e-06, + "loss": 0.2697, + "step": 39122 + }, + { + "epoch": 2.1306429734129138, + "grad_norm": 4.364241600036621, + "learning_rate": 4.741400575579006e-06, + "loss": 0.1624, + "step": 39123 + }, + { + "epoch": 2.1306565382528486, + "grad_norm": 5.114078521728516, + "learning_rate": 4.74126353295875e-06, + "loss": 0.28, + "step": 39124 + }, + { + "epoch": 2.1306701030927835, + "grad_norm": 3.70123553276062, + "learning_rate": 4.741126490338496e-06, + "loss": 0.1354, + "step": 39125 + }, + { + "epoch": 2.1306836679327184, + "grad_norm": 5.929975509643555, + "learning_rate": 4.74098944771824e-06, + "loss": 0.184, + "step": 39126 + }, + { + "epoch": 2.130697232772653, + "grad_norm": 4.8902764320373535, + "learning_rate": 4.740852405097986e-06, + "loss": 0.1776, + "step": 39127 + }, + { + "epoch": 2.130710797612588, + "grad_norm": 4.920883655548096, + "learning_rate": 4.740715362477731e-06, + "loss": 0.2711, + "step": 39128 + }, + { + "epoch": 2.130724362452523, + "grad_norm": 5.028313636779785, + "learning_rate": 4.740578319857476e-06, + "loss": 0.1834, + "step": 39129 + }, + { + "epoch": 2.130737927292458, + "grad_norm": 4.865532398223877, + "learning_rate": 4.740441277237221e-06, + "loss": 0.1533, + "step": 39130 + }, + { + "epoch": 2.1307514921323927, + "grad_norm": 4.536766052246094, + "learning_rate": 4.740304234616966e-06, + "loss": 0.2095, + "step": 39131 + }, + { + "epoch": 2.1307650569723275, + "grad_norm": 3.89339017868042, + "learning_rate": 4.740167191996711e-06, + "loss": 0.1763, + "step": 39132 + }, + { + "epoch": 2.1307786218122624, + "grad_norm": 4.664797782897949, + "learning_rate": 4.7400301493764565e-06, + "loss": 0.1418, + "step": 39133 + }, + { + "epoch": 2.1307921866521973, + "grad_norm": 8.095459938049316, + "learning_rate": 4.739893106756202e-06, + "loss": 0.3222, + "step": 39134 + }, + { + "epoch": 2.1308057514921326, + "grad_norm": 7.385166645050049, + "learning_rate": 4.739756064135947e-06, + "loss": 0.2172, + "step": 39135 + }, + { + "epoch": 2.1308193163320674, + "grad_norm": 6.163136005401611, + "learning_rate": 4.739619021515692e-06, + "loss": 0.2083, + "step": 39136 + }, + { + "epoch": 2.1308328811720023, + "grad_norm": 5.348266124725342, + "learning_rate": 4.739481978895436e-06, + "loss": 0.1813, + "step": 39137 + }, + { + "epoch": 2.130846446011937, + "grad_norm": 3.4952995777130127, + "learning_rate": 4.739344936275182e-06, + "loss": 0.1724, + "step": 39138 + }, + { + "epoch": 2.130860010851872, + "grad_norm": 4.443558216094971, + "learning_rate": 4.739207893654927e-06, + "loss": 0.1253, + "step": 39139 + }, + { + "epoch": 2.130873575691807, + "grad_norm": 4.52706241607666, + "learning_rate": 4.739070851034672e-06, + "loss": 0.1033, + "step": 39140 + }, + { + "epoch": 2.1308871405317418, + "grad_norm": 3.2361643314361572, + "learning_rate": 4.738933808414417e-06, + "loss": 0.1207, + "step": 39141 + }, + { + "epoch": 2.1309007053716766, + "grad_norm": 3.2062976360321045, + "learning_rate": 4.738796765794162e-06, + "loss": 0.0916, + "step": 39142 + }, + { + "epoch": 2.1309142702116115, + "grad_norm": 4.207047939300537, + "learning_rate": 4.7386597231739075e-06, + "loss": 0.1714, + "step": 39143 + }, + { + "epoch": 2.1309278350515464, + "grad_norm": 3.1400225162506104, + "learning_rate": 4.738522680553653e-06, + "loss": 0.0932, + "step": 39144 + }, + { + "epoch": 2.1309413998914812, + "grad_norm": 3.9326422214508057, + "learning_rate": 4.738385637933398e-06, + "loss": 0.1894, + "step": 39145 + }, + { + "epoch": 2.130954964731416, + "grad_norm": 6.406056880950928, + "learning_rate": 4.738248595313143e-06, + "loss": 0.1766, + "step": 39146 + }, + { + "epoch": 2.130968529571351, + "grad_norm": 3.9117839336395264, + "learning_rate": 4.738111552692888e-06, + "loss": 0.1065, + "step": 39147 + }, + { + "epoch": 2.130982094411286, + "grad_norm": 4.546144008636475, + "learning_rate": 4.737974510072633e-06, + "loss": 0.1624, + "step": 39148 + }, + { + "epoch": 2.1309956592512207, + "grad_norm": 3.6525144577026367, + "learning_rate": 4.737837467452378e-06, + "loss": 0.1456, + "step": 39149 + }, + { + "epoch": 2.1310092240911556, + "grad_norm": 4.861083030700684, + "learning_rate": 4.737700424832124e-06, + "loss": 0.2398, + "step": 39150 + }, + { + "epoch": 2.1310227889310904, + "grad_norm": 4.516361713409424, + "learning_rate": 4.737563382211868e-06, + "loss": 0.2604, + "step": 39151 + }, + { + "epoch": 2.1310363537710257, + "grad_norm": 3.857189178466797, + "learning_rate": 4.737426339591613e-06, + "loss": 0.1871, + "step": 39152 + }, + { + "epoch": 2.1310499186109606, + "grad_norm": 4.716011047363281, + "learning_rate": 4.737289296971358e-06, + "loss": 0.209, + "step": 39153 + }, + { + "epoch": 2.1310634834508955, + "grad_norm": 4.959129810333252, + "learning_rate": 4.737152254351104e-06, + "loss": 0.1696, + "step": 39154 + }, + { + "epoch": 2.1310770482908303, + "grad_norm": 3.8688981533050537, + "learning_rate": 4.737015211730849e-06, + "loss": 0.1622, + "step": 39155 + }, + { + "epoch": 2.131090613130765, + "grad_norm": 4.547823905944824, + "learning_rate": 4.736878169110594e-06, + "loss": 0.1781, + "step": 39156 + }, + { + "epoch": 2.1311041779707, + "grad_norm": 5.131424427032471, + "learning_rate": 4.736741126490339e-06, + "loss": 0.1506, + "step": 39157 + }, + { + "epoch": 2.131117742810635, + "grad_norm": 6.020114898681641, + "learning_rate": 4.7366040838700835e-06, + "loss": 0.1987, + "step": 39158 + }, + { + "epoch": 2.13113130765057, + "grad_norm": 4.156523704528809, + "learning_rate": 4.7364670412498295e-06, + "loss": 0.1576, + "step": 39159 + }, + { + "epoch": 2.1311448724905047, + "grad_norm": 4.213033676147461, + "learning_rate": 4.736329998629574e-06, + "loss": 0.1806, + "step": 39160 + }, + { + "epoch": 2.1311584373304395, + "grad_norm": 5.835903644561768, + "learning_rate": 4.73619295600932e-06, + "loss": 0.2532, + "step": 39161 + }, + { + "epoch": 2.1311720021703744, + "grad_norm": 4.310513496398926, + "learning_rate": 4.736055913389064e-06, + "loss": 0.1465, + "step": 39162 + }, + { + "epoch": 2.1311855670103093, + "grad_norm": 5.584687232971191, + "learning_rate": 4.735918870768809e-06, + "loss": 0.3182, + "step": 39163 + }, + { + "epoch": 2.131199131850244, + "grad_norm": 4.854351997375488, + "learning_rate": 4.7357818281485545e-06, + "loss": 0.1608, + "step": 39164 + }, + { + "epoch": 2.131212696690179, + "grad_norm": 4.727290153503418, + "learning_rate": 4.7356447855283e-06, + "loss": 0.1594, + "step": 39165 + }, + { + "epoch": 2.131226261530114, + "grad_norm": 5.2114715576171875, + "learning_rate": 4.735507742908045e-06, + "loss": 0.2328, + "step": 39166 + }, + { + "epoch": 2.1312398263700487, + "grad_norm": 5.308528900146484, + "learning_rate": 4.73537070028779e-06, + "loss": 0.2262, + "step": 39167 + }, + { + "epoch": 2.1312533912099836, + "grad_norm": 4.948671340942383, + "learning_rate": 4.735233657667535e-06, + "loss": 0.2504, + "step": 39168 + }, + { + "epoch": 2.1312669560499184, + "grad_norm": 5.549283027648926, + "learning_rate": 4.7350966150472796e-06, + "loss": 0.3282, + "step": 39169 + }, + { + "epoch": 2.1312805208898533, + "grad_norm": 7.261832237243652, + "learning_rate": 4.734959572427026e-06, + "loss": 0.2759, + "step": 39170 + }, + { + "epoch": 2.131294085729788, + "grad_norm": 4.234572410583496, + "learning_rate": 4.73482252980677e-06, + "loss": 0.2549, + "step": 39171 + }, + { + "epoch": 2.131307650569723, + "grad_norm": 5.085289001464844, + "learning_rate": 4.734685487186516e-06, + "loss": 0.2901, + "step": 39172 + }, + { + "epoch": 2.1313212154096584, + "grad_norm": 3.682518243789673, + "learning_rate": 4.73454844456626e-06, + "loss": 0.1155, + "step": 39173 + }, + { + "epoch": 2.131334780249593, + "grad_norm": 5.756502628326416, + "learning_rate": 4.7344114019460055e-06, + "loss": 0.1776, + "step": 39174 + }, + { + "epoch": 2.131348345089528, + "grad_norm": 4.9562602043151855, + "learning_rate": 4.734274359325751e-06, + "loss": 0.2231, + "step": 39175 + }, + { + "epoch": 2.131361909929463, + "grad_norm": 4.40183162689209, + "learning_rate": 4.734137316705496e-06, + "loss": 0.1488, + "step": 39176 + }, + { + "epoch": 2.131375474769398, + "grad_norm": 7.172739505767822, + "learning_rate": 4.734000274085241e-06, + "loss": 0.289, + "step": 39177 + }, + { + "epoch": 2.1313890396093327, + "grad_norm": 5.44730806350708, + "learning_rate": 4.733863231464986e-06, + "loss": 0.1624, + "step": 39178 + }, + { + "epoch": 2.1314026044492675, + "grad_norm": 9.651453018188477, + "learning_rate": 4.733726188844731e-06, + "loss": 0.4268, + "step": 39179 + }, + { + "epoch": 2.1314161692892024, + "grad_norm": 4.308296203613281, + "learning_rate": 4.733589146224476e-06, + "loss": 0.1365, + "step": 39180 + }, + { + "epoch": 2.1314297341291373, + "grad_norm": 4.5257649421691895, + "learning_rate": 4.733452103604222e-06, + "loss": 0.1615, + "step": 39181 + }, + { + "epoch": 2.131443298969072, + "grad_norm": 5.104664325714111, + "learning_rate": 4.733315060983966e-06, + "loss": 0.2113, + "step": 39182 + }, + { + "epoch": 2.131456863809007, + "grad_norm": 8.557738304138184, + "learning_rate": 4.733178018363711e-06, + "loss": 0.3841, + "step": 39183 + }, + { + "epoch": 2.131470428648942, + "grad_norm": 7.6240010261535645, + "learning_rate": 4.733040975743456e-06, + "loss": 0.2185, + "step": 39184 + }, + { + "epoch": 2.1314839934888767, + "grad_norm": 4.958527565002441, + "learning_rate": 4.732903933123202e-06, + "loss": 0.1836, + "step": 39185 + }, + { + "epoch": 2.1314975583288116, + "grad_norm": 6.754525661468506, + "learning_rate": 4.732766890502947e-06, + "loss": 0.2553, + "step": 39186 + }, + { + "epoch": 2.1315111231687465, + "grad_norm": 6.638381481170654, + "learning_rate": 4.732629847882692e-06, + "loss": 0.2997, + "step": 39187 + }, + { + "epoch": 2.1315246880086813, + "grad_norm": 5.055485248565674, + "learning_rate": 4.732492805262437e-06, + "loss": 0.1592, + "step": 39188 + }, + { + "epoch": 2.131538252848616, + "grad_norm": 5.814815998077393, + "learning_rate": 4.732355762642182e-06, + "loss": 0.3188, + "step": 39189 + }, + { + "epoch": 2.1315518176885515, + "grad_norm": 6.382359027862549, + "learning_rate": 4.7322187200219275e-06, + "loss": 0.243, + "step": 39190 + }, + { + "epoch": 2.1315653825284864, + "grad_norm": 5.738276481628418, + "learning_rate": 4.732081677401673e-06, + "loss": 0.1823, + "step": 39191 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 3.306401252746582, + "learning_rate": 4.731944634781417e-06, + "loss": 0.0991, + "step": 39192 + }, + { + "epoch": 2.131592512208356, + "grad_norm": 7.395318984985352, + "learning_rate": 4.731807592161162e-06, + "loss": 0.291, + "step": 39193 + }, + { + "epoch": 2.131606077048291, + "grad_norm": 4.615084648132324, + "learning_rate": 4.731670549540907e-06, + "loss": 0.1462, + "step": 39194 + }, + { + "epoch": 2.131619641888226, + "grad_norm": 5.2219953536987305, + "learning_rate": 4.7315335069206525e-06, + "loss": 0.2846, + "step": 39195 + }, + { + "epoch": 2.1316332067281607, + "grad_norm": 5.319077014923096, + "learning_rate": 4.731396464300398e-06, + "loss": 0.1973, + "step": 39196 + }, + { + "epoch": 2.1316467715680956, + "grad_norm": 6.608949184417725, + "learning_rate": 4.731259421680143e-06, + "loss": 0.3267, + "step": 39197 + }, + { + "epoch": 2.1316603364080304, + "grad_norm": 4.797887802124023, + "learning_rate": 4.731122379059888e-06, + "loss": 0.141, + "step": 39198 + }, + { + "epoch": 2.1316739012479653, + "grad_norm": 13.441780090332031, + "learning_rate": 4.730985336439633e-06, + "loss": 0.1974, + "step": 39199 + }, + { + "epoch": 2.1316874660879, + "grad_norm": 5.233969211578369, + "learning_rate": 4.730848293819378e-06, + "loss": 0.2351, + "step": 39200 + }, + { + "epoch": 2.131701030927835, + "grad_norm": 5.514242172241211, + "learning_rate": 4.730711251199124e-06, + "loss": 0.2572, + "step": 39201 + }, + { + "epoch": 2.13171459576777, + "grad_norm": 4.426998138427734, + "learning_rate": 4.730574208578869e-06, + "loss": 0.1666, + "step": 39202 + }, + { + "epoch": 2.1317281606077048, + "grad_norm": 4.449156761169434, + "learning_rate": 4.730437165958613e-06, + "loss": 0.2311, + "step": 39203 + }, + { + "epoch": 2.1317417254476396, + "grad_norm": 4.224854946136475, + "learning_rate": 4.730300123338359e-06, + "loss": 0.1672, + "step": 39204 + }, + { + "epoch": 2.1317552902875745, + "grad_norm": 5.133009433746338, + "learning_rate": 4.7301630807181035e-06, + "loss": 0.2124, + "step": 39205 + }, + { + "epoch": 2.1317688551275094, + "grad_norm": 5.109390735626221, + "learning_rate": 4.730026038097849e-06, + "loss": 0.2459, + "step": 39206 + }, + { + "epoch": 2.131782419967444, + "grad_norm": 5.757480621337891, + "learning_rate": 4.729888995477594e-06, + "loss": 0.2119, + "step": 39207 + }, + { + "epoch": 2.131795984807379, + "grad_norm": 5.376547813415527, + "learning_rate": 4.729751952857339e-06, + "loss": 0.1942, + "step": 39208 + }, + { + "epoch": 2.131809549647314, + "grad_norm": 7.558896541595459, + "learning_rate": 4.729614910237084e-06, + "loss": 0.3212, + "step": 39209 + }, + { + "epoch": 2.131823114487249, + "grad_norm": 4.99198055267334, + "learning_rate": 4.729477867616829e-06, + "loss": 0.2045, + "step": 39210 + }, + { + "epoch": 2.131836679327184, + "grad_norm": 3.7999050617218018, + "learning_rate": 4.7293408249965745e-06, + "loss": 0.1149, + "step": 39211 + }, + { + "epoch": 2.131850244167119, + "grad_norm": 4.93734073638916, + "learning_rate": 4.729203782376319e-06, + "loss": 0.1895, + "step": 39212 + }, + { + "epoch": 2.131863809007054, + "grad_norm": 5.056652545928955, + "learning_rate": 4.729066739756065e-06, + "loss": 0.2533, + "step": 39213 + }, + { + "epoch": 2.1318773738469887, + "grad_norm": 5.657982349395752, + "learning_rate": 4.728929697135809e-06, + "loss": 0.2589, + "step": 39214 + }, + { + "epoch": 2.1318909386869236, + "grad_norm": 4.6113996505737305, + "learning_rate": 4.728792654515555e-06, + "loss": 0.1556, + "step": 39215 + }, + { + "epoch": 2.1319045035268585, + "grad_norm": 5.998659133911133, + "learning_rate": 4.7286556118953e-06, + "loss": 0.2258, + "step": 39216 + }, + { + "epoch": 2.1319180683667933, + "grad_norm": 4.336945056915283, + "learning_rate": 4.728518569275045e-06, + "loss": 0.1906, + "step": 39217 + }, + { + "epoch": 2.131931633206728, + "grad_norm": 4.914094924926758, + "learning_rate": 4.72838152665479e-06, + "loss": 0.1671, + "step": 39218 + }, + { + "epoch": 2.131945198046663, + "grad_norm": 5.4742231369018555, + "learning_rate": 4.728244484034535e-06, + "loss": 0.153, + "step": 39219 + }, + { + "epoch": 2.131958762886598, + "grad_norm": 3.4969842433929443, + "learning_rate": 4.72810744141428e-06, + "loss": 0.1572, + "step": 39220 + }, + { + "epoch": 2.131972327726533, + "grad_norm": 6.378020286560059, + "learning_rate": 4.7279703987940255e-06, + "loss": 0.2087, + "step": 39221 + }, + { + "epoch": 2.1319858925664676, + "grad_norm": 8.385404586791992, + "learning_rate": 4.727833356173771e-06, + "loss": 0.2971, + "step": 39222 + }, + { + "epoch": 2.1319994574064025, + "grad_norm": 4.790524482727051, + "learning_rate": 4.727696313553515e-06, + "loss": 0.1301, + "step": 39223 + }, + { + "epoch": 2.1320130222463374, + "grad_norm": 5.601432800292969, + "learning_rate": 4.727559270933261e-06, + "loss": 0.1269, + "step": 39224 + }, + { + "epoch": 2.1320265870862722, + "grad_norm": 4.979140281677246, + "learning_rate": 4.727422228313005e-06, + "loss": 0.2288, + "step": 39225 + }, + { + "epoch": 2.132040151926207, + "grad_norm": 5.410162925720215, + "learning_rate": 4.727285185692751e-06, + "loss": 0.1541, + "step": 39226 + }, + { + "epoch": 2.132053716766142, + "grad_norm": 6.47446346282959, + "learning_rate": 4.727148143072496e-06, + "loss": 0.2801, + "step": 39227 + }, + { + "epoch": 2.1320672816060773, + "grad_norm": 6.234609127044678, + "learning_rate": 4.727011100452241e-06, + "loss": 0.1978, + "step": 39228 + }, + { + "epoch": 2.132080846446012, + "grad_norm": 5.280906677246094, + "learning_rate": 4.726874057831986e-06, + "loss": 0.2209, + "step": 39229 + }, + { + "epoch": 2.132094411285947, + "grad_norm": 4.92662239074707, + "learning_rate": 4.726737015211731e-06, + "loss": 0.2607, + "step": 39230 + }, + { + "epoch": 2.132107976125882, + "grad_norm": 5.876946926116943, + "learning_rate": 4.7265999725914764e-06, + "loss": 0.2586, + "step": 39231 + }, + { + "epoch": 2.1321215409658167, + "grad_norm": 4.652374744415283, + "learning_rate": 4.726462929971222e-06, + "loss": 0.2029, + "step": 39232 + }, + { + "epoch": 2.1321351058057516, + "grad_norm": 6.1812357902526855, + "learning_rate": 4.726325887350967e-06, + "loss": 0.3386, + "step": 39233 + }, + { + "epoch": 2.1321486706456865, + "grad_norm": 5.54313850402832, + "learning_rate": 4.726188844730711e-06, + "loss": 0.1789, + "step": 39234 + }, + { + "epoch": 2.1321622354856213, + "grad_norm": 5.9092817306518555, + "learning_rate": 4.726051802110457e-06, + "loss": 0.2648, + "step": 39235 + }, + { + "epoch": 2.132175800325556, + "grad_norm": 5.7540602684021, + "learning_rate": 4.7259147594902015e-06, + "loss": 0.1748, + "step": 39236 + }, + { + "epoch": 2.132189365165491, + "grad_norm": 4.555816173553467, + "learning_rate": 4.725777716869947e-06, + "loss": 0.1608, + "step": 39237 + }, + { + "epoch": 2.132202930005426, + "grad_norm": 5.3593950271606445, + "learning_rate": 4.725640674249692e-06, + "loss": 0.2402, + "step": 39238 + }, + { + "epoch": 2.132216494845361, + "grad_norm": 4.849949359893799, + "learning_rate": 4.725503631629437e-06, + "loss": 0.2397, + "step": 39239 + }, + { + "epoch": 2.1322300596852957, + "grad_norm": 4.51143217086792, + "learning_rate": 4.725366589009182e-06, + "loss": 0.2039, + "step": 39240 + }, + { + "epoch": 2.1322436245252305, + "grad_norm": 6.774188041687012, + "learning_rate": 4.725229546388927e-06, + "loss": 0.2651, + "step": 39241 + }, + { + "epoch": 2.1322571893651654, + "grad_norm": 6.861362457275391, + "learning_rate": 4.7250925037686725e-06, + "loss": 0.1612, + "step": 39242 + }, + { + "epoch": 2.1322707542051003, + "grad_norm": 5.766853332519531, + "learning_rate": 4.724955461148418e-06, + "loss": 0.2479, + "step": 39243 + }, + { + "epoch": 2.132284319045035, + "grad_norm": 4.695346355438232, + "learning_rate": 4.724818418528163e-06, + "loss": 0.1451, + "step": 39244 + }, + { + "epoch": 2.13229788388497, + "grad_norm": 4.931161403656006, + "learning_rate": 4.724681375907908e-06, + "loss": 0.2916, + "step": 39245 + }, + { + "epoch": 2.132311448724905, + "grad_norm": 5.89555025100708, + "learning_rate": 4.724544333287652e-06, + "loss": 0.1892, + "step": 39246 + }, + { + "epoch": 2.1323250135648397, + "grad_norm": 6.523038864135742, + "learning_rate": 4.724407290667398e-06, + "loss": 0.1956, + "step": 39247 + }, + { + "epoch": 2.132338578404775, + "grad_norm": 9.758101463317871, + "learning_rate": 4.724270248047143e-06, + "loss": 0.3036, + "step": 39248 + }, + { + "epoch": 2.13235214324471, + "grad_norm": 4.938450336456299, + "learning_rate": 4.724133205426888e-06, + "loss": 0.1529, + "step": 39249 + }, + { + "epoch": 2.1323657080846448, + "grad_norm": 6.6674675941467285, + "learning_rate": 4.723996162806633e-06, + "loss": 0.1707, + "step": 39250 + }, + { + "epoch": 2.1323792729245796, + "grad_norm": 5.432708263397217, + "learning_rate": 4.723859120186378e-06, + "loss": 0.2296, + "step": 39251 + }, + { + "epoch": 2.1323928377645145, + "grad_norm": 8.425460815429688, + "learning_rate": 4.7237220775661235e-06, + "loss": 0.2482, + "step": 39252 + }, + { + "epoch": 2.1324064026044494, + "grad_norm": 5.269384384155273, + "learning_rate": 4.723585034945869e-06, + "loss": 0.1608, + "step": 39253 + }, + { + "epoch": 2.1324199674443842, + "grad_norm": 5.347888469696045, + "learning_rate": 4.723447992325614e-06, + "loss": 0.2723, + "step": 39254 + }, + { + "epoch": 2.132433532284319, + "grad_norm": 6.223490238189697, + "learning_rate": 4.723310949705359e-06, + "loss": 0.3702, + "step": 39255 + }, + { + "epoch": 2.132447097124254, + "grad_norm": 4.763641834259033, + "learning_rate": 4.723173907085104e-06, + "loss": 0.2063, + "step": 39256 + }, + { + "epoch": 2.132460661964189, + "grad_norm": 4.141750812530518, + "learning_rate": 4.7230368644648485e-06, + "loss": 0.1711, + "step": 39257 + }, + { + "epoch": 2.1324742268041237, + "grad_norm": 5.9433088302612305, + "learning_rate": 4.7228998218445946e-06, + "loss": 0.2454, + "step": 39258 + }, + { + "epoch": 2.1324877916440586, + "grad_norm": 5.3261847496032715, + "learning_rate": 4.722762779224339e-06, + "loss": 0.2621, + "step": 39259 + }, + { + "epoch": 2.1325013564839934, + "grad_norm": 5.392210960388184, + "learning_rate": 4.722625736604085e-06, + "loss": 0.2231, + "step": 39260 + }, + { + "epoch": 2.1325149213239283, + "grad_norm": 6.377073287963867, + "learning_rate": 4.722488693983829e-06, + "loss": 0.2749, + "step": 39261 + }, + { + "epoch": 2.132528486163863, + "grad_norm": 4.9209113121032715, + "learning_rate": 4.7223516513635744e-06, + "loss": 0.1474, + "step": 39262 + }, + { + "epoch": 2.132542051003798, + "grad_norm": 5.993046760559082, + "learning_rate": 4.72221460874332e-06, + "loss": 0.294, + "step": 39263 + }, + { + "epoch": 2.132555615843733, + "grad_norm": 6.211513996124268, + "learning_rate": 4.722077566123065e-06, + "loss": 0.2602, + "step": 39264 + }, + { + "epoch": 2.1325691806836677, + "grad_norm": 6.125889778137207, + "learning_rate": 4.72194052350281e-06, + "loss": 0.3186, + "step": 39265 + }, + { + "epoch": 2.132582745523603, + "grad_norm": 5.052390098571777, + "learning_rate": 4.721803480882554e-06, + "loss": 0.2475, + "step": 39266 + }, + { + "epoch": 2.132596310363538, + "grad_norm": 5.3175177574157715, + "learning_rate": 4.7216664382623e-06, + "loss": 0.3848, + "step": 39267 + }, + { + "epoch": 2.132609875203473, + "grad_norm": 4.79326057434082, + "learning_rate": 4.721529395642045e-06, + "loss": 0.2204, + "step": 39268 + }, + { + "epoch": 2.1326234400434076, + "grad_norm": 5.517817497253418, + "learning_rate": 4.721392353021791e-06, + "loss": 0.2853, + "step": 39269 + }, + { + "epoch": 2.1326370048833425, + "grad_norm": 4.358646392822266, + "learning_rate": 4.721255310401535e-06, + "loss": 0.1601, + "step": 39270 + }, + { + "epoch": 2.1326505697232774, + "grad_norm": 5.172481536865234, + "learning_rate": 4.72111826778128e-06, + "loss": 0.1942, + "step": 39271 + }, + { + "epoch": 2.1326641345632122, + "grad_norm": 4.415022850036621, + "learning_rate": 4.720981225161025e-06, + "loss": 0.2021, + "step": 39272 + }, + { + "epoch": 2.132677699403147, + "grad_norm": 4.962469577789307, + "learning_rate": 4.7208441825407706e-06, + "loss": 0.2191, + "step": 39273 + }, + { + "epoch": 2.132691264243082, + "grad_norm": 5.232339859008789, + "learning_rate": 4.720707139920516e-06, + "loss": 0.2605, + "step": 39274 + }, + { + "epoch": 2.132704829083017, + "grad_norm": 5.104884624481201, + "learning_rate": 4.720570097300261e-06, + "loss": 0.3171, + "step": 39275 + }, + { + "epoch": 2.1327183939229517, + "grad_norm": 4.323331832885742, + "learning_rate": 4.720433054680006e-06, + "loss": 0.1919, + "step": 39276 + }, + { + "epoch": 2.1327319587628866, + "grad_norm": 4.4760823249816895, + "learning_rate": 4.72029601205975e-06, + "loss": 0.1677, + "step": 39277 + }, + { + "epoch": 2.1327455236028214, + "grad_norm": 5.532690525054932, + "learning_rate": 4.7201589694394964e-06, + "loss": 0.3552, + "step": 39278 + }, + { + "epoch": 2.1327590884427563, + "grad_norm": 3.9776055812835693, + "learning_rate": 4.720021926819241e-06, + "loss": 0.123, + "step": 39279 + }, + { + "epoch": 2.132772653282691, + "grad_norm": 3.825740337371826, + "learning_rate": 4.719884884198987e-06, + "loss": 0.1441, + "step": 39280 + }, + { + "epoch": 2.132786218122626, + "grad_norm": 4.0026726722717285, + "learning_rate": 4.719747841578731e-06, + "loss": 0.1566, + "step": 39281 + }, + { + "epoch": 2.132799782962561, + "grad_norm": 6.578181743621826, + "learning_rate": 4.719610798958476e-06, + "loss": 0.2696, + "step": 39282 + }, + { + "epoch": 2.1328133478024958, + "grad_norm": 3.975844383239746, + "learning_rate": 4.7194737563382215e-06, + "loss": 0.206, + "step": 39283 + }, + { + "epoch": 2.1328269126424306, + "grad_norm": 4.929260730743408, + "learning_rate": 4.719336713717967e-06, + "loss": 0.2545, + "step": 39284 + }, + { + "epoch": 2.1328404774823655, + "grad_norm": 4.205083847045898, + "learning_rate": 4.719199671097712e-06, + "loss": 0.2142, + "step": 39285 + }, + { + "epoch": 2.132854042322301, + "grad_norm": 6.184087753295898, + "learning_rate": 4.719062628477457e-06, + "loss": 0.4184, + "step": 39286 + }, + { + "epoch": 2.1328676071622357, + "grad_norm": 4.348514556884766, + "learning_rate": 4.718925585857202e-06, + "loss": 0.189, + "step": 39287 + }, + { + "epoch": 2.1328811720021705, + "grad_norm": 4.753868103027344, + "learning_rate": 4.718788543236947e-06, + "loss": 0.2286, + "step": 39288 + }, + { + "epoch": 2.1328947368421054, + "grad_norm": 6.465132236480713, + "learning_rate": 4.7186515006166926e-06, + "loss": 0.1944, + "step": 39289 + }, + { + "epoch": 2.1329083016820403, + "grad_norm": 3.914299964904785, + "learning_rate": 4.718514457996437e-06, + "loss": 0.1332, + "step": 39290 + }, + { + "epoch": 2.132921866521975, + "grad_norm": 4.179393291473389, + "learning_rate": 4.718377415376182e-06, + "loss": 0.2226, + "step": 39291 + }, + { + "epoch": 2.13293543136191, + "grad_norm": 4.79816198348999, + "learning_rate": 4.718240372755927e-06, + "loss": 0.1456, + "step": 39292 + }, + { + "epoch": 2.132948996201845, + "grad_norm": 6.387495994567871, + "learning_rate": 4.7181033301356724e-06, + "loss": 0.3298, + "step": 39293 + }, + { + "epoch": 2.1329625610417797, + "grad_norm": 5.4849019050598145, + "learning_rate": 4.717966287515418e-06, + "loss": 0.164, + "step": 39294 + }, + { + "epoch": 2.1329761258817146, + "grad_norm": 5.512101173400879, + "learning_rate": 4.717829244895163e-06, + "loss": 0.24, + "step": 39295 + }, + { + "epoch": 2.1329896907216495, + "grad_norm": 4.2043538093566895, + "learning_rate": 4.717692202274908e-06, + "loss": 0.2661, + "step": 39296 + }, + { + "epoch": 2.1330032555615843, + "grad_norm": 6.2301435470581055, + "learning_rate": 4.717555159654653e-06, + "loss": 0.2465, + "step": 39297 + }, + { + "epoch": 2.133016820401519, + "grad_norm": 5.272871971130371, + "learning_rate": 4.717418117034398e-06, + "loss": 0.2534, + "step": 39298 + }, + { + "epoch": 2.133030385241454, + "grad_norm": 6.39685583114624, + "learning_rate": 4.7172810744141435e-06, + "loss": 0.228, + "step": 39299 + }, + { + "epoch": 2.133043950081389, + "grad_norm": 5.577810287475586, + "learning_rate": 4.717144031793888e-06, + "loss": 0.1624, + "step": 39300 + }, + { + "epoch": 2.133057514921324, + "grad_norm": 5.564451217651367, + "learning_rate": 4.717006989173634e-06, + "loss": 0.2287, + "step": 39301 + }, + { + "epoch": 2.1330710797612586, + "grad_norm": 4.329961776733398, + "learning_rate": 4.716869946553378e-06, + "loss": 0.249, + "step": 39302 + }, + { + "epoch": 2.1330846446011935, + "grad_norm": 6.5152387619018555, + "learning_rate": 4.716732903933123e-06, + "loss": 0.2483, + "step": 39303 + }, + { + "epoch": 2.133098209441129, + "grad_norm": 4.237961292266846, + "learning_rate": 4.7165958613128686e-06, + "loss": 0.1671, + "step": 39304 + }, + { + "epoch": 2.1331117742810637, + "grad_norm": 5.811878681182861, + "learning_rate": 4.716458818692614e-06, + "loss": 0.2013, + "step": 39305 + }, + { + "epoch": 2.1331253391209986, + "grad_norm": 8.380861282348633, + "learning_rate": 4.716321776072359e-06, + "loss": 0.3316, + "step": 39306 + }, + { + "epoch": 2.1331389039609334, + "grad_norm": 7.093493461608887, + "learning_rate": 4.716184733452104e-06, + "loss": 0.3488, + "step": 39307 + }, + { + "epoch": 2.1331524688008683, + "grad_norm": 5.253413200378418, + "learning_rate": 4.716047690831849e-06, + "loss": 0.1727, + "step": 39308 + }, + { + "epoch": 2.133166033640803, + "grad_norm": 4.782898902893066, + "learning_rate": 4.7159106482115945e-06, + "loss": 0.1136, + "step": 39309 + }, + { + "epoch": 2.133179598480738, + "grad_norm": 6.006448268890381, + "learning_rate": 4.71577360559134e-06, + "loss": 0.2659, + "step": 39310 + }, + { + "epoch": 2.133193163320673, + "grad_norm": 7.146973609924316, + "learning_rate": 4.715636562971084e-06, + "loss": 0.2245, + "step": 39311 + }, + { + "epoch": 2.1332067281606077, + "grad_norm": 5.57155704498291, + "learning_rate": 4.71549952035083e-06, + "loss": 0.2294, + "step": 39312 + }, + { + "epoch": 2.1332202930005426, + "grad_norm": 4.775512218475342, + "learning_rate": 4.715362477730574e-06, + "loss": 0.2329, + "step": 39313 + }, + { + "epoch": 2.1332338578404775, + "grad_norm": 3.94398832321167, + "learning_rate": 4.71522543511032e-06, + "loss": 0.1663, + "step": 39314 + }, + { + "epoch": 2.1332474226804123, + "grad_norm": 4.706301212310791, + "learning_rate": 4.715088392490065e-06, + "loss": 0.2033, + "step": 39315 + }, + { + "epoch": 2.133260987520347, + "grad_norm": 5.350060939788818, + "learning_rate": 4.71495134986981e-06, + "loss": 0.2237, + "step": 39316 + }, + { + "epoch": 2.133274552360282, + "grad_norm": 5.210677146911621, + "learning_rate": 4.714814307249555e-06, + "loss": 0.124, + "step": 39317 + }, + { + "epoch": 2.133288117200217, + "grad_norm": 4.744340419769287, + "learning_rate": 4.7146772646293e-06, + "loss": 0.1489, + "step": 39318 + }, + { + "epoch": 2.133301682040152, + "grad_norm": 5.125141620635986, + "learning_rate": 4.714540222009045e-06, + "loss": 0.2206, + "step": 39319 + }, + { + "epoch": 2.1333152468800867, + "grad_norm": 4.734259128570557, + "learning_rate": 4.71440317938879e-06, + "loss": 0.1694, + "step": 39320 + }, + { + "epoch": 2.1333288117200215, + "grad_norm": 4.90833854675293, + "learning_rate": 4.714266136768536e-06, + "loss": 0.1462, + "step": 39321 + }, + { + "epoch": 2.1333423765599564, + "grad_norm": 3.2778522968292236, + "learning_rate": 4.71412909414828e-06, + "loss": 0.1201, + "step": 39322 + }, + { + "epoch": 2.1333559413998913, + "grad_norm": 7.480743408203125, + "learning_rate": 4.713992051528026e-06, + "loss": 0.2635, + "step": 39323 + }, + { + "epoch": 2.1333695062398266, + "grad_norm": 5.226458549499512, + "learning_rate": 4.7138550089077704e-06, + "loss": 0.321, + "step": 39324 + }, + { + "epoch": 2.1333830710797614, + "grad_norm": 7.056634426116943, + "learning_rate": 4.713717966287516e-06, + "loss": 0.3202, + "step": 39325 + }, + { + "epoch": 2.1333966359196963, + "grad_norm": 5.37824010848999, + "learning_rate": 4.713580923667261e-06, + "loss": 0.2088, + "step": 39326 + }, + { + "epoch": 2.133410200759631, + "grad_norm": 5.110317230224609, + "learning_rate": 4.713443881047006e-06, + "loss": 0.2482, + "step": 39327 + }, + { + "epoch": 2.133423765599566, + "grad_norm": 3.514716625213623, + "learning_rate": 4.713306838426751e-06, + "loss": 0.1061, + "step": 39328 + }, + { + "epoch": 2.133437330439501, + "grad_norm": 3.9058523178100586, + "learning_rate": 4.713169795806496e-06, + "loss": 0.126, + "step": 39329 + }, + { + "epoch": 2.1334508952794358, + "grad_norm": 4.681609153747559, + "learning_rate": 4.7130327531862415e-06, + "loss": 0.2088, + "step": 39330 + }, + { + "epoch": 2.1334644601193706, + "grad_norm": 5.449758052825928, + "learning_rate": 4.712895710565986e-06, + "loss": 0.2621, + "step": 39331 + }, + { + "epoch": 2.1334780249593055, + "grad_norm": 4.3545098304748535, + "learning_rate": 4.712758667945732e-06, + "loss": 0.1935, + "step": 39332 + }, + { + "epoch": 2.1334915897992404, + "grad_norm": 3.9909937381744385, + "learning_rate": 4.712621625325476e-06, + "loss": 0.1668, + "step": 39333 + }, + { + "epoch": 2.1335051546391752, + "grad_norm": 4.351733684539795, + "learning_rate": 4.712484582705221e-06, + "loss": 0.188, + "step": 39334 + }, + { + "epoch": 2.13351871947911, + "grad_norm": 4.910131931304932, + "learning_rate": 4.7123475400849666e-06, + "loss": 0.16, + "step": 39335 + }, + { + "epoch": 2.133532284319045, + "grad_norm": 5.44927978515625, + "learning_rate": 4.712210497464712e-06, + "loss": 0.3058, + "step": 39336 + }, + { + "epoch": 2.13354584915898, + "grad_norm": 4.641313076019287, + "learning_rate": 4.712073454844457e-06, + "loss": 0.2377, + "step": 39337 + }, + { + "epoch": 2.1335594139989147, + "grad_norm": 3.935553789138794, + "learning_rate": 4.711936412224202e-06, + "loss": 0.1432, + "step": 39338 + }, + { + "epoch": 2.1335729788388496, + "grad_norm": 5.657815456390381, + "learning_rate": 4.711799369603947e-06, + "loss": 0.1991, + "step": 39339 + }, + { + "epoch": 2.1335865436787844, + "grad_norm": 4.6297502517700195, + "learning_rate": 4.7116623269836925e-06, + "loss": 0.1736, + "step": 39340 + }, + { + "epoch": 2.1336001085187193, + "grad_norm": 4.937187194824219, + "learning_rate": 4.711525284363438e-06, + "loss": 0.1934, + "step": 39341 + }, + { + "epoch": 2.1336136733586546, + "grad_norm": 4.418549537658691, + "learning_rate": 4.711388241743183e-06, + "loss": 0.0883, + "step": 39342 + }, + { + "epoch": 2.1336272381985895, + "grad_norm": 4.985091686248779, + "learning_rate": 4.711251199122928e-06, + "loss": 0.2193, + "step": 39343 + }, + { + "epoch": 2.1336408030385243, + "grad_norm": 5.05246639251709, + "learning_rate": 4.711114156502672e-06, + "loss": 0.2903, + "step": 39344 + }, + { + "epoch": 2.133654367878459, + "grad_norm": 5.216689109802246, + "learning_rate": 4.7109771138824175e-06, + "loss": 0.274, + "step": 39345 + }, + { + "epoch": 2.133667932718394, + "grad_norm": 4.6619720458984375, + "learning_rate": 4.710840071262163e-06, + "loss": 0.1897, + "step": 39346 + }, + { + "epoch": 2.133681497558329, + "grad_norm": 6.138566493988037, + "learning_rate": 4.710703028641908e-06, + "loss": 0.1959, + "step": 39347 + }, + { + "epoch": 2.133695062398264, + "grad_norm": 4.056803226470947, + "learning_rate": 4.710565986021653e-06, + "loss": 0.1758, + "step": 39348 + }, + { + "epoch": 2.1337086272381987, + "grad_norm": 5.41019344329834, + "learning_rate": 4.710428943401398e-06, + "loss": 0.2703, + "step": 39349 + }, + { + "epoch": 2.1337221920781335, + "grad_norm": 4.2645487785339355, + "learning_rate": 4.710291900781143e-06, + "loss": 0.1201, + "step": 39350 + }, + { + "epoch": 2.1337357569180684, + "grad_norm": 5.819480895996094, + "learning_rate": 4.7101548581608886e-06, + "loss": 0.2078, + "step": 39351 + }, + { + "epoch": 2.1337493217580032, + "grad_norm": 5.268866539001465, + "learning_rate": 4.710017815540634e-06, + "loss": 0.2413, + "step": 39352 + }, + { + "epoch": 2.133762886597938, + "grad_norm": 3.839381694793701, + "learning_rate": 4.709880772920379e-06, + "loss": 0.1412, + "step": 39353 + }, + { + "epoch": 2.133776451437873, + "grad_norm": 4.768510818481445, + "learning_rate": 4.709743730300123e-06, + "loss": 0.1473, + "step": 39354 + }, + { + "epoch": 2.133790016277808, + "grad_norm": 4.03552770614624, + "learning_rate": 4.709606687679869e-06, + "loss": 0.1832, + "step": 39355 + }, + { + "epoch": 2.1338035811177427, + "grad_norm": 4.844773292541504, + "learning_rate": 4.709469645059614e-06, + "loss": 0.1656, + "step": 39356 + }, + { + "epoch": 2.1338171459576776, + "grad_norm": 3.998826026916504, + "learning_rate": 4.70933260243936e-06, + "loss": 0.2032, + "step": 39357 + }, + { + "epoch": 2.1338307107976124, + "grad_norm": 5.060991287231445, + "learning_rate": 4.709195559819104e-06, + "loss": 0.1778, + "step": 39358 + }, + { + "epoch": 2.1338442756375473, + "grad_norm": 4.721539497375488, + "learning_rate": 4.709058517198849e-06, + "loss": 0.1914, + "step": 39359 + }, + { + "epoch": 2.133857840477482, + "grad_norm": 4.474964618682861, + "learning_rate": 4.708921474578594e-06, + "loss": 0.2346, + "step": 39360 + }, + { + "epoch": 2.133871405317417, + "grad_norm": 4.351300239562988, + "learning_rate": 4.7087844319583395e-06, + "loss": 0.1948, + "step": 39361 + }, + { + "epoch": 2.1338849701573523, + "grad_norm": 5.4508585929870605, + "learning_rate": 4.708647389338085e-06, + "loss": 0.2656, + "step": 39362 + }, + { + "epoch": 2.133898534997287, + "grad_norm": 5.18276834487915, + "learning_rate": 4.70851034671783e-06, + "loss": 0.1895, + "step": 39363 + }, + { + "epoch": 2.133912099837222, + "grad_norm": 3.9249377250671387, + "learning_rate": 4.708373304097575e-06, + "loss": 0.0951, + "step": 39364 + }, + { + "epoch": 2.133925664677157, + "grad_norm": 4.240599155426025, + "learning_rate": 4.708236261477319e-06, + "loss": 0.3169, + "step": 39365 + }, + { + "epoch": 2.133939229517092, + "grad_norm": 4.551672458648682, + "learning_rate": 4.708099218857065e-06, + "loss": 0.1956, + "step": 39366 + }, + { + "epoch": 2.1339527943570267, + "grad_norm": 4.775611400604248, + "learning_rate": 4.70796217623681e-06, + "loss": 0.2146, + "step": 39367 + }, + { + "epoch": 2.1339663591969615, + "grad_norm": 3.429368257522583, + "learning_rate": 4.707825133616556e-06, + "loss": 0.1886, + "step": 39368 + }, + { + "epoch": 2.1339799240368964, + "grad_norm": 7.163083553314209, + "learning_rate": 4.7076880909963e-06, + "loss": 0.2443, + "step": 39369 + }, + { + "epoch": 2.1339934888768313, + "grad_norm": 3.216697931289673, + "learning_rate": 4.707551048376045e-06, + "loss": 0.1112, + "step": 39370 + }, + { + "epoch": 2.134007053716766, + "grad_norm": 5.7762837409973145, + "learning_rate": 4.7074140057557905e-06, + "loss": 0.1533, + "step": 39371 + }, + { + "epoch": 2.134020618556701, + "grad_norm": 2.9494876861572266, + "learning_rate": 4.707276963135536e-06, + "loss": 0.1369, + "step": 39372 + }, + { + "epoch": 2.134034183396636, + "grad_norm": 4.2153449058532715, + "learning_rate": 4.707139920515281e-06, + "loss": 0.1534, + "step": 39373 + }, + { + "epoch": 2.1340477482365707, + "grad_norm": 3.945387601852417, + "learning_rate": 4.707002877895025e-06, + "loss": 0.1503, + "step": 39374 + }, + { + "epoch": 2.1340613130765056, + "grad_norm": 4.991672039031982, + "learning_rate": 4.706865835274771e-06, + "loss": 0.1989, + "step": 39375 + }, + { + "epoch": 2.1340748779164405, + "grad_norm": 3.4926679134368896, + "learning_rate": 4.7067287926545155e-06, + "loss": 0.1407, + "step": 39376 + }, + { + "epoch": 2.1340884427563753, + "grad_norm": 3.7012157440185547, + "learning_rate": 4.7065917500342615e-06, + "loss": 0.1298, + "step": 39377 + }, + { + "epoch": 2.13410200759631, + "grad_norm": 4.223743915557861, + "learning_rate": 4.706454707414006e-06, + "loss": 0.1538, + "step": 39378 + }, + { + "epoch": 2.134115572436245, + "grad_norm": 3.300875186920166, + "learning_rate": 4.706317664793751e-06, + "loss": 0.104, + "step": 39379 + }, + { + "epoch": 2.1341291372761804, + "grad_norm": 3.6598527431488037, + "learning_rate": 4.706180622173496e-06, + "loss": 0.1516, + "step": 39380 + }, + { + "epoch": 2.1341427021161152, + "grad_norm": 3.6633546352386475, + "learning_rate": 4.706043579553241e-06, + "loss": 0.1438, + "step": 39381 + }, + { + "epoch": 2.13415626695605, + "grad_norm": 3.5751426219940186, + "learning_rate": 4.705906536932987e-06, + "loss": 0.113, + "step": 39382 + }, + { + "epoch": 2.134169831795985, + "grad_norm": 2.828660249710083, + "learning_rate": 4.705769494312732e-06, + "loss": 0.0861, + "step": 39383 + }, + { + "epoch": 2.13418339663592, + "grad_norm": 4.074896812438965, + "learning_rate": 4.705632451692477e-06, + "loss": 0.1584, + "step": 39384 + }, + { + "epoch": 2.1341969614758547, + "grad_norm": 2.670382499694824, + "learning_rate": 4.705495409072221e-06, + "loss": 0.1084, + "step": 39385 + }, + { + "epoch": 2.1342105263157896, + "grad_norm": 3.7416133880615234, + "learning_rate": 4.705358366451967e-06, + "loss": 0.0868, + "step": 39386 + }, + { + "epoch": 2.1342240911557244, + "grad_norm": 4.12455940246582, + "learning_rate": 4.705221323831712e-06, + "loss": 0.1773, + "step": 39387 + }, + { + "epoch": 2.1342376559956593, + "grad_norm": 4.1456780433654785, + "learning_rate": 4.705084281211457e-06, + "loss": 0.12, + "step": 39388 + }, + { + "epoch": 2.134251220835594, + "grad_norm": 3.9337542057037354, + "learning_rate": 4.704947238591202e-06, + "loss": 0.1972, + "step": 39389 + }, + { + "epoch": 2.134264785675529, + "grad_norm": 6.001008987426758, + "learning_rate": 4.704810195970947e-06, + "loss": 0.2864, + "step": 39390 + }, + { + "epoch": 2.134278350515464, + "grad_norm": 6.1529974937438965, + "learning_rate": 4.704673153350692e-06, + "loss": 0.196, + "step": 39391 + }, + { + "epoch": 2.1342919153553987, + "grad_norm": 3.1977756023406982, + "learning_rate": 4.7045361107304375e-06, + "loss": 0.0662, + "step": 39392 + }, + { + "epoch": 2.1343054801953336, + "grad_norm": 3.7174134254455566, + "learning_rate": 4.704399068110183e-06, + "loss": 0.1484, + "step": 39393 + }, + { + "epoch": 2.1343190450352685, + "grad_norm": 4.170144557952881, + "learning_rate": 4.704262025489928e-06, + "loss": 0.1808, + "step": 39394 + }, + { + "epoch": 2.1343326098752033, + "grad_norm": 3.2668066024780273, + "learning_rate": 4.704124982869673e-06, + "loss": 0.0838, + "step": 39395 + }, + { + "epoch": 2.134346174715138, + "grad_norm": 3.263587474822998, + "learning_rate": 4.703987940249418e-06, + "loss": 0.0772, + "step": 39396 + }, + { + "epoch": 2.134359739555073, + "grad_norm": 4.183269500732422, + "learning_rate": 4.703850897629163e-06, + "loss": 0.2045, + "step": 39397 + }, + { + "epoch": 2.134373304395008, + "grad_norm": 4.262491703033447, + "learning_rate": 4.703713855008909e-06, + "loss": 0.1197, + "step": 39398 + }, + { + "epoch": 2.134386869234943, + "grad_norm": 3.338911294937134, + "learning_rate": 4.703576812388653e-06, + "loss": 0.0832, + "step": 39399 + }, + { + "epoch": 2.134400434074878, + "grad_norm": 4.8534440994262695, + "learning_rate": 4.703439769768398e-06, + "loss": 0.1233, + "step": 39400 + }, + { + "epoch": 2.134413998914813, + "grad_norm": 4.591546058654785, + "learning_rate": 4.703302727148143e-06, + "loss": 0.16, + "step": 39401 + }, + { + "epoch": 2.134427563754748, + "grad_norm": 4.068662643432617, + "learning_rate": 4.7031656845278885e-06, + "loss": 0.1393, + "step": 39402 + }, + { + "epoch": 2.1344411285946827, + "grad_norm": 4.347909450531006, + "learning_rate": 4.703028641907634e-06, + "loss": 0.1143, + "step": 39403 + }, + { + "epoch": 2.1344546934346176, + "grad_norm": 3.2814559936523438, + "learning_rate": 4.702891599287379e-06, + "loss": 0.1164, + "step": 39404 + }, + { + "epoch": 2.1344682582745524, + "grad_norm": 4.478135108947754, + "learning_rate": 4.702754556667124e-06, + "loss": 0.1633, + "step": 39405 + }, + { + "epoch": 2.1344818231144873, + "grad_norm": 4.025120258331299, + "learning_rate": 4.702617514046869e-06, + "loss": 0.1433, + "step": 39406 + }, + { + "epoch": 2.134495387954422, + "grad_norm": 5.134811878204346, + "learning_rate": 4.702480471426614e-06, + "loss": 0.1977, + "step": 39407 + }, + { + "epoch": 2.134508952794357, + "grad_norm": 5.028250694274902, + "learning_rate": 4.702343428806359e-06, + "loss": 0.2689, + "step": 39408 + }, + { + "epoch": 2.134522517634292, + "grad_norm": 3.146829843521118, + "learning_rate": 4.702206386186105e-06, + "loss": 0.1042, + "step": 39409 + }, + { + "epoch": 2.1345360824742268, + "grad_norm": 4.217370986938477, + "learning_rate": 4.702069343565849e-06, + "loss": 0.1242, + "step": 39410 + }, + { + "epoch": 2.1345496473141616, + "grad_norm": 3.4323790073394775, + "learning_rate": 4.701932300945595e-06, + "loss": 0.0826, + "step": 39411 + }, + { + "epoch": 2.1345632121540965, + "grad_norm": 3.9296441078186035, + "learning_rate": 4.701795258325339e-06, + "loss": 0.1264, + "step": 39412 + }, + { + "epoch": 2.1345767769940314, + "grad_norm": 4.492713928222656, + "learning_rate": 4.701658215705085e-06, + "loss": 0.1405, + "step": 39413 + }, + { + "epoch": 2.1345903418339662, + "grad_norm": 4.651558876037598, + "learning_rate": 4.70152117308483e-06, + "loss": 0.0894, + "step": 39414 + }, + { + "epoch": 2.134603906673901, + "grad_norm": 3.37571382522583, + "learning_rate": 4.701384130464575e-06, + "loss": 0.0921, + "step": 39415 + }, + { + "epoch": 2.134617471513836, + "grad_norm": 3.7228293418884277, + "learning_rate": 4.70124708784432e-06, + "loss": 0.1713, + "step": 39416 + }, + { + "epoch": 2.134631036353771, + "grad_norm": 6.880213260650635, + "learning_rate": 4.701110045224065e-06, + "loss": 0.1842, + "step": 39417 + }, + { + "epoch": 2.134644601193706, + "grad_norm": 5.32509183883667, + "learning_rate": 4.7009730026038105e-06, + "loss": 0.1703, + "step": 39418 + }, + { + "epoch": 2.134658166033641, + "grad_norm": 4.579054355621338, + "learning_rate": 4.700835959983555e-06, + "loss": 0.1122, + "step": 39419 + }, + { + "epoch": 2.134671730873576, + "grad_norm": 3.470999240875244, + "learning_rate": 4.700698917363301e-06, + "loss": 0.074, + "step": 39420 + }, + { + "epoch": 2.1346852957135107, + "grad_norm": 4.898961067199707, + "learning_rate": 4.700561874743045e-06, + "loss": 0.182, + "step": 39421 + }, + { + "epoch": 2.1346988605534456, + "grad_norm": 5.526978492736816, + "learning_rate": 4.700424832122791e-06, + "loss": 0.1711, + "step": 39422 + }, + { + "epoch": 2.1347124253933805, + "grad_norm": 3.37899112701416, + "learning_rate": 4.7002877895025355e-06, + "loss": 0.124, + "step": 39423 + }, + { + "epoch": 2.1347259902333153, + "grad_norm": 4.084136009216309, + "learning_rate": 4.700150746882281e-06, + "loss": 0.1344, + "step": 39424 + }, + { + "epoch": 2.13473955507325, + "grad_norm": 4.95145845413208, + "learning_rate": 4.700013704262026e-06, + "loss": 0.15, + "step": 39425 + }, + { + "epoch": 2.134753119913185, + "grad_norm": 4.766386985778809, + "learning_rate": 4.699876661641771e-06, + "loss": 0.1671, + "step": 39426 + }, + { + "epoch": 2.13476668475312, + "grad_norm": 4.352990627288818, + "learning_rate": 4.699739619021516e-06, + "loss": 0.1797, + "step": 39427 + }, + { + "epoch": 2.134780249593055, + "grad_norm": 3.7740864753723145, + "learning_rate": 4.6996025764012606e-06, + "loss": 0.1118, + "step": 39428 + }, + { + "epoch": 2.1347938144329897, + "grad_norm": 4.5047454833984375, + "learning_rate": 4.699465533781007e-06, + "loss": 0.0974, + "step": 39429 + }, + { + "epoch": 2.1348073792729245, + "grad_norm": 4.840893268585205, + "learning_rate": 4.699328491160751e-06, + "loss": 0.1678, + "step": 39430 + }, + { + "epoch": 2.1348209441128594, + "grad_norm": 4.991312503814697, + "learning_rate": 4.699191448540497e-06, + "loss": 0.231, + "step": 39431 + }, + { + "epoch": 2.1348345089527943, + "grad_norm": 4.3159589767456055, + "learning_rate": 4.699054405920241e-06, + "loss": 0.2128, + "step": 39432 + }, + { + "epoch": 2.134848073792729, + "grad_norm": 3.7029004096984863, + "learning_rate": 4.6989173632999865e-06, + "loss": 0.1256, + "step": 39433 + }, + { + "epoch": 2.134861638632664, + "grad_norm": 4.664103984832764, + "learning_rate": 4.698780320679732e-06, + "loss": 0.1938, + "step": 39434 + }, + { + "epoch": 2.134875203472599, + "grad_norm": 6.182664394378662, + "learning_rate": 4.698643278059477e-06, + "loss": 0.192, + "step": 39435 + }, + { + "epoch": 2.1348887683125337, + "grad_norm": 5.529041767120361, + "learning_rate": 4.698506235439222e-06, + "loss": 0.2115, + "step": 39436 + }, + { + "epoch": 2.1349023331524686, + "grad_norm": 4.741101264953613, + "learning_rate": 4.698369192818967e-06, + "loss": 0.154, + "step": 39437 + }, + { + "epoch": 2.134915897992404, + "grad_norm": 3.333867073059082, + "learning_rate": 4.698232150198712e-06, + "loss": 0.1214, + "step": 39438 + }, + { + "epoch": 2.1349294628323388, + "grad_norm": 3.5239956378936768, + "learning_rate": 4.6980951075784575e-06, + "loss": 0.1398, + "step": 39439 + }, + { + "epoch": 2.1349430276722736, + "grad_norm": 4.684816360473633, + "learning_rate": 4.697958064958203e-06, + "loss": 0.2168, + "step": 39440 + }, + { + "epoch": 2.1349565925122085, + "grad_norm": 5.661899566650391, + "learning_rate": 4.697821022337947e-06, + "loss": 0.2498, + "step": 39441 + }, + { + "epoch": 2.1349701573521433, + "grad_norm": 4.637606143951416, + "learning_rate": 4.697683979717692e-06, + "loss": 0.1766, + "step": 39442 + }, + { + "epoch": 2.134983722192078, + "grad_norm": 6.261781692504883, + "learning_rate": 4.697546937097437e-06, + "loss": 0.2512, + "step": 39443 + }, + { + "epoch": 2.134997287032013, + "grad_norm": 4.83001708984375, + "learning_rate": 4.697409894477183e-06, + "loss": 0.223, + "step": 39444 + }, + { + "epoch": 2.135010851871948, + "grad_norm": 5.648027420043945, + "learning_rate": 4.697272851856928e-06, + "loss": 0.2522, + "step": 39445 + }, + { + "epoch": 2.135024416711883, + "grad_norm": 5.3843092918396, + "learning_rate": 4.697135809236673e-06, + "loss": 0.2591, + "step": 39446 + }, + { + "epoch": 2.1350379815518177, + "grad_norm": 4.994331359863281, + "learning_rate": 4.696998766616418e-06, + "loss": 0.3116, + "step": 39447 + }, + { + "epoch": 2.1350515463917525, + "grad_norm": 5.977832317352295, + "learning_rate": 4.696861723996163e-06, + "loss": 0.2312, + "step": 39448 + }, + { + "epoch": 2.1350651112316874, + "grad_norm": 6.052032470703125, + "learning_rate": 4.6967246813759085e-06, + "loss": 0.2459, + "step": 39449 + }, + { + "epoch": 2.1350786760716223, + "grad_norm": 4.265114784240723, + "learning_rate": 4.696587638755654e-06, + "loss": 0.1187, + "step": 39450 + }, + { + "epoch": 2.135092240911557, + "grad_norm": 6.264305114746094, + "learning_rate": 4.696450596135399e-06, + "loss": 0.1512, + "step": 39451 + }, + { + "epoch": 2.135105805751492, + "grad_norm": 3.80600643157959, + "learning_rate": 4.696313553515144e-06, + "loss": 0.1571, + "step": 39452 + }, + { + "epoch": 2.135119370591427, + "grad_norm": 5.04330587387085, + "learning_rate": 4.696176510894888e-06, + "loss": 0.1639, + "step": 39453 + }, + { + "epoch": 2.1351329354313617, + "grad_norm": 6.604215145111084, + "learning_rate": 4.6960394682746335e-06, + "loss": 0.3169, + "step": 39454 + }, + { + "epoch": 2.1351465002712966, + "grad_norm": 4.703345775604248, + "learning_rate": 4.695902425654379e-06, + "loss": 0.2282, + "step": 39455 + }, + { + "epoch": 2.135160065111232, + "grad_norm": 8.277692794799805, + "learning_rate": 4.695765383034124e-06, + "loss": 0.4414, + "step": 39456 + }, + { + "epoch": 2.1351736299511668, + "grad_norm": 4.917994976043701, + "learning_rate": 4.695628340413869e-06, + "loss": 0.1901, + "step": 39457 + }, + { + "epoch": 2.1351871947911016, + "grad_norm": 6.756119251251221, + "learning_rate": 4.695491297793614e-06, + "loss": 0.1592, + "step": 39458 + }, + { + "epoch": 2.1352007596310365, + "grad_norm": 4.623398780822754, + "learning_rate": 4.6953542551733594e-06, + "loss": 0.1484, + "step": 39459 + }, + { + "epoch": 2.1352143244709714, + "grad_norm": 3.991410493850708, + "learning_rate": 4.695217212553105e-06, + "loss": 0.2138, + "step": 39460 + }, + { + "epoch": 2.1352278893109062, + "grad_norm": 4.0017595291137695, + "learning_rate": 4.69508016993285e-06, + "loss": 0.1254, + "step": 39461 + }, + { + "epoch": 2.135241454150841, + "grad_norm": 3.3708884716033936, + "learning_rate": 4.694943127312594e-06, + "loss": 0.0931, + "step": 39462 + }, + { + "epoch": 2.135255018990776, + "grad_norm": 4.814582347869873, + "learning_rate": 4.69480608469234e-06, + "loss": 0.2485, + "step": 39463 + }, + { + "epoch": 2.135268583830711, + "grad_norm": 4.495603084564209, + "learning_rate": 4.6946690420720845e-06, + "loss": 0.3239, + "step": 39464 + }, + { + "epoch": 2.1352821486706457, + "grad_norm": 4.937157154083252, + "learning_rate": 4.6945319994518305e-06, + "loss": 0.1668, + "step": 39465 + }, + { + "epoch": 2.1352957135105806, + "grad_norm": 3.236937999725342, + "learning_rate": 4.694394956831575e-06, + "loss": 0.2132, + "step": 39466 + }, + { + "epoch": 2.1353092783505154, + "grad_norm": 4.439558029174805, + "learning_rate": 4.69425791421132e-06, + "loss": 0.1104, + "step": 39467 + }, + { + "epoch": 2.1353228431904503, + "grad_norm": 5.727428436279297, + "learning_rate": 4.694120871591065e-06, + "loss": 0.2059, + "step": 39468 + }, + { + "epoch": 2.135336408030385, + "grad_norm": 5.406670570373535, + "learning_rate": 4.69398382897081e-06, + "loss": 0.2217, + "step": 39469 + }, + { + "epoch": 2.13534997287032, + "grad_norm": 3.501260995864868, + "learning_rate": 4.6938467863505555e-06, + "loss": 0.1963, + "step": 39470 + }, + { + "epoch": 2.135363537710255, + "grad_norm": 4.132650375366211, + "learning_rate": 4.693709743730301e-06, + "loss": 0.1604, + "step": 39471 + }, + { + "epoch": 2.1353771025501898, + "grad_norm": 3.986145496368408, + "learning_rate": 4.693572701110046e-06, + "loss": 0.1603, + "step": 39472 + }, + { + "epoch": 2.1353906673901246, + "grad_norm": 5.149275779724121, + "learning_rate": 4.69343565848979e-06, + "loss": 0.1746, + "step": 39473 + }, + { + "epoch": 2.1354042322300595, + "grad_norm": 5.716921806335449, + "learning_rate": 4.693298615869536e-06, + "loss": 0.2598, + "step": 39474 + }, + { + "epoch": 2.1354177970699944, + "grad_norm": 5.366812229156494, + "learning_rate": 4.693161573249281e-06, + "loss": 0.2426, + "step": 39475 + }, + { + "epoch": 2.1354313619099297, + "grad_norm": 4.662395477294922, + "learning_rate": 4.693024530629026e-06, + "loss": 0.1953, + "step": 39476 + }, + { + "epoch": 2.1354449267498645, + "grad_norm": 5.729886531829834, + "learning_rate": 4.692887488008771e-06, + "loss": 0.3559, + "step": 39477 + }, + { + "epoch": 2.1354584915897994, + "grad_norm": 5.566556453704834, + "learning_rate": 4.692750445388516e-06, + "loss": 0.3631, + "step": 39478 + }, + { + "epoch": 2.1354720564297343, + "grad_norm": 3.7783210277557373, + "learning_rate": 4.692613402768261e-06, + "loss": 0.1837, + "step": 39479 + }, + { + "epoch": 2.135485621269669, + "grad_norm": 5.087238311767578, + "learning_rate": 4.6924763601480065e-06, + "loss": 0.3802, + "step": 39480 + }, + { + "epoch": 2.135499186109604, + "grad_norm": 3.929727554321289, + "learning_rate": 4.692339317527752e-06, + "loss": 0.2478, + "step": 39481 + }, + { + "epoch": 2.135512750949539, + "grad_norm": 4.258079528808594, + "learning_rate": 4.692202274907496e-06, + "loss": 0.2155, + "step": 39482 + }, + { + "epoch": 2.1355263157894737, + "grad_norm": 6.102311611175537, + "learning_rate": 4.692065232287242e-06, + "loss": 0.3648, + "step": 39483 + }, + { + "epoch": 2.1355398806294086, + "grad_norm": 5.687553882598877, + "learning_rate": 4.691928189666986e-06, + "loss": 0.333, + "step": 39484 + }, + { + "epoch": 2.1355534454693434, + "grad_norm": 4.889374732971191, + "learning_rate": 4.691791147046732e-06, + "loss": 0.1917, + "step": 39485 + }, + { + "epoch": 2.1355670103092783, + "grad_norm": 4.578693866729736, + "learning_rate": 4.691654104426477e-06, + "loss": 0.294, + "step": 39486 + }, + { + "epoch": 2.135580575149213, + "grad_norm": 4.571516990661621, + "learning_rate": 4.691517061806222e-06, + "loss": 0.2599, + "step": 39487 + }, + { + "epoch": 2.135594139989148, + "grad_norm": 4.961457252502441, + "learning_rate": 4.691380019185967e-06, + "loss": 0.218, + "step": 39488 + }, + { + "epoch": 2.135607704829083, + "grad_norm": 4.590553283691406, + "learning_rate": 4.691242976565712e-06, + "loss": 0.2419, + "step": 39489 + }, + { + "epoch": 2.1356212696690178, + "grad_norm": 7.299637317657471, + "learning_rate": 4.6911059339454574e-06, + "loss": 0.3065, + "step": 39490 + }, + { + "epoch": 2.1356348345089526, + "grad_norm": 4.53773832321167, + "learning_rate": 4.690968891325203e-06, + "loss": 0.15, + "step": 39491 + }, + { + "epoch": 2.1356483993488875, + "grad_norm": 3.944437265396118, + "learning_rate": 4.690831848704948e-06, + "loss": 0.2483, + "step": 39492 + }, + { + "epoch": 2.1356619641888224, + "grad_norm": 4.714157581329346, + "learning_rate": 4.690694806084693e-06, + "loss": 0.2229, + "step": 39493 + }, + { + "epoch": 2.1356755290287577, + "grad_norm": 5.713501930236816, + "learning_rate": 4.690557763464438e-06, + "loss": 0.293, + "step": 39494 + }, + { + "epoch": 2.1356890938686925, + "grad_norm": 3.164090156555176, + "learning_rate": 4.6904207208441825e-06, + "loss": 0.1769, + "step": 39495 + }, + { + "epoch": 2.1357026587086274, + "grad_norm": 6.2137227058410645, + "learning_rate": 4.690283678223928e-06, + "loss": 0.3168, + "step": 39496 + }, + { + "epoch": 2.1357162235485623, + "grad_norm": 3.957505702972412, + "learning_rate": 4.690146635603673e-06, + "loss": 0.1376, + "step": 39497 + }, + { + "epoch": 2.135729788388497, + "grad_norm": 5.964221954345703, + "learning_rate": 4.690009592983418e-06, + "loss": 0.3454, + "step": 39498 + }, + { + "epoch": 2.135743353228432, + "grad_norm": 3.822455406188965, + "learning_rate": 4.689872550363163e-06, + "loss": 0.1896, + "step": 39499 + }, + { + "epoch": 2.135756918068367, + "grad_norm": 4.481440544128418, + "learning_rate": 4.689735507742908e-06, + "loss": 0.1642, + "step": 39500 + }, + { + "epoch": 2.1357704829083017, + "grad_norm": 5.727396488189697, + "learning_rate": 4.6895984651226535e-06, + "loss": 0.2801, + "step": 39501 + }, + { + "epoch": 2.1357840477482366, + "grad_norm": 5.713456630706787, + "learning_rate": 4.689461422502399e-06, + "loss": 0.3306, + "step": 39502 + }, + { + "epoch": 2.1357976125881715, + "grad_norm": 4.897424221038818, + "learning_rate": 4.689324379882144e-06, + "loss": 0.2351, + "step": 39503 + }, + { + "epoch": 2.1358111774281063, + "grad_norm": 5.53848934173584, + "learning_rate": 4.689187337261889e-06, + "loss": 0.2329, + "step": 39504 + }, + { + "epoch": 2.135824742268041, + "grad_norm": 4.640767574310303, + "learning_rate": 4.689050294641634e-06, + "loss": 0.1744, + "step": 39505 + }, + { + "epoch": 2.135838307107976, + "grad_norm": 4.15962553024292, + "learning_rate": 4.6889132520213794e-06, + "loss": 0.189, + "step": 39506 + }, + { + "epoch": 2.135851871947911, + "grad_norm": 7.0078816413879395, + "learning_rate": 4.688776209401124e-06, + "loss": 0.2281, + "step": 39507 + }, + { + "epoch": 2.135865436787846, + "grad_norm": 4.370272636413574, + "learning_rate": 4.68863916678087e-06, + "loss": 0.224, + "step": 39508 + }, + { + "epoch": 2.1358790016277807, + "grad_norm": 5.81068229675293, + "learning_rate": 4.688502124160614e-06, + "loss": 0.2047, + "step": 39509 + }, + { + "epoch": 2.1358925664677155, + "grad_norm": 4.813228607177734, + "learning_rate": 4.688365081540359e-06, + "loss": 0.24, + "step": 39510 + }, + { + "epoch": 2.1359061313076504, + "grad_norm": 4.366236209869385, + "learning_rate": 4.6882280389201045e-06, + "loss": 0.1711, + "step": 39511 + }, + { + "epoch": 2.1359196961475853, + "grad_norm": 4.4296064376831055, + "learning_rate": 4.68809099629985e-06, + "loss": 0.2387, + "step": 39512 + }, + { + "epoch": 2.13593326098752, + "grad_norm": 7.292909145355225, + "learning_rate": 4.687953953679595e-06, + "loss": 0.3726, + "step": 39513 + }, + { + "epoch": 2.1359468258274554, + "grad_norm": 5.842182159423828, + "learning_rate": 4.68781691105934e-06, + "loss": 0.2874, + "step": 39514 + }, + { + "epoch": 2.1359603906673903, + "grad_norm": 5.19052267074585, + "learning_rate": 4.687679868439085e-06, + "loss": 0.2168, + "step": 39515 + }, + { + "epoch": 2.135973955507325, + "grad_norm": 6.540343284606934, + "learning_rate": 4.6875428258188295e-06, + "loss": 0.2201, + "step": 39516 + }, + { + "epoch": 2.13598752034726, + "grad_norm": 5.535318851470947, + "learning_rate": 4.6874057831985756e-06, + "loss": 0.2537, + "step": 39517 + }, + { + "epoch": 2.136001085187195, + "grad_norm": 3.9503886699676514, + "learning_rate": 4.68726874057832e-06, + "loss": 0.1884, + "step": 39518 + }, + { + "epoch": 2.1360146500271298, + "grad_norm": 4.1209869384765625, + "learning_rate": 4.687131697958066e-06, + "loss": 0.1964, + "step": 39519 + }, + { + "epoch": 2.1360282148670646, + "grad_norm": 3.5381593704223633, + "learning_rate": 4.68699465533781e-06, + "loss": 0.1583, + "step": 39520 + }, + { + "epoch": 2.1360417797069995, + "grad_norm": 5.396389484405518, + "learning_rate": 4.6868576127175554e-06, + "loss": 0.1688, + "step": 39521 + }, + { + "epoch": 2.1360553445469344, + "grad_norm": 4.323113918304443, + "learning_rate": 4.686720570097301e-06, + "loss": 0.1885, + "step": 39522 + }, + { + "epoch": 2.136068909386869, + "grad_norm": 4.032291412353516, + "learning_rate": 4.686583527477046e-06, + "loss": 0.1193, + "step": 39523 + }, + { + "epoch": 2.136082474226804, + "grad_norm": 4.087736129760742, + "learning_rate": 4.686446484856791e-06, + "loss": 0.177, + "step": 39524 + }, + { + "epoch": 2.136096039066739, + "grad_norm": 3.849843740463257, + "learning_rate": 4.686309442236535e-06, + "loss": 0.145, + "step": 39525 + }, + { + "epoch": 2.136109603906674, + "grad_norm": 6.2849650382995605, + "learning_rate": 4.686172399616281e-06, + "loss": 0.2748, + "step": 39526 + }, + { + "epoch": 2.1361231687466087, + "grad_norm": 5.206395149230957, + "learning_rate": 4.686035356996026e-06, + "loss": 0.1811, + "step": 39527 + }, + { + "epoch": 2.1361367335865435, + "grad_norm": 4.463113784790039, + "learning_rate": 4.685898314375772e-06, + "loss": 0.1982, + "step": 39528 + }, + { + "epoch": 2.1361502984264784, + "grad_norm": 4.338145732879639, + "learning_rate": 4.685761271755516e-06, + "loss": 0.1904, + "step": 39529 + }, + { + "epoch": 2.1361638632664133, + "grad_norm": 5.670327663421631, + "learning_rate": 4.685624229135261e-06, + "loss": 0.3009, + "step": 39530 + }, + { + "epoch": 2.136177428106348, + "grad_norm": 3.3608059883117676, + "learning_rate": 4.685487186515006e-06, + "loss": 0.125, + "step": 39531 + }, + { + "epoch": 2.1361909929462835, + "grad_norm": 6.679071426391602, + "learning_rate": 4.6853501438947516e-06, + "loss": 0.236, + "step": 39532 + }, + { + "epoch": 2.1362045577862183, + "grad_norm": 7.260655879974365, + "learning_rate": 4.685213101274497e-06, + "loss": 0.226, + "step": 39533 + }, + { + "epoch": 2.136218122626153, + "grad_norm": 4.4081010818481445, + "learning_rate": 4.685076058654242e-06, + "loss": 0.1791, + "step": 39534 + }, + { + "epoch": 2.136231687466088, + "grad_norm": 4.7928147315979, + "learning_rate": 4.684939016033987e-06, + "loss": 0.1878, + "step": 39535 + }, + { + "epoch": 2.136245252306023, + "grad_norm": 5.246668338775635, + "learning_rate": 4.684801973413732e-06, + "loss": 0.1951, + "step": 39536 + }, + { + "epoch": 2.1362588171459578, + "grad_norm": 2.8447415828704834, + "learning_rate": 4.6846649307934774e-06, + "loss": 0.1195, + "step": 39537 + }, + { + "epoch": 2.1362723819858926, + "grad_norm": 3.5536813735961914, + "learning_rate": 4.684527888173222e-06, + "loss": 0.1574, + "step": 39538 + }, + { + "epoch": 2.1362859468258275, + "grad_norm": 5.157439708709717, + "learning_rate": 4.684390845552968e-06, + "loss": 0.2136, + "step": 39539 + }, + { + "epoch": 2.1362995116657624, + "grad_norm": 5.928327560424805, + "learning_rate": 4.684253802932712e-06, + "loss": 0.2816, + "step": 39540 + }, + { + "epoch": 2.1363130765056972, + "grad_norm": 4.4498748779296875, + "learning_rate": 4.684116760312457e-06, + "loss": 0.1979, + "step": 39541 + }, + { + "epoch": 2.136326641345632, + "grad_norm": 4.655186176300049, + "learning_rate": 4.6839797176922025e-06, + "loss": 0.2137, + "step": 39542 + }, + { + "epoch": 2.136340206185567, + "grad_norm": 3.9826815128326416, + "learning_rate": 4.683842675071948e-06, + "loss": 0.1588, + "step": 39543 + }, + { + "epoch": 2.136353771025502, + "grad_norm": 5.207179546356201, + "learning_rate": 4.683705632451693e-06, + "loss": 0.1919, + "step": 39544 + }, + { + "epoch": 2.1363673358654367, + "grad_norm": 5.3922505378723145, + "learning_rate": 4.683568589831438e-06, + "loss": 0.2551, + "step": 39545 + }, + { + "epoch": 2.1363809007053716, + "grad_norm": 3.086928367614746, + "learning_rate": 4.683431547211183e-06, + "loss": 0.162, + "step": 39546 + }, + { + "epoch": 2.1363944655453064, + "grad_norm": 4.49022102355957, + "learning_rate": 4.683294504590928e-06, + "loss": 0.2319, + "step": 39547 + }, + { + "epoch": 2.1364080303852413, + "grad_norm": 4.376626968383789, + "learning_rate": 4.6831574619706736e-06, + "loss": 0.2344, + "step": 39548 + }, + { + "epoch": 2.136421595225176, + "grad_norm": 3.789802312850952, + "learning_rate": 4.683020419350419e-06, + "loss": 0.1216, + "step": 39549 + }, + { + "epoch": 2.136435160065111, + "grad_norm": 5.284858703613281, + "learning_rate": 4.682883376730163e-06, + "loss": 0.1894, + "step": 39550 + }, + { + "epoch": 2.136448724905046, + "grad_norm": 4.418944358825684, + "learning_rate": 4.682746334109908e-06, + "loss": 0.2494, + "step": 39551 + }, + { + "epoch": 2.136462289744981, + "grad_norm": 5.902987480163574, + "learning_rate": 4.6826092914896534e-06, + "loss": 0.2239, + "step": 39552 + }, + { + "epoch": 2.136475854584916, + "grad_norm": 5.3414130210876465, + "learning_rate": 4.682472248869399e-06, + "loss": 0.2781, + "step": 39553 + }, + { + "epoch": 2.136489419424851, + "grad_norm": 4.442844867706299, + "learning_rate": 4.682335206249144e-06, + "loss": 0.1428, + "step": 39554 + }, + { + "epoch": 2.136502984264786, + "grad_norm": 5.084934234619141, + "learning_rate": 4.682198163628889e-06, + "loss": 0.2479, + "step": 39555 + }, + { + "epoch": 2.1365165491047207, + "grad_norm": 5.603389739990234, + "learning_rate": 4.682061121008634e-06, + "loss": 0.2023, + "step": 39556 + }, + { + "epoch": 2.1365301139446555, + "grad_norm": 6.040477275848389, + "learning_rate": 4.681924078388379e-06, + "loss": 0.3222, + "step": 39557 + }, + { + "epoch": 2.1365436787845904, + "grad_norm": 5.148696422576904, + "learning_rate": 4.6817870357681245e-06, + "loss": 0.2524, + "step": 39558 + }, + { + "epoch": 2.1365572436245253, + "grad_norm": 4.2295823097229, + "learning_rate": 4.68164999314787e-06, + "loss": 0.2324, + "step": 39559 + }, + { + "epoch": 2.13657080846446, + "grad_norm": 5.530237674713135, + "learning_rate": 4.681512950527615e-06, + "loss": 0.251, + "step": 39560 + }, + { + "epoch": 2.136584373304395, + "grad_norm": 3.2435035705566406, + "learning_rate": 4.681375907907359e-06, + "loss": 0.1088, + "step": 39561 + }, + { + "epoch": 2.13659793814433, + "grad_norm": 6.196725368499756, + "learning_rate": 4.681238865287105e-06, + "loss": 0.1935, + "step": 39562 + }, + { + "epoch": 2.1366115029842647, + "grad_norm": 5.272332668304443, + "learning_rate": 4.6811018226668496e-06, + "loss": 0.1976, + "step": 39563 + }, + { + "epoch": 2.1366250678241996, + "grad_norm": 3.449784994125366, + "learning_rate": 4.680964780046595e-06, + "loss": 0.1396, + "step": 39564 + }, + { + "epoch": 2.1366386326641345, + "grad_norm": 4.2743144035339355, + "learning_rate": 4.68082773742634e-06, + "loss": 0.1658, + "step": 39565 + }, + { + "epoch": 2.1366521975040693, + "grad_norm": 5.928525447845459, + "learning_rate": 4.680690694806085e-06, + "loss": 0.1923, + "step": 39566 + }, + { + "epoch": 2.136665762344004, + "grad_norm": 4.308562755584717, + "learning_rate": 4.68055365218583e-06, + "loss": 0.1521, + "step": 39567 + }, + { + "epoch": 2.136679327183939, + "grad_norm": 4.3032732009887695, + "learning_rate": 4.6804166095655755e-06, + "loss": 0.1908, + "step": 39568 + }, + { + "epoch": 2.136692892023874, + "grad_norm": 5.178534030914307, + "learning_rate": 4.680279566945321e-06, + "loss": 0.142, + "step": 39569 + }, + { + "epoch": 2.1367064568638092, + "grad_norm": 4.281432628631592, + "learning_rate": 4.680142524325065e-06, + "loss": 0.2167, + "step": 39570 + }, + { + "epoch": 2.136720021703744, + "grad_norm": 4.696037769317627, + "learning_rate": 4.680005481704811e-06, + "loss": 0.2187, + "step": 39571 + }, + { + "epoch": 2.136733586543679, + "grad_norm": 5.235708713531494, + "learning_rate": 4.679868439084555e-06, + "loss": 0.2531, + "step": 39572 + }, + { + "epoch": 2.136747151383614, + "grad_norm": 4.458719253540039, + "learning_rate": 4.679731396464301e-06, + "loss": 0.1953, + "step": 39573 + }, + { + "epoch": 2.1367607162235487, + "grad_norm": 4.436165809631348, + "learning_rate": 4.679594353844046e-06, + "loss": 0.198, + "step": 39574 + }, + { + "epoch": 2.1367742810634835, + "grad_norm": 3.6999905109405518, + "learning_rate": 4.679457311223791e-06, + "loss": 0.1498, + "step": 39575 + }, + { + "epoch": 2.1367878459034184, + "grad_norm": 6.011572360992432, + "learning_rate": 4.679320268603536e-06, + "loss": 0.3433, + "step": 39576 + }, + { + "epoch": 2.1368014107433533, + "grad_norm": 6.364771366119385, + "learning_rate": 4.679183225983281e-06, + "loss": 0.3238, + "step": 39577 + }, + { + "epoch": 2.136814975583288, + "grad_norm": 5.061529159545898, + "learning_rate": 4.679046183363026e-06, + "loss": 0.2532, + "step": 39578 + }, + { + "epoch": 2.136828540423223, + "grad_norm": 5.018462657928467, + "learning_rate": 4.678909140742771e-06, + "loss": 0.2124, + "step": 39579 + }, + { + "epoch": 2.136842105263158, + "grad_norm": 4.827956676483154, + "learning_rate": 4.678772098122517e-06, + "loss": 0.1732, + "step": 39580 + }, + { + "epoch": 2.1368556701030927, + "grad_norm": 6.070460319519043, + "learning_rate": 4.678635055502261e-06, + "loss": 0.2926, + "step": 39581 + }, + { + "epoch": 2.1368692349430276, + "grad_norm": 4.951826572418213, + "learning_rate": 4.678498012882007e-06, + "loss": 0.1197, + "step": 39582 + }, + { + "epoch": 2.1368827997829625, + "grad_norm": 5.170494556427002, + "learning_rate": 4.6783609702617514e-06, + "loss": 0.1945, + "step": 39583 + }, + { + "epoch": 2.1368963646228973, + "grad_norm": 4.094226837158203, + "learning_rate": 4.678223927641497e-06, + "loss": 0.1454, + "step": 39584 + }, + { + "epoch": 2.136909929462832, + "grad_norm": 4.604540824890137, + "learning_rate": 4.678086885021242e-06, + "loss": 0.177, + "step": 39585 + }, + { + "epoch": 2.136923494302767, + "grad_norm": 4.102510452270508, + "learning_rate": 4.677949842400987e-06, + "loss": 0.1241, + "step": 39586 + }, + { + "epoch": 2.136937059142702, + "grad_norm": 4.291956424713135, + "learning_rate": 4.677812799780732e-06, + "loss": 0.1756, + "step": 39587 + }, + { + "epoch": 2.136950623982637, + "grad_norm": 4.037827968597412, + "learning_rate": 4.677675757160477e-06, + "loss": 0.1395, + "step": 39588 + }, + { + "epoch": 2.1369641888225717, + "grad_norm": 5.500351905822754, + "learning_rate": 4.6775387145402225e-06, + "loss": 0.1981, + "step": 39589 + }, + { + "epoch": 2.136977753662507, + "grad_norm": 6.453286170959473, + "learning_rate": 4.677401671919968e-06, + "loss": 0.2711, + "step": 39590 + }, + { + "epoch": 2.136991318502442, + "grad_norm": 3.9079031944274902, + "learning_rate": 4.677264629299713e-06, + "loss": 0.1351, + "step": 39591 + }, + { + "epoch": 2.1370048833423767, + "grad_norm": 5.642782688140869, + "learning_rate": 4.677127586679457e-06, + "loss": 0.44, + "step": 39592 + }, + { + "epoch": 2.1370184481823116, + "grad_norm": 4.101324081420898, + "learning_rate": 4.676990544059203e-06, + "loss": 0.1562, + "step": 39593 + }, + { + "epoch": 2.1370320130222464, + "grad_norm": 3.002218008041382, + "learning_rate": 4.6768535014389476e-06, + "loss": 0.1423, + "step": 39594 + }, + { + "epoch": 2.1370455778621813, + "grad_norm": 4.355101585388184, + "learning_rate": 4.676716458818693e-06, + "loss": 0.1411, + "step": 39595 + }, + { + "epoch": 2.137059142702116, + "grad_norm": 5.951963901519775, + "learning_rate": 4.676579416198438e-06, + "loss": 0.1995, + "step": 39596 + }, + { + "epoch": 2.137072707542051, + "grad_norm": 4.987802505493164, + "learning_rate": 4.676442373578183e-06, + "loss": 0.1981, + "step": 39597 + }, + { + "epoch": 2.137086272381986, + "grad_norm": 4.287365913391113, + "learning_rate": 4.676305330957928e-06, + "loss": 0.2045, + "step": 39598 + }, + { + "epoch": 2.1370998372219208, + "grad_norm": 4.242221355438232, + "learning_rate": 4.6761682883376735e-06, + "loss": 0.2104, + "step": 39599 + }, + { + "epoch": 2.1371134020618556, + "grad_norm": 4.772700309753418, + "learning_rate": 4.676031245717419e-06, + "loss": 0.1825, + "step": 39600 + }, + { + "epoch": 2.1371269669017905, + "grad_norm": 4.228750228881836, + "learning_rate": 4.675894203097164e-06, + "loss": 0.1608, + "step": 39601 + }, + { + "epoch": 2.1371405317417254, + "grad_norm": 4.977938175201416, + "learning_rate": 4.675757160476909e-06, + "loss": 0.1638, + "step": 39602 + }, + { + "epoch": 2.1371540965816602, + "grad_norm": 5.628145694732666, + "learning_rate": 4.675620117856654e-06, + "loss": 0.1924, + "step": 39603 + }, + { + "epoch": 2.137167661421595, + "grad_norm": 5.557553768157959, + "learning_rate": 4.6754830752363985e-06, + "loss": 0.2052, + "step": 39604 + }, + { + "epoch": 2.13718122626153, + "grad_norm": 6.738976955413818, + "learning_rate": 4.675346032616144e-06, + "loss": 0.2969, + "step": 39605 + }, + { + "epoch": 2.137194791101465, + "grad_norm": 4.743663787841797, + "learning_rate": 4.675208989995889e-06, + "loss": 0.1652, + "step": 39606 + }, + { + "epoch": 2.1372083559413997, + "grad_norm": 6.114787578582764, + "learning_rate": 4.675071947375634e-06, + "loss": 0.3049, + "step": 39607 + }, + { + "epoch": 2.137221920781335, + "grad_norm": 4.967981815338135, + "learning_rate": 4.674934904755379e-06, + "loss": 0.2376, + "step": 39608 + }, + { + "epoch": 2.13723548562127, + "grad_norm": 4.427215099334717, + "learning_rate": 4.674797862135124e-06, + "loss": 0.1505, + "step": 39609 + }, + { + "epoch": 2.1372490504612047, + "grad_norm": 4.837654113769531, + "learning_rate": 4.67466081951487e-06, + "loss": 0.3028, + "step": 39610 + }, + { + "epoch": 2.1372626153011396, + "grad_norm": 5.573819637298584, + "learning_rate": 4.674523776894615e-06, + "loss": 0.3092, + "step": 39611 + }, + { + "epoch": 2.1372761801410745, + "grad_norm": 4.900943279266357, + "learning_rate": 4.67438673427436e-06, + "loss": 0.1992, + "step": 39612 + }, + { + "epoch": 2.1372897449810093, + "grad_norm": 6.592278003692627, + "learning_rate": 4.674249691654105e-06, + "loss": 0.2425, + "step": 39613 + }, + { + "epoch": 2.137303309820944, + "grad_norm": 4.717019557952881, + "learning_rate": 4.67411264903385e-06, + "loss": 0.1799, + "step": 39614 + }, + { + "epoch": 2.137316874660879, + "grad_norm": 5.6451592445373535, + "learning_rate": 4.673975606413595e-06, + "loss": 0.331, + "step": 39615 + }, + { + "epoch": 2.137330439500814, + "grad_norm": 4.885936737060547, + "learning_rate": 4.673838563793341e-06, + "loss": 0.2367, + "step": 39616 + }, + { + "epoch": 2.137344004340749, + "grad_norm": 7.661775588989258, + "learning_rate": 4.673701521173085e-06, + "loss": 0.3306, + "step": 39617 + }, + { + "epoch": 2.1373575691806836, + "grad_norm": 5.821046352386475, + "learning_rate": 4.67356447855283e-06, + "loss": 0.2246, + "step": 39618 + }, + { + "epoch": 2.1373711340206185, + "grad_norm": 6.137730598449707, + "learning_rate": 4.673427435932575e-06, + "loss": 0.2288, + "step": 39619 + }, + { + "epoch": 2.1373846988605534, + "grad_norm": 6.911143779754639, + "learning_rate": 4.6732903933123205e-06, + "loss": 0.3085, + "step": 39620 + }, + { + "epoch": 2.1373982637004882, + "grad_norm": 6.687366485595703, + "learning_rate": 4.673153350692066e-06, + "loss": 0.2557, + "step": 39621 + }, + { + "epoch": 2.137411828540423, + "grad_norm": 7.334685802459717, + "learning_rate": 4.673016308071811e-06, + "loss": 0.3058, + "step": 39622 + }, + { + "epoch": 2.137425393380358, + "grad_norm": 8.452366828918457, + "learning_rate": 4.672879265451556e-06, + "loss": 0.4021, + "step": 39623 + }, + { + "epoch": 2.137438958220293, + "grad_norm": 5.249217510223389, + "learning_rate": 4.6727422228313e-06, + "loss": 0.1958, + "step": 39624 + }, + { + "epoch": 2.1374525230602277, + "grad_norm": 5.01279354095459, + "learning_rate": 4.672605180211046e-06, + "loss": 0.2704, + "step": 39625 + }, + { + "epoch": 2.1374660879001626, + "grad_norm": 7.647317886352539, + "learning_rate": 4.672468137590791e-06, + "loss": 0.3476, + "step": 39626 + }, + { + "epoch": 2.1374796527400974, + "grad_norm": 5.900757312774658, + "learning_rate": 4.672331094970537e-06, + "loss": 0.2259, + "step": 39627 + }, + { + "epoch": 2.1374932175800327, + "grad_norm": 9.32160472869873, + "learning_rate": 4.672194052350281e-06, + "loss": 0.4173, + "step": 39628 + }, + { + "epoch": 2.1375067824199676, + "grad_norm": 5.136759281158447, + "learning_rate": 4.672057009730026e-06, + "loss": 0.2341, + "step": 39629 + }, + { + "epoch": 2.1375203472599025, + "grad_norm": 4.644230842590332, + "learning_rate": 4.6719199671097715e-06, + "loss": 0.1555, + "step": 39630 + }, + { + "epoch": 2.1375339120998373, + "grad_norm": 6.288518905639648, + "learning_rate": 4.671782924489517e-06, + "loss": 0.2683, + "step": 39631 + }, + { + "epoch": 2.137547476939772, + "grad_norm": 5.91995906829834, + "learning_rate": 4.671645881869262e-06, + "loss": 0.2545, + "step": 39632 + }, + { + "epoch": 2.137561041779707, + "grad_norm": 6.797538757324219, + "learning_rate": 4.671508839249006e-06, + "loss": 0.2492, + "step": 39633 + }, + { + "epoch": 2.137574606619642, + "grad_norm": 5.73091983795166, + "learning_rate": 4.671371796628752e-06, + "loss": 0.2771, + "step": 39634 + }, + { + "epoch": 2.137588171459577, + "grad_norm": 8.202414512634277, + "learning_rate": 4.6712347540084965e-06, + "loss": 0.3844, + "step": 39635 + }, + { + "epoch": 2.1376017362995117, + "grad_norm": 6.060584545135498, + "learning_rate": 4.6710977113882425e-06, + "loss": 0.2721, + "step": 39636 + }, + { + "epoch": 2.1376153011394465, + "grad_norm": 6.862985610961914, + "learning_rate": 4.670960668767987e-06, + "loss": 0.2906, + "step": 39637 + }, + { + "epoch": 2.1376288659793814, + "grad_norm": 4.969457626342773, + "learning_rate": 4.670823626147732e-06, + "loss": 0.2142, + "step": 39638 + }, + { + "epoch": 2.1376424308193163, + "grad_norm": 5.6911301612854, + "learning_rate": 4.670686583527477e-06, + "loss": 0.2119, + "step": 39639 + }, + { + "epoch": 2.137655995659251, + "grad_norm": 6.505124568939209, + "learning_rate": 4.670549540907222e-06, + "loss": 0.2937, + "step": 39640 + }, + { + "epoch": 2.137669560499186, + "grad_norm": 4.661116600036621, + "learning_rate": 4.670412498286968e-06, + "loss": 0.1919, + "step": 39641 + }, + { + "epoch": 2.137683125339121, + "grad_norm": 8.30593490600586, + "learning_rate": 4.670275455666713e-06, + "loss": 0.2048, + "step": 39642 + }, + { + "epoch": 2.1376966901790557, + "grad_norm": 7.542766094207764, + "learning_rate": 4.670138413046458e-06, + "loss": 0.268, + "step": 39643 + }, + { + "epoch": 2.1377102550189906, + "grad_norm": 6.581711769104004, + "learning_rate": 4.670001370426203e-06, + "loss": 0.2178, + "step": 39644 + }, + { + "epoch": 2.137723819858926, + "grad_norm": 5.848695755004883, + "learning_rate": 4.669864327805948e-06, + "loss": 0.2637, + "step": 39645 + }, + { + "epoch": 2.1377373846988608, + "grad_norm": 6.4638447761535645, + "learning_rate": 4.6697272851856935e-06, + "loss": 0.2765, + "step": 39646 + }, + { + "epoch": 2.1377509495387956, + "grad_norm": 5.193417072296143, + "learning_rate": 4.669590242565439e-06, + "loss": 0.2636, + "step": 39647 + }, + { + "epoch": 2.1377645143787305, + "grad_norm": 6.202065467834473, + "learning_rate": 4.669453199945183e-06, + "loss": 0.2476, + "step": 39648 + }, + { + "epoch": 2.1377780792186654, + "grad_norm": 6.433887958526611, + "learning_rate": 4.669316157324928e-06, + "loss": 0.301, + "step": 39649 + }, + { + "epoch": 2.1377916440586002, + "grad_norm": 7.975987911224365, + "learning_rate": 4.669179114704673e-06, + "loss": 0.2832, + "step": 39650 + }, + { + "epoch": 2.137805208898535, + "grad_norm": 8.285013198852539, + "learning_rate": 4.6690420720844185e-06, + "loss": 0.3075, + "step": 39651 + }, + { + "epoch": 2.13781877373847, + "grad_norm": 4.012223243713379, + "learning_rate": 4.668905029464164e-06, + "loss": 0.1968, + "step": 39652 + }, + { + "epoch": 2.137832338578405, + "grad_norm": 4.522242069244385, + "learning_rate": 4.668767986843909e-06, + "loss": 0.1822, + "step": 39653 + }, + { + "epoch": 2.1378459034183397, + "grad_norm": 4.509721755981445, + "learning_rate": 4.668630944223654e-06, + "loss": 0.1874, + "step": 39654 + }, + { + "epoch": 2.1378594682582746, + "grad_norm": 5.290036201477051, + "learning_rate": 4.668493901603399e-06, + "loss": 0.1531, + "step": 39655 + }, + { + "epoch": 2.1378730330982094, + "grad_norm": 4.150092601776123, + "learning_rate": 4.668356858983144e-06, + "loss": 0.1658, + "step": 39656 + }, + { + "epoch": 2.1378865979381443, + "grad_norm": 7.748216152191162, + "learning_rate": 4.66821981636289e-06, + "loss": 0.3378, + "step": 39657 + }, + { + "epoch": 2.137900162778079, + "grad_norm": 4.647216796875, + "learning_rate": 4.668082773742634e-06, + "loss": 0.0724, + "step": 39658 + }, + { + "epoch": 2.137913727618014, + "grad_norm": 6.303754806518555, + "learning_rate": 4.66794573112238e-06, + "loss": 0.3129, + "step": 39659 + }, + { + "epoch": 2.137927292457949, + "grad_norm": 6.679449558258057, + "learning_rate": 4.667808688502124e-06, + "loss": 0.3115, + "step": 39660 + }, + { + "epoch": 2.1379408572978837, + "grad_norm": 5.125361919403076, + "learning_rate": 4.6676716458818695e-06, + "loss": 0.1905, + "step": 39661 + }, + { + "epoch": 2.1379544221378186, + "grad_norm": 4.857311248779297, + "learning_rate": 4.667534603261615e-06, + "loss": 0.2233, + "step": 39662 + }, + { + "epoch": 2.1379679869777535, + "grad_norm": 5.001722812652588, + "learning_rate": 4.66739756064136e-06, + "loss": 0.1662, + "step": 39663 + }, + { + "epoch": 2.1379815518176883, + "grad_norm": 6.655759334564209, + "learning_rate": 4.667260518021105e-06, + "loss": 0.2902, + "step": 39664 + }, + { + "epoch": 2.137995116657623, + "grad_norm": 6.091895580291748, + "learning_rate": 4.66712347540085e-06, + "loss": 0.2312, + "step": 39665 + }, + { + "epoch": 2.1380086814975585, + "grad_norm": 5.179520606994629, + "learning_rate": 4.666986432780595e-06, + "loss": 0.2171, + "step": 39666 + }, + { + "epoch": 2.1380222463374934, + "grad_norm": 7.652282238006592, + "learning_rate": 4.66684939016034e-06, + "loss": 0.314, + "step": 39667 + }, + { + "epoch": 2.1380358111774282, + "grad_norm": 3.583615303039551, + "learning_rate": 4.666712347540086e-06, + "loss": 0.1074, + "step": 39668 + }, + { + "epoch": 2.138049376017363, + "grad_norm": 5.251956939697266, + "learning_rate": 4.66657530491983e-06, + "loss": 0.181, + "step": 39669 + }, + { + "epoch": 2.138062940857298, + "grad_norm": 4.9000349044799805, + "learning_rate": 4.666438262299576e-06, + "loss": 0.1221, + "step": 39670 + }, + { + "epoch": 2.138076505697233, + "grad_norm": 8.097460746765137, + "learning_rate": 4.66630121967932e-06, + "loss": 0.2935, + "step": 39671 + }, + { + "epoch": 2.1380900705371677, + "grad_norm": 6.70658016204834, + "learning_rate": 4.666164177059066e-06, + "loss": 0.2549, + "step": 39672 + }, + { + "epoch": 2.1381036353771026, + "grad_norm": 3.960966110229492, + "learning_rate": 4.666027134438811e-06, + "loss": 0.1481, + "step": 39673 + }, + { + "epoch": 2.1381172002170374, + "grad_norm": 4.482970237731934, + "learning_rate": 4.665890091818556e-06, + "loss": 0.1677, + "step": 39674 + }, + { + "epoch": 2.1381307650569723, + "grad_norm": 5.7303667068481445, + "learning_rate": 4.665753049198301e-06, + "loss": 0.1749, + "step": 39675 + }, + { + "epoch": 2.138144329896907, + "grad_norm": 5.419008731842041, + "learning_rate": 4.665616006578046e-06, + "loss": 0.2119, + "step": 39676 + }, + { + "epoch": 2.138157894736842, + "grad_norm": 4.9856696128845215, + "learning_rate": 4.6654789639577915e-06, + "loss": 0.1748, + "step": 39677 + }, + { + "epoch": 2.138171459576777, + "grad_norm": 7.4687418937683105, + "learning_rate": 4.665341921337536e-06, + "loss": 0.3316, + "step": 39678 + }, + { + "epoch": 2.1381850244167118, + "grad_norm": 5.3209733963012695, + "learning_rate": 4.665204878717282e-06, + "loss": 0.2242, + "step": 39679 + }, + { + "epoch": 2.1381985892566466, + "grad_norm": 6.155708312988281, + "learning_rate": 4.665067836097026e-06, + "loss": 0.1268, + "step": 39680 + }, + { + "epoch": 2.1382121540965815, + "grad_norm": 6.271101951599121, + "learning_rate": 4.664930793476772e-06, + "loss": 0.1474, + "step": 39681 + }, + { + "epoch": 2.1382257189365164, + "grad_norm": 4.7011871337890625, + "learning_rate": 4.6647937508565165e-06, + "loss": 0.1764, + "step": 39682 + }, + { + "epoch": 2.1382392837764517, + "grad_norm": 5.199396133422852, + "learning_rate": 4.664656708236262e-06, + "loss": 0.2516, + "step": 39683 + }, + { + "epoch": 2.1382528486163865, + "grad_norm": 6.112926483154297, + "learning_rate": 4.664519665616007e-06, + "loss": 0.2455, + "step": 39684 + }, + { + "epoch": 2.1382664134563214, + "grad_norm": 6.256804943084717, + "learning_rate": 4.664382622995752e-06, + "loss": 0.3591, + "step": 39685 + }, + { + "epoch": 2.1382799782962563, + "grad_norm": 4.8060102462768555, + "learning_rate": 4.664245580375497e-06, + "loss": 0.2237, + "step": 39686 + }, + { + "epoch": 2.138293543136191, + "grad_norm": 7.035305023193359, + "learning_rate": 4.664108537755242e-06, + "loss": 0.2831, + "step": 39687 + }, + { + "epoch": 2.138307107976126, + "grad_norm": 5.5999674797058105, + "learning_rate": 4.663971495134988e-06, + "loss": 0.1633, + "step": 39688 + }, + { + "epoch": 2.138320672816061, + "grad_norm": 3.6256110668182373, + "learning_rate": 4.663834452514732e-06, + "loss": 0.1193, + "step": 39689 + }, + { + "epoch": 2.1383342376559957, + "grad_norm": 5.181826114654541, + "learning_rate": 4.663697409894478e-06, + "loss": 0.2731, + "step": 39690 + }, + { + "epoch": 2.1383478024959306, + "grad_norm": 5.681346893310547, + "learning_rate": 4.663560367274222e-06, + "loss": 0.3031, + "step": 39691 + }, + { + "epoch": 2.1383613673358655, + "grad_norm": 6.848938465118408, + "learning_rate": 4.6634233246539675e-06, + "loss": 0.3929, + "step": 39692 + }, + { + "epoch": 2.1383749321758003, + "grad_norm": 4.515820503234863, + "learning_rate": 4.663286282033713e-06, + "loss": 0.2283, + "step": 39693 + }, + { + "epoch": 2.138388497015735, + "grad_norm": 5.167019367218018, + "learning_rate": 4.663149239413458e-06, + "loss": 0.2129, + "step": 39694 + }, + { + "epoch": 2.13840206185567, + "grad_norm": 5.194212436676025, + "learning_rate": 4.663012196793203e-06, + "loss": 0.1678, + "step": 39695 + }, + { + "epoch": 2.138415626695605, + "grad_norm": 4.2246270179748535, + "learning_rate": 4.662875154172948e-06, + "loss": 0.1797, + "step": 39696 + }, + { + "epoch": 2.13842919153554, + "grad_norm": 3.9801812171936035, + "learning_rate": 4.662738111552693e-06, + "loss": 0.1161, + "step": 39697 + }, + { + "epoch": 2.1384427563754747, + "grad_norm": 5.119497776031494, + "learning_rate": 4.6626010689324385e-06, + "loss": 0.1699, + "step": 39698 + }, + { + "epoch": 2.1384563212154095, + "grad_norm": 6.126781940460205, + "learning_rate": 4.662464026312184e-06, + "loss": 0.3402, + "step": 39699 + }, + { + "epoch": 2.1384698860553444, + "grad_norm": 6.138705253601074, + "learning_rate": 4.662326983691929e-06, + "loss": 0.2815, + "step": 39700 + }, + { + "epoch": 2.1384834508952792, + "grad_norm": 7.332155227661133, + "learning_rate": 4.662189941071674e-06, + "loss": 0.3316, + "step": 39701 + }, + { + "epoch": 2.138497015735214, + "grad_norm": 3.6269137859344482, + "learning_rate": 4.662052898451418e-06, + "loss": 0.1336, + "step": 39702 + }, + { + "epoch": 2.138510580575149, + "grad_norm": 7.213902950286865, + "learning_rate": 4.661915855831164e-06, + "loss": 0.2598, + "step": 39703 + }, + { + "epoch": 2.1385241454150843, + "grad_norm": 5.830721378326416, + "learning_rate": 4.661778813210909e-06, + "loss": 0.2462, + "step": 39704 + }, + { + "epoch": 2.138537710255019, + "grad_norm": 5.351996421813965, + "learning_rate": 4.661641770590654e-06, + "loss": 0.1762, + "step": 39705 + }, + { + "epoch": 2.138551275094954, + "grad_norm": 4.7369585037231445, + "learning_rate": 4.661504727970399e-06, + "loss": 0.2409, + "step": 39706 + }, + { + "epoch": 2.138564839934889, + "grad_norm": 5.34230375289917, + "learning_rate": 4.661367685350144e-06, + "loss": 0.1286, + "step": 39707 + }, + { + "epoch": 2.1385784047748237, + "grad_norm": 6.8557515144348145, + "learning_rate": 4.6612306427298895e-06, + "loss": 0.2532, + "step": 39708 + }, + { + "epoch": 2.1385919696147586, + "grad_norm": 7.630929946899414, + "learning_rate": 4.661093600109635e-06, + "loss": 0.2685, + "step": 39709 + }, + { + "epoch": 2.1386055344546935, + "grad_norm": 4.038519382476807, + "learning_rate": 4.66095655748938e-06, + "loss": 0.1423, + "step": 39710 + }, + { + "epoch": 2.1386190992946283, + "grad_norm": 6.501422882080078, + "learning_rate": 4.660819514869125e-06, + "loss": 0.1545, + "step": 39711 + }, + { + "epoch": 2.138632664134563, + "grad_norm": 4.387541770935059, + "learning_rate": 4.660682472248869e-06, + "loss": 0.1514, + "step": 39712 + }, + { + "epoch": 2.138646228974498, + "grad_norm": 4.938117504119873, + "learning_rate": 4.660545429628615e-06, + "loss": 0.1914, + "step": 39713 + }, + { + "epoch": 2.138659793814433, + "grad_norm": 5.123676776885986, + "learning_rate": 4.66040838700836e-06, + "loss": 0.1917, + "step": 39714 + }, + { + "epoch": 2.138673358654368, + "grad_norm": 3.7090489864349365, + "learning_rate": 4.660271344388106e-06, + "loss": 0.0885, + "step": 39715 + }, + { + "epoch": 2.1386869234943027, + "grad_norm": 4.512618064880371, + "learning_rate": 4.66013430176785e-06, + "loss": 0.1626, + "step": 39716 + }, + { + "epoch": 2.1387004883342375, + "grad_norm": 4.656778812408447, + "learning_rate": 4.659997259147595e-06, + "loss": 0.2183, + "step": 39717 + }, + { + "epoch": 2.1387140531741724, + "grad_norm": 4.630398273468018, + "learning_rate": 4.6598602165273404e-06, + "loss": 0.3017, + "step": 39718 + }, + { + "epoch": 2.1387276180141073, + "grad_norm": 6.737597465515137, + "learning_rate": 4.659723173907086e-06, + "loss": 0.2411, + "step": 39719 + }, + { + "epoch": 2.138741182854042, + "grad_norm": 6.2974629402160645, + "learning_rate": 4.659586131286831e-06, + "loss": 0.2716, + "step": 39720 + }, + { + "epoch": 2.1387547476939774, + "grad_norm": 5.174659729003906, + "learning_rate": 4.659449088666575e-06, + "loss": 0.2318, + "step": 39721 + }, + { + "epoch": 2.1387683125339123, + "grad_norm": 5.372936248779297, + "learning_rate": 4.659312046046321e-06, + "loss": 0.1889, + "step": 39722 + }, + { + "epoch": 2.138781877373847, + "grad_norm": 7.007808208465576, + "learning_rate": 4.6591750034260655e-06, + "loss": 0.2737, + "step": 39723 + }, + { + "epoch": 2.138795442213782, + "grad_norm": 5.484125137329102, + "learning_rate": 4.6590379608058115e-06, + "loss": 0.2377, + "step": 39724 + }, + { + "epoch": 2.138809007053717, + "grad_norm": 5.424732685089111, + "learning_rate": 4.658900918185556e-06, + "loss": 0.311, + "step": 39725 + }, + { + "epoch": 2.1388225718936518, + "grad_norm": 4.842154502868652, + "learning_rate": 4.658763875565301e-06, + "loss": 0.2652, + "step": 39726 + }, + { + "epoch": 2.1388361367335866, + "grad_norm": 3.88633131980896, + "learning_rate": 4.658626832945046e-06, + "loss": 0.1444, + "step": 39727 + }, + { + "epoch": 2.1388497015735215, + "grad_norm": 4.518011093139648, + "learning_rate": 4.658489790324791e-06, + "loss": 0.2249, + "step": 39728 + }, + { + "epoch": 2.1388632664134564, + "grad_norm": 3.9728429317474365, + "learning_rate": 4.6583527477045365e-06, + "loss": 0.1683, + "step": 39729 + }, + { + "epoch": 2.1388768312533912, + "grad_norm": 4.191787242889404, + "learning_rate": 4.658215705084282e-06, + "loss": 0.1593, + "step": 39730 + }, + { + "epoch": 2.138890396093326, + "grad_norm": 3.398728847503662, + "learning_rate": 4.658078662464027e-06, + "loss": 0.1407, + "step": 39731 + }, + { + "epoch": 2.138903960933261, + "grad_norm": 3.927827835083008, + "learning_rate": 4.657941619843771e-06, + "loss": 0.1406, + "step": 39732 + }, + { + "epoch": 2.138917525773196, + "grad_norm": 3.720155954360962, + "learning_rate": 4.657804577223517e-06, + "loss": 0.1324, + "step": 39733 + }, + { + "epoch": 2.1389310906131307, + "grad_norm": 4.340632438659668, + "learning_rate": 4.657667534603262e-06, + "loss": 0.1759, + "step": 39734 + }, + { + "epoch": 2.1389446554530656, + "grad_norm": 3.611459732055664, + "learning_rate": 4.657530491983008e-06, + "loss": 0.1633, + "step": 39735 + }, + { + "epoch": 2.1389582202930004, + "grad_norm": 4.295882701873779, + "learning_rate": 4.657393449362752e-06, + "loss": 0.1354, + "step": 39736 + }, + { + "epoch": 2.1389717851329353, + "grad_norm": 3.170694589614868, + "learning_rate": 4.657256406742497e-06, + "loss": 0.1486, + "step": 39737 + }, + { + "epoch": 2.13898534997287, + "grad_norm": 4.606297016143799, + "learning_rate": 4.657119364122242e-06, + "loss": 0.2294, + "step": 39738 + }, + { + "epoch": 2.138998914812805, + "grad_norm": 3.6862363815307617, + "learning_rate": 4.6569823215019875e-06, + "loss": 0.2251, + "step": 39739 + }, + { + "epoch": 2.13901247965274, + "grad_norm": 4.738830089569092, + "learning_rate": 4.656845278881733e-06, + "loss": 0.2209, + "step": 39740 + }, + { + "epoch": 2.139026044492675, + "grad_norm": 5.230009078979492, + "learning_rate": 4.656708236261478e-06, + "loss": 0.1451, + "step": 39741 + }, + { + "epoch": 2.13903960933261, + "grad_norm": 4.646095275878906, + "learning_rate": 4.656571193641223e-06, + "loss": 0.2789, + "step": 39742 + }, + { + "epoch": 2.139053174172545, + "grad_norm": 5.397850513458252, + "learning_rate": 4.656434151020967e-06, + "loss": 0.2697, + "step": 39743 + }, + { + "epoch": 2.13906673901248, + "grad_norm": 3.7184031009674072, + "learning_rate": 4.656297108400713e-06, + "loss": 0.1665, + "step": 39744 + }, + { + "epoch": 2.1390803038524147, + "grad_norm": 3.8020596504211426, + "learning_rate": 4.656160065780458e-06, + "loss": 0.1249, + "step": 39745 + }, + { + "epoch": 2.1390938686923495, + "grad_norm": 4.849366188049316, + "learning_rate": 4.656023023160203e-06, + "loss": 0.2477, + "step": 39746 + }, + { + "epoch": 2.1391074335322844, + "grad_norm": 3.9393866062164307, + "learning_rate": 4.655885980539948e-06, + "loss": 0.1377, + "step": 39747 + }, + { + "epoch": 2.1391209983722193, + "grad_norm": 3.8917877674102783, + "learning_rate": 4.655748937919693e-06, + "loss": 0.1828, + "step": 39748 + }, + { + "epoch": 2.139134563212154, + "grad_norm": 4.428081035614014, + "learning_rate": 4.6556118952994384e-06, + "loss": 0.1844, + "step": 39749 + }, + { + "epoch": 2.139148128052089, + "grad_norm": 5.418545246124268, + "learning_rate": 4.655474852679184e-06, + "loss": 0.185, + "step": 39750 + }, + { + "epoch": 2.139161692892024, + "grad_norm": 5.177353382110596, + "learning_rate": 4.655337810058929e-06, + "loss": 0.1636, + "step": 39751 + }, + { + "epoch": 2.1391752577319587, + "grad_norm": 3.3346149921417236, + "learning_rate": 4.655200767438674e-06, + "loss": 0.1575, + "step": 39752 + }, + { + "epoch": 2.1391888225718936, + "grad_norm": 4.019618988037109, + "learning_rate": 4.655063724818419e-06, + "loss": 0.2058, + "step": 39753 + }, + { + "epoch": 2.1392023874118284, + "grad_norm": 4.611417770385742, + "learning_rate": 4.654926682198164e-06, + "loss": 0.2159, + "step": 39754 + }, + { + "epoch": 2.1392159522517633, + "grad_norm": 6.755634307861328, + "learning_rate": 4.6547896395779095e-06, + "loss": 0.2601, + "step": 39755 + }, + { + "epoch": 2.139229517091698, + "grad_norm": 4.906658172607422, + "learning_rate": 4.654652596957655e-06, + "loss": 0.16, + "step": 39756 + }, + { + "epoch": 2.139243081931633, + "grad_norm": 4.938669681549072, + "learning_rate": 4.654515554337399e-06, + "loss": 0.2087, + "step": 39757 + }, + { + "epoch": 2.139256646771568, + "grad_norm": 3.7955079078674316, + "learning_rate": 4.654378511717144e-06, + "loss": 0.1431, + "step": 39758 + }, + { + "epoch": 2.139270211611503, + "grad_norm": 2.927109956741333, + "learning_rate": 4.654241469096889e-06, + "loss": 0.0722, + "step": 39759 + }, + { + "epoch": 2.139283776451438, + "grad_norm": 3.2506625652313232, + "learning_rate": 4.6541044264766346e-06, + "loss": 0.113, + "step": 39760 + }, + { + "epoch": 2.139297341291373, + "grad_norm": 5.792637348175049, + "learning_rate": 4.65396738385638e-06, + "loss": 0.301, + "step": 39761 + }, + { + "epoch": 2.139310906131308, + "grad_norm": 4.127195835113525, + "learning_rate": 4.653830341236125e-06, + "loss": 0.1993, + "step": 39762 + }, + { + "epoch": 2.1393244709712427, + "grad_norm": 4.1550140380859375, + "learning_rate": 4.65369329861587e-06, + "loss": 0.1594, + "step": 39763 + }, + { + "epoch": 2.1393380358111775, + "grad_norm": 5.618939399719238, + "learning_rate": 4.653556255995615e-06, + "loss": 0.1559, + "step": 39764 + }, + { + "epoch": 2.1393516006511124, + "grad_norm": 4.653223991394043, + "learning_rate": 4.6534192133753604e-06, + "loss": 0.1437, + "step": 39765 + }, + { + "epoch": 2.1393651654910473, + "grad_norm": 6.3017168045043945, + "learning_rate": 4.653282170755105e-06, + "loss": 0.1633, + "step": 39766 + }, + { + "epoch": 2.139378730330982, + "grad_norm": 4.6619486808776855, + "learning_rate": 4.653145128134851e-06, + "loss": 0.1304, + "step": 39767 + }, + { + "epoch": 2.139392295170917, + "grad_norm": 5.52440071105957, + "learning_rate": 4.653008085514595e-06, + "loss": 0.1822, + "step": 39768 + }, + { + "epoch": 2.139405860010852, + "grad_norm": 4.679969310760498, + "learning_rate": 4.652871042894341e-06, + "loss": 0.1283, + "step": 39769 + }, + { + "epoch": 2.1394194248507867, + "grad_norm": 3.470632553100586, + "learning_rate": 4.6527340002740855e-06, + "loss": 0.0897, + "step": 39770 + }, + { + "epoch": 2.1394329896907216, + "grad_norm": 4.158430099487305, + "learning_rate": 4.652596957653831e-06, + "loss": 0.1718, + "step": 39771 + }, + { + "epoch": 2.1394465545306565, + "grad_norm": 4.904922008514404, + "learning_rate": 4.652459915033576e-06, + "loss": 0.1693, + "step": 39772 + }, + { + "epoch": 2.1394601193705913, + "grad_norm": 4.2705464363098145, + "learning_rate": 4.652322872413321e-06, + "loss": 0.1353, + "step": 39773 + }, + { + "epoch": 2.139473684210526, + "grad_norm": 6.199461936950684, + "learning_rate": 4.652185829793066e-06, + "loss": 0.1277, + "step": 39774 + }, + { + "epoch": 2.139487249050461, + "grad_norm": 3.6397287845611572, + "learning_rate": 4.6520487871728105e-06, + "loss": 0.1211, + "step": 39775 + }, + { + "epoch": 2.139500813890396, + "grad_norm": 4.4676103591918945, + "learning_rate": 4.6519117445525566e-06, + "loss": 0.109, + "step": 39776 + }, + { + "epoch": 2.139514378730331, + "grad_norm": 5.901355266571045, + "learning_rate": 4.651774701932301e-06, + "loss": 0.2703, + "step": 39777 + }, + { + "epoch": 2.1395279435702657, + "grad_norm": 4.462245941162109, + "learning_rate": 4.651637659312047e-06, + "loss": 0.1149, + "step": 39778 + }, + { + "epoch": 2.139541508410201, + "grad_norm": 4.630527019500732, + "learning_rate": 4.651500616691791e-06, + "loss": 0.1473, + "step": 39779 + }, + { + "epoch": 2.139555073250136, + "grad_norm": 5.777224540710449, + "learning_rate": 4.6513635740715364e-06, + "loss": 0.1828, + "step": 39780 + }, + { + "epoch": 2.1395686380900707, + "grad_norm": 3.3946757316589355, + "learning_rate": 4.651226531451282e-06, + "loss": 0.1397, + "step": 39781 + }, + { + "epoch": 2.1395822029300056, + "grad_norm": 5.558448791503906, + "learning_rate": 4.651089488831027e-06, + "loss": 0.1527, + "step": 39782 + }, + { + "epoch": 2.1395957677699404, + "grad_norm": 4.829138278961182, + "learning_rate": 4.650952446210772e-06, + "loss": 0.1592, + "step": 39783 + }, + { + "epoch": 2.1396093326098753, + "grad_norm": 4.239185810089111, + "learning_rate": 4.650815403590517e-06, + "loss": 0.1832, + "step": 39784 + }, + { + "epoch": 2.13962289744981, + "grad_norm": 5.890829086303711, + "learning_rate": 4.650678360970262e-06, + "loss": 0.1962, + "step": 39785 + }, + { + "epoch": 2.139636462289745, + "grad_norm": 2.8002731800079346, + "learning_rate": 4.650541318350007e-06, + "loss": 0.1077, + "step": 39786 + }, + { + "epoch": 2.13965002712968, + "grad_norm": 4.114445686340332, + "learning_rate": 4.650404275729753e-06, + "loss": 0.1208, + "step": 39787 + }, + { + "epoch": 2.1396635919696148, + "grad_norm": 6.33568000793457, + "learning_rate": 4.650267233109497e-06, + "loss": 0.2666, + "step": 39788 + }, + { + "epoch": 2.1396771568095496, + "grad_norm": 4.449061393737793, + "learning_rate": 4.650130190489243e-06, + "loss": 0.1855, + "step": 39789 + }, + { + "epoch": 2.1396907216494845, + "grad_norm": 4.99444580078125, + "learning_rate": 4.649993147868987e-06, + "loss": 0.2427, + "step": 39790 + }, + { + "epoch": 2.1397042864894193, + "grad_norm": 5.907649517059326, + "learning_rate": 4.6498561052487326e-06, + "loss": 0.1651, + "step": 39791 + }, + { + "epoch": 2.139717851329354, + "grad_norm": 3.226796865463257, + "learning_rate": 4.649719062628478e-06, + "loss": 0.139, + "step": 39792 + }, + { + "epoch": 2.139731416169289, + "grad_norm": 4.067944049835205, + "learning_rate": 4.649582020008223e-06, + "loss": 0.1721, + "step": 39793 + }, + { + "epoch": 2.139744981009224, + "grad_norm": 4.556821823120117, + "learning_rate": 4.649444977387968e-06, + "loss": 0.2536, + "step": 39794 + }, + { + "epoch": 2.139758545849159, + "grad_norm": 3.7361271381378174, + "learning_rate": 4.649307934767713e-06, + "loss": 0.1345, + "step": 39795 + }, + { + "epoch": 2.1397721106890937, + "grad_norm": 3.751026153564453, + "learning_rate": 4.6491708921474585e-06, + "loss": 0.1233, + "step": 39796 + }, + { + "epoch": 2.139785675529029, + "grad_norm": 3.749640703201294, + "learning_rate": 4.649033849527204e-06, + "loss": 0.1588, + "step": 39797 + }, + { + "epoch": 2.139799240368964, + "grad_norm": 5.537208080291748, + "learning_rate": 4.648896806906949e-06, + "loss": 0.2526, + "step": 39798 + }, + { + "epoch": 2.1398128052088987, + "grad_norm": 3.5461812019348145, + "learning_rate": 4.648759764286693e-06, + "loss": 0.1336, + "step": 39799 + }, + { + "epoch": 2.1398263700488336, + "grad_norm": 4.402834415435791, + "learning_rate": 4.648622721666438e-06, + "loss": 0.1578, + "step": 39800 + }, + { + "epoch": 2.1398399348887684, + "grad_norm": 4.090976715087891, + "learning_rate": 4.6484856790461835e-06, + "loss": 0.1619, + "step": 39801 + }, + { + "epoch": 2.1398534997287033, + "grad_norm": 5.087306022644043, + "learning_rate": 4.648348636425929e-06, + "loss": 0.1467, + "step": 39802 + }, + { + "epoch": 2.139867064568638, + "grad_norm": 4.734748840332031, + "learning_rate": 4.648211593805674e-06, + "loss": 0.1607, + "step": 39803 + }, + { + "epoch": 2.139880629408573, + "grad_norm": 4.346851348876953, + "learning_rate": 4.648074551185419e-06, + "loss": 0.1814, + "step": 39804 + }, + { + "epoch": 2.139894194248508, + "grad_norm": 5.646866798400879, + "learning_rate": 4.647937508565164e-06, + "loss": 0.2438, + "step": 39805 + }, + { + "epoch": 2.1399077590884428, + "grad_norm": 6.7585859298706055, + "learning_rate": 4.647800465944909e-06, + "loss": 0.2696, + "step": 39806 + }, + { + "epoch": 2.1399213239283776, + "grad_norm": 6.270437717437744, + "learning_rate": 4.6476634233246546e-06, + "loss": 0.3105, + "step": 39807 + }, + { + "epoch": 2.1399348887683125, + "grad_norm": 3.384286642074585, + "learning_rate": 4.6475263807044e-06, + "loss": 0.1292, + "step": 39808 + }, + { + "epoch": 2.1399484536082474, + "grad_norm": 5.465872764587402, + "learning_rate": 4.647389338084144e-06, + "loss": 0.1719, + "step": 39809 + }, + { + "epoch": 2.1399620184481822, + "grad_norm": 6.873321056365967, + "learning_rate": 4.64725229546389e-06, + "loss": 0.1697, + "step": 39810 + }, + { + "epoch": 2.139975583288117, + "grad_norm": 4.267227649688721, + "learning_rate": 4.6471152528436344e-06, + "loss": 0.173, + "step": 39811 + }, + { + "epoch": 2.139989148128052, + "grad_norm": 6.3762407302856445, + "learning_rate": 4.64697821022338e-06, + "loss": 0.2672, + "step": 39812 + }, + { + "epoch": 2.140002712967987, + "grad_norm": 4.880603313446045, + "learning_rate": 4.646841167603125e-06, + "loss": 0.1956, + "step": 39813 + }, + { + "epoch": 2.1400162778079217, + "grad_norm": 5.948768138885498, + "learning_rate": 4.64670412498287e-06, + "loss": 0.1937, + "step": 39814 + }, + { + "epoch": 2.1400298426478566, + "grad_norm": 3.77250075340271, + "learning_rate": 4.646567082362615e-06, + "loss": 0.109, + "step": 39815 + }, + { + "epoch": 2.1400434074877914, + "grad_norm": 5.885945796966553, + "learning_rate": 4.64643003974236e-06, + "loss": 0.1904, + "step": 39816 + }, + { + "epoch": 2.1400569723277267, + "grad_norm": 6.342895984649658, + "learning_rate": 4.6462929971221055e-06, + "loss": 0.2282, + "step": 39817 + }, + { + "epoch": 2.1400705371676616, + "grad_norm": 3.767580032348633, + "learning_rate": 4.646155954501851e-06, + "loss": 0.2034, + "step": 39818 + }, + { + "epoch": 2.1400841020075965, + "grad_norm": 6.02458381652832, + "learning_rate": 4.646018911881596e-06, + "loss": 0.2135, + "step": 39819 + }, + { + "epoch": 2.1400976668475313, + "grad_norm": 4.154595375061035, + "learning_rate": 4.64588186926134e-06, + "loss": 0.1085, + "step": 39820 + }, + { + "epoch": 2.140111231687466, + "grad_norm": 3.3811123371124268, + "learning_rate": 4.645744826641086e-06, + "loss": 0.1509, + "step": 39821 + }, + { + "epoch": 2.140124796527401, + "grad_norm": 3.174893617630005, + "learning_rate": 4.6456077840208306e-06, + "loss": 0.1231, + "step": 39822 + }, + { + "epoch": 2.140138361367336, + "grad_norm": 3.355027437210083, + "learning_rate": 4.645470741400577e-06, + "loss": 0.1326, + "step": 39823 + }, + { + "epoch": 2.140151926207271, + "grad_norm": 3.8085455894470215, + "learning_rate": 4.645333698780321e-06, + "loss": 0.1253, + "step": 39824 + }, + { + "epoch": 2.1401654910472057, + "grad_norm": 4.4305644035339355, + "learning_rate": 4.645196656160066e-06, + "loss": 0.1462, + "step": 39825 + }, + { + "epoch": 2.1401790558871405, + "grad_norm": 3.5076396465301514, + "learning_rate": 4.645059613539811e-06, + "loss": 0.1258, + "step": 39826 + }, + { + "epoch": 2.1401926207270754, + "grad_norm": 3.9116241931915283, + "learning_rate": 4.6449225709195565e-06, + "loss": 0.1308, + "step": 39827 + }, + { + "epoch": 2.1402061855670103, + "grad_norm": 4.983098983764648, + "learning_rate": 4.644785528299302e-06, + "loss": 0.1565, + "step": 39828 + }, + { + "epoch": 2.140219750406945, + "grad_norm": 4.644775390625, + "learning_rate": 4.644648485679046e-06, + "loss": 0.1797, + "step": 39829 + }, + { + "epoch": 2.14023331524688, + "grad_norm": 6.043132305145264, + "learning_rate": 4.644511443058792e-06, + "loss": 0.2081, + "step": 39830 + }, + { + "epoch": 2.140246880086815, + "grad_norm": 4.408373832702637, + "learning_rate": 4.644374400438536e-06, + "loss": 0.1841, + "step": 39831 + }, + { + "epoch": 2.1402604449267497, + "grad_norm": 5.25991153717041, + "learning_rate": 4.644237357818282e-06, + "loss": 0.1625, + "step": 39832 + }, + { + "epoch": 2.1402740097666846, + "grad_norm": 3.58724045753479, + "learning_rate": 4.644100315198027e-06, + "loss": 0.1985, + "step": 39833 + }, + { + "epoch": 2.1402875746066194, + "grad_norm": 4.231210708618164, + "learning_rate": 4.643963272577772e-06, + "loss": 0.1713, + "step": 39834 + }, + { + "epoch": 2.1403011394465548, + "grad_norm": 4.126837253570557, + "learning_rate": 4.643826229957517e-06, + "loss": 0.2324, + "step": 39835 + }, + { + "epoch": 2.1403147042864896, + "grad_norm": 3.1842753887176514, + "learning_rate": 4.643689187337262e-06, + "loss": 0.0982, + "step": 39836 + }, + { + "epoch": 2.1403282691264245, + "grad_norm": 3.459291934967041, + "learning_rate": 4.643552144717007e-06, + "loss": 0.1216, + "step": 39837 + }, + { + "epoch": 2.1403418339663594, + "grad_norm": 3.758208990097046, + "learning_rate": 4.6434151020967526e-06, + "loss": 0.1781, + "step": 39838 + }, + { + "epoch": 2.140355398806294, + "grad_norm": 4.2595672607421875, + "learning_rate": 4.643278059476498e-06, + "loss": 0.1801, + "step": 39839 + }, + { + "epoch": 2.140368963646229, + "grad_norm": 3.400498390197754, + "learning_rate": 4.643141016856242e-06, + "loss": 0.163, + "step": 39840 + }, + { + "epoch": 2.140382528486164, + "grad_norm": 3.3117587566375732, + "learning_rate": 4.643003974235988e-06, + "loss": 0.1229, + "step": 39841 + }, + { + "epoch": 2.140396093326099, + "grad_norm": 5.83709716796875, + "learning_rate": 4.6428669316157324e-06, + "loss": 0.2497, + "step": 39842 + }, + { + "epoch": 2.1404096581660337, + "grad_norm": 4.387572765350342, + "learning_rate": 4.6427298889954785e-06, + "loss": 0.1863, + "step": 39843 + }, + { + "epoch": 2.1404232230059685, + "grad_norm": 4.316835403442383, + "learning_rate": 4.642592846375223e-06, + "loss": 0.2238, + "step": 39844 + }, + { + "epoch": 2.1404367878459034, + "grad_norm": 4.382418632507324, + "learning_rate": 4.642455803754968e-06, + "loss": 0.2395, + "step": 39845 + }, + { + "epoch": 2.1404503526858383, + "grad_norm": 3.9980106353759766, + "learning_rate": 4.642318761134713e-06, + "loss": 0.1439, + "step": 39846 + }, + { + "epoch": 2.140463917525773, + "grad_norm": 4.468371868133545, + "learning_rate": 4.642181718514458e-06, + "loss": 0.2331, + "step": 39847 + }, + { + "epoch": 2.140477482365708, + "grad_norm": 5.177794933319092, + "learning_rate": 4.6420446758942035e-06, + "loss": 0.2512, + "step": 39848 + }, + { + "epoch": 2.140491047205643, + "grad_norm": 5.566516399383545, + "learning_rate": 4.641907633273949e-06, + "loss": 0.213, + "step": 39849 + }, + { + "epoch": 2.1405046120455777, + "grad_norm": 3.8860836029052734, + "learning_rate": 4.641770590653694e-06, + "loss": 0.0981, + "step": 39850 + }, + { + "epoch": 2.1405181768855126, + "grad_norm": 4.098747253417969, + "learning_rate": 4.641633548033439e-06, + "loss": 0.1791, + "step": 39851 + }, + { + "epoch": 2.1405317417254475, + "grad_norm": 4.501659393310547, + "learning_rate": 4.641496505413184e-06, + "loss": 0.1614, + "step": 39852 + }, + { + "epoch": 2.1405453065653823, + "grad_norm": 4.610231876373291, + "learning_rate": 4.6413594627929286e-06, + "loss": 0.2548, + "step": 39853 + }, + { + "epoch": 2.140558871405317, + "grad_norm": 3.980283737182617, + "learning_rate": 4.641222420172674e-06, + "loss": 0.1856, + "step": 39854 + }, + { + "epoch": 2.1405724362452525, + "grad_norm": 4.678385257720947, + "learning_rate": 4.641085377552419e-06, + "loss": 0.1674, + "step": 39855 + }, + { + "epoch": 2.1405860010851874, + "grad_norm": 5.619427680969238, + "learning_rate": 4.640948334932164e-06, + "loss": 0.2128, + "step": 39856 + }, + { + "epoch": 2.1405995659251222, + "grad_norm": 6.001902103424072, + "learning_rate": 4.640811292311909e-06, + "loss": 0.2628, + "step": 39857 + }, + { + "epoch": 2.140613130765057, + "grad_norm": 3.526944398880005, + "learning_rate": 4.6406742496916545e-06, + "loss": 0.1912, + "step": 39858 + }, + { + "epoch": 2.140626695604992, + "grad_norm": 3.9225940704345703, + "learning_rate": 4.6405372070714e-06, + "loss": 0.13, + "step": 39859 + }, + { + "epoch": 2.140640260444927, + "grad_norm": 5.779692649841309, + "learning_rate": 4.640400164451145e-06, + "loss": 0.2766, + "step": 39860 + }, + { + "epoch": 2.1406538252848617, + "grad_norm": 3.9973716735839844, + "learning_rate": 4.64026312183089e-06, + "loss": 0.1812, + "step": 39861 + }, + { + "epoch": 2.1406673901247966, + "grad_norm": 6.597601890563965, + "learning_rate": 4.640126079210635e-06, + "loss": 0.3034, + "step": 39862 + }, + { + "epoch": 2.1406809549647314, + "grad_norm": 4.031370639801025, + "learning_rate": 4.6399890365903795e-06, + "loss": 0.2214, + "step": 39863 + }, + { + "epoch": 2.1406945198046663, + "grad_norm": 4.727718353271484, + "learning_rate": 4.6398519939701255e-06, + "loss": 0.2232, + "step": 39864 + }, + { + "epoch": 2.140708084644601, + "grad_norm": 4.275757789611816, + "learning_rate": 4.63971495134987e-06, + "loss": 0.1813, + "step": 39865 + }, + { + "epoch": 2.140721649484536, + "grad_norm": 5.847032070159912, + "learning_rate": 4.639577908729616e-06, + "loss": 0.269, + "step": 39866 + }, + { + "epoch": 2.140735214324471, + "grad_norm": 4.290449619293213, + "learning_rate": 4.63944086610936e-06, + "loss": 0.1522, + "step": 39867 + }, + { + "epoch": 2.1407487791644058, + "grad_norm": 4.687516689300537, + "learning_rate": 4.639303823489105e-06, + "loss": 0.2357, + "step": 39868 + }, + { + "epoch": 2.1407623440043406, + "grad_norm": 4.925743103027344, + "learning_rate": 4.639166780868851e-06, + "loss": 0.1544, + "step": 39869 + }, + { + "epoch": 2.1407759088442755, + "grad_norm": 5.53660774230957, + "learning_rate": 4.639029738248596e-06, + "loss": 0.1614, + "step": 39870 + }, + { + "epoch": 2.1407894736842104, + "grad_norm": 5.227466106414795, + "learning_rate": 4.638892695628341e-06, + "loss": 0.1795, + "step": 39871 + }, + { + "epoch": 2.140803038524145, + "grad_norm": 4.402958393096924, + "learning_rate": 4.638755653008086e-06, + "loss": 0.1213, + "step": 39872 + }, + { + "epoch": 2.1408166033640805, + "grad_norm": 4.6850738525390625, + "learning_rate": 4.638618610387831e-06, + "loss": 0.2396, + "step": 39873 + }, + { + "epoch": 2.1408301682040154, + "grad_norm": 3.8538291454315186, + "learning_rate": 4.638481567767576e-06, + "loss": 0.1234, + "step": 39874 + }, + { + "epoch": 2.1408437330439503, + "grad_norm": 6.047572612762451, + "learning_rate": 4.638344525147322e-06, + "loss": 0.1815, + "step": 39875 + }, + { + "epoch": 2.140857297883885, + "grad_norm": 4.71359395980835, + "learning_rate": 4.638207482527066e-06, + "loss": 0.1947, + "step": 39876 + }, + { + "epoch": 2.14087086272382, + "grad_norm": 4.90381383895874, + "learning_rate": 4.638070439906812e-06, + "loss": 0.2547, + "step": 39877 + }, + { + "epoch": 2.140884427563755, + "grad_norm": 6.469487190246582, + "learning_rate": 4.637933397286556e-06, + "loss": 0.2297, + "step": 39878 + }, + { + "epoch": 2.1408979924036897, + "grad_norm": 7.256784439086914, + "learning_rate": 4.6377963546663015e-06, + "loss": 0.2468, + "step": 39879 + }, + { + "epoch": 2.1409115572436246, + "grad_norm": 5.2391357421875, + "learning_rate": 4.637659312046047e-06, + "loss": 0.1389, + "step": 39880 + }, + { + "epoch": 2.1409251220835595, + "grad_norm": 4.220232963562012, + "learning_rate": 4.637522269425792e-06, + "loss": 0.1053, + "step": 39881 + }, + { + "epoch": 2.1409386869234943, + "grad_norm": 3.897761344909668, + "learning_rate": 4.637385226805537e-06, + "loss": 0.1437, + "step": 39882 + }, + { + "epoch": 2.140952251763429, + "grad_norm": 4.886656284332275, + "learning_rate": 4.637248184185281e-06, + "loss": 0.145, + "step": 39883 + }, + { + "epoch": 2.140965816603364, + "grad_norm": 6.081120014190674, + "learning_rate": 4.637111141565027e-06, + "loss": 0.2343, + "step": 39884 + }, + { + "epoch": 2.140979381443299, + "grad_norm": 5.357885837554932, + "learning_rate": 4.636974098944772e-06, + "loss": 0.2186, + "step": 39885 + }, + { + "epoch": 2.1409929462832338, + "grad_norm": 5.878670692443848, + "learning_rate": 4.636837056324518e-06, + "loss": 0.2539, + "step": 39886 + }, + { + "epoch": 2.1410065111231686, + "grad_norm": 6.654757022857666, + "learning_rate": 4.636700013704262e-06, + "loss": 0.2455, + "step": 39887 + }, + { + "epoch": 2.1410200759631035, + "grad_norm": 5.839010715484619, + "learning_rate": 4.636562971084007e-06, + "loss": 0.2671, + "step": 39888 + }, + { + "epoch": 2.1410336408030384, + "grad_norm": 5.071692943572998, + "learning_rate": 4.6364259284637525e-06, + "loss": 0.2054, + "step": 39889 + }, + { + "epoch": 2.1410472056429732, + "grad_norm": 6.346408367156982, + "learning_rate": 4.636288885843498e-06, + "loss": 0.27, + "step": 39890 + }, + { + "epoch": 2.141060770482908, + "grad_norm": 5.297297954559326, + "learning_rate": 4.636151843223243e-06, + "loss": 0.321, + "step": 39891 + }, + { + "epoch": 2.141074335322843, + "grad_norm": 7.277157306671143, + "learning_rate": 4.636014800602988e-06, + "loss": 0.3888, + "step": 39892 + }, + { + "epoch": 2.1410879001627783, + "grad_norm": 4.953599452972412, + "learning_rate": 4.635877757982733e-06, + "loss": 0.1883, + "step": 39893 + }, + { + "epoch": 2.141101465002713, + "grad_norm": 4.560851097106934, + "learning_rate": 4.635740715362478e-06, + "loss": 0.2141, + "step": 39894 + }, + { + "epoch": 2.141115029842648, + "grad_norm": 5.9570465087890625, + "learning_rate": 4.6356036727422235e-06, + "loss": 0.3458, + "step": 39895 + }, + { + "epoch": 2.141128594682583, + "grad_norm": 5.520743370056152, + "learning_rate": 4.635466630121968e-06, + "loss": 0.2951, + "step": 39896 + }, + { + "epoch": 2.1411421595225177, + "grad_norm": 5.662570476531982, + "learning_rate": 4.635329587501713e-06, + "loss": 0.2721, + "step": 39897 + }, + { + "epoch": 2.1411557243624526, + "grad_norm": 4.446808338165283, + "learning_rate": 4.635192544881458e-06, + "loss": 0.3005, + "step": 39898 + }, + { + "epoch": 2.1411692892023875, + "grad_norm": 5.71082878112793, + "learning_rate": 4.635055502261203e-06, + "loss": 0.2256, + "step": 39899 + }, + { + "epoch": 2.1411828540423223, + "grad_norm": 7.101160049438477, + "learning_rate": 4.634918459640949e-06, + "loss": 0.3743, + "step": 39900 + }, + { + "epoch": 2.141196418882257, + "grad_norm": 6.077904224395752, + "learning_rate": 4.634781417020694e-06, + "loss": 0.2041, + "step": 39901 + }, + { + "epoch": 2.141209983722192, + "grad_norm": 5.35332727432251, + "learning_rate": 4.634644374400439e-06, + "loss": 0.2429, + "step": 39902 + }, + { + "epoch": 2.141223548562127, + "grad_norm": 5.186465263366699, + "learning_rate": 4.634507331780184e-06, + "loss": 0.1723, + "step": 39903 + }, + { + "epoch": 2.141237113402062, + "grad_norm": 4.0417799949646, + "learning_rate": 4.634370289159929e-06, + "loss": 0.2008, + "step": 39904 + }, + { + "epoch": 2.1412506782419967, + "grad_norm": 5.51975154876709, + "learning_rate": 4.6342332465396745e-06, + "loss": 0.1626, + "step": 39905 + }, + { + "epoch": 2.1412642430819315, + "grad_norm": 5.50181770324707, + "learning_rate": 4.63409620391942e-06, + "loss": 0.3055, + "step": 39906 + }, + { + "epoch": 2.1412778079218664, + "grad_norm": 4.393970966339111, + "learning_rate": 4.633959161299165e-06, + "loss": 0.1973, + "step": 39907 + }, + { + "epoch": 2.1412913727618013, + "grad_norm": 5.820701599121094, + "learning_rate": 4.633822118678909e-06, + "loss": 0.2536, + "step": 39908 + }, + { + "epoch": 2.141304937601736, + "grad_norm": 4.549198627471924, + "learning_rate": 4.633685076058654e-06, + "loss": 0.1898, + "step": 39909 + }, + { + "epoch": 2.141318502441671, + "grad_norm": 5.351070880889893, + "learning_rate": 4.6335480334383995e-06, + "loss": 0.2003, + "step": 39910 + }, + { + "epoch": 2.1413320672816063, + "grad_norm": 5.959909915924072, + "learning_rate": 4.633410990818145e-06, + "loss": 0.3272, + "step": 39911 + }, + { + "epoch": 2.141345632121541, + "grad_norm": 5.826167106628418, + "learning_rate": 4.63327394819789e-06, + "loss": 0.2248, + "step": 39912 + }, + { + "epoch": 2.141359196961476, + "grad_norm": 4.759383201599121, + "learning_rate": 4.633136905577635e-06, + "loss": 0.1803, + "step": 39913 + }, + { + "epoch": 2.141372761801411, + "grad_norm": 6.899336814880371, + "learning_rate": 4.63299986295738e-06, + "loss": 0.2951, + "step": 39914 + }, + { + "epoch": 2.1413863266413458, + "grad_norm": 10.168981552124023, + "learning_rate": 4.632862820337125e-06, + "loss": 0.5023, + "step": 39915 + }, + { + "epoch": 2.1413998914812806, + "grad_norm": 6.118890762329102, + "learning_rate": 4.632725777716871e-06, + "loss": 0.2519, + "step": 39916 + }, + { + "epoch": 2.1414134563212155, + "grad_norm": 5.182275772094727, + "learning_rate": 4.632588735096615e-06, + "loss": 0.182, + "step": 39917 + }, + { + "epoch": 2.1414270211611504, + "grad_norm": 8.240920066833496, + "learning_rate": 4.632451692476361e-06, + "loss": 0.295, + "step": 39918 + }, + { + "epoch": 2.1414405860010852, + "grad_norm": 7.15145206451416, + "learning_rate": 4.632314649856105e-06, + "loss": 0.2637, + "step": 39919 + }, + { + "epoch": 2.14145415084102, + "grad_norm": 4.5833001136779785, + "learning_rate": 4.632177607235851e-06, + "loss": 0.2241, + "step": 39920 + }, + { + "epoch": 2.141467715680955, + "grad_norm": 5.593506813049316, + "learning_rate": 4.632040564615596e-06, + "loss": 0.2344, + "step": 39921 + }, + { + "epoch": 2.14148128052089, + "grad_norm": 6.731923580169678, + "learning_rate": 4.631903521995341e-06, + "loss": 0.2926, + "step": 39922 + }, + { + "epoch": 2.1414948453608247, + "grad_norm": 6.026403903961182, + "learning_rate": 4.631766479375086e-06, + "loss": 0.1998, + "step": 39923 + }, + { + "epoch": 2.1415084102007595, + "grad_norm": 4.879428386688232, + "learning_rate": 4.631629436754831e-06, + "loss": 0.2408, + "step": 39924 + }, + { + "epoch": 2.1415219750406944, + "grad_norm": 6.741490364074707, + "learning_rate": 4.631492394134576e-06, + "loss": 0.2801, + "step": 39925 + }, + { + "epoch": 2.1415355398806293, + "grad_norm": 5.724935531616211, + "learning_rate": 4.6313553515143215e-06, + "loss": 0.3372, + "step": 39926 + }, + { + "epoch": 2.141549104720564, + "grad_norm": 4.542300701141357, + "learning_rate": 4.631218308894067e-06, + "loss": 0.125, + "step": 39927 + }, + { + "epoch": 2.141562669560499, + "grad_norm": 4.955661773681641, + "learning_rate": 4.631081266273811e-06, + "loss": 0.1058, + "step": 39928 + }, + { + "epoch": 2.141576234400434, + "grad_norm": 5.0551042556762695, + "learning_rate": 4.630944223653557e-06, + "loss": 0.2123, + "step": 39929 + }, + { + "epoch": 2.1415897992403687, + "grad_norm": 3.897388458251953, + "learning_rate": 4.630807181033301e-06, + "loss": 0.1414, + "step": 39930 + }, + { + "epoch": 2.141603364080304, + "grad_norm": 5.761928558349609, + "learning_rate": 4.6306701384130474e-06, + "loss": 0.237, + "step": 39931 + }, + { + "epoch": 2.141616928920239, + "grad_norm": 6.816464424133301, + "learning_rate": 4.630533095792792e-06, + "loss": 0.2668, + "step": 39932 + }, + { + "epoch": 2.141630493760174, + "grad_norm": 5.174253940582275, + "learning_rate": 4.630396053172537e-06, + "loss": 0.1245, + "step": 39933 + }, + { + "epoch": 2.1416440586001086, + "grad_norm": 5.458431243896484, + "learning_rate": 4.630259010552282e-06, + "loss": 0.2226, + "step": 39934 + }, + { + "epoch": 2.1416576234400435, + "grad_norm": 8.756535530090332, + "learning_rate": 4.630121967932027e-06, + "loss": 0.3083, + "step": 39935 + }, + { + "epoch": 2.1416711882799784, + "grad_norm": 4.99292516708374, + "learning_rate": 4.6299849253117725e-06, + "loss": 0.1664, + "step": 39936 + }, + { + "epoch": 2.1416847531199132, + "grad_norm": 7.221317291259766, + "learning_rate": 4.629847882691517e-06, + "loss": 0.3039, + "step": 39937 + }, + { + "epoch": 2.141698317959848, + "grad_norm": 5.311516284942627, + "learning_rate": 4.629710840071263e-06, + "loss": 0.2284, + "step": 39938 + }, + { + "epoch": 2.141711882799783, + "grad_norm": 5.618053436279297, + "learning_rate": 4.629573797451007e-06, + "loss": 0.3714, + "step": 39939 + }, + { + "epoch": 2.141725447639718, + "grad_norm": 6.365248680114746, + "learning_rate": 4.629436754830753e-06, + "loss": 0.2798, + "step": 39940 + }, + { + "epoch": 2.1417390124796527, + "grad_norm": 5.482261657714844, + "learning_rate": 4.6292997122104975e-06, + "loss": 0.1639, + "step": 39941 + }, + { + "epoch": 2.1417525773195876, + "grad_norm": 6.112739562988281, + "learning_rate": 4.629162669590243e-06, + "loss": 0.2138, + "step": 39942 + }, + { + "epoch": 2.1417661421595224, + "grad_norm": 3.8984644412994385, + "learning_rate": 4.629025626969988e-06, + "loss": 0.1139, + "step": 39943 + }, + { + "epoch": 2.1417797069994573, + "grad_norm": 7.118228435516357, + "learning_rate": 4.628888584349733e-06, + "loss": 0.274, + "step": 39944 + }, + { + "epoch": 2.141793271839392, + "grad_norm": 5.9582085609436035, + "learning_rate": 4.628751541729478e-06, + "loss": 0.2748, + "step": 39945 + }, + { + "epoch": 2.141806836679327, + "grad_norm": 6.165850639343262, + "learning_rate": 4.6286144991092234e-06, + "loss": 0.2132, + "step": 39946 + }, + { + "epoch": 2.141820401519262, + "grad_norm": 4.66509485244751, + "learning_rate": 4.628477456488969e-06, + "loss": 0.1812, + "step": 39947 + }, + { + "epoch": 2.1418339663591968, + "grad_norm": 5.584015846252441, + "learning_rate": 4.628340413868714e-06, + "loss": 0.2482, + "step": 39948 + }, + { + "epoch": 2.141847531199132, + "grad_norm": 4.2525715827941895, + "learning_rate": 4.628203371248459e-06, + "loss": 0.2664, + "step": 39949 + }, + { + "epoch": 2.141861096039067, + "grad_norm": 5.865943431854248, + "learning_rate": 4.628066328628203e-06, + "loss": 0.2199, + "step": 39950 + }, + { + "epoch": 2.141874660879002, + "grad_norm": 4.281064033508301, + "learning_rate": 4.6279292860079485e-06, + "loss": 0.1733, + "step": 39951 + }, + { + "epoch": 2.1418882257189367, + "grad_norm": 4.573873043060303, + "learning_rate": 4.627792243387694e-06, + "loss": 0.2736, + "step": 39952 + }, + { + "epoch": 2.1419017905588715, + "grad_norm": 6.0947465896606445, + "learning_rate": 4.627655200767439e-06, + "loss": 0.3048, + "step": 39953 + }, + { + "epoch": 2.1419153553988064, + "grad_norm": 4.725376605987549, + "learning_rate": 4.627518158147184e-06, + "loss": 0.2716, + "step": 39954 + }, + { + "epoch": 2.1419289202387413, + "grad_norm": 5.540759086608887, + "learning_rate": 4.627381115526929e-06, + "loss": 0.2435, + "step": 39955 + }, + { + "epoch": 2.141942485078676, + "grad_norm": 4.680979251861572, + "learning_rate": 4.627244072906674e-06, + "loss": 0.2155, + "step": 39956 + }, + { + "epoch": 2.141956049918611, + "grad_norm": 5.027267932891846, + "learning_rate": 4.6271070302864195e-06, + "loss": 0.2038, + "step": 39957 + }, + { + "epoch": 2.141969614758546, + "grad_norm": 5.652698993682861, + "learning_rate": 4.626969987666165e-06, + "loss": 0.2186, + "step": 39958 + }, + { + "epoch": 2.1419831795984807, + "grad_norm": 5.287587642669678, + "learning_rate": 4.62683294504591e-06, + "loss": 0.1802, + "step": 39959 + }, + { + "epoch": 2.1419967444384156, + "grad_norm": 4.584253787994385, + "learning_rate": 4.626695902425655e-06, + "loss": 0.1724, + "step": 39960 + }, + { + "epoch": 2.1420103092783505, + "grad_norm": 4.593979835510254, + "learning_rate": 4.6265588598054e-06, + "loss": 0.211, + "step": 39961 + }, + { + "epoch": 2.1420238741182853, + "grad_norm": 4.346599578857422, + "learning_rate": 4.626421817185145e-06, + "loss": 0.2052, + "step": 39962 + }, + { + "epoch": 2.14203743895822, + "grad_norm": 5.937996864318848, + "learning_rate": 4.626284774564891e-06, + "loss": 0.2146, + "step": 39963 + }, + { + "epoch": 2.142051003798155, + "grad_norm": 4.775333881378174, + "learning_rate": 4.626147731944635e-06, + "loss": 0.234, + "step": 39964 + }, + { + "epoch": 2.14206456863809, + "grad_norm": 4.890218257904053, + "learning_rate": 4.62601068932438e-06, + "loss": 0.2014, + "step": 39965 + }, + { + "epoch": 2.142078133478025, + "grad_norm": 4.295714378356934, + "learning_rate": 4.625873646704125e-06, + "loss": 0.2681, + "step": 39966 + }, + { + "epoch": 2.1420916983179596, + "grad_norm": 3.4784789085388184, + "learning_rate": 4.6257366040838705e-06, + "loss": 0.1074, + "step": 39967 + }, + { + "epoch": 2.1421052631578945, + "grad_norm": 4.474771976470947, + "learning_rate": 4.625599561463616e-06, + "loss": 0.1408, + "step": 39968 + }, + { + "epoch": 2.14211882799783, + "grad_norm": 3.193904399871826, + "learning_rate": 4.625462518843361e-06, + "loss": 0.1532, + "step": 39969 + }, + { + "epoch": 2.1421323928377647, + "grad_norm": 3.845946788787842, + "learning_rate": 4.625325476223106e-06, + "loss": 0.1448, + "step": 39970 + }, + { + "epoch": 2.1421459576776996, + "grad_norm": 4.106462478637695, + "learning_rate": 4.62518843360285e-06, + "loss": 0.1042, + "step": 39971 + }, + { + "epoch": 2.1421595225176344, + "grad_norm": 6.117673873901367, + "learning_rate": 4.625051390982596e-06, + "loss": 0.2396, + "step": 39972 + }, + { + "epoch": 2.1421730873575693, + "grad_norm": 6.243016242980957, + "learning_rate": 4.624914348362341e-06, + "loss": 0.3362, + "step": 39973 + }, + { + "epoch": 2.142186652197504, + "grad_norm": 4.082874774932861, + "learning_rate": 4.624777305742087e-06, + "loss": 0.158, + "step": 39974 + }, + { + "epoch": 2.142200217037439, + "grad_norm": 6.423291206359863, + "learning_rate": 4.624640263121831e-06, + "loss": 0.2927, + "step": 39975 + }, + { + "epoch": 2.142213781877374, + "grad_norm": 4.358722686767578, + "learning_rate": 4.624503220501576e-06, + "loss": 0.1645, + "step": 39976 + }, + { + "epoch": 2.1422273467173087, + "grad_norm": 6.511684417724609, + "learning_rate": 4.6243661778813214e-06, + "loss": 0.1984, + "step": 39977 + }, + { + "epoch": 2.1422409115572436, + "grad_norm": 4.079348087310791, + "learning_rate": 4.624229135261067e-06, + "loss": 0.1343, + "step": 39978 + }, + { + "epoch": 2.1422544763971785, + "grad_norm": 4.9158501625061035, + "learning_rate": 4.624092092640812e-06, + "loss": 0.1134, + "step": 39979 + }, + { + "epoch": 2.1422680412371133, + "grad_norm": 3.7834994792938232, + "learning_rate": 4.623955050020557e-06, + "loss": 0.1054, + "step": 39980 + }, + { + "epoch": 2.142281606077048, + "grad_norm": 6.24231481552124, + "learning_rate": 4.623818007400302e-06, + "loss": 0.2773, + "step": 39981 + }, + { + "epoch": 2.142295170916983, + "grad_norm": 6.543302059173584, + "learning_rate": 4.6236809647800465e-06, + "loss": 0.2001, + "step": 39982 + }, + { + "epoch": 2.142308735756918, + "grad_norm": 4.646049499511719, + "learning_rate": 4.6235439221597925e-06, + "loss": 0.174, + "step": 39983 + }, + { + "epoch": 2.142322300596853, + "grad_norm": 4.2968268394470215, + "learning_rate": 4.623406879539537e-06, + "loss": 0.1428, + "step": 39984 + }, + { + "epoch": 2.1423358654367877, + "grad_norm": 5.0433478355407715, + "learning_rate": 4.623269836919283e-06, + "loss": 0.1388, + "step": 39985 + }, + { + "epoch": 2.1423494302767225, + "grad_norm": 4.9373250007629395, + "learning_rate": 4.623132794299027e-06, + "loss": 0.214, + "step": 39986 + }, + { + "epoch": 2.142362995116658, + "grad_norm": 5.7019147872924805, + "learning_rate": 4.622995751678772e-06, + "loss": 0.2206, + "step": 39987 + }, + { + "epoch": 2.1423765599565927, + "grad_norm": 4.188137531280518, + "learning_rate": 4.6228587090585175e-06, + "loss": 0.1738, + "step": 39988 + }, + { + "epoch": 2.1423901247965276, + "grad_norm": 4.893375873565674, + "learning_rate": 4.622721666438263e-06, + "loss": 0.1852, + "step": 39989 + }, + { + "epoch": 2.1424036896364624, + "grad_norm": 3.499829053878784, + "learning_rate": 4.622584623818008e-06, + "loss": 0.124, + "step": 39990 + }, + { + "epoch": 2.1424172544763973, + "grad_norm": 5.497139930725098, + "learning_rate": 4.622447581197752e-06, + "loss": 0.1417, + "step": 39991 + }, + { + "epoch": 2.142430819316332, + "grad_norm": 5.424048900604248, + "learning_rate": 4.622310538577498e-06, + "loss": 0.172, + "step": 39992 + }, + { + "epoch": 2.142444384156267, + "grad_norm": 4.707244873046875, + "learning_rate": 4.622173495957243e-06, + "loss": 0.1508, + "step": 39993 + }, + { + "epoch": 2.142457948996202, + "grad_norm": 5.9053192138671875, + "learning_rate": 4.622036453336989e-06, + "loss": 0.1868, + "step": 39994 + }, + { + "epoch": 2.1424715138361368, + "grad_norm": 6.910715579986572, + "learning_rate": 4.621899410716733e-06, + "loss": 0.3034, + "step": 39995 + }, + { + "epoch": 2.1424850786760716, + "grad_norm": 3.5586771965026855, + "learning_rate": 4.621762368096478e-06, + "loss": 0.1612, + "step": 39996 + }, + { + "epoch": 2.1424986435160065, + "grad_norm": 5.155373573303223, + "learning_rate": 4.621625325476223e-06, + "loss": 0.2754, + "step": 39997 + }, + { + "epoch": 2.1425122083559414, + "grad_norm": 6.889332294464111, + "learning_rate": 4.6214882828559685e-06, + "loss": 0.2918, + "step": 39998 + }, + { + "epoch": 2.1425257731958762, + "grad_norm": 6.044712543487549, + "learning_rate": 4.621351240235714e-06, + "loss": 0.2661, + "step": 39999 + }, + { + "epoch": 2.142539338035811, + "grad_norm": 4.104878902435303, + "learning_rate": 4.621214197615459e-06, + "loss": 0.1498, + "step": 40000 + }, + { + "epoch": 2.142552902875746, + "grad_norm": 4.09849739074707, + "learning_rate": 4.621077154995204e-06, + "loss": 0.2353, + "step": 40001 + }, + { + "epoch": 2.142566467715681, + "grad_norm": 4.40244722366333, + "learning_rate": 4.620940112374949e-06, + "loss": 0.187, + "step": 40002 + }, + { + "epoch": 2.1425800325556157, + "grad_norm": 3.3979907035827637, + "learning_rate": 4.620803069754694e-06, + "loss": 0.1382, + "step": 40003 + }, + { + "epoch": 2.1425935973955506, + "grad_norm": 6.296878814697266, + "learning_rate": 4.6206660271344396e-06, + "loss": 0.3579, + "step": 40004 + }, + { + "epoch": 2.1426071622354854, + "grad_norm": 8.318757057189941, + "learning_rate": 4.620528984514184e-06, + "loss": 0.3264, + "step": 40005 + }, + { + "epoch": 2.1426207270754203, + "grad_norm": 6.781933784484863, + "learning_rate": 4.620391941893929e-06, + "loss": 0.2933, + "step": 40006 + }, + { + "epoch": 2.1426342919153556, + "grad_norm": 5.402810096740723, + "learning_rate": 4.620254899273674e-06, + "loss": 0.2078, + "step": 40007 + }, + { + "epoch": 2.1426478567552905, + "grad_norm": 2.791184663772583, + "learning_rate": 4.6201178566534194e-06, + "loss": 0.0932, + "step": 40008 + }, + { + "epoch": 2.1426614215952253, + "grad_norm": 4.282898902893066, + "learning_rate": 4.619980814033165e-06, + "loss": 0.188, + "step": 40009 + }, + { + "epoch": 2.14267498643516, + "grad_norm": 5.079311370849609, + "learning_rate": 4.61984377141291e-06, + "loss": 0.2566, + "step": 40010 + }, + { + "epoch": 2.142688551275095, + "grad_norm": 4.21875, + "learning_rate": 4.619706728792655e-06, + "loss": 0.1339, + "step": 40011 + }, + { + "epoch": 2.14270211611503, + "grad_norm": 4.612615585327148, + "learning_rate": 4.6195696861724e-06, + "loss": 0.1402, + "step": 40012 + }, + { + "epoch": 2.142715680954965, + "grad_norm": 5.149693489074707, + "learning_rate": 4.619432643552145e-06, + "loss": 0.2282, + "step": 40013 + }, + { + "epoch": 2.1427292457948997, + "grad_norm": 5.312527656555176, + "learning_rate": 4.6192956009318905e-06, + "loss": 0.1307, + "step": 40014 + }, + { + "epoch": 2.1427428106348345, + "grad_norm": 4.572501182556152, + "learning_rate": 4.619158558311636e-06, + "loss": 0.1789, + "step": 40015 + }, + { + "epoch": 2.1427563754747694, + "grad_norm": 6.099125385284424, + "learning_rate": 4.61902151569138e-06, + "loss": 0.3071, + "step": 40016 + }, + { + "epoch": 2.1427699403147042, + "grad_norm": 4.263621807098389, + "learning_rate": 4.618884473071126e-06, + "loss": 0.1132, + "step": 40017 + }, + { + "epoch": 2.142783505154639, + "grad_norm": 5.203514575958252, + "learning_rate": 4.61874743045087e-06, + "loss": 0.3095, + "step": 40018 + }, + { + "epoch": 2.142797069994574, + "grad_norm": 5.061432361602783, + "learning_rate": 4.6186103878306156e-06, + "loss": 0.1755, + "step": 40019 + }, + { + "epoch": 2.142810634834509, + "grad_norm": 5.405593395233154, + "learning_rate": 4.618473345210361e-06, + "loss": 0.3603, + "step": 40020 + }, + { + "epoch": 2.1428241996744437, + "grad_norm": 5.458115100860596, + "learning_rate": 4.618336302590106e-06, + "loss": 0.1831, + "step": 40021 + }, + { + "epoch": 2.1428377645143786, + "grad_norm": 5.090479850769043, + "learning_rate": 4.618199259969851e-06, + "loss": 0.2141, + "step": 40022 + }, + { + "epoch": 2.1428513293543134, + "grad_norm": 6.097719192504883, + "learning_rate": 4.618062217349596e-06, + "loss": 0.1475, + "step": 40023 + }, + { + "epoch": 2.1428648941942483, + "grad_norm": 3.398981809616089, + "learning_rate": 4.6179251747293414e-06, + "loss": 0.0999, + "step": 40024 + }, + { + "epoch": 2.1428784590341836, + "grad_norm": 4.064953804016113, + "learning_rate": 4.617788132109086e-06, + "loss": 0.1533, + "step": 40025 + }, + { + "epoch": 2.1428920238741185, + "grad_norm": 4.479978561401367, + "learning_rate": 4.617651089488832e-06, + "loss": 0.2047, + "step": 40026 + }, + { + "epoch": 2.1429055887140533, + "grad_norm": 6.1611504554748535, + "learning_rate": 4.617514046868576e-06, + "loss": 0.222, + "step": 40027 + }, + { + "epoch": 2.142919153553988, + "grad_norm": 4.3523149490356445, + "learning_rate": 4.617377004248322e-06, + "loss": 0.177, + "step": 40028 + }, + { + "epoch": 2.142932718393923, + "grad_norm": 4.296981334686279, + "learning_rate": 4.6172399616280665e-06, + "loss": 0.2141, + "step": 40029 + }, + { + "epoch": 2.142946283233858, + "grad_norm": 4.677918910980225, + "learning_rate": 4.617102919007812e-06, + "loss": 0.2097, + "step": 40030 + }, + { + "epoch": 2.142959848073793, + "grad_norm": 7.458133697509766, + "learning_rate": 4.616965876387557e-06, + "loss": 0.2369, + "step": 40031 + }, + { + "epoch": 2.1429734129137277, + "grad_norm": 5.004186630249023, + "learning_rate": 4.616828833767302e-06, + "loss": 0.1293, + "step": 40032 + }, + { + "epoch": 2.1429869777536625, + "grad_norm": 6.627228736877441, + "learning_rate": 4.616691791147047e-06, + "loss": 0.2342, + "step": 40033 + }, + { + "epoch": 2.1430005425935974, + "grad_norm": 5.869100570678711, + "learning_rate": 4.616554748526792e-06, + "loss": 0.219, + "step": 40034 + }, + { + "epoch": 2.1430141074335323, + "grad_norm": 5.648159503936768, + "learning_rate": 4.6164177059065376e-06, + "loss": 0.179, + "step": 40035 + }, + { + "epoch": 2.143027672273467, + "grad_norm": 8.987680435180664, + "learning_rate": 4.616280663286282e-06, + "loss": 0.3923, + "step": 40036 + }, + { + "epoch": 2.143041237113402, + "grad_norm": 4.955613136291504, + "learning_rate": 4.616143620666028e-06, + "loss": 0.2121, + "step": 40037 + }, + { + "epoch": 2.143054801953337, + "grad_norm": 5.948785781860352, + "learning_rate": 4.616006578045772e-06, + "loss": 0.298, + "step": 40038 + }, + { + "epoch": 2.1430683667932717, + "grad_norm": 5.3418402671813965, + "learning_rate": 4.6158695354255174e-06, + "loss": 0.2665, + "step": 40039 + }, + { + "epoch": 2.1430819316332066, + "grad_norm": 3.9307007789611816, + "learning_rate": 4.615732492805263e-06, + "loss": 0.1763, + "step": 40040 + }, + { + "epoch": 2.1430954964731415, + "grad_norm": 4.770304203033447, + "learning_rate": 4.615595450185008e-06, + "loss": 0.1607, + "step": 40041 + }, + { + "epoch": 2.1431090613130763, + "grad_norm": 5.581554889678955, + "learning_rate": 4.615458407564753e-06, + "loss": 0.1899, + "step": 40042 + }, + { + "epoch": 2.143122626153011, + "grad_norm": 6.880980014801025, + "learning_rate": 4.615321364944498e-06, + "loss": 0.286, + "step": 40043 + }, + { + "epoch": 2.143136190992946, + "grad_norm": 5.664278030395508, + "learning_rate": 4.615184322324243e-06, + "loss": 0.3643, + "step": 40044 + }, + { + "epoch": 2.1431497558328814, + "grad_norm": 4.986600399017334, + "learning_rate": 4.6150472797039885e-06, + "loss": 0.2291, + "step": 40045 + }, + { + "epoch": 2.1431633206728162, + "grad_norm": 5.709635257720947, + "learning_rate": 4.614910237083734e-06, + "loss": 0.3121, + "step": 40046 + }, + { + "epoch": 2.143176885512751, + "grad_norm": 4.80470609664917, + "learning_rate": 4.614773194463478e-06, + "loss": 0.2594, + "step": 40047 + }, + { + "epoch": 2.143190450352686, + "grad_norm": 4.736143589019775, + "learning_rate": 4.614636151843224e-06, + "loss": 0.2203, + "step": 40048 + }, + { + "epoch": 2.143204015192621, + "grad_norm": 4.84102725982666, + "learning_rate": 4.614499109222968e-06, + "loss": 0.1974, + "step": 40049 + }, + { + "epoch": 2.1432175800325557, + "grad_norm": 4.615667343139648, + "learning_rate": 4.6143620666027136e-06, + "loss": 0.173, + "step": 40050 + }, + { + "epoch": 2.1432311448724906, + "grad_norm": 4.421017169952393, + "learning_rate": 4.614225023982459e-06, + "loss": 0.1752, + "step": 40051 + }, + { + "epoch": 2.1432447097124254, + "grad_norm": 3.4795496463775635, + "learning_rate": 4.614087981362204e-06, + "loss": 0.1284, + "step": 40052 + }, + { + "epoch": 2.1432582745523603, + "grad_norm": 6.313527584075928, + "learning_rate": 4.613950938741949e-06, + "loss": 0.238, + "step": 40053 + }, + { + "epoch": 2.143271839392295, + "grad_norm": 6.446451187133789, + "learning_rate": 4.613813896121694e-06, + "loss": 0.2694, + "step": 40054 + }, + { + "epoch": 2.14328540423223, + "grad_norm": 3.899028778076172, + "learning_rate": 4.6136768535014395e-06, + "loss": 0.1114, + "step": 40055 + }, + { + "epoch": 2.143298969072165, + "grad_norm": 3.3655521869659424, + "learning_rate": 4.613539810881185e-06, + "loss": 0.1354, + "step": 40056 + }, + { + "epoch": 2.1433125339120997, + "grad_norm": 3.675729751586914, + "learning_rate": 4.61340276826093e-06, + "loss": 0.19, + "step": 40057 + }, + { + "epoch": 2.1433260987520346, + "grad_norm": 5.448123455047607, + "learning_rate": 4.613265725640675e-06, + "loss": 0.2187, + "step": 40058 + }, + { + "epoch": 2.1433396635919695, + "grad_norm": 3.691209316253662, + "learning_rate": 4.613128683020419e-06, + "loss": 0.1643, + "step": 40059 + }, + { + "epoch": 2.1433532284319043, + "grad_norm": 5.108490467071533, + "learning_rate": 4.6129916404001645e-06, + "loss": 0.2041, + "step": 40060 + }, + { + "epoch": 2.143366793271839, + "grad_norm": 7.086821556091309, + "learning_rate": 4.61285459777991e-06, + "loss": 0.2801, + "step": 40061 + }, + { + "epoch": 2.143380358111774, + "grad_norm": 3.8934786319732666, + "learning_rate": 4.612717555159655e-06, + "loss": 0.1826, + "step": 40062 + }, + { + "epoch": 2.1433939229517094, + "grad_norm": 3.6374781131744385, + "learning_rate": 4.6125805125394e-06, + "loss": 0.1233, + "step": 40063 + }, + { + "epoch": 2.1434074877916443, + "grad_norm": 5.683920383453369, + "learning_rate": 4.612443469919145e-06, + "loss": 0.2026, + "step": 40064 + }, + { + "epoch": 2.143421052631579, + "grad_norm": 4.7279863357543945, + "learning_rate": 4.61230642729889e-06, + "loss": 0.1556, + "step": 40065 + }, + { + "epoch": 2.143434617471514, + "grad_norm": 5.281250953674316, + "learning_rate": 4.6121693846786356e-06, + "loss": 0.1814, + "step": 40066 + }, + { + "epoch": 2.143448182311449, + "grad_norm": 5.375067234039307, + "learning_rate": 4.612032342058381e-06, + "loss": 0.2162, + "step": 40067 + }, + { + "epoch": 2.1434617471513837, + "grad_norm": 4.5845112800598145, + "learning_rate": 4.611895299438126e-06, + "loss": 0.1301, + "step": 40068 + }, + { + "epoch": 2.1434753119913186, + "grad_norm": 6.220509052276611, + "learning_rate": 4.611758256817871e-06, + "loss": 0.2278, + "step": 40069 + }, + { + "epoch": 2.1434888768312534, + "grad_norm": 3.446636199951172, + "learning_rate": 4.6116212141976154e-06, + "loss": 0.1578, + "step": 40070 + }, + { + "epoch": 2.1435024416711883, + "grad_norm": 5.710628986358643, + "learning_rate": 4.6114841715773615e-06, + "loss": 0.1371, + "step": 40071 + }, + { + "epoch": 2.143516006511123, + "grad_norm": 5.572876930236816, + "learning_rate": 4.611347128957106e-06, + "loss": 0.1584, + "step": 40072 + }, + { + "epoch": 2.143529571351058, + "grad_norm": 5.9507622718811035, + "learning_rate": 4.611210086336852e-06, + "loss": 0.1684, + "step": 40073 + }, + { + "epoch": 2.143543136190993, + "grad_norm": 4.751129150390625, + "learning_rate": 4.611073043716596e-06, + "loss": 0.2015, + "step": 40074 + }, + { + "epoch": 2.1435567010309278, + "grad_norm": 5.459766387939453, + "learning_rate": 4.610936001096341e-06, + "loss": 0.1845, + "step": 40075 + }, + { + "epoch": 2.1435702658708626, + "grad_norm": 3.899217128753662, + "learning_rate": 4.6107989584760865e-06, + "loss": 0.1741, + "step": 40076 + }, + { + "epoch": 2.1435838307107975, + "grad_norm": 4.592180252075195, + "learning_rate": 4.610661915855832e-06, + "loss": 0.1124, + "step": 40077 + }, + { + "epoch": 2.1435973955507324, + "grad_norm": 4.468812942504883, + "learning_rate": 4.610524873235577e-06, + "loss": 0.1566, + "step": 40078 + }, + { + "epoch": 2.1436109603906672, + "grad_norm": 4.454852104187012, + "learning_rate": 4.610387830615321e-06, + "loss": 0.1991, + "step": 40079 + }, + { + "epoch": 2.143624525230602, + "grad_norm": 4.581943511962891, + "learning_rate": 4.610250787995067e-06, + "loss": 0.2765, + "step": 40080 + }, + { + "epoch": 2.143638090070537, + "grad_norm": 4.475978374481201, + "learning_rate": 4.6101137453748116e-06, + "loss": 0.136, + "step": 40081 + }, + { + "epoch": 2.143651654910472, + "grad_norm": 6.375812530517578, + "learning_rate": 4.609976702754558e-06, + "loss": 0.1991, + "step": 40082 + }, + { + "epoch": 2.143665219750407, + "grad_norm": 5.672956943511963, + "learning_rate": 4.609839660134302e-06, + "loss": 0.1463, + "step": 40083 + }, + { + "epoch": 2.143678784590342, + "grad_norm": 6.833249568939209, + "learning_rate": 4.609702617514047e-06, + "loss": 0.2001, + "step": 40084 + }, + { + "epoch": 2.143692349430277, + "grad_norm": 5.819624423980713, + "learning_rate": 4.609565574893792e-06, + "loss": 0.1828, + "step": 40085 + }, + { + "epoch": 2.1437059142702117, + "grad_norm": 5.042881965637207, + "learning_rate": 4.6094285322735375e-06, + "loss": 0.1296, + "step": 40086 + }, + { + "epoch": 2.1437194791101466, + "grad_norm": 5.082019329071045, + "learning_rate": 4.609291489653283e-06, + "loss": 0.1153, + "step": 40087 + }, + { + "epoch": 2.1437330439500815, + "grad_norm": 3.3165218830108643, + "learning_rate": 4.609154447033027e-06, + "loss": 0.0676, + "step": 40088 + }, + { + "epoch": 2.1437466087900163, + "grad_norm": 6.363658905029297, + "learning_rate": 4.609017404412773e-06, + "loss": 0.224, + "step": 40089 + }, + { + "epoch": 2.143760173629951, + "grad_norm": 4.666301250457764, + "learning_rate": 4.608880361792517e-06, + "loss": 0.206, + "step": 40090 + }, + { + "epoch": 2.143773738469886, + "grad_norm": 6.475952625274658, + "learning_rate": 4.608743319172263e-06, + "loss": 0.2561, + "step": 40091 + }, + { + "epoch": 2.143787303309821, + "grad_norm": 4.7191362380981445, + "learning_rate": 4.608606276552008e-06, + "loss": 0.2416, + "step": 40092 + }, + { + "epoch": 2.143800868149756, + "grad_norm": 5.727900981903076, + "learning_rate": 4.608469233931753e-06, + "loss": 0.2527, + "step": 40093 + }, + { + "epoch": 2.1438144329896907, + "grad_norm": 5.9292731285095215, + "learning_rate": 4.608332191311498e-06, + "loss": 0.2426, + "step": 40094 + }, + { + "epoch": 2.1438279978296255, + "grad_norm": 3.6869022846221924, + "learning_rate": 4.608195148691243e-06, + "loss": 0.1202, + "step": 40095 + }, + { + "epoch": 2.1438415626695604, + "grad_norm": 5.705957889556885, + "learning_rate": 4.608058106070988e-06, + "loss": 0.1687, + "step": 40096 + }, + { + "epoch": 2.1438551275094953, + "grad_norm": 4.508289337158203, + "learning_rate": 4.607921063450734e-06, + "loss": 0.2655, + "step": 40097 + }, + { + "epoch": 2.14386869234943, + "grad_norm": 5.489712715148926, + "learning_rate": 4.607784020830479e-06, + "loss": 0.3175, + "step": 40098 + }, + { + "epoch": 2.143882257189365, + "grad_norm": 6.465664386749268, + "learning_rate": 4.607646978210224e-06, + "loss": 0.294, + "step": 40099 + }, + { + "epoch": 2.1438958220293, + "grad_norm": 6.264527320861816, + "learning_rate": 4.607509935589969e-06, + "loss": 0.2268, + "step": 40100 + }, + { + "epoch": 2.143909386869235, + "grad_norm": 5.421563148498535, + "learning_rate": 4.6073728929697134e-06, + "loss": 0.2718, + "step": 40101 + }, + { + "epoch": 2.14392295170917, + "grad_norm": 5.303670883178711, + "learning_rate": 4.6072358503494595e-06, + "loss": 0.3425, + "step": 40102 + }, + { + "epoch": 2.143936516549105, + "grad_norm": 4.6221818923950195, + "learning_rate": 4.607098807729204e-06, + "loss": 0.2426, + "step": 40103 + }, + { + "epoch": 2.1439500813890398, + "grad_norm": 5.135021209716797, + "learning_rate": 4.606961765108949e-06, + "loss": 0.2288, + "step": 40104 + }, + { + "epoch": 2.1439636462289746, + "grad_norm": 4.840891361236572, + "learning_rate": 4.606824722488694e-06, + "loss": 0.2718, + "step": 40105 + }, + { + "epoch": 2.1439772110689095, + "grad_norm": 4.120151519775391, + "learning_rate": 4.606687679868439e-06, + "loss": 0.1241, + "step": 40106 + }, + { + "epoch": 2.1439907759088443, + "grad_norm": 5.197017192840576, + "learning_rate": 4.6065506372481845e-06, + "loss": 0.1963, + "step": 40107 + }, + { + "epoch": 2.144004340748779, + "grad_norm": 5.275091648101807, + "learning_rate": 4.60641359462793e-06, + "loss": 0.2889, + "step": 40108 + }, + { + "epoch": 2.144017905588714, + "grad_norm": 4.433077335357666, + "learning_rate": 4.606276552007675e-06, + "loss": 0.1748, + "step": 40109 + }, + { + "epoch": 2.144031470428649, + "grad_norm": 4.286155700683594, + "learning_rate": 4.60613950938742e-06, + "loss": 0.1995, + "step": 40110 + }, + { + "epoch": 2.144045035268584, + "grad_norm": 4.257163047790527, + "learning_rate": 4.606002466767165e-06, + "loss": 0.1309, + "step": 40111 + }, + { + "epoch": 2.1440586001085187, + "grad_norm": 6.606256008148193, + "learning_rate": 4.60586542414691e-06, + "loss": 0.2736, + "step": 40112 + }, + { + "epoch": 2.1440721649484535, + "grad_norm": 5.779318809509277, + "learning_rate": 4.605728381526655e-06, + "loss": 0.2638, + "step": 40113 + }, + { + "epoch": 2.1440857297883884, + "grad_norm": 4.666441917419434, + "learning_rate": 4.605591338906401e-06, + "loss": 0.2661, + "step": 40114 + }, + { + "epoch": 2.1440992946283233, + "grad_norm": 5.149387359619141, + "learning_rate": 4.605454296286145e-06, + "loss": 0.1783, + "step": 40115 + }, + { + "epoch": 2.144112859468258, + "grad_norm": 4.6167168617248535, + "learning_rate": 4.60531725366589e-06, + "loss": 0.2825, + "step": 40116 + }, + { + "epoch": 2.144126424308193, + "grad_norm": 6.304187774658203, + "learning_rate": 4.6051802110456355e-06, + "loss": 0.2546, + "step": 40117 + }, + { + "epoch": 2.144139989148128, + "grad_norm": 5.323145866394043, + "learning_rate": 4.605043168425381e-06, + "loss": 0.2128, + "step": 40118 + }, + { + "epoch": 2.1441535539880627, + "grad_norm": 5.527437686920166, + "learning_rate": 4.604906125805126e-06, + "loss": 0.3116, + "step": 40119 + }, + { + "epoch": 2.1441671188279976, + "grad_norm": 5.23090934753418, + "learning_rate": 4.604769083184871e-06, + "loss": 0.2266, + "step": 40120 + }, + { + "epoch": 2.144180683667933, + "grad_norm": 5.550455093383789, + "learning_rate": 4.604632040564616e-06, + "loss": 0.2121, + "step": 40121 + }, + { + "epoch": 2.1441942485078678, + "grad_norm": 4.1283040046691895, + "learning_rate": 4.604494997944361e-06, + "loss": 0.2208, + "step": 40122 + }, + { + "epoch": 2.1442078133478026, + "grad_norm": 4.693509101867676, + "learning_rate": 4.6043579553241065e-06, + "loss": 0.2154, + "step": 40123 + }, + { + "epoch": 2.1442213781877375, + "grad_norm": 5.431735992431641, + "learning_rate": 4.604220912703851e-06, + "loss": 0.2568, + "step": 40124 + }, + { + "epoch": 2.1442349430276724, + "grad_norm": 5.267093658447266, + "learning_rate": 4.604083870083597e-06, + "loss": 0.191, + "step": 40125 + }, + { + "epoch": 2.1442485078676072, + "grad_norm": 6.374930381774902, + "learning_rate": 4.603946827463341e-06, + "loss": 0.2535, + "step": 40126 + }, + { + "epoch": 2.144262072707542, + "grad_norm": 7.246140480041504, + "learning_rate": 4.603809784843087e-06, + "loss": 0.1994, + "step": 40127 + }, + { + "epoch": 2.144275637547477, + "grad_norm": 5.778570175170898, + "learning_rate": 4.603672742222832e-06, + "loss": 0.1997, + "step": 40128 + }, + { + "epoch": 2.144289202387412, + "grad_norm": 5.460764408111572, + "learning_rate": 4.603535699602577e-06, + "loss": 0.1946, + "step": 40129 + }, + { + "epoch": 2.1443027672273467, + "grad_norm": 5.597554683685303, + "learning_rate": 4.603398656982322e-06, + "loss": 0.275, + "step": 40130 + }, + { + "epoch": 2.1443163320672816, + "grad_norm": 8.123467445373535, + "learning_rate": 4.603261614362067e-06, + "loss": 0.1946, + "step": 40131 + }, + { + "epoch": 2.1443298969072164, + "grad_norm": 6.9000372886657715, + "learning_rate": 4.603124571741812e-06, + "loss": 0.3034, + "step": 40132 + }, + { + "epoch": 2.1443434617471513, + "grad_norm": 7.605280876159668, + "learning_rate": 4.602987529121557e-06, + "loss": 0.2966, + "step": 40133 + }, + { + "epoch": 2.144357026587086, + "grad_norm": 4.472770690917969, + "learning_rate": 4.602850486501303e-06, + "loss": 0.2141, + "step": 40134 + }, + { + "epoch": 2.144370591427021, + "grad_norm": 4.38009786605835, + "learning_rate": 4.602713443881047e-06, + "loss": 0.1904, + "step": 40135 + }, + { + "epoch": 2.144384156266956, + "grad_norm": 10.553962707519531, + "learning_rate": 4.602576401260793e-06, + "loss": 0.3268, + "step": 40136 + }, + { + "epoch": 2.1443977211068908, + "grad_norm": 8.466856956481934, + "learning_rate": 4.602439358640537e-06, + "loss": 0.3667, + "step": 40137 + }, + { + "epoch": 2.144411285946826, + "grad_norm": 8.951024055480957, + "learning_rate": 4.6023023160202825e-06, + "loss": 0.3267, + "step": 40138 + }, + { + "epoch": 2.144424850786761, + "grad_norm": 5.177118301391602, + "learning_rate": 4.602165273400028e-06, + "loss": 0.1991, + "step": 40139 + }, + { + "epoch": 2.144438415626696, + "grad_norm": 5.513246536254883, + "learning_rate": 4.602028230779773e-06, + "loss": 0.2376, + "step": 40140 + }, + { + "epoch": 2.1444519804666307, + "grad_norm": 6.059178829193115, + "learning_rate": 4.601891188159518e-06, + "loss": 0.1934, + "step": 40141 + }, + { + "epoch": 2.1444655453065655, + "grad_norm": 7.210268974304199, + "learning_rate": 4.601754145539263e-06, + "loss": 0.217, + "step": 40142 + }, + { + "epoch": 2.1444791101465004, + "grad_norm": 5.851752758026123, + "learning_rate": 4.601617102919008e-06, + "loss": 0.2399, + "step": 40143 + }, + { + "epoch": 2.1444926749864353, + "grad_norm": 5.679160118103027, + "learning_rate": 4.601480060298753e-06, + "loss": 0.2172, + "step": 40144 + }, + { + "epoch": 2.14450623982637, + "grad_norm": 4.48167610168457, + "learning_rate": 4.601343017678499e-06, + "loss": 0.2265, + "step": 40145 + }, + { + "epoch": 2.144519804666305, + "grad_norm": 4.893650054931641, + "learning_rate": 4.601205975058243e-06, + "loss": 0.2162, + "step": 40146 + }, + { + "epoch": 2.14453336950624, + "grad_norm": 5.302924633026123, + "learning_rate": 4.601068932437988e-06, + "loss": 0.1719, + "step": 40147 + }, + { + "epoch": 2.1445469343461747, + "grad_norm": 4.83583402633667, + "learning_rate": 4.6009318898177335e-06, + "loss": 0.2512, + "step": 40148 + }, + { + "epoch": 2.1445604991861096, + "grad_norm": 4.7036895751953125, + "learning_rate": 4.600794847197479e-06, + "loss": 0.2116, + "step": 40149 + }, + { + "epoch": 2.1445740640260444, + "grad_norm": 4.70997953414917, + "learning_rate": 4.600657804577224e-06, + "loss": 0.1599, + "step": 40150 + }, + { + "epoch": 2.1445876288659793, + "grad_norm": 6.140117645263672, + "learning_rate": 4.600520761956969e-06, + "loss": 0.3591, + "step": 40151 + }, + { + "epoch": 2.144601193705914, + "grad_norm": 6.1566643714904785, + "learning_rate": 4.600383719336714e-06, + "loss": 0.2672, + "step": 40152 + }, + { + "epoch": 2.144614758545849, + "grad_norm": 4.970644474029541, + "learning_rate": 4.600246676716459e-06, + "loss": 0.1816, + "step": 40153 + }, + { + "epoch": 2.144628323385784, + "grad_norm": 4.950714588165283, + "learning_rate": 4.6001096340962045e-06, + "loss": 0.1545, + "step": 40154 + }, + { + "epoch": 2.1446418882257188, + "grad_norm": 3.986055374145508, + "learning_rate": 4.59997259147595e-06, + "loss": 0.1885, + "step": 40155 + }, + { + "epoch": 2.1446554530656536, + "grad_norm": 5.06326150894165, + "learning_rate": 4.599835548855695e-06, + "loss": 0.224, + "step": 40156 + }, + { + "epoch": 2.1446690179055885, + "grad_norm": 4.739994525909424, + "learning_rate": 4.599698506235439e-06, + "loss": 0.1803, + "step": 40157 + }, + { + "epoch": 2.1446825827455234, + "grad_norm": 5.472442150115967, + "learning_rate": 4.599561463615184e-06, + "loss": 0.1617, + "step": 40158 + }, + { + "epoch": 2.1446961475854587, + "grad_norm": 3.4940907955169678, + "learning_rate": 4.59942442099493e-06, + "loss": 0.1163, + "step": 40159 + }, + { + "epoch": 2.1447097124253935, + "grad_norm": 5.165465831756592, + "learning_rate": 4.599287378374675e-06, + "loss": 0.1664, + "step": 40160 + }, + { + "epoch": 2.1447232772653284, + "grad_norm": 4.312630653381348, + "learning_rate": 4.59915033575442e-06, + "loss": 0.1696, + "step": 40161 + }, + { + "epoch": 2.1447368421052633, + "grad_norm": 3.490729570388794, + "learning_rate": 4.599013293134165e-06, + "loss": 0.1241, + "step": 40162 + }, + { + "epoch": 2.144750406945198, + "grad_norm": 4.978487968444824, + "learning_rate": 4.59887625051391e-06, + "loss": 0.2335, + "step": 40163 + }, + { + "epoch": 2.144763971785133, + "grad_norm": 4.706657409667969, + "learning_rate": 4.5987392078936555e-06, + "loss": 0.2283, + "step": 40164 + }, + { + "epoch": 2.144777536625068, + "grad_norm": 5.108009338378906, + "learning_rate": 4.598602165273401e-06, + "loss": 0.215, + "step": 40165 + }, + { + "epoch": 2.1447911014650027, + "grad_norm": 5.69379186630249, + "learning_rate": 4.598465122653146e-06, + "loss": 0.3368, + "step": 40166 + }, + { + "epoch": 2.1448046663049376, + "grad_norm": 5.349879741668701, + "learning_rate": 4.59832808003289e-06, + "loss": 0.2293, + "step": 40167 + }, + { + "epoch": 2.1448182311448725, + "grad_norm": 5.8448333740234375, + "learning_rate": 4.598191037412636e-06, + "loss": 0.2852, + "step": 40168 + }, + { + "epoch": 2.1448317959848073, + "grad_norm": 6.152456760406494, + "learning_rate": 4.5980539947923805e-06, + "loss": 0.3057, + "step": 40169 + }, + { + "epoch": 2.144845360824742, + "grad_norm": 5.463896751403809, + "learning_rate": 4.597916952172126e-06, + "loss": 0.2984, + "step": 40170 + }, + { + "epoch": 2.144858925664677, + "grad_norm": 6.272143363952637, + "learning_rate": 4.597779909551871e-06, + "loss": 0.2622, + "step": 40171 + }, + { + "epoch": 2.144872490504612, + "grad_norm": 3.856123208999634, + "learning_rate": 4.597642866931616e-06, + "loss": 0.1415, + "step": 40172 + }, + { + "epoch": 2.144886055344547, + "grad_norm": 4.1763081550598145, + "learning_rate": 4.597505824311361e-06, + "loss": 0.2098, + "step": 40173 + }, + { + "epoch": 2.1448996201844817, + "grad_norm": 4.797016143798828, + "learning_rate": 4.597368781691106e-06, + "loss": 0.2128, + "step": 40174 + }, + { + "epoch": 2.1449131850244165, + "grad_norm": 4.898586273193359, + "learning_rate": 4.597231739070852e-06, + "loss": 0.1905, + "step": 40175 + }, + { + "epoch": 2.144926749864352, + "grad_norm": 5.780462265014648, + "learning_rate": 4.597094696450597e-06, + "loss": 0.294, + "step": 40176 + }, + { + "epoch": 2.1449403147042867, + "grad_norm": 5.895339012145996, + "learning_rate": 4.596957653830342e-06, + "loss": 0.2237, + "step": 40177 + }, + { + "epoch": 2.1449538795442216, + "grad_norm": 3.3777084350585938, + "learning_rate": 4.596820611210086e-06, + "loss": 0.1592, + "step": 40178 + }, + { + "epoch": 2.1449674443841564, + "grad_norm": 5.016540050506592, + "learning_rate": 4.596683568589832e-06, + "loss": 0.2092, + "step": 40179 + }, + { + "epoch": 2.1449810092240913, + "grad_norm": 6.022111415863037, + "learning_rate": 4.596546525969577e-06, + "loss": 0.2572, + "step": 40180 + }, + { + "epoch": 2.144994574064026, + "grad_norm": 4.830803394317627, + "learning_rate": 4.596409483349322e-06, + "loss": 0.1541, + "step": 40181 + }, + { + "epoch": 2.145008138903961, + "grad_norm": 4.1986165046691895, + "learning_rate": 4.596272440729067e-06, + "loss": 0.2573, + "step": 40182 + }, + { + "epoch": 2.145021703743896, + "grad_norm": 4.599698543548584, + "learning_rate": 4.596135398108812e-06, + "loss": 0.1726, + "step": 40183 + }, + { + "epoch": 2.1450352685838308, + "grad_norm": 6.213188171386719, + "learning_rate": 4.595998355488557e-06, + "loss": 0.2768, + "step": 40184 + }, + { + "epoch": 2.1450488334237656, + "grad_norm": 5.599979400634766, + "learning_rate": 4.5958613128683025e-06, + "loss": 0.2607, + "step": 40185 + }, + { + "epoch": 2.1450623982637005, + "grad_norm": 3.7881662845611572, + "learning_rate": 4.595724270248048e-06, + "loss": 0.1609, + "step": 40186 + }, + { + "epoch": 2.1450759631036354, + "grad_norm": 3.5470237731933594, + "learning_rate": 4.595587227627792e-06, + "loss": 0.1475, + "step": 40187 + }, + { + "epoch": 2.14508952794357, + "grad_norm": 4.838249683380127, + "learning_rate": 4.595450185007538e-06, + "loss": 0.2164, + "step": 40188 + }, + { + "epoch": 2.145103092783505, + "grad_norm": 5.760763168334961, + "learning_rate": 4.595313142387282e-06, + "loss": 0.287, + "step": 40189 + }, + { + "epoch": 2.14511665762344, + "grad_norm": 7.089673042297363, + "learning_rate": 4.5951760997670284e-06, + "loss": 0.2455, + "step": 40190 + }, + { + "epoch": 2.145130222463375, + "grad_norm": 6.104663848876953, + "learning_rate": 4.595039057146773e-06, + "loss": 0.1991, + "step": 40191 + }, + { + "epoch": 2.1451437873033097, + "grad_norm": 3.6483371257781982, + "learning_rate": 4.594902014526518e-06, + "loss": 0.1408, + "step": 40192 + }, + { + "epoch": 2.1451573521432445, + "grad_norm": 4.19059419631958, + "learning_rate": 4.594764971906263e-06, + "loss": 0.1789, + "step": 40193 + }, + { + "epoch": 2.1451709169831794, + "grad_norm": 6.147806167602539, + "learning_rate": 4.594627929286008e-06, + "loss": 0.1234, + "step": 40194 + }, + { + "epoch": 2.1451844818231143, + "grad_norm": 4.484314441680908, + "learning_rate": 4.5944908866657535e-06, + "loss": 0.1583, + "step": 40195 + }, + { + "epoch": 2.145198046663049, + "grad_norm": 6.0619988441467285, + "learning_rate": 4.594353844045499e-06, + "loss": 0.1587, + "step": 40196 + }, + { + "epoch": 2.1452116115029845, + "grad_norm": 4.833974361419678, + "learning_rate": 4.594216801425244e-06, + "loss": 0.2527, + "step": 40197 + }, + { + "epoch": 2.1452251763429193, + "grad_norm": 5.0823588371276855, + "learning_rate": 4.594079758804988e-06, + "loss": 0.138, + "step": 40198 + }, + { + "epoch": 2.145238741182854, + "grad_norm": 4.005243301391602, + "learning_rate": 4.593942716184734e-06, + "loss": 0.1508, + "step": 40199 + }, + { + "epoch": 2.145252306022789, + "grad_norm": 4.008980751037598, + "learning_rate": 4.5938056735644785e-06, + "loss": 0.1048, + "step": 40200 + }, + { + "epoch": 2.145265870862724, + "grad_norm": 4.778387546539307, + "learning_rate": 4.593668630944224e-06, + "loss": 0.2313, + "step": 40201 + }, + { + "epoch": 2.1452794357026588, + "grad_norm": 5.273024559020996, + "learning_rate": 4.593531588323969e-06, + "loss": 0.1724, + "step": 40202 + }, + { + "epoch": 2.1452930005425936, + "grad_norm": 4.300991058349609, + "learning_rate": 4.593394545703714e-06, + "loss": 0.1572, + "step": 40203 + }, + { + "epoch": 2.1453065653825285, + "grad_norm": 5.602163314819336, + "learning_rate": 4.593257503083459e-06, + "loss": 0.1482, + "step": 40204 + }, + { + "epoch": 2.1453201302224634, + "grad_norm": 3.8217477798461914, + "learning_rate": 4.5931204604632044e-06, + "loss": 0.1518, + "step": 40205 + }, + { + "epoch": 2.1453336950623982, + "grad_norm": 5.653223991394043, + "learning_rate": 4.59298341784295e-06, + "loss": 0.2383, + "step": 40206 + }, + { + "epoch": 2.145347259902333, + "grad_norm": 6.072429180145264, + "learning_rate": 4.592846375222695e-06, + "loss": 0.2137, + "step": 40207 + }, + { + "epoch": 2.145360824742268, + "grad_norm": 5.324048042297363, + "learning_rate": 4.59270933260244e-06, + "loss": 0.2082, + "step": 40208 + }, + { + "epoch": 2.145374389582203, + "grad_norm": 4.648419380187988, + "learning_rate": 4.592572289982185e-06, + "loss": 0.2265, + "step": 40209 + }, + { + "epoch": 2.1453879544221377, + "grad_norm": 4.680304050445557, + "learning_rate": 4.59243524736193e-06, + "loss": 0.1567, + "step": 40210 + }, + { + "epoch": 2.1454015192620726, + "grad_norm": 4.903381824493408, + "learning_rate": 4.592298204741675e-06, + "loss": 0.1835, + "step": 40211 + }, + { + "epoch": 2.1454150841020074, + "grad_norm": 5.358527660369873, + "learning_rate": 4.59216116212142e-06, + "loss": 0.2099, + "step": 40212 + }, + { + "epoch": 2.1454286489419423, + "grad_norm": 6.59999942779541, + "learning_rate": 4.592024119501165e-06, + "loss": 0.1947, + "step": 40213 + }, + { + "epoch": 2.1454422137818776, + "grad_norm": 7.086155891418457, + "learning_rate": 4.59188707688091e-06, + "loss": 0.3233, + "step": 40214 + }, + { + "epoch": 2.1454557786218125, + "grad_norm": 5.749699115753174, + "learning_rate": 4.591750034260655e-06, + "loss": 0.181, + "step": 40215 + }, + { + "epoch": 2.1454693434617473, + "grad_norm": 4.613423824310303, + "learning_rate": 4.5916129916404005e-06, + "loss": 0.1989, + "step": 40216 + }, + { + "epoch": 2.145482908301682, + "grad_norm": 8.48485279083252, + "learning_rate": 4.591475949020146e-06, + "loss": 0.3143, + "step": 40217 + }, + { + "epoch": 2.145496473141617, + "grad_norm": 5.0754218101501465, + "learning_rate": 4.591338906399891e-06, + "loss": 0.1778, + "step": 40218 + }, + { + "epoch": 2.145510037981552, + "grad_norm": 4.619549751281738, + "learning_rate": 4.591201863779636e-06, + "loss": 0.2655, + "step": 40219 + }, + { + "epoch": 2.145523602821487, + "grad_norm": 3.8885703086853027, + "learning_rate": 4.591064821159381e-06, + "loss": 0.1784, + "step": 40220 + }, + { + "epoch": 2.1455371676614217, + "grad_norm": 3.8169143199920654, + "learning_rate": 4.590927778539126e-06, + "loss": 0.18, + "step": 40221 + }, + { + "epoch": 2.1455507325013565, + "grad_norm": 5.396047592163086, + "learning_rate": 4.590790735918872e-06, + "loss": 0.2344, + "step": 40222 + }, + { + "epoch": 2.1455642973412914, + "grad_norm": 5.134000301361084, + "learning_rate": 4.590653693298616e-06, + "loss": 0.2475, + "step": 40223 + }, + { + "epoch": 2.1455778621812263, + "grad_norm": 4.10040283203125, + "learning_rate": 4.590516650678362e-06, + "loss": 0.1887, + "step": 40224 + }, + { + "epoch": 2.145591427021161, + "grad_norm": 6.19217586517334, + "learning_rate": 4.590379608058106e-06, + "loss": 0.1842, + "step": 40225 + }, + { + "epoch": 2.145604991861096, + "grad_norm": 4.8931403160095215, + "learning_rate": 4.5902425654378515e-06, + "loss": 0.1205, + "step": 40226 + }, + { + "epoch": 2.145618556701031, + "grad_norm": 5.029970645904541, + "learning_rate": 4.590105522817597e-06, + "loss": 0.1977, + "step": 40227 + }, + { + "epoch": 2.1456321215409657, + "grad_norm": 3.3693652153015137, + "learning_rate": 4.589968480197342e-06, + "loss": 0.1273, + "step": 40228 + }, + { + "epoch": 2.1456456863809006, + "grad_norm": 6.481091022491455, + "learning_rate": 4.589831437577087e-06, + "loss": 0.2653, + "step": 40229 + }, + { + "epoch": 2.1456592512208355, + "grad_norm": 6.139348030090332, + "learning_rate": 4.589694394956831e-06, + "loss": 0.3591, + "step": 40230 + }, + { + "epoch": 2.1456728160607703, + "grad_norm": 4.091365814208984, + "learning_rate": 4.589557352336577e-06, + "loss": 0.1672, + "step": 40231 + }, + { + "epoch": 2.145686380900705, + "grad_norm": 5.897311687469482, + "learning_rate": 4.589420309716322e-06, + "loss": 0.1776, + "step": 40232 + }, + { + "epoch": 2.14569994574064, + "grad_norm": 4.8527069091796875, + "learning_rate": 4.589283267096068e-06, + "loss": 0.1723, + "step": 40233 + }, + { + "epoch": 2.1457135105805754, + "grad_norm": 4.74363899230957, + "learning_rate": 4.589146224475812e-06, + "loss": 0.1941, + "step": 40234 + }, + { + "epoch": 2.14572707542051, + "grad_norm": 5.021969795227051, + "learning_rate": 4.589009181855557e-06, + "loss": 0.2285, + "step": 40235 + }, + { + "epoch": 2.145740640260445, + "grad_norm": 3.962836742401123, + "learning_rate": 4.5888721392353024e-06, + "loss": 0.2135, + "step": 40236 + }, + { + "epoch": 2.14575420510038, + "grad_norm": 4.980167388916016, + "learning_rate": 4.588735096615048e-06, + "loss": 0.1336, + "step": 40237 + }, + { + "epoch": 2.145767769940315, + "grad_norm": 6.542621612548828, + "learning_rate": 4.588598053994793e-06, + "loss": 0.1949, + "step": 40238 + }, + { + "epoch": 2.1457813347802497, + "grad_norm": 5.268472194671631, + "learning_rate": 4.588461011374538e-06, + "loss": 0.2423, + "step": 40239 + }, + { + "epoch": 2.1457948996201845, + "grad_norm": 4.294902324676514, + "learning_rate": 4.588323968754283e-06, + "loss": 0.1406, + "step": 40240 + }, + { + "epoch": 2.1458084644601194, + "grad_norm": 4.998803615570068, + "learning_rate": 4.5881869261340275e-06, + "loss": 0.2196, + "step": 40241 + }, + { + "epoch": 2.1458220293000543, + "grad_norm": 4.2767181396484375, + "learning_rate": 4.5880498835137735e-06, + "loss": 0.2328, + "step": 40242 + }, + { + "epoch": 2.145835594139989, + "grad_norm": 3.8817386627197266, + "learning_rate": 4.587912840893518e-06, + "loss": 0.2014, + "step": 40243 + }, + { + "epoch": 2.145849158979924, + "grad_norm": 3.8200862407684326, + "learning_rate": 4.587775798273264e-06, + "loss": 0.1591, + "step": 40244 + }, + { + "epoch": 2.145862723819859, + "grad_norm": 3.3844571113586426, + "learning_rate": 4.587638755653008e-06, + "loss": 0.1137, + "step": 40245 + }, + { + "epoch": 2.1458762886597937, + "grad_norm": 6.282045364379883, + "learning_rate": 4.587501713032753e-06, + "loss": 0.273, + "step": 40246 + }, + { + "epoch": 2.1458898534997286, + "grad_norm": 5.169869422912598, + "learning_rate": 4.5873646704124986e-06, + "loss": 0.2407, + "step": 40247 + }, + { + "epoch": 2.1459034183396635, + "grad_norm": 5.08489990234375, + "learning_rate": 4.587227627792244e-06, + "loss": 0.164, + "step": 40248 + }, + { + "epoch": 2.1459169831795983, + "grad_norm": 3.673888683319092, + "learning_rate": 4.587090585171989e-06, + "loss": 0.1249, + "step": 40249 + }, + { + "epoch": 2.145930548019533, + "grad_norm": 4.020993709564209, + "learning_rate": 4.586953542551734e-06, + "loss": 0.1532, + "step": 40250 + }, + { + "epoch": 2.145944112859468, + "grad_norm": 5.308000087738037, + "learning_rate": 4.586816499931479e-06, + "loss": 0.2421, + "step": 40251 + }, + { + "epoch": 2.1459576776994034, + "grad_norm": 4.2820000648498535, + "learning_rate": 4.5866794573112244e-06, + "loss": 0.1242, + "step": 40252 + }, + { + "epoch": 2.1459712425393382, + "grad_norm": 5.170655727386475, + "learning_rate": 4.58654241469097e-06, + "loss": 0.174, + "step": 40253 + }, + { + "epoch": 2.145984807379273, + "grad_norm": 4.159860610961914, + "learning_rate": 4.586405372070714e-06, + "loss": 0.1873, + "step": 40254 + }, + { + "epoch": 2.145998372219208, + "grad_norm": 4.125247955322266, + "learning_rate": 4.586268329450459e-06, + "loss": 0.1196, + "step": 40255 + }, + { + "epoch": 2.146011937059143, + "grad_norm": 4.387253284454346, + "learning_rate": 4.586131286830204e-06, + "loss": 0.1609, + "step": 40256 + }, + { + "epoch": 2.1460255018990777, + "grad_norm": 4.088627338409424, + "learning_rate": 4.5859942442099495e-06, + "loss": 0.1925, + "step": 40257 + }, + { + "epoch": 2.1460390667390126, + "grad_norm": 6.2274298667907715, + "learning_rate": 4.585857201589695e-06, + "loss": 0.3274, + "step": 40258 + }, + { + "epoch": 2.1460526315789474, + "grad_norm": 4.33961820602417, + "learning_rate": 4.58572015896944e-06, + "loss": 0.2036, + "step": 40259 + }, + { + "epoch": 2.1460661964188823, + "grad_norm": 5.843738079071045, + "learning_rate": 4.585583116349185e-06, + "loss": 0.2877, + "step": 40260 + }, + { + "epoch": 2.146079761258817, + "grad_norm": 4.719614028930664, + "learning_rate": 4.58544607372893e-06, + "loss": 0.1738, + "step": 40261 + }, + { + "epoch": 2.146093326098752, + "grad_norm": 4.543216228485107, + "learning_rate": 4.585309031108675e-06, + "loss": 0.2512, + "step": 40262 + }, + { + "epoch": 2.146106890938687, + "grad_norm": 6.129180431365967, + "learning_rate": 4.5851719884884206e-06, + "loss": 0.1947, + "step": 40263 + }, + { + "epoch": 2.1461204557786218, + "grad_norm": 4.520951747894287, + "learning_rate": 4.585034945868166e-06, + "loss": 0.1073, + "step": 40264 + }, + { + "epoch": 2.1461340206185566, + "grad_norm": 5.636592864990234, + "learning_rate": 4.584897903247911e-06, + "loss": 0.2043, + "step": 40265 + }, + { + "epoch": 2.1461475854584915, + "grad_norm": 4.3474578857421875, + "learning_rate": 4.584760860627655e-06, + "loss": 0.2545, + "step": 40266 + }, + { + "epoch": 2.1461611502984264, + "grad_norm": 5.150498390197754, + "learning_rate": 4.5846238180074004e-06, + "loss": 0.2052, + "step": 40267 + }, + { + "epoch": 2.146174715138361, + "grad_norm": 5.373371124267578, + "learning_rate": 4.584486775387146e-06, + "loss": 0.2933, + "step": 40268 + }, + { + "epoch": 2.146188279978296, + "grad_norm": 4.582302570343018, + "learning_rate": 4.584349732766891e-06, + "loss": 0.2012, + "step": 40269 + }, + { + "epoch": 2.146201844818231, + "grad_norm": 5.271108150482178, + "learning_rate": 4.584212690146636e-06, + "loss": 0.194, + "step": 40270 + }, + { + "epoch": 2.146215409658166, + "grad_norm": 4.083330154418945, + "learning_rate": 4.584075647526381e-06, + "loss": 0.2055, + "step": 40271 + }, + { + "epoch": 2.146228974498101, + "grad_norm": 5.8292059898376465, + "learning_rate": 4.583938604906126e-06, + "loss": 0.2522, + "step": 40272 + }, + { + "epoch": 2.146242539338036, + "grad_norm": 7.357096195220947, + "learning_rate": 4.5838015622858715e-06, + "loss": 0.3387, + "step": 40273 + }, + { + "epoch": 2.146256104177971, + "grad_norm": 5.979825973510742, + "learning_rate": 4.583664519665617e-06, + "loss": 0.2098, + "step": 40274 + }, + { + "epoch": 2.1462696690179057, + "grad_norm": 7.299398899078369, + "learning_rate": 4.583527477045361e-06, + "loss": 0.3627, + "step": 40275 + }, + { + "epoch": 2.1462832338578406, + "grad_norm": 5.6987690925598145, + "learning_rate": 4.583390434425107e-06, + "loss": 0.2618, + "step": 40276 + }, + { + "epoch": 2.1462967986977755, + "grad_norm": 6.069398403167725, + "learning_rate": 4.583253391804851e-06, + "loss": 0.3381, + "step": 40277 + }, + { + "epoch": 2.1463103635377103, + "grad_norm": 6.468533992767334, + "learning_rate": 4.583116349184597e-06, + "loss": 0.3504, + "step": 40278 + }, + { + "epoch": 2.146323928377645, + "grad_norm": 5.023847579956055, + "learning_rate": 4.582979306564342e-06, + "loss": 0.2074, + "step": 40279 + }, + { + "epoch": 2.14633749321758, + "grad_norm": 5.503490447998047, + "learning_rate": 4.582842263944087e-06, + "loss": 0.2592, + "step": 40280 + }, + { + "epoch": 2.146351058057515, + "grad_norm": 4.417675495147705, + "learning_rate": 4.582705221323832e-06, + "loss": 0.1872, + "step": 40281 + }, + { + "epoch": 2.14636462289745, + "grad_norm": 6.789773464202881, + "learning_rate": 4.582568178703577e-06, + "loss": 0.2905, + "step": 40282 + }, + { + "epoch": 2.1463781877373846, + "grad_norm": 4.470239639282227, + "learning_rate": 4.5824311360833224e-06, + "loss": 0.2059, + "step": 40283 + }, + { + "epoch": 2.1463917525773195, + "grad_norm": 6.224654197692871, + "learning_rate": 4.582294093463067e-06, + "loss": 0.1742, + "step": 40284 + }, + { + "epoch": 2.1464053174172544, + "grad_norm": 7.917834281921387, + "learning_rate": 4.582157050842813e-06, + "loss": 0.3574, + "step": 40285 + }, + { + "epoch": 2.1464188822571892, + "grad_norm": 5.074708461761475, + "learning_rate": 4.582020008222557e-06, + "loss": 0.2227, + "step": 40286 + }, + { + "epoch": 2.146432447097124, + "grad_norm": 5.05552339553833, + "learning_rate": 4.581882965602303e-06, + "loss": 0.2043, + "step": 40287 + }, + { + "epoch": 2.146446011937059, + "grad_norm": 4.99701452255249, + "learning_rate": 4.5817459229820475e-06, + "loss": 0.2061, + "step": 40288 + }, + { + "epoch": 2.146459576776994, + "grad_norm": 5.034820079803467, + "learning_rate": 4.581608880361793e-06, + "loss": 0.1086, + "step": 40289 + }, + { + "epoch": 2.146473141616929, + "grad_norm": 3.6813178062438965, + "learning_rate": 4.581471837741538e-06, + "loss": 0.1619, + "step": 40290 + }, + { + "epoch": 2.146486706456864, + "grad_norm": 5.990126609802246, + "learning_rate": 4.581334795121283e-06, + "loss": 0.1952, + "step": 40291 + }, + { + "epoch": 2.146500271296799, + "grad_norm": 7.470614433288574, + "learning_rate": 4.581197752501028e-06, + "loss": 0.3425, + "step": 40292 + }, + { + "epoch": 2.1465138361367337, + "grad_norm": 3.68198823928833, + "learning_rate": 4.581060709880773e-06, + "loss": 0.1343, + "step": 40293 + }, + { + "epoch": 2.1465274009766686, + "grad_norm": 3.432934284210205, + "learning_rate": 4.5809236672605186e-06, + "loss": 0.1599, + "step": 40294 + }, + { + "epoch": 2.1465409658166035, + "grad_norm": 7.231902122497559, + "learning_rate": 4.580786624640263e-06, + "loss": 0.3133, + "step": 40295 + }, + { + "epoch": 2.1465545306565383, + "grad_norm": 3.5393893718719482, + "learning_rate": 4.580649582020009e-06, + "loss": 0.1409, + "step": 40296 + }, + { + "epoch": 2.146568095496473, + "grad_norm": 5.710570335388184, + "learning_rate": 4.580512539399753e-06, + "loss": 0.2041, + "step": 40297 + }, + { + "epoch": 2.146581660336408, + "grad_norm": 3.80218243598938, + "learning_rate": 4.580375496779499e-06, + "loss": 0.1088, + "step": 40298 + }, + { + "epoch": 2.146595225176343, + "grad_norm": 6.842055320739746, + "learning_rate": 4.580238454159244e-06, + "loss": 0.2445, + "step": 40299 + }, + { + "epoch": 2.146608790016278, + "grad_norm": 3.69746994972229, + "learning_rate": 4.580101411538989e-06, + "loss": 0.1562, + "step": 40300 + }, + { + "epoch": 2.1466223548562127, + "grad_norm": 3.1128923892974854, + "learning_rate": 4.579964368918734e-06, + "loss": 0.1526, + "step": 40301 + }, + { + "epoch": 2.1466359196961475, + "grad_norm": 4.5147504806518555, + "learning_rate": 4.579827326298479e-06, + "loss": 0.1696, + "step": 40302 + }, + { + "epoch": 2.1466494845360824, + "grad_norm": 7.480202674865723, + "learning_rate": 4.579690283678224e-06, + "loss": 0.2993, + "step": 40303 + }, + { + "epoch": 2.1466630493760173, + "grad_norm": 5.5618791580200195, + "learning_rate": 4.5795532410579695e-06, + "loss": 0.2225, + "step": 40304 + }, + { + "epoch": 2.146676614215952, + "grad_norm": 4.12368631362915, + "learning_rate": 4.579416198437715e-06, + "loss": 0.1256, + "step": 40305 + }, + { + "epoch": 2.146690179055887, + "grad_norm": 3.790914297103882, + "learning_rate": 4.57927915581746e-06, + "loss": 0.2815, + "step": 40306 + }, + { + "epoch": 2.146703743895822, + "grad_norm": 4.916474342346191, + "learning_rate": 4.579142113197205e-06, + "loss": 0.1932, + "step": 40307 + }, + { + "epoch": 2.1467173087357567, + "grad_norm": 7.1209187507629395, + "learning_rate": 4.579005070576949e-06, + "loss": 0.0947, + "step": 40308 + }, + { + "epoch": 2.1467308735756916, + "grad_norm": 6.514412879943848, + "learning_rate": 4.5788680279566946e-06, + "loss": 0.2056, + "step": 40309 + }, + { + "epoch": 2.146744438415627, + "grad_norm": 8.223552703857422, + "learning_rate": 4.57873098533644e-06, + "loss": 0.2945, + "step": 40310 + }, + { + "epoch": 2.1467580032555618, + "grad_norm": 3.5119447708129883, + "learning_rate": 4.578593942716185e-06, + "loss": 0.111, + "step": 40311 + }, + { + "epoch": 2.1467715680954966, + "grad_norm": 5.540496826171875, + "learning_rate": 4.57845690009593e-06, + "loss": 0.1335, + "step": 40312 + }, + { + "epoch": 2.1467851329354315, + "grad_norm": 3.665447235107422, + "learning_rate": 4.578319857475675e-06, + "loss": 0.136, + "step": 40313 + }, + { + "epoch": 2.1467986977753664, + "grad_norm": 4.140836238861084, + "learning_rate": 4.5781828148554205e-06, + "loss": 0.2072, + "step": 40314 + }, + { + "epoch": 2.1468122626153012, + "grad_norm": 6.539822578430176, + "learning_rate": 4.578045772235166e-06, + "loss": 0.299, + "step": 40315 + }, + { + "epoch": 2.146825827455236, + "grad_norm": 4.167208194732666, + "learning_rate": 4.577908729614911e-06, + "loss": 0.2365, + "step": 40316 + }, + { + "epoch": 2.146839392295171, + "grad_norm": 3.6902740001678467, + "learning_rate": 4.577771686994656e-06, + "loss": 0.1139, + "step": 40317 + }, + { + "epoch": 2.146852957135106, + "grad_norm": 2.911860227584839, + "learning_rate": 4.577634644374401e-06, + "loss": 0.105, + "step": 40318 + }, + { + "epoch": 2.1468665219750407, + "grad_norm": 3.3472392559051514, + "learning_rate": 4.577497601754146e-06, + "loss": 0.1377, + "step": 40319 + }, + { + "epoch": 2.1468800868149756, + "grad_norm": 3.2311489582061768, + "learning_rate": 4.577360559133891e-06, + "loss": 0.1226, + "step": 40320 + }, + { + "epoch": 2.1468936516549104, + "grad_norm": 4.801193714141846, + "learning_rate": 4.577223516513637e-06, + "loss": 0.1749, + "step": 40321 + }, + { + "epoch": 2.1469072164948453, + "grad_norm": 3.2489678859710693, + "learning_rate": 4.577086473893381e-06, + "loss": 0.1525, + "step": 40322 + }, + { + "epoch": 2.14692078133478, + "grad_norm": 6.215337753295898, + "learning_rate": 4.576949431273126e-06, + "loss": 0.2061, + "step": 40323 + }, + { + "epoch": 2.146934346174715, + "grad_norm": 5.1725873947143555, + "learning_rate": 4.576812388652871e-06, + "loss": 0.248, + "step": 40324 + }, + { + "epoch": 2.14694791101465, + "grad_norm": 5.071493625640869, + "learning_rate": 4.5766753460326166e-06, + "loss": 0.2685, + "step": 40325 + }, + { + "epoch": 2.1469614758545847, + "grad_norm": 4.847637176513672, + "learning_rate": 4.576538303412362e-06, + "loss": 0.15, + "step": 40326 + }, + { + "epoch": 2.1469750406945196, + "grad_norm": 4.016812801361084, + "learning_rate": 4.576401260792107e-06, + "loss": 0.1391, + "step": 40327 + }, + { + "epoch": 2.146988605534455, + "grad_norm": 5.8928985595703125, + "learning_rate": 4.576264218171852e-06, + "loss": 0.1913, + "step": 40328 + }, + { + "epoch": 2.14700217037439, + "grad_norm": 4.1098480224609375, + "learning_rate": 4.5761271755515964e-06, + "loss": 0.1497, + "step": 40329 + }, + { + "epoch": 2.1470157352143246, + "grad_norm": 4.596676349639893, + "learning_rate": 4.5759901329313425e-06, + "loss": 0.1635, + "step": 40330 + }, + { + "epoch": 2.1470293000542595, + "grad_norm": 3.5352606773376465, + "learning_rate": 4.575853090311087e-06, + "loss": 0.1668, + "step": 40331 + }, + { + "epoch": 2.1470428648941944, + "grad_norm": 4.204018592834473, + "learning_rate": 4.575716047690833e-06, + "loss": 0.1484, + "step": 40332 + }, + { + "epoch": 2.1470564297341292, + "grad_norm": 5.8288984298706055, + "learning_rate": 4.575579005070577e-06, + "loss": 0.1759, + "step": 40333 + }, + { + "epoch": 2.147069994574064, + "grad_norm": 3.5030357837677, + "learning_rate": 4.575441962450322e-06, + "loss": 0.1863, + "step": 40334 + }, + { + "epoch": 2.147083559413999, + "grad_norm": 5.587913990020752, + "learning_rate": 4.5753049198300675e-06, + "loss": 0.1608, + "step": 40335 + }, + { + "epoch": 2.147097124253934, + "grad_norm": 3.359900712966919, + "learning_rate": 4.575167877209813e-06, + "loss": 0.169, + "step": 40336 + }, + { + "epoch": 2.1471106890938687, + "grad_norm": 5.719805717468262, + "learning_rate": 4.575030834589558e-06, + "loss": 0.2179, + "step": 40337 + }, + { + "epoch": 2.1471242539338036, + "grad_norm": 5.498197555541992, + "learning_rate": 4.574893791969302e-06, + "loss": 0.1907, + "step": 40338 + }, + { + "epoch": 2.1471378187737384, + "grad_norm": 4.711484909057617, + "learning_rate": 4.574756749349048e-06, + "loss": 0.2146, + "step": 40339 + }, + { + "epoch": 2.1471513836136733, + "grad_norm": 4.2360382080078125, + "learning_rate": 4.5746197067287926e-06, + "loss": 0.2499, + "step": 40340 + }, + { + "epoch": 2.147164948453608, + "grad_norm": 3.2025630474090576, + "learning_rate": 4.574482664108539e-06, + "loss": 0.1506, + "step": 40341 + }, + { + "epoch": 2.147178513293543, + "grad_norm": 4.775314807891846, + "learning_rate": 4.574345621488283e-06, + "loss": 0.1821, + "step": 40342 + }, + { + "epoch": 2.147192078133478, + "grad_norm": 4.0407395362854, + "learning_rate": 4.574208578868028e-06, + "loss": 0.1766, + "step": 40343 + }, + { + "epoch": 2.1472056429734128, + "grad_norm": 4.4566168785095215, + "learning_rate": 4.574071536247773e-06, + "loss": 0.1638, + "step": 40344 + }, + { + "epoch": 2.1472192078133476, + "grad_norm": 6.978559970855713, + "learning_rate": 4.5739344936275185e-06, + "loss": 0.2923, + "step": 40345 + }, + { + "epoch": 2.1472327726532825, + "grad_norm": 3.7424156665802, + "learning_rate": 4.573797451007264e-06, + "loss": 0.1663, + "step": 40346 + }, + { + "epoch": 2.1472463374932174, + "grad_norm": 4.775635719299316, + "learning_rate": 4.573660408387009e-06, + "loss": 0.256, + "step": 40347 + }, + { + "epoch": 2.1472599023331527, + "grad_norm": 5.876226425170898, + "learning_rate": 4.573523365766754e-06, + "loss": 0.2224, + "step": 40348 + }, + { + "epoch": 2.1472734671730875, + "grad_norm": 3.8519599437713623, + "learning_rate": 4.573386323146498e-06, + "loss": 0.2028, + "step": 40349 + }, + { + "epoch": 2.1472870320130224, + "grad_norm": 5.211141586303711, + "learning_rate": 4.573249280526244e-06, + "loss": 0.1715, + "step": 40350 + }, + { + "epoch": 2.1473005968529573, + "grad_norm": 4.968794822692871, + "learning_rate": 4.573112237905989e-06, + "loss": 0.2305, + "step": 40351 + }, + { + "epoch": 2.147314161692892, + "grad_norm": 3.9713284969329834, + "learning_rate": 4.572975195285735e-06, + "loss": 0.207, + "step": 40352 + }, + { + "epoch": 2.147327726532827, + "grad_norm": 6.511669158935547, + "learning_rate": 4.572838152665479e-06, + "loss": 0.1146, + "step": 40353 + }, + { + "epoch": 2.147341291372762, + "grad_norm": 4.007136344909668, + "learning_rate": 4.572701110045224e-06, + "loss": 0.2201, + "step": 40354 + }, + { + "epoch": 2.1473548562126967, + "grad_norm": 3.54886531829834, + "learning_rate": 4.572564067424969e-06, + "loss": 0.1681, + "step": 40355 + }, + { + "epoch": 2.1473684210526316, + "grad_norm": 3.528944969177246, + "learning_rate": 4.572427024804715e-06, + "loss": 0.1188, + "step": 40356 + }, + { + "epoch": 2.1473819858925665, + "grad_norm": 4.32860803604126, + "learning_rate": 4.57228998218446e-06, + "loss": 0.2115, + "step": 40357 + }, + { + "epoch": 2.1473955507325013, + "grad_norm": 3.3803181648254395, + "learning_rate": 4.572152939564205e-06, + "loss": 0.1218, + "step": 40358 + }, + { + "epoch": 2.147409115572436, + "grad_norm": 3.373206615447998, + "learning_rate": 4.57201589694395e-06, + "loss": 0.2312, + "step": 40359 + }, + { + "epoch": 2.147422680412371, + "grad_norm": 4.500438213348389, + "learning_rate": 4.571878854323695e-06, + "loss": 0.2199, + "step": 40360 + }, + { + "epoch": 2.147436245252306, + "grad_norm": 5.027309894561768, + "learning_rate": 4.5717418117034405e-06, + "loss": 0.0997, + "step": 40361 + }, + { + "epoch": 2.147449810092241, + "grad_norm": 3.8232624530792236, + "learning_rate": 4.571604769083186e-06, + "loss": 0.1953, + "step": 40362 + }, + { + "epoch": 2.1474633749321757, + "grad_norm": 5.02423095703125, + "learning_rate": 4.57146772646293e-06, + "loss": 0.2703, + "step": 40363 + }, + { + "epoch": 2.1474769397721105, + "grad_norm": 4.3970818519592285, + "learning_rate": 4.571330683842675e-06, + "loss": 0.1309, + "step": 40364 + }, + { + "epoch": 2.1474905046120454, + "grad_norm": 3.4274823665618896, + "learning_rate": 4.57119364122242e-06, + "loss": 0.1472, + "step": 40365 + }, + { + "epoch": 2.1475040694519807, + "grad_norm": 3.6108810901641846, + "learning_rate": 4.5710565986021655e-06, + "loss": 0.1431, + "step": 40366 + }, + { + "epoch": 2.1475176342919156, + "grad_norm": 3.7342326641082764, + "learning_rate": 4.570919555981911e-06, + "loss": 0.1236, + "step": 40367 + }, + { + "epoch": 2.1475311991318504, + "grad_norm": 2.9203684329986572, + "learning_rate": 4.570782513361656e-06, + "loss": 0.0947, + "step": 40368 + }, + { + "epoch": 2.1475447639717853, + "grad_norm": 5.879497051239014, + "learning_rate": 4.570645470741401e-06, + "loss": 0.1948, + "step": 40369 + }, + { + "epoch": 2.14755832881172, + "grad_norm": 4.566032409667969, + "learning_rate": 4.570508428121146e-06, + "loss": 0.216, + "step": 40370 + }, + { + "epoch": 2.147571893651655, + "grad_norm": 3.9823310375213623, + "learning_rate": 4.570371385500891e-06, + "loss": 0.1821, + "step": 40371 + }, + { + "epoch": 2.14758545849159, + "grad_norm": 4.069352626800537, + "learning_rate": 4.570234342880636e-06, + "loss": 0.1495, + "step": 40372 + }, + { + "epoch": 2.1475990233315247, + "grad_norm": 3.1183483600616455, + "learning_rate": 4.570097300260382e-06, + "loss": 0.1346, + "step": 40373 + }, + { + "epoch": 2.1476125881714596, + "grad_norm": 5.161107063293457, + "learning_rate": 4.569960257640126e-06, + "loss": 0.2363, + "step": 40374 + }, + { + "epoch": 2.1476261530113945, + "grad_norm": 4.018076419830322, + "learning_rate": 4.569823215019872e-06, + "loss": 0.159, + "step": 40375 + }, + { + "epoch": 2.1476397178513293, + "grad_norm": 3.828493595123291, + "learning_rate": 4.5696861723996165e-06, + "loss": 0.139, + "step": 40376 + }, + { + "epoch": 2.147653282691264, + "grad_norm": 4.5193562507629395, + "learning_rate": 4.569549129779362e-06, + "loss": 0.1246, + "step": 40377 + }, + { + "epoch": 2.147666847531199, + "grad_norm": 3.3247580528259277, + "learning_rate": 4.569412087159107e-06, + "loss": 0.1045, + "step": 40378 + }, + { + "epoch": 2.147680412371134, + "grad_norm": 4.0106730461120605, + "learning_rate": 4.569275044538852e-06, + "loss": 0.2119, + "step": 40379 + }, + { + "epoch": 2.147693977211069, + "grad_norm": 4.184478282928467, + "learning_rate": 4.569138001918597e-06, + "loss": 0.1571, + "step": 40380 + }, + { + "epoch": 2.1477075420510037, + "grad_norm": 3.7628538608551025, + "learning_rate": 4.569000959298342e-06, + "loss": 0.185, + "step": 40381 + }, + { + "epoch": 2.1477211068909385, + "grad_norm": 3.6492738723754883, + "learning_rate": 4.5688639166780875e-06, + "loss": 0.1933, + "step": 40382 + }, + { + "epoch": 2.1477346717308734, + "grad_norm": 3.9408819675445557, + "learning_rate": 4.568726874057832e-06, + "loss": 0.23, + "step": 40383 + }, + { + "epoch": 2.1477482365708083, + "grad_norm": 3.9938840866088867, + "learning_rate": 4.568589831437578e-06, + "loss": 0.1381, + "step": 40384 + }, + { + "epoch": 2.147761801410743, + "grad_norm": 3.5989432334899902, + "learning_rate": 4.568452788817322e-06, + "loss": 0.1773, + "step": 40385 + }, + { + "epoch": 2.1477753662506784, + "grad_norm": 5.492683410644531, + "learning_rate": 4.568315746197068e-06, + "loss": 0.18, + "step": 40386 + }, + { + "epoch": 2.1477889310906133, + "grad_norm": 4.184011459350586, + "learning_rate": 4.568178703576813e-06, + "loss": 0.1708, + "step": 40387 + }, + { + "epoch": 2.147802495930548, + "grad_norm": 5.079833507537842, + "learning_rate": 4.568041660956558e-06, + "loss": 0.09, + "step": 40388 + }, + { + "epoch": 2.147816060770483, + "grad_norm": 6.139595985412598, + "learning_rate": 4.567904618336303e-06, + "loss": 0.2618, + "step": 40389 + }, + { + "epoch": 2.147829625610418, + "grad_norm": 3.759161949157715, + "learning_rate": 4.567767575716048e-06, + "loss": 0.135, + "step": 40390 + }, + { + "epoch": 2.1478431904503528, + "grad_norm": 4.392947673797607, + "learning_rate": 4.567630533095793e-06, + "loss": 0.1382, + "step": 40391 + }, + { + "epoch": 2.1478567552902876, + "grad_norm": 3.6807053089141846, + "learning_rate": 4.567493490475538e-06, + "loss": 0.1649, + "step": 40392 + }, + { + "epoch": 2.1478703201302225, + "grad_norm": 4.54279899597168, + "learning_rate": 4.567356447855284e-06, + "loss": 0.2718, + "step": 40393 + }, + { + "epoch": 2.1478838849701574, + "grad_norm": 4.210698127746582, + "learning_rate": 4.567219405235028e-06, + "loss": 0.1627, + "step": 40394 + }, + { + "epoch": 2.1478974498100922, + "grad_norm": 4.802218437194824, + "learning_rate": 4.567082362614774e-06, + "loss": 0.2155, + "step": 40395 + }, + { + "epoch": 2.147911014650027, + "grad_norm": 3.5035336017608643, + "learning_rate": 4.566945319994518e-06, + "loss": 0.1304, + "step": 40396 + }, + { + "epoch": 2.147924579489962, + "grad_norm": 4.471922397613525, + "learning_rate": 4.5668082773742635e-06, + "loss": 0.2086, + "step": 40397 + }, + { + "epoch": 2.147938144329897, + "grad_norm": 3.3682303428649902, + "learning_rate": 4.566671234754009e-06, + "loss": 0.198, + "step": 40398 + }, + { + "epoch": 2.1479517091698317, + "grad_norm": 4.394533157348633, + "learning_rate": 4.566534192133754e-06, + "loss": 0.1052, + "step": 40399 + }, + { + "epoch": 2.1479652740097666, + "grad_norm": 4.734092712402344, + "learning_rate": 4.566397149513499e-06, + "loss": 0.1691, + "step": 40400 + }, + { + "epoch": 2.1479788388497014, + "grad_norm": 5.961100101470947, + "learning_rate": 4.566260106893244e-06, + "loss": 0.2458, + "step": 40401 + }, + { + "epoch": 2.1479924036896363, + "grad_norm": 2.966665029525757, + "learning_rate": 4.566123064272989e-06, + "loss": 0.0973, + "step": 40402 + }, + { + "epoch": 2.148005968529571, + "grad_norm": 3.633112907409668, + "learning_rate": 4.565986021652735e-06, + "loss": 0.1094, + "step": 40403 + }, + { + "epoch": 2.1480195333695065, + "grad_norm": 3.066396951675415, + "learning_rate": 4.56584897903248e-06, + "loss": 0.0961, + "step": 40404 + }, + { + "epoch": 2.1480330982094413, + "grad_norm": 3.243105411529541, + "learning_rate": 4.565711936412224e-06, + "loss": 0.1241, + "step": 40405 + }, + { + "epoch": 2.148046663049376, + "grad_norm": 4.347404479980469, + "learning_rate": 4.56557489379197e-06, + "loss": 0.1544, + "step": 40406 + }, + { + "epoch": 2.148060227889311, + "grad_norm": 4.353132247924805, + "learning_rate": 4.5654378511717145e-06, + "loss": 0.2186, + "step": 40407 + }, + { + "epoch": 2.148073792729246, + "grad_norm": 6.276595592498779, + "learning_rate": 4.56530080855146e-06, + "loss": 0.1829, + "step": 40408 + }, + { + "epoch": 2.148087357569181, + "grad_norm": 4.734505653381348, + "learning_rate": 4.565163765931205e-06, + "loss": 0.2064, + "step": 40409 + }, + { + "epoch": 2.1481009224091157, + "grad_norm": 4.890718936920166, + "learning_rate": 4.56502672331095e-06, + "loss": 0.2508, + "step": 40410 + }, + { + "epoch": 2.1481144872490505, + "grad_norm": 6.037501811981201, + "learning_rate": 4.564889680690695e-06, + "loss": 0.2142, + "step": 40411 + }, + { + "epoch": 2.1481280520889854, + "grad_norm": 3.295520782470703, + "learning_rate": 4.56475263807044e-06, + "loss": 0.2155, + "step": 40412 + }, + { + "epoch": 2.1481416169289203, + "grad_norm": 2.681947708129883, + "learning_rate": 4.5646155954501855e-06, + "loss": 0.1208, + "step": 40413 + }, + { + "epoch": 2.148155181768855, + "grad_norm": 3.321211338043213, + "learning_rate": 4.564478552829931e-06, + "loss": 0.0946, + "step": 40414 + }, + { + "epoch": 2.14816874660879, + "grad_norm": 3.7536864280700684, + "learning_rate": 4.564341510209676e-06, + "loss": 0.1441, + "step": 40415 + }, + { + "epoch": 2.148182311448725, + "grad_norm": 5.829695224761963, + "learning_rate": 4.564204467589421e-06, + "loss": 0.181, + "step": 40416 + }, + { + "epoch": 2.1481958762886597, + "grad_norm": 3.9023663997650146, + "learning_rate": 4.564067424969165e-06, + "loss": 0.1464, + "step": 40417 + }, + { + "epoch": 2.1482094411285946, + "grad_norm": 3.7342214584350586, + "learning_rate": 4.563930382348911e-06, + "loss": 0.1861, + "step": 40418 + }, + { + "epoch": 2.1482230059685294, + "grad_norm": 5.411476135253906, + "learning_rate": 4.563793339728656e-06, + "loss": 0.1349, + "step": 40419 + }, + { + "epoch": 2.1482365708084643, + "grad_norm": 5.463492393493652, + "learning_rate": 4.563656297108401e-06, + "loss": 0.2027, + "step": 40420 + }, + { + "epoch": 2.148250135648399, + "grad_norm": 3.874701499938965, + "learning_rate": 4.563519254488146e-06, + "loss": 0.1934, + "step": 40421 + }, + { + "epoch": 2.148263700488334, + "grad_norm": 4.288271427154541, + "learning_rate": 4.563382211867891e-06, + "loss": 0.1423, + "step": 40422 + }, + { + "epoch": 2.148277265328269, + "grad_norm": 3.2902679443359375, + "learning_rate": 4.5632451692476365e-06, + "loss": 0.1194, + "step": 40423 + }, + { + "epoch": 2.148290830168204, + "grad_norm": 5.437423229217529, + "learning_rate": 4.563108126627382e-06, + "loss": 0.2484, + "step": 40424 + }, + { + "epoch": 2.148304395008139, + "grad_norm": 5.402929782867432, + "learning_rate": 4.562971084007127e-06, + "loss": 0.2018, + "step": 40425 + }, + { + "epoch": 2.148317959848074, + "grad_norm": 3.2882297039031982, + "learning_rate": 4.562834041386871e-06, + "loss": 0.1462, + "step": 40426 + }, + { + "epoch": 2.148331524688009, + "grad_norm": 3.618407964706421, + "learning_rate": 4.562696998766617e-06, + "loss": 0.1543, + "step": 40427 + }, + { + "epoch": 2.1483450895279437, + "grad_norm": 4.914290428161621, + "learning_rate": 4.5625599561463615e-06, + "loss": 0.1756, + "step": 40428 + }, + { + "epoch": 2.1483586543678785, + "grad_norm": 4.232460021972656, + "learning_rate": 4.5624229135261076e-06, + "loss": 0.187, + "step": 40429 + }, + { + "epoch": 2.1483722192078134, + "grad_norm": 3.9134998321533203, + "learning_rate": 4.562285870905852e-06, + "loss": 0.1661, + "step": 40430 + }, + { + "epoch": 2.1483857840477483, + "grad_norm": 5.816549777984619, + "learning_rate": 4.562148828285597e-06, + "loss": 0.1897, + "step": 40431 + }, + { + "epoch": 2.148399348887683, + "grad_norm": 3.0687646865844727, + "learning_rate": 4.562011785665342e-06, + "loss": 0.0887, + "step": 40432 + }, + { + "epoch": 2.148412913727618, + "grad_norm": 3.835254430770874, + "learning_rate": 4.5618747430450874e-06, + "loss": 0.187, + "step": 40433 + }, + { + "epoch": 2.148426478567553, + "grad_norm": 5.491872787475586, + "learning_rate": 4.561737700424833e-06, + "loss": 0.2441, + "step": 40434 + }, + { + "epoch": 2.1484400434074877, + "grad_norm": 7.2975263595581055, + "learning_rate": 4.561600657804578e-06, + "loss": 0.271, + "step": 40435 + }, + { + "epoch": 2.1484536082474226, + "grad_norm": 5.2949981689453125, + "learning_rate": 4.561463615184323e-06, + "loss": 0.2502, + "step": 40436 + }, + { + "epoch": 2.1484671730873575, + "grad_norm": 4.3383097648620605, + "learning_rate": 4.561326572564067e-06, + "loss": 0.122, + "step": 40437 + }, + { + "epoch": 2.1484807379272923, + "grad_norm": 5.480558395385742, + "learning_rate": 4.561189529943813e-06, + "loss": 0.1876, + "step": 40438 + }, + { + "epoch": 2.148494302767227, + "grad_norm": 4.194157123565674, + "learning_rate": 4.561052487323558e-06, + "loss": 0.1535, + "step": 40439 + }, + { + "epoch": 2.148507867607162, + "grad_norm": 4.891797065734863, + "learning_rate": 4.560915444703304e-06, + "loss": 0.2215, + "step": 40440 + }, + { + "epoch": 2.148521432447097, + "grad_norm": 3.0875542163848877, + "learning_rate": 4.560778402083048e-06, + "loss": 0.1249, + "step": 40441 + }, + { + "epoch": 2.1485349972870322, + "grad_norm": 4.499641418457031, + "learning_rate": 4.560641359462793e-06, + "loss": 0.2645, + "step": 40442 + }, + { + "epoch": 2.148548562126967, + "grad_norm": 5.063549518585205, + "learning_rate": 4.560504316842538e-06, + "loss": 0.2405, + "step": 40443 + }, + { + "epoch": 2.148562126966902, + "grad_norm": 4.8537068367004395, + "learning_rate": 4.5603672742222835e-06, + "loss": 0.1741, + "step": 40444 + }, + { + "epoch": 2.148575691806837, + "grad_norm": 4.120789527893066, + "learning_rate": 4.560230231602029e-06, + "loss": 0.259, + "step": 40445 + }, + { + "epoch": 2.1485892566467717, + "grad_norm": 8.412773132324219, + "learning_rate": 4.560093188981773e-06, + "loss": 0.2669, + "step": 40446 + }, + { + "epoch": 2.1486028214867066, + "grad_norm": 5.727381706237793, + "learning_rate": 4.559956146361519e-06, + "loss": 0.2398, + "step": 40447 + }, + { + "epoch": 2.1486163863266414, + "grad_norm": 4.790435314178467, + "learning_rate": 4.559819103741263e-06, + "loss": 0.1891, + "step": 40448 + }, + { + "epoch": 2.1486299511665763, + "grad_norm": 3.59016489982605, + "learning_rate": 4.5596820611210094e-06, + "loss": 0.1663, + "step": 40449 + }, + { + "epoch": 2.148643516006511, + "grad_norm": 3.986487865447998, + "learning_rate": 4.559545018500754e-06, + "loss": 0.1838, + "step": 40450 + }, + { + "epoch": 2.148657080846446, + "grad_norm": 3.159581184387207, + "learning_rate": 4.559407975880499e-06, + "loss": 0.1056, + "step": 40451 + }, + { + "epoch": 2.148670645686381, + "grad_norm": 4.042753219604492, + "learning_rate": 4.559270933260244e-06, + "loss": 0.1834, + "step": 40452 + }, + { + "epoch": 2.1486842105263158, + "grad_norm": 5.141642093658447, + "learning_rate": 4.559133890639989e-06, + "loss": 0.2906, + "step": 40453 + }, + { + "epoch": 2.1486977753662506, + "grad_norm": 5.328213214874268, + "learning_rate": 4.5589968480197345e-06, + "loss": 0.2257, + "step": 40454 + }, + { + "epoch": 2.1487113402061855, + "grad_norm": 4.622819423675537, + "learning_rate": 4.55885980539948e-06, + "loss": 0.2314, + "step": 40455 + }, + { + "epoch": 2.1487249050461203, + "grad_norm": 4.102459907531738, + "learning_rate": 4.558722762779225e-06, + "loss": 0.1577, + "step": 40456 + }, + { + "epoch": 2.148738469886055, + "grad_norm": 5.386858940124512, + "learning_rate": 4.55858572015897e-06, + "loss": 0.1895, + "step": 40457 + }, + { + "epoch": 2.14875203472599, + "grad_norm": 3.660352945327759, + "learning_rate": 4.558448677538715e-06, + "loss": 0.1359, + "step": 40458 + }, + { + "epoch": 2.148765599565925, + "grad_norm": 4.273765563964844, + "learning_rate": 4.5583116349184595e-06, + "loss": 0.2361, + "step": 40459 + }, + { + "epoch": 2.14877916440586, + "grad_norm": 5.596372604370117, + "learning_rate": 4.5581745922982056e-06, + "loss": 0.1959, + "step": 40460 + }, + { + "epoch": 2.1487927292457947, + "grad_norm": 4.852372646331787, + "learning_rate": 4.55803754967795e-06, + "loss": 0.1904, + "step": 40461 + }, + { + "epoch": 2.14880629408573, + "grad_norm": 4.934178352355957, + "learning_rate": 4.557900507057695e-06, + "loss": 0.193, + "step": 40462 + }, + { + "epoch": 2.148819858925665, + "grad_norm": 3.1638057231903076, + "learning_rate": 4.55776346443744e-06, + "loss": 0.1109, + "step": 40463 + }, + { + "epoch": 2.1488334237655997, + "grad_norm": 4.9694366455078125, + "learning_rate": 4.5576264218171854e-06, + "loss": 0.2094, + "step": 40464 + }, + { + "epoch": 2.1488469886055346, + "grad_norm": 3.7360925674438477, + "learning_rate": 4.557489379196931e-06, + "loss": 0.148, + "step": 40465 + }, + { + "epoch": 2.1488605534454694, + "grad_norm": 2.6196064949035645, + "learning_rate": 4.557352336576676e-06, + "loss": 0.079, + "step": 40466 + }, + { + "epoch": 2.1488741182854043, + "grad_norm": 6.013394355773926, + "learning_rate": 4.557215293956421e-06, + "loss": 0.235, + "step": 40467 + }, + { + "epoch": 2.148887683125339, + "grad_norm": 4.508981227874756, + "learning_rate": 4.557078251336166e-06, + "loss": 0.1987, + "step": 40468 + }, + { + "epoch": 2.148901247965274, + "grad_norm": 4.543949127197266, + "learning_rate": 4.556941208715911e-06, + "loss": 0.1451, + "step": 40469 + }, + { + "epoch": 2.148914812805209, + "grad_norm": 5.509922027587891, + "learning_rate": 4.5568041660956565e-06, + "loss": 0.2315, + "step": 40470 + }, + { + "epoch": 2.1489283776451438, + "grad_norm": 4.834109783172607, + "learning_rate": 4.556667123475401e-06, + "loss": 0.1909, + "step": 40471 + }, + { + "epoch": 2.1489419424850786, + "grad_norm": 4.1671905517578125, + "learning_rate": 4.556530080855147e-06, + "loss": 0.1507, + "step": 40472 + }, + { + "epoch": 2.1489555073250135, + "grad_norm": 4.657285690307617, + "learning_rate": 4.556393038234891e-06, + "loss": 0.1769, + "step": 40473 + }, + { + "epoch": 2.1489690721649484, + "grad_norm": 4.770646572113037, + "learning_rate": 4.556255995614636e-06, + "loss": 0.2038, + "step": 40474 + }, + { + "epoch": 2.1489826370048832, + "grad_norm": 4.804791450500488, + "learning_rate": 4.5561189529943815e-06, + "loss": 0.1666, + "step": 40475 + }, + { + "epoch": 2.148996201844818, + "grad_norm": 5.174749851226807, + "learning_rate": 4.555981910374127e-06, + "loss": 0.1474, + "step": 40476 + }, + { + "epoch": 2.149009766684753, + "grad_norm": 5.351390838623047, + "learning_rate": 4.555844867753872e-06, + "loss": 0.2512, + "step": 40477 + }, + { + "epoch": 2.149023331524688, + "grad_norm": 2.9207231998443604, + "learning_rate": 4.555707825133617e-06, + "loss": 0.0636, + "step": 40478 + }, + { + "epoch": 2.1490368963646227, + "grad_norm": 3.0313894748687744, + "learning_rate": 4.555570782513362e-06, + "loss": 0.1207, + "step": 40479 + }, + { + "epoch": 2.149050461204558, + "grad_norm": 6.7957539558410645, + "learning_rate": 4.555433739893107e-06, + "loss": 0.2177, + "step": 40480 + }, + { + "epoch": 2.149064026044493, + "grad_norm": 2.3970510959625244, + "learning_rate": 4.555296697272853e-06, + "loss": 0.1037, + "step": 40481 + }, + { + "epoch": 2.1490775908844277, + "grad_norm": 3.0417380332946777, + "learning_rate": 4.555159654652597e-06, + "loss": 0.103, + "step": 40482 + }, + { + "epoch": 2.1490911557243626, + "grad_norm": 3.7783305644989014, + "learning_rate": 4.555022612032343e-06, + "loss": 0.1383, + "step": 40483 + }, + { + "epoch": 2.1491047205642975, + "grad_norm": 4.251687049865723, + "learning_rate": 4.554885569412087e-06, + "loss": 0.1362, + "step": 40484 + }, + { + "epoch": 2.1491182854042323, + "grad_norm": 3.6368916034698486, + "learning_rate": 4.5547485267918325e-06, + "loss": 0.0871, + "step": 40485 + }, + { + "epoch": 2.149131850244167, + "grad_norm": 5.66396427154541, + "learning_rate": 4.554611484171578e-06, + "loss": 0.1886, + "step": 40486 + }, + { + "epoch": 2.149145415084102, + "grad_norm": 4.454671382904053, + "learning_rate": 4.554474441551323e-06, + "loss": 0.1611, + "step": 40487 + }, + { + "epoch": 2.149158979924037, + "grad_norm": 3.7798421382904053, + "learning_rate": 4.554337398931068e-06, + "loss": 0.1401, + "step": 40488 + }, + { + "epoch": 2.149172544763972, + "grad_norm": 3.2648489475250244, + "learning_rate": 4.554200356310813e-06, + "loss": 0.106, + "step": 40489 + }, + { + "epoch": 2.1491861096039067, + "grad_norm": 6.750789642333984, + "learning_rate": 4.554063313690558e-06, + "loss": 0.3695, + "step": 40490 + }, + { + "epoch": 2.1491996744438415, + "grad_norm": 4.005945682525635, + "learning_rate": 4.553926271070303e-06, + "loss": 0.1888, + "step": 40491 + }, + { + "epoch": 2.1492132392837764, + "grad_norm": 3.878293037414551, + "learning_rate": 4.553789228450049e-06, + "loss": 0.1476, + "step": 40492 + }, + { + "epoch": 2.1492268041237113, + "grad_norm": 4.113619327545166, + "learning_rate": 4.553652185829793e-06, + "loss": 0.1499, + "step": 40493 + }, + { + "epoch": 2.149240368963646, + "grad_norm": 4.508474826812744, + "learning_rate": 4.553515143209539e-06, + "loss": 0.2063, + "step": 40494 + }, + { + "epoch": 2.149253933803581, + "grad_norm": 3.5899744033813477, + "learning_rate": 4.5533781005892834e-06, + "loss": 0.1376, + "step": 40495 + }, + { + "epoch": 2.149267498643516, + "grad_norm": 4.097682952880859, + "learning_rate": 4.553241057969029e-06, + "loss": 0.0968, + "step": 40496 + }, + { + "epoch": 2.1492810634834507, + "grad_norm": 3.958045244216919, + "learning_rate": 4.553104015348774e-06, + "loss": 0.1611, + "step": 40497 + }, + { + "epoch": 2.1492946283233856, + "grad_norm": 2.594053030014038, + "learning_rate": 4.552966972728519e-06, + "loss": 0.0779, + "step": 40498 + }, + { + "epoch": 2.1493081931633204, + "grad_norm": 5.361476421356201, + "learning_rate": 4.552829930108264e-06, + "loss": 0.1583, + "step": 40499 + }, + { + "epoch": 2.1493217580032558, + "grad_norm": 4.752254009246826, + "learning_rate": 4.552692887488009e-06, + "loss": 0.1222, + "step": 40500 + }, + { + "epoch": 2.1493353228431906, + "grad_norm": 3.4542784690856934, + "learning_rate": 4.5525558448677545e-06, + "loss": 0.1194, + "step": 40501 + }, + { + "epoch": 2.1493488876831255, + "grad_norm": 2.8096370697021484, + "learning_rate": 4.552418802247499e-06, + "loss": 0.1166, + "step": 40502 + }, + { + "epoch": 2.1493624525230604, + "grad_norm": 3.0899498462677, + "learning_rate": 4.552281759627245e-06, + "loss": 0.1157, + "step": 40503 + }, + { + "epoch": 2.149376017362995, + "grad_norm": 3.137025833129883, + "learning_rate": 4.552144717006989e-06, + "loss": 0.1012, + "step": 40504 + }, + { + "epoch": 2.14938958220293, + "grad_norm": 2.8137309551239014, + "learning_rate": 4.552007674386734e-06, + "loss": 0.1302, + "step": 40505 + }, + { + "epoch": 2.149403147042865, + "grad_norm": 3.339921236038208, + "learning_rate": 4.5518706317664796e-06, + "loss": 0.0723, + "step": 40506 + }, + { + "epoch": 2.1494167118828, + "grad_norm": 3.5877156257629395, + "learning_rate": 4.551733589146225e-06, + "loss": 0.0952, + "step": 40507 + }, + { + "epoch": 2.1494302767227347, + "grad_norm": 4.089829444885254, + "learning_rate": 4.55159654652597e-06, + "loss": 0.1178, + "step": 40508 + }, + { + "epoch": 2.1494438415626695, + "grad_norm": 3.2787885665893555, + "learning_rate": 4.551459503905715e-06, + "loss": 0.2001, + "step": 40509 + }, + { + "epoch": 2.1494574064026044, + "grad_norm": 5.776824474334717, + "learning_rate": 4.55132246128546e-06, + "loss": 0.2349, + "step": 40510 + }, + { + "epoch": 2.1494709712425393, + "grad_norm": 2.6141068935394287, + "learning_rate": 4.5511854186652054e-06, + "loss": 0.0822, + "step": 40511 + }, + { + "epoch": 2.149484536082474, + "grad_norm": 3.4526705741882324, + "learning_rate": 4.551048376044951e-06, + "loss": 0.1014, + "step": 40512 + }, + { + "epoch": 2.149498100922409, + "grad_norm": 5.195729732513428, + "learning_rate": 4.550911333424696e-06, + "loss": 0.1307, + "step": 40513 + }, + { + "epoch": 2.149511665762344, + "grad_norm": 3.29813289642334, + "learning_rate": 4.55077429080444e-06, + "loss": 0.1105, + "step": 40514 + }, + { + "epoch": 2.1495252306022787, + "grad_norm": 4.76573371887207, + "learning_rate": 4.550637248184185e-06, + "loss": 0.2152, + "step": 40515 + }, + { + "epoch": 2.1495387954422136, + "grad_norm": 4.062479496002197, + "learning_rate": 4.5505002055639305e-06, + "loss": 0.1073, + "step": 40516 + }, + { + "epoch": 2.1495523602821485, + "grad_norm": 3.70340895652771, + "learning_rate": 4.550363162943676e-06, + "loss": 0.1429, + "step": 40517 + }, + { + "epoch": 2.1495659251220838, + "grad_norm": 3.8363847732543945, + "learning_rate": 4.550226120323421e-06, + "loss": 0.1259, + "step": 40518 + }, + { + "epoch": 2.1495794899620186, + "grad_norm": 4.33798360824585, + "learning_rate": 4.550089077703166e-06, + "loss": 0.1817, + "step": 40519 + }, + { + "epoch": 2.1495930548019535, + "grad_norm": 2.4906747341156006, + "learning_rate": 4.549952035082911e-06, + "loss": 0.0774, + "step": 40520 + }, + { + "epoch": 2.1496066196418884, + "grad_norm": 4.829166889190674, + "learning_rate": 4.549814992462656e-06, + "loss": 0.1494, + "step": 40521 + }, + { + "epoch": 2.1496201844818232, + "grad_norm": 4.2895426750183105, + "learning_rate": 4.5496779498424016e-06, + "loss": 0.0998, + "step": 40522 + }, + { + "epoch": 2.149633749321758, + "grad_norm": 3.629338502883911, + "learning_rate": 4.549540907222147e-06, + "loss": 0.1145, + "step": 40523 + }, + { + "epoch": 2.149647314161693, + "grad_norm": 3.5944950580596924, + "learning_rate": 4.549403864601892e-06, + "loss": 0.1185, + "step": 40524 + }, + { + "epoch": 2.149660879001628, + "grad_norm": 5.8092780113220215, + "learning_rate": 4.549266821981636e-06, + "loss": 0.1798, + "step": 40525 + }, + { + "epoch": 2.1496744438415627, + "grad_norm": 3.652618885040283, + "learning_rate": 4.549129779361382e-06, + "loss": 0.1119, + "step": 40526 + }, + { + "epoch": 2.1496880086814976, + "grad_norm": 3.9362645149230957, + "learning_rate": 4.548992736741127e-06, + "loss": 0.0761, + "step": 40527 + }, + { + "epoch": 2.1497015735214324, + "grad_norm": 4.483892440795898, + "learning_rate": 4.548855694120872e-06, + "loss": 0.1705, + "step": 40528 + }, + { + "epoch": 2.1497151383613673, + "grad_norm": 4.097272872924805, + "learning_rate": 4.548718651500617e-06, + "loss": 0.1665, + "step": 40529 + }, + { + "epoch": 2.149728703201302, + "grad_norm": 2.998342275619507, + "learning_rate": 4.548581608880362e-06, + "loss": 0.0986, + "step": 40530 + }, + { + "epoch": 2.149742268041237, + "grad_norm": 7.389275074005127, + "learning_rate": 4.548444566260107e-06, + "loss": 0.2448, + "step": 40531 + }, + { + "epoch": 2.149755832881172, + "grad_norm": 3.563066005706787, + "learning_rate": 4.5483075236398525e-06, + "loss": 0.0824, + "step": 40532 + }, + { + "epoch": 2.1497693977211068, + "grad_norm": 3.7585256099700928, + "learning_rate": 4.548170481019598e-06, + "loss": 0.1431, + "step": 40533 + }, + { + "epoch": 2.1497829625610416, + "grad_norm": 5.320085048675537, + "learning_rate": 4.548033438399342e-06, + "loss": 0.2229, + "step": 40534 + }, + { + "epoch": 2.1497965274009765, + "grad_norm": 4.9874725341796875, + "learning_rate": 4.547896395779088e-06, + "loss": 0.1179, + "step": 40535 + }, + { + "epoch": 2.1498100922409114, + "grad_norm": 2.4746487140655518, + "learning_rate": 4.547759353158832e-06, + "loss": 0.0778, + "step": 40536 + }, + { + "epoch": 2.149823657080846, + "grad_norm": 4.145967960357666, + "learning_rate": 4.547622310538578e-06, + "loss": 0.1041, + "step": 40537 + }, + { + "epoch": 2.1498372219207815, + "grad_norm": 5.140527248382568, + "learning_rate": 4.547485267918323e-06, + "loss": 0.2125, + "step": 40538 + }, + { + "epoch": 2.1498507867607164, + "grad_norm": 5.6620635986328125, + "learning_rate": 4.547348225298068e-06, + "loss": 0.1723, + "step": 40539 + }, + { + "epoch": 2.1498643516006513, + "grad_norm": 6.035185813903809, + "learning_rate": 4.547211182677813e-06, + "loss": 0.2112, + "step": 40540 + }, + { + "epoch": 2.149877916440586, + "grad_norm": 4.2369279861450195, + "learning_rate": 4.547074140057558e-06, + "loss": 0.1274, + "step": 40541 + }, + { + "epoch": 2.149891481280521, + "grad_norm": 4.439462184906006, + "learning_rate": 4.5469370974373035e-06, + "loss": 0.1271, + "step": 40542 + }, + { + "epoch": 2.149905046120456, + "grad_norm": 3.4137861728668213, + "learning_rate": 4.546800054817049e-06, + "loss": 0.1158, + "step": 40543 + }, + { + "epoch": 2.1499186109603907, + "grad_norm": 4.175454139709473, + "learning_rate": 4.546663012196794e-06, + "loss": 0.1244, + "step": 40544 + }, + { + "epoch": 2.1499321758003256, + "grad_norm": 4.50094747543335, + "learning_rate": 4.546525969576538e-06, + "loss": 0.126, + "step": 40545 + }, + { + "epoch": 2.1499457406402604, + "grad_norm": 6.250092506408691, + "learning_rate": 4.546388926956284e-06, + "loss": 0.1785, + "step": 40546 + }, + { + "epoch": 2.1499457406402604, + "eval_loss": 0.3351876437664032, + "eval_noise_accuracy": NaN, + "eval_runtime": 4546.5423, + "eval_samples_per_second": 1.105, + "eval_steps_per_second": 0.069, + "eval_wer": 26.237990794229404, + "step": 40546 + }, + { + "epoch": 2.1499593054801953, + "grad_norm": 4.005853176116943, + "learning_rate": 4.5462518843360285e-06, + "loss": 0.1308, + "step": 40547 + }, + { + "epoch": 2.14997287032013, + "grad_norm": 5.485969543457031, + "learning_rate": 4.5461148417157745e-06, + "loss": 0.2111, + "step": 40548 + }, + { + "epoch": 2.149986435160065, + "grad_norm": 4.321046829223633, + "learning_rate": 4.545977799095519e-06, + "loss": 0.2098, + "step": 40549 + }, + { + "epoch": 2.15, + "grad_norm": 5.399173736572266, + "learning_rate": 4.545840756475264e-06, + "loss": 0.117, + "step": 40550 + }, + { + "epoch": 2.1500135648399348, + "grad_norm": 5.331283092498779, + "learning_rate": 4.545703713855009e-06, + "loss": 0.2107, + "step": 40551 + }, + { + "epoch": 2.1500271296798696, + "grad_norm": 3.739563226699829, + "learning_rate": 4.545566671234754e-06, + "loss": 0.1369, + "step": 40552 + }, + { + "epoch": 2.1500406945198045, + "grad_norm": 6.1038947105407715, + "learning_rate": 4.5454296286144996e-06, + "loss": 0.2168, + "step": 40553 + }, + { + "epoch": 2.1500542593597394, + "grad_norm": 5.219608306884766, + "learning_rate": 4.545292585994245e-06, + "loss": 0.1862, + "step": 40554 + }, + { + "epoch": 2.1500678241996742, + "grad_norm": 5.802587509155273, + "learning_rate": 4.54515554337399e-06, + "loss": 0.1714, + "step": 40555 + }, + { + "epoch": 2.1500813890396095, + "grad_norm": 5.097209453582764, + "learning_rate": 4.545018500753734e-06, + "loss": 0.1194, + "step": 40556 + }, + { + "epoch": 2.1500949538795444, + "grad_norm": 3.232684373855591, + "learning_rate": 4.54488145813348e-06, + "loss": 0.0925, + "step": 40557 + }, + { + "epoch": 2.1501085187194793, + "grad_norm": 4.147652626037598, + "learning_rate": 4.544744415513225e-06, + "loss": 0.167, + "step": 40558 + }, + { + "epoch": 2.150122083559414, + "grad_norm": 5.481393814086914, + "learning_rate": 4.54460737289297e-06, + "loss": 0.1562, + "step": 40559 + }, + { + "epoch": 2.150135648399349, + "grad_norm": 4.496026515960693, + "learning_rate": 4.544470330272715e-06, + "loss": 0.1239, + "step": 40560 + }, + { + "epoch": 2.150149213239284, + "grad_norm": 3.1933088302612305, + "learning_rate": 4.54433328765246e-06, + "loss": 0.0913, + "step": 40561 + }, + { + "epoch": 2.1501627780792187, + "grad_norm": 4.596214771270752, + "learning_rate": 4.544196245032205e-06, + "loss": 0.1274, + "step": 40562 + }, + { + "epoch": 2.1501763429191536, + "grad_norm": 4.186740875244141, + "learning_rate": 4.5440592024119505e-06, + "loss": 0.1129, + "step": 40563 + }, + { + "epoch": 2.1501899077590885, + "grad_norm": 5.173599720001221, + "learning_rate": 4.543922159791696e-06, + "loss": 0.1661, + "step": 40564 + }, + { + "epoch": 2.1502034725990233, + "grad_norm": 3.7281012535095215, + "learning_rate": 4.543785117171441e-06, + "loss": 0.1564, + "step": 40565 + }, + { + "epoch": 2.150217037438958, + "grad_norm": 7.899670600891113, + "learning_rate": 4.543648074551186e-06, + "loss": 0.1583, + "step": 40566 + }, + { + "epoch": 2.150230602278893, + "grad_norm": 3.1338911056518555, + "learning_rate": 4.543511031930931e-06, + "loss": 0.0738, + "step": 40567 + }, + { + "epoch": 2.150244167118828, + "grad_norm": 3.3434698581695557, + "learning_rate": 4.5433739893106756e-06, + "loss": 0.1103, + "step": 40568 + }, + { + "epoch": 2.150257731958763, + "grad_norm": 3.6337132453918457, + "learning_rate": 4.543236946690422e-06, + "loss": 0.1718, + "step": 40569 + }, + { + "epoch": 2.1502712967986977, + "grad_norm": 4.458979606628418, + "learning_rate": 4.543099904070166e-06, + "loss": 0.1191, + "step": 40570 + }, + { + "epoch": 2.1502848616386325, + "grad_norm": 4.371098518371582, + "learning_rate": 4.542962861449911e-06, + "loss": 0.1435, + "step": 40571 + }, + { + "epoch": 2.1502984264785674, + "grad_norm": 3.1156749725341797, + "learning_rate": 4.542825818829656e-06, + "loss": 0.1265, + "step": 40572 + }, + { + "epoch": 2.1503119913185023, + "grad_norm": 5.475155353546143, + "learning_rate": 4.5426887762094015e-06, + "loss": 0.1625, + "step": 40573 + }, + { + "epoch": 2.150325556158437, + "grad_norm": 5.2584943771362305, + "learning_rate": 4.542551733589147e-06, + "loss": 0.1759, + "step": 40574 + }, + { + "epoch": 2.150339120998372, + "grad_norm": 3.455141305923462, + "learning_rate": 4.542414690968892e-06, + "loss": 0.0916, + "step": 40575 + }, + { + "epoch": 2.1503526858383073, + "grad_norm": 4.150918483734131, + "learning_rate": 4.542277648348637e-06, + "loss": 0.0931, + "step": 40576 + }, + { + "epoch": 2.150366250678242, + "grad_norm": 5.444733619689941, + "learning_rate": 4.542140605728382e-06, + "loss": 0.1563, + "step": 40577 + }, + { + "epoch": 2.150379815518177, + "grad_norm": 3.5174975395202637, + "learning_rate": 4.542003563108127e-06, + "loss": 0.0853, + "step": 40578 + }, + { + "epoch": 2.150393380358112, + "grad_norm": 3.0898799896240234, + "learning_rate": 4.541866520487872e-06, + "loss": 0.1096, + "step": 40579 + }, + { + "epoch": 2.1504069451980468, + "grad_norm": 4.447843074798584, + "learning_rate": 4.541729477867618e-06, + "loss": 0.1667, + "step": 40580 + }, + { + "epoch": 2.1504205100379816, + "grad_norm": 4.152388572692871, + "learning_rate": 4.541592435247362e-06, + "loss": 0.1545, + "step": 40581 + }, + { + "epoch": 2.1504340748779165, + "grad_norm": 3.947601079940796, + "learning_rate": 4.541455392627108e-06, + "loss": 0.1539, + "step": 40582 + }, + { + "epoch": 2.1504476397178514, + "grad_norm": 5.165382385253906, + "learning_rate": 4.541318350006852e-06, + "loss": 0.1535, + "step": 40583 + }, + { + "epoch": 2.150461204557786, + "grad_norm": 3.998415946960449, + "learning_rate": 4.541181307386598e-06, + "loss": 0.1427, + "step": 40584 + }, + { + "epoch": 2.150474769397721, + "grad_norm": 5.884956359863281, + "learning_rate": 4.541044264766343e-06, + "loss": 0.2402, + "step": 40585 + }, + { + "epoch": 2.150488334237656, + "grad_norm": 5.367870807647705, + "learning_rate": 4.540907222146088e-06, + "loss": 0.3285, + "step": 40586 + }, + { + "epoch": 2.150501899077591, + "grad_norm": 3.5665745735168457, + "learning_rate": 4.540770179525833e-06, + "loss": 0.1426, + "step": 40587 + }, + { + "epoch": 2.1505154639175257, + "grad_norm": 4.969021797180176, + "learning_rate": 4.5406331369055774e-06, + "loss": 0.2683, + "step": 40588 + }, + { + "epoch": 2.1505290287574605, + "grad_norm": 4.148568630218506, + "learning_rate": 4.5404960942853235e-06, + "loss": 0.147, + "step": 40589 + }, + { + "epoch": 2.1505425935973954, + "grad_norm": 5.588632583618164, + "learning_rate": 4.540359051665068e-06, + "loss": 0.1531, + "step": 40590 + }, + { + "epoch": 2.1505561584373303, + "grad_norm": 5.9690752029418945, + "learning_rate": 4.540222009044814e-06, + "loss": 0.2212, + "step": 40591 + }, + { + "epoch": 2.150569723277265, + "grad_norm": 3.5617518424987793, + "learning_rate": 4.540084966424558e-06, + "loss": 0.1568, + "step": 40592 + }, + { + "epoch": 2.1505832881172, + "grad_norm": 3.9426348209381104, + "learning_rate": 4.539947923804303e-06, + "loss": 0.1494, + "step": 40593 + }, + { + "epoch": 2.1505968529571353, + "grad_norm": 3.327169179916382, + "learning_rate": 4.5398108811840485e-06, + "loss": 0.1689, + "step": 40594 + }, + { + "epoch": 2.15061041779707, + "grad_norm": 5.73010778427124, + "learning_rate": 4.539673838563794e-06, + "loss": 0.1751, + "step": 40595 + }, + { + "epoch": 2.150623982637005, + "grad_norm": 5.134832382202148, + "learning_rate": 4.539536795943539e-06, + "loss": 0.1515, + "step": 40596 + }, + { + "epoch": 2.15063754747694, + "grad_norm": 6.062410831451416, + "learning_rate": 4.539399753323284e-06, + "loss": 0.2172, + "step": 40597 + }, + { + "epoch": 2.150651112316875, + "grad_norm": 4.97716760635376, + "learning_rate": 4.539262710703029e-06, + "loss": 0.2067, + "step": 40598 + }, + { + "epoch": 2.1506646771568096, + "grad_norm": 4.178321838378906, + "learning_rate": 4.5391256680827736e-06, + "loss": 0.1758, + "step": 40599 + }, + { + "epoch": 2.1506782419967445, + "grad_norm": 8.282757759094238, + "learning_rate": 4.53898862546252e-06, + "loss": 0.3788, + "step": 40600 + }, + { + "epoch": 2.1506918068366794, + "grad_norm": 4.650074005126953, + "learning_rate": 4.538851582842264e-06, + "loss": 0.1956, + "step": 40601 + }, + { + "epoch": 2.1507053716766142, + "grad_norm": 4.487801551818848, + "learning_rate": 4.538714540222009e-06, + "loss": 0.1662, + "step": 40602 + }, + { + "epoch": 2.150718936516549, + "grad_norm": 4.9360575675964355, + "learning_rate": 4.538577497601754e-06, + "loss": 0.1413, + "step": 40603 + }, + { + "epoch": 2.150732501356484, + "grad_norm": 4.425262928009033, + "learning_rate": 4.5384404549814995e-06, + "loss": 0.1457, + "step": 40604 + }, + { + "epoch": 2.150746066196419, + "grad_norm": 4.162050724029541, + "learning_rate": 4.538303412361245e-06, + "loss": 0.1318, + "step": 40605 + }, + { + "epoch": 2.1507596310363537, + "grad_norm": 5.455963134765625, + "learning_rate": 4.53816636974099e-06, + "loss": 0.2169, + "step": 40606 + }, + { + "epoch": 2.1507731958762886, + "grad_norm": 5.462618827819824, + "learning_rate": 4.538029327120735e-06, + "loss": 0.1786, + "step": 40607 + }, + { + "epoch": 2.1507867607162234, + "grad_norm": 4.2646870613098145, + "learning_rate": 4.53789228450048e-06, + "loss": 0.1856, + "step": 40608 + }, + { + "epoch": 2.1508003255561583, + "grad_norm": 4.327503204345703, + "learning_rate": 4.537755241880225e-06, + "loss": 0.2301, + "step": 40609 + }, + { + "epoch": 2.150813890396093, + "grad_norm": 4.79550838470459, + "learning_rate": 4.5376181992599705e-06, + "loss": 0.2182, + "step": 40610 + }, + { + "epoch": 2.150827455236028, + "grad_norm": 4.095510959625244, + "learning_rate": 4.537481156639716e-06, + "loss": 0.1462, + "step": 40611 + }, + { + "epoch": 2.150841020075963, + "grad_norm": 5.020605087280273, + "learning_rate": 4.53734411401946e-06, + "loss": 0.1902, + "step": 40612 + }, + { + "epoch": 2.1508545849158978, + "grad_norm": 7.839913845062256, + "learning_rate": 4.537207071399205e-06, + "loss": 0.279, + "step": 40613 + }, + { + "epoch": 2.150868149755833, + "grad_norm": 4.927980422973633, + "learning_rate": 4.53707002877895e-06, + "loss": 0.1516, + "step": 40614 + }, + { + "epoch": 2.150881714595768, + "grad_norm": 5.014377593994141, + "learning_rate": 4.536932986158696e-06, + "loss": 0.1536, + "step": 40615 + }, + { + "epoch": 2.150895279435703, + "grad_norm": 4.7787861824035645, + "learning_rate": 4.536795943538441e-06, + "loss": 0.1502, + "step": 40616 + }, + { + "epoch": 2.1509088442756377, + "grad_norm": 3.53686785697937, + "learning_rate": 4.536658900918186e-06, + "loss": 0.0847, + "step": 40617 + }, + { + "epoch": 2.1509224091155725, + "grad_norm": 5.919753551483154, + "learning_rate": 4.536521858297931e-06, + "loss": 0.2444, + "step": 40618 + }, + { + "epoch": 2.1509359739555074, + "grad_norm": 4.730250835418701, + "learning_rate": 4.536384815677676e-06, + "loss": 0.1764, + "step": 40619 + }, + { + "epoch": 2.1509495387954423, + "grad_norm": 4.918337345123291, + "learning_rate": 4.5362477730574215e-06, + "loss": 0.1911, + "step": 40620 + }, + { + "epoch": 2.150963103635377, + "grad_norm": 4.026149749755859, + "learning_rate": 4.536110730437167e-06, + "loss": 0.1546, + "step": 40621 + }, + { + "epoch": 2.150976668475312, + "grad_norm": 5.345086097717285, + "learning_rate": 4.535973687816911e-06, + "loss": 0.1386, + "step": 40622 + }, + { + "epoch": 2.150990233315247, + "grad_norm": 6.270556926727295, + "learning_rate": 4.535836645196657e-06, + "loss": 0.1458, + "step": 40623 + }, + { + "epoch": 2.1510037981551817, + "grad_norm": 4.815964698791504, + "learning_rate": 4.535699602576401e-06, + "loss": 0.1815, + "step": 40624 + }, + { + "epoch": 2.1510173629951166, + "grad_norm": 6.220306873321533, + "learning_rate": 4.5355625599561465e-06, + "loss": 0.2875, + "step": 40625 + }, + { + "epoch": 2.1510309278350515, + "grad_norm": 4.510761260986328, + "learning_rate": 4.535425517335892e-06, + "loss": 0.169, + "step": 40626 + }, + { + "epoch": 2.1510444926749863, + "grad_norm": 4.777133464813232, + "learning_rate": 4.535288474715637e-06, + "loss": 0.2115, + "step": 40627 + }, + { + "epoch": 2.151058057514921, + "grad_norm": 4.247343063354492, + "learning_rate": 4.535151432095382e-06, + "loss": 0.178, + "step": 40628 + }, + { + "epoch": 2.151071622354856, + "grad_norm": 4.178755283355713, + "learning_rate": 4.535014389475127e-06, + "loss": 0.1269, + "step": 40629 + }, + { + "epoch": 2.151085187194791, + "grad_norm": 5.962390422821045, + "learning_rate": 4.534877346854872e-06, + "loss": 0.2597, + "step": 40630 + }, + { + "epoch": 2.1510987520347262, + "grad_norm": 5.283016681671143, + "learning_rate": 4.534740304234618e-06, + "loss": 0.1583, + "step": 40631 + }, + { + "epoch": 2.151112316874661, + "grad_norm": 4.561160564422607, + "learning_rate": 4.534603261614363e-06, + "loss": 0.1011, + "step": 40632 + }, + { + "epoch": 2.151125881714596, + "grad_norm": 4.013394832611084, + "learning_rate": 4.534466218994107e-06, + "loss": 0.2063, + "step": 40633 + }, + { + "epoch": 2.151139446554531, + "grad_norm": 4.652365684509277, + "learning_rate": 4.534329176373853e-06, + "loss": 0.2183, + "step": 40634 + }, + { + "epoch": 2.1511530113944657, + "grad_norm": 5.54542875289917, + "learning_rate": 4.5341921337535975e-06, + "loss": 0.1463, + "step": 40635 + }, + { + "epoch": 2.1511665762344006, + "grad_norm": 5.058992862701416, + "learning_rate": 4.5340550911333435e-06, + "loss": 0.205, + "step": 40636 + }, + { + "epoch": 2.1511801410743354, + "grad_norm": 4.548225402832031, + "learning_rate": 4.533918048513088e-06, + "loss": 0.2407, + "step": 40637 + }, + { + "epoch": 2.1511937059142703, + "grad_norm": 6.2633843421936035, + "learning_rate": 4.533781005892833e-06, + "loss": 0.2086, + "step": 40638 + }, + { + "epoch": 2.151207270754205, + "grad_norm": 4.523732662200928, + "learning_rate": 4.533643963272578e-06, + "loss": 0.1728, + "step": 40639 + }, + { + "epoch": 2.15122083559414, + "grad_norm": 5.528403282165527, + "learning_rate": 4.533506920652323e-06, + "loss": 0.1912, + "step": 40640 + }, + { + "epoch": 2.151234400434075, + "grad_norm": 3.3013699054718018, + "learning_rate": 4.5333698780320685e-06, + "loss": 0.0955, + "step": 40641 + }, + { + "epoch": 2.1512479652740097, + "grad_norm": 5.2505693435668945, + "learning_rate": 4.533232835411813e-06, + "loss": 0.1692, + "step": 40642 + }, + { + "epoch": 2.1512615301139446, + "grad_norm": 5.510406494140625, + "learning_rate": 4.533095792791559e-06, + "loss": 0.2229, + "step": 40643 + }, + { + "epoch": 2.1512750949538795, + "grad_norm": 3.8389699459075928, + "learning_rate": 4.532958750171303e-06, + "loss": 0.1024, + "step": 40644 + }, + { + "epoch": 2.1512886597938143, + "grad_norm": 7.279944896697998, + "learning_rate": 4.532821707551049e-06, + "loss": 0.2324, + "step": 40645 + }, + { + "epoch": 2.151302224633749, + "grad_norm": 3.567011594772339, + "learning_rate": 4.532684664930794e-06, + "loss": 0.1533, + "step": 40646 + }, + { + "epoch": 2.151315789473684, + "grad_norm": 3.3337788581848145, + "learning_rate": 4.532547622310539e-06, + "loss": 0.1381, + "step": 40647 + }, + { + "epoch": 2.151329354313619, + "grad_norm": 4.239434242248535, + "learning_rate": 4.532410579690284e-06, + "loss": 0.1686, + "step": 40648 + }, + { + "epoch": 2.151342919153554, + "grad_norm": 3.976715087890625, + "learning_rate": 4.532273537070029e-06, + "loss": 0.0927, + "step": 40649 + }, + { + "epoch": 2.1513564839934887, + "grad_norm": 4.101602077484131, + "learning_rate": 4.532136494449774e-06, + "loss": 0.1039, + "step": 40650 + }, + { + "epoch": 2.1513700488334235, + "grad_norm": 4.012908935546875, + "learning_rate": 4.5319994518295195e-06, + "loss": 0.2216, + "step": 40651 + }, + { + "epoch": 2.151383613673359, + "grad_norm": 4.358926773071289, + "learning_rate": 4.531862409209265e-06, + "loss": 0.1677, + "step": 40652 + }, + { + "epoch": 2.1513971785132937, + "grad_norm": 4.409259796142578, + "learning_rate": 4.531725366589009e-06, + "loss": 0.1103, + "step": 40653 + }, + { + "epoch": 2.1514107433532286, + "grad_norm": 3.290388822555542, + "learning_rate": 4.531588323968755e-06, + "loss": 0.0967, + "step": 40654 + }, + { + "epoch": 2.1514243081931634, + "grad_norm": 3.839857816696167, + "learning_rate": 4.531451281348499e-06, + "loss": 0.1657, + "step": 40655 + }, + { + "epoch": 2.1514378730330983, + "grad_norm": 4.219344139099121, + "learning_rate": 4.5313142387282445e-06, + "loss": 0.2001, + "step": 40656 + }, + { + "epoch": 2.151451437873033, + "grad_norm": 3.4416143894195557, + "learning_rate": 4.53117719610799e-06, + "loss": 0.1568, + "step": 40657 + }, + { + "epoch": 2.151465002712968, + "grad_norm": 3.5855560302734375, + "learning_rate": 4.531040153487735e-06, + "loss": 0.1713, + "step": 40658 + }, + { + "epoch": 2.151478567552903, + "grad_norm": 4.698116302490234, + "learning_rate": 4.53090311086748e-06, + "loss": 0.2269, + "step": 40659 + }, + { + "epoch": 2.1514921323928378, + "grad_norm": 3.9402053356170654, + "learning_rate": 4.530766068247225e-06, + "loss": 0.1908, + "step": 40660 + }, + { + "epoch": 2.1515056972327726, + "grad_norm": 5.915598392486572, + "learning_rate": 4.53062902562697e-06, + "loss": 0.1446, + "step": 40661 + }, + { + "epoch": 2.1515192620727075, + "grad_norm": 6.037900924682617, + "learning_rate": 4.530491983006716e-06, + "loss": 0.1882, + "step": 40662 + }, + { + "epoch": 2.1515328269126424, + "grad_norm": 5.05055570602417, + "learning_rate": 4.530354940386461e-06, + "loss": 0.1697, + "step": 40663 + }, + { + "epoch": 2.1515463917525772, + "grad_norm": 4.0170183181762695, + "learning_rate": 4.530217897766206e-06, + "loss": 0.1381, + "step": 40664 + }, + { + "epoch": 2.151559956592512, + "grad_norm": 4.999405860900879, + "learning_rate": 4.530080855145951e-06, + "loss": 0.1389, + "step": 40665 + }, + { + "epoch": 2.151573521432447, + "grad_norm": 5.989939212799072, + "learning_rate": 4.5299438125256955e-06, + "loss": 0.1664, + "step": 40666 + }, + { + "epoch": 2.151587086272382, + "grad_norm": 4.674835681915283, + "learning_rate": 4.529806769905441e-06, + "loss": 0.147, + "step": 40667 + }, + { + "epoch": 2.1516006511123167, + "grad_norm": 3.831334352493286, + "learning_rate": 4.529669727285186e-06, + "loss": 0.1197, + "step": 40668 + }, + { + "epoch": 2.151614215952252, + "grad_norm": 4.179650783538818, + "learning_rate": 4.529532684664931e-06, + "loss": 0.0759, + "step": 40669 + }, + { + "epoch": 2.151627780792187, + "grad_norm": 4.9287309646606445, + "learning_rate": 4.529395642044676e-06, + "loss": 0.1473, + "step": 40670 + }, + { + "epoch": 2.1516413456321217, + "grad_norm": 4.033716201782227, + "learning_rate": 4.529258599424421e-06, + "loss": 0.1634, + "step": 40671 + }, + { + "epoch": 2.1516549104720566, + "grad_norm": 4.748419284820557, + "learning_rate": 4.5291215568041665e-06, + "loss": 0.2205, + "step": 40672 + }, + { + "epoch": 2.1516684753119915, + "grad_norm": 4.126311779022217, + "learning_rate": 4.528984514183912e-06, + "loss": 0.1194, + "step": 40673 + }, + { + "epoch": 2.1516820401519263, + "grad_norm": 5.043088436126709, + "learning_rate": 4.528847471563657e-06, + "loss": 0.1845, + "step": 40674 + }, + { + "epoch": 2.151695604991861, + "grad_norm": 4.0895795822143555, + "learning_rate": 4.528710428943402e-06, + "loss": 0.1447, + "step": 40675 + }, + { + "epoch": 2.151709169831796, + "grad_norm": 4.598827838897705, + "learning_rate": 4.528573386323146e-06, + "loss": 0.1377, + "step": 40676 + }, + { + "epoch": 2.151722734671731, + "grad_norm": 4.739293098449707, + "learning_rate": 4.5284363437028924e-06, + "loss": 0.144, + "step": 40677 + }, + { + "epoch": 2.151736299511666, + "grad_norm": 4.826476097106934, + "learning_rate": 4.528299301082637e-06, + "loss": 0.1674, + "step": 40678 + }, + { + "epoch": 2.1517498643516006, + "grad_norm": 5.186114311218262, + "learning_rate": 4.528162258462383e-06, + "loss": 0.209, + "step": 40679 + }, + { + "epoch": 2.1517634291915355, + "grad_norm": 3.415735960006714, + "learning_rate": 4.528025215842127e-06, + "loss": 0.1521, + "step": 40680 + }, + { + "epoch": 2.1517769940314704, + "grad_norm": 4.2965989112854, + "learning_rate": 4.527888173221872e-06, + "loss": 0.1335, + "step": 40681 + }, + { + "epoch": 2.1517905588714052, + "grad_norm": 4.65876579284668, + "learning_rate": 4.5277511306016175e-06, + "loss": 0.1299, + "step": 40682 + }, + { + "epoch": 2.15180412371134, + "grad_norm": 4.037533760070801, + "learning_rate": 4.527614087981363e-06, + "loss": 0.1331, + "step": 40683 + }, + { + "epoch": 2.151817688551275, + "grad_norm": 3.8661160469055176, + "learning_rate": 4.527477045361108e-06, + "loss": 0.1488, + "step": 40684 + }, + { + "epoch": 2.15183125339121, + "grad_norm": 4.183257102966309, + "learning_rate": 4.527340002740853e-06, + "loss": 0.1238, + "step": 40685 + }, + { + "epoch": 2.1518448182311447, + "grad_norm": 5.840410232543945, + "learning_rate": 4.527202960120598e-06, + "loss": 0.1475, + "step": 40686 + }, + { + "epoch": 2.1518583830710796, + "grad_norm": 4.415883541107178, + "learning_rate": 4.5270659175003425e-06, + "loss": 0.1674, + "step": 40687 + }, + { + "epoch": 2.1518719479110144, + "grad_norm": 5.105077743530273, + "learning_rate": 4.5269288748800886e-06, + "loss": 0.1149, + "step": 40688 + }, + { + "epoch": 2.1518855127509493, + "grad_norm": 4.142688751220703, + "learning_rate": 4.526791832259833e-06, + "loss": 0.1164, + "step": 40689 + }, + { + "epoch": 2.1518990775908846, + "grad_norm": 3.5259618759155273, + "learning_rate": 4.526654789639579e-06, + "loss": 0.1119, + "step": 40690 + }, + { + "epoch": 2.1519126424308195, + "grad_norm": 4.34550142288208, + "learning_rate": 4.526517747019323e-06, + "loss": 0.1347, + "step": 40691 + }, + { + "epoch": 2.1519262072707543, + "grad_norm": 6.1891632080078125, + "learning_rate": 4.5263807043990684e-06, + "loss": 0.1849, + "step": 40692 + }, + { + "epoch": 2.151939772110689, + "grad_norm": 3.6432442665100098, + "learning_rate": 4.526243661778814e-06, + "loss": 0.1428, + "step": 40693 + }, + { + "epoch": 2.151953336950624, + "grad_norm": 4.565466403961182, + "learning_rate": 4.526106619158559e-06, + "loss": 0.1264, + "step": 40694 + }, + { + "epoch": 2.151966901790559, + "grad_norm": 4.3433918952941895, + "learning_rate": 4.525969576538304e-06, + "loss": 0.1759, + "step": 40695 + }, + { + "epoch": 2.151980466630494, + "grad_norm": 5.219222545623779, + "learning_rate": 4.525832533918048e-06, + "loss": 0.1631, + "step": 40696 + }, + { + "epoch": 2.1519940314704287, + "grad_norm": 6.767698764801025, + "learning_rate": 4.525695491297794e-06, + "loss": 0.3241, + "step": 40697 + }, + { + "epoch": 2.1520075963103635, + "grad_norm": 5.681527614593506, + "learning_rate": 4.525558448677539e-06, + "loss": 0.1871, + "step": 40698 + }, + { + "epoch": 2.1520211611502984, + "grad_norm": 2.67414927482605, + "learning_rate": 4.525421406057285e-06, + "loss": 0.1018, + "step": 40699 + }, + { + "epoch": 2.1520347259902333, + "grad_norm": 5.4764580726623535, + "learning_rate": 4.525284363437029e-06, + "loss": 0.1484, + "step": 40700 + }, + { + "epoch": 2.152048290830168, + "grad_norm": 4.529231548309326, + "learning_rate": 4.525147320816774e-06, + "loss": 0.1684, + "step": 40701 + }, + { + "epoch": 2.152061855670103, + "grad_norm": 2.850517749786377, + "learning_rate": 4.525010278196519e-06, + "loss": 0.1201, + "step": 40702 + }, + { + "epoch": 2.152075420510038, + "grad_norm": 6.327111721038818, + "learning_rate": 4.5248732355762645e-06, + "loss": 0.1974, + "step": 40703 + }, + { + "epoch": 2.1520889853499727, + "grad_norm": 5.587594509124756, + "learning_rate": 4.52473619295601e-06, + "loss": 0.2713, + "step": 40704 + }, + { + "epoch": 2.1521025501899076, + "grad_norm": 3.0875611305236816, + "learning_rate": 4.524599150335755e-06, + "loss": 0.0745, + "step": 40705 + }, + { + "epoch": 2.1521161150298425, + "grad_norm": 4.187774658203125, + "learning_rate": 4.5244621077155e-06, + "loss": 0.1136, + "step": 40706 + }, + { + "epoch": 2.1521296798697778, + "grad_norm": 5.223255634307861, + "learning_rate": 4.524325065095244e-06, + "loss": 0.2032, + "step": 40707 + }, + { + "epoch": 2.1521432447097126, + "grad_norm": 4.366724014282227, + "learning_rate": 4.5241880224749904e-06, + "loss": 0.1225, + "step": 40708 + }, + { + "epoch": 2.1521568095496475, + "grad_norm": 6.664827346801758, + "learning_rate": 4.524050979854735e-06, + "loss": 0.243, + "step": 40709 + }, + { + "epoch": 2.1521703743895824, + "grad_norm": 6.34402322769165, + "learning_rate": 4.52391393723448e-06, + "loss": 0.3064, + "step": 40710 + }, + { + "epoch": 2.1521839392295172, + "grad_norm": 6.833383083343506, + "learning_rate": 4.523776894614225e-06, + "loss": 0.3375, + "step": 40711 + }, + { + "epoch": 2.152197504069452, + "grad_norm": 5.034159183502197, + "learning_rate": 4.52363985199397e-06, + "loss": 0.2308, + "step": 40712 + }, + { + "epoch": 2.152211068909387, + "grad_norm": 4.823480129241943, + "learning_rate": 4.5235028093737155e-06, + "loss": 0.2155, + "step": 40713 + }, + { + "epoch": 2.152224633749322, + "grad_norm": 4.609325885772705, + "learning_rate": 4.523365766753461e-06, + "loss": 0.2484, + "step": 40714 + }, + { + "epoch": 2.1522381985892567, + "grad_norm": 7.379452705383301, + "learning_rate": 4.523228724133206e-06, + "loss": 0.2941, + "step": 40715 + }, + { + "epoch": 2.1522517634291916, + "grad_norm": 4.6171555519104, + "learning_rate": 4.523091681512951e-06, + "loss": 0.2176, + "step": 40716 + }, + { + "epoch": 2.1522653282691264, + "grad_norm": 4.287282466888428, + "learning_rate": 4.522954638892696e-06, + "loss": 0.1807, + "step": 40717 + }, + { + "epoch": 2.1522788931090613, + "grad_norm": 6.81822395324707, + "learning_rate": 4.522817596272441e-06, + "loss": 0.2304, + "step": 40718 + }, + { + "epoch": 2.152292457948996, + "grad_norm": 5.442262172698975, + "learning_rate": 4.5226805536521866e-06, + "loss": 0.2589, + "step": 40719 + }, + { + "epoch": 2.152306022788931, + "grad_norm": 6.275567054748535, + "learning_rate": 4.522543511031932e-06, + "loss": 0.2421, + "step": 40720 + }, + { + "epoch": 2.152319587628866, + "grad_norm": 3.9761481285095215, + "learning_rate": 4.522406468411676e-06, + "loss": 0.1209, + "step": 40721 + }, + { + "epoch": 2.1523331524688007, + "grad_norm": 5.1704182624816895, + "learning_rate": 4.522269425791421e-06, + "loss": 0.1639, + "step": 40722 + }, + { + "epoch": 2.1523467173087356, + "grad_norm": 4.419622898101807, + "learning_rate": 4.5221323831711664e-06, + "loss": 0.1462, + "step": 40723 + }, + { + "epoch": 2.1523602821486705, + "grad_norm": 5.80369234085083, + "learning_rate": 4.521995340550912e-06, + "loss": 0.1885, + "step": 40724 + }, + { + "epoch": 2.1523738469886053, + "grad_norm": 7.695866107940674, + "learning_rate": 4.521858297930657e-06, + "loss": 0.2678, + "step": 40725 + }, + { + "epoch": 2.15238741182854, + "grad_norm": 5.667210102081299, + "learning_rate": 4.521721255310402e-06, + "loss": 0.2105, + "step": 40726 + }, + { + "epoch": 2.1524009766684755, + "grad_norm": 5.821847438812256, + "learning_rate": 4.521584212690147e-06, + "loss": 0.1862, + "step": 40727 + }, + { + "epoch": 2.1524145415084104, + "grad_norm": 5.594727993011475, + "learning_rate": 4.521447170069892e-06, + "loss": 0.2462, + "step": 40728 + }, + { + "epoch": 2.1524281063483452, + "grad_norm": 5.576606750488281, + "learning_rate": 4.5213101274496375e-06, + "loss": 0.2522, + "step": 40729 + }, + { + "epoch": 2.15244167118828, + "grad_norm": 4.5017242431640625, + "learning_rate": 4.521173084829382e-06, + "loss": 0.1886, + "step": 40730 + }, + { + "epoch": 2.152455236028215, + "grad_norm": 6.050501346588135, + "learning_rate": 4.521036042209128e-06, + "loss": 0.1494, + "step": 40731 + }, + { + "epoch": 2.15246880086815, + "grad_norm": 5.042418003082275, + "learning_rate": 4.520898999588872e-06, + "loss": 0.2036, + "step": 40732 + }, + { + "epoch": 2.1524823657080847, + "grad_norm": 5.01835823059082, + "learning_rate": 4.520761956968618e-06, + "loss": 0.3382, + "step": 40733 + }, + { + "epoch": 2.1524959305480196, + "grad_norm": 3.655165433883667, + "learning_rate": 4.5206249143483626e-06, + "loss": 0.0734, + "step": 40734 + }, + { + "epoch": 2.1525094953879544, + "grad_norm": 7.445361614227295, + "learning_rate": 4.520487871728108e-06, + "loss": 0.2151, + "step": 40735 + }, + { + "epoch": 2.1525230602278893, + "grad_norm": 4.705997943878174, + "learning_rate": 4.520350829107853e-06, + "loss": 0.1177, + "step": 40736 + }, + { + "epoch": 2.152536625067824, + "grad_norm": 3.6603682041168213, + "learning_rate": 4.520213786487598e-06, + "loss": 0.097, + "step": 40737 + }, + { + "epoch": 2.152550189907759, + "grad_norm": 5.687748432159424, + "learning_rate": 4.520076743867343e-06, + "loss": 0.1795, + "step": 40738 + }, + { + "epoch": 2.152563754747694, + "grad_norm": 4.985374450683594, + "learning_rate": 4.5199397012470884e-06, + "loss": 0.2139, + "step": 40739 + }, + { + "epoch": 2.1525773195876288, + "grad_norm": 5.351343631744385, + "learning_rate": 4.519802658626834e-06, + "loss": 0.2824, + "step": 40740 + }, + { + "epoch": 2.1525908844275636, + "grad_norm": 4.22374963760376, + "learning_rate": 4.519665616006578e-06, + "loss": 0.1484, + "step": 40741 + }, + { + "epoch": 2.1526044492674985, + "grad_norm": 5.356111526489258, + "learning_rate": 4.519528573386324e-06, + "loss": 0.1717, + "step": 40742 + }, + { + "epoch": 2.1526180141074334, + "grad_norm": 5.311766147613525, + "learning_rate": 4.519391530766068e-06, + "loss": 0.1816, + "step": 40743 + }, + { + "epoch": 2.1526315789473682, + "grad_norm": 4.902834415435791, + "learning_rate": 4.5192544881458135e-06, + "loss": 0.1398, + "step": 40744 + }, + { + "epoch": 2.1526451437873035, + "grad_norm": 4.948930740356445, + "learning_rate": 4.519117445525559e-06, + "loss": 0.1647, + "step": 40745 + }, + { + "epoch": 2.1526587086272384, + "grad_norm": 6.573355674743652, + "learning_rate": 4.518980402905304e-06, + "loss": 0.2486, + "step": 40746 + }, + { + "epoch": 2.1526722734671733, + "grad_norm": 3.9204471111297607, + "learning_rate": 4.518843360285049e-06, + "loss": 0.1815, + "step": 40747 + }, + { + "epoch": 2.152685838307108, + "grad_norm": 5.352674961090088, + "learning_rate": 4.518706317664794e-06, + "loss": 0.2748, + "step": 40748 + }, + { + "epoch": 2.152699403147043, + "grad_norm": 5.105657577514648, + "learning_rate": 4.518569275044539e-06, + "loss": 0.2587, + "step": 40749 + }, + { + "epoch": 2.152712967986978, + "grad_norm": 4.3238701820373535, + "learning_rate": 4.518432232424284e-06, + "loss": 0.1696, + "step": 40750 + }, + { + "epoch": 2.1527265328269127, + "grad_norm": 4.676621913909912, + "learning_rate": 4.51829518980403e-06, + "loss": 0.2574, + "step": 40751 + }, + { + "epoch": 2.1527400976668476, + "grad_norm": 5.41549825668335, + "learning_rate": 4.518158147183774e-06, + "loss": 0.1759, + "step": 40752 + }, + { + "epoch": 2.1527536625067825, + "grad_norm": 3.5676941871643066, + "learning_rate": 4.51802110456352e-06, + "loss": 0.1378, + "step": 40753 + }, + { + "epoch": 2.1527672273467173, + "grad_norm": 5.086960792541504, + "learning_rate": 4.5178840619432644e-06, + "loss": 0.2044, + "step": 40754 + }, + { + "epoch": 2.152780792186652, + "grad_norm": 4.940203666687012, + "learning_rate": 4.51774701932301e-06, + "loss": 0.2107, + "step": 40755 + }, + { + "epoch": 2.152794357026587, + "grad_norm": 5.057690620422363, + "learning_rate": 4.517609976702755e-06, + "loss": 0.2241, + "step": 40756 + }, + { + "epoch": 2.152807921866522, + "grad_norm": 5.4384870529174805, + "learning_rate": 4.5174729340825e-06, + "loss": 0.232, + "step": 40757 + }, + { + "epoch": 2.152821486706457, + "grad_norm": 5.453596115112305, + "learning_rate": 4.517335891462245e-06, + "loss": 0.1743, + "step": 40758 + }, + { + "epoch": 2.1528350515463917, + "grad_norm": 4.457586765289307, + "learning_rate": 4.51719884884199e-06, + "loss": 0.1962, + "step": 40759 + }, + { + "epoch": 2.1528486163863265, + "grad_norm": 4.936089992523193, + "learning_rate": 4.5170618062217355e-06, + "loss": 0.2008, + "step": 40760 + }, + { + "epoch": 2.1528621812262614, + "grad_norm": 3.9217915534973145, + "learning_rate": 4.516924763601481e-06, + "loss": 0.2427, + "step": 40761 + }, + { + "epoch": 2.1528757460661962, + "grad_norm": 3.2930216789245605, + "learning_rate": 4.516787720981226e-06, + "loss": 0.153, + "step": 40762 + }, + { + "epoch": 2.152889310906131, + "grad_norm": 5.156599044799805, + "learning_rate": 4.51665067836097e-06, + "loss": 0.1869, + "step": 40763 + }, + { + "epoch": 2.152902875746066, + "grad_norm": 4.783578872680664, + "learning_rate": 4.516513635740715e-06, + "loss": 0.232, + "step": 40764 + }, + { + "epoch": 2.1529164405860013, + "grad_norm": 4.871575355529785, + "learning_rate": 4.5163765931204606e-06, + "loss": 0.1565, + "step": 40765 + }, + { + "epoch": 2.152930005425936, + "grad_norm": 4.473821640014648, + "learning_rate": 4.516239550500206e-06, + "loss": 0.244, + "step": 40766 + }, + { + "epoch": 2.152943570265871, + "grad_norm": 4.593005657196045, + "learning_rate": 4.516102507879951e-06, + "loss": 0.1686, + "step": 40767 + }, + { + "epoch": 2.152957135105806, + "grad_norm": 5.620882034301758, + "learning_rate": 4.515965465259696e-06, + "loss": 0.2295, + "step": 40768 + }, + { + "epoch": 2.1529706999457408, + "grad_norm": 4.095781326293945, + "learning_rate": 4.515828422639441e-06, + "loss": 0.1366, + "step": 40769 + }, + { + "epoch": 2.1529842647856756, + "grad_norm": 4.041867256164551, + "learning_rate": 4.5156913800191864e-06, + "loss": 0.2417, + "step": 40770 + }, + { + "epoch": 2.1529978296256105, + "grad_norm": 4.323451995849609, + "learning_rate": 4.515554337398932e-06, + "loss": 0.2618, + "step": 40771 + }, + { + "epoch": 2.1530113944655453, + "grad_norm": 6.731897830963135, + "learning_rate": 4.515417294778677e-06, + "loss": 0.2353, + "step": 40772 + }, + { + "epoch": 2.15302495930548, + "grad_norm": 3.809150457382202, + "learning_rate": 4.515280252158422e-06, + "loss": 0.1751, + "step": 40773 + }, + { + "epoch": 2.153038524145415, + "grad_norm": 4.106357097625732, + "learning_rate": 4.515143209538167e-06, + "loss": 0.2277, + "step": 40774 + }, + { + "epoch": 2.15305208898535, + "grad_norm": 3.8358309268951416, + "learning_rate": 4.5150061669179115e-06, + "loss": 0.2078, + "step": 40775 + }, + { + "epoch": 2.153065653825285, + "grad_norm": 5.815710067749023, + "learning_rate": 4.514869124297657e-06, + "loss": 0.324, + "step": 40776 + }, + { + "epoch": 2.1530792186652197, + "grad_norm": 4.852973461151123, + "learning_rate": 4.514732081677402e-06, + "loss": 0.2352, + "step": 40777 + }, + { + "epoch": 2.1530927835051545, + "grad_norm": 5.038518905639648, + "learning_rate": 4.514595039057147e-06, + "loss": 0.2713, + "step": 40778 + }, + { + "epoch": 2.1531063483450894, + "grad_norm": 3.701503038406372, + "learning_rate": 4.514457996436892e-06, + "loss": 0.1463, + "step": 40779 + }, + { + "epoch": 2.1531199131850243, + "grad_norm": 3.9603545665740967, + "learning_rate": 4.514320953816637e-06, + "loss": 0.1305, + "step": 40780 + }, + { + "epoch": 2.153133478024959, + "grad_norm": 6.478179454803467, + "learning_rate": 4.5141839111963826e-06, + "loss": 0.2166, + "step": 40781 + }, + { + "epoch": 2.153147042864894, + "grad_norm": 4.063236713409424, + "learning_rate": 4.514046868576128e-06, + "loss": 0.2037, + "step": 40782 + }, + { + "epoch": 2.1531606077048293, + "grad_norm": 4.134322643280029, + "learning_rate": 4.513909825955873e-06, + "loss": 0.25, + "step": 40783 + }, + { + "epoch": 2.153174172544764, + "grad_norm": 5.576930999755859, + "learning_rate": 4.513772783335617e-06, + "loss": 0.2454, + "step": 40784 + }, + { + "epoch": 2.153187737384699, + "grad_norm": 4.299707412719727, + "learning_rate": 4.513635740715363e-06, + "loss": 0.2035, + "step": 40785 + }, + { + "epoch": 2.153201302224634, + "grad_norm": 5.0130109786987305, + "learning_rate": 4.513498698095108e-06, + "loss": 0.1957, + "step": 40786 + }, + { + "epoch": 2.1532148670645688, + "grad_norm": 4.890120029449463, + "learning_rate": 4.513361655474854e-06, + "loss": 0.1475, + "step": 40787 + }, + { + "epoch": 2.1532284319045036, + "grad_norm": 4.569880962371826, + "learning_rate": 4.513224612854598e-06, + "loss": 0.2501, + "step": 40788 + }, + { + "epoch": 2.1532419967444385, + "grad_norm": 4.098506927490234, + "learning_rate": 4.513087570234343e-06, + "loss": 0.216, + "step": 40789 + }, + { + "epoch": 2.1532555615843734, + "grad_norm": 5.183156967163086, + "learning_rate": 4.512950527614088e-06, + "loss": 0.2768, + "step": 40790 + }, + { + "epoch": 2.1532691264243082, + "grad_norm": 6.050011157989502, + "learning_rate": 4.5128134849938335e-06, + "loss": 0.2669, + "step": 40791 + }, + { + "epoch": 2.153282691264243, + "grad_norm": 7.059510231018066, + "learning_rate": 4.512676442373579e-06, + "loss": 0.2484, + "step": 40792 + }, + { + "epoch": 2.153296256104178, + "grad_norm": 5.020392417907715, + "learning_rate": 4.512539399753324e-06, + "loss": 0.2746, + "step": 40793 + }, + { + "epoch": 2.153309820944113, + "grad_norm": 6.603011131286621, + "learning_rate": 4.512402357133069e-06, + "loss": 0.2425, + "step": 40794 + }, + { + "epoch": 2.1533233857840477, + "grad_norm": 6.891805171966553, + "learning_rate": 4.512265314512813e-06, + "loss": 0.3284, + "step": 40795 + }, + { + "epoch": 2.1533369506239826, + "grad_norm": 3.4841201305389404, + "learning_rate": 4.512128271892559e-06, + "loss": 0.159, + "step": 40796 + }, + { + "epoch": 2.1533505154639174, + "grad_norm": 3.581676721572876, + "learning_rate": 4.511991229272304e-06, + "loss": 0.1706, + "step": 40797 + }, + { + "epoch": 2.1533640803038523, + "grad_norm": 5.3437018394470215, + "learning_rate": 4.511854186652049e-06, + "loss": 0.2227, + "step": 40798 + }, + { + "epoch": 2.153377645143787, + "grad_norm": 4.694930553436279, + "learning_rate": 4.511717144031794e-06, + "loss": 0.2674, + "step": 40799 + }, + { + "epoch": 2.153391209983722, + "grad_norm": 4.2523274421691895, + "learning_rate": 4.511580101411539e-06, + "loss": 0.2077, + "step": 40800 + }, + { + "epoch": 2.153404774823657, + "grad_norm": 5.000969886779785, + "learning_rate": 4.5114430587912845e-06, + "loss": 0.196, + "step": 40801 + }, + { + "epoch": 2.1534183396635918, + "grad_norm": 3.6742632389068604, + "learning_rate": 4.51130601617103e-06, + "loss": 0.2025, + "step": 40802 + }, + { + "epoch": 2.153431904503527, + "grad_norm": 5.060644149780273, + "learning_rate": 4.511168973550775e-06, + "loss": 0.2206, + "step": 40803 + }, + { + "epoch": 2.153445469343462, + "grad_norm": 4.914466381072998, + "learning_rate": 4.511031930930519e-06, + "loss": 0.1758, + "step": 40804 + }, + { + "epoch": 2.153459034183397, + "grad_norm": 4.207054138183594, + "learning_rate": 4.510894888310265e-06, + "loss": 0.1488, + "step": 40805 + }, + { + "epoch": 2.1534725990233317, + "grad_norm": 3.6404476165771484, + "learning_rate": 4.5107578456900095e-06, + "loss": 0.1611, + "step": 40806 + }, + { + "epoch": 2.1534861638632665, + "grad_norm": 3.446089506149292, + "learning_rate": 4.5106208030697555e-06, + "loss": 0.1313, + "step": 40807 + }, + { + "epoch": 2.1534997287032014, + "grad_norm": 4.011858940124512, + "learning_rate": 4.5104837604495e-06, + "loss": 0.1847, + "step": 40808 + }, + { + "epoch": 2.1535132935431363, + "grad_norm": 5.150480270385742, + "learning_rate": 4.510346717829245e-06, + "loss": 0.2556, + "step": 40809 + }, + { + "epoch": 2.153526858383071, + "grad_norm": 5.307488441467285, + "learning_rate": 4.51020967520899e-06, + "loss": 0.2308, + "step": 40810 + }, + { + "epoch": 2.153540423223006, + "grad_norm": 5.526358127593994, + "learning_rate": 4.510072632588735e-06, + "loss": 0.2787, + "step": 40811 + }, + { + "epoch": 2.153553988062941, + "grad_norm": 4.789261341094971, + "learning_rate": 4.5099355899684806e-06, + "loss": 0.2694, + "step": 40812 + }, + { + "epoch": 2.1535675529028757, + "grad_norm": 5.245181083679199, + "learning_rate": 4.509798547348226e-06, + "loss": 0.1807, + "step": 40813 + }, + { + "epoch": 2.1535811177428106, + "grad_norm": 4.152589321136475, + "learning_rate": 4.509661504727971e-06, + "loss": 0.1818, + "step": 40814 + }, + { + "epoch": 2.1535946825827454, + "grad_norm": 4.699631690979004, + "learning_rate": 4.509524462107716e-06, + "loss": 0.2003, + "step": 40815 + }, + { + "epoch": 2.1536082474226803, + "grad_norm": 3.205498218536377, + "learning_rate": 4.509387419487461e-06, + "loss": 0.1192, + "step": 40816 + }, + { + "epoch": 2.153621812262615, + "grad_norm": 5.245802879333496, + "learning_rate": 4.5092503768672065e-06, + "loss": 0.1512, + "step": 40817 + }, + { + "epoch": 2.15363537710255, + "grad_norm": 4.319262504577637, + "learning_rate": 4.509113334246951e-06, + "loss": 0.1197, + "step": 40818 + }, + { + "epoch": 2.153648941942485, + "grad_norm": 5.8607869148254395, + "learning_rate": 4.508976291626696e-06, + "loss": 0.1715, + "step": 40819 + }, + { + "epoch": 2.1536625067824198, + "grad_norm": 4.643044471740723, + "learning_rate": 4.508839249006441e-06, + "loss": 0.196, + "step": 40820 + }, + { + "epoch": 2.153676071622355, + "grad_norm": 4.705971717834473, + "learning_rate": 4.508702206386186e-06, + "loss": 0.1501, + "step": 40821 + }, + { + "epoch": 2.15368963646229, + "grad_norm": 4.7442851066589355, + "learning_rate": 4.5085651637659315e-06, + "loss": 0.194, + "step": 40822 + }, + { + "epoch": 2.153703201302225, + "grad_norm": 4.109049320220947, + "learning_rate": 4.508428121145677e-06, + "loss": 0.1683, + "step": 40823 + }, + { + "epoch": 2.1537167661421597, + "grad_norm": 4.372992992401123, + "learning_rate": 4.508291078525422e-06, + "loss": 0.1698, + "step": 40824 + }, + { + "epoch": 2.1537303309820945, + "grad_norm": 4.344950199127197, + "learning_rate": 4.508154035905167e-06, + "loss": 0.1827, + "step": 40825 + }, + { + "epoch": 2.1537438958220294, + "grad_norm": 5.582417964935303, + "learning_rate": 4.508016993284912e-06, + "loss": 0.1971, + "step": 40826 + }, + { + "epoch": 2.1537574606619643, + "grad_norm": 3.914316177368164, + "learning_rate": 4.507879950664657e-06, + "loss": 0.2531, + "step": 40827 + }, + { + "epoch": 2.153771025501899, + "grad_norm": 3.9286179542541504, + "learning_rate": 4.507742908044403e-06, + "loss": 0.1256, + "step": 40828 + }, + { + "epoch": 2.153784590341834, + "grad_norm": 5.057641983032227, + "learning_rate": 4.507605865424147e-06, + "loss": 0.1455, + "step": 40829 + }, + { + "epoch": 2.153798155181769, + "grad_norm": 3.0069782733917236, + "learning_rate": 4.507468822803893e-06, + "loss": 0.1285, + "step": 40830 + }, + { + "epoch": 2.1538117200217037, + "grad_norm": 4.1064372062683105, + "learning_rate": 4.507331780183637e-06, + "loss": 0.1359, + "step": 40831 + }, + { + "epoch": 2.1538252848616386, + "grad_norm": 4.18768835067749, + "learning_rate": 4.5071947375633825e-06, + "loss": 0.1552, + "step": 40832 + }, + { + "epoch": 2.1538388497015735, + "grad_norm": 4.196662425994873, + "learning_rate": 4.507057694943128e-06, + "loss": 0.1557, + "step": 40833 + }, + { + "epoch": 2.1538524145415083, + "grad_norm": 3.058961868286133, + "learning_rate": 4.506920652322873e-06, + "loss": 0.0755, + "step": 40834 + }, + { + "epoch": 2.153865979381443, + "grad_norm": 7.114927291870117, + "learning_rate": 4.506783609702618e-06, + "loss": 0.2464, + "step": 40835 + }, + { + "epoch": 2.153879544221378, + "grad_norm": 4.473599910736084, + "learning_rate": 4.506646567082363e-06, + "loss": 0.1662, + "step": 40836 + }, + { + "epoch": 2.153893109061313, + "grad_norm": 4.540731906890869, + "learning_rate": 4.506509524462108e-06, + "loss": 0.1782, + "step": 40837 + }, + { + "epoch": 2.153906673901248, + "grad_norm": 5.196277141571045, + "learning_rate": 4.506372481841853e-06, + "loss": 0.1583, + "step": 40838 + }, + { + "epoch": 2.1539202387411827, + "grad_norm": 2.6389217376708984, + "learning_rate": 4.506235439221599e-06, + "loss": 0.0875, + "step": 40839 + }, + { + "epoch": 2.1539338035811175, + "grad_norm": 5.882532596588135, + "learning_rate": 4.506098396601343e-06, + "loss": 0.1713, + "step": 40840 + }, + { + "epoch": 2.153947368421053, + "grad_norm": 7.4798407554626465, + "learning_rate": 4.505961353981089e-06, + "loss": 0.1731, + "step": 40841 + }, + { + "epoch": 2.1539609332609877, + "grad_norm": 3.584537982940674, + "learning_rate": 4.505824311360833e-06, + "loss": 0.1286, + "step": 40842 + }, + { + "epoch": 2.1539744981009226, + "grad_norm": 4.32918119430542, + "learning_rate": 4.505687268740579e-06, + "loss": 0.14, + "step": 40843 + }, + { + "epoch": 2.1539880629408574, + "grad_norm": 3.5986995697021484, + "learning_rate": 4.505550226120324e-06, + "loss": 0.0984, + "step": 40844 + }, + { + "epoch": 2.1540016277807923, + "grad_norm": 3.701785087585449, + "learning_rate": 4.505413183500069e-06, + "loss": 0.099, + "step": 40845 + }, + { + "epoch": 2.154015192620727, + "grad_norm": 4.225861549377441, + "learning_rate": 4.505276140879814e-06, + "loss": 0.1875, + "step": 40846 + }, + { + "epoch": 2.154028757460662, + "grad_norm": 3.0720062255859375, + "learning_rate": 4.5051390982595584e-06, + "loss": 0.1217, + "step": 40847 + }, + { + "epoch": 2.154042322300597, + "grad_norm": 5.2093353271484375, + "learning_rate": 4.5050020556393045e-06, + "loss": 0.1681, + "step": 40848 + }, + { + "epoch": 2.1540558871405318, + "grad_norm": 4.373510837554932, + "learning_rate": 4.504865013019049e-06, + "loss": 0.1276, + "step": 40849 + }, + { + "epoch": 2.1540694519804666, + "grad_norm": 3.8928380012512207, + "learning_rate": 4.504727970398795e-06, + "loss": 0.1223, + "step": 40850 + }, + { + "epoch": 2.1540830168204015, + "grad_norm": 4.91213321685791, + "learning_rate": 4.504590927778539e-06, + "loss": 0.1589, + "step": 40851 + }, + { + "epoch": 2.1540965816603364, + "grad_norm": 2.8305442333221436, + "learning_rate": 4.504453885158284e-06, + "loss": 0.0659, + "step": 40852 + }, + { + "epoch": 2.154110146500271, + "grad_norm": 5.422524929046631, + "learning_rate": 4.5043168425380295e-06, + "loss": 0.1485, + "step": 40853 + }, + { + "epoch": 2.154123711340206, + "grad_norm": 3.7474255561828613, + "learning_rate": 4.504179799917775e-06, + "loss": 0.0812, + "step": 40854 + }, + { + "epoch": 2.154137276180141, + "grad_norm": 3.866140127182007, + "learning_rate": 4.50404275729752e-06, + "loss": 0.0959, + "step": 40855 + }, + { + "epoch": 2.154150841020076, + "grad_norm": 3.8810882568359375, + "learning_rate": 4.503905714677265e-06, + "loss": 0.158, + "step": 40856 + }, + { + "epoch": 2.1541644058600107, + "grad_norm": 5.130619525909424, + "learning_rate": 4.50376867205701e-06, + "loss": 0.1397, + "step": 40857 + }, + { + "epoch": 2.1541779706999455, + "grad_norm": 3.9170942306518555, + "learning_rate": 4.503631629436755e-06, + "loss": 0.1508, + "step": 40858 + }, + { + "epoch": 2.154191535539881, + "grad_norm": 3.5963053703308105, + "learning_rate": 4.503494586816501e-06, + "loss": 0.1352, + "step": 40859 + }, + { + "epoch": 2.1542051003798157, + "grad_norm": 4.5166335105896, + "learning_rate": 4.503357544196245e-06, + "loss": 0.2498, + "step": 40860 + }, + { + "epoch": 2.1542186652197506, + "grad_norm": 3.9882559776306152, + "learning_rate": 4.503220501575991e-06, + "loss": 0.1063, + "step": 40861 + }, + { + "epoch": 2.1542322300596854, + "grad_norm": 5.161514759063721, + "learning_rate": 4.503083458955735e-06, + "loss": 0.1577, + "step": 40862 + }, + { + "epoch": 2.1542457948996203, + "grad_norm": 3.3582677841186523, + "learning_rate": 4.5029464163354805e-06, + "loss": 0.1138, + "step": 40863 + }, + { + "epoch": 2.154259359739555, + "grad_norm": 4.4828410148620605, + "learning_rate": 4.502809373715226e-06, + "loss": 0.1376, + "step": 40864 + }, + { + "epoch": 2.15427292457949, + "grad_norm": 4.029040813446045, + "learning_rate": 4.502672331094971e-06, + "loss": 0.1725, + "step": 40865 + }, + { + "epoch": 2.154286489419425, + "grad_norm": 5.103982925415039, + "learning_rate": 4.502535288474716e-06, + "loss": 0.1582, + "step": 40866 + }, + { + "epoch": 2.1543000542593598, + "grad_norm": 5.0427141189575195, + "learning_rate": 4.502398245854461e-06, + "loss": 0.0919, + "step": 40867 + }, + { + "epoch": 2.1543136190992946, + "grad_norm": 4.954185485839844, + "learning_rate": 4.502261203234206e-06, + "loss": 0.2093, + "step": 40868 + }, + { + "epoch": 2.1543271839392295, + "grad_norm": 4.400216102600098, + "learning_rate": 4.5021241606139515e-06, + "loss": 0.1541, + "step": 40869 + }, + { + "epoch": 2.1543407487791644, + "grad_norm": 4.20306921005249, + "learning_rate": 4.501987117993697e-06, + "loss": 0.1179, + "step": 40870 + }, + { + "epoch": 2.1543543136190992, + "grad_norm": 5.270503044128418, + "learning_rate": 4.501850075373442e-06, + "loss": 0.2031, + "step": 40871 + }, + { + "epoch": 2.154367878459034, + "grad_norm": 5.972383499145508, + "learning_rate": 4.501713032753186e-06, + "loss": 0.1576, + "step": 40872 + }, + { + "epoch": 2.154381443298969, + "grad_norm": 3.8790738582611084, + "learning_rate": 4.501575990132931e-06, + "loss": 0.1521, + "step": 40873 + }, + { + "epoch": 2.154395008138904, + "grad_norm": 3.9991517066955566, + "learning_rate": 4.501438947512677e-06, + "loss": 0.1278, + "step": 40874 + }, + { + "epoch": 2.1544085729788387, + "grad_norm": 4.873879432678223, + "learning_rate": 4.501301904892422e-06, + "loss": 0.148, + "step": 40875 + }, + { + "epoch": 2.1544221378187736, + "grad_norm": 4.7283124923706055, + "learning_rate": 4.501164862272167e-06, + "loss": 0.1387, + "step": 40876 + }, + { + "epoch": 2.1544357026587084, + "grad_norm": 4.805690288543701, + "learning_rate": 4.501027819651912e-06, + "loss": 0.1523, + "step": 40877 + }, + { + "epoch": 2.1544492674986433, + "grad_norm": 5.137620449066162, + "learning_rate": 4.500890777031657e-06, + "loss": 0.1382, + "step": 40878 + }, + { + "epoch": 2.1544628323385786, + "grad_norm": 6.182934761047363, + "learning_rate": 4.5007537344114025e-06, + "loss": 0.159, + "step": 40879 + }, + { + "epoch": 2.1544763971785135, + "grad_norm": 3.5203354358673096, + "learning_rate": 4.500616691791148e-06, + "loss": 0.1268, + "step": 40880 + }, + { + "epoch": 2.1544899620184483, + "grad_norm": 4.669098854064941, + "learning_rate": 4.500479649170893e-06, + "loss": 0.1641, + "step": 40881 + }, + { + "epoch": 2.154503526858383, + "grad_norm": 5.457401275634766, + "learning_rate": 4.500342606550638e-06, + "loss": 0.1385, + "step": 40882 + }, + { + "epoch": 2.154517091698318, + "grad_norm": 4.253430366516113, + "learning_rate": 4.500205563930382e-06, + "loss": 0.1651, + "step": 40883 + }, + { + "epoch": 2.154530656538253, + "grad_norm": 5.944967269897461, + "learning_rate": 4.500068521310128e-06, + "loss": 0.2162, + "step": 40884 + }, + { + "epoch": 2.154544221378188, + "grad_norm": 6.095129489898682, + "learning_rate": 4.499931478689873e-06, + "loss": 0.2128, + "step": 40885 + }, + { + "epoch": 2.1545577862181227, + "grad_norm": 5.103250503540039, + "learning_rate": 4.499794436069618e-06, + "loss": 0.1382, + "step": 40886 + }, + { + "epoch": 2.1545713510580575, + "grad_norm": 4.564746856689453, + "learning_rate": 4.499657393449363e-06, + "loss": 0.1763, + "step": 40887 + }, + { + "epoch": 2.1545849158979924, + "grad_norm": 7.07996940612793, + "learning_rate": 4.499520350829108e-06, + "loss": 0.1056, + "step": 40888 + }, + { + "epoch": 2.1545984807379273, + "grad_norm": 4.753501892089844, + "learning_rate": 4.499383308208853e-06, + "loss": 0.12, + "step": 40889 + }, + { + "epoch": 2.154612045577862, + "grad_norm": 4.236532688140869, + "learning_rate": 4.499246265588599e-06, + "loss": 0.1304, + "step": 40890 + }, + { + "epoch": 2.154625610417797, + "grad_norm": 4.120491981506348, + "learning_rate": 4.499109222968344e-06, + "loss": 0.0981, + "step": 40891 + }, + { + "epoch": 2.154639175257732, + "grad_norm": 3.9890525341033936, + "learning_rate": 4.498972180348088e-06, + "loss": 0.207, + "step": 40892 + }, + { + "epoch": 2.1546527400976667, + "grad_norm": 6.161162376403809, + "learning_rate": 4.498835137727834e-06, + "loss": 0.1688, + "step": 40893 + }, + { + "epoch": 2.1546663049376016, + "grad_norm": 4.728005886077881, + "learning_rate": 4.4986980951075785e-06, + "loss": 0.1901, + "step": 40894 + }, + { + "epoch": 2.1546798697775364, + "grad_norm": 5.946649551391602, + "learning_rate": 4.4985610524873245e-06, + "loss": 0.1715, + "step": 40895 + }, + { + "epoch": 2.1546934346174713, + "grad_norm": 3.82490873336792, + "learning_rate": 4.498424009867069e-06, + "loss": 0.1112, + "step": 40896 + }, + { + "epoch": 2.1547069994574066, + "grad_norm": 4.788853645324707, + "learning_rate": 4.498286967246814e-06, + "loss": 0.1635, + "step": 40897 + }, + { + "epoch": 2.1547205642973415, + "grad_norm": 7.17760705947876, + "learning_rate": 4.498149924626559e-06, + "loss": 0.2284, + "step": 40898 + }, + { + "epoch": 2.1547341291372764, + "grad_norm": 6.445132732391357, + "learning_rate": 4.498012882006304e-06, + "loss": 0.2341, + "step": 40899 + }, + { + "epoch": 2.154747693977211, + "grad_norm": 6.1243696212768555, + "learning_rate": 4.4978758393860495e-06, + "loss": 0.2354, + "step": 40900 + }, + { + "epoch": 2.154761258817146, + "grad_norm": 5.339096546173096, + "learning_rate": 4.497738796765794e-06, + "loss": 0.176, + "step": 40901 + }, + { + "epoch": 2.154774823657081, + "grad_norm": 5.668790340423584, + "learning_rate": 4.49760175414554e-06, + "loss": 0.174, + "step": 40902 + }, + { + "epoch": 2.154788388497016, + "grad_norm": 3.9261233806610107, + "learning_rate": 4.497464711525284e-06, + "loss": 0.1609, + "step": 40903 + }, + { + "epoch": 2.1548019533369507, + "grad_norm": 6.522303581237793, + "learning_rate": 4.49732766890503e-06, + "loss": 0.2987, + "step": 40904 + }, + { + "epoch": 2.1548155181768855, + "grad_norm": 4.276460647583008, + "learning_rate": 4.497190626284775e-06, + "loss": 0.1518, + "step": 40905 + }, + { + "epoch": 2.1548290830168204, + "grad_norm": 4.217906951904297, + "learning_rate": 4.49705358366452e-06, + "loss": 0.1778, + "step": 40906 + }, + { + "epoch": 2.1548426478567553, + "grad_norm": 4.707910060882568, + "learning_rate": 4.496916541044265e-06, + "loss": 0.2064, + "step": 40907 + }, + { + "epoch": 2.15485621269669, + "grad_norm": 4.392813205718994, + "learning_rate": 4.49677949842401e-06, + "loss": 0.2216, + "step": 40908 + }, + { + "epoch": 2.154869777536625, + "grad_norm": 3.874790906906128, + "learning_rate": 4.496642455803755e-06, + "loss": 0.1561, + "step": 40909 + }, + { + "epoch": 2.15488334237656, + "grad_norm": 3.6677744388580322, + "learning_rate": 4.4965054131835005e-06, + "loss": 0.1515, + "step": 40910 + }, + { + "epoch": 2.1548969072164947, + "grad_norm": 3.539790153503418, + "learning_rate": 4.496368370563246e-06, + "loss": 0.127, + "step": 40911 + }, + { + "epoch": 2.1549104720564296, + "grad_norm": 6.560365200042725, + "learning_rate": 4.496231327942991e-06, + "loss": 0.2026, + "step": 40912 + }, + { + "epoch": 2.1549240368963645, + "grad_norm": 5.000281810760498, + "learning_rate": 4.496094285322736e-06, + "loss": 0.1572, + "step": 40913 + }, + { + "epoch": 2.1549376017362993, + "grad_norm": 3.427666664123535, + "learning_rate": 4.49595724270248e-06, + "loss": 0.18, + "step": 40914 + }, + { + "epoch": 2.154951166576234, + "grad_norm": 6.070725917816162, + "learning_rate": 4.495820200082226e-06, + "loss": 0.2056, + "step": 40915 + }, + { + "epoch": 2.154964731416169, + "grad_norm": 4.472167491912842, + "learning_rate": 4.495683157461971e-06, + "loss": 0.1979, + "step": 40916 + }, + { + "epoch": 2.1549782962561044, + "grad_norm": 5.464618682861328, + "learning_rate": 4.495546114841716e-06, + "loss": 0.2342, + "step": 40917 + }, + { + "epoch": 2.1549918610960392, + "grad_norm": 5.190905570983887, + "learning_rate": 4.495409072221461e-06, + "loss": 0.2175, + "step": 40918 + }, + { + "epoch": 2.155005425935974, + "grad_norm": 5.7293219566345215, + "learning_rate": 4.495272029601206e-06, + "loss": 0.1775, + "step": 40919 + }, + { + "epoch": 2.155018990775909, + "grad_norm": 5.1011271476745605, + "learning_rate": 4.495134986980951e-06, + "loss": 0.2442, + "step": 40920 + }, + { + "epoch": 2.155032555615844, + "grad_norm": 5.3291544914245605, + "learning_rate": 4.494997944360697e-06, + "loss": 0.2175, + "step": 40921 + }, + { + "epoch": 2.1550461204557787, + "grad_norm": 4.708662509918213, + "learning_rate": 4.494860901740442e-06, + "loss": 0.2058, + "step": 40922 + }, + { + "epoch": 2.1550596852957136, + "grad_norm": 6.150596618652344, + "learning_rate": 4.494723859120187e-06, + "loss": 0.2952, + "step": 40923 + }, + { + "epoch": 2.1550732501356484, + "grad_norm": 6.127913951873779, + "learning_rate": 4.494586816499932e-06, + "loss": 0.2628, + "step": 40924 + }, + { + "epoch": 2.1550868149755833, + "grad_norm": 4.482236862182617, + "learning_rate": 4.494449773879677e-06, + "loss": 0.2264, + "step": 40925 + }, + { + "epoch": 2.155100379815518, + "grad_norm": 4.358312129974365, + "learning_rate": 4.494312731259422e-06, + "loss": 0.1603, + "step": 40926 + }, + { + "epoch": 2.155113944655453, + "grad_norm": 4.103761196136475, + "learning_rate": 4.494175688639168e-06, + "loss": 0.1971, + "step": 40927 + }, + { + "epoch": 2.155127509495388, + "grad_norm": 5.804872035980225, + "learning_rate": 4.494038646018912e-06, + "loss": 0.2743, + "step": 40928 + }, + { + "epoch": 2.1551410743353228, + "grad_norm": 4.468423366546631, + "learning_rate": 4.493901603398657e-06, + "loss": 0.1053, + "step": 40929 + }, + { + "epoch": 2.1551546391752576, + "grad_norm": 2.706538438796997, + "learning_rate": 4.493764560778402e-06, + "loss": 0.1018, + "step": 40930 + }, + { + "epoch": 2.1551682040151925, + "grad_norm": 3.988983154296875, + "learning_rate": 4.4936275181581475e-06, + "loss": 0.2029, + "step": 40931 + }, + { + "epoch": 2.1551817688551274, + "grad_norm": 5.667667388916016, + "learning_rate": 4.493490475537893e-06, + "loss": 0.1707, + "step": 40932 + }, + { + "epoch": 2.155195333695062, + "grad_norm": 6.568978786468506, + "learning_rate": 4.493353432917638e-06, + "loss": 0.2705, + "step": 40933 + }, + { + "epoch": 2.155208898534997, + "grad_norm": 4.406030654907227, + "learning_rate": 4.493216390297383e-06, + "loss": 0.1762, + "step": 40934 + }, + { + "epoch": 2.1552224633749324, + "grad_norm": 4.268347263336182, + "learning_rate": 4.493079347677127e-06, + "loss": 0.1642, + "step": 40935 + }, + { + "epoch": 2.1552360282148673, + "grad_norm": 4.063026428222656, + "learning_rate": 4.4929423050568734e-06, + "loss": 0.144, + "step": 40936 + }, + { + "epoch": 2.155249593054802, + "grad_norm": 3.5179286003112793, + "learning_rate": 4.492805262436618e-06, + "loss": 0.1008, + "step": 40937 + }, + { + "epoch": 2.155263157894737, + "grad_norm": 4.221414089202881, + "learning_rate": 4.492668219816364e-06, + "loss": 0.1733, + "step": 40938 + }, + { + "epoch": 2.155276722734672, + "grad_norm": 4.918814182281494, + "learning_rate": 4.492531177196108e-06, + "loss": 0.1956, + "step": 40939 + }, + { + "epoch": 2.1552902875746067, + "grad_norm": 3.455583095550537, + "learning_rate": 4.492394134575853e-06, + "loss": 0.1426, + "step": 40940 + }, + { + "epoch": 2.1553038524145416, + "grad_norm": 4.375432014465332, + "learning_rate": 4.4922570919555985e-06, + "loss": 0.1046, + "step": 40941 + }, + { + "epoch": 2.1553174172544765, + "grad_norm": 4.3722991943359375, + "learning_rate": 4.492120049335344e-06, + "loss": 0.1382, + "step": 40942 + }, + { + "epoch": 2.1553309820944113, + "grad_norm": 4.162030220031738, + "learning_rate": 4.491983006715089e-06, + "loss": 0.1576, + "step": 40943 + }, + { + "epoch": 2.155344546934346, + "grad_norm": 4.128951549530029, + "learning_rate": 4.491845964094834e-06, + "loss": 0.0978, + "step": 40944 + }, + { + "epoch": 2.155358111774281, + "grad_norm": 4.176110744476318, + "learning_rate": 4.491708921474579e-06, + "loss": 0.168, + "step": 40945 + }, + { + "epoch": 2.155371676614216, + "grad_norm": 5.229143142700195, + "learning_rate": 4.4915718788543235e-06, + "loss": 0.2315, + "step": 40946 + }, + { + "epoch": 2.155385241454151, + "grad_norm": 3.801520347595215, + "learning_rate": 4.4914348362340696e-06, + "loss": 0.156, + "step": 40947 + }, + { + "epoch": 2.1553988062940856, + "grad_norm": 4.420543193817139, + "learning_rate": 4.491297793613814e-06, + "loss": 0.1794, + "step": 40948 + }, + { + "epoch": 2.1554123711340205, + "grad_norm": 4.401656627655029, + "learning_rate": 4.49116075099356e-06, + "loss": 0.1213, + "step": 40949 + }, + { + "epoch": 2.1554259359739554, + "grad_norm": 5.158730983734131, + "learning_rate": 4.491023708373304e-06, + "loss": 0.1698, + "step": 40950 + }, + { + "epoch": 2.1554395008138902, + "grad_norm": 4.540731430053711, + "learning_rate": 4.4908866657530494e-06, + "loss": 0.1285, + "step": 40951 + }, + { + "epoch": 2.155453065653825, + "grad_norm": 6.319272041320801, + "learning_rate": 4.490749623132795e-06, + "loss": 0.1115, + "step": 40952 + }, + { + "epoch": 2.15546663049376, + "grad_norm": 3.6194260120391846, + "learning_rate": 4.49061258051254e-06, + "loss": 0.091, + "step": 40953 + }, + { + "epoch": 2.155480195333695, + "grad_norm": 3.8326995372772217, + "learning_rate": 4.490475537892285e-06, + "loss": 0.0947, + "step": 40954 + }, + { + "epoch": 2.15549376017363, + "grad_norm": 4.968395709991455, + "learning_rate": 4.490338495272029e-06, + "loss": 0.1336, + "step": 40955 + }, + { + "epoch": 2.155507325013565, + "grad_norm": 5.571431636810303, + "learning_rate": 4.490201452651775e-06, + "loss": 0.168, + "step": 40956 + }, + { + "epoch": 2.1555208898535, + "grad_norm": 4.540437698364258, + "learning_rate": 4.49006441003152e-06, + "loss": 0.2002, + "step": 40957 + }, + { + "epoch": 2.1555344546934347, + "grad_norm": 5.488803386688232, + "learning_rate": 4.489927367411266e-06, + "loss": 0.2133, + "step": 40958 + }, + { + "epoch": 2.1555480195333696, + "grad_norm": 4.465157985687256, + "learning_rate": 4.48979032479101e-06, + "loss": 0.2036, + "step": 40959 + }, + { + "epoch": 2.1555615843733045, + "grad_norm": 4.7700042724609375, + "learning_rate": 4.489653282170755e-06, + "loss": 0.1603, + "step": 40960 + }, + { + "epoch": 2.1555751492132393, + "grad_norm": 5.3391923904418945, + "learning_rate": 4.4895162395505e-06, + "loss": 0.1409, + "step": 40961 + }, + { + "epoch": 2.155588714053174, + "grad_norm": 5.3234968185424805, + "learning_rate": 4.4893791969302455e-06, + "loss": 0.1465, + "step": 40962 + }, + { + "epoch": 2.155602278893109, + "grad_norm": 3.974006175994873, + "learning_rate": 4.489242154309991e-06, + "loss": 0.1526, + "step": 40963 + }, + { + "epoch": 2.155615843733044, + "grad_norm": 3.7254321575164795, + "learning_rate": 4.489105111689736e-06, + "loss": 0.1364, + "step": 40964 + }, + { + "epoch": 2.155629408572979, + "grad_norm": 5.401611328125, + "learning_rate": 4.488968069069481e-06, + "loss": 0.2417, + "step": 40965 + }, + { + "epoch": 2.1556429734129137, + "grad_norm": 3.689363956451416, + "learning_rate": 4.488831026449226e-06, + "loss": 0.1113, + "step": 40966 + }, + { + "epoch": 2.1556565382528485, + "grad_norm": 5.091294765472412, + "learning_rate": 4.4886939838289714e-06, + "loss": 0.1519, + "step": 40967 + }, + { + "epoch": 2.1556701030927834, + "grad_norm": 3.979290246963501, + "learning_rate": 4.488556941208717e-06, + "loss": 0.1968, + "step": 40968 + }, + { + "epoch": 2.1556836679327183, + "grad_norm": 4.6939544677734375, + "learning_rate": 4.488419898588462e-06, + "loss": 0.1351, + "step": 40969 + }, + { + "epoch": 2.155697232772653, + "grad_norm": 6.9047064781188965, + "learning_rate": 4.488282855968206e-06, + "loss": 0.2208, + "step": 40970 + }, + { + "epoch": 2.155710797612588, + "grad_norm": 5.484201431274414, + "learning_rate": 4.488145813347951e-06, + "loss": 0.1705, + "step": 40971 + }, + { + "epoch": 2.155724362452523, + "grad_norm": 5.215184211730957, + "learning_rate": 4.4880087707276965e-06, + "loss": 0.1882, + "step": 40972 + }, + { + "epoch": 2.155737927292458, + "grad_norm": 3.2499940395355225, + "learning_rate": 4.487871728107442e-06, + "loss": 0.136, + "step": 40973 + }, + { + "epoch": 2.155751492132393, + "grad_norm": 5.1631646156311035, + "learning_rate": 4.487734685487187e-06, + "loss": 0.183, + "step": 40974 + }, + { + "epoch": 2.155765056972328, + "grad_norm": 4.623081207275391, + "learning_rate": 4.487597642866932e-06, + "loss": 0.1707, + "step": 40975 + }, + { + "epoch": 2.1557786218122628, + "grad_norm": 3.3089370727539062, + "learning_rate": 4.487460600246677e-06, + "loss": 0.0826, + "step": 40976 + }, + { + "epoch": 2.1557921866521976, + "grad_norm": 4.470206260681152, + "learning_rate": 4.487323557626422e-06, + "loss": 0.1569, + "step": 40977 + }, + { + "epoch": 2.1558057514921325, + "grad_norm": 6.886084079742432, + "learning_rate": 4.4871865150061676e-06, + "loss": 0.2052, + "step": 40978 + }, + { + "epoch": 2.1558193163320674, + "grad_norm": 4.9045634269714355, + "learning_rate": 4.487049472385913e-06, + "loss": 0.1303, + "step": 40979 + }, + { + "epoch": 2.1558328811720022, + "grad_norm": 4.164214134216309, + "learning_rate": 4.486912429765657e-06, + "loss": 0.1472, + "step": 40980 + }, + { + "epoch": 2.155846446011937, + "grad_norm": 3.932527780532837, + "learning_rate": 4.486775387145403e-06, + "loss": 0.1371, + "step": 40981 + }, + { + "epoch": 2.155860010851872, + "grad_norm": 5.331343173980713, + "learning_rate": 4.4866383445251474e-06, + "loss": 0.1632, + "step": 40982 + }, + { + "epoch": 2.155873575691807, + "grad_norm": 3.6642773151397705, + "learning_rate": 4.486501301904893e-06, + "loss": 0.1173, + "step": 40983 + }, + { + "epoch": 2.1558871405317417, + "grad_norm": 6.245152473449707, + "learning_rate": 4.486364259284638e-06, + "loss": 0.2526, + "step": 40984 + }, + { + "epoch": 2.1559007053716766, + "grad_norm": 6.497813701629639, + "learning_rate": 4.486227216664383e-06, + "loss": 0.2144, + "step": 40985 + }, + { + "epoch": 2.1559142702116114, + "grad_norm": 10.168906211853027, + "learning_rate": 4.486090174044128e-06, + "loss": 0.4565, + "step": 40986 + }, + { + "epoch": 2.1559278350515463, + "grad_norm": 5.542379856109619, + "learning_rate": 4.485953131423873e-06, + "loss": 0.1355, + "step": 40987 + }, + { + "epoch": 2.155941399891481, + "grad_norm": 4.687046051025391, + "learning_rate": 4.4858160888036185e-06, + "loss": 0.166, + "step": 40988 + }, + { + "epoch": 2.155954964731416, + "grad_norm": 3.2843925952911377, + "learning_rate": 4.485679046183363e-06, + "loss": 0.1237, + "step": 40989 + }, + { + "epoch": 2.155968529571351, + "grad_norm": 4.003382682800293, + "learning_rate": 4.485542003563109e-06, + "loss": 0.1335, + "step": 40990 + }, + { + "epoch": 2.1559820944112857, + "grad_norm": 5.878345012664795, + "learning_rate": 4.485404960942853e-06, + "loss": 0.1758, + "step": 40991 + }, + { + "epoch": 2.1559956592512206, + "grad_norm": 5.990060806274414, + "learning_rate": 4.485267918322599e-06, + "loss": 0.2649, + "step": 40992 + }, + { + "epoch": 2.156009224091156, + "grad_norm": 5.5224103927612305, + "learning_rate": 4.4851308757023436e-06, + "loss": 0.1758, + "step": 40993 + }, + { + "epoch": 2.156022788931091, + "grad_norm": 5.314646244049072, + "learning_rate": 4.484993833082089e-06, + "loss": 0.1922, + "step": 40994 + }, + { + "epoch": 2.1560363537710256, + "grad_norm": 4.006992340087891, + "learning_rate": 4.484856790461834e-06, + "loss": 0.142, + "step": 40995 + }, + { + "epoch": 2.1560499186109605, + "grad_norm": 5.081424713134766, + "learning_rate": 4.484719747841579e-06, + "loss": 0.158, + "step": 40996 + }, + { + "epoch": 2.1560634834508954, + "grad_norm": 6.967613697052002, + "learning_rate": 4.484582705221324e-06, + "loss": 0.2776, + "step": 40997 + }, + { + "epoch": 2.1560770482908302, + "grad_norm": 4.653165340423584, + "learning_rate": 4.4844456626010694e-06, + "loss": 0.191, + "step": 40998 + }, + { + "epoch": 2.156090613130765, + "grad_norm": 5.3372297286987305, + "learning_rate": 4.484308619980815e-06, + "loss": 0.1788, + "step": 40999 + }, + { + "epoch": 2.1561041779707, + "grad_norm": 6.425411701202393, + "learning_rate": 4.484171577360559e-06, + "loss": 0.2337, + "step": 41000 + }, + { + "epoch": 2.156117742810635, + "grad_norm": 4.06667423248291, + "learning_rate": 4.484034534740305e-06, + "loss": 0.1697, + "step": 41001 + }, + { + "epoch": 2.1561313076505697, + "grad_norm": 6.737324237823486, + "learning_rate": 4.483897492120049e-06, + "loss": 0.2329, + "step": 41002 + }, + { + "epoch": 2.1561448724905046, + "grad_norm": 3.938941240310669, + "learning_rate": 4.483760449499795e-06, + "loss": 0.1801, + "step": 41003 + }, + { + "epoch": 2.1561584373304394, + "grad_norm": 6.097302436828613, + "learning_rate": 4.48362340687954e-06, + "loss": 0.2512, + "step": 41004 + }, + { + "epoch": 2.1561720021703743, + "grad_norm": 5.648477554321289, + "learning_rate": 4.483486364259285e-06, + "loss": 0.1833, + "step": 41005 + }, + { + "epoch": 2.156185567010309, + "grad_norm": 4.28166389465332, + "learning_rate": 4.48334932163903e-06, + "loss": 0.1342, + "step": 41006 + }, + { + "epoch": 2.156199131850244, + "grad_norm": 3.797154188156128, + "learning_rate": 4.483212279018775e-06, + "loss": 0.1198, + "step": 41007 + }, + { + "epoch": 2.156212696690179, + "grad_norm": 6.323339939117432, + "learning_rate": 4.48307523639852e-06, + "loss": 0.2106, + "step": 41008 + }, + { + "epoch": 2.1562262615301138, + "grad_norm": 7.006045341491699, + "learning_rate": 4.4829381937782656e-06, + "loss": 0.2561, + "step": 41009 + }, + { + "epoch": 2.1562398263700486, + "grad_norm": 5.580580711364746, + "learning_rate": 4.482801151158011e-06, + "loss": 0.2443, + "step": 41010 + }, + { + "epoch": 2.156253391209984, + "grad_norm": 5.456686496734619, + "learning_rate": 4.482664108537755e-06, + "loss": 0.1657, + "step": 41011 + }, + { + "epoch": 2.156266956049919, + "grad_norm": 4.10685396194458, + "learning_rate": 4.482527065917501e-06, + "loss": 0.171, + "step": 41012 + }, + { + "epoch": 2.1562805208898537, + "grad_norm": 4.105914115905762, + "learning_rate": 4.4823900232972454e-06, + "loss": 0.1311, + "step": 41013 + }, + { + "epoch": 2.1562940857297885, + "grad_norm": 6.135598659515381, + "learning_rate": 4.482252980676991e-06, + "loss": 0.1812, + "step": 41014 + }, + { + "epoch": 2.1563076505697234, + "grad_norm": 4.713290691375732, + "learning_rate": 4.482115938056736e-06, + "loss": 0.2343, + "step": 41015 + }, + { + "epoch": 2.1563212154096583, + "grad_norm": 6.491498947143555, + "learning_rate": 4.481978895436481e-06, + "loss": 0.2938, + "step": 41016 + }, + { + "epoch": 2.156334780249593, + "grad_norm": 3.7348856925964355, + "learning_rate": 4.481841852816226e-06, + "loss": 0.1746, + "step": 41017 + }, + { + "epoch": 2.156348345089528, + "grad_norm": 5.554759502410889, + "learning_rate": 4.481704810195971e-06, + "loss": 0.2018, + "step": 41018 + }, + { + "epoch": 2.156361909929463, + "grad_norm": 5.243224143981934, + "learning_rate": 4.4815677675757165e-06, + "loss": 0.1782, + "step": 41019 + }, + { + "epoch": 2.1563754747693977, + "grad_norm": 4.9406046867370605, + "learning_rate": 4.481430724955462e-06, + "loss": 0.212, + "step": 41020 + }, + { + "epoch": 2.1563890396093326, + "grad_norm": 4.911313533782959, + "learning_rate": 4.481293682335207e-06, + "loss": 0.2316, + "step": 41021 + }, + { + "epoch": 2.1564026044492675, + "grad_norm": 4.444357872009277, + "learning_rate": 4.481156639714952e-06, + "loss": 0.1177, + "step": 41022 + }, + { + "epoch": 2.1564161692892023, + "grad_norm": 11.439545631408691, + "learning_rate": 4.481019597094697e-06, + "loss": 0.3086, + "step": 41023 + }, + { + "epoch": 2.156429734129137, + "grad_norm": 6.938476085662842, + "learning_rate": 4.4808825544744416e-06, + "loss": 0.2753, + "step": 41024 + }, + { + "epoch": 2.156443298969072, + "grad_norm": 4.881418704986572, + "learning_rate": 4.480745511854187e-06, + "loss": 0.217, + "step": 41025 + }, + { + "epoch": 2.156456863809007, + "grad_norm": 4.978537559509277, + "learning_rate": 4.480608469233932e-06, + "loss": 0.254, + "step": 41026 + }, + { + "epoch": 2.156470428648942, + "grad_norm": 4.96118688583374, + "learning_rate": 4.480471426613677e-06, + "loss": 0.2256, + "step": 41027 + }, + { + "epoch": 2.1564839934888766, + "grad_norm": 5.282763957977295, + "learning_rate": 4.480334383993422e-06, + "loss": 0.256, + "step": 41028 + }, + { + "epoch": 2.1564975583288115, + "grad_norm": 6.623041152954102, + "learning_rate": 4.4801973413731675e-06, + "loss": 0.206, + "step": 41029 + }, + { + "epoch": 2.1565111231687464, + "grad_norm": 5.378204822540283, + "learning_rate": 4.480060298752913e-06, + "loss": 0.293, + "step": 41030 + }, + { + "epoch": 2.1565246880086817, + "grad_norm": 6.368656158447266, + "learning_rate": 4.479923256132658e-06, + "loss": 0.3399, + "step": 41031 + }, + { + "epoch": 2.1565382528486166, + "grad_norm": 6.094560146331787, + "learning_rate": 4.479786213512403e-06, + "loss": 0.2259, + "step": 41032 + }, + { + "epoch": 2.1565518176885514, + "grad_norm": 5.5138773918151855, + "learning_rate": 4.479649170892148e-06, + "loss": 0.3751, + "step": 41033 + }, + { + "epoch": 2.1565653825284863, + "grad_norm": 7.707118034362793, + "learning_rate": 4.4795121282718925e-06, + "loss": 0.3611, + "step": 41034 + }, + { + "epoch": 2.156578947368421, + "grad_norm": 6.539961814880371, + "learning_rate": 4.4793750856516385e-06, + "loss": 0.2884, + "step": 41035 + }, + { + "epoch": 2.156592512208356, + "grad_norm": 5.53924036026001, + "learning_rate": 4.479238043031383e-06, + "loss": 0.1977, + "step": 41036 + }, + { + "epoch": 2.156606077048291, + "grad_norm": 3.999617099761963, + "learning_rate": 4.479101000411129e-06, + "loss": 0.2135, + "step": 41037 + }, + { + "epoch": 2.1566196418882257, + "grad_norm": 4.753815650939941, + "learning_rate": 4.478963957790873e-06, + "loss": 0.2533, + "step": 41038 + }, + { + "epoch": 2.1566332067281606, + "grad_norm": 4.98063850402832, + "learning_rate": 4.478826915170618e-06, + "loss": 0.2249, + "step": 41039 + }, + { + "epoch": 2.1566467715680955, + "grad_norm": 4.388524532318115, + "learning_rate": 4.4786898725503636e-06, + "loss": 0.1388, + "step": 41040 + }, + { + "epoch": 2.1566603364080303, + "grad_norm": 5.826890468597412, + "learning_rate": 4.478552829930109e-06, + "loss": 0.1767, + "step": 41041 + }, + { + "epoch": 2.156673901247965, + "grad_norm": 7.451931476593018, + "learning_rate": 4.478415787309854e-06, + "loss": 0.2731, + "step": 41042 + }, + { + "epoch": 2.1566874660879, + "grad_norm": 5.014342308044434, + "learning_rate": 4.478278744689598e-06, + "loss": 0.1514, + "step": 41043 + }, + { + "epoch": 2.156701030927835, + "grad_norm": 5.219019412994385, + "learning_rate": 4.478141702069344e-06, + "loss": 0.1277, + "step": 41044 + }, + { + "epoch": 2.15671459576777, + "grad_norm": 6.386125564575195, + "learning_rate": 4.478004659449089e-06, + "loss": 0.3118, + "step": 41045 + }, + { + "epoch": 2.1567281606077047, + "grad_norm": 4.421187400817871, + "learning_rate": 4.477867616828835e-06, + "loss": 0.1992, + "step": 41046 + }, + { + "epoch": 2.1567417254476395, + "grad_norm": 5.652917861938477, + "learning_rate": 4.477730574208579e-06, + "loss": 0.2122, + "step": 41047 + }, + { + "epoch": 2.1567552902875744, + "grad_norm": 2.988293170928955, + "learning_rate": 4.477593531588324e-06, + "loss": 0.1162, + "step": 41048 + }, + { + "epoch": 2.1567688551275097, + "grad_norm": 5.645322799682617, + "learning_rate": 4.477456488968069e-06, + "loss": 0.2321, + "step": 41049 + }, + { + "epoch": 2.1567824199674446, + "grad_norm": 6.292129039764404, + "learning_rate": 4.4773194463478145e-06, + "loss": 0.23, + "step": 41050 + }, + { + "epoch": 2.1567959848073794, + "grad_norm": 5.926116466522217, + "learning_rate": 4.47718240372756e-06, + "loss": 0.2302, + "step": 41051 + }, + { + "epoch": 2.1568095496473143, + "grad_norm": 5.094311237335205, + "learning_rate": 4.477045361107305e-06, + "loss": 0.1739, + "step": 41052 + }, + { + "epoch": 2.156823114487249, + "grad_norm": 7.444231986999512, + "learning_rate": 4.47690831848705e-06, + "loss": 0.2873, + "step": 41053 + }, + { + "epoch": 2.156836679327184, + "grad_norm": 6.344518661499023, + "learning_rate": 4.476771275866794e-06, + "loss": 0.1634, + "step": 41054 + }, + { + "epoch": 2.156850244167119, + "grad_norm": 5.892440319061279, + "learning_rate": 4.47663423324654e-06, + "loss": 0.1755, + "step": 41055 + }, + { + "epoch": 2.1568638090070538, + "grad_norm": 7.283746719360352, + "learning_rate": 4.476497190626285e-06, + "loss": 0.2595, + "step": 41056 + }, + { + "epoch": 2.1568773738469886, + "grad_norm": 3.9940764904022217, + "learning_rate": 4.476360148006031e-06, + "loss": 0.1215, + "step": 41057 + }, + { + "epoch": 2.1568909386869235, + "grad_norm": 4.84168815612793, + "learning_rate": 4.476223105385775e-06, + "loss": 0.1746, + "step": 41058 + }, + { + "epoch": 2.1569045035268584, + "grad_norm": 6.943818092346191, + "learning_rate": 4.47608606276552e-06, + "loss": 0.2626, + "step": 41059 + }, + { + "epoch": 2.1569180683667932, + "grad_norm": 7.177072048187256, + "learning_rate": 4.4759490201452655e-06, + "loss": 0.2579, + "step": 41060 + }, + { + "epoch": 2.156931633206728, + "grad_norm": 7.3732404708862305, + "learning_rate": 4.475811977525011e-06, + "loss": 0.2859, + "step": 41061 + }, + { + "epoch": 2.156945198046663, + "grad_norm": 6.839149475097656, + "learning_rate": 4.475674934904756e-06, + "loss": 0.3125, + "step": 41062 + }, + { + "epoch": 2.156958762886598, + "grad_norm": 4.661604881286621, + "learning_rate": 4.475537892284501e-06, + "loss": 0.1748, + "step": 41063 + }, + { + "epoch": 2.1569723277265327, + "grad_norm": 7.049940586090088, + "learning_rate": 4.475400849664246e-06, + "loss": 0.3877, + "step": 41064 + }, + { + "epoch": 2.1569858925664676, + "grad_norm": 6.881350994110107, + "learning_rate": 4.4752638070439905e-06, + "loss": 0.2981, + "step": 41065 + }, + { + "epoch": 2.1569994574064024, + "grad_norm": 4.462848663330078, + "learning_rate": 4.4751267644237365e-06, + "loss": 0.1584, + "step": 41066 + }, + { + "epoch": 2.1570130222463373, + "grad_norm": 6.145812034606934, + "learning_rate": 4.474989721803481e-06, + "loss": 0.4147, + "step": 41067 + }, + { + "epoch": 2.157026587086272, + "grad_norm": 5.141354560852051, + "learning_rate": 4.474852679183226e-06, + "loss": 0.3132, + "step": 41068 + }, + { + "epoch": 2.1570401519262075, + "grad_norm": 5.09003210067749, + "learning_rate": 4.474715636562971e-06, + "loss": 0.2146, + "step": 41069 + }, + { + "epoch": 2.1570537167661423, + "grad_norm": 3.754879951477051, + "learning_rate": 4.474578593942716e-06, + "loss": 0.1384, + "step": 41070 + }, + { + "epoch": 2.157067281606077, + "grad_norm": 5.889959812164307, + "learning_rate": 4.4744415513224616e-06, + "loss": 0.2681, + "step": 41071 + }, + { + "epoch": 2.157080846446012, + "grad_norm": 3.8751187324523926, + "learning_rate": 4.474304508702207e-06, + "loss": 0.1274, + "step": 41072 + }, + { + "epoch": 2.157094411285947, + "grad_norm": 5.671302795410156, + "learning_rate": 4.474167466081952e-06, + "loss": 0.2721, + "step": 41073 + }, + { + "epoch": 2.157107976125882, + "grad_norm": 4.243295192718506, + "learning_rate": 4.474030423461697e-06, + "loss": 0.1688, + "step": 41074 + }, + { + "epoch": 2.1571215409658167, + "grad_norm": 4.343468189239502, + "learning_rate": 4.473893380841442e-06, + "loss": 0.1937, + "step": 41075 + }, + { + "epoch": 2.1571351058057515, + "grad_norm": 5.314908027648926, + "learning_rate": 4.4737563382211875e-06, + "loss": 0.3161, + "step": 41076 + }, + { + "epoch": 2.1571486706456864, + "grad_norm": 7.1359124183654785, + "learning_rate": 4.473619295600932e-06, + "loss": 0.2749, + "step": 41077 + }, + { + "epoch": 2.1571622354856212, + "grad_norm": 8.562540054321289, + "learning_rate": 4.473482252980678e-06, + "loss": 0.3628, + "step": 41078 + }, + { + "epoch": 2.157175800325556, + "grad_norm": 5.034858226776123, + "learning_rate": 4.473345210360422e-06, + "loss": 0.2501, + "step": 41079 + }, + { + "epoch": 2.157189365165491, + "grad_norm": 4.134374618530273, + "learning_rate": 4.473208167740167e-06, + "loss": 0.1828, + "step": 41080 + }, + { + "epoch": 2.157202930005426, + "grad_norm": 6.838516712188721, + "learning_rate": 4.4730711251199125e-06, + "loss": 0.3279, + "step": 41081 + }, + { + "epoch": 2.1572164948453607, + "grad_norm": 5.423255920410156, + "learning_rate": 4.472934082499658e-06, + "loss": 0.2295, + "step": 41082 + }, + { + "epoch": 2.1572300596852956, + "grad_norm": 6.623130798339844, + "learning_rate": 4.472797039879403e-06, + "loss": 0.2638, + "step": 41083 + }, + { + "epoch": 2.1572436245252304, + "grad_norm": 5.5354719161987305, + "learning_rate": 4.472659997259148e-06, + "loss": 0.1898, + "step": 41084 + }, + { + "epoch": 2.1572571893651653, + "grad_norm": 4.365422248840332, + "learning_rate": 4.472522954638893e-06, + "loss": 0.1873, + "step": 41085 + }, + { + "epoch": 2.1572707542051, + "grad_norm": 4.800962924957275, + "learning_rate": 4.472385912018638e-06, + "loss": 0.2493, + "step": 41086 + }, + { + "epoch": 2.1572843190450355, + "grad_norm": 5.900080680847168, + "learning_rate": 4.472248869398384e-06, + "loss": 0.2669, + "step": 41087 + }, + { + "epoch": 2.1572978838849703, + "grad_norm": 6.060678958892822, + "learning_rate": 4.472111826778128e-06, + "loss": 0.222, + "step": 41088 + }, + { + "epoch": 2.157311448724905, + "grad_norm": 6.477700233459473, + "learning_rate": 4.471974784157874e-06, + "loss": 0.3024, + "step": 41089 + }, + { + "epoch": 2.15732501356484, + "grad_norm": 6.332881927490234, + "learning_rate": 4.471837741537618e-06, + "loss": 0.2418, + "step": 41090 + }, + { + "epoch": 2.157338578404775, + "grad_norm": 4.998441219329834, + "learning_rate": 4.471700698917364e-06, + "loss": 0.1161, + "step": 41091 + }, + { + "epoch": 2.15735214324471, + "grad_norm": 5.427395343780518, + "learning_rate": 4.471563656297109e-06, + "loss": 0.1739, + "step": 41092 + }, + { + "epoch": 2.1573657080846447, + "grad_norm": 4.58737850189209, + "learning_rate": 4.471426613676854e-06, + "loss": 0.1835, + "step": 41093 + }, + { + "epoch": 2.1573792729245795, + "grad_norm": 4.926605701446533, + "learning_rate": 4.471289571056599e-06, + "loss": 0.2403, + "step": 41094 + }, + { + "epoch": 2.1573928377645144, + "grad_norm": 6.544801712036133, + "learning_rate": 4.471152528436344e-06, + "loss": 0.2801, + "step": 41095 + }, + { + "epoch": 2.1574064026044493, + "grad_norm": 5.494704723358154, + "learning_rate": 4.471015485816089e-06, + "loss": 0.1385, + "step": 41096 + }, + { + "epoch": 2.157419967444384, + "grad_norm": 4.882229328155518, + "learning_rate": 4.470878443195834e-06, + "loss": 0.1969, + "step": 41097 + }, + { + "epoch": 2.157433532284319, + "grad_norm": 5.621320724487305, + "learning_rate": 4.47074140057558e-06, + "loss": 0.2084, + "step": 41098 + }, + { + "epoch": 2.157447097124254, + "grad_norm": 4.390806674957275, + "learning_rate": 4.470604357955324e-06, + "loss": 0.1476, + "step": 41099 + }, + { + "epoch": 2.1574606619641887, + "grad_norm": 5.218752384185791, + "learning_rate": 4.47046731533507e-06, + "loss": 0.1987, + "step": 41100 + }, + { + "epoch": 2.1574742268041236, + "grad_norm": 4.998812198638916, + "learning_rate": 4.470330272714814e-06, + "loss": 0.1807, + "step": 41101 + }, + { + "epoch": 2.1574877916440585, + "grad_norm": 3.185328245162964, + "learning_rate": 4.47019323009456e-06, + "loss": 0.0843, + "step": 41102 + }, + { + "epoch": 2.1575013564839933, + "grad_norm": 6.250655651092529, + "learning_rate": 4.470056187474305e-06, + "loss": 0.2195, + "step": 41103 + }, + { + "epoch": 2.157514921323928, + "grad_norm": 5.3377766609191895, + "learning_rate": 4.46991914485405e-06, + "loss": 0.1768, + "step": 41104 + }, + { + "epoch": 2.157528486163863, + "grad_norm": 5.915896892547607, + "learning_rate": 4.469782102233795e-06, + "loss": 0.2324, + "step": 41105 + }, + { + "epoch": 2.157542051003798, + "grad_norm": 4.693198204040527, + "learning_rate": 4.46964505961354e-06, + "loss": 0.1475, + "step": 41106 + }, + { + "epoch": 2.1575556158437332, + "grad_norm": 4.557505130767822, + "learning_rate": 4.4695080169932855e-06, + "loss": 0.2077, + "step": 41107 + }, + { + "epoch": 2.157569180683668, + "grad_norm": 4.652606010437012, + "learning_rate": 4.46937097437303e-06, + "loss": 0.1077, + "step": 41108 + }, + { + "epoch": 2.157582745523603, + "grad_norm": 5.237208843231201, + "learning_rate": 4.469233931752776e-06, + "loss": 0.1691, + "step": 41109 + }, + { + "epoch": 2.157596310363538, + "grad_norm": 6.002910137176514, + "learning_rate": 4.46909688913252e-06, + "loss": 0.192, + "step": 41110 + }, + { + "epoch": 2.1576098752034727, + "grad_norm": 4.447221755981445, + "learning_rate": 4.468959846512266e-06, + "loss": 0.1208, + "step": 41111 + }, + { + "epoch": 2.1576234400434076, + "grad_norm": 5.330416202545166, + "learning_rate": 4.4688228038920105e-06, + "loss": 0.2584, + "step": 41112 + }, + { + "epoch": 2.1576370048833424, + "grad_norm": 5.305457592010498, + "learning_rate": 4.468685761271756e-06, + "loss": 0.1506, + "step": 41113 + }, + { + "epoch": 2.1576505697232773, + "grad_norm": 5.341257095336914, + "learning_rate": 4.468548718651501e-06, + "loss": 0.151, + "step": 41114 + }, + { + "epoch": 2.157664134563212, + "grad_norm": 4.380578517913818, + "learning_rate": 4.468411676031246e-06, + "loss": 0.1722, + "step": 41115 + }, + { + "epoch": 2.157677699403147, + "grad_norm": 4.345613479614258, + "learning_rate": 4.468274633410991e-06, + "loss": 0.0984, + "step": 41116 + }, + { + "epoch": 2.157691264243082, + "grad_norm": 4.778507709503174, + "learning_rate": 4.468137590790736e-06, + "loss": 0.1766, + "step": 41117 + }, + { + "epoch": 2.1577048290830168, + "grad_norm": 5.094076633453369, + "learning_rate": 4.468000548170482e-06, + "loss": 0.2539, + "step": 41118 + }, + { + "epoch": 2.1577183939229516, + "grad_norm": 5.511086463928223, + "learning_rate": 4.467863505550227e-06, + "loss": 0.1598, + "step": 41119 + }, + { + "epoch": 2.1577319587628865, + "grad_norm": 5.344112873077393, + "learning_rate": 4.467726462929972e-06, + "loss": 0.1722, + "step": 41120 + }, + { + "epoch": 2.1577455236028213, + "grad_norm": 5.197447776794434, + "learning_rate": 4.467589420309716e-06, + "loss": 0.2066, + "step": 41121 + }, + { + "epoch": 2.157759088442756, + "grad_norm": 5.644783020019531, + "learning_rate": 4.4674523776894615e-06, + "loss": 0.1777, + "step": 41122 + }, + { + "epoch": 2.157772653282691, + "grad_norm": 4.829768657684326, + "learning_rate": 4.467315335069207e-06, + "loss": 0.1072, + "step": 41123 + }, + { + "epoch": 2.1577862181226264, + "grad_norm": 5.284116268157959, + "learning_rate": 4.467178292448952e-06, + "loss": 0.1355, + "step": 41124 + }, + { + "epoch": 2.1577997829625613, + "grad_norm": 3.8164520263671875, + "learning_rate": 4.467041249828697e-06, + "loss": 0.1934, + "step": 41125 + }, + { + "epoch": 2.157813347802496, + "grad_norm": 3.867581844329834, + "learning_rate": 4.466904207208442e-06, + "loss": 0.1314, + "step": 41126 + }, + { + "epoch": 2.157826912642431, + "grad_norm": 9.874322891235352, + "learning_rate": 4.466767164588187e-06, + "loss": 0.1358, + "step": 41127 + }, + { + "epoch": 2.157840477482366, + "grad_norm": 4.645849227905273, + "learning_rate": 4.4666301219679325e-06, + "loss": 0.1327, + "step": 41128 + }, + { + "epoch": 2.1578540423223007, + "grad_norm": 4.9355244636535645, + "learning_rate": 4.466493079347678e-06, + "loss": 0.1494, + "step": 41129 + }, + { + "epoch": 2.1578676071622356, + "grad_norm": 5.58338737487793, + "learning_rate": 4.466356036727423e-06, + "loss": 0.1736, + "step": 41130 + }, + { + "epoch": 2.1578811720021704, + "grad_norm": 4.152385711669922, + "learning_rate": 4.466218994107167e-06, + "loss": 0.1649, + "step": 41131 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 7.212421417236328, + "learning_rate": 4.466081951486913e-06, + "loss": 0.3419, + "step": 41132 + }, + { + "epoch": 2.15790830168204, + "grad_norm": 4.9985785484313965, + "learning_rate": 4.465944908866658e-06, + "loss": 0.2958, + "step": 41133 + }, + { + "epoch": 2.157921866521975, + "grad_norm": 4.277908802032471, + "learning_rate": 4.465807866246403e-06, + "loss": 0.1576, + "step": 41134 + }, + { + "epoch": 2.15793543136191, + "grad_norm": 4.059985637664795, + "learning_rate": 4.465670823626148e-06, + "loss": 0.1956, + "step": 41135 + }, + { + "epoch": 2.1579489962018448, + "grad_norm": 5.761254787445068, + "learning_rate": 4.465533781005893e-06, + "loss": 0.3359, + "step": 41136 + }, + { + "epoch": 2.1579625610417796, + "grad_norm": 6.404738903045654, + "learning_rate": 4.465396738385638e-06, + "loss": 0.252, + "step": 41137 + }, + { + "epoch": 2.1579761258817145, + "grad_norm": 6.114339351654053, + "learning_rate": 4.4652596957653835e-06, + "loss": 0.2398, + "step": 41138 + }, + { + "epoch": 2.1579896907216494, + "grad_norm": 3.1372311115264893, + "learning_rate": 4.465122653145129e-06, + "loss": 0.0959, + "step": 41139 + }, + { + "epoch": 2.1580032555615842, + "grad_norm": 3.4062063694000244, + "learning_rate": 4.464985610524874e-06, + "loss": 0.1526, + "step": 41140 + }, + { + "epoch": 2.158016820401519, + "grad_norm": 5.823564529418945, + "learning_rate": 4.464848567904619e-06, + "loss": 0.147, + "step": 41141 + }, + { + "epoch": 2.158030385241454, + "grad_norm": 5.450862884521484, + "learning_rate": 4.464711525284363e-06, + "loss": 0.291, + "step": 41142 + }, + { + "epoch": 2.158043950081389, + "grad_norm": 6.973106384277344, + "learning_rate": 4.464574482664109e-06, + "loss": 0.1909, + "step": 41143 + }, + { + "epoch": 2.1580575149213237, + "grad_norm": 4.4627532958984375, + "learning_rate": 4.464437440043854e-06, + "loss": 0.1285, + "step": 41144 + }, + { + "epoch": 2.158071079761259, + "grad_norm": 5.6948466300964355, + "learning_rate": 4.4643003974236e-06, + "loss": 0.2528, + "step": 41145 + }, + { + "epoch": 2.158084644601194, + "grad_norm": 5.173347473144531, + "learning_rate": 4.464163354803344e-06, + "loss": 0.2731, + "step": 41146 + }, + { + "epoch": 2.1580982094411287, + "grad_norm": 7.2032060623168945, + "learning_rate": 4.464026312183089e-06, + "loss": 0.2335, + "step": 41147 + }, + { + "epoch": 2.1581117742810636, + "grad_norm": 5.202299118041992, + "learning_rate": 4.463889269562834e-06, + "loss": 0.1865, + "step": 41148 + }, + { + "epoch": 2.1581253391209985, + "grad_norm": 4.837772369384766, + "learning_rate": 4.46375222694258e-06, + "loss": 0.2072, + "step": 41149 + }, + { + "epoch": 2.1581389039609333, + "grad_norm": 4.606998443603516, + "learning_rate": 4.463615184322325e-06, + "loss": 0.2439, + "step": 41150 + }, + { + "epoch": 2.158152468800868, + "grad_norm": 5.724467754364014, + "learning_rate": 4.463478141702069e-06, + "loss": 0.3102, + "step": 41151 + }, + { + "epoch": 2.158166033640803, + "grad_norm": 5.34172248840332, + "learning_rate": 4.463341099081815e-06, + "loss": 0.2109, + "step": 41152 + }, + { + "epoch": 2.158179598480738, + "grad_norm": 5.337181568145752, + "learning_rate": 4.4632040564615595e-06, + "loss": 0.3335, + "step": 41153 + }, + { + "epoch": 2.158193163320673, + "grad_norm": 4.065188884735107, + "learning_rate": 4.4630670138413055e-06, + "loss": 0.1697, + "step": 41154 + }, + { + "epoch": 2.1582067281606077, + "grad_norm": 8.0499267578125, + "learning_rate": 4.46292997122105e-06, + "loss": 0.1778, + "step": 41155 + }, + { + "epoch": 2.1582202930005425, + "grad_norm": 5.548284530639648, + "learning_rate": 4.462792928600795e-06, + "loss": 0.2137, + "step": 41156 + }, + { + "epoch": 2.1582338578404774, + "grad_norm": 6.881826400756836, + "learning_rate": 4.46265588598054e-06, + "loss": 0.1815, + "step": 41157 + }, + { + "epoch": 2.1582474226804123, + "grad_norm": 7.220543384552002, + "learning_rate": 4.462518843360285e-06, + "loss": 0.301, + "step": 41158 + }, + { + "epoch": 2.158260987520347, + "grad_norm": 4.197717189788818, + "learning_rate": 4.4623818007400305e-06, + "loss": 0.1459, + "step": 41159 + }, + { + "epoch": 2.158274552360282, + "grad_norm": 5.030679702758789, + "learning_rate": 4.462244758119776e-06, + "loss": 0.1721, + "step": 41160 + }, + { + "epoch": 2.158288117200217, + "grad_norm": 5.356434345245361, + "learning_rate": 4.462107715499521e-06, + "loss": 0.2107, + "step": 41161 + }, + { + "epoch": 2.158301682040152, + "grad_norm": 4.39544677734375, + "learning_rate": 4.461970672879265e-06, + "loss": 0.1414, + "step": 41162 + }, + { + "epoch": 2.158315246880087, + "grad_norm": 7.333992004394531, + "learning_rate": 4.461833630259011e-06, + "loss": 0.2882, + "step": 41163 + }, + { + "epoch": 2.158328811720022, + "grad_norm": 7.698757171630859, + "learning_rate": 4.461696587638756e-06, + "loss": 0.3009, + "step": 41164 + }, + { + "epoch": 2.1583423765599568, + "grad_norm": 4.270485877990723, + "learning_rate": 4.461559545018502e-06, + "loss": 0.1345, + "step": 41165 + }, + { + "epoch": 2.1583559413998916, + "grad_norm": 4.597474575042725, + "learning_rate": 4.461422502398246e-06, + "loss": 0.1476, + "step": 41166 + }, + { + "epoch": 2.1583695062398265, + "grad_norm": 6.221900463104248, + "learning_rate": 4.461285459777991e-06, + "loss": 0.2513, + "step": 41167 + }, + { + "epoch": 2.1583830710797614, + "grad_norm": 4.1609601974487305, + "learning_rate": 4.461148417157736e-06, + "loss": 0.1694, + "step": 41168 + }, + { + "epoch": 2.158396635919696, + "grad_norm": 6.827190399169922, + "learning_rate": 4.4610113745374815e-06, + "loss": 0.234, + "step": 41169 + }, + { + "epoch": 2.158410200759631, + "grad_norm": 5.337733268737793, + "learning_rate": 4.460874331917227e-06, + "loss": 0.1896, + "step": 41170 + }, + { + "epoch": 2.158423765599566, + "grad_norm": 4.482297420501709, + "learning_rate": 4.460737289296972e-06, + "loss": 0.1716, + "step": 41171 + }, + { + "epoch": 2.158437330439501, + "grad_norm": 5.248265266418457, + "learning_rate": 4.460600246676717e-06, + "loss": 0.1911, + "step": 41172 + }, + { + "epoch": 2.1584508952794357, + "grad_norm": 5.353996276855469, + "learning_rate": 4.460463204056462e-06, + "loss": 0.1806, + "step": 41173 + }, + { + "epoch": 2.1584644601193705, + "grad_norm": 5.22683048248291, + "learning_rate": 4.460326161436207e-06, + "loss": 0.2734, + "step": 41174 + }, + { + "epoch": 2.1584780249593054, + "grad_norm": 4.27957820892334, + "learning_rate": 4.4601891188159526e-06, + "loss": 0.1826, + "step": 41175 + }, + { + "epoch": 2.1584915897992403, + "grad_norm": 4.236865997314453, + "learning_rate": 4.460052076195697e-06, + "loss": 0.202, + "step": 41176 + }, + { + "epoch": 2.158505154639175, + "grad_norm": 6.0918684005737305, + "learning_rate": 4.459915033575442e-06, + "loss": 0.2057, + "step": 41177 + }, + { + "epoch": 2.15851871947911, + "grad_norm": 5.330113887786865, + "learning_rate": 4.459777990955187e-06, + "loss": 0.2295, + "step": 41178 + }, + { + "epoch": 2.158532284319045, + "grad_norm": 7.273946762084961, + "learning_rate": 4.4596409483349324e-06, + "loss": 0.2339, + "step": 41179 + }, + { + "epoch": 2.1585458491589797, + "grad_norm": 6.260036945343018, + "learning_rate": 4.459503905714678e-06, + "loss": 0.2536, + "step": 41180 + }, + { + "epoch": 2.1585594139989146, + "grad_norm": 4.861570358276367, + "learning_rate": 4.459366863094423e-06, + "loss": 0.2205, + "step": 41181 + }, + { + "epoch": 2.15857297883885, + "grad_norm": 5.401710510253906, + "learning_rate": 4.459229820474168e-06, + "loss": 0.2174, + "step": 41182 + }, + { + "epoch": 2.1585865436787848, + "grad_norm": 4.573204040527344, + "learning_rate": 4.459092777853913e-06, + "loss": 0.1611, + "step": 41183 + }, + { + "epoch": 2.1586001085187196, + "grad_norm": 5.462078094482422, + "learning_rate": 4.458955735233658e-06, + "loss": 0.1484, + "step": 41184 + }, + { + "epoch": 2.1586136733586545, + "grad_norm": 4.823049068450928, + "learning_rate": 4.458818692613403e-06, + "loss": 0.2304, + "step": 41185 + }, + { + "epoch": 2.1586272381985894, + "grad_norm": 4.420257091522217, + "learning_rate": 4.458681649993149e-06, + "loss": 0.2276, + "step": 41186 + }, + { + "epoch": 2.1586408030385242, + "grad_norm": 5.771199703216553, + "learning_rate": 4.458544607372893e-06, + "loss": 0.2452, + "step": 41187 + }, + { + "epoch": 2.158654367878459, + "grad_norm": 5.834940433502197, + "learning_rate": 4.458407564752639e-06, + "loss": 0.1645, + "step": 41188 + }, + { + "epoch": 2.158667932718394, + "grad_norm": 7.290187835693359, + "learning_rate": 4.458270522132383e-06, + "loss": 0.2267, + "step": 41189 + }, + { + "epoch": 2.158681497558329, + "grad_norm": 4.69642448425293, + "learning_rate": 4.4581334795121285e-06, + "loss": 0.2277, + "step": 41190 + }, + { + "epoch": 2.1586950623982637, + "grad_norm": 4.471474647521973, + "learning_rate": 4.457996436891874e-06, + "loss": 0.1489, + "step": 41191 + }, + { + "epoch": 2.1587086272381986, + "grad_norm": 5.126958847045898, + "learning_rate": 4.457859394271619e-06, + "loss": 0.186, + "step": 41192 + }, + { + "epoch": 2.1587221920781334, + "grad_norm": 3.198701858520508, + "learning_rate": 4.457722351651364e-06, + "loss": 0.1094, + "step": 41193 + }, + { + "epoch": 2.1587357569180683, + "grad_norm": 4.869777679443359, + "learning_rate": 4.457585309031109e-06, + "loss": 0.2055, + "step": 41194 + }, + { + "epoch": 2.158749321758003, + "grad_norm": 6.479808330535889, + "learning_rate": 4.4574482664108544e-06, + "loss": 0.2847, + "step": 41195 + }, + { + "epoch": 2.158762886597938, + "grad_norm": 4.3134026527404785, + "learning_rate": 4.457311223790599e-06, + "loss": 0.1939, + "step": 41196 + }, + { + "epoch": 2.158776451437873, + "grad_norm": 4.597324848175049, + "learning_rate": 4.457174181170345e-06, + "loss": 0.1621, + "step": 41197 + }, + { + "epoch": 2.1587900162778078, + "grad_norm": 7.195801734924316, + "learning_rate": 4.457037138550089e-06, + "loss": 0.4079, + "step": 41198 + }, + { + "epoch": 2.1588035811177426, + "grad_norm": 4.515115737915039, + "learning_rate": 4.456900095929835e-06, + "loss": 0.2087, + "step": 41199 + }, + { + "epoch": 2.158817145957678, + "grad_norm": 6.542538166046143, + "learning_rate": 4.4567630533095795e-06, + "loss": 0.3629, + "step": 41200 + }, + { + "epoch": 2.158830710797613, + "grad_norm": 5.433493137359619, + "learning_rate": 4.456626010689325e-06, + "loss": 0.1965, + "step": 41201 + }, + { + "epoch": 2.1588442756375477, + "grad_norm": 4.724492073059082, + "learning_rate": 4.45648896806907e-06, + "loss": 0.174, + "step": 41202 + }, + { + "epoch": 2.1588578404774825, + "grad_norm": 4.122244834899902, + "learning_rate": 4.456351925448815e-06, + "loss": 0.2461, + "step": 41203 + }, + { + "epoch": 2.1588714053174174, + "grad_norm": 5.701219081878662, + "learning_rate": 4.45621488282856e-06, + "loss": 0.2741, + "step": 41204 + }, + { + "epoch": 2.1588849701573523, + "grad_norm": 6.606241226196289, + "learning_rate": 4.4560778402083045e-06, + "loss": 0.272, + "step": 41205 + }, + { + "epoch": 2.158898534997287, + "grad_norm": 4.958266258239746, + "learning_rate": 4.4559407975880506e-06, + "loss": 0.1895, + "step": 41206 + }, + { + "epoch": 2.158912099837222, + "grad_norm": 5.527904510498047, + "learning_rate": 4.455803754967795e-06, + "loss": 0.1868, + "step": 41207 + }, + { + "epoch": 2.158925664677157, + "grad_norm": 5.252099514007568, + "learning_rate": 4.455666712347541e-06, + "loss": 0.3028, + "step": 41208 + }, + { + "epoch": 2.1589392295170917, + "grad_norm": 3.0893337726593018, + "learning_rate": 4.455529669727285e-06, + "loss": 0.1048, + "step": 41209 + }, + { + "epoch": 2.1589527943570266, + "grad_norm": 4.960315704345703, + "learning_rate": 4.4553926271070304e-06, + "loss": 0.1757, + "step": 41210 + }, + { + "epoch": 2.1589663591969614, + "grad_norm": 4.874326705932617, + "learning_rate": 4.455255584486776e-06, + "loss": 0.2135, + "step": 41211 + }, + { + "epoch": 2.1589799240368963, + "grad_norm": 5.3160400390625, + "learning_rate": 4.455118541866521e-06, + "loss": 0.2172, + "step": 41212 + }, + { + "epoch": 2.158993488876831, + "grad_norm": 4.163520812988281, + "learning_rate": 4.454981499246266e-06, + "loss": 0.1654, + "step": 41213 + }, + { + "epoch": 2.159007053716766, + "grad_norm": 3.9878127574920654, + "learning_rate": 4.454844456626011e-06, + "loss": 0.1825, + "step": 41214 + }, + { + "epoch": 2.159020618556701, + "grad_norm": 4.5795207023620605, + "learning_rate": 4.454707414005756e-06, + "loss": 0.2152, + "step": 41215 + }, + { + "epoch": 2.1590341833966358, + "grad_norm": 5.7784647941589355, + "learning_rate": 4.4545703713855015e-06, + "loss": 0.219, + "step": 41216 + }, + { + "epoch": 2.1590477482365706, + "grad_norm": 4.821557521820068, + "learning_rate": 4.454433328765247e-06, + "loss": 0.1895, + "step": 41217 + }, + { + "epoch": 2.1590613130765055, + "grad_norm": 6.092419624328613, + "learning_rate": 4.454296286144991e-06, + "loss": 0.1754, + "step": 41218 + }, + { + "epoch": 2.1590748779164404, + "grad_norm": 5.0022292137146, + "learning_rate": 4.454159243524736e-06, + "loss": 0.1326, + "step": 41219 + }, + { + "epoch": 2.1590884427563757, + "grad_norm": 5.583680629730225, + "learning_rate": 4.454022200904481e-06, + "loss": 0.2658, + "step": 41220 + }, + { + "epoch": 2.1591020075963105, + "grad_norm": 5.0069122314453125, + "learning_rate": 4.4538851582842266e-06, + "loss": 0.1403, + "step": 41221 + }, + { + "epoch": 2.1591155724362454, + "grad_norm": 5.70668888092041, + "learning_rate": 4.453748115663972e-06, + "loss": 0.2201, + "step": 41222 + }, + { + "epoch": 2.1591291372761803, + "grad_norm": 3.9676523208618164, + "learning_rate": 4.453611073043717e-06, + "loss": 0.1461, + "step": 41223 + }, + { + "epoch": 2.159142702116115, + "grad_norm": 4.0544209480285645, + "learning_rate": 4.453474030423462e-06, + "loss": 0.2098, + "step": 41224 + }, + { + "epoch": 2.15915626695605, + "grad_norm": 4.961164474487305, + "learning_rate": 4.453336987803207e-06, + "loss": 0.1403, + "step": 41225 + }, + { + "epoch": 2.159169831795985, + "grad_norm": 4.0964555740356445, + "learning_rate": 4.4531999451829524e-06, + "loss": 0.1857, + "step": 41226 + }, + { + "epoch": 2.1591833966359197, + "grad_norm": 4.016386032104492, + "learning_rate": 4.453062902562698e-06, + "loss": 0.1913, + "step": 41227 + }, + { + "epoch": 2.1591969614758546, + "grad_norm": 5.450855255126953, + "learning_rate": 4.452925859942443e-06, + "loss": 0.2116, + "step": 41228 + }, + { + "epoch": 2.1592105263157895, + "grad_norm": 4.872220039367676, + "learning_rate": 4.452788817322188e-06, + "loss": 0.1009, + "step": 41229 + }, + { + "epoch": 2.1592240911557243, + "grad_norm": 5.009808540344238, + "learning_rate": 4.452651774701932e-06, + "loss": 0.1562, + "step": 41230 + }, + { + "epoch": 2.159237655995659, + "grad_norm": 5.719924449920654, + "learning_rate": 4.4525147320816775e-06, + "loss": 0.2797, + "step": 41231 + }, + { + "epoch": 2.159251220835594, + "grad_norm": 2.8741817474365234, + "learning_rate": 4.452377689461423e-06, + "loss": 0.0898, + "step": 41232 + }, + { + "epoch": 2.159264785675529, + "grad_norm": 3.905703544616699, + "learning_rate": 4.452240646841168e-06, + "loss": 0.1754, + "step": 41233 + }, + { + "epoch": 2.159278350515464, + "grad_norm": 4.394240856170654, + "learning_rate": 4.452103604220913e-06, + "loss": 0.2174, + "step": 41234 + }, + { + "epoch": 2.1592919153553987, + "grad_norm": 6.4619269371032715, + "learning_rate": 4.451966561600658e-06, + "loss": 0.2839, + "step": 41235 + }, + { + "epoch": 2.1593054801953335, + "grad_norm": 4.150162220001221, + "learning_rate": 4.451829518980403e-06, + "loss": 0.1637, + "step": 41236 + }, + { + "epoch": 2.1593190450352684, + "grad_norm": 5.832090377807617, + "learning_rate": 4.4516924763601486e-06, + "loss": 0.1648, + "step": 41237 + }, + { + "epoch": 2.1593326098752037, + "grad_norm": 4.193595886230469, + "learning_rate": 4.451555433739894e-06, + "loss": 0.187, + "step": 41238 + }, + { + "epoch": 2.1593461747151386, + "grad_norm": 5.0837626457214355, + "learning_rate": 4.451418391119638e-06, + "loss": 0.1896, + "step": 41239 + }, + { + "epoch": 2.1593597395550734, + "grad_norm": 6.245334148406982, + "learning_rate": 4.451281348499384e-06, + "loss": 0.1742, + "step": 41240 + }, + { + "epoch": 2.1593733043950083, + "grad_norm": 4.705423355102539, + "learning_rate": 4.4511443058791284e-06, + "loss": 0.1753, + "step": 41241 + }, + { + "epoch": 2.159386869234943, + "grad_norm": 2.7025327682495117, + "learning_rate": 4.4510072632588745e-06, + "loss": 0.0778, + "step": 41242 + }, + { + "epoch": 2.159400434074878, + "grad_norm": 3.048938274383545, + "learning_rate": 4.450870220638619e-06, + "loss": 0.1112, + "step": 41243 + }, + { + "epoch": 2.159413998914813, + "grad_norm": 5.323051929473877, + "learning_rate": 4.450733178018364e-06, + "loss": 0.1812, + "step": 41244 + }, + { + "epoch": 2.1594275637547478, + "grad_norm": 5.224270343780518, + "learning_rate": 4.450596135398109e-06, + "loss": 0.2189, + "step": 41245 + }, + { + "epoch": 2.1594411285946826, + "grad_norm": 4.969074249267578, + "learning_rate": 4.450459092777854e-06, + "loss": 0.1225, + "step": 41246 + }, + { + "epoch": 2.1594546934346175, + "grad_norm": 4.862229824066162, + "learning_rate": 4.4503220501575995e-06, + "loss": 0.1588, + "step": 41247 + }, + { + "epoch": 2.1594682582745524, + "grad_norm": 5.0263495445251465, + "learning_rate": 4.450185007537345e-06, + "loss": 0.2219, + "step": 41248 + }, + { + "epoch": 2.159481823114487, + "grad_norm": 4.273359775543213, + "learning_rate": 4.45004796491709e-06, + "loss": 0.1999, + "step": 41249 + }, + { + "epoch": 2.159495387954422, + "grad_norm": 4.697969436645508, + "learning_rate": 4.449910922296834e-06, + "loss": 0.1551, + "step": 41250 + }, + { + "epoch": 2.159508952794357, + "grad_norm": 4.659139156341553, + "learning_rate": 4.44977387967658e-06, + "loss": 0.1061, + "step": 41251 + }, + { + "epoch": 2.159522517634292, + "grad_norm": 5.267279624938965, + "learning_rate": 4.4496368370563246e-06, + "loss": 0.2279, + "step": 41252 + }, + { + "epoch": 2.1595360824742267, + "grad_norm": 4.0040693283081055, + "learning_rate": 4.449499794436071e-06, + "loss": 0.1917, + "step": 41253 + }, + { + "epoch": 2.1595496473141615, + "grad_norm": 4.130964279174805, + "learning_rate": 4.449362751815815e-06, + "loss": 0.176, + "step": 41254 + }, + { + "epoch": 2.1595632121540964, + "grad_norm": 5.1834397315979, + "learning_rate": 4.44922570919556e-06, + "loss": 0.2788, + "step": 41255 + }, + { + "epoch": 2.1595767769940313, + "grad_norm": 5.29571533203125, + "learning_rate": 4.449088666575305e-06, + "loss": 0.1889, + "step": 41256 + }, + { + "epoch": 2.159590341833966, + "grad_norm": 6.3516526222229, + "learning_rate": 4.4489516239550504e-06, + "loss": 0.2114, + "step": 41257 + }, + { + "epoch": 2.1596039066739015, + "grad_norm": 4.539741516113281, + "learning_rate": 4.448814581334796e-06, + "loss": 0.1148, + "step": 41258 + }, + { + "epoch": 2.1596174715138363, + "grad_norm": 3.934581995010376, + "learning_rate": 4.44867753871454e-06, + "loss": 0.131, + "step": 41259 + }, + { + "epoch": 2.159631036353771, + "grad_norm": 4.390554904937744, + "learning_rate": 4.448540496094286e-06, + "loss": 0.1873, + "step": 41260 + }, + { + "epoch": 2.159644601193706, + "grad_norm": 6.222916126251221, + "learning_rate": 4.44840345347403e-06, + "loss": 0.2157, + "step": 41261 + }, + { + "epoch": 2.159658166033641, + "grad_norm": 3.1740550994873047, + "learning_rate": 4.448266410853776e-06, + "loss": 0.1518, + "step": 41262 + }, + { + "epoch": 2.159671730873576, + "grad_norm": 4.9518046379089355, + "learning_rate": 4.448129368233521e-06, + "loss": 0.1651, + "step": 41263 + }, + { + "epoch": 2.1596852957135106, + "grad_norm": 3.060443878173828, + "learning_rate": 4.447992325613266e-06, + "loss": 0.0976, + "step": 41264 + }, + { + "epoch": 2.1596988605534455, + "grad_norm": 4.278066635131836, + "learning_rate": 4.447855282993011e-06, + "loss": 0.1418, + "step": 41265 + }, + { + "epoch": 2.1597124253933804, + "grad_norm": 5.737680435180664, + "learning_rate": 4.447718240372756e-06, + "loss": 0.1681, + "step": 41266 + }, + { + "epoch": 2.1597259902333152, + "grad_norm": 5.9434919357299805, + "learning_rate": 4.447581197752501e-06, + "loss": 0.1772, + "step": 41267 + }, + { + "epoch": 2.15973955507325, + "grad_norm": 4.098862171173096, + "learning_rate": 4.4474441551322466e-06, + "loss": 0.0858, + "step": 41268 + }, + { + "epoch": 2.159753119913185, + "grad_norm": 6.327176570892334, + "learning_rate": 4.447307112511992e-06, + "loss": 0.2015, + "step": 41269 + }, + { + "epoch": 2.15976668475312, + "grad_norm": 4.046811103820801, + "learning_rate": 4.447170069891737e-06, + "loss": 0.1405, + "step": 41270 + }, + { + "epoch": 2.1597802495930547, + "grad_norm": 4.144856929779053, + "learning_rate": 4.447033027271482e-06, + "loss": 0.1867, + "step": 41271 + }, + { + "epoch": 2.1597938144329896, + "grad_norm": 5.4857096672058105, + "learning_rate": 4.4468959846512264e-06, + "loss": 0.1234, + "step": 41272 + }, + { + "epoch": 2.1598073792729244, + "grad_norm": 3.05787992477417, + "learning_rate": 4.446758942030972e-06, + "loss": 0.0724, + "step": 41273 + }, + { + "epoch": 2.1598209441128593, + "grad_norm": 4.897576332092285, + "learning_rate": 4.446621899410717e-06, + "loss": 0.1179, + "step": 41274 + }, + { + "epoch": 2.159834508952794, + "grad_norm": 3.5347063541412354, + "learning_rate": 4.446484856790462e-06, + "loss": 0.1332, + "step": 41275 + }, + { + "epoch": 2.1598480737927295, + "grad_norm": 4.568532466888428, + "learning_rate": 4.446347814170207e-06, + "loss": 0.1615, + "step": 41276 + }, + { + "epoch": 2.1598616386326643, + "grad_norm": 5.428237438201904, + "learning_rate": 4.446210771549952e-06, + "loss": 0.2295, + "step": 41277 + }, + { + "epoch": 2.159875203472599, + "grad_norm": 4.810482501983643, + "learning_rate": 4.4460737289296975e-06, + "loss": 0.1804, + "step": 41278 + }, + { + "epoch": 2.159888768312534, + "grad_norm": 4.57196044921875, + "learning_rate": 4.445936686309443e-06, + "loss": 0.2074, + "step": 41279 + }, + { + "epoch": 2.159902333152469, + "grad_norm": 4.97979211807251, + "learning_rate": 4.445799643689188e-06, + "loss": 0.1703, + "step": 41280 + }, + { + "epoch": 2.159915897992404, + "grad_norm": 5.02079963684082, + "learning_rate": 4.445662601068933e-06, + "loss": 0.2, + "step": 41281 + }, + { + "epoch": 2.1599294628323387, + "grad_norm": 4.40429162979126, + "learning_rate": 4.445525558448678e-06, + "loss": 0.1593, + "step": 41282 + }, + { + "epoch": 2.1599430276722735, + "grad_norm": 4.3769378662109375, + "learning_rate": 4.445388515828423e-06, + "loss": 0.1593, + "step": 41283 + }, + { + "epoch": 2.1599565925122084, + "grad_norm": 4.2441086769104, + "learning_rate": 4.445251473208168e-06, + "loss": 0.1511, + "step": 41284 + }, + { + "epoch": 2.1599701573521433, + "grad_norm": 4.318920612335205, + "learning_rate": 4.445114430587914e-06, + "loss": 0.1804, + "step": 41285 + }, + { + "epoch": 2.159983722192078, + "grad_norm": 5.777609348297119, + "learning_rate": 4.444977387967658e-06, + "loss": 0.1409, + "step": 41286 + }, + { + "epoch": 2.159997287032013, + "grad_norm": 6.502219200134277, + "learning_rate": 4.444840345347403e-06, + "loss": 0.2223, + "step": 41287 + }, + { + "epoch": 2.160010851871948, + "grad_norm": 3.6496689319610596, + "learning_rate": 4.4447033027271485e-06, + "loss": 0.1642, + "step": 41288 + }, + { + "epoch": 2.1600244167118827, + "grad_norm": 4.3085198402404785, + "learning_rate": 4.444566260106894e-06, + "loss": 0.1894, + "step": 41289 + }, + { + "epoch": 2.1600379815518176, + "grad_norm": 6.0210113525390625, + "learning_rate": 4.444429217486639e-06, + "loss": 0.2023, + "step": 41290 + }, + { + "epoch": 2.1600515463917525, + "grad_norm": 3.5676989555358887, + "learning_rate": 4.444292174866384e-06, + "loss": 0.1058, + "step": 41291 + }, + { + "epoch": 2.1600651112316873, + "grad_norm": 5.472278594970703, + "learning_rate": 4.444155132246129e-06, + "loss": 0.1841, + "step": 41292 + }, + { + "epoch": 2.160078676071622, + "grad_norm": 5.785240650177002, + "learning_rate": 4.4440180896258735e-06, + "loss": 0.2365, + "step": 41293 + }, + { + "epoch": 2.160092240911557, + "grad_norm": 5.586806297302246, + "learning_rate": 4.4438810470056195e-06, + "loss": 0.1993, + "step": 41294 + }, + { + "epoch": 2.160105805751492, + "grad_norm": 7.221579551696777, + "learning_rate": 4.443744004385364e-06, + "loss": 0.2252, + "step": 41295 + }, + { + "epoch": 2.1601193705914272, + "grad_norm": 5.847563743591309, + "learning_rate": 4.44360696176511e-06, + "loss": 0.3102, + "step": 41296 + }, + { + "epoch": 2.160132935431362, + "grad_norm": 7.294893741607666, + "learning_rate": 4.443469919144854e-06, + "loss": 0.2907, + "step": 41297 + }, + { + "epoch": 2.160146500271297, + "grad_norm": 5.123537063598633, + "learning_rate": 4.443332876524599e-06, + "loss": 0.2336, + "step": 41298 + }, + { + "epoch": 2.160160065111232, + "grad_norm": 5.078097343444824, + "learning_rate": 4.4431958339043446e-06, + "loss": 0.1926, + "step": 41299 + }, + { + "epoch": 2.1601736299511667, + "grad_norm": 4.19133996963501, + "learning_rate": 4.44305879128409e-06, + "loss": 0.1619, + "step": 41300 + }, + { + "epoch": 2.1601871947911016, + "grad_norm": 5.2291178703308105, + "learning_rate": 4.442921748663835e-06, + "loss": 0.1843, + "step": 41301 + }, + { + "epoch": 2.1602007596310364, + "grad_norm": 3.2928788661956787, + "learning_rate": 4.44278470604358e-06, + "loss": 0.1271, + "step": 41302 + }, + { + "epoch": 2.1602143244709713, + "grad_norm": 5.511237621307373, + "learning_rate": 4.442647663423325e-06, + "loss": 0.1902, + "step": 41303 + }, + { + "epoch": 2.160227889310906, + "grad_norm": 5.185102939605713, + "learning_rate": 4.44251062080307e-06, + "loss": 0.1911, + "step": 41304 + }, + { + "epoch": 2.160241454150841, + "grad_norm": 4.607919692993164, + "learning_rate": 4.442373578182816e-06, + "loss": 0.1534, + "step": 41305 + }, + { + "epoch": 2.160255018990776, + "grad_norm": 4.833334445953369, + "learning_rate": 4.44223653556256e-06, + "loss": 0.2595, + "step": 41306 + }, + { + "epoch": 2.1602685838307107, + "grad_norm": 4.237411022186279, + "learning_rate": 4.442099492942305e-06, + "loss": 0.1762, + "step": 41307 + }, + { + "epoch": 2.1602821486706456, + "grad_norm": 4.437416076660156, + "learning_rate": 4.44196245032205e-06, + "loss": 0.1911, + "step": 41308 + }, + { + "epoch": 2.1602957135105805, + "grad_norm": 5.022923469543457, + "learning_rate": 4.4418254077017955e-06, + "loss": 0.252, + "step": 41309 + }, + { + "epoch": 2.1603092783505153, + "grad_norm": 6.1482834815979, + "learning_rate": 4.441688365081541e-06, + "loss": 0.1653, + "step": 41310 + }, + { + "epoch": 2.16032284319045, + "grad_norm": 8.362486839294434, + "learning_rate": 4.441551322461286e-06, + "loss": 0.3608, + "step": 41311 + }, + { + "epoch": 2.160336408030385, + "grad_norm": 5.738275051116943, + "learning_rate": 4.441414279841031e-06, + "loss": 0.2379, + "step": 41312 + }, + { + "epoch": 2.16034997287032, + "grad_norm": 5.456993579864502, + "learning_rate": 4.441277237220775e-06, + "loss": 0.2103, + "step": 41313 + }, + { + "epoch": 2.1603635377102552, + "grad_norm": 5.6860785484313965, + "learning_rate": 4.441140194600521e-06, + "loss": 0.2952, + "step": 41314 + }, + { + "epoch": 2.16037710255019, + "grad_norm": 4.560118675231934, + "learning_rate": 4.441003151980266e-06, + "loss": 0.1999, + "step": 41315 + }, + { + "epoch": 2.160390667390125, + "grad_norm": 4.8147969245910645, + "learning_rate": 4.440866109360012e-06, + "loss": 0.2587, + "step": 41316 + }, + { + "epoch": 2.16040423223006, + "grad_norm": 4.744297981262207, + "learning_rate": 4.440729066739756e-06, + "loss": 0.2649, + "step": 41317 + }, + { + "epoch": 2.1604177970699947, + "grad_norm": 6.010342121124268, + "learning_rate": 4.440592024119501e-06, + "loss": 0.3306, + "step": 41318 + }, + { + "epoch": 2.1604313619099296, + "grad_norm": 5.398693084716797, + "learning_rate": 4.4404549814992465e-06, + "loss": 0.2657, + "step": 41319 + }, + { + "epoch": 2.1604449267498644, + "grad_norm": 4.889254093170166, + "learning_rate": 4.440317938878992e-06, + "loss": 0.2306, + "step": 41320 + }, + { + "epoch": 2.1604584915897993, + "grad_norm": 7.793370723724365, + "learning_rate": 4.440180896258737e-06, + "loss": 0.3146, + "step": 41321 + }, + { + "epoch": 2.160472056429734, + "grad_norm": 7.0764923095703125, + "learning_rate": 4.440043853638482e-06, + "loss": 0.2213, + "step": 41322 + }, + { + "epoch": 2.160485621269669, + "grad_norm": 4.169340133666992, + "learning_rate": 4.439906811018227e-06, + "loss": 0.2574, + "step": 41323 + }, + { + "epoch": 2.160499186109604, + "grad_norm": 5.1389899253845215, + "learning_rate": 4.439769768397972e-06, + "loss": 0.1555, + "step": 41324 + }, + { + "epoch": 2.1605127509495388, + "grad_norm": 5.055387020111084, + "learning_rate": 4.4396327257777175e-06, + "loss": 0.3046, + "step": 41325 + }, + { + "epoch": 2.1605263157894736, + "grad_norm": 4.675962448120117, + "learning_rate": 4.439495683157463e-06, + "loss": 0.1586, + "step": 41326 + }, + { + "epoch": 2.1605398806294085, + "grad_norm": 5.583858966827393, + "learning_rate": 4.439358640537207e-06, + "loss": 0.1742, + "step": 41327 + }, + { + "epoch": 2.1605534454693434, + "grad_norm": 6.342473983764648, + "learning_rate": 4.439221597916952e-06, + "loss": 0.3148, + "step": 41328 + }, + { + "epoch": 2.1605670103092782, + "grad_norm": 5.4158101081848145, + "learning_rate": 4.439084555296697e-06, + "loss": 0.3388, + "step": 41329 + }, + { + "epoch": 2.160580575149213, + "grad_norm": 6.3911871910095215, + "learning_rate": 4.438947512676443e-06, + "loss": 0.2961, + "step": 41330 + }, + { + "epoch": 2.160594139989148, + "grad_norm": 5.871066570281982, + "learning_rate": 4.438810470056188e-06, + "loss": 0.2552, + "step": 41331 + }, + { + "epoch": 2.160607704829083, + "grad_norm": 4.8950514793396, + "learning_rate": 4.438673427435933e-06, + "loss": 0.1744, + "step": 41332 + }, + { + "epoch": 2.1606212696690177, + "grad_norm": 4.871192455291748, + "learning_rate": 4.438536384815678e-06, + "loss": 0.2414, + "step": 41333 + }, + { + "epoch": 2.160634834508953, + "grad_norm": 5.639746189117432, + "learning_rate": 4.438399342195423e-06, + "loss": 0.2295, + "step": 41334 + }, + { + "epoch": 2.160648399348888, + "grad_norm": 4.709383964538574, + "learning_rate": 4.4382622995751685e-06, + "loss": 0.1507, + "step": 41335 + }, + { + "epoch": 2.1606619641888227, + "grad_norm": 4.981573104858398, + "learning_rate": 4.438125256954914e-06, + "loss": 0.1811, + "step": 41336 + }, + { + "epoch": 2.1606755290287576, + "grad_norm": 7.147134304046631, + "learning_rate": 4.437988214334659e-06, + "loss": 0.2462, + "step": 41337 + }, + { + "epoch": 2.1606890938686925, + "grad_norm": 5.416805267333984, + "learning_rate": 4.437851171714403e-06, + "loss": 0.1767, + "step": 41338 + }, + { + "epoch": 2.1607026587086273, + "grad_norm": 6.497774124145508, + "learning_rate": 4.437714129094149e-06, + "loss": 0.2654, + "step": 41339 + }, + { + "epoch": 2.160716223548562, + "grad_norm": 5.534370422363281, + "learning_rate": 4.4375770864738935e-06, + "loss": 0.212, + "step": 41340 + }, + { + "epoch": 2.160729788388497, + "grad_norm": 4.557265758514404, + "learning_rate": 4.437440043853639e-06, + "loss": 0.1729, + "step": 41341 + }, + { + "epoch": 2.160743353228432, + "grad_norm": 4.120295524597168, + "learning_rate": 4.437303001233384e-06, + "loss": 0.1356, + "step": 41342 + }, + { + "epoch": 2.160756918068367, + "grad_norm": 4.077791213989258, + "learning_rate": 4.437165958613129e-06, + "loss": 0.1282, + "step": 41343 + }, + { + "epoch": 2.1607704829083016, + "grad_norm": 4.864608287811279, + "learning_rate": 4.437028915992874e-06, + "loss": 0.2277, + "step": 41344 + }, + { + "epoch": 2.1607840477482365, + "grad_norm": 6.1841349601745605, + "learning_rate": 4.436891873372619e-06, + "loss": 0.1968, + "step": 41345 + }, + { + "epoch": 2.1607976125881714, + "grad_norm": 4.245284080505371, + "learning_rate": 4.436754830752365e-06, + "loss": 0.1534, + "step": 41346 + }, + { + "epoch": 2.1608111774281062, + "grad_norm": 5.80714750289917, + "learning_rate": 4.436617788132109e-06, + "loss": 0.2192, + "step": 41347 + }, + { + "epoch": 2.160824742268041, + "grad_norm": 5.973966598510742, + "learning_rate": 4.436480745511855e-06, + "loss": 0.2049, + "step": 41348 + }, + { + "epoch": 2.160838307107976, + "grad_norm": 6.350011348724365, + "learning_rate": 4.436343702891599e-06, + "loss": 0.3044, + "step": 41349 + }, + { + "epoch": 2.160851871947911, + "grad_norm": 4.992949485778809, + "learning_rate": 4.436206660271345e-06, + "loss": 0.1671, + "step": 41350 + }, + { + "epoch": 2.1608654367878457, + "grad_norm": 5.147054195404053, + "learning_rate": 4.43606961765109e-06, + "loss": 0.2407, + "step": 41351 + }, + { + "epoch": 2.160879001627781, + "grad_norm": 3.9845035076141357, + "learning_rate": 4.435932575030835e-06, + "loss": 0.1317, + "step": 41352 + }, + { + "epoch": 2.160892566467716, + "grad_norm": 6.042966365814209, + "learning_rate": 4.43579553241058e-06, + "loss": 0.1284, + "step": 41353 + }, + { + "epoch": 2.1609061313076507, + "grad_norm": 5.162378311157227, + "learning_rate": 4.435658489790325e-06, + "loss": 0.2066, + "step": 41354 + }, + { + "epoch": 2.1609196961475856, + "grad_norm": 5.111474514007568, + "learning_rate": 4.43552144717007e-06, + "loss": 0.1962, + "step": 41355 + }, + { + "epoch": 2.1609332609875205, + "grad_norm": 5.102581977844238, + "learning_rate": 4.4353844045498155e-06, + "loss": 0.2337, + "step": 41356 + }, + { + "epoch": 2.1609468258274553, + "grad_norm": 5.221200942993164, + "learning_rate": 4.435247361929561e-06, + "loss": 0.202, + "step": 41357 + }, + { + "epoch": 2.16096039066739, + "grad_norm": 4.819955348968506, + "learning_rate": 4.435110319309305e-06, + "loss": 0.1139, + "step": 41358 + }, + { + "epoch": 2.160973955507325, + "grad_norm": 3.835777521133423, + "learning_rate": 4.434973276689051e-06, + "loss": 0.1123, + "step": 41359 + }, + { + "epoch": 2.16098752034726, + "grad_norm": 5.205129146575928, + "learning_rate": 4.434836234068795e-06, + "loss": 0.2074, + "step": 41360 + }, + { + "epoch": 2.161001085187195, + "grad_norm": 4.329002380371094, + "learning_rate": 4.434699191448541e-06, + "loss": 0.1785, + "step": 41361 + }, + { + "epoch": 2.1610146500271297, + "grad_norm": 4.157209873199463, + "learning_rate": 4.434562148828286e-06, + "loss": 0.2043, + "step": 41362 + }, + { + "epoch": 2.1610282148670645, + "grad_norm": 4.42020845413208, + "learning_rate": 4.434425106208031e-06, + "loss": 0.162, + "step": 41363 + }, + { + "epoch": 2.1610417797069994, + "grad_norm": 5.864625453948975, + "learning_rate": 4.434288063587776e-06, + "loss": 0.1529, + "step": 41364 + }, + { + "epoch": 2.1610553445469343, + "grad_norm": 6.2047271728515625, + "learning_rate": 4.434151020967521e-06, + "loss": 0.2518, + "step": 41365 + }, + { + "epoch": 2.161068909386869, + "grad_norm": 5.254158973693848, + "learning_rate": 4.4340139783472665e-06, + "loss": 0.1782, + "step": 41366 + }, + { + "epoch": 2.161082474226804, + "grad_norm": 6.473534107208252, + "learning_rate": 4.433876935727012e-06, + "loss": 0.2133, + "step": 41367 + }, + { + "epoch": 2.161096039066739, + "grad_norm": 3.524183988571167, + "learning_rate": 4.433739893106757e-06, + "loss": 0.1116, + "step": 41368 + }, + { + "epoch": 2.1611096039066737, + "grad_norm": 4.535676002502441, + "learning_rate": 4.433602850486501e-06, + "loss": 0.1712, + "step": 41369 + }, + { + "epoch": 2.1611231687466086, + "grad_norm": 6.030797958374023, + "learning_rate": 4.433465807866247e-06, + "loss": 0.2394, + "step": 41370 + }, + { + "epoch": 2.1611367335865435, + "grad_norm": 4.969787120819092, + "learning_rate": 4.4333287652459915e-06, + "loss": 0.1857, + "step": 41371 + }, + { + "epoch": 2.1611502984264788, + "grad_norm": 4.867915630340576, + "learning_rate": 4.433191722625737e-06, + "loss": 0.2488, + "step": 41372 + }, + { + "epoch": 2.1611638632664136, + "grad_norm": 4.2848358154296875, + "learning_rate": 4.433054680005482e-06, + "loss": 0.1729, + "step": 41373 + }, + { + "epoch": 2.1611774281063485, + "grad_norm": 5.019781112670898, + "learning_rate": 4.432917637385227e-06, + "loss": 0.2227, + "step": 41374 + }, + { + "epoch": 2.1611909929462834, + "grad_norm": 3.708326578140259, + "learning_rate": 4.432780594764972e-06, + "loss": 0.1622, + "step": 41375 + }, + { + "epoch": 2.1612045577862182, + "grad_norm": 4.340450763702393, + "learning_rate": 4.432643552144717e-06, + "loss": 0.2691, + "step": 41376 + }, + { + "epoch": 2.161218122626153, + "grad_norm": 4.711470603942871, + "learning_rate": 4.432506509524463e-06, + "loss": 0.1385, + "step": 41377 + }, + { + "epoch": 2.161231687466088, + "grad_norm": 4.182889938354492, + "learning_rate": 4.432369466904208e-06, + "loss": 0.1168, + "step": 41378 + }, + { + "epoch": 2.161245252306023, + "grad_norm": 6.734777927398682, + "learning_rate": 4.432232424283953e-06, + "loss": 0.2024, + "step": 41379 + }, + { + "epoch": 2.1612588171459577, + "grad_norm": 4.178171634674072, + "learning_rate": 4.432095381663698e-06, + "loss": 0.1743, + "step": 41380 + }, + { + "epoch": 2.1612723819858926, + "grad_norm": 5.262098789215088, + "learning_rate": 4.4319583390434425e-06, + "loss": 0.1362, + "step": 41381 + }, + { + "epoch": 2.1612859468258274, + "grad_norm": 3.3834846019744873, + "learning_rate": 4.431821296423188e-06, + "loss": 0.1161, + "step": 41382 + }, + { + "epoch": 2.1612995116657623, + "grad_norm": 5.761053562164307, + "learning_rate": 4.431684253802933e-06, + "loss": 0.241, + "step": 41383 + }, + { + "epoch": 2.161313076505697, + "grad_norm": 4.466930866241455, + "learning_rate": 4.431547211182678e-06, + "loss": 0.202, + "step": 41384 + }, + { + "epoch": 2.161326641345632, + "grad_norm": 4.939257621765137, + "learning_rate": 4.431410168562423e-06, + "loss": 0.1927, + "step": 41385 + }, + { + "epoch": 2.161340206185567, + "grad_norm": 5.881287097930908, + "learning_rate": 4.431273125942168e-06, + "loss": 0.2046, + "step": 41386 + }, + { + "epoch": 2.1613537710255017, + "grad_norm": 3.878042221069336, + "learning_rate": 4.4311360833219135e-06, + "loss": 0.1398, + "step": 41387 + }, + { + "epoch": 2.1613673358654366, + "grad_norm": 3.8593909740448, + "learning_rate": 4.430999040701659e-06, + "loss": 0.1896, + "step": 41388 + }, + { + "epoch": 2.1613809007053715, + "grad_norm": 6.553016185760498, + "learning_rate": 4.430861998081404e-06, + "loss": 0.2353, + "step": 41389 + }, + { + "epoch": 2.161394465545307, + "grad_norm": 4.662750244140625, + "learning_rate": 4.430724955461149e-06, + "loss": 0.1356, + "step": 41390 + }, + { + "epoch": 2.1614080303852417, + "grad_norm": 5.172923564910889, + "learning_rate": 4.430587912840894e-06, + "loss": 0.2535, + "step": 41391 + }, + { + "epoch": 2.1614215952251765, + "grad_norm": 4.0795512199401855, + "learning_rate": 4.430450870220639e-06, + "loss": 0.1061, + "step": 41392 + }, + { + "epoch": 2.1614351600651114, + "grad_norm": 5.613558292388916, + "learning_rate": 4.430313827600385e-06, + "loss": 0.3223, + "step": 41393 + }, + { + "epoch": 2.1614487249050462, + "grad_norm": 3.5789051055908203, + "learning_rate": 4.430176784980129e-06, + "loss": 0.1292, + "step": 41394 + }, + { + "epoch": 2.161462289744981, + "grad_norm": 7.1303253173828125, + "learning_rate": 4.430039742359875e-06, + "loss": 0.2586, + "step": 41395 + }, + { + "epoch": 2.161475854584916, + "grad_norm": 3.434765577316284, + "learning_rate": 4.429902699739619e-06, + "loss": 0.1393, + "step": 41396 + }, + { + "epoch": 2.161489419424851, + "grad_norm": 3.8918681144714355, + "learning_rate": 4.4297656571193645e-06, + "loss": 0.1201, + "step": 41397 + }, + { + "epoch": 2.1615029842647857, + "grad_norm": 4.440539836883545, + "learning_rate": 4.42962861449911e-06, + "loss": 0.1731, + "step": 41398 + }, + { + "epoch": 2.1615165491047206, + "grad_norm": 3.435936689376831, + "learning_rate": 4.429491571878855e-06, + "loss": 0.0994, + "step": 41399 + }, + { + "epoch": 2.1615301139446554, + "grad_norm": 6.726037502288818, + "learning_rate": 4.4293545292586e-06, + "loss": 0.2823, + "step": 41400 + }, + { + "epoch": 2.1615436787845903, + "grad_norm": 5.0283660888671875, + "learning_rate": 4.429217486638344e-06, + "loss": 0.2124, + "step": 41401 + }, + { + "epoch": 2.161557243624525, + "grad_norm": 4.831307411193848, + "learning_rate": 4.42908044401809e-06, + "loss": 0.1497, + "step": 41402 + }, + { + "epoch": 2.16157080846446, + "grad_norm": 5.617495536804199, + "learning_rate": 4.428943401397835e-06, + "loss": 0.1788, + "step": 41403 + }, + { + "epoch": 2.161584373304395, + "grad_norm": 5.483222484588623, + "learning_rate": 4.428806358777581e-06, + "loss": 0.1419, + "step": 41404 + }, + { + "epoch": 2.1615979381443298, + "grad_norm": 5.271903991699219, + "learning_rate": 4.428669316157325e-06, + "loss": 0.2064, + "step": 41405 + }, + { + "epoch": 2.1616115029842646, + "grad_norm": 4.369357109069824, + "learning_rate": 4.42853227353707e-06, + "loss": 0.1662, + "step": 41406 + }, + { + "epoch": 2.1616250678241995, + "grad_norm": 4.465083599090576, + "learning_rate": 4.428395230916815e-06, + "loss": 0.173, + "step": 41407 + }, + { + "epoch": 2.1616386326641344, + "grad_norm": 4.020875930786133, + "learning_rate": 4.428258188296561e-06, + "loss": 0.1873, + "step": 41408 + }, + { + "epoch": 2.1616521975040692, + "grad_norm": 4.183691024780273, + "learning_rate": 4.428121145676306e-06, + "loss": 0.1193, + "step": 41409 + }, + { + "epoch": 2.1616657623440045, + "grad_norm": 4.823816776275635, + "learning_rate": 4.42798410305605e-06, + "loss": 0.1611, + "step": 41410 + }, + { + "epoch": 2.1616793271839394, + "grad_norm": 6.186625003814697, + "learning_rate": 4.427847060435796e-06, + "loss": 0.292, + "step": 41411 + }, + { + "epoch": 2.1616928920238743, + "grad_norm": 3.2218146324157715, + "learning_rate": 4.4277100178155405e-06, + "loss": 0.1304, + "step": 41412 + }, + { + "epoch": 2.161706456863809, + "grad_norm": 2.942371368408203, + "learning_rate": 4.4275729751952865e-06, + "loss": 0.0901, + "step": 41413 + }, + { + "epoch": 2.161720021703744, + "grad_norm": 7.003321647644043, + "learning_rate": 4.427435932575031e-06, + "loss": 0.3514, + "step": 41414 + }, + { + "epoch": 2.161733586543679, + "grad_norm": 4.392544746398926, + "learning_rate": 4.427298889954776e-06, + "loss": 0.2237, + "step": 41415 + }, + { + "epoch": 2.1617471513836137, + "grad_norm": 4.9847259521484375, + "learning_rate": 4.427161847334521e-06, + "loss": 0.126, + "step": 41416 + }, + { + "epoch": 2.1617607162235486, + "grad_norm": 3.485553741455078, + "learning_rate": 4.427024804714266e-06, + "loss": 0.0976, + "step": 41417 + }, + { + "epoch": 2.1617742810634835, + "grad_norm": 4.570655822753906, + "learning_rate": 4.4268877620940115e-06, + "loss": 0.1977, + "step": 41418 + }, + { + "epoch": 2.1617878459034183, + "grad_norm": 4.368175029754639, + "learning_rate": 4.426750719473757e-06, + "loss": 0.1907, + "step": 41419 + }, + { + "epoch": 2.161801410743353, + "grad_norm": 4.141725063323975, + "learning_rate": 4.426613676853502e-06, + "loss": 0.1794, + "step": 41420 + }, + { + "epoch": 2.161814975583288, + "grad_norm": 5.839067459106445, + "learning_rate": 4.426476634233247e-06, + "loss": 0.2148, + "step": 41421 + }, + { + "epoch": 2.161828540423223, + "grad_norm": 6.290927886962891, + "learning_rate": 4.426339591612992e-06, + "loss": 0.1784, + "step": 41422 + }, + { + "epoch": 2.161842105263158, + "grad_norm": 4.073851108551025, + "learning_rate": 4.4262025489927374e-06, + "loss": 0.1575, + "step": 41423 + }, + { + "epoch": 2.1618556701030927, + "grad_norm": 3.114191770553589, + "learning_rate": 4.426065506372483e-06, + "loss": 0.0965, + "step": 41424 + }, + { + "epoch": 2.1618692349430275, + "grad_norm": 3.9246857166290283, + "learning_rate": 4.425928463752227e-06, + "loss": 0.1655, + "step": 41425 + }, + { + "epoch": 2.1618827997829624, + "grad_norm": 5.370104789733887, + "learning_rate": 4.425791421131972e-06, + "loss": 0.1431, + "step": 41426 + }, + { + "epoch": 2.1618963646228972, + "grad_norm": 10.292202949523926, + "learning_rate": 4.425654378511717e-06, + "loss": 0.4023, + "step": 41427 + }, + { + "epoch": 2.1619099294628326, + "grad_norm": 4.123631954193115, + "learning_rate": 4.4255173358914625e-06, + "loss": 0.1562, + "step": 41428 + }, + { + "epoch": 2.1619234943027674, + "grad_norm": 4.9972991943359375, + "learning_rate": 4.425380293271208e-06, + "loss": 0.2038, + "step": 41429 + }, + { + "epoch": 2.1619370591427023, + "grad_norm": 6.107125282287598, + "learning_rate": 4.425243250650953e-06, + "loss": 0.2508, + "step": 41430 + }, + { + "epoch": 2.161950623982637, + "grad_norm": 5.310141086578369, + "learning_rate": 4.425106208030698e-06, + "loss": 0.2036, + "step": 41431 + }, + { + "epoch": 2.161964188822572, + "grad_norm": 4.115614414215088, + "learning_rate": 4.424969165410443e-06, + "loss": 0.144, + "step": 41432 + }, + { + "epoch": 2.161977753662507, + "grad_norm": 3.6561226844787598, + "learning_rate": 4.424832122790188e-06, + "loss": 0.1049, + "step": 41433 + }, + { + "epoch": 2.1619913185024418, + "grad_norm": 5.687582015991211, + "learning_rate": 4.4246950801699336e-06, + "loss": 0.1676, + "step": 41434 + }, + { + "epoch": 2.1620048833423766, + "grad_norm": 3.3518683910369873, + "learning_rate": 4.424558037549678e-06, + "loss": 0.1305, + "step": 41435 + }, + { + "epoch": 2.1620184481823115, + "grad_norm": 6.234836578369141, + "learning_rate": 4.424420994929424e-06, + "loss": 0.2386, + "step": 41436 + }, + { + "epoch": 2.1620320130222463, + "grad_norm": 4.295867919921875, + "learning_rate": 4.424283952309168e-06, + "loss": 0.1569, + "step": 41437 + }, + { + "epoch": 2.162045577862181, + "grad_norm": 4.29660701751709, + "learning_rate": 4.4241469096889134e-06, + "loss": 0.1042, + "step": 41438 + }, + { + "epoch": 2.162059142702116, + "grad_norm": 3.1243462562561035, + "learning_rate": 4.424009867068659e-06, + "loss": 0.0846, + "step": 41439 + }, + { + "epoch": 2.162072707542051, + "grad_norm": 3.5101139545440674, + "learning_rate": 4.423872824448404e-06, + "loss": 0.1436, + "step": 41440 + }, + { + "epoch": 2.162086272381986, + "grad_norm": 3.3055076599121094, + "learning_rate": 4.423735781828149e-06, + "loss": 0.1229, + "step": 41441 + }, + { + "epoch": 2.1620998372219207, + "grad_norm": 4.59870719909668, + "learning_rate": 4.423598739207894e-06, + "loss": 0.1228, + "step": 41442 + }, + { + "epoch": 2.1621134020618555, + "grad_norm": 4.425928592681885, + "learning_rate": 4.423461696587639e-06, + "loss": 0.1443, + "step": 41443 + }, + { + "epoch": 2.1621269669017904, + "grad_norm": 4.224150657653809, + "learning_rate": 4.4233246539673845e-06, + "loss": 0.1437, + "step": 41444 + }, + { + "epoch": 2.1621405317417253, + "grad_norm": 4.200439453125, + "learning_rate": 4.42318761134713e-06, + "loss": 0.1575, + "step": 41445 + }, + { + "epoch": 2.16215409658166, + "grad_norm": 5.068317413330078, + "learning_rate": 4.423050568726874e-06, + "loss": 0.1842, + "step": 41446 + }, + { + "epoch": 2.162167661421595, + "grad_norm": 6.693807601928711, + "learning_rate": 4.42291352610662e-06, + "loss": 0.2384, + "step": 41447 + }, + { + "epoch": 2.1621812262615303, + "grad_norm": 4.150511741638184, + "learning_rate": 4.422776483486364e-06, + "loss": 0.1274, + "step": 41448 + }, + { + "epoch": 2.162194791101465, + "grad_norm": 5.253448963165283, + "learning_rate": 4.4226394408661095e-06, + "loss": 0.2017, + "step": 41449 + }, + { + "epoch": 2.1622083559414, + "grad_norm": 6.121016502380371, + "learning_rate": 4.422502398245855e-06, + "loss": 0.255, + "step": 41450 + }, + { + "epoch": 2.162221920781335, + "grad_norm": 3.5714287757873535, + "learning_rate": 4.4223653556256e-06, + "loss": 0.1737, + "step": 41451 + }, + { + "epoch": 2.1622354856212698, + "grad_norm": 5.476545810699463, + "learning_rate": 4.422228313005345e-06, + "loss": 0.1541, + "step": 41452 + }, + { + "epoch": 2.1622490504612046, + "grad_norm": 5.49727725982666, + "learning_rate": 4.42209127038509e-06, + "loss": 0.1821, + "step": 41453 + }, + { + "epoch": 2.1622626153011395, + "grad_norm": 4.55790901184082, + "learning_rate": 4.4219542277648354e-06, + "loss": 0.1375, + "step": 41454 + }, + { + "epoch": 2.1622761801410744, + "grad_norm": 4.581271171569824, + "learning_rate": 4.42181718514458e-06, + "loss": 0.1247, + "step": 41455 + }, + { + "epoch": 2.1622897449810092, + "grad_norm": 3.94416880607605, + "learning_rate": 4.421680142524326e-06, + "loss": 0.1579, + "step": 41456 + }, + { + "epoch": 2.162303309820944, + "grad_norm": 4.875771999359131, + "learning_rate": 4.42154309990407e-06, + "loss": 0.1675, + "step": 41457 + }, + { + "epoch": 2.162316874660879, + "grad_norm": 6.0202202796936035, + "learning_rate": 4.421406057283816e-06, + "loss": 0.2175, + "step": 41458 + }, + { + "epoch": 2.162330439500814, + "grad_norm": 5.116194248199463, + "learning_rate": 4.4212690146635605e-06, + "loss": 0.224, + "step": 41459 + }, + { + "epoch": 2.1623440043407487, + "grad_norm": 4.232141494750977, + "learning_rate": 4.421131972043306e-06, + "loss": 0.1345, + "step": 41460 + }, + { + "epoch": 2.1623575691806836, + "grad_norm": 6.907426834106445, + "learning_rate": 4.420994929423051e-06, + "loss": 0.214, + "step": 41461 + }, + { + "epoch": 2.1623711340206184, + "grad_norm": 5.521692752838135, + "learning_rate": 4.420857886802796e-06, + "loss": 0.2651, + "step": 41462 + }, + { + "epoch": 2.1623846988605533, + "grad_norm": 4.9138994216918945, + "learning_rate": 4.420720844182541e-06, + "loss": 0.2268, + "step": 41463 + }, + { + "epoch": 2.162398263700488, + "grad_norm": 4.395718574523926, + "learning_rate": 4.420583801562286e-06, + "loss": 0.1135, + "step": 41464 + }, + { + "epoch": 2.162411828540423, + "grad_norm": 4.3248677253723145, + "learning_rate": 4.4204467589420316e-06, + "loss": 0.149, + "step": 41465 + }, + { + "epoch": 2.1624253933803583, + "grad_norm": 6.172943115234375, + "learning_rate": 4.420309716321776e-06, + "loss": 0.1591, + "step": 41466 + }, + { + "epoch": 2.162438958220293, + "grad_norm": 4.5819220542907715, + "learning_rate": 4.420172673701522e-06, + "loss": 0.1627, + "step": 41467 + }, + { + "epoch": 2.162452523060228, + "grad_norm": 4.202844619750977, + "learning_rate": 4.420035631081266e-06, + "loss": 0.1414, + "step": 41468 + }, + { + "epoch": 2.162466087900163, + "grad_norm": 4.065262317657471, + "learning_rate": 4.4198985884610114e-06, + "loss": 0.1675, + "step": 41469 + }, + { + "epoch": 2.162479652740098, + "grad_norm": 4.004557132720947, + "learning_rate": 4.419761545840757e-06, + "loss": 0.1285, + "step": 41470 + }, + { + "epoch": 2.1624932175800327, + "grad_norm": 5.485520362854004, + "learning_rate": 4.419624503220502e-06, + "loss": 0.1911, + "step": 41471 + }, + { + "epoch": 2.1625067824199675, + "grad_norm": 3.009195566177368, + "learning_rate": 4.419487460600247e-06, + "loss": 0.0898, + "step": 41472 + }, + { + "epoch": 2.1625203472599024, + "grad_norm": 5.370391368865967, + "learning_rate": 4.419350417979992e-06, + "loss": 0.2193, + "step": 41473 + }, + { + "epoch": 2.1625339120998373, + "grad_norm": 4.993776798248291, + "learning_rate": 4.419213375359737e-06, + "loss": 0.2871, + "step": 41474 + }, + { + "epoch": 2.162547476939772, + "grad_norm": 5.243163108825684, + "learning_rate": 4.4190763327394825e-06, + "loss": 0.1632, + "step": 41475 + }, + { + "epoch": 2.162561041779707, + "grad_norm": 3.6088356971740723, + "learning_rate": 4.418939290119228e-06, + "loss": 0.1287, + "step": 41476 + }, + { + "epoch": 2.162574606619642, + "grad_norm": 4.254491329193115, + "learning_rate": 4.418802247498973e-06, + "loss": 0.1258, + "step": 41477 + }, + { + "epoch": 2.1625881714595767, + "grad_norm": 4.117949962615967, + "learning_rate": 4.418665204878718e-06, + "loss": 0.1204, + "step": 41478 + }, + { + "epoch": 2.1626017362995116, + "grad_norm": 6.2361249923706055, + "learning_rate": 4.418528162258462e-06, + "loss": 0.2041, + "step": 41479 + }, + { + "epoch": 2.1626153011394464, + "grad_norm": 3.9483590126037598, + "learning_rate": 4.4183911196382076e-06, + "loss": 0.1245, + "step": 41480 + }, + { + "epoch": 2.1626288659793813, + "grad_norm": 3.8683998584747314, + "learning_rate": 4.418254077017953e-06, + "loss": 0.1101, + "step": 41481 + }, + { + "epoch": 2.162642430819316, + "grad_norm": 5.808125019073486, + "learning_rate": 4.418117034397698e-06, + "loss": 0.1856, + "step": 41482 + }, + { + "epoch": 2.162655995659251, + "grad_norm": 4.470551490783691, + "learning_rate": 4.417979991777443e-06, + "loss": 0.1643, + "step": 41483 + }, + { + "epoch": 2.162669560499186, + "grad_norm": 3.755906581878662, + "learning_rate": 4.417842949157188e-06, + "loss": 0.0927, + "step": 41484 + }, + { + "epoch": 2.1626831253391208, + "grad_norm": 4.8971452713012695, + "learning_rate": 4.4177059065369334e-06, + "loss": 0.1094, + "step": 41485 + }, + { + "epoch": 2.162696690179056, + "grad_norm": 6.060553550720215, + "learning_rate": 4.417568863916679e-06, + "loss": 0.2063, + "step": 41486 + }, + { + "epoch": 2.162710255018991, + "grad_norm": 4.551862716674805, + "learning_rate": 4.417431821296424e-06, + "loss": 0.1916, + "step": 41487 + }, + { + "epoch": 2.162723819858926, + "grad_norm": 4.564664363861084, + "learning_rate": 4.417294778676169e-06, + "loss": 0.1214, + "step": 41488 + }, + { + "epoch": 2.1627373846988607, + "grad_norm": 5.477651119232178, + "learning_rate": 4.417157736055913e-06, + "loss": 0.1685, + "step": 41489 + }, + { + "epoch": 2.1627509495387955, + "grad_norm": 3.7540602684020996, + "learning_rate": 4.417020693435659e-06, + "loss": 0.1018, + "step": 41490 + }, + { + "epoch": 2.1627645143787304, + "grad_norm": 6.357149124145508, + "learning_rate": 4.416883650815404e-06, + "loss": 0.2429, + "step": 41491 + }, + { + "epoch": 2.1627780792186653, + "grad_norm": 3.6692051887512207, + "learning_rate": 4.416746608195149e-06, + "loss": 0.1464, + "step": 41492 + }, + { + "epoch": 2.1627916440586, + "grad_norm": 3.681637763977051, + "learning_rate": 4.416609565574894e-06, + "loss": 0.1701, + "step": 41493 + }, + { + "epoch": 2.162805208898535, + "grad_norm": 5.51698637008667, + "learning_rate": 4.416472522954639e-06, + "loss": 0.1683, + "step": 41494 + }, + { + "epoch": 2.16281877373847, + "grad_norm": 3.6258437633514404, + "learning_rate": 4.416335480334384e-06, + "loss": 0.1484, + "step": 41495 + }, + { + "epoch": 2.1628323385784047, + "grad_norm": 4.460050106048584, + "learning_rate": 4.4161984377141296e-06, + "loss": 0.1289, + "step": 41496 + }, + { + "epoch": 2.1628459034183396, + "grad_norm": 4.622032165527344, + "learning_rate": 4.416061395093875e-06, + "loss": 0.1234, + "step": 41497 + }, + { + "epoch": 2.1628594682582745, + "grad_norm": 4.496400356292725, + "learning_rate": 4.41592435247362e-06, + "loss": 0.1642, + "step": 41498 + }, + { + "epoch": 2.1628730330982093, + "grad_norm": 4.2990217208862305, + "learning_rate": 4.415787309853365e-06, + "loss": 0.2251, + "step": 41499 + }, + { + "epoch": 2.162886597938144, + "grad_norm": 4.540638446807861, + "learning_rate": 4.4156502672331094e-06, + "loss": 0.2026, + "step": 41500 + }, + { + "epoch": 2.162900162778079, + "grad_norm": 4.065778732299805, + "learning_rate": 4.4155132246128555e-06, + "loss": 0.1788, + "step": 41501 + }, + { + "epoch": 2.162913727618014, + "grad_norm": 5.787551403045654, + "learning_rate": 4.4153761819926e-06, + "loss": 0.1881, + "step": 41502 + }, + { + "epoch": 2.162927292457949, + "grad_norm": 6.826205730438232, + "learning_rate": 4.415239139372345e-06, + "loss": 0.2295, + "step": 41503 + }, + { + "epoch": 2.162940857297884, + "grad_norm": 6.280277252197266, + "learning_rate": 4.41510209675209e-06, + "loss": 0.198, + "step": 41504 + }, + { + "epoch": 2.162954422137819, + "grad_norm": 5.471731662750244, + "learning_rate": 4.414965054131835e-06, + "loss": 0.2548, + "step": 41505 + }, + { + "epoch": 2.162967986977754, + "grad_norm": 6.502660751342773, + "learning_rate": 4.4148280115115805e-06, + "loss": 0.2413, + "step": 41506 + }, + { + "epoch": 2.1629815518176887, + "grad_norm": 7.229755878448486, + "learning_rate": 4.414690968891326e-06, + "loss": 0.4054, + "step": 41507 + }, + { + "epoch": 2.1629951166576236, + "grad_norm": 5.126866340637207, + "learning_rate": 4.414553926271071e-06, + "loss": 0.2103, + "step": 41508 + }, + { + "epoch": 2.1630086814975584, + "grad_norm": 5.613415241241455, + "learning_rate": 4.414416883650815e-06, + "loss": 0.1931, + "step": 41509 + }, + { + "epoch": 2.1630222463374933, + "grad_norm": 4.781775951385498, + "learning_rate": 4.414279841030561e-06, + "loss": 0.147, + "step": 41510 + }, + { + "epoch": 2.163035811177428, + "grad_norm": 6.879209518432617, + "learning_rate": 4.4141427984103056e-06, + "loss": 0.2566, + "step": 41511 + }, + { + "epoch": 2.163049376017363, + "grad_norm": 6.750422954559326, + "learning_rate": 4.414005755790052e-06, + "loss": 0.3347, + "step": 41512 + }, + { + "epoch": 2.163062940857298, + "grad_norm": 5.2146100997924805, + "learning_rate": 4.413868713169796e-06, + "loss": 0.192, + "step": 41513 + }, + { + "epoch": 2.1630765056972328, + "grad_norm": 4.144359111785889, + "learning_rate": 4.413731670549541e-06, + "loss": 0.1403, + "step": 41514 + }, + { + "epoch": 2.1630900705371676, + "grad_norm": 5.607089519500732, + "learning_rate": 4.413594627929286e-06, + "loss": 0.2238, + "step": 41515 + }, + { + "epoch": 2.1631036353771025, + "grad_norm": 3.629220962524414, + "learning_rate": 4.4134575853090315e-06, + "loss": 0.1668, + "step": 41516 + }, + { + "epoch": 2.1631172002170374, + "grad_norm": 3.547348976135254, + "learning_rate": 4.413320542688777e-06, + "loss": 0.1302, + "step": 41517 + }, + { + "epoch": 2.163130765056972, + "grad_norm": 3.3901917934417725, + "learning_rate": 4.413183500068522e-06, + "loss": 0.1988, + "step": 41518 + }, + { + "epoch": 2.163144329896907, + "grad_norm": 3.1026546955108643, + "learning_rate": 4.413046457448267e-06, + "loss": 0.1698, + "step": 41519 + }, + { + "epoch": 2.163157894736842, + "grad_norm": 4.525592803955078, + "learning_rate": 4.412909414828011e-06, + "loss": 0.1622, + "step": 41520 + }, + { + "epoch": 2.163171459576777, + "grad_norm": 4.982796669006348, + "learning_rate": 4.412772372207757e-06, + "loss": 0.2444, + "step": 41521 + }, + { + "epoch": 2.1631850244167117, + "grad_norm": 5.337011814117432, + "learning_rate": 4.412635329587502e-06, + "loss": 0.257, + "step": 41522 + }, + { + "epoch": 2.1631985892566465, + "grad_norm": 6.784271717071533, + "learning_rate": 4.412498286967247e-06, + "loss": 0.2343, + "step": 41523 + }, + { + "epoch": 2.163212154096582, + "grad_norm": 3.992591381072998, + "learning_rate": 4.412361244346992e-06, + "loss": 0.1663, + "step": 41524 + }, + { + "epoch": 2.1632257189365167, + "grad_norm": 5.389857292175293, + "learning_rate": 4.412224201726737e-06, + "loss": 0.2017, + "step": 41525 + }, + { + "epoch": 2.1632392837764516, + "grad_norm": 4.219752311706543, + "learning_rate": 4.412087159106482e-06, + "loss": 0.1969, + "step": 41526 + }, + { + "epoch": 2.1632528486163864, + "grad_norm": 5.098485469818115, + "learning_rate": 4.4119501164862276e-06, + "loss": 0.1314, + "step": 41527 + }, + { + "epoch": 2.1632664134563213, + "grad_norm": 5.141817569732666, + "learning_rate": 4.411813073865973e-06, + "loss": 0.1667, + "step": 41528 + }, + { + "epoch": 2.163279978296256, + "grad_norm": 5.633886337280273, + "learning_rate": 4.411676031245718e-06, + "loss": 0.3213, + "step": 41529 + }, + { + "epoch": 2.163293543136191, + "grad_norm": 4.816384315490723, + "learning_rate": 4.411538988625463e-06, + "loss": 0.189, + "step": 41530 + }, + { + "epoch": 2.163307107976126, + "grad_norm": 5.031859874725342, + "learning_rate": 4.411401946005208e-06, + "loss": 0.1657, + "step": 41531 + }, + { + "epoch": 2.1633206728160608, + "grad_norm": 5.1400957107543945, + "learning_rate": 4.4112649033849535e-06, + "loss": 0.1975, + "step": 41532 + }, + { + "epoch": 2.1633342376559956, + "grad_norm": 5.076345443725586, + "learning_rate": 4.411127860764699e-06, + "loss": 0.2059, + "step": 41533 + }, + { + "epoch": 2.1633478024959305, + "grad_norm": 6.973932266235352, + "learning_rate": 4.410990818144443e-06, + "loss": 0.2332, + "step": 41534 + }, + { + "epoch": 2.1633613673358654, + "grad_norm": 6.73966646194458, + "learning_rate": 4.410853775524188e-06, + "loss": 0.2566, + "step": 41535 + }, + { + "epoch": 2.1633749321758002, + "grad_norm": 4.6349310874938965, + "learning_rate": 4.410716732903933e-06, + "loss": 0.312, + "step": 41536 + }, + { + "epoch": 2.163388497015735, + "grad_norm": 6.445558547973633, + "learning_rate": 4.4105796902836785e-06, + "loss": 0.3419, + "step": 41537 + }, + { + "epoch": 2.16340206185567, + "grad_norm": 3.9654204845428467, + "learning_rate": 4.410442647663424e-06, + "loss": 0.1613, + "step": 41538 + }, + { + "epoch": 2.163415626695605, + "grad_norm": 5.233925819396973, + "learning_rate": 4.410305605043169e-06, + "loss": 0.1971, + "step": 41539 + }, + { + "epoch": 2.1634291915355397, + "grad_norm": 5.070056438446045, + "learning_rate": 4.410168562422914e-06, + "loss": 0.2038, + "step": 41540 + }, + { + "epoch": 2.1634427563754746, + "grad_norm": 5.375136852264404, + "learning_rate": 4.410031519802659e-06, + "loss": 0.3552, + "step": 41541 + }, + { + "epoch": 2.16345632121541, + "grad_norm": 9.0067720413208, + "learning_rate": 4.409894477182404e-06, + "loss": 0.4944, + "step": 41542 + }, + { + "epoch": 2.1634698860553447, + "grad_norm": 3.9698526859283447, + "learning_rate": 4.409757434562149e-06, + "loss": 0.2126, + "step": 41543 + }, + { + "epoch": 2.1634834508952796, + "grad_norm": 5.131244659423828, + "learning_rate": 4.409620391941895e-06, + "loss": 0.2025, + "step": 41544 + }, + { + "epoch": 2.1634970157352145, + "grad_norm": 6.436604976654053, + "learning_rate": 4.409483349321639e-06, + "loss": 0.2138, + "step": 41545 + }, + { + "epoch": 2.1635105805751493, + "grad_norm": 4.3921284675598145, + "learning_rate": 4.409346306701385e-06, + "loss": 0.2297, + "step": 41546 + }, + { + "epoch": 2.163524145415084, + "grad_norm": 4.310820579528809, + "learning_rate": 4.4092092640811295e-06, + "loss": 0.1695, + "step": 41547 + }, + { + "epoch": 2.163537710255019, + "grad_norm": 3.8556036949157715, + "learning_rate": 4.409072221460875e-06, + "loss": 0.1934, + "step": 41548 + }, + { + "epoch": 2.163551275094954, + "grad_norm": 4.402153491973877, + "learning_rate": 4.40893517884062e-06, + "loss": 0.1429, + "step": 41549 + }, + { + "epoch": 2.163564839934889, + "grad_norm": 4.300446510314941, + "learning_rate": 4.408798136220365e-06, + "loss": 0.1418, + "step": 41550 + }, + { + "epoch": 2.1635784047748237, + "grad_norm": 5.022964000701904, + "learning_rate": 4.40866109360011e-06, + "loss": 0.2779, + "step": 41551 + }, + { + "epoch": 2.1635919696147585, + "grad_norm": 4.24999475479126, + "learning_rate": 4.4085240509798545e-06, + "loss": 0.1846, + "step": 41552 + }, + { + "epoch": 2.1636055344546934, + "grad_norm": 3.649991512298584, + "learning_rate": 4.4083870083596005e-06, + "loss": 0.1451, + "step": 41553 + }, + { + "epoch": 2.1636190992946283, + "grad_norm": 4.302794456481934, + "learning_rate": 4.408249965739345e-06, + "loss": 0.1488, + "step": 41554 + }, + { + "epoch": 2.163632664134563, + "grad_norm": 3.8977062702178955, + "learning_rate": 4.408112923119091e-06, + "loss": 0.1512, + "step": 41555 + }, + { + "epoch": 2.163646228974498, + "grad_norm": 6.4613871574401855, + "learning_rate": 4.407975880498835e-06, + "loss": 0.2512, + "step": 41556 + }, + { + "epoch": 2.163659793814433, + "grad_norm": 4.533210277557373, + "learning_rate": 4.40783883787858e-06, + "loss": 0.1808, + "step": 41557 + }, + { + "epoch": 2.1636733586543677, + "grad_norm": 3.89740252494812, + "learning_rate": 4.4077017952583256e-06, + "loss": 0.1366, + "step": 41558 + }, + { + "epoch": 2.1636869234943026, + "grad_norm": 4.146183013916016, + "learning_rate": 4.407564752638071e-06, + "loss": 0.2198, + "step": 41559 + }, + { + "epoch": 2.1637004883342374, + "grad_norm": 4.806670188903809, + "learning_rate": 4.407427710017816e-06, + "loss": 0.2369, + "step": 41560 + }, + { + "epoch": 2.1637140531741723, + "grad_norm": 5.246286392211914, + "learning_rate": 4.407290667397561e-06, + "loss": 0.1747, + "step": 41561 + }, + { + "epoch": 2.1637276180141076, + "grad_norm": 4.248237609863281, + "learning_rate": 4.407153624777306e-06, + "loss": 0.1442, + "step": 41562 + }, + { + "epoch": 2.1637411828540425, + "grad_norm": 3.6125380992889404, + "learning_rate": 4.407016582157051e-06, + "loss": 0.1409, + "step": 41563 + }, + { + "epoch": 2.1637547476939774, + "grad_norm": 3.9174396991729736, + "learning_rate": 4.406879539536797e-06, + "loss": 0.1737, + "step": 41564 + }, + { + "epoch": 2.163768312533912, + "grad_norm": 5.414877414703369, + "learning_rate": 4.406742496916541e-06, + "loss": 0.2156, + "step": 41565 + }, + { + "epoch": 2.163781877373847, + "grad_norm": 5.816850185394287, + "learning_rate": 4.406605454296287e-06, + "loss": 0.2644, + "step": 41566 + }, + { + "epoch": 2.163795442213782, + "grad_norm": 4.806951522827148, + "learning_rate": 4.406468411676031e-06, + "loss": 0.2216, + "step": 41567 + }, + { + "epoch": 2.163809007053717, + "grad_norm": 4.4613037109375, + "learning_rate": 4.4063313690557765e-06, + "loss": 0.161, + "step": 41568 + }, + { + "epoch": 2.1638225718936517, + "grad_norm": 3.521843910217285, + "learning_rate": 4.406194326435522e-06, + "loss": 0.106, + "step": 41569 + }, + { + "epoch": 2.1638361367335865, + "grad_norm": 4.573376655578613, + "learning_rate": 4.406057283815267e-06, + "loss": 0.1529, + "step": 41570 + }, + { + "epoch": 2.1638497015735214, + "grad_norm": 4.0192551612854, + "learning_rate": 4.405920241195012e-06, + "loss": 0.1302, + "step": 41571 + }, + { + "epoch": 2.1638632664134563, + "grad_norm": 4.448795318603516, + "learning_rate": 4.405783198574757e-06, + "loss": 0.092, + "step": 41572 + }, + { + "epoch": 2.163876831253391, + "grad_norm": 3.674237012863159, + "learning_rate": 4.405646155954502e-06, + "loss": 0.1234, + "step": 41573 + }, + { + "epoch": 2.163890396093326, + "grad_norm": 4.731049060821533, + "learning_rate": 4.405509113334248e-06, + "loss": 0.189, + "step": 41574 + }, + { + "epoch": 2.163903960933261, + "grad_norm": 4.587404251098633, + "learning_rate": 4.405372070713993e-06, + "loss": 0.1691, + "step": 41575 + }, + { + "epoch": 2.1639175257731957, + "grad_norm": 3.841046094894409, + "learning_rate": 4.405235028093737e-06, + "loss": 0.1972, + "step": 41576 + }, + { + "epoch": 2.1639310906131306, + "grad_norm": 4.922031879425049, + "learning_rate": 4.405097985473482e-06, + "loss": 0.2118, + "step": 41577 + }, + { + "epoch": 2.1639446554530655, + "grad_norm": 3.6651194095611572, + "learning_rate": 4.4049609428532275e-06, + "loss": 0.1238, + "step": 41578 + }, + { + "epoch": 2.163958220293001, + "grad_norm": 5.142152786254883, + "learning_rate": 4.404823900232973e-06, + "loss": 0.1722, + "step": 41579 + }, + { + "epoch": 2.1639717851329356, + "grad_norm": 4.916329860687256, + "learning_rate": 4.404686857612718e-06, + "loss": 0.2508, + "step": 41580 + }, + { + "epoch": 2.1639853499728705, + "grad_norm": 5.438999652862549, + "learning_rate": 4.404549814992463e-06, + "loss": 0.1556, + "step": 41581 + }, + { + "epoch": 2.1639989148128054, + "grad_norm": 2.2511415481567383, + "learning_rate": 4.404412772372208e-06, + "loss": 0.0448, + "step": 41582 + }, + { + "epoch": 2.1640124796527402, + "grad_norm": 7.888477802276611, + "learning_rate": 4.404275729751953e-06, + "loss": 0.237, + "step": 41583 + }, + { + "epoch": 2.164026044492675, + "grad_norm": 4.920455455780029, + "learning_rate": 4.4041386871316985e-06, + "loss": 0.1609, + "step": 41584 + }, + { + "epoch": 2.16403960933261, + "grad_norm": 3.844498634338379, + "learning_rate": 4.404001644511444e-06, + "loss": 0.0962, + "step": 41585 + }, + { + "epoch": 2.164053174172545, + "grad_norm": 3.1199190616607666, + "learning_rate": 4.403864601891189e-06, + "loss": 0.11, + "step": 41586 + }, + { + "epoch": 2.1640667390124797, + "grad_norm": 4.272796154022217, + "learning_rate": 4.403727559270934e-06, + "loss": 0.1869, + "step": 41587 + }, + { + "epoch": 2.1640803038524146, + "grad_norm": 6.235702037811279, + "learning_rate": 4.403590516650678e-06, + "loss": 0.2742, + "step": 41588 + }, + { + "epoch": 2.1640938686923494, + "grad_norm": 3.8081629276275635, + "learning_rate": 4.403453474030424e-06, + "loss": 0.1018, + "step": 41589 + }, + { + "epoch": 2.1641074335322843, + "grad_norm": 3.59513521194458, + "learning_rate": 4.403316431410169e-06, + "loss": 0.0661, + "step": 41590 + }, + { + "epoch": 2.164120998372219, + "grad_norm": 4.109064102172852, + "learning_rate": 4.403179388789914e-06, + "loss": 0.1389, + "step": 41591 + }, + { + "epoch": 2.164134563212154, + "grad_norm": 4.735226631164551, + "learning_rate": 4.403042346169659e-06, + "loss": 0.1709, + "step": 41592 + }, + { + "epoch": 2.164148128052089, + "grad_norm": 4.57391357421875, + "learning_rate": 4.402905303549404e-06, + "loss": 0.1603, + "step": 41593 + }, + { + "epoch": 2.1641616928920238, + "grad_norm": 7.820940971374512, + "learning_rate": 4.4027682609291495e-06, + "loss": 0.2385, + "step": 41594 + }, + { + "epoch": 2.1641752577319586, + "grad_norm": 5.487286567687988, + "learning_rate": 4.402631218308895e-06, + "loss": 0.2055, + "step": 41595 + }, + { + "epoch": 2.1641888225718935, + "grad_norm": 3.423264980316162, + "learning_rate": 4.40249417568864e-06, + "loss": 0.0927, + "step": 41596 + }, + { + "epoch": 2.1642023874118284, + "grad_norm": 3.2941081523895264, + "learning_rate": 4.402357133068384e-06, + "loss": 0.0866, + "step": 41597 + }, + { + "epoch": 2.164215952251763, + "grad_norm": 3.849332094192505, + "learning_rate": 4.40222009044813e-06, + "loss": 0.0826, + "step": 41598 + }, + { + "epoch": 2.164229517091698, + "grad_norm": 4.17669677734375, + "learning_rate": 4.4020830478278745e-06, + "loss": 0.1281, + "step": 41599 + }, + { + "epoch": 2.1642430819316334, + "grad_norm": 3.5636463165283203, + "learning_rate": 4.4019460052076205e-06, + "loss": 0.0908, + "step": 41600 + }, + { + "epoch": 2.1642566467715683, + "grad_norm": 2.9665963649749756, + "learning_rate": 4.401808962587365e-06, + "loss": 0.1169, + "step": 41601 + }, + { + "epoch": 2.164270211611503, + "grad_norm": 4.450723648071289, + "learning_rate": 4.40167191996711e-06, + "loss": 0.1506, + "step": 41602 + }, + { + "epoch": 2.164283776451438, + "grad_norm": 4.304983615875244, + "learning_rate": 4.401534877346855e-06, + "loss": 0.1144, + "step": 41603 + }, + { + "epoch": 2.164297341291373, + "grad_norm": 4.5725274085998535, + "learning_rate": 4.4013978347266e-06, + "loss": 0.1506, + "step": 41604 + }, + { + "epoch": 2.1643109061313077, + "grad_norm": 3.402808904647827, + "learning_rate": 4.401260792106346e-06, + "loss": 0.1398, + "step": 41605 + }, + { + "epoch": 2.1643244709712426, + "grad_norm": 3.2663016319274902, + "learning_rate": 4.40112374948609e-06, + "loss": 0.1216, + "step": 41606 + }, + { + "epoch": 2.1643380358111775, + "grad_norm": 3.8730452060699463, + "learning_rate": 4.400986706865836e-06, + "loss": 0.1093, + "step": 41607 + }, + { + "epoch": 2.1643516006511123, + "grad_norm": 4.975002288818359, + "learning_rate": 4.40084966424558e-06, + "loss": 0.1675, + "step": 41608 + }, + { + "epoch": 2.164365165491047, + "grad_norm": 3.5796375274658203, + "learning_rate": 4.400712621625326e-06, + "loss": 0.1552, + "step": 41609 + }, + { + "epoch": 2.164378730330982, + "grad_norm": 4.965854167938232, + "learning_rate": 4.400575579005071e-06, + "loss": 0.1821, + "step": 41610 + }, + { + "epoch": 2.164392295170917, + "grad_norm": 4.074633598327637, + "learning_rate": 4.400438536384816e-06, + "loss": 0.1393, + "step": 41611 + }, + { + "epoch": 2.164405860010852, + "grad_norm": 4.104666709899902, + "learning_rate": 4.400301493764561e-06, + "loss": 0.1566, + "step": 41612 + }, + { + "epoch": 2.1644194248507866, + "grad_norm": 3.0910682678222656, + "learning_rate": 4.400164451144306e-06, + "loss": 0.1553, + "step": 41613 + }, + { + "epoch": 2.1644329896907215, + "grad_norm": 4.22892951965332, + "learning_rate": 4.400027408524051e-06, + "loss": 0.202, + "step": 41614 + }, + { + "epoch": 2.1644465545306564, + "grad_norm": 3.6594319343566895, + "learning_rate": 4.3998903659037965e-06, + "loss": 0.1741, + "step": 41615 + }, + { + "epoch": 2.1644601193705912, + "grad_norm": 3.5079293251037598, + "learning_rate": 4.399753323283542e-06, + "loss": 0.1184, + "step": 41616 + }, + { + "epoch": 2.1644736842105265, + "grad_norm": 2.7831366062164307, + "learning_rate": 4.399616280663286e-06, + "loss": 0.0921, + "step": 41617 + }, + { + "epoch": 2.1644872490504614, + "grad_norm": 4.377305030822754, + "learning_rate": 4.399479238043032e-06, + "loss": 0.1673, + "step": 41618 + }, + { + "epoch": 2.1645008138903963, + "grad_norm": 4.791036128997803, + "learning_rate": 4.399342195422776e-06, + "loss": 0.1882, + "step": 41619 + }, + { + "epoch": 2.164514378730331, + "grad_norm": 4.675565242767334, + "learning_rate": 4.3992051528025224e-06, + "loss": 0.1909, + "step": 41620 + }, + { + "epoch": 2.164527943570266, + "grad_norm": 5.8285112380981445, + "learning_rate": 4.399068110182267e-06, + "loss": 0.1617, + "step": 41621 + }, + { + "epoch": 2.164541508410201, + "grad_norm": 4.16901159286499, + "learning_rate": 4.398931067562012e-06, + "loss": 0.178, + "step": 41622 + }, + { + "epoch": 2.1645550732501357, + "grad_norm": 5.735025405883789, + "learning_rate": 4.398794024941757e-06, + "loss": 0.2302, + "step": 41623 + }, + { + "epoch": 2.1645686380900706, + "grad_norm": 4.864429473876953, + "learning_rate": 4.398656982321502e-06, + "loss": 0.2026, + "step": 41624 + }, + { + "epoch": 2.1645822029300055, + "grad_norm": 5.0827250480651855, + "learning_rate": 4.3985199397012475e-06, + "loss": 0.1934, + "step": 41625 + }, + { + "epoch": 2.1645957677699403, + "grad_norm": 3.8315396308898926, + "learning_rate": 4.398382897080993e-06, + "loss": 0.0869, + "step": 41626 + }, + { + "epoch": 2.164609332609875, + "grad_norm": 4.6978325843811035, + "learning_rate": 4.398245854460738e-06, + "loss": 0.1639, + "step": 41627 + }, + { + "epoch": 2.16462289744981, + "grad_norm": 5.1233110427856445, + "learning_rate": 4.398108811840483e-06, + "loss": 0.18, + "step": 41628 + }, + { + "epoch": 2.164636462289745, + "grad_norm": 4.144281387329102, + "learning_rate": 4.397971769220228e-06, + "loss": 0.0986, + "step": 41629 + }, + { + "epoch": 2.16465002712968, + "grad_norm": 4.428713798522949, + "learning_rate": 4.3978347265999725e-06, + "loss": 0.1287, + "step": 41630 + }, + { + "epoch": 2.1646635919696147, + "grad_norm": 4.7869672775268555, + "learning_rate": 4.397697683979718e-06, + "loss": 0.1906, + "step": 41631 + }, + { + "epoch": 2.1646771568095495, + "grad_norm": 4.849539279937744, + "learning_rate": 4.397560641359463e-06, + "loss": 0.1442, + "step": 41632 + }, + { + "epoch": 2.1646907216494844, + "grad_norm": 4.296658039093018, + "learning_rate": 4.397423598739208e-06, + "loss": 0.1175, + "step": 41633 + }, + { + "epoch": 2.1647042864894193, + "grad_norm": 3.527040958404541, + "learning_rate": 4.397286556118953e-06, + "loss": 0.1207, + "step": 41634 + }, + { + "epoch": 2.164717851329354, + "grad_norm": 4.066212177276611, + "learning_rate": 4.397149513498698e-06, + "loss": 0.1321, + "step": 41635 + }, + { + "epoch": 2.164731416169289, + "grad_norm": 5.286956310272217, + "learning_rate": 4.397012470878444e-06, + "loss": 0.1876, + "step": 41636 + }, + { + "epoch": 2.164744981009224, + "grad_norm": 4.6222825050354, + "learning_rate": 4.396875428258189e-06, + "loss": 0.1979, + "step": 41637 + }, + { + "epoch": 2.164758545849159, + "grad_norm": 5.2135138511657715, + "learning_rate": 4.396738385637934e-06, + "loss": 0.2306, + "step": 41638 + }, + { + "epoch": 2.164772110689094, + "grad_norm": 4.039884090423584, + "learning_rate": 4.396601343017679e-06, + "loss": 0.142, + "step": 41639 + }, + { + "epoch": 2.164785675529029, + "grad_norm": 3.5413529872894287, + "learning_rate": 4.3964643003974235e-06, + "loss": 0.1166, + "step": 41640 + }, + { + "epoch": 2.1647992403689638, + "grad_norm": 4.225872993469238, + "learning_rate": 4.3963272577771695e-06, + "loss": 0.128, + "step": 41641 + }, + { + "epoch": 2.1648128052088986, + "grad_norm": 4.258130073547363, + "learning_rate": 4.396190215156914e-06, + "loss": 0.2433, + "step": 41642 + }, + { + "epoch": 2.1648263700488335, + "grad_norm": 4.315573215484619, + "learning_rate": 4.39605317253666e-06, + "loss": 0.1425, + "step": 41643 + }, + { + "epoch": 2.1648399348887684, + "grad_norm": 3.5404038429260254, + "learning_rate": 4.395916129916404e-06, + "loss": 0.1019, + "step": 41644 + }, + { + "epoch": 2.1648534997287032, + "grad_norm": 4.1312384605407715, + "learning_rate": 4.395779087296149e-06, + "loss": 0.0976, + "step": 41645 + }, + { + "epoch": 2.164867064568638, + "grad_norm": 4.029236793518066, + "learning_rate": 4.3956420446758945e-06, + "loss": 0.1371, + "step": 41646 + }, + { + "epoch": 2.164880629408573, + "grad_norm": 4.844285488128662, + "learning_rate": 4.39550500205564e-06, + "loss": 0.1801, + "step": 41647 + }, + { + "epoch": 2.164894194248508, + "grad_norm": 4.4075822830200195, + "learning_rate": 4.395367959435385e-06, + "loss": 0.1912, + "step": 41648 + }, + { + "epoch": 2.1649077590884427, + "grad_norm": 3.514671564102173, + "learning_rate": 4.39523091681513e-06, + "loss": 0.1124, + "step": 41649 + }, + { + "epoch": 2.1649213239283776, + "grad_norm": 5.044668197631836, + "learning_rate": 4.395093874194875e-06, + "loss": 0.1445, + "step": 41650 + }, + { + "epoch": 2.1649348887683124, + "grad_norm": 3.216989040374756, + "learning_rate": 4.39495683157462e-06, + "loss": 0.0782, + "step": 41651 + }, + { + "epoch": 2.1649484536082473, + "grad_norm": 5.793186187744141, + "learning_rate": 4.394819788954366e-06, + "loss": 0.1084, + "step": 41652 + }, + { + "epoch": 2.164962018448182, + "grad_norm": 4.6939496994018555, + "learning_rate": 4.39468274633411e-06, + "loss": 0.1866, + "step": 41653 + }, + { + "epoch": 2.164975583288117, + "grad_norm": 7.977550029754639, + "learning_rate": 4.394545703713856e-06, + "loss": 0.159, + "step": 41654 + }, + { + "epoch": 2.1649891481280523, + "grad_norm": 4.68395471572876, + "learning_rate": 4.3944086610936e-06, + "loss": 0.1175, + "step": 41655 + }, + { + "epoch": 2.165002712967987, + "grad_norm": 5.066171169281006, + "learning_rate": 4.3942716184733455e-06, + "loss": 0.1548, + "step": 41656 + }, + { + "epoch": 2.165016277807922, + "grad_norm": 4.3365654945373535, + "learning_rate": 4.394134575853091e-06, + "loss": 0.1485, + "step": 41657 + }, + { + "epoch": 2.165029842647857, + "grad_norm": 5.348918914794922, + "learning_rate": 4.393997533232836e-06, + "loss": 0.1331, + "step": 41658 + }, + { + "epoch": 2.165043407487792, + "grad_norm": 4.428699016571045, + "learning_rate": 4.393860490612581e-06, + "loss": 0.1651, + "step": 41659 + }, + { + "epoch": 2.1650569723277266, + "grad_norm": 5.363037586212158, + "learning_rate": 4.393723447992325e-06, + "loss": 0.1798, + "step": 41660 + }, + { + "epoch": 2.1650705371676615, + "grad_norm": 5.099308013916016, + "learning_rate": 4.393586405372071e-06, + "loss": 0.1512, + "step": 41661 + }, + { + "epoch": 2.1650841020075964, + "grad_norm": 3.9870667457580566, + "learning_rate": 4.393449362751816e-06, + "loss": 0.1229, + "step": 41662 + }, + { + "epoch": 2.1650976668475312, + "grad_norm": 5.8625874519348145, + "learning_rate": 4.393312320131562e-06, + "loss": 0.1928, + "step": 41663 + }, + { + "epoch": 2.165111231687466, + "grad_norm": 3.529818534851074, + "learning_rate": 4.393175277511306e-06, + "loss": 0.1098, + "step": 41664 + }, + { + "epoch": 2.165124796527401, + "grad_norm": 3.9525885581970215, + "learning_rate": 4.393038234891051e-06, + "loss": 0.1182, + "step": 41665 + }, + { + "epoch": 2.165138361367336, + "grad_norm": 3.848604679107666, + "learning_rate": 4.3929011922707964e-06, + "loss": 0.1301, + "step": 41666 + }, + { + "epoch": 2.1651519262072707, + "grad_norm": 4.405102729797363, + "learning_rate": 4.392764149650542e-06, + "loss": 0.1251, + "step": 41667 + }, + { + "epoch": 2.1651654910472056, + "grad_norm": 3.1575582027435303, + "learning_rate": 4.392627107030287e-06, + "loss": 0.0812, + "step": 41668 + }, + { + "epoch": 2.1651790558871404, + "grad_norm": 4.960757732391357, + "learning_rate": 4.392490064410032e-06, + "loss": 0.192, + "step": 41669 + }, + { + "epoch": 2.1651926207270753, + "grad_norm": 4.743580341339111, + "learning_rate": 4.392353021789777e-06, + "loss": 0.1848, + "step": 41670 + }, + { + "epoch": 2.16520618556701, + "grad_norm": 3.6558454036712646, + "learning_rate": 4.3922159791695215e-06, + "loss": 0.2309, + "step": 41671 + }, + { + "epoch": 2.165219750406945, + "grad_norm": 3.0383505821228027, + "learning_rate": 4.3920789365492675e-06, + "loss": 0.0568, + "step": 41672 + }, + { + "epoch": 2.16523331524688, + "grad_norm": 4.438817024230957, + "learning_rate": 4.391941893929012e-06, + "loss": 0.1495, + "step": 41673 + }, + { + "epoch": 2.1652468800868148, + "grad_norm": 4.7514166831970215, + "learning_rate": 4.391804851308758e-06, + "loss": 0.1246, + "step": 41674 + }, + { + "epoch": 2.16526044492675, + "grad_norm": 5.857730865478516, + "learning_rate": 4.391667808688502e-06, + "loss": 0.2066, + "step": 41675 + }, + { + "epoch": 2.165274009766685, + "grad_norm": 5.642843246459961, + "learning_rate": 4.391530766068247e-06, + "loss": 0.1935, + "step": 41676 + }, + { + "epoch": 2.16528757460662, + "grad_norm": 4.83952522277832, + "learning_rate": 4.3913937234479925e-06, + "loss": 0.1499, + "step": 41677 + }, + { + "epoch": 2.1653011394465547, + "grad_norm": 3.8805313110351562, + "learning_rate": 4.391256680827738e-06, + "loss": 0.1008, + "step": 41678 + }, + { + "epoch": 2.1653147042864895, + "grad_norm": 5.620236873626709, + "learning_rate": 4.391119638207483e-06, + "loss": 0.1883, + "step": 41679 + }, + { + "epoch": 2.1653282691264244, + "grad_norm": 3.419121265411377, + "learning_rate": 4.390982595587228e-06, + "loss": 0.1694, + "step": 41680 + }, + { + "epoch": 2.1653418339663593, + "grad_norm": 4.7421464920043945, + "learning_rate": 4.390845552966973e-06, + "loss": 0.1522, + "step": 41681 + }, + { + "epoch": 2.165355398806294, + "grad_norm": 5.847919940948486, + "learning_rate": 4.3907085103467184e-06, + "loss": 0.1792, + "step": 41682 + }, + { + "epoch": 2.165368963646229, + "grad_norm": 7.7407612800598145, + "learning_rate": 4.390571467726464e-06, + "loss": 0.2452, + "step": 41683 + }, + { + "epoch": 2.165382528486164, + "grad_norm": 3.392404794692993, + "learning_rate": 4.390434425106209e-06, + "loss": 0.1294, + "step": 41684 + }, + { + "epoch": 2.1653960933260987, + "grad_norm": 3.8653597831726074, + "learning_rate": 4.390297382485953e-06, + "loss": 0.1088, + "step": 41685 + }, + { + "epoch": 2.1654096581660336, + "grad_norm": 3.63242769241333, + "learning_rate": 4.390160339865698e-06, + "loss": 0.1139, + "step": 41686 + }, + { + "epoch": 2.1654232230059685, + "grad_norm": 4.272861957550049, + "learning_rate": 4.3900232972454435e-06, + "loss": 0.1124, + "step": 41687 + }, + { + "epoch": 2.1654367878459033, + "grad_norm": 4.173971176147461, + "learning_rate": 4.389886254625189e-06, + "loss": 0.1797, + "step": 41688 + }, + { + "epoch": 2.165450352685838, + "grad_norm": 5.1347832679748535, + "learning_rate": 4.389749212004934e-06, + "loss": 0.2791, + "step": 41689 + }, + { + "epoch": 2.165463917525773, + "grad_norm": 5.116481781005859, + "learning_rate": 4.389612169384679e-06, + "loss": 0.1675, + "step": 41690 + }, + { + "epoch": 2.165477482365708, + "grad_norm": 5.535745620727539, + "learning_rate": 4.389475126764424e-06, + "loss": 0.2575, + "step": 41691 + }, + { + "epoch": 2.165491047205643, + "grad_norm": 4.5507330894470215, + "learning_rate": 4.389338084144169e-06, + "loss": 0.17, + "step": 41692 + }, + { + "epoch": 2.165504612045578, + "grad_norm": 5.604084491729736, + "learning_rate": 4.3892010415239146e-06, + "loss": 0.1963, + "step": 41693 + }, + { + "epoch": 2.165518176885513, + "grad_norm": 4.739617347717285, + "learning_rate": 4.389063998903659e-06, + "loss": 0.2119, + "step": 41694 + }, + { + "epoch": 2.165531741725448, + "grad_norm": 4.4349517822265625, + "learning_rate": 4.388926956283405e-06, + "loss": 0.1932, + "step": 41695 + }, + { + "epoch": 2.1655453065653827, + "grad_norm": 4.697606086730957, + "learning_rate": 4.388789913663149e-06, + "loss": 0.1737, + "step": 41696 + }, + { + "epoch": 2.1655588714053176, + "grad_norm": 5.7096099853515625, + "learning_rate": 4.388652871042895e-06, + "loss": 0.1547, + "step": 41697 + }, + { + "epoch": 2.1655724362452524, + "grad_norm": 5.9656548500061035, + "learning_rate": 4.38851582842264e-06, + "loss": 0.1852, + "step": 41698 + }, + { + "epoch": 2.1655860010851873, + "grad_norm": 4.085543155670166, + "learning_rate": 4.388378785802385e-06, + "loss": 0.1358, + "step": 41699 + }, + { + "epoch": 2.165599565925122, + "grad_norm": 3.579864263534546, + "learning_rate": 4.38824174318213e-06, + "loss": 0.1271, + "step": 41700 + }, + { + "epoch": 2.165613130765057, + "grad_norm": 5.259249210357666, + "learning_rate": 4.388104700561875e-06, + "loss": 0.2367, + "step": 41701 + }, + { + "epoch": 2.165626695604992, + "grad_norm": 5.697643756866455, + "learning_rate": 4.38796765794162e-06, + "loss": 0.1859, + "step": 41702 + }, + { + "epoch": 2.1656402604449267, + "grad_norm": 4.920681476593018, + "learning_rate": 4.3878306153213655e-06, + "loss": 0.2287, + "step": 41703 + }, + { + "epoch": 2.1656538252848616, + "grad_norm": 3.711871385574341, + "learning_rate": 4.387693572701111e-06, + "loss": 0.141, + "step": 41704 + }, + { + "epoch": 2.1656673901247965, + "grad_norm": 4.510064601898193, + "learning_rate": 4.387556530080855e-06, + "loss": 0.2108, + "step": 41705 + }, + { + "epoch": 2.1656809549647313, + "grad_norm": 4.5120015144348145, + "learning_rate": 4.387419487460601e-06, + "loss": 0.1525, + "step": 41706 + }, + { + "epoch": 2.165694519804666, + "grad_norm": 4.309267520904541, + "learning_rate": 4.387282444840345e-06, + "loss": 0.1596, + "step": 41707 + }, + { + "epoch": 2.165708084644601, + "grad_norm": 5.383986949920654, + "learning_rate": 4.387145402220091e-06, + "loss": 0.1827, + "step": 41708 + }, + { + "epoch": 2.165721649484536, + "grad_norm": 4.687148571014404, + "learning_rate": 4.387008359599836e-06, + "loss": 0.0573, + "step": 41709 + }, + { + "epoch": 2.165735214324471, + "grad_norm": 2.999239683151245, + "learning_rate": 4.386871316979581e-06, + "loss": 0.1031, + "step": 41710 + }, + { + "epoch": 2.1657487791644057, + "grad_norm": 6.802267074584961, + "learning_rate": 4.386734274359326e-06, + "loss": 0.1963, + "step": 41711 + }, + { + "epoch": 2.1657623440043405, + "grad_norm": 4.470213413238525, + "learning_rate": 4.386597231739071e-06, + "loss": 0.1717, + "step": 41712 + }, + { + "epoch": 2.165775908844276, + "grad_norm": 5.698459148406982, + "learning_rate": 4.3864601891188164e-06, + "loss": 0.2268, + "step": 41713 + }, + { + "epoch": 2.1657894736842107, + "grad_norm": 5.336305618286133, + "learning_rate": 4.386323146498561e-06, + "loss": 0.1212, + "step": 41714 + }, + { + "epoch": 2.1658030385241456, + "grad_norm": 4.607823371887207, + "learning_rate": 4.386186103878307e-06, + "loss": 0.1907, + "step": 41715 + }, + { + "epoch": 2.1658166033640804, + "grad_norm": 4.137301445007324, + "learning_rate": 4.386049061258051e-06, + "loss": 0.1636, + "step": 41716 + }, + { + "epoch": 2.1658301682040153, + "grad_norm": 5.2667741775512695, + "learning_rate": 4.385912018637797e-06, + "loss": 0.2074, + "step": 41717 + }, + { + "epoch": 2.16584373304395, + "grad_norm": 4.391490936279297, + "learning_rate": 4.3857749760175415e-06, + "loss": 0.1458, + "step": 41718 + }, + { + "epoch": 2.165857297883885, + "grad_norm": 4.487491607666016, + "learning_rate": 4.385637933397287e-06, + "loss": 0.1377, + "step": 41719 + }, + { + "epoch": 2.16587086272382, + "grad_norm": 4.719962120056152, + "learning_rate": 4.385500890777032e-06, + "loss": 0.0981, + "step": 41720 + }, + { + "epoch": 2.1658844275637548, + "grad_norm": 5.108359336853027, + "learning_rate": 4.385363848156777e-06, + "loss": 0.2068, + "step": 41721 + }, + { + "epoch": 2.1658979924036896, + "grad_norm": 5.8033599853515625, + "learning_rate": 4.385226805536522e-06, + "loss": 0.2463, + "step": 41722 + }, + { + "epoch": 2.1659115572436245, + "grad_norm": 5.449422836303711, + "learning_rate": 4.385089762916267e-06, + "loss": 0.1898, + "step": 41723 + }, + { + "epoch": 2.1659251220835594, + "grad_norm": 3.7931604385375977, + "learning_rate": 4.3849527202960126e-06, + "loss": 0.1644, + "step": 41724 + }, + { + "epoch": 2.1659386869234942, + "grad_norm": 4.725808620452881, + "learning_rate": 4.384815677675758e-06, + "loss": 0.187, + "step": 41725 + }, + { + "epoch": 2.165952251763429, + "grad_norm": 6.418275833129883, + "learning_rate": 4.384678635055503e-06, + "loss": 0.3497, + "step": 41726 + }, + { + "epoch": 2.165965816603364, + "grad_norm": 7.0911865234375, + "learning_rate": 4.384541592435247e-06, + "loss": 0.2495, + "step": 41727 + }, + { + "epoch": 2.165979381443299, + "grad_norm": 8.706093788146973, + "learning_rate": 4.384404549814993e-06, + "loss": 0.465, + "step": 41728 + }, + { + "epoch": 2.1659929462832337, + "grad_norm": 4.0496039390563965, + "learning_rate": 4.384267507194738e-06, + "loss": 0.1631, + "step": 41729 + }, + { + "epoch": 2.1660065111231686, + "grad_norm": 4.075755596160889, + "learning_rate": 4.384130464574483e-06, + "loss": 0.094, + "step": 41730 + }, + { + "epoch": 2.166020075963104, + "grad_norm": 3.584101915359497, + "learning_rate": 4.383993421954228e-06, + "loss": 0.139, + "step": 41731 + }, + { + "epoch": 2.1660336408030387, + "grad_norm": 5.963916301727295, + "learning_rate": 4.383856379333973e-06, + "loss": 0.206, + "step": 41732 + }, + { + "epoch": 2.1660472056429736, + "grad_norm": 5.407899856567383, + "learning_rate": 4.383719336713718e-06, + "loss": 0.1816, + "step": 41733 + }, + { + "epoch": 2.1660607704829085, + "grad_norm": 7.785951614379883, + "learning_rate": 4.3835822940934635e-06, + "loss": 0.3496, + "step": 41734 + }, + { + "epoch": 2.1660743353228433, + "grad_norm": 5.196602821350098, + "learning_rate": 4.383445251473209e-06, + "loss": 0.168, + "step": 41735 + }, + { + "epoch": 2.166087900162778, + "grad_norm": 5.483798503875732, + "learning_rate": 4.383308208852954e-06, + "loss": 0.1605, + "step": 41736 + }, + { + "epoch": 2.166101465002713, + "grad_norm": 5.28062629699707, + "learning_rate": 4.383171166232699e-06, + "loss": 0.1793, + "step": 41737 + }, + { + "epoch": 2.166115029842648, + "grad_norm": 3.8499538898468018, + "learning_rate": 4.383034123612444e-06, + "loss": 0.1169, + "step": 41738 + }, + { + "epoch": 2.166128594682583, + "grad_norm": 4.771034240722656, + "learning_rate": 4.3828970809921886e-06, + "loss": 0.2092, + "step": 41739 + }, + { + "epoch": 2.1661421595225177, + "grad_norm": 5.7758588790893555, + "learning_rate": 4.382760038371934e-06, + "loss": 0.1593, + "step": 41740 + }, + { + "epoch": 2.1661557243624525, + "grad_norm": 4.039862155914307, + "learning_rate": 4.382622995751679e-06, + "loss": 0.1331, + "step": 41741 + }, + { + "epoch": 2.1661692892023874, + "grad_norm": 5.603295803070068, + "learning_rate": 4.382485953131424e-06, + "loss": 0.1783, + "step": 41742 + }, + { + "epoch": 2.1661828540423222, + "grad_norm": 4.321252822875977, + "learning_rate": 4.382348910511169e-06, + "loss": 0.165, + "step": 41743 + }, + { + "epoch": 2.166196418882257, + "grad_norm": 4.906772136688232, + "learning_rate": 4.3822118678909144e-06, + "loss": 0.111, + "step": 41744 + }, + { + "epoch": 2.166209983722192, + "grad_norm": 4.712150573730469, + "learning_rate": 4.38207482527066e-06, + "loss": 0.1618, + "step": 41745 + }, + { + "epoch": 2.166223548562127, + "grad_norm": 4.899378776550293, + "learning_rate": 4.381937782650405e-06, + "loss": 0.1655, + "step": 41746 + }, + { + "epoch": 2.1662371134020617, + "grad_norm": 4.792664527893066, + "learning_rate": 4.38180074003015e-06, + "loss": 0.1477, + "step": 41747 + }, + { + "epoch": 2.1662506782419966, + "grad_norm": 5.753117561340332, + "learning_rate": 4.381663697409894e-06, + "loss": 0.222, + "step": 41748 + }, + { + "epoch": 2.1662642430819314, + "grad_norm": 6.431140899658203, + "learning_rate": 4.38152665478964e-06, + "loss": 0.203, + "step": 41749 + }, + { + "epoch": 2.1662778079218663, + "grad_norm": 4.898826599121094, + "learning_rate": 4.381389612169385e-06, + "loss": 0.2163, + "step": 41750 + }, + { + "epoch": 2.1662913727618016, + "grad_norm": 5.853400230407715, + "learning_rate": 4.381252569549131e-06, + "loss": 0.2351, + "step": 41751 + }, + { + "epoch": 2.1663049376017365, + "grad_norm": 5.751056671142578, + "learning_rate": 4.381115526928875e-06, + "loss": 0.3204, + "step": 41752 + }, + { + "epoch": 2.1663185024416713, + "grad_norm": 5.973260879516602, + "learning_rate": 4.38097848430862e-06, + "loss": 0.2367, + "step": 41753 + }, + { + "epoch": 2.166332067281606, + "grad_norm": 6.553611755371094, + "learning_rate": 4.380841441688365e-06, + "loss": 0.2541, + "step": 41754 + }, + { + "epoch": 2.166345632121541, + "grad_norm": 4.950822353363037, + "learning_rate": 4.3807043990681106e-06, + "loss": 0.1217, + "step": 41755 + }, + { + "epoch": 2.166359196961476, + "grad_norm": 4.056710243225098, + "learning_rate": 4.380567356447856e-06, + "loss": 0.1248, + "step": 41756 + }, + { + "epoch": 2.166372761801411, + "grad_norm": 5.17781400680542, + "learning_rate": 4.380430313827601e-06, + "loss": 0.1956, + "step": 41757 + }, + { + "epoch": 2.1663863266413457, + "grad_norm": 5.524517059326172, + "learning_rate": 4.380293271207346e-06, + "loss": 0.2122, + "step": 41758 + }, + { + "epoch": 2.1663998914812805, + "grad_norm": 5.5276875495910645, + "learning_rate": 4.3801562285870904e-06, + "loss": 0.2745, + "step": 41759 + }, + { + "epoch": 2.1664134563212154, + "grad_norm": 4.4076762199401855, + "learning_rate": 4.3800191859668365e-06, + "loss": 0.0957, + "step": 41760 + }, + { + "epoch": 2.1664270211611503, + "grad_norm": 5.630002498626709, + "learning_rate": 4.379882143346581e-06, + "loss": 0.1728, + "step": 41761 + }, + { + "epoch": 2.166440586001085, + "grad_norm": 5.60050630569458, + "learning_rate": 4.379745100726327e-06, + "loss": 0.1905, + "step": 41762 + }, + { + "epoch": 2.16645415084102, + "grad_norm": 5.974519729614258, + "learning_rate": 4.379608058106071e-06, + "loss": 0.3008, + "step": 41763 + }, + { + "epoch": 2.166467715680955, + "grad_norm": 4.89334774017334, + "learning_rate": 4.379471015485816e-06, + "loss": 0.1738, + "step": 41764 + }, + { + "epoch": 2.1664812805208897, + "grad_norm": 3.624826431274414, + "learning_rate": 4.3793339728655615e-06, + "loss": 0.1466, + "step": 41765 + }, + { + "epoch": 2.1664948453608246, + "grad_norm": 4.087583541870117, + "learning_rate": 4.379196930245307e-06, + "loss": 0.1358, + "step": 41766 + }, + { + "epoch": 2.1665084102007595, + "grad_norm": 4.600354194641113, + "learning_rate": 4.379059887625052e-06, + "loss": 0.1896, + "step": 41767 + }, + { + "epoch": 2.1665219750406943, + "grad_norm": 4.2904558181762695, + "learning_rate": 4.378922845004796e-06, + "loss": 0.1195, + "step": 41768 + }, + { + "epoch": 2.1665355398806296, + "grad_norm": 5.020083427429199, + "learning_rate": 4.378785802384542e-06, + "loss": 0.2324, + "step": 41769 + }, + { + "epoch": 2.1665491047205645, + "grad_norm": 6.35986328125, + "learning_rate": 4.3786487597642866e-06, + "loss": 0.2672, + "step": 41770 + }, + { + "epoch": 2.1665626695604994, + "grad_norm": 4.27715539932251, + "learning_rate": 4.378511717144033e-06, + "loss": 0.1425, + "step": 41771 + }, + { + "epoch": 2.1665762344004342, + "grad_norm": 5.966444492340088, + "learning_rate": 4.378374674523777e-06, + "loss": 0.146, + "step": 41772 + }, + { + "epoch": 2.166589799240369, + "grad_norm": 4.671492576599121, + "learning_rate": 4.378237631903522e-06, + "loss": 0.1223, + "step": 41773 + }, + { + "epoch": 2.166603364080304, + "grad_norm": 3.690398693084717, + "learning_rate": 4.378100589283267e-06, + "loss": 0.1281, + "step": 41774 + }, + { + "epoch": 2.166616928920239, + "grad_norm": 4.93913459777832, + "learning_rate": 4.3779635466630125e-06, + "loss": 0.1728, + "step": 41775 + }, + { + "epoch": 2.1666304937601737, + "grad_norm": 4.8863959312438965, + "learning_rate": 4.377826504042758e-06, + "loss": 0.2401, + "step": 41776 + }, + { + "epoch": 2.1666440586001086, + "grad_norm": 4.658235549926758, + "learning_rate": 4.377689461422503e-06, + "loss": 0.1135, + "step": 41777 + }, + { + "epoch": 2.1666576234400434, + "grad_norm": 4.034455299377441, + "learning_rate": 4.377552418802248e-06, + "loss": 0.1261, + "step": 41778 + }, + { + "epoch": 2.1666711882799783, + "grad_norm": 4.067173957824707, + "learning_rate": 4.377415376181993e-06, + "loss": 0.0893, + "step": 41779 + }, + { + "epoch": 2.166684753119913, + "grad_norm": 3.9742486476898193, + "learning_rate": 4.377278333561738e-06, + "loss": 0.1357, + "step": 41780 + }, + { + "epoch": 2.166698317959848, + "grad_norm": 6.5262227058410645, + "learning_rate": 4.3771412909414835e-06, + "loss": 0.2593, + "step": 41781 + }, + { + "epoch": 2.166711882799783, + "grad_norm": 5.068998336791992, + "learning_rate": 4.377004248321228e-06, + "loss": 0.19, + "step": 41782 + }, + { + "epoch": 2.1667254476397177, + "grad_norm": 3.561366558074951, + "learning_rate": 4.376867205700973e-06, + "loss": 0.151, + "step": 41783 + }, + { + "epoch": 2.1667390124796526, + "grad_norm": 6.675706386566162, + "learning_rate": 4.376730163080718e-06, + "loss": 0.1749, + "step": 41784 + }, + { + "epoch": 2.1667525773195875, + "grad_norm": 5.022318363189697, + "learning_rate": 4.376593120460463e-06, + "loss": 0.1526, + "step": 41785 + }, + { + "epoch": 2.1667661421595223, + "grad_norm": 5.621865272521973, + "learning_rate": 4.3764560778402086e-06, + "loss": 0.1615, + "step": 41786 + }, + { + "epoch": 2.166779706999457, + "grad_norm": 3.5605740547180176, + "learning_rate": 4.376319035219954e-06, + "loss": 0.138, + "step": 41787 + }, + { + "epoch": 2.166793271839392, + "grad_norm": 7.333154678344727, + "learning_rate": 4.376181992599699e-06, + "loss": 0.2802, + "step": 41788 + }, + { + "epoch": 2.1668068366793274, + "grad_norm": 3.632802963256836, + "learning_rate": 4.376044949979444e-06, + "loss": 0.1679, + "step": 41789 + }, + { + "epoch": 2.1668204015192623, + "grad_norm": 3.3692188262939453, + "learning_rate": 4.375907907359189e-06, + "loss": 0.155, + "step": 41790 + }, + { + "epoch": 2.166833966359197, + "grad_norm": 5.4169602394104, + "learning_rate": 4.3757708647389345e-06, + "loss": 0.1854, + "step": 41791 + }, + { + "epoch": 2.166847531199132, + "grad_norm": 5.264914035797119, + "learning_rate": 4.37563382211868e-06, + "loss": 0.1553, + "step": 41792 + }, + { + "epoch": 2.166861096039067, + "grad_norm": 5.062135219573975, + "learning_rate": 4.375496779498424e-06, + "loss": 0.1919, + "step": 41793 + }, + { + "epoch": 2.1668746608790017, + "grad_norm": 3.9772017002105713, + "learning_rate": 4.37535973687817e-06, + "loss": 0.1397, + "step": 41794 + }, + { + "epoch": 2.1668882257189366, + "grad_norm": 4.464770793914795, + "learning_rate": 4.375222694257914e-06, + "loss": 0.1524, + "step": 41795 + }, + { + "epoch": 2.1669017905588714, + "grad_norm": 6.162909984588623, + "learning_rate": 4.3750856516376595e-06, + "loss": 0.1292, + "step": 41796 + }, + { + "epoch": 2.1669153553988063, + "grad_norm": 3.8274729251861572, + "learning_rate": 4.374948609017405e-06, + "loss": 0.1158, + "step": 41797 + }, + { + "epoch": 2.166928920238741, + "grad_norm": 4.183036804199219, + "learning_rate": 4.37481156639715e-06, + "loss": 0.1754, + "step": 41798 + }, + { + "epoch": 2.166942485078676, + "grad_norm": 4.4870524406433105, + "learning_rate": 4.374674523776895e-06, + "loss": 0.1745, + "step": 41799 + }, + { + "epoch": 2.166956049918611, + "grad_norm": 4.69572639465332, + "learning_rate": 4.37453748115664e-06, + "loss": 0.1477, + "step": 41800 + }, + { + "epoch": 2.1669696147585458, + "grad_norm": 4.369714736938477, + "learning_rate": 4.374400438536385e-06, + "loss": 0.1253, + "step": 41801 + }, + { + "epoch": 2.1669831795984806, + "grad_norm": 4.809757232666016, + "learning_rate": 4.37426339591613e-06, + "loss": 0.1906, + "step": 41802 + }, + { + "epoch": 2.1669967444384155, + "grad_norm": 7.940199375152588, + "learning_rate": 4.374126353295876e-06, + "loss": 0.2644, + "step": 41803 + }, + { + "epoch": 2.1670103092783504, + "grad_norm": 5.014492988586426, + "learning_rate": 4.37398931067562e-06, + "loss": 0.1891, + "step": 41804 + }, + { + "epoch": 2.1670238741182852, + "grad_norm": 5.737667083740234, + "learning_rate": 4.373852268055366e-06, + "loss": 0.1678, + "step": 41805 + }, + { + "epoch": 2.16703743895822, + "grad_norm": 4.086864471435547, + "learning_rate": 4.3737152254351105e-06, + "loss": 0.1159, + "step": 41806 + }, + { + "epoch": 2.1670510037981554, + "grad_norm": 4.978017807006836, + "learning_rate": 4.373578182814856e-06, + "loss": 0.2332, + "step": 41807 + }, + { + "epoch": 2.1670645686380903, + "grad_norm": 7.489482402801514, + "learning_rate": 4.373441140194601e-06, + "loss": 0.2487, + "step": 41808 + }, + { + "epoch": 2.167078133478025, + "grad_norm": 4.511488914489746, + "learning_rate": 4.373304097574346e-06, + "loss": 0.1727, + "step": 41809 + }, + { + "epoch": 2.16709169831796, + "grad_norm": 5.336158275604248, + "learning_rate": 4.373167054954091e-06, + "loss": 0.2244, + "step": 41810 + }, + { + "epoch": 2.167105263157895, + "grad_norm": 6.650990962982178, + "learning_rate": 4.373030012333836e-06, + "loss": 0.2874, + "step": 41811 + }, + { + "epoch": 2.1671188279978297, + "grad_norm": 6.847167015075684, + "learning_rate": 4.3728929697135815e-06, + "loss": 0.1977, + "step": 41812 + }, + { + "epoch": 2.1671323928377646, + "grad_norm": 7.010214805603027, + "learning_rate": 4.372755927093326e-06, + "loss": 0.2578, + "step": 41813 + }, + { + "epoch": 2.1671459576776995, + "grad_norm": 4.481696605682373, + "learning_rate": 4.372618884473072e-06, + "loss": 0.1592, + "step": 41814 + }, + { + "epoch": 2.1671595225176343, + "grad_norm": 6.193817138671875, + "learning_rate": 4.372481841852816e-06, + "loss": 0.265, + "step": 41815 + }, + { + "epoch": 2.167173087357569, + "grad_norm": 4.808786392211914, + "learning_rate": 4.372344799232562e-06, + "loss": 0.2127, + "step": 41816 + }, + { + "epoch": 2.167186652197504, + "grad_norm": 4.270417213439941, + "learning_rate": 4.372207756612307e-06, + "loss": 0.1644, + "step": 41817 + }, + { + "epoch": 2.167200217037439, + "grad_norm": 3.9692468643188477, + "learning_rate": 4.372070713992052e-06, + "loss": 0.1154, + "step": 41818 + }, + { + "epoch": 2.167213781877374, + "grad_norm": 4.17396879196167, + "learning_rate": 4.371933671371797e-06, + "loss": 0.163, + "step": 41819 + }, + { + "epoch": 2.1672273467173087, + "grad_norm": 3.3148231506347656, + "learning_rate": 4.371796628751542e-06, + "loss": 0.0628, + "step": 41820 + }, + { + "epoch": 2.1672409115572435, + "grad_norm": 4.406295299530029, + "learning_rate": 4.371659586131287e-06, + "loss": 0.2153, + "step": 41821 + }, + { + "epoch": 2.1672544763971784, + "grad_norm": 4.894477844238281, + "learning_rate": 4.3715225435110325e-06, + "loss": 0.2071, + "step": 41822 + }, + { + "epoch": 2.1672680412371133, + "grad_norm": 3.635921001434326, + "learning_rate": 4.371385500890778e-06, + "loss": 0.1401, + "step": 41823 + }, + { + "epoch": 2.167281606077048, + "grad_norm": 5.340516090393066, + "learning_rate": 4.371248458270522e-06, + "loss": 0.2334, + "step": 41824 + }, + { + "epoch": 2.167295170916983, + "grad_norm": 4.857800006866455, + "learning_rate": 4.371111415650268e-06, + "loss": 0.2349, + "step": 41825 + }, + { + "epoch": 2.167308735756918, + "grad_norm": 4.439352512359619, + "learning_rate": 4.370974373030012e-06, + "loss": 0.1782, + "step": 41826 + }, + { + "epoch": 2.167322300596853, + "grad_norm": 4.1998186111450195, + "learning_rate": 4.3708373304097575e-06, + "loss": 0.1973, + "step": 41827 + }, + { + "epoch": 2.167335865436788, + "grad_norm": 6.76204776763916, + "learning_rate": 4.370700287789503e-06, + "loss": 0.1641, + "step": 41828 + }, + { + "epoch": 2.167349430276723, + "grad_norm": 4.030738353729248, + "learning_rate": 4.370563245169248e-06, + "loss": 0.1958, + "step": 41829 + }, + { + "epoch": 2.1673629951166578, + "grad_norm": 3.978289842605591, + "learning_rate": 4.370426202548993e-06, + "loss": 0.1787, + "step": 41830 + }, + { + "epoch": 2.1673765599565926, + "grad_norm": 4.317348480224609, + "learning_rate": 4.370289159928738e-06, + "loss": 0.1661, + "step": 41831 + }, + { + "epoch": 2.1673901247965275, + "grad_norm": 5.240306377410889, + "learning_rate": 4.370152117308483e-06, + "loss": 0.3026, + "step": 41832 + }, + { + "epoch": 2.1674036896364623, + "grad_norm": 3.557063102722168, + "learning_rate": 4.370015074688229e-06, + "loss": 0.1717, + "step": 41833 + }, + { + "epoch": 2.167417254476397, + "grad_norm": 3.4378724098205566, + "learning_rate": 4.369878032067974e-06, + "loss": 0.1787, + "step": 41834 + }, + { + "epoch": 2.167430819316332, + "grad_norm": 3.3265883922576904, + "learning_rate": 4.369740989447719e-06, + "loss": 0.0929, + "step": 41835 + }, + { + "epoch": 2.167444384156267, + "grad_norm": 4.07204532623291, + "learning_rate": 4.369603946827463e-06, + "loss": 0.1502, + "step": 41836 + }, + { + "epoch": 2.167457948996202, + "grad_norm": 3.3464348316192627, + "learning_rate": 4.3694669042072085e-06, + "loss": 0.1634, + "step": 41837 + }, + { + "epoch": 2.1674715138361367, + "grad_norm": 5.646037578582764, + "learning_rate": 4.369329861586954e-06, + "loss": 0.2904, + "step": 41838 + }, + { + "epoch": 2.1674850786760715, + "grad_norm": 5.570099830627441, + "learning_rate": 4.369192818966699e-06, + "loss": 0.3384, + "step": 41839 + }, + { + "epoch": 2.1674986435160064, + "grad_norm": 4.815001010894775, + "learning_rate": 4.369055776346444e-06, + "loss": 0.1299, + "step": 41840 + }, + { + "epoch": 2.1675122083559413, + "grad_norm": 4.009418487548828, + "learning_rate": 4.368918733726189e-06, + "loss": 0.211, + "step": 41841 + }, + { + "epoch": 2.167525773195876, + "grad_norm": 4.939157485961914, + "learning_rate": 4.368781691105934e-06, + "loss": 0.1762, + "step": 41842 + }, + { + "epoch": 2.167539338035811, + "grad_norm": 3.8220059871673584, + "learning_rate": 4.3686446484856795e-06, + "loss": 0.0976, + "step": 41843 + }, + { + "epoch": 2.167552902875746, + "grad_norm": 5.986886024475098, + "learning_rate": 4.368507605865425e-06, + "loss": 0.1586, + "step": 41844 + }, + { + "epoch": 2.167566467715681, + "grad_norm": 4.012291431427002, + "learning_rate": 4.36837056324517e-06, + "loss": 0.1257, + "step": 41845 + }, + { + "epoch": 2.167580032555616, + "grad_norm": 3.9616928100585938, + "learning_rate": 4.368233520624915e-06, + "loss": 0.1712, + "step": 41846 + }, + { + "epoch": 2.167593597395551, + "grad_norm": 4.27926778793335, + "learning_rate": 4.368096478004659e-06, + "loss": 0.2036, + "step": 41847 + }, + { + "epoch": 2.1676071622354858, + "grad_norm": 3.967620611190796, + "learning_rate": 4.3679594353844054e-06, + "loss": 0.1896, + "step": 41848 + }, + { + "epoch": 2.1676207270754206, + "grad_norm": 4.339913845062256, + "learning_rate": 4.36782239276415e-06, + "loss": 0.1614, + "step": 41849 + }, + { + "epoch": 2.1676342919153555, + "grad_norm": 4.557767391204834, + "learning_rate": 4.367685350143895e-06, + "loss": 0.1534, + "step": 41850 + }, + { + "epoch": 2.1676478567552904, + "grad_norm": 3.4082865715026855, + "learning_rate": 4.36754830752364e-06, + "loss": 0.1113, + "step": 41851 + }, + { + "epoch": 2.1676614215952252, + "grad_norm": 14.55218505859375, + "learning_rate": 4.367411264903385e-06, + "loss": 0.2007, + "step": 41852 + }, + { + "epoch": 2.16767498643516, + "grad_norm": 4.193186283111572, + "learning_rate": 4.3672742222831305e-06, + "loss": 0.181, + "step": 41853 + }, + { + "epoch": 2.167688551275095, + "grad_norm": 4.78713846206665, + "learning_rate": 4.367137179662876e-06, + "loss": 0.1921, + "step": 41854 + }, + { + "epoch": 2.16770211611503, + "grad_norm": 6.220085144042969, + "learning_rate": 4.367000137042621e-06, + "loss": 0.1847, + "step": 41855 + }, + { + "epoch": 2.1677156809549647, + "grad_norm": 3.0778284072875977, + "learning_rate": 4.366863094422365e-06, + "loss": 0.0713, + "step": 41856 + }, + { + "epoch": 2.1677292457948996, + "grad_norm": 5.2112298011779785, + "learning_rate": 4.366726051802111e-06, + "loss": 0.2386, + "step": 41857 + }, + { + "epoch": 2.1677428106348344, + "grad_norm": 3.0480222702026367, + "learning_rate": 4.3665890091818555e-06, + "loss": 0.0624, + "step": 41858 + }, + { + "epoch": 2.1677563754747693, + "grad_norm": 3.904662609100342, + "learning_rate": 4.3664519665616015e-06, + "loss": 0.1772, + "step": 41859 + }, + { + "epoch": 2.167769940314704, + "grad_norm": 4.39075231552124, + "learning_rate": 4.366314923941346e-06, + "loss": 0.1299, + "step": 41860 + }, + { + "epoch": 2.167783505154639, + "grad_norm": 5.490601062774658, + "learning_rate": 4.366177881321091e-06, + "loss": 0.113, + "step": 41861 + }, + { + "epoch": 2.167797069994574, + "grad_norm": 4.450855255126953, + "learning_rate": 4.366040838700836e-06, + "loss": 0.1539, + "step": 41862 + }, + { + "epoch": 2.1678106348345088, + "grad_norm": 4.895722389221191, + "learning_rate": 4.365903796080581e-06, + "loss": 0.1847, + "step": 41863 + }, + { + "epoch": 2.1678241996744436, + "grad_norm": 3.1003050804138184, + "learning_rate": 4.365766753460327e-06, + "loss": 0.0644, + "step": 41864 + }, + { + "epoch": 2.167837764514379, + "grad_norm": 4.046472549438477, + "learning_rate": 4.365629710840072e-06, + "loss": 0.1308, + "step": 41865 + }, + { + "epoch": 2.167851329354314, + "grad_norm": 3.5136935710906982, + "learning_rate": 4.365492668219817e-06, + "loss": 0.0976, + "step": 41866 + }, + { + "epoch": 2.1678648941942487, + "grad_norm": 3.776801347732544, + "learning_rate": 4.365355625599561e-06, + "loss": 0.0774, + "step": 41867 + }, + { + "epoch": 2.1678784590341835, + "grad_norm": 5.106927394866943, + "learning_rate": 4.365218582979307e-06, + "loss": 0.1639, + "step": 41868 + }, + { + "epoch": 2.1678920238741184, + "grad_norm": 4.218564987182617, + "learning_rate": 4.365081540359052e-06, + "loss": 0.1108, + "step": 41869 + }, + { + "epoch": 2.1679055887140533, + "grad_norm": 3.6019580364227295, + "learning_rate": 4.364944497738798e-06, + "loss": 0.0991, + "step": 41870 + }, + { + "epoch": 2.167919153553988, + "grad_norm": 5.390762805938721, + "learning_rate": 4.364807455118542e-06, + "loss": 0.1905, + "step": 41871 + }, + { + "epoch": 2.167932718393923, + "grad_norm": 3.7769761085510254, + "learning_rate": 4.364670412498287e-06, + "loss": 0.1877, + "step": 41872 + }, + { + "epoch": 2.167946283233858, + "grad_norm": 3.984518051147461, + "learning_rate": 4.364533369878032e-06, + "loss": 0.1274, + "step": 41873 + }, + { + "epoch": 2.1679598480737927, + "grad_norm": 5.209394454956055, + "learning_rate": 4.3643963272577775e-06, + "loss": 0.1744, + "step": 41874 + }, + { + "epoch": 2.1679734129137276, + "grad_norm": 4.194287300109863, + "learning_rate": 4.364259284637523e-06, + "loss": 0.152, + "step": 41875 + }, + { + "epoch": 2.1679869777536624, + "grad_norm": 5.124324321746826, + "learning_rate": 4.364122242017268e-06, + "loss": 0.1548, + "step": 41876 + }, + { + "epoch": 2.1680005425935973, + "grad_norm": 3.654841423034668, + "learning_rate": 4.363985199397013e-06, + "loss": 0.1345, + "step": 41877 + }, + { + "epoch": 2.168014107433532, + "grad_norm": 3.5925958156585693, + "learning_rate": 4.363848156776757e-06, + "loss": 0.129, + "step": 41878 + }, + { + "epoch": 2.168027672273467, + "grad_norm": 5.664528846740723, + "learning_rate": 4.3637111141565034e-06, + "loss": 0.1638, + "step": 41879 + }, + { + "epoch": 2.168041237113402, + "grad_norm": 3.333387851715088, + "learning_rate": 4.363574071536248e-06, + "loss": 0.1362, + "step": 41880 + }, + { + "epoch": 2.1680548019533368, + "grad_norm": 4.306241512298584, + "learning_rate": 4.363437028915993e-06, + "loss": 0.1259, + "step": 41881 + }, + { + "epoch": 2.1680683667932716, + "grad_norm": 4.104843616485596, + "learning_rate": 4.363299986295738e-06, + "loss": 0.1458, + "step": 41882 + }, + { + "epoch": 2.168081931633207, + "grad_norm": 3.715013027191162, + "learning_rate": 4.363162943675483e-06, + "loss": 0.1068, + "step": 41883 + }, + { + "epoch": 2.168095496473142, + "grad_norm": 3.066673517227173, + "learning_rate": 4.3630259010552285e-06, + "loss": 0.0951, + "step": 41884 + }, + { + "epoch": 2.1681090613130767, + "grad_norm": 4.652913570404053, + "learning_rate": 4.362888858434974e-06, + "loss": 0.1541, + "step": 41885 + }, + { + "epoch": 2.1681226261530115, + "grad_norm": 4.969249725341797, + "learning_rate": 4.362751815814719e-06, + "loss": 0.0967, + "step": 41886 + }, + { + "epoch": 2.1681361909929464, + "grad_norm": 3.7899532318115234, + "learning_rate": 4.362614773194464e-06, + "loss": 0.1175, + "step": 41887 + }, + { + "epoch": 2.1681497558328813, + "grad_norm": 6.553289413452148, + "learning_rate": 4.362477730574209e-06, + "loss": 0.2254, + "step": 41888 + }, + { + "epoch": 2.168163320672816, + "grad_norm": 4.470531940460205, + "learning_rate": 4.362340687953954e-06, + "loss": 0.1199, + "step": 41889 + }, + { + "epoch": 2.168176885512751, + "grad_norm": 4.4961256980896, + "learning_rate": 4.362203645333699e-06, + "loss": 0.1365, + "step": 41890 + }, + { + "epoch": 2.168190450352686, + "grad_norm": 3.492039918899536, + "learning_rate": 4.362066602713445e-06, + "loss": 0.0908, + "step": 41891 + }, + { + "epoch": 2.1682040151926207, + "grad_norm": 3.7016329765319824, + "learning_rate": 4.361929560093189e-06, + "loss": 0.1176, + "step": 41892 + }, + { + "epoch": 2.1682175800325556, + "grad_norm": 8.769160270690918, + "learning_rate": 4.361792517472934e-06, + "loss": 0.1976, + "step": 41893 + }, + { + "epoch": 2.1682311448724905, + "grad_norm": 3.4924392700195312, + "learning_rate": 4.361655474852679e-06, + "loss": 0.1039, + "step": 41894 + }, + { + "epoch": 2.1682447097124253, + "grad_norm": 5.325410842895508, + "learning_rate": 4.361518432232425e-06, + "loss": 0.1465, + "step": 41895 + }, + { + "epoch": 2.16825827455236, + "grad_norm": 3.2841591835021973, + "learning_rate": 4.36138138961217e-06, + "loss": 0.0692, + "step": 41896 + }, + { + "epoch": 2.168271839392295, + "grad_norm": 4.368040561676025, + "learning_rate": 4.361244346991915e-06, + "loss": 0.1376, + "step": 41897 + }, + { + "epoch": 2.16828540423223, + "grad_norm": 5.088688850402832, + "learning_rate": 4.36110730437166e-06, + "loss": 0.1997, + "step": 41898 + }, + { + "epoch": 2.168298969072165, + "grad_norm": 4.343669414520264, + "learning_rate": 4.360970261751405e-06, + "loss": 0.1341, + "step": 41899 + }, + { + "epoch": 2.1683125339120997, + "grad_norm": 3.8874616622924805, + "learning_rate": 4.3608332191311505e-06, + "loss": 0.1123, + "step": 41900 + }, + { + "epoch": 2.1683260987520345, + "grad_norm": 2.824432373046875, + "learning_rate": 4.360696176510895e-06, + "loss": 0.0596, + "step": 41901 + }, + { + "epoch": 2.1683396635919694, + "grad_norm": 4.1198601722717285, + "learning_rate": 4.360559133890641e-06, + "loss": 0.1049, + "step": 41902 + }, + { + "epoch": 2.1683532284319047, + "grad_norm": 2.573254346847534, + "learning_rate": 4.360422091270385e-06, + "loss": 0.0705, + "step": 41903 + }, + { + "epoch": 2.1683667932718396, + "grad_norm": 2.986095905303955, + "learning_rate": 4.360285048650131e-06, + "loss": 0.0782, + "step": 41904 + }, + { + "epoch": 2.1683803581117744, + "grad_norm": 5.5769758224487305, + "learning_rate": 4.3601480060298755e-06, + "loss": 0.2774, + "step": 41905 + }, + { + "epoch": 2.1683939229517093, + "grad_norm": 5.185262680053711, + "learning_rate": 4.360010963409621e-06, + "loss": 0.1663, + "step": 41906 + }, + { + "epoch": 2.168407487791644, + "grad_norm": 3.9227099418640137, + "learning_rate": 4.359873920789366e-06, + "loss": 0.1001, + "step": 41907 + }, + { + "epoch": 2.168421052631579, + "grad_norm": 4.640965461730957, + "learning_rate": 4.359736878169111e-06, + "loss": 0.1445, + "step": 41908 + }, + { + "epoch": 2.168434617471514, + "grad_norm": 4.275148868560791, + "learning_rate": 4.359599835548856e-06, + "loss": 0.1238, + "step": 41909 + }, + { + "epoch": 2.1684481823114488, + "grad_norm": 3.0219991207122803, + "learning_rate": 4.359462792928601e-06, + "loss": 0.0822, + "step": 41910 + }, + { + "epoch": 2.1684617471513836, + "grad_norm": 5.1741533279418945, + "learning_rate": 4.359325750308347e-06, + "loss": 0.1597, + "step": 41911 + }, + { + "epoch": 2.1684753119913185, + "grad_norm": 4.964521884918213, + "learning_rate": 4.359188707688091e-06, + "loss": 0.1413, + "step": 41912 + }, + { + "epoch": 2.1684888768312534, + "grad_norm": 6.836207389831543, + "learning_rate": 4.359051665067837e-06, + "loss": 0.1734, + "step": 41913 + }, + { + "epoch": 2.168502441671188, + "grad_norm": 4.546108722686768, + "learning_rate": 4.358914622447581e-06, + "loss": 0.1763, + "step": 41914 + }, + { + "epoch": 2.168516006511123, + "grad_norm": 2.9647278785705566, + "learning_rate": 4.3587775798273265e-06, + "loss": 0.0952, + "step": 41915 + }, + { + "epoch": 2.168529571351058, + "grad_norm": 4.01247501373291, + "learning_rate": 4.358640537207072e-06, + "loss": 0.1002, + "step": 41916 + }, + { + "epoch": 2.168543136190993, + "grad_norm": 4.3173508644104, + "learning_rate": 4.358503494586817e-06, + "loss": 0.1223, + "step": 41917 + }, + { + "epoch": 2.1685567010309277, + "grad_norm": 5.228763580322266, + "learning_rate": 4.358366451966562e-06, + "loss": 0.1126, + "step": 41918 + }, + { + "epoch": 2.1685702658708625, + "grad_norm": 3.758216381072998, + "learning_rate": 4.358229409346307e-06, + "loss": 0.0709, + "step": 41919 + }, + { + "epoch": 2.1685838307107974, + "grad_norm": 3.819484233856201, + "learning_rate": 4.358092366726052e-06, + "loss": 0.1296, + "step": 41920 + }, + { + "epoch": 2.1685973955507327, + "grad_norm": 4.417018890380859, + "learning_rate": 4.357955324105797e-06, + "loss": 0.1072, + "step": 41921 + }, + { + "epoch": 2.1686109603906676, + "grad_norm": 5.423564910888672, + "learning_rate": 4.357818281485543e-06, + "loss": 0.1433, + "step": 41922 + }, + { + "epoch": 2.1686245252306025, + "grad_norm": 3.403334140777588, + "learning_rate": 4.357681238865287e-06, + "loss": 0.1043, + "step": 41923 + }, + { + "epoch": 2.1686380900705373, + "grad_norm": 3.3160557746887207, + "learning_rate": 4.357544196245032e-06, + "loss": 0.0876, + "step": 41924 + }, + { + "epoch": 2.168651654910472, + "grad_norm": 2.712548017501831, + "learning_rate": 4.3574071536247774e-06, + "loss": 0.0648, + "step": 41925 + }, + { + "epoch": 2.168665219750407, + "grad_norm": 3.561464548110962, + "learning_rate": 4.357270111004523e-06, + "loss": 0.106, + "step": 41926 + }, + { + "epoch": 2.168678784590342, + "grad_norm": 3.3642537593841553, + "learning_rate": 4.357133068384268e-06, + "loss": 0.0851, + "step": 41927 + }, + { + "epoch": 2.1686923494302768, + "grad_norm": 3.7458584308624268, + "learning_rate": 4.356996025764013e-06, + "loss": 0.1305, + "step": 41928 + }, + { + "epoch": 2.1687059142702116, + "grad_norm": 3.2129876613616943, + "learning_rate": 4.356858983143758e-06, + "loss": 0.0602, + "step": 41929 + }, + { + "epoch": 2.1687194791101465, + "grad_norm": 4.1256279945373535, + "learning_rate": 4.356721940523503e-06, + "loss": 0.0778, + "step": 41930 + }, + { + "epoch": 2.1687330439500814, + "grad_norm": 4.253319263458252, + "learning_rate": 4.3565848979032485e-06, + "loss": 0.1269, + "step": 41931 + }, + { + "epoch": 2.1687466087900162, + "grad_norm": 5.302424907684326, + "learning_rate": 4.356447855282994e-06, + "loss": 0.216, + "step": 41932 + }, + { + "epoch": 2.168760173629951, + "grad_norm": 2.8014492988586426, + "learning_rate": 4.356310812662739e-06, + "loss": 0.1004, + "step": 41933 + }, + { + "epoch": 2.168773738469886, + "grad_norm": 2.8407843112945557, + "learning_rate": 4.356173770042483e-06, + "loss": 0.0565, + "step": 41934 + }, + { + "epoch": 2.168787303309821, + "grad_norm": 4.609347820281982, + "learning_rate": 4.356036727422228e-06, + "loss": 0.1341, + "step": 41935 + }, + { + "epoch": 2.1688008681497557, + "grad_norm": 2.955787181854248, + "learning_rate": 4.3558996848019735e-06, + "loss": 0.0681, + "step": 41936 + }, + { + "epoch": 2.1688144329896906, + "grad_norm": 3.742875099182129, + "learning_rate": 4.355762642181719e-06, + "loss": 0.1021, + "step": 41937 + }, + { + "epoch": 2.1688279978296254, + "grad_norm": 3.5528135299682617, + "learning_rate": 4.355625599561464e-06, + "loss": 0.0988, + "step": 41938 + }, + { + "epoch": 2.1688415626695603, + "grad_norm": 3.8092331886291504, + "learning_rate": 4.355488556941209e-06, + "loss": 0.1395, + "step": 41939 + }, + { + "epoch": 2.168855127509495, + "grad_norm": 5.135671138763428, + "learning_rate": 4.355351514320954e-06, + "loss": 0.1423, + "step": 41940 + }, + { + "epoch": 2.1688686923494305, + "grad_norm": 3.880040407180786, + "learning_rate": 4.3552144717006994e-06, + "loss": 0.0853, + "step": 41941 + }, + { + "epoch": 2.1688822571893653, + "grad_norm": 3.384474754333496, + "learning_rate": 4.355077429080445e-06, + "loss": 0.0678, + "step": 41942 + }, + { + "epoch": 2.1688958220293, + "grad_norm": 5.436346054077148, + "learning_rate": 4.35494038646019e-06, + "loss": 0.1677, + "step": 41943 + }, + { + "epoch": 2.168909386869235, + "grad_norm": 4.827486038208008, + "learning_rate": 4.354803343839934e-06, + "loss": 0.1646, + "step": 41944 + }, + { + "epoch": 2.16892295170917, + "grad_norm": 4.253886699676514, + "learning_rate": 4.35466630121968e-06, + "loss": 0.1693, + "step": 41945 + }, + { + "epoch": 2.168936516549105, + "grad_norm": 3.774116039276123, + "learning_rate": 4.3545292585994245e-06, + "loss": 0.1452, + "step": 41946 + }, + { + "epoch": 2.1689500813890397, + "grad_norm": 4.1566572189331055, + "learning_rate": 4.35439221597917e-06, + "loss": 0.1015, + "step": 41947 + }, + { + "epoch": 2.1689636462289745, + "grad_norm": 3.3677680492401123, + "learning_rate": 4.354255173358915e-06, + "loss": 0.0787, + "step": 41948 + }, + { + "epoch": 2.1689772110689094, + "grad_norm": 5.3378682136535645, + "learning_rate": 4.35411813073866e-06, + "loss": 0.1098, + "step": 41949 + }, + { + "epoch": 2.1689907759088443, + "grad_norm": 3.451646089553833, + "learning_rate": 4.353981088118405e-06, + "loss": 0.1312, + "step": 41950 + }, + { + "epoch": 2.169004340748779, + "grad_norm": 5.309571743011475, + "learning_rate": 4.35384404549815e-06, + "loss": 0.1522, + "step": 41951 + }, + { + "epoch": 2.169017905588714, + "grad_norm": 3.085397958755493, + "learning_rate": 4.3537070028778956e-06, + "loss": 0.1036, + "step": 41952 + }, + { + "epoch": 2.169031470428649, + "grad_norm": 4.9421257972717285, + "learning_rate": 4.353569960257641e-06, + "loss": 0.1218, + "step": 41953 + }, + { + "epoch": 2.1690450352685837, + "grad_norm": 4.476891994476318, + "learning_rate": 4.353432917637386e-06, + "loss": 0.1267, + "step": 41954 + }, + { + "epoch": 2.1690586001085186, + "grad_norm": 5.376461982727051, + "learning_rate": 4.35329587501713e-06, + "loss": 0.1654, + "step": 41955 + }, + { + "epoch": 2.1690721649484535, + "grad_norm": 3.557284355163574, + "learning_rate": 4.353158832396876e-06, + "loss": 0.1001, + "step": 41956 + }, + { + "epoch": 2.1690857297883883, + "grad_norm": 4.1737751960754395, + "learning_rate": 4.353021789776621e-06, + "loss": 0.1656, + "step": 41957 + }, + { + "epoch": 2.169099294628323, + "grad_norm": 6.5912275314331055, + "learning_rate": 4.352884747156367e-06, + "loss": 0.1768, + "step": 41958 + }, + { + "epoch": 2.1691128594682585, + "grad_norm": 4.284869194030762, + "learning_rate": 4.352747704536111e-06, + "loss": 0.1457, + "step": 41959 + }, + { + "epoch": 2.1691264243081934, + "grad_norm": 3.4965202808380127, + "learning_rate": 4.352610661915856e-06, + "loss": 0.076, + "step": 41960 + }, + { + "epoch": 2.1691399891481282, + "grad_norm": 3.963292360305786, + "learning_rate": 4.352473619295601e-06, + "loss": 0.1451, + "step": 41961 + }, + { + "epoch": 2.169153553988063, + "grad_norm": 5.10430383682251, + "learning_rate": 4.3523365766753465e-06, + "loss": 0.1586, + "step": 41962 + }, + { + "epoch": 2.169167118827998, + "grad_norm": 3.1401448249816895, + "learning_rate": 4.352199534055092e-06, + "loss": 0.0877, + "step": 41963 + }, + { + "epoch": 2.169180683667933, + "grad_norm": 4.408167839050293, + "learning_rate": 4.352062491434836e-06, + "loss": 0.1287, + "step": 41964 + }, + { + "epoch": 2.1691942485078677, + "grad_norm": 4.4621052742004395, + "learning_rate": 4.351925448814582e-06, + "loss": 0.1537, + "step": 41965 + }, + { + "epoch": 2.1692078133478025, + "grad_norm": 4.698615550994873, + "learning_rate": 4.351788406194326e-06, + "loss": 0.187, + "step": 41966 + }, + { + "epoch": 2.1692213781877374, + "grad_norm": 5.002861499786377, + "learning_rate": 4.351651363574072e-06, + "loss": 0.2115, + "step": 41967 + }, + { + "epoch": 2.1692349430276723, + "grad_norm": 4.726810932159424, + "learning_rate": 4.351514320953817e-06, + "loss": 0.1272, + "step": 41968 + }, + { + "epoch": 2.169248507867607, + "grad_norm": 3.54258394241333, + "learning_rate": 4.351377278333562e-06, + "loss": 0.1129, + "step": 41969 + }, + { + "epoch": 2.169262072707542, + "grad_norm": 5.1026387214660645, + "learning_rate": 4.351240235713307e-06, + "loss": 0.0786, + "step": 41970 + }, + { + "epoch": 2.169275637547477, + "grad_norm": 5.023276329040527, + "learning_rate": 4.351103193093052e-06, + "loss": 0.1075, + "step": 41971 + }, + { + "epoch": 2.1692892023874117, + "grad_norm": 4.042275428771973, + "learning_rate": 4.3509661504727974e-06, + "loss": 0.1093, + "step": 41972 + }, + { + "epoch": 2.1693027672273466, + "grad_norm": 5.6719231605529785, + "learning_rate": 4.350829107852543e-06, + "loss": 0.1998, + "step": 41973 + }, + { + "epoch": 2.1693163320672815, + "grad_norm": 3.999840497970581, + "learning_rate": 4.350692065232288e-06, + "loss": 0.1202, + "step": 41974 + }, + { + "epoch": 2.1693298969072163, + "grad_norm": 6.187053680419922, + "learning_rate": 4.350555022612032e-06, + "loss": 0.2538, + "step": 41975 + }, + { + "epoch": 2.169343461747151, + "grad_norm": 4.48598575592041, + "learning_rate": 4.350417979991778e-06, + "loss": 0.1379, + "step": 41976 + }, + { + "epoch": 2.169357026587086, + "grad_norm": 3.654547929763794, + "learning_rate": 4.3502809373715225e-06, + "loss": 0.1257, + "step": 41977 + }, + { + "epoch": 2.169370591427021, + "grad_norm": 3.7683849334716797, + "learning_rate": 4.350143894751268e-06, + "loss": 0.1272, + "step": 41978 + }, + { + "epoch": 2.1693841562669562, + "grad_norm": 3.6891236305236816, + "learning_rate": 4.350006852131013e-06, + "loss": 0.1648, + "step": 41979 + }, + { + "epoch": 2.169397721106891, + "grad_norm": 4.970139980316162, + "learning_rate": 4.349869809510758e-06, + "loss": 0.2065, + "step": 41980 + }, + { + "epoch": 2.169411285946826, + "grad_norm": 3.756425619125366, + "learning_rate": 4.349732766890503e-06, + "loss": 0.1532, + "step": 41981 + }, + { + "epoch": 2.169424850786761, + "grad_norm": 4.428192615509033, + "learning_rate": 4.349595724270248e-06, + "loss": 0.2098, + "step": 41982 + }, + { + "epoch": 2.1694384156266957, + "grad_norm": 3.0800938606262207, + "learning_rate": 4.3494586816499936e-06, + "loss": 0.0983, + "step": 41983 + }, + { + "epoch": 2.1694519804666306, + "grad_norm": 3.644218921661377, + "learning_rate": 4.349321639029739e-06, + "loss": 0.1048, + "step": 41984 + }, + { + "epoch": 2.1694655453065654, + "grad_norm": 5.149713516235352, + "learning_rate": 4.349184596409484e-06, + "loss": 0.1461, + "step": 41985 + }, + { + "epoch": 2.1694791101465003, + "grad_norm": 3.8126540184020996, + "learning_rate": 4.349047553789229e-06, + "loss": 0.1022, + "step": 41986 + }, + { + "epoch": 2.169492674986435, + "grad_norm": 4.467464447021484, + "learning_rate": 4.348910511168974e-06, + "loss": 0.1, + "step": 41987 + }, + { + "epoch": 2.16950623982637, + "grad_norm": 5.683497428894043, + "learning_rate": 4.348773468548719e-06, + "loss": 0.1587, + "step": 41988 + }, + { + "epoch": 2.169519804666305, + "grad_norm": 4.443265438079834, + "learning_rate": 4.348636425928464e-06, + "loss": 0.158, + "step": 41989 + }, + { + "epoch": 2.1695333695062398, + "grad_norm": 6.093373775482178, + "learning_rate": 4.348499383308209e-06, + "loss": 0.2497, + "step": 41990 + }, + { + "epoch": 2.1695469343461746, + "grad_norm": 4.825132369995117, + "learning_rate": 4.348362340687954e-06, + "loss": 0.2031, + "step": 41991 + }, + { + "epoch": 2.1695604991861095, + "grad_norm": 3.6282551288604736, + "learning_rate": 4.348225298067699e-06, + "loss": 0.1066, + "step": 41992 + }, + { + "epoch": 2.1695740640260444, + "grad_norm": 5.007158279418945, + "learning_rate": 4.3480882554474445e-06, + "loss": 0.1469, + "step": 41993 + }, + { + "epoch": 2.1695876288659792, + "grad_norm": 5.911045074462891, + "learning_rate": 4.34795121282719e-06, + "loss": 0.1823, + "step": 41994 + }, + { + "epoch": 2.169601193705914, + "grad_norm": 3.675224781036377, + "learning_rate": 4.347814170206935e-06, + "loss": 0.132, + "step": 41995 + }, + { + "epoch": 2.169614758545849, + "grad_norm": 3.8460569381713867, + "learning_rate": 4.34767712758668e-06, + "loss": 0.1291, + "step": 41996 + }, + { + "epoch": 2.1696283233857843, + "grad_norm": 4.006222248077393, + "learning_rate": 4.347540084966425e-06, + "loss": 0.1227, + "step": 41997 + }, + { + "epoch": 2.169641888225719, + "grad_norm": 3.2059643268585205, + "learning_rate": 4.3474030423461696e-06, + "loss": 0.0827, + "step": 41998 + }, + { + "epoch": 2.169655453065654, + "grad_norm": 4.33950138092041, + "learning_rate": 4.347265999725916e-06, + "loss": 0.1259, + "step": 41999 + }, + { + "epoch": 2.169669017905589, + "grad_norm": 3.8376190662384033, + "learning_rate": 4.34712895710566e-06, + "loss": 0.0964, + "step": 42000 + }, + { + "epoch": 2.1696825827455237, + "grad_norm": 3.289679765701294, + "learning_rate": 4.346991914485406e-06, + "loss": 0.1032, + "step": 42001 + }, + { + "epoch": 2.1696961475854586, + "grad_norm": 3.688135862350464, + "learning_rate": 4.34685487186515e-06, + "loss": 0.1501, + "step": 42002 + }, + { + "epoch": 2.1697097124253935, + "grad_norm": 3.6220014095306396, + "learning_rate": 4.3467178292448955e-06, + "loss": 0.1514, + "step": 42003 + }, + { + "epoch": 2.1697232772653283, + "grad_norm": 5.628216743469238, + "learning_rate": 4.346580786624641e-06, + "loss": 0.1921, + "step": 42004 + }, + { + "epoch": 2.169736842105263, + "grad_norm": 3.578557014465332, + "learning_rate": 4.346443744004386e-06, + "loss": 0.1253, + "step": 42005 + }, + { + "epoch": 2.169750406945198, + "grad_norm": 7.874807834625244, + "learning_rate": 4.346306701384131e-06, + "loss": 0.1757, + "step": 42006 + }, + { + "epoch": 2.169763971785133, + "grad_norm": 6.040637969970703, + "learning_rate": 4.346169658763876e-06, + "loss": 0.2295, + "step": 42007 + }, + { + "epoch": 2.169777536625068, + "grad_norm": 5.110116004943848, + "learning_rate": 4.346032616143621e-06, + "loss": 0.1769, + "step": 42008 + }, + { + "epoch": 2.1697911014650026, + "grad_norm": 5.0409393310546875, + "learning_rate": 4.345895573523366e-06, + "loss": 0.1901, + "step": 42009 + }, + { + "epoch": 2.1698046663049375, + "grad_norm": 5.323846340179443, + "learning_rate": 4.345758530903112e-06, + "loss": 0.1595, + "step": 42010 + }, + { + "epoch": 2.1698182311448724, + "grad_norm": 3.7974021434783936, + "learning_rate": 4.345621488282856e-06, + "loss": 0.0969, + "step": 42011 + }, + { + "epoch": 2.1698317959848072, + "grad_norm": 4.910366058349609, + "learning_rate": 4.345484445662601e-06, + "loss": 0.1625, + "step": 42012 + }, + { + "epoch": 2.169845360824742, + "grad_norm": 3.452023983001709, + "learning_rate": 4.345347403042346e-06, + "loss": 0.083, + "step": 42013 + }, + { + "epoch": 2.169858925664677, + "grad_norm": 3.8198087215423584, + "learning_rate": 4.3452103604220916e-06, + "loss": 0.1346, + "step": 42014 + }, + { + "epoch": 2.169872490504612, + "grad_norm": 4.073477268218994, + "learning_rate": 4.345073317801837e-06, + "loss": 0.1206, + "step": 42015 + }, + { + "epoch": 2.1698860553445467, + "grad_norm": 2.467705488204956, + "learning_rate": 4.344936275181582e-06, + "loss": 0.0827, + "step": 42016 + }, + { + "epoch": 2.169899620184482, + "grad_norm": 3.5030605792999268, + "learning_rate": 4.344799232561327e-06, + "loss": 0.1114, + "step": 42017 + }, + { + "epoch": 2.169913185024417, + "grad_norm": 4.022030830383301, + "learning_rate": 4.3446621899410714e-06, + "loss": 0.1181, + "step": 42018 + }, + { + "epoch": 2.1699267498643517, + "grad_norm": 3.304853916168213, + "learning_rate": 4.3445251473208175e-06, + "loss": 0.1157, + "step": 42019 + }, + { + "epoch": 2.1699403147042866, + "grad_norm": 4.2611517906188965, + "learning_rate": 4.344388104700562e-06, + "loss": 0.1649, + "step": 42020 + }, + { + "epoch": 2.1699538795442215, + "grad_norm": 3.726854085922241, + "learning_rate": 4.344251062080308e-06, + "loss": 0.1564, + "step": 42021 + }, + { + "epoch": 2.1699674443841563, + "grad_norm": 5.686788082122803, + "learning_rate": 4.344114019460052e-06, + "loss": 0.1853, + "step": 42022 + }, + { + "epoch": 2.169981009224091, + "grad_norm": 3.793531894683838, + "learning_rate": 4.343976976839797e-06, + "loss": 0.0953, + "step": 42023 + }, + { + "epoch": 2.169994574064026, + "grad_norm": 4.192068099975586, + "learning_rate": 4.3438399342195425e-06, + "loss": 0.1585, + "step": 42024 + }, + { + "epoch": 2.170008138903961, + "grad_norm": 4.409299373626709, + "learning_rate": 4.343702891599288e-06, + "loss": 0.1304, + "step": 42025 + }, + { + "epoch": 2.170021703743896, + "grad_norm": 4.252147197723389, + "learning_rate": 4.343565848979033e-06, + "loss": 0.0969, + "step": 42026 + }, + { + "epoch": 2.1700352685838307, + "grad_norm": 4.26458215713501, + "learning_rate": 4.343428806358778e-06, + "loss": 0.1037, + "step": 42027 + }, + { + "epoch": 2.1700488334237655, + "grad_norm": 3.212881565093994, + "learning_rate": 4.343291763738523e-06, + "loss": 0.0908, + "step": 42028 + }, + { + "epoch": 2.1700623982637004, + "grad_norm": 2.9368834495544434, + "learning_rate": 4.343154721118268e-06, + "loss": 0.1292, + "step": 42029 + }, + { + "epoch": 2.1700759631036353, + "grad_norm": 4.1883320808410645, + "learning_rate": 4.343017678498014e-06, + "loss": 0.1418, + "step": 42030 + }, + { + "epoch": 2.17008952794357, + "grad_norm": 3.9186201095581055, + "learning_rate": 4.342880635877758e-06, + "loss": 0.1742, + "step": 42031 + }, + { + "epoch": 2.170103092783505, + "grad_norm": 4.646181583404541, + "learning_rate": 4.342743593257503e-06, + "loss": 0.1176, + "step": 42032 + }, + { + "epoch": 2.17011665762344, + "grad_norm": 6.67669677734375, + "learning_rate": 4.342606550637248e-06, + "loss": 0.2319, + "step": 42033 + }, + { + "epoch": 2.1701302224633747, + "grad_norm": 3.682356357574463, + "learning_rate": 4.3424695080169935e-06, + "loss": 0.1136, + "step": 42034 + }, + { + "epoch": 2.17014378730331, + "grad_norm": 6.054401874542236, + "learning_rate": 4.342332465396739e-06, + "loss": 0.1296, + "step": 42035 + }, + { + "epoch": 2.170157352143245, + "grad_norm": 5.3655853271484375, + "learning_rate": 4.342195422776484e-06, + "loss": 0.1804, + "step": 42036 + }, + { + "epoch": 2.1701709169831798, + "grad_norm": 4.781262397766113, + "learning_rate": 4.342058380156229e-06, + "loss": 0.1951, + "step": 42037 + }, + { + "epoch": 2.1701844818231146, + "grad_norm": 3.287278175354004, + "learning_rate": 4.341921337535974e-06, + "loss": 0.1011, + "step": 42038 + }, + { + "epoch": 2.1701980466630495, + "grad_norm": 4.708015441894531, + "learning_rate": 4.341784294915719e-06, + "loss": 0.1522, + "step": 42039 + }, + { + "epoch": 2.1702116115029844, + "grad_norm": 6.516994476318359, + "learning_rate": 4.3416472522954645e-06, + "loss": 0.1636, + "step": 42040 + }, + { + "epoch": 2.1702251763429192, + "grad_norm": 4.369401454925537, + "learning_rate": 4.34151020967521e-06, + "loss": 0.2146, + "step": 42041 + }, + { + "epoch": 2.170238741182854, + "grad_norm": 4.644718170166016, + "learning_rate": 4.341373167054955e-06, + "loss": 0.1819, + "step": 42042 + }, + { + "epoch": 2.170252306022789, + "grad_norm": 4.283344745635986, + "learning_rate": 4.341236124434699e-06, + "loss": 0.1142, + "step": 42043 + }, + { + "epoch": 2.170265870862724, + "grad_norm": 5.138661861419678, + "learning_rate": 4.341099081814444e-06, + "loss": 0.1802, + "step": 42044 + }, + { + "epoch": 2.1702794357026587, + "grad_norm": 4.6601080894470215, + "learning_rate": 4.3409620391941896e-06, + "loss": 0.1133, + "step": 42045 + }, + { + "epoch": 2.1702930005425936, + "grad_norm": 5.085728168487549, + "learning_rate": 4.340824996573935e-06, + "loss": 0.1546, + "step": 42046 + }, + { + "epoch": 2.1703065653825284, + "grad_norm": 5.6394758224487305, + "learning_rate": 4.34068795395368e-06, + "loss": 0.1908, + "step": 42047 + }, + { + "epoch": 2.1703201302224633, + "grad_norm": 4.005024433135986, + "learning_rate": 4.340550911333425e-06, + "loss": 0.1628, + "step": 42048 + }, + { + "epoch": 2.170333695062398, + "grad_norm": 5.522148132324219, + "learning_rate": 4.34041386871317e-06, + "loss": 0.1466, + "step": 42049 + }, + { + "epoch": 2.170347259902333, + "grad_norm": 9.299991607666016, + "learning_rate": 4.3402768260929155e-06, + "loss": 0.3053, + "step": 42050 + }, + { + "epoch": 2.170360824742268, + "grad_norm": 4.126321315765381, + "learning_rate": 4.340139783472661e-06, + "loss": 0.1669, + "step": 42051 + }, + { + "epoch": 2.1703743895822027, + "grad_norm": 7.593231678009033, + "learning_rate": 4.340002740852405e-06, + "loss": 0.2354, + "step": 42052 + }, + { + "epoch": 2.1703879544221376, + "grad_norm": 4.8151421546936035, + "learning_rate": 4.339865698232151e-06, + "loss": 0.1645, + "step": 42053 + }, + { + "epoch": 2.1704015192620725, + "grad_norm": 6.4149346351623535, + "learning_rate": 4.339728655611895e-06, + "loss": 0.218, + "step": 42054 + }, + { + "epoch": 2.170415084102008, + "grad_norm": 5.61345100402832, + "learning_rate": 4.339591612991641e-06, + "loss": 0.1831, + "step": 42055 + }, + { + "epoch": 2.1704286489419427, + "grad_norm": 3.960132360458374, + "learning_rate": 4.339454570371386e-06, + "loss": 0.1055, + "step": 42056 + }, + { + "epoch": 2.1704422137818775, + "grad_norm": 6.29553747177124, + "learning_rate": 4.339317527751131e-06, + "loss": 0.1456, + "step": 42057 + }, + { + "epoch": 2.1704557786218124, + "grad_norm": 6.503693103790283, + "learning_rate": 4.339180485130876e-06, + "loss": 0.1573, + "step": 42058 + }, + { + "epoch": 2.1704693434617472, + "grad_norm": 5.396963119506836, + "learning_rate": 4.339043442510621e-06, + "loss": 0.1844, + "step": 42059 + }, + { + "epoch": 2.170482908301682, + "grad_norm": 5.320728778839111, + "learning_rate": 4.338906399890366e-06, + "loss": 0.1815, + "step": 42060 + }, + { + "epoch": 2.170496473141617, + "grad_norm": 6.834394931793213, + "learning_rate": 4.338769357270112e-06, + "loss": 0.226, + "step": 42061 + }, + { + "epoch": 2.170510037981552, + "grad_norm": 4.14882755279541, + "learning_rate": 4.338632314649857e-06, + "loss": 0.137, + "step": 42062 + }, + { + "epoch": 2.1705236028214867, + "grad_norm": 3.0020246505737305, + "learning_rate": 4.338495272029601e-06, + "loss": 0.0616, + "step": 42063 + }, + { + "epoch": 2.1705371676614216, + "grad_norm": 4.287945747375488, + "learning_rate": 4.338358229409347e-06, + "loss": 0.1277, + "step": 42064 + }, + { + "epoch": 2.1705507325013564, + "grad_norm": 5.817485332489014, + "learning_rate": 4.3382211867890915e-06, + "loss": 0.1113, + "step": 42065 + }, + { + "epoch": 2.1705642973412913, + "grad_norm": 3.9615511894226074, + "learning_rate": 4.338084144168837e-06, + "loss": 0.1392, + "step": 42066 + }, + { + "epoch": 2.170577862181226, + "grad_norm": 6.37498664855957, + "learning_rate": 4.337947101548582e-06, + "loss": 0.2276, + "step": 42067 + }, + { + "epoch": 2.170591427021161, + "grad_norm": 5.284466743469238, + "learning_rate": 4.337810058928327e-06, + "loss": 0.1035, + "step": 42068 + }, + { + "epoch": 2.170604991861096, + "grad_norm": 5.982546329498291, + "learning_rate": 4.337673016308072e-06, + "loss": 0.1765, + "step": 42069 + }, + { + "epoch": 2.1706185567010308, + "grad_norm": 5.0210771560668945, + "learning_rate": 4.337535973687817e-06, + "loss": 0.1031, + "step": 42070 + }, + { + "epoch": 2.1706321215409656, + "grad_norm": 3.9068169593811035, + "learning_rate": 4.3373989310675625e-06, + "loss": 0.0971, + "step": 42071 + }, + { + "epoch": 2.170645686380901, + "grad_norm": 5.057249546051025, + "learning_rate": 4.337261888447307e-06, + "loss": 0.1518, + "step": 42072 + }, + { + "epoch": 2.170659251220836, + "grad_norm": 4.133343696594238, + "learning_rate": 4.337124845827053e-06, + "loss": 0.1786, + "step": 42073 + }, + { + "epoch": 2.1706728160607707, + "grad_norm": 3.7418127059936523, + "learning_rate": 4.336987803206797e-06, + "loss": 0.1209, + "step": 42074 + }, + { + "epoch": 2.1706863809007055, + "grad_norm": 3.370849370956421, + "learning_rate": 4.336850760586543e-06, + "loss": 0.0932, + "step": 42075 + }, + { + "epoch": 2.1706999457406404, + "grad_norm": 6.705822467803955, + "learning_rate": 4.336713717966288e-06, + "loss": 0.2185, + "step": 42076 + }, + { + "epoch": 2.1707135105805753, + "grad_norm": 7.15580415725708, + "learning_rate": 4.336576675346033e-06, + "loss": 0.2063, + "step": 42077 + }, + { + "epoch": 2.17072707542051, + "grad_norm": 4.310780048370361, + "learning_rate": 4.336439632725778e-06, + "loss": 0.1123, + "step": 42078 + }, + { + "epoch": 2.170740640260445, + "grad_norm": 3.2103731632232666, + "learning_rate": 4.336302590105523e-06, + "loss": 0.0633, + "step": 42079 + }, + { + "epoch": 2.17075420510038, + "grad_norm": 4.978416919708252, + "learning_rate": 4.336165547485268e-06, + "loss": 0.177, + "step": 42080 + }, + { + "epoch": 2.1707677699403147, + "grad_norm": 6.235945701599121, + "learning_rate": 4.3360285048650135e-06, + "loss": 0.2685, + "step": 42081 + }, + { + "epoch": 2.1707813347802496, + "grad_norm": 4.5431952476501465, + "learning_rate": 4.335891462244759e-06, + "loss": 0.1597, + "step": 42082 + }, + { + "epoch": 2.1707948996201845, + "grad_norm": 4.741804599761963, + "learning_rate": 4.335754419624504e-06, + "loss": 0.108, + "step": 42083 + }, + { + "epoch": 2.1708084644601193, + "grad_norm": 4.192193508148193, + "learning_rate": 4.335617377004249e-06, + "loss": 0.1164, + "step": 42084 + }, + { + "epoch": 2.170822029300054, + "grad_norm": 4.370396614074707, + "learning_rate": 4.335480334383993e-06, + "loss": 0.1277, + "step": 42085 + }, + { + "epoch": 2.170835594139989, + "grad_norm": 6.638369560241699, + "learning_rate": 4.3353432917637385e-06, + "loss": 0.1557, + "step": 42086 + }, + { + "epoch": 2.170849158979924, + "grad_norm": 5.700711250305176, + "learning_rate": 4.335206249143484e-06, + "loss": 0.155, + "step": 42087 + }, + { + "epoch": 2.170862723819859, + "grad_norm": 5.595948696136475, + "learning_rate": 4.335069206523229e-06, + "loss": 0.1425, + "step": 42088 + }, + { + "epoch": 2.1708762886597937, + "grad_norm": 3.966520071029663, + "learning_rate": 4.334932163902974e-06, + "loss": 0.1181, + "step": 42089 + }, + { + "epoch": 2.1708898534997285, + "grad_norm": 4.537512302398682, + "learning_rate": 4.334795121282719e-06, + "loss": 0.1168, + "step": 42090 + }, + { + "epoch": 2.1709034183396634, + "grad_norm": 4.205933094024658, + "learning_rate": 4.334658078662464e-06, + "loss": 0.1274, + "step": 42091 + }, + { + "epoch": 2.1709169831795982, + "grad_norm": 5.909159183502197, + "learning_rate": 4.33452103604221e-06, + "loss": 0.1099, + "step": 42092 + }, + { + "epoch": 2.1709305480195336, + "grad_norm": 4.620608806610107, + "learning_rate": 4.334383993421955e-06, + "loss": 0.1504, + "step": 42093 + }, + { + "epoch": 2.1709441128594684, + "grad_norm": 4.291684627532959, + "learning_rate": 4.3342469508017e-06, + "loss": 0.1119, + "step": 42094 + }, + { + "epoch": 2.1709576776994033, + "grad_norm": 5.878332614898682, + "learning_rate": 4.334109908181445e-06, + "loss": 0.2154, + "step": 42095 + }, + { + "epoch": 2.170971242539338, + "grad_norm": 6.4065351486206055, + "learning_rate": 4.33397286556119e-06, + "loss": 0.1143, + "step": 42096 + }, + { + "epoch": 2.170984807379273, + "grad_norm": 4.964664936065674, + "learning_rate": 4.333835822940935e-06, + "loss": 0.0951, + "step": 42097 + }, + { + "epoch": 2.170998372219208, + "grad_norm": 5.08219051361084, + "learning_rate": 4.33369878032068e-06, + "loss": 0.1964, + "step": 42098 + }, + { + "epoch": 2.1710119370591427, + "grad_norm": 5.931459903717041, + "learning_rate": 4.333561737700425e-06, + "loss": 0.2149, + "step": 42099 + }, + { + "epoch": 2.1710255018990776, + "grad_norm": 5.593341827392578, + "learning_rate": 4.33342469508017e-06, + "loss": 0.2007, + "step": 42100 + }, + { + "epoch": 2.1710390667390125, + "grad_norm": 4.3737945556640625, + "learning_rate": 4.333287652459915e-06, + "loss": 0.1332, + "step": 42101 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 3.1653153896331787, + "learning_rate": 4.3331506098396605e-06, + "loss": 0.0732, + "step": 42102 + }, + { + "epoch": 2.171066196418882, + "grad_norm": 4.182131290435791, + "learning_rate": 4.333013567219406e-06, + "loss": 0.1472, + "step": 42103 + }, + { + "epoch": 2.171079761258817, + "grad_norm": 5.28170108795166, + "learning_rate": 4.332876524599151e-06, + "loss": 0.1183, + "step": 42104 + }, + { + "epoch": 2.171093326098752, + "grad_norm": 7.015645503997803, + "learning_rate": 4.332739481978896e-06, + "loss": 0.226, + "step": 42105 + }, + { + "epoch": 2.171106890938687, + "grad_norm": 5.404136657714844, + "learning_rate": 4.33260243935864e-06, + "loss": 0.1541, + "step": 42106 + }, + { + "epoch": 2.1711204557786217, + "grad_norm": 4.550774097442627, + "learning_rate": 4.3324653967383864e-06, + "loss": 0.1504, + "step": 42107 + }, + { + "epoch": 2.1711340206185565, + "grad_norm": 4.6027326583862305, + "learning_rate": 4.332328354118131e-06, + "loss": 0.1082, + "step": 42108 + }, + { + "epoch": 2.1711475854584914, + "grad_norm": 4.969593524932861, + "learning_rate": 4.332191311497877e-06, + "loss": 0.1465, + "step": 42109 + }, + { + "epoch": 2.1711611502984267, + "grad_norm": 3.702033758163452, + "learning_rate": 4.332054268877621e-06, + "loss": 0.1779, + "step": 42110 + }, + { + "epoch": 2.1711747151383616, + "grad_norm": 5.053191184997559, + "learning_rate": 4.331917226257366e-06, + "loss": 0.183, + "step": 42111 + }, + { + "epoch": 2.1711882799782964, + "grad_norm": 4.028971195220947, + "learning_rate": 4.3317801836371115e-06, + "loss": 0.1327, + "step": 42112 + }, + { + "epoch": 2.1712018448182313, + "grad_norm": 4.562773704528809, + "learning_rate": 4.331643141016857e-06, + "loss": 0.1713, + "step": 42113 + }, + { + "epoch": 2.171215409658166, + "grad_norm": 3.5471959114074707, + "learning_rate": 4.331506098396602e-06, + "loss": 0.1063, + "step": 42114 + }, + { + "epoch": 2.171228974498101, + "grad_norm": 4.539816379547119, + "learning_rate": 4.331369055776346e-06, + "loss": 0.1902, + "step": 42115 + }, + { + "epoch": 2.171242539338036, + "grad_norm": 3.761014461517334, + "learning_rate": 4.331232013156092e-06, + "loss": 0.097, + "step": 42116 + }, + { + "epoch": 2.1712561041779708, + "grad_norm": 4.964864730834961, + "learning_rate": 4.3310949705358365e-06, + "loss": 0.1824, + "step": 42117 + }, + { + "epoch": 2.1712696690179056, + "grad_norm": 4.760923385620117, + "learning_rate": 4.3309579279155826e-06, + "loss": 0.1401, + "step": 42118 + }, + { + "epoch": 2.1712832338578405, + "grad_norm": 4.519721508026123, + "learning_rate": 4.330820885295327e-06, + "loss": 0.1395, + "step": 42119 + }, + { + "epoch": 2.1712967986977754, + "grad_norm": 4.951351642608643, + "learning_rate": 4.330683842675072e-06, + "loss": 0.1595, + "step": 42120 + }, + { + "epoch": 2.1713103635377102, + "grad_norm": 14.47066879272461, + "learning_rate": 4.330546800054817e-06, + "loss": 0.1918, + "step": 42121 + }, + { + "epoch": 2.171323928377645, + "grad_norm": 6.943753719329834, + "learning_rate": 4.330409757434562e-06, + "loss": 0.1645, + "step": 42122 + }, + { + "epoch": 2.17133749321758, + "grad_norm": 5.288437843322754, + "learning_rate": 4.330272714814308e-06, + "loss": 0.1502, + "step": 42123 + }, + { + "epoch": 2.171351058057515, + "grad_norm": 5.717405796051025, + "learning_rate": 4.330135672194053e-06, + "loss": 0.1481, + "step": 42124 + }, + { + "epoch": 2.1713646228974497, + "grad_norm": 6.040078163146973, + "learning_rate": 4.329998629573798e-06, + "loss": 0.2177, + "step": 42125 + }, + { + "epoch": 2.1713781877373846, + "grad_norm": 6.7465009689331055, + "learning_rate": 4.329861586953542e-06, + "loss": 0.2799, + "step": 42126 + }, + { + "epoch": 2.1713917525773194, + "grad_norm": 8.173505783081055, + "learning_rate": 4.329724544333288e-06, + "loss": 0.4405, + "step": 42127 + }, + { + "epoch": 2.1714053174172543, + "grad_norm": 5.504087448120117, + "learning_rate": 4.329587501713033e-06, + "loss": 0.1839, + "step": 42128 + }, + { + "epoch": 2.171418882257189, + "grad_norm": 5.55401611328125, + "learning_rate": 4.329450459092779e-06, + "loss": 0.2571, + "step": 42129 + }, + { + "epoch": 2.171432447097124, + "grad_norm": 7.9362640380859375, + "learning_rate": 4.329313416472523e-06, + "loss": 0.2629, + "step": 42130 + }, + { + "epoch": 2.1714460119370593, + "grad_norm": 5.573113918304443, + "learning_rate": 4.329176373852268e-06, + "loss": 0.2473, + "step": 42131 + }, + { + "epoch": 2.171459576776994, + "grad_norm": 3.915943145751953, + "learning_rate": 4.329039331232013e-06, + "loss": 0.1624, + "step": 42132 + }, + { + "epoch": 2.171473141616929, + "grad_norm": 5.951784610748291, + "learning_rate": 4.3289022886117585e-06, + "loss": 0.2716, + "step": 42133 + }, + { + "epoch": 2.171486706456864, + "grad_norm": 5.667820453643799, + "learning_rate": 4.328765245991504e-06, + "loss": 0.1989, + "step": 42134 + }, + { + "epoch": 2.171500271296799, + "grad_norm": 8.172859191894531, + "learning_rate": 4.328628203371249e-06, + "loss": 0.2704, + "step": 42135 + }, + { + "epoch": 2.1715138361367337, + "grad_norm": 5.047407627105713, + "learning_rate": 4.328491160750994e-06, + "loss": 0.2394, + "step": 42136 + }, + { + "epoch": 2.1715274009766685, + "grad_norm": 4.061216831207275, + "learning_rate": 4.328354118130739e-06, + "loss": 0.1443, + "step": 42137 + }, + { + "epoch": 2.1715409658166034, + "grad_norm": 5.22228479385376, + "learning_rate": 4.3282170755104844e-06, + "loss": 0.2853, + "step": 42138 + }, + { + "epoch": 2.1715545306565383, + "grad_norm": 4.8125457763671875, + "learning_rate": 4.32808003289023e-06, + "loss": 0.2046, + "step": 42139 + }, + { + "epoch": 2.171568095496473, + "grad_norm": 4.5392327308654785, + "learning_rate": 4.327942990269974e-06, + "loss": 0.2599, + "step": 42140 + }, + { + "epoch": 2.171581660336408, + "grad_norm": 6.667728900909424, + "learning_rate": 4.327805947649719e-06, + "loss": 0.3503, + "step": 42141 + }, + { + "epoch": 2.171595225176343, + "grad_norm": 6.126697540283203, + "learning_rate": 4.327668905029464e-06, + "loss": 0.2355, + "step": 42142 + }, + { + "epoch": 2.1716087900162777, + "grad_norm": 6.65032958984375, + "learning_rate": 4.3275318624092095e-06, + "loss": 0.3207, + "step": 42143 + }, + { + "epoch": 2.1716223548562126, + "grad_norm": 5.543915271759033, + "learning_rate": 4.327394819788955e-06, + "loss": 0.1815, + "step": 42144 + }, + { + "epoch": 2.1716359196961474, + "grad_norm": 4.55804967880249, + "learning_rate": 4.3272577771687e-06, + "loss": 0.174, + "step": 42145 + }, + { + "epoch": 2.1716494845360823, + "grad_norm": 6.008541584014893, + "learning_rate": 4.327120734548445e-06, + "loss": 0.2837, + "step": 42146 + }, + { + "epoch": 2.171663049376017, + "grad_norm": 5.476977348327637, + "learning_rate": 4.32698369192819e-06, + "loss": 0.1864, + "step": 42147 + }, + { + "epoch": 2.1716766142159525, + "grad_norm": 5.187090873718262, + "learning_rate": 4.326846649307935e-06, + "loss": 0.1319, + "step": 42148 + }, + { + "epoch": 2.1716901790558873, + "grad_norm": 5.8305463790893555, + "learning_rate": 4.3267096066876806e-06, + "loss": 0.2361, + "step": 42149 + }, + { + "epoch": 2.171703743895822, + "grad_norm": 5.3002543449401855, + "learning_rate": 4.326572564067426e-06, + "loss": 0.1915, + "step": 42150 + }, + { + "epoch": 2.171717308735757, + "grad_norm": 6.190426349639893, + "learning_rate": 4.32643552144717e-06, + "loss": 0.2818, + "step": 42151 + }, + { + "epoch": 2.171730873575692, + "grad_norm": 3.44075345993042, + "learning_rate": 4.326298478826916e-06, + "loss": 0.0894, + "step": 42152 + }, + { + "epoch": 2.171744438415627, + "grad_norm": 7.284088134765625, + "learning_rate": 4.3261614362066604e-06, + "loss": 0.2693, + "step": 42153 + }, + { + "epoch": 2.1717580032555617, + "grad_norm": 5.044764041900635, + "learning_rate": 4.326024393586406e-06, + "loss": 0.2036, + "step": 42154 + }, + { + "epoch": 2.1717715680954965, + "grad_norm": 5.490067481994629, + "learning_rate": 4.325887350966151e-06, + "loss": 0.2611, + "step": 42155 + }, + { + "epoch": 2.1717851329354314, + "grad_norm": 6.187704086303711, + "learning_rate": 4.325750308345896e-06, + "loss": 0.2946, + "step": 42156 + }, + { + "epoch": 2.1717986977753663, + "grad_norm": 5.291778564453125, + "learning_rate": 4.325613265725641e-06, + "loss": 0.1757, + "step": 42157 + }, + { + "epoch": 2.171812262615301, + "grad_norm": 6.783865928649902, + "learning_rate": 4.325476223105386e-06, + "loss": 0.2874, + "step": 42158 + }, + { + "epoch": 2.171825827455236, + "grad_norm": 5.735298156738281, + "learning_rate": 4.3253391804851315e-06, + "loss": 0.2131, + "step": 42159 + }, + { + "epoch": 2.171839392295171, + "grad_norm": 4.205427646636963, + "learning_rate": 4.325202137864876e-06, + "loss": 0.1743, + "step": 42160 + }, + { + "epoch": 2.1718529571351057, + "grad_norm": 4.040684700012207, + "learning_rate": 4.325065095244622e-06, + "loss": 0.1128, + "step": 42161 + }, + { + "epoch": 2.1718665219750406, + "grad_norm": 4.298649787902832, + "learning_rate": 4.324928052624366e-06, + "loss": 0.1318, + "step": 42162 + }, + { + "epoch": 2.1718800868149755, + "grad_norm": 5.803276538848877, + "learning_rate": 4.324791010004112e-06, + "loss": 0.1744, + "step": 42163 + }, + { + "epoch": 2.1718936516549103, + "grad_norm": 5.706297397613525, + "learning_rate": 4.3246539673838565e-06, + "loss": 0.2186, + "step": 42164 + }, + { + "epoch": 2.171907216494845, + "grad_norm": 6.079384803771973, + "learning_rate": 4.324516924763602e-06, + "loss": 0.1857, + "step": 42165 + }, + { + "epoch": 2.17192078133478, + "grad_norm": 6.141973495483398, + "learning_rate": 4.324379882143347e-06, + "loss": 0.1764, + "step": 42166 + }, + { + "epoch": 2.171934346174715, + "grad_norm": 4.198980808258057, + "learning_rate": 4.324242839523092e-06, + "loss": 0.136, + "step": 42167 + }, + { + "epoch": 2.1719479110146502, + "grad_norm": 4.95920467376709, + "learning_rate": 4.324105796902837e-06, + "loss": 0.2165, + "step": 42168 + }, + { + "epoch": 2.171961475854585, + "grad_norm": 5.6593017578125, + "learning_rate": 4.323968754282582e-06, + "loss": 0.2732, + "step": 42169 + }, + { + "epoch": 2.17197504069452, + "grad_norm": 4.349479675292969, + "learning_rate": 4.323831711662328e-06, + "loss": 0.189, + "step": 42170 + }, + { + "epoch": 2.171988605534455, + "grad_norm": 4.745283603668213, + "learning_rate": 4.323694669042072e-06, + "loss": 0.2441, + "step": 42171 + }, + { + "epoch": 2.1720021703743897, + "grad_norm": 6.171271800994873, + "learning_rate": 4.323557626421818e-06, + "loss": 0.283, + "step": 42172 + }, + { + "epoch": 2.1720157352143246, + "grad_norm": 4.029733180999756, + "learning_rate": 4.323420583801562e-06, + "loss": 0.2419, + "step": 42173 + }, + { + "epoch": 2.1720293000542594, + "grad_norm": 4.855224609375, + "learning_rate": 4.3232835411813075e-06, + "loss": 0.1894, + "step": 42174 + }, + { + "epoch": 2.1720428648941943, + "grad_norm": 5.396213531494141, + "learning_rate": 4.323146498561053e-06, + "loss": 0.2277, + "step": 42175 + }, + { + "epoch": 2.172056429734129, + "grad_norm": 5.868607521057129, + "learning_rate": 4.323009455940798e-06, + "loss": 0.3741, + "step": 42176 + }, + { + "epoch": 2.172069994574064, + "grad_norm": 4.519923210144043, + "learning_rate": 4.322872413320543e-06, + "loss": 0.2429, + "step": 42177 + }, + { + "epoch": 2.172083559413999, + "grad_norm": 4.843686580657959, + "learning_rate": 4.322735370700288e-06, + "loss": 0.2383, + "step": 42178 + }, + { + "epoch": 2.1720971242539338, + "grad_norm": 4.703590393066406, + "learning_rate": 4.322598328080033e-06, + "loss": 0.1104, + "step": 42179 + }, + { + "epoch": 2.1721106890938686, + "grad_norm": 6.427975654602051, + "learning_rate": 4.3224612854597786e-06, + "loss": 0.263, + "step": 42180 + }, + { + "epoch": 2.1721242539338035, + "grad_norm": 4.980400562286377, + "learning_rate": 4.322324242839524e-06, + "loss": 0.2009, + "step": 42181 + }, + { + "epoch": 2.1721378187737383, + "grad_norm": 5.2430267333984375, + "learning_rate": 4.322187200219268e-06, + "loss": 0.1666, + "step": 42182 + }, + { + "epoch": 2.172151383613673, + "grad_norm": 4.7603960037231445, + "learning_rate": 4.322050157599014e-06, + "loss": 0.2182, + "step": 42183 + }, + { + "epoch": 2.172164948453608, + "grad_norm": 5.014017105102539, + "learning_rate": 4.3219131149787584e-06, + "loss": 0.2347, + "step": 42184 + }, + { + "epoch": 2.172178513293543, + "grad_norm": 3.7276346683502197, + "learning_rate": 4.321776072358504e-06, + "loss": 0.2174, + "step": 42185 + }, + { + "epoch": 2.1721920781334783, + "grad_norm": 4.809028148651123, + "learning_rate": 4.321639029738249e-06, + "loss": 0.1864, + "step": 42186 + }, + { + "epoch": 2.172205642973413, + "grad_norm": 3.7025504112243652, + "learning_rate": 4.321501987117994e-06, + "loss": 0.1732, + "step": 42187 + }, + { + "epoch": 2.172219207813348, + "grad_norm": 6.243922710418701, + "learning_rate": 4.321364944497739e-06, + "loss": 0.1906, + "step": 42188 + }, + { + "epoch": 2.172232772653283, + "grad_norm": 5.507561683654785, + "learning_rate": 4.321227901877484e-06, + "loss": 0.2862, + "step": 42189 + }, + { + "epoch": 2.1722463374932177, + "grad_norm": 7.614375591278076, + "learning_rate": 4.3210908592572295e-06, + "loss": 0.207, + "step": 42190 + }, + { + "epoch": 2.1722599023331526, + "grad_norm": 5.036942005157471, + "learning_rate": 4.320953816636975e-06, + "loss": 0.2206, + "step": 42191 + }, + { + "epoch": 2.1722734671730874, + "grad_norm": 4.695590972900391, + "learning_rate": 4.32081677401672e-06, + "loss": 0.2279, + "step": 42192 + }, + { + "epoch": 2.1722870320130223, + "grad_norm": 4.307942867279053, + "learning_rate": 4.320679731396465e-06, + "loss": 0.2573, + "step": 42193 + }, + { + "epoch": 2.172300596852957, + "grad_norm": 4.214199066162109, + "learning_rate": 4.320542688776209e-06, + "loss": 0.2087, + "step": 42194 + }, + { + "epoch": 2.172314161692892, + "grad_norm": 4.5059099197387695, + "learning_rate": 4.3204056461559545e-06, + "loss": 0.2925, + "step": 42195 + }, + { + "epoch": 2.172327726532827, + "grad_norm": 6.294857501983643, + "learning_rate": 4.3202686035357e-06, + "loss": 0.2818, + "step": 42196 + }, + { + "epoch": 2.1723412913727618, + "grad_norm": 4.544929027557373, + "learning_rate": 4.320131560915445e-06, + "loss": 0.2942, + "step": 42197 + }, + { + "epoch": 2.1723548562126966, + "grad_norm": 3.7025604248046875, + "learning_rate": 4.31999451829519e-06, + "loss": 0.1208, + "step": 42198 + }, + { + "epoch": 2.1723684210526315, + "grad_norm": 3.5103225708007812, + "learning_rate": 4.319857475674935e-06, + "loss": 0.1344, + "step": 42199 + }, + { + "epoch": 2.1723819858925664, + "grad_norm": 3.781883478164673, + "learning_rate": 4.3197204330546804e-06, + "loss": 0.1389, + "step": 42200 + }, + { + "epoch": 2.1723955507325012, + "grad_norm": 4.642508506774902, + "learning_rate": 4.319583390434426e-06, + "loss": 0.154, + "step": 42201 + }, + { + "epoch": 2.172409115572436, + "grad_norm": 3.8027150630950928, + "learning_rate": 4.319446347814171e-06, + "loss": 0.1633, + "step": 42202 + }, + { + "epoch": 2.172422680412371, + "grad_norm": 5.205086708068848, + "learning_rate": 4.319309305193916e-06, + "loss": 0.2469, + "step": 42203 + }, + { + "epoch": 2.172436245252306, + "grad_norm": 4.5110321044921875, + "learning_rate": 4.319172262573661e-06, + "loss": 0.1434, + "step": 42204 + }, + { + "epoch": 2.1724498100922407, + "grad_norm": 6.005764961242676, + "learning_rate": 4.3190352199534055e-06, + "loss": 0.3102, + "step": 42205 + }, + { + "epoch": 2.172463374932176, + "grad_norm": 4.305675983428955, + "learning_rate": 4.3188981773331515e-06, + "loss": 0.1384, + "step": 42206 + }, + { + "epoch": 2.172476939772111, + "grad_norm": 6.183642387390137, + "learning_rate": 4.318761134712896e-06, + "loss": 0.1913, + "step": 42207 + }, + { + "epoch": 2.1724905046120457, + "grad_norm": 5.42783784866333, + "learning_rate": 4.318624092092641e-06, + "loss": 0.2613, + "step": 42208 + }, + { + "epoch": 2.1725040694519806, + "grad_norm": 5.962738513946533, + "learning_rate": 4.318487049472386e-06, + "loss": 0.2193, + "step": 42209 + }, + { + "epoch": 2.1725176342919155, + "grad_norm": 3.7914011478424072, + "learning_rate": 4.318350006852131e-06, + "loss": 0.0913, + "step": 42210 + }, + { + "epoch": 2.1725311991318503, + "grad_norm": 6.639081001281738, + "learning_rate": 4.3182129642318766e-06, + "loss": 0.2608, + "step": 42211 + }, + { + "epoch": 2.172544763971785, + "grad_norm": 5.396496772766113, + "learning_rate": 4.318075921611622e-06, + "loss": 0.217, + "step": 42212 + }, + { + "epoch": 2.17255832881172, + "grad_norm": 4.1600661277771, + "learning_rate": 4.317938878991367e-06, + "loss": 0.1324, + "step": 42213 + }, + { + "epoch": 2.172571893651655, + "grad_norm": 3.5070905685424805, + "learning_rate": 4.317801836371111e-06, + "loss": 0.1202, + "step": 42214 + }, + { + "epoch": 2.17258545849159, + "grad_norm": 5.513631820678711, + "learning_rate": 4.317664793750857e-06, + "loss": 0.1912, + "step": 42215 + }, + { + "epoch": 2.1725990233315247, + "grad_norm": 4.9377593994140625, + "learning_rate": 4.317527751130602e-06, + "loss": 0.1818, + "step": 42216 + }, + { + "epoch": 2.1726125881714595, + "grad_norm": 5.839482307434082, + "learning_rate": 4.317390708510348e-06, + "loss": 0.2123, + "step": 42217 + }, + { + "epoch": 2.1726261530113944, + "grad_norm": 5.189225673675537, + "learning_rate": 4.317253665890092e-06, + "loss": 0.1995, + "step": 42218 + }, + { + "epoch": 2.1726397178513293, + "grad_norm": 5.924220561981201, + "learning_rate": 4.317116623269837e-06, + "loss": 0.1922, + "step": 42219 + }, + { + "epoch": 2.172653282691264, + "grad_norm": 4.89639139175415, + "learning_rate": 4.316979580649582e-06, + "loss": 0.2811, + "step": 42220 + }, + { + "epoch": 2.172666847531199, + "grad_norm": 4.716000556945801, + "learning_rate": 4.3168425380293275e-06, + "loss": 0.281, + "step": 42221 + }, + { + "epoch": 2.172680412371134, + "grad_norm": 4.347719192504883, + "learning_rate": 4.316705495409073e-06, + "loss": 0.1595, + "step": 42222 + }, + { + "epoch": 2.1726939772110687, + "grad_norm": 5.676243782043457, + "learning_rate": 4.316568452788817e-06, + "loss": 0.1769, + "step": 42223 + }, + { + "epoch": 2.172707542051004, + "grad_norm": 6.6371073722839355, + "learning_rate": 4.316431410168563e-06, + "loss": 0.1766, + "step": 42224 + }, + { + "epoch": 2.172721106890939, + "grad_norm": 4.049071311950684, + "learning_rate": 4.316294367548307e-06, + "loss": 0.1378, + "step": 42225 + }, + { + "epoch": 2.1727346717308738, + "grad_norm": 4.768519878387451, + "learning_rate": 4.316157324928053e-06, + "loss": 0.1181, + "step": 42226 + }, + { + "epoch": 2.1727482365708086, + "grad_norm": 3.003748893737793, + "learning_rate": 4.316020282307798e-06, + "loss": 0.1112, + "step": 42227 + }, + { + "epoch": 2.1727618014107435, + "grad_norm": 6.286721706390381, + "learning_rate": 4.315883239687543e-06, + "loss": 0.1775, + "step": 42228 + }, + { + "epoch": 2.1727753662506784, + "grad_norm": 5.054905891418457, + "learning_rate": 4.315746197067288e-06, + "loss": 0.1525, + "step": 42229 + }, + { + "epoch": 2.172788931090613, + "grad_norm": 4.832767486572266, + "learning_rate": 4.315609154447033e-06, + "loss": 0.1473, + "step": 42230 + }, + { + "epoch": 2.172802495930548, + "grad_norm": 6.431153774261475, + "learning_rate": 4.3154721118267784e-06, + "loss": 0.204, + "step": 42231 + }, + { + "epoch": 2.172816060770483, + "grad_norm": 3.394451379776001, + "learning_rate": 4.315335069206524e-06, + "loss": 0.1185, + "step": 42232 + }, + { + "epoch": 2.172829625610418, + "grad_norm": 4.106197834014893, + "learning_rate": 4.315198026586269e-06, + "loss": 0.1689, + "step": 42233 + }, + { + "epoch": 2.1728431904503527, + "grad_norm": 5.7544145584106445, + "learning_rate": 4.315060983966014e-06, + "loss": 0.2287, + "step": 42234 + }, + { + "epoch": 2.1728567552902875, + "grad_norm": 5.9794392585754395, + "learning_rate": 4.314923941345759e-06, + "loss": 0.1413, + "step": 42235 + }, + { + "epoch": 2.1728703201302224, + "grad_norm": 4.414675712585449, + "learning_rate": 4.3147868987255035e-06, + "loss": 0.1738, + "step": 42236 + }, + { + "epoch": 2.1728838849701573, + "grad_norm": 5.432610988616943, + "learning_rate": 4.3146498561052495e-06, + "loss": 0.2092, + "step": 42237 + }, + { + "epoch": 2.172897449810092, + "grad_norm": 9.123246192932129, + "learning_rate": 4.314512813484994e-06, + "loss": 0.2177, + "step": 42238 + }, + { + "epoch": 2.172911014650027, + "grad_norm": 6.828169345855713, + "learning_rate": 4.314375770864739e-06, + "loss": 0.2552, + "step": 42239 + }, + { + "epoch": 2.172924579489962, + "grad_norm": 5.276933670043945, + "learning_rate": 4.314238728244484e-06, + "loss": 0.1911, + "step": 42240 + }, + { + "epoch": 2.1729381443298967, + "grad_norm": 6.400788307189941, + "learning_rate": 4.314101685624229e-06, + "loss": 0.3185, + "step": 42241 + }, + { + "epoch": 2.1729517091698316, + "grad_norm": 4.9745683670043945, + "learning_rate": 4.3139646430039746e-06, + "loss": 0.2503, + "step": 42242 + }, + { + "epoch": 2.1729652740097665, + "grad_norm": 3.4889419078826904, + "learning_rate": 4.31382760038372e-06, + "loss": 0.1744, + "step": 42243 + }, + { + "epoch": 2.1729788388497018, + "grad_norm": 6.462070941925049, + "learning_rate": 4.313690557763465e-06, + "loss": 0.2431, + "step": 42244 + }, + { + "epoch": 2.1729924036896366, + "grad_norm": 6.693725109100342, + "learning_rate": 4.31355351514321e-06, + "loss": 0.2486, + "step": 42245 + }, + { + "epoch": 2.1730059685295715, + "grad_norm": 6.2772040367126465, + "learning_rate": 4.313416472522955e-06, + "loss": 0.347, + "step": 42246 + }, + { + "epoch": 2.1730195333695064, + "grad_norm": 5.111846923828125, + "learning_rate": 4.3132794299027005e-06, + "loss": 0.2067, + "step": 42247 + }, + { + "epoch": 2.1730330982094412, + "grad_norm": 5.574317932128906, + "learning_rate": 4.313142387282445e-06, + "loss": 0.3076, + "step": 42248 + }, + { + "epoch": 2.173046663049376, + "grad_norm": 4.960054874420166, + "learning_rate": 4.313005344662191e-06, + "loss": 0.2179, + "step": 42249 + }, + { + "epoch": 2.173060227889311, + "grad_norm": 4.876624584197998, + "learning_rate": 4.312868302041935e-06, + "loss": 0.1718, + "step": 42250 + }, + { + "epoch": 2.173073792729246, + "grad_norm": 5.429530143737793, + "learning_rate": 4.31273125942168e-06, + "loss": 0.2566, + "step": 42251 + }, + { + "epoch": 2.1730873575691807, + "grad_norm": 6.4192891120910645, + "learning_rate": 4.3125942168014255e-06, + "loss": 0.3268, + "step": 42252 + }, + { + "epoch": 2.1731009224091156, + "grad_norm": 4.314910888671875, + "learning_rate": 4.312457174181171e-06, + "loss": 0.2032, + "step": 42253 + }, + { + "epoch": 2.1731144872490504, + "grad_norm": 4.967231750488281, + "learning_rate": 4.312320131560916e-06, + "loss": 0.2175, + "step": 42254 + }, + { + "epoch": 2.1731280520889853, + "grad_norm": 4.231474876403809, + "learning_rate": 4.312183088940661e-06, + "loss": 0.1583, + "step": 42255 + }, + { + "epoch": 2.17314161692892, + "grad_norm": 4.947665691375732, + "learning_rate": 4.312046046320406e-06, + "loss": 0.1893, + "step": 42256 + }, + { + "epoch": 2.173155181768855, + "grad_norm": 6.061789512634277, + "learning_rate": 4.3119090037001506e-06, + "loss": 0.2331, + "step": 42257 + }, + { + "epoch": 2.17316874660879, + "grad_norm": 4.877631187438965, + "learning_rate": 4.311771961079897e-06, + "loss": 0.2028, + "step": 42258 + }, + { + "epoch": 2.1731823114487248, + "grad_norm": 6.600836277008057, + "learning_rate": 4.311634918459641e-06, + "loss": 0.2161, + "step": 42259 + }, + { + "epoch": 2.1731958762886596, + "grad_norm": 4.756117820739746, + "learning_rate": 4.311497875839387e-06, + "loss": 0.1862, + "step": 42260 + }, + { + "epoch": 2.1732094411285945, + "grad_norm": 4.440277576446533, + "learning_rate": 4.311360833219131e-06, + "loss": 0.2602, + "step": 42261 + }, + { + "epoch": 2.17322300596853, + "grad_norm": 3.938197374343872, + "learning_rate": 4.3112237905988765e-06, + "loss": 0.1707, + "step": 42262 + }, + { + "epoch": 2.1732365708084647, + "grad_norm": 6.184261798858643, + "learning_rate": 4.311086747978622e-06, + "loss": 0.3491, + "step": 42263 + }, + { + "epoch": 2.1732501356483995, + "grad_norm": 6.031040191650391, + "learning_rate": 4.310949705358367e-06, + "loss": 0.1817, + "step": 42264 + }, + { + "epoch": 2.1732637004883344, + "grad_norm": 5.646583557128906, + "learning_rate": 4.310812662738112e-06, + "loss": 0.1974, + "step": 42265 + }, + { + "epoch": 2.1732772653282693, + "grad_norm": 6.454771041870117, + "learning_rate": 4.310675620117857e-06, + "loss": 0.2461, + "step": 42266 + }, + { + "epoch": 2.173290830168204, + "grad_norm": 5.6007232666015625, + "learning_rate": 4.310538577497602e-06, + "loss": 0.1842, + "step": 42267 + }, + { + "epoch": 2.173304395008139, + "grad_norm": 5.729068756103516, + "learning_rate": 4.310401534877347e-06, + "loss": 0.2187, + "step": 42268 + }, + { + "epoch": 2.173317959848074, + "grad_norm": 4.837162494659424, + "learning_rate": 4.310264492257093e-06, + "loss": 0.1413, + "step": 42269 + }, + { + "epoch": 2.1733315246880087, + "grad_norm": 4.943849563598633, + "learning_rate": 4.310127449636837e-06, + "loss": 0.2897, + "step": 42270 + }, + { + "epoch": 2.1733450895279436, + "grad_norm": 5.879073143005371, + "learning_rate": 4.309990407016583e-06, + "loss": 0.3233, + "step": 42271 + }, + { + "epoch": 2.1733586543678785, + "grad_norm": 4.923727989196777, + "learning_rate": 4.309853364396327e-06, + "loss": 0.2276, + "step": 42272 + }, + { + "epoch": 2.1733722192078133, + "grad_norm": 5.438854217529297, + "learning_rate": 4.3097163217760726e-06, + "loss": 0.1465, + "step": 42273 + }, + { + "epoch": 2.173385784047748, + "grad_norm": 5.454128742218018, + "learning_rate": 4.309579279155818e-06, + "loss": 0.1857, + "step": 42274 + }, + { + "epoch": 2.173399348887683, + "grad_norm": 5.588564872741699, + "learning_rate": 4.309442236535563e-06, + "loss": 0.2217, + "step": 42275 + }, + { + "epoch": 2.173412913727618, + "grad_norm": 3.685896873474121, + "learning_rate": 4.309305193915308e-06, + "loss": 0.1415, + "step": 42276 + }, + { + "epoch": 2.1734264785675528, + "grad_norm": 4.899096488952637, + "learning_rate": 4.3091681512950524e-06, + "loss": 0.1606, + "step": 42277 + }, + { + "epoch": 2.1734400434074876, + "grad_norm": 6.982418537139893, + "learning_rate": 4.3090311086747985e-06, + "loss": 0.169, + "step": 42278 + }, + { + "epoch": 2.1734536082474225, + "grad_norm": 5.390077114105225, + "learning_rate": 4.308894066054543e-06, + "loss": 0.2268, + "step": 42279 + }, + { + "epoch": 2.1734671730873574, + "grad_norm": 4.439104080200195, + "learning_rate": 4.308757023434289e-06, + "loss": 0.1724, + "step": 42280 + }, + { + "epoch": 2.1734807379272922, + "grad_norm": 5.256430625915527, + "learning_rate": 4.308619980814033e-06, + "loss": 0.1335, + "step": 42281 + }, + { + "epoch": 2.1734943027672275, + "grad_norm": 5.017771244049072, + "learning_rate": 4.308482938193778e-06, + "loss": 0.2074, + "step": 42282 + }, + { + "epoch": 2.1735078676071624, + "grad_norm": 5.113106727600098, + "learning_rate": 4.3083458955735235e-06, + "loss": 0.1924, + "step": 42283 + }, + { + "epoch": 2.1735214324470973, + "grad_norm": 7.428164482116699, + "learning_rate": 4.308208852953269e-06, + "loss": 0.2587, + "step": 42284 + }, + { + "epoch": 2.173534997287032, + "grad_norm": 3.908118486404419, + "learning_rate": 4.308071810333014e-06, + "loss": 0.1688, + "step": 42285 + }, + { + "epoch": 2.173548562126967, + "grad_norm": 5.168664455413818, + "learning_rate": 4.307934767712759e-06, + "loss": 0.1935, + "step": 42286 + }, + { + "epoch": 2.173562126966902, + "grad_norm": 7.561816215515137, + "learning_rate": 4.307797725092504e-06, + "loss": 0.2004, + "step": 42287 + }, + { + "epoch": 2.1735756918068367, + "grad_norm": 6.86665153503418, + "learning_rate": 4.307660682472249e-06, + "loss": 0.2741, + "step": 42288 + }, + { + "epoch": 2.1735892566467716, + "grad_norm": 6.985206604003906, + "learning_rate": 4.307523639851995e-06, + "loss": 0.2404, + "step": 42289 + }, + { + "epoch": 2.1736028214867065, + "grad_norm": 6.366512775421143, + "learning_rate": 4.30738659723174e-06, + "loss": 0.2059, + "step": 42290 + }, + { + "epoch": 2.1736163863266413, + "grad_norm": 5.965982437133789, + "learning_rate": 4.307249554611485e-06, + "loss": 0.2039, + "step": 42291 + }, + { + "epoch": 2.173629951166576, + "grad_norm": 4.407238960266113, + "learning_rate": 4.307112511991229e-06, + "loss": 0.1816, + "step": 42292 + }, + { + "epoch": 2.173643516006511, + "grad_norm": 4.903155326843262, + "learning_rate": 4.3069754693709745e-06, + "loss": 0.1603, + "step": 42293 + }, + { + "epoch": 2.173657080846446, + "grad_norm": 5.947366714477539, + "learning_rate": 4.30683842675072e-06, + "loss": 0.1732, + "step": 42294 + }, + { + "epoch": 2.173670645686381, + "grad_norm": 6.589843273162842, + "learning_rate": 4.306701384130465e-06, + "loss": 0.2593, + "step": 42295 + }, + { + "epoch": 2.1736842105263157, + "grad_norm": 3.809783935546875, + "learning_rate": 4.30656434151021e-06, + "loss": 0.1501, + "step": 42296 + }, + { + "epoch": 2.1736977753662505, + "grad_norm": 5.167403221130371, + "learning_rate": 4.306427298889955e-06, + "loss": 0.1762, + "step": 42297 + }, + { + "epoch": 2.1737113402061854, + "grad_norm": 6.108717441558838, + "learning_rate": 4.3062902562697e-06, + "loss": 0.1532, + "step": 42298 + }, + { + "epoch": 2.1737249050461203, + "grad_norm": 5.685377597808838, + "learning_rate": 4.3061532136494455e-06, + "loss": 0.2414, + "step": 42299 + }, + { + "epoch": 2.1737384698860556, + "grad_norm": 6.315914154052734, + "learning_rate": 4.306016171029191e-06, + "loss": 0.2014, + "step": 42300 + }, + { + "epoch": 2.1737520347259904, + "grad_norm": 5.39769172668457, + "learning_rate": 4.305879128408936e-06, + "loss": 0.2409, + "step": 42301 + }, + { + "epoch": 2.1737655995659253, + "grad_norm": 5.523327827453613, + "learning_rate": 4.30574208578868e-06, + "loss": 0.2056, + "step": 42302 + }, + { + "epoch": 2.17377916440586, + "grad_norm": 4.844104766845703, + "learning_rate": 4.305605043168426e-06, + "loss": 0.1677, + "step": 42303 + }, + { + "epoch": 2.173792729245795, + "grad_norm": 6.289772987365723, + "learning_rate": 4.305468000548171e-06, + "loss": 0.1928, + "step": 42304 + }, + { + "epoch": 2.17380629408573, + "grad_norm": 6.601363182067871, + "learning_rate": 4.305330957927916e-06, + "loss": 0.3102, + "step": 42305 + }, + { + "epoch": 2.1738198589256648, + "grad_norm": 7.540712356567383, + "learning_rate": 4.305193915307661e-06, + "loss": 0.4744, + "step": 42306 + }, + { + "epoch": 2.1738334237655996, + "grad_norm": 6.840747356414795, + "learning_rate": 4.305056872687406e-06, + "loss": 0.2356, + "step": 42307 + }, + { + "epoch": 2.1738469886055345, + "grad_norm": 4.528549671173096, + "learning_rate": 4.304919830067151e-06, + "loss": 0.1526, + "step": 42308 + }, + { + "epoch": 2.1738605534454694, + "grad_norm": 6.072309970855713, + "learning_rate": 4.3047827874468965e-06, + "loss": 0.2691, + "step": 42309 + }, + { + "epoch": 2.1738741182854042, + "grad_norm": 6.762880802154541, + "learning_rate": 4.304645744826642e-06, + "loss": 0.2369, + "step": 42310 + }, + { + "epoch": 2.173887683125339, + "grad_norm": 3.7028768062591553, + "learning_rate": 4.304508702206386e-06, + "loss": 0.134, + "step": 42311 + }, + { + "epoch": 2.173901247965274, + "grad_norm": 5.843960285186768, + "learning_rate": 4.304371659586132e-06, + "loss": 0.2602, + "step": 42312 + }, + { + "epoch": 2.173914812805209, + "grad_norm": 7.958972454071045, + "learning_rate": 4.304234616965876e-06, + "loss": 0.3409, + "step": 42313 + }, + { + "epoch": 2.1739283776451437, + "grad_norm": 5.892810344696045, + "learning_rate": 4.304097574345622e-06, + "loss": 0.2682, + "step": 42314 + }, + { + "epoch": 2.1739419424850785, + "grad_norm": 6.121404647827148, + "learning_rate": 4.303960531725367e-06, + "loss": 0.2254, + "step": 42315 + }, + { + "epoch": 2.1739555073250134, + "grad_norm": 4.580174446105957, + "learning_rate": 4.303823489105112e-06, + "loss": 0.122, + "step": 42316 + }, + { + "epoch": 2.1739690721649483, + "grad_norm": 5.289515018463135, + "learning_rate": 4.303686446484857e-06, + "loss": 0.1528, + "step": 42317 + }, + { + "epoch": 2.173982637004883, + "grad_norm": 6.610135078430176, + "learning_rate": 4.303549403864602e-06, + "loss": 0.2608, + "step": 42318 + }, + { + "epoch": 2.173996201844818, + "grad_norm": 5.416635990142822, + "learning_rate": 4.303412361244347e-06, + "loss": 0.293, + "step": 42319 + }, + { + "epoch": 2.1740097666847533, + "grad_norm": 6.1992692947387695, + "learning_rate": 4.303275318624093e-06, + "loss": 0.2672, + "step": 42320 + }, + { + "epoch": 2.174023331524688, + "grad_norm": 4.140466213226318, + "learning_rate": 4.303138276003838e-06, + "loss": 0.1788, + "step": 42321 + }, + { + "epoch": 2.174036896364623, + "grad_norm": 6.261650085449219, + "learning_rate": 4.303001233383582e-06, + "loss": 0.231, + "step": 42322 + }, + { + "epoch": 2.174050461204558, + "grad_norm": 6.246768951416016, + "learning_rate": 4.302864190763328e-06, + "loss": 0.2872, + "step": 42323 + }, + { + "epoch": 2.174064026044493, + "grad_norm": 4.749012470245361, + "learning_rate": 4.3027271481430725e-06, + "loss": 0.1566, + "step": 42324 + }, + { + "epoch": 2.1740775908844276, + "grad_norm": 5.548511505126953, + "learning_rate": 4.3025901055228185e-06, + "loss": 0.1803, + "step": 42325 + }, + { + "epoch": 2.1740911557243625, + "grad_norm": 5.122291564941406, + "learning_rate": 4.302453062902563e-06, + "loss": 0.2393, + "step": 42326 + }, + { + "epoch": 2.1741047205642974, + "grad_norm": 5.258899688720703, + "learning_rate": 4.302316020282308e-06, + "loss": 0.2223, + "step": 42327 + }, + { + "epoch": 2.1741182854042322, + "grad_norm": 6.3580708503723145, + "learning_rate": 4.302178977662053e-06, + "loss": 0.2539, + "step": 42328 + }, + { + "epoch": 2.174131850244167, + "grad_norm": 6.325343132019043, + "learning_rate": 4.302041935041798e-06, + "loss": 0.3289, + "step": 42329 + }, + { + "epoch": 2.174145415084102, + "grad_norm": 7.4736647605896, + "learning_rate": 4.3019048924215435e-06, + "loss": 0.2927, + "step": 42330 + }, + { + "epoch": 2.174158979924037, + "grad_norm": 5.28878116607666, + "learning_rate": 4.301767849801289e-06, + "loss": 0.3052, + "step": 42331 + }, + { + "epoch": 2.1741725447639717, + "grad_norm": 4.8103227615356445, + "learning_rate": 4.301630807181034e-06, + "loss": 0.1731, + "step": 42332 + }, + { + "epoch": 2.1741861096039066, + "grad_norm": 5.4647674560546875, + "learning_rate": 4.301493764560778e-06, + "loss": 0.2708, + "step": 42333 + }, + { + "epoch": 2.1741996744438414, + "grad_norm": 7.691145420074463, + "learning_rate": 4.301356721940524e-06, + "loss": 0.2315, + "step": 42334 + }, + { + "epoch": 2.1742132392837763, + "grad_norm": 5.895251274108887, + "learning_rate": 4.301219679320269e-06, + "loss": 0.3073, + "step": 42335 + }, + { + "epoch": 2.174226804123711, + "grad_norm": 5.586748123168945, + "learning_rate": 4.301082636700014e-06, + "loss": 0.2626, + "step": 42336 + }, + { + "epoch": 2.174240368963646, + "grad_norm": 5.374864101409912, + "learning_rate": 4.300945594079759e-06, + "loss": 0.2267, + "step": 42337 + }, + { + "epoch": 2.1742539338035813, + "grad_norm": 3.9708456993103027, + "learning_rate": 4.300808551459504e-06, + "loss": 0.1764, + "step": 42338 + }, + { + "epoch": 2.174267498643516, + "grad_norm": 4.619402885437012, + "learning_rate": 4.300671508839249e-06, + "loss": 0.29, + "step": 42339 + }, + { + "epoch": 2.174281063483451, + "grad_norm": 5.802499771118164, + "learning_rate": 4.3005344662189945e-06, + "loss": 0.2338, + "step": 42340 + }, + { + "epoch": 2.174294628323386, + "grad_norm": 6.532551288604736, + "learning_rate": 4.30039742359874e-06, + "loss": 0.1975, + "step": 42341 + }, + { + "epoch": 2.174308193163321, + "grad_norm": 6.2253289222717285, + "learning_rate": 4.300260380978485e-06, + "loss": 0.3582, + "step": 42342 + }, + { + "epoch": 2.1743217580032557, + "grad_norm": 8.260353088378906, + "learning_rate": 4.30012333835823e-06, + "loss": 0.436, + "step": 42343 + }, + { + "epoch": 2.1743353228431905, + "grad_norm": 5.415492057800293, + "learning_rate": 4.299986295737975e-06, + "loss": 0.2638, + "step": 42344 + }, + { + "epoch": 2.1743488876831254, + "grad_norm": 5.035810947418213, + "learning_rate": 4.2998492531177195e-06, + "loss": 0.2409, + "step": 42345 + }, + { + "epoch": 2.1743624525230603, + "grad_norm": 6.761145114898682, + "learning_rate": 4.299712210497465e-06, + "loss": 0.4733, + "step": 42346 + }, + { + "epoch": 2.174376017362995, + "grad_norm": 3.864790678024292, + "learning_rate": 4.29957516787721e-06, + "loss": 0.1501, + "step": 42347 + }, + { + "epoch": 2.17438958220293, + "grad_norm": 5.487334251403809, + "learning_rate": 4.299438125256955e-06, + "loss": 0.266, + "step": 42348 + }, + { + "epoch": 2.174403147042865, + "grad_norm": 7.010106563568115, + "learning_rate": 4.2993010826367e-06, + "loss": 0.3166, + "step": 42349 + }, + { + "epoch": 2.1744167118827997, + "grad_norm": 6.4969987869262695, + "learning_rate": 4.299164040016445e-06, + "loss": 0.2278, + "step": 42350 + }, + { + "epoch": 2.1744302767227346, + "grad_norm": 3.455845594406128, + "learning_rate": 4.299026997396191e-06, + "loss": 0.1949, + "step": 42351 + }, + { + "epoch": 2.1744438415626695, + "grad_norm": 6.9291534423828125, + "learning_rate": 4.298889954775936e-06, + "loss": 0.3213, + "step": 42352 + }, + { + "epoch": 2.1744574064026043, + "grad_norm": 4.090323448181152, + "learning_rate": 4.298752912155681e-06, + "loss": 0.1319, + "step": 42353 + }, + { + "epoch": 2.174470971242539, + "grad_norm": 6.56304407119751, + "learning_rate": 4.298615869535426e-06, + "loss": 0.2986, + "step": 42354 + }, + { + "epoch": 2.174484536082474, + "grad_norm": 5.3044352531433105, + "learning_rate": 4.298478826915171e-06, + "loss": 0.2527, + "step": 42355 + }, + { + "epoch": 2.174498100922409, + "grad_norm": 8.700346946716309, + "learning_rate": 4.298341784294916e-06, + "loss": 0.4237, + "step": 42356 + }, + { + "epoch": 2.174511665762344, + "grad_norm": 6.491042613983154, + "learning_rate": 4.298204741674662e-06, + "loss": 0.3341, + "step": 42357 + }, + { + "epoch": 2.174525230602279, + "grad_norm": 5.9040398597717285, + "learning_rate": 4.298067699054406e-06, + "loss": 0.2635, + "step": 42358 + }, + { + "epoch": 2.174538795442214, + "grad_norm": 6.966653823852539, + "learning_rate": 4.297930656434152e-06, + "loss": 0.3649, + "step": 42359 + }, + { + "epoch": 2.174552360282149, + "grad_norm": 5.675570011138916, + "learning_rate": 4.297793613813896e-06, + "loss": 0.2227, + "step": 42360 + }, + { + "epoch": 2.1745659251220837, + "grad_norm": 6.708813667297363, + "learning_rate": 4.2976565711936415e-06, + "loss": 0.3613, + "step": 42361 + }, + { + "epoch": 2.1745794899620186, + "grad_norm": 4.3960747718811035, + "learning_rate": 4.297519528573387e-06, + "loss": 0.2305, + "step": 42362 + }, + { + "epoch": 2.1745930548019534, + "grad_norm": 5.187735080718994, + "learning_rate": 4.297382485953132e-06, + "loss": 0.1862, + "step": 42363 + }, + { + "epoch": 2.1746066196418883, + "grad_norm": 5.76270055770874, + "learning_rate": 4.297245443332877e-06, + "loss": 0.2911, + "step": 42364 + }, + { + "epoch": 2.174620184481823, + "grad_norm": 5.747142314910889, + "learning_rate": 4.297108400712621e-06, + "loss": 0.2471, + "step": 42365 + }, + { + "epoch": 2.174633749321758, + "grad_norm": 4.963595390319824, + "learning_rate": 4.2969713580923674e-06, + "loss": 0.2636, + "step": 42366 + }, + { + "epoch": 2.174647314161693, + "grad_norm": 4.946987628936768, + "learning_rate": 4.296834315472112e-06, + "loss": 0.1934, + "step": 42367 + }, + { + "epoch": 2.1746608790016277, + "grad_norm": 7.722715854644775, + "learning_rate": 4.296697272851858e-06, + "loss": 0.3864, + "step": 42368 + }, + { + "epoch": 2.1746744438415626, + "grad_norm": 5.341751575469971, + "learning_rate": 4.296560230231602e-06, + "loss": 0.2823, + "step": 42369 + }, + { + "epoch": 2.1746880086814975, + "grad_norm": 7.170726776123047, + "learning_rate": 4.296423187611347e-06, + "loss": 0.4406, + "step": 42370 + }, + { + "epoch": 2.1747015735214323, + "grad_norm": 6.354542255401611, + "learning_rate": 4.2962861449910925e-06, + "loss": 0.3468, + "step": 42371 + }, + { + "epoch": 2.174715138361367, + "grad_norm": 5.875642776489258, + "learning_rate": 4.296149102370838e-06, + "loss": 0.2462, + "step": 42372 + }, + { + "epoch": 2.174728703201302, + "grad_norm": 4.86851167678833, + "learning_rate": 4.296012059750583e-06, + "loss": 0.2032, + "step": 42373 + }, + { + "epoch": 2.174742268041237, + "grad_norm": 4.272807598114014, + "learning_rate": 4.295875017130328e-06, + "loss": 0.2369, + "step": 42374 + }, + { + "epoch": 2.174755832881172, + "grad_norm": 8.051214218139648, + "learning_rate": 4.295737974510073e-06, + "loss": 0.3209, + "step": 42375 + }, + { + "epoch": 2.174769397721107, + "grad_norm": 7.600995063781738, + "learning_rate": 4.2956009318898175e-06, + "loss": 0.2421, + "step": 42376 + }, + { + "epoch": 2.174782962561042, + "grad_norm": 4.704276084899902, + "learning_rate": 4.2954638892695636e-06, + "loss": 0.2505, + "step": 42377 + }, + { + "epoch": 2.174796527400977, + "grad_norm": 6.2585530281066895, + "learning_rate": 4.295326846649308e-06, + "loss": 0.277, + "step": 42378 + }, + { + "epoch": 2.1748100922409117, + "grad_norm": 6.1291184425354, + "learning_rate": 4.295189804029054e-06, + "loss": 0.3229, + "step": 42379 + }, + { + "epoch": 2.1748236570808466, + "grad_norm": 6.162114143371582, + "learning_rate": 4.295052761408798e-06, + "loss": 0.3546, + "step": 42380 + }, + { + "epoch": 2.1748372219207814, + "grad_norm": 5.967084884643555, + "learning_rate": 4.294915718788543e-06, + "loss": 0.3061, + "step": 42381 + }, + { + "epoch": 2.1748507867607163, + "grad_norm": 6.040595054626465, + "learning_rate": 4.294778676168289e-06, + "loss": 0.3693, + "step": 42382 + }, + { + "epoch": 2.174864351600651, + "grad_norm": 5.350011825561523, + "learning_rate": 4.294641633548034e-06, + "loss": 0.2746, + "step": 42383 + }, + { + "epoch": 2.174877916440586, + "grad_norm": 4.5257697105407715, + "learning_rate": 4.294504590927779e-06, + "loss": 0.2775, + "step": 42384 + }, + { + "epoch": 2.174891481280521, + "grad_norm": 4.514675617218018, + "learning_rate": 4.294367548307524e-06, + "loss": 0.1686, + "step": 42385 + }, + { + "epoch": 2.1749050461204558, + "grad_norm": 5.755692481994629, + "learning_rate": 4.294230505687269e-06, + "loss": 0.2563, + "step": 42386 + }, + { + "epoch": 2.1749186109603906, + "grad_norm": 5.097183704376221, + "learning_rate": 4.2940934630670145e-06, + "loss": 0.1606, + "step": 42387 + }, + { + "epoch": 2.1749321758003255, + "grad_norm": 5.149115085601807, + "learning_rate": 4.29395642044676e-06, + "loss": 0.1964, + "step": 42388 + }, + { + "epoch": 2.1749457406402604, + "grad_norm": 4.546276569366455, + "learning_rate": 4.293819377826504e-06, + "loss": 0.1836, + "step": 42389 + }, + { + "epoch": 2.1749593054801952, + "grad_norm": 5.353953838348389, + "learning_rate": 4.293682335206249e-06, + "loss": 0.3189, + "step": 42390 + }, + { + "epoch": 2.17497287032013, + "grad_norm": 5.299127101898193, + "learning_rate": 4.293545292585994e-06, + "loss": 0.358, + "step": 42391 + }, + { + "epoch": 2.174986435160065, + "grad_norm": 4.876932621002197, + "learning_rate": 4.2934082499657395e-06, + "loss": 0.2401, + "step": 42392 + }, + { + "epoch": 2.175, + "grad_norm": 5.734769344329834, + "learning_rate": 4.293271207345485e-06, + "loss": 0.2426, + "step": 42393 + }, + { + "epoch": 2.1750135648399347, + "grad_norm": 5.6291961669921875, + "learning_rate": 4.29313416472523e-06, + "loss": 0.2466, + "step": 42394 + }, + { + "epoch": 2.1750271296798696, + "grad_norm": 6.357676982879639, + "learning_rate": 4.292997122104975e-06, + "loss": 0.2622, + "step": 42395 + }, + { + "epoch": 2.175040694519805, + "grad_norm": 4.406365871429443, + "learning_rate": 4.29286007948472e-06, + "loss": 0.2211, + "step": 42396 + }, + { + "epoch": 2.1750542593597397, + "grad_norm": 7.118667125701904, + "learning_rate": 4.2927230368644654e-06, + "loss": 0.3727, + "step": 42397 + }, + { + "epoch": 2.1750678241996746, + "grad_norm": 4.790921688079834, + "learning_rate": 4.292585994244211e-06, + "loss": 0.2138, + "step": 42398 + }, + { + "epoch": 2.1750813890396095, + "grad_norm": 5.494702339172363, + "learning_rate": 4.292448951623955e-06, + "loss": 0.1634, + "step": 42399 + }, + { + "epoch": 2.1750949538795443, + "grad_norm": 3.3912220001220703, + "learning_rate": 4.292311909003701e-06, + "loss": 0.1564, + "step": 42400 + }, + { + "epoch": 2.175108518719479, + "grad_norm": 6.884675025939941, + "learning_rate": 4.292174866383445e-06, + "loss": 0.1993, + "step": 42401 + }, + { + "epoch": 2.175122083559414, + "grad_norm": 6.139594078063965, + "learning_rate": 4.2920378237631905e-06, + "loss": 0.2955, + "step": 42402 + }, + { + "epoch": 2.175135648399349, + "grad_norm": 4.975938320159912, + "learning_rate": 4.291900781142936e-06, + "loss": 0.2386, + "step": 42403 + }, + { + "epoch": 2.175149213239284, + "grad_norm": 7.0618133544921875, + "learning_rate": 4.291763738522681e-06, + "loss": 0.2724, + "step": 42404 + }, + { + "epoch": 2.1751627780792187, + "grad_norm": 4.421055316925049, + "learning_rate": 4.291626695902426e-06, + "loss": 0.1175, + "step": 42405 + }, + { + "epoch": 2.1751763429191535, + "grad_norm": 7.233016490936279, + "learning_rate": 4.291489653282171e-06, + "loss": 0.2842, + "step": 42406 + }, + { + "epoch": 2.1751899077590884, + "grad_norm": 4.376933574676514, + "learning_rate": 4.291352610661916e-06, + "loss": 0.1565, + "step": 42407 + }, + { + "epoch": 2.1752034725990232, + "grad_norm": 6.352889060974121, + "learning_rate": 4.2912155680416616e-06, + "loss": 0.19, + "step": 42408 + }, + { + "epoch": 2.175217037438958, + "grad_norm": 4.982458114624023, + "learning_rate": 4.291078525421407e-06, + "loss": 0.2718, + "step": 42409 + }, + { + "epoch": 2.175230602278893, + "grad_norm": 5.350753307342529, + "learning_rate": 4.290941482801151e-06, + "loss": 0.2613, + "step": 42410 + }, + { + "epoch": 2.175244167118828, + "grad_norm": 5.962644577026367, + "learning_rate": 4.290804440180897e-06, + "loss": 0.2288, + "step": 42411 + }, + { + "epoch": 2.1752577319587627, + "grad_norm": 4.07165002822876, + "learning_rate": 4.2906673975606414e-06, + "loss": 0.1756, + "step": 42412 + }, + { + "epoch": 2.1752712967986976, + "grad_norm": 4.610040187835693, + "learning_rate": 4.2905303549403875e-06, + "loss": 0.276, + "step": 42413 + }, + { + "epoch": 2.175284861638633, + "grad_norm": 5.3132734298706055, + "learning_rate": 4.290393312320132e-06, + "loss": 0.2495, + "step": 42414 + }, + { + "epoch": 2.1752984264785677, + "grad_norm": 4.0413923263549805, + "learning_rate": 4.290256269699877e-06, + "loss": 0.1445, + "step": 42415 + }, + { + "epoch": 2.1753119913185026, + "grad_norm": 4.266062259674072, + "learning_rate": 4.290119227079622e-06, + "loss": 0.1692, + "step": 42416 + }, + { + "epoch": 2.1753255561584375, + "grad_norm": 5.393340587615967, + "learning_rate": 4.289982184459367e-06, + "loss": 0.225, + "step": 42417 + }, + { + "epoch": 2.1753391209983723, + "grad_norm": 4.581186771392822, + "learning_rate": 4.2898451418391125e-06, + "loss": 0.2074, + "step": 42418 + }, + { + "epoch": 2.175352685838307, + "grad_norm": 3.957575559616089, + "learning_rate": 4.289708099218857e-06, + "loss": 0.1748, + "step": 42419 + }, + { + "epoch": 2.175366250678242, + "grad_norm": 4.015173435211182, + "learning_rate": 4.289571056598603e-06, + "loss": 0.2821, + "step": 42420 + }, + { + "epoch": 2.175379815518177, + "grad_norm": 4.991600036621094, + "learning_rate": 4.289434013978347e-06, + "loss": 0.2783, + "step": 42421 + }, + { + "epoch": 2.175393380358112, + "grad_norm": 3.3688790798187256, + "learning_rate": 4.289296971358093e-06, + "loss": 0.1543, + "step": 42422 + }, + { + "epoch": 2.1754069451980467, + "grad_norm": 4.031148433685303, + "learning_rate": 4.2891599287378375e-06, + "loss": 0.1188, + "step": 42423 + }, + { + "epoch": 2.1754205100379815, + "grad_norm": 4.55255651473999, + "learning_rate": 4.289022886117583e-06, + "loss": 0.1481, + "step": 42424 + }, + { + "epoch": 2.1754340748779164, + "grad_norm": 4.5260233879089355, + "learning_rate": 4.288885843497328e-06, + "loss": 0.2548, + "step": 42425 + }, + { + "epoch": 2.1754476397178513, + "grad_norm": 4.25917911529541, + "learning_rate": 4.288748800877073e-06, + "loss": 0.1431, + "step": 42426 + }, + { + "epoch": 2.175461204557786, + "grad_norm": 5.245826244354248, + "learning_rate": 4.288611758256818e-06, + "loss": 0.2275, + "step": 42427 + }, + { + "epoch": 2.175474769397721, + "grad_norm": 6.157567024230957, + "learning_rate": 4.2884747156365634e-06, + "loss": 0.2546, + "step": 42428 + }, + { + "epoch": 2.175488334237656, + "grad_norm": 5.914619445800781, + "learning_rate": 4.288337673016309e-06, + "loss": 0.1521, + "step": 42429 + }, + { + "epoch": 2.1755018990775907, + "grad_norm": 3.889793634414673, + "learning_rate": 4.288200630396053e-06, + "loss": 0.1435, + "step": 42430 + }, + { + "epoch": 2.1755154639175256, + "grad_norm": 5.315907955169678, + "learning_rate": 4.288063587775799e-06, + "loss": 0.3261, + "step": 42431 + }, + { + "epoch": 2.1755290287574605, + "grad_norm": 5.235167980194092, + "learning_rate": 4.287926545155543e-06, + "loss": 0.2389, + "step": 42432 + }, + { + "epoch": 2.1755425935973953, + "grad_norm": 3.8713607788085938, + "learning_rate": 4.287789502535289e-06, + "loss": 0.1985, + "step": 42433 + }, + { + "epoch": 2.1755561584373306, + "grad_norm": 4.39885950088501, + "learning_rate": 4.287652459915034e-06, + "loss": 0.1616, + "step": 42434 + }, + { + "epoch": 2.1755697232772655, + "grad_norm": 5.20362663269043, + "learning_rate": 4.287515417294779e-06, + "loss": 0.2045, + "step": 42435 + }, + { + "epoch": 2.1755832881172004, + "grad_norm": 6.518316268920898, + "learning_rate": 4.287378374674524e-06, + "loss": 0.1907, + "step": 42436 + }, + { + "epoch": 2.1755968529571352, + "grad_norm": 4.652404308319092, + "learning_rate": 4.287241332054269e-06, + "loss": 0.2666, + "step": 42437 + }, + { + "epoch": 2.17561041779707, + "grad_norm": 4.619051933288574, + "learning_rate": 4.287104289434014e-06, + "loss": 0.2114, + "step": 42438 + }, + { + "epoch": 2.175623982637005, + "grad_norm": 3.7296552658081055, + "learning_rate": 4.2869672468137596e-06, + "loss": 0.154, + "step": 42439 + }, + { + "epoch": 2.17563754747694, + "grad_norm": 4.561938285827637, + "learning_rate": 4.286830204193505e-06, + "loss": 0.2474, + "step": 42440 + }, + { + "epoch": 2.1756511123168747, + "grad_norm": 3.8669016361236572, + "learning_rate": 4.28669316157325e-06, + "loss": 0.1522, + "step": 42441 + }, + { + "epoch": 2.1756646771568096, + "grad_norm": 4.5895094871521, + "learning_rate": 4.286556118952995e-06, + "loss": 0.2037, + "step": 42442 + }, + { + "epoch": 2.1756782419967444, + "grad_norm": 4.9080352783203125, + "learning_rate": 4.2864190763327394e-06, + "loss": 0.1254, + "step": 42443 + }, + { + "epoch": 2.1756918068366793, + "grad_norm": 4.8519415855407715, + "learning_rate": 4.286282033712485e-06, + "loss": 0.201, + "step": 42444 + }, + { + "epoch": 2.175705371676614, + "grad_norm": 3.1295626163482666, + "learning_rate": 4.28614499109223e-06, + "loss": 0.0906, + "step": 42445 + }, + { + "epoch": 2.175718936516549, + "grad_norm": 4.487339019775391, + "learning_rate": 4.286007948471975e-06, + "loss": 0.2389, + "step": 42446 + }, + { + "epoch": 2.175732501356484, + "grad_norm": 5.366904258728027, + "learning_rate": 4.28587090585172e-06, + "loss": 0.1761, + "step": 42447 + }, + { + "epoch": 2.1757460661964187, + "grad_norm": 5.055311679840088, + "learning_rate": 4.285733863231465e-06, + "loss": 0.2053, + "step": 42448 + }, + { + "epoch": 2.1757596310363536, + "grad_norm": 6.319391250610352, + "learning_rate": 4.2855968206112105e-06, + "loss": 0.2484, + "step": 42449 + }, + { + "epoch": 2.1757731958762885, + "grad_norm": 5.708659648895264, + "learning_rate": 4.285459777990956e-06, + "loss": 0.175, + "step": 42450 + }, + { + "epoch": 2.1757867607162233, + "grad_norm": 4.356130599975586, + "learning_rate": 4.285322735370701e-06, + "loss": 0.1651, + "step": 42451 + }, + { + "epoch": 2.1758003255561587, + "grad_norm": 4.454977035522461, + "learning_rate": 4.285185692750446e-06, + "loss": 0.1299, + "step": 42452 + }, + { + "epoch": 2.1758138903960935, + "grad_norm": 6.3180413246154785, + "learning_rate": 4.28504865013019e-06, + "loss": 0.3047, + "step": 42453 + }, + { + "epoch": 2.1758274552360284, + "grad_norm": 5.533393859863281, + "learning_rate": 4.284911607509936e-06, + "loss": 0.2037, + "step": 42454 + }, + { + "epoch": 2.1758410200759633, + "grad_norm": 3.851386070251465, + "learning_rate": 4.284774564889681e-06, + "loss": 0.1964, + "step": 42455 + }, + { + "epoch": 2.175854584915898, + "grad_norm": 5.749701499938965, + "learning_rate": 4.284637522269427e-06, + "loss": 0.18, + "step": 42456 + }, + { + "epoch": 2.175868149755833, + "grad_norm": 3.4844181537628174, + "learning_rate": 4.284500479649171e-06, + "loss": 0.1702, + "step": 42457 + }, + { + "epoch": 2.175881714595768, + "grad_norm": 5.051990032196045, + "learning_rate": 4.284363437028916e-06, + "loss": 0.2787, + "step": 42458 + }, + { + "epoch": 2.1758952794357027, + "grad_norm": 5.583751678466797, + "learning_rate": 4.2842263944086614e-06, + "loss": 0.231, + "step": 42459 + }, + { + "epoch": 2.1759088442756376, + "grad_norm": 4.794865608215332, + "learning_rate": 4.284089351788407e-06, + "loss": 0.375, + "step": 42460 + }, + { + "epoch": 2.1759224091155724, + "grad_norm": 3.7613308429718018, + "learning_rate": 4.283952309168152e-06, + "loss": 0.1927, + "step": 42461 + }, + { + "epoch": 2.1759359739555073, + "grad_norm": 4.624495029449463, + "learning_rate": 4.283815266547897e-06, + "loss": 0.2574, + "step": 42462 + }, + { + "epoch": 2.175949538795442, + "grad_norm": 3.3897924423217773, + "learning_rate": 4.283678223927642e-06, + "loss": 0.1075, + "step": 42463 + }, + { + "epoch": 2.175963103635377, + "grad_norm": 4.581183910369873, + "learning_rate": 4.2835411813073865e-06, + "loss": 0.1432, + "step": 42464 + }, + { + "epoch": 2.175976668475312, + "grad_norm": 6.978361129760742, + "learning_rate": 4.2834041386871325e-06, + "loss": 0.2733, + "step": 42465 + }, + { + "epoch": 2.1759902333152468, + "grad_norm": 5.234622955322266, + "learning_rate": 4.283267096066877e-06, + "loss": 0.1731, + "step": 42466 + }, + { + "epoch": 2.1760037981551816, + "grad_norm": 5.765806198120117, + "learning_rate": 4.283130053446623e-06, + "loss": 0.2136, + "step": 42467 + }, + { + "epoch": 2.1760173629951165, + "grad_norm": 4.593288421630859, + "learning_rate": 4.282993010826367e-06, + "loss": 0.1919, + "step": 42468 + }, + { + "epoch": 2.1760309278350514, + "grad_norm": 7.663777828216553, + "learning_rate": 4.282855968206112e-06, + "loss": 0.3448, + "step": 42469 + }, + { + "epoch": 2.1760444926749862, + "grad_norm": 6.190515995025635, + "learning_rate": 4.2827189255858576e-06, + "loss": 0.2728, + "step": 42470 + }, + { + "epoch": 2.176058057514921, + "grad_norm": 4.548609256744385, + "learning_rate": 4.282581882965603e-06, + "loss": 0.198, + "step": 42471 + }, + { + "epoch": 2.1760716223548564, + "grad_norm": 4.121110439300537, + "learning_rate": 4.282444840345348e-06, + "loss": 0.1388, + "step": 42472 + }, + { + "epoch": 2.1760851871947913, + "grad_norm": 5.704239368438721, + "learning_rate": 4.282307797725092e-06, + "loss": 0.1784, + "step": 42473 + }, + { + "epoch": 2.176098752034726, + "grad_norm": 4.248998641967773, + "learning_rate": 4.282170755104838e-06, + "loss": 0.2143, + "step": 42474 + }, + { + "epoch": 2.176112316874661, + "grad_norm": 4.371031761169434, + "learning_rate": 4.282033712484583e-06, + "loss": 0.1927, + "step": 42475 + }, + { + "epoch": 2.176125881714596, + "grad_norm": 4.991716384887695, + "learning_rate": 4.281896669864329e-06, + "loss": 0.2353, + "step": 42476 + }, + { + "epoch": 2.1761394465545307, + "grad_norm": 4.781638145446777, + "learning_rate": 4.281759627244073e-06, + "loss": 0.2067, + "step": 42477 + }, + { + "epoch": 2.1761530113944656, + "grad_norm": 3.5748040676116943, + "learning_rate": 4.281622584623818e-06, + "loss": 0.1363, + "step": 42478 + }, + { + "epoch": 2.1761665762344005, + "grad_norm": 4.675589084625244, + "learning_rate": 4.281485542003563e-06, + "loss": 0.156, + "step": 42479 + }, + { + "epoch": 2.1761801410743353, + "grad_norm": 4.2829108238220215, + "learning_rate": 4.2813484993833085e-06, + "loss": 0.1385, + "step": 42480 + }, + { + "epoch": 2.17619370591427, + "grad_norm": 4.543887615203857, + "learning_rate": 4.281211456763054e-06, + "loss": 0.1601, + "step": 42481 + }, + { + "epoch": 2.176207270754205, + "grad_norm": 3.857938766479492, + "learning_rate": 4.281074414142799e-06, + "loss": 0.1618, + "step": 42482 + }, + { + "epoch": 2.17622083559414, + "grad_norm": 6.6559576988220215, + "learning_rate": 4.280937371522544e-06, + "loss": 0.2378, + "step": 42483 + }, + { + "epoch": 2.176234400434075, + "grad_norm": 5.096664905548096, + "learning_rate": 4.280800328902288e-06, + "loss": 0.3264, + "step": 42484 + }, + { + "epoch": 2.1762479652740097, + "grad_norm": 5.88234281539917, + "learning_rate": 4.280663286282034e-06, + "loss": 0.2438, + "step": 42485 + }, + { + "epoch": 2.1762615301139445, + "grad_norm": 5.390280723571777, + "learning_rate": 4.280526243661779e-06, + "loss": 0.2276, + "step": 42486 + }, + { + "epoch": 2.1762750949538794, + "grad_norm": 4.998981952667236, + "learning_rate": 4.280389201041524e-06, + "loss": 0.2152, + "step": 42487 + }, + { + "epoch": 2.1762886597938143, + "grad_norm": 5.808560848236084, + "learning_rate": 4.280252158421269e-06, + "loss": 0.2013, + "step": 42488 + }, + { + "epoch": 2.176302224633749, + "grad_norm": 7.970210552215576, + "learning_rate": 4.280115115801014e-06, + "loss": 0.4005, + "step": 42489 + }, + { + "epoch": 2.1763157894736844, + "grad_norm": 5.342417240142822, + "learning_rate": 4.2799780731807594e-06, + "loss": 0.1624, + "step": 42490 + }, + { + "epoch": 2.1763293543136193, + "grad_norm": 4.6508469581604, + "learning_rate": 4.279841030560505e-06, + "loss": 0.1901, + "step": 42491 + }, + { + "epoch": 2.176342919153554, + "grad_norm": 4.236232757568359, + "learning_rate": 4.27970398794025e-06, + "loss": 0.1645, + "step": 42492 + }, + { + "epoch": 2.176356483993489, + "grad_norm": 5.056507587432861, + "learning_rate": 4.279566945319995e-06, + "loss": 0.1868, + "step": 42493 + }, + { + "epoch": 2.176370048833424, + "grad_norm": 6.700286865234375, + "learning_rate": 4.27942990269974e-06, + "loss": 0.2559, + "step": 42494 + }, + { + "epoch": 2.1763836136733588, + "grad_norm": 5.256994724273682, + "learning_rate": 4.279292860079485e-06, + "loss": 0.1564, + "step": 42495 + }, + { + "epoch": 2.1763971785132936, + "grad_norm": 5.633719444274902, + "learning_rate": 4.2791558174592305e-06, + "loss": 0.217, + "step": 42496 + }, + { + "epoch": 2.1764107433532285, + "grad_norm": 3.7858316898345947, + "learning_rate": 4.279018774838976e-06, + "loss": 0.1195, + "step": 42497 + }, + { + "epoch": 2.1764243081931633, + "grad_norm": 5.295594215393066, + "learning_rate": 4.27888173221872e-06, + "loss": 0.1212, + "step": 42498 + }, + { + "epoch": 2.176437873033098, + "grad_norm": 3.980862617492676, + "learning_rate": 4.278744689598465e-06, + "loss": 0.1626, + "step": 42499 + }, + { + "epoch": 2.176451437873033, + "grad_norm": 4.576000690460205, + "learning_rate": 4.27860764697821e-06, + "loss": 0.1347, + "step": 42500 + }, + { + "epoch": 2.176465002712968, + "grad_norm": 5.915085315704346, + "learning_rate": 4.2784706043579556e-06, + "loss": 0.1876, + "step": 42501 + }, + { + "epoch": 2.176478567552903, + "grad_norm": 4.859415531158447, + "learning_rate": 4.278333561737701e-06, + "loss": 0.1617, + "step": 42502 + }, + { + "epoch": 2.1764921323928377, + "grad_norm": 5.057374000549316, + "learning_rate": 4.278196519117446e-06, + "loss": 0.1899, + "step": 42503 + }, + { + "epoch": 2.1765056972327725, + "grad_norm": 5.764918804168701, + "learning_rate": 4.278059476497191e-06, + "loss": 0.1619, + "step": 42504 + }, + { + "epoch": 2.1765192620727074, + "grad_norm": 4.115339279174805, + "learning_rate": 4.277922433876936e-06, + "loss": 0.143, + "step": 42505 + }, + { + "epoch": 2.1765328269126423, + "grad_norm": 5.608160018920898, + "learning_rate": 4.2777853912566815e-06, + "loss": 0.215, + "step": 42506 + }, + { + "epoch": 2.176546391752577, + "grad_norm": 4.6378912925720215, + "learning_rate": 4.277648348636426e-06, + "loss": 0.1478, + "step": 42507 + }, + { + "epoch": 2.176559956592512, + "grad_norm": 4.699455738067627, + "learning_rate": 4.277511306016172e-06, + "loss": 0.174, + "step": 42508 + }, + { + "epoch": 2.176573521432447, + "grad_norm": 4.511914253234863, + "learning_rate": 4.277374263395916e-06, + "loss": 0.2127, + "step": 42509 + }, + { + "epoch": 2.176587086272382, + "grad_norm": 4.190530300140381, + "learning_rate": 4.277237220775662e-06, + "loss": 0.2404, + "step": 42510 + }, + { + "epoch": 2.176600651112317, + "grad_norm": 4.165703296661377, + "learning_rate": 4.2771001781554065e-06, + "loss": 0.132, + "step": 42511 + }, + { + "epoch": 2.176614215952252, + "grad_norm": 4.24035120010376, + "learning_rate": 4.276963135535152e-06, + "loss": 0.1503, + "step": 42512 + }, + { + "epoch": 2.1766277807921868, + "grad_norm": 4.2022223472595215, + "learning_rate": 4.276826092914897e-06, + "loss": 0.1738, + "step": 42513 + }, + { + "epoch": 2.1766413456321216, + "grad_norm": 5.549926280975342, + "learning_rate": 4.276689050294642e-06, + "loss": 0.1601, + "step": 42514 + }, + { + "epoch": 2.1766549104720565, + "grad_norm": 3.796875238418579, + "learning_rate": 4.276552007674387e-06, + "loss": 0.0982, + "step": 42515 + }, + { + "epoch": 2.1766684753119914, + "grad_norm": 3.81272029876709, + "learning_rate": 4.276414965054132e-06, + "loss": 0.129, + "step": 42516 + }, + { + "epoch": 2.1766820401519262, + "grad_norm": 5.477246284484863, + "learning_rate": 4.276277922433878e-06, + "loss": 0.2109, + "step": 42517 + }, + { + "epoch": 2.176695604991861, + "grad_norm": 5.298844814300537, + "learning_rate": 4.276140879813622e-06, + "loss": 0.1627, + "step": 42518 + }, + { + "epoch": 2.176709169831796, + "grad_norm": 4.246797561645508, + "learning_rate": 4.276003837193368e-06, + "loss": 0.1795, + "step": 42519 + }, + { + "epoch": 2.176722734671731, + "grad_norm": 5.6338605880737305, + "learning_rate": 4.275866794573112e-06, + "loss": 0.2482, + "step": 42520 + }, + { + "epoch": 2.1767362995116657, + "grad_norm": 3.938265323638916, + "learning_rate": 4.275729751952858e-06, + "loss": 0.1875, + "step": 42521 + }, + { + "epoch": 2.1767498643516006, + "grad_norm": 2.4268405437469482, + "learning_rate": 4.275592709332603e-06, + "loss": 0.1185, + "step": 42522 + }, + { + "epoch": 2.1767634291915354, + "grad_norm": 3.9760892391204834, + "learning_rate": 4.275455666712348e-06, + "loss": 0.1393, + "step": 42523 + }, + { + "epoch": 2.1767769940314703, + "grad_norm": 4.313218116760254, + "learning_rate": 4.275318624092093e-06, + "loss": 0.1431, + "step": 42524 + }, + { + "epoch": 2.176790558871405, + "grad_norm": 4.485200881958008, + "learning_rate": 4.275181581471838e-06, + "loss": 0.1998, + "step": 42525 + }, + { + "epoch": 2.17680412371134, + "grad_norm": 5.280386924743652, + "learning_rate": 4.275044538851583e-06, + "loss": 0.1703, + "step": 42526 + }, + { + "epoch": 2.176817688551275, + "grad_norm": 5.6954851150512695, + "learning_rate": 4.274907496231328e-06, + "loss": 0.1991, + "step": 42527 + }, + { + "epoch": 2.17683125339121, + "grad_norm": 4.4421467781066895, + "learning_rate": 4.274770453611074e-06, + "loss": 0.1286, + "step": 42528 + }, + { + "epoch": 2.176844818231145, + "grad_norm": 3.084029197692871, + "learning_rate": 4.274633410990818e-06, + "loss": 0.1119, + "step": 42529 + }, + { + "epoch": 2.17685838307108, + "grad_norm": 3.7053775787353516, + "learning_rate": 4.274496368370564e-06, + "loss": 0.0776, + "step": 42530 + }, + { + "epoch": 2.176871947911015, + "grad_norm": 4.509139537811279, + "learning_rate": 4.274359325750308e-06, + "loss": 0.142, + "step": 42531 + }, + { + "epoch": 2.1768855127509497, + "grad_norm": 5.889892101287842, + "learning_rate": 4.2742222831300536e-06, + "loss": 0.1883, + "step": 42532 + }, + { + "epoch": 2.1768990775908845, + "grad_norm": 7.499362468719482, + "learning_rate": 4.274085240509799e-06, + "loss": 0.2703, + "step": 42533 + }, + { + "epoch": 2.1769126424308194, + "grad_norm": 3.052251100540161, + "learning_rate": 4.273948197889544e-06, + "loss": 0.1025, + "step": 42534 + }, + { + "epoch": 2.1769262072707543, + "grad_norm": 4.47627067565918, + "learning_rate": 4.273811155269289e-06, + "loss": 0.1787, + "step": 42535 + }, + { + "epoch": 2.176939772110689, + "grad_norm": 4.285854339599609, + "learning_rate": 4.273674112649034e-06, + "loss": 0.1573, + "step": 42536 + }, + { + "epoch": 2.176953336950624, + "grad_norm": 5.391213893890381, + "learning_rate": 4.2735370700287795e-06, + "loss": 0.1367, + "step": 42537 + }, + { + "epoch": 2.176966901790559, + "grad_norm": 4.173135280609131, + "learning_rate": 4.273400027408525e-06, + "loss": 0.1621, + "step": 42538 + }, + { + "epoch": 2.1769804666304937, + "grad_norm": 4.777165412902832, + "learning_rate": 4.27326298478827e-06, + "loss": 0.1449, + "step": 42539 + }, + { + "epoch": 2.1769940314704286, + "grad_norm": 3.3978755474090576, + "learning_rate": 4.273125942168014e-06, + "loss": 0.1349, + "step": 42540 + }, + { + "epoch": 2.1770075963103634, + "grad_norm": 4.183047294616699, + "learning_rate": 4.272988899547759e-06, + "loss": 0.1135, + "step": 42541 + }, + { + "epoch": 2.1770211611502983, + "grad_norm": 3.8824310302734375, + "learning_rate": 4.2728518569275045e-06, + "loss": 0.1093, + "step": 42542 + }, + { + "epoch": 2.177034725990233, + "grad_norm": 4.053329944610596, + "learning_rate": 4.27271481430725e-06, + "loss": 0.1727, + "step": 42543 + }, + { + "epoch": 2.177048290830168, + "grad_norm": 2.8498144149780273, + "learning_rate": 4.272577771686995e-06, + "loss": 0.1326, + "step": 42544 + }, + { + "epoch": 2.177061855670103, + "grad_norm": 3.111246109008789, + "learning_rate": 4.27244072906674e-06, + "loss": 0.0954, + "step": 42545 + }, + { + "epoch": 2.1770754205100378, + "grad_norm": 5.2019944190979, + "learning_rate": 4.272303686446485e-06, + "loss": 0.1358, + "step": 42546 + }, + { + "epoch": 2.1770889853499726, + "grad_norm": 4.062796115875244, + "learning_rate": 4.27216664382623e-06, + "loss": 0.1088, + "step": 42547 + }, + { + "epoch": 2.177102550189908, + "grad_norm": 4.433858871459961, + "learning_rate": 4.272029601205976e-06, + "loss": 0.1148, + "step": 42548 + }, + { + "epoch": 2.177116115029843, + "grad_norm": 2.815542221069336, + "learning_rate": 4.271892558585721e-06, + "loss": 0.0828, + "step": 42549 + }, + { + "epoch": 2.1771296798697777, + "grad_norm": 2.6793296337127686, + "learning_rate": 4.271755515965466e-06, + "loss": 0.0823, + "step": 42550 + }, + { + "epoch": 2.1771432447097125, + "grad_norm": 3.387272596359253, + "learning_rate": 4.271618473345211e-06, + "loss": 0.1109, + "step": 42551 + }, + { + "epoch": 2.1771568095496474, + "grad_norm": 3.398190498352051, + "learning_rate": 4.2714814307249555e-06, + "loss": 0.0896, + "step": 42552 + }, + { + "epoch": 2.1771703743895823, + "grad_norm": 5.570815086364746, + "learning_rate": 4.271344388104701e-06, + "loss": 0.1836, + "step": 42553 + }, + { + "epoch": 2.177183939229517, + "grad_norm": 5.080068588256836, + "learning_rate": 4.271207345484446e-06, + "loss": 0.1367, + "step": 42554 + }, + { + "epoch": 2.177197504069452, + "grad_norm": 3.1788220405578613, + "learning_rate": 4.271070302864191e-06, + "loss": 0.1026, + "step": 42555 + }, + { + "epoch": 2.177211068909387, + "grad_norm": 5.541377544403076, + "learning_rate": 4.270933260243936e-06, + "loss": 0.1886, + "step": 42556 + }, + { + "epoch": 2.1772246337493217, + "grad_norm": 3.537148952484131, + "learning_rate": 4.270796217623681e-06, + "loss": 0.0933, + "step": 42557 + }, + { + "epoch": 2.1772381985892566, + "grad_norm": 3.2307653427124023, + "learning_rate": 4.2706591750034265e-06, + "loss": 0.0917, + "step": 42558 + }, + { + "epoch": 2.1772517634291915, + "grad_norm": 3.5724470615386963, + "learning_rate": 4.270522132383172e-06, + "loss": 0.1227, + "step": 42559 + }, + { + "epoch": 2.1772653282691263, + "grad_norm": 4.053297996520996, + "learning_rate": 4.270385089762917e-06, + "loss": 0.1149, + "step": 42560 + }, + { + "epoch": 2.177278893109061, + "grad_norm": 3.4455573558807373, + "learning_rate": 4.270248047142661e-06, + "loss": 0.1197, + "step": 42561 + }, + { + "epoch": 2.177292457948996, + "grad_norm": 4.229238033294678, + "learning_rate": 4.270111004522407e-06, + "loss": 0.1658, + "step": 42562 + }, + { + "epoch": 2.177306022788931, + "grad_norm": 3.5382020473480225, + "learning_rate": 4.269973961902152e-06, + "loss": 0.1479, + "step": 42563 + }, + { + "epoch": 2.177319587628866, + "grad_norm": 4.331120014190674, + "learning_rate": 4.269836919281898e-06, + "loss": 0.1928, + "step": 42564 + }, + { + "epoch": 2.177333152468801, + "grad_norm": 6.9524149894714355, + "learning_rate": 4.269699876661642e-06, + "loss": 0.2042, + "step": 42565 + }, + { + "epoch": 2.177346717308736, + "grad_norm": 3.2455005645751953, + "learning_rate": 4.269562834041387e-06, + "loss": 0.1246, + "step": 42566 + }, + { + "epoch": 2.177360282148671, + "grad_norm": 4.298677921295166, + "learning_rate": 4.269425791421132e-06, + "loss": 0.1515, + "step": 42567 + }, + { + "epoch": 2.1773738469886057, + "grad_norm": 3.6885902881622314, + "learning_rate": 4.2692887488008775e-06, + "loss": 0.1933, + "step": 42568 + }, + { + "epoch": 2.1773874118285406, + "grad_norm": 4.530057907104492, + "learning_rate": 4.269151706180623e-06, + "loss": 0.2025, + "step": 42569 + }, + { + "epoch": 2.1774009766684754, + "grad_norm": 3.189650774002075, + "learning_rate": 4.269014663560368e-06, + "loss": 0.11, + "step": 42570 + }, + { + "epoch": 2.1774145415084103, + "grad_norm": 3.3856120109558105, + "learning_rate": 4.268877620940113e-06, + "loss": 0.1543, + "step": 42571 + }, + { + "epoch": 2.177428106348345, + "grad_norm": 3.825188636779785, + "learning_rate": 4.268740578319857e-06, + "loss": 0.1442, + "step": 42572 + }, + { + "epoch": 2.17744167118828, + "grad_norm": 3.1540815830230713, + "learning_rate": 4.268603535699603e-06, + "loss": 0.0812, + "step": 42573 + }, + { + "epoch": 2.177455236028215, + "grad_norm": 3.888897657394409, + "learning_rate": 4.268466493079348e-06, + "loss": 0.1406, + "step": 42574 + }, + { + "epoch": 2.1774688008681498, + "grad_norm": 4.1890950202941895, + "learning_rate": 4.268329450459094e-06, + "loss": 0.1712, + "step": 42575 + }, + { + "epoch": 2.1774823657080846, + "grad_norm": 3.3422691822052, + "learning_rate": 4.268192407838838e-06, + "loss": 0.134, + "step": 42576 + }, + { + "epoch": 2.1774959305480195, + "grad_norm": 4.262918472290039, + "learning_rate": 4.268055365218583e-06, + "loss": 0.1815, + "step": 42577 + }, + { + "epoch": 2.1775094953879544, + "grad_norm": 4.892977714538574, + "learning_rate": 4.267918322598328e-06, + "loss": 0.1942, + "step": 42578 + }, + { + "epoch": 2.177523060227889, + "grad_norm": 4.647450923919678, + "learning_rate": 4.267781279978074e-06, + "loss": 0.1515, + "step": 42579 + }, + { + "epoch": 2.177536625067824, + "grad_norm": 3.984086751937866, + "learning_rate": 4.267644237357819e-06, + "loss": 0.1021, + "step": 42580 + }, + { + "epoch": 2.177550189907759, + "grad_norm": 4.360147953033447, + "learning_rate": 4.267507194737563e-06, + "loss": 0.1997, + "step": 42581 + }, + { + "epoch": 2.177563754747694, + "grad_norm": 4.041655540466309, + "learning_rate": 4.267370152117309e-06, + "loss": 0.1565, + "step": 42582 + }, + { + "epoch": 2.1775773195876287, + "grad_norm": 4.497838973999023, + "learning_rate": 4.2672331094970535e-06, + "loss": 0.1022, + "step": 42583 + }, + { + "epoch": 2.1775908844275635, + "grad_norm": 3.5136146545410156, + "learning_rate": 4.2670960668767995e-06, + "loss": 0.113, + "step": 42584 + }, + { + "epoch": 2.1776044492674984, + "grad_norm": 3.6683664321899414, + "learning_rate": 4.266959024256544e-06, + "loss": 0.1967, + "step": 42585 + }, + { + "epoch": 2.1776180141074337, + "grad_norm": 4.775564670562744, + "learning_rate": 4.266821981636289e-06, + "loss": 0.1421, + "step": 42586 + }, + { + "epoch": 2.1776315789473686, + "grad_norm": 3.9332056045532227, + "learning_rate": 4.266684939016034e-06, + "loss": 0.1405, + "step": 42587 + }, + { + "epoch": 2.1776451437873035, + "grad_norm": 2.711392879486084, + "learning_rate": 4.266547896395779e-06, + "loss": 0.1294, + "step": 42588 + }, + { + "epoch": 2.1776587086272383, + "grad_norm": 3.6149537563323975, + "learning_rate": 4.2664108537755245e-06, + "loss": 0.1093, + "step": 42589 + }, + { + "epoch": 2.177672273467173, + "grad_norm": 3.17935848236084, + "learning_rate": 4.26627381115527e-06, + "loss": 0.1264, + "step": 42590 + }, + { + "epoch": 2.177685838307108, + "grad_norm": 3.5989325046539307, + "learning_rate": 4.266136768535015e-06, + "loss": 0.0788, + "step": 42591 + }, + { + "epoch": 2.177699403147043, + "grad_norm": 3.781618118286133, + "learning_rate": 4.26599972591476e-06, + "loss": 0.1408, + "step": 42592 + }, + { + "epoch": 2.1777129679869778, + "grad_norm": 4.058706760406494, + "learning_rate": 4.265862683294505e-06, + "loss": 0.2097, + "step": 42593 + }, + { + "epoch": 2.1777265328269126, + "grad_norm": 3.0075225830078125, + "learning_rate": 4.26572564067425e-06, + "loss": 0.1583, + "step": 42594 + }, + { + "epoch": 2.1777400976668475, + "grad_norm": 4.293310642242432, + "learning_rate": 4.265588598053995e-06, + "loss": 0.2214, + "step": 42595 + }, + { + "epoch": 2.1777536625067824, + "grad_norm": 3.9462273120880127, + "learning_rate": 4.26545155543374e-06, + "loss": 0.1755, + "step": 42596 + }, + { + "epoch": 2.1777672273467172, + "grad_norm": 5.69739294052124, + "learning_rate": 4.265314512813485e-06, + "loss": 0.1894, + "step": 42597 + }, + { + "epoch": 2.177780792186652, + "grad_norm": 3.2086448669433594, + "learning_rate": 4.26517747019323e-06, + "loss": 0.1372, + "step": 42598 + }, + { + "epoch": 2.177794357026587, + "grad_norm": 2.7645466327667236, + "learning_rate": 4.2650404275729755e-06, + "loss": 0.0982, + "step": 42599 + }, + { + "epoch": 2.177807921866522, + "grad_norm": 5.465532302856445, + "learning_rate": 4.264903384952721e-06, + "loss": 0.2454, + "step": 42600 + }, + { + "epoch": 2.1778214867064567, + "grad_norm": 3.5433108806610107, + "learning_rate": 4.264766342332466e-06, + "loss": 0.1578, + "step": 42601 + }, + { + "epoch": 2.1778350515463916, + "grad_norm": 3.3190674781799316, + "learning_rate": 4.264629299712211e-06, + "loss": 0.1561, + "step": 42602 + }, + { + "epoch": 2.177848616386327, + "grad_norm": 3.722473621368408, + "learning_rate": 4.264492257091956e-06, + "loss": 0.1468, + "step": 42603 + }, + { + "epoch": 2.1778621812262617, + "grad_norm": 2.8910646438598633, + "learning_rate": 4.264355214471701e-06, + "loss": 0.132, + "step": 42604 + }, + { + "epoch": 2.1778757460661966, + "grad_norm": 3.4548518657684326, + "learning_rate": 4.2642181718514466e-06, + "loss": 0.1499, + "step": 42605 + }, + { + "epoch": 2.1778893109061315, + "grad_norm": 3.3691251277923584, + "learning_rate": 4.264081129231191e-06, + "loss": 0.1388, + "step": 42606 + }, + { + "epoch": 2.1779028757460663, + "grad_norm": 3.902327060699463, + "learning_rate": 4.263944086610937e-06, + "loss": 0.2005, + "step": 42607 + }, + { + "epoch": 2.177916440586001, + "grad_norm": 4.191803932189941, + "learning_rate": 4.263807043990681e-06, + "loss": 0.1428, + "step": 42608 + }, + { + "epoch": 2.177930005425936, + "grad_norm": 4.135876178741455, + "learning_rate": 4.263670001370426e-06, + "loss": 0.124, + "step": 42609 + }, + { + "epoch": 2.177943570265871, + "grad_norm": 3.8236305713653564, + "learning_rate": 4.263532958750172e-06, + "loss": 0.0935, + "step": 42610 + }, + { + "epoch": 2.177957135105806, + "grad_norm": 3.74592924118042, + "learning_rate": 4.263395916129917e-06, + "loss": 0.1143, + "step": 42611 + }, + { + "epoch": 2.1779706999457407, + "grad_norm": 2.6816928386688232, + "learning_rate": 4.263258873509662e-06, + "loss": 0.1085, + "step": 42612 + }, + { + "epoch": 2.1779842647856755, + "grad_norm": 3.5205090045928955, + "learning_rate": 4.263121830889407e-06, + "loss": 0.1415, + "step": 42613 + }, + { + "epoch": 2.1779978296256104, + "grad_norm": 2.973128080368042, + "learning_rate": 4.262984788269152e-06, + "loss": 0.1034, + "step": 42614 + }, + { + "epoch": 2.1780113944655453, + "grad_norm": 3.176647901535034, + "learning_rate": 4.262847745648897e-06, + "loss": 0.1104, + "step": 42615 + }, + { + "epoch": 2.17802495930548, + "grad_norm": 4.248020172119141, + "learning_rate": 4.262710703028643e-06, + "loss": 0.1551, + "step": 42616 + }, + { + "epoch": 2.178038524145415, + "grad_norm": 3.829000234603882, + "learning_rate": 4.262573660408387e-06, + "loss": 0.2008, + "step": 42617 + }, + { + "epoch": 2.17805208898535, + "grad_norm": 3.740382432937622, + "learning_rate": 4.262436617788133e-06, + "loss": 0.1088, + "step": 42618 + }, + { + "epoch": 2.1780656538252847, + "grad_norm": 4.777955055236816, + "learning_rate": 4.262299575167877e-06, + "loss": 0.1507, + "step": 42619 + }, + { + "epoch": 2.1780792186652196, + "grad_norm": 4.941292762756348, + "learning_rate": 4.2621625325476225e-06, + "loss": 0.211, + "step": 42620 + }, + { + "epoch": 2.1780927835051545, + "grad_norm": 3.7982141971588135, + "learning_rate": 4.262025489927368e-06, + "loss": 0.1103, + "step": 42621 + }, + { + "epoch": 2.1781063483450893, + "grad_norm": 4.494251251220703, + "learning_rate": 4.261888447307113e-06, + "loss": 0.2125, + "step": 42622 + }, + { + "epoch": 2.178119913185024, + "grad_norm": 2.60513973236084, + "learning_rate": 4.261751404686858e-06, + "loss": 0.087, + "step": 42623 + }, + { + "epoch": 2.1781334780249595, + "grad_norm": 4.541375160217285, + "learning_rate": 4.261614362066603e-06, + "loss": 0.1827, + "step": 42624 + }, + { + "epoch": 2.1781470428648944, + "grad_norm": 3.6411736011505127, + "learning_rate": 4.2614773194463484e-06, + "loss": 0.1887, + "step": 42625 + }, + { + "epoch": 2.178160607704829, + "grad_norm": 4.787962436676025, + "learning_rate": 4.261340276826093e-06, + "loss": 0.1915, + "step": 42626 + }, + { + "epoch": 2.178174172544764, + "grad_norm": 5.428746223449707, + "learning_rate": 4.261203234205839e-06, + "loss": 0.2725, + "step": 42627 + }, + { + "epoch": 2.178187737384699, + "grad_norm": 3.483224630355835, + "learning_rate": 4.261066191585583e-06, + "loss": 0.1441, + "step": 42628 + }, + { + "epoch": 2.178201302224634, + "grad_norm": 3.450941562652588, + "learning_rate": 4.260929148965328e-06, + "loss": 0.0933, + "step": 42629 + }, + { + "epoch": 2.1782148670645687, + "grad_norm": 5.156222343444824, + "learning_rate": 4.2607921063450735e-06, + "loss": 0.2205, + "step": 42630 + }, + { + "epoch": 2.1782284319045035, + "grad_norm": 5.213244915008545, + "learning_rate": 4.260655063724819e-06, + "loss": 0.1948, + "step": 42631 + }, + { + "epoch": 2.1782419967444384, + "grad_norm": 7.457581996917725, + "learning_rate": 4.260518021104564e-06, + "loss": 0.2862, + "step": 42632 + }, + { + "epoch": 2.1782555615843733, + "grad_norm": 4.168433666229248, + "learning_rate": 4.260380978484309e-06, + "loss": 0.2138, + "step": 42633 + }, + { + "epoch": 2.178269126424308, + "grad_norm": 4.097379684448242, + "learning_rate": 4.260243935864054e-06, + "loss": 0.124, + "step": 42634 + }, + { + "epoch": 2.178282691264243, + "grad_norm": 5.149532794952393, + "learning_rate": 4.260106893243799e-06, + "loss": 0.2226, + "step": 42635 + }, + { + "epoch": 2.178296256104178, + "grad_norm": 5.854578971862793, + "learning_rate": 4.2599698506235446e-06, + "loss": 0.2, + "step": 42636 + }, + { + "epoch": 2.1783098209441127, + "grad_norm": 4.80692720413208, + "learning_rate": 4.259832808003289e-06, + "loss": 0.1579, + "step": 42637 + }, + { + "epoch": 2.1783233857840476, + "grad_norm": 4.221092224121094, + "learning_rate": 4.259695765383035e-06, + "loss": 0.1285, + "step": 42638 + }, + { + "epoch": 2.1783369506239825, + "grad_norm": 5.1684184074401855, + "learning_rate": 4.259558722762779e-06, + "loss": 0.1759, + "step": 42639 + }, + { + "epoch": 2.1783505154639173, + "grad_norm": 3.428820848464966, + "learning_rate": 4.2594216801425244e-06, + "loss": 0.1215, + "step": 42640 + }, + { + "epoch": 2.1783640803038526, + "grad_norm": 6.6912102699279785, + "learning_rate": 4.25928463752227e-06, + "loss": 0.1746, + "step": 42641 + }, + { + "epoch": 2.1783776451437875, + "grad_norm": 4.982548236846924, + "learning_rate": 4.259147594902015e-06, + "loss": 0.1916, + "step": 42642 + }, + { + "epoch": 2.1783912099837224, + "grad_norm": 3.418494462966919, + "learning_rate": 4.25901055228176e-06, + "loss": 0.1519, + "step": 42643 + }, + { + "epoch": 2.1784047748236572, + "grad_norm": 3.592304229736328, + "learning_rate": 4.258873509661505e-06, + "loss": 0.1462, + "step": 42644 + }, + { + "epoch": 2.178418339663592, + "grad_norm": 4.933498859405518, + "learning_rate": 4.25873646704125e-06, + "loss": 0.2463, + "step": 42645 + }, + { + "epoch": 2.178431904503527, + "grad_norm": 3.3512752056121826, + "learning_rate": 4.2585994244209955e-06, + "loss": 0.1028, + "step": 42646 + }, + { + "epoch": 2.178445469343462, + "grad_norm": 4.649547576904297, + "learning_rate": 4.258462381800741e-06, + "loss": 0.1986, + "step": 42647 + }, + { + "epoch": 2.1784590341833967, + "grad_norm": 3.7359437942504883, + "learning_rate": 4.258325339180486e-06, + "loss": 0.1053, + "step": 42648 + }, + { + "epoch": 2.1784725990233316, + "grad_norm": 4.289437294006348, + "learning_rate": 4.25818829656023e-06, + "loss": 0.1544, + "step": 42649 + }, + { + "epoch": 2.1784861638632664, + "grad_norm": 4.04899787902832, + "learning_rate": 4.258051253939975e-06, + "loss": 0.1993, + "step": 42650 + }, + { + "epoch": 2.1784997287032013, + "grad_norm": 4.808703422546387, + "learning_rate": 4.2579142113197205e-06, + "loss": 0.1693, + "step": 42651 + }, + { + "epoch": 2.178513293543136, + "grad_norm": 6.022561073303223, + "learning_rate": 4.257777168699466e-06, + "loss": 0.2314, + "step": 42652 + }, + { + "epoch": 2.178526858383071, + "grad_norm": 3.7107455730438232, + "learning_rate": 4.257640126079211e-06, + "loss": 0.1279, + "step": 42653 + }, + { + "epoch": 2.178540423223006, + "grad_norm": 5.808221340179443, + "learning_rate": 4.257503083458956e-06, + "loss": 0.2409, + "step": 42654 + }, + { + "epoch": 2.1785539880629408, + "grad_norm": 4.095892429351807, + "learning_rate": 4.257366040838701e-06, + "loss": 0.1486, + "step": 42655 + }, + { + "epoch": 2.1785675529028756, + "grad_norm": 6.008339881896973, + "learning_rate": 4.2572289982184464e-06, + "loss": 0.1918, + "step": 42656 + }, + { + "epoch": 2.1785811177428105, + "grad_norm": 4.2624945640563965, + "learning_rate": 4.257091955598192e-06, + "loss": 0.1911, + "step": 42657 + }, + { + "epoch": 2.1785946825827454, + "grad_norm": 4.605947971343994, + "learning_rate": 4.256954912977937e-06, + "loss": 0.1676, + "step": 42658 + }, + { + "epoch": 2.17860824742268, + "grad_norm": 5.656648635864258, + "learning_rate": 4.256817870357682e-06, + "loss": 0.2897, + "step": 42659 + }, + { + "epoch": 2.178621812262615, + "grad_norm": 4.258183002471924, + "learning_rate": 4.256680827737426e-06, + "loss": 0.1403, + "step": 42660 + }, + { + "epoch": 2.1786353771025504, + "grad_norm": 5.280003547668457, + "learning_rate": 4.256543785117172e-06, + "loss": 0.2331, + "step": 42661 + }, + { + "epoch": 2.1786489419424853, + "grad_norm": 3.832444190979004, + "learning_rate": 4.256406742496917e-06, + "loss": 0.1994, + "step": 42662 + }, + { + "epoch": 2.17866250678242, + "grad_norm": 5.197234153747559, + "learning_rate": 4.256269699876662e-06, + "loss": 0.1185, + "step": 42663 + }, + { + "epoch": 2.178676071622355, + "grad_norm": 5.095271110534668, + "learning_rate": 4.256132657256407e-06, + "loss": 0.2064, + "step": 42664 + }, + { + "epoch": 2.17868963646229, + "grad_norm": 4.307445049285889, + "learning_rate": 4.255995614636152e-06, + "loss": 0.1912, + "step": 42665 + }, + { + "epoch": 2.1787032013022247, + "grad_norm": 4.36405611038208, + "learning_rate": 4.255858572015897e-06, + "loss": 0.2399, + "step": 42666 + }, + { + "epoch": 2.1787167661421596, + "grad_norm": 4.618640899658203, + "learning_rate": 4.2557215293956426e-06, + "loss": 0.2393, + "step": 42667 + }, + { + "epoch": 2.1787303309820945, + "grad_norm": 4.755335807800293, + "learning_rate": 4.255584486775388e-06, + "loss": 0.1574, + "step": 42668 + }, + { + "epoch": 2.1787438958220293, + "grad_norm": 6.406882286071777, + "learning_rate": 4.255447444155132e-06, + "loss": 0.2024, + "step": 42669 + }, + { + "epoch": 2.178757460661964, + "grad_norm": 6.479650020599365, + "learning_rate": 4.255310401534878e-06, + "loss": 0.2351, + "step": 42670 + }, + { + "epoch": 2.178771025501899, + "grad_norm": 5.141571998596191, + "learning_rate": 4.2551733589146224e-06, + "loss": 0.2046, + "step": 42671 + }, + { + "epoch": 2.178784590341834, + "grad_norm": 4.606675624847412, + "learning_rate": 4.2550363162943685e-06, + "loss": 0.2056, + "step": 42672 + }, + { + "epoch": 2.178798155181769, + "grad_norm": 6.698155879974365, + "learning_rate": 4.254899273674113e-06, + "loss": 0.2688, + "step": 42673 + }, + { + "epoch": 2.1788117200217036, + "grad_norm": 3.4133877754211426, + "learning_rate": 4.254762231053858e-06, + "loss": 0.1821, + "step": 42674 + }, + { + "epoch": 2.1788252848616385, + "grad_norm": 8.897698402404785, + "learning_rate": 4.254625188433603e-06, + "loss": 0.3709, + "step": 42675 + }, + { + "epoch": 2.1788388497015734, + "grad_norm": 3.1594295501708984, + "learning_rate": 4.254488145813348e-06, + "loss": 0.1311, + "step": 42676 + }, + { + "epoch": 2.1788524145415082, + "grad_norm": 6.70942497253418, + "learning_rate": 4.2543511031930935e-06, + "loss": 0.2299, + "step": 42677 + }, + { + "epoch": 2.178865979381443, + "grad_norm": 3.8816750049591064, + "learning_rate": 4.254214060572838e-06, + "loss": 0.141, + "step": 42678 + }, + { + "epoch": 2.1788795442213784, + "grad_norm": 7.508209228515625, + "learning_rate": 4.254077017952584e-06, + "loss": 0.2788, + "step": 42679 + }, + { + "epoch": 2.1788931090613133, + "grad_norm": 5.046230792999268, + "learning_rate": 4.253939975332328e-06, + "loss": 0.189, + "step": 42680 + }, + { + "epoch": 2.178906673901248, + "grad_norm": 5.668137073516846, + "learning_rate": 4.253802932712074e-06, + "loss": 0.1721, + "step": 42681 + }, + { + "epoch": 2.178920238741183, + "grad_norm": 4.873688697814941, + "learning_rate": 4.2536658900918185e-06, + "loss": 0.1291, + "step": 42682 + }, + { + "epoch": 2.178933803581118, + "grad_norm": 5.3450751304626465, + "learning_rate": 4.253528847471564e-06, + "loss": 0.1643, + "step": 42683 + }, + { + "epoch": 2.1789473684210527, + "grad_norm": 4.614471912384033, + "learning_rate": 4.253391804851309e-06, + "loss": 0.1452, + "step": 42684 + }, + { + "epoch": 2.1789609332609876, + "grad_norm": 5.69110107421875, + "learning_rate": 4.253254762231054e-06, + "loss": 0.2694, + "step": 42685 + }, + { + "epoch": 2.1789744981009225, + "grad_norm": 4.297261714935303, + "learning_rate": 4.253117719610799e-06, + "loss": 0.1498, + "step": 42686 + }, + { + "epoch": 2.1789880629408573, + "grad_norm": 5.757957935333252, + "learning_rate": 4.2529806769905444e-06, + "loss": 0.2112, + "step": 42687 + }, + { + "epoch": 2.179001627780792, + "grad_norm": 5.423245429992676, + "learning_rate": 4.25284363437029e-06, + "loss": 0.1716, + "step": 42688 + }, + { + "epoch": 2.179015192620727, + "grad_norm": 5.223142147064209, + "learning_rate": 4.252706591750035e-06, + "loss": 0.1482, + "step": 42689 + }, + { + "epoch": 2.179028757460662, + "grad_norm": 4.507802486419678, + "learning_rate": 4.25256954912978e-06, + "loss": 0.2109, + "step": 42690 + }, + { + "epoch": 2.179042322300597, + "grad_norm": 3.901015043258667, + "learning_rate": 4.252432506509524e-06, + "loss": 0.1865, + "step": 42691 + }, + { + "epoch": 2.1790558871405317, + "grad_norm": 5.28440523147583, + "learning_rate": 4.25229546388927e-06, + "loss": 0.2522, + "step": 42692 + }, + { + "epoch": 2.1790694519804665, + "grad_norm": 4.0196146965026855, + "learning_rate": 4.252158421269015e-06, + "loss": 0.1466, + "step": 42693 + }, + { + "epoch": 2.1790830168204014, + "grad_norm": 3.7962567806243896, + "learning_rate": 4.25202137864876e-06, + "loss": 0.1241, + "step": 42694 + }, + { + "epoch": 2.1790965816603363, + "grad_norm": 4.931092739105225, + "learning_rate": 4.251884336028505e-06, + "loss": 0.2408, + "step": 42695 + }, + { + "epoch": 2.179110146500271, + "grad_norm": 3.719586133956909, + "learning_rate": 4.25174729340825e-06, + "loss": 0.1253, + "step": 42696 + }, + { + "epoch": 2.179123711340206, + "grad_norm": 5.252448558807373, + "learning_rate": 4.251610250787995e-06, + "loss": 0.1998, + "step": 42697 + }, + { + "epoch": 2.179137276180141, + "grad_norm": 4.541313648223877, + "learning_rate": 4.2514732081677406e-06, + "loss": 0.1224, + "step": 42698 + }, + { + "epoch": 2.179150841020076, + "grad_norm": 7.15684175491333, + "learning_rate": 4.251336165547486e-06, + "loss": 0.2914, + "step": 42699 + }, + { + "epoch": 2.179164405860011, + "grad_norm": 5.583868503570557, + "learning_rate": 4.251199122927231e-06, + "loss": 0.2463, + "step": 42700 + }, + { + "epoch": 2.179177970699946, + "grad_norm": 3.5555570125579834, + "learning_rate": 4.251062080306976e-06, + "loss": 0.1217, + "step": 42701 + }, + { + "epoch": 2.1791915355398808, + "grad_norm": 4.03844690322876, + "learning_rate": 4.250925037686721e-06, + "loss": 0.1705, + "step": 42702 + }, + { + "epoch": 2.1792051003798156, + "grad_norm": 5.277442932128906, + "learning_rate": 4.250787995066466e-06, + "loss": 0.2151, + "step": 42703 + }, + { + "epoch": 2.1792186652197505, + "grad_norm": 4.563375473022461, + "learning_rate": 4.250650952446211e-06, + "loss": 0.1684, + "step": 42704 + }, + { + "epoch": 2.1792322300596854, + "grad_norm": 5.022988319396973, + "learning_rate": 4.250513909825956e-06, + "loss": 0.231, + "step": 42705 + }, + { + "epoch": 2.1792457948996202, + "grad_norm": 4.757893085479736, + "learning_rate": 4.250376867205701e-06, + "loss": 0.121, + "step": 42706 + }, + { + "epoch": 2.179259359739555, + "grad_norm": 4.679423809051514, + "learning_rate": 4.250239824585446e-06, + "loss": 0.2046, + "step": 42707 + }, + { + "epoch": 2.17927292457949, + "grad_norm": 3.7277109622955322, + "learning_rate": 4.2501027819651915e-06, + "loss": 0.1553, + "step": 42708 + }, + { + "epoch": 2.179286489419425, + "grad_norm": 3.902103900909424, + "learning_rate": 4.249965739344937e-06, + "loss": 0.2104, + "step": 42709 + }, + { + "epoch": 2.1793000542593597, + "grad_norm": 4.857629776000977, + "learning_rate": 4.249828696724682e-06, + "loss": 0.2189, + "step": 42710 + }, + { + "epoch": 2.1793136190992946, + "grad_norm": 5.866518974304199, + "learning_rate": 4.249691654104427e-06, + "loss": 0.2555, + "step": 42711 + }, + { + "epoch": 2.1793271839392294, + "grad_norm": 5.291494369506836, + "learning_rate": 4.249554611484172e-06, + "loss": 0.1834, + "step": 42712 + }, + { + "epoch": 2.1793407487791643, + "grad_norm": 3.782202959060669, + "learning_rate": 4.249417568863917e-06, + "loss": 0.1765, + "step": 42713 + }, + { + "epoch": 2.179354313619099, + "grad_norm": 4.507447242736816, + "learning_rate": 4.249280526243662e-06, + "loss": 0.1425, + "step": 42714 + }, + { + "epoch": 2.179367878459034, + "grad_norm": 4.676187038421631, + "learning_rate": 4.249143483623408e-06, + "loss": 0.1529, + "step": 42715 + }, + { + "epoch": 2.179381443298969, + "grad_norm": 4.286133289337158, + "learning_rate": 4.249006441003152e-06, + "loss": 0.18, + "step": 42716 + }, + { + "epoch": 2.179395008138904, + "grad_norm": 5.140804290771484, + "learning_rate": 4.248869398382897e-06, + "loss": 0.2212, + "step": 42717 + }, + { + "epoch": 2.179408572978839, + "grad_norm": 4.983515739440918, + "learning_rate": 4.2487323557626424e-06, + "loss": 0.2152, + "step": 42718 + }, + { + "epoch": 2.179422137818774, + "grad_norm": 4.517228603363037, + "learning_rate": 4.248595313142388e-06, + "loss": 0.231, + "step": 42719 + }, + { + "epoch": 2.179435702658709, + "grad_norm": 2.573586940765381, + "learning_rate": 4.248458270522133e-06, + "loss": 0.0754, + "step": 42720 + }, + { + "epoch": 2.1794492674986436, + "grad_norm": 4.746798515319824, + "learning_rate": 4.248321227901878e-06, + "loss": 0.2081, + "step": 42721 + }, + { + "epoch": 2.1794628323385785, + "grad_norm": 6.155533790588379, + "learning_rate": 4.248184185281623e-06, + "loss": 0.1415, + "step": 42722 + }, + { + "epoch": 2.1794763971785134, + "grad_norm": 3.3719425201416016, + "learning_rate": 4.2480471426613675e-06, + "loss": 0.1135, + "step": 42723 + }, + { + "epoch": 2.1794899620184482, + "grad_norm": 3.973572254180908, + "learning_rate": 4.2479101000411135e-06, + "loss": 0.1191, + "step": 42724 + }, + { + "epoch": 2.179503526858383, + "grad_norm": 3.9561126232147217, + "learning_rate": 4.247773057420858e-06, + "loss": 0.1379, + "step": 42725 + }, + { + "epoch": 2.179517091698318, + "grad_norm": 3.8246963024139404, + "learning_rate": 4.247636014800604e-06, + "loss": 0.1502, + "step": 42726 + }, + { + "epoch": 2.179530656538253, + "grad_norm": 3.8634276390075684, + "learning_rate": 4.247498972180348e-06, + "loss": 0.1662, + "step": 42727 + }, + { + "epoch": 2.1795442213781877, + "grad_norm": 4.38010311126709, + "learning_rate": 4.247361929560093e-06, + "loss": 0.1477, + "step": 42728 + }, + { + "epoch": 2.1795577862181226, + "grad_norm": 3.958393096923828, + "learning_rate": 4.2472248869398386e-06, + "loss": 0.1362, + "step": 42729 + }, + { + "epoch": 2.1795713510580574, + "grad_norm": 4.776601314544678, + "learning_rate": 4.247087844319584e-06, + "loss": 0.2536, + "step": 42730 + }, + { + "epoch": 2.1795849158979923, + "grad_norm": 3.757402181625366, + "learning_rate": 4.246950801699329e-06, + "loss": 0.093, + "step": 42731 + }, + { + "epoch": 2.179598480737927, + "grad_norm": 2.9944229125976562, + "learning_rate": 4.246813759079073e-06, + "loss": 0.1155, + "step": 42732 + }, + { + "epoch": 2.179612045577862, + "grad_norm": 2.2566099166870117, + "learning_rate": 4.246676716458819e-06, + "loss": 0.0834, + "step": 42733 + }, + { + "epoch": 2.179625610417797, + "grad_norm": 3.8715620040893555, + "learning_rate": 4.246539673838564e-06, + "loss": 0.1459, + "step": 42734 + }, + { + "epoch": 2.1796391752577318, + "grad_norm": 3.977292060852051, + "learning_rate": 4.24640263121831e-06, + "loss": 0.0933, + "step": 42735 + }, + { + "epoch": 2.1796527400976666, + "grad_norm": 3.704468011856079, + "learning_rate": 4.246265588598054e-06, + "loss": 0.1253, + "step": 42736 + }, + { + "epoch": 2.179666304937602, + "grad_norm": 5.042918682098389, + "learning_rate": 4.246128545977799e-06, + "loss": 0.1874, + "step": 42737 + }, + { + "epoch": 2.179679869777537, + "grad_norm": 3.9977855682373047, + "learning_rate": 4.245991503357544e-06, + "loss": 0.2195, + "step": 42738 + }, + { + "epoch": 2.1796934346174717, + "grad_norm": 3.787245035171509, + "learning_rate": 4.2458544607372895e-06, + "loss": 0.1622, + "step": 42739 + }, + { + "epoch": 2.1797069994574065, + "grad_norm": 4.122035503387451, + "learning_rate": 4.245717418117035e-06, + "loss": 0.123, + "step": 42740 + }, + { + "epoch": 2.1797205642973414, + "grad_norm": 5.056248664855957, + "learning_rate": 4.24558037549678e-06, + "loss": 0.1382, + "step": 42741 + }, + { + "epoch": 2.1797341291372763, + "grad_norm": 5.172433376312256, + "learning_rate": 4.245443332876525e-06, + "loss": 0.1531, + "step": 42742 + }, + { + "epoch": 2.179747693977211, + "grad_norm": 3.6235108375549316, + "learning_rate": 4.24530629025627e-06, + "loss": 0.1841, + "step": 42743 + }, + { + "epoch": 2.179761258817146, + "grad_norm": 4.040133476257324, + "learning_rate": 4.245169247636015e-06, + "loss": 0.1337, + "step": 42744 + }, + { + "epoch": 2.179774823657081, + "grad_norm": 5.209867000579834, + "learning_rate": 4.245032205015761e-06, + "loss": 0.2099, + "step": 42745 + }, + { + "epoch": 2.1797883884970157, + "grad_norm": 5.860138893127441, + "learning_rate": 4.244895162395506e-06, + "loss": 0.2892, + "step": 42746 + }, + { + "epoch": 2.1798019533369506, + "grad_norm": 5.233506202697754, + "learning_rate": 4.24475811977525e-06, + "loss": 0.2473, + "step": 42747 + }, + { + "epoch": 2.1798155181768855, + "grad_norm": 6.946685314178467, + "learning_rate": 4.244621077154995e-06, + "loss": 0.2864, + "step": 42748 + }, + { + "epoch": 2.1798290830168203, + "grad_norm": 6.529536247253418, + "learning_rate": 4.2444840345347405e-06, + "loss": 0.1879, + "step": 42749 + }, + { + "epoch": 2.179842647856755, + "grad_norm": 2.7386279106140137, + "learning_rate": 4.244346991914486e-06, + "loss": 0.095, + "step": 42750 + }, + { + "epoch": 2.17985621269669, + "grad_norm": 5.862907886505127, + "learning_rate": 4.244209949294231e-06, + "loss": 0.1999, + "step": 42751 + }, + { + "epoch": 2.179869777536625, + "grad_norm": 3.091944932937622, + "learning_rate": 4.244072906673976e-06, + "loss": 0.1002, + "step": 42752 + }, + { + "epoch": 2.17988334237656, + "grad_norm": 5.55161190032959, + "learning_rate": 4.243935864053721e-06, + "loss": 0.1542, + "step": 42753 + }, + { + "epoch": 2.1798969072164947, + "grad_norm": 5.301056861877441, + "learning_rate": 4.243798821433466e-06, + "loss": 0.2307, + "step": 42754 + }, + { + "epoch": 2.17991047205643, + "grad_norm": 4.699890613555908, + "learning_rate": 4.2436617788132115e-06, + "loss": 0.1887, + "step": 42755 + }, + { + "epoch": 2.179924036896365, + "grad_norm": 5.353878021240234, + "learning_rate": 4.243524736192957e-06, + "loss": 0.2124, + "step": 42756 + }, + { + "epoch": 2.1799376017362997, + "grad_norm": 6.288407802581787, + "learning_rate": 4.243387693572701e-06, + "loss": 0.1972, + "step": 42757 + }, + { + "epoch": 2.1799511665762346, + "grad_norm": 6.176939487457275, + "learning_rate": 4.243250650952447e-06, + "loss": 0.2815, + "step": 42758 + }, + { + "epoch": 2.1799647314161694, + "grad_norm": 4.585352897644043, + "learning_rate": 4.243113608332191e-06, + "loss": 0.1894, + "step": 42759 + }, + { + "epoch": 2.1799782962561043, + "grad_norm": 5.835612773895264, + "learning_rate": 4.2429765657119366e-06, + "loss": 0.1676, + "step": 42760 + }, + { + "epoch": 2.179991861096039, + "grad_norm": 5.810257434844971, + "learning_rate": 4.242839523091682e-06, + "loss": 0.1233, + "step": 42761 + }, + { + "epoch": 2.180005425935974, + "grad_norm": 4.962553977966309, + "learning_rate": 4.242702480471427e-06, + "loss": 0.1435, + "step": 42762 + }, + { + "epoch": 2.180018990775909, + "grad_norm": 6.186310768127441, + "learning_rate": 4.242565437851172e-06, + "loss": 0.211, + "step": 42763 + }, + { + "epoch": 2.1800325556158437, + "grad_norm": 4.189324855804443, + "learning_rate": 4.242428395230917e-06, + "loss": 0.2015, + "step": 42764 + }, + { + "epoch": 2.1800461204557786, + "grad_norm": 4.723616600036621, + "learning_rate": 4.2422913526106625e-06, + "loss": 0.1674, + "step": 42765 + }, + { + "epoch": 2.1800596852957135, + "grad_norm": 5.34842586517334, + "learning_rate": 4.242154309990408e-06, + "loss": 0.2169, + "step": 42766 + }, + { + "epoch": 2.1800732501356483, + "grad_norm": 4.0973920822143555, + "learning_rate": 4.242017267370153e-06, + "loss": 0.1087, + "step": 42767 + }, + { + "epoch": 2.180086814975583, + "grad_norm": 4.942199230194092, + "learning_rate": 4.241880224749897e-06, + "loss": 0.163, + "step": 42768 + }, + { + "epoch": 2.180100379815518, + "grad_norm": 4.361657619476318, + "learning_rate": 4.241743182129643e-06, + "loss": 0.14, + "step": 42769 + }, + { + "epoch": 2.180113944655453, + "grad_norm": 6.2695770263671875, + "learning_rate": 4.2416061395093875e-06, + "loss": 0.2239, + "step": 42770 + }, + { + "epoch": 2.180127509495388, + "grad_norm": 5.312556266784668, + "learning_rate": 4.241469096889133e-06, + "loss": 0.1721, + "step": 42771 + }, + { + "epoch": 2.1801410743353227, + "grad_norm": 5.998492240905762, + "learning_rate": 4.241332054268878e-06, + "loss": 0.1639, + "step": 42772 + }, + { + "epoch": 2.1801546391752575, + "grad_norm": 5.059174060821533, + "learning_rate": 4.241195011648623e-06, + "loss": 0.1785, + "step": 42773 + }, + { + "epoch": 2.1801682040151924, + "grad_norm": 5.813917636871338, + "learning_rate": 4.241057969028368e-06, + "loss": 0.2034, + "step": 42774 + }, + { + "epoch": 2.1801817688551277, + "grad_norm": 5.4495720863342285, + "learning_rate": 4.240920926408113e-06, + "loss": 0.2336, + "step": 42775 + }, + { + "epoch": 2.1801953336950626, + "grad_norm": 4.069321632385254, + "learning_rate": 4.240783883787859e-06, + "loss": 0.1075, + "step": 42776 + }, + { + "epoch": 2.1802088985349974, + "grad_norm": 6.288820743560791, + "learning_rate": 4.240646841167603e-06, + "loss": 0.1907, + "step": 42777 + }, + { + "epoch": 2.1802224633749323, + "grad_norm": 5.445089340209961, + "learning_rate": 4.240509798547349e-06, + "loss": 0.093, + "step": 42778 + }, + { + "epoch": 2.180236028214867, + "grad_norm": 3.611147403717041, + "learning_rate": 4.240372755927093e-06, + "loss": 0.1108, + "step": 42779 + }, + { + "epoch": 2.180249593054802, + "grad_norm": 4.703240394592285, + "learning_rate": 4.240235713306839e-06, + "loss": 0.1185, + "step": 42780 + }, + { + "epoch": 2.180263157894737, + "grad_norm": 4.798577785491943, + "learning_rate": 4.240098670686584e-06, + "loss": 0.1226, + "step": 42781 + }, + { + "epoch": 2.1802767227346718, + "grad_norm": 4.223751544952393, + "learning_rate": 4.239961628066329e-06, + "loss": 0.1206, + "step": 42782 + }, + { + "epoch": 2.1802902875746066, + "grad_norm": 4.422849178314209, + "learning_rate": 4.239824585446074e-06, + "loss": 0.1366, + "step": 42783 + }, + { + "epoch": 2.1803038524145415, + "grad_norm": 4.951329231262207, + "learning_rate": 4.239687542825819e-06, + "loss": 0.1618, + "step": 42784 + }, + { + "epoch": 2.1803174172544764, + "grad_norm": 5.1952385902404785, + "learning_rate": 4.239550500205564e-06, + "loss": 0.302, + "step": 42785 + }, + { + "epoch": 2.1803309820944112, + "grad_norm": 6.750681400299072, + "learning_rate": 4.2394134575853095e-06, + "loss": 0.3231, + "step": 42786 + }, + { + "epoch": 2.180344546934346, + "grad_norm": 3.8377182483673096, + "learning_rate": 4.239276414965055e-06, + "loss": 0.1123, + "step": 42787 + }, + { + "epoch": 2.180358111774281, + "grad_norm": 5.384970188140869, + "learning_rate": 4.239139372344799e-06, + "loss": 0.1575, + "step": 42788 + }, + { + "epoch": 2.180371676614216, + "grad_norm": 3.8480231761932373, + "learning_rate": 4.239002329724545e-06, + "loss": 0.0793, + "step": 42789 + }, + { + "epoch": 2.1803852414541507, + "grad_norm": 6.929058074951172, + "learning_rate": 4.238865287104289e-06, + "loss": 0.1919, + "step": 42790 + }, + { + "epoch": 2.1803988062940856, + "grad_norm": 4.028954029083252, + "learning_rate": 4.238728244484035e-06, + "loss": 0.1113, + "step": 42791 + }, + { + "epoch": 2.1804123711340204, + "grad_norm": 6.091501712799072, + "learning_rate": 4.23859120186378e-06, + "loss": 0.2423, + "step": 42792 + }, + { + "epoch": 2.1804259359739557, + "grad_norm": 4.860542297363281, + "learning_rate": 4.238454159243525e-06, + "loss": 0.1887, + "step": 42793 + }, + { + "epoch": 2.1804395008138906, + "grad_norm": 5.0869340896606445, + "learning_rate": 4.23831711662327e-06, + "loss": 0.1234, + "step": 42794 + }, + { + "epoch": 2.1804530656538255, + "grad_norm": 6.889773368835449, + "learning_rate": 4.238180074003015e-06, + "loss": 0.2674, + "step": 42795 + }, + { + "epoch": 2.1804666304937603, + "grad_norm": 5.400683879852295, + "learning_rate": 4.2380430313827605e-06, + "loss": 0.1988, + "step": 42796 + }, + { + "epoch": 2.180480195333695, + "grad_norm": 3.444654941558838, + "learning_rate": 4.237905988762506e-06, + "loss": 0.0766, + "step": 42797 + }, + { + "epoch": 2.18049376017363, + "grad_norm": 4.782845497131348, + "learning_rate": 4.237768946142251e-06, + "loss": 0.1151, + "step": 42798 + }, + { + "epoch": 2.180507325013565, + "grad_norm": 6.000608921051025, + "learning_rate": 4.237631903521996e-06, + "loss": 0.1702, + "step": 42799 + }, + { + "epoch": 2.1805208898535, + "grad_norm": 6.1563239097595215, + "learning_rate": 4.237494860901741e-06, + "loss": 0.2213, + "step": 42800 + }, + { + "epoch": 2.1805344546934347, + "grad_norm": 5.389964580535889, + "learning_rate": 4.2373578182814855e-06, + "loss": 0.1404, + "step": 42801 + }, + { + "epoch": 2.1805480195333695, + "grad_norm": 4.378762245178223, + "learning_rate": 4.237220775661231e-06, + "loss": 0.0824, + "step": 42802 + }, + { + "epoch": 2.1805615843733044, + "grad_norm": 5.879902362823486, + "learning_rate": 4.237083733040976e-06, + "loss": 0.1553, + "step": 42803 + }, + { + "epoch": 2.1805751492132392, + "grad_norm": 5.920490264892578, + "learning_rate": 4.236946690420721e-06, + "loss": 0.1293, + "step": 42804 + }, + { + "epoch": 2.180588714053174, + "grad_norm": 4.161384105682373, + "learning_rate": 4.236809647800466e-06, + "loss": 0.1125, + "step": 42805 + }, + { + "epoch": 2.180602278893109, + "grad_norm": 4.7314348220825195, + "learning_rate": 4.236672605180211e-06, + "loss": 0.1619, + "step": 42806 + }, + { + "epoch": 2.180615843733044, + "grad_norm": 5.957632064819336, + "learning_rate": 4.236535562559957e-06, + "loss": 0.1519, + "step": 42807 + }, + { + "epoch": 2.1806294085729787, + "grad_norm": 3.6558873653411865, + "learning_rate": 4.236398519939702e-06, + "loss": 0.1213, + "step": 42808 + }, + { + "epoch": 2.1806429734129136, + "grad_norm": 5.209351062774658, + "learning_rate": 4.236261477319447e-06, + "loss": 0.161, + "step": 42809 + }, + { + "epoch": 2.1806565382528484, + "grad_norm": 4.775781154632568, + "learning_rate": 4.236124434699192e-06, + "loss": 0.1056, + "step": 42810 + }, + { + "epoch": 2.1806701030927833, + "grad_norm": 6.043169975280762, + "learning_rate": 4.2359873920789365e-06, + "loss": 0.2513, + "step": 42811 + }, + { + "epoch": 2.180683667932718, + "grad_norm": 4.414405822753906, + "learning_rate": 4.2358503494586825e-06, + "loss": 0.1781, + "step": 42812 + }, + { + "epoch": 2.1806972327726535, + "grad_norm": 8.626042366027832, + "learning_rate": 4.235713306838427e-06, + "loss": 0.1853, + "step": 42813 + }, + { + "epoch": 2.1807107976125883, + "grad_norm": 5.530622959136963, + "learning_rate": 4.235576264218173e-06, + "loss": 0.2414, + "step": 42814 + }, + { + "epoch": 2.180724362452523, + "grad_norm": 4.065241813659668, + "learning_rate": 4.235439221597917e-06, + "loss": 0.1143, + "step": 42815 + }, + { + "epoch": 2.180737927292458, + "grad_norm": 6.674378871917725, + "learning_rate": 4.235302178977662e-06, + "loss": 0.2395, + "step": 42816 + }, + { + "epoch": 2.180751492132393, + "grad_norm": 4.16948938369751, + "learning_rate": 4.2351651363574075e-06, + "loss": 0.1401, + "step": 42817 + }, + { + "epoch": 2.180765056972328, + "grad_norm": 5.413504600524902, + "learning_rate": 4.235028093737153e-06, + "loss": 0.1409, + "step": 42818 + }, + { + "epoch": 2.1807786218122627, + "grad_norm": 7.309869766235352, + "learning_rate": 4.234891051116898e-06, + "loss": 0.3784, + "step": 42819 + }, + { + "epoch": 2.1807921866521975, + "grad_norm": 6.04137659072876, + "learning_rate": 4.234754008496642e-06, + "loss": 0.2629, + "step": 42820 + }, + { + "epoch": 2.1808057514921324, + "grad_norm": 5.083987236022949, + "learning_rate": 4.234616965876388e-06, + "loss": 0.1419, + "step": 42821 + }, + { + "epoch": 2.1808193163320673, + "grad_norm": 5.468877792358398, + "learning_rate": 4.234479923256133e-06, + "loss": 0.2547, + "step": 42822 + }, + { + "epoch": 2.180832881172002, + "grad_norm": 5.921647548675537, + "learning_rate": 4.234342880635879e-06, + "loss": 0.3536, + "step": 42823 + }, + { + "epoch": 2.180846446011937, + "grad_norm": 4.449717998504639, + "learning_rate": 4.234205838015623e-06, + "loss": 0.2066, + "step": 42824 + }, + { + "epoch": 2.180860010851872, + "grad_norm": 5.827352046966553, + "learning_rate": 4.234068795395368e-06, + "loss": 0.1608, + "step": 42825 + }, + { + "epoch": 2.1808735756918067, + "grad_norm": 3.2000529766082764, + "learning_rate": 4.233931752775113e-06, + "loss": 0.0961, + "step": 42826 + }, + { + "epoch": 2.1808871405317416, + "grad_norm": 7.614485740661621, + "learning_rate": 4.2337947101548585e-06, + "loss": 0.3919, + "step": 42827 + }, + { + "epoch": 2.1809007053716765, + "grad_norm": 5.5033183097839355, + "learning_rate": 4.233657667534604e-06, + "loss": 0.2146, + "step": 42828 + }, + { + "epoch": 2.1809142702116113, + "grad_norm": 5.607817649841309, + "learning_rate": 4.233520624914349e-06, + "loss": 0.2551, + "step": 42829 + }, + { + "epoch": 2.180927835051546, + "grad_norm": 4.862995147705078, + "learning_rate": 4.233383582294094e-06, + "loss": 0.1628, + "step": 42830 + }, + { + "epoch": 2.1809413998914815, + "grad_norm": 4.78492546081543, + "learning_rate": 4.233246539673838e-06, + "loss": 0.1296, + "step": 42831 + }, + { + "epoch": 2.1809549647314164, + "grad_norm": 4.588462829589844, + "learning_rate": 4.233109497053584e-06, + "loss": 0.1235, + "step": 42832 + }, + { + "epoch": 2.1809685295713512, + "grad_norm": 5.29396390914917, + "learning_rate": 4.232972454433329e-06, + "loss": 0.1216, + "step": 42833 + }, + { + "epoch": 2.180982094411286, + "grad_norm": 3.343076467514038, + "learning_rate": 4.232835411813075e-06, + "loss": 0.1431, + "step": 42834 + }, + { + "epoch": 2.180995659251221, + "grad_norm": 6.644918918609619, + "learning_rate": 4.232698369192819e-06, + "loss": 0.1962, + "step": 42835 + }, + { + "epoch": 2.181009224091156, + "grad_norm": 4.311469554901123, + "learning_rate": 4.232561326572564e-06, + "loss": 0.1457, + "step": 42836 + }, + { + "epoch": 2.1810227889310907, + "grad_norm": 3.9685332775115967, + "learning_rate": 4.232424283952309e-06, + "loss": 0.101, + "step": 42837 + }, + { + "epoch": 2.1810363537710256, + "grad_norm": 3.939248561859131, + "learning_rate": 4.232287241332055e-06, + "loss": 0.1363, + "step": 42838 + }, + { + "epoch": 2.1810499186109604, + "grad_norm": 5.430272102355957, + "learning_rate": 4.2321501987118e-06, + "loss": 0.2175, + "step": 42839 + }, + { + "epoch": 2.1810634834508953, + "grad_norm": 6.185196399688721, + "learning_rate": 4.232013156091545e-06, + "loss": 0.1981, + "step": 42840 + }, + { + "epoch": 2.18107704829083, + "grad_norm": 4.082045078277588, + "learning_rate": 4.23187611347129e-06, + "loss": 0.0841, + "step": 42841 + }, + { + "epoch": 2.181090613130765, + "grad_norm": 5.789718151092529, + "learning_rate": 4.2317390708510345e-06, + "loss": 0.2317, + "step": 42842 + }, + { + "epoch": 2.1811041779707, + "grad_norm": 4.931159019470215, + "learning_rate": 4.2316020282307805e-06, + "loss": 0.2776, + "step": 42843 + }, + { + "epoch": 2.1811177428106348, + "grad_norm": 3.7442827224731445, + "learning_rate": 4.231464985610525e-06, + "loss": 0.1188, + "step": 42844 + }, + { + "epoch": 2.1811313076505696, + "grad_norm": 4.601095676422119, + "learning_rate": 4.23132794299027e-06, + "loss": 0.1573, + "step": 42845 + }, + { + "epoch": 2.1811448724905045, + "grad_norm": 6.565587997436523, + "learning_rate": 4.231190900370015e-06, + "loss": 0.2016, + "step": 42846 + }, + { + "epoch": 2.1811584373304393, + "grad_norm": 4.311684608459473, + "learning_rate": 4.23105385774976e-06, + "loss": 0.1202, + "step": 42847 + }, + { + "epoch": 2.181172002170374, + "grad_norm": 4.5972065925598145, + "learning_rate": 4.2309168151295055e-06, + "loss": 0.1744, + "step": 42848 + }, + { + "epoch": 2.181185567010309, + "grad_norm": 4.600799560546875, + "learning_rate": 4.230779772509251e-06, + "loss": 0.1366, + "step": 42849 + }, + { + "epoch": 2.181199131850244, + "grad_norm": 6.922797679901123, + "learning_rate": 4.230642729888996e-06, + "loss": 0.1962, + "step": 42850 + }, + { + "epoch": 2.1812126966901793, + "grad_norm": 4.6647186279296875, + "learning_rate": 4.230505687268741e-06, + "loss": 0.1889, + "step": 42851 + }, + { + "epoch": 2.181226261530114, + "grad_norm": 2.960247278213501, + "learning_rate": 4.230368644648486e-06, + "loss": 0.1088, + "step": 42852 + }, + { + "epoch": 2.181239826370049, + "grad_norm": 6.060065269470215, + "learning_rate": 4.2302316020282314e-06, + "loss": 0.1661, + "step": 42853 + }, + { + "epoch": 2.181253391209984, + "grad_norm": 7.555925369262695, + "learning_rate": 4.230094559407977e-06, + "loss": 0.2375, + "step": 42854 + }, + { + "epoch": 2.1812669560499187, + "grad_norm": 7.541865348815918, + "learning_rate": 4.229957516787722e-06, + "loss": 0.35, + "step": 42855 + }, + { + "epoch": 2.1812805208898536, + "grad_norm": 3.770812511444092, + "learning_rate": 4.229820474167466e-06, + "loss": 0.1381, + "step": 42856 + }, + { + "epoch": 2.1812940857297884, + "grad_norm": 5.639317035675049, + "learning_rate": 4.229683431547211e-06, + "loss": 0.2402, + "step": 42857 + }, + { + "epoch": 2.1813076505697233, + "grad_norm": 6.128459930419922, + "learning_rate": 4.2295463889269565e-06, + "loss": 0.1832, + "step": 42858 + }, + { + "epoch": 2.181321215409658, + "grad_norm": 5.99129056930542, + "learning_rate": 4.229409346306702e-06, + "loss": 0.1748, + "step": 42859 + }, + { + "epoch": 2.181334780249593, + "grad_norm": 4.925242900848389, + "learning_rate": 4.229272303686447e-06, + "loss": 0.1829, + "step": 42860 + }, + { + "epoch": 2.181348345089528, + "grad_norm": 5.181553363800049, + "learning_rate": 4.229135261066192e-06, + "loss": 0.1962, + "step": 42861 + }, + { + "epoch": 2.1813619099294628, + "grad_norm": 5.610361099243164, + "learning_rate": 4.228998218445937e-06, + "loss": 0.2597, + "step": 42862 + }, + { + "epoch": 2.1813754747693976, + "grad_norm": 4.754395484924316, + "learning_rate": 4.228861175825682e-06, + "loss": 0.1916, + "step": 42863 + }, + { + "epoch": 2.1813890396093325, + "grad_norm": 5.1593098640441895, + "learning_rate": 4.2287241332054276e-06, + "loss": 0.1234, + "step": 42864 + }, + { + "epoch": 2.1814026044492674, + "grad_norm": 3.507594347000122, + "learning_rate": 4.228587090585172e-06, + "loss": 0.1547, + "step": 42865 + }, + { + "epoch": 2.1814161692892022, + "grad_norm": 5.221263885498047, + "learning_rate": 4.228450047964918e-06, + "loss": 0.1544, + "step": 42866 + }, + { + "epoch": 2.181429734129137, + "grad_norm": 5.061459541320801, + "learning_rate": 4.228313005344662e-06, + "loss": 0.2016, + "step": 42867 + }, + { + "epoch": 2.181443298969072, + "grad_norm": 5.031657695770264, + "learning_rate": 4.228175962724408e-06, + "loss": 0.2489, + "step": 42868 + }, + { + "epoch": 2.1814568638090073, + "grad_norm": 3.9266905784606934, + "learning_rate": 4.228038920104153e-06, + "loss": 0.1746, + "step": 42869 + }, + { + "epoch": 2.181470428648942, + "grad_norm": 3.1614084243774414, + "learning_rate": 4.227901877483898e-06, + "loss": 0.106, + "step": 42870 + }, + { + "epoch": 2.181483993488877, + "grad_norm": 4.418569087982178, + "learning_rate": 4.227764834863643e-06, + "loss": 0.1088, + "step": 42871 + }, + { + "epoch": 2.181497558328812, + "grad_norm": 4.948497295379639, + "learning_rate": 4.227627792243388e-06, + "loss": 0.1626, + "step": 42872 + }, + { + "epoch": 2.1815111231687467, + "grad_norm": 6.22346830368042, + "learning_rate": 4.227490749623133e-06, + "loss": 0.1975, + "step": 42873 + }, + { + "epoch": 2.1815246880086816, + "grad_norm": 4.675849437713623, + "learning_rate": 4.227353707002878e-06, + "loss": 0.1344, + "step": 42874 + }, + { + "epoch": 2.1815382528486165, + "grad_norm": 4.4385762214660645, + "learning_rate": 4.227216664382624e-06, + "loss": 0.1509, + "step": 42875 + }, + { + "epoch": 2.1815518176885513, + "grad_norm": 4.58058500289917, + "learning_rate": 4.227079621762368e-06, + "loss": 0.2642, + "step": 42876 + }, + { + "epoch": 2.181565382528486, + "grad_norm": 6.118190288543701, + "learning_rate": 4.226942579142114e-06, + "loss": 0.1814, + "step": 42877 + }, + { + "epoch": 2.181578947368421, + "grad_norm": 5.037615776062012, + "learning_rate": 4.226805536521858e-06, + "loss": 0.2166, + "step": 42878 + }, + { + "epoch": 2.181592512208356, + "grad_norm": 4.500471115112305, + "learning_rate": 4.2266684939016035e-06, + "loss": 0.1211, + "step": 42879 + }, + { + "epoch": 2.181606077048291, + "grad_norm": 4.471715927124023, + "learning_rate": 4.226531451281349e-06, + "loss": 0.143, + "step": 42880 + }, + { + "epoch": 2.1816196418882257, + "grad_norm": 4.848103046417236, + "learning_rate": 4.226394408661094e-06, + "loss": 0.1382, + "step": 42881 + }, + { + "epoch": 2.1816332067281605, + "grad_norm": 5.135316848754883, + "learning_rate": 4.226257366040839e-06, + "loss": 0.207, + "step": 42882 + }, + { + "epoch": 2.1816467715680954, + "grad_norm": 4.843246936798096, + "learning_rate": 4.226120323420584e-06, + "loss": 0.1589, + "step": 42883 + }, + { + "epoch": 2.1816603364080303, + "grad_norm": 4.042951583862305, + "learning_rate": 4.2259832808003294e-06, + "loss": 0.1574, + "step": 42884 + }, + { + "epoch": 2.181673901247965, + "grad_norm": 4.074462890625, + "learning_rate": 4.225846238180074e-06, + "loss": 0.1498, + "step": 42885 + }, + { + "epoch": 2.1816874660879, + "grad_norm": 4.293385982513428, + "learning_rate": 4.22570919555982e-06, + "loss": 0.164, + "step": 42886 + }, + { + "epoch": 2.181701030927835, + "grad_norm": 4.0432538986206055, + "learning_rate": 4.225572152939564e-06, + "loss": 0.119, + "step": 42887 + }, + { + "epoch": 2.1817145957677697, + "grad_norm": 5.876583099365234, + "learning_rate": 4.22543511031931e-06, + "loss": 0.1485, + "step": 42888 + }, + { + "epoch": 2.181728160607705, + "grad_norm": 5.2366533279418945, + "learning_rate": 4.2252980676990545e-06, + "loss": 0.2058, + "step": 42889 + }, + { + "epoch": 2.18174172544764, + "grad_norm": 4.349696159362793, + "learning_rate": 4.2251610250788e-06, + "loss": 0.157, + "step": 42890 + }, + { + "epoch": 2.1817552902875748, + "grad_norm": 3.8275368213653564, + "learning_rate": 4.225023982458545e-06, + "loss": 0.1199, + "step": 42891 + }, + { + "epoch": 2.1817688551275096, + "grad_norm": 5.614572048187256, + "learning_rate": 4.22488693983829e-06, + "loss": 0.1902, + "step": 42892 + }, + { + "epoch": 2.1817824199674445, + "grad_norm": 5.309086322784424, + "learning_rate": 4.224749897218035e-06, + "loss": 0.153, + "step": 42893 + }, + { + "epoch": 2.1817959848073794, + "grad_norm": 4.314324855804443, + "learning_rate": 4.22461285459778e-06, + "loss": 0.1345, + "step": 42894 + }, + { + "epoch": 2.181809549647314, + "grad_norm": 4.21394157409668, + "learning_rate": 4.2244758119775256e-06, + "loss": 0.1504, + "step": 42895 + }, + { + "epoch": 2.181823114487249, + "grad_norm": 5.929344177246094, + "learning_rate": 4.224338769357271e-06, + "loss": 0.2154, + "step": 42896 + }, + { + "epoch": 2.181836679327184, + "grad_norm": 2.788689374923706, + "learning_rate": 4.224201726737016e-06, + "loss": 0.0631, + "step": 42897 + }, + { + "epoch": 2.181850244167119, + "grad_norm": 4.242358684539795, + "learning_rate": 4.22406468411676e-06, + "loss": 0.1991, + "step": 42898 + }, + { + "epoch": 2.1818638090070537, + "grad_norm": 4.8203229904174805, + "learning_rate": 4.2239276414965054e-06, + "loss": 0.1706, + "step": 42899 + }, + { + "epoch": 2.1818773738469885, + "grad_norm": 3.9295201301574707, + "learning_rate": 4.223790598876251e-06, + "loss": 0.1808, + "step": 42900 + }, + { + "epoch": 2.1818909386869234, + "grad_norm": 5.04495096206665, + "learning_rate": 4.223653556255996e-06, + "loss": 0.1409, + "step": 42901 + }, + { + "epoch": 2.1819045035268583, + "grad_norm": 3.425854206085205, + "learning_rate": 4.223516513635741e-06, + "loss": 0.0794, + "step": 42902 + }, + { + "epoch": 2.181918068366793, + "grad_norm": 5.1505560874938965, + "learning_rate": 4.223379471015486e-06, + "loss": 0.1646, + "step": 42903 + }, + { + "epoch": 2.181931633206728, + "grad_norm": 5.258304595947266, + "learning_rate": 4.223242428395231e-06, + "loss": 0.1854, + "step": 42904 + }, + { + "epoch": 2.181945198046663, + "grad_norm": 4.246516227722168, + "learning_rate": 4.2231053857749765e-06, + "loss": 0.1371, + "step": 42905 + }, + { + "epoch": 2.1819587628865977, + "grad_norm": 5.616031169891357, + "learning_rate": 4.222968343154722e-06, + "loss": 0.1546, + "step": 42906 + }, + { + "epoch": 2.181972327726533, + "grad_norm": 4.662277698516846, + "learning_rate": 4.222831300534467e-06, + "loss": 0.2559, + "step": 42907 + }, + { + "epoch": 2.181985892566468, + "grad_norm": 2.969722270965576, + "learning_rate": 4.222694257914212e-06, + "loss": 0.0886, + "step": 42908 + }, + { + "epoch": 2.1819994574064028, + "grad_norm": 3.6567211151123047, + "learning_rate": 4.222557215293957e-06, + "loss": 0.1363, + "step": 42909 + }, + { + "epoch": 2.1820130222463376, + "grad_norm": 4.220505237579346, + "learning_rate": 4.2224201726737015e-06, + "loss": 0.0935, + "step": 42910 + }, + { + "epoch": 2.1820265870862725, + "grad_norm": 5.5096755027771, + "learning_rate": 4.222283130053447e-06, + "loss": 0.1764, + "step": 42911 + }, + { + "epoch": 2.1820401519262074, + "grad_norm": 5.1660990715026855, + "learning_rate": 4.222146087433192e-06, + "loss": 0.1914, + "step": 42912 + }, + { + "epoch": 2.1820537167661422, + "grad_norm": 7.099596977233887, + "learning_rate": 4.222009044812937e-06, + "loss": 0.2363, + "step": 42913 + }, + { + "epoch": 2.182067281606077, + "grad_norm": 3.6136791706085205, + "learning_rate": 4.221872002192682e-06, + "loss": 0.1331, + "step": 42914 + }, + { + "epoch": 2.182080846446012, + "grad_norm": 4.75339937210083, + "learning_rate": 4.2217349595724274e-06, + "loss": 0.1232, + "step": 42915 + }, + { + "epoch": 2.182094411285947, + "grad_norm": 4.14885139465332, + "learning_rate": 4.221597916952173e-06, + "loss": 0.1382, + "step": 42916 + }, + { + "epoch": 2.1821079761258817, + "grad_norm": 5.316772937774658, + "learning_rate": 4.221460874331918e-06, + "loss": 0.136, + "step": 42917 + }, + { + "epoch": 2.1821215409658166, + "grad_norm": 3.414726495742798, + "learning_rate": 4.221323831711663e-06, + "loss": 0.1197, + "step": 42918 + }, + { + "epoch": 2.1821351058057514, + "grad_norm": 4.675590991973877, + "learning_rate": 4.221186789091407e-06, + "loss": 0.1786, + "step": 42919 + }, + { + "epoch": 2.1821486706456863, + "grad_norm": 2.2428410053253174, + "learning_rate": 4.221049746471153e-06, + "loss": 0.0405, + "step": 42920 + }, + { + "epoch": 2.182162235485621, + "grad_norm": 3.6900205612182617, + "learning_rate": 4.220912703850898e-06, + "loss": 0.0998, + "step": 42921 + }, + { + "epoch": 2.182175800325556, + "grad_norm": 5.8620429039001465, + "learning_rate": 4.220775661230644e-06, + "loss": 0.2083, + "step": 42922 + }, + { + "epoch": 2.182189365165491, + "grad_norm": 4.878808975219727, + "learning_rate": 4.220638618610388e-06, + "loss": 0.1656, + "step": 42923 + }, + { + "epoch": 2.1822029300054258, + "grad_norm": 4.620980262756348, + "learning_rate": 4.220501575990133e-06, + "loss": 0.1151, + "step": 42924 + }, + { + "epoch": 2.1822164948453606, + "grad_norm": 3.9423182010650635, + "learning_rate": 4.220364533369878e-06, + "loss": 0.1011, + "step": 42925 + }, + { + "epoch": 2.1822300596852955, + "grad_norm": 5.717024326324463, + "learning_rate": 4.2202274907496236e-06, + "loss": 0.1828, + "step": 42926 + }, + { + "epoch": 2.182243624525231, + "grad_norm": 4.835278034210205, + "learning_rate": 4.220090448129369e-06, + "loss": 0.2028, + "step": 42927 + }, + { + "epoch": 2.1822571893651657, + "grad_norm": 3.9964287281036377, + "learning_rate": 4.219953405509113e-06, + "loss": 0.1506, + "step": 42928 + }, + { + "epoch": 2.1822707542051005, + "grad_norm": 4.435120582580566, + "learning_rate": 4.219816362888859e-06, + "loss": 0.242, + "step": 42929 + }, + { + "epoch": 2.1822843190450354, + "grad_norm": 3.87186598777771, + "learning_rate": 4.2196793202686034e-06, + "loss": 0.1391, + "step": 42930 + }, + { + "epoch": 2.1822978838849703, + "grad_norm": 3.9563796520233154, + "learning_rate": 4.2195422776483495e-06, + "loss": 0.1407, + "step": 42931 + }, + { + "epoch": 2.182311448724905, + "grad_norm": 3.331080675125122, + "learning_rate": 4.219405235028094e-06, + "loss": 0.085, + "step": 42932 + }, + { + "epoch": 2.18232501356484, + "grad_norm": 4.948309421539307, + "learning_rate": 4.219268192407839e-06, + "loss": 0.1602, + "step": 42933 + }, + { + "epoch": 2.182338578404775, + "grad_norm": 3.886777877807617, + "learning_rate": 4.219131149787584e-06, + "loss": 0.1572, + "step": 42934 + }, + { + "epoch": 2.1823521432447097, + "grad_norm": 5.741180896759033, + "learning_rate": 4.218994107167329e-06, + "loss": 0.2515, + "step": 42935 + }, + { + "epoch": 2.1823657080846446, + "grad_norm": 6.387799263000488, + "learning_rate": 4.2188570645470745e-06, + "loss": 0.1713, + "step": 42936 + }, + { + "epoch": 2.1823792729245794, + "grad_norm": 3.53999662399292, + "learning_rate": 4.21872002192682e-06, + "loss": 0.2023, + "step": 42937 + }, + { + "epoch": 2.1823928377645143, + "grad_norm": 4.883192539215088, + "learning_rate": 4.218582979306565e-06, + "loss": 0.2209, + "step": 42938 + }, + { + "epoch": 2.182406402604449, + "grad_norm": 4.020425796508789, + "learning_rate": 4.218445936686309e-06, + "loss": 0.2059, + "step": 42939 + }, + { + "epoch": 2.182419967444384, + "grad_norm": 4.61696195602417, + "learning_rate": 4.218308894066055e-06, + "loss": 0.1501, + "step": 42940 + }, + { + "epoch": 2.182433532284319, + "grad_norm": 4.940043926239014, + "learning_rate": 4.2181718514457996e-06, + "loss": 0.2297, + "step": 42941 + }, + { + "epoch": 2.1824470971242538, + "grad_norm": 5.728464603424072, + "learning_rate": 4.218034808825546e-06, + "loss": 0.2211, + "step": 42942 + }, + { + "epoch": 2.1824606619641886, + "grad_norm": 5.318254470825195, + "learning_rate": 4.21789776620529e-06, + "loss": 0.1866, + "step": 42943 + }, + { + "epoch": 2.1824742268041235, + "grad_norm": 4.44724178314209, + "learning_rate": 4.217760723585035e-06, + "loss": 0.2134, + "step": 42944 + }, + { + "epoch": 2.182487791644059, + "grad_norm": 3.755174398422241, + "learning_rate": 4.21762368096478e-06, + "loss": 0.146, + "step": 42945 + }, + { + "epoch": 2.1825013564839937, + "grad_norm": 5.079160213470459, + "learning_rate": 4.2174866383445254e-06, + "loss": 0.1943, + "step": 42946 + }, + { + "epoch": 2.1825149213239285, + "grad_norm": 3.7957611083984375, + "learning_rate": 4.217349595724271e-06, + "loss": 0.2507, + "step": 42947 + }, + { + "epoch": 2.1825284861638634, + "grad_norm": 4.653693675994873, + "learning_rate": 4.217212553104016e-06, + "loss": 0.2072, + "step": 42948 + }, + { + "epoch": 2.1825420510037983, + "grad_norm": 4.850846767425537, + "learning_rate": 4.217075510483761e-06, + "loss": 0.1691, + "step": 42949 + }, + { + "epoch": 2.182555615843733, + "grad_norm": 3.5962331295013428, + "learning_rate": 4.216938467863506e-06, + "loss": 0.1234, + "step": 42950 + }, + { + "epoch": 2.182569180683668, + "grad_norm": 4.362756252288818, + "learning_rate": 4.216801425243251e-06, + "loss": 0.1567, + "step": 42951 + }, + { + "epoch": 2.182582745523603, + "grad_norm": 3.526543378829956, + "learning_rate": 4.216664382622996e-06, + "loss": 0.1618, + "step": 42952 + }, + { + "epoch": 2.1825963103635377, + "grad_norm": 3.599576473236084, + "learning_rate": 4.216527340002741e-06, + "loss": 0.165, + "step": 42953 + }, + { + "epoch": 2.1826098752034726, + "grad_norm": 4.182389259338379, + "learning_rate": 4.216390297382486e-06, + "loss": 0.1016, + "step": 42954 + }, + { + "epoch": 2.1826234400434075, + "grad_norm": 5.804392337799072, + "learning_rate": 4.216253254762231e-06, + "loss": 0.1774, + "step": 42955 + }, + { + "epoch": 2.1826370048833423, + "grad_norm": 4.928639888763428, + "learning_rate": 4.216116212141976e-06, + "loss": 0.1738, + "step": 42956 + }, + { + "epoch": 2.182650569723277, + "grad_norm": 5.231955528259277, + "learning_rate": 4.2159791695217216e-06, + "loss": 0.2019, + "step": 42957 + }, + { + "epoch": 2.182664134563212, + "grad_norm": 4.8597798347473145, + "learning_rate": 4.215842126901467e-06, + "loss": 0.1912, + "step": 42958 + }, + { + "epoch": 2.182677699403147, + "grad_norm": 4.952744960784912, + "learning_rate": 4.215705084281212e-06, + "loss": 0.1699, + "step": 42959 + }, + { + "epoch": 2.182691264243082, + "grad_norm": 4.203033447265625, + "learning_rate": 4.215568041660957e-06, + "loss": 0.1637, + "step": 42960 + }, + { + "epoch": 2.1827048290830167, + "grad_norm": 5.933265686035156, + "learning_rate": 4.215430999040702e-06, + "loss": 0.1289, + "step": 42961 + }, + { + "epoch": 2.1827183939229515, + "grad_norm": 4.257768154144287, + "learning_rate": 4.215293956420447e-06, + "loss": 0.126, + "step": 42962 + }, + { + "epoch": 2.1827319587628864, + "grad_norm": 4.382726192474365, + "learning_rate": 4.215156913800193e-06, + "loss": 0.0969, + "step": 42963 + }, + { + "epoch": 2.1827455236028213, + "grad_norm": 4.718879699707031, + "learning_rate": 4.215019871179937e-06, + "loss": 0.1338, + "step": 42964 + }, + { + "epoch": 2.1827590884427566, + "grad_norm": 3.7665390968322754, + "learning_rate": 4.214882828559683e-06, + "loss": 0.1265, + "step": 42965 + }, + { + "epoch": 2.1827726532826914, + "grad_norm": 6.050836086273193, + "learning_rate": 4.214745785939427e-06, + "loss": 0.1599, + "step": 42966 + }, + { + "epoch": 2.1827862181226263, + "grad_norm": 3.5582666397094727, + "learning_rate": 4.2146087433191725e-06, + "loss": 0.1571, + "step": 42967 + }, + { + "epoch": 2.182799782962561, + "grad_norm": 4.786102294921875, + "learning_rate": 4.214471700698918e-06, + "loss": 0.2488, + "step": 42968 + }, + { + "epoch": 2.182813347802496, + "grad_norm": 3.562011957168579, + "learning_rate": 4.214334658078663e-06, + "loss": 0.1459, + "step": 42969 + }, + { + "epoch": 2.182826912642431, + "grad_norm": 3.1178534030914307, + "learning_rate": 4.214197615458408e-06, + "loss": 0.113, + "step": 42970 + }, + { + "epoch": 2.1828404774823658, + "grad_norm": 7.133730411529541, + "learning_rate": 4.214060572838153e-06, + "loss": 0.3767, + "step": 42971 + }, + { + "epoch": 2.1828540423223006, + "grad_norm": 4.954465866088867, + "learning_rate": 4.213923530217898e-06, + "loss": 0.1946, + "step": 42972 + }, + { + "epoch": 2.1828676071622355, + "grad_norm": 4.498608589172363, + "learning_rate": 4.213786487597643e-06, + "loss": 0.1957, + "step": 42973 + }, + { + "epoch": 2.1828811720021704, + "grad_norm": 4.226646900177002, + "learning_rate": 4.213649444977389e-06, + "loss": 0.1321, + "step": 42974 + }, + { + "epoch": 2.182894736842105, + "grad_norm": 5.439342021942139, + "learning_rate": 4.213512402357133e-06, + "loss": 0.3026, + "step": 42975 + }, + { + "epoch": 2.18290830168204, + "grad_norm": 4.240660667419434, + "learning_rate": 4.213375359736879e-06, + "loss": 0.1267, + "step": 42976 + }, + { + "epoch": 2.182921866521975, + "grad_norm": 5.259294033050537, + "learning_rate": 4.2132383171166234e-06, + "loss": 0.3222, + "step": 42977 + }, + { + "epoch": 2.18293543136191, + "grad_norm": 3.68131422996521, + "learning_rate": 4.213101274496369e-06, + "loss": 0.1097, + "step": 42978 + }, + { + "epoch": 2.1829489962018447, + "grad_norm": 5.52584171295166, + "learning_rate": 4.212964231876114e-06, + "loss": 0.2889, + "step": 42979 + }, + { + "epoch": 2.1829625610417795, + "grad_norm": 7.119539737701416, + "learning_rate": 4.212827189255859e-06, + "loss": 0.365, + "step": 42980 + }, + { + "epoch": 2.1829761258817144, + "grad_norm": 6.840139389038086, + "learning_rate": 4.212690146635604e-06, + "loss": 0.4302, + "step": 42981 + }, + { + "epoch": 2.1829896907216493, + "grad_norm": 5.0260467529296875, + "learning_rate": 4.2125531040153485e-06, + "loss": 0.2152, + "step": 42982 + }, + { + "epoch": 2.1830032555615846, + "grad_norm": 4.3018035888671875, + "learning_rate": 4.2124160613950945e-06, + "loss": 0.1594, + "step": 42983 + }, + { + "epoch": 2.1830168204015195, + "grad_norm": 5.843711853027344, + "learning_rate": 4.212279018774839e-06, + "loss": 0.179, + "step": 42984 + }, + { + "epoch": 2.1830303852414543, + "grad_norm": 5.2116899490356445, + "learning_rate": 4.212141976154585e-06, + "loss": 0.1691, + "step": 42985 + }, + { + "epoch": 2.183043950081389, + "grad_norm": 4.907960891723633, + "learning_rate": 4.212004933534329e-06, + "loss": 0.1718, + "step": 42986 + }, + { + "epoch": 2.183057514921324, + "grad_norm": 4.29164457321167, + "learning_rate": 4.211867890914074e-06, + "loss": 0.1706, + "step": 42987 + }, + { + "epoch": 2.183071079761259, + "grad_norm": 3.849316120147705, + "learning_rate": 4.2117308482938196e-06, + "loss": 0.1165, + "step": 42988 + }, + { + "epoch": 2.183084644601194, + "grad_norm": 5.852626323699951, + "learning_rate": 4.211593805673565e-06, + "loss": 0.2416, + "step": 42989 + }, + { + "epoch": 2.1830982094411286, + "grad_norm": 5.23728609085083, + "learning_rate": 4.21145676305331e-06, + "loss": 0.2288, + "step": 42990 + }, + { + "epoch": 2.1831117742810635, + "grad_norm": 5.146726608276367, + "learning_rate": 4.211319720433055e-06, + "loss": 0.165, + "step": 42991 + }, + { + "epoch": 2.1831253391209984, + "grad_norm": 7.364108085632324, + "learning_rate": 4.2111826778128e-06, + "loss": 0.2425, + "step": 42992 + }, + { + "epoch": 2.1831389039609332, + "grad_norm": 4.052000045776367, + "learning_rate": 4.2110456351925455e-06, + "loss": 0.2531, + "step": 42993 + }, + { + "epoch": 2.183152468800868, + "grad_norm": 5.263298511505127, + "learning_rate": 4.210908592572291e-06, + "loss": 0.2631, + "step": 42994 + }, + { + "epoch": 2.183166033640803, + "grad_norm": 5.1139140129089355, + "learning_rate": 4.210771549952035e-06, + "loss": 0.202, + "step": 42995 + }, + { + "epoch": 2.183179598480738, + "grad_norm": 4.994205474853516, + "learning_rate": 4.210634507331781e-06, + "loss": 0.1847, + "step": 42996 + }, + { + "epoch": 2.1831931633206727, + "grad_norm": 6.364112854003906, + "learning_rate": 4.210497464711525e-06, + "loss": 0.2101, + "step": 42997 + }, + { + "epoch": 2.1832067281606076, + "grad_norm": 5.697750091552734, + "learning_rate": 4.2103604220912705e-06, + "loss": 0.249, + "step": 42998 + }, + { + "epoch": 2.1832202930005424, + "grad_norm": 4.686191558837891, + "learning_rate": 4.210223379471016e-06, + "loss": 0.2056, + "step": 42999 + }, + { + "epoch": 2.1832338578404773, + "grad_norm": 3.5145936012268066, + "learning_rate": 4.210086336850761e-06, + "loss": 0.0943, + "step": 43000 + }, + { + "epoch": 2.183247422680412, + "grad_norm": 4.252605438232422, + "learning_rate": 4.209949294230506e-06, + "loss": 0.1406, + "step": 43001 + }, + { + "epoch": 2.183260987520347, + "grad_norm": 4.37864351272583, + "learning_rate": 4.209812251610251e-06, + "loss": 0.1852, + "step": 43002 + }, + { + "epoch": 2.1832745523602823, + "grad_norm": 3.6228039264678955, + "learning_rate": 4.209675208989996e-06, + "loss": 0.1734, + "step": 43003 + }, + { + "epoch": 2.183288117200217, + "grad_norm": 4.181787014007568, + "learning_rate": 4.209538166369742e-06, + "loss": 0.145, + "step": 43004 + }, + { + "epoch": 2.183301682040152, + "grad_norm": 7.109701633453369, + "learning_rate": 4.209401123749487e-06, + "loss": 0.232, + "step": 43005 + }, + { + "epoch": 2.183315246880087, + "grad_norm": 4.645561695098877, + "learning_rate": 4.209264081129232e-06, + "loss": 0.2019, + "step": 43006 + }, + { + "epoch": 2.183328811720022, + "grad_norm": 5.048049449920654, + "learning_rate": 4.209127038508976e-06, + "loss": 0.1803, + "step": 43007 + }, + { + "epoch": 2.1833423765599567, + "grad_norm": 4.3755998611450195, + "learning_rate": 4.2089899958887215e-06, + "loss": 0.1392, + "step": 43008 + }, + { + "epoch": 2.1833559413998915, + "grad_norm": 4.746567249298096, + "learning_rate": 4.208852953268467e-06, + "loss": 0.1775, + "step": 43009 + }, + { + "epoch": 2.1833695062398264, + "grad_norm": 3.4189727306365967, + "learning_rate": 4.208715910648212e-06, + "loss": 0.136, + "step": 43010 + }, + { + "epoch": 2.1833830710797613, + "grad_norm": 3.8582112789154053, + "learning_rate": 4.208578868027957e-06, + "loss": 0.1858, + "step": 43011 + }, + { + "epoch": 2.183396635919696, + "grad_norm": 3.764364004135132, + "learning_rate": 4.208441825407702e-06, + "loss": 0.2022, + "step": 43012 + }, + { + "epoch": 2.183410200759631, + "grad_norm": 4.7828593254089355, + "learning_rate": 4.208304782787447e-06, + "loss": 0.2267, + "step": 43013 + }, + { + "epoch": 2.183423765599566, + "grad_norm": 6.223347187042236, + "learning_rate": 4.2081677401671925e-06, + "loss": 0.2035, + "step": 43014 + }, + { + "epoch": 2.1834373304395007, + "grad_norm": 4.165804386138916, + "learning_rate": 4.208030697546938e-06, + "loss": 0.2102, + "step": 43015 + }, + { + "epoch": 2.1834508952794356, + "grad_norm": 3.84651780128479, + "learning_rate": 4.207893654926682e-06, + "loss": 0.1314, + "step": 43016 + }, + { + "epoch": 2.1834644601193705, + "grad_norm": 3.7196125984191895, + "learning_rate": 4.207756612306428e-06, + "loss": 0.206, + "step": 43017 + }, + { + "epoch": 2.1834780249593053, + "grad_norm": 3.471959114074707, + "learning_rate": 4.207619569686172e-06, + "loss": 0.1317, + "step": 43018 + }, + { + "epoch": 2.18349158979924, + "grad_norm": 4.9290771484375, + "learning_rate": 4.207482527065918e-06, + "loss": 0.153, + "step": 43019 + }, + { + "epoch": 2.183505154639175, + "grad_norm": 4.1511640548706055, + "learning_rate": 4.207345484445663e-06, + "loss": 0.1758, + "step": 43020 + }, + { + "epoch": 2.1835187194791104, + "grad_norm": 4.746891021728516, + "learning_rate": 4.207208441825408e-06, + "loss": 0.1492, + "step": 43021 + }, + { + "epoch": 2.1835322843190452, + "grad_norm": 4.711607456207275, + "learning_rate": 4.207071399205153e-06, + "loss": 0.1824, + "step": 43022 + }, + { + "epoch": 2.18354584915898, + "grad_norm": 4.937815189361572, + "learning_rate": 4.206934356584898e-06, + "loss": 0.2077, + "step": 43023 + }, + { + "epoch": 2.183559413998915, + "grad_norm": 4.382790565490723, + "learning_rate": 4.2067973139646435e-06, + "loss": 0.2012, + "step": 43024 + }, + { + "epoch": 2.18357297883885, + "grad_norm": 4.542139053344727, + "learning_rate": 4.206660271344389e-06, + "loss": 0.2324, + "step": 43025 + }, + { + "epoch": 2.1835865436787847, + "grad_norm": 4.078619003295898, + "learning_rate": 4.206523228724134e-06, + "loss": 0.179, + "step": 43026 + }, + { + "epoch": 2.1836001085187196, + "grad_norm": 4.798214435577393, + "learning_rate": 4.206386186103878e-06, + "loss": 0.2051, + "step": 43027 + }, + { + "epoch": 2.1836136733586544, + "grad_norm": 4.416938304901123, + "learning_rate": 4.206249143483624e-06, + "loss": 0.2152, + "step": 43028 + }, + { + "epoch": 2.1836272381985893, + "grad_norm": 3.2819459438323975, + "learning_rate": 4.2061121008633685e-06, + "loss": 0.1616, + "step": 43029 + }, + { + "epoch": 2.183640803038524, + "grad_norm": 6.26700496673584, + "learning_rate": 4.2059750582431145e-06, + "loss": 0.2276, + "step": 43030 + }, + { + "epoch": 2.183654367878459, + "grad_norm": 4.144959449768066, + "learning_rate": 4.205838015622859e-06, + "loss": 0.2316, + "step": 43031 + }, + { + "epoch": 2.183667932718394, + "grad_norm": 4.720309257507324, + "learning_rate": 4.205700973002604e-06, + "loss": 0.1957, + "step": 43032 + }, + { + "epoch": 2.1836814975583287, + "grad_norm": 3.493558883666992, + "learning_rate": 4.205563930382349e-06, + "loss": 0.1019, + "step": 43033 + }, + { + "epoch": 2.1836950623982636, + "grad_norm": 4.006492614746094, + "learning_rate": 4.205426887762094e-06, + "loss": 0.1488, + "step": 43034 + }, + { + "epoch": 2.1837086272381985, + "grad_norm": 4.806823253631592, + "learning_rate": 4.20528984514184e-06, + "loss": 0.1904, + "step": 43035 + }, + { + "epoch": 2.1837221920781333, + "grad_norm": 4.565989017486572, + "learning_rate": 4.205152802521584e-06, + "loss": 0.2156, + "step": 43036 + }, + { + "epoch": 2.183735756918068, + "grad_norm": 4.758659362792969, + "learning_rate": 4.20501575990133e-06, + "loss": 0.2106, + "step": 43037 + }, + { + "epoch": 2.183749321758003, + "grad_norm": 5.9094038009643555, + "learning_rate": 4.204878717281074e-06, + "loss": 0.257, + "step": 43038 + }, + { + "epoch": 2.183762886597938, + "grad_norm": 4.538239479064941, + "learning_rate": 4.20474167466082e-06, + "loss": 0.2017, + "step": 43039 + }, + { + "epoch": 2.183776451437873, + "grad_norm": 3.7829909324645996, + "learning_rate": 4.204604632040565e-06, + "loss": 0.1209, + "step": 43040 + }, + { + "epoch": 2.183790016277808, + "grad_norm": 4.578479766845703, + "learning_rate": 4.20446758942031e-06, + "loss": 0.1785, + "step": 43041 + }, + { + "epoch": 2.183803581117743, + "grad_norm": 4.471583843231201, + "learning_rate": 4.204330546800055e-06, + "loss": 0.1996, + "step": 43042 + }, + { + "epoch": 2.183817145957678, + "grad_norm": 5.594025135040283, + "learning_rate": 4.2041935041798e-06, + "loss": 0.2507, + "step": 43043 + }, + { + "epoch": 2.1838307107976127, + "grad_norm": 5.766294956207275, + "learning_rate": 4.204056461559545e-06, + "loss": 0.1632, + "step": 43044 + }, + { + "epoch": 2.1838442756375476, + "grad_norm": 4.519522190093994, + "learning_rate": 4.2039194189392905e-06, + "loss": 0.1458, + "step": 43045 + }, + { + "epoch": 2.1838578404774824, + "grad_norm": 4.614290714263916, + "learning_rate": 4.203782376319036e-06, + "loss": 0.3006, + "step": 43046 + }, + { + "epoch": 2.1838714053174173, + "grad_norm": 6.070743083953857, + "learning_rate": 4.203645333698781e-06, + "loss": 0.238, + "step": 43047 + }, + { + "epoch": 2.183884970157352, + "grad_norm": 7.390203475952148, + "learning_rate": 4.203508291078526e-06, + "loss": 0.1815, + "step": 43048 + }, + { + "epoch": 2.183898534997287, + "grad_norm": 4.874340534210205, + "learning_rate": 4.20337124845827e-06, + "loss": 0.1704, + "step": 43049 + }, + { + "epoch": 2.183912099837222, + "grad_norm": 4.928985595703125, + "learning_rate": 4.203234205838016e-06, + "loss": 0.1301, + "step": 43050 + }, + { + "epoch": 2.1839256646771568, + "grad_norm": 4.744161605834961, + "learning_rate": 4.203097163217761e-06, + "loss": 0.2086, + "step": 43051 + }, + { + "epoch": 2.1839392295170916, + "grad_norm": 6.018096446990967, + "learning_rate": 4.202960120597506e-06, + "loss": 0.2393, + "step": 43052 + }, + { + "epoch": 2.1839527943570265, + "grad_norm": 5.263976097106934, + "learning_rate": 4.202823077977251e-06, + "loss": 0.2586, + "step": 43053 + }, + { + "epoch": 2.1839663591969614, + "grad_norm": 5.140873908996582, + "learning_rate": 4.202686035356996e-06, + "loss": 0.2173, + "step": 43054 + }, + { + "epoch": 2.1839799240368962, + "grad_norm": 4.645747184753418, + "learning_rate": 4.2025489927367415e-06, + "loss": 0.1471, + "step": 43055 + }, + { + "epoch": 2.183993488876831, + "grad_norm": 5.227199077606201, + "learning_rate": 4.202411950116487e-06, + "loss": 0.1998, + "step": 43056 + }, + { + "epoch": 2.184007053716766, + "grad_norm": 6.318131923675537, + "learning_rate": 4.202274907496232e-06, + "loss": 0.3234, + "step": 43057 + }, + { + "epoch": 2.1840206185567013, + "grad_norm": 6.356436729431152, + "learning_rate": 4.202137864875977e-06, + "loss": 0.3999, + "step": 43058 + }, + { + "epoch": 2.184034183396636, + "grad_norm": 5.99599027633667, + "learning_rate": 4.202000822255722e-06, + "loss": 0.3155, + "step": 43059 + }, + { + "epoch": 2.184047748236571, + "grad_norm": 4.902771949768066, + "learning_rate": 4.201863779635467e-06, + "loss": 0.2292, + "step": 43060 + }, + { + "epoch": 2.184061313076506, + "grad_norm": 6.896810531616211, + "learning_rate": 4.201726737015212e-06, + "loss": 0.311, + "step": 43061 + }, + { + "epoch": 2.1840748779164407, + "grad_norm": 7.120660305023193, + "learning_rate": 4.201589694394958e-06, + "loss": 0.4123, + "step": 43062 + }, + { + "epoch": 2.1840884427563756, + "grad_norm": 5.315975666046143, + "learning_rate": 4.201452651774702e-06, + "loss": 0.252, + "step": 43063 + }, + { + "epoch": 2.1841020075963105, + "grad_norm": 4.403675079345703, + "learning_rate": 4.201315609154447e-06, + "loss": 0.1511, + "step": 43064 + }, + { + "epoch": 2.1841155724362453, + "grad_norm": 5.665145397186279, + "learning_rate": 4.201178566534192e-06, + "loss": 0.2127, + "step": 43065 + }, + { + "epoch": 2.18412913727618, + "grad_norm": 5.879733085632324, + "learning_rate": 4.201041523913938e-06, + "loss": 0.1991, + "step": 43066 + }, + { + "epoch": 2.184142702116115, + "grad_norm": 5.293054103851318, + "learning_rate": 4.200904481293683e-06, + "loss": 0.3175, + "step": 43067 + }, + { + "epoch": 2.18415626695605, + "grad_norm": 8.104934692382812, + "learning_rate": 4.200767438673428e-06, + "loss": 0.2487, + "step": 43068 + }, + { + "epoch": 2.184169831795985, + "grad_norm": 4.410277843475342, + "learning_rate": 4.200630396053173e-06, + "loss": 0.2182, + "step": 43069 + }, + { + "epoch": 2.1841833966359196, + "grad_norm": 4.9745917320251465, + "learning_rate": 4.2004933534329175e-06, + "loss": 0.2029, + "step": 43070 + }, + { + "epoch": 2.1841969614758545, + "grad_norm": 5.738224506378174, + "learning_rate": 4.2003563108126635e-06, + "loss": 0.2633, + "step": 43071 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 3.6993513107299805, + "learning_rate": 4.200219268192408e-06, + "loss": 0.265, + "step": 43072 + }, + { + "epoch": 2.1842240911557242, + "grad_norm": 6.777976036071777, + "learning_rate": 4.200082225572154e-06, + "loss": 0.2599, + "step": 43073 + }, + { + "epoch": 2.184237655995659, + "grad_norm": 6.703155517578125, + "learning_rate": 4.199945182951898e-06, + "loss": 0.1961, + "step": 43074 + }, + { + "epoch": 2.184251220835594, + "grad_norm": 6.344142436981201, + "learning_rate": 4.199808140331643e-06, + "loss": 0.1825, + "step": 43075 + }, + { + "epoch": 2.184264785675529, + "grad_norm": 8.181449890136719, + "learning_rate": 4.1996710977113885e-06, + "loss": 0.2778, + "step": 43076 + }, + { + "epoch": 2.1842783505154637, + "grad_norm": 7.597161293029785, + "learning_rate": 4.199534055091134e-06, + "loss": 0.3151, + "step": 43077 + }, + { + "epoch": 2.1842919153553986, + "grad_norm": 6.198645114898682, + "learning_rate": 4.199397012470879e-06, + "loss": 0.2355, + "step": 43078 + }, + { + "epoch": 2.184305480195334, + "grad_norm": 7.124317646026611, + "learning_rate": 4.199259969850624e-06, + "loss": 0.3853, + "step": 43079 + }, + { + "epoch": 2.1843190450352687, + "grad_norm": 3.9603779315948486, + "learning_rate": 4.199122927230369e-06, + "loss": 0.1154, + "step": 43080 + }, + { + "epoch": 2.1843326098752036, + "grad_norm": 5.323820114135742, + "learning_rate": 4.198985884610114e-06, + "loss": 0.2359, + "step": 43081 + }, + { + "epoch": 2.1843461747151385, + "grad_norm": 6.197089195251465, + "learning_rate": 4.19884884198986e-06, + "loss": 0.3589, + "step": 43082 + }, + { + "epoch": 2.1843597395550733, + "grad_norm": 7.7900519371032715, + "learning_rate": 4.198711799369604e-06, + "loss": 0.2693, + "step": 43083 + }, + { + "epoch": 2.184373304395008, + "grad_norm": 5.99513578414917, + "learning_rate": 4.19857475674935e-06, + "loss": 0.3177, + "step": 43084 + }, + { + "epoch": 2.184386869234943, + "grad_norm": 6.055047035217285, + "learning_rate": 4.198437714129094e-06, + "loss": 0.2615, + "step": 43085 + }, + { + "epoch": 2.184400434074878, + "grad_norm": 6.167206287384033, + "learning_rate": 4.1983006715088395e-06, + "loss": 0.2783, + "step": 43086 + }, + { + "epoch": 2.184413998914813, + "grad_norm": 6.182493209838867, + "learning_rate": 4.198163628888585e-06, + "loss": 0.2296, + "step": 43087 + }, + { + "epoch": 2.1844275637547477, + "grad_norm": 5.06636905670166, + "learning_rate": 4.19802658626833e-06, + "loss": 0.229, + "step": 43088 + }, + { + "epoch": 2.1844411285946825, + "grad_norm": 3.9131340980529785, + "learning_rate": 4.197889543648075e-06, + "loss": 0.183, + "step": 43089 + }, + { + "epoch": 2.1844546934346174, + "grad_norm": 4.1859869956970215, + "learning_rate": 4.197752501027819e-06, + "loss": 0.2155, + "step": 43090 + }, + { + "epoch": 2.1844682582745523, + "grad_norm": 4.714141845703125, + "learning_rate": 4.197615458407565e-06, + "loss": 0.2161, + "step": 43091 + }, + { + "epoch": 2.184481823114487, + "grad_norm": 5.664577484130859, + "learning_rate": 4.19747841578731e-06, + "loss": 0.2546, + "step": 43092 + }, + { + "epoch": 2.184495387954422, + "grad_norm": 4.463079929351807, + "learning_rate": 4.197341373167056e-06, + "loss": 0.249, + "step": 43093 + }, + { + "epoch": 2.184508952794357, + "grad_norm": 4.3055009841918945, + "learning_rate": 4.1972043305468e-06, + "loss": 0.125, + "step": 43094 + }, + { + "epoch": 2.1845225176342917, + "grad_norm": 4.044596195220947, + "learning_rate": 4.197067287926545e-06, + "loss": 0.2353, + "step": 43095 + }, + { + "epoch": 2.184536082474227, + "grad_norm": 4.481740474700928, + "learning_rate": 4.19693024530629e-06, + "loss": 0.2051, + "step": 43096 + }, + { + "epoch": 2.184549647314162, + "grad_norm": 5.634951591491699, + "learning_rate": 4.196793202686036e-06, + "loss": 0.2601, + "step": 43097 + }, + { + "epoch": 2.1845632121540968, + "grad_norm": 4.8493170738220215, + "learning_rate": 4.196656160065781e-06, + "loss": 0.2522, + "step": 43098 + }, + { + "epoch": 2.1845767769940316, + "grad_norm": 4.879083633422852, + "learning_rate": 4.196519117445526e-06, + "loss": 0.1741, + "step": 43099 + }, + { + "epoch": 2.1845903418339665, + "grad_norm": 4.756614685058594, + "learning_rate": 4.196382074825271e-06, + "loss": 0.1602, + "step": 43100 + }, + { + "epoch": 2.1846039066739014, + "grad_norm": 5.673826217651367, + "learning_rate": 4.196245032205016e-06, + "loss": 0.2379, + "step": 43101 + }, + { + "epoch": 2.1846174715138362, + "grad_norm": 4.13653039932251, + "learning_rate": 4.1961079895847615e-06, + "loss": 0.2041, + "step": 43102 + }, + { + "epoch": 2.184631036353771, + "grad_norm": 5.090222358703613, + "learning_rate": 4.195970946964507e-06, + "loss": 0.2072, + "step": 43103 + }, + { + "epoch": 2.184644601193706, + "grad_norm": 4.315505504608154, + "learning_rate": 4.195833904344251e-06, + "loss": 0.1871, + "step": 43104 + }, + { + "epoch": 2.184658166033641, + "grad_norm": 5.648229122161865, + "learning_rate": 4.195696861723996e-06, + "loss": 0.1936, + "step": 43105 + }, + { + "epoch": 2.1846717308735757, + "grad_norm": 4.834507465362549, + "learning_rate": 4.195559819103741e-06, + "loss": 0.2111, + "step": 43106 + }, + { + "epoch": 2.1846852957135106, + "grad_norm": 6.7001953125, + "learning_rate": 4.1954227764834865e-06, + "loss": 0.1722, + "step": 43107 + }, + { + "epoch": 2.1846988605534454, + "grad_norm": 4.367169380187988, + "learning_rate": 4.195285733863232e-06, + "loss": 0.2025, + "step": 43108 + }, + { + "epoch": 2.1847124253933803, + "grad_norm": 3.961519956588745, + "learning_rate": 4.195148691242977e-06, + "loss": 0.2383, + "step": 43109 + }, + { + "epoch": 2.184725990233315, + "grad_norm": 6.199605941772461, + "learning_rate": 4.195011648622722e-06, + "loss": 0.2437, + "step": 43110 + }, + { + "epoch": 2.18473955507325, + "grad_norm": 5.5555033683776855, + "learning_rate": 4.194874606002467e-06, + "loss": 0.262, + "step": 43111 + }, + { + "epoch": 2.184753119913185, + "grad_norm": 3.8293540477752686, + "learning_rate": 4.1947375633822124e-06, + "loss": 0.1643, + "step": 43112 + }, + { + "epoch": 2.1847666847531197, + "grad_norm": 5.130963325500488, + "learning_rate": 4.194600520761958e-06, + "loss": 0.1811, + "step": 43113 + }, + { + "epoch": 2.1847802495930546, + "grad_norm": 5.796036720275879, + "learning_rate": 4.194463478141703e-06, + "loss": 0.2441, + "step": 43114 + }, + { + "epoch": 2.1847938144329895, + "grad_norm": 6.008515357971191, + "learning_rate": 4.194326435521447e-06, + "loss": 0.3364, + "step": 43115 + }, + { + "epoch": 2.1848073792729243, + "grad_norm": 4.4585161209106445, + "learning_rate": 4.194189392901193e-06, + "loss": 0.2278, + "step": 43116 + }, + { + "epoch": 2.1848209441128597, + "grad_norm": 7.49477481842041, + "learning_rate": 4.1940523502809375e-06, + "loss": 0.2888, + "step": 43117 + }, + { + "epoch": 2.1848345089527945, + "grad_norm": 3.6282477378845215, + "learning_rate": 4.193915307660683e-06, + "loss": 0.1201, + "step": 43118 + }, + { + "epoch": 2.1848480737927294, + "grad_norm": 4.554183006286621, + "learning_rate": 4.193778265040428e-06, + "loss": 0.1932, + "step": 43119 + }, + { + "epoch": 2.1848616386326642, + "grad_norm": 6.034191608428955, + "learning_rate": 4.193641222420173e-06, + "loss": 0.3238, + "step": 43120 + }, + { + "epoch": 2.184875203472599, + "grad_norm": 5.192244052886963, + "learning_rate": 4.193504179799918e-06, + "loss": 0.223, + "step": 43121 + }, + { + "epoch": 2.184888768312534, + "grad_norm": 5.470804214477539, + "learning_rate": 4.193367137179663e-06, + "loss": 0.2367, + "step": 43122 + }, + { + "epoch": 2.184902333152469, + "grad_norm": 6.441656112670898, + "learning_rate": 4.1932300945594086e-06, + "loss": 0.2908, + "step": 43123 + }, + { + "epoch": 2.1849158979924037, + "grad_norm": 6.078130722045898, + "learning_rate": 4.193093051939153e-06, + "loss": 0.2589, + "step": 43124 + }, + { + "epoch": 2.1849294628323386, + "grad_norm": 7.9297356605529785, + "learning_rate": 4.192956009318899e-06, + "loss": 0.3641, + "step": 43125 + }, + { + "epoch": 2.1849430276722734, + "grad_norm": 7.045116901397705, + "learning_rate": 4.192818966698643e-06, + "loss": 0.307, + "step": 43126 + }, + { + "epoch": 2.1849565925122083, + "grad_norm": 5.758180141448975, + "learning_rate": 4.192681924078389e-06, + "loss": 0.184, + "step": 43127 + }, + { + "epoch": 2.184970157352143, + "grad_norm": 5.601241588592529, + "learning_rate": 4.192544881458134e-06, + "loss": 0.1489, + "step": 43128 + }, + { + "epoch": 2.184983722192078, + "grad_norm": 4.023838043212891, + "learning_rate": 4.192407838837879e-06, + "loss": 0.2301, + "step": 43129 + }, + { + "epoch": 2.184997287032013, + "grad_norm": 4.661378860473633, + "learning_rate": 4.192270796217624e-06, + "loss": 0.1984, + "step": 43130 + }, + { + "epoch": 2.1850108518719478, + "grad_norm": 3.39024019241333, + "learning_rate": 4.192133753597369e-06, + "loss": 0.1454, + "step": 43131 + }, + { + "epoch": 2.1850244167118826, + "grad_norm": 3.847604513168335, + "learning_rate": 4.191996710977114e-06, + "loss": 0.1536, + "step": 43132 + }, + { + "epoch": 2.1850379815518175, + "grad_norm": 4.694674015045166, + "learning_rate": 4.1918596683568595e-06, + "loss": 0.1454, + "step": 43133 + }, + { + "epoch": 2.185051546391753, + "grad_norm": 5.126539707183838, + "learning_rate": 4.191722625736605e-06, + "loss": 0.1989, + "step": 43134 + }, + { + "epoch": 2.1850651112316877, + "grad_norm": 6.455295085906982, + "learning_rate": 4.191585583116349e-06, + "loss": 0.2414, + "step": 43135 + }, + { + "epoch": 2.1850786760716225, + "grad_norm": 6.715060234069824, + "learning_rate": 4.191448540496095e-06, + "loss": 0.2203, + "step": 43136 + }, + { + "epoch": 2.1850922409115574, + "grad_norm": 4.346305847167969, + "learning_rate": 4.191311497875839e-06, + "loss": 0.1742, + "step": 43137 + }, + { + "epoch": 2.1851058057514923, + "grad_norm": 5.098424434661865, + "learning_rate": 4.191174455255585e-06, + "loss": 0.1984, + "step": 43138 + }, + { + "epoch": 2.185119370591427, + "grad_norm": 4.643770217895508, + "learning_rate": 4.19103741263533e-06, + "loss": 0.1709, + "step": 43139 + }, + { + "epoch": 2.185132935431362, + "grad_norm": 3.7773396968841553, + "learning_rate": 4.190900370015075e-06, + "loss": 0.1925, + "step": 43140 + }, + { + "epoch": 2.185146500271297, + "grad_norm": 5.6168646812438965, + "learning_rate": 4.19076332739482e-06, + "loss": 0.2546, + "step": 43141 + }, + { + "epoch": 2.1851600651112317, + "grad_norm": 6.623148441314697, + "learning_rate": 4.190626284774565e-06, + "loss": 0.3478, + "step": 43142 + }, + { + "epoch": 2.1851736299511666, + "grad_norm": 4.553048610687256, + "learning_rate": 4.1904892421543104e-06, + "loss": 0.2879, + "step": 43143 + }, + { + "epoch": 2.1851871947911015, + "grad_norm": 4.201727390289307, + "learning_rate": 4.190352199534056e-06, + "loss": 0.0999, + "step": 43144 + }, + { + "epoch": 2.1852007596310363, + "grad_norm": 3.115884304046631, + "learning_rate": 4.190215156913801e-06, + "loss": 0.1619, + "step": 43145 + }, + { + "epoch": 2.185214324470971, + "grad_norm": 3.413370132446289, + "learning_rate": 4.190078114293545e-06, + "loss": 0.1958, + "step": 43146 + }, + { + "epoch": 2.185227889310906, + "grad_norm": 3.5926263332366943, + "learning_rate": 4.189941071673291e-06, + "loss": 0.1545, + "step": 43147 + }, + { + "epoch": 2.185241454150841, + "grad_norm": 3.9299161434173584, + "learning_rate": 4.1898040290530355e-06, + "loss": 0.2629, + "step": 43148 + }, + { + "epoch": 2.185255018990776, + "grad_norm": 3.5296132564544678, + "learning_rate": 4.189666986432781e-06, + "loss": 0.1078, + "step": 43149 + }, + { + "epoch": 2.1852685838307107, + "grad_norm": 4.989166736602783, + "learning_rate": 4.189529943812526e-06, + "loss": 0.1651, + "step": 43150 + }, + { + "epoch": 2.1852821486706455, + "grad_norm": 5.2160725593566895, + "learning_rate": 4.189392901192271e-06, + "loss": 0.1877, + "step": 43151 + }, + { + "epoch": 2.1852957135105804, + "grad_norm": 4.30056095123291, + "learning_rate": 4.189255858572016e-06, + "loss": 0.1505, + "step": 43152 + }, + { + "epoch": 2.1853092783505152, + "grad_norm": 5.865267753601074, + "learning_rate": 4.189118815951761e-06, + "loss": 0.2081, + "step": 43153 + }, + { + "epoch": 2.1853228431904506, + "grad_norm": 3.3171029090881348, + "learning_rate": 4.1889817733315066e-06, + "loss": 0.1476, + "step": 43154 + }, + { + "epoch": 2.1853364080303854, + "grad_norm": 3.558565139770508, + "learning_rate": 4.188844730711252e-06, + "loss": 0.1322, + "step": 43155 + }, + { + "epoch": 2.1853499728703203, + "grad_norm": 5.64432954788208, + "learning_rate": 4.188707688090997e-06, + "loss": 0.1527, + "step": 43156 + }, + { + "epoch": 2.185363537710255, + "grad_norm": 6.179302215576172, + "learning_rate": 4.188570645470742e-06, + "loss": 0.2018, + "step": 43157 + }, + { + "epoch": 2.18537710255019, + "grad_norm": 4.918982028961182, + "learning_rate": 4.1884336028504864e-06, + "loss": 0.186, + "step": 43158 + }, + { + "epoch": 2.185390667390125, + "grad_norm": 4.066384315490723, + "learning_rate": 4.188296560230232e-06, + "loss": 0.2262, + "step": 43159 + }, + { + "epoch": 2.1854042322300598, + "grad_norm": 3.9830141067504883, + "learning_rate": 4.188159517609977e-06, + "loss": 0.1347, + "step": 43160 + }, + { + "epoch": 2.1854177970699946, + "grad_norm": 3.8669090270996094, + "learning_rate": 4.188022474989722e-06, + "loss": 0.1588, + "step": 43161 + }, + { + "epoch": 2.1854313619099295, + "grad_norm": 5.144356727600098, + "learning_rate": 4.187885432369467e-06, + "loss": 0.1863, + "step": 43162 + }, + { + "epoch": 2.1854449267498643, + "grad_norm": 5.087184906005859, + "learning_rate": 4.187748389749212e-06, + "loss": 0.22, + "step": 43163 + }, + { + "epoch": 2.185458491589799, + "grad_norm": 6.100813865661621, + "learning_rate": 4.1876113471289575e-06, + "loss": 0.2295, + "step": 43164 + }, + { + "epoch": 2.185472056429734, + "grad_norm": 6.894138813018799, + "learning_rate": 4.187474304508703e-06, + "loss": 0.2529, + "step": 43165 + }, + { + "epoch": 2.185485621269669, + "grad_norm": 5.388411521911621, + "learning_rate": 4.187337261888448e-06, + "loss": 0.2611, + "step": 43166 + }, + { + "epoch": 2.185499186109604, + "grad_norm": 5.454189300537109, + "learning_rate": 4.187200219268193e-06, + "loss": 0.2005, + "step": 43167 + }, + { + "epoch": 2.1855127509495387, + "grad_norm": 5.2316508293151855, + "learning_rate": 4.187063176647938e-06, + "loss": 0.1378, + "step": 43168 + }, + { + "epoch": 2.1855263157894735, + "grad_norm": 5.379487991333008, + "learning_rate": 4.1869261340276825e-06, + "loss": 0.2323, + "step": 43169 + }, + { + "epoch": 2.1855398806294084, + "grad_norm": 3.7832834720611572, + "learning_rate": 4.1867890914074286e-06, + "loss": 0.1696, + "step": 43170 + }, + { + "epoch": 2.1855534454693433, + "grad_norm": 6.540595531463623, + "learning_rate": 4.186652048787173e-06, + "loss": 0.1577, + "step": 43171 + }, + { + "epoch": 2.1855670103092786, + "grad_norm": 4.494967937469482, + "learning_rate": 4.186515006166919e-06, + "loss": 0.2382, + "step": 43172 + }, + { + "epoch": 2.1855805751492134, + "grad_norm": 4.353511333465576, + "learning_rate": 4.186377963546663e-06, + "loss": 0.2071, + "step": 43173 + }, + { + "epoch": 2.1855941399891483, + "grad_norm": 3.710080146789551, + "learning_rate": 4.1862409209264084e-06, + "loss": 0.1294, + "step": 43174 + }, + { + "epoch": 2.185607704829083, + "grad_norm": 2.853809118270874, + "learning_rate": 4.186103878306154e-06, + "loss": 0.0979, + "step": 43175 + }, + { + "epoch": 2.185621269669018, + "grad_norm": 6.938180446624756, + "learning_rate": 4.185966835685899e-06, + "loss": 0.28, + "step": 43176 + }, + { + "epoch": 2.185634834508953, + "grad_norm": 4.362263202667236, + "learning_rate": 4.185829793065644e-06, + "loss": 0.182, + "step": 43177 + }, + { + "epoch": 2.1856483993488878, + "grad_norm": 7.078090190887451, + "learning_rate": 4.185692750445388e-06, + "loss": 0.2754, + "step": 43178 + }, + { + "epoch": 2.1856619641888226, + "grad_norm": 5.393857479095459, + "learning_rate": 4.185555707825134e-06, + "loss": 0.211, + "step": 43179 + }, + { + "epoch": 2.1856755290287575, + "grad_norm": 5.130621910095215, + "learning_rate": 4.185418665204879e-06, + "loss": 0.2093, + "step": 43180 + }, + { + "epoch": 2.1856890938686924, + "grad_norm": 3.945526599884033, + "learning_rate": 4.185281622584625e-06, + "loss": 0.1, + "step": 43181 + }, + { + "epoch": 2.1857026587086272, + "grad_norm": 3.59844708442688, + "learning_rate": 4.185144579964369e-06, + "loss": 0.1012, + "step": 43182 + }, + { + "epoch": 2.185716223548562, + "grad_norm": 5.93687105178833, + "learning_rate": 4.185007537344114e-06, + "loss": 0.2417, + "step": 43183 + }, + { + "epoch": 2.185729788388497, + "grad_norm": 4.260523319244385, + "learning_rate": 4.184870494723859e-06, + "loss": 0.1091, + "step": 43184 + }, + { + "epoch": 2.185743353228432, + "grad_norm": 4.258644104003906, + "learning_rate": 4.1847334521036046e-06, + "loss": 0.1471, + "step": 43185 + }, + { + "epoch": 2.1857569180683667, + "grad_norm": 3.8955681324005127, + "learning_rate": 4.18459640948335e-06, + "loss": 0.1719, + "step": 43186 + }, + { + "epoch": 2.1857704829083016, + "grad_norm": 5.95446252822876, + "learning_rate": 4.184459366863095e-06, + "loss": 0.1784, + "step": 43187 + }, + { + "epoch": 2.1857840477482364, + "grad_norm": 5.181885719299316, + "learning_rate": 4.18432232424284e-06, + "loss": 0.1689, + "step": 43188 + }, + { + "epoch": 2.1857976125881713, + "grad_norm": 5.4120612144470215, + "learning_rate": 4.1841852816225844e-06, + "loss": 0.1498, + "step": 43189 + }, + { + "epoch": 2.185811177428106, + "grad_norm": 8.429394721984863, + "learning_rate": 4.1840482390023305e-06, + "loss": 0.2297, + "step": 43190 + }, + { + "epoch": 2.185824742268041, + "grad_norm": 5.357088088989258, + "learning_rate": 4.183911196382075e-06, + "loss": 0.1557, + "step": 43191 + }, + { + "epoch": 2.1858383071079763, + "grad_norm": 6.102255344390869, + "learning_rate": 4.18377415376182e-06, + "loss": 0.2666, + "step": 43192 + }, + { + "epoch": 2.185851871947911, + "grad_norm": 5.308225631713867, + "learning_rate": 4.183637111141565e-06, + "loss": 0.2006, + "step": 43193 + }, + { + "epoch": 2.185865436787846, + "grad_norm": 6.882287502288818, + "learning_rate": 4.18350006852131e-06, + "loss": 0.1972, + "step": 43194 + }, + { + "epoch": 2.185879001627781, + "grad_norm": 4.440704345703125, + "learning_rate": 4.1833630259010555e-06, + "loss": 0.1357, + "step": 43195 + }, + { + "epoch": 2.185892566467716, + "grad_norm": 3.7519824504852295, + "learning_rate": 4.183225983280801e-06, + "loss": 0.1203, + "step": 43196 + }, + { + "epoch": 2.1859061313076507, + "grad_norm": 5.315122127532959, + "learning_rate": 4.183088940660546e-06, + "loss": 0.1486, + "step": 43197 + }, + { + "epoch": 2.1859196961475855, + "grad_norm": 5.5967488288879395, + "learning_rate": 4.182951898040291e-06, + "loss": 0.2308, + "step": 43198 + }, + { + "epoch": 2.1859332609875204, + "grad_norm": 4.291515827178955, + "learning_rate": 4.182814855420036e-06, + "loss": 0.1388, + "step": 43199 + }, + { + "epoch": 2.1859468258274553, + "grad_norm": 4.469959259033203, + "learning_rate": 4.1826778127997806e-06, + "loss": 0.1632, + "step": 43200 + }, + { + "epoch": 2.18596039066739, + "grad_norm": 6.087944984436035, + "learning_rate": 4.182540770179527e-06, + "loss": 0.1709, + "step": 43201 + }, + { + "epoch": 2.185973955507325, + "grad_norm": 6.709345817565918, + "learning_rate": 4.182403727559271e-06, + "loss": 0.3207, + "step": 43202 + }, + { + "epoch": 2.18598752034726, + "grad_norm": 5.272362232208252, + "learning_rate": 4.182266684939016e-06, + "loss": 0.1776, + "step": 43203 + }, + { + "epoch": 2.1860010851871947, + "grad_norm": 5.0677642822265625, + "learning_rate": 4.182129642318761e-06, + "loss": 0.1457, + "step": 43204 + }, + { + "epoch": 2.1860146500271296, + "grad_norm": 3.892200469970703, + "learning_rate": 4.1819925996985064e-06, + "loss": 0.2247, + "step": 43205 + }, + { + "epoch": 2.1860282148670644, + "grad_norm": 4.646266460418701, + "learning_rate": 4.181855557078252e-06, + "loss": 0.2022, + "step": 43206 + }, + { + "epoch": 2.1860417797069993, + "grad_norm": 5.542323112487793, + "learning_rate": 4.181718514457997e-06, + "loss": 0.2046, + "step": 43207 + }, + { + "epoch": 2.186055344546934, + "grad_norm": 3.9739229679107666, + "learning_rate": 4.181581471837742e-06, + "loss": 0.1891, + "step": 43208 + }, + { + "epoch": 2.186068909386869, + "grad_norm": 6.842639923095703, + "learning_rate": 4.181444429217487e-06, + "loss": 0.3444, + "step": 43209 + }, + { + "epoch": 2.1860824742268044, + "grad_norm": 3.30373215675354, + "learning_rate": 4.181307386597232e-06, + "loss": 0.1531, + "step": 43210 + }, + { + "epoch": 2.186096039066739, + "grad_norm": 4.935296535491943, + "learning_rate": 4.1811703439769775e-06, + "loss": 0.1863, + "step": 43211 + }, + { + "epoch": 2.186109603906674, + "grad_norm": 2.9596660137176514, + "learning_rate": 4.181033301356722e-06, + "loss": 0.1218, + "step": 43212 + }, + { + "epoch": 2.186123168746609, + "grad_norm": 4.672316074371338, + "learning_rate": 4.180896258736468e-06, + "loss": 0.1553, + "step": 43213 + }, + { + "epoch": 2.186136733586544, + "grad_norm": 4.779160499572754, + "learning_rate": 4.180759216116212e-06, + "loss": 0.198, + "step": 43214 + }, + { + "epoch": 2.1861502984264787, + "grad_norm": 5.237349510192871, + "learning_rate": 4.180622173495957e-06, + "loss": 0.225, + "step": 43215 + }, + { + "epoch": 2.1861638632664135, + "grad_norm": 4.725249767303467, + "learning_rate": 4.1804851308757026e-06, + "loss": 0.3298, + "step": 43216 + }, + { + "epoch": 2.1861774281063484, + "grad_norm": 4.916568279266357, + "learning_rate": 4.180348088255448e-06, + "loss": 0.2207, + "step": 43217 + }, + { + "epoch": 2.1861909929462833, + "grad_norm": 3.8690133094787598, + "learning_rate": 4.180211045635193e-06, + "loss": 0.1344, + "step": 43218 + }, + { + "epoch": 2.186204557786218, + "grad_norm": 4.721479892730713, + "learning_rate": 4.180074003014938e-06, + "loss": 0.1954, + "step": 43219 + }, + { + "epoch": 2.186218122626153, + "grad_norm": 4.242181301116943, + "learning_rate": 4.179936960394683e-06, + "loss": 0.1919, + "step": 43220 + }, + { + "epoch": 2.186231687466088, + "grad_norm": 6.946017742156982, + "learning_rate": 4.1797999177744285e-06, + "loss": 0.258, + "step": 43221 + }, + { + "epoch": 2.1862452523060227, + "grad_norm": 4.071340560913086, + "learning_rate": 4.179662875154174e-06, + "loss": 0.1636, + "step": 43222 + }, + { + "epoch": 2.1862588171459576, + "grad_norm": 5.954448223114014, + "learning_rate": 4.179525832533918e-06, + "loss": 0.2129, + "step": 43223 + }, + { + "epoch": 2.1862723819858925, + "grad_norm": 4.4534735679626465, + "learning_rate": 4.179388789913664e-06, + "loss": 0.1872, + "step": 43224 + }, + { + "epoch": 2.1862859468258273, + "grad_norm": 3.018749713897705, + "learning_rate": 4.179251747293408e-06, + "loss": 0.1256, + "step": 43225 + }, + { + "epoch": 2.186299511665762, + "grad_norm": 4.401392459869385, + "learning_rate": 4.179114704673154e-06, + "loss": 0.1433, + "step": 43226 + }, + { + "epoch": 2.186313076505697, + "grad_norm": 3.476773262023926, + "learning_rate": 4.178977662052899e-06, + "loss": 0.2121, + "step": 43227 + }, + { + "epoch": 2.186326641345632, + "grad_norm": 4.266852855682373, + "learning_rate": 4.178840619432644e-06, + "loss": 0.1372, + "step": 43228 + }, + { + "epoch": 2.186340206185567, + "grad_norm": 4.07780122756958, + "learning_rate": 4.178703576812389e-06, + "loss": 0.1892, + "step": 43229 + }, + { + "epoch": 2.186353771025502, + "grad_norm": 4.064104080200195, + "learning_rate": 4.178566534192134e-06, + "loss": 0.1485, + "step": 43230 + }, + { + "epoch": 2.186367335865437, + "grad_norm": 6.652205467224121, + "learning_rate": 4.178429491571879e-06, + "loss": 0.2168, + "step": 43231 + }, + { + "epoch": 2.186380900705372, + "grad_norm": 3.1403181552886963, + "learning_rate": 4.178292448951624e-06, + "loss": 0.1249, + "step": 43232 + }, + { + "epoch": 2.1863944655453067, + "grad_norm": 3.5902674198150635, + "learning_rate": 4.17815540633137e-06, + "loss": 0.2392, + "step": 43233 + }, + { + "epoch": 2.1864080303852416, + "grad_norm": 3.834373950958252, + "learning_rate": 4.178018363711114e-06, + "loss": 0.2, + "step": 43234 + }, + { + "epoch": 2.1864215952251764, + "grad_norm": 5.057625770568848, + "learning_rate": 4.17788132109086e-06, + "loss": 0.1859, + "step": 43235 + }, + { + "epoch": 2.1864351600651113, + "grad_norm": 3.841392993927002, + "learning_rate": 4.1777442784706045e-06, + "loss": 0.082, + "step": 43236 + }, + { + "epoch": 2.186448724905046, + "grad_norm": 4.802116870880127, + "learning_rate": 4.17760723585035e-06, + "loss": 0.1338, + "step": 43237 + }, + { + "epoch": 2.186462289744981, + "grad_norm": 4.1308135986328125, + "learning_rate": 4.177470193230095e-06, + "loss": 0.1475, + "step": 43238 + }, + { + "epoch": 2.186475854584916, + "grad_norm": 4.2452216148376465, + "learning_rate": 4.17733315060984e-06, + "loss": 0.2006, + "step": 43239 + }, + { + "epoch": 2.1864894194248508, + "grad_norm": 3.6448135375976562, + "learning_rate": 4.177196107989585e-06, + "loss": 0.1426, + "step": 43240 + }, + { + "epoch": 2.1865029842647856, + "grad_norm": 3.400000810623169, + "learning_rate": 4.17705906536933e-06, + "loss": 0.1217, + "step": 43241 + }, + { + "epoch": 2.1865165491047205, + "grad_norm": 4.205625534057617, + "learning_rate": 4.1769220227490755e-06, + "loss": 0.194, + "step": 43242 + }, + { + "epoch": 2.1865301139446554, + "grad_norm": 3.626335859298706, + "learning_rate": 4.17678498012882e-06, + "loss": 0.1224, + "step": 43243 + }, + { + "epoch": 2.18654367878459, + "grad_norm": 3.9447529315948486, + "learning_rate": 4.176647937508566e-06, + "loss": 0.1667, + "step": 43244 + }, + { + "epoch": 2.186557243624525, + "grad_norm": 4.237844467163086, + "learning_rate": 4.17651089488831e-06, + "loss": 0.1874, + "step": 43245 + }, + { + "epoch": 2.18657080846446, + "grad_norm": 4.501078128814697, + "learning_rate": 4.176373852268055e-06, + "loss": 0.1418, + "step": 43246 + }, + { + "epoch": 2.186584373304395, + "grad_norm": 3.7572596073150635, + "learning_rate": 4.1762368096478006e-06, + "loss": 0.1627, + "step": 43247 + }, + { + "epoch": 2.18659793814433, + "grad_norm": 5.045626163482666, + "learning_rate": 4.176099767027546e-06, + "loss": 0.2132, + "step": 43248 + }, + { + "epoch": 2.186611502984265, + "grad_norm": 3.6703617572784424, + "learning_rate": 4.175962724407291e-06, + "loss": 0.1497, + "step": 43249 + }, + { + "epoch": 2.1866250678242, + "grad_norm": 4.805337905883789, + "learning_rate": 4.175825681787036e-06, + "loss": 0.1762, + "step": 43250 + }, + { + "epoch": 2.1866386326641347, + "grad_norm": 3.9279067516326904, + "learning_rate": 4.175688639166781e-06, + "loss": 0.1611, + "step": 43251 + }, + { + "epoch": 2.1866521975040696, + "grad_norm": 5.550412654876709, + "learning_rate": 4.1755515965465265e-06, + "loss": 0.1807, + "step": 43252 + }, + { + "epoch": 2.1866657623440044, + "grad_norm": 3.3847358226776123, + "learning_rate": 4.175414553926272e-06, + "loss": 0.1584, + "step": 43253 + }, + { + "epoch": 2.1866793271839393, + "grad_norm": 4.740591049194336, + "learning_rate": 4.175277511306017e-06, + "loss": 0.2194, + "step": 43254 + }, + { + "epoch": 2.186692892023874, + "grad_norm": 4.3375654220581055, + "learning_rate": 4.175140468685762e-06, + "loss": 0.2503, + "step": 43255 + }, + { + "epoch": 2.186706456863809, + "grad_norm": 7.661896705627441, + "learning_rate": 4.175003426065506e-06, + "loss": 0.2503, + "step": 43256 + }, + { + "epoch": 2.186720021703744, + "grad_norm": 3.972660779953003, + "learning_rate": 4.1748663834452515e-06, + "loss": 0.1648, + "step": 43257 + }, + { + "epoch": 2.1867335865436788, + "grad_norm": 4.606912136077881, + "learning_rate": 4.174729340824997e-06, + "loss": 0.233, + "step": 43258 + }, + { + "epoch": 2.1867471513836136, + "grad_norm": 4.048098087310791, + "learning_rate": 4.174592298204742e-06, + "loss": 0.1199, + "step": 43259 + }, + { + "epoch": 2.1867607162235485, + "grad_norm": 3.6293346881866455, + "learning_rate": 4.174455255584487e-06, + "loss": 0.107, + "step": 43260 + }, + { + "epoch": 2.1867742810634834, + "grad_norm": 5.369421005249023, + "learning_rate": 4.174318212964232e-06, + "loss": 0.206, + "step": 43261 + }, + { + "epoch": 2.1867878459034182, + "grad_norm": 5.264342784881592, + "learning_rate": 4.174181170343977e-06, + "loss": 0.2232, + "step": 43262 + }, + { + "epoch": 2.186801410743353, + "grad_norm": 3.910414934158325, + "learning_rate": 4.174044127723723e-06, + "loss": 0.1514, + "step": 43263 + }, + { + "epoch": 2.186814975583288, + "grad_norm": 3.8357040882110596, + "learning_rate": 4.173907085103468e-06, + "loss": 0.1578, + "step": 43264 + }, + { + "epoch": 2.186828540423223, + "grad_norm": 4.190845966339111, + "learning_rate": 4.173770042483213e-06, + "loss": 0.142, + "step": 43265 + }, + { + "epoch": 2.1868421052631577, + "grad_norm": 3.6018757820129395, + "learning_rate": 4.173632999862957e-06, + "loss": 0.1795, + "step": 43266 + }, + { + "epoch": 2.1868556701030926, + "grad_norm": 5.389034748077393, + "learning_rate": 4.173495957242703e-06, + "loss": 0.2103, + "step": 43267 + }, + { + "epoch": 2.186869234943028, + "grad_norm": 5.355846881866455, + "learning_rate": 4.173358914622448e-06, + "loss": 0.2053, + "step": 43268 + }, + { + "epoch": 2.1868827997829627, + "grad_norm": 5.059501647949219, + "learning_rate": 4.173221872002193e-06, + "loss": 0.1417, + "step": 43269 + }, + { + "epoch": 2.1868963646228976, + "grad_norm": 4.827812194824219, + "learning_rate": 4.173084829381938e-06, + "loss": 0.2778, + "step": 43270 + }, + { + "epoch": 2.1869099294628325, + "grad_norm": 4.820704936981201, + "learning_rate": 4.172947786761683e-06, + "loss": 0.1472, + "step": 43271 + }, + { + "epoch": 2.1869234943027673, + "grad_norm": 4.218754291534424, + "learning_rate": 4.172810744141428e-06, + "loss": 0.1623, + "step": 43272 + }, + { + "epoch": 2.186937059142702, + "grad_norm": 6.10332727432251, + "learning_rate": 4.1726737015211735e-06, + "loss": 0.3073, + "step": 43273 + }, + { + "epoch": 2.186950623982637, + "grad_norm": 4.705130577087402, + "learning_rate": 4.172536658900919e-06, + "loss": 0.1402, + "step": 43274 + }, + { + "epoch": 2.186964188822572, + "grad_norm": 4.938782691955566, + "learning_rate": 4.172399616280664e-06, + "loss": 0.1479, + "step": 43275 + }, + { + "epoch": 2.186977753662507, + "grad_norm": 4.9812188148498535, + "learning_rate": 4.172262573660409e-06, + "loss": 0.2111, + "step": 43276 + }, + { + "epoch": 2.1869913185024417, + "grad_norm": 6.065691947937012, + "learning_rate": 4.172125531040153e-06, + "loss": 0.2393, + "step": 43277 + }, + { + "epoch": 2.1870048833423765, + "grad_norm": 4.5200042724609375, + "learning_rate": 4.1719884884198994e-06, + "loss": 0.2231, + "step": 43278 + }, + { + "epoch": 2.1870184481823114, + "grad_norm": 4.983971118927002, + "learning_rate": 4.171851445799644e-06, + "loss": 0.2327, + "step": 43279 + }, + { + "epoch": 2.1870320130222463, + "grad_norm": 4.823171138763428, + "learning_rate": 4.17171440317939e-06, + "loss": 0.2028, + "step": 43280 + }, + { + "epoch": 2.187045577862181, + "grad_norm": 4.252208232879639, + "learning_rate": 4.171577360559134e-06, + "loss": 0.134, + "step": 43281 + }, + { + "epoch": 2.187059142702116, + "grad_norm": 4.984562873840332, + "learning_rate": 4.171440317938879e-06, + "loss": 0.1796, + "step": 43282 + }, + { + "epoch": 2.187072707542051, + "grad_norm": 4.557436943054199, + "learning_rate": 4.1713032753186245e-06, + "loss": 0.2817, + "step": 43283 + }, + { + "epoch": 2.1870862723819857, + "grad_norm": 4.357544422149658, + "learning_rate": 4.17116623269837e-06, + "loss": 0.1742, + "step": 43284 + }, + { + "epoch": 2.1870998372219206, + "grad_norm": 4.1713128089904785, + "learning_rate": 4.171029190078115e-06, + "loss": 0.1943, + "step": 43285 + }, + { + "epoch": 2.187113402061856, + "grad_norm": 3.8319973945617676, + "learning_rate": 4.170892147457859e-06, + "loss": 0.1365, + "step": 43286 + }, + { + "epoch": 2.1871269669017908, + "grad_norm": 3.898606538772583, + "learning_rate": 4.170755104837605e-06, + "loss": 0.105, + "step": 43287 + }, + { + "epoch": 2.1871405317417256, + "grad_norm": 4.377274990081787, + "learning_rate": 4.1706180622173495e-06, + "loss": 0.1919, + "step": 43288 + }, + { + "epoch": 2.1871540965816605, + "grad_norm": 5.3071675300598145, + "learning_rate": 4.1704810195970955e-06, + "loss": 0.1927, + "step": 43289 + }, + { + "epoch": 2.1871676614215954, + "grad_norm": 6.277828216552734, + "learning_rate": 4.17034397697684e-06, + "loss": 0.303, + "step": 43290 + }, + { + "epoch": 2.18718122626153, + "grad_norm": 5.557199954986572, + "learning_rate": 4.170206934356585e-06, + "loss": 0.3063, + "step": 43291 + }, + { + "epoch": 2.187194791101465, + "grad_norm": 6.404858112335205, + "learning_rate": 4.17006989173633e-06, + "loss": 0.2635, + "step": 43292 + }, + { + "epoch": 2.1872083559414, + "grad_norm": 4.41670560836792, + "learning_rate": 4.169932849116075e-06, + "loss": 0.1682, + "step": 43293 + }, + { + "epoch": 2.187221920781335, + "grad_norm": 4.088221073150635, + "learning_rate": 4.169795806495821e-06, + "loss": 0.1714, + "step": 43294 + }, + { + "epoch": 2.1872354856212697, + "grad_norm": 5.88712739944458, + "learning_rate": 4.169658763875566e-06, + "loss": 0.2046, + "step": 43295 + }, + { + "epoch": 2.1872490504612045, + "grad_norm": 3.627645969390869, + "learning_rate": 4.169521721255311e-06, + "loss": 0.1124, + "step": 43296 + }, + { + "epoch": 2.1872626153011394, + "grad_norm": 4.646523952484131, + "learning_rate": 4.169384678635055e-06, + "loss": 0.1966, + "step": 43297 + }, + { + "epoch": 2.1872761801410743, + "grad_norm": 4.686888694763184, + "learning_rate": 4.169247636014801e-06, + "loss": 0.2196, + "step": 43298 + }, + { + "epoch": 2.187289744981009, + "grad_norm": 4.62481164932251, + "learning_rate": 4.169110593394546e-06, + "loss": 0.2426, + "step": 43299 + }, + { + "epoch": 2.187303309820944, + "grad_norm": 4.6415205001831055, + "learning_rate": 4.168973550774291e-06, + "loss": 0.1877, + "step": 43300 + }, + { + "epoch": 2.187316874660879, + "grad_norm": 3.596822738647461, + "learning_rate": 4.168836508154036e-06, + "loss": 0.0951, + "step": 43301 + }, + { + "epoch": 2.1873304395008137, + "grad_norm": 4.834560871124268, + "learning_rate": 4.168699465533781e-06, + "loss": 0.1685, + "step": 43302 + }, + { + "epoch": 2.1873440043407486, + "grad_norm": 3.655815362930298, + "learning_rate": 4.168562422913526e-06, + "loss": 0.1106, + "step": 43303 + }, + { + "epoch": 2.1873575691806835, + "grad_norm": 4.823155403137207, + "learning_rate": 4.1684253802932715e-06, + "loss": 0.1606, + "step": 43304 + }, + { + "epoch": 2.1873711340206183, + "grad_norm": 4.619109630584717, + "learning_rate": 4.168288337673017e-06, + "loss": 0.1817, + "step": 43305 + }, + { + "epoch": 2.1873846988605536, + "grad_norm": 5.098745346069336, + "learning_rate": 4.168151295052762e-06, + "loss": 0.1379, + "step": 43306 + }, + { + "epoch": 2.1873982637004885, + "grad_norm": 4.302602291107178, + "learning_rate": 4.168014252432507e-06, + "loss": 0.1858, + "step": 43307 + }, + { + "epoch": 2.1874118285404234, + "grad_norm": 7.26900053024292, + "learning_rate": 4.167877209812252e-06, + "loss": 0.1664, + "step": 43308 + }, + { + "epoch": 2.1874253933803582, + "grad_norm": 6.560464859008789, + "learning_rate": 4.1677401671919974e-06, + "loss": 0.2375, + "step": 43309 + }, + { + "epoch": 2.187438958220293, + "grad_norm": 4.218470096588135, + "learning_rate": 4.167603124571742e-06, + "loss": 0.1712, + "step": 43310 + }, + { + "epoch": 2.187452523060228, + "grad_norm": 4.701439380645752, + "learning_rate": 4.167466081951487e-06, + "loss": 0.1707, + "step": 43311 + }, + { + "epoch": 2.187466087900163, + "grad_norm": 4.656521797180176, + "learning_rate": 4.167329039331232e-06, + "loss": 0.175, + "step": 43312 + }, + { + "epoch": 2.1874796527400977, + "grad_norm": 4.315850734710693, + "learning_rate": 4.167191996710977e-06, + "loss": 0.177, + "step": 43313 + }, + { + "epoch": 2.1874932175800326, + "grad_norm": 5.571870803833008, + "learning_rate": 4.1670549540907225e-06, + "loss": 0.1946, + "step": 43314 + }, + { + "epoch": 2.1875067824199674, + "grad_norm": 4.47057580947876, + "learning_rate": 4.166917911470468e-06, + "loss": 0.2019, + "step": 43315 + }, + { + "epoch": 2.1875203472599023, + "grad_norm": 6.168107509613037, + "learning_rate": 4.166780868850213e-06, + "loss": 0.2311, + "step": 43316 + }, + { + "epoch": 2.187533912099837, + "grad_norm": 5.034602642059326, + "learning_rate": 4.166643826229958e-06, + "loss": 0.1869, + "step": 43317 + }, + { + "epoch": 2.187547476939772, + "grad_norm": 3.659817934036255, + "learning_rate": 4.166506783609703e-06, + "loss": 0.1283, + "step": 43318 + }, + { + "epoch": 2.187561041779707, + "grad_norm": 7.0085225105285645, + "learning_rate": 4.166369740989448e-06, + "loss": 0.2557, + "step": 43319 + }, + { + "epoch": 2.1875746066196418, + "grad_norm": 5.842962265014648, + "learning_rate": 4.166232698369193e-06, + "loss": 0.2243, + "step": 43320 + }, + { + "epoch": 2.1875881714595766, + "grad_norm": 4.906863689422607, + "learning_rate": 4.166095655748939e-06, + "loss": 0.1969, + "step": 43321 + }, + { + "epoch": 2.1876017362995115, + "grad_norm": 3.9102232456207275, + "learning_rate": 4.165958613128683e-06, + "loss": 0.131, + "step": 43322 + }, + { + "epoch": 2.1876153011394464, + "grad_norm": 4.560245037078857, + "learning_rate": 4.165821570508429e-06, + "loss": 0.1532, + "step": 43323 + }, + { + "epoch": 2.1876288659793817, + "grad_norm": 5.038930416107178, + "learning_rate": 4.165684527888173e-06, + "loss": 0.1507, + "step": 43324 + }, + { + "epoch": 2.1876424308193165, + "grad_norm": 3.5039470195770264, + "learning_rate": 4.165547485267919e-06, + "loss": 0.157, + "step": 43325 + }, + { + "epoch": 2.1876559956592514, + "grad_norm": 5.050662517547607, + "learning_rate": 4.165410442647664e-06, + "loss": 0.154, + "step": 43326 + }, + { + "epoch": 2.1876695604991863, + "grad_norm": 3.8724756240844727, + "learning_rate": 4.165273400027409e-06, + "loss": 0.1467, + "step": 43327 + }, + { + "epoch": 2.187683125339121, + "grad_norm": 3.3115217685699463, + "learning_rate": 4.165136357407154e-06, + "loss": 0.1417, + "step": 43328 + }, + { + "epoch": 2.187696690179056, + "grad_norm": 5.820950031280518, + "learning_rate": 4.164999314786899e-06, + "loss": 0.221, + "step": 43329 + }, + { + "epoch": 2.187710255018991, + "grad_norm": 5.352038860321045, + "learning_rate": 4.1648622721666445e-06, + "loss": 0.21, + "step": 43330 + }, + { + "epoch": 2.1877238198589257, + "grad_norm": 2.8375282287597656, + "learning_rate": 4.164725229546389e-06, + "loss": 0.1181, + "step": 43331 + }, + { + "epoch": 2.1877373846988606, + "grad_norm": 4.45867395401001, + "learning_rate": 4.164588186926135e-06, + "loss": 0.1864, + "step": 43332 + }, + { + "epoch": 2.1877509495387955, + "grad_norm": 5.460636138916016, + "learning_rate": 4.164451144305879e-06, + "loss": 0.1772, + "step": 43333 + }, + { + "epoch": 2.1877645143787303, + "grad_norm": 3.5968880653381348, + "learning_rate": 4.164314101685624e-06, + "loss": 0.1281, + "step": 43334 + }, + { + "epoch": 2.187778079218665, + "grad_norm": 4.079780101776123, + "learning_rate": 4.1641770590653695e-06, + "loss": 0.1522, + "step": 43335 + }, + { + "epoch": 2.1877916440586, + "grad_norm": 3.5528054237365723, + "learning_rate": 4.164040016445115e-06, + "loss": 0.1017, + "step": 43336 + }, + { + "epoch": 2.187805208898535, + "grad_norm": 3.871105432510376, + "learning_rate": 4.16390297382486e-06, + "loss": 0.1025, + "step": 43337 + }, + { + "epoch": 2.18781877373847, + "grad_norm": 3.4883534908294678, + "learning_rate": 4.163765931204605e-06, + "loss": 0.095, + "step": 43338 + }, + { + "epoch": 2.1878323385784046, + "grad_norm": 3.1611011028289795, + "learning_rate": 4.16362888858435e-06, + "loss": 0.0992, + "step": 43339 + }, + { + "epoch": 2.1878459034183395, + "grad_norm": 2.754507064819336, + "learning_rate": 4.163491845964095e-06, + "loss": 0.0865, + "step": 43340 + }, + { + "epoch": 2.1878594682582744, + "grad_norm": 4.03550910949707, + "learning_rate": 4.163354803343841e-06, + "loss": 0.1037, + "step": 43341 + }, + { + "epoch": 2.1878730330982092, + "grad_norm": 4.182322978973389, + "learning_rate": 4.163217760723585e-06, + "loss": 0.1313, + "step": 43342 + }, + { + "epoch": 2.187886597938144, + "grad_norm": 5.3881425857543945, + "learning_rate": 4.163080718103331e-06, + "loss": 0.1775, + "step": 43343 + }, + { + "epoch": 2.1879001627780794, + "grad_norm": 2.952679395675659, + "learning_rate": 4.162943675483075e-06, + "loss": 0.0556, + "step": 43344 + }, + { + "epoch": 2.1879137276180143, + "grad_norm": 3.070632219314575, + "learning_rate": 4.1628066328628205e-06, + "loss": 0.1186, + "step": 43345 + }, + { + "epoch": 2.187927292457949, + "grad_norm": 6.479413986206055, + "learning_rate": 4.162669590242566e-06, + "loss": 0.2104, + "step": 43346 + }, + { + "epoch": 2.187940857297884, + "grad_norm": 2.8576831817626953, + "learning_rate": 4.162532547622311e-06, + "loss": 0.1029, + "step": 43347 + }, + { + "epoch": 2.187954422137819, + "grad_norm": 3.721982955932617, + "learning_rate": 4.162395505002056e-06, + "loss": 0.1254, + "step": 43348 + }, + { + "epoch": 2.1879679869777537, + "grad_norm": 6.318790912628174, + "learning_rate": 4.162258462381801e-06, + "loss": 0.1148, + "step": 43349 + }, + { + "epoch": 2.1879815518176886, + "grad_norm": 4.137375831604004, + "learning_rate": 4.162121419761546e-06, + "loss": 0.1382, + "step": 43350 + }, + { + "epoch": 2.1879951166576235, + "grad_norm": 3.179616928100586, + "learning_rate": 4.1619843771412916e-06, + "loss": 0.0862, + "step": 43351 + }, + { + "epoch": 2.1880086814975583, + "grad_norm": 4.197441101074219, + "learning_rate": 4.161847334521037e-06, + "loss": 0.1619, + "step": 43352 + }, + { + "epoch": 2.188022246337493, + "grad_norm": 3.641619920730591, + "learning_rate": 4.161710291900781e-06, + "loss": 0.0991, + "step": 43353 + }, + { + "epoch": 2.188035811177428, + "grad_norm": 4.5086236000061035, + "learning_rate": 4.161573249280526e-06, + "loss": 0.1509, + "step": 43354 + }, + { + "epoch": 2.188049376017363, + "grad_norm": 6.630495548248291, + "learning_rate": 4.161436206660271e-06, + "loss": 0.3337, + "step": 43355 + }, + { + "epoch": 2.188062940857298, + "grad_norm": 5.307209014892578, + "learning_rate": 4.161299164040017e-06, + "loss": 0.1508, + "step": 43356 + }, + { + "epoch": 2.1880765056972327, + "grad_norm": 4.501538276672363, + "learning_rate": 4.161162121419762e-06, + "loss": 0.214, + "step": 43357 + }, + { + "epoch": 2.1880900705371675, + "grad_norm": 3.6964409351348877, + "learning_rate": 4.161025078799507e-06, + "loss": 0.2007, + "step": 43358 + }, + { + "epoch": 2.1881036353771024, + "grad_norm": 4.9034318923950195, + "learning_rate": 4.160888036179252e-06, + "loss": 0.1596, + "step": 43359 + }, + { + "epoch": 2.1881172002170373, + "grad_norm": 5.063299179077148, + "learning_rate": 4.160750993558997e-06, + "loss": 0.216, + "step": 43360 + }, + { + "epoch": 2.188130765056972, + "grad_norm": 4.53011417388916, + "learning_rate": 4.1606139509387425e-06, + "loss": 0.1848, + "step": 43361 + }, + { + "epoch": 2.1881443298969074, + "grad_norm": 5.186811447143555, + "learning_rate": 4.160476908318488e-06, + "loss": 0.2689, + "step": 43362 + }, + { + "epoch": 2.1881578947368423, + "grad_norm": 3.9811370372772217, + "learning_rate": 4.160339865698233e-06, + "loss": 0.1595, + "step": 43363 + }, + { + "epoch": 2.188171459576777, + "grad_norm": 4.5023674964904785, + "learning_rate": 4.160202823077978e-06, + "loss": 0.2111, + "step": 43364 + }, + { + "epoch": 2.188185024416712, + "grad_norm": 3.9892776012420654, + "learning_rate": 4.160065780457722e-06, + "loss": 0.1869, + "step": 43365 + }, + { + "epoch": 2.188198589256647, + "grad_norm": 4.049119472503662, + "learning_rate": 4.1599287378374675e-06, + "loss": 0.1019, + "step": 43366 + }, + { + "epoch": 2.1882121540965818, + "grad_norm": 3.2562131881713867, + "learning_rate": 4.159791695217213e-06, + "loss": 0.0969, + "step": 43367 + }, + { + "epoch": 2.1882257189365166, + "grad_norm": 3.4750890731811523, + "learning_rate": 4.159654652596958e-06, + "loss": 0.0981, + "step": 43368 + }, + { + "epoch": 2.1882392837764515, + "grad_norm": 4.547861099243164, + "learning_rate": 4.159517609976703e-06, + "loss": 0.1517, + "step": 43369 + }, + { + "epoch": 2.1882528486163864, + "grad_norm": 4.063167095184326, + "learning_rate": 4.159380567356448e-06, + "loss": 0.1329, + "step": 43370 + }, + { + "epoch": 2.1882664134563212, + "grad_norm": 4.727035999298096, + "learning_rate": 4.1592435247361934e-06, + "loss": 0.1481, + "step": 43371 + }, + { + "epoch": 2.188279978296256, + "grad_norm": 3.9870238304138184, + "learning_rate": 4.159106482115939e-06, + "loss": 0.1952, + "step": 43372 + }, + { + "epoch": 2.188293543136191, + "grad_norm": 3.9838037490844727, + "learning_rate": 4.158969439495684e-06, + "loss": 0.1598, + "step": 43373 + }, + { + "epoch": 2.188307107976126, + "grad_norm": 4.2743659019470215, + "learning_rate": 4.158832396875428e-06, + "loss": 0.1556, + "step": 43374 + }, + { + "epoch": 2.1883206728160607, + "grad_norm": 3.9154818058013916, + "learning_rate": 4.158695354255174e-06, + "loss": 0.1151, + "step": 43375 + }, + { + "epoch": 2.1883342376559956, + "grad_norm": 6.130300998687744, + "learning_rate": 4.1585583116349185e-06, + "loss": 0.1561, + "step": 43376 + }, + { + "epoch": 2.1883478024959304, + "grad_norm": 5.480161666870117, + "learning_rate": 4.1584212690146645e-06, + "loss": 0.2611, + "step": 43377 + }, + { + "epoch": 2.1883613673358653, + "grad_norm": 5.215070724487305, + "learning_rate": 4.158284226394409e-06, + "loss": 0.3011, + "step": 43378 + }, + { + "epoch": 2.1883749321758, + "grad_norm": 3.8353943824768066, + "learning_rate": 4.158147183774154e-06, + "loss": 0.1155, + "step": 43379 + }, + { + "epoch": 2.188388497015735, + "grad_norm": 5.508289813995361, + "learning_rate": 4.158010141153899e-06, + "loss": 0.2567, + "step": 43380 + }, + { + "epoch": 2.18840206185567, + "grad_norm": 3.794447183609009, + "learning_rate": 4.157873098533644e-06, + "loss": 0.173, + "step": 43381 + }, + { + "epoch": 2.188415626695605, + "grad_norm": 3.3605053424835205, + "learning_rate": 4.1577360559133896e-06, + "loss": 0.1648, + "step": 43382 + }, + { + "epoch": 2.18842919153554, + "grad_norm": 3.079331159591675, + "learning_rate": 4.157599013293134e-06, + "loss": 0.1099, + "step": 43383 + }, + { + "epoch": 2.188442756375475, + "grad_norm": 4.1611809730529785, + "learning_rate": 4.15746197067288e-06, + "loss": 0.1449, + "step": 43384 + }, + { + "epoch": 2.18845632121541, + "grad_norm": 2.586207389831543, + "learning_rate": 4.157324928052624e-06, + "loss": 0.1207, + "step": 43385 + }, + { + "epoch": 2.1884698860553446, + "grad_norm": 4.460929870605469, + "learning_rate": 4.15718788543237e-06, + "loss": 0.1253, + "step": 43386 + }, + { + "epoch": 2.1884834508952795, + "grad_norm": 5.750865936279297, + "learning_rate": 4.157050842812115e-06, + "loss": 0.2017, + "step": 43387 + }, + { + "epoch": 2.1884970157352144, + "grad_norm": 5.064373970031738, + "learning_rate": 4.15691380019186e-06, + "loss": 0.1501, + "step": 43388 + }, + { + "epoch": 2.1885105805751492, + "grad_norm": 5.696113109588623, + "learning_rate": 4.156776757571605e-06, + "loss": 0.1928, + "step": 43389 + }, + { + "epoch": 2.188524145415084, + "grad_norm": 3.3847975730895996, + "learning_rate": 4.15663971495135e-06, + "loss": 0.1209, + "step": 43390 + }, + { + "epoch": 2.188537710255019, + "grad_norm": 5.250067234039307, + "learning_rate": 4.156502672331095e-06, + "loss": 0.1963, + "step": 43391 + }, + { + "epoch": 2.188551275094954, + "grad_norm": 6.226656436920166, + "learning_rate": 4.1563656297108405e-06, + "loss": 0.2652, + "step": 43392 + }, + { + "epoch": 2.1885648399348887, + "grad_norm": 4.237506866455078, + "learning_rate": 4.156228587090586e-06, + "loss": 0.1653, + "step": 43393 + }, + { + "epoch": 2.1885784047748236, + "grad_norm": 3.6827821731567383, + "learning_rate": 4.15609154447033e-06, + "loss": 0.1991, + "step": 43394 + }, + { + "epoch": 2.1885919696147584, + "grad_norm": 3.5876288414001465, + "learning_rate": 4.155954501850076e-06, + "loss": 0.1482, + "step": 43395 + }, + { + "epoch": 2.1886055344546933, + "grad_norm": 4.727060794830322, + "learning_rate": 4.15581745922982e-06, + "loss": 0.2531, + "step": 43396 + }, + { + "epoch": 2.188619099294628, + "grad_norm": 3.5812456607818604, + "learning_rate": 4.155680416609566e-06, + "loss": 0.1076, + "step": 43397 + }, + { + "epoch": 2.188632664134563, + "grad_norm": 6.89881706237793, + "learning_rate": 4.155543373989311e-06, + "loss": 0.1785, + "step": 43398 + }, + { + "epoch": 2.188646228974498, + "grad_norm": 3.2309176921844482, + "learning_rate": 4.155406331369056e-06, + "loss": 0.1925, + "step": 43399 + }, + { + "epoch": 2.188659793814433, + "grad_norm": 5.3177924156188965, + "learning_rate": 4.155269288748801e-06, + "loss": 0.1447, + "step": 43400 + }, + { + "epoch": 2.188673358654368, + "grad_norm": 4.340844631195068, + "learning_rate": 4.155132246128546e-06, + "loss": 0.2239, + "step": 43401 + }, + { + "epoch": 2.188686923494303, + "grad_norm": 3.53999400138855, + "learning_rate": 4.1549952035082914e-06, + "loss": 0.1424, + "step": 43402 + }, + { + "epoch": 2.188700488334238, + "grad_norm": 4.41759729385376, + "learning_rate": 4.154858160888037e-06, + "loss": 0.1381, + "step": 43403 + }, + { + "epoch": 2.1887140531741727, + "grad_norm": 6.961092948913574, + "learning_rate": 4.154721118267782e-06, + "loss": 0.2063, + "step": 43404 + }, + { + "epoch": 2.1887276180141075, + "grad_norm": 4.7321953773498535, + "learning_rate": 4.154584075647527e-06, + "loss": 0.1419, + "step": 43405 + }, + { + "epoch": 2.1887411828540424, + "grad_norm": 3.5351665019989014, + "learning_rate": 4.154447033027272e-06, + "loss": 0.1563, + "step": 43406 + }, + { + "epoch": 2.1887547476939773, + "grad_norm": 4.9556379318237305, + "learning_rate": 4.1543099904070165e-06, + "loss": 0.2428, + "step": 43407 + }, + { + "epoch": 2.188768312533912, + "grad_norm": 3.857583999633789, + "learning_rate": 4.154172947786762e-06, + "loss": 0.1457, + "step": 43408 + }, + { + "epoch": 2.188781877373847, + "grad_norm": 3.8385565280914307, + "learning_rate": 4.154035905166507e-06, + "loss": 0.1066, + "step": 43409 + }, + { + "epoch": 2.188795442213782, + "grad_norm": 5.341565132141113, + "learning_rate": 4.153898862546252e-06, + "loss": 0.1319, + "step": 43410 + }, + { + "epoch": 2.1888090070537167, + "grad_norm": 4.46786642074585, + "learning_rate": 4.153761819925997e-06, + "loss": 0.1166, + "step": 43411 + }, + { + "epoch": 2.1888225718936516, + "grad_norm": 4.040576457977295, + "learning_rate": 4.153624777305742e-06, + "loss": 0.2364, + "step": 43412 + }, + { + "epoch": 2.1888361367335865, + "grad_norm": 5.820670127868652, + "learning_rate": 4.1534877346854876e-06, + "loss": 0.2823, + "step": 43413 + }, + { + "epoch": 2.1888497015735213, + "grad_norm": 4.3035054206848145, + "learning_rate": 4.153350692065233e-06, + "loss": 0.1985, + "step": 43414 + }, + { + "epoch": 2.188863266413456, + "grad_norm": 5.010838985443115, + "learning_rate": 4.153213649444978e-06, + "loss": 0.18, + "step": 43415 + }, + { + "epoch": 2.188876831253391, + "grad_norm": 4.341120719909668, + "learning_rate": 4.153076606824723e-06, + "loss": 0.1692, + "step": 43416 + }, + { + "epoch": 2.188890396093326, + "grad_norm": 4.356433391571045, + "learning_rate": 4.152939564204468e-06, + "loss": 0.1005, + "step": 43417 + }, + { + "epoch": 2.188903960933261, + "grad_norm": 3.56786847114563, + "learning_rate": 4.1528025215842135e-06, + "loss": 0.1132, + "step": 43418 + }, + { + "epoch": 2.1889175257731956, + "grad_norm": 4.658616065979004, + "learning_rate": 4.152665478963958e-06, + "loss": 0.1988, + "step": 43419 + }, + { + "epoch": 2.188931090613131, + "grad_norm": 3.894411087036133, + "learning_rate": 4.152528436343704e-06, + "loss": 0.1441, + "step": 43420 + }, + { + "epoch": 2.188944655453066, + "grad_norm": 4.611844539642334, + "learning_rate": 4.152391393723448e-06, + "loss": 0.189, + "step": 43421 + }, + { + "epoch": 2.1889582202930007, + "grad_norm": 4.325307369232178, + "learning_rate": 4.152254351103193e-06, + "loss": 0.1415, + "step": 43422 + }, + { + "epoch": 2.1889717851329356, + "grad_norm": 5.253220558166504, + "learning_rate": 4.1521173084829385e-06, + "loss": 0.2109, + "step": 43423 + }, + { + "epoch": 2.1889853499728704, + "grad_norm": 5.447671413421631, + "learning_rate": 4.151980265862684e-06, + "loss": 0.1567, + "step": 43424 + }, + { + "epoch": 2.1889989148128053, + "grad_norm": 4.063007354736328, + "learning_rate": 4.151843223242429e-06, + "loss": 0.1706, + "step": 43425 + }, + { + "epoch": 2.18901247965274, + "grad_norm": 4.0058465003967285, + "learning_rate": 4.151706180622174e-06, + "loss": 0.1537, + "step": 43426 + }, + { + "epoch": 2.189026044492675, + "grad_norm": 3.798940896987915, + "learning_rate": 4.151569138001919e-06, + "loss": 0.1461, + "step": 43427 + }, + { + "epoch": 2.18903960933261, + "grad_norm": 5.177863597869873, + "learning_rate": 4.1514320953816636e-06, + "loss": 0.2723, + "step": 43428 + }, + { + "epoch": 2.1890531741725447, + "grad_norm": 4.111913204193115, + "learning_rate": 4.1512950527614096e-06, + "loss": 0.1701, + "step": 43429 + }, + { + "epoch": 2.1890667390124796, + "grad_norm": 4.258622646331787, + "learning_rate": 4.151158010141154e-06, + "loss": 0.152, + "step": 43430 + }, + { + "epoch": 2.1890803038524145, + "grad_norm": 4.516399383544922, + "learning_rate": 4.1510209675209e-06, + "loss": 0.1258, + "step": 43431 + }, + { + "epoch": 2.1890938686923493, + "grad_norm": 3.847569227218628, + "learning_rate": 4.150883924900644e-06, + "loss": 0.142, + "step": 43432 + }, + { + "epoch": 2.189107433532284, + "grad_norm": 3.3646609783172607, + "learning_rate": 4.1507468822803894e-06, + "loss": 0.0779, + "step": 43433 + }, + { + "epoch": 2.189120998372219, + "grad_norm": 7.9387102127075195, + "learning_rate": 4.150609839660135e-06, + "loss": 0.2888, + "step": 43434 + }, + { + "epoch": 2.189134563212154, + "grad_norm": 6.62366247177124, + "learning_rate": 4.15047279703988e-06, + "loss": 0.2543, + "step": 43435 + }, + { + "epoch": 2.189148128052089, + "grad_norm": 4.787466049194336, + "learning_rate": 4.150335754419625e-06, + "loss": 0.1826, + "step": 43436 + }, + { + "epoch": 2.1891616928920237, + "grad_norm": 3.334024667739868, + "learning_rate": 4.150198711799369e-06, + "loss": 0.1508, + "step": 43437 + }, + { + "epoch": 2.189175257731959, + "grad_norm": 4.718074321746826, + "learning_rate": 4.150061669179115e-06, + "loss": 0.2229, + "step": 43438 + }, + { + "epoch": 2.189188822571894, + "grad_norm": 3.791654348373413, + "learning_rate": 4.14992462655886e-06, + "loss": 0.1147, + "step": 43439 + }, + { + "epoch": 2.1892023874118287, + "grad_norm": 4.392121315002441, + "learning_rate": 4.149787583938606e-06, + "loss": 0.1829, + "step": 43440 + }, + { + "epoch": 2.1892159522517636, + "grad_norm": 4.650904178619385, + "learning_rate": 4.14965054131835e-06, + "loss": 0.2184, + "step": 43441 + }, + { + "epoch": 2.1892295170916984, + "grad_norm": 6.146583557128906, + "learning_rate": 4.149513498698095e-06, + "loss": 0.264, + "step": 43442 + }, + { + "epoch": 2.1892430819316333, + "grad_norm": 5.305462837219238, + "learning_rate": 4.14937645607784e-06, + "loss": 0.2097, + "step": 43443 + }, + { + "epoch": 2.189256646771568, + "grad_norm": 5.464729309082031, + "learning_rate": 4.1492394134575856e-06, + "loss": 0.1243, + "step": 43444 + }, + { + "epoch": 2.189270211611503, + "grad_norm": 5.338711738586426, + "learning_rate": 4.149102370837331e-06, + "loss": 0.2107, + "step": 43445 + }, + { + "epoch": 2.189283776451438, + "grad_norm": 5.831368446350098, + "learning_rate": 4.148965328217076e-06, + "loss": 0.3083, + "step": 43446 + }, + { + "epoch": 2.1892973412913728, + "grad_norm": 4.204567909240723, + "learning_rate": 4.148828285596821e-06, + "loss": 0.1835, + "step": 43447 + }, + { + "epoch": 2.1893109061313076, + "grad_norm": 4.56123685836792, + "learning_rate": 4.1486912429765654e-06, + "loss": 0.1963, + "step": 43448 + }, + { + "epoch": 2.1893244709712425, + "grad_norm": 4.447992324829102, + "learning_rate": 4.1485542003563115e-06, + "loss": 0.242, + "step": 43449 + }, + { + "epoch": 2.1893380358111774, + "grad_norm": 4.950690746307373, + "learning_rate": 4.148417157736056e-06, + "loss": 0.1813, + "step": 43450 + }, + { + "epoch": 2.1893516006511122, + "grad_norm": 4.713260650634766, + "learning_rate": 4.148280115115802e-06, + "loss": 0.177, + "step": 43451 + }, + { + "epoch": 2.189365165491047, + "grad_norm": 4.942876815795898, + "learning_rate": 4.148143072495546e-06, + "loss": 0.2185, + "step": 43452 + }, + { + "epoch": 2.189378730330982, + "grad_norm": 4.181728363037109, + "learning_rate": 4.148006029875291e-06, + "loss": 0.158, + "step": 43453 + }, + { + "epoch": 2.189392295170917, + "grad_norm": 7.89845085144043, + "learning_rate": 4.1478689872550365e-06, + "loss": 0.3946, + "step": 43454 + }, + { + "epoch": 2.1894058600108517, + "grad_norm": 4.363387107849121, + "learning_rate": 4.147731944634782e-06, + "loss": 0.1545, + "step": 43455 + }, + { + "epoch": 2.1894194248507866, + "grad_norm": 5.573412895202637, + "learning_rate": 4.147594902014527e-06, + "loss": 0.1901, + "step": 43456 + }, + { + "epoch": 2.1894329896907214, + "grad_norm": 6.076234340667725, + "learning_rate": 4.147457859394272e-06, + "loss": 0.2915, + "step": 43457 + }, + { + "epoch": 2.1894465545306567, + "grad_norm": 6.282510757446289, + "learning_rate": 4.147320816774017e-06, + "loss": 0.2605, + "step": 43458 + }, + { + "epoch": 2.1894601193705916, + "grad_norm": 4.373856544494629, + "learning_rate": 4.147183774153762e-06, + "loss": 0.0973, + "step": 43459 + }, + { + "epoch": 2.1894736842105265, + "grad_norm": 5.3898024559021, + "learning_rate": 4.147046731533508e-06, + "loss": 0.2536, + "step": 43460 + }, + { + "epoch": 2.1894872490504613, + "grad_norm": 5.97531795501709, + "learning_rate": 4.146909688913253e-06, + "loss": 0.2165, + "step": 43461 + }, + { + "epoch": 2.189500813890396, + "grad_norm": 4.136640548706055, + "learning_rate": 4.146772646292997e-06, + "loss": 0.14, + "step": 43462 + }, + { + "epoch": 2.189514378730331, + "grad_norm": 5.4226837158203125, + "learning_rate": 4.146635603672742e-06, + "loss": 0.1643, + "step": 43463 + }, + { + "epoch": 2.189527943570266, + "grad_norm": 3.992480516433716, + "learning_rate": 4.1464985610524874e-06, + "loss": 0.2201, + "step": 43464 + }, + { + "epoch": 2.189541508410201, + "grad_norm": 6.074661731719971, + "learning_rate": 4.146361518432233e-06, + "loss": 0.229, + "step": 43465 + }, + { + "epoch": 2.1895550732501357, + "grad_norm": 7.062534332275391, + "learning_rate": 4.146224475811978e-06, + "loss": 0.3027, + "step": 43466 + }, + { + "epoch": 2.1895686380900705, + "grad_norm": 4.869495868682861, + "learning_rate": 4.146087433191723e-06, + "loss": 0.2297, + "step": 43467 + }, + { + "epoch": 2.1895822029300054, + "grad_norm": 5.4720072746276855, + "learning_rate": 4.145950390571468e-06, + "loss": 0.209, + "step": 43468 + }, + { + "epoch": 2.1895957677699402, + "grad_norm": 4.233778476715088, + "learning_rate": 4.145813347951213e-06, + "loss": 0.1406, + "step": 43469 + }, + { + "epoch": 2.189609332609875, + "grad_norm": 6.503030300140381, + "learning_rate": 4.1456763053309585e-06, + "loss": 0.2778, + "step": 43470 + }, + { + "epoch": 2.18962289744981, + "grad_norm": 6.707008361816406, + "learning_rate": 4.145539262710704e-06, + "loss": 0.2951, + "step": 43471 + }, + { + "epoch": 2.189636462289745, + "grad_norm": 5.144767761230469, + "learning_rate": 4.145402220090449e-06, + "loss": 0.267, + "step": 43472 + }, + { + "epoch": 2.1896500271296797, + "grad_norm": 6.411404609680176, + "learning_rate": 4.145265177470193e-06, + "loss": 0.3014, + "step": 43473 + }, + { + "epoch": 2.1896635919696146, + "grad_norm": 4.916797637939453, + "learning_rate": 4.145128134849939e-06, + "loss": 0.2314, + "step": 43474 + }, + { + "epoch": 2.1896771568095494, + "grad_norm": 4.8943400382995605, + "learning_rate": 4.1449910922296836e-06, + "loss": 0.2052, + "step": 43475 + }, + { + "epoch": 2.1896907216494848, + "grad_norm": 4.905750274658203, + "learning_rate": 4.144854049609429e-06, + "loss": 0.1643, + "step": 43476 + }, + { + "epoch": 2.1897042864894196, + "grad_norm": 6.794753074645996, + "learning_rate": 4.144717006989174e-06, + "loss": 0.2518, + "step": 43477 + }, + { + "epoch": 2.1897178513293545, + "grad_norm": 3.580951690673828, + "learning_rate": 4.144579964368919e-06, + "loss": 0.1434, + "step": 43478 + }, + { + "epoch": 2.1897314161692893, + "grad_norm": 4.2581610679626465, + "learning_rate": 4.144442921748664e-06, + "loss": 0.2586, + "step": 43479 + }, + { + "epoch": 2.189744981009224, + "grad_norm": 5.840629577636719, + "learning_rate": 4.1443058791284095e-06, + "loss": 0.1848, + "step": 43480 + }, + { + "epoch": 2.189758545849159, + "grad_norm": 6.864927291870117, + "learning_rate": 4.144168836508155e-06, + "loss": 0.3706, + "step": 43481 + }, + { + "epoch": 2.189772110689094, + "grad_norm": 5.317248344421387, + "learning_rate": 4.144031793887899e-06, + "loss": 0.2425, + "step": 43482 + }, + { + "epoch": 2.189785675529029, + "grad_norm": 9.809683799743652, + "learning_rate": 4.143894751267645e-06, + "loss": 0.3395, + "step": 43483 + }, + { + "epoch": 2.1897992403689637, + "grad_norm": 5.2931227684021, + "learning_rate": 4.143757708647389e-06, + "loss": 0.2028, + "step": 43484 + }, + { + "epoch": 2.1898128052088985, + "grad_norm": 5.887368202209473, + "learning_rate": 4.143620666027135e-06, + "loss": 0.2323, + "step": 43485 + }, + { + "epoch": 2.1898263700488334, + "grad_norm": 4.30416202545166, + "learning_rate": 4.14348362340688e-06, + "loss": 0.1612, + "step": 43486 + }, + { + "epoch": 2.1898399348887683, + "grad_norm": 5.373995304107666, + "learning_rate": 4.143346580786625e-06, + "loss": 0.124, + "step": 43487 + }, + { + "epoch": 2.189853499728703, + "grad_norm": 7.117932319641113, + "learning_rate": 4.14320953816637e-06, + "loss": 0.2922, + "step": 43488 + }, + { + "epoch": 2.189867064568638, + "grad_norm": 4.77265739440918, + "learning_rate": 4.143072495546115e-06, + "loss": 0.1824, + "step": 43489 + }, + { + "epoch": 2.189880629408573, + "grad_norm": 6.768529415130615, + "learning_rate": 4.14293545292586e-06, + "loss": 0.2218, + "step": 43490 + }, + { + "epoch": 2.1898941942485077, + "grad_norm": 4.663319110870361, + "learning_rate": 4.142798410305605e-06, + "loss": 0.1774, + "step": 43491 + }, + { + "epoch": 2.1899077590884426, + "grad_norm": 4.601691722869873, + "learning_rate": 4.142661367685351e-06, + "loss": 0.173, + "step": 43492 + }, + { + "epoch": 2.1899213239283775, + "grad_norm": 4.436800003051758, + "learning_rate": 4.142524325065095e-06, + "loss": 0.2038, + "step": 43493 + }, + { + "epoch": 2.1899348887683123, + "grad_norm": 3.141369581222534, + "learning_rate": 4.142387282444841e-06, + "loss": 0.0568, + "step": 43494 + }, + { + "epoch": 2.189948453608247, + "grad_norm": 4.9169697761535645, + "learning_rate": 4.1422502398245855e-06, + "loss": 0.247, + "step": 43495 + }, + { + "epoch": 2.1899620184481825, + "grad_norm": 3.634465456008911, + "learning_rate": 4.142113197204331e-06, + "loss": 0.1142, + "step": 43496 + }, + { + "epoch": 2.1899755832881174, + "grad_norm": 5.1424241065979, + "learning_rate": 4.141976154584076e-06, + "loss": 0.2296, + "step": 43497 + }, + { + "epoch": 2.1899891481280522, + "grad_norm": 5.642653465270996, + "learning_rate": 4.141839111963821e-06, + "loss": 0.1477, + "step": 43498 + }, + { + "epoch": 2.190002712967987, + "grad_norm": 5.683827877044678, + "learning_rate": 4.141702069343566e-06, + "loss": 0.214, + "step": 43499 + }, + { + "epoch": 2.190016277807922, + "grad_norm": 5.220122337341309, + "learning_rate": 4.141565026723311e-06, + "loss": 0.1908, + "step": 43500 + }, + { + "epoch": 2.190029842647857, + "grad_norm": 5.087393283843994, + "learning_rate": 4.1414279841030565e-06, + "loss": 0.1896, + "step": 43501 + }, + { + "epoch": 2.1900434074877917, + "grad_norm": 5.1353535652160645, + "learning_rate": 4.141290941482802e-06, + "loss": 0.19, + "step": 43502 + }, + { + "epoch": 2.1900569723277266, + "grad_norm": 5.112884044647217, + "learning_rate": 4.141153898862547e-06, + "loss": 0.1461, + "step": 43503 + }, + { + "epoch": 2.1900705371676614, + "grad_norm": 5.531449794769287, + "learning_rate": 4.141016856242291e-06, + "loss": 0.1747, + "step": 43504 + }, + { + "epoch": 2.1900841020075963, + "grad_norm": 4.799428462982178, + "learning_rate": 4.140879813622037e-06, + "loss": 0.1744, + "step": 43505 + }, + { + "epoch": 2.190097666847531, + "grad_norm": 4.023804187774658, + "learning_rate": 4.1407427710017816e-06, + "loss": 0.1446, + "step": 43506 + }, + { + "epoch": 2.190111231687466, + "grad_norm": 7.553983211517334, + "learning_rate": 4.140605728381527e-06, + "loss": 0.2975, + "step": 43507 + }, + { + "epoch": 2.190124796527401, + "grad_norm": 6.235450744628906, + "learning_rate": 4.140468685761272e-06, + "loss": 0.1603, + "step": 43508 + }, + { + "epoch": 2.1901383613673358, + "grad_norm": 4.832520484924316, + "learning_rate": 4.140331643141017e-06, + "loss": 0.1915, + "step": 43509 + }, + { + "epoch": 2.1901519262072706, + "grad_norm": 5.216285705566406, + "learning_rate": 4.140194600520762e-06, + "loss": 0.1877, + "step": 43510 + }, + { + "epoch": 2.1901654910472055, + "grad_norm": 5.7990593910217285, + "learning_rate": 4.1400575579005075e-06, + "loss": 0.174, + "step": 43511 + }, + { + "epoch": 2.1901790558871403, + "grad_norm": 4.706104755401611, + "learning_rate": 4.139920515280253e-06, + "loss": 0.185, + "step": 43512 + }, + { + "epoch": 2.1901926207270757, + "grad_norm": 4.317733287811279, + "learning_rate": 4.139783472659998e-06, + "loss": 0.1289, + "step": 43513 + }, + { + "epoch": 2.1902061855670105, + "grad_norm": 5.085222244262695, + "learning_rate": 4.139646430039743e-06, + "loss": 0.1731, + "step": 43514 + }, + { + "epoch": 2.1902197504069454, + "grad_norm": 4.834609031677246, + "learning_rate": 4.139509387419488e-06, + "loss": 0.1832, + "step": 43515 + }, + { + "epoch": 2.1902333152468803, + "grad_norm": 3.6523444652557373, + "learning_rate": 4.1393723447992325e-06, + "loss": 0.1589, + "step": 43516 + }, + { + "epoch": 2.190246880086815, + "grad_norm": 4.014127731323242, + "learning_rate": 4.139235302178978e-06, + "loss": 0.1304, + "step": 43517 + }, + { + "epoch": 2.19026044492675, + "grad_norm": 7.435295104980469, + "learning_rate": 4.139098259558723e-06, + "loss": 0.3027, + "step": 43518 + }, + { + "epoch": 2.190274009766685, + "grad_norm": 4.4272637367248535, + "learning_rate": 4.138961216938468e-06, + "loss": 0.1413, + "step": 43519 + }, + { + "epoch": 2.1902875746066197, + "grad_norm": 4.218323230743408, + "learning_rate": 4.138824174318213e-06, + "loss": 0.117, + "step": 43520 + }, + { + "epoch": 2.1903011394465546, + "grad_norm": 4.354306697845459, + "learning_rate": 4.138687131697958e-06, + "loss": 0.1722, + "step": 43521 + }, + { + "epoch": 2.1903147042864894, + "grad_norm": 5.764760971069336, + "learning_rate": 4.138550089077704e-06, + "loss": 0.1551, + "step": 43522 + }, + { + "epoch": 2.1903282691264243, + "grad_norm": 4.864695072174072, + "learning_rate": 4.138413046457449e-06, + "loss": 0.1763, + "step": 43523 + }, + { + "epoch": 2.190341833966359, + "grad_norm": 2.7111916542053223, + "learning_rate": 4.138276003837194e-06, + "loss": 0.1172, + "step": 43524 + }, + { + "epoch": 2.190355398806294, + "grad_norm": 5.854109287261963, + "learning_rate": 4.138138961216938e-06, + "loss": 0.2445, + "step": 43525 + }, + { + "epoch": 2.190368963646229, + "grad_norm": 4.6715407371521, + "learning_rate": 4.138001918596684e-06, + "loss": 0.1977, + "step": 43526 + }, + { + "epoch": 2.1903825284861638, + "grad_norm": 3.884158134460449, + "learning_rate": 4.137864875976429e-06, + "loss": 0.1424, + "step": 43527 + }, + { + "epoch": 2.1903960933260986, + "grad_norm": 4.5007452964782715, + "learning_rate": 4.137727833356175e-06, + "loss": 0.1492, + "step": 43528 + }, + { + "epoch": 2.1904096581660335, + "grad_norm": 5.120192050933838, + "learning_rate": 4.137590790735919e-06, + "loss": 0.1267, + "step": 43529 + }, + { + "epoch": 2.1904232230059684, + "grad_norm": 2.8942248821258545, + "learning_rate": 4.137453748115664e-06, + "loss": 0.1081, + "step": 43530 + }, + { + "epoch": 2.1904367878459032, + "grad_norm": 4.5001220703125, + "learning_rate": 4.137316705495409e-06, + "loss": 0.1675, + "step": 43531 + }, + { + "epoch": 2.190450352685838, + "grad_norm": 4.10025691986084, + "learning_rate": 4.1371796628751545e-06, + "loss": 0.1288, + "step": 43532 + }, + { + "epoch": 2.190463917525773, + "grad_norm": 4.218020439147949, + "learning_rate": 4.1370426202549e-06, + "loss": 0.1845, + "step": 43533 + }, + { + "epoch": 2.1904774823657083, + "grad_norm": 4.70529842376709, + "learning_rate": 4.136905577634645e-06, + "loss": 0.1727, + "step": 43534 + }, + { + "epoch": 2.190491047205643, + "grad_norm": 3.6038637161254883, + "learning_rate": 4.13676853501439e-06, + "loss": 0.1812, + "step": 43535 + }, + { + "epoch": 2.190504612045578, + "grad_norm": 4.161665916442871, + "learning_rate": 4.136631492394134e-06, + "loss": 0.1424, + "step": 43536 + }, + { + "epoch": 2.190518176885513, + "grad_norm": 3.925870895385742, + "learning_rate": 4.1364944497738804e-06, + "loss": 0.1156, + "step": 43537 + }, + { + "epoch": 2.1905317417254477, + "grad_norm": 3.7196383476257324, + "learning_rate": 4.136357407153625e-06, + "loss": 0.1502, + "step": 43538 + }, + { + "epoch": 2.1905453065653826, + "grad_norm": 4.975908279418945, + "learning_rate": 4.136220364533371e-06, + "loss": 0.2029, + "step": 43539 + }, + { + "epoch": 2.1905588714053175, + "grad_norm": 5.822708606719971, + "learning_rate": 4.136083321913115e-06, + "loss": 0.2327, + "step": 43540 + }, + { + "epoch": 2.1905724362452523, + "grad_norm": 4.771539688110352, + "learning_rate": 4.13594627929286e-06, + "loss": 0.2374, + "step": 43541 + }, + { + "epoch": 2.190586001085187, + "grad_norm": 4.499478816986084, + "learning_rate": 4.1358092366726055e-06, + "loss": 0.1613, + "step": 43542 + }, + { + "epoch": 2.190599565925122, + "grad_norm": 4.170319080352783, + "learning_rate": 4.135672194052351e-06, + "loss": 0.1132, + "step": 43543 + }, + { + "epoch": 2.190613130765057, + "grad_norm": 4.9820451736450195, + "learning_rate": 4.135535151432096e-06, + "loss": 0.1656, + "step": 43544 + }, + { + "epoch": 2.190626695604992, + "grad_norm": 3.796950340270996, + "learning_rate": 4.13539810881184e-06, + "loss": 0.1385, + "step": 43545 + }, + { + "epoch": 2.1906402604449267, + "grad_norm": 3.54602313041687, + "learning_rate": 4.135261066191586e-06, + "loss": 0.1146, + "step": 43546 + }, + { + "epoch": 2.1906538252848615, + "grad_norm": 4.449289798736572, + "learning_rate": 4.1351240235713305e-06, + "loss": 0.1232, + "step": 43547 + }, + { + "epoch": 2.1906673901247964, + "grad_norm": 4.544443130493164, + "learning_rate": 4.1349869809510765e-06, + "loss": 0.1011, + "step": 43548 + }, + { + "epoch": 2.1906809549647313, + "grad_norm": 6.242441654205322, + "learning_rate": 4.134849938330821e-06, + "loss": 0.2011, + "step": 43549 + }, + { + "epoch": 2.190694519804666, + "grad_norm": 4.4099321365356445, + "learning_rate": 4.134712895710566e-06, + "loss": 0.2241, + "step": 43550 + }, + { + "epoch": 2.1907080846446014, + "grad_norm": 4.634810447692871, + "learning_rate": 4.134575853090311e-06, + "loss": 0.1506, + "step": 43551 + }, + { + "epoch": 2.1907216494845363, + "grad_norm": 3.8295695781707764, + "learning_rate": 4.134438810470056e-06, + "loss": 0.1183, + "step": 43552 + }, + { + "epoch": 2.190735214324471, + "grad_norm": 5.707788944244385, + "learning_rate": 4.134301767849802e-06, + "loss": 0.1783, + "step": 43553 + }, + { + "epoch": 2.190748779164406, + "grad_norm": 6.077641487121582, + "learning_rate": 4.134164725229547e-06, + "loss": 0.1978, + "step": 43554 + }, + { + "epoch": 2.190762344004341, + "grad_norm": 3.551743984222412, + "learning_rate": 4.134027682609292e-06, + "loss": 0.17, + "step": 43555 + }, + { + "epoch": 2.1907759088442758, + "grad_norm": 3.3394298553466797, + "learning_rate": 4.133890639989037e-06, + "loss": 0.0735, + "step": 43556 + }, + { + "epoch": 2.1907894736842106, + "grad_norm": 3.886575937271118, + "learning_rate": 4.133753597368782e-06, + "loss": 0.1596, + "step": 43557 + }, + { + "epoch": 2.1908030385241455, + "grad_norm": 3.9088943004608154, + "learning_rate": 4.133616554748527e-06, + "loss": 0.2224, + "step": 43558 + }, + { + "epoch": 2.1908166033640804, + "grad_norm": 5.843522071838379, + "learning_rate": 4.133479512128273e-06, + "loss": 0.263, + "step": 43559 + }, + { + "epoch": 2.190830168204015, + "grad_norm": 3.9257075786590576, + "learning_rate": 4.133342469508017e-06, + "loss": 0.2062, + "step": 43560 + }, + { + "epoch": 2.19084373304395, + "grad_norm": 3.9645652770996094, + "learning_rate": 4.133205426887762e-06, + "loss": 0.1162, + "step": 43561 + }, + { + "epoch": 2.190857297883885, + "grad_norm": 4.121965408325195, + "learning_rate": 4.133068384267507e-06, + "loss": 0.1342, + "step": 43562 + }, + { + "epoch": 2.19087086272382, + "grad_norm": 3.696841239929199, + "learning_rate": 4.1329313416472525e-06, + "loss": 0.1118, + "step": 43563 + }, + { + "epoch": 2.1908844275637547, + "grad_norm": 4.283877849578857, + "learning_rate": 4.132794299026998e-06, + "loss": 0.1499, + "step": 43564 + }, + { + "epoch": 2.1908979924036895, + "grad_norm": 4.034409523010254, + "learning_rate": 4.132657256406743e-06, + "loss": 0.1451, + "step": 43565 + }, + { + "epoch": 2.1909115572436244, + "grad_norm": 3.8717405796051025, + "learning_rate": 4.132520213786488e-06, + "loss": 0.1225, + "step": 43566 + }, + { + "epoch": 2.1909251220835593, + "grad_norm": 4.630923271179199, + "learning_rate": 4.132383171166233e-06, + "loss": 0.0933, + "step": 43567 + }, + { + "epoch": 2.190938686923494, + "grad_norm": 4.031572341918945, + "learning_rate": 4.1322461285459784e-06, + "loss": 0.1924, + "step": 43568 + }, + { + "epoch": 2.190952251763429, + "grad_norm": 4.286279201507568, + "learning_rate": 4.132109085925724e-06, + "loss": 0.1459, + "step": 43569 + }, + { + "epoch": 2.190965816603364, + "grad_norm": 3.7081127166748047, + "learning_rate": 4.131972043305468e-06, + "loss": 0.1244, + "step": 43570 + }, + { + "epoch": 2.1909793814432987, + "grad_norm": 2.4036874771118164, + "learning_rate": 4.131835000685214e-06, + "loss": 0.1123, + "step": 43571 + }, + { + "epoch": 2.190992946283234, + "grad_norm": 2.9527077674865723, + "learning_rate": 4.131697958064958e-06, + "loss": 0.0844, + "step": 43572 + }, + { + "epoch": 2.191006511123169, + "grad_norm": 3.191946506500244, + "learning_rate": 4.1315609154447035e-06, + "loss": 0.0863, + "step": 43573 + }, + { + "epoch": 2.1910200759631038, + "grad_norm": 4.931705951690674, + "learning_rate": 4.131423872824449e-06, + "loss": 0.1721, + "step": 43574 + }, + { + "epoch": 2.1910336408030386, + "grad_norm": 5.433403968811035, + "learning_rate": 4.131286830204194e-06, + "loss": 0.138, + "step": 43575 + }, + { + "epoch": 2.1910472056429735, + "grad_norm": 6.811564922332764, + "learning_rate": 4.131149787583939e-06, + "loss": 0.229, + "step": 43576 + }, + { + "epoch": 2.1910607704829084, + "grad_norm": 3.3801026344299316, + "learning_rate": 4.131012744963684e-06, + "loss": 0.1054, + "step": 43577 + }, + { + "epoch": 2.1910743353228432, + "grad_norm": 4.270435333251953, + "learning_rate": 4.130875702343429e-06, + "loss": 0.1029, + "step": 43578 + }, + { + "epoch": 2.191087900162778, + "grad_norm": 4.703676223754883, + "learning_rate": 4.130738659723174e-06, + "loss": 0.2271, + "step": 43579 + }, + { + "epoch": 2.191101465002713, + "grad_norm": 6.027235507965088, + "learning_rate": 4.13060161710292e-06, + "loss": 0.1498, + "step": 43580 + }, + { + "epoch": 2.191115029842648, + "grad_norm": 4.245185852050781, + "learning_rate": 4.130464574482664e-06, + "loss": 0.1499, + "step": 43581 + }, + { + "epoch": 2.1911285946825827, + "grad_norm": 2.658320188522339, + "learning_rate": 4.13032753186241e-06, + "loss": 0.1014, + "step": 43582 + }, + { + "epoch": 2.1911421595225176, + "grad_norm": 4.547215461730957, + "learning_rate": 4.130190489242154e-06, + "loss": 0.193, + "step": 43583 + }, + { + "epoch": 2.1911557243624524, + "grad_norm": 3.700251340866089, + "learning_rate": 4.1300534466219e-06, + "loss": 0.1594, + "step": 43584 + }, + { + "epoch": 2.1911692892023873, + "grad_norm": 3.8609731197357178, + "learning_rate": 4.129916404001645e-06, + "loss": 0.1823, + "step": 43585 + }, + { + "epoch": 2.191182854042322, + "grad_norm": 4.712538719177246, + "learning_rate": 4.12977936138139e-06, + "loss": 0.2345, + "step": 43586 + }, + { + "epoch": 2.191196418882257, + "grad_norm": 6.856409549713135, + "learning_rate": 4.129642318761135e-06, + "loss": 0.3625, + "step": 43587 + }, + { + "epoch": 2.191209983722192, + "grad_norm": 4.235418319702148, + "learning_rate": 4.12950527614088e-06, + "loss": 0.1357, + "step": 43588 + }, + { + "epoch": 2.191223548562127, + "grad_norm": 5.227378845214844, + "learning_rate": 4.1293682335206255e-06, + "loss": 0.187, + "step": 43589 + }, + { + "epoch": 2.191237113402062, + "grad_norm": 3.1819543838500977, + "learning_rate": 4.12923119090037e-06, + "loss": 0.1063, + "step": 43590 + }, + { + "epoch": 2.191250678241997, + "grad_norm": 4.898696422576904, + "learning_rate": 4.129094148280116e-06, + "loss": 0.1846, + "step": 43591 + }, + { + "epoch": 2.191264243081932, + "grad_norm": 4.904191970825195, + "learning_rate": 4.12895710565986e-06, + "loss": 0.257, + "step": 43592 + }, + { + "epoch": 2.1912778079218667, + "grad_norm": 5.189712047576904, + "learning_rate": 4.128820063039606e-06, + "loss": 0.2003, + "step": 43593 + }, + { + "epoch": 2.1912913727618015, + "grad_norm": 4.752956390380859, + "learning_rate": 4.1286830204193505e-06, + "loss": 0.1975, + "step": 43594 + }, + { + "epoch": 2.1913049376017364, + "grad_norm": 3.7175183296203613, + "learning_rate": 4.128545977799096e-06, + "loss": 0.1453, + "step": 43595 + }, + { + "epoch": 2.1913185024416713, + "grad_norm": 4.936854362487793, + "learning_rate": 4.128408935178841e-06, + "loss": 0.2304, + "step": 43596 + }, + { + "epoch": 2.191332067281606, + "grad_norm": 5.387242794036865, + "learning_rate": 4.128271892558586e-06, + "loss": 0.2164, + "step": 43597 + }, + { + "epoch": 2.191345632121541, + "grad_norm": 4.831065654754639, + "learning_rate": 4.128134849938331e-06, + "loss": 0.1712, + "step": 43598 + }, + { + "epoch": 2.191359196961476, + "grad_norm": 3.4538540840148926, + "learning_rate": 4.1279978073180764e-06, + "loss": 0.1249, + "step": 43599 + }, + { + "epoch": 2.1913727618014107, + "grad_norm": 4.334278583526611, + "learning_rate": 4.127860764697822e-06, + "loss": 0.1473, + "step": 43600 + }, + { + "epoch": 2.1913863266413456, + "grad_norm": 4.897177696228027, + "learning_rate": 4.127723722077566e-06, + "loss": 0.192, + "step": 43601 + }, + { + "epoch": 2.1913998914812804, + "grad_norm": 3.8292386531829834, + "learning_rate": 4.127586679457312e-06, + "loss": 0.1366, + "step": 43602 + }, + { + "epoch": 2.1914134563212153, + "grad_norm": 4.228552341461182, + "learning_rate": 4.127449636837056e-06, + "loss": 0.2, + "step": 43603 + }, + { + "epoch": 2.19142702116115, + "grad_norm": 4.071042060852051, + "learning_rate": 4.1273125942168015e-06, + "loss": 0.2075, + "step": 43604 + }, + { + "epoch": 2.191440586001085, + "grad_norm": 3.922785520553589, + "learning_rate": 4.127175551596547e-06, + "loss": 0.1628, + "step": 43605 + }, + { + "epoch": 2.19145415084102, + "grad_norm": 4.045895099639893, + "learning_rate": 4.127038508976292e-06, + "loss": 0.1557, + "step": 43606 + }, + { + "epoch": 2.1914677156809548, + "grad_norm": 3.248361110687256, + "learning_rate": 4.126901466356037e-06, + "loss": 0.1818, + "step": 43607 + }, + { + "epoch": 2.1914812805208896, + "grad_norm": 4.034012317657471, + "learning_rate": 4.126764423735782e-06, + "loss": 0.1678, + "step": 43608 + }, + { + "epoch": 2.191494845360825, + "grad_norm": 3.738020181655884, + "learning_rate": 4.126627381115527e-06, + "loss": 0.1372, + "step": 43609 + }, + { + "epoch": 2.19150841020076, + "grad_norm": 5.0534844398498535, + "learning_rate": 4.1264903384952726e-06, + "loss": 0.1953, + "step": 43610 + }, + { + "epoch": 2.1915219750406947, + "grad_norm": 4.309195518493652, + "learning_rate": 4.126353295875018e-06, + "loss": 0.1111, + "step": 43611 + }, + { + "epoch": 2.1915355398806295, + "grad_norm": 3.9021432399749756, + "learning_rate": 4.126216253254763e-06, + "loss": 0.1361, + "step": 43612 + }, + { + "epoch": 2.1915491047205644, + "grad_norm": 4.512505054473877, + "learning_rate": 4.126079210634508e-06, + "loss": 0.1455, + "step": 43613 + }, + { + "epoch": 2.1915626695604993, + "grad_norm": 5.24148416519165, + "learning_rate": 4.125942168014252e-06, + "loss": 0.1891, + "step": 43614 + }, + { + "epoch": 2.191576234400434, + "grad_norm": 5.1339945793151855, + "learning_rate": 4.125805125393998e-06, + "loss": 0.1705, + "step": 43615 + }, + { + "epoch": 2.191589799240369, + "grad_norm": 4.348596096038818, + "learning_rate": 4.125668082773743e-06, + "loss": 0.1587, + "step": 43616 + }, + { + "epoch": 2.191603364080304, + "grad_norm": 4.017444133758545, + "learning_rate": 4.125531040153488e-06, + "loss": 0.188, + "step": 43617 + }, + { + "epoch": 2.1916169289202387, + "grad_norm": 3.9368717670440674, + "learning_rate": 4.125393997533233e-06, + "loss": 0.1497, + "step": 43618 + }, + { + "epoch": 2.1916304937601736, + "grad_norm": 5.40875768661499, + "learning_rate": 4.125256954912978e-06, + "loss": 0.2031, + "step": 43619 + }, + { + "epoch": 2.1916440586001085, + "grad_norm": 5.577641010284424, + "learning_rate": 4.1251199122927235e-06, + "loss": 0.2505, + "step": 43620 + }, + { + "epoch": 2.1916576234400433, + "grad_norm": 4.325815200805664, + "learning_rate": 4.124982869672469e-06, + "loss": 0.1251, + "step": 43621 + }, + { + "epoch": 2.191671188279978, + "grad_norm": 4.188704967498779, + "learning_rate": 4.124845827052214e-06, + "loss": 0.1847, + "step": 43622 + }, + { + "epoch": 2.191684753119913, + "grad_norm": 3.127183437347412, + "learning_rate": 4.124708784431959e-06, + "loss": 0.1395, + "step": 43623 + }, + { + "epoch": 2.191698317959848, + "grad_norm": 3.5457310676574707, + "learning_rate": 4.124571741811703e-06, + "loss": 0.1785, + "step": 43624 + }, + { + "epoch": 2.191711882799783, + "grad_norm": 6.616647243499756, + "learning_rate": 4.124434699191449e-06, + "loss": 0.3334, + "step": 43625 + }, + { + "epoch": 2.1917254476397177, + "grad_norm": 3.3509225845336914, + "learning_rate": 4.124297656571194e-06, + "loss": 0.1017, + "step": 43626 + }, + { + "epoch": 2.191739012479653, + "grad_norm": 4.9970269203186035, + "learning_rate": 4.124160613950939e-06, + "loss": 0.1872, + "step": 43627 + }, + { + "epoch": 2.191752577319588, + "grad_norm": 4.303901195526123, + "learning_rate": 4.124023571330684e-06, + "loss": 0.1913, + "step": 43628 + }, + { + "epoch": 2.1917661421595227, + "grad_norm": 6.874413013458252, + "learning_rate": 4.123886528710429e-06, + "loss": 0.3186, + "step": 43629 + }, + { + "epoch": 2.1917797069994576, + "grad_norm": 4.138920783996582, + "learning_rate": 4.1237494860901744e-06, + "loss": 0.1448, + "step": 43630 + }, + { + "epoch": 2.1917932718393924, + "grad_norm": 5.048730850219727, + "learning_rate": 4.12361244346992e-06, + "loss": 0.1026, + "step": 43631 + }, + { + "epoch": 2.1918068366793273, + "grad_norm": 3.798454761505127, + "learning_rate": 4.123475400849665e-06, + "loss": 0.194, + "step": 43632 + }, + { + "epoch": 2.191820401519262, + "grad_norm": 3.2191269397735596, + "learning_rate": 4.123338358229409e-06, + "loss": 0.0938, + "step": 43633 + }, + { + "epoch": 2.191833966359197, + "grad_norm": 4.8201212882995605, + "learning_rate": 4.123201315609155e-06, + "loss": 0.1713, + "step": 43634 + }, + { + "epoch": 2.191847531199132, + "grad_norm": 4.790146827697754, + "learning_rate": 4.1230642729888995e-06, + "loss": 0.1564, + "step": 43635 + }, + { + "epoch": 2.1918610960390668, + "grad_norm": 4.214473247528076, + "learning_rate": 4.1229272303686455e-06, + "loss": 0.1395, + "step": 43636 + }, + { + "epoch": 2.1918746608790016, + "grad_norm": 4.292646884918213, + "learning_rate": 4.12279018774839e-06, + "loss": 0.1657, + "step": 43637 + }, + { + "epoch": 2.1918882257189365, + "grad_norm": 5.511958122253418, + "learning_rate": 4.122653145128135e-06, + "loss": 0.31, + "step": 43638 + }, + { + "epoch": 2.1919017905588714, + "grad_norm": 3.776097059249878, + "learning_rate": 4.12251610250788e-06, + "loss": 0.1186, + "step": 43639 + }, + { + "epoch": 2.191915355398806, + "grad_norm": 3.4347143173217773, + "learning_rate": 4.122379059887625e-06, + "loss": 0.1456, + "step": 43640 + }, + { + "epoch": 2.191928920238741, + "grad_norm": 4.4094767570495605, + "learning_rate": 4.1222420172673706e-06, + "loss": 0.1222, + "step": 43641 + }, + { + "epoch": 2.191942485078676, + "grad_norm": 4.6118550300598145, + "learning_rate": 4.122104974647116e-06, + "loss": 0.1377, + "step": 43642 + }, + { + "epoch": 2.191956049918611, + "grad_norm": 3.910496950149536, + "learning_rate": 4.121967932026861e-06, + "loss": 0.0993, + "step": 43643 + }, + { + "epoch": 2.1919696147585457, + "grad_norm": 4.652620315551758, + "learning_rate": 4.121830889406605e-06, + "loss": 0.1295, + "step": 43644 + }, + { + "epoch": 2.1919831795984805, + "grad_norm": 4.201432228088379, + "learning_rate": 4.121693846786351e-06, + "loss": 0.1372, + "step": 43645 + }, + { + "epoch": 2.1919967444384154, + "grad_norm": 4.287266254425049, + "learning_rate": 4.121556804166096e-06, + "loss": 0.2494, + "step": 43646 + }, + { + "epoch": 2.1920103092783507, + "grad_norm": 5.670192241668701, + "learning_rate": 4.121419761545842e-06, + "loss": 0.2038, + "step": 43647 + }, + { + "epoch": 2.1920238741182856, + "grad_norm": 6.026391506195068, + "learning_rate": 4.121282718925586e-06, + "loss": 0.3067, + "step": 43648 + }, + { + "epoch": 2.1920374389582205, + "grad_norm": 6.804769992828369, + "learning_rate": 4.121145676305331e-06, + "loss": 0.3834, + "step": 43649 + }, + { + "epoch": 2.1920510037981553, + "grad_norm": 3.855778217315674, + "learning_rate": 4.121008633685076e-06, + "loss": 0.1752, + "step": 43650 + }, + { + "epoch": 2.19206456863809, + "grad_norm": 4.077301502227783, + "learning_rate": 4.1208715910648215e-06, + "loss": 0.1594, + "step": 43651 + }, + { + "epoch": 2.192078133478025, + "grad_norm": 3.778782606124878, + "learning_rate": 4.120734548444567e-06, + "loss": 0.1839, + "step": 43652 + }, + { + "epoch": 2.19209169831796, + "grad_norm": 5.216418266296387, + "learning_rate": 4.120597505824312e-06, + "loss": 0.2015, + "step": 43653 + }, + { + "epoch": 2.192105263157895, + "grad_norm": 5.017104148864746, + "learning_rate": 4.120460463204057e-06, + "loss": 0.1867, + "step": 43654 + }, + { + "epoch": 2.1921188279978296, + "grad_norm": 4.682876110076904, + "learning_rate": 4.120323420583801e-06, + "loss": 0.2576, + "step": 43655 + }, + { + "epoch": 2.1921323928377645, + "grad_norm": 5.34343957901001, + "learning_rate": 4.120186377963547e-06, + "loss": 0.1672, + "step": 43656 + }, + { + "epoch": 2.1921459576776994, + "grad_norm": 3.826840400695801, + "learning_rate": 4.120049335343292e-06, + "loss": 0.1218, + "step": 43657 + }, + { + "epoch": 2.1921595225176342, + "grad_norm": 5.783339500427246, + "learning_rate": 4.119912292723037e-06, + "loss": 0.2174, + "step": 43658 + }, + { + "epoch": 2.192173087357569, + "grad_norm": 4.884347438812256, + "learning_rate": 4.119775250102782e-06, + "loss": 0.1599, + "step": 43659 + }, + { + "epoch": 2.192186652197504, + "grad_norm": 5.898806095123291, + "learning_rate": 4.119638207482527e-06, + "loss": 0.2, + "step": 43660 + }, + { + "epoch": 2.192200217037439, + "grad_norm": 4.971203327178955, + "learning_rate": 4.1195011648622724e-06, + "loss": 0.1889, + "step": 43661 + }, + { + "epoch": 2.1922137818773737, + "grad_norm": 5.63655424118042, + "learning_rate": 4.119364122242018e-06, + "loss": 0.2739, + "step": 43662 + }, + { + "epoch": 2.1922273467173086, + "grad_norm": 4.687589645385742, + "learning_rate": 4.119227079621763e-06, + "loss": 0.3003, + "step": 43663 + }, + { + "epoch": 2.1922409115572434, + "grad_norm": 6.975423336029053, + "learning_rate": 4.119090037001508e-06, + "loss": 0.2526, + "step": 43664 + }, + { + "epoch": 2.1922544763971787, + "grad_norm": 6.1636738777160645, + "learning_rate": 4.118952994381253e-06, + "loss": 0.2165, + "step": 43665 + }, + { + "epoch": 2.1922680412371136, + "grad_norm": 5.297183990478516, + "learning_rate": 4.118815951760998e-06, + "loss": 0.1957, + "step": 43666 + }, + { + "epoch": 2.1922816060770485, + "grad_norm": 5.129025936126709, + "learning_rate": 4.118678909140743e-06, + "loss": 0.1549, + "step": 43667 + }, + { + "epoch": 2.1922951709169833, + "grad_norm": 7.10705041885376, + "learning_rate": 4.118541866520489e-06, + "loss": 0.1987, + "step": 43668 + }, + { + "epoch": 2.192308735756918, + "grad_norm": 4.8341779708862305, + "learning_rate": 4.118404823900233e-06, + "loss": 0.1673, + "step": 43669 + }, + { + "epoch": 2.192322300596853, + "grad_norm": 5.210535049438477, + "learning_rate": 4.118267781279978e-06, + "loss": 0.1657, + "step": 43670 + }, + { + "epoch": 2.192335865436788, + "grad_norm": 3.635723829269409, + "learning_rate": 4.118130738659723e-06, + "loss": 0.0624, + "step": 43671 + }, + { + "epoch": 2.192349430276723, + "grad_norm": 5.8398542404174805, + "learning_rate": 4.1179936960394686e-06, + "loss": 0.1997, + "step": 43672 + }, + { + "epoch": 2.1923629951166577, + "grad_norm": 5.465219497680664, + "learning_rate": 4.117856653419214e-06, + "loss": 0.1685, + "step": 43673 + }, + { + "epoch": 2.1923765599565925, + "grad_norm": 6.555797100067139, + "learning_rate": 4.117719610798959e-06, + "loss": 0.2339, + "step": 43674 + }, + { + "epoch": 2.1923901247965274, + "grad_norm": 5.332581520080566, + "learning_rate": 4.117582568178704e-06, + "loss": 0.1432, + "step": 43675 + }, + { + "epoch": 2.1924036896364623, + "grad_norm": 4.278243541717529, + "learning_rate": 4.117445525558449e-06, + "loss": 0.1236, + "step": 43676 + }, + { + "epoch": 2.192417254476397, + "grad_norm": 3.4867725372314453, + "learning_rate": 4.1173084829381945e-06, + "loss": 0.0786, + "step": 43677 + }, + { + "epoch": 2.192430819316332, + "grad_norm": 4.864253044128418, + "learning_rate": 4.117171440317939e-06, + "loss": 0.1894, + "step": 43678 + }, + { + "epoch": 2.192444384156267, + "grad_norm": 3.3761045932769775, + "learning_rate": 4.117034397697685e-06, + "loss": 0.1263, + "step": 43679 + }, + { + "epoch": 2.1924579489962017, + "grad_norm": 3.609752893447876, + "learning_rate": 4.116897355077429e-06, + "loss": 0.0944, + "step": 43680 + }, + { + "epoch": 2.1924715138361366, + "grad_norm": 4.100904941558838, + "learning_rate": 4.116760312457175e-06, + "loss": 0.1366, + "step": 43681 + }, + { + "epoch": 2.1924850786760715, + "grad_norm": 5.865748405456543, + "learning_rate": 4.1166232698369195e-06, + "loss": 0.2687, + "step": 43682 + }, + { + "epoch": 2.1924986435160063, + "grad_norm": 4.0777058601379395, + "learning_rate": 4.116486227216665e-06, + "loss": 0.0999, + "step": 43683 + }, + { + "epoch": 2.192512208355941, + "grad_norm": 3.7128381729125977, + "learning_rate": 4.11634918459641e-06, + "loss": 0.1339, + "step": 43684 + }, + { + "epoch": 2.1925257731958765, + "grad_norm": 4.62907600402832, + "learning_rate": 4.116212141976155e-06, + "loss": 0.1256, + "step": 43685 + }, + { + "epoch": 2.1925393380358114, + "grad_norm": 4.159270763397217, + "learning_rate": 4.1160750993559e-06, + "loss": 0.1406, + "step": 43686 + }, + { + "epoch": 2.1925529028757462, + "grad_norm": 3.8222639560699463, + "learning_rate": 4.1159380567356446e-06, + "loss": 0.094, + "step": 43687 + }, + { + "epoch": 2.192566467715681, + "grad_norm": 3.645798683166504, + "learning_rate": 4.115801014115391e-06, + "loss": 0.101, + "step": 43688 + }, + { + "epoch": 2.192580032555616, + "grad_norm": 5.980348587036133, + "learning_rate": 4.115663971495135e-06, + "loss": 0.192, + "step": 43689 + }, + { + "epoch": 2.192593597395551, + "grad_norm": 3.780630111694336, + "learning_rate": 4.115526928874881e-06, + "loss": 0.1479, + "step": 43690 + }, + { + "epoch": 2.1926071622354857, + "grad_norm": 3.313112497329712, + "learning_rate": 4.115389886254625e-06, + "loss": 0.1064, + "step": 43691 + }, + { + "epoch": 2.1926207270754206, + "grad_norm": 6.4889912605285645, + "learning_rate": 4.1152528436343704e-06, + "loss": 0.2571, + "step": 43692 + }, + { + "epoch": 2.1926342919153554, + "grad_norm": 4.641890048980713, + "learning_rate": 4.115115801014116e-06, + "loss": 0.1926, + "step": 43693 + }, + { + "epoch": 2.1926478567552903, + "grad_norm": 3.9765806198120117, + "learning_rate": 4.114978758393861e-06, + "loss": 0.0906, + "step": 43694 + }, + { + "epoch": 2.192661421595225, + "grad_norm": 4.009399890899658, + "learning_rate": 4.114841715773606e-06, + "loss": 0.1547, + "step": 43695 + }, + { + "epoch": 2.19267498643516, + "grad_norm": 2.9567880630493164, + "learning_rate": 4.114704673153351e-06, + "loss": 0.1121, + "step": 43696 + }, + { + "epoch": 2.192688551275095, + "grad_norm": 6.04395866394043, + "learning_rate": 4.114567630533096e-06, + "loss": 0.1644, + "step": 43697 + }, + { + "epoch": 2.1927021161150297, + "grad_norm": 3.7362399101257324, + "learning_rate": 4.114430587912841e-06, + "loss": 0.1016, + "step": 43698 + }, + { + "epoch": 2.1927156809549646, + "grad_norm": 7.762925624847412, + "learning_rate": 4.114293545292587e-06, + "loss": 0.2562, + "step": 43699 + }, + { + "epoch": 2.1927292457948995, + "grad_norm": 4.154331684112549, + "learning_rate": 4.114156502672331e-06, + "loss": 0.1254, + "step": 43700 + }, + { + "epoch": 2.1927428106348343, + "grad_norm": 5.123410701751709, + "learning_rate": 4.114019460052077e-06, + "loss": 0.2456, + "step": 43701 + }, + { + "epoch": 2.192756375474769, + "grad_norm": 4.878362655639648, + "learning_rate": 4.113882417431821e-06, + "loss": 0.1969, + "step": 43702 + }, + { + "epoch": 2.1927699403147045, + "grad_norm": 6.76987886428833, + "learning_rate": 4.1137453748115666e-06, + "loss": 0.1734, + "step": 43703 + }, + { + "epoch": 2.1927835051546394, + "grad_norm": 5.586690902709961, + "learning_rate": 4.113608332191312e-06, + "loss": 0.3075, + "step": 43704 + }, + { + "epoch": 2.1927970699945742, + "grad_norm": 5.221121788024902, + "learning_rate": 4.113471289571057e-06, + "loss": 0.1592, + "step": 43705 + }, + { + "epoch": 2.192810634834509, + "grad_norm": 5.262688159942627, + "learning_rate": 4.113334246950802e-06, + "loss": 0.1989, + "step": 43706 + }, + { + "epoch": 2.192824199674444, + "grad_norm": 4.102639675140381, + "learning_rate": 4.113197204330547e-06, + "loss": 0.1488, + "step": 43707 + }, + { + "epoch": 2.192837764514379, + "grad_norm": 5.1769914627075195, + "learning_rate": 4.1130601617102925e-06, + "loss": 0.24, + "step": 43708 + }, + { + "epoch": 2.1928513293543137, + "grad_norm": 4.542200565338135, + "learning_rate": 4.112923119090038e-06, + "loss": 0.2038, + "step": 43709 + }, + { + "epoch": 2.1928648941942486, + "grad_norm": 4.09660005569458, + "learning_rate": 4.112786076469783e-06, + "loss": 0.1139, + "step": 43710 + }, + { + "epoch": 2.1928784590341834, + "grad_norm": 4.131488800048828, + "learning_rate": 4.112649033849527e-06, + "loss": 0.1236, + "step": 43711 + }, + { + "epoch": 2.1928920238741183, + "grad_norm": 4.816374778747559, + "learning_rate": 4.112511991229272e-06, + "loss": 0.1728, + "step": 43712 + }, + { + "epoch": 2.192905588714053, + "grad_norm": 4.971364498138428, + "learning_rate": 4.1123749486090175e-06, + "loss": 0.202, + "step": 43713 + }, + { + "epoch": 2.192919153553988, + "grad_norm": 5.135964393615723, + "learning_rate": 4.112237905988763e-06, + "loss": 0.1474, + "step": 43714 + }, + { + "epoch": 2.192932718393923, + "grad_norm": 4.080387115478516, + "learning_rate": 4.112100863368508e-06, + "loss": 0.2025, + "step": 43715 + }, + { + "epoch": 2.1929462832338578, + "grad_norm": 6.207144260406494, + "learning_rate": 4.111963820748253e-06, + "loss": 0.2772, + "step": 43716 + }, + { + "epoch": 2.1929598480737926, + "grad_norm": 4.486496925354004, + "learning_rate": 4.111826778127998e-06, + "loss": 0.1686, + "step": 43717 + }, + { + "epoch": 2.1929734129137275, + "grad_norm": 5.331693172454834, + "learning_rate": 4.111689735507743e-06, + "loss": 0.2705, + "step": 43718 + }, + { + "epoch": 2.1929869777536624, + "grad_norm": 4.808386325836182, + "learning_rate": 4.111552692887489e-06, + "loss": 0.1696, + "step": 43719 + }, + { + "epoch": 2.1930005425935972, + "grad_norm": 3.4957892894744873, + "learning_rate": 4.111415650267234e-06, + "loss": 0.1554, + "step": 43720 + }, + { + "epoch": 2.193014107433532, + "grad_norm": 4.483224391937256, + "learning_rate": 4.111278607646978e-06, + "loss": 0.149, + "step": 43721 + }, + { + "epoch": 2.193027672273467, + "grad_norm": 5.628072261810303, + "learning_rate": 4.111141565026724e-06, + "loss": 0.1167, + "step": 43722 + }, + { + "epoch": 2.1930412371134023, + "grad_norm": 5.799593448638916, + "learning_rate": 4.1110045224064685e-06, + "loss": 0.1913, + "step": 43723 + }, + { + "epoch": 2.193054801953337, + "grad_norm": 4.338240146636963, + "learning_rate": 4.110867479786214e-06, + "loss": 0.1365, + "step": 43724 + }, + { + "epoch": 2.193068366793272, + "grad_norm": 6.538269519805908, + "learning_rate": 4.110730437165959e-06, + "loss": 0.2953, + "step": 43725 + }, + { + "epoch": 2.193081931633207, + "grad_norm": 4.997184753417969, + "learning_rate": 4.110593394545704e-06, + "loss": 0.2811, + "step": 43726 + }, + { + "epoch": 2.1930954964731417, + "grad_norm": 6.282949447631836, + "learning_rate": 4.110456351925449e-06, + "loss": 0.2273, + "step": 43727 + }, + { + "epoch": 2.1931090613130766, + "grad_norm": 5.779142379760742, + "learning_rate": 4.110319309305194e-06, + "loss": 0.2708, + "step": 43728 + }, + { + "epoch": 2.1931226261530115, + "grad_norm": 5.825348377227783, + "learning_rate": 4.1101822666849395e-06, + "loss": 0.2534, + "step": 43729 + }, + { + "epoch": 2.1931361909929463, + "grad_norm": 4.377586841583252, + "learning_rate": 4.110045224064685e-06, + "loss": 0.2341, + "step": 43730 + }, + { + "epoch": 2.193149755832881, + "grad_norm": 3.8382046222686768, + "learning_rate": 4.10990818144443e-06, + "loss": 0.1133, + "step": 43731 + }, + { + "epoch": 2.193163320672816, + "grad_norm": 4.86421012878418, + "learning_rate": 4.109771138824174e-06, + "loss": 0.1892, + "step": 43732 + }, + { + "epoch": 2.193176885512751, + "grad_norm": 3.8144304752349854, + "learning_rate": 4.10963409620392e-06, + "loss": 0.1328, + "step": 43733 + }, + { + "epoch": 2.193190450352686, + "grad_norm": 3.561699867248535, + "learning_rate": 4.1094970535836646e-06, + "loss": 0.139, + "step": 43734 + }, + { + "epoch": 2.1932040151926206, + "grad_norm": 4.046283721923828, + "learning_rate": 4.109360010963411e-06, + "loss": 0.1543, + "step": 43735 + }, + { + "epoch": 2.1932175800325555, + "grad_norm": 4.174145698547363, + "learning_rate": 4.109222968343155e-06, + "loss": 0.2243, + "step": 43736 + }, + { + "epoch": 2.1932311448724904, + "grad_norm": 6.545080184936523, + "learning_rate": 4.1090859257229e-06, + "loss": 0.2475, + "step": 43737 + }, + { + "epoch": 2.1932447097124252, + "grad_norm": 3.723632574081421, + "learning_rate": 4.108948883102645e-06, + "loss": 0.112, + "step": 43738 + }, + { + "epoch": 2.19325827455236, + "grad_norm": 4.473449230194092, + "learning_rate": 4.1088118404823905e-06, + "loss": 0.2151, + "step": 43739 + }, + { + "epoch": 2.193271839392295, + "grad_norm": 5.564609050750732, + "learning_rate": 4.108674797862136e-06, + "loss": 0.1479, + "step": 43740 + }, + { + "epoch": 2.1932854042322303, + "grad_norm": 4.337413787841797, + "learning_rate": 4.10853775524188e-06, + "loss": 0.1713, + "step": 43741 + }, + { + "epoch": 2.193298969072165, + "grad_norm": 4.730495929718018, + "learning_rate": 4.108400712621626e-06, + "loss": 0.2056, + "step": 43742 + }, + { + "epoch": 2.1933125339121, + "grad_norm": 6.301079273223877, + "learning_rate": 4.10826367000137e-06, + "loss": 0.2728, + "step": 43743 + }, + { + "epoch": 2.193326098752035, + "grad_norm": 5.347896575927734, + "learning_rate": 4.108126627381116e-06, + "loss": 0.1752, + "step": 43744 + }, + { + "epoch": 2.1933396635919697, + "grad_norm": 2.8082430362701416, + "learning_rate": 4.107989584760861e-06, + "loss": 0.1185, + "step": 43745 + }, + { + "epoch": 2.1933532284319046, + "grad_norm": 3.4102091789245605, + "learning_rate": 4.107852542140606e-06, + "loss": 0.1427, + "step": 43746 + }, + { + "epoch": 2.1933667932718395, + "grad_norm": 5.204941749572754, + "learning_rate": 4.107715499520351e-06, + "loss": 0.2216, + "step": 43747 + }, + { + "epoch": 2.1933803581117743, + "grad_norm": 7.267976760864258, + "learning_rate": 4.107578456900096e-06, + "loss": 0.3098, + "step": 43748 + }, + { + "epoch": 2.193393922951709, + "grad_norm": 5.426858425140381, + "learning_rate": 4.107441414279841e-06, + "loss": 0.1859, + "step": 43749 + }, + { + "epoch": 2.193407487791644, + "grad_norm": 4.343398571014404, + "learning_rate": 4.107304371659587e-06, + "loss": 0.1738, + "step": 43750 + }, + { + "epoch": 2.193421052631579, + "grad_norm": 4.858270645141602, + "learning_rate": 4.107167329039332e-06, + "loss": 0.1509, + "step": 43751 + }, + { + "epoch": 2.193434617471514, + "grad_norm": 4.30536413192749, + "learning_rate": 4.107030286419076e-06, + "loss": 0.1745, + "step": 43752 + }, + { + "epoch": 2.1934481823114487, + "grad_norm": 5.801658630371094, + "learning_rate": 4.106893243798822e-06, + "loss": 0.1984, + "step": 43753 + }, + { + "epoch": 2.1934617471513835, + "grad_norm": 4.683990001678467, + "learning_rate": 4.1067562011785665e-06, + "loss": 0.1478, + "step": 43754 + }, + { + "epoch": 2.1934753119913184, + "grad_norm": 5.156713962554932, + "learning_rate": 4.106619158558312e-06, + "loss": 0.2326, + "step": 43755 + }, + { + "epoch": 2.1934888768312533, + "grad_norm": 4.6294355392456055, + "learning_rate": 4.106482115938057e-06, + "loss": 0.1603, + "step": 43756 + }, + { + "epoch": 2.193502441671188, + "grad_norm": 5.812273025512695, + "learning_rate": 4.106345073317802e-06, + "loss": 0.2454, + "step": 43757 + }, + { + "epoch": 2.193516006511123, + "grad_norm": 5.3706841468811035, + "learning_rate": 4.106208030697547e-06, + "loss": 0.1422, + "step": 43758 + }, + { + "epoch": 2.193529571351058, + "grad_norm": 4.319747447967529, + "learning_rate": 4.106070988077292e-06, + "loss": 0.2289, + "step": 43759 + }, + { + "epoch": 2.1935431361909927, + "grad_norm": 3.565133571624756, + "learning_rate": 4.1059339454570375e-06, + "loss": 0.1223, + "step": 43760 + }, + { + "epoch": 2.193556701030928, + "grad_norm": 3.2114076614379883, + "learning_rate": 4.105796902836783e-06, + "loss": 0.1421, + "step": 43761 + }, + { + "epoch": 2.193570265870863, + "grad_norm": 4.300050258636475, + "learning_rate": 4.105659860216528e-06, + "loss": 0.2703, + "step": 43762 + }, + { + "epoch": 2.1935838307107978, + "grad_norm": 4.785367488861084, + "learning_rate": 4.105522817596273e-06, + "loss": 0.1943, + "step": 43763 + }, + { + "epoch": 2.1935973955507326, + "grad_norm": 4.004266738891602, + "learning_rate": 4.105385774976018e-06, + "loss": 0.1641, + "step": 43764 + }, + { + "epoch": 2.1936109603906675, + "grad_norm": 5.246875286102295, + "learning_rate": 4.1052487323557626e-06, + "loss": 0.2412, + "step": 43765 + }, + { + "epoch": 2.1936245252306024, + "grad_norm": 4.239105701446533, + "learning_rate": 4.105111689735508e-06, + "loss": 0.1946, + "step": 43766 + }, + { + "epoch": 2.1936380900705372, + "grad_norm": 3.6311147212982178, + "learning_rate": 4.104974647115253e-06, + "loss": 0.1548, + "step": 43767 + }, + { + "epoch": 2.193651654910472, + "grad_norm": 4.497342586517334, + "learning_rate": 4.104837604494998e-06, + "loss": 0.2099, + "step": 43768 + }, + { + "epoch": 2.193665219750407, + "grad_norm": 5.091672420501709, + "learning_rate": 4.104700561874743e-06, + "loss": 0.2476, + "step": 43769 + }, + { + "epoch": 2.193678784590342, + "grad_norm": 4.758875846862793, + "learning_rate": 4.1045635192544885e-06, + "loss": 0.1599, + "step": 43770 + }, + { + "epoch": 2.1936923494302767, + "grad_norm": 4.336381435394287, + "learning_rate": 4.104426476634234e-06, + "loss": 0.1684, + "step": 43771 + }, + { + "epoch": 2.1937059142702116, + "grad_norm": 4.805108070373535, + "learning_rate": 4.104289434013979e-06, + "loss": 0.2544, + "step": 43772 + }, + { + "epoch": 2.1937194791101464, + "grad_norm": 4.098731517791748, + "learning_rate": 4.104152391393724e-06, + "loss": 0.2021, + "step": 43773 + }, + { + "epoch": 2.1937330439500813, + "grad_norm": 4.169576644897461, + "learning_rate": 4.104015348773469e-06, + "loss": 0.1436, + "step": 43774 + }, + { + "epoch": 2.193746608790016, + "grad_norm": 3.9123284816741943, + "learning_rate": 4.1038783061532135e-06, + "loss": 0.1503, + "step": 43775 + }, + { + "epoch": 2.193760173629951, + "grad_norm": 6.025139808654785, + "learning_rate": 4.1037412635329595e-06, + "loss": 0.2588, + "step": 43776 + }, + { + "epoch": 2.193773738469886, + "grad_norm": 4.805819511413574, + "learning_rate": 4.103604220912704e-06, + "loss": 0.1536, + "step": 43777 + }, + { + "epoch": 2.1937873033098207, + "grad_norm": 2.9909114837646484, + "learning_rate": 4.10346717829245e-06, + "loss": 0.1208, + "step": 43778 + }, + { + "epoch": 2.193800868149756, + "grad_norm": 3.7540693283081055, + "learning_rate": 4.103330135672194e-06, + "loss": 0.1787, + "step": 43779 + }, + { + "epoch": 2.193814432989691, + "grad_norm": 4.432728290557861, + "learning_rate": 4.103193093051939e-06, + "loss": 0.0985, + "step": 43780 + }, + { + "epoch": 2.193827997829626, + "grad_norm": 6.731103420257568, + "learning_rate": 4.103056050431685e-06, + "loss": 0.2149, + "step": 43781 + }, + { + "epoch": 2.1938415626695607, + "grad_norm": 4.6953959465026855, + "learning_rate": 4.10291900781143e-06, + "loss": 0.2402, + "step": 43782 + }, + { + "epoch": 2.1938551275094955, + "grad_norm": 4.92643928527832, + "learning_rate": 4.102781965191175e-06, + "loss": 0.179, + "step": 43783 + }, + { + "epoch": 2.1938686923494304, + "grad_norm": 3.4126553535461426, + "learning_rate": 4.10264492257092e-06, + "loss": 0.1504, + "step": 43784 + }, + { + "epoch": 2.1938822571893652, + "grad_norm": 3.788839101791382, + "learning_rate": 4.102507879950665e-06, + "loss": 0.1296, + "step": 43785 + }, + { + "epoch": 2.1938958220293, + "grad_norm": 5.653036117553711, + "learning_rate": 4.10237083733041e-06, + "loss": 0.2628, + "step": 43786 + }, + { + "epoch": 2.193909386869235, + "grad_norm": 7.7596659660339355, + "learning_rate": 4.102233794710156e-06, + "loss": 0.2545, + "step": 43787 + }, + { + "epoch": 2.19392295170917, + "grad_norm": 7.921628475189209, + "learning_rate": 4.1020967520899e-06, + "loss": 0.366, + "step": 43788 + }, + { + "epoch": 2.1939365165491047, + "grad_norm": 5.88296365737915, + "learning_rate": 4.101959709469646e-06, + "loss": 0.2744, + "step": 43789 + }, + { + "epoch": 2.1939500813890396, + "grad_norm": 5.468423366546631, + "learning_rate": 4.10182266684939e-06, + "loss": 0.2163, + "step": 43790 + }, + { + "epoch": 2.1939636462289744, + "grad_norm": 3.1907169818878174, + "learning_rate": 4.1016856242291355e-06, + "loss": 0.0959, + "step": 43791 + }, + { + "epoch": 2.1939772110689093, + "grad_norm": 5.471850872039795, + "learning_rate": 4.101548581608881e-06, + "loss": 0.1768, + "step": 43792 + }, + { + "epoch": 2.193990775908844, + "grad_norm": 4.978774070739746, + "learning_rate": 4.101411538988626e-06, + "loss": 0.2211, + "step": 43793 + }, + { + "epoch": 2.194004340748779, + "grad_norm": 5.793758392333984, + "learning_rate": 4.101274496368371e-06, + "loss": 0.2394, + "step": 43794 + }, + { + "epoch": 2.194017905588714, + "grad_norm": 4.227866172790527, + "learning_rate": 4.101137453748115e-06, + "loss": 0.2277, + "step": 43795 + }, + { + "epoch": 2.1940314704286488, + "grad_norm": 4.752551555633545, + "learning_rate": 4.1010004111278614e-06, + "loss": 0.1789, + "step": 43796 + }, + { + "epoch": 2.1940450352685836, + "grad_norm": 8.611063957214355, + "learning_rate": 4.100863368507606e-06, + "loss": 0.4347, + "step": 43797 + }, + { + "epoch": 2.1940586001085185, + "grad_norm": 4.779021739959717, + "learning_rate": 4.100726325887352e-06, + "loss": 0.221, + "step": 43798 + }, + { + "epoch": 2.194072164948454, + "grad_norm": 4.440310001373291, + "learning_rate": 4.100589283267096e-06, + "loss": 0.2543, + "step": 43799 + }, + { + "epoch": 2.1940857297883887, + "grad_norm": 5.093035697937012, + "learning_rate": 4.100452240646841e-06, + "loss": 0.272, + "step": 43800 + }, + { + "epoch": 2.1940992946283235, + "grad_norm": 4.454349517822266, + "learning_rate": 4.1003151980265865e-06, + "loss": 0.1132, + "step": 43801 + }, + { + "epoch": 2.1941128594682584, + "grad_norm": 5.049937725067139, + "learning_rate": 4.100178155406332e-06, + "loss": 0.2968, + "step": 43802 + }, + { + "epoch": 2.1941264243081933, + "grad_norm": 4.400862693786621, + "learning_rate": 4.100041112786077e-06, + "loss": 0.2608, + "step": 43803 + }, + { + "epoch": 2.194139989148128, + "grad_norm": 5.44135046005249, + "learning_rate": 4.099904070165822e-06, + "loss": 0.3259, + "step": 43804 + }, + { + "epoch": 2.194153553988063, + "grad_norm": 4.153242588043213, + "learning_rate": 4.099767027545567e-06, + "loss": 0.1711, + "step": 43805 + }, + { + "epoch": 2.194167118827998, + "grad_norm": 4.769242763519287, + "learning_rate": 4.0996299849253115e-06, + "loss": 0.2008, + "step": 43806 + }, + { + "epoch": 2.1941806836679327, + "grad_norm": 4.802552223205566, + "learning_rate": 4.0994929423050575e-06, + "loss": 0.1921, + "step": 43807 + }, + { + "epoch": 2.1941942485078676, + "grad_norm": 5.653146743774414, + "learning_rate": 4.099355899684802e-06, + "loss": 0.1491, + "step": 43808 + }, + { + "epoch": 2.1942078133478025, + "grad_norm": 5.251631259918213, + "learning_rate": 4.099218857064547e-06, + "loss": 0.2029, + "step": 43809 + }, + { + "epoch": 2.1942213781877373, + "grad_norm": 7.052160739898682, + "learning_rate": 4.099081814444292e-06, + "loss": 0.3532, + "step": 43810 + }, + { + "epoch": 2.194234943027672, + "grad_norm": 4.124203205108643, + "learning_rate": 4.098944771824037e-06, + "loss": 0.0915, + "step": 43811 + }, + { + "epoch": 2.194248507867607, + "grad_norm": 5.149755001068115, + "learning_rate": 4.098807729203783e-06, + "loss": 0.2362, + "step": 43812 + }, + { + "epoch": 2.194262072707542, + "grad_norm": 6.482728004455566, + "learning_rate": 4.098670686583528e-06, + "loss": 0.2111, + "step": 43813 + }, + { + "epoch": 2.194275637547477, + "grad_norm": 5.935758590698242, + "learning_rate": 4.098533643963273e-06, + "loss": 0.1506, + "step": 43814 + }, + { + "epoch": 2.1942892023874117, + "grad_norm": 7.614198684692383, + "learning_rate": 4.098396601343018e-06, + "loss": 0.2714, + "step": 43815 + }, + { + "epoch": 2.1943027672273465, + "grad_norm": 4.846184730529785, + "learning_rate": 4.098259558722763e-06, + "loss": 0.1259, + "step": 43816 + }, + { + "epoch": 2.194316332067282, + "grad_norm": 5.598747253417969, + "learning_rate": 4.0981225161025085e-06, + "loss": 0.2468, + "step": 43817 + }, + { + "epoch": 2.1943298969072167, + "grad_norm": 8.098942756652832, + "learning_rate": 4.097985473482254e-06, + "loss": 0.2672, + "step": 43818 + }, + { + "epoch": 2.1943434617471516, + "grad_norm": 5.646481990814209, + "learning_rate": 4.097848430861999e-06, + "loss": 0.1998, + "step": 43819 + }, + { + "epoch": 2.1943570265870864, + "grad_norm": 4.418273448944092, + "learning_rate": 4.097711388241743e-06, + "loss": 0.2316, + "step": 43820 + }, + { + "epoch": 2.1943705914270213, + "grad_norm": 6.034384250640869, + "learning_rate": 4.097574345621488e-06, + "loss": 0.2013, + "step": 43821 + }, + { + "epoch": 2.194384156266956, + "grad_norm": 5.972395420074463, + "learning_rate": 4.0974373030012335e-06, + "loss": 0.2693, + "step": 43822 + }, + { + "epoch": 2.194397721106891, + "grad_norm": 4.777696132659912, + "learning_rate": 4.097300260380979e-06, + "loss": 0.1898, + "step": 43823 + }, + { + "epoch": 2.194411285946826, + "grad_norm": 4.879892349243164, + "learning_rate": 4.097163217760724e-06, + "loss": 0.0955, + "step": 43824 + }, + { + "epoch": 2.1944248507867608, + "grad_norm": 5.2924299240112305, + "learning_rate": 4.097026175140469e-06, + "loss": 0.1879, + "step": 43825 + }, + { + "epoch": 2.1944384156266956, + "grad_norm": 6.767493724822998, + "learning_rate": 4.096889132520214e-06, + "loss": 0.1464, + "step": 43826 + }, + { + "epoch": 2.1944519804666305, + "grad_norm": 3.5707461833953857, + "learning_rate": 4.0967520898999594e-06, + "loss": 0.0998, + "step": 43827 + }, + { + "epoch": 2.1944655453065653, + "grad_norm": 3.491295099258423, + "learning_rate": 4.096615047279705e-06, + "loss": 0.1104, + "step": 43828 + }, + { + "epoch": 2.1944791101465, + "grad_norm": 6.563623428344727, + "learning_rate": 4.096478004659449e-06, + "loss": 0.3028, + "step": 43829 + }, + { + "epoch": 2.194492674986435, + "grad_norm": 4.119425296783447, + "learning_rate": 4.096340962039195e-06, + "loss": 0.1417, + "step": 43830 + }, + { + "epoch": 2.19450623982637, + "grad_norm": 5.62664794921875, + "learning_rate": 4.096203919418939e-06, + "loss": 0.1302, + "step": 43831 + }, + { + "epoch": 2.194519804666305, + "grad_norm": 3.828545093536377, + "learning_rate": 4.096066876798685e-06, + "loss": 0.094, + "step": 43832 + }, + { + "epoch": 2.1945333695062397, + "grad_norm": 4.250943183898926, + "learning_rate": 4.09592983417843e-06, + "loss": 0.1371, + "step": 43833 + }, + { + "epoch": 2.1945469343461745, + "grad_norm": 3.8482601642608643, + "learning_rate": 4.095792791558175e-06, + "loss": 0.1422, + "step": 43834 + }, + { + "epoch": 2.1945604991861094, + "grad_norm": 4.515135765075684, + "learning_rate": 4.09565574893792e-06, + "loss": 0.1497, + "step": 43835 + }, + { + "epoch": 2.1945740640260443, + "grad_norm": 2.8135759830474854, + "learning_rate": 4.095518706317665e-06, + "loss": 0.0715, + "step": 43836 + }, + { + "epoch": 2.1945876288659796, + "grad_norm": 4.9838080406188965, + "learning_rate": 4.09538166369741e-06, + "loss": 0.3035, + "step": 43837 + }, + { + "epoch": 2.1946011937059144, + "grad_norm": 4.526679515838623, + "learning_rate": 4.0952446210771556e-06, + "loss": 0.201, + "step": 43838 + }, + { + "epoch": 2.1946147585458493, + "grad_norm": 5.11272668838501, + "learning_rate": 4.095107578456901e-06, + "loss": 0.1645, + "step": 43839 + }, + { + "epoch": 2.194628323385784, + "grad_norm": 5.330601692199707, + "learning_rate": 4.094970535836645e-06, + "loss": 0.1557, + "step": 43840 + }, + { + "epoch": 2.194641888225719, + "grad_norm": 5.682216644287109, + "learning_rate": 4.094833493216391e-06, + "loss": 0.1706, + "step": 43841 + }, + { + "epoch": 2.194655453065654, + "grad_norm": 5.8584184646606445, + "learning_rate": 4.094696450596135e-06, + "loss": 0.1515, + "step": 43842 + }, + { + "epoch": 2.1946690179055888, + "grad_norm": 5.731082916259766, + "learning_rate": 4.0945594079758814e-06, + "loss": 0.2494, + "step": 43843 + }, + { + "epoch": 2.1946825827455236, + "grad_norm": 3.622772693634033, + "learning_rate": 4.094422365355626e-06, + "loss": 0.0966, + "step": 43844 + }, + { + "epoch": 2.1946961475854585, + "grad_norm": 4.467768669128418, + "learning_rate": 4.094285322735371e-06, + "loss": 0.1445, + "step": 43845 + }, + { + "epoch": 2.1947097124253934, + "grad_norm": 5.506405830383301, + "learning_rate": 4.094148280115116e-06, + "loss": 0.18, + "step": 43846 + }, + { + "epoch": 2.1947232772653282, + "grad_norm": 6.2150559425354, + "learning_rate": 4.094011237494861e-06, + "loss": 0.2534, + "step": 43847 + }, + { + "epoch": 2.194736842105263, + "grad_norm": 3.995743751525879, + "learning_rate": 4.0938741948746065e-06, + "loss": 0.199, + "step": 43848 + }, + { + "epoch": 2.194750406945198, + "grad_norm": 5.915886402130127, + "learning_rate": 4.093737152254351e-06, + "loss": 0.2341, + "step": 43849 + }, + { + "epoch": 2.194763971785133, + "grad_norm": 4.040458679199219, + "learning_rate": 4.093600109634097e-06, + "loss": 0.081, + "step": 43850 + }, + { + "epoch": 2.1947775366250677, + "grad_norm": 6.269663333892822, + "learning_rate": 4.093463067013841e-06, + "loss": 0.1615, + "step": 43851 + }, + { + "epoch": 2.1947911014650026, + "grad_norm": 4.381087303161621, + "learning_rate": 4.093326024393587e-06, + "loss": 0.1738, + "step": 43852 + }, + { + "epoch": 2.1948046663049374, + "grad_norm": 5.2263569831848145, + "learning_rate": 4.0931889817733315e-06, + "loss": 0.1987, + "step": 43853 + }, + { + "epoch": 2.1948182311448723, + "grad_norm": 5.021490573883057, + "learning_rate": 4.093051939153077e-06, + "loss": 0.1419, + "step": 43854 + }, + { + "epoch": 2.1948317959848076, + "grad_norm": 5.974730968475342, + "learning_rate": 4.092914896532822e-06, + "loss": 0.1517, + "step": 43855 + }, + { + "epoch": 2.1948453608247425, + "grad_norm": 5.669755458831787, + "learning_rate": 4.092777853912567e-06, + "loss": 0.2317, + "step": 43856 + }, + { + "epoch": 2.1948589256646773, + "grad_norm": 5.98815393447876, + "learning_rate": 4.092640811292312e-06, + "loss": 0.3204, + "step": 43857 + }, + { + "epoch": 2.194872490504612, + "grad_norm": 6.243668079376221, + "learning_rate": 4.0925037686720574e-06, + "loss": 0.2102, + "step": 43858 + }, + { + "epoch": 2.194886055344547, + "grad_norm": 5.265581130981445, + "learning_rate": 4.092366726051803e-06, + "loss": 0.2093, + "step": 43859 + }, + { + "epoch": 2.194899620184482, + "grad_norm": 5.252496719360352, + "learning_rate": 4.092229683431548e-06, + "loss": 0.1871, + "step": 43860 + }, + { + "epoch": 2.194913185024417, + "grad_norm": 5.1494245529174805, + "learning_rate": 4.092092640811293e-06, + "loss": 0.1704, + "step": 43861 + }, + { + "epoch": 2.1949267498643517, + "grad_norm": 7.14439582824707, + "learning_rate": 4.091955598191037e-06, + "loss": 0.2884, + "step": 43862 + }, + { + "epoch": 2.1949403147042865, + "grad_norm": 4.176238536834717, + "learning_rate": 4.0918185555707825e-06, + "loss": 0.1642, + "step": 43863 + }, + { + "epoch": 2.1949538795442214, + "grad_norm": 6.409376621246338, + "learning_rate": 4.091681512950528e-06, + "loss": 0.163, + "step": 43864 + }, + { + "epoch": 2.1949674443841563, + "grad_norm": 3.6667110919952393, + "learning_rate": 4.091544470330273e-06, + "loss": 0.1293, + "step": 43865 + }, + { + "epoch": 2.194981009224091, + "grad_norm": 5.073285102844238, + "learning_rate": 4.091407427710018e-06, + "loss": 0.1726, + "step": 43866 + }, + { + "epoch": 2.194994574064026, + "grad_norm": 5.31858491897583, + "learning_rate": 4.091270385089763e-06, + "loss": 0.1441, + "step": 43867 + }, + { + "epoch": 2.195008138903961, + "grad_norm": 4.673939228057861, + "learning_rate": 4.091133342469508e-06, + "loss": 0.1408, + "step": 43868 + }, + { + "epoch": 2.1950217037438957, + "grad_norm": 8.222454071044922, + "learning_rate": 4.0909962998492536e-06, + "loss": 0.2512, + "step": 43869 + }, + { + "epoch": 2.1950352685838306, + "grad_norm": 6.573061466217041, + "learning_rate": 4.090859257228999e-06, + "loss": 0.2769, + "step": 43870 + }, + { + "epoch": 2.1950488334237654, + "grad_norm": 9.131486892700195, + "learning_rate": 4.090722214608744e-06, + "loss": 0.3051, + "step": 43871 + }, + { + "epoch": 2.1950623982637003, + "grad_norm": 4.596505641937256, + "learning_rate": 4.090585171988489e-06, + "loss": 0.2707, + "step": 43872 + }, + { + "epoch": 2.195075963103635, + "grad_norm": 4.218550205230713, + "learning_rate": 4.090448129368234e-06, + "loss": 0.2179, + "step": 43873 + }, + { + "epoch": 2.19508952794357, + "grad_norm": 4.642986297607422, + "learning_rate": 4.090311086747979e-06, + "loss": 0.107, + "step": 43874 + }, + { + "epoch": 2.1951030927835053, + "grad_norm": 5.039000988006592, + "learning_rate": 4.090174044127724e-06, + "loss": 0.155, + "step": 43875 + }, + { + "epoch": 2.19511665762344, + "grad_norm": 5.921280860900879, + "learning_rate": 4.090037001507469e-06, + "loss": 0.2175, + "step": 43876 + }, + { + "epoch": 2.195130222463375, + "grad_norm": 4.020207405090332, + "learning_rate": 4.089899958887214e-06, + "loss": 0.1343, + "step": 43877 + }, + { + "epoch": 2.19514378730331, + "grad_norm": 6.169018268585205, + "learning_rate": 4.089762916266959e-06, + "loss": 0.2869, + "step": 43878 + }, + { + "epoch": 2.195157352143245, + "grad_norm": 9.978346824645996, + "learning_rate": 4.0896258736467045e-06, + "loss": 0.566, + "step": 43879 + }, + { + "epoch": 2.1951709169831797, + "grad_norm": 4.667206287384033, + "learning_rate": 4.08948883102645e-06, + "loss": 0.1625, + "step": 43880 + }, + { + "epoch": 2.1951844818231145, + "grad_norm": 4.821290016174316, + "learning_rate": 4.089351788406195e-06, + "loss": 0.2054, + "step": 43881 + }, + { + "epoch": 2.1951980466630494, + "grad_norm": 5.965879917144775, + "learning_rate": 4.08921474578594e-06, + "loss": 0.3811, + "step": 43882 + }, + { + "epoch": 2.1952116115029843, + "grad_norm": 5.81580924987793, + "learning_rate": 4.089077703165684e-06, + "loss": 0.1766, + "step": 43883 + }, + { + "epoch": 2.195225176342919, + "grad_norm": 6.5566229820251465, + "learning_rate": 4.08894066054543e-06, + "loss": 0.229, + "step": 43884 + }, + { + "epoch": 2.195238741182854, + "grad_norm": 4.44732141494751, + "learning_rate": 4.088803617925175e-06, + "loss": 0.2171, + "step": 43885 + }, + { + "epoch": 2.195252306022789, + "grad_norm": 5.959856033325195, + "learning_rate": 4.088666575304921e-06, + "loss": 0.2005, + "step": 43886 + }, + { + "epoch": 2.1952658708627237, + "grad_norm": 5.053309440612793, + "learning_rate": 4.088529532684665e-06, + "loss": 0.2157, + "step": 43887 + }, + { + "epoch": 2.1952794357026586, + "grad_norm": 7.509259223937988, + "learning_rate": 4.08839249006441e-06, + "loss": 0.3577, + "step": 43888 + }, + { + "epoch": 2.1952930005425935, + "grad_norm": 4.3989739418029785, + "learning_rate": 4.0882554474441554e-06, + "loss": 0.1544, + "step": 43889 + }, + { + "epoch": 2.1953065653825283, + "grad_norm": 4.368569374084473, + "learning_rate": 4.088118404823901e-06, + "loss": 0.1809, + "step": 43890 + }, + { + "epoch": 2.195320130222463, + "grad_norm": 6.12044620513916, + "learning_rate": 4.087981362203646e-06, + "loss": 0.3022, + "step": 43891 + }, + { + "epoch": 2.195333695062398, + "grad_norm": 6.291104316711426, + "learning_rate": 4.087844319583391e-06, + "loss": 0.3014, + "step": 43892 + }, + { + "epoch": 2.1953472599023334, + "grad_norm": 5.123579502105713, + "learning_rate": 4.087707276963136e-06, + "loss": 0.284, + "step": 43893 + }, + { + "epoch": 2.1953608247422682, + "grad_norm": 6.616691589355469, + "learning_rate": 4.0875702343428805e-06, + "loss": 0.3554, + "step": 43894 + }, + { + "epoch": 2.195374389582203, + "grad_norm": 6.419832229614258, + "learning_rate": 4.0874331917226265e-06, + "loss": 0.29, + "step": 43895 + }, + { + "epoch": 2.195387954422138, + "grad_norm": 4.873196125030518, + "learning_rate": 4.087296149102371e-06, + "loss": 0.2783, + "step": 43896 + }, + { + "epoch": 2.195401519262073, + "grad_norm": 4.346107482910156, + "learning_rate": 4.087159106482116e-06, + "loss": 0.1735, + "step": 43897 + }, + { + "epoch": 2.1954150841020077, + "grad_norm": 6.6065993309021, + "learning_rate": 4.087022063861861e-06, + "loss": 0.2308, + "step": 43898 + }, + { + "epoch": 2.1954286489419426, + "grad_norm": 5.68745231628418, + "learning_rate": 4.086885021241606e-06, + "loss": 0.341, + "step": 43899 + }, + { + "epoch": 2.1954422137818774, + "grad_norm": 4.933530330657959, + "learning_rate": 4.0867479786213516e-06, + "loss": 0.1637, + "step": 43900 + }, + { + "epoch": 2.1954557786218123, + "grad_norm": 4.8574042320251465, + "learning_rate": 4.086610936001097e-06, + "loss": 0.2109, + "step": 43901 + }, + { + "epoch": 2.195469343461747, + "grad_norm": 5.017123222351074, + "learning_rate": 4.086473893380842e-06, + "loss": 0.2936, + "step": 43902 + }, + { + "epoch": 2.195482908301682, + "grad_norm": 4.061411380767822, + "learning_rate": 4.086336850760586e-06, + "loss": 0.1821, + "step": 43903 + }, + { + "epoch": 2.195496473141617, + "grad_norm": 5.91158390045166, + "learning_rate": 4.086199808140332e-06, + "loss": 0.4076, + "step": 43904 + }, + { + "epoch": 2.1955100379815518, + "grad_norm": 5.135031223297119, + "learning_rate": 4.086062765520077e-06, + "loss": 0.2806, + "step": 43905 + }, + { + "epoch": 2.1955236028214866, + "grad_norm": 4.670094013214111, + "learning_rate": 4.085925722899823e-06, + "loss": 0.2291, + "step": 43906 + }, + { + "epoch": 2.1955371676614215, + "grad_norm": 4.2434401512146, + "learning_rate": 4.085788680279567e-06, + "loss": 0.1456, + "step": 43907 + }, + { + "epoch": 2.1955507325013564, + "grad_norm": 6.8543782234191895, + "learning_rate": 4.085651637659312e-06, + "loss": 0.32, + "step": 43908 + }, + { + "epoch": 2.195564297341291, + "grad_norm": 5.933045387268066, + "learning_rate": 4.085514595039057e-06, + "loss": 0.2114, + "step": 43909 + }, + { + "epoch": 2.195577862181226, + "grad_norm": 6.089057445526123, + "learning_rate": 4.0853775524188025e-06, + "loss": 0.2002, + "step": 43910 + }, + { + "epoch": 2.195591427021161, + "grad_norm": 3.688410997390747, + "learning_rate": 4.085240509798548e-06, + "loss": 0.2065, + "step": 43911 + }, + { + "epoch": 2.195604991861096, + "grad_norm": 6.318342208862305, + "learning_rate": 4.085103467178293e-06, + "loss": 0.2624, + "step": 43912 + }, + { + "epoch": 2.195618556701031, + "grad_norm": 12.020893096923828, + "learning_rate": 4.084966424558038e-06, + "loss": 0.3092, + "step": 43913 + }, + { + "epoch": 2.195632121540966, + "grad_norm": 4.471773624420166, + "learning_rate": 4.084829381937783e-06, + "loss": 0.2197, + "step": 43914 + }, + { + "epoch": 2.195645686380901, + "grad_norm": 4.019205570220947, + "learning_rate": 4.084692339317528e-06, + "loss": 0.2295, + "step": 43915 + }, + { + "epoch": 2.1956592512208357, + "grad_norm": 4.234813690185547, + "learning_rate": 4.084555296697273e-06, + "loss": 0.187, + "step": 43916 + }, + { + "epoch": 2.1956728160607706, + "grad_norm": 5.6364970207214355, + "learning_rate": 4.084418254077018e-06, + "loss": 0.2709, + "step": 43917 + }, + { + "epoch": 2.1956863809007054, + "grad_norm": 4.566650390625, + "learning_rate": 4.084281211456763e-06, + "loss": 0.2908, + "step": 43918 + }, + { + "epoch": 2.1956999457406403, + "grad_norm": 6.016539573669434, + "learning_rate": 4.084144168836508e-06, + "loss": 0.2697, + "step": 43919 + }, + { + "epoch": 2.195713510580575, + "grad_norm": 4.6534223556518555, + "learning_rate": 4.0840071262162534e-06, + "loss": 0.1903, + "step": 43920 + }, + { + "epoch": 2.19572707542051, + "grad_norm": 5.571472644805908, + "learning_rate": 4.083870083595999e-06, + "loss": 0.1955, + "step": 43921 + }, + { + "epoch": 2.195740640260445, + "grad_norm": 5.589174270629883, + "learning_rate": 4.083733040975744e-06, + "loss": 0.2732, + "step": 43922 + }, + { + "epoch": 2.1957542051003798, + "grad_norm": 6.410364627838135, + "learning_rate": 4.083595998355489e-06, + "loss": 0.3013, + "step": 43923 + }, + { + "epoch": 2.1957677699403146, + "grad_norm": 4.212255954742432, + "learning_rate": 4.083458955735234e-06, + "loss": 0.1706, + "step": 43924 + }, + { + "epoch": 2.1957813347802495, + "grad_norm": 5.106890678405762, + "learning_rate": 4.083321913114979e-06, + "loss": 0.23, + "step": 43925 + }, + { + "epoch": 2.1957948996201844, + "grad_norm": 4.072723865509033, + "learning_rate": 4.0831848704947245e-06, + "loss": 0.1841, + "step": 43926 + }, + { + "epoch": 2.1958084644601192, + "grad_norm": 4.134668827056885, + "learning_rate": 4.08304782787447e-06, + "loss": 0.1333, + "step": 43927 + }, + { + "epoch": 2.195822029300054, + "grad_norm": 5.886286735534668, + "learning_rate": 4.082910785254214e-06, + "loss": 0.2993, + "step": 43928 + }, + { + "epoch": 2.195835594139989, + "grad_norm": 5.562561511993408, + "learning_rate": 4.08277374263396e-06, + "loss": 0.2487, + "step": 43929 + }, + { + "epoch": 2.195849158979924, + "grad_norm": 3.5078959465026855, + "learning_rate": 4.082636700013704e-06, + "loss": 0.1788, + "step": 43930 + }, + { + "epoch": 2.195862723819859, + "grad_norm": 4.587540626525879, + "learning_rate": 4.0824996573934496e-06, + "loss": 0.1564, + "step": 43931 + }, + { + "epoch": 2.195876288659794, + "grad_norm": 4.7075395584106445, + "learning_rate": 4.082362614773195e-06, + "loss": 0.1924, + "step": 43932 + }, + { + "epoch": 2.195889853499729, + "grad_norm": 5.0152974128723145, + "learning_rate": 4.08222557215294e-06, + "loss": 0.2493, + "step": 43933 + }, + { + "epoch": 2.1959034183396637, + "grad_norm": 5.889767646789551, + "learning_rate": 4.082088529532685e-06, + "loss": 0.3064, + "step": 43934 + }, + { + "epoch": 2.1959169831795986, + "grad_norm": 6.991335391998291, + "learning_rate": 4.08195148691243e-06, + "loss": 0.4447, + "step": 43935 + }, + { + "epoch": 2.1959305480195335, + "grad_norm": 5.099526882171631, + "learning_rate": 4.0818144442921755e-06, + "loss": 0.2706, + "step": 43936 + }, + { + "epoch": 2.1959441128594683, + "grad_norm": 4.904808044433594, + "learning_rate": 4.08167740167192e-06, + "loss": 0.2163, + "step": 43937 + }, + { + "epoch": 2.195957677699403, + "grad_norm": 5.929512977600098, + "learning_rate": 4.081540359051666e-06, + "loss": 0.2633, + "step": 43938 + }, + { + "epoch": 2.195971242539338, + "grad_norm": 5.893082618713379, + "learning_rate": 4.08140331643141e-06, + "loss": 0.2413, + "step": 43939 + }, + { + "epoch": 2.195984807379273, + "grad_norm": 4.551662921905518, + "learning_rate": 4.081266273811156e-06, + "loss": 0.2789, + "step": 43940 + }, + { + "epoch": 2.195998372219208, + "grad_norm": 5.031886577606201, + "learning_rate": 4.0811292311909005e-06, + "loss": 0.2897, + "step": 43941 + }, + { + "epoch": 2.1960119370591427, + "grad_norm": 4.152824401855469, + "learning_rate": 4.080992188570646e-06, + "loss": 0.2321, + "step": 43942 + }, + { + "epoch": 2.1960255018990775, + "grad_norm": 3.9013748168945312, + "learning_rate": 4.080855145950391e-06, + "loss": 0.2237, + "step": 43943 + }, + { + "epoch": 2.1960390667390124, + "grad_norm": 4.913754940032959, + "learning_rate": 4.080718103330136e-06, + "loss": 0.2652, + "step": 43944 + }, + { + "epoch": 2.1960526315789473, + "grad_norm": 5.294415473937988, + "learning_rate": 4.080581060709881e-06, + "loss": 0.3075, + "step": 43945 + }, + { + "epoch": 2.196066196418882, + "grad_norm": 6.2696661949157715, + "learning_rate": 4.080444018089626e-06, + "loss": 0.2774, + "step": 43946 + }, + { + "epoch": 2.196079761258817, + "grad_norm": 5.815528869628906, + "learning_rate": 4.080306975469372e-06, + "loss": 0.294, + "step": 43947 + }, + { + "epoch": 2.196093326098752, + "grad_norm": 5.686112403869629, + "learning_rate": 4.080169932849116e-06, + "loss": 0.1867, + "step": 43948 + }, + { + "epoch": 2.1961068909386867, + "grad_norm": 5.167359352111816, + "learning_rate": 4.080032890228862e-06, + "loss": 0.1668, + "step": 43949 + }, + { + "epoch": 2.1961204557786216, + "grad_norm": 5.334094047546387, + "learning_rate": 4.079895847608606e-06, + "loss": 0.2612, + "step": 43950 + }, + { + "epoch": 2.196134020618557, + "grad_norm": 4.685885429382324, + "learning_rate": 4.0797588049883514e-06, + "loss": 0.2457, + "step": 43951 + }, + { + "epoch": 2.1961475854584918, + "grad_norm": 5.278240203857422, + "learning_rate": 4.079621762368097e-06, + "loss": 0.3077, + "step": 43952 + }, + { + "epoch": 2.1961611502984266, + "grad_norm": 6.825748443603516, + "learning_rate": 4.079484719747842e-06, + "loss": 0.1817, + "step": 43953 + }, + { + "epoch": 2.1961747151383615, + "grad_norm": 6.946939945220947, + "learning_rate": 4.079347677127587e-06, + "loss": 0.2713, + "step": 43954 + }, + { + "epoch": 2.1961882799782964, + "grad_norm": 4.008724212646484, + "learning_rate": 4.079210634507332e-06, + "loss": 0.2538, + "step": 43955 + }, + { + "epoch": 2.196201844818231, + "grad_norm": 3.9165356159210205, + "learning_rate": 4.079073591887077e-06, + "loss": 0.1764, + "step": 43956 + }, + { + "epoch": 2.196215409658166, + "grad_norm": 5.674745559692383, + "learning_rate": 4.0789365492668225e-06, + "loss": 0.3239, + "step": 43957 + }, + { + "epoch": 2.196228974498101, + "grad_norm": 4.581194877624512, + "learning_rate": 4.078799506646568e-06, + "loss": 0.1735, + "step": 43958 + }, + { + "epoch": 2.196242539338036, + "grad_norm": 5.041153907775879, + "learning_rate": 4.078662464026312e-06, + "loss": 0.2325, + "step": 43959 + }, + { + "epoch": 2.1962561041779707, + "grad_norm": 5.315264701843262, + "learning_rate": 4.078525421406058e-06, + "loss": 0.1242, + "step": 43960 + }, + { + "epoch": 2.1962696690179055, + "grad_norm": 4.813866138458252, + "learning_rate": 4.078388378785802e-06, + "loss": 0.2152, + "step": 43961 + }, + { + "epoch": 2.1962832338578404, + "grad_norm": 3.56976580619812, + "learning_rate": 4.0782513361655476e-06, + "loss": 0.0879, + "step": 43962 + }, + { + "epoch": 2.1962967986977753, + "grad_norm": 4.546311855316162, + "learning_rate": 4.078114293545293e-06, + "loss": 0.1969, + "step": 43963 + }, + { + "epoch": 2.19631036353771, + "grad_norm": 4.101563453674316, + "learning_rate": 4.077977250925038e-06, + "loss": 0.1033, + "step": 43964 + }, + { + "epoch": 2.196323928377645, + "grad_norm": 4.562466621398926, + "learning_rate": 4.077840208304783e-06, + "loss": 0.1928, + "step": 43965 + }, + { + "epoch": 2.19633749321758, + "grad_norm": 5.5636749267578125, + "learning_rate": 4.077703165684528e-06, + "loss": 0.159, + "step": 43966 + }, + { + "epoch": 2.1963510580575147, + "grad_norm": 5.049433708190918, + "learning_rate": 4.0775661230642735e-06, + "loss": 0.3361, + "step": 43967 + }, + { + "epoch": 2.1963646228974496, + "grad_norm": 4.453437328338623, + "learning_rate": 4.077429080444019e-06, + "loss": 0.1816, + "step": 43968 + }, + { + "epoch": 2.196378187737385, + "grad_norm": 6.115437030792236, + "learning_rate": 4.077292037823764e-06, + "loss": 0.2263, + "step": 43969 + }, + { + "epoch": 2.19639175257732, + "grad_norm": 4.257981777191162, + "learning_rate": 4.077154995203509e-06, + "loss": 0.1268, + "step": 43970 + }, + { + "epoch": 2.1964053174172546, + "grad_norm": 5.090757846832275, + "learning_rate": 4.077017952583253e-06, + "loss": 0.174, + "step": 43971 + }, + { + "epoch": 2.1964188822571895, + "grad_norm": 5.55101203918457, + "learning_rate": 4.0768809099629985e-06, + "loss": 0.2022, + "step": 43972 + }, + { + "epoch": 2.1964324470971244, + "grad_norm": 3.5372893810272217, + "learning_rate": 4.076743867342744e-06, + "loss": 0.1103, + "step": 43973 + }, + { + "epoch": 2.1964460119370592, + "grad_norm": 5.027104377746582, + "learning_rate": 4.076606824722489e-06, + "loss": 0.2755, + "step": 43974 + }, + { + "epoch": 2.196459576776994, + "grad_norm": 4.927038192749023, + "learning_rate": 4.076469782102234e-06, + "loss": 0.2364, + "step": 43975 + }, + { + "epoch": 2.196473141616929, + "grad_norm": 4.454240322113037, + "learning_rate": 4.076332739481979e-06, + "loss": 0.2219, + "step": 43976 + }, + { + "epoch": 2.196486706456864, + "grad_norm": 3.6826694011688232, + "learning_rate": 4.076195696861724e-06, + "loss": 0.0532, + "step": 43977 + }, + { + "epoch": 2.1965002712967987, + "grad_norm": 5.751676082611084, + "learning_rate": 4.07605865424147e-06, + "loss": 0.1849, + "step": 43978 + }, + { + "epoch": 2.1965138361367336, + "grad_norm": 5.799069404602051, + "learning_rate": 4.075921611621215e-06, + "loss": 0.3063, + "step": 43979 + }, + { + "epoch": 2.1965274009766684, + "grad_norm": 4.647473335266113, + "learning_rate": 4.07578456900096e-06, + "loss": 0.2525, + "step": 43980 + }, + { + "epoch": 2.1965409658166033, + "grad_norm": 3.4658267498016357, + "learning_rate": 4.075647526380705e-06, + "loss": 0.1258, + "step": 43981 + }, + { + "epoch": 2.196554530656538, + "grad_norm": 4.967123508453369, + "learning_rate": 4.0755104837604495e-06, + "loss": 0.1736, + "step": 43982 + }, + { + "epoch": 2.196568095496473, + "grad_norm": 4.31001091003418, + "learning_rate": 4.0753734411401955e-06, + "loss": 0.2552, + "step": 43983 + }, + { + "epoch": 2.196581660336408, + "grad_norm": 4.978755474090576, + "learning_rate": 4.07523639851994e-06, + "loss": 0.1805, + "step": 43984 + }, + { + "epoch": 2.1965952251763428, + "grad_norm": 3.3233227729797363, + "learning_rate": 4.075099355899685e-06, + "loss": 0.1736, + "step": 43985 + }, + { + "epoch": 2.1966087900162776, + "grad_norm": 5.014573574066162, + "learning_rate": 4.07496231327943e-06, + "loss": 0.2077, + "step": 43986 + }, + { + "epoch": 2.1966223548562125, + "grad_norm": 5.541055202484131, + "learning_rate": 4.074825270659175e-06, + "loss": 0.1929, + "step": 43987 + }, + { + "epoch": 2.1966359196961474, + "grad_norm": 4.209862232208252, + "learning_rate": 4.0746882280389205e-06, + "loss": 0.1801, + "step": 43988 + }, + { + "epoch": 2.1966494845360827, + "grad_norm": 4.651864051818848, + "learning_rate": 4.074551185418666e-06, + "loss": 0.1246, + "step": 43989 + }, + { + "epoch": 2.1966630493760175, + "grad_norm": 5.341027736663818, + "learning_rate": 4.074414142798411e-06, + "loss": 0.2143, + "step": 43990 + }, + { + "epoch": 2.1966766142159524, + "grad_norm": 3.643200159072876, + "learning_rate": 4.074277100178155e-06, + "loss": 0.1702, + "step": 43991 + }, + { + "epoch": 2.1966901790558873, + "grad_norm": 4.266456604003906, + "learning_rate": 4.074140057557901e-06, + "loss": 0.1537, + "step": 43992 + }, + { + "epoch": 2.196703743895822, + "grad_norm": 6.681262493133545, + "learning_rate": 4.0740030149376456e-06, + "loss": 0.2054, + "step": 43993 + }, + { + "epoch": 2.196717308735757, + "grad_norm": 4.506824493408203, + "learning_rate": 4.073865972317392e-06, + "loss": 0.1706, + "step": 43994 + }, + { + "epoch": 2.196730873575692, + "grad_norm": 6.578847885131836, + "learning_rate": 4.073728929697136e-06, + "loss": 0.2085, + "step": 43995 + }, + { + "epoch": 2.1967444384156267, + "grad_norm": 4.144859313964844, + "learning_rate": 4.073591887076881e-06, + "loss": 0.1241, + "step": 43996 + }, + { + "epoch": 2.1967580032555616, + "grad_norm": 3.3987174034118652, + "learning_rate": 4.073454844456626e-06, + "loss": 0.1188, + "step": 43997 + }, + { + "epoch": 2.1967715680954965, + "grad_norm": 4.8620991706848145, + "learning_rate": 4.0733178018363715e-06, + "loss": 0.1426, + "step": 43998 + }, + { + "epoch": 2.1967851329354313, + "grad_norm": 4.566596984863281, + "learning_rate": 4.073180759216117e-06, + "loss": 0.1802, + "step": 43999 + }, + { + "epoch": 2.196798697775366, + "grad_norm": 5.800639629364014, + "learning_rate": 4.073043716595861e-06, + "loss": 0.2169, + "step": 44000 + }, + { + "epoch": 2.196812262615301, + "grad_norm": 5.480759143829346, + "learning_rate": 4.072906673975607e-06, + "loss": 0.1789, + "step": 44001 + }, + { + "epoch": 2.196825827455236, + "grad_norm": 2.861345052719116, + "learning_rate": 4.072769631355351e-06, + "loss": 0.0742, + "step": 44002 + }, + { + "epoch": 2.196839392295171, + "grad_norm": 4.329098224639893, + "learning_rate": 4.072632588735097e-06, + "loss": 0.1256, + "step": 44003 + }, + { + "epoch": 2.1968529571351056, + "grad_norm": 5.033441066741943, + "learning_rate": 4.072495546114842e-06, + "loss": 0.2332, + "step": 44004 + }, + { + "epoch": 2.1968665219750405, + "grad_norm": 4.101115703582764, + "learning_rate": 4.072358503494587e-06, + "loss": 0.0839, + "step": 44005 + }, + { + "epoch": 2.196880086814976, + "grad_norm": 5.120960712432861, + "learning_rate": 4.072221460874332e-06, + "loss": 0.2524, + "step": 44006 + }, + { + "epoch": 2.1968936516549107, + "grad_norm": 4.442478656768799, + "learning_rate": 4.072084418254077e-06, + "loss": 0.1521, + "step": 44007 + }, + { + "epoch": 2.1969072164948455, + "grad_norm": 4.5169548988342285, + "learning_rate": 4.071947375633822e-06, + "loss": 0.1264, + "step": 44008 + }, + { + "epoch": 2.1969207813347804, + "grad_norm": 4.188041687011719, + "learning_rate": 4.071810333013568e-06, + "loss": 0.1473, + "step": 44009 + }, + { + "epoch": 2.1969343461747153, + "grad_norm": 6.494193077087402, + "learning_rate": 4.071673290393313e-06, + "loss": 0.2597, + "step": 44010 + }, + { + "epoch": 2.19694791101465, + "grad_norm": 6.895763397216797, + "learning_rate": 4.071536247773058e-06, + "loss": 0.1851, + "step": 44011 + }, + { + "epoch": 2.196961475854585, + "grad_norm": 5.296653747558594, + "learning_rate": 4.071399205152803e-06, + "loss": 0.1572, + "step": 44012 + }, + { + "epoch": 2.19697504069452, + "grad_norm": 3.873603343963623, + "learning_rate": 4.0712621625325475e-06, + "loss": 0.2523, + "step": 44013 + }, + { + "epoch": 2.1969886055344547, + "grad_norm": 5.902467727661133, + "learning_rate": 4.0711251199122935e-06, + "loss": 0.2139, + "step": 44014 + }, + { + "epoch": 2.1970021703743896, + "grad_norm": 5.938976764678955, + "learning_rate": 4.070988077292038e-06, + "loss": 0.2513, + "step": 44015 + }, + { + "epoch": 2.1970157352143245, + "grad_norm": 5.603074073791504, + "learning_rate": 4.070851034671783e-06, + "loss": 0.1932, + "step": 44016 + }, + { + "epoch": 2.1970293000542593, + "grad_norm": 5.375276565551758, + "learning_rate": 4.070713992051528e-06, + "loss": 0.2583, + "step": 44017 + }, + { + "epoch": 2.197042864894194, + "grad_norm": 4.84359884262085, + "learning_rate": 4.070576949431273e-06, + "loss": 0.2707, + "step": 44018 + }, + { + "epoch": 2.197056429734129, + "grad_norm": 5.866331100463867, + "learning_rate": 4.0704399068110185e-06, + "loss": 0.2511, + "step": 44019 + }, + { + "epoch": 2.197069994574064, + "grad_norm": 6.958326816558838, + "learning_rate": 4.070302864190764e-06, + "loss": 0.3131, + "step": 44020 + }, + { + "epoch": 2.197083559413999, + "grad_norm": 7.020202159881592, + "learning_rate": 4.070165821570509e-06, + "loss": 0.3437, + "step": 44021 + }, + { + "epoch": 2.1970971242539337, + "grad_norm": 4.178642272949219, + "learning_rate": 4.070028778950254e-06, + "loss": 0.1914, + "step": 44022 + }, + { + "epoch": 2.1971106890938685, + "grad_norm": 5.168694019317627, + "learning_rate": 4.069891736329999e-06, + "loss": 0.1924, + "step": 44023 + }, + { + "epoch": 2.1971242539338034, + "grad_norm": 5.6859917640686035, + "learning_rate": 4.0697546937097444e-06, + "loss": 0.289, + "step": 44024 + }, + { + "epoch": 2.1971378187737383, + "grad_norm": 4.473483085632324, + "learning_rate": 4.069617651089489e-06, + "loss": 0.1343, + "step": 44025 + }, + { + "epoch": 2.197151383613673, + "grad_norm": 5.581409454345703, + "learning_rate": 4.069480608469235e-06, + "loss": 0.2328, + "step": 44026 + }, + { + "epoch": 2.1971649484536084, + "grad_norm": 3.5214316844940186, + "learning_rate": 4.069343565848979e-06, + "loss": 0.1252, + "step": 44027 + }, + { + "epoch": 2.1971785132935433, + "grad_norm": 3.318958044052124, + "learning_rate": 4.069206523228724e-06, + "loss": 0.1392, + "step": 44028 + }, + { + "epoch": 2.197192078133478, + "grad_norm": 3.65975022315979, + "learning_rate": 4.0690694806084695e-06, + "loss": 0.2455, + "step": 44029 + }, + { + "epoch": 2.197205642973413, + "grad_norm": 3.627378463745117, + "learning_rate": 4.068932437988215e-06, + "loss": 0.1495, + "step": 44030 + }, + { + "epoch": 2.197219207813348, + "grad_norm": 4.112442970275879, + "learning_rate": 4.06879539536796e-06, + "loss": 0.2284, + "step": 44031 + }, + { + "epoch": 2.1972327726532828, + "grad_norm": 3.5534772872924805, + "learning_rate": 4.068658352747705e-06, + "loss": 0.1585, + "step": 44032 + }, + { + "epoch": 2.1972463374932176, + "grad_norm": 4.296517848968506, + "learning_rate": 4.06852131012745e-06, + "loss": 0.2452, + "step": 44033 + }, + { + "epoch": 2.1972599023331525, + "grad_norm": 5.554291725158691, + "learning_rate": 4.068384267507195e-06, + "loss": 0.2079, + "step": 44034 + }, + { + "epoch": 2.1972734671730874, + "grad_norm": 5.793510913848877, + "learning_rate": 4.0682472248869405e-06, + "loss": 0.1597, + "step": 44035 + }, + { + "epoch": 2.1972870320130222, + "grad_norm": 5.273709774017334, + "learning_rate": 4.068110182266685e-06, + "loss": 0.2249, + "step": 44036 + }, + { + "epoch": 2.197300596852957, + "grad_norm": 4.918381214141846, + "learning_rate": 4.067973139646431e-06, + "loss": 0.1421, + "step": 44037 + }, + { + "epoch": 2.197314161692892, + "grad_norm": 3.2290103435516357, + "learning_rate": 4.067836097026175e-06, + "loss": 0.1661, + "step": 44038 + }, + { + "epoch": 2.197327726532827, + "grad_norm": 4.297372817993164, + "learning_rate": 4.06769905440592e-06, + "loss": 0.2267, + "step": 44039 + }, + { + "epoch": 2.1973412913727617, + "grad_norm": 4.239876747131348, + "learning_rate": 4.067562011785666e-06, + "loss": 0.2337, + "step": 44040 + }, + { + "epoch": 2.1973548562126965, + "grad_norm": 5.457559108734131, + "learning_rate": 4.067424969165411e-06, + "loss": 0.2587, + "step": 44041 + }, + { + "epoch": 2.1973684210526314, + "grad_norm": 6.78803825378418, + "learning_rate": 4.067287926545156e-06, + "loss": 0.3957, + "step": 44042 + }, + { + "epoch": 2.1973819858925663, + "grad_norm": 6.5293660163879395, + "learning_rate": 4.067150883924901e-06, + "loss": 0.245, + "step": 44043 + }, + { + "epoch": 2.1973955507325016, + "grad_norm": 4.860759735107422, + "learning_rate": 4.067013841304646e-06, + "loss": 0.2452, + "step": 44044 + }, + { + "epoch": 2.1974091155724365, + "grad_norm": 5.897308349609375, + "learning_rate": 4.066876798684391e-06, + "loss": 0.3803, + "step": 44045 + }, + { + "epoch": 2.1974226804123713, + "grad_norm": 4.980532169342041, + "learning_rate": 4.066739756064137e-06, + "loss": 0.2504, + "step": 44046 + }, + { + "epoch": 2.197436245252306, + "grad_norm": 4.4051995277404785, + "learning_rate": 4.066602713443881e-06, + "loss": 0.2351, + "step": 44047 + }, + { + "epoch": 2.197449810092241, + "grad_norm": 5.078004837036133, + "learning_rate": 4.066465670823627e-06, + "loss": 0.1278, + "step": 44048 + }, + { + "epoch": 2.197463374932176, + "grad_norm": 4.610983371734619, + "learning_rate": 4.066328628203371e-06, + "loss": 0.3537, + "step": 44049 + }, + { + "epoch": 2.197476939772111, + "grad_norm": 4.995336532592773, + "learning_rate": 4.0661915855831165e-06, + "loss": 0.2792, + "step": 44050 + }, + { + "epoch": 2.1974905046120456, + "grad_norm": 4.096323013305664, + "learning_rate": 4.066054542962862e-06, + "loss": 0.1984, + "step": 44051 + }, + { + "epoch": 2.1975040694519805, + "grad_norm": 5.834864616394043, + "learning_rate": 4.065917500342607e-06, + "loss": 0.1952, + "step": 44052 + }, + { + "epoch": 2.1975176342919154, + "grad_norm": 4.686704635620117, + "learning_rate": 4.065780457722352e-06, + "loss": 0.1844, + "step": 44053 + }, + { + "epoch": 2.1975311991318502, + "grad_norm": 4.763516426086426, + "learning_rate": 4.065643415102096e-06, + "loss": 0.3214, + "step": 44054 + }, + { + "epoch": 2.197544763971785, + "grad_norm": 4.9989094734191895, + "learning_rate": 4.0655063724818424e-06, + "loss": 0.3036, + "step": 44055 + }, + { + "epoch": 2.19755832881172, + "grad_norm": 4.794042110443115, + "learning_rate": 4.065369329861587e-06, + "loss": 0.1803, + "step": 44056 + }, + { + "epoch": 2.197571893651655, + "grad_norm": 4.908589839935303, + "learning_rate": 4.065232287241333e-06, + "loss": 0.1762, + "step": 44057 + }, + { + "epoch": 2.1975854584915897, + "grad_norm": 4.9020891189575195, + "learning_rate": 4.065095244621077e-06, + "loss": 0.2729, + "step": 44058 + }, + { + "epoch": 2.1975990233315246, + "grad_norm": 5.766452789306641, + "learning_rate": 4.064958202000822e-06, + "loss": 0.252, + "step": 44059 + }, + { + "epoch": 2.1976125881714594, + "grad_norm": 4.751336574554443, + "learning_rate": 4.0648211593805675e-06, + "loss": 0.2361, + "step": 44060 + }, + { + "epoch": 2.1976261530113943, + "grad_norm": 3.3346738815307617, + "learning_rate": 4.064684116760313e-06, + "loss": 0.1842, + "step": 44061 + }, + { + "epoch": 2.197639717851329, + "grad_norm": 4.586757659912109, + "learning_rate": 4.064547074140058e-06, + "loss": 0.1961, + "step": 44062 + }, + { + "epoch": 2.197653282691264, + "grad_norm": 3.889066696166992, + "learning_rate": 4.064410031519803e-06, + "loss": 0.1403, + "step": 44063 + }, + { + "epoch": 2.197666847531199, + "grad_norm": 5.488066673278809, + "learning_rate": 4.064272988899548e-06, + "loss": 0.3035, + "step": 44064 + }, + { + "epoch": 2.197680412371134, + "grad_norm": 3.6245992183685303, + "learning_rate": 4.064135946279293e-06, + "loss": 0.1557, + "step": 44065 + }, + { + "epoch": 2.197693977211069, + "grad_norm": 3.2323575019836426, + "learning_rate": 4.0639989036590386e-06, + "loss": 0.0768, + "step": 44066 + }, + { + "epoch": 2.197707542051004, + "grad_norm": 4.373029708862305, + "learning_rate": 4.063861861038784e-06, + "loss": 0.1708, + "step": 44067 + }, + { + "epoch": 2.197721106890939, + "grad_norm": 5.794905662536621, + "learning_rate": 4.063724818418529e-06, + "loss": 0.2118, + "step": 44068 + }, + { + "epoch": 2.1977346717308737, + "grad_norm": 3.8818819522857666, + "learning_rate": 4.063587775798273e-06, + "loss": 0.1386, + "step": 44069 + }, + { + "epoch": 2.1977482365708085, + "grad_norm": 3.978637456893921, + "learning_rate": 4.063450733178018e-06, + "loss": 0.1805, + "step": 44070 + }, + { + "epoch": 2.1977618014107434, + "grad_norm": 5.607916355133057, + "learning_rate": 4.063313690557764e-06, + "loss": 0.1945, + "step": 44071 + }, + { + "epoch": 2.1977753662506783, + "grad_norm": 4.2313432693481445, + "learning_rate": 4.063176647937509e-06, + "loss": 0.1917, + "step": 44072 + }, + { + "epoch": 2.197788931090613, + "grad_norm": 4.547237873077393, + "learning_rate": 4.063039605317254e-06, + "loss": 0.1858, + "step": 44073 + }, + { + "epoch": 2.197802495930548, + "grad_norm": 5.349911689758301, + "learning_rate": 4.062902562696999e-06, + "loss": 0.2641, + "step": 44074 + }, + { + "epoch": 2.197816060770483, + "grad_norm": 4.17678689956665, + "learning_rate": 4.062765520076744e-06, + "loss": 0.2206, + "step": 44075 + }, + { + "epoch": 2.1978296256104177, + "grad_norm": 5.129701137542725, + "learning_rate": 4.0626284774564895e-06, + "loss": 0.1938, + "step": 44076 + }, + { + "epoch": 2.1978431904503526, + "grad_norm": 6.275431156158447, + "learning_rate": 4.062491434836235e-06, + "loss": 0.2701, + "step": 44077 + }, + { + "epoch": 2.1978567552902875, + "grad_norm": 3.433624744415283, + "learning_rate": 4.06235439221598e-06, + "loss": 0.1716, + "step": 44078 + }, + { + "epoch": 2.1978703201302223, + "grad_norm": 4.493418216705322, + "learning_rate": 4.062217349595724e-06, + "loss": 0.1463, + "step": 44079 + }, + { + "epoch": 2.197883884970157, + "grad_norm": 3.639052629470825, + "learning_rate": 4.06208030697547e-06, + "loss": 0.1318, + "step": 44080 + }, + { + "epoch": 2.197897449810092, + "grad_norm": 4.955953598022461, + "learning_rate": 4.0619432643552145e-06, + "loss": 0.1784, + "step": 44081 + }, + { + "epoch": 2.1979110146500274, + "grad_norm": 4.2761549949646, + "learning_rate": 4.06180622173496e-06, + "loss": 0.2359, + "step": 44082 + }, + { + "epoch": 2.1979245794899622, + "grad_norm": 4.741878986358643, + "learning_rate": 4.061669179114705e-06, + "loss": 0.1556, + "step": 44083 + }, + { + "epoch": 2.197938144329897, + "grad_norm": 5.412485122680664, + "learning_rate": 4.06153213649445e-06, + "loss": 0.2057, + "step": 44084 + }, + { + "epoch": 2.197951709169832, + "grad_norm": 7.804938793182373, + "learning_rate": 4.061395093874195e-06, + "loss": 0.5076, + "step": 44085 + }, + { + "epoch": 2.197965274009767, + "grad_norm": 5.067739009857178, + "learning_rate": 4.0612580512539404e-06, + "loss": 0.2552, + "step": 44086 + }, + { + "epoch": 2.1979788388497017, + "grad_norm": 7.941623687744141, + "learning_rate": 4.061121008633686e-06, + "loss": 0.3879, + "step": 44087 + }, + { + "epoch": 2.1979924036896366, + "grad_norm": 3.622215747833252, + "learning_rate": 4.06098396601343e-06, + "loss": 0.1872, + "step": 44088 + }, + { + "epoch": 2.1980059685295714, + "grad_norm": 4.9804463386535645, + "learning_rate": 4.060846923393176e-06, + "loss": 0.4133, + "step": 44089 + }, + { + "epoch": 2.1980195333695063, + "grad_norm": 5.607077598571777, + "learning_rate": 4.06070988077292e-06, + "loss": 0.2248, + "step": 44090 + }, + { + "epoch": 2.198033098209441, + "grad_norm": 4.815762996673584, + "learning_rate": 4.060572838152666e-06, + "loss": 0.1513, + "step": 44091 + }, + { + "epoch": 2.198046663049376, + "grad_norm": 4.39337158203125, + "learning_rate": 4.060435795532411e-06, + "loss": 0.1455, + "step": 44092 + }, + { + "epoch": 2.198060227889311, + "grad_norm": 4.020214557647705, + "learning_rate": 4.060298752912156e-06, + "loss": 0.1743, + "step": 44093 + }, + { + "epoch": 2.1980737927292457, + "grad_norm": 6.167052745819092, + "learning_rate": 4.060161710291901e-06, + "loss": 0.2692, + "step": 44094 + }, + { + "epoch": 2.1980873575691806, + "grad_norm": 4.383294582366943, + "learning_rate": 4.060024667671646e-06, + "loss": 0.2272, + "step": 44095 + }, + { + "epoch": 2.1981009224091155, + "grad_norm": 4.652093887329102, + "learning_rate": 4.059887625051391e-06, + "loss": 0.2535, + "step": 44096 + }, + { + "epoch": 2.1981144872490503, + "grad_norm": 4.584939956665039, + "learning_rate": 4.0597505824311366e-06, + "loss": 0.1994, + "step": 44097 + }, + { + "epoch": 2.198128052088985, + "grad_norm": 4.463438510894775, + "learning_rate": 4.059613539810882e-06, + "loss": 0.2184, + "step": 44098 + }, + { + "epoch": 2.19814161692892, + "grad_norm": 3.7878501415252686, + "learning_rate": 4.059476497190626e-06, + "loss": 0.1563, + "step": 44099 + }, + { + "epoch": 2.198155181768855, + "grad_norm": 4.459500312805176, + "learning_rate": 4.059339454570372e-06, + "loss": 0.2069, + "step": 44100 + }, + { + "epoch": 2.19816874660879, + "grad_norm": 4.998293399810791, + "learning_rate": 4.059202411950116e-06, + "loss": 0.207, + "step": 44101 + }, + { + "epoch": 2.198182311448725, + "grad_norm": 7.323647499084473, + "learning_rate": 4.0590653693298624e-06, + "loss": 0.3957, + "step": 44102 + }, + { + "epoch": 2.19819587628866, + "grad_norm": 4.356011867523193, + "learning_rate": 4.058928326709607e-06, + "loss": 0.261, + "step": 44103 + }, + { + "epoch": 2.198209441128595, + "grad_norm": 5.311983585357666, + "learning_rate": 4.058791284089352e-06, + "loss": 0.2665, + "step": 44104 + }, + { + "epoch": 2.1982230059685297, + "grad_norm": 3.926913261413574, + "learning_rate": 4.058654241469097e-06, + "loss": 0.1438, + "step": 44105 + }, + { + "epoch": 2.1982365708084646, + "grad_norm": 5.326834678649902, + "learning_rate": 4.058517198848842e-06, + "loss": 0.2745, + "step": 44106 + }, + { + "epoch": 2.1982501356483994, + "grad_norm": 4.750804424285889, + "learning_rate": 4.0583801562285875e-06, + "loss": 0.2017, + "step": 44107 + }, + { + "epoch": 2.1982637004883343, + "grad_norm": 6.000792026519775, + "learning_rate": 4.058243113608333e-06, + "loss": 0.2473, + "step": 44108 + }, + { + "epoch": 2.198277265328269, + "grad_norm": 4.468024253845215, + "learning_rate": 4.058106070988078e-06, + "loss": 0.2857, + "step": 44109 + }, + { + "epoch": 2.198290830168204, + "grad_norm": 4.490320682525635, + "learning_rate": 4.057969028367822e-06, + "loss": 0.1695, + "step": 44110 + }, + { + "epoch": 2.198304395008139, + "grad_norm": 4.993827819824219, + "learning_rate": 4.057831985747568e-06, + "loss": 0.3108, + "step": 44111 + }, + { + "epoch": 2.1983179598480738, + "grad_norm": 4.87699031829834, + "learning_rate": 4.0576949431273125e-06, + "loss": 0.2565, + "step": 44112 + }, + { + "epoch": 2.1983315246880086, + "grad_norm": 3.6775858402252197, + "learning_rate": 4.057557900507058e-06, + "loss": 0.1876, + "step": 44113 + }, + { + "epoch": 2.1983450895279435, + "grad_norm": 4.5089898109436035, + "learning_rate": 4.057420857886803e-06, + "loss": 0.218, + "step": 44114 + }, + { + "epoch": 2.1983586543678784, + "grad_norm": 4.849986553192139, + "learning_rate": 4.057283815266548e-06, + "loss": 0.3086, + "step": 44115 + }, + { + "epoch": 2.1983722192078132, + "grad_norm": 4.122210502624512, + "learning_rate": 4.057146772646293e-06, + "loss": 0.1735, + "step": 44116 + }, + { + "epoch": 2.198385784047748, + "grad_norm": 5.677755832672119, + "learning_rate": 4.0570097300260384e-06, + "loss": 0.2, + "step": 44117 + }, + { + "epoch": 2.198399348887683, + "grad_norm": 6.0381879806518555, + "learning_rate": 4.056872687405784e-06, + "loss": 0.2223, + "step": 44118 + }, + { + "epoch": 2.198412913727618, + "grad_norm": 3.970250129699707, + "learning_rate": 4.056735644785529e-06, + "loss": 0.2565, + "step": 44119 + }, + { + "epoch": 2.198426478567553, + "grad_norm": 4.800198078155518, + "learning_rate": 4.056598602165274e-06, + "loss": 0.2134, + "step": 44120 + }, + { + "epoch": 2.198440043407488, + "grad_norm": 6.887106418609619, + "learning_rate": 4.056461559545019e-06, + "loss": 0.3434, + "step": 44121 + }, + { + "epoch": 2.198453608247423, + "grad_norm": 4.971574306488037, + "learning_rate": 4.056324516924764e-06, + "loss": 0.2195, + "step": 44122 + }, + { + "epoch": 2.1984671730873577, + "grad_norm": 4.358517169952393, + "learning_rate": 4.056187474304509e-06, + "loss": 0.1719, + "step": 44123 + }, + { + "epoch": 2.1984807379272926, + "grad_norm": 4.151760101318359, + "learning_rate": 4.056050431684254e-06, + "loss": 0.2187, + "step": 44124 + }, + { + "epoch": 2.1984943027672275, + "grad_norm": 4.499837398529053, + "learning_rate": 4.055913389063999e-06, + "loss": 0.2393, + "step": 44125 + }, + { + "epoch": 2.1985078676071623, + "grad_norm": 3.9136040210723877, + "learning_rate": 4.055776346443744e-06, + "loss": 0.1831, + "step": 44126 + }, + { + "epoch": 2.198521432447097, + "grad_norm": 5.6997880935668945, + "learning_rate": 4.055639303823489e-06, + "loss": 0.2796, + "step": 44127 + }, + { + "epoch": 2.198534997287032, + "grad_norm": 4.505005836486816, + "learning_rate": 4.0555022612032346e-06, + "loss": 0.3643, + "step": 44128 + }, + { + "epoch": 2.198548562126967, + "grad_norm": 3.921463966369629, + "learning_rate": 4.05536521858298e-06, + "loss": 0.182, + "step": 44129 + }, + { + "epoch": 2.198562126966902, + "grad_norm": 5.902541637420654, + "learning_rate": 4.055228175962725e-06, + "loss": 0.2761, + "step": 44130 + }, + { + "epoch": 2.1985756918068367, + "grad_norm": 4.607699871063232, + "learning_rate": 4.05509113334247e-06, + "loss": 0.1633, + "step": 44131 + }, + { + "epoch": 2.1985892566467715, + "grad_norm": 4.800967216491699, + "learning_rate": 4.054954090722215e-06, + "loss": 0.2856, + "step": 44132 + }, + { + "epoch": 2.1986028214867064, + "grad_norm": 5.4832329750061035, + "learning_rate": 4.05481704810196e-06, + "loss": 0.2374, + "step": 44133 + }, + { + "epoch": 2.1986163863266412, + "grad_norm": 4.167923450469971, + "learning_rate": 4.054680005481706e-06, + "loss": 0.2469, + "step": 44134 + }, + { + "epoch": 2.198629951166576, + "grad_norm": 3.318770408630371, + "learning_rate": 4.05454296286145e-06, + "loss": 0.1676, + "step": 44135 + }, + { + "epoch": 2.198643516006511, + "grad_norm": 4.4868011474609375, + "learning_rate": 4.054405920241196e-06, + "loss": 0.25, + "step": 44136 + }, + { + "epoch": 2.198657080846446, + "grad_norm": 5.4848246574401855, + "learning_rate": 4.05426887762094e-06, + "loss": 0.248, + "step": 44137 + }, + { + "epoch": 2.1986706456863807, + "grad_norm": 5.4663496017456055, + "learning_rate": 4.0541318350006855e-06, + "loss": 0.2811, + "step": 44138 + }, + { + "epoch": 2.1986842105263156, + "grad_norm": 5.771106719970703, + "learning_rate": 4.053994792380431e-06, + "loss": 0.1978, + "step": 44139 + }, + { + "epoch": 2.198697775366251, + "grad_norm": 6.058309555053711, + "learning_rate": 4.053857749760176e-06, + "loss": 0.1818, + "step": 44140 + }, + { + "epoch": 2.1987113402061857, + "grad_norm": 4.028778076171875, + "learning_rate": 4.053720707139921e-06, + "loss": 0.1549, + "step": 44141 + }, + { + "epoch": 2.1987249050461206, + "grad_norm": 4.490809917449951, + "learning_rate": 4.053583664519665e-06, + "loss": 0.192, + "step": 44142 + }, + { + "epoch": 2.1987384698860555, + "grad_norm": 4.622379302978516, + "learning_rate": 4.053446621899411e-06, + "loss": 0.1993, + "step": 44143 + }, + { + "epoch": 2.1987520347259903, + "grad_norm": 5.820129871368408, + "learning_rate": 4.053309579279156e-06, + "loss": 0.3145, + "step": 44144 + }, + { + "epoch": 2.198765599565925, + "grad_norm": 5.478548049926758, + "learning_rate": 4.053172536658902e-06, + "loss": 0.2492, + "step": 44145 + }, + { + "epoch": 2.19877916440586, + "grad_norm": 5.118492603302002, + "learning_rate": 4.053035494038646e-06, + "loss": 0.1551, + "step": 44146 + }, + { + "epoch": 2.198792729245795, + "grad_norm": 4.274860382080078, + "learning_rate": 4.052898451418391e-06, + "loss": 0.1676, + "step": 44147 + }, + { + "epoch": 2.19880629408573, + "grad_norm": 4.119549751281738, + "learning_rate": 4.0527614087981364e-06, + "loss": 0.1752, + "step": 44148 + }, + { + "epoch": 2.1988198589256647, + "grad_norm": 5.514997959136963, + "learning_rate": 4.052624366177882e-06, + "loss": 0.187, + "step": 44149 + }, + { + "epoch": 2.1988334237655995, + "grad_norm": 5.91182279586792, + "learning_rate": 4.052487323557627e-06, + "loss": 0.1078, + "step": 44150 + }, + { + "epoch": 2.1988469886055344, + "grad_norm": 5.246894836425781, + "learning_rate": 4.052350280937372e-06, + "loss": 0.2972, + "step": 44151 + }, + { + "epoch": 2.1988605534454693, + "grad_norm": 4.317193984985352, + "learning_rate": 4.052213238317117e-06, + "loss": 0.239, + "step": 44152 + }, + { + "epoch": 2.198874118285404, + "grad_norm": 3.5573182106018066, + "learning_rate": 4.0520761956968615e-06, + "loss": 0.1691, + "step": 44153 + }, + { + "epoch": 2.198887683125339, + "grad_norm": 6.138461112976074, + "learning_rate": 4.0519391530766075e-06, + "loss": 0.215, + "step": 44154 + }, + { + "epoch": 2.198901247965274, + "grad_norm": 4.726728916168213, + "learning_rate": 4.051802110456352e-06, + "loss": 0.2697, + "step": 44155 + }, + { + "epoch": 2.1989148128052087, + "grad_norm": 5.195495128631592, + "learning_rate": 4.051665067836098e-06, + "loss": 0.2128, + "step": 44156 + }, + { + "epoch": 2.1989283776451436, + "grad_norm": 6.044172763824463, + "learning_rate": 4.051528025215842e-06, + "loss": 0.1742, + "step": 44157 + }, + { + "epoch": 2.198941942485079, + "grad_norm": 4.570775985717773, + "learning_rate": 4.051390982595587e-06, + "loss": 0.2056, + "step": 44158 + }, + { + "epoch": 2.1989555073250138, + "grad_norm": 5.797262191772461, + "learning_rate": 4.0512539399753326e-06, + "loss": 0.2292, + "step": 44159 + }, + { + "epoch": 2.1989690721649486, + "grad_norm": 7.476115703582764, + "learning_rate": 4.051116897355078e-06, + "loss": 0.2756, + "step": 44160 + }, + { + "epoch": 2.1989826370048835, + "grad_norm": 4.261616230010986, + "learning_rate": 4.050979854734823e-06, + "loss": 0.1309, + "step": 44161 + }, + { + "epoch": 2.1989962018448184, + "grad_norm": 7.727029323577881, + "learning_rate": 4.050842812114568e-06, + "loss": 0.1898, + "step": 44162 + }, + { + "epoch": 2.1990097666847532, + "grad_norm": 5.800765514373779, + "learning_rate": 4.050705769494313e-06, + "loss": 0.1759, + "step": 44163 + }, + { + "epoch": 2.199023331524688, + "grad_norm": 4.533022403717041, + "learning_rate": 4.050568726874058e-06, + "loss": 0.2578, + "step": 44164 + }, + { + "epoch": 2.199036896364623, + "grad_norm": 4.776997089385986, + "learning_rate": 4.050431684253804e-06, + "loss": 0.1398, + "step": 44165 + }, + { + "epoch": 2.199050461204558, + "grad_norm": 5.242519378662109, + "learning_rate": 4.050294641633548e-06, + "loss": 0.2889, + "step": 44166 + }, + { + "epoch": 2.1990640260444927, + "grad_norm": 6.177070617675781, + "learning_rate": 4.050157599013293e-06, + "loss": 0.2015, + "step": 44167 + }, + { + "epoch": 2.1990775908844276, + "grad_norm": 6.725262641906738, + "learning_rate": 4.050020556393038e-06, + "loss": 0.1996, + "step": 44168 + }, + { + "epoch": 2.1990911557243624, + "grad_norm": 5.160417079925537, + "learning_rate": 4.0498835137727835e-06, + "loss": 0.1829, + "step": 44169 + }, + { + "epoch": 2.1991047205642973, + "grad_norm": 3.9449238777160645, + "learning_rate": 4.049746471152529e-06, + "loss": 0.1503, + "step": 44170 + }, + { + "epoch": 2.199118285404232, + "grad_norm": 5.859297752380371, + "learning_rate": 4.049609428532274e-06, + "loss": 0.2186, + "step": 44171 + }, + { + "epoch": 2.199131850244167, + "grad_norm": 4.909761428833008, + "learning_rate": 4.049472385912019e-06, + "loss": 0.1749, + "step": 44172 + }, + { + "epoch": 2.199145415084102, + "grad_norm": 6.287460803985596, + "learning_rate": 4.049335343291764e-06, + "loss": 0.219, + "step": 44173 + }, + { + "epoch": 2.1991589799240367, + "grad_norm": 5.554868221282959, + "learning_rate": 4.049198300671509e-06, + "loss": 0.1661, + "step": 44174 + }, + { + "epoch": 2.1991725447639716, + "grad_norm": 5.885637283325195, + "learning_rate": 4.049061258051255e-06, + "loss": 0.2593, + "step": 44175 + }, + { + "epoch": 2.1991861096039065, + "grad_norm": 4.74133825302124, + "learning_rate": 4.048924215431e-06, + "loss": 0.1839, + "step": 44176 + }, + { + "epoch": 2.1991996744438413, + "grad_norm": 5.088065147399902, + "learning_rate": 4.048787172810745e-06, + "loss": 0.2429, + "step": 44177 + }, + { + "epoch": 2.1992132392837767, + "grad_norm": 4.551678657531738, + "learning_rate": 4.048650130190489e-06, + "loss": 0.1298, + "step": 44178 + }, + { + "epoch": 2.1992268041237115, + "grad_norm": 3.9989876747131348, + "learning_rate": 4.0485130875702344e-06, + "loss": 0.1442, + "step": 44179 + }, + { + "epoch": 2.1992403689636464, + "grad_norm": 5.315558910369873, + "learning_rate": 4.04837604494998e-06, + "loss": 0.161, + "step": 44180 + }, + { + "epoch": 2.1992539338035813, + "grad_norm": 5.106865406036377, + "learning_rate": 4.048239002329725e-06, + "loss": 0.1835, + "step": 44181 + }, + { + "epoch": 2.199267498643516, + "grad_norm": 7.282375335693359, + "learning_rate": 4.04810195970947e-06, + "loss": 0.2163, + "step": 44182 + }, + { + "epoch": 2.199281063483451, + "grad_norm": 5.96738338470459, + "learning_rate": 4.047964917089215e-06, + "loss": 0.207, + "step": 44183 + }, + { + "epoch": 2.199294628323386, + "grad_norm": 4.139456748962402, + "learning_rate": 4.04782787446896e-06, + "loss": 0.1666, + "step": 44184 + }, + { + "epoch": 2.1993081931633207, + "grad_norm": 4.113230228424072, + "learning_rate": 4.0476908318487055e-06, + "loss": 0.1831, + "step": 44185 + }, + { + "epoch": 2.1993217580032556, + "grad_norm": 4.111947536468506, + "learning_rate": 4.047553789228451e-06, + "loss": 0.1543, + "step": 44186 + }, + { + "epoch": 2.1993353228431904, + "grad_norm": 5.789068698883057, + "learning_rate": 4.047416746608195e-06, + "loss": 0.1431, + "step": 44187 + }, + { + "epoch": 2.1993488876831253, + "grad_norm": 6.625709533691406, + "learning_rate": 4.047279703987941e-06, + "loss": 0.1786, + "step": 44188 + }, + { + "epoch": 2.19936245252306, + "grad_norm": 3.392517328262329, + "learning_rate": 4.047142661367685e-06, + "loss": 0.098, + "step": 44189 + }, + { + "epoch": 2.199376017362995, + "grad_norm": 6.404314041137695, + "learning_rate": 4.047005618747431e-06, + "loss": 0.3125, + "step": 44190 + }, + { + "epoch": 2.19938958220293, + "grad_norm": 5.792778015136719, + "learning_rate": 4.046868576127176e-06, + "loss": 0.2916, + "step": 44191 + }, + { + "epoch": 2.1994031470428648, + "grad_norm": 4.955750942230225, + "learning_rate": 4.046731533506921e-06, + "loss": 0.1819, + "step": 44192 + }, + { + "epoch": 2.1994167118827996, + "grad_norm": 4.810628890991211, + "learning_rate": 4.046594490886666e-06, + "loss": 0.1224, + "step": 44193 + }, + { + "epoch": 2.1994302767227345, + "grad_norm": 5.445047855377197, + "learning_rate": 4.046457448266411e-06, + "loss": 0.2134, + "step": 44194 + }, + { + "epoch": 2.1994438415626694, + "grad_norm": 4.975081920623779, + "learning_rate": 4.0463204056461565e-06, + "loss": 0.1823, + "step": 44195 + }, + { + "epoch": 2.1994574064026047, + "grad_norm": 4.584101676940918, + "learning_rate": 4.046183363025901e-06, + "loss": 0.166, + "step": 44196 + }, + { + "epoch": 2.1994709712425395, + "grad_norm": 4.990189075469971, + "learning_rate": 4.046046320405647e-06, + "loss": 0.1114, + "step": 44197 + }, + { + "epoch": 2.1994845360824744, + "grad_norm": 5.889001846313477, + "learning_rate": 4.045909277785391e-06, + "loss": 0.2427, + "step": 44198 + }, + { + "epoch": 2.1994981009224093, + "grad_norm": 5.368786334991455, + "learning_rate": 4.045772235165137e-06, + "loss": 0.2076, + "step": 44199 + }, + { + "epoch": 2.199511665762344, + "grad_norm": 3.4092953205108643, + "learning_rate": 4.0456351925448815e-06, + "loss": 0.1208, + "step": 44200 + }, + { + "epoch": 2.199525230602279, + "grad_norm": 4.566625118255615, + "learning_rate": 4.045498149924627e-06, + "loss": 0.2361, + "step": 44201 + }, + { + "epoch": 2.199538795442214, + "grad_norm": 4.268670558929443, + "learning_rate": 4.045361107304372e-06, + "loss": 0.1744, + "step": 44202 + }, + { + "epoch": 2.1995523602821487, + "grad_norm": 7.347102642059326, + "learning_rate": 4.045224064684117e-06, + "loss": 0.3244, + "step": 44203 + }, + { + "epoch": 2.1995659251220836, + "grad_norm": 5.661805152893066, + "learning_rate": 4.045087022063862e-06, + "loss": 0.2165, + "step": 44204 + }, + { + "epoch": 2.1995794899620185, + "grad_norm": 3.791555881500244, + "learning_rate": 4.044949979443607e-06, + "loss": 0.0998, + "step": 44205 + }, + { + "epoch": 2.1995930548019533, + "grad_norm": 5.87748908996582, + "learning_rate": 4.044812936823353e-06, + "loss": 0.2069, + "step": 44206 + }, + { + "epoch": 2.199606619641888, + "grad_norm": 4.220809459686279, + "learning_rate": 4.044675894203097e-06, + "loss": 0.13, + "step": 44207 + }, + { + "epoch": 2.199620184481823, + "grad_norm": 4.780485153198242, + "learning_rate": 4.044538851582843e-06, + "loss": 0.108, + "step": 44208 + }, + { + "epoch": 2.199633749321758, + "grad_norm": 5.0754241943359375, + "learning_rate": 4.044401808962587e-06, + "loss": 0.1835, + "step": 44209 + }, + { + "epoch": 2.199647314161693, + "grad_norm": 4.722970008850098, + "learning_rate": 4.044264766342333e-06, + "loss": 0.1347, + "step": 44210 + }, + { + "epoch": 2.1996608790016277, + "grad_norm": 4.187963008880615, + "learning_rate": 4.044127723722078e-06, + "loss": 0.077, + "step": 44211 + }, + { + "epoch": 2.1996744438415625, + "grad_norm": 4.0247111320495605, + "learning_rate": 4.043990681101823e-06, + "loss": 0.1211, + "step": 44212 + }, + { + "epoch": 2.1996880086814974, + "grad_norm": 5.091185569763184, + "learning_rate": 4.043853638481568e-06, + "loss": 0.2574, + "step": 44213 + }, + { + "epoch": 2.1997015735214323, + "grad_norm": 5.303222179412842, + "learning_rate": 4.043716595861313e-06, + "loss": 0.2028, + "step": 44214 + }, + { + "epoch": 2.199715138361367, + "grad_norm": 3.3715531826019287, + "learning_rate": 4.043579553241058e-06, + "loss": 0.1321, + "step": 44215 + }, + { + "epoch": 2.1997287032013024, + "grad_norm": 4.553656578063965, + "learning_rate": 4.0434425106208035e-06, + "loss": 0.1501, + "step": 44216 + }, + { + "epoch": 2.1997422680412373, + "grad_norm": 5.597533226013184, + "learning_rate": 4.043305468000549e-06, + "loss": 0.2834, + "step": 44217 + }, + { + "epoch": 2.199755832881172, + "grad_norm": 4.588717460632324, + "learning_rate": 4.043168425380294e-06, + "loss": 0.1967, + "step": 44218 + }, + { + "epoch": 2.199769397721107, + "grad_norm": 6.138737201690674, + "learning_rate": 4.043031382760039e-06, + "loss": 0.1985, + "step": 44219 + }, + { + "epoch": 2.199782962561042, + "grad_norm": 5.102181434631348, + "learning_rate": 4.042894340139783e-06, + "loss": 0.1642, + "step": 44220 + }, + { + "epoch": 2.1997965274009768, + "grad_norm": 5.477359771728516, + "learning_rate": 4.0427572975195286e-06, + "loss": 0.254, + "step": 44221 + }, + { + "epoch": 2.1998100922409116, + "grad_norm": 4.093328475952148, + "learning_rate": 4.042620254899274e-06, + "loss": 0.124, + "step": 44222 + }, + { + "epoch": 2.1998236570808465, + "grad_norm": 4.472240924835205, + "learning_rate": 4.042483212279019e-06, + "loss": 0.1181, + "step": 44223 + }, + { + "epoch": 2.1998372219207813, + "grad_norm": 3.878598928451538, + "learning_rate": 4.042346169658764e-06, + "loss": 0.0957, + "step": 44224 + }, + { + "epoch": 2.199850786760716, + "grad_norm": 5.991175651550293, + "learning_rate": 4.042209127038509e-06, + "loss": 0.1955, + "step": 44225 + }, + { + "epoch": 2.199864351600651, + "grad_norm": 5.10882568359375, + "learning_rate": 4.0420720844182545e-06, + "loss": 0.1591, + "step": 44226 + }, + { + "epoch": 2.199877916440586, + "grad_norm": 4.3177714347839355, + "learning_rate": 4.041935041798e-06, + "loss": 0.1689, + "step": 44227 + }, + { + "epoch": 2.199891481280521, + "grad_norm": 4.877546310424805, + "learning_rate": 4.041797999177745e-06, + "loss": 0.1702, + "step": 44228 + }, + { + "epoch": 2.1999050461204557, + "grad_norm": 4.184615135192871, + "learning_rate": 4.04166095655749e-06, + "loss": 0.1587, + "step": 44229 + }, + { + "epoch": 2.1999186109603905, + "grad_norm": 4.274989128112793, + "learning_rate": 4.041523913937234e-06, + "loss": 0.132, + "step": 44230 + }, + { + "epoch": 2.1999321758003254, + "grad_norm": 4.012132167816162, + "learning_rate": 4.04138687131698e-06, + "loss": 0.1957, + "step": 44231 + }, + { + "epoch": 2.1999457406402603, + "grad_norm": 4.386122226715088, + "learning_rate": 4.041249828696725e-06, + "loss": 0.1122, + "step": 44232 + }, + { + "epoch": 2.1999457406402603, + "eval_loss": 0.33159905672073364, + "eval_noise_accuracy": NaN, + "eval_runtime": 4460.628, + "eval_samples_per_second": 1.126, + "eval_steps_per_second": 0.07, + "eval_wer": 25.152142349633934, + "step": 44232 + }, + { + "epoch": 2.199959305480195, + "grad_norm": 4.852498531341553, + "learning_rate": 4.04111278607647e-06, + "loss": 0.1518, + "step": 44233 + }, + { + "epoch": 2.1999728703201304, + "grad_norm": 3.2643532752990723, + "learning_rate": 4.040975743456215e-06, + "loss": 0.0804, + "step": 44234 + }, + { + "epoch": 2.1999864351600653, + "grad_norm": 5.303746700286865, + "learning_rate": 4.04083870083596e-06, + "loss": 0.1433, + "step": 44235 + }, + { + "epoch": 2.2, + "grad_norm": 2.7344446182250977, + "learning_rate": 4.040701658215705e-06, + "loss": 0.0787, + "step": 44236 + }, + { + "epoch": 2.200013564839935, + "grad_norm": 4.748047351837158, + "learning_rate": 4.040564615595451e-06, + "loss": 0.0982, + "step": 44237 + }, + { + "epoch": 2.20002712967987, + "grad_norm": 4.50514030456543, + "learning_rate": 4.040427572975196e-06, + "loss": 0.1364, + "step": 44238 + }, + { + "epoch": 3.000013564839935, + "grad_norm": 5.785679817199707, + "learning_rate": 4.040290530354941e-06, + "loss": 0.1258, + "step": 44239 + }, + { + "epoch": 3.0000271296798697, + "grad_norm": 3.9270882606506348, + "learning_rate": 4.040153487734686e-06, + "loss": 0.0642, + "step": 44240 + }, + { + "epoch": 3.0000406945198046, + "grad_norm": 3.7168362140655518, + "learning_rate": 4.0400164451144305e-06, + "loss": 0.1639, + "step": 44241 + }, + { + "epoch": 3.0000542593597395, + "grad_norm": 3.5460000038146973, + "learning_rate": 4.0398794024941765e-06, + "loss": 0.1282, + "step": 44242 + }, + { + "epoch": 3.0000678241996743, + "grad_norm": 4.092308521270752, + "learning_rate": 4.039742359873921e-06, + "loss": 0.2116, + "step": 44243 + }, + { + "epoch": 3.000081389039609, + "grad_norm": 3.8153154850006104, + "learning_rate": 4.039605317253667e-06, + "loss": 0.1079, + "step": 44244 + }, + { + "epoch": 3.000094953879544, + "grad_norm": 4.161372661590576, + "learning_rate": 4.039468274633411e-06, + "loss": 0.1856, + "step": 44245 + }, + { + "epoch": 3.000108518719479, + "grad_norm": 5.884277820587158, + "learning_rate": 4.039331232013156e-06, + "loss": 0.151, + "step": 44246 + }, + { + "epoch": 3.000122083559414, + "grad_norm": 4.52618932723999, + "learning_rate": 4.0391941893929015e-06, + "loss": 0.187, + "step": 44247 + }, + { + "epoch": 3.000135648399349, + "grad_norm": 4.078252792358398, + "learning_rate": 4.039057146772647e-06, + "loss": 0.1671, + "step": 44248 + }, + { + "epoch": 3.000149213239284, + "grad_norm": 5.451068878173828, + "learning_rate": 4.038920104152392e-06, + "loss": 0.2079, + "step": 44249 + }, + { + "epoch": 3.000162778079219, + "grad_norm": 4.168013095855713, + "learning_rate": 4.038783061532136e-06, + "loss": 0.1199, + "step": 44250 + }, + { + "epoch": 3.0001763429191537, + "grad_norm": 3.5420186519622803, + "learning_rate": 4.038646018911882e-06, + "loss": 0.1117, + "step": 44251 + }, + { + "epoch": 3.0001899077590886, + "grad_norm": 5.690127849578857, + "learning_rate": 4.0385089762916266e-06, + "loss": 0.207, + "step": 44252 + }, + { + "epoch": 3.0002034725990234, + "grad_norm": 4.609129428863525, + "learning_rate": 4.038371933671373e-06, + "loss": 0.1774, + "step": 44253 + }, + { + "epoch": 3.0002170374389583, + "grad_norm": 3.7983956336975098, + "learning_rate": 4.038234891051117e-06, + "loss": 0.1134, + "step": 44254 + }, + { + "epoch": 3.000230602278893, + "grad_norm": 4.260553359985352, + "learning_rate": 4.038097848430862e-06, + "loss": 0.114, + "step": 44255 + }, + { + "epoch": 3.000244167118828, + "grad_norm": 5.031543254852295, + "learning_rate": 4.037960805810607e-06, + "loss": 0.0899, + "step": 44256 + }, + { + "epoch": 3.000257731958763, + "grad_norm": 5.070161819458008, + "learning_rate": 4.0378237631903525e-06, + "loss": 0.121, + "step": 44257 + }, + { + "epoch": 3.0002712967986978, + "grad_norm": 2.8923869132995605, + "learning_rate": 4.037686720570098e-06, + "loss": 0.0669, + "step": 44258 + }, + { + "epoch": 3.0002848616386326, + "grad_norm": 3.968031167984009, + "learning_rate": 4.037549677949843e-06, + "loss": 0.1114, + "step": 44259 + }, + { + "epoch": 3.0002984264785675, + "grad_norm": 3.8026280403137207, + "learning_rate": 4.037412635329588e-06, + "loss": 0.1104, + "step": 44260 + }, + { + "epoch": 3.0003119913185023, + "grad_norm": 4.574861526489258, + "learning_rate": 4.037275592709332e-06, + "loss": 0.1497, + "step": 44261 + }, + { + "epoch": 3.000325556158437, + "grad_norm": 2.186251163482666, + "learning_rate": 4.037138550089078e-06, + "loss": 0.0434, + "step": 44262 + }, + { + "epoch": 3.000339120998372, + "grad_norm": 4.279007911682129, + "learning_rate": 4.037001507468823e-06, + "loss": 0.1408, + "step": 44263 + }, + { + "epoch": 3.000352685838307, + "grad_norm": 4.40070104598999, + "learning_rate": 4.036864464848569e-06, + "loss": 0.1731, + "step": 44264 + }, + { + "epoch": 3.000366250678242, + "grad_norm": 3.7693824768066406, + "learning_rate": 4.036727422228313e-06, + "loss": 0.1489, + "step": 44265 + }, + { + "epoch": 3.0003798155181767, + "grad_norm": 3.3225889205932617, + "learning_rate": 4.036590379608058e-06, + "loss": 0.0896, + "step": 44266 + }, + { + "epoch": 3.000393380358112, + "grad_norm": 3.3641607761383057, + "learning_rate": 4.036453336987803e-06, + "loss": 0.1048, + "step": 44267 + }, + { + "epoch": 3.000406945198047, + "grad_norm": 3.6729397773742676, + "learning_rate": 4.036316294367549e-06, + "loss": 0.0941, + "step": 44268 + }, + { + "epoch": 3.0004205100379817, + "grad_norm": 4.216504096984863, + "learning_rate": 4.036179251747294e-06, + "loss": 0.1314, + "step": 44269 + }, + { + "epoch": 3.0004340748779166, + "grad_norm": 2.8624634742736816, + "learning_rate": 4.036042209127039e-06, + "loss": 0.0751, + "step": 44270 + }, + { + "epoch": 3.0004476397178514, + "grad_norm": 4.082829475402832, + "learning_rate": 4.035905166506784e-06, + "loss": 0.1393, + "step": 44271 + }, + { + "epoch": 3.0004612045577863, + "grad_norm": 5.328400611877441, + "learning_rate": 4.035768123886529e-06, + "loss": 0.1728, + "step": 44272 + }, + { + "epoch": 3.000474769397721, + "grad_norm": 2.1184141635894775, + "learning_rate": 4.0356310812662745e-06, + "loss": 0.0764, + "step": 44273 + }, + { + "epoch": 3.000488334237656, + "grad_norm": 3.7376868724823, + "learning_rate": 4.03549403864602e-06, + "loss": 0.1297, + "step": 44274 + }, + { + "epoch": 3.000501899077591, + "grad_norm": 3.040707588195801, + "learning_rate": 4.035356996025764e-06, + "loss": 0.0772, + "step": 44275 + }, + { + "epoch": 3.0005154639175258, + "grad_norm": 2.679746389389038, + "learning_rate": 4.035219953405509e-06, + "loss": 0.0754, + "step": 44276 + }, + { + "epoch": 3.0005290287574606, + "grad_norm": 3.7231030464172363, + "learning_rate": 4.035082910785254e-06, + "loss": 0.1013, + "step": 44277 + }, + { + "epoch": 3.0005425935973955, + "grad_norm": 2.7679460048675537, + "learning_rate": 4.0349458681649995e-06, + "loss": 0.0604, + "step": 44278 + }, + { + "epoch": 3.0005561584373304, + "grad_norm": 2.5537428855895996, + "learning_rate": 4.034808825544745e-06, + "loss": 0.0749, + "step": 44279 + }, + { + "epoch": 3.0005697232772652, + "grad_norm": 3.019890308380127, + "learning_rate": 4.03467178292449e-06, + "loss": 0.1021, + "step": 44280 + }, + { + "epoch": 3.0005832881172, + "grad_norm": 4.2300238609313965, + "learning_rate": 4.034534740304235e-06, + "loss": 0.1698, + "step": 44281 + }, + { + "epoch": 3.000596852957135, + "grad_norm": 3.0683541297912598, + "learning_rate": 4.03439769768398e-06, + "loss": 0.0706, + "step": 44282 + }, + { + "epoch": 3.00061041779707, + "grad_norm": 4.870031356811523, + "learning_rate": 4.0342606550637254e-06, + "loss": 0.1031, + "step": 44283 + }, + { + "epoch": 3.0006239826370047, + "grad_norm": 3.488882303237915, + "learning_rate": 4.03412361244347e-06, + "loss": 0.0745, + "step": 44284 + }, + { + "epoch": 3.0006375474769396, + "grad_norm": 4.514451026916504, + "learning_rate": 4.033986569823216e-06, + "loss": 0.126, + "step": 44285 + }, + { + "epoch": 3.000651112316875, + "grad_norm": 2.800699472427368, + "learning_rate": 4.03384952720296e-06, + "loss": 0.0794, + "step": 44286 + }, + { + "epoch": 3.0006646771568097, + "grad_norm": 4.281912326812744, + "learning_rate": 4.033712484582706e-06, + "loss": 0.1289, + "step": 44287 + }, + { + "epoch": 3.0006782419967446, + "grad_norm": 2.8971810340881348, + "learning_rate": 4.0335754419624505e-06, + "loss": 0.063, + "step": 44288 + }, + { + "epoch": 3.0006918068366795, + "grad_norm": 3.533565044403076, + "learning_rate": 4.033438399342196e-06, + "loss": 0.0894, + "step": 44289 + }, + { + "epoch": 3.0007053716766143, + "grad_norm": 3.77233624458313, + "learning_rate": 4.033301356721941e-06, + "loss": 0.0931, + "step": 44290 + }, + { + "epoch": 3.000718936516549, + "grad_norm": 3.032794713973999, + "learning_rate": 4.033164314101686e-06, + "loss": 0.12, + "step": 44291 + }, + { + "epoch": 3.000732501356484, + "grad_norm": 3.747850179672241, + "learning_rate": 4.033027271481431e-06, + "loss": 0.1274, + "step": 44292 + }, + { + "epoch": 3.000746066196419, + "grad_norm": 5.181124687194824, + "learning_rate": 4.032890228861176e-06, + "loss": 0.1229, + "step": 44293 + }, + { + "epoch": 3.000759631036354, + "grad_norm": 2.869312047958374, + "learning_rate": 4.0327531862409215e-06, + "loss": 0.0726, + "step": 44294 + }, + { + "epoch": 3.0007731958762887, + "grad_norm": 2.7073991298675537, + "learning_rate": 4.032616143620666e-06, + "loss": 0.1424, + "step": 44295 + }, + { + "epoch": 3.0007867607162235, + "grad_norm": 3.2728025913238525, + "learning_rate": 4.032479101000412e-06, + "loss": 0.0845, + "step": 44296 + }, + { + "epoch": 3.0008003255561584, + "grad_norm": 2.7816615104675293, + "learning_rate": 4.032342058380156e-06, + "loss": 0.1167, + "step": 44297 + }, + { + "epoch": 3.0008138903960933, + "grad_norm": 2.755387783050537, + "learning_rate": 4.032205015759902e-06, + "loss": 0.0786, + "step": 44298 + }, + { + "epoch": 3.000827455236028, + "grad_norm": 3.3861684799194336, + "learning_rate": 4.032067973139647e-06, + "loss": 0.0597, + "step": 44299 + }, + { + "epoch": 3.000841020075963, + "grad_norm": 3.248098850250244, + "learning_rate": 4.031930930519392e-06, + "loss": 0.0663, + "step": 44300 + }, + { + "epoch": 3.000854584915898, + "grad_norm": 2.3291800022125244, + "learning_rate": 4.031793887899137e-06, + "loss": 0.0585, + "step": 44301 + }, + { + "epoch": 3.0008681497558327, + "grad_norm": 3.0952351093292236, + "learning_rate": 4.031656845278882e-06, + "loss": 0.0554, + "step": 44302 + }, + { + "epoch": 3.0008817145957676, + "grad_norm": 3.75569748878479, + "learning_rate": 4.031519802658627e-06, + "loss": 0.1064, + "step": 44303 + }, + { + "epoch": 3.0008952794357024, + "grad_norm": 4.033102035522461, + "learning_rate": 4.031382760038372e-06, + "loss": 0.065, + "step": 44304 + }, + { + "epoch": 3.0009088442756378, + "grad_norm": 2.528327465057373, + "learning_rate": 4.031245717418118e-06, + "loss": 0.0744, + "step": 44305 + }, + { + "epoch": 3.0009224091155726, + "grad_norm": 1.6260945796966553, + "learning_rate": 4.031108674797862e-06, + "loss": 0.0275, + "step": 44306 + }, + { + "epoch": 3.0009359739555075, + "grad_norm": 2.837933301925659, + "learning_rate": 4.030971632177608e-06, + "loss": 0.0687, + "step": 44307 + }, + { + "epoch": 3.0009495387954424, + "grad_norm": 2.9490573406219482, + "learning_rate": 4.030834589557352e-06, + "loss": 0.0507, + "step": 44308 + }, + { + "epoch": 3.000963103635377, + "grad_norm": 2.337476968765259, + "learning_rate": 4.0306975469370975e-06, + "loss": 0.0317, + "step": 44309 + }, + { + "epoch": 3.000976668475312, + "grad_norm": 2.9179189205169678, + "learning_rate": 4.030560504316843e-06, + "loss": 0.0766, + "step": 44310 + }, + { + "epoch": 3.000990233315247, + "grad_norm": 2.86175537109375, + "learning_rate": 4.030423461696588e-06, + "loss": 0.0671, + "step": 44311 + }, + { + "epoch": 3.001003798155182, + "grad_norm": 3.059960126876831, + "learning_rate": 4.030286419076333e-06, + "loss": 0.0802, + "step": 44312 + }, + { + "epoch": 3.0010173629951167, + "grad_norm": 2.6978375911712646, + "learning_rate": 4.030149376456078e-06, + "loss": 0.0721, + "step": 44313 + }, + { + "epoch": 3.0010309278350515, + "grad_norm": 3.17789363861084, + "learning_rate": 4.0300123338358234e-06, + "loss": 0.0842, + "step": 44314 + }, + { + "epoch": 3.0010444926749864, + "grad_norm": 1.8520201444625854, + "learning_rate": 4.029875291215569e-06, + "loss": 0.0373, + "step": 44315 + }, + { + "epoch": 3.0010580575149213, + "grad_norm": 2.618117094039917, + "learning_rate": 4.029738248595314e-06, + "loss": 0.0514, + "step": 44316 + }, + { + "epoch": 3.001071622354856, + "grad_norm": 2.739739179611206, + "learning_rate": 4.029601205975058e-06, + "loss": 0.0869, + "step": 44317 + }, + { + "epoch": 3.001085187194791, + "grad_norm": 2.276292085647583, + "learning_rate": 4.029464163354804e-06, + "loss": 0.0357, + "step": 44318 + }, + { + "epoch": 3.001098752034726, + "grad_norm": 3.4849307537078857, + "learning_rate": 4.0293271207345485e-06, + "loss": 0.0691, + "step": 44319 + }, + { + "epoch": 3.0011123168746607, + "grad_norm": 2.3398351669311523, + "learning_rate": 4.029190078114294e-06, + "loss": 0.0541, + "step": 44320 + }, + { + "epoch": 3.0011258817145956, + "grad_norm": 2.821530818939209, + "learning_rate": 4.029053035494039e-06, + "loss": 0.0689, + "step": 44321 + }, + { + "epoch": 3.0011394465545305, + "grad_norm": 3.5497946739196777, + "learning_rate": 4.028915992873784e-06, + "loss": 0.1057, + "step": 44322 + }, + { + "epoch": 3.0011530113944653, + "grad_norm": 3.340871572494507, + "learning_rate": 4.028778950253529e-06, + "loss": 0.102, + "step": 44323 + }, + { + "epoch": 3.0011665762344006, + "grad_norm": 2.4072179794311523, + "learning_rate": 4.028641907633274e-06, + "loss": 0.0567, + "step": 44324 + }, + { + "epoch": 3.0011801410743355, + "grad_norm": 2.2601966857910156, + "learning_rate": 4.0285048650130196e-06, + "loss": 0.0325, + "step": 44325 + }, + { + "epoch": 3.0011937059142704, + "grad_norm": 4.250492572784424, + "learning_rate": 4.028367822392765e-06, + "loss": 0.0873, + "step": 44326 + }, + { + "epoch": 3.0012072707542052, + "grad_norm": 4.188471794128418, + "learning_rate": 4.02823077977251e-06, + "loss": 0.1157, + "step": 44327 + }, + { + "epoch": 3.00122083559414, + "grad_norm": 3.0242385864257812, + "learning_rate": 4.028093737152255e-06, + "loss": 0.0771, + "step": 44328 + }, + { + "epoch": 3.001234400434075, + "grad_norm": 2.930095911026001, + "learning_rate": 4.027956694531999e-06, + "loss": 0.0621, + "step": 44329 + }, + { + "epoch": 3.00124796527401, + "grad_norm": 3.2112510204315186, + "learning_rate": 4.027819651911745e-06, + "loss": 0.1143, + "step": 44330 + }, + { + "epoch": 3.0012615301139447, + "grad_norm": 2.104977607727051, + "learning_rate": 4.02768260929149e-06, + "loss": 0.0474, + "step": 44331 + }, + { + "epoch": 3.0012750949538796, + "grad_norm": 2.955012083053589, + "learning_rate": 4.027545566671235e-06, + "loss": 0.1015, + "step": 44332 + }, + { + "epoch": 3.0012886597938144, + "grad_norm": 3.6079351902008057, + "learning_rate": 4.02740852405098e-06, + "loss": 0.1687, + "step": 44333 + }, + { + "epoch": 3.0013022246337493, + "grad_norm": 3.5799472332000732, + "learning_rate": 4.027271481430725e-06, + "loss": 0.1177, + "step": 44334 + }, + { + "epoch": 3.001315789473684, + "grad_norm": 4.8935699462890625, + "learning_rate": 4.0271344388104705e-06, + "loss": 0.1354, + "step": 44335 + }, + { + "epoch": 3.001329354313619, + "grad_norm": 3.801814556121826, + "learning_rate": 4.026997396190216e-06, + "loss": 0.0952, + "step": 44336 + }, + { + "epoch": 3.001342919153554, + "grad_norm": 3.7047386169433594, + "learning_rate": 4.026860353569961e-06, + "loss": 0.0648, + "step": 44337 + }, + { + "epoch": 3.0013564839934888, + "grad_norm": 3.338446855545044, + "learning_rate": 4.026723310949705e-06, + "loss": 0.0969, + "step": 44338 + }, + { + "epoch": 3.0013700488334236, + "grad_norm": 2.298218011856079, + "learning_rate": 4.026586268329451e-06, + "loss": 0.0543, + "step": 44339 + }, + { + "epoch": 3.0013836136733585, + "grad_norm": 2.352588653564453, + "learning_rate": 4.0264492257091955e-06, + "loss": 0.0646, + "step": 44340 + }, + { + "epoch": 3.0013971785132934, + "grad_norm": 2.985311508178711, + "learning_rate": 4.0263121830889416e-06, + "loss": 0.0699, + "step": 44341 + }, + { + "epoch": 3.001410743353228, + "grad_norm": 2.6299235820770264, + "learning_rate": 4.026175140468686e-06, + "loss": 0.078, + "step": 44342 + }, + { + "epoch": 3.0014243081931635, + "grad_norm": 3.3066930770874023, + "learning_rate": 4.026038097848431e-06, + "loss": 0.1072, + "step": 44343 + }, + { + "epoch": 3.0014378730330984, + "grad_norm": 4.6342291831970215, + "learning_rate": 4.025901055228176e-06, + "loss": 0.1261, + "step": 44344 + }, + { + "epoch": 3.0014514378730333, + "grad_norm": 2.8531031608581543, + "learning_rate": 4.0257640126079214e-06, + "loss": 0.0629, + "step": 44345 + }, + { + "epoch": 3.001465002712968, + "grad_norm": 4.5342326164245605, + "learning_rate": 4.025626969987667e-06, + "loss": 0.1625, + "step": 44346 + }, + { + "epoch": 3.001478567552903, + "grad_norm": 3.7685976028442383, + "learning_rate": 4.025489927367412e-06, + "loss": 0.1309, + "step": 44347 + }, + { + "epoch": 3.001492132392838, + "grad_norm": 3.851292371749878, + "learning_rate": 4.025352884747157e-06, + "loss": 0.0844, + "step": 44348 + }, + { + "epoch": 3.0015056972327727, + "grad_norm": 4.219143867492676, + "learning_rate": 4.025215842126901e-06, + "loss": 0.1094, + "step": 44349 + }, + { + "epoch": 3.0015192620727076, + "grad_norm": 3.4357619285583496, + "learning_rate": 4.025078799506647e-06, + "loss": 0.0875, + "step": 44350 + }, + { + "epoch": 3.0015328269126424, + "grad_norm": 2.5969226360321045, + "learning_rate": 4.024941756886392e-06, + "loss": 0.0694, + "step": 44351 + }, + { + "epoch": 3.0015463917525773, + "grad_norm": 2.910712242126465, + "learning_rate": 4.024804714266138e-06, + "loss": 0.06, + "step": 44352 + }, + { + "epoch": 3.001559956592512, + "grad_norm": 4.190856456756592, + "learning_rate": 4.024667671645882e-06, + "loss": 0.0892, + "step": 44353 + }, + { + "epoch": 3.001573521432447, + "grad_norm": 4.6688408851623535, + "learning_rate": 4.024530629025627e-06, + "loss": 0.1646, + "step": 44354 + }, + { + "epoch": 3.001587086272382, + "grad_norm": 3.9962944984436035, + "learning_rate": 4.024393586405372e-06, + "loss": 0.0874, + "step": 44355 + }, + { + "epoch": 3.0016006511123168, + "grad_norm": 4.131712913513184, + "learning_rate": 4.0242565437851176e-06, + "loss": 0.1794, + "step": 44356 + }, + { + "epoch": 3.0016142159522516, + "grad_norm": 2.7848730087280273, + "learning_rate": 4.024119501164863e-06, + "loss": 0.0823, + "step": 44357 + }, + { + "epoch": 3.0016277807921865, + "grad_norm": 4.560518741607666, + "learning_rate": 4.023982458544607e-06, + "loss": 0.0993, + "step": 44358 + }, + { + "epoch": 3.0016413456321214, + "grad_norm": 3.8166074752807617, + "learning_rate": 4.023845415924353e-06, + "loss": 0.0705, + "step": 44359 + }, + { + "epoch": 3.0016549104720562, + "grad_norm": 4.8293561935424805, + "learning_rate": 4.0237083733040974e-06, + "loss": 0.1269, + "step": 44360 + }, + { + "epoch": 3.001668475311991, + "grad_norm": 2.8844785690307617, + "learning_rate": 4.0235713306838435e-06, + "loss": 0.0975, + "step": 44361 + }, + { + "epoch": 3.0016820401519264, + "grad_norm": 4.145874500274658, + "learning_rate": 4.023434288063588e-06, + "loss": 0.147, + "step": 44362 + }, + { + "epoch": 3.0016956049918613, + "grad_norm": 2.858937978744507, + "learning_rate": 4.023297245443333e-06, + "loss": 0.062, + "step": 44363 + }, + { + "epoch": 3.001709169831796, + "grad_norm": 6.578452110290527, + "learning_rate": 4.023160202823078e-06, + "loss": 0.1848, + "step": 44364 + }, + { + "epoch": 3.001722734671731, + "grad_norm": 2.647085189819336, + "learning_rate": 4.023023160202823e-06, + "loss": 0.0979, + "step": 44365 + }, + { + "epoch": 3.001736299511666, + "grad_norm": 2.930382013320923, + "learning_rate": 4.0228861175825685e-06, + "loss": 0.0591, + "step": 44366 + }, + { + "epoch": 3.0017498643516007, + "grad_norm": 3.7898411750793457, + "learning_rate": 4.022749074962314e-06, + "loss": 0.1111, + "step": 44367 + }, + { + "epoch": 3.0017634291915356, + "grad_norm": 3.8968286514282227, + "learning_rate": 4.022612032342059e-06, + "loss": 0.2046, + "step": 44368 + }, + { + "epoch": 3.0017769940314705, + "grad_norm": 3.5024425983428955, + "learning_rate": 4.022474989721804e-06, + "loss": 0.084, + "step": 44369 + }, + { + "epoch": 3.0017905588714053, + "grad_norm": 3.411479949951172, + "learning_rate": 4.022337947101549e-06, + "loss": 0.0636, + "step": 44370 + }, + { + "epoch": 3.00180412371134, + "grad_norm": 4.425986289978027, + "learning_rate": 4.0222009044812935e-06, + "loss": 0.1194, + "step": 44371 + }, + { + "epoch": 3.001817688551275, + "grad_norm": 2.289661169052124, + "learning_rate": 4.022063861861039e-06, + "loss": 0.0785, + "step": 44372 + }, + { + "epoch": 3.00183125339121, + "grad_norm": 5.300210475921631, + "learning_rate": 4.021926819240784e-06, + "loss": 0.1357, + "step": 44373 + }, + { + "epoch": 3.001844818231145, + "grad_norm": 4.107512950897217, + "learning_rate": 4.021789776620529e-06, + "loss": 0.1384, + "step": 44374 + }, + { + "epoch": 3.0018583830710797, + "grad_norm": 4.216986656188965, + "learning_rate": 4.021652734000274e-06, + "loss": 0.1175, + "step": 44375 + }, + { + "epoch": 3.0018719479110145, + "grad_norm": 3.552049398422241, + "learning_rate": 4.0215156913800194e-06, + "loss": 0.0927, + "step": 44376 + }, + { + "epoch": 3.0018855127509494, + "grad_norm": 3.262809991836548, + "learning_rate": 4.021378648759765e-06, + "loss": 0.1074, + "step": 44377 + }, + { + "epoch": 3.0018990775908843, + "grad_norm": 5.299243450164795, + "learning_rate": 4.02124160613951e-06, + "loss": 0.1587, + "step": 44378 + }, + { + "epoch": 3.001912642430819, + "grad_norm": 4.733795166015625, + "learning_rate": 4.021104563519255e-06, + "loss": 0.1918, + "step": 44379 + }, + { + "epoch": 3.001926207270754, + "grad_norm": 3.1824564933776855, + "learning_rate": 4.020967520899e-06, + "loss": 0.076, + "step": 44380 + }, + { + "epoch": 3.0019397721106893, + "grad_norm": 5.232369422912598, + "learning_rate": 4.020830478278745e-06, + "loss": 0.149, + "step": 44381 + }, + { + "epoch": 3.001953336950624, + "grad_norm": 3.604431629180908, + "learning_rate": 4.0206934356584905e-06, + "loss": 0.0743, + "step": 44382 + }, + { + "epoch": 3.001966901790559, + "grad_norm": 3.8181393146514893, + "learning_rate": 4.020556393038235e-06, + "loss": 0.1027, + "step": 44383 + }, + { + "epoch": 3.001980466630494, + "grad_norm": 3.7380759716033936, + "learning_rate": 4.020419350417981e-06, + "loss": 0.0631, + "step": 44384 + }, + { + "epoch": 3.0019940314704288, + "grad_norm": 4.3673200607299805, + "learning_rate": 4.020282307797725e-06, + "loss": 0.1122, + "step": 44385 + }, + { + "epoch": 3.0020075963103636, + "grad_norm": 4.607533931732178, + "learning_rate": 4.02014526517747e-06, + "loss": 0.0993, + "step": 44386 + }, + { + "epoch": 3.0020211611502985, + "grad_norm": 2.655197858810425, + "learning_rate": 4.0200082225572156e-06, + "loss": 0.0557, + "step": 44387 + }, + { + "epoch": 3.0020347259902334, + "grad_norm": 3.7152137756347656, + "learning_rate": 4.019871179936961e-06, + "loss": 0.135, + "step": 44388 + }, + { + "epoch": 3.002048290830168, + "grad_norm": 4.293601989746094, + "learning_rate": 4.019734137316706e-06, + "loss": 0.0992, + "step": 44389 + }, + { + "epoch": 3.002061855670103, + "grad_norm": 9.389205932617188, + "learning_rate": 4.019597094696451e-06, + "loss": 0.3075, + "step": 44390 + }, + { + "epoch": 3.002075420510038, + "grad_norm": 3.6322669982910156, + "learning_rate": 4.019460052076196e-06, + "loss": 0.0991, + "step": 44391 + }, + { + "epoch": 3.002088985349973, + "grad_norm": 4.099528789520264, + "learning_rate": 4.019323009455941e-06, + "loss": 0.1309, + "step": 44392 + }, + { + "epoch": 3.0021025501899077, + "grad_norm": 3.0399978160858154, + "learning_rate": 4.019185966835687e-06, + "loss": 0.0596, + "step": 44393 + }, + { + "epoch": 3.0021161150298425, + "grad_norm": 4.876704216003418, + "learning_rate": 4.019048924215431e-06, + "loss": 0.1494, + "step": 44394 + }, + { + "epoch": 3.0021296798697774, + "grad_norm": 4.887241840362549, + "learning_rate": 4.018911881595177e-06, + "loss": 0.1445, + "step": 44395 + }, + { + "epoch": 3.0021432447097123, + "grad_norm": 4.018755912780762, + "learning_rate": 4.018774838974921e-06, + "loss": 0.0877, + "step": 44396 + }, + { + "epoch": 3.002156809549647, + "grad_norm": 4.7261433601379395, + "learning_rate": 4.0186377963546665e-06, + "loss": 0.1203, + "step": 44397 + }, + { + "epoch": 3.002170374389582, + "grad_norm": 4.307555198669434, + "learning_rate": 4.018500753734412e-06, + "loss": 0.1269, + "step": 44398 + }, + { + "epoch": 3.002183939229517, + "grad_norm": 3.0077931880950928, + "learning_rate": 4.018363711114157e-06, + "loss": 0.0958, + "step": 44399 + }, + { + "epoch": 3.002197504069452, + "grad_norm": 4.363842487335205, + "learning_rate": 4.018226668493902e-06, + "loss": 0.124, + "step": 44400 + }, + { + "epoch": 3.002211068909387, + "grad_norm": 4.737562656402588, + "learning_rate": 4.018089625873647e-06, + "loss": 0.2267, + "step": 44401 + }, + { + "epoch": 3.002224633749322, + "grad_norm": 5.26780366897583, + "learning_rate": 4.017952583253392e-06, + "loss": 0.2019, + "step": 44402 + }, + { + "epoch": 3.002238198589257, + "grad_norm": 4.009848117828369, + "learning_rate": 4.017815540633137e-06, + "loss": 0.1327, + "step": 44403 + }, + { + "epoch": 3.0022517634291916, + "grad_norm": 3.8827550411224365, + "learning_rate": 4.017678498012883e-06, + "loss": 0.1488, + "step": 44404 + }, + { + "epoch": 3.0022653282691265, + "grad_norm": 3.898033380508423, + "learning_rate": 4.017541455392627e-06, + "loss": 0.0966, + "step": 44405 + }, + { + "epoch": 3.0022788931090614, + "grad_norm": 5.079492568969727, + "learning_rate": 4.017404412772373e-06, + "loss": 0.1421, + "step": 44406 + }, + { + "epoch": 3.0022924579489962, + "grad_norm": 3.8790414333343506, + "learning_rate": 4.0172673701521174e-06, + "loss": 0.1468, + "step": 44407 + }, + { + "epoch": 3.002306022788931, + "grad_norm": 6.418661117553711, + "learning_rate": 4.017130327531863e-06, + "loss": 0.1798, + "step": 44408 + }, + { + "epoch": 3.002319587628866, + "grad_norm": 3.511122226715088, + "learning_rate": 4.016993284911608e-06, + "loss": 0.1195, + "step": 44409 + }, + { + "epoch": 3.002333152468801, + "grad_norm": 5.390814781188965, + "learning_rate": 4.016856242291353e-06, + "loss": 0.1752, + "step": 44410 + }, + { + "epoch": 3.0023467173087357, + "grad_norm": 4.291254997253418, + "learning_rate": 4.016719199671098e-06, + "loss": 0.0959, + "step": 44411 + }, + { + "epoch": 3.0023602821486706, + "grad_norm": 2.7811694145202637, + "learning_rate": 4.0165821570508425e-06, + "loss": 0.0516, + "step": 44412 + }, + { + "epoch": 3.0023738469886054, + "grad_norm": 4.680515289306641, + "learning_rate": 4.0164451144305885e-06, + "loss": 0.0939, + "step": 44413 + }, + { + "epoch": 3.0023874118285403, + "grad_norm": 6.281663417816162, + "learning_rate": 4.016308071810333e-06, + "loss": 0.1723, + "step": 44414 + }, + { + "epoch": 3.002400976668475, + "grad_norm": 4.751073360443115, + "learning_rate": 4.016171029190079e-06, + "loss": 0.1338, + "step": 44415 + }, + { + "epoch": 3.00241454150841, + "grad_norm": 5.753815650939941, + "learning_rate": 4.016033986569823e-06, + "loss": 0.1848, + "step": 44416 + }, + { + "epoch": 3.002428106348345, + "grad_norm": 4.335049629211426, + "learning_rate": 4.015896943949568e-06, + "loss": 0.1062, + "step": 44417 + }, + { + "epoch": 3.0024416711882798, + "grad_norm": 3.9944334030151367, + "learning_rate": 4.0157599013293136e-06, + "loss": 0.0937, + "step": 44418 + }, + { + "epoch": 3.002455236028215, + "grad_norm": 3.9635095596313477, + "learning_rate": 4.015622858709059e-06, + "loss": 0.0995, + "step": 44419 + }, + { + "epoch": 3.00246880086815, + "grad_norm": 4.392760276794434, + "learning_rate": 4.015485816088804e-06, + "loss": 0.0942, + "step": 44420 + }, + { + "epoch": 3.002482365708085, + "grad_norm": 3.8845043182373047, + "learning_rate": 4.015348773468549e-06, + "loss": 0.1344, + "step": 44421 + }, + { + "epoch": 3.0024959305480197, + "grad_norm": 4.812009811401367, + "learning_rate": 4.015211730848294e-06, + "loss": 0.1048, + "step": 44422 + }, + { + "epoch": 3.0025094953879545, + "grad_norm": 2.582866668701172, + "learning_rate": 4.0150746882280395e-06, + "loss": 0.0581, + "step": 44423 + }, + { + "epoch": 3.0025230602278894, + "grad_norm": 4.636440277099609, + "learning_rate": 4.014937645607785e-06, + "loss": 0.1183, + "step": 44424 + }, + { + "epoch": 3.0025366250678243, + "grad_norm": 3.2011513710021973, + "learning_rate": 4.01480060298753e-06, + "loss": 0.0579, + "step": 44425 + }, + { + "epoch": 3.002550189907759, + "grad_norm": 4.488195419311523, + "learning_rate": 4.014663560367274e-06, + "loss": 0.1358, + "step": 44426 + }, + { + "epoch": 3.002563754747694, + "grad_norm": 4.625016689300537, + "learning_rate": 4.014526517747019e-06, + "loss": 0.1551, + "step": 44427 + }, + { + "epoch": 3.002577319587629, + "grad_norm": 4.56710958480835, + "learning_rate": 4.0143894751267645e-06, + "loss": 0.1266, + "step": 44428 + }, + { + "epoch": 3.0025908844275637, + "grad_norm": 2.5157577991485596, + "learning_rate": 4.01425243250651e-06, + "loss": 0.0538, + "step": 44429 + }, + { + "epoch": 3.0026044492674986, + "grad_norm": 3.588477373123169, + "learning_rate": 4.014115389886255e-06, + "loss": 0.0928, + "step": 44430 + }, + { + "epoch": 3.0026180141074335, + "grad_norm": 4.030426502227783, + "learning_rate": 4.013978347266e-06, + "loss": 0.0935, + "step": 44431 + }, + { + "epoch": 3.0026315789473683, + "grad_norm": 4.186124324798584, + "learning_rate": 4.013841304645745e-06, + "loss": 0.0841, + "step": 44432 + }, + { + "epoch": 3.002645143787303, + "grad_norm": 3.487610340118408, + "learning_rate": 4.01370426202549e-06, + "loss": 0.1232, + "step": 44433 + }, + { + "epoch": 3.002658708627238, + "grad_norm": 4.878418445587158, + "learning_rate": 4.013567219405236e-06, + "loss": 0.0945, + "step": 44434 + }, + { + "epoch": 3.002672273467173, + "grad_norm": 3.2001383304595947, + "learning_rate": 4.013430176784981e-06, + "loss": 0.0872, + "step": 44435 + }, + { + "epoch": 3.002685838307108, + "grad_norm": 4.631882190704346, + "learning_rate": 4.013293134164726e-06, + "loss": 0.1591, + "step": 44436 + }, + { + "epoch": 3.0026994031470426, + "grad_norm": 5.620147705078125, + "learning_rate": 4.01315609154447e-06, + "loss": 0.1351, + "step": 44437 + }, + { + "epoch": 3.002712967986978, + "grad_norm": 4.1893181800842285, + "learning_rate": 4.013019048924216e-06, + "loss": 0.1112, + "step": 44438 + }, + { + "epoch": 3.002726532826913, + "grad_norm": 4.968924522399902, + "learning_rate": 4.012882006303961e-06, + "loss": 0.1459, + "step": 44439 + }, + { + "epoch": 3.0027400976668477, + "grad_norm": 3.1582045555114746, + "learning_rate": 4.012744963683706e-06, + "loss": 0.0808, + "step": 44440 + }, + { + "epoch": 3.0027536625067826, + "grad_norm": 3.7344462871551514, + "learning_rate": 4.012607921063451e-06, + "loss": 0.0984, + "step": 44441 + }, + { + "epoch": 3.0027672273467174, + "grad_norm": 3.3948476314544678, + "learning_rate": 4.012470878443196e-06, + "loss": 0.0759, + "step": 44442 + }, + { + "epoch": 3.0027807921866523, + "grad_norm": 7.202311992645264, + "learning_rate": 4.012333835822941e-06, + "loss": 0.2009, + "step": 44443 + }, + { + "epoch": 3.002794357026587, + "grad_norm": 4.272679328918457, + "learning_rate": 4.0121967932026865e-06, + "loss": 0.081, + "step": 44444 + }, + { + "epoch": 3.002807921866522, + "grad_norm": 3.0671277046203613, + "learning_rate": 4.012059750582432e-06, + "loss": 0.0674, + "step": 44445 + }, + { + "epoch": 3.002821486706457, + "grad_norm": 2.6102445125579834, + "learning_rate": 4.011922707962176e-06, + "loss": 0.054, + "step": 44446 + }, + { + "epoch": 3.0028350515463917, + "grad_norm": 2.3518178462982178, + "learning_rate": 4.011785665341922e-06, + "loss": 0.0295, + "step": 44447 + }, + { + "epoch": 3.0028486163863266, + "grad_norm": 3.9582462310791016, + "learning_rate": 4.011648622721666e-06, + "loss": 0.1271, + "step": 44448 + }, + { + "epoch": 3.0028621812262615, + "grad_norm": 3.1069891452789307, + "learning_rate": 4.011511580101412e-06, + "loss": 0.0482, + "step": 44449 + }, + { + "epoch": 3.0028757460661963, + "grad_norm": 4.1555280685424805, + "learning_rate": 4.011374537481157e-06, + "loss": 0.1121, + "step": 44450 + }, + { + "epoch": 3.002889310906131, + "grad_norm": 4.111544132232666, + "learning_rate": 4.011237494860902e-06, + "loss": 0.1055, + "step": 44451 + }, + { + "epoch": 3.002902875746066, + "grad_norm": 2.472923517227173, + "learning_rate": 4.011100452240647e-06, + "loss": 0.0524, + "step": 44452 + }, + { + "epoch": 3.002916440586001, + "grad_norm": 4.033140182495117, + "learning_rate": 4.010963409620392e-06, + "loss": 0.1035, + "step": 44453 + }, + { + "epoch": 3.002930005425936, + "grad_norm": 3.028045892715454, + "learning_rate": 4.0108263670001375e-06, + "loss": 0.0594, + "step": 44454 + }, + { + "epoch": 3.0029435702658707, + "grad_norm": 4.56050443649292, + "learning_rate": 4.010689324379883e-06, + "loss": 0.1499, + "step": 44455 + }, + { + "epoch": 3.0029571351058055, + "grad_norm": 4.023608207702637, + "learning_rate": 4.010552281759628e-06, + "loss": 0.0537, + "step": 44456 + }, + { + "epoch": 3.002970699945741, + "grad_norm": 4.313004016876221, + "learning_rate": 4.010415239139372e-06, + "loss": 0.1505, + "step": 44457 + }, + { + "epoch": 3.0029842647856757, + "grad_norm": 3.9927961826324463, + "learning_rate": 4.010278196519118e-06, + "loss": 0.1088, + "step": 44458 + }, + { + "epoch": 3.0029978296256106, + "grad_norm": 3.9849376678466797, + "learning_rate": 4.0101411538988625e-06, + "loss": 0.1356, + "step": 44459 + }, + { + "epoch": 3.0030113944655454, + "grad_norm": 1.9180498123168945, + "learning_rate": 4.0100041112786085e-06, + "loss": 0.0592, + "step": 44460 + }, + { + "epoch": 3.0030249593054803, + "grad_norm": 2.91998028755188, + "learning_rate": 4.009867068658353e-06, + "loss": 0.1073, + "step": 44461 + }, + { + "epoch": 3.003038524145415, + "grad_norm": 2.85426926612854, + "learning_rate": 4.009730026038098e-06, + "loss": 0.0626, + "step": 44462 + }, + { + "epoch": 3.00305208898535, + "grad_norm": 3.1594507694244385, + "learning_rate": 4.009592983417843e-06, + "loss": 0.0818, + "step": 44463 + }, + { + "epoch": 3.003065653825285, + "grad_norm": 3.942349672317505, + "learning_rate": 4.009455940797588e-06, + "loss": 0.1065, + "step": 44464 + }, + { + "epoch": 3.0030792186652198, + "grad_norm": 3.3864526748657227, + "learning_rate": 4.009318898177334e-06, + "loss": 0.0699, + "step": 44465 + }, + { + "epoch": 3.0030927835051546, + "grad_norm": 2.429861307144165, + "learning_rate": 4.009181855557079e-06, + "loss": 0.0355, + "step": 44466 + }, + { + "epoch": 3.0031063483450895, + "grad_norm": 3.4298458099365234, + "learning_rate": 4.009044812936824e-06, + "loss": 0.1283, + "step": 44467 + }, + { + "epoch": 3.0031199131850244, + "grad_norm": 3.5497820377349854, + "learning_rate": 4.008907770316568e-06, + "loss": 0.0913, + "step": 44468 + }, + { + "epoch": 3.0031334780249592, + "grad_norm": 3.181030750274658, + "learning_rate": 4.008770727696314e-06, + "loss": 0.0489, + "step": 44469 + }, + { + "epoch": 3.003147042864894, + "grad_norm": 4.304847240447998, + "learning_rate": 4.008633685076059e-06, + "loss": 0.1389, + "step": 44470 + }, + { + "epoch": 3.003160607704829, + "grad_norm": 2.1034529209136963, + "learning_rate": 4.008496642455804e-06, + "loss": 0.0746, + "step": 44471 + }, + { + "epoch": 3.003174172544764, + "grad_norm": 3.81816029548645, + "learning_rate": 4.008359599835549e-06, + "loss": 0.1155, + "step": 44472 + }, + { + "epoch": 3.0031877373846987, + "grad_norm": 9.430898666381836, + "learning_rate": 4.008222557215294e-06, + "loss": 0.1135, + "step": 44473 + }, + { + "epoch": 3.0032013022246336, + "grad_norm": 3.486621856689453, + "learning_rate": 4.008085514595039e-06, + "loss": 0.1246, + "step": 44474 + }, + { + "epoch": 3.0032148670645684, + "grad_norm": 3.114344835281372, + "learning_rate": 4.0079484719747845e-06, + "loss": 0.0531, + "step": 44475 + }, + { + "epoch": 3.0032284319045037, + "grad_norm": 2.8247876167297363, + "learning_rate": 4.00781142935453e-06, + "loss": 0.0846, + "step": 44476 + }, + { + "epoch": 3.0032419967444386, + "grad_norm": 4.331211090087891, + "learning_rate": 4.007674386734275e-06, + "loss": 0.1052, + "step": 44477 + }, + { + "epoch": 3.0032555615843735, + "grad_norm": 4.596597194671631, + "learning_rate": 4.00753734411402e-06, + "loss": 0.1131, + "step": 44478 + }, + { + "epoch": 3.0032691264243083, + "grad_norm": 3.426215887069702, + "learning_rate": 4.007400301493765e-06, + "loss": 0.0881, + "step": 44479 + }, + { + "epoch": 3.003282691264243, + "grad_norm": 2.5886104106903076, + "learning_rate": 4.0072632588735096e-06, + "loss": 0.0419, + "step": 44480 + }, + { + "epoch": 3.003296256104178, + "grad_norm": 4.145650386810303, + "learning_rate": 4.007126216253255e-06, + "loss": 0.1959, + "step": 44481 + }, + { + "epoch": 3.003309820944113, + "grad_norm": 4.593999862670898, + "learning_rate": 4.006989173633e-06, + "loss": 0.1453, + "step": 44482 + }, + { + "epoch": 3.003323385784048, + "grad_norm": 3.6943318843841553, + "learning_rate": 4.006852131012745e-06, + "loss": 0.0853, + "step": 44483 + }, + { + "epoch": 3.0033369506239826, + "grad_norm": 4.131954193115234, + "learning_rate": 4.00671508839249e-06, + "loss": 0.1285, + "step": 44484 + }, + { + "epoch": 3.0033505154639175, + "grad_norm": 4.402050971984863, + "learning_rate": 4.0065780457722355e-06, + "loss": 0.1204, + "step": 44485 + }, + { + "epoch": 3.0033640803038524, + "grad_norm": 3.8816850185394287, + "learning_rate": 4.006441003151981e-06, + "loss": 0.0953, + "step": 44486 + }, + { + "epoch": 3.0033776451437872, + "grad_norm": 2.847243070602417, + "learning_rate": 4.006303960531726e-06, + "loss": 0.0453, + "step": 44487 + }, + { + "epoch": 3.003391209983722, + "grad_norm": 3.6382946968078613, + "learning_rate": 4.006166917911471e-06, + "loss": 0.122, + "step": 44488 + }, + { + "epoch": 3.003404774823657, + "grad_norm": 3.3043816089630127, + "learning_rate": 4.006029875291216e-06, + "loss": 0.0827, + "step": 44489 + }, + { + "epoch": 3.003418339663592, + "grad_norm": 2.660144090652466, + "learning_rate": 4.005892832670961e-06, + "loss": 0.0928, + "step": 44490 + }, + { + "epoch": 3.0034319045035267, + "grad_norm": 5.136814117431641, + "learning_rate": 4.005755790050706e-06, + "loss": 0.1897, + "step": 44491 + }, + { + "epoch": 3.0034454693434616, + "grad_norm": 4.358012676239014, + "learning_rate": 4.005618747430452e-06, + "loss": 0.1753, + "step": 44492 + }, + { + "epoch": 3.0034590341833964, + "grad_norm": 3.268289089202881, + "learning_rate": 4.005481704810196e-06, + "loss": 0.1103, + "step": 44493 + }, + { + "epoch": 3.0034725990233317, + "grad_norm": 3.461250066757202, + "learning_rate": 4.005344662189942e-06, + "loss": 0.0784, + "step": 44494 + }, + { + "epoch": 3.0034861638632666, + "grad_norm": 4.094456672668457, + "learning_rate": 4.005207619569686e-06, + "loss": 0.0875, + "step": 44495 + }, + { + "epoch": 3.0034997287032015, + "grad_norm": 4.225960731506348, + "learning_rate": 4.005070576949432e-06, + "loss": 0.116, + "step": 44496 + }, + { + "epoch": 3.0035132935431363, + "grad_norm": 5.407227993011475, + "learning_rate": 4.004933534329177e-06, + "loss": 0.2438, + "step": 44497 + }, + { + "epoch": 3.003526858383071, + "grad_norm": 6.329638481140137, + "learning_rate": 4.004796491708922e-06, + "loss": 0.2447, + "step": 44498 + }, + { + "epoch": 3.003540423223006, + "grad_norm": 5.425896167755127, + "learning_rate": 4.004659449088667e-06, + "loss": 0.1658, + "step": 44499 + }, + { + "epoch": 3.003553988062941, + "grad_norm": 3.4262185096740723, + "learning_rate": 4.0045224064684115e-06, + "loss": 0.1143, + "step": 44500 + }, + { + "epoch": 3.003567552902876, + "grad_norm": 4.057361602783203, + "learning_rate": 4.0043853638481575e-06, + "loss": 0.0993, + "step": 44501 + }, + { + "epoch": 3.0035811177428107, + "grad_norm": 4.220065593719482, + "learning_rate": 4.004248321227902e-06, + "loss": 0.1308, + "step": 44502 + }, + { + "epoch": 3.0035946825827455, + "grad_norm": 4.599380970001221, + "learning_rate": 4.004111278607648e-06, + "loss": 0.138, + "step": 44503 + }, + { + "epoch": 3.0036082474226804, + "grad_norm": 5.105267524719238, + "learning_rate": 4.003974235987392e-06, + "loss": 0.2572, + "step": 44504 + }, + { + "epoch": 3.0036218122626153, + "grad_norm": 5.224923610687256, + "learning_rate": 4.003837193367137e-06, + "loss": 0.1653, + "step": 44505 + }, + { + "epoch": 3.00363537710255, + "grad_norm": 4.402682781219482, + "learning_rate": 4.0037001507468825e-06, + "loss": 0.126, + "step": 44506 + }, + { + "epoch": 3.003648941942485, + "grad_norm": 4.488640785217285, + "learning_rate": 4.003563108126628e-06, + "loss": 0.1007, + "step": 44507 + }, + { + "epoch": 3.00366250678242, + "grad_norm": 4.783355236053467, + "learning_rate": 4.003426065506373e-06, + "loss": 0.1249, + "step": 44508 + }, + { + "epoch": 3.0036760716223547, + "grad_norm": 3.5298688411712646, + "learning_rate": 4.003289022886118e-06, + "loss": 0.1427, + "step": 44509 + }, + { + "epoch": 3.0036896364622896, + "grad_norm": 4.593191146850586, + "learning_rate": 4.003151980265863e-06, + "loss": 0.1356, + "step": 44510 + }, + { + "epoch": 3.0037032013022245, + "grad_norm": 3.2963600158691406, + "learning_rate": 4.003014937645608e-06, + "loss": 0.1197, + "step": 44511 + }, + { + "epoch": 3.0037167661421593, + "grad_norm": 4.768101215362549, + "learning_rate": 4.002877895025354e-06, + "loss": 0.1135, + "step": 44512 + }, + { + "epoch": 3.0037303309820946, + "grad_norm": 4.839300155639648, + "learning_rate": 4.002740852405098e-06, + "loss": 0.1689, + "step": 44513 + }, + { + "epoch": 3.0037438958220295, + "grad_norm": 4.377809524536133, + "learning_rate": 4.002603809784843e-06, + "loss": 0.0641, + "step": 44514 + }, + { + "epoch": 3.0037574606619644, + "grad_norm": 2.9890880584716797, + "learning_rate": 4.002466767164588e-06, + "loss": 0.0641, + "step": 44515 + }, + { + "epoch": 3.0037710255018992, + "grad_norm": 4.404369354248047, + "learning_rate": 4.0023297245443335e-06, + "loss": 0.1331, + "step": 44516 + }, + { + "epoch": 3.003784590341834, + "grad_norm": 3.8701963424682617, + "learning_rate": 4.002192681924079e-06, + "loss": 0.1256, + "step": 44517 + }, + { + "epoch": 3.003798155181769, + "grad_norm": 4.761359691619873, + "learning_rate": 4.002055639303824e-06, + "loss": 0.1515, + "step": 44518 + }, + { + "epoch": 3.003811720021704, + "grad_norm": 4.501156806945801, + "learning_rate": 4.001918596683569e-06, + "loss": 0.1329, + "step": 44519 + }, + { + "epoch": 3.0038252848616387, + "grad_norm": 7.39921760559082, + "learning_rate": 4.001781554063314e-06, + "loss": 0.177, + "step": 44520 + }, + { + "epoch": 3.0038388497015736, + "grad_norm": 3.356510877609253, + "learning_rate": 4.001644511443059e-06, + "loss": 0.0861, + "step": 44521 + }, + { + "epoch": 3.0038524145415084, + "grad_norm": 4.981637954711914, + "learning_rate": 4.001507468822804e-06, + "loss": 0.1947, + "step": 44522 + }, + { + "epoch": 3.0038659793814433, + "grad_norm": 3.667701005935669, + "learning_rate": 4.00137042620255e-06, + "loss": 0.1201, + "step": 44523 + }, + { + "epoch": 3.003879544221378, + "grad_norm": 3.2219114303588867, + "learning_rate": 4.001233383582294e-06, + "loss": 0.105, + "step": 44524 + }, + { + "epoch": 3.003893109061313, + "grad_norm": 3.7348859310150146, + "learning_rate": 4.001096340962039e-06, + "loss": 0.1046, + "step": 44525 + }, + { + "epoch": 3.003906673901248, + "grad_norm": 5.118208408355713, + "learning_rate": 4.000959298341784e-06, + "loss": 0.1794, + "step": 44526 + }, + { + "epoch": 3.0039202387411827, + "grad_norm": 3.9217472076416016, + "learning_rate": 4.00082225572153e-06, + "loss": 0.0936, + "step": 44527 + }, + { + "epoch": 3.0039338035811176, + "grad_norm": 5.631860733032227, + "learning_rate": 4.000685213101275e-06, + "loss": 0.1002, + "step": 44528 + }, + { + "epoch": 3.0039473684210525, + "grad_norm": 3.99636173248291, + "learning_rate": 4.00054817048102e-06, + "loss": 0.1475, + "step": 44529 + }, + { + "epoch": 3.0039609332609873, + "grad_norm": 3.747349262237549, + "learning_rate": 4.000411127860765e-06, + "loss": 0.1733, + "step": 44530 + }, + { + "epoch": 3.003974498100922, + "grad_norm": 3.4266724586486816, + "learning_rate": 4.00027408524051e-06, + "loss": 0.1139, + "step": 44531 + }, + { + "epoch": 3.0039880629408575, + "grad_norm": 4.143014907836914, + "learning_rate": 4.0001370426202555e-06, + "loss": 0.0952, + "step": 44532 + }, + { + "epoch": 3.0040016277807924, + "grad_norm": 3.5992984771728516, + "learning_rate": 4.000000000000001e-06, + "loss": 0.1262, + "step": 44533 + }, + { + "epoch": 3.0040151926207272, + "grad_norm": 5.59938383102417, + "learning_rate": 3.999862957379745e-06, + "loss": 0.2092, + "step": 44534 + }, + { + "epoch": 3.004028757460662, + "grad_norm": 3.8256289958953857, + "learning_rate": 3.999725914759491e-06, + "loss": 0.1431, + "step": 44535 + }, + { + "epoch": 3.004042322300597, + "grad_norm": 6.250509738922119, + "learning_rate": 3.999588872139235e-06, + "loss": 0.2066, + "step": 44536 + }, + { + "epoch": 3.004055887140532, + "grad_norm": 4.773327827453613, + "learning_rate": 3.9994518295189805e-06, + "loss": 0.123, + "step": 44537 + }, + { + "epoch": 3.0040694519804667, + "grad_norm": 4.867129325866699, + "learning_rate": 3.999314786898726e-06, + "loss": 0.1937, + "step": 44538 + }, + { + "epoch": 3.0040830168204016, + "grad_norm": 4.153116703033447, + "learning_rate": 3.999177744278471e-06, + "loss": 0.1393, + "step": 44539 + }, + { + "epoch": 3.0040965816603364, + "grad_norm": 5.292470455169678, + "learning_rate": 3.999040701658216e-06, + "loss": 0.1956, + "step": 44540 + }, + { + "epoch": 3.0041101465002713, + "grad_norm": 4.935672283172607, + "learning_rate": 3.998903659037961e-06, + "loss": 0.169, + "step": 44541 + }, + { + "epoch": 3.004123711340206, + "grad_norm": 5.123663902282715, + "learning_rate": 3.9987666164177064e-06, + "loss": 0.1992, + "step": 44542 + }, + { + "epoch": 3.004137276180141, + "grad_norm": 3.2455992698669434, + "learning_rate": 3.998629573797452e-06, + "loss": 0.1302, + "step": 44543 + }, + { + "epoch": 3.004150841020076, + "grad_norm": 4.052642822265625, + "learning_rate": 3.998492531177197e-06, + "loss": 0.1521, + "step": 44544 + }, + { + "epoch": 3.0041644058600108, + "grad_norm": 4.617681980133057, + "learning_rate": 3.998355488556941e-06, + "loss": 0.198, + "step": 44545 + }, + { + "epoch": 3.0041779706999456, + "grad_norm": 6.488339900970459, + "learning_rate": 3.998218445936687e-06, + "loss": 0.2935, + "step": 44546 + }, + { + "epoch": 3.0041915355398805, + "grad_norm": 4.288943290710449, + "learning_rate": 3.9980814033164315e-06, + "loss": 0.1668, + "step": 44547 + }, + { + "epoch": 3.0042051003798154, + "grad_norm": 5.253893852233887, + "learning_rate": 3.9979443606961775e-06, + "loss": 0.1859, + "step": 44548 + }, + { + "epoch": 3.0042186652197502, + "grad_norm": 3.99296498298645, + "learning_rate": 3.997807318075922e-06, + "loss": 0.1232, + "step": 44549 + }, + { + "epoch": 3.004232230059685, + "grad_norm": 4.469273567199707, + "learning_rate": 3.997670275455667e-06, + "loss": 0.178, + "step": 44550 + }, + { + "epoch": 3.0042457948996204, + "grad_norm": 3.4103305339813232, + "learning_rate": 3.997533232835412e-06, + "loss": 0.0977, + "step": 44551 + }, + { + "epoch": 3.0042593597395553, + "grad_norm": 4.844604969024658, + "learning_rate": 3.997396190215157e-06, + "loss": 0.1863, + "step": 44552 + }, + { + "epoch": 3.00427292457949, + "grad_norm": 5.1716203689575195, + "learning_rate": 3.9972591475949025e-06, + "loss": 0.1724, + "step": 44553 + }, + { + "epoch": 3.004286489419425, + "grad_norm": 4.449887275695801, + "learning_rate": 3.997122104974647e-06, + "loss": 0.1894, + "step": 44554 + }, + { + "epoch": 3.00430005425936, + "grad_norm": 6.362960338592529, + "learning_rate": 3.996985062354393e-06, + "loss": 0.2352, + "step": 44555 + }, + { + "epoch": 3.0043136190992947, + "grad_norm": 5.088710308074951, + "learning_rate": 3.996848019734137e-06, + "loss": 0.1787, + "step": 44556 + }, + { + "epoch": 3.0043271839392296, + "grad_norm": 3.6138389110565186, + "learning_rate": 3.996710977113883e-06, + "loss": 0.1325, + "step": 44557 + }, + { + "epoch": 3.0043407487791645, + "grad_norm": 3.5694451332092285, + "learning_rate": 3.996573934493628e-06, + "loss": 0.1414, + "step": 44558 + }, + { + "epoch": 3.0043543136190993, + "grad_norm": 4.1498870849609375, + "learning_rate": 3.996436891873373e-06, + "loss": 0.1137, + "step": 44559 + }, + { + "epoch": 3.004367878459034, + "grad_norm": 2.823216438293457, + "learning_rate": 3.996299849253118e-06, + "loss": 0.0961, + "step": 44560 + }, + { + "epoch": 3.004381443298969, + "grad_norm": 4.647781848907471, + "learning_rate": 3.996162806632863e-06, + "loss": 0.1687, + "step": 44561 + }, + { + "epoch": 3.004395008138904, + "grad_norm": 3.732527732849121, + "learning_rate": 3.996025764012608e-06, + "loss": 0.1294, + "step": 44562 + }, + { + "epoch": 3.004408572978839, + "grad_norm": 3.8139874935150146, + "learning_rate": 3.9958887213923535e-06, + "loss": 0.1582, + "step": 44563 + }, + { + "epoch": 3.0044221378187737, + "grad_norm": 4.501244068145752, + "learning_rate": 3.995751678772099e-06, + "loss": 0.1768, + "step": 44564 + }, + { + "epoch": 3.0044357026587085, + "grad_norm": 3.7054944038391113, + "learning_rate": 3.995614636151843e-06, + "loss": 0.1236, + "step": 44565 + }, + { + "epoch": 3.0044492674986434, + "grad_norm": 3.6134002208709717, + "learning_rate": 3.995477593531589e-06, + "loss": 0.1236, + "step": 44566 + }, + { + "epoch": 3.0044628323385782, + "grad_norm": 5.446846961975098, + "learning_rate": 3.995340550911333e-06, + "loss": 0.2379, + "step": 44567 + }, + { + "epoch": 3.004476397178513, + "grad_norm": 4.631953239440918, + "learning_rate": 3.9952035082910785e-06, + "loss": 0.163, + "step": 44568 + }, + { + "epoch": 3.004489962018448, + "grad_norm": 2.8047780990600586, + "learning_rate": 3.995066465670824e-06, + "loss": 0.0512, + "step": 44569 + }, + { + "epoch": 3.0045035268583833, + "grad_norm": 6.154024600982666, + "learning_rate": 3.994929423050569e-06, + "loss": 0.2298, + "step": 44570 + }, + { + "epoch": 3.004517091698318, + "grad_norm": 4.17796516418457, + "learning_rate": 3.994792380430314e-06, + "loss": 0.1492, + "step": 44571 + }, + { + "epoch": 3.004530656538253, + "grad_norm": 3.4692575931549072, + "learning_rate": 3.994655337810059e-06, + "loss": 0.1089, + "step": 44572 + }, + { + "epoch": 3.004544221378188, + "grad_norm": 4.796950340270996, + "learning_rate": 3.9945182951898044e-06, + "loss": 0.1421, + "step": 44573 + }, + { + "epoch": 3.0045577862181228, + "grad_norm": 2.596231698989868, + "learning_rate": 3.99438125256955e-06, + "loss": 0.1097, + "step": 44574 + }, + { + "epoch": 3.0045713510580576, + "grad_norm": 3.4167025089263916, + "learning_rate": 3.994244209949295e-06, + "loss": 0.1702, + "step": 44575 + }, + { + "epoch": 3.0045849158979925, + "grad_norm": 4.710054874420166, + "learning_rate": 3.99410716732904e-06, + "loss": 0.1985, + "step": 44576 + }, + { + "epoch": 3.0045984807379273, + "grad_norm": 3.3250882625579834, + "learning_rate": 3.993970124708785e-06, + "loss": 0.1294, + "step": 44577 + }, + { + "epoch": 3.004612045577862, + "grad_norm": 3.311777114868164, + "learning_rate": 3.9938330820885295e-06, + "loss": 0.1127, + "step": 44578 + }, + { + "epoch": 3.004625610417797, + "grad_norm": 3.424870729446411, + "learning_rate": 3.993696039468275e-06, + "loss": 0.152, + "step": 44579 + }, + { + "epoch": 3.004639175257732, + "grad_norm": 3.651982545852661, + "learning_rate": 3.99355899684802e-06, + "loss": 0.1191, + "step": 44580 + }, + { + "epoch": 3.004652740097667, + "grad_norm": 3.023205041885376, + "learning_rate": 3.993421954227765e-06, + "loss": 0.0961, + "step": 44581 + }, + { + "epoch": 3.0046663049376017, + "grad_norm": 4.458583354949951, + "learning_rate": 3.99328491160751e-06, + "loss": 0.1258, + "step": 44582 + }, + { + "epoch": 3.0046798697775365, + "grad_norm": 4.067983150482178, + "learning_rate": 3.993147868987255e-06, + "loss": 0.1742, + "step": 44583 + }, + { + "epoch": 3.0046934346174714, + "grad_norm": 3.9931869506835938, + "learning_rate": 3.9930108263670006e-06, + "loss": 0.1377, + "step": 44584 + }, + { + "epoch": 3.0047069994574063, + "grad_norm": 2.44985032081604, + "learning_rate": 3.992873783746746e-06, + "loss": 0.0564, + "step": 44585 + }, + { + "epoch": 3.004720564297341, + "grad_norm": 4.470058441162109, + "learning_rate": 3.992736741126491e-06, + "loss": 0.1464, + "step": 44586 + }, + { + "epoch": 3.004734129137276, + "grad_norm": 3.545538902282715, + "learning_rate": 3.992599698506236e-06, + "loss": 0.1124, + "step": 44587 + }, + { + "epoch": 3.004747693977211, + "grad_norm": 3.998185634613037, + "learning_rate": 3.99246265588598e-06, + "loss": 0.1804, + "step": 44588 + }, + { + "epoch": 3.004761258817146, + "grad_norm": 2.6306464672088623, + "learning_rate": 3.9923256132657264e-06, + "loss": 0.0762, + "step": 44589 + }, + { + "epoch": 3.004774823657081, + "grad_norm": 5.768706321716309, + "learning_rate": 3.992188570645471e-06, + "loss": 0.2061, + "step": 44590 + }, + { + "epoch": 3.004788388497016, + "grad_norm": 3.6445472240448, + "learning_rate": 3.992051528025216e-06, + "loss": 0.1114, + "step": 44591 + }, + { + "epoch": 3.0048019533369508, + "grad_norm": 3.2973825931549072, + "learning_rate": 3.991914485404961e-06, + "loss": 0.0956, + "step": 44592 + }, + { + "epoch": 3.0048155181768856, + "grad_norm": 2.680290937423706, + "learning_rate": 3.991777442784706e-06, + "loss": 0.0924, + "step": 44593 + }, + { + "epoch": 3.0048290830168205, + "grad_norm": 3.621140956878662, + "learning_rate": 3.9916404001644515e-06, + "loss": 0.1613, + "step": 44594 + }, + { + "epoch": 3.0048426478567554, + "grad_norm": 6.838925838470459, + "learning_rate": 3.991503357544197e-06, + "loss": 0.2361, + "step": 44595 + }, + { + "epoch": 3.0048562126966902, + "grad_norm": 2.7549962997436523, + "learning_rate": 3.991366314923942e-06, + "loss": 0.1241, + "step": 44596 + }, + { + "epoch": 3.004869777536625, + "grad_norm": 4.174786567687988, + "learning_rate": 3.991229272303687e-06, + "loss": 0.1392, + "step": 44597 + }, + { + "epoch": 3.00488334237656, + "grad_norm": 3.771512508392334, + "learning_rate": 3.991092229683432e-06, + "loss": 0.1395, + "step": 44598 + }, + { + "epoch": 3.004896907216495, + "grad_norm": 3.8778910636901855, + "learning_rate": 3.9909551870631765e-06, + "loss": 0.0804, + "step": 44599 + }, + { + "epoch": 3.0049104720564297, + "grad_norm": 4.522482872009277, + "learning_rate": 3.9908181444429226e-06, + "loss": 0.1723, + "step": 44600 + }, + { + "epoch": 3.0049240368963646, + "grad_norm": 3.3589956760406494, + "learning_rate": 3.990681101822667e-06, + "loss": 0.1152, + "step": 44601 + }, + { + "epoch": 3.0049376017362994, + "grad_norm": 3.144836902618408, + "learning_rate": 3.990544059202412e-06, + "loss": 0.0927, + "step": 44602 + }, + { + "epoch": 3.0049511665762343, + "grad_norm": 2.912604570388794, + "learning_rate": 3.990407016582157e-06, + "loss": 0.1226, + "step": 44603 + }, + { + "epoch": 3.004964731416169, + "grad_norm": 4.817079544067383, + "learning_rate": 3.9902699739619024e-06, + "loss": 0.1632, + "step": 44604 + }, + { + "epoch": 3.004978296256104, + "grad_norm": 3.3988356590270996, + "learning_rate": 3.990132931341648e-06, + "loss": 0.1025, + "step": 44605 + }, + { + "epoch": 3.004991861096039, + "grad_norm": 4.334303855895996, + "learning_rate": 3.989995888721393e-06, + "loss": 0.1618, + "step": 44606 + }, + { + "epoch": 3.0050054259359738, + "grad_norm": 3.2039616107940674, + "learning_rate": 3.989858846101138e-06, + "loss": 0.1003, + "step": 44607 + }, + { + "epoch": 3.005018990775909, + "grad_norm": 4.199541091918945, + "learning_rate": 3.989721803480882e-06, + "loss": 0.1068, + "step": 44608 + }, + { + "epoch": 3.005032555615844, + "grad_norm": 3.1829051971435547, + "learning_rate": 3.989584760860628e-06, + "loss": 0.0921, + "step": 44609 + }, + { + "epoch": 3.005046120455779, + "grad_norm": 4.19944953918457, + "learning_rate": 3.989447718240373e-06, + "loss": 0.1624, + "step": 44610 + }, + { + "epoch": 3.0050596852957137, + "grad_norm": 3.94801926612854, + "learning_rate": 3.989310675620119e-06, + "loss": 0.1726, + "step": 44611 + }, + { + "epoch": 3.0050732501356485, + "grad_norm": 2.426929473876953, + "learning_rate": 3.989173632999863e-06, + "loss": 0.0692, + "step": 44612 + }, + { + "epoch": 3.0050868149755834, + "grad_norm": 5.259657382965088, + "learning_rate": 3.989036590379608e-06, + "loss": 0.1659, + "step": 44613 + }, + { + "epoch": 3.0051003798155183, + "grad_norm": 5.179601669311523, + "learning_rate": 3.988899547759353e-06, + "loss": 0.0971, + "step": 44614 + }, + { + "epoch": 3.005113944655453, + "grad_norm": 3.782087564468384, + "learning_rate": 3.9887625051390986e-06, + "loss": 0.0992, + "step": 44615 + }, + { + "epoch": 3.005127509495388, + "grad_norm": 4.187655448913574, + "learning_rate": 3.988625462518844e-06, + "loss": 0.1447, + "step": 44616 + }, + { + "epoch": 3.005141074335323, + "grad_norm": 3.0253336429595947, + "learning_rate": 3.988488419898589e-06, + "loss": 0.0849, + "step": 44617 + }, + { + "epoch": 3.0051546391752577, + "grad_norm": 3.6523349285125732, + "learning_rate": 3.988351377278334e-06, + "loss": 0.1169, + "step": 44618 + }, + { + "epoch": 3.0051682040151926, + "grad_norm": 3.6040616035461426, + "learning_rate": 3.9882143346580784e-06, + "loss": 0.1265, + "step": 44619 + }, + { + "epoch": 3.0051817688551274, + "grad_norm": 3.9826598167419434, + "learning_rate": 3.9880772920378245e-06, + "loss": 0.1857, + "step": 44620 + }, + { + "epoch": 3.0051953336950623, + "grad_norm": 3.293752908706665, + "learning_rate": 3.987940249417569e-06, + "loss": 0.1046, + "step": 44621 + }, + { + "epoch": 3.005208898534997, + "grad_norm": 3.9185688495635986, + "learning_rate": 3.987803206797314e-06, + "loss": 0.13, + "step": 44622 + }, + { + "epoch": 3.005222463374932, + "grad_norm": 3.637014389038086, + "learning_rate": 3.987666164177059e-06, + "loss": 0.1632, + "step": 44623 + }, + { + "epoch": 3.005236028214867, + "grad_norm": 4.9541425704956055, + "learning_rate": 3.987529121556804e-06, + "loss": 0.1715, + "step": 44624 + }, + { + "epoch": 3.0052495930548018, + "grad_norm": 5.136382579803467, + "learning_rate": 3.9873920789365495e-06, + "loss": 0.1338, + "step": 44625 + }, + { + "epoch": 3.0052631578947366, + "grad_norm": 3.086934804916382, + "learning_rate": 3.987255036316295e-06, + "loss": 0.0986, + "step": 44626 + }, + { + "epoch": 3.005276722734672, + "grad_norm": 3.1148269176483154, + "learning_rate": 3.98711799369604e-06, + "loss": 0.1033, + "step": 44627 + }, + { + "epoch": 3.005290287574607, + "grad_norm": 4.3940815925598145, + "learning_rate": 3.986980951075785e-06, + "loss": 0.1349, + "step": 44628 + }, + { + "epoch": 3.0053038524145417, + "grad_norm": 2.8396620750427246, + "learning_rate": 3.98684390845553e-06, + "loss": 0.1072, + "step": 44629 + }, + { + "epoch": 3.0053174172544765, + "grad_norm": 3.513835906982422, + "learning_rate": 3.986706865835275e-06, + "loss": 0.0948, + "step": 44630 + }, + { + "epoch": 3.0053309820944114, + "grad_norm": 3.1593096256256104, + "learning_rate": 3.9865698232150206e-06, + "loss": 0.0863, + "step": 44631 + }, + { + "epoch": 3.0053445469343463, + "grad_norm": 2.3916282653808594, + "learning_rate": 3.986432780594766e-06, + "loss": 0.0653, + "step": 44632 + }, + { + "epoch": 3.005358111774281, + "grad_norm": 4.17496919631958, + "learning_rate": 3.98629573797451e-06, + "loss": 0.1157, + "step": 44633 + }, + { + "epoch": 3.005371676614216, + "grad_norm": 3.19535231590271, + "learning_rate": 3.986158695354255e-06, + "loss": 0.0623, + "step": 44634 + }, + { + "epoch": 3.005385241454151, + "grad_norm": 3.9403302669525146, + "learning_rate": 3.9860216527340004e-06, + "loss": 0.1002, + "step": 44635 + }, + { + "epoch": 3.0053988062940857, + "grad_norm": 2.9948291778564453, + "learning_rate": 3.985884610113746e-06, + "loss": 0.0519, + "step": 44636 + }, + { + "epoch": 3.0054123711340206, + "grad_norm": 3.993959426879883, + "learning_rate": 3.985747567493491e-06, + "loss": 0.1144, + "step": 44637 + }, + { + "epoch": 3.0054259359739555, + "grad_norm": 3.250253915786743, + "learning_rate": 3.985610524873236e-06, + "loss": 0.1078, + "step": 44638 + }, + { + "epoch": 3.0054395008138903, + "grad_norm": 2.943854331970215, + "learning_rate": 3.985473482252981e-06, + "loss": 0.0686, + "step": 44639 + }, + { + "epoch": 3.005453065653825, + "grad_norm": 2.706772565841675, + "learning_rate": 3.985336439632726e-06, + "loss": 0.0859, + "step": 44640 + }, + { + "epoch": 3.00546663049376, + "grad_norm": 3.2579383850097656, + "learning_rate": 3.9851993970124715e-06, + "loss": 0.1454, + "step": 44641 + }, + { + "epoch": 3.005480195333695, + "grad_norm": 4.150716781616211, + "learning_rate": 3.985062354392216e-06, + "loss": 0.1323, + "step": 44642 + }, + { + "epoch": 3.00549376017363, + "grad_norm": 2.484591245651245, + "learning_rate": 3.984925311771962e-06, + "loss": 0.0609, + "step": 44643 + }, + { + "epoch": 3.0055073250135647, + "grad_norm": 2.9669034481048584, + "learning_rate": 3.984788269151706e-06, + "loss": 0.084, + "step": 44644 + }, + { + "epoch": 3.0055208898534995, + "grad_norm": 4.685936450958252, + "learning_rate": 3.984651226531452e-06, + "loss": 0.1268, + "step": 44645 + }, + { + "epoch": 3.005534454693435, + "grad_norm": 3.1636674404144287, + "learning_rate": 3.9845141839111966e-06, + "loss": 0.0731, + "step": 44646 + }, + { + "epoch": 3.0055480195333697, + "grad_norm": 3.944841146469116, + "learning_rate": 3.984377141290942e-06, + "loss": 0.098, + "step": 44647 + }, + { + "epoch": 3.0055615843733046, + "grad_norm": 3.5959877967834473, + "learning_rate": 3.984240098670687e-06, + "loss": 0.0701, + "step": 44648 + }, + { + "epoch": 3.0055751492132394, + "grad_norm": 3.0758650302886963, + "learning_rate": 3.984103056050432e-06, + "loss": 0.1238, + "step": 44649 + }, + { + "epoch": 3.0055887140531743, + "grad_norm": 3.466306447982788, + "learning_rate": 3.983966013430177e-06, + "loss": 0.0584, + "step": 44650 + }, + { + "epoch": 3.005602278893109, + "grad_norm": 4.784510612487793, + "learning_rate": 3.9838289708099225e-06, + "loss": 0.0987, + "step": 44651 + }, + { + "epoch": 3.005615843733044, + "grad_norm": 3.166652202606201, + "learning_rate": 3.983691928189668e-06, + "loss": 0.0936, + "step": 44652 + }, + { + "epoch": 3.005629408572979, + "grad_norm": 4.239447116851807, + "learning_rate": 3.983554885569412e-06, + "loss": 0.0666, + "step": 44653 + }, + { + "epoch": 3.0056429734129138, + "grad_norm": 3.5349693298339844, + "learning_rate": 3.983417842949158e-06, + "loss": 0.0831, + "step": 44654 + }, + { + "epoch": 3.0056565382528486, + "grad_norm": 2.734492778778076, + "learning_rate": 3.983280800328902e-06, + "loss": 0.0577, + "step": 44655 + }, + { + "epoch": 3.0056701030927835, + "grad_norm": 4.507767677307129, + "learning_rate": 3.9831437577086475e-06, + "loss": 0.0881, + "step": 44656 + }, + { + "epoch": 3.0056836679327184, + "grad_norm": 2.7222976684570312, + "learning_rate": 3.983006715088393e-06, + "loss": 0.0711, + "step": 44657 + }, + { + "epoch": 3.005697232772653, + "grad_norm": 2.492351531982422, + "learning_rate": 3.982869672468138e-06, + "loss": 0.0512, + "step": 44658 + }, + { + "epoch": 3.005710797612588, + "grad_norm": 5.495924949645996, + "learning_rate": 3.982732629847883e-06, + "loss": 0.1255, + "step": 44659 + }, + { + "epoch": 3.005724362452523, + "grad_norm": 4.517545223236084, + "learning_rate": 3.982595587227628e-06, + "loss": 0.1532, + "step": 44660 + }, + { + "epoch": 3.005737927292458, + "grad_norm": 3.6561272144317627, + "learning_rate": 3.982458544607373e-06, + "loss": 0.1188, + "step": 44661 + }, + { + "epoch": 3.0057514921323927, + "grad_norm": 3.536975383758545, + "learning_rate": 3.982321501987118e-06, + "loss": 0.0905, + "step": 44662 + }, + { + "epoch": 3.0057650569723275, + "grad_norm": 4.078496932983398, + "learning_rate": 3.982184459366864e-06, + "loss": 0.0638, + "step": 44663 + }, + { + "epoch": 3.0057786218122624, + "grad_norm": 4.2644805908203125, + "learning_rate": 3.982047416746608e-06, + "loss": 0.1049, + "step": 44664 + }, + { + "epoch": 3.0057921866521977, + "grad_norm": 3.0176193714141846, + "learning_rate": 3.981910374126354e-06, + "loss": 0.0691, + "step": 44665 + }, + { + "epoch": 3.0058057514921326, + "grad_norm": 4.384012222290039, + "learning_rate": 3.9817733315060984e-06, + "loss": 0.1463, + "step": 44666 + }, + { + "epoch": 3.0058193163320674, + "grad_norm": 4.364374160766602, + "learning_rate": 3.981636288885844e-06, + "loss": 0.1064, + "step": 44667 + }, + { + "epoch": 3.0058328811720023, + "grad_norm": 3.196946144104004, + "learning_rate": 3.981499246265589e-06, + "loss": 0.0624, + "step": 44668 + }, + { + "epoch": 3.005846446011937, + "grad_norm": 3.2988102436065674, + "learning_rate": 3.981362203645334e-06, + "loss": 0.0689, + "step": 44669 + }, + { + "epoch": 3.005860010851872, + "grad_norm": 6.5298566818237305, + "learning_rate": 3.981225161025079e-06, + "loss": 0.1245, + "step": 44670 + }, + { + "epoch": 3.005873575691807, + "grad_norm": 4.578603267669678, + "learning_rate": 3.981088118404824e-06, + "loss": 0.0977, + "step": 44671 + }, + { + "epoch": 3.0058871405317418, + "grad_norm": 4.017924785614014, + "learning_rate": 3.9809510757845695e-06, + "loss": 0.1113, + "step": 44672 + }, + { + "epoch": 3.0059007053716766, + "grad_norm": 3.1324288845062256, + "learning_rate": 3.980814033164315e-06, + "loss": 0.0669, + "step": 44673 + }, + { + "epoch": 3.0059142702116115, + "grad_norm": 3.0268545150756836, + "learning_rate": 3.98067699054406e-06, + "loss": 0.0732, + "step": 44674 + }, + { + "epoch": 3.0059278350515464, + "grad_norm": 3.3629543781280518, + "learning_rate": 3.980539947923804e-06, + "loss": 0.1255, + "step": 44675 + }, + { + "epoch": 3.0059413998914812, + "grad_norm": 4.08336877822876, + "learning_rate": 3.980402905303549e-06, + "loss": 0.0975, + "step": 44676 + }, + { + "epoch": 3.005954964731416, + "grad_norm": 3.6901228427886963, + "learning_rate": 3.9802658626832946e-06, + "loss": 0.1677, + "step": 44677 + }, + { + "epoch": 3.005968529571351, + "grad_norm": 3.7645819187164307, + "learning_rate": 3.98012882006304e-06, + "loss": 0.1085, + "step": 44678 + }, + { + "epoch": 3.005982094411286, + "grad_norm": 3.8090455532073975, + "learning_rate": 3.979991777442785e-06, + "loss": 0.1442, + "step": 44679 + }, + { + "epoch": 3.0059956592512207, + "grad_norm": 3.850348711013794, + "learning_rate": 3.97985473482253e-06, + "loss": 0.0948, + "step": 44680 + }, + { + "epoch": 3.0060092240911556, + "grad_norm": 5.342382907867432, + "learning_rate": 3.979717692202275e-06, + "loss": 0.1645, + "step": 44681 + }, + { + "epoch": 3.0060227889310904, + "grad_norm": 6.293399810791016, + "learning_rate": 3.9795806495820205e-06, + "loss": 0.1409, + "step": 44682 + }, + { + "epoch": 3.0060363537710253, + "grad_norm": 4.325598239898682, + "learning_rate": 3.979443606961766e-06, + "loss": 0.0821, + "step": 44683 + }, + { + "epoch": 3.0060499186109606, + "grad_norm": 3.0012152194976807, + "learning_rate": 3.979306564341511e-06, + "loss": 0.0884, + "step": 44684 + }, + { + "epoch": 3.0060634834508955, + "grad_norm": 4.306652069091797, + "learning_rate": 3.979169521721256e-06, + "loss": 0.1789, + "step": 44685 + }, + { + "epoch": 3.0060770482908303, + "grad_norm": 5.258504867553711, + "learning_rate": 3.979032479101001e-06, + "loss": 0.2365, + "step": 44686 + }, + { + "epoch": 3.006090613130765, + "grad_norm": 3.9679338932037354, + "learning_rate": 3.9788954364807455e-06, + "loss": 0.0742, + "step": 44687 + }, + { + "epoch": 3.0061041779707, + "grad_norm": 4.143104553222656, + "learning_rate": 3.978758393860491e-06, + "loss": 0.1556, + "step": 44688 + }, + { + "epoch": 3.006117742810635, + "grad_norm": 5.281323432922363, + "learning_rate": 3.978621351240236e-06, + "loss": 0.1231, + "step": 44689 + }, + { + "epoch": 3.00613130765057, + "grad_norm": 4.899589538574219, + "learning_rate": 3.978484308619981e-06, + "loss": 0.1578, + "step": 44690 + }, + { + "epoch": 3.0061448724905047, + "grad_norm": 3.760906457901001, + "learning_rate": 3.978347265999726e-06, + "loss": 0.1498, + "step": 44691 + }, + { + "epoch": 3.0061584373304395, + "grad_norm": 4.903976917266846, + "learning_rate": 3.978210223379471e-06, + "loss": 0.0857, + "step": 44692 + }, + { + "epoch": 3.0061720021703744, + "grad_norm": 3.285961151123047, + "learning_rate": 3.978073180759217e-06, + "loss": 0.0809, + "step": 44693 + }, + { + "epoch": 3.0061855670103093, + "grad_norm": 3.6519665718078613, + "learning_rate": 3.977936138138962e-06, + "loss": 0.1252, + "step": 44694 + }, + { + "epoch": 3.006199131850244, + "grad_norm": 4.5755486488342285, + "learning_rate": 3.977799095518707e-06, + "loss": 0.0888, + "step": 44695 + }, + { + "epoch": 3.006212696690179, + "grad_norm": 4.042819499969482, + "learning_rate": 3.977662052898451e-06, + "loss": 0.0927, + "step": 44696 + }, + { + "epoch": 3.006226261530114, + "grad_norm": 4.003323078155518, + "learning_rate": 3.977525010278197e-06, + "loss": 0.1376, + "step": 44697 + }, + { + "epoch": 3.0062398263700487, + "grad_norm": 4.4095988273620605, + "learning_rate": 3.977387967657942e-06, + "loss": 0.2262, + "step": 44698 + }, + { + "epoch": 3.0062533912099836, + "grad_norm": 6.182307720184326, + "learning_rate": 3.977250925037688e-06, + "loss": 0.1704, + "step": 44699 + }, + { + "epoch": 3.0062669560499184, + "grad_norm": 3.7414116859436035, + "learning_rate": 3.977113882417432e-06, + "loss": 0.1427, + "step": 44700 + }, + { + "epoch": 3.0062805208898533, + "grad_norm": 2.5632359981536865, + "learning_rate": 3.976976839797177e-06, + "loss": 0.0569, + "step": 44701 + }, + { + "epoch": 3.006294085729788, + "grad_norm": 4.242898941040039, + "learning_rate": 3.976839797176922e-06, + "loss": 0.107, + "step": 44702 + }, + { + "epoch": 3.0063076505697235, + "grad_norm": 4.583520412445068, + "learning_rate": 3.9767027545566675e-06, + "loss": 0.1311, + "step": 44703 + }, + { + "epoch": 3.0063212154096584, + "grad_norm": 5.615490436553955, + "learning_rate": 3.976565711936413e-06, + "loss": 0.1703, + "step": 44704 + }, + { + "epoch": 3.006334780249593, + "grad_norm": 7.289864540100098, + "learning_rate": 3.976428669316157e-06, + "loss": 0.2432, + "step": 44705 + }, + { + "epoch": 3.006348345089528, + "grad_norm": 4.292085647583008, + "learning_rate": 3.976291626695903e-06, + "loss": 0.1799, + "step": 44706 + }, + { + "epoch": 3.006361909929463, + "grad_norm": 5.154088020324707, + "learning_rate": 3.976154584075647e-06, + "loss": 0.1297, + "step": 44707 + }, + { + "epoch": 3.006375474769398, + "grad_norm": 2.9673168659210205, + "learning_rate": 3.976017541455393e-06, + "loss": 0.0723, + "step": 44708 + }, + { + "epoch": 3.0063890396093327, + "grad_norm": 3.9298813343048096, + "learning_rate": 3.975880498835138e-06, + "loss": 0.0624, + "step": 44709 + }, + { + "epoch": 3.0064026044492675, + "grad_norm": 4.1998724937438965, + "learning_rate": 3.975743456214883e-06, + "loss": 0.1022, + "step": 44710 + }, + { + "epoch": 3.0064161692892024, + "grad_norm": 3.5657989978790283, + "learning_rate": 3.975606413594628e-06, + "loss": 0.1701, + "step": 44711 + }, + { + "epoch": 3.0064297341291373, + "grad_norm": 5.759223461151123, + "learning_rate": 3.975469370974373e-06, + "loss": 0.1752, + "step": 44712 + }, + { + "epoch": 3.006443298969072, + "grad_norm": 2.4239842891693115, + "learning_rate": 3.9753323283541185e-06, + "loss": 0.0506, + "step": 44713 + }, + { + "epoch": 3.006456863809007, + "grad_norm": 3.6676039695739746, + "learning_rate": 3.975195285733864e-06, + "loss": 0.0804, + "step": 44714 + }, + { + "epoch": 3.006470428648942, + "grad_norm": 5.362635612487793, + "learning_rate": 3.975058243113609e-06, + "loss": 0.2238, + "step": 44715 + }, + { + "epoch": 3.0064839934888767, + "grad_norm": 3.8730084896087646, + "learning_rate": 3.974921200493353e-06, + "loss": 0.1871, + "step": 44716 + }, + { + "epoch": 3.0064975583288116, + "grad_norm": 4.626234531402588, + "learning_rate": 3.974784157873099e-06, + "loss": 0.1718, + "step": 44717 + }, + { + "epoch": 3.0065111231687465, + "grad_norm": 4.851150035858154, + "learning_rate": 3.9746471152528435e-06, + "loss": 0.2033, + "step": 44718 + }, + { + "epoch": 3.0065246880086813, + "grad_norm": 4.631901741027832, + "learning_rate": 3.9745100726325895e-06, + "loss": 0.1657, + "step": 44719 + }, + { + "epoch": 3.006538252848616, + "grad_norm": 5.457311630249023, + "learning_rate": 3.974373030012334e-06, + "loss": 0.1956, + "step": 44720 + }, + { + "epoch": 3.006551817688551, + "grad_norm": 4.362924098968506, + "learning_rate": 3.974235987392079e-06, + "loss": 0.1391, + "step": 44721 + }, + { + "epoch": 3.0065653825284864, + "grad_norm": 4.051392555236816, + "learning_rate": 3.974098944771824e-06, + "loss": 0.1842, + "step": 44722 + }, + { + "epoch": 3.0065789473684212, + "grad_norm": 6.3761138916015625, + "learning_rate": 3.973961902151569e-06, + "loss": 0.0991, + "step": 44723 + }, + { + "epoch": 3.006592512208356, + "grad_norm": 4.088531970977783, + "learning_rate": 3.973824859531315e-06, + "loss": 0.0832, + "step": 44724 + }, + { + "epoch": 3.006606077048291, + "grad_norm": 4.654087543487549, + "learning_rate": 3.97368781691106e-06, + "loss": 0.1304, + "step": 44725 + }, + { + "epoch": 3.006619641888226, + "grad_norm": 2.822220802307129, + "learning_rate": 3.973550774290805e-06, + "loss": 0.0825, + "step": 44726 + }, + { + "epoch": 3.0066332067281607, + "grad_norm": 4.277346611022949, + "learning_rate": 3.97341373167055e-06, + "loss": 0.1691, + "step": 44727 + }, + { + "epoch": 3.0066467715680956, + "grad_norm": 5.989396572113037, + "learning_rate": 3.973276689050295e-06, + "loss": 0.1847, + "step": 44728 + }, + { + "epoch": 3.0066603364080304, + "grad_norm": 5.621981620788574, + "learning_rate": 3.97313964643004e-06, + "loss": 0.1529, + "step": 44729 + }, + { + "epoch": 3.0066739012479653, + "grad_norm": 5.966047286987305, + "learning_rate": 3.973002603809785e-06, + "loss": 0.1993, + "step": 44730 + }, + { + "epoch": 3.0066874660879, + "grad_norm": 3.6019222736358643, + "learning_rate": 3.97286556118953e-06, + "loss": 0.1265, + "step": 44731 + }, + { + "epoch": 3.006701030927835, + "grad_norm": 2.879703998565674, + "learning_rate": 3.972728518569275e-06, + "loss": 0.0778, + "step": 44732 + }, + { + "epoch": 3.00671459576777, + "grad_norm": 4.133113384246826, + "learning_rate": 3.97259147594902e-06, + "loss": 0.1175, + "step": 44733 + }, + { + "epoch": 3.0067281606077048, + "grad_norm": 5.093698501586914, + "learning_rate": 3.9724544333287655e-06, + "loss": 0.1389, + "step": 44734 + }, + { + "epoch": 3.0067417254476396, + "grad_norm": 4.903061866760254, + "learning_rate": 3.972317390708511e-06, + "loss": 0.0973, + "step": 44735 + }, + { + "epoch": 3.0067552902875745, + "grad_norm": 3.6389713287353516, + "learning_rate": 3.972180348088256e-06, + "loss": 0.0833, + "step": 44736 + }, + { + "epoch": 3.0067688551275094, + "grad_norm": 3.752150535583496, + "learning_rate": 3.972043305468001e-06, + "loss": 0.0645, + "step": 44737 + }, + { + "epoch": 3.006782419967444, + "grad_norm": 4.699113845825195, + "learning_rate": 3.971906262847746e-06, + "loss": 0.1024, + "step": 44738 + }, + { + "epoch": 3.006795984807379, + "grad_norm": 4.3823628425598145, + "learning_rate": 3.971769220227491e-06, + "loss": 0.132, + "step": 44739 + }, + { + "epoch": 3.006809549647314, + "grad_norm": 4.319177150726318, + "learning_rate": 3.971632177607237e-06, + "loss": 0.1088, + "step": 44740 + }, + { + "epoch": 3.0068231144872493, + "grad_norm": 3.2695817947387695, + "learning_rate": 3.971495134986981e-06, + "loss": 0.0942, + "step": 44741 + }, + { + "epoch": 3.006836679327184, + "grad_norm": 3.0225725173950195, + "learning_rate": 3.971358092366727e-06, + "loss": 0.0751, + "step": 44742 + }, + { + "epoch": 3.006850244167119, + "grad_norm": 3.562814950942993, + "learning_rate": 3.971221049746471e-06, + "loss": 0.1161, + "step": 44743 + }, + { + "epoch": 3.006863809007054, + "grad_norm": 4.147306442260742, + "learning_rate": 3.9710840071262165e-06, + "loss": 0.1637, + "step": 44744 + }, + { + "epoch": 3.0068773738469887, + "grad_norm": 2.7604751586914062, + "learning_rate": 3.970946964505962e-06, + "loss": 0.0483, + "step": 44745 + }, + { + "epoch": 3.0068909386869236, + "grad_norm": 4.538549423217773, + "learning_rate": 3.970809921885707e-06, + "loss": 0.0911, + "step": 44746 + }, + { + "epoch": 3.0069045035268585, + "grad_norm": 3.3950328826904297, + "learning_rate": 3.970672879265452e-06, + "loss": 0.0653, + "step": 44747 + }, + { + "epoch": 3.0069180683667933, + "grad_norm": 4.822518825531006, + "learning_rate": 3.970535836645197e-06, + "loss": 0.1386, + "step": 44748 + }, + { + "epoch": 3.006931633206728, + "grad_norm": 4.645281791687012, + "learning_rate": 3.970398794024942e-06, + "loss": 0.1253, + "step": 44749 + }, + { + "epoch": 3.006945198046663, + "grad_norm": 5.262773513793945, + "learning_rate": 3.970261751404687e-06, + "loss": 0.0878, + "step": 44750 + }, + { + "epoch": 3.006958762886598, + "grad_norm": 5.101646900177002, + "learning_rate": 3.970124708784433e-06, + "loss": 0.1464, + "step": 44751 + }, + { + "epoch": 3.006972327726533, + "grad_norm": 15.961616516113281, + "learning_rate": 3.969987666164177e-06, + "loss": 0.1183, + "step": 44752 + }, + { + "epoch": 3.0069858925664676, + "grad_norm": 5.217587471008301, + "learning_rate": 3.969850623543923e-06, + "loss": 0.1316, + "step": 44753 + }, + { + "epoch": 3.0069994574064025, + "grad_norm": 5.113339900970459, + "learning_rate": 3.969713580923667e-06, + "loss": 0.152, + "step": 44754 + }, + { + "epoch": 3.0070130222463374, + "grad_norm": 4.375277996063232, + "learning_rate": 3.969576538303413e-06, + "loss": 0.0997, + "step": 44755 + }, + { + "epoch": 3.0070265870862722, + "grad_norm": 4.618370532989502, + "learning_rate": 3.969439495683158e-06, + "loss": 0.0902, + "step": 44756 + }, + { + "epoch": 3.007040151926207, + "grad_norm": 5.544739246368408, + "learning_rate": 3.969302453062903e-06, + "loss": 0.1299, + "step": 44757 + }, + { + "epoch": 3.007053716766142, + "grad_norm": 5.3363566398620605, + "learning_rate": 3.969165410442648e-06, + "loss": 0.2292, + "step": 44758 + }, + { + "epoch": 3.007067281606077, + "grad_norm": 2.640475034713745, + "learning_rate": 3.9690283678223925e-06, + "loss": 0.0711, + "step": 44759 + }, + { + "epoch": 3.007080846446012, + "grad_norm": 3.2069523334503174, + "learning_rate": 3.9688913252021385e-06, + "loss": 0.0604, + "step": 44760 + }, + { + "epoch": 3.007094411285947, + "grad_norm": 3.4107956886291504, + "learning_rate": 3.968754282581883e-06, + "loss": 0.104, + "step": 44761 + }, + { + "epoch": 3.007107976125882, + "grad_norm": 2.832456111907959, + "learning_rate": 3.968617239961629e-06, + "loss": 0.1074, + "step": 44762 + }, + { + "epoch": 3.0071215409658167, + "grad_norm": 4.483641147613525, + "learning_rate": 3.968480197341373e-06, + "loss": 0.0992, + "step": 44763 + }, + { + "epoch": 3.0071351058057516, + "grad_norm": 3.459174871444702, + "learning_rate": 3.968343154721118e-06, + "loss": 0.0989, + "step": 44764 + }, + { + "epoch": 3.0071486706456865, + "grad_norm": 4.012729167938232, + "learning_rate": 3.9682061121008635e-06, + "loss": 0.0806, + "step": 44765 + }, + { + "epoch": 3.0071622354856213, + "grad_norm": 4.569979190826416, + "learning_rate": 3.968069069480609e-06, + "loss": 0.0921, + "step": 44766 + }, + { + "epoch": 3.007175800325556, + "grad_norm": 3.1823818683624268, + "learning_rate": 3.967932026860354e-06, + "loss": 0.112, + "step": 44767 + }, + { + "epoch": 3.007189365165491, + "grad_norm": 4.4359869956970215, + "learning_rate": 3.967794984240099e-06, + "loss": 0.1085, + "step": 44768 + }, + { + "epoch": 3.007202930005426, + "grad_norm": 3.116943597793579, + "learning_rate": 3.967657941619844e-06, + "loss": 0.0489, + "step": 44769 + }, + { + "epoch": 3.007216494845361, + "grad_norm": 3.6660354137420654, + "learning_rate": 3.967520898999589e-06, + "loss": 0.0735, + "step": 44770 + }, + { + "epoch": 3.0072300596852957, + "grad_norm": 4.4208292961120605, + "learning_rate": 3.967383856379335e-06, + "loss": 0.1159, + "step": 44771 + }, + { + "epoch": 3.0072436245252305, + "grad_norm": 3.535008192062378, + "learning_rate": 3.967246813759079e-06, + "loss": 0.1046, + "step": 44772 + }, + { + "epoch": 3.0072571893651654, + "grad_norm": 3.608023166656494, + "learning_rate": 3.967109771138825e-06, + "loss": 0.1058, + "step": 44773 + }, + { + "epoch": 3.0072707542051003, + "grad_norm": 3.744624376296997, + "learning_rate": 3.966972728518569e-06, + "loss": 0.1172, + "step": 44774 + }, + { + "epoch": 3.007284319045035, + "grad_norm": 4.029482841491699, + "learning_rate": 3.9668356858983145e-06, + "loss": 0.0751, + "step": 44775 + }, + { + "epoch": 3.00729788388497, + "grad_norm": 3.1811907291412354, + "learning_rate": 3.96669864327806e-06, + "loss": 0.1369, + "step": 44776 + }, + { + "epoch": 3.007311448724905, + "grad_norm": 3.4181180000305176, + "learning_rate": 3.966561600657805e-06, + "loss": 0.1231, + "step": 44777 + }, + { + "epoch": 3.0073250135648397, + "grad_norm": 4.868630409240723, + "learning_rate": 3.96642455803755e-06, + "loss": 0.1487, + "step": 44778 + }, + { + "epoch": 3.007338578404775, + "grad_norm": 3.965108871459961, + "learning_rate": 3.966287515417295e-06, + "loss": 0.1188, + "step": 44779 + }, + { + "epoch": 3.00735214324471, + "grad_norm": 3.1405093669891357, + "learning_rate": 3.96615047279704e-06, + "loss": 0.0867, + "step": 44780 + }, + { + "epoch": 3.0073657080846448, + "grad_norm": 2.572833299636841, + "learning_rate": 3.9660134301767855e-06, + "loss": 0.0708, + "step": 44781 + }, + { + "epoch": 3.0073792729245796, + "grad_norm": 2.809307813644409, + "learning_rate": 3.965876387556531e-06, + "loss": 0.0493, + "step": 44782 + }, + { + "epoch": 3.0073928377645145, + "grad_norm": 5.961618900299072, + "learning_rate": 3.965739344936276e-06, + "loss": 0.1568, + "step": 44783 + }, + { + "epoch": 3.0074064026044494, + "grad_norm": 3.3962011337280273, + "learning_rate": 3.96560230231602e-06, + "loss": 0.0895, + "step": 44784 + }, + { + "epoch": 3.0074199674443842, + "grad_norm": 5.11834192276001, + "learning_rate": 3.965465259695765e-06, + "loss": 0.1616, + "step": 44785 + }, + { + "epoch": 3.007433532284319, + "grad_norm": 4.853280544281006, + "learning_rate": 3.965328217075511e-06, + "loss": 0.1001, + "step": 44786 + }, + { + "epoch": 3.007447097124254, + "grad_norm": 3.233287811279297, + "learning_rate": 3.965191174455256e-06, + "loss": 0.1419, + "step": 44787 + }, + { + "epoch": 3.007460661964189, + "grad_norm": 3.872131109237671, + "learning_rate": 3.965054131835001e-06, + "loss": 0.1276, + "step": 44788 + }, + { + "epoch": 3.0074742268041237, + "grad_norm": 4.357054233551025, + "learning_rate": 3.964917089214746e-06, + "loss": 0.0954, + "step": 44789 + }, + { + "epoch": 3.0074877916440586, + "grad_norm": 4.7015767097473145, + "learning_rate": 3.964780046594491e-06, + "loss": 0.1302, + "step": 44790 + }, + { + "epoch": 3.0075013564839934, + "grad_norm": 3.7564170360565186, + "learning_rate": 3.9646430039742365e-06, + "loss": 0.1027, + "step": 44791 + }, + { + "epoch": 3.0075149213239283, + "grad_norm": 4.841521263122559, + "learning_rate": 3.964505961353982e-06, + "loss": 0.138, + "step": 44792 + }, + { + "epoch": 3.007528486163863, + "grad_norm": 5.732316017150879, + "learning_rate": 3.964368918733727e-06, + "loss": 0.1425, + "step": 44793 + }, + { + "epoch": 3.007542051003798, + "grad_norm": 3.45111346244812, + "learning_rate": 3.964231876113472e-06, + "loss": 0.1111, + "step": 44794 + }, + { + "epoch": 3.007555615843733, + "grad_norm": 4.539482593536377, + "learning_rate": 3.964094833493216e-06, + "loss": 0.1496, + "step": 44795 + }, + { + "epoch": 3.0075691806836677, + "grad_norm": 6.380039691925049, + "learning_rate": 3.963957790872962e-06, + "loss": 0.2034, + "step": 44796 + }, + { + "epoch": 3.0075827455236026, + "grad_norm": 3.147695779800415, + "learning_rate": 3.963820748252707e-06, + "loss": 0.1529, + "step": 44797 + }, + { + "epoch": 3.007596310363538, + "grad_norm": 4.0118408203125, + "learning_rate": 3.963683705632452e-06, + "loss": 0.1965, + "step": 44798 + }, + { + "epoch": 3.007609875203473, + "grad_norm": 4.968703746795654, + "learning_rate": 3.963546663012197e-06, + "loss": 0.0817, + "step": 44799 + }, + { + "epoch": 3.0076234400434076, + "grad_norm": 4.484930038452148, + "learning_rate": 3.963409620391942e-06, + "loss": 0.1405, + "step": 44800 + }, + { + "epoch": 3.0076370048833425, + "grad_norm": 3.507866144180298, + "learning_rate": 3.9632725777716874e-06, + "loss": 0.0719, + "step": 44801 + }, + { + "epoch": 3.0076505697232774, + "grad_norm": 3.487475633621216, + "learning_rate": 3.963135535151433e-06, + "loss": 0.0701, + "step": 44802 + }, + { + "epoch": 3.0076641345632122, + "grad_norm": 4.360328197479248, + "learning_rate": 3.962998492531178e-06, + "loss": 0.1316, + "step": 44803 + }, + { + "epoch": 3.007677699403147, + "grad_norm": 4.297004222869873, + "learning_rate": 3.962861449910922e-06, + "loss": 0.1344, + "step": 44804 + }, + { + "epoch": 3.007691264243082, + "grad_norm": 3.2391140460968018, + "learning_rate": 3.962724407290668e-06, + "loss": 0.0698, + "step": 44805 + }, + { + "epoch": 3.007704829083017, + "grad_norm": 2.9816253185272217, + "learning_rate": 3.9625873646704125e-06, + "loss": 0.1083, + "step": 44806 + }, + { + "epoch": 3.0077183939229517, + "grad_norm": 4.600029945373535, + "learning_rate": 3.9624503220501585e-06, + "loss": 0.1464, + "step": 44807 + }, + { + "epoch": 3.0077319587628866, + "grad_norm": 3.417113780975342, + "learning_rate": 3.962313279429903e-06, + "loss": 0.0824, + "step": 44808 + }, + { + "epoch": 3.0077455236028214, + "grad_norm": 3.3326215744018555, + "learning_rate": 3.962176236809648e-06, + "loss": 0.0683, + "step": 44809 + }, + { + "epoch": 3.0077590884427563, + "grad_norm": 2.615253210067749, + "learning_rate": 3.962039194189393e-06, + "loss": 0.0537, + "step": 44810 + }, + { + "epoch": 3.007772653282691, + "grad_norm": 4.335676193237305, + "learning_rate": 3.961902151569138e-06, + "loss": 0.0861, + "step": 44811 + }, + { + "epoch": 3.007786218122626, + "grad_norm": 4.338850021362305, + "learning_rate": 3.9617651089488836e-06, + "loss": 0.1217, + "step": 44812 + }, + { + "epoch": 3.007799782962561, + "grad_norm": 5.343811511993408, + "learning_rate": 3.961628066328628e-06, + "loss": 0.1715, + "step": 44813 + }, + { + "epoch": 3.0078133478024958, + "grad_norm": 4.514191627502441, + "learning_rate": 3.961491023708374e-06, + "loss": 0.1333, + "step": 44814 + }, + { + "epoch": 3.0078269126424306, + "grad_norm": 5.014763355255127, + "learning_rate": 3.961353981088118e-06, + "loss": 0.2261, + "step": 44815 + }, + { + "epoch": 3.0078404774823655, + "grad_norm": 3.5940372943878174, + "learning_rate": 3.961216938467864e-06, + "loss": 0.13, + "step": 44816 + }, + { + "epoch": 3.007854042322301, + "grad_norm": 3.572998285293579, + "learning_rate": 3.961079895847609e-06, + "loss": 0.0764, + "step": 44817 + }, + { + "epoch": 3.0078676071622357, + "grad_norm": 4.027010440826416, + "learning_rate": 3.960942853227354e-06, + "loss": 0.1555, + "step": 44818 + }, + { + "epoch": 3.0078811720021705, + "grad_norm": 4.264401912689209, + "learning_rate": 3.960805810607099e-06, + "loss": 0.1216, + "step": 44819 + }, + { + "epoch": 3.0078947368421054, + "grad_norm": 3.1041595935821533, + "learning_rate": 3.960668767986844e-06, + "loss": 0.0854, + "step": 44820 + }, + { + "epoch": 3.0079083016820403, + "grad_norm": 4.981749057769775, + "learning_rate": 3.960531725366589e-06, + "loss": 0.1005, + "step": 44821 + }, + { + "epoch": 3.007921866521975, + "grad_norm": 3.5394179821014404, + "learning_rate": 3.9603946827463345e-06, + "loss": 0.0912, + "step": 44822 + }, + { + "epoch": 3.00793543136191, + "grad_norm": 4.3049540519714355, + "learning_rate": 3.96025764012608e-06, + "loss": 0.1737, + "step": 44823 + }, + { + "epoch": 3.007948996201845, + "grad_norm": 2.5705959796905518, + "learning_rate": 3.960120597505825e-06, + "loss": 0.1022, + "step": 44824 + }, + { + "epoch": 3.0079625610417797, + "grad_norm": 2.9617443084716797, + "learning_rate": 3.95998355488557e-06, + "loss": 0.0937, + "step": 44825 + }, + { + "epoch": 3.0079761258817146, + "grad_norm": 4.953430652618408, + "learning_rate": 3.959846512265314e-06, + "loss": 0.1442, + "step": 44826 + }, + { + "epoch": 3.0079896907216495, + "grad_norm": 5.489286422729492, + "learning_rate": 3.95970946964506e-06, + "loss": 0.0904, + "step": 44827 + }, + { + "epoch": 3.0080032555615843, + "grad_norm": 6.23494815826416, + "learning_rate": 3.959572427024805e-06, + "loss": 0.1834, + "step": 44828 + }, + { + "epoch": 3.008016820401519, + "grad_norm": 4.6486616134643555, + "learning_rate": 3.95943538440455e-06, + "loss": 0.1215, + "step": 44829 + }, + { + "epoch": 3.008030385241454, + "grad_norm": 3.7995352745056152, + "learning_rate": 3.959298341784295e-06, + "loss": 0.1647, + "step": 44830 + }, + { + "epoch": 3.008043950081389, + "grad_norm": 4.913503646850586, + "learning_rate": 3.95916129916404e-06, + "loss": 0.1963, + "step": 44831 + }, + { + "epoch": 3.008057514921324, + "grad_norm": 5.404219150543213, + "learning_rate": 3.9590242565437854e-06, + "loss": 0.1022, + "step": 44832 + }, + { + "epoch": 3.0080710797612586, + "grad_norm": 4.078096866607666, + "learning_rate": 3.958887213923531e-06, + "loss": 0.1538, + "step": 44833 + }, + { + "epoch": 3.0080846446011935, + "grad_norm": 3.6673781871795654, + "learning_rate": 3.958750171303276e-06, + "loss": 0.1677, + "step": 44834 + }, + { + "epoch": 3.0080982094411284, + "grad_norm": 5.862400054931641, + "learning_rate": 3.958613128683021e-06, + "loss": 0.1415, + "step": 44835 + }, + { + "epoch": 3.0081117742810637, + "grad_norm": 4.813644886016846, + "learning_rate": 3.958476086062766e-06, + "loss": 0.0968, + "step": 44836 + }, + { + "epoch": 3.0081253391209986, + "grad_norm": 4.336082458496094, + "learning_rate": 3.958339043442511e-06, + "loss": 0.0987, + "step": 44837 + }, + { + "epoch": 3.0081389039609334, + "grad_norm": 4.538695812225342, + "learning_rate": 3.958202000822256e-06, + "loss": 0.131, + "step": 44838 + }, + { + "epoch": 3.0081524688008683, + "grad_norm": 5.01521635055542, + "learning_rate": 3.958064958202001e-06, + "loss": 0.121, + "step": 44839 + }, + { + "epoch": 3.008166033640803, + "grad_norm": 3.6370954513549805, + "learning_rate": 3.957927915581746e-06, + "loss": 0.0984, + "step": 44840 + }, + { + "epoch": 3.008179598480738, + "grad_norm": 3.9639644622802734, + "learning_rate": 3.957790872961491e-06, + "loss": 0.1328, + "step": 44841 + }, + { + "epoch": 3.008193163320673, + "grad_norm": 4.295654773712158, + "learning_rate": 3.957653830341236e-06, + "loss": 0.1346, + "step": 44842 + }, + { + "epoch": 3.0082067281606077, + "grad_norm": 3.5952043533325195, + "learning_rate": 3.9575167877209816e-06, + "loss": 0.0961, + "step": 44843 + }, + { + "epoch": 3.0082202930005426, + "grad_norm": 4.299055576324463, + "learning_rate": 3.957379745100727e-06, + "loss": 0.1292, + "step": 44844 + }, + { + "epoch": 3.0082338578404775, + "grad_norm": 3.008491039276123, + "learning_rate": 3.957242702480472e-06, + "loss": 0.0667, + "step": 44845 + }, + { + "epoch": 3.0082474226804123, + "grad_norm": 3.492305278778076, + "learning_rate": 3.957105659860217e-06, + "loss": 0.1045, + "step": 44846 + }, + { + "epoch": 3.008260987520347, + "grad_norm": 3.8358750343322754, + "learning_rate": 3.9569686172399614e-06, + "loss": 0.135, + "step": 44847 + }, + { + "epoch": 3.008274552360282, + "grad_norm": 3.253919839859009, + "learning_rate": 3.9568315746197075e-06, + "loss": 0.092, + "step": 44848 + }, + { + "epoch": 3.008288117200217, + "grad_norm": 4.651244640350342, + "learning_rate": 3.956694531999452e-06, + "loss": 0.1791, + "step": 44849 + }, + { + "epoch": 3.008301682040152, + "grad_norm": 5.107532978057861, + "learning_rate": 3.956557489379198e-06, + "loss": 0.1492, + "step": 44850 + }, + { + "epoch": 3.0083152468800867, + "grad_norm": 3.3955671787261963, + "learning_rate": 3.956420446758942e-06, + "loss": 0.1091, + "step": 44851 + }, + { + "epoch": 3.0083288117200215, + "grad_norm": 5.654476642608643, + "learning_rate": 3.956283404138687e-06, + "loss": 0.1696, + "step": 44852 + }, + { + "epoch": 3.0083423765599564, + "grad_norm": 4.777482032775879, + "learning_rate": 3.9561463615184325e-06, + "loss": 0.1505, + "step": 44853 + }, + { + "epoch": 3.0083559413998913, + "grad_norm": 4.214216709136963, + "learning_rate": 3.956009318898178e-06, + "loss": 0.1574, + "step": 44854 + }, + { + "epoch": 3.0083695062398266, + "grad_norm": 7.680605411529541, + "learning_rate": 3.955872276277923e-06, + "loss": 0.1443, + "step": 44855 + }, + { + "epoch": 3.0083830710797614, + "grad_norm": 3.9334254264831543, + "learning_rate": 3.955735233657668e-06, + "loss": 0.1195, + "step": 44856 + }, + { + "epoch": 3.0083966359196963, + "grad_norm": 5.148617267608643, + "learning_rate": 3.955598191037413e-06, + "loss": 0.1463, + "step": 44857 + }, + { + "epoch": 3.008410200759631, + "grad_norm": 5.491179943084717, + "learning_rate": 3.9554611484171575e-06, + "loss": 0.2278, + "step": 44858 + }, + { + "epoch": 3.008423765599566, + "grad_norm": 4.463709354400635, + "learning_rate": 3.9553241057969036e-06, + "loss": 0.1301, + "step": 44859 + }, + { + "epoch": 3.008437330439501, + "grad_norm": 3.202345371246338, + "learning_rate": 3.955187063176648e-06, + "loss": 0.0915, + "step": 44860 + }, + { + "epoch": 3.0084508952794358, + "grad_norm": 4.228085994720459, + "learning_rate": 3.955050020556394e-06, + "loss": 0.0794, + "step": 44861 + }, + { + "epoch": 3.0084644601193706, + "grad_norm": 3.3713784217834473, + "learning_rate": 3.954912977936138e-06, + "loss": 0.1196, + "step": 44862 + }, + { + "epoch": 3.0084780249593055, + "grad_norm": 3.8324618339538574, + "learning_rate": 3.9547759353158834e-06, + "loss": 0.1409, + "step": 44863 + }, + { + "epoch": 3.0084915897992404, + "grad_norm": 4.136819362640381, + "learning_rate": 3.954638892695629e-06, + "loss": 0.1175, + "step": 44864 + }, + { + "epoch": 3.0085051546391752, + "grad_norm": 4.780464172363281, + "learning_rate": 3.954501850075374e-06, + "loss": 0.126, + "step": 44865 + }, + { + "epoch": 3.00851871947911, + "grad_norm": 4.777373790740967, + "learning_rate": 3.954364807455119e-06, + "loss": 0.0796, + "step": 44866 + }, + { + "epoch": 3.008532284319045, + "grad_norm": 3.659379243850708, + "learning_rate": 3.954227764834863e-06, + "loss": 0.157, + "step": 44867 + }, + { + "epoch": 3.00854584915898, + "grad_norm": 4.570199966430664, + "learning_rate": 3.954090722214609e-06, + "loss": 0.1677, + "step": 44868 + }, + { + "epoch": 3.0085594139989147, + "grad_norm": 3.8159101009368896, + "learning_rate": 3.953953679594354e-06, + "loss": 0.0824, + "step": 44869 + }, + { + "epoch": 3.0085729788388496, + "grad_norm": 4.929849147796631, + "learning_rate": 3.9538166369741e-06, + "loss": 0.1578, + "step": 44870 + }, + { + "epoch": 3.0085865436787844, + "grad_norm": 4.7268452644348145, + "learning_rate": 3.953679594353844e-06, + "loss": 0.1698, + "step": 44871 + }, + { + "epoch": 3.0086001085187193, + "grad_norm": 5.060667514801025, + "learning_rate": 3.953542551733589e-06, + "loss": 0.1295, + "step": 44872 + }, + { + "epoch": 3.008613673358654, + "grad_norm": 3.9911797046661377, + "learning_rate": 3.953405509113334e-06, + "loss": 0.1014, + "step": 44873 + }, + { + "epoch": 3.0086272381985895, + "grad_norm": 3.885934829711914, + "learning_rate": 3.9532684664930796e-06, + "loss": 0.1381, + "step": 44874 + }, + { + "epoch": 3.0086408030385243, + "grad_norm": 4.532039642333984, + "learning_rate": 3.953131423872825e-06, + "loss": 0.1942, + "step": 44875 + }, + { + "epoch": 3.008654367878459, + "grad_norm": 4.312134742736816, + "learning_rate": 3.95299438125257e-06, + "loss": 0.1701, + "step": 44876 + }, + { + "epoch": 3.008667932718394, + "grad_norm": 6.997989177703857, + "learning_rate": 3.952857338632315e-06, + "loss": 0.1457, + "step": 44877 + }, + { + "epoch": 3.008681497558329, + "grad_norm": 7.094825744628906, + "learning_rate": 3.95272029601206e-06, + "loss": 0.2197, + "step": 44878 + }, + { + "epoch": 3.008695062398264, + "grad_norm": 4.166245460510254, + "learning_rate": 3.9525832533918055e-06, + "loss": 0.0886, + "step": 44879 + }, + { + "epoch": 3.0087086272381987, + "grad_norm": 4.918633937835693, + "learning_rate": 3.952446210771551e-06, + "loss": 0.203, + "step": 44880 + }, + { + "epoch": 3.0087221920781335, + "grad_norm": 6.396998882293701, + "learning_rate": 3.952309168151296e-06, + "loss": 0.2257, + "step": 44881 + }, + { + "epoch": 3.0087357569180684, + "grad_norm": 4.796778202056885, + "learning_rate": 3.95217212553104e-06, + "loss": 0.1492, + "step": 44882 + }, + { + "epoch": 3.0087493217580032, + "grad_norm": 2.8535923957824707, + "learning_rate": 3.952035082910785e-06, + "loss": 0.0757, + "step": 44883 + }, + { + "epoch": 3.008762886597938, + "grad_norm": 2.753612518310547, + "learning_rate": 3.9518980402905305e-06, + "loss": 0.0923, + "step": 44884 + }, + { + "epoch": 3.008776451437873, + "grad_norm": 3.3675291538238525, + "learning_rate": 3.951760997670276e-06, + "loss": 0.0873, + "step": 44885 + }, + { + "epoch": 3.008790016277808, + "grad_norm": 4.730380535125732, + "learning_rate": 3.951623955050021e-06, + "loss": 0.1315, + "step": 44886 + }, + { + "epoch": 3.0088035811177427, + "grad_norm": 4.395781517028809, + "learning_rate": 3.951486912429766e-06, + "loss": 0.1849, + "step": 44887 + }, + { + "epoch": 3.0088171459576776, + "grad_norm": 3.685185432434082, + "learning_rate": 3.951349869809511e-06, + "loss": 0.1537, + "step": 44888 + }, + { + "epoch": 3.0088307107976124, + "grad_norm": 4.159185886383057, + "learning_rate": 3.951212827189256e-06, + "loss": 0.2405, + "step": 44889 + }, + { + "epoch": 3.0088442756375473, + "grad_norm": 3.9341742992401123, + "learning_rate": 3.9510757845690016e-06, + "loss": 0.1798, + "step": 44890 + }, + { + "epoch": 3.008857840477482, + "grad_norm": 3.5592896938323975, + "learning_rate": 3.950938741948747e-06, + "loss": 0.1032, + "step": 44891 + }, + { + "epoch": 3.008871405317417, + "grad_norm": 3.281717538833618, + "learning_rate": 3.950801699328491e-06, + "loss": 0.1432, + "step": 44892 + }, + { + "epoch": 3.0088849701573523, + "grad_norm": 3.701329231262207, + "learning_rate": 3.950664656708237e-06, + "loss": 0.0959, + "step": 44893 + }, + { + "epoch": 3.008898534997287, + "grad_norm": 6.903454303741455, + "learning_rate": 3.9505276140879814e-06, + "loss": 0.162, + "step": 44894 + }, + { + "epoch": 3.008912099837222, + "grad_norm": 3.2661445140838623, + "learning_rate": 3.950390571467727e-06, + "loss": 0.1222, + "step": 44895 + }, + { + "epoch": 3.008925664677157, + "grad_norm": 3.7180840969085693, + "learning_rate": 3.950253528847472e-06, + "loss": 0.0958, + "step": 44896 + }, + { + "epoch": 3.008939229517092, + "grad_norm": 4.213587284088135, + "learning_rate": 3.950116486227217e-06, + "loss": 0.1138, + "step": 44897 + }, + { + "epoch": 3.0089527943570267, + "grad_norm": 2.977295398712158, + "learning_rate": 3.949979443606962e-06, + "loss": 0.0626, + "step": 44898 + }, + { + "epoch": 3.0089663591969615, + "grad_norm": 5.22776460647583, + "learning_rate": 3.949842400986707e-06, + "loss": 0.2967, + "step": 44899 + }, + { + "epoch": 3.0089799240368964, + "grad_norm": 2.1997761726379395, + "learning_rate": 3.9497053583664525e-06, + "loss": 0.0621, + "step": 44900 + }, + { + "epoch": 3.0089934888768313, + "grad_norm": 3.2330386638641357, + "learning_rate": 3.949568315746197e-06, + "loss": 0.132, + "step": 44901 + }, + { + "epoch": 3.009007053716766, + "grad_norm": 3.2134933471679688, + "learning_rate": 3.949431273125943e-06, + "loss": 0.0618, + "step": 44902 + }, + { + "epoch": 3.009020618556701, + "grad_norm": 2.065768241882324, + "learning_rate": 3.949294230505687e-06, + "loss": 0.0531, + "step": 44903 + }, + { + "epoch": 3.009034183396636, + "grad_norm": 2.4553468227386475, + "learning_rate": 3.949157187885433e-06, + "loss": 0.0492, + "step": 44904 + }, + { + "epoch": 3.0090477482365707, + "grad_norm": 3.6506571769714355, + "learning_rate": 3.9490201452651776e-06, + "loss": 0.0995, + "step": 44905 + }, + { + "epoch": 3.0090613130765056, + "grad_norm": 3.089824914932251, + "learning_rate": 3.948883102644923e-06, + "loss": 0.0792, + "step": 44906 + }, + { + "epoch": 3.0090748779164405, + "grad_norm": 3.036017894744873, + "learning_rate": 3.948746060024668e-06, + "loss": 0.0999, + "step": 44907 + }, + { + "epoch": 3.0090884427563753, + "grad_norm": 2.1669774055480957, + "learning_rate": 3.948609017404413e-06, + "loss": 0.051, + "step": 44908 + }, + { + "epoch": 3.00910200759631, + "grad_norm": 3.678051471710205, + "learning_rate": 3.948471974784158e-06, + "loss": 0.0985, + "step": 44909 + }, + { + "epoch": 3.009115572436245, + "grad_norm": 2.1822266578674316, + "learning_rate": 3.9483349321639035e-06, + "loss": 0.065, + "step": 44910 + }, + { + "epoch": 3.00912913727618, + "grad_norm": 3.5858335494995117, + "learning_rate": 3.948197889543649e-06, + "loss": 0.0739, + "step": 44911 + }, + { + "epoch": 3.0091427021161152, + "grad_norm": 2.0030107498168945, + "learning_rate": 3.948060846923393e-06, + "loss": 0.0531, + "step": 44912 + }, + { + "epoch": 3.00915626695605, + "grad_norm": 3.1352455615997314, + "learning_rate": 3.947923804303139e-06, + "loss": 0.0493, + "step": 44913 + }, + { + "epoch": 3.009169831795985, + "grad_norm": 3.659897565841675, + "learning_rate": 3.947786761682883e-06, + "loss": 0.14, + "step": 44914 + }, + { + "epoch": 3.00918339663592, + "grad_norm": 3.3009536266326904, + "learning_rate": 3.947649719062629e-06, + "loss": 0.0906, + "step": 44915 + }, + { + "epoch": 3.0091969614758547, + "grad_norm": 3.3509035110473633, + "learning_rate": 3.947512676442374e-06, + "loss": 0.0764, + "step": 44916 + }, + { + "epoch": 3.0092105263157896, + "grad_norm": 2.3858420848846436, + "learning_rate": 3.947375633822119e-06, + "loss": 0.0738, + "step": 44917 + }, + { + "epoch": 3.0092240911557244, + "grad_norm": 3.1539247035980225, + "learning_rate": 3.947238591201864e-06, + "loss": 0.1021, + "step": 44918 + }, + { + "epoch": 3.0092376559956593, + "grad_norm": 2.5454742908477783, + "learning_rate": 3.947101548581609e-06, + "loss": 0.1013, + "step": 44919 + }, + { + "epoch": 3.009251220835594, + "grad_norm": 4.43073844909668, + "learning_rate": 3.946964505961354e-06, + "loss": 0.1913, + "step": 44920 + }, + { + "epoch": 3.009264785675529, + "grad_norm": 2.575524091720581, + "learning_rate": 3.9468274633411e-06, + "loss": 0.087, + "step": 44921 + }, + { + "epoch": 3.009278350515464, + "grad_norm": 2.54864501953125, + "learning_rate": 3.946690420720845e-06, + "loss": 0.101, + "step": 44922 + }, + { + "epoch": 3.0092919153553987, + "grad_norm": 3.275937795639038, + "learning_rate": 3.946553378100589e-06, + "loss": 0.1069, + "step": 44923 + }, + { + "epoch": 3.0093054801953336, + "grad_norm": 4.112457275390625, + "learning_rate": 3.946416335480335e-06, + "loss": 0.1748, + "step": 44924 + }, + { + "epoch": 3.0093190450352685, + "grad_norm": 5.737401485443115, + "learning_rate": 3.9462792928600794e-06, + "loss": 0.1622, + "step": 44925 + }, + { + "epoch": 3.0093326098752033, + "grad_norm": 5.340938568115234, + "learning_rate": 3.946142250239825e-06, + "loss": 0.218, + "step": 44926 + }, + { + "epoch": 3.009346174715138, + "grad_norm": 3.078181505203247, + "learning_rate": 3.94600520761957e-06, + "loss": 0.1199, + "step": 44927 + }, + { + "epoch": 3.009359739555073, + "grad_norm": 2.5981526374816895, + "learning_rate": 3.945868164999315e-06, + "loss": 0.0891, + "step": 44928 + }, + { + "epoch": 3.009373304395008, + "grad_norm": 6.009028911590576, + "learning_rate": 3.94573112237906e-06, + "loss": 0.2114, + "step": 44929 + }, + { + "epoch": 3.009386869234943, + "grad_norm": 4.094763278961182, + "learning_rate": 3.945594079758805e-06, + "loss": 0.1225, + "step": 44930 + }, + { + "epoch": 3.009400434074878, + "grad_norm": 3.8782355785369873, + "learning_rate": 3.9454570371385505e-06, + "loss": 0.1604, + "step": 44931 + }, + { + "epoch": 3.009413998914813, + "grad_norm": 2.99055814743042, + "learning_rate": 3.945319994518296e-06, + "loss": 0.0918, + "step": 44932 + }, + { + "epoch": 3.009427563754748, + "grad_norm": 5.98298454284668, + "learning_rate": 3.945182951898041e-06, + "loss": 0.1735, + "step": 44933 + }, + { + "epoch": 3.0094411285946827, + "grad_norm": 5.8243184089660645, + "learning_rate": 3.945045909277786e-06, + "loss": 0.2178, + "step": 44934 + }, + { + "epoch": 3.0094546934346176, + "grad_norm": 2.640923023223877, + "learning_rate": 3.94490886665753e-06, + "loss": 0.1287, + "step": 44935 + }, + { + "epoch": 3.0094682582745524, + "grad_norm": 2.7166101932525635, + "learning_rate": 3.9447718240372756e-06, + "loss": 0.0928, + "step": 44936 + }, + { + "epoch": 3.0094818231144873, + "grad_norm": 2.7159671783447266, + "learning_rate": 3.944634781417021e-06, + "loss": 0.0779, + "step": 44937 + }, + { + "epoch": 3.009495387954422, + "grad_norm": 3.398974657058716, + "learning_rate": 3.944497738796766e-06, + "loss": 0.1081, + "step": 44938 + }, + { + "epoch": 3.009508952794357, + "grad_norm": 4.106166362762451, + "learning_rate": 3.944360696176511e-06, + "loss": 0.1144, + "step": 44939 + }, + { + "epoch": 3.009522517634292, + "grad_norm": 3.8767759799957275, + "learning_rate": 3.944223653556256e-06, + "loss": 0.1011, + "step": 44940 + }, + { + "epoch": 3.0095360824742268, + "grad_norm": 3.7648391723632812, + "learning_rate": 3.9440866109360015e-06, + "loss": 0.0948, + "step": 44941 + }, + { + "epoch": 3.0095496473141616, + "grad_norm": 3.9855117797851562, + "learning_rate": 3.943949568315747e-06, + "loss": 0.0954, + "step": 44942 + }, + { + "epoch": 3.0095632121540965, + "grad_norm": 3.527890205383301, + "learning_rate": 3.943812525695492e-06, + "loss": 0.1069, + "step": 44943 + }, + { + "epoch": 3.0095767769940314, + "grad_norm": 3.4004924297332764, + "learning_rate": 3.943675483075237e-06, + "loss": 0.0706, + "step": 44944 + }, + { + "epoch": 3.0095903418339662, + "grad_norm": 5.428330421447754, + "learning_rate": 3.943538440454982e-06, + "loss": 0.1565, + "step": 44945 + }, + { + "epoch": 3.009603906673901, + "grad_norm": 4.126845836639404, + "learning_rate": 3.9434013978347265e-06, + "loss": 0.1268, + "step": 44946 + }, + { + "epoch": 3.009617471513836, + "grad_norm": 4.137480735778809, + "learning_rate": 3.9432643552144725e-06, + "loss": 0.0914, + "step": 44947 + }, + { + "epoch": 3.009631036353771, + "grad_norm": 3.84706449508667, + "learning_rate": 3.943127312594217e-06, + "loss": 0.1344, + "step": 44948 + }, + { + "epoch": 3.009644601193706, + "grad_norm": 3.7079384326934814, + "learning_rate": 3.942990269973962e-06, + "loss": 0.1044, + "step": 44949 + }, + { + "epoch": 3.009658166033641, + "grad_norm": 4.755125999450684, + "learning_rate": 3.942853227353707e-06, + "loss": 0.1576, + "step": 44950 + }, + { + "epoch": 3.009671730873576, + "grad_norm": 3.462794303894043, + "learning_rate": 3.942716184733452e-06, + "loss": 0.0855, + "step": 44951 + }, + { + "epoch": 3.0096852957135107, + "grad_norm": 4.203447341918945, + "learning_rate": 3.942579142113198e-06, + "loss": 0.1779, + "step": 44952 + }, + { + "epoch": 3.0096988605534456, + "grad_norm": 4.208522319793701, + "learning_rate": 3.942442099492943e-06, + "loss": 0.1286, + "step": 44953 + }, + { + "epoch": 3.0097124253933805, + "grad_norm": 3.573221445083618, + "learning_rate": 3.942305056872688e-06, + "loss": 0.1252, + "step": 44954 + }, + { + "epoch": 3.0097259902333153, + "grad_norm": 3.176835536956787, + "learning_rate": 3.942168014252432e-06, + "loss": 0.0965, + "step": 44955 + }, + { + "epoch": 3.00973955507325, + "grad_norm": 2.605104684829712, + "learning_rate": 3.942030971632178e-06, + "loss": 0.0474, + "step": 44956 + }, + { + "epoch": 3.009753119913185, + "grad_norm": 3.1895368099212646, + "learning_rate": 3.941893929011923e-06, + "loss": 0.0722, + "step": 44957 + }, + { + "epoch": 3.00976668475312, + "grad_norm": 4.070010185241699, + "learning_rate": 3.941756886391669e-06, + "loss": 0.0904, + "step": 44958 + }, + { + "epoch": 3.009780249593055, + "grad_norm": 2.249807357788086, + "learning_rate": 3.941619843771413e-06, + "loss": 0.0572, + "step": 44959 + }, + { + "epoch": 3.0097938144329897, + "grad_norm": 3.9960429668426514, + "learning_rate": 3.941482801151158e-06, + "loss": 0.1132, + "step": 44960 + }, + { + "epoch": 3.0098073792729245, + "grad_norm": 3.33707857131958, + "learning_rate": 3.941345758530903e-06, + "loss": 0.1123, + "step": 44961 + }, + { + "epoch": 3.0098209441128594, + "grad_norm": 2.713860511779785, + "learning_rate": 3.9412087159106485e-06, + "loss": 0.0588, + "step": 44962 + }, + { + "epoch": 3.0098345089527943, + "grad_norm": 3.3185601234436035, + "learning_rate": 3.941071673290394e-06, + "loss": 0.1004, + "step": 44963 + }, + { + "epoch": 3.009848073792729, + "grad_norm": 3.803025007247925, + "learning_rate": 3.940934630670139e-06, + "loss": 0.1146, + "step": 44964 + }, + { + "epoch": 3.009861638632664, + "grad_norm": 3.0770862102508545, + "learning_rate": 3.940797588049884e-06, + "loss": 0.082, + "step": 44965 + }, + { + "epoch": 3.009875203472599, + "grad_norm": 5.8871917724609375, + "learning_rate": 3.940660545429628e-06, + "loss": 0.1194, + "step": 44966 + }, + { + "epoch": 3.0098887683125337, + "grad_norm": 4.308560848236084, + "learning_rate": 3.940523502809374e-06, + "loss": 0.0937, + "step": 44967 + }, + { + "epoch": 3.0099023331524686, + "grad_norm": 4.290681838989258, + "learning_rate": 3.940386460189119e-06, + "loss": 0.1067, + "step": 44968 + }, + { + "epoch": 3.009915897992404, + "grad_norm": 4.022376537322998, + "learning_rate": 3.940249417568865e-06, + "loss": 0.0992, + "step": 44969 + }, + { + "epoch": 3.0099294628323388, + "grad_norm": 2.8083817958831787, + "learning_rate": 3.940112374948609e-06, + "loss": 0.0998, + "step": 44970 + }, + { + "epoch": 3.0099430276722736, + "grad_norm": 3.235549211502075, + "learning_rate": 3.939975332328354e-06, + "loss": 0.0873, + "step": 44971 + }, + { + "epoch": 3.0099565925122085, + "grad_norm": 2.814971446990967, + "learning_rate": 3.9398382897080995e-06, + "loss": 0.0597, + "step": 44972 + }, + { + "epoch": 3.0099701573521433, + "grad_norm": 3.463587999343872, + "learning_rate": 3.939701247087845e-06, + "loss": 0.0881, + "step": 44973 + }, + { + "epoch": 3.009983722192078, + "grad_norm": 3.8041744232177734, + "learning_rate": 3.93956420446759e-06, + "loss": 0.11, + "step": 44974 + }, + { + "epoch": 3.009997287032013, + "grad_norm": 2.8449106216430664, + "learning_rate": 3.939427161847335e-06, + "loss": 0.0595, + "step": 44975 + }, + { + "epoch": 3.010010851871948, + "grad_norm": 4.2014007568359375, + "learning_rate": 3.93929011922708e-06, + "loss": 0.1309, + "step": 44976 + }, + { + "epoch": 3.010024416711883, + "grad_norm": 2.934527635574341, + "learning_rate": 3.9391530766068245e-06, + "loss": 0.0405, + "step": 44977 + }, + { + "epoch": 3.0100379815518177, + "grad_norm": 3.249390125274658, + "learning_rate": 3.9390160339865705e-06, + "loss": 0.1235, + "step": 44978 + }, + { + "epoch": 3.0100515463917525, + "grad_norm": 2.4297289848327637, + "learning_rate": 3.938878991366315e-06, + "loss": 0.0392, + "step": 44979 + }, + { + "epoch": 3.0100651112316874, + "grad_norm": 5.353959560394287, + "learning_rate": 3.93874194874606e-06, + "loss": 0.1101, + "step": 44980 + }, + { + "epoch": 3.0100786760716223, + "grad_norm": 3.0853700637817383, + "learning_rate": 3.938604906125805e-06, + "loss": 0.0383, + "step": 44981 + }, + { + "epoch": 3.010092240911557, + "grad_norm": 4.820535182952881, + "learning_rate": 3.93846786350555e-06, + "loss": 0.109, + "step": 44982 + }, + { + "epoch": 3.010105805751492, + "grad_norm": 3.039348602294922, + "learning_rate": 3.938330820885296e-06, + "loss": 0.0644, + "step": 44983 + }, + { + "epoch": 3.010119370591427, + "grad_norm": 3.7554214000701904, + "learning_rate": 3.938193778265041e-06, + "loss": 0.1402, + "step": 44984 + }, + { + "epoch": 3.0101329354313617, + "grad_norm": 3.3376574516296387, + "learning_rate": 3.938056735644786e-06, + "loss": 0.0742, + "step": 44985 + }, + { + "epoch": 3.0101465002712966, + "grad_norm": 4.980407238006592, + "learning_rate": 3.937919693024531e-06, + "loss": 0.1428, + "step": 44986 + }, + { + "epoch": 3.010160065111232, + "grad_norm": 3.5783886909484863, + "learning_rate": 3.937782650404276e-06, + "loss": 0.0767, + "step": 44987 + }, + { + "epoch": 3.0101736299511668, + "grad_norm": 2.9484686851501465, + "learning_rate": 3.9376456077840215e-06, + "loss": 0.0951, + "step": 44988 + }, + { + "epoch": 3.0101871947911016, + "grad_norm": 3.7295451164245605, + "learning_rate": 3.937508565163766e-06, + "loss": 0.0965, + "step": 44989 + }, + { + "epoch": 3.0102007596310365, + "grad_norm": 3.6491241455078125, + "learning_rate": 3.937371522543512e-06, + "loss": 0.1412, + "step": 44990 + }, + { + "epoch": 3.0102143244709714, + "grad_norm": 3.6115591526031494, + "learning_rate": 3.937234479923256e-06, + "loss": 0.1113, + "step": 44991 + }, + { + "epoch": 3.0102278893109062, + "grad_norm": 5.11383056640625, + "learning_rate": 3.937097437303001e-06, + "loss": 0.1262, + "step": 44992 + }, + { + "epoch": 3.010241454150841, + "grad_norm": 2.529853343963623, + "learning_rate": 3.9369603946827465e-06, + "loss": 0.0935, + "step": 44993 + }, + { + "epoch": 3.010255018990776, + "grad_norm": 4.087149620056152, + "learning_rate": 3.936823352062492e-06, + "loss": 0.0689, + "step": 44994 + }, + { + "epoch": 3.010268583830711, + "grad_norm": 4.127188205718994, + "learning_rate": 3.936686309442237e-06, + "loss": 0.1395, + "step": 44995 + }, + { + "epoch": 3.0102821486706457, + "grad_norm": 3.2401537895202637, + "learning_rate": 3.936549266821982e-06, + "loss": 0.0808, + "step": 44996 + }, + { + "epoch": 3.0102957135105806, + "grad_norm": 3.478682279586792, + "learning_rate": 3.936412224201727e-06, + "loss": 0.1368, + "step": 44997 + }, + { + "epoch": 3.0103092783505154, + "grad_norm": 3.433511257171631, + "learning_rate": 3.9362751815814724e-06, + "loss": 0.1471, + "step": 44998 + }, + { + "epoch": 3.0103228431904503, + "grad_norm": 3.3359739780426025, + "learning_rate": 3.936138138961218e-06, + "loss": 0.1071, + "step": 44999 + }, + { + "epoch": 3.010336408030385, + "grad_norm": 3.8748934268951416, + "learning_rate": 3.936001096340962e-06, + "loss": 0.1412, + "step": 45000 + }, + { + "epoch": 3.01034997287032, + "grad_norm": 4.132126331329346, + "learning_rate": 3.935864053720708e-06, + "loss": 0.1848, + "step": 45001 + }, + { + "epoch": 3.010363537710255, + "grad_norm": 2.9784791469573975, + "learning_rate": 3.935727011100452e-06, + "loss": 0.0681, + "step": 45002 + }, + { + "epoch": 3.0103771025501898, + "grad_norm": 3.5566306114196777, + "learning_rate": 3.935589968480198e-06, + "loss": 0.1212, + "step": 45003 + }, + { + "epoch": 3.0103906673901246, + "grad_norm": 3.0682358741760254, + "learning_rate": 3.935452925859943e-06, + "loss": 0.1172, + "step": 45004 + }, + { + "epoch": 3.0104042322300595, + "grad_norm": 3.0369181632995605, + "learning_rate": 3.935315883239688e-06, + "loss": 0.08, + "step": 45005 + }, + { + "epoch": 3.010417797069995, + "grad_norm": 3.4084389209747314, + "learning_rate": 3.935178840619433e-06, + "loss": 0.112, + "step": 45006 + }, + { + "epoch": 3.0104313619099297, + "grad_norm": 3.4743800163269043, + "learning_rate": 3.935041797999178e-06, + "loss": 0.0798, + "step": 45007 + }, + { + "epoch": 3.0104449267498645, + "grad_norm": 2.836160182952881, + "learning_rate": 3.934904755378923e-06, + "loss": 0.0743, + "step": 45008 + }, + { + "epoch": 3.0104584915897994, + "grad_norm": 3.680788516998291, + "learning_rate": 3.934767712758668e-06, + "loss": 0.1321, + "step": 45009 + }, + { + "epoch": 3.0104720564297343, + "grad_norm": 3.1293418407440186, + "learning_rate": 3.934630670138414e-06, + "loss": 0.052, + "step": 45010 + }, + { + "epoch": 3.010485621269669, + "grad_norm": 3.7739336490631104, + "learning_rate": 3.934493627518158e-06, + "loss": 0.094, + "step": 45011 + }, + { + "epoch": 3.010499186109604, + "grad_norm": 3.7905373573303223, + "learning_rate": 3.934356584897904e-06, + "loss": 0.1297, + "step": 45012 + }, + { + "epoch": 3.010512750949539, + "grad_norm": 3.269287109375, + "learning_rate": 3.934219542277648e-06, + "loss": 0.0811, + "step": 45013 + }, + { + "epoch": 3.0105263157894737, + "grad_norm": 2.7708098888397217, + "learning_rate": 3.934082499657394e-06, + "loss": 0.1052, + "step": 45014 + }, + { + "epoch": 3.0105398806294086, + "grad_norm": 3.185438394546509, + "learning_rate": 3.933945457037139e-06, + "loss": 0.1307, + "step": 45015 + }, + { + "epoch": 3.0105534454693434, + "grad_norm": 2.771162748336792, + "learning_rate": 3.933808414416884e-06, + "loss": 0.0623, + "step": 45016 + }, + { + "epoch": 3.0105670103092783, + "grad_norm": 3.0397655963897705, + "learning_rate": 3.933671371796629e-06, + "loss": 0.0797, + "step": 45017 + }, + { + "epoch": 3.010580575149213, + "grad_norm": 5.26124382019043, + "learning_rate": 3.933534329176374e-06, + "loss": 0.1729, + "step": 45018 + }, + { + "epoch": 3.010594139989148, + "grad_norm": 3.020373821258545, + "learning_rate": 3.9333972865561195e-06, + "loss": 0.1046, + "step": 45019 + }, + { + "epoch": 3.010607704829083, + "grad_norm": 3.2866194248199463, + "learning_rate": 3.933260243935864e-06, + "loss": 0.0894, + "step": 45020 + }, + { + "epoch": 3.0106212696690178, + "grad_norm": 3.662478446960449, + "learning_rate": 3.93312320131561e-06, + "loss": 0.1123, + "step": 45021 + }, + { + "epoch": 3.0106348345089526, + "grad_norm": 3.1581270694732666, + "learning_rate": 3.932986158695354e-06, + "loss": 0.0907, + "step": 45022 + }, + { + "epoch": 3.0106483993488875, + "grad_norm": 3.746108293533325, + "learning_rate": 3.9328491160751e-06, + "loss": 0.0736, + "step": 45023 + }, + { + "epoch": 3.0106619641888224, + "grad_norm": 4.15556001663208, + "learning_rate": 3.9327120734548445e-06, + "loss": 0.1234, + "step": 45024 + }, + { + "epoch": 3.0106755290287577, + "grad_norm": 3.9295663833618164, + "learning_rate": 3.93257503083459e-06, + "loss": 0.1548, + "step": 45025 + }, + { + "epoch": 3.0106890938686925, + "grad_norm": 2.80869460105896, + "learning_rate": 3.932437988214335e-06, + "loss": 0.0637, + "step": 45026 + }, + { + "epoch": 3.0107026587086274, + "grad_norm": 3.486330509185791, + "learning_rate": 3.93230094559408e-06, + "loss": 0.1114, + "step": 45027 + }, + { + "epoch": 3.0107162235485623, + "grad_norm": 3.473073959350586, + "learning_rate": 3.932163902973825e-06, + "loss": 0.0995, + "step": 45028 + }, + { + "epoch": 3.010729788388497, + "grad_norm": 4.5158538818359375, + "learning_rate": 3.9320268603535704e-06, + "loss": 0.0911, + "step": 45029 + }, + { + "epoch": 3.010743353228432, + "grad_norm": 7.355709075927734, + "learning_rate": 3.931889817733316e-06, + "loss": 0.0814, + "step": 45030 + }, + { + "epoch": 3.010756918068367, + "grad_norm": 3.355581760406494, + "learning_rate": 3.931752775113061e-06, + "loss": 0.0704, + "step": 45031 + }, + { + "epoch": 3.0107704829083017, + "grad_norm": 3.307096242904663, + "learning_rate": 3.931615732492806e-06, + "loss": 0.0944, + "step": 45032 + }, + { + "epoch": 3.0107840477482366, + "grad_norm": 2.771756887435913, + "learning_rate": 3.93147868987255e-06, + "loss": 0.0738, + "step": 45033 + }, + { + "epoch": 3.0107976125881715, + "grad_norm": 5.2321882247924805, + "learning_rate": 3.9313416472522955e-06, + "loss": 0.1323, + "step": 45034 + }, + { + "epoch": 3.0108111774281063, + "grad_norm": 3.5373127460479736, + "learning_rate": 3.931204604632041e-06, + "loss": 0.0833, + "step": 45035 + }, + { + "epoch": 3.010824742268041, + "grad_norm": 4.768257141113281, + "learning_rate": 3.931067562011786e-06, + "loss": 0.1011, + "step": 45036 + }, + { + "epoch": 3.010838307107976, + "grad_norm": 3.9425699710845947, + "learning_rate": 3.930930519391531e-06, + "loss": 0.1297, + "step": 45037 + }, + { + "epoch": 3.010851871947911, + "grad_norm": 3.5606322288513184, + "learning_rate": 3.930793476771276e-06, + "loss": 0.0955, + "step": 45038 + }, + { + "epoch": 3.010865436787846, + "grad_norm": 4.474030494689941, + "learning_rate": 3.930656434151021e-06, + "loss": 0.1724, + "step": 45039 + }, + { + "epoch": 3.0108790016277807, + "grad_norm": 3.650285243988037, + "learning_rate": 3.9305193915307665e-06, + "loss": 0.0869, + "step": 45040 + }, + { + "epoch": 3.0108925664677155, + "grad_norm": 4.186142921447754, + "learning_rate": 3.930382348910512e-06, + "loss": 0.0677, + "step": 45041 + }, + { + "epoch": 3.0109061313076504, + "grad_norm": 2.982569456100464, + "learning_rate": 3.930245306290257e-06, + "loss": 0.0998, + "step": 45042 + }, + { + "epoch": 3.0109196961475853, + "grad_norm": 3.358464479446411, + "learning_rate": 3.930108263670001e-06, + "loss": 0.0577, + "step": 45043 + }, + { + "epoch": 3.0109332609875206, + "grad_norm": 4.025033473968506, + "learning_rate": 3.929971221049747e-06, + "loss": 0.1025, + "step": 45044 + }, + { + "epoch": 3.0109468258274554, + "grad_norm": 4.333016395568848, + "learning_rate": 3.929834178429492e-06, + "loss": 0.1196, + "step": 45045 + }, + { + "epoch": 3.0109603906673903, + "grad_norm": 3.4161946773529053, + "learning_rate": 3.929697135809237e-06, + "loss": 0.086, + "step": 45046 + }, + { + "epoch": 3.010973955507325, + "grad_norm": 3.295121908187866, + "learning_rate": 3.929560093188982e-06, + "loss": 0.0823, + "step": 45047 + }, + { + "epoch": 3.01098752034726, + "grad_norm": 2.9407544136047363, + "learning_rate": 3.929423050568727e-06, + "loss": 0.0693, + "step": 45048 + }, + { + "epoch": 3.011001085187195, + "grad_norm": 3.925278663635254, + "learning_rate": 3.929286007948472e-06, + "loss": 0.0825, + "step": 45049 + }, + { + "epoch": 3.0110146500271298, + "grad_norm": 3.321575403213501, + "learning_rate": 3.9291489653282175e-06, + "loss": 0.0945, + "step": 45050 + }, + { + "epoch": 3.0110282148670646, + "grad_norm": 3.8574559688568115, + "learning_rate": 3.929011922707963e-06, + "loss": 0.1404, + "step": 45051 + }, + { + "epoch": 3.0110417797069995, + "grad_norm": 3.60090708732605, + "learning_rate": 3.928874880087708e-06, + "loss": 0.0926, + "step": 45052 + }, + { + "epoch": 3.0110553445469344, + "grad_norm": 5.2515716552734375, + "learning_rate": 3.928737837467453e-06, + "loss": 0.1164, + "step": 45053 + }, + { + "epoch": 3.011068909386869, + "grad_norm": 3.3654892444610596, + "learning_rate": 3.928600794847197e-06, + "loss": 0.0866, + "step": 45054 + }, + { + "epoch": 3.011082474226804, + "grad_norm": 4.599218845367432, + "learning_rate": 3.928463752226943e-06, + "loss": 0.2153, + "step": 45055 + }, + { + "epoch": 3.011096039066739, + "grad_norm": 6.09365177154541, + "learning_rate": 3.928326709606688e-06, + "loss": 0.2133, + "step": 45056 + }, + { + "epoch": 3.011109603906674, + "grad_norm": 3.039963722229004, + "learning_rate": 3.928189666986434e-06, + "loss": 0.0655, + "step": 45057 + }, + { + "epoch": 3.0111231687466087, + "grad_norm": 4.670774936676025, + "learning_rate": 3.928052624366178e-06, + "loss": 0.1357, + "step": 45058 + }, + { + "epoch": 3.0111367335865435, + "grad_norm": 3.819303035736084, + "learning_rate": 3.927915581745923e-06, + "loss": 0.1324, + "step": 45059 + }, + { + "epoch": 3.0111502984264784, + "grad_norm": 4.1117024421691895, + "learning_rate": 3.9277785391256684e-06, + "loss": 0.1015, + "step": 45060 + }, + { + "epoch": 3.0111638632664133, + "grad_norm": 5.3015289306640625, + "learning_rate": 3.927641496505414e-06, + "loss": 0.1755, + "step": 45061 + }, + { + "epoch": 3.011177428106348, + "grad_norm": 5.159714221954346, + "learning_rate": 3.927504453885159e-06, + "loss": 0.185, + "step": 45062 + }, + { + "epoch": 3.0111909929462835, + "grad_norm": 5.099284648895264, + "learning_rate": 3.927367411264903e-06, + "loss": 0.1987, + "step": 45063 + }, + { + "epoch": 3.0112045577862183, + "grad_norm": 6.18085241317749, + "learning_rate": 3.927230368644649e-06, + "loss": 0.1395, + "step": 45064 + }, + { + "epoch": 3.011218122626153, + "grad_norm": 4.007419109344482, + "learning_rate": 3.9270933260243935e-06, + "loss": 0.0921, + "step": 45065 + }, + { + "epoch": 3.011231687466088, + "grad_norm": 4.160006046295166, + "learning_rate": 3.9269562834041395e-06, + "loss": 0.1126, + "step": 45066 + }, + { + "epoch": 3.011245252306023, + "grad_norm": 4.990450382232666, + "learning_rate": 3.926819240783884e-06, + "loss": 0.1358, + "step": 45067 + }, + { + "epoch": 3.0112588171459578, + "grad_norm": 4.233363151550293, + "learning_rate": 3.926682198163629e-06, + "loss": 0.1262, + "step": 45068 + }, + { + "epoch": 3.0112723819858926, + "grad_norm": 4.694145679473877, + "learning_rate": 3.926545155543374e-06, + "loss": 0.1983, + "step": 45069 + }, + { + "epoch": 3.0112859468258275, + "grad_norm": 3.9478330612182617, + "learning_rate": 3.926408112923119e-06, + "loss": 0.162, + "step": 45070 + }, + { + "epoch": 3.0112995116657624, + "grad_norm": 4.050221920013428, + "learning_rate": 3.9262710703028646e-06, + "loss": 0.1177, + "step": 45071 + }, + { + "epoch": 3.0113130765056972, + "grad_norm": 4.945483684539795, + "learning_rate": 3.92613402768261e-06, + "loss": 0.1999, + "step": 45072 + }, + { + "epoch": 3.011326641345632, + "grad_norm": 4.36460542678833, + "learning_rate": 3.925996985062355e-06, + "loss": 0.1692, + "step": 45073 + }, + { + "epoch": 3.011340206185567, + "grad_norm": 3.2323367595672607, + "learning_rate": 3.925859942442099e-06, + "loss": 0.1065, + "step": 45074 + }, + { + "epoch": 3.011353771025502, + "grad_norm": 4.994514465332031, + "learning_rate": 3.925722899821845e-06, + "loss": 0.1679, + "step": 45075 + }, + { + "epoch": 3.0113673358654367, + "grad_norm": 3.5501983165740967, + "learning_rate": 3.92558585720159e-06, + "loss": 0.0992, + "step": 45076 + }, + { + "epoch": 3.0113809007053716, + "grad_norm": 4.343653678894043, + "learning_rate": 3.925448814581335e-06, + "loss": 0.0815, + "step": 45077 + }, + { + "epoch": 3.0113944655453064, + "grad_norm": 3.5567803382873535, + "learning_rate": 3.92531177196108e-06, + "loss": 0.1299, + "step": 45078 + }, + { + "epoch": 3.0114080303852413, + "grad_norm": 4.085153102874756, + "learning_rate": 3.925174729340825e-06, + "loss": 0.1946, + "step": 45079 + }, + { + "epoch": 3.011421595225176, + "grad_norm": 3.951643228530884, + "learning_rate": 3.92503768672057e-06, + "loss": 0.1416, + "step": 45080 + }, + { + "epoch": 3.011435160065111, + "grad_norm": 3.189392328262329, + "learning_rate": 3.9249006441003155e-06, + "loss": 0.0969, + "step": 45081 + }, + { + "epoch": 3.0114487249050463, + "grad_norm": 4.949991703033447, + "learning_rate": 3.924763601480061e-06, + "loss": 0.1216, + "step": 45082 + }, + { + "epoch": 3.011462289744981, + "grad_norm": 3.5767579078674316, + "learning_rate": 3.924626558859806e-06, + "loss": 0.1027, + "step": 45083 + }, + { + "epoch": 3.011475854584916, + "grad_norm": 4.34745979309082, + "learning_rate": 3.924489516239551e-06, + "loss": 0.1753, + "step": 45084 + }, + { + "epoch": 3.011489419424851, + "grad_norm": 3.624415636062622, + "learning_rate": 3.924352473619296e-06, + "loss": 0.0872, + "step": 45085 + }, + { + "epoch": 3.011502984264786, + "grad_norm": 3.83005952835083, + "learning_rate": 3.924215430999041e-06, + "loss": 0.0702, + "step": 45086 + }, + { + "epoch": 3.0115165491047207, + "grad_norm": 3.499476909637451, + "learning_rate": 3.924078388378786e-06, + "loss": 0.0787, + "step": 45087 + }, + { + "epoch": 3.0115301139446555, + "grad_norm": 3.4029035568237305, + "learning_rate": 3.923941345758531e-06, + "loss": 0.0828, + "step": 45088 + }, + { + "epoch": 3.0115436787845904, + "grad_norm": 2.417393684387207, + "learning_rate": 3.923804303138276e-06, + "loss": 0.0825, + "step": 45089 + }, + { + "epoch": 3.0115572436245253, + "grad_norm": 4.018789291381836, + "learning_rate": 3.923667260518021e-06, + "loss": 0.1831, + "step": 45090 + }, + { + "epoch": 3.01157080846446, + "grad_norm": 4.048212051391602, + "learning_rate": 3.9235302178977664e-06, + "loss": 0.1102, + "step": 45091 + }, + { + "epoch": 3.011584373304395, + "grad_norm": 4.486072540283203, + "learning_rate": 3.923393175277512e-06, + "loss": 0.178, + "step": 45092 + }, + { + "epoch": 3.01159793814433, + "grad_norm": 4.264674663543701, + "learning_rate": 3.923256132657257e-06, + "loss": 0.1222, + "step": 45093 + }, + { + "epoch": 3.0116115029842647, + "grad_norm": 4.278429985046387, + "learning_rate": 3.923119090037002e-06, + "loss": 0.0943, + "step": 45094 + }, + { + "epoch": 3.0116250678241996, + "grad_norm": 3.337888240814209, + "learning_rate": 3.922982047416747e-06, + "loss": 0.1355, + "step": 45095 + }, + { + "epoch": 3.0116386326641345, + "grad_norm": 2.984968900680542, + "learning_rate": 3.922845004796492e-06, + "loss": 0.1042, + "step": 45096 + }, + { + "epoch": 3.0116521975040693, + "grad_norm": 4.835737705230713, + "learning_rate": 3.922707962176237e-06, + "loss": 0.1607, + "step": 45097 + }, + { + "epoch": 3.011665762344004, + "grad_norm": 2.9121203422546387, + "learning_rate": 3.922570919555983e-06, + "loss": 0.0816, + "step": 45098 + }, + { + "epoch": 3.011679327183939, + "grad_norm": 4.276001453399658, + "learning_rate": 3.922433876935727e-06, + "loss": 0.1572, + "step": 45099 + }, + { + "epoch": 3.011692892023874, + "grad_norm": 3.865013599395752, + "learning_rate": 3.922296834315473e-06, + "loss": 0.1188, + "step": 45100 + }, + { + "epoch": 3.0117064568638092, + "grad_norm": 4.379191875457764, + "learning_rate": 3.922159791695217e-06, + "loss": 0.1415, + "step": 45101 + }, + { + "epoch": 3.011720021703744, + "grad_norm": 4.546566486358643, + "learning_rate": 3.9220227490749626e-06, + "loss": 0.1774, + "step": 45102 + }, + { + "epoch": 3.011733586543679, + "grad_norm": 4.390886306762695, + "learning_rate": 3.921885706454708e-06, + "loss": 0.0861, + "step": 45103 + }, + { + "epoch": 3.011747151383614, + "grad_norm": 2.6835715770721436, + "learning_rate": 3.921748663834453e-06, + "loss": 0.0804, + "step": 45104 + }, + { + "epoch": 3.0117607162235487, + "grad_norm": 4.156981468200684, + "learning_rate": 3.921611621214198e-06, + "loss": 0.1192, + "step": 45105 + }, + { + "epoch": 3.0117742810634835, + "grad_norm": 4.191978931427002, + "learning_rate": 3.921474578593943e-06, + "loss": 0.1395, + "step": 45106 + }, + { + "epoch": 3.0117878459034184, + "grad_norm": 4.902332305908203, + "learning_rate": 3.9213375359736885e-06, + "loss": 0.1365, + "step": 45107 + }, + { + "epoch": 3.0118014107433533, + "grad_norm": 3.337981700897217, + "learning_rate": 3.921200493353433e-06, + "loss": 0.1303, + "step": 45108 + }, + { + "epoch": 3.011814975583288, + "grad_norm": 4.345434188842773, + "learning_rate": 3.921063450733179e-06, + "loss": 0.1741, + "step": 45109 + }, + { + "epoch": 3.011828540423223, + "grad_norm": 3.263336658477783, + "learning_rate": 3.920926408112923e-06, + "loss": 0.1114, + "step": 45110 + }, + { + "epoch": 3.011842105263158, + "grad_norm": 3.918980360031128, + "learning_rate": 3.920789365492669e-06, + "loss": 0.1425, + "step": 45111 + }, + { + "epoch": 3.0118556701030927, + "grad_norm": 3.5684831142425537, + "learning_rate": 3.9206523228724135e-06, + "loss": 0.1002, + "step": 45112 + }, + { + "epoch": 3.0118692349430276, + "grad_norm": 5.432552814483643, + "learning_rate": 3.920515280252159e-06, + "loss": 0.141, + "step": 45113 + }, + { + "epoch": 3.0118827997829625, + "grad_norm": 4.4667510986328125, + "learning_rate": 3.920378237631904e-06, + "loss": 0.1197, + "step": 45114 + }, + { + "epoch": 3.0118963646228973, + "grad_norm": 5.715919494628906, + "learning_rate": 3.920241195011649e-06, + "loss": 0.1363, + "step": 45115 + }, + { + "epoch": 3.011909929462832, + "grad_norm": 3.4926047325134277, + "learning_rate": 3.920104152391394e-06, + "loss": 0.1371, + "step": 45116 + }, + { + "epoch": 3.011923494302767, + "grad_norm": 3.7393393516540527, + "learning_rate": 3.9199671097711385e-06, + "loss": 0.1768, + "step": 45117 + }, + { + "epoch": 3.011937059142702, + "grad_norm": 3.4875667095184326, + "learning_rate": 3.9198300671508846e-06, + "loss": 0.1313, + "step": 45118 + }, + { + "epoch": 3.011950623982637, + "grad_norm": 4.8735222816467285, + "learning_rate": 3.919693024530629e-06, + "loss": 0.1456, + "step": 45119 + }, + { + "epoch": 3.011964188822572, + "grad_norm": 3.3281443119049072, + "learning_rate": 3.919555981910375e-06, + "loss": 0.0379, + "step": 45120 + }, + { + "epoch": 3.011977753662507, + "grad_norm": 5.598921298980713, + "learning_rate": 3.919418939290119e-06, + "loss": 0.1361, + "step": 45121 + }, + { + "epoch": 3.011991318502442, + "grad_norm": 5.135693073272705, + "learning_rate": 3.9192818966698644e-06, + "loss": 0.188, + "step": 45122 + }, + { + "epoch": 3.0120048833423767, + "grad_norm": 3.38869309425354, + "learning_rate": 3.91914485404961e-06, + "loss": 0.108, + "step": 45123 + }, + { + "epoch": 3.0120184481823116, + "grad_norm": 3.7151172161102295, + "learning_rate": 3.919007811429355e-06, + "loss": 0.0733, + "step": 45124 + }, + { + "epoch": 3.0120320130222464, + "grad_norm": 3.105961561203003, + "learning_rate": 3.9188707688091e-06, + "loss": 0.1361, + "step": 45125 + }, + { + "epoch": 3.0120455778621813, + "grad_norm": 4.6197638511657715, + "learning_rate": 3.918733726188845e-06, + "loss": 0.1057, + "step": 45126 + }, + { + "epoch": 3.012059142702116, + "grad_norm": 5.36179256439209, + "learning_rate": 3.91859668356859e-06, + "loss": 0.1716, + "step": 45127 + }, + { + "epoch": 3.012072707542051, + "grad_norm": 3.8648228645324707, + "learning_rate": 3.918459640948335e-06, + "loss": 0.0669, + "step": 45128 + }, + { + "epoch": 3.012086272381986, + "grad_norm": 3.0470223426818848, + "learning_rate": 3.918322598328081e-06, + "loss": 0.0904, + "step": 45129 + }, + { + "epoch": 3.0120998372219208, + "grad_norm": 4.509516716003418, + "learning_rate": 3.918185555707825e-06, + "loss": 0.0979, + "step": 45130 + }, + { + "epoch": 3.0121134020618556, + "grad_norm": 4.333952903747559, + "learning_rate": 3.91804851308757e-06, + "loss": 0.1253, + "step": 45131 + }, + { + "epoch": 3.0121269669017905, + "grad_norm": 5.135725021362305, + "learning_rate": 3.917911470467315e-06, + "loss": 0.1062, + "step": 45132 + }, + { + "epoch": 3.0121405317417254, + "grad_norm": 4.714373588562012, + "learning_rate": 3.9177744278470606e-06, + "loss": 0.1524, + "step": 45133 + }, + { + "epoch": 3.0121540965816602, + "grad_norm": 4.183762550354004, + "learning_rate": 3.917637385226806e-06, + "loss": 0.0932, + "step": 45134 + }, + { + "epoch": 3.012167661421595, + "grad_norm": 4.236908912658691, + "learning_rate": 3.917500342606551e-06, + "loss": 0.1283, + "step": 45135 + }, + { + "epoch": 3.01218122626153, + "grad_norm": 5.188818454742432, + "learning_rate": 3.917363299986296e-06, + "loss": 0.0672, + "step": 45136 + }, + { + "epoch": 3.012194791101465, + "grad_norm": 3.712703227996826, + "learning_rate": 3.917226257366041e-06, + "loss": 0.0566, + "step": 45137 + }, + { + "epoch": 3.0122083559413997, + "grad_norm": 4.7829365730285645, + "learning_rate": 3.9170892147457865e-06, + "loss": 0.1691, + "step": 45138 + }, + { + "epoch": 3.012221920781335, + "grad_norm": 2.8641672134399414, + "learning_rate": 3.916952172125532e-06, + "loss": 0.0584, + "step": 45139 + }, + { + "epoch": 3.01223548562127, + "grad_norm": 3.769728422164917, + "learning_rate": 3.916815129505277e-06, + "loss": 0.0734, + "step": 45140 + }, + { + "epoch": 3.0122490504612047, + "grad_norm": 3.6613550186157227, + "learning_rate": 3.916678086885022e-06, + "loss": 0.1244, + "step": 45141 + }, + { + "epoch": 3.0122626153011396, + "grad_norm": 3.4775755405426025, + "learning_rate": 3.916541044264766e-06, + "loss": 0.078, + "step": 45142 + }, + { + "epoch": 3.0122761801410745, + "grad_norm": 3.5843188762664795, + "learning_rate": 3.9164040016445115e-06, + "loss": 0.1209, + "step": 45143 + }, + { + "epoch": 3.0122897449810093, + "grad_norm": 3.599187135696411, + "learning_rate": 3.916266959024257e-06, + "loss": 0.1132, + "step": 45144 + }, + { + "epoch": 3.012303309820944, + "grad_norm": 3.299896240234375, + "learning_rate": 3.916129916404002e-06, + "loss": 0.0846, + "step": 45145 + }, + { + "epoch": 3.012316874660879, + "grad_norm": 4.615869522094727, + "learning_rate": 3.915992873783747e-06, + "loss": 0.0894, + "step": 45146 + }, + { + "epoch": 3.012330439500814, + "grad_norm": 4.053589820861816, + "learning_rate": 3.915855831163492e-06, + "loss": 0.1213, + "step": 45147 + }, + { + "epoch": 3.012344004340749, + "grad_norm": 3.2831380367279053, + "learning_rate": 3.915718788543237e-06, + "loss": 0.0952, + "step": 45148 + }, + { + "epoch": 3.0123575691806836, + "grad_norm": 4.216858386993408, + "learning_rate": 3.915581745922983e-06, + "loss": 0.1076, + "step": 45149 + }, + { + "epoch": 3.0123711340206185, + "grad_norm": 4.97488260269165, + "learning_rate": 3.915444703302728e-06, + "loss": 0.1167, + "step": 45150 + }, + { + "epoch": 3.0123846988605534, + "grad_norm": 5.098636150360107, + "learning_rate": 3.915307660682472e-06, + "loss": 0.1461, + "step": 45151 + }, + { + "epoch": 3.0123982637004882, + "grad_norm": 4.205660343170166, + "learning_rate": 3.915170618062218e-06, + "loss": 0.0947, + "step": 45152 + }, + { + "epoch": 3.012411828540423, + "grad_norm": 4.371582984924316, + "learning_rate": 3.9150335754419624e-06, + "loss": 0.1718, + "step": 45153 + }, + { + "epoch": 3.012425393380358, + "grad_norm": 2.870361566543579, + "learning_rate": 3.9148965328217085e-06, + "loss": 0.059, + "step": 45154 + }, + { + "epoch": 3.012438958220293, + "grad_norm": 2.9257588386535645, + "learning_rate": 3.914759490201453e-06, + "loss": 0.0943, + "step": 45155 + }, + { + "epoch": 3.0124525230602277, + "grad_norm": 2.580709457397461, + "learning_rate": 3.914622447581198e-06, + "loss": 0.0322, + "step": 45156 + }, + { + "epoch": 3.0124660879001626, + "grad_norm": 3.2633581161499023, + "learning_rate": 3.914485404960943e-06, + "loss": 0.0604, + "step": 45157 + }, + { + "epoch": 3.012479652740098, + "grad_norm": 3.797128438949585, + "learning_rate": 3.914348362340688e-06, + "loss": 0.0884, + "step": 45158 + }, + { + "epoch": 3.0124932175800327, + "grad_norm": 3.3859593868255615, + "learning_rate": 3.9142113197204335e-06, + "loss": 0.1476, + "step": 45159 + }, + { + "epoch": 3.0125067824199676, + "grad_norm": 2.4828763008117676, + "learning_rate": 3.914074277100179e-06, + "loss": 0.0619, + "step": 45160 + }, + { + "epoch": 3.0125203472599025, + "grad_norm": 3.3029396533966064, + "learning_rate": 3.913937234479924e-06, + "loss": 0.0672, + "step": 45161 + }, + { + "epoch": 3.0125339120998373, + "grad_norm": 4.5215744972229, + "learning_rate": 3.913800191859668e-06, + "loss": 0.1098, + "step": 45162 + }, + { + "epoch": 3.012547476939772, + "grad_norm": 2.4714300632476807, + "learning_rate": 3.913663149239414e-06, + "loss": 0.0815, + "step": 45163 + }, + { + "epoch": 3.012561041779707, + "grad_norm": 2.8928844928741455, + "learning_rate": 3.9135261066191586e-06, + "loss": 0.0965, + "step": 45164 + }, + { + "epoch": 3.012574606619642, + "grad_norm": 3.774634838104248, + "learning_rate": 3.913389063998905e-06, + "loss": 0.0829, + "step": 45165 + }, + { + "epoch": 3.012588171459577, + "grad_norm": 3.220762014389038, + "learning_rate": 3.913252021378649e-06, + "loss": 0.0787, + "step": 45166 + }, + { + "epoch": 3.0126017362995117, + "grad_norm": 3.3828561305999756, + "learning_rate": 3.913114978758394e-06, + "loss": 0.0664, + "step": 45167 + }, + { + "epoch": 3.0126153011394465, + "grad_norm": 2.863253355026245, + "learning_rate": 3.912977936138139e-06, + "loss": 0.0668, + "step": 45168 + }, + { + "epoch": 3.0126288659793814, + "grad_norm": 1.9184916019439697, + "learning_rate": 3.9128408935178845e-06, + "loss": 0.0299, + "step": 45169 + }, + { + "epoch": 3.0126424308193163, + "grad_norm": 2.64620041847229, + "learning_rate": 3.91270385089763e-06, + "loss": 0.0728, + "step": 45170 + }, + { + "epoch": 3.012655995659251, + "grad_norm": 3.1642425060272217, + "learning_rate": 3.912566808277374e-06, + "loss": 0.064, + "step": 45171 + }, + { + "epoch": 3.012669560499186, + "grad_norm": 5.045283794403076, + "learning_rate": 3.91242976565712e-06, + "loss": 0.1144, + "step": 45172 + }, + { + "epoch": 3.012683125339121, + "grad_norm": 3.297823429107666, + "learning_rate": 3.912292723036864e-06, + "loss": 0.083, + "step": 45173 + }, + { + "epoch": 3.0126966901790557, + "grad_norm": 3.5091748237609863, + "learning_rate": 3.91215568041661e-06, + "loss": 0.0701, + "step": 45174 + }, + { + "epoch": 3.0127102550189906, + "grad_norm": 2.7166521549224854, + "learning_rate": 3.912018637796355e-06, + "loss": 0.048, + "step": 45175 + }, + { + "epoch": 3.0127238198589255, + "grad_norm": 3.041931390762329, + "learning_rate": 3.9118815951761e-06, + "loss": 0.0773, + "step": 45176 + }, + { + "epoch": 3.0127373846988608, + "grad_norm": 2.129640579223633, + "learning_rate": 3.911744552555845e-06, + "loss": 0.0477, + "step": 45177 + }, + { + "epoch": 3.0127509495387956, + "grad_norm": 5.734111785888672, + "learning_rate": 3.91160750993559e-06, + "loss": 0.1315, + "step": 45178 + }, + { + "epoch": 3.0127645143787305, + "grad_norm": 2.902874708175659, + "learning_rate": 3.911470467315335e-06, + "loss": 0.0594, + "step": 45179 + }, + { + "epoch": 3.0127780792186654, + "grad_norm": 2.782360792160034, + "learning_rate": 3.911333424695081e-06, + "loss": 0.0543, + "step": 45180 + }, + { + "epoch": 3.0127916440586002, + "grad_norm": 2.227041482925415, + "learning_rate": 3.911196382074826e-06, + "loss": 0.0637, + "step": 45181 + }, + { + "epoch": 3.012805208898535, + "grad_norm": 2.598811149597168, + "learning_rate": 3.911059339454571e-06, + "loss": 0.0562, + "step": 45182 + }, + { + "epoch": 3.01281877373847, + "grad_norm": 4.489684104919434, + "learning_rate": 3.910922296834316e-06, + "loss": 0.0974, + "step": 45183 + }, + { + "epoch": 3.012832338578405, + "grad_norm": 3.8609681129455566, + "learning_rate": 3.9107852542140604e-06, + "loss": 0.1252, + "step": 45184 + }, + { + "epoch": 3.0128459034183397, + "grad_norm": 3.5571868419647217, + "learning_rate": 3.910648211593806e-06, + "loss": 0.0654, + "step": 45185 + }, + { + "epoch": 3.0128594682582746, + "grad_norm": 2.816105604171753, + "learning_rate": 3.910511168973551e-06, + "loss": 0.0595, + "step": 45186 + }, + { + "epoch": 3.0128730330982094, + "grad_norm": 4.603237152099609, + "learning_rate": 3.910374126353296e-06, + "loss": 0.1034, + "step": 45187 + }, + { + "epoch": 3.0128865979381443, + "grad_norm": 3.3522274494171143, + "learning_rate": 3.910237083733041e-06, + "loss": 0.0852, + "step": 45188 + }, + { + "epoch": 3.012900162778079, + "grad_norm": 3.5348803997039795, + "learning_rate": 3.910100041112786e-06, + "loss": 0.0677, + "step": 45189 + }, + { + "epoch": 3.012913727618014, + "grad_norm": 3.0825953483581543, + "learning_rate": 3.9099629984925315e-06, + "loss": 0.0833, + "step": 45190 + }, + { + "epoch": 3.012927292457949, + "grad_norm": 2.857947587966919, + "learning_rate": 3.909825955872277e-06, + "loss": 0.0618, + "step": 45191 + }, + { + "epoch": 3.0129408572978837, + "grad_norm": 3.216538190841675, + "learning_rate": 3.909688913252022e-06, + "loss": 0.056, + "step": 45192 + }, + { + "epoch": 3.0129544221378186, + "grad_norm": 4.522867679595947, + "learning_rate": 3.909551870631767e-06, + "loss": 0.0931, + "step": 45193 + }, + { + "epoch": 3.0129679869777535, + "grad_norm": 3.725126028060913, + "learning_rate": 3.909414828011512e-06, + "loss": 0.1332, + "step": 45194 + }, + { + "epoch": 3.0129815518176883, + "grad_norm": 3.4420111179351807, + "learning_rate": 3.909277785391257e-06, + "loss": 0.0383, + "step": 45195 + }, + { + "epoch": 3.0129951166576237, + "grad_norm": 2.6109609603881836, + "learning_rate": 3.909140742771002e-06, + "loss": 0.0424, + "step": 45196 + }, + { + "epoch": 3.0130086814975585, + "grad_norm": 3.8980016708374023, + "learning_rate": 3.909003700150747e-06, + "loss": 0.1461, + "step": 45197 + }, + { + "epoch": 3.0130222463374934, + "grad_norm": 3.1172499656677246, + "learning_rate": 3.908866657530492e-06, + "loss": 0.0437, + "step": 45198 + }, + { + "epoch": 3.0130358111774282, + "grad_norm": 4.020593166351318, + "learning_rate": 3.908729614910237e-06, + "loss": 0.0807, + "step": 45199 + }, + { + "epoch": 3.013049376017363, + "grad_norm": 4.172723770141602, + "learning_rate": 3.9085925722899825e-06, + "loss": 0.1065, + "step": 45200 + }, + { + "epoch": 3.013062940857298, + "grad_norm": 2.7604339122772217, + "learning_rate": 3.908455529669728e-06, + "loss": 0.0528, + "step": 45201 + }, + { + "epoch": 3.013076505697233, + "grad_norm": 3.4795262813568115, + "learning_rate": 3.908318487049473e-06, + "loss": 0.1196, + "step": 45202 + }, + { + "epoch": 3.0130900705371677, + "grad_norm": 2.631368398666382, + "learning_rate": 3.908181444429218e-06, + "loss": 0.0463, + "step": 45203 + }, + { + "epoch": 3.0131036353771026, + "grad_norm": 3.14096999168396, + "learning_rate": 3.908044401808963e-06, + "loss": 0.0751, + "step": 45204 + }, + { + "epoch": 3.0131172002170374, + "grad_norm": 2.4279141426086426, + "learning_rate": 3.9079073591887075e-06, + "loss": 0.0404, + "step": 45205 + }, + { + "epoch": 3.0131307650569723, + "grad_norm": 2.4694035053253174, + "learning_rate": 3.9077703165684535e-06, + "loss": 0.0538, + "step": 45206 + }, + { + "epoch": 3.013144329896907, + "grad_norm": 3.5178823471069336, + "learning_rate": 3.907633273948198e-06, + "loss": 0.0833, + "step": 45207 + }, + { + "epoch": 3.013157894736842, + "grad_norm": 2.6948983669281006, + "learning_rate": 3.907496231327944e-06, + "loss": 0.0566, + "step": 45208 + }, + { + "epoch": 3.013171459576777, + "grad_norm": 2.4884774684906006, + "learning_rate": 3.907359188707688e-06, + "loss": 0.0907, + "step": 45209 + }, + { + "epoch": 3.0131850244167118, + "grad_norm": 3.173154592514038, + "learning_rate": 3.907222146087433e-06, + "loss": 0.0631, + "step": 45210 + }, + { + "epoch": 3.0131985892566466, + "grad_norm": 3.7229502201080322, + "learning_rate": 3.907085103467179e-06, + "loss": 0.0568, + "step": 45211 + }, + { + "epoch": 3.0132121540965815, + "grad_norm": 3.473848581314087, + "learning_rate": 3.906948060846924e-06, + "loss": 0.1768, + "step": 45212 + }, + { + "epoch": 3.0132257189365164, + "grad_norm": 4.269831657409668, + "learning_rate": 3.906811018226669e-06, + "loss": 0.121, + "step": 45213 + }, + { + "epoch": 3.0132392837764512, + "grad_norm": 3.065939426422119, + "learning_rate": 3.906673975606414e-06, + "loss": 0.0781, + "step": 45214 + }, + { + "epoch": 3.0132528486163865, + "grad_norm": 4.831841945648193, + "learning_rate": 3.906536932986159e-06, + "loss": 0.1609, + "step": 45215 + }, + { + "epoch": 3.0132664134563214, + "grad_norm": 2.786355972290039, + "learning_rate": 3.906399890365904e-06, + "loss": 0.0741, + "step": 45216 + }, + { + "epoch": 3.0132799782962563, + "grad_norm": 4.0950026512146, + "learning_rate": 3.90626284774565e-06, + "loss": 0.0936, + "step": 45217 + }, + { + "epoch": 3.013293543136191, + "grad_norm": 2.985001564025879, + "learning_rate": 3.906125805125394e-06, + "loss": 0.0851, + "step": 45218 + }, + { + "epoch": 3.013307107976126, + "grad_norm": 4.000298023223877, + "learning_rate": 3.905988762505139e-06, + "loss": 0.1748, + "step": 45219 + }, + { + "epoch": 3.013320672816061, + "grad_norm": 3.8920092582702637, + "learning_rate": 3.905851719884884e-06, + "loss": 0.1049, + "step": 45220 + }, + { + "epoch": 3.0133342376559957, + "grad_norm": 3.765625238418579, + "learning_rate": 3.9057146772646295e-06, + "loss": 0.1024, + "step": 45221 + }, + { + "epoch": 3.0133478024959306, + "grad_norm": 4.888944149017334, + "learning_rate": 3.905577634644375e-06, + "loss": 0.1425, + "step": 45222 + }, + { + "epoch": 3.0133613673358655, + "grad_norm": 3.2616915702819824, + "learning_rate": 3.90544059202412e-06, + "loss": 0.1055, + "step": 45223 + }, + { + "epoch": 3.0133749321758003, + "grad_norm": 6.875331401824951, + "learning_rate": 3.905303549403865e-06, + "loss": 0.1638, + "step": 45224 + }, + { + "epoch": 3.013388497015735, + "grad_norm": 4.139455318450928, + "learning_rate": 3.905166506783609e-06, + "loss": 0.0697, + "step": 45225 + }, + { + "epoch": 3.01340206185567, + "grad_norm": 2.6987597942352295, + "learning_rate": 3.905029464163355e-06, + "loss": 0.0527, + "step": 45226 + }, + { + "epoch": 3.013415626695605, + "grad_norm": 3.4783987998962402, + "learning_rate": 3.9048924215431e-06, + "loss": 0.1192, + "step": 45227 + }, + { + "epoch": 3.01342919153554, + "grad_norm": 3.3688595294952393, + "learning_rate": 3.904755378922846e-06, + "loss": 0.0974, + "step": 45228 + }, + { + "epoch": 3.0134427563754747, + "grad_norm": 3.2820851802825928, + "learning_rate": 3.90461833630259e-06, + "loss": 0.1441, + "step": 45229 + }, + { + "epoch": 3.0134563212154095, + "grad_norm": 5.169589996337891, + "learning_rate": 3.904481293682335e-06, + "loss": 0.1277, + "step": 45230 + }, + { + "epoch": 3.0134698860553444, + "grad_norm": 3.6766977310180664, + "learning_rate": 3.9043442510620805e-06, + "loss": 0.1114, + "step": 45231 + }, + { + "epoch": 3.0134834508952792, + "grad_norm": 4.712031364440918, + "learning_rate": 3.904207208441826e-06, + "loss": 0.1361, + "step": 45232 + }, + { + "epoch": 3.013497015735214, + "grad_norm": 4.24972677230835, + "learning_rate": 3.904070165821571e-06, + "loss": 0.1127, + "step": 45233 + }, + { + "epoch": 3.0135105805751494, + "grad_norm": 3.148878574371338, + "learning_rate": 3.903933123201316e-06, + "loss": 0.0999, + "step": 45234 + }, + { + "epoch": 3.0135241454150843, + "grad_norm": 6.054859638214111, + "learning_rate": 3.903796080581061e-06, + "loss": 0.0912, + "step": 45235 + }, + { + "epoch": 3.013537710255019, + "grad_norm": 5.172515869140625, + "learning_rate": 3.903659037960806e-06, + "loss": 0.1072, + "step": 45236 + }, + { + "epoch": 3.013551275094954, + "grad_norm": 4.176354885101318, + "learning_rate": 3.9035219953405515e-06, + "loss": 0.1134, + "step": 45237 + }, + { + "epoch": 3.013564839934889, + "grad_norm": 4.065727710723877, + "learning_rate": 3.903384952720297e-06, + "loss": 0.1136, + "step": 45238 + }, + { + "epoch": 3.0135784047748237, + "grad_norm": 3.7886030673980713, + "learning_rate": 3.903247910100041e-06, + "loss": 0.0726, + "step": 45239 + }, + { + "epoch": 3.0135919696147586, + "grad_norm": 2.83681583404541, + "learning_rate": 3.903110867479786e-06, + "loss": 0.0818, + "step": 45240 + }, + { + "epoch": 3.0136055344546935, + "grad_norm": 2.451998710632324, + "learning_rate": 3.902973824859531e-06, + "loss": 0.0477, + "step": 45241 + }, + { + "epoch": 3.0136190992946283, + "grad_norm": 3.8441035747528076, + "learning_rate": 3.902836782239277e-06, + "loss": 0.1196, + "step": 45242 + }, + { + "epoch": 3.013632664134563, + "grad_norm": 4.482341766357422, + "learning_rate": 3.902699739619022e-06, + "loss": 0.1466, + "step": 45243 + }, + { + "epoch": 3.013646228974498, + "grad_norm": 2.670100450515747, + "learning_rate": 3.902562696998767e-06, + "loss": 0.0774, + "step": 45244 + }, + { + "epoch": 3.013659793814433, + "grad_norm": 5.9229841232299805, + "learning_rate": 3.902425654378512e-06, + "loss": 0.1568, + "step": 45245 + }, + { + "epoch": 3.013673358654368, + "grad_norm": 3.41001033782959, + "learning_rate": 3.902288611758257e-06, + "loss": 0.0685, + "step": 45246 + }, + { + "epoch": 3.0136869234943027, + "grad_norm": 4.070366382598877, + "learning_rate": 3.9021515691380025e-06, + "loss": 0.0831, + "step": 45247 + }, + { + "epoch": 3.0137004883342375, + "grad_norm": 3.734999656677246, + "learning_rate": 3.902014526517748e-06, + "loss": 0.0846, + "step": 45248 + }, + { + "epoch": 3.0137140531741724, + "grad_norm": 4.156504154205322, + "learning_rate": 3.901877483897493e-06, + "loss": 0.1239, + "step": 45249 + }, + { + "epoch": 3.0137276180141073, + "grad_norm": 3.939859628677368, + "learning_rate": 3.901740441277237e-06, + "loss": 0.124, + "step": 45250 + }, + { + "epoch": 3.013741182854042, + "grad_norm": 4.894542217254639, + "learning_rate": 3.901603398656983e-06, + "loss": 0.0962, + "step": 45251 + }, + { + "epoch": 3.013754747693977, + "grad_norm": 4.303115367889404, + "learning_rate": 3.9014663560367275e-06, + "loss": 0.1389, + "step": 45252 + }, + { + "epoch": 3.0137683125339123, + "grad_norm": 3.96026873588562, + "learning_rate": 3.901329313416473e-06, + "loss": 0.0981, + "step": 45253 + }, + { + "epoch": 3.013781877373847, + "grad_norm": 4.809621810913086, + "learning_rate": 3.901192270796218e-06, + "loss": 0.1196, + "step": 45254 + }, + { + "epoch": 3.013795442213782, + "grad_norm": 3.3490991592407227, + "learning_rate": 3.901055228175963e-06, + "loss": 0.0605, + "step": 45255 + }, + { + "epoch": 3.013809007053717, + "grad_norm": 5.459598064422607, + "learning_rate": 3.900918185555708e-06, + "loss": 0.1242, + "step": 45256 + }, + { + "epoch": 3.0138225718936518, + "grad_norm": 3.248806953430176, + "learning_rate": 3.9007811429354534e-06, + "loss": 0.075, + "step": 45257 + }, + { + "epoch": 3.0138361367335866, + "grad_norm": 4.366630554199219, + "learning_rate": 3.900644100315199e-06, + "loss": 0.1201, + "step": 45258 + }, + { + "epoch": 3.0138497015735215, + "grad_norm": 3.2072665691375732, + "learning_rate": 3.900507057694943e-06, + "loss": 0.0761, + "step": 45259 + }, + { + "epoch": 3.0138632664134564, + "grad_norm": 2.5945749282836914, + "learning_rate": 3.900370015074689e-06, + "loss": 0.0858, + "step": 45260 + }, + { + "epoch": 3.0138768312533912, + "grad_norm": 9.531576156616211, + "learning_rate": 3.900232972454433e-06, + "loss": 0.1581, + "step": 45261 + }, + { + "epoch": 3.013890396093326, + "grad_norm": 2.786641836166382, + "learning_rate": 3.900095929834179e-06, + "loss": 0.063, + "step": 45262 + }, + { + "epoch": 3.013903960933261, + "grad_norm": 5.1829752922058105, + "learning_rate": 3.899958887213924e-06, + "loss": 0.1641, + "step": 45263 + }, + { + "epoch": 3.013917525773196, + "grad_norm": 2.757983446121216, + "learning_rate": 3.899821844593669e-06, + "loss": 0.0702, + "step": 45264 + }, + { + "epoch": 3.0139310906131307, + "grad_norm": 3.4160749912261963, + "learning_rate": 3.899684801973414e-06, + "loss": 0.0708, + "step": 45265 + }, + { + "epoch": 3.0139446554530656, + "grad_norm": 4.968110084533691, + "learning_rate": 3.899547759353159e-06, + "loss": 0.133, + "step": 45266 + }, + { + "epoch": 3.0139582202930004, + "grad_norm": 3.135010242462158, + "learning_rate": 3.899410716732904e-06, + "loss": 0.1071, + "step": 45267 + }, + { + "epoch": 3.0139717851329353, + "grad_norm": 4.428134441375732, + "learning_rate": 3.899273674112649e-06, + "loss": 0.1467, + "step": 45268 + }, + { + "epoch": 3.01398534997287, + "grad_norm": 2.8311684131622314, + "learning_rate": 3.899136631492395e-06, + "loss": 0.0464, + "step": 45269 + }, + { + "epoch": 3.013998914812805, + "grad_norm": 3.421064615249634, + "learning_rate": 3.898999588872139e-06, + "loss": 0.0669, + "step": 45270 + }, + { + "epoch": 3.01401247965274, + "grad_norm": 3.7658634185791016, + "learning_rate": 3.898862546251885e-06, + "loss": 0.0752, + "step": 45271 + }, + { + "epoch": 3.014026044492675, + "grad_norm": 3.6278910636901855, + "learning_rate": 3.898725503631629e-06, + "loss": 0.0979, + "step": 45272 + }, + { + "epoch": 3.01403960933261, + "grad_norm": 4.397932529449463, + "learning_rate": 3.898588461011375e-06, + "loss": 0.0935, + "step": 45273 + }, + { + "epoch": 3.014053174172545, + "grad_norm": 3.3366432189941406, + "learning_rate": 3.89845141839112e-06, + "loss": 0.1016, + "step": 45274 + }, + { + "epoch": 3.01406673901248, + "grad_norm": 4.362061023712158, + "learning_rate": 3.898314375770865e-06, + "loss": 0.1147, + "step": 45275 + }, + { + "epoch": 3.0140803038524147, + "grad_norm": 3.478325843811035, + "learning_rate": 3.89817733315061e-06, + "loss": 0.0899, + "step": 45276 + }, + { + "epoch": 3.0140938686923495, + "grad_norm": 6.534043788909912, + "learning_rate": 3.898040290530355e-06, + "loss": 0.1835, + "step": 45277 + }, + { + "epoch": 3.0141074335322844, + "grad_norm": 4.189466953277588, + "learning_rate": 3.8979032479101005e-06, + "loss": 0.1041, + "step": 45278 + }, + { + "epoch": 3.0141209983722193, + "grad_norm": 3.5465567111968994, + "learning_rate": 3.897766205289846e-06, + "loss": 0.1022, + "step": 45279 + }, + { + "epoch": 3.014134563212154, + "grad_norm": 6.411961555480957, + "learning_rate": 3.897629162669591e-06, + "loss": 0.1836, + "step": 45280 + }, + { + "epoch": 3.014148128052089, + "grad_norm": 4.2891740798950195, + "learning_rate": 3.897492120049335e-06, + "loss": 0.1361, + "step": 45281 + }, + { + "epoch": 3.014161692892024, + "grad_norm": 2.7412445545196533, + "learning_rate": 3.897355077429081e-06, + "loss": 0.0432, + "step": 45282 + }, + { + "epoch": 3.0141752577319587, + "grad_norm": 4.294414520263672, + "learning_rate": 3.8972180348088255e-06, + "loss": 0.134, + "step": 45283 + }, + { + "epoch": 3.0141888225718936, + "grad_norm": 3.190255880355835, + "learning_rate": 3.897080992188571e-06, + "loss": 0.0728, + "step": 45284 + }, + { + "epoch": 3.0142023874118284, + "grad_norm": 5.8190836906433105, + "learning_rate": 3.896943949568316e-06, + "loss": 0.1788, + "step": 45285 + }, + { + "epoch": 3.0142159522517633, + "grad_norm": 5.3405022621154785, + "learning_rate": 3.896806906948061e-06, + "loss": 0.1464, + "step": 45286 + }, + { + "epoch": 3.014229517091698, + "grad_norm": 3.1370108127593994, + "learning_rate": 3.896669864327806e-06, + "loss": 0.0667, + "step": 45287 + }, + { + "epoch": 3.014243081931633, + "grad_norm": 3.2672250270843506, + "learning_rate": 3.8965328217075514e-06, + "loss": 0.0757, + "step": 45288 + }, + { + "epoch": 3.014256646771568, + "grad_norm": 3.867598056793213, + "learning_rate": 3.896395779087297e-06, + "loss": 0.149, + "step": 45289 + }, + { + "epoch": 3.0142702116115028, + "grad_norm": 3.5953586101531982, + "learning_rate": 3.896258736467042e-06, + "loss": 0.06, + "step": 45290 + }, + { + "epoch": 3.014283776451438, + "grad_norm": 3.281339406967163, + "learning_rate": 3.896121693846787e-06, + "loss": 0.1101, + "step": 45291 + }, + { + "epoch": 3.014297341291373, + "grad_norm": 5.877263069152832, + "learning_rate": 3.895984651226532e-06, + "loss": 0.0923, + "step": 45292 + }, + { + "epoch": 3.014310906131308, + "grad_norm": 5.05853796005249, + "learning_rate": 3.8958476086062765e-06, + "loss": 0.1787, + "step": 45293 + }, + { + "epoch": 3.0143244709712427, + "grad_norm": 3.183289051055908, + "learning_rate": 3.895710565986022e-06, + "loss": 0.0804, + "step": 45294 + }, + { + "epoch": 3.0143380358111775, + "grad_norm": 3.238726854324341, + "learning_rate": 3.895573523365767e-06, + "loss": 0.1006, + "step": 45295 + }, + { + "epoch": 3.0143516006511124, + "grad_norm": 3.714611768722534, + "learning_rate": 3.895436480745512e-06, + "loss": 0.0799, + "step": 45296 + }, + { + "epoch": 3.0143651654910473, + "grad_norm": 5.207760810852051, + "learning_rate": 3.895299438125257e-06, + "loss": 0.0945, + "step": 45297 + }, + { + "epoch": 3.014378730330982, + "grad_norm": 2.886486530303955, + "learning_rate": 3.895162395505002e-06, + "loss": 0.0706, + "step": 45298 + }, + { + "epoch": 3.014392295170917, + "grad_norm": 3.803652763366699, + "learning_rate": 3.8950253528847476e-06, + "loss": 0.0593, + "step": 45299 + }, + { + "epoch": 3.014405860010852, + "grad_norm": 5.254788875579834, + "learning_rate": 3.894888310264493e-06, + "loss": 0.1098, + "step": 45300 + }, + { + "epoch": 3.0144194248507867, + "grad_norm": 3.5322189331054688, + "learning_rate": 3.894751267644238e-06, + "loss": 0.0938, + "step": 45301 + }, + { + "epoch": 3.0144329896907216, + "grad_norm": 3.3123903274536133, + "learning_rate": 3.894614225023983e-06, + "loss": 0.0804, + "step": 45302 + }, + { + "epoch": 3.0144465545306565, + "grad_norm": 4.598618030548096, + "learning_rate": 3.894477182403728e-06, + "loss": 0.1292, + "step": 45303 + }, + { + "epoch": 3.0144601193705913, + "grad_norm": 5.337893009185791, + "learning_rate": 3.894340139783473e-06, + "loss": 0.1036, + "step": 45304 + }, + { + "epoch": 3.014473684210526, + "grad_norm": 2.1705527305603027, + "learning_rate": 3.894203097163219e-06, + "loss": 0.0296, + "step": 45305 + }, + { + "epoch": 3.014487249050461, + "grad_norm": 1.8218464851379395, + "learning_rate": 3.894066054542963e-06, + "loss": 0.0424, + "step": 45306 + }, + { + "epoch": 3.014500813890396, + "grad_norm": 3.3865275382995605, + "learning_rate": 3.893929011922708e-06, + "loss": 0.0615, + "step": 45307 + }, + { + "epoch": 3.014514378730331, + "grad_norm": 4.065517425537109, + "learning_rate": 3.893791969302453e-06, + "loss": 0.0873, + "step": 45308 + }, + { + "epoch": 3.0145279435702657, + "grad_norm": 3.5203194618225098, + "learning_rate": 3.8936549266821985e-06, + "loss": 0.081, + "step": 45309 + }, + { + "epoch": 3.014541508410201, + "grad_norm": 3.658757448196411, + "learning_rate": 3.893517884061944e-06, + "loss": 0.1158, + "step": 45310 + }, + { + "epoch": 3.014555073250136, + "grad_norm": 3.6734461784362793, + "learning_rate": 3.893380841441689e-06, + "loss": 0.0641, + "step": 45311 + }, + { + "epoch": 3.0145686380900707, + "grad_norm": 3.7884836196899414, + "learning_rate": 3.893243798821434e-06, + "loss": 0.0957, + "step": 45312 + }, + { + "epoch": 3.0145822029300056, + "grad_norm": 6.947147846221924, + "learning_rate": 3.893106756201178e-06, + "loss": 0.1355, + "step": 45313 + }, + { + "epoch": 3.0145957677699404, + "grad_norm": 3.5814032554626465, + "learning_rate": 3.892969713580924e-06, + "loss": 0.0864, + "step": 45314 + }, + { + "epoch": 3.0146093326098753, + "grad_norm": 3.331773042678833, + "learning_rate": 3.892832670960669e-06, + "loss": 0.0614, + "step": 45315 + }, + { + "epoch": 3.01462289744981, + "grad_norm": 7.362322807312012, + "learning_rate": 3.892695628340415e-06, + "loss": 0.196, + "step": 45316 + }, + { + "epoch": 3.014636462289745, + "grad_norm": 3.8784847259521484, + "learning_rate": 3.892558585720159e-06, + "loss": 0.1, + "step": 45317 + }, + { + "epoch": 3.01465002712968, + "grad_norm": 3.221127510070801, + "learning_rate": 3.892421543099904e-06, + "loss": 0.0835, + "step": 45318 + }, + { + "epoch": 3.0146635919696148, + "grad_norm": 5.304460525512695, + "learning_rate": 3.8922845004796494e-06, + "loss": 0.1392, + "step": 45319 + }, + { + "epoch": 3.0146771568095496, + "grad_norm": 4.3582353591918945, + "learning_rate": 3.892147457859395e-06, + "loss": 0.0896, + "step": 45320 + }, + { + "epoch": 3.0146907216494845, + "grad_norm": 3.8014638423919678, + "learning_rate": 3.89201041523914e-06, + "loss": 0.1311, + "step": 45321 + }, + { + "epoch": 3.0147042864894193, + "grad_norm": 3.042076587677002, + "learning_rate": 3.891873372618884e-06, + "loss": 0.087, + "step": 45322 + }, + { + "epoch": 3.014717851329354, + "grad_norm": 3.815842866897583, + "learning_rate": 3.89173632999863e-06, + "loss": 0.0732, + "step": 45323 + }, + { + "epoch": 3.014731416169289, + "grad_norm": 3.684809446334839, + "learning_rate": 3.8915992873783745e-06, + "loss": 0.1057, + "step": 45324 + }, + { + "epoch": 3.014744981009224, + "grad_norm": 4.006299018859863, + "learning_rate": 3.8914622447581205e-06, + "loss": 0.0813, + "step": 45325 + }, + { + "epoch": 3.014758545849159, + "grad_norm": 4.321080207824707, + "learning_rate": 3.891325202137865e-06, + "loss": 0.1025, + "step": 45326 + }, + { + "epoch": 3.0147721106890937, + "grad_norm": 4.163077354431152, + "learning_rate": 3.89118815951761e-06, + "loss": 0.1374, + "step": 45327 + }, + { + "epoch": 3.0147856755290285, + "grad_norm": 2.3231284618377686, + "learning_rate": 3.891051116897355e-06, + "loss": 0.0572, + "step": 45328 + }, + { + "epoch": 3.014799240368964, + "grad_norm": 5.109055042266846, + "learning_rate": 3.8909140742771e-06, + "loss": 0.1141, + "step": 45329 + }, + { + "epoch": 3.0148128052088987, + "grad_norm": 4.146172523498535, + "learning_rate": 3.8907770316568456e-06, + "loss": 0.1191, + "step": 45330 + }, + { + "epoch": 3.0148263700488336, + "grad_norm": 2.6223249435424805, + "learning_rate": 3.890639989036591e-06, + "loss": 0.0863, + "step": 45331 + }, + { + "epoch": 3.0148399348887684, + "grad_norm": 3.1523427963256836, + "learning_rate": 3.890502946416336e-06, + "loss": 0.051, + "step": 45332 + }, + { + "epoch": 3.0148534997287033, + "grad_norm": 3.568787097930908, + "learning_rate": 3.890365903796081e-06, + "loss": 0.0913, + "step": 45333 + }, + { + "epoch": 3.014867064568638, + "grad_norm": 3.4271469116210938, + "learning_rate": 3.890228861175826e-06, + "loss": 0.0726, + "step": 45334 + }, + { + "epoch": 3.014880629408573, + "grad_norm": 3.6108450889587402, + "learning_rate": 3.890091818555571e-06, + "loss": 0.0719, + "step": 45335 + }, + { + "epoch": 3.014894194248508, + "grad_norm": 3.8003134727478027, + "learning_rate": 3.889954775935317e-06, + "loss": 0.0558, + "step": 45336 + }, + { + "epoch": 3.0149077590884428, + "grad_norm": 4.504250526428223, + "learning_rate": 3.889817733315061e-06, + "loss": 0.1856, + "step": 45337 + }, + { + "epoch": 3.0149213239283776, + "grad_norm": 3.600576162338257, + "learning_rate": 3.889680690694806e-06, + "loss": 0.1737, + "step": 45338 + }, + { + "epoch": 3.0149348887683125, + "grad_norm": 2.7004194259643555, + "learning_rate": 3.889543648074551e-06, + "loss": 0.0715, + "step": 45339 + }, + { + "epoch": 3.0149484536082474, + "grad_norm": 3.745992660522461, + "learning_rate": 3.8894066054542965e-06, + "loss": 0.094, + "step": 45340 + }, + { + "epoch": 3.0149620184481822, + "grad_norm": 3.0746872425079346, + "learning_rate": 3.889269562834042e-06, + "loss": 0.1063, + "step": 45341 + }, + { + "epoch": 3.014975583288117, + "grad_norm": 3.774209976196289, + "learning_rate": 3.889132520213787e-06, + "loss": 0.1261, + "step": 45342 + }, + { + "epoch": 3.014989148128052, + "grad_norm": 4.249090671539307, + "learning_rate": 3.888995477593532e-06, + "loss": 0.1502, + "step": 45343 + }, + { + "epoch": 3.015002712967987, + "grad_norm": 4.046438694000244, + "learning_rate": 3.888858434973277e-06, + "loss": 0.0601, + "step": 45344 + }, + { + "epoch": 3.0150162778079217, + "grad_norm": 4.200769424438477, + "learning_rate": 3.888721392353022e-06, + "loss": 0.1391, + "step": 45345 + }, + { + "epoch": 3.0150298426478566, + "grad_norm": 4.179553985595703, + "learning_rate": 3.8885843497327676e-06, + "loss": 0.1052, + "step": 45346 + }, + { + "epoch": 3.0150434074877914, + "grad_norm": 3.289731740951538, + "learning_rate": 3.888447307112512e-06, + "loss": 0.1065, + "step": 45347 + }, + { + "epoch": 3.0150569723277267, + "grad_norm": 5.092464923858643, + "learning_rate": 3.888310264492258e-06, + "loss": 0.1206, + "step": 45348 + }, + { + "epoch": 3.0150705371676616, + "grad_norm": 3.452789068222046, + "learning_rate": 3.888173221872002e-06, + "loss": 0.0618, + "step": 45349 + }, + { + "epoch": 3.0150841020075965, + "grad_norm": 5.125539779663086, + "learning_rate": 3.8880361792517474e-06, + "loss": 0.1658, + "step": 45350 + }, + { + "epoch": 3.0150976668475313, + "grad_norm": 5.329631328582764, + "learning_rate": 3.887899136631493e-06, + "loss": 0.1637, + "step": 45351 + }, + { + "epoch": 3.015111231687466, + "grad_norm": 2.123105764389038, + "learning_rate": 3.887762094011238e-06, + "loss": 0.0737, + "step": 45352 + }, + { + "epoch": 3.015124796527401, + "grad_norm": 4.476833820343018, + "learning_rate": 3.887625051390983e-06, + "loss": 0.1411, + "step": 45353 + }, + { + "epoch": 3.015138361367336, + "grad_norm": 4.230569839477539, + "learning_rate": 3.887488008770728e-06, + "loss": 0.1227, + "step": 45354 + }, + { + "epoch": 3.015151926207271, + "grad_norm": 4.756021022796631, + "learning_rate": 3.887350966150473e-06, + "loss": 0.099, + "step": 45355 + }, + { + "epoch": 3.0151654910472057, + "grad_norm": 4.581940650939941, + "learning_rate": 3.8872139235302185e-06, + "loss": 0.1565, + "step": 45356 + }, + { + "epoch": 3.0151790558871405, + "grad_norm": 4.344298839569092, + "learning_rate": 3.887076880909964e-06, + "loss": 0.1189, + "step": 45357 + }, + { + "epoch": 3.0151926207270754, + "grad_norm": 8.683124542236328, + "learning_rate": 3.886939838289708e-06, + "loss": 0.1224, + "step": 45358 + }, + { + "epoch": 3.0152061855670103, + "grad_norm": 3.8221068382263184, + "learning_rate": 3.886802795669454e-06, + "loss": 0.0915, + "step": 45359 + }, + { + "epoch": 3.015219750406945, + "grad_norm": 3.899444103240967, + "learning_rate": 3.886665753049198e-06, + "loss": 0.1195, + "step": 45360 + }, + { + "epoch": 3.01523331524688, + "grad_norm": 2.7982583045959473, + "learning_rate": 3.8865287104289436e-06, + "loss": 0.0916, + "step": 45361 + }, + { + "epoch": 3.015246880086815, + "grad_norm": 5.161733627319336, + "learning_rate": 3.886391667808689e-06, + "loss": 0.1649, + "step": 45362 + }, + { + "epoch": 3.0152604449267497, + "grad_norm": 4.202443599700928, + "learning_rate": 3.886254625188434e-06, + "loss": 0.0756, + "step": 45363 + }, + { + "epoch": 3.0152740097666846, + "grad_norm": 4.665492057800293, + "learning_rate": 3.886117582568179e-06, + "loss": 0.0812, + "step": 45364 + }, + { + "epoch": 3.0152875746066194, + "grad_norm": 3.6016135215759277, + "learning_rate": 3.885980539947924e-06, + "loss": 0.0871, + "step": 45365 + }, + { + "epoch": 3.0153011394465543, + "grad_norm": 3.5550544261932373, + "learning_rate": 3.8858434973276695e-06, + "loss": 0.1572, + "step": 45366 + }, + { + "epoch": 3.0153147042864896, + "grad_norm": 4.353613376617432, + "learning_rate": 3.885706454707414e-06, + "loss": 0.1547, + "step": 45367 + }, + { + "epoch": 3.0153282691264245, + "grad_norm": 2.8599705696105957, + "learning_rate": 3.88556941208716e-06, + "loss": 0.0836, + "step": 45368 + }, + { + "epoch": 3.0153418339663594, + "grad_norm": 4.912089824676514, + "learning_rate": 3.885432369466904e-06, + "loss": 0.1365, + "step": 45369 + }, + { + "epoch": 3.015355398806294, + "grad_norm": 5.3935418128967285, + "learning_rate": 3.88529532684665e-06, + "loss": 0.15, + "step": 45370 + }, + { + "epoch": 3.015368963646229, + "grad_norm": 5.744715213775635, + "learning_rate": 3.8851582842263945e-06, + "loss": 0.1435, + "step": 45371 + }, + { + "epoch": 3.015382528486164, + "grad_norm": 3.4985036849975586, + "learning_rate": 3.88502124160614e-06, + "loss": 0.1155, + "step": 45372 + }, + { + "epoch": 3.015396093326099, + "grad_norm": 6.245434284210205, + "learning_rate": 3.884884198985885e-06, + "loss": 0.1218, + "step": 45373 + }, + { + "epoch": 3.0154096581660337, + "grad_norm": 3.8701324462890625, + "learning_rate": 3.88474715636563e-06, + "loss": 0.1385, + "step": 45374 + }, + { + "epoch": 3.0154232230059685, + "grad_norm": 4.002198696136475, + "learning_rate": 3.884610113745375e-06, + "loss": 0.1033, + "step": 45375 + }, + { + "epoch": 3.0154367878459034, + "grad_norm": 4.205189228057861, + "learning_rate": 3.8844730711251195e-06, + "loss": 0.0975, + "step": 45376 + }, + { + "epoch": 3.0154503526858383, + "grad_norm": 3.7836923599243164, + "learning_rate": 3.8843360285048656e-06, + "loss": 0.1324, + "step": 45377 + }, + { + "epoch": 3.015463917525773, + "grad_norm": 4.037487030029297, + "learning_rate": 3.88419898588461e-06, + "loss": 0.1165, + "step": 45378 + }, + { + "epoch": 3.015477482365708, + "grad_norm": 3.963351011276245, + "learning_rate": 3.884061943264356e-06, + "loss": 0.0916, + "step": 45379 + }, + { + "epoch": 3.015491047205643, + "grad_norm": 2.9334444999694824, + "learning_rate": 3.8839249006441e-06, + "loss": 0.0899, + "step": 45380 + }, + { + "epoch": 3.0155046120455777, + "grad_norm": 5.114774227142334, + "learning_rate": 3.8837878580238454e-06, + "loss": 0.1927, + "step": 45381 + }, + { + "epoch": 3.0155181768855126, + "grad_norm": 4.064729690551758, + "learning_rate": 3.883650815403591e-06, + "loss": 0.152, + "step": 45382 + }, + { + "epoch": 3.0155317417254475, + "grad_norm": 3.935898542404175, + "learning_rate": 3.883513772783336e-06, + "loss": 0.1266, + "step": 45383 + }, + { + "epoch": 3.0155453065653823, + "grad_norm": 4.404818058013916, + "learning_rate": 3.883376730163081e-06, + "loss": 0.1455, + "step": 45384 + }, + { + "epoch": 3.015558871405317, + "grad_norm": 4.5206170082092285, + "learning_rate": 3.883239687542826e-06, + "loss": 0.139, + "step": 45385 + }, + { + "epoch": 3.0155724362452525, + "grad_norm": 5.0647783279418945, + "learning_rate": 3.883102644922571e-06, + "loss": 0.1988, + "step": 45386 + }, + { + "epoch": 3.0155860010851874, + "grad_norm": 4.075372219085693, + "learning_rate": 3.8829656023023165e-06, + "loss": 0.136, + "step": 45387 + }, + { + "epoch": 3.0155995659251222, + "grad_norm": 4.699920177459717, + "learning_rate": 3.882828559682062e-06, + "loss": 0.1152, + "step": 45388 + }, + { + "epoch": 3.015613130765057, + "grad_norm": 3.4661648273468018, + "learning_rate": 3.882691517061807e-06, + "loss": 0.1099, + "step": 45389 + }, + { + "epoch": 3.015626695604992, + "grad_norm": 4.763086795806885, + "learning_rate": 3.882554474441552e-06, + "loss": 0.2013, + "step": 45390 + }, + { + "epoch": 3.015640260444927, + "grad_norm": 3.1442089080810547, + "learning_rate": 3.882417431821296e-06, + "loss": 0.0919, + "step": 45391 + }, + { + "epoch": 3.0156538252848617, + "grad_norm": 4.191068649291992, + "learning_rate": 3.8822803892010416e-06, + "loss": 0.0934, + "step": 45392 + }, + { + "epoch": 3.0156673901247966, + "grad_norm": 3.9200222492218018, + "learning_rate": 3.882143346580787e-06, + "loss": 0.1053, + "step": 45393 + }, + { + "epoch": 3.0156809549647314, + "grad_norm": 4.390257835388184, + "learning_rate": 3.882006303960532e-06, + "loss": 0.1206, + "step": 45394 + }, + { + "epoch": 3.0156945198046663, + "grad_norm": 5.297770977020264, + "learning_rate": 3.881869261340277e-06, + "loss": 0.1427, + "step": 45395 + }, + { + "epoch": 3.015708084644601, + "grad_norm": 3.783505916595459, + "learning_rate": 3.881732218720022e-06, + "loss": 0.0995, + "step": 45396 + }, + { + "epoch": 3.015721649484536, + "grad_norm": 2.8996615409851074, + "learning_rate": 3.8815951760997675e-06, + "loss": 0.0981, + "step": 45397 + }, + { + "epoch": 3.015735214324471, + "grad_norm": 3.869379758834839, + "learning_rate": 3.881458133479513e-06, + "loss": 0.1046, + "step": 45398 + }, + { + "epoch": 3.0157487791644058, + "grad_norm": 3.49346661567688, + "learning_rate": 3.881321090859258e-06, + "loss": 0.126, + "step": 45399 + }, + { + "epoch": 3.0157623440043406, + "grad_norm": 4.973351955413818, + "learning_rate": 3.881184048239003e-06, + "loss": 0.1495, + "step": 45400 + }, + { + "epoch": 3.0157759088442755, + "grad_norm": 5.553471565246582, + "learning_rate": 3.881047005618747e-06, + "loss": 0.2193, + "step": 45401 + }, + { + "epoch": 3.0157894736842104, + "grad_norm": 3.7486116886138916, + "learning_rate": 3.880909962998493e-06, + "loss": 0.1192, + "step": 45402 + }, + { + "epoch": 3.015803038524145, + "grad_norm": 3.064253330230713, + "learning_rate": 3.880772920378238e-06, + "loss": 0.063, + "step": 45403 + }, + { + "epoch": 3.01581660336408, + "grad_norm": 2.272390127182007, + "learning_rate": 3.880635877757983e-06, + "loss": 0.0403, + "step": 45404 + }, + { + "epoch": 3.0158301682040154, + "grad_norm": 3.4848434925079346, + "learning_rate": 3.880498835137728e-06, + "loss": 0.1069, + "step": 45405 + }, + { + "epoch": 3.0158437330439503, + "grad_norm": 4.227772235870361, + "learning_rate": 3.880361792517473e-06, + "loss": 0.0966, + "step": 45406 + }, + { + "epoch": 3.015857297883885, + "grad_norm": 3.9043691158294678, + "learning_rate": 3.880224749897218e-06, + "loss": 0.1015, + "step": 45407 + }, + { + "epoch": 3.01587086272382, + "grad_norm": 3.1945900917053223, + "learning_rate": 3.880087707276964e-06, + "loss": 0.0806, + "step": 45408 + }, + { + "epoch": 3.015884427563755, + "grad_norm": 4.133888244628906, + "learning_rate": 3.879950664656709e-06, + "loss": 0.0806, + "step": 45409 + }, + { + "epoch": 3.0158979924036897, + "grad_norm": 3.680511713027954, + "learning_rate": 3.879813622036453e-06, + "loss": 0.0787, + "step": 45410 + }, + { + "epoch": 3.0159115572436246, + "grad_norm": 3.0774128437042236, + "learning_rate": 3.879676579416199e-06, + "loss": 0.0751, + "step": 45411 + }, + { + "epoch": 3.0159251220835595, + "grad_norm": 4.200504302978516, + "learning_rate": 3.8795395367959434e-06, + "loss": 0.0921, + "step": 45412 + }, + { + "epoch": 3.0159386869234943, + "grad_norm": 4.256302356719971, + "learning_rate": 3.8794024941756895e-06, + "loss": 0.0898, + "step": 45413 + }, + { + "epoch": 3.015952251763429, + "grad_norm": 2.5574421882629395, + "learning_rate": 3.879265451555434e-06, + "loss": 0.0468, + "step": 45414 + }, + { + "epoch": 3.015965816603364, + "grad_norm": 3.2815749645233154, + "learning_rate": 3.879128408935179e-06, + "loss": 0.0508, + "step": 45415 + }, + { + "epoch": 3.015979381443299, + "grad_norm": 3.3282785415649414, + "learning_rate": 3.878991366314924e-06, + "loss": 0.0721, + "step": 45416 + }, + { + "epoch": 3.0159929462832338, + "grad_norm": 2.8709139823913574, + "learning_rate": 3.878854323694669e-06, + "loss": 0.0312, + "step": 45417 + }, + { + "epoch": 3.0160065111231686, + "grad_norm": 3.5750365257263184, + "learning_rate": 3.8787172810744145e-06, + "loss": 0.1299, + "step": 45418 + }, + { + "epoch": 3.0160200759631035, + "grad_norm": 3.3071048259735107, + "learning_rate": 3.87858023845416e-06, + "loss": 0.0664, + "step": 45419 + }, + { + "epoch": 3.0160336408030384, + "grad_norm": 4.674858570098877, + "learning_rate": 3.878443195833905e-06, + "loss": 0.1127, + "step": 45420 + }, + { + "epoch": 3.0160472056429732, + "grad_norm": 3.906810760498047, + "learning_rate": 3.878306153213649e-06, + "loss": 0.0924, + "step": 45421 + }, + { + "epoch": 3.016060770482908, + "grad_norm": 2.9535279273986816, + "learning_rate": 3.878169110593395e-06, + "loss": 0.036, + "step": 45422 + }, + { + "epoch": 3.016074335322843, + "grad_norm": 3.6090261936187744, + "learning_rate": 3.8780320679731396e-06, + "loss": 0.0858, + "step": 45423 + }, + { + "epoch": 3.0160879001627783, + "grad_norm": 4.864112854003906, + "learning_rate": 3.877895025352886e-06, + "loss": 0.119, + "step": 45424 + }, + { + "epoch": 3.016101465002713, + "grad_norm": 2.941934108734131, + "learning_rate": 3.87775798273263e-06, + "loss": 0.0633, + "step": 45425 + }, + { + "epoch": 3.016115029842648, + "grad_norm": 4.0063886642456055, + "learning_rate": 3.877620940112375e-06, + "loss": 0.0644, + "step": 45426 + }, + { + "epoch": 3.016128594682583, + "grad_norm": 3.0555193424224854, + "learning_rate": 3.87748389749212e-06, + "loss": 0.0584, + "step": 45427 + }, + { + "epoch": 3.0161421595225177, + "grad_norm": 3.6506261825561523, + "learning_rate": 3.8773468548718655e-06, + "loss": 0.079, + "step": 45428 + }, + { + "epoch": 3.0161557243624526, + "grad_norm": 5.397292613983154, + "learning_rate": 3.877209812251611e-06, + "loss": 0.3571, + "step": 45429 + }, + { + "epoch": 3.0161692892023875, + "grad_norm": 6.694531440734863, + "learning_rate": 3.877072769631356e-06, + "loss": 0.2194, + "step": 45430 + }, + { + "epoch": 3.0161828540423223, + "grad_norm": 5.748035430908203, + "learning_rate": 3.876935727011101e-06, + "loss": 0.1534, + "step": 45431 + }, + { + "epoch": 3.016196418882257, + "grad_norm": 4.569302558898926, + "learning_rate": 3.876798684390845e-06, + "loss": 0.1045, + "step": 45432 + }, + { + "epoch": 3.016209983722192, + "grad_norm": 3.820147752761841, + "learning_rate": 3.876661641770591e-06, + "loss": 0.0904, + "step": 45433 + }, + { + "epoch": 3.016223548562127, + "grad_norm": 5.313122272491455, + "learning_rate": 3.876524599150336e-06, + "loss": 0.1423, + "step": 45434 + }, + { + "epoch": 3.016237113402062, + "grad_norm": 4.029165744781494, + "learning_rate": 3.876387556530081e-06, + "loss": 0.1051, + "step": 45435 + }, + { + "epoch": 3.0162506782419967, + "grad_norm": 5.592415809631348, + "learning_rate": 3.876250513909826e-06, + "loss": 0.1342, + "step": 45436 + }, + { + "epoch": 3.0162642430819315, + "grad_norm": 4.473917007446289, + "learning_rate": 3.876113471289571e-06, + "loss": 0.0955, + "step": 45437 + }, + { + "epoch": 3.0162778079218664, + "grad_norm": 2.6329827308654785, + "learning_rate": 3.875976428669316e-06, + "loss": 0.0741, + "step": 45438 + }, + { + "epoch": 3.0162913727618013, + "grad_norm": 3.8523080348968506, + "learning_rate": 3.875839386049062e-06, + "loss": 0.1005, + "step": 45439 + }, + { + "epoch": 3.016304937601736, + "grad_norm": 4.2539520263671875, + "learning_rate": 3.875702343428807e-06, + "loss": 0.0751, + "step": 45440 + }, + { + "epoch": 3.016318502441671, + "grad_norm": 4.38035774230957, + "learning_rate": 3.875565300808552e-06, + "loss": 0.1583, + "step": 45441 + }, + { + "epoch": 3.0163320672816063, + "grad_norm": 3.8300814628601074, + "learning_rate": 3.875428258188297e-06, + "loss": 0.0916, + "step": 45442 + }, + { + "epoch": 3.016345632121541, + "grad_norm": 4.341104984283447, + "learning_rate": 3.875291215568042e-06, + "loss": 0.1343, + "step": 45443 + }, + { + "epoch": 3.016359196961476, + "grad_norm": 8.100242614746094, + "learning_rate": 3.8751541729477875e-06, + "loss": 0.1634, + "step": 45444 + }, + { + "epoch": 3.016372761801411, + "grad_norm": 3.8289413452148438, + "learning_rate": 3.875017130327532e-06, + "loss": 0.0947, + "step": 45445 + }, + { + "epoch": 3.0163863266413458, + "grad_norm": 5.04863977432251, + "learning_rate": 3.874880087707277e-06, + "loss": 0.1449, + "step": 45446 + }, + { + "epoch": 3.0163998914812806, + "grad_norm": 4.1212592124938965, + "learning_rate": 3.874743045087022e-06, + "loss": 0.1322, + "step": 45447 + }, + { + "epoch": 3.0164134563212155, + "grad_norm": 5.995255947113037, + "learning_rate": 3.874606002466767e-06, + "loss": 0.132, + "step": 45448 + }, + { + "epoch": 3.0164270211611504, + "grad_norm": 3.465628147125244, + "learning_rate": 3.8744689598465125e-06, + "loss": 0.1109, + "step": 45449 + }, + { + "epoch": 3.0164405860010852, + "grad_norm": 3.9428863525390625, + "learning_rate": 3.874331917226258e-06, + "loss": 0.1632, + "step": 45450 + }, + { + "epoch": 3.01645415084102, + "grad_norm": 3.8213083744049072, + "learning_rate": 3.874194874606003e-06, + "loss": 0.1526, + "step": 45451 + }, + { + "epoch": 3.016467715680955, + "grad_norm": 4.13889217376709, + "learning_rate": 3.874057831985748e-06, + "loss": 0.1216, + "step": 45452 + }, + { + "epoch": 3.01648128052089, + "grad_norm": 7.504579067230225, + "learning_rate": 3.873920789365493e-06, + "loss": 0.1649, + "step": 45453 + }, + { + "epoch": 3.0164948453608247, + "grad_norm": 2.5230562686920166, + "learning_rate": 3.873783746745238e-06, + "loss": 0.063, + "step": 45454 + }, + { + "epoch": 3.0165084102007595, + "grad_norm": 2.963223457336426, + "learning_rate": 3.873646704124983e-06, + "loss": 0.0737, + "step": 45455 + }, + { + "epoch": 3.0165219750406944, + "grad_norm": 4.277727127075195, + "learning_rate": 3.873509661504729e-06, + "loss": 0.0764, + "step": 45456 + }, + { + "epoch": 3.0165355398806293, + "grad_norm": 3.2561733722686768, + "learning_rate": 3.873372618884473e-06, + "loss": 0.1234, + "step": 45457 + }, + { + "epoch": 3.016549104720564, + "grad_norm": 2.988950490951538, + "learning_rate": 3.873235576264219e-06, + "loss": 0.0462, + "step": 45458 + }, + { + "epoch": 3.016562669560499, + "grad_norm": 3.078587293624878, + "learning_rate": 3.8730985336439635e-06, + "loss": 0.0828, + "step": 45459 + }, + { + "epoch": 3.016576234400434, + "grad_norm": 3.9282500743865967, + "learning_rate": 3.872961491023709e-06, + "loss": 0.0831, + "step": 45460 + }, + { + "epoch": 3.0165897992403687, + "grad_norm": 1.7663724422454834, + "learning_rate": 3.872824448403454e-06, + "loss": 0.0284, + "step": 45461 + }, + { + "epoch": 3.016603364080304, + "grad_norm": 3.8408141136169434, + "learning_rate": 3.872687405783199e-06, + "loss": 0.1922, + "step": 45462 + }, + { + "epoch": 3.016616928920239, + "grad_norm": 4.0902934074401855, + "learning_rate": 3.872550363162944e-06, + "loss": 0.1689, + "step": 45463 + }, + { + "epoch": 3.016630493760174, + "grad_norm": 2.0279009342193604, + "learning_rate": 3.8724133205426885e-06, + "loss": 0.0484, + "step": 45464 + }, + { + "epoch": 3.0166440586001086, + "grad_norm": 4.296383857727051, + "learning_rate": 3.8722762779224345e-06, + "loss": 0.1668, + "step": 45465 + }, + { + "epoch": 3.0166576234400435, + "grad_norm": 3.2106149196624756, + "learning_rate": 3.872139235302179e-06, + "loss": 0.0733, + "step": 45466 + }, + { + "epoch": 3.0166711882799784, + "grad_norm": 3.421222686767578, + "learning_rate": 3.872002192681925e-06, + "loss": 0.1521, + "step": 45467 + }, + { + "epoch": 3.0166847531199132, + "grad_norm": 3.983334541320801, + "learning_rate": 3.871865150061669e-06, + "loss": 0.1611, + "step": 45468 + }, + { + "epoch": 3.016698317959848, + "grad_norm": 2.9099833965301514, + "learning_rate": 3.871728107441414e-06, + "loss": 0.0805, + "step": 45469 + }, + { + "epoch": 3.016711882799783, + "grad_norm": 4.639047145843506, + "learning_rate": 3.87159106482116e-06, + "loss": 0.111, + "step": 45470 + }, + { + "epoch": 3.016725447639718, + "grad_norm": 3.795731782913208, + "learning_rate": 3.871454022200905e-06, + "loss": 0.0875, + "step": 45471 + }, + { + "epoch": 3.0167390124796527, + "grad_norm": 3.1469039916992188, + "learning_rate": 3.87131697958065e-06, + "loss": 0.0881, + "step": 45472 + }, + { + "epoch": 3.0167525773195876, + "grad_norm": 3.8146309852600098, + "learning_rate": 3.871179936960395e-06, + "loss": 0.0758, + "step": 45473 + }, + { + "epoch": 3.0167661421595224, + "grad_norm": 5.003358364105225, + "learning_rate": 3.87104289434014e-06, + "loss": 0.1357, + "step": 45474 + }, + { + "epoch": 3.0167797069994573, + "grad_norm": 3.4368784427642822, + "learning_rate": 3.870905851719885e-06, + "loss": 0.081, + "step": 45475 + }, + { + "epoch": 3.016793271839392, + "grad_norm": 4.841030120849609, + "learning_rate": 3.870768809099631e-06, + "loss": 0.106, + "step": 45476 + }, + { + "epoch": 3.016806836679327, + "grad_norm": 3.762594699859619, + "learning_rate": 3.870631766479375e-06, + "loss": 0.1205, + "step": 45477 + }, + { + "epoch": 3.016820401519262, + "grad_norm": 2.3714568614959717, + "learning_rate": 3.870494723859121e-06, + "loss": 0.0614, + "step": 45478 + }, + { + "epoch": 3.0168339663591968, + "grad_norm": 3.6962718963623047, + "learning_rate": 3.870357681238865e-06, + "loss": 0.1225, + "step": 45479 + }, + { + "epoch": 3.016847531199132, + "grad_norm": 3.504040241241455, + "learning_rate": 3.8702206386186105e-06, + "loss": 0.1022, + "step": 45480 + }, + { + "epoch": 3.016861096039067, + "grad_norm": 3.487027168273926, + "learning_rate": 3.870083595998356e-06, + "loss": 0.1309, + "step": 45481 + }, + { + "epoch": 3.016874660879002, + "grad_norm": 3.755380392074585, + "learning_rate": 3.869946553378101e-06, + "loss": 0.1142, + "step": 45482 + }, + { + "epoch": 3.0168882257189367, + "grad_norm": 4.174705982208252, + "learning_rate": 3.869809510757846e-06, + "loss": 0.1427, + "step": 45483 + }, + { + "epoch": 3.0169017905588715, + "grad_norm": 4.547069072723389, + "learning_rate": 3.869672468137591e-06, + "loss": 0.1331, + "step": 45484 + }, + { + "epoch": 3.0169153553988064, + "grad_norm": 3.0373127460479736, + "learning_rate": 3.8695354255173364e-06, + "loss": 0.0839, + "step": 45485 + }, + { + "epoch": 3.0169289202387413, + "grad_norm": 3.584263801574707, + "learning_rate": 3.869398382897082e-06, + "loss": 0.0542, + "step": 45486 + }, + { + "epoch": 3.016942485078676, + "grad_norm": 3.0411624908447266, + "learning_rate": 3.869261340276827e-06, + "loss": 0.0759, + "step": 45487 + }, + { + "epoch": 3.016956049918611, + "grad_norm": 3.881601095199585, + "learning_rate": 3.869124297656571e-06, + "loss": 0.0865, + "step": 45488 + }, + { + "epoch": 3.016969614758546, + "grad_norm": 3.61930513381958, + "learning_rate": 3.868987255036316e-06, + "loss": 0.1479, + "step": 45489 + }, + { + "epoch": 3.0169831795984807, + "grad_norm": 3.8028879165649414, + "learning_rate": 3.8688502124160615e-06, + "loss": 0.1407, + "step": 45490 + }, + { + "epoch": 3.0169967444384156, + "grad_norm": 4.689044952392578, + "learning_rate": 3.868713169795807e-06, + "loss": 0.1298, + "step": 45491 + }, + { + "epoch": 3.0170103092783505, + "grad_norm": 4.8190836906433105, + "learning_rate": 3.868576127175552e-06, + "loss": 0.1336, + "step": 45492 + }, + { + "epoch": 3.0170238741182853, + "grad_norm": 4.713685035705566, + "learning_rate": 3.868439084555297e-06, + "loss": 0.1809, + "step": 45493 + }, + { + "epoch": 3.01703743895822, + "grad_norm": 4.5529704093933105, + "learning_rate": 3.868302041935042e-06, + "loss": 0.1133, + "step": 45494 + }, + { + "epoch": 3.017051003798155, + "grad_norm": 4.274981498718262, + "learning_rate": 3.868164999314787e-06, + "loss": 0.0603, + "step": 45495 + }, + { + "epoch": 3.01706456863809, + "grad_norm": 3.571676015853882, + "learning_rate": 3.8680279566945325e-06, + "loss": 0.0668, + "step": 45496 + }, + { + "epoch": 3.017078133478025, + "grad_norm": 3.6490390300750732, + "learning_rate": 3.867890914074278e-06, + "loss": 0.2015, + "step": 45497 + }, + { + "epoch": 3.0170916983179596, + "grad_norm": 4.522167682647705, + "learning_rate": 3.867753871454023e-06, + "loss": 0.1506, + "step": 45498 + }, + { + "epoch": 3.017105263157895, + "grad_norm": 3.2541916370391846, + "learning_rate": 3.867616828833768e-06, + "loss": 0.13, + "step": 45499 + }, + { + "epoch": 3.01711882799783, + "grad_norm": 4.221523284912109, + "learning_rate": 3.867479786213512e-06, + "loss": 0.1527, + "step": 45500 + }, + { + "epoch": 3.0171323928377647, + "grad_norm": 3.616295337677002, + "learning_rate": 3.867342743593258e-06, + "loss": 0.0648, + "step": 45501 + }, + { + "epoch": 3.0171459576776996, + "grad_norm": 4.373875617980957, + "learning_rate": 3.867205700973003e-06, + "loss": 0.0734, + "step": 45502 + }, + { + "epoch": 3.0171595225176344, + "grad_norm": 3.5770294666290283, + "learning_rate": 3.867068658352748e-06, + "loss": 0.1078, + "step": 45503 + }, + { + "epoch": 3.0171730873575693, + "grad_norm": 3.323502779006958, + "learning_rate": 3.866931615732493e-06, + "loss": 0.0427, + "step": 45504 + }, + { + "epoch": 3.017186652197504, + "grad_norm": 3.4628233909606934, + "learning_rate": 3.866794573112238e-06, + "loss": 0.1259, + "step": 45505 + }, + { + "epoch": 3.017200217037439, + "grad_norm": 2.139406681060791, + "learning_rate": 3.8666575304919835e-06, + "loss": 0.0491, + "step": 45506 + }, + { + "epoch": 3.017213781877374, + "grad_norm": 3.718604564666748, + "learning_rate": 3.866520487871729e-06, + "loss": 0.1005, + "step": 45507 + }, + { + "epoch": 3.0172273467173087, + "grad_norm": 5.552097320556641, + "learning_rate": 3.866383445251474e-06, + "loss": 0.241, + "step": 45508 + }, + { + "epoch": 3.0172409115572436, + "grad_norm": 4.500661373138428, + "learning_rate": 3.866246402631218e-06, + "loss": 0.1106, + "step": 45509 + }, + { + "epoch": 3.0172544763971785, + "grad_norm": 4.498177528381348, + "learning_rate": 3.866109360010964e-06, + "loss": 0.1401, + "step": 45510 + }, + { + "epoch": 3.0172680412371133, + "grad_norm": 3.663480520248413, + "learning_rate": 3.8659723173907085e-06, + "loss": 0.1126, + "step": 45511 + }, + { + "epoch": 3.017281606077048, + "grad_norm": 4.435110569000244, + "learning_rate": 3.8658352747704546e-06, + "loss": 0.1196, + "step": 45512 + }, + { + "epoch": 3.017295170916983, + "grad_norm": 3.72944712638855, + "learning_rate": 3.865698232150199e-06, + "loss": 0.1015, + "step": 45513 + }, + { + "epoch": 3.017308735756918, + "grad_norm": 5.431502342224121, + "learning_rate": 3.865561189529944e-06, + "loss": 0.1089, + "step": 45514 + }, + { + "epoch": 3.017322300596853, + "grad_norm": 8.163469314575195, + "learning_rate": 3.865424146909689e-06, + "loss": 0.1023, + "step": 45515 + }, + { + "epoch": 3.0173358654367877, + "grad_norm": 3.604215621948242, + "learning_rate": 3.8652871042894344e-06, + "loss": 0.0888, + "step": 45516 + }, + { + "epoch": 3.0173494302767225, + "grad_norm": 4.1760029792785645, + "learning_rate": 3.86515006166918e-06, + "loss": 0.1378, + "step": 45517 + }, + { + "epoch": 3.017362995116658, + "grad_norm": 3.8438186645507812, + "learning_rate": 3.865013019048924e-06, + "loss": 0.0765, + "step": 45518 + }, + { + "epoch": 3.0173765599565927, + "grad_norm": 4.045036315917969, + "learning_rate": 3.86487597642867e-06, + "loss": 0.1428, + "step": 45519 + }, + { + "epoch": 3.0173901247965276, + "grad_norm": 3.6950578689575195, + "learning_rate": 3.864738933808414e-06, + "loss": 0.1269, + "step": 45520 + }, + { + "epoch": 3.0174036896364624, + "grad_norm": 4.964380741119385, + "learning_rate": 3.86460189118816e-06, + "loss": 0.1526, + "step": 45521 + }, + { + "epoch": 3.0174172544763973, + "grad_norm": 4.8669819831848145, + "learning_rate": 3.864464848567905e-06, + "loss": 0.1576, + "step": 45522 + }, + { + "epoch": 3.017430819316332, + "grad_norm": 3.3131866455078125, + "learning_rate": 3.86432780594765e-06, + "loss": 0.0861, + "step": 45523 + }, + { + "epoch": 3.017444384156267, + "grad_norm": 4.782116889953613, + "learning_rate": 3.864190763327395e-06, + "loss": 0.1265, + "step": 45524 + }, + { + "epoch": 3.017457948996202, + "grad_norm": 3.9110660552978516, + "learning_rate": 3.86405372070714e-06, + "loss": 0.086, + "step": 45525 + }, + { + "epoch": 3.0174715138361368, + "grad_norm": 2.63731050491333, + "learning_rate": 3.863916678086885e-06, + "loss": 0.0502, + "step": 45526 + }, + { + "epoch": 3.0174850786760716, + "grad_norm": 4.153833866119385, + "learning_rate": 3.8637796354666305e-06, + "loss": 0.1135, + "step": 45527 + }, + { + "epoch": 3.0174986435160065, + "grad_norm": 4.912102222442627, + "learning_rate": 3.863642592846376e-06, + "loss": 0.1264, + "step": 45528 + }, + { + "epoch": 3.0175122083559414, + "grad_norm": 4.474330902099609, + "learning_rate": 3.86350555022612e-06, + "loss": 0.1807, + "step": 45529 + }, + { + "epoch": 3.0175257731958762, + "grad_norm": 3.710216999053955, + "learning_rate": 3.863368507605866e-06, + "loss": 0.0959, + "step": 45530 + }, + { + "epoch": 3.017539338035811, + "grad_norm": 3.7625155448913574, + "learning_rate": 3.86323146498561e-06, + "loss": 0.1518, + "step": 45531 + }, + { + "epoch": 3.017552902875746, + "grad_norm": 4.658183574676514, + "learning_rate": 3.8630944223653564e-06, + "loss": 0.1575, + "step": 45532 + }, + { + "epoch": 3.017566467715681, + "grad_norm": 3.7291500568389893, + "learning_rate": 3.862957379745101e-06, + "loss": 0.1069, + "step": 45533 + }, + { + "epoch": 3.0175800325556157, + "grad_norm": 3.5910377502441406, + "learning_rate": 3.862820337124846e-06, + "loss": 0.0628, + "step": 45534 + }, + { + "epoch": 3.0175935973955506, + "grad_norm": 3.8969566822052, + "learning_rate": 3.862683294504591e-06, + "loss": 0.1347, + "step": 45535 + }, + { + "epoch": 3.0176071622354854, + "grad_norm": 5.194550037384033, + "learning_rate": 3.862546251884336e-06, + "loss": 0.0913, + "step": 45536 + }, + { + "epoch": 3.0176207270754207, + "grad_norm": 3.7355153560638428, + "learning_rate": 3.8624092092640815e-06, + "loss": 0.1368, + "step": 45537 + }, + { + "epoch": 3.0176342919153556, + "grad_norm": 5.585112571716309, + "learning_rate": 3.862272166643827e-06, + "loss": 0.1497, + "step": 45538 + }, + { + "epoch": 3.0176478567552905, + "grad_norm": 6.376438140869141, + "learning_rate": 3.862135124023572e-06, + "loss": 0.172, + "step": 45539 + }, + { + "epoch": 3.0176614215952253, + "grad_norm": 4.727205753326416, + "learning_rate": 3.861998081403317e-06, + "loss": 0.1079, + "step": 45540 + }, + { + "epoch": 3.01767498643516, + "grad_norm": 2.767920732498169, + "learning_rate": 3.861861038783062e-06, + "loss": 0.0791, + "step": 45541 + }, + { + "epoch": 3.017688551275095, + "grad_norm": 3.5913197994232178, + "learning_rate": 3.8617239961628065e-06, + "loss": 0.0757, + "step": 45542 + }, + { + "epoch": 3.01770211611503, + "grad_norm": 7.115717887878418, + "learning_rate": 3.861586953542552e-06, + "loss": 0.1123, + "step": 45543 + }, + { + "epoch": 3.017715680954965, + "grad_norm": 3.494091272354126, + "learning_rate": 3.861449910922297e-06, + "loss": 0.0635, + "step": 45544 + }, + { + "epoch": 3.0177292457948997, + "grad_norm": 3.9142606258392334, + "learning_rate": 3.861312868302042e-06, + "loss": 0.1279, + "step": 45545 + }, + { + "epoch": 3.0177428106348345, + "grad_norm": 3.505133867263794, + "learning_rate": 3.861175825681787e-06, + "loss": 0.085, + "step": 45546 + }, + { + "epoch": 3.0177563754747694, + "grad_norm": 3.2672252655029297, + "learning_rate": 3.8610387830615324e-06, + "loss": 0.1052, + "step": 45547 + }, + { + "epoch": 3.0177699403147042, + "grad_norm": 4.87999153137207, + "learning_rate": 3.860901740441278e-06, + "loss": 0.1302, + "step": 45548 + }, + { + "epoch": 3.017783505154639, + "grad_norm": 5.085056304931641, + "learning_rate": 3.860764697821023e-06, + "loss": 0.168, + "step": 45549 + }, + { + "epoch": 3.017797069994574, + "grad_norm": 2.8465943336486816, + "learning_rate": 3.860627655200768e-06, + "loss": 0.0424, + "step": 45550 + }, + { + "epoch": 3.017810634834509, + "grad_norm": 2.6533279418945312, + "learning_rate": 3.860490612580513e-06, + "loss": 0.0445, + "step": 45551 + }, + { + "epoch": 3.0178241996744437, + "grad_norm": 3.239981174468994, + "learning_rate": 3.8603535699602575e-06, + "loss": 0.0595, + "step": 45552 + }, + { + "epoch": 3.0178377645143786, + "grad_norm": 3.962563991546631, + "learning_rate": 3.8602165273400035e-06, + "loss": 0.0764, + "step": 45553 + }, + { + "epoch": 3.0178513293543134, + "grad_norm": 4.659573078155518, + "learning_rate": 3.860079484719748e-06, + "loss": 0.1111, + "step": 45554 + }, + { + "epoch": 3.0178648941942483, + "grad_norm": 3.373687744140625, + "learning_rate": 3.859942442099493e-06, + "loss": 0.037, + "step": 45555 + }, + { + "epoch": 3.0178784590341836, + "grad_norm": 3.4006083011627197, + "learning_rate": 3.859805399479238e-06, + "loss": 0.0444, + "step": 45556 + }, + { + "epoch": 3.0178920238741185, + "grad_norm": 4.391827583312988, + "learning_rate": 3.859668356858983e-06, + "loss": 0.1307, + "step": 45557 + }, + { + "epoch": 3.0179055887140533, + "grad_norm": 3.9639475345611572, + "learning_rate": 3.8595313142387286e-06, + "loss": 0.0671, + "step": 45558 + }, + { + "epoch": 3.017919153553988, + "grad_norm": 4.183269500732422, + "learning_rate": 3.859394271618474e-06, + "loss": 0.1823, + "step": 45559 + }, + { + "epoch": 3.017932718393923, + "grad_norm": 2.7656044960021973, + "learning_rate": 3.859257228998219e-06, + "loss": 0.0802, + "step": 45560 + }, + { + "epoch": 3.017946283233858, + "grad_norm": 3.1972360610961914, + "learning_rate": 3.859120186377964e-06, + "loss": 0.0633, + "step": 45561 + }, + { + "epoch": 3.017959848073793, + "grad_norm": 4.143716812133789, + "learning_rate": 3.858983143757709e-06, + "loss": 0.1076, + "step": 45562 + }, + { + "epoch": 3.0179734129137277, + "grad_norm": 3.8775370121002197, + "learning_rate": 3.858846101137454e-06, + "loss": 0.0776, + "step": 45563 + }, + { + "epoch": 3.0179869777536625, + "grad_norm": 2.9219605922698975, + "learning_rate": 3.8587090585172e-06, + "loss": 0.0523, + "step": 45564 + }, + { + "epoch": 3.0180005425935974, + "grad_norm": 3.3707430362701416, + "learning_rate": 3.858572015896944e-06, + "loss": 0.0653, + "step": 45565 + }, + { + "epoch": 3.0180141074335323, + "grad_norm": 3.8052918910980225, + "learning_rate": 3.85843497327669e-06, + "loss": 0.0706, + "step": 45566 + }, + { + "epoch": 3.018027672273467, + "grad_norm": 3.760890007019043, + "learning_rate": 3.858297930656434e-06, + "loss": 0.1044, + "step": 45567 + }, + { + "epoch": 3.018041237113402, + "grad_norm": 4.100378036499023, + "learning_rate": 3.8581608880361795e-06, + "loss": 0.1334, + "step": 45568 + }, + { + "epoch": 3.018054801953337, + "grad_norm": 3.4784090518951416, + "learning_rate": 3.858023845415925e-06, + "loss": 0.0826, + "step": 45569 + }, + { + "epoch": 3.0180683667932717, + "grad_norm": 4.083468914031982, + "learning_rate": 3.85788680279567e-06, + "loss": 0.0885, + "step": 45570 + }, + { + "epoch": 3.0180819316332066, + "grad_norm": 3.7291388511657715, + "learning_rate": 3.857749760175415e-06, + "loss": 0.0611, + "step": 45571 + }, + { + "epoch": 3.0180954964731415, + "grad_norm": 1.9884682893753052, + "learning_rate": 3.857612717555159e-06, + "loss": 0.051, + "step": 45572 + }, + { + "epoch": 3.0181090613130763, + "grad_norm": 2.6557655334472656, + "learning_rate": 3.857475674934905e-06, + "loss": 0.0572, + "step": 45573 + }, + { + "epoch": 3.018122626153011, + "grad_norm": 2.9321582317352295, + "learning_rate": 3.85733863231465e-06, + "loss": 0.0889, + "step": 45574 + }, + { + "epoch": 3.0181361909929465, + "grad_norm": 3.551790714263916, + "learning_rate": 3.857201589694396e-06, + "loss": 0.0708, + "step": 45575 + }, + { + "epoch": 3.0181497558328814, + "grad_norm": 2.556105852127075, + "learning_rate": 3.85706454707414e-06, + "loss": 0.041, + "step": 45576 + }, + { + "epoch": 3.0181633206728162, + "grad_norm": 3.728095054626465, + "learning_rate": 3.856927504453885e-06, + "loss": 0.1018, + "step": 45577 + }, + { + "epoch": 3.018176885512751, + "grad_norm": 3.2424850463867188, + "learning_rate": 3.8567904618336304e-06, + "loss": 0.0988, + "step": 45578 + }, + { + "epoch": 3.018190450352686, + "grad_norm": 3.2495615482330322, + "learning_rate": 3.856653419213376e-06, + "loss": 0.0629, + "step": 45579 + }, + { + "epoch": 3.018204015192621, + "grad_norm": 3.3220016956329346, + "learning_rate": 3.856516376593121e-06, + "loss": 0.0685, + "step": 45580 + }, + { + "epoch": 3.0182175800325557, + "grad_norm": 3.865042209625244, + "learning_rate": 3.856379333972866e-06, + "loss": 0.0954, + "step": 45581 + }, + { + "epoch": 3.0182311448724906, + "grad_norm": 4.173112392425537, + "learning_rate": 3.856242291352611e-06, + "loss": 0.0934, + "step": 45582 + }, + { + "epoch": 3.0182447097124254, + "grad_norm": 3.8389787673950195, + "learning_rate": 3.8561052487323555e-06, + "loss": 0.0855, + "step": 45583 + }, + { + "epoch": 3.0182582745523603, + "grad_norm": 3.866799831390381, + "learning_rate": 3.8559682061121015e-06, + "loss": 0.1056, + "step": 45584 + }, + { + "epoch": 3.018271839392295, + "grad_norm": 3.5069024562835693, + "learning_rate": 3.855831163491846e-06, + "loss": 0.1439, + "step": 45585 + }, + { + "epoch": 3.01828540423223, + "grad_norm": 3.784407377243042, + "learning_rate": 3.855694120871592e-06, + "loss": 0.1426, + "step": 45586 + }, + { + "epoch": 3.018298969072165, + "grad_norm": 4.967761039733887, + "learning_rate": 3.855557078251336e-06, + "loss": 0.1113, + "step": 45587 + }, + { + "epoch": 3.0183125339120997, + "grad_norm": 5.690520286560059, + "learning_rate": 3.855420035631081e-06, + "loss": 0.0988, + "step": 45588 + }, + { + "epoch": 3.0183260987520346, + "grad_norm": 3.396852970123291, + "learning_rate": 3.8552829930108266e-06, + "loss": 0.0493, + "step": 45589 + }, + { + "epoch": 3.0183396635919695, + "grad_norm": 4.469043731689453, + "learning_rate": 3.855145950390572e-06, + "loss": 0.0917, + "step": 45590 + }, + { + "epoch": 3.0183532284319043, + "grad_norm": 3.0948727130889893, + "learning_rate": 3.855008907770317e-06, + "loss": 0.084, + "step": 45591 + }, + { + "epoch": 3.018366793271839, + "grad_norm": 3.587066173553467, + "learning_rate": 3.854871865150062e-06, + "loss": 0.1104, + "step": 45592 + }, + { + "epoch": 3.018380358111774, + "grad_norm": 4.081727504730225, + "learning_rate": 3.854734822529807e-06, + "loss": 0.0963, + "step": 45593 + }, + { + "epoch": 3.0183939229517094, + "grad_norm": 3.3269224166870117, + "learning_rate": 3.8545977799095525e-06, + "loss": 0.0709, + "step": 45594 + }, + { + "epoch": 3.0184074877916443, + "grad_norm": 3.310887098312378, + "learning_rate": 3.854460737289298e-06, + "loss": 0.0942, + "step": 45595 + }, + { + "epoch": 3.018421052631579, + "grad_norm": 3.927522897720337, + "learning_rate": 3.854323694669043e-06, + "loss": 0.1155, + "step": 45596 + }, + { + "epoch": 3.018434617471514, + "grad_norm": 4.997462272644043, + "learning_rate": 3.854186652048787e-06, + "loss": 0.1803, + "step": 45597 + }, + { + "epoch": 3.018448182311449, + "grad_norm": 2.7803258895874023, + "learning_rate": 3.854049609428532e-06, + "loss": 0.0753, + "step": 45598 + }, + { + "epoch": 3.0184617471513837, + "grad_norm": 3.6442322731018066, + "learning_rate": 3.8539125668082775e-06, + "loss": 0.1179, + "step": 45599 + }, + { + "epoch": 3.0184753119913186, + "grad_norm": 2.978712558746338, + "learning_rate": 3.853775524188023e-06, + "loss": 0.0925, + "step": 45600 + }, + { + "epoch": 3.0184888768312534, + "grad_norm": 4.375600814819336, + "learning_rate": 3.853638481567768e-06, + "loss": 0.1466, + "step": 45601 + }, + { + "epoch": 3.0185024416711883, + "grad_norm": 6.32955265045166, + "learning_rate": 3.853501438947513e-06, + "loss": 0.1338, + "step": 45602 + }, + { + "epoch": 3.018516006511123, + "grad_norm": 3.0373551845550537, + "learning_rate": 3.853364396327258e-06, + "loss": 0.0548, + "step": 45603 + }, + { + "epoch": 3.018529571351058, + "grad_norm": 2.7882978916168213, + "learning_rate": 3.853227353707003e-06, + "loss": 0.0605, + "step": 45604 + }, + { + "epoch": 3.018543136190993, + "grad_norm": 5.809317588806152, + "learning_rate": 3.8530903110867486e-06, + "loss": 0.1057, + "step": 45605 + }, + { + "epoch": 3.0185567010309278, + "grad_norm": 4.7897820472717285, + "learning_rate": 3.852953268466493e-06, + "loss": 0.1392, + "step": 45606 + }, + { + "epoch": 3.0185702658708626, + "grad_norm": 3.886013984680176, + "learning_rate": 3.852816225846239e-06, + "loss": 0.0924, + "step": 45607 + }, + { + "epoch": 3.0185838307107975, + "grad_norm": 4.007376194000244, + "learning_rate": 3.852679183225983e-06, + "loss": 0.0509, + "step": 45608 + }, + { + "epoch": 3.0185973955507324, + "grad_norm": 3.5457160472869873, + "learning_rate": 3.852542140605729e-06, + "loss": 0.0792, + "step": 45609 + }, + { + "epoch": 3.0186109603906672, + "grad_norm": 3.919063091278076, + "learning_rate": 3.852405097985474e-06, + "loss": 0.124, + "step": 45610 + }, + { + "epoch": 3.018624525230602, + "grad_norm": 4.347962856292725, + "learning_rate": 3.852268055365219e-06, + "loss": 0.0881, + "step": 45611 + }, + { + "epoch": 3.018638090070537, + "grad_norm": 4.689783573150635, + "learning_rate": 3.852131012744964e-06, + "loss": 0.16, + "step": 45612 + }, + { + "epoch": 3.0186516549104723, + "grad_norm": 4.422888278961182, + "learning_rate": 3.851993970124709e-06, + "loss": 0.1763, + "step": 45613 + }, + { + "epoch": 3.018665219750407, + "grad_norm": 4.007098197937012, + "learning_rate": 3.851856927504454e-06, + "loss": 0.0723, + "step": 45614 + }, + { + "epoch": 3.018678784590342, + "grad_norm": 4.589639186859131, + "learning_rate": 3.8517198848841995e-06, + "loss": 0.1477, + "step": 45615 + }, + { + "epoch": 3.018692349430277, + "grad_norm": 3.887230157852173, + "learning_rate": 3.851582842263945e-06, + "loss": 0.072, + "step": 45616 + }, + { + "epoch": 3.0187059142702117, + "grad_norm": 4.841848850250244, + "learning_rate": 3.851445799643689e-06, + "loss": 0.1126, + "step": 45617 + }, + { + "epoch": 3.0187194791101466, + "grad_norm": 3.854736566543579, + "learning_rate": 3.851308757023435e-06, + "loss": 0.09, + "step": 45618 + }, + { + "epoch": 3.0187330439500815, + "grad_norm": 4.345668792724609, + "learning_rate": 3.851171714403179e-06, + "loss": 0.1083, + "step": 45619 + }, + { + "epoch": 3.0187466087900163, + "grad_norm": 4.2286810874938965, + "learning_rate": 3.851034671782925e-06, + "loss": 0.1027, + "step": 45620 + }, + { + "epoch": 3.018760173629951, + "grad_norm": 3.074336528778076, + "learning_rate": 3.85089762916267e-06, + "loss": 0.1014, + "step": 45621 + }, + { + "epoch": 3.018773738469886, + "grad_norm": 4.441125392913818, + "learning_rate": 3.850760586542415e-06, + "loss": 0.153, + "step": 45622 + }, + { + "epoch": 3.018787303309821, + "grad_norm": 3.602466106414795, + "learning_rate": 3.85062354392216e-06, + "loss": 0.0976, + "step": 45623 + }, + { + "epoch": 3.018800868149756, + "grad_norm": 2.9663448333740234, + "learning_rate": 3.850486501301905e-06, + "loss": 0.0635, + "step": 45624 + }, + { + "epoch": 3.0188144329896907, + "grad_norm": 6.1625213623046875, + "learning_rate": 3.8503494586816505e-06, + "loss": 0.1031, + "step": 45625 + }, + { + "epoch": 3.0188279978296255, + "grad_norm": 4.104104042053223, + "learning_rate": 3.850212416061395e-06, + "loss": 0.1128, + "step": 45626 + }, + { + "epoch": 3.0188415626695604, + "grad_norm": 2.7409214973449707, + "learning_rate": 3.850075373441141e-06, + "loss": 0.0588, + "step": 45627 + }, + { + "epoch": 3.0188551275094953, + "grad_norm": 3.7215309143066406, + "learning_rate": 3.849938330820885e-06, + "loss": 0.0657, + "step": 45628 + }, + { + "epoch": 3.01886869234943, + "grad_norm": 4.079777717590332, + "learning_rate": 3.849801288200631e-06, + "loss": 0.0946, + "step": 45629 + }, + { + "epoch": 3.018882257189365, + "grad_norm": 4.809335231781006, + "learning_rate": 3.8496642455803755e-06, + "loss": 0.1452, + "step": 45630 + }, + { + "epoch": 3.0188958220293, + "grad_norm": 4.95942497253418, + "learning_rate": 3.849527202960121e-06, + "loss": 0.1638, + "step": 45631 + }, + { + "epoch": 3.018909386869235, + "grad_norm": 3.4724364280700684, + "learning_rate": 3.849390160339866e-06, + "loss": 0.1048, + "step": 45632 + }, + { + "epoch": 3.01892295170917, + "grad_norm": 3.93310284614563, + "learning_rate": 3.849253117719611e-06, + "loss": 0.1077, + "step": 45633 + }, + { + "epoch": 3.018936516549105, + "grad_norm": 3.402693033218384, + "learning_rate": 3.849116075099356e-06, + "loss": 0.0935, + "step": 45634 + }, + { + "epoch": 3.0189500813890398, + "grad_norm": 3.038707971572876, + "learning_rate": 3.848979032479101e-06, + "loss": 0.079, + "step": 45635 + }, + { + "epoch": 3.0189636462289746, + "grad_norm": 4.354011535644531, + "learning_rate": 3.848841989858847e-06, + "loss": 0.1865, + "step": 45636 + }, + { + "epoch": 3.0189772110689095, + "grad_norm": 3.2933194637298584, + "learning_rate": 3.848704947238592e-06, + "loss": 0.089, + "step": 45637 + }, + { + "epoch": 3.0189907759088443, + "grad_norm": 4.193222522735596, + "learning_rate": 3.848567904618337e-06, + "loss": 0.0957, + "step": 45638 + }, + { + "epoch": 3.019004340748779, + "grad_norm": 5.063094139099121, + "learning_rate": 3.848430861998081e-06, + "loss": 0.1305, + "step": 45639 + }, + { + "epoch": 3.019017905588714, + "grad_norm": 3.4707272052764893, + "learning_rate": 3.8482938193778264e-06, + "loss": 0.0707, + "step": 45640 + }, + { + "epoch": 3.019031470428649, + "grad_norm": 5.7293477058410645, + "learning_rate": 3.848156776757572e-06, + "loss": 0.1609, + "step": 45641 + }, + { + "epoch": 3.019045035268584, + "grad_norm": 4.01863956451416, + "learning_rate": 3.848019734137317e-06, + "loss": 0.1197, + "step": 45642 + }, + { + "epoch": 3.0190586001085187, + "grad_norm": 4.85142183303833, + "learning_rate": 3.847882691517062e-06, + "loss": 0.1161, + "step": 45643 + }, + { + "epoch": 3.0190721649484535, + "grad_norm": 3.5473620891571045, + "learning_rate": 3.847745648896807e-06, + "loss": 0.0809, + "step": 45644 + }, + { + "epoch": 3.0190857297883884, + "grad_norm": 4.890337944030762, + "learning_rate": 3.847608606276552e-06, + "loss": 0.1246, + "step": 45645 + }, + { + "epoch": 3.0190992946283233, + "grad_norm": 3.696154832839966, + "learning_rate": 3.8474715636562975e-06, + "loss": 0.096, + "step": 45646 + }, + { + "epoch": 3.019112859468258, + "grad_norm": 6.143931865692139, + "learning_rate": 3.847334521036043e-06, + "loss": 0.1809, + "step": 45647 + }, + { + "epoch": 3.019126424308193, + "grad_norm": 4.274282455444336, + "learning_rate": 3.847197478415788e-06, + "loss": 0.1203, + "step": 45648 + }, + { + "epoch": 3.019139989148128, + "grad_norm": 3.982887029647827, + "learning_rate": 3.847060435795533e-06, + "loss": 0.0969, + "step": 45649 + }, + { + "epoch": 3.0191535539880627, + "grad_norm": 4.498735427856445, + "learning_rate": 3.846923393175278e-06, + "loss": 0.1489, + "step": 45650 + }, + { + "epoch": 3.019167118827998, + "grad_norm": 3.5556342601776123, + "learning_rate": 3.8467863505550226e-06, + "loss": 0.0717, + "step": 45651 + }, + { + "epoch": 3.019180683667933, + "grad_norm": 4.086785316467285, + "learning_rate": 3.846649307934768e-06, + "loss": 0.1211, + "step": 45652 + }, + { + "epoch": 3.0191942485078678, + "grad_norm": 4.831697940826416, + "learning_rate": 3.846512265314513e-06, + "loss": 0.1311, + "step": 45653 + }, + { + "epoch": 3.0192078133478026, + "grad_norm": 4.716781139373779, + "learning_rate": 3.846375222694258e-06, + "loss": 0.1172, + "step": 45654 + }, + { + "epoch": 3.0192213781877375, + "grad_norm": 3.764934539794922, + "learning_rate": 3.846238180074003e-06, + "loss": 0.0873, + "step": 45655 + }, + { + "epoch": 3.0192349430276724, + "grad_norm": 3.2454514503479004, + "learning_rate": 3.8461011374537485e-06, + "loss": 0.0996, + "step": 45656 + }, + { + "epoch": 3.0192485078676072, + "grad_norm": 4.911364555358887, + "learning_rate": 3.845964094833494e-06, + "loss": 0.1172, + "step": 45657 + }, + { + "epoch": 3.019262072707542, + "grad_norm": 5.5572991371154785, + "learning_rate": 3.845827052213239e-06, + "loss": 0.1697, + "step": 45658 + }, + { + "epoch": 3.019275637547477, + "grad_norm": 4.363184452056885, + "learning_rate": 3.845690009592984e-06, + "loss": 0.1141, + "step": 45659 + }, + { + "epoch": 3.019289202387412, + "grad_norm": 2.913533926010132, + "learning_rate": 3.845552966972728e-06, + "loss": 0.0524, + "step": 45660 + }, + { + "epoch": 3.0193027672273467, + "grad_norm": 5.7278571128845215, + "learning_rate": 3.845415924352474e-06, + "loss": 0.1432, + "step": 45661 + }, + { + "epoch": 3.0193163320672816, + "grad_norm": 4.892016410827637, + "learning_rate": 3.845278881732219e-06, + "loss": 0.1163, + "step": 45662 + }, + { + "epoch": 3.0193298969072164, + "grad_norm": 2.83355450630188, + "learning_rate": 3.845141839111965e-06, + "loss": 0.0709, + "step": 45663 + }, + { + "epoch": 3.0193434617471513, + "grad_norm": 3.8314435482025146, + "learning_rate": 3.845004796491709e-06, + "loss": 0.0796, + "step": 45664 + }, + { + "epoch": 3.019357026587086, + "grad_norm": 5.115239143371582, + "learning_rate": 3.844867753871454e-06, + "loss": 0.1448, + "step": 45665 + }, + { + "epoch": 3.019370591427021, + "grad_norm": 3.042374849319458, + "learning_rate": 3.844730711251199e-06, + "loss": 0.064, + "step": 45666 + }, + { + "epoch": 3.019384156266956, + "grad_norm": 4.4435834884643555, + "learning_rate": 3.844593668630945e-06, + "loss": 0.0778, + "step": 45667 + }, + { + "epoch": 3.0193977211068908, + "grad_norm": 3.1154088973999023, + "learning_rate": 3.84445662601069e-06, + "loss": 0.0677, + "step": 45668 + }, + { + "epoch": 3.0194112859468256, + "grad_norm": 4.079708576202393, + "learning_rate": 3.844319583390435e-06, + "loss": 0.1243, + "step": 45669 + }, + { + "epoch": 3.019424850786761, + "grad_norm": 4.667032718658447, + "learning_rate": 3.84418254077018e-06, + "loss": 0.0825, + "step": 45670 + }, + { + "epoch": 3.019438415626696, + "grad_norm": 3.751516103744507, + "learning_rate": 3.8440454981499244e-06, + "loss": 0.0932, + "step": 45671 + }, + { + "epoch": 3.0194519804666307, + "grad_norm": 4.316815376281738, + "learning_rate": 3.8439084555296705e-06, + "loss": 0.0986, + "step": 45672 + }, + { + "epoch": 3.0194655453065655, + "grad_norm": 5.384562969207764, + "learning_rate": 3.843771412909415e-06, + "loss": 0.1294, + "step": 45673 + }, + { + "epoch": 3.0194791101465004, + "grad_norm": 3.49617338180542, + "learning_rate": 3.843634370289161e-06, + "loss": 0.0714, + "step": 45674 + }, + { + "epoch": 3.0194926749864353, + "grad_norm": 5.103362560272217, + "learning_rate": 3.843497327668905e-06, + "loss": 0.0797, + "step": 45675 + }, + { + "epoch": 3.01950623982637, + "grad_norm": 4.705478191375732, + "learning_rate": 3.84336028504865e-06, + "loss": 0.2009, + "step": 45676 + }, + { + "epoch": 3.019519804666305, + "grad_norm": 4.339560031890869, + "learning_rate": 3.8432232424283955e-06, + "loss": 0.114, + "step": 45677 + }, + { + "epoch": 3.01953336950624, + "grad_norm": 4.250020503997803, + "learning_rate": 3.843086199808141e-06, + "loss": 0.1606, + "step": 45678 + }, + { + "epoch": 3.0195469343461747, + "grad_norm": 3.5578243732452393, + "learning_rate": 3.842949157187886e-06, + "loss": 0.0864, + "step": 45679 + }, + { + "epoch": 3.0195604991861096, + "grad_norm": 5.256867408752441, + "learning_rate": 3.84281211456763e-06, + "loss": 0.1584, + "step": 45680 + }, + { + "epoch": 3.0195740640260444, + "grad_norm": 4.793933868408203, + "learning_rate": 3.842675071947376e-06, + "loss": 0.1161, + "step": 45681 + }, + { + "epoch": 3.0195876288659793, + "grad_norm": 5.059015274047852, + "learning_rate": 3.8425380293271206e-06, + "loss": 0.1205, + "step": 45682 + }, + { + "epoch": 3.019601193705914, + "grad_norm": 3.8545305728912354, + "learning_rate": 3.842400986706867e-06, + "loss": 0.1263, + "step": 45683 + }, + { + "epoch": 3.019614758545849, + "grad_norm": 4.322991371154785, + "learning_rate": 3.842263944086611e-06, + "loss": 0.191, + "step": 45684 + }, + { + "epoch": 3.019628323385784, + "grad_norm": 3.987769365310669, + "learning_rate": 3.842126901466356e-06, + "loss": 0.1267, + "step": 45685 + }, + { + "epoch": 3.0196418882257188, + "grad_norm": 4.577645301818848, + "learning_rate": 3.841989858846101e-06, + "loss": 0.2225, + "step": 45686 + }, + { + "epoch": 3.0196554530656536, + "grad_norm": 5.541750431060791, + "learning_rate": 3.8418528162258465e-06, + "loss": 0.2179, + "step": 45687 + }, + { + "epoch": 3.0196690179055885, + "grad_norm": 5.036427021026611, + "learning_rate": 3.841715773605592e-06, + "loss": 0.1878, + "step": 45688 + }, + { + "epoch": 3.019682582745524, + "grad_norm": 3.7487010955810547, + "learning_rate": 3.841578730985337e-06, + "loss": 0.1476, + "step": 45689 + }, + { + "epoch": 3.0196961475854587, + "grad_norm": 3.467268466949463, + "learning_rate": 3.841441688365082e-06, + "loss": 0.1115, + "step": 45690 + }, + { + "epoch": 3.0197097124253935, + "grad_norm": 3.5259792804718018, + "learning_rate": 3.841304645744827e-06, + "loss": 0.0746, + "step": 45691 + }, + { + "epoch": 3.0197232772653284, + "grad_norm": 4.163726329803467, + "learning_rate": 3.841167603124572e-06, + "loss": 0.122, + "step": 45692 + }, + { + "epoch": 3.0197368421052633, + "grad_norm": 5.898839950561523, + "learning_rate": 3.841030560504317e-06, + "loss": 0.1691, + "step": 45693 + }, + { + "epoch": 3.019750406945198, + "grad_norm": 4.313035011291504, + "learning_rate": 3.840893517884062e-06, + "loss": 0.1564, + "step": 45694 + }, + { + "epoch": 3.019763971785133, + "grad_norm": 4.529567241668701, + "learning_rate": 3.840756475263807e-06, + "loss": 0.1186, + "step": 45695 + }, + { + "epoch": 3.019777536625068, + "grad_norm": 6.182036399841309, + "learning_rate": 3.840619432643552e-06, + "loss": 0.1804, + "step": 45696 + }, + { + "epoch": 3.0197911014650027, + "grad_norm": 2.9602627754211426, + "learning_rate": 3.840482390023297e-06, + "loss": 0.0757, + "step": 45697 + }, + { + "epoch": 3.0198046663049376, + "grad_norm": 3.748995542526245, + "learning_rate": 3.840345347403043e-06, + "loss": 0.1397, + "step": 45698 + }, + { + "epoch": 3.0198182311448725, + "grad_norm": 5.4143805503845215, + "learning_rate": 3.840208304782788e-06, + "loss": 0.1798, + "step": 45699 + }, + { + "epoch": 3.0198317959848073, + "grad_norm": 5.455563545227051, + "learning_rate": 3.840071262162533e-06, + "loss": 0.1881, + "step": 45700 + }, + { + "epoch": 3.019845360824742, + "grad_norm": 5.321432113647461, + "learning_rate": 3.839934219542278e-06, + "loss": 0.1402, + "step": 45701 + }, + { + "epoch": 3.019858925664677, + "grad_norm": 3.7387590408325195, + "learning_rate": 3.839797176922023e-06, + "loss": 0.0699, + "step": 45702 + }, + { + "epoch": 3.019872490504612, + "grad_norm": 3.9922194480895996, + "learning_rate": 3.8396601343017685e-06, + "loss": 0.1433, + "step": 45703 + }, + { + "epoch": 3.019886055344547, + "grad_norm": 4.345320224761963, + "learning_rate": 3.839523091681514e-06, + "loss": 0.082, + "step": 45704 + }, + { + "epoch": 3.0198996201844817, + "grad_norm": 5.323507308959961, + "learning_rate": 3.839386049061258e-06, + "loss": 0.1068, + "step": 45705 + }, + { + "epoch": 3.0199131850244165, + "grad_norm": 4.540463447570801, + "learning_rate": 3.839249006441004e-06, + "loss": 0.0737, + "step": 45706 + }, + { + "epoch": 3.0199267498643514, + "grad_norm": 5.801939010620117, + "learning_rate": 3.839111963820748e-06, + "loss": 0.1609, + "step": 45707 + }, + { + "epoch": 3.0199403147042867, + "grad_norm": 5.681366443634033, + "learning_rate": 3.8389749212004935e-06, + "loss": 0.121, + "step": 45708 + }, + { + "epoch": 3.0199538795442216, + "grad_norm": 5.106533527374268, + "learning_rate": 3.838837878580239e-06, + "loss": 0.1158, + "step": 45709 + }, + { + "epoch": 3.0199674443841564, + "grad_norm": 4.221926212310791, + "learning_rate": 3.838700835959984e-06, + "loss": 0.1208, + "step": 45710 + }, + { + "epoch": 3.0199810092240913, + "grad_norm": 5.106950283050537, + "learning_rate": 3.838563793339729e-06, + "loss": 0.1628, + "step": 45711 + }, + { + "epoch": 3.019994574064026, + "grad_norm": 3.956146717071533, + "learning_rate": 3.838426750719474e-06, + "loss": 0.1004, + "step": 45712 + }, + { + "epoch": 3.020008138903961, + "grad_norm": 4.088860988616943, + "learning_rate": 3.838289708099219e-06, + "loss": 0.1134, + "step": 45713 + }, + { + "epoch": 3.020021703743896, + "grad_norm": 5.426095962524414, + "learning_rate": 3.838152665478964e-06, + "loss": 0.1385, + "step": 45714 + }, + { + "epoch": 3.0200352685838308, + "grad_norm": 4.078946590423584, + "learning_rate": 3.83801562285871e-06, + "loss": 0.0973, + "step": 45715 + }, + { + "epoch": 3.0200488334237656, + "grad_norm": 4.691465377807617, + "learning_rate": 3.837878580238454e-06, + "loss": 0.088, + "step": 45716 + }, + { + "epoch": 3.0200623982637005, + "grad_norm": 4.6111016273498535, + "learning_rate": 3.8377415376182e-06, + "loss": 0.108, + "step": 45717 + }, + { + "epoch": 3.0200759631036354, + "grad_norm": 3.93029522895813, + "learning_rate": 3.8376044949979445e-06, + "loss": 0.0676, + "step": 45718 + }, + { + "epoch": 3.02008952794357, + "grad_norm": 4.205785274505615, + "learning_rate": 3.83746745237769e-06, + "loss": 0.1432, + "step": 45719 + }, + { + "epoch": 3.020103092783505, + "grad_norm": 4.031331539154053, + "learning_rate": 3.837330409757435e-06, + "loss": 0.0878, + "step": 45720 + }, + { + "epoch": 3.02011665762344, + "grad_norm": 6.81919002532959, + "learning_rate": 3.83719336713718e-06, + "loss": 0.1574, + "step": 45721 + }, + { + "epoch": 3.020130222463375, + "grad_norm": 4.550235271453857, + "learning_rate": 3.837056324516925e-06, + "loss": 0.0994, + "step": 45722 + }, + { + "epoch": 3.0201437873033097, + "grad_norm": 4.972301483154297, + "learning_rate": 3.83691928189667e-06, + "loss": 0.0905, + "step": 45723 + }, + { + "epoch": 3.0201573521432445, + "grad_norm": 4.734463691711426, + "learning_rate": 3.8367822392764155e-06, + "loss": 0.1506, + "step": 45724 + }, + { + "epoch": 3.0201709169831794, + "grad_norm": 7.054677963256836, + "learning_rate": 3.83664519665616e-06, + "loss": 0.1448, + "step": 45725 + }, + { + "epoch": 3.0201844818231143, + "grad_norm": 5.742938995361328, + "learning_rate": 3.836508154035906e-06, + "loss": 0.1948, + "step": 45726 + }, + { + "epoch": 3.0201980466630496, + "grad_norm": 5.494204044342041, + "learning_rate": 3.83637111141565e-06, + "loss": 0.1418, + "step": 45727 + }, + { + "epoch": 3.0202116115029845, + "grad_norm": 4.939630508422852, + "learning_rate": 3.836234068795396e-06, + "loss": 0.1487, + "step": 45728 + }, + { + "epoch": 3.0202251763429193, + "grad_norm": 4.2310333251953125, + "learning_rate": 3.836097026175141e-06, + "loss": 0.1085, + "step": 45729 + }, + { + "epoch": 3.020238741182854, + "grad_norm": 6.028534412384033, + "learning_rate": 3.835959983554886e-06, + "loss": 0.2444, + "step": 45730 + }, + { + "epoch": 3.020252306022789, + "grad_norm": 5.741948127746582, + "learning_rate": 3.835822940934631e-06, + "loss": 0.1318, + "step": 45731 + }, + { + "epoch": 3.020265870862724, + "grad_norm": 4.374882698059082, + "learning_rate": 3.835685898314376e-06, + "loss": 0.0985, + "step": 45732 + }, + { + "epoch": 3.0202794357026588, + "grad_norm": 4.299032688140869, + "learning_rate": 3.835548855694121e-06, + "loss": 0.1098, + "step": 45733 + }, + { + "epoch": 3.0202930005425936, + "grad_norm": 5.172878265380859, + "learning_rate": 3.8354118130738665e-06, + "loss": 0.2658, + "step": 45734 + }, + { + "epoch": 3.0203065653825285, + "grad_norm": 5.554516315460205, + "learning_rate": 3.835274770453612e-06, + "loss": 0.2459, + "step": 45735 + }, + { + "epoch": 3.0203201302224634, + "grad_norm": 4.492218017578125, + "learning_rate": 3.835137727833356e-06, + "loss": 0.1058, + "step": 45736 + }, + { + "epoch": 3.0203336950623982, + "grad_norm": 5.019065856933594, + "learning_rate": 3.835000685213102e-06, + "loss": 0.1578, + "step": 45737 + }, + { + "epoch": 3.020347259902333, + "grad_norm": 3.828704357147217, + "learning_rate": 3.834863642592846e-06, + "loss": 0.0983, + "step": 45738 + }, + { + "epoch": 3.020360824742268, + "grad_norm": 4.921825408935547, + "learning_rate": 3.8347265999725915e-06, + "loss": 0.0925, + "step": 45739 + }, + { + "epoch": 3.020374389582203, + "grad_norm": 4.799773693084717, + "learning_rate": 3.834589557352337e-06, + "loss": 0.1272, + "step": 45740 + }, + { + "epoch": 3.0203879544221377, + "grad_norm": 4.730478286743164, + "learning_rate": 3.834452514732082e-06, + "loss": 0.168, + "step": 45741 + }, + { + "epoch": 3.0204015192620726, + "grad_norm": 4.759481430053711, + "learning_rate": 3.834315472111827e-06, + "loss": 0.1439, + "step": 45742 + }, + { + "epoch": 3.0204150841020074, + "grad_norm": 3.9340295791625977, + "learning_rate": 3.834178429491572e-06, + "loss": 0.103, + "step": 45743 + }, + { + "epoch": 3.0204286489419423, + "grad_norm": 3.680154323577881, + "learning_rate": 3.8340413868713174e-06, + "loss": 0.0768, + "step": 45744 + }, + { + "epoch": 3.020442213781877, + "grad_norm": 3.655646800994873, + "learning_rate": 3.833904344251063e-06, + "loss": 0.0838, + "step": 45745 + }, + { + "epoch": 3.0204557786218125, + "grad_norm": 7.065032482147217, + "learning_rate": 3.833767301630808e-06, + "loss": 0.2088, + "step": 45746 + }, + { + "epoch": 3.0204693434617473, + "grad_norm": 4.505342960357666, + "learning_rate": 3.833630259010553e-06, + "loss": 0.104, + "step": 45747 + }, + { + "epoch": 3.020482908301682, + "grad_norm": 4.56069278717041, + "learning_rate": 3.833493216390297e-06, + "loss": 0.1727, + "step": 45748 + }, + { + "epoch": 3.020496473141617, + "grad_norm": 6.849617958068848, + "learning_rate": 3.8333561737700425e-06, + "loss": 0.1464, + "step": 45749 + }, + { + "epoch": 3.020510037981552, + "grad_norm": 4.053159713745117, + "learning_rate": 3.833219131149788e-06, + "loss": 0.0636, + "step": 45750 + }, + { + "epoch": 3.020523602821487, + "grad_norm": 4.6937174797058105, + "learning_rate": 3.833082088529533e-06, + "loss": 0.107, + "step": 45751 + }, + { + "epoch": 3.0205371676614217, + "grad_norm": 3.928189754486084, + "learning_rate": 3.832945045909278e-06, + "loss": 0.1115, + "step": 45752 + }, + { + "epoch": 3.0205507325013565, + "grad_norm": 6.298698902130127, + "learning_rate": 3.832808003289023e-06, + "loss": 0.1161, + "step": 45753 + }, + { + "epoch": 3.0205642973412914, + "grad_norm": 5.972142696380615, + "learning_rate": 3.832670960668768e-06, + "loss": 0.2144, + "step": 45754 + }, + { + "epoch": 3.0205778621812263, + "grad_norm": 4.608428478240967, + "learning_rate": 3.8325339180485135e-06, + "loss": 0.1008, + "step": 45755 + }, + { + "epoch": 3.020591427021161, + "grad_norm": 6.4847283363342285, + "learning_rate": 3.832396875428259e-06, + "loss": 0.1778, + "step": 45756 + }, + { + "epoch": 3.020604991861096, + "grad_norm": 4.7423906326293945, + "learning_rate": 3.832259832808004e-06, + "loss": 0.1068, + "step": 45757 + }, + { + "epoch": 3.020618556701031, + "grad_norm": 3.8968288898468018, + "learning_rate": 3.832122790187749e-06, + "loss": 0.0882, + "step": 45758 + }, + { + "epoch": 3.0206321215409657, + "grad_norm": 4.761822700500488, + "learning_rate": 3.831985747567493e-06, + "loss": 0.1604, + "step": 45759 + }, + { + "epoch": 3.0206456863809006, + "grad_norm": 4.650644302368164, + "learning_rate": 3.8318487049472394e-06, + "loss": 0.1504, + "step": 45760 + }, + { + "epoch": 3.0206592512208355, + "grad_norm": 3.1370062828063965, + "learning_rate": 3.831711662326984e-06, + "loss": 0.1076, + "step": 45761 + }, + { + "epoch": 3.0206728160607703, + "grad_norm": 3.842191457748413, + "learning_rate": 3.831574619706729e-06, + "loss": 0.0995, + "step": 45762 + }, + { + "epoch": 3.020686380900705, + "grad_norm": 6.127884864807129, + "learning_rate": 3.831437577086474e-06, + "loss": 0.1744, + "step": 45763 + }, + { + "epoch": 3.02069994574064, + "grad_norm": 3.716172218322754, + "learning_rate": 3.831300534466219e-06, + "loss": 0.0988, + "step": 45764 + }, + { + "epoch": 3.0207135105805754, + "grad_norm": 5.53582239151001, + "learning_rate": 3.8311634918459645e-06, + "loss": 0.1565, + "step": 45765 + }, + { + "epoch": 3.02072707542051, + "grad_norm": 4.575583457946777, + "learning_rate": 3.83102644922571e-06, + "loss": 0.1121, + "step": 45766 + }, + { + "epoch": 3.020740640260445, + "grad_norm": 4.537249565124512, + "learning_rate": 3.830889406605455e-06, + "loss": 0.1092, + "step": 45767 + }, + { + "epoch": 3.02075420510038, + "grad_norm": 5.984765529632568, + "learning_rate": 3.830752363985199e-06, + "loss": 0.1245, + "step": 45768 + }, + { + "epoch": 3.020767769940315, + "grad_norm": 4.1623334884643555, + "learning_rate": 3.830615321364945e-06, + "loss": 0.084, + "step": 45769 + }, + { + "epoch": 3.0207813347802497, + "grad_norm": 4.821237564086914, + "learning_rate": 3.8304782787446895e-06, + "loss": 0.0909, + "step": 45770 + }, + { + "epoch": 3.0207948996201845, + "grad_norm": 3.8554892539978027, + "learning_rate": 3.8303412361244356e-06, + "loss": 0.111, + "step": 45771 + }, + { + "epoch": 3.0208084644601194, + "grad_norm": 4.633524417877197, + "learning_rate": 3.83020419350418e-06, + "loss": 0.1573, + "step": 45772 + }, + { + "epoch": 3.0208220293000543, + "grad_norm": 3.8542587757110596, + "learning_rate": 3.830067150883925e-06, + "loss": 0.1118, + "step": 45773 + }, + { + "epoch": 3.020835594139989, + "grad_norm": 6.788280487060547, + "learning_rate": 3.82993010826367e-06, + "loss": 0.1023, + "step": 45774 + }, + { + "epoch": 3.020849158979924, + "grad_norm": 3.8819398880004883, + "learning_rate": 3.8297930656434154e-06, + "loss": 0.1143, + "step": 45775 + }, + { + "epoch": 3.020862723819859, + "grad_norm": 6.777050018310547, + "learning_rate": 3.829656023023161e-06, + "loss": 0.1222, + "step": 45776 + }, + { + "epoch": 3.0208762886597937, + "grad_norm": 6.4678874015808105, + "learning_rate": 3.829518980402906e-06, + "loss": 0.1554, + "step": 45777 + }, + { + "epoch": 3.0208898534997286, + "grad_norm": 4.261251449584961, + "learning_rate": 3.829381937782651e-06, + "loss": 0.069, + "step": 45778 + }, + { + "epoch": 3.0209034183396635, + "grad_norm": 4.416118144989014, + "learning_rate": 3.829244895162395e-06, + "loss": 0.0966, + "step": 45779 + }, + { + "epoch": 3.0209169831795983, + "grad_norm": 4.953581809997559, + "learning_rate": 3.829107852542141e-06, + "loss": 0.0706, + "step": 45780 + }, + { + "epoch": 3.020930548019533, + "grad_norm": 5.141356945037842, + "learning_rate": 3.828970809921886e-06, + "loss": 0.1194, + "step": 45781 + }, + { + "epoch": 3.020944112859468, + "grad_norm": 3.952695846557617, + "learning_rate": 3.828833767301631e-06, + "loss": 0.0908, + "step": 45782 + }, + { + "epoch": 3.020957677699403, + "grad_norm": 3.7208988666534424, + "learning_rate": 3.828696724681376e-06, + "loss": 0.1281, + "step": 45783 + }, + { + "epoch": 3.0209712425393382, + "grad_norm": 4.622592449188232, + "learning_rate": 3.828559682061121e-06, + "loss": 0.0903, + "step": 45784 + }, + { + "epoch": 3.020984807379273, + "grad_norm": 3.4611940383911133, + "learning_rate": 3.828422639440866e-06, + "loss": 0.085, + "step": 45785 + }, + { + "epoch": 3.020998372219208, + "grad_norm": 5.190781116485596, + "learning_rate": 3.8282855968206116e-06, + "loss": 0.1127, + "step": 45786 + }, + { + "epoch": 3.021011937059143, + "grad_norm": 5.224389553070068, + "learning_rate": 3.828148554200357e-06, + "loss": 0.17, + "step": 45787 + }, + { + "epoch": 3.0210255018990777, + "grad_norm": 3.9593863487243652, + "learning_rate": 3.828011511580102e-06, + "loss": 0.0977, + "step": 45788 + }, + { + "epoch": 3.0210390667390126, + "grad_norm": 5.5385541915893555, + "learning_rate": 3.827874468959847e-06, + "loss": 0.0881, + "step": 45789 + }, + { + "epoch": 3.0210526315789474, + "grad_norm": 3.8435943126678467, + "learning_rate": 3.827737426339591e-06, + "loss": 0.1826, + "step": 45790 + }, + { + "epoch": 3.0210661964188823, + "grad_norm": 4.3898491859436035, + "learning_rate": 3.8276003837193374e-06, + "loss": 0.142, + "step": 45791 + }, + { + "epoch": 3.021079761258817, + "grad_norm": 4.52027702331543, + "learning_rate": 3.827463341099082e-06, + "loss": 0.1302, + "step": 45792 + }, + { + "epoch": 3.021093326098752, + "grad_norm": 3.572263240814209, + "learning_rate": 3.827326298478827e-06, + "loss": 0.0723, + "step": 45793 + }, + { + "epoch": 3.021106890938687, + "grad_norm": 4.333461284637451, + "learning_rate": 3.827189255858572e-06, + "loss": 0.109, + "step": 45794 + }, + { + "epoch": 3.0211204557786218, + "grad_norm": 4.365005016326904, + "learning_rate": 3.827052213238317e-06, + "loss": 0.1011, + "step": 45795 + }, + { + "epoch": 3.0211340206185566, + "grad_norm": 5.112380504608154, + "learning_rate": 3.8269151706180625e-06, + "loss": 0.1632, + "step": 45796 + }, + { + "epoch": 3.0211475854584915, + "grad_norm": 3.450525999069214, + "learning_rate": 3.826778127997808e-06, + "loss": 0.0863, + "step": 45797 + }, + { + "epoch": 3.0211611502984264, + "grad_norm": 4.298569679260254, + "learning_rate": 3.826641085377553e-06, + "loss": 0.2032, + "step": 45798 + }, + { + "epoch": 3.021174715138361, + "grad_norm": 3.2309088706970215, + "learning_rate": 3.826504042757298e-06, + "loss": 0.1317, + "step": 45799 + }, + { + "epoch": 3.021188279978296, + "grad_norm": 4.666775226593018, + "learning_rate": 3.826367000137043e-06, + "loss": 0.1402, + "step": 45800 + }, + { + "epoch": 3.021201844818231, + "grad_norm": 3.6496033668518066, + "learning_rate": 3.826229957516788e-06, + "loss": 0.1258, + "step": 45801 + }, + { + "epoch": 3.021215409658166, + "grad_norm": 5.232753276824951, + "learning_rate": 3.826092914896533e-06, + "loss": 0.1619, + "step": 45802 + }, + { + "epoch": 3.021228974498101, + "grad_norm": 3.135671854019165, + "learning_rate": 3.825955872276278e-06, + "loss": 0.0923, + "step": 45803 + }, + { + "epoch": 3.021242539338036, + "grad_norm": 4.018889427185059, + "learning_rate": 3.825818829656023e-06, + "loss": 0.1123, + "step": 45804 + }, + { + "epoch": 3.021256104177971, + "grad_norm": 3.987745761871338, + "learning_rate": 3.825681787035768e-06, + "loss": 0.1321, + "step": 45805 + }, + { + "epoch": 3.0212696690179057, + "grad_norm": 4.029167652130127, + "learning_rate": 3.8255447444155134e-06, + "loss": 0.1269, + "step": 45806 + }, + { + "epoch": 3.0212832338578406, + "grad_norm": 4.403998374938965, + "learning_rate": 3.825407701795259e-06, + "loss": 0.1717, + "step": 45807 + }, + { + "epoch": 3.0212967986977755, + "grad_norm": 4.774411678314209, + "learning_rate": 3.825270659175004e-06, + "loss": 0.0782, + "step": 45808 + }, + { + "epoch": 3.0213103635377103, + "grad_norm": 3.5482089519500732, + "learning_rate": 3.825133616554749e-06, + "loss": 0.1041, + "step": 45809 + }, + { + "epoch": 3.021323928377645, + "grad_norm": 4.747951030731201, + "learning_rate": 3.824996573934494e-06, + "loss": 0.1772, + "step": 45810 + }, + { + "epoch": 3.02133749321758, + "grad_norm": 4.419273376464844, + "learning_rate": 3.824859531314239e-06, + "loss": 0.1888, + "step": 45811 + }, + { + "epoch": 3.021351058057515, + "grad_norm": 4.8324127197265625, + "learning_rate": 3.8247224886939845e-06, + "loss": 0.1307, + "step": 45812 + }, + { + "epoch": 3.02136462289745, + "grad_norm": 3.667529344558716, + "learning_rate": 3.824585446073729e-06, + "loss": 0.1329, + "step": 45813 + }, + { + "epoch": 3.0213781877373846, + "grad_norm": 4.054487705230713, + "learning_rate": 3.824448403453475e-06, + "loss": 0.1078, + "step": 45814 + }, + { + "epoch": 3.0213917525773195, + "grad_norm": 6.563967227935791, + "learning_rate": 3.824311360833219e-06, + "loss": 0.1743, + "step": 45815 + }, + { + "epoch": 3.0214053174172544, + "grad_norm": 5.475177764892578, + "learning_rate": 3.824174318212965e-06, + "loss": 0.2327, + "step": 45816 + }, + { + "epoch": 3.0214188822571892, + "grad_norm": 4.939404487609863, + "learning_rate": 3.8240372755927096e-06, + "loss": 0.1246, + "step": 45817 + }, + { + "epoch": 3.021432447097124, + "grad_norm": 4.461118698120117, + "learning_rate": 3.823900232972455e-06, + "loss": 0.1109, + "step": 45818 + }, + { + "epoch": 3.021446011937059, + "grad_norm": 5.023738861083984, + "learning_rate": 3.8237631903522e-06, + "loss": 0.1619, + "step": 45819 + }, + { + "epoch": 3.021459576776994, + "grad_norm": 4.758404731750488, + "learning_rate": 3.823626147731945e-06, + "loss": 0.1676, + "step": 45820 + }, + { + "epoch": 3.0214731416169287, + "grad_norm": 5.449981212615967, + "learning_rate": 3.82348910511169e-06, + "loss": 0.1805, + "step": 45821 + }, + { + "epoch": 3.021486706456864, + "grad_norm": 3.6808226108551025, + "learning_rate": 3.823352062491435e-06, + "loss": 0.0703, + "step": 45822 + }, + { + "epoch": 3.021500271296799, + "grad_norm": 4.553208351135254, + "learning_rate": 3.823215019871181e-06, + "loss": 0.1015, + "step": 45823 + }, + { + "epoch": 3.0215138361367337, + "grad_norm": 3.2887187004089355, + "learning_rate": 3.823077977250925e-06, + "loss": 0.0659, + "step": 45824 + }, + { + "epoch": 3.0215274009766686, + "grad_norm": 3.186829090118408, + "learning_rate": 3.822940934630671e-06, + "loss": 0.1191, + "step": 45825 + }, + { + "epoch": 3.0215409658166035, + "grad_norm": 3.6370909214019775, + "learning_rate": 3.822803892010415e-06, + "loss": 0.1041, + "step": 45826 + }, + { + "epoch": 3.0215545306565383, + "grad_norm": 2.972456932067871, + "learning_rate": 3.8226668493901605e-06, + "loss": 0.0834, + "step": 45827 + }, + { + "epoch": 3.021568095496473, + "grad_norm": 4.269531726837158, + "learning_rate": 3.822529806769906e-06, + "loss": 0.1149, + "step": 45828 + }, + { + "epoch": 3.021581660336408, + "grad_norm": 3.3182826042175293, + "learning_rate": 3.822392764149651e-06, + "loss": 0.0787, + "step": 45829 + }, + { + "epoch": 3.021595225176343, + "grad_norm": 3.1024086475372314, + "learning_rate": 3.822255721529396e-06, + "loss": 0.1186, + "step": 45830 + }, + { + "epoch": 3.021608790016278, + "grad_norm": 3.915316104888916, + "learning_rate": 3.822118678909141e-06, + "loss": 0.121, + "step": 45831 + }, + { + "epoch": 3.0216223548562127, + "grad_norm": 4.775174140930176, + "learning_rate": 3.821981636288886e-06, + "loss": 0.1399, + "step": 45832 + }, + { + "epoch": 3.0216359196961475, + "grad_norm": 4.402622222900391, + "learning_rate": 3.821844593668631e-06, + "loss": 0.0918, + "step": 45833 + }, + { + "epoch": 3.0216494845360824, + "grad_norm": 4.391513347625732, + "learning_rate": 3.821707551048377e-06, + "loss": 0.1217, + "step": 45834 + }, + { + "epoch": 3.0216630493760173, + "grad_norm": 3.896663188934326, + "learning_rate": 3.821570508428121e-06, + "loss": 0.1196, + "step": 45835 + }, + { + "epoch": 3.021676614215952, + "grad_norm": 4.567366600036621, + "learning_rate": 3.821433465807866e-06, + "loss": 0.1182, + "step": 45836 + }, + { + "epoch": 3.021690179055887, + "grad_norm": 5.233491897583008, + "learning_rate": 3.8212964231876114e-06, + "loss": 0.1586, + "step": 45837 + }, + { + "epoch": 3.021703743895822, + "grad_norm": 5.265045642852783, + "learning_rate": 3.821159380567357e-06, + "loss": 0.1353, + "step": 45838 + }, + { + "epoch": 3.0217173087357567, + "grad_norm": 4.1183953285217285, + "learning_rate": 3.821022337947102e-06, + "loss": 0.0906, + "step": 45839 + }, + { + "epoch": 3.0217308735756916, + "grad_norm": 2.425173044204712, + "learning_rate": 3.820885295326847e-06, + "loss": 0.0454, + "step": 45840 + }, + { + "epoch": 3.021744438415627, + "grad_norm": 4.318347454071045, + "learning_rate": 3.820748252706592e-06, + "loss": 0.1186, + "step": 45841 + }, + { + "epoch": 3.0217580032555618, + "grad_norm": 4.88028621673584, + "learning_rate": 3.820611210086337e-06, + "loss": 0.1363, + "step": 45842 + }, + { + "epoch": 3.0217715680954966, + "grad_norm": 5.458070755004883, + "learning_rate": 3.8204741674660825e-06, + "loss": 0.1442, + "step": 45843 + }, + { + "epoch": 3.0217851329354315, + "grad_norm": 4.4901275634765625, + "learning_rate": 3.820337124845828e-06, + "loss": 0.1456, + "step": 45844 + }, + { + "epoch": 3.0217986977753664, + "grad_norm": 4.647037982940674, + "learning_rate": 3.820200082225573e-06, + "loss": 0.1205, + "step": 45845 + }, + { + "epoch": 3.0218122626153012, + "grad_norm": 5.268465518951416, + "learning_rate": 3.820063039605317e-06, + "loss": 0.1286, + "step": 45846 + }, + { + "epoch": 3.021825827455236, + "grad_norm": 4.241125106811523, + "learning_rate": 3.819925996985062e-06, + "loss": 0.1869, + "step": 45847 + }, + { + "epoch": 3.021839392295171, + "grad_norm": 5.502731800079346, + "learning_rate": 3.8197889543648076e-06, + "loss": 0.1769, + "step": 45848 + }, + { + "epoch": 3.021852957135106, + "grad_norm": 4.12290620803833, + "learning_rate": 3.819651911744553e-06, + "loss": 0.1267, + "step": 45849 + }, + { + "epoch": 3.0218665219750407, + "grad_norm": 3.7882003784179688, + "learning_rate": 3.819514869124298e-06, + "loss": 0.1287, + "step": 45850 + }, + { + "epoch": 3.0218800868149756, + "grad_norm": 3.1401233673095703, + "learning_rate": 3.819377826504043e-06, + "loss": 0.1475, + "step": 45851 + }, + { + "epoch": 3.0218936516549104, + "grad_norm": 3.8170220851898193, + "learning_rate": 3.819240783883788e-06, + "loss": 0.1535, + "step": 45852 + }, + { + "epoch": 3.0219072164948453, + "grad_norm": 2.909106969833374, + "learning_rate": 3.8191037412635335e-06, + "loss": 0.076, + "step": 45853 + }, + { + "epoch": 3.02192078133478, + "grad_norm": 3.7552335262298584, + "learning_rate": 3.818966698643279e-06, + "loss": 0.1425, + "step": 45854 + }, + { + "epoch": 3.021934346174715, + "grad_norm": 4.1232171058654785, + "learning_rate": 3.818829656023024e-06, + "loss": 0.119, + "step": 45855 + }, + { + "epoch": 3.02194791101465, + "grad_norm": 3.2164077758789062, + "learning_rate": 3.818692613402768e-06, + "loss": 0.1107, + "step": 45856 + }, + { + "epoch": 3.0219614758545847, + "grad_norm": 4.067354679107666, + "learning_rate": 3.818555570782514e-06, + "loss": 0.1369, + "step": 45857 + }, + { + "epoch": 3.0219750406945196, + "grad_norm": 5.365081310272217, + "learning_rate": 3.8184185281622585e-06, + "loss": 0.1732, + "step": 45858 + }, + { + "epoch": 3.0219886055344545, + "grad_norm": 3.2478582859039307, + "learning_rate": 3.818281485542004e-06, + "loss": 0.0475, + "step": 45859 + }, + { + "epoch": 3.02200217037439, + "grad_norm": 4.120025157928467, + "learning_rate": 3.818144442921749e-06, + "loss": 0.1266, + "step": 45860 + }, + { + "epoch": 3.0220157352143246, + "grad_norm": 4.119936466217041, + "learning_rate": 3.818007400301494e-06, + "loss": 0.1096, + "step": 45861 + }, + { + "epoch": 3.0220293000542595, + "grad_norm": 4.3881516456604, + "learning_rate": 3.817870357681239e-06, + "loss": 0.1608, + "step": 45862 + }, + { + "epoch": 3.0220428648941944, + "grad_norm": 3.5246903896331787, + "learning_rate": 3.817733315060984e-06, + "loss": 0.0874, + "step": 45863 + }, + { + "epoch": 3.0220564297341292, + "grad_norm": 3.7365503311157227, + "learning_rate": 3.8175962724407296e-06, + "loss": 0.1441, + "step": 45864 + }, + { + "epoch": 3.022069994574064, + "grad_norm": 5.459418773651123, + "learning_rate": 3.817459229820475e-06, + "loss": 0.1429, + "step": 45865 + }, + { + "epoch": 3.022083559413999, + "grad_norm": 4.908646106719971, + "learning_rate": 3.81732218720022e-06, + "loss": 0.2144, + "step": 45866 + }, + { + "epoch": 3.022097124253934, + "grad_norm": 3.357180118560791, + "learning_rate": 3.817185144579964e-06, + "loss": 0.0971, + "step": 45867 + }, + { + "epoch": 3.0221106890938687, + "grad_norm": 4.473004341125488, + "learning_rate": 3.81704810195971e-06, + "loss": 0.1681, + "step": 45868 + }, + { + "epoch": 3.0221242539338036, + "grad_norm": 5.684814929962158, + "learning_rate": 3.816911059339455e-06, + "loss": 0.1758, + "step": 45869 + }, + { + "epoch": 3.0221378187737384, + "grad_norm": 6.446290969848633, + "learning_rate": 3.816774016719201e-06, + "loss": 0.3615, + "step": 45870 + }, + { + "epoch": 3.0221513836136733, + "grad_norm": 4.361947059631348, + "learning_rate": 3.816636974098945e-06, + "loss": 0.1164, + "step": 45871 + }, + { + "epoch": 3.022164948453608, + "grad_norm": 4.150219917297363, + "learning_rate": 3.81649993147869e-06, + "loss": 0.1652, + "step": 45872 + }, + { + "epoch": 3.022178513293543, + "grad_norm": 3.698316812515259, + "learning_rate": 3.816362888858435e-06, + "loss": 0.1989, + "step": 45873 + }, + { + "epoch": 3.022192078133478, + "grad_norm": 4.772838115692139, + "learning_rate": 3.8162258462381805e-06, + "loss": 0.1325, + "step": 45874 + }, + { + "epoch": 3.0222056429734128, + "grad_norm": 3.385425329208374, + "learning_rate": 3.816088803617926e-06, + "loss": 0.1157, + "step": 45875 + }, + { + "epoch": 3.0222192078133476, + "grad_norm": 5.078207492828369, + "learning_rate": 3.81595176099767e-06, + "loss": 0.1637, + "step": 45876 + }, + { + "epoch": 3.0222327726532825, + "grad_norm": 4.211127758026123, + "learning_rate": 3.815814718377416e-06, + "loss": 0.1234, + "step": 45877 + }, + { + "epoch": 3.0222463374932174, + "grad_norm": 3.501615524291992, + "learning_rate": 3.81567767575716e-06, + "loss": 0.0787, + "step": 45878 + }, + { + "epoch": 3.0222599023331527, + "grad_norm": 4.17485237121582, + "learning_rate": 3.815540633136906e-06, + "loss": 0.1757, + "step": 45879 + }, + { + "epoch": 3.0222734671730875, + "grad_norm": 6.070272445678711, + "learning_rate": 3.815403590516651e-06, + "loss": 0.1104, + "step": 45880 + }, + { + "epoch": 3.0222870320130224, + "grad_norm": 2.8159375190734863, + "learning_rate": 3.815266547896396e-06, + "loss": 0.1016, + "step": 45881 + }, + { + "epoch": 3.0223005968529573, + "grad_norm": 3.8240249156951904, + "learning_rate": 3.815129505276141e-06, + "loss": 0.1488, + "step": 45882 + }, + { + "epoch": 3.022314161692892, + "grad_norm": 4.285087585449219, + "learning_rate": 3.814992462655886e-06, + "loss": 0.1249, + "step": 45883 + }, + { + "epoch": 3.022327726532827, + "grad_norm": 4.387032985687256, + "learning_rate": 3.8148554200356315e-06, + "loss": 0.1358, + "step": 45884 + }, + { + "epoch": 3.022341291372762, + "grad_norm": 5.529296875, + "learning_rate": 3.814718377415377e-06, + "loss": 0.1534, + "step": 45885 + }, + { + "epoch": 3.0223548562126967, + "grad_norm": 5.1253743171691895, + "learning_rate": 3.814581334795122e-06, + "loss": 0.1184, + "step": 45886 + }, + { + "epoch": 3.0223684210526316, + "grad_norm": 5.534795761108398, + "learning_rate": 3.8144442921748666e-06, + "loss": 0.1418, + "step": 45887 + }, + { + "epoch": 3.0223819858925665, + "grad_norm": 4.707728385925293, + "learning_rate": 3.8143072495546118e-06, + "loss": 0.1103, + "step": 45888 + }, + { + "epoch": 3.0223955507325013, + "grad_norm": 6.586917877197266, + "learning_rate": 3.8141702069343565e-06, + "loss": 0.1635, + "step": 45889 + }, + { + "epoch": 3.022409115572436, + "grad_norm": 4.486769199371338, + "learning_rate": 3.814033164314102e-06, + "loss": 0.1284, + "step": 45890 + }, + { + "epoch": 3.022422680412371, + "grad_norm": 7.165342807769775, + "learning_rate": 3.813896121693847e-06, + "loss": 0.2022, + "step": 45891 + }, + { + "epoch": 3.022436245252306, + "grad_norm": 3.8573055267333984, + "learning_rate": 3.8137590790735925e-06, + "loss": 0.1376, + "step": 45892 + }, + { + "epoch": 3.022449810092241, + "grad_norm": 5.927915096282959, + "learning_rate": 3.8136220364533372e-06, + "loss": 0.1324, + "step": 45893 + }, + { + "epoch": 3.0224633749321757, + "grad_norm": 5.906139373779297, + "learning_rate": 3.8134849938330824e-06, + "loss": 0.2245, + "step": 45894 + }, + { + "epoch": 3.0224769397721105, + "grad_norm": 4.279573440551758, + "learning_rate": 3.8133479512128276e-06, + "loss": 0.1381, + "step": 45895 + }, + { + "epoch": 3.0224905046120454, + "grad_norm": 4.460529804229736, + "learning_rate": 3.8132109085925728e-06, + "loss": 0.0962, + "step": 45896 + }, + { + "epoch": 3.0225040694519802, + "grad_norm": 4.095603942871094, + "learning_rate": 3.8130738659723175e-06, + "loss": 0.1565, + "step": 45897 + }, + { + "epoch": 3.0225176342919156, + "grad_norm": 6.136927127838135, + "learning_rate": 3.812936823352063e-06, + "loss": 0.1733, + "step": 45898 + }, + { + "epoch": 3.0225311991318504, + "grad_norm": 5.68303918838501, + "learning_rate": 3.812799780731808e-06, + "loss": 0.1662, + "step": 45899 + }, + { + "epoch": 3.0225447639717853, + "grad_norm": 3.0940792560577393, + "learning_rate": 3.8126627381115526e-06, + "loss": 0.0417, + "step": 45900 + }, + { + "epoch": 3.02255832881172, + "grad_norm": 3.8421034812927246, + "learning_rate": 3.8125256954912982e-06, + "loss": 0.0895, + "step": 45901 + }, + { + "epoch": 3.022571893651655, + "grad_norm": 4.023257255554199, + "learning_rate": 3.812388652871043e-06, + "loss": 0.0998, + "step": 45902 + }, + { + "epoch": 3.02258545849159, + "grad_norm": 6.791208744049072, + "learning_rate": 3.8122516102507886e-06, + "loss": 0.1879, + "step": 45903 + }, + { + "epoch": 3.0225990233315247, + "grad_norm": 7.828529357910156, + "learning_rate": 3.8121145676305333e-06, + "loss": 0.2157, + "step": 45904 + }, + { + "epoch": 3.0226125881714596, + "grad_norm": 4.638121128082275, + "learning_rate": 3.8119775250102785e-06, + "loss": 0.1816, + "step": 45905 + }, + { + "epoch": 3.0226261530113945, + "grad_norm": 6.551738739013672, + "learning_rate": 3.8118404823900233e-06, + "loss": 0.1571, + "step": 45906 + }, + { + "epoch": 3.0226397178513293, + "grad_norm": 4.093674182891846, + "learning_rate": 3.811703439769769e-06, + "loss": 0.0922, + "step": 45907 + }, + { + "epoch": 3.022653282691264, + "grad_norm": 3.4055898189544678, + "learning_rate": 3.8115663971495136e-06, + "loss": 0.0817, + "step": 45908 + }, + { + "epoch": 3.022666847531199, + "grad_norm": 3.486837863922119, + "learning_rate": 3.8114293545292592e-06, + "loss": 0.1136, + "step": 45909 + }, + { + "epoch": 3.022680412371134, + "grad_norm": 4.664945602416992, + "learning_rate": 3.811292311909004e-06, + "loss": 0.1596, + "step": 45910 + }, + { + "epoch": 3.022693977211069, + "grad_norm": 2.98477840423584, + "learning_rate": 3.811155269288749e-06, + "loss": 0.1265, + "step": 45911 + }, + { + "epoch": 3.0227075420510037, + "grad_norm": 5.248958587646484, + "learning_rate": 3.8110182266684943e-06, + "loss": 0.2213, + "step": 45912 + }, + { + "epoch": 3.0227211068909385, + "grad_norm": 4.752782344818115, + "learning_rate": 3.8108811840482395e-06, + "loss": 0.1615, + "step": 45913 + }, + { + "epoch": 3.0227346717308734, + "grad_norm": 5.0227580070495605, + "learning_rate": 3.8107441414279843e-06, + "loss": 0.1419, + "step": 45914 + }, + { + "epoch": 3.0227482365708083, + "grad_norm": 4.739851474761963, + "learning_rate": 3.8106070988077295e-06, + "loss": 0.1158, + "step": 45915 + }, + { + "epoch": 3.022761801410743, + "grad_norm": 5.070606708526611, + "learning_rate": 3.8104700561874746e-06, + "loss": 0.1431, + "step": 45916 + }, + { + "epoch": 3.0227753662506784, + "grad_norm": 4.1666975021362305, + "learning_rate": 3.8103330135672194e-06, + "loss": 0.1597, + "step": 45917 + }, + { + "epoch": 3.0227889310906133, + "grad_norm": 3.705004930496216, + "learning_rate": 3.810195970946965e-06, + "loss": 0.0998, + "step": 45918 + }, + { + "epoch": 3.022802495930548, + "grad_norm": 3.694406270980835, + "learning_rate": 3.8100589283267098e-06, + "loss": 0.0942, + "step": 45919 + }, + { + "epoch": 3.022816060770483, + "grad_norm": 4.984564781188965, + "learning_rate": 3.8099218857064554e-06, + "loss": 0.2332, + "step": 45920 + }, + { + "epoch": 3.022829625610418, + "grad_norm": 4.221286296844482, + "learning_rate": 3.8097848430862e-06, + "loss": 0.1736, + "step": 45921 + }, + { + "epoch": 3.0228431904503528, + "grad_norm": 5.826076984405518, + "learning_rate": 3.8096478004659453e-06, + "loss": 0.1118, + "step": 45922 + }, + { + "epoch": 3.0228567552902876, + "grad_norm": 5.24577522277832, + "learning_rate": 3.80951075784569e-06, + "loss": 0.1637, + "step": 45923 + }, + { + "epoch": 3.0228703201302225, + "grad_norm": 3.3102540969848633, + "learning_rate": 3.8093737152254356e-06, + "loss": 0.0556, + "step": 45924 + }, + { + "epoch": 3.0228838849701574, + "grad_norm": 5.318940162658691, + "learning_rate": 3.8092366726051804e-06, + "loss": 0.1965, + "step": 45925 + }, + { + "epoch": 3.0228974498100922, + "grad_norm": 4.6346964836120605, + "learning_rate": 3.809099629984926e-06, + "loss": 0.1519, + "step": 45926 + }, + { + "epoch": 3.022911014650027, + "grad_norm": 4.639054298400879, + "learning_rate": 3.8089625873646708e-06, + "loss": 0.1374, + "step": 45927 + }, + { + "epoch": 3.022924579489962, + "grad_norm": 4.191855430603027, + "learning_rate": 3.8088255447444155e-06, + "loss": 0.1367, + "step": 45928 + }, + { + "epoch": 3.022938144329897, + "grad_norm": 4.8411946296691895, + "learning_rate": 3.808688502124161e-06, + "loss": 0.1633, + "step": 45929 + }, + { + "epoch": 3.0229517091698317, + "grad_norm": 4.129141330718994, + "learning_rate": 3.808551459503906e-06, + "loss": 0.1536, + "step": 45930 + }, + { + "epoch": 3.0229652740097666, + "grad_norm": 5.646906852722168, + "learning_rate": 3.808414416883651e-06, + "loss": 0.1297, + "step": 45931 + }, + { + "epoch": 3.0229788388497014, + "grad_norm": 3.0824790000915527, + "learning_rate": 3.8082773742633962e-06, + "loss": 0.1214, + "step": 45932 + }, + { + "epoch": 3.0229924036896363, + "grad_norm": 3.9746248722076416, + "learning_rate": 3.8081403316431414e-06, + "loss": 0.1074, + "step": 45933 + }, + { + "epoch": 3.023005968529571, + "grad_norm": 5.724006652832031, + "learning_rate": 3.808003289022886e-06, + "loss": 0.1656, + "step": 45934 + }, + { + "epoch": 3.0230195333695065, + "grad_norm": 4.990782737731934, + "learning_rate": 3.8078662464026318e-06, + "loss": 0.1887, + "step": 45935 + }, + { + "epoch": 3.0230330982094413, + "grad_norm": 4.366855621337891, + "learning_rate": 3.8077292037823765e-06, + "loss": 0.1119, + "step": 45936 + }, + { + "epoch": 3.023046663049376, + "grad_norm": 5.837436676025391, + "learning_rate": 3.807592161162122e-06, + "loss": 0.1626, + "step": 45937 + }, + { + "epoch": 3.023060227889311, + "grad_norm": 3.9576802253723145, + "learning_rate": 3.807455118541867e-06, + "loss": 0.1174, + "step": 45938 + }, + { + "epoch": 3.023073792729246, + "grad_norm": 4.215878486633301, + "learning_rate": 3.807318075921612e-06, + "loss": 0.127, + "step": 45939 + }, + { + "epoch": 3.023087357569181, + "grad_norm": 4.096949100494385, + "learning_rate": 3.807181033301357e-06, + "loss": 0.1254, + "step": 45940 + }, + { + "epoch": 3.0231009224091157, + "grad_norm": 6.296926975250244, + "learning_rate": 3.807043990681102e-06, + "loss": 0.1973, + "step": 45941 + }, + { + "epoch": 3.0231144872490505, + "grad_norm": 4.365278720855713, + "learning_rate": 3.806906948060847e-06, + "loss": 0.176, + "step": 45942 + }, + { + "epoch": 3.0231280520889854, + "grad_norm": 5.194193363189697, + "learning_rate": 3.806769905440592e-06, + "loss": 0.1944, + "step": 45943 + }, + { + "epoch": 3.0231416169289203, + "grad_norm": 4.712603569030762, + "learning_rate": 3.8066328628203375e-06, + "loss": 0.1842, + "step": 45944 + }, + { + "epoch": 3.023155181768855, + "grad_norm": 5.3678364753723145, + "learning_rate": 3.8064958202000823e-06, + "loss": 0.149, + "step": 45945 + }, + { + "epoch": 3.02316874660879, + "grad_norm": 6.572410583496094, + "learning_rate": 3.806358777579828e-06, + "loss": 0.2192, + "step": 45946 + }, + { + "epoch": 3.023182311448725, + "grad_norm": 4.232229232788086, + "learning_rate": 3.8062217349595726e-06, + "loss": 0.1708, + "step": 45947 + }, + { + "epoch": 3.0231958762886597, + "grad_norm": 6.804331302642822, + "learning_rate": 3.806084692339318e-06, + "loss": 0.1997, + "step": 45948 + }, + { + "epoch": 3.0232094411285946, + "grad_norm": 4.254672050476074, + "learning_rate": 3.805947649719063e-06, + "loss": 0.185, + "step": 45949 + }, + { + "epoch": 3.0232230059685294, + "grad_norm": 3.5457794666290283, + "learning_rate": 3.805810607098808e-06, + "loss": 0.1473, + "step": 45950 + }, + { + "epoch": 3.0232365708084643, + "grad_norm": 3.4640767574310303, + "learning_rate": 3.805673564478553e-06, + "loss": 0.0844, + "step": 45951 + }, + { + "epoch": 3.023250135648399, + "grad_norm": 4.203543663024902, + "learning_rate": 3.8055365218582985e-06, + "loss": 0.1381, + "step": 45952 + }, + { + "epoch": 3.023263700488334, + "grad_norm": 4.586223602294922, + "learning_rate": 3.8053994792380433e-06, + "loss": 0.116, + "step": 45953 + }, + { + "epoch": 3.023277265328269, + "grad_norm": 5.473325729370117, + "learning_rate": 3.805262436617789e-06, + "loss": 0.2223, + "step": 45954 + }, + { + "epoch": 3.023290830168204, + "grad_norm": 4.026302814483643, + "learning_rate": 3.8051253939975337e-06, + "loss": 0.1669, + "step": 45955 + }, + { + "epoch": 3.023304395008139, + "grad_norm": 5.814387321472168, + "learning_rate": 3.8049883513772784e-06, + "loss": 0.1278, + "step": 45956 + }, + { + "epoch": 3.023317959848074, + "grad_norm": 4.8093180656433105, + "learning_rate": 3.8048513087570236e-06, + "loss": 0.216, + "step": 45957 + }, + { + "epoch": 3.023331524688009, + "grad_norm": 4.592563629150391, + "learning_rate": 3.8047142661367688e-06, + "loss": 0.186, + "step": 45958 + }, + { + "epoch": 3.0233450895279437, + "grad_norm": 3.839744806289673, + "learning_rate": 3.804577223516514e-06, + "loss": 0.1065, + "step": 45959 + }, + { + "epoch": 3.0233586543678785, + "grad_norm": 3.3773396015167236, + "learning_rate": 3.8044401808962587e-06, + "loss": 0.1143, + "step": 45960 + }, + { + "epoch": 3.0233722192078134, + "grad_norm": 4.88927698135376, + "learning_rate": 3.8043031382760043e-06, + "loss": 0.2347, + "step": 45961 + }, + { + "epoch": 3.0233857840477483, + "grad_norm": 4.185288906097412, + "learning_rate": 3.804166095655749e-06, + "loss": 0.2093, + "step": 45962 + }, + { + "epoch": 3.023399348887683, + "grad_norm": 4.740886211395264, + "learning_rate": 3.8040290530354947e-06, + "loss": 0.1304, + "step": 45963 + }, + { + "epoch": 3.023412913727618, + "grad_norm": 4.114689826965332, + "learning_rate": 3.8038920104152394e-06, + "loss": 0.2183, + "step": 45964 + }, + { + "epoch": 3.023426478567553, + "grad_norm": 5.126389026641846, + "learning_rate": 3.8037549677949846e-06, + "loss": 0.2179, + "step": 45965 + }, + { + "epoch": 3.0234400434074877, + "grad_norm": 4.1123881340026855, + "learning_rate": 3.8036179251747298e-06, + "loss": 0.1111, + "step": 45966 + }, + { + "epoch": 3.0234536082474226, + "grad_norm": 6.255568981170654, + "learning_rate": 3.803480882554475e-06, + "loss": 0.1849, + "step": 45967 + }, + { + "epoch": 3.0234671730873575, + "grad_norm": 3.851370334625244, + "learning_rate": 3.8033438399342197e-06, + "loss": 0.1496, + "step": 45968 + }, + { + "epoch": 3.0234807379272923, + "grad_norm": 5.378500461578369, + "learning_rate": 3.803206797313965e-06, + "loss": 0.1717, + "step": 45969 + }, + { + "epoch": 3.023494302767227, + "grad_norm": 6.563686847686768, + "learning_rate": 3.80306975469371e-06, + "loss": 0.2334, + "step": 45970 + }, + { + "epoch": 3.023507867607162, + "grad_norm": 5.850825786590576, + "learning_rate": 3.802932712073455e-06, + "loss": 0.1463, + "step": 45971 + }, + { + "epoch": 3.023521432447097, + "grad_norm": 3.9860153198242188, + "learning_rate": 3.8027956694532004e-06, + "loss": 0.1625, + "step": 45972 + }, + { + "epoch": 3.0235349972870322, + "grad_norm": 4.671588897705078, + "learning_rate": 3.802658626832945e-06, + "loss": 0.1567, + "step": 45973 + }, + { + "epoch": 3.023548562126967, + "grad_norm": 6.602678298950195, + "learning_rate": 3.8025215842126908e-06, + "loss": 0.2199, + "step": 45974 + }, + { + "epoch": 3.023562126966902, + "grad_norm": 4.27610969543457, + "learning_rate": 3.8023845415924355e-06, + "loss": 0.128, + "step": 45975 + }, + { + "epoch": 3.023575691806837, + "grad_norm": 5.554133892059326, + "learning_rate": 3.8022474989721807e-06, + "loss": 0.1857, + "step": 45976 + }, + { + "epoch": 3.0235892566467717, + "grad_norm": 3.9228670597076416, + "learning_rate": 3.8021104563519255e-06, + "loss": 0.1584, + "step": 45977 + }, + { + "epoch": 3.0236028214867066, + "grad_norm": 6.512169361114502, + "learning_rate": 3.801973413731671e-06, + "loss": 0.2836, + "step": 45978 + }, + { + "epoch": 3.0236163863266414, + "grad_norm": 4.789327144622803, + "learning_rate": 3.801836371111416e-06, + "loss": 0.2126, + "step": 45979 + }, + { + "epoch": 3.0236299511665763, + "grad_norm": 4.530351638793945, + "learning_rate": 3.8016993284911614e-06, + "loss": 0.1844, + "step": 45980 + }, + { + "epoch": 3.023643516006511, + "grad_norm": 3.5892701148986816, + "learning_rate": 3.801562285870906e-06, + "loss": 0.1253, + "step": 45981 + }, + { + "epoch": 3.023657080846446, + "grad_norm": 5.830498695373535, + "learning_rate": 3.801425243250651e-06, + "loss": 0.3693, + "step": 45982 + }, + { + "epoch": 3.023670645686381, + "grad_norm": 8.489470481872559, + "learning_rate": 3.8012882006303965e-06, + "loss": 0.2369, + "step": 45983 + }, + { + "epoch": 3.0236842105263158, + "grad_norm": 4.774815559387207, + "learning_rate": 3.8011511580101413e-06, + "loss": 0.1909, + "step": 45984 + }, + { + "epoch": 3.0236977753662506, + "grad_norm": 4.739480972290039, + "learning_rate": 3.8010141153898865e-06, + "loss": 0.1846, + "step": 45985 + }, + { + "epoch": 3.0237113402061855, + "grad_norm": 6.783185005187988, + "learning_rate": 3.8008770727696317e-06, + "loss": 0.156, + "step": 45986 + }, + { + "epoch": 3.0237249050461203, + "grad_norm": 5.710601329803467, + "learning_rate": 3.800740030149377e-06, + "loss": 0.1862, + "step": 45987 + }, + { + "epoch": 3.023738469886055, + "grad_norm": 5.137779712677002, + "learning_rate": 3.8006029875291216e-06, + "loss": 0.1874, + "step": 45988 + }, + { + "epoch": 3.02375203472599, + "grad_norm": 5.064021110534668, + "learning_rate": 3.800465944908867e-06, + "loss": 0.1307, + "step": 45989 + }, + { + "epoch": 3.023765599565925, + "grad_norm": 5.095028400421143, + "learning_rate": 3.800328902288612e-06, + "loss": 0.2252, + "step": 45990 + }, + { + "epoch": 3.02377916440586, + "grad_norm": 4.866274356842041, + "learning_rate": 3.8001918596683576e-06, + "loss": 0.1362, + "step": 45991 + }, + { + "epoch": 3.023792729245795, + "grad_norm": 4.409530162811279, + "learning_rate": 3.8000548170481023e-06, + "loss": 0.1107, + "step": 45992 + }, + { + "epoch": 3.02380629408573, + "grad_norm": 5.348748207092285, + "learning_rate": 3.7999177744278475e-06, + "loss": 0.2105, + "step": 45993 + }, + { + "epoch": 3.023819858925665, + "grad_norm": 4.646729469299316, + "learning_rate": 3.7997807318075922e-06, + "loss": 0.2182, + "step": 45994 + }, + { + "epoch": 3.0238334237655997, + "grad_norm": 3.86647891998291, + "learning_rate": 3.799643689187338e-06, + "loss": 0.0746, + "step": 45995 + }, + { + "epoch": 3.0238469886055346, + "grad_norm": 5.513218402862549, + "learning_rate": 3.7995066465670826e-06, + "loss": 0.284, + "step": 45996 + }, + { + "epoch": 3.0238605534454694, + "grad_norm": 5.350312232971191, + "learning_rate": 3.7993696039468274e-06, + "loss": 0.1468, + "step": 45997 + }, + { + "epoch": 3.0238741182854043, + "grad_norm": 4.111014366149902, + "learning_rate": 3.799232561326573e-06, + "loss": 0.1377, + "step": 45998 + }, + { + "epoch": 3.023887683125339, + "grad_norm": 4.736669063568115, + "learning_rate": 3.7990955187063177e-06, + "loss": 0.2005, + "step": 45999 + }, + { + "epoch": 3.023901247965274, + "grad_norm": 6.954433441162109, + "learning_rate": 3.7989584760860633e-06, + "loss": 0.2606, + "step": 46000 + }, + { + "epoch": 3.023914812805209, + "grad_norm": 7.290079116821289, + "learning_rate": 3.798821433465808e-06, + "loss": 0.2079, + "step": 46001 + }, + { + "epoch": 3.0239283776451438, + "grad_norm": 6.217586040496826, + "learning_rate": 3.7986843908455532e-06, + "loss": 0.1945, + "step": 46002 + }, + { + "epoch": 3.0239419424850786, + "grad_norm": 5.7132487297058105, + "learning_rate": 3.7985473482252984e-06, + "loss": 0.1357, + "step": 46003 + }, + { + "epoch": 3.0239555073250135, + "grad_norm": 5.048947811126709, + "learning_rate": 3.7984103056050436e-06, + "loss": 0.1224, + "step": 46004 + }, + { + "epoch": 3.0239690721649484, + "grad_norm": 4.903604507446289, + "learning_rate": 3.7982732629847884e-06, + "loss": 0.1601, + "step": 46005 + }, + { + "epoch": 3.0239826370048832, + "grad_norm": 5.219593524932861, + "learning_rate": 3.798136220364534e-06, + "loss": 0.1417, + "step": 46006 + }, + { + "epoch": 3.023996201844818, + "grad_norm": 5.201177597045898, + "learning_rate": 3.7979991777442787e-06, + "loss": 0.1804, + "step": 46007 + }, + { + "epoch": 3.024009766684753, + "grad_norm": 4.558902263641357, + "learning_rate": 3.7978621351240243e-06, + "loss": 0.1876, + "step": 46008 + }, + { + "epoch": 3.024023331524688, + "grad_norm": 6.0276360511779785, + "learning_rate": 3.797725092503769e-06, + "loss": 0.0997, + "step": 46009 + }, + { + "epoch": 3.0240368963646227, + "grad_norm": 4.7328619956970215, + "learning_rate": 3.797588049883514e-06, + "loss": 0.113, + "step": 46010 + }, + { + "epoch": 3.024050461204558, + "grad_norm": 4.6013875007629395, + "learning_rate": 3.797451007263259e-06, + "loss": 0.1351, + "step": 46011 + }, + { + "epoch": 3.024064026044493, + "grad_norm": 4.146374225616455, + "learning_rate": 3.797313964643004e-06, + "loss": 0.1347, + "step": 46012 + }, + { + "epoch": 3.0240775908844277, + "grad_norm": 4.608048439025879, + "learning_rate": 3.7971769220227494e-06, + "loss": 0.2007, + "step": 46013 + }, + { + "epoch": 3.0240911557243626, + "grad_norm": 5.09961462020874, + "learning_rate": 3.797039879402494e-06, + "loss": 0.1472, + "step": 46014 + }, + { + "epoch": 3.0241047205642975, + "grad_norm": 5.179697036743164, + "learning_rate": 3.7969028367822397e-06, + "loss": 0.1473, + "step": 46015 + }, + { + "epoch": 3.0241182854042323, + "grad_norm": 3.4150140285491943, + "learning_rate": 3.7967657941619845e-06, + "loss": 0.1008, + "step": 46016 + }, + { + "epoch": 3.024131850244167, + "grad_norm": 5.608922958374023, + "learning_rate": 3.79662875154173e-06, + "loss": 0.1676, + "step": 46017 + }, + { + "epoch": 3.024145415084102, + "grad_norm": 5.008754253387451, + "learning_rate": 3.796491708921475e-06, + "loss": 0.1553, + "step": 46018 + }, + { + "epoch": 3.024158979924037, + "grad_norm": 5.131724834442139, + "learning_rate": 3.79635466630122e-06, + "loss": 0.1462, + "step": 46019 + }, + { + "epoch": 3.024172544763972, + "grad_norm": 5.091279029846191, + "learning_rate": 3.796217623680965e-06, + "loss": 0.1046, + "step": 46020 + }, + { + "epoch": 3.0241861096039067, + "grad_norm": 4.433312892913818, + "learning_rate": 3.7960805810607104e-06, + "loss": 0.0849, + "step": 46021 + }, + { + "epoch": 3.0241996744438415, + "grad_norm": 5.424391269683838, + "learning_rate": 3.795943538440455e-06, + "loss": 0.1799, + "step": 46022 + }, + { + "epoch": 3.0242132392837764, + "grad_norm": 2.7945876121520996, + "learning_rate": 3.7958064958202007e-06, + "loss": 0.0612, + "step": 46023 + }, + { + "epoch": 3.0242268041237113, + "grad_norm": 4.698021411895752, + "learning_rate": 3.7956694531999455e-06, + "loss": 0.1235, + "step": 46024 + }, + { + "epoch": 3.024240368963646, + "grad_norm": 4.56514310836792, + "learning_rate": 3.7955324105796902e-06, + "loss": 0.1817, + "step": 46025 + }, + { + "epoch": 3.024253933803581, + "grad_norm": 5.419850826263428, + "learning_rate": 3.795395367959436e-06, + "loss": 0.2071, + "step": 46026 + }, + { + "epoch": 3.024267498643516, + "grad_norm": 3.617016553878784, + "learning_rate": 3.7952583253391806e-06, + "loss": 0.0704, + "step": 46027 + }, + { + "epoch": 3.0242810634834507, + "grad_norm": 4.316855430603027, + "learning_rate": 3.7951212827189258e-06, + "loss": 0.1411, + "step": 46028 + }, + { + "epoch": 3.0242946283233856, + "grad_norm": 5.227485656738281, + "learning_rate": 3.794984240098671e-06, + "loss": 0.0825, + "step": 46029 + }, + { + "epoch": 3.024308193163321, + "grad_norm": 5.364439964294434, + "learning_rate": 3.794847197478416e-06, + "loss": 0.1695, + "step": 46030 + }, + { + "epoch": 3.0243217580032558, + "grad_norm": 3.6928305625915527, + "learning_rate": 3.794710154858161e-06, + "loss": 0.0663, + "step": 46031 + }, + { + "epoch": 3.0243353228431906, + "grad_norm": 3.933628797531128, + "learning_rate": 3.7945731122379065e-06, + "loss": 0.1203, + "step": 46032 + }, + { + "epoch": 3.0243488876831255, + "grad_norm": 4.547831058502197, + "learning_rate": 3.7944360696176513e-06, + "loss": 0.1272, + "step": 46033 + }, + { + "epoch": 3.0243624525230604, + "grad_norm": 4.06223726272583, + "learning_rate": 3.794299026997397e-06, + "loss": 0.1048, + "step": 46034 + }, + { + "epoch": 3.024376017362995, + "grad_norm": 3.5967652797698975, + "learning_rate": 3.7941619843771416e-06, + "loss": 0.0836, + "step": 46035 + }, + { + "epoch": 3.02438958220293, + "grad_norm": 4.459416389465332, + "learning_rate": 3.794024941756887e-06, + "loss": 0.1472, + "step": 46036 + }, + { + "epoch": 3.024403147042865, + "grad_norm": 3.4705970287323, + "learning_rate": 3.793887899136632e-06, + "loss": 0.0961, + "step": 46037 + }, + { + "epoch": 3.0244167118828, + "grad_norm": 5.5349626541137695, + "learning_rate": 3.7937508565163767e-06, + "loss": 0.1711, + "step": 46038 + }, + { + "epoch": 3.0244302767227347, + "grad_norm": 5.315016269683838, + "learning_rate": 3.793613813896122e-06, + "loss": 0.1626, + "step": 46039 + }, + { + "epoch": 3.0244438415626695, + "grad_norm": 6.283857822418213, + "learning_rate": 3.793476771275867e-06, + "loss": 0.144, + "step": 46040 + }, + { + "epoch": 3.0244574064026044, + "grad_norm": 4.22852897644043, + "learning_rate": 3.7933397286556123e-06, + "loss": 0.0747, + "step": 46041 + }, + { + "epoch": 3.0244709712425393, + "grad_norm": 4.057534694671631, + "learning_rate": 3.793202686035357e-06, + "loss": 0.0685, + "step": 46042 + }, + { + "epoch": 3.024484536082474, + "grad_norm": 3.231600284576416, + "learning_rate": 3.7930656434151026e-06, + "loss": 0.0686, + "step": 46043 + }, + { + "epoch": 3.024498100922409, + "grad_norm": 4.126562595367432, + "learning_rate": 3.7929286007948474e-06, + "loss": 0.1503, + "step": 46044 + }, + { + "epoch": 3.024511665762344, + "grad_norm": 4.942573070526123, + "learning_rate": 3.792791558174593e-06, + "loss": 0.0885, + "step": 46045 + }, + { + "epoch": 3.0245252306022787, + "grad_norm": 5.228277206420898, + "learning_rate": 3.7926545155543377e-06, + "loss": 0.131, + "step": 46046 + }, + { + "epoch": 3.0245387954422136, + "grad_norm": 5.632019519805908, + "learning_rate": 3.792517472934083e-06, + "loss": 0.0848, + "step": 46047 + }, + { + "epoch": 3.0245523602821485, + "grad_norm": 4.958511829376221, + "learning_rate": 3.7923804303138277e-06, + "loss": 0.1637, + "step": 46048 + }, + { + "epoch": 3.0245659251220838, + "grad_norm": 3.885162591934204, + "learning_rate": 3.7922433876935733e-06, + "loss": 0.114, + "step": 46049 + }, + { + "epoch": 3.0245794899620186, + "grad_norm": 3.6797549724578857, + "learning_rate": 3.792106345073318e-06, + "loss": 0.0636, + "step": 46050 + }, + { + "epoch": 3.0245930548019535, + "grad_norm": 5.095202922821045, + "learning_rate": 3.7919693024530628e-06, + "loss": 0.1529, + "step": 46051 + }, + { + "epoch": 3.0246066196418884, + "grad_norm": 4.654642105102539, + "learning_rate": 3.7918322598328084e-06, + "loss": 0.0903, + "step": 46052 + }, + { + "epoch": 3.0246201844818232, + "grad_norm": 3.5332908630371094, + "learning_rate": 3.791695217212553e-06, + "loss": 0.0699, + "step": 46053 + }, + { + "epoch": 3.024633749321758, + "grad_norm": 5.335927963256836, + "learning_rate": 3.7915581745922987e-06, + "loss": 0.1309, + "step": 46054 + }, + { + "epoch": 3.024647314161693, + "grad_norm": 4.039125442504883, + "learning_rate": 3.7914211319720435e-06, + "loss": 0.0896, + "step": 46055 + }, + { + "epoch": 3.024660879001628, + "grad_norm": 4.905579566955566, + "learning_rate": 3.7912840893517887e-06, + "loss": 0.1356, + "step": 46056 + }, + { + "epoch": 3.0246744438415627, + "grad_norm": 4.363735198974609, + "learning_rate": 3.791147046731534e-06, + "loss": 0.1025, + "step": 46057 + }, + { + "epoch": 3.0246880086814976, + "grad_norm": 2.787053346633911, + "learning_rate": 3.791010004111279e-06, + "loss": 0.0649, + "step": 46058 + }, + { + "epoch": 3.0247015735214324, + "grad_norm": 3.990654468536377, + "learning_rate": 3.7908729614910238e-06, + "loss": 0.0885, + "step": 46059 + }, + { + "epoch": 3.0247151383613673, + "grad_norm": 3.052774429321289, + "learning_rate": 3.7907359188707694e-06, + "loss": 0.0527, + "step": 46060 + }, + { + "epoch": 3.024728703201302, + "grad_norm": 5.2158098220825195, + "learning_rate": 3.790598876250514e-06, + "loss": 0.0955, + "step": 46061 + }, + { + "epoch": 3.024742268041237, + "grad_norm": 4.2262282371521, + "learning_rate": 3.7904618336302597e-06, + "loss": 0.1012, + "step": 46062 + }, + { + "epoch": 3.024755832881172, + "grad_norm": 4.42026424407959, + "learning_rate": 3.7903247910100045e-06, + "loss": 0.0734, + "step": 46063 + }, + { + "epoch": 3.0247693977211068, + "grad_norm": 6.314115524291992, + "learning_rate": 3.7901877483897497e-06, + "loss": 0.1176, + "step": 46064 + }, + { + "epoch": 3.0247829625610416, + "grad_norm": 5.760661602020264, + "learning_rate": 3.7900507057694944e-06, + "loss": 0.1261, + "step": 46065 + }, + { + "epoch": 3.0247965274009765, + "grad_norm": 3.3803365230560303, + "learning_rate": 3.7899136631492396e-06, + "loss": 0.0503, + "step": 46066 + }, + { + "epoch": 3.0248100922409114, + "grad_norm": 5.718052864074707, + "learning_rate": 3.789776620528985e-06, + "loss": 0.0969, + "step": 46067 + }, + { + "epoch": 3.0248236570808467, + "grad_norm": 3.901977062225342, + "learning_rate": 3.7896395779087295e-06, + "loss": 0.082, + "step": 46068 + }, + { + "epoch": 3.0248372219207815, + "grad_norm": 4.591672897338867, + "learning_rate": 3.789502535288475e-06, + "loss": 0.1453, + "step": 46069 + }, + { + "epoch": 3.0248507867607164, + "grad_norm": 4.797634601593018, + "learning_rate": 3.78936549266822e-06, + "loss": 0.1165, + "step": 46070 + }, + { + "epoch": 3.0248643516006513, + "grad_norm": 3.7296860218048096, + "learning_rate": 3.7892284500479655e-06, + "loss": 0.0992, + "step": 46071 + }, + { + "epoch": 3.024877916440586, + "grad_norm": 4.792463779449463, + "learning_rate": 3.7890914074277103e-06, + "loss": 0.1722, + "step": 46072 + }, + { + "epoch": 3.024891481280521, + "grad_norm": 3.4552199840545654, + "learning_rate": 3.7889543648074554e-06, + "loss": 0.0574, + "step": 46073 + }, + { + "epoch": 3.024905046120456, + "grad_norm": 5.447633743286133, + "learning_rate": 3.7888173221872006e-06, + "loss": 0.1322, + "step": 46074 + }, + { + "epoch": 3.0249186109603907, + "grad_norm": 3.5818967819213867, + "learning_rate": 3.788680279566946e-06, + "loss": 0.0741, + "step": 46075 + }, + { + "epoch": 3.0249321758003256, + "grad_norm": 4.496258735656738, + "learning_rate": 3.7885432369466906e-06, + "loss": 0.1441, + "step": 46076 + }, + { + "epoch": 3.0249457406402604, + "grad_norm": 5.932643890380859, + "learning_rate": 3.788406194326436e-06, + "loss": 0.1851, + "step": 46077 + }, + { + "epoch": 3.0249593054801953, + "grad_norm": 4.294133186340332, + "learning_rate": 3.788269151706181e-06, + "loss": 0.0862, + "step": 46078 + }, + { + "epoch": 3.02497287032013, + "grad_norm": 4.175201416015625, + "learning_rate": 3.7881321090859257e-06, + "loss": 0.098, + "step": 46079 + }, + { + "epoch": 3.024986435160065, + "grad_norm": 3.0663013458251953, + "learning_rate": 3.7879950664656713e-06, + "loss": 0.1023, + "step": 46080 + }, + { + "epoch": 3.025, + "grad_norm": 4.1407084465026855, + "learning_rate": 3.787858023845416e-06, + "loss": 0.0658, + "step": 46081 + }, + { + "epoch": 3.0250135648399348, + "grad_norm": 4.959621429443359, + "learning_rate": 3.787720981225161e-06, + "loss": 0.0979, + "step": 46082 + }, + { + "epoch": 3.0250271296798696, + "grad_norm": 4.875513076782227, + "learning_rate": 3.7875839386049064e-06, + "loss": 0.1037, + "step": 46083 + }, + { + "epoch": 3.0250406945198045, + "grad_norm": 4.031320095062256, + "learning_rate": 3.7874468959846516e-06, + "loss": 0.1144, + "step": 46084 + }, + { + "epoch": 3.0250542593597394, + "grad_norm": 4.829703330993652, + "learning_rate": 3.7873098533643963e-06, + "loss": 0.1427, + "step": 46085 + }, + { + "epoch": 3.0250678241996742, + "grad_norm": 4.931778907775879, + "learning_rate": 3.787172810744142e-06, + "loss": 0.1083, + "step": 46086 + }, + { + "epoch": 3.0250813890396095, + "grad_norm": 6.183300971984863, + "learning_rate": 3.7870357681238867e-06, + "loss": 0.1424, + "step": 46087 + }, + { + "epoch": 3.0250949538795444, + "grad_norm": 5.178365707397461, + "learning_rate": 3.7868987255036323e-06, + "loss": 0.1562, + "step": 46088 + }, + { + "epoch": 3.0251085187194793, + "grad_norm": 5.462970733642578, + "learning_rate": 3.786761682883377e-06, + "loss": 0.1239, + "step": 46089 + }, + { + "epoch": 3.025122083559414, + "grad_norm": 3.9174726009368896, + "learning_rate": 3.7866246402631222e-06, + "loss": 0.0696, + "step": 46090 + }, + { + "epoch": 3.025135648399349, + "grad_norm": 7.310305118560791, + "learning_rate": 3.7864875976428674e-06, + "loss": 0.169, + "step": 46091 + }, + { + "epoch": 3.025149213239284, + "grad_norm": 5.345200061798096, + "learning_rate": 3.7863505550226126e-06, + "loss": 0.1228, + "step": 46092 + }, + { + "epoch": 3.0251627780792187, + "grad_norm": 4.0954179763793945, + "learning_rate": 3.7862135124023573e-06, + "loss": 0.0717, + "step": 46093 + }, + { + "epoch": 3.0251763429191536, + "grad_norm": 5.119017124176025, + "learning_rate": 3.7860764697821025e-06, + "loss": 0.1, + "step": 46094 + }, + { + "epoch": 3.0251899077590885, + "grad_norm": 4.40755558013916, + "learning_rate": 3.7859394271618477e-06, + "loss": 0.1211, + "step": 46095 + }, + { + "epoch": 3.0252034725990233, + "grad_norm": 4.257655620574951, + "learning_rate": 3.7858023845415924e-06, + "loss": 0.1051, + "step": 46096 + }, + { + "epoch": 3.025217037438958, + "grad_norm": 6.305569648742676, + "learning_rate": 3.785665341921338e-06, + "loss": 0.1726, + "step": 46097 + }, + { + "epoch": 3.025230602278893, + "grad_norm": 3.402838945388794, + "learning_rate": 3.785528299301083e-06, + "loss": 0.068, + "step": 46098 + }, + { + "epoch": 3.025244167118828, + "grad_norm": 4.87168025970459, + "learning_rate": 3.785391256680828e-06, + "loss": 0.2049, + "step": 46099 + }, + { + "epoch": 3.025257731958763, + "grad_norm": 2.222079277038574, + "learning_rate": 3.785254214060573e-06, + "loss": 0.0457, + "step": 46100 + }, + { + "epoch": 3.0252712967986977, + "grad_norm": 4.201146602630615, + "learning_rate": 3.7851171714403183e-06, + "loss": 0.1421, + "step": 46101 + }, + { + "epoch": 3.0252848616386325, + "grad_norm": 4.5301408767700195, + "learning_rate": 3.784980128820063e-06, + "loss": 0.1001, + "step": 46102 + }, + { + "epoch": 3.0252984264785674, + "grad_norm": 3.893155336380005, + "learning_rate": 3.7848430861998087e-06, + "loss": 0.0668, + "step": 46103 + }, + { + "epoch": 3.0253119913185023, + "grad_norm": 4.182188987731934, + "learning_rate": 3.7847060435795534e-06, + "loss": 0.0888, + "step": 46104 + }, + { + "epoch": 3.025325556158437, + "grad_norm": 7.184691429138184, + "learning_rate": 3.784569000959299e-06, + "loss": 0.1683, + "step": 46105 + }, + { + "epoch": 3.0253391209983724, + "grad_norm": 6.048331260681152, + "learning_rate": 3.784431958339044e-06, + "loss": 0.1249, + "step": 46106 + }, + { + "epoch": 3.0253526858383073, + "grad_norm": 4.7924957275390625, + "learning_rate": 3.7842949157187886e-06, + "loss": 0.1364, + "step": 46107 + }, + { + "epoch": 3.025366250678242, + "grad_norm": 3.8439955711364746, + "learning_rate": 3.784157873098534e-06, + "loss": 0.0672, + "step": 46108 + }, + { + "epoch": 3.025379815518177, + "grad_norm": 3.963622570037842, + "learning_rate": 3.784020830478279e-06, + "loss": 0.1066, + "step": 46109 + }, + { + "epoch": 3.025393380358112, + "grad_norm": 4.756536483764648, + "learning_rate": 3.783883787858024e-06, + "loss": 0.0889, + "step": 46110 + }, + { + "epoch": 3.0254069451980468, + "grad_norm": 2.9960010051727295, + "learning_rate": 3.7837467452377693e-06, + "loss": 0.0375, + "step": 46111 + }, + { + "epoch": 3.0254205100379816, + "grad_norm": 3.832735300064087, + "learning_rate": 3.7836097026175145e-06, + "loss": 0.1346, + "step": 46112 + }, + { + "epoch": 3.0254340748779165, + "grad_norm": 4.475766658782959, + "learning_rate": 3.783472659997259e-06, + "loss": 0.1011, + "step": 46113 + }, + { + "epoch": 3.0254476397178514, + "grad_norm": 6.18635892868042, + "learning_rate": 3.783335617377005e-06, + "loss": 0.1327, + "step": 46114 + }, + { + "epoch": 3.025461204557786, + "grad_norm": 3.519127368927002, + "learning_rate": 3.7831985747567496e-06, + "loss": 0.1083, + "step": 46115 + }, + { + "epoch": 3.025474769397721, + "grad_norm": 5.029010772705078, + "learning_rate": 3.783061532136495e-06, + "loss": 0.1207, + "step": 46116 + }, + { + "epoch": 3.025488334237656, + "grad_norm": 4.714893341064453, + "learning_rate": 3.78292448951624e-06, + "loss": 0.133, + "step": 46117 + }, + { + "epoch": 3.025501899077591, + "grad_norm": 5.517729759216309, + "learning_rate": 3.782787446895985e-06, + "loss": 0.216, + "step": 46118 + }, + { + "epoch": 3.0255154639175257, + "grad_norm": 3.909345865249634, + "learning_rate": 3.78265040427573e-06, + "loss": 0.1216, + "step": 46119 + }, + { + "epoch": 3.0255290287574605, + "grad_norm": 4.516412258148193, + "learning_rate": 3.782513361655475e-06, + "loss": 0.1165, + "step": 46120 + }, + { + "epoch": 3.0255425935973954, + "grad_norm": 3.6340460777282715, + "learning_rate": 3.7823763190352202e-06, + "loss": 0.0865, + "step": 46121 + }, + { + "epoch": 3.0255561584373303, + "grad_norm": 4.7052531242370605, + "learning_rate": 3.782239276414965e-06, + "loss": 0.1188, + "step": 46122 + }, + { + "epoch": 3.025569723277265, + "grad_norm": 3.3303585052490234, + "learning_rate": 3.7821022337947106e-06, + "loss": 0.069, + "step": 46123 + }, + { + "epoch": 3.0255832881172, + "grad_norm": 3.969588041305542, + "learning_rate": 3.7819651911744553e-06, + "loss": 0.09, + "step": 46124 + }, + { + "epoch": 3.0255968529571353, + "grad_norm": 5.171687602996826, + "learning_rate": 3.781828148554201e-06, + "loss": 0.1286, + "step": 46125 + }, + { + "epoch": 3.02561041779707, + "grad_norm": 5.634045124053955, + "learning_rate": 3.7816911059339457e-06, + "loss": 0.1032, + "step": 46126 + }, + { + "epoch": 3.025623982637005, + "grad_norm": 4.309391498565674, + "learning_rate": 3.781554063313691e-06, + "loss": 0.1583, + "step": 46127 + }, + { + "epoch": 3.02563754747694, + "grad_norm": 4.453033924102783, + "learning_rate": 3.781417020693436e-06, + "loss": 0.1053, + "step": 46128 + }, + { + "epoch": 3.025651112316875, + "grad_norm": 3.7499279975891113, + "learning_rate": 3.7812799780731812e-06, + "loss": 0.0956, + "step": 46129 + }, + { + "epoch": 3.0256646771568096, + "grad_norm": 4.084914207458496, + "learning_rate": 3.781142935452926e-06, + "loss": 0.1475, + "step": 46130 + }, + { + "epoch": 3.0256782419967445, + "grad_norm": 4.093032360076904, + "learning_rate": 3.7810058928326716e-06, + "loss": 0.1462, + "step": 46131 + }, + { + "epoch": 3.0256918068366794, + "grad_norm": 4.956493377685547, + "learning_rate": 3.7808688502124163e-06, + "loss": 0.1589, + "step": 46132 + }, + { + "epoch": 3.0257053716766142, + "grad_norm": 4.193918704986572, + "learning_rate": 3.780731807592162e-06, + "loss": 0.1087, + "step": 46133 + }, + { + "epoch": 3.025718936516549, + "grad_norm": 3.3931126594543457, + "learning_rate": 3.7805947649719067e-06, + "loss": 0.0636, + "step": 46134 + }, + { + "epoch": 3.025732501356484, + "grad_norm": 4.021085739135742, + "learning_rate": 3.7804577223516515e-06, + "loss": 0.1013, + "step": 46135 + }, + { + "epoch": 3.025746066196419, + "grad_norm": 3.213837146759033, + "learning_rate": 3.7803206797313966e-06, + "loss": 0.0696, + "step": 46136 + }, + { + "epoch": 3.0257596310363537, + "grad_norm": 3.6189351081848145, + "learning_rate": 3.780183637111142e-06, + "loss": 0.0967, + "step": 46137 + }, + { + "epoch": 3.0257731958762886, + "grad_norm": 4.248162269592285, + "learning_rate": 3.780046594490887e-06, + "loss": 0.1809, + "step": 46138 + }, + { + "epoch": 3.0257867607162234, + "grad_norm": 3.350942373275757, + "learning_rate": 3.7799095518706317e-06, + "loss": 0.0781, + "step": 46139 + }, + { + "epoch": 3.0258003255561583, + "grad_norm": 4.61983060836792, + "learning_rate": 3.7797725092503773e-06, + "loss": 0.0967, + "step": 46140 + }, + { + "epoch": 3.025813890396093, + "grad_norm": 5.127768039703369, + "learning_rate": 3.779635466630122e-06, + "loss": 0.1467, + "step": 46141 + }, + { + "epoch": 3.025827455236028, + "grad_norm": 5.5586957931518555, + "learning_rate": 3.7794984240098677e-06, + "loss": 0.153, + "step": 46142 + }, + { + "epoch": 3.025841020075963, + "grad_norm": 4.280720233917236, + "learning_rate": 3.7793613813896125e-06, + "loss": 0.0929, + "step": 46143 + }, + { + "epoch": 3.025854584915898, + "grad_norm": 2.8893699645996094, + "learning_rate": 3.7792243387693576e-06, + "loss": 0.0589, + "step": 46144 + }, + { + "epoch": 3.025868149755833, + "grad_norm": 3.5667805671691895, + "learning_rate": 3.779087296149103e-06, + "loss": 0.0851, + "step": 46145 + }, + { + "epoch": 3.025881714595768, + "grad_norm": 7.0259175300598145, + "learning_rate": 3.778950253528848e-06, + "loss": 0.1406, + "step": 46146 + }, + { + "epoch": 3.025895279435703, + "grad_norm": 4.775197505950928, + "learning_rate": 3.7788132109085928e-06, + "loss": 0.1164, + "step": 46147 + }, + { + "epoch": 3.0259088442756377, + "grad_norm": 4.971026420593262, + "learning_rate": 3.778676168288338e-06, + "loss": 0.1703, + "step": 46148 + }, + { + "epoch": 3.0259224091155725, + "grad_norm": 3.285595178604126, + "learning_rate": 3.778539125668083e-06, + "loss": 0.0634, + "step": 46149 + }, + { + "epoch": 3.0259359739555074, + "grad_norm": 5.708617210388184, + "learning_rate": 3.778402083047828e-06, + "loss": 0.1576, + "step": 46150 + }, + { + "epoch": 3.0259495387954423, + "grad_norm": 3.806344747543335, + "learning_rate": 3.7782650404275735e-06, + "loss": 0.0881, + "step": 46151 + }, + { + "epoch": 3.025963103635377, + "grad_norm": 5.210719108581543, + "learning_rate": 3.7781279978073182e-06, + "loss": 0.1092, + "step": 46152 + }, + { + "epoch": 3.025976668475312, + "grad_norm": 3.9292917251586914, + "learning_rate": 3.7779909551870634e-06, + "loss": 0.0977, + "step": 46153 + }, + { + "epoch": 3.025990233315247, + "grad_norm": 3.96612548828125, + "learning_rate": 3.7778539125668086e-06, + "loss": 0.0861, + "step": 46154 + }, + { + "epoch": 3.0260037981551817, + "grad_norm": 3.8730995655059814, + "learning_rate": 3.7777168699465538e-06, + "loss": 0.0875, + "step": 46155 + }, + { + "epoch": 3.0260173629951166, + "grad_norm": 5.894473075866699, + "learning_rate": 3.7775798273262985e-06, + "loss": 0.1666, + "step": 46156 + }, + { + "epoch": 3.0260309278350515, + "grad_norm": 4.066412925720215, + "learning_rate": 3.777442784706044e-06, + "loss": 0.0707, + "step": 46157 + }, + { + "epoch": 3.0260444926749863, + "grad_norm": 3.5090675354003906, + "learning_rate": 3.777305742085789e-06, + "loss": 0.0961, + "step": 46158 + }, + { + "epoch": 3.026058057514921, + "grad_norm": 4.757835865020752, + "learning_rate": 3.7771686994655345e-06, + "loss": 0.1412, + "step": 46159 + }, + { + "epoch": 3.026071622354856, + "grad_norm": 3.9649717807769775, + "learning_rate": 3.7770316568452792e-06, + "loss": 0.0912, + "step": 46160 + }, + { + "epoch": 3.026085187194791, + "grad_norm": 3.75789213180542, + "learning_rate": 3.776894614225024e-06, + "loss": 0.075, + "step": 46161 + }, + { + "epoch": 3.026098752034726, + "grad_norm": 3.036013603210449, + "learning_rate": 3.7767575716047696e-06, + "loss": 0.1048, + "step": 46162 + }, + { + "epoch": 3.026112316874661, + "grad_norm": 4.253285884857178, + "learning_rate": 3.7766205289845143e-06, + "loss": 0.1002, + "step": 46163 + }, + { + "epoch": 3.026125881714596, + "grad_norm": 2.8527350425720215, + "learning_rate": 3.7764834863642595e-06, + "loss": 0.0526, + "step": 46164 + }, + { + "epoch": 3.026139446554531, + "grad_norm": 4.312983512878418, + "learning_rate": 3.7763464437440047e-06, + "loss": 0.1084, + "step": 46165 + }, + { + "epoch": 3.0261530113944657, + "grad_norm": 2.243210554122925, + "learning_rate": 3.77620940112375e-06, + "loss": 0.0793, + "step": 46166 + }, + { + "epoch": 3.0261665762344006, + "grad_norm": 2.4815468788146973, + "learning_rate": 3.7760723585034946e-06, + "loss": 0.0709, + "step": 46167 + }, + { + "epoch": 3.0261801410743354, + "grad_norm": 3.9809939861297607, + "learning_rate": 3.7759353158832402e-06, + "loss": 0.0869, + "step": 46168 + }, + { + "epoch": 3.0261937059142703, + "grad_norm": 4.989400863647461, + "learning_rate": 3.775798273262985e-06, + "loss": 0.1129, + "step": 46169 + }, + { + "epoch": 3.026207270754205, + "grad_norm": 3.954554319381714, + "learning_rate": 3.77566123064273e-06, + "loss": 0.1163, + "step": 46170 + }, + { + "epoch": 3.02622083559414, + "grad_norm": 3.231454372406006, + "learning_rate": 3.7755241880224754e-06, + "loss": 0.0946, + "step": 46171 + }, + { + "epoch": 3.026234400434075, + "grad_norm": 3.940837860107422, + "learning_rate": 3.7753871454022205e-06, + "loss": 0.0867, + "step": 46172 + }, + { + "epoch": 3.0262479652740097, + "grad_norm": 3.3510849475860596, + "learning_rate": 3.7752501027819653e-06, + "loss": 0.0567, + "step": 46173 + }, + { + "epoch": 3.0262615301139446, + "grad_norm": 3.0126800537109375, + "learning_rate": 3.775113060161711e-06, + "loss": 0.0753, + "step": 46174 + }, + { + "epoch": 3.0262750949538795, + "grad_norm": 4.36277961730957, + "learning_rate": 3.7749760175414556e-06, + "loss": 0.1166, + "step": 46175 + }, + { + "epoch": 3.0262886597938143, + "grad_norm": 4.383848190307617, + "learning_rate": 3.7748389749212004e-06, + "loss": 0.0936, + "step": 46176 + }, + { + "epoch": 3.026302224633749, + "grad_norm": 5.3842453956604, + "learning_rate": 3.774701932300946e-06, + "loss": 0.1301, + "step": 46177 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 4.360657215118408, + "learning_rate": 3.7745648896806908e-06, + "loss": 0.093, + "step": 46178 + }, + { + "epoch": 3.026329354313619, + "grad_norm": 3.9236652851104736, + "learning_rate": 3.7744278470604364e-06, + "loss": 0.0809, + "step": 46179 + }, + { + "epoch": 3.026342919153554, + "grad_norm": 4.907344341278076, + "learning_rate": 3.774290804440181e-06, + "loss": 0.0832, + "step": 46180 + }, + { + "epoch": 3.0263564839934887, + "grad_norm": 3.755652666091919, + "learning_rate": 3.7741537618199263e-06, + "loss": 0.0937, + "step": 46181 + }, + { + "epoch": 3.026370048833424, + "grad_norm": 4.508263111114502, + "learning_rate": 3.7740167191996715e-06, + "loss": 0.0919, + "step": 46182 + }, + { + "epoch": 3.026383613673359, + "grad_norm": 4.92686128616333, + "learning_rate": 3.7738796765794167e-06, + "loss": 0.0872, + "step": 46183 + }, + { + "epoch": 3.0263971785132937, + "grad_norm": 2.833554267883301, + "learning_rate": 3.7737426339591614e-06, + "loss": 0.0623, + "step": 46184 + }, + { + "epoch": 3.0264107433532286, + "grad_norm": 3.539875030517578, + "learning_rate": 3.773605591338907e-06, + "loss": 0.1122, + "step": 46185 + }, + { + "epoch": 3.0264243081931634, + "grad_norm": 5.099429130554199, + "learning_rate": 3.7734685487186518e-06, + "loss": 0.1193, + "step": 46186 + }, + { + "epoch": 3.0264378730330983, + "grad_norm": 4.190959453582764, + "learning_rate": 3.7733315060983974e-06, + "loss": 0.0992, + "step": 46187 + }, + { + "epoch": 3.026451437873033, + "grad_norm": 3.3506500720977783, + "learning_rate": 3.773194463478142e-06, + "loss": 0.0847, + "step": 46188 + }, + { + "epoch": 3.026465002712968, + "grad_norm": 4.919890403747559, + "learning_rate": 3.773057420857887e-06, + "loss": 0.1205, + "step": 46189 + }, + { + "epoch": 3.026478567552903, + "grad_norm": 4.252690315246582, + "learning_rate": 3.772920378237632e-06, + "loss": 0.0773, + "step": 46190 + }, + { + "epoch": 3.0264921323928378, + "grad_norm": 3.2733261585235596, + "learning_rate": 3.7727833356173772e-06, + "loss": 0.0509, + "step": 46191 + }, + { + "epoch": 3.0265056972327726, + "grad_norm": 7.397078037261963, + "learning_rate": 3.7726462929971224e-06, + "loss": 0.1633, + "step": 46192 + }, + { + "epoch": 3.0265192620727075, + "grad_norm": 5.4774603843688965, + "learning_rate": 3.772509250376867e-06, + "loss": 0.1129, + "step": 46193 + }, + { + "epoch": 3.0265328269126424, + "grad_norm": 3.3482649326324463, + "learning_rate": 3.7723722077566128e-06, + "loss": 0.0835, + "step": 46194 + }, + { + "epoch": 3.0265463917525772, + "grad_norm": 3.71933650970459, + "learning_rate": 3.7722351651363575e-06, + "loss": 0.0791, + "step": 46195 + }, + { + "epoch": 3.026559956592512, + "grad_norm": 4.573657989501953, + "learning_rate": 3.772098122516103e-06, + "loss": 0.1126, + "step": 46196 + }, + { + "epoch": 3.026573521432447, + "grad_norm": 3.1897947788238525, + "learning_rate": 3.771961079895848e-06, + "loss": 0.0407, + "step": 46197 + }, + { + "epoch": 3.026587086272382, + "grad_norm": 4.133139133453369, + "learning_rate": 3.771824037275593e-06, + "loss": 0.0965, + "step": 46198 + }, + { + "epoch": 3.0266006511123167, + "grad_norm": 2.9754130840301514, + "learning_rate": 3.7716869946553382e-06, + "loss": 0.0853, + "step": 46199 + }, + { + "epoch": 3.0266142159522516, + "grad_norm": 4.403057098388672, + "learning_rate": 3.7715499520350834e-06, + "loss": 0.0989, + "step": 46200 + }, + { + "epoch": 3.026627780792187, + "grad_norm": 3.6460187435150146, + "learning_rate": 3.771412909414828e-06, + "loss": 0.1817, + "step": 46201 + }, + { + "epoch": 3.0266413456321217, + "grad_norm": 3.8840949535369873, + "learning_rate": 3.7712758667945738e-06, + "loss": 0.1215, + "step": 46202 + }, + { + "epoch": 3.0266549104720566, + "grad_norm": 2.5005502700805664, + "learning_rate": 3.7711388241743185e-06, + "loss": 0.0846, + "step": 46203 + }, + { + "epoch": 3.0266684753119915, + "grad_norm": 2.970856189727783, + "learning_rate": 3.7710017815540633e-06, + "loss": 0.0621, + "step": 46204 + }, + { + "epoch": 3.0266820401519263, + "grad_norm": 5.757165908813477, + "learning_rate": 3.770864738933809e-06, + "loss": 0.1756, + "step": 46205 + }, + { + "epoch": 3.026695604991861, + "grad_norm": 3.753152847290039, + "learning_rate": 3.7707276963135536e-06, + "loss": 0.0847, + "step": 46206 + }, + { + "epoch": 3.026709169831796, + "grad_norm": 4.4670820236206055, + "learning_rate": 3.770590653693299e-06, + "loss": 0.1314, + "step": 46207 + }, + { + "epoch": 3.026722734671731, + "grad_norm": 6.005481719970703, + "learning_rate": 3.770453611073044e-06, + "loss": 0.1259, + "step": 46208 + }, + { + "epoch": 3.026736299511666, + "grad_norm": 4.972628116607666, + "learning_rate": 3.770316568452789e-06, + "loss": 0.1211, + "step": 46209 + }, + { + "epoch": 3.0267498643516006, + "grad_norm": 4.079394817352295, + "learning_rate": 3.770179525832534e-06, + "loss": 0.0495, + "step": 46210 + }, + { + "epoch": 3.0267634291915355, + "grad_norm": 7.274449348449707, + "learning_rate": 3.7700424832122795e-06, + "loss": 0.182, + "step": 46211 + }, + { + "epoch": 3.0267769940314704, + "grad_norm": 3.302093744277954, + "learning_rate": 3.7699054405920243e-06, + "loss": 0.0839, + "step": 46212 + }, + { + "epoch": 3.0267905588714052, + "grad_norm": 3.0846171379089355, + "learning_rate": 3.76976839797177e-06, + "loss": 0.0595, + "step": 46213 + }, + { + "epoch": 3.02680412371134, + "grad_norm": 3.7088611125946045, + "learning_rate": 3.7696313553515147e-06, + "loss": 0.1182, + "step": 46214 + }, + { + "epoch": 3.026817688551275, + "grad_norm": 4.6181440353393555, + "learning_rate": 3.76949431273126e-06, + "loss": 0.1514, + "step": 46215 + }, + { + "epoch": 3.02683125339121, + "grad_norm": 4.541611194610596, + "learning_rate": 3.769357270111005e-06, + "loss": 0.1306, + "step": 46216 + }, + { + "epoch": 3.0268448182311447, + "grad_norm": 4.048205375671387, + "learning_rate": 3.7692202274907498e-06, + "loss": 0.0746, + "step": 46217 + }, + { + "epoch": 3.0268583830710796, + "grad_norm": 4.424219608306885, + "learning_rate": 3.769083184870495e-06, + "loss": 0.1425, + "step": 46218 + }, + { + "epoch": 3.0268719479110144, + "grad_norm": 3.696101665496826, + "learning_rate": 3.76894614225024e-06, + "loss": 0.1185, + "step": 46219 + }, + { + "epoch": 3.0268855127509497, + "grad_norm": 4.2552289962768555, + "learning_rate": 3.7688090996299853e-06, + "loss": 0.0773, + "step": 46220 + }, + { + "epoch": 3.0268990775908846, + "grad_norm": 2.551664352416992, + "learning_rate": 3.76867205700973e-06, + "loss": 0.0559, + "step": 46221 + }, + { + "epoch": 3.0269126424308195, + "grad_norm": 4.869203090667725, + "learning_rate": 3.7685350143894757e-06, + "loss": 0.1141, + "step": 46222 + }, + { + "epoch": 3.0269262072707543, + "grad_norm": 5.035330772399902, + "learning_rate": 3.7683979717692204e-06, + "loss": 0.1483, + "step": 46223 + }, + { + "epoch": 3.026939772110689, + "grad_norm": 3.595418691635132, + "learning_rate": 3.7682609291489656e-06, + "loss": 0.0835, + "step": 46224 + }, + { + "epoch": 3.026953336950624, + "grad_norm": 3.244166135787964, + "learning_rate": 3.7681238865287108e-06, + "loss": 0.0649, + "step": 46225 + }, + { + "epoch": 3.026966901790559, + "grad_norm": 4.658256530761719, + "learning_rate": 3.767986843908456e-06, + "loss": 0.0872, + "step": 46226 + }, + { + "epoch": 3.026980466630494, + "grad_norm": 4.366299152374268, + "learning_rate": 3.7678498012882007e-06, + "loss": 0.1459, + "step": 46227 + }, + { + "epoch": 3.0269940314704287, + "grad_norm": 3.342348575592041, + "learning_rate": 3.7677127586679463e-06, + "loss": 0.054, + "step": 46228 + }, + { + "epoch": 3.0270075963103635, + "grad_norm": 4.636653900146484, + "learning_rate": 3.767575716047691e-06, + "loss": 0.152, + "step": 46229 + }, + { + "epoch": 3.0270211611502984, + "grad_norm": 4.2371649742126465, + "learning_rate": 3.767438673427436e-06, + "loss": 0.0851, + "step": 46230 + }, + { + "epoch": 3.0270347259902333, + "grad_norm": 4.440272331237793, + "learning_rate": 3.7673016308071814e-06, + "loss": 0.1125, + "step": 46231 + }, + { + "epoch": 3.027048290830168, + "grad_norm": 4.3323187828063965, + "learning_rate": 3.767164588186926e-06, + "loss": 0.0772, + "step": 46232 + }, + { + "epoch": 3.027061855670103, + "grad_norm": 3.9972972869873047, + "learning_rate": 3.7670275455666718e-06, + "loss": 0.0837, + "step": 46233 + }, + { + "epoch": 3.027075420510038, + "grad_norm": 2.9835195541381836, + "learning_rate": 3.7668905029464165e-06, + "loss": 0.0805, + "step": 46234 + }, + { + "epoch": 3.0270889853499727, + "grad_norm": 2.97016978263855, + "learning_rate": 3.7667534603261617e-06, + "loss": 0.0523, + "step": 46235 + }, + { + "epoch": 3.0271025501899076, + "grad_norm": 3.9621896743774414, + "learning_rate": 3.766616417705907e-06, + "loss": 0.1595, + "step": 46236 + }, + { + "epoch": 3.0271161150298425, + "grad_norm": 4.012405872344971, + "learning_rate": 3.766479375085652e-06, + "loss": 0.1167, + "step": 46237 + }, + { + "epoch": 3.0271296798697773, + "grad_norm": 4.7018022537231445, + "learning_rate": 3.766342332465397e-06, + "loss": 0.0788, + "step": 46238 + }, + { + "epoch": 3.0271432447097126, + "grad_norm": 3.360013723373413, + "learning_rate": 3.7662052898451424e-06, + "loss": 0.1064, + "step": 46239 + }, + { + "epoch": 3.0271568095496475, + "grad_norm": 3.6119792461395264, + "learning_rate": 3.766068247224887e-06, + "loss": 0.1577, + "step": 46240 + }, + { + "epoch": 3.0271703743895824, + "grad_norm": 4.048095703125, + "learning_rate": 3.7659312046046324e-06, + "loss": 0.1333, + "step": 46241 + }, + { + "epoch": 3.0271839392295172, + "grad_norm": 4.118751049041748, + "learning_rate": 3.7657941619843775e-06, + "loss": 0.1386, + "step": 46242 + }, + { + "epoch": 3.027197504069452, + "grad_norm": 4.740245342254639, + "learning_rate": 3.7656571193641227e-06, + "loss": 0.1236, + "step": 46243 + }, + { + "epoch": 3.027211068909387, + "grad_norm": 6.263385772705078, + "learning_rate": 3.7655200767438675e-06, + "loss": 0.1063, + "step": 46244 + }, + { + "epoch": 3.027224633749322, + "grad_norm": 3.256751537322998, + "learning_rate": 3.7653830341236127e-06, + "loss": 0.1077, + "step": 46245 + }, + { + "epoch": 3.0272381985892567, + "grad_norm": 4.132912635803223, + "learning_rate": 3.765245991503358e-06, + "loss": 0.1704, + "step": 46246 + }, + { + "epoch": 3.0272517634291916, + "grad_norm": 5.509817123413086, + "learning_rate": 3.7651089488831026e-06, + "loss": 0.1207, + "step": 46247 + }, + { + "epoch": 3.0272653282691264, + "grad_norm": 2.617661714553833, + "learning_rate": 3.764971906262848e-06, + "loss": 0.0524, + "step": 46248 + }, + { + "epoch": 3.0272788931090613, + "grad_norm": 4.0055742263793945, + "learning_rate": 3.764834863642593e-06, + "loss": 0.1183, + "step": 46249 + }, + { + "epoch": 3.027292457948996, + "grad_norm": 5.38364839553833, + "learning_rate": 3.7646978210223386e-06, + "loss": 0.1493, + "step": 46250 + }, + { + "epoch": 3.027306022788931, + "grad_norm": 2.728024959564209, + "learning_rate": 3.7645607784020833e-06, + "loss": 0.0634, + "step": 46251 + }, + { + "epoch": 3.027319587628866, + "grad_norm": 4.963957786560059, + "learning_rate": 3.7644237357818285e-06, + "loss": 0.1511, + "step": 46252 + }, + { + "epoch": 3.0273331524688007, + "grad_norm": 3.67396879196167, + "learning_rate": 3.7642866931615737e-06, + "loss": 0.1004, + "step": 46253 + }, + { + "epoch": 3.0273467173087356, + "grad_norm": 3.02622127532959, + "learning_rate": 3.764149650541319e-06, + "loss": 0.0682, + "step": 46254 + }, + { + "epoch": 3.0273602821486705, + "grad_norm": 3.1625123023986816, + "learning_rate": 3.7640126079210636e-06, + "loss": 0.0943, + "step": 46255 + }, + { + "epoch": 3.0273738469886053, + "grad_norm": 2.7424964904785156, + "learning_rate": 3.763875565300809e-06, + "loss": 0.0774, + "step": 46256 + }, + { + "epoch": 3.02738741182854, + "grad_norm": 2.2650794982910156, + "learning_rate": 3.763738522680554e-06, + "loss": 0.0673, + "step": 46257 + }, + { + "epoch": 3.0274009766684755, + "grad_norm": 6.071569919586182, + "learning_rate": 3.7636014800602987e-06, + "loss": 0.188, + "step": 46258 + }, + { + "epoch": 3.0274145415084104, + "grad_norm": 4.149190902709961, + "learning_rate": 3.7634644374400443e-06, + "loss": 0.134, + "step": 46259 + }, + { + "epoch": 3.0274281063483452, + "grad_norm": 4.043160438537598, + "learning_rate": 3.763327394819789e-06, + "loss": 0.0983, + "step": 46260 + }, + { + "epoch": 3.02744167118828, + "grad_norm": 4.072265148162842, + "learning_rate": 3.7631903521995343e-06, + "loss": 0.1651, + "step": 46261 + }, + { + "epoch": 3.027455236028215, + "grad_norm": 6.751486301422119, + "learning_rate": 3.7630533095792794e-06, + "loss": 0.1459, + "step": 46262 + }, + { + "epoch": 3.02746880086815, + "grad_norm": 4.124353408813477, + "learning_rate": 3.7629162669590246e-06, + "loss": 0.1959, + "step": 46263 + }, + { + "epoch": 3.0274823657080847, + "grad_norm": 3.553205966949463, + "learning_rate": 3.7627792243387694e-06, + "loss": 0.1131, + "step": 46264 + }, + { + "epoch": 3.0274959305480196, + "grad_norm": 3.9453015327453613, + "learning_rate": 3.762642181718515e-06, + "loss": 0.0936, + "step": 46265 + }, + { + "epoch": 3.0275094953879544, + "grad_norm": 3.110645055770874, + "learning_rate": 3.7625051390982597e-06, + "loss": 0.1102, + "step": 46266 + }, + { + "epoch": 3.0275230602278893, + "grad_norm": 4.854971408843994, + "learning_rate": 3.7623680964780053e-06, + "loss": 0.1337, + "step": 46267 + }, + { + "epoch": 3.027536625067824, + "grad_norm": 2.6780591011047363, + "learning_rate": 3.76223105385775e-06, + "loss": 0.0956, + "step": 46268 + }, + { + "epoch": 3.027550189907759, + "grad_norm": 4.193848609924316, + "learning_rate": 3.7620940112374953e-06, + "loss": 0.1239, + "step": 46269 + }, + { + "epoch": 3.027563754747694, + "grad_norm": 5.505441665649414, + "learning_rate": 3.7619569686172404e-06, + "loss": 0.1261, + "step": 46270 + }, + { + "epoch": 3.0275773195876288, + "grad_norm": 3.373246192932129, + "learning_rate": 3.7618199259969856e-06, + "loss": 0.1266, + "step": 46271 + }, + { + "epoch": 3.0275908844275636, + "grad_norm": 4.48637580871582, + "learning_rate": 3.7616828833767304e-06, + "loss": 0.1348, + "step": 46272 + }, + { + "epoch": 3.0276044492674985, + "grad_norm": 3.340221881866455, + "learning_rate": 3.761545840756475e-06, + "loss": 0.1088, + "step": 46273 + }, + { + "epoch": 3.0276180141074334, + "grad_norm": 4.450016021728516, + "learning_rate": 3.7614087981362207e-06, + "loss": 0.1612, + "step": 46274 + }, + { + "epoch": 3.0276315789473682, + "grad_norm": 3.1064810752868652, + "learning_rate": 3.7612717555159655e-06, + "loss": 0.0765, + "step": 46275 + }, + { + "epoch": 3.027645143787303, + "grad_norm": 4.01154088973999, + "learning_rate": 3.761134712895711e-06, + "loss": 0.1303, + "step": 46276 + }, + { + "epoch": 3.0276587086272384, + "grad_norm": 2.17695951461792, + "learning_rate": 3.760997670275456e-06, + "loss": 0.0637, + "step": 46277 + }, + { + "epoch": 3.0276722734671733, + "grad_norm": 5.616771221160889, + "learning_rate": 3.760860627655201e-06, + "loss": 0.1201, + "step": 46278 + }, + { + "epoch": 3.027685838307108, + "grad_norm": 2.6676435470581055, + "learning_rate": 3.760723585034946e-06, + "loss": 0.07, + "step": 46279 + }, + { + "epoch": 3.027699403147043, + "grad_norm": 4.110238552093506, + "learning_rate": 3.7605865424146914e-06, + "loss": 0.1434, + "step": 46280 + }, + { + "epoch": 3.027712967986978, + "grad_norm": 4.30861234664917, + "learning_rate": 3.760449499794436e-06, + "loss": 0.098, + "step": 46281 + }, + { + "epoch": 3.0277265328269127, + "grad_norm": 3.1736462116241455, + "learning_rate": 3.7603124571741817e-06, + "loss": 0.0883, + "step": 46282 + }, + { + "epoch": 3.0277400976668476, + "grad_norm": 4.701320648193359, + "learning_rate": 3.7601754145539265e-06, + "loss": 0.1966, + "step": 46283 + }, + { + "epoch": 3.0277536625067825, + "grad_norm": 3.8872759342193604, + "learning_rate": 3.760038371933672e-06, + "loss": 0.1056, + "step": 46284 + }, + { + "epoch": 3.0277672273467173, + "grad_norm": 4.456181049346924, + "learning_rate": 3.759901329313417e-06, + "loss": 0.1436, + "step": 46285 + }, + { + "epoch": 3.027780792186652, + "grad_norm": 4.104862689971924, + "learning_rate": 3.7597642866931616e-06, + "loss": 0.1549, + "step": 46286 + }, + { + "epoch": 3.027794357026587, + "grad_norm": 3.1520841121673584, + "learning_rate": 3.759627244072907e-06, + "loss": 0.0837, + "step": 46287 + }, + { + "epoch": 3.027807921866522, + "grad_norm": 3.2588210105895996, + "learning_rate": 3.759490201452652e-06, + "loss": 0.1074, + "step": 46288 + }, + { + "epoch": 3.027821486706457, + "grad_norm": 4.682791233062744, + "learning_rate": 3.759353158832397e-06, + "loss": 0.0969, + "step": 46289 + }, + { + "epoch": 3.0278350515463917, + "grad_norm": 5.962346076965332, + "learning_rate": 3.759216116212142e-06, + "loss": 0.2499, + "step": 46290 + }, + { + "epoch": 3.0278486163863265, + "grad_norm": 2.8446297645568848, + "learning_rate": 3.7590790735918875e-06, + "loss": 0.0638, + "step": 46291 + }, + { + "epoch": 3.0278621812262614, + "grad_norm": 4.549365520477295, + "learning_rate": 3.7589420309716323e-06, + "loss": 0.1338, + "step": 46292 + }, + { + "epoch": 3.0278757460661962, + "grad_norm": 5.086974143981934, + "learning_rate": 3.758804988351378e-06, + "loss": 0.1868, + "step": 46293 + }, + { + "epoch": 3.027889310906131, + "grad_norm": 6.231742858886719, + "learning_rate": 3.7586679457311226e-06, + "loss": 0.2331, + "step": 46294 + }, + { + "epoch": 3.027902875746066, + "grad_norm": 3.930907964706421, + "learning_rate": 3.758530903110868e-06, + "loss": 0.1873, + "step": 46295 + }, + { + "epoch": 3.0279164405860013, + "grad_norm": 6.527350425720215, + "learning_rate": 3.758393860490613e-06, + "loss": 0.1791, + "step": 46296 + }, + { + "epoch": 3.027930005425936, + "grad_norm": 5.437119960784912, + "learning_rate": 3.758256817870358e-06, + "loss": 0.1445, + "step": 46297 + }, + { + "epoch": 3.027943570265871, + "grad_norm": 5.551764965057373, + "learning_rate": 3.758119775250103e-06, + "loss": 0.2814, + "step": 46298 + }, + { + "epoch": 3.027957135105806, + "grad_norm": 7.189284801483154, + "learning_rate": 3.757982732629848e-06, + "loss": 0.2158, + "step": 46299 + }, + { + "epoch": 3.0279706999457408, + "grad_norm": 4.331296920776367, + "learning_rate": 3.7578456900095933e-06, + "loss": 0.1675, + "step": 46300 + }, + { + "epoch": 3.0279842647856756, + "grad_norm": 3.83747935295105, + "learning_rate": 3.757708647389338e-06, + "loss": 0.0876, + "step": 46301 + }, + { + "epoch": 3.0279978296256105, + "grad_norm": 4.176839351654053, + "learning_rate": 3.7575716047690836e-06, + "loss": 0.1719, + "step": 46302 + }, + { + "epoch": 3.0280113944655453, + "grad_norm": 5.146833419799805, + "learning_rate": 3.7574345621488284e-06, + "loss": 0.1641, + "step": 46303 + }, + { + "epoch": 3.02802495930548, + "grad_norm": 6.112209320068359, + "learning_rate": 3.757297519528574e-06, + "loss": 0.2297, + "step": 46304 + }, + { + "epoch": 3.028038524145415, + "grad_norm": 4.516783714294434, + "learning_rate": 3.7571604769083187e-06, + "loss": 0.1264, + "step": 46305 + }, + { + "epoch": 3.02805208898535, + "grad_norm": 3.4143714904785156, + "learning_rate": 3.757023434288064e-06, + "loss": 0.106, + "step": 46306 + }, + { + "epoch": 3.028065653825285, + "grad_norm": 4.431927680969238, + "learning_rate": 3.756886391667809e-06, + "loss": 0.2074, + "step": 46307 + }, + { + "epoch": 3.0280792186652197, + "grad_norm": 4.305774211883545, + "learning_rate": 3.7567493490475543e-06, + "loss": 0.1273, + "step": 46308 + }, + { + "epoch": 3.0280927835051545, + "grad_norm": 3.297746181488037, + "learning_rate": 3.756612306427299e-06, + "loss": 0.1101, + "step": 46309 + }, + { + "epoch": 3.0281063483450894, + "grad_norm": 4.939028263092041, + "learning_rate": 3.7564752638070446e-06, + "loss": 0.1119, + "step": 46310 + }, + { + "epoch": 3.0281199131850243, + "grad_norm": 3.7285163402557373, + "learning_rate": 3.7563382211867894e-06, + "loss": 0.1086, + "step": 46311 + }, + { + "epoch": 3.028133478024959, + "grad_norm": 4.022597312927246, + "learning_rate": 3.7562011785665346e-06, + "loss": 0.1293, + "step": 46312 + }, + { + "epoch": 3.028147042864894, + "grad_norm": 4.305872917175293, + "learning_rate": 3.7560641359462797e-06, + "loss": 0.1629, + "step": 46313 + }, + { + "epoch": 3.028160607704829, + "grad_norm": 4.613701343536377, + "learning_rate": 3.7559270933260245e-06, + "loss": 0.2517, + "step": 46314 + }, + { + "epoch": 3.028174172544764, + "grad_norm": 4.057231903076172, + "learning_rate": 3.7557900507057697e-06, + "loss": 0.1773, + "step": 46315 + }, + { + "epoch": 3.028187737384699, + "grad_norm": 3.462728500366211, + "learning_rate": 3.755653008085515e-06, + "loss": 0.0717, + "step": 46316 + }, + { + "epoch": 3.028201302224634, + "grad_norm": 4.664487838745117, + "learning_rate": 3.75551596546526e-06, + "loss": 0.0803, + "step": 46317 + }, + { + "epoch": 3.0282148670645688, + "grad_norm": 3.8155624866485596, + "learning_rate": 3.755378922845005e-06, + "loss": 0.1125, + "step": 46318 + }, + { + "epoch": 3.0282284319045036, + "grad_norm": 2.583052158355713, + "learning_rate": 3.7552418802247504e-06, + "loss": 0.0881, + "step": 46319 + }, + { + "epoch": 3.0282419967444385, + "grad_norm": 4.165830612182617, + "learning_rate": 3.755104837604495e-06, + "loss": 0.0983, + "step": 46320 + }, + { + "epoch": 3.0282555615843734, + "grad_norm": 4.5575666427612305, + "learning_rate": 3.7549677949842407e-06, + "loss": 0.1558, + "step": 46321 + }, + { + "epoch": 3.0282691264243082, + "grad_norm": 6.967156410217285, + "learning_rate": 3.7548307523639855e-06, + "loss": 0.2949, + "step": 46322 + }, + { + "epoch": 3.028282691264243, + "grad_norm": 4.599247932434082, + "learning_rate": 3.7546937097437307e-06, + "loss": 0.1342, + "step": 46323 + }, + { + "epoch": 3.028296256104178, + "grad_norm": 6.441134452819824, + "learning_rate": 3.754556667123476e-06, + "loss": 0.2632, + "step": 46324 + }, + { + "epoch": 3.028309820944113, + "grad_norm": 3.511629581451416, + "learning_rate": 3.754419624503221e-06, + "loss": 0.0538, + "step": 46325 + }, + { + "epoch": 3.0283233857840477, + "grad_norm": 6.828917026519775, + "learning_rate": 3.754282581882966e-06, + "loss": 0.1511, + "step": 46326 + }, + { + "epoch": 3.0283369506239826, + "grad_norm": 2.6946613788604736, + "learning_rate": 3.7541455392627106e-06, + "loss": 0.0402, + "step": 46327 + }, + { + "epoch": 3.0283505154639174, + "grad_norm": 3.257943630218506, + "learning_rate": 3.754008496642456e-06, + "loss": 0.0665, + "step": 46328 + }, + { + "epoch": 3.0283640803038523, + "grad_norm": 3.941899061203003, + "learning_rate": 3.753871454022201e-06, + "loss": 0.0534, + "step": 46329 + }, + { + "epoch": 3.028377645143787, + "grad_norm": 3.1886825561523438, + "learning_rate": 3.7537344114019465e-06, + "loss": 0.0998, + "step": 46330 + }, + { + "epoch": 3.028391209983722, + "grad_norm": 2.9642722606658936, + "learning_rate": 3.7535973687816913e-06, + "loss": 0.1278, + "step": 46331 + }, + { + "epoch": 3.028404774823657, + "grad_norm": 5.372627258300781, + "learning_rate": 3.7534603261614364e-06, + "loss": 0.1341, + "step": 46332 + }, + { + "epoch": 3.0284183396635918, + "grad_norm": 4.085811614990234, + "learning_rate": 3.7533232835411816e-06, + "loss": 0.1596, + "step": 46333 + }, + { + "epoch": 3.028431904503527, + "grad_norm": 6.0051445960998535, + "learning_rate": 3.753186240920927e-06, + "loss": 0.1228, + "step": 46334 + }, + { + "epoch": 3.028445469343462, + "grad_norm": 3.015779972076416, + "learning_rate": 3.7530491983006716e-06, + "loss": 0.0752, + "step": 46335 + }, + { + "epoch": 3.028459034183397, + "grad_norm": 4.65980863571167, + "learning_rate": 3.752912155680417e-06, + "loss": 0.1023, + "step": 46336 + }, + { + "epoch": 3.0284725990233317, + "grad_norm": 5.054166316986084, + "learning_rate": 3.752775113060162e-06, + "loss": 0.1013, + "step": 46337 + }, + { + "epoch": 3.0284861638632665, + "grad_norm": 5.409012317657471, + "learning_rate": 3.7526380704399075e-06, + "loss": 0.1312, + "step": 46338 + }, + { + "epoch": 3.0284997287032014, + "grad_norm": 4.824122905731201, + "learning_rate": 3.7525010278196523e-06, + "loss": 0.2015, + "step": 46339 + }, + { + "epoch": 3.0285132935431363, + "grad_norm": 4.2997283935546875, + "learning_rate": 3.7523639851993975e-06, + "loss": 0.1213, + "step": 46340 + }, + { + "epoch": 3.028526858383071, + "grad_norm": 4.382518768310547, + "learning_rate": 3.7522269425791426e-06, + "loss": 0.1311, + "step": 46341 + }, + { + "epoch": 3.028540423223006, + "grad_norm": 3.5797693729400635, + "learning_rate": 3.7520898999588874e-06, + "loss": 0.1012, + "step": 46342 + }, + { + "epoch": 3.028553988062941, + "grad_norm": 8.416440963745117, + "learning_rate": 3.7519528573386326e-06, + "loss": 0.3282, + "step": 46343 + }, + { + "epoch": 3.0285675529028757, + "grad_norm": 4.715501308441162, + "learning_rate": 3.7518158147183773e-06, + "loss": 0.1035, + "step": 46344 + }, + { + "epoch": 3.0285811177428106, + "grad_norm": 3.9231581687927246, + "learning_rate": 3.751678772098123e-06, + "loss": 0.1069, + "step": 46345 + }, + { + "epoch": 3.0285946825827454, + "grad_norm": 4.4762864112854, + "learning_rate": 3.7515417294778677e-06, + "loss": 0.1291, + "step": 46346 + }, + { + "epoch": 3.0286082474226803, + "grad_norm": 3.700164794921875, + "learning_rate": 3.7514046868576133e-06, + "loss": 0.0983, + "step": 46347 + }, + { + "epoch": 3.028621812262615, + "grad_norm": 3.154318332672119, + "learning_rate": 3.751267644237358e-06, + "loss": 0.1208, + "step": 46348 + }, + { + "epoch": 3.02863537710255, + "grad_norm": 4.233975887298584, + "learning_rate": 3.7511306016171032e-06, + "loss": 0.2045, + "step": 46349 + }, + { + "epoch": 3.028648941942485, + "grad_norm": 3.4973812103271484, + "learning_rate": 3.7509935589968484e-06, + "loss": 0.1412, + "step": 46350 + }, + { + "epoch": 3.0286625067824198, + "grad_norm": 4.544825553894043, + "learning_rate": 3.7508565163765936e-06, + "loss": 0.1355, + "step": 46351 + }, + { + "epoch": 3.0286760716223546, + "grad_norm": 4.088281154632568, + "learning_rate": 3.7507194737563383e-06, + "loss": 0.1096, + "step": 46352 + }, + { + "epoch": 3.02868963646229, + "grad_norm": 4.851487636566162, + "learning_rate": 3.750582431136084e-06, + "loss": 0.2398, + "step": 46353 + }, + { + "epoch": 3.028703201302225, + "grad_norm": 4.264734745025635, + "learning_rate": 3.7504453885158287e-06, + "loss": 0.1478, + "step": 46354 + }, + { + "epoch": 3.0287167661421597, + "grad_norm": 5.89813232421875, + "learning_rate": 3.7503083458955734e-06, + "loss": 0.2041, + "step": 46355 + }, + { + "epoch": 3.0287303309820945, + "grad_norm": 7.24713134765625, + "learning_rate": 3.750171303275319e-06, + "loss": 0.1699, + "step": 46356 + }, + { + "epoch": 3.0287438958220294, + "grad_norm": 4.823666572570801, + "learning_rate": 3.750034260655064e-06, + "loss": 0.2071, + "step": 46357 + }, + { + "epoch": 3.0287574606619643, + "grad_norm": 4.8083367347717285, + "learning_rate": 3.7498972180348094e-06, + "loss": 0.2425, + "step": 46358 + }, + { + "epoch": 3.028771025501899, + "grad_norm": 5.159215927124023, + "learning_rate": 3.749760175414554e-06, + "loss": 0.2156, + "step": 46359 + }, + { + "epoch": 3.028784590341834, + "grad_norm": 3.831447124481201, + "learning_rate": 3.7496231327942993e-06, + "loss": 0.1614, + "step": 46360 + }, + { + "epoch": 3.028798155181769, + "grad_norm": 3.5754029750823975, + "learning_rate": 3.749486090174044e-06, + "loss": 0.089, + "step": 46361 + }, + { + "epoch": 3.0288117200217037, + "grad_norm": 4.151426315307617, + "learning_rate": 3.7493490475537897e-06, + "loss": 0.1261, + "step": 46362 + }, + { + "epoch": 3.0288252848616386, + "grad_norm": 6.172522068023682, + "learning_rate": 3.7492120049335345e-06, + "loss": 0.2115, + "step": 46363 + }, + { + "epoch": 3.0288388497015735, + "grad_norm": 6.480712890625, + "learning_rate": 3.74907496231328e-06, + "loss": 0.1841, + "step": 46364 + }, + { + "epoch": 3.0288524145415083, + "grad_norm": 2.5954976081848145, + "learning_rate": 3.748937919693025e-06, + "loss": 0.0708, + "step": 46365 + }, + { + "epoch": 3.028865979381443, + "grad_norm": 3.7880542278289795, + "learning_rate": 3.74880087707277e-06, + "loss": 0.1158, + "step": 46366 + }, + { + "epoch": 3.028879544221378, + "grad_norm": 3.196754217147827, + "learning_rate": 3.748663834452515e-06, + "loss": 0.1013, + "step": 46367 + }, + { + "epoch": 3.028893109061313, + "grad_norm": 3.2900326251983643, + "learning_rate": 3.74852679183226e-06, + "loss": 0.1209, + "step": 46368 + }, + { + "epoch": 3.028906673901248, + "grad_norm": 2.3349764347076416, + "learning_rate": 3.748389749212005e-06, + "loss": 0.053, + "step": 46369 + }, + { + "epoch": 3.0289202387411827, + "grad_norm": 3.3699610233306885, + "learning_rate": 3.7482527065917503e-06, + "loss": 0.0633, + "step": 46370 + }, + { + "epoch": 3.0289338035811175, + "grad_norm": 4.033083915710449, + "learning_rate": 3.7481156639714955e-06, + "loss": 0.1236, + "step": 46371 + }, + { + "epoch": 3.028947368421053, + "grad_norm": 2.915515184402466, + "learning_rate": 3.7479786213512402e-06, + "loss": 0.1234, + "step": 46372 + }, + { + "epoch": 3.0289609332609877, + "grad_norm": 4.415938854217529, + "learning_rate": 3.747841578730986e-06, + "loss": 0.1748, + "step": 46373 + }, + { + "epoch": 3.0289744981009226, + "grad_norm": 2.9009485244750977, + "learning_rate": 3.7477045361107306e-06, + "loss": 0.0919, + "step": 46374 + }, + { + "epoch": 3.0289880629408574, + "grad_norm": 3.1055517196655273, + "learning_rate": 3.747567493490476e-06, + "loss": 0.0808, + "step": 46375 + }, + { + "epoch": 3.0290016277807923, + "grad_norm": 3.5027880668640137, + "learning_rate": 3.747430450870221e-06, + "loss": 0.1357, + "step": 46376 + }, + { + "epoch": 3.029015192620727, + "grad_norm": 4.67741060256958, + "learning_rate": 3.747293408249966e-06, + "loss": 0.1851, + "step": 46377 + }, + { + "epoch": 3.029028757460662, + "grad_norm": 6.076066493988037, + "learning_rate": 3.7471563656297113e-06, + "loss": 0.1174, + "step": 46378 + }, + { + "epoch": 3.029042322300597, + "grad_norm": 4.9162797927856445, + "learning_rate": 3.7470193230094565e-06, + "loss": 0.0999, + "step": 46379 + }, + { + "epoch": 3.0290558871405318, + "grad_norm": 2.62906551361084, + "learning_rate": 3.7468822803892012e-06, + "loss": 0.0565, + "step": 46380 + }, + { + "epoch": 3.0290694519804666, + "grad_norm": 4.274031639099121, + "learning_rate": 3.746745237768947e-06, + "loss": 0.1194, + "step": 46381 + }, + { + "epoch": 3.0290830168204015, + "grad_norm": 3.7572476863861084, + "learning_rate": 3.7466081951486916e-06, + "loss": 0.0942, + "step": 46382 + }, + { + "epoch": 3.0290965816603364, + "grad_norm": 3.75410795211792, + "learning_rate": 3.7464711525284363e-06, + "loss": 0.1279, + "step": 46383 + }, + { + "epoch": 3.029110146500271, + "grad_norm": 4.483282089233398, + "learning_rate": 3.746334109908182e-06, + "loss": 0.131, + "step": 46384 + }, + { + "epoch": 3.029123711340206, + "grad_norm": 4.582353115081787, + "learning_rate": 3.7461970672879267e-06, + "loss": 0.1411, + "step": 46385 + }, + { + "epoch": 3.029137276180141, + "grad_norm": 3.6979482173919678, + "learning_rate": 3.746060024667672e-06, + "loss": 0.153, + "step": 46386 + }, + { + "epoch": 3.029150841020076, + "grad_norm": 4.994331359863281, + "learning_rate": 3.745922982047417e-06, + "loss": 0.0897, + "step": 46387 + }, + { + "epoch": 3.0291644058600107, + "grad_norm": 2.719849109649658, + "learning_rate": 3.7457859394271622e-06, + "loss": 0.0883, + "step": 46388 + }, + { + "epoch": 3.0291779706999455, + "grad_norm": 4.198973655700684, + "learning_rate": 3.745648896806907e-06, + "loss": 0.0847, + "step": 46389 + }, + { + "epoch": 3.029191535539881, + "grad_norm": 5.979050636291504, + "learning_rate": 3.7455118541866526e-06, + "loss": 0.2975, + "step": 46390 + }, + { + "epoch": 3.0292051003798157, + "grad_norm": 6.902307510375977, + "learning_rate": 3.7453748115663973e-06, + "loss": 0.1445, + "step": 46391 + }, + { + "epoch": 3.0292186652197506, + "grad_norm": 4.34725284576416, + "learning_rate": 3.745237768946143e-06, + "loss": 0.1262, + "step": 46392 + }, + { + "epoch": 3.0292322300596854, + "grad_norm": 3.9417874813079834, + "learning_rate": 3.7451007263258877e-06, + "loss": 0.0869, + "step": 46393 + }, + { + "epoch": 3.0292457948996203, + "grad_norm": 2.9279017448425293, + "learning_rate": 3.744963683705633e-06, + "loss": 0.0957, + "step": 46394 + }, + { + "epoch": 3.029259359739555, + "grad_norm": 4.581623554229736, + "learning_rate": 3.744826641085378e-06, + "loss": 0.151, + "step": 46395 + }, + { + "epoch": 3.02927292457949, + "grad_norm": 3.5022549629211426, + "learning_rate": 3.744689598465123e-06, + "loss": 0.1442, + "step": 46396 + }, + { + "epoch": 3.029286489419425, + "grad_norm": 3.049802541732788, + "learning_rate": 3.744552555844868e-06, + "loss": 0.093, + "step": 46397 + }, + { + "epoch": 3.0293000542593598, + "grad_norm": 5.015449523925781, + "learning_rate": 3.7444155132246127e-06, + "loss": 0.116, + "step": 46398 + }, + { + "epoch": 3.0293136190992946, + "grad_norm": 3.442990779876709, + "learning_rate": 3.7442784706043583e-06, + "loss": 0.1075, + "step": 46399 + }, + { + "epoch": 3.0293271839392295, + "grad_norm": 4.111982822418213, + "learning_rate": 3.744141427984103e-06, + "loss": 0.1097, + "step": 46400 + }, + { + "epoch": 3.0293407487791644, + "grad_norm": 3.899585247039795, + "learning_rate": 3.7440043853638487e-06, + "loss": 0.1707, + "step": 46401 + }, + { + "epoch": 3.0293543136190992, + "grad_norm": 3.852863311767578, + "learning_rate": 3.7438673427435935e-06, + "loss": 0.1535, + "step": 46402 + }, + { + "epoch": 3.029367878459034, + "grad_norm": 3.448354721069336, + "learning_rate": 3.7437303001233386e-06, + "loss": 0.0804, + "step": 46403 + }, + { + "epoch": 3.029381443298969, + "grad_norm": 3.0319454669952393, + "learning_rate": 3.743593257503084e-06, + "loss": 0.091, + "step": 46404 + }, + { + "epoch": 3.029395008138904, + "grad_norm": 3.1254537105560303, + "learning_rate": 3.743456214882829e-06, + "loss": 0.1145, + "step": 46405 + }, + { + "epoch": 3.0294085729788387, + "grad_norm": 5.071196556091309, + "learning_rate": 3.7433191722625738e-06, + "loss": 0.1951, + "step": 46406 + }, + { + "epoch": 3.0294221378187736, + "grad_norm": 4.629238605499268, + "learning_rate": 3.7431821296423194e-06, + "loss": 0.1832, + "step": 46407 + }, + { + "epoch": 3.0294357026587084, + "grad_norm": 3.277395248413086, + "learning_rate": 3.743045087022064e-06, + "loss": 0.0881, + "step": 46408 + }, + { + "epoch": 3.0294492674986433, + "grad_norm": 4.3135762214660645, + "learning_rate": 3.742908044401809e-06, + "loss": 0.116, + "step": 46409 + }, + { + "epoch": 3.0294628323385786, + "grad_norm": 4.607868671417236, + "learning_rate": 3.7427710017815545e-06, + "loss": 0.2389, + "step": 46410 + }, + { + "epoch": 3.0294763971785135, + "grad_norm": 4.280542373657227, + "learning_rate": 3.7426339591612992e-06, + "loss": 0.1147, + "step": 46411 + }, + { + "epoch": 3.0294899620184483, + "grad_norm": 5.369312763214111, + "learning_rate": 3.742496916541045e-06, + "loss": 0.2191, + "step": 46412 + }, + { + "epoch": 3.029503526858383, + "grad_norm": 3.342180013656616, + "learning_rate": 3.7423598739207896e-06, + "loss": 0.136, + "step": 46413 + }, + { + "epoch": 3.029517091698318, + "grad_norm": 4.00645112991333, + "learning_rate": 3.7422228313005348e-06, + "loss": 0.1891, + "step": 46414 + }, + { + "epoch": 3.029530656538253, + "grad_norm": 4.187146186828613, + "learning_rate": 3.7420857886802795e-06, + "loss": 0.1517, + "step": 46415 + }, + { + "epoch": 3.029544221378188, + "grad_norm": 3.3747713565826416, + "learning_rate": 3.741948746060025e-06, + "loss": 0.0986, + "step": 46416 + }, + { + "epoch": 3.0295577862181227, + "grad_norm": 3.4498417377471924, + "learning_rate": 3.74181170343977e-06, + "loss": 0.1563, + "step": 46417 + }, + { + "epoch": 3.0295713510580575, + "grad_norm": 3.855543851852417, + "learning_rate": 3.7416746608195155e-06, + "loss": 0.1731, + "step": 46418 + }, + { + "epoch": 3.0295849158979924, + "grad_norm": 3.6130759716033936, + "learning_rate": 3.7415376181992602e-06, + "loss": 0.1271, + "step": 46419 + }, + { + "epoch": 3.0295984807379273, + "grad_norm": 3.424718141555786, + "learning_rate": 3.7414005755790054e-06, + "loss": 0.1034, + "step": 46420 + }, + { + "epoch": 3.029612045577862, + "grad_norm": 3.9972329139709473, + "learning_rate": 3.7412635329587506e-06, + "loss": 0.1798, + "step": 46421 + }, + { + "epoch": 3.029625610417797, + "grad_norm": 4.425123691558838, + "learning_rate": 3.7411264903384958e-06, + "loss": 0.1957, + "step": 46422 + }, + { + "epoch": 3.029639175257732, + "grad_norm": 4.414057731628418, + "learning_rate": 3.7409894477182405e-06, + "loss": 0.1398, + "step": 46423 + }, + { + "epoch": 3.0296527400976667, + "grad_norm": 3.698824882507324, + "learning_rate": 3.7408524050979857e-06, + "loss": 0.1633, + "step": 46424 + }, + { + "epoch": 3.0296663049376016, + "grad_norm": 3.3252599239349365, + "learning_rate": 3.740715362477731e-06, + "loss": 0.0855, + "step": 46425 + }, + { + "epoch": 3.0296798697775364, + "grad_norm": 3.7284319400787354, + "learning_rate": 3.7405783198574756e-06, + "loss": 0.1509, + "step": 46426 + }, + { + "epoch": 3.0296934346174713, + "grad_norm": 4.445902347564697, + "learning_rate": 3.7404412772372212e-06, + "loss": 0.1507, + "step": 46427 + }, + { + "epoch": 3.0297069994574066, + "grad_norm": 3.2548718452453613, + "learning_rate": 3.740304234616966e-06, + "loss": 0.0953, + "step": 46428 + }, + { + "epoch": 3.0297205642973415, + "grad_norm": 4.613763332366943, + "learning_rate": 3.7401671919967116e-06, + "loss": 0.15, + "step": 46429 + }, + { + "epoch": 3.0297341291372764, + "grad_norm": 3.7132084369659424, + "learning_rate": 3.7400301493764564e-06, + "loss": 0.1402, + "step": 46430 + }, + { + "epoch": 3.029747693977211, + "grad_norm": 4.48356294631958, + "learning_rate": 3.7398931067562015e-06, + "loss": 0.1742, + "step": 46431 + }, + { + "epoch": 3.029761258817146, + "grad_norm": 5.251054763793945, + "learning_rate": 3.7397560641359463e-06, + "loss": 0.1526, + "step": 46432 + }, + { + "epoch": 3.029774823657081, + "grad_norm": 3.0488123893737793, + "learning_rate": 3.739619021515692e-06, + "loss": 0.066, + "step": 46433 + }, + { + "epoch": 3.029788388497016, + "grad_norm": 4.338555335998535, + "learning_rate": 3.7394819788954366e-06, + "loss": 0.1729, + "step": 46434 + }, + { + "epoch": 3.0298019533369507, + "grad_norm": 5.163384914398193, + "learning_rate": 3.7393449362751822e-06, + "loss": 0.2074, + "step": 46435 + }, + { + "epoch": 3.0298155181768855, + "grad_norm": 5.2727437019348145, + "learning_rate": 3.739207893654927e-06, + "loss": 0.1761, + "step": 46436 + }, + { + "epoch": 3.0298290830168204, + "grad_norm": 4.19622802734375, + "learning_rate": 3.7390708510346718e-06, + "loss": 0.1753, + "step": 46437 + }, + { + "epoch": 3.0298426478567553, + "grad_norm": 5.358737945556641, + "learning_rate": 3.7389338084144174e-06, + "loss": 0.1733, + "step": 46438 + }, + { + "epoch": 3.02985621269669, + "grad_norm": 3.920079469680786, + "learning_rate": 3.738796765794162e-06, + "loss": 0.1359, + "step": 46439 + }, + { + "epoch": 3.029869777536625, + "grad_norm": 5.274661540985107, + "learning_rate": 3.7386597231739073e-06, + "loss": 0.0754, + "step": 46440 + }, + { + "epoch": 3.02988334237656, + "grad_norm": 5.015258312225342, + "learning_rate": 3.7385226805536525e-06, + "loss": 0.2063, + "step": 46441 + }, + { + "epoch": 3.0298969072164947, + "grad_norm": 4.678563594818115, + "learning_rate": 3.7383856379333977e-06, + "loss": 0.127, + "step": 46442 + }, + { + "epoch": 3.0299104720564296, + "grad_norm": 4.086178779602051, + "learning_rate": 3.7382485953131424e-06, + "loss": 0.1666, + "step": 46443 + }, + { + "epoch": 3.0299240368963645, + "grad_norm": 4.850590705871582, + "learning_rate": 3.738111552692888e-06, + "loss": 0.2002, + "step": 46444 + }, + { + "epoch": 3.0299376017362993, + "grad_norm": 3.4005773067474365, + "learning_rate": 3.7379745100726328e-06, + "loss": 0.0955, + "step": 46445 + }, + { + "epoch": 3.029951166576234, + "grad_norm": 5.2660956382751465, + "learning_rate": 3.7378374674523784e-06, + "loss": 0.239, + "step": 46446 + }, + { + "epoch": 3.029964731416169, + "grad_norm": 4.467745304107666, + "learning_rate": 3.737700424832123e-06, + "loss": 0.1759, + "step": 46447 + }, + { + "epoch": 3.0299782962561044, + "grad_norm": 5.573655605316162, + "learning_rate": 3.7375633822118683e-06, + "loss": 0.1651, + "step": 46448 + }, + { + "epoch": 3.0299918610960392, + "grad_norm": 4.210149765014648, + "learning_rate": 3.7374263395916135e-06, + "loss": 0.1548, + "step": 46449 + }, + { + "epoch": 3.030005425935974, + "grad_norm": 5.8064470291137695, + "learning_rate": 3.7372892969713587e-06, + "loss": 0.1719, + "step": 46450 + }, + { + "epoch": 3.030018990775909, + "grad_norm": 4.180080413818359, + "learning_rate": 3.7371522543511034e-06, + "loss": 0.1962, + "step": 46451 + }, + { + "epoch": 3.030032555615844, + "grad_norm": 6.217140197753906, + "learning_rate": 3.737015211730848e-06, + "loss": 0.1475, + "step": 46452 + }, + { + "epoch": 3.0300461204557787, + "grad_norm": 3.626596212387085, + "learning_rate": 3.7368781691105938e-06, + "loss": 0.0989, + "step": 46453 + }, + { + "epoch": 3.0300596852957136, + "grad_norm": 4.281114101409912, + "learning_rate": 3.7367411264903385e-06, + "loss": 0.1265, + "step": 46454 + }, + { + "epoch": 3.0300732501356484, + "grad_norm": 5.092452049255371, + "learning_rate": 3.736604083870084e-06, + "loss": 0.2503, + "step": 46455 + }, + { + "epoch": 3.0300868149755833, + "grad_norm": 6.147078990936279, + "learning_rate": 3.736467041249829e-06, + "loss": 0.2685, + "step": 46456 + }, + { + "epoch": 3.030100379815518, + "grad_norm": 5.832684516906738, + "learning_rate": 3.736329998629574e-06, + "loss": 0.2064, + "step": 46457 + }, + { + "epoch": 3.030113944655453, + "grad_norm": 7.048910140991211, + "learning_rate": 3.7361929560093192e-06, + "loss": 0.1589, + "step": 46458 + }, + { + "epoch": 3.030127509495388, + "grad_norm": 6.392160892486572, + "learning_rate": 3.7360559133890644e-06, + "loss": 0.1082, + "step": 46459 + }, + { + "epoch": 3.0301410743353228, + "grad_norm": 4.196758270263672, + "learning_rate": 3.735918870768809e-06, + "loss": 0.1075, + "step": 46460 + }, + { + "epoch": 3.0301546391752576, + "grad_norm": 5.56184720993042, + "learning_rate": 3.7357818281485548e-06, + "loss": 0.1096, + "step": 46461 + }, + { + "epoch": 3.0301682040151925, + "grad_norm": 7.481239318847656, + "learning_rate": 3.7356447855282995e-06, + "loss": 0.197, + "step": 46462 + }, + { + "epoch": 3.0301817688551274, + "grad_norm": 3.991274118423462, + "learning_rate": 3.735507742908045e-06, + "loss": 0.1291, + "step": 46463 + }, + { + "epoch": 3.030195333695062, + "grad_norm": 4.674499988555908, + "learning_rate": 3.73537070028779e-06, + "loss": 0.1779, + "step": 46464 + }, + { + "epoch": 3.030208898534997, + "grad_norm": 4.397860050201416, + "learning_rate": 3.7352336576675346e-06, + "loss": 0.1186, + "step": 46465 + }, + { + "epoch": 3.0302224633749324, + "grad_norm": 3.797975540161133, + "learning_rate": 3.7350966150472803e-06, + "loss": 0.0777, + "step": 46466 + }, + { + "epoch": 3.0302360282148673, + "grad_norm": 5.82198429107666, + "learning_rate": 3.734959572427025e-06, + "loss": 0.1747, + "step": 46467 + }, + { + "epoch": 3.030249593054802, + "grad_norm": 4.511969566345215, + "learning_rate": 3.73482252980677e-06, + "loss": 0.097, + "step": 46468 + }, + { + "epoch": 3.030263157894737, + "grad_norm": 3.6415278911590576, + "learning_rate": 3.734685487186515e-06, + "loss": 0.0795, + "step": 46469 + }, + { + "epoch": 3.030276722734672, + "grad_norm": 3.9245238304138184, + "learning_rate": 3.7345484445662605e-06, + "loss": 0.0796, + "step": 46470 + }, + { + "epoch": 3.0302902875746067, + "grad_norm": 4.438195705413818, + "learning_rate": 3.7344114019460053e-06, + "loss": 0.1071, + "step": 46471 + }, + { + "epoch": 3.0303038524145416, + "grad_norm": 5.033811092376709, + "learning_rate": 3.734274359325751e-06, + "loss": 0.1397, + "step": 46472 + }, + { + "epoch": 3.0303174172544765, + "grad_norm": 5.25150203704834, + "learning_rate": 3.7341373167054957e-06, + "loss": 0.1478, + "step": 46473 + }, + { + "epoch": 3.0303309820944113, + "grad_norm": 4.109897613525391, + "learning_rate": 3.734000274085241e-06, + "loss": 0.147, + "step": 46474 + }, + { + "epoch": 3.030344546934346, + "grad_norm": 3.370039701461792, + "learning_rate": 3.733863231464986e-06, + "loss": 0.0961, + "step": 46475 + }, + { + "epoch": 3.030358111774281, + "grad_norm": 4.4897780418396, + "learning_rate": 3.733726188844731e-06, + "loss": 0.1804, + "step": 46476 + }, + { + "epoch": 3.030371676614216, + "grad_norm": 3.8870320320129395, + "learning_rate": 3.733589146224476e-06, + "loss": 0.0646, + "step": 46477 + }, + { + "epoch": 3.030385241454151, + "grad_norm": 3.343308925628662, + "learning_rate": 3.733452103604221e-06, + "loss": 0.1151, + "step": 46478 + }, + { + "epoch": 3.0303988062940856, + "grad_norm": 4.290187358856201, + "learning_rate": 3.7333150609839663e-06, + "loss": 0.0753, + "step": 46479 + }, + { + "epoch": 3.0304123711340205, + "grad_norm": 4.204713821411133, + "learning_rate": 3.733178018363711e-06, + "loss": 0.1187, + "step": 46480 + }, + { + "epoch": 3.0304259359739554, + "grad_norm": 3.0169057846069336, + "learning_rate": 3.7330409757434567e-06, + "loss": 0.0803, + "step": 46481 + }, + { + "epoch": 3.0304395008138902, + "grad_norm": 6.63337516784668, + "learning_rate": 3.7329039331232014e-06, + "loss": 0.1716, + "step": 46482 + }, + { + "epoch": 3.030453065653825, + "grad_norm": 4.18147611618042, + "learning_rate": 3.732766890502947e-06, + "loss": 0.1092, + "step": 46483 + }, + { + "epoch": 3.03046663049376, + "grad_norm": 4.2412896156311035, + "learning_rate": 3.7326298478826918e-06, + "loss": 0.1039, + "step": 46484 + }, + { + "epoch": 3.0304801953336953, + "grad_norm": 4.631346225738525, + "learning_rate": 3.732492805262437e-06, + "loss": 0.1296, + "step": 46485 + }, + { + "epoch": 3.03049376017363, + "grad_norm": 5.377844333648682, + "learning_rate": 3.7323557626421817e-06, + "loss": 0.1002, + "step": 46486 + }, + { + "epoch": 3.030507325013565, + "grad_norm": 3.123133659362793, + "learning_rate": 3.7322187200219273e-06, + "loss": 0.1342, + "step": 46487 + }, + { + "epoch": 3.0305208898535, + "grad_norm": 4.337732315063477, + "learning_rate": 3.732081677401672e-06, + "loss": 0.1152, + "step": 46488 + }, + { + "epoch": 3.0305344546934347, + "grad_norm": 5.04752779006958, + "learning_rate": 3.7319446347814177e-06, + "loss": 0.1596, + "step": 46489 + }, + { + "epoch": 3.0305480195333696, + "grad_norm": 5.649213790893555, + "learning_rate": 3.7318075921611624e-06, + "loss": 0.1394, + "step": 46490 + }, + { + "epoch": 3.0305615843733045, + "grad_norm": 5.192089080810547, + "learning_rate": 3.7316705495409076e-06, + "loss": 0.1232, + "step": 46491 + }, + { + "epoch": 3.0305751492132393, + "grad_norm": 3.6837666034698486, + "learning_rate": 3.7315335069206528e-06, + "loss": 0.0718, + "step": 46492 + }, + { + "epoch": 3.030588714053174, + "grad_norm": 3.1840062141418457, + "learning_rate": 3.7313964643003975e-06, + "loss": 0.0737, + "step": 46493 + }, + { + "epoch": 3.030602278893109, + "grad_norm": 3.9540815353393555, + "learning_rate": 3.7312594216801427e-06, + "loss": 0.1106, + "step": 46494 + }, + { + "epoch": 3.030615843733044, + "grad_norm": 4.521138668060303, + "learning_rate": 3.731122379059888e-06, + "loss": 0.1592, + "step": 46495 + }, + { + "epoch": 3.030629408572979, + "grad_norm": 4.176285266876221, + "learning_rate": 3.730985336439633e-06, + "loss": 0.1043, + "step": 46496 + }, + { + "epoch": 3.0306429734129137, + "grad_norm": 3.2149813175201416, + "learning_rate": 3.730848293819378e-06, + "loss": 0.0916, + "step": 46497 + }, + { + "epoch": 3.0306565382528485, + "grad_norm": 5.055607318878174, + "learning_rate": 3.7307112511991234e-06, + "loss": 0.1473, + "step": 46498 + }, + { + "epoch": 3.0306701030927834, + "grad_norm": 4.448980331420898, + "learning_rate": 3.730574208578868e-06, + "loss": 0.1014, + "step": 46499 + }, + { + "epoch": 3.0306836679327183, + "grad_norm": 3.9990553855895996, + "learning_rate": 3.730437165958614e-06, + "loss": 0.1424, + "step": 46500 + }, + { + "epoch": 3.030697232772653, + "grad_norm": 2.933124542236328, + "learning_rate": 3.7303001233383585e-06, + "loss": 0.1081, + "step": 46501 + }, + { + "epoch": 3.030710797612588, + "grad_norm": 3.528938055038452, + "learning_rate": 3.7301630807181037e-06, + "loss": 0.1131, + "step": 46502 + }, + { + "epoch": 3.030724362452523, + "grad_norm": 5.229177474975586, + "learning_rate": 3.7300260380978485e-06, + "loss": 0.1366, + "step": 46503 + }, + { + "epoch": 3.030737927292458, + "grad_norm": 3.7410905361175537, + "learning_rate": 3.729888995477594e-06, + "loss": 0.1269, + "step": 46504 + }, + { + "epoch": 3.030751492132393, + "grad_norm": 2.764224052429199, + "learning_rate": 3.729751952857339e-06, + "loss": 0.097, + "step": 46505 + }, + { + "epoch": 3.030765056972328, + "grad_norm": 4.77169942855835, + "learning_rate": 3.7296149102370836e-06, + "loss": 0.158, + "step": 46506 + }, + { + "epoch": 3.0307786218122628, + "grad_norm": 4.787199974060059, + "learning_rate": 3.729477867616829e-06, + "loss": 0.1484, + "step": 46507 + }, + { + "epoch": 3.0307921866521976, + "grad_norm": 4.089306831359863, + "learning_rate": 3.729340824996574e-06, + "loss": 0.097, + "step": 46508 + }, + { + "epoch": 3.0308057514921325, + "grad_norm": 4.43187952041626, + "learning_rate": 3.7292037823763196e-06, + "loss": 0.1426, + "step": 46509 + }, + { + "epoch": 3.0308193163320674, + "grad_norm": 2.9412615299224854, + "learning_rate": 3.7290667397560643e-06, + "loss": 0.0885, + "step": 46510 + }, + { + "epoch": 3.0308328811720022, + "grad_norm": 3.9124813079833984, + "learning_rate": 3.7289296971358095e-06, + "loss": 0.1121, + "step": 46511 + }, + { + "epoch": 3.030846446011937, + "grad_norm": 3.474722146987915, + "learning_rate": 3.7287926545155547e-06, + "loss": 0.1121, + "step": 46512 + }, + { + "epoch": 3.030860010851872, + "grad_norm": 3.699608087539673, + "learning_rate": 3.7286556118953e-06, + "loss": 0.1133, + "step": 46513 + }, + { + "epoch": 3.030873575691807, + "grad_norm": 4.06497859954834, + "learning_rate": 3.7285185692750446e-06, + "loss": 0.1158, + "step": 46514 + }, + { + "epoch": 3.0308871405317417, + "grad_norm": 3.05260968208313, + "learning_rate": 3.72838152665479e-06, + "loss": 0.0884, + "step": 46515 + }, + { + "epoch": 3.0309007053716766, + "grad_norm": 4.503106594085693, + "learning_rate": 3.728244484034535e-06, + "loss": 0.2126, + "step": 46516 + }, + { + "epoch": 3.0309142702116114, + "grad_norm": 3.620648145675659, + "learning_rate": 3.7281074414142806e-06, + "loss": 0.1229, + "step": 46517 + }, + { + "epoch": 3.0309278350515463, + "grad_norm": 4.467634677886963, + "learning_rate": 3.7279703987940253e-06, + "loss": 0.1088, + "step": 46518 + }, + { + "epoch": 3.030941399891481, + "grad_norm": 4.63577127456665, + "learning_rate": 3.7278333561737705e-06, + "loss": 0.1694, + "step": 46519 + }, + { + "epoch": 3.030954964731416, + "grad_norm": 4.278985023498535, + "learning_rate": 3.7276963135535157e-06, + "loss": 0.082, + "step": 46520 + }, + { + "epoch": 3.030968529571351, + "grad_norm": 3.9887826442718506, + "learning_rate": 3.7275592709332604e-06, + "loss": 0.1192, + "step": 46521 + }, + { + "epoch": 3.0309820944112857, + "grad_norm": 2.388688325881958, + "learning_rate": 3.7274222283130056e-06, + "loss": 0.0569, + "step": 46522 + }, + { + "epoch": 3.030995659251221, + "grad_norm": 4.5700860023498535, + "learning_rate": 3.7272851856927504e-06, + "loss": 0.1027, + "step": 46523 + }, + { + "epoch": 3.031009224091156, + "grad_norm": 3.393920421600342, + "learning_rate": 3.727148143072496e-06, + "loss": 0.0756, + "step": 46524 + }, + { + "epoch": 3.031022788931091, + "grad_norm": 3.3900089263916016, + "learning_rate": 3.7270111004522407e-06, + "loss": 0.1653, + "step": 46525 + }, + { + "epoch": 3.0310363537710256, + "grad_norm": 4.232946872711182, + "learning_rate": 3.7268740578319863e-06, + "loss": 0.1268, + "step": 46526 + }, + { + "epoch": 3.0310499186109605, + "grad_norm": 4.164162635803223, + "learning_rate": 3.726737015211731e-06, + "loss": 0.1012, + "step": 46527 + }, + { + "epoch": 3.0310634834508954, + "grad_norm": 3.344395875930786, + "learning_rate": 3.7265999725914763e-06, + "loss": 0.0675, + "step": 46528 + }, + { + "epoch": 3.0310770482908302, + "grad_norm": 2.710925579071045, + "learning_rate": 3.7264629299712214e-06, + "loss": 0.0648, + "step": 46529 + }, + { + "epoch": 3.031090613130765, + "grad_norm": 3.6640048027038574, + "learning_rate": 3.7263258873509666e-06, + "loss": 0.1058, + "step": 46530 + }, + { + "epoch": 3.0311041779707, + "grad_norm": 6.470749378204346, + "learning_rate": 3.7261888447307114e-06, + "loss": 0.206, + "step": 46531 + }, + { + "epoch": 3.031117742810635, + "grad_norm": 3.0591795444488525, + "learning_rate": 3.726051802110457e-06, + "loss": 0.1019, + "step": 46532 + }, + { + "epoch": 3.0311313076505697, + "grad_norm": 3.589470863342285, + "learning_rate": 3.7259147594902017e-06, + "loss": 0.1339, + "step": 46533 + }, + { + "epoch": 3.0311448724905046, + "grad_norm": 4.277472019195557, + "learning_rate": 3.7257777168699465e-06, + "loss": 0.1056, + "step": 46534 + }, + { + "epoch": 3.0311584373304394, + "grad_norm": 5.473607063293457, + "learning_rate": 3.725640674249692e-06, + "loss": 0.1735, + "step": 46535 + }, + { + "epoch": 3.0311720021703743, + "grad_norm": 3.0182595252990723, + "learning_rate": 3.725503631629437e-06, + "loss": 0.0639, + "step": 46536 + }, + { + "epoch": 3.031185567010309, + "grad_norm": 4.053988456726074, + "learning_rate": 3.7253665890091824e-06, + "loss": 0.1342, + "step": 46537 + }, + { + "epoch": 3.031199131850244, + "grad_norm": 5.201620578765869, + "learning_rate": 3.725229546388927e-06, + "loss": 0.1614, + "step": 46538 + }, + { + "epoch": 3.031212696690179, + "grad_norm": 5.720811367034912, + "learning_rate": 3.7250925037686724e-06, + "loss": 0.1846, + "step": 46539 + }, + { + "epoch": 3.0312262615301138, + "grad_norm": 4.715588569641113, + "learning_rate": 3.724955461148417e-06, + "loss": 0.1075, + "step": 46540 + }, + { + "epoch": 3.0312398263700486, + "grad_norm": 3.602180004119873, + "learning_rate": 3.7248184185281627e-06, + "loss": 0.0586, + "step": 46541 + }, + { + "epoch": 3.031253391209984, + "grad_norm": 3.4067440032958984, + "learning_rate": 3.7246813759079075e-06, + "loss": 0.1081, + "step": 46542 + }, + { + "epoch": 3.031266956049919, + "grad_norm": 2.8575124740600586, + "learning_rate": 3.724544333287653e-06, + "loss": 0.066, + "step": 46543 + }, + { + "epoch": 3.0312805208898537, + "grad_norm": 4.016512393951416, + "learning_rate": 3.724407290667398e-06, + "loss": 0.1496, + "step": 46544 + }, + { + "epoch": 3.0312940857297885, + "grad_norm": 3.556332588195801, + "learning_rate": 3.724270248047143e-06, + "loss": 0.091, + "step": 46545 + }, + { + "epoch": 3.0313076505697234, + "grad_norm": 3.039092540740967, + "learning_rate": 3.724133205426888e-06, + "loss": 0.0869, + "step": 46546 + }, + { + "epoch": 3.0313212154096583, + "grad_norm": 3.712677240371704, + "learning_rate": 3.723996162806633e-06, + "loss": 0.1231, + "step": 46547 + }, + { + "epoch": 3.031334780249593, + "grad_norm": 3.5784525871276855, + "learning_rate": 3.723859120186378e-06, + "loss": 0.0754, + "step": 46548 + }, + { + "epoch": 3.031348345089528, + "grad_norm": 3.0125210285186768, + "learning_rate": 3.7237220775661233e-06, + "loss": 0.0897, + "step": 46549 + }, + { + "epoch": 3.031361909929463, + "grad_norm": 3.9626305103302, + "learning_rate": 3.7235850349458685e-06, + "loss": 0.1499, + "step": 46550 + }, + { + "epoch": 3.0313754747693977, + "grad_norm": 5.444893836975098, + "learning_rate": 3.7234479923256133e-06, + "loss": 0.2554, + "step": 46551 + }, + { + "epoch": 3.0313890396093326, + "grad_norm": 4.453828811645508, + "learning_rate": 3.723310949705359e-06, + "loss": 0.1059, + "step": 46552 + }, + { + "epoch": 3.0314026044492675, + "grad_norm": 3.7843639850616455, + "learning_rate": 3.7231739070851036e-06, + "loss": 0.1339, + "step": 46553 + }, + { + "epoch": 3.0314161692892023, + "grad_norm": 3.544868230819702, + "learning_rate": 3.7230368644648492e-06, + "loss": 0.1312, + "step": 46554 + }, + { + "epoch": 3.031429734129137, + "grad_norm": 4.254932403564453, + "learning_rate": 3.722899821844594e-06, + "loss": 0.1217, + "step": 46555 + }, + { + "epoch": 3.031443298969072, + "grad_norm": 4.381211280822754, + "learning_rate": 3.722762779224339e-06, + "loss": 0.1288, + "step": 46556 + }, + { + "epoch": 3.031456863809007, + "grad_norm": 2.994715690612793, + "learning_rate": 3.722625736604084e-06, + "loss": 0.0643, + "step": 46557 + }, + { + "epoch": 3.031470428648942, + "grad_norm": 4.190120697021484, + "learning_rate": 3.7224886939838295e-06, + "loss": 0.1035, + "step": 46558 + }, + { + "epoch": 3.0314839934888766, + "grad_norm": 5.016704082489014, + "learning_rate": 3.7223516513635743e-06, + "loss": 0.1637, + "step": 46559 + }, + { + "epoch": 3.0314975583288115, + "grad_norm": 5.225803375244141, + "learning_rate": 3.72221460874332e-06, + "loss": 0.2152, + "step": 46560 + }, + { + "epoch": 3.031511123168747, + "grad_norm": 2.9832425117492676, + "learning_rate": 3.7220775661230646e-06, + "loss": 0.0823, + "step": 46561 + }, + { + "epoch": 3.0315246880086817, + "grad_norm": 4.043549537658691, + "learning_rate": 3.7219405235028094e-06, + "loss": 0.1248, + "step": 46562 + }, + { + "epoch": 3.0315382528486166, + "grad_norm": 6.651263236999512, + "learning_rate": 3.721803480882555e-06, + "loss": 0.2157, + "step": 46563 + }, + { + "epoch": 3.0315518176885514, + "grad_norm": 3.3139994144439697, + "learning_rate": 3.7216664382622997e-06, + "loss": 0.1114, + "step": 46564 + }, + { + "epoch": 3.0315653825284863, + "grad_norm": 4.210015773773193, + "learning_rate": 3.721529395642045e-06, + "loss": 0.1951, + "step": 46565 + }, + { + "epoch": 3.031578947368421, + "grad_norm": 4.932714462280273, + "learning_rate": 3.72139235302179e-06, + "loss": 0.2657, + "step": 46566 + }, + { + "epoch": 3.031592512208356, + "grad_norm": 4.395297527313232, + "learning_rate": 3.7212553104015353e-06, + "loss": 0.1717, + "step": 46567 + }, + { + "epoch": 3.031606077048291, + "grad_norm": 4.510305881500244, + "learning_rate": 3.72111826778128e-06, + "loss": 0.1038, + "step": 46568 + }, + { + "epoch": 3.0316196418882257, + "grad_norm": 4.299645900726318, + "learning_rate": 3.7209812251610256e-06, + "loss": 0.1594, + "step": 46569 + }, + { + "epoch": 3.0316332067281606, + "grad_norm": 4.611059188842773, + "learning_rate": 3.7208441825407704e-06, + "loss": 0.0993, + "step": 46570 + }, + { + "epoch": 3.0316467715680955, + "grad_norm": 6.783905982971191, + "learning_rate": 3.720707139920516e-06, + "loss": 0.125, + "step": 46571 + }, + { + "epoch": 3.0316603364080303, + "grad_norm": 4.865920066833496, + "learning_rate": 3.7205700973002607e-06, + "loss": 0.1657, + "step": 46572 + }, + { + "epoch": 3.031673901247965, + "grad_norm": 3.9638614654541016, + "learning_rate": 3.720433054680006e-06, + "loss": 0.1408, + "step": 46573 + }, + { + "epoch": 3.0316874660879, + "grad_norm": 4.564345836639404, + "learning_rate": 3.7202960120597507e-06, + "loss": 0.0921, + "step": 46574 + }, + { + "epoch": 3.031701030927835, + "grad_norm": 5.940965175628662, + "learning_rate": 3.720158969439496e-06, + "loss": 0.1777, + "step": 46575 + }, + { + "epoch": 3.03171459576777, + "grad_norm": 4.183570384979248, + "learning_rate": 3.720021926819241e-06, + "loss": 0.1237, + "step": 46576 + }, + { + "epoch": 3.0317281606077047, + "grad_norm": 4.495212554931641, + "learning_rate": 3.719884884198986e-06, + "loss": 0.0904, + "step": 46577 + }, + { + "epoch": 3.0317417254476395, + "grad_norm": 3.5700602531433105, + "learning_rate": 3.7197478415787314e-06, + "loss": 0.1424, + "step": 46578 + }, + { + "epoch": 3.0317552902875744, + "grad_norm": 4.333051681518555, + "learning_rate": 3.719610798958476e-06, + "loss": 0.1278, + "step": 46579 + }, + { + "epoch": 3.0317688551275097, + "grad_norm": 4.663031578063965, + "learning_rate": 3.7194737563382218e-06, + "loss": 0.1744, + "step": 46580 + }, + { + "epoch": 3.0317824199674446, + "grad_norm": 4.728066444396973, + "learning_rate": 3.7193367137179665e-06, + "loss": 0.1289, + "step": 46581 + }, + { + "epoch": 3.0317959848073794, + "grad_norm": 4.083491325378418, + "learning_rate": 3.7191996710977117e-06, + "loss": 0.069, + "step": 46582 + }, + { + "epoch": 3.0318095496473143, + "grad_norm": 4.5735344886779785, + "learning_rate": 3.719062628477457e-06, + "loss": 0.1205, + "step": 46583 + }, + { + "epoch": 3.031823114487249, + "grad_norm": 4.511038303375244, + "learning_rate": 3.718925585857202e-06, + "loss": 0.1328, + "step": 46584 + }, + { + "epoch": 3.031836679327184, + "grad_norm": 4.166838645935059, + "learning_rate": 3.718788543236947e-06, + "loss": 0.1409, + "step": 46585 + }, + { + "epoch": 3.031850244167119, + "grad_norm": 4.8467793464660645, + "learning_rate": 3.7186515006166924e-06, + "loss": 0.165, + "step": 46586 + }, + { + "epoch": 3.0318638090070538, + "grad_norm": 4.867868423461914, + "learning_rate": 3.718514457996437e-06, + "loss": 0.1115, + "step": 46587 + }, + { + "epoch": 3.0318773738469886, + "grad_norm": 4.388741970062256, + "learning_rate": 3.718377415376182e-06, + "loss": 0.1239, + "step": 46588 + }, + { + "epoch": 3.0318909386869235, + "grad_norm": 3.9844911098480225, + "learning_rate": 3.7182403727559275e-06, + "loss": 0.1818, + "step": 46589 + }, + { + "epoch": 3.0319045035268584, + "grad_norm": 3.4346861839294434, + "learning_rate": 3.7181033301356723e-06, + "loss": 0.1085, + "step": 46590 + }, + { + "epoch": 3.0319180683667932, + "grad_norm": 4.034808158874512, + "learning_rate": 3.717966287515418e-06, + "loss": 0.1232, + "step": 46591 + }, + { + "epoch": 3.031931633206728, + "grad_norm": 7.080110549926758, + "learning_rate": 3.7178292448951626e-06, + "loss": 0.1542, + "step": 46592 + }, + { + "epoch": 3.031945198046663, + "grad_norm": 3.28605580329895, + "learning_rate": 3.717692202274908e-06, + "loss": 0.047, + "step": 46593 + }, + { + "epoch": 3.031958762886598, + "grad_norm": 4.247119903564453, + "learning_rate": 3.7175551596546526e-06, + "loss": 0.1254, + "step": 46594 + }, + { + "epoch": 3.0319723277265327, + "grad_norm": 3.791332244873047, + "learning_rate": 3.717418117034398e-06, + "loss": 0.049, + "step": 46595 + }, + { + "epoch": 3.0319858925664676, + "grad_norm": 2.593752861022949, + "learning_rate": 3.717281074414143e-06, + "loss": 0.0748, + "step": 46596 + }, + { + "epoch": 3.0319994574064024, + "grad_norm": 4.492000102996826, + "learning_rate": 3.7171440317938885e-06, + "loss": 0.1415, + "step": 46597 + }, + { + "epoch": 3.0320130222463373, + "grad_norm": 4.017098426818848, + "learning_rate": 3.7170069891736333e-06, + "loss": 0.0766, + "step": 46598 + }, + { + "epoch": 3.0320265870862726, + "grad_norm": 2.813098669052124, + "learning_rate": 3.7168699465533785e-06, + "loss": 0.0791, + "step": 46599 + }, + { + "epoch": 3.0320401519262075, + "grad_norm": 3.870025873184204, + "learning_rate": 3.7167329039331236e-06, + "loss": 0.0898, + "step": 46600 + }, + { + "epoch": 3.0320537167661423, + "grad_norm": 3.5158121585845947, + "learning_rate": 3.716595861312869e-06, + "loss": 0.0769, + "step": 46601 + }, + { + "epoch": 3.032067281606077, + "grad_norm": 3.6199140548706055, + "learning_rate": 3.7164588186926136e-06, + "loss": 0.1005, + "step": 46602 + }, + { + "epoch": 3.032080846446012, + "grad_norm": 4.2136406898498535, + "learning_rate": 3.7163217760723587e-06, + "loss": 0.0893, + "step": 46603 + }, + { + "epoch": 3.032094411285947, + "grad_norm": 4.372261047363281, + "learning_rate": 3.716184733452104e-06, + "loss": 0.1036, + "step": 46604 + }, + { + "epoch": 3.032107976125882, + "grad_norm": 4.190074920654297, + "learning_rate": 3.7160476908318487e-06, + "loss": 0.0844, + "step": 46605 + }, + { + "epoch": 3.0321215409658167, + "grad_norm": 4.289199352264404, + "learning_rate": 3.7159106482115943e-06, + "loss": 0.1072, + "step": 46606 + }, + { + "epoch": 3.0321351058057515, + "grad_norm": 3.312744617462158, + "learning_rate": 3.715773605591339e-06, + "loss": 0.0664, + "step": 46607 + }, + { + "epoch": 3.0321486706456864, + "grad_norm": 2.3404839038848877, + "learning_rate": 3.7156365629710846e-06, + "loss": 0.0358, + "step": 46608 + }, + { + "epoch": 3.0321622354856212, + "grad_norm": 3.391937255859375, + "learning_rate": 3.7154995203508294e-06, + "loss": 0.0794, + "step": 46609 + }, + { + "epoch": 3.032175800325556, + "grad_norm": 4.001002788543701, + "learning_rate": 3.7153624777305746e-06, + "loss": 0.1262, + "step": 46610 + }, + { + "epoch": 3.032189365165491, + "grad_norm": 4.087409496307373, + "learning_rate": 3.7152254351103193e-06, + "loss": 0.1528, + "step": 46611 + }, + { + "epoch": 3.032202930005426, + "grad_norm": 4.337540626525879, + "learning_rate": 3.715088392490065e-06, + "loss": 0.1508, + "step": 46612 + }, + { + "epoch": 3.0322164948453607, + "grad_norm": 4.304215431213379, + "learning_rate": 3.7149513498698097e-06, + "loss": 0.1185, + "step": 46613 + }, + { + "epoch": 3.0322300596852956, + "grad_norm": 3.5399441719055176, + "learning_rate": 3.7148143072495553e-06, + "loss": 0.07, + "step": 46614 + }, + { + "epoch": 3.0322436245252304, + "grad_norm": 3.603822708129883, + "learning_rate": 3.7146772646293e-06, + "loss": 0.06, + "step": 46615 + }, + { + "epoch": 3.0322571893651653, + "grad_norm": 2.971081256866455, + "learning_rate": 3.714540222009045e-06, + "loss": 0.0512, + "step": 46616 + }, + { + "epoch": 3.0322707542051, + "grad_norm": 4.053326606750488, + "learning_rate": 3.7144031793887904e-06, + "loss": 0.1009, + "step": 46617 + }, + { + "epoch": 3.0322843190450355, + "grad_norm": 2.7044196128845215, + "learning_rate": 3.714266136768535e-06, + "loss": 0.0638, + "step": 46618 + }, + { + "epoch": 3.0322978838849703, + "grad_norm": 4.874422073364258, + "learning_rate": 3.7141290941482803e-06, + "loss": 0.0967, + "step": 46619 + }, + { + "epoch": 3.032311448724905, + "grad_norm": 4.711963176727295, + "learning_rate": 3.7139920515280255e-06, + "loss": 0.1096, + "step": 46620 + }, + { + "epoch": 3.03232501356484, + "grad_norm": 3.3949904441833496, + "learning_rate": 3.7138550089077707e-06, + "loss": 0.0905, + "step": 46621 + }, + { + "epoch": 3.032338578404775, + "grad_norm": 4.04114294052124, + "learning_rate": 3.7137179662875155e-06, + "loss": 0.0809, + "step": 46622 + }, + { + "epoch": 3.03235214324471, + "grad_norm": 2.7228007316589355, + "learning_rate": 3.713580923667261e-06, + "loss": 0.0822, + "step": 46623 + }, + { + "epoch": 3.0323657080846447, + "grad_norm": 4.335296630859375, + "learning_rate": 3.713443881047006e-06, + "loss": 0.1154, + "step": 46624 + }, + { + "epoch": 3.0323792729245795, + "grad_norm": 3.9387195110321045, + "learning_rate": 3.7133068384267514e-06, + "loss": 0.0796, + "step": 46625 + }, + { + "epoch": 3.0323928377645144, + "grad_norm": 4.9058637619018555, + "learning_rate": 3.713169795806496e-06, + "loss": 0.09, + "step": 46626 + }, + { + "epoch": 3.0324064026044493, + "grad_norm": 5.750135898590088, + "learning_rate": 3.7130327531862413e-06, + "loss": 0.1142, + "step": 46627 + }, + { + "epoch": 3.032419967444384, + "grad_norm": 3.457869529724121, + "learning_rate": 3.712895710565986e-06, + "loss": 0.0674, + "step": 46628 + }, + { + "epoch": 3.032433532284319, + "grad_norm": 5.168114185333252, + "learning_rate": 3.7127586679457317e-06, + "loss": 0.1159, + "step": 46629 + }, + { + "epoch": 3.032447097124254, + "grad_norm": 5.539256572723389, + "learning_rate": 3.7126216253254765e-06, + "loss": 0.1257, + "step": 46630 + }, + { + "epoch": 3.0324606619641887, + "grad_norm": 3.65598464012146, + "learning_rate": 3.7124845827052212e-06, + "loss": 0.0882, + "step": 46631 + }, + { + "epoch": 3.0324742268041236, + "grad_norm": 3.6994669437408447, + "learning_rate": 3.712347540084967e-06, + "loss": 0.1157, + "step": 46632 + }, + { + "epoch": 3.0324877916440585, + "grad_norm": 3.1410324573516846, + "learning_rate": 3.7122104974647116e-06, + "loss": 0.0698, + "step": 46633 + }, + { + "epoch": 3.0325013564839933, + "grad_norm": 5.030690670013428, + "learning_rate": 3.712073454844457e-06, + "loss": 0.1557, + "step": 46634 + }, + { + "epoch": 3.032514921323928, + "grad_norm": 5.255678176879883, + "learning_rate": 3.711936412224202e-06, + "loss": 0.1075, + "step": 46635 + }, + { + "epoch": 3.032528486163863, + "grad_norm": 4.028531074523926, + "learning_rate": 3.711799369603947e-06, + "loss": 0.089, + "step": 46636 + }, + { + "epoch": 3.0325420510037984, + "grad_norm": 4.355154991149902, + "learning_rate": 3.7116623269836923e-06, + "loss": 0.1443, + "step": 46637 + }, + { + "epoch": 3.0325556158437332, + "grad_norm": 5.356024265289307, + "learning_rate": 3.7115252843634375e-06, + "loss": 0.1051, + "step": 46638 + }, + { + "epoch": 3.032569180683668, + "grad_norm": 4.779271125793457, + "learning_rate": 3.7113882417431822e-06, + "loss": 0.0915, + "step": 46639 + }, + { + "epoch": 3.032582745523603, + "grad_norm": 4.492919921875, + "learning_rate": 3.711251199122928e-06, + "loss": 0.0746, + "step": 46640 + }, + { + "epoch": 3.032596310363538, + "grad_norm": 2.91890287399292, + "learning_rate": 3.7111141565026726e-06, + "loss": 0.0793, + "step": 46641 + }, + { + "epoch": 3.0326098752034727, + "grad_norm": 3.67431640625, + "learning_rate": 3.710977113882418e-06, + "loss": 0.1213, + "step": 46642 + }, + { + "epoch": 3.0326234400434076, + "grad_norm": 3.637781858444214, + "learning_rate": 3.710840071262163e-06, + "loss": 0.0858, + "step": 46643 + }, + { + "epoch": 3.0326370048833424, + "grad_norm": 3.589729070663452, + "learning_rate": 3.7107030286419077e-06, + "loss": 0.0841, + "step": 46644 + }, + { + "epoch": 3.0326505697232773, + "grad_norm": 4.626495361328125, + "learning_rate": 3.710565986021653e-06, + "loss": 0.1491, + "step": 46645 + }, + { + "epoch": 3.032664134563212, + "grad_norm": 4.487843990325928, + "learning_rate": 3.710428943401398e-06, + "loss": 0.1283, + "step": 46646 + }, + { + "epoch": 3.032677699403147, + "grad_norm": 2.989954710006714, + "learning_rate": 3.7102919007811432e-06, + "loss": 0.0888, + "step": 46647 + }, + { + "epoch": 3.032691264243082, + "grad_norm": 3.257472515106201, + "learning_rate": 3.710154858160888e-06, + "loss": 0.0932, + "step": 46648 + }, + { + "epoch": 3.0327048290830168, + "grad_norm": 2.776916980743408, + "learning_rate": 3.7100178155406336e-06, + "loss": 0.0525, + "step": 46649 + }, + { + "epoch": 3.0327183939229516, + "grad_norm": 5.499348163604736, + "learning_rate": 3.7098807729203783e-06, + "loss": 0.147, + "step": 46650 + }, + { + "epoch": 3.0327319587628865, + "grad_norm": 3.612377882003784, + "learning_rate": 3.709743730300124e-06, + "loss": 0.084, + "step": 46651 + }, + { + "epoch": 3.0327455236028213, + "grad_norm": 4.2001261711120605, + "learning_rate": 3.7096066876798687e-06, + "loss": 0.1221, + "step": 46652 + }, + { + "epoch": 3.032759088442756, + "grad_norm": 5.558562278747559, + "learning_rate": 3.709469645059614e-06, + "loss": 0.1631, + "step": 46653 + }, + { + "epoch": 3.032772653282691, + "grad_norm": 2.4439783096313477, + "learning_rate": 3.709332602439359e-06, + "loss": 0.0467, + "step": 46654 + }, + { + "epoch": 3.032786218122626, + "grad_norm": 4.9688029289245605, + "learning_rate": 3.7091955598191042e-06, + "loss": 0.1159, + "step": 46655 + }, + { + "epoch": 3.0327997829625613, + "grad_norm": 2.4911251068115234, + "learning_rate": 3.709058517198849e-06, + "loss": 0.0722, + "step": 46656 + }, + { + "epoch": 3.032813347802496, + "grad_norm": 5.363651275634766, + "learning_rate": 3.708921474578594e-06, + "loss": 0.12, + "step": 46657 + }, + { + "epoch": 3.032826912642431, + "grad_norm": 5.202852725982666, + "learning_rate": 3.7087844319583394e-06, + "loss": 0.1118, + "step": 46658 + }, + { + "epoch": 3.032840477482366, + "grad_norm": 3.90360426902771, + "learning_rate": 3.708647389338084e-06, + "loss": 0.0916, + "step": 46659 + }, + { + "epoch": 3.0328540423223007, + "grad_norm": 3.5731799602508545, + "learning_rate": 3.7085103467178297e-06, + "loss": 0.0915, + "step": 46660 + }, + { + "epoch": 3.0328676071622356, + "grad_norm": 3.158423900604248, + "learning_rate": 3.7083733040975745e-06, + "loss": 0.05, + "step": 46661 + }, + { + "epoch": 3.0328811720021704, + "grad_norm": 2.2365670204162598, + "learning_rate": 3.7082362614773196e-06, + "loss": 0.0522, + "step": 46662 + }, + { + "epoch": 3.0328947368421053, + "grad_norm": 2.7989001274108887, + "learning_rate": 3.708099218857065e-06, + "loss": 0.0595, + "step": 46663 + }, + { + "epoch": 3.03290830168204, + "grad_norm": 4.125053405761719, + "learning_rate": 3.70796217623681e-06, + "loss": 0.1117, + "step": 46664 + }, + { + "epoch": 3.032921866521975, + "grad_norm": 2.6566483974456787, + "learning_rate": 3.7078251336165548e-06, + "loss": 0.062, + "step": 46665 + }, + { + "epoch": 3.03293543136191, + "grad_norm": 4.711365699768066, + "learning_rate": 3.7076880909963004e-06, + "loss": 0.1258, + "step": 46666 + }, + { + "epoch": 3.0329489962018448, + "grad_norm": 3.002689838409424, + "learning_rate": 3.707551048376045e-06, + "loss": 0.0899, + "step": 46667 + }, + { + "epoch": 3.0329625610417796, + "grad_norm": 4.286508083343506, + "learning_rate": 3.7074140057557907e-06, + "loss": 0.1222, + "step": 46668 + }, + { + "epoch": 3.0329761258817145, + "grad_norm": 3.898387908935547, + "learning_rate": 3.7072769631355355e-06, + "loss": 0.0906, + "step": 46669 + }, + { + "epoch": 3.0329896907216494, + "grad_norm": 3.5978586673736572, + "learning_rate": 3.7071399205152807e-06, + "loss": 0.0772, + "step": 46670 + }, + { + "epoch": 3.0330032555615842, + "grad_norm": 4.284991264343262, + "learning_rate": 3.707002877895026e-06, + "loss": 0.084, + "step": 46671 + }, + { + "epoch": 3.033016820401519, + "grad_norm": 4.698855400085449, + "learning_rate": 3.7068658352747706e-06, + "loss": 0.1131, + "step": 46672 + }, + { + "epoch": 3.033030385241454, + "grad_norm": 2.963541269302368, + "learning_rate": 3.7067287926545158e-06, + "loss": 0.0836, + "step": 46673 + }, + { + "epoch": 3.033043950081389, + "grad_norm": 2.3160207271575928, + "learning_rate": 3.706591750034261e-06, + "loss": 0.0446, + "step": 46674 + }, + { + "epoch": 3.033057514921324, + "grad_norm": 3.032839298248291, + "learning_rate": 3.706454707414006e-06, + "loss": 0.0523, + "step": 46675 + }, + { + "epoch": 3.033071079761259, + "grad_norm": 2.7249205112457275, + "learning_rate": 3.706317664793751e-06, + "loss": 0.0732, + "step": 46676 + }, + { + "epoch": 3.033084644601194, + "grad_norm": 2.2074475288391113, + "learning_rate": 3.7061806221734965e-06, + "loss": 0.046, + "step": 46677 + }, + { + "epoch": 3.0330982094411287, + "grad_norm": 3.5388569831848145, + "learning_rate": 3.7060435795532412e-06, + "loss": 0.0918, + "step": 46678 + }, + { + "epoch": 3.0331117742810636, + "grad_norm": 3.0913217067718506, + "learning_rate": 3.705906536932987e-06, + "loss": 0.0407, + "step": 46679 + }, + { + "epoch": 3.0331253391209985, + "grad_norm": 2.8817384243011475, + "learning_rate": 3.7057694943127316e-06, + "loss": 0.0482, + "step": 46680 + }, + { + "epoch": 3.0331389039609333, + "grad_norm": 4.0599260330200195, + "learning_rate": 3.7056324516924768e-06, + "loss": 0.1166, + "step": 46681 + }, + { + "epoch": 3.033152468800868, + "grad_norm": 4.3237528800964355, + "learning_rate": 3.7054954090722215e-06, + "loss": 0.0824, + "step": 46682 + }, + { + "epoch": 3.033166033640803, + "grad_norm": 3.3045082092285156, + "learning_rate": 3.705358366451967e-06, + "loss": 0.0839, + "step": 46683 + }, + { + "epoch": 3.033179598480738, + "grad_norm": 3.2978851795196533, + "learning_rate": 3.705221323831712e-06, + "loss": 0.0882, + "step": 46684 + }, + { + "epoch": 3.033193163320673, + "grad_norm": 4.3830342292785645, + "learning_rate": 3.7050842812114566e-06, + "loss": 0.1335, + "step": 46685 + }, + { + "epoch": 3.0332067281606077, + "grad_norm": 3.678863525390625, + "learning_rate": 3.7049472385912022e-06, + "loss": 0.07, + "step": 46686 + }, + { + "epoch": 3.0332202930005425, + "grad_norm": 3.0138540267944336, + "learning_rate": 3.704810195970947e-06, + "loss": 0.0784, + "step": 46687 + }, + { + "epoch": 3.0332338578404774, + "grad_norm": 3.597839832305908, + "learning_rate": 3.7046731533506926e-06, + "loss": 0.0641, + "step": 46688 + }, + { + "epoch": 3.0332474226804123, + "grad_norm": 3.8507747650146484, + "learning_rate": 3.7045361107304374e-06, + "loss": 0.0755, + "step": 46689 + }, + { + "epoch": 3.033260987520347, + "grad_norm": 5.626134872436523, + "learning_rate": 3.7043990681101825e-06, + "loss": 0.1041, + "step": 46690 + }, + { + "epoch": 3.033274552360282, + "grad_norm": 3.881847620010376, + "learning_rate": 3.7042620254899277e-06, + "loss": 0.1039, + "step": 46691 + }, + { + "epoch": 3.033288117200217, + "grad_norm": 4.421785831451416, + "learning_rate": 3.704124982869673e-06, + "loss": 0.1287, + "step": 46692 + }, + { + "epoch": 3.0333016820401517, + "grad_norm": 4.364744186401367, + "learning_rate": 3.7039879402494176e-06, + "loss": 0.0766, + "step": 46693 + }, + { + "epoch": 3.033315246880087, + "grad_norm": 4.8797173500061035, + "learning_rate": 3.7038508976291632e-06, + "loss": 0.1055, + "step": 46694 + }, + { + "epoch": 3.033328811720022, + "grad_norm": 3.0951008796691895, + "learning_rate": 3.703713855008908e-06, + "loss": 0.0575, + "step": 46695 + }, + { + "epoch": 3.0333423765599568, + "grad_norm": 2.7129855155944824, + "learning_rate": 3.7035768123886536e-06, + "loss": 0.0656, + "step": 46696 + }, + { + "epoch": 3.0333559413998916, + "grad_norm": 4.537012100219727, + "learning_rate": 3.7034397697683984e-06, + "loss": 0.0734, + "step": 46697 + }, + { + "epoch": 3.0333695062398265, + "grad_norm": 3.3446507453918457, + "learning_rate": 3.7033027271481435e-06, + "loss": 0.1296, + "step": 46698 + }, + { + "epoch": 3.0333830710797614, + "grad_norm": 4.204111576080322, + "learning_rate": 3.7031656845278883e-06, + "loss": 0.1185, + "step": 46699 + }, + { + "epoch": 3.033396635919696, + "grad_norm": 3.0238518714904785, + "learning_rate": 3.7030286419076335e-06, + "loss": 0.0746, + "step": 46700 + }, + { + "epoch": 3.033410200759631, + "grad_norm": 3.5897395610809326, + "learning_rate": 3.7028915992873787e-06, + "loss": 0.1078, + "step": 46701 + }, + { + "epoch": 3.033423765599566, + "grad_norm": 3.2495808601379395, + "learning_rate": 3.7027545566671234e-06, + "loss": 0.0643, + "step": 46702 + }, + { + "epoch": 3.033437330439501, + "grad_norm": 4.106450080871582, + "learning_rate": 3.702617514046869e-06, + "loss": 0.1165, + "step": 46703 + }, + { + "epoch": 3.0334508952794357, + "grad_norm": 3.756474018096924, + "learning_rate": 3.7024804714266138e-06, + "loss": 0.1039, + "step": 46704 + }, + { + "epoch": 3.0334644601193705, + "grad_norm": 2.7162744998931885, + "learning_rate": 3.7023434288063594e-06, + "loss": 0.0729, + "step": 46705 + }, + { + "epoch": 3.0334780249593054, + "grad_norm": 6.26535701751709, + "learning_rate": 3.702206386186104e-06, + "loss": 0.1478, + "step": 46706 + }, + { + "epoch": 3.0334915897992403, + "grad_norm": 3.451103925704956, + "learning_rate": 3.7020693435658493e-06, + "loss": 0.0895, + "step": 46707 + }, + { + "epoch": 3.033505154639175, + "grad_norm": 4.1372456550598145, + "learning_rate": 3.7019323009455945e-06, + "loss": 0.0718, + "step": 46708 + }, + { + "epoch": 3.03351871947911, + "grad_norm": 4.393223285675049, + "learning_rate": 3.7017952583253397e-06, + "loss": 0.1035, + "step": 46709 + }, + { + "epoch": 3.033532284319045, + "grad_norm": 3.607496976852417, + "learning_rate": 3.7016582157050844e-06, + "loss": 0.0902, + "step": 46710 + }, + { + "epoch": 3.0335458491589797, + "grad_norm": 3.593651294708252, + "learning_rate": 3.70152117308483e-06, + "loss": 0.0641, + "step": 46711 + }, + { + "epoch": 3.0335594139989146, + "grad_norm": 3.1443963050842285, + "learning_rate": 3.7013841304645748e-06, + "loss": 0.066, + "step": 46712 + }, + { + "epoch": 3.03357297883885, + "grad_norm": 3.05545973777771, + "learning_rate": 3.7012470878443195e-06, + "loss": 0.0581, + "step": 46713 + }, + { + "epoch": 3.0335865436787848, + "grad_norm": 3.927593946456909, + "learning_rate": 3.701110045224065e-06, + "loss": 0.1504, + "step": 46714 + }, + { + "epoch": 3.0336001085187196, + "grad_norm": 3.680238962173462, + "learning_rate": 3.70097300260381e-06, + "loss": 0.1037, + "step": 46715 + }, + { + "epoch": 3.0336136733586545, + "grad_norm": 4.116116523742676, + "learning_rate": 3.700835959983555e-06, + "loss": 0.1424, + "step": 46716 + }, + { + "epoch": 3.0336272381985894, + "grad_norm": 4.1045122146606445, + "learning_rate": 3.7006989173633002e-06, + "loss": 0.0935, + "step": 46717 + }, + { + "epoch": 3.0336408030385242, + "grad_norm": 4.2522172927856445, + "learning_rate": 3.7005618747430454e-06, + "loss": 0.0955, + "step": 46718 + }, + { + "epoch": 3.033654367878459, + "grad_norm": 4.973576068878174, + "learning_rate": 3.70042483212279e-06, + "loss": 0.182, + "step": 46719 + }, + { + "epoch": 3.033667932718394, + "grad_norm": 3.5645904541015625, + "learning_rate": 3.7002877895025358e-06, + "loss": 0.0917, + "step": 46720 + }, + { + "epoch": 3.033681497558329, + "grad_norm": 5.374492168426514, + "learning_rate": 3.7001507468822805e-06, + "loss": 0.1484, + "step": 46721 + }, + { + "epoch": 3.0336950623982637, + "grad_norm": 5.076892375946045, + "learning_rate": 3.700013704262026e-06, + "loss": 0.1094, + "step": 46722 + }, + { + "epoch": 3.0337086272381986, + "grad_norm": 4.751781463623047, + "learning_rate": 3.699876661641771e-06, + "loss": 0.1708, + "step": 46723 + }, + { + "epoch": 3.0337221920781334, + "grad_norm": 3.6927988529205322, + "learning_rate": 3.699739619021516e-06, + "loss": 0.0952, + "step": 46724 + }, + { + "epoch": 3.0337357569180683, + "grad_norm": 3.151663303375244, + "learning_rate": 3.6996025764012613e-06, + "loss": 0.0588, + "step": 46725 + }, + { + "epoch": 3.033749321758003, + "grad_norm": 4.705692291259766, + "learning_rate": 3.699465533781006e-06, + "loss": 0.1066, + "step": 46726 + }, + { + "epoch": 3.033762886597938, + "grad_norm": 3.625924825668335, + "learning_rate": 3.699328491160751e-06, + "loss": 0.1268, + "step": 46727 + }, + { + "epoch": 3.033776451437873, + "grad_norm": 3.7776660919189453, + "learning_rate": 3.6991914485404964e-06, + "loss": 0.1045, + "step": 46728 + }, + { + "epoch": 3.0337900162778078, + "grad_norm": 4.6341753005981445, + "learning_rate": 3.6990544059202415e-06, + "loss": 0.1645, + "step": 46729 + }, + { + "epoch": 3.0338035811177426, + "grad_norm": 3.437451124191284, + "learning_rate": 3.6989173632999863e-06, + "loss": 0.0916, + "step": 46730 + }, + { + "epoch": 3.0338171459576775, + "grad_norm": 4.607779502868652, + "learning_rate": 3.698780320679732e-06, + "loss": 0.2062, + "step": 46731 + }, + { + "epoch": 3.033830710797613, + "grad_norm": 3.381010055541992, + "learning_rate": 3.6986432780594767e-06, + "loss": 0.0832, + "step": 46732 + }, + { + "epoch": 3.0338442756375477, + "grad_norm": 5.3812971115112305, + "learning_rate": 3.698506235439222e-06, + "loss": 0.1627, + "step": 46733 + }, + { + "epoch": 3.0338578404774825, + "grad_norm": 4.071030139923096, + "learning_rate": 3.698369192818967e-06, + "loss": 0.0852, + "step": 46734 + }, + { + "epoch": 3.0338714053174174, + "grad_norm": 3.8903210163116455, + "learning_rate": 3.698232150198712e-06, + "loss": 0.1446, + "step": 46735 + }, + { + "epoch": 3.0338849701573523, + "grad_norm": 3.7861204147338867, + "learning_rate": 3.698095107578457e-06, + "loss": 0.088, + "step": 46736 + }, + { + "epoch": 3.033898534997287, + "grad_norm": 4.0508036613464355, + "learning_rate": 3.6979580649582026e-06, + "loss": 0.1282, + "step": 46737 + }, + { + "epoch": 3.033912099837222, + "grad_norm": 4.238513946533203, + "learning_rate": 3.6978210223379473e-06, + "loss": 0.1683, + "step": 46738 + }, + { + "epoch": 3.033925664677157, + "grad_norm": 4.283242225646973, + "learning_rate": 3.697683979717693e-06, + "loss": 0.0778, + "step": 46739 + }, + { + "epoch": 3.0339392295170917, + "grad_norm": 3.2833681106567383, + "learning_rate": 3.6975469370974377e-06, + "loss": 0.1124, + "step": 46740 + }, + { + "epoch": 3.0339527943570266, + "grad_norm": 4.559174537658691, + "learning_rate": 3.6974098944771824e-06, + "loss": 0.0685, + "step": 46741 + }, + { + "epoch": 3.0339663591969614, + "grad_norm": 4.4709248542785645, + "learning_rate": 3.697272851856928e-06, + "loss": 0.1355, + "step": 46742 + }, + { + "epoch": 3.0339799240368963, + "grad_norm": 3.8123350143432617, + "learning_rate": 3.6971358092366728e-06, + "loss": 0.0827, + "step": 46743 + }, + { + "epoch": 3.033993488876831, + "grad_norm": 4.869870185852051, + "learning_rate": 3.696998766616418e-06, + "loss": 0.1362, + "step": 46744 + }, + { + "epoch": 3.034007053716766, + "grad_norm": 4.494709491729736, + "learning_rate": 3.696861723996163e-06, + "loss": 0.1007, + "step": 46745 + }, + { + "epoch": 3.034020618556701, + "grad_norm": 4.467729091644287, + "learning_rate": 3.6967246813759083e-06, + "loss": 0.1034, + "step": 46746 + }, + { + "epoch": 3.0340341833966358, + "grad_norm": 5.399270534515381, + "learning_rate": 3.696587638755653e-06, + "loss": 0.1424, + "step": 46747 + }, + { + "epoch": 3.0340477482365706, + "grad_norm": 4.418285369873047, + "learning_rate": 3.6964505961353987e-06, + "loss": 0.1831, + "step": 46748 + }, + { + "epoch": 3.0340613130765055, + "grad_norm": 3.643761396408081, + "learning_rate": 3.6963135535151434e-06, + "loss": 0.1626, + "step": 46749 + }, + { + "epoch": 3.0340748779164404, + "grad_norm": 5.263571262359619, + "learning_rate": 3.696176510894889e-06, + "loss": 0.108, + "step": 46750 + }, + { + "epoch": 3.0340884427563757, + "grad_norm": 3.440150737762451, + "learning_rate": 3.6960394682746338e-06, + "loss": 0.1155, + "step": 46751 + }, + { + "epoch": 3.0341020075963105, + "grad_norm": 5.3402323722839355, + "learning_rate": 3.695902425654379e-06, + "loss": 0.2546, + "step": 46752 + }, + { + "epoch": 3.0341155724362454, + "grad_norm": 3.2056586742401123, + "learning_rate": 3.6957653830341237e-06, + "loss": 0.0966, + "step": 46753 + }, + { + "epoch": 3.0341291372761803, + "grad_norm": 4.7168145179748535, + "learning_rate": 3.695628340413869e-06, + "loss": 0.0958, + "step": 46754 + }, + { + "epoch": 3.034142702116115, + "grad_norm": 2.496995210647583, + "learning_rate": 3.695491297793614e-06, + "loss": 0.0487, + "step": 46755 + }, + { + "epoch": 3.03415626695605, + "grad_norm": 3.2835636138916016, + "learning_rate": 3.695354255173359e-06, + "loss": 0.0767, + "step": 46756 + }, + { + "epoch": 3.034169831795985, + "grad_norm": 4.792696952819824, + "learning_rate": 3.6952172125531044e-06, + "loss": 0.1043, + "step": 46757 + }, + { + "epoch": 3.0341833966359197, + "grad_norm": 4.443469524383545, + "learning_rate": 3.695080169932849e-06, + "loss": 0.1113, + "step": 46758 + }, + { + "epoch": 3.0341969614758546, + "grad_norm": 4.7581257820129395, + "learning_rate": 3.694943127312595e-06, + "loss": 0.103, + "step": 46759 + }, + { + "epoch": 3.0342105263157895, + "grad_norm": 4.394198417663574, + "learning_rate": 3.6948060846923395e-06, + "loss": 0.104, + "step": 46760 + }, + { + "epoch": 3.0342240911557243, + "grad_norm": 5.945438385009766, + "learning_rate": 3.6946690420720847e-06, + "loss": 0.103, + "step": 46761 + }, + { + "epoch": 3.034237655995659, + "grad_norm": 5.277115821838379, + "learning_rate": 3.69453199945183e-06, + "loss": 0.0693, + "step": 46762 + }, + { + "epoch": 3.034251220835594, + "grad_norm": 4.604868412017822, + "learning_rate": 3.694394956831575e-06, + "loss": 0.1378, + "step": 46763 + }, + { + "epoch": 3.034264785675529, + "grad_norm": 3.9507815837860107, + "learning_rate": 3.69425791421132e-06, + "loss": 0.109, + "step": 46764 + }, + { + "epoch": 3.034278350515464, + "grad_norm": 4.793880939483643, + "learning_rate": 3.6941208715910654e-06, + "loss": 0.1261, + "step": 46765 + }, + { + "epoch": 3.0342919153553987, + "grad_norm": 4.844704627990723, + "learning_rate": 3.69398382897081e-06, + "loss": 0.0998, + "step": 46766 + }, + { + "epoch": 3.0343054801953335, + "grad_norm": 4.990310192108154, + "learning_rate": 3.693846786350556e-06, + "loss": 0.2148, + "step": 46767 + }, + { + "epoch": 3.0343190450352684, + "grad_norm": 4.489482402801514, + "learning_rate": 3.6937097437303006e-06, + "loss": 0.1246, + "step": 46768 + }, + { + "epoch": 3.0343326098752033, + "grad_norm": 4.480895042419434, + "learning_rate": 3.6935727011100453e-06, + "loss": 0.1232, + "step": 46769 + }, + { + "epoch": 3.0343461747151386, + "grad_norm": 3.769174575805664, + "learning_rate": 3.6934356584897905e-06, + "loss": 0.1085, + "step": 46770 + }, + { + "epoch": 3.0343597395550734, + "grad_norm": 4.989015102386475, + "learning_rate": 3.6932986158695357e-06, + "loss": 0.1636, + "step": 46771 + }, + { + "epoch": 3.0343733043950083, + "grad_norm": 4.267027378082275, + "learning_rate": 3.693161573249281e-06, + "loss": 0.0738, + "step": 46772 + }, + { + "epoch": 3.034386869234943, + "grad_norm": 5.365907669067383, + "learning_rate": 3.6930245306290256e-06, + "loss": 0.1105, + "step": 46773 + }, + { + "epoch": 3.034400434074878, + "grad_norm": 6.4539008140563965, + "learning_rate": 3.692887488008771e-06, + "loss": 0.2069, + "step": 46774 + }, + { + "epoch": 3.034413998914813, + "grad_norm": 3.95896577835083, + "learning_rate": 3.692750445388516e-06, + "loss": 0.0904, + "step": 46775 + }, + { + "epoch": 3.0344275637547478, + "grad_norm": 3.5844202041625977, + "learning_rate": 3.6926134027682616e-06, + "loss": 0.1783, + "step": 46776 + }, + { + "epoch": 3.0344411285946826, + "grad_norm": 6.636297225952148, + "learning_rate": 3.6924763601480063e-06, + "loss": 0.1602, + "step": 46777 + }, + { + "epoch": 3.0344546934346175, + "grad_norm": 3.840041399002075, + "learning_rate": 3.6923393175277515e-06, + "loss": 0.1269, + "step": 46778 + }, + { + "epoch": 3.0344682582745524, + "grad_norm": 4.955026149749756, + "learning_rate": 3.6922022749074967e-06, + "loss": 0.1256, + "step": 46779 + }, + { + "epoch": 3.034481823114487, + "grad_norm": 3.590986728668213, + "learning_rate": 3.692065232287242e-06, + "loss": 0.1417, + "step": 46780 + }, + { + "epoch": 3.034495387954422, + "grad_norm": 4.449941158294678, + "learning_rate": 3.6919281896669866e-06, + "loss": 0.1048, + "step": 46781 + }, + { + "epoch": 3.034508952794357, + "grad_norm": 4.160567760467529, + "learning_rate": 3.691791147046732e-06, + "loss": 0.071, + "step": 46782 + }, + { + "epoch": 3.034522517634292, + "grad_norm": 6.542055606842041, + "learning_rate": 3.691654104426477e-06, + "loss": 0.2028, + "step": 46783 + }, + { + "epoch": 3.0345360824742267, + "grad_norm": 3.662611722946167, + "learning_rate": 3.6915170618062217e-06, + "loss": 0.155, + "step": 46784 + }, + { + "epoch": 3.0345496473141615, + "grad_norm": 3.3124940395355225, + "learning_rate": 3.6913800191859673e-06, + "loss": 0.1821, + "step": 46785 + }, + { + "epoch": 3.0345632121540964, + "grad_norm": 2.6980814933776855, + "learning_rate": 3.691242976565712e-06, + "loss": 0.1178, + "step": 46786 + }, + { + "epoch": 3.0345767769940313, + "grad_norm": 3.7349565029144287, + "learning_rate": 3.6911059339454573e-06, + "loss": 0.0868, + "step": 46787 + }, + { + "epoch": 3.034590341833966, + "grad_norm": 3.135995626449585, + "learning_rate": 3.6909688913252024e-06, + "loss": 0.071, + "step": 46788 + }, + { + "epoch": 3.0346039066739015, + "grad_norm": 5.126657009124756, + "learning_rate": 3.6908318487049476e-06, + "loss": 0.1389, + "step": 46789 + }, + { + "epoch": 3.0346174715138363, + "grad_norm": 6.409442901611328, + "learning_rate": 3.6906948060846924e-06, + "loss": 0.1255, + "step": 46790 + }, + { + "epoch": 3.034631036353771, + "grad_norm": 3.4052183628082275, + "learning_rate": 3.690557763464438e-06, + "loss": 0.0635, + "step": 46791 + }, + { + "epoch": 3.034644601193706, + "grad_norm": 3.689626455307007, + "learning_rate": 3.6904207208441827e-06, + "loss": 0.0958, + "step": 46792 + }, + { + "epoch": 3.034658166033641, + "grad_norm": 4.006284713745117, + "learning_rate": 3.6902836782239283e-06, + "loss": 0.1402, + "step": 46793 + }, + { + "epoch": 3.034671730873576, + "grad_norm": 5.587167739868164, + "learning_rate": 3.690146635603673e-06, + "loss": 0.15, + "step": 46794 + }, + { + "epoch": 3.0346852957135106, + "grad_norm": 4.474039554595947, + "learning_rate": 3.690009592983418e-06, + "loss": 0.1363, + "step": 46795 + }, + { + "epoch": 3.0346988605534455, + "grad_norm": 3.6065595149993896, + "learning_rate": 3.6898725503631634e-06, + "loss": 0.1064, + "step": 46796 + }, + { + "epoch": 3.0347124253933804, + "grad_norm": 4.054751873016357, + "learning_rate": 3.689735507742908e-06, + "loss": 0.0866, + "step": 46797 + }, + { + "epoch": 3.0347259902333152, + "grad_norm": 5.633378028869629, + "learning_rate": 3.6895984651226534e-06, + "loss": 0.1168, + "step": 46798 + }, + { + "epoch": 3.03473955507325, + "grad_norm": 4.839487075805664, + "learning_rate": 3.6894614225023986e-06, + "loss": 0.12, + "step": 46799 + }, + { + "epoch": 3.034753119913185, + "grad_norm": 4.1469645500183105, + "learning_rate": 3.6893243798821437e-06, + "loss": 0.096, + "step": 46800 + }, + { + "epoch": 3.03476668475312, + "grad_norm": 3.1181812286376953, + "learning_rate": 3.6891873372618885e-06, + "loss": 0.1069, + "step": 46801 + }, + { + "epoch": 3.0347802495930547, + "grad_norm": 2.8688089847564697, + "learning_rate": 3.689050294641634e-06, + "loss": 0.0525, + "step": 46802 + }, + { + "epoch": 3.0347938144329896, + "grad_norm": 3.249499559402466, + "learning_rate": 3.688913252021379e-06, + "loss": 0.1364, + "step": 46803 + }, + { + "epoch": 3.0348073792729244, + "grad_norm": 3.6055257320404053, + "learning_rate": 3.688776209401124e-06, + "loss": 0.0714, + "step": 46804 + }, + { + "epoch": 3.0348209441128593, + "grad_norm": 2.7942581176757812, + "learning_rate": 3.688639166780869e-06, + "loss": 0.0652, + "step": 46805 + }, + { + "epoch": 3.034834508952794, + "grad_norm": 3.5341527462005615, + "learning_rate": 3.6885021241606144e-06, + "loss": 0.0805, + "step": 46806 + }, + { + "epoch": 3.034848073792729, + "grad_norm": 3.4731223583221436, + "learning_rate": 3.688365081540359e-06, + "loss": 0.1903, + "step": 46807 + }, + { + "epoch": 3.0348616386326643, + "grad_norm": 4.695954322814941, + "learning_rate": 3.6882280389201047e-06, + "loss": 0.149, + "step": 46808 + }, + { + "epoch": 3.034875203472599, + "grad_norm": 2.9625861644744873, + "learning_rate": 3.6880909962998495e-06, + "loss": 0.0975, + "step": 46809 + }, + { + "epoch": 3.034888768312534, + "grad_norm": 2.5876028537750244, + "learning_rate": 3.6879539536795943e-06, + "loss": 0.0534, + "step": 46810 + }, + { + "epoch": 3.034902333152469, + "grad_norm": 3.702465772628784, + "learning_rate": 3.68781691105934e-06, + "loss": 0.0808, + "step": 46811 + }, + { + "epoch": 3.034915897992404, + "grad_norm": 3.704602003097534, + "learning_rate": 3.6876798684390846e-06, + "loss": 0.1135, + "step": 46812 + }, + { + "epoch": 3.0349294628323387, + "grad_norm": 2.9074676036834717, + "learning_rate": 3.6875428258188302e-06, + "loss": 0.0743, + "step": 46813 + }, + { + "epoch": 3.0349430276722735, + "grad_norm": 4.774660110473633, + "learning_rate": 3.687405783198575e-06, + "loss": 0.0972, + "step": 46814 + }, + { + "epoch": 3.0349565925122084, + "grad_norm": 2.859806537628174, + "learning_rate": 3.68726874057832e-06, + "loss": 0.093, + "step": 46815 + }, + { + "epoch": 3.0349701573521433, + "grad_norm": 3.2006375789642334, + "learning_rate": 3.6871316979580653e-06, + "loss": 0.0694, + "step": 46816 + }, + { + "epoch": 3.034983722192078, + "grad_norm": 3.705867290496826, + "learning_rate": 3.6869946553378105e-06, + "loss": 0.1214, + "step": 46817 + }, + { + "epoch": 3.034997287032013, + "grad_norm": 2.8276164531707764, + "learning_rate": 3.6868576127175553e-06, + "loss": 0.0587, + "step": 46818 + }, + { + "epoch": 3.035010851871948, + "grad_norm": 5.738269805908203, + "learning_rate": 3.686720570097301e-06, + "loss": 0.0996, + "step": 46819 + }, + { + "epoch": 3.0350244167118827, + "grad_norm": 3.2014782428741455, + "learning_rate": 3.6865835274770456e-06, + "loss": 0.0804, + "step": 46820 + }, + { + "epoch": 3.0350379815518176, + "grad_norm": 3.7787861824035645, + "learning_rate": 3.6864464848567912e-06, + "loss": 0.1046, + "step": 46821 + }, + { + "epoch": 3.0350515463917525, + "grad_norm": 3.474919557571411, + "learning_rate": 3.686309442236536e-06, + "loss": 0.1115, + "step": 46822 + }, + { + "epoch": 3.0350651112316873, + "grad_norm": 4.583866119384766, + "learning_rate": 3.6861723996162807e-06, + "loss": 0.1306, + "step": 46823 + }, + { + "epoch": 3.035078676071622, + "grad_norm": 5.067930698394775, + "learning_rate": 3.686035356996026e-06, + "loss": 0.1007, + "step": 46824 + }, + { + "epoch": 3.035092240911557, + "grad_norm": 3.3445987701416016, + "learning_rate": 3.685898314375771e-06, + "loss": 0.0964, + "step": 46825 + }, + { + "epoch": 3.035105805751492, + "grad_norm": 3.2029471397399902, + "learning_rate": 3.6857612717555163e-06, + "loss": 0.075, + "step": 46826 + }, + { + "epoch": 3.0351193705914272, + "grad_norm": 5.20483922958374, + "learning_rate": 3.685624229135261e-06, + "loss": 0.124, + "step": 46827 + }, + { + "epoch": 3.035132935431362, + "grad_norm": 3.6032350063323975, + "learning_rate": 3.6854871865150066e-06, + "loss": 0.0799, + "step": 46828 + }, + { + "epoch": 3.035146500271297, + "grad_norm": 3.2508492469787598, + "learning_rate": 3.6853501438947514e-06, + "loss": 0.0706, + "step": 46829 + }, + { + "epoch": 3.035160065111232, + "grad_norm": 3.7078545093536377, + "learning_rate": 3.685213101274497e-06, + "loss": 0.0819, + "step": 46830 + }, + { + "epoch": 3.0351736299511667, + "grad_norm": 3.086171865463257, + "learning_rate": 3.6850760586542417e-06, + "loss": 0.0522, + "step": 46831 + }, + { + "epoch": 3.0351871947911016, + "grad_norm": 4.008054733276367, + "learning_rate": 3.684939016033987e-06, + "loss": 0.0911, + "step": 46832 + }, + { + "epoch": 3.0352007596310364, + "grad_norm": 3.791696310043335, + "learning_rate": 3.684801973413732e-06, + "loss": 0.1065, + "step": 46833 + }, + { + "epoch": 3.0352143244709713, + "grad_norm": 3.3001537322998047, + "learning_rate": 3.6846649307934773e-06, + "loss": 0.1117, + "step": 46834 + }, + { + "epoch": 3.035227889310906, + "grad_norm": 3.7793850898742676, + "learning_rate": 3.684527888173222e-06, + "loss": 0.0873, + "step": 46835 + }, + { + "epoch": 3.035241454150841, + "grad_norm": 3.9338629245758057, + "learning_rate": 3.684390845552967e-06, + "loss": 0.1642, + "step": 46836 + }, + { + "epoch": 3.035255018990776, + "grad_norm": 4.754836559295654, + "learning_rate": 3.6842538029327124e-06, + "loss": 0.1012, + "step": 46837 + }, + { + "epoch": 3.0352685838307107, + "grad_norm": 3.0405352115631104, + "learning_rate": 3.684116760312457e-06, + "loss": 0.0903, + "step": 46838 + }, + { + "epoch": 3.0352821486706456, + "grad_norm": 3.0662436485290527, + "learning_rate": 3.6839797176922028e-06, + "loss": 0.0692, + "step": 46839 + }, + { + "epoch": 3.0352957135105805, + "grad_norm": 3.452759265899658, + "learning_rate": 3.6838426750719475e-06, + "loss": 0.1109, + "step": 46840 + }, + { + "epoch": 3.0353092783505153, + "grad_norm": 3.6729166507720947, + "learning_rate": 3.6837056324516927e-06, + "loss": 0.0619, + "step": 46841 + }, + { + "epoch": 3.03532284319045, + "grad_norm": 5.060348033905029, + "learning_rate": 3.683568589831438e-06, + "loss": 0.1108, + "step": 46842 + }, + { + "epoch": 3.035336408030385, + "grad_norm": 3.8105320930480957, + "learning_rate": 3.683431547211183e-06, + "loss": 0.0793, + "step": 46843 + }, + { + "epoch": 3.03534997287032, + "grad_norm": 3.3540849685668945, + "learning_rate": 3.683294504590928e-06, + "loss": 0.077, + "step": 46844 + }, + { + "epoch": 3.0353635377102552, + "grad_norm": 3.5964653491973877, + "learning_rate": 3.6831574619706734e-06, + "loss": 0.0873, + "step": 46845 + }, + { + "epoch": 3.03537710255019, + "grad_norm": 3.9692299365997314, + "learning_rate": 3.683020419350418e-06, + "loss": 0.0959, + "step": 46846 + }, + { + "epoch": 3.035390667390125, + "grad_norm": 3.432290554046631, + "learning_rate": 3.6828833767301638e-06, + "loss": 0.1225, + "step": 46847 + }, + { + "epoch": 3.03540423223006, + "grad_norm": 3.8253555297851562, + "learning_rate": 3.6827463341099085e-06, + "loss": 0.1408, + "step": 46848 + }, + { + "epoch": 3.0354177970699947, + "grad_norm": 3.4820661544799805, + "learning_rate": 3.6826092914896537e-06, + "loss": 0.0729, + "step": 46849 + }, + { + "epoch": 3.0354313619099296, + "grad_norm": 5.65202522277832, + "learning_rate": 3.682472248869399e-06, + "loss": 0.1918, + "step": 46850 + }, + { + "epoch": 3.0354449267498644, + "grad_norm": 3.9255735874176025, + "learning_rate": 3.6823352062491436e-06, + "loss": 0.1089, + "step": 46851 + }, + { + "epoch": 3.0354584915897993, + "grad_norm": 4.266434192657471, + "learning_rate": 3.682198163628889e-06, + "loss": 0.1393, + "step": 46852 + }, + { + "epoch": 3.035472056429734, + "grad_norm": 3.24515700340271, + "learning_rate": 3.682061121008634e-06, + "loss": 0.0649, + "step": 46853 + }, + { + "epoch": 3.035485621269669, + "grad_norm": 2.840331554412842, + "learning_rate": 3.681924078388379e-06, + "loss": 0.1049, + "step": 46854 + }, + { + "epoch": 3.035499186109604, + "grad_norm": 4.2777862548828125, + "learning_rate": 3.681787035768124e-06, + "loss": 0.1301, + "step": 46855 + }, + { + "epoch": 3.0355127509495388, + "grad_norm": 3.3818318843841553, + "learning_rate": 3.6816499931478695e-06, + "loss": 0.1046, + "step": 46856 + }, + { + "epoch": 3.0355263157894736, + "grad_norm": 3.8615376949310303, + "learning_rate": 3.6815129505276143e-06, + "loss": 0.0733, + "step": 46857 + }, + { + "epoch": 3.0355398806294085, + "grad_norm": 3.677788257598877, + "learning_rate": 3.6813759079073595e-06, + "loss": 0.0639, + "step": 46858 + }, + { + "epoch": 3.0355534454693434, + "grad_norm": 4.52540397644043, + "learning_rate": 3.6812388652871046e-06, + "loss": 0.1631, + "step": 46859 + }, + { + "epoch": 3.0355670103092782, + "grad_norm": 4.274776935577393, + "learning_rate": 3.68110182266685e-06, + "loss": 0.0986, + "step": 46860 + }, + { + "epoch": 3.035580575149213, + "grad_norm": 4.2111310958862305, + "learning_rate": 3.6809647800465946e-06, + "loss": 0.1015, + "step": 46861 + }, + { + "epoch": 3.035594139989148, + "grad_norm": 4.5812225341796875, + "learning_rate": 3.68082773742634e-06, + "loss": 0.0904, + "step": 46862 + }, + { + "epoch": 3.035607704829083, + "grad_norm": 4.721538066864014, + "learning_rate": 3.680690694806085e-06, + "loss": 0.1184, + "step": 46863 + }, + { + "epoch": 3.0356212696690177, + "grad_norm": 3.1243410110473633, + "learning_rate": 3.6805536521858297e-06, + "loss": 0.0941, + "step": 46864 + }, + { + "epoch": 3.035634834508953, + "grad_norm": 6.0200324058532715, + "learning_rate": 3.6804166095655753e-06, + "loss": 0.2126, + "step": 46865 + }, + { + "epoch": 3.035648399348888, + "grad_norm": 3.1957285404205322, + "learning_rate": 3.68027956694532e-06, + "loss": 0.0951, + "step": 46866 + }, + { + "epoch": 3.0356619641888227, + "grad_norm": 4.3074822425842285, + "learning_rate": 3.6801425243250656e-06, + "loss": 0.107, + "step": 46867 + }, + { + "epoch": 3.0356755290287576, + "grad_norm": 2.8424594402313232, + "learning_rate": 3.6800054817048104e-06, + "loss": 0.0424, + "step": 46868 + }, + { + "epoch": 3.0356890938686925, + "grad_norm": 3.571855068206787, + "learning_rate": 3.6798684390845556e-06, + "loss": 0.1033, + "step": 46869 + }, + { + "epoch": 3.0357026587086273, + "grad_norm": 3.3663268089294434, + "learning_rate": 3.6797313964643008e-06, + "loss": 0.0549, + "step": 46870 + }, + { + "epoch": 3.035716223548562, + "grad_norm": 3.9142985343933105, + "learning_rate": 3.679594353844046e-06, + "loss": 0.1592, + "step": 46871 + }, + { + "epoch": 3.035729788388497, + "grad_norm": 3.518122434616089, + "learning_rate": 3.6794573112237907e-06, + "loss": 0.0644, + "step": 46872 + }, + { + "epoch": 3.035743353228432, + "grad_norm": 3.5380489826202393, + "learning_rate": 3.6793202686035363e-06, + "loss": 0.0717, + "step": 46873 + }, + { + "epoch": 3.035756918068367, + "grad_norm": 3.80595064163208, + "learning_rate": 3.679183225983281e-06, + "loss": 0.0969, + "step": 46874 + }, + { + "epoch": 3.0357704829083016, + "grad_norm": 4.2148613929748535, + "learning_rate": 3.6790461833630262e-06, + "loss": 0.156, + "step": 46875 + }, + { + "epoch": 3.0357840477482365, + "grad_norm": 3.8529977798461914, + "learning_rate": 3.6789091407427714e-06, + "loss": 0.0431, + "step": 46876 + }, + { + "epoch": 3.0357976125881714, + "grad_norm": 3.8740718364715576, + "learning_rate": 3.6787720981225166e-06, + "loss": 0.0576, + "step": 46877 + }, + { + "epoch": 3.0358111774281062, + "grad_norm": 4.597614288330078, + "learning_rate": 3.6786350555022613e-06, + "loss": 0.1577, + "step": 46878 + }, + { + "epoch": 3.035824742268041, + "grad_norm": 3.1917946338653564, + "learning_rate": 3.6784980128820065e-06, + "loss": 0.1148, + "step": 46879 + }, + { + "epoch": 3.035838307107976, + "grad_norm": 3.5117173194885254, + "learning_rate": 3.6783609702617517e-06, + "loss": 0.0835, + "step": 46880 + }, + { + "epoch": 3.035851871947911, + "grad_norm": 5.626471996307373, + "learning_rate": 3.6782239276414965e-06, + "loss": 0.1166, + "step": 46881 + }, + { + "epoch": 3.0358654367878457, + "grad_norm": 4.934053897857666, + "learning_rate": 3.678086885021242e-06, + "loss": 0.1337, + "step": 46882 + }, + { + "epoch": 3.035879001627781, + "grad_norm": 3.8310961723327637, + "learning_rate": 3.677949842400987e-06, + "loss": 0.1224, + "step": 46883 + }, + { + "epoch": 3.035892566467716, + "grad_norm": 3.6391777992248535, + "learning_rate": 3.6778127997807324e-06, + "loss": 0.0684, + "step": 46884 + }, + { + "epoch": 3.0359061313076507, + "grad_norm": 6.368396759033203, + "learning_rate": 3.677675757160477e-06, + "loss": 0.1326, + "step": 46885 + }, + { + "epoch": 3.0359196961475856, + "grad_norm": 4.098686695098877, + "learning_rate": 3.6775387145402223e-06, + "loss": 0.0937, + "step": 46886 + }, + { + "epoch": 3.0359332609875205, + "grad_norm": 3.636878728866577, + "learning_rate": 3.6774016719199675e-06, + "loss": 0.0999, + "step": 46887 + }, + { + "epoch": 3.0359468258274553, + "grad_norm": 4.323838710784912, + "learning_rate": 3.6772646292997127e-06, + "loss": 0.0967, + "step": 46888 + }, + { + "epoch": 3.03596039066739, + "grad_norm": 3.1357030868530273, + "learning_rate": 3.6771275866794575e-06, + "loss": 0.0638, + "step": 46889 + }, + { + "epoch": 3.035973955507325, + "grad_norm": 4.704340934753418, + "learning_rate": 3.676990544059203e-06, + "loss": 0.1017, + "step": 46890 + }, + { + "epoch": 3.03598752034726, + "grad_norm": 4.195258617401123, + "learning_rate": 3.676853501438948e-06, + "loss": 0.122, + "step": 46891 + }, + { + "epoch": 3.036001085187195, + "grad_norm": 3.637923240661621, + "learning_rate": 3.6767164588186926e-06, + "loss": 0.0521, + "step": 46892 + }, + { + "epoch": 3.0360146500271297, + "grad_norm": 3.837226390838623, + "learning_rate": 3.676579416198438e-06, + "loss": 0.1248, + "step": 46893 + }, + { + "epoch": 3.0360282148670645, + "grad_norm": 3.5287153720855713, + "learning_rate": 3.676442373578183e-06, + "loss": 0.0888, + "step": 46894 + }, + { + "epoch": 3.0360417797069994, + "grad_norm": 4.377951145172119, + "learning_rate": 3.676305330957928e-06, + "loss": 0.0641, + "step": 46895 + }, + { + "epoch": 3.0360553445469343, + "grad_norm": 4.464954853057861, + "learning_rate": 3.6761682883376733e-06, + "loss": 0.0825, + "step": 46896 + }, + { + "epoch": 3.036068909386869, + "grad_norm": 7.252691268920898, + "learning_rate": 3.6760312457174185e-06, + "loss": 0.1967, + "step": 46897 + }, + { + "epoch": 3.036082474226804, + "grad_norm": 3.5833382606506348, + "learning_rate": 3.6758942030971632e-06, + "loss": 0.0823, + "step": 46898 + }, + { + "epoch": 3.036096039066739, + "grad_norm": 5.7988080978393555, + "learning_rate": 3.675757160476909e-06, + "loss": 0.1458, + "step": 46899 + }, + { + "epoch": 3.0361096039066737, + "grad_norm": 3.5324366092681885, + "learning_rate": 3.6756201178566536e-06, + "loss": 0.0951, + "step": 46900 + }, + { + "epoch": 3.0361231687466086, + "grad_norm": 5.424172401428223, + "learning_rate": 3.675483075236399e-06, + "loss": 0.1636, + "step": 46901 + }, + { + "epoch": 3.0361367335865435, + "grad_norm": 3.9631407260894775, + "learning_rate": 3.675346032616144e-06, + "loss": 0.0727, + "step": 46902 + }, + { + "epoch": 3.0361502984264788, + "grad_norm": 4.000006675720215, + "learning_rate": 3.675208989995889e-06, + "loss": 0.0862, + "step": 46903 + }, + { + "epoch": 3.0361638632664136, + "grad_norm": 4.746936321258545, + "learning_rate": 3.6750719473756343e-06, + "loss": 0.0977, + "step": 46904 + }, + { + "epoch": 3.0361774281063485, + "grad_norm": 4.219296455383301, + "learning_rate": 3.674934904755379e-06, + "loss": 0.0675, + "step": 46905 + }, + { + "epoch": 3.0361909929462834, + "grad_norm": 4.693912029266357, + "learning_rate": 3.6747978621351242e-06, + "loss": 0.1206, + "step": 46906 + }, + { + "epoch": 3.0362045577862182, + "grad_norm": 3.9764747619628906, + "learning_rate": 3.674660819514869e-06, + "loss": 0.1092, + "step": 46907 + }, + { + "epoch": 3.036218122626153, + "grad_norm": 5.7757744789123535, + "learning_rate": 3.6745237768946146e-06, + "loss": 0.1524, + "step": 46908 + }, + { + "epoch": 3.036231687466088, + "grad_norm": 5.129180431365967, + "learning_rate": 3.6743867342743593e-06, + "loss": 0.1172, + "step": 46909 + }, + { + "epoch": 3.036245252306023, + "grad_norm": 4.2680134773254395, + "learning_rate": 3.674249691654105e-06, + "loss": 0.0859, + "step": 46910 + }, + { + "epoch": 3.0362588171459577, + "grad_norm": 6.140940189361572, + "learning_rate": 3.6741126490338497e-06, + "loss": 0.1349, + "step": 46911 + }, + { + "epoch": 3.0362723819858926, + "grad_norm": 4.821258544921875, + "learning_rate": 3.673975606413595e-06, + "loss": 0.106, + "step": 46912 + }, + { + "epoch": 3.0362859468258274, + "grad_norm": 4.526162624359131, + "learning_rate": 3.67383856379334e-06, + "loss": 0.1105, + "step": 46913 + }, + { + "epoch": 3.0362995116657623, + "grad_norm": 4.6617326736450195, + "learning_rate": 3.6737015211730852e-06, + "loss": 0.1161, + "step": 46914 + }, + { + "epoch": 3.036313076505697, + "grad_norm": 4.793005466461182, + "learning_rate": 3.67356447855283e-06, + "loss": 0.1416, + "step": 46915 + }, + { + "epoch": 3.036326641345632, + "grad_norm": 2.663635492324829, + "learning_rate": 3.6734274359325756e-06, + "loss": 0.0377, + "step": 46916 + }, + { + "epoch": 3.036340206185567, + "grad_norm": 5.830387115478516, + "learning_rate": 3.6732903933123204e-06, + "loss": 0.1208, + "step": 46917 + }, + { + "epoch": 3.0363537710255017, + "grad_norm": 3.3978168964385986, + "learning_rate": 3.673153350692066e-06, + "loss": 0.1052, + "step": 46918 + }, + { + "epoch": 3.0363673358654366, + "grad_norm": 6.3078742027282715, + "learning_rate": 3.6730163080718107e-06, + "loss": 0.1945, + "step": 46919 + }, + { + "epoch": 3.0363809007053715, + "grad_norm": 5.404008865356445, + "learning_rate": 3.6728792654515555e-06, + "loss": 0.179, + "step": 46920 + }, + { + "epoch": 3.036394465545307, + "grad_norm": 4.645127296447754, + "learning_rate": 3.672742222831301e-06, + "loss": 0.1036, + "step": 46921 + }, + { + "epoch": 3.0364080303852417, + "grad_norm": 4.610705375671387, + "learning_rate": 3.672605180211046e-06, + "loss": 0.1047, + "step": 46922 + }, + { + "epoch": 3.0364215952251765, + "grad_norm": 3.8476622104644775, + "learning_rate": 3.672468137590791e-06, + "loss": 0.0898, + "step": 46923 + }, + { + "epoch": 3.0364351600651114, + "grad_norm": 3.699873208999634, + "learning_rate": 3.672331094970536e-06, + "loss": 0.0658, + "step": 46924 + }, + { + "epoch": 3.0364487249050462, + "grad_norm": 3.8150744438171387, + "learning_rate": 3.6721940523502814e-06, + "loss": 0.1131, + "step": 46925 + }, + { + "epoch": 3.036462289744981, + "grad_norm": 7.202103137969971, + "learning_rate": 3.672057009730026e-06, + "loss": 0.1687, + "step": 46926 + }, + { + "epoch": 3.036475854584916, + "grad_norm": 5.315985202789307, + "learning_rate": 3.6719199671097717e-06, + "loss": 0.1712, + "step": 46927 + }, + { + "epoch": 3.036489419424851, + "grad_norm": 4.838340759277344, + "learning_rate": 3.6717829244895165e-06, + "loss": 0.1446, + "step": 46928 + }, + { + "epoch": 3.0365029842647857, + "grad_norm": 4.211932182312012, + "learning_rate": 3.6716458818692617e-06, + "loss": 0.1295, + "step": 46929 + }, + { + "epoch": 3.0365165491047206, + "grad_norm": 4.02739953994751, + "learning_rate": 3.671508839249007e-06, + "loss": 0.1514, + "step": 46930 + }, + { + "epoch": 3.0365301139446554, + "grad_norm": 3.598235607147217, + "learning_rate": 3.671371796628752e-06, + "loss": 0.1241, + "step": 46931 + }, + { + "epoch": 3.0365436787845903, + "grad_norm": 4.255699157714844, + "learning_rate": 3.6712347540084968e-06, + "loss": 0.086, + "step": 46932 + }, + { + "epoch": 3.036557243624525, + "grad_norm": 3.433551788330078, + "learning_rate": 3.671097711388242e-06, + "loss": 0.0492, + "step": 46933 + }, + { + "epoch": 3.03657080846446, + "grad_norm": 5.30449104309082, + "learning_rate": 3.670960668767987e-06, + "loss": 0.1184, + "step": 46934 + }, + { + "epoch": 3.036584373304395, + "grad_norm": 3.3706753253936768, + "learning_rate": 3.670823626147732e-06, + "loss": 0.1434, + "step": 46935 + }, + { + "epoch": 3.0365979381443298, + "grad_norm": 3.7072854042053223, + "learning_rate": 3.6706865835274775e-06, + "loss": 0.0872, + "step": 46936 + }, + { + "epoch": 3.0366115029842646, + "grad_norm": 4.591649532318115, + "learning_rate": 3.6705495409072222e-06, + "loss": 0.1175, + "step": 46937 + }, + { + "epoch": 3.0366250678241995, + "grad_norm": 4.121829986572266, + "learning_rate": 3.670412498286968e-06, + "loss": 0.1222, + "step": 46938 + }, + { + "epoch": 3.0366386326641344, + "grad_norm": 4.810267925262451, + "learning_rate": 3.6702754556667126e-06, + "loss": 0.1043, + "step": 46939 + }, + { + "epoch": 3.0366521975040692, + "grad_norm": 3.271061658859253, + "learning_rate": 3.6701384130464578e-06, + "loss": 0.1132, + "step": 46940 + }, + { + "epoch": 3.0366657623440045, + "grad_norm": 3.0496654510498047, + "learning_rate": 3.670001370426203e-06, + "loss": 0.1071, + "step": 46941 + }, + { + "epoch": 3.0366793271839394, + "grad_norm": 5.648352146148682, + "learning_rate": 3.669864327805948e-06, + "loss": 0.1231, + "step": 46942 + }, + { + "epoch": 3.0366928920238743, + "grad_norm": 3.9806156158447266, + "learning_rate": 3.669727285185693e-06, + "loss": 0.0709, + "step": 46943 + }, + { + "epoch": 3.036706456863809, + "grad_norm": 4.942787170410156, + "learning_rate": 3.6695902425654385e-06, + "loss": 0.1382, + "step": 46944 + }, + { + "epoch": 3.036720021703744, + "grad_norm": 5.098390102386475, + "learning_rate": 3.6694531999451832e-06, + "loss": 0.1156, + "step": 46945 + }, + { + "epoch": 3.036733586543679, + "grad_norm": 5.188338756561279, + "learning_rate": 3.6693161573249284e-06, + "loss": 0.1124, + "step": 46946 + }, + { + "epoch": 3.0367471513836137, + "grad_norm": 3.7598283290863037, + "learning_rate": 3.6691791147046736e-06, + "loss": 0.1543, + "step": 46947 + }, + { + "epoch": 3.0367607162235486, + "grad_norm": 3.8449959754943848, + "learning_rate": 3.6690420720844184e-06, + "loss": 0.1561, + "step": 46948 + }, + { + "epoch": 3.0367742810634835, + "grad_norm": 4.055009365081787, + "learning_rate": 3.6689050294641635e-06, + "loss": 0.1274, + "step": 46949 + }, + { + "epoch": 3.0367878459034183, + "grad_norm": 4.4198737144470215, + "learning_rate": 3.6687679868439087e-06, + "loss": 0.1134, + "step": 46950 + }, + { + "epoch": 3.036801410743353, + "grad_norm": 2.8907437324523926, + "learning_rate": 3.668630944223654e-06, + "loss": 0.075, + "step": 46951 + }, + { + "epoch": 3.036814975583288, + "grad_norm": 5.562811851501465, + "learning_rate": 3.6684939016033986e-06, + "loss": 0.1216, + "step": 46952 + }, + { + "epoch": 3.036828540423223, + "grad_norm": 5.106243133544922, + "learning_rate": 3.6683568589831443e-06, + "loss": 0.0818, + "step": 46953 + }, + { + "epoch": 3.036842105263158, + "grad_norm": 3.9184353351593018, + "learning_rate": 3.668219816362889e-06, + "loss": 0.1157, + "step": 46954 + }, + { + "epoch": 3.0368556701030927, + "grad_norm": 4.594974517822266, + "learning_rate": 3.6680827737426346e-06, + "loss": 0.1538, + "step": 46955 + }, + { + "epoch": 3.0368692349430275, + "grad_norm": 2.74088454246521, + "learning_rate": 3.6679457311223794e-06, + "loss": 0.0505, + "step": 46956 + }, + { + "epoch": 3.0368827997829624, + "grad_norm": 3.796166181564331, + "learning_rate": 3.6678086885021245e-06, + "loss": 0.128, + "step": 46957 + }, + { + "epoch": 3.0368963646228972, + "grad_norm": 3.527268886566162, + "learning_rate": 3.6676716458818697e-06, + "loss": 0.0794, + "step": 46958 + }, + { + "epoch": 3.0369099294628326, + "grad_norm": 3.3700685501098633, + "learning_rate": 3.667534603261615e-06, + "loss": 0.0892, + "step": 46959 + }, + { + "epoch": 3.0369234943027674, + "grad_norm": 3.9199817180633545, + "learning_rate": 3.6673975606413597e-06, + "loss": 0.1054, + "step": 46960 + }, + { + "epoch": 3.0369370591427023, + "grad_norm": 3.1368236541748047, + "learning_rate": 3.6672605180211044e-06, + "loss": 0.0712, + "step": 46961 + }, + { + "epoch": 3.036950623982637, + "grad_norm": 3.6890478134155273, + "learning_rate": 3.66712347540085e-06, + "loss": 0.1539, + "step": 46962 + }, + { + "epoch": 3.036964188822572, + "grad_norm": 2.854846239089966, + "learning_rate": 3.6669864327805948e-06, + "loss": 0.0481, + "step": 46963 + }, + { + "epoch": 3.036977753662507, + "grad_norm": 3.9651637077331543, + "learning_rate": 3.6668493901603404e-06, + "loss": 0.0997, + "step": 46964 + }, + { + "epoch": 3.0369913185024418, + "grad_norm": 4.795098781585693, + "learning_rate": 3.666712347540085e-06, + "loss": 0.2057, + "step": 46965 + }, + { + "epoch": 3.0370048833423766, + "grad_norm": 3.3574843406677246, + "learning_rate": 3.6665753049198303e-06, + "loss": 0.0821, + "step": 46966 + }, + { + "epoch": 3.0370184481823115, + "grad_norm": 3.8093385696411133, + "learning_rate": 3.6664382622995755e-06, + "loss": 0.1481, + "step": 46967 + }, + { + "epoch": 3.0370320130222463, + "grad_norm": 3.528744697570801, + "learning_rate": 3.6663012196793207e-06, + "loss": 0.1261, + "step": 46968 + }, + { + "epoch": 3.037045577862181, + "grad_norm": 4.116515159606934, + "learning_rate": 3.6661641770590654e-06, + "loss": 0.1044, + "step": 46969 + }, + { + "epoch": 3.037059142702116, + "grad_norm": 3.7466208934783936, + "learning_rate": 3.666027134438811e-06, + "loss": 0.0898, + "step": 46970 + }, + { + "epoch": 3.037072707542051, + "grad_norm": 5.192962169647217, + "learning_rate": 3.6658900918185558e-06, + "loss": 0.099, + "step": 46971 + }, + { + "epoch": 3.037086272381986, + "grad_norm": 3.706897258758545, + "learning_rate": 3.6657530491983014e-06, + "loss": 0.1277, + "step": 46972 + }, + { + "epoch": 3.0370998372219207, + "grad_norm": 3.1852049827575684, + "learning_rate": 3.665616006578046e-06, + "loss": 0.0517, + "step": 46973 + }, + { + "epoch": 3.0371134020618555, + "grad_norm": 2.7440431118011475, + "learning_rate": 3.665478963957791e-06, + "loss": 0.0539, + "step": 46974 + }, + { + "epoch": 3.0371269669017904, + "grad_norm": 3.6346218585968018, + "learning_rate": 3.6653419213375365e-06, + "loss": 0.0719, + "step": 46975 + }, + { + "epoch": 3.0371405317417253, + "grad_norm": 3.7270848751068115, + "learning_rate": 3.6652048787172812e-06, + "loss": 0.0916, + "step": 46976 + }, + { + "epoch": 3.03715409658166, + "grad_norm": 3.3961567878723145, + "learning_rate": 3.6650678360970264e-06, + "loss": 0.047, + "step": 46977 + }, + { + "epoch": 3.0371676614215954, + "grad_norm": 5.199550151824951, + "learning_rate": 3.664930793476771e-06, + "loss": 0.1555, + "step": 46978 + }, + { + "epoch": 3.0371812262615303, + "grad_norm": 3.7411534786224365, + "learning_rate": 3.6647937508565168e-06, + "loss": 0.0654, + "step": 46979 + }, + { + "epoch": 3.037194791101465, + "grad_norm": 4.090524673461914, + "learning_rate": 3.6646567082362615e-06, + "loss": 0.0956, + "step": 46980 + }, + { + "epoch": 3.0372083559414, + "grad_norm": 7.223310470581055, + "learning_rate": 3.664519665616007e-06, + "loss": 0.1513, + "step": 46981 + }, + { + "epoch": 3.037221920781335, + "grad_norm": 3.2996368408203125, + "learning_rate": 3.664382622995752e-06, + "loss": 0.073, + "step": 46982 + }, + { + "epoch": 3.0372354856212698, + "grad_norm": 3.948073625564575, + "learning_rate": 3.664245580375497e-06, + "loss": 0.0896, + "step": 46983 + }, + { + "epoch": 3.0372490504612046, + "grad_norm": 2.4992551803588867, + "learning_rate": 3.6641085377552423e-06, + "loss": 0.0281, + "step": 46984 + }, + { + "epoch": 3.0372626153011395, + "grad_norm": 4.91109037399292, + "learning_rate": 3.6639714951349874e-06, + "loss": 0.085, + "step": 46985 + }, + { + "epoch": 3.0372761801410744, + "grad_norm": 2.340872287750244, + "learning_rate": 3.663834452514732e-06, + "loss": 0.0235, + "step": 46986 + }, + { + "epoch": 3.0372897449810092, + "grad_norm": 3.4548208713531494, + "learning_rate": 3.663697409894478e-06, + "loss": 0.068, + "step": 46987 + }, + { + "epoch": 3.037303309820944, + "grad_norm": 3.350775957107544, + "learning_rate": 3.6635603672742225e-06, + "loss": 0.0569, + "step": 46988 + }, + { + "epoch": 3.037316874660879, + "grad_norm": 4.047455310821533, + "learning_rate": 3.6634233246539673e-06, + "loss": 0.1041, + "step": 46989 + }, + { + "epoch": 3.037330439500814, + "grad_norm": 2.5921871662139893, + "learning_rate": 3.663286282033713e-06, + "loss": 0.0554, + "step": 46990 + }, + { + "epoch": 3.0373440043407487, + "grad_norm": 3.137537717819214, + "learning_rate": 3.6631492394134577e-06, + "loss": 0.0549, + "step": 46991 + }, + { + "epoch": 3.0373575691806836, + "grad_norm": 2.974724769592285, + "learning_rate": 3.6630121967932033e-06, + "loss": 0.0654, + "step": 46992 + }, + { + "epoch": 3.0373711340206184, + "grad_norm": 3.205249786376953, + "learning_rate": 3.662875154172948e-06, + "loss": 0.0909, + "step": 46993 + }, + { + "epoch": 3.0373846988605533, + "grad_norm": 3.153738260269165, + "learning_rate": 3.662738111552693e-06, + "loss": 0.0839, + "step": 46994 + }, + { + "epoch": 3.037398263700488, + "grad_norm": 4.594786643981934, + "learning_rate": 3.662601068932438e-06, + "loss": 0.0746, + "step": 46995 + }, + { + "epoch": 3.037411828540423, + "grad_norm": 3.264810800552368, + "learning_rate": 3.6624640263121836e-06, + "loss": 0.0928, + "step": 46996 + }, + { + "epoch": 3.0374253933803583, + "grad_norm": 4.009794235229492, + "learning_rate": 3.6623269836919283e-06, + "loss": 0.1427, + "step": 46997 + }, + { + "epoch": 3.037438958220293, + "grad_norm": 3.787893772125244, + "learning_rate": 3.662189941071674e-06, + "loss": 0.0723, + "step": 46998 + }, + { + "epoch": 3.037452523060228, + "grad_norm": 3.617555618286133, + "learning_rate": 3.6620528984514187e-06, + "loss": 0.0873, + "step": 46999 + }, + { + "epoch": 3.037466087900163, + "grad_norm": 7.591612815856934, + "learning_rate": 3.661915855831164e-06, + "loss": 0.1988, + "step": 47000 + }, + { + "epoch": 3.037479652740098, + "grad_norm": 3.311379909515381, + "learning_rate": 3.661778813210909e-06, + "loss": 0.0465, + "step": 47001 + }, + { + "epoch": 3.0374932175800327, + "grad_norm": 3.5841777324676514, + "learning_rate": 3.6616417705906538e-06, + "loss": 0.0829, + "step": 47002 + }, + { + "epoch": 3.0375067824199675, + "grad_norm": 3.2150681018829346, + "learning_rate": 3.661504727970399e-06, + "loss": 0.0908, + "step": 47003 + }, + { + "epoch": 3.0375203472599024, + "grad_norm": 3.6196024417877197, + "learning_rate": 3.661367685350144e-06, + "loss": 0.1284, + "step": 47004 + }, + { + "epoch": 3.0375339120998373, + "grad_norm": 5.585673809051514, + "learning_rate": 3.6612306427298893e-06, + "loss": 0.1196, + "step": 47005 + }, + { + "epoch": 3.037547476939772, + "grad_norm": 3.1560075283050537, + "learning_rate": 3.661093600109634e-06, + "loss": 0.0844, + "step": 47006 + }, + { + "epoch": 3.037561041779707, + "grad_norm": 3.032548189163208, + "learning_rate": 3.6609565574893797e-06, + "loss": 0.054, + "step": 47007 + }, + { + "epoch": 3.037574606619642, + "grad_norm": 4.380399703979492, + "learning_rate": 3.6608195148691244e-06, + "loss": 0.1102, + "step": 47008 + }, + { + "epoch": 3.0375881714595767, + "grad_norm": 3.7839648723602295, + "learning_rate": 3.66068247224887e-06, + "loss": 0.1008, + "step": 47009 + }, + { + "epoch": 3.0376017362995116, + "grad_norm": 2.5798540115356445, + "learning_rate": 3.660545429628615e-06, + "loss": 0.0729, + "step": 47010 + }, + { + "epoch": 3.0376153011394464, + "grad_norm": 3.641021490097046, + "learning_rate": 3.66040838700836e-06, + "loss": 0.1157, + "step": 47011 + }, + { + "epoch": 3.0376288659793813, + "grad_norm": 4.469407081604004, + "learning_rate": 3.660271344388105e-06, + "loss": 0.126, + "step": 47012 + }, + { + "epoch": 3.037642430819316, + "grad_norm": 5.128591537475586, + "learning_rate": 3.6601343017678503e-06, + "loss": 0.0698, + "step": 47013 + }, + { + "epoch": 3.037655995659251, + "grad_norm": 4.550278186798096, + "learning_rate": 3.659997259147595e-06, + "loss": 0.1052, + "step": 47014 + }, + { + "epoch": 3.037669560499186, + "grad_norm": 2.9318299293518066, + "learning_rate": 3.65986021652734e-06, + "loss": 0.0594, + "step": 47015 + }, + { + "epoch": 3.037683125339121, + "grad_norm": 2.195974349975586, + "learning_rate": 3.6597231739070854e-06, + "loss": 0.038, + "step": 47016 + }, + { + "epoch": 3.037696690179056, + "grad_norm": 2.5238475799560547, + "learning_rate": 3.65958613128683e-06, + "loss": 0.0479, + "step": 47017 + }, + { + "epoch": 3.037710255018991, + "grad_norm": 3.6241533756256104, + "learning_rate": 3.659449088666576e-06, + "loss": 0.1004, + "step": 47018 + }, + { + "epoch": 3.037723819858926, + "grad_norm": 5.714498043060303, + "learning_rate": 3.6593120460463206e-06, + "loss": 0.1068, + "step": 47019 + }, + { + "epoch": 3.0377373846988607, + "grad_norm": 3.6811959743499756, + "learning_rate": 3.6591750034260657e-06, + "loss": 0.074, + "step": 47020 + }, + { + "epoch": 3.0377509495387955, + "grad_norm": 4.313022613525391, + "learning_rate": 3.659037960805811e-06, + "loss": 0.1108, + "step": 47021 + }, + { + "epoch": 3.0377645143787304, + "grad_norm": 3.3118984699249268, + "learning_rate": 3.658900918185556e-06, + "loss": 0.0631, + "step": 47022 + }, + { + "epoch": 3.0377780792186653, + "grad_norm": 3.5012621879577637, + "learning_rate": 3.658763875565301e-06, + "loss": 0.08, + "step": 47023 + }, + { + "epoch": 3.0377916440586, + "grad_norm": 3.1692514419555664, + "learning_rate": 3.6586268329450464e-06, + "loss": 0.0475, + "step": 47024 + }, + { + "epoch": 3.037805208898535, + "grad_norm": 3.63789963722229, + "learning_rate": 3.658489790324791e-06, + "loss": 0.0847, + "step": 47025 + }, + { + "epoch": 3.03781877373847, + "grad_norm": 4.532825946807861, + "learning_rate": 3.658352747704537e-06, + "loss": 0.1411, + "step": 47026 + }, + { + "epoch": 3.0378323385784047, + "grad_norm": 4.345278739929199, + "learning_rate": 3.6582157050842816e-06, + "loss": 0.0655, + "step": 47027 + }, + { + "epoch": 3.0378459034183396, + "grad_norm": 3.9374027252197266, + "learning_rate": 3.6580786624640267e-06, + "loss": 0.1036, + "step": 47028 + }, + { + "epoch": 3.0378594682582745, + "grad_norm": 4.550221920013428, + "learning_rate": 3.657941619843772e-06, + "loss": 0.0922, + "step": 47029 + }, + { + "epoch": 3.0378730330982093, + "grad_norm": 9.875078201293945, + "learning_rate": 3.6578045772235167e-06, + "loss": 0.2161, + "step": 47030 + }, + { + "epoch": 3.037886597938144, + "grad_norm": 4.733443260192871, + "learning_rate": 3.657667534603262e-06, + "loss": 0.0887, + "step": 47031 + }, + { + "epoch": 3.037900162778079, + "grad_norm": 4.249114513397217, + "learning_rate": 3.6575304919830066e-06, + "loss": 0.0425, + "step": 47032 + }, + { + "epoch": 3.037913727618014, + "grad_norm": 3.1609458923339844, + "learning_rate": 3.657393449362752e-06, + "loss": 0.0736, + "step": 47033 + }, + { + "epoch": 3.037927292457949, + "grad_norm": 4.132100582122803, + "learning_rate": 3.657256406742497e-06, + "loss": 0.1368, + "step": 47034 + }, + { + "epoch": 3.037940857297884, + "grad_norm": 3.1390132904052734, + "learning_rate": 3.6571193641222426e-06, + "loss": 0.0725, + "step": 47035 + }, + { + "epoch": 3.037954422137819, + "grad_norm": 3.0593371391296387, + "learning_rate": 3.6569823215019873e-06, + "loss": 0.049, + "step": 47036 + }, + { + "epoch": 3.037967986977754, + "grad_norm": 3.0284976959228516, + "learning_rate": 3.6568452788817325e-06, + "loss": 0.0713, + "step": 47037 + }, + { + "epoch": 3.0379815518176887, + "grad_norm": 3.2673091888427734, + "learning_rate": 3.6567082362614777e-06, + "loss": 0.0825, + "step": 47038 + }, + { + "epoch": 3.0379951166576236, + "grad_norm": 3.9507555961608887, + "learning_rate": 3.656571193641223e-06, + "loss": 0.0796, + "step": 47039 + }, + { + "epoch": 3.0380086814975584, + "grad_norm": 2.89300799369812, + "learning_rate": 3.6564341510209676e-06, + "loss": 0.0666, + "step": 47040 + }, + { + "epoch": 3.0380222463374933, + "grad_norm": 2.2509517669677734, + "learning_rate": 3.6562971084007132e-06, + "loss": 0.0587, + "step": 47041 + }, + { + "epoch": 3.038035811177428, + "grad_norm": 5.275765419006348, + "learning_rate": 3.656160065780458e-06, + "loss": 0.1155, + "step": 47042 + }, + { + "epoch": 3.038049376017363, + "grad_norm": 2.956498146057129, + "learning_rate": 3.6560230231602027e-06, + "loss": 0.065, + "step": 47043 + }, + { + "epoch": 3.038062940857298, + "grad_norm": 3.673337459564209, + "learning_rate": 3.6558859805399483e-06, + "loss": 0.0585, + "step": 47044 + }, + { + "epoch": 3.0380765056972328, + "grad_norm": 3.398289203643799, + "learning_rate": 3.655748937919693e-06, + "loss": 0.053, + "step": 47045 + }, + { + "epoch": 3.0380900705371676, + "grad_norm": 3.8646812438964844, + "learning_rate": 3.6556118952994387e-06, + "loss": 0.0628, + "step": 47046 + }, + { + "epoch": 3.0381036353771025, + "grad_norm": 4.022488594055176, + "learning_rate": 3.6554748526791834e-06, + "loss": 0.1252, + "step": 47047 + }, + { + "epoch": 3.0381172002170374, + "grad_norm": 3.6033871173858643, + "learning_rate": 3.6553378100589286e-06, + "loss": 0.1229, + "step": 47048 + }, + { + "epoch": 3.038130765056972, + "grad_norm": 4.839437961578369, + "learning_rate": 3.6552007674386734e-06, + "loss": 0.1037, + "step": 47049 + }, + { + "epoch": 3.038144329896907, + "grad_norm": 4.802637577056885, + "learning_rate": 3.655063724818419e-06, + "loss": 0.1281, + "step": 47050 + }, + { + "epoch": 3.038157894736842, + "grad_norm": 4.417783260345459, + "learning_rate": 3.6549266821981637e-06, + "loss": 0.1321, + "step": 47051 + }, + { + "epoch": 3.038171459576777, + "grad_norm": 3.804429292678833, + "learning_rate": 3.6547896395779093e-06, + "loss": 0.0725, + "step": 47052 + }, + { + "epoch": 3.0381850244167117, + "grad_norm": 3.506026029586792, + "learning_rate": 3.654652596957654e-06, + "loss": 0.0646, + "step": 47053 + }, + { + "epoch": 3.038198589256647, + "grad_norm": 5.226632595062256, + "learning_rate": 3.6545155543373993e-06, + "loss": 0.1559, + "step": 47054 + }, + { + "epoch": 3.038212154096582, + "grad_norm": 3.08620023727417, + "learning_rate": 3.6543785117171445e-06, + "loss": 0.0847, + "step": 47055 + }, + { + "epoch": 3.0382257189365167, + "grad_norm": 3.614837646484375, + "learning_rate": 3.6542414690968896e-06, + "loss": 0.0941, + "step": 47056 + }, + { + "epoch": 3.0382392837764516, + "grad_norm": 4.7290496826171875, + "learning_rate": 3.6541044264766344e-06, + "loss": 0.2141, + "step": 47057 + }, + { + "epoch": 3.0382528486163864, + "grad_norm": 4.64180850982666, + "learning_rate": 3.6539673838563796e-06, + "loss": 0.0837, + "step": 47058 + }, + { + "epoch": 3.0382664134563213, + "grad_norm": 5.521059036254883, + "learning_rate": 3.6538303412361247e-06, + "loss": 0.1817, + "step": 47059 + }, + { + "epoch": 3.038279978296256, + "grad_norm": 4.650789260864258, + "learning_rate": 3.6536932986158695e-06, + "loss": 0.1346, + "step": 47060 + }, + { + "epoch": 3.038293543136191, + "grad_norm": 3.790985345840454, + "learning_rate": 3.653556255995615e-06, + "loss": 0.0852, + "step": 47061 + }, + { + "epoch": 3.038307107976126, + "grad_norm": 3.9195024967193604, + "learning_rate": 3.65341921337536e-06, + "loss": 0.0861, + "step": 47062 + }, + { + "epoch": 3.0383206728160608, + "grad_norm": 3.3171839714050293, + "learning_rate": 3.6532821707551055e-06, + "loss": 0.1269, + "step": 47063 + }, + { + "epoch": 3.0383342376559956, + "grad_norm": 3.7014729976654053, + "learning_rate": 3.6531451281348502e-06, + "loss": 0.09, + "step": 47064 + }, + { + "epoch": 3.0383478024959305, + "grad_norm": 2.848909854888916, + "learning_rate": 3.6530080855145954e-06, + "loss": 0.0815, + "step": 47065 + }, + { + "epoch": 3.0383613673358654, + "grad_norm": 4.387819290161133, + "learning_rate": 3.65287104289434e-06, + "loss": 0.1077, + "step": 47066 + }, + { + "epoch": 3.0383749321758002, + "grad_norm": 4.064497470855713, + "learning_rate": 3.6527340002740858e-06, + "loss": 0.0984, + "step": 47067 + }, + { + "epoch": 3.038388497015735, + "grad_norm": 3.024427890777588, + "learning_rate": 3.6525969576538305e-06, + "loss": 0.0654, + "step": 47068 + }, + { + "epoch": 3.03840206185567, + "grad_norm": 5.166894435882568, + "learning_rate": 3.652459915033576e-06, + "loss": 0.1616, + "step": 47069 + }, + { + "epoch": 3.038415626695605, + "grad_norm": 4.612986087799072, + "learning_rate": 3.652322872413321e-06, + "loss": 0.1522, + "step": 47070 + }, + { + "epoch": 3.0384291915355397, + "grad_norm": 5.33118200302124, + "learning_rate": 3.6521858297930656e-06, + "loss": 0.1276, + "step": 47071 + }, + { + "epoch": 3.0384427563754746, + "grad_norm": 5.000896453857422, + "learning_rate": 3.6520487871728112e-06, + "loss": 0.1396, + "step": 47072 + }, + { + "epoch": 3.03845632121541, + "grad_norm": 3.749673843383789, + "learning_rate": 3.651911744552556e-06, + "loss": 0.1078, + "step": 47073 + }, + { + "epoch": 3.0384698860553447, + "grad_norm": 5.760154724121094, + "learning_rate": 3.651774701932301e-06, + "loss": 0.1537, + "step": 47074 + }, + { + "epoch": 3.0384834508952796, + "grad_norm": 3.5071957111358643, + "learning_rate": 3.6516376593120463e-06, + "loss": 0.1093, + "step": 47075 + }, + { + "epoch": 3.0384970157352145, + "grad_norm": 3.450894832611084, + "learning_rate": 3.6515006166917915e-06, + "loss": 0.094, + "step": 47076 + }, + { + "epoch": 3.0385105805751493, + "grad_norm": 3.5728907585144043, + "learning_rate": 3.6513635740715363e-06, + "loss": 0.0702, + "step": 47077 + }, + { + "epoch": 3.038524145415084, + "grad_norm": 4.677549839019775, + "learning_rate": 3.651226531451282e-06, + "loss": 0.1524, + "step": 47078 + }, + { + "epoch": 3.038537710255019, + "grad_norm": 3.9074349403381348, + "learning_rate": 3.6510894888310266e-06, + "loss": 0.0809, + "step": 47079 + }, + { + "epoch": 3.038551275094954, + "grad_norm": 5.787700176239014, + "learning_rate": 3.6509524462107722e-06, + "loss": 0.1236, + "step": 47080 + }, + { + "epoch": 3.038564839934889, + "grad_norm": 3.0499343872070312, + "learning_rate": 3.650815403590517e-06, + "loss": 0.0649, + "step": 47081 + }, + { + "epoch": 3.0385784047748237, + "grad_norm": 2.4340405464172363, + "learning_rate": 3.650678360970262e-06, + "loss": 0.0488, + "step": 47082 + }, + { + "epoch": 3.0385919696147585, + "grad_norm": 4.098819255828857, + "learning_rate": 3.6505413183500073e-06, + "loss": 0.117, + "step": 47083 + }, + { + "epoch": 3.0386055344546934, + "grad_norm": 3.591259002685547, + "learning_rate": 3.650404275729752e-06, + "loss": 0.1005, + "step": 47084 + }, + { + "epoch": 3.0386190992946283, + "grad_norm": 4.483841896057129, + "learning_rate": 3.6502672331094973e-06, + "loss": 0.1669, + "step": 47085 + }, + { + "epoch": 3.038632664134563, + "grad_norm": 3.9880881309509277, + "learning_rate": 3.650130190489242e-06, + "loss": 0.1306, + "step": 47086 + }, + { + "epoch": 3.038646228974498, + "grad_norm": 5.202661037445068, + "learning_rate": 3.6499931478689876e-06, + "loss": 0.2275, + "step": 47087 + }, + { + "epoch": 3.038659793814433, + "grad_norm": 4.315067768096924, + "learning_rate": 3.6498561052487324e-06, + "loss": 0.1229, + "step": 47088 + }, + { + "epoch": 3.0386733586543677, + "grad_norm": 5.6769866943359375, + "learning_rate": 3.649719062628478e-06, + "loss": 0.1196, + "step": 47089 + }, + { + "epoch": 3.0386869234943026, + "grad_norm": 5.822902202606201, + "learning_rate": 3.6495820200082227e-06, + "loss": 0.1509, + "step": 47090 + }, + { + "epoch": 3.0387004883342374, + "grad_norm": 4.168835163116455, + "learning_rate": 3.649444977387968e-06, + "loss": 0.0859, + "step": 47091 + }, + { + "epoch": 3.0387140531741728, + "grad_norm": 4.231726169586182, + "learning_rate": 3.649307934767713e-06, + "loss": 0.114, + "step": 47092 + }, + { + "epoch": 3.0387276180141076, + "grad_norm": 5.93998908996582, + "learning_rate": 3.6491708921474583e-06, + "loss": 0.1421, + "step": 47093 + }, + { + "epoch": 3.0387411828540425, + "grad_norm": 7.943330764770508, + "learning_rate": 3.649033849527203e-06, + "loss": 0.1219, + "step": 47094 + }, + { + "epoch": 3.0387547476939774, + "grad_norm": 4.655385494232178, + "learning_rate": 3.6488968069069486e-06, + "loss": 0.1696, + "step": 47095 + }, + { + "epoch": 3.038768312533912, + "grad_norm": 7.4574456214904785, + "learning_rate": 3.6487597642866934e-06, + "loss": 0.2022, + "step": 47096 + }, + { + "epoch": 3.038781877373847, + "grad_norm": 4.982844829559326, + "learning_rate": 3.648622721666439e-06, + "loss": 0.094, + "step": 47097 + }, + { + "epoch": 3.038795442213782, + "grad_norm": 4.726414680480957, + "learning_rate": 3.6484856790461838e-06, + "loss": 0.124, + "step": 47098 + }, + { + "epoch": 3.038809007053717, + "grad_norm": 3.6127567291259766, + "learning_rate": 3.6483486364259285e-06, + "loss": 0.0821, + "step": 47099 + }, + { + "epoch": 3.0388225718936517, + "grad_norm": 4.003477573394775, + "learning_rate": 3.648211593805674e-06, + "loss": 0.1249, + "step": 47100 + }, + { + "epoch": 3.0388361367335865, + "grad_norm": 5.168701171875, + "learning_rate": 3.648074551185419e-06, + "loss": 0.2003, + "step": 47101 + }, + { + "epoch": 3.0388497015735214, + "grad_norm": 3.8078908920288086, + "learning_rate": 3.647937508565164e-06, + "loss": 0.1063, + "step": 47102 + }, + { + "epoch": 3.0388632664134563, + "grad_norm": 4.352163791656494, + "learning_rate": 3.647800465944909e-06, + "loss": 0.1192, + "step": 47103 + }, + { + "epoch": 3.038876831253391, + "grad_norm": 4.596829891204834, + "learning_rate": 3.6476634233246544e-06, + "loss": 0.1255, + "step": 47104 + }, + { + "epoch": 3.038890396093326, + "grad_norm": 5.188440322875977, + "learning_rate": 3.647526380704399e-06, + "loss": 0.1001, + "step": 47105 + }, + { + "epoch": 3.038903960933261, + "grad_norm": 5.269174098968506, + "learning_rate": 3.6473893380841448e-06, + "loss": 0.1489, + "step": 47106 + }, + { + "epoch": 3.0389175257731957, + "grad_norm": 5.06461238861084, + "learning_rate": 3.6472522954638895e-06, + "loss": 0.0958, + "step": 47107 + }, + { + "epoch": 3.0389310906131306, + "grad_norm": 3.7613232135772705, + "learning_rate": 3.6471152528436347e-06, + "loss": 0.1506, + "step": 47108 + }, + { + "epoch": 3.0389446554530655, + "grad_norm": 4.844725608825684, + "learning_rate": 3.64697821022338e-06, + "loss": 0.1122, + "step": 47109 + }, + { + "epoch": 3.0389582202930003, + "grad_norm": 4.451785087585449, + "learning_rate": 3.646841167603125e-06, + "loss": 0.0944, + "step": 47110 + }, + { + "epoch": 3.0389717851329356, + "grad_norm": 3.86846661567688, + "learning_rate": 3.64670412498287e-06, + "loss": 0.1411, + "step": 47111 + }, + { + "epoch": 3.0389853499728705, + "grad_norm": 5.154175758361816, + "learning_rate": 3.646567082362615e-06, + "loss": 0.1742, + "step": 47112 + }, + { + "epoch": 3.0389989148128054, + "grad_norm": 4.677231788635254, + "learning_rate": 3.64643003974236e-06, + "loss": 0.0597, + "step": 47113 + }, + { + "epoch": 3.0390124796527402, + "grad_norm": 3.843696117401123, + "learning_rate": 3.646292997122105e-06, + "loss": 0.1049, + "step": 47114 + }, + { + "epoch": 3.039026044492675, + "grad_norm": 3.322265863418579, + "learning_rate": 3.6461559545018505e-06, + "loss": 0.0557, + "step": 47115 + }, + { + "epoch": 3.03903960933261, + "grad_norm": 4.211303234100342, + "learning_rate": 3.6460189118815953e-06, + "loss": 0.0701, + "step": 47116 + }, + { + "epoch": 3.039053174172545, + "grad_norm": 4.592422962188721, + "learning_rate": 3.645881869261341e-06, + "loss": 0.085, + "step": 47117 + }, + { + "epoch": 3.0390667390124797, + "grad_norm": 4.023621082305908, + "learning_rate": 3.6457448266410856e-06, + "loss": 0.079, + "step": 47118 + }, + { + "epoch": 3.0390803038524146, + "grad_norm": 4.082159519195557, + "learning_rate": 3.645607784020831e-06, + "loss": 0.1407, + "step": 47119 + }, + { + "epoch": 3.0390938686923494, + "grad_norm": 3.9416937828063965, + "learning_rate": 3.6454707414005756e-06, + "loss": 0.1124, + "step": 47120 + }, + { + "epoch": 3.0391074335322843, + "grad_norm": 4.493527412414551, + "learning_rate": 3.645333698780321e-06, + "loss": 0.1233, + "step": 47121 + }, + { + "epoch": 3.039120998372219, + "grad_norm": 3.365312099456787, + "learning_rate": 3.645196656160066e-06, + "loss": 0.0737, + "step": 47122 + }, + { + "epoch": 3.039134563212154, + "grad_norm": 2.764462947845459, + "learning_rate": 3.6450596135398115e-06, + "loss": 0.0855, + "step": 47123 + }, + { + "epoch": 3.039148128052089, + "grad_norm": 4.5685858726501465, + "learning_rate": 3.6449225709195563e-06, + "loss": 0.1945, + "step": 47124 + }, + { + "epoch": 3.0391616928920238, + "grad_norm": 3.531207799911499, + "learning_rate": 3.6447855282993015e-06, + "loss": 0.1001, + "step": 47125 + }, + { + "epoch": 3.0391752577319586, + "grad_norm": 2.5937535762786865, + "learning_rate": 3.6446484856790466e-06, + "loss": 0.0823, + "step": 47126 + }, + { + "epoch": 3.0391888225718935, + "grad_norm": 4.3271484375, + "learning_rate": 3.6445114430587914e-06, + "loss": 0.132, + "step": 47127 + }, + { + "epoch": 3.0392023874118284, + "grad_norm": 3.657022714614868, + "learning_rate": 3.6443744004385366e-06, + "loss": 0.1262, + "step": 47128 + }, + { + "epoch": 3.039215952251763, + "grad_norm": 5.161551475524902, + "learning_rate": 3.6442373578182818e-06, + "loss": 0.1332, + "step": 47129 + }, + { + "epoch": 3.0392295170916985, + "grad_norm": 3.594843626022339, + "learning_rate": 3.644100315198027e-06, + "loss": 0.095, + "step": 47130 + }, + { + "epoch": 3.0392430819316334, + "grad_norm": 4.397260665893555, + "learning_rate": 3.6439632725777717e-06, + "loss": 0.179, + "step": 47131 + }, + { + "epoch": 3.0392566467715683, + "grad_norm": 3.863197088241577, + "learning_rate": 3.6438262299575173e-06, + "loss": 0.0861, + "step": 47132 + }, + { + "epoch": 3.039270211611503, + "grad_norm": 4.668326377868652, + "learning_rate": 3.643689187337262e-06, + "loss": 0.1466, + "step": 47133 + }, + { + "epoch": 3.039283776451438, + "grad_norm": 3.5121352672576904, + "learning_rate": 3.6435521447170077e-06, + "loss": 0.0717, + "step": 47134 + }, + { + "epoch": 3.039297341291373, + "grad_norm": 5.335738182067871, + "learning_rate": 3.6434151020967524e-06, + "loss": 0.1188, + "step": 47135 + }, + { + "epoch": 3.0393109061313077, + "grad_norm": 4.969570636749268, + "learning_rate": 3.6432780594764976e-06, + "loss": 0.1224, + "step": 47136 + }, + { + "epoch": 3.0393244709712426, + "grad_norm": 2.2220711708068848, + "learning_rate": 3.6431410168562423e-06, + "loss": 0.0362, + "step": 47137 + }, + { + "epoch": 3.0393380358111775, + "grad_norm": 6.5490875244140625, + "learning_rate": 3.643003974235988e-06, + "loss": 0.1878, + "step": 47138 + }, + { + "epoch": 3.0393516006511123, + "grad_norm": 3.900268077850342, + "learning_rate": 3.6428669316157327e-06, + "loss": 0.0738, + "step": 47139 + }, + { + "epoch": 3.039365165491047, + "grad_norm": 5.172567844390869, + "learning_rate": 3.6427298889954775e-06, + "loss": 0.1337, + "step": 47140 + }, + { + "epoch": 3.039378730330982, + "grad_norm": 5.5399861335754395, + "learning_rate": 3.642592846375223e-06, + "loss": 0.1316, + "step": 47141 + }, + { + "epoch": 3.039392295170917, + "grad_norm": 3.131021499633789, + "learning_rate": 3.642455803754968e-06, + "loss": 0.0605, + "step": 47142 + }, + { + "epoch": 3.039405860010852, + "grad_norm": 3.9941532611846924, + "learning_rate": 3.6423187611347134e-06, + "loss": 0.0617, + "step": 47143 + }, + { + "epoch": 3.0394194248507866, + "grad_norm": 3.9302825927734375, + "learning_rate": 3.642181718514458e-06, + "loss": 0.0729, + "step": 47144 + }, + { + "epoch": 3.0394329896907215, + "grad_norm": 4.989674091339111, + "learning_rate": 3.6420446758942034e-06, + "loss": 0.0987, + "step": 47145 + }, + { + "epoch": 3.0394465545306564, + "grad_norm": 4.357276439666748, + "learning_rate": 3.6419076332739485e-06, + "loss": 0.0777, + "step": 47146 + }, + { + "epoch": 3.0394601193705912, + "grad_norm": 4.365479946136475, + "learning_rate": 3.6417705906536937e-06, + "loss": 0.0927, + "step": 47147 + }, + { + "epoch": 3.039473684210526, + "grad_norm": 3.4148330688476562, + "learning_rate": 3.6416335480334385e-06, + "loss": 0.0872, + "step": 47148 + }, + { + "epoch": 3.0394872490504614, + "grad_norm": 4.988868713378906, + "learning_rate": 3.641496505413184e-06, + "loss": 0.1031, + "step": 47149 + }, + { + "epoch": 3.0395008138903963, + "grad_norm": 3.565237045288086, + "learning_rate": 3.641359462792929e-06, + "loss": 0.1023, + "step": 47150 + }, + { + "epoch": 3.039514378730331, + "grad_norm": 2.065056800842285, + "learning_rate": 3.6412224201726744e-06, + "loss": 0.0235, + "step": 47151 + }, + { + "epoch": 3.039527943570266, + "grad_norm": 3.991656541824341, + "learning_rate": 3.641085377552419e-06, + "loss": 0.1135, + "step": 47152 + }, + { + "epoch": 3.039541508410201, + "grad_norm": 5.18222188949585, + "learning_rate": 3.640948334932164e-06, + "loss": 0.0971, + "step": 47153 + }, + { + "epoch": 3.0395550732501357, + "grad_norm": 3.0995595455169678, + "learning_rate": 3.6408112923119095e-06, + "loss": 0.0748, + "step": 47154 + }, + { + "epoch": 3.0395686380900706, + "grad_norm": 4.090176582336426, + "learning_rate": 3.6406742496916543e-06, + "loss": 0.0841, + "step": 47155 + }, + { + "epoch": 3.0395822029300055, + "grad_norm": 5.4897990226745605, + "learning_rate": 3.6405372070713995e-06, + "loss": 0.101, + "step": 47156 + }, + { + "epoch": 3.0395957677699403, + "grad_norm": 4.308558940887451, + "learning_rate": 3.6404001644511442e-06, + "loss": 0.0974, + "step": 47157 + }, + { + "epoch": 3.039609332609875, + "grad_norm": 5.230873107910156, + "learning_rate": 3.64026312183089e-06, + "loss": 0.2172, + "step": 47158 + }, + { + "epoch": 3.03962289744981, + "grad_norm": 6.518733501434326, + "learning_rate": 3.6401260792106346e-06, + "loss": 0.145, + "step": 47159 + }, + { + "epoch": 3.039636462289745, + "grad_norm": 4.661663055419922, + "learning_rate": 3.63998903659038e-06, + "loss": 0.1058, + "step": 47160 + }, + { + "epoch": 3.03965002712968, + "grad_norm": 2.6117589473724365, + "learning_rate": 3.639851993970125e-06, + "loss": 0.0652, + "step": 47161 + }, + { + "epoch": 3.0396635919696147, + "grad_norm": 3.4543662071228027, + "learning_rate": 3.63971495134987e-06, + "loss": 0.0938, + "step": 47162 + }, + { + "epoch": 3.0396771568095495, + "grad_norm": 3.80702805519104, + "learning_rate": 3.6395779087296153e-06, + "loss": 0.1414, + "step": 47163 + }, + { + "epoch": 3.0396907216494844, + "grad_norm": 3.732447862625122, + "learning_rate": 3.6394408661093605e-06, + "loss": 0.1074, + "step": 47164 + }, + { + "epoch": 3.0397042864894193, + "grad_norm": 3.9433112144470215, + "learning_rate": 3.6393038234891052e-06, + "loss": 0.1137, + "step": 47165 + }, + { + "epoch": 3.039717851329354, + "grad_norm": 3.4836227893829346, + "learning_rate": 3.639166780868851e-06, + "loss": 0.0677, + "step": 47166 + }, + { + "epoch": 3.039731416169289, + "grad_norm": 3.5698697566986084, + "learning_rate": 3.6390297382485956e-06, + "loss": 0.0666, + "step": 47167 + }, + { + "epoch": 3.0397449810092243, + "grad_norm": 3.545696496963501, + "learning_rate": 3.6388926956283403e-06, + "loss": 0.0808, + "step": 47168 + }, + { + "epoch": 3.039758545849159, + "grad_norm": 3.593578338623047, + "learning_rate": 3.638755653008086e-06, + "loss": 0.0797, + "step": 47169 + }, + { + "epoch": 3.039772110689094, + "grad_norm": 4.486180305480957, + "learning_rate": 3.6386186103878307e-06, + "loss": 0.0866, + "step": 47170 + }, + { + "epoch": 3.039785675529029, + "grad_norm": 3.3235645294189453, + "learning_rate": 3.6384815677675763e-06, + "loss": 0.0976, + "step": 47171 + }, + { + "epoch": 3.0397992403689638, + "grad_norm": 4.460183620452881, + "learning_rate": 3.638344525147321e-06, + "loss": 0.1091, + "step": 47172 + }, + { + "epoch": 3.0398128052088986, + "grad_norm": 2.885996103286743, + "learning_rate": 3.6382074825270662e-06, + "loss": 0.0766, + "step": 47173 + }, + { + "epoch": 3.0398263700488335, + "grad_norm": 4.0611162185668945, + "learning_rate": 3.638070439906811e-06, + "loss": 0.0949, + "step": 47174 + }, + { + "epoch": 3.0398399348887684, + "grad_norm": 4.022807598114014, + "learning_rate": 3.6379333972865566e-06, + "loss": 0.0837, + "step": 47175 + }, + { + "epoch": 3.0398534997287032, + "grad_norm": 5.5393500328063965, + "learning_rate": 3.6377963546663014e-06, + "loss": 0.1772, + "step": 47176 + }, + { + "epoch": 3.039867064568638, + "grad_norm": 3.863668441772461, + "learning_rate": 3.637659312046047e-06, + "loss": 0.0614, + "step": 47177 + }, + { + "epoch": 3.039880629408573, + "grad_norm": 5.131038665771484, + "learning_rate": 3.6375222694257917e-06, + "loss": 0.0879, + "step": 47178 + }, + { + "epoch": 3.039894194248508, + "grad_norm": 4.2632856369018555, + "learning_rate": 3.637385226805537e-06, + "loss": 0.0939, + "step": 47179 + }, + { + "epoch": 3.0399077590884427, + "grad_norm": 2.092930793762207, + "learning_rate": 3.637248184185282e-06, + "loss": 0.0445, + "step": 47180 + }, + { + "epoch": 3.0399213239283776, + "grad_norm": 7.168996334075928, + "learning_rate": 3.637111141565027e-06, + "loss": 0.1082, + "step": 47181 + }, + { + "epoch": 3.0399348887683124, + "grad_norm": 3.1742684841156006, + "learning_rate": 3.636974098944772e-06, + "loss": 0.0821, + "step": 47182 + }, + { + "epoch": 3.0399484536082473, + "grad_norm": 3.554518699645996, + "learning_rate": 3.636837056324517e-06, + "loss": 0.0863, + "step": 47183 + }, + { + "epoch": 3.039962018448182, + "grad_norm": 5.4391093254089355, + "learning_rate": 3.6367000137042624e-06, + "loss": 0.1314, + "step": 47184 + }, + { + "epoch": 3.039975583288117, + "grad_norm": 2.6291658878326416, + "learning_rate": 3.636562971084007e-06, + "loss": 0.0329, + "step": 47185 + }, + { + "epoch": 3.039989148128052, + "grad_norm": 3.565228223800659, + "learning_rate": 3.6364259284637527e-06, + "loss": 0.1003, + "step": 47186 + }, + { + "epoch": 3.040002712967987, + "grad_norm": 3.4874885082244873, + "learning_rate": 3.6362888858434975e-06, + "loss": 0.0837, + "step": 47187 + }, + { + "epoch": 3.040016277807922, + "grad_norm": 6.283756732940674, + "learning_rate": 3.636151843223243e-06, + "loss": 0.1144, + "step": 47188 + }, + { + "epoch": 3.040029842647857, + "grad_norm": 5.639317989349365, + "learning_rate": 3.636014800602988e-06, + "loss": 0.1414, + "step": 47189 + }, + { + "epoch": 3.040043407487792, + "grad_norm": 6.302765369415283, + "learning_rate": 3.635877757982733e-06, + "loss": 0.1699, + "step": 47190 + }, + { + "epoch": 3.0400569723277266, + "grad_norm": 3.4499566555023193, + "learning_rate": 3.6357407153624778e-06, + "loss": 0.0972, + "step": 47191 + }, + { + "epoch": 3.0400705371676615, + "grad_norm": 5.223276615142822, + "learning_rate": 3.6356036727422234e-06, + "loss": 0.1416, + "step": 47192 + }, + { + "epoch": 3.0400841020075964, + "grad_norm": 3.4886832237243652, + "learning_rate": 3.635466630121968e-06, + "loss": 0.0963, + "step": 47193 + }, + { + "epoch": 3.0400976668475312, + "grad_norm": 3.914159059524536, + "learning_rate": 3.635329587501713e-06, + "loss": 0.0728, + "step": 47194 + }, + { + "epoch": 3.040111231687466, + "grad_norm": 5.236810207366943, + "learning_rate": 3.6351925448814585e-06, + "loss": 0.1025, + "step": 47195 + }, + { + "epoch": 3.040124796527401, + "grad_norm": 3.1704673767089844, + "learning_rate": 3.6350555022612032e-06, + "loss": 0.0638, + "step": 47196 + }, + { + "epoch": 3.040138361367336, + "grad_norm": 5.0640974044799805, + "learning_rate": 3.634918459640949e-06, + "loss": 0.1223, + "step": 47197 + }, + { + "epoch": 3.0401519262072707, + "grad_norm": 2.9676859378814697, + "learning_rate": 3.6347814170206936e-06, + "loss": 0.0684, + "step": 47198 + }, + { + "epoch": 3.0401654910472056, + "grad_norm": 3.94795298576355, + "learning_rate": 3.6346443744004388e-06, + "loss": 0.0555, + "step": 47199 + }, + { + "epoch": 3.0401790558871404, + "grad_norm": 2.693995475769043, + "learning_rate": 3.634507331780184e-06, + "loss": 0.0571, + "step": 47200 + }, + { + "epoch": 3.0401926207270753, + "grad_norm": 3.67272686958313, + "learning_rate": 3.634370289159929e-06, + "loss": 0.0651, + "step": 47201 + }, + { + "epoch": 3.04020618556701, + "grad_norm": 6.683569431304932, + "learning_rate": 3.634233246539674e-06, + "loss": 0.1608, + "step": 47202 + }, + { + "epoch": 3.040219750406945, + "grad_norm": 3.128955602645874, + "learning_rate": 3.6340962039194195e-06, + "loss": 0.0716, + "step": 47203 + }, + { + "epoch": 3.04023331524688, + "grad_norm": 3.6265435218811035, + "learning_rate": 3.6339591612991642e-06, + "loss": 0.0786, + "step": 47204 + }, + { + "epoch": 3.0402468800868148, + "grad_norm": 3.0945324897766113, + "learning_rate": 3.63382211867891e-06, + "loss": 0.0498, + "step": 47205 + }, + { + "epoch": 3.04026044492675, + "grad_norm": 3.0287840366363525, + "learning_rate": 3.6336850760586546e-06, + "loss": 0.0754, + "step": 47206 + }, + { + "epoch": 3.040274009766685, + "grad_norm": 2.623781442642212, + "learning_rate": 3.6335480334383998e-06, + "loss": 0.0522, + "step": 47207 + }, + { + "epoch": 3.04028757460662, + "grad_norm": 5.394983768463135, + "learning_rate": 3.6334109908181445e-06, + "loss": 0.1363, + "step": 47208 + }, + { + "epoch": 3.0403011394465547, + "grad_norm": 2.3973917961120605, + "learning_rate": 3.6332739481978897e-06, + "loss": 0.0308, + "step": 47209 + }, + { + "epoch": 3.0403147042864895, + "grad_norm": 3.2177324295043945, + "learning_rate": 3.633136905577635e-06, + "loss": 0.0498, + "step": 47210 + }, + { + "epoch": 3.0403282691264244, + "grad_norm": 4.941718101501465, + "learning_rate": 3.6329998629573797e-06, + "loss": 0.1101, + "step": 47211 + }, + { + "epoch": 3.0403418339663593, + "grad_norm": 3.6734635829925537, + "learning_rate": 3.6328628203371253e-06, + "loss": 0.0665, + "step": 47212 + }, + { + "epoch": 3.040355398806294, + "grad_norm": 3.9064462184906006, + "learning_rate": 3.63272577771687e-06, + "loss": 0.1309, + "step": 47213 + }, + { + "epoch": 3.040368963646229, + "grad_norm": 2.3461220264434814, + "learning_rate": 3.6325887350966156e-06, + "loss": 0.0289, + "step": 47214 + }, + { + "epoch": 3.040382528486164, + "grad_norm": 4.056967735290527, + "learning_rate": 3.6324516924763604e-06, + "loss": 0.0951, + "step": 47215 + }, + { + "epoch": 3.0403960933260987, + "grad_norm": 3.275684356689453, + "learning_rate": 3.6323146498561055e-06, + "loss": 0.1238, + "step": 47216 + }, + { + "epoch": 3.0404096581660336, + "grad_norm": 5.136318206787109, + "learning_rate": 3.6321776072358507e-06, + "loss": 0.0948, + "step": 47217 + }, + { + "epoch": 3.0404232230059685, + "grad_norm": 3.0369160175323486, + "learning_rate": 3.632040564615596e-06, + "loss": 0.0799, + "step": 47218 + }, + { + "epoch": 3.0404367878459033, + "grad_norm": 3.2018697261810303, + "learning_rate": 3.6319035219953407e-06, + "loss": 0.1026, + "step": 47219 + }, + { + "epoch": 3.040450352685838, + "grad_norm": 3.087158441543579, + "learning_rate": 3.6317664793750863e-06, + "loss": 0.0781, + "step": 47220 + }, + { + "epoch": 3.040463917525773, + "grad_norm": 2.838887929916382, + "learning_rate": 3.631629436754831e-06, + "loss": 0.0971, + "step": 47221 + }, + { + "epoch": 3.040477482365708, + "grad_norm": 3.230477809906006, + "learning_rate": 3.6314923941345758e-06, + "loss": 0.052, + "step": 47222 + }, + { + "epoch": 3.040491047205643, + "grad_norm": 4.476428508758545, + "learning_rate": 3.6313553515143214e-06, + "loss": 0.105, + "step": 47223 + }, + { + "epoch": 3.0405046120455776, + "grad_norm": 3.154982089996338, + "learning_rate": 3.631218308894066e-06, + "loss": 0.0803, + "step": 47224 + }, + { + "epoch": 3.040518176885513, + "grad_norm": 3.096384048461914, + "learning_rate": 3.6310812662738117e-06, + "loss": 0.0622, + "step": 47225 + }, + { + "epoch": 3.040531741725448, + "grad_norm": 2.3242623805999756, + "learning_rate": 3.6309442236535565e-06, + "loss": 0.0333, + "step": 47226 + }, + { + "epoch": 3.0405453065653827, + "grad_norm": 3.6323859691619873, + "learning_rate": 3.6308071810333017e-06, + "loss": 0.0755, + "step": 47227 + }, + { + "epoch": 3.0405588714053176, + "grad_norm": 2.9378039836883545, + "learning_rate": 3.6306701384130464e-06, + "loss": 0.0704, + "step": 47228 + }, + { + "epoch": 3.0405724362452524, + "grad_norm": 2.930485486984253, + "learning_rate": 3.630533095792792e-06, + "loss": 0.1594, + "step": 47229 + }, + { + "epoch": 3.0405860010851873, + "grad_norm": 3.8465192317962646, + "learning_rate": 3.6303960531725368e-06, + "loss": 0.1035, + "step": 47230 + }, + { + "epoch": 3.040599565925122, + "grad_norm": 3.857895851135254, + "learning_rate": 3.6302590105522824e-06, + "loss": 0.106, + "step": 47231 + }, + { + "epoch": 3.040613130765057, + "grad_norm": 3.363168478012085, + "learning_rate": 3.630121967932027e-06, + "loss": 0.0944, + "step": 47232 + }, + { + "epoch": 3.040626695604992, + "grad_norm": 5.540727615356445, + "learning_rate": 3.6299849253117723e-06, + "loss": 0.1023, + "step": 47233 + }, + { + "epoch": 3.0406402604449267, + "grad_norm": 3.9991555213928223, + "learning_rate": 3.6298478826915175e-06, + "loss": 0.0765, + "step": 47234 + }, + { + "epoch": 3.0406538252848616, + "grad_norm": 4.019188404083252, + "learning_rate": 3.6297108400712627e-06, + "loss": 0.1658, + "step": 47235 + }, + { + "epoch": 3.0406673901247965, + "grad_norm": 2.685680389404297, + "learning_rate": 3.6295737974510074e-06, + "loss": 0.0581, + "step": 47236 + }, + { + "epoch": 3.0406809549647313, + "grad_norm": 3.390627145767212, + "learning_rate": 3.6294367548307526e-06, + "loss": 0.1013, + "step": 47237 + }, + { + "epoch": 3.040694519804666, + "grad_norm": 5.823672771453857, + "learning_rate": 3.6292997122104978e-06, + "loss": 0.1018, + "step": 47238 + }, + { + "epoch": 3.040708084644601, + "grad_norm": 3.8777453899383545, + "learning_rate": 3.6291626695902425e-06, + "loss": 0.1087, + "step": 47239 + }, + { + "epoch": 3.040721649484536, + "grad_norm": 4.4039106369018555, + "learning_rate": 3.629025626969988e-06, + "loss": 0.1139, + "step": 47240 + }, + { + "epoch": 3.040735214324471, + "grad_norm": 3.7064504623413086, + "learning_rate": 3.628888584349733e-06, + "loss": 0.1083, + "step": 47241 + }, + { + "epoch": 3.0407487791644057, + "grad_norm": 3.175729513168335, + "learning_rate": 3.6287515417294785e-06, + "loss": 0.0832, + "step": 47242 + }, + { + "epoch": 3.0407623440043405, + "grad_norm": 3.0495829582214355, + "learning_rate": 3.6286144991092233e-06, + "loss": 0.1063, + "step": 47243 + }, + { + "epoch": 3.040775908844276, + "grad_norm": 4.281947612762451, + "learning_rate": 3.6284774564889684e-06, + "loss": 0.1179, + "step": 47244 + }, + { + "epoch": 3.0407894736842107, + "grad_norm": 4.4452433586120605, + "learning_rate": 3.628340413868713e-06, + "loss": 0.1968, + "step": 47245 + }, + { + "epoch": 3.0408030385241456, + "grad_norm": 4.2920427322387695, + "learning_rate": 3.628203371248459e-06, + "loss": 0.1415, + "step": 47246 + }, + { + "epoch": 3.0408166033640804, + "grad_norm": 4.423068523406982, + "learning_rate": 3.6280663286282035e-06, + "loss": 0.1864, + "step": 47247 + }, + { + "epoch": 3.0408301682040153, + "grad_norm": 3.9359230995178223, + "learning_rate": 3.627929286007949e-06, + "loss": 0.1407, + "step": 47248 + }, + { + "epoch": 3.04084373304395, + "grad_norm": 2.8660240173339844, + "learning_rate": 3.627792243387694e-06, + "loss": 0.1228, + "step": 47249 + }, + { + "epoch": 3.040857297883885, + "grad_norm": 3.717912197113037, + "learning_rate": 3.6276552007674387e-06, + "loss": 0.1358, + "step": 47250 + }, + { + "epoch": 3.04087086272382, + "grad_norm": 3.702749013900757, + "learning_rate": 3.6275181581471843e-06, + "loss": 0.0882, + "step": 47251 + }, + { + "epoch": 3.0408844275637548, + "grad_norm": 3.3423500061035156, + "learning_rate": 3.627381115526929e-06, + "loss": 0.1603, + "step": 47252 + }, + { + "epoch": 3.0408979924036896, + "grad_norm": 3.1518115997314453, + "learning_rate": 3.627244072906674e-06, + "loss": 0.0713, + "step": 47253 + }, + { + "epoch": 3.0409115572436245, + "grad_norm": 4.354740619659424, + "learning_rate": 3.6271070302864194e-06, + "loss": 0.1121, + "step": 47254 + }, + { + "epoch": 3.0409251220835594, + "grad_norm": 3.4116814136505127, + "learning_rate": 3.6269699876661646e-06, + "loss": 0.0813, + "step": 47255 + }, + { + "epoch": 3.0409386869234942, + "grad_norm": 3.1773698329925537, + "learning_rate": 3.6268329450459093e-06, + "loss": 0.1075, + "step": 47256 + }, + { + "epoch": 3.040952251763429, + "grad_norm": 2.99214768409729, + "learning_rate": 3.626695902425655e-06, + "loss": 0.1064, + "step": 47257 + }, + { + "epoch": 3.040965816603364, + "grad_norm": 2.508425712585449, + "learning_rate": 3.6265588598053997e-06, + "loss": 0.069, + "step": 47258 + }, + { + "epoch": 3.040979381443299, + "grad_norm": 4.577579021453857, + "learning_rate": 3.6264218171851453e-06, + "loss": 0.1041, + "step": 47259 + }, + { + "epoch": 3.0409929462832337, + "grad_norm": 3.297273874282837, + "learning_rate": 3.62628477456489e-06, + "loss": 0.0759, + "step": 47260 + }, + { + "epoch": 3.0410065111231686, + "grad_norm": 4.612371921539307, + "learning_rate": 3.626147731944635e-06, + "loss": 0.0897, + "step": 47261 + }, + { + "epoch": 3.0410200759631034, + "grad_norm": 4.220597267150879, + "learning_rate": 3.62601068932438e-06, + "loss": 0.1429, + "step": 47262 + }, + { + "epoch": 3.0410336408030387, + "grad_norm": 3.789445400238037, + "learning_rate": 3.625873646704125e-06, + "loss": 0.0553, + "step": 47263 + }, + { + "epoch": 3.0410472056429736, + "grad_norm": 2.746793508529663, + "learning_rate": 3.6257366040838703e-06, + "loss": 0.0656, + "step": 47264 + }, + { + "epoch": 3.0410607704829085, + "grad_norm": 5.515222072601318, + "learning_rate": 3.625599561463615e-06, + "loss": 0.136, + "step": 47265 + }, + { + "epoch": 3.0410743353228433, + "grad_norm": 4.493452072143555, + "learning_rate": 3.6254625188433607e-06, + "loss": 0.1063, + "step": 47266 + }, + { + "epoch": 3.041087900162778, + "grad_norm": 4.13193416595459, + "learning_rate": 3.6253254762231054e-06, + "loss": 0.1047, + "step": 47267 + }, + { + "epoch": 3.041101465002713, + "grad_norm": 4.174191474914551, + "learning_rate": 3.625188433602851e-06, + "loss": 0.148, + "step": 47268 + }, + { + "epoch": 3.041115029842648, + "grad_norm": 3.2592711448669434, + "learning_rate": 3.625051390982596e-06, + "loss": 0.113, + "step": 47269 + }, + { + "epoch": 3.041128594682583, + "grad_norm": 5.682046413421631, + "learning_rate": 3.624914348362341e-06, + "loss": 0.1401, + "step": 47270 + }, + { + "epoch": 3.0411421595225177, + "grad_norm": 3.48075008392334, + "learning_rate": 3.624777305742086e-06, + "loss": 0.0962, + "step": 47271 + }, + { + "epoch": 3.0411557243624525, + "grad_norm": 6.596531391143799, + "learning_rate": 3.6246402631218313e-06, + "loss": 0.1505, + "step": 47272 + }, + { + "epoch": 3.0411692892023874, + "grad_norm": 4.847977161407471, + "learning_rate": 3.624503220501576e-06, + "loss": 0.0755, + "step": 47273 + }, + { + "epoch": 3.0411828540423222, + "grad_norm": 4.031292915344238, + "learning_rate": 3.6243661778813217e-06, + "loss": 0.136, + "step": 47274 + }, + { + "epoch": 3.041196418882257, + "grad_norm": 3.8380889892578125, + "learning_rate": 3.6242291352610664e-06, + "loss": 0.1374, + "step": 47275 + }, + { + "epoch": 3.041209983722192, + "grad_norm": 4.77449369430542, + "learning_rate": 3.624092092640812e-06, + "loss": 0.1454, + "step": 47276 + }, + { + "epoch": 3.041223548562127, + "grad_norm": 4.6179656982421875, + "learning_rate": 3.623955050020557e-06, + "loss": 0.1057, + "step": 47277 + }, + { + "epoch": 3.0412371134020617, + "grad_norm": 4.164539813995361, + "learning_rate": 3.6238180074003016e-06, + "loss": 0.1513, + "step": 47278 + }, + { + "epoch": 3.0412506782419966, + "grad_norm": 4.086702823638916, + "learning_rate": 3.6236809647800467e-06, + "loss": 0.1409, + "step": 47279 + }, + { + "epoch": 3.0412642430819314, + "grad_norm": 5.833071708679199, + "learning_rate": 3.623543922159792e-06, + "loss": 0.1495, + "step": 47280 + }, + { + "epoch": 3.0412778079218663, + "grad_norm": 3.913907051086426, + "learning_rate": 3.623406879539537e-06, + "loss": 0.1243, + "step": 47281 + }, + { + "epoch": 3.0412913727618016, + "grad_norm": 4.510562896728516, + "learning_rate": 3.623269836919282e-06, + "loss": 0.1124, + "step": 47282 + }, + { + "epoch": 3.0413049376017365, + "grad_norm": 3.272432804107666, + "learning_rate": 3.6231327942990274e-06, + "loss": 0.1167, + "step": 47283 + }, + { + "epoch": 3.0413185024416713, + "grad_norm": 4.8289690017700195, + "learning_rate": 3.622995751678772e-06, + "loss": 0.1171, + "step": 47284 + }, + { + "epoch": 3.041332067281606, + "grad_norm": 5.51137113571167, + "learning_rate": 3.622858709058518e-06, + "loss": 0.1726, + "step": 47285 + }, + { + "epoch": 3.041345632121541, + "grad_norm": 4.262781143188477, + "learning_rate": 3.6227216664382626e-06, + "loss": 0.1048, + "step": 47286 + }, + { + "epoch": 3.041359196961476, + "grad_norm": 3.1224114894866943, + "learning_rate": 3.6225846238180077e-06, + "loss": 0.0793, + "step": 47287 + }, + { + "epoch": 3.041372761801411, + "grad_norm": 4.414718151092529, + "learning_rate": 3.622447581197753e-06, + "loss": 0.1256, + "step": 47288 + }, + { + "epoch": 3.0413863266413457, + "grad_norm": 5.131211757659912, + "learning_rate": 3.622310538577498e-06, + "loss": 0.1797, + "step": 47289 + }, + { + "epoch": 3.0413998914812805, + "grad_norm": 5.999087333679199, + "learning_rate": 3.622173495957243e-06, + "loss": 0.1556, + "step": 47290 + }, + { + "epoch": 3.0414134563212154, + "grad_norm": 3.5005106925964355, + "learning_rate": 3.622036453336988e-06, + "loss": 0.0681, + "step": 47291 + }, + { + "epoch": 3.0414270211611503, + "grad_norm": 3.8034322261810303, + "learning_rate": 3.621899410716733e-06, + "loss": 0.1128, + "step": 47292 + }, + { + "epoch": 3.041440586001085, + "grad_norm": 3.8476321697235107, + "learning_rate": 3.621762368096478e-06, + "loss": 0.1085, + "step": 47293 + }, + { + "epoch": 3.04145415084102, + "grad_norm": 4.515671730041504, + "learning_rate": 3.6216253254762236e-06, + "loss": 0.1593, + "step": 47294 + }, + { + "epoch": 3.041467715680955, + "grad_norm": 4.586877346038818, + "learning_rate": 3.6214882828559683e-06, + "loss": 0.1022, + "step": 47295 + }, + { + "epoch": 3.0414812805208897, + "grad_norm": 3.722419261932373, + "learning_rate": 3.621351240235714e-06, + "loss": 0.0735, + "step": 47296 + }, + { + "epoch": 3.0414948453608246, + "grad_norm": 2.2569475173950195, + "learning_rate": 3.6212141976154587e-06, + "loss": 0.0369, + "step": 47297 + }, + { + "epoch": 3.0415084102007595, + "grad_norm": 6.808414459228516, + "learning_rate": 3.621077154995204e-06, + "loss": 0.1743, + "step": 47298 + }, + { + "epoch": 3.0415219750406943, + "grad_norm": 4.759591102600098, + "learning_rate": 3.6209401123749486e-06, + "loss": 0.1668, + "step": 47299 + }, + { + "epoch": 3.041535539880629, + "grad_norm": 3.617936372756958, + "learning_rate": 3.6208030697546942e-06, + "loss": 0.0692, + "step": 47300 + }, + { + "epoch": 3.0415491047205645, + "grad_norm": 3.015030860900879, + "learning_rate": 3.620666027134439e-06, + "loss": 0.0568, + "step": 47301 + }, + { + "epoch": 3.0415626695604994, + "grad_norm": 4.627802848815918, + "learning_rate": 3.6205289845141846e-06, + "loss": 0.1471, + "step": 47302 + }, + { + "epoch": 3.0415762344004342, + "grad_norm": 3.501986265182495, + "learning_rate": 3.6203919418939293e-06, + "loss": 0.0654, + "step": 47303 + }, + { + "epoch": 3.041589799240369, + "grad_norm": 4.21612548828125, + "learning_rate": 3.6202548992736745e-06, + "loss": 0.1133, + "step": 47304 + }, + { + "epoch": 3.041603364080304, + "grad_norm": 3.845982313156128, + "learning_rate": 3.6201178566534197e-06, + "loss": 0.1033, + "step": 47305 + }, + { + "epoch": 3.041616928920239, + "grad_norm": 5.101373672485352, + "learning_rate": 3.6199808140331644e-06, + "loss": 0.1265, + "step": 47306 + }, + { + "epoch": 3.0416304937601737, + "grad_norm": 3.5948026180267334, + "learning_rate": 3.6198437714129096e-06, + "loss": 0.0769, + "step": 47307 + }, + { + "epoch": 3.0416440586001086, + "grad_norm": 5.063244342803955, + "learning_rate": 3.619706728792655e-06, + "loss": 0.1227, + "step": 47308 + }, + { + "epoch": 3.0416576234400434, + "grad_norm": 6.9530253410339355, + "learning_rate": 3.6195696861724e-06, + "loss": 0.2322, + "step": 47309 + }, + { + "epoch": 3.0416711882799783, + "grad_norm": 6.466022491455078, + "learning_rate": 3.6194326435521447e-06, + "loss": 0.2299, + "step": 47310 + }, + { + "epoch": 3.041684753119913, + "grad_norm": 5.472865581512451, + "learning_rate": 3.6192956009318903e-06, + "loss": 0.1815, + "step": 47311 + }, + { + "epoch": 3.041698317959848, + "grad_norm": 5.470768451690674, + "learning_rate": 3.619158558311635e-06, + "loss": 0.1725, + "step": 47312 + }, + { + "epoch": 3.041711882799783, + "grad_norm": 9.822364807128906, + "learning_rate": 3.6190215156913807e-06, + "loss": 0.4167, + "step": 47313 + }, + { + "epoch": 3.0417254476397177, + "grad_norm": 4.9619011878967285, + "learning_rate": 3.6188844730711255e-06, + "loss": 0.1685, + "step": 47314 + }, + { + "epoch": 3.0417390124796526, + "grad_norm": 5.1537370681762695, + "learning_rate": 3.6187474304508706e-06, + "loss": 0.1471, + "step": 47315 + }, + { + "epoch": 3.0417525773195875, + "grad_norm": 5.57182502746582, + "learning_rate": 3.6186103878306154e-06, + "loss": 0.1403, + "step": 47316 + }, + { + "epoch": 3.0417661421595223, + "grad_norm": 6.1088762283325195, + "learning_rate": 3.618473345210361e-06, + "loss": 0.2386, + "step": 47317 + }, + { + "epoch": 3.041779706999457, + "grad_norm": 5.483623027801514, + "learning_rate": 3.6183363025901057e-06, + "loss": 0.1871, + "step": 47318 + }, + { + "epoch": 3.041793271839392, + "grad_norm": 4.211436748504639, + "learning_rate": 3.6181992599698505e-06, + "loss": 0.1152, + "step": 47319 + }, + { + "epoch": 3.0418068366793274, + "grad_norm": 4.030902862548828, + "learning_rate": 3.618062217349596e-06, + "loss": 0.1711, + "step": 47320 + }, + { + "epoch": 3.0418204015192623, + "grad_norm": 5.941957950592041, + "learning_rate": 3.617925174729341e-06, + "loss": 0.219, + "step": 47321 + }, + { + "epoch": 3.041833966359197, + "grad_norm": 5.9128546714782715, + "learning_rate": 3.6177881321090865e-06, + "loss": 0.1439, + "step": 47322 + }, + { + "epoch": 3.041847531199132, + "grad_norm": 4.433171272277832, + "learning_rate": 3.6176510894888312e-06, + "loss": 0.1197, + "step": 47323 + }, + { + "epoch": 3.041861096039067, + "grad_norm": 5.532384395599365, + "learning_rate": 3.6175140468685764e-06, + "loss": 0.1709, + "step": 47324 + }, + { + "epoch": 3.0418746608790017, + "grad_norm": 4.9513444900512695, + "learning_rate": 3.6173770042483216e-06, + "loss": 0.0855, + "step": 47325 + }, + { + "epoch": 3.0418882257189366, + "grad_norm": 6.192899703979492, + "learning_rate": 3.6172399616280668e-06, + "loss": 0.1949, + "step": 47326 + }, + { + "epoch": 3.0419017905588714, + "grad_norm": 5.018211364746094, + "learning_rate": 3.6171029190078115e-06, + "loss": 0.1495, + "step": 47327 + }, + { + "epoch": 3.0419153553988063, + "grad_norm": 9.017536163330078, + "learning_rate": 3.616965876387557e-06, + "loss": 0.2532, + "step": 47328 + }, + { + "epoch": 3.041928920238741, + "grad_norm": 3.2806403636932373, + "learning_rate": 3.616828833767302e-06, + "loss": 0.0996, + "step": 47329 + }, + { + "epoch": 3.041942485078676, + "grad_norm": 4.426778316497803, + "learning_rate": 3.6166917911470475e-06, + "loss": 0.157, + "step": 47330 + }, + { + "epoch": 3.041956049918611, + "grad_norm": 5.963333606719971, + "learning_rate": 3.6165547485267922e-06, + "loss": 0.1806, + "step": 47331 + }, + { + "epoch": 3.0419696147585458, + "grad_norm": 6.976807117462158, + "learning_rate": 3.616417705906537e-06, + "loss": 0.2621, + "step": 47332 + }, + { + "epoch": 3.0419831795984806, + "grad_norm": 4.748448371887207, + "learning_rate": 3.616280663286282e-06, + "loss": 0.1842, + "step": 47333 + }, + { + "epoch": 3.0419967444384155, + "grad_norm": 3.8637895584106445, + "learning_rate": 3.6161436206660273e-06, + "loss": 0.1264, + "step": 47334 + }, + { + "epoch": 3.0420103092783504, + "grad_norm": 6.932089328765869, + "learning_rate": 3.6160065780457725e-06, + "loss": 0.2246, + "step": 47335 + }, + { + "epoch": 3.0420238741182852, + "grad_norm": 2.996952772140503, + "learning_rate": 3.6158695354255173e-06, + "loss": 0.0606, + "step": 47336 + }, + { + "epoch": 3.04203743895822, + "grad_norm": 4.830018043518066, + "learning_rate": 3.615732492805263e-06, + "loss": 0.1682, + "step": 47337 + }, + { + "epoch": 3.0420510037981554, + "grad_norm": 4.857123374938965, + "learning_rate": 3.6155954501850076e-06, + "loss": 0.1237, + "step": 47338 + }, + { + "epoch": 3.0420645686380903, + "grad_norm": 5.906186103820801, + "learning_rate": 3.6154584075647532e-06, + "loss": 0.1818, + "step": 47339 + }, + { + "epoch": 3.042078133478025, + "grad_norm": 4.539059162139893, + "learning_rate": 3.615321364944498e-06, + "loss": 0.0843, + "step": 47340 + }, + { + "epoch": 3.04209169831796, + "grad_norm": 4.9210381507873535, + "learning_rate": 3.615184322324243e-06, + "loss": 0.146, + "step": 47341 + }, + { + "epoch": 3.042105263157895, + "grad_norm": 4.046108245849609, + "learning_rate": 3.6150472797039883e-06, + "loss": 0.1019, + "step": 47342 + }, + { + "epoch": 3.0421188279978297, + "grad_norm": 4.280527114868164, + "learning_rate": 3.6149102370837335e-06, + "loss": 0.1474, + "step": 47343 + }, + { + "epoch": 3.0421323928377646, + "grad_norm": 4.41911506652832, + "learning_rate": 3.6147731944634783e-06, + "loss": 0.151, + "step": 47344 + }, + { + "epoch": 3.0421459576776995, + "grad_norm": 4.225762367248535, + "learning_rate": 3.614636151843224e-06, + "loss": 0.1574, + "step": 47345 + }, + { + "epoch": 3.0421595225176343, + "grad_norm": 4.976288795471191, + "learning_rate": 3.6144991092229686e-06, + "loss": 0.1162, + "step": 47346 + }, + { + "epoch": 3.042173087357569, + "grad_norm": 4.980096817016602, + "learning_rate": 3.6143620666027134e-06, + "loss": 0.1492, + "step": 47347 + }, + { + "epoch": 3.042186652197504, + "grad_norm": 3.0080084800720215, + "learning_rate": 3.614225023982459e-06, + "loss": 0.0544, + "step": 47348 + }, + { + "epoch": 3.042200217037439, + "grad_norm": 4.465351104736328, + "learning_rate": 3.6140879813622037e-06, + "loss": 0.1508, + "step": 47349 + }, + { + "epoch": 3.042213781877374, + "grad_norm": 4.338363170623779, + "learning_rate": 3.613950938741949e-06, + "loss": 0.1116, + "step": 47350 + }, + { + "epoch": 3.0422273467173087, + "grad_norm": 4.651838779449463, + "learning_rate": 3.613813896121694e-06, + "loss": 0.126, + "step": 47351 + }, + { + "epoch": 3.0422409115572435, + "grad_norm": 4.147058963775635, + "learning_rate": 3.6136768535014393e-06, + "loss": 0.1381, + "step": 47352 + }, + { + "epoch": 3.0422544763971784, + "grad_norm": 4.647231101989746, + "learning_rate": 3.613539810881184e-06, + "loss": 0.1207, + "step": 47353 + }, + { + "epoch": 3.0422680412371133, + "grad_norm": 4.773993015289307, + "learning_rate": 3.6134027682609296e-06, + "loss": 0.0852, + "step": 47354 + }, + { + "epoch": 3.042281606077048, + "grad_norm": 4.062865734100342, + "learning_rate": 3.6132657256406744e-06, + "loss": 0.1161, + "step": 47355 + }, + { + "epoch": 3.042295170916983, + "grad_norm": 5.9042510986328125, + "learning_rate": 3.61312868302042e-06, + "loss": 0.2783, + "step": 47356 + }, + { + "epoch": 3.042308735756918, + "grad_norm": 4.534495830535889, + "learning_rate": 3.6129916404001648e-06, + "loss": 0.1221, + "step": 47357 + }, + { + "epoch": 3.042322300596853, + "grad_norm": 4.64512300491333, + "learning_rate": 3.61285459777991e-06, + "loss": 0.114, + "step": 47358 + }, + { + "epoch": 3.042335865436788, + "grad_norm": 5.124697208404541, + "learning_rate": 3.612717555159655e-06, + "loss": 0.1297, + "step": 47359 + }, + { + "epoch": 3.042349430276723, + "grad_norm": 4.658596992492676, + "learning_rate": 3.6125805125394e-06, + "loss": 0.1127, + "step": 47360 + }, + { + "epoch": 3.0423629951166578, + "grad_norm": 5.052186965942383, + "learning_rate": 3.612443469919145e-06, + "loss": 0.1428, + "step": 47361 + }, + { + "epoch": 3.0423765599565926, + "grad_norm": 4.343218803405762, + "learning_rate": 3.6123064272988902e-06, + "loss": 0.1049, + "step": 47362 + }, + { + "epoch": 3.0423901247965275, + "grad_norm": 3.707681179046631, + "learning_rate": 3.6121693846786354e-06, + "loss": 0.1244, + "step": 47363 + }, + { + "epoch": 3.0424036896364623, + "grad_norm": 3.5831899642944336, + "learning_rate": 3.61203234205838e-06, + "loss": 0.0862, + "step": 47364 + }, + { + "epoch": 3.042417254476397, + "grad_norm": 4.614787578582764, + "learning_rate": 3.6118952994381258e-06, + "loss": 0.0893, + "step": 47365 + }, + { + "epoch": 3.042430819316332, + "grad_norm": 4.959485054016113, + "learning_rate": 3.6117582568178705e-06, + "loss": 0.1511, + "step": 47366 + }, + { + "epoch": 3.042444384156267, + "grad_norm": 4.780080318450928, + "learning_rate": 3.611621214197616e-06, + "loss": 0.1269, + "step": 47367 + }, + { + "epoch": 3.042457948996202, + "grad_norm": 4.562402725219727, + "learning_rate": 3.611484171577361e-06, + "loss": 0.1008, + "step": 47368 + }, + { + "epoch": 3.0424715138361367, + "grad_norm": 5.044215679168701, + "learning_rate": 3.611347128957106e-06, + "loss": 0.1117, + "step": 47369 + }, + { + "epoch": 3.0424850786760715, + "grad_norm": 4.715810775756836, + "learning_rate": 3.611210086336851e-06, + "loss": 0.0941, + "step": 47370 + }, + { + "epoch": 3.0424986435160064, + "grad_norm": 4.497865676879883, + "learning_rate": 3.6110730437165964e-06, + "loss": 0.081, + "step": 47371 + }, + { + "epoch": 3.0425122083559413, + "grad_norm": 4.265623092651367, + "learning_rate": 3.610936001096341e-06, + "loss": 0.1512, + "step": 47372 + }, + { + "epoch": 3.042525773195876, + "grad_norm": 3.4620306491851807, + "learning_rate": 3.6107989584760868e-06, + "loss": 0.1411, + "step": 47373 + }, + { + "epoch": 3.042539338035811, + "grad_norm": 6.4129462242126465, + "learning_rate": 3.6106619158558315e-06, + "loss": 0.1519, + "step": 47374 + }, + { + "epoch": 3.042552902875746, + "grad_norm": 4.6892523765563965, + "learning_rate": 3.6105248732355763e-06, + "loss": 0.12, + "step": 47375 + }, + { + "epoch": 3.042566467715681, + "grad_norm": 3.0806093215942383, + "learning_rate": 3.610387830615322e-06, + "loss": 0.0757, + "step": 47376 + }, + { + "epoch": 3.042580032555616, + "grad_norm": 3.376471757888794, + "learning_rate": 3.6102507879950666e-06, + "loss": 0.0851, + "step": 47377 + }, + { + "epoch": 3.042593597395551, + "grad_norm": 3.174032688140869, + "learning_rate": 3.610113745374812e-06, + "loss": 0.0794, + "step": 47378 + }, + { + "epoch": 3.0426071622354858, + "grad_norm": 5.244276523590088, + "learning_rate": 3.609976702754557e-06, + "loss": 0.1288, + "step": 47379 + }, + { + "epoch": 3.0426207270754206, + "grad_norm": 3.626485586166382, + "learning_rate": 3.609839660134302e-06, + "loss": 0.0917, + "step": 47380 + }, + { + "epoch": 3.0426342919153555, + "grad_norm": 3.4375288486480713, + "learning_rate": 3.609702617514047e-06, + "loss": 0.1, + "step": 47381 + }, + { + "epoch": 3.0426478567552904, + "grad_norm": 3.3058478832244873, + "learning_rate": 3.6095655748937925e-06, + "loss": 0.0634, + "step": 47382 + }, + { + "epoch": 3.0426614215952252, + "grad_norm": 3.7379324436187744, + "learning_rate": 3.6094285322735373e-06, + "loss": 0.0668, + "step": 47383 + }, + { + "epoch": 3.04267498643516, + "grad_norm": 3.4340529441833496, + "learning_rate": 3.609291489653283e-06, + "loss": 0.0685, + "step": 47384 + }, + { + "epoch": 3.042688551275095, + "grad_norm": 4.305068492889404, + "learning_rate": 3.6091544470330276e-06, + "loss": 0.0875, + "step": 47385 + }, + { + "epoch": 3.04270211611503, + "grad_norm": 2.9203786849975586, + "learning_rate": 3.609017404412773e-06, + "loss": 0.0717, + "step": 47386 + }, + { + "epoch": 3.0427156809549647, + "grad_norm": 4.3998823165893555, + "learning_rate": 3.6088803617925176e-06, + "loss": 0.0916, + "step": 47387 + }, + { + "epoch": 3.0427292457948996, + "grad_norm": 4.98665714263916, + "learning_rate": 3.6087433191722628e-06, + "loss": 0.0899, + "step": 47388 + }, + { + "epoch": 3.0427428106348344, + "grad_norm": 3.4767885208129883, + "learning_rate": 3.608606276552008e-06, + "loss": 0.0657, + "step": 47389 + }, + { + "epoch": 3.0427563754747693, + "grad_norm": 3.1927037239074707, + "learning_rate": 3.6084692339317527e-06, + "loss": 0.0908, + "step": 47390 + }, + { + "epoch": 3.042769940314704, + "grad_norm": 5.02715539932251, + "learning_rate": 3.6083321913114983e-06, + "loss": 0.1074, + "step": 47391 + }, + { + "epoch": 3.042783505154639, + "grad_norm": 3.0947537422180176, + "learning_rate": 3.608195148691243e-06, + "loss": 0.0755, + "step": 47392 + }, + { + "epoch": 3.042797069994574, + "grad_norm": 5.824127197265625, + "learning_rate": 3.6080581060709887e-06, + "loss": 0.1238, + "step": 47393 + }, + { + "epoch": 3.0428106348345088, + "grad_norm": 3.9654202461242676, + "learning_rate": 3.6079210634507334e-06, + "loss": 0.0894, + "step": 47394 + }, + { + "epoch": 3.0428241996744436, + "grad_norm": 5.347305774688721, + "learning_rate": 3.6077840208304786e-06, + "loss": 0.1869, + "step": 47395 + }, + { + "epoch": 3.042837764514379, + "grad_norm": 3.3181381225585938, + "learning_rate": 3.6076469782102238e-06, + "loss": 0.0667, + "step": 47396 + }, + { + "epoch": 3.042851329354314, + "grad_norm": 4.103802680969238, + "learning_rate": 3.607509935589969e-06, + "loss": 0.1082, + "step": 47397 + }, + { + "epoch": 3.0428648941942487, + "grad_norm": 4.2779927253723145, + "learning_rate": 3.6073728929697137e-06, + "loss": 0.1282, + "step": 47398 + }, + { + "epoch": 3.0428784590341835, + "grad_norm": 4.968666076660156, + "learning_rate": 3.6072358503494593e-06, + "loss": 0.098, + "step": 47399 + }, + { + "epoch": 3.0428920238741184, + "grad_norm": 3.4507222175598145, + "learning_rate": 3.607098807729204e-06, + "loss": 0.077, + "step": 47400 + }, + { + "epoch": 3.0429055887140533, + "grad_norm": 1.7235664129257202, + "learning_rate": 3.606961765108949e-06, + "loss": 0.0485, + "step": 47401 + }, + { + "epoch": 3.042919153553988, + "grad_norm": 3.438298225402832, + "learning_rate": 3.6068247224886944e-06, + "loss": 0.1108, + "step": 47402 + }, + { + "epoch": 3.042932718393923, + "grad_norm": 2.6884591579437256, + "learning_rate": 3.606687679868439e-06, + "loss": 0.0605, + "step": 47403 + }, + { + "epoch": 3.042946283233858, + "grad_norm": 3.4331037998199463, + "learning_rate": 3.6065506372481844e-06, + "loss": 0.082, + "step": 47404 + }, + { + "epoch": 3.0429598480737927, + "grad_norm": 2.8479204177856445, + "learning_rate": 3.6064135946279295e-06, + "loss": 0.098, + "step": 47405 + }, + { + "epoch": 3.0429734129137276, + "grad_norm": 3.6191537380218506, + "learning_rate": 3.6062765520076747e-06, + "loss": 0.0796, + "step": 47406 + }, + { + "epoch": 3.0429869777536624, + "grad_norm": 3.533633232116699, + "learning_rate": 3.6061395093874195e-06, + "loss": 0.0985, + "step": 47407 + }, + { + "epoch": 3.0430005425935973, + "grad_norm": 5.066773414611816, + "learning_rate": 3.606002466767165e-06, + "loss": 0.1029, + "step": 47408 + }, + { + "epoch": 3.043014107433532, + "grad_norm": 3.152832508087158, + "learning_rate": 3.60586542414691e-06, + "loss": 0.0527, + "step": 47409 + }, + { + "epoch": 3.043027672273467, + "grad_norm": 4.0932936668396, + "learning_rate": 3.6057283815266554e-06, + "loss": 0.1041, + "step": 47410 + }, + { + "epoch": 3.043041237113402, + "grad_norm": 4.005271911621094, + "learning_rate": 3.6055913389064e-06, + "loss": 0.1108, + "step": 47411 + }, + { + "epoch": 3.0430548019533368, + "grad_norm": 2.1595499515533447, + "learning_rate": 3.6054542962861454e-06, + "loss": 0.0523, + "step": 47412 + }, + { + "epoch": 3.0430683667932716, + "grad_norm": 3.2881500720977783, + "learning_rate": 3.6053172536658905e-06, + "loss": 0.0775, + "step": 47413 + }, + { + "epoch": 3.043081931633207, + "grad_norm": 5.065530300140381, + "learning_rate": 3.6051802110456357e-06, + "loss": 0.1619, + "step": 47414 + }, + { + "epoch": 3.043095496473142, + "grad_norm": 5.079180717468262, + "learning_rate": 3.6050431684253805e-06, + "loss": 0.1417, + "step": 47415 + }, + { + "epoch": 3.0431090613130767, + "grad_norm": 3.8306875228881836, + "learning_rate": 3.6049061258051257e-06, + "loss": 0.1054, + "step": 47416 + }, + { + "epoch": 3.0431226261530115, + "grad_norm": 3.4840152263641357, + "learning_rate": 3.604769083184871e-06, + "loss": 0.1052, + "step": 47417 + }, + { + "epoch": 3.0431361909929464, + "grad_norm": 4.196961879730225, + "learning_rate": 3.6046320405646156e-06, + "loss": 0.0951, + "step": 47418 + }, + { + "epoch": 3.0431497558328813, + "grad_norm": 4.595775127410889, + "learning_rate": 3.604494997944361e-06, + "loss": 0.1602, + "step": 47419 + }, + { + "epoch": 3.043163320672816, + "grad_norm": 3.7880094051361084, + "learning_rate": 3.604357955324106e-06, + "loss": 0.1001, + "step": 47420 + }, + { + "epoch": 3.043176885512751, + "grad_norm": 3.6915841102600098, + "learning_rate": 3.604220912703851e-06, + "loss": 0.1283, + "step": 47421 + }, + { + "epoch": 3.043190450352686, + "grad_norm": 4.558861255645752, + "learning_rate": 3.6040838700835963e-06, + "loss": 0.131, + "step": 47422 + }, + { + "epoch": 3.0432040151926207, + "grad_norm": 3.482940196990967, + "learning_rate": 3.6039468274633415e-06, + "loss": 0.1097, + "step": 47423 + }, + { + "epoch": 3.0432175800325556, + "grad_norm": 2.942871332168579, + "learning_rate": 3.6038097848430862e-06, + "loss": 0.1126, + "step": 47424 + }, + { + "epoch": 3.0432311448724905, + "grad_norm": 6.737363338470459, + "learning_rate": 3.603672742222832e-06, + "loss": 0.1979, + "step": 47425 + }, + { + "epoch": 3.0432447097124253, + "grad_norm": 4.787877082824707, + "learning_rate": 3.6035356996025766e-06, + "loss": 0.1466, + "step": 47426 + }, + { + "epoch": 3.04325827455236, + "grad_norm": 5.000993251800537, + "learning_rate": 3.603398656982322e-06, + "loss": 0.1548, + "step": 47427 + }, + { + "epoch": 3.043271839392295, + "grad_norm": 4.999753952026367, + "learning_rate": 3.603261614362067e-06, + "loss": 0.1028, + "step": 47428 + }, + { + "epoch": 3.04328540423223, + "grad_norm": 4.889072895050049, + "learning_rate": 3.6031245717418117e-06, + "loss": 0.1511, + "step": 47429 + }, + { + "epoch": 3.043298969072165, + "grad_norm": 3.25512433052063, + "learning_rate": 3.6029875291215573e-06, + "loss": 0.1217, + "step": 47430 + }, + { + "epoch": 3.0433125339120997, + "grad_norm": 3.381027936935425, + "learning_rate": 3.602850486501302e-06, + "loss": 0.1155, + "step": 47431 + }, + { + "epoch": 3.0433260987520345, + "grad_norm": 4.197052478790283, + "learning_rate": 3.6027134438810472e-06, + "loss": 0.1483, + "step": 47432 + }, + { + "epoch": 3.0433396635919694, + "grad_norm": 3.62618088722229, + "learning_rate": 3.6025764012607924e-06, + "loss": 0.1813, + "step": 47433 + }, + { + "epoch": 3.0433532284319047, + "grad_norm": 3.312906503677368, + "learning_rate": 3.6024393586405376e-06, + "loss": 0.0788, + "step": 47434 + }, + { + "epoch": 3.0433667932718396, + "grad_norm": 4.762977600097656, + "learning_rate": 3.6023023160202824e-06, + "loss": 0.1715, + "step": 47435 + }, + { + "epoch": 3.0433803581117744, + "grad_norm": 3.1337430477142334, + "learning_rate": 3.602165273400028e-06, + "loss": 0.0881, + "step": 47436 + }, + { + "epoch": 3.0433939229517093, + "grad_norm": 5.613866329193115, + "learning_rate": 3.6020282307797727e-06, + "loss": 0.2046, + "step": 47437 + }, + { + "epoch": 3.043407487791644, + "grad_norm": 4.594252109527588, + "learning_rate": 3.601891188159518e-06, + "loss": 0.1708, + "step": 47438 + }, + { + "epoch": 3.043421052631579, + "grad_norm": 4.423647403717041, + "learning_rate": 3.601754145539263e-06, + "loss": 0.1274, + "step": 47439 + }, + { + "epoch": 3.043434617471514, + "grad_norm": 4.217621326446533, + "learning_rate": 3.6016171029190083e-06, + "loss": 0.1064, + "step": 47440 + }, + { + "epoch": 3.0434481823114488, + "grad_norm": 3.466920852661133, + "learning_rate": 3.601480060298753e-06, + "loss": 0.0901, + "step": 47441 + }, + { + "epoch": 3.0434617471513836, + "grad_norm": 5.33099889755249, + "learning_rate": 3.601343017678498e-06, + "loss": 0.1738, + "step": 47442 + }, + { + "epoch": 3.0434753119913185, + "grad_norm": 3.8321971893310547, + "learning_rate": 3.6012059750582434e-06, + "loss": 0.0726, + "step": 47443 + }, + { + "epoch": 3.0434888768312534, + "grad_norm": 3.8582067489624023, + "learning_rate": 3.601068932437988e-06, + "loss": 0.192, + "step": 47444 + }, + { + "epoch": 3.043502441671188, + "grad_norm": 4.251525402069092, + "learning_rate": 3.6009318898177337e-06, + "loss": 0.1388, + "step": 47445 + }, + { + "epoch": 3.043516006511123, + "grad_norm": 3.202768564224243, + "learning_rate": 3.6007948471974785e-06, + "loss": 0.1067, + "step": 47446 + }, + { + "epoch": 3.043529571351058, + "grad_norm": 3.26031494140625, + "learning_rate": 3.600657804577224e-06, + "loss": 0.1063, + "step": 47447 + }, + { + "epoch": 3.043543136190993, + "grad_norm": 4.544322967529297, + "learning_rate": 3.600520761956969e-06, + "loss": 0.2256, + "step": 47448 + }, + { + "epoch": 3.0435567010309277, + "grad_norm": 4.161062240600586, + "learning_rate": 3.600383719336714e-06, + "loss": 0.184, + "step": 47449 + }, + { + "epoch": 3.0435702658708625, + "grad_norm": 4.062180519104004, + "learning_rate": 3.600246676716459e-06, + "loss": 0.1343, + "step": 47450 + }, + { + "epoch": 3.0435838307107974, + "grad_norm": 3.6467394828796387, + "learning_rate": 3.6001096340962044e-06, + "loss": 0.0985, + "step": 47451 + }, + { + "epoch": 3.0435973955507327, + "grad_norm": 4.229243278503418, + "learning_rate": 3.599972591475949e-06, + "loss": 0.1201, + "step": 47452 + }, + { + "epoch": 3.0436109603906676, + "grad_norm": 4.539399147033691, + "learning_rate": 3.5998355488556947e-06, + "loss": 0.1148, + "step": 47453 + }, + { + "epoch": 3.0436245252306025, + "grad_norm": 2.999044179916382, + "learning_rate": 3.5996985062354395e-06, + "loss": 0.0831, + "step": 47454 + }, + { + "epoch": 3.0436380900705373, + "grad_norm": 3.7518301010131836, + "learning_rate": 3.599561463615185e-06, + "loss": 0.1236, + "step": 47455 + }, + { + "epoch": 3.043651654910472, + "grad_norm": 2.8966639041900635, + "learning_rate": 3.59942442099493e-06, + "loss": 0.0908, + "step": 47456 + }, + { + "epoch": 3.043665219750407, + "grad_norm": 3.1962504386901855, + "learning_rate": 3.5992873783746746e-06, + "loss": 0.0723, + "step": 47457 + }, + { + "epoch": 3.043678784590342, + "grad_norm": 2.980616569519043, + "learning_rate": 3.5991503357544198e-06, + "loss": 0.1018, + "step": 47458 + }, + { + "epoch": 3.0436923494302768, + "grad_norm": 3.030656099319458, + "learning_rate": 3.599013293134165e-06, + "loss": 0.1203, + "step": 47459 + }, + { + "epoch": 3.0437059142702116, + "grad_norm": 5.141095161437988, + "learning_rate": 3.59887625051391e-06, + "loss": 0.1947, + "step": 47460 + }, + { + "epoch": 3.0437194791101465, + "grad_norm": 4.095332622528076, + "learning_rate": 3.598739207893655e-06, + "loss": 0.092, + "step": 47461 + }, + { + "epoch": 3.0437330439500814, + "grad_norm": 3.669557571411133, + "learning_rate": 3.5986021652734005e-06, + "loss": 0.0841, + "step": 47462 + }, + { + "epoch": 3.0437466087900162, + "grad_norm": 4.15123987197876, + "learning_rate": 3.5984651226531452e-06, + "loss": 0.1448, + "step": 47463 + }, + { + "epoch": 3.043760173629951, + "grad_norm": 5.044987201690674, + "learning_rate": 3.598328080032891e-06, + "loss": 0.0946, + "step": 47464 + }, + { + "epoch": 3.043773738469886, + "grad_norm": 4.824219703674316, + "learning_rate": 3.5981910374126356e-06, + "loss": 0.0929, + "step": 47465 + }, + { + "epoch": 3.043787303309821, + "grad_norm": 3.4730703830718994, + "learning_rate": 3.5980539947923808e-06, + "loss": 0.0718, + "step": 47466 + }, + { + "epoch": 3.0438008681497557, + "grad_norm": 5.783492565155029, + "learning_rate": 3.597916952172126e-06, + "loss": 0.1199, + "step": 47467 + }, + { + "epoch": 3.0438144329896906, + "grad_norm": 5.786564350128174, + "learning_rate": 3.597779909551871e-06, + "loss": 0.1242, + "step": 47468 + }, + { + "epoch": 3.0438279978296254, + "grad_norm": 3.7372934818267822, + "learning_rate": 3.597642866931616e-06, + "loss": 0.1096, + "step": 47469 + }, + { + "epoch": 3.0438415626695603, + "grad_norm": 3.6802773475646973, + "learning_rate": 3.5975058243113607e-06, + "loss": 0.0764, + "step": 47470 + }, + { + "epoch": 3.0438551275094956, + "grad_norm": 3.653822660446167, + "learning_rate": 3.5973687816911063e-06, + "loss": 0.0521, + "step": 47471 + }, + { + "epoch": 3.0438686923494305, + "grad_norm": 3.364488124847412, + "learning_rate": 3.597231739070851e-06, + "loss": 0.0745, + "step": 47472 + }, + { + "epoch": 3.0438822571893653, + "grad_norm": 3.4540932178497314, + "learning_rate": 3.5970946964505966e-06, + "loss": 0.0563, + "step": 47473 + }, + { + "epoch": 3.0438958220293, + "grad_norm": 3.2270147800445557, + "learning_rate": 3.5969576538303414e-06, + "loss": 0.0692, + "step": 47474 + }, + { + "epoch": 3.043909386869235, + "grad_norm": 3.8704230785369873, + "learning_rate": 3.5968206112100865e-06, + "loss": 0.1575, + "step": 47475 + }, + { + "epoch": 3.04392295170917, + "grad_norm": 4.961657524108887, + "learning_rate": 3.5966835685898317e-06, + "loss": 0.1657, + "step": 47476 + }, + { + "epoch": 3.043936516549105, + "grad_norm": 4.25325870513916, + "learning_rate": 3.596546525969577e-06, + "loss": 0.1283, + "step": 47477 + }, + { + "epoch": 3.0439500813890397, + "grad_norm": 3.9993951320648193, + "learning_rate": 3.5964094833493217e-06, + "loss": 0.1562, + "step": 47478 + }, + { + "epoch": 3.0439636462289745, + "grad_norm": 3.7343828678131104, + "learning_rate": 3.5962724407290673e-06, + "loss": 0.088, + "step": 47479 + }, + { + "epoch": 3.0439772110689094, + "grad_norm": 2.4523708820343018, + "learning_rate": 3.596135398108812e-06, + "loss": 0.0539, + "step": 47480 + }, + { + "epoch": 3.0439907759088443, + "grad_norm": 4.766554832458496, + "learning_rate": 3.5959983554885576e-06, + "loss": 0.1155, + "step": 47481 + }, + { + "epoch": 3.044004340748779, + "grad_norm": 3.7971701622009277, + "learning_rate": 3.5958613128683024e-06, + "loss": 0.1074, + "step": 47482 + }, + { + "epoch": 3.044017905588714, + "grad_norm": 3.003695011138916, + "learning_rate": 3.5957242702480476e-06, + "loss": 0.0755, + "step": 47483 + }, + { + "epoch": 3.044031470428649, + "grad_norm": 4.0389251708984375, + "learning_rate": 3.5955872276277927e-06, + "loss": 0.0791, + "step": 47484 + }, + { + "epoch": 3.0440450352685837, + "grad_norm": 5.215526103973389, + "learning_rate": 3.5954501850075375e-06, + "loss": 0.1997, + "step": 47485 + }, + { + "epoch": 3.0440586001085186, + "grad_norm": 3.6551215648651123, + "learning_rate": 3.5953131423872827e-06, + "loss": 0.095, + "step": 47486 + }, + { + "epoch": 3.0440721649484535, + "grad_norm": 3.9349424839019775, + "learning_rate": 3.595176099767028e-06, + "loss": 0.0898, + "step": 47487 + }, + { + "epoch": 3.0440857297883883, + "grad_norm": 2.9790570735931396, + "learning_rate": 3.595039057146773e-06, + "loss": 0.0585, + "step": 47488 + }, + { + "epoch": 3.044099294628323, + "grad_norm": 3.8848960399627686, + "learning_rate": 3.5949020145265178e-06, + "loss": 0.0995, + "step": 47489 + }, + { + "epoch": 3.0441128594682585, + "grad_norm": 5.15410852432251, + "learning_rate": 3.5947649719062634e-06, + "loss": 0.1209, + "step": 47490 + }, + { + "epoch": 3.0441264243081934, + "grad_norm": 3.51943039894104, + "learning_rate": 3.594627929286008e-06, + "loss": 0.1087, + "step": 47491 + }, + { + "epoch": 3.0441399891481282, + "grad_norm": 3.713371753692627, + "learning_rate": 3.5944908866657533e-06, + "loss": 0.0996, + "step": 47492 + }, + { + "epoch": 3.044153553988063, + "grad_norm": 3.2748377323150635, + "learning_rate": 3.5943538440454985e-06, + "loss": 0.0824, + "step": 47493 + }, + { + "epoch": 3.044167118827998, + "grad_norm": 4.344156742095947, + "learning_rate": 3.5942168014252437e-06, + "loss": 0.1309, + "step": 47494 + }, + { + "epoch": 3.044180683667933, + "grad_norm": 3.781665086746216, + "learning_rate": 3.5940797588049884e-06, + "loss": 0.0813, + "step": 47495 + }, + { + "epoch": 3.0441942485078677, + "grad_norm": 2.9157726764678955, + "learning_rate": 3.593942716184734e-06, + "loss": 0.0457, + "step": 47496 + }, + { + "epoch": 3.0442078133478025, + "grad_norm": 4.77657413482666, + "learning_rate": 3.593805673564479e-06, + "loss": 0.1418, + "step": 47497 + }, + { + "epoch": 3.0442213781877374, + "grad_norm": 3.3420372009277344, + "learning_rate": 3.5936686309442235e-06, + "loss": 0.1102, + "step": 47498 + }, + { + "epoch": 3.0442349430276723, + "grad_norm": 3.6927175521850586, + "learning_rate": 3.593531588323969e-06, + "loss": 0.1104, + "step": 47499 + }, + { + "epoch": 3.044248507867607, + "grad_norm": 4.230146408081055, + "learning_rate": 3.593394545703714e-06, + "loss": 0.1005, + "step": 47500 + }, + { + "epoch": 3.044262072707542, + "grad_norm": 2.6409823894500732, + "learning_rate": 3.5932575030834595e-06, + "loss": 0.0392, + "step": 47501 + }, + { + "epoch": 3.044275637547477, + "grad_norm": 6.162597179412842, + "learning_rate": 3.5931204604632043e-06, + "loss": 0.1894, + "step": 47502 + }, + { + "epoch": 3.0442892023874117, + "grad_norm": 3.6565496921539307, + "learning_rate": 3.5929834178429494e-06, + "loss": 0.1138, + "step": 47503 + }, + { + "epoch": 3.0443027672273466, + "grad_norm": 4.150543689727783, + "learning_rate": 3.5928463752226946e-06, + "loss": 0.1467, + "step": 47504 + }, + { + "epoch": 3.0443163320672815, + "grad_norm": 3.325110912322998, + "learning_rate": 3.59270933260244e-06, + "loss": 0.0753, + "step": 47505 + }, + { + "epoch": 3.0443298969072163, + "grad_norm": 4.556614875793457, + "learning_rate": 3.5925722899821846e-06, + "loss": 0.0736, + "step": 47506 + }, + { + "epoch": 3.044343461747151, + "grad_norm": 4.299317359924316, + "learning_rate": 3.59243524736193e-06, + "loss": 0.1844, + "step": 47507 + }, + { + "epoch": 3.044357026587086, + "grad_norm": 4.559935569763184, + "learning_rate": 3.592298204741675e-06, + "loss": 0.1144, + "step": 47508 + }, + { + "epoch": 3.0443705914270214, + "grad_norm": 3.6267693042755127, + "learning_rate": 3.59216116212142e-06, + "loss": 0.1275, + "step": 47509 + }, + { + "epoch": 3.0443841562669562, + "grad_norm": 3.4088332653045654, + "learning_rate": 3.5920241195011653e-06, + "loss": 0.0594, + "step": 47510 + }, + { + "epoch": 3.044397721106891, + "grad_norm": 5.411903381347656, + "learning_rate": 3.59188707688091e-06, + "loss": 0.1034, + "step": 47511 + }, + { + "epoch": 3.044411285946826, + "grad_norm": 4.8065690994262695, + "learning_rate": 3.591750034260655e-06, + "loss": 0.1006, + "step": 47512 + }, + { + "epoch": 3.044424850786761, + "grad_norm": 4.407468795776367, + "learning_rate": 3.5916129916404004e-06, + "loss": 0.1163, + "step": 47513 + }, + { + "epoch": 3.0444384156266957, + "grad_norm": 5.273415565490723, + "learning_rate": 3.5914759490201456e-06, + "loss": 0.1513, + "step": 47514 + }, + { + "epoch": 3.0444519804666306, + "grad_norm": 5.069380283355713, + "learning_rate": 3.5913389063998903e-06, + "loss": 0.1315, + "step": 47515 + }, + { + "epoch": 3.0444655453065654, + "grad_norm": 4.518784046173096, + "learning_rate": 3.591201863779636e-06, + "loss": 0.2062, + "step": 47516 + }, + { + "epoch": 3.0444791101465003, + "grad_norm": 5.811708927154541, + "learning_rate": 3.5910648211593807e-06, + "loss": 0.1289, + "step": 47517 + }, + { + "epoch": 3.044492674986435, + "grad_norm": 4.136709690093994, + "learning_rate": 3.5909277785391263e-06, + "loss": 0.1024, + "step": 47518 + }, + { + "epoch": 3.04450623982637, + "grad_norm": 4.774452209472656, + "learning_rate": 3.590790735918871e-06, + "loss": 0.1116, + "step": 47519 + }, + { + "epoch": 3.044519804666305, + "grad_norm": 3.6828572750091553, + "learning_rate": 3.590653693298616e-06, + "loss": 0.0691, + "step": 47520 + }, + { + "epoch": 3.0445333695062398, + "grad_norm": 5.223153591156006, + "learning_rate": 3.5905166506783614e-06, + "loss": 0.1163, + "step": 47521 + }, + { + "epoch": 3.0445469343461746, + "grad_norm": 3.5503733158111572, + "learning_rate": 3.5903796080581066e-06, + "loss": 0.095, + "step": 47522 + }, + { + "epoch": 3.0445604991861095, + "grad_norm": 5.5900421142578125, + "learning_rate": 3.5902425654378513e-06, + "loss": 0.1713, + "step": 47523 + }, + { + "epoch": 3.0445740640260444, + "grad_norm": 5.265277862548828, + "learning_rate": 3.590105522817597e-06, + "loss": 0.0915, + "step": 47524 + }, + { + "epoch": 3.0445876288659792, + "grad_norm": 5.14742374420166, + "learning_rate": 3.5899684801973417e-06, + "loss": 0.1287, + "step": 47525 + }, + { + "epoch": 3.044601193705914, + "grad_norm": 4.640995502471924, + "learning_rate": 3.5898314375770864e-06, + "loss": 0.106, + "step": 47526 + }, + { + "epoch": 3.044614758545849, + "grad_norm": 4.458736896514893, + "learning_rate": 3.589694394956832e-06, + "loss": 0.157, + "step": 47527 + }, + { + "epoch": 3.0446283233857843, + "grad_norm": 3.967031478881836, + "learning_rate": 3.589557352336577e-06, + "loss": 0.0541, + "step": 47528 + }, + { + "epoch": 3.044641888225719, + "grad_norm": 5.60711145401001, + "learning_rate": 3.589420309716322e-06, + "loss": 0.163, + "step": 47529 + }, + { + "epoch": 3.044655453065654, + "grad_norm": 7.458782196044922, + "learning_rate": 3.589283267096067e-06, + "loss": 0.2304, + "step": 47530 + }, + { + "epoch": 3.044669017905589, + "grad_norm": 6.686065673828125, + "learning_rate": 3.5891462244758123e-06, + "loss": 0.1208, + "step": 47531 + }, + { + "epoch": 3.0446825827455237, + "grad_norm": 5.61234188079834, + "learning_rate": 3.589009181855557e-06, + "loss": 0.1103, + "step": 47532 + }, + { + "epoch": 3.0446961475854586, + "grad_norm": 5.04072904586792, + "learning_rate": 3.5888721392353027e-06, + "loss": 0.0985, + "step": 47533 + }, + { + "epoch": 3.0447097124253935, + "grad_norm": 5.8451361656188965, + "learning_rate": 3.5887350966150474e-06, + "loss": 0.1529, + "step": 47534 + }, + { + "epoch": 3.0447232772653283, + "grad_norm": 5.622317314147949, + "learning_rate": 3.588598053994793e-06, + "loss": 0.1114, + "step": 47535 + }, + { + "epoch": 3.044736842105263, + "grad_norm": 3.5366790294647217, + "learning_rate": 3.588461011374538e-06, + "loss": 0.0982, + "step": 47536 + }, + { + "epoch": 3.044750406945198, + "grad_norm": 5.407096862792969, + "learning_rate": 3.588323968754283e-06, + "loss": 0.142, + "step": 47537 + }, + { + "epoch": 3.044763971785133, + "grad_norm": 6.49893045425415, + "learning_rate": 3.588186926134028e-06, + "loss": 0.1574, + "step": 47538 + }, + { + "epoch": 3.044777536625068, + "grad_norm": 4.164093494415283, + "learning_rate": 3.588049883513773e-06, + "loss": 0.1117, + "step": 47539 + }, + { + "epoch": 3.0447911014650026, + "grad_norm": 5.068321704864502, + "learning_rate": 3.587912840893518e-06, + "loss": 0.167, + "step": 47540 + }, + { + "epoch": 3.0448046663049375, + "grad_norm": 6.198688507080078, + "learning_rate": 3.587775798273263e-06, + "loss": 0.1243, + "step": 47541 + }, + { + "epoch": 3.0448182311448724, + "grad_norm": 3.6959946155548096, + "learning_rate": 3.5876387556530085e-06, + "loss": 0.1015, + "step": 47542 + }, + { + "epoch": 3.0448317959848072, + "grad_norm": 4.055473327636719, + "learning_rate": 3.587501713032753e-06, + "loss": 0.1003, + "step": 47543 + }, + { + "epoch": 3.044845360824742, + "grad_norm": 3.3913214206695557, + "learning_rate": 3.587364670412499e-06, + "loss": 0.1126, + "step": 47544 + }, + { + "epoch": 3.044858925664677, + "grad_norm": 7.799496650695801, + "learning_rate": 3.5872276277922436e-06, + "loss": 0.2058, + "step": 47545 + }, + { + "epoch": 3.044872490504612, + "grad_norm": 5.225625991821289, + "learning_rate": 3.5870905851719887e-06, + "loss": 0.1084, + "step": 47546 + }, + { + "epoch": 3.044886055344547, + "grad_norm": 2.8089752197265625, + "learning_rate": 3.586953542551734e-06, + "loss": 0.071, + "step": 47547 + }, + { + "epoch": 3.044899620184482, + "grad_norm": 4.310281276702881, + "learning_rate": 3.586816499931479e-06, + "loss": 0.08, + "step": 47548 + }, + { + "epoch": 3.044913185024417, + "grad_norm": 4.4029436111450195, + "learning_rate": 3.586679457311224e-06, + "loss": 0.1283, + "step": 47549 + }, + { + "epoch": 3.0449267498643517, + "grad_norm": 5.164461135864258, + "learning_rate": 3.5865424146909695e-06, + "loss": 0.1659, + "step": 47550 + }, + { + "epoch": 3.0449403147042866, + "grad_norm": 4.58919095993042, + "learning_rate": 3.5864053720707142e-06, + "loss": 0.1109, + "step": 47551 + }, + { + "epoch": 3.0449538795442215, + "grad_norm": 5.5345587730407715, + "learning_rate": 3.58626832945046e-06, + "loss": 0.1865, + "step": 47552 + }, + { + "epoch": 3.0449674443841563, + "grad_norm": 5.730001926422119, + "learning_rate": 3.5861312868302046e-06, + "loss": 0.1767, + "step": 47553 + }, + { + "epoch": 3.044981009224091, + "grad_norm": 5.138589859008789, + "learning_rate": 3.5859942442099493e-06, + "loss": 0.1053, + "step": 47554 + }, + { + "epoch": 3.044994574064026, + "grad_norm": 4.546421527862549, + "learning_rate": 3.585857201589695e-06, + "loss": 0.1068, + "step": 47555 + }, + { + "epoch": 3.045008138903961, + "grad_norm": 4.1820502281188965, + "learning_rate": 3.5857201589694397e-06, + "loss": 0.1037, + "step": 47556 + }, + { + "epoch": 3.045021703743896, + "grad_norm": 5.849576950073242, + "learning_rate": 3.585583116349185e-06, + "loss": 0.2628, + "step": 47557 + }, + { + "epoch": 3.0450352685838307, + "grad_norm": 4.961144924163818, + "learning_rate": 3.58544607372893e-06, + "loss": 0.202, + "step": 47558 + }, + { + "epoch": 3.0450488334237655, + "grad_norm": 4.047219753265381, + "learning_rate": 3.5853090311086752e-06, + "loss": 0.2095, + "step": 47559 + }, + { + "epoch": 3.0450623982637004, + "grad_norm": 5.16314172744751, + "learning_rate": 3.58517198848842e-06, + "loss": 0.1604, + "step": 47560 + }, + { + "epoch": 3.0450759631036353, + "grad_norm": 4.656130313873291, + "learning_rate": 3.5850349458681656e-06, + "loss": 0.1101, + "step": 47561 + }, + { + "epoch": 3.04508952794357, + "grad_norm": 4.989808559417725, + "learning_rate": 3.5848979032479103e-06, + "loss": 0.1238, + "step": 47562 + }, + { + "epoch": 3.045103092783505, + "grad_norm": 4.903071880340576, + "learning_rate": 3.5847608606276555e-06, + "loss": 0.1667, + "step": 47563 + }, + { + "epoch": 3.04511665762344, + "grad_norm": 5.548062801361084, + "learning_rate": 3.5846238180074007e-06, + "loss": 0.2295, + "step": 47564 + }, + { + "epoch": 3.0451302224633747, + "grad_norm": 4.015807628631592, + "learning_rate": 3.584486775387146e-06, + "loss": 0.205, + "step": 47565 + }, + { + "epoch": 3.04514378730331, + "grad_norm": 4.82930850982666, + "learning_rate": 3.5843497327668906e-06, + "loss": 0.229, + "step": 47566 + }, + { + "epoch": 3.045157352143245, + "grad_norm": 3.4011385440826416, + "learning_rate": 3.584212690146636e-06, + "loss": 0.0782, + "step": 47567 + }, + { + "epoch": 3.0451709169831798, + "grad_norm": 4.951098442077637, + "learning_rate": 3.584075647526381e-06, + "loss": 0.1181, + "step": 47568 + }, + { + "epoch": 3.0451844818231146, + "grad_norm": 5.744794845581055, + "learning_rate": 3.5839386049061257e-06, + "loss": 0.2085, + "step": 47569 + }, + { + "epoch": 3.0451980466630495, + "grad_norm": 4.863259315490723, + "learning_rate": 3.5838015622858713e-06, + "loss": 0.1447, + "step": 47570 + }, + { + "epoch": 3.0452116115029844, + "grad_norm": 11.068098068237305, + "learning_rate": 3.583664519665616e-06, + "loss": 0.1863, + "step": 47571 + }, + { + "epoch": 3.0452251763429192, + "grad_norm": 5.249007225036621, + "learning_rate": 3.5835274770453617e-06, + "loss": 0.199, + "step": 47572 + }, + { + "epoch": 3.045238741182854, + "grad_norm": 3.322568655014038, + "learning_rate": 3.5833904344251065e-06, + "loss": 0.0925, + "step": 47573 + }, + { + "epoch": 3.045252306022789, + "grad_norm": 6.6730451583862305, + "learning_rate": 3.5832533918048516e-06, + "loss": 0.1451, + "step": 47574 + }, + { + "epoch": 3.045265870862724, + "grad_norm": 3.768301248550415, + "learning_rate": 3.583116349184597e-06, + "loss": 0.1234, + "step": 47575 + }, + { + "epoch": 3.0452794357026587, + "grad_norm": 4.552462577819824, + "learning_rate": 3.582979306564342e-06, + "loss": 0.145, + "step": 47576 + }, + { + "epoch": 3.0452930005425936, + "grad_norm": 3.6091835498809814, + "learning_rate": 3.5828422639440867e-06, + "loss": 0.1031, + "step": 47577 + }, + { + "epoch": 3.0453065653825284, + "grad_norm": 4.693686485290527, + "learning_rate": 3.5827052213238323e-06, + "loss": 0.1194, + "step": 47578 + }, + { + "epoch": 3.0453201302224633, + "grad_norm": 5.051157474517822, + "learning_rate": 3.582568178703577e-06, + "loss": 0.088, + "step": 47579 + }, + { + "epoch": 3.045333695062398, + "grad_norm": 5.410297393798828, + "learning_rate": 3.582431136083322e-06, + "loss": 0.1644, + "step": 47580 + }, + { + "epoch": 3.045347259902333, + "grad_norm": 4.160839557647705, + "learning_rate": 3.5822940934630675e-06, + "loss": 0.148, + "step": 47581 + }, + { + "epoch": 3.045360824742268, + "grad_norm": 3.916879177093506, + "learning_rate": 3.5821570508428122e-06, + "loss": 0.0716, + "step": 47582 + }, + { + "epoch": 3.0453743895822027, + "grad_norm": 6.724865436553955, + "learning_rate": 3.5820200082225574e-06, + "loss": 0.2837, + "step": 47583 + }, + { + "epoch": 3.0453879544221376, + "grad_norm": 5.589980602264404, + "learning_rate": 3.5818829656023026e-06, + "loss": 0.3035, + "step": 47584 + }, + { + "epoch": 3.045401519262073, + "grad_norm": 4.204381942749023, + "learning_rate": 3.5817459229820478e-06, + "loss": 0.2013, + "step": 47585 + }, + { + "epoch": 3.045415084102008, + "grad_norm": 6.4917683601379395, + "learning_rate": 3.5816088803617925e-06, + "loss": 0.2267, + "step": 47586 + }, + { + "epoch": 3.0454286489419427, + "grad_norm": 5.125844955444336, + "learning_rate": 3.581471837741538e-06, + "loss": 0.1399, + "step": 47587 + }, + { + "epoch": 3.0454422137818775, + "grad_norm": 3.84879469871521, + "learning_rate": 3.581334795121283e-06, + "loss": 0.091, + "step": 47588 + }, + { + "epoch": 3.0454557786218124, + "grad_norm": 6.267420768737793, + "learning_rate": 3.5811977525010285e-06, + "loss": 0.1644, + "step": 47589 + }, + { + "epoch": 3.0454693434617472, + "grad_norm": 5.87723970413208, + "learning_rate": 3.5810607098807732e-06, + "loss": 0.1328, + "step": 47590 + }, + { + "epoch": 3.045482908301682, + "grad_norm": 5.830419540405273, + "learning_rate": 3.5809236672605184e-06, + "loss": 0.1734, + "step": 47591 + }, + { + "epoch": 3.045496473141617, + "grad_norm": 4.677687644958496, + "learning_rate": 3.5807866246402636e-06, + "loss": 0.2019, + "step": 47592 + }, + { + "epoch": 3.045510037981552, + "grad_norm": 3.995242118835449, + "learning_rate": 3.5806495820200088e-06, + "loss": 0.1282, + "step": 47593 + }, + { + "epoch": 3.0455236028214867, + "grad_norm": 4.173748970031738, + "learning_rate": 3.5805125393997535e-06, + "loss": 0.0664, + "step": 47594 + }, + { + "epoch": 3.0455371676614216, + "grad_norm": 4.699945449829102, + "learning_rate": 3.5803754967794983e-06, + "loss": 0.124, + "step": 47595 + }, + { + "epoch": 3.0455507325013564, + "grad_norm": 5.0947465896606445, + "learning_rate": 3.580238454159244e-06, + "loss": 0.1508, + "step": 47596 + }, + { + "epoch": 3.0455642973412913, + "grad_norm": 6.266222953796387, + "learning_rate": 3.5801014115389886e-06, + "loss": 0.1469, + "step": 47597 + }, + { + "epoch": 3.045577862181226, + "grad_norm": 6.399480819702148, + "learning_rate": 3.5799643689187342e-06, + "loss": 0.1648, + "step": 47598 + }, + { + "epoch": 3.045591427021161, + "grad_norm": 3.794572591781616, + "learning_rate": 3.579827326298479e-06, + "loss": 0.101, + "step": 47599 + }, + { + "epoch": 3.045604991861096, + "grad_norm": 5.216463088989258, + "learning_rate": 3.579690283678224e-06, + "loss": 0.2749, + "step": 47600 + }, + { + "epoch": 3.0456185567010308, + "grad_norm": 8.001269340515137, + "learning_rate": 3.5795532410579693e-06, + "loss": 0.2458, + "step": 47601 + }, + { + "epoch": 3.0456321215409656, + "grad_norm": 3.619929313659668, + "learning_rate": 3.5794161984377145e-06, + "loss": 0.0958, + "step": 47602 + }, + { + "epoch": 3.0456456863809005, + "grad_norm": 3.2181265354156494, + "learning_rate": 3.5792791558174593e-06, + "loss": 0.08, + "step": 47603 + }, + { + "epoch": 3.045659251220836, + "grad_norm": 6.5985612869262695, + "learning_rate": 3.579142113197205e-06, + "loss": 0.1642, + "step": 47604 + }, + { + "epoch": 3.0456728160607707, + "grad_norm": 4.773575782775879, + "learning_rate": 3.5790050705769496e-06, + "loss": 0.1552, + "step": 47605 + }, + { + "epoch": 3.0456863809007055, + "grad_norm": 3.3938333988189697, + "learning_rate": 3.5788680279566952e-06, + "loss": 0.1007, + "step": 47606 + }, + { + "epoch": 3.0456999457406404, + "grad_norm": 4.763852596282959, + "learning_rate": 3.57873098533644e-06, + "loss": 0.0969, + "step": 47607 + }, + { + "epoch": 3.0457135105805753, + "grad_norm": 5.078793048858643, + "learning_rate": 3.5785939427161848e-06, + "loss": 0.2102, + "step": 47608 + }, + { + "epoch": 3.04572707542051, + "grad_norm": 4.329375743865967, + "learning_rate": 3.5784569000959304e-06, + "loss": 0.0957, + "step": 47609 + }, + { + "epoch": 3.045740640260445, + "grad_norm": 4.272141933441162, + "learning_rate": 3.578319857475675e-06, + "loss": 0.1061, + "step": 47610 + }, + { + "epoch": 3.04575420510038, + "grad_norm": 4.781337738037109, + "learning_rate": 3.5781828148554203e-06, + "loss": 0.1416, + "step": 47611 + }, + { + "epoch": 3.0457677699403147, + "grad_norm": 6.0089497566223145, + "learning_rate": 3.578045772235165e-06, + "loss": 0.1921, + "step": 47612 + }, + { + "epoch": 3.0457813347802496, + "grad_norm": 4.347249984741211, + "learning_rate": 3.5779087296149106e-06, + "loss": 0.1458, + "step": 47613 + }, + { + "epoch": 3.0457948996201845, + "grad_norm": 4.651727676391602, + "learning_rate": 3.5777716869946554e-06, + "loss": 0.0985, + "step": 47614 + }, + { + "epoch": 3.0458084644601193, + "grad_norm": 4.736379146575928, + "learning_rate": 3.577634644374401e-06, + "loss": 0.1555, + "step": 47615 + }, + { + "epoch": 3.045822029300054, + "grad_norm": 4.199477195739746, + "learning_rate": 3.5774976017541458e-06, + "loss": 0.0998, + "step": 47616 + }, + { + "epoch": 3.045835594139989, + "grad_norm": 6.770132541656494, + "learning_rate": 3.577360559133891e-06, + "loss": 0.1207, + "step": 47617 + }, + { + "epoch": 3.045849158979924, + "grad_norm": 5.2930908203125, + "learning_rate": 3.577223516513636e-06, + "loss": 0.1885, + "step": 47618 + }, + { + "epoch": 3.045862723819859, + "grad_norm": 3.738471746444702, + "learning_rate": 3.5770864738933813e-06, + "loss": 0.0649, + "step": 47619 + }, + { + "epoch": 3.0458762886597937, + "grad_norm": 4.605503082275391, + "learning_rate": 3.576949431273126e-06, + "loss": 0.165, + "step": 47620 + }, + { + "epoch": 3.0458898534997285, + "grad_norm": 2.920714855194092, + "learning_rate": 3.5768123886528712e-06, + "loss": 0.0724, + "step": 47621 + }, + { + "epoch": 3.0459034183396634, + "grad_norm": 5.024339199066162, + "learning_rate": 3.5766753460326164e-06, + "loss": 0.1252, + "step": 47622 + }, + { + "epoch": 3.0459169831795987, + "grad_norm": 6.73847770690918, + "learning_rate": 3.576538303412361e-06, + "loss": 0.2932, + "step": 47623 + }, + { + "epoch": 3.0459305480195336, + "grad_norm": 3.824415683746338, + "learning_rate": 3.5764012607921068e-06, + "loss": 0.0829, + "step": 47624 + }, + { + "epoch": 3.0459441128594684, + "grad_norm": 6.268229007720947, + "learning_rate": 3.5762642181718515e-06, + "loss": 0.1599, + "step": 47625 + }, + { + "epoch": 3.0459576776994033, + "grad_norm": 5.240115165710449, + "learning_rate": 3.576127175551597e-06, + "loss": 0.1071, + "step": 47626 + }, + { + "epoch": 3.045971242539338, + "grad_norm": 4.255275249481201, + "learning_rate": 3.575990132931342e-06, + "loss": 0.177, + "step": 47627 + }, + { + "epoch": 3.045984807379273, + "grad_norm": 4.061059474945068, + "learning_rate": 3.575853090311087e-06, + "loss": 0.1616, + "step": 47628 + }, + { + "epoch": 3.045998372219208, + "grad_norm": 3.54436993598938, + "learning_rate": 3.5757160476908322e-06, + "loss": 0.1005, + "step": 47629 + }, + { + "epoch": 3.0460119370591427, + "grad_norm": 3.911696672439575, + "learning_rate": 3.5755790050705774e-06, + "loss": 0.1035, + "step": 47630 + }, + { + "epoch": 3.0460255018990776, + "grad_norm": 4.074967861175537, + "learning_rate": 3.575441962450322e-06, + "loss": 0.0818, + "step": 47631 + }, + { + "epoch": 3.0460390667390125, + "grad_norm": 3.4680936336517334, + "learning_rate": 3.5753049198300678e-06, + "loss": 0.1455, + "step": 47632 + }, + { + "epoch": 3.0460526315789473, + "grad_norm": 5.7202677726745605, + "learning_rate": 3.5751678772098125e-06, + "loss": 0.1869, + "step": 47633 + }, + { + "epoch": 3.046066196418882, + "grad_norm": 4.029864311218262, + "learning_rate": 3.5750308345895577e-06, + "loss": 0.1099, + "step": 47634 + }, + { + "epoch": 3.046079761258817, + "grad_norm": 5.0605082511901855, + "learning_rate": 3.574893791969303e-06, + "loss": 0.3001, + "step": 47635 + }, + { + "epoch": 3.046093326098752, + "grad_norm": 4.198847770690918, + "learning_rate": 3.5747567493490476e-06, + "loss": 0.1597, + "step": 47636 + }, + { + "epoch": 3.046106890938687, + "grad_norm": 4.587434768676758, + "learning_rate": 3.574619706728793e-06, + "loss": 0.2413, + "step": 47637 + }, + { + "epoch": 3.0461204557786217, + "grad_norm": 3.7784764766693115, + "learning_rate": 3.574482664108538e-06, + "loss": 0.1359, + "step": 47638 + }, + { + "epoch": 3.0461340206185565, + "grad_norm": 4.67735481262207, + "learning_rate": 3.574345621488283e-06, + "loss": 0.1473, + "step": 47639 + }, + { + "epoch": 3.0461475854584914, + "grad_norm": 4.447368144989014, + "learning_rate": 3.574208578868028e-06, + "loss": 0.1442, + "step": 47640 + }, + { + "epoch": 3.0461611502984263, + "grad_norm": 4.282196998596191, + "learning_rate": 3.5740715362477735e-06, + "loss": 0.1393, + "step": 47641 + }, + { + "epoch": 3.0461747151383616, + "grad_norm": 4.106393337249756, + "learning_rate": 3.5739344936275183e-06, + "loss": 0.1363, + "step": 47642 + }, + { + "epoch": 3.0461882799782964, + "grad_norm": 4.468235015869141, + "learning_rate": 3.573797451007264e-06, + "loss": 0.1598, + "step": 47643 + }, + { + "epoch": 3.0462018448182313, + "grad_norm": 4.698599815368652, + "learning_rate": 3.5736604083870086e-06, + "loss": 0.121, + "step": 47644 + }, + { + "epoch": 3.046215409658166, + "grad_norm": 4.8129167556762695, + "learning_rate": 3.573523365766754e-06, + "loss": 0.1906, + "step": 47645 + }, + { + "epoch": 3.046228974498101, + "grad_norm": 4.628476619720459, + "learning_rate": 3.573386323146499e-06, + "loss": 0.1329, + "step": 47646 + }, + { + "epoch": 3.046242539338036, + "grad_norm": 3.7785468101501465, + "learning_rate": 3.573249280526244e-06, + "loss": 0.1489, + "step": 47647 + }, + { + "epoch": 3.0462561041779708, + "grad_norm": 6.102750778198242, + "learning_rate": 3.573112237905989e-06, + "loss": 0.1173, + "step": 47648 + }, + { + "epoch": 3.0462696690179056, + "grad_norm": 4.486855506896973, + "learning_rate": 3.5729751952857337e-06, + "loss": 0.1382, + "step": 47649 + }, + { + "epoch": 3.0462832338578405, + "grad_norm": 4.855997562408447, + "learning_rate": 3.5728381526654793e-06, + "loss": 0.2013, + "step": 47650 + }, + { + "epoch": 3.0462967986977754, + "grad_norm": 4.270355701446533, + "learning_rate": 3.572701110045224e-06, + "loss": 0.235, + "step": 47651 + }, + { + "epoch": 3.0463103635377102, + "grad_norm": 4.625048637390137, + "learning_rate": 3.5725640674249697e-06, + "loss": 0.1374, + "step": 47652 + }, + { + "epoch": 3.046323928377645, + "grad_norm": 4.889730930328369, + "learning_rate": 3.5724270248047144e-06, + "loss": 0.0867, + "step": 47653 + }, + { + "epoch": 3.04633749321758, + "grad_norm": 5.441093444824219, + "learning_rate": 3.5722899821844596e-06, + "loss": 0.1826, + "step": 47654 + }, + { + "epoch": 3.046351058057515, + "grad_norm": 4.957079887390137, + "learning_rate": 3.5721529395642048e-06, + "loss": 0.1583, + "step": 47655 + }, + { + "epoch": 3.0463646228974497, + "grad_norm": 2.72749662399292, + "learning_rate": 3.57201589694395e-06, + "loss": 0.0999, + "step": 47656 + }, + { + "epoch": 3.0463781877373846, + "grad_norm": 4.515476703643799, + "learning_rate": 3.5718788543236947e-06, + "loss": 0.2415, + "step": 47657 + }, + { + "epoch": 3.0463917525773194, + "grad_norm": 4.67293643951416, + "learning_rate": 3.5717418117034403e-06, + "loss": 0.1369, + "step": 47658 + }, + { + "epoch": 3.0464053174172543, + "grad_norm": 3.0830078125, + "learning_rate": 3.571604769083185e-06, + "loss": 0.0984, + "step": 47659 + }, + { + "epoch": 3.046418882257189, + "grad_norm": 5.583224773406982, + "learning_rate": 3.5714677264629307e-06, + "loss": 0.2433, + "step": 47660 + }, + { + "epoch": 3.0464324470971245, + "grad_norm": 4.907060146331787, + "learning_rate": 3.5713306838426754e-06, + "loss": 0.2572, + "step": 47661 + }, + { + "epoch": 3.0464460119370593, + "grad_norm": 3.8825478553771973, + "learning_rate": 3.5711936412224206e-06, + "loss": 0.1124, + "step": 47662 + }, + { + "epoch": 3.046459576776994, + "grad_norm": 2.8154101371765137, + "learning_rate": 3.5710565986021658e-06, + "loss": 0.0699, + "step": 47663 + }, + { + "epoch": 3.046473141616929, + "grad_norm": 3.6730473041534424, + "learning_rate": 3.5709195559819105e-06, + "loss": 0.1012, + "step": 47664 + }, + { + "epoch": 3.046486706456864, + "grad_norm": 3.1069469451904297, + "learning_rate": 3.5707825133616557e-06, + "loss": 0.0738, + "step": 47665 + }, + { + "epoch": 3.046500271296799, + "grad_norm": 4.289422035217285, + "learning_rate": 3.5706454707414005e-06, + "loss": 0.1402, + "step": 47666 + }, + { + "epoch": 3.0465138361367337, + "grad_norm": 4.252751350402832, + "learning_rate": 3.570508428121146e-06, + "loss": 0.1344, + "step": 47667 + }, + { + "epoch": 3.0465274009766685, + "grad_norm": 4.190173625946045, + "learning_rate": 3.570371385500891e-06, + "loss": 0.1498, + "step": 47668 + }, + { + "epoch": 3.0465409658166034, + "grad_norm": 5.469798564910889, + "learning_rate": 3.5702343428806364e-06, + "loss": 0.1318, + "step": 47669 + }, + { + "epoch": 3.0465545306565383, + "grad_norm": 4.123505115509033, + "learning_rate": 3.570097300260381e-06, + "loss": 0.0853, + "step": 47670 + }, + { + "epoch": 3.046568095496473, + "grad_norm": 5.2490997314453125, + "learning_rate": 3.5699602576401264e-06, + "loss": 0.1189, + "step": 47671 + }, + { + "epoch": 3.046581660336408, + "grad_norm": 4.813716411590576, + "learning_rate": 3.5698232150198715e-06, + "loss": 0.2256, + "step": 47672 + }, + { + "epoch": 3.046595225176343, + "grad_norm": 4.996391296386719, + "learning_rate": 3.5696861723996167e-06, + "loss": 0.1601, + "step": 47673 + }, + { + "epoch": 3.0466087900162777, + "grad_norm": 4.884063720703125, + "learning_rate": 3.5695491297793615e-06, + "loss": 0.1141, + "step": 47674 + }, + { + "epoch": 3.0466223548562126, + "grad_norm": 4.708674907684326, + "learning_rate": 3.569412087159107e-06, + "loss": 0.1511, + "step": 47675 + }, + { + "epoch": 3.0466359196961474, + "grad_norm": 4.890563488006592, + "learning_rate": 3.569275044538852e-06, + "loss": 0.1154, + "step": 47676 + }, + { + "epoch": 3.0466494845360823, + "grad_norm": 2.569728136062622, + "learning_rate": 3.5691380019185966e-06, + "loss": 0.0715, + "step": 47677 + }, + { + "epoch": 3.046663049376017, + "grad_norm": 2.8726871013641357, + "learning_rate": 3.569000959298342e-06, + "loss": 0.0666, + "step": 47678 + }, + { + "epoch": 3.046676614215952, + "grad_norm": 2.9250648021698, + "learning_rate": 3.568863916678087e-06, + "loss": 0.1014, + "step": 47679 + }, + { + "epoch": 3.0466901790558873, + "grad_norm": 3.3366878032684326, + "learning_rate": 3.5687268740578325e-06, + "loss": 0.0768, + "step": 47680 + }, + { + "epoch": 3.046703743895822, + "grad_norm": 2.816298007965088, + "learning_rate": 3.5685898314375773e-06, + "loss": 0.0713, + "step": 47681 + }, + { + "epoch": 3.046717308735757, + "grad_norm": 4.594555854797363, + "learning_rate": 3.5684527888173225e-06, + "loss": 0.1089, + "step": 47682 + }, + { + "epoch": 3.046730873575692, + "grad_norm": 5.465687274932861, + "learning_rate": 3.5683157461970672e-06, + "loss": 0.1191, + "step": 47683 + }, + { + "epoch": 3.046744438415627, + "grad_norm": 3.3574163913726807, + "learning_rate": 3.568178703576813e-06, + "loss": 0.0563, + "step": 47684 + }, + { + "epoch": 3.0467580032555617, + "grad_norm": 3.832714796066284, + "learning_rate": 3.5680416609565576e-06, + "loss": 0.0999, + "step": 47685 + }, + { + "epoch": 3.0467715680954965, + "grad_norm": 4.108188629150391, + "learning_rate": 3.567904618336303e-06, + "loss": 0.12, + "step": 47686 + }, + { + "epoch": 3.0467851329354314, + "grad_norm": 3.7077298164367676, + "learning_rate": 3.567767575716048e-06, + "loss": 0.1247, + "step": 47687 + }, + { + "epoch": 3.0467986977753663, + "grad_norm": 5.390626907348633, + "learning_rate": 3.567630533095793e-06, + "loss": 0.1514, + "step": 47688 + }, + { + "epoch": 3.046812262615301, + "grad_norm": 4.724735736846924, + "learning_rate": 3.5674934904755383e-06, + "loss": 0.1392, + "step": 47689 + }, + { + "epoch": 3.046825827455236, + "grad_norm": 4.572995185852051, + "learning_rate": 3.567356447855283e-06, + "loss": 0.1091, + "step": 47690 + }, + { + "epoch": 3.046839392295171, + "grad_norm": 4.5321269035339355, + "learning_rate": 3.5672194052350282e-06, + "loss": 0.0972, + "step": 47691 + }, + { + "epoch": 3.0468529571351057, + "grad_norm": 3.4434926509857178, + "learning_rate": 3.5670823626147734e-06, + "loss": 0.0811, + "step": 47692 + }, + { + "epoch": 3.0468665219750406, + "grad_norm": 4.8537492752075195, + "learning_rate": 3.5669453199945186e-06, + "loss": 0.1471, + "step": 47693 + }, + { + "epoch": 3.0468800868149755, + "grad_norm": 5.131655693054199, + "learning_rate": 3.5668082773742634e-06, + "loss": 0.218, + "step": 47694 + }, + { + "epoch": 3.0468936516549103, + "grad_norm": 5.542774200439453, + "learning_rate": 3.566671234754009e-06, + "loss": 0.1399, + "step": 47695 + }, + { + "epoch": 3.046907216494845, + "grad_norm": 4.093079566955566, + "learning_rate": 3.5665341921337537e-06, + "loss": 0.1176, + "step": 47696 + }, + { + "epoch": 3.04692078133478, + "grad_norm": 3.2453315258026123, + "learning_rate": 3.5663971495134993e-06, + "loss": 0.0608, + "step": 47697 + }, + { + "epoch": 3.046934346174715, + "grad_norm": 4.373722553253174, + "learning_rate": 3.566260106893244e-06, + "loss": 0.1103, + "step": 47698 + }, + { + "epoch": 3.0469479110146502, + "grad_norm": 4.06632661819458, + "learning_rate": 3.5661230642729893e-06, + "loss": 0.1024, + "step": 47699 + }, + { + "epoch": 3.046961475854585, + "grad_norm": 5.565243244171143, + "learning_rate": 3.565986021652734e-06, + "loss": 0.2749, + "step": 47700 + }, + { + "epoch": 3.04697504069452, + "grad_norm": 4.325808525085449, + "learning_rate": 3.5658489790324796e-06, + "loss": 0.0728, + "step": 47701 + }, + { + "epoch": 3.046988605534455, + "grad_norm": 4.5999040603637695, + "learning_rate": 3.5657119364122244e-06, + "loss": 0.1129, + "step": 47702 + }, + { + "epoch": 3.0470021703743897, + "grad_norm": 4.144103527069092, + "learning_rate": 3.56557489379197e-06, + "loss": 0.1118, + "step": 47703 + }, + { + "epoch": 3.0470157352143246, + "grad_norm": 3.9859039783477783, + "learning_rate": 3.5654378511717147e-06, + "loss": 0.0854, + "step": 47704 + }, + { + "epoch": 3.0470293000542594, + "grad_norm": 4.333792686462402, + "learning_rate": 3.5653008085514595e-06, + "loss": 0.1378, + "step": 47705 + }, + { + "epoch": 3.0470428648941943, + "grad_norm": 3.488100051879883, + "learning_rate": 3.565163765931205e-06, + "loss": 0.0975, + "step": 47706 + }, + { + "epoch": 3.047056429734129, + "grad_norm": 3.503216028213501, + "learning_rate": 3.56502672331095e-06, + "loss": 0.0778, + "step": 47707 + }, + { + "epoch": 3.047069994574064, + "grad_norm": 4.47297477722168, + "learning_rate": 3.564889680690695e-06, + "loss": 0.1685, + "step": 47708 + }, + { + "epoch": 3.047083559413999, + "grad_norm": 4.242711067199707, + "learning_rate": 3.56475263807044e-06, + "loss": 0.0655, + "step": 47709 + }, + { + "epoch": 3.0470971242539338, + "grad_norm": 5.182710647583008, + "learning_rate": 3.5646155954501854e-06, + "loss": 0.1149, + "step": 47710 + }, + { + "epoch": 3.0471106890938686, + "grad_norm": 4.04691219329834, + "learning_rate": 3.56447855282993e-06, + "loss": 0.0916, + "step": 47711 + }, + { + "epoch": 3.0471242539338035, + "grad_norm": 4.47132682800293, + "learning_rate": 3.5643415102096757e-06, + "loss": 0.1409, + "step": 47712 + }, + { + "epoch": 3.0471378187737383, + "grad_norm": 3.176748275756836, + "learning_rate": 3.5642044675894205e-06, + "loss": 0.0818, + "step": 47713 + }, + { + "epoch": 3.047151383613673, + "grad_norm": 3.136371612548828, + "learning_rate": 3.564067424969166e-06, + "loss": 0.1113, + "step": 47714 + }, + { + "epoch": 3.047164948453608, + "grad_norm": 3.482820749282837, + "learning_rate": 3.563930382348911e-06, + "loss": 0.0936, + "step": 47715 + }, + { + "epoch": 3.047178513293543, + "grad_norm": 5.625025272369385, + "learning_rate": 3.563793339728656e-06, + "loss": 0.1164, + "step": 47716 + }, + { + "epoch": 3.047192078133478, + "grad_norm": 5.0873517990112305, + "learning_rate": 3.563656297108401e-06, + "loss": 0.0943, + "step": 47717 + }, + { + "epoch": 3.047205642973413, + "grad_norm": 7.876702785491943, + "learning_rate": 3.563519254488146e-06, + "loss": 0.1561, + "step": 47718 + }, + { + "epoch": 3.047219207813348, + "grad_norm": 3.9821009635925293, + "learning_rate": 3.563382211867891e-06, + "loss": 0.0943, + "step": 47719 + }, + { + "epoch": 3.047232772653283, + "grad_norm": 5.206404685974121, + "learning_rate": 3.563245169247636e-06, + "loss": 0.1242, + "step": 47720 + }, + { + "epoch": 3.0472463374932177, + "grad_norm": 3.1493847370147705, + "learning_rate": 3.5631081266273815e-06, + "loss": 0.0728, + "step": 47721 + }, + { + "epoch": 3.0472599023331526, + "grad_norm": 4.033642292022705, + "learning_rate": 3.5629710840071262e-06, + "loss": 0.1171, + "step": 47722 + }, + { + "epoch": 3.0472734671730874, + "grad_norm": 4.945361137390137, + "learning_rate": 3.562834041386872e-06, + "loss": 0.1128, + "step": 47723 + }, + { + "epoch": 3.0472870320130223, + "grad_norm": 5.131868362426758, + "learning_rate": 3.5626969987666166e-06, + "loss": 0.1678, + "step": 47724 + }, + { + "epoch": 3.047300596852957, + "grad_norm": 4.398863792419434, + "learning_rate": 3.5625599561463618e-06, + "loss": 0.0774, + "step": 47725 + }, + { + "epoch": 3.047314161692892, + "grad_norm": 7.1320600509643555, + "learning_rate": 3.562422913526107e-06, + "loss": 0.1399, + "step": 47726 + }, + { + "epoch": 3.047327726532827, + "grad_norm": 5.577498912811279, + "learning_rate": 3.562285870905852e-06, + "loss": 0.1213, + "step": 47727 + }, + { + "epoch": 3.0473412913727618, + "grad_norm": 7.004269599914551, + "learning_rate": 3.562148828285597e-06, + "loss": 0.1851, + "step": 47728 + }, + { + "epoch": 3.0473548562126966, + "grad_norm": 4.738752841949463, + "learning_rate": 3.5620117856653425e-06, + "loss": 0.163, + "step": 47729 + }, + { + "epoch": 3.0473684210526315, + "grad_norm": 4.500484943389893, + "learning_rate": 3.5618747430450873e-06, + "loss": 0.108, + "step": 47730 + }, + { + "epoch": 3.0473819858925664, + "grad_norm": 4.99409294128418, + "learning_rate": 3.561737700424833e-06, + "loss": 0.1656, + "step": 47731 + }, + { + "epoch": 3.0473955507325012, + "grad_norm": 4.348855495452881, + "learning_rate": 3.5616006578045776e-06, + "loss": 0.1091, + "step": 47732 + }, + { + "epoch": 3.047409115572436, + "grad_norm": 5.355744361877441, + "learning_rate": 3.5614636151843224e-06, + "loss": 0.1231, + "step": 47733 + }, + { + "epoch": 3.047422680412371, + "grad_norm": 5.757516384124756, + "learning_rate": 3.561326572564068e-06, + "loss": 0.2106, + "step": 47734 + }, + { + "epoch": 3.047436245252306, + "grad_norm": 3.8953990936279297, + "learning_rate": 3.5611895299438127e-06, + "loss": 0.1221, + "step": 47735 + }, + { + "epoch": 3.0474498100922407, + "grad_norm": 3.6800153255462646, + "learning_rate": 3.561052487323558e-06, + "loss": 0.1389, + "step": 47736 + }, + { + "epoch": 3.047463374932176, + "grad_norm": 3.5099005699157715, + "learning_rate": 3.5609154447033027e-06, + "loss": 0.108, + "step": 47737 + }, + { + "epoch": 3.047476939772111, + "grad_norm": 4.311309337615967, + "learning_rate": 3.5607784020830483e-06, + "loss": 0.1565, + "step": 47738 + }, + { + "epoch": 3.0474905046120457, + "grad_norm": 5.340907573699951, + "learning_rate": 3.560641359462793e-06, + "loss": 0.1418, + "step": 47739 + }, + { + "epoch": 3.0475040694519806, + "grad_norm": 4.4290266036987305, + "learning_rate": 3.5605043168425386e-06, + "loss": 0.1226, + "step": 47740 + }, + { + "epoch": 3.0475176342919155, + "grad_norm": 4.483893394470215, + "learning_rate": 3.5603672742222834e-06, + "loss": 0.1322, + "step": 47741 + }, + { + "epoch": 3.0475311991318503, + "grad_norm": 4.912096977233887, + "learning_rate": 3.5602302316020286e-06, + "loss": 0.1362, + "step": 47742 + }, + { + "epoch": 3.047544763971785, + "grad_norm": 4.251318454742432, + "learning_rate": 3.5600931889817737e-06, + "loss": 0.119, + "step": 47743 + }, + { + "epoch": 3.04755832881172, + "grad_norm": 4.067452907562256, + "learning_rate": 3.559956146361519e-06, + "loss": 0.1665, + "step": 47744 + }, + { + "epoch": 3.047571893651655, + "grad_norm": 4.315066814422607, + "learning_rate": 3.5598191037412637e-06, + "loss": 0.1511, + "step": 47745 + }, + { + "epoch": 3.04758545849159, + "grad_norm": 4.544399738311768, + "learning_rate": 3.559682061121009e-06, + "loss": 0.1326, + "step": 47746 + }, + { + "epoch": 3.0475990233315247, + "grad_norm": 5.151200771331787, + "learning_rate": 3.559545018500754e-06, + "loss": 0.1761, + "step": 47747 + }, + { + "epoch": 3.0476125881714595, + "grad_norm": 5.055314064025879, + "learning_rate": 3.5594079758804988e-06, + "loss": 0.1606, + "step": 47748 + }, + { + "epoch": 3.0476261530113944, + "grad_norm": 4.810178279876709, + "learning_rate": 3.5592709332602444e-06, + "loss": 0.0917, + "step": 47749 + }, + { + "epoch": 3.0476397178513293, + "grad_norm": 4.479691028594971, + "learning_rate": 3.559133890639989e-06, + "loss": 0.1748, + "step": 47750 + }, + { + "epoch": 3.047653282691264, + "grad_norm": 3.9126076698303223, + "learning_rate": 3.5589968480197347e-06, + "loss": 0.1422, + "step": 47751 + }, + { + "epoch": 3.047666847531199, + "grad_norm": 3.9498345851898193, + "learning_rate": 3.5588598053994795e-06, + "loss": 0.1779, + "step": 47752 + }, + { + "epoch": 3.047680412371134, + "grad_norm": 5.187180042266846, + "learning_rate": 3.5587227627792247e-06, + "loss": 0.1605, + "step": 47753 + }, + { + "epoch": 3.0476939772110687, + "grad_norm": 4.731667518615723, + "learning_rate": 3.5585857201589694e-06, + "loss": 0.1888, + "step": 47754 + }, + { + "epoch": 3.0477075420510036, + "grad_norm": 4.07150411605835, + "learning_rate": 3.558448677538715e-06, + "loss": 0.1002, + "step": 47755 + }, + { + "epoch": 3.047721106890939, + "grad_norm": 4.476958274841309, + "learning_rate": 3.55831163491846e-06, + "loss": 0.1674, + "step": 47756 + }, + { + "epoch": 3.0477346717308738, + "grad_norm": 6.836511611938477, + "learning_rate": 3.5581745922982054e-06, + "loss": 0.2594, + "step": 47757 + }, + { + "epoch": 3.0477482365708086, + "grad_norm": 3.238103151321411, + "learning_rate": 3.55803754967795e-06, + "loss": 0.1363, + "step": 47758 + }, + { + "epoch": 3.0477618014107435, + "grad_norm": 5.351140975952148, + "learning_rate": 3.557900507057695e-06, + "loss": 0.2098, + "step": 47759 + }, + { + "epoch": 3.0477753662506784, + "grad_norm": 3.672037124633789, + "learning_rate": 3.5577634644374405e-06, + "loss": 0.128, + "step": 47760 + }, + { + "epoch": 3.047788931090613, + "grad_norm": 3.580148220062256, + "learning_rate": 3.5576264218171853e-06, + "loss": 0.1155, + "step": 47761 + }, + { + "epoch": 3.047802495930548, + "grad_norm": 3.6167256832122803, + "learning_rate": 3.5574893791969304e-06, + "loss": 0.1115, + "step": 47762 + }, + { + "epoch": 3.047816060770483, + "grad_norm": 5.164254665374756, + "learning_rate": 3.5573523365766756e-06, + "loss": 0.1523, + "step": 47763 + }, + { + "epoch": 3.047829625610418, + "grad_norm": 4.966663360595703, + "learning_rate": 3.557215293956421e-06, + "loss": 0.23, + "step": 47764 + }, + { + "epoch": 3.0478431904503527, + "grad_norm": 4.00277853012085, + "learning_rate": 3.5570782513361656e-06, + "loss": 0.1432, + "step": 47765 + }, + { + "epoch": 3.0478567552902875, + "grad_norm": 3.151763677597046, + "learning_rate": 3.556941208715911e-06, + "loss": 0.1317, + "step": 47766 + }, + { + "epoch": 3.0478703201302224, + "grad_norm": 4.546665191650391, + "learning_rate": 3.556804166095656e-06, + "loss": 0.1324, + "step": 47767 + }, + { + "epoch": 3.0478838849701573, + "grad_norm": 3.1699836254119873, + "learning_rate": 3.5566671234754015e-06, + "loss": 0.1103, + "step": 47768 + }, + { + "epoch": 3.047897449810092, + "grad_norm": 4.114074230194092, + "learning_rate": 3.5565300808551463e-06, + "loss": 0.0948, + "step": 47769 + }, + { + "epoch": 3.047911014650027, + "grad_norm": 3.5710670948028564, + "learning_rate": 3.5563930382348914e-06, + "loss": 0.0935, + "step": 47770 + }, + { + "epoch": 3.047924579489962, + "grad_norm": 3.936005115509033, + "learning_rate": 3.556255995614636e-06, + "loss": 0.1419, + "step": 47771 + }, + { + "epoch": 3.0479381443298967, + "grad_norm": 4.051276683807373, + "learning_rate": 3.556118952994382e-06, + "loss": 0.1286, + "step": 47772 + }, + { + "epoch": 3.0479517091698316, + "grad_norm": 4.388226509094238, + "learning_rate": 3.5559819103741266e-06, + "loss": 0.222, + "step": 47773 + }, + { + "epoch": 3.0479652740097665, + "grad_norm": 4.240363597869873, + "learning_rate": 3.5558448677538713e-06, + "loss": 0.1344, + "step": 47774 + }, + { + "epoch": 3.0479788388497018, + "grad_norm": 4.336796283721924, + "learning_rate": 3.555707825133617e-06, + "loss": 0.1902, + "step": 47775 + }, + { + "epoch": 3.0479924036896366, + "grad_norm": 4.536684989929199, + "learning_rate": 3.5555707825133617e-06, + "loss": 0.2549, + "step": 47776 + }, + { + "epoch": 3.0480059685295715, + "grad_norm": 4.7726922035217285, + "learning_rate": 3.5554337398931073e-06, + "loss": 0.1163, + "step": 47777 + }, + { + "epoch": 3.0480195333695064, + "grad_norm": 3.380519151687622, + "learning_rate": 3.555296697272852e-06, + "loss": 0.1198, + "step": 47778 + }, + { + "epoch": 3.0480330982094412, + "grad_norm": 4.063440322875977, + "learning_rate": 3.555159654652597e-06, + "loss": 0.18, + "step": 47779 + }, + { + "epoch": 3.048046663049376, + "grad_norm": 4.202153205871582, + "learning_rate": 3.5550226120323424e-06, + "loss": 0.1229, + "step": 47780 + }, + { + "epoch": 3.048060227889311, + "grad_norm": 4.158390045166016, + "learning_rate": 3.5548855694120876e-06, + "loss": 0.1074, + "step": 47781 + }, + { + "epoch": 3.048073792729246, + "grad_norm": 3.656583309173584, + "learning_rate": 3.5547485267918323e-06, + "loss": 0.1064, + "step": 47782 + }, + { + "epoch": 3.0480873575691807, + "grad_norm": 4.190524101257324, + "learning_rate": 3.554611484171578e-06, + "loss": 0.1048, + "step": 47783 + }, + { + "epoch": 3.0481009224091156, + "grad_norm": 4.646515846252441, + "learning_rate": 3.5544744415513227e-06, + "loss": 0.1875, + "step": 47784 + }, + { + "epoch": 3.0481144872490504, + "grad_norm": 3.3945765495300293, + "learning_rate": 3.5543373989310683e-06, + "loss": 0.1192, + "step": 47785 + }, + { + "epoch": 3.0481280520889853, + "grad_norm": 3.3845934867858887, + "learning_rate": 3.554200356310813e-06, + "loss": 0.0852, + "step": 47786 + }, + { + "epoch": 3.04814161692892, + "grad_norm": 3.8417890071868896, + "learning_rate": 3.554063313690558e-06, + "loss": 0.1437, + "step": 47787 + }, + { + "epoch": 3.048155181768855, + "grad_norm": 4.821474075317383, + "learning_rate": 3.5539262710703034e-06, + "loss": 0.2063, + "step": 47788 + }, + { + "epoch": 3.04816874660879, + "grad_norm": 4.906798839569092, + "learning_rate": 3.553789228450048e-06, + "loss": 0.1413, + "step": 47789 + }, + { + "epoch": 3.0481823114487248, + "grad_norm": 4.149974346160889, + "learning_rate": 3.5536521858297933e-06, + "loss": 0.172, + "step": 47790 + }, + { + "epoch": 3.0481958762886596, + "grad_norm": 3.797353506088257, + "learning_rate": 3.553515143209538e-06, + "loss": 0.1176, + "step": 47791 + }, + { + "epoch": 3.0482094411285945, + "grad_norm": 4.131716251373291, + "learning_rate": 3.5533781005892837e-06, + "loss": 0.1337, + "step": 47792 + }, + { + "epoch": 3.0482230059685294, + "grad_norm": 3.9449527263641357, + "learning_rate": 3.5532410579690284e-06, + "loss": 0.1609, + "step": 47793 + }, + { + "epoch": 3.0482365708084647, + "grad_norm": 4.000823497772217, + "learning_rate": 3.553104015348774e-06, + "loss": 0.1388, + "step": 47794 + }, + { + "epoch": 3.0482501356483995, + "grad_norm": 4.778949737548828, + "learning_rate": 3.552966972728519e-06, + "loss": 0.1074, + "step": 47795 + }, + { + "epoch": 3.0482637004883344, + "grad_norm": 5.739470481872559, + "learning_rate": 3.552829930108264e-06, + "loss": 0.2877, + "step": 47796 + }, + { + "epoch": 3.0482772653282693, + "grad_norm": 3.427583932876587, + "learning_rate": 3.552692887488009e-06, + "loss": 0.1243, + "step": 47797 + }, + { + "epoch": 3.048290830168204, + "grad_norm": 3.645305871963501, + "learning_rate": 3.5525558448677543e-06, + "loss": 0.1077, + "step": 47798 + }, + { + "epoch": 3.048304395008139, + "grad_norm": 3.834648609161377, + "learning_rate": 3.552418802247499e-06, + "loss": 0.1303, + "step": 47799 + }, + { + "epoch": 3.048317959848074, + "grad_norm": 4.297847270965576, + "learning_rate": 3.5522817596272443e-06, + "loss": 0.1276, + "step": 47800 + }, + { + "epoch": 3.0483315246880087, + "grad_norm": 3.6093647480010986, + "learning_rate": 3.5521447170069895e-06, + "loss": 0.0768, + "step": 47801 + }, + { + "epoch": 3.0483450895279436, + "grad_norm": 4.5425848960876465, + "learning_rate": 3.552007674386734e-06, + "loss": 0.159, + "step": 47802 + }, + { + "epoch": 3.0483586543678785, + "grad_norm": 5.5576395988464355, + "learning_rate": 3.55187063176648e-06, + "loss": 0.1684, + "step": 47803 + }, + { + "epoch": 3.0483722192078133, + "grad_norm": 4.747740745544434, + "learning_rate": 3.5517335891462246e-06, + "loss": 0.1137, + "step": 47804 + }, + { + "epoch": 3.048385784047748, + "grad_norm": 4.267213821411133, + "learning_rate": 3.55159654652597e-06, + "loss": 0.0911, + "step": 47805 + }, + { + "epoch": 3.048399348887683, + "grad_norm": 5.928680419921875, + "learning_rate": 3.551459503905715e-06, + "loss": 0.3191, + "step": 47806 + }, + { + "epoch": 3.048412913727618, + "grad_norm": 3.9822840690612793, + "learning_rate": 3.55132246128546e-06, + "loss": 0.1488, + "step": 47807 + }, + { + "epoch": 3.0484264785675528, + "grad_norm": 4.568456172943115, + "learning_rate": 3.551185418665205e-06, + "loss": 0.2098, + "step": 47808 + }, + { + "epoch": 3.0484400434074876, + "grad_norm": 4.893517971038818, + "learning_rate": 3.5510483760449505e-06, + "loss": 0.159, + "step": 47809 + }, + { + "epoch": 3.0484536082474225, + "grad_norm": 3.050814151763916, + "learning_rate": 3.5509113334246952e-06, + "loss": 0.0903, + "step": 47810 + }, + { + "epoch": 3.0484671730873574, + "grad_norm": 4.450896263122559, + "learning_rate": 3.550774290804441e-06, + "loss": 0.1493, + "step": 47811 + }, + { + "epoch": 3.0484807379272922, + "grad_norm": 3.3398447036743164, + "learning_rate": 3.5506372481841856e-06, + "loss": 0.1348, + "step": 47812 + }, + { + "epoch": 3.0484943027672275, + "grad_norm": 4.869377136230469, + "learning_rate": 3.5505002055639308e-06, + "loss": 0.1005, + "step": 47813 + }, + { + "epoch": 3.0485078676071624, + "grad_norm": 5.064892292022705, + "learning_rate": 3.550363162943676e-06, + "loss": 0.1407, + "step": 47814 + }, + { + "epoch": 3.0485214324470973, + "grad_norm": 5.885260581970215, + "learning_rate": 3.5502261203234207e-06, + "loss": 0.2235, + "step": 47815 + }, + { + "epoch": 3.048534997287032, + "grad_norm": 4.2052001953125, + "learning_rate": 3.550089077703166e-06, + "loss": 0.1588, + "step": 47816 + }, + { + "epoch": 3.048548562126967, + "grad_norm": 2.944287061691284, + "learning_rate": 3.549952035082911e-06, + "loss": 0.0982, + "step": 47817 + }, + { + "epoch": 3.048562126966902, + "grad_norm": 4.090321063995361, + "learning_rate": 3.5498149924626562e-06, + "loss": 0.1311, + "step": 47818 + }, + { + "epoch": 3.0485756918068367, + "grad_norm": 3.1321890354156494, + "learning_rate": 3.549677949842401e-06, + "loss": 0.1119, + "step": 47819 + }, + { + "epoch": 3.0485892566467716, + "grad_norm": 5.006597518920898, + "learning_rate": 3.5495409072221466e-06, + "loss": 0.1166, + "step": 47820 + }, + { + "epoch": 3.0486028214867065, + "grad_norm": 3.987562656402588, + "learning_rate": 3.5494038646018913e-06, + "loss": 0.1115, + "step": 47821 + }, + { + "epoch": 3.0486163863266413, + "grad_norm": 4.160793304443359, + "learning_rate": 3.549266821981637e-06, + "loss": 0.1271, + "step": 47822 + }, + { + "epoch": 3.048629951166576, + "grad_norm": 4.168507099151611, + "learning_rate": 3.5491297793613817e-06, + "loss": 0.084, + "step": 47823 + }, + { + "epoch": 3.048643516006511, + "grad_norm": 2.7319278717041016, + "learning_rate": 3.548992736741127e-06, + "loss": 0.0739, + "step": 47824 + }, + { + "epoch": 3.048657080846446, + "grad_norm": 2.941166400909424, + "learning_rate": 3.5488556941208716e-06, + "loss": 0.1046, + "step": 47825 + }, + { + "epoch": 3.048670645686381, + "grad_norm": 5.191341876983643, + "learning_rate": 3.5487186515006172e-06, + "loss": 0.1221, + "step": 47826 + }, + { + "epoch": 3.0486842105263157, + "grad_norm": 4.28891134262085, + "learning_rate": 3.548581608880362e-06, + "loss": 0.0912, + "step": 47827 + }, + { + "epoch": 3.0486977753662505, + "grad_norm": 4.173285961151123, + "learning_rate": 3.5484445662601067e-06, + "loss": 0.091, + "step": 47828 + }, + { + "epoch": 3.0487113402061854, + "grad_norm": 4.842278003692627, + "learning_rate": 3.5483075236398523e-06, + "loss": 0.1432, + "step": 47829 + }, + { + "epoch": 3.0487249050461203, + "grad_norm": 3.5077381134033203, + "learning_rate": 3.548170481019597e-06, + "loss": 0.1273, + "step": 47830 + }, + { + "epoch": 3.0487384698860556, + "grad_norm": 3.984189748764038, + "learning_rate": 3.5480334383993427e-06, + "loss": 0.1149, + "step": 47831 + }, + { + "epoch": 3.0487520347259904, + "grad_norm": 3.4995498657226562, + "learning_rate": 3.5478963957790875e-06, + "loss": 0.0956, + "step": 47832 + }, + { + "epoch": 3.0487655995659253, + "grad_norm": 3.9101736545562744, + "learning_rate": 3.5477593531588326e-06, + "loss": 0.1801, + "step": 47833 + }, + { + "epoch": 3.04877916440586, + "grad_norm": 4.070279598236084, + "learning_rate": 3.547622310538578e-06, + "loss": 0.1233, + "step": 47834 + }, + { + "epoch": 3.048792729245795, + "grad_norm": 4.142975330352783, + "learning_rate": 3.547485267918323e-06, + "loss": 0.1742, + "step": 47835 + }, + { + "epoch": 3.04880629408573, + "grad_norm": 4.539153099060059, + "learning_rate": 3.5473482252980677e-06, + "loss": 0.1466, + "step": 47836 + }, + { + "epoch": 3.0488198589256648, + "grad_norm": 3.4458816051483154, + "learning_rate": 3.5472111826778134e-06, + "loss": 0.0611, + "step": 47837 + }, + { + "epoch": 3.0488334237655996, + "grad_norm": 3.6912753582000732, + "learning_rate": 3.547074140057558e-06, + "loss": 0.0576, + "step": 47838 + }, + { + "epoch": 3.0488469886055345, + "grad_norm": 3.7857470512390137, + "learning_rate": 3.5469370974373037e-06, + "loss": 0.1295, + "step": 47839 + }, + { + "epoch": 3.0488605534454694, + "grad_norm": 3.8251683712005615, + "learning_rate": 3.5468000548170485e-06, + "loss": 0.0633, + "step": 47840 + }, + { + "epoch": 3.0488741182854042, + "grad_norm": 3.3973910808563232, + "learning_rate": 3.5466630121967936e-06, + "loss": 0.0732, + "step": 47841 + }, + { + "epoch": 3.048887683125339, + "grad_norm": 3.2277932167053223, + "learning_rate": 3.5465259695765384e-06, + "loss": 0.081, + "step": 47842 + }, + { + "epoch": 3.048901247965274, + "grad_norm": 4.027072429656982, + "learning_rate": 3.5463889269562836e-06, + "loss": 0.1636, + "step": 47843 + }, + { + "epoch": 3.048914812805209, + "grad_norm": 3.0517218112945557, + "learning_rate": 3.5462518843360288e-06, + "loss": 0.1028, + "step": 47844 + }, + { + "epoch": 3.0489283776451437, + "grad_norm": 4.165374279022217, + "learning_rate": 3.5461148417157735e-06, + "loss": 0.1164, + "step": 47845 + }, + { + "epoch": 3.0489419424850785, + "grad_norm": 5.107906341552734, + "learning_rate": 3.545977799095519e-06, + "loss": 0.1245, + "step": 47846 + }, + { + "epoch": 3.0489555073250134, + "grad_norm": 4.926775932312012, + "learning_rate": 3.545840756475264e-06, + "loss": 0.1915, + "step": 47847 + }, + { + "epoch": 3.0489690721649483, + "grad_norm": 3.3453660011291504, + "learning_rate": 3.5457037138550095e-06, + "loss": 0.0651, + "step": 47848 + }, + { + "epoch": 3.048982637004883, + "grad_norm": 3.2569940090179443, + "learning_rate": 3.5455666712347542e-06, + "loss": 0.06, + "step": 47849 + }, + { + "epoch": 3.048996201844818, + "grad_norm": 4.152381420135498, + "learning_rate": 3.5454296286144994e-06, + "loss": 0.1101, + "step": 47850 + }, + { + "epoch": 3.0490097666847533, + "grad_norm": 3.8632116317749023, + "learning_rate": 3.5452925859942446e-06, + "loss": 0.1179, + "step": 47851 + }, + { + "epoch": 3.049023331524688, + "grad_norm": 3.532705783843994, + "learning_rate": 3.5451555433739898e-06, + "loss": 0.1505, + "step": 47852 + }, + { + "epoch": 3.049036896364623, + "grad_norm": 4.530447483062744, + "learning_rate": 3.5450185007537345e-06, + "loss": 0.1188, + "step": 47853 + }, + { + "epoch": 3.049050461204558, + "grad_norm": 5.784629821777344, + "learning_rate": 3.54488145813348e-06, + "loss": 0.1295, + "step": 47854 + }, + { + "epoch": 3.049064026044493, + "grad_norm": 2.5015125274658203, + "learning_rate": 3.544744415513225e-06, + "loss": 0.0515, + "step": 47855 + }, + { + "epoch": 3.0490775908844276, + "grad_norm": 3.8773465156555176, + "learning_rate": 3.5446073728929696e-06, + "loss": 0.065, + "step": 47856 + }, + { + "epoch": 3.0490911557243625, + "grad_norm": 3.051978349685669, + "learning_rate": 3.5444703302727152e-06, + "loss": 0.0474, + "step": 47857 + }, + { + "epoch": 3.0491047205642974, + "grad_norm": 4.015772819519043, + "learning_rate": 3.54433328765246e-06, + "loss": 0.0853, + "step": 47858 + }, + { + "epoch": 3.0491182854042322, + "grad_norm": 5.380717754364014, + "learning_rate": 3.5441962450322056e-06, + "loss": 0.085, + "step": 47859 + }, + { + "epoch": 3.049131850244167, + "grad_norm": 3.9806160926818848, + "learning_rate": 3.5440592024119503e-06, + "loss": 0.0935, + "step": 47860 + }, + { + "epoch": 3.049145415084102, + "grad_norm": 2.204355239868164, + "learning_rate": 3.5439221597916955e-06, + "loss": 0.0359, + "step": 47861 + }, + { + "epoch": 3.049158979924037, + "grad_norm": 2.5664286613464355, + "learning_rate": 3.5437851171714403e-06, + "loss": 0.0711, + "step": 47862 + }, + { + "epoch": 3.0491725447639717, + "grad_norm": 3.5438497066497803, + "learning_rate": 3.543648074551186e-06, + "loss": 0.1163, + "step": 47863 + }, + { + "epoch": 3.0491861096039066, + "grad_norm": 2.843644618988037, + "learning_rate": 3.5435110319309306e-06, + "loss": 0.0669, + "step": 47864 + }, + { + "epoch": 3.0491996744438414, + "grad_norm": 3.5011146068573, + "learning_rate": 3.5433739893106762e-06, + "loss": 0.1209, + "step": 47865 + }, + { + "epoch": 3.0492132392837763, + "grad_norm": 3.034358024597168, + "learning_rate": 3.543236946690421e-06, + "loss": 0.058, + "step": 47866 + }, + { + "epoch": 3.049226804123711, + "grad_norm": 3.3778412342071533, + "learning_rate": 3.543099904070166e-06, + "loss": 0.1267, + "step": 47867 + }, + { + "epoch": 3.049240368963646, + "grad_norm": 4.1721062660217285, + "learning_rate": 3.5429628614499114e-06, + "loss": 0.1247, + "step": 47868 + }, + { + "epoch": 3.0492539338035813, + "grad_norm": 3.034531593322754, + "learning_rate": 3.542825818829656e-06, + "loss": 0.0908, + "step": 47869 + }, + { + "epoch": 3.049267498643516, + "grad_norm": 2.4443657398223877, + "learning_rate": 3.5426887762094013e-06, + "loss": 0.0797, + "step": 47870 + }, + { + "epoch": 3.049281063483451, + "grad_norm": 2.035944938659668, + "learning_rate": 3.5425517335891465e-06, + "loss": 0.0447, + "step": 47871 + }, + { + "epoch": 3.049294628323386, + "grad_norm": 2.947559356689453, + "learning_rate": 3.5424146909688916e-06, + "loss": 0.0622, + "step": 47872 + }, + { + "epoch": 3.049308193163321, + "grad_norm": 2.800128698348999, + "learning_rate": 3.5422776483486364e-06, + "loss": 0.0826, + "step": 47873 + }, + { + "epoch": 3.0493217580032557, + "grad_norm": 2.9522688388824463, + "learning_rate": 3.542140605728382e-06, + "loss": 0.0512, + "step": 47874 + }, + { + "epoch": 3.0493353228431905, + "grad_norm": 6.018420696258545, + "learning_rate": 3.5420035631081268e-06, + "loss": 0.1891, + "step": 47875 + }, + { + "epoch": 3.0493488876831254, + "grad_norm": 2.6964449882507324, + "learning_rate": 3.5418665204878724e-06, + "loss": 0.0433, + "step": 47876 + }, + { + "epoch": 3.0493624525230603, + "grad_norm": 2.8214802742004395, + "learning_rate": 3.541729477867617e-06, + "loss": 0.0503, + "step": 47877 + }, + { + "epoch": 3.049376017362995, + "grad_norm": 4.650803565979004, + "learning_rate": 3.5415924352473623e-06, + "loss": 0.0863, + "step": 47878 + }, + { + "epoch": 3.04938958220293, + "grad_norm": 3.07987380027771, + "learning_rate": 3.541455392627107e-06, + "loss": 0.0597, + "step": 47879 + }, + { + "epoch": 3.049403147042865, + "grad_norm": 2.9047467708587646, + "learning_rate": 3.5413183500068527e-06, + "loss": 0.0338, + "step": 47880 + }, + { + "epoch": 3.0494167118827997, + "grad_norm": 3.1641573905944824, + "learning_rate": 3.5411813073865974e-06, + "loss": 0.0546, + "step": 47881 + }, + { + "epoch": 3.0494302767227346, + "grad_norm": 4.666249752044678, + "learning_rate": 3.541044264766343e-06, + "loss": 0.0479, + "step": 47882 + }, + { + "epoch": 3.0494438415626695, + "grad_norm": 4.833500385284424, + "learning_rate": 3.5409072221460878e-06, + "loss": 0.0688, + "step": 47883 + }, + { + "epoch": 3.0494574064026043, + "grad_norm": 3.7489683628082275, + "learning_rate": 3.5407701795258325e-06, + "loss": 0.0767, + "step": 47884 + }, + { + "epoch": 3.049470971242539, + "grad_norm": 2.6300597190856934, + "learning_rate": 3.540633136905578e-06, + "loss": 0.0362, + "step": 47885 + }, + { + "epoch": 3.049484536082474, + "grad_norm": 4.608031272888184, + "learning_rate": 3.540496094285323e-06, + "loss": 0.0874, + "step": 47886 + }, + { + "epoch": 3.049498100922409, + "grad_norm": 4.269404888153076, + "learning_rate": 3.540359051665068e-06, + "loss": 0.0457, + "step": 47887 + }, + { + "epoch": 3.049511665762344, + "grad_norm": 6.449234485626221, + "learning_rate": 3.5402220090448132e-06, + "loss": 0.1454, + "step": 47888 + }, + { + "epoch": 3.049525230602279, + "grad_norm": 4.146302223205566, + "learning_rate": 3.5400849664245584e-06, + "loss": 0.0812, + "step": 47889 + }, + { + "epoch": 3.049538795442214, + "grad_norm": 2.616779088973999, + "learning_rate": 3.539947923804303e-06, + "loss": 0.0248, + "step": 47890 + }, + { + "epoch": 3.049552360282149, + "grad_norm": 4.0748796463012695, + "learning_rate": 3.5398108811840488e-06, + "loss": 0.1154, + "step": 47891 + }, + { + "epoch": 3.0495659251220837, + "grad_norm": 2.722811460494995, + "learning_rate": 3.5396738385637935e-06, + "loss": 0.0382, + "step": 47892 + }, + { + "epoch": 3.0495794899620186, + "grad_norm": 2.659069299697876, + "learning_rate": 3.539536795943539e-06, + "loss": 0.0599, + "step": 47893 + }, + { + "epoch": 3.0495930548019534, + "grad_norm": 3.00616192817688, + "learning_rate": 3.539399753323284e-06, + "loss": 0.0605, + "step": 47894 + }, + { + "epoch": 3.0496066196418883, + "grad_norm": 4.139433860778809, + "learning_rate": 3.539262710703029e-06, + "loss": 0.1118, + "step": 47895 + }, + { + "epoch": 3.049620184481823, + "grad_norm": 2.4313201904296875, + "learning_rate": 3.539125668082774e-06, + "loss": 0.073, + "step": 47896 + }, + { + "epoch": 3.049633749321758, + "grad_norm": 2.9586942195892334, + "learning_rate": 3.538988625462519e-06, + "loss": 0.0428, + "step": 47897 + }, + { + "epoch": 3.049647314161693, + "grad_norm": 4.212771415710449, + "learning_rate": 3.538851582842264e-06, + "loss": 0.0732, + "step": 47898 + }, + { + "epoch": 3.0496608790016277, + "grad_norm": 3.334043025970459, + "learning_rate": 3.538714540222009e-06, + "loss": 0.0753, + "step": 47899 + }, + { + "epoch": 3.0496744438415626, + "grad_norm": 3.977825880050659, + "learning_rate": 3.5385774976017545e-06, + "loss": 0.1135, + "step": 47900 + }, + { + "epoch": 3.0496880086814975, + "grad_norm": 4.669389724731445, + "learning_rate": 3.5384404549814993e-06, + "loss": 0.1035, + "step": 47901 + }, + { + "epoch": 3.0497015735214323, + "grad_norm": 4.8407416343688965, + "learning_rate": 3.538303412361245e-06, + "loss": 0.1161, + "step": 47902 + }, + { + "epoch": 3.049715138361367, + "grad_norm": 2.7395663261413574, + "learning_rate": 3.5381663697409897e-06, + "loss": 0.0573, + "step": 47903 + }, + { + "epoch": 3.049728703201302, + "grad_norm": 5.5974531173706055, + "learning_rate": 3.538029327120735e-06, + "loss": 0.1439, + "step": 47904 + }, + { + "epoch": 3.049742268041237, + "grad_norm": 4.917092323303223, + "learning_rate": 3.53789228450048e-06, + "loss": 0.1037, + "step": 47905 + }, + { + "epoch": 3.049755832881172, + "grad_norm": 3.2059338092803955, + "learning_rate": 3.537755241880225e-06, + "loss": 0.0851, + "step": 47906 + }, + { + "epoch": 3.049769397721107, + "grad_norm": 4.514588356018066, + "learning_rate": 3.53761819925997e-06, + "loss": 0.084, + "step": 47907 + }, + { + "epoch": 3.049782962561042, + "grad_norm": 4.014796733856201, + "learning_rate": 3.5374811566397155e-06, + "loss": 0.0925, + "step": 47908 + }, + { + "epoch": 3.049796527400977, + "grad_norm": 5.2432122230529785, + "learning_rate": 3.5373441140194603e-06, + "loss": 0.1318, + "step": 47909 + }, + { + "epoch": 3.0498100922409117, + "grad_norm": 3.6981801986694336, + "learning_rate": 3.537207071399206e-06, + "loss": 0.1472, + "step": 47910 + }, + { + "epoch": 3.0498236570808466, + "grad_norm": 4.4618449211120605, + "learning_rate": 3.5370700287789507e-06, + "loss": 0.1084, + "step": 47911 + }, + { + "epoch": 3.0498372219207814, + "grad_norm": 5.142724990844727, + "learning_rate": 3.5369329861586954e-06, + "loss": 0.1774, + "step": 47912 + }, + { + "epoch": 3.0498507867607163, + "grad_norm": 3.9547736644744873, + "learning_rate": 3.5367959435384406e-06, + "loss": 0.0942, + "step": 47913 + }, + { + "epoch": 3.049864351600651, + "grad_norm": 3.3449654579162598, + "learning_rate": 3.5366589009181858e-06, + "loss": 0.0758, + "step": 47914 + }, + { + "epoch": 3.049877916440586, + "grad_norm": 3.9029531478881836, + "learning_rate": 3.536521858297931e-06, + "loss": 0.1589, + "step": 47915 + }, + { + "epoch": 3.049891481280521, + "grad_norm": 2.9652585983276367, + "learning_rate": 3.5363848156776757e-06, + "loss": 0.0413, + "step": 47916 + }, + { + "epoch": 3.0499050461204558, + "grad_norm": 4.121825695037842, + "learning_rate": 3.5362477730574213e-06, + "loss": 0.0932, + "step": 47917 + }, + { + "epoch": 3.0499186109603906, + "grad_norm": 2.5862715244293213, + "learning_rate": 3.536110730437166e-06, + "loss": 0.0761, + "step": 47918 + }, + { + "epoch": 3.0499186109603906, + "eval_loss": 0.3676658570766449, + "eval_noise_accuracy": NaN, + "eval_runtime": 4486.474, + "eval_samples_per_second": 1.12, + "eval_steps_per_second": 0.07, + "eval_wer": 26.054184300763026, + "step": 47918 + }, + { + "epoch": 3.0499321758003255, + "grad_norm": 4.303838729858398, + "learning_rate": 3.5359736878169117e-06, + "loss": 0.1653, + "step": 47919 + }, + { + "epoch": 3.0499457406402604, + "grad_norm": 4.404543876647949, + "learning_rate": 3.5358366451966564e-06, + "loss": 0.1322, + "step": 47920 + }, + { + "epoch": 3.0499593054801952, + "grad_norm": 3.123141050338745, + "learning_rate": 3.5356996025764016e-06, + "loss": 0.0803, + "step": 47921 + }, + { + "epoch": 3.04997287032013, + "grad_norm": 4.493557453155518, + "learning_rate": 3.5355625599561468e-06, + "loss": 0.1876, + "step": 47922 + }, + { + "epoch": 3.049986435160065, + "grad_norm": 5.246953010559082, + "learning_rate": 3.535425517335892e-06, + "loss": 0.1569, + "step": 47923 + }, + { + "epoch": 3.05, + "grad_norm": 4.195197105407715, + "learning_rate": 3.5352884747156367e-06, + "loss": 0.095, + "step": 47924 + }, + { + "epoch": 3.0500135648399347, + "grad_norm": 5.600550651550293, + "learning_rate": 3.535151432095382e-06, + "loss": 0.0713, + "step": 47925 + }, + { + "epoch": 3.0500271296798696, + "grad_norm": 3.074293851852417, + "learning_rate": 3.535014389475127e-06, + "loss": 0.0756, + "step": 47926 + }, + { + "epoch": 3.050040694519805, + "grad_norm": 5.667027950286865, + "learning_rate": 3.534877346854872e-06, + "loss": 0.223, + "step": 47927 + }, + { + "epoch": 3.0500542593597397, + "grad_norm": 3.7494711875915527, + "learning_rate": 3.5347403042346174e-06, + "loss": 0.1377, + "step": 47928 + }, + { + "epoch": 3.0500678241996746, + "grad_norm": 3.6034722328186035, + "learning_rate": 3.534603261614362e-06, + "loss": 0.1421, + "step": 47929 + }, + { + "epoch": 3.0500813890396095, + "grad_norm": 6.2754950523376465, + "learning_rate": 3.5344662189941078e-06, + "loss": 0.3235, + "step": 47930 + }, + { + "epoch": 3.0500949538795443, + "grad_norm": 4.281858921051025, + "learning_rate": 3.5343291763738525e-06, + "loss": 0.1511, + "step": 47931 + }, + { + "epoch": 3.050108518719479, + "grad_norm": 3.6795804500579834, + "learning_rate": 3.5341921337535977e-06, + "loss": 0.11, + "step": 47932 + }, + { + "epoch": 3.050122083559414, + "grad_norm": 5.247788906097412, + "learning_rate": 3.5340550911333425e-06, + "loss": 0.0924, + "step": 47933 + }, + { + "epoch": 3.050135648399349, + "grad_norm": 4.806238651275635, + "learning_rate": 3.533918048513088e-06, + "loss": 0.1563, + "step": 47934 + }, + { + "epoch": 3.050149213239284, + "grad_norm": 4.3632588386535645, + "learning_rate": 3.533781005892833e-06, + "loss": 0.2296, + "step": 47935 + }, + { + "epoch": 3.0501627780792187, + "grad_norm": 3.5330474376678467, + "learning_rate": 3.5336439632725784e-06, + "loss": 0.1766, + "step": 47936 + }, + { + "epoch": 3.0501763429191535, + "grad_norm": 4.1107635498046875, + "learning_rate": 3.533506920652323e-06, + "loss": 0.1275, + "step": 47937 + }, + { + "epoch": 3.0501899077590884, + "grad_norm": 4.357471942901611, + "learning_rate": 3.533369878032068e-06, + "loss": 0.1542, + "step": 47938 + }, + { + "epoch": 3.0502034725990232, + "grad_norm": 5.770273685455322, + "learning_rate": 3.5332328354118136e-06, + "loss": 0.1942, + "step": 47939 + }, + { + "epoch": 3.050217037438958, + "grad_norm": 2.8590989112854004, + "learning_rate": 3.5330957927915583e-06, + "loss": 0.0725, + "step": 47940 + }, + { + "epoch": 3.050230602278893, + "grad_norm": 4.396090030670166, + "learning_rate": 3.5329587501713035e-06, + "loss": 0.2125, + "step": 47941 + }, + { + "epoch": 3.050244167118828, + "grad_norm": 4.401114463806152, + "learning_rate": 3.5328217075510487e-06, + "loss": 0.0722, + "step": 47942 + }, + { + "epoch": 3.0502577319587627, + "grad_norm": 3.3187661170959473, + "learning_rate": 3.532684664930794e-06, + "loss": 0.1284, + "step": 47943 + }, + { + "epoch": 3.0502712967986976, + "grad_norm": 3.8770370483398438, + "learning_rate": 3.5325476223105386e-06, + "loss": 0.0888, + "step": 47944 + }, + { + "epoch": 3.050284861638633, + "grad_norm": 3.778700828552246, + "learning_rate": 3.532410579690284e-06, + "loss": 0.0893, + "step": 47945 + }, + { + "epoch": 3.0502984264785677, + "grad_norm": 4.0475029945373535, + "learning_rate": 3.532273537070029e-06, + "loss": 0.0811, + "step": 47946 + }, + { + "epoch": 3.0503119913185026, + "grad_norm": 5.196780681610107, + "learning_rate": 3.5321364944497746e-06, + "loss": 0.1966, + "step": 47947 + }, + { + "epoch": 3.0503255561584375, + "grad_norm": 5.971695423126221, + "learning_rate": 3.5319994518295193e-06, + "loss": 0.2203, + "step": 47948 + }, + { + "epoch": 3.0503391209983723, + "grad_norm": 3.052598237991333, + "learning_rate": 3.5318624092092645e-06, + "loss": 0.0757, + "step": 47949 + }, + { + "epoch": 3.050352685838307, + "grad_norm": 4.458308219909668, + "learning_rate": 3.5317253665890092e-06, + "loss": 0.0972, + "step": 47950 + }, + { + "epoch": 3.050366250678242, + "grad_norm": 4.364290714263916, + "learning_rate": 3.531588323968755e-06, + "loss": 0.2019, + "step": 47951 + }, + { + "epoch": 3.050379815518177, + "grad_norm": 5.1653313636779785, + "learning_rate": 3.5314512813484996e-06, + "loss": 0.2651, + "step": 47952 + }, + { + "epoch": 3.050393380358112, + "grad_norm": 6.538871765136719, + "learning_rate": 3.5313142387282444e-06, + "loss": 0.2661, + "step": 47953 + }, + { + "epoch": 3.0504069451980467, + "grad_norm": 4.297046184539795, + "learning_rate": 3.53117719610799e-06, + "loss": 0.1957, + "step": 47954 + }, + { + "epoch": 3.0504205100379815, + "grad_norm": 3.222264051437378, + "learning_rate": 3.5310401534877347e-06, + "loss": 0.0904, + "step": 47955 + }, + { + "epoch": 3.0504340748779164, + "grad_norm": 5.145192623138428, + "learning_rate": 3.5309031108674803e-06, + "loss": 0.1139, + "step": 47956 + }, + { + "epoch": 3.0504476397178513, + "grad_norm": 4.125871181488037, + "learning_rate": 3.530766068247225e-06, + "loss": 0.1762, + "step": 47957 + }, + { + "epoch": 3.050461204557786, + "grad_norm": 5.467905044555664, + "learning_rate": 3.5306290256269703e-06, + "loss": 0.1754, + "step": 47958 + }, + { + "epoch": 3.050474769397721, + "grad_norm": 3.7101004123687744, + "learning_rate": 3.5304919830067154e-06, + "loss": 0.1436, + "step": 47959 + }, + { + "epoch": 3.050488334237656, + "grad_norm": 4.168275833129883, + "learning_rate": 3.5303549403864606e-06, + "loss": 0.0771, + "step": 47960 + }, + { + "epoch": 3.0505018990775907, + "grad_norm": 4.252419471740723, + "learning_rate": 3.5302178977662054e-06, + "loss": 0.1205, + "step": 47961 + }, + { + "epoch": 3.0505154639175256, + "grad_norm": 4.246718406677246, + "learning_rate": 3.530080855145951e-06, + "loss": 0.1634, + "step": 47962 + }, + { + "epoch": 3.0505290287574605, + "grad_norm": 3.630049228668213, + "learning_rate": 3.5299438125256957e-06, + "loss": 0.1258, + "step": 47963 + }, + { + "epoch": 3.0505425935973958, + "grad_norm": 6.0629987716674805, + "learning_rate": 3.5298067699054413e-06, + "loss": 0.1764, + "step": 47964 + }, + { + "epoch": 3.0505561584373306, + "grad_norm": 4.992425918579102, + "learning_rate": 3.529669727285186e-06, + "loss": 0.212, + "step": 47965 + }, + { + "epoch": 3.0505697232772655, + "grad_norm": 7.359036445617676, + "learning_rate": 3.529532684664931e-06, + "loss": 0.175, + "step": 47966 + }, + { + "epoch": 3.0505832881172004, + "grad_norm": 4.16422700881958, + "learning_rate": 3.529395642044676e-06, + "loss": 0.1806, + "step": 47967 + }, + { + "epoch": 3.0505968529571352, + "grad_norm": 3.6377546787261963, + "learning_rate": 3.529258599424421e-06, + "loss": 0.1789, + "step": 47968 + }, + { + "epoch": 3.05061041779707, + "grad_norm": 3.8175196647644043, + "learning_rate": 3.5291215568041664e-06, + "loss": 0.134, + "step": 47969 + }, + { + "epoch": 3.050623982637005, + "grad_norm": 5.3295392990112305, + "learning_rate": 3.528984514183911e-06, + "loss": 0.0958, + "step": 47970 + }, + { + "epoch": 3.05063754747694, + "grad_norm": 3.4910402297973633, + "learning_rate": 3.5288474715636567e-06, + "loss": 0.1065, + "step": 47971 + }, + { + "epoch": 3.0506511123168747, + "grad_norm": 3.542830467224121, + "learning_rate": 3.5287104289434015e-06, + "loss": 0.1545, + "step": 47972 + }, + { + "epoch": 3.0506646771568096, + "grad_norm": 4.221230983734131, + "learning_rate": 3.528573386323147e-06, + "loss": 0.1837, + "step": 47973 + }, + { + "epoch": 3.0506782419967444, + "grad_norm": 2.8545045852661133, + "learning_rate": 3.528436343702892e-06, + "loss": 0.0495, + "step": 47974 + }, + { + "epoch": 3.0506918068366793, + "grad_norm": 3.9367010593414307, + "learning_rate": 3.528299301082637e-06, + "loss": 0.0983, + "step": 47975 + }, + { + "epoch": 3.050705371676614, + "grad_norm": 6.146083354949951, + "learning_rate": 3.528162258462382e-06, + "loss": 0.252, + "step": 47976 + }, + { + "epoch": 3.050718936516549, + "grad_norm": 3.2855935096740723, + "learning_rate": 3.5280252158421274e-06, + "loss": 0.1337, + "step": 47977 + }, + { + "epoch": 3.050732501356484, + "grad_norm": 4.0010833740234375, + "learning_rate": 3.527888173221872e-06, + "loss": 0.088, + "step": 47978 + }, + { + "epoch": 3.0507460661964187, + "grad_norm": 4.651163578033447, + "learning_rate": 3.5277511306016177e-06, + "loss": 0.1046, + "step": 47979 + }, + { + "epoch": 3.0507596310363536, + "grad_norm": 4.219958305358887, + "learning_rate": 3.5276140879813625e-06, + "loss": 0.1445, + "step": 47980 + }, + { + "epoch": 3.0507731958762885, + "grad_norm": 4.738506317138672, + "learning_rate": 3.5274770453611073e-06, + "loss": 0.1426, + "step": 47981 + }, + { + "epoch": 3.0507867607162233, + "grad_norm": 3.3087432384490967, + "learning_rate": 3.527340002740853e-06, + "loss": 0.0884, + "step": 47982 + }, + { + "epoch": 3.0508003255561587, + "grad_norm": 4.464295864105225, + "learning_rate": 3.5272029601205976e-06, + "loss": 0.1033, + "step": 47983 + }, + { + "epoch": 3.0508138903960935, + "grad_norm": 4.058941841125488, + "learning_rate": 3.5270659175003428e-06, + "loss": 0.1008, + "step": 47984 + }, + { + "epoch": 3.0508274552360284, + "grad_norm": 4.335707187652588, + "learning_rate": 3.526928874880088e-06, + "loss": 0.0793, + "step": 47985 + }, + { + "epoch": 3.0508410200759633, + "grad_norm": 5.125497817993164, + "learning_rate": 3.526791832259833e-06, + "loss": 0.2103, + "step": 47986 + }, + { + "epoch": 3.050854584915898, + "grad_norm": 4.817554950714111, + "learning_rate": 3.526654789639578e-06, + "loss": 0.146, + "step": 47987 + }, + { + "epoch": 3.050868149755833, + "grad_norm": 4.747632026672363, + "learning_rate": 3.5265177470193235e-06, + "loss": 0.1435, + "step": 47988 + }, + { + "epoch": 3.050881714595768, + "grad_norm": 4.791482448577881, + "learning_rate": 3.5263807043990683e-06, + "loss": 0.1703, + "step": 47989 + }, + { + "epoch": 3.0508952794357027, + "grad_norm": 4.965359687805176, + "learning_rate": 3.526243661778814e-06, + "loss": 0.2388, + "step": 47990 + }, + { + "epoch": 3.0509088442756376, + "grad_norm": 4.261197090148926, + "learning_rate": 3.5261066191585586e-06, + "loss": 0.1533, + "step": 47991 + }, + { + "epoch": 3.0509224091155724, + "grad_norm": 3.464174747467041, + "learning_rate": 3.525969576538304e-06, + "loss": 0.0585, + "step": 47992 + }, + { + "epoch": 3.0509359739555073, + "grad_norm": 3.513993740081787, + "learning_rate": 3.525832533918049e-06, + "loss": 0.1331, + "step": 47993 + }, + { + "epoch": 3.050949538795442, + "grad_norm": 4.869075298309326, + "learning_rate": 3.5256954912977937e-06, + "loss": 0.1612, + "step": 47994 + }, + { + "epoch": 3.050963103635377, + "grad_norm": 6.547517776489258, + "learning_rate": 3.525558448677539e-06, + "loss": 0.1498, + "step": 47995 + }, + { + "epoch": 3.050976668475312, + "grad_norm": 7.250662803649902, + "learning_rate": 3.525421406057284e-06, + "loss": 0.1389, + "step": 47996 + }, + { + "epoch": 3.0509902333152468, + "grad_norm": 4.04797887802124, + "learning_rate": 3.5252843634370293e-06, + "loss": 0.1109, + "step": 47997 + }, + { + "epoch": 3.0510037981551816, + "grad_norm": 5.013151168823242, + "learning_rate": 3.525147320816774e-06, + "loss": 0.1765, + "step": 47998 + }, + { + "epoch": 3.0510173629951165, + "grad_norm": 4.095651149749756, + "learning_rate": 3.5250102781965196e-06, + "loss": 0.1425, + "step": 47999 + }, + { + "epoch": 3.0510309278350514, + "grad_norm": 3.527984619140625, + "learning_rate": 3.5248732355762644e-06, + "loss": 0.0909, + "step": 48000 + }, + { + "epoch": 3.0510444926749862, + "grad_norm": 4.661745071411133, + "learning_rate": 3.52473619295601e-06, + "loss": 0.2083, + "step": 48001 + }, + { + "epoch": 3.0510580575149215, + "grad_norm": 3.936337947845459, + "learning_rate": 3.5245991503357547e-06, + "loss": 0.1222, + "step": 48002 + }, + { + "epoch": 3.0510716223548564, + "grad_norm": 4.969940662384033, + "learning_rate": 3.5244621077155e-06, + "loss": 0.1563, + "step": 48003 + }, + { + "epoch": 3.0510851871947913, + "grad_norm": 4.337571620941162, + "learning_rate": 3.5243250650952447e-06, + "loss": 0.1219, + "step": 48004 + }, + { + "epoch": 3.051098752034726, + "grad_norm": 6.296539783477783, + "learning_rate": 3.5241880224749903e-06, + "loss": 0.2101, + "step": 48005 + }, + { + "epoch": 3.051112316874661, + "grad_norm": 4.535722732543945, + "learning_rate": 3.524050979854735e-06, + "loss": 0.1576, + "step": 48006 + }, + { + "epoch": 3.051125881714596, + "grad_norm": 4.575758457183838, + "learning_rate": 3.5239139372344798e-06, + "loss": 0.1647, + "step": 48007 + }, + { + "epoch": 3.0511394465545307, + "grad_norm": 5.107469081878662, + "learning_rate": 3.5237768946142254e-06, + "loss": 0.1374, + "step": 48008 + }, + { + "epoch": 3.0511530113944656, + "grad_norm": 5.7576189041137695, + "learning_rate": 3.52363985199397e-06, + "loss": 0.1571, + "step": 48009 + }, + { + "epoch": 3.0511665762344005, + "grad_norm": 4.3423261642456055, + "learning_rate": 3.5235028093737157e-06, + "loss": 0.1207, + "step": 48010 + }, + { + "epoch": 3.0511801410743353, + "grad_norm": 5.184204578399658, + "learning_rate": 3.5233657667534605e-06, + "loss": 0.155, + "step": 48011 + }, + { + "epoch": 3.05119370591427, + "grad_norm": 4.819862365722656, + "learning_rate": 3.5232287241332057e-06, + "loss": 0.1071, + "step": 48012 + }, + { + "epoch": 3.051207270754205, + "grad_norm": 6.339678764343262, + "learning_rate": 3.523091681512951e-06, + "loss": 0.1201, + "step": 48013 + }, + { + "epoch": 3.05122083559414, + "grad_norm": 3.8345882892608643, + "learning_rate": 3.522954638892696e-06, + "loss": 0.135, + "step": 48014 + }, + { + "epoch": 3.051234400434075, + "grad_norm": 5.331929683685303, + "learning_rate": 3.522817596272441e-06, + "loss": 0.1881, + "step": 48015 + }, + { + "epoch": 3.0512479652740097, + "grad_norm": 4.552523136138916, + "learning_rate": 3.5226805536521864e-06, + "loss": 0.0895, + "step": 48016 + }, + { + "epoch": 3.0512615301139445, + "grad_norm": 5.045965194702148, + "learning_rate": 3.522543511031931e-06, + "loss": 0.1657, + "step": 48017 + }, + { + "epoch": 3.0512750949538794, + "grad_norm": 3.2860095500946045, + "learning_rate": 3.5224064684116768e-06, + "loss": 0.1415, + "step": 48018 + }, + { + "epoch": 3.0512886597938143, + "grad_norm": 4.545428276062012, + "learning_rate": 3.5222694257914215e-06, + "loss": 0.1375, + "step": 48019 + }, + { + "epoch": 3.051302224633749, + "grad_norm": 3.7344250679016113, + "learning_rate": 3.5221323831711667e-06, + "loss": 0.1263, + "step": 48020 + }, + { + "epoch": 3.0513157894736844, + "grad_norm": 3.396740436553955, + "learning_rate": 3.5219953405509114e-06, + "loss": 0.0941, + "step": 48021 + }, + { + "epoch": 3.0513293543136193, + "grad_norm": 5.027937412261963, + "learning_rate": 3.5218582979306566e-06, + "loss": 0.1391, + "step": 48022 + }, + { + "epoch": 3.051342919153554, + "grad_norm": 3.486966133117676, + "learning_rate": 3.521721255310402e-06, + "loss": 0.0901, + "step": 48023 + }, + { + "epoch": 3.051356483993489, + "grad_norm": 3.7338576316833496, + "learning_rate": 3.5215842126901466e-06, + "loss": 0.099, + "step": 48024 + }, + { + "epoch": 3.051370048833424, + "grad_norm": 3.556790590286255, + "learning_rate": 3.521447170069892e-06, + "loss": 0.0932, + "step": 48025 + }, + { + "epoch": 3.0513836136733588, + "grad_norm": 4.221817493438721, + "learning_rate": 3.521310127449637e-06, + "loss": 0.0683, + "step": 48026 + }, + { + "epoch": 3.0513971785132936, + "grad_norm": 4.432433605194092, + "learning_rate": 3.5211730848293825e-06, + "loss": 0.1446, + "step": 48027 + }, + { + "epoch": 3.0514107433532285, + "grad_norm": 6.145651817321777, + "learning_rate": 3.5210360422091273e-06, + "loss": 0.191, + "step": 48028 + }, + { + "epoch": 3.0514243081931633, + "grad_norm": 3.3933446407318115, + "learning_rate": 3.5208989995888724e-06, + "loss": 0.1001, + "step": 48029 + }, + { + "epoch": 3.051437873033098, + "grad_norm": 4.648515701293945, + "learning_rate": 3.5207619569686176e-06, + "loss": 0.1092, + "step": 48030 + }, + { + "epoch": 3.051451437873033, + "grad_norm": 3.540776252746582, + "learning_rate": 3.520624914348363e-06, + "loss": 0.0955, + "step": 48031 + }, + { + "epoch": 3.051465002712968, + "grad_norm": 3.7991628646850586, + "learning_rate": 3.5204878717281076e-06, + "loss": 0.117, + "step": 48032 + }, + { + "epoch": 3.051478567552903, + "grad_norm": 5.036059856414795, + "learning_rate": 3.520350829107853e-06, + "loss": 0.086, + "step": 48033 + }, + { + "epoch": 3.0514921323928377, + "grad_norm": 3.0192267894744873, + "learning_rate": 3.520213786487598e-06, + "loss": 0.0564, + "step": 48034 + }, + { + "epoch": 3.0515056972327725, + "grad_norm": 4.059959411621094, + "learning_rate": 3.5200767438673427e-06, + "loss": 0.1442, + "step": 48035 + }, + { + "epoch": 3.0515192620727074, + "grad_norm": 4.265510082244873, + "learning_rate": 3.5199397012470883e-06, + "loss": 0.1116, + "step": 48036 + }, + { + "epoch": 3.0515328269126423, + "grad_norm": 4.598901271820068, + "learning_rate": 3.519802658626833e-06, + "loss": 0.167, + "step": 48037 + }, + { + "epoch": 3.051546391752577, + "grad_norm": 4.345268249511719, + "learning_rate": 3.5196656160065782e-06, + "loss": 0.0932, + "step": 48038 + }, + { + "epoch": 3.051559956592512, + "grad_norm": 10.035805702209473, + "learning_rate": 3.5195285733863234e-06, + "loss": 0.1245, + "step": 48039 + }, + { + "epoch": 3.0515735214324473, + "grad_norm": 3.4440650939941406, + "learning_rate": 3.5193915307660686e-06, + "loss": 0.0854, + "step": 48040 + }, + { + "epoch": 3.051587086272382, + "grad_norm": 4.856942176818848, + "learning_rate": 3.5192544881458133e-06, + "loss": 0.0868, + "step": 48041 + }, + { + "epoch": 3.051600651112317, + "grad_norm": 4.244924545288086, + "learning_rate": 3.519117445525559e-06, + "loss": 0.1423, + "step": 48042 + }, + { + "epoch": 3.051614215952252, + "grad_norm": 4.380073547363281, + "learning_rate": 3.5189804029053037e-06, + "loss": 0.1204, + "step": 48043 + }, + { + "epoch": 3.0516277807921868, + "grad_norm": 3.45129132270813, + "learning_rate": 3.5188433602850493e-06, + "loss": 0.0533, + "step": 48044 + }, + { + "epoch": 3.0516413456321216, + "grad_norm": 4.970003128051758, + "learning_rate": 3.518706317664794e-06, + "loss": 0.1142, + "step": 48045 + }, + { + "epoch": 3.0516549104720565, + "grad_norm": 3.285639524459839, + "learning_rate": 3.5185692750445392e-06, + "loss": 0.0609, + "step": 48046 + }, + { + "epoch": 3.0516684753119914, + "grad_norm": 5.2846999168396, + "learning_rate": 3.5184322324242844e-06, + "loss": 0.1536, + "step": 48047 + }, + { + "epoch": 3.0516820401519262, + "grad_norm": 3.535320520401001, + "learning_rate": 3.518295189804029e-06, + "loss": 0.0989, + "step": 48048 + }, + { + "epoch": 3.051695604991861, + "grad_norm": 5.424454689025879, + "learning_rate": 3.5181581471837743e-06, + "loss": 0.1609, + "step": 48049 + }, + { + "epoch": 3.051709169831796, + "grad_norm": 6.57574462890625, + "learning_rate": 3.5180211045635195e-06, + "loss": 0.1693, + "step": 48050 + }, + { + "epoch": 3.051722734671731, + "grad_norm": 3.5932719707489014, + "learning_rate": 3.5178840619432647e-06, + "loss": 0.1019, + "step": 48051 + }, + { + "epoch": 3.0517362995116657, + "grad_norm": 4.236029624938965, + "learning_rate": 3.5177470193230094e-06, + "loss": 0.0652, + "step": 48052 + }, + { + "epoch": 3.0517498643516006, + "grad_norm": 3.7180097103118896, + "learning_rate": 3.517609976702755e-06, + "loss": 0.133, + "step": 48053 + }, + { + "epoch": 3.0517634291915354, + "grad_norm": 3.3586344718933105, + "learning_rate": 3.5174729340825e-06, + "loss": 0.0847, + "step": 48054 + }, + { + "epoch": 3.0517769940314703, + "grad_norm": 2.58042573928833, + "learning_rate": 3.517335891462245e-06, + "loss": 0.1107, + "step": 48055 + }, + { + "epoch": 3.051790558871405, + "grad_norm": 4.1233439445495605, + "learning_rate": 3.51719884884199e-06, + "loss": 0.2047, + "step": 48056 + }, + { + "epoch": 3.05180412371134, + "grad_norm": 4.12664794921875, + "learning_rate": 3.5170618062217353e-06, + "loss": 0.0911, + "step": 48057 + }, + { + "epoch": 3.051817688551275, + "grad_norm": 2.9967780113220215, + "learning_rate": 3.51692476360148e-06, + "loss": 0.0989, + "step": 48058 + }, + { + "epoch": 3.05183125339121, + "grad_norm": 5.295865058898926, + "learning_rate": 3.5167877209812257e-06, + "loss": 0.1907, + "step": 48059 + }, + { + "epoch": 3.051844818231145, + "grad_norm": 6.1873884201049805, + "learning_rate": 3.5166506783609705e-06, + "loss": 0.1484, + "step": 48060 + }, + { + "epoch": 3.05185838307108, + "grad_norm": 5.395339488983154, + "learning_rate": 3.516513635740716e-06, + "loss": 0.139, + "step": 48061 + }, + { + "epoch": 3.051871947911015, + "grad_norm": 4.468459606170654, + "learning_rate": 3.516376593120461e-06, + "loss": 0.1003, + "step": 48062 + }, + { + "epoch": 3.0518855127509497, + "grad_norm": 3.557448625564575, + "learning_rate": 3.5162395505002056e-06, + "loss": 0.1534, + "step": 48063 + }, + { + "epoch": 3.0518990775908845, + "grad_norm": 3.823469400405884, + "learning_rate": 3.516102507879951e-06, + "loss": 0.0758, + "step": 48064 + }, + { + "epoch": 3.0519126424308194, + "grad_norm": 3.662976026535034, + "learning_rate": 3.515965465259696e-06, + "loss": 0.126, + "step": 48065 + }, + { + "epoch": 3.0519262072707543, + "grad_norm": 3.255342721939087, + "learning_rate": 3.515828422639441e-06, + "loss": 0.1194, + "step": 48066 + }, + { + "epoch": 3.051939772110689, + "grad_norm": 5.217485427856445, + "learning_rate": 3.5156913800191863e-06, + "loss": 0.1128, + "step": 48067 + }, + { + "epoch": 3.051953336950624, + "grad_norm": 6.722712993621826, + "learning_rate": 3.5155543373989315e-06, + "loss": 0.1747, + "step": 48068 + }, + { + "epoch": 3.051966901790559, + "grad_norm": 5.736660480499268, + "learning_rate": 3.5154172947786762e-06, + "loss": 0.127, + "step": 48069 + }, + { + "epoch": 3.0519804666304937, + "grad_norm": 3.8607022762298584, + "learning_rate": 3.515280252158422e-06, + "loss": 0.1074, + "step": 48070 + }, + { + "epoch": 3.0519940314704286, + "grad_norm": 4.903855800628662, + "learning_rate": 3.5151432095381666e-06, + "loss": 0.1049, + "step": 48071 + }, + { + "epoch": 3.0520075963103634, + "grad_norm": 5.525363922119141, + "learning_rate": 3.515006166917912e-06, + "loss": 0.14, + "step": 48072 + }, + { + "epoch": 3.0520211611502983, + "grad_norm": 17.406023025512695, + "learning_rate": 3.514869124297657e-06, + "loss": 0.1443, + "step": 48073 + }, + { + "epoch": 3.052034725990233, + "grad_norm": 3.5914549827575684, + "learning_rate": 3.514732081677402e-06, + "loss": 0.1027, + "step": 48074 + }, + { + "epoch": 3.052048290830168, + "grad_norm": 2.808952808380127, + "learning_rate": 3.514595039057147e-06, + "loss": 0.0555, + "step": 48075 + }, + { + "epoch": 3.052061855670103, + "grad_norm": 4.652855396270752, + "learning_rate": 3.514457996436892e-06, + "loss": 0.1094, + "step": 48076 + }, + { + "epoch": 3.0520754205100378, + "grad_norm": 3.1486942768096924, + "learning_rate": 3.5143209538166372e-06, + "loss": 0.1313, + "step": 48077 + }, + { + "epoch": 3.052088985349973, + "grad_norm": 3.5100185871124268, + "learning_rate": 3.514183911196382e-06, + "loss": 0.1092, + "step": 48078 + }, + { + "epoch": 3.052102550189908, + "grad_norm": 4.082460403442383, + "learning_rate": 3.5140468685761276e-06, + "loss": 0.0767, + "step": 48079 + }, + { + "epoch": 3.052116115029843, + "grad_norm": 3.029320001602173, + "learning_rate": 3.5139098259558723e-06, + "loss": 0.073, + "step": 48080 + }, + { + "epoch": 3.0521296798697777, + "grad_norm": 5.0418243408203125, + "learning_rate": 3.513772783335618e-06, + "loss": 0.2571, + "step": 48081 + }, + { + "epoch": 3.0521432447097125, + "grad_norm": 4.649621486663818, + "learning_rate": 3.5136357407153627e-06, + "loss": 0.0555, + "step": 48082 + }, + { + "epoch": 3.0521568095496474, + "grad_norm": 5.024406433105469, + "learning_rate": 3.513498698095108e-06, + "loss": 0.1818, + "step": 48083 + }, + { + "epoch": 3.0521703743895823, + "grad_norm": 5.297597885131836, + "learning_rate": 3.513361655474853e-06, + "loss": 0.1577, + "step": 48084 + }, + { + "epoch": 3.052183939229517, + "grad_norm": 5.6549224853515625, + "learning_rate": 3.5132246128545982e-06, + "loss": 0.0625, + "step": 48085 + }, + { + "epoch": 3.052197504069452, + "grad_norm": 4.636375427246094, + "learning_rate": 3.513087570234343e-06, + "loss": 0.1498, + "step": 48086 + }, + { + "epoch": 3.052211068909387, + "grad_norm": 3.420011043548584, + "learning_rate": 3.5129505276140886e-06, + "loss": 0.0615, + "step": 48087 + }, + { + "epoch": 3.0522246337493217, + "grad_norm": 3.9060158729553223, + "learning_rate": 3.5128134849938333e-06, + "loss": 0.1113, + "step": 48088 + }, + { + "epoch": 3.0522381985892566, + "grad_norm": 4.2664642333984375, + "learning_rate": 3.512676442373579e-06, + "loss": 0.0541, + "step": 48089 + }, + { + "epoch": 3.0522517634291915, + "grad_norm": 3.6606011390686035, + "learning_rate": 3.5125393997533237e-06, + "loss": 0.1162, + "step": 48090 + }, + { + "epoch": 3.0522653282691263, + "grad_norm": 6.016322135925293, + "learning_rate": 3.5124023571330685e-06, + "loss": 0.1863, + "step": 48091 + }, + { + "epoch": 3.052278893109061, + "grad_norm": 4.45436429977417, + "learning_rate": 3.5122653145128136e-06, + "loss": 0.1122, + "step": 48092 + }, + { + "epoch": 3.052292457948996, + "grad_norm": 3.8359334468841553, + "learning_rate": 3.512128271892559e-06, + "loss": 0.0716, + "step": 48093 + }, + { + "epoch": 3.052306022788931, + "grad_norm": 3.8018970489501953, + "learning_rate": 3.511991229272304e-06, + "loss": 0.1283, + "step": 48094 + }, + { + "epoch": 3.052319587628866, + "grad_norm": 3.749382972717285, + "learning_rate": 3.5118541866520488e-06, + "loss": 0.1041, + "step": 48095 + }, + { + "epoch": 3.0523331524688007, + "grad_norm": 4.497142314910889, + "learning_rate": 3.5117171440317944e-06, + "loss": 0.1694, + "step": 48096 + }, + { + "epoch": 3.052346717308736, + "grad_norm": 4.025882720947266, + "learning_rate": 3.511580101411539e-06, + "loss": 0.1332, + "step": 48097 + }, + { + "epoch": 3.052360282148671, + "grad_norm": 4.099175453186035, + "learning_rate": 3.5114430587912847e-06, + "loss": 0.1629, + "step": 48098 + }, + { + "epoch": 3.0523738469886057, + "grad_norm": 5.008094787597656, + "learning_rate": 3.5113060161710295e-06, + "loss": 0.0988, + "step": 48099 + }, + { + "epoch": 3.0523874118285406, + "grad_norm": 3.7018003463745117, + "learning_rate": 3.5111689735507746e-06, + "loss": 0.1244, + "step": 48100 + }, + { + "epoch": 3.0524009766684754, + "grad_norm": 4.149724006652832, + "learning_rate": 3.51103193093052e-06, + "loss": 0.1062, + "step": 48101 + }, + { + "epoch": 3.0524145415084103, + "grad_norm": 3.7661283016204834, + "learning_rate": 3.510894888310265e-06, + "loss": 0.1127, + "step": 48102 + }, + { + "epoch": 3.052428106348345, + "grad_norm": 4.94805908203125, + "learning_rate": 3.5107578456900098e-06, + "loss": 0.1193, + "step": 48103 + }, + { + "epoch": 3.05244167118828, + "grad_norm": 3.2365312576293945, + "learning_rate": 3.5106208030697545e-06, + "loss": 0.1078, + "step": 48104 + }, + { + "epoch": 3.052455236028215, + "grad_norm": 5.757606029510498, + "learning_rate": 3.5104837604495e-06, + "loss": 0.1776, + "step": 48105 + }, + { + "epoch": 3.0524688008681498, + "grad_norm": 4.771509647369385, + "learning_rate": 3.510346717829245e-06, + "loss": 0.1566, + "step": 48106 + }, + { + "epoch": 3.0524823657080846, + "grad_norm": 5.640839099884033, + "learning_rate": 3.5102096752089905e-06, + "loss": 0.1554, + "step": 48107 + }, + { + "epoch": 3.0524959305480195, + "grad_norm": 3.8305230140686035, + "learning_rate": 3.5100726325887352e-06, + "loss": 0.1312, + "step": 48108 + }, + { + "epoch": 3.0525094953879544, + "grad_norm": 3.5041675567626953, + "learning_rate": 3.5099355899684804e-06, + "loss": 0.1286, + "step": 48109 + }, + { + "epoch": 3.052523060227889, + "grad_norm": 3.457200050354004, + "learning_rate": 3.5097985473482256e-06, + "loss": 0.103, + "step": 48110 + }, + { + "epoch": 3.052536625067824, + "grad_norm": 3.4181034564971924, + "learning_rate": 3.5096615047279708e-06, + "loss": 0.077, + "step": 48111 + }, + { + "epoch": 3.052550189907759, + "grad_norm": 4.712477207183838, + "learning_rate": 3.5095244621077155e-06, + "loss": 0.1219, + "step": 48112 + }, + { + "epoch": 3.052563754747694, + "grad_norm": 3.906020164489746, + "learning_rate": 3.509387419487461e-06, + "loss": 0.0898, + "step": 48113 + }, + { + "epoch": 3.0525773195876287, + "grad_norm": 5.344051361083984, + "learning_rate": 3.509250376867206e-06, + "loss": 0.1018, + "step": 48114 + }, + { + "epoch": 3.0525908844275635, + "grad_norm": 4.141454219818115, + "learning_rate": 3.5091133342469515e-06, + "loss": 0.1717, + "step": 48115 + }, + { + "epoch": 3.052604449267499, + "grad_norm": 5.481645107269287, + "learning_rate": 3.5089762916266962e-06, + "loss": 0.17, + "step": 48116 + }, + { + "epoch": 3.0526180141074337, + "grad_norm": 4.274497032165527, + "learning_rate": 3.508839249006441e-06, + "loss": 0.074, + "step": 48117 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 4.383728504180908, + "learning_rate": 3.5087022063861866e-06, + "loss": 0.1385, + "step": 48118 + }, + { + "epoch": 3.0526451437873035, + "grad_norm": 4.267404556274414, + "learning_rate": 3.5085651637659313e-06, + "loss": 0.0863, + "step": 48119 + }, + { + "epoch": 3.0526587086272383, + "grad_norm": 4.568194389343262, + "learning_rate": 3.5084281211456765e-06, + "loss": 0.0867, + "step": 48120 + }, + { + "epoch": 3.052672273467173, + "grad_norm": 4.640806674957275, + "learning_rate": 3.5082910785254217e-06, + "loss": 0.155, + "step": 48121 + }, + { + "epoch": 3.052685838307108, + "grad_norm": 6.037110805511475, + "learning_rate": 3.508154035905167e-06, + "loss": 0.2205, + "step": 48122 + }, + { + "epoch": 3.052699403147043, + "grad_norm": 5.396743297576904, + "learning_rate": 3.5080169932849116e-06, + "loss": 0.1181, + "step": 48123 + }, + { + "epoch": 3.0527129679869778, + "grad_norm": 5.296169757843018, + "learning_rate": 3.5078799506646572e-06, + "loss": 0.1505, + "step": 48124 + }, + { + "epoch": 3.0527265328269126, + "grad_norm": 3.5307300090789795, + "learning_rate": 3.507742908044402e-06, + "loss": 0.0728, + "step": 48125 + }, + { + "epoch": 3.0527400976668475, + "grad_norm": 5.274941921234131, + "learning_rate": 3.507605865424147e-06, + "loss": 0.1107, + "step": 48126 + }, + { + "epoch": 3.0527536625067824, + "grad_norm": 4.850125789642334, + "learning_rate": 3.5074688228038924e-06, + "loss": 0.1158, + "step": 48127 + }, + { + "epoch": 3.0527672273467172, + "grad_norm": 6.26255989074707, + "learning_rate": 3.5073317801836375e-06, + "loss": 0.255, + "step": 48128 + }, + { + "epoch": 3.052780792186652, + "grad_norm": 5.011120319366455, + "learning_rate": 3.5071947375633823e-06, + "loss": 0.1414, + "step": 48129 + }, + { + "epoch": 3.052794357026587, + "grad_norm": 4.353614807128906, + "learning_rate": 3.507057694943128e-06, + "loss": 0.093, + "step": 48130 + }, + { + "epoch": 3.052807921866522, + "grad_norm": 4.759771823883057, + "learning_rate": 3.5069206523228726e-06, + "loss": 0.1052, + "step": 48131 + }, + { + "epoch": 3.0528214867064567, + "grad_norm": 3.793186664581299, + "learning_rate": 3.5067836097026174e-06, + "loss": 0.1643, + "step": 48132 + }, + { + "epoch": 3.0528350515463916, + "grad_norm": 4.3519206047058105, + "learning_rate": 3.506646567082363e-06, + "loss": 0.1415, + "step": 48133 + }, + { + "epoch": 3.0528486163863264, + "grad_norm": 5.170506477355957, + "learning_rate": 3.5065095244621078e-06, + "loss": 0.1541, + "step": 48134 + }, + { + "epoch": 3.0528621812262617, + "grad_norm": 4.739335060119629, + "learning_rate": 3.5063724818418534e-06, + "loss": 0.093, + "step": 48135 + }, + { + "epoch": 3.0528757460661966, + "grad_norm": 3.125626564025879, + "learning_rate": 3.506235439221598e-06, + "loss": 0.0518, + "step": 48136 + }, + { + "epoch": 3.0528893109061315, + "grad_norm": 5.923468589782715, + "learning_rate": 3.5060983966013433e-06, + "loss": 0.1906, + "step": 48137 + }, + { + "epoch": 3.0529028757460663, + "grad_norm": 5.569023132324219, + "learning_rate": 3.5059613539810885e-06, + "loss": 0.1668, + "step": 48138 + }, + { + "epoch": 3.052916440586001, + "grad_norm": 5.03473424911499, + "learning_rate": 3.5058243113608337e-06, + "loss": 0.1693, + "step": 48139 + }, + { + "epoch": 3.052930005425936, + "grad_norm": 4.900145530700684, + "learning_rate": 3.5056872687405784e-06, + "loss": 0.1201, + "step": 48140 + }, + { + "epoch": 3.052943570265871, + "grad_norm": 4.666340351104736, + "learning_rate": 3.505550226120324e-06, + "loss": 0.1011, + "step": 48141 + }, + { + "epoch": 3.052957135105806, + "grad_norm": 5.334585666656494, + "learning_rate": 3.5054131835000688e-06, + "loss": 0.1967, + "step": 48142 + }, + { + "epoch": 3.0529706999457407, + "grad_norm": 4.955873489379883, + "learning_rate": 3.505276140879814e-06, + "loss": 0.1107, + "step": 48143 + }, + { + "epoch": 3.0529842647856755, + "grad_norm": 4.09614896774292, + "learning_rate": 3.505139098259559e-06, + "loss": 0.0929, + "step": 48144 + }, + { + "epoch": 3.0529978296256104, + "grad_norm": 3.3519225120544434, + "learning_rate": 3.505002055639304e-06, + "loss": 0.0702, + "step": 48145 + }, + { + "epoch": 3.0530113944655453, + "grad_norm": 5.94588565826416, + "learning_rate": 3.504865013019049e-06, + "loss": 0.1072, + "step": 48146 + }, + { + "epoch": 3.05302495930548, + "grad_norm": 6.03890323638916, + "learning_rate": 3.5047279703987942e-06, + "loss": 0.1704, + "step": 48147 + }, + { + "epoch": 3.053038524145415, + "grad_norm": 3.559041738510132, + "learning_rate": 3.5045909277785394e-06, + "loss": 0.1172, + "step": 48148 + }, + { + "epoch": 3.05305208898535, + "grad_norm": 4.434334754943848, + "learning_rate": 3.504453885158284e-06, + "loss": 0.0735, + "step": 48149 + }, + { + "epoch": 3.0530656538252847, + "grad_norm": 4.67458963394165, + "learning_rate": 3.5043168425380298e-06, + "loss": 0.153, + "step": 48150 + }, + { + "epoch": 3.0530792186652196, + "grad_norm": 3.828950881958008, + "learning_rate": 3.5041797999177745e-06, + "loss": 0.1066, + "step": 48151 + }, + { + "epoch": 3.0530927835051545, + "grad_norm": 5.325812816619873, + "learning_rate": 3.50404275729752e-06, + "loss": 0.14, + "step": 48152 + }, + { + "epoch": 3.0531063483450893, + "grad_norm": 3.40453839302063, + "learning_rate": 3.503905714677265e-06, + "loss": 0.0719, + "step": 48153 + }, + { + "epoch": 3.0531199131850246, + "grad_norm": 5.077463626861572, + "learning_rate": 3.50376867205701e-06, + "loss": 0.1145, + "step": 48154 + }, + { + "epoch": 3.0531334780249595, + "grad_norm": 4.485071182250977, + "learning_rate": 3.5036316294367552e-06, + "loss": 0.1448, + "step": 48155 + }, + { + "epoch": 3.0531470428648944, + "grad_norm": 5.275511264801025, + "learning_rate": 3.5034945868165004e-06, + "loss": 0.1457, + "step": 48156 + }, + { + "epoch": 3.053160607704829, + "grad_norm": 4.492672443389893, + "learning_rate": 3.503357544196245e-06, + "loss": 0.1105, + "step": 48157 + }, + { + "epoch": 3.053174172544764, + "grad_norm": 7.742552757263184, + "learning_rate": 3.5032205015759908e-06, + "loss": 0.1289, + "step": 48158 + }, + { + "epoch": 3.053187737384699, + "grad_norm": 4.829555034637451, + "learning_rate": 3.5030834589557355e-06, + "loss": 0.1534, + "step": 48159 + }, + { + "epoch": 3.053201302224634, + "grad_norm": 5.125940799713135, + "learning_rate": 3.5029464163354803e-06, + "loss": 0.1173, + "step": 48160 + }, + { + "epoch": 3.0532148670645687, + "grad_norm": 4.028138160705566, + "learning_rate": 3.502809373715226e-06, + "loss": 0.0763, + "step": 48161 + }, + { + "epoch": 3.0532284319045035, + "grad_norm": 5.182250499725342, + "learning_rate": 3.5026723310949707e-06, + "loss": 0.1075, + "step": 48162 + }, + { + "epoch": 3.0532419967444384, + "grad_norm": 4.443206310272217, + "learning_rate": 3.502535288474716e-06, + "loss": 0.1005, + "step": 48163 + }, + { + "epoch": 3.0532555615843733, + "grad_norm": 4.258150577545166, + "learning_rate": 3.502398245854461e-06, + "loss": 0.0839, + "step": 48164 + }, + { + "epoch": 3.053269126424308, + "grad_norm": 5.974291801452637, + "learning_rate": 3.502261203234206e-06, + "loss": 0.111, + "step": 48165 + }, + { + "epoch": 3.053282691264243, + "grad_norm": 5.586116790771484, + "learning_rate": 3.502124160613951e-06, + "loss": 0.1375, + "step": 48166 + }, + { + "epoch": 3.053296256104178, + "grad_norm": 5.70380163192749, + "learning_rate": 3.5019871179936965e-06, + "loss": 0.1756, + "step": 48167 + }, + { + "epoch": 3.0533098209441127, + "grad_norm": 4.49208402633667, + "learning_rate": 3.5018500753734413e-06, + "loss": 0.1959, + "step": 48168 + }, + { + "epoch": 3.0533233857840476, + "grad_norm": 4.189204216003418, + "learning_rate": 3.501713032753187e-06, + "loss": 0.1433, + "step": 48169 + }, + { + "epoch": 3.0533369506239825, + "grad_norm": 5.484205722808838, + "learning_rate": 3.5015759901329317e-06, + "loss": 0.1967, + "step": 48170 + }, + { + "epoch": 3.0533505154639173, + "grad_norm": 4.172648906707764, + "learning_rate": 3.501438947512677e-06, + "loss": 0.0875, + "step": 48171 + }, + { + "epoch": 3.053364080303852, + "grad_norm": 4.473733901977539, + "learning_rate": 3.501301904892422e-06, + "loss": 0.1229, + "step": 48172 + }, + { + "epoch": 3.0533776451437875, + "grad_norm": 4.282706260681152, + "learning_rate": 3.5011648622721668e-06, + "loss": 0.1399, + "step": 48173 + }, + { + "epoch": 3.0533912099837224, + "grad_norm": 4.6590399742126465, + "learning_rate": 3.501027819651912e-06, + "loss": 0.1558, + "step": 48174 + }, + { + "epoch": 3.0534047748236572, + "grad_norm": 4.422346115112305, + "learning_rate": 3.5008907770316567e-06, + "loss": 0.1218, + "step": 48175 + }, + { + "epoch": 3.053418339663592, + "grad_norm": 4.993782043457031, + "learning_rate": 3.5007537344114023e-06, + "loss": 0.11, + "step": 48176 + }, + { + "epoch": 3.053431904503527, + "grad_norm": 4.044743537902832, + "learning_rate": 3.500616691791147e-06, + "loss": 0.1311, + "step": 48177 + }, + { + "epoch": 3.053445469343462, + "grad_norm": 6.094583988189697, + "learning_rate": 3.5004796491708927e-06, + "loss": 0.1574, + "step": 48178 + }, + { + "epoch": 3.0534590341833967, + "grad_norm": 4.073729515075684, + "learning_rate": 3.5003426065506374e-06, + "loss": 0.0751, + "step": 48179 + }, + { + "epoch": 3.0534725990233316, + "grad_norm": 6.184235095977783, + "learning_rate": 3.5002055639303826e-06, + "loss": 0.2506, + "step": 48180 + }, + { + "epoch": 3.0534861638632664, + "grad_norm": 3.861349105834961, + "learning_rate": 3.5000685213101278e-06, + "loss": 0.1587, + "step": 48181 + }, + { + "epoch": 3.0534997287032013, + "grad_norm": 4.390016555786133, + "learning_rate": 3.499931478689873e-06, + "loss": 0.1018, + "step": 48182 + }, + { + "epoch": 3.053513293543136, + "grad_norm": 3.139343500137329, + "learning_rate": 3.4997944360696177e-06, + "loss": 0.0468, + "step": 48183 + }, + { + "epoch": 3.053526858383071, + "grad_norm": 3.7808752059936523, + "learning_rate": 3.4996573934493633e-06, + "loss": 0.1087, + "step": 48184 + }, + { + "epoch": 3.053540423223006, + "grad_norm": 5.155313491821289, + "learning_rate": 3.499520350829108e-06, + "loss": 0.0891, + "step": 48185 + }, + { + "epoch": 3.0535539880629408, + "grad_norm": 5.087480068206787, + "learning_rate": 3.499383308208853e-06, + "loss": 0.1056, + "step": 48186 + }, + { + "epoch": 3.0535675529028756, + "grad_norm": 3.8009562492370605, + "learning_rate": 3.4992462655885984e-06, + "loss": 0.1436, + "step": 48187 + }, + { + "epoch": 3.0535811177428105, + "grad_norm": 3.173022985458374, + "learning_rate": 3.499109222968343e-06, + "loss": 0.0661, + "step": 48188 + }, + { + "epoch": 3.0535946825827454, + "grad_norm": 2.706339120864868, + "learning_rate": 3.498972180348089e-06, + "loss": 0.0531, + "step": 48189 + }, + { + "epoch": 3.05360824742268, + "grad_norm": 4.3226118087768555, + "learning_rate": 3.4988351377278335e-06, + "loss": 0.0638, + "step": 48190 + }, + { + "epoch": 3.053621812262615, + "grad_norm": 3.898833990097046, + "learning_rate": 3.4986980951075787e-06, + "loss": 0.0831, + "step": 48191 + }, + { + "epoch": 3.0536353771025504, + "grad_norm": 4.167848587036133, + "learning_rate": 3.498561052487324e-06, + "loss": 0.0714, + "step": 48192 + }, + { + "epoch": 3.0536489419424853, + "grad_norm": 2.8828065395355225, + "learning_rate": 3.498424009867069e-06, + "loss": 0.0983, + "step": 48193 + }, + { + "epoch": 3.05366250678242, + "grad_norm": 4.723873615264893, + "learning_rate": 3.498286967246814e-06, + "loss": 0.1444, + "step": 48194 + }, + { + "epoch": 3.053676071622355, + "grad_norm": 4.8373188972473145, + "learning_rate": 3.4981499246265594e-06, + "loss": 0.0973, + "step": 48195 + }, + { + "epoch": 3.05368963646229, + "grad_norm": 4.487802028656006, + "learning_rate": 3.498012882006304e-06, + "loss": 0.1222, + "step": 48196 + }, + { + "epoch": 3.0537032013022247, + "grad_norm": 5.491758346557617, + "learning_rate": 3.4978758393860494e-06, + "loss": 0.1664, + "step": 48197 + }, + { + "epoch": 3.0537167661421596, + "grad_norm": 4.1707353591918945, + "learning_rate": 3.4977387967657946e-06, + "loss": 0.175, + "step": 48198 + }, + { + "epoch": 3.0537303309820945, + "grad_norm": 4.432431697845459, + "learning_rate": 3.4976017541455397e-06, + "loss": 0.1247, + "step": 48199 + }, + { + "epoch": 3.0537438958220293, + "grad_norm": 3.5349955558776855, + "learning_rate": 3.4974647115252845e-06, + "loss": 0.0683, + "step": 48200 + }, + { + "epoch": 3.053757460661964, + "grad_norm": 3.9335646629333496, + "learning_rate": 3.4973276689050297e-06, + "loss": 0.1021, + "step": 48201 + }, + { + "epoch": 3.053771025501899, + "grad_norm": 3.8489699363708496, + "learning_rate": 3.497190626284775e-06, + "loss": 0.1006, + "step": 48202 + }, + { + "epoch": 3.053784590341834, + "grad_norm": 3.1967437267303467, + "learning_rate": 3.4970535836645196e-06, + "loss": 0.1165, + "step": 48203 + }, + { + "epoch": 3.053798155181769, + "grad_norm": 4.413318157196045, + "learning_rate": 3.496916541044265e-06, + "loss": 0.0941, + "step": 48204 + }, + { + "epoch": 3.0538117200217036, + "grad_norm": 4.0345354080200195, + "learning_rate": 3.49677949842401e-06, + "loss": 0.1308, + "step": 48205 + }, + { + "epoch": 3.0538252848616385, + "grad_norm": 4.667153358459473, + "learning_rate": 3.4966424558037556e-06, + "loss": 0.0755, + "step": 48206 + }, + { + "epoch": 3.0538388497015734, + "grad_norm": 4.7917799949646, + "learning_rate": 3.4965054131835003e-06, + "loss": 0.1007, + "step": 48207 + }, + { + "epoch": 3.0538524145415082, + "grad_norm": 7.577671527862549, + "learning_rate": 3.4963683705632455e-06, + "loss": 0.2308, + "step": 48208 + }, + { + "epoch": 3.053865979381443, + "grad_norm": 3.654935598373413, + "learning_rate": 3.4962313279429907e-06, + "loss": 0.0767, + "step": 48209 + }, + { + "epoch": 3.053879544221378, + "grad_norm": 3.176708936691284, + "learning_rate": 3.496094285322736e-06, + "loss": 0.0681, + "step": 48210 + }, + { + "epoch": 3.0538931090613133, + "grad_norm": 4.471118450164795, + "learning_rate": 3.4959572427024806e-06, + "loss": 0.1207, + "step": 48211 + }, + { + "epoch": 3.053906673901248, + "grad_norm": 4.1792378425598145, + "learning_rate": 3.495820200082226e-06, + "loss": 0.0857, + "step": 48212 + }, + { + "epoch": 3.053920238741183, + "grad_norm": 7.166203498840332, + "learning_rate": 3.495683157461971e-06, + "loss": 0.1035, + "step": 48213 + }, + { + "epoch": 3.053933803581118, + "grad_norm": 3.9663522243499756, + "learning_rate": 3.4955461148417157e-06, + "loss": 0.0932, + "step": 48214 + }, + { + "epoch": 3.0539473684210527, + "grad_norm": 4.746815204620361, + "learning_rate": 3.4954090722214613e-06, + "loss": 0.1292, + "step": 48215 + }, + { + "epoch": 3.0539609332609876, + "grad_norm": 3.835724353790283, + "learning_rate": 3.495272029601206e-06, + "loss": 0.0615, + "step": 48216 + }, + { + "epoch": 3.0539744981009225, + "grad_norm": 4.076540470123291, + "learning_rate": 3.4951349869809513e-06, + "loss": 0.1259, + "step": 48217 + }, + { + "epoch": 3.0539880629408573, + "grad_norm": 3.1318440437316895, + "learning_rate": 3.4949979443606964e-06, + "loss": 0.0701, + "step": 48218 + }, + { + "epoch": 3.054001627780792, + "grad_norm": 3.3067452907562256, + "learning_rate": 3.4948609017404416e-06, + "loss": 0.0752, + "step": 48219 + }, + { + "epoch": 3.054015192620727, + "grad_norm": 3.5659165382385254, + "learning_rate": 3.4947238591201864e-06, + "loss": 0.0952, + "step": 48220 + }, + { + "epoch": 3.054028757460662, + "grad_norm": 3.494310140609741, + "learning_rate": 3.494586816499932e-06, + "loss": 0.0556, + "step": 48221 + }, + { + "epoch": 3.054042322300597, + "grad_norm": 2.552152633666992, + "learning_rate": 3.4944497738796767e-06, + "loss": 0.0636, + "step": 48222 + }, + { + "epoch": 3.0540558871405317, + "grad_norm": 3.1218206882476807, + "learning_rate": 3.4943127312594223e-06, + "loss": 0.1075, + "step": 48223 + }, + { + "epoch": 3.0540694519804665, + "grad_norm": 4.390185356140137, + "learning_rate": 3.494175688639167e-06, + "loss": 0.15, + "step": 48224 + }, + { + "epoch": 3.0540830168204014, + "grad_norm": 4.935224533081055, + "learning_rate": 3.4940386460189123e-06, + "loss": 0.1576, + "step": 48225 + }, + { + "epoch": 3.0540965816603363, + "grad_norm": 4.608349323272705, + "learning_rate": 3.4939016033986574e-06, + "loss": 0.1218, + "step": 48226 + }, + { + "epoch": 3.054110146500271, + "grad_norm": 3.7095372676849365, + "learning_rate": 3.493764560778402e-06, + "loss": 0.0974, + "step": 48227 + }, + { + "epoch": 3.054123711340206, + "grad_norm": 4.768877983093262, + "learning_rate": 3.4936275181581474e-06, + "loss": 0.1528, + "step": 48228 + }, + { + "epoch": 3.054137276180141, + "grad_norm": 3.2523717880249023, + "learning_rate": 3.493490475537892e-06, + "loss": 0.091, + "step": 48229 + }, + { + "epoch": 3.054150841020076, + "grad_norm": 2.8243556022644043, + "learning_rate": 3.4933534329176377e-06, + "loss": 0.1043, + "step": 48230 + }, + { + "epoch": 3.054164405860011, + "grad_norm": 3.6253879070281982, + "learning_rate": 3.4932163902973825e-06, + "loss": 0.1303, + "step": 48231 + }, + { + "epoch": 3.054177970699946, + "grad_norm": 4.919281005859375, + "learning_rate": 3.493079347677128e-06, + "loss": 0.1457, + "step": 48232 + }, + { + "epoch": 3.0541915355398808, + "grad_norm": 3.5585005283355713, + "learning_rate": 3.492942305056873e-06, + "loss": 0.0962, + "step": 48233 + }, + { + "epoch": 3.0542051003798156, + "grad_norm": 4.616936683654785, + "learning_rate": 3.492805262436618e-06, + "loss": 0.0968, + "step": 48234 + }, + { + "epoch": 3.0542186652197505, + "grad_norm": 4.562932968139648, + "learning_rate": 3.492668219816363e-06, + "loss": 0.1798, + "step": 48235 + }, + { + "epoch": 3.0542322300596854, + "grad_norm": 3.1365950107574463, + "learning_rate": 3.4925311771961084e-06, + "loss": 0.0978, + "step": 48236 + }, + { + "epoch": 3.0542457948996202, + "grad_norm": 2.860917091369629, + "learning_rate": 3.492394134575853e-06, + "loss": 0.0852, + "step": 48237 + }, + { + "epoch": 3.054259359739555, + "grad_norm": 6.970487594604492, + "learning_rate": 3.4922570919555987e-06, + "loss": 0.165, + "step": 48238 + }, + { + "epoch": 3.05427292457949, + "grad_norm": 4.386868000030518, + "learning_rate": 3.4921200493353435e-06, + "loss": 0.1138, + "step": 48239 + }, + { + "epoch": 3.054286489419425, + "grad_norm": 5.241332530975342, + "learning_rate": 3.491983006715089e-06, + "loss": 0.1735, + "step": 48240 + }, + { + "epoch": 3.0543000542593597, + "grad_norm": 6.509639263153076, + "learning_rate": 3.491845964094834e-06, + "loss": 0.2233, + "step": 48241 + }, + { + "epoch": 3.0543136190992946, + "grad_norm": 4.117558002471924, + "learning_rate": 3.4917089214745786e-06, + "loss": 0.1213, + "step": 48242 + }, + { + "epoch": 3.0543271839392294, + "grad_norm": 4.069526672363281, + "learning_rate": 3.4915718788543242e-06, + "loss": 0.1643, + "step": 48243 + }, + { + "epoch": 3.0543407487791643, + "grad_norm": 3.6347885131835938, + "learning_rate": 3.491434836234069e-06, + "loss": 0.079, + "step": 48244 + }, + { + "epoch": 3.054354313619099, + "grad_norm": 4.1160149574279785, + "learning_rate": 3.491297793613814e-06, + "loss": 0.1084, + "step": 48245 + }, + { + "epoch": 3.054367878459034, + "grad_norm": 4.149995803833008, + "learning_rate": 3.491160750993559e-06, + "loss": 0.089, + "step": 48246 + }, + { + "epoch": 3.054381443298969, + "grad_norm": 7.476353168487549, + "learning_rate": 3.4910237083733045e-06, + "loss": 0.1286, + "step": 48247 + }, + { + "epoch": 3.0543950081389037, + "grad_norm": 4.338367462158203, + "learning_rate": 3.4908866657530493e-06, + "loss": 0.141, + "step": 48248 + }, + { + "epoch": 3.054408572978839, + "grad_norm": 4.762388229370117, + "learning_rate": 3.490749623132795e-06, + "loss": 0.1371, + "step": 48249 + }, + { + "epoch": 3.054422137818774, + "grad_norm": 2.746269464492798, + "learning_rate": 3.4906125805125396e-06, + "loss": 0.0795, + "step": 48250 + }, + { + "epoch": 3.054435702658709, + "grad_norm": 3.889503002166748, + "learning_rate": 3.490475537892285e-06, + "loss": 0.0874, + "step": 48251 + }, + { + "epoch": 3.0544492674986436, + "grad_norm": 3.9345643520355225, + "learning_rate": 3.49033849527203e-06, + "loss": 0.1519, + "step": 48252 + }, + { + "epoch": 3.0544628323385785, + "grad_norm": 3.068549394607544, + "learning_rate": 3.490201452651775e-06, + "loss": 0.0777, + "step": 48253 + }, + { + "epoch": 3.0544763971785134, + "grad_norm": 4.18794584274292, + "learning_rate": 3.49006441003152e-06, + "loss": 0.1017, + "step": 48254 + }, + { + "epoch": 3.0544899620184482, + "grad_norm": 4.2313055992126465, + "learning_rate": 3.489927367411265e-06, + "loss": 0.1476, + "step": 48255 + }, + { + "epoch": 3.054503526858383, + "grad_norm": 4.217282295227051, + "learning_rate": 3.4897903247910103e-06, + "loss": 0.1304, + "step": 48256 + }, + { + "epoch": 3.054517091698318, + "grad_norm": 3.7384555339813232, + "learning_rate": 3.489653282170755e-06, + "loss": 0.0962, + "step": 48257 + }, + { + "epoch": 3.054530656538253, + "grad_norm": 4.618122100830078, + "learning_rate": 3.4895162395505006e-06, + "loss": 0.154, + "step": 48258 + }, + { + "epoch": 3.0545442213781877, + "grad_norm": 2.708805799484253, + "learning_rate": 3.4893791969302454e-06, + "loss": 0.083, + "step": 48259 + }, + { + "epoch": 3.0545577862181226, + "grad_norm": 2.79105806350708, + "learning_rate": 3.489242154309991e-06, + "loss": 0.0564, + "step": 48260 + }, + { + "epoch": 3.0545713510580574, + "grad_norm": 5.3641510009765625, + "learning_rate": 3.4891051116897357e-06, + "loss": 0.1619, + "step": 48261 + }, + { + "epoch": 3.0545849158979923, + "grad_norm": 3.8072803020477295, + "learning_rate": 3.488968069069481e-06, + "loss": 0.1038, + "step": 48262 + }, + { + "epoch": 3.054598480737927, + "grad_norm": 3.0868680477142334, + "learning_rate": 3.488831026449226e-06, + "loss": 0.0772, + "step": 48263 + }, + { + "epoch": 3.054612045577862, + "grad_norm": 5.523595333099365, + "learning_rate": 3.4886939838289713e-06, + "loss": 0.0996, + "step": 48264 + }, + { + "epoch": 3.054625610417797, + "grad_norm": 3.0146992206573486, + "learning_rate": 3.488556941208716e-06, + "loss": 0.0692, + "step": 48265 + }, + { + "epoch": 3.0546391752577318, + "grad_norm": 3.2642080783843994, + "learning_rate": 3.4884198985884616e-06, + "loss": 0.0963, + "step": 48266 + }, + { + "epoch": 3.0546527400976666, + "grad_norm": 4.096192836761475, + "learning_rate": 3.4882828559682064e-06, + "loss": 0.1064, + "step": 48267 + }, + { + "epoch": 3.054666304937602, + "grad_norm": 3.7643566131591797, + "learning_rate": 3.4881458133479516e-06, + "loss": 0.0618, + "step": 48268 + }, + { + "epoch": 3.054679869777537, + "grad_norm": 3.9366722106933594, + "learning_rate": 3.4880087707276967e-06, + "loss": 0.1684, + "step": 48269 + }, + { + "epoch": 3.0546934346174717, + "grad_norm": 2.5824007987976074, + "learning_rate": 3.4878717281074415e-06, + "loss": 0.0525, + "step": 48270 + }, + { + "epoch": 3.0547069994574065, + "grad_norm": 4.213279724121094, + "learning_rate": 3.4877346854871867e-06, + "loss": 0.1045, + "step": 48271 + }, + { + "epoch": 3.0547205642973414, + "grad_norm": 3.1986896991729736, + "learning_rate": 3.487597642866932e-06, + "loss": 0.0923, + "step": 48272 + }, + { + "epoch": 3.0547341291372763, + "grad_norm": 3.9112420082092285, + "learning_rate": 3.487460600246677e-06, + "loss": 0.0863, + "step": 48273 + }, + { + "epoch": 3.054747693977211, + "grad_norm": 2.463304281234741, + "learning_rate": 3.487323557626422e-06, + "loss": 0.0856, + "step": 48274 + }, + { + "epoch": 3.054761258817146, + "grad_norm": 2.645435094833374, + "learning_rate": 3.4871865150061674e-06, + "loss": 0.0564, + "step": 48275 + }, + { + "epoch": 3.054774823657081, + "grad_norm": 3.8175439834594727, + "learning_rate": 3.487049472385912e-06, + "loss": 0.0562, + "step": 48276 + }, + { + "epoch": 3.0547883884970157, + "grad_norm": 3.0132579803466797, + "learning_rate": 3.4869124297656578e-06, + "loss": 0.0547, + "step": 48277 + }, + { + "epoch": 3.0548019533369506, + "grad_norm": 3.451333522796631, + "learning_rate": 3.4867753871454025e-06, + "loss": 0.0545, + "step": 48278 + }, + { + "epoch": 3.0548155181768855, + "grad_norm": 3.311849594116211, + "learning_rate": 3.4866383445251477e-06, + "loss": 0.1213, + "step": 48279 + }, + { + "epoch": 3.0548290830168203, + "grad_norm": 3.271721601486206, + "learning_rate": 3.486501301904893e-06, + "loss": 0.1104, + "step": 48280 + }, + { + "epoch": 3.054842647856755, + "grad_norm": 3.4797823429107666, + "learning_rate": 3.486364259284638e-06, + "loss": 0.0785, + "step": 48281 + }, + { + "epoch": 3.05485621269669, + "grad_norm": 3.450512170791626, + "learning_rate": 3.486227216664383e-06, + "loss": 0.0655, + "step": 48282 + }, + { + "epoch": 3.054869777536625, + "grad_norm": 2.5515058040618896, + "learning_rate": 3.4860901740441276e-06, + "loss": 0.0365, + "step": 48283 + }, + { + "epoch": 3.05488334237656, + "grad_norm": 3.237910747528076, + "learning_rate": 3.485953131423873e-06, + "loss": 0.071, + "step": 48284 + }, + { + "epoch": 3.0548969072164947, + "grad_norm": 2.928028106689453, + "learning_rate": 3.485816088803618e-06, + "loss": 0.0596, + "step": 48285 + }, + { + "epoch": 3.0549104720564295, + "grad_norm": 2.7268991470336914, + "learning_rate": 3.4856790461833635e-06, + "loss": 0.049, + "step": 48286 + }, + { + "epoch": 3.054924036896365, + "grad_norm": 2.5740809440612793, + "learning_rate": 3.4855420035631083e-06, + "loss": 0.0434, + "step": 48287 + }, + { + "epoch": 3.0549376017362997, + "grad_norm": 2.4095041751861572, + "learning_rate": 3.4854049609428535e-06, + "loss": 0.0397, + "step": 48288 + }, + { + "epoch": 3.0549511665762346, + "grad_norm": 2.621201515197754, + "learning_rate": 3.4852679183225986e-06, + "loss": 0.0719, + "step": 48289 + }, + { + "epoch": 3.0549647314161694, + "grad_norm": 2.752044677734375, + "learning_rate": 3.485130875702344e-06, + "loss": 0.0583, + "step": 48290 + }, + { + "epoch": 3.0549782962561043, + "grad_norm": 2.6169328689575195, + "learning_rate": 3.4849938330820886e-06, + "loss": 0.0482, + "step": 48291 + }, + { + "epoch": 3.054991861096039, + "grad_norm": 3.206545352935791, + "learning_rate": 3.484856790461834e-06, + "loss": 0.0318, + "step": 48292 + }, + { + "epoch": 3.055005425935974, + "grad_norm": 3.1299877166748047, + "learning_rate": 3.484719747841579e-06, + "loss": 0.0938, + "step": 48293 + }, + { + "epoch": 3.055018990775909, + "grad_norm": 3.0963969230651855, + "learning_rate": 3.4845827052213245e-06, + "loss": 0.0853, + "step": 48294 + }, + { + "epoch": 3.0550325556158437, + "grad_norm": 2.240915298461914, + "learning_rate": 3.4844456626010693e-06, + "loss": 0.0541, + "step": 48295 + }, + { + "epoch": 3.0550461204557786, + "grad_norm": 4.029554843902588, + "learning_rate": 3.484308619980814e-06, + "loss": 0.0761, + "step": 48296 + }, + { + "epoch": 3.0550596852957135, + "grad_norm": 3.3639779090881348, + "learning_rate": 3.4841715773605596e-06, + "loss": 0.1188, + "step": 48297 + }, + { + "epoch": 3.0550732501356483, + "grad_norm": 3.192831039428711, + "learning_rate": 3.4840345347403044e-06, + "loss": 0.0686, + "step": 48298 + }, + { + "epoch": 3.055086814975583, + "grad_norm": 2.9266817569732666, + "learning_rate": 3.4838974921200496e-06, + "loss": 0.1022, + "step": 48299 + }, + { + "epoch": 3.055100379815518, + "grad_norm": 1.887415885925293, + "learning_rate": 3.4837604494997943e-06, + "loss": 0.0346, + "step": 48300 + }, + { + "epoch": 3.055113944655453, + "grad_norm": 2.3401975631713867, + "learning_rate": 3.48362340687954e-06, + "loss": 0.0445, + "step": 48301 + }, + { + "epoch": 3.055127509495388, + "grad_norm": 3.5322799682617188, + "learning_rate": 3.4834863642592847e-06, + "loss": 0.0792, + "step": 48302 + }, + { + "epoch": 3.0551410743353227, + "grad_norm": 2.6148788928985596, + "learning_rate": 3.4833493216390303e-06, + "loss": 0.0444, + "step": 48303 + }, + { + "epoch": 3.0551546391752575, + "grad_norm": 2.5275144577026367, + "learning_rate": 3.483212279018775e-06, + "loss": 0.0398, + "step": 48304 + }, + { + "epoch": 3.0551682040151924, + "grad_norm": 4.39943265914917, + "learning_rate": 3.4830752363985202e-06, + "loss": 0.0598, + "step": 48305 + }, + { + "epoch": 3.0551817688551277, + "grad_norm": 2.1912519931793213, + "learning_rate": 3.4829381937782654e-06, + "loss": 0.0626, + "step": 48306 + }, + { + "epoch": 3.0551953336950626, + "grad_norm": 3.7625083923339844, + "learning_rate": 3.4828011511580106e-06, + "loss": 0.1172, + "step": 48307 + }, + { + "epoch": 3.0552088985349974, + "grad_norm": 3.8369481563568115, + "learning_rate": 3.4826641085377553e-06, + "loss": 0.0542, + "step": 48308 + }, + { + "epoch": 3.0552224633749323, + "grad_norm": 4.564218521118164, + "learning_rate": 3.482527065917501e-06, + "loss": 0.0776, + "step": 48309 + }, + { + "epoch": 3.055236028214867, + "grad_norm": 4.119039058685303, + "learning_rate": 3.4823900232972457e-06, + "loss": 0.1176, + "step": 48310 + }, + { + "epoch": 3.055249593054802, + "grad_norm": 4.048638820648193, + "learning_rate": 3.4822529806769904e-06, + "loss": 0.0646, + "step": 48311 + }, + { + "epoch": 3.055263157894737, + "grad_norm": 2.4678101539611816, + "learning_rate": 3.482115938056736e-06, + "loss": 0.0557, + "step": 48312 + }, + { + "epoch": 3.0552767227346718, + "grad_norm": 4.715344429016113, + "learning_rate": 3.481978895436481e-06, + "loss": 0.1202, + "step": 48313 + }, + { + "epoch": 3.0552902875746066, + "grad_norm": 3.841179609298706, + "learning_rate": 3.4818418528162264e-06, + "loss": 0.0842, + "step": 48314 + }, + { + "epoch": 3.0553038524145415, + "grad_norm": 3.9641194343566895, + "learning_rate": 3.481704810195971e-06, + "loss": 0.06, + "step": 48315 + }, + { + "epoch": 3.0553174172544764, + "grad_norm": 3.4773061275482178, + "learning_rate": 3.4815677675757163e-06, + "loss": 0.0906, + "step": 48316 + }, + { + "epoch": 3.0553309820944112, + "grad_norm": 4.389013767242432, + "learning_rate": 3.481430724955461e-06, + "loss": 0.1221, + "step": 48317 + }, + { + "epoch": 3.055344546934346, + "grad_norm": 5.014064788818359, + "learning_rate": 3.4812936823352067e-06, + "loss": 0.0863, + "step": 48318 + }, + { + "epoch": 3.055358111774281, + "grad_norm": 2.4695236682891846, + "learning_rate": 3.4811566397149515e-06, + "loss": 0.0688, + "step": 48319 + }, + { + "epoch": 3.055371676614216, + "grad_norm": 6.344973087310791, + "learning_rate": 3.481019597094697e-06, + "loss": 0.0855, + "step": 48320 + }, + { + "epoch": 3.0553852414541507, + "grad_norm": 3.7253503799438477, + "learning_rate": 3.480882554474442e-06, + "loss": 0.0705, + "step": 48321 + }, + { + "epoch": 3.0553988062940856, + "grad_norm": 4.219398498535156, + "learning_rate": 3.480745511854187e-06, + "loss": 0.0744, + "step": 48322 + }, + { + "epoch": 3.0554123711340204, + "grad_norm": 4.263296604156494, + "learning_rate": 3.480608469233932e-06, + "loss": 0.0976, + "step": 48323 + }, + { + "epoch": 3.0554259359739557, + "grad_norm": 4.246002674102783, + "learning_rate": 3.480471426613677e-06, + "loss": 0.0935, + "step": 48324 + }, + { + "epoch": 3.0554395008138906, + "grad_norm": 4.486166000366211, + "learning_rate": 3.480334383993422e-06, + "loss": 0.0901, + "step": 48325 + }, + { + "epoch": 3.0554530656538255, + "grad_norm": 5.596351623535156, + "learning_rate": 3.4801973413731673e-06, + "loss": 0.0969, + "step": 48326 + }, + { + "epoch": 3.0554666304937603, + "grad_norm": 3.2880513668060303, + "learning_rate": 3.4800602987529125e-06, + "loss": 0.0471, + "step": 48327 + }, + { + "epoch": 3.055480195333695, + "grad_norm": 4.175585746765137, + "learning_rate": 3.4799232561326572e-06, + "loss": 0.0875, + "step": 48328 + }, + { + "epoch": 3.05549376017363, + "grad_norm": 4.31176233291626, + "learning_rate": 3.479786213512403e-06, + "loss": 0.0652, + "step": 48329 + }, + { + "epoch": 3.055507325013565, + "grad_norm": 8.161026000976562, + "learning_rate": 3.4796491708921476e-06, + "loss": 0.204, + "step": 48330 + }, + { + "epoch": 3.0555208898535, + "grad_norm": 4.034010887145996, + "learning_rate": 3.479512128271893e-06, + "loss": 0.1283, + "step": 48331 + }, + { + "epoch": 3.0555344546934347, + "grad_norm": 2.6782193183898926, + "learning_rate": 3.479375085651638e-06, + "loss": 0.0769, + "step": 48332 + }, + { + "epoch": 3.0555480195333695, + "grad_norm": 3.919088125228882, + "learning_rate": 3.479238043031383e-06, + "loss": 0.0822, + "step": 48333 + }, + { + "epoch": 3.0555615843733044, + "grad_norm": 2.834465980529785, + "learning_rate": 3.4791010004111283e-06, + "loss": 0.0772, + "step": 48334 + }, + { + "epoch": 3.0555751492132392, + "grad_norm": 2.9625327587127686, + "learning_rate": 3.4789639577908735e-06, + "loss": 0.0877, + "step": 48335 + }, + { + "epoch": 3.055588714053174, + "grad_norm": 5.492376327514648, + "learning_rate": 3.4788269151706182e-06, + "loss": 0.0622, + "step": 48336 + }, + { + "epoch": 3.055602278893109, + "grad_norm": 4.29355525970459, + "learning_rate": 3.478689872550364e-06, + "loss": 0.1081, + "step": 48337 + }, + { + "epoch": 3.055615843733044, + "grad_norm": 4.542497158050537, + "learning_rate": 3.4785528299301086e-06, + "loss": 0.1311, + "step": 48338 + }, + { + "epoch": 3.0556294085729787, + "grad_norm": 2.9822754859924316, + "learning_rate": 3.4784157873098533e-06, + "loss": 0.0453, + "step": 48339 + }, + { + "epoch": 3.0556429734129136, + "grad_norm": 3.8120217323303223, + "learning_rate": 3.478278744689599e-06, + "loss": 0.1005, + "step": 48340 + }, + { + "epoch": 3.0556565382528484, + "grad_norm": 4.308424949645996, + "learning_rate": 3.4781417020693437e-06, + "loss": 0.123, + "step": 48341 + }, + { + "epoch": 3.0556701030927833, + "grad_norm": 2.709419012069702, + "learning_rate": 3.478004659449089e-06, + "loss": 0.056, + "step": 48342 + }, + { + "epoch": 3.055683667932718, + "grad_norm": 3.248131513595581, + "learning_rate": 3.477867616828834e-06, + "loss": 0.0522, + "step": 48343 + }, + { + "epoch": 3.0556972327726535, + "grad_norm": 4.013557434082031, + "learning_rate": 3.4777305742085792e-06, + "loss": 0.1127, + "step": 48344 + }, + { + "epoch": 3.0557107976125883, + "grad_norm": 3.9843976497650146, + "learning_rate": 3.477593531588324e-06, + "loss": 0.089, + "step": 48345 + }, + { + "epoch": 3.055724362452523, + "grad_norm": 3.683405637741089, + "learning_rate": 3.4774564889680696e-06, + "loss": 0.1239, + "step": 48346 + }, + { + "epoch": 3.055737927292458, + "grad_norm": 4.04309606552124, + "learning_rate": 3.4773194463478143e-06, + "loss": 0.063, + "step": 48347 + }, + { + "epoch": 3.055751492132393, + "grad_norm": 4.787387847900391, + "learning_rate": 3.47718240372756e-06, + "loss": 0.0693, + "step": 48348 + }, + { + "epoch": 3.055765056972328, + "grad_norm": 4.233224391937256, + "learning_rate": 3.4770453611073047e-06, + "loss": 0.0943, + "step": 48349 + }, + { + "epoch": 3.0557786218122627, + "grad_norm": 2.680222988128662, + "learning_rate": 3.47690831848705e-06, + "loss": 0.0632, + "step": 48350 + }, + { + "epoch": 3.0557921866521975, + "grad_norm": 3.8085484504699707, + "learning_rate": 3.476771275866795e-06, + "loss": 0.1225, + "step": 48351 + }, + { + "epoch": 3.0558057514921324, + "grad_norm": 3.350881338119507, + "learning_rate": 3.47663423324654e-06, + "loss": 0.0915, + "step": 48352 + }, + { + "epoch": 3.0558193163320673, + "grad_norm": 3.712550640106201, + "learning_rate": 3.476497190626285e-06, + "loss": 0.0955, + "step": 48353 + }, + { + "epoch": 3.055832881172002, + "grad_norm": 4.615227699279785, + "learning_rate": 3.4763601480060298e-06, + "loss": 0.169, + "step": 48354 + }, + { + "epoch": 3.055846446011937, + "grad_norm": 5.263448238372803, + "learning_rate": 3.4762231053857754e-06, + "loss": 0.185, + "step": 48355 + }, + { + "epoch": 3.055860010851872, + "grad_norm": 5.640740871429443, + "learning_rate": 3.47608606276552e-06, + "loss": 0.173, + "step": 48356 + }, + { + "epoch": 3.0558735756918067, + "grad_norm": 4.046340465545654, + "learning_rate": 3.4759490201452657e-06, + "loss": 0.0727, + "step": 48357 + }, + { + "epoch": 3.0558871405317416, + "grad_norm": 4.207666873931885, + "learning_rate": 3.4758119775250105e-06, + "loss": 0.106, + "step": 48358 + }, + { + "epoch": 3.0559007053716765, + "grad_norm": 3.3510584831237793, + "learning_rate": 3.4756749349047556e-06, + "loss": 0.0772, + "step": 48359 + }, + { + "epoch": 3.0559142702116113, + "grad_norm": 4.459901809692383, + "learning_rate": 3.475537892284501e-06, + "loss": 0.1195, + "step": 48360 + }, + { + "epoch": 3.055927835051546, + "grad_norm": 3.571714401245117, + "learning_rate": 3.475400849664246e-06, + "loss": 0.0835, + "step": 48361 + }, + { + "epoch": 3.0559413998914815, + "grad_norm": 3.9429805278778076, + "learning_rate": 3.4752638070439908e-06, + "loss": 0.156, + "step": 48362 + }, + { + "epoch": 3.0559549647314164, + "grad_norm": 7.077261924743652, + "learning_rate": 3.4751267644237364e-06, + "loss": 0.2988, + "step": 48363 + }, + { + "epoch": 3.0559685295713512, + "grad_norm": 4.420964241027832, + "learning_rate": 3.474989721803481e-06, + "loss": 0.1806, + "step": 48364 + }, + { + "epoch": 3.055982094411286, + "grad_norm": 4.695983409881592, + "learning_rate": 3.474852679183226e-06, + "loss": 0.1076, + "step": 48365 + }, + { + "epoch": 3.055995659251221, + "grad_norm": 4.583096504211426, + "learning_rate": 3.4747156365629715e-06, + "loss": 0.1026, + "step": 48366 + }, + { + "epoch": 3.056009224091156, + "grad_norm": 3.8857014179229736, + "learning_rate": 3.4745785939427162e-06, + "loss": 0.1227, + "step": 48367 + }, + { + "epoch": 3.0560227889310907, + "grad_norm": 3.117724895477295, + "learning_rate": 3.474441551322462e-06, + "loss": 0.121, + "step": 48368 + }, + { + "epoch": 3.0560363537710256, + "grad_norm": 4.005396842956543, + "learning_rate": 3.4743045087022066e-06, + "loss": 0.0941, + "step": 48369 + }, + { + "epoch": 3.0560499186109604, + "grad_norm": 4.983189582824707, + "learning_rate": 3.4741674660819518e-06, + "loss": 0.162, + "step": 48370 + }, + { + "epoch": 3.0560634834508953, + "grad_norm": 5.353478908538818, + "learning_rate": 3.4740304234616965e-06, + "loss": 0.2432, + "step": 48371 + }, + { + "epoch": 3.05607704829083, + "grad_norm": 4.959333419799805, + "learning_rate": 3.473893380841442e-06, + "loss": 0.1626, + "step": 48372 + }, + { + "epoch": 3.056090613130765, + "grad_norm": 3.5141067504882812, + "learning_rate": 3.473756338221187e-06, + "loss": 0.092, + "step": 48373 + }, + { + "epoch": 3.0561041779707, + "grad_norm": 4.662606239318848, + "learning_rate": 3.4736192956009325e-06, + "loss": 0.2183, + "step": 48374 + }, + { + "epoch": 3.0561177428106348, + "grad_norm": 4.306761264801025, + "learning_rate": 3.4734822529806772e-06, + "loss": 0.1018, + "step": 48375 + }, + { + "epoch": 3.0561313076505696, + "grad_norm": 4.902820110321045, + "learning_rate": 3.4733452103604224e-06, + "loss": 0.1692, + "step": 48376 + }, + { + "epoch": 3.0561448724905045, + "grad_norm": 4.568312644958496, + "learning_rate": 3.4732081677401676e-06, + "loss": 0.0955, + "step": 48377 + }, + { + "epoch": 3.0561584373304393, + "grad_norm": 3.8550217151641846, + "learning_rate": 3.4730711251199128e-06, + "loss": 0.159, + "step": 48378 + }, + { + "epoch": 3.056172002170374, + "grad_norm": 4.7482452392578125, + "learning_rate": 3.4729340824996575e-06, + "loss": 0.1496, + "step": 48379 + }, + { + "epoch": 3.056185567010309, + "grad_norm": 4.963575839996338, + "learning_rate": 3.4727970398794027e-06, + "loss": 0.1404, + "step": 48380 + }, + { + "epoch": 3.056199131850244, + "grad_norm": 3.834392547607422, + "learning_rate": 3.472659997259148e-06, + "loss": 0.0567, + "step": 48381 + }, + { + "epoch": 3.0562126966901793, + "grad_norm": 4.778637409210205, + "learning_rate": 3.4725229546388926e-06, + "loss": 0.1391, + "step": 48382 + }, + { + "epoch": 3.056226261530114, + "grad_norm": 4.467653274536133, + "learning_rate": 3.4723859120186382e-06, + "loss": 0.1346, + "step": 48383 + }, + { + "epoch": 3.056239826370049, + "grad_norm": 3.9631054401397705, + "learning_rate": 3.472248869398383e-06, + "loss": 0.1398, + "step": 48384 + }, + { + "epoch": 3.056253391209984, + "grad_norm": 6.470217704772949, + "learning_rate": 3.4721118267781286e-06, + "loss": 0.2014, + "step": 48385 + }, + { + "epoch": 3.0562669560499187, + "grad_norm": 5.512729167938232, + "learning_rate": 3.4719747841578734e-06, + "loss": 0.1327, + "step": 48386 + }, + { + "epoch": 3.0562805208898536, + "grad_norm": 6.091353416442871, + "learning_rate": 3.4718377415376185e-06, + "loss": 0.19, + "step": 48387 + }, + { + "epoch": 3.0562940857297884, + "grad_norm": 4.752683639526367, + "learning_rate": 3.4717006989173633e-06, + "loss": 0.133, + "step": 48388 + }, + { + "epoch": 3.0563076505697233, + "grad_norm": 4.40042781829834, + "learning_rate": 3.471563656297109e-06, + "loss": 0.0785, + "step": 48389 + }, + { + "epoch": 3.056321215409658, + "grad_norm": 3.5633344650268555, + "learning_rate": 3.4714266136768537e-06, + "loss": 0.1259, + "step": 48390 + }, + { + "epoch": 3.056334780249593, + "grad_norm": 4.900376319885254, + "learning_rate": 3.4712895710565993e-06, + "loss": 0.0989, + "step": 48391 + }, + { + "epoch": 3.056348345089528, + "grad_norm": 5.184466361999512, + "learning_rate": 3.471152528436344e-06, + "loss": 0.1308, + "step": 48392 + }, + { + "epoch": 3.0563619099294628, + "grad_norm": 2.925947427749634, + "learning_rate": 3.4710154858160888e-06, + "loss": 0.0792, + "step": 48393 + }, + { + "epoch": 3.0563754747693976, + "grad_norm": 3.4710161685943604, + "learning_rate": 3.4708784431958344e-06, + "loss": 0.1054, + "step": 48394 + }, + { + "epoch": 3.0563890396093325, + "grad_norm": 3.463995933532715, + "learning_rate": 3.470741400575579e-06, + "loss": 0.0597, + "step": 48395 + }, + { + "epoch": 3.0564026044492674, + "grad_norm": 2.8069820404052734, + "learning_rate": 3.4706043579553243e-06, + "loss": 0.0845, + "step": 48396 + }, + { + "epoch": 3.0564161692892022, + "grad_norm": 3.3949270248413086, + "learning_rate": 3.4704673153350695e-06, + "loss": 0.1458, + "step": 48397 + }, + { + "epoch": 3.056429734129137, + "grad_norm": 4.159850120544434, + "learning_rate": 3.4703302727148147e-06, + "loss": 0.1058, + "step": 48398 + }, + { + "epoch": 3.056443298969072, + "grad_norm": 3.386326313018799, + "learning_rate": 3.4701932300945594e-06, + "loss": 0.1028, + "step": 48399 + }, + { + "epoch": 3.0564568638090073, + "grad_norm": 3.337918281555176, + "learning_rate": 3.470056187474305e-06, + "loss": 0.0752, + "step": 48400 + }, + { + "epoch": 3.056470428648942, + "grad_norm": 4.671263694763184, + "learning_rate": 3.4699191448540498e-06, + "loss": 0.1542, + "step": 48401 + }, + { + "epoch": 3.056483993488877, + "grad_norm": 4.129109859466553, + "learning_rate": 3.4697821022337954e-06, + "loss": 0.1249, + "step": 48402 + }, + { + "epoch": 3.056497558328812, + "grad_norm": 3.9183428287506104, + "learning_rate": 3.46964505961354e-06, + "loss": 0.1335, + "step": 48403 + }, + { + "epoch": 3.0565111231687467, + "grad_norm": 3.7692036628723145, + "learning_rate": 3.4695080169932853e-06, + "loss": 0.1263, + "step": 48404 + }, + { + "epoch": 3.0565246880086816, + "grad_norm": 2.946676731109619, + "learning_rate": 3.4693709743730305e-06, + "loss": 0.1032, + "step": 48405 + }, + { + "epoch": 3.0565382528486165, + "grad_norm": 4.042171001434326, + "learning_rate": 3.4692339317527757e-06, + "loss": 0.0885, + "step": 48406 + }, + { + "epoch": 3.0565518176885513, + "grad_norm": 3.421105146408081, + "learning_rate": 3.4690968891325204e-06, + "loss": 0.0964, + "step": 48407 + }, + { + "epoch": 3.056565382528486, + "grad_norm": 3.897562265396118, + "learning_rate": 3.468959846512265e-06, + "loss": 0.0828, + "step": 48408 + }, + { + "epoch": 3.056578947368421, + "grad_norm": 3.062812566757202, + "learning_rate": 3.4688228038920108e-06, + "loss": 0.1226, + "step": 48409 + }, + { + "epoch": 3.056592512208356, + "grad_norm": 3.2596595287323, + "learning_rate": 3.4686857612717555e-06, + "loss": 0.088, + "step": 48410 + }, + { + "epoch": 3.056606077048291, + "grad_norm": 2.298372983932495, + "learning_rate": 3.468548718651501e-06, + "loss": 0.0454, + "step": 48411 + }, + { + "epoch": 3.0566196418882257, + "grad_norm": 4.0384063720703125, + "learning_rate": 3.468411676031246e-06, + "loss": 0.1067, + "step": 48412 + }, + { + "epoch": 3.0566332067281605, + "grad_norm": 2.707411527633667, + "learning_rate": 3.468274633410991e-06, + "loss": 0.0421, + "step": 48413 + }, + { + "epoch": 3.0566467715680954, + "grad_norm": 5.371304512023926, + "learning_rate": 3.4681375907907363e-06, + "loss": 0.1822, + "step": 48414 + }, + { + "epoch": 3.0566603364080303, + "grad_norm": 2.7020251750946045, + "learning_rate": 3.4680005481704814e-06, + "loss": 0.077, + "step": 48415 + }, + { + "epoch": 3.056673901247965, + "grad_norm": 5.159797191619873, + "learning_rate": 3.467863505550226e-06, + "loss": 0.1727, + "step": 48416 + }, + { + "epoch": 3.0566874660879, + "grad_norm": 2.2624833583831787, + "learning_rate": 3.4677264629299718e-06, + "loss": 0.0471, + "step": 48417 + }, + { + "epoch": 3.056701030927835, + "grad_norm": 3.1420326232910156, + "learning_rate": 3.4675894203097165e-06, + "loss": 0.0699, + "step": 48418 + }, + { + "epoch": 3.0567145957677697, + "grad_norm": 3.4595844745635986, + "learning_rate": 3.467452377689462e-06, + "loss": 0.0842, + "step": 48419 + }, + { + "epoch": 3.056728160607705, + "grad_norm": 2.132816791534424, + "learning_rate": 3.467315335069207e-06, + "loss": 0.0403, + "step": 48420 + }, + { + "epoch": 3.05674172544764, + "grad_norm": 2.1920418739318848, + "learning_rate": 3.4671782924489517e-06, + "loss": 0.0619, + "step": 48421 + }, + { + "epoch": 3.0567552902875748, + "grad_norm": 3.5597031116485596, + "learning_rate": 3.4670412498286973e-06, + "loss": 0.1081, + "step": 48422 + }, + { + "epoch": 3.0567688551275096, + "grad_norm": 3.5613677501678467, + "learning_rate": 3.466904207208442e-06, + "loss": 0.1259, + "step": 48423 + }, + { + "epoch": 3.0567824199674445, + "grad_norm": 3.287989854812622, + "learning_rate": 3.466767164588187e-06, + "loss": 0.0837, + "step": 48424 + }, + { + "epoch": 3.0567959848073794, + "grad_norm": 4.6085896492004395, + "learning_rate": 3.466630121967932e-06, + "loss": 0.1718, + "step": 48425 + }, + { + "epoch": 3.056809549647314, + "grad_norm": 2.6716105937957764, + "learning_rate": 3.4664930793476775e-06, + "loss": 0.0711, + "step": 48426 + }, + { + "epoch": 3.056823114487249, + "grad_norm": 4.602151393890381, + "learning_rate": 3.4663560367274223e-06, + "loss": 0.0836, + "step": 48427 + }, + { + "epoch": 3.056836679327184, + "grad_norm": 2.542922019958496, + "learning_rate": 3.466218994107168e-06, + "loss": 0.049, + "step": 48428 + }, + { + "epoch": 3.056850244167119, + "grad_norm": 2.870004653930664, + "learning_rate": 3.4660819514869127e-06, + "loss": 0.1076, + "step": 48429 + }, + { + "epoch": 3.0568638090070537, + "grad_norm": 3.767902374267578, + "learning_rate": 3.465944908866658e-06, + "loss": 0.0487, + "step": 48430 + }, + { + "epoch": 3.0568773738469885, + "grad_norm": 3.7687010765075684, + "learning_rate": 3.465807866246403e-06, + "loss": 0.0523, + "step": 48431 + }, + { + "epoch": 3.0568909386869234, + "grad_norm": 4.429832935333252, + "learning_rate": 3.465670823626148e-06, + "loss": 0.0956, + "step": 48432 + }, + { + "epoch": 3.0569045035268583, + "grad_norm": 3.8529605865478516, + "learning_rate": 3.465533781005893e-06, + "loss": 0.1173, + "step": 48433 + }, + { + "epoch": 3.056918068366793, + "grad_norm": 4.266252517700195, + "learning_rate": 3.465396738385638e-06, + "loss": 0.0898, + "step": 48434 + }, + { + "epoch": 3.056931633206728, + "grad_norm": 3.287297487258911, + "learning_rate": 3.4652596957653833e-06, + "loss": 0.0886, + "step": 48435 + }, + { + "epoch": 3.056945198046663, + "grad_norm": 2.6335694789886475, + "learning_rate": 3.465122653145128e-06, + "loss": 0.0699, + "step": 48436 + }, + { + "epoch": 3.0569587628865977, + "grad_norm": 3.8309364318847656, + "learning_rate": 3.4649856105248737e-06, + "loss": 0.1121, + "step": 48437 + }, + { + "epoch": 3.056972327726533, + "grad_norm": 4.150725364685059, + "learning_rate": 3.4648485679046184e-06, + "loss": 0.1445, + "step": 48438 + }, + { + "epoch": 3.056985892566468, + "grad_norm": 5.632614612579346, + "learning_rate": 3.464711525284364e-06, + "loss": 0.1226, + "step": 48439 + }, + { + "epoch": 3.0569994574064028, + "grad_norm": 2.7491321563720703, + "learning_rate": 3.4645744826641088e-06, + "loss": 0.0715, + "step": 48440 + }, + { + "epoch": 3.0570130222463376, + "grad_norm": 3.7895054817199707, + "learning_rate": 3.464437440043854e-06, + "loss": 0.0915, + "step": 48441 + }, + { + "epoch": 3.0570265870862725, + "grad_norm": 3.5800812244415283, + "learning_rate": 3.4643003974235987e-06, + "loss": 0.1055, + "step": 48442 + }, + { + "epoch": 3.0570401519262074, + "grad_norm": 4.900529384613037, + "learning_rate": 3.4641633548033443e-06, + "loss": 0.17, + "step": 48443 + }, + { + "epoch": 3.0570537167661422, + "grad_norm": 5.493019104003906, + "learning_rate": 3.464026312183089e-06, + "loss": 0.1231, + "step": 48444 + }, + { + "epoch": 3.057067281606077, + "grad_norm": 3.7984683513641357, + "learning_rate": 3.4638892695628347e-06, + "loss": 0.0912, + "step": 48445 + }, + { + "epoch": 3.057080846446012, + "grad_norm": 5.436013698577881, + "learning_rate": 3.4637522269425794e-06, + "loss": 0.1507, + "step": 48446 + }, + { + "epoch": 3.057094411285947, + "grad_norm": 4.564310073852539, + "learning_rate": 3.4636151843223246e-06, + "loss": 0.0897, + "step": 48447 + }, + { + "epoch": 3.0571079761258817, + "grad_norm": 5.041521072387695, + "learning_rate": 3.46347814170207e-06, + "loss": 0.2364, + "step": 48448 + }, + { + "epoch": 3.0571215409658166, + "grad_norm": 2.5568253993988037, + "learning_rate": 3.4633410990818145e-06, + "loss": 0.0418, + "step": 48449 + }, + { + "epoch": 3.0571351058057514, + "grad_norm": 3.020139455795288, + "learning_rate": 3.4632040564615597e-06, + "loss": 0.061, + "step": 48450 + }, + { + "epoch": 3.0571486706456863, + "grad_norm": 5.703956127166748, + "learning_rate": 3.463067013841305e-06, + "loss": 0.1419, + "step": 48451 + }, + { + "epoch": 3.057162235485621, + "grad_norm": 5.217434883117676, + "learning_rate": 3.46292997122105e-06, + "loss": 0.0929, + "step": 48452 + }, + { + "epoch": 3.057175800325556, + "grad_norm": 5.85264778137207, + "learning_rate": 3.462792928600795e-06, + "loss": 0.1553, + "step": 48453 + }, + { + "epoch": 3.057189365165491, + "grad_norm": 2.594524383544922, + "learning_rate": 3.4626558859805404e-06, + "loss": 0.0537, + "step": 48454 + }, + { + "epoch": 3.0572029300054258, + "grad_norm": 4.2112860679626465, + "learning_rate": 3.462518843360285e-06, + "loss": 0.0721, + "step": 48455 + }, + { + "epoch": 3.0572164948453606, + "grad_norm": 4.900262355804443, + "learning_rate": 3.462381800740031e-06, + "loss": 0.1422, + "step": 48456 + }, + { + "epoch": 3.057230059685296, + "grad_norm": 5.9704742431640625, + "learning_rate": 3.4622447581197756e-06, + "loss": 0.2154, + "step": 48457 + }, + { + "epoch": 3.057243624525231, + "grad_norm": 6.4840497970581055, + "learning_rate": 3.4621077154995207e-06, + "loss": 0.1555, + "step": 48458 + }, + { + "epoch": 3.0572571893651657, + "grad_norm": 6.0790557861328125, + "learning_rate": 3.4619706728792655e-06, + "loss": 0.1902, + "step": 48459 + }, + { + "epoch": 3.0572707542051005, + "grad_norm": 3.6403393745422363, + "learning_rate": 3.461833630259011e-06, + "loss": 0.102, + "step": 48460 + }, + { + "epoch": 3.0572843190450354, + "grad_norm": 3.511788845062256, + "learning_rate": 3.461696587638756e-06, + "loss": 0.0918, + "step": 48461 + }, + { + "epoch": 3.0572978838849703, + "grad_norm": 4.584465980529785, + "learning_rate": 3.4615595450185006e-06, + "loss": 0.061, + "step": 48462 + }, + { + "epoch": 3.057311448724905, + "grad_norm": 3.5097389221191406, + "learning_rate": 3.461422502398246e-06, + "loss": 0.053, + "step": 48463 + }, + { + "epoch": 3.05732501356484, + "grad_norm": 3.568622350692749, + "learning_rate": 3.461285459777991e-06, + "loss": 0.0942, + "step": 48464 + }, + { + "epoch": 3.057338578404775, + "grad_norm": 4.8765788078308105, + "learning_rate": 3.4611484171577366e-06, + "loss": 0.1346, + "step": 48465 + }, + { + "epoch": 3.0573521432447097, + "grad_norm": 4.506810665130615, + "learning_rate": 3.4610113745374813e-06, + "loss": 0.0883, + "step": 48466 + }, + { + "epoch": 3.0573657080846446, + "grad_norm": 4.864118576049805, + "learning_rate": 3.4608743319172265e-06, + "loss": 0.1384, + "step": 48467 + }, + { + "epoch": 3.0573792729245794, + "grad_norm": 4.959000587463379, + "learning_rate": 3.4607372892969717e-06, + "loss": 0.1176, + "step": 48468 + }, + { + "epoch": 3.0573928377645143, + "grad_norm": 4.035531520843506, + "learning_rate": 3.460600246676717e-06, + "loss": 0.0677, + "step": 48469 + }, + { + "epoch": 3.057406402604449, + "grad_norm": 3.7553584575653076, + "learning_rate": 3.4604632040564616e-06, + "loss": 0.087, + "step": 48470 + }, + { + "epoch": 3.057419967444384, + "grad_norm": 5.010223388671875, + "learning_rate": 3.460326161436207e-06, + "loss": 0.1886, + "step": 48471 + }, + { + "epoch": 3.057433532284319, + "grad_norm": 2.3306806087493896, + "learning_rate": 3.460189118815952e-06, + "loss": 0.034, + "step": 48472 + }, + { + "epoch": 3.0574470971242538, + "grad_norm": 4.344125747680664, + "learning_rate": 3.4600520761956976e-06, + "loss": 0.1981, + "step": 48473 + }, + { + "epoch": 3.0574606619641886, + "grad_norm": 4.676513195037842, + "learning_rate": 3.4599150335754423e-06, + "loss": 0.0847, + "step": 48474 + }, + { + "epoch": 3.0574742268041235, + "grad_norm": 3.983644485473633, + "learning_rate": 3.459777990955187e-06, + "loss": 0.1363, + "step": 48475 + }, + { + "epoch": 3.057487791644059, + "grad_norm": 5.375332832336426, + "learning_rate": 3.4596409483349323e-06, + "loss": 0.0901, + "step": 48476 + }, + { + "epoch": 3.0575013564839937, + "grad_norm": 3.2129557132720947, + "learning_rate": 3.4595039057146774e-06, + "loss": 0.0862, + "step": 48477 + }, + { + "epoch": 3.0575149213239285, + "grad_norm": 4.606673240661621, + "learning_rate": 3.4593668630944226e-06, + "loss": 0.1448, + "step": 48478 + }, + { + "epoch": 3.0575284861638634, + "grad_norm": 3.879904270172119, + "learning_rate": 3.4592298204741674e-06, + "loss": 0.0915, + "step": 48479 + }, + { + "epoch": 3.0575420510037983, + "grad_norm": 7.579920768737793, + "learning_rate": 3.459092777853913e-06, + "loss": 0.2293, + "step": 48480 + }, + { + "epoch": 3.057555615843733, + "grad_norm": 7.737078666687012, + "learning_rate": 3.4589557352336577e-06, + "loss": 0.1648, + "step": 48481 + }, + { + "epoch": 3.057569180683668, + "grad_norm": 3.3946752548217773, + "learning_rate": 3.4588186926134033e-06, + "loss": 0.0834, + "step": 48482 + }, + { + "epoch": 3.057582745523603, + "grad_norm": 4.117327690124512, + "learning_rate": 3.458681649993148e-06, + "loss": 0.0938, + "step": 48483 + }, + { + "epoch": 3.0575963103635377, + "grad_norm": 4.009705066680908, + "learning_rate": 3.4585446073728933e-06, + "loss": 0.1708, + "step": 48484 + }, + { + "epoch": 3.0576098752034726, + "grad_norm": 4.724881649017334, + "learning_rate": 3.4584075647526384e-06, + "loss": 0.1894, + "step": 48485 + }, + { + "epoch": 3.0576234400434075, + "grad_norm": 5.515007019042969, + "learning_rate": 3.4582705221323836e-06, + "loss": 0.1515, + "step": 48486 + }, + { + "epoch": 3.0576370048833423, + "grad_norm": 4.821640968322754, + "learning_rate": 3.4581334795121284e-06, + "loss": 0.0903, + "step": 48487 + }, + { + "epoch": 3.057650569723277, + "grad_norm": 4.893676280975342, + "learning_rate": 3.457996436891874e-06, + "loss": 0.1383, + "step": 48488 + }, + { + "epoch": 3.057664134563212, + "grad_norm": 4.5408549308776855, + "learning_rate": 3.4578593942716187e-06, + "loss": 0.1228, + "step": 48489 + }, + { + "epoch": 3.057677699403147, + "grad_norm": 5.87305212020874, + "learning_rate": 3.4577223516513635e-06, + "loss": 0.1784, + "step": 48490 + }, + { + "epoch": 3.057691264243082, + "grad_norm": 3.996635913848877, + "learning_rate": 3.457585309031109e-06, + "loss": 0.1458, + "step": 48491 + }, + { + "epoch": 3.0577048290830167, + "grad_norm": 5.250082492828369, + "learning_rate": 3.457448266410854e-06, + "loss": 0.1903, + "step": 48492 + }, + { + "epoch": 3.0577183939229515, + "grad_norm": 4.948619842529297, + "learning_rate": 3.4573112237905995e-06, + "loss": 0.1313, + "step": 48493 + }, + { + "epoch": 3.0577319587628864, + "grad_norm": 2.942239761352539, + "learning_rate": 3.457174181170344e-06, + "loss": 0.1135, + "step": 48494 + }, + { + "epoch": 3.0577455236028217, + "grad_norm": 3.4363114833831787, + "learning_rate": 3.4570371385500894e-06, + "loss": 0.1286, + "step": 48495 + }, + { + "epoch": 3.0577590884427566, + "grad_norm": 4.20009183883667, + "learning_rate": 3.456900095929834e-06, + "loss": 0.1252, + "step": 48496 + }, + { + "epoch": 3.0577726532826914, + "grad_norm": 8.473917961120605, + "learning_rate": 3.4567630533095797e-06, + "loss": 0.2875, + "step": 48497 + }, + { + "epoch": 3.0577862181226263, + "grad_norm": 3.9085750579833984, + "learning_rate": 3.4566260106893245e-06, + "loss": 0.074, + "step": 48498 + }, + { + "epoch": 3.057799782962561, + "grad_norm": 4.3939595222473145, + "learning_rate": 3.45648896806907e-06, + "loss": 0.1663, + "step": 48499 + }, + { + "epoch": 3.057813347802496, + "grad_norm": 5.644002914428711, + "learning_rate": 3.456351925448815e-06, + "loss": 0.1203, + "step": 48500 + }, + { + "epoch": 3.057826912642431, + "grad_norm": 2.5315358638763428, + "learning_rate": 3.45621488282856e-06, + "loss": 0.0448, + "step": 48501 + }, + { + "epoch": 3.0578404774823658, + "grad_norm": 6.028303146362305, + "learning_rate": 3.4560778402083052e-06, + "loss": 0.1802, + "step": 48502 + }, + { + "epoch": 3.0578540423223006, + "grad_norm": 4.276213645935059, + "learning_rate": 3.45594079758805e-06, + "loss": 0.1057, + "step": 48503 + }, + { + "epoch": 3.0578676071622355, + "grad_norm": 4.9997992515563965, + "learning_rate": 3.455803754967795e-06, + "loss": 0.1369, + "step": 48504 + }, + { + "epoch": 3.0578811720021704, + "grad_norm": 3.990835666656494, + "learning_rate": 3.4556667123475403e-06, + "loss": 0.1298, + "step": 48505 + }, + { + "epoch": 3.057894736842105, + "grad_norm": 3.4598782062530518, + "learning_rate": 3.4555296697272855e-06, + "loss": 0.0561, + "step": 48506 + }, + { + "epoch": 3.05790830168204, + "grad_norm": 2.9932799339294434, + "learning_rate": 3.4553926271070303e-06, + "loss": 0.0512, + "step": 48507 + }, + { + "epoch": 3.057921866521975, + "grad_norm": 4.319654941558838, + "learning_rate": 3.455255584486776e-06, + "loss": 0.0979, + "step": 48508 + }, + { + "epoch": 3.05793543136191, + "grad_norm": 5.933401107788086, + "learning_rate": 3.4551185418665206e-06, + "loss": 0.2076, + "step": 48509 + }, + { + "epoch": 3.0579489962018447, + "grad_norm": 2.687211036682129, + "learning_rate": 3.4549814992462662e-06, + "loss": 0.0532, + "step": 48510 + }, + { + "epoch": 3.0579625610417795, + "grad_norm": 3.4340503215789795, + "learning_rate": 3.454844456626011e-06, + "loss": 0.1213, + "step": 48511 + }, + { + "epoch": 3.0579761258817144, + "grad_norm": 3.8529069423675537, + "learning_rate": 3.454707414005756e-06, + "loss": 0.09, + "step": 48512 + }, + { + "epoch": 3.0579896907216493, + "grad_norm": 3.8200812339782715, + "learning_rate": 3.454570371385501e-06, + "loss": 0.1198, + "step": 48513 + }, + { + "epoch": 3.0580032555615846, + "grad_norm": 5.071035861968994, + "learning_rate": 3.4544333287652465e-06, + "loss": 0.122, + "step": 48514 + }, + { + "epoch": 3.0580168204015195, + "grad_norm": 3.208439350128174, + "learning_rate": 3.4542962861449913e-06, + "loss": 0.0723, + "step": 48515 + }, + { + "epoch": 3.0580303852414543, + "grad_norm": 3.798990488052368, + "learning_rate": 3.454159243524737e-06, + "loss": 0.0933, + "step": 48516 + }, + { + "epoch": 3.058043950081389, + "grad_norm": 4.007834434509277, + "learning_rate": 3.4540222009044816e-06, + "loss": 0.0576, + "step": 48517 + }, + { + "epoch": 3.058057514921324, + "grad_norm": 4.769769668579102, + "learning_rate": 3.4538851582842264e-06, + "loss": 0.1491, + "step": 48518 + }, + { + "epoch": 3.058071079761259, + "grad_norm": 2.6254382133483887, + "learning_rate": 3.453748115663972e-06, + "loss": 0.042, + "step": 48519 + }, + { + "epoch": 3.058084644601194, + "grad_norm": 3.309609889984131, + "learning_rate": 3.4536110730437167e-06, + "loss": 0.0428, + "step": 48520 + }, + { + "epoch": 3.0580982094411286, + "grad_norm": 3.1833226680755615, + "learning_rate": 3.453474030423462e-06, + "loss": 0.0741, + "step": 48521 + }, + { + "epoch": 3.0581117742810635, + "grad_norm": 4.355951309204102, + "learning_rate": 3.453336987803207e-06, + "loss": 0.1205, + "step": 48522 + }, + { + "epoch": 3.0581253391209984, + "grad_norm": 3.553384780883789, + "learning_rate": 3.4531999451829523e-06, + "loss": 0.0726, + "step": 48523 + }, + { + "epoch": 3.0581389039609332, + "grad_norm": 5.56899881362915, + "learning_rate": 3.453062902562697e-06, + "loss": 0.1359, + "step": 48524 + }, + { + "epoch": 3.058152468800868, + "grad_norm": 4.428424835205078, + "learning_rate": 3.4529258599424426e-06, + "loss": 0.1016, + "step": 48525 + }, + { + "epoch": 3.058166033640803, + "grad_norm": 3.254340648651123, + "learning_rate": 3.4527888173221874e-06, + "loss": 0.0985, + "step": 48526 + }, + { + "epoch": 3.058179598480738, + "grad_norm": 4.856616973876953, + "learning_rate": 3.452651774701933e-06, + "loss": 0.1648, + "step": 48527 + }, + { + "epoch": 3.0581931633206727, + "grad_norm": 3.817716360092163, + "learning_rate": 3.4525147320816777e-06, + "loss": 0.0699, + "step": 48528 + }, + { + "epoch": 3.0582067281606076, + "grad_norm": 6.562379837036133, + "learning_rate": 3.452377689461423e-06, + "loss": 0.1368, + "step": 48529 + }, + { + "epoch": 3.0582202930005424, + "grad_norm": 4.211941242218018, + "learning_rate": 3.4522406468411677e-06, + "loss": 0.1253, + "step": 48530 + }, + { + "epoch": 3.0582338578404773, + "grad_norm": 3.7452869415283203, + "learning_rate": 3.452103604220913e-06, + "loss": 0.1639, + "step": 48531 + }, + { + "epoch": 3.058247422680412, + "grad_norm": 3.5942792892456055, + "learning_rate": 3.451966561600658e-06, + "loss": 0.0899, + "step": 48532 + }, + { + "epoch": 3.0582609875203475, + "grad_norm": 5.831683158874512, + "learning_rate": 3.451829518980403e-06, + "loss": 0.1319, + "step": 48533 + }, + { + "epoch": 3.0582745523602823, + "grad_norm": 4.358236312866211, + "learning_rate": 3.4516924763601484e-06, + "loss": 0.083, + "step": 48534 + }, + { + "epoch": 3.058288117200217, + "grad_norm": 4.380354404449463, + "learning_rate": 3.451555433739893e-06, + "loss": 0.0757, + "step": 48535 + }, + { + "epoch": 3.058301682040152, + "grad_norm": 4.813615798950195, + "learning_rate": 3.4514183911196388e-06, + "loss": 0.1469, + "step": 48536 + }, + { + "epoch": 3.058315246880087, + "grad_norm": 4.798304557800293, + "learning_rate": 3.4512813484993835e-06, + "loss": 0.1974, + "step": 48537 + }, + { + "epoch": 3.058328811720022, + "grad_norm": 4.622365474700928, + "learning_rate": 3.4511443058791287e-06, + "loss": 0.1309, + "step": 48538 + }, + { + "epoch": 3.0583423765599567, + "grad_norm": 3.992689847946167, + "learning_rate": 3.451007263258874e-06, + "loss": 0.0846, + "step": 48539 + }, + { + "epoch": 3.0583559413998915, + "grad_norm": 4.875163555145264, + "learning_rate": 3.450870220638619e-06, + "loss": 0.1435, + "step": 48540 + }, + { + "epoch": 3.0583695062398264, + "grad_norm": 6.22860860824585, + "learning_rate": 3.450733178018364e-06, + "loss": 0.152, + "step": 48541 + }, + { + "epoch": 3.0583830710797613, + "grad_norm": 4.428443431854248, + "learning_rate": 3.4505961353981094e-06, + "loss": 0.1141, + "step": 48542 + }, + { + "epoch": 3.058396635919696, + "grad_norm": 2.881574869155884, + "learning_rate": 3.450459092777854e-06, + "loss": 0.041, + "step": 48543 + }, + { + "epoch": 3.058410200759631, + "grad_norm": 4.356116771697998, + "learning_rate": 3.450322050157599e-06, + "loss": 0.1006, + "step": 48544 + }, + { + "epoch": 3.058423765599566, + "grad_norm": 4.72275447845459, + "learning_rate": 3.4501850075373445e-06, + "loss": 0.1127, + "step": 48545 + }, + { + "epoch": 3.0584373304395007, + "grad_norm": 4.426755905151367, + "learning_rate": 3.4500479649170893e-06, + "loss": 0.1874, + "step": 48546 + }, + { + "epoch": 3.0584508952794356, + "grad_norm": 4.0301079750061035, + "learning_rate": 3.4499109222968345e-06, + "loss": 0.1347, + "step": 48547 + }, + { + "epoch": 3.0584644601193705, + "grad_norm": 4.151064395904541, + "learning_rate": 3.4497738796765796e-06, + "loss": 0.1357, + "step": 48548 + }, + { + "epoch": 3.0584780249593053, + "grad_norm": 3.717320203781128, + "learning_rate": 3.449636837056325e-06, + "loss": 0.1466, + "step": 48549 + }, + { + "epoch": 3.05849158979924, + "grad_norm": 3.4333062171936035, + "learning_rate": 3.4494997944360696e-06, + "loss": 0.0705, + "step": 48550 + }, + { + "epoch": 3.058505154639175, + "grad_norm": 5.783604145050049, + "learning_rate": 3.449362751815815e-06, + "loss": 0.1714, + "step": 48551 + }, + { + "epoch": 3.0585187194791104, + "grad_norm": 4.9068450927734375, + "learning_rate": 3.44922570919556e-06, + "loss": 0.0874, + "step": 48552 + }, + { + "epoch": 3.0585322843190452, + "grad_norm": 4.625579833984375, + "learning_rate": 3.4490886665753055e-06, + "loss": 0.0865, + "step": 48553 + }, + { + "epoch": 3.05854584915898, + "grad_norm": 3.240967273712158, + "learning_rate": 3.4489516239550503e-06, + "loss": 0.0786, + "step": 48554 + }, + { + "epoch": 3.058559413998915, + "grad_norm": 4.455328464508057, + "learning_rate": 3.4488145813347955e-06, + "loss": 0.1199, + "step": 48555 + }, + { + "epoch": 3.05857297883885, + "grad_norm": 2.4632365703582764, + "learning_rate": 3.4486775387145406e-06, + "loss": 0.0648, + "step": 48556 + }, + { + "epoch": 3.0585865436787847, + "grad_norm": 3.4488368034362793, + "learning_rate": 3.448540496094286e-06, + "loss": 0.0989, + "step": 48557 + }, + { + "epoch": 3.0586001085187196, + "grad_norm": 3.972001552581787, + "learning_rate": 3.4484034534740306e-06, + "loss": 0.0878, + "step": 48558 + }, + { + "epoch": 3.0586136733586544, + "grad_norm": 3.4503612518310547, + "learning_rate": 3.4482664108537758e-06, + "loss": 0.0598, + "step": 48559 + }, + { + "epoch": 3.0586272381985893, + "grad_norm": 3.5948328971862793, + "learning_rate": 3.448129368233521e-06, + "loss": 0.0831, + "step": 48560 + }, + { + "epoch": 3.058640803038524, + "grad_norm": 4.31319522857666, + "learning_rate": 3.4479923256132657e-06, + "loss": 0.1241, + "step": 48561 + }, + { + "epoch": 3.058654367878459, + "grad_norm": 5.814835548400879, + "learning_rate": 3.4478552829930113e-06, + "loss": 0.1386, + "step": 48562 + }, + { + "epoch": 3.058667932718394, + "grad_norm": 4.341078281402588, + "learning_rate": 3.447718240372756e-06, + "loss": 0.1214, + "step": 48563 + }, + { + "epoch": 3.0586814975583287, + "grad_norm": 4.075437545776367, + "learning_rate": 3.4475811977525016e-06, + "loss": 0.1073, + "step": 48564 + }, + { + "epoch": 3.0586950623982636, + "grad_norm": 2.729726791381836, + "learning_rate": 3.4474441551322464e-06, + "loss": 0.0486, + "step": 48565 + }, + { + "epoch": 3.0587086272381985, + "grad_norm": 4.043341159820557, + "learning_rate": 3.4473071125119916e-06, + "loss": 0.0681, + "step": 48566 + }, + { + "epoch": 3.0587221920781333, + "grad_norm": 3.447498321533203, + "learning_rate": 3.4471700698917363e-06, + "loss": 0.0572, + "step": 48567 + }, + { + "epoch": 3.058735756918068, + "grad_norm": 3.234548568725586, + "learning_rate": 3.447033027271482e-06, + "loss": 0.0434, + "step": 48568 + }, + { + "epoch": 3.058749321758003, + "grad_norm": 4.0929107666015625, + "learning_rate": 3.4468959846512267e-06, + "loss": 0.0977, + "step": 48569 + }, + { + "epoch": 3.058762886597938, + "grad_norm": 6.162106513977051, + "learning_rate": 3.4467589420309723e-06, + "loss": 0.1927, + "step": 48570 + }, + { + "epoch": 3.0587764514378732, + "grad_norm": 3.4824531078338623, + "learning_rate": 3.446621899410717e-06, + "loss": 0.0596, + "step": 48571 + }, + { + "epoch": 3.058790016277808, + "grad_norm": 2.753269672393799, + "learning_rate": 3.446484856790462e-06, + "loss": 0.0626, + "step": 48572 + }, + { + "epoch": 3.058803581117743, + "grad_norm": 2.8527727127075195, + "learning_rate": 3.4463478141702074e-06, + "loss": 0.0762, + "step": 48573 + }, + { + "epoch": 3.058817145957678, + "grad_norm": 3.7295989990234375, + "learning_rate": 3.446210771549952e-06, + "loss": 0.1097, + "step": 48574 + }, + { + "epoch": 3.0588307107976127, + "grad_norm": 2.5802512168884277, + "learning_rate": 3.4460737289296973e-06, + "loss": 0.0636, + "step": 48575 + }, + { + "epoch": 3.0588442756375476, + "grad_norm": 2.7790591716766357, + "learning_rate": 3.4459366863094425e-06, + "loss": 0.0712, + "step": 48576 + }, + { + "epoch": 3.0588578404774824, + "grad_norm": 4.311746597290039, + "learning_rate": 3.4457996436891877e-06, + "loss": 0.0684, + "step": 48577 + }, + { + "epoch": 3.0588714053174173, + "grad_norm": 5.687047958374023, + "learning_rate": 3.4456626010689325e-06, + "loss": 0.118, + "step": 48578 + }, + { + "epoch": 3.058884970157352, + "grad_norm": 4.796680927276611, + "learning_rate": 3.445525558448678e-06, + "loss": 0.0816, + "step": 48579 + }, + { + "epoch": 3.058898534997287, + "grad_norm": 2.516022205352783, + "learning_rate": 3.445388515828423e-06, + "loss": 0.0465, + "step": 48580 + }, + { + "epoch": 3.058912099837222, + "grad_norm": 3.888742685317993, + "learning_rate": 3.4452514732081684e-06, + "loss": 0.0936, + "step": 48581 + }, + { + "epoch": 3.0589256646771568, + "grad_norm": 4.006579399108887, + "learning_rate": 3.445114430587913e-06, + "loss": 0.0876, + "step": 48582 + }, + { + "epoch": 3.0589392295170916, + "grad_norm": 4.926144599914551, + "learning_rate": 3.4449773879676584e-06, + "loss": 0.0866, + "step": 48583 + }, + { + "epoch": 3.0589527943570265, + "grad_norm": 3.267061710357666, + "learning_rate": 3.444840345347403e-06, + "loss": 0.0526, + "step": 48584 + }, + { + "epoch": 3.0589663591969614, + "grad_norm": 1.7694939374923706, + "learning_rate": 3.4447033027271487e-06, + "loss": 0.0281, + "step": 48585 + }, + { + "epoch": 3.0589799240368962, + "grad_norm": 5.641242027282715, + "learning_rate": 3.4445662601068935e-06, + "loss": 0.1248, + "step": 48586 + }, + { + "epoch": 3.058993488876831, + "grad_norm": 3.382888078689575, + "learning_rate": 3.4444292174866382e-06, + "loss": 0.0837, + "step": 48587 + }, + { + "epoch": 3.059007053716766, + "grad_norm": 4.85176420211792, + "learning_rate": 3.444292174866384e-06, + "loss": 0.1309, + "step": 48588 + }, + { + "epoch": 3.059020618556701, + "grad_norm": 5.0577239990234375, + "learning_rate": 3.4441551322461286e-06, + "loss": 0.1228, + "step": 48589 + }, + { + "epoch": 3.059034183396636, + "grad_norm": 4.040360927581787, + "learning_rate": 3.444018089625874e-06, + "loss": 0.0872, + "step": 48590 + }, + { + "epoch": 3.059047748236571, + "grad_norm": 4.969161510467529, + "learning_rate": 3.443881047005619e-06, + "loss": 0.1351, + "step": 48591 + }, + { + "epoch": 3.059061313076506, + "grad_norm": 3.312788486480713, + "learning_rate": 3.443744004385364e-06, + "loss": 0.0829, + "step": 48592 + }, + { + "epoch": 3.0590748779164407, + "grad_norm": 3.2383084297180176, + "learning_rate": 3.4436069617651093e-06, + "loss": 0.0875, + "step": 48593 + }, + { + "epoch": 3.0590884427563756, + "grad_norm": 4.0812602043151855, + "learning_rate": 3.4434699191448545e-06, + "loss": 0.0606, + "step": 48594 + }, + { + "epoch": 3.0591020075963105, + "grad_norm": 3.0231804847717285, + "learning_rate": 3.4433328765245992e-06, + "loss": 0.0893, + "step": 48595 + }, + { + "epoch": 3.0591155724362453, + "grad_norm": 3.5610597133636475, + "learning_rate": 3.443195833904345e-06, + "loss": 0.1092, + "step": 48596 + }, + { + "epoch": 3.05912913727618, + "grad_norm": 4.055371284484863, + "learning_rate": 3.4430587912840896e-06, + "loss": 0.0555, + "step": 48597 + }, + { + "epoch": 3.059142702116115, + "grad_norm": 6.532557487487793, + "learning_rate": 3.442921748663835e-06, + "loss": 0.1401, + "step": 48598 + }, + { + "epoch": 3.05915626695605, + "grad_norm": 4.599536418914795, + "learning_rate": 3.44278470604358e-06, + "loss": 0.0675, + "step": 48599 + }, + { + "epoch": 3.059169831795985, + "grad_norm": 4.438448905944824, + "learning_rate": 3.4426476634233247e-06, + "loss": 0.1624, + "step": 48600 + }, + { + "epoch": 3.0591833966359196, + "grad_norm": 4.6176276206970215, + "learning_rate": 3.44251062080307e-06, + "loss": 0.1035, + "step": 48601 + }, + { + "epoch": 3.0591969614758545, + "grad_norm": 5.033537864685059, + "learning_rate": 3.442373578182815e-06, + "loss": 0.1139, + "step": 48602 + }, + { + "epoch": 3.0592105263157894, + "grad_norm": 4.1726579666137695, + "learning_rate": 3.4422365355625602e-06, + "loss": 0.0828, + "step": 48603 + }, + { + "epoch": 3.0592240911557242, + "grad_norm": 5.15648889541626, + "learning_rate": 3.442099492942305e-06, + "loss": 0.1276, + "step": 48604 + }, + { + "epoch": 3.059237655995659, + "grad_norm": 3.6241087913513184, + "learning_rate": 3.4419624503220506e-06, + "loss": 0.1254, + "step": 48605 + }, + { + "epoch": 3.059251220835594, + "grad_norm": 3.1860899925231934, + "learning_rate": 3.4418254077017953e-06, + "loss": 0.0788, + "step": 48606 + }, + { + "epoch": 3.059264785675529, + "grad_norm": 8.301939010620117, + "learning_rate": 3.441688365081541e-06, + "loss": 0.1008, + "step": 48607 + }, + { + "epoch": 3.0592783505154637, + "grad_norm": 4.953678131103516, + "learning_rate": 3.4415513224612857e-06, + "loss": 0.1228, + "step": 48608 + }, + { + "epoch": 3.059291915355399, + "grad_norm": 4.670387268066406, + "learning_rate": 3.441414279841031e-06, + "loss": 0.1406, + "step": 48609 + }, + { + "epoch": 3.059305480195334, + "grad_norm": 3.976329803466797, + "learning_rate": 3.441277237220776e-06, + "loss": 0.1075, + "step": 48610 + }, + { + "epoch": 3.0593190450352687, + "grad_norm": 3.632676362991333, + "learning_rate": 3.4411401946005212e-06, + "loss": 0.1401, + "step": 48611 + }, + { + "epoch": 3.0593326098752036, + "grad_norm": 3.371140718460083, + "learning_rate": 3.441003151980266e-06, + "loss": 0.0826, + "step": 48612 + }, + { + "epoch": 3.0593461747151385, + "grad_norm": 4.059020519256592, + "learning_rate": 3.440866109360011e-06, + "loss": 0.0926, + "step": 48613 + }, + { + "epoch": 3.0593597395550733, + "grad_norm": 4.034318923950195, + "learning_rate": 3.4407290667397564e-06, + "loss": 0.1122, + "step": 48614 + }, + { + "epoch": 3.059373304395008, + "grad_norm": 3.8566384315490723, + "learning_rate": 3.440592024119501e-06, + "loss": 0.0724, + "step": 48615 + }, + { + "epoch": 3.059386869234943, + "grad_norm": 4.405250072479248, + "learning_rate": 3.4404549814992467e-06, + "loss": 0.1062, + "step": 48616 + }, + { + "epoch": 3.059400434074878, + "grad_norm": 3.6131904125213623, + "learning_rate": 3.4403179388789915e-06, + "loss": 0.0559, + "step": 48617 + }, + { + "epoch": 3.059413998914813, + "grad_norm": 3.7983996868133545, + "learning_rate": 3.4401808962587366e-06, + "loss": 0.0982, + "step": 48618 + }, + { + "epoch": 3.0594275637547477, + "grad_norm": 4.084929943084717, + "learning_rate": 3.440043853638482e-06, + "loss": 0.0801, + "step": 48619 + }, + { + "epoch": 3.0594411285946825, + "grad_norm": 3.0764923095703125, + "learning_rate": 3.439906811018227e-06, + "loss": 0.0852, + "step": 48620 + }, + { + "epoch": 3.0594546934346174, + "grad_norm": 3.7368462085723877, + "learning_rate": 3.4397697683979718e-06, + "loss": 0.0821, + "step": 48621 + }, + { + "epoch": 3.0594682582745523, + "grad_norm": 6.1114702224731445, + "learning_rate": 3.4396327257777174e-06, + "loss": 0.1782, + "step": 48622 + }, + { + "epoch": 3.059481823114487, + "grad_norm": 4.1551618576049805, + "learning_rate": 3.439495683157462e-06, + "loss": 0.1117, + "step": 48623 + }, + { + "epoch": 3.059495387954422, + "grad_norm": 7.024766445159912, + "learning_rate": 3.4393586405372077e-06, + "loss": 0.1804, + "step": 48624 + }, + { + "epoch": 3.059508952794357, + "grad_norm": 5.222872257232666, + "learning_rate": 3.4392215979169525e-06, + "loss": 0.1355, + "step": 48625 + }, + { + "epoch": 3.0595225176342917, + "grad_norm": 3.2129688262939453, + "learning_rate": 3.4390845552966977e-06, + "loss": 0.1265, + "step": 48626 + }, + { + "epoch": 3.0595360824742266, + "grad_norm": 6.245869159698486, + "learning_rate": 3.438947512676443e-06, + "loss": 0.1062, + "step": 48627 + }, + { + "epoch": 3.059549647314162, + "grad_norm": 3.5660641193389893, + "learning_rate": 3.4388104700561876e-06, + "loss": 0.0643, + "step": 48628 + }, + { + "epoch": 3.0595632121540968, + "grad_norm": 3.3105437755584717, + "learning_rate": 3.4386734274359328e-06, + "loss": 0.0581, + "step": 48629 + }, + { + "epoch": 3.0595767769940316, + "grad_norm": 4.394432067871094, + "learning_rate": 3.438536384815678e-06, + "loss": 0.07, + "step": 48630 + }, + { + "epoch": 3.0595903418339665, + "grad_norm": 3.793782949447632, + "learning_rate": 3.438399342195423e-06, + "loss": 0.079, + "step": 48631 + }, + { + "epoch": 3.0596039066739014, + "grad_norm": 5.375168800354004, + "learning_rate": 3.438262299575168e-06, + "loss": 0.1302, + "step": 48632 + }, + { + "epoch": 3.0596174715138362, + "grad_norm": 3.812861204147339, + "learning_rate": 3.4381252569549135e-06, + "loss": 0.0884, + "step": 48633 + }, + { + "epoch": 3.059631036353771, + "grad_norm": 4.390145301818848, + "learning_rate": 3.4379882143346582e-06, + "loss": 0.0852, + "step": 48634 + }, + { + "epoch": 3.059644601193706, + "grad_norm": 3.982309341430664, + "learning_rate": 3.437851171714404e-06, + "loss": 0.1064, + "step": 48635 + }, + { + "epoch": 3.059658166033641, + "grad_norm": 4.383361339569092, + "learning_rate": 3.4377141290941486e-06, + "loss": 0.1052, + "step": 48636 + }, + { + "epoch": 3.0596717308735757, + "grad_norm": 3.6706645488739014, + "learning_rate": 3.4375770864738938e-06, + "loss": 0.1402, + "step": 48637 + }, + { + "epoch": 3.0596852957135106, + "grad_norm": 4.231591701507568, + "learning_rate": 3.4374400438536385e-06, + "loss": 0.1332, + "step": 48638 + }, + { + "epoch": 3.0596988605534454, + "grad_norm": 3.7432565689086914, + "learning_rate": 3.437303001233384e-06, + "loss": 0.0863, + "step": 48639 + }, + { + "epoch": 3.0597124253933803, + "grad_norm": 4.278861999511719, + "learning_rate": 3.437165958613129e-06, + "loss": 0.1055, + "step": 48640 + }, + { + "epoch": 3.059725990233315, + "grad_norm": 2.987584114074707, + "learning_rate": 3.4370289159928736e-06, + "loss": 0.093, + "step": 48641 + }, + { + "epoch": 3.05973955507325, + "grad_norm": 3.4040708541870117, + "learning_rate": 3.4368918733726192e-06, + "loss": 0.0762, + "step": 48642 + }, + { + "epoch": 3.059753119913185, + "grad_norm": 5.535431385040283, + "learning_rate": 3.436754830752364e-06, + "loss": 0.1282, + "step": 48643 + }, + { + "epoch": 3.0597666847531197, + "grad_norm": 3.235177993774414, + "learning_rate": 3.4366177881321096e-06, + "loss": 0.0757, + "step": 48644 + }, + { + "epoch": 3.0597802495930546, + "grad_norm": 4.326260089874268, + "learning_rate": 3.4364807455118544e-06, + "loss": 0.1161, + "step": 48645 + }, + { + "epoch": 3.0597938144329895, + "grad_norm": 4.468843936920166, + "learning_rate": 3.4363437028915995e-06, + "loss": 0.0981, + "step": 48646 + }, + { + "epoch": 3.059807379272925, + "grad_norm": 3.126798152923584, + "learning_rate": 3.4362066602713447e-06, + "loss": 0.0955, + "step": 48647 + }, + { + "epoch": 3.0598209441128597, + "grad_norm": 4.480541706085205, + "learning_rate": 3.43606961765109e-06, + "loss": 0.1031, + "step": 48648 + }, + { + "epoch": 3.0598345089527945, + "grad_norm": 3.9791786670684814, + "learning_rate": 3.4359325750308347e-06, + "loss": 0.1637, + "step": 48649 + }, + { + "epoch": 3.0598480737927294, + "grad_norm": 3.7060558795928955, + "learning_rate": 3.4357955324105803e-06, + "loss": 0.0993, + "step": 48650 + }, + { + "epoch": 3.0598616386326642, + "grad_norm": 3.385849952697754, + "learning_rate": 3.435658489790325e-06, + "loss": 0.1029, + "step": 48651 + }, + { + "epoch": 3.059875203472599, + "grad_norm": 5.135103702545166, + "learning_rate": 3.4355214471700706e-06, + "loss": 0.2018, + "step": 48652 + }, + { + "epoch": 3.059888768312534, + "grad_norm": 4.230329513549805, + "learning_rate": 3.4353844045498154e-06, + "loss": 0.1187, + "step": 48653 + }, + { + "epoch": 3.059902333152469, + "grad_norm": 3.247793674468994, + "learning_rate": 3.43524736192956e-06, + "loss": 0.0799, + "step": 48654 + }, + { + "epoch": 3.0599158979924037, + "grad_norm": 3.754812002182007, + "learning_rate": 3.4351103193093053e-06, + "loss": 0.0942, + "step": 48655 + }, + { + "epoch": 3.0599294628323386, + "grad_norm": 4.511361598968506, + "learning_rate": 3.4349732766890505e-06, + "loss": 0.0755, + "step": 48656 + }, + { + "epoch": 3.0599430276722734, + "grad_norm": 5.974640846252441, + "learning_rate": 3.4348362340687957e-06, + "loss": 0.1484, + "step": 48657 + }, + { + "epoch": 3.0599565925122083, + "grad_norm": 6.492243766784668, + "learning_rate": 3.4346991914485404e-06, + "loss": 0.1211, + "step": 48658 + }, + { + "epoch": 3.059970157352143, + "grad_norm": 3.7363829612731934, + "learning_rate": 3.434562148828286e-06, + "loss": 0.1019, + "step": 48659 + }, + { + "epoch": 3.059983722192078, + "grad_norm": 4.363400936126709, + "learning_rate": 3.4344251062080308e-06, + "loss": 0.1238, + "step": 48660 + }, + { + "epoch": 3.059997287032013, + "grad_norm": 4.034008502960205, + "learning_rate": 3.4342880635877764e-06, + "loss": 0.0725, + "step": 48661 + }, + { + "epoch": 3.0600108518719478, + "grad_norm": 6.516880512237549, + "learning_rate": 3.434151020967521e-06, + "loss": 0.1379, + "step": 48662 + }, + { + "epoch": 3.0600244167118826, + "grad_norm": 4.163651466369629, + "learning_rate": 3.4340139783472663e-06, + "loss": 0.0922, + "step": 48663 + }, + { + "epoch": 3.0600379815518175, + "grad_norm": 3.159818649291992, + "learning_rate": 3.4338769357270115e-06, + "loss": 0.0765, + "step": 48664 + }, + { + "epoch": 3.0600515463917524, + "grad_norm": 4.050711631774902, + "learning_rate": 3.4337398931067567e-06, + "loss": 0.1337, + "step": 48665 + }, + { + "epoch": 3.0600651112316877, + "grad_norm": 4.7145256996154785, + "learning_rate": 3.4336028504865014e-06, + "loss": 0.1321, + "step": 48666 + }, + { + "epoch": 3.0600786760716225, + "grad_norm": 5.36901330947876, + "learning_rate": 3.433465807866247e-06, + "loss": 0.15, + "step": 48667 + }, + { + "epoch": 3.0600922409115574, + "grad_norm": 4.608122825622559, + "learning_rate": 3.4333287652459918e-06, + "loss": 0.1363, + "step": 48668 + }, + { + "epoch": 3.0601058057514923, + "grad_norm": 5.129957675933838, + "learning_rate": 3.4331917226257365e-06, + "loss": 0.1102, + "step": 48669 + }, + { + "epoch": 3.060119370591427, + "grad_norm": 4.8166303634643555, + "learning_rate": 3.433054680005482e-06, + "loss": 0.1897, + "step": 48670 + }, + { + "epoch": 3.060132935431362, + "grad_norm": 3.385927200317383, + "learning_rate": 3.432917637385227e-06, + "loss": 0.0908, + "step": 48671 + }, + { + "epoch": 3.060146500271297, + "grad_norm": 4.49239444732666, + "learning_rate": 3.432780594764972e-06, + "loss": 0.1424, + "step": 48672 + }, + { + "epoch": 3.0601600651112317, + "grad_norm": 3.4637954235076904, + "learning_rate": 3.4326435521447173e-06, + "loss": 0.0908, + "step": 48673 + }, + { + "epoch": 3.0601736299511666, + "grad_norm": 3.5891313552856445, + "learning_rate": 3.4325065095244624e-06, + "loss": 0.0902, + "step": 48674 + }, + { + "epoch": 3.0601871947911015, + "grad_norm": 6.6175689697265625, + "learning_rate": 3.432369466904207e-06, + "loss": 0.1699, + "step": 48675 + }, + { + "epoch": 3.0602007596310363, + "grad_norm": 5.034763813018799, + "learning_rate": 3.432232424283953e-06, + "loss": 0.0869, + "step": 48676 + }, + { + "epoch": 3.060214324470971, + "grad_norm": 3.6382498741149902, + "learning_rate": 3.4320953816636975e-06, + "loss": 0.061, + "step": 48677 + }, + { + "epoch": 3.060227889310906, + "grad_norm": 4.79609489440918, + "learning_rate": 3.431958339043443e-06, + "loss": 0.1143, + "step": 48678 + }, + { + "epoch": 3.060241454150841, + "grad_norm": 4.253261089324951, + "learning_rate": 3.431821296423188e-06, + "loss": 0.103, + "step": 48679 + }, + { + "epoch": 3.060255018990776, + "grad_norm": 4.226440906524658, + "learning_rate": 3.431684253802933e-06, + "loss": 0.0455, + "step": 48680 + }, + { + "epoch": 3.0602685838307107, + "grad_norm": 3.351696014404297, + "learning_rate": 3.4315472111826783e-06, + "loss": 0.0974, + "step": 48681 + }, + { + "epoch": 3.0602821486706455, + "grad_norm": 4.880884170532227, + "learning_rate": 3.431410168562423e-06, + "loss": 0.109, + "step": 48682 + }, + { + "epoch": 3.0602957135105804, + "grad_norm": 4.677700042724609, + "learning_rate": 3.431273125942168e-06, + "loss": 0.1343, + "step": 48683 + }, + { + "epoch": 3.0603092783505152, + "grad_norm": 6.327657699584961, + "learning_rate": 3.4311360833219134e-06, + "loss": 0.1497, + "step": 48684 + }, + { + "epoch": 3.0603228431904506, + "grad_norm": 3.70788311958313, + "learning_rate": 3.4309990407016586e-06, + "loss": 0.1137, + "step": 48685 + }, + { + "epoch": 3.0603364080303854, + "grad_norm": 3.4968433380126953, + "learning_rate": 3.4308619980814033e-06, + "loss": 0.082, + "step": 48686 + }, + { + "epoch": 3.0603499728703203, + "grad_norm": 3.6916236877441406, + "learning_rate": 3.430724955461149e-06, + "loss": 0.1227, + "step": 48687 + }, + { + "epoch": 3.060363537710255, + "grad_norm": 2.370004653930664, + "learning_rate": 3.4305879128408937e-06, + "loss": 0.0512, + "step": 48688 + }, + { + "epoch": 3.06037710255019, + "grad_norm": 6.425679683685303, + "learning_rate": 3.430450870220639e-06, + "loss": 0.2, + "step": 48689 + }, + { + "epoch": 3.060390667390125, + "grad_norm": 3.631619453430176, + "learning_rate": 3.430313827600384e-06, + "loss": 0.0549, + "step": 48690 + }, + { + "epoch": 3.0604042322300598, + "grad_norm": 3.937490940093994, + "learning_rate": 3.430176784980129e-06, + "loss": 0.1368, + "step": 48691 + }, + { + "epoch": 3.0604177970699946, + "grad_norm": 3.792086124420166, + "learning_rate": 3.430039742359874e-06, + "loss": 0.1115, + "step": 48692 + }, + { + "epoch": 3.0604313619099295, + "grad_norm": 3.9190943241119385, + "learning_rate": 3.4299026997396196e-06, + "loss": 0.0735, + "step": 48693 + }, + { + "epoch": 3.0604449267498643, + "grad_norm": 6.580208778381348, + "learning_rate": 3.4297656571193643e-06, + "loss": 0.1099, + "step": 48694 + }, + { + "epoch": 3.060458491589799, + "grad_norm": 3.836841583251953, + "learning_rate": 3.42962861449911e-06, + "loss": 0.1009, + "step": 48695 + }, + { + "epoch": 3.060472056429734, + "grad_norm": 3.773444890975952, + "learning_rate": 3.4294915718788547e-06, + "loss": 0.0933, + "step": 48696 + }, + { + "epoch": 3.060485621269669, + "grad_norm": 2.9590909481048584, + "learning_rate": 3.4293545292585994e-06, + "loss": 0.0807, + "step": 48697 + }, + { + "epoch": 3.060499186109604, + "grad_norm": 3.7153255939483643, + "learning_rate": 3.429217486638345e-06, + "loss": 0.0953, + "step": 48698 + }, + { + "epoch": 3.0605127509495387, + "grad_norm": 4.562044143676758, + "learning_rate": 3.4290804440180898e-06, + "loss": 0.1056, + "step": 48699 + }, + { + "epoch": 3.0605263157894735, + "grad_norm": 3.235145330429077, + "learning_rate": 3.428943401397835e-06, + "loss": 0.0861, + "step": 48700 + }, + { + "epoch": 3.0605398806294084, + "grad_norm": 4.836298942565918, + "learning_rate": 3.42880635877758e-06, + "loss": 0.0984, + "step": 48701 + }, + { + "epoch": 3.0605534454693433, + "grad_norm": 4.2339301109313965, + "learning_rate": 3.4286693161573253e-06, + "loss": 0.0714, + "step": 48702 + }, + { + "epoch": 3.060567010309278, + "grad_norm": 4.285900115966797, + "learning_rate": 3.42853227353707e-06, + "loss": 0.0826, + "step": 48703 + }, + { + "epoch": 3.0605805751492134, + "grad_norm": 4.393596172332764, + "learning_rate": 3.4283952309168157e-06, + "loss": 0.1399, + "step": 48704 + }, + { + "epoch": 3.0605941399891483, + "grad_norm": 2.747282028198242, + "learning_rate": 3.4282581882965604e-06, + "loss": 0.0471, + "step": 48705 + }, + { + "epoch": 3.060607704829083, + "grad_norm": 2.634256362915039, + "learning_rate": 3.428121145676306e-06, + "loss": 0.0978, + "step": 48706 + }, + { + "epoch": 3.060621269669018, + "grad_norm": 10.269420623779297, + "learning_rate": 3.427984103056051e-06, + "loss": 0.1057, + "step": 48707 + }, + { + "epoch": 3.060634834508953, + "grad_norm": 3.0621654987335205, + "learning_rate": 3.427847060435796e-06, + "loss": 0.0678, + "step": 48708 + }, + { + "epoch": 3.0606483993488878, + "grad_norm": 4.243135452270508, + "learning_rate": 3.4277100178155407e-06, + "loss": 0.0904, + "step": 48709 + }, + { + "epoch": 3.0606619641888226, + "grad_norm": 4.13405704498291, + "learning_rate": 3.427572975195286e-06, + "loss": 0.0416, + "step": 48710 + }, + { + "epoch": 3.0606755290287575, + "grad_norm": 2.1559298038482666, + "learning_rate": 3.427435932575031e-06, + "loss": 0.0365, + "step": 48711 + }, + { + "epoch": 3.0606890938686924, + "grad_norm": 2.3831238746643066, + "learning_rate": 3.427298889954776e-06, + "loss": 0.0615, + "step": 48712 + }, + { + "epoch": 3.0607026587086272, + "grad_norm": 3.473134994506836, + "learning_rate": 3.4271618473345214e-06, + "loss": 0.1066, + "step": 48713 + }, + { + "epoch": 3.060716223548562, + "grad_norm": 2.824575185775757, + "learning_rate": 3.427024804714266e-06, + "loss": 0.104, + "step": 48714 + }, + { + "epoch": 3.060729788388497, + "grad_norm": 4.543121814727783, + "learning_rate": 3.426887762094012e-06, + "loss": 0.0655, + "step": 48715 + }, + { + "epoch": 3.060743353228432, + "grad_norm": 6.7552361488342285, + "learning_rate": 3.4267507194737566e-06, + "loss": 0.0892, + "step": 48716 + }, + { + "epoch": 3.0607569180683667, + "grad_norm": 3.7911596298217773, + "learning_rate": 3.4266136768535017e-06, + "loss": 0.1568, + "step": 48717 + }, + { + "epoch": 3.0607704829083016, + "grad_norm": 3.524477958679199, + "learning_rate": 3.426476634233247e-06, + "loss": 0.1069, + "step": 48718 + }, + { + "epoch": 3.0607840477482364, + "grad_norm": 5.225085258483887, + "learning_rate": 3.426339591612992e-06, + "loss": 0.1161, + "step": 48719 + }, + { + "epoch": 3.0607976125881713, + "grad_norm": 4.1100592613220215, + "learning_rate": 3.426202548992737e-06, + "loss": 0.0864, + "step": 48720 + }, + { + "epoch": 3.060811177428106, + "grad_norm": 2.9681780338287354, + "learning_rate": 3.4260655063724825e-06, + "loss": 0.0679, + "step": 48721 + }, + { + "epoch": 3.060824742268041, + "grad_norm": 4.135751724243164, + "learning_rate": 3.425928463752227e-06, + "loss": 0.0937, + "step": 48722 + }, + { + "epoch": 3.0608383071079763, + "grad_norm": 3.8625290393829346, + "learning_rate": 3.425791421131972e-06, + "loss": 0.1089, + "step": 48723 + }, + { + "epoch": 3.060851871947911, + "grad_norm": 3.195106267929077, + "learning_rate": 3.4256543785117176e-06, + "loss": 0.0751, + "step": 48724 + }, + { + "epoch": 3.060865436787846, + "grad_norm": 4.537188529968262, + "learning_rate": 3.4255173358914623e-06, + "loss": 0.083, + "step": 48725 + }, + { + "epoch": 3.060879001627781, + "grad_norm": 5.819281101226807, + "learning_rate": 3.4253802932712075e-06, + "loss": 0.1363, + "step": 48726 + }, + { + "epoch": 3.060892566467716, + "grad_norm": 5.290797710418701, + "learning_rate": 3.4252432506509527e-06, + "loss": 0.1565, + "step": 48727 + }, + { + "epoch": 3.0609061313076507, + "grad_norm": 4.331011772155762, + "learning_rate": 3.425106208030698e-06, + "loss": 0.1173, + "step": 48728 + }, + { + "epoch": 3.0609196961475855, + "grad_norm": 3.6627044677734375, + "learning_rate": 3.4249691654104426e-06, + "loss": 0.11, + "step": 48729 + }, + { + "epoch": 3.0609332609875204, + "grad_norm": 5.525667190551758, + "learning_rate": 3.4248321227901882e-06, + "loss": 0.1814, + "step": 48730 + }, + { + "epoch": 3.0609468258274553, + "grad_norm": 4.153732776641846, + "learning_rate": 3.424695080169933e-06, + "loss": 0.1137, + "step": 48731 + }, + { + "epoch": 3.06096039066739, + "grad_norm": 3.562222957611084, + "learning_rate": 3.4245580375496786e-06, + "loss": 0.0957, + "step": 48732 + }, + { + "epoch": 3.060973955507325, + "grad_norm": 3.889887571334839, + "learning_rate": 3.4244209949294233e-06, + "loss": 0.1194, + "step": 48733 + }, + { + "epoch": 3.06098752034726, + "grad_norm": 3.689865827560425, + "learning_rate": 3.4242839523091685e-06, + "loss": 0.0454, + "step": 48734 + }, + { + "epoch": 3.0610010851871947, + "grad_norm": 5.081212520599365, + "learning_rate": 3.4241469096889137e-06, + "loss": 0.123, + "step": 48735 + }, + { + "epoch": 3.0610146500271296, + "grad_norm": 5.244605541229248, + "learning_rate": 3.424009867068659e-06, + "loss": 0.1078, + "step": 48736 + }, + { + "epoch": 3.0610282148670644, + "grad_norm": 4.633152008056641, + "learning_rate": 3.4238728244484036e-06, + "loss": 0.0976, + "step": 48737 + }, + { + "epoch": 3.0610417797069993, + "grad_norm": 4.665870189666748, + "learning_rate": 3.423735781828149e-06, + "loss": 0.1144, + "step": 48738 + }, + { + "epoch": 3.061055344546934, + "grad_norm": 3.1650123596191406, + "learning_rate": 3.423598739207894e-06, + "loss": 0.0783, + "step": 48739 + }, + { + "epoch": 3.061068909386869, + "grad_norm": 3.9532878398895264, + "learning_rate": 3.4234616965876387e-06, + "loss": 0.0608, + "step": 48740 + }, + { + "epoch": 3.061082474226804, + "grad_norm": 6.889862060546875, + "learning_rate": 3.4233246539673843e-06, + "loss": 0.1831, + "step": 48741 + }, + { + "epoch": 3.061096039066739, + "grad_norm": 3.881382465362549, + "learning_rate": 3.423187611347129e-06, + "loss": 0.1252, + "step": 48742 + }, + { + "epoch": 3.061109603906674, + "grad_norm": 3.0350403785705566, + "learning_rate": 3.4230505687268743e-06, + "loss": 0.0523, + "step": 48743 + }, + { + "epoch": 3.061123168746609, + "grad_norm": 4.303310871124268, + "learning_rate": 3.4229135261066194e-06, + "loss": 0.0837, + "step": 48744 + }, + { + "epoch": 3.061136733586544, + "grad_norm": 5.752620697021484, + "learning_rate": 3.4227764834863646e-06, + "loss": 0.1399, + "step": 48745 + }, + { + "epoch": 3.0611502984264787, + "grad_norm": 5.521579742431641, + "learning_rate": 3.4226394408661094e-06, + "loss": 0.1172, + "step": 48746 + }, + { + "epoch": 3.0611638632664135, + "grad_norm": 4.403937816619873, + "learning_rate": 3.422502398245855e-06, + "loss": 0.0811, + "step": 48747 + }, + { + "epoch": 3.0611774281063484, + "grad_norm": 5.0596184730529785, + "learning_rate": 3.4223653556255997e-06, + "loss": 0.1162, + "step": 48748 + }, + { + "epoch": 3.0611909929462833, + "grad_norm": 5.026480197906494, + "learning_rate": 3.4222283130053453e-06, + "loss": 0.076, + "step": 48749 + }, + { + "epoch": 3.061204557786218, + "grad_norm": 3.266242265701294, + "learning_rate": 3.42209127038509e-06, + "loss": 0.0614, + "step": 48750 + }, + { + "epoch": 3.061218122626153, + "grad_norm": 4.9296088218688965, + "learning_rate": 3.421954227764835e-06, + "loss": 0.0998, + "step": 48751 + }, + { + "epoch": 3.061231687466088, + "grad_norm": 3.3350586891174316, + "learning_rate": 3.4218171851445805e-06, + "loss": 0.0888, + "step": 48752 + }, + { + "epoch": 3.0612452523060227, + "grad_norm": 3.561464548110962, + "learning_rate": 3.421680142524325e-06, + "loss": 0.0669, + "step": 48753 + }, + { + "epoch": 3.0612588171459576, + "grad_norm": 4.26938009262085, + "learning_rate": 3.4215430999040704e-06, + "loss": 0.0789, + "step": 48754 + }, + { + "epoch": 3.0612723819858925, + "grad_norm": 6.005220890045166, + "learning_rate": 3.4214060572838156e-06, + "loss": 0.136, + "step": 48755 + }, + { + "epoch": 3.0612859468258273, + "grad_norm": 3.04461407661438, + "learning_rate": 3.4212690146635607e-06, + "loss": 0.0619, + "step": 48756 + }, + { + "epoch": 3.061299511665762, + "grad_norm": 6.4861979484558105, + "learning_rate": 3.4211319720433055e-06, + "loss": 0.1272, + "step": 48757 + }, + { + "epoch": 3.061313076505697, + "grad_norm": 4.428889751434326, + "learning_rate": 3.420994929423051e-06, + "loss": 0.1021, + "step": 48758 + }, + { + "epoch": 3.061326641345632, + "grad_norm": 5.281325817108154, + "learning_rate": 3.420857886802796e-06, + "loss": 0.1036, + "step": 48759 + }, + { + "epoch": 3.061340206185567, + "grad_norm": 4.1941447257995605, + "learning_rate": 3.420720844182541e-06, + "loss": 0.0866, + "step": 48760 + }, + { + "epoch": 3.061353771025502, + "grad_norm": 4.734994411468506, + "learning_rate": 3.4205838015622862e-06, + "loss": 0.09, + "step": 48761 + }, + { + "epoch": 3.061367335865437, + "grad_norm": 3.231644630432129, + "learning_rate": 3.4204467589420314e-06, + "loss": 0.0594, + "step": 48762 + }, + { + "epoch": 3.061380900705372, + "grad_norm": 2.9347381591796875, + "learning_rate": 3.420309716321776e-06, + "loss": 0.0521, + "step": 48763 + }, + { + "epoch": 3.0613944655453067, + "grad_norm": 3.1927521228790283, + "learning_rate": 3.4201726737015218e-06, + "loss": 0.0839, + "step": 48764 + }, + { + "epoch": 3.0614080303852416, + "grad_norm": 6.638393878936768, + "learning_rate": 3.4200356310812665e-06, + "loss": 0.1349, + "step": 48765 + }, + { + "epoch": 3.0614215952251764, + "grad_norm": 4.194842338562012, + "learning_rate": 3.4198985884610113e-06, + "loss": 0.1335, + "step": 48766 + }, + { + "epoch": 3.0614351600651113, + "grad_norm": 4.8670268058776855, + "learning_rate": 3.419761545840757e-06, + "loss": 0.0833, + "step": 48767 + }, + { + "epoch": 3.061448724905046, + "grad_norm": 4.507717609405518, + "learning_rate": 3.4196245032205016e-06, + "loss": 0.0831, + "step": 48768 + }, + { + "epoch": 3.061462289744981, + "grad_norm": 4.2783074378967285, + "learning_rate": 3.4194874606002472e-06, + "loss": 0.1329, + "step": 48769 + }, + { + "epoch": 3.061475854584916, + "grad_norm": 3.00161075592041, + "learning_rate": 3.419350417979992e-06, + "loss": 0.0643, + "step": 48770 + }, + { + "epoch": 3.0614894194248508, + "grad_norm": 6.491103649139404, + "learning_rate": 3.419213375359737e-06, + "loss": 0.1125, + "step": 48771 + }, + { + "epoch": 3.0615029842647856, + "grad_norm": 13.972576141357422, + "learning_rate": 3.4190763327394823e-06, + "loss": 0.1502, + "step": 48772 + }, + { + "epoch": 3.0615165491047205, + "grad_norm": 5.565570831298828, + "learning_rate": 3.4189392901192275e-06, + "loss": 0.0853, + "step": 48773 + }, + { + "epoch": 3.0615301139446554, + "grad_norm": 2.8673312664031982, + "learning_rate": 3.4188022474989723e-06, + "loss": 0.0565, + "step": 48774 + }, + { + "epoch": 3.06154367878459, + "grad_norm": 3.984598159790039, + "learning_rate": 3.418665204878718e-06, + "loss": 0.1289, + "step": 48775 + }, + { + "epoch": 3.061557243624525, + "grad_norm": 4.588527679443359, + "learning_rate": 3.4185281622584626e-06, + "loss": 0.1047, + "step": 48776 + }, + { + "epoch": 3.06157080846446, + "grad_norm": 7.1942315101623535, + "learning_rate": 3.4183911196382082e-06, + "loss": 0.1141, + "step": 48777 + }, + { + "epoch": 3.061584373304395, + "grad_norm": 4.196471691131592, + "learning_rate": 3.418254077017953e-06, + "loss": 0.1256, + "step": 48778 + }, + { + "epoch": 3.06159793814433, + "grad_norm": 4.9591193199157715, + "learning_rate": 3.4181170343976977e-06, + "loss": 0.1109, + "step": 48779 + }, + { + "epoch": 3.061611502984265, + "grad_norm": 5.075167179107666, + "learning_rate": 3.417979991777443e-06, + "loss": 0.1566, + "step": 48780 + }, + { + "epoch": 3.0616250678242, + "grad_norm": 4.399041652679443, + "learning_rate": 3.417842949157188e-06, + "loss": 0.1451, + "step": 48781 + }, + { + "epoch": 3.0616386326641347, + "grad_norm": 4.6859211921691895, + "learning_rate": 3.4177059065369333e-06, + "loss": 0.159, + "step": 48782 + }, + { + "epoch": 3.0616521975040696, + "grad_norm": 3.8120031356811523, + "learning_rate": 3.417568863916678e-06, + "loss": 0.1045, + "step": 48783 + }, + { + "epoch": 3.0616657623440044, + "grad_norm": 5.30147647857666, + "learning_rate": 3.4174318212964236e-06, + "loss": 0.1062, + "step": 48784 + }, + { + "epoch": 3.0616793271839393, + "grad_norm": 4.411267280578613, + "learning_rate": 3.4172947786761684e-06, + "loss": 0.1607, + "step": 48785 + }, + { + "epoch": 3.061692892023874, + "grad_norm": 3.0961358547210693, + "learning_rate": 3.417157736055914e-06, + "loss": 0.0822, + "step": 48786 + }, + { + "epoch": 3.061706456863809, + "grad_norm": 4.335632801055908, + "learning_rate": 3.4170206934356588e-06, + "loss": 0.1375, + "step": 48787 + }, + { + "epoch": 3.061720021703744, + "grad_norm": 7.246870517730713, + "learning_rate": 3.416883650815404e-06, + "loss": 0.1511, + "step": 48788 + }, + { + "epoch": 3.0617335865436788, + "grad_norm": 6.651096820831299, + "learning_rate": 3.416746608195149e-06, + "loss": 0.151, + "step": 48789 + }, + { + "epoch": 3.0617471513836136, + "grad_norm": 5.21104621887207, + "learning_rate": 3.4166095655748943e-06, + "loss": 0.1094, + "step": 48790 + }, + { + "epoch": 3.0617607162235485, + "grad_norm": 5.12464714050293, + "learning_rate": 3.416472522954639e-06, + "loss": 0.1457, + "step": 48791 + }, + { + "epoch": 3.0617742810634834, + "grad_norm": 3.9105169773101807, + "learning_rate": 3.416335480334384e-06, + "loss": 0.0855, + "step": 48792 + }, + { + "epoch": 3.0617878459034182, + "grad_norm": 4.018906593322754, + "learning_rate": 3.4161984377141294e-06, + "loss": 0.061, + "step": 48793 + }, + { + "epoch": 3.061801410743353, + "grad_norm": 5.883506774902344, + "learning_rate": 3.416061395093874e-06, + "loss": 0.1355, + "step": 48794 + }, + { + "epoch": 3.061814975583288, + "grad_norm": 5.1462907791137695, + "learning_rate": 3.4159243524736198e-06, + "loss": 0.1138, + "step": 48795 + }, + { + "epoch": 3.061828540423223, + "grad_norm": 4.3577880859375, + "learning_rate": 3.4157873098533645e-06, + "loss": 0.1036, + "step": 48796 + }, + { + "epoch": 3.0618421052631577, + "grad_norm": 5.054193019866943, + "learning_rate": 3.4156502672331097e-06, + "loss": 0.182, + "step": 48797 + }, + { + "epoch": 3.0618556701030926, + "grad_norm": 5.95626163482666, + "learning_rate": 3.415513224612855e-06, + "loss": 0.1991, + "step": 48798 + }, + { + "epoch": 3.061869234943028, + "grad_norm": 3.8877620697021484, + "learning_rate": 3.4153761819926e-06, + "loss": 0.1119, + "step": 48799 + }, + { + "epoch": 3.0618827997829627, + "grad_norm": 4.1403961181640625, + "learning_rate": 3.415239139372345e-06, + "loss": 0.1922, + "step": 48800 + }, + { + "epoch": 3.0618963646228976, + "grad_norm": 4.921388149261475, + "learning_rate": 3.4151020967520904e-06, + "loss": 0.1859, + "step": 48801 + }, + { + "epoch": 3.0619099294628325, + "grad_norm": 4.108338832855225, + "learning_rate": 3.414965054131835e-06, + "loss": 0.0923, + "step": 48802 + }, + { + "epoch": 3.0619234943027673, + "grad_norm": 5.886101722717285, + "learning_rate": 3.4148280115115808e-06, + "loss": 0.215, + "step": 48803 + }, + { + "epoch": 3.061937059142702, + "grad_norm": 4.834615707397461, + "learning_rate": 3.4146909688913255e-06, + "loss": 0.1717, + "step": 48804 + }, + { + "epoch": 3.061950623982637, + "grad_norm": 6.1018524169921875, + "learning_rate": 3.4145539262710707e-06, + "loss": 0.162, + "step": 48805 + }, + { + "epoch": 3.061964188822572, + "grad_norm": 4.958333969116211, + "learning_rate": 3.414416883650816e-06, + "loss": 0.1659, + "step": 48806 + }, + { + "epoch": 3.061977753662507, + "grad_norm": 4.381429195404053, + "learning_rate": 3.4142798410305606e-06, + "loss": 0.1173, + "step": 48807 + }, + { + "epoch": 3.0619913185024417, + "grad_norm": 3.1732215881347656, + "learning_rate": 3.414142798410306e-06, + "loss": 0.1053, + "step": 48808 + }, + { + "epoch": 3.0620048833423765, + "grad_norm": 4.198628902435303, + "learning_rate": 3.4140057557900506e-06, + "loss": 0.1112, + "step": 48809 + }, + { + "epoch": 3.0620184481823114, + "grad_norm": 4.713738918304443, + "learning_rate": 3.413868713169796e-06, + "loss": 0.1023, + "step": 48810 + }, + { + "epoch": 3.0620320130222463, + "grad_norm": 3.3993208408355713, + "learning_rate": 3.413731670549541e-06, + "loss": 0.1217, + "step": 48811 + }, + { + "epoch": 3.062045577862181, + "grad_norm": 7.162456035614014, + "learning_rate": 3.4135946279292865e-06, + "loss": 0.2063, + "step": 48812 + }, + { + "epoch": 3.062059142702116, + "grad_norm": 4.45851469039917, + "learning_rate": 3.4134575853090313e-06, + "loss": 0.205, + "step": 48813 + }, + { + "epoch": 3.062072707542051, + "grad_norm": 6.633137226104736, + "learning_rate": 3.4133205426887765e-06, + "loss": 0.1289, + "step": 48814 + }, + { + "epoch": 3.0620862723819857, + "grad_norm": 3.8565142154693604, + "learning_rate": 3.4131835000685216e-06, + "loss": 0.0797, + "step": 48815 + }, + { + "epoch": 3.0620998372219206, + "grad_norm": 3.818981170654297, + "learning_rate": 3.413046457448267e-06, + "loss": 0.1412, + "step": 48816 + }, + { + "epoch": 3.062113402061856, + "grad_norm": 6.569081783294678, + "learning_rate": 3.4129094148280116e-06, + "loss": 0.1506, + "step": 48817 + }, + { + "epoch": 3.0621269669017908, + "grad_norm": 6.072782516479492, + "learning_rate": 3.412772372207757e-06, + "loss": 0.2063, + "step": 48818 + }, + { + "epoch": 3.0621405317417256, + "grad_norm": 3.7459957599639893, + "learning_rate": 3.412635329587502e-06, + "loss": 0.1176, + "step": 48819 + }, + { + "epoch": 3.0621540965816605, + "grad_norm": 3.552854061126709, + "learning_rate": 3.4124982869672467e-06, + "loss": 0.0812, + "step": 48820 + }, + { + "epoch": 3.0621676614215954, + "grad_norm": 4.584230899810791, + "learning_rate": 3.4123612443469923e-06, + "loss": 0.1862, + "step": 48821 + }, + { + "epoch": 3.06218122626153, + "grad_norm": 4.934412479400635, + "learning_rate": 3.412224201726737e-06, + "loss": 0.1727, + "step": 48822 + }, + { + "epoch": 3.062194791101465, + "grad_norm": 3.974763870239258, + "learning_rate": 3.4120871591064826e-06, + "loss": 0.0976, + "step": 48823 + }, + { + "epoch": 3.0622083559414, + "grad_norm": 5.635578155517578, + "learning_rate": 3.4119501164862274e-06, + "loss": 0.1612, + "step": 48824 + }, + { + "epoch": 3.062221920781335, + "grad_norm": 3.643345355987549, + "learning_rate": 3.4118130738659726e-06, + "loss": 0.148, + "step": 48825 + }, + { + "epoch": 3.0622354856212697, + "grad_norm": 3.1062076091766357, + "learning_rate": 3.4116760312457178e-06, + "loss": 0.1026, + "step": 48826 + }, + { + "epoch": 3.0622490504612045, + "grad_norm": 4.056476593017578, + "learning_rate": 3.411538988625463e-06, + "loss": 0.1433, + "step": 48827 + }, + { + "epoch": 3.0622626153011394, + "grad_norm": 4.306588172912598, + "learning_rate": 3.4114019460052077e-06, + "loss": 0.1169, + "step": 48828 + }, + { + "epoch": 3.0622761801410743, + "grad_norm": 5.308005332946777, + "learning_rate": 3.4112649033849533e-06, + "loss": 0.1788, + "step": 48829 + }, + { + "epoch": 3.062289744981009, + "grad_norm": 4.1204142570495605, + "learning_rate": 3.411127860764698e-06, + "loss": 0.1048, + "step": 48830 + }, + { + "epoch": 3.062303309820944, + "grad_norm": 3.3823859691619873, + "learning_rate": 3.4109908181444432e-06, + "loss": 0.0774, + "step": 48831 + }, + { + "epoch": 3.062316874660879, + "grad_norm": 5.572521209716797, + "learning_rate": 3.4108537755241884e-06, + "loss": 0.1965, + "step": 48832 + }, + { + "epoch": 3.0623304395008137, + "grad_norm": 3.627595901489258, + "learning_rate": 3.410716732903933e-06, + "loss": 0.0721, + "step": 48833 + }, + { + "epoch": 3.0623440043407486, + "grad_norm": 7.383589267730713, + "learning_rate": 3.4105796902836783e-06, + "loss": 0.2152, + "step": 48834 + }, + { + "epoch": 3.0623575691806835, + "grad_norm": 5.294255256652832, + "learning_rate": 3.4104426476634235e-06, + "loss": 0.144, + "step": 48835 + }, + { + "epoch": 3.0623711340206183, + "grad_norm": 5.789475440979004, + "learning_rate": 3.4103056050431687e-06, + "loss": 0.1542, + "step": 48836 + }, + { + "epoch": 3.0623846988605536, + "grad_norm": 4.322473049163818, + "learning_rate": 3.4101685624229135e-06, + "loss": 0.1045, + "step": 48837 + }, + { + "epoch": 3.0623982637004885, + "grad_norm": 3.92607045173645, + "learning_rate": 3.410031519802659e-06, + "loss": 0.1548, + "step": 48838 + }, + { + "epoch": 3.0624118285404234, + "grad_norm": 3.6528122425079346, + "learning_rate": 3.409894477182404e-06, + "loss": 0.118, + "step": 48839 + }, + { + "epoch": 3.0624253933803582, + "grad_norm": 4.136981010437012, + "learning_rate": 3.4097574345621494e-06, + "loss": 0.1348, + "step": 48840 + }, + { + "epoch": 3.062438958220293, + "grad_norm": 5.314816474914551, + "learning_rate": 3.409620391941894e-06, + "loss": 0.1048, + "step": 48841 + }, + { + "epoch": 3.062452523060228, + "grad_norm": 3.162039041519165, + "learning_rate": 3.4094833493216394e-06, + "loss": 0.0845, + "step": 48842 + }, + { + "epoch": 3.062466087900163, + "grad_norm": 3.8541810512542725, + "learning_rate": 3.4093463067013845e-06, + "loss": 0.1048, + "step": 48843 + }, + { + "epoch": 3.0624796527400977, + "grad_norm": 4.366377830505371, + "learning_rate": 3.4092092640811297e-06, + "loss": 0.1761, + "step": 48844 + }, + { + "epoch": 3.0624932175800326, + "grad_norm": 5.405677318572998, + "learning_rate": 3.4090722214608745e-06, + "loss": 0.1091, + "step": 48845 + }, + { + "epoch": 3.0625067824199674, + "grad_norm": 6.462157249450684, + "learning_rate": 3.40893517884062e-06, + "loss": 0.1708, + "step": 48846 + }, + { + "epoch": 3.0625203472599023, + "grad_norm": 4.561732292175293, + "learning_rate": 3.408798136220365e-06, + "loss": 0.0745, + "step": 48847 + }, + { + "epoch": 3.062533912099837, + "grad_norm": 3.341282844543457, + "learning_rate": 3.4086610936001096e-06, + "loss": 0.1157, + "step": 48848 + }, + { + "epoch": 3.062547476939772, + "grad_norm": 5.1418867111206055, + "learning_rate": 3.408524050979855e-06, + "loss": 0.1312, + "step": 48849 + }, + { + "epoch": 3.062561041779707, + "grad_norm": 2.861199378967285, + "learning_rate": 3.4083870083596e-06, + "loss": 0.0822, + "step": 48850 + }, + { + "epoch": 3.0625746066196418, + "grad_norm": 4.684053897857666, + "learning_rate": 3.408249965739345e-06, + "loss": 0.118, + "step": 48851 + }, + { + "epoch": 3.0625881714595766, + "grad_norm": 2.9569413661956787, + "learning_rate": 3.4081129231190903e-06, + "loss": 0.0943, + "step": 48852 + }, + { + "epoch": 3.0626017362995115, + "grad_norm": 4.648085117340088, + "learning_rate": 3.4079758804988355e-06, + "loss": 0.1861, + "step": 48853 + }, + { + "epoch": 3.0626153011394464, + "grad_norm": 5.110703945159912, + "learning_rate": 3.4078388378785802e-06, + "loss": 0.1502, + "step": 48854 + }, + { + "epoch": 3.0626288659793817, + "grad_norm": 5.122091770172119, + "learning_rate": 3.407701795258326e-06, + "loss": 0.1231, + "step": 48855 + }, + { + "epoch": 3.0626424308193165, + "grad_norm": 3.3733887672424316, + "learning_rate": 3.4075647526380706e-06, + "loss": 0.0663, + "step": 48856 + }, + { + "epoch": 3.0626559956592514, + "grad_norm": 3.915972948074341, + "learning_rate": 3.407427710017816e-06, + "loss": 0.1469, + "step": 48857 + }, + { + "epoch": 3.0626695604991863, + "grad_norm": 4.073666572570801, + "learning_rate": 3.407290667397561e-06, + "loss": 0.1224, + "step": 48858 + }, + { + "epoch": 3.062683125339121, + "grad_norm": 4.77789306640625, + "learning_rate": 3.407153624777306e-06, + "loss": 0.1431, + "step": 48859 + }, + { + "epoch": 3.062696690179056, + "grad_norm": 3.9496512413024902, + "learning_rate": 3.4070165821570513e-06, + "loss": 0.1261, + "step": 48860 + }, + { + "epoch": 3.062710255018991, + "grad_norm": 4.44894552230835, + "learning_rate": 3.406879539536796e-06, + "loss": 0.1048, + "step": 48861 + }, + { + "epoch": 3.0627238198589257, + "grad_norm": 4.807783603668213, + "learning_rate": 3.4067424969165412e-06, + "loss": 0.1648, + "step": 48862 + }, + { + "epoch": 3.0627373846988606, + "grad_norm": 5.102991104125977, + "learning_rate": 3.406605454296286e-06, + "loss": 0.1522, + "step": 48863 + }, + { + "epoch": 3.0627509495387955, + "grad_norm": 3.9973485469818115, + "learning_rate": 3.4064684116760316e-06, + "loss": 0.1456, + "step": 48864 + }, + { + "epoch": 3.0627645143787303, + "grad_norm": 5.565627574920654, + "learning_rate": 3.4063313690557764e-06, + "loss": 0.2238, + "step": 48865 + }, + { + "epoch": 3.062778079218665, + "grad_norm": 3.270212173461914, + "learning_rate": 3.406194326435522e-06, + "loss": 0.1009, + "step": 48866 + }, + { + "epoch": 3.0627916440586, + "grad_norm": 3.649623155593872, + "learning_rate": 3.4060572838152667e-06, + "loss": 0.0983, + "step": 48867 + }, + { + "epoch": 3.062805208898535, + "grad_norm": 3.5254733562469482, + "learning_rate": 3.405920241195012e-06, + "loss": 0.1055, + "step": 48868 + }, + { + "epoch": 3.06281877373847, + "grad_norm": 5.414852619171143, + "learning_rate": 3.405783198574757e-06, + "loss": 0.1533, + "step": 48869 + }, + { + "epoch": 3.0628323385784046, + "grad_norm": 4.554752826690674, + "learning_rate": 3.4056461559545022e-06, + "loss": 0.1767, + "step": 48870 + }, + { + "epoch": 3.0628459034183395, + "grad_norm": 4.133328437805176, + "learning_rate": 3.405509113334247e-06, + "loss": 0.1257, + "step": 48871 + }, + { + "epoch": 3.0628594682582744, + "grad_norm": 4.50989294052124, + "learning_rate": 3.4053720707139926e-06, + "loss": 0.1536, + "step": 48872 + }, + { + "epoch": 3.0628730330982092, + "grad_norm": 4.366485118865967, + "learning_rate": 3.4052350280937374e-06, + "loss": 0.1311, + "step": 48873 + }, + { + "epoch": 3.062886597938144, + "grad_norm": 3.1324799060821533, + "learning_rate": 3.405097985473483e-06, + "loss": 0.0947, + "step": 48874 + }, + { + "epoch": 3.0629001627780794, + "grad_norm": 5.645705223083496, + "learning_rate": 3.4049609428532277e-06, + "loss": 0.1845, + "step": 48875 + }, + { + "epoch": 3.0629137276180143, + "grad_norm": 3.6916000843048096, + "learning_rate": 3.4048239002329725e-06, + "loss": 0.1071, + "step": 48876 + }, + { + "epoch": 3.062927292457949, + "grad_norm": 5.831727504730225, + "learning_rate": 3.404686857612718e-06, + "loss": 0.1717, + "step": 48877 + }, + { + "epoch": 3.062940857297884, + "grad_norm": 5.581070899963379, + "learning_rate": 3.404549814992463e-06, + "loss": 0.132, + "step": 48878 + }, + { + "epoch": 3.062954422137819, + "grad_norm": 5.258115768432617, + "learning_rate": 3.404412772372208e-06, + "loss": 0.1711, + "step": 48879 + }, + { + "epoch": 3.0629679869777537, + "grad_norm": 6.183511257171631, + "learning_rate": 3.4042757297519528e-06, + "loss": 0.1588, + "step": 48880 + }, + { + "epoch": 3.0629815518176886, + "grad_norm": 3.513303756713867, + "learning_rate": 3.4041386871316984e-06, + "loss": 0.1111, + "step": 48881 + }, + { + "epoch": 3.0629951166576235, + "grad_norm": 3.06795072555542, + "learning_rate": 3.404001644511443e-06, + "loss": 0.1133, + "step": 48882 + }, + { + "epoch": 3.0630086814975583, + "grad_norm": 2.5261993408203125, + "learning_rate": 3.4038646018911887e-06, + "loss": 0.0676, + "step": 48883 + }, + { + "epoch": 3.063022246337493, + "grad_norm": 4.171631813049316, + "learning_rate": 3.4037275592709335e-06, + "loss": 0.1067, + "step": 48884 + }, + { + "epoch": 3.063035811177428, + "grad_norm": 4.495262622833252, + "learning_rate": 3.4035905166506787e-06, + "loss": 0.1062, + "step": 48885 + }, + { + "epoch": 3.063049376017363, + "grad_norm": 4.648736000061035, + "learning_rate": 3.403453474030424e-06, + "loss": 0.1689, + "step": 48886 + }, + { + "epoch": 3.063062940857298, + "grad_norm": 4.359235763549805, + "learning_rate": 3.403316431410169e-06, + "loss": 0.125, + "step": 48887 + }, + { + "epoch": 3.0630765056972327, + "grad_norm": 4.342535972595215, + "learning_rate": 3.4031793887899138e-06, + "loss": 0.1625, + "step": 48888 + }, + { + "epoch": 3.0630900705371675, + "grad_norm": 4.6810221672058105, + "learning_rate": 3.403042346169659e-06, + "loss": 0.1345, + "step": 48889 + }, + { + "epoch": 3.0631036353771024, + "grad_norm": 5.072720050811768, + "learning_rate": 3.402905303549404e-06, + "loss": 0.2487, + "step": 48890 + }, + { + "epoch": 3.0631172002170373, + "grad_norm": 5.327756881713867, + "learning_rate": 3.402768260929149e-06, + "loss": 0.2016, + "step": 48891 + }, + { + "epoch": 3.063130765056972, + "grad_norm": 3.492689371109009, + "learning_rate": 3.4026312183088945e-06, + "loss": 0.0636, + "step": 48892 + }, + { + "epoch": 3.0631443298969074, + "grad_norm": 5.018345832824707, + "learning_rate": 3.4024941756886392e-06, + "loss": 0.1963, + "step": 48893 + }, + { + "epoch": 3.0631578947368423, + "grad_norm": 6.2776923179626465, + "learning_rate": 3.402357133068385e-06, + "loss": 0.1375, + "step": 48894 + }, + { + "epoch": 3.063171459576777, + "grad_norm": 4.459383010864258, + "learning_rate": 3.4022200904481296e-06, + "loss": 0.1312, + "step": 48895 + }, + { + "epoch": 3.063185024416712, + "grad_norm": 4.092250347137451, + "learning_rate": 3.4020830478278748e-06, + "loss": 0.1694, + "step": 48896 + }, + { + "epoch": 3.063198589256647, + "grad_norm": 3.7408525943756104, + "learning_rate": 3.40194600520762e-06, + "loss": 0.0979, + "step": 48897 + }, + { + "epoch": 3.0632121540965818, + "grad_norm": 5.1550164222717285, + "learning_rate": 3.401808962587365e-06, + "loss": 0.1318, + "step": 48898 + }, + { + "epoch": 3.0632257189365166, + "grad_norm": 4.656837463378906, + "learning_rate": 3.40167191996711e-06, + "loss": 0.1078, + "step": 48899 + }, + { + "epoch": 3.0632392837764515, + "grad_norm": 5.089038848876953, + "learning_rate": 3.4015348773468555e-06, + "loss": 0.1173, + "step": 48900 + }, + { + "epoch": 3.0632528486163864, + "grad_norm": 3.3587467670440674, + "learning_rate": 3.4013978347266002e-06, + "loss": 0.0746, + "step": 48901 + }, + { + "epoch": 3.0632664134563212, + "grad_norm": 5.048763751983643, + "learning_rate": 3.401260792106345e-06, + "loss": 0.1181, + "step": 48902 + }, + { + "epoch": 3.063279978296256, + "grad_norm": 4.454492568969727, + "learning_rate": 3.4011237494860906e-06, + "loss": 0.1471, + "step": 48903 + }, + { + "epoch": 3.063293543136191, + "grad_norm": 5.131720066070557, + "learning_rate": 3.4009867068658354e-06, + "loss": 0.0932, + "step": 48904 + }, + { + "epoch": 3.063307107976126, + "grad_norm": 5.882717609405518, + "learning_rate": 3.4008496642455805e-06, + "loss": 0.1939, + "step": 48905 + }, + { + "epoch": 3.0633206728160607, + "grad_norm": 4.657680034637451, + "learning_rate": 3.4007126216253257e-06, + "loss": 0.1701, + "step": 48906 + }, + { + "epoch": 3.0633342376559956, + "grad_norm": 3.6997146606445312, + "learning_rate": 3.400575579005071e-06, + "loss": 0.098, + "step": 48907 + }, + { + "epoch": 3.0633478024959304, + "grad_norm": 3.9548916816711426, + "learning_rate": 3.4004385363848157e-06, + "loss": 0.1445, + "step": 48908 + }, + { + "epoch": 3.0633613673358653, + "grad_norm": 3.525148391723633, + "learning_rate": 3.4003014937645613e-06, + "loss": 0.0846, + "step": 48909 + }, + { + "epoch": 3.0633749321758, + "grad_norm": 5.045090675354004, + "learning_rate": 3.400164451144306e-06, + "loss": 0.1303, + "step": 48910 + }, + { + "epoch": 3.063388497015735, + "grad_norm": 4.600019454956055, + "learning_rate": 3.4000274085240516e-06, + "loss": 0.1354, + "step": 48911 + }, + { + "epoch": 3.06340206185567, + "grad_norm": 4.4656219482421875, + "learning_rate": 3.3998903659037964e-06, + "loss": 0.0958, + "step": 48912 + }, + { + "epoch": 3.063415626695605, + "grad_norm": 3.9292869567871094, + "learning_rate": 3.3997533232835415e-06, + "loss": 0.1185, + "step": 48913 + }, + { + "epoch": 3.06342919153554, + "grad_norm": 3.4903461933135986, + "learning_rate": 3.3996162806632867e-06, + "loss": 0.1163, + "step": 48914 + }, + { + "epoch": 3.063442756375475, + "grad_norm": 3.2142221927642822, + "learning_rate": 3.399479238043032e-06, + "loss": 0.075, + "step": 48915 + }, + { + "epoch": 3.06345632121541, + "grad_norm": 3.976285696029663, + "learning_rate": 3.3993421954227767e-06, + "loss": 0.1617, + "step": 48916 + }, + { + "epoch": 3.0634698860553446, + "grad_norm": 3.0026843547821045, + "learning_rate": 3.3992051528025214e-06, + "loss": 0.0624, + "step": 48917 + }, + { + "epoch": 3.0634834508952795, + "grad_norm": 4.128446102142334, + "learning_rate": 3.399068110182267e-06, + "loss": 0.1327, + "step": 48918 + }, + { + "epoch": 3.0634970157352144, + "grad_norm": 4.635796070098877, + "learning_rate": 3.3989310675620118e-06, + "loss": 0.2256, + "step": 48919 + }, + { + "epoch": 3.0635105805751492, + "grad_norm": 3.400559902191162, + "learning_rate": 3.3987940249417574e-06, + "loss": 0.1485, + "step": 48920 + }, + { + "epoch": 3.063524145415084, + "grad_norm": 4.637674808502197, + "learning_rate": 3.398656982321502e-06, + "loss": 0.0977, + "step": 48921 + }, + { + "epoch": 3.063537710255019, + "grad_norm": 5.546982288360596, + "learning_rate": 3.3985199397012473e-06, + "loss": 0.161, + "step": 48922 + }, + { + "epoch": 3.063551275094954, + "grad_norm": 3.701207160949707, + "learning_rate": 3.3983828970809925e-06, + "loss": 0.103, + "step": 48923 + }, + { + "epoch": 3.0635648399348887, + "grad_norm": 4.558450222015381, + "learning_rate": 3.3982458544607377e-06, + "loss": 0.1678, + "step": 48924 + }, + { + "epoch": 3.0635784047748236, + "grad_norm": 4.487007141113281, + "learning_rate": 3.3981088118404824e-06, + "loss": 0.1897, + "step": 48925 + }, + { + "epoch": 3.0635919696147584, + "grad_norm": 4.168875694274902, + "learning_rate": 3.397971769220228e-06, + "loss": 0.1172, + "step": 48926 + }, + { + "epoch": 3.0636055344546933, + "grad_norm": 5.44714879989624, + "learning_rate": 3.3978347265999728e-06, + "loss": 0.2328, + "step": 48927 + }, + { + "epoch": 3.063619099294628, + "grad_norm": 4.472718238830566, + "learning_rate": 3.3976976839797184e-06, + "loss": 0.1942, + "step": 48928 + }, + { + "epoch": 3.063632664134563, + "grad_norm": 7.34502649307251, + "learning_rate": 3.397560641359463e-06, + "loss": 0.2217, + "step": 48929 + }, + { + "epoch": 3.063646228974498, + "grad_norm": 3.74014949798584, + "learning_rate": 3.397423598739208e-06, + "loss": 0.1431, + "step": 48930 + }, + { + "epoch": 3.063659793814433, + "grad_norm": 3.9101381301879883, + "learning_rate": 3.3972865561189535e-06, + "loss": 0.132, + "step": 48931 + }, + { + "epoch": 3.063673358654368, + "grad_norm": 5.457937240600586, + "learning_rate": 3.3971495134986983e-06, + "loss": 0.0811, + "step": 48932 + }, + { + "epoch": 3.063686923494303, + "grad_norm": 5.210110187530518, + "learning_rate": 3.3970124708784434e-06, + "loss": 0.1854, + "step": 48933 + }, + { + "epoch": 3.063700488334238, + "grad_norm": 5.585605621337891, + "learning_rate": 3.396875428258188e-06, + "loss": 0.2499, + "step": 48934 + }, + { + "epoch": 3.0637140531741727, + "grad_norm": 3.64870023727417, + "learning_rate": 3.396738385637934e-06, + "loss": 0.1364, + "step": 48935 + }, + { + "epoch": 3.0637276180141075, + "grad_norm": 3.73013973236084, + "learning_rate": 3.3966013430176785e-06, + "loss": 0.0751, + "step": 48936 + }, + { + "epoch": 3.0637411828540424, + "grad_norm": 4.187911033630371, + "learning_rate": 3.396464300397424e-06, + "loss": 0.1168, + "step": 48937 + }, + { + "epoch": 3.0637547476939773, + "grad_norm": 4.392340183258057, + "learning_rate": 3.396327257777169e-06, + "loss": 0.2331, + "step": 48938 + }, + { + "epoch": 3.063768312533912, + "grad_norm": 5.426912307739258, + "learning_rate": 3.396190215156914e-06, + "loss": 0.2169, + "step": 48939 + }, + { + "epoch": 3.063781877373847, + "grad_norm": 4.2334513664245605, + "learning_rate": 3.3960531725366593e-06, + "loss": 0.1375, + "step": 48940 + }, + { + "epoch": 3.063795442213782, + "grad_norm": 3.733710765838623, + "learning_rate": 3.3959161299164044e-06, + "loss": 0.1383, + "step": 48941 + }, + { + "epoch": 3.0638090070537167, + "grad_norm": 3.3406641483306885, + "learning_rate": 3.395779087296149e-06, + "loss": 0.1084, + "step": 48942 + }, + { + "epoch": 3.0638225718936516, + "grad_norm": 4.555084705352783, + "learning_rate": 3.395642044675895e-06, + "loss": 0.1892, + "step": 48943 + }, + { + "epoch": 3.0638361367335865, + "grad_norm": 4.16385555267334, + "learning_rate": 3.3955050020556396e-06, + "loss": 0.0928, + "step": 48944 + }, + { + "epoch": 3.0638497015735213, + "grad_norm": 3.3408615589141846, + "learning_rate": 3.3953679594353843e-06, + "loss": 0.0929, + "step": 48945 + }, + { + "epoch": 3.063863266413456, + "grad_norm": 6.293335914611816, + "learning_rate": 3.39523091681513e-06, + "loss": 0.1917, + "step": 48946 + }, + { + "epoch": 3.063876831253391, + "grad_norm": 4.399913787841797, + "learning_rate": 3.3950938741948747e-06, + "loss": 0.1053, + "step": 48947 + }, + { + "epoch": 3.063890396093326, + "grad_norm": 4.918832302093506, + "learning_rate": 3.3949568315746203e-06, + "loss": 0.1445, + "step": 48948 + }, + { + "epoch": 3.063903960933261, + "grad_norm": 5.130680561065674, + "learning_rate": 3.394819788954365e-06, + "loss": 0.1088, + "step": 48949 + }, + { + "epoch": 3.0639175257731956, + "grad_norm": 4.784664630889893, + "learning_rate": 3.39468274633411e-06, + "loss": 0.1945, + "step": 48950 + }, + { + "epoch": 3.063931090613131, + "grad_norm": 5.369500637054443, + "learning_rate": 3.394545703713855e-06, + "loss": 0.2837, + "step": 48951 + }, + { + "epoch": 3.063944655453066, + "grad_norm": 5.180606365203857, + "learning_rate": 3.3944086610936006e-06, + "loss": 0.1943, + "step": 48952 + }, + { + "epoch": 3.0639582202930007, + "grad_norm": 4.144265651702881, + "learning_rate": 3.3942716184733453e-06, + "loss": 0.1596, + "step": 48953 + }, + { + "epoch": 3.0639717851329356, + "grad_norm": 5.298122406005859, + "learning_rate": 3.394134575853091e-06, + "loss": 0.1638, + "step": 48954 + }, + { + "epoch": 3.0639853499728704, + "grad_norm": 3.8501486778259277, + "learning_rate": 3.3939975332328357e-06, + "loss": 0.1203, + "step": 48955 + }, + { + "epoch": 3.0639989148128053, + "grad_norm": 5.5838775634765625, + "learning_rate": 3.393860490612581e-06, + "loss": 0.2357, + "step": 48956 + }, + { + "epoch": 3.06401247965274, + "grad_norm": 5.040474891662598, + "learning_rate": 3.393723447992326e-06, + "loss": 0.1919, + "step": 48957 + }, + { + "epoch": 3.064026044492675, + "grad_norm": 4.849470138549805, + "learning_rate": 3.3935864053720708e-06, + "loss": 0.1695, + "step": 48958 + }, + { + "epoch": 3.06403960933261, + "grad_norm": 3.676469564437866, + "learning_rate": 3.393449362751816e-06, + "loss": 0.1108, + "step": 48959 + }, + { + "epoch": 3.0640531741725447, + "grad_norm": 4.8023552894592285, + "learning_rate": 3.393312320131561e-06, + "loss": 0.0997, + "step": 48960 + }, + { + "epoch": 3.0640667390124796, + "grad_norm": 3.8935868740081787, + "learning_rate": 3.3931752775113063e-06, + "loss": 0.1349, + "step": 48961 + }, + { + "epoch": 3.0640803038524145, + "grad_norm": 7.285523891448975, + "learning_rate": 3.393038234891051e-06, + "loss": 0.1339, + "step": 48962 + }, + { + "epoch": 3.0640938686923493, + "grad_norm": 5.102584362030029, + "learning_rate": 3.3929011922707967e-06, + "loss": 0.1374, + "step": 48963 + }, + { + "epoch": 3.064107433532284, + "grad_norm": 4.9677324295043945, + "learning_rate": 3.3927641496505414e-06, + "loss": 0.186, + "step": 48964 + }, + { + "epoch": 3.064120998372219, + "grad_norm": 3.8831889629364014, + "learning_rate": 3.392627107030287e-06, + "loss": 0.1005, + "step": 48965 + }, + { + "epoch": 3.064134563212154, + "grad_norm": 3.6232452392578125, + "learning_rate": 3.392490064410032e-06, + "loss": 0.1295, + "step": 48966 + }, + { + "epoch": 3.064148128052089, + "grad_norm": 5.146784782409668, + "learning_rate": 3.392353021789777e-06, + "loss": 0.2204, + "step": 48967 + }, + { + "epoch": 3.0641616928920237, + "grad_norm": 4.553475856781006, + "learning_rate": 3.392215979169522e-06, + "loss": 0.1884, + "step": 48968 + }, + { + "epoch": 3.064175257731959, + "grad_norm": 6.780030727386475, + "learning_rate": 3.3920789365492673e-06, + "loss": 0.255, + "step": 48969 + }, + { + "epoch": 3.064188822571894, + "grad_norm": 4.116221904754639, + "learning_rate": 3.391941893929012e-06, + "loss": 0.1677, + "step": 48970 + }, + { + "epoch": 3.0642023874118287, + "grad_norm": 4.306347370147705, + "learning_rate": 3.391804851308757e-06, + "loss": 0.1194, + "step": 48971 + }, + { + "epoch": 3.0642159522517636, + "grad_norm": 3.6942641735076904, + "learning_rate": 3.3916678086885024e-06, + "loss": 0.1401, + "step": 48972 + }, + { + "epoch": 3.0642295170916984, + "grad_norm": 6.301149368286133, + "learning_rate": 3.391530766068247e-06, + "loss": 0.3569, + "step": 48973 + }, + { + "epoch": 3.0642430819316333, + "grad_norm": 5.816544055938721, + "learning_rate": 3.391393723447993e-06, + "loss": 0.2611, + "step": 48974 + }, + { + "epoch": 3.064256646771568, + "grad_norm": 4.834969997406006, + "learning_rate": 3.3912566808277376e-06, + "loss": 0.2211, + "step": 48975 + }, + { + "epoch": 3.064270211611503, + "grad_norm": 5.03893518447876, + "learning_rate": 3.3911196382074827e-06, + "loss": 0.1348, + "step": 48976 + }, + { + "epoch": 3.064283776451438, + "grad_norm": 4.691337585449219, + "learning_rate": 3.390982595587228e-06, + "loss": 0.1843, + "step": 48977 + }, + { + "epoch": 3.0642973412913728, + "grad_norm": 3.2288873195648193, + "learning_rate": 3.390845552966973e-06, + "loss": 0.1239, + "step": 48978 + }, + { + "epoch": 3.0643109061313076, + "grad_norm": 4.7917351722717285, + "learning_rate": 3.390708510346718e-06, + "loss": 0.2112, + "step": 48979 + }, + { + "epoch": 3.0643244709712425, + "grad_norm": 4.92703914642334, + "learning_rate": 3.3905714677264635e-06, + "loss": 0.2657, + "step": 48980 + }, + { + "epoch": 3.0643380358111774, + "grad_norm": 3.8047947883605957, + "learning_rate": 3.390434425106208e-06, + "loss": 0.2063, + "step": 48981 + }, + { + "epoch": 3.0643516006511122, + "grad_norm": 4.786705017089844, + "learning_rate": 3.390297382485954e-06, + "loss": 0.197, + "step": 48982 + }, + { + "epoch": 3.064365165491047, + "grad_norm": 3.2479050159454346, + "learning_rate": 3.3901603398656986e-06, + "loss": 0.105, + "step": 48983 + }, + { + "epoch": 3.064378730330982, + "grad_norm": 3.8635001182556152, + "learning_rate": 3.3900232972454437e-06, + "loss": 0.1548, + "step": 48984 + }, + { + "epoch": 3.064392295170917, + "grad_norm": 4.189601898193359, + "learning_rate": 3.389886254625189e-06, + "loss": 0.1594, + "step": 48985 + }, + { + "epoch": 3.0644058600108517, + "grad_norm": 4.647217273712158, + "learning_rate": 3.3897492120049337e-06, + "loss": 0.2087, + "step": 48986 + }, + { + "epoch": 3.0644194248507866, + "grad_norm": 4.989738464355469, + "learning_rate": 3.389612169384679e-06, + "loss": 0.1142, + "step": 48987 + }, + { + "epoch": 3.0644329896907214, + "grad_norm": 3.737755060195923, + "learning_rate": 3.3894751267644236e-06, + "loss": 0.115, + "step": 48988 + }, + { + "epoch": 3.0644465545306567, + "grad_norm": 3.7840616703033447, + "learning_rate": 3.3893380841441692e-06, + "loss": 0.0981, + "step": 48989 + }, + { + "epoch": 3.0644601193705916, + "grad_norm": 5.394926071166992, + "learning_rate": 3.389201041523914e-06, + "loss": 0.1741, + "step": 48990 + }, + { + "epoch": 3.0644736842105265, + "grad_norm": 3.130293130874634, + "learning_rate": 3.3890639989036596e-06, + "loss": 0.1466, + "step": 48991 + }, + { + "epoch": 3.0644872490504613, + "grad_norm": 4.666548728942871, + "learning_rate": 3.3889269562834043e-06, + "loss": 0.1667, + "step": 48992 + }, + { + "epoch": 3.064500813890396, + "grad_norm": 5.999435901641846, + "learning_rate": 3.3887899136631495e-06, + "loss": 0.1856, + "step": 48993 + }, + { + "epoch": 3.064514378730331, + "grad_norm": 4.220405578613281, + "learning_rate": 3.3886528710428947e-06, + "loss": 0.1975, + "step": 48994 + }, + { + "epoch": 3.064527943570266, + "grad_norm": 4.860105991363525, + "learning_rate": 3.38851582842264e-06, + "loss": 0.1329, + "step": 48995 + }, + { + "epoch": 3.064541508410201, + "grad_norm": 3.8924720287323, + "learning_rate": 3.3883787858023846e-06, + "loss": 0.1987, + "step": 48996 + }, + { + "epoch": 3.0645550732501357, + "grad_norm": 4.0058112144470215, + "learning_rate": 3.3882417431821302e-06, + "loss": 0.0833, + "step": 48997 + }, + { + "epoch": 3.0645686380900705, + "grad_norm": 4.134970664978027, + "learning_rate": 3.388104700561875e-06, + "loss": 0.13, + "step": 48998 + }, + { + "epoch": 3.0645822029300054, + "grad_norm": 6.973124980926514, + "learning_rate": 3.3879676579416197e-06, + "loss": 0.2648, + "step": 48999 + }, + { + "epoch": 3.0645957677699402, + "grad_norm": 4.169399261474609, + "learning_rate": 3.3878306153213653e-06, + "loss": 0.0729, + "step": 49000 + }, + { + "epoch": 3.064609332609875, + "grad_norm": 5.094453811645508, + "learning_rate": 3.38769357270111e-06, + "loss": 0.1303, + "step": 49001 + }, + { + "epoch": 3.06462289744981, + "grad_norm": 6.509006977081299, + "learning_rate": 3.3875565300808557e-06, + "loss": 0.1473, + "step": 49002 + }, + { + "epoch": 3.064636462289745, + "grad_norm": 6.858884811401367, + "learning_rate": 3.3874194874606004e-06, + "loss": 0.1757, + "step": 49003 + }, + { + "epoch": 3.0646500271296797, + "grad_norm": 4.252318382263184, + "learning_rate": 3.3872824448403456e-06, + "loss": 0.183, + "step": 49004 + }, + { + "epoch": 3.0646635919696146, + "grad_norm": 5.010467052459717, + "learning_rate": 3.3871454022200904e-06, + "loss": 0.1813, + "step": 49005 + }, + { + "epoch": 3.0646771568095494, + "grad_norm": 6.063973903656006, + "learning_rate": 3.387008359599836e-06, + "loss": 0.2048, + "step": 49006 + }, + { + "epoch": 3.0646907216494848, + "grad_norm": 4.032290935516357, + "learning_rate": 3.3868713169795807e-06, + "loss": 0.1224, + "step": 49007 + }, + { + "epoch": 3.0647042864894196, + "grad_norm": 4.421580791473389, + "learning_rate": 3.3867342743593263e-06, + "loss": 0.1701, + "step": 49008 + }, + { + "epoch": 3.0647178513293545, + "grad_norm": 4.805085182189941, + "learning_rate": 3.386597231739071e-06, + "loss": 0.096, + "step": 49009 + }, + { + "epoch": 3.0647314161692893, + "grad_norm": 5.713301658630371, + "learning_rate": 3.3864601891188163e-06, + "loss": 0.1671, + "step": 49010 + }, + { + "epoch": 3.064744981009224, + "grad_norm": 3.6550519466400146, + "learning_rate": 3.3863231464985615e-06, + "loss": 0.1355, + "step": 49011 + }, + { + "epoch": 3.064758545849159, + "grad_norm": 5.04339075088501, + "learning_rate": 3.3861861038783066e-06, + "loss": 0.1848, + "step": 49012 + }, + { + "epoch": 3.064772110689094, + "grad_norm": 6.1776323318481445, + "learning_rate": 3.3860490612580514e-06, + "loss": 0.1041, + "step": 49013 + }, + { + "epoch": 3.064785675529029, + "grad_norm": 4.237614154815674, + "learning_rate": 3.3859120186377966e-06, + "loss": 0.1416, + "step": 49014 + }, + { + "epoch": 3.0647992403689637, + "grad_norm": 5.658615589141846, + "learning_rate": 3.3857749760175417e-06, + "loss": 0.0968, + "step": 49015 + }, + { + "epoch": 3.0648128052088985, + "grad_norm": 4.31805944442749, + "learning_rate": 3.3856379333972865e-06, + "loss": 0.1446, + "step": 49016 + }, + { + "epoch": 3.0648263700488334, + "grad_norm": 3.8613407611846924, + "learning_rate": 3.385500890777032e-06, + "loss": 0.1654, + "step": 49017 + }, + { + "epoch": 3.0648399348887683, + "grad_norm": 3.6911678314208984, + "learning_rate": 3.385363848156777e-06, + "loss": 0.093, + "step": 49018 + }, + { + "epoch": 3.064853499728703, + "grad_norm": 3.57973313331604, + "learning_rate": 3.3852268055365225e-06, + "loss": 0.1107, + "step": 49019 + }, + { + "epoch": 3.064867064568638, + "grad_norm": 3.2797744274139404, + "learning_rate": 3.3850897629162672e-06, + "loss": 0.1324, + "step": 49020 + }, + { + "epoch": 3.064880629408573, + "grad_norm": 4.202740669250488, + "learning_rate": 3.3849527202960124e-06, + "loss": 0.1201, + "step": 49021 + }, + { + "epoch": 3.0648941942485077, + "grad_norm": 4.982145309448242, + "learning_rate": 3.384815677675757e-06, + "loss": 0.1405, + "step": 49022 + }, + { + "epoch": 3.0649077590884426, + "grad_norm": 3.914419412612915, + "learning_rate": 3.3846786350555028e-06, + "loss": 0.0708, + "step": 49023 + }, + { + "epoch": 3.0649213239283775, + "grad_norm": 6.642858028411865, + "learning_rate": 3.3845415924352475e-06, + "loss": 0.2644, + "step": 49024 + }, + { + "epoch": 3.0649348887683123, + "grad_norm": 4.692524433135986, + "learning_rate": 3.384404549814993e-06, + "loss": 0.1896, + "step": 49025 + }, + { + "epoch": 3.0649484536082476, + "grad_norm": 3.940626859664917, + "learning_rate": 3.384267507194738e-06, + "loss": 0.1037, + "step": 49026 + }, + { + "epoch": 3.0649620184481825, + "grad_norm": 3.0296812057495117, + "learning_rate": 3.3841304645744826e-06, + "loss": 0.0736, + "step": 49027 + }, + { + "epoch": 3.0649755832881174, + "grad_norm": 3.751668930053711, + "learning_rate": 3.3839934219542282e-06, + "loss": 0.1433, + "step": 49028 + }, + { + "epoch": 3.0649891481280522, + "grad_norm": 4.058467388153076, + "learning_rate": 3.383856379333973e-06, + "loss": 0.1091, + "step": 49029 + }, + { + "epoch": 3.065002712967987, + "grad_norm": 5.1084885597229, + "learning_rate": 3.383719336713718e-06, + "loss": 0.1147, + "step": 49030 + }, + { + "epoch": 3.065016277807922, + "grad_norm": 3.8585615158081055, + "learning_rate": 3.3835822940934633e-06, + "loss": 0.1282, + "step": 49031 + }, + { + "epoch": 3.065029842647857, + "grad_norm": 6.263958930969238, + "learning_rate": 3.3834452514732085e-06, + "loss": 0.1227, + "step": 49032 + }, + { + "epoch": 3.0650434074877917, + "grad_norm": 4.2346954345703125, + "learning_rate": 3.3833082088529533e-06, + "loss": 0.1154, + "step": 49033 + }, + { + "epoch": 3.0650569723277266, + "grad_norm": 4.007374286651611, + "learning_rate": 3.383171166232699e-06, + "loss": 0.082, + "step": 49034 + }, + { + "epoch": 3.0650705371676614, + "grad_norm": 3.7658379077911377, + "learning_rate": 3.3830341236124436e-06, + "loss": 0.1383, + "step": 49035 + }, + { + "epoch": 3.0650841020075963, + "grad_norm": 5.440553665161133, + "learning_rate": 3.3828970809921892e-06, + "loss": 0.112, + "step": 49036 + }, + { + "epoch": 3.065097666847531, + "grad_norm": 5.638605117797852, + "learning_rate": 3.382760038371934e-06, + "loss": 0.2067, + "step": 49037 + }, + { + "epoch": 3.065111231687466, + "grad_norm": 4.722243309020996, + "learning_rate": 3.382622995751679e-06, + "loss": 0.1904, + "step": 49038 + }, + { + "epoch": 3.065124796527401, + "grad_norm": 4.781343936920166, + "learning_rate": 3.3824859531314243e-06, + "loss": 0.1629, + "step": 49039 + }, + { + "epoch": 3.0651383613673358, + "grad_norm": 4.377948760986328, + "learning_rate": 3.382348910511169e-06, + "loss": 0.0688, + "step": 49040 + }, + { + "epoch": 3.0651519262072706, + "grad_norm": 4.317586421966553, + "learning_rate": 3.3822118678909143e-06, + "loss": 0.1311, + "step": 49041 + }, + { + "epoch": 3.0651654910472055, + "grad_norm": 7.111381530761719, + "learning_rate": 3.382074825270659e-06, + "loss": 0.2605, + "step": 49042 + }, + { + "epoch": 3.0651790558871403, + "grad_norm": 5.090862274169922, + "learning_rate": 3.3819377826504046e-06, + "loss": 0.1676, + "step": 49043 + }, + { + "epoch": 3.065192620727075, + "grad_norm": 4.045505523681641, + "learning_rate": 3.3818007400301494e-06, + "loss": 0.11, + "step": 49044 + }, + { + "epoch": 3.0652061855670105, + "grad_norm": 7.537674427032471, + "learning_rate": 3.381663697409895e-06, + "loss": 0.2741, + "step": 49045 + }, + { + "epoch": 3.0652197504069454, + "grad_norm": 6.015883922576904, + "learning_rate": 3.3815266547896398e-06, + "loss": 0.1966, + "step": 49046 + }, + { + "epoch": 3.0652333152468803, + "grad_norm": 3.063502788543701, + "learning_rate": 3.381389612169385e-06, + "loss": 0.0649, + "step": 49047 + }, + { + "epoch": 3.065246880086815, + "grad_norm": 5.0223612785339355, + "learning_rate": 3.38125256954913e-06, + "loss": 0.1885, + "step": 49048 + }, + { + "epoch": 3.06526044492675, + "grad_norm": 4.451106071472168, + "learning_rate": 3.3811155269288753e-06, + "loss": 0.1313, + "step": 49049 + }, + { + "epoch": 3.065274009766685, + "grad_norm": 5.419399738311768, + "learning_rate": 3.38097848430862e-06, + "loss": 0.1843, + "step": 49050 + }, + { + "epoch": 3.0652875746066197, + "grad_norm": 5.236734390258789, + "learning_rate": 3.3808414416883656e-06, + "loss": 0.1629, + "step": 49051 + }, + { + "epoch": 3.0653011394465546, + "grad_norm": 5.814694881439209, + "learning_rate": 3.3807043990681104e-06, + "loss": 0.1326, + "step": 49052 + }, + { + "epoch": 3.0653147042864894, + "grad_norm": 6.39832878112793, + "learning_rate": 3.380567356447856e-06, + "loss": 0.1958, + "step": 49053 + }, + { + "epoch": 3.0653282691264243, + "grad_norm": 3.3560409545898438, + "learning_rate": 3.3804303138276008e-06, + "loss": 0.0613, + "step": 49054 + }, + { + "epoch": 3.065341833966359, + "grad_norm": 5.486504554748535, + "learning_rate": 3.3802932712073455e-06, + "loss": 0.0966, + "step": 49055 + }, + { + "epoch": 3.065355398806294, + "grad_norm": 5.026877403259277, + "learning_rate": 3.380156228587091e-06, + "loss": 0.1281, + "step": 49056 + }, + { + "epoch": 3.065368963646229, + "grad_norm": 4.42038631439209, + "learning_rate": 3.380019185966836e-06, + "loss": 0.1176, + "step": 49057 + }, + { + "epoch": 3.0653825284861638, + "grad_norm": 5.3576226234436035, + "learning_rate": 3.379882143346581e-06, + "loss": 0.1646, + "step": 49058 + }, + { + "epoch": 3.0653960933260986, + "grad_norm": 3.855494499206543, + "learning_rate": 3.379745100726326e-06, + "loss": 0.1592, + "step": 49059 + }, + { + "epoch": 3.0654096581660335, + "grad_norm": 5.414994239807129, + "learning_rate": 3.3796080581060714e-06, + "loss": 0.1207, + "step": 49060 + }, + { + "epoch": 3.0654232230059684, + "grad_norm": 4.044095516204834, + "learning_rate": 3.379471015485816e-06, + "loss": 0.0838, + "step": 49061 + }, + { + "epoch": 3.0654367878459032, + "grad_norm": 4.326946258544922, + "learning_rate": 3.3793339728655618e-06, + "loss": 0.1072, + "step": 49062 + }, + { + "epoch": 3.065450352685838, + "grad_norm": 3.4397811889648438, + "learning_rate": 3.3791969302453065e-06, + "loss": 0.0874, + "step": 49063 + }, + { + "epoch": 3.0654639175257734, + "grad_norm": 5.929597854614258, + "learning_rate": 3.3790598876250517e-06, + "loss": 0.1604, + "step": 49064 + }, + { + "epoch": 3.0654774823657083, + "grad_norm": 5.655011177062988, + "learning_rate": 3.378922845004797e-06, + "loss": 0.1496, + "step": 49065 + }, + { + "epoch": 3.065491047205643, + "grad_norm": 3.0150504112243652, + "learning_rate": 3.378785802384542e-06, + "loss": 0.0703, + "step": 49066 + }, + { + "epoch": 3.065504612045578, + "grad_norm": 4.4817118644714355, + "learning_rate": 3.378648759764287e-06, + "loss": 0.1099, + "step": 49067 + }, + { + "epoch": 3.065518176885513, + "grad_norm": 5.056041717529297, + "learning_rate": 3.378511717144032e-06, + "loss": 0.1249, + "step": 49068 + }, + { + "epoch": 3.0655317417254477, + "grad_norm": 5.407799243927002, + "learning_rate": 3.378374674523777e-06, + "loss": 0.1984, + "step": 49069 + }, + { + "epoch": 3.0655453065653826, + "grad_norm": 3.3904733657836914, + "learning_rate": 3.378237631903522e-06, + "loss": 0.0525, + "step": 49070 + }, + { + "epoch": 3.0655588714053175, + "grad_norm": 3.5430688858032227, + "learning_rate": 3.3781005892832675e-06, + "loss": 0.0604, + "step": 49071 + }, + { + "epoch": 3.0655724362452523, + "grad_norm": 5.624545574188232, + "learning_rate": 3.3779635466630123e-06, + "loss": 0.0895, + "step": 49072 + }, + { + "epoch": 3.065586001085187, + "grad_norm": 2.823356866836548, + "learning_rate": 3.377826504042758e-06, + "loss": 0.0413, + "step": 49073 + }, + { + "epoch": 3.065599565925122, + "grad_norm": 3.284745931625366, + "learning_rate": 3.3776894614225026e-06, + "loss": 0.0664, + "step": 49074 + }, + { + "epoch": 3.065613130765057, + "grad_norm": 4.720526218414307, + "learning_rate": 3.377552418802248e-06, + "loss": 0.1004, + "step": 49075 + }, + { + "epoch": 3.065626695604992, + "grad_norm": 4.975574970245361, + "learning_rate": 3.3774153761819926e-06, + "loss": 0.1359, + "step": 49076 + }, + { + "epoch": 3.0656402604449267, + "grad_norm": 5.007899284362793, + "learning_rate": 3.377278333561738e-06, + "loss": 0.1017, + "step": 49077 + }, + { + "epoch": 3.0656538252848615, + "grad_norm": 3.7495322227478027, + "learning_rate": 3.377141290941483e-06, + "loss": 0.1751, + "step": 49078 + }, + { + "epoch": 3.0656673901247964, + "grad_norm": 4.055522918701172, + "learning_rate": 3.3770042483212285e-06, + "loss": 0.1014, + "step": 49079 + }, + { + "epoch": 3.0656809549647313, + "grad_norm": 6.1694536209106445, + "learning_rate": 3.3768672057009733e-06, + "loss": 0.2002, + "step": 49080 + }, + { + "epoch": 3.065694519804666, + "grad_norm": 5.562863349914551, + "learning_rate": 3.376730163080718e-06, + "loss": 0.0734, + "step": 49081 + }, + { + "epoch": 3.065708084644601, + "grad_norm": 4.888099670410156, + "learning_rate": 3.3765931204604637e-06, + "loss": 0.0909, + "step": 49082 + }, + { + "epoch": 3.0657216494845363, + "grad_norm": 3.848109722137451, + "learning_rate": 3.3764560778402084e-06, + "loss": 0.0699, + "step": 49083 + }, + { + "epoch": 3.065735214324471, + "grad_norm": 3.891313314437866, + "learning_rate": 3.3763190352199536e-06, + "loss": 0.102, + "step": 49084 + }, + { + "epoch": 3.065748779164406, + "grad_norm": 5.171710014343262, + "learning_rate": 3.3761819925996988e-06, + "loss": 0.1294, + "step": 49085 + }, + { + "epoch": 3.065762344004341, + "grad_norm": 5.450583457946777, + "learning_rate": 3.376044949979444e-06, + "loss": 0.0783, + "step": 49086 + }, + { + "epoch": 3.0657759088442758, + "grad_norm": 4.15049934387207, + "learning_rate": 3.3759079073591887e-06, + "loss": 0.0791, + "step": 49087 + }, + { + "epoch": 3.0657894736842106, + "grad_norm": 3.4284162521362305, + "learning_rate": 3.3757708647389343e-06, + "loss": 0.0892, + "step": 49088 + }, + { + "epoch": 3.0658030385241455, + "grad_norm": 4.340379238128662, + "learning_rate": 3.375633822118679e-06, + "loss": 0.1575, + "step": 49089 + }, + { + "epoch": 3.0658166033640804, + "grad_norm": 3.250980854034424, + "learning_rate": 3.3754967794984247e-06, + "loss": 0.0964, + "step": 49090 + }, + { + "epoch": 3.065830168204015, + "grad_norm": 7.914238929748535, + "learning_rate": 3.3753597368781694e-06, + "loss": 0.1317, + "step": 49091 + }, + { + "epoch": 3.06584373304395, + "grad_norm": 5.449883460998535, + "learning_rate": 3.3752226942579146e-06, + "loss": 0.1343, + "step": 49092 + }, + { + "epoch": 3.065857297883885, + "grad_norm": 4.236059188842773, + "learning_rate": 3.3750856516376593e-06, + "loss": 0.1302, + "step": 49093 + }, + { + "epoch": 3.06587086272382, + "grad_norm": 5.108232021331787, + "learning_rate": 3.374948609017405e-06, + "loss": 0.1307, + "step": 49094 + }, + { + "epoch": 3.0658844275637547, + "grad_norm": 3.005232095718384, + "learning_rate": 3.3748115663971497e-06, + "loss": 0.0556, + "step": 49095 + }, + { + "epoch": 3.0658979924036895, + "grad_norm": 3.5596227645874023, + "learning_rate": 3.3746745237768945e-06, + "loss": 0.0814, + "step": 49096 + }, + { + "epoch": 3.0659115572436244, + "grad_norm": 5.330691814422607, + "learning_rate": 3.37453748115664e-06, + "loss": 0.1174, + "step": 49097 + }, + { + "epoch": 3.0659251220835593, + "grad_norm": 3.0301923751831055, + "learning_rate": 3.374400438536385e-06, + "loss": 0.089, + "step": 49098 + }, + { + "epoch": 3.065938686923494, + "grad_norm": 3.7793798446655273, + "learning_rate": 3.3742633959161304e-06, + "loss": 0.1335, + "step": 49099 + }, + { + "epoch": 3.065952251763429, + "grad_norm": 4.216061115264893, + "learning_rate": 3.374126353295875e-06, + "loss": 0.1102, + "step": 49100 + }, + { + "epoch": 3.065965816603364, + "grad_norm": 5.1097307205200195, + "learning_rate": 3.3739893106756204e-06, + "loss": 0.1139, + "step": 49101 + }, + { + "epoch": 3.065979381443299, + "grad_norm": 5.7667555809021, + "learning_rate": 3.3738522680553655e-06, + "loss": 0.2345, + "step": 49102 + }, + { + "epoch": 3.065992946283234, + "grad_norm": 3.628430128097534, + "learning_rate": 3.3737152254351107e-06, + "loss": 0.1113, + "step": 49103 + }, + { + "epoch": 3.066006511123169, + "grad_norm": 3.6219348907470703, + "learning_rate": 3.3735781828148555e-06, + "loss": 0.1054, + "step": 49104 + }, + { + "epoch": 3.0660200759631038, + "grad_norm": 3.4845314025878906, + "learning_rate": 3.373441140194601e-06, + "loss": 0.1051, + "step": 49105 + }, + { + "epoch": 3.0660336408030386, + "grad_norm": 3.0858185291290283, + "learning_rate": 3.373304097574346e-06, + "loss": 0.0413, + "step": 49106 + }, + { + "epoch": 3.0660472056429735, + "grad_norm": 4.334888935089111, + "learning_rate": 3.3731670549540914e-06, + "loss": 0.0763, + "step": 49107 + }, + { + "epoch": 3.0660607704829084, + "grad_norm": 3.647657632827759, + "learning_rate": 3.373030012333836e-06, + "loss": 0.0945, + "step": 49108 + }, + { + "epoch": 3.0660743353228432, + "grad_norm": 3.804682970046997, + "learning_rate": 3.372892969713581e-06, + "loss": 0.0986, + "step": 49109 + }, + { + "epoch": 3.066087900162778, + "grad_norm": 3.7394161224365234, + "learning_rate": 3.3727559270933265e-06, + "loss": 0.0883, + "step": 49110 + }, + { + "epoch": 3.066101465002713, + "grad_norm": 3.2751498222351074, + "learning_rate": 3.3726188844730713e-06, + "loss": 0.0878, + "step": 49111 + }, + { + "epoch": 3.066115029842648, + "grad_norm": 4.876883029937744, + "learning_rate": 3.3724818418528165e-06, + "loss": 0.0895, + "step": 49112 + }, + { + "epoch": 3.0661285946825827, + "grad_norm": 2.7511439323425293, + "learning_rate": 3.3723447992325612e-06, + "loss": 0.0831, + "step": 49113 + }, + { + "epoch": 3.0661421595225176, + "grad_norm": 6.577259063720703, + "learning_rate": 3.372207756612307e-06, + "loss": 0.1554, + "step": 49114 + }, + { + "epoch": 3.0661557243624524, + "grad_norm": 4.46968412399292, + "learning_rate": 3.3720707139920516e-06, + "loss": 0.137, + "step": 49115 + }, + { + "epoch": 3.0661692892023873, + "grad_norm": 3.6498961448669434, + "learning_rate": 3.371933671371797e-06, + "loss": 0.1359, + "step": 49116 + }, + { + "epoch": 3.066182854042322, + "grad_norm": 2.8109984397888184, + "learning_rate": 3.371796628751542e-06, + "loss": 0.0587, + "step": 49117 + }, + { + "epoch": 3.066196418882257, + "grad_norm": 3.242708206176758, + "learning_rate": 3.371659586131287e-06, + "loss": 0.0513, + "step": 49118 + }, + { + "epoch": 3.066209983722192, + "grad_norm": 2.9834442138671875, + "learning_rate": 3.3715225435110323e-06, + "loss": 0.0818, + "step": 49119 + }, + { + "epoch": 3.0662235485621268, + "grad_norm": 4.1790852546691895, + "learning_rate": 3.3713855008907775e-06, + "loss": 0.1165, + "step": 49120 + }, + { + "epoch": 3.066237113402062, + "grad_norm": 3.8364920616149902, + "learning_rate": 3.3712484582705222e-06, + "loss": 0.0715, + "step": 49121 + }, + { + "epoch": 3.066250678241997, + "grad_norm": 3.375497579574585, + "learning_rate": 3.371111415650268e-06, + "loss": 0.1034, + "step": 49122 + }, + { + "epoch": 3.066264243081932, + "grad_norm": 2.9466354846954346, + "learning_rate": 3.3709743730300126e-06, + "loss": 0.0812, + "step": 49123 + }, + { + "epoch": 3.0662778079218667, + "grad_norm": 4.272659778594971, + "learning_rate": 3.3708373304097574e-06, + "loss": 0.1236, + "step": 49124 + }, + { + "epoch": 3.0662913727618015, + "grad_norm": 3.425570249557495, + "learning_rate": 3.370700287789503e-06, + "loss": 0.0636, + "step": 49125 + }, + { + "epoch": 3.0663049376017364, + "grad_norm": 4.949716091156006, + "learning_rate": 3.3705632451692477e-06, + "loss": 0.122, + "step": 49126 + }, + { + "epoch": 3.0663185024416713, + "grad_norm": 4.195733547210693, + "learning_rate": 3.3704262025489933e-06, + "loss": 0.132, + "step": 49127 + }, + { + "epoch": 3.066332067281606, + "grad_norm": 3.6676347255706787, + "learning_rate": 3.370289159928738e-06, + "loss": 0.1453, + "step": 49128 + }, + { + "epoch": 3.066345632121541, + "grad_norm": 2.3650944232940674, + "learning_rate": 3.3701521173084832e-06, + "loss": 0.0434, + "step": 49129 + }, + { + "epoch": 3.066359196961476, + "grad_norm": 3.301114559173584, + "learning_rate": 3.370015074688228e-06, + "loss": 0.0613, + "step": 49130 + }, + { + "epoch": 3.0663727618014107, + "grad_norm": 4.294020175933838, + "learning_rate": 3.3698780320679736e-06, + "loss": 0.0893, + "step": 49131 + }, + { + "epoch": 3.0663863266413456, + "grad_norm": 3.6091256141662598, + "learning_rate": 3.3697409894477184e-06, + "loss": 0.105, + "step": 49132 + }, + { + "epoch": 3.0663998914812804, + "grad_norm": 3.2966110706329346, + "learning_rate": 3.369603946827464e-06, + "loss": 0.0638, + "step": 49133 + }, + { + "epoch": 3.0664134563212153, + "grad_norm": 5.137697219848633, + "learning_rate": 3.3694669042072087e-06, + "loss": 0.0932, + "step": 49134 + }, + { + "epoch": 3.06642702116115, + "grad_norm": 4.607972621917725, + "learning_rate": 3.369329861586954e-06, + "loss": 0.1073, + "step": 49135 + }, + { + "epoch": 3.066440586001085, + "grad_norm": 2.942427158355713, + "learning_rate": 3.369192818966699e-06, + "loss": 0.102, + "step": 49136 + }, + { + "epoch": 3.06645415084102, + "grad_norm": 4.116865158081055, + "learning_rate": 3.369055776346444e-06, + "loss": 0.1281, + "step": 49137 + }, + { + "epoch": 3.0664677156809548, + "grad_norm": 3.425346612930298, + "learning_rate": 3.368918733726189e-06, + "loss": 0.1228, + "step": 49138 + }, + { + "epoch": 3.0664812805208896, + "grad_norm": 4.74216365814209, + "learning_rate": 3.368781691105934e-06, + "loss": 0.1661, + "step": 49139 + }, + { + "epoch": 3.066494845360825, + "grad_norm": 4.201370716094971, + "learning_rate": 3.3686446484856794e-06, + "loss": 0.157, + "step": 49140 + }, + { + "epoch": 3.06650841020076, + "grad_norm": 3.4658048152923584, + "learning_rate": 3.368507605865424e-06, + "loss": 0.0904, + "step": 49141 + }, + { + "epoch": 3.0665219750406947, + "grad_norm": 2.708146095275879, + "learning_rate": 3.3683705632451697e-06, + "loss": 0.0699, + "step": 49142 + }, + { + "epoch": 3.0665355398806295, + "grad_norm": 3.247168779373169, + "learning_rate": 3.3682335206249145e-06, + "loss": 0.0749, + "step": 49143 + }, + { + "epoch": 3.0665491047205644, + "grad_norm": 2.8301236629486084, + "learning_rate": 3.36809647800466e-06, + "loss": 0.0488, + "step": 49144 + }, + { + "epoch": 3.0665626695604993, + "grad_norm": 3.65317702293396, + "learning_rate": 3.367959435384405e-06, + "loss": 0.115, + "step": 49145 + }, + { + "epoch": 3.066576234400434, + "grad_norm": 4.549195289611816, + "learning_rate": 3.36782239276415e-06, + "loss": 0.0756, + "step": 49146 + }, + { + "epoch": 3.066589799240369, + "grad_norm": 4.108317852020264, + "learning_rate": 3.3676853501438948e-06, + "loss": 0.143, + "step": 49147 + }, + { + "epoch": 3.066603364080304, + "grad_norm": 3.6208503246307373, + "learning_rate": 3.3675483075236404e-06, + "loss": 0.1005, + "step": 49148 + }, + { + "epoch": 3.0666169289202387, + "grad_norm": 3.031268358230591, + "learning_rate": 3.367411264903385e-06, + "loss": 0.1115, + "step": 49149 + }, + { + "epoch": 3.0666304937601736, + "grad_norm": 4.640493869781494, + "learning_rate": 3.36727422228313e-06, + "loss": 0.1468, + "step": 49150 + }, + { + "epoch": 3.0666440586001085, + "grad_norm": 4.0005364418029785, + "learning_rate": 3.3671371796628755e-06, + "loss": 0.1281, + "step": 49151 + }, + { + "epoch": 3.0666576234400433, + "grad_norm": 5.620461940765381, + "learning_rate": 3.3670001370426202e-06, + "loss": 0.1908, + "step": 49152 + }, + { + "epoch": 3.066671188279978, + "grad_norm": 2.787876844406128, + "learning_rate": 3.366863094422366e-06, + "loss": 0.0826, + "step": 49153 + }, + { + "epoch": 3.066684753119913, + "grad_norm": 3.2299373149871826, + "learning_rate": 3.3667260518021106e-06, + "loss": 0.1028, + "step": 49154 + }, + { + "epoch": 3.066698317959848, + "grad_norm": 3.8861498832702637, + "learning_rate": 3.3665890091818558e-06, + "loss": 0.1215, + "step": 49155 + }, + { + "epoch": 3.066711882799783, + "grad_norm": 4.272050380706787, + "learning_rate": 3.366451966561601e-06, + "loss": 0.1184, + "step": 49156 + }, + { + "epoch": 3.0667254476397177, + "grad_norm": 2.8546297550201416, + "learning_rate": 3.366314923941346e-06, + "loss": 0.0492, + "step": 49157 + }, + { + "epoch": 3.0667390124796525, + "grad_norm": 5.155973434448242, + "learning_rate": 3.366177881321091e-06, + "loss": 0.1543, + "step": 49158 + }, + { + "epoch": 3.066752577319588, + "grad_norm": 4.797646522521973, + "learning_rate": 3.3660408387008365e-06, + "loss": 0.1494, + "step": 49159 + }, + { + "epoch": 3.0667661421595227, + "grad_norm": 2.928724765777588, + "learning_rate": 3.3659037960805813e-06, + "loss": 0.0528, + "step": 49160 + }, + { + "epoch": 3.0667797069994576, + "grad_norm": 5.574450969696045, + "learning_rate": 3.365766753460327e-06, + "loss": 0.1444, + "step": 49161 + }, + { + "epoch": 3.0667932718393924, + "grad_norm": 5.740880489349365, + "learning_rate": 3.3656297108400716e-06, + "loss": 0.1589, + "step": 49162 + }, + { + "epoch": 3.0668068366793273, + "grad_norm": 3.818101644515991, + "learning_rate": 3.365492668219817e-06, + "loss": 0.1654, + "step": 49163 + }, + { + "epoch": 3.066820401519262, + "grad_norm": 3.4099576473236084, + "learning_rate": 3.3653556255995615e-06, + "loss": 0.081, + "step": 49164 + }, + { + "epoch": 3.066833966359197, + "grad_norm": 4.012747287750244, + "learning_rate": 3.3652185829793067e-06, + "loss": 0.1993, + "step": 49165 + }, + { + "epoch": 3.066847531199132, + "grad_norm": 4.1534857749938965, + "learning_rate": 3.365081540359052e-06, + "loss": 0.1133, + "step": 49166 + }, + { + "epoch": 3.0668610960390668, + "grad_norm": 5.383401393890381, + "learning_rate": 3.3649444977387967e-06, + "loss": 0.2505, + "step": 49167 + }, + { + "epoch": 3.0668746608790016, + "grad_norm": 3.9232637882232666, + "learning_rate": 3.3648074551185423e-06, + "loss": 0.0669, + "step": 49168 + }, + { + "epoch": 3.0668882257189365, + "grad_norm": 3.5258090496063232, + "learning_rate": 3.364670412498287e-06, + "loss": 0.0948, + "step": 49169 + }, + { + "epoch": 3.0669017905588714, + "grad_norm": 4.283684253692627, + "learning_rate": 3.3645333698780326e-06, + "loss": 0.1442, + "step": 49170 + }, + { + "epoch": 3.066915355398806, + "grad_norm": 3.7880704402923584, + "learning_rate": 3.3643963272577774e-06, + "loss": 0.1799, + "step": 49171 + }, + { + "epoch": 3.066928920238741, + "grad_norm": 4.089993000030518, + "learning_rate": 3.3642592846375226e-06, + "loss": 0.1241, + "step": 49172 + }, + { + "epoch": 3.066942485078676, + "grad_norm": 4.8274054527282715, + "learning_rate": 3.3641222420172677e-06, + "loss": 0.1355, + "step": 49173 + }, + { + "epoch": 3.066956049918611, + "grad_norm": 4.8702898025512695, + "learning_rate": 3.363985199397013e-06, + "loss": 0.1721, + "step": 49174 + }, + { + "epoch": 3.0669696147585457, + "grad_norm": 4.073767185211182, + "learning_rate": 3.3638481567767577e-06, + "loss": 0.1022, + "step": 49175 + }, + { + "epoch": 3.0669831795984805, + "grad_norm": 3.1933038234710693, + "learning_rate": 3.3637111141565033e-06, + "loss": 0.0864, + "step": 49176 + }, + { + "epoch": 3.0669967444384154, + "grad_norm": 4.619418144226074, + "learning_rate": 3.363574071536248e-06, + "loss": 0.064, + "step": 49177 + }, + { + "epoch": 3.0670103092783507, + "grad_norm": 3.1983301639556885, + "learning_rate": 3.3634370289159928e-06, + "loss": 0.1129, + "step": 49178 + }, + { + "epoch": 3.0670238741182856, + "grad_norm": 2.78328800201416, + "learning_rate": 3.3632999862957384e-06, + "loss": 0.0441, + "step": 49179 + }, + { + "epoch": 3.0670374389582205, + "grad_norm": 4.154510021209717, + "learning_rate": 3.363162943675483e-06, + "loss": 0.1257, + "step": 49180 + }, + { + "epoch": 3.0670510037981553, + "grad_norm": 4.110802173614502, + "learning_rate": 3.3630259010552283e-06, + "loss": 0.1482, + "step": 49181 + }, + { + "epoch": 3.06706456863809, + "grad_norm": 5.779505252838135, + "learning_rate": 3.3628888584349735e-06, + "loss": 0.1221, + "step": 49182 + }, + { + "epoch": 3.067078133478025, + "grad_norm": 5.795300483703613, + "learning_rate": 3.3627518158147187e-06, + "loss": 0.1274, + "step": 49183 + }, + { + "epoch": 3.06709169831796, + "grad_norm": 3.293513536453247, + "learning_rate": 3.3626147731944634e-06, + "loss": 0.0815, + "step": 49184 + }, + { + "epoch": 3.067105263157895, + "grad_norm": 3.999699115753174, + "learning_rate": 3.362477730574209e-06, + "loss": 0.134, + "step": 49185 + }, + { + "epoch": 3.0671188279978296, + "grad_norm": 3.307093858718872, + "learning_rate": 3.3623406879539538e-06, + "loss": 0.0765, + "step": 49186 + }, + { + "epoch": 3.0671323928377645, + "grad_norm": 4.259178161621094, + "learning_rate": 3.3622036453336994e-06, + "loss": 0.1382, + "step": 49187 + }, + { + "epoch": 3.0671459576776994, + "grad_norm": 4.098701000213623, + "learning_rate": 3.362066602713444e-06, + "loss": 0.1234, + "step": 49188 + }, + { + "epoch": 3.0671595225176342, + "grad_norm": 6.22632360458374, + "learning_rate": 3.3619295600931893e-06, + "loss": 0.1177, + "step": 49189 + }, + { + "epoch": 3.067173087357569, + "grad_norm": 4.598819732666016, + "learning_rate": 3.3617925174729345e-06, + "loss": 0.1013, + "step": 49190 + }, + { + "epoch": 3.067186652197504, + "grad_norm": 4.746016979217529, + "learning_rate": 3.3616554748526797e-06, + "loss": 0.1406, + "step": 49191 + }, + { + "epoch": 3.067200217037439, + "grad_norm": 3.6619138717651367, + "learning_rate": 3.3615184322324244e-06, + "loss": 0.1286, + "step": 49192 + }, + { + "epoch": 3.0672137818773737, + "grad_norm": 4.540489673614502, + "learning_rate": 3.3613813896121696e-06, + "loss": 0.153, + "step": 49193 + }, + { + "epoch": 3.0672273467173086, + "grad_norm": 4.200231552124023, + "learning_rate": 3.361244346991915e-06, + "loss": 0.1074, + "step": 49194 + }, + { + "epoch": 3.0672409115572434, + "grad_norm": 5.60476016998291, + "learning_rate": 3.3611073043716595e-06, + "loss": 0.1827, + "step": 49195 + }, + { + "epoch": 3.0672544763971787, + "grad_norm": 3.7223780155181885, + "learning_rate": 3.360970261751405e-06, + "loss": 0.1117, + "step": 49196 + }, + { + "epoch": 3.0672680412371136, + "grad_norm": 5.206801414489746, + "learning_rate": 3.36083321913115e-06, + "loss": 0.1577, + "step": 49197 + }, + { + "epoch": 3.0672816060770485, + "grad_norm": 5.735447406768799, + "learning_rate": 3.3606961765108955e-06, + "loss": 0.1155, + "step": 49198 + }, + { + "epoch": 3.0672951709169833, + "grad_norm": 3.0823488235473633, + "learning_rate": 3.3605591338906403e-06, + "loss": 0.0977, + "step": 49199 + }, + { + "epoch": 3.067308735756918, + "grad_norm": 2.904426336288452, + "learning_rate": 3.3604220912703854e-06, + "loss": 0.0612, + "step": 49200 + }, + { + "epoch": 3.067322300596853, + "grad_norm": 4.4935383796691895, + "learning_rate": 3.36028504865013e-06, + "loss": 0.171, + "step": 49201 + }, + { + "epoch": 3.067335865436788, + "grad_norm": 3.5358214378356934, + "learning_rate": 3.360148006029876e-06, + "loss": 0.0912, + "step": 49202 + }, + { + "epoch": 3.067349430276723, + "grad_norm": 3.891340970993042, + "learning_rate": 3.3600109634096206e-06, + "loss": 0.1395, + "step": 49203 + }, + { + "epoch": 3.0673629951166577, + "grad_norm": 4.943097114562988, + "learning_rate": 3.359873920789366e-06, + "loss": 0.1466, + "step": 49204 + }, + { + "epoch": 3.0673765599565925, + "grad_norm": 2.8031558990478516, + "learning_rate": 3.359736878169111e-06, + "loss": 0.0931, + "step": 49205 + }, + { + "epoch": 3.0673901247965274, + "grad_norm": 4.188027381896973, + "learning_rate": 3.3595998355488557e-06, + "loss": 0.1689, + "step": 49206 + }, + { + "epoch": 3.0674036896364623, + "grad_norm": 3.98128342628479, + "learning_rate": 3.3594627929286013e-06, + "loss": 0.0736, + "step": 49207 + }, + { + "epoch": 3.067417254476397, + "grad_norm": 3.8641743659973145, + "learning_rate": 3.359325750308346e-06, + "loss": 0.1104, + "step": 49208 + }, + { + "epoch": 3.067430819316332, + "grad_norm": 3.6380245685577393, + "learning_rate": 3.359188707688091e-06, + "loss": 0.088, + "step": 49209 + }, + { + "epoch": 3.067444384156267, + "grad_norm": 5.099705696105957, + "learning_rate": 3.3590516650678364e-06, + "loss": 0.1378, + "step": 49210 + }, + { + "epoch": 3.0674579489962017, + "grad_norm": 3.446545124053955, + "learning_rate": 3.3589146224475816e-06, + "loss": 0.0679, + "step": 49211 + }, + { + "epoch": 3.0674715138361366, + "grad_norm": 3.2951977252960205, + "learning_rate": 3.3587775798273263e-06, + "loss": 0.112, + "step": 49212 + }, + { + "epoch": 3.0674850786760715, + "grad_norm": 2.7023189067840576, + "learning_rate": 3.358640537207072e-06, + "loss": 0.0541, + "step": 49213 + }, + { + "epoch": 3.0674986435160063, + "grad_norm": 3.244004487991333, + "learning_rate": 3.3585034945868167e-06, + "loss": 0.0602, + "step": 49214 + }, + { + "epoch": 3.067512208355941, + "grad_norm": 3.7226500511169434, + "learning_rate": 3.3583664519665623e-06, + "loss": 0.1073, + "step": 49215 + }, + { + "epoch": 3.0675257731958765, + "grad_norm": 3.6210129261016846, + "learning_rate": 3.358229409346307e-06, + "loss": 0.0975, + "step": 49216 + }, + { + "epoch": 3.0675393380358114, + "grad_norm": 4.39024019241333, + "learning_rate": 3.3580923667260522e-06, + "loss": 0.1031, + "step": 49217 + }, + { + "epoch": 3.0675529028757462, + "grad_norm": 1.9104506969451904, + "learning_rate": 3.357955324105797e-06, + "loss": 0.0311, + "step": 49218 + }, + { + "epoch": 3.067566467715681, + "grad_norm": 2.790783166885376, + "learning_rate": 3.357818281485542e-06, + "loss": 0.0795, + "step": 49219 + }, + { + "epoch": 3.067580032555616, + "grad_norm": 4.20957088470459, + "learning_rate": 3.3576812388652873e-06, + "loss": 0.1217, + "step": 49220 + }, + { + "epoch": 3.067593597395551, + "grad_norm": 4.538148403167725, + "learning_rate": 3.357544196245032e-06, + "loss": 0.1111, + "step": 49221 + }, + { + "epoch": 3.0676071622354857, + "grad_norm": 4.262786388397217, + "learning_rate": 3.3574071536247777e-06, + "loss": 0.1329, + "step": 49222 + }, + { + "epoch": 3.0676207270754206, + "grad_norm": 5.57061243057251, + "learning_rate": 3.3572701110045224e-06, + "loss": 0.1007, + "step": 49223 + }, + { + "epoch": 3.0676342919153554, + "grad_norm": 5.003657341003418, + "learning_rate": 3.357133068384268e-06, + "loss": 0.1355, + "step": 49224 + }, + { + "epoch": 3.0676478567552903, + "grad_norm": 4.176823139190674, + "learning_rate": 3.356996025764013e-06, + "loss": 0.0598, + "step": 49225 + }, + { + "epoch": 3.067661421595225, + "grad_norm": 5.377752304077148, + "learning_rate": 3.356858983143758e-06, + "loss": 0.1324, + "step": 49226 + }, + { + "epoch": 3.06767498643516, + "grad_norm": 6.697861194610596, + "learning_rate": 3.356721940523503e-06, + "loss": 0.2531, + "step": 49227 + }, + { + "epoch": 3.067688551275095, + "grad_norm": 4.23623514175415, + "learning_rate": 3.3565848979032483e-06, + "loss": 0.1431, + "step": 49228 + }, + { + "epoch": 3.0677021161150297, + "grad_norm": 4.64799165725708, + "learning_rate": 3.356447855282993e-06, + "loss": 0.137, + "step": 49229 + }, + { + "epoch": 3.0677156809549646, + "grad_norm": 4.763408184051514, + "learning_rate": 3.3563108126627387e-06, + "loss": 0.159, + "step": 49230 + }, + { + "epoch": 3.0677292457948995, + "grad_norm": 5.482911586761475, + "learning_rate": 3.3561737700424834e-06, + "loss": 0.1335, + "step": 49231 + }, + { + "epoch": 3.0677428106348343, + "grad_norm": 3.9997057914733887, + "learning_rate": 3.356036727422229e-06, + "loss": 0.0454, + "step": 49232 + }, + { + "epoch": 3.067756375474769, + "grad_norm": 3.4024431705474854, + "learning_rate": 3.355899684801974e-06, + "loss": 0.0946, + "step": 49233 + }, + { + "epoch": 3.0677699403147045, + "grad_norm": 3.077324628829956, + "learning_rate": 3.3557626421817186e-06, + "loss": 0.0481, + "step": 49234 + }, + { + "epoch": 3.0677835051546394, + "grad_norm": 4.132358074188232, + "learning_rate": 3.3556255995614637e-06, + "loss": 0.1283, + "step": 49235 + }, + { + "epoch": 3.0677970699945742, + "grad_norm": 4.622515678405762, + "learning_rate": 3.355488556941209e-06, + "loss": 0.0845, + "step": 49236 + }, + { + "epoch": 3.067810634834509, + "grad_norm": 5.066006660461426, + "learning_rate": 3.355351514320954e-06, + "loss": 0.0859, + "step": 49237 + }, + { + "epoch": 3.067824199674444, + "grad_norm": 4.272841930389404, + "learning_rate": 3.355214471700699e-06, + "loss": 0.0847, + "step": 49238 + }, + { + "epoch": 3.067837764514379, + "grad_norm": 3.549151659011841, + "learning_rate": 3.3550774290804445e-06, + "loss": 0.0818, + "step": 49239 + }, + { + "epoch": 3.0678513293543137, + "grad_norm": 3.8129665851593018, + "learning_rate": 3.354940386460189e-06, + "loss": 0.0999, + "step": 49240 + }, + { + "epoch": 3.0678648941942486, + "grad_norm": 4.995217323303223, + "learning_rate": 3.354803343839935e-06, + "loss": 0.1275, + "step": 49241 + }, + { + "epoch": 3.0678784590341834, + "grad_norm": 2.7890210151672363, + "learning_rate": 3.3546663012196796e-06, + "loss": 0.0692, + "step": 49242 + }, + { + "epoch": 3.0678920238741183, + "grad_norm": 4.429668426513672, + "learning_rate": 3.3545292585994247e-06, + "loss": 0.1036, + "step": 49243 + }, + { + "epoch": 3.067905588714053, + "grad_norm": 3.5043365955352783, + "learning_rate": 3.35439221597917e-06, + "loss": 0.0837, + "step": 49244 + }, + { + "epoch": 3.067919153553988, + "grad_norm": 2.8761632442474365, + "learning_rate": 3.354255173358915e-06, + "loss": 0.071, + "step": 49245 + }, + { + "epoch": 3.067932718393923, + "grad_norm": 4.480981349945068, + "learning_rate": 3.35411813073866e-06, + "loss": 0.0819, + "step": 49246 + }, + { + "epoch": 3.0679462832338578, + "grad_norm": 4.061190605163574, + "learning_rate": 3.353981088118405e-06, + "loss": 0.0929, + "step": 49247 + }, + { + "epoch": 3.0679598480737926, + "grad_norm": 3.9100282192230225, + "learning_rate": 3.3538440454981502e-06, + "loss": 0.0766, + "step": 49248 + }, + { + "epoch": 3.0679734129137275, + "grad_norm": 5.578010082244873, + "learning_rate": 3.353707002877895e-06, + "loss": 0.1786, + "step": 49249 + }, + { + "epoch": 3.0679869777536624, + "grad_norm": 2.7999653816223145, + "learning_rate": 3.3535699602576406e-06, + "loss": 0.0435, + "step": 49250 + }, + { + "epoch": 3.0680005425935972, + "grad_norm": 3.185310125350952, + "learning_rate": 3.3534329176373853e-06, + "loss": 0.0972, + "step": 49251 + }, + { + "epoch": 3.068014107433532, + "grad_norm": 2.9917380809783936, + "learning_rate": 3.3532958750171305e-06, + "loss": 0.0815, + "step": 49252 + }, + { + "epoch": 3.068027672273467, + "grad_norm": 4.221384048461914, + "learning_rate": 3.3531588323968757e-06, + "loss": 0.1403, + "step": 49253 + }, + { + "epoch": 3.0680412371134023, + "grad_norm": 3.3649609088897705, + "learning_rate": 3.353021789776621e-06, + "loss": 0.0706, + "step": 49254 + }, + { + "epoch": 3.068054801953337, + "grad_norm": 4.403556823730469, + "learning_rate": 3.3528847471563656e-06, + "loss": 0.1136, + "step": 49255 + }, + { + "epoch": 3.068068366793272, + "grad_norm": 3.8673243522644043, + "learning_rate": 3.3527477045361112e-06, + "loss": 0.0872, + "step": 49256 + }, + { + "epoch": 3.068081931633207, + "grad_norm": 5.119351863861084, + "learning_rate": 3.352610661915856e-06, + "loss": 0.1025, + "step": 49257 + }, + { + "epoch": 3.0680954964731417, + "grad_norm": 5.69400691986084, + "learning_rate": 3.3524736192956016e-06, + "loss": 0.0978, + "step": 49258 + }, + { + "epoch": 3.0681090613130766, + "grad_norm": 4.6554436683654785, + "learning_rate": 3.3523365766753463e-06, + "loss": 0.1444, + "step": 49259 + }, + { + "epoch": 3.0681226261530115, + "grad_norm": 4.374721050262451, + "learning_rate": 3.352199534055091e-06, + "loss": 0.0793, + "step": 49260 + }, + { + "epoch": 3.0681361909929463, + "grad_norm": 3.686544895172119, + "learning_rate": 3.3520624914348367e-06, + "loss": 0.1071, + "step": 49261 + }, + { + "epoch": 3.068149755832881, + "grad_norm": 6.247331142425537, + "learning_rate": 3.3519254488145815e-06, + "loss": 0.1782, + "step": 49262 + }, + { + "epoch": 3.068163320672816, + "grad_norm": 4.4100446701049805, + "learning_rate": 3.3517884061943266e-06, + "loss": 0.0977, + "step": 49263 + }, + { + "epoch": 3.068176885512751, + "grad_norm": 3.6004390716552734, + "learning_rate": 3.351651363574072e-06, + "loss": 0.1199, + "step": 49264 + }, + { + "epoch": 3.068190450352686, + "grad_norm": 3.8366622924804688, + "learning_rate": 3.351514320953817e-06, + "loss": 0.1639, + "step": 49265 + }, + { + "epoch": 3.0682040151926206, + "grad_norm": 3.1203055381774902, + "learning_rate": 3.3513772783335617e-06, + "loss": 0.0856, + "step": 49266 + }, + { + "epoch": 3.0682175800325555, + "grad_norm": 3.335658550262451, + "learning_rate": 3.3512402357133073e-06, + "loss": 0.0712, + "step": 49267 + }, + { + "epoch": 3.0682311448724904, + "grad_norm": 3.528200626373291, + "learning_rate": 3.351103193093052e-06, + "loss": 0.0992, + "step": 49268 + }, + { + "epoch": 3.0682447097124252, + "grad_norm": 5.318568229675293, + "learning_rate": 3.3509661504727977e-06, + "loss": 0.1226, + "step": 49269 + }, + { + "epoch": 3.06825827455236, + "grad_norm": 3.9547765254974365, + "learning_rate": 3.3508291078525425e-06, + "loss": 0.1294, + "step": 49270 + }, + { + "epoch": 3.068271839392295, + "grad_norm": 4.2318549156188965, + "learning_rate": 3.3506920652322876e-06, + "loss": 0.1055, + "step": 49271 + }, + { + "epoch": 3.0682854042322303, + "grad_norm": 3.915825128555298, + "learning_rate": 3.3505550226120324e-06, + "loss": 0.0889, + "step": 49272 + }, + { + "epoch": 3.068298969072165, + "grad_norm": 4.522284507751465, + "learning_rate": 3.350417979991778e-06, + "loss": 0.1515, + "step": 49273 + }, + { + "epoch": 3.0683125339121, + "grad_norm": 5.3323564529418945, + "learning_rate": 3.3502809373715228e-06, + "loss": 0.1505, + "step": 49274 + }, + { + "epoch": 3.068326098752035, + "grad_norm": 4.720124244689941, + "learning_rate": 3.3501438947512675e-06, + "loss": 0.1431, + "step": 49275 + }, + { + "epoch": 3.0683396635919697, + "grad_norm": 4.19878625869751, + "learning_rate": 3.350006852131013e-06, + "loss": 0.0948, + "step": 49276 + }, + { + "epoch": 3.0683532284319046, + "grad_norm": 4.084724426269531, + "learning_rate": 3.349869809510758e-06, + "loss": 0.098, + "step": 49277 + }, + { + "epoch": 3.0683667932718395, + "grad_norm": 4.397781848907471, + "learning_rate": 3.3497327668905035e-06, + "loss": 0.1399, + "step": 49278 + }, + { + "epoch": 3.0683803581117743, + "grad_norm": 7.376858711242676, + "learning_rate": 3.3495957242702482e-06, + "loss": 0.1358, + "step": 49279 + }, + { + "epoch": 3.068393922951709, + "grad_norm": 3.20218825340271, + "learning_rate": 3.3494586816499934e-06, + "loss": 0.0614, + "step": 49280 + }, + { + "epoch": 3.068407487791644, + "grad_norm": 3.8924026489257812, + "learning_rate": 3.3493216390297386e-06, + "loss": 0.1013, + "step": 49281 + }, + { + "epoch": 3.068421052631579, + "grad_norm": 2.579824447631836, + "learning_rate": 3.3491845964094838e-06, + "loss": 0.06, + "step": 49282 + }, + { + "epoch": 3.068434617471514, + "grad_norm": 3.36977481842041, + "learning_rate": 3.3490475537892285e-06, + "loss": 0.0978, + "step": 49283 + }, + { + "epoch": 3.0684481823114487, + "grad_norm": 4.312091827392578, + "learning_rate": 3.348910511168974e-06, + "loss": 0.1577, + "step": 49284 + }, + { + "epoch": 3.0684617471513835, + "grad_norm": 3.8428685665130615, + "learning_rate": 3.348773468548719e-06, + "loss": 0.1108, + "step": 49285 + }, + { + "epoch": 3.0684753119913184, + "grad_norm": 3.40897536277771, + "learning_rate": 3.3486364259284645e-06, + "loss": 0.0959, + "step": 49286 + }, + { + "epoch": 3.0684888768312533, + "grad_norm": 3.0634119510650635, + "learning_rate": 3.3484993833082092e-06, + "loss": 0.0764, + "step": 49287 + }, + { + "epoch": 3.068502441671188, + "grad_norm": 3.9796197414398193, + "learning_rate": 3.348362340687954e-06, + "loss": 0.1012, + "step": 49288 + }, + { + "epoch": 3.068516006511123, + "grad_norm": 2.363513708114624, + "learning_rate": 3.348225298067699e-06, + "loss": 0.0575, + "step": 49289 + }, + { + "epoch": 3.068529571351058, + "grad_norm": 3.713099718093872, + "learning_rate": 3.3480882554474443e-06, + "loss": 0.0766, + "step": 49290 + }, + { + "epoch": 3.0685431361909927, + "grad_norm": 4.618884086608887, + "learning_rate": 3.3479512128271895e-06, + "loss": 0.1034, + "step": 49291 + }, + { + "epoch": 3.068556701030928, + "grad_norm": 3.8876235485076904, + "learning_rate": 3.3478141702069343e-06, + "loss": 0.0751, + "step": 49292 + }, + { + "epoch": 3.068570265870863, + "grad_norm": 2.9514997005462646, + "learning_rate": 3.34767712758668e-06, + "loss": 0.0505, + "step": 49293 + }, + { + "epoch": 3.0685838307107978, + "grad_norm": 6.858623504638672, + "learning_rate": 3.3475400849664246e-06, + "loss": 0.0704, + "step": 49294 + }, + { + "epoch": 3.0685973955507326, + "grad_norm": 3.3357434272766113, + "learning_rate": 3.3474030423461702e-06, + "loss": 0.0694, + "step": 49295 + }, + { + "epoch": 3.0686109603906675, + "grad_norm": 3.056368589401245, + "learning_rate": 3.347265999725915e-06, + "loss": 0.1055, + "step": 49296 + }, + { + "epoch": 3.0686245252306024, + "grad_norm": 3.285891532897949, + "learning_rate": 3.34712895710566e-06, + "loss": 0.1105, + "step": 49297 + }, + { + "epoch": 3.0686380900705372, + "grad_norm": 3.537853240966797, + "learning_rate": 3.3469919144854053e-06, + "loss": 0.1098, + "step": 49298 + }, + { + "epoch": 3.068651654910472, + "grad_norm": 4.014118194580078, + "learning_rate": 3.3468548718651505e-06, + "loss": 0.141, + "step": 49299 + }, + { + "epoch": 3.068665219750407, + "grad_norm": 5.988209247589111, + "learning_rate": 3.3467178292448953e-06, + "loss": 0.0952, + "step": 49300 + }, + { + "epoch": 3.068678784590342, + "grad_norm": 2.344266891479492, + "learning_rate": 3.346580786624641e-06, + "loss": 0.0438, + "step": 49301 + }, + { + "epoch": 3.0686923494302767, + "grad_norm": 3.173691749572754, + "learning_rate": 3.3464437440043856e-06, + "loss": 0.0804, + "step": 49302 + }, + { + "epoch": 3.0687059142702116, + "grad_norm": 3.69276762008667, + "learning_rate": 3.3463067013841304e-06, + "loss": 0.1177, + "step": 49303 + }, + { + "epoch": 3.0687194791101464, + "grad_norm": 4.185023784637451, + "learning_rate": 3.346169658763876e-06, + "loss": 0.0877, + "step": 49304 + }, + { + "epoch": 3.0687330439500813, + "grad_norm": 3.8320255279541016, + "learning_rate": 3.3460326161436208e-06, + "loss": 0.0635, + "step": 49305 + }, + { + "epoch": 3.068746608790016, + "grad_norm": 3.6795008182525635, + "learning_rate": 3.345895573523366e-06, + "loss": 0.0805, + "step": 49306 + }, + { + "epoch": 3.068760173629951, + "grad_norm": 3.013129711151123, + "learning_rate": 3.345758530903111e-06, + "loss": 0.0609, + "step": 49307 + }, + { + "epoch": 3.068773738469886, + "grad_norm": 3.137620210647583, + "learning_rate": 3.3456214882828563e-06, + "loss": 0.0483, + "step": 49308 + }, + { + "epoch": 3.0687873033098207, + "grad_norm": 3.23089861869812, + "learning_rate": 3.345484445662601e-06, + "loss": 0.0702, + "step": 49309 + }, + { + "epoch": 3.068800868149756, + "grad_norm": 3.343353271484375, + "learning_rate": 3.3453474030423466e-06, + "loss": 0.0663, + "step": 49310 + }, + { + "epoch": 3.068814432989691, + "grad_norm": 3.9501445293426514, + "learning_rate": 3.3452103604220914e-06, + "loss": 0.0463, + "step": 49311 + }, + { + "epoch": 3.068827997829626, + "grad_norm": 3.3330509662628174, + "learning_rate": 3.345073317801837e-06, + "loss": 0.0778, + "step": 49312 + }, + { + "epoch": 3.0688415626695607, + "grad_norm": 5.007869243621826, + "learning_rate": 3.3449362751815818e-06, + "loss": 0.0972, + "step": 49313 + }, + { + "epoch": 3.0688551275094955, + "grad_norm": 3.749509572982788, + "learning_rate": 3.344799232561327e-06, + "loss": 0.0794, + "step": 49314 + }, + { + "epoch": 3.0688686923494304, + "grad_norm": 5.179024696350098, + "learning_rate": 3.344662189941072e-06, + "loss": 0.0894, + "step": 49315 + }, + { + "epoch": 3.0688822571893652, + "grad_norm": 3.2341017723083496, + "learning_rate": 3.344525147320817e-06, + "loss": 0.0527, + "step": 49316 + }, + { + "epoch": 3.0688958220293, + "grad_norm": 3.584935426712036, + "learning_rate": 3.344388104700562e-06, + "loss": 0.1061, + "step": 49317 + }, + { + "epoch": 3.068909386869235, + "grad_norm": 4.003571510314941, + "learning_rate": 3.3442510620803072e-06, + "loss": 0.0825, + "step": 49318 + }, + { + "epoch": 3.06892295170917, + "grad_norm": 4.964996814727783, + "learning_rate": 3.3441140194600524e-06, + "loss": 0.1041, + "step": 49319 + }, + { + "epoch": 3.0689365165491047, + "grad_norm": 3.116989850997925, + "learning_rate": 3.343976976839797e-06, + "loss": 0.0632, + "step": 49320 + }, + { + "epoch": 3.0689500813890396, + "grad_norm": 3.625241279602051, + "learning_rate": 3.3438399342195428e-06, + "loss": 0.0635, + "step": 49321 + }, + { + "epoch": 3.0689636462289744, + "grad_norm": 2.1600701808929443, + "learning_rate": 3.3437028915992875e-06, + "loss": 0.0445, + "step": 49322 + }, + { + "epoch": 3.0689772110689093, + "grad_norm": 6.0666022300720215, + "learning_rate": 3.3435658489790327e-06, + "loss": 0.1376, + "step": 49323 + }, + { + "epoch": 3.068990775908844, + "grad_norm": 5.098206043243408, + "learning_rate": 3.343428806358778e-06, + "loss": 0.1445, + "step": 49324 + }, + { + "epoch": 3.069004340748779, + "grad_norm": 6.0585856437683105, + "learning_rate": 3.343291763738523e-06, + "loss": 0.1374, + "step": 49325 + }, + { + "epoch": 3.069017905588714, + "grad_norm": 3.4120917320251465, + "learning_rate": 3.343154721118268e-06, + "loss": 0.0783, + "step": 49326 + }, + { + "epoch": 3.0690314704286488, + "grad_norm": 3.986790895462036, + "learning_rate": 3.3430176784980134e-06, + "loss": 0.1321, + "step": 49327 + }, + { + "epoch": 3.0690450352685836, + "grad_norm": 3.658839702606201, + "learning_rate": 3.342880635877758e-06, + "loss": 0.1351, + "step": 49328 + }, + { + "epoch": 3.0690586001085185, + "grad_norm": 2.892179250717163, + "learning_rate": 3.342743593257503e-06, + "loss": 0.0891, + "step": 49329 + }, + { + "epoch": 3.069072164948454, + "grad_norm": 4.515835285186768, + "learning_rate": 3.3426065506372485e-06, + "loss": 0.1819, + "step": 49330 + }, + { + "epoch": 3.0690857297883887, + "grad_norm": 3.291424036026001, + "learning_rate": 3.3424695080169933e-06, + "loss": 0.0513, + "step": 49331 + }, + { + "epoch": 3.0690992946283235, + "grad_norm": 3.3422508239746094, + "learning_rate": 3.342332465396739e-06, + "loss": 0.118, + "step": 49332 + }, + { + "epoch": 3.0691128594682584, + "grad_norm": 5.357380390167236, + "learning_rate": 3.3421954227764836e-06, + "loss": 0.1115, + "step": 49333 + }, + { + "epoch": 3.0691264243081933, + "grad_norm": 4.558484077453613, + "learning_rate": 3.342058380156229e-06, + "loss": 0.1151, + "step": 49334 + }, + { + "epoch": 3.069139989148128, + "grad_norm": 5.619832515716553, + "learning_rate": 3.341921337535974e-06, + "loss": 0.0908, + "step": 49335 + }, + { + "epoch": 3.069153553988063, + "grad_norm": 3.3456714153289795, + "learning_rate": 3.341784294915719e-06, + "loss": 0.1386, + "step": 49336 + }, + { + "epoch": 3.069167118827998, + "grad_norm": 5.941598415374756, + "learning_rate": 3.341647252295464e-06, + "loss": 0.1183, + "step": 49337 + }, + { + "epoch": 3.0691806836679327, + "grad_norm": 4.168514251708984, + "learning_rate": 3.3415102096752095e-06, + "loss": 0.1057, + "step": 49338 + }, + { + "epoch": 3.0691942485078676, + "grad_norm": 5.712820053100586, + "learning_rate": 3.3413731670549543e-06, + "loss": 0.1349, + "step": 49339 + }, + { + "epoch": 3.0692078133478025, + "grad_norm": 3.723621368408203, + "learning_rate": 3.3412361244347e-06, + "loss": 0.1193, + "step": 49340 + }, + { + "epoch": 3.0692213781877373, + "grad_norm": 4.536413669586182, + "learning_rate": 3.3410990818144447e-06, + "loss": 0.1247, + "step": 49341 + }, + { + "epoch": 3.069234943027672, + "grad_norm": 3.133209705352783, + "learning_rate": 3.34096203919419e-06, + "loss": 0.0603, + "step": 49342 + }, + { + "epoch": 3.069248507867607, + "grad_norm": 6.24199914932251, + "learning_rate": 3.3408249965739346e-06, + "loss": 0.1852, + "step": 49343 + }, + { + "epoch": 3.069262072707542, + "grad_norm": 3.706056594848633, + "learning_rate": 3.3406879539536798e-06, + "loss": 0.1122, + "step": 49344 + }, + { + "epoch": 3.069275637547477, + "grad_norm": 5.1308674812316895, + "learning_rate": 3.340550911333425e-06, + "loss": 0.0932, + "step": 49345 + }, + { + "epoch": 3.0692892023874117, + "grad_norm": 3.705925226211548, + "learning_rate": 3.3404138687131697e-06, + "loss": 0.1031, + "step": 49346 + }, + { + "epoch": 3.0693027672273465, + "grad_norm": 4.882379055023193, + "learning_rate": 3.3402768260929153e-06, + "loss": 0.1551, + "step": 49347 + }, + { + "epoch": 3.069316332067282, + "grad_norm": 5.337729454040527, + "learning_rate": 3.34013978347266e-06, + "loss": 0.1138, + "step": 49348 + }, + { + "epoch": 3.0693298969072167, + "grad_norm": 5.51746940612793, + "learning_rate": 3.3400027408524057e-06, + "loss": 0.1439, + "step": 49349 + }, + { + "epoch": 3.0693434617471516, + "grad_norm": 5.389616966247559, + "learning_rate": 3.3398656982321504e-06, + "loss": 0.1342, + "step": 49350 + }, + { + "epoch": 3.0693570265870864, + "grad_norm": 3.474874496459961, + "learning_rate": 3.3397286556118956e-06, + "loss": 0.0941, + "step": 49351 + }, + { + "epoch": 3.0693705914270213, + "grad_norm": 5.460740089416504, + "learning_rate": 3.3395916129916408e-06, + "loss": 0.1778, + "step": 49352 + }, + { + "epoch": 3.069384156266956, + "grad_norm": 4.903871536254883, + "learning_rate": 3.339454570371386e-06, + "loss": 0.1266, + "step": 49353 + }, + { + "epoch": 3.069397721106891, + "grad_norm": 4.207666873931885, + "learning_rate": 3.3393175277511307e-06, + "loss": 0.1137, + "step": 49354 + }, + { + "epoch": 3.069411285946826, + "grad_norm": 6.308868885040283, + "learning_rate": 3.3391804851308763e-06, + "loss": 0.1233, + "step": 49355 + }, + { + "epoch": 3.0694248507867608, + "grad_norm": 4.533684730529785, + "learning_rate": 3.339043442510621e-06, + "loss": 0.1205, + "step": 49356 + }, + { + "epoch": 3.0694384156266956, + "grad_norm": 4.523166656494141, + "learning_rate": 3.338906399890366e-06, + "loss": 0.175, + "step": 49357 + }, + { + "epoch": 3.0694519804666305, + "grad_norm": 4.015976428985596, + "learning_rate": 3.3387693572701114e-06, + "loss": 0.0956, + "step": 49358 + }, + { + "epoch": 3.0694655453065653, + "grad_norm": 6.637105464935303, + "learning_rate": 3.338632314649856e-06, + "loss": 0.2879, + "step": 49359 + }, + { + "epoch": 3.0694791101465, + "grad_norm": 5.701869487762451, + "learning_rate": 3.3384952720296014e-06, + "loss": 0.2098, + "step": 49360 + }, + { + "epoch": 3.069492674986435, + "grad_norm": 2.968196153640747, + "learning_rate": 3.3383582294093465e-06, + "loss": 0.0651, + "step": 49361 + }, + { + "epoch": 3.06950623982637, + "grad_norm": 3.787137031555176, + "learning_rate": 3.3382211867890917e-06, + "loss": 0.1043, + "step": 49362 + }, + { + "epoch": 3.069519804666305, + "grad_norm": 3.9211244583129883, + "learning_rate": 3.3380841441688365e-06, + "loss": 0.116, + "step": 49363 + }, + { + "epoch": 3.0695333695062397, + "grad_norm": 4.57141637802124, + "learning_rate": 3.337947101548582e-06, + "loss": 0.1683, + "step": 49364 + }, + { + "epoch": 3.0695469343461745, + "grad_norm": 4.531013011932373, + "learning_rate": 3.337810058928327e-06, + "loss": 0.1779, + "step": 49365 + }, + { + "epoch": 3.0695604991861094, + "grad_norm": 3.53224515914917, + "learning_rate": 3.3376730163080724e-06, + "loss": 0.0808, + "step": 49366 + }, + { + "epoch": 3.0695740640260443, + "grad_norm": 3.5119755268096924, + "learning_rate": 3.337535973687817e-06, + "loss": 0.1266, + "step": 49367 + }, + { + "epoch": 3.0695876288659796, + "grad_norm": 3.8358359336853027, + "learning_rate": 3.3373989310675624e-06, + "loss": 0.1395, + "step": 49368 + }, + { + "epoch": 3.0696011937059144, + "grad_norm": 3.5308873653411865, + "learning_rate": 3.3372618884473075e-06, + "loss": 0.1229, + "step": 49369 + }, + { + "epoch": 3.0696147585458493, + "grad_norm": 4.490169048309326, + "learning_rate": 3.3371248458270527e-06, + "loss": 0.1011, + "step": 49370 + }, + { + "epoch": 3.069628323385784, + "grad_norm": 6.210794925689697, + "learning_rate": 3.3369878032067975e-06, + "loss": 0.138, + "step": 49371 + }, + { + "epoch": 3.069641888225719, + "grad_norm": 2.906289577484131, + "learning_rate": 3.3368507605865427e-06, + "loss": 0.1025, + "step": 49372 + }, + { + "epoch": 3.069655453065654, + "grad_norm": 6.03731632232666, + "learning_rate": 3.336713717966288e-06, + "loss": 0.1276, + "step": 49373 + }, + { + "epoch": 3.0696690179055888, + "grad_norm": 3.147559642791748, + "learning_rate": 3.3365766753460326e-06, + "loss": 0.066, + "step": 49374 + }, + { + "epoch": 3.0696825827455236, + "grad_norm": 4.0416693687438965, + "learning_rate": 3.336439632725778e-06, + "loss": 0.1113, + "step": 49375 + }, + { + "epoch": 3.0696961475854585, + "grad_norm": 3.8069159984588623, + "learning_rate": 3.336302590105523e-06, + "loss": 0.0685, + "step": 49376 + }, + { + "epoch": 3.0697097124253934, + "grad_norm": 4.090577125549316, + "learning_rate": 3.336165547485268e-06, + "loss": 0.0932, + "step": 49377 + }, + { + "epoch": 3.0697232772653282, + "grad_norm": 3.500180721282959, + "learning_rate": 3.3360285048650133e-06, + "loss": 0.0953, + "step": 49378 + }, + { + "epoch": 3.069736842105263, + "grad_norm": 6.471298694610596, + "learning_rate": 3.3358914622447585e-06, + "loss": 0.172, + "step": 49379 + }, + { + "epoch": 3.069750406945198, + "grad_norm": 3.9713587760925293, + "learning_rate": 3.3357544196245032e-06, + "loss": 0.1484, + "step": 49380 + }, + { + "epoch": 3.069763971785133, + "grad_norm": 3.926276683807373, + "learning_rate": 3.335617377004249e-06, + "loss": 0.0675, + "step": 49381 + }, + { + "epoch": 3.0697775366250677, + "grad_norm": 3.545168399810791, + "learning_rate": 3.3354803343839936e-06, + "loss": 0.1246, + "step": 49382 + }, + { + "epoch": 3.0697911014650026, + "grad_norm": 3.5868892669677734, + "learning_rate": 3.335343291763739e-06, + "loss": 0.0676, + "step": 49383 + }, + { + "epoch": 3.0698046663049374, + "grad_norm": 4.016318321228027, + "learning_rate": 3.335206249143484e-06, + "loss": 0.0614, + "step": 49384 + }, + { + "epoch": 3.0698182311448723, + "grad_norm": 4.006157875061035, + "learning_rate": 3.3350692065232287e-06, + "loss": 0.0955, + "step": 49385 + }, + { + "epoch": 3.0698317959848076, + "grad_norm": 3.0433452129364014, + "learning_rate": 3.3349321639029743e-06, + "loss": 0.0749, + "step": 49386 + }, + { + "epoch": 3.0698453608247425, + "grad_norm": 3.7004151344299316, + "learning_rate": 3.334795121282719e-06, + "loss": 0.1194, + "step": 49387 + }, + { + "epoch": 3.0698589256646773, + "grad_norm": 3.9479587078094482, + "learning_rate": 3.3346580786624642e-06, + "loss": 0.1593, + "step": 49388 + }, + { + "epoch": 3.069872490504612, + "grad_norm": 4.0548882484436035, + "learning_rate": 3.3345210360422094e-06, + "loss": 0.1526, + "step": 49389 + }, + { + "epoch": 3.069886055344547, + "grad_norm": 5.374600887298584, + "learning_rate": 3.3343839934219546e-06, + "loss": 0.1194, + "step": 49390 + }, + { + "epoch": 3.069899620184482, + "grad_norm": 4.760918140411377, + "learning_rate": 3.3342469508016994e-06, + "loss": 0.1558, + "step": 49391 + }, + { + "epoch": 3.069913185024417, + "grad_norm": 5.288773536682129, + "learning_rate": 3.334109908181445e-06, + "loss": 0.0938, + "step": 49392 + }, + { + "epoch": 3.0699267498643517, + "grad_norm": 4.923446178436279, + "learning_rate": 3.3339728655611897e-06, + "loss": 0.1233, + "step": 49393 + }, + { + "epoch": 3.0699403147042865, + "grad_norm": 4.399510860443115, + "learning_rate": 3.333835822940935e-06, + "loss": 0.0881, + "step": 49394 + }, + { + "epoch": 3.0699538795442214, + "grad_norm": 4.290751934051514, + "learning_rate": 3.33369878032068e-06, + "loss": 0.095, + "step": 49395 + }, + { + "epoch": 3.0699674443841563, + "grad_norm": 3.779019832611084, + "learning_rate": 3.3335617377004253e-06, + "loss": 0.1051, + "step": 49396 + }, + { + "epoch": 3.069981009224091, + "grad_norm": 3.456935405731201, + "learning_rate": 3.33342469508017e-06, + "loss": 0.0726, + "step": 49397 + }, + { + "epoch": 3.069994574064026, + "grad_norm": 4.740591049194336, + "learning_rate": 3.333287652459915e-06, + "loss": 0.1413, + "step": 49398 + }, + { + "epoch": 3.070008138903961, + "grad_norm": 3.908191204071045, + "learning_rate": 3.3331506098396604e-06, + "loss": 0.1019, + "step": 49399 + }, + { + "epoch": 3.0700217037438957, + "grad_norm": 4.6649394035339355, + "learning_rate": 3.333013567219405e-06, + "loss": 0.1446, + "step": 49400 + }, + { + "epoch": 3.0700352685838306, + "grad_norm": 4.2749104499816895, + "learning_rate": 3.3328765245991507e-06, + "loss": 0.0975, + "step": 49401 + }, + { + "epoch": 3.0700488334237654, + "grad_norm": 4.368945121765137, + "learning_rate": 3.3327394819788955e-06, + "loss": 0.0713, + "step": 49402 + }, + { + "epoch": 3.0700623982637003, + "grad_norm": 3.820855140686035, + "learning_rate": 3.332602439358641e-06, + "loss": 0.0943, + "step": 49403 + }, + { + "epoch": 3.070075963103635, + "grad_norm": 3.896995782852173, + "learning_rate": 3.332465396738386e-06, + "loss": 0.1057, + "step": 49404 + }, + { + "epoch": 3.07008952794357, + "grad_norm": 4.710398197174072, + "learning_rate": 3.332328354118131e-06, + "loss": 0.0974, + "step": 49405 + }, + { + "epoch": 3.0701030927835053, + "grad_norm": 3.116969108581543, + "learning_rate": 3.332191311497876e-06, + "loss": 0.0755, + "step": 49406 + }, + { + "epoch": 3.07011665762344, + "grad_norm": 4.477221488952637, + "learning_rate": 3.3320542688776214e-06, + "loss": 0.1273, + "step": 49407 + }, + { + "epoch": 3.070130222463375, + "grad_norm": 2.66794490814209, + "learning_rate": 3.331917226257366e-06, + "loss": 0.0451, + "step": 49408 + }, + { + "epoch": 3.07014378730331, + "grad_norm": 4.210514068603516, + "learning_rate": 3.3317801836371117e-06, + "loss": 0.1183, + "step": 49409 + }, + { + "epoch": 3.070157352143245, + "grad_norm": 3.24725079536438, + "learning_rate": 3.3316431410168565e-06, + "loss": 0.0994, + "step": 49410 + }, + { + "epoch": 3.0701709169831797, + "grad_norm": 3.3807942867279053, + "learning_rate": 3.331506098396602e-06, + "loss": 0.0512, + "step": 49411 + }, + { + "epoch": 3.0701844818231145, + "grad_norm": 3.058497190475464, + "learning_rate": 3.331369055776347e-06, + "loss": 0.0674, + "step": 49412 + }, + { + "epoch": 3.0701980466630494, + "grad_norm": 4.223400592803955, + "learning_rate": 3.3312320131560916e-06, + "loss": 0.1739, + "step": 49413 + }, + { + "epoch": 3.0702116115029843, + "grad_norm": 4.131156921386719, + "learning_rate": 3.3310949705358368e-06, + "loss": 0.1146, + "step": 49414 + }, + { + "epoch": 3.070225176342919, + "grad_norm": 3.4059767723083496, + "learning_rate": 3.330957927915582e-06, + "loss": 0.0629, + "step": 49415 + }, + { + "epoch": 3.070238741182854, + "grad_norm": 3.3177504539489746, + "learning_rate": 3.330820885295327e-06, + "loss": 0.0695, + "step": 49416 + }, + { + "epoch": 3.070252306022789, + "grad_norm": 3.9167566299438477, + "learning_rate": 3.330683842675072e-06, + "loss": 0.1133, + "step": 49417 + }, + { + "epoch": 3.0702658708627237, + "grad_norm": 3.4047791957855225, + "learning_rate": 3.3305468000548175e-06, + "loss": 0.1138, + "step": 49418 + }, + { + "epoch": 3.0702794357026586, + "grad_norm": 4.149077415466309, + "learning_rate": 3.3304097574345623e-06, + "loss": 0.099, + "step": 49419 + }, + { + "epoch": 3.0702930005425935, + "grad_norm": 4.574666976928711, + "learning_rate": 3.330272714814308e-06, + "loss": 0.1651, + "step": 49420 + }, + { + "epoch": 3.0703065653825283, + "grad_norm": 2.44944167137146, + "learning_rate": 3.3301356721940526e-06, + "loss": 0.0534, + "step": 49421 + }, + { + "epoch": 3.070320130222463, + "grad_norm": 4.758255958557129, + "learning_rate": 3.329998629573798e-06, + "loss": 0.1936, + "step": 49422 + }, + { + "epoch": 3.070333695062398, + "grad_norm": 3.6326990127563477, + "learning_rate": 3.329861586953543e-06, + "loss": 0.1015, + "step": 49423 + }, + { + "epoch": 3.0703472599023334, + "grad_norm": 3.393313407897949, + "learning_rate": 3.329724544333288e-06, + "loss": 0.0917, + "step": 49424 + }, + { + "epoch": 3.0703608247422682, + "grad_norm": 5.204495429992676, + "learning_rate": 3.329587501713033e-06, + "loss": 0.0967, + "step": 49425 + }, + { + "epoch": 3.070374389582203, + "grad_norm": 3.1384849548339844, + "learning_rate": 3.3294504590927777e-06, + "loss": 0.0772, + "step": 49426 + }, + { + "epoch": 3.070387954422138, + "grad_norm": 5.157290935516357, + "learning_rate": 3.3293134164725233e-06, + "loss": 0.152, + "step": 49427 + }, + { + "epoch": 3.070401519262073, + "grad_norm": 4.2389936447143555, + "learning_rate": 3.329176373852268e-06, + "loss": 0.0923, + "step": 49428 + }, + { + "epoch": 3.0704150841020077, + "grad_norm": 3.759018898010254, + "learning_rate": 3.3290393312320136e-06, + "loss": 0.1116, + "step": 49429 + }, + { + "epoch": 3.0704286489419426, + "grad_norm": 2.981604814529419, + "learning_rate": 3.3289022886117584e-06, + "loss": 0.0716, + "step": 49430 + }, + { + "epoch": 3.0704422137818774, + "grad_norm": 3.5859954357147217, + "learning_rate": 3.3287652459915036e-06, + "loss": 0.059, + "step": 49431 + }, + { + "epoch": 3.0704557786218123, + "grad_norm": 6.033355712890625, + "learning_rate": 3.3286282033712487e-06, + "loss": 0.1105, + "step": 49432 + }, + { + "epoch": 3.070469343461747, + "grad_norm": 3.172283887863159, + "learning_rate": 3.328491160750994e-06, + "loss": 0.0735, + "step": 49433 + }, + { + "epoch": 3.070482908301682, + "grad_norm": 4.167273044586182, + "learning_rate": 3.3283541181307387e-06, + "loss": 0.0669, + "step": 49434 + }, + { + "epoch": 3.070496473141617, + "grad_norm": 4.2082343101501465, + "learning_rate": 3.3282170755104843e-06, + "loss": 0.1176, + "step": 49435 + }, + { + "epoch": 3.0705100379815518, + "grad_norm": 3.3353936672210693, + "learning_rate": 3.328080032890229e-06, + "loss": 0.0911, + "step": 49436 + }, + { + "epoch": 3.0705236028214866, + "grad_norm": 3.548184633255005, + "learning_rate": 3.3279429902699746e-06, + "loss": 0.0401, + "step": 49437 + }, + { + "epoch": 3.0705371676614215, + "grad_norm": 3.4100723266601562, + "learning_rate": 3.3278059476497194e-06, + "loss": 0.0646, + "step": 49438 + }, + { + "epoch": 3.0705507325013564, + "grad_norm": 3.6342484951019287, + "learning_rate": 3.327668905029464e-06, + "loss": 0.0723, + "step": 49439 + }, + { + "epoch": 3.070564297341291, + "grad_norm": 4.455843448638916, + "learning_rate": 3.3275318624092097e-06, + "loss": 0.1264, + "step": 49440 + }, + { + "epoch": 3.070577862181226, + "grad_norm": 6.883517742156982, + "learning_rate": 3.3273948197889545e-06, + "loss": 0.1327, + "step": 49441 + }, + { + "epoch": 3.070591427021161, + "grad_norm": 4.837939739227295, + "learning_rate": 3.3272577771686997e-06, + "loss": 0.1364, + "step": 49442 + }, + { + "epoch": 3.070604991861096, + "grad_norm": 3.6307358741760254, + "learning_rate": 3.327120734548445e-06, + "loss": 0.088, + "step": 49443 + }, + { + "epoch": 3.070618556701031, + "grad_norm": 5.486464977264404, + "learning_rate": 3.32698369192819e-06, + "loss": 0.1212, + "step": 49444 + }, + { + "epoch": 3.070632121540966, + "grad_norm": 3.2373251914978027, + "learning_rate": 3.3268466493079348e-06, + "loss": 0.1073, + "step": 49445 + }, + { + "epoch": 3.070645686380901, + "grad_norm": 4.031116008758545, + "learning_rate": 3.3267096066876804e-06, + "loss": 0.0959, + "step": 49446 + }, + { + "epoch": 3.0706592512208357, + "grad_norm": 3.7056643962860107, + "learning_rate": 3.326572564067425e-06, + "loss": 0.0743, + "step": 49447 + }, + { + "epoch": 3.0706728160607706, + "grad_norm": 4.782411098480225, + "learning_rate": 3.3264355214471703e-06, + "loss": 0.1413, + "step": 49448 + }, + { + "epoch": 3.0706863809007054, + "grad_norm": 3.219132900238037, + "learning_rate": 3.3262984788269155e-06, + "loss": 0.053, + "step": 49449 + }, + { + "epoch": 3.0706999457406403, + "grad_norm": 4.533480167388916, + "learning_rate": 3.3261614362066607e-06, + "loss": 0.1025, + "step": 49450 + }, + { + "epoch": 3.070713510580575, + "grad_norm": 4.908227920532227, + "learning_rate": 3.3260243935864054e-06, + "loss": 0.1076, + "step": 49451 + }, + { + "epoch": 3.07072707542051, + "grad_norm": 3.238786458969116, + "learning_rate": 3.325887350966151e-06, + "loss": 0.0712, + "step": 49452 + }, + { + "epoch": 3.070740640260445, + "grad_norm": 4.487070560455322, + "learning_rate": 3.325750308345896e-06, + "loss": 0.0857, + "step": 49453 + }, + { + "epoch": 3.0707542051003798, + "grad_norm": 2.6672284603118896, + "learning_rate": 3.3256132657256405e-06, + "loss": 0.0658, + "step": 49454 + }, + { + "epoch": 3.0707677699403146, + "grad_norm": 4.1329474449157715, + "learning_rate": 3.325476223105386e-06, + "loss": 0.0787, + "step": 49455 + }, + { + "epoch": 3.0707813347802495, + "grad_norm": 4.357668399810791, + "learning_rate": 3.325339180485131e-06, + "loss": 0.1262, + "step": 49456 + }, + { + "epoch": 3.0707948996201844, + "grad_norm": 2.799192190170288, + "learning_rate": 3.3252021378648765e-06, + "loss": 0.0646, + "step": 49457 + }, + { + "epoch": 3.0708084644601192, + "grad_norm": 3.5313236713409424, + "learning_rate": 3.3250650952446213e-06, + "loss": 0.0664, + "step": 49458 + }, + { + "epoch": 3.070822029300054, + "grad_norm": 4.075056552886963, + "learning_rate": 3.3249280526243664e-06, + "loss": 0.097, + "step": 49459 + }, + { + "epoch": 3.070835594139989, + "grad_norm": 5.9758734703063965, + "learning_rate": 3.3247910100041116e-06, + "loss": 0.1337, + "step": 49460 + }, + { + "epoch": 3.070849158979924, + "grad_norm": 3.7623424530029297, + "learning_rate": 3.324653967383857e-06, + "loss": 0.1062, + "step": 49461 + }, + { + "epoch": 3.070862723819859, + "grad_norm": 5.983356952667236, + "learning_rate": 3.3245169247636016e-06, + "loss": 0.0775, + "step": 49462 + }, + { + "epoch": 3.070876288659794, + "grad_norm": 5.8481316566467285, + "learning_rate": 3.324379882143347e-06, + "loss": 0.1144, + "step": 49463 + }, + { + "epoch": 3.070889853499729, + "grad_norm": 4.871433734893799, + "learning_rate": 3.324242839523092e-06, + "loss": 0.1514, + "step": 49464 + }, + { + "epoch": 3.0709034183396637, + "grad_norm": 3.3879051208496094, + "learning_rate": 3.324105796902837e-06, + "loss": 0.0716, + "step": 49465 + }, + { + "epoch": 3.0709169831795986, + "grad_norm": 4.069340229034424, + "learning_rate": 3.3239687542825823e-06, + "loss": 0.1387, + "step": 49466 + }, + { + "epoch": 3.0709305480195335, + "grad_norm": 3.8011233806610107, + "learning_rate": 3.323831711662327e-06, + "loss": 0.108, + "step": 49467 + }, + { + "epoch": 3.0709441128594683, + "grad_norm": 4.636171817779541, + "learning_rate": 3.323694669042072e-06, + "loss": 0.1331, + "step": 49468 + }, + { + "epoch": 3.070957677699403, + "grad_norm": 5.670330047607422, + "learning_rate": 3.3235576264218174e-06, + "loss": 0.1916, + "step": 49469 + }, + { + "epoch": 3.070971242539338, + "grad_norm": 3.682788372039795, + "learning_rate": 3.3234205838015626e-06, + "loss": 0.1189, + "step": 49470 + }, + { + "epoch": 3.070984807379273, + "grad_norm": 4.2137250900268555, + "learning_rate": 3.3232835411813073e-06, + "loss": 0.0852, + "step": 49471 + }, + { + "epoch": 3.070998372219208, + "grad_norm": 3.6800713539123535, + "learning_rate": 3.323146498561053e-06, + "loss": 0.0935, + "step": 49472 + }, + { + "epoch": 3.0710119370591427, + "grad_norm": 2.6204864978790283, + "learning_rate": 3.3230094559407977e-06, + "loss": 0.0628, + "step": 49473 + }, + { + "epoch": 3.0710255018990775, + "grad_norm": 2.5236270427703857, + "learning_rate": 3.3228724133205433e-06, + "loss": 0.0611, + "step": 49474 + }, + { + "epoch": 3.0710390667390124, + "grad_norm": 4.234386444091797, + "learning_rate": 3.322735370700288e-06, + "loss": 0.0641, + "step": 49475 + }, + { + "epoch": 3.0710526315789473, + "grad_norm": 3.5784339904785156, + "learning_rate": 3.3225983280800332e-06, + "loss": 0.052, + "step": 49476 + }, + { + "epoch": 3.071066196418882, + "grad_norm": 4.041812419891357, + "learning_rate": 3.3224612854597784e-06, + "loss": 0.0867, + "step": 49477 + }, + { + "epoch": 3.071079761258817, + "grad_norm": 3.7001874446868896, + "learning_rate": 3.3223242428395236e-06, + "loss": 0.1099, + "step": 49478 + }, + { + "epoch": 3.071093326098752, + "grad_norm": 4.131525039672852, + "learning_rate": 3.3221872002192683e-06, + "loss": 0.0948, + "step": 49479 + }, + { + "epoch": 3.0711068909386867, + "grad_norm": 3.9879744052886963, + "learning_rate": 3.322050157599014e-06, + "loss": 0.0746, + "step": 49480 + }, + { + "epoch": 3.071120455778622, + "grad_norm": 4.136778831481934, + "learning_rate": 3.3219131149787587e-06, + "loss": 0.0774, + "step": 49481 + }, + { + "epoch": 3.071134020618557, + "grad_norm": 3.2869579792022705, + "learning_rate": 3.3217760723585034e-06, + "loss": 0.0549, + "step": 49482 + }, + { + "epoch": 3.0711475854584918, + "grad_norm": 2.620473861694336, + "learning_rate": 3.321639029738249e-06, + "loss": 0.0892, + "step": 49483 + }, + { + "epoch": 3.0711611502984266, + "grad_norm": 5.445140838623047, + "learning_rate": 3.321501987117994e-06, + "loss": 0.1003, + "step": 49484 + }, + { + "epoch": 3.0711747151383615, + "grad_norm": 2.281752109527588, + "learning_rate": 3.321364944497739e-06, + "loss": 0.0442, + "step": 49485 + }, + { + "epoch": 3.0711882799782964, + "grad_norm": 3.426659345626831, + "learning_rate": 3.321227901877484e-06, + "loss": 0.1608, + "step": 49486 + }, + { + "epoch": 3.071201844818231, + "grad_norm": 3.631361722946167, + "learning_rate": 3.3210908592572293e-06, + "loss": 0.0898, + "step": 49487 + }, + { + "epoch": 3.071215409658166, + "grad_norm": 2.8854284286499023, + "learning_rate": 3.320953816636974e-06, + "loss": 0.1185, + "step": 49488 + }, + { + "epoch": 3.071228974498101, + "grad_norm": 2.5774149894714355, + "learning_rate": 3.3208167740167197e-06, + "loss": 0.0536, + "step": 49489 + }, + { + "epoch": 3.071242539338036, + "grad_norm": 5.184669017791748, + "learning_rate": 3.3206797313964644e-06, + "loss": 0.1132, + "step": 49490 + }, + { + "epoch": 3.0712561041779707, + "grad_norm": 3.5261197090148926, + "learning_rate": 3.32054268877621e-06, + "loss": 0.0732, + "step": 49491 + }, + { + "epoch": 3.0712696690179055, + "grad_norm": 3.0551249980926514, + "learning_rate": 3.320405646155955e-06, + "loss": 0.0781, + "step": 49492 + }, + { + "epoch": 3.0712832338578404, + "grad_norm": 4.487669944763184, + "learning_rate": 3.3202686035357e-06, + "loss": 0.0811, + "step": 49493 + }, + { + "epoch": 3.0712967986977753, + "grad_norm": 2.7832024097442627, + "learning_rate": 3.320131560915445e-06, + "loss": 0.0739, + "step": 49494 + }, + { + "epoch": 3.07131036353771, + "grad_norm": 3.4755351543426514, + "learning_rate": 3.31999451829519e-06, + "loss": 0.0913, + "step": 49495 + }, + { + "epoch": 3.071323928377645, + "grad_norm": 3.697312355041504, + "learning_rate": 3.319857475674935e-06, + "loss": 0.0894, + "step": 49496 + }, + { + "epoch": 3.07133749321758, + "grad_norm": 2.0223653316497803, + "learning_rate": 3.31972043305468e-06, + "loss": 0.0336, + "step": 49497 + }, + { + "epoch": 3.0713510580575147, + "grad_norm": 3.7350575923919678, + "learning_rate": 3.3195833904344255e-06, + "loss": 0.1036, + "step": 49498 + }, + { + "epoch": 3.0713646228974496, + "grad_norm": 3.0012378692626953, + "learning_rate": 3.31944634781417e-06, + "loss": 0.0738, + "step": 49499 + }, + { + "epoch": 3.071378187737385, + "grad_norm": 2.315345525741577, + "learning_rate": 3.319309305193916e-06, + "loss": 0.0611, + "step": 49500 + }, + { + "epoch": 3.07139175257732, + "grad_norm": 2.2902166843414307, + "learning_rate": 3.3191722625736606e-06, + "loss": 0.0604, + "step": 49501 + }, + { + "epoch": 3.0714053174172546, + "grad_norm": 3.518007278442383, + "learning_rate": 3.3190352199534057e-06, + "loss": 0.0613, + "step": 49502 + }, + { + "epoch": 3.0714188822571895, + "grad_norm": 3.187623977661133, + "learning_rate": 3.318898177333151e-06, + "loss": 0.0737, + "step": 49503 + }, + { + "epoch": 3.0714324470971244, + "grad_norm": 2.6671533584594727, + "learning_rate": 3.318761134712896e-06, + "loss": 0.0622, + "step": 49504 + }, + { + "epoch": 3.0714460119370592, + "grad_norm": 2.4030532836914062, + "learning_rate": 3.318624092092641e-06, + "loss": 0.0678, + "step": 49505 + }, + { + "epoch": 3.071459576776994, + "grad_norm": 2.7879793643951416, + "learning_rate": 3.3184870494723865e-06, + "loss": 0.0537, + "step": 49506 + }, + { + "epoch": 3.071473141616929, + "grad_norm": 2.348464250564575, + "learning_rate": 3.3183500068521312e-06, + "loss": 0.0612, + "step": 49507 + }, + { + "epoch": 3.071486706456864, + "grad_norm": 3.6714422702789307, + "learning_rate": 3.318212964231876e-06, + "loss": 0.0996, + "step": 49508 + }, + { + "epoch": 3.0715002712967987, + "grad_norm": 3.9770092964172363, + "learning_rate": 3.3180759216116216e-06, + "loss": 0.0458, + "step": 49509 + }, + { + "epoch": 3.0715138361367336, + "grad_norm": 3.4671497344970703, + "learning_rate": 3.3179388789913663e-06, + "loss": 0.0558, + "step": 49510 + }, + { + "epoch": 3.0715274009766684, + "grad_norm": 3.5413641929626465, + "learning_rate": 3.317801836371112e-06, + "loss": 0.0995, + "step": 49511 + }, + { + "epoch": 3.0715409658166033, + "grad_norm": 2.3921570777893066, + "learning_rate": 3.3176647937508567e-06, + "loss": 0.0705, + "step": 49512 + }, + { + "epoch": 3.071554530656538, + "grad_norm": 3.524034023284912, + "learning_rate": 3.317527751130602e-06, + "loss": 0.0948, + "step": 49513 + }, + { + "epoch": 3.071568095496473, + "grad_norm": 3.9655447006225586, + "learning_rate": 3.3173907085103466e-06, + "loss": 0.0782, + "step": 49514 + }, + { + "epoch": 3.071581660336408, + "grad_norm": 4.443963050842285, + "learning_rate": 3.3172536658900922e-06, + "loss": 0.0994, + "step": 49515 + }, + { + "epoch": 3.0715952251763428, + "grad_norm": 2.2245845794677734, + "learning_rate": 3.317116623269837e-06, + "loss": 0.0404, + "step": 49516 + }, + { + "epoch": 3.0716087900162776, + "grad_norm": 2.794410228729248, + "learning_rate": 3.3169795806495826e-06, + "loss": 0.0562, + "step": 49517 + }, + { + "epoch": 3.0716223548562125, + "grad_norm": 2.8153679370880127, + "learning_rate": 3.3168425380293273e-06, + "loss": 0.0449, + "step": 49518 + }, + { + "epoch": 3.071635919696148, + "grad_norm": 3.3739144802093506, + "learning_rate": 3.3167054954090725e-06, + "loss": 0.0692, + "step": 49519 + }, + { + "epoch": 3.0716494845360827, + "grad_norm": 3.380528211593628, + "learning_rate": 3.3165684527888177e-06, + "loss": 0.0652, + "step": 49520 + }, + { + "epoch": 3.0716630493760175, + "grad_norm": 5.981499671936035, + "learning_rate": 3.316431410168563e-06, + "loss": 0.168, + "step": 49521 + }, + { + "epoch": 3.0716766142159524, + "grad_norm": 3.5235838890075684, + "learning_rate": 3.3162943675483076e-06, + "loss": 0.0882, + "step": 49522 + }, + { + "epoch": 3.0716901790558873, + "grad_norm": 3.1039373874664307, + "learning_rate": 3.316157324928053e-06, + "loss": 0.0479, + "step": 49523 + }, + { + "epoch": 3.071703743895822, + "grad_norm": 3.4627339839935303, + "learning_rate": 3.316020282307798e-06, + "loss": 0.0877, + "step": 49524 + }, + { + "epoch": 3.071717308735757, + "grad_norm": 3.061830759048462, + "learning_rate": 3.3158832396875427e-06, + "loss": 0.0613, + "step": 49525 + }, + { + "epoch": 3.071730873575692, + "grad_norm": 5.980820655822754, + "learning_rate": 3.3157461970672883e-06, + "loss": 0.1262, + "step": 49526 + }, + { + "epoch": 3.0717444384156267, + "grad_norm": 3.769172430038452, + "learning_rate": 3.315609154447033e-06, + "loss": 0.055, + "step": 49527 + }, + { + "epoch": 3.0717580032555616, + "grad_norm": 3.267324686050415, + "learning_rate": 3.3154721118267787e-06, + "loss": 0.0534, + "step": 49528 + }, + { + "epoch": 3.0717715680954965, + "grad_norm": 4.073732852935791, + "learning_rate": 3.3153350692065235e-06, + "loss": 0.0794, + "step": 49529 + }, + { + "epoch": 3.0717851329354313, + "grad_norm": 3.408958673477173, + "learning_rate": 3.3151980265862686e-06, + "loss": 0.1202, + "step": 49530 + }, + { + "epoch": 3.071798697775366, + "grad_norm": 2.8429722785949707, + "learning_rate": 3.315060983966014e-06, + "loss": 0.0445, + "step": 49531 + }, + { + "epoch": 3.071812262615301, + "grad_norm": 3.98564076423645, + "learning_rate": 3.314923941345759e-06, + "loss": 0.1057, + "step": 49532 + }, + { + "epoch": 3.071825827455236, + "grad_norm": 4.539351940155029, + "learning_rate": 3.3147868987255038e-06, + "loss": 0.1213, + "step": 49533 + }, + { + "epoch": 3.071839392295171, + "grad_norm": 2.453352212905884, + "learning_rate": 3.3146498561052494e-06, + "loss": 0.0689, + "step": 49534 + }, + { + "epoch": 3.0718529571351056, + "grad_norm": 3.559173822402954, + "learning_rate": 3.314512813484994e-06, + "loss": 0.1287, + "step": 49535 + }, + { + "epoch": 3.0718665219750405, + "grad_norm": 5.79032039642334, + "learning_rate": 3.314375770864739e-06, + "loss": 0.1287, + "step": 49536 + }, + { + "epoch": 3.0718800868149754, + "grad_norm": 6.142359733581543, + "learning_rate": 3.3142387282444845e-06, + "loss": 0.2208, + "step": 49537 + }, + { + "epoch": 3.0718936516549107, + "grad_norm": 3.7714226245880127, + "learning_rate": 3.3141016856242292e-06, + "loss": 0.0998, + "step": 49538 + }, + { + "epoch": 3.0719072164948455, + "grad_norm": 5.004083633422852, + "learning_rate": 3.3139646430039744e-06, + "loss": 0.1316, + "step": 49539 + }, + { + "epoch": 3.0719207813347804, + "grad_norm": 5.246303558349609, + "learning_rate": 3.3138276003837196e-06, + "loss": 0.1568, + "step": 49540 + }, + { + "epoch": 3.0719343461747153, + "grad_norm": 3.4712722301483154, + "learning_rate": 3.3136905577634648e-06, + "loss": 0.1259, + "step": 49541 + }, + { + "epoch": 3.07194791101465, + "grad_norm": 5.200132369995117, + "learning_rate": 3.3135535151432095e-06, + "loss": 0.1569, + "step": 49542 + }, + { + "epoch": 3.071961475854585, + "grad_norm": 3.5561275482177734, + "learning_rate": 3.313416472522955e-06, + "loss": 0.059, + "step": 49543 + }, + { + "epoch": 3.07197504069452, + "grad_norm": 2.9016427993774414, + "learning_rate": 3.3132794299027e-06, + "loss": 0.0647, + "step": 49544 + }, + { + "epoch": 3.0719886055344547, + "grad_norm": 7.386875629425049, + "learning_rate": 3.3131423872824455e-06, + "loss": 0.2484, + "step": 49545 + }, + { + "epoch": 3.0720021703743896, + "grad_norm": 4.004014492034912, + "learning_rate": 3.3130053446621902e-06, + "loss": 0.1333, + "step": 49546 + }, + { + "epoch": 3.0720157352143245, + "grad_norm": 3.8278379440307617, + "learning_rate": 3.3128683020419354e-06, + "loss": 0.1051, + "step": 49547 + }, + { + "epoch": 3.0720293000542593, + "grad_norm": 3.4986445903778076, + "learning_rate": 3.3127312594216806e-06, + "loss": 0.089, + "step": 49548 + }, + { + "epoch": 3.072042864894194, + "grad_norm": 3.42362117767334, + "learning_rate": 3.3125942168014258e-06, + "loss": 0.0599, + "step": 49549 + }, + { + "epoch": 3.072056429734129, + "grad_norm": 4.4574503898620605, + "learning_rate": 3.3124571741811705e-06, + "loss": 0.1561, + "step": 49550 + }, + { + "epoch": 3.072069994574064, + "grad_norm": 4.807557582855225, + "learning_rate": 3.3123201315609153e-06, + "loss": 0.1397, + "step": 49551 + }, + { + "epoch": 3.072083559413999, + "grad_norm": 4.146395683288574, + "learning_rate": 3.312183088940661e-06, + "loss": 0.1186, + "step": 49552 + }, + { + "epoch": 3.0720971242539337, + "grad_norm": 3.8297171592712402, + "learning_rate": 3.3120460463204056e-06, + "loss": 0.1639, + "step": 49553 + }, + { + "epoch": 3.0721106890938685, + "grad_norm": 2.7605960369110107, + "learning_rate": 3.3119090037001512e-06, + "loss": 0.0611, + "step": 49554 + }, + { + "epoch": 3.0721242539338034, + "grad_norm": 4.141751289367676, + "learning_rate": 3.311771961079896e-06, + "loss": 0.1404, + "step": 49555 + }, + { + "epoch": 3.0721378187737383, + "grad_norm": 4.1097211837768555, + "learning_rate": 3.311634918459641e-06, + "loss": 0.1137, + "step": 49556 + }, + { + "epoch": 3.0721513836136736, + "grad_norm": 4.263561725616455, + "learning_rate": 3.3114978758393864e-06, + "loss": 0.1148, + "step": 49557 + }, + { + "epoch": 3.0721649484536084, + "grad_norm": 4.920116901397705, + "learning_rate": 3.3113608332191315e-06, + "loss": 0.0909, + "step": 49558 + }, + { + "epoch": 3.0721785132935433, + "grad_norm": 4.520976543426514, + "learning_rate": 3.3112237905988763e-06, + "loss": 0.0809, + "step": 49559 + }, + { + "epoch": 3.072192078133478, + "grad_norm": 3.759451389312744, + "learning_rate": 3.311086747978622e-06, + "loss": 0.1144, + "step": 49560 + }, + { + "epoch": 3.072205642973413, + "grad_norm": 3.749152183532715, + "learning_rate": 3.3109497053583666e-06, + "loss": 0.0765, + "step": 49561 + }, + { + "epoch": 3.072219207813348, + "grad_norm": 4.2780914306640625, + "learning_rate": 3.3108126627381122e-06, + "loss": 0.0945, + "step": 49562 + }, + { + "epoch": 3.0722327726532828, + "grad_norm": 2.649118185043335, + "learning_rate": 3.310675620117857e-06, + "loss": 0.0583, + "step": 49563 + }, + { + "epoch": 3.0722463374932176, + "grad_norm": 5.1986308097839355, + "learning_rate": 3.3105385774976018e-06, + "loss": 0.1418, + "step": 49564 + }, + { + "epoch": 3.0722599023331525, + "grad_norm": 4.039560317993164, + "learning_rate": 3.3104015348773474e-06, + "loss": 0.1454, + "step": 49565 + }, + { + "epoch": 3.0722734671730874, + "grad_norm": 3.798583507537842, + "learning_rate": 3.310264492257092e-06, + "loss": 0.0515, + "step": 49566 + }, + { + "epoch": 3.0722870320130222, + "grad_norm": 3.95052433013916, + "learning_rate": 3.3101274496368373e-06, + "loss": 0.0669, + "step": 49567 + }, + { + "epoch": 3.072300596852957, + "grad_norm": 3.3254945278167725, + "learning_rate": 3.309990407016582e-06, + "loss": 0.1516, + "step": 49568 + }, + { + "epoch": 3.072314161692892, + "grad_norm": 4.743563175201416, + "learning_rate": 3.3098533643963277e-06, + "loss": 0.1415, + "step": 49569 + }, + { + "epoch": 3.072327726532827, + "grad_norm": 7.658566474914551, + "learning_rate": 3.3097163217760724e-06, + "loss": 0.1492, + "step": 49570 + }, + { + "epoch": 3.0723412913727617, + "grad_norm": 4.107680797576904, + "learning_rate": 3.309579279155818e-06, + "loss": 0.0664, + "step": 49571 + }, + { + "epoch": 3.0723548562126965, + "grad_norm": 4.873994827270508, + "learning_rate": 3.3094422365355628e-06, + "loss": 0.125, + "step": 49572 + }, + { + "epoch": 3.0723684210526314, + "grad_norm": 4.911466598510742, + "learning_rate": 3.309305193915308e-06, + "loss": 0.1309, + "step": 49573 + }, + { + "epoch": 3.0723819858925663, + "grad_norm": 7.749454498291016, + "learning_rate": 3.309168151295053e-06, + "loss": 0.1275, + "step": 49574 + }, + { + "epoch": 3.072395550732501, + "grad_norm": 3.684312105178833, + "learning_rate": 3.3090311086747983e-06, + "loss": 0.1172, + "step": 49575 + }, + { + "epoch": 3.0724091155724365, + "grad_norm": 3.3639001846313477, + "learning_rate": 3.308894066054543e-06, + "loss": 0.0648, + "step": 49576 + }, + { + "epoch": 3.0724226804123713, + "grad_norm": 4.4691243171691895, + "learning_rate": 3.3087570234342882e-06, + "loss": 0.1391, + "step": 49577 + }, + { + "epoch": 3.072436245252306, + "grad_norm": 5.105765342712402, + "learning_rate": 3.3086199808140334e-06, + "loss": 0.1388, + "step": 49578 + }, + { + "epoch": 3.072449810092241, + "grad_norm": 5.168992519378662, + "learning_rate": 3.308482938193778e-06, + "loss": 0.1067, + "step": 49579 + }, + { + "epoch": 3.072463374932176, + "grad_norm": 4.016931056976318, + "learning_rate": 3.3083458955735238e-06, + "loss": 0.0686, + "step": 49580 + }, + { + "epoch": 3.072476939772111, + "grad_norm": 3.3997416496276855, + "learning_rate": 3.3082088529532685e-06, + "loss": 0.0789, + "step": 49581 + }, + { + "epoch": 3.0724905046120456, + "grad_norm": 3.007535934448242, + "learning_rate": 3.308071810333014e-06, + "loss": 0.0559, + "step": 49582 + }, + { + "epoch": 3.0725040694519805, + "grad_norm": 4.589180946350098, + "learning_rate": 3.307934767712759e-06, + "loss": 0.0756, + "step": 49583 + }, + { + "epoch": 3.0725176342919154, + "grad_norm": 5.321258544921875, + "learning_rate": 3.307797725092504e-06, + "loss": 0.1179, + "step": 49584 + }, + { + "epoch": 3.0725311991318502, + "grad_norm": 3.0998644828796387, + "learning_rate": 3.307660682472249e-06, + "loss": 0.0948, + "step": 49585 + }, + { + "epoch": 3.072544763971785, + "grad_norm": 5.699288368225098, + "learning_rate": 3.3075236398519944e-06, + "loss": 0.1616, + "step": 49586 + }, + { + "epoch": 3.07255832881172, + "grad_norm": 3.6958329677581787, + "learning_rate": 3.307386597231739e-06, + "loss": 0.0838, + "step": 49587 + }, + { + "epoch": 3.072571893651655, + "grad_norm": 5.020745754241943, + "learning_rate": 3.3072495546114848e-06, + "loss": 0.1622, + "step": 49588 + }, + { + "epoch": 3.0725854584915897, + "grad_norm": 3.5882155895233154, + "learning_rate": 3.3071125119912295e-06, + "loss": 0.1161, + "step": 49589 + }, + { + "epoch": 3.0725990233315246, + "grad_norm": 4.36763334274292, + "learning_rate": 3.3069754693709747e-06, + "loss": 0.2153, + "step": 49590 + }, + { + "epoch": 3.0726125881714594, + "grad_norm": 3.2025742530822754, + "learning_rate": 3.30683842675072e-06, + "loss": 0.0967, + "step": 49591 + }, + { + "epoch": 3.0726261530113943, + "grad_norm": 4.016386032104492, + "learning_rate": 3.3067013841304646e-06, + "loss": 0.094, + "step": 49592 + }, + { + "epoch": 3.072639717851329, + "grad_norm": 3.6301889419555664, + "learning_rate": 3.30656434151021e-06, + "loss": 0.0791, + "step": 49593 + }, + { + "epoch": 3.072653282691264, + "grad_norm": 3.626591920852661, + "learning_rate": 3.306427298889955e-06, + "loss": 0.0807, + "step": 49594 + }, + { + "epoch": 3.0726668475311993, + "grad_norm": 3.9629054069519043, + "learning_rate": 3.3062902562697e-06, + "loss": 0.1169, + "step": 49595 + }, + { + "epoch": 3.072680412371134, + "grad_norm": 4.681788444519043, + "learning_rate": 3.306153213649445e-06, + "loss": 0.1579, + "step": 49596 + }, + { + "epoch": 3.072693977211069, + "grad_norm": 4.22786283493042, + "learning_rate": 3.3060161710291905e-06, + "loss": 0.086, + "step": 49597 + }, + { + "epoch": 3.072707542051004, + "grad_norm": 4.246283054351807, + "learning_rate": 3.3058791284089353e-06, + "loss": 0.0934, + "step": 49598 + }, + { + "epoch": 3.072721106890939, + "grad_norm": 6.2590718269348145, + "learning_rate": 3.305742085788681e-06, + "loss": 0.0687, + "step": 49599 + }, + { + "epoch": 3.0727346717308737, + "grad_norm": 3.23036789894104, + "learning_rate": 3.3056050431684257e-06, + "loss": 0.0748, + "step": 49600 + }, + { + "epoch": 3.0727482365708085, + "grad_norm": 2.6252939701080322, + "learning_rate": 3.305468000548171e-06, + "loss": 0.0329, + "step": 49601 + }, + { + "epoch": 3.0727618014107434, + "grad_norm": 4.014194011688232, + "learning_rate": 3.305330957927916e-06, + "loss": 0.0945, + "step": 49602 + }, + { + "epoch": 3.0727753662506783, + "grad_norm": 2.840073347091675, + "learning_rate": 3.305193915307661e-06, + "loss": 0.0766, + "step": 49603 + }, + { + "epoch": 3.072788931090613, + "grad_norm": 3.972242832183838, + "learning_rate": 3.305056872687406e-06, + "loss": 0.0888, + "step": 49604 + }, + { + "epoch": 3.072802495930548, + "grad_norm": 2.924255132675171, + "learning_rate": 3.3049198300671507e-06, + "loss": 0.0905, + "step": 49605 + }, + { + "epoch": 3.072816060770483, + "grad_norm": 7.044824600219727, + "learning_rate": 3.3047827874468963e-06, + "loss": 0.1075, + "step": 49606 + }, + { + "epoch": 3.0728296256104177, + "grad_norm": 4.38740873336792, + "learning_rate": 3.304645744826641e-06, + "loss": 0.1351, + "step": 49607 + }, + { + "epoch": 3.0728431904503526, + "grad_norm": 4.1580891609191895, + "learning_rate": 3.3045087022063867e-06, + "loss": 0.0871, + "step": 49608 + }, + { + "epoch": 3.0728567552902875, + "grad_norm": 5.518540382385254, + "learning_rate": 3.3043716595861314e-06, + "loss": 0.1753, + "step": 49609 + }, + { + "epoch": 3.0728703201302223, + "grad_norm": 4.832537651062012, + "learning_rate": 3.3042346169658766e-06, + "loss": 0.1, + "step": 49610 + }, + { + "epoch": 3.072883884970157, + "grad_norm": 3.9692256450653076, + "learning_rate": 3.3040975743456218e-06, + "loss": 0.0978, + "step": 49611 + }, + { + "epoch": 3.072897449810092, + "grad_norm": 4.093334674835205, + "learning_rate": 3.303960531725367e-06, + "loss": 0.0957, + "step": 49612 + }, + { + "epoch": 3.072911014650027, + "grad_norm": 4.453056812286377, + "learning_rate": 3.3038234891051117e-06, + "loss": 0.1283, + "step": 49613 + }, + { + "epoch": 3.0729245794899622, + "grad_norm": 3.4467878341674805, + "learning_rate": 3.3036864464848573e-06, + "loss": 0.0791, + "step": 49614 + }, + { + "epoch": 3.072938144329897, + "grad_norm": 5.226415157318115, + "learning_rate": 3.303549403864602e-06, + "loss": 0.0831, + "step": 49615 + }, + { + "epoch": 3.072951709169832, + "grad_norm": 3.39631986618042, + "learning_rate": 3.3034123612443477e-06, + "loss": 0.1366, + "step": 49616 + }, + { + "epoch": 3.072965274009767, + "grad_norm": 3.549086332321167, + "learning_rate": 3.3032753186240924e-06, + "loss": 0.0738, + "step": 49617 + }, + { + "epoch": 3.0729788388497017, + "grad_norm": 3.9016659259796143, + "learning_rate": 3.3031382760038376e-06, + "loss": 0.1232, + "step": 49618 + }, + { + "epoch": 3.0729924036896366, + "grad_norm": 4.183976173400879, + "learning_rate": 3.3030012333835828e-06, + "loss": 0.1114, + "step": 49619 + }, + { + "epoch": 3.0730059685295714, + "grad_norm": 3.3072152137756348, + "learning_rate": 3.3028641907633275e-06, + "loss": 0.0762, + "step": 49620 + }, + { + "epoch": 3.0730195333695063, + "grad_norm": 3.6104135513305664, + "learning_rate": 3.3027271481430727e-06, + "loss": 0.1231, + "step": 49621 + }, + { + "epoch": 3.073033098209441, + "grad_norm": 3.732235908508301, + "learning_rate": 3.3025901055228175e-06, + "loss": 0.098, + "step": 49622 + }, + { + "epoch": 3.073046663049376, + "grad_norm": 3.0367040634155273, + "learning_rate": 3.302453062902563e-06, + "loss": 0.059, + "step": 49623 + }, + { + "epoch": 3.073060227889311, + "grad_norm": 3.159928798675537, + "learning_rate": 3.302316020282308e-06, + "loss": 0.0755, + "step": 49624 + }, + { + "epoch": 3.0730737927292457, + "grad_norm": 6.321511745452881, + "learning_rate": 3.3021789776620534e-06, + "loss": 0.1125, + "step": 49625 + }, + { + "epoch": 3.0730873575691806, + "grad_norm": 3.3659465312957764, + "learning_rate": 3.302041935041798e-06, + "loss": 0.0726, + "step": 49626 + }, + { + "epoch": 3.0731009224091155, + "grad_norm": 5.317593574523926, + "learning_rate": 3.3019048924215434e-06, + "loss": 0.1059, + "step": 49627 + }, + { + "epoch": 3.0731144872490503, + "grad_norm": 4.831381320953369, + "learning_rate": 3.3017678498012885e-06, + "loss": 0.1511, + "step": 49628 + }, + { + "epoch": 3.073128052088985, + "grad_norm": 4.579162120819092, + "learning_rate": 3.3016308071810337e-06, + "loss": 0.123, + "step": 49629 + }, + { + "epoch": 3.07314161692892, + "grad_norm": 3.4436166286468506, + "learning_rate": 3.3014937645607785e-06, + "loss": 0.0766, + "step": 49630 + }, + { + "epoch": 3.073155181768855, + "grad_norm": 4.919467926025391, + "learning_rate": 3.301356721940524e-06, + "loss": 0.1545, + "step": 49631 + }, + { + "epoch": 3.07316874660879, + "grad_norm": 4.510692596435547, + "learning_rate": 3.301219679320269e-06, + "loss": 0.1736, + "step": 49632 + }, + { + "epoch": 3.073182311448725, + "grad_norm": 3.8607065677642822, + "learning_rate": 3.3010826367000136e-06, + "loss": 0.0815, + "step": 49633 + }, + { + "epoch": 3.07319587628866, + "grad_norm": 3.898942232131958, + "learning_rate": 3.300945594079759e-06, + "loss": 0.1503, + "step": 49634 + }, + { + "epoch": 3.073209441128595, + "grad_norm": 4.45076322555542, + "learning_rate": 3.300808551459504e-06, + "loss": 0.0938, + "step": 49635 + }, + { + "epoch": 3.0732230059685297, + "grad_norm": 3.5334136486053467, + "learning_rate": 3.3006715088392496e-06, + "loss": 0.0822, + "step": 49636 + }, + { + "epoch": 3.0732365708084646, + "grad_norm": 4.934363842010498, + "learning_rate": 3.3005344662189943e-06, + "loss": 0.0986, + "step": 49637 + }, + { + "epoch": 3.0732501356483994, + "grad_norm": 3.385660409927368, + "learning_rate": 3.3003974235987395e-06, + "loss": 0.0505, + "step": 49638 + }, + { + "epoch": 3.0732637004883343, + "grad_norm": 3.8334014415740967, + "learning_rate": 3.3002603809784842e-06, + "loss": 0.0807, + "step": 49639 + }, + { + "epoch": 3.073277265328269, + "grad_norm": 3.509535312652588, + "learning_rate": 3.30012333835823e-06, + "loss": 0.106, + "step": 49640 + }, + { + "epoch": 3.073290830168204, + "grad_norm": 3.24402117729187, + "learning_rate": 3.2999862957379746e-06, + "loss": 0.0881, + "step": 49641 + }, + { + "epoch": 3.073304395008139, + "grad_norm": 4.083996295928955, + "learning_rate": 3.29984925311772e-06, + "loss": 0.0587, + "step": 49642 + }, + { + "epoch": 3.0733179598480738, + "grad_norm": 3.5531187057495117, + "learning_rate": 3.299712210497465e-06, + "loss": 0.1095, + "step": 49643 + }, + { + "epoch": 3.0733315246880086, + "grad_norm": 4.242344856262207, + "learning_rate": 3.29957516787721e-06, + "loss": 0.0923, + "step": 49644 + }, + { + "epoch": 3.0733450895279435, + "grad_norm": 5.9557976722717285, + "learning_rate": 3.2994381252569553e-06, + "loss": 0.1742, + "step": 49645 + }, + { + "epoch": 3.0733586543678784, + "grad_norm": 3.320787191390991, + "learning_rate": 3.2993010826367e-06, + "loss": 0.0646, + "step": 49646 + }, + { + "epoch": 3.0733722192078132, + "grad_norm": 5.200268268585205, + "learning_rate": 3.2991640400164453e-06, + "loss": 0.1256, + "step": 49647 + }, + { + "epoch": 3.073385784047748, + "grad_norm": 3.7395858764648438, + "learning_rate": 3.2990269973961904e-06, + "loss": 0.0804, + "step": 49648 + }, + { + "epoch": 3.073399348887683, + "grad_norm": 2.7443811893463135, + "learning_rate": 3.2988899547759356e-06, + "loss": 0.0635, + "step": 49649 + }, + { + "epoch": 3.073412913727618, + "grad_norm": 3.851947069168091, + "learning_rate": 3.2987529121556804e-06, + "loss": 0.1586, + "step": 49650 + }, + { + "epoch": 3.0734264785675527, + "grad_norm": 4.85139799118042, + "learning_rate": 3.298615869535426e-06, + "loss": 0.1433, + "step": 49651 + }, + { + "epoch": 3.073440043407488, + "grad_norm": 4.4789838790893555, + "learning_rate": 3.2984788269151707e-06, + "loss": 0.1199, + "step": 49652 + }, + { + "epoch": 3.073453608247423, + "grad_norm": 5.43988037109375, + "learning_rate": 3.2983417842949163e-06, + "loss": 0.1538, + "step": 49653 + }, + { + "epoch": 3.0734671730873577, + "grad_norm": 3.619049072265625, + "learning_rate": 3.298204741674661e-06, + "loss": 0.082, + "step": 49654 + }, + { + "epoch": 3.0734807379272926, + "grad_norm": 4.481590747833252, + "learning_rate": 3.2980676990544063e-06, + "loss": 0.085, + "step": 49655 + }, + { + "epoch": 3.0734943027672275, + "grad_norm": 3.415534734725952, + "learning_rate": 3.297930656434151e-06, + "loss": 0.1245, + "step": 49656 + }, + { + "epoch": 3.0735078676071623, + "grad_norm": 3.8913991451263428, + "learning_rate": 3.2977936138138966e-06, + "loss": 0.1141, + "step": 49657 + }, + { + "epoch": 3.073521432447097, + "grad_norm": 3.9528043270111084, + "learning_rate": 3.2976565711936414e-06, + "loss": 0.0903, + "step": 49658 + }, + { + "epoch": 3.073534997287032, + "grad_norm": 4.452668190002441, + "learning_rate": 3.297519528573387e-06, + "loss": 0.157, + "step": 49659 + }, + { + "epoch": 3.073548562126967, + "grad_norm": 4.98584508895874, + "learning_rate": 3.2973824859531317e-06, + "loss": 0.1729, + "step": 49660 + }, + { + "epoch": 3.073562126966902, + "grad_norm": 4.619263172149658, + "learning_rate": 3.2972454433328765e-06, + "loss": 0.192, + "step": 49661 + }, + { + "epoch": 3.0735756918068367, + "grad_norm": 3.240631341934204, + "learning_rate": 3.297108400712622e-06, + "loss": 0.0937, + "step": 49662 + }, + { + "epoch": 3.0735892566467715, + "grad_norm": 3.4823806285858154, + "learning_rate": 3.296971358092367e-06, + "loss": 0.1003, + "step": 49663 + }, + { + "epoch": 3.0736028214867064, + "grad_norm": 3.4689199924468994, + "learning_rate": 3.296834315472112e-06, + "loss": 0.071, + "step": 49664 + }, + { + "epoch": 3.0736163863266412, + "grad_norm": 4.956240653991699, + "learning_rate": 3.296697272851857e-06, + "loss": 0.1371, + "step": 49665 + }, + { + "epoch": 3.073629951166576, + "grad_norm": 3.458214521408081, + "learning_rate": 3.2965602302316024e-06, + "loss": 0.13, + "step": 49666 + }, + { + "epoch": 3.073643516006511, + "grad_norm": 3.069737434387207, + "learning_rate": 3.296423187611347e-06, + "loss": 0.0597, + "step": 49667 + }, + { + "epoch": 3.073657080846446, + "grad_norm": 4.125096797943115, + "learning_rate": 3.2962861449910927e-06, + "loss": 0.1342, + "step": 49668 + }, + { + "epoch": 3.0736706456863807, + "grad_norm": 4.502094268798828, + "learning_rate": 3.2961491023708375e-06, + "loss": 0.1291, + "step": 49669 + }, + { + "epoch": 3.0736842105263156, + "grad_norm": 3.7352259159088135, + "learning_rate": 3.296012059750583e-06, + "loss": 0.1481, + "step": 49670 + }, + { + "epoch": 3.073697775366251, + "grad_norm": 4.421124458312988, + "learning_rate": 3.295875017130328e-06, + "loss": 0.1643, + "step": 49671 + }, + { + "epoch": 3.0737113402061857, + "grad_norm": 2.4324512481689453, + "learning_rate": 3.295737974510073e-06, + "loss": 0.0813, + "step": 49672 + }, + { + "epoch": 3.0737249050461206, + "grad_norm": 6.764410972595215, + "learning_rate": 3.295600931889818e-06, + "loss": 0.1519, + "step": 49673 + }, + { + "epoch": 3.0737384698860555, + "grad_norm": 3.7241687774658203, + "learning_rate": 3.295463889269563e-06, + "loss": 0.1245, + "step": 49674 + }, + { + "epoch": 3.0737520347259903, + "grad_norm": 4.625344276428223, + "learning_rate": 3.295326846649308e-06, + "loss": 0.0956, + "step": 49675 + }, + { + "epoch": 3.073765599565925, + "grad_norm": 3.967787981033325, + "learning_rate": 3.295189804029053e-06, + "loss": 0.0905, + "step": 49676 + }, + { + "epoch": 3.07377916440586, + "grad_norm": 3.757874011993408, + "learning_rate": 3.2950527614087985e-06, + "loss": 0.0999, + "step": 49677 + }, + { + "epoch": 3.073792729245795, + "grad_norm": 5.27332067489624, + "learning_rate": 3.2949157187885433e-06, + "loss": 0.2832, + "step": 49678 + }, + { + "epoch": 3.07380629408573, + "grad_norm": 5.568241119384766, + "learning_rate": 3.294778676168289e-06, + "loss": 0.2589, + "step": 49679 + }, + { + "epoch": 3.0738198589256647, + "grad_norm": 3.7424089908599854, + "learning_rate": 3.2946416335480336e-06, + "loss": 0.1685, + "step": 49680 + }, + { + "epoch": 3.0738334237655995, + "grad_norm": 3.8376049995422363, + "learning_rate": 3.294504590927779e-06, + "loss": 0.0945, + "step": 49681 + }, + { + "epoch": 3.0738469886055344, + "grad_norm": 5.210161209106445, + "learning_rate": 3.294367548307524e-06, + "loss": 0.1235, + "step": 49682 + }, + { + "epoch": 3.0738605534454693, + "grad_norm": 6.681077480316162, + "learning_rate": 3.294230505687269e-06, + "loss": 0.2231, + "step": 49683 + }, + { + "epoch": 3.073874118285404, + "grad_norm": 3.507338047027588, + "learning_rate": 3.294093463067014e-06, + "loss": 0.0856, + "step": 49684 + }, + { + "epoch": 3.073887683125339, + "grad_norm": 3.580211639404297, + "learning_rate": 3.2939564204467595e-06, + "loss": 0.1329, + "step": 49685 + }, + { + "epoch": 3.073901247965274, + "grad_norm": 5.158836841583252, + "learning_rate": 3.2938193778265043e-06, + "loss": 0.1517, + "step": 49686 + }, + { + "epoch": 3.0739148128052087, + "grad_norm": 5.163144111633301, + "learning_rate": 3.293682335206249e-06, + "loss": 0.201, + "step": 49687 + }, + { + "epoch": 3.0739283776451436, + "grad_norm": 5.053549766540527, + "learning_rate": 3.2935452925859946e-06, + "loss": 0.2258, + "step": 49688 + }, + { + "epoch": 3.073941942485079, + "grad_norm": 4.329472064971924, + "learning_rate": 3.2934082499657394e-06, + "loss": 0.1393, + "step": 49689 + }, + { + "epoch": 3.0739555073250138, + "grad_norm": 3.6250147819519043, + "learning_rate": 3.293271207345485e-06, + "loss": 0.1062, + "step": 49690 + }, + { + "epoch": 3.0739690721649486, + "grad_norm": 5.710036277770996, + "learning_rate": 3.2931341647252297e-06, + "loss": 0.1768, + "step": 49691 + }, + { + "epoch": 3.0739826370048835, + "grad_norm": 5.167826175689697, + "learning_rate": 3.292997122104975e-06, + "loss": 0.1358, + "step": 49692 + }, + { + "epoch": 3.0739962018448184, + "grad_norm": 6.048487663269043, + "learning_rate": 3.2928600794847197e-06, + "loss": 0.2054, + "step": 49693 + }, + { + "epoch": 3.0740097666847532, + "grad_norm": 5.387302875518799, + "learning_rate": 3.2927230368644653e-06, + "loss": 0.1299, + "step": 49694 + }, + { + "epoch": 3.074023331524688, + "grad_norm": 4.036723613739014, + "learning_rate": 3.29258599424421e-06, + "loss": 0.1547, + "step": 49695 + }, + { + "epoch": 3.074036896364623, + "grad_norm": 5.168625354766846, + "learning_rate": 3.2924489516239556e-06, + "loss": 0.1778, + "step": 49696 + }, + { + "epoch": 3.074050461204558, + "grad_norm": 4.938729763031006, + "learning_rate": 3.2923119090037004e-06, + "loss": 0.105, + "step": 49697 + }, + { + "epoch": 3.0740640260444927, + "grad_norm": 4.053805351257324, + "learning_rate": 3.2921748663834456e-06, + "loss": 0.1868, + "step": 49698 + }, + { + "epoch": 3.0740775908844276, + "grad_norm": 4.889498710632324, + "learning_rate": 3.2920378237631907e-06, + "loss": 0.1612, + "step": 49699 + }, + { + "epoch": 3.0740911557243624, + "grad_norm": 5.903809547424316, + "learning_rate": 3.291900781142936e-06, + "loss": 0.218, + "step": 49700 + }, + { + "epoch": 3.0741047205642973, + "grad_norm": 4.808906078338623, + "learning_rate": 3.2917637385226807e-06, + "loss": 0.1888, + "step": 49701 + }, + { + "epoch": 3.074118285404232, + "grad_norm": 5.293550491333008, + "learning_rate": 3.291626695902426e-06, + "loss": 0.302, + "step": 49702 + }, + { + "epoch": 3.074131850244167, + "grad_norm": 7.735405445098877, + "learning_rate": 3.291489653282171e-06, + "loss": 0.1481, + "step": 49703 + }, + { + "epoch": 3.074145415084102, + "grad_norm": 6.2830424308776855, + "learning_rate": 3.291352610661916e-06, + "loss": 0.2953, + "step": 49704 + }, + { + "epoch": 3.0741589799240367, + "grad_norm": 4.568044662475586, + "learning_rate": 3.2912155680416614e-06, + "loss": 0.1298, + "step": 49705 + }, + { + "epoch": 3.0741725447639716, + "grad_norm": 5.2251691818237305, + "learning_rate": 3.291078525421406e-06, + "loss": 0.1756, + "step": 49706 + }, + { + "epoch": 3.0741861096039065, + "grad_norm": 6.873052597045898, + "learning_rate": 3.2909414828011517e-06, + "loss": 0.1446, + "step": 49707 + }, + { + "epoch": 3.0741996744438413, + "grad_norm": 4.133537292480469, + "learning_rate": 3.2908044401808965e-06, + "loss": 0.1767, + "step": 49708 + }, + { + "epoch": 3.0742132392837767, + "grad_norm": 4.326920032501221, + "learning_rate": 3.2906673975606417e-06, + "loss": 0.1099, + "step": 49709 + }, + { + "epoch": 3.0742268041237115, + "grad_norm": 6.467278957366943, + "learning_rate": 3.2905303549403864e-06, + "loss": 0.214, + "step": 49710 + }, + { + "epoch": 3.0742403689636464, + "grad_norm": 5.671291828155518, + "learning_rate": 3.290393312320132e-06, + "loss": 0.1762, + "step": 49711 + }, + { + "epoch": 3.0742539338035813, + "grad_norm": 4.4021124839782715, + "learning_rate": 3.290256269699877e-06, + "loss": 0.1623, + "step": 49712 + }, + { + "epoch": 3.074267498643516, + "grad_norm": 4.969974517822266, + "learning_rate": 3.2901192270796224e-06, + "loss": 0.1557, + "step": 49713 + }, + { + "epoch": 3.074281063483451, + "grad_norm": 4.8165459632873535, + "learning_rate": 3.289982184459367e-06, + "loss": 0.1328, + "step": 49714 + }, + { + "epoch": 3.074294628323386, + "grad_norm": 4.237053871154785, + "learning_rate": 3.289845141839112e-06, + "loss": 0.1559, + "step": 49715 + }, + { + "epoch": 3.0743081931633207, + "grad_norm": 5.273493766784668, + "learning_rate": 3.2897080992188575e-06, + "loss": 0.1748, + "step": 49716 + }, + { + "epoch": 3.0743217580032556, + "grad_norm": 4.245416641235352, + "learning_rate": 3.2895710565986023e-06, + "loss": 0.1294, + "step": 49717 + }, + { + "epoch": 3.0743353228431904, + "grad_norm": 5.3312177658081055, + "learning_rate": 3.2894340139783474e-06, + "loss": 0.2217, + "step": 49718 + }, + { + "epoch": 3.0743488876831253, + "grad_norm": 3.9022345542907715, + "learning_rate": 3.2892969713580926e-06, + "loss": 0.0756, + "step": 49719 + }, + { + "epoch": 3.07436245252306, + "grad_norm": 5.414154529571533, + "learning_rate": 3.289159928737838e-06, + "loss": 0.1626, + "step": 49720 + }, + { + "epoch": 3.074376017362995, + "grad_norm": 4.943247318267822, + "learning_rate": 3.2890228861175826e-06, + "loss": 0.1236, + "step": 49721 + }, + { + "epoch": 3.07438958220293, + "grad_norm": 4.19594144821167, + "learning_rate": 3.288885843497328e-06, + "loss": 0.1435, + "step": 49722 + }, + { + "epoch": 3.0744031470428648, + "grad_norm": 4.136141777038574, + "learning_rate": 3.288748800877073e-06, + "loss": 0.1009, + "step": 49723 + }, + { + "epoch": 3.0744167118827996, + "grad_norm": 6.752120494842529, + "learning_rate": 3.2886117582568185e-06, + "loss": 0.294, + "step": 49724 + }, + { + "epoch": 3.0744302767227345, + "grad_norm": 3.571647882461548, + "learning_rate": 3.2884747156365633e-06, + "loss": 0.1599, + "step": 49725 + }, + { + "epoch": 3.0744438415626694, + "grad_norm": 4.6019792556762695, + "learning_rate": 3.2883376730163085e-06, + "loss": 0.1475, + "step": 49726 + }, + { + "epoch": 3.0744574064026047, + "grad_norm": 4.754836559295654, + "learning_rate": 3.288200630396053e-06, + "loss": 0.1717, + "step": 49727 + }, + { + "epoch": 3.0744709712425395, + "grad_norm": 3.9271063804626465, + "learning_rate": 3.288063587775799e-06, + "loss": 0.1396, + "step": 49728 + }, + { + "epoch": 3.0744845360824744, + "grad_norm": 5.239897727966309, + "learning_rate": 3.2879265451555436e-06, + "loss": 0.1245, + "step": 49729 + }, + { + "epoch": 3.0744981009224093, + "grad_norm": 3.6713428497314453, + "learning_rate": 3.2877895025352883e-06, + "loss": 0.0936, + "step": 49730 + }, + { + "epoch": 3.074511665762344, + "grad_norm": 2.9850192070007324, + "learning_rate": 3.287652459915034e-06, + "loss": 0.0953, + "step": 49731 + }, + { + "epoch": 3.074525230602279, + "grad_norm": 4.097355365753174, + "learning_rate": 3.2875154172947787e-06, + "loss": 0.1017, + "step": 49732 + }, + { + "epoch": 3.074538795442214, + "grad_norm": 3.6387712955474854, + "learning_rate": 3.2873783746745243e-06, + "loss": 0.0922, + "step": 49733 + }, + { + "epoch": 3.0745523602821487, + "grad_norm": 3.167360305786133, + "learning_rate": 3.287241332054269e-06, + "loss": 0.1152, + "step": 49734 + }, + { + "epoch": 3.0745659251220836, + "grad_norm": 5.23660135269165, + "learning_rate": 3.2871042894340142e-06, + "loss": 0.1568, + "step": 49735 + }, + { + "epoch": 3.0745794899620185, + "grad_norm": 3.6108455657958984, + "learning_rate": 3.2869672468137594e-06, + "loss": 0.0938, + "step": 49736 + }, + { + "epoch": 3.0745930548019533, + "grad_norm": 3.553980588912964, + "learning_rate": 3.2868302041935046e-06, + "loss": 0.053, + "step": 49737 + }, + { + "epoch": 3.074606619641888, + "grad_norm": 4.652257919311523, + "learning_rate": 3.2866931615732493e-06, + "loss": 0.1009, + "step": 49738 + }, + { + "epoch": 3.074620184481823, + "grad_norm": 4.541880130767822, + "learning_rate": 3.286556118952995e-06, + "loss": 0.1404, + "step": 49739 + }, + { + "epoch": 3.074633749321758, + "grad_norm": 4.102879524230957, + "learning_rate": 3.2864190763327397e-06, + "loss": 0.1537, + "step": 49740 + }, + { + "epoch": 3.074647314161693, + "grad_norm": 3.372499704360962, + "learning_rate": 3.2862820337124853e-06, + "loss": 0.1219, + "step": 49741 + }, + { + "epoch": 3.0746608790016277, + "grad_norm": 3.631448984146118, + "learning_rate": 3.28614499109223e-06, + "loss": 0.1076, + "step": 49742 + }, + { + "epoch": 3.0746744438415625, + "grad_norm": 3.995941638946533, + "learning_rate": 3.286007948471975e-06, + "loss": 0.0679, + "step": 49743 + }, + { + "epoch": 3.0746880086814974, + "grad_norm": 6.445837020874023, + "learning_rate": 3.2858709058517204e-06, + "loss": 0.1129, + "step": 49744 + }, + { + "epoch": 3.0747015735214323, + "grad_norm": 3.8664517402648926, + "learning_rate": 3.285733863231465e-06, + "loss": 0.0896, + "step": 49745 + }, + { + "epoch": 3.074715138361367, + "grad_norm": 5.511532783508301, + "learning_rate": 3.2855968206112103e-06, + "loss": 0.1417, + "step": 49746 + }, + { + "epoch": 3.0747287032013024, + "grad_norm": 4.503820896148682, + "learning_rate": 3.285459777990955e-06, + "loss": 0.1049, + "step": 49747 + }, + { + "epoch": 3.0747422680412373, + "grad_norm": 5.194578647613525, + "learning_rate": 3.2853227353707007e-06, + "loss": 0.1019, + "step": 49748 + }, + { + "epoch": 3.074755832881172, + "grad_norm": 5.327191352844238, + "learning_rate": 3.2851856927504455e-06, + "loss": 0.1115, + "step": 49749 + }, + { + "epoch": 3.074769397721107, + "grad_norm": 5.1542510986328125, + "learning_rate": 3.285048650130191e-06, + "loss": 0.1117, + "step": 49750 + }, + { + "epoch": 3.074782962561042, + "grad_norm": 3.67386794090271, + "learning_rate": 3.284911607509936e-06, + "loss": 0.1326, + "step": 49751 + }, + { + "epoch": 3.0747965274009768, + "grad_norm": 4.607021331787109, + "learning_rate": 3.284774564889681e-06, + "loss": 0.1493, + "step": 49752 + }, + { + "epoch": 3.0748100922409116, + "grad_norm": 3.5992069244384766, + "learning_rate": 3.284637522269426e-06, + "loss": 0.1077, + "step": 49753 + }, + { + "epoch": 3.0748236570808465, + "grad_norm": 4.818291664123535, + "learning_rate": 3.2845004796491713e-06, + "loss": 0.1695, + "step": 49754 + }, + { + "epoch": 3.0748372219207813, + "grad_norm": 4.867337226867676, + "learning_rate": 3.284363437028916e-06, + "loss": 0.1801, + "step": 49755 + }, + { + "epoch": 3.074850786760716, + "grad_norm": 6.852706432342529, + "learning_rate": 3.2842263944086613e-06, + "loss": 0.1163, + "step": 49756 + }, + { + "epoch": 3.074864351600651, + "grad_norm": 5.5644307136535645, + "learning_rate": 3.2840893517884065e-06, + "loss": 0.1676, + "step": 49757 + }, + { + "epoch": 3.074877916440586, + "grad_norm": 3.9672865867614746, + "learning_rate": 3.2839523091681512e-06, + "loss": 0.0637, + "step": 49758 + }, + { + "epoch": 3.074891481280521, + "grad_norm": 5.734317302703857, + "learning_rate": 3.283815266547897e-06, + "loss": 0.122, + "step": 49759 + }, + { + "epoch": 3.0749050461204557, + "grad_norm": 3.8203773498535156, + "learning_rate": 3.2836782239276416e-06, + "loss": 0.103, + "step": 49760 + }, + { + "epoch": 3.0749186109603905, + "grad_norm": 4.9679365158081055, + "learning_rate": 3.283541181307387e-06, + "loss": 0.1238, + "step": 49761 + }, + { + "epoch": 3.0749321758003254, + "grad_norm": 4.404623031616211, + "learning_rate": 3.283404138687132e-06, + "loss": 0.1347, + "step": 49762 + }, + { + "epoch": 3.0749457406402603, + "grad_norm": 4.399552822113037, + "learning_rate": 3.283267096066877e-06, + "loss": 0.1123, + "step": 49763 + }, + { + "epoch": 3.074959305480195, + "grad_norm": 3.844714641571045, + "learning_rate": 3.283130053446622e-06, + "loss": 0.1004, + "step": 49764 + }, + { + "epoch": 3.0749728703201304, + "grad_norm": 4.468515872955322, + "learning_rate": 3.2829930108263675e-06, + "loss": 0.1342, + "step": 49765 + }, + { + "epoch": 3.0749864351600653, + "grad_norm": 4.626185894012451, + "learning_rate": 3.2828559682061122e-06, + "loss": 0.0822, + "step": 49766 + }, + { + "epoch": 3.075, + "grad_norm": 5.183245658874512, + "learning_rate": 3.282718925585858e-06, + "loss": 0.1546, + "step": 49767 + }, + { + "epoch": 3.075013564839935, + "grad_norm": 4.445494651794434, + "learning_rate": 3.2825818829656026e-06, + "loss": 0.1055, + "step": 49768 + }, + { + "epoch": 3.07502712967987, + "grad_norm": 4.2260050773620605, + "learning_rate": 3.2824448403453478e-06, + "loss": 0.132, + "step": 49769 + }, + { + "epoch": 3.0750406945198048, + "grad_norm": 3.9325318336486816, + "learning_rate": 3.282307797725093e-06, + "loss": 0.1014, + "step": 49770 + }, + { + "epoch": 3.0750542593597396, + "grad_norm": 4.386498928070068, + "learning_rate": 3.2821707551048377e-06, + "loss": 0.132, + "step": 49771 + }, + { + "epoch": 3.0750678241996745, + "grad_norm": 2.996986150741577, + "learning_rate": 3.282033712484583e-06, + "loss": 0.0987, + "step": 49772 + }, + { + "epoch": 3.0750813890396094, + "grad_norm": 3.9930503368377686, + "learning_rate": 3.281896669864328e-06, + "loss": 0.1076, + "step": 49773 + }, + { + "epoch": 3.0750949538795442, + "grad_norm": 6.956021785736084, + "learning_rate": 3.2817596272440732e-06, + "loss": 0.192, + "step": 49774 + }, + { + "epoch": 3.075108518719479, + "grad_norm": 4.266304016113281, + "learning_rate": 3.281622584623818e-06, + "loss": 0.163, + "step": 49775 + }, + { + "epoch": 3.075122083559414, + "grad_norm": 5.086277484893799, + "learning_rate": 3.2814855420035636e-06, + "loss": 0.1517, + "step": 49776 + }, + { + "epoch": 3.075135648399349, + "grad_norm": 5.863672733306885, + "learning_rate": 3.2813484993833083e-06, + "loss": 0.1882, + "step": 49777 + }, + { + "epoch": 3.0751492132392837, + "grad_norm": 4.118113040924072, + "learning_rate": 3.281211456763054e-06, + "loss": 0.0801, + "step": 49778 + }, + { + "epoch": 3.0751627780792186, + "grad_norm": 5.185129642486572, + "learning_rate": 3.2810744141427987e-06, + "loss": 0.0933, + "step": 49779 + }, + { + "epoch": 3.0751763429191534, + "grad_norm": 4.970293998718262, + "learning_rate": 3.280937371522544e-06, + "loss": 0.1119, + "step": 49780 + }, + { + "epoch": 3.0751899077590883, + "grad_norm": 3.2413716316223145, + "learning_rate": 3.2808003289022886e-06, + "loss": 0.0662, + "step": 49781 + }, + { + "epoch": 3.075203472599023, + "grad_norm": 5.3729681968688965, + "learning_rate": 3.2806632862820342e-06, + "loss": 0.1151, + "step": 49782 + }, + { + "epoch": 3.075217037438958, + "grad_norm": 3.1593284606933594, + "learning_rate": 3.280526243661779e-06, + "loss": 0.0998, + "step": 49783 + }, + { + "epoch": 3.075230602278893, + "grad_norm": 2.9763829708099365, + "learning_rate": 3.2803892010415237e-06, + "loss": 0.1, + "step": 49784 + }, + { + "epoch": 3.075244167118828, + "grad_norm": 5.089171409606934, + "learning_rate": 3.2802521584212693e-06, + "loss": 0.186, + "step": 49785 + }, + { + "epoch": 3.075257731958763, + "grad_norm": 2.6040806770324707, + "learning_rate": 3.280115115801014e-06, + "loss": 0.0526, + "step": 49786 + }, + { + "epoch": 3.075271296798698, + "grad_norm": 4.571316242218018, + "learning_rate": 3.2799780731807597e-06, + "loss": 0.2191, + "step": 49787 + }, + { + "epoch": 3.075284861638633, + "grad_norm": 5.348711013793945, + "learning_rate": 3.2798410305605045e-06, + "loss": 0.1903, + "step": 49788 + }, + { + "epoch": 3.0752984264785677, + "grad_norm": 5.407539367675781, + "learning_rate": 3.2797039879402496e-06, + "loss": 0.0725, + "step": 49789 + }, + { + "epoch": 3.0753119913185025, + "grad_norm": 5.37324857711792, + "learning_rate": 3.279566945319995e-06, + "loss": 0.1265, + "step": 49790 + }, + { + "epoch": 3.0753255561584374, + "grad_norm": 6.455852508544922, + "learning_rate": 3.27942990269974e-06, + "loss": 0.1258, + "step": 49791 + }, + { + "epoch": 3.0753391209983723, + "grad_norm": 2.940520763397217, + "learning_rate": 3.2792928600794848e-06, + "loss": 0.0435, + "step": 49792 + }, + { + "epoch": 3.075352685838307, + "grad_norm": 2.880790948867798, + "learning_rate": 3.2791558174592304e-06, + "loss": 0.0637, + "step": 49793 + }, + { + "epoch": 3.075366250678242, + "grad_norm": 3.792020320892334, + "learning_rate": 3.279018774838975e-06, + "loss": 0.0795, + "step": 49794 + }, + { + "epoch": 3.075379815518177, + "grad_norm": 4.382287502288818, + "learning_rate": 3.2788817322187207e-06, + "loss": 0.1141, + "step": 49795 + }, + { + "epoch": 3.0753933803581117, + "grad_norm": 2.3851120471954346, + "learning_rate": 3.2787446895984655e-06, + "loss": 0.035, + "step": 49796 + }, + { + "epoch": 3.0754069451980466, + "grad_norm": 5.390882968902588, + "learning_rate": 3.2786076469782106e-06, + "loss": 0.1641, + "step": 49797 + }, + { + "epoch": 3.0754205100379814, + "grad_norm": 3.8127384185791016, + "learning_rate": 3.2784706043579554e-06, + "loss": 0.0597, + "step": 49798 + }, + { + "epoch": 3.0754340748779163, + "grad_norm": 3.9461398124694824, + "learning_rate": 3.2783335617377006e-06, + "loss": 0.0825, + "step": 49799 + }, + { + "epoch": 3.075447639717851, + "grad_norm": 3.2863786220550537, + "learning_rate": 3.2781965191174458e-06, + "loss": 0.0927, + "step": 49800 + }, + { + "epoch": 3.075461204557786, + "grad_norm": 5.302789688110352, + "learning_rate": 3.2780594764971905e-06, + "loss": 0.0921, + "step": 49801 + }, + { + "epoch": 3.075474769397721, + "grad_norm": 3.0358591079711914, + "learning_rate": 3.277922433876936e-06, + "loss": 0.0859, + "step": 49802 + }, + { + "epoch": 3.075488334237656, + "grad_norm": 4.605861663818359, + "learning_rate": 3.277785391256681e-06, + "loss": 0.1078, + "step": 49803 + }, + { + "epoch": 3.075501899077591, + "grad_norm": 4.301411151885986, + "learning_rate": 3.2776483486364265e-06, + "loss": 0.1754, + "step": 49804 + }, + { + "epoch": 3.075515463917526, + "grad_norm": 3.634434223175049, + "learning_rate": 3.2775113060161712e-06, + "loss": 0.1164, + "step": 49805 + }, + { + "epoch": 3.075529028757461, + "grad_norm": 9.164871215820312, + "learning_rate": 3.2773742633959164e-06, + "loss": 0.1587, + "step": 49806 + }, + { + "epoch": 3.0755425935973957, + "grad_norm": 4.4683122634887695, + "learning_rate": 3.2772372207756616e-06, + "loss": 0.0916, + "step": 49807 + }, + { + "epoch": 3.0755561584373305, + "grad_norm": 4.434727668762207, + "learning_rate": 3.2771001781554068e-06, + "loss": 0.077, + "step": 49808 + }, + { + "epoch": 3.0755697232772654, + "grad_norm": 3.9231674671173096, + "learning_rate": 3.2769631355351515e-06, + "loss": 0.0857, + "step": 49809 + }, + { + "epoch": 3.0755832881172003, + "grad_norm": 3.778155565261841, + "learning_rate": 3.276826092914897e-06, + "loss": 0.0656, + "step": 49810 + }, + { + "epoch": 3.075596852957135, + "grad_norm": 5.021026611328125, + "learning_rate": 3.276689050294642e-06, + "loss": 0.1241, + "step": 49811 + }, + { + "epoch": 3.07561041779707, + "grad_norm": 4.930185794830322, + "learning_rate": 3.2765520076743866e-06, + "loss": 0.1034, + "step": 49812 + }, + { + "epoch": 3.075623982637005, + "grad_norm": 3.7433700561523438, + "learning_rate": 3.2764149650541322e-06, + "loss": 0.0624, + "step": 49813 + }, + { + "epoch": 3.0756375474769397, + "grad_norm": 2.8875231742858887, + "learning_rate": 3.276277922433877e-06, + "loss": 0.0743, + "step": 49814 + }, + { + "epoch": 3.0756511123168746, + "grad_norm": 3.3347179889678955, + "learning_rate": 3.2761408798136226e-06, + "loss": 0.1145, + "step": 49815 + }, + { + "epoch": 3.0756646771568095, + "grad_norm": 4.7279133796691895, + "learning_rate": 3.2760038371933674e-06, + "loss": 0.1294, + "step": 49816 + }, + { + "epoch": 3.0756782419967443, + "grad_norm": 3.0724945068359375, + "learning_rate": 3.2758667945731125e-06, + "loss": 0.0804, + "step": 49817 + }, + { + "epoch": 3.075691806836679, + "grad_norm": 5.189722537994385, + "learning_rate": 3.2757297519528573e-06, + "loss": 0.1045, + "step": 49818 + }, + { + "epoch": 3.075705371676614, + "grad_norm": 4.204914569854736, + "learning_rate": 3.275592709332603e-06, + "loss": 0.0893, + "step": 49819 + }, + { + "epoch": 3.075718936516549, + "grad_norm": 2.5828864574432373, + "learning_rate": 3.2754556667123476e-06, + "loss": 0.0562, + "step": 49820 + }, + { + "epoch": 3.075732501356484, + "grad_norm": 2.730140209197998, + "learning_rate": 3.2753186240920932e-06, + "loss": 0.0423, + "step": 49821 + }, + { + "epoch": 3.0757460661964187, + "grad_norm": 3.3486523628234863, + "learning_rate": 3.275181581471838e-06, + "loss": 0.0574, + "step": 49822 + }, + { + "epoch": 3.075759631036354, + "grad_norm": 5.996628284454346, + "learning_rate": 3.275044538851583e-06, + "loss": 0.1537, + "step": 49823 + }, + { + "epoch": 3.075773195876289, + "grad_norm": 4.220724582672119, + "learning_rate": 3.2749074962313284e-06, + "loss": 0.1393, + "step": 49824 + }, + { + "epoch": 3.0757867607162237, + "grad_norm": 4.438657760620117, + "learning_rate": 3.274770453611073e-06, + "loss": 0.1601, + "step": 49825 + }, + { + "epoch": 3.0758003255561586, + "grad_norm": 4.484395503997803, + "learning_rate": 3.2746334109908183e-06, + "loss": 0.1102, + "step": 49826 + }, + { + "epoch": 3.0758138903960934, + "grad_norm": 5.322057723999023, + "learning_rate": 3.2744963683705635e-06, + "loss": 0.1525, + "step": 49827 + }, + { + "epoch": 3.0758274552360283, + "grad_norm": 5.557416915893555, + "learning_rate": 3.2743593257503087e-06, + "loss": 0.1454, + "step": 49828 + }, + { + "epoch": 3.075841020075963, + "grad_norm": 4.133238315582275, + "learning_rate": 3.2742222831300534e-06, + "loss": 0.0962, + "step": 49829 + }, + { + "epoch": 3.075854584915898, + "grad_norm": 3.2479779720306396, + "learning_rate": 3.274085240509799e-06, + "loss": 0.076, + "step": 49830 + }, + { + "epoch": 3.075868149755833, + "grad_norm": 4.1566548347473145, + "learning_rate": 3.2739481978895438e-06, + "loss": 0.1124, + "step": 49831 + }, + { + "epoch": 3.0758817145957678, + "grad_norm": 5.807626247406006, + "learning_rate": 3.2738111552692894e-06, + "loss": 0.1552, + "step": 49832 + }, + { + "epoch": 3.0758952794357026, + "grad_norm": 3.6002190113067627, + "learning_rate": 3.273674112649034e-06, + "loss": 0.1159, + "step": 49833 + }, + { + "epoch": 3.0759088442756375, + "grad_norm": 4.531519412994385, + "learning_rate": 3.2735370700287793e-06, + "loss": 0.088, + "step": 49834 + }, + { + "epoch": 3.0759224091155724, + "grad_norm": 4.081352710723877, + "learning_rate": 3.273400027408524e-06, + "loss": 0.1201, + "step": 49835 + }, + { + "epoch": 3.075935973955507, + "grad_norm": 8.919836044311523, + "learning_rate": 3.2732629847882697e-06, + "loss": 0.2993, + "step": 49836 + }, + { + "epoch": 3.075949538795442, + "grad_norm": 3.849846839904785, + "learning_rate": 3.2731259421680144e-06, + "loss": 0.0854, + "step": 49837 + }, + { + "epoch": 3.075963103635377, + "grad_norm": 3.611948013305664, + "learning_rate": 3.27298889954776e-06, + "loss": 0.121, + "step": 49838 + }, + { + "epoch": 3.075976668475312, + "grad_norm": 5.145490646362305, + "learning_rate": 3.2728518569275048e-06, + "loss": 0.167, + "step": 49839 + }, + { + "epoch": 3.0759902333152467, + "grad_norm": 3.1538424491882324, + "learning_rate": 3.2727148143072495e-06, + "loss": 0.0927, + "step": 49840 + }, + { + "epoch": 3.076003798155182, + "grad_norm": 4.28175163269043, + "learning_rate": 3.272577771686995e-06, + "loss": 0.1804, + "step": 49841 + }, + { + "epoch": 3.076017362995117, + "grad_norm": 4.9023356437683105, + "learning_rate": 3.27244072906674e-06, + "loss": 0.1271, + "step": 49842 + }, + { + "epoch": 3.0760309278350517, + "grad_norm": 5.468594551086426, + "learning_rate": 3.272303686446485e-06, + "loss": 0.1212, + "step": 49843 + }, + { + "epoch": 3.0760444926749866, + "grad_norm": 5.112062454223633, + "learning_rate": 3.2721666438262302e-06, + "loss": 0.1073, + "step": 49844 + }, + { + "epoch": 3.0760580575149215, + "grad_norm": 3.568807363510132, + "learning_rate": 3.2720296012059754e-06, + "loss": 0.1138, + "step": 49845 + }, + { + "epoch": 3.0760716223548563, + "grad_norm": 6.424602508544922, + "learning_rate": 3.27189255858572e-06, + "loss": 0.1255, + "step": 49846 + }, + { + "epoch": 3.076085187194791, + "grad_norm": 4.982677936553955, + "learning_rate": 3.2717555159654658e-06, + "loss": 0.1379, + "step": 49847 + }, + { + "epoch": 3.076098752034726, + "grad_norm": 4.569610595703125, + "learning_rate": 3.2716184733452105e-06, + "loss": 0.18, + "step": 49848 + }, + { + "epoch": 3.076112316874661, + "grad_norm": 2.9070098400115967, + "learning_rate": 3.271481430724956e-06, + "loss": 0.0776, + "step": 49849 + }, + { + "epoch": 3.0761258817145958, + "grad_norm": 4.697257995605469, + "learning_rate": 3.271344388104701e-06, + "loss": 0.1893, + "step": 49850 + }, + { + "epoch": 3.0761394465545306, + "grad_norm": 7.563516616821289, + "learning_rate": 3.271207345484446e-06, + "loss": 0.1826, + "step": 49851 + }, + { + "epoch": 3.0761530113944655, + "grad_norm": 3.179028272628784, + "learning_rate": 3.271070302864191e-06, + "loss": 0.1141, + "step": 49852 + }, + { + "epoch": 3.0761665762344004, + "grad_norm": 4.019516468048096, + "learning_rate": 3.270933260243936e-06, + "loss": 0.1334, + "step": 49853 + }, + { + "epoch": 3.0761801410743352, + "grad_norm": 3.7384188175201416, + "learning_rate": 3.270796217623681e-06, + "loss": 0.1914, + "step": 49854 + }, + { + "epoch": 3.07619370591427, + "grad_norm": 3.4224653244018555, + "learning_rate": 3.270659175003426e-06, + "loss": 0.1513, + "step": 49855 + }, + { + "epoch": 3.076207270754205, + "grad_norm": 3.225245475769043, + "learning_rate": 3.2705221323831715e-06, + "loss": 0.1096, + "step": 49856 + }, + { + "epoch": 3.07622083559414, + "grad_norm": 3.659458875656128, + "learning_rate": 3.2703850897629163e-06, + "loss": 0.1378, + "step": 49857 + }, + { + "epoch": 3.0762344004340747, + "grad_norm": 4.151686191558838, + "learning_rate": 3.270248047142662e-06, + "loss": 0.2274, + "step": 49858 + }, + { + "epoch": 3.0762479652740096, + "grad_norm": 3.4433155059814453, + "learning_rate": 3.2701110045224067e-06, + "loss": 0.1483, + "step": 49859 + }, + { + "epoch": 3.0762615301139444, + "grad_norm": 3.7569875717163086, + "learning_rate": 3.269973961902152e-06, + "loss": 0.1523, + "step": 49860 + }, + { + "epoch": 3.0762750949538797, + "grad_norm": 4.744974613189697, + "learning_rate": 3.269836919281897e-06, + "loss": 0.0988, + "step": 49861 + }, + { + "epoch": 3.0762886597938146, + "grad_norm": 3.706758737564087, + "learning_rate": 3.269699876661642e-06, + "loss": 0.112, + "step": 49862 + }, + { + "epoch": 3.0763022246337495, + "grad_norm": 2.9619174003601074, + "learning_rate": 3.269562834041387e-06, + "loss": 0.1142, + "step": 49863 + }, + { + "epoch": 3.0763157894736843, + "grad_norm": 4.000762939453125, + "learning_rate": 3.2694257914211326e-06, + "loss": 0.0875, + "step": 49864 + }, + { + "epoch": 3.076329354313619, + "grad_norm": 4.838887691497803, + "learning_rate": 3.2692887488008773e-06, + "loss": 0.1574, + "step": 49865 + }, + { + "epoch": 3.076342919153554, + "grad_norm": 4.341851711273193, + "learning_rate": 3.269151706180622e-06, + "loss": 0.1375, + "step": 49866 + }, + { + "epoch": 3.076356483993489, + "grad_norm": 4.700599193572998, + "learning_rate": 3.2690146635603677e-06, + "loss": 0.0833, + "step": 49867 + }, + { + "epoch": 3.076370048833424, + "grad_norm": 4.579439640045166, + "learning_rate": 3.2688776209401124e-06, + "loss": 0.1261, + "step": 49868 + }, + { + "epoch": 3.0763836136733587, + "grad_norm": 6.3633599281311035, + "learning_rate": 3.2687405783198576e-06, + "loss": 0.1879, + "step": 49869 + }, + { + "epoch": 3.0763971785132935, + "grad_norm": 4.39522123336792, + "learning_rate": 3.2686035356996028e-06, + "loss": 0.1738, + "step": 49870 + }, + { + "epoch": 3.0764107433532284, + "grad_norm": 3.771496534347534, + "learning_rate": 3.268466493079348e-06, + "loss": 0.1148, + "step": 49871 + }, + { + "epoch": 3.0764243081931633, + "grad_norm": 5.236638069152832, + "learning_rate": 3.2683294504590927e-06, + "loss": 0.1426, + "step": 49872 + }, + { + "epoch": 3.076437873033098, + "grad_norm": 4.485317707061768, + "learning_rate": 3.2681924078388383e-06, + "loss": 0.0727, + "step": 49873 + }, + { + "epoch": 3.076451437873033, + "grad_norm": 5.409073829650879, + "learning_rate": 3.268055365218583e-06, + "loss": 0.1351, + "step": 49874 + }, + { + "epoch": 3.076465002712968, + "grad_norm": 4.2552809715271, + "learning_rate": 3.2679183225983287e-06, + "loss": 0.0872, + "step": 49875 + }, + { + "epoch": 3.0764785675529027, + "grad_norm": 4.054361820220947, + "learning_rate": 3.2677812799780734e-06, + "loss": 0.1249, + "step": 49876 + }, + { + "epoch": 3.0764921323928376, + "grad_norm": 3.971062421798706, + "learning_rate": 3.2676442373578186e-06, + "loss": 0.1508, + "step": 49877 + }, + { + "epoch": 3.0765056972327725, + "grad_norm": 7.390801429748535, + "learning_rate": 3.2675071947375638e-06, + "loss": 0.1697, + "step": 49878 + }, + { + "epoch": 3.0765192620727078, + "grad_norm": 4.598783016204834, + "learning_rate": 3.267370152117309e-06, + "loss": 0.1124, + "step": 49879 + }, + { + "epoch": 3.0765328269126426, + "grad_norm": 4.7693986892700195, + "learning_rate": 3.2672331094970537e-06, + "loss": 0.1001, + "step": 49880 + }, + { + "epoch": 3.0765463917525775, + "grad_norm": 3.5719311237335205, + "learning_rate": 3.267096066876799e-06, + "loss": 0.1228, + "step": 49881 + }, + { + "epoch": 3.0765599565925124, + "grad_norm": 4.370859622955322, + "learning_rate": 3.266959024256544e-06, + "loss": 0.1624, + "step": 49882 + }, + { + "epoch": 3.0765735214324472, + "grad_norm": 6.367376327514648, + "learning_rate": 3.266821981636289e-06, + "loss": 0.1795, + "step": 49883 + }, + { + "epoch": 3.076587086272382, + "grad_norm": 2.7839789390563965, + "learning_rate": 3.2666849390160344e-06, + "loss": 0.0677, + "step": 49884 + }, + { + "epoch": 3.076600651112317, + "grad_norm": 4.252567768096924, + "learning_rate": 3.266547896395779e-06, + "loss": 0.1126, + "step": 49885 + }, + { + "epoch": 3.076614215952252, + "grad_norm": 5.326406955718994, + "learning_rate": 3.266410853775525e-06, + "loss": 0.1556, + "step": 49886 + }, + { + "epoch": 3.0766277807921867, + "grad_norm": 3.3594751358032227, + "learning_rate": 3.2662738111552695e-06, + "loss": 0.0819, + "step": 49887 + }, + { + "epoch": 3.0766413456321215, + "grad_norm": 3.8000271320343018, + "learning_rate": 3.2661367685350147e-06, + "loss": 0.1059, + "step": 49888 + }, + { + "epoch": 3.0766549104720564, + "grad_norm": 4.25271463394165, + "learning_rate": 3.2659997259147595e-06, + "loss": 0.1147, + "step": 49889 + }, + { + "epoch": 3.0766684753119913, + "grad_norm": 4.3891191482543945, + "learning_rate": 3.265862683294505e-06, + "loss": 0.098, + "step": 49890 + }, + { + "epoch": 3.076682040151926, + "grad_norm": 3.6214051246643066, + "learning_rate": 3.26572564067425e-06, + "loss": 0.0885, + "step": 49891 + }, + { + "epoch": 3.076695604991861, + "grad_norm": 5.011360168457031, + "learning_rate": 3.2655885980539954e-06, + "loss": 0.0994, + "step": 49892 + }, + { + "epoch": 3.076709169831796, + "grad_norm": 5.109261989593506, + "learning_rate": 3.26545155543374e-06, + "loss": 0.2219, + "step": 49893 + }, + { + "epoch": 3.0767227346717307, + "grad_norm": 4.347604274749756, + "learning_rate": 3.265314512813485e-06, + "loss": 0.0844, + "step": 49894 + }, + { + "epoch": 3.0767362995116656, + "grad_norm": 4.764649391174316, + "learning_rate": 3.2651774701932306e-06, + "loss": 0.1422, + "step": 49895 + }, + { + "epoch": 3.0767498643516005, + "grad_norm": 2.9542765617370605, + "learning_rate": 3.2650404275729753e-06, + "loss": 0.0344, + "step": 49896 + }, + { + "epoch": 3.0767634291915353, + "grad_norm": 4.207781791687012, + "learning_rate": 3.2649033849527205e-06, + "loss": 0.2092, + "step": 49897 + }, + { + "epoch": 3.07677699403147, + "grad_norm": 3.6228814125061035, + "learning_rate": 3.2647663423324657e-06, + "loss": 0.0543, + "step": 49898 + }, + { + "epoch": 3.0767905588714055, + "grad_norm": 2.9057891368865967, + "learning_rate": 3.264629299712211e-06, + "loss": 0.0641, + "step": 49899 + }, + { + "epoch": 3.0768041237113404, + "grad_norm": 3.3452911376953125, + "learning_rate": 3.2644922570919556e-06, + "loss": 0.0505, + "step": 49900 + }, + { + "epoch": 3.0768176885512752, + "grad_norm": 4.308302879333496, + "learning_rate": 3.264355214471701e-06, + "loss": 0.1172, + "step": 49901 + }, + { + "epoch": 3.07683125339121, + "grad_norm": 4.1837615966796875, + "learning_rate": 3.264218171851446e-06, + "loss": 0.0939, + "step": 49902 + }, + { + "epoch": 3.076844818231145, + "grad_norm": 3.1090424060821533, + "learning_rate": 3.2640811292311916e-06, + "loss": 0.1034, + "step": 49903 + }, + { + "epoch": 3.07685838307108, + "grad_norm": 2.2138075828552246, + "learning_rate": 3.2639440866109363e-06, + "loss": 0.0407, + "step": 49904 + }, + { + "epoch": 3.0768719479110147, + "grad_norm": 5.657049655914307, + "learning_rate": 3.2638070439906815e-06, + "loss": 0.1396, + "step": 49905 + }, + { + "epoch": 3.0768855127509496, + "grad_norm": 2.8899011611938477, + "learning_rate": 3.2636700013704263e-06, + "loss": 0.0546, + "step": 49906 + }, + { + "epoch": 3.0768990775908844, + "grad_norm": 4.166865825653076, + "learning_rate": 3.263532958750172e-06, + "loss": 0.0717, + "step": 49907 + }, + { + "epoch": 3.0769126424308193, + "grad_norm": 3.903839349746704, + "learning_rate": 3.2633959161299166e-06, + "loss": 0.0842, + "step": 49908 + }, + { + "epoch": 3.076926207270754, + "grad_norm": 4.903419494628906, + "learning_rate": 3.2632588735096614e-06, + "loss": 0.1052, + "step": 49909 + }, + { + "epoch": 3.076939772110689, + "grad_norm": 4.215639591217041, + "learning_rate": 3.263121830889407e-06, + "loss": 0.0922, + "step": 49910 + }, + { + "epoch": 3.076953336950624, + "grad_norm": 4.151927471160889, + "learning_rate": 3.2629847882691517e-06, + "loss": 0.0984, + "step": 49911 + }, + { + "epoch": 3.0769669017905588, + "grad_norm": 4.734694480895996, + "learning_rate": 3.2628477456488973e-06, + "loss": 0.0802, + "step": 49912 + }, + { + "epoch": 3.0769804666304936, + "grad_norm": 4.653255462646484, + "learning_rate": 3.262710703028642e-06, + "loss": 0.0547, + "step": 49913 + }, + { + "epoch": 3.0769940314704285, + "grad_norm": 2.386399507522583, + "learning_rate": 3.2625736604083873e-06, + "loss": 0.0301, + "step": 49914 + }, + { + "epoch": 3.0770075963103634, + "grad_norm": 4.72817850112915, + "learning_rate": 3.2624366177881324e-06, + "loss": 0.1847, + "step": 49915 + }, + { + "epoch": 3.0770211611502982, + "grad_norm": 3.791147470474243, + "learning_rate": 3.2622995751678776e-06, + "loss": 0.1001, + "step": 49916 + }, + { + "epoch": 3.0770347259902335, + "grad_norm": 3.482583522796631, + "learning_rate": 3.2621625325476224e-06, + "loss": 0.0982, + "step": 49917 + }, + { + "epoch": 3.0770482908301684, + "grad_norm": 4.1526780128479, + "learning_rate": 3.262025489927368e-06, + "loss": 0.1155, + "step": 49918 + }, + { + "epoch": 3.0770618556701033, + "grad_norm": 4.008365154266357, + "learning_rate": 3.2618884473071127e-06, + "loss": 0.0696, + "step": 49919 + }, + { + "epoch": 3.077075420510038, + "grad_norm": 3.0876049995422363, + "learning_rate": 3.2617514046868583e-06, + "loss": 0.0862, + "step": 49920 + }, + { + "epoch": 3.077088985349973, + "grad_norm": 5.117144584655762, + "learning_rate": 3.261614362066603e-06, + "loss": 0.1151, + "step": 49921 + }, + { + "epoch": 3.077102550189908, + "grad_norm": 4.071685791015625, + "learning_rate": 3.261477319446348e-06, + "loss": 0.0921, + "step": 49922 + }, + { + "epoch": 3.0771161150298427, + "grad_norm": 4.496025085449219, + "learning_rate": 3.261340276826093e-06, + "loss": 0.0982, + "step": 49923 + }, + { + "epoch": 3.0771296798697776, + "grad_norm": 4.0144853591918945, + "learning_rate": 3.261203234205838e-06, + "loss": 0.0719, + "step": 49924 + }, + { + "epoch": 3.0771432447097125, + "grad_norm": 3.035841226577759, + "learning_rate": 3.2610661915855834e-06, + "loss": 0.124, + "step": 49925 + }, + { + "epoch": 3.0771568095496473, + "grad_norm": 4.772952079772949, + "learning_rate": 3.260929148965328e-06, + "loss": 0.1385, + "step": 49926 + }, + { + "epoch": 3.077170374389582, + "grad_norm": 4.250277996063232, + "learning_rate": 3.2607921063450737e-06, + "loss": 0.1167, + "step": 49927 + }, + { + "epoch": 3.077183939229517, + "grad_norm": 3.9079060554504395, + "learning_rate": 3.2606550637248185e-06, + "loss": 0.1462, + "step": 49928 + }, + { + "epoch": 3.077197504069452, + "grad_norm": 3.446598768234253, + "learning_rate": 3.260518021104564e-06, + "loss": 0.0781, + "step": 49929 + }, + { + "epoch": 3.077211068909387, + "grad_norm": 4.333515167236328, + "learning_rate": 3.260380978484309e-06, + "loss": 0.0674, + "step": 49930 + }, + { + "epoch": 3.0772246337493216, + "grad_norm": 5.532262802124023, + "learning_rate": 3.260243935864054e-06, + "loss": 0.1726, + "step": 49931 + }, + { + "epoch": 3.0772381985892565, + "grad_norm": 6.010513782501221, + "learning_rate": 3.260106893243799e-06, + "loss": 0.1549, + "step": 49932 + }, + { + "epoch": 3.0772517634291914, + "grad_norm": 4.697855472564697, + "learning_rate": 3.2599698506235444e-06, + "loss": 0.0933, + "step": 49933 + }, + { + "epoch": 3.0772653282691262, + "grad_norm": 5.111359119415283, + "learning_rate": 3.259832808003289e-06, + "loss": 0.086, + "step": 49934 + }, + { + "epoch": 3.077278893109061, + "grad_norm": 4.619521141052246, + "learning_rate": 3.2596957653830343e-06, + "loss": 0.1259, + "step": 49935 + }, + { + "epoch": 3.077292457948996, + "grad_norm": 3.4038732051849365, + "learning_rate": 3.2595587227627795e-06, + "loss": 0.0467, + "step": 49936 + }, + { + "epoch": 3.0773060227889313, + "grad_norm": 4.058474063873291, + "learning_rate": 3.2594216801425243e-06, + "loss": 0.1673, + "step": 49937 + }, + { + "epoch": 3.077319587628866, + "grad_norm": 2.8358070850372314, + "learning_rate": 3.25928463752227e-06, + "loss": 0.076, + "step": 49938 + }, + { + "epoch": 3.077333152468801, + "grad_norm": 4.491872310638428, + "learning_rate": 3.2591475949020146e-06, + "loss": 0.1405, + "step": 49939 + }, + { + "epoch": 3.077346717308736, + "grad_norm": 4.35173225402832, + "learning_rate": 3.25901055228176e-06, + "loss": 0.1299, + "step": 49940 + }, + { + "epoch": 3.0773602821486707, + "grad_norm": 5.020026206970215, + "learning_rate": 3.258873509661505e-06, + "loss": 0.1238, + "step": 49941 + }, + { + "epoch": 3.0773738469886056, + "grad_norm": 3.546771764755249, + "learning_rate": 3.25873646704125e-06, + "loss": 0.0575, + "step": 49942 + }, + { + "epoch": 3.0773874118285405, + "grad_norm": 5.577152729034424, + "learning_rate": 3.258599424420995e-06, + "loss": 0.2053, + "step": 49943 + }, + { + "epoch": 3.0774009766684753, + "grad_norm": 5.591898441314697, + "learning_rate": 3.2584623818007405e-06, + "loss": 0.2125, + "step": 49944 + }, + { + "epoch": 3.07741454150841, + "grad_norm": 3.8671646118164062, + "learning_rate": 3.2583253391804853e-06, + "loss": 0.1072, + "step": 49945 + }, + { + "epoch": 3.077428106348345, + "grad_norm": 5.130497455596924, + "learning_rate": 3.258188296560231e-06, + "loss": 0.1389, + "step": 49946 + }, + { + "epoch": 3.07744167118828, + "grad_norm": 4.72212028503418, + "learning_rate": 3.2580512539399756e-06, + "loss": 0.085, + "step": 49947 + }, + { + "epoch": 3.077455236028215, + "grad_norm": 4.782510280609131, + "learning_rate": 3.257914211319721e-06, + "loss": 0.1525, + "step": 49948 + }, + { + "epoch": 3.0774688008681497, + "grad_norm": 7.338179588317871, + "learning_rate": 3.257777168699466e-06, + "loss": 0.4461, + "step": 49949 + }, + { + "epoch": 3.0774823657080845, + "grad_norm": 5.275296688079834, + "learning_rate": 3.2576401260792107e-06, + "loss": 0.1533, + "step": 49950 + }, + { + "epoch": 3.0774959305480194, + "grad_norm": 7.37385368347168, + "learning_rate": 3.257503083458956e-06, + "loss": 0.3611, + "step": 49951 + }, + { + "epoch": 3.0775094953879543, + "grad_norm": 5.853983402252197, + "learning_rate": 3.257366040838701e-06, + "loss": 0.0998, + "step": 49952 + }, + { + "epoch": 3.077523060227889, + "grad_norm": 5.555813312530518, + "learning_rate": 3.2572289982184463e-06, + "loss": 0.1416, + "step": 49953 + }, + { + "epoch": 3.077536625067824, + "grad_norm": 3.3402352333068848, + "learning_rate": 3.257091955598191e-06, + "loss": 0.0409, + "step": 49954 + }, + { + "epoch": 3.0775501899077593, + "grad_norm": 3.8500211238861084, + "learning_rate": 3.2569549129779366e-06, + "loss": 0.064, + "step": 49955 + }, + { + "epoch": 3.077563754747694, + "grad_norm": 4.562756061553955, + "learning_rate": 3.2568178703576814e-06, + "loss": 0.097, + "step": 49956 + }, + { + "epoch": 3.077577319587629, + "grad_norm": 5.85934591293335, + "learning_rate": 3.2566808277374266e-06, + "loss": 0.1449, + "step": 49957 + }, + { + "epoch": 3.077590884427564, + "grad_norm": 3.8653013706207275, + "learning_rate": 3.2565437851171717e-06, + "loss": 0.0862, + "step": 49958 + }, + { + "epoch": 3.0776044492674988, + "grad_norm": 5.348531246185303, + "learning_rate": 3.256406742496917e-06, + "loss": 0.211, + "step": 49959 + }, + { + "epoch": 3.0776180141074336, + "grad_norm": 4.47174072265625, + "learning_rate": 3.2562696998766617e-06, + "loss": 0.1797, + "step": 49960 + }, + { + "epoch": 3.0776315789473685, + "grad_norm": 4.980137348175049, + "learning_rate": 3.2561326572564073e-06, + "loss": 0.1135, + "step": 49961 + }, + { + "epoch": 3.0776451437873034, + "grad_norm": 3.9327125549316406, + "learning_rate": 3.255995614636152e-06, + "loss": 0.0751, + "step": 49962 + }, + { + "epoch": 3.0776587086272382, + "grad_norm": 3.684933662414551, + "learning_rate": 3.255858572015897e-06, + "loss": 0.0897, + "step": 49963 + }, + { + "epoch": 3.077672273467173, + "grad_norm": 5.450483798980713, + "learning_rate": 3.2557215293956424e-06, + "loss": 0.1765, + "step": 49964 + }, + { + "epoch": 3.077685838307108, + "grad_norm": 4.661901950836182, + "learning_rate": 3.255584486775387e-06, + "loss": 0.1267, + "step": 49965 + }, + { + "epoch": 3.077699403147043, + "grad_norm": 5.486534118652344, + "learning_rate": 3.2554474441551328e-06, + "loss": 0.3571, + "step": 49966 + }, + { + "epoch": 3.0777129679869777, + "grad_norm": 7.667779445648193, + "learning_rate": 3.2553104015348775e-06, + "loss": 0.2003, + "step": 49967 + }, + { + "epoch": 3.0777265328269126, + "grad_norm": 4.195274353027344, + "learning_rate": 3.2551733589146227e-06, + "loss": 0.0849, + "step": 49968 + }, + { + "epoch": 3.0777400976668474, + "grad_norm": 5.647269248962402, + "learning_rate": 3.255036316294368e-06, + "loss": 0.2056, + "step": 49969 + }, + { + "epoch": 3.0777536625067823, + "grad_norm": 4.53321647644043, + "learning_rate": 3.254899273674113e-06, + "loss": 0.1101, + "step": 49970 + }, + { + "epoch": 3.077767227346717, + "grad_norm": 7.3465118408203125, + "learning_rate": 3.254762231053858e-06, + "loss": 0.1803, + "step": 49971 + }, + { + "epoch": 3.077780792186652, + "grad_norm": 4.8423638343811035, + "learning_rate": 3.2546251884336034e-06, + "loss": 0.2096, + "step": 49972 + }, + { + "epoch": 3.077794357026587, + "grad_norm": 6.7318220138549805, + "learning_rate": 3.254488145813348e-06, + "loss": 0.1728, + "step": 49973 + }, + { + "epoch": 3.077807921866522, + "grad_norm": 4.563593864440918, + "learning_rate": 3.2543511031930938e-06, + "loss": 0.1869, + "step": 49974 + }, + { + "epoch": 3.077821486706457, + "grad_norm": 6.535475254058838, + "learning_rate": 3.2542140605728385e-06, + "loss": 0.1101, + "step": 49975 + }, + { + "epoch": 3.077835051546392, + "grad_norm": 4.110241889953613, + "learning_rate": 3.2540770179525837e-06, + "loss": 0.1954, + "step": 49976 + }, + { + "epoch": 3.077848616386327, + "grad_norm": 5.9585137367248535, + "learning_rate": 3.2539399753323284e-06, + "loss": 0.1551, + "step": 49977 + }, + { + "epoch": 3.0778621812262617, + "grad_norm": 5.3950700759887695, + "learning_rate": 3.2538029327120736e-06, + "loss": 0.1395, + "step": 49978 + }, + { + "epoch": 3.0778757460661965, + "grad_norm": 5.252823829650879, + "learning_rate": 3.253665890091819e-06, + "loss": 0.1313, + "step": 49979 + }, + { + "epoch": 3.0778893109061314, + "grad_norm": 5.150930404663086, + "learning_rate": 3.2535288474715636e-06, + "loss": 0.1704, + "step": 49980 + }, + { + "epoch": 3.0779028757460662, + "grad_norm": 5.133358478546143, + "learning_rate": 3.253391804851309e-06, + "loss": 0.1111, + "step": 49981 + }, + { + "epoch": 3.077916440586001, + "grad_norm": 4.660030364990234, + "learning_rate": 3.253254762231054e-06, + "loss": 0.1387, + "step": 49982 + }, + { + "epoch": 3.077930005425936, + "grad_norm": 5.090720176696777, + "learning_rate": 3.2531177196107995e-06, + "loss": 0.1092, + "step": 49983 + }, + { + "epoch": 3.077943570265871, + "grad_norm": 4.477046966552734, + "learning_rate": 3.2529806769905443e-06, + "loss": 0.133, + "step": 49984 + }, + { + "epoch": 3.0779571351058057, + "grad_norm": 4.016351222991943, + "learning_rate": 3.2528436343702895e-06, + "loss": 0.1657, + "step": 49985 + }, + { + "epoch": 3.0779706999457406, + "grad_norm": 4.192056179046631, + "learning_rate": 3.2527065917500346e-06, + "loss": 0.1117, + "step": 49986 + }, + { + "epoch": 3.0779842647856754, + "grad_norm": 5.704893112182617, + "learning_rate": 3.25256954912978e-06, + "loss": 0.1778, + "step": 49987 + }, + { + "epoch": 3.0779978296256103, + "grad_norm": 4.137737274169922, + "learning_rate": 3.2524325065095246e-06, + "loss": 0.1001, + "step": 49988 + }, + { + "epoch": 3.078011394465545, + "grad_norm": 4.828647613525391, + "learning_rate": 3.25229546388927e-06, + "loss": 0.1175, + "step": 49989 + }, + { + "epoch": 3.07802495930548, + "grad_norm": 6.050973415374756, + "learning_rate": 3.252158421269015e-06, + "loss": 0.2482, + "step": 49990 + }, + { + "epoch": 3.078038524145415, + "grad_norm": 5.1155242919921875, + "learning_rate": 3.2520213786487597e-06, + "loss": 0.1161, + "step": 49991 + }, + { + "epoch": 3.0780520889853498, + "grad_norm": 7.473790645599365, + "learning_rate": 3.2518843360285053e-06, + "loss": 0.2276, + "step": 49992 + }, + { + "epoch": 3.078065653825285, + "grad_norm": 5.156586647033691, + "learning_rate": 3.25174729340825e-06, + "loss": 0.1553, + "step": 49993 + }, + { + "epoch": 3.07807921866522, + "grad_norm": 3.3100011348724365, + "learning_rate": 3.2516102507879952e-06, + "loss": 0.0951, + "step": 49994 + }, + { + "epoch": 3.078092783505155, + "grad_norm": 3.4871034622192383, + "learning_rate": 3.2514732081677404e-06, + "loss": 0.0742, + "step": 49995 + }, + { + "epoch": 3.0781063483450897, + "grad_norm": 7.2066168785095215, + "learning_rate": 3.2513361655474856e-06, + "loss": 0.2506, + "step": 49996 + }, + { + "epoch": 3.0781199131850245, + "grad_norm": 6.6251606941223145, + "learning_rate": 3.2511991229272303e-06, + "loss": 0.1581, + "step": 49997 + }, + { + "epoch": 3.0781334780249594, + "grad_norm": 3.1861863136291504, + "learning_rate": 3.251062080306976e-06, + "loss": 0.0787, + "step": 49998 + }, + { + "epoch": 3.0781470428648943, + "grad_norm": 5.904118061065674, + "learning_rate": 3.2509250376867207e-06, + "loss": 0.3648, + "step": 49999 + }, + { + "epoch": 3.078160607704829, + "grad_norm": 4.087127685546875, + "learning_rate": 3.2507879950664663e-06, + "loss": 0.1199, + "step": 50000 + }, + { + "epoch": 3.078174172544764, + "grad_norm": 4.530323028564453, + "learning_rate": 3.250650952446211e-06, + "loss": 0.0877, + "step": 50001 + }, + { + "epoch": 3.078187737384699, + "grad_norm": 4.307511329650879, + "learning_rate": 3.2505139098259562e-06, + "loss": 0.1459, + "step": 50002 + }, + { + "epoch": 3.0782013022246337, + "grad_norm": 6.333501815795898, + "learning_rate": 3.2503768672057014e-06, + "loss": 0.209, + "step": 50003 + }, + { + "epoch": 3.0782148670645686, + "grad_norm": 4.377661228179932, + "learning_rate": 3.250239824585446e-06, + "loss": 0.1698, + "step": 50004 + }, + { + "epoch": 3.0782284319045035, + "grad_norm": 4.756988048553467, + "learning_rate": 3.2501027819651913e-06, + "loss": 0.1559, + "step": 50005 + }, + { + "epoch": 3.0782419967444383, + "grad_norm": 3.9219608306884766, + "learning_rate": 3.2499657393449365e-06, + "loss": 0.0635, + "step": 50006 + }, + { + "epoch": 3.078255561584373, + "grad_norm": 5.190572738647461, + "learning_rate": 3.2498286967246817e-06, + "loss": 0.1432, + "step": 50007 + }, + { + "epoch": 3.078269126424308, + "grad_norm": 5.7969818115234375, + "learning_rate": 3.2496916541044265e-06, + "loss": 0.0941, + "step": 50008 + }, + { + "epoch": 3.078282691264243, + "grad_norm": 5.985844612121582, + "learning_rate": 3.249554611484172e-06, + "loss": 0.1469, + "step": 50009 + }, + { + "epoch": 3.078296256104178, + "grad_norm": 6.578766822814941, + "learning_rate": 3.249417568863917e-06, + "loss": 0.1507, + "step": 50010 + }, + { + "epoch": 3.0783098209441127, + "grad_norm": 6.595543384552002, + "learning_rate": 3.249280526243662e-06, + "loss": 0.2043, + "step": 50011 + }, + { + "epoch": 3.078323385784048, + "grad_norm": 3.919034004211426, + "learning_rate": 3.249143483623407e-06, + "loss": 0.1361, + "step": 50012 + }, + { + "epoch": 3.078336950623983, + "grad_norm": 5.733395099639893, + "learning_rate": 3.2490064410031523e-06, + "loss": 0.1578, + "step": 50013 + }, + { + "epoch": 3.0783505154639177, + "grad_norm": 3.5689361095428467, + "learning_rate": 3.248869398382897e-06, + "loss": 0.111, + "step": 50014 + }, + { + "epoch": 3.0783640803038526, + "grad_norm": 6.318293571472168, + "learning_rate": 3.2487323557626427e-06, + "loss": 0.2132, + "step": 50015 + }, + { + "epoch": 3.0783776451437874, + "grad_norm": 5.09248161315918, + "learning_rate": 3.2485953131423875e-06, + "loss": 0.1467, + "step": 50016 + }, + { + "epoch": 3.0783912099837223, + "grad_norm": 4.886919975280762, + "learning_rate": 3.248458270522133e-06, + "loss": 0.1278, + "step": 50017 + }, + { + "epoch": 3.078404774823657, + "grad_norm": 3.0442681312561035, + "learning_rate": 3.248321227901878e-06, + "loss": 0.0854, + "step": 50018 + }, + { + "epoch": 3.078418339663592, + "grad_norm": 4.034862518310547, + "learning_rate": 3.2481841852816226e-06, + "loss": 0.1427, + "step": 50019 + }, + { + "epoch": 3.078431904503527, + "grad_norm": 3.476515293121338, + "learning_rate": 3.248047142661368e-06, + "loss": 0.0786, + "step": 50020 + }, + { + "epoch": 3.0784454693434617, + "grad_norm": 5.7637505531311035, + "learning_rate": 3.247910100041113e-06, + "loss": 0.231, + "step": 50021 + }, + { + "epoch": 3.0784590341833966, + "grad_norm": 5.052663803100586, + "learning_rate": 3.247773057420858e-06, + "loss": 0.1419, + "step": 50022 + }, + { + "epoch": 3.0784725990233315, + "grad_norm": 4.671457767486572, + "learning_rate": 3.2476360148006033e-06, + "loss": 0.1384, + "step": 50023 + }, + { + "epoch": 3.0784861638632663, + "grad_norm": 4.8368682861328125, + "learning_rate": 3.2474989721803485e-06, + "loss": 0.1603, + "step": 50024 + }, + { + "epoch": 3.078499728703201, + "grad_norm": 3.6114096641540527, + "learning_rate": 3.2473619295600932e-06, + "loss": 0.0839, + "step": 50025 + }, + { + "epoch": 3.078513293543136, + "grad_norm": 4.902982711791992, + "learning_rate": 3.247224886939839e-06, + "loss": 0.1856, + "step": 50026 + }, + { + "epoch": 3.078526858383071, + "grad_norm": 4.01646089553833, + "learning_rate": 3.2470878443195836e-06, + "loss": 0.1441, + "step": 50027 + }, + { + "epoch": 3.078540423223006, + "grad_norm": 4.496959686279297, + "learning_rate": 3.2469508016993288e-06, + "loss": 0.1217, + "step": 50028 + }, + { + "epoch": 3.0785539880629407, + "grad_norm": 5.172683238983154, + "learning_rate": 3.246813759079074e-06, + "loss": 0.0954, + "step": 50029 + }, + { + "epoch": 3.0785675529028755, + "grad_norm": 5.983489036560059, + "learning_rate": 3.246676716458819e-06, + "loss": 0.1165, + "step": 50030 + }, + { + "epoch": 3.078581117742811, + "grad_norm": 3.6965415477752686, + "learning_rate": 3.246539673838564e-06, + "loss": 0.0797, + "step": 50031 + }, + { + "epoch": 3.0785946825827457, + "grad_norm": 3.218759775161743, + "learning_rate": 3.246402631218309e-06, + "loss": 0.0879, + "step": 50032 + }, + { + "epoch": 3.0786082474226806, + "grad_norm": 4.126936912536621, + "learning_rate": 3.2462655885980542e-06, + "loss": 0.1088, + "step": 50033 + }, + { + "epoch": 3.0786218122626154, + "grad_norm": 5.87421178817749, + "learning_rate": 3.246128545977799e-06, + "loss": 0.1687, + "step": 50034 + }, + { + "epoch": 3.0786353771025503, + "grad_norm": 3.7892420291900635, + "learning_rate": 3.2459915033575446e-06, + "loss": 0.1084, + "step": 50035 + }, + { + "epoch": 3.078648941942485, + "grad_norm": 4.6010847091674805, + "learning_rate": 3.2458544607372893e-06, + "loss": 0.116, + "step": 50036 + }, + { + "epoch": 3.07866250678242, + "grad_norm": 5.581413269042969, + "learning_rate": 3.245717418117035e-06, + "loss": 0.1684, + "step": 50037 + }, + { + "epoch": 3.078676071622355, + "grad_norm": 3.4927361011505127, + "learning_rate": 3.2455803754967797e-06, + "loss": 0.0779, + "step": 50038 + }, + { + "epoch": 3.0786896364622898, + "grad_norm": 4.970440864562988, + "learning_rate": 3.245443332876525e-06, + "loss": 0.2108, + "step": 50039 + }, + { + "epoch": 3.0787032013022246, + "grad_norm": 3.223137617111206, + "learning_rate": 3.24530629025627e-06, + "loss": 0.1008, + "step": 50040 + }, + { + "epoch": 3.0787167661421595, + "grad_norm": 4.541026592254639, + "learning_rate": 3.2451692476360152e-06, + "loss": 0.1431, + "step": 50041 + }, + { + "epoch": 3.0787303309820944, + "grad_norm": 5.419956207275391, + "learning_rate": 3.24503220501576e-06, + "loss": 0.1157, + "step": 50042 + }, + { + "epoch": 3.0787438958220292, + "grad_norm": 4.411306858062744, + "learning_rate": 3.2448951623955056e-06, + "loss": 0.1385, + "step": 50043 + }, + { + "epoch": 3.078757460661964, + "grad_norm": 3.8102025985717773, + "learning_rate": 3.2447581197752504e-06, + "loss": 0.0764, + "step": 50044 + }, + { + "epoch": 3.078771025501899, + "grad_norm": 3.223977565765381, + "learning_rate": 3.244621077154996e-06, + "loss": 0.0641, + "step": 50045 + }, + { + "epoch": 3.078784590341834, + "grad_norm": 4.413539886474609, + "learning_rate": 3.2444840345347407e-06, + "loss": 0.1158, + "step": 50046 + }, + { + "epoch": 3.0787981551817687, + "grad_norm": 6.2384934425354, + "learning_rate": 3.2443469919144855e-06, + "loss": 0.2312, + "step": 50047 + }, + { + "epoch": 3.0788117200217036, + "grad_norm": 4.025998592376709, + "learning_rate": 3.2442099492942306e-06, + "loss": 0.1428, + "step": 50048 + }, + { + "epoch": 3.0788252848616384, + "grad_norm": 3.8963472843170166, + "learning_rate": 3.244072906673976e-06, + "loss": 0.1035, + "step": 50049 + }, + { + "epoch": 3.0788388497015737, + "grad_norm": 5.597774505615234, + "learning_rate": 3.243935864053721e-06, + "loss": 0.1618, + "step": 50050 + }, + { + "epoch": 3.0788524145415086, + "grad_norm": 3.3401641845703125, + "learning_rate": 3.2437988214334658e-06, + "loss": 0.0777, + "step": 50051 + }, + { + "epoch": 3.0788659793814435, + "grad_norm": 3.608903408050537, + "learning_rate": 3.2436617788132114e-06, + "loss": 0.0896, + "step": 50052 + }, + { + "epoch": 3.0788795442213783, + "grad_norm": 3.5673115253448486, + "learning_rate": 3.243524736192956e-06, + "loss": 0.1001, + "step": 50053 + }, + { + "epoch": 3.078893109061313, + "grad_norm": 3.5816094875335693, + "learning_rate": 3.2433876935727017e-06, + "loss": 0.061, + "step": 50054 + }, + { + "epoch": 3.078906673901248, + "grad_norm": 3.785475254058838, + "learning_rate": 3.2432506509524465e-06, + "loss": 0.0933, + "step": 50055 + }, + { + "epoch": 3.078920238741183, + "grad_norm": 3.6492574214935303, + "learning_rate": 3.2431136083321917e-06, + "loss": 0.1701, + "step": 50056 + }, + { + "epoch": 3.078933803581118, + "grad_norm": 5.359621047973633, + "learning_rate": 3.242976565711937e-06, + "loss": 0.1333, + "step": 50057 + }, + { + "epoch": 3.0789473684210527, + "grad_norm": 4.8041157722473145, + "learning_rate": 3.242839523091682e-06, + "loss": 0.1174, + "step": 50058 + }, + { + "epoch": 3.0789609332609875, + "grad_norm": 4.150110244750977, + "learning_rate": 3.2427024804714268e-06, + "loss": 0.075, + "step": 50059 + }, + { + "epoch": 3.0789744981009224, + "grad_norm": 3.8014118671417236, + "learning_rate": 3.2425654378511715e-06, + "loss": 0.1387, + "step": 50060 + }, + { + "epoch": 3.0789880629408573, + "grad_norm": 5.132997989654541, + "learning_rate": 3.242428395230917e-06, + "loss": 0.2252, + "step": 50061 + }, + { + "epoch": 3.079001627780792, + "grad_norm": 3.36092209815979, + "learning_rate": 3.242291352610662e-06, + "loss": 0.072, + "step": 50062 + }, + { + "epoch": 3.079015192620727, + "grad_norm": 3.6587257385253906, + "learning_rate": 3.2421543099904075e-06, + "loss": 0.06, + "step": 50063 + }, + { + "epoch": 3.079028757460662, + "grad_norm": 5.43681526184082, + "learning_rate": 3.2420172673701522e-06, + "loss": 0.093, + "step": 50064 + }, + { + "epoch": 3.0790423223005967, + "grad_norm": 5.539815902709961, + "learning_rate": 3.2418802247498974e-06, + "loss": 0.1124, + "step": 50065 + }, + { + "epoch": 3.0790558871405316, + "grad_norm": 3.508387327194214, + "learning_rate": 3.2417431821296426e-06, + "loss": 0.128, + "step": 50066 + }, + { + "epoch": 3.0790694519804664, + "grad_norm": 4.095484256744385, + "learning_rate": 3.2416061395093878e-06, + "loss": 0.1338, + "step": 50067 + }, + { + "epoch": 3.0790830168204013, + "grad_norm": 5.449358940124512, + "learning_rate": 3.2414690968891325e-06, + "loss": 0.1349, + "step": 50068 + }, + { + "epoch": 3.0790965816603366, + "grad_norm": 3.4929306507110596, + "learning_rate": 3.241332054268878e-06, + "loss": 0.1219, + "step": 50069 + }, + { + "epoch": 3.0791101465002715, + "grad_norm": 3.6315183639526367, + "learning_rate": 3.241195011648623e-06, + "loss": 0.0961, + "step": 50070 + }, + { + "epoch": 3.0791237113402063, + "grad_norm": 4.075375556945801, + "learning_rate": 3.2410579690283685e-06, + "loss": 0.1306, + "step": 50071 + }, + { + "epoch": 3.079137276180141, + "grad_norm": 4.609992980957031, + "learning_rate": 3.2409209264081132e-06, + "loss": 0.0659, + "step": 50072 + }, + { + "epoch": 3.079150841020076, + "grad_norm": 4.392221927642822, + "learning_rate": 3.240783883787858e-06, + "loss": 0.1411, + "step": 50073 + }, + { + "epoch": 3.079164405860011, + "grad_norm": 5.40377950668335, + "learning_rate": 3.2406468411676036e-06, + "loss": 0.1628, + "step": 50074 + }, + { + "epoch": 3.079177970699946, + "grad_norm": 4.251143932342529, + "learning_rate": 3.2405097985473484e-06, + "loss": 0.1518, + "step": 50075 + }, + { + "epoch": 3.0791915355398807, + "grad_norm": 4.468515872955322, + "learning_rate": 3.2403727559270935e-06, + "loss": 0.117, + "step": 50076 + }, + { + "epoch": 3.0792051003798155, + "grad_norm": 4.029988765716553, + "learning_rate": 3.2402357133068387e-06, + "loss": 0.0976, + "step": 50077 + }, + { + "epoch": 3.0792186652197504, + "grad_norm": 4.184288501739502, + "learning_rate": 3.240098670686584e-06, + "loss": 0.1121, + "step": 50078 + }, + { + "epoch": 3.0792322300596853, + "grad_norm": 4.120193958282471, + "learning_rate": 3.2399616280663286e-06, + "loss": 0.1004, + "step": 50079 + }, + { + "epoch": 3.07924579489962, + "grad_norm": 2.9552183151245117, + "learning_rate": 3.2398245854460742e-06, + "loss": 0.0884, + "step": 50080 + }, + { + "epoch": 3.079259359739555, + "grad_norm": 5.393067359924316, + "learning_rate": 3.239687542825819e-06, + "loss": 0.0762, + "step": 50081 + }, + { + "epoch": 3.07927292457949, + "grad_norm": 4.113245010375977, + "learning_rate": 3.239550500205564e-06, + "loss": 0.1367, + "step": 50082 + }, + { + "epoch": 3.0792864894194247, + "grad_norm": 4.857727527618408, + "learning_rate": 3.2394134575853094e-06, + "loss": 0.1249, + "step": 50083 + }, + { + "epoch": 3.0793000542593596, + "grad_norm": 4.663201332092285, + "learning_rate": 3.2392764149650545e-06, + "loss": 0.1277, + "step": 50084 + }, + { + "epoch": 3.0793136190992945, + "grad_norm": 5.952507019042969, + "learning_rate": 3.2391393723447993e-06, + "loss": 0.1433, + "step": 50085 + }, + { + "epoch": 3.0793271839392293, + "grad_norm": 5.0222697257995605, + "learning_rate": 3.239002329724545e-06, + "loss": 0.1881, + "step": 50086 + }, + { + "epoch": 3.079340748779164, + "grad_norm": 4.695916175842285, + "learning_rate": 3.2388652871042897e-06, + "loss": 0.1202, + "step": 50087 + }, + { + "epoch": 3.0793543136190995, + "grad_norm": 4.7380499839782715, + "learning_rate": 3.2387282444840344e-06, + "loss": 0.0891, + "step": 50088 + }, + { + "epoch": 3.0793678784590344, + "grad_norm": 5.715705871582031, + "learning_rate": 3.23859120186378e-06, + "loss": 0.2229, + "step": 50089 + }, + { + "epoch": 3.0793814432989692, + "grad_norm": 5.53567361831665, + "learning_rate": 3.2384541592435248e-06, + "loss": 0.1503, + "step": 50090 + }, + { + "epoch": 3.079395008138904, + "grad_norm": 5.5378007888793945, + "learning_rate": 3.2383171166232704e-06, + "loss": 0.1863, + "step": 50091 + }, + { + "epoch": 3.079408572978839, + "grad_norm": 4.385624408721924, + "learning_rate": 3.238180074003015e-06, + "loss": 0.1237, + "step": 50092 + }, + { + "epoch": 3.079422137818774, + "grad_norm": 3.9675252437591553, + "learning_rate": 3.2380430313827603e-06, + "loss": 0.0986, + "step": 50093 + }, + { + "epoch": 3.0794357026587087, + "grad_norm": 2.903616428375244, + "learning_rate": 3.2379059887625055e-06, + "loss": 0.0441, + "step": 50094 + }, + { + "epoch": 3.0794492674986436, + "grad_norm": 5.323693752288818, + "learning_rate": 3.2377689461422507e-06, + "loss": 0.1652, + "step": 50095 + }, + { + "epoch": 3.0794628323385784, + "grad_norm": 4.875805377960205, + "learning_rate": 3.2376319035219954e-06, + "loss": 0.1578, + "step": 50096 + }, + { + "epoch": 3.0794763971785133, + "grad_norm": 3.3671979904174805, + "learning_rate": 3.237494860901741e-06, + "loss": 0.058, + "step": 50097 + }, + { + "epoch": 3.079489962018448, + "grad_norm": 3.6651480197906494, + "learning_rate": 3.2373578182814858e-06, + "loss": 0.0784, + "step": 50098 + }, + { + "epoch": 3.079503526858383, + "grad_norm": 3.1929378509521484, + "learning_rate": 3.237220775661231e-06, + "loss": 0.0765, + "step": 50099 + }, + { + "epoch": 3.079517091698318, + "grad_norm": 6.036241054534912, + "learning_rate": 3.237083733040976e-06, + "loss": 0.2022, + "step": 50100 + }, + { + "epoch": 3.0795306565382528, + "grad_norm": 7.012574672698975, + "learning_rate": 3.236946690420721e-06, + "loss": 0.1366, + "step": 50101 + }, + { + "epoch": 3.0795442213781876, + "grad_norm": 3.7562415599823, + "learning_rate": 3.236809647800466e-06, + "loss": 0.0838, + "step": 50102 + }, + { + "epoch": 3.0795577862181225, + "grad_norm": 2.7891006469726562, + "learning_rate": 3.2366726051802112e-06, + "loss": 0.0681, + "step": 50103 + }, + { + "epoch": 3.0795713510580573, + "grad_norm": 2.5422892570495605, + "learning_rate": 3.2365355625599564e-06, + "loss": 0.0345, + "step": 50104 + }, + { + "epoch": 3.079584915897992, + "grad_norm": 5.0451226234436035, + "learning_rate": 3.236398519939701e-06, + "loss": 0.1137, + "step": 50105 + }, + { + "epoch": 3.079598480737927, + "grad_norm": 4.005371570587158, + "learning_rate": 3.2362614773194468e-06, + "loss": 0.1223, + "step": 50106 + }, + { + "epoch": 3.0796120455778624, + "grad_norm": 5.653090000152588, + "learning_rate": 3.2361244346991915e-06, + "loss": 0.1645, + "step": 50107 + }, + { + "epoch": 3.0796256104177973, + "grad_norm": 4.120436668395996, + "learning_rate": 3.235987392078937e-06, + "loss": 0.1002, + "step": 50108 + }, + { + "epoch": 3.079639175257732, + "grad_norm": 6.763566017150879, + "learning_rate": 3.235850349458682e-06, + "loss": 0.1609, + "step": 50109 + }, + { + "epoch": 3.079652740097667, + "grad_norm": 3.6437411308288574, + "learning_rate": 3.235713306838427e-06, + "loss": 0.0892, + "step": 50110 + }, + { + "epoch": 3.079666304937602, + "grad_norm": 3.3671951293945312, + "learning_rate": 3.2355762642181723e-06, + "loss": 0.126, + "step": 50111 + }, + { + "epoch": 3.0796798697775367, + "grad_norm": 4.901975631713867, + "learning_rate": 3.2354392215979174e-06, + "loss": 0.1394, + "step": 50112 + }, + { + "epoch": 3.0796934346174716, + "grad_norm": 2.736299991607666, + "learning_rate": 3.235302178977662e-06, + "loss": 0.0405, + "step": 50113 + }, + { + "epoch": 3.0797069994574064, + "grad_norm": 3.814084053039551, + "learning_rate": 3.235165136357407e-06, + "loss": 0.0904, + "step": 50114 + }, + { + "epoch": 3.0797205642973413, + "grad_norm": 3.6887171268463135, + "learning_rate": 3.2350280937371525e-06, + "loss": 0.0902, + "step": 50115 + }, + { + "epoch": 3.079734129137276, + "grad_norm": 3.3823277950286865, + "learning_rate": 3.2348910511168973e-06, + "loss": 0.0789, + "step": 50116 + }, + { + "epoch": 3.079747693977211, + "grad_norm": 4.549580097198486, + "learning_rate": 3.234754008496643e-06, + "loss": 0.0965, + "step": 50117 + }, + { + "epoch": 3.079761258817146, + "grad_norm": 2.965975522994995, + "learning_rate": 3.2346169658763877e-06, + "loss": 0.0721, + "step": 50118 + }, + { + "epoch": 3.0797748236570808, + "grad_norm": 4.3515119552612305, + "learning_rate": 3.234479923256133e-06, + "loss": 0.1067, + "step": 50119 + }, + { + "epoch": 3.0797883884970156, + "grad_norm": 3.9759037494659424, + "learning_rate": 3.234342880635878e-06, + "loss": 0.1004, + "step": 50120 + }, + { + "epoch": 3.0798019533369505, + "grad_norm": 3.902985095977783, + "learning_rate": 3.234205838015623e-06, + "loss": 0.1212, + "step": 50121 + }, + { + "epoch": 3.0798155181768854, + "grad_norm": 3.683197259902954, + "learning_rate": 3.234068795395368e-06, + "loss": 0.1087, + "step": 50122 + }, + { + "epoch": 3.0798290830168202, + "grad_norm": 2.3385095596313477, + "learning_rate": 3.2339317527751136e-06, + "loss": 0.0918, + "step": 50123 + }, + { + "epoch": 3.079842647856755, + "grad_norm": 3.5513744354248047, + "learning_rate": 3.2337947101548583e-06, + "loss": 0.0794, + "step": 50124 + }, + { + "epoch": 3.07985621269669, + "grad_norm": 4.132396221160889, + "learning_rate": 3.233657667534604e-06, + "loss": 0.1003, + "step": 50125 + }, + { + "epoch": 3.0798697775366253, + "grad_norm": 4.562742710113525, + "learning_rate": 3.2335206249143487e-06, + "loss": 0.0861, + "step": 50126 + }, + { + "epoch": 3.07988334237656, + "grad_norm": 2.2994496822357178, + "learning_rate": 3.233383582294094e-06, + "loss": 0.0442, + "step": 50127 + }, + { + "epoch": 3.079896907216495, + "grad_norm": 2.961712598800659, + "learning_rate": 3.233246539673839e-06, + "loss": 0.0697, + "step": 50128 + }, + { + "epoch": 3.07991047205643, + "grad_norm": 4.178286552429199, + "learning_rate": 3.2331094970535838e-06, + "loss": 0.1223, + "step": 50129 + }, + { + "epoch": 3.0799240368963647, + "grad_norm": 2.2925469875335693, + "learning_rate": 3.232972454433329e-06, + "loss": 0.0579, + "step": 50130 + }, + { + "epoch": 3.0799376017362996, + "grad_norm": 4.294524669647217, + "learning_rate": 3.2328354118130737e-06, + "loss": 0.0992, + "step": 50131 + }, + { + "epoch": 3.0799511665762345, + "grad_norm": 2.7817506790161133, + "learning_rate": 3.2326983691928193e-06, + "loss": 0.0631, + "step": 50132 + }, + { + "epoch": 3.0799647314161693, + "grad_norm": 3.8800318241119385, + "learning_rate": 3.232561326572564e-06, + "loss": 0.112, + "step": 50133 + }, + { + "epoch": 3.079978296256104, + "grad_norm": 4.1863555908203125, + "learning_rate": 3.2324242839523097e-06, + "loss": 0.0728, + "step": 50134 + }, + { + "epoch": 3.079991861096039, + "grad_norm": 2.9704883098602295, + "learning_rate": 3.2322872413320544e-06, + "loss": 0.0742, + "step": 50135 + }, + { + "epoch": 3.080005425935974, + "grad_norm": 2.712690591812134, + "learning_rate": 3.2321501987117996e-06, + "loss": 0.0596, + "step": 50136 + }, + { + "epoch": 3.080018990775909, + "grad_norm": 3.2939646244049072, + "learning_rate": 3.2320131560915448e-06, + "loss": 0.0929, + "step": 50137 + }, + { + "epoch": 3.0800325556158437, + "grad_norm": 2.529954433441162, + "learning_rate": 3.23187611347129e-06, + "loss": 0.0771, + "step": 50138 + }, + { + "epoch": 3.0800461204557785, + "grad_norm": 3.4505720138549805, + "learning_rate": 3.2317390708510347e-06, + "loss": 0.0788, + "step": 50139 + }, + { + "epoch": 3.0800596852957134, + "grad_norm": 3.1559600830078125, + "learning_rate": 3.2316020282307803e-06, + "loss": 0.0483, + "step": 50140 + }, + { + "epoch": 3.0800732501356483, + "grad_norm": 3.6187853813171387, + "learning_rate": 3.231464985610525e-06, + "loss": 0.0644, + "step": 50141 + }, + { + "epoch": 3.080086814975583, + "grad_norm": 2.8913414478302, + "learning_rate": 3.23132794299027e-06, + "loss": 0.0696, + "step": 50142 + }, + { + "epoch": 3.080100379815518, + "grad_norm": 3.4230356216430664, + "learning_rate": 3.2311909003700154e-06, + "loss": 0.1035, + "step": 50143 + }, + { + "epoch": 3.080113944655453, + "grad_norm": 4.741894245147705, + "learning_rate": 3.23105385774976e-06, + "loss": 0.1631, + "step": 50144 + }, + { + "epoch": 3.080127509495388, + "grad_norm": 2.9933457374572754, + "learning_rate": 3.230916815129506e-06, + "loss": 0.077, + "step": 50145 + }, + { + "epoch": 3.080141074335323, + "grad_norm": 3.755859136581421, + "learning_rate": 3.2307797725092506e-06, + "loss": 0.0627, + "step": 50146 + }, + { + "epoch": 3.080154639175258, + "grad_norm": 5.1912841796875, + "learning_rate": 3.2306427298889957e-06, + "loss": 0.0979, + "step": 50147 + }, + { + "epoch": 3.0801682040151928, + "grad_norm": 4.246581554412842, + "learning_rate": 3.230505687268741e-06, + "loss": 0.1003, + "step": 50148 + }, + { + "epoch": 3.0801817688551276, + "grad_norm": 3.2912912368774414, + "learning_rate": 3.230368644648486e-06, + "loss": 0.0591, + "step": 50149 + }, + { + "epoch": 3.0801953336950625, + "grad_norm": 2.814958095550537, + "learning_rate": 3.230231602028231e-06, + "loss": 0.0567, + "step": 50150 + }, + { + "epoch": 3.0802088985349974, + "grad_norm": 4.111283302307129, + "learning_rate": 3.2300945594079764e-06, + "loss": 0.1835, + "step": 50151 + }, + { + "epoch": 3.080222463374932, + "grad_norm": 2.947831630706787, + "learning_rate": 3.229957516787721e-06, + "loss": 0.1108, + "step": 50152 + }, + { + "epoch": 3.080236028214867, + "grad_norm": 3.2907416820526123, + "learning_rate": 3.2298204741674664e-06, + "loss": 0.0778, + "step": 50153 + }, + { + "epoch": 3.080249593054802, + "grad_norm": 3.6767783164978027, + "learning_rate": 3.2296834315472116e-06, + "loss": 0.1256, + "step": 50154 + }, + { + "epoch": 3.080263157894737, + "grad_norm": 3.940131902694702, + "learning_rate": 3.2295463889269567e-06, + "loss": 0.117, + "step": 50155 + }, + { + "epoch": 3.0802767227346717, + "grad_norm": 2.7368710041046143, + "learning_rate": 3.2294093463067015e-06, + "loss": 0.0961, + "step": 50156 + }, + { + "epoch": 3.0802902875746065, + "grad_norm": 3.397085428237915, + "learning_rate": 3.2292723036864467e-06, + "loss": 0.1092, + "step": 50157 + }, + { + "epoch": 3.0803038524145414, + "grad_norm": 2.931638479232788, + "learning_rate": 3.229135261066192e-06, + "loss": 0.0462, + "step": 50158 + }, + { + "epoch": 3.0803174172544763, + "grad_norm": 4.153409957885742, + "learning_rate": 3.2289982184459366e-06, + "loss": 0.1617, + "step": 50159 + }, + { + "epoch": 3.080330982094411, + "grad_norm": 4.59157133102417, + "learning_rate": 3.228861175825682e-06, + "loss": 0.077, + "step": 50160 + }, + { + "epoch": 3.080344546934346, + "grad_norm": 3.5415945053100586, + "learning_rate": 3.228724133205427e-06, + "loss": 0.1194, + "step": 50161 + }, + { + "epoch": 3.080358111774281, + "grad_norm": 2.4258534908294678, + "learning_rate": 3.2285870905851726e-06, + "loss": 0.0511, + "step": 50162 + }, + { + "epoch": 3.0803716766142157, + "grad_norm": 2.8483996391296387, + "learning_rate": 3.2284500479649173e-06, + "loss": 0.0465, + "step": 50163 + }, + { + "epoch": 3.080385241454151, + "grad_norm": 3.8683133125305176, + "learning_rate": 3.2283130053446625e-06, + "loss": 0.0876, + "step": 50164 + }, + { + "epoch": 3.080398806294086, + "grad_norm": 3.8945674896240234, + "learning_rate": 3.2281759627244077e-06, + "loss": 0.1212, + "step": 50165 + }, + { + "epoch": 3.0804123711340208, + "grad_norm": 3.6934707164764404, + "learning_rate": 3.228038920104153e-06, + "loss": 0.1006, + "step": 50166 + }, + { + "epoch": 3.0804259359739556, + "grad_norm": 4.771878242492676, + "learning_rate": 3.2279018774838976e-06, + "loss": 0.1, + "step": 50167 + }, + { + "epoch": 3.0804395008138905, + "grad_norm": 2.91127610206604, + "learning_rate": 3.2277648348636432e-06, + "loss": 0.0917, + "step": 50168 + }, + { + "epoch": 3.0804530656538254, + "grad_norm": 3.9348087310791016, + "learning_rate": 3.227627792243388e-06, + "loss": 0.0908, + "step": 50169 + }, + { + "epoch": 3.0804666304937602, + "grad_norm": 3.5357577800750732, + "learning_rate": 3.2274907496231327e-06, + "loss": 0.0714, + "step": 50170 + }, + { + "epoch": 3.080480195333695, + "grad_norm": 5.967251777648926, + "learning_rate": 3.2273537070028783e-06, + "loss": 0.1552, + "step": 50171 + }, + { + "epoch": 3.08049376017363, + "grad_norm": 3.2645092010498047, + "learning_rate": 3.227216664382623e-06, + "loss": 0.0526, + "step": 50172 + }, + { + "epoch": 3.080507325013565, + "grad_norm": 5.54518985748291, + "learning_rate": 3.2270796217623683e-06, + "loss": 0.1358, + "step": 50173 + }, + { + "epoch": 3.0805208898534997, + "grad_norm": 3.7478272914886475, + "learning_rate": 3.2269425791421134e-06, + "loss": 0.0669, + "step": 50174 + }, + { + "epoch": 3.0805344546934346, + "grad_norm": 3.413978338241577, + "learning_rate": 3.2268055365218586e-06, + "loss": 0.0812, + "step": 50175 + }, + { + "epoch": 3.0805480195333694, + "grad_norm": 4.188338279724121, + "learning_rate": 3.2266684939016034e-06, + "loss": 0.1127, + "step": 50176 + }, + { + "epoch": 3.0805615843733043, + "grad_norm": 4.8655242919921875, + "learning_rate": 3.226531451281349e-06, + "loss": 0.1082, + "step": 50177 + }, + { + "epoch": 3.080575149213239, + "grad_norm": 6.771101474761963, + "learning_rate": 3.2263944086610937e-06, + "loss": 0.1219, + "step": 50178 + }, + { + "epoch": 3.080588714053174, + "grad_norm": 3.456406593322754, + "learning_rate": 3.2262573660408393e-06, + "loss": 0.0959, + "step": 50179 + }, + { + "epoch": 3.080602278893109, + "grad_norm": 4.883218288421631, + "learning_rate": 3.226120323420584e-06, + "loss": 0.1026, + "step": 50180 + }, + { + "epoch": 3.0806158437330438, + "grad_norm": 3.332918882369995, + "learning_rate": 3.2259832808003293e-06, + "loss": 0.0448, + "step": 50181 + }, + { + "epoch": 3.080629408572979, + "grad_norm": 6.906566619873047, + "learning_rate": 3.2258462381800744e-06, + "loss": 0.1517, + "step": 50182 + }, + { + "epoch": 3.080642973412914, + "grad_norm": 3.7883903980255127, + "learning_rate": 3.225709195559819e-06, + "loss": 0.0903, + "step": 50183 + }, + { + "epoch": 3.080656538252849, + "grad_norm": 4.72249174118042, + "learning_rate": 3.2255721529395644e-06, + "loss": 0.1283, + "step": 50184 + }, + { + "epoch": 3.0806701030927837, + "grad_norm": 5.48521089553833, + "learning_rate": 3.225435110319309e-06, + "loss": 0.1075, + "step": 50185 + }, + { + "epoch": 3.0806836679327185, + "grad_norm": 3.101038694381714, + "learning_rate": 3.2252980676990547e-06, + "loss": 0.0808, + "step": 50186 + }, + { + "epoch": 3.0806972327726534, + "grad_norm": 3.607665538787842, + "learning_rate": 3.2251610250787995e-06, + "loss": 0.0829, + "step": 50187 + }, + { + "epoch": 3.0807107976125883, + "grad_norm": 5.4603095054626465, + "learning_rate": 3.225023982458545e-06, + "loss": 0.1397, + "step": 50188 + }, + { + "epoch": 3.080724362452523, + "grad_norm": 4.477492809295654, + "learning_rate": 3.22488693983829e-06, + "loss": 0.1224, + "step": 50189 + }, + { + "epoch": 3.080737927292458, + "grad_norm": 3.345236301422119, + "learning_rate": 3.224749897218035e-06, + "loss": 0.0865, + "step": 50190 + }, + { + "epoch": 3.080751492132393, + "grad_norm": 3.3641936779022217, + "learning_rate": 3.2246128545977802e-06, + "loss": 0.0701, + "step": 50191 + }, + { + "epoch": 3.0807650569723277, + "grad_norm": 3.275688648223877, + "learning_rate": 3.2244758119775254e-06, + "loss": 0.0594, + "step": 50192 + }, + { + "epoch": 3.0807786218122626, + "grad_norm": 5.202935695648193, + "learning_rate": 3.22433876935727e-06, + "loss": 0.1128, + "step": 50193 + }, + { + "epoch": 3.0807921866521975, + "grad_norm": 4.246278762817383, + "learning_rate": 3.2242017267370157e-06, + "loss": 0.0679, + "step": 50194 + }, + { + "epoch": 3.0808057514921323, + "grad_norm": 3.6077933311462402, + "learning_rate": 3.2240646841167605e-06, + "loss": 0.0591, + "step": 50195 + }, + { + "epoch": 3.080819316332067, + "grad_norm": 3.8094873428344727, + "learning_rate": 3.223927641496506e-06, + "loss": 0.08, + "step": 50196 + }, + { + "epoch": 3.080832881172002, + "grad_norm": 3.9334537982940674, + "learning_rate": 3.223790598876251e-06, + "loss": 0.0891, + "step": 50197 + }, + { + "epoch": 3.080846446011937, + "grad_norm": 4.907464504241943, + "learning_rate": 3.2236535562559956e-06, + "loss": 0.0887, + "step": 50198 + }, + { + "epoch": 3.0808600108518718, + "grad_norm": 3.5764405727386475, + "learning_rate": 3.2235165136357412e-06, + "loss": 0.0899, + "step": 50199 + }, + { + "epoch": 3.0808735756918066, + "grad_norm": 5.498872756958008, + "learning_rate": 3.223379471015486e-06, + "loss": 0.0936, + "step": 50200 + }, + { + "epoch": 3.0808871405317415, + "grad_norm": 6.355682373046875, + "learning_rate": 3.223242428395231e-06, + "loss": 0.1628, + "step": 50201 + }, + { + "epoch": 3.080900705371677, + "grad_norm": 4.241909027099609, + "learning_rate": 3.223105385774976e-06, + "loss": 0.08, + "step": 50202 + }, + { + "epoch": 3.0809142702116117, + "grad_norm": 3.554394245147705, + "learning_rate": 3.2229683431547215e-06, + "loss": 0.0971, + "step": 50203 + }, + { + "epoch": 3.0809278350515465, + "grad_norm": 3.499601364135742, + "learning_rate": 3.2228313005344663e-06, + "loss": 0.0826, + "step": 50204 + }, + { + "epoch": 3.0809413998914814, + "grad_norm": 3.9861087799072266, + "learning_rate": 3.222694257914212e-06, + "loss": 0.1107, + "step": 50205 + }, + { + "epoch": 3.0809549647314163, + "grad_norm": 2.9169864654541016, + "learning_rate": 3.2225572152939566e-06, + "loss": 0.0631, + "step": 50206 + }, + { + "epoch": 3.080968529571351, + "grad_norm": 3.0271682739257812, + "learning_rate": 3.222420172673702e-06, + "loss": 0.0685, + "step": 50207 + }, + { + "epoch": 3.080982094411286, + "grad_norm": 4.693615913391113, + "learning_rate": 3.222283130053447e-06, + "loss": 0.1517, + "step": 50208 + }, + { + "epoch": 3.080995659251221, + "grad_norm": 2.5836312770843506, + "learning_rate": 3.222146087433192e-06, + "loss": 0.0543, + "step": 50209 + }, + { + "epoch": 3.0810092240911557, + "grad_norm": 4.785449028015137, + "learning_rate": 3.222009044812937e-06, + "loss": 0.0964, + "step": 50210 + }, + { + "epoch": 3.0810227889310906, + "grad_norm": 6.701879024505615, + "learning_rate": 3.221872002192682e-06, + "loss": 0.1557, + "step": 50211 + }, + { + "epoch": 3.0810363537710255, + "grad_norm": 3.68782114982605, + "learning_rate": 3.2217349595724273e-06, + "loss": 0.0606, + "step": 50212 + }, + { + "epoch": 3.0810499186109603, + "grad_norm": 3.9763739109039307, + "learning_rate": 3.221597916952172e-06, + "loss": 0.1324, + "step": 50213 + }, + { + "epoch": 3.081063483450895, + "grad_norm": 3.734766960144043, + "learning_rate": 3.2214608743319176e-06, + "loss": 0.1535, + "step": 50214 + }, + { + "epoch": 3.08107704829083, + "grad_norm": 4.135337829589844, + "learning_rate": 3.2213238317116624e-06, + "loss": 0.113, + "step": 50215 + }, + { + "epoch": 3.081090613130765, + "grad_norm": 4.913413047790527, + "learning_rate": 3.221186789091408e-06, + "loss": 0.1628, + "step": 50216 + }, + { + "epoch": 3.0811041779707, + "grad_norm": 2.597024917602539, + "learning_rate": 3.2210497464711527e-06, + "loss": 0.0431, + "step": 50217 + }, + { + "epoch": 3.0811177428106347, + "grad_norm": 5.9268341064453125, + "learning_rate": 3.220912703850898e-06, + "loss": 0.192, + "step": 50218 + }, + { + "epoch": 3.0811313076505695, + "grad_norm": 3.8980727195739746, + "learning_rate": 3.220775661230643e-06, + "loss": 0.0611, + "step": 50219 + }, + { + "epoch": 3.081144872490505, + "grad_norm": 2.844724655151367, + "learning_rate": 3.2206386186103883e-06, + "loss": 0.052, + "step": 50220 + }, + { + "epoch": 3.0811584373304397, + "grad_norm": 5.63442325592041, + "learning_rate": 3.220501575990133e-06, + "loss": 0.0917, + "step": 50221 + }, + { + "epoch": 3.0811720021703746, + "grad_norm": 5.210987567901611, + "learning_rate": 3.2203645333698786e-06, + "loss": 0.1826, + "step": 50222 + }, + { + "epoch": 3.0811855670103094, + "grad_norm": 3.2741189002990723, + "learning_rate": 3.2202274907496234e-06, + "loss": 0.0807, + "step": 50223 + }, + { + "epoch": 3.0811991318502443, + "grad_norm": 4.772363662719727, + "learning_rate": 3.2200904481293686e-06, + "loss": 0.1337, + "step": 50224 + }, + { + "epoch": 3.081212696690179, + "grad_norm": 3.542612314224243, + "learning_rate": 3.2199534055091138e-06, + "loss": 0.0847, + "step": 50225 + }, + { + "epoch": 3.081226261530114, + "grad_norm": 4.507958889007568, + "learning_rate": 3.2198163628888585e-06, + "loss": 0.0442, + "step": 50226 + }, + { + "epoch": 3.081239826370049, + "grad_norm": 3.1804513931274414, + "learning_rate": 3.2196793202686037e-06, + "loss": 0.1093, + "step": 50227 + }, + { + "epoch": 3.0812533912099838, + "grad_norm": 4.015318393707275, + "learning_rate": 3.219542277648349e-06, + "loss": 0.1094, + "step": 50228 + }, + { + "epoch": 3.0812669560499186, + "grad_norm": 3.326866865158081, + "learning_rate": 3.219405235028094e-06, + "loss": 0.109, + "step": 50229 + }, + { + "epoch": 3.0812805208898535, + "grad_norm": 5.68398904800415, + "learning_rate": 3.219268192407839e-06, + "loss": 0.1506, + "step": 50230 + }, + { + "epoch": 3.0812940857297884, + "grad_norm": 4.222694396972656, + "learning_rate": 3.2191311497875844e-06, + "loss": 0.1084, + "step": 50231 + }, + { + "epoch": 3.0813076505697232, + "grad_norm": 4.155445575714111, + "learning_rate": 3.218994107167329e-06, + "loss": 0.0941, + "step": 50232 + }, + { + "epoch": 3.081321215409658, + "grad_norm": 5.259878635406494, + "learning_rate": 3.2188570645470748e-06, + "loss": 0.0822, + "step": 50233 + }, + { + "epoch": 3.081334780249593, + "grad_norm": 3.1123812198638916, + "learning_rate": 3.2187200219268195e-06, + "loss": 0.0575, + "step": 50234 + }, + { + "epoch": 3.081348345089528, + "grad_norm": 5.171810626983643, + "learning_rate": 3.2185829793065647e-06, + "loss": 0.1258, + "step": 50235 + }, + { + "epoch": 3.0813619099294627, + "grad_norm": 4.351196765899658, + "learning_rate": 3.21844593668631e-06, + "loss": 0.1471, + "step": 50236 + }, + { + "epoch": 3.0813754747693975, + "grad_norm": 4.218262195587158, + "learning_rate": 3.218308894066055e-06, + "loss": 0.075, + "step": 50237 + }, + { + "epoch": 3.0813890396093324, + "grad_norm": 2.556666612625122, + "learning_rate": 3.2181718514458e-06, + "loss": 0.0785, + "step": 50238 + }, + { + "epoch": 3.0814026044492673, + "grad_norm": 4.477230072021484, + "learning_rate": 3.2180348088255446e-06, + "loss": 0.0648, + "step": 50239 + }, + { + "epoch": 3.0814161692892026, + "grad_norm": 3.540876865386963, + "learning_rate": 3.21789776620529e-06, + "loss": 0.0578, + "step": 50240 + }, + { + "epoch": 3.0814297341291375, + "grad_norm": 3.5700721740722656, + "learning_rate": 3.217760723585035e-06, + "loss": 0.1059, + "step": 50241 + }, + { + "epoch": 3.0814432989690723, + "grad_norm": 3.753281354904175, + "learning_rate": 3.2176236809647805e-06, + "loss": 0.0802, + "step": 50242 + }, + { + "epoch": 3.081456863809007, + "grad_norm": 4.7710981369018555, + "learning_rate": 3.2174866383445253e-06, + "loss": 0.1995, + "step": 50243 + }, + { + "epoch": 3.081470428648942, + "grad_norm": 4.945558547973633, + "learning_rate": 3.2173495957242705e-06, + "loss": 0.1179, + "step": 50244 + }, + { + "epoch": 3.081483993488877, + "grad_norm": 5.704932689666748, + "learning_rate": 3.2172125531040156e-06, + "loss": 0.1822, + "step": 50245 + }, + { + "epoch": 3.081497558328812, + "grad_norm": 5.752743244171143, + "learning_rate": 3.217075510483761e-06, + "loss": 0.1612, + "step": 50246 + }, + { + "epoch": 3.0815111231687466, + "grad_norm": 4.558454513549805, + "learning_rate": 3.2169384678635056e-06, + "loss": 0.0929, + "step": 50247 + }, + { + "epoch": 3.0815246880086815, + "grad_norm": 4.868691921234131, + "learning_rate": 3.216801425243251e-06, + "loss": 0.0849, + "step": 50248 + }, + { + "epoch": 3.0815382528486164, + "grad_norm": 3.604966640472412, + "learning_rate": 3.216664382622996e-06, + "loss": 0.0854, + "step": 50249 + }, + { + "epoch": 3.0815518176885512, + "grad_norm": 4.197568416595459, + "learning_rate": 3.2165273400027415e-06, + "loss": 0.1674, + "step": 50250 + }, + { + "epoch": 3.081565382528486, + "grad_norm": 4.144709587097168, + "learning_rate": 3.2163902973824863e-06, + "loss": 0.1081, + "step": 50251 + }, + { + "epoch": 3.081578947368421, + "grad_norm": 3.3550474643707275, + "learning_rate": 3.216253254762231e-06, + "loss": 0.0614, + "step": 50252 + }, + { + "epoch": 3.081592512208356, + "grad_norm": 4.681048393249512, + "learning_rate": 3.2161162121419766e-06, + "loss": 0.1676, + "step": 50253 + }, + { + "epoch": 3.0816060770482907, + "grad_norm": 6.117180824279785, + "learning_rate": 3.2159791695217214e-06, + "loss": 0.1005, + "step": 50254 + }, + { + "epoch": 3.0816196418882256, + "grad_norm": 5.544483661651611, + "learning_rate": 3.2158421269014666e-06, + "loss": 0.2422, + "step": 50255 + }, + { + "epoch": 3.0816332067281604, + "grad_norm": 5.512833118438721, + "learning_rate": 3.2157050842812113e-06, + "loss": 0.1068, + "step": 50256 + }, + { + "epoch": 3.0816467715680953, + "grad_norm": 5.294391632080078, + "learning_rate": 3.215568041660957e-06, + "loss": 0.1883, + "step": 50257 + }, + { + "epoch": 3.0816603364080306, + "grad_norm": 3.3917009830474854, + "learning_rate": 3.2154309990407017e-06, + "loss": 0.1043, + "step": 50258 + }, + { + "epoch": 3.0816739012479655, + "grad_norm": 5.145746231079102, + "learning_rate": 3.2152939564204473e-06, + "loss": 0.1038, + "step": 50259 + }, + { + "epoch": 3.0816874660879003, + "grad_norm": 5.543396949768066, + "learning_rate": 3.215156913800192e-06, + "loss": 0.1157, + "step": 50260 + }, + { + "epoch": 3.081701030927835, + "grad_norm": 3.700958490371704, + "learning_rate": 3.2150198711799372e-06, + "loss": 0.117, + "step": 50261 + }, + { + "epoch": 3.08171459576777, + "grad_norm": 6.475712776184082, + "learning_rate": 3.2148828285596824e-06, + "loss": 0.1076, + "step": 50262 + }, + { + "epoch": 3.081728160607705, + "grad_norm": 4.983556747436523, + "learning_rate": 3.2147457859394276e-06, + "loss": 0.1368, + "step": 50263 + }, + { + "epoch": 3.08174172544764, + "grad_norm": 4.618095397949219, + "learning_rate": 3.2146087433191723e-06, + "loss": 0.1179, + "step": 50264 + }, + { + "epoch": 3.0817552902875747, + "grad_norm": 5.768962383270264, + "learning_rate": 3.214471700698918e-06, + "loss": 0.1443, + "step": 50265 + }, + { + "epoch": 3.0817688551275095, + "grad_norm": 6.498034477233887, + "learning_rate": 3.2143346580786627e-06, + "loss": 0.2386, + "step": 50266 + }, + { + "epoch": 3.0817824199674444, + "grad_norm": 6.245460510253906, + "learning_rate": 3.2141976154584075e-06, + "loss": 0.1738, + "step": 50267 + }, + { + "epoch": 3.0817959848073793, + "grad_norm": 4.695821285247803, + "learning_rate": 3.214060572838153e-06, + "loss": 0.1144, + "step": 50268 + }, + { + "epoch": 3.081809549647314, + "grad_norm": 4.953762054443359, + "learning_rate": 3.213923530217898e-06, + "loss": 0.1842, + "step": 50269 + }, + { + "epoch": 3.081823114487249, + "grad_norm": 3.695894718170166, + "learning_rate": 3.2137864875976434e-06, + "loss": 0.0702, + "step": 50270 + }, + { + "epoch": 3.081836679327184, + "grad_norm": 5.2859392166137695, + "learning_rate": 3.213649444977388e-06, + "loss": 0.2444, + "step": 50271 + }, + { + "epoch": 3.0818502441671187, + "grad_norm": 4.21448278427124, + "learning_rate": 3.2135124023571333e-06, + "loss": 0.1554, + "step": 50272 + }, + { + "epoch": 3.0818638090070536, + "grad_norm": 4.878465175628662, + "learning_rate": 3.213375359736878e-06, + "loss": 0.1209, + "step": 50273 + }, + { + "epoch": 3.0818773738469885, + "grad_norm": 4.673379421234131, + "learning_rate": 3.2132383171166237e-06, + "loss": 0.0956, + "step": 50274 + }, + { + "epoch": 3.0818909386869233, + "grad_norm": 4.095751762390137, + "learning_rate": 3.2131012744963685e-06, + "loss": 0.1212, + "step": 50275 + }, + { + "epoch": 3.081904503526858, + "grad_norm": 3.188798427581787, + "learning_rate": 3.212964231876114e-06, + "loss": 0.1073, + "step": 50276 + }, + { + "epoch": 3.081918068366793, + "grad_norm": 4.256283760070801, + "learning_rate": 3.212827189255859e-06, + "loss": 0.1053, + "step": 50277 + }, + { + "epoch": 3.0819316332067284, + "grad_norm": 5.445074081420898, + "learning_rate": 3.212690146635604e-06, + "loss": 0.2144, + "step": 50278 + }, + { + "epoch": 3.0819451980466632, + "grad_norm": 4.515791416168213, + "learning_rate": 3.212553104015349e-06, + "loss": 0.0778, + "step": 50279 + }, + { + "epoch": 3.081958762886598, + "grad_norm": 4.3656206130981445, + "learning_rate": 3.212416061395094e-06, + "loss": 0.1307, + "step": 50280 + }, + { + "epoch": 3.081972327726533, + "grad_norm": 4.573212146759033, + "learning_rate": 3.212279018774839e-06, + "loss": 0.1443, + "step": 50281 + }, + { + "epoch": 3.081985892566468, + "grad_norm": 3.5859854221343994, + "learning_rate": 3.2121419761545843e-06, + "loss": 0.1007, + "step": 50282 + }, + { + "epoch": 3.0819994574064027, + "grad_norm": 4.187178611755371, + "learning_rate": 3.2120049335343295e-06, + "loss": 0.1149, + "step": 50283 + }, + { + "epoch": 3.0820130222463376, + "grad_norm": 8.775240898132324, + "learning_rate": 3.2118678909140742e-06, + "loss": 0.107, + "step": 50284 + }, + { + "epoch": 3.0820265870862724, + "grad_norm": 4.2281317710876465, + "learning_rate": 3.21173084829382e-06, + "loss": 0.0699, + "step": 50285 + }, + { + "epoch": 3.0820401519262073, + "grad_norm": 3.8534274101257324, + "learning_rate": 3.2115938056735646e-06, + "loss": 0.0746, + "step": 50286 + }, + { + "epoch": 3.082053716766142, + "grad_norm": 3.382819652557373, + "learning_rate": 3.21145676305331e-06, + "loss": 0.1228, + "step": 50287 + }, + { + "epoch": 3.082067281606077, + "grad_norm": 5.901304244995117, + "learning_rate": 3.211319720433055e-06, + "loss": 0.1416, + "step": 50288 + }, + { + "epoch": 3.082080846446012, + "grad_norm": 4.942901134490967, + "learning_rate": 3.2111826778128e-06, + "loss": 0.1754, + "step": 50289 + }, + { + "epoch": 3.0820944112859467, + "grad_norm": 3.266143321990967, + "learning_rate": 3.211045635192545e-06, + "loss": 0.0941, + "step": 50290 + }, + { + "epoch": 3.0821079761258816, + "grad_norm": 3.282519817352295, + "learning_rate": 3.2109085925722905e-06, + "loss": 0.0643, + "step": 50291 + }, + { + "epoch": 3.0821215409658165, + "grad_norm": 4.2189202308654785, + "learning_rate": 3.2107715499520352e-06, + "loss": 0.0888, + "step": 50292 + }, + { + "epoch": 3.0821351058057513, + "grad_norm": 2.6300137042999268, + "learning_rate": 3.21063450733178e-06, + "loss": 0.08, + "step": 50293 + }, + { + "epoch": 3.082148670645686, + "grad_norm": 4.392673492431641, + "learning_rate": 3.2104974647115256e-06, + "loss": 0.0676, + "step": 50294 + }, + { + "epoch": 3.082162235485621, + "grad_norm": 5.402163028717041, + "learning_rate": 3.2103604220912703e-06, + "loss": 0.113, + "step": 50295 + }, + { + "epoch": 3.0821758003255564, + "grad_norm": 7.366876602172852, + "learning_rate": 3.210223379471016e-06, + "loss": 0.1409, + "step": 50296 + }, + { + "epoch": 3.0821893651654912, + "grad_norm": 2.3815505504608154, + "learning_rate": 3.2100863368507607e-06, + "loss": 0.0356, + "step": 50297 + }, + { + "epoch": 3.082202930005426, + "grad_norm": 2.9819822311401367, + "learning_rate": 3.209949294230506e-06, + "loss": 0.0627, + "step": 50298 + }, + { + "epoch": 3.082216494845361, + "grad_norm": 5.057644367218018, + "learning_rate": 3.209812251610251e-06, + "loss": 0.129, + "step": 50299 + }, + { + "epoch": 3.082230059685296, + "grad_norm": 5.995362281799316, + "learning_rate": 3.2096752089899962e-06, + "loss": 0.1107, + "step": 50300 + }, + { + "epoch": 3.0822436245252307, + "grad_norm": 4.707855701446533, + "learning_rate": 3.209538166369741e-06, + "loss": 0.1208, + "step": 50301 + }, + { + "epoch": 3.0822571893651656, + "grad_norm": 4.103480815887451, + "learning_rate": 3.2094011237494866e-06, + "loss": 0.1063, + "step": 50302 + }, + { + "epoch": 3.0822707542051004, + "grad_norm": 3.7846689224243164, + "learning_rate": 3.2092640811292314e-06, + "loss": 0.0496, + "step": 50303 + }, + { + "epoch": 3.0822843190450353, + "grad_norm": 5.454782962799072, + "learning_rate": 3.209127038508977e-06, + "loss": 0.1193, + "step": 50304 + }, + { + "epoch": 3.08229788388497, + "grad_norm": 6.917416095733643, + "learning_rate": 3.2089899958887217e-06, + "loss": 0.0849, + "step": 50305 + }, + { + "epoch": 3.082311448724905, + "grad_norm": 5.145164489746094, + "learning_rate": 3.208852953268467e-06, + "loss": 0.1334, + "step": 50306 + }, + { + "epoch": 3.08232501356484, + "grad_norm": 6.064850807189941, + "learning_rate": 3.208715910648212e-06, + "loss": 0.1785, + "step": 50307 + }, + { + "epoch": 3.0823385784047748, + "grad_norm": 4.462855339050293, + "learning_rate": 3.208578868027957e-06, + "loss": 0.1555, + "step": 50308 + }, + { + "epoch": 3.0823521432447096, + "grad_norm": 3.9284403324127197, + "learning_rate": 3.208441825407702e-06, + "loss": 0.1407, + "step": 50309 + }, + { + "epoch": 3.0823657080846445, + "grad_norm": 4.851612091064453, + "learning_rate": 3.2083047827874468e-06, + "loss": 0.1216, + "step": 50310 + }, + { + "epoch": 3.0823792729245794, + "grad_norm": 3.964136838912964, + "learning_rate": 3.2081677401671924e-06, + "loss": 0.0673, + "step": 50311 + }, + { + "epoch": 3.0823928377645142, + "grad_norm": 5.025935649871826, + "learning_rate": 3.208030697546937e-06, + "loss": 0.118, + "step": 50312 + }, + { + "epoch": 3.082406402604449, + "grad_norm": 5.008012294769287, + "learning_rate": 3.2078936549266827e-06, + "loss": 0.0613, + "step": 50313 + }, + { + "epoch": 3.082419967444384, + "grad_norm": 4.8533406257629395, + "learning_rate": 3.2077566123064275e-06, + "loss": 0.1018, + "step": 50314 + }, + { + "epoch": 3.082433532284319, + "grad_norm": 6.0983476638793945, + "learning_rate": 3.2076195696861727e-06, + "loss": 0.1609, + "step": 50315 + }, + { + "epoch": 3.082447097124254, + "grad_norm": 6.157415390014648, + "learning_rate": 3.207482527065918e-06, + "loss": 0.1445, + "step": 50316 + }, + { + "epoch": 3.082460661964189, + "grad_norm": 3.0550389289855957, + "learning_rate": 3.207345484445663e-06, + "loss": 0.0489, + "step": 50317 + }, + { + "epoch": 3.082474226804124, + "grad_norm": 4.398521900177002, + "learning_rate": 3.2072084418254078e-06, + "loss": 0.0944, + "step": 50318 + }, + { + "epoch": 3.0824877916440587, + "grad_norm": 4.249445915222168, + "learning_rate": 3.2070713992051534e-06, + "loss": 0.0848, + "step": 50319 + }, + { + "epoch": 3.0825013564839936, + "grad_norm": 4.662030220031738, + "learning_rate": 3.206934356584898e-06, + "loss": 0.1172, + "step": 50320 + }, + { + "epoch": 3.0825149213239285, + "grad_norm": 5.595041275024414, + "learning_rate": 3.206797313964643e-06, + "loss": 0.1153, + "step": 50321 + }, + { + "epoch": 3.0825284861638633, + "grad_norm": 4.6402764320373535, + "learning_rate": 3.2066602713443885e-06, + "loss": 0.1142, + "step": 50322 + }, + { + "epoch": 3.082542051003798, + "grad_norm": 4.371725082397461, + "learning_rate": 3.2065232287241332e-06, + "loss": 0.0901, + "step": 50323 + }, + { + "epoch": 3.082555615843733, + "grad_norm": 5.389533519744873, + "learning_rate": 3.206386186103879e-06, + "loss": 0.0928, + "step": 50324 + }, + { + "epoch": 3.082569180683668, + "grad_norm": 3.258828639984131, + "learning_rate": 3.2062491434836236e-06, + "loss": 0.0502, + "step": 50325 + }, + { + "epoch": 3.082582745523603, + "grad_norm": 5.314384937286377, + "learning_rate": 3.2061121008633688e-06, + "loss": 0.0956, + "step": 50326 + }, + { + "epoch": 3.0825963103635377, + "grad_norm": 3.930800199508667, + "learning_rate": 3.2059750582431135e-06, + "loss": 0.1311, + "step": 50327 + }, + { + "epoch": 3.0826098752034725, + "grad_norm": 3.0528862476348877, + "learning_rate": 3.205838015622859e-06, + "loss": 0.0456, + "step": 50328 + }, + { + "epoch": 3.0826234400434074, + "grad_norm": 7.199331760406494, + "learning_rate": 3.205700973002604e-06, + "loss": 0.1371, + "step": 50329 + }, + { + "epoch": 3.0826370048833422, + "grad_norm": 4.208772659301758, + "learning_rate": 3.2055639303823495e-06, + "loss": 0.1188, + "step": 50330 + }, + { + "epoch": 3.082650569723277, + "grad_norm": 4.732541084289551, + "learning_rate": 3.2054268877620942e-06, + "loss": 0.1702, + "step": 50331 + }, + { + "epoch": 3.082664134563212, + "grad_norm": 4.023075103759766, + "learning_rate": 3.2052898451418394e-06, + "loss": 0.0629, + "step": 50332 + }, + { + "epoch": 3.082677699403147, + "grad_norm": 5.358325004577637, + "learning_rate": 3.2051528025215846e-06, + "loss": 0.1387, + "step": 50333 + }, + { + "epoch": 3.082691264243082, + "grad_norm": 3.28794527053833, + "learning_rate": 3.2050157599013298e-06, + "loss": 0.0817, + "step": 50334 + }, + { + "epoch": 3.082704829083017, + "grad_norm": 3.969251871109009, + "learning_rate": 3.2048787172810745e-06, + "loss": 0.0813, + "step": 50335 + }, + { + "epoch": 3.082718393922952, + "grad_norm": 4.235287189483643, + "learning_rate": 3.2047416746608197e-06, + "loss": 0.1008, + "step": 50336 + }, + { + "epoch": 3.0827319587628867, + "grad_norm": 3.899677038192749, + "learning_rate": 3.204604632040565e-06, + "loss": 0.1642, + "step": 50337 + }, + { + "epoch": 3.0827455236028216, + "grad_norm": 4.743561744689941, + "learning_rate": 3.2044675894203096e-06, + "loss": 0.1187, + "step": 50338 + }, + { + "epoch": 3.0827590884427565, + "grad_norm": 3.986664056777954, + "learning_rate": 3.2043305468000553e-06, + "loss": 0.1626, + "step": 50339 + }, + { + "epoch": 3.0827726532826913, + "grad_norm": 3.95989727973938, + "learning_rate": 3.2041935041798e-06, + "loss": 0.0868, + "step": 50340 + }, + { + "epoch": 3.082786218122626, + "grad_norm": 3.7932302951812744, + "learning_rate": 3.2040564615595456e-06, + "loss": 0.0976, + "step": 50341 + }, + { + "epoch": 3.082799782962561, + "grad_norm": 5.532299041748047, + "learning_rate": 3.2039194189392904e-06, + "loss": 0.1355, + "step": 50342 + }, + { + "epoch": 3.082813347802496, + "grad_norm": 4.375161647796631, + "learning_rate": 3.2037823763190355e-06, + "loss": 0.1175, + "step": 50343 + }, + { + "epoch": 3.082826912642431, + "grad_norm": 3.1512765884399414, + "learning_rate": 3.2036453336987803e-06, + "loss": 0.1061, + "step": 50344 + }, + { + "epoch": 3.0828404774823657, + "grad_norm": 3.4858007431030273, + "learning_rate": 3.203508291078526e-06, + "loss": 0.1047, + "step": 50345 + }, + { + "epoch": 3.0828540423223005, + "grad_norm": 8.824077606201172, + "learning_rate": 3.2033712484582707e-06, + "loss": 0.2683, + "step": 50346 + }, + { + "epoch": 3.0828676071622354, + "grad_norm": 3.6836302280426025, + "learning_rate": 3.2032342058380163e-06, + "loss": 0.0911, + "step": 50347 + }, + { + "epoch": 3.0828811720021703, + "grad_norm": 3.577333688735962, + "learning_rate": 3.203097163217761e-06, + "loss": 0.1002, + "step": 50348 + }, + { + "epoch": 3.082894736842105, + "grad_norm": 3.6623408794403076, + "learning_rate": 3.2029601205975058e-06, + "loss": 0.0952, + "step": 50349 + }, + { + "epoch": 3.08290830168204, + "grad_norm": 5.41517448425293, + "learning_rate": 3.2028230779772514e-06, + "loss": 0.2463, + "step": 50350 + }, + { + "epoch": 3.082921866521975, + "grad_norm": 3.9422616958618164, + "learning_rate": 3.202686035356996e-06, + "loss": 0.0976, + "step": 50351 + }, + { + "epoch": 3.0829354313619097, + "grad_norm": 2.6303341388702393, + "learning_rate": 3.2025489927367413e-06, + "loss": 0.0757, + "step": 50352 + }, + { + "epoch": 3.0829489962018446, + "grad_norm": 3.798168420791626, + "learning_rate": 3.2024119501164865e-06, + "loss": 0.061, + "step": 50353 + }, + { + "epoch": 3.08296256104178, + "grad_norm": 6.133768558502197, + "learning_rate": 3.2022749074962317e-06, + "loss": 0.2783, + "step": 50354 + }, + { + "epoch": 3.0829761258817148, + "grad_norm": 6.7434306144714355, + "learning_rate": 3.2021378648759764e-06, + "loss": 0.3169, + "step": 50355 + }, + { + "epoch": 3.0829896907216496, + "grad_norm": 7.326297760009766, + "learning_rate": 3.202000822255722e-06, + "loss": 0.1735, + "step": 50356 + }, + { + "epoch": 3.0830032555615845, + "grad_norm": 6.18363094329834, + "learning_rate": 3.2018637796354668e-06, + "loss": 0.0682, + "step": 50357 + }, + { + "epoch": 3.0830168204015194, + "grad_norm": 4.129181861877441, + "learning_rate": 3.2017267370152124e-06, + "loss": 0.1863, + "step": 50358 + }, + { + "epoch": 3.0830303852414542, + "grad_norm": 6.127315998077393, + "learning_rate": 3.201589694394957e-06, + "loss": 0.1516, + "step": 50359 + }, + { + "epoch": 3.083043950081389, + "grad_norm": 4.3139567375183105, + "learning_rate": 3.2014526517747023e-06, + "loss": 0.0653, + "step": 50360 + }, + { + "epoch": 3.083057514921324, + "grad_norm": 4.354074478149414, + "learning_rate": 3.201315609154447e-06, + "loss": 0.1058, + "step": 50361 + }, + { + "epoch": 3.083071079761259, + "grad_norm": 3.6543447971343994, + "learning_rate": 3.2011785665341922e-06, + "loss": 0.0659, + "step": 50362 + }, + { + "epoch": 3.0830846446011937, + "grad_norm": 5.587377548217773, + "learning_rate": 3.2010415239139374e-06, + "loss": 0.1017, + "step": 50363 + }, + { + "epoch": 3.0830982094411286, + "grad_norm": 3.581629991531372, + "learning_rate": 3.200904481293682e-06, + "loss": 0.0829, + "step": 50364 + }, + { + "epoch": 3.0831117742810634, + "grad_norm": 5.554537773132324, + "learning_rate": 3.2007674386734278e-06, + "loss": 0.1608, + "step": 50365 + }, + { + "epoch": 3.0831253391209983, + "grad_norm": 4.919439315795898, + "learning_rate": 3.2006303960531725e-06, + "loss": 0.132, + "step": 50366 + }, + { + "epoch": 3.083138903960933, + "grad_norm": 5.321518421173096, + "learning_rate": 3.200493353432918e-06, + "loss": 0.2044, + "step": 50367 + }, + { + "epoch": 3.083152468800868, + "grad_norm": 6.999900817871094, + "learning_rate": 3.200356310812663e-06, + "loss": 0.1044, + "step": 50368 + }, + { + "epoch": 3.083166033640803, + "grad_norm": 4.224502086639404, + "learning_rate": 3.200219268192408e-06, + "loss": 0.0771, + "step": 50369 + }, + { + "epoch": 3.0831795984807377, + "grad_norm": 4.970264434814453, + "learning_rate": 3.2000822255721533e-06, + "loss": 0.1781, + "step": 50370 + }, + { + "epoch": 3.0831931633206726, + "grad_norm": 3.0455410480499268, + "learning_rate": 3.1999451829518984e-06, + "loss": 0.089, + "step": 50371 + }, + { + "epoch": 3.083206728160608, + "grad_norm": 4.139904975891113, + "learning_rate": 3.199808140331643e-06, + "loss": 0.11, + "step": 50372 + }, + { + "epoch": 3.083220293000543, + "grad_norm": 4.39544153213501, + "learning_rate": 3.199671097711389e-06, + "loss": 0.0643, + "step": 50373 + }, + { + "epoch": 3.0832338578404777, + "grad_norm": 4.395941734313965, + "learning_rate": 3.1995340550911335e-06, + "loss": 0.071, + "step": 50374 + }, + { + "epoch": 3.0832474226804125, + "grad_norm": 5.2474684715271, + "learning_rate": 3.199397012470879e-06, + "loss": 0.1407, + "step": 50375 + }, + { + "epoch": 3.0832609875203474, + "grad_norm": 4.326675891876221, + "learning_rate": 3.199259969850624e-06, + "loss": 0.0878, + "step": 50376 + }, + { + "epoch": 3.0832745523602823, + "grad_norm": 6.110598087310791, + "learning_rate": 3.1991229272303687e-06, + "loss": 0.1916, + "step": 50377 + }, + { + "epoch": 3.083288117200217, + "grad_norm": 5.1977105140686035, + "learning_rate": 3.1989858846101143e-06, + "loss": 0.213, + "step": 50378 + }, + { + "epoch": 3.083301682040152, + "grad_norm": 6.06907320022583, + "learning_rate": 3.198848841989859e-06, + "loss": 0.1622, + "step": 50379 + }, + { + "epoch": 3.083315246880087, + "grad_norm": 3.4378232955932617, + "learning_rate": 3.198711799369604e-06, + "loss": 0.0737, + "step": 50380 + }, + { + "epoch": 3.0833288117200217, + "grad_norm": 4.009823322296143, + "learning_rate": 3.198574756749349e-06, + "loss": 0.1107, + "step": 50381 + }, + { + "epoch": 3.0833423765599566, + "grad_norm": 6.189359664916992, + "learning_rate": 3.1984377141290946e-06, + "loss": 0.1287, + "step": 50382 + }, + { + "epoch": 3.0833559413998914, + "grad_norm": 5.93825626373291, + "learning_rate": 3.1983006715088393e-06, + "loss": 0.1079, + "step": 50383 + }, + { + "epoch": 3.0833695062398263, + "grad_norm": 3.913107395172119, + "learning_rate": 3.198163628888585e-06, + "loss": 0.0742, + "step": 50384 + }, + { + "epoch": 3.083383071079761, + "grad_norm": 3.6207802295684814, + "learning_rate": 3.1980265862683297e-06, + "loss": 0.0949, + "step": 50385 + }, + { + "epoch": 3.083396635919696, + "grad_norm": 3.991919755935669, + "learning_rate": 3.197889543648075e-06, + "loss": 0.101, + "step": 50386 + }, + { + "epoch": 3.083410200759631, + "grad_norm": 5.662746906280518, + "learning_rate": 3.19775250102782e-06, + "loss": 0.213, + "step": 50387 + }, + { + "epoch": 3.0834237655995658, + "grad_norm": 5.009104251861572, + "learning_rate": 3.197615458407565e-06, + "loss": 0.0986, + "step": 50388 + }, + { + "epoch": 3.0834373304395006, + "grad_norm": 4.851365566253662, + "learning_rate": 3.19747841578731e-06, + "loss": 0.1444, + "step": 50389 + }, + { + "epoch": 3.0834508952794355, + "grad_norm": 3.7788937091827393, + "learning_rate": 3.197341373167055e-06, + "loss": 0.0685, + "step": 50390 + }, + { + "epoch": 3.0834644601193704, + "grad_norm": 4.793246746063232, + "learning_rate": 3.1972043305468003e-06, + "loss": 0.2176, + "step": 50391 + }, + { + "epoch": 3.0834780249593057, + "grad_norm": 4.520595073699951, + "learning_rate": 3.197067287926545e-06, + "loss": 0.1088, + "step": 50392 + }, + { + "epoch": 3.0834915897992405, + "grad_norm": 4.188943386077881, + "learning_rate": 3.1969302453062907e-06, + "loss": 0.1038, + "step": 50393 + }, + { + "epoch": 3.0835051546391754, + "grad_norm": 4.7191338539123535, + "learning_rate": 3.1967932026860354e-06, + "loss": 0.1165, + "step": 50394 + }, + { + "epoch": 3.0835187194791103, + "grad_norm": 4.264090538024902, + "learning_rate": 3.196656160065781e-06, + "loss": 0.1398, + "step": 50395 + }, + { + "epoch": 3.083532284319045, + "grad_norm": 3.604689598083496, + "learning_rate": 3.196519117445526e-06, + "loss": 0.1007, + "step": 50396 + }, + { + "epoch": 3.08354584915898, + "grad_norm": 5.065433502197266, + "learning_rate": 3.196382074825271e-06, + "loss": 0.0823, + "step": 50397 + }, + { + "epoch": 3.083559413998915, + "grad_norm": 5.904481887817383, + "learning_rate": 3.1962450322050157e-06, + "loss": 0.1166, + "step": 50398 + }, + { + "epoch": 3.0835729788388497, + "grad_norm": 4.030813217163086, + "learning_rate": 3.1961079895847613e-06, + "loss": 0.1239, + "step": 50399 + }, + { + "epoch": 3.0835865436787846, + "grad_norm": 6.222935676574707, + "learning_rate": 3.195970946964506e-06, + "loss": 0.0998, + "step": 50400 + }, + { + "epoch": 3.0836001085187195, + "grad_norm": 3.845559597015381, + "learning_rate": 3.1958339043442517e-06, + "loss": 0.0621, + "step": 50401 + }, + { + "epoch": 3.0836136733586543, + "grad_norm": 3.5344645977020264, + "learning_rate": 3.1956968617239964e-06, + "loss": 0.0454, + "step": 50402 + }, + { + "epoch": 3.083627238198589, + "grad_norm": 3.863497495651245, + "learning_rate": 3.1955598191037416e-06, + "loss": 0.0977, + "step": 50403 + }, + { + "epoch": 3.083640803038524, + "grad_norm": 3.2273473739624023, + "learning_rate": 3.195422776483487e-06, + "loss": 0.085, + "step": 50404 + }, + { + "epoch": 3.083654367878459, + "grad_norm": 2.6792807579040527, + "learning_rate": 3.1952857338632316e-06, + "loss": 0.0503, + "step": 50405 + }, + { + "epoch": 3.083667932718394, + "grad_norm": 3.970238447189331, + "learning_rate": 3.1951486912429767e-06, + "loss": 0.0641, + "step": 50406 + }, + { + "epoch": 3.0836814975583287, + "grad_norm": 4.27419376373291, + "learning_rate": 3.195011648622722e-06, + "loss": 0.0942, + "step": 50407 + }, + { + "epoch": 3.0836950623982635, + "grad_norm": 3.4702389240264893, + "learning_rate": 3.194874606002467e-06, + "loss": 0.0694, + "step": 50408 + }, + { + "epoch": 3.0837086272381984, + "grad_norm": 3.1124000549316406, + "learning_rate": 3.194737563382212e-06, + "loss": 0.0472, + "step": 50409 + }, + { + "epoch": 3.0837221920781337, + "grad_norm": 4.401230335235596, + "learning_rate": 3.1946005207619574e-06, + "loss": 0.0803, + "step": 50410 + }, + { + "epoch": 3.0837357569180686, + "grad_norm": 3.893707752227783, + "learning_rate": 3.194463478141702e-06, + "loss": 0.1273, + "step": 50411 + }, + { + "epoch": 3.0837493217580034, + "grad_norm": 4.96966028213501, + "learning_rate": 3.194326435521448e-06, + "loss": 0.1241, + "step": 50412 + }, + { + "epoch": 3.0837628865979383, + "grad_norm": 2.7064595222473145, + "learning_rate": 3.1941893929011926e-06, + "loss": 0.0525, + "step": 50413 + }, + { + "epoch": 3.083776451437873, + "grad_norm": 3.828798532485962, + "learning_rate": 3.1940523502809377e-06, + "loss": 0.0479, + "step": 50414 + }, + { + "epoch": 3.083790016277808, + "grad_norm": 6.498968601226807, + "learning_rate": 3.1939153076606825e-06, + "loss": 0.1189, + "step": 50415 + }, + { + "epoch": 3.083803581117743, + "grad_norm": 4.6974711418151855, + "learning_rate": 3.193778265040428e-06, + "loss": 0.0842, + "step": 50416 + }, + { + "epoch": 3.0838171459576778, + "grad_norm": 3.712026834487915, + "learning_rate": 3.193641222420173e-06, + "loss": 0.0524, + "step": 50417 + }, + { + "epoch": 3.0838307107976126, + "grad_norm": 2.6822011470794678, + "learning_rate": 3.1935041797999176e-06, + "loss": 0.0321, + "step": 50418 + }, + { + "epoch": 3.0838442756375475, + "grad_norm": 5.718078136444092, + "learning_rate": 3.193367137179663e-06, + "loss": 0.1704, + "step": 50419 + }, + { + "epoch": 3.0838578404774823, + "grad_norm": 5.049199104309082, + "learning_rate": 3.193230094559408e-06, + "loss": 0.0761, + "step": 50420 + }, + { + "epoch": 3.083871405317417, + "grad_norm": 4.288061618804932, + "learning_rate": 3.1930930519391536e-06, + "loss": 0.0644, + "step": 50421 + }, + { + "epoch": 3.083884970157352, + "grad_norm": 3.7870993614196777, + "learning_rate": 3.1929560093188983e-06, + "loss": 0.0806, + "step": 50422 + }, + { + "epoch": 3.083898534997287, + "grad_norm": 2.7132742404937744, + "learning_rate": 3.1928189666986435e-06, + "loss": 0.0617, + "step": 50423 + }, + { + "epoch": 3.083912099837222, + "grad_norm": 3.4838037490844727, + "learning_rate": 3.1926819240783887e-06, + "loss": 0.0744, + "step": 50424 + }, + { + "epoch": 3.0839256646771567, + "grad_norm": 3.244368314743042, + "learning_rate": 3.192544881458134e-06, + "loss": 0.0985, + "step": 50425 + }, + { + "epoch": 3.0839392295170915, + "grad_norm": 3.2397336959838867, + "learning_rate": 3.1924078388378786e-06, + "loss": 0.0701, + "step": 50426 + }, + { + "epoch": 3.0839527943570264, + "grad_norm": 4.324544906616211, + "learning_rate": 3.1922707962176242e-06, + "loss": 0.1274, + "step": 50427 + }, + { + "epoch": 3.0839663591969613, + "grad_norm": 2.9434549808502197, + "learning_rate": 3.192133753597369e-06, + "loss": 0.1107, + "step": 50428 + }, + { + "epoch": 3.083979924036896, + "grad_norm": 3.719130516052246, + "learning_rate": 3.1919967109771146e-06, + "loss": 0.1091, + "step": 50429 + }, + { + "epoch": 3.0839934888768314, + "grad_norm": 4.994365692138672, + "learning_rate": 3.1918596683568593e-06, + "loss": 0.13, + "step": 50430 + }, + { + "epoch": 3.0840070537167663, + "grad_norm": 2.585221767425537, + "learning_rate": 3.191722625736604e-06, + "loss": 0.0867, + "step": 50431 + }, + { + "epoch": 3.084020618556701, + "grad_norm": 4.329520225524902, + "learning_rate": 3.1915855831163493e-06, + "loss": 0.1547, + "step": 50432 + }, + { + "epoch": 3.084034183396636, + "grad_norm": 4.01404333114624, + "learning_rate": 3.1914485404960944e-06, + "loss": 0.1064, + "step": 50433 + }, + { + "epoch": 3.084047748236571, + "grad_norm": 3.811530351638794, + "learning_rate": 3.1913114978758396e-06, + "loss": 0.1068, + "step": 50434 + }, + { + "epoch": 3.0840613130765058, + "grad_norm": 3.5611164569854736, + "learning_rate": 3.1911744552555844e-06, + "loss": 0.063, + "step": 50435 + }, + { + "epoch": 3.0840748779164406, + "grad_norm": 3.977099895477295, + "learning_rate": 3.19103741263533e-06, + "loss": 0.1177, + "step": 50436 + }, + { + "epoch": 3.0840884427563755, + "grad_norm": 2.823343276977539, + "learning_rate": 3.1909003700150747e-06, + "loss": 0.0591, + "step": 50437 + }, + { + "epoch": 3.0841020075963104, + "grad_norm": 5.172677040100098, + "learning_rate": 3.1907633273948203e-06, + "loss": 0.2052, + "step": 50438 + }, + { + "epoch": 3.0841155724362452, + "grad_norm": 2.84199595451355, + "learning_rate": 3.190626284774565e-06, + "loss": 0.0503, + "step": 50439 + }, + { + "epoch": 3.08412913727618, + "grad_norm": 5.171652793884277, + "learning_rate": 3.1904892421543103e-06, + "loss": 0.0832, + "step": 50440 + }, + { + "epoch": 3.084142702116115, + "grad_norm": 3.874363660812378, + "learning_rate": 3.1903521995340555e-06, + "loss": 0.1334, + "step": 50441 + }, + { + "epoch": 3.08415626695605, + "grad_norm": 3.5549192428588867, + "learning_rate": 3.1902151569138006e-06, + "loss": 0.0981, + "step": 50442 + }, + { + "epoch": 3.0841698317959847, + "grad_norm": 3.5703213214874268, + "learning_rate": 3.1900781142935454e-06, + "loss": 0.1415, + "step": 50443 + }, + { + "epoch": 3.0841833966359196, + "grad_norm": 5.202738285064697, + "learning_rate": 3.189941071673291e-06, + "loss": 0.1425, + "step": 50444 + }, + { + "epoch": 3.0841969614758544, + "grad_norm": 4.675556659698486, + "learning_rate": 3.1898040290530357e-06, + "loss": 0.1051, + "step": 50445 + }, + { + "epoch": 3.0842105263157893, + "grad_norm": 8.623143196105957, + "learning_rate": 3.1896669864327805e-06, + "loss": 0.2904, + "step": 50446 + }, + { + "epoch": 3.084224091155724, + "grad_norm": 3.426851987838745, + "learning_rate": 3.189529943812526e-06, + "loss": 0.1097, + "step": 50447 + }, + { + "epoch": 3.0842376559956595, + "grad_norm": 2.7671046257019043, + "learning_rate": 3.189392901192271e-06, + "loss": 0.0567, + "step": 50448 + }, + { + "epoch": 3.0842512208355943, + "grad_norm": 3.8537490367889404, + "learning_rate": 3.1892558585720165e-06, + "loss": 0.0779, + "step": 50449 + }, + { + "epoch": 3.084264785675529, + "grad_norm": 5.568544864654541, + "learning_rate": 3.1891188159517612e-06, + "loss": 0.0994, + "step": 50450 + }, + { + "epoch": 3.084278350515464, + "grad_norm": 3.096395492553711, + "learning_rate": 3.1889817733315064e-06, + "loss": 0.048, + "step": 50451 + }, + { + "epoch": 3.084291915355399, + "grad_norm": 3.9211556911468506, + "learning_rate": 3.188844730711251e-06, + "loss": 0.0967, + "step": 50452 + }, + { + "epoch": 3.084305480195334, + "grad_norm": 6.084661483764648, + "learning_rate": 3.1887076880909968e-06, + "loss": 0.0988, + "step": 50453 + }, + { + "epoch": 3.0843190450352687, + "grad_norm": 3.408222198486328, + "learning_rate": 3.1885706454707415e-06, + "loss": 0.0688, + "step": 50454 + }, + { + "epoch": 3.0843326098752035, + "grad_norm": 3.149498462677002, + "learning_rate": 3.188433602850487e-06, + "loss": 0.0786, + "step": 50455 + }, + { + "epoch": 3.0843461747151384, + "grad_norm": 5.794590473175049, + "learning_rate": 3.188296560230232e-06, + "loss": 0.1082, + "step": 50456 + }, + { + "epoch": 3.0843597395550733, + "grad_norm": 3.950942277908325, + "learning_rate": 3.188159517609977e-06, + "loss": 0.0916, + "step": 50457 + }, + { + "epoch": 3.084373304395008, + "grad_norm": 3.322359800338745, + "learning_rate": 3.1880224749897222e-06, + "loss": 0.0501, + "step": 50458 + }, + { + "epoch": 3.084386869234943, + "grad_norm": 5.079146862030029, + "learning_rate": 3.187885432369467e-06, + "loss": 0.1552, + "step": 50459 + }, + { + "epoch": 3.084400434074878, + "grad_norm": 3.388066291809082, + "learning_rate": 3.187748389749212e-06, + "loss": 0.0471, + "step": 50460 + }, + { + "epoch": 3.0844139989148127, + "grad_norm": 4.726696491241455, + "learning_rate": 3.1876113471289573e-06, + "loss": 0.1412, + "step": 50461 + }, + { + "epoch": 3.0844275637547476, + "grad_norm": 4.267420291900635, + "learning_rate": 3.1874743045087025e-06, + "loss": 0.0499, + "step": 50462 + }, + { + "epoch": 3.0844411285946824, + "grad_norm": 5.53743839263916, + "learning_rate": 3.1873372618884473e-06, + "loss": 0.2269, + "step": 50463 + }, + { + "epoch": 3.0844546934346173, + "grad_norm": 4.074769020080566, + "learning_rate": 3.187200219268193e-06, + "loss": 0.0871, + "step": 50464 + }, + { + "epoch": 3.084468258274552, + "grad_norm": 3.0092780590057373, + "learning_rate": 3.1870631766479376e-06, + "loss": 0.0449, + "step": 50465 + }, + { + "epoch": 3.084481823114487, + "grad_norm": 3.6291167736053467, + "learning_rate": 3.1869261340276832e-06, + "loss": 0.0713, + "step": 50466 + }, + { + "epoch": 3.0844953879544224, + "grad_norm": 5.27639102935791, + "learning_rate": 3.186789091407428e-06, + "loss": 0.1114, + "step": 50467 + }, + { + "epoch": 3.084508952794357, + "grad_norm": 2.788038969039917, + "learning_rate": 3.186652048787173e-06, + "loss": 0.1111, + "step": 50468 + }, + { + "epoch": 3.084522517634292, + "grad_norm": 3.03438138961792, + "learning_rate": 3.186515006166918e-06, + "loss": 0.046, + "step": 50469 + }, + { + "epoch": 3.084536082474227, + "grad_norm": 3.554879903793335, + "learning_rate": 3.1863779635466635e-06, + "loss": 0.0903, + "step": 50470 + }, + { + "epoch": 3.084549647314162, + "grad_norm": 3.639418840408325, + "learning_rate": 3.1862409209264083e-06, + "loss": 0.0706, + "step": 50471 + }, + { + "epoch": 3.0845632121540967, + "grad_norm": 3.2210628986358643, + "learning_rate": 3.186103878306153e-06, + "loss": 0.0775, + "step": 50472 + }, + { + "epoch": 3.0845767769940315, + "grad_norm": 4.349989414215088, + "learning_rate": 3.1859668356858986e-06, + "loss": 0.066, + "step": 50473 + }, + { + "epoch": 3.0845903418339664, + "grad_norm": 2.828235149383545, + "learning_rate": 3.1858297930656434e-06, + "loss": 0.0545, + "step": 50474 + }, + { + "epoch": 3.0846039066739013, + "grad_norm": 3.8549256324768066, + "learning_rate": 3.185692750445389e-06, + "loss": 0.0676, + "step": 50475 + }, + { + "epoch": 3.084617471513836, + "grad_norm": 4.8723859786987305, + "learning_rate": 3.1855557078251337e-06, + "loss": 0.1626, + "step": 50476 + }, + { + "epoch": 3.084631036353771, + "grad_norm": 3.5037569999694824, + "learning_rate": 3.185418665204879e-06, + "loss": 0.0917, + "step": 50477 + }, + { + "epoch": 3.084644601193706, + "grad_norm": 3.657731771469116, + "learning_rate": 3.185281622584624e-06, + "loss": 0.0732, + "step": 50478 + }, + { + "epoch": 3.0846581660336407, + "grad_norm": 2.987311840057373, + "learning_rate": 3.1851445799643693e-06, + "loss": 0.056, + "step": 50479 + }, + { + "epoch": 3.0846717308735756, + "grad_norm": 3.7434895038604736, + "learning_rate": 3.185007537344114e-06, + "loss": 0.0779, + "step": 50480 + }, + { + "epoch": 3.0846852957135105, + "grad_norm": 2.65906023979187, + "learning_rate": 3.1848704947238596e-06, + "loss": 0.0541, + "step": 50481 + }, + { + "epoch": 3.0846988605534453, + "grad_norm": 3.705897808074951, + "learning_rate": 3.1847334521036044e-06, + "loss": 0.0888, + "step": 50482 + }, + { + "epoch": 3.08471242539338, + "grad_norm": 3.9792239665985107, + "learning_rate": 3.18459640948335e-06, + "loss": 0.096, + "step": 50483 + }, + { + "epoch": 3.084725990233315, + "grad_norm": 4.63381290435791, + "learning_rate": 3.1844593668630948e-06, + "loss": 0.1018, + "step": 50484 + }, + { + "epoch": 3.08473955507325, + "grad_norm": 4.679434299468994, + "learning_rate": 3.18432232424284e-06, + "loss": 0.1474, + "step": 50485 + }, + { + "epoch": 3.0847531199131852, + "grad_norm": 4.4079484939575195, + "learning_rate": 3.1841852816225847e-06, + "loss": 0.1065, + "step": 50486 + }, + { + "epoch": 3.08476668475312, + "grad_norm": 2.8910999298095703, + "learning_rate": 3.18404823900233e-06, + "loss": 0.0755, + "step": 50487 + }, + { + "epoch": 3.084780249593055, + "grad_norm": 3.097045421600342, + "learning_rate": 3.183911196382075e-06, + "loss": 0.0681, + "step": 50488 + }, + { + "epoch": 3.08479381443299, + "grad_norm": 3.573164701461792, + "learning_rate": 3.18377415376182e-06, + "loss": 0.0686, + "step": 50489 + }, + { + "epoch": 3.0848073792729247, + "grad_norm": 3.4849307537078857, + "learning_rate": 3.1836371111415654e-06, + "loss": 0.067, + "step": 50490 + }, + { + "epoch": 3.0848209441128596, + "grad_norm": 3.1254734992980957, + "learning_rate": 3.18350006852131e-06, + "loss": 0.0647, + "step": 50491 + }, + { + "epoch": 3.0848345089527944, + "grad_norm": 4.89152193069458, + "learning_rate": 3.1833630259010558e-06, + "loss": 0.1205, + "step": 50492 + }, + { + "epoch": 3.0848480737927293, + "grad_norm": 4.069997787475586, + "learning_rate": 3.1832259832808005e-06, + "loss": 0.1454, + "step": 50493 + }, + { + "epoch": 3.084861638632664, + "grad_norm": 3.5832817554473877, + "learning_rate": 3.1830889406605457e-06, + "loss": 0.0928, + "step": 50494 + }, + { + "epoch": 3.084875203472599, + "grad_norm": 3.624446153640747, + "learning_rate": 3.182951898040291e-06, + "loss": 0.1059, + "step": 50495 + }, + { + "epoch": 3.084888768312534, + "grad_norm": 4.354439735412598, + "learning_rate": 3.182814855420036e-06, + "loss": 0.1322, + "step": 50496 + }, + { + "epoch": 3.0849023331524688, + "grad_norm": 3.6401896476745605, + "learning_rate": 3.182677812799781e-06, + "loss": 0.0582, + "step": 50497 + }, + { + "epoch": 3.0849158979924036, + "grad_norm": 4.323216915130615, + "learning_rate": 3.1825407701795264e-06, + "loss": 0.0733, + "step": 50498 + }, + { + "epoch": 3.0849294628323385, + "grad_norm": 4.132720947265625, + "learning_rate": 3.182403727559271e-06, + "loss": 0.1121, + "step": 50499 + }, + { + "epoch": 3.0849430276722734, + "grad_norm": 3.5560479164123535, + "learning_rate": 3.182266684939016e-06, + "loss": 0.0911, + "step": 50500 + }, + { + "epoch": 3.084956592512208, + "grad_norm": 5.758903980255127, + "learning_rate": 3.1821296423187615e-06, + "loss": 0.195, + "step": 50501 + }, + { + "epoch": 3.084970157352143, + "grad_norm": 2.7715935707092285, + "learning_rate": 3.1819925996985063e-06, + "loss": 0.0614, + "step": 50502 + }, + { + "epoch": 3.084983722192078, + "grad_norm": 4.9724955558776855, + "learning_rate": 3.1818555570782515e-06, + "loss": 0.0913, + "step": 50503 + }, + { + "epoch": 3.084997287032013, + "grad_norm": 3.0167369842529297, + "learning_rate": 3.1817185144579966e-06, + "loss": 0.0557, + "step": 50504 + }, + { + "epoch": 3.085010851871948, + "grad_norm": 3.4898087978363037, + "learning_rate": 3.181581471837742e-06, + "loss": 0.1091, + "step": 50505 + }, + { + "epoch": 3.085024416711883, + "grad_norm": 3.929264783859253, + "learning_rate": 3.1814444292174866e-06, + "loss": 0.1132, + "step": 50506 + }, + { + "epoch": 3.085037981551818, + "grad_norm": 2.727038860321045, + "learning_rate": 3.181307386597232e-06, + "loss": 0.0824, + "step": 50507 + }, + { + "epoch": 3.0850515463917527, + "grad_norm": 5.428134918212891, + "learning_rate": 3.181170343976977e-06, + "loss": 0.1325, + "step": 50508 + }, + { + "epoch": 3.0850651112316876, + "grad_norm": 3.629395008087158, + "learning_rate": 3.1810333013567225e-06, + "loss": 0.0952, + "step": 50509 + }, + { + "epoch": 3.0850786760716224, + "grad_norm": 5.968124866485596, + "learning_rate": 3.1808962587364673e-06, + "loss": 0.1488, + "step": 50510 + }, + { + "epoch": 3.0850922409115573, + "grad_norm": 4.782726764678955, + "learning_rate": 3.1807592161162125e-06, + "loss": 0.0725, + "step": 50511 + }, + { + "epoch": 3.085105805751492, + "grad_norm": 2.565235137939453, + "learning_rate": 3.1806221734959576e-06, + "loss": 0.0626, + "step": 50512 + }, + { + "epoch": 3.085119370591427, + "grad_norm": 5.362046241760254, + "learning_rate": 3.180485130875703e-06, + "loss": 0.1595, + "step": 50513 + }, + { + "epoch": 3.085132935431362, + "grad_norm": 3.996840000152588, + "learning_rate": 3.1803480882554476e-06, + "loss": 0.1295, + "step": 50514 + }, + { + "epoch": 3.0851465002712968, + "grad_norm": 3.8165102005004883, + "learning_rate": 3.1802110456351928e-06, + "loss": 0.0679, + "step": 50515 + }, + { + "epoch": 3.0851600651112316, + "grad_norm": 4.791131019592285, + "learning_rate": 3.180074003014938e-06, + "loss": 0.1355, + "step": 50516 + }, + { + "epoch": 3.0851736299511665, + "grad_norm": 4.958132743835449, + "learning_rate": 3.1799369603946827e-06, + "loss": 0.1055, + "step": 50517 + }, + { + "epoch": 3.0851871947911014, + "grad_norm": 4.170219421386719, + "learning_rate": 3.1797999177744283e-06, + "loss": 0.1501, + "step": 50518 + }, + { + "epoch": 3.0852007596310362, + "grad_norm": 2.9344184398651123, + "learning_rate": 3.179662875154173e-06, + "loss": 0.069, + "step": 50519 + }, + { + "epoch": 3.085214324470971, + "grad_norm": 2.932100534439087, + "learning_rate": 3.1795258325339187e-06, + "loss": 0.0741, + "step": 50520 + }, + { + "epoch": 3.085227889310906, + "grad_norm": 3.809964895248413, + "learning_rate": 3.1793887899136634e-06, + "loss": 0.114, + "step": 50521 + }, + { + "epoch": 3.085241454150841, + "grad_norm": 3.4106316566467285, + "learning_rate": 3.1792517472934086e-06, + "loss": 0.0703, + "step": 50522 + }, + { + "epoch": 3.0852550189907757, + "grad_norm": 4.536506175994873, + "learning_rate": 3.1791147046731533e-06, + "loss": 0.1852, + "step": 50523 + }, + { + "epoch": 3.085268583830711, + "grad_norm": 6.2255859375, + "learning_rate": 3.178977662052899e-06, + "loss": 0.1888, + "step": 50524 + }, + { + "epoch": 3.085282148670646, + "grad_norm": 5.603017330169678, + "learning_rate": 3.1788406194326437e-06, + "loss": 0.2444, + "step": 50525 + }, + { + "epoch": 3.0852957135105807, + "grad_norm": 4.388084888458252, + "learning_rate": 3.1787035768123893e-06, + "loss": 0.2023, + "step": 50526 + }, + { + "epoch": 3.0853092783505156, + "grad_norm": 4.694801330566406, + "learning_rate": 3.178566534192134e-06, + "loss": 0.2083, + "step": 50527 + }, + { + "epoch": 3.0853228431904505, + "grad_norm": 4.129315376281738, + "learning_rate": 3.178429491571879e-06, + "loss": 0.1094, + "step": 50528 + }, + { + "epoch": 3.0853364080303853, + "grad_norm": 3.7164430618286133, + "learning_rate": 3.1782924489516244e-06, + "loss": 0.0918, + "step": 50529 + }, + { + "epoch": 3.08534997287032, + "grad_norm": 5.0877861976623535, + "learning_rate": 3.178155406331369e-06, + "loss": 0.109, + "step": 50530 + }, + { + "epoch": 3.085363537710255, + "grad_norm": 3.841552734375, + "learning_rate": 3.1780183637111144e-06, + "loss": 0.1298, + "step": 50531 + }, + { + "epoch": 3.08537710255019, + "grad_norm": 4.606503963470459, + "learning_rate": 3.1778813210908595e-06, + "loss": 0.1557, + "step": 50532 + }, + { + "epoch": 3.085390667390125, + "grad_norm": 4.563666343688965, + "learning_rate": 3.1777442784706047e-06, + "loss": 0.0873, + "step": 50533 + }, + { + "epoch": 3.0854042322300597, + "grad_norm": 4.2750349044799805, + "learning_rate": 3.1776072358503495e-06, + "loss": 0.1019, + "step": 50534 + }, + { + "epoch": 3.0854177970699945, + "grad_norm": 4.550011157989502, + "learning_rate": 3.177470193230095e-06, + "loss": 0.1663, + "step": 50535 + }, + { + "epoch": 3.0854313619099294, + "grad_norm": 3.1712214946746826, + "learning_rate": 3.17733315060984e-06, + "loss": 0.0788, + "step": 50536 + }, + { + "epoch": 3.0854449267498643, + "grad_norm": 5.019540309906006, + "learning_rate": 3.1771961079895854e-06, + "loss": 0.1831, + "step": 50537 + }, + { + "epoch": 3.085458491589799, + "grad_norm": 3.8153913021087646, + "learning_rate": 3.17705906536933e-06, + "loss": 0.0972, + "step": 50538 + }, + { + "epoch": 3.085472056429734, + "grad_norm": 4.841148853302002, + "learning_rate": 3.1769220227490754e-06, + "loss": 0.1106, + "step": 50539 + }, + { + "epoch": 3.085485621269669, + "grad_norm": 3.8372409343719482, + "learning_rate": 3.17678498012882e-06, + "loss": 0.1028, + "step": 50540 + }, + { + "epoch": 3.0854991861096037, + "grad_norm": 3.5986714363098145, + "learning_rate": 3.1766479375085653e-06, + "loss": 0.1248, + "step": 50541 + }, + { + "epoch": 3.0855127509495386, + "grad_norm": 4.304009914398193, + "learning_rate": 3.1765108948883105e-06, + "loss": 0.1466, + "step": 50542 + }, + { + "epoch": 3.085526315789474, + "grad_norm": 6.050378322601318, + "learning_rate": 3.1763738522680552e-06, + "loss": 0.257, + "step": 50543 + }, + { + "epoch": 3.0855398806294088, + "grad_norm": 4.870584011077881, + "learning_rate": 3.176236809647801e-06, + "loss": 0.1128, + "step": 50544 + }, + { + "epoch": 3.0855534454693436, + "grad_norm": 3.2408182621002197, + "learning_rate": 3.1760997670275456e-06, + "loss": 0.0536, + "step": 50545 + }, + { + "epoch": 3.0855670103092785, + "grad_norm": 4.093046188354492, + "learning_rate": 3.175962724407291e-06, + "loss": 0.1172, + "step": 50546 + }, + { + "epoch": 3.0855805751492134, + "grad_norm": 4.2123308181762695, + "learning_rate": 3.175825681787036e-06, + "loss": 0.1305, + "step": 50547 + }, + { + "epoch": 3.085594139989148, + "grad_norm": 4.629154682159424, + "learning_rate": 3.175688639166781e-06, + "loss": 0.0728, + "step": 50548 + }, + { + "epoch": 3.085607704829083, + "grad_norm": 3.5380024909973145, + "learning_rate": 3.1755515965465263e-06, + "loss": 0.0832, + "step": 50549 + }, + { + "epoch": 3.085621269669018, + "grad_norm": 4.045045852661133, + "learning_rate": 3.1754145539262715e-06, + "loss": 0.0833, + "step": 50550 + }, + { + "epoch": 3.085634834508953, + "grad_norm": 5.207328796386719, + "learning_rate": 3.1752775113060162e-06, + "loss": 0.1204, + "step": 50551 + }, + { + "epoch": 3.0856483993488877, + "grad_norm": 3.585646867752075, + "learning_rate": 3.175140468685762e-06, + "loss": 0.0594, + "step": 50552 + }, + { + "epoch": 3.0856619641888225, + "grad_norm": 6.334235668182373, + "learning_rate": 3.1750034260655066e-06, + "loss": 0.1606, + "step": 50553 + }, + { + "epoch": 3.0856755290287574, + "grad_norm": 5.067856788635254, + "learning_rate": 3.174866383445252e-06, + "loss": 0.1166, + "step": 50554 + }, + { + "epoch": 3.0856890938686923, + "grad_norm": 3.794825315475464, + "learning_rate": 3.174729340824997e-06, + "loss": 0.0721, + "step": 50555 + }, + { + "epoch": 3.085702658708627, + "grad_norm": 5.051828384399414, + "learning_rate": 3.1745922982047417e-06, + "loss": 0.1244, + "step": 50556 + }, + { + "epoch": 3.085716223548562, + "grad_norm": 2.700083017349243, + "learning_rate": 3.174455255584487e-06, + "loss": 0.0641, + "step": 50557 + }, + { + "epoch": 3.085729788388497, + "grad_norm": 4.280364036560059, + "learning_rate": 3.174318212964232e-06, + "loss": 0.1671, + "step": 50558 + }, + { + "epoch": 3.0857433532284317, + "grad_norm": 3.598893642425537, + "learning_rate": 3.1741811703439772e-06, + "loss": 0.1185, + "step": 50559 + }, + { + "epoch": 3.0857569180683666, + "grad_norm": 2.8572285175323486, + "learning_rate": 3.174044127723722e-06, + "loss": 0.0776, + "step": 50560 + }, + { + "epoch": 3.0857704829083015, + "grad_norm": 3.216090202331543, + "learning_rate": 3.1739070851034676e-06, + "loss": 0.0783, + "step": 50561 + }, + { + "epoch": 3.085784047748237, + "grad_norm": 4.5971598625183105, + "learning_rate": 3.1737700424832124e-06, + "loss": 0.101, + "step": 50562 + }, + { + "epoch": 3.0857976125881716, + "grad_norm": 6.594761848449707, + "learning_rate": 3.173632999862958e-06, + "loss": 0.1057, + "step": 50563 + }, + { + "epoch": 3.0858111774281065, + "grad_norm": 5.0981550216674805, + "learning_rate": 3.1734959572427027e-06, + "loss": 0.1947, + "step": 50564 + }, + { + "epoch": 3.0858247422680414, + "grad_norm": 3.563075542449951, + "learning_rate": 3.173358914622448e-06, + "loss": 0.0916, + "step": 50565 + }, + { + "epoch": 3.0858383071079762, + "grad_norm": 4.314299583435059, + "learning_rate": 3.173221872002193e-06, + "loss": 0.062, + "step": 50566 + }, + { + "epoch": 3.085851871947911, + "grad_norm": 3.872077226638794, + "learning_rate": 3.1730848293819382e-06, + "loss": 0.0906, + "step": 50567 + }, + { + "epoch": 3.085865436787846, + "grad_norm": 4.270732402801514, + "learning_rate": 3.172947786761683e-06, + "loss": 0.1113, + "step": 50568 + }, + { + "epoch": 3.085879001627781, + "grad_norm": 3.7937326431274414, + "learning_rate": 3.172810744141428e-06, + "loss": 0.0548, + "step": 50569 + }, + { + "epoch": 3.0858925664677157, + "grad_norm": 4.408466339111328, + "learning_rate": 3.1726737015211734e-06, + "loss": 0.123, + "step": 50570 + }, + { + "epoch": 3.0859061313076506, + "grad_norm": 5.7126665115356445, + "learning_rate": 3.172536658900918e-06, + "loss": 0.1596, + "step": 50571 + }, + { + "epoch": 3.0859196961475854, + "grad_norm": 3.6573479175567627, + "learning_rate": 3.1723996162806637e-06, + "loss": 0.0611, + "step": 50572 + }, + { + "epoch": 3.0859332609875203, + "grad_norm": 3.5189924240112305, + "learning_rate": 3.1722625736604085e-06, + "loss": 0.0649, + "step": 50573 + }, + { + "epoch": 3.085946825827455, + "grad_norm": 3.9063682556152344, + "learning_rate": 3.1721255310401537e-06, + "loss": 0.0736, + "step": 50574 + }, + { + "epoch": 3.08596039066739, + "grad_norm": 5.2573370933532715, + "learning_rate": 3.171988488419899e-06, + "loss": 0.1227, + "step": 50575 + }, + { + "epoch": 3.085973955507325, + "grad_norm": 3.0415449142456055, + "learning_rate": 3.171851445799644e-06, + "loss": 0.0612, + "step": 50576 + }, + { + "epoch": 3.0859875203472598, + "grad_norm": 7.0455780029296875, + "learning_rate": 3.1717144031793888e-06, + "loss": 0.2462, + "step": 50577 + }, + { + "epoch": 3.0860010851871946, + "grad_norm": 6.037854194641113, + "learning_rate": 3.1715773605591344e-06, + "loss": 0.2286, + "step": 50578 + }, + { + "epoch": 3.0860146500271295, + "grad_norm": 4.719365119934082, + "learning_rate": 3.171440317938879e-06, + "loss": 0.122, + "step": 50579 + }, + { + "epoch": 3.0860282148670644, + "grad_norm": 3.613334894180298, + "learning_rate": 3.1713032753186247e-06, + "loss": 0.0848, + "step": 50580 + }, + { + "epoch": 3.0860417797069997, + "grad_norm": 5.590460300445557, + "learning_rate": 3.1711662326983695e-06, + "loss": 0.1164, + "step": 50581 + }, + { + "epoch": 3.0860553445469345, + "grad_norm": 6.17996072769165, + "learning_rate": 3.1710291900781147e-06, + "loss": 0.1651, + "step": 50582 + }, + { + "epoch": 3.0860689093868694, + "grad_norm": 4.240153789520264, + "learning_rate": 3.17089214745786e-06, + "loss": 0.0807, + "step": 50583 + }, + { + "epoch": 3.0860824742268043, + "grad_norm": 4.8889899253845215, + "learning_rate": 3.1707551048376046e-06, + "loss": 0.1011, + "step": 50584 + }, + { + "epoch": 3.086096039066739, + "grad_norm": 4.421908378601074, + "learning_rate": 3.1706180622173498e-06, + "loss": 0.1552, + "step": 50585 + }, + { + "epoch": 3.086109603906674, + "grad_norm": 5.645147800445557, + "learning_rate": 3.170481019597095e-06, + "loss": 0.2338, + "step": 50586 + }, + { + "epoch": 3.086123168746609, + "grad_norm": 3.655454158782959, + "learning_rate": 3.17034397697684e-06, + "loss": 0.0903, + "step": 50587 + }, + { + "epoch": 3.0861367335865437, + "grad_norm": 5.052793502807617, + "learning_rate": 3.170206934356585e-06, + "loss": 0.1332, + "step": 50588 + }, + { + "epoch": 3.0861502984264786, + "grad_norm": 4.578149795532227, + "learning_rate": 3.1700698917363305e-06, + "loss": 0.25, + "step": 50589 + }, + { + "epoch": 3.0861638632664135, + "grad_norm": 3.9099929332733154, + "learning_rate": 3.1699328491160752e-06, + "loss": 0.0646, + "step": 50590 + }, + { + "epoch": 3.0861774281063483, + "grad_norm": 4.006290912628174, + "learning_rate": 3.169795806495821e-06, + "loss": 0.1175, + "step": 50591 + }, + { + "epoch": 3.086190992946283, + "grad_norm": 4.055121898651123, + "learning_rate": 3.1696587638755656e-06, + "loss": 0.1625, + "step": 50592 + }, + { + "epoch": 3.086204557786218, + "grad_norm": 3.425462245941162, + "learning_rate": 3.1695217212553108e-06, + "loss": 0.0806, + "step": 50593 + }, + { + "epoch": 3.086218122626153, + "grad_norm": 4.1985764503479, + "learning_rate": 3.1693846786350555e-06, + "loss": 0.1325, + "step": 50594 + }, + { + "epoch": 3.086231687466088, + "grad_norm": 4.032721996307373, + "learning_rate": 3.169247636014801e-06, + "loss": 0.0882, + "step": 50595 + }, + { + "epoch": 3.0862452523060226, + "grad_norm": 5.1047139167785645, + "learning_rate": 3.169110593394546e-06, + "loss": 0.2002, + "step": 50596 + }, + { + "epoch": 3.0862588171459575, + "grad_norm": 4.222636699676514, + "learning_rate": 3.1689735507742907e-06, + "loss": 0.1016, + "step": 50597 + }, + { + "epoch": 3.0862723819858924, + "grad_norm": 4.115021228790283, + "learning_rate": 3.1688365081540363e-06, + "loss": 0.0927, + "step": 50598 + }, + { + "epoch": 3.0862859468258272, + "grad_norm": 4.035211086273193, + "learning_rate": 3.168699465533781e-06, + "loss": 0.1126, + "step": 50599 + }, + { + "epoch": 3.0862995116657626, + "grad_norm": 4.0084452629089355, + "learning_rate": 3.1685624229135266e-06, + "loss": 0.1073, + "step": 50600 + }, + { + "epoch": 3.0863130765056974, + "grad_norm": 4.307782173156738, + "learning_rate": 3.1684253802932714e-06, + "loss": 0.1239, + "step": 50601 + }, + { + "epoch": 3.0863266413456323, + "grad_norm": 4.492543697357178, + "learning_rate": 3.1682883376730165e-06, + "loss": 0.1338, + "step": 50602 + }, + { + "epoch": 3.086340206185567, + "grad_norm": 4.877072811126709, + "learning_rate": 3.1681512950527617e-06, + "loss": 0.1106, + "step": 50603 + }, + { + "epoch": 3.086353771025502, + "grad_norm": 4.317912578582764, + "learning_rate": 3.168014252432507e-06, + "loss": 0.1761, + "step": 50604 + }, + { + "epoch": 3.086367335865437, + "grad_norm": 4.183811664581299, + "learning_rate": 3.1678772098122517e-06, + "loss": 0.1353, + "step": 50605 + }, + { + "epoch": 3.0863809007053717, + "grad_norm": 4.974423408508301, + "learning_rate": 3.1677401671919973e-06, + "loss": 0.087, + "step": 50606 + }, + { + "epoch": 3.0863944655453066, + "grad_norm": 4.6640496253967285, + "learning_rate": 3.167603124571742e-06, + "loss": 0.188, + "step": 50607 + }, + { + "epoch": 3.0864080303852415, + "grad_norm": 4.236595153808594, + "learning_rate": 3.1674660819514876e-06, + "loss": 0.089, + "step": 50608 + }, + { + "epoch": 3.0864215952251763, + "grad_norm": 4.74153470993042, + "learning_rate": 3.1673290393312324e-06, + "loss": 0.1673, + "step": 50609 + }, + { + "epoch": 3.086435160065111, + "grad_norm": 3.3359832763671875, + "learning_rate": 3.167191996710977e-06, + "loss": 0.0777, + "step": 50610 + }, + { + "epoch": 3.086448724905046, + "grad_norm": 4.6769795417785645, + "learning_rate": 3.1670549540907223e-06, + "loss": 0.0859, + "step": 50611 + }, + { + "epoch": 3.086462289744981, + "grad_norm": 4.1380205154418945, + "learning_rate": 3.1669179114704675e-06, + "loss": 0.1224, + "step": 50612 + }, + { + "epoch": 3.086475854584916, + "grad_norm": 6.111844539642334, + "learning_rate": 3.1667808688502127e-06, + "loss": 0.2082, + "step": 50613 + }, + { + "epoch": 3.0864894194248507, + "grad_norm": 4.527327060699463, + "learning_rate": 3.1666438262299574e-06, + "loss": 0.1846, + "step": 50614 + }, + { + "epoch": 3.0865029842647855, + "grad_norm": 3.769845962524414, + "learning_rate": 3.166506783609703e-06, + "loss": 0.0887, + "step": 50615 + }, + { + "epoch": 3.0865165491047204, + "grad_norm": 5.771226406097412, + "learning_rate": 3.1663697409894478e-06, + "loss": 0.2208, + "step": 50616 + }, + { + "epoch": 3.0865301139446553, + "grad_norm": 2.745361089706421, + "learning_rate": 3.1662326983691934e-06, + "loss": 0.0442, + "step": 50617 + }, + { + "epoch": 3.08654367878459, + "grad_norm": 3.3170719146728516, + "learning_rate": 3.166095655748938e-06, + "loss": 0.1063, + "step": 50618 + }, + { + "epoch": 3.0865572436245254, + "grad_norm": 4.155642509460449, + "learning_rate": 3.1659586131286833e-06, + "loss": 0.1516, + "step": 50619 + }, + { + "epoch": 3.0865708084644603, + "grad_norm": 4.759194374084473, + "learning_rate": 3.1658215705084285e-06, + "loss": 0.1886, + "step": 50620 + }, + { + "epoch": 3.086584373304395, + "grad_norm": 4.99343204498291, + "learning_rate": 3.1656845278881737e-06, + "loss": 0.0947, + "step": 50621 + }, + { + "epoch": 3.08659793814433, + "grad_norm": 4.895116806030273, + "learning_rate": 3.1655474852679184e-06, + "loss": 0.1796, + "step": 50622 + }, + { + "epoch": 3.086611502984265, + "grad_norm": 4.9200119972229, + "learning_rate": 3.165410442647664e-06, + "loss": 0.111, + "step": 50623 + }, + { + "epoch": 3.0866250678241998, + "grad_norm": 4.714372634887695, + "learning_rate": 3.1652734000274088e-06, + "loss": 0.0874, + "step": 50624 + }, + { + "epoch": 3.0866386326641346, + "grad_norm": 3.4519762992858887, + "learning_rate": 3.1651363574071535e-06, + "loss": 0.1116, + "step": 50625 + }, + { + "epoch": 3.0866521975040695, + "grad_norm": 3.0102224349975586, + "learning_rate": 3.164999314786899e-06, + "loss": 0.0957, + "step": 50626 + }, + { + "epoch": 3.0866657623440044, + "grad_norm": 5.607184886932373, + "learning_rate": 3.164862272166644e-06, + "loss": 0.1065, + "step": 50627 + }, + { + "epoch": 3.0866793271839392, + "grad_norm": 3.769859552383423, + "learning_rate": 3.164725229546389e-06, + "loss": 0.1181, + "step": 50628 + }, + { + "epoch": 3.086692892023874, + "grad_norm": 2.9152066707611084, + "learning_rate": 3.1645881869261343e-06, + "loss": 0.0697, + "step": 50629 + }, + { + "epoch": 3.086706456863809, + "grad_norm": 5.0576934814453125, + "learning_rate": 3.1644511443058794e-06, + "loss": 0.0904, + "step": 50630 + }, + { + "epoch": 3.086720021703744, + "grad_norm": 3.7495341300964355, + "learning_rate": 3.164314101685624e-06, + "loss": 0.0805, + "step": 50631 + }, + { + "epoch": 3.0867335865436787, + "grad_norm": 4.6325554847717285, + "learning_rate": 3.16417705906537e-06, + "loss": 0.216, + "step": 50632 + }, + { + "epoch": 3.0867471513836136, + "grad_norm": 5.175491809844971, + "learning_rate": 3.1640400164451146e-06, + "loss": 0.1923, + "step": 50633 + }, + { + "epoch": 3.0867607162235484, + "grad_norm": 3.948429584503174, + "learning_rate": 3.16390297382486e-06, + "loss": 0.096, + "step": 50634 + }, + { + "epoch": 3.0867742810634833, + "grad_norm": 3.5542683601379395, + "learning_rate": 3.163765931204605e-06, + "loss": 0.1039, + "step": 50635 + }, + { + "epoch": 3.086787845903418, + "grad_norm": 4.588237762451172, + "learning_rate": 3.16362888858435e-06, + "loss": 0.132, + "step": 50636 + }, + { + "epoch": 3.086801410743353, + "grad_norm": 4.330488681793213, + "learning_rate": 3.1634918459640953e-06, + "loss": 0.1385, + "step": 50637 + }, + { + "epoch": 3.0868149755832883, + "grad_norm": 3.9562647342681885, + "learning_rate": 3.16335480334384e-06, + "loss": 0.0882, + "step": 50638 + }, + { + "epoch": 3.086828540423223, + "grad_norm": 5.132361888885498, + "learning_rate": 3.163217760723585e-06, + "loss": 0.2163, + "step": 50639 + }, + { + "epoch": 3.086842105263158, + "grad_norm": 4.434279441833496, + "learning_rate": 3.1630807181033304e-06, + "loss": 0.1347, + "step": 50640 + }, + { + "epoch": 3.086855670103093, + "grad_norm": 4.7461466789245605, + "learning_rate": 3.1629436754830756e-06, + "loss": 0.1254, + "step": 50641 + }, + { + "epoch": 3.086869234943028, + "grad_norm": 4.532566070556641, + "learning_rate": 3.1628066328628203e-06, + "loss": 0.1103, + "step": 50642 + }, + { + "epoch": 3.0868827997829626, + "grad_norm": 5.0121331214904785, + "learning_rate": 3.162669590242566e-06, + "loss": 0.1181, + "step": 50643 + }, + { + "epoch": 3.0868963646228975, + "grad_norm": 6.6429314613342285, + "learning_rate": 3.1625325476223107e-06, + "loss": 0.1486, + "step": 50644 + }, + { + "epoch": 3.0869099294628324, + "grad_norm": 4.652331829071045, + "learning_rate": 3.162395505002056e-06, + "loss": 0.1102, + "step": 50645 + }, + { + "epoch": 3.0869234943027672, + "grad_norm": 6.473508358001709, + "learning_rate": 3.162258462381801e-06, + "loss": 0.2233, + "step": 50646 + }, + { + "epoch": 3.086937059142702, + "grad_norm": 3.3088419437408447, + "learning_rate": 3.162121419761546e-06, + "loss": 0.1236, + "step": 50647 + }, + { + "epoch": 3.086950623982637, + "grad_norm": 4.260295867919922, + "learning_rate": 3.161984377141291e-06, + "loss": 0.1237, + "step": 50648 + }, + { + "epoch": 3.086964188822572, + "grad_norm": 4.360564708709717, + "learning_rate": 3.1618473345210366e-06, + "loss": 0.1158, + "step": 50649 + }, + { + "epoch": 3.0869777536625067, + "grad_norm": 4.4460554122924805, + "learning_rate": 3.1617102919007813e-06, + "loss": 0.1396, + "step": 50650 + }, + { + "epoch": 3.0869913185024416, + "grad_norm": 4.627320289611816, + "learning_rate": 3.161573249280527e-06, + "loss": 0.1126, + "step": 50651 + }, + { + "epoch": 3.0870048833423764, + "grad_norm": 5.531746864318848, + "learning_rate": 3.1614362066602717e-06, + "loss": 0.2195, + "step": 50652 + }, + { + "epoch": 3.0870184481823113, + "grad_norm": 5.205234527587891, + "learning_rate": 3.1612991640400164e-06, + "loss": 0.1531, + "step": 50653 + }, + { + "epoch": 3.087032013022246, + "grad_norm": 3.9710984230041504, + "learning_rate": 3.161162121419762e-06, + "loss": 0.13, + "step": 50654 + }, + { + "epoch": 3.087045577862181, + "grad_norm": 5.540141582489014, + "learning_rate": 3.161025078799507e-06, + "loss": 0.1242, + "step": 50655 + }, + { + "epoch": 3.087059142702116, + "grad_norm": 4.885348796844482, + "learning_rate": 3.160888036179252e-06, + "loss": 0.1532, + "step": 50656 + }, + { + "epoch": 3.087072707542051, + "grad_norm": 4.666478157043457, + "learning_rate": 3.160750993558997e-06, + "loss": 0.1093, + "step": 50657 + }, + { + "epoch": 3.087086272381986, + "grad_norm": 5.905576229095459, + "learning_rate": 3.1606139509387423e-06, + "loss": 0.146, + "step": 50658 + }, + { + "epoch": 3.087099837221921, + "grad_norm": 4.989850044250488, + "learning_rate": 3.160476908318487e-06, + "loss": 0.0994, + "step": 50659 + }, + { + "epoch": 3.087113402061856, + "grad_norm": 6.3896164894104, + "learning_rate": 3.1603398656982327e-06, + "loss": 0.1061, + "step": 50660 + }, + { + "epoch": 3.0871269669017907, + "grad_norm": 3.499872922897339, + "learning_rate": 3.1602028230779774e-06, + "loss": 0.0974, + "step": 50661 + }, + { + "epoch": 3.0871405317417255, + "grad_norm": 4.691530227661133, + "learning_rate": 3.1600657804577226e-06, + "loss": 0.1654, + "step": 50662 + }, + { + "epoch": 3.0871540965816604, + "grad_norm": 3.528207302093506, + "learning_rate": 3.159928737837468e-06, + "loss": 0.1248, + "step": 50663 + }, + { + "epoch": 3.0871676614215953, + "grad_norm": 4.39634895324707, + "learning_rate": 3.159791695217213e-06, + "loss": 0.1172, + "step": 50664 + }, + { + "epoch": 3.08718122626153, + "grad_norm": 4.09644079208374, + "learning_rate": 3.1596546525969577e-06, + "loss": 0.1496, + "step": 50665 + }, + { + "epoch": 3.087194791101465, + "grad_norm": 3.821821928024292, + "learning_rate": 3.159517609976703e-06, + "loss": 0.1109, + "step": 50666 + }, + { + "epoch": 3.0872083559414, + "grad_norm": 5.566218852996826, + "learning_rate": 3.159380567356448e-06, + "loss": 0.1267, + "step": 50667 + }, + { + "epoch": 3.0872219207813347, + "grad_norm": 3.4676926136016846, + "learning_rate": 3.159243524736193e-06, + "loss": 0.0594, + "step": 50668 + }, + { + "epoch": 3.0872354856212696, + "grad_norm": 2.8759469985961914, + "learning_rate": 3.1591064821159384e-06, + "loss": 0.09, + "step": 50669 + }, + { + "epoch": 3.0872490504612045, + "grad_norm": 4.282042503356934, + "learning_rate": 3.158969439495683e-06, + "loss": 0.1001, + "step": 50670 + }, + { + "epoch": 3.0872626153011393, + "grad_norm": 5.625671863555908, + "learning_rate": 3.158832396875429e-06, + "loss": 0.174, + "step": 50671 + }, + { + "epoch": 3.087276180141074, + "grad_norm": 3.3526220321655273, + "learning_rate": 3.1586953542551736e-06, + "loss": 0.0661, + "step": 50672 + }, + { + "epoch": 3.087289744981009, + "grad_norm": 2.6674020290374756, + "learning_rate": 3.1585583116349187e-06, + "loss": 0.1053, + "step": 50673 + }, + { + "epoch": 3.087303309820944, + "grad_norm": 3.992562770843506, + "learning_rate": 3.158421269014664e-06, + "loss": 0.118, + "step": 50674 + }, + { + "epoch": 3.0873168746608792, + "grad_norm": 3.987948417663574, + "learning_rate": 3.158284226394409e-06, + "loss": 0.1159, + "step": 50675 + }, + { + "epoch": 3.087330439500814, + "grad_norm": 3.5839483737945557, + "learning_rate": 3.158147183774154e-06, + "loss": 0.0971, + "step": 50676 + }, + { + "epoch": 3.087344004340749, + "grad_norm": 3.445918321609497, + "learning_rate": 3.1580101411538995e-06, + "loss": 0.0914, + "step": 50677 + }, + { + "epoch": 3.087357569180684, + "grad_norm": 3.542975902557373, + "learning_rate": 3.157873098533644e-06, + "loss": 0.1149, + "step": 50678 + }, + { + "epoch": 3.0873711340206187, + "grad_norm": 6.297481536865234, + "learning_rate": 3.157736055913389e-06, + "loss": 0.131, + "step": 50679 + }, + { + "epoch": 3.0873846988605536, + "grad_norm": 3.15964674949646, + "learning_rate": 3.1575990132931346e-06, + "loss": 0.0543, + "step": 50680 + }, + { + "epoch": 3.0873982637004884, + "grad_norm": 5.035623550415039, + "learning_rate": 3.1574619706728793e-06, + "loss": 0.1024, + "step": 50681 + }, + { + "epoch": 3.0874118285404233, + "grad_norm": 3.8207204341888428, + "learning_rate": 3.1573249280526245e-06, + "loss": 0.1438, + "step": 50682 + }, + { + "epoch": 3.087425393380358, + "grad_norm": 3.0177974700927734, + "learning_rate": 3.1571878854323697e-06, + "loss": 0.0701, + "step": 50683 + }, + { + "epoch": 3.087438958220293, + "grad_norm": 3.7978854179382324, + "learning_rate": 3.157050842812115e-06, + "loss": 0.07, + "step": 50684 + }, + { + "epoch": 3.087452523060228, + "grad_norm": 4.207542896270752, + "learning_rate": 3.1569138001918596e-06, + "loss": 0.163, + "step": 50685 + }, + { + "epoch": 3.0874660879001627, + "grad_norm": 3.5560121536254883, + "learning_rate": 3.1567767575716052e-06, + "loss": 0.0798, + "step": 50686 + }, + { + "epoch": 3.0874796527400976, + "grad_norm": 4.388232707977295, + "learning_rate": 3.15663971495135e-06, + "loss": 0.0702, + "step": 50687 + }, + { + "epoch": 3.0874932175800325, + "grad_norm": 6.182225704193115, + "learning_rate": 3.1565026723310956e-06, + "loss": 0.1182, + "step": 50688 + }, + { + "epoch": 3.0875067824199673, + "grad_norm": 5.914413928985596, + "learning_rate": 3.1563656297108403e-06, + "loss": 0.1403, + "step": 50689 + }, + { + "epoch": 3.087520347259902, + "grad_norm": 2.9920523166656494, + "learning_rate": 3.1562285870905855e-06, + "loss": 0.0391, + "step": 50690 + }, + { + "epoch": 3.087533912099837, + "grad_norm": 3.571639060974121, + "learning_rate": 3.1560915444703307e-06, + "loss": 0.116, + "step": 50691 + }, + { + "epoch": 3.087547476939772, + "grad_norm": 3.914885997772217, + "learning_rate": 3.155954501850076e-06, + "loss": 0.1453, + "step": 50692 + }, + { + "epoch": 3.087561041779707, + "grad_norm": 3.1288199424743652, + "learning_rate": 3.1558174592298206e-06, + "loss": 0.1101, + "step": 50693 + }, + { + "epoch": 3.0875746066196417, + "grad_norm": 3.4847898483276367, + "learning_rate": 3.1556804166095654e-06, + "loss": 0.0732, + "step": 50694 + }, + { + "epoch": 3.087588171459577, + "grad_norm": 4.483119487762451, + "learning_rate": 3.155543373989311e-06, + "loss": 0.0912, + "step": 50695 + }, + { + "epoch": 3.087601736299512, + "grad_norm": 5.436282157897949, + "learning_rate": 3.1554063313690557e-06, + "loss": 0.141, + "step": 50696 + }, + { + "epoch": 3.0876153011394467, + "grad_norm": 3.131934642791748, + "learning_rate": 3.1552692887488013e-06, + "loss": 0.0589, + "step": 50697 + }, + { + "epoch": 3.0876288659793816, + "grad_norm": 3.565828323364258, + "learning_rate": 3.155132246128546e-06, + "loss": 0.1152, + "step": 50698 + }, + { + "epoch": 3.0876424308193164, + "grad_norm": 3.4111502170562744, + "learning_rate": 3.1549952035082913e-06, + "loss": 0.0754, + "step": 50699 + }, + { + "epoch": 3.0876559956592513, + "grad_norm": 5.043093681335449, + "learning_rate": 3.1548581608880365e-06, + "loss": 0.1453, + "step": 50700 + }, + { + "epoch": 3.087669560499186, + "grad_norm": 2.8060169219970703, + "learning_rate": 3.1547211182677816e-06, + "loss": 0.0796, + "step": 50701 + }, + { + "epoch": 3.087683125339121, + "grad_norm": 3.7288758754730225, + "learning_rate": 3.1545840756475264e-06, + "loss": 0.0708, + "step": 50702 + }, + { + "epoch": 3.087696690179056, + "grad_norm": 4.033035755157471, + "learning_rate": 3.154447033027272e-06, + "loss": 0.071, + "step": 50703 + }, + { + "epoch": 3.0877102550189908, + "grad_norm": 3.5081911087036133, + "learning_rate": 3.1543099904070167e-06, + "loss": 0.0665, + "step": 50704 + }, + { + "epoch": 3.0877238198589256, + "grad_norm": 2.9609293937683105, + "learning_rate": 3.1541729477867623e-06, + "loss": 0.0619, + "step": 50705 + }, + { + "epoch": 3.0877373846988605, + "grad_norm": 2.566652297973633, + "learning_rate": 3.154035905166507e-06, + "loss": 0.0494, + "step": 50706 + }, + { + "epoch": 3.0877509495387954, + "grad_norm": 4.018341064453125, + "learning_rate": 3.153898862546252e-06, + "loss": 0.0742, + "step": 50707 + }, + { + "epoch": 3.0877645143787302, + "grad_norm": 3.3138678073883057, + "learning_rate": 3.1537618199259975e-06, + "loss": 0.071, + "step": 50708 + }, + { + "epoch": 3.087778079218665, + "grad_norm": 5.7685160636901855, + "learning_rate": 3.1536247773057422e-06, + "loss": 0.0911, + "step": 50709 + }, + { + "epoch": 3.0877916440586, + "grad_norm": 4.722752571105957, + "learning_rate": 3.1534877346854874e-06, + "loss": 0.0742, + "step": 50710 + }, + { + "epoch": 3.087805208898535, + "grad_norm": 5.07375955581665, + "learning_rate": 3.1533506920652326e-06, + "loss": 0.1137, + "step": 50711 + }, + { + "epoch": 3.0878187737384697, + "grad_norm": 3.52429461479187, + "learning_rate": 3.1532136494449778e-06, + "loss": 0.0694, + "step": 50712 + }, + { + "epoch": 3.087832338578405, + "grad_norm": 4.344387054443359, + "learning_rate": 3.1530766068247225e-06, + "loss": 0.091, + "step": 50713 + }, + { + "epoch": 3.08784590341834, + "grad_norm": 4.541759490966797, + "learning_rate": 3.152939564204468e-06, + "loss": 0.1675, + "step": 50714 + }, + { + "epoch": 3.0878594682582747, + "grad_norm": 5.467320442199707, + "learning_rate": 3.152802521584213e-06, + "loss": 0.1649, + "step": 50715 + }, + { + "epoch": 3.0878730330982096, + "grad_norm": 4.559815406799316, + "learning_rate": 3.152665478963958e-06, + "loss": 0.1374, + "step": 50716 + }, + { + "epoch": 3.0878865979381445, + "grad_norm": 3.8504385948181152, + "learning_rate": 3.1525284363437032e-06, + "loss": 0.0836, + "step": 50717 + }, + { + "epoch": 3.0879001627780793, + "grad_norm": 4.224707126617432, + "learning_rate": 3.1523913937234484e-06, + "loss": 0.1121, + "step": 50718 + }, + { + "epoch": 3.087913727618014, + "grad_norm": 4.7024946212768555, + "learning_rate": 3.152254351103193e-06, + "loss": 0.1366, + "step": 50719 + }, + { + "epoch": 3.087927292457949, + "grad_norm": 3.8131003379821777, + "learning_rate": 3.1521173084829383e-06, + "loss": 0.1351, + "step": 50720 + }, + { + "epoch": 3.087940857297884, + "grad_norm": 8.049172401428223, + "learning_rate": 3.1519802658626835e-06, + "loss": 0.27, + "step": 50721 + }, + { + "epoch": 3.087954422137819, + "grad_norm": 3.5741846561431885, + "learning_rate": 3.1518432232424283e-06, + "loss": 0.0721, + "step": 50722 + }, + { + "epoch": 3.0879679869777537, + "grad_norm": 5.188390254974365, + "learning_rate": 3.151706180622174e-06, + "loss": 0.1662, + "step": 50723 + }, + { + "epoch": 3.0879815518176885, + "grad_norm": 5.25888729095459, + "learning_rate": 3.1515691380019186e-06, + "loss": 0.1989, + "step": 50724 + }, + { + "epoch": 3.0879951166576234, + "grad_norm": 3.529569149017334, + "learning_rate": 3.1514320953816642e-06, + "loss": 0.1385, + "step": 50725 + }, + { + "epoch": 3.0880086814975582, + "grad_norm": 3.580763816833496, + "learning_rate": 3.151295052761409e-06, + "loss": 0.1271, + "step": 50726 + }, + { + "epoch": 3.088022246337493, + "grad_norm": 3.515552520751953, + "learning_rate": 3.151158010141154e-06, + "loss": 0.0852, + "step": 50727 + }, + { + "epoch": 3.088035811177428, + "grad_norm": 3.2791495323181152, + "learning_rate": 3.1510209675208993e-06, + "loss": 0.0613, + "step": 50728 + }, + { + "epoch": 3.088049376017363, + "grad_norm": 2.917651653289795, + "learning_rate": 3.1508839249006445e-06, + "loss": 0.079, + "step": 50729 + }, + { + "epoch": 3.0880629408572977, + "grad_norm": 3.756786346435547, + "learning_rate": 3.1507468822803893e-06, + "loss": 0.1437, + "step": 50730 + }, + { + "epoch": 3.0880765056972326, + "grad_norm": 3.5480873584747314, + "learning_rate": 3.150609839660135e-06, + "loss": 0.0854, + "step": 50731 + }, + { + "epoch": 3.0880900705371674, + "grad_norm": 5.842038631439209, + "learning_rate": 3.1504727970398796e-06, + "loss": 0.2265, + "step": 50732 + }, + { + "epoch": 3.0881036353771028, + "grad_norm": 4.227360248565674, + "learning_rate": 3.150335754419625e-06, + "loss": 0.1537, + "step": 50733 + }, + { + "epoch": 3.0881172002170376, + "grad_norm": 4.023140907287598, + "learning_rate": 3.15019871179937e-06, + "loss": 0.1077, + "step": 50734 + }, + { + "epoch": 3.0881307650569725, + "grad_norm": 5.913344383239746, + "learning_rate": 3.1500616691791147e-06, + "loss": 0.1294, + "step": 50735 + }, + { + "epoch": 3.0881443298969073, + "grad_norm": 4.3016462326049805, + "learning_rate": 3.14992462655886e-06, + "loss": 0.1005, + "step": 50736 + }, + { + "epoch": 3.088157894736842, + "grad_norm": 4.244360446929932, + "learning_rate": 3.149787583938605e-06, + "loss": 0.0979, + "step": 50737 + }, + { + "epoch": 3.088171459576777, + "grad_norm": 4.947944641113281, + "learning_rate": 3.1496505413183503e-06, + "loss": 0.1461, + "step": 50738 + }, + { + "epoch": 3.088185024416712, + "grad_norm": 3.4166691303253174, + "learning_rate": 3.149513498698095e-06, + "loss": 0.1148, + "step": 50739 + }, + { + "epoch": 3.088198589256647, + "grad_norm": 4.143735408782959, + "learning_rate": 3.1493764560778406e-06, + "loss": 0.067, + "step": 50740 + }, + { + "epoch": 3.0882121540965817, + "grad_norm": 3.525057554244995, + "learning_rate": 3.1492394134575854e-06, + "loss": 0.088, + "step": 50741 + }, + { + "epoch": 3.0882257189365165, + "grad_norm": 2.7820301055908203, + "learning_rate": 3.149102370837331e-06, + "loss": 0.0774, + "step": 50742 + }, + { + "epoch": 3.0882392837764514, + "grad_norm": 5.0960211753845215, + "learning_rate": 3.1489653282170758e-06, + "loss": 0.2196, + "step": 50743 + }, + { + "epoch": 3.0882528486163863, + "grad_norm": 3.938100576400757, + "learning_rate": 3.148828285596821e-06, + "loss": 0.074, + "step": 50744 + }, + { + "epoch": 3.088266413456321, + "grad_norm": 3.576120138168335, + "learning_rate": 3.148691242976566e-06, + "loss": 0.1187, + "step": 50745 + }, + { + "epoch": 3.088279978296256, + "grad_norm": 5.836338996887207, + "learning_rate": 3.1485542003563113e-06, + "loss": 0.0997, + "step": 50746 + }, + { + "epoch": 3.088293543136191, + "grad_norm": 4.658596992492676, + "learning_rate": 3.148417157736056e-06, + "loss": 0.0918, + "step": 50747 + }, + { + "epoch": 3.0883071079761257, + "grad_norm": 3.8966972827911377, + "learning_rate": 3.148280115115801e-06, + "loss": 0.0959, + "step": 50748 + }, + { + "epoch": 3.0883206728160606, + "grad_norm": 3.339479684829712, + "learning_rate": 3.1481430724955464e-06, + "loss": 0.0555, + "step": 50749 + }, + { + "epoch": 3.0883342376559955, + "grad_norm": 5.1340107917785645, + "learning_rate": 3.148006029875291e-06, + "loss": 0.1433, + "step": 50750 + }, + { + "epoch": 3.0883478024959308, + "grad_norm": 4.038206577301025, + "learning_rate": 3.1478689872550368e-06, + "loss": 0.0855, + "step": 50751 + }, + { + "epoch": 3.0883613673358656, + "grad_norm": 4.56103515625, + "learning_rate": 3.1477319446347815e-06, + "loss": 0.1275, + "step": 50752 + }, + { + "epoch": 3.0883749321758005, + "grad_norm": 4.525640964508057, + "learning_rate": 3.1475949020145267e-06, + "loss": 0.1874, + "step": 50753 + }, + { + "epoch": 3.0883884970157354, + "grad_norm": 3.7656822204589844, + "learning_rate": 3.147457859394272e-06, + "loss": 0.0821, + "step": 50754 + }, + { + "epoch": 3.0884020618556702, + "grad_norm": 5.369359016418457, + "learning_rate": 3.147320816774017e-06, + "loss": 0.1322, + "step": 50755 + }, + { + "epoch": 3.088415626695605, + "grad_norm": 4.753566741943359, + "learning_rate": 3.147183774153762e-06, + "loss": 0.1291, + "step": 50756 + }, + { + "epoch": 3.08842919153554, + "grad_norm": 5.872854232788086, + "learning_rate": 3.1470467315335074e-06, + "loss": 0.1467, + "step": 50757 + }, + { + "epoch": 3.088442756375475, + "grad_norm": 3.225856065750122, + "learning_rate": 3.146909688913252e-06, + "loss": 0.1405, + "step": 50758 + }, + { + "epoch": 3.0884563212154097, + "grad_norm": 3.90259051322937, + "learning_rate": 3.1467726462929978e-06, + "loss": 0.1609, + "step": 50759 + }, + { + "epoch": 3.0884698860553446, + "grad_norm": 4.593652725219727, + "learning_rate": 3.1466356036727425e-06, + "loss": 0.125, + "step": 50760 + }, + { + "epoch": 3.0884834508952794, + "grad_norm": 4.963886737823486, + "learning_rate": 3.1464985610524877e-06, + "loss": 0.1098, + "step": 50761 + }, + { + "epoch": 3.0884970157352143, + "grad_norm": 3.3098669052124023, + "learning_rate": 3.146361518432233e-06, + "loss": 0.1116, + "step": 50762 + }, + { + "epoch": 3.088510580575149, + "grad_norm": 3.3283040523529053, + "learning_rate": 3.1462244758119776e-06, + "loss": 0.0721, + "step": 50763 + }, + { + "epoch": 3.088524145415084, + "grad_norm": 4.932735443115234, + "learning_rate": 3.146087433191723e-06, + "loss": 0.1557, + "step": 50764 + }, + { + "epoch": 3.088537710255019, + "grad_norm": 5.069260597229004, + "learning_rate": 3.1459503905714676e-06, + "loss": 0.1097, + "step": 50765 + }, + { + "epoch": 3.0885512750949538, + "grad_norm": 5.503778457641602, + "learning_rate": 3.145813347951213e-06, + "loss": 0.1912, + "step": 50766 + }, + { + "epoch": 3.0885648399348886, + "grad_norm": 6.228158473968506, + "learning_rate": 3.145676305330958e-06, + "loss": 0.1948, + "step": 50767 + }, + { + "epoch": 3.0885784047748235, + "grad_norm": 4.728517055511475, + "learning_rate": 3.1455392627107035e-06, + "loss": 0.1061, + "step": 50768 + }, + { + "epoch": 3.0885919696147583, + "grad_norm": 4.0798468589782715, + "learning_rate": 3.1454022200904483e-06, + "loss": 0.1085, + "step": 50769 + }, + { + "epoch": 3.088605534454693, + "grad_norm": 3.5577821731567383, + "learning_rate": 3.1452651774701935e-06, + "loss": 0.1077, + "step": 50770 + }, + { + "epoch": 3.0886190992946285, + "grad_norm": 5.026297092437744, + "learning_rate": 3.1451281348499386e-06, + "loss": 0.1308, + "step": 50771 + }, + { + "epoch": 3.0886326641345634, + "grad_norm": 4.8385162353515625, + "learning_rate": 3.144991092229684e-06, + "loss": 0.1358, + "step": 50772 + }, + { + "epoch": 3.0886462289744983, + "grad_norm": 5.01192045211792, + "learning_rate": 3.1448540496094286e-06, + "loss": 0.1752, + "step": 50773 + }, + { + "epoch": 3.088659793814433, + "grad_norm": 6.029703140258789, + "learning_rate": 3.144717006989174e-06, + "loss": 0.1952, + "step": 50774 + }, + { + "epoch": 3.088673358654368, + "grad_norm": 3.418208360671997, + "learning_rate": 3.144579964368919e-06, + "loss": 0.1017, + "step": 50775 + }, + { + "epoch": 3.088686923494303, + "grad_norm": 3.7365100383758545, + "learning_rate": 3.1444429217486637e-06, + "loss": 0.1383, + "step": 50776 + }, + { + "epoch": 3.0887004883342377, + "grad_norm": 4.4518513679504395, + "learning_rate": 3.1443058791284093e-06, + "loss": 0.1351, + "step": 50777 + }, + { + "epoch": 3.0887140531741726, + "grad_norm": 3.788566827774048, + "learning_rate": 3.144168836508154e-06, + "loss": 0.1248, + "step": 50778 + }, + { + "epoch": 3.0887276180141074, + "grad_norm": 4.354734897613525, + "learning_rate": 3.1440317938878997e-06, + "loss": 0.1067, + "step": 50779 + }, + { + "epoch": 3.0887411828540423, + "grad_norm": 5.357298374176025, + "learning_rate": 3.1438947512676444e-06, + "loss": 0.1631, + "step": 50780 + }, + { + "epoch": 3.088754747693977, + "grad_norm": 4.8731889724731445, + "learning_rate": 3.1437577086473896e-06, + "loss": 0.1572, + "step": 50781 + }, + { + "epoch": 3.088768312533912, + "grad_norm": 3.8757331371307373, + "learning_rate": 3.1436206660271348e-06, + "loss": 0.1812, + "step": 50782 + }, + { + "epoch": 3.088781877373847, + "grad_norm": 3.9700918197631836, + "learning_rate": 3.14348362340688e-06, + "loss": 0.1164, + "step": 50783 + }, + { + "epoch": 3.0887954422137818, + "grad_norm": 4.105166435241699, + "learning_rate": 3.1433465807866247e-06, + "loss": 0.0994, + "step": 50784 + }, + { + "epoch": 3.0888090070537166, + "grad_norm": 4.820668697357178, + "learning_rate": 3.1432095381663703e-06, + "loss": 0.0976, + "step": 50785 + }, + { + "epoch": 3.0888225718936515, + "grad_norm": 3.502149820327759, + "learning_rate": 3.143072495546115e-06, + "loss": 0.1047, + "step": 50786 + }, + { + "epoch": 3.0888361367335864, + "grad_norm": 4.502040386199951, + "learning_rate": 3.1429354529258602e-06, + "loss": 0.0893, + "step": 50787 + }, + { + "epoch": 3.0888497015735212, + "grad_norm": 2.6007261276245117, + "learning_rate": 3.1427984103056054e-06, + "loss": 0.0776, + "step": 50788 + }, + { + "epoch": 3.0888632664134565, + "grad_norm": 2.8079023361206055, + "learning_rate": 3.14266136768535e-06, + "loss": 0.0679, + "step": 50789 + }, + { + "epoch": 3.0888768312533914, + "grad_norm": 2.9830400943756104, + "learning_rate": 3.1425243250650954e-06, + "loss": 0.0821, + "step": 50790 + }, + { + "epoch": 3.0888903960933263, + "grad_norm": 2.8268704414367676, + "learning_rate": 3.1423872824448405e-06, + "loss": 0.0996, + "step": 50791 + }, + { + "epoch": 3.088903960933261, + "grad_norm": 4.680916786193848, + "learning_rate": 3.1422502398245857e-06, + "loss": 0.1534, + "step": 50792 + }, + { + "epoch": 3.088917525773196, + "grad_norm": 3.9859824180603027, + "learning_rate": 3.1421131972043305e-06, + "loss": 0.0774, + "step": 50793 + }, + { + "epoch": 3.088931090613131, + "grad_norm": 4.221418857574463, + "learning_rate": 3.141976154584076e-06, + "loss": 0.1047, + "step": 50794 + }, + { + "epoch": 3.0889446554530657, + "grad_norm": 3.4167442321777344, + "learning_rate": 3.141839111963821e-06, + "loss": 0.088, + "step": 50795 + }, + { + "epoch": 3.0889582202930006, + "grad_norm": 5.285792827606201, + "learning_rate": 3.1417020693435664e-06, + "loss": 0.1193, + "step": 50796 + }, + { + "epoch": 3.0889717851329355, + "grad_norm": 4.454182147979736, + "learning_rate": 3.141565026723311e-06, + "loss": 0.1287, + "step": 50797 + }, + { + "epoch": 3.0889853499728703, + "grad_norm": 3.6478822231292725, + "learning_rate": 3.1414279841030564e-06, + "loss": 0.1245, + "step": 50798 + }, + { + "epoch": 3.088998914812805, + "grad_norm": 4.286261558532715, + "learning_rate": 3.1412909414828015e-06, + "loss": 0.0798, + "step": 50799 + }, + { + "epoch": 3.08901247965274, + "grad_norm": 3.1081125736236572, + "learning_rate": 3.1411538988625467e-06, + "loss": 0.1065, + "step": 50800 + }, + { + "epoch": 3.089026044492675, + "grad_norm": 4.078763961791992, + "learning_rate": 3.1410168562422915e-06, + "loss": 0.0746, + "step": 50801 + }, + { + "epoch": 3.08903960933261, + "grad_norm": 3.2651634216308594, + "learning_rate": 3.140879813622037e-06, + "loss": 0.07, + "step": 50802 + }, + { + "epoch": 3.0890531741725447, + "grad_norm": 3.8984479904174805, + "learning_rate": 3.140742771001782e-06, + "loss": 0.0829, + "step": 50803 + }, + { + "epoch": 3.0890667390124795, + "grad_norm": 3.543321132659912, + "learning_rate": 3.1406057283815266e-06, + "loss": 0.0604, + "step": 50804 + }, + { + "epoch": 3.0890803038524144, + "grad_norm": 7.714803218841553, + "learning_rate": 3.140468685761272e-06, + "loss": 0.2335, + "step": 50805 + }, + { + "epoch": 3.0890938686923493, + "grad_norm": 4.445099353790283, + "learning_rate": 3.140331643141017e-06, + "loss": 0.1369, + "step": 50806 + }, + { + "epoch": 3.089107433532284, + "grad_norm": 3.635514974594116, + "learning_rate": 3.140194600520762e-06, + "loss": 0.1107, + "step": 50807 + }, + { + "epoch": 3.089120998372219, + "grad_norm": 5.203861236572266, + "learning_rate": 3.1400575579005073e-06, + "loss": 0.1017, + "step": 50808 + }, + { + "epoch": 3.0891345632121543, + "grad_norm": 2.8391001224517822, + "learning_rate": 3.1399205152802525e-06, + "loss": 0.0907, + "step": 50809 + }, + { + "epoch": 3.089148128052089, + "grad_norm": 4.379624366760254, + "learning_rate": 3.1397834726599972e-06, + "loss": 0.12, + "step": 50810 + }, + { + "epoch": 3.089161692892024, + "grad_norm": 3.4305524826049805, + "learning_rate": 3.139646430039743e-06, + "loss": 0.1138, + "step": 50811 + }, + { + "epoch": 3.089175257731959, + "grad_norm": 3.4477529525756836, + "learning_rate": 3.1395093874194876e-06, + "loss": 0.1014, + "step": 50812 + }, + { + "epoch": 3.0891888225718938, + "grad_norm": 2.897540807723999, + "learning_rate": 3.139372344799233e-06, + "loss": 0.1229, + "step": 50813 + }, + { + "epoch": 3.0892023874118286, + "grad_norm": 6.0510945320129395, + "learning_rate": 3.139235302178978e-06, + "loss": 0.1408, + "step": 50814 + }, + { + "epoch": 3.0892159522517635, + "grad_norm": 2.9043378829956055, + "learning_rate": 3.139098259558723e-06, + "loss": 0.1163, + "step": 50815 + }, + { + "epoch": 3.0892295170916984, + "grad_norm": 2.5978989601135254, + "learning_rate": 3.1389612169384683e-06, + "loss": 0.0792, + "step": 50816 + }, + { + "epoch": 3.089243081931633, + "grad_norm": 3.897294521331787, + "learning_rate": 3.138824174318213e-06, + "loss": 0.1089, + "step": 50817 + }, + { + "epoch": 3.089256646771568, + "grad_norm": 4.639462471008301, + "learning_rate": 3.1386871316979582e-06, + "loss": 0.1403, + "step": 50818 + }, + { + "epoch": 3.089270211611503, + "grad_norm": 5.4683027267456055, + "learning_rate": 3.138550089077703e-06, + "loss": 0.1862, + "step": 50819 + }, + { + "epoch": 3.089283776451438, + "grad_norm": 3.616004467010498, + "learning_rate": 3.1384130464574486e-06, + "loss": 0.0773, + "step": 50820 + }, + { + "epoch": 3.0892973412913727, + "grad_norm": 3.4759702682495117, + "learning_rate": 3.1382760038371934e-06, + "loss": 0.0788, + "step": 50821 + }, + { + "epoch": 3.0893109061313075, + "grad_norm": 3.799797773361206, + "learning_rate": 3.138138961216939e-06, + "loss": 0.0591, + "step": 50822 + }, + { + "epoch": 3.0893244709712424, + "grad_norm": 3.413588047027588, + "learning_rate": 3.1380019185966837e-06, + "loss": 0.0547, + "step": 50823 + }, + { + "epoch": 3.0893380358111773, + "grad_norm": 3.9552526473999023, + "learning_rate": 3.137864875976429e-06, + "loss": 0.1301, + "step": 50824 + }, + { + "epoch": 3.089351600651112, + "grad_norm": 3.7583425045013428, + "learning_rate": 3.137727833356174e-06, + "loss": 0.1197, + "step": 50825 + }, + { + "epoch": 3.089365165491047, + "grad_norm": 3.551020860671997, + "learning_rate": 3.1375907907359193e-06, + "loss": 0.0859, + "step": 50826 + }, + { + "epoch": 3.0893787303309823, + "grad_norm": 4.848618984222412, + "learning_rate": 3.137453748115664e-06, + "loss": 0.2109, + "step": 50827 + }, + { + "epoch": 3.089392295170917, + "grad_norm": 6.2259392738342285, + "learning_rate": 3.1373167054954096e-06, + "loss": 0.1413, + "step": 50828 + }, + { + "epoch": 3.089405860010852, + "grad_norm": 4.425434112548828, + "learning_rate": 3.1371796628751544e-06, + "loss": 0.0992, + "step": 50829 + }, + { + "epoch": 3.089419424850787, + "grad_norm": 5.492574691772461, + "learning_rate": 3.1370426202549e-06, + "loss": 0.1282, + "step": 50830 + }, + { + "epoch": 3.0894329896907218, + "grad_norm": 3.6944494247436523, + "learning_rate": 3.1369055776346447e-06, + "loss": 0.0676, + "step": 50831 + }, + { + "epoch": 3.0894465545306566, + "grad_norm": 5.110254764556885, + "learning_rate": 3.1367685350143895e-06, + "loss": 0.1051, + "step": 50832 + }, + { + "epoch": 3.0894601193705915, + "grad_norm": 3.8317043781280518, + "learning_rate": 3.136631492394135e-06, + "loss": 0.1597, + "step": 50833 + }, + { + "epoch": 3.0894736842105264, + "grad_norm": 5.859023571014404, + "learning_rate": 3.13649444977388e-06, + "loss": 0.167, + "step": 50834 + }, + { + "epoch": 3.0894872490504612, + "grad_norm": 5.255301475524902, + "learning_rate": 3.136357407153625e-06, + "loss": 0.1772, + "step": 50835 + }, + { + "epoch": 3.089500813890396, + "grad_norm": 2.9358742237091064, + "learning_rate": 3.1362203645333698e-06, + "loss": 0.0757, + "step": 50836 + }, + { + "epoch": 3.089514378730331, + "grad_norm": 2.6141998767852783, + "learning_rate": 3.1360833219131154e-06, + "loss": 0.0578, + "step": 50837 + }, + { + "epoch": 3.089527943570266, + "grad_norm": 2.9342756271362305, + "learning_rate": 3.13594627929286e-06, + "loss": 0.0856, + "step": 50838 + }, + { + "epoch": 3.0895415084102007, + "grad_norm": 4.862823963165283, + "learning_rate": 3.1358092366726057e-06, + "loss": 0.1461, + "step": 50839 + }, + { + "epoch": 3.0895550732501356, + "grad_norm": 5.174872875213623, + "learning_rate": 3.1356721940523505e-06, + "loss": 0.1892, + "step": 50840 + }, + { + "epoch": 3.0895686380900704, + "grad_norm": 5.744875907897949, + "learning_rate": 3.1355351514320957e-06, + "loss": 0.1788, + "step": 50841 + }, + { + "epoch": 3.0895822029300053, + "grad_norm": 4.880739212036133, + "learning_rate": 3.135398108811841e-06, + "loss": 0.2694, + "step": 50842 + }, + { + "epoch": 3.08959576776994, + "grad_norm": 2.615691900253296, + "learning_rate": 3.135261066191586e-06, + "loss": 0.053, + "step": 50843 + }, + { + "epoch": 3.089609332609875, + "grad_norm": 4.583766937255859, + "learning_rate": 3.1351240235713308e-06, + "loss": 0.0791, + "step": 50844 + }, + { + "epoch": 3.08962289744981, + "grad_norm": 4.603076934814453, + "learning_rate": 3.134986980951076e-06, + "loss": 0.1309, + "step": 50845 + }, + { + "epoch": 3.0896364622897448, + "grad_norm": 3.7164976596832275, + "learning_rate": 3.134849938330821e-06, + "loss": 0.0825, + "step": 50846 + }, + { + "epoch": 3.08965002712968, + "grad_norm": 4.758041858673096, + "learning_rate": 3.134712895710566e-06, + "loss": 0.2887, + "step": 50847 + }, + { + "epoch": 3.089663591969615, + "grad_norm": 4.729218482971191, + "learning_rate": 3.1345758530903115e-06, + "loss": 0.1817, + "step": 50848 + }, + { + "epoch": 3.08967715680955, + "grad_norm": 3.766561269760132, + "learning_rate": 3.1344388104700562e-06, + "loss": 0.0889, + "step": 50849 + }, + { + "epoch": 3.0896907216494847, + "grad_norm": 2.5324392318725586, + "learning_rate": 3.134301767849802e-06, + "loss": 0.0689, + "step": 50850 + }, + { + "epoch": 3.0897042864894195, + "grad_norm": 4.069735050201416, + "learning_rate": 3.1341647252295466e-06, + "loss": 0.2104, + "step": 50851 + }, + { + "epoch": 3.0897178513293544, + "grad_norm": 4.223333358764648, + "learning_rate": 3.1340276826092918e-06, + "loss": 0.1207, + "step": 50852 + }, + { + "epoch": 3.0897314161692893, + "grad_norm": 6.3860979080200195, + "learning_rate": 3.133890639989037e-06, + "loss": 0.119, + "step": 50853 + }, + { + "epoch": 3.089744981009224, + "grad_norm": 5.3671183586120605, + "learning_rate": 3.133753597368782e-06, + "loss": 0.1252, + "step": 50854 + }, + { + "epoch": 3.089758545849159, + "grad_norm": 3.9024624824523926, + "learning_rate": 3.133616554748527e-06, + "loss": 0.0698, + "step": 50855 + }, + { + "epoch": 3.089772110689094, + "grad_norm": 3.794644594192505, + "learning_rate": 3.1334795121282725e-06, + "loss": 0.0748, + "step": 50856 + }, + { + "epoch": 3.0897856755290287, + "grad_norm": 4.895856857299805, + "learning_rate": 3.1333424695080173e-06, + "loss": 0.2025, + "step": 50857 + }, + { + "epoch": 3.0897992403689636, + "grad_norm": 2.427694320678711, + "learning_rate": 3.133205426887762e-06, + "loss": 0.0597, + "step": 50858 + }, + { + "epoch": 3.0898128052088984, + "grad_norm": 7.213088035583496, + "learning_rate": 3.1330683842675076e-06, + "loss": 0.167, + "step": 50859 + }, + { + "epoch": 3.0898263700488333, + "grad_norm": 4.041142463684082, + "learning_rate": 3.1329313416472524e-06, + "loss": 0.128, + "step": 50860 + }, + { + "epoch": 3.089839934888768, + "grad_norm": 4.374770164489746, + "learning_rate": 3.1327942990269975e-06, + "loss": 0.159, + "step": 50861 + }, + { + "epoch": 3.089853499728703, + "grad_norm": 4.249978065490723, + "learning_rate": 3.1326572564067427e-06, + "loss": 0.1622, + "step": 50862 + }, + { + "epoch": 3.089867064568638, + "grad_norm": 4.483623027801514, + "learning_rate": 3.132520213786488e-06, + "loss": 0.1719, + "step": 50863 + }, + { + "epoch": 3.0898806294085728, + "grad_norm": 4.371545791625977, + "learning_rate": 3.1323831711662327e-06, + "loss": 0.181, + "step": 50864 + }, + { + "epoch": 3.089894194248508, + "grad_norm": 4.451322078704834, + "learning_rate": 3.1322461285459783e-06, + "loss": 0.1232, + "step": 50865 + }, + { + "epoch": 3.089907759088443, + "grad_norm": 3.964162588119507, + "learning_rate": 3.132109085925723e-06, + "loss": 0.1279, + "step": 50866 + }, + { + "epoch": 3.089921323928378, + "grad_norm": 3.294034004211426, + "learning_rate": 3.1319720433054686e-06, + "loss": 0.0592, + "step": 50867 + }, + { + "epoch": 3.0899348887683127, + "grad_norm": 3.2062392234802246, + "learning_rate": 3.1318350006852134e-06, + "loss": 0.098, + "step": 50868 + }, + { + "epoch": 3.0899484536082475, + "grad_norm": 2.658921718597412, + "learning_rate": 3.1316979580649586e-06, + "loss": 0.123, + "step": 50869 + }, + { + "epoch": 3.0899620184481824, + "grad_norm": 4.179840564727783, + "learning_rate": 3.1315609154447037e-06, + "loss": 0.157, + "step": 50870 + }, + { + "epoch": 3.0899755832881173, + "grad_norm": 4.5801897048950195, + "learning_rate": 3.131423872824449e-06, + "loss": 0.1801, + "step": 50871 + }, + { + "epoch": 3.089989148128052, + "grad_norm": 3.7980949878692627, + "learning_rate": 3.1312868302041937e-06, + "loss": 0.0814, + "step": 50872 + }, + { + "epoch": 3.090002712967987, + "grad_norm": 4.236822128295898, + "learning_rate": 3.1311497875839384e-06, + "loss": 0.1265, + "step": 50873 + }, + { + "epoch": 3.090016277807922, + "grad_norm": 4.669559001922607, + "learning_rate": 3.131012744963684e-06, + "loss": 0.1295, + "step": 50874 + }, + { + "epoch": 3.0900298426478567, + "grad_norm": 3.4461400508880615, + "learning_rate": 3.1308757023434288e-06, + "loss": 0.0563, + "step": 50875 + }, + { + "epoch": 3.0900434074877916, + "grad_norm": 5.581552982330322, + "learning_rate": 3.1307386597231744e-06, + "loss": 0.0945, + "step": 50876 + }, + { + "epoch": 3.0900569723277265, + "grad_norm": 5.010395050048828, + "learning_rate": 3.130601617102919e-06, + "loss": 0.1854, + "step": 50877 + }, + { + "epoch": 3.0900705371676613, + "grad_norm": 2.8657944202423096, + "learning_rate": 3.1304645744826643e-06, + "loss": 0.0567, + "step": 50878 + }, + { + "epoch": 3.090084102007596, + "grad_norm": 4.562011241912842, + "learning_rate": 3.1303275318624095e-06, + "loss": 0.0614, + "step": 50879 + }, + { + "epoch": 3.090097666847531, + "grad_norm": 5.039026260375977, + "learning_rate": 3.1301904892421547e-06, + "loss": 0.1475, + "step": 50880 + }, + { + "epoch": 3.090111231687466, + "grad_norm": 3.9737508296966553, + "learning_rate": 3.1300534466218994e-06, + "loss": 0.113, + "step": 50881 + }, + { + "epoch": 3.090124796527401, + "grad_norm": 4.733882427215576, + "learning_rate": 3.129916404001645e-06, + "loss": 0.1241, + "step": 50882 + }, + { + "epoch": 3.0901383613673357, + "grad_norm": 4.156677722930908, + "learning_rate": 3.12977936138139e-06, + "loss": 0.1131, + "step": 50883 + }, + { + "epoch": 3.0901519262072705, + "grad_norm": 3.504485845565796, + "learning_rate": 3.1296423187611354e-06, + "loss": 0.1037, + "step": 50884 + }, + { + "epoch": 3.090165491047206, + "grad_norm": 3.9748897552490234, + "learning_rate": 3.12950527614088e-06, + "loss": 0.2276, + "step": 50885 + }, + { + "epoch": 3.0901790558871407, + "grad_norm": 3.0389249324798584, + "learning_rate": 3.129368233520625e-06, + "loss": 0.06, + "step": 50886 + }, + { + "epoch": 3.0901926207270756, + "grad_norm": 5.377063274383545, + "learning_rate": 3.1292311909003705e-06, + "loss": 0.2435, + "step": 50887 + }, + { + "epoch": 3.0902061855670104, + "grad_norm": 4.531270980834961, + "learning_rate": 3.1290941482801153e-06, + "loss": 0.2023, + "step": 50888 + }, + { + "epoch": 3.0902197504069453, + "grad_norm": 4.608437538146973, + "learning_rate": 3.1289571056598604e-06, + "loss": 0.1104, + "step": 50889 + }, + { + "epoch": 3.09023331524688, + "grad_norm": 6.049904823303223, + "learning_rate": 3.128820063039605e-06, + "loss": 0.1278, + "step": 50890 + }, + { + "epoch": 3.090246880086815, + "grad_norm": 3.7160556316375732, + "learning_rate": 3.128683020419351e-06, + "loss": 0.1457, + "step": 50891 + }, + { + "epoch": 3.09026044492675, + "grad_norm": 3.3519787788391113, + "learning_rate": 3.1285459777990956e-06, + "loss": 0.1066, + "step": 50892 + }, + { + "epoch": 3.0902740097666848, + "grad_norm": 5.143502235412598, + "learning_rate": 3.128408935178841e-06, + "loss": 0.1673, + "step": 50893 + }, + { + "epoch": 3.0902875746066196, + "grad_norm": 4.673651218414307, + "learning_rate": 3.128271892558586e-06, + "loss": 0.1452, + "step": 50894 + }, + { + "epoch": 3.0903011394465545, + "grad_norm": 5.059712886810303, + "learning_rate": 3.128134849938331e-06, + "loss": 0.1209, + "step": 50895 + }, + { + "epoch": 3.0903147042864894, + "grad_norm": 5.793701171875, + "learning_rate": 3.1279978073180763e-06, + "loss": 0.172, + "step": 50896 + }, + { + "epoch": 3.090328269126424, + "grad_norm": 3.7392635345458984, + "learning_rate": 3.1278607646978214e-06, + "loss": 0.0924, + "step": 50897 + }, + { + "epoch": 3.090341833966359, + "grad_norm": 4.244540214538574, + "learning_rate": 3.127723722077566e-06, + "loss": 0.086, + "step": 50898 + }, + { + "epoch": 3.090355398806294, + "grad_norm": 2.932842493057251, + "learning_rate": 3.1275866794573114e-06, + "loss": 0.0939, + "step": 50899 + }, + { + "epoch": 3.090368963646229, + "grad_norm": 2.2085273265838623, + "learning_rate": 3.1274496368370566e-06, + "loss": 0.0908, + "step": 50900 + }, + { + "epoch": 3.0903825284861637, + "grad_norm": 3.270387887954712, + "learning_rate": 3.1273125942168013e-06, + "loss": 0.0868, + "step": 50901 + }, + { + "epoch": 3.0903960933260985, + "grad_norm": 3.861802816390991, + "learning_rate": 3.127175551596547e-06, + "loss": 0.1306, + "step": 50902 + }, + { + "epoch": 3.090409658166034, + "grad_norm": 3.269339084625244, + "learning_rate": 3.1270385089762917e-06, + "loss": 0.0969, + "step": 50903 + }, + { + "epoch": 3.0904232230059687, + "grad_norm": 4.7392144203186035, + "learning_rate": 3.1269014663560373e-06, + "loss": 0.1504, + "step": 50904 + }, + { + "epoch": 3.0904367878459036, + "grad_norm": 6.36543607711792, + "learning_rate": 3.126764423735782e-06, + "loss": 0.1796, + "step": 50905 + }, + { + "epoch": 3.0904503526858385, + "grad_norm": 4.8374128341674805, + "learning_rate": 3.126627381115527e-06, + "loss": 0.2201, + "step": 50906 + }, + { + "epoch": 3.0904639175257733, + "grad_norm": 4.11959171295166, + "learning_rate": 3.126490338495272e-06, + "loss": 0.1058, + "step": 50907 + }, + { + "epoch": 3.090477482365708, + "grad_norm": 6.927818298339844, + "learning_rate": 3.1263532958750176e-06, + "loss": 0.22, + "step": 50908 + }, + { + "epoch": 3.090491047205643, + "grad_norm": 3.4103758335113525, + "learning_rate": 3.1262162532547623e-06, + "loss": 0.0827, + "step": 50909 + }, + { + "epoch": 3.090504612045578, + "grad_norm": 6.1973652839660645, + "learning_rate": 3.126079210634508e-06, + "loss": 0.277, + "step": 50910 + }, + { + "epoch": 3.090518176885513, + "grad_norm": 3.566519021987915, + "learning_rate": 3.1259421680142527e-06, + "loss": 0.1148, + "step": 50911 + }, + { + "epoch": 3.0905317417254476, + "grad_norm": 4.354536533355713, + "learning_rate": 3.125805125393998e-06, + "loss": 0.1253, + "step": 50912 + }, + { + "epoch": 3.0905453065653825, + "grad_norm": 5.123315334320068, + "learning_rate": 3.125668082773743e-06, + "loss": 0.179, + "step": 50913 + }, + { + "epoch": 3.0905588714053174, + "grad_norm": 4.9080634117126465, + "learning_rate": 3.125531040153488e-06, + "loss": 0.1733, + "step": 50914 + }, + { + "epoch": 3.0905724362452522, + "grad_norm": 3.8431127071380615, + "learning_rate": 3.125393997533233e-06, + "loss": 0.1205, + "step": 50915 + }, + { + "epoch": 3.090586001085187, + "grad_norm": 4.554977893829346, + "learning_rate": 3.125256954912978e-06, + "loss": 0.077, + "step": 50916 + }, + { + "epoch": 3.090599565925122, + "grad_norm": 3.310955762863159, + "learning_rate": 3.1251199122927233e-06, + "loss": 0.0844, + "step": 50917 + }, + { + "epoch": 3.090613130765057, + "grad_norm": 2.7772045135498047, + "learning_rate": 3.124982869672468e-06, + "loss": 0.0716, + "step": 50918 + }, + { + "epoch": 3.0906266956049917, + "grad_norm": 5.144615173339844, + "learning_rate": 3.1248458270522137e-06, + "loss": 0.1834, + "step": 50919 + }, + { + "epoch": 3.0906402604449266, + "grad_norm": 6.976436138153076, + "learning_rate": 3.1247087844319584e-06, + "loss": 0.1432, + "step": 50920 + }, + { + "epoch": 3.0906538252848614, + "grad_norm": 4.398464202880859, + "learning_rate": 3.124571741811704e-06, + "loss": 0.147, + "step": 50921 + }, + { + "epoch": 3.0906673901247963, + "grad_norm": 3.893946409225464, + "learning_rate": 3.124434699191449e-06, + "loss": 0.1521, + "step": 50922 + }, + { + "epoch": 3.0906809549647316, + "grad_norm": 3.8012802600860596, + "learning_rate": 3.124297656571194e-06, + "loss": 0.1332, + "step": 50923 + }, + { + "epoch": 3.0906945198046665, + "grad_norm": 3.9199979305267334, + "learning_rate": 3.124160613950939e-06, + "loss": 0.1302, + "step": 50924 + }, + { + "epoch": 3.0907080846446013, + "grad_norm": 4.901688098907471, + "learning_rate": 3.1240235713306843e-06, + "loss": 0.1716, + "step": 50925 + }, + { + "epoch": 3.090721649484536, + "grad_norm": 7.359967231750488, + "learning_rate": 3.123886528710429e-06, + "loss": 0.1114, + "step": 50926 + }, + { + "epoch": 3.090735214324471, + "grad_norm": 3.692207098007202, + "learning_rate": 3.123749486090174e-06, + "loss": 0.1059, + "step": 50927 + }, + { + "epoch": 3.090748779164406, + "grad_norm": 3.174082040786743, + "learning_rate": 3.1236124434699195e-06, + "loss": 0.0707, + "step": 50928 + }, + { + "epoch": 3.090762344004341, + "grad_norm": 3.765076160430908, + "learning_rate": 3.123475400849664e-06, + "loss": 0.1316, + "step": 50929 + }, + { + "epoch": 3.0907759088442757, + "grad_norm": 3.8150107860565186, + "learning_rate": 3.12333835822941e-06, + "loss": 0.0913, + "step": 50930 + }, + { + "epoch": 3.0907894736842105, + "grad_norm": 3.971550226211548, + "learning_rate": 3.1232013156091546e-06, + "loss": 0.1009, + "step": 50931 + }, + { + "epoch": 3.0908030385241454, + "grad_norm": 2.9999427795410156, + "learning_rate": 3.1230642729888997e-06, + "loss": 0.0615, + "step": 50932 + }, + { + "epoch": 3.0908166033640803, + "grad_norm": 4.5301923751831055, + "learning_rate": 3.122927230368645e-06, + "loss": 0.0993, + "step": 50933 + }, + { + "epoch": 3.090830168204015, + "grad_norm": 3.3511438369750977, + "learning_rate": 3.12279018774839e-06, + "loss": 0.0852, + "step": 50934 + }, + { + "epoch": 3.09084373304395, + "grad_norm": 3.7051949501037598, + "learning_rate": 3.122653145128135e-06, + "loss": 0.0851, + "step": 50935 + }, + { + "epoch": 3.090857297883885, + "grad_norm": 4.1207780838012695, + "learning_rate": 3.1225161025078805e-06, + "loss": 0.1507, + "step": 50936 + }, + { + "epoch": 3.0908708627238197, + "grad_norm": 4.225313186645508, + "learning_rate": 3.1223790598876252e-06, + "loss": 0.0905, + "step": 50937 + }, + { + "epoch": 3.0908844275637546, + "grad_norm": 3.7121198177337646, + "learning_rate": 3.122242017267371e-06, + "loss": 0.0981, + "step": 50938 + }, + { + "epoch": 3.0908979924036895, + "grad_norm": 3.4345524311065674, + "learning_rate": 3.1221049746471156e-06, + "loss": 0.0533, + "step": 50939 + }, + { + "epoch": 3.0909115572436243, + "grad_norm": 3.8118085861206055, + "learning_rate": 3.1219679320268608e-06, + "loss": 0.1113, + "step": 50940 + }, + { + "epoch": 3.0909251220835596, + "grad_norm": 3.1337904930114746, + "learning_rate": 3.121830889406606e-06, + "loss": 0.0863, + "step": 50941 + }, + { + "epoch": 3.0909386869234945, + "grad_norm": 3.3048436641693115, + "learning_rate": 3.1216938467863507e-06, + "loss": 0.0943, + "step": 50942 + }, + { + "epoch": 3.0909522517634294, + "grad_norm": 4.150449752807617, + "learning_rate": 3.121556804166096e-06, + "loss": 0.0653, + "step": 50943 + }, + { + "epoch": 3.0909658166033642, + "grad_norm": 9.484659194946289, + "learning_rate": 3.1214197615458406e-06, + "loss": 0.1642, + "step": 50944 + }, + { + "epoch": 3.090979381443299, + "grad_norm": 6.096083641052246, + "learning_rate": 3.1212827189255862e-06, + "loss": 0.1244, + "step": 50945 + }, + { + "epoch": 3.090992946283234, + "grad_norm": 3.8407464027404785, + "learning_rate": 3.121145676305331e-06, + "loss": 0.1176, + "step": 50946 + }, + { + "epoch": 3.091006511123169, + "grad_norm": 3.0998449325561523, + "learning_rate": 3.1210086336850766e-06, + "loss": 0.0636, + "step": 50947 + }, + { + "epoch": 3.0910200759631037, + "grad_norm": 4.2853474617004395, + "learning_rate": 3.1208715910648213e-06, + "loss": 0.0756, + "step": 50948 + }, + { + "epoch": 3.0910336408030386, + "grad_norm": 3.0420844554901123, + "learning_rate": 3.1207345484445665e-06, + "loss": 0.0823, + "step": 50949 + }, + { + "epoch": 3.0910472056429734, + "grad_norm": 4.890241622924805, + "learning_rate": 3.1205975058243117e-06, + "loss": 0.1067, + "step": 50950 + }, + { + "epoch": 3.0910607704829083, + "grad_norm": 4.186123847961426, + "learning_rate": 3.120460463204057e-06, + "loss": 0.1198, + "step": 50951 + }, + { + "epoch": 3.091074335322843, + "grad_norm": 5.531469821929932, + "learning_rate": 3.1203234205838016e-06, + "loss": 0.1843, + "step": 50952 + }, + { + "epoch": 3.091087900162778, + "grad_norm": 3.3990249633789062, + "learning_rate": 3.1201863779635472e-06, + "loss": 0.0644, + "step": 50953 + }, + { + "epoch": 3.091101465002713, + "grad_norm": 3.9490461349487305, + "learning_rate": 3.120049335343292e-06, + "loss": 0.0787, + "step": 50954 + }, + { + "epoch": 3.0911150298426477, + "grad_norm": 5.097476959228516, + "learning_rate": 3.1199122927230367e-06, + "loss": 0.1367, + "step": 50955 + }, + { + "epoch": 3.0911285946825826, + "grad_norm": 4.7251877784729, + "learning_rate": 3.1197752501027823e-06, + "loss": 0.1289, + "step": 50956 + }, + { + "epoch": 3.0911421595225175, + "grad_norm": 3.2931323051452637, + "learning_rate": 3.119638207482527e-06, + "loss": 0.056, + "step": 50957 + }, + { + "epoch": 3.0911557243624523, + "grad_norm": 3.2879717350006104, + "learning_rate": 3.1195011648622727e-06, + "loss": 0.0784, + "step": 50958 + }, + { + "epoch": 3.091169289202387, + "grad_norm": 3.028383255004883, + "learning_rate": 3.1193641222420175e-06, + "loss": 0.0675, + "step": 50959 + }, + { + "epoch": 3.0911828540423225, + "grad_norm": 3.1735610961914062, + "learning_rate": 3.1192270796217626e-06, + "loss": 0.0809, + "step": 50960 + }, + { + "epoch": 3.0911964188822574, + "grad_norm": 3.3643252849578857, + "learning_rate": 3.1190900370015074e-06, + "loss": 0.0696, + "step": 50961 + }, + { + "epoch": 3.0912099837221922, + "grad_norm": 3.295287847518921, + "learning_rate": 3.118952994381253e-06, + "loss": 0.0656, + "step": 50962 + }, + { + "epoch": 3.091223548562127, + "grad_norm": 4.20247745513916, + "learning_rate": 3.1188159517609977e-06, + "loss": 0.0896, + "step": 50963 + }, + { + "epoch": 3.091237113402062, + "grad_norm": 3.5139095783233643, + "learning_rate": 3.1186789091407433e-06, + "loss": 0.0917, + "step": 50964 + }, + { + "epoch": 3.091250678241997, + "grad_norm": 4.43373441696167, + "learning_rate": 3.118541866520488e-06, + "loss": 0.1688, + "step": 50965 + }, + { + "epoch": 3.0912642430819317, + "grad_norm": 4.200990200042725, + "learning_rate": 3.1184048239002333e-06, + "loss": 0.1181, + "step": 50966 + }, + { + "epoch": 3.0912778079218666, + "grad_norm": 5.061041831970215, + "learning_rate": 3.1182677812799785e-06, + "loss": 0.1385, + "step": 50967 + }, + { + "epoch": 3.0912913727618014, + "grad_norm": 5.367997169494629, + "learning_rate": 3.1181307386597232e-06, + "loss": 0.1304, + "step": 50968 + }, + { + "epoch": 3.0913049376017363, + "grad_norm": 4.0557122230529785, + "learning_rate": 3.1179936960394684e-06, + "loss": 0.1414, + "step": 50969 + }, + { + "epoch": 3.091318502441671, + "grad_norm": 3.2753405570983887, + "learning_rate": 3.1178566534192136e-06, + "loss": 0.099, + "step": 50970 + }, + { + "epoch": 3.091332067281606, + "grad_norm": 4.736713886260986, + "learning_rate": 3.1177196107989588e-06, + "loss": 0.1033, + "step": 50971 + }, + { + "epoch": 3.091345632121541, + "grad_norm": 4.654048919677734, + "learning_rate": 3.1175825681787035e-06, + "loss": 0.1399, + "step": 50972 + }, + { + "epoch": 3.0913591969614758, + "grad_norm": 5.196313381195068, + "learning_rate": 3.117445525558449e-06, + "loss": 0.139, + "step": 50973 + }, + { + "epoch": 3.0913727618014106, + "grad_norm": 5.273433208465576, + "learning_rate": 3.117308482938194e-06, + "loss": 0.1305, + "step": 50974 + }, + { + "epoch": 3.0913863266413455, + "grad_norm": 3.1751301288604736, + "learning_rate": 3.1171714403179395e-06, + "loss": 0.0696, + "step": 50975 + }, + { + "epoch": 3.0913998914812804, + "grad_norm": 4.156219005584717, + "learning_rate": 3.1170343976976842e-06, + "loss": 0.1045, + "step": 50976 + }, + { + "epoch": 3.0914134563212152, + "grad_norm": 3.9702868461608887, + "learning_rate": 3.1168973550774294e-06, + "loss": 0.107, + "step": 50977 + }, + { + "epoch": 3.09142702116115, + "grad_norm": 4.920409679412842, + "learning_rate": 3.116760312457174e-06, + "loss": 0.1512, + "step": 50978 + }, + { + "epoch": 3.0914405860010854, + "grad_norm": 4.505566596984863, + "learning_rate": 3.1166232698369198e-06, + "loss": 0.1672, + "step": 50979 + }, + { + "epoch": 3.0914541508410203, + "grad_norm": 4.912600994110107, + "learning_rate": 3.1164862272166645e-06, + "loss": 0.1598, + "step": 50980 + }, + { + "epoch": 3.091467715680955, + "grad_norm": 5.211689472198486, + "learning_rate": 3.11634918459641e-06, + "loss": 0.1227, + "step": 50981 + }, + { + "epoch": 3.09148128052089, + "grad_norm": 3.093742847442627, + "learning_rate": 3.116212141976155e-06, + "loss": 0.103, + "step": 50982 + }, + { + "epoch": 3.091494845360825, + "grad_norm": 4.899111270904541, + "learning_rate": 3.1160750993558996e-06, + "loss": 0.2292, + "step": 50983 + }, + { + "epoch": 3.0915084102007597, + "grad_norm": 5.467587947845459, + "learning_rate": 3.1159380567356452e-06, + "loss": 0.2159, + "step": 50984 + }, + { + "epoch": 3.0915219750406946, + "grad_norm": 5.065424919128418, + "learning_rate": 3.11580101411539e-06, + "loss": 0.1708, + "step": 50985 + }, + { + "epoch": 3.0915355398806295, + "grad_norm": 4.815159320831299, + "learning_rate": 3.115663971495135e-06, + "loss": 0.1327, + "step": 50986 + }, + { + "epoch": 3.0915491047205643, + "grad_norm": 4.174822807312012, + "learning_rate": 3.1155269288748803e-06, + "loss": 0.1581, + "step": 50987 + }, + { + "epoch": 3.091562669560499, + "grad_norm": 5.256163597106934, + "learning_rate": 3.1153898862546255e-06, + "loss": 0.1757, + "step": 50988 + }, + { + "epoch": 3.091576234400434, + "grad_norm": 4.432349681854248, + "learning_rate": 3.1152528436343703e-06, + "loss": 0.1175, + "step": 50989 + }, + { + "epoch": 3.091589799240369, + "grad_norm": 3.8867082595825195, + "learning_rate": 3.115115801014116e-06, + "loss": 0.0795, + "step": 50990 + }, + { + "epoch": 3.091603364080304, + "grad_norm": 5.2770466804504395, + "learning_rate": 3.1149787583938606e-06, + "loss": 0.1513, + "step": 50991 + }, + { + "epoch": 3.0916169289202386, + "grad_norm": 4.002877712249756, + "learning_rate": 3.1148417157736062e-06, + "loss": 0.1694, + "step": 50992 + }, + { + "epoch": 3.0916304937601735, + "grad_norm": 4.680055141448975, + "learning_rate": 3.114704673153351e-06, + "loss": 0.1973, + "step": 50993 + }, + { + "epoch": 3.0916440586001084, + "grad_norm": 3.9926598072052, + "learning_rate": 3.114567630533096e-06, + "loss": 0.1269, + "step": 50994 + }, + { + "epoch": 3.0916576234400432, + "grad_norm": 5.015242099761963, + "learning_rate": 3.114430587912841e-06, + "loss": 0.1781, + "step": 50995 + }, + { + "epoch": 3.091671188279978, + "grad_norm": 5.477383613586426, + "learning_rate": 3.114293545292586e-06, + "loss": 0.1992, + "step": 50996 + }, + { + "epoch": 3.091684753119913, + "grad_norm": 5.6713786125183105, + "learning_rate": 3.1141565026723313e-06, + "loss": 0.2418, + "step": 50997 + }, + { + "epoch": 3.0916983179598483, + "grad_norm": 3.8707239627838135, + "learning_rate": 3.114019460052076e-06, + "loss": 0.0772, + "step": 50998 + }, + { + "epoch": 3.091711882799783, + "grad_norm": 7.493198394775391, + "learning_rate": 3.1138824174318216e-06, + "loss": 0.1448, + "step": 50999 + }, + { + "epoch": 3.091725447639718, + "grad_norm": 4.783509254455566, + "learning_rate": 3.1137453748115664e-06, + "loss": 0.1734, + "step": 51000 + }, + { + "epoch": 3.091739012479653, + "grad_norm": 5.28372859954834, + "learning_rate": 3.113608332191312e-06, + "loss": 0.1067, + "step": 51001 + }, + { + "epoch": 3.0917525773195877, + "grad_norm": 5.491480350494385, + "learning_rate": 3.1134712895710568e-06, + "loss": 0.2386, + "step": 51002 + }, + { + "epoch": 3.0917661421595226, + "grad_norm": 3.3912787437438965, + "learning_rate": 3.113334246950802e-06, + "loss": 0.0937, + "step": 51003 + }, + { + "epoch": 3.0917797069994575, + "grad_norm": 5.143856048583984, + "learning_rate": 3.113197204330547e-06, + "loss": 0.1307, + "step": 51004 + }, + { + "epoch": 3.0917932718393923, + "grad_norm": 3.1084794998168945, + "learning_rate": 3.1130601617102923e-06, + "loss": 0.1037, + "step": 51005 + }, + { + "epoch": 3.091806836679327, + "grad_norm": 5.456308364868164, + "learning_rate": 3.112923119090037e-06, + "loss": 0.1805, + "step": 51006 + }, + { + "epoch": 3.091820401519262, + "grad_norm": 4.7104997634887695, + "learning_rate": 3.1127860764697827e-06, + "loss": 0.1737, + "step": 51007 + }, + { + "epoch": 3.091833966359197, + "grad_norm": 3.7044084072113037, + "learning_rate": 3.1126490338495274e-06, + "loss": 0.0909, + "step": 51008 + }, + { + "epoch": 3.091847531199132, + "grad_norm": 4.5959649085998535, + "learning_rate": 3.112511991229273e-06, + "loss": 0.2245, + "step": 51009 + }, + { + "epoch": 3.0918610960390667, + "grad_norm": 4.670745372772217, + "learning_rate": 3.1123749486090178e-06, + "loss": 0.1623, + "step": 51010 + }, + { + "epoch": 3.0918746608790015, + "grad_norm": 4.248022556304932, + "learning_rate": 3.1122379059887625e-06, + "loss": 0.1225, + "step": 51011 + }, + { + "epoch": 3.0918882257189364, + "grad_norm": 5.404211521148682, + "learning_rate": 3.112100863368508e-06, + "loss": 0.1703, + "step": 51012 + }, + { + "epoch": 3.0919017905588713, + "grad_norm": 6.118349552154541, + "learning_rate": 3.111963820748253e-06, + "loss": 0.1843, + "step": 51013 + }, + { + "epoch": 3.091915355398806, + "grad_norm": 3.27471923828125, + "learning_rate": 3.111826778127998e-06, + "loss": 0.0774, + "step": 51014 + }, + { + "epoch": 3.091928920238741, + "grad_norm": 3.451645612716675, + "learning_rate": 3.111689735507743e-06, + "loss": 0.0973, + "step": 51015 + }, + { + "epoch": 3.091942485078676, + "grad_norm": 3.535400629043579, + "learning_rate": 3.1115526928874884e-06, + "loss": 0.1149, + "step": 51016 + }, + { + "epoch": 3.091956049918611, + "grad_norm": 5.295029640197754, + "learning_rate": 3.111415650267233e-06, + "loss": 0.1909, + "step": 51017 + }, + { + "epoch": 3.091969614758546, + "grad_norm": 4.304749011993408, + "learning_rate": 3.1112786076469788e-06, + "loss": 0.1307, + "step": 51018 + }, + { + "epoch": 3.091983179598481, + "grad_norm": 4.156501770019531, + "learning_rate": 3.1111415650267235e-06, + "loss": 0.1009, + "step": 51019 + }, + { + "epoch": 3.0919967444384158, + "grad_norm": 4.7853593826293945, + "learning_rate": 3.1110045224064687e-06, + "loss": 0.0995, + "step": 51020 + }, + { + "epoch": 3.0920103092783506, + "grad_norm": 4.7932448387146, + "learning_rate": 3.110867479786214e-06, + "loss": 0.1353, + "step": 51021 + }, + { + "epoch": 3.0920238741182855, + "grad_norm": 5.233034133911133, + "learning_rate": 3.110730437165959e-06, + "loss": 0.1062, + "step": 51022 + }, + { + "epoch": 3.0920374389582204, + "grad_norm": 3.897183895111084, + "learning_rate": 3.110593394545704e-06, + "loss": 0.1044, + "step": 51023 + }, + { + "epoch": 3.0920510037981552, + "grad_norm": 4.599888324737549, + "learning_rate": 3.110456351925449e-06, + "loss": 0.1055, + "step": 51024 + }, + { + "epoch": 3.09206456863809, + "grad_norm": 4.066664218902588, + "learning_rate": 3.110319309305194e-06, + "loss": 0.1572, + "step": 51025 + }, + { + "epoch": 3.092078133478025, + "grad_norm": 4.5993218421936035, + "learning_rate": 3.110182266684939e-06, + "loss": 0.0952, + "step": 51026 + }, + { + "epoch": 3.09209169831796, + "grad_norm": 5.031561374664307, + "learning_rate": 3.1100452240646845e-06, + "loss": 0.1366, + "step": 51027 + }, + { + "epoch": 3.0921052631578947, + "grad_norm": 4.391663074493408, + "learning_rate": 3.1099081814444293e-06, + "loss": 0.1112, + "step": 51028 + }, + { + "epoch": 3.0921188279978296, + "grad_norm": 4.844785690307617, + "learning_rate": 3.109771138824175e-06, + "loss": 0.1237, + "step": 51029 + }, + { + "epoch": 3.0921323928377644, + "grad_norm": 2.661560535430908, + "learning_rate": 3.1096340962039196e-06, + "loss": 0.06, + "step": 51030 + }, + { + "epoch": 3.0921459576776993, + "grad_norm": 3.0787675380706787, + "learning_rate": 3.109497053583665e-06, + "loss": 0.0789, + "step": 51031 + }, + { + "epoch": 3.092159522517634, + "grad_norm": 6.70943021774292, + "learning_rate": 3.1093600109634096e-06, + "loss": 0.1425, + "step": 51032 + }, + { + "epoch": 3.092173087357569, + "grad_norm": 3.788938045501709, + "learning_rate": 3.109222968343155e-06, + "loss": 0.1509, + "step": 51033 + }, + { + "epoch": 3.092186652197504, + "grad_norm": 6.035351276397705, + "learning_rate": 3.1090859257229e-06, + "loss": 0.1512, + "step": 51034 + }, + { + "epoch": 3.0922002170374387, + "grad_norm": 2.8476428985595703, + "learning_rate": 3.1089488831026455e-06, + "loss": 0.0824, + "step": 51035 + }, + { + "epoch": 3.092213781877374, + "grad_norm": 3.0011794567108154, + "learning_rate": 3.1088118404823903e-06, + "loss": 0.0738, + "step": 51036 + }, + { + "epoch": 3.092227346717309, + "grad_norm": 4.881143093109131, + "learning_rate": 3.108674797862135e-06, + "loss": 0.1627, + "step": 51037 + }, + { + "epoch": 3.092240911557244, + "grad_norm": 3.2571730613708496, + "learning_rate": 3.1085377552418807e-06, + "loss": 0.1066, + "step": 51038 + }, + { + "epoch": 3.0922544763971787, + "grad_norm": 3.7267777919769287, + "learning_rate": 3.1084007126216254e-06, + "loss": 0.1482, + "step": 51039 + }, + { + "epoch": 3.0922680412371135, + "grad_norm": 4.015718460083008, + "learning_rate": 3.1082636700013706e-06, + "loss": 0.1034, + "step": 51040 + }, + { + "epoch": 3.0922816060770484, + "grad_norm": 3.883875846862793, + "learning_rate": 3.1081266273811158e-06, + "loss": 0.0827, + "step": 51041 + }, + { + "epoch": 3.0922951709169832, + "grad_norm": 3.2778143882751465, + "learning_rate": 3.107989584760861e-06, + "loss": 0.098, + "step": 51042 + }, + { + "epoch": 3.092308735756918, + "grad_norm": 3.0228512287139893, + "learning_rate": 3.1078525421406057e-06, + "loss": 0.0533, + "step": 51043 + }, + { + "epoch": 3.092322300596853, + "grad_norm": 4.310119152069092, + "learning_rate": 3.1077154995203513e-06, + "loss": 0.122, + "step": 51044 + }, + { + "epoch": 3.092335865436788, + "grad_norm": 5.550957679748535, + "learning_rate": 3.107578456900096e-06, + "loss": 0.1339, + "step": 51045 + }, + { + "epoch": 3.0923494302767227, + "grad_norm": 4.587433338165283, + "learning_rate": 3.1074414142798417e-06, + "loss": 0.1286, + "step": 51046 + }, + { + "epoch": 3.0923629951166576, + "grad_norm": 3.377570390701294, + "learning_rate": 3.1073043716595864e-06, + "loss": 0.1063, + "step": 51047 + }, + { + "epoch": 3.0923765599565924, + "grad_norm": 5.237345218658447, + "learning_rate": 3.1071673290393316e-06, + "loss": 0.1324, + "step": 51048 + }, + { + "epoch": 3.0923901247965273, + "grad_norm": 3.1079912185668945, + "learning_rate": 3.1070302864190764e-06, + "loss": 0.0539, + "step": 51049 + }, + { + "epoch": 3.092403689636462, + "grad_norm": 2.7761125564575195, + "learning_rate": 3.106893243798822e-06, + "loss": 0.0835, + "step": 51050 + }, + { + "epoch": 3.092417254476397, + "grad_norm": 4.548881530761719, + "learning_rate": 3.1067562011785667e-06, + "loss": 0.1244, + "step": 51051 + }, + { + "epoch": 3.092430819316332, + "grad_norm": 4.177525043487549, + "learning_rate": 3.1066191585583115e-06, + "loss": 0.1128, + "step": 51052 + }, + { + "epoch": 3.0924443841562668, + "grad_norm": 4.037286758422852, + "learning_rate": 3.106482115938057e-06, + "loss": 0.1006, + "step": 51053 + }, + { + "epoch": 3.0924579489962016, + "grad_norm": 3.8973073959350586, + "learning_rate": 3.106345073317802e-06, + "loss": 0.1231, + "step": 51054 + }, + { + "epoch": 3.092471513836137, + "grad_norm": 2.4581754207611084, + "learning_rate": 3.1062080306975474e-06, + "loss": 0.0921, + "step": 51055 + }, + { + "epoch": 3.092485078676072, + "grad_norm": 3.894662380218506, + "learning_rate": 3.106070988077292e-06, + "loss": 0.1102, + "step": 51056 + }, + { + "epoch": 3.0924986435160067, + "grad_norm": 3.0374362468719482, + "learning_rate": 3.1059339454570374e-06, + "loss": 0.1041, + "step": 51057 + }, + { + "epoch": 3.0925122083559415, + "grad_norm": 5.303795337677002, + "learning_rate": 3.1057969028367825e-06, + "loss": 0.1538, + "step": 51058 + }, + { + "epoch": 3.0925257731958764, + "grad_norm": 4.30210018157959, + "learning_rate": 3.1056598602165277e-06, + "loss": 0.0929, + "step": 51059 + }, + { + "epoch": 3.0925393380358113, + "grad_norm": 5.639370441436768, + "learning_rate": 3.1055228175962725e-06, + "loss": 0.1142, + "step": 51060 + }, + { + "epoch": 3.092552902875746, + "grad_norm": 3.5474443435668945, + "learning_rate": 3.105385774976018e-06, + "loss": 0.1218, + "step": 51061 + }, + { + "epoch": 3.092566467715681, + "grad_norm": 3.9856643676757812, + "learning_rate": 3.105248732355763e-06, + "loss": 0.0907, + "step": 51062 + }, + { + "epoch": 3.092580032555616, + "grad_norm": 4.3392815589904785, + "learning_rate": 3.1051116897355084e-06, + "loss": 0.104, + "step": 51063 + }, + { + "epoch": 3.0925935973955507, + "grad_norm": 3.1662991046905518, + "learning_rate": 3.104974647115253e-06, + "loss": 0.0713, + "step": 51064 + }, + { + "epoch": 3.0926071622354856, + "grad_norm": 3.675485610961914, + "learning_rate": 3.104837604494998e-06, + "loss": 0.1116, + "step": 51065 + }, + { + "epoch": 3.0926207270754205, + "grad_norm": 3.219309091567993, + "learning_rate": 3.104700561874743e-06, + "loss": 0.0941, + "step": 51066 + }, + { + "epoch": 3.0926342919153553, + "grad_norm": 5.50878381729126, + "learning_rate": 3.1045635192544883e-06, + "loss": 0.1847, + "step": 51067 + }, + { + "epoch": 3.09264785675529, + "grad_norm": 3.4762279987335205, + "learning_rate": 3.1044264766342335e-06, + "loss": 0.133, + "step": 51068 + }, + { + "epoch": 3.092661421595225, + "grad_norm": 3.395900011062622, + "learning_rate": 3.1042894340139782e-06, + "loss": 0.0733, + "step": 51069 + }, + { + "epoch": 3.09267498643516, + "grad_norm": 3.3774967193603516, + "learning_rate": 3.104152391393724e-06, + "loss": 0.1101, + "step": 51070 + }, + { + "epoch": 3.092688551275095, + "grad_norm": 3.602573871612549, + "learning_rate": 3.1040153487734686e-06, + "loss": 0.1213, + "step": 51071 + }, + { + "epoch": 3.0927021161150297, + "grad_norm": 4.039381980895996, + "learning_rate": 3.103878306153214e-06, + "loss": 0.1102, + "step": 51072 + }, + { + "epoch": 3.0927156809549645, + "grad_norm": 2.962348699569702, + "learning_rate": 3.103741263532959e-06, + "loss": 0.0985, + "step": 51073 + }, + { + "epoch": 3.0927292457949, + "grad_norm": 3.659663200378418, + "learning_rate": 3.103604220912704e-06, + "loss": 0.0762, + "step": 51074 + }, + { + "epoch": 3.0927428106348347, + "grad_norm": 3.8697049617767334, + "learning_rate": 3.1034671782924493e-06, + "loss": 0.1378, + "step": 51075 + }, + { + "epoch": 3.0927563754747696, + "grad_norm": 3.8330113887786865, + "learning_rate": 3.1033301356721945e-06, + "loss": 0.0891, + "step": 51076 + }, + { + "epoch": 3.0927699403147044, + "grad_norm": 5.491934776306152, + "learning_rate": 3.1031930930519392e-06, + "loss": 0.0988, + "step": 51077 + }, + { + "epoch": 3.0927835051546393, + "grad_norm": 3.9900457859039307, + "learning_rate": 3.1030560504316844e-06, + "loss": 0.0797, + "step": 51078 + }, + { + "epoch": 3.092797069994574, + "grad_norm": 4.271851539611816, + "learning_rate": 3.1029190078114296e-06, + "loss": 0.0793, + "step": 51079 + }, + { + "epoch": 3.092810634834509, + "grad_norm": 4.923598289489746, + "learning_rate": 3.1027819651911744e-06, + "loss": 0.1316, + "step": 51080 + }, + { + "epoch": 3.092824199674444, + "grad_norm": 5.005331516265869, + "learning_rate": 3.10264492257092e-06, + "loss": 0.134, + "step": 51081 + }, + { + "epoch": 3.0928377645143788, + "grad_norm": 2.8469011783599854, + "learning_rate": 3.1025078799506647e-06, + "loss": 0.0792, + "step": 51082 + }, + { + "epoch": 3.0928513293543136, + "grad_norm": 3.4834580421447754, + "learning_rate": 3.1023708373304103e-06, + "loss": 0.1079, + "step": 51083 + }, + { + "epoch": 3.0928648941942485, + "grad_norm": 3.6239383220672607, + "learning_rate": 3.102233794710155e-06, + "loss": 0.0855, + "step": 51084 + }, + { + "epoch": 3.0928784590341833, + "grad_norm": 3.236743927001953, + "learning_rate": 3.1020967520899003e-06, + "loss": 0.0854, + "step": 51085 + }, + { + "epoch": 3.092892023874118, + "grad_norm": 4.489981651306152, + "learning_rate": 3.101959709469645e-06, + "loss": 0.1237, + "step": 51086 + }, + { + "epoch": 3.092905588714053, + "grad_norm": 3.5871782302856445, + "learning_rate": 3.1018226668493906e-06, + "loss": 0.0918, + "step": 51087 + }, + { + "epoch": 3.092919153553988, + "grad_norm": 3.932182550430298, + "learning_rate": 3.1016856242291354e-06, + "loss": 0.0733, + "step": 51088 + }, + { + "epoch": 3.092932718393923, + "grad_norm": 3.9707069396972656, + "learning_rate": 3.101548581608881e-06, + "loss": 0.1572, + "step": 51089 + }, + { + "epoch": 3.0929462832338577, + "grad_norm": 3.640289068222046, + "learning_rate": 3.1014115389886257e-06, + "loss": 0.0664, + "step": 51090 + }, + { + "epoch": 3.0929598480737925, + "grad_norm": 2.96174693107605, + "learning_rate": 3.101274496368371e-06, + "loss": 0.077, + "step": 51091 + }, + { + "epoch": 3.0929734129137274, + "grad_norm": 5.9110260009765625, + "learning_rate": 3.101137453748116e-06, + "loss": 0.131, + "step": 51092 + }, + { + "epoch": 3.0929869777536627, + "grad_norm": 3.024883270263672, + "learning_rate": 3.101000411127861e-06, + "loss": 0.0595, + "step": 51093 + }, + { + "epoch": 3.0930005425935976, + "grad_norm": 4.1537370681762695, + "learning_rate": 3.100863368507606e-06, + "loss": 0.0599, + "step": 51094 + }, + { + "epoch": 3.0930141074335324, + "grad_norm": 3.6346356868743896, + "learning_rate": 3.100726325887351e-06, + "loss": 0.0906, + "step": 51095 + }, + { + "epoch": 3.0930276722734673, + "grad_norm": 3.1271204948425293, + "learning_rate": 3.1005892832670964e-06, + "loss": 0.0648, + "step": 51096 + }, + { + "epoch": 3.093041237113402, + "grad_norm": 4.12783145904541, + "learning_rate": 3.100452240646841e-06, + "loss": 0.0998, + "step": 51097 + }, + { + "epoch": 3.093054801953337, + "grad_norm": 4.2971320152282715, + "learning_rate": 3.1003151980265867e-06, + "loss": 0.124, + "step": 51098 + }, + { + "epoch": 3.093068366793272, + "grad_norm": 3.213804244995117, + "learning_rate": 3.1001781554063315e-06, + "loss": 0.0724, + "step": 51099 + }, + { + "epoch": 3.0930819316332068, + "grad_norm": 4.4380879402160645, + "learning_rate": 3.100041112786077e-06, + "loss": 0.094, + "step": 51100 + }, + { + "epoch": 3.0930954964731416, + "grad_norm": 3.8827004432678223, + "learning_rate": 3.099904070165822e-06, + "loss": 0.1013, + "step": 51101 + }, + { + "epoch": 3.0931090613130765, + "grad_norm": 2.6768665313720703, + "learning_rate": 3.099767027545567e-06, + "loss": 0.0391, + "step": 51102 + }, + { + "epoch": 3.0931226261530114, + "grad_norm": 3.8620145320892334, + "learning_rate": 3.0996299849253118e-06, + "loss": 0.0891, + "step": 51103 + }, + { + "epoch": 3.0931361909929462, + "grad_norm": 3.5901498794555664, + "learning_rate": 3.0994929423050574e-06, + "loss": 0.0543, + "step": 51104 + }, + { + "epoch": 3.093149755832881, + "grad_norm": 4.528907775878906, + "learning_rate": 3.099355899684802e-06, + "loss": 0.1177, + "step": 51105 + }, + { + "epoch": 3.093163320672816, + "grad_norm": 3.4956231117248535, + "learning_rate": 3.099218857064547e-06, + "loss": 0.094, + "step": 51106 + }, + { + "epoch": 3.093176885512751, + "grad_norm": 4.345501899719238, + "learning_rate": 3.0990818144442925e-06, + "loss": 0.0913, + "step": 51107 + }, + { + "epoch": 3.0931904503526857, + "grad_norm": 5.4398603439331055, + "learning_rate": 3.0989447718240373e-06, + "loss": 0.1313, + "step": 51108 + }, + { + "epoch": 3.0932040151926206, + "grad_norm": 3.8277266025543213, + "learning_rate": 3.098807729203783e-06, + "loss": 0.0851, + "step": 51109 + }, + { + "epoch": 3.0932175800325554, + "grad_norm": 4.72921895980835, + "learning_rate": 3.0986706865835276e-06, + "loss": 0.0899, + "step": 51110 + }, + { + "epoch": 3.0932311448724903, + "grad_norm": 4.7779717445373535, + "learning_rate": 3.0985336439632728e-06, + "loss": 0.0911, + "step": 51111 + }, + { + "epoch": 3.0932447097124256, + "grad_norm": 4.002803802490234, + "learning_rate": 3.098396601343018e-06, + "loss": 0.0623, + "step": 51112 + }, + { + "epoch": 3.0932582745523605, + "grad_norm": 4.220117568969727, + "learning_rate": 3.098259558722763e-06, + "loss": 0.1095, + "step": 51113 + }, + { + "epoch": 3.0932718393922953, + "grad_norm": 4.780436992645264, + "learning_rate": 3.098122516102508e-06, + "loss": 0.0867, + "step": 51114 + }, + { + "epoch": 3.09328540423223, + "grad_norm": 4.573004245758057, + "learning_rate": 3.0979854734822535e-06, + "loss": 0.119, + "step": 51115 + }, + { + "epoch": 3.093298969072165, + "grad_norm": 5.0805792808532715, + "learning_rate": 3.0978484308619983e-06, + "loss": 0.1646, + "step": 51116 + }, + { + "epoch": 3.0933125339121, + "grad_norm": 4.915535926818848, + "learning_rate": 3.097711388241744e-06, + "loss": 0.1202, + "step": 51117 + }, + { + "epoch": 3.093326098752035, + "grad_norm": 4.9698686599731445, + "learning_rate": 3.0975743456214886e-06, + "loss": 0.1115, + "step": 51118 + }, + { + "epoch": 3.0933396635919697, + "grad_norm": 6.063602924346924, + "learning_rate": 3.097437303001234e-06, + "loss": 0.1635, + "step": 51119 + }, + { + "epoch": 3.0933532284319045, + "grad_norm": 5.322012901306152, + "learning_rate": 3.0973002603809785e-06, + "loss": 0.076, + "step": 51120 + }, + { + "epoch": 3.0933667932718394, + "grad_norm": 3.767167091369629, + "learning_rate": 3.0971632177607237e-06, + "loss": 0.065, + "step": 51121 + }, + { + "epoch": 3.0933803581117743, + "grad_norm": 3.247756004333496, + "learning_rate": 3.097026175140469e-06, + "loss": 0.0654, + "step": 51122 + }, + { + "epoch": 3.093393922951709, + "grad_norm": 4.819911479949951, + "learning_rate": 3.0968891325202137e-06, + "loss": 0.1616, + "step": 51123 + }, + { + "epoch": 3.093407487791644, + "grad_norm": 4.408833980560303, + "learning_rate": 3.0967520898999593e-06, + "loss": 0.133, + "step": 51124 + }, + { + "epoch": 3.093421052631579, + "grad_norm": 5.449925422668457, + "learning_rate": 3.096615047279704e-06, + "loss": 0.1553, + "step": 51125 + }, + { + "epoch": 3.0934346174715137, + "grad_norm": 3.475628137588501, + "learning_rate": 3.0964780046594496e-06, + "loss": 0.0886, + "step": 51126 + }, + { + "epoch": 3.0934481823114486, + "grad_norm": 5.924703121185303, + "learning_rate": 3.0963409620391944e-06, + "loss": 0.0802, + "step": 51127 + }, + { + "epoch": 3.0934617471513834, + "grad_norm": 5.300312519073486, + "learning_rate": 3.0962039194189396e-06, + "loss": 0.1537, + "step": 51128 + }, + { + "epoch": 3.0934753119913183, + "grad_norm": 5.232615947723389, + "learning_rate": 3.0960668767986847e-06, + "loss": 0.0756, + "step": 51129 + }, + { + "epoch": 3.0934888768312536, + "grad_norm": 4.281097412109375, + "learning_rate": 3.09592983417843e-06, + "loss": 0.1154, + "step": 51130 + }, + { + "epoch": 3.0935024416711885, + "grad_norm": 5.165923595428467, + "learning_rate": 3.0957927915581747e-06, + "loss": 0.105, + "step": 51131 + }, + { + "epoch": 3.0935160065111234, + "grad_norm": 5.037541389465332, + "learning_rate": 3.0956557489379203e-06, + "loss": 0.1312, + "step": 51132 + }, + { + "epoch": 3.093529571351058, + "grad_norm": 3.9156062602996826, + "learning_rate": 3.095518706317665e-06, + "loss": 0.0891, + "step": 51133 + }, + { + "epoch": 3.093543136190993, + "grad_norm": 3.7862257957458496, + "learning_rate": 3.0953816636974098e-06, + "loss": 0.0645, + "step": 51134 + }, + { + "epoch": 3.093556701030928, + "grad_norm": 4.204444408416748, + "learning_rate": 3.0952446210771554e-06, + "loss": 0.0894, + "step": 51135 + }, + { + "epoch": 3.093570265870863, + "grad_norm": 4.045570373535156, + "learning_rate": 3.0951075784569e-06, + "loss": 0.0495, + "step": 51136 + }, + { + "epoch": 3.0935838307107977, + "grad_norm": 2.9808366298675537, + "learning_rate": 3.0949705358366453e-06, + "loss": 0.0563, + "step": 51137 + }, + { + "epoch": 3.0935973955507325, + "grad_norm": 3.2608108520507812, + "learning_rate": 3.0948334932163905e-06, + "loss": 0.0792, + "step": 51138 + }, + { + "epoch": 3.0936109603906674, + "grad_norm": 3.2553322315216064, + "learning_rate": 3.0946964505961357e-06, + "loss": 0.0603, + "step": 51139 + }, + { + "epoch": 3.0936245252306023, + "grad_norm": 4.426767349243164, + "learning_rate": 3.0945594079758804e-06, + "loss": 0.1119, + "step": 51140 + }, + { + "epoch": 3.093638090070537, + "grad_norm": 4.390369415283203, + "learning_rate": 3.094422365355626e-06, + "loss": 0.0975, + "step": 51141 + }, + { + "epoch": 3.093651654910472, + "grad_norm": 2.9917688369750977, + "learning_rate": 3.094285322735371e-06, + "loss": 0.037, + "step": 51142 + }, + { + "epoch": 3.093665219750407, + "grad_norm": 2.349494457244873, + "learning_rate": 3.0941482801151164e-06, + "loss": 0.025, + "step": 51143 + }, + { + "epoch": 3.0936787845903417, + "grad_norm": 4.6930952072143555, + "learning_rate": 3.094011237494861e-06, + "loss": 0.0826, + "step": 51144 + }, + { + "epoch": 3.0936923494302766, + "grad_norm": 4.3929314613342285, + "learning_rate": 3.0938741948746063e-06, + "loss": 0.1449, + "step": 51145 + }, + { + "epoch": 3.0937059142702115, + "grad_norm": 5.26421594619751, + "learning_rate": 3.0937371522543515e-06, + "loss": 0.0579, + "step": 51146 + }, + { + "epoch": 3.0937194791101463, + "grad_norm": 3.634882688522339, + "learning_rate": 3.0936001096340963e-06, + "loss": 0.0556, + "step": 51147 + }, + { + "epoch": 3.093733043950081, + "grad_norm": 3.931907892227173, + "learning_rate": 3.0934630670138414e-06, + "loss": 0.0591, + "step": 51148 + }, + { + "epoch": 3.093746608790016, + "grad_norm": 3.1504898071289062, + "learning_rate": 3.0933260243935866e-06, + "loss": 0.0454, + "step": 51149 + }, + { + "epoch": 3.0937601736299514, + "grad_norm": 3.7724711894989014, + "learning_rate": 3.093188981773332e-06, + "loss": 0.0423, + "step": 51150 + }, + { + "epoch": 3.0937737384698862, + "grad_norm": 3.201551914215088, + "learning_rate": 3.0930519391530766e-06, + "loss": 0.0562, + "step": 51151 + }, + { + "epoch": 3.093787303309821, + "grad_norm": 3.584244966506958, + "learning_rate": 3.092914896532822e-06, + "loss": 0.084, + "step": 51152 + }, + { + "epoch": 3.093800868149756, + "grad_norm": 4.586495399475098, + "learning_rate": 3.092777853912567e-06, + "loss": 0.1035, + "step": 51153 + }, + { + "epoch": 3.093814432989691, + "grad_norm": 2.191964864730835, + "learning_rate": 3.0926408112923125e-06, + "loss": 0.023, + "step": 51154 + }, + { + "epoch": 3.0938279978296257, + "grad_norm": 2.305157423019409, + "learning_rate": 3.0925037686720573e-06, + "loss": 0.0396, + "step": 51155 + }, + { + "epoch": 3.0938415626695606, + "grad_norm": 3.4308295249938965, + "learning_rate": 3.0923667260518024e-06, + "loss": 0.0824, + "step": 51156 + }, + { + "epoch": 3.0938551275094954, + "grad_norm": 3.939222812652588, + "learning_rate": 3.092229683431547e-06, + "loss": 0.0719, + "step": 51157 + }, + { + "epoch": 3.0938686923494303, + "grad_norm": 5.631048202514648, + "learning_rate": 3.092092640811293e-06, + "loss": 0.0661, + "step": 51158 + }, + { + "epoch": 3.093882257189365, + "grad_norm": 3.9540951251983643, + "learning_rate": 3.0919555981910376e-06, + "loss": 0.0915, + "step": 51159 + }, + { + "epoch": 3.0938958220293, + "grad_norm": 3.0856988430023193, + "learning_rate": 3.091818555570783e-06, + "loss": 0.0665, + "step": 51160 + }, + { + "epoch": 3.093909386869235, + "grad_norm": 3.0181846618652344, + "learning_rate": 3.091681512950528e-06, + "loss": 0.0404, + "step": 51161 + }, + { + "epoch": 3.0939229517091698, + "grad_norm": 4.285304546356201, + "learning_rate": 3.0915444703302727e-06, + "loss": 0.1168, + "step": 51162 + }, + { + "epoch": 3.0939365165491046, + "grad_norm": 3.370431423187256, + "learning_rate": 3.0914074277100183e-06, + "loss": 0.0884, + "step": 51163 + }, + { + "epoch": 3.0939500813890395, + "grad_norm": 3.153917074203491, + "learning_rate": 3.091270385089763e-06, + "loss": 0.0776, + "step": 51164 + }, + { + "epoch": 3.0939636462289744, + "grad_norm": 2.9950313568115234, + "learning_rate": 3.091133342469508e-06, + "loss": 0.0702, + "step": 51165 + }, + { + "epoch": 3.093977211068909, + "grad_norm": 4.7606892585754395, + "learning_rate": 3.0909962998492534e-06, + "loss": 0.0905, + "step": 51166 + }, + { + "epoch": 3.093990775908844, + "grad_norm": 5.5833964347839355, + "learning_rate": 3.0908592572289986e-06, + "loss": 0.1246, + "step": 51167 + }, + { + "epoch": 3.0940043407487794, + "grad_norm": 3.3104968070983887, + "learning_rate": 3.0907222146087433e-06, + "loss": 0.0906, + "step": 51168 + }, + { + "epoch": 3.0940179055887143, + "grad_norm": 5.440854549407959, + "learning_rate": 3.090585171988489e-06, + "loss": 0.1864, + "step": 51169 + }, + { + "epoch": 3.094031470428649, + "grad_norm": 3.9950549602508545, + "learning_rate": 3.0904481293682337e-06, + "loss": 0.071, + "step": 51170 + }, + { + "epoch": 3.094045035268584, + "grad_norm": 4.864925861358643, + "learning_rate": 3.0903110867479793e-06, + "loss": 0.1341, + "step": 51171 + }, + { + "epoch": 3.094058600108519, + "grad_norm": 3.672426462173462, + "learning_rate": 3.090174044127724e-06, + "loss": 0.0773, + "step": 51172 + }, + { + "epoch": 3.0940721649484537, + "grad_norm": 4.0670928955078125, + "learning_rate": 3.0900370015074692e-06, + "loss": 0.0969, + "step": 51173 + }, + { + "epoch": 3.0940857297883886, + "grad_norm": 5.439786434173584, + "learning_rate": 3.089899958887214e-06, + "loss": 0.1608, + "step": 51174 + }, + { + "epoch": 3.0940992946283234, + "grad_norm": 4.978444576263428, + "learning_rate": 3.089762916266959e-06, + "loss": 0.2037, + "step": 51175 + }, + { + "epoch": 3.0941128594682583, + "grad_norm": 4.167434215545654, + "learning_rate": 3.0896258736467043e-06, + "loss": 0.0952, + "step": 51176 + }, + { + "epoch": 3.094126424308193, + "grad_norm": 4.028343200683594, + "learning_rate": 3.089488831026449e-06, + "loss": 0.0965, + "step": 51177 + }, + { + "epoch": 3.094139989148128, + "grad_norm": 6.364018440246582, + "learning_rate": 3.0893517884061947e-06, + "loss": 0.1694, + "step": 51178 + }, + { + "epoch": 3.094153553988063, + "grad_norm": 4.639756679534912, + "learning_rate": 3.0892147457859394e-06, + "loss": 0.083, + "step": 51179 + }, + { + "epoch": 3.0941671188279978, + "grad_norm": 3.771728038787842, + "learning_rate": 3.089077703165685e-06, + "loss": 0.1501, + "step": 51180 + }, + { + "epoch": 3.0941806836679326, + "grad_norm": 4.359224796295166, + "learning_rate": 3.08894066054543e-06, + "loss": 0.0975, + "step": 51181 + }, + { + "epoch": 3.0941942485078675, + "grad_norm": 3.7257378101348877, + "learning_rate": 3.088803617925175e-06, + "loss": 0.0676, + "step": 51182 + }, + { + "epoch": 3.0942078133478024, + "grad_norm": 2.784608840942383, + "learning_rate": 3.08866657530492e-06, + "loss": 0.0367, + "step": 51183 + }, + { + "epoch": 3.0942213781877372, + "grad_norm": 4.951247215270996, + "learning_rate": 3.0885295326846653e-06, + "loss": 0.1746, + "step": 51184 + }, + { + "epoch": 3.094234943027672, + "grad_norm": 2.5886073112487793, + "learning_rate": 3.08839249006441e-06, + "loss": 0.0725, + "step": 51185 + }, + { + "epoch": 3.094248507867607, + "grad_norm": 2.9275615215301514, + "learning_rate": 3.0882554474441557e-06, + "loss": 0.0942, + "step": 51186 + }, + { + "epoch": 3.094262072707542, + "grad_norm": 3.2219738960266113, + "learning_rate": 3.0881184048239005e-06, + "loss": 0.102, + "step": 51187 + }, + { + "epoch": 3.094275637547477, + "grad_norm": 6.1520867347717285, + "learning_rate": 3.087981362203646e-06, + "loss": 0.1769, + "step": 51188 + }, + { + "epoch": 3.094289202387412, + "grad_norm": 4.022210597991943, + "learning_rate": 3.087844319583391e-06, + "loss": 0.1553, + "step": 51189 + }, + { + "epoch": 3.094302767227347, + "grad_norm": 4.370378494262695, + "learning_rate": 3.0877072769631356e-06, + "loss": 0.1056, + "step": 51190 + }, + { + "epoch": 3.0943163320672817, + "grad_norm": 3.5726122856140137, + "learning_rate": 3.0875702343428807e-06, + "loss": 0.1242, + "step": 51191 + }, + { + "epoch": 3.0943298969072166, + "grad_norm": 3.970008134841919, + "learning_rate": 3.087433191722626e-06, + "loss": 0.1194, + "step": 51192 + }, + { + "epoch": 3.0943434617471515, + "grad_norm": 6.125698089599609, + "learning_rate": 3.087296149102371e-06, + "loss": 0.1181, + "step": 51193 + }, + { + "epoch": 3.0943570265870863, + "grad_norm": 3.783555746078491, + "learning_rate": 3.087159106482116e-06, + "loss": 0.0843, + "step": 51194 + }, + { + "epoch": 3.094370591427021, + "grad_norm": 5.589104652404785, + "learning_rate": 3.0870220638618615e-06, + "loss": 0.1831, + "step": 51195 + }, + { + "epoch": 3.094384156266956, + "grad_norm": 4.862617492675781, + "learning_rate": 3.0868850212416062e-06, + "loss": 0.1228, + "step": 51196 + }, + { + "epoch": 3.094397721106891, + "grad_norm": 3.7715506553649902, + "learning_rate": 3.086747978621352e-06, + "loss": 0.1177, + "step": 51197 + }, + { + "epoch": 3.094411285946826, + "grad_norm": 3.350924491882324, + "learning_rate": 3.0866109360010966e-06, + "loss": 0.0728, + "step": 51198 + }, + { + "epoch": 3.0944248507867607, + "grad_norm": 2.642057418823242, + "learning_rate": 3.0864738933808418e-06, + "loss": 0.0777, + "step": 51199 + }, + { + "epoch": 3.0944384156266955, + "grad_norm": 2.670758008956909, + "learning_rate": 3.086336850760587e-06, + "loss": 0.0728, + "step": 51200 + }, + { + "epoch": 3.0944519804666304, + "grad_norm": 4.071785926818848, + "learning_rate": 3.086199808140332e-06, + "loss": 0.1349, + "step": 51201 + }, + { + "epoch": 3.0944655453065653, + "grad_norm": 4.868289947509766, + "learning_rate": 3.086062765520077e-06, + "loss": 0.1574, + "step": 51202 + }, + { + "epoch": 3.0944791101465, + "grad_norm": 4.901766300201416, + "learning_rate": 3.085925722899822e-06, + "loss": 0.1776, + "step": 51203 + }, + { + "epoch": 3.094492674986435, + "grad_norm": 4.629043102264404, + "learning_rate": 3.0857886802795672e-06, + "loss": 0.1734, + "step": 51204 + }, + { + "epoch": 3.09450623982637, + "grad_norm": 5.534543514251709, + "learning_rate": 3.085651637659312e-06, + "loss": 0.1778, + "step": 51205 + }, + { + "epoch": 3.094519804666305, + "grad_norm": 3.7979958057403564, + "learning_rate": 3.0855145950390576e-06, + "loss": 0.1618, + "step": 51206 + }, + { + "epoch": 3.09453336950624, + "grad_norm": 4.716497421264648, + "learning_rate": 3.0853775524188023e-06, + "loss": 0.1578, + "step": 51207 + }, + { + "epoch": 3.094546934346175, + "grad_norm": 4.260490894317627, + "learning_rate": 3.0852405097985475e-06, + "loss": 0.1352, + "step": 51208 + }, + { + "epoch": 3.0945604991861098, + "grad_norm": 5.088809490203857, + "learning_rate": 3.0851034671782927e-06, + "loss": 0.1622, + "step": 51209 + }, + { + "epoch": 3.0945740640260446, + "grad_norm": 3.6048948764801025, + "learning_rate": 3.084966424558038e-06, + "loss": 0.1227, + "step": 51210 + }, + { + "epoch": 3.0945876288659795, + "grad_norm": 3.7373197078704834, + "learning_rate": 3.0848293819377826e-06, + "loss": 0.0896, + "step": 51211 + }, + { + "epoch": 3.0946011937059144, + "grad_norm": 4.563309669494629, + "learning_rate": 3.0846923393175282e-06, + "loss": 0.1478, + "step": 51212 + }, + { + "epoch": 3.094614758545849, + "grad_norm": 4.673367977142334, + "learning_rate": 3.084555296697273e-06, + "loss": 0.1315, + "step": 51213 + }, + { + "epoch": 3.094628323385784, + "grad_norm": 4.548478603363037, + "learning_rate": 3.0844182540770186e-06, + "loss": 0.1598, + "step": 51214 + }, + { + "epoch": 3.094641888225719, + "grad_norm": 6.442018508911133, + "learning_rate": 3.0842812114567633e-06, + "loss": 0.1905, + "step": 51215 + }, + { + "epoch": 3.094655453065654, + "grad_norm": 4.05949592590332, + "learning_rate": 3.084144168836508e-06, + "loss": 0.1289, + "step": 51216 + }, + { + "epoch": 3.0946690179055887, + "grad_norm": 4.055449485778809, + "learning_rate": 3.0840071262162537e-06, + "loss": 0.0791, + "step": 51217 + }, + { + "epoch": 3.0946825827455235, + "grad_norm": 3.806785821914673, + "learning_rate": 3.0838700835959985e-06, + "loss": 0.1421, + "step": 51218 + }, + { + "epoch": 3.0946961475854584, + "grad_norm": 3.82991099357605, + "learning_rate": 3.0837330409757436e-06, + "loss": 0.1285, + "step": 51219 + }, + { + "epoch": 3.0947097124253933, + "grad_norm": 3.926443576812744, + "learning_rate": 3.083595998355489e-06, + "loss": 0.1555, + "step": 51220 + }, + { + "epoch": 3.094723277265328, + "grad_norm": 4.941397190093994, + "learning_rate": 3.083458955735234e-06, + "loss": 0.1703, + "step": 51221 + }, + { + "epoch": 3.094736842105263, + "grad_norm": 3.7640457153320312, + "learning_rate": 3.0833219131149787e-06, + "loss": 0.1036, + "step": 51222 + }, + { + "epoch": 3.094750406945198, + "grad_norm": 3.3532092571258545, + "learning_rate": 3.0831848704947244e-06, + "loss": 0.0959, + "step": 51223 + }, + { + "epoch": 3.0947639717851327, + "grad_norm": 4.174958229064941, + "learning_rate": 3.083047827874469e-06, + "loss": 0.2001, + "step": 51224 + }, + { + "epoch": 3.0947775366250676, + "grad_norm": 4.252398490905762, + "learning_rate": 3.0829107852542147e-06, + "loss": 0.1079, + "step": 51225 + }, + { + "epoch": 3.094791101465003, + "grad_norm": 4.249948978424072, + "learning_rate": 3.0827737426339595e-06, + "loss": 0.12, + "step": 51226 + }, + { + "epoch": 3.094804666304938, + "grad_norm": 4.10571813583374, + "learning_rate": 3.0826367000137046e-06, + "loss": 0.1315, + "step": 51227 + }, + { + "epoch": 3.0948182311448726, + "grad_norm": 3.771939516067505, + "learning_rate": 3.0824996573934494e-06, + "loss": 0.1082, + "step": 51228 + }, + { + "epoch": 3.0948317959848075, + "grad_norm": 6.016176223754883, + "learning_rate": 3.082362614773195e-06, + "loss": 0.2505, + "step": 51229 + }, + { + "epoch": 3.0948453608247424, + "grad_norm": 2.3524489402770996, + "learning_rate": 3.0822255721529398e-06, + "loss": 0.0556, + "step": 51230 + }, + { + "epoch": 3.0948589256646772, + "grad_norm": 3.8709921836853027, + "learning_rate": 3.0820885295326845e-06, + "loss": 0.1447, + "step": 51231 + }, + { + "epoch": 3.094872490504612, + "grad_norm": 5.061777591705322, + "learning_rate": 3.08195148691243e-06, + "loss": 0.1471, + "step": 51232 + }, + { + "epoch": 3.094886055344547, + "grad_norm": 5.916505813598633, + "learning_rate": 3.081814444292175e-06, + "loss": 0.1528, + "step": 51233 + }, + { + "epoch": 3.094899620184482, + "grad_norm": 4.158607006072998, + "learning_rate": 3.0816774016719205e-06, + "loss": 0.1148, + "step": 51234 + }, + { + "epoch": 3.0949131850244167, + "grad_norm": 5.387204170227051, + "learning_rate": 3.0815403590516652e-06, + "loss": 0.1465, + "step": 51235 + }, + { + "epoch": 3.0949267498643516, + "grad_norm": 2.7469887733459473, + "learning_rate": 3.0814033164314104e-06, + "loss": 0.1096, + "step": 51236 + }, + { + "epoch": 3.0949403147042864, + "grad_norm": 4.061931133270264, + "learning_rate": 3.0812662738111556e-06, + "loss": 0.1333, + "step": 51237 + }, + { + "epoch": 3.0949538795442213, + "grad_norm": 3.7653448581695557, + "learning_rate": 3.0811292311909008e-06, + "loss": 0.1251, + "step": 51238 + }, + { + "epoch": 3.094967444384156, + "grad_norm": 3.864027500152588, + "learning_rate": 3.0809921885706455e-06, + "loss": 0.1325, + "step": 51239 + }, + { + "epoch": 3.094981009224091, + "grad_norm": 2.827394962310791, + "learning_rate": 3.080855145950391e-06, + "loss": 0.0854, + "step": 51240 + }, + { + "epoch": 3.094994574064026, + "grad_norm": 3.7412474155426025, + "learning_rate": 3.080718103330136e-06, + "loss": 0.1423, + "step": 51241 + }, + { + "epoch": 3.0950081389039608, + "grad_norm": 3.8711447715759277, + "learning_rate": 3.0805810607098815e-06, + "loss": 0.0811, + "step": 51242 + }, + { + "epoch": 3.0950217037438956, + "grad_norm": 3.702803611755371, + "learning_rate": 3.0804440180896262e-06, + "loss": 0.0782, + "step": 51243 + }, + { + "epoch": 3.095035268583831, + "grad_norm": 5.590970993041992, + "learning_rate": 3.080306975469371e-06, + "loss": 0.1771, + "step": 51244 + }, + { + "epoch": 3.095048833423766, + "grad_norm": 3.0694870948791504, + "learning_rate": 3.080169932849116e-06, + "loss": 0.1118, + "step": 51245 + }, + { + "epoch": 3.0950623982637007, + "grad_norm": 4.70847225189209, + "learning_rate": 3.0800328902288613e-06, + "loss": 0.1444, + "step": 51246 + }, + { + "epoch": 3.0950759631036355, + "grad_norm": 3.9664371013641357, + "learning_rate": 3.0798958476086065e-06, + "loss": 0.096, + "step": 51247 + }, + { + "epoch": 3.0950895279435704, + "grad_norm": 4.211836814880371, + "learning_rate": 3.0797588049883513e-06, + "loss": 0.1094, + "step": 51248 + }, + { + "epoch": 3.0951030927835053, + "grad_norm": 3.7598886489868164, + "learning_rate": 3.079621762368097e-06, + "loss": 0.1188, + "step": 51249 + }, + { + "epoch": 3.09511665762344, + "grad_norm": 2.543961524963379, + "learning_rate": 3.0794847197478416e-06, + "loss": 0.0688, + "step": 51250 + }, + { + "epoch": 3.095130222463375, + "grad_norm": 4.659799098968506, + "learning_rate": 3.0793476771275872e-06, + "loss": 0.1147, + "step": 51251 + }, + { + "epoch": 3.09514378730331, + "grad_norm": 4.657317638397217, + "learning_rate": 3.079210634507332e-06, + "loss": 0.1165, + "step": 51252 + }, + { + "epoch": 3.0951573521432447, + "grad_norm": 4.753658771514893, + "learning_rate": 3.079073591887077e-06, + "loss": 0.1213, + "step": 51253 + }, + { + "epoch": 3.0951709169831796, + "grad_norm": 4.85764217376709, + "learning_rate": 3.0789365492668224e-06, + "loss": 0.1277, + "step": 51254 + }, + { + "epoch": 3.0951844818231145, + "grad_norm": 4.695246696472168, + "learning_rate": 3.0787995066465675e-06, + "loss": 0.1075, + "step": 51255 + }, + { + "epoch": 3.0951980466630493, + "grad_norm": 4.864492416381836, + "learning_rate": 3.0786624640263123e-06, + "loss": 0.1481, + "step": 51256 + }, + { + "epoch": 3.095211611502984, + "grad_norm": 3.5602991580963135, + "learning_rate": 3.078525421406058e-06, + "loss": 0.1082, + "step": 51257 + }, + { + "epoch": 3.095225176342919, + "grad_norm": 5.471630096435547, + "learning_rate": 3.0783883787858026e-06, + "loss": 0.1944, + "step": 51258 + }, + { + "epoch": 3.095238741182854, + "grad_norm": 3.0734853744506836, + "learning_rate": 3.0782513361655474e-06, + "loss": 0.0628, + "step": 51259 + }, + { + "epoch": 3.095252306022789, + "grad_norm": 3.889191150665283, + "learning_rate": 3.078114293545293e-06, + "loss": 0.1138, + "step": 51260 + }, + { + "epoch": 3.0952658708627236, + "grad_norm": 3.812160015106201, + "learning_rate": 3.0779772509250378e-06, + "loss": 0.1411, + "step": 51261 + }, + { + "epoch": 3.0952794357026585, + "grad_norm": 3.885064125061035, + "learning_rate": 3.077840208304783e-06, + "loss": 0.166, + "step": 51262 + }, + { + "epoch": 3.0952930005425934, + "grad_norm": 5.1359429359436035, + "learning_rate": 3.077703165684528e-06, + "loss": 0.1888, + "step": 51263 + }, + { + "epoch": 3.0953065653825287, + "grad_norm": 4.4181718826293945, + "learning_rate": 3.0775661230642733e-06, + "loss": 0.1706, + "step": 51264 + }, + { + "epoch": 3.0953201302224636, + "grad_norm": 2.7259631156921387, + "learning_rate": 3.077429080444018e-06, + "loss": 0.0497, + "step": 51265 + }, + { + "epoch": 3.0953336950623984, + "grad_norm": 3.21858549118042, + "learning_rate": 3.0772920378237637e-06, + "loss": 0.0738, + "step": 51266 + }, + { + "epoch": 3.0953472599023333, + "grad_norm": 3.2251923084259033, + "learning_rate": 3.0771549952035084e-06, + "loss": 0.0609, + "step": 51267 + }, + { + "epoch": 3.095360824742268, + "grad_norm": 5.617879390716553, + "learning_rate": 3.077017952583254e-06, + "loss": 0.1599, + "step": 51268 + }, + { + "epoch": 3.095374389582203, + "grad_norm": 4.685853004455566, + "learning_rate": 3.0768809099629988e-06, + "loss": 0.1132, + "step": 51269 + }, + { + "epoch": 3.095387954422138, + "grad_norm": 4.735690593719482, + "learning_rate": 3.076743867342744e-06, + "loss": 0.0985, + "step": 51270 + }, + { + "epoch": 3.0954015192620727, + "grad_norm": 3.989617347717285, + "learning_rate": 3.076606824722489e-06, + "loss": 0.0958, + "step": 51271 + }, + { + "epoch": 3.0954150841020076, + "grad_norm": 5.247983455657959, + "learning_rate": 3.076469782102234e-06, + "loss": 0.1126, + "step": 51272 + }, + { + "epoch": 3.0954286489419425, + "grad_norm": 3.5504488945007324, + "learning_rate": 3.076332739481979e-06, + "loss": 0.0913, + "step": 51273 + }, + { + "epoch": 3.0954422137818773, + "grad_norm": 4.112853050231934, + "learning_rate": 3.0761956968617242e-06, + "loss": 0.0944, + "step": 51274 + }, + { + "epoch": 3.095455778621812, + "grad_norm": 4.051699161529541, + "learning_rate": 3.0760586542414694e-06, + "loss": 0.1758, + "step": 51275 + }, + { + "epoch": 3.095469343461747, + "grad_norm": 6.742144584655762, + "learning_rate": 3.075921611621214e-06, + "loss": 0.1099, + "step": 51276 + }, + { + "epoch": 3.095482908301682, + "grad_norm": 5.445526123046875, + "learning_rate": 3.0757845690009598e-06, + "loss": 0.1747, + "step": 51277 + }, + { + "epoch": 3.095496473141617, + "grad_norm": 4.658421516418457, + "learning_rate": 3.0756475263807045e-06, + "loss": 0.1345, + "step": 51278 + }, + { + "epoch": 3.0955100379815517, + "grad_norm": 4.346694469451904, + "learning_rate": 3.0755104837604497e-06, + "loss": 0.1068, + "step": 51279 + }, + { + "epoch": 3.0955236028214865, + "grad_norm": 5.622933864593506, + "learning_rate": 3.075373441140195e-06, + "loss": 0.1201, + "step": 51280 + }, + { + "epoch": 3.0955371676614214, + "grad_norm": 3.115218162536621, + "learning_rate": 3.07523639851994e-06, + "loss": 0.0471, + "step": 51281 + }, + { + "epoch": 3.0955507325013567, + "grad_norm": 3.747958183288574, + "learning_rate": 3.075099355899685e-06, + "loss": 0.0684, + "step": 51282 + }, + { + "epoch": 3.0955642973412916, + "grad_norm": 3.05521297454834, + "learning_rate": 3.0749623132794304e-06, + "loss": 0.0666, + "step": 51283 + }, + { + "epoch": 3.0955778621812264, + "grad_norm": 5.773528575897217, + "learning_rate": 3.074825270659175e-06, + "loss": 0.13, + "step": 51284 + }, + { + "epoch": 3.0955914270211613, + "grad_norm": 4.4237141609191895, + "learning_rate": 3.07468822803892e-06, + "loss": 0.1062, + "step": 51285 + }, + { + "epoch": 3.095604991861096, + "grad_norm": 6.245006084442139, + "learning_rate": 3.0745511854186655e-06, + "loss": 0.146, + "step": 51286 + }, + { + "epoch": 3.095618556701031, + "grad_norm": 5.416166305541992, + "learning_rate": 3.0744141427984103e-06, + "loss": 0.0559, + "step": 51287 + }, + { + "epoch": 3.095632121540966, + "grad_norm": 2.499612808227539, + "learning_rate": 3.074277100178156e-06, + "loss": 0.0305, + "step": 51288 + }, + { + "epoch": 3.0956456863809008, + "grad_norm": 4.371592998504639, + "learning_rate": 3.0741400575579007e-06, + "loss": 0.0783, + "step": 51289 + }, + { + "epoch": 3.0956592512208356, + "grad_norm": 3.1330368518829346, + "learning_rate": 3.074003014937646e-06, + "loss": 0.0896, + "step": 51290 + }, + { + "epoch": 3.0956728160607705, + "grad_norm": 4.236929416656494, + "learning_rate": 3.073865972317391e-06, + "loss": 0.0961, + "step": 51291 + }, + { + "epoch": 3.0956863809007054, + "grad_norm": 6.031837463378906, + "learning_rate": 3.073728929697136e-06, + "loss": 0.1285, + "step": 51292 + }, + { + "epoch": 3.0956999457406402, + "grad_norm": 3.0645432472229004, + "learning_rate": 3.073591887076881e-06, + "loss": 0.0683, + "step": 51293 + }, + { + "epoch": 3.095713510580575, + "grad_norm": 3.0082433223724365, + "learning_rate": 3.0734548444566265e-06, + "loss": 0.0606, + "step": 51294 + }, + { + "epoch": 3.09572707542051, + "grad_norm": 4.252532005310059, + "learning_rate": 3.0733178018363713e-06, + "loss": 0.1449, + "step": 51295 + }, + { + "epoch": 3.095740640260445, + "grad_norm": 3.675057888031006, + "learning_rate": 3.073180759216117e-06, + "loss": 0.0923, + "step": 51296 + }, + { + "epoch": 3.0957542051003797, + "grad_norm": 4.756216526031494, + "learning_rate": 3.0730437165958617e-06, + "loss": 0.1342, + "step": 51297 + }, + { + "epoch": 3.0957677699403146, + "grad_norm": 4.224958419799805, + "learning_rate": 3.072906673975607e-06, + "loss": 0.0633, + "step": 51298 + }, + { + "epoch": 3.0957813347802494, + "grad_norm": 5.154955863952637, + "learning_rate": 3.0727696313553516e-06, + "loss": 0.0954, + "step": 51299 + }, + { + "epoch": 3.0957948996201843, + "grad_norm": 3.285933256149292, + "learning_rate": 3.0726325887350968e-06, + "loss": 0.047, + "step": 51300 + }, + { + "epoch": 3.095808464460119, + "grad_norm": 3.69409441947937, + "learning_rate": 3.072495546114842e-06, + "loss": 0.0833, + "step": 51301 + }, + { + "epoch": 3.0958220293000545, + "grad_norm": 5.65084171295166, + "learning_rate": 3.0723585034945867e-06, + "loss": 0.1141, + "step": 51302 + }, + { + "epoch": 3.0958355941399893, + "grad_norm": 4.098963737487793, + "learning_rate": 3.0722214608743323e-06, + "loss": 0.0828, + "step": 51303 + }, + { + "epoch": 3.095849158979924, + "grad_norm": 3.4949111938476562, + "learning_rate": 3.072084418254077e-06, + "loss": 0.0762, + "step": 51304 + }, + { + "epoch": 3.095862723819859, + "grad_norm": 2.6293325424194336, + "learning_rate": 3.0719473756338227e-06, + "loss": 0.0725, + "step": 51305 + }, + { + "epoch": 3.095876288659794, + "grad_norm": 3.7163407802581787, + "learning_rate": 3.0718103330135674e-06, + "loss": 0.0606, + "step": 51306 + }, + { + "epoch": 3.095889853499729, + "grad_norm": 2.690656900405884, + "learning_rate": 3.0716732903933126e-06, + "loss": 0.0679, + "step": 51307 + }, + { + "epoch": 3.0959034183396636, + "grad_norm": 2.9964468479156494, + "learning_rate": 3.0715362477730578e-06, + "loss": 0.0345, + "step": 51308 + }, + { + "epoch": 3.0959169831795985, + "grad_norm": 3.4293947219848633, + "learning_rate": 3.071399205152803e-06, + "loss": 0.0665, + "step": 51309 + }, + { + "epoch": 3.0959305480195334, + "grad_norm": 3.381120443344116, + "learning_rate": 3.0712621625325477e-06, + "loss": 0.0771, + "step": 51310 + }, + { + "epoch": 3.0959441128594682, + "grad_norm": 4.980320453643799, + "learning_rate": 3.0711251199122933e-06, + "loss": 0.1459, + "step": 51311 + }, + { + "epoch": 3.095957677699403, + "grad_norm": 5.416153430938721, + "learning_rate": 3.070988077292038e-06, + "loss": 0.0953, + "step": 51312 + }, + { + "epoch": 3.095971242539338, + "grad_norm": 2.9663772583007812, + "learning_rate": 3.070851034671783e-06, + "loss": 0.0444, + "step": 51313 + }, + { + "epoch": 3.095984807379273, + "grad_norm": 2.513089418411255, + "learning_rate": 3.0707139920515284e-06, + "loss": 0.0571, + "step": 51314 + }, + { + "epoch": 3.0959983722192077, + "grad_norm": 6.132740497589111, + "learning_rate": 3.070576949431273e-06, + "loss": 0.1705, + "step": 51315 + }, + { + "epoch": 3.0960119370591426, + "grad_norm": 4.209099769592285, + "learning_rate": 3.0704399068110184e-06, + "loss": 0.0874, + "step": 51316 + }, + { + "epoch": 3.0960255018990774, + "grad_norm": 2.967055082321167, + "learning_rate": 3.0703028641907635e-06, + "loss": 0.0631, + "step": 51317 + }, + { + "epoch": 3.0960390667390123, + "grad_norm": 2.7309842109680176, + "learning_rate": 3.0701658215705087e-06, + "loss": 0.0369, + "step": 51318 + }, + { + "epoch": 3.096052631578947, + "grad_norm": 4.0386128425598145, + "learning_rate": 3.0700287789502535e-06, + "loss": 0.0833, + "step": 51319 + }, + { + "epoch": 3.0960661964188825, + "grad_norm": 3.8151257038116455, + "learning_rate": 3.069891736329999e-06, + "loss": 0.0852, + "step": 51320 + }, + { + "epoch": 3.0960797612588173, + "grad_norm": 5.960270881652832, + "learning_rate": 3.069754693709744e-06, + "loss": 0.1315, + "step": 51321 + }, + { + "epoch": 3.096093326098752, + "grad_norm": 4.085968971252441, + "learning_rate": 3.0696176510894894e-06, + "loss": 0.0598, + "step": 51322 + }, + { + "epoch": 3.096106890938687, + "grad_norm": 3.1366095542907715, + "learning_rate": 3.069480608469234e-06, + "loss": 0.053, + "step": 51323 + }, + { + "epoch": 3.096120455778622, + "grad_norm": 4.385416507720947, + "learning_rate": 3.0693435658489794e-06, + "loss": 0.0598, + "step": 51324 + }, + { + "epoch": 3.096134020618557, + "grad_norm": 3.902431011199951, + "learning_rate": 3.0692065232287246e-06, + "loss": 0.0585, + "step": 51325 + }, + { + "epoch": 3.0961475854584917, + "grad_norm": 3.5301802158355713, + "learning_rate": 3.0690694806084693e-06, + "loss": 0.0758, + "step": 51326 + }, + { + "epoch": 3.0961611502984265, + "grad_norm": 2.9463999271392822, + "learning_rate": 3.0689324379882145e-06, + "loss": 0.0485, + "step": 51327 + }, + { + "epoch": 3.0961747151383614, + "grad_norm": 3.8482882976531982, + "learning_rate": 3.0687953953679592e-06, + "loss": 0.0737, + "step": 51328 + }, + { + "epoch": 3.0961882799782963, + "grad_norm": 3.4958815574645996, + "learning_rate": 3.068658352747705e-06, + "loss": 0.0822, + "step": 51329 + }, + { + "epoch": 3.096201844818231, + "grad_norm": 3.260176420211792, + "learning_rate": 3.0685213101274496e-06, + "loss": 0.0615, + "step": 51330 + }, + { + "epoch": 3.096215409658166, + "grad_norm": 4.110658168792725, + "learning_rate": 3.068384267507195e-06, + "loss": 0.096, + "step": 51331 + }, + { + "epoch": 3.096228974498101, + "grad_norm": 5.3178606033325195, + "learning_rate": 3.06824722488694e-06, + "loss": 0.1138, + "step": 51332 + }, + { + "epoch": 3.0962425393380357, + "grad_norm": 4.303011894226074, + "learning_rate": 3.068110182266685e-06, + "loss": 0.0974, + "step": 51333 + }, + { + "epoch": 3.0962561041779706, + "grad_norm": 2.105969190597534, + "learning_rate": 3.0679731396464303e-06, + "loss": 0.0718, + "step": 51334 + }, + { + "epoch": 3.0962696690179055, + "grad_norm": 2.9493236541748047, + "learning_rate": 3.0678360970261755e-06, + "loss": 0.1151, + "step": 51335 + }, + { + "epoch": 3.0962832338578403, + "grad_norm": 3.8292486667633057, + "learning_rate": 3.0676990544059202e-06, + "loss": 0.0778, + "step": 51336 + }, + { + "epoch": 3.096296798697775, + "grad_norm": 3.3310656547546387, + "learning_rate": 3.067562011785666e-06, + "loss": 0.0778, + "step": 51337 + }, + { + "epoch": 3.09631036353771, + "grad_norm": 3.488023042678833, + "learning_rate": 3.0674249691654106e-06, + "loss": 0.073, + "step": 51338 + }, + { + "epoch": 3.096323928377645, + "grad_norm": 3.2670605182647705, + "learning_rate": 3.067287926545156e-06, + "loss": 0.0855, + "step": 51339 + }, + { + "epoch": 3.0963374932175802, + "grad_norm": 7.113583087921143, + "learning_rate": 3.067150883924901e-06, + "loss": 0.1275, + "step": 51340 + }, + { + "epoch": 3.096351058057515, + "grad_norm": 4.307428359985352, + "learning_rate": 3.0670138413046457e-06, + "loss": 0.1013, + "step": 51341 + }, + { + "epoch": 3.09636462289745, + "grad_norm": 3.551532745361328, + "learning_rate": 3.0668767986843913e-06, + "loss": 0.112, + "step": 51342 + }, + { + "epoch": 3.096378187737385, + "grad_norm": 3.9568240642547607, + "learning_rate": 3.066739756064136e-06, + "loss": 0.1216, + "step": 51343 + }, + { + "epoch": 3.0963917525773197, + "grad_norm": 3.4692623615264893, + "learning_rate": 3.0666027134438813e-06, + "loss": 0.0825, + "step": 51344 + }, + { + "epoch": 3.0964053174172546, + "grad_norm": 4.332705020904541, + "learning_rate": 3.0664656708236264e-06, + "loss": 0.1136, + "step": 51345 + }, + { + "epoch": 3.0964188822571894, + "grad_norm": 3.970902681350708, + "learning_rate": 3.0663286282033716e-06, + "loss": 0.1269, + "step": 51346 + }, + { + "epoch": 3.0964324470971243, + "grad_norm": 3.7597146034240723, + "learning_rate": 3.0661915855831164e-06, + "loss": 0.1087, + "step": 51347 + }, + { + "epoch": 3.096446011937059, + "grad_norm": 4.162759780883789, + "learning_rate": 3.066054542962862e-06, + "loss": 0.1058, + "step": 51348 + }, + { + "epoch": 3.096459576776994, + "grad_norm": 6.734442234039307, + "learning_rate": 3.0659175003426067e-06, + "loss": 0.0696, + "step": 51349 + }, + { + "epoch": 3.096473141616929, + "grad_norm": 2.725080728530884, + "learning_rate": 3.065780457722352e-06, + "loss": 0.0575, + "step": 51350 + }, + { + "epoch": 3.0964867064568637, + "grad_norm": 3.727543830871582, + "learning_rate": 3.065643415102097e-06, + "loss": 0.1215, + "step": 51351 + }, + { + "epoch": 3.0965002712967986, + "grad_norm": 3.0850324630737305, + "learning_rate": 3.0655063724818423e-06, + "loss": 0.0824, + "step": 51352 + }, + { + "epoch": 3.0965138361367335, + "grad_norm": 3.5416934490203857, + "learning_rate": 3.065369329861587e-06, + "loss": 0.0891, + "step": 51353 + }, + { + "epoch": 3.0965274009766683, + "grad_norm": 2.9939234256744385, + "learning_rate": 3.065232287241332e-06, + "loss": 0.0683, + "step": 51354 + }, + { + "epoch": 3.096540965816603, + "grad_norm": 3.8376779556274414, + "learning_rate": 3.0650952446210774e-06, + "loss": 0.1004, + "step": 51355 + }, + { + "epoch": 3.096554530656538, + "grad_norm": 2.65583872795105, + "learning_rate": 3.064958202000822e-06, + "loss": 0.0659, + "step": 51356 + }, + { + "epoch": 3.096568095496473, + "grad_norm": 3.655606269836426, + "learning_rate": 3.0648211593805677e-06, + "loss": 0.1038, + "step": 51357 + }, + { + "epoch": 3.0965816603364082, + "grad_norm": 3.533550977706909, + "learning_rate": 3.0646841167603125e-06, + "loss": 0.1155, + "step": 51358 + }, + { + "epoch": 3.096595225176343, + "grad_norm": 4.156770706176758, + "learning_rate": 3.064547074140058e-06, + "loss": 0.0941, + "step": 51359 + }, + { + "epoch": 3.096608790016278, + "grad_norm": 4.7132391929626465, + "learning_rate": 3.064410031519803e-06, + "loss": 0.094, + "step": 51360 + }, + { + "epoch": 3.096622354856213, + "grad_norm": 3.3503050804138184, + "learning_rate": 3.064272988899548e-06, + "loss": 0.0784, + "step": 51361 + }, + { + "epoch": 3.0966359196961477, + "grad_norm": 4.701788425445557, + "learning_rate": 3.064135946279293e-06, + "loss": 0.1639, + "step": 51362 + }, + { + "epoch": 3.0966494845360826, + "grad_norm": 4.296726226806641, + "learning_rate": 3.0639989036590384e-06, + "loss": 0.1087, + "step": 51363 + }, + { + "epoch": 3.0966630493760174, + "grad_norm": 3.4894614219665527, + "learning_rate": 3.063861861038783e-06, + "loss": 0.1137, + "step": 51364 + }, + { + "epoch": 3.0966766142159523, + "grad_norm": 3.906867027282715, + "learning_rate": 3.0637248184185287e-06, + "loss": 0.1343, + "step": 51365 + }, + { + "epoch": 3.096690179055887, + "grad_norm": 4.6313395500183105, + "learning_rate": 3.0635877757982735e-06, + "loss": 0.1181, + "step": 51366 + }, + { + "epoch": 3.096703743895822, + "grad_norm": 4.059659481048584, + "learning_rate": 3.063450733178019e-06, + "loss": 0.0908, + "step": 51367 + }, + { + "epoch": 3.096717308735757, + "grad_norm": 3.419174909591675, + "learning_rate": 3.063313690557764e-06, + "loss": 0.0562, + "step": 51368 + }, + { + "epoch": 3.0967308735756918, + "grad_norm": 3.0184476375579834, + "learning_rate": 3.0631766479375086e-06, + "loss": 0.05, + "step": 51369 + }, + { + "epoch": 3.0967444384156266, + "grad_norm": 3.394821882247925, + "learning_rate": 3.063039605317254e-06, + "loss": 0.0706, + "step": 51370 + }, + { + "epoch": 3.0967580032555615, + "grad_norm": 2.8590481281280518, + "learning_rate": 3.062902562696999e-06, + "loss": 0.0763, + "step": 51371 + }, + { + "epoch": 3.0967715680954964, + "grad_norm": 3.1535825729370117, + "learning_rate": 3.062765520076744e-06, + "loss": 0.0381, + "step": 51372 + }, + { + "epoch": 3.0967851329354312, + "grad_norm": 2.418236255645752, + "learning_rate": 3.062628477456489e-06, + "loss": 0.0327, + "step": 51373 + }, + { + "epoch": 3.096798697775366, + "grad_norm": 3.619807481765747, + "learning_rate": 3.0624914348362345e-06, + "loss": 0.0903, + "step": 51374 + }, + { + "epoch": 3.096812262615301, + "grad_norm": 4.583805084228516, + "learning_rate": 3.0623543922159793e-06, + "loss": 0.0869, + "step": 51375 + }, + { + "epoch": 3.096825827455236, + "grad_norm": 2.3630099296569824, + "learning_rate": 3.062217349595725e-06, + "loss": 0.0284, + "step": 51376 + }, + { + "epoch": 3.0968393922951707, + "grad_norm": 4.3819074630737305, + "learning_rate": 3.0620803069754696e-06, + "loss": 0.0454, + "step": 51377 + }, + { + "epoch": 3.096852957135106, + "grad_norm": 2.8631129264831543, + "learning_rate": 3.061943264355215e-06, + "loss": 0.0503, + "step": 51378 + }, + { + "epoch": 3.096866521975041, + "grad_norm": 3.940913677215576, + "learning_rate": 3.06180622173496e-06, + "loss": 0.0841, + "step": 51379 + }, + { + "epoch": 3.0968800868149757, + "grad_norm": 5.201568603515625, + "learning_rate": 3.061669179114705e-06, + "loss": 0.1256, + "step": 51380 + }, + { + "epoch": 3.0968936516549106, + "grad_norm": 3.850558280944824, + "learning_rate": 3.06153213649445e-06, + "loss": 0.0548, + "step": 51381 + }, + { + "epoch": 3.0969072164948455, + "grad_norm": 5.221724510192871, + "learning_rate": 3.0613950938741947e-06, + "loss": 0.0948, + "step": 51382 + }, + { + "epoch": 3.0969207813347803, + "grad_norm": 7.031120777130127, + "learning_rate": 3.0612580512539403e-06, + "loss": 0.1029, + "step": 51383 + }, + { + "epoch": 3.096934346174715, + "grad_norm": 2.388934850692749, + "learning_rate": 3.061121008633685e-06, + "loss": 0.0345, + "step": 51384 + }, + { + "epoch": 3.09694791101465, + "grad_norm": 3.5243003368377686, + "learning_rate": 3.0609839660134306e-06, + "loss": 0.0484, + "step": 51385 + }, + { + "epoch": 3.096961475854585, + "grad_norm": 5.38654899597168, + "learning_rate": 3.0608469233931754e-06, + "loss": 0.1578, + "step": 51386 + }, + { + "epoch": 3.09697504069452, + "grad_norm": 2.584385871887207, + "learning_rate": 3.0607098807729206e-06, + "loss": 0.0565, + "step": 51387 + }, + { + "epoch": 3.0969886055344547, + "grad_norm": 3.7351646423339844, + "learning_rate": 3.0605728381526657e-06, + "loss": 0.0818, + "step": 51388 + }, + { + "epoch": 3.0970021703743895, + "grad_norm": 3.7923965454101562, + "learning_rate": 3.060435795532411e-06, + "loss": 0.0607, + "step": 51389 + }, + { + "epoch": 3.0970157352143244, + "grad_norm": 8.306854248046875, + "learning_rate": 3.0602987529121557e-06, + "loss": 0.0453, + "step": 51390 + }, + { + "epoch": 3.0970293000542592, + "grad_norm": 3.935472249984741, + "learning_rate": 3.0601617102919013e-06, + "loss": 0.0935, + "step": 51391 + }, + { + "epoch": 3.097042864894194, + "grad_norm": 8.070773124694824, + "learning_rate": 3.060024667671646e-06, + "loss": 0.0852, + "step": 51392 + }, + { + "epoch": 3.097056429734129, + "grad_norm": 4.740516662597656, + "learning_rate": 3.0598876250513916e-06, + "loss": 0.1057, + "step": 51393 + }, + { + "epoch": 3.097069994574064, + "grad_norm": 4.800549507141113, + "learning_rate": 3.0597505824311364e-06, + "loss": 0.0948, + "step": 51394 + }, + { + "epoch": 3.0970835594139987, + "grad_norm": 3.615598678588867, + "learning_rate": 3.059613539810881e-06, + "loss": 0.0849, + "step": 51395 + }, + { + "epoch": 3.097097124253934, + "grad_norm": 3.774935245513916, + "learning_rate": 3.0594764971906267e-06, + "loss": 0.0878, + "step": 51396 + }, + { + "epoch": 3.097110689093869, + "grad_norm": 2.974020481109619, + "learning_rate": 3.0593394545703715e-06, + "loss": 0.0431, + "step": 51397 + }, + { + "epoch": 3.0971242539338038, + "grad_norm": 1.7771403789520264, + "learning_rate": 3.0592024119501167e-06, + "loss": 0.0334, + "step": 51398 + }, + { + "epoch": 3.0971378187737386, + "grad_norm": 3.9354372024536133, + "learning_rate": 3.0590653693298614e-06, + "loss": 0.077, + "step": 51399 + }, + { + "epoch": 3.0971513836136735, + "grad_norm": 4.0808939933776855, + "learning_rate": 3.058928326709607e-06, + "loss": 0.1134, + "step": 51400 + }, + { + "epoch": 3.0971649484536083, + "grad_norm": 3.2188823223114014, + "learning_rate": 3.058791284089352e-06, + "loss": 0.0962, + "step": 51401 + }, + { + "epoch": 3.097178513293543, + "grad_norm": 3.58193302154541, + "learning_rate": 3.0586542414690974e-06, + "loss": 0.0702, + "step": 51402 + }, + { + "epoch": 3.097192078133478, + "grad_norm": 3.132467269897461, + "learning_rate": 3.058517198848842e-06, + "loss": 0.0853, + "step": 51403 + }, + { + "epoch": 3.097205642973413, + "grad_norm": 3.889742374420166, + "learning_rate": 3.0583801562285873e-06, + "loss": 0.0871, + "step": 51404 + }, + { + "epoch": 3.097219207813348, + "grad_norm": 4.623022079467773, + "learning_rate": 3.0582431136083325e-06, + "loss": 0.1521, + "step": 51405 + }, + { + "epoch": 3.0972327726532827, + "grad_norm": 3.1403801441192627, + "learning_rate": 3.0581060709880777e-06, + "loss": 0.0625, + "step": 51406 + }, + { + "epoch": 3.0972463374932175, + "grad_norm": 4.099480628967285, + "learning_rate": 3.0579690283678224e-06, + "loss": 0.0973, + "step": 51407 + }, + { + "epoch": 3.0972599023331524, + "grad_norm": 4.651482582092285, + "learning_rate": 3.057831985747568e-06, + "loss": 0.1375, + "step": 51408 + }, + { + "epoch": 3.0972734671730873, + "grad_norm": 3.7243709564208984, + "learning_rate": 3.057694943127313e-06, + "loss": 0.0813, + "step": 51409 + }, + { + "epoch": 3.097287032013022, + "grad_norm": 5.838160514831543, + "learning_rate": 3.0575579005070576e-06, + "loss": 0.0851, + "step": 51410 + }, + { + "epoch": 3.097300596852957, + "grad_norm": 5.237764358520508, + "learning_rate": 3.057420857886803e-06, + "loss": 0.1193, + "step": 51411 + }, + { + "epoch": 3.097314161692892, + "grad_norm": 3.1349096298217773, + "learning_rate": 3.057283815266548e-06, + "loss": 0.0797, + "step": 51412 + }, + { + "epoch": 3.0973277265328267, + "grad_norm": 3.4832282066345215, + "learning_rate": 3.0571467726462935e-06, + "loss": 0.0982, + "step": 51413 + }, + { + "epoch": 3.0973412913727616, + "grad_norm": 6.020821571350098, + "learning_rate": 3.0570097300260383e-06, + "loss": 0.1156, + "step": 51414 + }, + { + "epoch": 3.097354856212697, + "grad_norm": 5.312338352203369, + "learning_rate": 3.0568726874057835e-06, + "loss": 0.1003, + "step": 51415 + }, + { + "epoch": 3.0973684210526318, + "grad_norm": 3.634676933288574, + "learning_rate": 3.0567356447855286e-06, + "loss": 0.0778, + "step": 51416 + }, + { + "epoch": 3.0973819858925666, + "grad_norm": 5.033390045166016, + "learning_rate": 3.056598602165274e-06, + "loss": 0.0915, + "step": 51417 + }, + { + "epoch": 3.0973955507325015, + "grad_norm": 3.3497321605682373, + "learning_rate": 3.0564615595450186e-06, + "loss": 0.09, + "step": 51418 + }, + { + "epoch": 3.0974091155724364, + "grad_norm": 4.862257480621338, + "learning_rate": 3.056324516924764e-06, + "loss": 0.1411, + "step": 51419 + }, + { + "epoch": 3.0974226804123712, + "grad_norm": 3.3938419818878174, + "learning_rate": 3.056187474304509e-06, + "loss": 0.07, + "step": 51420 + }, + { + "epoch": 3.097436245252306, + "grad_norm": 2.6362600326538086, + "learning_rate": 3.056050431684254e-06, + "loss": 0.0442, + "step": 51421 + }, + { + "epoch": 3.097449810092241, + "grad_norm": 3.3378875255584717, + "learning_rate": 3.0559133890639993e-06, + "loss": 0.1155, + "step": 51422 + }, + { + "epoch": 3.097463374932176, + "grad_norm": 2.802063226699829, + "learning_rate": 3.055776346443744e-06, + "loss": 0.057, + "step": 51423 + }, + { + "epoch": 3.0974769397721107, + "grad_norm": 4.449822902679443, + "learning_rate": 3.0556393038234892e-06, + "loss": 0.1506, + "step": 51424 + }, + { + "epoch": 3.0974905046120456, + "grad_norm": 3.4331703186035156, + "learning_rate": 3.0555022612032344e-06, + "loss": 0.1082, + "step": 51425 + }, + { + "epoch": 3.0975040694519804, + "grad_norm": 2.8792836666107178, + "learning_rate": 3.0553652185829796e-06, + "loss": 0.0654, + "step": 51426 + }, + { + "epoch": 3.0975176342919153, + "grad_norm": 3.1574666500091553, + "learning_rate": 3.0552281759627243e-06, + "loss": 0.0738, + "step": 51427 + }, + { + "epoch": 3.09753119913185, + "grad_norm": 4.190772533416748, + "learning_rate": 3.05509113334247e-06, + "loss": 0.0979, + "step": 51428 + }, + { + "epoch": 3.097544763971785, + "grad_norm": 3.613471508026123, + "learning_rate": 3.0549540907222147e-06, + "loss": 0.0696, + "step": 51429 + }, + { + "epoch": 3.09755832881172, + "grad_norm": 3.34201979637146, + "learning_rate": 3.0548170481019603e-06, + "loss": 0.0577, + "step": 51430 + }, + { + "epoch": 3.0975718936516548, + "grad_norm": 3.418703556060791, + "learning_rate": 3.054680005481705e-06, + "loss": 0.0634, + "step": 51431 + }, + { + "epoch": 3.0975854584915896, + "grad_norm": 2.808445930480957, + "learning_rate": 3.0545429628614502e-06, + "loss": 0.07, + "step": 51432 + }, + { + "epoch": 3.0975990233315245, + "grad_norm": 3.7062957286834717, + "learning_rate": 3.0544059202411954e-06, + "loss": 0.0915, + "step": 51433 + }, + { + "epoch": 3.09761258817146, + "grad_norm": 10.63636589050293, + "learning_rate": 3.0542688776209406e-06, + "loss": 0.0863, + "step": 51434 + }, + { + "epoch": 3.0976261530113947, + "grad_norm": 4.119612693786621, + "learning_rate": 3.0541318350006853e-06, + "loss": 0.072, + "step": 51435 + }, + { + "epoch": 3.0976397178513295, + "grad_norm": 2.3835599422454834, + "learning_rate": 3.053994792380431e-06, + "loss": 0.082, + "step": 51436 + }, + { + "epoch": 3.0976532826912644, + "grad_norm": 3.179049253463745, + "learning_rate": 3.0538577497601757e-06, + "loss": 0.081, + "step": 51437 + }, + { + "epoch": 3.0976668475311993, + "grad_norm": 4.059381484985352, + "learning_rate": 3.0537207071399204e-06, + "loss": 0.109, + "step": 51438 + }, + { + "epoch": 3.097680412371134, + "grad_norm": 3.4417030811309814, + "learning_rate": 3.053583664519666e-06, + "loss": 0.0609, + "step": 51439 + }, + { + "epoch": 3.097693977211069, + "grad_norm": 5.188576698303223, + "learning_rate": 3.053446621899411e-06, + "loss": 0.0921, + "step": 51440 + }, + { + "epoch": 3.097707542051004, + "grad_norm": 3.4345808029174805, + "learning_rate": 3.053309579279156e-06, + "loss": 0.0657, + "step": 51441 + }, + { + "epoch": 3.0977211068909387, + "grad_norm": 4.3681111335754395, + "learning_rate": 3.053172536658901e-06, + "loss": 0.1503, + "step": 51442 + }, + { + "epoch": 3.0977346717308736, + "grad_norm": 4.583781719207764, + "learning_rate": 3.0530354940386463e-06, + "loss": 0.1056, + "step": 51443 + }, + { + "epoch": 3.0977482365708084, + "grad_norm": 5.746610164642334, + "learning_rate": 3.052898451418391e-06, + "loss": 0.1477, + "step": 51444 + }, + { + "epoch": 3.0977618014107433, + "grad_norm": 3.065214157104492, + "learning_rate": 3.0527614087981367e-06, + "loss": 0.089, + "step": 51445 + }, + { + "epoch": 3.097775366250678, + "grad_norm": 2.604355573654175, + "learning_rate": 3.0526243661778815e-06, + "loss": 0.0648, + "step": 51446 + }, + { + "epoch": 3.097788931090613, + "grad_norm": 2.2784488201141357, + "learning_rate": 3.052487323557627e-06, + "loss": 0.041, + "step": 51447 + }, + { + "epoch": 3.097802495930548, + "grad_norm": 3.9536633491516113, + "learning_rate": 3.052350280937372e-06, + "loss": 0.1356, + "step": 51448 + }, + { + "epoch": 3.0978160607704828, + "grad_norm": 2.7700915336608887, + "learning_rate": 3.052213238317117e-06, + "loss": 0.112, + "step": 51449 + }, + { + "epoch": 3.0978296256104176, + "grad_norm": 3.7712817192077637, + "learning_rate": 3.052076195696862e-06, + "loss": 0.1148, + "step": 51450 + }, + { + "epoch": 3.0978431904503525, + "grad_norm": 4.3623881340026855, + "learning_rate": 3.051939153076607e-06, + "loss": 0.1234, + "step": 51451 + }, + { + "epoch": 3.0978567552902874, + "grad_norm": 7.763937473297119, + "learning_rate": 3.051802110456352e-06, + "loss": 0.1647, + "step": 51452 + }, + { + "epoch": 3.0978703201302227, + "grad_norm": 3.5905420780181885, + "learning_rate": 3.051665067836097e-06, + "loss": 0.0741, + "step": 51453 + }, + { + "epoch": 3.0978838849701575, + "grad_norm": 4.601487159729004, + "learning_rate": 3.0515280252158425e-06, + "loss": 0.1133, + "step": 51454 + }, + { + "epoch": 3.0978974498100924, + "grad_norm": 4.602004528045654, + "learning_rate": 3.0513909825955872e-06, + "loss": 0.112, + "step": 51455 + }, + { + "epoch": 3.0979110146500273, + "grad_norm": 3.729506015777588, + "learning_rate": 3.051253939975333e-06, + "loss": 0.0828, + "step": 51456 + }, + { + "epoch": 3.097924579489962, + "grad_norm": 4.47662878036499, + "learning_rate": 3.0511168973550776e-06, + "loss": 0.1353, + "step": 51457 + }, + { + "epoch": 3.097938144329897, + "grad_norm": 2.1340460777282715, + "learning_rate": 3.0509798547348228e-06, + "loss": 0.0396, + "step": 51458 + }, + { + "epoch": 3.097951709169832, + "grad_norm": 5.291795253753662, + "learning_rate": 3.050842812114568e-06, + "loss": 0.1221, + "step": 51459 + }, + { + "epoch": 3.0979652740097667, + "grad_norm": 6.554056644439697, + "learning_rate": 3.050705769494313e-06, + "loss": 0.094, + "step": 51460 + }, + { + "epoch": 3.0979788388497016, + "grad_norm": 4.093897342681885, + "learning_rate": 3.050568726874058e-06, + "loss": 0.0918, + "step": 51461 + }, + { + "epoch": 3.0979924036896365, + "grad_norm": 3.9123334884643555, + "learning_rate": 3.0504316842538035e-06, + "loss": 0.0707, + "step": 51462 + }, + { + "epoch": 3.0980059685295713, + "grad_norm": 4.581665515899658, + "learning_rate": 3.0502946416335482e-06, + "loss": 0.1329, + "step": 51463 + }, + { + "epoch": 3.098019533369506, + "grad_norm": 3.933115005493164, + "learning_rate": 3.050157599013293e-06, + "loss": 0.0624, + "step": 51464 + }, + { + "epoch": 3.098033098209441, + "grad_norm": 4.140960693359375, + "learning_rate": 3.0500205563930386e-06, + "loss": 0.1776, + "step": 51465 + }, + { + "epoch": 3.098046663049376, + "grad_norm": 7.207629680633545, + "learning_rate": 3.0498835137727833e-06, + "loss": 0.1088, + "step": 51466 + }, + { + "epoch": 3.098060227889311, + "grad_norm": 3.527716636657715, + "learning_rate": 3.049746471152529e-06, + "loss": 0.0853, + "step": 51467 + }, + { + "epoch": 3.0980737927292457, + "grad_norm": 7.108616828918457, + "learning_rate": 3.0496094285322737e-06, + "loss": 0.0992, + "step": 51468 + }, + { + "epoch": 3.0980873575691805, + "grad_norm": 5.409942150115967, + "learning_rate": 3.049472385912019e-06, + "loss": 0.105, + "step": 51469 + }, + { + "epoch": 3.0981009224091154, + "grad_norm": 4.719419002532959, + "learning_rate": 3.0493353432917636e-06, + "loss": 0.1162, + "step": 51470 + }, + { + "epoch": 3.0981144872490503, + "grad_norm": 4.254302024841309, + "learning_rate": 3.0491983006715092e-06, + "loss": 0.1427, + "step": 51471 + }, + { + "epoch": 3.0981280520889856, + "grad_norm": 5.7827277183532715, + "learning_rate": 3.049061258051254e-06, + "loss": 0.142, + "step": 51472 + }, + { + "epoch": 3.0981416169289204, + "grad_norm": 4.2272233963012695, + "learning_rate": 3.0489242154309996e-06, + "loss": 0.1907, + "step": 51473 + }, + { + "epoch": 3.0981551817688553, + "grad_norm": 4.431958198547363, + "learning_rate": 3.0487871728107443e-06, + "loss": 0.1658, + "step": 51474 + }, + { + "epoch": 3.09816874660879, + "grad_norm": 5.314500331878662, + "learning_rate": 3.0486501301904895e-06, + "loss": 0.138, + "step": 51475 + }, + { + "epoch": 3.098182311448725, + "grad_norm": 3.1684494018554688, + "learning_rate": 3.0485130875702347e-06, + "loss": 0.0641, + "step": 51476 + }, + { + "epoch": 3.09819587628866, + "grad_norm": 5.445928573608398, + "learning_rate": 3.04837604494998e-06, + "loss": 0.1922, + "step": 51477 + }, + { + "epoch": 3.0982094411285948, + "grad_norm": 7.509047031402588, + "learning_rate": 3.0482390023297246e-06, + "loss": 0.1904, + "step": 51478 + }, + { + "epoch": 3.0982230059685296, + "grad_norm": 5.319990158081055, + "learning_rate": 3.04810195970947e-06, + "loss": 0.178, + "step": 51479 + }, + { + "epoch": 3.0982365708084645, + "grad_norm": 5.433893203735352, + "learning_rate": 3.047964917089215e-06, + "loss": 0.231, + "step": 51480 + }, + { + "epoch": 3.0982501356483994, + "grad_norm": 3.7962465286254883, + "learning_rate": 3.0478278744689598e-06, + "loss": 0.1327, + "step": 51481 + }, + { + "epoch": 3.098263700488334, + "grad_norm": 3.7834129333496094, + "learning_rate": 3.0476908318487054e-06, + "loss": 0.0545, + "step": 51482 + }, + { + "epoch": 3.098277265328269, + "grad_norm": 3.4786550998687744, + "learning_rate": 3.04755378922845e-06, + "loss": 0.0867, + "step": 51483 + }, + { + "epoch": 3.098290830168204, + "grad_norm": 4.04520845413208, + "learning_rate": 3.0474167466081957e-06, + "loss": 0.0641, + "step": 51484 + }, + { + "epoch": 3.098304395008139, + "grad_norm": 4.206044673919678, + "learning_rate": 3.0472797039879405e-06, + "loss": 0.0869, + "step": 51485 + }, + { + "epoch": 3.0983179598480737, + "grad_norm": 6.172721862792969, + "learning_rate": 3.0471426613676856e-06, + "loss": 0.2996, + "step": 51486 + }, + { + "epoch": 3.0983315246880085, + "grad_norm": 5.203311920166016, + "learning_rate": 3.047005618747431e-06, + "loss": 0.08, + "step": 51487 + }, + { + "epoch": 3.0983450895279434, + "grad_norm": 5.68187952041626, + "learning_rate": 3.046868576127176e-06, + "loss": 0.3052, + "step": 51488 + }, + { + "epoch": 3.0983586543678783, + "grad_norm": 3.3459129333496094, + "learning_rate": 3.0467315335069208e-06, + "loss": 0.0821, + "step": 51489 + }, + { + "epoch": 3.098372219207813, + "grad_norm": 4.017200469970703, + "learning_rate": 3.0465944908866664e-06, + "loss": 0.114, + "step": 51490 + }, + { + "epoch": 3.0983857840477484, + "grad_norm": 5.209418296813965, + "learning_rate": 3.046457448266411e-06, + "loss": 0.2969, + "step": 51491 + }, + { + "epoch": 3.0983993488876833, + "grad_norm": 4.96990442276001, + "learning_rate": 3.046320405646156e-06, + "loss": 0.2165, + "step": 51492 + }, + { + "epoch": 3.098412913727618, + "grad_norm": 5.374952793121338, + "learning_rate": 3.0461833630259015e-06, + "loss": 0.1498, + "step": 51493 + }, + { + "epoch": 3.098426478567553, + "grad_norm": 3.4881579875946045, + "learning_rate": 3.0460463204056462e-06, + "loss": 0.0687, + "step": 51494 + }, + { + "epoch": 3.098440043407488, + "grad_norm": 4.771442890167236, + "learning_rate": 3.0459092777853914e-06, + "loss": 0.1471, + "step": 51495 + }, + { + "epoch": 3.0984536082474228, + "grad_norm": 5.506068706512451, + "learning_rate": 3.0457722351651366e-06, + "loss": 0.116, + "step": 51496 + }, + { + "epoch": 3.0984671730873576, + "grad_norm": 5.240971565246582, + "learning_rate": 3.0456351925448818e-06, + "loss": 0.1537, + "step": 51497 + }, + { + "epoch": 3.0984807379272925, + "grad_norm": 3.9866671562194824, + "learning_rate": 3.0454981499246265e-06, + "loss": 0.0667, + "step": 51498 + }, + { + "epoch": 3.0984943027672274, + "grad_norm": 4.2651190757751465, + "learning_rate": 3.045361107304372e-06, + "loss": 0.0742, + "step": 51499 + }, + { + "epoch": 3.0985078676071622, + "grad_norm": 5.531188488006592, + "learning_rate": 3.045224064684117e-06, + "loss": 0.0967, + "step": 51500 + }, + { + "epoch": 3.098521432447097, + "grad_norm": 4.582568645477295, + "learning_rate": 3.0450870220638625e-06, + "loss": 0.1994, + "step": 51501 + }, + { + "epoch": 3.098534997287032, + "grad_norm": 8.054612159729004, + "learning_rate": 3.0449499794436072e-06, + "loss": 0.2148, + "step": 51502 + }, + { + "epoch": 3.098548562126967, + "grad_norm": 5.620095252990723, + "learning_rate": 3.0448129368233524e-06, + "loss": 0.1318, + "step": 51503 + }, + { + "epoch": 3.0985621269669017, + "grad_norm": 3.2654459476470947, + "learning_rate": 3.0446758942030976e-06, + "loss": 0.0768, + "step": 51504 + }, + { + "epoch": 3.0985756918068366, + "grad_norm": 4.84312105178833, + "learning_rate": 3.0445388515828423e-06, + "loss": 0.2228, + "step": 51505 + }, + { + "epoch": 3.0985892566467714, + "grad_norm": 3.8850812911987305, + "learning_rate": 3.0444018089625875e-06, + "loss": 0.0749, + "step": 51506 + }, + { + "epoch": 3.0986028214867063, + "grad_norm": 4.777198791503906, + "learning_rate": 3.0442647663423323e-06, + "loss": 0.1149, + "step": 51507 + }, + { + "epoch": 3.098616386326641, + "grad_norm": 4.5563249588012695, + "learning_rate": 3.044127723722078e-06, + "loss": 0.2299, + "step": 51508 + }, + { + "epoch": 3.098629951166576, + "grad_norm": 4.926703929901123, + "learning_rate": 3.0439906811018226e-06, + "loss": 0.2342, + "step": 51509 + }, + { + "epoch": 3.0986435160065113, + "grad_norm": 3.879631519317627, + "learning_rate": 3.0438536384815682e-06, + "loss": 0.078, + "step": 51510 + }, + { + "epoch": 3.098657080846446, + "grad_norm": 4.568789005279541, + "learning_rate": 3.043716595861313e-06, + "loss": 0.1373, + "step": 51511 + }, + { + "epoch": 3.098670645686381, + "grad_norm": 4.761256217956543, + "learning_rate": 3.043579553241058e-06, + "loss": 0.1394, + "step": 51512 + }, + { + "epoch": 3.098684210526316, + "grad_norm": 3.7944326400756836, + "learning_rate": 3.0434425106208034e-06, + "loss": 0.2739, + "step": 51513 + }, + { + "epoch": 3.098697775366251, + "grad_norm": 3.909740686416626, + "learning_rate": 3.0433054680005485e-06, + "loss": 0.1772, + "step": 51514 + }, + { + "epoch": 3.0987113402061857, + "grad_norm": 4.497644901275635, + "learning_rate": 3.0431684253802933e-06, + "loss": 0.1531, + "step": 51515 + }, + { + "epoch": 3.0987249050461205, + "grad_norm": 3.9494001865386963, + "learning_rate": 3.043031382760039e-06, + "loss": 0.1016, + "step": 51516 + }, + { + "epoch": 3.0987384698860554, + "grad_norm": 3.8455870151519775, + "learning_rate": 3.0428943401397836e-06, + "loss": 0.1619, + "step": 51517 + }, + { + "epoch": 3.0987520347259903, + "grad_norm": 3.5455987453460693, + "learning_rate": 3.0427572975195293e-06, + "loss": 0.1648, + "step": 51518 + }, + { + "epoch": 3.098765599565925, + "grad_norm": 4.365762710571289, + "learning_rate": 3.042620254899274e-06, + "loss": 0.1062, + "step": 51519 + }, + { + "epoch": 3.09877916440586, + "grad_norm": 4.752975940704346, + "learning_rate": 3.0424832122790188e-06, + "loss": 0.1295, + "step": 51520 + }, + { + "epoch": 3.098792729245795, + "grad_norm": 4.921285152435303, + "learning_rate": 3.0423461696587644e-06, + "loss": 0.1773, + "step": 51521 + }, + { + "epoch": 3.0988062940857297, + "grad_norm": 3.4809327125549316, + "learning_rate": 3.042209127038509e-06, + "loss": 0.1092, + "step": 51522 + }, + { + "epoch": 3.0988198589256646, + "grad_norm": 3.825199604034424, + "learning_rate": 3.0420720844182543e-06, + "loss": 0.1226, + "step": 51523 + }, + { + "epoch": 3.0988334237655994, + "grad_norm": 2.781426429748535, + "learning_rate": 3.041935041797999e-06, + "loss": 0.0815, + "step": 51524 + }, + { + "epoch": 3.0988469886055343, + "grad_norm": 4.024630546569824, + "learning_rate": 3.0417979991777447e-06, + "loss": 0.108, + "step": 51525 + }, + { + "epoch": 3.098860553445469, + "grad_norm": 3.26287841796875, + "learning_rate": 3.0416609565574894e-06, + "loss": 0.108, + "step": 51526 + }, + { + "epoch": 3.098874118285404, + "grad_norm": 4.333217620849609, + "learning_rate": 3.041523913937235e-06, + "loss": 0.1602, + "step": 51527 + }, + { + "epoch": 3.098887683125339, + "grad_norm": 2.7961037158966064, + "learning_rate": 3.0413868713169798e-06, + "loss": 0.1051, + "step": 51528 + }, + { + "epoch": 3.098901247965274, + "grad_norm": 3.3348007202148438, + "learning_rate": 3.041249828696725e-06, + "loss": 0.1069, + "step": 51529 + }, + { + "epoch": 3.098914812805209, + "grad_norm": 4.222961902618408, + "learning_rate": 3.04111278607647e-06, + "loss": 0.1481, + "step": 51530 + }, + { + "epoch": 3.098928377645144, + "grad_norm": 3.8582022190093994, + "learning_rate": 3.0409757434562153e-06, + "loss": 0.1045, + "step": 51531 + }, + { + "epoch": 3.098941942485079, + "grad_norm": 5.331137657165527, + "learning_rate": 3.04083870083596e-06, + "loss": 0.1801, + "step": 51532 + }, + { + "epoch": 3.0989555073250137, + "grad_norm": 6.082674503326416, + "learning_rate": 3.0407016582157052e-06, + "loss": 0.2272, + "step": 51533 + }, + { + "epoch": 3.0989690721649485, + "grad_norm": 2.941291093826294, + "learning_rate": 3.0405646155954504e-06, + "loss": 0.0831, + "step": 51534 + }, + { + "epoch": 3.0989826370048834, + "grad_norm": 3.0627660751342773, + "learning_rate": 3.040427572975195e-06, + "loss": 0.0883, + "step": 51535 + }, + { + "epoch": 3.0989962018448183, + "grad_norm": 4.586031913757324, + "learning_rate": 3.0402905303549408e-06, + "loss": 0.1417, + "step": 51536 + }, + { + "epoch": 3.099009766684753, + "grad_norm": 3.515864849090576, + "learning_rate": 3.0401534877346855e-06, + "loss": 0.1092, + "step": 51537 + }, + { + "epoch": 3.099023331524688, + "grad_norm": 4.175335884094238, + "learning_rate": 3.040016445114431e-06, + "loss": 0.076, + "step": 51538 + }, + { + "epoch": 3.099036896364623, + "grad_norm": 3.3899481296539307, + "learning_rate": 3.039879402494176e-06, + "loss": 0.1043, + "step": 51539 + }, + { + "epoch": 3.0990504612045577, + "grad_norm": 4.655806541442871, + "learning_rate": 3.039742359873921e-06, + "loss": 0.2089, + "step": 51540 + }, + { + "epoch": 3.0990640260444926, + "grad_norm": 4.638919353485107, + "learning_rate": 3.039605317253666e-06, + "loss": 0.0917, + "step": 51541 + }, + { + "epoch": 3.0990775908844275, + "grad_norm": 4.944026470184326, + "learning_rate": 3.0394682746334114e-06, + "loss": 0.2577, + "step": 51542 + }, + { + "epoch": 3.0990911557243623, + "grad_norm": 3.505800247192383, + "learning_rate": 3.039331232013156e-06, + "loss": 0.1203, + "step": 51543 + }, + { + "epoch": 3.099104720564297, + "grad_norm": 2.87536883354187, + "learning_rate": 3.0391941893929018e-06, + "loss": 0.0506, + "step": 51544 + }, + { + "epoch": 3.099118285404232, + "grad_norm": 3.2576417922973633, + "learning_rate": 3.0390571467726465e-06, + "loss": 0.1171, + "step": 51545 + }, + { + "epoch": 3.099131850244167, + "grad_norm": 5.486572265625, + "learning_rate": 3.0389201041523917e-06, + "loss": 0.1103, + "step": 51546 + }, + { + "epoch": 3.099145415084102, + "grad_norm": 4.827855587005615, + "learning_rate": 3.038783061532137e-06, + "loss": 0.0708, + "step": 51547 + }, + { + "epoch": 3.099158979924037, + "grad_norm": 3.952758550643921, + "learning_rate": 3.0386460189118817e-06, + "loss": 0.1168, + "step": 51548 + }, + { + "epoch": 3.099172544763972, + "grad_norm": 3.494689702987671, + "learning_rate": 3.038508976291627e-06, + "loss": 0.1222, + "step": 51549 + }, + { + "epoch": 3.099186109603907, + "grad_norm": 5.111595153808594, + "learning_rate": 3.038371933671372e-06, + "loss": 0.1855, + "step": 51550 + }, + { + "epoch": 3.0991996744438417, + "grad_norm": 4.463962078094482, + "learning_rate": 3.038234891051117e-06, + "loss": 0.1313, + "step": 51551 + }, + { + "epoch": 3.0992132392837766, + "grad_norm": 3.4609761238098145, + "learning_rate": 3.038097848430862e-06, + "loss": 0.132, + "step": 51552 + }, + { + "epoch": 3.0992268041237114, + "grad_norm": 3.932612419128418, + "learning_rate": 3.0379608058106075e-06, + "loss": 0.1025, + "step": 51553 + }, + { + "epoch": 3.0992403689636463, + "grad_norm": 3.841693162918091, + "learning_rate": 3.0378237631903523e-06, + "loss": 0.0924, + "step": 51554 + }, + { + "epoch": 3.099253933803581, + "grad_norm": 3.972346067428589, + "learning_rate": 3.037686720570098e-06, + "loss": 0.1608, + "step": 51555 + }, + { + "epoch": 3.099267498643516, + "grad_norm": 3.983555555343628, + "learning_rate": 3.0375496779498427e-06, + "loss": 0.0403, + "step": 51556 + }, + { + "epoch": 3.099281063483451, + "grad_norm": 3.132591485977173, + "learning_rate": 3.037412635329588e-06, + "loss": 0.1158, + "step": 51557 + }, + { + "epoch": 3.0992946283233858, + "grad_norm": 4.094784736633301, + "learning_rate": 3.037275592709333e-06, + "loss": 0.0715, + "step": 51558 + }, + { + "epoch": 3.0993081931633206, + "grad_norm": 3.0002126693725586, + "learning_rate": 3.037138550089078e-06, + "loss": 0.044, + "step": 51559 + }, + { + "epoch": 3.0993217580032555, + "grad_norm": 3.6393702030181885, + "learning_rate": 3.037001507468823e-06, + "loss": 0.116, + "step": 51560 + }, + { + "epoch": 3.0993353228431904, + "grad_norm": 2.7688727378845215, + "learning_rate": 3.0368644648485677e-06, + "loss": 0.0536, + "step": 51561 + }, + { + "epoch": 3.099348887683125, + "grad_norm": 3.1379268169403076, + "learning_rate": 3.0367274222283133e-06, + "loss": 0.0978, + "step": 51562 + }, + { + "epoch": 3.09936245252306, + "grad_norm": 3.3335094451904297, + "learning_rate": 3.036590379608058e-06, + "loss": 0.0632, + "step": 51563 + }, + { + "epoch": 3.099376017362995, + "grad_norm": 3.6466424465179443, + "learning_rate": 3.0364533369878037e-06, + "loss": 0.1036, + "step": 51564 + }, + { + "epoch": 3.09938958220293, + "grad_norm": 3.3417930603027344, + "learning_rate": 3.0363162943675484e-06, + "loss": 0.0478, + "step": 51565 + }, + { + "epoch": 3.0994031470428647, + "grad_norm": 4.089666843414307, + "learning_rate": 3.0361792517472936e-06, + "loss": 0.1355, + "step": 51566 + }, + { + "epoch": 3.0994167118828, + "grad_norm": 4.301679611206055, + "learning_rate": 3.0360422091270388e-06, + "loss": 0.1352, + "step": 51567 + }, + { + "epoch": 3.099430276722735, + "grad_norm": 2.846527099609375, + "learning_rate": 3.035905166506784e-06, + "loss": 0.0478, + "step": 51568 + }, + { + "epoch": 3.0994438415626697, + "grad_norm": 2.2869722843170166, + "learning_rate": 3.0357681238865287e-06, + "loss": 0.0342, + "step": 51569 + }, + { + "epoch": 3.0994574064026046, + "grad_norm": 3.9854421615600586, + "learning_rate": 3.0356310812662743e-06, + "loss": 0.1018, + "step": 51570 + }, + { + "epoch": 3.0994709712425395, + "grad_norm": 5.848342418670654, + "learning_rate": 3.035494038646019e-06, + "loss": 0.0971, + "step": 51571 + }, + { + "epoch": 3.0994845360824743, + "grad_norm": 3.4634382724761963, + "learning_rate": 3.0353569960257647e-06, + "loss": 0.0704, + "step": 51572 + }, + { + "epoch": 3.099498100922409, + "grad_norm": 2.4916458129882812, + "learning_rate": 3.0352199534055094e-06, + "loss": 0.0465, + "step": 51573 + }, + { + "epoch": 3.099511665762344, + "grad_norm": 3.251311779022217, + "learning_rate": 3.035082910785254e-06, + "loss": 0.0941, + "step": 51574 + }, + { + "epoch": 3.099525230602279, + "grad_norm": 2.6265993118286133, + "learning_rate": 3.034945868165e-06, + "loss": 0.0466, + "step": 51575 + }, + { + "epoch": 3.099538795442214, + "grad_norm": 5.0161051750183105, + "learning_rate": 3.0348088255447445e-06, + "loss": 0.0995, + "step": 51576 + }, + { + "epoch": 3.0995523602821486, + "grad_norm": 3.326028823852539, + "learning_rate": 3.0346717829244897e-06, + "loss": 0.0944, + "step": 51577 + }, + { + "epoch": 3.0995659251220835, + "grad_norm": 3.2462046146392822, + "learning_rate": 3.0345347403042345e-06, + "loss": 0.05, + "step": 51578 + }, + { + "epoch": 3.0995794899620184, + "grad_norm": 3.2893295288085938, + "learning_rate": 3.03439769768398e-06, + "loss": 0.0976, + "step": 51579 + }, + { + "epoch": 3.0995930548019532, + "grad_norm": 2.8845267295837402, + "learning_rate": 3.034260655063725e-06, + "loss": 0.0463, + "step": 51580 + }, + { + "epoch": 3.099606619641888, + "grad_norm": 4.775150299072266, + "learning_rate": 3.0341236124434704e-06, + "loss": 0.0743, + "step": 51581 + }, + { + "epoch": 3.099620184481823, + "grad_norm": 3.3256380558013916, + "learning_rate": 3.033986569823215e-06, + "loss": 0.0486, + "step": 51582 + }, + { + "epoch": 3.099633749321758, + "grad_norm": 2.3162031173706055, + "learning_rate": 3.0338495272029604e-06, + "loss": 0.0324, + "step": 51583 + }, + { + "epoch": 3.0996473141616927, + "grad_norm": 8.39599895477295, + "learning_rate": 3.0337124845827056e-06, + "loss": 0.2164, + "step": 51584 + }, + { + "epoch": 3.0996608790016276, + "grad_norm": 5.586108207702637, + "learning_rate": 3.0335754419624507e-06, + "loss": 0.1278, + "step": 51585 + }, + { + "epoch": 3.099674443841563, + "grad_norm": 5.4885053634643555, + "learning_rate": 3.0334383993421955e-06, + "loss": 0.0929, + "step": 51586 + }, + { + "epoch": 3.0996880086814977, + "grad_norm": 3.3320353031158447, + "learning_rate": 3.033301356721941e-06, + "loss": 0.0641, + "step": 51587 + }, + { + "epoch": 3.0997015735214326, + "grad_norm": 5.567497253417969, + "learning_rate": 3.033164314101686e-06, + "loss": 0.1276, + "step": 51588 + }, + { + "epoch": 3.0997151383613675, + "grad_norm": 2.9118473529815674, + "learning_rate": 3.0330272714814306e-06, + "loss": 0.0593, + "step": 51589 + }, + { + "epoch": 3.0997287032013023, + "grad_norm": 4.26547384262085, + "learning_rate": 3.032890228861176e-06, + "loss": 0.0909, + "step": 51590 + }, + { + "epoch": 3.099742268041237, + "grad_norm": 4.615215301513672, + "learning_rate": 3.032753186240921e-06, + "loss": 0.1046, + "step": 51591 + }, + { + "epoch": 3.099755832881172, + "grad_norm": 5.506869792938232, + "learning_rate": 3.0326161436206666e-06, + "loss": 0.1533, + "step": 51592 + }, + { + "epoch": 3.099769397721107, + "grad_norm": 3.2550361156463623, + "learning_rate": 3.0324791010004113e-06, + "loss": 0.0556, + "step": 51593 + }, + { + "epoch": 3.099782962561042, + "grad_norm": 4.134032726287842, + "learning_rate": 3.0323420583801565e-06, + "loss": 0.0617, + "step": 51594 + }, + { + "epoch": 3.0997965274009767, + "grad_norm": 4.035905361175537, + "learning_rate": 3.0322050157599012e-06, + "loss": 0.0876, + "step": 51595 + }, + { + "epoch": 3.0998100922409115, + "grad_norm": 7.930234909057617, + "learning_rate": 3.032067973139647e-06, + "loss": 0.1467, + "step": 51596 + }, + { + "epoch": 3.0998236570808464, + "grad_norm": 4.380107879638672, + "learning_rate": 3.0319309305193916e-06, + "loss": 0.0715, + "step": 51597 + }, + { + "epoch": 3.0998372219207813, + "grad_norm": 4.748717784881592, + "learning_rate": 3.031793887899137e-06, + "loss": 0.1159, + "step": 51598 + }, + { + "epoch": 3.099850786760716, + "grad_norm": 4.294613361358643, + "learning_rate": 3.031656845278882e-06, + "loss": 0.1111, + "step": 51599 + }, + { + "epoch": 3.099864351600651, + "grad_norm": 5.924972057342529, + "learning_rate": 3.031519802658627e-06, + "loss": 0.1583, + "step": 51600 + }, + { + "epoch": 3.099877916440586, + "grad_norm": 3.301656484603882, + "learning_rate": 3.0313827600383723e-06, + "loss": 0.0746, + "step": 51601 + }, + { + "epoch": 3.0998914812805207, + "grad_norm": 4.482128143310547, + "learning_rate": 3.031245717418117e-06, + "loss": 0.098, + "step": 51602 + }, + { + "epoch": 3.0999050461204556, + "grad_norm": 5.491907596588135, + "learning_rate": 3.0311086747978623e-06, + "loss": 0.1416, + "step": 51603 + }, + { + "epoch": 3.0999186109603905, + "grad_norm": 4.286031723022461, + "learning_rate": 3.0309716321776074e-06, + "loss": 0.0928, + "step": 51604 + }, + { + "epoch": 3.0999186109603905, + "eval_loss": 0.3648233711719513, + "eval_noise_accuracy": NaN, + "eval_runtime": 4463.1389, + "eval_samples_per_second": 1.126, + "eval_steps_per_second": 0.07, + "eval_wer": 25.47496215748664, + "step": 51604 + } + ], + "logging_steps": 1, + "max_steps": 73720, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 3686, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.426889776214835e+20, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}